Last updated: 2019-07-24
This code processes Census of Agriculture data on agricultural land (cropland + pastureland) and % area treated. The main goals are to estimate missing values, remove states outside of the contiguous US (i.e. AK, HI) and join the data to USDA farm resource regions.
Data on county acreage are from the US Agricultural Census, as described here. Farm resource regions are from USDA-ERS, as described here and linked here.
library(tidyverse)
## Function to restore lost zeroes to state & county fips codes and save as a new combined variable called 'fips'
## This function converts the original fips codes to characters and saves the new fips column as a character too
# x is the dataframe
# cty_var is the name of the column with county fips codes (in quotes)
# st_var is the name of the column with state fips codes (in quotes)
std_fips <- function(x, cty_var, st_var){
x[[cty_var]] <- as.character(x[[cty_var]])
x$fips.length<-nchar(x[[cty_var]])
x$fips.add[x$fips.length==1]<-"00"
x$fips.add[x$fips.length==2]<-"0"
x$fips.add[x$fips.length==3]<-""
x[[st_var]] <- as.character(x[[st_var]])
x$fips.length2<-nchar(x[[st_var]])
x$fips.add2[x$fips.length2==1]<-"0"
x$fips.add2[x$fips.length2==2]<-""
x$fips<-paste(x$fips.add2,x[[st_var]],x$fips.add,x[[cty_var]], sep="")
x <- dplyr::select(x, -fips.add, -fips.add2, -fips.length, -fips.length2)
}
## Function to restore lost zeroes to fips codes when state & county already fused
std_fips2 <- function(x, fips_var){
x[[fips_var]] <- as.character(x[[fips_var]])
x$fips.length<-nchar(x[[fips_var]])
x$fips.add[x$fips.length==4]<-"0"
x$fips.add[x$fips.length==5]<-""
x$fips<-paste(x$fips.add,x[[fips_var]], sep="")
x <- dplyr::select(x, -fips.add, -fips.length)
}
### This function finds missing values of a variable in a dataframe and replaces them with the average value of the variable (if available) or with a zero (if no data available)
fill_miss <- function(x, var){
y <- x %>%
mutate(var_new =
ifelse(!is.na(x[[var]]), x[[var]],
mean(x[[var]], na.rm=TRUE)),
interp =
ifelse(!is.na(x[[var]]), "no", "yes")) %>%
mutate(var_new2 = ifelse(is.nan(var_new), 0, var_new)) %>%
dplyr::select(YEAR, fips, var_new2, interp)
names(y)[names(y) == 'var_new2'] <- paste(var, "new", sep="_")
names(y)[names(y) == 'interp'] <- paste(var, "interp", sep="_")
return(y)
}
### Split data frame by county, fill missing values, unsplit, and return new dataframe
# takes a longish time to run
# requires fill_miss function above
fill_df <- function(x, var){
x$fips <- as.factor(x$fips)
x_spl <- split(as.data.frame(x), f=x$fips)
x_spl_cor <- lapply(x_spl, fill_miss, var = var)
x_cor <- do.call(rbind, x_spl_cor)
return(x_cor)
}
# Load county-level acreage data, standardize fips codes, and remove AK and HI
census <- read.csv("../output_big/nass_survey/qs.economics.ac.cty_20170519.csv") %>%
std_fips(cty_var="COUNTY_ANSI", st_var="STATE_ANSI") %>%
filter(!(STATE_ALPHA %in% c("AK","HI")))
# Extract 2012 data for total land area in each county
cty_land <- filter(census, SHORT_DESC=="LAND AREA, INCL NON-AG - ACRES" & YEAR==2012) %>%
select(fips, VALUE) %>%
mutate(acres = as.numeric(gsub(",","",VALUE)),
ha = acres*0.404686)
# Extract data for cropland in each county (Census years)
cty_cropland <- filter(census, (SHORT_DESC=="AG LAND, CROPLAND - ACRES")) %>%
select(fips, YEAR, VALUE) %>%
mutate(crop_ac = as.numeric(gsub(",","",VALUE)),
crop_ha = crop_ac*0.404686) %>%
complete(YEAR, fips)
Warning: NAs introduced by coercion
# Extract data for pastureland in each county (Census years)
cty_pasture <- filter(census, (SHORT_DESC=="AG LAND, PASTURELAND, (EXCL CROPLAND & WOODLAND) - ACRES")) %>%
select(fips, YEAR, VALUE) %>%
mutate(past_ac = as.numeric(gsub(",","",VALUE)),
past_ha = past_ac*0.404686) %>%
complete(YEAR, fips)
Warning: NAs introduced by coercion
# Extract data for acres treated in each county (Census years)
trt_ins <- read.csv("../output_big/nass_survey/qs.environment.trt.cty_20170519.csv") %>%
filter(DOMAINCAT_DESC=="CHEMICAL, INSECTICIDE: (EXCL NEMATICIDES)") %>%
filter(!(STATE_ALPHA %in% c("AK","HI"))) %>%
std_fips(cty_var="COUNTY_ANSI", st_var="STATE_ANSI") %>%
select(fips, YEAR, VALUE) %>%
mutate(ac_trt_ins = as.numeric(gsub(",","",VALUE)),
ha_trt_ins = ac_trt_ins*0.404686) %>%
complete(YEAR, fips)
Warning: NAs introduced by coercion
str(cty_land)
'data.frame': 3070 obs. of 4 variables:
$ fips : chr "01033" "01059" "01077" "01079" ...
$ VALUE: Factor w/ 76168 levels "(D)","1,000",..: 46009 49335 50865 52058 43987 57505 54185 45683 47409 50013 ...
$ acres: num 379277 405644 427328 442034 358360 ...
$ ha : num 153488 164158 172934 178885 145023 ...
str(cty_cropland)
Classes 'tbl_df', 'tbl' and 'data.frame': 12276 obs. of 5 variables:
$ YEAR : int 1997 1997 1997 1997 1997 1997 1997 1997 1997 1997 ...
$ fips : chr "01001" "01003" "01005" "01007" ...
$ VALUE : Factor w/ 76168 levels "(D)","1,000",..: 54923 8504 64678 15506 69864 63453 46242 49879 35084 68024 ...
$ crop_ac: num 49072 126581 65450 16899 79670 ...
$ crop_ha: num 19859 51226 26487 6839 32241 ...
str(cty_pasture)
Classes 'tbl_df', 'tbl' and 'data.frame': 12260 obs. of 5 variables:
$ YEAR : int 1997 1997 1997 1997 1997 1997 1997 1997 1997 1997 ...
$ fips : chr "01001" "01003" "01005" "01007" ...
$ VALUE : Factor w/ 76168 levels "(D)","1,000",..: 13238 73299 11595 61691 20518 41700 74046 70695 18826 70207 ...
$ past_ac: num 15452 9030 14745 6094 19611 ...
$ past_ha: num 6253 3654 5967 2466 7936 ...
str(trt_ins)
Classes 'tbl_df', 'tbl' and 'data.frame': 12248 obs. of 5 variables:
$ YEAR : int 1997 1997 1997 1997 1997 1997 1997 1997 1997 1997 ...
$ fips : chr "01001" "01003" "01005" "01007" ...
$ VALUE : Factor w/ 34797 levels "(D)","1,000",..: 29648 29362 3646 NA 30281 17897 977 27308 33126 7955 ...
$ ac_trt_ins: num 7145 68595 12105 NA 7947 ...
$ ha_trt_ins: num 2891 27759 4899 NA 3216 ...
reg_key <- read.csv("../keys/usda_farm_regions_cty.csv") %>%
std_fips2(fips_var = "fips")
# fill in a couple missing values
# re: 46102, see https://www.census.gov/geo/reference/county-changes.html (tab 2010)
# re: 08014, see https://www.census.gov/geo/reference/county-changes.html (tab 2000)
reg_key <- rbind(reg_key, c("46102", 3, "Northern Great Plains"))
reg_key <- rbind(reg_key, c("08014", 3, "Northern Great Plains"))
str(reg_key)
'data.frame': 3114 obs. of 3 variables:
$ fips : chr "01001" "01003" "01005" "01007" ...
$ region_code: chr "6" "6" "6" "5" ...
$ region_name: Factor w/ 9 levels "Basin and Range",..: 9 9 9 2 2 9 9 2 9 2 ...
cty_crop_fill <- fill_df(cty_cropland, var="crop_ha")
cty_past_fill <- fill_df(cty_pasture, var="past_ha")
cty_trt_fill <- fill_df(trt_ins, var="ha_trt_ins")
str(cty_crop_fill)
'data.frame': 12276 obs. of 4 variables:
$ YEAR : int 1997 2002 2007 2012 1997 2002 2007 2012 1997 2002 ...
$ fips : Factor w/ 3069 levels "01001","01003",..: 1 1 1 1 2 2 2 2 3 3 ...
$ crop_ha_new : num 19859 20900 17138 16711 51226 ...
$ crop_ha_interp: chr "no" "no" "no" "no" ...
str(cty_past_fill)
'data.frame': 12260 obs. of 4 variables:
$ YEAR : int 1997 2002 2007 2012 1997 2002 2007 2012 1997 2002 ...
$ fips : Factor w/ 3065 levels "01001","01003",..: 1 1 1 1 2 2 2 2 3 3 ...
$ past_ha_new : num 6253 6645 9619 9158 3654 ...
$ past_ha_interp: chr "no" "no" "no" "no" ...
str(cty_trt_fill)
'data.frame': 12248 obs. of 4 variables:
$ YEAR : int 1997 2002 2007 2012 1997 2002 2007 2012 1997 2002 ...
$ fips : Factor w/ 3062 levels "01001","01003",..: 1 1 1 1 2 2 2 2 3 3 ...
$ ha_trt_ins_new : num 2891 4731 3416 5568 27759 ...
$ ha_trt_ins_interp: chr "no" "no" "no" "no" ...
new_data <- full_join(cty_crop_fill, cty_past_fill,
by = c("YEAR","fips")) %>%
full_join(cty_trt_fill, by = c("YEAR","fips")) %>%
full_join(cty_land, by = "fips") %>%
rename(crop_ha = crop_ha_new,
past_ha = past_ha_new,
trt_ins_ha = ha_trt_ins_new,
trt_ins_ha_interp = ha_trt_ins_interp) %>%
mutate(crop_ha_interp = ifelse(is.na(crop_ha), "yes", crop_ha_interp),
past_ha_interp = ifelse(is.na(past_ha), "yes", past_ha_interp),
trt_ins_ha_interp = ifelse(is.na(trt_ins_ha), "yes", trt_ins_ha_interp)) %>%
mutate(crop_ha = ifelse(is.na(crop_ha), 0, crop_ha),
past_ha = ifelse(is.na(past_ha), 0, past_ha),
trt_ins_ha = ifelse(is.na(trt_ins_ha), 0, trt_ins_ha)) %>%
left_join(reg_key, by = "fips") %>%
select(-VALUE)
Warning: Column `fips` joining factors with different levels, coercing to
character vector
Warning: Column `fips` joining character vector and factor, coercing into
character vector
# check number of missing values
with(new_data, table(crop_ha_interp, YEAR))
YEAR
crop_ha_interp 1997 2002 2007 2012
no 2992 3055 3054 3050
yes 77 14 15 19
with(new_data, table(past_ha_interp, YEAR))
YEAR
past_ha_interp 1997 2002 2007 2012
no 2971 2948 2969 2949
yes 98 121 100 120
with(new_data, table(trt_ins_ha_interp, YEAR))
YEAR
trt_ins_ha_interp 1997 2002 2007 2012
no 2893 2901 2937 2944
yes 176 168 132 125
write.csv(new_data, file = "../output_big/nass_survey/census_agland_cleaned.csv",
row.names=FALSE)
sessionInfo()
R version 3.6.1 (2019-07-05)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS High Sierra 10.13.6
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] forcats_0.4.0 stringr_1.4.0 dplyr_0.8.3 purrr_0.3.2
[5] readr_1.3.1 tidyr_0.8.3 tibble_2.1.3 ggplot2_3.2.0
[9] tidyverse_1.2.1
loaded via a namespace (and not attached):
[1] Rcpp_1.0.1 cellranger_1.1.0 pillar_1.4.2 compiler_3.6.1
[5] tools_3.6.1 zeallot_0.1.0 digest_0.6.20 lubridate_1.7.4
[9] jsonlite_1.6 evaluate_0.14 nlme_3.1-140 gtable_0.3.0
[13] lattice_0.20-38 pkgconfig_2.0.2 rlang_0.4.0 cli_1.1.0
[17] rstudioapi_0.10 yaml_2.2.0 haven_2.1.1 xfun_0.8
[21] withr_2.1.2 xml2_1.2.0 httr_1.4.0 knitr_1.23
[25] vctrs_0.2.0 hms_0.5.0 generics_0.0.2 fs_1.3.1
[29] grid_3.6.1 tidyselect_0.2.5 glue_1.3.1 R6_2.4.0
[33] readxl_1.3.1 rmarkdown_1.14 modelr_0.1.4 magrittr_1.5
[37] backports_1.1.4 scales_1.0.0 htmltools_0.3.6 rvest_0.3.4
[41] assertthat_0.2.1 colorspace_1.4-1 stringi_1.4.3 lazyeval_0.2.2
[45] munsell_0.5.0 broom_0.5.2 crayon_1.3.4
This R Markdown site was created with workflowr