Last updated: 2019-07-24

Purpose

This code processes Census of Agriculture data on agricultural land (cropland + pastureland) and % area treated. The main goals are to estimate missing values, remove states outside of the contiguous US (i.e. AK, HI) and join the data to USDA farm resource regions.

Data sources

Data on county acreage are from the US Agricultural Census, as described here. Farm resource regions are from USDA-ERS, as described here and linked here.

Libraries & functions

library(tidyverse)

## Function to restore lost zeroes to state & county fips codes and save as a new combined variable called 'fips'
## This function converts the original fips codes to characters and saves the new fips column as a character too
# x is the dataframe
# cty_var is the name of the column with county fips codes (in quotes)
# st_var is the name of the column with state fips codes (in quotes)
std_fips <- function(x, cty_var, st_var){
  x[[cty_var]] <- as.character(x[[cty_var]])
  x$fips.length<-nchar(x[[cty_var]])
  x$fips.add[x$fips.length==1]<-"00"
  x$fips.add[x$fips.length==2]<-"0"
  x$fips.add[x$fips.length==3]<-""
  x[[st_var]] <- as.character(x[[st_var]])
  x$fips.length2<-nchar(x[[st_var]])
  x$fips.add2[x$fips.length2==1]<-"0"
  x$fips.add2[x$fips.length2==2]<-""
  x$fips<-paste(x$fips.add2,x[[st_var]],x$fips.add,x[[cty_var]], sep="")
  x <- dplyr::select(x, -fips.add, -fips.add2, -fips.length, -fips.length2)
}

## Function to restore lost zeroes to fips codes when state & county already fused
std_fips2 <- function(x, fips_var){
  x[[fips_var]] <- as.character(x[[fips_var]])
  x$fips.length<-nchar(x[[fips_var]])
  x$fips.add[x$fips.length==4]<-"0"
  x$fips.add[x$fips.length==5]<-""
  x$fips<-paste(x$fips.add,x[[fips_var]], sep="")
  x <- dplyr::select(x, -fips.add, -fips.length)
}

### This function finds missing values of a variable in a dataframe and replaces them with the average value of the variable (if available) or with a zero (if no data available)
fill_miss <- function(x, var){
  y <- x %>%
    mutate(var_new = 
             ifelse(!is.na(x[[var]]), x[[var]],
                    mean(x[[var]], na.rm=TRUE)),
           interp = 
             ifelse(!is.na(x[[var]]), "no", "yes")) %>%
    mutate(var_new2 = ifelse(is.nan(var_new), 0, var_new)) %>%
   dplyr::select(YEAR, fips, var_new2, interp)
  names(y)[names(y) == 'var_new2'] <- paste(var, "new", sep="_")
  names(y)[names(y) == 'interp'] <- paste(var, "interp", sep="_")
  return(y)
}

### Split data frame by county, fill missing values, unsplit, and return new dataframe
# takes a longish time to run
# requires fill_miss function above
fill_df <- function(x, var){
  x$fips <- as.factor(x$fips)
  x_spl <- split(as.data.frame(x), f=x$fips)
  x_spl_cor <- lapply(x_spl, fill_miss, var = var)
  x_cor <- do.call(rbind, x_spl_cor)
  return(x_cor)
}

Load data

# Load county-level acreage data, standardize fips codes, and remove AK and HI
census <- read.csv("../output_big/nass_survey/qs.economics.ac.cty_20170519.csv") %>%
  std_fips(cty_var="COUNTY_ANSI", st_var="STATE_ANSI") %>%
  filter(!(STATE_ALPHA %in% c("AK","HI")))

# Extract 2012 data for total land area in each county
cty_land <- filter(census, SHORT_DESC=="LAND AREA, INCL NON-AG - ACRES" & YEAR==2012) %>%
  select(fips, VALUE) %>%
  mutate(acres = as.numeric(gsub(",","",VALUE)),
         ha = acres*0.404686) 

# Extract data for cropland in each county (Census years)                   
cty_cropland <- filter(census, (SHORT_DESC=="AG LAND, CROPLAND - ACRES")) %>%
  select(fips, YEAR, VALUE) %>%
  mutate(crop_ac = as.numeric(gsub(",","",VALUE)),
         crop_ha = crop_ac*0.404686) %>%
  complete(YEAR, fips)
Warning: NAs introduced by coercion
# Extract data for pastureland in each county (Census years)  
cty_pasture <- filter(census, (SHORT_DESC=="AG LAND, PASTURELAND, (EXCL CROPLAND & WOODLAND) - ACRES")) %>%
    select(fips, YEAR, VALUE) %>%
    mutate(past_ac = as.numeric(gsub(",","",VALUE)),
           past_ha = past_ac*0.404686) %>%
    complete(YEAR, fips)
Warning: NAs introduced by coercion
# Extract data for acres treated in each county (Census years)
trt_ins <- read.csv("../output_big/nass_survey/qs.environment.trt.cty_20170519.csv") %>%
  filter(DOMAINCAT_DESC=="CHEMICAL, INSECTICIDE: (EXCL NEMATICIDES)") %>%
  filter(!(STATE_ALPHA %in% c("AK","HI"))) %>%
  std_fips(cty_var="COUNTY_ANSI", st_var="STATE_ANSI") %>%
  select(fips, YEAR, VALUE) %>%
  mutate(ac_trt_ins = as.numeric(gsub(",","",VALUE)),
         ha_trt_ins = ac_trt_ins*0.404686) %>%
  complete(YEAR, fips)
Warning: NAs introduced by coercion
str(cty_land)
'data.frame':   3070 obs. of  4 variables:
 $ fips : chr  "01033" "01059" "01077" "01079" ...
 $ VALUE: Factor w/ 76168 levels "(D)","1,000",..: 46009 49335 50865 52058 43987 57505 54185 45683 47409 50013 ...
 $ acres: num  379277 405644 427328 442034 358360 ...
 $ ha   : num  153488 164158 172934 178885 145023 ...
str(cty_cropland)
Classes 'tbl_df', 'tbl' and 'data.frame':   12276 obs. of  5 variables:
 $ YEAR   : int  1997 1997 1997 1997 1997 1997 1997 1997 1997 1997 ...
 $ fips   : chr  "01001" "01003" "01005" "01007" ...
 $ VALUE  : Factor w/ 76168 levels "(D)","1,000",..: 54923 8504 64678 15506 69864 63453 46242 49879 35084 68024 ...
 $ crop_ac: num  49072 126581 65450 16899 79670 ...
 $ crop_ha: num  19859 51226 26487 6839 32241 ...
str(cty_pasture)
Classes 'tbl_df', 'tbl' and 'data.frame':   12260 obs. of  5 variables:
 $ YEAR   : int  1997 1997 1997 1997 1997 1997 1997 1997 1997 1997 ...
 $ fips   : chr  "01001" "01003" "01005" "01007" ...
 $ VALUE  : Factor w/ 76168 levels "(D)","1,000",..: 13238 73299 11595 61691 20518 41700 74046 70695 18826 70207 ...
 $ past_ac: num  15452 9030 14745 6094 19611 ...
 $ past_ha: num  6253 3654 5967 2466 7936 ...
str(trt_ins)
Classes 'tbl_df', 'tbl' and 'data.frame':   12248 obs. of  5 variables:
 $ YEAR      : int  1997 1997 1997 1997 1997 1997 1997 1997 1997 1997 ...
 $ fips      : chr  "01001" "01003" "01005" "01007" ...
 $ VALUE     : Factor w/ 34797 levels "(D)","1,000",..: 29648 29362 3646 NA 30281 17897 977 27308 33126 7955 ...
 $ ac_trt_ins: num  7145 68595 12105 NA 7947 ...
 $ ha_trt_ins: num  2891 27759 4899 NA 3216 ...
reg_key <- read.csv("../keys/usda_farm_regions_cty.csv") %>%
  std_fips2(fips_var = "fips")

# fill in a couple missing values
# re: 46102, see https://www.census.gov/geo/reference/county-changes.html (tab 2010)
# re: 08014, see https://www.census.gov/geo/reference/county-changes.html (tab 2000)
reg_key <- rbind(reg_key, c("46102", 3, "Northern Great Plains"))
reg_key <- rbind(reg_key, c("08014", 3, "Northern Great Plains"))

str(reg_key)
'data.frame':   3114 obs. of  3 variables:
 $ fips       : chr  "01001" "01003" "01005" "01007" ...
 $ region_code: chr  "6" "6" "6" "5" ...
 $ region_name: Factor w/ 9 levels "Basin and Range",..: 9 9 9 2 2 9 9 2 9 2 ...

Fill in missing values

cty_crop_fill <- fill_df(cty_cropland, var="crop_ha")
cty_past_fill <- fill_df(cty_pasture, var="past_ha")
cty_trt_fill <- fill_df(trt_ins, var="ha_trt_ins")

str(cty_crop_fill)
'data.frame':   12276 obs. of  4 variables:
 $ YEAR          : int  1997 2002 2007 2012 1997 2002 2007 2012 1997 2002 ...
 $ fips          : Factor w/ 3069 levels "01001","01003",..: 1 1 1 1 2 2 2 2 3 3 ...
 $ crop_ha_new   : num  19859 20900 17138 16711 51226 ...
 $ crop_ha_interp: chr  "no" "no" "no" "no" ...
str(cty_past_fill)
'data.frame':   12260 obs. of  4 variables:
 $ YEAR          : int  1997 2002 2007 2012 1997 2002 2007 2012 1997 2002 ...
 $ fips          : Factor w/ 3065 levels "01001","01003",..: 1 1 1 1 2 2 2 2 3 3 ...
 $ past_ha_new   : num  6253 6645 9619 9158 3654 ...
 $ past_ha_interp: chr  "no" "no" "no" "no" ...
str(cty_trt_fill)
'data.frame':   12248 obs. of  4 variables:
 $ YEAR             : int  1997 2002 2007 2012 1997 2002 2007 2012 1997 2002 ...
 $ fips             : Factor w/ 3062 levels "01001","01003",..: 1 1 1 1 2 2 2 2 3 3 ...
 $ ha_trt_ins_new   : num  2891 4731 3416 5568 27759 ...
 $ ha_trt_ins_interp: chr  "no" "no" "no" "no" ...

Merge datasets

new_data <- full_join(cty_crop_fill, cty_past_fill, 
                  by = c("YEAR","fips")) %>%
  full_join(cty_trt_fill, by = c("YEAR","fips")) %>%
  full_join(cty_land, by = "fips") %>%
  rename(crop_ha = crop_ha_new,
         past_ha = past_ha_new,
         trt_ins_ha = ha_trt_ins_new,
         trt_ins_ha_interp = ha_trt_ins_interp) %>%
    mutate(crop_ha_interp = ifelse(is.na(crop_ha), "yes", crop_ha_interp),
         past_ha_interp = ifelse(is.na(past_ha), "yes", past_ha_interp),
         trt_ins_ha_interp = ifelse(is.na(trt_ins_ha), "yes", trt_ins_ha_interp)) %>%
  mutate(crop_ha = ifelse(is.na(crop_ha), 0, crop_ha),
         past_ha = ifelse(is.na(past_ha), 0, past_ha),
         trt_ins_ha = ifelse(is.na(trt_ins_ha), 0, trt_ins_ha)) %>%
    left_join(reg_key, by = "fips") %>%
  select(-VALUE)
Warning: Column `fips` joining factors with different levels, coercing to
character vector
Warning: Column `fips` joining character vector and factor, coercing into
character vector

Assess extent of estimated values

# check number of missing values
with(new_data, table(crop_ha_interp, YEAR))
              YEAR
crop_ha_interp 1997 2002 2007 2012
           no  2992 3055 3054 3050
           yes   77   14   15   19
with(new_data, table(past_ha_interp, YEAR))
              YEAR
past_ha_interp 1997 2002 2007 2012
           no  2971 2948 2969 2949
           yes   98  121  100  120
with(new_data, table(trt_ins_ha_interp, YEAR))
                 YEAR
trt_ins_ha_interp 1997 2002 2007 2012
              no  2893 2901 2937 2944
              yes  176  168  132  125

Export final file

write.csv(new_data, file = "../output_big/nass_survey/census_agland_cleaned.csv",
          row.names=FALSE)

Session information

sessionInfo()
R version 3.6.1 (2019-07-05)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS High Sierra 10.13.6

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] forcats_0.4.0   stringr_1.4.0   dplyr_0.8.3     purrr_0.3.2    
[5] readr_1.3.1     tidyr_0.8.3     tibble_2.1.3    ggplot2_3.2.0  
[9] tidyverse_1.2.1

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.1       cellranger_1.1.0 pillar_1.4.2     compiler_3.6.1  
 [5] tools_3.6.1      zeallot_0.1.0    digest_0.6.20    lubridate_1.7.4 
 [9] jsonlite_1.6     evaluate_0.14    nlme_3.1-140     gtable_0.3.0    
[13] lattice_0.20-38  pkgconfig_2.0.2  rlang_0.4.0      cli_1.1.0       
[17] rstudioapi_0.10  yaml_2.2.0       haven_2.1.1      xfun_0.8        
[21] withr_2.1.2      xml2_1.2.0       httr_1.4.0       knitr_1.23      
[25] vctrs_0.2.0      hms_0.5.0        generics_0.0.2   fs_1.3.1        
[29] grid_3.6.1       tidyselect_0.2.5 glue_1.3.1       R6_2.4.0        
[33] readxl_1.3.1     rmarkdown_1.14   modelr_0.1.4     magrittr_1.5    
[37] backports_1.1.4  scales_1.0.0     htmltools_0.3.6  rvest_0.3.4     
[41] assertthat_0.2.1 colorspace_1.4-1 stringi_1.4.3    lazyeval_0.2.2  
[45] munsell_0.5.0    broom_0.5.2      crayon_1.3.4    

This R Markdown site was created with workflowr