Last updated: 2019-07-25

Attach packages

library(timetk)
library(lubridate)
library(dtwclust)
library(sf)
library(tigris)
library(tidyverse)

Define functions

# Prep data for time series clustering
data_prep <- function(df) {
  dat <- filter(df, YEAR %in% c(1997:2012)) %>% # constrain to 1997-2012
    mutate(year = lubridate::ymd(YEAR, truncated = 2, tz = "EST")) %>%  # reformat year field
    select(year, # select and rename columns
           fips,
           region = region_name,
           class = class_short, 
           ctox_low = ct_tox_bil_low, 
           ctox_high = ct_tox_bil_high, 
           ortox_low = or_tox_bil_low, 
           ortox_high = or_tox_bil_high) %>%
    na.omit() # Northern Crescent has an NA for year in one row; pesticide data re also NA for this entry, so it can just be omitted
}

clust_prep <- function(df) {
  df %>% group_by(year, region) %>%  # aggregate toxic load by year and region
    summarise(agg_ctox_low = sum(ctox_low),
              agg_ctox_high = sum(ctox_high),
              agg_ortox_low = sum(ortox_low),
              agg_ortox_high = sum(ortox_high)) %>%
    ungroup() %>%
    gather(metric, value, -year, -region) %>% # gather into tidy array
    group_by(region, metric) %>%
    mutate(fold_change = value/value[[1]]) %>% # calculate fold change; divides each value by the first value, grouped by region and metric
    select(year, region, metric, fold_change) %>%
    nest() %>% # collapse into list column in prep for xts conversion
    mutate(data_xts = map(data, timetk::tk_xts)) %>% # convert to xts time series
    select(region, metric, data, data_xts)
}

plot_prep <- function(df) {
  df %>% group_by(year, region, class) %>%
    summarise(agg_ctox_low = sum(ctox_low),
              agg_ctox_high = sum(ctox_high),
              agg_ortox_low = sum(ortox_low),
              agg_ortox_high = sum(ortox_high)) %>%
    gather(metric, value, -year, -region, -class)
}
  

hist_prep <- function(df) {
  df %>% group_by(year, fips) %>%  # aggregate toxic load by year and region
    summarise(agg_ctox_low = sum(ctox_low),
              agg_ctox_high = sum(ctox_high),
              agg_ortox_low = sum(ortox_low),
              agg_ortox_high = sum(ortox_high)) %>%
    ungroup() %>%
    gather(metric, value, -year, -fips)
}

outlier_prep <- function(df) {
  df %>% group_by(year, fips, class) %>%  # aggregate toxic load by year and region
    summarise(agg_ctox_low = sum(ctox_low),
              agg_ctox_high = sum(ctox_high),
              agg_ortox_low = sum(ortox_low),
              agg_ortox_high = sum(ortox_high)) %>%
    ungroup() %>%
    gather(metric, value, -year, -fips, -class)
}

# Cluster
hclust <- function(df, metric) {
  
  metric <- enquo(metric) # hey, look, I can do tidy eval
  
  df_sub <- df %>% # filter to select metric for clustering
    filter(metric == !! metric) %>%
    select(region, data_xts)
  
  names(df_sub$data_xts) <- df_sub$region # name each time series in the list so that the output of tsclust is readable
  
  dtwclust::tsclust(df_sub$data_xts,   # perform clustering...
                           type = "hierarchical", # hierarchical
                           preproc = NULL, # we're sticking with fold change
                           distance = "Euclidean", # using simple euclidean distance metric
                           control = hierarchical_control(method = "ward.D2"), # and Ward's linkage method
                           trace = FALSE)
}

Load and prep data

cty_dat <- read_csv("../output_big/bee_tox_index_cty_class.csv", 
                    col_names = TRUE,
                    guess_max = 10000) # Increasing guess_max is necessary to avoid parsing errors due to wrong guesses of data type
prep_dat <- data_prep(cty_dat)
clust_dat <- clust_prep(prep_dat)
hist_dat <- hist_prep(prep_dat)
outlier_dat <- outlier_prep(prep_dat)
plot_dat <- plot_prep(prep_dat)

Hierarchical clustering to approximate appropriate number of clusters

pdf("../output/ctox_low_dend.pdf")
ctox_low <- hclust(clust_dat, "agg_ctox_low")
plot(ctox_low)
dev.off()
quartz_off_screen 
                2 
pdf("../output/ctox_high_dend.pdf")
ctox_high <- hclust(clust_dat, "agg_ctox_high")
plot(ctox_high)
dev.off()
quartz_off_screen 
                2 
pdf("../output/ortox_low_dend.pdf")
ortox_low <- hclust(clust_dat, "agg_ortox_low")
plot(ortox_low)
dev.off()
quartz_off_screen 
                2 
pdf("../output/ortox_high_dend.pdf")
ortox_high <- hclust(clust_dat, "agg_ortox_high")
plot(ortox_high)
dev.off()
quartz_off_screen 
                2 

Stacked bar plot

pdf("../output/ortox_low_stackbar.pdf", height = 8.5, width = 11)
ggplot(filter(plot_dat, metric == "agg_ortox_low"), 
              aes(x = year, y = value, fill = class)) +
  geom_bar(stat = "identity") +
  facet_wrap(~region, scales = "free")
dev.off()
quartz_off_screen 
                2 
pdf("../output/cttox_low_stackbar.pdf", height = 8.5, width = 11)
ggplot(filter(plot_dat, metric == "agg_ctox_low"), 
              aes(x = year, y = value, fill = class)) +
  geom_bar(stat = "identity") +
  facet_wrap(~region, scales = "free")
dev.off()
quartz_off_screen 
                2 

Generate USDA regions map

### Counties and states of the continental US
cty <- st_as_sf(counties()) %>%
  mutate(fips = as.integer(str_c(STATEFP, COUNTYFP, sep = ""))) %>%
  filter(!STATEFP %in% c("02", "15", "60", "74", "78", "72", "69", "66")) %>% # Filter to include only continental US
  select(fips, NAME, geometry) 

sts <- st_as_sf(states()) %>%
    filter(!STATEFP %in% c("02", "15", "60", "74", "78", "72", "69", "66")) %>% # Filter to include only continental US
    select(STUSPS, geometry)

Load county-level pesticide use data and split by year

### This is the county-level USGS pesticide data set; my interest at the moment is the extraction of region names  
use <- read_csv("../output_big/bee_tox_index_cty_class.csv", col_names = TRUE) %>% 
  select(YEAR, region_name, fips) %>%
  filter(YEAR == 2012) %>%
  mutate(fips = as.integer(fips)) %>% # convert to integer so that leading zeros won't matter during join operation (to follow)
  distinct()
Warning: 655512 parsing failures.
  row            col           expected       actual                                        file
91529 crop_ha        1/0/T/F/TRUE/FALSE 19858.751392 '../output_big/bee_tox_index_cty_class.csv'
91529 crop_ha_interp 1/0/T/F/TRUE/FALSE no           '../output_big/bee_tox_index_cty_class.csv'
91529 past_ha        1/0/T/F/TRUE/FALSE 6253.208072  '../output_big/bee_tox_index_cty_class.csv'
91529 past_ha_interp 1/0/T/F/TRUE/FALSE no           '../output_big/bee_tox_index_cty_class.csv'
91529 trt_ins_ha     1/0/T/F/TRUE/FALSE 2891.48147   '../output_big/bee_tox_index_cty_class.csv'
..... .............. .................. ............ ...........................................
See problems(...) for more details.

Make sf object

cty_use <- full_join(cty, use, by = "fips") %>%
  select(YEAR, fips, region = region_name, geometry) %>%
  st_as_sf() %>%
  group_by(region) %>%
  summarize() %>%
  na.omit()

pdf("../output/USDA_regions.pdf", height = 8.5, width = 11)
ggplot(cty_use, aes()) +
  geom_sf(aes(fill = region)) +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal() +
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.title.y = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        legend.title=element_blank())
dev.off()
quartz_off_screen 
                2 

Session information

sessionInfo()
R version 3.6.1 (2019-07-05)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS High Sierra 10.13.6

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib

Random number generation:
 RNG:     L'Ecuyer-CMRG 
 Normal:  Inversion 
 Sample:  Rejection 
 
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] forcats_0.4.0   stringr_1.4.0   dplyr_0.8.3     purrr_0.3.2    
 [5] readr_1.3.1     tidyr_0.8.3     tibble_2.1.3    ggplot2_3.2.0  
 [9] tidyverse_1.2.1 tigris_0.8.2    sf_0.7-6        dtwclust_5.5.4 
[13] dtw_1.20-1      proxy_0.4-23    lubridate_1.7.4 timetk_0.1.1.1 

loaded via a namespace (and not attached):
 [1] nlme_3.1-140        xts_0.11-2          RColorBrewer_1.1-2 
 [4] httr_1.4.0          tools_3.6.1         backports_1.1.4    
 [7] rgdal_1.4-4         R6_2.4.0            KernSmooth_2.23-15 
[10] DBI_1.0.0           lazyeval_0.2.2      colorspace_1.4-1   
[13] withr_2.1.2         sp_1.3-1            tidyselect_0.2.5   
[16] curl_3.3            compiler_3.6.1      flexclust_1.4-0    
[19] cli_1.1.0           rvest_0.3.4         xml2_1.2.0         
[22] shinyjs_1.0         labeling_0.3        scales_1.0.0       
[25] classInt_0.3-3      rappdirs_0.3.1      digest_0.6.20      
[28] foreign_0.8-71      rmarkdown_1.14      pkgconfig_2.0.2    
[31] htmltools_0.3.6     rlang_0.4.0         readxl_1.3.1       
[34] rstudioapi_0.10     shiny_1.3.2         generics_0.0.2     
[37] zoo_1.8-6           jsonlite_1.6        magrittr_1.5       
[40] modeltools_0.2-22   Matrix_1.2-17       Rcpp_1.0.1         
[43] munsell_0.5.0       stringi_1.4.3       yaml_2.2.0         
[46] plyr_1.8.4          grid_3.6.1          maptools_0.9-5     
[49] parallel_3.6.1      promises_1.0.1      ggrepel_0.8.1      
[52] bigmemory.sri_0.1.3 crayon_1.3.4        lattice_0.20-38    
[55] haven_2.1.1         hms_0.5.0           zeallot_0.1.0      
[58] knitr_1.23          pillar_1.4.2        uuid_0.1-2         
[61] reshape2_1.4.3      codetools_0.2-16    stats4_3.6.1       
[64] glue_1.3.1          evaluate_0.14       RcppParallel_4.4.3 
[67] modelr_0.1.4        vctrs_0.2.0         nloptr_1.2.1       
[70] httpuv_1.5.1        foreach_1.4.4       cellranger_1.1.0   
[73] gtable_0.3.0        clue_0.3-57         assertthat_0.2.1   
[76] xfun_0.8            mime_0.7            xtable_1.8-4       
[79] broom_0.5.2         e1071_1.7-2         RSpectra_0.15-0    
[82] later_0.8.0         class_7.3-15        iterators_1.0.10   
[85] units_0.6-3         cluster_2.1.0       bigmemory_4.5.33   

This R Markdown site was created with workflowr