Last updated: 2019-07-25
library(timetk)
library(lubridate)
library(dtwclust)
library(sf)
library(tigris)
library(tidyverse)
# Prep data for time series clustering
data_prep <- function(df) {
dat <- filter(df, YEAR %in% c(1997:2012)) %>% # constrain to 1997-2012
mutate(year = lubridate::ymd(YEAR, truncated = 2, tz = "EST")) %>% # reformat year field
select(year, # select and rename columns
fips,
region = region_name,
class = class_short,
ctox_low = ct_tox_bil_low,
ctox_high = ct_tox_bil_high,
ortox_low = or_tox_bil_low,
ortox_high = or_tox_bil_high) %>%
na.omit() # Northern Crescent has an NA for year in one row; pesticide data re also NA for this entry, so it can just be omitted
}
clust_prep <- function(df) {
df %>% group_by(year, region) %>% # aggregate toxic load by year and region
summarise(agg_ctox_low = sum(ctox_low),
agg_ctox_high = sum(ctox_high),
agg_ortox_low = sum(ortox_low),
agg_ortox_high = sum(ortox_high)) %>%
ungroup() %>%
gather(metric, value, -year, -region) %>% # gather into tidy array
group_by(region, metric) %>%
mutate(fold_change = value/value[[1]]) %>% # calculate fold change; divides each value by the first value, grouped by region and metric
select(year, region, metric, fold_change) %>%
nest() %>% # collapse into list column in prep for xts conversion
mutate(data_xts = map(data, timetk::tk_xts)) %>% # convert to xts time series
select(region, metric, data, data_xts)
}
plot_prep <- function(df) {
df %>% group_by(year, region, class) %>%
summarise(agg_ctox_low = sum(ctox_low),
agg_ctox_high = sum(ctox_high),
agg_ortox_low = sum(ortox_low),
agg_ortox_high = sum(ortox_high)) %>%
gather(metric, value, -year, -region, -class)
}
hist_prep <- function(df) {
df %>% group_by(year, fips) %>% # aggregate toxic load by year and region
summarise(agg_ctox_low = sum(ctox_low),
agg_ctox_high = sum(ctox_high),
agg_ortox_low = sum(ortox_low),
agg_ortox_high = sum(ortox_high)) %>%
ungroup() %>%
gather(metric, value, -year, -fips)
}
outlier_prep <- function(df) {
df %>% group_by(year, fips, class) %>% # aggregate toxic load by year and region
summarise(agg_ctox_low = sum(ctox_low),
agg_ctox_high = sum(ctox_high),
agg_ortox_low = sum(ortox_low),
agg_ortox_high = sum(ortox_high)) %>%
ungroup() %>%
gather(metric, value, -year, -fips, -class)
}
# Cluster
hclust <- function(df, metric) {
metric <- enquo(metric) # hey, look, I can do tidy eval
df_sub <- df %>% # filter to select metric for clustering
filter(metric == !! metric) %>%
select(region, data_xts)
names(df_sub$data_xts) <- df_sub$region # name each time series in the list so that the output of tsclust is readable
dtwclust::tsclust(df_sub$data_xts, # perform clustering...
type = "hierarchical", # hierarchical
preproc = NULL, # we're sticking with fold change
distance = "Euclidean", # using simple euclidean distance metric
control = hierarchical_control(method = "ward.D2"), # and Ward's linkage method
trace = FALSE)
}
cty_dat <- read_csv("../output_big/bee_tox_index_cty_class.csv",
col_names = TRUE,
guess_max = 10000) # Increasing guess_max is necessary to avoid parsing errors due to wrong guesses of data type
prep_dat <- data_prep(cty_dat)
clust_dat <- clust_prep(prep_dat)
hist_dat <- hist_prep(prep_dat)
outlier_dat <- outlier_prep(prep_dat)
plot_dat <- plot_prep(prep_dat)
pdf("../output/ctox_low_dend.pdf")
ctox_low <- hclust(clust_dat, "agg_ctox_low")
plot(ctox_low)
dev.off()
quartz_off_screen
2
pdf("../output/ctox_high_dend.pdf")
ctox_high <- hclust(clust_dat, "agg_ctox_high")
plot(ctox_high)
dev.off()
quartz_off_screen
2
pdf("../output/ortox_low_dend.pdf")
ortox_low <- hclust(clust_dat, "agg_ortox_low")
plot(ortox_low)
dev.off()
quartz_off_screen
2
pdf("../output/ortox_high_dend.pdf")
ortox_high <- hclust(clust_dat, "agg_ortox_high")
plot(ortox_high)
dev.off()
quartz_off_screen
2
pdf("../output/ortox_low_stackbar.pdf", height = 8.5, width = 11)
ggplot(filter(plot_dat, metric == "agg_ortox_low"),
aes(x = year, y = value, fill = class)) +
geom_bar(stat = "identity") +
facet_wrap(~region, scales = "free")
dev.off()
quartz_off_screen
2
pdf("../output/cttox_low_stackbar.pdf", height = 8.5, width = 11)
ggplot(filter(plot_dat, metric == "agg_ctox_low"),
aes(x = year, y = value, fill = class)) +
geom_bar(stat = "identity") +
facet_wrap(~region, scales = "free")
dev.off()
quartz_off_screen
2
### Counties and states of the continental US
cty <- st_as_sf(counties()) %>%
mutate(fips = as.integer(str_c(STATEFP, COUNTYFP, sep = ""))) %>%
filter(!STATEFP %in% c("02", "15", "60", "74", "78", "72", "69", "66")) %>% # Filter to include only continental US
select(fips, NAME, geometry)
sts <- st_as_sf(states()) %>%
filter(!STATEFP %in% c("02", "15", "60", "74", "78", "72", "69", "66")) %>% # Filter to include only continental US
select(STUSPS, geometry)
### This is the county-level USGS pesticide data set; my interest at the moment is the extraction of region names
use <- read_csv("../output_big/bee_tox_index_cty_class.csv", col_names = TRUE) %>%
select(YEAR, region_name, fips) %>%
filter(YEAR == 2012) %>%
mutate(fips = as.integer(fips)) %>% # convert to integer so that leading zeros won't matter during join operation (to follow)
distinct()
Warning: 655512 parsing failures.
row col expected actual file
91529 crop_ha 1/0/T/F/TRUE/FALSE 19858.751392 '../output_big/bee_tox_index_cty_class.csv'
91529 crop_ha_interp 1/0/T/F/TRUE/FALSE no '../output_big/bee_tox_index_cty_class.csv'
91529 past_ha 1/0/T/F/TRUE/FALSE 6253.208072 '../output_big/bee_tox_index_cty_class.csv'
91529 past_ha_interp 1/0/T/F/TRUE/FALSE no '../output_big/bee_tox_index_cty_class.csv'
91529 trt_ins_ha 1/0/T/F/TRUE/FALSE 2891.48147 '../output_big/bee_tox_index_cty_class.csv'
..... .............. .................. ............ ...........................................
See problems(...) for more details.
cty_use <- full_join(cty, use, by = "fips") %>%
select(YEAR, fips, region = region_name, geometry) %>%
st_as_sf() %>%
group_by(region) %>%
summarize() %>%
na.omit()
pdf("../output/USDA_regions.pdf", height = 8.5, width = 11)
ggplot(cty_use, aes()) +
geom_sf(aes(fill = region)) +
scale_fill_brewer(palette = "Set3") +
theme_minimal() +
theme(axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
legend.title=element_blank())
dev.off()
quartz_off_screen
2
sessionInfo()
R version 3.6.1 (2019-07-05)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS High Sierra 10.13.6
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
Random number generation:
RNG: L'Ecuyer-CMRG
Normal: Inversion
Sample: Rejection
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] forcats_0.4.0 stringr_1.4.0 dplyr_0.8.3 purrr_0.3.2
[5] readr_1.3.1 tidyr_0.8.3 tibble_2.1.3 ggplot2_3.2.0
[9] tidyverse_1.2.1 tigris_0.8.2 sf_0.7-6 dtwclust_5.5.4
[13] dtw_1.20-1 proxy_0.4-23 lubridate_1.7.4 timetk_0.1.1.1
loaded via a namespace (and not attached):
[1] nlme_3.1-140 xts_0.11-2 RColorBrewer_1.1-2
[4] httr_1.4.0 tools_3.6.1 backports_1.1.4
[7] rgdal_1.4-4 R6_2.4.0 KernSmooth_2.23-15
[10] DBI_1.0.0 lazyeval_0.2.2 colorspace_1.4-1
[13] withr_2.1.2 sp_1.3-1 tidyselect_0.2.5
[16] curl_3.3 compiler_3.6.1 flexclust_1.4-0
[19] cli_1.1.0 rvest_0.3.4 xml2_1.2.0
[22] shinyjs_1.0 labeling_0.3 scales_1.0.0
[25] classInt_0.3-3 rappdirs_0.3.1 digest_0.6.20
[28] foreign_0.8-71 rmarkdown_1.14 pkgconfig_2.0.2
[31] htmltools_0.3.6 rlang_0.4.0 readxl_1.3.1
[34] rstudioapi_0.10 shiny_1.3.2 generics_0.0.2
[37] zoo_1.8-6 jsonlite_1.6 magrittr_1.5
[40] modeltools_0.2-22 Matrix_1.2-17 Rcpp_1.0.1
[43] munsell_0.5.0 stringi_1.4.3 yaml_2.2.0
[46] plyr_1.8.4 grid_3.6.1 maptools_0.9-5
[49] parallel_3.6.1 promises_1.0.1 ggrepel_0.8.1
[52] bigmemory.sri_0.1.3 crayon_1.3.4 lattice_0.20-38
[55] haven_2.1.1 hms_0.5.0 zeallot_0.1.0
[58] knitr_1.23 pillar_1.4.2 uuid_0.1-2
[61] reshape2_1.4.3 codetools_0.2-16 stats4_3.6.1
[64] glue_1.3.1 evaluate_0.14 RcppParallel_4.4.3
[67] modelr_0.1.4 vctrs_0.2.0 nloptr_1.2.1
[70] httpuv_1.5.1 foreach_1.4.4 cellranger_1.1.0
[73] gtable_0.3.0 clue_0.3-57 assertthat_0.2.1
[76] xfun_0.8 mime_0.7 xtable_1.8-4
[79] broom_0.5.2 e1071_1.7-2 RSpectra_0.15-0
[82] later_0.8.0 class_7.3-15 iterators_1.0.10
[85] units_0.6-3 cluster_2.1.0 bigmemory_4.5.33
This R Markdown site was created with workflowr