Sensitivity: Low vs. High USGS estimate

Purpose

This code is intended to investigate the low vs. high pesticide data provided by USGS, in relation to independent pesticide estimates from USDA.

Load libraries + data

library(tidyverse)

## read in data comparing USGS to USDA values, for high and low estimate
high <- read.csv("../output_big/validation_high_estimate.csv") %>%
  mutate(type = "high")

low <- read.csv("../output_big/validation_low_estimate.csv") %>%
  mutate(type = "low")

# bind the two together
full <- rbind(high, low)

## read in full datasets
usgs_high <- read.csv("../output_big/bee_tox_index_state_yr_cmpd_high_20200609.csv") %>%
  select(-starts_with(c("ld50", "source"))) %>%
  rename_with(~ paste0(.x, "_h"), .cols = c("kg_app","kg_ha", "ha")) 

usgs_low <- read.csv("../output_big/bee_tox_index_state_yr_cmpd_20200609.csv") %>%
    select(-starts_with(c("ld50", "source"))) %>%
    rename_with(~ paste0(.x, "_l"), .cols = c("kg_app","kg_ha", "ha")) 

# join + create column for missing value in low estimate
usgs_full <- full_join(usgs_high, usgs_low, by = c("State_FIPS_code","State","Compound","Year","Units","crop",
                                                   "cmpd_usgs","STATE_ALPHA","cat","cat_grp","class","grp_source")) %>%
  mutate(low_missing = ifelse(is.na(kg_app_l), 1, 0))

Restructure data for plotting

high_val <- high %>%
  rename(usgs_high = usgs,
         RE_high = RE) %>%
  select(-diff, -method_pretty, -type)

joined <- low %>%
  rename(usgs_low = usgs,
         RE_low = RE) %>%
  select(-diff, -method_pretty, -type) %>%
  full_join(high_val, by = c("STATE_ALPHA", "YEAR", "usgs_cmpd", "CROP", "methods", "usda_category",
                             "method"))

Plot differences with USDA data

# plot by method of analysis and pesticide category
ggplot(full, aes(x = method, y = RE, fill = type))+
  geom_boxplot() +
  geom_hline(yintercept = 0, linetype = 2) +
  facet_grid("usda_category") +
  theme_classic() +
  coord_fixed(ratio = 0.005) +
  xlab("") +
  ylab("Relative difference to USDA (%)") +
  labs(fill = "USGS estimate")

# plot by method of analysis for all AI combined
ggplot(full, aes(x = method, y = RE, fill = type))+
  geom_boxplot() +
  geom_hline(yintercept = 0, linetype = 2) +
  theme_classic() +
  xlab("") +
  ylab("Relative difference to USDA (%)") +
  labs(fill = "USGS estimate")

Investigate relationship between low and high estimate

# calculate correlation between the two estimates
cor(usgs_full$kg_ha_l, usgs_full$kg_ha_h, use = "pairwise.complete.obs")

## [1] 0.9522947

# plot relationship
ggplot(usgs_full, aes(x = kg_ha_l, y = kg_ha_h)) +
  geom_point(alpha = 0.3) +
  theme_minimal(base_size = 18) +
  xlab("USGS low (kg/ha)") +
  ylab("USGS high (kg/ha)") +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = 2) +
  xlim(c(0,100)) +
  ylim(c(0,100))

Summarize missing values in low dataset

summary(usgs_full$low_missing)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    0.00    0.17    0.00    1.00

# summarize missing data by pesticide category
missing <- usgs_full %>%
  group_by(cat) %>%
  summarize(missing_n = sum(low_missing),
            total_n = n(),
            missing_perc = round(mean(low_missing), 2))
print.data.frame(missing)

##   cat missing_n total_n missing_perc
## 1   F      8319   69901         0.12
## 2 FUM       590    4094         0.14
## 3   H     32736  169828         0.19
## 4   I     17702  101215         0.17
## 5   O       237    2720         0.09
## 6 PGR      1365   10674         0.13

# summarize missing data by pesticide crop
missing_crop <- usgs_full %>%
  group_by(crop) %>%
  summarize(missing_n = sum(low_missing),
            total_n = n(),
            missing_perc = round(mean(low_missing), 2))
print.data.frame(missing_crop)

##                    crop missing_n total_n missing_perc
## 1               Alfalfa      5995   16141         0.37
## 2                  Corn     13040   47467         0.27
## 3                Cotton      3839   24779         0.15
## 4   Orchards_and_grapes      4853   61567         0.08
## 5           Other_crops      6345   42153         0.15
## 6       Pasture_and_hay      4743   14379         0.33
## 7                  Rice       370    4844         0.08
## 8              Soybeans      8905   32846         0.27
## 9  Vegetables_and_fruit      6445   88980         0.07
## 10                Wheat      6414   25276         0.25

# summarize missing data by state
missing_state <- usgs_full %>%
  group_by(State) %>%
  summarize(missing_n = sum(low_missing),
            total_n = n(),
            missing_perc = round(mean(low_missing), 2))
print.data.frame(missing_state)

##             State missing_n total_n missing_perc
## 1         Alabama      2156    8383         0.26
## 2         Arizona       792    7044         0.11
## 3        Arkansas      1193    7470         0.16
## 4      California         0   21550         0.00
## 5        Colorado      1976    7745         0.26
## 6     Connecticut       725    4231         0.17
## 7        Delaware      1011    5075         0.20
## 8         Florida       926    8085         0.11
## 9         Georgia      1405    9506         0.15
## 10          Idaho       892    7330         0.12
## 11       Illinois      1430    7977         0.18
## 12        Indiana      1577    8664         0.18
## 13           Iowa      1038    7008         0.15
## 14         Kansas      1451    7631         0.19
## 15       Kentucky      2044    7780         0.26
## 16      Louisiana      1099    6550         0.17
## 17          Maine       385    2974         0.13
## 18       Maryland      1312    7367         0.18
## 19  Massachusetts       980    4539         0.22
## 20       Michigan      1042    8326         0.13
## 21      Minnesota      1724    8771         0.20
## 22    Mississippi      1507    7809         0.19
## 23       Missouri      2019    9378         0.22
## 24        Montana      1765    6653         0.27
## 25       Nebraska      1916    7732         0.25
## 26         Nevada       796    6743         0.12
## 27  New Hampshire       665    3404         0.20
## 28     New Jersey       772    5610         0.14
## 29     New Mexico      2372    9086         0.26
## 30       New York       665    6158         0.11
## 31 North Carolina      1357    9282         0.15
## 32   North Dakota      1329    6194         0.21
## 33           Ohio      1894    8671         0.22
## 34       Oklahoma      2005    8648         0.23
## 35         Oregon      1157    8430         0.14
## 36   Pennsylvania       958    7616         0.13
## 37   Rhode Island       580    3201         0.18
## 38 South Carolina      1353    8386         0.16
## 39   South Dakota      1848    6892         0.27
## 40      Tennessee      2190    9307         0.24
## 41          Texas      1129   10445         0.11
## 42           Utah       924    5152         0.18
## 43        Vermont       709    4094         0.17
## 44       Virginia      1777    8905         0.20
## 45     Washington       769    7697         0.10
## 46  West Virginia       972    6287         0.15
## 47      Wisconsin      1059    6909         0.15
## 48        Wyoming      1304    5737         0.23

This R Markdown site was created with workflowr

Sensitivity: Low vs. High USGS estimate

Maggie Douglas

Purpose

Load libraries + data

Restructure data for plotting

Plot differences with USDA data

Investigate relationship between low and high estimate

Summarize missing values in low dataset