Predicting Human Fecal Contamination of Produce Irrigation Water
Version 4.1
Description
R code to conduct the analysis of human fecal contamination in produce irrigation ponds using predictive models, as reported in:
Hofstetter J, Holcomb DA, Kahler AM, Rodrigues C, da Silva ALBR, Mattioli MC. (2024). Performance of conditional random forest and regression models at predicting human fecal contamination of produce irrigation ponds in the southeastern United States. ACS ES&T Water. https://doi.org/10.1021/acsestwater.4c00839.
This analysis code and the study data are available in the manuscript repository.
Setup
Load packages, set options, and define general functions.
## Options
options(tidyverse.quiet = TRUE)
options(dplyr.summarise.inform = FALSE)
options(knitr.kable.NA = '')
## Packages
# Utilities
library(readxl)    # I/O
library(lubridate) # date formatting
library(broman)    # number formatting
library(zoo)       # time series data
# Models
library(lme4)        # mixed effect regression models
library(party)       # conditional random forest models
library(mlr)         # framework for ML model development
library(broom)       # extract model estimates
library(broom.mixed) # extract model estimates (lme4)
#library(pROC)       # ROC curve analysis (conflicts, call functions directly)
# Graphs
library(corrplot)  # plotting correlations
library(patchwork) # combine plots
library(ggh4x)     # nested facets
library(ragg)      # more consistent graphics output
# Tidyverse
library(knitr)     # rendering
library(quarto)    # rendering
library(tidyverse) # load last to avoid conflicts (particularly with data.table)
## Define general functions
`%notin%` <- function(x, y) !(x %in% y)
Data
We have two separate datasets from the same growing region. The first, collected in 2020-2021, is used to tune and train the predictive models of fecal contamination. The second, collected in 2015-2016, is used only as a test dataset to assess the predictive performance of the models. The data are provided in a single file. Read the file, transform certain variables for analysis, and save separately in RDS format for convenience.
## read combined dataset
df_all_raw <- data.table::fread("../out/pond_data.csv")
## transform variables for analysis
df_all <- df_all_raw %>%
  mutate(dt_collect = as_date(dt_collect),
         turb = log10(turb + 1),
         cond = log10(cond),
         across(starts_with("solar"), ~log10(.x)))
## save transformed training data
df_train <- df_all %>%
  filter(set == "train")
saveRDS(df_train, "../out/data_train.rds")
## save transformed test data
df_test <- df_all %>%
  filter(set == "test")
saveRDS(df_test, "../out/data_test.rds")
Train Models
We implement two modeling approaches to predict the presence of fecal indicators in fresh produce irrigation pond water: logistic regression and conditional random forests. Both modeling approaches are applied separately to three dichotomous outcomes: HF183 detection, “human fecal indicator” (HFI) detection—HF183 or a human-associated bacteriophage (crAssphage in the training dataset and FRNA GII coliphage in the test dataset)—and elevated generic E. coli concentrations (≥126 MPN/100 mL).
Correlation
Explanatory variables considered for inclusion in both types of models are first screened for pairwise correlations; variable pairs with correlation absolute values ≥0.5 are not included in the same models. Estimate pairwise Pearson correlations between all explanatory variables and plot a correlogram.
## labels
<- c("building", "week",
cor_var "rain2", "rain7",
"solar2", "solar7",
"wind2", "wind7",
"temp", "ph", "do",
"cond", "turb")
<- read_xlsx("../data/variable_names.xlsx") %>%
cor_lab filter(term %in% cor_var) %>%
mutate(term = fct_relevel(term, cor_var)) %>%
arrange(term) %>%
select(label) %>%
pull()
## variables to correlate
<- readRDS("../out/data_train.rds") %>%
df_train_corr select(building, week,
rain2, rain7,
solar2, solar7,
wind2, wind7,
temp, ph, do,
cond, turb)
names(df_train_corr) <- cor_lab
## calculate correlations
cormat <- cor(df_train_corr)
## prepare correlation plot
corlist <- corrplot(cormat,
                    type = "upper", method = 'number',
                    order = "hclust", addrect = 3,
                    tl.col = "black", tl.srt = 45,
                    number.cex = 0.8)$corrPos
## identify highly correlated (positive or negative) variables
cor_hi <- corlist %>%
  mutate(corr_abs = abs(corr),
         corr_coarse = round(corr_abs, 1)) %>%
  filter(corr_coarse >= 0.5,
         !(xName == yName))
Unsurprisingly, we see high correlations (absolute value > 0.5) between both solar variables and temperature. Solar 0-2 has a slightly higher association with temperature than Solar 2-7 does, so if we include only Solar 0-2 in the multivariable models (excluding both Solar 2-7 and temperature) we should capture more of the influence of both excluded variables while avoiding highly correlated predictors. Additionally, pH and dissolved oxygen are fairly highly correlated at 0.47, but do not quite exceed the 0.5 correlation threshold.
Also examine associations between the outcome variables by Cochran-Mantel-Haenszel chi-squared test stratified by pond.
## HF183 & crAssphage
<- readRDS("../out/data_train.rds") %>%
tab_hf_crass select(hf183, phage, pond) %>%
table()
mantelhaen.test(tab_hf_crass)
Mantel-Haenszel chi-squared test with continuity correction
data: tab_hf_crass
Mantel-Haenszel X-squared = 5.1106, df = 1, p-value = 0.02378
alternative hypothesis: true common odds ratio is not equal to 1
95 percent confidence interval:
1.356967 18.170199
sample estimates:
common odds ratio
4.965517
## HF183 & E. coli > 126
<- readRDS("../out/data_train.rds") %>%
tab_hf_ec select(hf183, ecoli, pond) %>%
table()
mantelhaen.test(tab_hf_ec)
Mantel-Haenszel chi-squared test with continuity correction
data: tab_hf_ec
Mantel-Haenszel X-squared = 1.8212, df = 1, p-value = 0.1772
alternative hypothesis: true common odds ratio is not equal to 1
95 percent confidence interval:
0.7617406 6.4196774
sample estimates:
common odds ratio
2.211364
Regression
Logistic regression models are implemented as generalized linear mixed models (GLMM) to account for repeated measures at each pond. Univariable models are first fit for each outcome/explanatory variable pair to screen for association. Multivariable models are then fit for each outcome including the explanatory variables with univariable \(p < 0.1\) and backwards elimination is used to obtain the final explanatory variable sets.
Univariable
Fit a univariable model for each combination of explanatory variable and outcome, including pond as a random effect.
## long data
<- readRDS("../out/data_train.rds") %>%
df_reg_uni select(pond,
week, building,
temp, turb,
cond, do, ph,
rain2, rain7,
solar2, solar7,
wind2, wind7,%>%
ecoli, hf183, hfi) pivot_longer(c(ecoli, hf183, hfi),
names_to = "outcome",
values_to = "detect") %>%
pivot_longer(c(week, building, temp, turb, cond, do, ph,
rain2, rain7, solar2, solar7, wind2, wind7),names_to = "predictor",
values_to = "value")
## fit
fit_reg_uni <- df_reg_uni %>%
  group_by(outcome, predictor) %>%
  nest() %>%
  mutate(fit = map(data,
                   ~glmer(formula = detect ~ value + (1 | pond),
                          data = .x,
                          family = binomial(link = "logit"),
                          control = glmerControl(optimizer = "bobyqa",
                                                 optCtrl = list(maxfun = 2e5)))),
         est = map(fit, ~tidy(.x, exponentiate = TRUE, conf.int = TRUE))) %>%
  ungroup()
saveRDS(fit_reg_uni, "../fit/reg_uni.rds")
## Estimates
est_reg_uni <- fit_reg_uni %>%
  select(outcome, predictor, est) %>%
  unnest(est)
saveRDS(est_reg_uni, "../out/est_reg_uni.rds")
## filter for only explanatory variables with p < 0.1 to include in multivariable models
# variable names
est_reg_p0.1 <- est_reg_uni %>%
  filter(term == "value") %>%
  left_join(read_xlsx("../data/variable_names.xlsx"),
            by = c("predictor" = "term")) %>%
  mutate(outcome = fct_relevel(outcome, c("hf183", "hfi", "ecoli")),
         predictor = fct_relevel(predictor,
                                 c("building", "week",
                                   "rain2", "rain7",
                                   "solar2", "solar7",
                                   "wind2", "wind7",
                                   "temp", "ph", "do",
                                   "cond", "turb"))) %>%
  arrange(outcome, predictor) %>%
  filter(p.value < 0.1)
Plot the univariable associations.
## estimates
# variable names
var_lab <- read_xlsx("../data/variable_names.xlsx")

# raw
est_reg_uni <- readRDS("../out/est_reg_uni.rds")
# format
df_plot_reg_uni <- est_reg_uni %>%
  filter(term == "value") %>%
  left_join(var_lab, by = c("predictor" = "term")) %>%
  mutate(predictor = fct_relevel(predictor,
                                 c("building", "week",
                                   "rain2", "rain7",
                                   "solar2", "solar7",
                                   "wind2", "wind7",
                                   "temp", "ph", "do",
                                   "cond", "turb"))) %>%
  arrange(predictor) %>%
  mutate(label = fct_inorder(label) %>%
           fct_recode("Dissolved\noxygen" = "Dissolved oxygen") %>%
           fct_rev(),
         signif = case_when(p.value < 0.01 ~ "p < 0.01",
                            p.value >= 0.01 & p.value < 0.05 ~ "p < 0.05",
                            p.value >= 0.05 & p.value < 0.1 ~ "p < 0.1",
                            p.value >= 0.1 ~ "p ≥ 0.1",
                            TRUE ~ NA_character_) %>%
           fct_relevel(c("p < 0.01", "p < 0.05", "p < 0.1")),
         out_lab = fct_relevel(outcome, c("hf183", "hfi", "ecoli")) %>%
           fct_recode("(A) HF183 detection" = "hf183",
                      "(B) HFI detection" = "hfi",
                      "(C) *E. coli* ≥126 MPN/100 mL" = "ecoli"))
p_or_uni <- df_plot_reg_uni %>%
  ggplot(aes(x = estimate, y = label, color = signif)) +
facet_grid(cols = vars(out_lab)) +
geom_vline(xintercept = 1, color = "red",
linetype = "dashed", linewidth = 1,
alpha = 0.5) +
geom_point(shape = 18, size = 5) +
geom_errorbarh(aes(xmin = conf.low, xmax = conf.high),
height = 0.15) +
scale_x_log10(breaks = c(0.001, 0.01, 0.1, 1, 10, 100),
labels = c("0.001", "0.01", "0.1", "1", "10", "100")) +
coord_cartesian(xlim = c(0.001, NA)) +
scale_color_viridis_d(begin = 0.05, end = 0.95,
option = "plasma", direction = 1,
name = "Significance") +
labs(x = "odds ratio",
y = NULL) +
theme_bw() +
theme(strip.text = ggtext::element_markdown(hjust = 0),
legend.position = "top",
panel.spacing = unit(0.5, "lines"))
ggsave(filename = "../fig/forest_or_uni.jpg",
plot = p_or_uni,
device = ragg::agg_jpeg, dpi = 300,
width = 8, height = 6, units = "in")
We are retaining explanatory variables with \(p < 0.1\) in the univariable models as candidates for the multivariable models. This criterion leads to the following full models:
- HF183 detection: Building, ISO week, Rain 0-2, Rain 2-7, Solar 0-2
- HFI detection: Building, ISO week, Rain 0-2, Rain 2-7, Solar 0-2
- E. coli ≥126 MPN/100 mL: ISO week, Rain 0-2, Solar 0-2, Wind 2-7, Temperature, pH, Conductivity
Stepwise selection
We use backwards elimination: at each step a chi-square test compares the deviances of the larger and reduced models, and if \(p \geq 0.1\) we favor the simpler model (fewer explanatory variables). Define a function to identify the explanatory variable with the highest p-value in a model fit, which will be dropped to form the simpler model.
## function to ID highest p-value predictor
pick_hi_p <- function(fit){
  
  fit %>%
    tidy() %>%
    filter(effect == "fixed",
           term != "(Intercept)") %>%
    mutate(pval_hi = max(p.value)) %>%
    filter(p.value == pval_hi)
}
Beginning with the HF183 outcome, fit the full model with all the explanatory variables identified in the univariable analysis.
## Data
<- readRDS("../out/data_train.rds")
df_reg
## HF183
# full
m183_full <- glmer(hf183 ~ building + week + rain2 + rain7 + solar2 + (1 | pond),
                   data = df_reg,
                   family = binomial(link = "logit"),
                   control = glmerControl(optimizer = "bobyqa",
                                          optCtrl = list(maxfun = 2e5)))
pick_hi_p(m183_full)
# A tibble: 1 × 8
effect group term estimate std.error statistic p.value pval_hi
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 fixed <NA> week -0.00686 0.0143 -0.480 0.632 0.632
ISO Week is least significant (highest p-value), so we drop it for the next model.
# reduced (-week)
m183_r1 <- glmer(hf183 ~ building + rain2 + rain7 + solar2 + (1 | pond),
                 data = df_reg,
                 family = binomial(link = "logit"),
                 control = glmerControl(optimizer = "bobyqa",
                                        optCtrl = list(maxfun = 2e5)))
# compare full and reduced models
# non-significant chi-square test means more complex model is no better than simpler model
anova(m183_full, m183_r1)
Data: df_reg
Models:
m183_r1: hf183 ~ building + rain2 + rain7 + solar2 + (1 | pond)
m183_full: hf183 ~ building + week + rain2 + rain7 + solar2 + (1 | pond)
npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
m183_r1 6 199.66 219.94 -93.832 187.66
m183_full 7 201.44 225.10 -93.720 187.44 0.2246 1 0.6356
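As a check on reading this output: the Chisq statistic is simply the difference in deviances between the nested models, referred to a \(\chi^2\) distribution with degrees of freedom equal to the number of dropped parameters. Here, \(187.66 - 187.44 \approx 0.22\) on \(7 - 6 = 1\) df, giving \(p \approx 0.64\).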
pick_hi_p(m183_r1)
# A tibble: 1 × 8
effect group term estimate std.error statistic p.value pval_hi
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 fixed <NA> solar2 -1.05 0.866 -1.22 0.224 0.224
The full model did not give significantly lower deviance than the simpler model, so we use the simpler model and repeat the procedure again, this time eliminating the next-highest p-value predictor, Solar 0-2.
# reduced (-solar2)
m183_r2 <- glmer(hf183 ~ building + rain2 + rain7 + (1 | pond),
                 data = df_reg,
                 family = binomial(link = "logit"),
                 control = glmerControl(optimizer = "bobyqa",
                                        optCtrl = list(maxfun = 2e5)))
# compare full and reduced models
# non-significant chi-square test means more complex model is no better than simpler model
anova(m183_r1, m183_r2)
Data: df_reg
Models:
m183_r2: hf183 ~ building + rain2 + rain7 + (1 | pond)
m183_r1: hf183 ~ building + rain2 + rain7 + solar2 + (1 | pond)
npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
m183_r2 5 199.13 216.03 -94.566 189.13
m183_r1 6 199.66 219.94 -93.832 187.66 1.4671 1 0.2258
pick_hi_p(m183_r2)
# A tibble: 1 × 8
effect group term estimate std.error statistic p.value pval_hi
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 fixed <NA> rain2 0.715 0.369 1.94 0.0526 0.0526
Again, the test was not significant, suggesting that we can safely drop the Solar 0-2 variable. Next we drop Rain 0-2, the variable with the next-highest p-value.
# reduced (-rain2)
m183_r3 <- glmer(hf183 ~ rain7 + building + (1 | pond),
                 data = df_reg,
                 family = binomial(link = "logit"),
                 control = glmerControl(optimizer = "bobyqa",
                                        optCtrl = list(maxfun = 2e5)))
# compare full and reduced models
# non-significant chi-square test means more complex model is no better than simpler model
anova(m183_r2, m183_r3)
Data: df_reg
Models:
m183_r3: hf183 ~ rain7 + building + (1 | pond)
m183_r2: hf183 ~ building + rain2 + rain7 + (1 | pond)
npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
m183_r3 4 200.89 214.41 -96.444 192.89
m183_r2 5 199.13 216.03 -94.566 189.13 3.7571 1 0.05258 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
pick_hi_p(m183_r3)
# A tibble: 1 × 8
effect group term estimate std.error statistic p.value pval_hi
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 fixed <NA> rain7 0.574 0.184 3.12 0.00180 0.00180
This time the test is significant at the 10% level (and very nearly significant at the 5% level), so we should keep Rain 0-2. Compare to an even further reduced model:
# reduced (-rain7)
m183_r4 <- glmer(hf183 ~ building + (1 | pond),
                 data = df_reg,
                 family = binomial(link = "logit"),
                 control = glmerControl(optimizer = "bobyqa",
                                        optCtrl = list(maxfun = 2e5)))
# compare full and reduced models
# non-significant chi-square test means more complex model is no better than simpler model
anova(m183_r3, m183_r4)
Data: df_reg
Models:
m183_r4: hf183 ~ building + (1 | pond)
m183_r3: hf183 ~ rain7 + building + (1 | pond)
npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
m183_r4 3 208.34 218.48 -101.171 202.34
m183_r3 4 200.89 214.41 -96.444 192.89 9.4533 1 0.002108 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Notably significant this time, indicating that Rain 2-7 should be retained.
Now we repeat with the human fecal indicator, which will likely be very similar since it’s dominated by HF183 detection:
## HFI
# full
mhfi_full <- glmer(hfi ~ building + week + rain2 + rain7 + solar2 + (1 | pond),
                   data = df_reg,
                   family = binomial(link = "logit"),
                   control = glmerControl(optimizer = "bobyqa",
                                          optCtrl = list(maxfun = 2e5)))
pick_hi_p(mhfi_full)
# A tibble: 1 × 8
effect group term estimate std.error statistic p.value pval_hi
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 fixed <NA> week -0.00419 0.0141 -0.297 0.767 0.767
# reduced (-week)
mhfi_r1 <- glmer(hfi ~ building + rain2 + rain7 + solar2 + (1 | pond),
                 data = df_reg,
                 family = binomial(link = "logit"),
                 control = glmerControl(optimizer = "bobyqa",
                                        optCtrl = list(maxfun = 2e5)))
anova(mhfi_full, mhfi_r1)
Data: df_reg
Models:
mhfi_r1: hfi ~ building + rain2 + rain7 + solar2 + (1 | pond)
mhfi_full: hfi ~ building + week + rain2 + rain7 + solar2 + (1 | pond)
npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
mhfi_r1 6 203.21 223.49 -95.607 191.21
mhfi_full 7 205.13 228.79 -95.563 191.13 0.0861 1 0.7691
pick_hi_p(mhfi_r1)
# A tibble: 1 × 8
effect group term estimate std.error statistic p.value pval_hi
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 fixed <NA> solar2 -0.577 0.862 -0.669 0.503 0.503
# reduced (-solar2)
mhfi_r2 <- glmer(hfi ~ building + rain2 + rain7 + (1 | pond),
                 data = df_reg,
                 family = binomial(link = "logit"),
                 control = glmerControl(optimizer = "bobyqa",
                                        optCtrl = list(maxfun = 2e5)))
anova(mhfi_r1, mhfi_r2)
Data: df_reg
Models:
mhfi_r2: hfi ~ building + rain2 + rain7 + (1 | pond)
mhfi_r1: hfi ~ building + rain2 + rain7 + solar2 + (1 | pond)
npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
mhfi_r2 5 201.65 218.55 -95.827 191.65
mhfi_r1 6 203.21 223.49 -95.607 191.21 0.4409 1 0.5067
pick_hi_p(mhfi_r2)
# A tibble: 1 × 8
effect group term estimate std.error statistic p.value pval_hi
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 fixed <NA> rain2 0.748 0.380 1.97 0.0488 0.0488
# reduced (-rain2)
mhfi_r3 <- glmer(hfi ~ building + rain7 + (1 | pond),
                 data = df_reg,
                 family = binomial(link = "logit"),
                 control = glmerControl(optimizer = "bobyqa",
                                        optCtrl = list(maxfun = 2e5)))
anova(mhfi_r2, mhfi_r3)
Data: df_reg
Models:
mhfi_r3: hfi ~ building + rain7 + (1 | pond)
mhfi_r2: hfi ~ building + rain2 + rain7 + (1 | pond)
npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
mhfi_r3 4 203.65 217.17 -97.823 195.65
mhfi_r2 5 201.65 218.55 -95.827 191.65 3.9925 1 0.0457 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
pick_hi_p(mhfi_r3)
# A tibble: 1 × 8
effect group term estimate std.error statistic p.value pval_hi
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 fixed <NA> rain7 0.579 0.185 3.14 0.00171 0.00171
This time the test is significant even at \(p < 0.05\), so we keep Rain 0-2 and stop selection.
Finally, E. coli:
## E. coli
# full
mec_full <- glmer(ecoli ~ week + rain2 + solar2 + wind7 + temp + ph + cond + (1 | pond),
                  data = df_reg,
                  family = binomial(link = "logit"),
                  control = glmerControl(optimizer = "bobyqa",
                                         optCtrl = list(maxfun = 2e5)))
Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
unable to evaluate scaled gradient
Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
Model failed to converge: degenerate Hessian with 1 negative eigenvalues
The full model will not converge, so we need to do a forward selection instead. We fit a null (intercept + random effect only) model and then a larger model with a single variable added in. Add the variable with the lowest p-value from the univariable models, Rain 0-2.
## E. coli
# null model
mec_null <- glmer(ecoli ~ (1 | pond),
                  data = df_reg,
                  family = binomial(link = "logit"),
                  control = glmerControl(optimizer = "bobyqa",
                                         optCtrl = list(maxfun = 2e5)))
tidy(mec_null)
# A tibble: 2 × 7
effect group term estimate std.error statistic p.value
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 fixed <NA> (Intercept) -2.20 0.253 -8.69 3.66e-18
2 ran_pars pond sd__(Intercept) 0.206 NA NA NA
# add lowest p-value variable from univariable tests (+rain2)
mec_a1 <- glmer(ecoli ~ rain2 + (1 | pond),
                data = df_reg,
                family = binomial(link = "logit"),
                control = glmerControl(optimizer = "bobyqa",
                                       optCtrl = list(maxfun = 2e5)))
anova(mec_null, mec_a1)
Data: df_reg
Models:
mec_null: ecoli ~ (1 | pond)
mec_a1: ecoli ~ rain2 + (1 | pond)
npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
mec_null 2 146.36 153.12 -71.181 142.36
mec_a1 3 109.73 119.87 -51.864 103.73 38.634 1 5.112e-10 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The test is significant, indicating the more complex model with Rain 0-2 included is superior. Now add the next lowest p-value variable, Solar 0-2:
# add lowest p-value variable from univariable tests (+solar2)
mec_a2 <- glmer(ecoli ~ rain2 + solar2 + (1 | pond),
                data = df_reg,
                family = binomial(link = "logit"),
                control = glmerControl(optimizer = "bobyqa",
                                       optCtrl = list(maxfun = 2e5)))
anova(mec_a1, mec_a2)
Data: df_reg
Models:
mec_a1: ecoli ~ rain2 + (1 | pond)
mec_a2: ecoli ~ rain2 + solar2 + (1 | pond)
npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
mec_a1 3 109.73 119.87 -51.864 103.73
mec_a2 4 108.36 121.88 -50.180 100.36 3.3685 1 0.06646 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
This test is significant at the 10% level, so we retain Solar 0-2 and proceed with the next lowest p-value variable, conductivity:
# add lowest p-value variable from univariable tests (+cond)
mec_a3 <- glmer(ecoli ~ rain2 + solar2 + cond + (1 | pond),
                data = df_reg,
                family = binomial(link = "logit"),
                control = glmerControl(optimizer = "bobyqa",
                                       optCtrl = list(maxfun = 2e5)))
anova(mec_a2, mec_a3)
Data: df_reg
Models:
mec_a2: ecoli ~ rain2 + solar2 + (1 | pond)
mec_a3: ecoli ~ rain2 + solar2 + cond + (1 | pond)
npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
mec_a2 4 108.36 121.88 -50.180 100.359
mec_a3 5 109.91 126.81 -49.953 99.907 0.4524 1 0.5012
This test is not significant (p = 0.50), so conductivity does not improve the fit. We are now into the variables that were non-significant in the univariable models and likely do not help; for completeness, check the next one, Wind 2-7:
# add lowest p-value variable from univariable tests
mec_a4 <- glmer(ecoli ~ rain2 + solar2 + cond + wind7 + (1 | pond),
                data = df_reg,
                family = binomial(link = "logit"),
                control = glmerControl(optimizer = "bobyqa",
                                       optCtrl = list(maxfun = 2e5)))
Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
Model failed to converge with max|grad| = 0.0160912 (tol = 0.002, component 1)
anova(mec_a3, mec_a4)
Data: df_reg
Models:
mec_a3: ecoli ~ rain2 + solar2 + cond + (1 | pond)
mec_a4: ecoli ~ rain2 + solar2 + cond + wind7 + (1 | pond)
npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
mec_a3 5 109.91 126.81 -49.953 99.907
mec_a4 6 111.34 131.62 -49.671 99.341 0.5657 1 0.452
Convergence issues and a very non-significant test, so we do not include wind either. Stop model selection here: neither conductivity nor wind improved the fit, so the final E. coli model retains only Rain 0-2 and Solar 0-2. Move on to fitting the final models for further use.
Final multivariable
Fit the final multivariable logistic regression models to the training data and save the model fits for future use.
## training data
<- readRDS("../out/data_train.rds")
df_train
## HF183
# model
fit_reg_183 <- glmer(hf183 ~ building + rain2 + rain7 + (1 | pond),
                     data = df_train,
                     family = binomial(link = "logit"),
                     control = glmerControl(optimizer = "bobyqa",
                                            optCtrl = list(maxfun = 2e5)))
saveRDS(fit_reg_183, "../fit/reg_slct_hf183.rds")
# estimates
est_reg_183 <- fit_reg_183 %>%
  tidy(exponentiate = TRUE,
       conf.int = TRUE) %>%
  mutate(outcome = "hf183",
         predvar = "slct",
         method = "glmm")
## HFI
# model
fit_reg_hfi <- glmer(hfi ~ rain2 + rain7 + building + (1 | pond),
                     data = df_train,
                     family = binomial(link = "logit"),
                     control = glmerControl(optimizer = "bobyqa",
                                            optCtrl = list(maxfun = 2e5)))
saveRDS(fit_reg_hfi, "../fit/reg_slct_hfi.rds")
# estimates
est_reg_hfi <- fit_reg_hfi %>%
  tidy(exponentiate = TRUE,
       conf.int = TRUE) %>%
  mutate(outcome = "hfi",
         predvar = "slct",
         method = "glmm")
## E. coli
# model
fit_reg_ec <- glmer(ecoli ~ rain2 + solar2 + (1 | pond),
                    data = df_train,
                    family = binomial(link = "logit"),
                    control = glmerControl(optimizer = "bobyqa",
                                           optCtrl = list(maxfun = 2e5)))
saveRDS(fit_reg_ec, "../fit/reg_slct_ecoli.rds")
# estimates
est_reg_ec <- fit_reg_ec %>%
  tidy(exponentiate = TRUE,
       conf.int = TRUE) %>%
  mutate(outcome = "ecoli",
         predvar = "slct",
         method = "glmm")
## Combine
est_reg <- bind_rows(est_reg_183,
                     est_reg_hfi,
                     est_reg_ec)
saveRDS(est_reg, "../out/est_reg_slct.rds")
Add E. coli predictor
As an additional analysis to investigate whether elevated generic E. coli levels help predict human fecal contamination, add elevated E. coli as a predictor to the final human models and save model fits for future use.
## training data
<- readRDS("../out/data_train.rds")
df_train
## HF183
# model
fit_reg_183_ec <- glmer(hf183 ~ ecoli + building + rain2 + rain7 + (1 | pond),
                        data = df_train,
                        family = binomial(link = "logit"),
                        control = glmerControl(optimizer = "bobyqa",
                                               optCtrl = list(maxfun = 2e5)))
saveRDS(fit_reg_183_ec, "../fit/reg_ec_hf183.rds")
# estimates
est_reg_183_ec <- fit_reg_183_ec %>%
  tidy(exponentiate = TRUE,
       conf.int = TRUE) %>%
  mutate(outcome = "hf183",
         predvar = "ec_pred",
         method = "glmm")
## HFI
# model
fit_reg_hfi_ec <- glmer(hfi ~ ecoli + building + rain2 + rain7 + (1 | pond),
                        data = df_train,
                        family = binomial(link = "logit"),
                        control = glmerControl(optimizer = "bobyqa",
                                               optCtrl = list(maxfun = 2e5)))
saveRDS(fit_reg_hfi_ec, "../fit/reg_ec_hfi.rds")
# estimates
est_reg_hfi_ec <- fit_reg_hfi_ec %>%
  tidy(exponentiate = TRUE,
       conf.int = TRUE) %>%
  mutate(outcome = "hfi",
         predvar = "ec_pred",
         method = "glmm")
## Combine
est_reg_ecpred <- bind_rows(est_reg_183_ec,
                            est_reg_hfi_ec)
saveRDS(est_reg_ecpred, "../out/est_reg_ecpred.rds")
Random forest
Conditional random forest (CRF) models modify the random forest approach to produce unbiased covariate selection and, from a practical standpoint, ensure that trees are not overfit without needing to resort to pruning and other such post-hoc remedies (which, among other things, necessitate specifying additional hyperparameters) [1,2].
Hyperparameter tuning
Conditional random forest models still require some setting of hyperparameters. As implemented in party::cforest(), most of the hyperparameter defaults have been carefully selected to ensure unbiased estimation and should not be changed without good reason. Hyperparameters that may be modified, however, include:
- ntree: the number of trees, which simply needs to be large enough to ensure stable model fits
- mtry: the number of explanatory variables to randomly consider when splitting a node
- minbucket: the minimum number of observations that must end up in a child node for a split to occur
ntree can be arbitrarily increased as necessary to stabilize the model fits, at the sole expense of greater computational load and time. The default values for the other two, mtry = 5 and minbucket = 7, are probably adequate in many cases, but cross-validation with the training dataset can be used to tune their values and hopefully improve predictive performance.
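For orientation, these same hyperparameters can be passed to a bare party::cforest() call. A minimal sketch follows (illustrative only: the formula is a stand-in, and the actual analysis sets hyperparameters through mlr learners below):

## sketch: hyperparameters set directly on party::cforest()
## (placeholder formula; the response must be a factor for classification)
df_sketch <- readRDS("../out/data_train.rds") %>%
  mutate(hf183 = factor(hf183))
crf_sketch <- party::cforest(hf183 ~ rain2 + rain7 + turb,
                             data = df_sketch,
                             controls = party::cforest_unbiased(ntree = 5001,   # trees in the forest
                                                                mtry = 5,       # candidate variables per split
                                                                minbucket = 7)) # minimum observations in a child node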
Raw data
Use the mlr package framework [3] to implement 5 iterations of repeated 3-fold cross-validation, maximizing the area under the receiver operating characteristic curve (AUROC or, more simply, AUC) as the performance metric to optimize the values of mtry and minbucket. Use a relatively large but not computationally prohibitive value of ntree = 5001 for all models. Because we have a relatively small number of explanatory variables (about a dozen) and positive outcomes (e.g., 22 samples with E. coli ≥126 MPN/100 mL in the training dataset), restrict the search grid to 2-8 for mtry and 3-19 (in increments of 4, for efficiency) for minbucket, for a total of 35 hyperparameter combinations assessed for each outcome.
## training data (all outcomes)
<- readRDS("../out/data_train.rds") %>%
df_train_crf mutate(across(c(hf183, phage, hfi, ecoli, building),
~factor(.x, levels = c("0", "1"),
labels = c("0" = "neg", "1" = "pos")))) %>%
select(week, building,
turb, cond, do, ph,
rain2, rain7,
solar2,
wind2, wind7,
hf183, hfi, ecoli)
### Shared across outcomes
## set parameter search space and grid
hparam_set <- makeParamSet(
  makeDiscreteParam("ntree", values = c(5001)),
  makeDiscreteParam("mtry", values = c(2, 3, 4, 5, 6, 7, 8)),
  makeDiscreteParam("minbucket", values = c(3, 7, 11, 15, 19))
)
hparam_grid <- makeTuneControlGrid()
## set resampling approach
hparam_resamp <- makeResampleDesc("RepCV", reps = 5, folds = 3)
## List of measures to evaluate
hparam_measure <- list(setAggregation(auc, testgroup.mean),
                       setAggregation(auc, testgroup.sd),
                       setAggregation(brier, testgroup.mean),
                       setAggregation(brier, testgroup.sd))
### HF183
## Select outcome
df_hf183 <- df_train_crf %>%
  select(-hfi, -ecoli)
## Make classification task
<- makeClassifTask(id = "hf183",
hf183_task data = df_hf183,
target = "hf183",
positive = "pos")
## set learner
<- makeLearner("classif.cforest",
hf183_lrn predict.type = "prob",
fix.factors.prediction = TRUE)
## perform tuning
# seed for reproducibility
set.seed(30333)
hf183_tune <- tuneParams(learner = hf183_lrn,
                         task = hf183_task,
                         resampling = hparam_resamp,
                         par.set = hparam_set,
                         control = hparam_grid,
                         measures = hparam_measure)
saveRDS(hf183_tune, "../fit/crf_tune_hf183.rds")
hf183_tune_df <- hf183_tune$opt.path$env$path
### HFI
## Select outcome
df_hfi <- df_train_crf %>%
  select(-hf183, -ecoli)
## Make classification task
<- makeClassifTask(id = "hfi",
hfi_task data = df_hfi,
target = "hfi",
positive = "pos")
## set learner
<- makeLearner("classif.cforest",
hfi_lrn predict.type = "prob",
fix.factors.prediction = TRUE)
## perform tuning
# seed for reproducibility
set.seed(30333)
hfi_tune <- tuneParams(learner = hfi_lrn,
                       task = hfi_task,
                       resampling = hparam_resamp,
                       par.set = hparam_set,
                       control = hparam_grid,
                       measures = hparam_measure)
saveRDS(hfi_tune, "../fit/crf_tune_hfi.rds")
### E coli
## Select outcome
df_ecoli <- df_train_crf %>%
  select(-hfi, -hf183)
## Make classification task
<- makeClassifTask(id = "ecoli",
ecoli_task data = df_ecoli,
target = "ecoli",
positive = "pos")
## set learner
<- makeLearner("classif.cforest",
ecoli_lrn predict.type = "prob",
fix.factors.prediction = TRUE)
## perform tuning
# seed for reproducibility
set.seed(30333)
ecoli_tune <- tuneParams(learner = ecoli_lrn,
                         task = ecoli_task,
                         resampling = hparam_resamp,
                         par.set = hparam_set,
                         control = hparam_grid,
                         measures = hparam_measure)
saveRDS(ecoli_tune, "../fit/crf_tune_ecoli.rds")
## Save all tuning rounds
tune_all <- bind_rows(
  hf183_tune$opt.path$env$path %>%
    mutate(outcome = "hf183"),
  hfi_tune$opt.path$env$path %>%
    mutate(outcome = "hfi"),
  ecoli_tune$opt.path$env$path %>%
    mutate(outcome = "ecoli")
)
# Summarise tuning
tab_tune <- tune_all %>%
  group_by(outcome) %>%
  summarise(max_auc = max(auc.testgroup.mean),
            auc_sd_max = auc.testgroup.sd[auc.testgroup.mean == max_auc],
            med_auc = median(auc.testgroup.mean),
            auc_sd_med = auc.testgroup.sd[auc.testgroup.mean == med_auc],
            min_auc = min(auc.testgroup.mean),
            auc_sd_min = auc.testgroup.sd[auc.testgroup.mean == min_auc],
            mtry_max = mtry[auc.testgroup.mean == max_auc],
            mtry_med = mtry[auc.testgroup.mean == med_auc],
            mtry_min = mtry[auc.testgroup.mean == min_auc],
            minbucket_max = minbucket[auc.testgroup.mean == max_auc],
            minbucket_med = minbucket[auc.testgroup.mean == med_auc],
            minbucket_min = minbucket[auc.testgroup.mean == min_auc])
saveRDS(tab_tune, "../fit/crf_tune_all.rds")
SMOTE
Another consideration is imbalance in the proportion of observations in the positive class (fecal indicator detected) versus the negative class (fecal indicator not detected). Because random forest is prone to underweighting the minority class, it is common practice to artificially balance classes before training random forest models. However, the literature disagrees on the necessity and impact of artificially correcting class imbalance: some report practical improvements in predictive performance [4], while others present theoretical and empirical arguments that the practice is inappropriate [5–8].
The synthetic minority oversampling technique (SMOTE) increases the minority class by interpolating predictor values from each randomly selected minority observation’s nearest neighbors to create synthetic minority observations for training [9]. Perform the same hyperparameter tuning approach applying SMOTE to the training data during each cross-validation iteration.
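A note on the sw.rate values in the learners below: makeSMOTEWrapper()'s sw.rate sets the minority oversampling rate, here the ratio of negative to positive observations in the training data (146:71 for HF183, 142:75 for HFI, and 195:22 for E. coli, consistent with the 22 elevated E. coli samples noted earlier), so each resampled training split is oversampled to approximate balance. A quick tally as a check (a sketch, assuming the raw training data keep the 0/1 outcome coding):

## count positives per outcome in the training data
readRDS("../out/data_train.rds") %>%
  summarise(n = n(),
            across(c(hf183, hfi, ecoli), ~sum(.x == 1)))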
### HF183
## classification tasks from above are reused
## set learner
hf183_lrn_smote <- makeSMOTEWrapper(hf183_lrn, sw.rate = 146/71)
# perform tuning
# seed for reproducibility
set.seed(30333)
hf183_tune_smote <- tuneParams(learner = hf183_lrn_smote,
                               task = hf183_task,
                               resampling = hparam_resamp,
                               par.set = hparam_set,
                               control = hparam_grid,
                               measures = hparam_measure)
saveRDS(hf183_tune_smote, "../fit/crf_tune_hf183_smote.rds")
### HFI
## set learner
hfi_lrn_smote <- makeSMOTEWrapper(hfi_lrn, sw.rate = 142/75)
## perform tuning
# seed for reproducibility
set.seed(30333)
hfi_tune_smote <- tuneParams(learner = hfi_lrn_smote,
                             task = hfi_task,
                             resampling = hparam_resamp,
                             par.set = hparam_set,
                             control = hparam_grid,
                             measures = hparam_measure)
saveRDS(hfi_tune_smote, "../fit/crf_tune_hfi_smote.rds")
### E coli
## set learner
ecoli_lrn_smote <- makeSMOTEWrapper(ecoli_lrn, sw.rate = 195/22)
## perform tuning
# seed for reproducibility
set.seed(30333)
ecoli_tune_smote <- tuneParams(learner = ecoli_lrn_smote,
                               task = ecoli_task,
                               resampling = hparam_resamp,
                               par.set = hparam_set,
                               control = hparam_grid,
                               measures = hparam_measure)
saveRDS(ecoli_tune_smote, "../fit/crf_tune_ecoli_smote.rds")
## Save all tuning rounds
## label each tuning round so the raw rows keep balance = "raw"
tune_all_smote <- tune_all %>%
  mutate(balance = "raw") %>%
  bind_rows(
    hf183_tune_smote$opt.path$env$path %>%
      mutate(outcome = "hf183", balance = "smote"),
    hfi_tune_smote$opt.path$env$path %>%
      mutate(outcome = "hfi", balance = "smote"),
    ecoli_tune_smote$opt.path$env$path %>%
      mutate(outcome = "ecoli", balance = "smote")
  )
saveRDS(tune_all_smote, "../fit/crf_tune_grid.rds")
Plot tuning results
Construct a heatmap of AUC results for each tuning scenario to visualize the impact of hyperparameter values.
## Plot tuning
# data
df_p_tune <- bind_rows(
  readRDS("../fit/crf_tune_hf183.rds")$opt.path$env$path %>%
    mutate(outcome = "hf183"),
  readRDS("../fit/crf_tune_hfi.rds")$opt.path$env$path %>%
    mutate(outcome = "hfi"),
  readRDS("../fit/crf_tune_ecoli.rds")$opt.path$env$path %>%
    mutate(outcome = "ecoli")) %>%
  mutate(method = "tune") %>%
  bind_rows(
    readRDS("../fit/crf_tune_hf183_smote.rds")$opt.path$env$path %>%
      mutate(outcome = "hf183",
             method = "smote"),
    readRDS("../fit/crf_tune_hfi_smote.rds")$opt.path$env$path %>%
      mutate(outcome = "hfi",
             method = "smote"),
    readRDS("../fit/crf_tune_ecoli_smote.rds")$opt.path$env$path %>%
      mutate(outcome = "ecoli",
             method = "smote")) %>%
  rename(AUC = auc.testgroup.mean) %>%
  mutate(minbucket = factor(minbucket, levels = c(3, 7, 11, 15, 19)),
         mtry = factor(mtry, c(2, 3, 4, 5, 6, 7, 8)),
         method = fct_relevel(method, c("tune", "smote")) %>%
           fct_recode("tuned" = "tune",
                      "SMOTE tuned" = "smote"))
# HF183
p_tune_hf183 <- df_p_tune %>%
  filter(outcome == "hf183") %>%
  ggplot(data = .,
         mapping = aes(x = mtry, y = minbucket, fill = AUC)) +
facet_grid(cols = vars(method)) +
geom_tile() +
scale_fill_viridis_c() +
labs(subtitle = "(A) HF183 detection")
# HFI
p_tune_hfi <- df_p_tune %>%
  filter(outcome == "hfi") %>%
  ggplot(data = .,
         mapping = aes(x = mtry, y = minbucket, fill = AUC)) +
facet_grid(cols = vars(method)) +
geom_tile() +
scale_fill_viridis_c() +
labs(subtitle = "(B) HFI detection")
# ECOLI
p_tune_ecoli <- df_p_tune %>%
  filter(outcome == "ecoli") %>%
  ggplot(data = .,
         mapping = aes(x = mtry, y = minbucket, fill = AUC)) +
facet_grid(cols = vars(method)) +
geom_tile() +
scale_fill_viridis_c() +
labs(subtitle = "(C) E. coli ≥126 MPN/100 mL")
# combine
p_tune <- p_tune_hf183 + p_tune_hfi + p_tune_ecoli +
  plot_layout(ncol = 1, guides = "keep")
ggsave(filename = "../fig/hp_tune.png",
plot = p_tune,
device = ragg::agg_png, dpi = 300,
width = 6, height = 8, units = "in")
Benchmark performance
Compare model predictive performance between the untuned (raw data, default hyperparameters), tuned (raw data, optimized hyperparameters), and balanced (SMOTE data, SMOTE-tuned hyperparameters) models for each outcome using repeated cross-validation. Increase ntree for stability and change the seed for variety.
### HF183
## default
# learner
hf183_lrn_dflt <- setHyperPars(hf183_lrn, ntree = 10001)
# seed for reproducibility
set.seed(27599)
# perform cross-validation
hf183_bench_dflt <- repcv(learner = hf183_lrn_dflt,
                          task = hf183_task,
                          folds = 3L, reps = 5L,
                          measures = hparam_measure)
## tuned
# optimized params
<- readRDS("../fit/crf_tune_hf183.rds")$x
hf183_opt
# learner
hf183_lrn_tune <- setHyperPars(hf183_lrn,
                               ntree = 10001,
                               mtry = hf183_opt$mtry,
                               minbucket = hf183_opt$minbucket)
# seed for reproducibility
set.seed(27599)
# perform cross-validation
hf183_bench_tune <- repcv(learner = hf183_lrn_tune,
                          task = hf183_task,
                          folds = 3L, reps = 5L,
                          measures = hparam_measure)
## smote
# optimized params
<- readRDS("../fit/crf_tune_hf183_smote.rds")$x
hf183_opt_smote
# learner
hf183_lrn_tune_smote <- setHyperPars(hf183_lrn_smote,
                                     ntree = 10001,
                                     mtry = hf183_opt_smote$mtry,
                                     minbucket = hf183_opt_smote$minbucket)
# seed for reproducibility
set.seed(27599)
# perform cross-validation
hf183_bench_smote <- repcv(learner = hf183_lrn_tune_smote,
                           task = hf183_task,
                           folds = 3L, reps = 5L,
                           measures = hparam_measure)
### HFI
## default
# learner
hfi_lrn_dflt <- setHyperPars(hfi_lrn, ntree = 10001)
# seed for reproducibility
set.seed(27599)
# perform cross-validation
hfi_bench_dflt <- repcv(learner = hfi_lrn_dflt,
                        task = hfi_task,
                        folds = 3L, reps = 5L,
                        measures = hparam_measure)
## tuned
# optimized params
<- readRDS("../fit/crf_tune_hfi.rds")$x
hfi_opt
# learner
hfi_lrn_tune <- setHyperPars(hfi_lrn,
                             ntree = 10001,
                             mtry = hfi_opt$mtry,
                             minbucket = hfi_opt$minbucket)
# seed for reproducibility
set.seed(27599)
# perform cross-validation
hfi_bench_tune <- repcv(learner = hfi_lrn_tune,
                        task = hfi_task,
                        folds = 3L, reps = 5L,
                        measures = hparam_measure)
## smote
# optimized params
<- readRDS("../fit/crf_tune_hfi_smote.rds")$x
hfi_opt_smote
# learner
hfi_lrn_tune_smote <- setHyperPars(hfi_lrn_smote,
                                   ntree = 10001,
                                   mtry = hfi_opt_smote$mtry,
                                   minbucket = hfi_opt_smote$minbucket)
# seed for reproducibility
set.seed(27599)
# perform cross-validation
hfi_bench_smote <- repcv(learner = hfi_lrn_tune_smote,
                         task = hfi_task,
                         folds = 3L, reps = 5L,
                         measures = hparam_measure)
### ECOLI
## default
# learner
ecoli_lrn_dflt <- setHyperPars(ecoli_lrn, ntree = 10001)
# seed for reproducibility
set.seed(27599)
# perform cross-validation
ecoli_bench_dflt <- repcv(learner = ecoli_lrn_dflt,
                          task = ecoli_task,
                          folds = 3L, reps = 5L,
                          measures = hparam_measure)
## tuned
# optimized params
<- readRDS("../fit/crf_tune_ecoli.rds")$x
ecoli_opt
# learner
ecoli_lrn_tune <- setHyperPars(ecoli_lrn,
                               ntree = 10001,
                               mtry = ecoli_opt$mtry,
                               minbucket = ecoli_opt$minbucket)
# seed for reproducibility
set.seed(27599)
# perform cross-validation
ecoli_bench_tune <- repcv(learner = ecoli_lrn_tune,
                          task = ecoli_task,
                          folds = 3L, reps = 5L,
                          measures = hparam_measure)
## smote
# optimized params
<- readRDS("../fit/crf_tune_ecoli_smote.rds")$x
ecoli_opt_smote
# learner
ecoli_lrn_tune_smote <- setHyperPars(ecoli_lrn_smote,
                                     ntree = 10001,
                                     mtry = ecoli_opt_smote$mtry,
                                     minbucket = ecoli_opt_smote$minbucket)
# seed for reproducibility
set.seed(27599)
# perform cross-validation
ecoli_bench_smote <- repcv(learner = ecoli_lrn_tune_smote,
                           task = ecoli_task,
                           folds = 3L, reps = 5L,
                           measures = hparam_measure)
### Save Learners for later use
# include the tuned SMOTE learners, which the final training step below reloads
save(hf183_lrn, hf183_lrn_tune, hf183_lrn_smote, hf183_lrn_dflt, hf183_lrn_tune_smote,
     hfi_lrn, hfi_lrn_tune, hfi_lrn_smote, hfi_lrn_dflt, hfi_lrn_tune_smote,
     ecoli_lrn, ecoli_lrn_tune, ecoli_lrn_smote, ecoli_lrn_dflt, ecoli_lrn_tune_smote,
     file = "../fit/learner_objs.RData")
### Combine and save
crf_bench <- bind_rows(
  hf183_bench_dflt$aggr %>%
    as_tibble_row() %>%
    mutate(outcome = "hf183", set = "dflt"),
  hf183_bench_tune$aggr %>%
    as_tibble_row() %>%
    mutate(outcome = "hf183", set = "tune"),
  hf183_bench_smote$aggr %>%
    as_tibble_row() %>%
    mutate(outcome = "hf183", set = "smote"),
  hfi_bench_dflt$aggr %>%
    as_tibble_row() %>%
    mutate(outcome = "hfi", set = "dflt"),
  hfi_bench_tune$aggr %>%
    as_tibble_row() %>%
    mutate(outcome = "hfi", set = "tune"),
  hfi_bench_smote$aggr %>%
    as_tibble_row() %>%
    mutate(outcome = "hfi", set = "smote"),
  ecoli_bench_dflt$aggr %>%
    as_tibble_row() %>%
    mutate(outcome = "ecoli", set = "dflt"),
  ecoli_bench_tune$aggr %>%
    as_tibble_row() %>%
    mutate(outcome = "ecoli", set = "tune"),
  ecoli_bench_smote$aggr %>%
    as_tibble_row() %>%
    mutate(outcome = "ecoli", set = "smote")
)
saveRDS(crf_bench, "../fit/crf_benchmark.rds")
Plot the mean and standard deviation of the AUC for each outcome and CRF tuning strategy.
## Plot
<- readRDS("../fit/crf_benchmark.rds") %>%
p_bench mutate(set = fct_relevel(set,
c("dflt", "tune", "smote")) %>%
fct_recode("default" = "dflt",
"tuned" = "tune",
"SMOTE\n tuned" = "smote") %>%
fct_rev(),
outcome = fct_relevel(outcome,
c("hf183", "hfi", "ecoli")) %>%
fct_recode("HF183 detection" = "hf183",
"HFI detection" = "hfi",
"E. coli ≥126" = "ecoli") %>%
fct_rev()) %>%
rename(auc = auc.testgroup.mean,
auc_sd = auc.testgroup.sd) %>%
ggplot(data = .,
mapping = aes(x = auc, y = outcome, group = set, color = set)) +
geom_pointrange(aes(xmin = (auc - auc_sd), xmax = (auc + auc_sd)),
position = position_dodge(width = 0.7)) +
scale_color_viridis_d(name = "Hyperparameters",
end = 0.95, direction = -1) +
guides(color = guide_legend(reverse = TRUE)) +
labs(x = "AUC",
y = NULL) +
theme_bw() +
theme(legend.key.height = unit(1.5, "lines"))
ggsave(filename = "../fig/crf_benchmark.png",
plot = p_bench,
device = ragg::agg_png, dpi = 300,
width = 6, height = 3.5, units = "in")
The discrimination, as measured by AUC, is nearly identical using the default hyperparameter values and the optimal tuned values for both HF183 and HFI. Models trained on SMOTE-adjusted data perform notably worse for both human-associated outcomes. For predicting elevated generic E. coli, however, the default values provide considerably worse performance than either tuning approach. Between the tuned models, SMOTE again offers inferior discrimination. Mean AUC is lower and the variance in AUC over cross-validation iterations much larger for all E. coli models than for either human outcome.
Although tuning improves the E. coli model performance, the values selected by tuning sit at the extreme edge of the search space for both hyperparameters: mtry = 8 and minbucket = 19. This chasing of the extremes suggests some degenerate model behavior, particularly considering there are only 22 positive E. coli cases in the training dataset and only a dozen explanatory variables.
Train
Perform the final model training for all three CRF variants (default, tuned, and SMOTE) on the full training dataset for each of the three outcomes.
# Load learners
load("../fit/learner_objs.RData")
### HF183
## default
set.seed(30333)
hf183_train_dflt <- train(learner = hf183_lrn_dflt,
                          task = hf183_task)
saveRDS(hf183_train_dflt, "../fit/hf183_train_dflt.rds")
## tuned
set.seed(30333)
hf183_train_tune <- train(learner = hf183_lrn_tune,
                          task = hf183_task)
saveRDS(hf183_train_tune, "../fit/hf183_train_tune.rds")
## smote
set.seed(30333)
hf183_train_smote <- train(learner = hf183_lrn_tune_smote,
                           task = hf183_task)
saveRDS(hf183_train_smote, "../fit/hf183_train_smote.rds")
### HFI
## default
set.seed(30333)
hfi_train_dflt <- train(learner = hfi_lrn_dflt,
                        task = hfi_task)
saveRDS(hfi_train_dflt, "../fit/hfi_train_dflt.rds")
## tuned
set.seed(30333)
hfi_train_tune <- train(learner = hfi_lrn_tune,
                        task = hfi_task)
saveRDS(hfi_train_tune, "../fit/hfi_train_tune.rds")
## smote
set.seed(30333)
hfi_train_smote <- train(learner = hfi_lrn_tune_smote,
                         task = hfi_task)
saveRDS(hfi_train_smote, "../fit/hfi_train_smote.rds")
### ECOLI
## default
set.seed(30333)
ecoli_train_dflt <- train(learner = ecoli_lrn_dflt,
                          task = ecoli_task)
saveRDS(ecoli_train_dflt, "../fit/ecoli_train_dflt.rds")
## tuned
set.seed(30333)
ecoli_train_tune <- train(learner = ecoli_lrn_tune,
                          task = ecoli_task)
saveRDS(ecoli_train_tune, "../fit/ecoli_train_tune.rds")
## smote
set.seed(30333)
ecoli_train_smote <- train(learner = ecoli_lrn_tune_smote,
                           task = ecoli_task)
saveRDS(ecoli_train_smote, "../fit/ecoli_train_smote.rds")
Add E. coli predictor
As an additional analysis to investigate whether elevated generic E. coli levels help predict human fecal contamination, retrain the human outcome CRF models including elevated E. coli as an explanatory variable.
## training data (all outcomes)
<- readRDS("../out/data_train.rds") %>%
df_train_crf mutate(across(c(hf183, phage, hfi, ecoli, building),
~factor(.x, levels = c("0", "1"),
labels = c("0" = "neg", "1" = "pos")))) %>%
select(week, building,
turb, cond, do, ph,
rain2, rain7,
solar2,
wind2, wind7,
hf183, hfi, ecoli)
### HF183
## drop non-target vars
df_hf183_ec <- df_train_crf %>%
  select(-hfi)
## revise classification task
<- makeClassifTask(id = "hf183_ec",
hf183_task_ec data = df_hf183_ec,
target = "hf183",
positive = "pos")
## default
# train
set.seed(30333)
hf183_train_dflt_ec <- train(learner = hf183_lrn_dflt,
                             task = hf183_task_ec)
saveRDS(hf183_train_dflt_ec, "../fit/hf183_train_dflt_ec.rds")
## tuned
set.seed(30333)
hf183_train_tune_ec <- train(learner = hf183_lrn_tune,
                             task = hf183_task_ec)
saveRDS(hf183_train_tune_ec, "../fit/hf183_train_tune_ec.rds")
## smote
set.seed(30333)
hf183_train_smote_ec <- train(learner = hf183_lrn_tune_smote,
                              task = hf183_task_ec)
saveRDS(hf183_train_smote_ec, "../fit/hf183_train_smote_ec.rds")
### HFI
## drop non-target vars
df_hfi_ec <- df_train_crf %>%
  select(-hf183)
## revise classification task
<- makeClassifTask(id = "hfi_ec",
hfi_task_ec data = df_hfi_ec,
target = "hfi",
positive = "pos")
## default
set.seed(30333)
hfi_train_dflt_ec <- train(learner = hfi_lrn_dflt,
                           task = hfi_task_ec)
saveRDS(hfi_train_dflt_ec, "../fit/hfi_train_dflt_ec.rds")
## tuned
set.seed(30333)
hfi_train_tune_ec <- train(learner = hfi_lrn_tune,
                           task = hfi_task_ec)
saveRDS(hfi_train_tune_ec, "../fit/hfi_train_tune_ec.rds")
## smote
set.seed(30333)
hfi_train_smote_ec <- train(learner = hfi_lrn_tune_smote,
                            task = hfi_task_ec)
saveRDS(hfi_train_smote_ec, "../fit/hfi_train_smote_ec.rds")
Variable importance
Estimate conditional variable importance for all the CRF model fits using AUC as the importance measure [10,11]. Traditional permutation variable importance approaches are biased towards correlated predictor variables [12] and use accuracy as the importance metric, which is biased towards the majority class [11].
Note: this is a very slow computation.
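Under the hood, mlr's getFeatureImportance() for classif.cforest learners delegates to party's permutation importance routines. For reference, a roughly equivalent direct call on the underlying fit would be the following sketch (getLearnerModel() extracts the party fit from an mlr WrappedModel; argument details may vary by package version):

## sketch: conditional, AUC-based importance computed directly with party
crf_fit <- mlr::getLearnerModel(readRDS("../fit/hf183_train_dflt.rds"))
vi <- party::varimpAUC(crf_fit, conditional = TRUE) # slow: conditional permutation scheme
sort(vi, decreasing = TRUE)                         # most important variables first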
### HF183
## default
<- getFeatureImportance(readRDS("../fit/hf183_train_dflt.rds"),
hf183_imp_dflt conditional = TRUE,
auc = TRUE)$res %>%
mutate(outcome = "hf183",
predvar = "slct",
method = "dflt")
# with E. coli
<- getFeatureImportance(readRDS("../fit/hf183_train_dflt_ec.rds"),
hf183_imp_dflt_ec conditional = TRUE,
auc = TRUE)$res %>%
mutate(outcome = "hf183",
predvar = "ec_pred",
method = "dflt")
## tuned
<- getFeatureImportance(readRDS("../fit/hf183_train_tune.rds"),
hf183_imp_tune conditional = TRUE,
auc = TRUE)$res %>%
mutate(outcome = "hf183",
predvar = "slct",
method = "tune")
# with E. coli
<- getFeatureImportance(readRDS("../fit/hf183_train_tune_ec.rds"),
hf183_imp_tune_ec conditional = TRUE,
auc = TRUE)$res %>%
mutate(outcome = "hf183",
predvar = "ec_pred",
method = "tune")
## smote
<- getFeatureImportance(readRDS("../fit/hf183_train_smote.rds"),
hf183_imp_smote conditional = TRUE,
auc = TRUE)$res %>%
mutate(outcome = "hf183",
predvar = "slct",
method = "smote")
# with E. coli
<- getFeatureImportance(readRDS("../fit/hf183_train_smote_ec.rds"),
hf183_imp_smote_ec conditional = TRUE,
auc = TRUE)$res %>%
mutate(outcome = "hf183",
predvar = "ec_pred",
method = "smote")
### HFI
## default
<- getFeatureImportance(readRDS("../fit/hfi_train_dflt.rds"),
hfi_imp_dflt conditional = TRUE,
auc = TRUE)$res %>%
mutate(outcome = "hfi",
predvar = "slct",
method = "dflt")
# with E. coli
<- getFeatureImportance(readRDS("../fit/hfi_train_dflt_ec.rds"),
hfi_imp_dflt_ec conditional = TRUE,
auc = TRUE)$res %>%
mutate(outcome = "hfi",
predvar = "ec_pred",
method = "dflt")
## tuned
<- getFeatureImportance(readRDS("../fit/hfi_train_tune.rds"),
hfi_imp_tune conditional = TRUE,
auc = TRUE)$res %>%
mutate(outcome = "hfi",
predvar = "slct",
method = "tune")
# with E. coli
<- getFeatureImportance(readRDS("../fit/hfi_train_tune_ec.rds"),
hfi_imp_tune_ec conditional = TRUE,
auc = TRUE)$res %>%
mutate(outcome = "hfi",
predvar = "ec_pred",
method = "tune")
## smote
<- getFeatureImportance(readRDS("../fit/hfi_train_smote.rds"),
hfi_imp_smote conditional = TRUE,
auc = TRUE)$res %>%
mutate(outcome = "hfi",
predvar = "slct",
method = "smote")
# with E. coli
<- getFeatureImportance(readRDS("../fit/hfi_train_smote_ec.rds"),
hfi_imp_smote_ec conditional = TRUE,
auc = TRUE)$res %>%
mutate(outcome = "hfi",
predvar = "ec_pred",
method = "smote")
### ECOLI
## default
<- getFeatureImportance(readRDS("../fit/ecoli_train_dflt.rds"),
ecoli_imp_dflt conditional = TRUE,
auc = TRUE)$res %>%
mutate(outcome = "ecoli",
predvar = "slct",
method = "dflt")
## tuned
<- getFeatureImportance(readRDS("../fit/ecoli_train_tune.rds"),
ecoli_imp_tune conditional = TRUE,
auc = TRUE)$res %>%
mutate(outcome = "ecoli",
predvar = "slct",
method = "tune")
## smote
<- getFeatureImportance(readRDS("../fit/ecoli_train_smote.rds"),
ecoli_imp_smote conditional = TRUE,
auc = TRUE)$res %>%
mutate(outcome = "ecoli",
predvar = "slct",
method = "smote")
### Combine
imp_crf <- bind_rows(hf183_imp_dflt, hf183_imp_dflt_ec,
                     hf183_imp_tune, hf183_imp_tune_ec,
                     hf183_imp_smote, hf183_imp_smote_ec,
                     hfi_imp_dflt, hfi_imp_dflt_ec,
                     hfi_imp_tune, hfi_imp_tune_ec,
                     hfi_imp_smote, hfi_imp_smote_ec,
                     ecoli_imp_dflt,
                     ecoli_imp_tune,
                     ecoli_imp_smote) %>%
  select(method, predvar, outcome,
         term = variable,
         est = importance) %>%
  group_by(method, predvar, outcome) %>%
  mutate(est_max = max(est),
         est_min = min(est),
         est_range = est_max - est_min,
         est_norm = (est - est_min) / est_range) %>%
  ungroup()
saveRDS(imp_crf, "../out/varimp_crf.rds")
Predictions
The final models have been trained for both regression and CRF approaches. Now predict the completely separate test dataset using the trained models to assess out-of-sample predictive performance.
Regression
The test dataset is used to assess predictive performance so that overfitting is not rewarded, but the training dataset can also be predicted simultaneously to give an idea of how much performance changes between in-sample and out-of-sample predictions.
## stack training and test data, predict both
<- bind_rows(readRDS("../out/data_train.rds"),
df_pred readRDS("../out/data_test.rds"))
## HF183
# predict
pred_reg_183 <- df_pred %>%
  mutate(pred = predict(readRDS("../fit/reg_slct_hf183.rds"),
                        newdata = .,
                        allow.new.levels = TRUE,
                        type = "response")) %>%
  mutate(outcome = "hf183",
         predvar = "slct",
         method = "glmm") %>%
  select(set, dt_collect, pond,
         method, predvar, outcome,
         obs = hf183,
         pred)
# predict with E. coli
pred_reg_183_ec <- df_pred %>%
  mutate(pred = predict(readRDS("../fit/reg_ec_hf183.rds"),
                        newdata = .,
                        allow.new.levels = TRUE,
                        type = "response")) %>%
  mutate(outcome = "hf183",
         predvar = "ec_pred",
         method = "glmm") %>%
  select(set, dt_collect, pond,
         method, predvar, outcome,
         obs = hf183,
         pred)
## HFI
# predict
pred_reg_hfi <- df_pred %>%
  mutate(pred = predict(readRDS("../fit/reg_slct_hfi.rds"),
                        newdata = .,
                        allow.new.levels = TRUE,
                        type = "response")) %>%
  mutate(outcome = "hfi",
         predvar = "slct",
         method = "glmm") %>%
  select(set, dt_collect, pond,
         method, predvar, outcome,
         obs = hfi,
         pred)
# predict with E. coli
pred_reg_hfi_ec <- df_pred %>%
  mutate(pred = predict(readRDS("../fit/reg_ec_hfi.rds"),
                        newdata = .,
                        allow.new.levels = TRUE,
                        type = "response")) %>%
  mutate(outcome = "hfi",
         predvar = "ec_pred",
         method = "glmm") %>%
  select(set, dt_collect, pond,
         method, predvar, outcome,
         obs = hfi,
         pred)
## E. coli
# predict
pred_reg_ec <- df_pred %>%
  mutate(pred = predict(readRDS("../fit/reg_slct_ecoli.rds"),
                        newdata = .,
                        allow.new.levels = TRUE,
                        type = "response")) %>%
  mutate(outcome = "ecoli",
         predvar = "slct",
         method = "glmm") %>%
  select(set, dt_collect, pond,
         method, predvar, outcome,
         obs = ecoli,
         pred)
## Combine
pred_reg <- bind_rows(pred_reg_183,
                      pred_reg_183_ec,
                      pred_reg_hfi,
                      pred_reg_hfi_ec,
                      pred_reg_ec)
saveRDS(pred_reg, "../out/pred_reg.rds")
Random forest
Generate predictions for training and test datasets for all variations of CRF models.
### stack training and test data, predict both
<- bind_rows(readRDS("../out/data_train.rds"),
df_pred_crf readRDS("../out/data_test.rds")) %>%
mutate(across(c(hf183, phage, hfi, ecoli, building),
~factor(.x, levels = c("0", "1"),
labels = c("0" = "neg", "1" = "pos"))))
### Function to process CRF predictions
predict_crf <- function(newdata, fit){
  
  pred <- predict(fit, newdata = newdata)
  
  pos <- newdata %>%
    bind_cols(pred$data) %>%
    rename(pred = prob.pos)
  
  return(pos)
}
### HF183
## default
hf183_pred_dflt <- df_pred_crf %>%
  predict_crf(readRDS("../fit/hf183_train_dflt.rds")) %>%
  mutate(outcome = "hf183",
         predvar = "slct",
         method = "dflt") %>%
  select(set, dt_collect, pond,
         method, predvar, outcome,
         obs = hf183,
         pred)
# with E. coli
hf183_pred_dflt_ec <- df_pred_crf %>%
  predict_crf(readRDS("../fit/hf183_train_dflt_ec.rds")) %>%
  mutate(outcome = "hf183",
         predvar = "ec_pred",
         method = "dflt") %>%
  select(set, dt_collect, pond,
         method, predvar, outcome,
         obs = hf183,
         pred)
## tuned
hf183_pred_tune <- df_pred_crf %>%
  predict_crf(readRDS("../fit/hf183_train_tune.rds")) %>%
  mutate(outcome = "hf183",
         predvar = "slct",
         method = "tune") %>%
  select(set, dt_collect, pond,
         method, predvar, outcome,
         obs = hf183,
         pred)
# with E. coli
hf183_pred_tune_ec <- df_pred_crf %>%
  predict_crf(readRDS("../fit/hf183_train_tune_ec.rds")) %>%
  mutate(outcome = "hf183",
         predvar = "ec_pred",
         method = "tune") %>%
  select(set, dt_collect, pond,
         method, predvar, outcome,
         obs = hf183,
         pred)
## smote
hf183_pred_smote <- df_pred_crf %>%
  predict_crf(readRDS("../fit/hf183_train_smote.rds")) %>%
  mutate(outcome = "hf183",
         predvar = "slct",
         method = "smote") %>%
  select(set, dt_collect, pond,
         method, predvar, outcome,
         obs = hf183,
         pred)
# with E. coli
hf183_pred_smote_ec <- df_pred_crf %>%
  predict_crf(readRDS("../fit/hf183_train_smote_ec.rds")) %>%
  mutate(outcome = "hf183",
         predvar = "ec_pred",
         method = "smote") %>%
  select(set, dt_collect, pond,
         method, predvar, outcome,
         obs = hf183,
         pred)
### HFI
## default
hfi_pred_dflt <- df_pred_crf %>%
  predict_crf(readRDS("../fit/hfi_train_dflt.rds")) %>%
  mutate(outcome = "hfi",
         predvar = "slct",
         method = "dflt") %>%
  select(set, dt_collect, pond,
         method, predvar, outcome,
         obs = hfi,
         pred)
# with E. coli
hfi_pred_dflt_ec <- df_pred_crf %>%
  predict_crf(readRDS("../fit/hfi_train_dflt_ec.rds")) %>%
  mutate(outcome = "hfi",
         predvar = "ec_pred",
         method = "dflt") %>%
  select(set, dt_collect, pond,
         method, predvar, outcome,
         obs = hfi,
         pred)
## tuned
hfi_pred_tune <- df_pred_crf %>%
  predict_crf(readRDS("../fit/hfi_train_tune.rds")) %>%
  mutate(outcome = "hfi",
         predvar = "slct",
         method = "tune") %>%
  select(set, dt_collect, pond,
         method, predvar, outcome,
         obs = hfi,
         pred)
# with E. coli
hfi_pred_tune_ec <- df_pred_crf %>%
  predict_crf(readRDS("../fit/hfi_train_tune_ec.rds")) %>%
  mutate(outcome = "hfi",
         predvar = "ec_pred",
         method = "tune") %>%
  select(set, dt_collect, pond,
         method, predvar, outcome,
         obs = hfi,
         pred)
## smote
hfi_pred_smote <- df_pred_crf %>%
  predict_crf(readRDS("../fit/hfi_train_smote.rds")) %>%
  mutate(outcome = "hfi",
         predvar = "slct",
         method = "smote") %>%
  select(set, dt_collect, pond,
         method, predvar, outcome,
         obs = hfi,
         pred)
# with E. coli
hfi_pred_smote_ec <- df_pred_crf %>%
  predict_crf(readRDS("../fit/hfi_train_smote_ec.rds")) %>%
  mutate(outcome = "hfi",
         predvar = "ec_pred",
         method = "smote") %>%
  select(set, dt_collect, pond,
         method, predvar, outcome,
         obs = hfi,
         pred)
### ECOLI
## default
ecoli_pred_dflt <- df_pred_crf %>%
  predict_crf(readRDS("../fit/ecoli_train_dflt.rds")) %>%
  mutate(outcome = "ecoli",
         predvar = "slct",
         method = "dflt") %>%
  select(set, dt_collect, pond,
         method, predvar, outcome,
         obs = ecoli,
         pred)
## tuned
ecoli_pred_tune <- df_pred_crf %>%
  predict_crf(readRDS("../fit/ecoli_train_tune.rds")) %>%
  mutate(outcome = "ecoli",
         predvar = "slct",
         method = "tune") %>%
  select(set, dt_collect, pond,
         method, predvar, outcome,
         obs = ecoli,
         pred)
## smote
ecoli_pred_smote <- df_pred_crf %>%
  predict_crf(readRDS("../fit/ecoli_train_smote.rds")) %>%
  mutate(outcome = "ecoli",
         predvar = "slct",
         method = "smote") %>%
  select(set, dt_collect, pond,
         method, predvar, outcome,
         obs = ecoli,
         pred)
## Combine
pred_crf <- bind_rows(hf183_pred_dflt, hf183_pred_dflt_ec,
                      hf183_pred_tune, hf183_pred_tune_ec,
                      hf183_pred_smote, hf183_pred_smote_ec,
                      hfi_pred_dflt, hfi_pred_dflt_ec,
                      hfi_pred_tune, hfi_pred_tune_ec,
                      hfi_pred_smote, hfi_pred_smote_ec,
                      ecoli_pred_dflt,
                      ecoli_pred_tune,
                      ecoli_pred_smote)
saveRDS(pred_crf, "../out/pred_crf.rds")
Assess Performance
Receiver operating characteristic (ROC) curve analysis plots the true positive rate (sensitivity) against the false positive rate (1 - specificity) using each predicted probability in turn as the threshold for classifying a prediction as positive. The area under the resulting curve (AUC) summarizes the overall ability of the model to correctly classify predictions as positive and negative. The point on this curve with the greatest distance from the diagonal chance line (AUC = 0.5) corresponds to the predicted probability that, when used as the cutpoint, maximizes the sum of sensitivity and specificity, known as Youden's J statistic. Equivalently, for a pair of observations, one positive for the outcome and the other negative, the AUC is the proportion of such pairs for which the model assigns a higher probability of the outcome to the positive observation.
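For reference, Youden's J at a given threshold is \(J = \text{sensitivity} + \text{specificity} - 1\); the “best” threshold reported by the ROC analysis below is the cutpoint that maximizes \(J\) along the curve.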
All the predictions have been extracted as flat data frames and stacked, allowing operations to be applied across all model predictions in the same function calls. Generate ROC curves, calculate AUC, and extract the threshold value that maximizes Youden’s J for all models by iterating over dataset (training vs test), method (GLMM, default CRF, tuned CRF, SMOTE CRF), and explanatory variable set (selected variables or including elevated E. coli as a predictor).
## combine predictions
pred_all <- bind_rows(
  readRDS("../out/pred_reg.rds") %>%
    mutate(obs = factor(obs, levels = c("0", "1"),
                        labels = c("neg", "pos"))),
  readRDS("../out/pred_crf.rds"))

## Calculate ROC by model and outcome
roc_all <- pred_all %>%
  group_by(set, method, predvar, outcome) %>%
  nest() %>%
  mutate(roc = map(data, ~pROC::roc(obs ~ pred, data = .x)),
         thresh = map(roc, ~pROC::coords(.x, x = "best", best.method = "youden")),
         diag = map(roc, ~pROC::ci.thresholds(.x, thresholds = "best", best.method = "youden")),
         auc = map(roc, ~pROC::auc(.x)),
         df = map(roc, ~pROC::ggroc(.x, legacy.axes = TRUE)$data)) %>%
  select(-data) %>%
  ungroup()
saveRDS(roc_all, "../out/roc_obj_all.rds")
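The nested list columns in roc_all can also be flattened into a quick summary table without the plotting code below; a sketch assuming the object just created (auc_tbl is an illustrative name):
auc_tbl <- roc_all %>%
  mutate(auc = map_dbl(auc, as.numeric),
         ## take the first row in case multiple thresholds tie for "best"
         threshold = map_dbl(thresh, ~.x$threshold[[1]])) %>%
  select(set, method, predvar, outcome, auc, threshold)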
Plot
Generate plots of regression model coefficient estimates, random forest variable importance estimates, and ROC curves.
Explanatory variables
Plot model estimates of explanatory variable associations with the three binary outcomes: odds ratios derived from the logistic regression model coefficients and conditional variable importance from the random forest models.
Regression coefficients
Plot odds ratios and 95% confidence intervals for each predictor variable included in the final multivariable logistic regression models, color-coded by significance level.
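The odds ratios are exponentiated fixed-effect coefficients from the fitted GLMMs. A minimal sketch of how such estimates can be extracted with broom.mixed; the file name is hypothetical, and the analysis's own estimates are read from ../out/est_reg_slct.rds below.
## hypothetical file name for one of the saved GLMM fits
fit_glmm <- readRDS("../fit/hf183_train_glmm.rds")

## fixed effects on the log-odds scale with confidence limits,
## then exponentiate estimate and limits to the odds ratio scale
broom.mixed::tidy(fit_glmm, effects = "fixed", conf.int = TRUE) %>%
  mutate(across(c(estimate, conf.low, conf.high), exp))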
Code
## estimates
# variable names
var_lab <- read_xlsx("../data/variable_names.xlsx")

# raw
est_reg <- readRDS("../out/est_reg_slct.rds")

# format
df_plot_reg <- est_reg %>%
  filter(term %in% c("building", "rain2", "rain7", "solar2")) %>%
  left_join(var_lab, by = "term") %>%
  mutate(term = fct_relevel(term,
                            c("building", "rain2", "rain7", "solar2"))) %>%
  arrange(term) %>%
  mutate(label = fct_inorder(label) %>%
           fct_rev(),
         signif = case_when(p.value < 0.01 ~ "p < 0.01",
                            p.value >= 0.01 & p.value < 0.05 ~ "p < 0.05",
                            p.value >= 0.05 & p.value < 0.1 ~ "p < 0.1",
                            TRUE ~ NA_character_) %>%
           fct_relevel(c("p < 0.01", "p < 0.05", "p < 0.1")),
         out_lab = fct_relevel(outcome, c("hf183", "hfi", "ecoli")) %>%
           fct_recode("(A) HF183 detection" = "hf183",
                      "(B) HFI detection" = "hfi",
                      "(C) *E. coli* ≥126 MPN/100 mL" = "ecoli"))

p_or_all <- df_plot_reg %>%
  ggplot(aes(x = estimate, y = label, color = signif)) +
  facet_grid(cols = vars(out_lab)) +
  geom_vline(xintercept = 1, color = "red",
             linetype = "dashed", linewidth = 1,
             alpha = 0.5) +
  geom_point(shape = 18, size = 5) +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high),
                 height = 0.15) +
  scale_x_log10(breaks = c(0.01, 0.1, 1, 10, 100),
                labels = c("0.01", "0.1", "1", "10", "100")) +
  scale_color_viridis_d(begin = 0.05, end = 0.95,
                        option = "plasma", direction = 1,
                        name = "Significance") +
  labs(x = "odds ratio",
       y = NULL) +
  theme_bw() +
  theme(strip.text = ggtext::element_markdown(hjust = 0),
        legend.position = "top",
        panel.spacing = unit(0.5, "lines"))

ggsave(filename = "../fig/forest_or.jpg",
       plot = p_or_all,
       device = ragg::agg_jpeg, dpi = 300,
       width = 8, height = 4, units = "in")
Variable importance
Similarly, plot the variable importance from the random forest models. Focus on the default hyperparameter models without E. coli as a predictor for simplicity.
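Conditional permutation importance permutes each predictor within strata defined by its correlated covariates, reducing the inflated importance that correlated predictors receive under marginal permutation. A sketch of the party call involved, assuming the saved fit is a party::cforest object (if the fits are stored as mlr WrappedModels, mlr::getLearnerModel() extracts the underlying forest); the file name is illustrative.
## hypothetical: load one saved conditional random forest fit
crf_fit <- readRDS("../fit/hf183_train_dflt.rds")

## conditional permutation importance (slower than marginal varimp,
## but less biased when predictors are correlated)
set.seed(1)
vi <- party::varimp(crf_fit, conditional = TRUE)
sort(vi, decreasing = TRUE)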
Code
## estimates
# variable names
var_lab <- read_xlsx("../data/variable_names.xlsx")

# raw
est_crf <- readRDS("../out/varimp_crf.rds")

# format
df_plot_crf <- est_crf %>%
  left_join(var_lab, by = "term") %>%
  filter(method == "dflt",
         predvar == "slct") %>%
  mutate(out_lab = fct_relevel(outcome, c("hf183", "hfi", "ecoli")) %>%
           fct_recode("(A) HF183 detection" = "hf183",
                      "(B) HFI detection" = "hfi",
                      "(C) *E. coli* ≥126 MPN/100 mL" = "ecoli"),
         est_norm = est_norm * 100)

## HF183
p_import_183 <- df_plot_crf %>%
  filter(outcome == "hf183") %>%
  arrange(est) %>%
  mutate(label = fct_inorder(label) %>%
           fct_recode("Dissolved\noxygen" = "Dissolved oxygen")) %>%
  ggplot(aes(x = est, y = label)) +
  facet_grid(cols = vars(out_lab)) +
  geom_vline(xintercept = 0, color = "red",
             linetype = "dashed", linewidth = 1,
             alpha = 0.5) +
  geom_point(shape = 18, size = 4) +
  labs(x = "conditional variable importance",
       y = NULL) +
  theme_bw() +
  theme(strip.text = ggtext::element_markdown(hjust = 0))

## HFI
p_import_hfi <- df_plot_crf %>%
  filter(outcome == "hfi") %>%
  arrange(est) %>%
  mutate(label = fct_inorder(label) %>%
           fct_recode("Dissolved\noxygen" = "Dissolved oxygen")) %>%
  ggplot(aes(x = est, y = label)) +
  facet_grid(cols = vars(out_lab)) +
  geom_vline(xintercept = 0, color = "red",
             linetype = "dashed", linewidth = 1,
             alpha = 0.5) +
  geom_point(shape = 18, size = 4) +
  labs(x = "conditional variable importance",
       y = NULL) +
  theme_bw() +
  theme(strip.text = ggtext::element_markdown(hjust = 0))

## E. coli
p_import_ec <- df_plot_crf %>%
  filter(outcome == "ecoli") %>%
  arrange(est) %>%
  mutate(label = fct_inorder(label) %>%
           fct_recode("Dissolved\noxygen" = "Dissolved oxygen")) %>%
  ggplot(aes(x = est, y = label)) +
  facet_grid(cols = vars(out_lab)) +
  geom_vline(xintercept = 0, color = "red",
             linetype = "dashed", linewidth = 1,
             alpha = 0.5) +
  geom_point(shape = 18, size = 4) +
  labs(x = "conditional variable importance",
       y = NULL) +
  theme_bw() +
  theme(strip.text = ggtext::element_markdown(hjust = 0))

## Combine
p_import <- p_import_183 + p_import_hfi + p_import_ec +
  plot_layout(ncol = 3, axis_titles = "collect")

ggsave(filename = "../fig/varimp_dflt.jpg",
       plot = p_import,
       device = ragg::agg_jpeg, dpi = 300,
       width = 9.5, height = 4.5, units = "in")
ROC curves
Again, focus only on the models without E. coli as a predictor, using default hyperparameters for the CRF models, and examine ROC curves for the training and test data separately.
Test data
Plot ROC curves for the test dataset predictions and annotate each with the threshold probability that maximizes Youden's J statistic (the sum of sensitivity and specificity minus one), along with the sensitivity and specificity achieved at that threshold.
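Because J = sensitivity + specificity - 1, the optimal cutpoint can also be read off the ROC coordinates directly; a sketch using one of the fitted roc objects in roc_all (the [[1]] indexing is illustrative):
## one fitted pROC roc object from the nested results
roc_1 <- roc_all$roc[[1]]

## sensitivity and specificity at every candidate threshold
cc <- pROC::coords(roc_1, x = "all",
                   ret = c("threshold", "sensitivity", "specificity"))

## the threshold maximizing Youden's J
cc$youden <- cc$sensitivity + cc$specificity - 1
cc[which.max(cc$youden), ]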
Code
## All ROC info
roc_test <- readRDS("../out/roc_obj_all.rds") %>%
  ungroup() %>%
  filter(set == "test",
         method %in% c("glmm", "dflt"),
         predvar == "slct") %>%
  mutate(outcome = fct_relevel(outcome, c("hf183", "hfi", "ecoli")) %>%
           fct_recode("(A) HF183 detection" = "hf183",
                      "(B) HFI detection" = "hfi",
                      "(C) *E. coli* ≥126 MPN/100 mL" = "ecoli"),
         method = fct_relevel(method, c("glmm", "dflt")) %>%
           fct_recode("Conditional\nRandom Forest" = "dflt",
                      "Logistic\nRegression" = "glmm"))

## roc data for plot
df_roc_test <- roc_test %>%
  select(set, method, outcome, df) %>%
  unnest(df)

## Other metrics
df_perf_test <- roc_test %>%
  ungroup() %>%
  select(set, method, outcome, auc, thresh) %>%
  unnest(thresh) %>%
  mutate(auc = unlist(auc),
         auc_lab = str_c("AUC: ", broman::myround(auc, 2)),
         perf_lab = str_c(auc_lab, "\n",
                          "threshold: ", broman::myround(threshold, 2), "\n",
                          "sensitivity: ", broman::myround(sensitivity, 2), "\n",
                          "specificity: ", broman::myround(specificity, 2)),
         `1-specificity` = 1 - specificity)

# plot ROC curves
p_roc_test <- df_roc_test %>%
  #mutate(across(c(`1-specificity`, sensitivity), ~.x * 100)) %>%
  ggplot(aes(x = `1-specificity`, y = sensitivity)) +
  facet_grid(rows = vars(method), cols = vars(outcome)) +
  geom_abline(intercept = 0, slope = 1,
              linetype = "dashed", linewidth = 1,
              color = "red", alpha = 0.6) +
  geom_line(linewidth = 1.25) +
  geom_point(data = df_perf_test,
             size = 3, color = "#205493", alpha = 0.9) +
  geom_text(aes(x = .55, y = .45, label = perf_lab),
            data = df_perf_test,
            size = 3,
            hjust = 0, vjust = 1) +
  coord_equal() +
  scale_x_continuous(breaks = c(0, 0.25, 0.5, 0.75, 1),
                     labels = c("0", "0.25", "0.5", "0.75", "1")) +
  scale_y_continuous(breaks = c(0, 0.25, 0.5, 0.75, 1),
                     labels = c("0", "0.25", "0.5", "0.75", "1")) +
  theme_bw() +
  theme(strip.text.x = ggtext::element_markdown(hjust = 0))

ggsave(filename = "../fig/roc_curves_test.jpg",
       plot = p_roc_test,
       device = ragg::agg_jpeg, dpi = 300,
       width = 8, height = 5.5, units = "in")
Training data
For context, also plot the ROC curves and thresholds for predictions on the training dataset.
Code
## All ROC info
roc_train <- readRDS("../out/roc_obj_all.rds") %>%
  ungroup() %>%
  filter(set == "train",
         method %in% c("glmm", "dflt"),
         predvar == "slct") %>%
  mutate(outcome = fct_relevel(outcome, c("hf183", "hfi", "ecoli")) %>%
           fct_recode("(A) HF183 detection" = "hf183",
                      "(B) HFI detection" = "hfi",
                      "(C) *E. coli* ≥126 MPN/100 mL" = "ecoli"),
         method = fct_relevel(method, c("glmm", "dflt")) %>%
           fct_recode("Conditional\nRandom Forest" = "dflt",
                      "Logistic\nRegression" = "glmm"))

## roc data for plot
df_roc_train <- roc_train %>%
  select(set, method, outcome, df) %>%
  unnest(df)

## Other metrics
df_perf_train <- roc_train %>%
  ungroup() %>%
  select(set, method, outcome, auc, thresh) %>%
  unnest(thresh) %>%
  mutate(auc = unlist(auc),
         auc_lab = str_c("AUC: ", broman::myround(auc, 2)),
         perf_lab = str_c(auc_lab, "\n",
                          "threshold: ", broman::myround(threshold, 2), "\n",
                          "sensitivity: ", broman::myround(sensitivity, 2), "\n",
                          "specificity: ", broman::myround(specificity, 2)),
         `1-specificity` = 1 - specificity)

# plot ROC curves
p_roc_train <- df_roc_train %>%
  #mutate(across(c(`1-specificity`, sensitivity), ~.x * 100)) %>%
  ggplot(aes(x = `1-specificity`, y = sensitivity)) +
  facet_grid(rows = vars(method), cols = vars(outcome)) +
  geom_abline(intercept = 0, slope = 1,
              linetype = "dashed", linewidth = 1,
              color = "red", alpha = 0.6) +
  geom_line(linewidth = 1.25) +
  geom_point(data = df_perf_train,
             size = 3, color = "#205493", alpha = 0.9) +
  geom_text(aes(x = .55, y = .45, label = perf_lab),
            data = df_perf_train,
            size = 3,
            hjust = 0, vjust = 1) +
  coord_equal() +
  scale_x_continuous(breaks = c(0, 0.25, 0.5, 0.75, 1),
                     labels = c("0", "0.25", "0.5", "0.75", "1")) +
  scale_y_continuous(breaks = c(0, 0.25, 0.5, 0.75, 1),
                     labels = c("0", "0.25", "0.5", "0.75", "1")) +
  theme_bw() +
  theme(strip.text.x = ggtext::element_markdown(hjust = 0))

ggsave(filename = "../fig/roc_curves_train.png",
       plot = p_roc_train,
       device = ragg::agg_png, dpi = 300,
       width = 8, height = 5.5, units = "in")
Human with E. coli
Also compare predictions of the human fecal markers with and without an indicator of elevated E. coli included as an additional predictor.
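Beyond visual comparison, paired AUCs computed on the same test observations can also be compared formally, for example with DeLong's test in pROC. A hedged sketch of such a check, not an analysis performed here; the filtering below is illustrative and picks the HF183 default-CRF models with and without the E. coli predictor.
## the two roc objects fit to the same test-set observations
roc_pair <- roc_all %>%
  filter(set == "test", outcome == "hf183", method == "dflt")

## DeLong's test for a difference between paired AUCs
pROC::roc.test(roc_pair$roc[[1]], roc_pair$roc[[2]], paired = TRUE)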
Code
## All ROC info
roc_obj_ec <- readRDS("../out/roc_obj_all.rds") %>%
  ungroup() %>%
  filter(set == "test",
         method %in% c("glmm", "dflt"),
         outcome %in% c("hf183", "hfi")) %>%
  mutate(outcome = fct_relevel(outcome, c("hf183", "hfi")) %>%
           fct_recode("HF183 detection" = "hf183",
                      "HFI detection" = "hfi"),
         method = fct_relevel(method, c("glmm", "dflt")) %>%
           fct_recode("Conditional\nRandom Forest" = "dflt",
                      "Logistic\nRegression" = "glmm"),
         predvar = fct_recode(predvar,
                              "Original model" = "slct",
                              "*E. coli* predictor added" = "ec_pred") %>%
           fct_relevel(c("Original model", "*E. coli* predictor added")))

## roc data for plot
df_roc_ec <- roc_obj_ec %>%
  select(set, method, predvar, outcome, df) %>%
  unnest(df)

## Other metrics
df_perf_ec <- roc_obj_ec %>%
  ungroup() %>%
  select(set, method, predvar, outcome, auc, thresh) %>%
  unnest(thresh) %>%
  mutate(auc = unlist(auc),
         auc_lab = str_c("AUC: ", broman::myround(auc, 2)),
         perf_lab = str_c(auc_lab, "\n",
                          "threshold: ", broman::myround(threshold, 2), "\n",
                          "sensitivity: ", broman::myround(sensitivity, 2), "\n",
                          "specificity: ", broman::myround(specificity, 2)),
         `1-specificity` = 1 - specificity)

## Plot
p_roc_ec <- df_roc_ec %>%
  ggplot(aes(x = `1-specificity`, y = sensitivity)) +
  facet_nested(rows = vars(method),
               cols = vars(outcome, predvar)) +
  geom_abline(intercept = 0, slope = 1,
              linetype = "dashed", linewidth = 1,
              color = "red", alpha = 0.6) +
  geom_line(linewidth = 1.25) +
  geom_point(data = df_perf_ec,
             size = 3, color = "#205493", alpha = 0.9) +
  geom_text(aes(x = .55, y = .45, label = perf_lab),
            data = df_perf_ec,
            size = 3,
            hjust = 0, vjust = 1) +
  coord_equal() +
  scale_x_continuous(breaks = c(0, 0.25, 0.5, 0.75, 1),
                     labels = c("0", "0.25", "0.5", "0.75", "1")) +
  scale_y_continuous(breaks = c(0, 0.25, 0.5, 0.75, 1),
                     labels = c("0", "0.25", "0.5", "0.75", "1")) +
  theme_bw() +
  theme(strip.text.x = ggtext::element_markdown())

ggsave(filename = "../fig/roc_human_ecpred_nest.png",
       plot = p_roc_ec,
       device = ragg::agg_png, dpi = 300,
       width = 10, height = 5.5, units = "in")
Tuning comparison
Finally, compare the predictive performance of the CRF models using different hyperparameter tuning and class imbalance correction strategies for both the training and test datasets.
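SMOTE addresses class imbalance by synthesizing new minority-class observations as interpolations between a minority case and one of its nearest minority-class neighbors. A minimal sketch of the mlr call involved, with an illustrative task and parameter values rather than the tuned settings used in this analysis:
## illustrative: a classification task from the training data
## (df_train_model is a hypothetical data frame with a factor outcome)
task <- makeClassifTask(data = as.data.frame(df_train_model),
                        target = "hf183", positive = "pos")

## synthesize minority-class cases: triple the minority class, drawing
## interpolation partners from each case's 5 nearest minority neighbors
task_smote <- smote(task, rate = 3, nn = 5)

table(getTaskTargets(task))       # original class balance
table(getTaskTargets(task_smote)) # balance after SMOTE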
Code
## All ROC info
roc_obj_tune <- readRDS("../out/roc_obj_all.rds") %>%
  ungroup() %>%
  filter(method != "glmm",
         predvar == "slct") %>%
  mutate(outcome = fct_relevel(outcome, c("hf183", "hfi", "ecoli")) %>%
           fct_recode("HF183 detection" = "hf183",
                      "HFI detection" = "hfi",
                      "*E. coli* ≥126 MPN/100 mL" = "ecoli"),
         method = fct_relevel(method, c("dflt", "tune", "smote")) %>%
           fct_recode("default" = "dflt",
                      "tuned" = "tune",
                      "SMOTE tuned" = "smote"),
         set = fct_relevel(set, c("train", "test")) %>%
           fct_recode("training data" = "train",
                      "test data" = "test"),
         hyper = "Hyperparameters")

## roc data for plot
df_roc_tune <- roc_obj_tune %>%
  select(set, hyper, method, predvar, outcome, df) %>%
  unnest(df)

## Other metrics
df_perf_tune <- roc_obj_tune %>%
  ungroup() %>%
  select(set, hyper, method, predvar, outcome, auc, thresh) %>%
  unnest(thresh) %>%
  mutate(auc = unlist(auc),
         auc_lab = str_c("AUC: ", broman::myround(auc, 2)),
         perf_lab = str_c(auc_lab),
         `1-specificity` = 1 - specificity)

## Plot
p_roc_tune <- df_roc_tune %>%
  ggplot(aes(x = `1-specificity`, y = sensitivity)) +
  facet_nested(rows = vars(hyper, method),
               cols = vars(outcome, set)) +
  geom_abline(intercept = 0, slope = 1,
              linetype = "dashed", linewidth = 1,
              color = "red", alpha = 0.6) +
  geom_line(linewidth = 1.25) +
  geom_text(aes(x = .55, y = .45, label = perf_lab),
            data = df_perf_tune,
            size = 3,
            hjust = 0, vjust = 1) +
  coord_equal() +
  scale_x_continuous(breaks = c(0, 0.25, 0.5, 0.75, 1),
                     labels = c("0", "0.25", "0.5", "0.75", "1")) +
  scale_y_continuous(breaks = c(0, 0.25, 0.5, 0.75, 1),
                     labels = c("0", "0.25", "0.5", "0.75", "1")) +
  theme_bw() +
  theme(strip.text.x = ggtext::element_markdown(),
        panel.spacing = unit(0.2, "lines"))

ggsave(filename = "../fig/roc_crf_tune.png",
       plot = p_roc_tune,
       device = ragg::agg_png, dpi = 300,
       width = 10, height = 5.5, units = "in")
Disclaimer:
The findings and conclusions of this report are those of the authors and do not necessarily represent the official position of the Centers for Disease Control and Prevention.