# Read in Excel file and prepare data for analysis
excel_file <- "CHLa_data.xlsx" # <--- change as needed depending on your Excel workbook file name
# Define the two source dataframes
# 1 df.james
df.james <- openxlsx::read.xlsx(excel_file, sheet = 2)
# Convert `date` column from Excel encoded date to a more legible date format. Otherwise date shows as numeric value, e.g. '44230'.
df.james$date <- df.james$date * 86400 # 86400 = seconds in a day.
df.james$date <- as.POSIXct(df.james$date, origin = "1899-12-30", tz = "UTC")
# 2 df.discharge
df.discharge <- openxlsx::read.xlsx(excel_file, sheet = 3)
# Define column names
colnames(df.james)[3:5] <- c("SurfaceTemp", "CHLa", "DIN")
colnames(df.discharge) <- as.character(colnames(df.discharge))
# Define and set dynamic variables for use in e.g, for loops.
yr_range <-range(df.james$date)
min_yr <- substr(yr_range[1], 1, 4)
max_yr <- substr(yr_range[2], 1, 4)
years_vec <- c(min_yr:max_yr)
print(yr_range)
print(min_yr)
print(max_yr)
print(years_vec)
# Set and populate new fields for ln(CHLa),discharge, and day of year (1-365)
df.james <-
df.james %>%
mutate(ln_chla = log(df.james$CHLa),
Discharge = as.numeric(NA),
yday = yday(date) # `yday()` function returns day of year for a given date. )
)
# Add discharge values
for (i in years_vec) {
## subset DOYs by year and assign to variable
DOY_vec <- df.james$yday[year(df.james$date) == i]
## Add discharge values to data frame
df.james$Discharge[year(df.james$date) == i] <- df.discharge[DOY_vec, as.character(i)]
}
# Create factor for differentiating data points before and after 2018
df.james$period <- NA
for (i in seq_along(df.james$date)) {
df.james$period[i] <- ifelse(
year(df.james$date[i]) %in% c(2010:2018),
"2010-2018",
paste0("2019-", as.character(max_yr)))
}
# Generate plot using ggplot package
suppressWarnings(
ggplot(data = df.james) +
geom_point(aes(x = date, y = CHLa,
color = CHLa) ) +
geom_line(aes(x = date, y = CHLa,
color = CHLa) ) +
geom_hline(yintercept = 40, linetype = "dashed", color = "gray") +
labs(title = NULL, # or set title with quoted character string
y = paste0("Total CHLa (\u03BCg/L)"),
x = "Date",
) +
scale_y_continuous(breaks = seq(0, max(df.james$CHLa, na.rm = TRUE), by = 20)) +
theme_classic() +
theme(legend.position = "none",
legend.text = element_text(size = 10),
plot.title = element_text(size = 12),
axis.text.x = element_text(angle = 45, hjust = 1, size = 11),
axis.text.y = element_text(size = 11),
axis.line = element_line(color = "grey"),
axis.title.x = element_text(size = 15, color = "grey20", margin = margin(t = 15)),
axis.title.y = element_text(size = 15, color = "grey20", margin = margin(r = 15))) +
scale_x_datetime(date_labels = "%Y", date_breaks = "1 years",
limits = c(as.POSIXct(paste0(min_yr, "-01-01")), as.POSIXct(paste0(max_yr,"-12-31"))
) ) +
scale_color_viridis_c(option = "D", direction = 1) # Set color palette
)
# Save plot to working directory
ggsave(filename = "output/CHLa_timeseries.jpeg", device = "jpeg", dpi = 300)
suppressWarnings(
ggplot(data = df.james) +
geom_point(aes(x = date, y = DIN,
color = DIN), alpha = .8 ) +
geom_line(aes(x = date, y = DIN,
color = DIN) ) +
labs(title = NULL, # or set title with quoted character string
y = "Total DIN (mg/L)",
x = "Date",
color = "Year") +
scale_y_continuous(breaks = seq(0, max(df.james$DIN, na.rm = TRUE), by = .25)) +
theme_classic() +
theme(legend.position = "none",
legend.text = element_text(size = 10),
plot.title = element_text(size = 12),
axis.text.x = element_text(angle = 45, hjust = 1, size = 11),
axis.text.y = element_text(size = 11),
axis.line = element_line(color = "grey"),
axis.title.x = element_text(size = 14, color = "grey20", margin = margin(t = 15)),
axis.title.y = element_text(size = 14, color = "grey20", margin = margin(r = 15))) +
scale_x_datetime(date_labels = "%Y", date_breaks = "1 years",
limits = c(as.POSIXct(paste0(min_yr, "-01-01")), as.POSIXct(paste0(max_yr,"-12-31"))
) ) +
scale_color_gradient(low = "#40798C", high = "grey20") # manually set the two ends of the color ramp
)
mean_vals <- data.frame(Year = years_vec)
jul1_yday <- yday("2013-07-01")
oct31_yday <- yday("2023-10-31")
# Calculate mean discharge by year, Jul-Oct (all dates)
for (i in 2:ncol(df.discharge)) {
mean_vals$Discharge[i-1] <-
round(mean(df.discharge[jul1_yday:oct31_yday, i], na.rm=T), 2)
}
# Calculate mean discharge by year based only on CHLa sampling days during Jul-Oct.
df.samplingDays <-
data.frame(
DOY = yday(df.james$date[yday(df.james$date) >= jul1_yday &
yday( df.james$date) < oct31_yday] ),
year = year(df.james$date)[yday(df.james$date) >= jul1_yday &
yday( df.james$date) < oct31_yday]
)
for (i in (min(years_vec):max(years_vec)) ) {
df.samplingDays$discharge[df.samplingDays$year == i] <-
df.discharge[
df.samplingDays$DOY[df.samplingDays$year == i], as.character(i)]
}
for (i in 2:ncol(df.discharge)) {
doy <-
df.samplingDays$DOY[df.samplingDays$year ==
as.numeric(colnames(df.discharge)[i])]
mean_vals$Discharge_sampling[i-1] <-
round(
mean(df.discharge[doy, i], na.rm = TRUE), 2)
}
# Calculate mean for variables with weekly observation data
for (col in c("SurfaceTemp", "CHLa", "DIN", "SpCond") ) {
means <- df.james %>%
group_by(year = lubridate::year(date) ) %>%
filter(month(date) %in% c(7:10) ) %>%
summarize(mean_value = mean(!!sym(col), na.rm = TRUE) )
mean_vals[, col] <- round(means$mean_value, 2)
}
# Peak CHLa (90%-tile) by year
percentile_90 <-
df.james %>%
group_by(year(date)) %>%
summarize(CHLa_90th = round(quantile(CHLa, probs = .90, na.rm = TRUE), 2) )
summary_stats <- mean_vals %>%
left_join(percentile_90, by = join_by("Year" == "year(date)"))
print(head(summary_stats))
# Add superscripts to column names to correspond to footnotes specififying units
## discharge
colnames(summary_stats)[2] <- paste0(colnames(summary_stats)[2],
footnote_marker_alphabet(1) )
## surface temp
colnames(summary_stats)[4] <- paste0(colnames(summary_stats)[4],
footnote_marker_alphabet(2) )
## CHLa
colnames(summary_stats)[5] <- paste0(colnames(summary_stats)[5],
footnote_marker_alphabet(3) )
## DIN
colnames(summary_stats)[6] <- paste0(colnames(summary_stats)[6],
footnote_marker_alphabet(4) )
## specific conductance
colnames(summary_stats)[7] <- paste0(colnames(summary_stats)[7],
footnote_marker_alphabet(5) )
# Define footnote content. 'Regular expressions' are used to achieve special characters, like the mu or the superscript '3' for cubic meters.
fn.discharge <- "m\u00B3/s"
fn.temp <- "Celsius"
fn.chla <- "\u03BCg/L"
fn.din <- "mg/L"
fn.spcond <- "\u03BCS/cm"
# Print `summary_stats` dataframe as a kable table
summary_stats %>%
kable(align = "c",
caption = "James River Station JMS75 Mean Water Quality Values, by Year <br>
Means based on values observed during July-October index period",
escape = FALSE,
padding = 3) %>% # 'escape' argument necessary for <sup></sup> markup to render the superscripts properly in the column names
kable_paper() %>%
# This next section sets what appears in the footnotes using the variables defined above.
footnote(alphabet = c(fn.discharge,
fn.temp,
fn.chla,
fn.din,
fn.spcond)) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed") )
bloom_days <-
df.james %>%
filter(CHLa > 40 & month(date) %in% c(7:10) ) %>%
group_by(year(date) ) %>%
summarize(count = n() ) %>%
mutate("Bloom days" = count*7) %>% # readings are weekly, so multiply count by 7 to estimate totals
rename(Year = "year(date)") %>%
select(-count)
n_depleted <-
df.james %>%
filter(DIN < .07 & month(date) %in% c(7:10) ) %>%
group_by(year(date) ) %>%
summarize(count = n() ) %>%
mutate("N-depleted days" = count*7) %>%
rename(Year = "year(date)") %>%
select(-count)
low_flow <-
data.frame(Year = years_vec,
"Low flow days" = NA)
for (i in 2:15) {
low_flow[i-1, 2] <-
length(
which(df.discharge[jul1_yday:oct31_yday, i] < 100 ) )
}
joined.df <- bloom_days %>%
left_join(n_depleted, by = join_by("Year")) %>%
left_join(low_flow, by = join_by("Year"))
# Add superscripts to col names for unit specification
## bloom days
colnames(joined.df)[2] <- paste0(colnames(joined.df)[2], footnote_marker_alphabet(1) )
## N-depleted days
colnames(joined.df)[3] <- paste0(colnames(joined.df)[3], footnote_marker_alphabet(2) )
## low flow days
colnames(joined.df)[4] <- paste0(colnames(joined.df)[4], footnote_marker_alphabet(3) )
# Print table
joined.df %>%
kable(align = "c",
caption = paste0("Counts of Algal Bloom, N-Depleted, and Low Flow Days, Jul-Oct 2010-", max_yr),
escape = FALSE,
padding = 3) %>%
kable_paper() %>%
footnote(alphabet = c("CHLa > 40 \u03BCg/L",
"DIN < .07 mg/L",
"discharge < 100 m\u00B3/s") ) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed") )
# Data wrangling for Model 1
## Subset date, SurfaceTemp, CHLa
df.model1 <-
df.james[, c("date", "SurfaceTemp", "CHLa", "ln_chla", "Discharge", "period")]
## Derive T/Q ratio and ln(CHLa)
df.model1 <-
df.model1 %>%
mutate(t_q = SurfaceTemp/Discharge)
# Create dataframes to hold regression statistics
### For individual yr stats
reg.stats <-
data.frame(Model.Year = years_vec,
coefficient = as.numeric(NA),
coef.error = as.numeric(NA),
p.val = as.numeric(NA),
RSQR = as.numeric(NA)
)
### For all years combined
reg.stats2 <-
data.frame(Model.Year = "All years",
coefficient = as.numeric(NA),
coef.error = as.numeric(NA),
p.val = as.numeric(NA),
RSQR = as.numeric(NA)
)
# custom functions (see 'functions.R' file)
reg.stats <- regstat_eachyear(reg.df = reg.stats,
data = df.model1,
x = "t_q")
reg.stats2 <- regstat_combined(reg.df = reg.stats2,
data = df.model1,
x = "t_q")
# Bind together to create a single df
reg.stats <- rbind(reg.stats2, reg.stats)
# Plot of ln(CHLa) ~ T:Q for all years' data with best fit regression line.
# custom function (see 'functions.R' file)
plot_allYears(
data = df.model1,
x = "t_q",
y = "ln_chla",
formula = y ~ log(x),
method = "lm",
x.title = "Temperature:Discharge",
y.title = "ln(CHLa) (\u03BCg/L)",
axis.title.size = 14,
point.col = c("#ED254E", "#0096FF"))
ggsave(filename = "output/model1_regressionPlot.jpeg", device = "jpeg", dpi = 300)
#### **Chlorophyll A ~ Temperature/Discharge Ratio Plots by Year**
plot_eachYear(data = df.model1,
x = "t_q",
formula = y ~ log(x),
method = "lm",
reg.stats_ = reg.stats,
xlab = "Temperature:Discharge")
# View in table
reg.stats %>%
kable(align = "c",
caption = "Regression statistics by year for ln(CHLAa) ~ ln(T:Q)") %>%
kable_paper() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")
)
model1_tq <- lm(ln_chla ~ log(t_q), data = df.model1)
residuals_tq <- model1_tq$residuals
predicted_vals_tq <- fitted(model1_tq)
# CHLa TIME SERIES & MODEL ACCURACY PLOT
# custom function (see 'functions.R' file)
plot_bloomday_predictions(data = df.model1,
x = "t_q",
method = "lm")
# Create df with residuals and dates
df.residuals_tq <- as.data.frame(residuals_tq)
colnames(df.residuals_tq)[1] <- "residuals"
df.residuals_tq$date <- df.model1$date[as.numeric(rownames(df.residuals_tq))]
# custom function
plot.residuals_timeSeries(data = df.residuals_tq,
title = NULL)
# -- This requires associating each residual with a specific date/year.
## -- Determine the date for each observation used in the T:Q model
indices <- which(!is.na(df.model1$ln_chla) & !is.na(df.model1$t_q))
tq_dates <- as.POSIXct(df.model1$date[indices])
## -- Join dates to residuals vector
residuals_tq_date <- data.frame(residuals_tq, tq_dates)
jul_oct_Residuals <- residuals_tq_date[month(residuals_tq_date$tq_dates) %in% c(7:10), ]
# Create a box-whisker plot
ggplot(jul_oct_Residuals, aes(x = format(tq_dates, "%Y"), y = residuals_tq)) +
geom_boxplot() +
geom_hline(yintercept = 0, linetype = "solid", color = "salmon") + # Add a horizontal line at y=0
theme_classic() +
theme(legend.position = "none",
plot.title = element_text(size = 16),
axis.text.x = element_text(angle = 45, hjust = 1, size = 11.5),
axis.text.y = element_text(size = 11.5),
axis.line = element_line(color = "grey"),
axis.title.x = element_text(size = 16, color = "grey20", margin = margin(t = 15)),
axis.title.y = element_text(size = 16, color = "grey20", margin = margin(r = 15))) +
labs(title = "CHLa ~ T:Q Box-and-Whisker Index Period Residuals, ",
x = "Year",
y = "Residuals")
## Subset date, SurfaceTemp, CHLa
df.model2 <-
df.james[, c("date", "SurfaceTemp", "CHLa", "ln_chla", "Discharge", "period")]
# Create field containing DOY for each observation in df.james
df.model2 <- df.model2 %>%
mutate(yday = yday(date))
# Calculate 4 day rolling avg for discharge
for (i in years_vec) {
# subset DOYs by year and assign to variable
DOY_vec <- df.model2$yday[year(df.model2$date) == i]
# Calculate rolling average (`width` argument sets window width, i.e. number of days to calcualte average )
d <-
zoo::rollapply(df.discharge[, as.character(i)], width = 4, FUN = mean, align = "right", fill = NA)
# Add discharge values to data set with temperature and chla values
df.model2$DischargeAvg[year(df.model2$date) == i] <- d[DOY_vec]
}
# Derive T/Qavg ratio and ln(CHLa)
df.model2 <-
df.model2 %>%
mutate(t_q_avg = SurfaceTemp/DischargeAvg) %>%
mutate(ln_chla = log(CHLa)) %>%
select(-yday)
# Create factor for use in visually distinguishing 2010-2018 plot points from those 2019 and beyond.
df.model2$period <- NA
for (i in seq_along(df.model2$date)) {
df.model2$period[i] <- ifelse(
year(df.model2$date[i]) %in% c(2010:2018),
"2010-2018",
paste0("2019-", as.character(max_yr)))
}
# Create dataframes to hold regression statistics
### For individual yr stats
reg.stats <-
data.frame(Model.Year = years_vec,
coefficient = as.numeric(NA),
coef.error = as.numeric(NA),
p.val = as.numeric(NA),
RSQR = as.numeric(NA)
)
# All years df
reg.stats2 <-
data.frame(Model.Year = "All years",
coefficient = as.numeric(NA),
coef.error = as.numeric(NA),
p.val = as.numeric(NA),
RSQR = as.numeric(NA)
)
# custom functions (see 'functions.R' file)
reg.stats <- regstat_eachyear(reg.df = reg.stats,
data = df.model2,
x = "t_q_avg")
reg.stats2 <- regstat_combined(reg.df = reg.stats2,
data = df.model2,
x = "t_q_avg")
reg.stats <- rbind(reg.stats2, reg.stats)
# Plot for ln(CHLa) ~ T:Qavg, all years with best fit regression line
# custom function (see 'functions.R' file)
plot_allYears(
data = df.model2,
x = "t_q_avg",
y = "ln_chla",
formula = y ~ log(x),
method = "lm",
title = "Chlorophyll A ~ Temperature:Qavg",
x.title = expression("Temperature:Q"[avg]),
y.title = "ln(CHLa) (\u03BCg/L)",
point.col = c("#ED254E", "#0096FF"))
#### Chlorophyll A ~ Temperature/Discharge Ratio Plots by Year
# custom function (see 'functions.R' file)
plot_eachYear(data = df.model2,
x = "t_q_avg",
formula = y ~ log(x),
method = "lm",
reg.stats_ = reg.stats,
xlab = expression("Temperature:Q"[avg])
)
reg.stats %>%
kable(align = "c",
caption = "Regression statistics by year for ln(CHLAa) ~ ln(T/Qavg)") %>%
kable_paper() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")
)
# Create a factor, "bloom_days", to indicate model prediction vs observed value
model_tq_avg <- lm(ln_chla ~ log(t_q_avg), data = df.model2)
residuals_tq_avg <- model_tq_avg$residuals
predicted_values_tq_avg <- fitted(model_tq_avg)
# CHLa TIME SERIES & MODEL ACCURACY PLOT.
# custom function (see 'functions.R' file)
plot_bloomday_predictions(data = df.model2,
x = "t_q_avg",
method = "lm")
# Create df with residuals and dates
df.residuals_tq_avg <- as.data.frame(residuals_tq_avg)
colnames(df.residuals_tq_avg)[1] <- "residuals"
df.residuals_tq_avg$date <- df.model2$date[as.numeric(rownames(df.residuals_tq_avg))]
# custom function (see 'functions.R' file)
plot.residuals_timeSeries(data = df.residuals_tq_avg,
title = NULL)
## Subset date, SurfaceTemp, CHLa
df.model3 <- df.james[, c("date", "SurfaceTemp", "CHLa", "TtoCond")]
# Derive ln(CHLa)
df.model3$ln_chla <- log(df.model3$CHLa)
# Create factor for use in visually distinguishing 2010-2018 plot points from those 2019 and beyond.
df.model3$period <- NA
for (i in seq_along(df.model3$date)) {
df.model3$period[i] <- ifelse(
year(df.model3$date[i]) %in% c(2010:2018),
"2010-2018",
paste0("2019-", as.character(max_yr)))
}
# Create dfs to contain regression stats
reg.stats <-
data.frame(Model.Year = years_vec,
coefficient = as.numeric(NA),
coef.error = as.numeric(NA),
p.val = as.numeric(NA),
RSQR = as.numeric(NA)
)
reg.stats2 <-
data.frame(Model.Year = "All years",
coefficient = as.numeric(NA),
coef.error = as.numeric(NA),
p.val = as.numeric(NA),
RSQR = as.numeric(NA)
)
# Custom functions to generate stats
reg.stats <- regstat_eachyear(reg.stats, data = df.model3, x = "TtoCond")
reg.stats2 <- regstat_combined(reg.stats2, data = df.model3, x = "TtoCond")
# Bind together 'all years' and individual year regression statistics
reg.stats <- rbind(reg.stats2, reg.stats)
# Plot for ln(CHLa) ~ TtoCond, all years with best fit regression line.
# custom function (see 'functions.R' file)
plot_allYears(
data = df.model3,
x = "TtoCond",
y = "ln_chla",
formula = y ~ log(x),
method = "lm",
title = "CHLa~ Temperature/Inverse Special Conductivity Ratio",
x.title = "Temperature:InvSpCond",
y.title = "ln(CHLa) (\u03BCg/L)",
point.col = c("#ED254E", "#0096FF"))
#### Chlorophyll A ~ Temperature/Inverse Special Conductivity Ratio, by Year
# custom function (see 'functions.R' file)
plot_eachYear(data = df.model3,
x = "TtoCond",
formula = y ~ log(x),
method = "lm",
reg.stats_ = reg.stats,
xlab = "Temperature:InvSpCond")
# View in table
reg.stats %>%
kable(align = "c",
caption = "Regression statistics by year for ln(CHLAa) ~ ln(T:InvSpCond)") %>%
kable_paper() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")
)
# Create a factor, "bloom_days", to indicate model prediction vs observed value
model_tcond <- lm(ln_chla ~ log(TtoCond), data = df.model3)
residuals_tcond <- model_tcond$residuals
predicted_values_tcond <- fitted(model_tcond)
# CHLa TIME SERIES & MODEL ACCURACY PLOT.
# custom function (see 'functions.R' file)
plot_bloomday_predictions(data = df.model3,
x = "TtoCond",
method = "lm") # Custom function
# Create df with residuals and dates
df.residuals_tcond <- as.data.frame(residuals_tcond)
colnames(df.residuals_tcond)[1] <- "residuals"
df.residuals_tcond$date <- df.model3$date[as.numeric(rownames(df.residuals_tcond))]
# custom function (see 'functions.R' file)
plot.residuals_timeSeries(data = df.residuals_tcond,
title = NULL)
\[CHLa \sim CHLamax \times \tanh\left(\frac{\alpha \times T}{CHLamax}\right)\]
# Recycle df.model1 dataframe, as it already has t_q values.
df_tang <- df.model1[, -9]
model_tang <- nls(CHLa ~ CHLamax * tanh((alpha * t_q)/CHLamax),
data = df_tang,
start = c(alpha = 50, CHLamax = 10),
control = nls.control(maxiter = 2000))
mod_tang_sum <- summary(model_tang)
mod_tang_predictions <- fitted(model_tang)
obs_fit <- data.frame(observed = df_tang$CHLa[!is.na(df_tang$CHLa)],
predicted = fitted(model_tang),
date = df_tang$date[!is.na(df_tang$CHLa)])
# Plot for ln(CHLa) ~ T:Qavg, all years
plot_allYears <- ggplot(df_tang, aes(x = t_q, y = CHLa)) +
geom_point(aes(col=period), alpha = 0.55) +
geom_smooth(method = "nls", formula = y ~ CHLamax * tanh((alpha * x)/CHLamax),
se = FALSE, linetype = "solid", color = "grey45", alpha = 0.5,
method.args = list(start = coef(model_tang))) +
labs(title = NULL,
x = "Temperature:Discharge",
y = "CHLa (\u03BCg/L)") +
scale_color_manual(values = c("#ED254E", "#0096FF"),
name = NULL) +
theme(title = element_text(size = 8)) +
theme_minimal()
suppressWarnings(
print(plot_allYears)
)
## LINEAR REGRESSION STATISTICS
reg.stats_tang <-
data.frame(Model.Year = years_vec,
alpha.coef = as.numeric(NA),
alpha.p.val = as.numeric(NA),
alpha.se = as.numeric(NA),
CHLamax.coef = as.numeric(NA),
CHLamax.pval = as.numeric(NA),
CHLamax.se = as.numeric(NA) )
# Run the model for each year
for (i in seq_along(years_vec)) {
data_subset <-
df_tang[year(df_tang$date) == years_vec[i], ]
res_tang <-
summary(
nls(CHLa ~ CHLamax * tanh((alpha * t_q)/CHLamax),
data = data_subset,
start = c(alpha = 70, CHLamax = 20),
control = nls.control(maxiter = 1000) )
)
reg.stats_tang$alpha.coef[i] <- round(res_tang$coefficients[1,1], 4)
reg.stats_tang$alpha.p.val[i] <- round(res_tang$coefficients[1,4], 4)
reg.stats_tang$alpha.se[i] <- round(res_tang$coefficients[1,2] ,4)
reg.stats_tang$CHLamax.coef[i] <- round(res_tang$coefficients[2,1], 4)
reg.stats_tang$CHLamax.pval[i] <- round(res_tang$coefficients[2,4], 4)
reg.stats_tang$CHLamax.se[i] <- round(res_tang$coefficients[2,2], 4)
}
# Add regression stats for all years -- Repeat same as above without for loop and create a 1 row data frame.
reg.stats_tang2 <-
data.frame(Model.Year = "All years",
alpha.coef = as.numeric(NA),
alpha.p.val = as.numeric(NA),
alpha.se = as.numeric(NA),
CHLamax.coef = as.numeric(NA),
CHLamax.pval = as.numeric(NA),
CHLamax.se = as.numeric(NA) )
res_tang <-
summary(
nls(CHLa ~ CHLamax * tanh((alpha * t_q)/CHLamax),
data = df_tang,
start = c(alpha = 70, CHLamax = 10),
control = nls.control(maxiter = 1000) )
)
reg.stats_tang2$alpha.coef <- round(res_tang$coefficients[1,1], 4)
reg.stats_tang2$alpha.p.val <- round(res_tang$coefficients[1,4], 4)
reg.stats_tang2$alpha.se <- round(res_tang$coefficients[1,2] ,4)
reg.stats_tang2$CHLamax.coef <- round(res_tang$coefficients[2,1], 4)
reg.stats_tang2$CHLamax.pval <- round(res_tang$coefficients[2,4], 4)
reg.stats_tang2$CHLamax.se <- round(res_tang$coefficients[2,2], 4)
# Bind together 'all years' and individual year regression statistics
reg.stats_tang <- rbind(reg.stats_tang2, reg.stats_tang)
for (yr in years_vec) {
plot <-
ggplot(df_tang, aes(x = t_q, y = CHLa)) +
geom_smooth(data = df_tang[year(df_tang$date) == yr,],
method = "nls",
formula = y ~ CHLamax * tanh((alpha * x)/CHLamax),
se = FALSE, linetype = "solid", color = "grey45", alpha = 0.75,
method.args = list(start = coef(model_tang))) +
geom_point(data = df_tang[year(df_tang$date) == yr,],
color = ifelse(yr < 2019, "#ED254E", "#0096FF"),
alpha = .6) +
labs(title = yr,
x = "Temperature:Discharge",
y = "CHLa (\u03BCg/L)") +
theme(title = element_text(size = 8)) +
theme_minimal()
suppressWarnings(
print(plot)
)
}
ggplot(obs_fit, aes(x = predicted, y = residuals(model_tang) ) ) +
geom_point(alpha = .65) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs(x = "Predicted Values", y = "Residuals", title = "Residuals vs. Predicted") +
theme_classic()
reg.stats_tang %>%
kable(align = "c",
caption = "CHLa ~ CHLamax * tanh((alpha * t_q)/CHLamax) Model Statistics, by Year") %>%
kable_paper() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")
)
# Create df with residuals and dates
df.residuals_tang <- as.data.frame(residuals(model_tang))
colnames(df.residuals_tang)[1] <- "residuals"
df.residuals_tang$date <- df_tang$date[as.numeric(rownames(df.residuals_tang))]
# custom function (see 'functions.R' file)
plot.residuals_timeSeries(data = df.residuals_tang,
title = NULL)
## TIME SERIES PLOT & MODEL ACCURACY TABLE
# custom function (see 'functions.R' file)
plot_bloomday_predictions(data = df_tang,
x = "t_q",
method = "nls")
# ---- CUSTOM FUNCTIONS USED IN CHLa MODELLING, ANALYSIS, and DATA VISUALIZATION ----- #
# function: plot_allYears(data, x, y, formula, method, title, x.title, y.title, axis.title.size)
# Plots CHLa ~ T:Q for all available data across all years
## arguments
#### data = dataframe with model variables
#### x = column name in `data` containing the values of x
#### y = column name in `data` containing the values of y
#### formula = model formula, unquoted (e.g., y ~ log(x))
#### method = quoted, either "lm" or "nls"
#### title = defines plot title, character string
#### x.title = character string to label variable on x axis
#### y.title= character string to label variable on y axis
#### axis.title.size = numeric; axis title font size. defaults to 16
plot_allYears <- function(data, x, y, formula, method, title, x.title, y.title, axis.title.size = 16, point.col) {
y.coef <- round(reg.stats[1, 2], 4)
R2 <- round(reg.stats[1, 5], 4)
p.val <- round(reg.stats[1, 4], 2)
p <- ggplot(data = data[!is.na(data[[x]]) & !is.na(data[[y]]),]) +
geom_point(aes(x = !!sym(x), y = !!sym(y), color = factor(period)),
size = 2.25,
alpha = 0.55) +
geom_smooth(aes(x = !!sym(x), y = !!sym(y)),
color = "grey45",
linewidth = 1,
alpha = 0.5,
formula = formula,
method = method,
show.legend = FALSE) +
labs(title = NULL,
x = x.title,
y = y.title) +
theme_classic() +
theme(legend.position = "right",
legend.text = element_text(size = 10),
plot.title = element_text(size = 16, margin = margin(b = 10)),
axis.text.x = element_text(size = 11.5),
axis.text.y = element_text(size = 11.5),
axis.line = element_line(color = "grey"),
axis.title.x = element_text(size = axis.title.size, color = "grey20", margin = margin(t = 5)),
axis.title.y = element_text(size = axis.title.size, color = "grey20", margin = margin(r = 5))) +
scale_color_manual(values = point.col, name = NULL)
if (x == "t_q" | x == "t_q_avg") {
plot <- p +
geom_text(aes(x = .6, y = 1.1),
label = paste("y = ", round(reg.stats[1, 2],4), "\n",
" R\u00B2 = ", round(reg.stats[1 ,5],4)),
size = 4, hjust=0) +
scale_x_continuous(breaks = seq(0, max(data[[x]], na.rm = TRUE), by = 0.2))
} else if (x == "TtoCond") {
plot <- p +
geom_text(aes(x = 22, y = 1.1),
label = paste("y = ", round(reg.stats[1, 2],4), "\n",
" R\u00B2 = ", round(reg.stats[1 ,5],4)),
size = 4, hjust=0) +
xlim(c(0, 40)) +
scale_x_continuous(breaks = seq(0, max(data[[x]], na.rm = TRUE), by = 10))
}
print(plot)
}
# ------------------------------------------------------------------------------------
# function: plot_eachYear(data, x, formula, method, reg.stats_, xlab)
# Generates linear regression plots by year for time series data
## arguments
#### data = dataframe with model variables
#### x = column name in `data` containing the values of x
#### formula = model formula, unquoted
#### method = either one of "lm" or "nls"
#### reg.stats_ = the dataframe containing derived model regression statistics
#### xlab = character string containing the ratio used as predictor, e.g., "Temperature:Discharge"
plot_eachYear <-function(data, x, formula, method, reg.stats_, xlab) {
for (yr in years_vec) {
max_x <- max(data[year(data$date) == yr, x], na.rm = TRUE)
max_y <- max(data[year(data$date) == yr, "ln_chla"], na.rm = TRUE)
text_x <- max_x * 0.75 # 75% of the max x value
text_y <- max_y * 0.30 # 30% of the max y values
plot <-
ggplot() +
geom_smooth(data = data[year(data$date) == yr,],
aes(x = !!sym(x), y = ln_chla),
color = "grey30",
alpha = .75,
formula = formula,
method = method,
show.legend = FALSE) +
geom_point(data = data[year(data$date) == yr,],
aes(x = !!sym(x), y = ln_chla),
color = ifelse(yr < 2019, "#ED254E", "#0096FF"),
alpha = .6) +
geom_text(aes(x = text_x, y = text_y),
label = paste(" R\u00B2 = ", round(reg.stats_[reg.stats_$Model.Year == yr ,5],4), "\n",
"coef. = ", round(reg.stats_[reg.stats_$Model.Year == yr, 2],4), "\n",
"p = ", round(reg.stats_[reg.stats_$Model.Year == yr, 4], 2) ),
size = 3.5, hjust=0) +
labs(title = yr,
x = xlab,
y = "ln(CHLa) (\u03BCg/L)") +
theme(title = element_text(size = 8)) +
theme_minimal()
suppressWarnings(
print(plot)
)
}
}
# ------------------------------------------------------------------------------------
# function: plot.residuals_timeSeries()
## use: generate time series plot for observed CHLa with points colored to indicate model accuracy in predicting a bloom day for a given date.
## arguments
#### data = dataframe containing model residuals and observation dates
#### title = string defining plot title
#### axis.title.size = numeric; axis title font size. defaults to 16
plot.residuals_timeSeries <- function(data, title = NULL, axis.title.size = 16) {
plot <-
ggplot(data = data) +
geom_hline(yintercept = 0, linetype = "solid", color = "salmon") +
geom_line(aes(x = date, y = residuals), col="grey75" ) +
geom_point(aes(x = date, y = residuals), alpha = .65) +
labs(title = title,
y = "Residuals",
x = "Date",
color = "Year") +
theme_classic() +
theme(legend.position = "top",
legend.text = element_text(size = 10),
plot.title = element_text(size = 16, margin = margin(b = 15)),
axis.text.x = element_text(angle = 45, hjust = 1, size = 11.5),
axis.text.y = element_text(size = 11.5),
axis.line = element_line(color = "grey"),
axis.title.x = element_text(size = axis.title.size, color = "grey20", margin = margin(t = 15)),
axis.title.y = element_text(size = axis.title.size, color = "grey20", margin = margin(r = 15))) +
scale_x_datetime(date_labels = "%Y", date_breaks = "1 years",
limits = c(as.POSIXct(paste0(min_yr, "-01-01")), as.POSIXct(paste0(max_yr,"-12-31"))
)
)
print(plot)
}
# ------------------------------------------------------------------------------------
# function: plot_bloomday_predictions(bloom.data)
## use: generate time series plot for observed CHLa with points colored to indicate the model accuracy in predicting a bloom day for a given date.
## arguments:
#### data = data frame containing model variables
#### x = quoted column name of model predictor variable
#### method = "lm" or "nls", the method used to define the model formula
#### axis.title.size = numeric; axis title font size. defaults to 16
plot_bloomday_predictions <- function(data, x, method, axis.title.size = 16) {
if (method == "lm") {
formula_string <- paste0("ln_chla ~ log(", x, ")")
} else {
if (method == "nls") {
formula_string <- paste0("CHLa ~ CHLamax * tanh((alpha *", x, ")/CHLamax)")
}
}
if (method == "lm") {
model <- lm(as.formula(formula_string), data = data)
residuals <- model$residuals
} else {
if (method == "nls"){
model <- nls(as.formula(formula_string), data = data,
start = c(alpha = 50, CHLamax = 15),
control = nls.control(maxiter = 2000))
residuals <- residuals(model)
}
}
predicted_vals <- fitted(model)
# Create vector corresponding to row numbers for all non-NA CHLa values.
if (x == "TtoCond") {
notNA_indices <- which(!is.na(data$CHLa) & !is.na(data$TtoCond))
} else {
notNA_indices <- which(!is.na(data$CHLa))
}
# Create data frame with residuals, predicted vals, and the row number corresponding to each observation in the original data frame
observed_predicted <-
data.frame(
rownum = notNA_indices,
residuals = residuals,
predicted = predicted_vals,
observed = data$CHLa[notNA_indices],
predicted_exp = NA)
# Exponentiate predicted values
observed_predicted$predicted_exp <- exp(observed_predicted$predicted)
observed_predicted <- observed_predicted %>%
mutate( bloom_days = case_when(
observed >= 40 & predicted_exp >= 40 ~ "correctly predicted",
observed <= 40 & predicted_exp <= 40 ~ "correctly predicted",
observed >= 40 & predicted_exp <= 40 ~ "false negative",
observed <= 40 & predicted_exp >= 40 ~ "false positive",
TRUE ~ NA
) )
# Define a column that contains the row number for each observed value, to be used in joining bloom day data
data$row <- which(data$date==data$date)
data <-
data %>%
left_join(observed_predicted %>%
select(rownum, bloom_days),
by = join_by(row == rownum) )
# Create mode accuracy table
bloom_table <-
as.data.frame(table(data$bloom_days))
colnames(bloom_table) <- c("Model Result", "Count")
bloom_table$Percentage <-
paste0(
round(bloom_table$Count/sum(bloom_table$Count), 2) * 100,
"%")
table <- bloom_table %>%
kable(align = "c",
caption = NULL) %>%
kable_paper() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed") )
# Create and generate plot
p <-
suppressWarnings(
ggplot(data = na.omit(data)) +
geom_line(aes(x = date, y = CHLa), col = "#D9E0E8") +
geom_point(aes(x = date, y = CHLa, color = bloom_days), alpha = .65) +
geom_hline(yintercept = 40, linetype = "dashed", color = "grey") +
geom_segment(aes(x = as.POSIXct("2010-01-01"), y = 40,
xend = as.POSIXct("2010-01-01"), yend = 25),
linetype = "dashed", color = "grey60") +
geom_text(aes(x = as.POSIXct("2009-10-01"), y = 3),
label = "bloom\nthreshold",
hjust = 0, vjust = -1, color = "grey60", size = 3) +
labs(title = paste0("Predictive Success of T:Q Model, 2010-", max_yr),
subtitle = NULL,
y = paste0("Observed total Chla (\u03BCg/L)"),
x = "Date",
color = "Year") +
theme_classic() +
theme(legend.position = "top",
legend.text = element_text(size = 10),
plot.title = element_text(size = 16, margin = margin(b = 10)),
axis.text.x = element_text(angle = 45, hjust = 1, size = 11.5),
axis.text.y = element_text(size = 11),
axis.line = element_line(color = "grey"),
axis.title.x = element_text(size = axis.title.size, color = "grey20", margin = margin(t = 15)),
axis.title.y = element_text(size = axis.title.size, color = "grey20", margin = margin(r = 15))) +
scale_x_datetime(date_labels = "%Y", date_breaks = "1 years",
limits = c(as.POSIXct("2009-10-01"), as.POSIXct("2023-12-31"))) +
scale_y_continuous(breaks = seq(from = 0, max(df.james$CHLa, na.rm = TRUE), by=20)) +
scale_color_manual(values = c("correctly predicted" = "grey30",
"false negative" = "#0096FF",
"false positive" = "salmon"),
name = NULL)
)
print(p)
table
}
# ------------------------------------------------------------------------------------
# function: regstat_eachyear(reg.df, data, x)
## use: derive regression statistics for individual model years and store them in a single dataframe
## arguments
#### reg.df = dataframe to contain regression statistics
#### data = dataframe containing model variable data
#### x = column name in dataframe containing the values of x
regstat_eachyear <- function(reg.df, data, x) {
for (i in seq_along(reg.df$Model.Year)) {
subset_data <- data[year(data$date) == reg.df$Model.Year[i], ]
res <- summary(
lm(ln_chla ~ log(subset_data[[x]]), data = subset_data)
)
reg.df$coefficient[i] <- round(res$coefficients[2,1], 4) # add reg. coefficient to results summary
reg.df$coef.error[i] <- round(res$coefficients[2,2], 4) # add coefficient error to results summary
reg.df$p.val[i] <- round(res$coefficients[2,4], 4) # add p values to results summary
reg.df$RSQR[i] <- round(res$adj.r.squared, 4) # add adj. r sq to results summary
}
return(reg.df)
}
# ------------------------------------------------------------------------------------
# function: regstat_combined(reg.df, data, x)
# use: derive regression statistics for all years combined and store in a one-row dataframe
## arguments
#### reg.df = dataframe to contain regression statistics
#### data = dataframe containing model variable data
#### x = column name in dataframe containing the values of x
regstat_combined <- function(reg.df, data, x) {
res <-
summary(lm(ln_chla ~ log(data[[x]]), data = data))
reg.df[1, 2] <- round(res$coefficients[2,1], 4)
reg.df[1, 3] <- round(res$coefficients[2,2], 4)
reg.df[1, 4] <- round(res$coefficients[2,4], 4)
reg.df[1, 5] <- round(res$adj.r.squared, 4)
return(reg.df)
}
# ------------------------------------------------------------------------------------