knitr::opts_chunk$set(echo = TRUE)
# Install any missing packages
install_packages <- function(packages) {
  new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
  if(length(new_packages) > 0) {
    install.packages(new_packages, dependencies = TRUE)
  }
}

packages <- c("ggplot2", "tidyverse", "mblm")
install_packages(packages)

library(ggplot2)
library(tidyverse)
library(mblm)

Read in Data

trendSites.df <- openxlsx::read.xlsx("input_data/data_original.xlsx", sheet = 2)
allData.df <- openxlsx::read.xlsx("input_data/data_original.xlsx", sheet = 3)

# Convert `date` column from Excel encoded date to a more legible date format. Otherwise date shows as numeric value, e.g. '44230'.
    allData.df$Date <- allData.df$Date * 86400      # 86400 = seconds in a day.
    allData.df$Date <- as.POSIXct(allData.df$Date, origin = "1899-12-30", tz = "UTC")

# a surface temperature (Tmax) of 2.8 C in August
# 13774     6ACNR000.00     8/28/1990   0.3     NA  NA  NA  NA

which(grepl("13774", allData.df$X1))

## [1] 13770

allData.df[13770, (5:8)] <- NA

Data Filtering and Subsetting

# filter for rows corresponding to the 41 stations
station_IDS <- trendSites.df$FDT_STA_ID

surfaceOnly_IDs <- c("2-JKS053.48",
                    "2-XDD000.40",
                    "4AROA192.55",
                    "4AROA196.05",
                    "5ASRN000.66",
                    "6ACNR000.00",
                    "6APNR008.15") # only surface water measurements ever taken

# Apr to August only; reservoirs are cooling by Oct
subset.df <- allData.df %>%
  mutate(MonthNum = lubridate::month(Date)) %>%  
  filter(FDT_STA_ID %in% station_IDS) %>%
  filter(MonthNum %in% 4:8) %>%
  filter(year(Date) >= 1999) %>%
  mutate(stationYear = paste0(FDT_STA_ID,"-", as.character(year(Date))))

# Identify station years lacking April or August observations
AprAugust_present <- subset.df %>%
  group_by(FDT_STA_ID, year(Date)) %>%
  summarize(Apr_obs = any(lubridate::month(Date) == 4),
            Aug_obs = any(lubridate::month(Date) == 8)) %>%
  filter(Apr_obs == TRUE & Aug_obs == TRUE) %>%
  mutate(staYear = paste0(FDT_STA_ID, "-", `year(Date)`)) # Create station-year field to facilitate filtering
  
staYrs_AprAug <- AprAugust_present$staYear

data_AprAug <- subset.df %>%
  filter(stationYear %in% staYrs_AprAug)

Apr to August Modelling

Linear Regression, `variable ~ DOY` by Station-Year

Variable-by-variable filtering must be done to eliminate station-years where Nobs < 3. Some station-years that passed initial filter above nevertheless lack 4 or more observations for specific variables.

slopesList <- list()

Min, Mean, Range slopes

Exclude surface-only stations from these analyses

Temp

# TMin
## --------------------------------------------
# Derive valid Nobs for each station-year
TMin_Nobs <- data_AprAug %>%
  group_by(stationYear) %>%
  summarize(n(),
            NA_count = length(which(is.na(TMin)))) %>%
  mutate(validObs = `n()` - NA_count) %>%
  select(stationYear,validObs) %>%
  ungroup()

# Identify station-years w/ fewer than 3 Nobs
TMin_insufNobs  <- TMin_Nobs %>%
  filter(validObs < 3)

TMin_data <- subset.df %>%
  select(FDT_STA_ID, TMin, DOY, Date, stationYear) %>%
  filter(!stationYear %in% TMin_insufNobs$stationYear) %>%
  filter(!FDT_STA_ID %in% surfaceOnly_IDs)

# Identify station years lacking Apr or August observations
AprAug_present_tmin <- TMin_data %>%
  group_by(FDT_STA_ID, year(Date)) %>%
  summarize(Apr_obs = any(lubridate::month(Date) == 4),
            Aug_obs = any(lubridate::month(Date) == 8)) %>%
  filter(Apr_obs == TRUE & Aug_obs == TRUE) %>%
  mutate(stationYear = paste0(FDT_STA_ID, "-", `year(Date)`)) %>% # Create station-year field to facilitate filtering
  ungroup()

staYrs_TMin <- AprAug_present_tmin$stationYear

TMin_data <- TMin_data %>%
  filter(stationYear %in% staYrs_TMin)

# df to hold model slopes
TMin_lmslopes <- data.frame(StationYear = unique(TMin_data$stationYear),
                          model_slope = NA,
                          pval = NA,
                          stdErr = NA)


for (sy in unique(TMin_data$stationYear)) {
  model_data <- TMin_data %>%
    filter(stationYear == sy)
  
  model <- lm(TMin ~ DOY, data = model_data)
  modsum <- summary(model)
  
  TMin_lmslopes[TMin_lmslopes$StationYear == sy, 2] <- modsum$coefficients[2, 1]
  TMin_lmslopes[TMin_lmslopes$StationYear == sy, 3] <- modsum$coefficients[2, 4]
  TMin_lmslopes[TMin_lmslopes$StationYear == sy, 4] <- modsum$coefficients[2, 2]
  
}

TMin_lmslopes <- TMin_lmslopes %>%
  mutate(Year = as.numeric(sub(".*-", "", StationYear))) %>%   #  ".*-" will match any sequence of characters followed by a hyphen (i.e., the station ID) 
  mutate(StationID = sub("-[0-9]{4}$", "", StationYear)) # "-[0-9]{4}$" will match a hyphen followed by exactly four digits at the end of the string (i.e., the year)

slopesList[["TMin"]] <- TMin_lmslopes

# TMean
## --------------------------------------------
# Derive valid Nobs for each station-year
TMean_Nobs <- data_AprAug %>%
  group_by(stationYear) %>%
  summarize(n(),
            NA_count = length(which(is.na(TMean)))) %>%
  mutate(validObs = `n()` - NA_count) %>%
  select(stationYear,validObs) %>%
  ungroup()

# Identify station-years w/ fewer than 3 Nobs
TMean_insufNobs  <- TMean_Nobs %>%
  filter(validObs < 3)

TMean_data <- subset.df %>%
  select(FDT_STA_ID, TMean, DOY, Date, stationYear) %>%
  filter(!stationYear %in% TMean_insufNobs$stationYear) %>%
  filter(!FDT_STA_ID %in% surfaceOnly_IDs)

# Identify station years lacking Apr or August observations
AprAug_present_TMean <- TMean_data %>%
  group_by(FDT_STA_ID, year(Date)) %>%
  summarize(Apr_obs = any(lubridate::month(Date) == 4),
            Aug_obs = any(lubridate::month(Date) == 8)) %>%
  filter(Apr_obs == TRUE & Aug_obs == TRUE) %>%
  mutate(stationYear = paste0(FDT_STA_ID, "-", `year(Date)`)) %>% # Create station-year field to facilitate filtering
  ungroup()

staYrs_TMean <- AprAug_present_TMean$stationYear

TMean_data <- TMean_data %>%
  filter(stationYear %in% staYrs_TMean)

# df to hold model slopes
TMean_lmslopes <- data.frame(StationYear = unique(TMean_data$stationYear),
                          model_slope = NA,
                          pval = NA,
                          stdErr = NA)


for (sy in unique(TMean_data$stationYear)) {
  model_data <- TMean_data %>%
    filter(stationYear == sy)
  
  model <- lm(TMean ~ DOY, data = model_data)
  modsum <- summary(model)
  
  TMean_lmslopes[TMean_lmslopes$StationYear == sy, 2] <- modsum$coefficients[2, 1]
  TMean_lmslopes[TMean_lmslopes$StationYear == sy, 3] <- modsum$coefficients[2, 4]
  TMean_lmslopes[TMean_lmslopes$StationYear == sy, 4] <- modsum$coefficients[2, 2]
  
}

TMean_lmslopes <- TMean_lmslopes %>%
  mutate(Year = as.numeric(sub(".*-", "", StationYear))) %>%   #  ".*-" will match any sequence of characters followed by a hyphen (i.e., the station ID) 
  mutate(StationID = sub("-[0-9]{4}$", "", StationYear)) # "-[0-9]{4}$" will match a hyphen followed by exactly four digits at the end of the string (i.e., the year)

slopesList[["TMean"]] <- TMean_lmslopes

# TRange
## --------------------------------------------
# Derive valid Nobs for each station-year
TRange_Nobs <- data_AprAug %>%
  group_by(stationYear) %>%
  summarize(n(),
            NA_count = length(which(is.na(TRange)))) %>%
  mutate(validObs = `n()` - NA_count) %>%
  select(stationYear,validObs) %>%
  ungroup()

# Identify station-years w/ fewer than 3 Nobs
TRange_insufNobs  <- TRange_Nobs %>%
  filter(validObs < 3)

TRange_data <- subset.df %>%
  select(FDT_STA_ID, TRange, DOY, Date, stationYear) %>%
  filter(!stationYear %in% TRange_insufNobs$stationYear) %>%
  filter(!FDT_STA_ID %in% surfaceOnly_IDs)

# Identify station years lacking Apr or August observations
AprAug_present_TRange <- TRange_data %>%
  group_by(FDT_STA_ID, year(Date)) %>%
  summarize(Apr_obs = any(lubridate::month(Date) == 4),
            Aug_obs = any(lubridate::month(Date) == 8)) %>%
  filter(Apr_obs == TRUE & Aug_obs == TRUE) %>%
  mutate(stationYear = paste0(FDT_STA_ID, "-", `year(Date)`)) %>% # Create station-year field to facilitate filtering
  ungroup()

staYrs_TRange <- AprAug_present_TRange$stationYear

TRange_data <- TRange_data %>%
  filter(stationYear %in% staYrs_TRange)

# df to hold model slopes
TRange_lmslopes <- data.frame(StationYear = unique(TRange_data$stationYear),
                          model_slope = NA,
                          pval = NA,
                          stdErr = NA)


for (sy in unique(TRange_data$stationYear)) {
  model_data <- TRange_data %>%
    filter(stationYear == sy)
  
  model <- lm(TRange ~ DOY, data = model_data)
  modsum <- summary(model)
  
  TRange_lmslopes[TRange_lmslopes$StationYear == sy, 2] <- modsum$coefficients[2, 1]
  TRange_lmslopes[TRange_lmslopes$StationYear == sy, 3] <- modsum$coefficients[2, 4]
  TRange_lmslopes[TRange_lmslopes$StationYear == sy, 4] <- modsum$coefficients[2, 2]
  
}

TRange_lmslopes <- TRange_lmslopes %>%
  mutate(Year = as.numeric(sub(".*-", "", StationYear))) %>%   #  ".*-" will match any sequence of characters followed by a hyphen (i.e., the station ID) 
  mutate(StationID = sub("-[0-9]{4}$", "", StationYear)) # "-[0-9]{4}$" will match a hyphen followed by exactly four digits at the end of the string (i.e., the year)

slopesList[["TRange"]] <- TRange_lmslopes

Dissolved Oxygen

# DOMin
## --------------------------------------------
# Derive valid Nobs for each station-year
DOMin_Nobs <- data_AprAug %>%
  group_by(stationYear) %>%
  summarize(n(),
            NA_count = length(which(is.na(DOMin)))) %>%
  mutate(validObs = `n()` - NA_count) %>%
  select(stationYear,validObs) %>%
  ungroup()

# Identify station-years w/ fewer than 3 Nobs
DOMin_insufNobs  <- DOMin_Nobs %>%
  filter(validObs < 3)

DOMin_data <- subset.df %>%
  select(FDT_STA_ID, DOMin, DOY, Date, stationYear) %>%
  filter(!stationYear %in% DOMin_insufNobs$stationYear) %>%
  filter(!FDT_STA_ID %in% surfaceOnly_IDs)

# Identify station years lacking Apr or August observations
AprAug_present_DOMin <- DOMin_data %>%
  group_by(FDT_STA_ID, year(Date)) %>%
  summarize(Apr_obs = any(lubridate::month(Date) == 4),
            Aug_obs = any(lubridate::month(Date) == 8)) %>%
  filter(Apr_obs == TRUE & Aug_obs == TRUE) %>%
  mutate(stationYear = paste0(FDT_STA_ID, "-", `year(Date)`)) %>% # Create station-year field to facilitate filtering
  ungroup()

staYrs_DOMin <- AprAug_present_DOMin$stationYear

DOMin_data <- DOMin_data %>%
  filter(stationYear %in% staYrs_DOMin)

# df to hold model slopes
DOMin_lmslopes <- data.frame(StationYear = unique(DOMin_data$stationYear),
                          model_slope = NA,
                          pval = NA,
                          stdErr = NA)


for (sy in unique(DOMin_data$stationYear)) {
  model_data <- DOMin_data %>%
    filter(stationYear == sy)
  
  model <- lm(DOMin ~ DOY, data = model_data)
  modsum <- summary(model)
  
  DOMin_lmslopes[DOMin_lmslopes$StationYear == sy, 2] <- modsum$coefficients[2, 1]
  DOMin_lmslopes[DOMin_lmslopes$StationYear == sy, 3] <- modsum$coefficients[2, 4]
  DOMin_lmslopes[DOMin_lmslopes$StationYear == sy, 4] <- modsum$coefficients[2, 2]
  
}

DOMin_lmslopes <- DOMin_lmslopes %>%
  mutate(Year = as.numeric(sub(".*-", "", StationYear))) %>%   #  ".*-" will match any sequence of characters followed by a hyphen (i.e., the station ID) 
  mutate(StationID = sub("-[0-9]{4}$", "", StationYear)) # "-[0-9]{4}$" will match a hyphen followed by exactly four digits at the end of the string (i.e., the year)

slopesList[["DOMin"]] <- DOMin_lmslopes

# DOMean
## --------------------------------------------
# Derive valid Nobs for each station-year
DOMean_Nobs <- data_AprAug %>%
  group_by(stationYear) %>%
  summarize(n(),
            NA_count = length(which(is.na(DOMean)))) %>%
  mutate(validObs = `n()` - NA_count) %>%
  select(stationYear,validObs) %>%
  ungroup()

# Identify station-years w/ fewer than 3 Nobs
DOMean_insufNobs  <- DOMean_Nobs %>%
  filter(validObs < 3)

DOMean_data <- subset.df %>%
  select(FDT_STA_ID, DOMean, DOY, Date, stationYear) %>%
  filter(!stationYear %in% DOMean_insufNobs$stationYear) %>%
  filter(!FDT_STA_ID %in% surfaceOnly_IDs)

# Identify station years lacking Apr or August observations
AprAug_present_DOMean <- DOMean_data %>%
  group_by(FDT_STA_ID, year(Date)) %>%
  summarize(Apr_obs = any(lubridate::month(Date) == 4),
            Aug_obs = any(lubridate::month(Date) == 8)) %>%
  filter(Apr_obs == TRUE & Aug_obs == TRUE) %>%
  mutate(stationYear = paste0(FDT_STA_ID, "-", `year(Date)`)) %>% # Create station-year field to facilitate filtering
  ungroup()

staYrs_DOMean <- AprAug_present_DOMean$stationYear

DOMean_data <- DOMean_data %>%
  filter(stationYear %in% staYrs_DOMean)

# df to hold model slopes
DOMean_lmslopes <- data.frame(StationYear = unique(DOMean_data$stationYear),
                          model_slope = NA,
                          pval = NA,
                          stdErr = NA)


for (sy in unique(DOMean_data$stationYear)) {
  model_data <- DOMean_data %>%
    filter(stationYear == sy)
  
  model <- lm(DOMean ~ DOY, data = model_data)
  modsum <- summary(model)
  
  DOMean_lmslopes[DOMean_lmslopes$StationYear == sy, 2] <- modsum$coefficients[2, 1]
  DOMean_lmslopes[DOMean_lmslopes$StationYear == sy, 3] <- modsum$coefficients[2, 4]
  DOMean_lmslopes[DOMean_lmslopes$StationYear == sy, 4] <- modsum$coefficients[2, 2]
  
}

DOMean_lmslopes <- DOMean_lmslopes %>%
  mutate(Year = as.numeric(sub(".*-", "", StationYear))) %>%   #  ".*-" will match any sequence of characters followed by a hyphen (i.e., the station ID) 
  mutate(StationID = sub("-[0-9]{4}$", "", StationYear)) # "-[0-9]{4}$" will match a hyphen followed by exactly four digits at the end of the string (i.e., the year)

slopesList[["DOMean"]] <- DOMean_lmslopes

# DORange
## --------------------------------------------
# Derive valid Nobs for each station-year
DORange_Nobs <- data_AprAug %>%
  group_by(stationYear) %>%
  summarize(n(),
            NA_count = length(which(is.na(DORange)))) %>%
  mutate(validObs = `n()` - NA_count) %>%
  select(stationYear,validObs) %>%
  ungroup()

# Identify station-years w/ fewer than 3 Nobs
DORange_insufNobs  <- DORange_Nobs %>%
  filter(validObs < 3)

DORange_data <- subset.df %>%
  select(FDT_STA_ID, DORange, DOY, Date, stationYear) %>%
  filter(!stationYear %in% DORange_insufNobs$stationYear) %>%
  filter(!FDT_STA_ID %in% surfaceOnly_IDs)

# Identify station years lacking Apr or August observations
AprAug_present_DORange <- DORange_data %>%
  group_by(FDT_STA_ID, year(Date)) %>%
  summarize(Apr_obs = any(lubridate::month(Date) == 4),
            Aug_obs = any(lubridate::month(Date) == 8)) %>%
  filter(Apr_obs == TRUE & Aug_obs == TRUE) %>%
  mutate(stationYear = paste0(FDT_STA_ID, "-", `year(Date)`)) %>% # Create station-year field to facilitate filtering
  ungroup()

staYrs_DORange <- AprAug_present_DORange$stationYear

DORange_data <- DORange_data %>%
  filter(stationYear %in% staYrs_DORange)

# df to hold model slopes
DORange_lmslopes <- data.frame(StationYear = unique(DORange_data$stationYear),
                          model_slope = NA,
                          pval = NA,
                          stdErr = NA)


for (sy in unique(DORange_data$stationYear)) {
  model_data <- DORange_data %>%
    filter(stationYear == sy)
  
  model <- lm(DORange ~ DOY, data = model_data)
  modsum <- summary(model)
  
  DORange_lmslopes[DORange_lmslopes$StationYear == sy, 2] <- modsum$coefficients[2, 1]
  DORange_lmslopes[DORange_lmslopes$StationYear == sy, 3] <- modsum$coefficients[2, 4]
  DORange_lmslopes[DORange_lmslopes$StationYear == sy, 4] <- modsum$coefficients[2, 2]
  
}

DORange_lmslopes <- DORange_lmslopes %>%
  mutate(Year = as.numeric(sub(".*-", "", StationYear))) %>%   #  ".*-" will match any sequence of characters followed by a hyphen (i.e., the station ID) 
  mutate(StationID = sub("-[0-9]{4}$", "", StationYear)) # "-[0-9]{4}$" will match a hyphen followed by exactly four digits at the end of the string (i.e., the year)

slopesList[["DORange"]] <- DORange_lmslopes

TMax and DOMax slopes

Include surface only stations

# TMax
## --------------------------------------------
# Derive valid Nobs for each station-year
TMax_Nobs <- data_AprAug %>%
  group_by(stationYear) %>%
  summarize(n(),
            NA_count = length(which(is.na(TMax)))) %>%
  mutate(validObs = `n()` - NA_count) %>%
  select(stationYear,validObs) %>%
  ungroup()

# Identify station-years w/ fewer than 3 Nobs
TMax_insufNobs  <- TMax_Nobs %>%
  filter(validObs < 3)

TMax_data <- subset.df %>%
  select(FDT_STA_ID, TMax, DOY, Date, stationYear) %>%
  filter(!stationYear %in% TMax_insufNobs$stationYear)

# Identify station years lacking Apr or August observations
AprAug_present_TMax <- TMax_data %>%
  group_by(FDT_STA_ID, year(Date)) %>%
  summarize(Apr_obs = any(lubridate::month(Date) == 4),
            Aug_obs = any(lubridate::month(Date) == 8)) %>%
  filter(Apr_obs == TRUE & Aug_obs == TRUE) %>%
  mutate(stationYear = paste0(FDT_STA_ID, "-", `year(Date)`)) %>% # Create station-year field to facilitate filtering
  ungroup()

staYrs_TMax <- AprAug_present_TMax$stationYear

TMax_data <- TMax_data %>%
  filter(stationYear %in% staYrs_TMax)

# df to hold model slopes
TMax_lmslopes <- data.frame(StationYear = unique(TMax_data$stationYear),
                          model_slope = NA,
                          pval = NA,
                          stdErr = NA)


for (sy in unique(TMax_data$stationYear)) {
  model_data <- TMax_data %>%
    filter(stationYear == sy)
  
  model <- lm(TMax ~ DOY, data = model_data)
  modsum <- summary(model)
  
  TMax_lmslopes[TMax_lmslopes$StationYear == sy, 2] <- modsum$coefficients[2, 1]
  TMax_lmslopes[TMax_lmslopes$StationYear == sy, 3] <- modsum$coefficients[2, 4]
  TMax_lmslopes[TMax_lmslopes$StationYear == sy, 4] <- modsum$coefficients[2, 2]
  
}

TMax_lmslopes <- TMax_lmslopes %>%
  mutate(Year = as.numeric(sub(".*-", "", StationYear))) %>%   #  ".*-" will match any sequence of characters followed by a hyphen (i.e., the station ID) 
  mutate(StationID = sub("-[0-9]{4}$", "", StationYear)) # "-[0-9]{4}$" will match a hyphen followed by exactly four digits at the end of the string (i.e., the year)

slopesList[["TMax"]] <- TMax_lmslopes

# DOMax
## --------------------------------------------
# Derive valid Nobs for each station-year
DOMax_Nobs <- data_AprAug %>%
  group_by(stationYear) %>%
  summarize(n(),
            NA_count = length(which(is.na(DOMax)))) %>%
  mutate(validObs = `n()` - NA_count) %>%
  select(stationYear,validObs) %>%
  ungroup()

# Identify station-years w/ fewer than 3 Nobs
DOMax_insufNobs  <- DOMax_Nobs %>%
  filter(validObs < 3)

DOMax_data <- subset.df %>%
  select(FDT_STA_ID, DOMax, DOY, Date, stationYear) %>%
  filter(!stationYear %in% DOMax_insufNobs$stationYear)

# Identify station years lacking Apr or August observations
AprAug_present_DOMax <- DOMax_data %>%
  group_by(FDT_STA_ID, year(Date)) %>%
  summarize(Apr_obs = any(lubridate::month(Date) == 4),
            Aug_obs = any(lubridate::month(Date) == 8)) %>%
  filter(Apr_obs == TRUE & Aug_obs == TRUE) %>%
  mutate(stationYear = paste0(FDT_STA_ID, "-", `year(Date)`)) %>% # Create station-year field to facilitate filtering
  ungroup()

staYrs_DOMax <- AprAug_present_DOMax$stationYear

DOMax_data <- DOMax_data %>%
  filter(stationYear %in% staYrs_DOMax)

# df to hold model slopes
DOMax_lmslopes <- data.frame(StationYear = unique(DOMax_data$stationYear),
                          model_slope = NA,
                          pval = NA,
                          stdErr = NA)


for (sy in unique(DOMax_data$stationYear)) {
  model_data <- DOMax_data %>%
    filter(stationYear == sy)
  
  model <- lm(DOMax ~ DOY, data = model_data)
  modsum <- summary(model)
  
  DOMax_lmslopes[DOMax_lmslopes$StationYear == sy, 2] <- modsum$coefficients[2, 1]
  DOMax_lmslopes[DOMax_lmslopes$StationYear == sy, 3] <- modsum$coefficients[2, 4]
  DOMax_lmslopes[DOMax_lmslopes$StationYear == sy, 4] <- modsum$coefficients[2, 2]
  
}

DOMax_lmslopes <- DOMax_lmslopes %>%
  mutate(Year = as.numeric(sub(".*-", "", StationYear))) %>%   #  ".*-" will match any sequence of characters followed by a hyphen (i.e., the station ID) 
  mutate(StationID = sub("-[0-9]{4}$", "", StationYear)) # "-[0-9]{4}$" will match a hyphen followed by exactly four digits at the end of the string (i.e., the year)

slopesList[["DOMax"]] <- DOMax_lmslopes

All Slopes -> Single DF

# Add field to each df in slopesList to identiy the variable used in analysis
for (i in 1:length(slopesList)) {
  
  varName <- names(slopesList[i])
  slopesList[[i]]$variable <- varName
}


slopes.df <- bind_rows(slopesList)
slopes.df <- slopes.df %>%
  select(-StationYear, -pval, -stdErr)

wider_df <- slopes.df %>%
  pivot_wider(names_from = variable, values_from = model_slope)

# write_csv(wider_df, "output_data/annualLongTerm/yearStation_slopeSummary_AprAug.csv")

MBLM, seasonal rate of change ~ year

TMin

#TMin

## --------------------------
# create list to store regression results
TMin_modSums <- list()

for (station_id in unique(TMin_lmslopes$StationID)) {
  model_data <- TMin_lmslopes %>%
    filter(StationID == station_id)
    

      model <- mblm(model_slope ~ Year, data = model_data)
      mod.sum <- summary.mblm(model)
      
      # store results
      TMin_modSums[[station_id]] <- list(
        slope = mod.sum$coefficients[2,1],
        MAD = mod.sum$coefficients["Year", "MAD"] ,
        pvalue = mod.sum$coefficients["Year", 4],
        intercept = mod.sum$coefficients["(Intercept)", 1]
        )
  }

# create station-month key
TMin_mblm <- data.frame(StationID = unique(TMin_lmslopes$StationID),
          model_slope = NA,
          model_MAD = NA,
          model_pval = NA,
          model_intcpt = NA,
          variable = "TMin")

for (i in 1:nrow(TMin_mblm)) {
  station = TMin_mblm$StationID[i]
  
  TMin_mblm$model_slope[i] <-  TMin_modSums[[station]]$slope
   TMin_mblm$model_MAD[i] <-  TMin_modSums[[station]]$MAD
    TMin_mblm$model_pval[i] <-  TMin_modSums[[station]]$pvalue
     TMin_mblm$model_intcpt[i] <- TMin_modSums[[station]]$intercept
  
}

# write_csv(TMin_mblm, "output_data/annualLongTerm/TMin_mblmAnnual_AprAug.csv")

TMean

#TMean

## --------------------------
# create list to store regression results
TMean_modSums <- list()

for (station_id in unique(TMean_lmslopes$StationID)) {
  model_data <- TMean_lmslopes %>%
    filter(StationID == station_id)
    

      model <- mblm(model_slope ~ Year, data = model_data)
      mod.sum <- summary.mblm(model)
      
      # store results
      TMean_modSums[[station_id]] <- list(
        slope = mod.sum$coefficients[2,1],
        MAD = mod.sum$coefficients["Year", "MAD"] ,
        pvalue = mod.sum$coefficients["Year", 4],
        intercept = mod.sum$coefficients["(Intercept)", 1]
        )
  }

# create station-month key
TMean_mblm <- data.frame(StationID = unique(TMean_lmslopes$StationID),
          model_slope = NA,
          model_MAD = NA,
          model_pval = NA,
          model_intcpt = NA,
          variable = "TMean")

for (i in 1:nrow(TMean_mblm)) {
  station = TMean_mblm$StationID[i]
  
  TMean_mblm$model_slope[i] <-  TMean_modSums[[station]]$slope
   TMean_mblm$model_MAD[i] <-  TMean_modSums[[station]]$MAD
    TMean_mblm$model_pval[i] <-  TMean_modSums[[station]]$pvalue
     TMean_mblm$model_intcpt[i] <- TMean_modSums[[station]]$intercept
  
}

# write_csv(TMean_mblm, "output_data/annualLongTerm/TMean_mblmAnnual_AprAug.csv")

TRange

#TRange

## --------------------------
# create list to store regression results
TRange_modSums <- list()

for (station_id in unique(TRange_lmslopes$StationID)) {
  model_data <- TRange_lmslopes %>%
    filter(StationID == station_id)
    

      model <- mblm(model_slope ~ Year, data = model_data)
      mod.sum <- summary.mblm(model)
      
      # store results
      TRange_modSums[[station_id]] <- list(
        slope = mod.sum$coefficients[2,1],
        MAD = mod.sum$coefficients["Year", "MAD"] ,
        pvalue = mod.sum$coefficients["Year", 4],
        intercept = mod.sum$coefficients["(Intercept)", 1]
        )
  }

# create station-month key
TRange_mblm <- data.frame(StationID = unique(TRange_lmslopes$StationID),
          model_slope = NA,
          model_MAD = NA,
          model_pval = NA,
          model_intcpt = NA,
          variable = "TRange")

for (i in 1:nrow(TRange_mblm)) {
  station = TRange_mblm$StationID[i]
  
  TRange_mblm$model_slope[i] <-  TRange_modSums[[station]]$slope
   TRange_mblm$model_MAD[i] <-  TRange_modSums[[station]]$MAD
    TRange_mblm$model_pval[i] <-  TRange_modSums[[station]]$pvalue
     TRange_mblm$model_intcpt[i] <- TRange_modSums[[station]]$intercept
  
}

# write_csv(TRange_mblm, "output_data/annualLongTerm/TRange_mblmAnnual_AprAug.csv")

TMax

#TMax

## --------------------------
# create list to store regression results
TMax_modSums <- list()

for (station_id in unique(TMax_lmslopes$StationID)) {
  model_data <- TMax_lmslopes %>%
    filter(StationID == station_id)
    

      model <- mblm(model_slope ~ Year, data = model_data)
      mod.sum <- summary.mblm(model)
      
      # store results
      TMax_modSums[[station_id]] <- list(
        slope = mod.sum$coefficients[2,1],
        MAD = mod.sum$coefficients["Year", "MAD"] ,
        pvalue = mod.sum$coefficients["Year", 4],
        intercept = mod.sum$coefficients["(Intercept)", 1]
        )
  }

# create station-month key
TMax_mblm <- data.frame(StationID = unique(TMax_lmslopes$StationID),
          model_slope = NA,
          model_MAD = NA,
          model_pval = NA,
          model_intcpt = NA,
          variable = "TMax")

for (i in 1:nrow(TMax_mblm)) {
  station = TMax_mblm$StationID[i]
  
  TMax_mblm$model_slope[i] <-  TMax_modSums[[station]]$slope
   TMax_mblm$model_MAD[i] <-  TMax_modSums[[station]]$MAD
    TMax_mblm$model_pval[i] <-  TMax_modSums[[station]]$pvalue
     TMax_mblm$model_intcpt[i] <- TMax_modSums[[station]]$intercept
  
}

# write_csv(TMax_mblm, "output_data/annualLongTerm/TMax_mblmAnnual_AprAug.csv")

DOMin

#DOMin

## --------------------------
# create list to store regression results
DOMin_modSums <- list()

for (station_id in unique(DOMin_lmslopes$StationID)) {
  model_data <- DOMin_lmslopes %>%
    filter(StationID == station_id)
    

      model <- mblm(model_slope ~ Year, data = model_data)
      mod.sum <- summary.mblm(model)
      
      # store results
      DOMin_modSums[[station_id]] <- list(
        slope = mod.sum$coefficients[2,1],
        MAD = mod.sum$coefficients["Year", "MAD"] ,
        pvalue = mod.sum$coefficients["Year", 4],
        intercept = mod.sum$coefficients["(Intercept)", 1]
        )
  }

# create station-month key
DOMin_mblm <- data.frame(StationID = unique(DOMin_lmslopes$StationID),
          model_slope = NA,
          model_MAD = NA,
          model_pval = NA,
          model_intcpt = NA,
          variable = "DOMin")

for (i in 1:nrow(DOMin_mblm)) {
  station = DOMin_mblm$StationID[i]
  
  DOMin_mblm$model_slope[i] <-  DOMin_modSums[[station]]$slope
   DOMin_mblm$model_MAD[i] <-  DOMin_modSums[[station]]$MAD
    DOMin_mblm$model_pval[i] <-  DOMin_modSums[[station]]$pvalue
     DOMin_mblm$model_intcpt[i] <- DOMin_modSums[[station]]$intercept
  
}

# write_csv(DOMin_mblm, "output_data/annualLongTerm/DOMin_mblmAnnual_AprAug.csv")

DOMean

#DOMean

## --------------------------
# create list to store regression results
DOMean_modSums <- list()

for (station_id in unique(DOMean_lmslopes$StationID)) {
  model_data <- DOMean_lmslopes %>%
    filter(StationID == station_id)
    

      model <- mblm(model_slope ~ Year, data = model_data)
      mod.sum <- summary.mblm(model)
      
      # store results
      DOMean_modSums[[station_id]] <- list(
        slope = mod.sum$coefficients[2,1],
        MAD = mod.sum$coefficients["Year", "MAD"] ,
        pvalue = mod.sum$coefficients["Year", 4],
        intercept = mod.sum$coefficients["(Intercept)", 1]
        )
  }

# create station-month key
DOMean_mblm <- data.frame(StationID = unique(DOMean_lmslopes$StationID),
          model_slope = NA,
          model_MAD = NA,
          model_pval = NA,
          model_intcpt = NA,
          variable = "DOMean")

for (i in 1:nrow(DOMean_mblm)) {
  station = DOMean_mblm$StationID[i]
  
  DOMean_mblm$model_slope[i] <-  DOMean_modSums[[station]]$slope
   DOMean_mblm$model_MAD[i] <-  DOMean_modSums[[station]]$MAD
    DOMean_mblm$model_pval[i] <-  DOMean_modSums[[station]]$pvalue
     DOMean_mblm$model_intcpt[i] <- DOMean_modSums[[station]]$intercept
  
}

# write_csv(DOMean_mblm, "output_data/annualLongTerm/DOMean_mblmAnnual_AprAug.csv")

DORange

#DORange

## --------------------------
# create list to store regression results
DORange_modSums <- list()

for (station_id in unique(DORange_lmslopes$StationID)) {
  model_data <- DORange_lmslopes %>%
    filter(StationID == station_id)
    

      model <- mblm(model_slope ~ Year, data = model_data)
      mod.sum <- summary.mblm(model)
      
      # store results
      DORange_modSums[[station_id]] <- list(
        slope = mod.sum$coefficients[2,1],
        MAD = mod.sum$coefficients["Year", "MAD"] ,
        pvalue = mod.sum$coefficients["Year", 4],
        intercept = mod.sum$coefficients["(Intercept)", 1]
        )
  }

# create station-month key
DORange_mblm <- data.frame(StationID = unique(DORange_lmslopes$StationID),
          model_slope = NA,
          model_MAD = NA,
          model_pval = NA,
          model_intcpt = NA,
          variable = "DORange")

for (i in 1:nrow(DORange_mblm)) {
  station = DORange_mblm$StationID[i]
  
  DORange_mblm$model_slope[i] <-  DORange_modSums[[station]]$slope
   DORange_mblm$model_MAD[i] <-  DORange_modSums[[station]]$MAD
    DORange_mblm$model_pval[i] <-  DORange_modSums[[station]]$pvalue
     DORange_mblm$model_intcpt[i] <- DORange_modSums[[station]]$intercept
  
}

# write_csv(DORange_mblm, "output_data/annualLongTerm/DORange_mblmAnnual_AprAug.csv")

DOMax

#DOMax

## --------------------------
# create list to store regression results
DOMax_modSums <- list()

for (station_id in unique(DOMax_lmslopes$StationID)) {
  model_data <- DOMax_lmslopes %>%
    filter(StationID == station_id)
    

      model <- mblm(model_slope ~ Year, data = model_data)
      mod.sum <- summary.mblm(model)
      
      # store results
      DOMax_modSums[[station_id]] <- list(
        slope = mod.sum$coefficients[2,1],
        MAD = mod.sum$coefficients["Year", "MAD"] ,
        pvalue = mod.sum$coefficients["Year", 4],
        intercept = mod.sum$coefficients["(Intercept)", 1]
        )
  }

# create station-month key
DOMax_mblm <- data.frame(StationID = unique(DOMax_lmslopes$StationID),
          model_slope = NA,
          model_MAD = NA,
          model_pval = NA,
          model_intcpt = NA,
          variable = "DOMax")

for (i in 1:nrow(DOMax_mblm)) {
  station = DOMax_mblm$StationID[i]
  
  DOMax_mblm$model_slope[i] <-  DOMax_modSums[[station]]$slope
   DOMax_mblm$model_MAD[i] <-  DOMax_modSums[[station]]$MAD
    DOMax_mblm$model_pval[i] <-  DOMax_modSums[[station]]$pvalue
     DOMax_mblm$model_intcpt[i] <- DOMax_modSums[[station]]$intercept
  
}

# write_csv(DOMax_mblm, "output_data/annualLongTerm/DOMax_mblmAnnual_AprAug.csv")

Combine to Single Dataframe

## Combine all mblm model results into single df

varlist <- ls()

myvars <- varlist[which(grepl("_mblm$", varlist))]

combined_data <- do.call(rbind, lapply(myvars, function(x) {
  get(x)
}))

# write_csv(combined_data, "output_data/annualLongTerm/mblm_AprAug_allVariables.csv")

Data Viz: mblm slopes for all stations, by variable

# geom_line has no 'mblm' formula option; abline is the work around.
# but abline has no axis values associated, it is just an infinite line based on slope and intercept.
# effectively plotting the mblm lines thus requires a geom_ layer to define the scales. 

# also, a field to color lines by slope significance must be added to the data frame
# var_class values set up to include units for facetting labels (i.e., with unicode characters for degree, sub/superscript)
mblm_data <- combined_data %>%
  mutate(significant = if_else(model_pval <= .05 & model_slope > 0, "significant positive", 
                               if_else(model_pval <= .05 & model_slope < 0, "significant negative", "not significant"))
         ) %>%
  mutate(var_class = ifelse(startsWith(variable, "T"), "Temperature (\u00B0C d\u207B\u00B9)", "Dissolved Oxygen (mg O\u2082 L\u207B\u00B9 d\u207B\u00B9)")) %>%
  mutate(var_class = factor(var_class, levels = c("Temperature (\u00B0C d\u207B\u00B9)", "Dissolved Oxygen (mg O\u2082 L\u207B\u00B9 d\u207B\u00B9)"))) %>%
  mutate(var_type = case_when(
    endsWith(variable, "Max") ~ "Surface",
    endsWith(variable, "Min") ~ "Bottom",
    endsWith(variable, "Range") ~ "Range",
    endsWith(variable, "Mean") ~ "Mean",
    TRUE ~ "")) %>%
  mutate(var_type = factor(var_type, levels = c("Surface", "Mean", "Bottom", "Range")))

# set significant as factor to ensure sig lines sit on top of insignificant lines
mblm_data$significant <- factor(mblm_data$significant, levels = c("significant positive", "significant negative", "not significant"))

# point plotting variable -- new var name so that it can be modified to remove outliers as needed without affecting the original data frame
annualTrends.df <- slopes.df

# exclude regression line for 2-XDD000.40 from the DOmax plot as this has a number of missing values in the time series.
mblm_data <- mblm_data %>%
  filter(!(variable == "DOMax" & StationID == "2-XDD000.40"))

plot <- ggplot() +
  geom_point(data = annualTrends.df, aes(x = Year, y = model_slope), alpha = 0) +
  geom_abline(data = mblm_data, aes(intercept = model_intcpt, slope = model_slope, color = significant), 
              linewidth = .42, alpha = .6) +
   labs(title = NULL,
       x = "Year",
       y = NULL) +  
  facet_grid(var_class ~ var_type, scales = "free", switch = "y") +
  scale_color_manual(values = c("not significant" = "grey72", 
                                "significant positive" = "#C9A818", 
                                "significant negative" = "#124E78")) +
  theme_minimal() + 
  theme(legend.position = "top",
        panel.border = element_rect(colour = "black", fill=NA, size=.5),
        plot.title = element_text(size = 16),
        axis.title = element_text(size = 14), 
        axis.text.y = element_text(size = 9.5,  margin = margin(t = 0, r = 3, b = 0, l = 0)), 
        axis.text.x = element_text(size = 9.5, angle = 45, hjust = .95),
        legend.text = element_text(size = 8),  
        legend.title = element_blank(),
        strip.text = element_text(size = 11),
        strip.placement = "outside",
        panel.grid = element_line(color = "grey90", linewidth = .1),
        aspect.ratio = 1.8)

## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

plot(plot)

# ggsave("annualLongterm_plot_AprAugx.jpg", device = "jpg", plot, dpi = 1000)

results <- data.frame(station = unique(TMin_lmslopes$StationID),
                      slopeTies = NA,
                      yearTies = NA)

for (station in unique(TMin_lmslopes$StationID)) {
  slopes = TMin_lmslopes %>%
    filter(StationID == station)
  
  ties = c()
  
  for (i in 1:nrow(slopes)) {
    slopes_vec <- slopes$model_slope[-i]
    current_slope <- slopes$model_slope[i]
    ties[i] <- current_slope %in% slopes_vec
  }
  
  results[results$station == station, "slopeTies"] <- ifelse(any(ties), "YES", "NO")
  
    ties2 = c()
  
  for (i in 1:nrow(slopes)) {
    years_vec <- slopes$Year[-i]
    current_year <- slopes$Year[i]
    ties2[i] <- current_year %in% years_vec
  }
  
  results[results$station == station, "yearTies"] <- ifelse(any(ties2), "YES", "NO")
}

Assessing Climate Change Effects in VA Reservoirs

Part 4: Long-Term Annual Trends in T & DO (Apr to Aug)

Read in Data

Data Filtering and Subsetting

Apr to August Modelling

Linear Regression, `variable ~ DOY` by Station-Year

Min, Mean, Range slopes

Temp

Dissolved Oxygen

TMax and DOMax slopes

All Slopes -> Single DF

MBLM, seasonal rate of change ~ year

TMin

TMean

TRange

TMax

DOMin

DOMean

DORange

DOMax

Combine to Single Dataframe

Data Viz: mblm slopes for all stations, by variable

Assessing Climate Change Effects in VA Reservoirs

Part 4: Long-Term Annual Trends in T & DO (Apr to Aug)

Read in Data

Data Filtering and Subsetting

Apr to August Modelling

Linear Regression, variable ~ DOY by Station-Year

Min, Mean, Range slopes

Temp

Dissolved Oxygen

TMax and DOMax slopes

All Slopes -> Single DF

MBLM, seasonal rate of change ~ year

TMin

TMean

TRange

TMax

DOMin

DOMean

DORange

DOMax

Combine to Single Dataframe

Data Viz: mblm slopes for all stations, by variable

Linear Regression, `variable ~ DOY` by Station-Year