0 Setup for systems and packages

Define System Language and Time

Sys.setenv(LANG = "en")
Sys.setlocale("LC_TIME", "English")
## [1] "English_United States.1252"

Load Packages

# Load required packages
invisible(
  lapply(
    c(
      # Nexis data handling
      'LexisNexisTools',    
      
      # Data handling
      'stringr', 'dplyr', 'broom', 'tidytext', 'tidyr', 'tidyverse', 'scales', 
      'reshape2', 'zoo',
      
      # NLP tools
      'udpipe', 'spacyr', 'quanteda',
      
      # Correlation analysis
      'ggcorrplot', 'corrplot',
      
      # Regression analysis
      'MASS',
      
      # Topic modeling
      'stm',
      
      # File handling
      'readxl', 'xlsx',
      
      # Data visualization
      'ggplot2', 'wordcloud', 'ggwordcloud', 'RColorBrewer', 'ggpubr', 
      'ggnewscale', 'treemap', 'ggh4x', 'viridis',
      'rcartocolor'
    ),
    require, character.only = TRUE
  )
)

# Silence dplyr::summarise grouping messages
options(dplyr.summarise.inform = FALSE)
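
Since require() inside invisible() fails silently, it can be worth confirming that the key packages actually attached; a minimal sketch (an optional check, with an abbreviated package list for illustration):

# Optional sanity check (not part of the original setup): verify that the
# packages the script depends on are available
needed <- c("LexisNexisTools", "dplyr", "stm")  # abbreviated list for illustration
missing_pkgs <- needed[!vapply(needed, requireNamespace, logical(1), quietly = TRUE)]
if (length(missing_pkgs) > 0)
  stop("Missing packages: ", paste(missing_pkgs, collapse = ", "))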

System Overview

xfun::session_info(dependencies = FALSE)
## R version 4.3.2 (2023-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19045)
## 
## 
## Locale:
##   LC_COLLATE=English_United States.utf8 
##   LC_CTYPE=English_United States.utf8   
##   LC_MONETARY=English_United States.utf8
##   LC_NUMERIC=C                          
##   LC_TIME=English_United States.1252    
## 
## time zone: Europe/Zurich
## tzcode source: internal
## 
## Package version:
##   rcartocolor_2.1.1         viridis_0.6.3            
##   viridisLite_0.4.2         ggh4x_0.2.8              
##   treemap_2.4-4             ggnewscale_0.4.9         
##   ggpubr_0.6.0              ggwordcloud_0.5.0        
##   wordcloud_2.6             RColorBrewer_1.1-3       
##   xlsx_0.6.5                readxl_1.4.2             
##   stm_1.3.6                 MASS_7.3-60              
##   corrplot_0.92             ggcorrplot_0.1.4.1       
##   quanteda_3.3.1            spacyr_1.2.1             
##   udpipe_0.8.11             zoo_1.8-12               
##   reshape2_1.4.4            scales_1.2.1             
##   lubridate_1.9.2           forcats_1.0.0            
##   purrr_1.0.1               readr_2.1.4              
##   tibble_3.2.1              ggplot2_3.4.2            
##   tidyverse_2.0.0           tidyr_1.3.0              
##   tidytext_0.4.1            broom_1.0.4              
##   dplyr_1.1.2               stringr_1.5.1            
##   LexisNexisTools_0.3.7     pbapply_1.7-2            
##   gridExtra_2.3             rlang_1.1.1              
##   magrittr_2.0.3            gridBase_0.4-7           
##   compiler_4.3.2            png_0.1-8                
##   vctrs_0.6.2               pkgconfig_2.0.3          
##   fastmap_1.1.1             backports_1.4.1          
##   ellipsis_0.3.2            utf8_1.2.3               
##   promises_1.2.0.1          rmarkdown_2.22           
##   tzdb_0.4.0                xfun_0.39                
##   cachem_1.0.8              jsonlite_1.8.5           
##   SnowballC_0.7.1           later_1.3.1              
##   xlsxjars_0.6.1            parallel_4.3.2           
##   stopwords_2.3             R6_2.5.1                 
##   bslib_0.4.2               stringi_1.7.12           
##   car_3.1-2                 jquerylib_0.1.4          
##   cellranger_1.1.0          nsyllable_1.0.1          
##   Rcpp_1.0.10               knitr_1.43               
##   httpuv_1.6.11             Matrix_1.6-0             
##   igraph_1.5.0              timechange_0.2.0         
##   tidyselect_1.2.0          rstudioapi_0.14          
##   stringdist_0.9.12         abind_1.4-5              
##   yaml_2.3.7                lattice_0.21-9           
##   plyr_1.8.8                shiny_1.7.4              
##   withr_3.0.0               evaluate_0.21            
##   rJava_1.0-6               RcppParallel_5.1.7       
##   pillar_1.9.0              quanteda.textstats_0.96.2
##   carData_3.0-5             janeaustenr_1.0.0        
##   generics_0.1.3            hms_1.1.3                
##   munsell_0.5.0             xtable_1.8-4             
##   glue_1.6.2                tools_4.3.2              
##   data.table_1.15.4         tokenizers_0.3.0         
##   ggsignif_0.6.4            fastmatch_1.1-3          
##   grid_4.3.2                colorspace_2.1-0         
##   cli_3.6.1                 fansi_1.0.4              
##   gtable_0.3.3              rstatix_0.7.2            
##   sass_0.4.6                digest_0.6.31            
##   htmltools_0.5.5           lifecycle_1.0.4          
##   mime_0.12

1 Corpus building

data3_Nexis is a data frame containing monthly counts of newspaper articles

# Load structured newspaper articles (originally obtained from Nexis Uni)
Nexisdata_df <- readRDS("./ScrapedData/Nexisdata_all_raw.rds")
print(paste("Number of articles from Nexis Uni: ", nrow(Nexisdata_df)))
## [1] "Number of articles from Nexis Uni:  1361"
# Data cleaning 1: Remove duplicate headlines
Nexisdata_df0 <- Nexisdata_df[!duplicated(Nexisdata_df[c("Headline")]), ]


# Data cleaning 2: Extract the main subject and region (used below to filter out irrelevant articles)
Nexisdata_df0 <- Nexisdata_df0 %>%
  mutate(
    mainsubject = str_remove(str_extract(Subject, "^[^;]+"), "\\(.*"),
    mainregion = str_remove(str_extract(Geographic, "^[^;]+"), "\\(.*")
  )

Nexisdata_df0 <- Nexisdata_df0 %>%
  mutate(order = row_number()) %>%
  dplyr::select(order, Art_ID, mainsubject, mainregion, everything())

Nexisdata_df00 <- subset(Nexisdata_df0,
                         !grepl(paste(c("Sport", "SPORT"),
                                      collapse = "|"), Section))

Nexisdata_df00 <- subset(Nexisdata_df00, 
                         !grepl("SOCCER|CRICKET",mainsubject, ignore.case = TRUE))

Nexisdata_df00 <- subset(Nexisdata_df00, 
                         grepl(paste(c("UNITED KINGDOM", "ENGLAND"), 
                                     collapse = "|"), mainregion, ignore.case = TRUE))

Nexisdata_df00 <- subset(Nexisdata_df00, 
                         !grepl("NEW SOUTH WALES", mainregion, ignore.case = TRUE))

Nexisdata_df00 <- Nexisdata_df00 %>% drop_na(Date)


# Double-check on the selected media sources 
monthly_count_Nexis_summary <- Nexisdata_df00 %>%
  count(Newspaper, name = "Freq") %>%
  arrange(desc(Freq)) %>%
  mutate(
    Tabloid = case_when(
      str_detect(tolower(Newspaper), "telegraph|independent|times|guardian|financial") ~ 0,
      str_detect(tolower(Newspaper), "mail|mirror|sun") ~ 1,
      TRUE ~ 9
    ),
    Online = ifelse(str_detect(tolower(Newspaper), "\\.co\\.uk|\\.com|online"), 1, 0)
  ) %>%
  filter(Tabloid != 9)


# Customize date info: year, month, and daytime
Nexisdata_df000 <- Nexisdata_df00 %>%
  separate(Date, into = c("year", "month", "daytime"), sep = "-") %>%
  mutate(across(c(year, month), as.numeric))


# Summarize monthly article counts
Nexisdata_df1 <- Nexisdata_df000 %>%
  filter(Newspaper %in% monthly_count_Nexis_summary$Newspaper)
print(paste("Number of valid articles after filtering: ", nrow(Nexisdata_df1)))
## [1] "Number of valid articles after filtering:  836"
data3_Nexis <- Nexisdata_df1 %>% group_by(year, month) %>% summarize(count = n())

monthly_count_Nexis <- data.frame(year = rep(2000:2023, each = 12), month = rep(1:12, 24))

monthly_count_Nexis <- left_join(monthly_count_Nexis, data3_Nexis, by = c("year", "month"))

monthly_count_Nexis[is.na(monthly_count_Nexis)] = 0

monthly_count_Nexis[, c("year", "month")] <- 
  sapply(monthly_count_Nexis[, c("year", "month")], as.character)

monthly_count_Nexis <- monthly_count_Nexis %>%
  mutate(ym = paste0(year, "-", month))


# Clean up workspace
rm(Nexisdata_df, Nexisdata_df0, Nexisdata_df00, Nexisdata_df000)
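
The expand-grid-and-join pattern above can also be written with tidyr::complete(); an equivalent sketch (illustrative only; monthly_count_alt is a hypothetical name):

# Equivalent zero-filling with tidyr::complete() (sketch; same counts as above)
monthly_count_alt <- data3_Nexis %>%
  ungroup() %>%
  complete(year = 2000:2023, month = 1:12, fill = list(count = 0)) %>%
  mutate(ym = paste0(year, "-", month))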

2 Temporal alignment analysis

The variables include temperature, precipitation, groundwater, and seasonality

2.1 Temperature

# Load UK Temperature Data: Mean Central England Temperature (MCET)
#https://www.metoffice.gov.uk/hadobs/hadcet/data/meantemp_monthly_totals.txt
List_UK_temp <- read.xlsx("./ScrapedData/MET/Longterm_TP.xlsx", sheetName = "MCET")


# Calculate the anomaly baseline from the 1991–2020 average
List_UK_temp_subset_ref <- List_UK_temp %>% subset(Year %in% c(1991:2020)) %>% 
  dplyr::select(-Year, -Annual)

UK_temp_ref_mean <- colMeans(List_UK_temp_subset_ref)


# Calculate temperature anomalies
List_UK_temp_subset_target <- List_UK_temp %>% 
  subset(Year %in% c(2000:2023)) %>% 
  dplyr::select(-Annual)

temp_target_anomaly <- sweep(List_UK_temp_subset_target[2:13], 2, UK_temp_ref_mean, "-")
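
# Toy check of sweep() (illustration only): subtract a per-column baseline
# from each row, which is how the monthly reference means are removed above
sweep(matrix(1:4, nrow = 2), 2, c(1, 2), "-")
##      [,1] [,2]
## [1,]    0    1
## [2,]    1    2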

temp_target_anomaly$Year <- List_UK_temp_subset_target$Year

temp_target_anomaly_long <- temp_target_anomaly %>% 
  pivot_longer(cols = -Year, names_to = "Month", values_to = "Value")

temp_target_anomaly_long$month <- rep(1:12, length(2000:2023))

temp_target_anomaly_long <- temp_target_anomaly_long %>% subset(Year %in% c(2000:2023))


# Add temperature anomaly values
monthly_count_Nexis$temp_anomaly <- temp_target_anomaly_long$Value


# Exclude months 9–12 of 2023
monthly_count_Nexis <- head(monthly_count_Nexis, n = nrow(monthly_count_Nexis) - 4) 


# Compute trailing rolling means (3-, 6-, and 12-month windows)
monthly_count_Nexis$temp_3mean <- zoo::rollapply(monthly_count_Nexis$temp_anomaly,width = 3, 
                                                 FUN = mean, fill = 0, align = "right")

monthly_count_Nexis$temp_6mean <- zoo::rollapply(monthly_count_Nexis$temp_anomaly, width = 6, 
                                                 FUN = mean, fill = 0, align = "right")

monthly_count_Nexis$temp_12mean <- zoo::rollapply(monthly_count_Nexis$temp_anomaly, width = 12, 
                                                  FUN = mean, fill = 0, align = "right")


# Clean up workspace
rm(List_UK_temp, List_UK_temp_subset_ref, List_UK_temp_subset_target,
   temp_target_anomaly, temp_target_anomaly_long)

2.2 Seasonality

# Assign seasonality names based on month
monthly_count_Nexis <- monthly_count_Nexis %>%
  mutate(season = case_when(
    month %in% c(9, 10, 11) ~ "SON",
    month %in% c(12, 1, 2)  ~ "DJF",
    month %in% c(3, 4, 5)   ~ "MAM",
    month %in% c(6, 7, 8)   ~ "JJA"
  ))

monthly_count_Nexis$season <- factor(monthly_count_Nexis$season, 
                                     levels=c("SON", "DJF", "MAM","JJA"))

2.3 Precipitation and groundwater

# Load station metadata downloaded from the link below
# https://nrfa.ceh.ac.uk/data/search
data_station <- read.csv("./ScrapedData/nrfa-station-metadata-2024-10-24.csv")

data_station <- data_station[, c(1:15)]

data_station <- data_station %>%
  mutate(region_name = case_when(
    hydrometric.area %in% c(22, 23, 24) ~ 'Northumbria',
    hydrometric.area %in% c(25, 26, 27) ~ 'Yorkshire',
    hydrometric.area %in% c(29, 30, 31, 32, 33, 34, 35, 36, 37) ~ 'Anglian',
    hydrometric.area %in% c(38, 39) ~ 'Thames',
    hydrometric.area %in% c(40, 41, 42) ~ 'Southern',
    hydrometric.area %in% c(43, 44, 52, 53) ~ 'Wessex',
    hydrometric.area %in% c(45, 46, 47, 48, 49, 50, 51) ~ 'South West England',
    hydrometric.area %in% c(54, 28) ~ 'Midlands',
    hydrometric.area %in% c(68, 69, 70, 71, 72, 73, 74, 75, 76) ~ 'North West England',
    TRUE ~ NA_character_  # This handles cases where none of the conditions are met
  ))


# Retrieve files and extract data 
file_names_ssi <- list.files(path = "./ScrapedData/Drought_Indices", 
                             pattern = "^nhmp-ssi-nrfa-")  #813 
file_names_ssi <- sub("nhmp-ssi-nrfa-([0-9]+).*", "\\1", file_names_ssi)

file_names_spi <- list.files(path = "./ScrapedData/Drought_Indices", 
                             pattern = "^nhmp-spi-nrfa-") #916 
file_names_spi <- sub("nhmp-spi-nrfa-([0-9]+).*", "\\1", file_names_spi)


data_station <- data_station %>%
  mutate(match_ssi = ifelse(id %in% file_names_ssi, TRUE, FALSE),
         match_spi = ifelse(id %in% file_names_spi, TRUE, FALSE))


print(paste("Number of valid stations for SSI: ", sum(data_station$match_ssi))) # count of TRUE
## [1] "Number of valid stations for SSI:  811"
print(paste("Number of valid stations for SPI: ", sum(data_station$match_spi))) # count of TRUE
## [1] "Number of valid stations for SPI:  916"
data_station <- data_station %>%
  filter(!(match_ssi == FALSE & match_spi == FALSE)) %>%
  filter(!is.na(region_name))

print(paste("Station ID: ", unique(data_station$measuring.authority.id)))
##  [1] "Station ID:  EA-NE"   "Station ID:  EA-Y"    "Station ID:  EA-EM"  
##  [4] "Station ID:  EA-WM"   "Station ID:  EA-LN"   "Station ID:  EA-EA"  
##  [7] "Station ID:  EA-HNL"  "Station ID:  EA-T"    "Station ID:  EA-KSL" 
## [10] "Station ID:  EA-SSD"  "Station ID:  EA-WX"   "Station ID:  EA-DC"  
## [13] "Station ID:  EA-GMMC" "Station ID:  EA-CL"
# Selectively load the groundwater (SGI) station data
file_names_sgi <- list.files(path = "./ScrapedData/Drought_Indices", 
                             pattern = "^nhmp-sgi-") #41

file_names_sgi <- sub("nhmp-sgi-", "\\1", file_names_sgi)

file_names_sgi <- sub(".csv", "\\1", file_names_sgi)

station_sgi <- read.xlsx("./ScrapedData/SGI.xlsx", sheetName = "Sheet1")

station_sgi <- station_sgi %>% dplyr::filter(Country == "England")


# Specify time stamps
start_date <- as.Date("2000-01-01")

end_date <- as.Date("2023-08-31")

monthly_dates <- seq(from = start_date, to = end_date, by = "month")

year_month <- format(monthly_dates, "%Y-%m")

selected_station_ssi <- data_station %>% filter(match_ssi == TRUE) %>% pull(id)

selected_station_spi <- data_station %>% filter(match_spi == TRUE) %>% pull(id)

selected_station_sgi <- station_sgi %>% pull(Station.ID.fixed)


# Read each station's index file and reshape it into a common format
generate_data <- function(index, col_name, year_month, selected_stations) {
  result <- data.frame(ym = year_month)
  
  for (selected_station in selected_stations) {
    file_name <- paste0("./ScrapedData/Drought_Indices/nhmp-", 
                        index, selected_station, ".csv") #SPI, SSI
    
    if (file.exists(file_name)) {
      data <- read.csv(file_name, skip = 7)
      
      row_start <- which(data[, 1] == '2000-01')
      
      row_end <- which(data[, 1] == '2023-08')
      
      if (length(row_start) == 0 || length(row_end) == 0) {
        next  # Skip this station if the target date range is not found
      }
      
      selected_data <- data %>%
        slice(row_start:row_end) %>%
        dplyr::select("month", all_of(col_name))
      
      colnames(selected_data) <- c("ym", selected_station)
      
      result <- left_join(result, selected_data, by = "ym")
    }
  }
  
  return(result)
}


# Pre-process the indices 
data_ssi3 <- generate_data("ssi-nrfa-", "ssi.3", year_month, selected_station_ssi)
data_ssi6 <- generate_data("ssi-nrfa-", "ssi.6", year_month, selected_station_ssi)
data_ssi12 <- generate_data("ssi-nrfa-", "ssi.12", year_month, selected_station_ssi)


data_spi3 <- generate_data("spi-nrfa-", "spi.3", year_month, selected_station_spi)
data_spi6 <- generate_data("spi-nrfa-", "spi.6", year_month, selected_station_spi)
data_spi12 <- generate_data("spi-nrfa-", "spi.12", year_month, selected_station_spi)

data_sgi <- generate_data("sgi-", "sgi", year_month, selected_station_sgi)


unique_regions <- unique(data_station$region_name)


generate_summary <- function(data, data_station, unique_regions, prefix) {
  region_mean_list <- list()
  
  # Calculate region means
  for (region in unique_regions) {
    id_values <- data_station %>%
      filter(region_name == !!region) %>%
      pull(id)
    
    id_values_valid <- intersect(id_values, colnames(data))
    
    subset_df <- data %>%
      dplyr::select(ym, all_of(id_values_valid))
    
    region_mean <- rowMeans(subset_df[ , -1], na.rm = TRUE)
    
    region_mean_list[[region]] <- region_mean
  }
  
  # Create summary dataframe
  summary_df <- data.frame(ym = data$ym)
  for (region in names(region_mean_list)) {
    summary_df[[paste0(prefix, "_", region)]] <- region_mean_list[[region]]
  }
  
  # Calculate the overall mean
  summary_df[[paste0("mean_", prefix)]] <- rowMeans(summary_df[, !names(summary_df) %in% 'ym'])
  
  return(summary_df)
}


# Generate summaries for SPI and SSI and SGI 
summary_spi3 <- generate_summary(data_spi3, data_station, unique_regions, "spi3")
summary_spi6 <- generate_summary(data_spi6, data_station, unique_regions, "spi6")
summary_spi12 <- generate_summary(data_spi12, data_station, unique_regions, "spi12")

summary_ssi3 <- generate_summary(data_ssi3, data_station, unique_regions, "ssi3")
summary_ssi6 <- generate_summary(data_ssi6, data_station, unique_regions, "ssi6")
summary_ssi12 <- generate_summary(data_ssi12, data_station, unique_regions, "ssi12")

summary_sgi <- data.frame(
  ym = data_sgi$ym,
  sgi = rowMeans(data_sgi[ , !names(data_sgi) %in% "ym"], na.rm = TRUE)
)


# Combine to main dataset
monthly_count_Nexis <- bind_cols(monthly_count_Nexis, 
                                 summary_spi3 %>% dplyr::select(-ym), 
                                 summary_spi6 %>% dplyr::select(-ym), 
                                 summary_spi12 %>% dplyr::select(-ym), 
                                 summary_ssi3 %>% dplyr::select(-ym),
                                 summary_ssi6 %>% dplyr::select(-ym),
                                 summary_ssi12 %>% dplyr::select(-ym),
                                 summary_sgi %>% dplyr::select(-ym)
)

# Clean up workspace
rm(data_station, 
   selected_station_ssi, selected_station_spi, selected_station_sgi, 
   data_ssi3, data_ssi6, data_ssi12, 
   data_spi3, data_spi6, data_spi12, 
   data_sgi, 
   unique_regions, 
   summary_ssi3, summary_ssi6, summary_ssi12, 
   summary_spi3, summary_spi6, summary_spi12, 
   summary_sgi, 
   file_names_spi, file_names_ssi, file_names_sgi)

3 Statistical analysis of temporal alignment

3.1 Pair-wise correlation test of variables (Supplementary material 2)

selected_vars <- monthly_count_Nexis[, c("mean_ssi3", "mean_ssi6", "mean_ssi12",
                                         "mean_spi3", "mean_spi6", "mean_spi12",
                                         "sgi"
)]  
cor_matrix <- cor(selected_vars, use = "complete.obs")

ggcorrplot(cor_matrix, 
           method = "circle",     
           type = "lower",        
           lab = TRUE,            
           lab_size = 3,          
           colors = c("red", "white", "blue")) 

3.2 Negative binomial regression (Table 1, Supplementary material 4)

Explore all possible combinations of the candidate predictors

# Prepare the variables 
set.seed(123) 

monthly_count_Nexis_subset <- monthly_count_Nexis %>% 
  dplyr::select(year, month, count, ym, season,
                temp_3mean, temp_6mean, temp_12mean,
                mean_spi3, mean_spi6, mean_spi12, 
                mean_ssi3, mean_ssi6, mean_ssi12, 
                sgi)

monthly_count_Nexis_subset$season <- factor(monthly_count_Nexis_subset$season, 
                                            levels = c("SON", "DJF", "MAM", "JJA"))

monthly_count_Nexis_subset <- monthly_count_Nexis_subset %>%
  mutate(season_spring = ifelse(season == "MAM", 1, 0),
         season_summer = ifelse(season == "JJA", 1, 0),
         season_fall = ifelse(season == "SON", 1, 0),
         season_winter = ifelse(season == "DJF", 1, 0))


monthly_count_Nexis_subset$lag_count <- dplyr::lag(monthly_count_Nexis_subset$count, n = 1)

nb_results <- data.frame(
  CET = character(), 
  SPI = character(),
  SGI = character(),
  Seasonality = character(),
  aic = numeric(),
  deviance = numeric(),
  theta = numeric(),
  SE.theta = numeric(),
  stringsAsFactors = FALSE
)

independent_vars <- names(monthly_count_Nexis_subset)[
  names(monthly_count_Nexis_subset) != "count"
]

vars_temp <- grep("^temp_", independent_vars, value = TRUE)  
vars_spi <- grep("^mean_spi", independent_vars, value = TRUE)
vars_ssi <- grep("^mean_ssi", independent_vars, value = TRUE)
vars_gwater <- "sgi"
vars_season <- "season"


# Run the analysis 
for (var1 in c(NA, vars_temp)) {
  for (var2 in c(NA, vars_spi)) {
    for (var3 in c(NA, vars_gwater)) {
      for (var4 in c(NA, vars_season)) {
        
        if (is.na(var1) && is.na(var2) && is.na(var3) && is.na(var4)) {
          next  # Skip to the next iteration
        }
        
        formula_parts <- c()
        
        if (!is.na(var1)) {
          formula_parts <- c(formula_parts, var1)
        }
        if (!is.na(var2)) {
          formula_parts <- c(formula_parts, var2)
        }
        
        if (!is.na(var3)) {
          formula_parts <- c(formula_parts, var3)
        }
        if (!is.na(var4)) {
          formula_parts <- c(formula_parts, var4)
        }
        
        formula <- as.formula(paste0("count ~ ", 
                                     paste(formula_parts, collapse = " + "), 
                                     " + lag_count"))
        
        # Fit the Negative Binomial model
        skip_to_next <- FALSE
        
        tryCatch({
          nb_model <- glm.nb(formula, data = monthly_count_Nexis_subset) }, 
          error = function(e) {
            skip_to_next <<- TRUE
          })
        if (skip_to_next) next
        
        
        # Extract the required values
        model_aic <- nb_model$aic
        model_deviance <- nb_model$deviance
        model_theta <- nb_model$theta
        model_SE_theta <- nb_model$SE.theta
        
        # Append results to the data frame
        nb_results <- rbind(nb_results, data.frame(
          CET = ifelse(is.na(var1), "-", var1),
          SPI = ifelse(is.na(var2), "-", var2),
          SGI = ifelse(is.na(var3), "-", var3),
          Seasonality = ifelse(is.na(var4), "-", var4),
          aic = model_aic,
          deviance = model_deviance,
          theta = model_theta,
          SE.theta = model_SE_theta,
          stringsAsFactors = FALSE
        ))
        
        rm(model_aic, model_deviance, model_theta, model_SE_theta)
        
      }
    }
  }
}
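
# Note (sketch, not part of the original run): the four nested loops above
# enumerate the Cartesian product of the variable groups; the same model
# space can be listed compactly with expand.grid()
combos <- expand.grid(CET = c(NA, vars_temp), SPI = c(NA, vars_spi),
                      SGI = c(NA, vars_gwater), Seasonality = c(NA, vars_season),
                      stringsAsFactors = FALSE)
combos <- combos[rowSums(!is.na(combos)) > 0, ]  # drop the empty model
formulas <- apply(combos, 1, function(r)
  paste0("count ~ ", paste(c(na.omit(r), "lag_count"), collapse = " + ")))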

# Clean up workspace
rm(vars_temp, vars_spi, vars_ssi, vars_gwater, vars_season, 
   var1, var2, var3, var4)

3.2.1 Show the details of the best-fitting model (lowest AIC)

nb_model <- glm.nb(count ~ temp_12mean + mean_spi3 + sgi + season + lag_count, 
  data = monthly_count_Nexis_subset, 
  na.action = na.exclude,
  control = glm.control(maxit = 100))

summary(nb_model)
## 
## Call:
## glm.nb(formula = count ~ temp_12mean + mean_spi3 + sgi + season + 
##     lag_count, data = monthly_count_Nexis_subset, na.action = na.exclude, 
##     control = glm.control(maxit = 100), init.theta = 0.5267077673, 
##     link = log)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.735557   0.243955  -3.015 0.002569 ** 
## temp_12mean  0.394625   0.215748   1.829 0.067384 .  
## mean_spi3   -0.415491   0.123040  -3.377 0.000733 ***
## sgi         -0.867528   0.187490  -4.627 3.71e-06 ***
## seasonDJF    0.692755   0.319936   2.165 0.030365 *  
## seasonMAM    1.064976   0.306074   3.479 0.000502 ***
## seasonJJA    1.107580   0.303781   3.646 0.000266 ***
## lag_count    0.045931   0.006465   7.105 1.20e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.5267) family taken to be 1)
## 
##     Null deviance: 477.79  on 282  degrees of freedom
## Residual deviance: 247.44  on 275  degrees of freedom
##   (1 observation deleted due to missingness)
## AIC: 883.87
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.5267 
##           Std. Err.:  0.0775 
## 
##  2 x log-likelihood:  -865.8690

3.2.2 Show the details of the parsimonious model (third-lowest AIC, with fewer variables)

nb_model_parsi <- glm.nb(count ~ mean_spi12 + season + lag_count,
  data = monthly_count_Nexis_subset, 
  na.action = na.exclude,
  control = glm.control(maxit = 100))

summary(nb_model_parsi)
## 
## Call:
## glm.nb(formula = count ~ mean_spi12 + season + lag_count, data = monthly_count_Nexis_subset, 
##     na.action = na.exclude, control = glm.control(maxit = 100), 
##     init.theta = 0.502878727, link = log)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.528358   0.245573  -2.152 0.031434 *  
## mean_spi12  -0.834511   0.125272  -6.662 2.71e-11 ***
## seasonDJF    0.669849   0.323615   2.070 0.038462 *  
## seasonMAM    1.107087   0.311349   3.556 0.000377 ***
## seasonJJA    1.206022   0.310088   3.889 0.000101 ***
## lag_count    0.050184   0.006499   7.722 1.14e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.5029) family taken to be 1)
## 
##     Null deviance: 463.06  on 282  degrees of freedom
## Residual deviance: 246.52  on 277  degrees of freedom
##   (1 observation deleted due to missingness)
## AIC: 885.06
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.5029 
##           Std. Err.:  0.0729 
## 
##  2 x log-likelihood:  -871.0600

3.3 Test model performance of the best and parsimonious models (Supplementary material 3)

The best AIC model

fitted_values <- round(fitted(nb_model))

monthly_count_Nexis_subset$order <- c(1:nrow(monthly_count_Nexis_subset))

plot(monthly_count_Nexis_subset$order, 
     monthly_count_Nexis_subset$count, 
     type = "l", col = "blue", lwd = 1,
     ylab = " ", xlab = " ", 
     main = "CET-12, SPI-3, SGI-1, Seasonality, Lag",
     xaxt = "n")

lines(monthly_count_Nexis_subset$order, fitted_values, col = "red", lwd = 1)

legend("topleft", 
       legend = c("Count", "Modeled Count"), 
       col = c("blue", "red"), lwd = 1)

The parsimonious model

fitted_values <- round(fitted(nb_model_parsi))

monthly_count_Nexis_subset$order <- c(1:nrow(monthly_count_Nexis_subset))

plot(monthly_count_Nexis_subset$order, 
     monthly_count_Nexis_subset$count, 
     type = "l", col = "blue", lwd = 1,
     ylab = " ", xlab = " ", 
     main = "SPI12, Seasonality, Lag",
     xaxt = "n")

lines(monthly_count_Nexis_subset$order, fitted_values, col = "red", lwd = 1)

legend("topleft", 
       legend = c("Count", "Modeled Count"), 
       col = c("blue", "red"), lwd = 1)

3.4 Temporal plot with the best-fitting variables (Figure 1)

x_breaks <- seq(which(monthly_count_Nexis_subset$ym == "2000-1"), 
                length(monthly_count_Nexis_subset$ym), by = 12)


monthly_count_Nexis_subset %>%
  mutate(ym = factor(ym, levels = ym)) %>%
  mutate(order = 1:n()) %>%
  
  mutate(condition_flag = case_when(
    temp_12mean > 0 & sgi < 0 & mean_spi3 < 0 & 
      (season_spring == 1 | season_summer == 1) ~ "favorable",
    TRUE ~ NA_character_
  )) %>%
  
  pivot_longer(cols = c(temp_12mean, mean_spi3, sgi),
               names_to = "variable",
               values_to = "value") %>%
  
  mutate(variable = factor(variable, levels = c("temp_12mean", "mean_spi3", "sgi"))) %>%
  
  ggplot(aes(x = order)) +
  
  geom_rect(data = . %>% filter(condition_flag == "favorable"),
            aes(xmin = order - 0.5, xmax = order + 0.5, ymin = -Inf, ymax = Inf),
            fill = "#F0E442", alpha = 0.2) + #gold
  
  geom_bar(aes(y = count / 100, fill = "dimgray"), stat = "identity", width = 1.1) +
  
  scale_fill_manual(values = c("dimgray" = "dimgray"), guide = "none") +
  
  geom_line(aes(y = value, color = variable, size = variable, linetype = variable)) +
  
  scale_color_manual(values = c("temp_12mean" = "#D55E00", 
                                "mean_spi3" = "#0072B2",
                                "sgi" = "#009E73"), 
                     labels = c("temp_12mean" = "CET-12",
                                "mean_spi3" = "SPI-3",
                                "sgi" = "SGI-1"),
                     name = "") +
  
  scale_size_manual(values = c("temp_12mean" = 0.5, 
                               "mean_spi3" = 0.5, 
                               "sgi" = 0.5), 
                    guide = "none") +
  
  scale_linetype_manual(values = c("temp_12mean" = "solid",
                                   "mean_spi3" = "solid", "sgi" = "solid"),
                        guide = "none") + 
  
  geom_vline(xintercept = x_breaks, linetype = "dotted", color = "darkgrey", size = 0.3) +
  
  scale_y_continuous(
    name = "Index score",
    sec.axis = sec_axis(~ . * 100, name = "Article count")
  ) +
  
  labs(x = " ", color = " ") +
  
  geom_hline(yintercept = 0, color = "black", linetype = "dashed", size = 0.3) +
  
  scale_x_continuous(
    expand = c(0, 0),
    breaks = seq(which(monthly_count_Nexis_subset$ym == "2000-1")[1], 
                 length(monthly_count_Nexis_subset$ym), by = 12),
    labels = monthly_count_Nexis_subset$year[seq(which(monthly_count_Nexis_subset$ym == "2000-1"),
                                                 length(monthly_count_Nexis_subset$ym), by = 12)]
  ) +
  
  theme_bw() +
  
  theme(
    axis.text.x = element_text(angle = 25, hjust = 1),
    legend.position = c(0.03, 0.98), # top-left inside
    legend.justification = c("left", "top"),
    legend.background = element_rect(fill = alpha("white", 0.6))
  )

3.5 Heatmap plot with the best-fitting variables (Figure 2)

# Define winsorisation function for visualisation 
winsorize <- function(x, p = c(0.01, 0.99)) {
  q <- quantile(x, probs = p, na.rm = TRUE)
  x[x < q[1]] <- q[1]
  x[x > q[2]] <- q[2]
  x
}
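
# Quick illustration (not part of the analysis): values beyond the 1st and
# 99th percentiles are clipped to those percentiles
winsorize(c(0, 1, 2, 3, 100))
## [1]  0.04  1.00  2.00  3.00 96.12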

monthly_count_Nexis_subset$ym <- as.character(monthly_count_Nexis_subset$ym)

monthly_count_Nexis_subset$year <- as.numeric(monthly_count_Nexis_subset$year)

monthly_count_Nexis_subset$month <- as.numeric(monthly_count_Nexis_subset$month)


monthly_count_Nexis_0 <- data.frame("ym" = monthly_count_Nexis_subset$ym, 
                                    "year" = monthly_count_Nexis_subset$year,
                                    "month" = monthly_count_Nexis_subset$month,
                                    "count" = monthly_count_Nexis_subset$count, 
                                    "source" = "Article count")

monthly_count_Nexis_0$count_rescale2 <- rescale(winsorize(monthly_count_Nexis_0$count))


# CET-12
monthly_count_Nexis_T <- data.frame("ym" = monthly_count_Nexis_subset$ym, 
                                    "year" = monthly_count_Nexis_subset$year,
                                    "month" = monthly_count_Nexis_subset$month,
                                    "count" = round(monthly_count_Nexis_subset$temp_12mean, 
                                                    digits = 1),
                                    "source" = "CET-12")


# SPI-3
monthly_count_Nexis_P <- data.frame("ym" = monthly_count_Nexis_subset$ym, 
                                    "year" = monthly_count_Nexis_subset$year,
                                    "month" = monthly_count_Nexis_subset$month,
                                    "count" = round(monthly_count_Nexis_subset$mean_spi3, 
                                                    digits = 1), 
                                    "source" = "SPI-3")


# SGI-1
monthly_count_Nexis_G <- data.frame("ym" = monthly_count_Nexis_subset$ym, 
                                    "year" = monthly_count_Nexis_subset$year,
                                    "month" = monthly_count_Nexis_subset$month,
                                    "count" = round(monthly_count_Nexis_subset$sgi, 
                                                    digits = 1), 
                                    "source" = "SGI-1")


desired_order <- c("Article count", "CET-12", "SPI-3", "SGI-1")

monthly_count_Nexis_0$source <- factor(monthly_count_Nexis_0$source, levels = desired_order)
monthly_count_Nexis_T$source <- factor(monthly_count_Nexis_T$source, levels = desired_order)
monthly_count_Nexis_P$source <- factor(monthly_count_Nexis_P$source, levels = desired_order)
monthly_count_Nexis_G$source <- factor(monthly_count_Nexis_G$source, levels = desired_order)

Plot the variables

ggplot() + 
  
  # CET-12
  geom_point(
    data = monthly_count_Nexis_T,
    aes(month, source, color = count), size = 6) +
  
  scale_color_gradient2(low = "white", mid = "white", high = "#D55E00") + 
  
  geom_text(data = monthly_count_Nexis_T,
            aes(month, source, label = count), 
            color = "black", size = 2) +
  
  # SPI-3
  new_scale_color() +
  
  geom_point(
    data = monthly_count_Nexis_P,
    aes(month, source, color = count), size = 6) +
  
  scale_color_gradient2(low = "#0072B2", mid = "white", high = "white") +
  
  geom_text(data = monthly_count_Nexis_P,
            aes(month, source, label = count), 
            color = "black", size = 2) +
  
  # SGI-1
  new_scale_color() +
  
  geom_point(
    data = monthly_count_Nexis_G,
    aes(month, source, color = count), size = 6) +
  
  scale_color_gradient2(low = "#009E73", mid = "white", high = "white") +
  
  geom_text(data = monthly_count_Nexis_G,
            aes(month, source, label = count), 
            color = "black", size = 2) +
  
  
  # Article count  
  new_scale_fill() +
  geom_tile(
    data = monthly_count_Nexis_0,
    aes(month, source, fill=count_rescale2) #count
  ) +
  
  scale_fill_gradient(low = "white", high = "black") + 
  
  geom_text(data = monthly_count_Nexis_0,
            aes(month, source, label = count),
            color = ifelse(monthly_count_Nexis_0$count > 30, 
                           "white", "black"), size = 2) +
  
  facet_wrap(~year) +
  
  theme_minimal() +
  
  labs(x = " ", y = " ", fill = " ") +
  
  scale_x_continuous(breaks = 1:12, labels = month, 
                     position = "top", 
                     expand = c(0,0)) + 
  
  scale_y_discrete(limits = rev(desired_order)) + 
  
  theme(legend.position = "none", 
        
        panel.grid.major = element_blank(),
        
        panel.grid.minor = element_blank(), 
        
        strip.placement = "outside", 
        
        panel.spacing.y = unit(0.5, "lines"), 
        
        panel.spacing.x = unit(0.5, "lines"),
        
        strip.text.x = element_text(size = 8, face = "bold"), 
        
        axis.text.x = element_text(size = 6, face = "bold.italic") 
  )

rm(monthly_count_Nexis_0, monthly_count_Nexis_T, 
   monthly_count_Nexis_P, monthly_count_Nexis_G)

4 Topic modeling

We follow the STM tutorial by Roberts et al. (2019), DOI: https://doi.org/10.18637/jss.v091.i02

4.1 NLP pre-processing

Subset data for the target periods (Jan–Jun 2012 and Jul–Dec 2022)

reference <- data.frame("year" = rep(c(2012, 2022), each = 6),
                        "month" = c(1,2,3,4,5,6,       # for 2012
                                    7,8,9,10,11,12))    # for 2022


datarange <- list("2012" = "2012, Jan to Jun", 
                  "2022" = "2022, July to Dec")


data_input <- Nexisdata_df1

Initialize the NLP model

#spacy_download_langmodel(model = "en_core_web_trf")
spacy_initialize(model = "en_core_web_trf") 

Proceed with NLP

spacy_0 <- spacy_parse(paste0(data_input$Headline, ". ", data_input$Sentence), 
                       dependency = T, nounphrase = TRUE) 


data_entity <- entity_extract(spacy_0, type = "named", concatenator = "_")

data_entity$ngram <- sapply(strsplit(data_entity$entity, "_"), length)

data_entity_consolidate <- udpipe::txt_recode_ngram(spacy_0$token, 
                                                    compound = data_entity$entity, 
                                                    ngram = data_entity$ngram, 
                                                    sep = "_") %>% as.data.frame()

colnames(data_entity_consolidate) <- "entity_result"

spacy_1 <- cbind(spacy_0, data_entity_consolidate)

spacy_1$entity[spacy_1$entity %in% 
                  c("DATE_B", "DATE_I", "LANGUAGE_B", "LANGUAGE_I", "QUANTITY_I", 
                    "QUANTITY_B", "CARDINAL_I", "CARDINAL_B", "ORDINAL_B", 
                    "ORDINAL_I", "TIME_B", "TIME_I", "MONEY_I", "MONEY_B", 
                    "PERCENT_B", "PERCENT_I")] <- "" 

spacy_1$final <- with(spacy_1, ifelse(entity == "", lemma, entity_result)) 

spacy_1 <- spacy_1 %>% drop_na() 

spacy_1$docsen <- paste0(spacy_1$doc_id, "_", spacy_1$sentence_id) 

sw <- quanteda::stopwords("english") %>% data.frame() 

colnames(sw) <- "final"

spacy_2 <-  spacy_1 %>% dplyr::anti_join(sw, by = "final")

spacy_2 <- filter(spacy_2, spacy_2$pos != "PUNCT") # remove punctuation

spacy_2 <- filter(spacy_2, nchar(spacy_2$final) != 1) # remove single-character tokens

spacy_2 <- spacy_2[!(spacy_2$final %in% 
                       c("I", "’", "-", "•", "'", "/", "’s", "'s", "’re", "’ve", "n't")),]


# Save the processed data frames
saveRDS(spacy_1, "./ProcessedData/N_drought+england_ALL_spacy1_2024.rds")

saveRDS(spacy_2, "./ProcessedData/N_drought+england_ALL_spacy2_2024.rds")

4.2 Get data ready for STM

spacy_2 <- readRDS("./ProcessedData/N_drought+england_ALL_spacy2_2024.rds")

x <- spacy_2 %>% 
  filter(pos %in% c("NOUN")) %>% 
  group_by(doc_id) %>% 
  mutate(text_string = paste(final, collapse = " ")) %>% 
  dplyr::select(doc_id, text_string) %>% unique()


temp <- data_input %>% dplyr::select(Art_ID, year, month, Newspaper)

temp <- temp %>%
  mutate(source = ifelse(str_detect(tolower(Newspaper), 
                                    'telegraph|independent|times|guardian|financial'), 
                         "broadsheet", 
                         ifelse(str_detect(tolower(Newspaper), 
                                           'mail|mirror|sun'), "tabloid", "NA")))

temp$doc_id <- paste0("text", c(1:nrow(temp)))


x <- left_join(x, temp, by = "doc_id")

x_token <- x$text_string %>% tokens(what = "fastestword") 

x_dfm <- dfm(x_token, tolower = F)

x_stm <- convert(x_dfm, to = "stm")
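
# Structural check (illustrative): the stm input is a list holding the
# documents (word-index/count matrices), the vocabulary, and document metadata
str(x_stm, max.level = 1)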

rm(temp)

4.3 Define the number of topics k (Supplementary material 3)

Try different k and find the optimal number of topics

K1 <- c(4, 6, 8, 10, 12, 15, 20, 30, 50)

set.seed(123)
x_fit1 <- searchK(x_stm$documents, x_stm$vocab, K = K1, verbose = F)

x_plot1 <- data.frame("K" = K1, 
                      
                      "Coherence" = unlist(x_fit1$results$semcoh),  
                      
                      "Exclusivity" = unlist(x_fit1$results$exclus), 
                      
                      "heldout" = unlist(x_fit1$results$heldout),
                      
                      "residual" = unlist(x_fit1$results$residual))


x_plot1 <- reshape2::melt(x_plot1, id = c("K"))

ggplot(x_plot1, aes(K, value, color = variable, group = 1)) +
  
  geom_point(show.legend = FALSE) +
  
  geom_line(show.legend = FALSE) + 
  
  scale_x_continuous(breaks = unique(x_plot1$K)) +
  
  facet_wrap(~variable, scales = "free_y") + 
  
  theme_bw() +
  
  theme(panel.grid.minor.x = element_blank()) + # Remove secondary vertical lines
  
  labs(x = "Number of topics K", 
       title = "Statistical fit of models with different K")

4.4 Run the STM with the optimal k

4.4.1 Run the model

optimalK <- 15

set.seed(123)

x_model <- stm(documents = x_stm$documents, 
               vocab = x_stm$vocab, 
               K = as.numeric(optimalK),
               data = x, 
               verbose = F) 
x_model
## A topic model with 15 topics, 836 documents and a 5588 word dictionary.

4.4.2 Review the topical proportions (Supplementary material 6)

Based on this result, we renumber the topics for readability (e.g. Topic 5 -> Topic 2)

plot.STM(x_model, type = "summary") 

rename_rules <- c(
  "V1" = "#1",
  "V5" = "#2",
  "V14" = "#3",
  "V9" = "#4",
  "V13" = "#5",
  "V6" = "#6",
  "V2" = "#7",
  "V4" = "#8",
  "V7" = "#9",
  "V15" = "#10",
  "V8" = "#11",
  "V10" = "#12",
  "V11" = "#13",
  "V12" = "#14",
  "V3" = "#15"
)

4.5 Understand topic model results (Table 2)

Extract the top 50 keywords per topic

x_topic <- labelTopics(x_model, n = 50) 


x_topic_prob <- data.frame("features" = t(x_topic$prob)) 

colnames(x_topic_prob) <- paste0("V", c(1:as.numeric(optimalK)))


x_topic_frex <- data.frame("features" = t(x_topic$frex)) 

colnames(x_topic_frex) <- paste0("V", c(1:as.numeric(optimalK)))

Print the top 10 keywords by topic-word probability

x_topic_prob <- x_topic_prob %>%
  rename_with(~ recode(., !!!rename_rules))

topic_order <- order(as.integer(str_extract(names(x_topic_prob), "\\d+")))
x_topic_prob <- x_topic_prob[, topic_order]

x_topic_prob %>% head(10)
##          #1        #2         #3       #4       #5          #6        #7
## 1   drought      year      water     rain    water temperature    farmer
## 2     water      cent    company    flood      ban         day      crop
## 3       ban       per     supply flooding hosepipe        week   drought
## 4  hosepipe     water       year  drought    plant    heatwave      food
## 5   company  rainfall       leak  weather   garden     weather     price
## 6     level reservoir    drought     week    order       water      year
## 7      area     month  reservoir    water  drought        fire   weather
## 8  rainfall     level    leakage  warning      car        heat    potato
## 9     river      rain       plan downpour   people   condition condition
## 10     part   drought government     area  company        area vegetable
##             #8       #9       #10         #11         #12    #13            #14
## 1        water   garden   climate       water       water   year           year
## 2        river     year    change         ban      summer   cent          water
## 3      drought    plant      year    customer        week    per         beaver
## 4     wildlife  drought   weather    hosepipe     drought  price        drought
## 5         year   flower   drought     company         day market         people
## 6         fish gardener scientist     drought        rain   time        project
## 7        level    thing   pattern temperature     weather school infrastructure
## 8      wetland     goal    summer        area      people   sale     population
## 9  environment    grass     world         day temperature  month      reservoir
## 10        bird      day   warming      supply        heat  beach        country
##        #15
## 1     tree
## 2  drought
## 3     year
## 4    leave
## 5   summer
## 6    plant
## 7    water
## 8   autumn
## 9   flower
## 10 robinia

Print the top 10 keywords by FREX score, which balances a word's frequency against its exclusivity to the topic

x_topic_frex <- x_topic_frex %>%
  rename_with(~ recode(., !!!rename_rules))

topic_order <- order(as.integer(str_extract(names(x_topic_frex), "\\d+")))
x_topic_frex <- x_topic_frex[, topic_order]

x_topic_frex %>% head(10)
##             #1        #2          #3         #4        #5           #6
## 1  restriction       per        leak   flooding      drip thunderstorm
## 2       status      cent      target      flood     order  temperature
## 3  groundwater   average     leakage   downpour      butt     barbecue
## 4          ban reservoir       meter      flash sprinkler     heatwave
## 5     hosepipe        mm        bill       wind      lawn         fire
## 6      drought  capacity    metering subsidence  watering  firefighter
## 7  environment  rainfall   regulator    tornado      hose        blaze
## 8       agency    figure        plan      claim       can     wildfire
## 9         area     month    industry       road       car         crew
## 10       level      fear consumption    sandbag  swimming         high
##           #7       #8        #9        #10      #11          #12       #13
## 1     potato    trout   prairie  scientist customer     ladybird jellyfish
## 2      wheat     vole      goal    climate  leakage    standpipe     fever
## 3    harvest  wetland    dahlia    pattern   bottle       street    market
## 4  livestock     fish    trophy     change property championship     share
## 5     barley   salmon      game    warming      bin       memory    pollen
## 6     grower   stream      show      study    litre         bath  sufferer
## 7       crop wildlife      rose   emission   health        swarm  investor
## 8      yield  lapwing  defender    extreme wildfire        trial    school
## 9  vegetable    chalk euphorbia strawberry     fire        clock   analyst
## 10    carrot   oxygen verbascum      world    amber       umpire     beach
##                #14          #15
## 1           beaver      robinia
## 2             cave     bluebell
## 3             fund         tree
## 4              bog pseudoacacia
## 5              dam          ivy
## 6             peat        leave
## 7          project         leaf
## 8  embed.component      blossom
## 9        developer   blackberry
## 10            debt      sapling

4.6 Post-processing

# Assign each article its single dominant topic (maximum theta value)
theta <- x_model[["theta"]] %>% as.data.frame()


theta$max <- apply(theta, 1, max, na.rm=TRUE)

theta$temp_topic <- colnames(theta)[apply(theta,1,which.max)]

theta$final_topic <- colnames(theta)[apply(theta[, c(1:as.numeric(optimalK))],1,which.max)]


theta$year <- x$year

theta$month <- x$month

theta$Art_ID <- x$Art_ID

theta$source <- x$source


# Reorder topic numbers according to plot.STM(x_model, type = "summary") 
theta <- theta %>%
  mutate(final_topic = case_when(
    final_topic == "V1" ~ "V1",
    final_topic == "V5" ~ "V2",
    final_topic == "V14" ~ "V3",
    final_topic == "V9" ~ "V4",
    final_topic == "V13" ~ "V5",
    final_topic == "V6" ~ "V6",
    final_topic == "V2" ~ "V7",
    final_topic == "V4" ~ "V8",
    final_topic == "V7" ~ "V9", #n/a
    final_topic == "V15" ~ "V10",
    final_topic == "V8" ~ "V11",
    final_topic == "V10" ~ "V12", #n/a
    final_topic == "V11" ~ "V13", #n/a
    final_topic == "V12" ~ "V14",
    final_topic == "V3" ~ "V15",
    TRUE ~ final_topic  # Keep other values unchanged
  ))
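
The case_when() block repeats the mapping already stored in rename_rules; the renumbering can equally be derived from it, shown below as a self-checking sketch:

# Sketch: derive the same renumbering from rename_rules ("V5" -> "V2", etc.)
topic_map <- setNames(sub("^#", "V", rename_rules), names(rename_rules))
stopifnot(identical(dplyr::recode(c("V5", "V14", "V1"), !!!topic_map),
                    c("V2", "V3", "V1")))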

4.7 Visualize topic modeling results

4.7.1 Create a reference

topic_relabel <- paste0("V", c(1:15)) 

temp_order <- make.dt(x_model) # Create the document-topic proportion table
temp_order <- summarize_all(temp_order, mean)
temp_order <- temp_order[, -c(1)]

temp <- data.frame(values = as.numeric(temp_order[1, ]), labels = topic_relabel)
temp_order <- aggregate(values ~ labels, data = temp, FUN = sum)
print(temp_order)
##    labels     values
## 1      V1 0.23758342
## 2     V10 0.02539055
## 3     V11 0.02469192
## 4     V12 0.02331535
## 5     V13 0.07729065
## 6     V14 0.10570514
## 7     V15 0.03347381
## 8      V2 0.05676462
## 9      V3 0.02297600
## 10     V4 0.05086555
## 11     V5 0.11672582
## 12     V6 0.07460046
## 13     V7 0.03408245
## 14     V8 0.03058337
## 15     V9 0.08595087
temp_order$labels <- gsub("^V", "", temp_order$labels)
temp_order <- temp_order %>% arrange(desc(values))
temp_order <- unlist(temp_order$labels)
temp_order <- as.numeric(temp_order)
print(temp_order)
##  [1]  1  5 14  9 13  6  2  4  7 15  8 10 11 12  3
# Define the topic labels 
topic_label_new <- c()

for (k1 in temp_order) {
  temp <- paste0(which(temp_order == k1), ". ", x_topic_prob[1,k1], 
                 ", ", x_topic_prob[2,k1], 
                 ", ", x_topic_prob[3,k1])
  
  topic_label_new <- c(topic_label_new, temp)
  rm(temp, k1)
}

print(topic_label_new)
##  [1] "1. drought, water, ban"     "2. water, ban, hosepipe"   
##  [3] "3. year, water, beaver"     "4. garden, year, plant"    
##  [5] "5. year, cent, per"         "6. temperature, day, week" 
##  [7] "7. year, cent, per"         "8. rain, flood, flooding"  
##  [9] "9. farmer, crop, drought"   "10. tree, drought, year"   
## [11] "11. water, river, drought"  "12. climate, change, year" 
## [13] "13. water, ban, customer"   "14. water, summer, week"   
## [15] "15. water, company, supply"
topic_level <- paste0("V", c(1:15)) 

topic_label_new1 <- c("#1.Drought and hosepipe ban", "#2.Water shortage", 
                      "#3.Water management", "#4.Flooding", 
                      "#5.Water use impacts", "#6.Heatwave", 
                      "#7.Agriculture", "#8.Aquatic ecosystems", 
                      "#9.n/a", "#10.Climate change", 
                      "#11.Temperature and hosepipe ban", "#12.n/a", 
                      "#13.n/a", "#14.Beavers", "#15.Terrestrial ecosystems")

topic_label_new <- paste0("#", c(1:15)) # short labels used in the treemaps

4.7.2 Colorblind test for color palette

library(rcartocolor)
library(dichromat)
library(ggplot2)
library(scales)
library(gridExtra)

set.seed(123) 

# Define the color palette, adapted from the CARTO "Safe" palette
# (carto_pal(12, "Safe")); white marks the n/a topics
colorpalette <- c("#88CCEE", "#CC6677", "#117733",
                  "#AA4499", "#44AA99", "#882255",
                  "#DDCC77", "#6699CC", "white", "#999933",
                  "#332288", "white", "white", "#661100", "#888888")

colorpalette
##  [1] "#88CCEE" "#CC6677" "#117733" "#AA4499" "#44AA99" "#882255" "#DDCC77"
##  [8] "#6699CC" "white"   "#999933" "#332288" "white"   "white"   "#661100"
## [15] "#888888"
shuffled_palette <- sample(colorpalette)

simulated_protan <- dichromat(shuffled_palette, type = "protan") # Red-green blindness
simulated_deutan <- dichromat(shuffled_palette, type = "deutan") # Green blindness
simulated_tritan <- dichromat(shuffled_palette, type = "tritan") # Blue-yellow blindness


display_palette <- function(palette, title) {
  data <- data.frame(
    x = seq_along(palette),
    color = palette
  )
  ggplot(data, aes(x = x, y = 1, fill = color)) +
    geom_tile() +
    scale_fill_identity() + # use hex codes directly so tiles keep palette order
    labs(title = title) +
    theme_void() +
    theme(legend.position = "none")
}

original_plot <- display_palette(colorpalette, "Original Palette")
shuffled_plot <- display_palette(shuffled_palette, "Shuffled Palette")
protan_plot <- display_palette(simulated_protan, "Protanopia Simulation")
deutan_plot <- display_palette(simulated_deutan, "Deuteranopia Simulation")
tritan_plot <- display_palette(simulated_tritan, "Tritanopia Simulation")

grid.arrange(original_plot, shuffled_plot, protan_plot, deutan_plot, tritan_plot, ncol = 5)

rm(shuffled_palette, simulated_protan, simulated_deutan, simulated_tritan,
   original_plot, shuffled_plot, protan_plot, deutan_plot, tritan_plot)

4.7.3 Topic composition by month (Supplementary material 7)

Generate a summary file

theta_summary3 <- theta %>% 
  
  complete(year, month, final_topic, fill = list(z = NA)) %>% 
  
  group_by(year, month, final_topic, .drop = F) %>% 
  
  summarize(count=sum(!is.na(Art_ID)), .groups = "drop") %>% 
  
  group_by(month) %>% 
  
  mutate(ratio = count/sum(count)) %>% 
  
  drop_na() %>%
  
  mutate(ym = paste0(year, "-", month))

theta_summary3$month <- factor(theta_summary3$month) 

theta_summary3 <- theta_summary3 %>% as.data.frame()

count_bymonth3 <- aggregate(count ~ ym, data = theta_summary3, FUN = sum) %>% 
  dplyr::select(count) %>% as.vector()

Plot by count

ggplot(theta_summary3, 
       aes(x = month, y = count, fill = factor(final_topic, levels = topic_level))) + 
  
  geom_bar(stat = "identity", position = position_stack(reverse = T)) + 
  
  theme_bw() + 
  
  xlab(" ") + 
  
  ylab("count")+ 
  
  facet_wrap(~year,  ncol=5) + 
  
  scale_fill_manual(labels = topic_label_new1, values = colorpalette, na.value = "gray70") +
  
  theme(legend.position = "right", legend.title = element_blank(),
        
        plot.title = element_text(size=12, face = "bold"),
        
        plot.subtitle =element_text(size=10, face = "italic")) 

4.7.4a Visualize by month: 2012 (Fig. 3a)

Set a target year

y <- 2012
y0 <- "2012"

theta_summary2012 <- theta %>% 
  
  dplyr::filter(year == y) %>% 
  
  dplyr::filter(month %in% reference$month[reference$year == y]) %>% 
  
  complete(month, final_topic, fill = list(z = NA)) %>% 
  
  group_by(month, final_topic, .drop = F) %>% 
  
  summarize(count=sum(!is.na(Art_ID)), .groups = "drop") %>% 
  
  group_by(month) %>% 
  
  mutate(ratio = count/sum(count)) %>% 
  
  drop_na() %>% 
  
  as.data.frame()

count_bymonth <- aggregate(count ~ month, data = theta_summary2012, FUN = sum) %>% 
  dplyr::select(count) %>% as.vector()

Plot by count – treemap

temp_palette <- data.frame(final_topic = topic_level,
                           colors = colorpalette, 
                           topic_label0 = topic_label_new)

theta_summary2012 <- left_join(theta_summary2012, temp_palette, by = "final_topic")

theta_summary2012 <- theta_summary2012 %>%
  mutate(month_abb = month.abb[month])


# png(paste0("./IMG/treemap_allnoun_topic", optimalK, "_", datarange[y0], 
#            ".png"), width = 500, height = 700)


treemap(theta_summary2012,
        
        index = c("month_abb", "topic_label0"),
        
        vSize = "count",
        
        type = "color",
        
        vColor = "colors",
        
        fontface.labels=c(2,1),
        
        bg.labels=c("transparent"),
        
        align.labels=list(
          c("left", "top"),
          c("center", "center")
        ),
        
        fontsize.labels=c(30,18),
        
        fontsize.title = 20,
        
        border.col=c("black","white"),
        
        force.print.labels = TRUE, 
        
        title = paste0(datarange[y0])
)

# dev.off()

4.7.4b Visualize by month: 2022 (Fig. 3b)

Set a target year

y <- 2022
y0 <- "2022"

theta_summary2022 <- theta %>% 
  
  dplyr::filter(year == y) %>% 
  
  dplyr::filter(month %in% reference$month[reference$year == y]) %>% 
  
  complete(month, final_topic, fill = list(z = NA)) %>% 
  
  group_by(month, final_topic, .drop = F) %>% 
  
  summarize(count=sum(!is.na(Art_ID)), .groups = "drop") %>% 
  
  group_by(month) %>% 
  
  mutate(ratio = count/sum(count)) %>% 
  
  drop_na() %>% 
  
  as.data.frame()

count_bymonth <- aggregate(count ~ month, data = theta_summary2022, FUN = sum) %>% 
  dplyr::select(count) %>% as.vector()

Plot by count – treemap

temp_palette <- data.frame(final_topic = topic_level,
                           colors = colorpalette, 
                           topic_label0 = topic_label_new)

theta_summary2022 <- left_join(theta_summary2022, temp_palette, by = "final_topic")

theta_summary2022 <- theta_summary2022 %>%
  mutate(month_abb = month.abb[month])


# png(paste0("./IMG/treemap_allnoun_topic", optimalK, "_", datarange[y0], 
#           ".png"), width = 500, height = 700)

treemap(theta_summary2022,
        
        index = c("month_abb", "topic_label0"),
        
        vSize = "count",
        
        type = "color",
        
        vColor = "colors",
        
        fontface.labels=c(2,1),
        
        bg.labels=c("transparent"),
        
        align.labels=list(
          c("left", "top"),
          c("center", "center")
        ),
        
        fontsize.labels=c(30,18),
        
        fontsize.title = 20,
        
        border.col=c("black","white"),
        
        force.print.labels = TRUE, 
        
        title = paste0(datarange[y0])
)

# dev.off()

4.7.4c Generate the legend

# png("./IMG/legend.png", width = 500, height = 500) # Save to a file
plot.new()
legend("center", legend = topic_label_new1, fill = colorpalette, cex = 1.7, bty = "n") #, title = "Legend"

# dev.off()