Define System Language and Time
Sys.setenv(LANG = "en")
Sys.setlocale("LC_TIME", "English")
## [1] "English_United States.1252"
Library Packages
# Load required packages
invisible(
lapply(
c(
# Nexis data handling
'LexisNexisTools',
# Data handling
'stringr', 'dplyr', 'broom', 'tidytext', 'tidyr', 'tidyverse', 'scales',
'reshape2', 'zoo',
# NLP tools
'udpipe', 'spacyr', 'quanteda',
# Correlation analysis
'ggcorrplot', 'corrplot',
# Regression analysis
'MASS',
# Topic modeling
'stm',
# File handling
'readxl', 'xlsx',
# Data visualization
'ggplot2', 'wordcloud', 'ggwordcloud', 'RColorBrewer', 'ggpubr',
'ggnewscale', 'treemap', 'ggh4x', 'viridis',
'rcartocolor'
),
require, character.only = TRUE
)
)
# Silence dplyr::summarise() grouping messages
options(dplyr.summarise.inform = FALSE)
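Any packages that are not yet installed can be added beforehand; a minimal sketch (pkgs mirrors the vector above, abbreviated here for illustration):
# Install any missing packages first (sketch; extend pkgs to the full list above)
pkgs <- c("LexisNexisTools", "stringr", "dplyr", "udpipe", "spacyr",
          "quanteda", "MASS", "stm", "ggplot2")
missing_pkgs <- setdiff(pkgs, rownames(installed.packages()))
if (length(missing_pkgs) > 0) install.packages(missing_pkgs)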
System Overview
xfun::session_info(dependencies = FALSE)
## R version 4.3.2 (2023-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19045)
##
##
## Locale:
## LC_COLLATE=English_United States.utf8
## LC_CTYPE=English_United States.utf8
## LC_MONETARY=English_United States.utf8
## LC_NUMERIC=C
## LC_TIME=English_United States.1252
##
## time zone: Europe/Zurich
## tzcode source: internal
##
## Package version:
## rcartocolor_2.1.1 viridis_0.6.3
## viridisLite_0.4.2 ggh4x_0.2.8
## treemap_2.4-4 ggnewscale_0.4.9
## ggpubr_0.6.0 ggwordcloud_0.5.0
## wordcloud_2.6 RColorBrewer_1.1-3
## xlsx_0.6.5 readxl_1.4.2
## stm_1.3.6 MASS_7.3-60
## corrplot_0.92 ggcorrplot_0.1.4.1
## quanteda_3.3.1 spacyr_1.2.1
## udpipe_0.8.11 zoo_1.8-12
## reshape2_1.4.4 scales_1.2.1
## lubridate_1.9.2 forcats_1.0.0
## purrr_1.0.1 readr_2.1.4
## tibble_3.2.1 ggplot2_3.4.2
## tidyverse_2.0.0 tidyr_1.3.0
## tidytext_0.4.1 broom_1.0.4
## dplyr_1.1.2 stringr_1.5.1
## LexisNexisTools_0.3.7 pbapply_1.7-2
## gridExtra_2.3 rlang_1.1.1
## magrittr_2.0.3 gridBase_0.4-7
## compiler_4.3.2 png_0.1-8
## vctrs_0.6.2 pkgconfig_2.0.3
## fastmap_1.1.1 backports_1.4.1
## ellipsis_0.3.2 utf8_1.2.3
## promises_1.2.0.1 rmarkdown_2.22
## tzdb_0.4.0 xfun_0.39
## cachem_1.0.8 jsonlite_1.8.5
## SnowballC_0.7.1 later_1.3.1
## xlsxjars_0.6.1 parallel_4.3.2
## stopwords_2.3 R6_2.5.1
## bslib_0.4.2 stringi_1.7.12
## car_3.1-2 jquerylib_0.1.4
## cellranger_1.1.0 nsyllable_1.0.1
## Rcpp_1.0.10 knitr_1.43
## httpuv_1.6.11 Matrix_1.6-0
## igraph_1.5.0 timechange_0.2.0
## tidyselect_1.2.0 rstudioapi_0.14
## stringdist_0.9.12 abind_1.4-5
## yaml_2.3.7 lattice_0.21-9
## plyr_1.8.8 shiny_1.7.4
## withr_3.0.0 evaluate_0.21
## rJava_1.0-6 RcppParallel_5.1.7
## pillar_1.9.0 quanteda.textstats_0.96.2
## carData_3.0-5 janeaustenr_1.0.0
## generics_0.1.3 hms_1.1.3
## munsell_0.5.0 xtable_1.8-4
## glue_1.6.2 tools_4.3.2
## data.table_1.15.4 tokenizers_0.3.0
## ggsignif_0.6.4 fastmatch_1.1-3
## grid_4.3.2 colorspace_2.1-0
## cli_3.6.1 fansi_1.0.4
## gtable_0.3.3 rstatix_0.7.2
## sass_0.4.6 digest_0.6.31
## htmltools_0.5.5 lifecycle_1.0.4
## mime_0.12
data3_Nexis is a data frame containing the monthly counts of newspaper articles
# Load structured newspaper articles (originally obtained from Nexis Uni)
Nexisdata_df <- readRDS("./ScrapedData/Nexisdata_all_raw.rds")
print(paste("Number of articles from Nexis Uni: ", nrow(Nexisdata_df)))
## [1] "Number of articles from Nexis Uni: 1361"
# Data cleaning 1: Remove duplicate headlines
Nexisdata_df0 <- Nexisdata_df[!duplicated(Nexisdata_df[c("Headline")]), ]
# Data cleaning 2: Extract main subject and region metadata, then drop irrelevant articles
Nexisdata_df0 <- Nexisdata_df0 %>%
mutate(
mainsubject = str_remove(str_extract(Subject, "^[^;]+"), "\\(.*"),
mainregion = str_remove(str_extract(Geographic, "^[^;]+"), "\\(.*")
)
Nexisdata_df0 <- Nexisdata_df0 %>%
mutate(order = row_number()) %>%
dplyr::select(order, Art_ID, mainsubject, mainregion, everything())
Nexisdata_df00 <- subset(Nexisdata_df0,
!grepl(paste(c("Sport", "SPORT"),
collapse = "|"), Section))
Nexisdata_df00 <- subset(Nexisdata_df00,
!grepl("SOCCER|CRICKET",mainsubject, ignore.case = TRUE))
Nexisdata_df00 <- subset(Nexisdata_df00,
grepl(paste(c("UNITED KINGDOM", "ENGLAND"),
collapse = "|"), mainregion, ignore.case = TRUE))
Nexisdata_df00 <- subset(Nexisdata_df00,
!grepl("NEW SOUTH WALES", mainregion, ignore.case = TRUE))
Nexisdata_df00 <- Nexisdata_df00 %>% drop_na(Date)
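As a quick sanity check, the number of articles remaining after each cleaning step can be printed; a minimal sketch (exact counts depend on the raw Nexis export):
# Track how many articles survive each cleaning step (sketch)
cat("After headline deduplication:", nrow(Nexisdata_df0), "\n")
cat("After content filters:       ", nrow(Nexisdata_df00), "\n")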
# Double-check the selected media sources
monthly_count_Nexis_summary <- Nexisdata_df00 %>%
count(Newspaper, name = "Freq") %>%
arrange(desc(Freq)) %>%
mutate(
Tabloid = case_when(
str_detect(tolower(Newspaper), "telegraph|independent|times|guardian|financial") ~ 0,
str_detect(tolower(Newspaper), "mail|mirror|sun") ~ 1,
TRUE ~ 9
),
Online = ifelse(str_detect(tolower(Newspaper), "\\.co\\.uk|\\.com|online"), 1, 0)
) %>%
filter(Tabloid != 9)
# Split the date into year, month, and day/time components
Nexisdata_df000 <- Nexisdata_df00 %>%
separate(Date, into = c("year", "month", "daytime"), sep = "-") %>%
mutate(across(c(year, month), as.numeric))
# Summarize monthly article counts
Nexisdata_df1 <- Nexisdata_df000 %>%
filter(Newspaper %in% monthly_count_Nexis_summary$Newspaper)
print(paste("Number of valid articles after filtering: ", nrow(Nexisdata_df1)))
## [1] "Number of valid articles after filtering: 836"
data3_Nexis <- Nexisdata_df1 %>% group_by(year, month) %>% summarize(count = n())
monthly_count_Nexis <- data.frame(year = rep(2000:2023, each = 12), month = rep(1:12, 24))
monthly_count_Nexis <- left_join(monthly_count_Nexis, data3_Nexis)
monthly_count_Nexis[is.na(monthly_count_Nexis)] = 0
monthly_count_Nexis[, c("year", "month")] <-
sapply(monthly_count_Nexis[, c("year", "month")], as.character)
monthly_count_Nexis <- monthly_count_Nexis %>%
mutate(ym = paste0(year, "-", month))
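The zero-padded monthly grid above can also be built more compactly with tidyr::complete; a sketch of an equivalent call (monthly_count_alt is a hypothetical name):
# Equivalent zero-padding with tidyr::complete (sketch)
monthly_count_alt <- data3_Nexis %>%
  ungroup() %>%
  complete(year = 2000:2023, month = 1:12, fill = list(count = 0)) %>%
  mutate(ym = paste0(year, "-", month))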
# Clean up workspace
rm(Nexisdata_df, Nexisdata_df0, Nexisdata_df00, Nexisdata_df000)
The explanatory variables include temperature, precipitation, groundwater, and seasonality
# Load UK Temperature Data: Mean Central England Temperature (MCET)
#https://www.metoffice.gov.uk/hadobs/hadcet/data/meantemp_monthly_totals.txt
List_UK_temp <- read.xlsx(paste0("./ScrapedData/MET/Longterm_TP.xlsx"), sheetName = "MCET")
# Calculate Anomaly Baseline Based on 1991–2020 Average
List_UK_temp_subset_ref <- List_UK_temp %>% subset(Year %in% c(1991:2020)) %>%
dplyr::select(-Year, -Annual)
UK_temp_ref_mean <- colMeans(List_UK_temp_subset_ref)
# Calculate Temperature Anomalies
List_UK_temp_subset_target <- List_UK_temp %>%
subset(Year %in% c(2000:2023)) %>%
dplyr::select(-Annual)
temp_target_anomaly <- sweep(List_UK_temp_subset_target[2:13], 2, UK_temp_ref_mean, "-")
temp_target_anomaly$Year <- List_UK_temp_subset_target$Year
temp_target_anomaly_long <- temp_target_anomaly %>%
pivot_longer(cols = -Year, names_to = "Month", values_to = "Value")
temp_target_anomaly_long$month <- rep(1:12, length(2000:2023))
temp_target_anomaly_long <- temp_target_anomaly_long %>% subset(Year %in% c(2000:2023))
# Add temperature anomaly values (rows align one-to-one by year and month)
monthly_count_Nexis$temp_anomaly <- temp_target_anomaly_long$Value
# Exclude months 9–12 of 2023
monthly_count_Nexis <- head(monthly_count_Nexis, n = nrow(monthly_count_Nexis) - 4)
# Compute rolling averages with temporal lags
monthly_count_Nexis$temp_3mean <- zoo::rollapply(monthly_count_Nexis$temp_anomaly,width = 3,
FUN = mean, fill = 0, align = "right")
monthly_count_Nexis$temp_6mean <- zoo::rollapply(monthly_count_Nexis$temp_anomaly, width = 6,
FUN = mean, fill = 0, align = "right")
monthly_count_Nexis$temp_12mean <- zoo::rollapply(monthly_count_Nexis$temp_anomaly, width = 12,
FUN = mean, fill = 0, align = "right")
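With align = "right", the rolling mean at month t averages over months t-2 to t (and analogously for the 6- and 12-month windows), so each covariate only looks backwards in time; a toy illustration (sketch):
# Toy example of a right-aligned 3-month rolling mean (sketch)
zoo::rollapply(1:6, width = 3, FUN = mean, fill = 0, align = "right")
# Returns 0 0 2 3 4 5; the first two positions are zero-padded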
# Clean Up Workspace
rm(List_UK_temp, List_UK_temp_subset_ref, List_UK_temp_subset_target,
temp_target_anomaly, temp_target_anomaly_long)
# Assign seasonality names based on month
monthly_count_Nexis <- monthly_count_Nexis %>%
mutate(season = if_else(month %in% c(9, 10, 11), "SON",
if_else(month %in% c(12, 1,2), "DJF",
if_else(month %in% c(3,4,5), "MAM",
if_else(month %in% c(6,7,8), "JJA", NA)))))
monthly_count_Nexis$season <- factor(monthly_count_Nexis$season,
levels=c("SON", "DJF", "MAM","JJA"))
# Load Station-based indices downloaded from a link below
# https://nrfa.ceh.ac.uk/data/search
data_station <- read.csv("./ScrapedData/nrfa-station-metadata-2024-10-24.csv")
data_station <- data_station[, c(1:15)]
data_station <- data_station %>%
mutate(region_name = case_when(
hydrometric.area %in% c(22, 23, 24) ~ 'Northumbria',
hydrometric.area %in% c(25, 26, 27) ~ 'Yorkshire',
hydrometric.area %in% c(29, 30, 31, 32, 33, 34, 35, 36, 37) ~ 'Anglian',
hydrometric.area %in% c(38, 39) ~ 'Thames',
hydrometric.area %in% c(40, 41, 42) ~ 'Southern',
hydrometric.area %in% c(43, 44, 52, 53) ~ 'Wessex',
hydrometric.area %in% c(45, 46, 47, 48, 49, 50, 51) ~ 'South West England',
hydrometric.area %in% c(54, 28) ~ 'Midlands',
hydrometric.area %in% c(68, 69, 70, 71, 72, 73, 74, 75, 76) ~ 'North West England',
TRUE ~ NA_character_ # This handles cases where none of the conditions are met
))
# Retrieve files and extract data
file_names_ssi <- list.files(path = "./ScrapedData/Drought_Indices",
pattern = "^nhmp-ssi-nrfa-") #813
file_names_ssi <- sub("nhmp-ssi-nrfa-([0-9]+).*", "\\1", file_names_ssi)
file_names_spi <- list.files(path = "./ScrapedData/Drought_Indices",
pattern = "^nhmp-spi-nrfa-") #916
file_names_spi <- sub("nhmp-spi-nrfa-([0-9]+).*", "\\1", file_names_spi)
data_station <- data_station %>%
mutate(match_ssi = ifelse(id %in% file_names_ssi, TRUE, FALSE),
match_spi = ifelse(id %in% file_names_spi, TRUE, FALSE))
print(paste("Number of valid stations for SSI: ", sum(data_station$match_ssi))) # count of TRUE
## [1] "Number of valid stations for SSI: 811"
print(paste("Number of valid stations for SPI: ", sum(data_station$match_spi))) # count of TRUE
## [1] "Number of valid stations for SPI: 916"
data_station <- data_station %>%
filter(!(match_ssi == FALSE & match_spi == FALSE)) %>%
filter(!is.na(region_name))
print(paste("Station ID: ", unique(data_station$measuring.authority.id)))
## [1] "Station ID: EA-NE" "Station ID: EA-Y" "Station ID: EA-EM"
## [4] "Station ID: EA-WM" "Station ID: EA-LN" "Station ID: EA-EA"
## [7] "Station ID: EA-HNL" "Station ID: EA-T" "Station ID: EA-KSL"
## [10] "Station ID: EA-SSD" "Station ID: EA-WX" "Station ID: EA-DC"
## [13] "Station ID: EA-GMMC" "Station ID: EA-CL"
# Selectively load the station data
file_names_sgi <- list.files(path = "./ScrapedData/Drought_Indices",
pattern = "^nhmp-sgi-") #41
file_names_sgi <- sub("nhmp-sgi-", "\\1", file_names_sgi)
file_names_sgi <- sub(".csv", "\\1", file_names_sgi)
station_sgi <- read.xlsx("./ScrapedData/SGI.xlsx", sheetName = "Sheet1")
station_sgi <- station_sgi %>% dplyr::filter(Country == "England")
# Specify time stamps
start_date <- as.Date("2000-01-01")
end_date <- as.Date("2023-08-31")
monthly_dates <- seq(from = start_date, to = end_date, by = "month")
year_month <- format(monthly_dates, "%Y-%m")
selected_station_ssi <- data_station %>% filter(match_ssi == TRUE) %>% pull(id)
selected_station_spi <- data_station %>% filter(match_spi == TRUE) %>% pull(id)
selected_station_sgi <- station_sgi %>% pull(Station.ID.fixed)
# Read the drought indices into a common wide format
generate_data <- function(index, col_name, year_month, selected_stations) {
result <- data.frame(ym = year_month)
for (selected_station in selected_stations) {
file_name <- paste0("./ScrapedData/Drought_Indices/nhmp-",
index, selected_station, ".csv") #SPI, SSI
if (file.exists(file_name)) {
data <- read.csv(file_name, skip = 7)
row_start <- which(data[, 1] == '2000-01')
row_end <- which(data[, 1] == '2023-08')
if (length(row_start) == 0 || length(row_end) == 0) {
next # Skip this station if row_start or row_end is not found
}
selected_data <- data %>%
slice(row_start:row_end) %>%
dplyr::select("month", all_of(col_name))
colnames(selected_data) <- c("ym", selected_station)
result <- left_join(result, selected_data, by = "ym")
}
}
return(result)
}
# Pre-process the indices
data_ssi3 <- generate_data("ssi-nrfa-", "ssi.3", year_month, selected_station_ssi)
data_ssi6 <- generate_data("ssi-nrfa-", "ssi.6", year_month, selected_station_ssi)
data_ssi12 <- generate_data("ssi-nrfa-", "ssi.12", year_month, selected_station_ssi)
data_spi3 <- generate_data("spi-nrfa-", "spi.3", year_month, selected_station_spi)
data_spi6 <- generate_data("spi-nrfa-", "spi.6", year_month, selected_station_spi)
data_spi12 <- generate_data("spi-nrfa-", "spi.12", year_month, selected_station_spi)
data_sgi <- generate_data("sgi-", "sgi", year_month, selected_station_sgi)
unique_regions <- unique(data_station$region_name)
generate_summary <- function(data, data_station, unique_regions, prefix) {
region_mean_list <- list()
# Calculate region means
for (region in unique_regions) {
id_values <- data_station %>%
filter(region_name == !!region) %>%
pull(id)
id_values_valid <- intersect(id_values, colnames(data))
subset_df <- data %>%
dplyr::select(ym, all_of(id_values_valid))
region_mean <- rowMeans(subset_df[ , -1], na.rm = TRUE)
region_mean_list[[region]] <- region_mean
}
# Create summary dataframe
summary_df <- data.frame(ym = data$ym)
for (region in names(region_mean_list)) {
summary_df[[paste0(prefix, "_", region)]] <- region_mean_list[[region]]
}
# Calculate the overall mean
summary_df[[paste0("mean_", prefix)]] <- rowMeans(summary_df[, !names(summary_df) %in% 'ym'])
return(summary_df)
}
# Generate summaries for SPI, SSI, and SGI
summary_spi3 <- generate_summary(data_spi3, data_station, unique_regions, "spi3")
summary_spi6 <- generate_summary(data_spi6, data_station, unique_regions, "spi6")
summary_spi12 <- generate_summary(data_spi12, data_station, unique_regions, "spi12")
summary_ssi3 <- generate_summary(data_ssi3, data_station, unique_regions, "ssi3")
summary_ssi6 <- generate_summary(data_ssi6, data_station, unique_regions, "ssi6")
summary_ssi12 <- generate_summary(data_ssi12, data_station, unique_regions, "ssi12")
summary_sgi <- data.frame(
ym = data_sgi$ym,
sgi = rowMeans(data_sgi[ , !names(data_sgi) %in% "ym"], na.rm = TRUE)
)
# Combine with the main dataset
monthly_count_Nexis <- bind_cols(monthly_count_Nexis,
summary_spi3 %>% dplyr::select(-ym),
summary_spi6 %>% dplyr::select(-ym),
summary_spi12 %>% dplyr::select(-ym),
summary_ssi3 %>% dplyr::select(-ym),
summary_ssi6 %>% dplyr::select(-ym),
summary_ssi12 %>% dplyr::select(-ym),
summary_sgi %>% dplyr::select(-ym)
)
# Clean Up Workspace
rm(data_station,
selected_station_ssi, selected_station_spi, selected_station_sgi,
data_ssi3, data_ssi6, data_ssi12,
data_spi3, data_spi6, data_spi12,
data_sgi,
unique_regions,
summary_ssi3, summary_ssi6, summary_ssi12,
summary_spi3, summary_spi6, summary_spi12,
summary_sgi,
file_names_spi, file_names_ssi, file_names_sgi)
selected_vars <- monthly_count_Nexis[, c("mean_ssi3", "mean_ssi6", "mean_ssi12",
"mean_spi3", "mean_spi6", "mean_spi12",
"sgi"
)]
cor_matrix <- cor(selected_vars, use = "complete.obs")
ggcorrplot(cor_matrix,
method = "circle",
type = "lower",
lab = TRUE,
lab_size = 3,
colors = c("red", "white", "blue"))
Explore all possible covariate combinations
# Prepare the variables
set.seed(123)
monthly_count_Nexis_subset <- monthly_count_Nexis %>%
dplyr::select(year, month, count, ym, season,
temp_3mean, temp_6mean, temp_12mean,
mean_spi3, mean_spi6, mean_spi12,
mean_ssi3, mean_ssi6, mean_ssi12,
sgi)
monthly_count_Nexis_subset$season <- factor(monthly_count_Nexis_subset$season,
levels = c("SON", "DJF", "MAM", "JJA"))
monthly_count_Nexis_subset <- monthly_count_Nexis_subset %>%
mutate(season_spring = ifelse(season == "MAM", 1, 0),
season_summer = ifelse(season == "JJA", 1, 0),
season_fall = ifelse(season == "SON", 1, 0),
season_winter = ifelse(season == "DJF", 1, 0))
monthly_count_Nexis_subset$lag_count <- dplyr::lag(monthly_count_Nexis_subset$count, n = 1)
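A negative binomial specification is motivated by overdispersion: the variance of the monthly counts greatly exceeds their mean, which violates the Poisson assumption; a quick check (sketch):
# Overdispersion check: variance >> mean favors negative binomial over Poisson (sketch)
with(monthly_count_Nexis_subset, c(mean = mean(count), variance = var(count)))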
nb_results <- data.frame(
CET = character(),
SPI = character(),
SGI = character(),
Seasonality = character(),
aic = numeric(),
deviance = numeric(),
theta = numeric(),
SE.theta = numeric(),
stringsAsFactors = FALSE
)
independent_vars <- names(monthly_count_Nexis_subset)[
names(monthly_count_Nexis_subset) != "count"
]
vars_temp <- grep("^temp_", independent_vars, value = TRUE)
vars_spi <- grep("^mean_spi", independent_vars, value = TRUE)
vars_ssi <- grep("^mean_ssi", independent_vars, value = TRUE)
vars_gwater <- "sgi"
vars_season <- "season"
# Run the analysis
for (var1 in c(NA, vars_temp)) {
for (var2 in c(NA, vars_spi)) {
for (var3 in c(NA, vars_gwater)) {
for (var4 in c(NA, vars_season)) {
if (is.na(var1) && is.na(var2) && is.na(var3) && is.na(var4)) {
next # Skip to the next iteration
}
# Collect the non-missing covariates for this combination
formula_parts <- na.omit(c(var1, var2, var3, var4))
formula <- as.formula(paste0("count ~ ",
paste(formula_parts, collapse = " + "),
" + lag_count"))
# Fit the Negative Binomial model
skip_to_next <- FALSE
tryCatch({
nb_model <- glm.nb(formula, data = monthly_count_Nexis_subset) },
error = function(e) {
skip_to_next <<- TRUE
})
if (skip_to_next) next
# Extract the required values
model_aic <- nb_model$aic
model_deviance <- nb_model$deviance
model_theta <- nb_model$theta
model_SE_theta <- nb_model$SE.theta
# Append results to the data frame
nb_results <- rbind(nb_results, data.frame(
CET = ifelse(is.na(var1), "-", var1),
SPI = ifelse(is.na(var2), "-", var2),
SGI = ifelse(is.na(var3), "-", var3),
Seasonality = ifelse(is.na(var4), "-", var4),
aic = model_aic,
deviance = model_deviance,
theta = model_theta,
SE.theta = model_SE_theta,
stringsAsFactors = FALSE
))
rm(model_aic, model_deviance, model_theta, model_SE_theta)
}
}
}
}
# Clean Up Workspace
rm(vars_temp, vars_spi, vars_ssi, vars_gwater, vars_season,
var1, var2, var3, var4)
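The best-performing specifications can then be identified by ranking the collected results on AIC; a minimal sketch:
# Rank the candidate models by AIC, lowest first (sketch)
nb_results %>% arrange(aic) %>% head(5)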
nb_model <- glm.nb(count ~ temp_12mean + mean_spi3 + sgi + season + lag_count,
data = monthly_count_Nexis_subset,
na.action = na.exclude,
control = glm.control(maxit = 100))
summary(nb_model)
##
## Call:
## glm.nb(formula = count ~ temp_12mean + mean_spi3 + sgi + season +
## lag_count, data = monthly_count_Nexis_subset, na.action = na.exclude,
## control = glm.control(maxit = 100), init.theta = 0.5267077673,
## link = log)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.735557 0.243955 -3.015 0.002569 **
## temp_12mean 0.394625 0.215748 1.829 0.067384 .
## mean_spi3 -0.415491 0.123040 -3.377 0.000733 ***
## sgi -0.867528 0.187490 -4.627 3.71e-06 ***
## seasonDJF 0.692755 0.319936 2.165 0.030365 *
## seasonMAM 1.064976 0.306074 3.479 0.000502 ***
## seasonJJA 1.107580 0.303781 3.646 0.000266 ***
## lag_count 0.045931 0.006465 7.105 1.20e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(0.5267) family taken to be 1)
##
## Null deviance: 477.79 on 282 degrees of freedom
## Residual deviance: 247.44 on 275 degrees of freedom
## (1 observation deleted due to missingness)
## AIC: 883.87
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 0.5267
## Std. Err.: 0.0775
##
## 2 x log-likelihood: -865.8690
nb_model_parsi <- glm.nb(count ~ mean_spi12 + season + lag_count,
data = monthly_count_Nexis_subset,
na.action = na.exclude,
control = glm.control(maxit = 100))
summary(nb_model_parsi)
##
## Call:
## glm.nb(formula = count ~ mean_spi12 + season + lag_count, data = monthly_count_Nexis_subset,
## na.action = na.exclude, control = glm.control(maxit = 100),
## init.theta = 0.502878727, link = log)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.528358 0.245573 -2.152 0.031434 *
## mean_spi12 -0.834511 0.125272 -6.662 2.71e-11 ***
## seasonDJF 0.669849 0.323615 2.070 0.038462 *
## seasonMAM 1.107087 0.311349 3.556 0.000377 ***
## seasonJJA 1.206022 0.310088 3.889 0.000101 ***
## lag_count 0.050184 0.006499 7.722 1.14e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(0.5029) family taken to be 1)
##
## Null deviance: 463.06 on 282 degrees of freedom
## Residual deviance: 246.52 on 277 degrees of freedom
## (1 observation deleted due to missingness)
## AIC: 885.06
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 0.5029
## Std. Err.: 0.0729
##
## 2 x log-likelihood: -871.0600
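The two fitted specifications can also be compared head-to-head on AIC; a minimal sketch:
# Direct AIC comparison of the full and parsimonious models (sketch)
AIC(nb_model, nb_model_parsi)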
The best AIC model
fitted_values <- round(fitted(nb_model))
monthly_count_Nexis_subset$order <- c(1:nrow(monthly_count_Nexis_subset))
plot(monthly_count_Nexis_subset$order,
monthly_count_Nexis_subset$count,
type = "l", col = "blue", lwd = 1,
ylab = " ", xlab = " ",
main = "CET-12, SPI-3, SGI-1, Seasonality, Lag",
xaxt = "n")
lines(monthly_count_Nexis_subset$order, fitted_values, col = "red", lwd = 1)
legend("topleft",
legend = c("Count", "Modeled Count"),
col = c("blue", "red"), lwd = 1)
The parsimonious model
fitted_values <- round(fitted(nb_model_parsi))
monthly_count_Nexis_subset$order <- c(1:nrow(monthly_count_Nexis_subset))
plot(monthly_count_Nexis_subset$order,
monthly_count_Nexis_subset$count,
type = "l", col = "blue", lwd = 1,
ylab = " ", xlab = " ",
main = "SPI12, Seasonality, Lag",
xaxt = "n")
lines(monthly_count_Nexis_subset$order, fitted_values, col = "red", lwd = 1)
legend("topleft",
legend = c("Count", "Modeled Count"),
col = c("blue", "red"), lwd = 1)
x_breaks <- seq(which(monthly_count_Nexis_subset$ym == "2000-1"),
length(monthly_count_Nexis_subset$ym), by = 12)
monthly_count_Nexis_subset %>%
mutate(ym = factor(ym, levels = ym)) %>%
mutate(order = 1:n()) %>%
mutate(condition_flag = case_when(
temp_12mean > 0 & sgi < 0 & mean_spi3 < 0 &
(season_spring == 1 | season_summer == 1) ~ "favorable",
TRUE ~ NA_character_
)) %>%
pivot_longer(cols = c(temp_12mean, mean_spi3, sgi),
names_to = "variable",
values_to = "value") %>%
mutate(variable = factor(variable, levels = c("temp_12mean", "mean_spi3", "sgi"))) %>%
ggplot(aes(x = order)) +
geom_rect(data = . %>% filter(condition_flag == "favorable"),
aes(xmin = order - 0.5, xmax = order + 0.5, ymin = -Inf, ymax = Inf),
fill = "#F0E442", alpha = 0.2) + #gold
geom_bar(aes(y = count / 100, fill = "dimgray"), stat = "identity", width = 1.1) +
scale_fill_manual(values = c("dimgray" = "dimgray"), guide = "none") +
geom_line(aes(y = value, color = variable, size = variable, linetype = variable)) +
scale_color_manual(values = c("temp_12mean" = "#D55E00",
"mean_spi3" = "#0072B2",
"sgi" = "#009E73"),
labels = c("temp_12mean" = "CET-12",
"mean_spi3" = "SPI-3",
"sgi" = "SGI-1"),
name = "") +
scale_size_manual(values = c("temp_12mean" = 0.5,
"mean_spi3" = 0.5,
"sgi" = 0.5),
guide = "none") +
scale_linetype_manual(values = c("temp_12mean" = "solid",
"mean_spi3" = "solid", "sgi" = "solid"),
guide = "none") +
geom_vline(xintercept = x_breaks, linetype = "dotted", color = "darkgrey", size = 0.3) +
scale_y_continuous(
name = "Index score",
sec.axis = sec_axis(~ . * 100, name = "Article count")
) +
labs(x = " ", color = " ") +
geom_hline(yintercept = 0, color = "black", linetype = "dashed", size = 0.3) +
scale_x_continuous(
expand = c(0, 0),
breaks = seq(which(monthly_count_Nexis_subset$ym == "2000-1")[1],
length(monthly_count_Nexis_subset$ym), by = 12),
labels = monthly_count_Nexis_subset$year[seq(which(monthly_count_Nexis_subset$ym == "2000-1"),
length(monthly_count_Nexis_subset$ym), by = 12)]
) +
theme_bw() +
theme(
axis.text.x = element_text(angle = 25, hjust = 1),
legend.position = c(0.03, 0.98), # top-left inside
legend.justification = c("left", "top"),
legend.background = element_rect(fill = alpha("white", 0.6))
)
# Define winsorisation function for visualisation
winsorize <- function(x, p = c(0.01, 0.99)) {
q <- quantile(x, probs = p, na.rm = TRUE)
x[x < q[1]] <- q[1]
x[x > q[2]] <- q[2]
x
}
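A quick illustration of the winsorisation on toy data (sketch):
# Toy example: extremes are clipped to the 1st and 99th percentiles (sketch)
winsorize(c(-100, 1:10, 1000))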
monthly_count_Nexis_subset$ym <- as.character(monthly_count_Nexis_subset$ym)
monthly_count_Nexis_subset$year <- as.numeric(monthly_count_Nexis_subset$year)
monthly_count_Nexis_subset$month <- as.numeric(monthly_count_Nexis_subset$month)
monthly_count_Nexis_0 <- data.frame("ym" = monthly_count_Nexis_subset$ym,
"year" = monthly_count_Nexis_subset$year,
"month" = monthly_count_Nexis_subset$month,
"count" = monthly_count_Nexis_subset$count,
"source" = "Article count")
monthly_count_Nexis_0$count_rescale2 <- rescale(winsorize(monthly_count_Nexis_0$count))
# CET-12
monthly_count_Nexis_T <- data.frame("ym" = monthly_count_Nexis_subset$ym,
"year" = monthly_count_Nexis_subset$year,
"month" = monthly_count_Nexis_subset$month,
"count" = round(monthly_count_Nexis_subset$temp_12mean,
digits = 1),
"source" = "CET-12")
# SPI-3
monthly_count_Nexis_P <- data.frame("ym" = monthly_count_Nexis_subset$ym,
"year" = monthly_count_Nexis_subset$year,
"month" = monthly_count_Nexis_subset$month,
"count" = round(monthly_count_Nexis_subset$mean_spi3,
digits = 1),
"source" = "SPI-3")
# SGI-1
monthly_count_Nexis_G <- data.frame("ym" = monthly_count_Nexis_subset$ym,
"year" = monthly_count_Nexis_subset$year,
"month" = monthly_count_Nexis_subset$month,
"count" = round(monthly_count_Nexis_subset$sgi,
digits = 1),
"source" = "SGI-1")
desired_order <- c("Article count", "CET-12", "SPI-3", "SGI-1")
monthly_count_Nexis_0$source <- factor(monthly_count_Nexis_0$source, levels = desired_order)
monthly_count_Nexis_T$source <- factor(monthly_count_Nexis_T$source, levels = desired_order)
monthly_count_Nexis_P$source <- factor(monthly_count_Nexis_P$source, levels = desired_order)
monthly_count_Nexis_G$source <- factor(monthly_count_Nexis_G$source, levels = desired_order)
Plot the variables
ggplot() +
# CET-12
geom_point(
data = monthly_count_Nexis_T,
aes(month, source, color = count), size = 6) +
scale_color_gradient2(low = "white", mid = "white", high = "#D55E00") +
geom_text(data = monthly_count_Nexis_T,
aes(month, source, label = count),
color = "black", size = 2) +
# SPI-3
new_scale_color() +
geom_point(
data = monthly_count_Nexis_P,
aes(month, source, color = count), size = 6) +
scale_color_gradient2(low = "#0072B2", mid = "white", high = "white") +
geom_text(data = monthly_count_Nexis_P,
aes(month, source, label = count),
color = "black", size = 2) +
# SGI-1
new_scale_color() +
geom_point(
data = monthly_count_Nexis_G,
aes(month, source, color = count), size = 6) +
scale_color_gradient2(low = "#009E73", mid = "white", high = "white") +
geom_text(data = monthly_count_Nexis_G,
aes(month, source, label = count),
color = "black", size = 2) +
# Article count
new_scale_fill() +
geom_tile(
data = monthly_count_Nexis_0,
aes(month, source, fill=count_rescale2) #count
) +
scale_fill_gradient(low = "white", high = "black") +
geom_text(data = monthly_count_Nexis_0,
aes(month, source, label = count),
color = ifelse(monthly_count_Nexis_0$count > 30,
"white", "black"), size = 2) +
facet_wrap(~year) +
theme_minimal() +
labs(x = " ", y = " ", fill = " ") +
scale_x_continuous(breaks = 1:12, labels = month.abb,
position = "top",
expand = c(0,0)) +
scale_y_discrete(limits = rev(desired_order)) +
theme(legend.position = "none",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
strip.placement = "outside",
panel.spacing.y = unit(0.5, "lines"),
panel.spacing.x = unit(0.5, "lines"),
strip.text.x = element_text(size = 8, face = "bold"),
axis.text.x = element_text(size = 6, face = "bold.italic")
)
rm(monthly_count_Nexis_0, monthly_count_Nexis_T,
monthly_count_Nexis_P, monthly_count_Nexis_G)
Topic modeling follows the STM tutorial by Roberts et al., 2019 (DOI: https://doi.org/10.18637/jss.v091.i02)
Subset data for target year
reference <- data.frame("year" = rep(c(2012, 2022), each = 6),
"month" = c(1,2,3,4,5,6, # for 2012
7,8,9,10,11,12)) # for 2022
datarange <- list("2012" = "2012, Jan to Jun",
"2022" = "2022, July to Dec")
data_input <- Nexisdata_df1
Initialize the NLP model
#spacy_download_langmodel(model = "en_core_web_trf")
spacy_initialize(model = "en_core_web_trf")
Proceed with NLP
spacy_0 <- spacy_parse(paste0(data_input$Headline, ". ", data_input$Sentence),
dependency = TRUE, nounphrase = TRUE)
data_entity <- entity_extract(spacy_0, type = "named", concatenator = "_")
data_entity$ngram <- sapply(strsplit(data_entity$entity, "_"), length)
data_entity_consolidate <- udpipe::txt_recode_ngram(spacy_0$token,
compound = data_entity$entity,
ngram = data_entity$ngram,
sep = "_") %>% as.data.frame()
colnames(data_entity_consolidate) <- "entity_result"
spacy_1 <- cbind(spacy_0, data_entity_consolidate)
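txt_recode_ngram collapses each multi-token entity into a single compound token and sets the remaining positions to NA, which is why rows with NA are dropped further below; a toy illustration (sketch):
# Toy example (hypothetical tokens): the compound replaces the first token
# and the following positions of the n-gram become NA (sketch)
udpipe::txt_recode_ngram(c("New", "York", "is", "dry"),
                         compound = "New_York", ngram = 2, sep = "_")
# Expected: "New_York" NA "is" "dry"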
spacy_1$entity[spacy_1$entity %in%
c("DATE_B", "DATE_I", "LANGUAGE_B", "LANGUAGE_I", "QUANTITY_I",
"QUANTITY_B", "CARDINAL_I", "CARDINAL_B", "ORDINAL_B",
"ORDINAL_I", "TIME_B", "TIME_I", "MONEY_I", "MONEY_B",
"PERCENT_B", "PERCENT_I")] <- ""
spacy_1$final <- with(spacy_1, ifelse(entity == "", lemma, entity_result))
spacy_1 <- spacy_1 %>% drop_na()
spacy_1$docsen <- paste0(spacy_1$doc_id, "_", spacy_1$sentence_id)
sw <- quanteda::stopwords("english") %>% data.frame()
colnames(sw) <- "final"
spacy_2 <- spacy_1 %>% dplyr::anti_join(sw, by = "final")
spacy_2 <- filter(spacy_2, spacy_2$pos != "PUNCT") # remove punctuation
spacy_2 <- filter(spacy_2, nchar(spacy_2$final) != 1) # remove single-character tokens
spacy_2 <- spacy_2[!(spacy_2$final %in%
c("I", "’", "-", "•", "'", "/", "’s", "'s", "’re", "’ve", "n't")),]
# Save the processed data frames
saveRDS(spacy_1, paste0("./ProcessedData/N_drought+england_ALL_spacy1_2024.rds"))
saveRDS(spacy_2, paste0("./ProcessedData/N_drought+england_ALL_spacy2_2024.rds"))
spacy_2 <- readRDS(paste0("./ProcessedData/N_drought+england_ALL_spacy2_2024.rds"))
x <- spacy_2 %>%
filter(pos %in% c("NOUN")) %>%
group_by(doc_id) %>%
mutate(text_string = paste(final, collapse = " ")) %>%
dplyr::select(doc_id, text_string) %>% unique()
temp <- data_input %>% dplyr::select(Art_ID, year, month, Newspaper)
temp <- temp %>%
mutate(source = ifelse(str_detect(tolower(Newspaper),
'telegraph|independent|times|guardian|financial'),
"broadsheet",
ifelse(str_detect(tolower(Newspaper),
'mail|mirror|sun'), "tabloid", "NA")))
temp$doc_id <- paste0("text", c(1:nrow(temp)))
x <- left_join(x, temp, by = "doc_id")
x_token <- x$text_string %>% tokens(what = "fastestword")
x_dfm <- dfm(x_token, tolower = F)
x_stm <- convert(x_dfm, to = "stm")
rm(temp)
Try different values of K and find the optimal number of topics
K1 <- c(4, 6, 8, 10, 12, 15, 20, 30, 50)
set.seed(123)
x_fit1 <- searchK(x_stm$documents, x_stm$vocab, K = K1, verbose = F)
x_plot1 <- data.frame("K" = K1,
"Coherence" = unlist(x_fit1$results$semcoh),
"Exclusivity" = unlist(x_fit1$results$exclus),
"heldout" = unlist(x_fit1$results$heldout),
"residual" = unlist(x_fit1$results$residual))
x_plot1 <- reshape2::melt(x_plot1, id = c("K"))
ggplot(x_plot1, aes(K, value, color = variable, group = 1)) +
geom_point(show.legend = FALSE) +
geom_line(show.legend = FALSE) +
scale_x_continuous(breaks = unique(x_plot1$K)) +
facet_wrap(~variable, scales = "free_y") +
theme_bw() +
theme(panel.grid.minor.x = element_blank()) + # Remove secondary vertical lines
labs(x = "Number of topics K",
title = "Statistical fit of models with different K")
optimalK <- 15
set.seed(123)
x_model <- stm(documents = x_stm$documents,
vocab = x_stm$vocab,
K = as.numeric(optimalK),
data = x,
verbose = F)
x_model
## A topic model with 15 topics, 836 documents and a 5588 word dictionary.
Based on the result, we rearrange the topic labels for readability (e.g., Topic 5 → Topic 2)
plot.STM(x_model, type = "summary")
rename_rules <- c(
"V1" = "#1",
"V5" = "#2",
"V14" = "#3",
"V9" = "#4",
"V13" = "#5",
"V6" = "#6",
"V2" = "#7",
"V4" = "#8",
"V7" = "#9",
"V15" = "#10",
"V8" = "#11",
"V10" = "#12",
"V11" = "#13",
"V12" = "#14",
"V3" = "#15"
)
Extract 50 keywords per topic
x_topic <- labelTopics(x_model, n = 50)
x_topic_prob <- data.frame("features" = t(x_topic$prob))
colnames(x_topic_prob) <- paste0("V", c(1:as.numeric(optimalK)))
x_topic_frex <- data.frame("features" = t(x_topic$frex))
colnames(x_topic_frex) <- paste0("V", c(1:as.numeric(optimalK)))
Print 10 keywords based on probability
x_topic_prob <- x_topic_prob %>%
rename_with(~ recode(., !!!rename_rules))
topic_order <- order(as.integer(str_extract(names(x_topic_prob), "\\d+")))
x_topic_prob <- x_topic_prob[, topic_order]
x_topic_prob %>% head(10)
## #1 #2 #3 #4 #5 #6 #7
## 1 drought year water rain water temperature farmer
## 2 water cent company flood ban day crop
## 3 ban per supply flooding hosepipe week drought
## 4 hosepipe water year drought plant heatwave food
## 5 company rainfall leak weather garden weather price
## 6 level reservoir drought week order water year
## 7 area month reservoir water drought fire weather
## 8 rainfall level leakage warning car heat potato
## 9 river rain plan downpour people condition condition
## 10 part drought government area company area vegetable
## #8 #9 #10 #11 #12 #13 #14
## 1 water garden climate water water year year
## 2 river year change ban summer cent water
## 3 drought plant year customer week per beaver
## 4 wildlife drought weather hosepipe drought price drought
## 5 year flower drought company day market people
## 6 fish gardener scientist drought rain time project
## 7 level thing pattern temperature weather school infrastructure
## 8 wetland goal summer area people sale population
## 9 environment grass world day temperature month reservoir
## 10 bird day warming supply heat beach country
## #15
## 1 tree
## 2 drought
## 3 year
## 4 leave
## 5 summer
## 6 plant
## 7 water
## 8 autumn
## 9 flower
## 10 robinia
Print 10 keywords based on FREX score
x_topic_frex <- x_topic_frex %>%
rename_with(~ recode(., !!!rename_rules))
topic_order <- order(as.integer(str_extract(names(x_topic_frex), "\\d+")))
x_topic_frex <- x_topic_frex[, topic_order]
x_topic_frex %>% head(10)
## #1 #2 #3 #4 #5 #6
## 1 restriction per leak flooding drip thunderstorm
## 2 status cent target flood order temperature
## 3 groundwater average leakage downpour butt barbecue
## 4 ban reservoir meter flash sprinkler heatwave
## 5 hosepipe mm bill wind lawn fire
## 6 drought capacity metering subsidence watering firefighter
## 7 environment rainfall regulator tornado hose blaze
## 8 agency figure plan claim can wildfire
## 9 area month industry road car crew
## 10 level fear consumption sandbag swimming high
## #7 #8 #9 #10 #11 #12 #13
## 1 potato trout prairie scientist customer ladybird jellyfish
## 2 wheat vole goal climate leakage standpipe fever
## 3 harvest wetland dahlia pattern bottle street market
## 4 livestock fish trophy change property championship share
## 5 barley salmon game warming bin memory pollen
## 6 grower stream show study litre bath sufferer
## 7 crop wildlife rose emission health swarm investor
## 8 yield lapwing defender extreme wildfire trial school
## 9 vegetable chalk euphorbia strawberry fire clock analyst
## 10 carrot oxygen verbascum world amber umpire beach
## #14 #15
## 1 beaver robinia
## 2 cave bluebell
## 3 fund tree
## 4 bog pseudoacacia
## 5 dam ivy
## 6 peat leave
## 7 project leaf
## 8 embed.component blossom
## 9 developer blackberry
## 10 debt sapling
# Assign a single topic to each article based on the maximum theta value
theta <- x_model[["theta"]] %>% as.data.frame()
theta$max <- apply(theta, 1, max, na.rm=TRUE)
theta$temp_topic <- colnames(theta)[apply(theta,1,which.max)]
theta$final_topic <- colnames(theta)[apply(theta[, c(1:as.numeric(optimalK))],1,which.max)]
theta$year <- x$year
theta$month <- x$month
theta$Art_ID <- x$Art_ID
theta$source <- x$source
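Each row of theta is a document's distribution over topics and should sum to one; a quick consistency check (sketch):
# Rows of the document-topic matrix should each sum to 1 (sketch)
summary(rowSums(x_model$theta))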
# Reorder topic numbers according to plot.STM(x_model, type = "summary")
theta <- theta %>%
mutate(final_topic = case_when(
final_topic == "V1" ~ "V1",
final_topic == "V5" ~ "V2",
final_topic == "V14" ~ "V3",
final_topic == "V9" ~ "V4",
final_topic == "V13" ~ "V5",
final_topic == "V6" ~ "V6",
final_topic == "V2" ~ "V7",
final_topic == "V4" ~ "V8",
final_topic == "V7" ~ "V9", #n/a
final_topic == "V15" ~ "V10",
final_topic == "V8" ~ "V11",
final_topic == "V10" ~ "V12", #n/a
final_topic == "V11" ~ "V13", #n/a
final_topic == "V12" ~ "V14",
final_topic == "V3" ~ "V15",
TRUE ~ final_topic # Keep other values unchanged
))
topic_relabel <- paste0("V", c(1:15))
temp_order <- make.dt(x_model) # Create the topic-document matrix
temp_order <- summarize_all(temp_order, mean)
temp_order <- temp_order[, -c(1)]
temp <- data.frame(values = as.numeric(temp_order[1, ]), labels = topic_relabel)
temp_order <- aggregate(values ~ labels, data = temp, FUN = sum)
print(temp_order)
## labels values
## 1 V1 0.23758342
## 2 V10 0.02539055
## 3 V11 0.02469192
## 4 V12 0.02331535
## 5 V13 0.07729065
## 6 V14 0.10570514
## 7 V15 0.03347381
## 8 V2 0.05676462
## 9 V3 0.02297600
## 10 V4 0.05086555
## 11 V5 0.11672582
## 12 V6 0.07460046
## 13 V7 0.03408245
## 14 V8 0.03058337
## 15 V9 0.08595087
temp_order$labels <- gsub("^V", "", temp_order$labels)
temp_order <- temp_order %>% arrange(desc(values))
temp_order <- unlist(temp_order$labels)
temp_order <- as.numeric(temp_order)
print(temp_order)
## [1] 1 5 14 9 13 6 2 4 7 15 8 10 11 12 3
# Define the topic labels
topic_label_new <- c()
for (k1 in temp_order) {
temp <- paste0(which(temp_order == k1), ". ", x_topic_prob[1,k1],
", ", x_topic_prob[2,k1],
", ", x_topic_prob[3,k1])
topic_label_new <- c(topic_label_new, temp)
rm(temp, k1)
}
print(topic_label_new)
## [1] "1. drought, water, ban" "2. water, ban, hosepipe"
## [3] "3. year, water, beaver" "4. garden, year, plant"
## [5] "5. year, cent, per" "6. temperature, day, week"
## [7] "7. year, cent, per" "8. rain, flood, flooding"
## [9] "9. farmer, crop, drought" "10. tree, drought, year"
## [11] "11. water, river, drought" "12. climate, change, year"
## [13] "13. water, ban, customer" "14. water, summer, week"
## [15] "15. water, company, supply"
topic_level <- paste0("V", c(1:15))
topic_label_new1 <- c("#1.Drought and hosepipe ban", "#2.Water shortage",
"#3.Water management", "#4.Flooding",
"#5.Water use impacts", "#6.Heatwave",
"#7.Agriculture", "#8.Aquatic ecosystems",
"#9.n/a", "#10.Climate change",
"#11.Temperature and hosepipe ban", "#12.n/a",
"#13.n/a", "#14.Beavers", "#15.Terrestrial ecosystems")
topic_label_new <- paste0("#", c(1:15))
library(rcartocolor)
library(dichromat)
library(ggplot2)
library(scales)
library(gridExtra)
set.seed(123)
# Define the colors (starting from the CARTO "Safe" palette, then customized manually)
colorpalette <- carto_pal(12, "Safe")
colorpalette <- c("#88CCEE", "#CC6677","#117733",
"#AA4499", "#44AA99", "#882255",
"#DDCC77", "#6699CC", "white","#999933",
"#332288", "white","white", "#661100", "#888888")
colorpalette
## [1] "#88CCEE" "#CC6677" "#117733" "#AA4499" "#44AA99" "#882255" "#DDCC77"
## [8] "#6699CC" "white" "#999933" "#332288" "white" "white" "#661100"
## [15] "#888888"
shuffled_palette <- sample(colorpalette)
simulated_protan <- dichromat(shuffled_palette, type = "protan") # Red-green blindness
simulated_deutan <- dichromat(shuffled_palette, type = "deutan") # Green blindness
simulated_tritan <- dichromat(shuffled_palette, type = "tritan") # Blue-yellow blindness
display_palette <- function(palette, title) {
data <- data.frame(
x = 1:length(palette),
color = palette
)
ggplot(data, aes(x = x, y = 1, fill = color)) +
geom_tile() +
scale_fill_identity() +
labs(title = title) +
theme_void() +
theme(legend.position = "none")
}
original_plot <- display_palette(colorpalette, "Original Palette")
shuffled_plot <- display_palette(shuffled_palette, "Shuffled Palette")
protan_plot <- display_palette(simulated_protan, "Protanopia Simulation")
deutan_plot <- display_palette(simulated_deutan, "Deuteranopia Simulation")
tritan_plot <- display_palette(simulated_tritan, "Tritanopia Simulation")
grid.arrange(original_plot, shuffled_plot, protan_plot, deutan_plot, tritan_plot, ncol = 5)
rm(shuffled_palette, simulated_protan, simulated_deutan, simulated_tritan,
original_plot, shuffled_plot, protan_plot, deutan_plot, tritan_plot)
Generate a summary data frame
theta_summary3 <- theta %>%
complete(year, month, final_topic, fill = list(z = NA)) %>%
group_by(year, month, final_topic, .drop = F) %>%
summarize(count=sum(!is.na(Art_ID)), .groups = "drop") %>%
group_by(month) %>%
mutate(ratio = count/sum(count)) %>%
drop_na() %>%
mutate(ym = paste0(year, "-", month))
theta_summary3$month <- factor(theta_summary3$month)
theta_summary3 <- theta_summary3 %>% as.data.frame()
count_bymonth3 <- aggregate(count ~ ym, data = theta_summary3, FUN = sum) %>%
dplyr::select(count) %>% as.vector()
Plot by count
ggplot(theta_summary3,
aes(x = month, y = count, fill = factor(final_topic, levels = topic_level))) +
geom_bar(stat = "identity", position = position_stack(reverse = T)) +
theme_bw() +
xlab(" ") +
ylab("count")+
facet_wrap(~year, ncol=5) +
scale_fill_manual(label = topic_label_new1, values = colorpalette, na.value = "gray70") +
theme(legend.position = "right", legend.title = element_blank(),
plot.title = element_text(size=12, face = "bold"),
plot.subtitle =element_text(size=10, face = "italic"))
Set a target year
y <- 2012
y0 <- "2012"
theta_summary2012 <- theta %>%
dplyr::filter(year == y) %>%
dplyr::filter(month %in% reference$month[reference$year == y]) %>%
complete(month, final_topic, fill = list(z = NA)) %>%
group_by(month, final_topic, .drop = F) %>%
summarize(count=sum(!is.na(Art_ID)), .groups = "drop") %>%
group_by(month) %>%
mutate(ratio = count/sum(count)) %>%
drop_na() %>%
as.data.frame()
count_bymonth <- aggregate(count ~ month, data = theta_summary2012, FUN = sum) %>%
dplyr::select(count) %>% as.vector()
Plot by count – treemap
temp_palette <- data.frame(final_topic = topic_level,
colors = colorpalette,
topic_label0 = topic_label_new)
theta_summary2012 <- left_join(theta_summary2012, temp_palette, by = "final_topic")
theta_summary2012 <- theta_summary2012 %>%
mutate(month_abb = month.abb[month])
# png(paste0("./IMG/treemap_allnoun_topic", optimalK, "_", datarange[y0],
# ".png"), width = 500, height = 700)
treemap(theta_summary2012,
index = c("month_abb", "topic_label0"),
vSize = "count",
type = "color",
vColor = "colors",
fontface.labels=c(2,1),
bg.labels=c("transparent"),
align.labels=list(
c("left", "top"),
c("center", "center")
),
fontsize.labels=c(30,18),
fontsize.title = 20,
border.col=c("black","white"),
force.print.labels = TRUE,
title = paste0(datarange[y0])
)
# dev.off()
Set a target year
y <- 2022
y0 <- "2022"
theta_summary2022 <- theta %>%
dplyr::filter(year == y) %>%
dplyr::filter(month %in% reference$month[reference$year == y]) %>%
complete(month, final_topic, fill = list(z = NA)) %>%
group_by(month, final_topic, .drop = F) %>%
summarize(count=sum(!is.na(Art_ID)), .groups = "drop") %>%
group_by(month) %>%
mutate(ratio = count/sum(count)) %>%
drop_na() %>%
as.data.frame()
count_bymonth <- aggregate(count ~ month, data = theta_summary2022, FUN = sum) %>%
dplyr::select(count) %>% as.vector()
Plot by count – treemap
temp_palette <- data.frame(final_topic = topic_level,
colors = colorpalette,
topic_label0 = topic_label_new)
theta_summary2022 <- left_join(theta_summary2022, temp_palette, by = "final_topic")
theta_summary2022 <- theta_summary2022 %>%
mutate(month_abb = month.abb[month])
# png(paste0("./IMG/treemap_allnoun_topic", optimalK, "_", datarange[y0],
# ".png"), width = 500, height = 700)
treemap(theta_summary2022,
index = c("month_abb", "topic_label0"),
vSize = "count",
type = "color",
vColor = "colors",
fontface.labels=c(2,1),
bg.labels=c("transparent"),
align.labels=list(
c("left", "top"),
c("center", "center")
),
fontsize.labels=c(30,18),
fontsize.title = 20,
border.col=c("black","white"),
force.print.labels = TRUE,
title = paste0(datarange[y0])
)
# dev.off()
# png("./IMG/legend.png", width = 500, height = 500) # Save to a file
plot.new()
legend("center", legend = topic_label_new1, fill = colorpalette, cex = 1.7, bty = "n") #, title = "Legend"
# dev.off()