
# ************************************************************************************
# Figure 1 - "Using R in Hydrology"
#
# This code extracts the first publication date of each package from the CRAN archive 
# (https://cran.r-project.org/src/contrib/Archive/) 
# and scrapes the html table of packages to extract the date of last publication 
# (https://cran.rstudio.com/web/packages/available_packages_by_date.html).
# The code then filters the hydrology-related packages and plots the number of published packages over time.
# ====================================================================================

# required packages
library(tidyverse)
library(rvest)
library(lubridate)
library(cowplot)

# Archived packages url
archived_pckgs <- "https://cran.r-project.org/src/contrib/Archive/"

# CRAN published table of packages (the table includes date of last update and package description)
published_pckgs <- "https://cran.rstudio.com/web/packages/available_packages_by_date.html"

# define a function that takes package url to get the html table
get_html_table <- function(url) {
  read_html(url) %>%
    html_nodes(xpath = "/html/body/table") %>%
    .[[1]] %>%
    html_table(fill = T)
}

# ************************************************************************************
# Scrape the date when each package was archived for the first time
# ====================================================================================

# Extract archived package names
ap_names <- get_html_table(archived_pckgs)

# Clean the package name
ap_names2 <- ap_names$Name %>%
  str_remove(., "/")
ap_names2 <- ap_names2[!ap_names2 %in% c("Parent Directory", "", "README")] 

# Create an empty data frame
pkg_1st_release_date <- data_frame()

# Set the start time (to track total elapsed time)
stime <- proc.time()

for (p in 1:length(ap_names2)) {
  tryCatch({ 

  # get the package name
  pckg_name <- ap_names2[p]

  # Make url for that package
  Pname_url <- paste0(archived_pckgs, pckg_name, "/")

  # Extract the metadata (contained in table)
  arch_table <- get_html_table(Pname_url)

  # Extract the date of the first archive
  rd <- arch_table["Last modified"] %>%
    rename(last_modified = "Last modified") %>%
    as_data_frame() %>%
    filter(last_modified != "") %>%
    mutate(last_modified = ymd_hm(last_modified)) %>%
    slice(which.min(last_modified))

  # Create data frame
  to_df <- data_frame(Package = pckg_name, release_date = pull(rd))

  # Append information for this package to the initial dataframe (pkg_1st_release_date)
  pkg_1st_release_date <- bind_rows(pkg_1st_release_date, to_df)

  # Print package name and the elapsed time
  et <- proc.time() - stime
  print(cat(paste0(p,": package: ",pckg_name,"\n",
                   "elapsed: ", round(et[3]/60,2), " min")))
  
  }, error=function(e){cat("ERROR :",conditionMessage(e), "\n")})
}

# Check how many packages did not download
npackages <- length(ap_names2)- nrow(pkg_1st_release_date) 
print(paste(npackages, "packages did not download"))

# Save a backup of the data just in case
todaysdate <- Sys.Date()
write_csv(pkg_1st_release_date, paste0("archived_pckgs_",todaysdate,".csv"))

# ************************************************************************************
#  Scape the "date of last publication" (last update) for each package currently in CRAN
# ====================================================================================

# Read the CRAN package table from the html page and save backup
pkg_last_update <- get_html_table(published_pckgs)
write_csv(pkg_last_update, paste0("published_pckgs_",todaysdate,".csv")) 

# Clean and tidy up
pkg_last_update <- pkg_last_update %>%
  as_tibble() %>%
  mutate(publication_date = ymd(Date)) %>%
  select(publication_date, Package, Title)

# ************************************************************************************
#  Join both dataframes
# ====================================================================================

# Join archived packages to those currently in CRAN
# pkg_1st_release_date <- read_csv("archived_pckgs.csv")
joined_df <- pkg_last_update %>%
  left_join(pkg_1st_release_date)

# Some current active packages are not in the archive list
# (packages that haven't been updated yet) in which case
# their date of release corresponds to the date of publication
# that appears in table of "Packages By Date of Publication"

output_df <- joined_df %>% mutate(
  release_date = date(release_date), # this removes hours and minutes in the 'date of release'
  release_date = as_date(ifelse(is.na(release_date),
    publication_date, release_date
  ))
)

# Number of packages per year by date of *first release* and by date of *last publication*
summary_df <- output_df %>%
  mutate(
    release_yr = year(release_date),
    updt_yr = year(publication_date)
  ) %>%
  gather(by, yr, c("release_yr", "updt_yr")) %>%
  group_by(by, yr) %>%
  summarise(count = n()) 

# bar colours
colour_bar <- c("grey", "black")

# Panel a
panel_a <-
  ggplot(summary_df, aes(x = yr, y = count, fill = by)) +
  geom_bar(stat = "identity", position = position_dodge(preserve = "single"), color = "black") +
  scale_x_continuous(breaks = seq(1996, 2019, 2)) +
  scale_y_continuous(breaks = seq(0, 5000, 1000)) +
  coord_cartesian(ylim=c(0,5000), expand=c(0,0))+
  scale_fill_manual(
    values = colour_bar,
    labels = c("By date of first release ", "By date of last publication")
  ) +
  geom_text(aes(
    label = "All packages",
    x = 2010, y = 5000 * 0.9
  ),
  size = 5, fontface = "bold", color = "black", hjust = 0
  ) +
  ylab("Number of Packages") +
  xlab("Year") +
  guides(fill = guide_legend(title = "")) +
  theme_bw() +
  theme(legend.position = c(0.2, 0.85))

# ************************************************************************************
#  Subset the hydrology packages
# ====================================================================================

# Filter packages with 'hydro' as key word in the name and package description
hydro_pkgs <- output_df %>%
  mutate(pkg_descr = paste(Package, Title, sep = "/")) %>%
  filter(str_detect(pkg_descr, regex("hydro", ignore_case = T)))

# Discard incorrect packages
to_omit <- c("MLML2R", "ahnr", "zFactor")
hydro_pkgs <- hydro_pkgs %>% filter(!Package %in% to_omit)

# Number of hydro packages per year by date of first release and by date of last update
summary_hydro <- hydro_pkgs %>%
  mutate(
    release_yr = year(release_date),
    updt_yr = year(publication_date)
  ) %>%
  gather(by, yr, c("release_yr", "updt_yr")) %>%
  group_by(by, yr) %>%
  summarise(count = n()) 

# Panel b
panel_b <-
  ggplot(summary_hydro, aes(x = yr, y = count, fill = by)) +
  geom_bar(stat = "identity", position = position_dodge(preserve = "single"), color = "black") +
  scale_x_continuous(breaks = seq(2008, 2019, 1)) +
  scale_y_continuous(breaks = seq(0, 15, 5)) +
  coord_cartesian(ylim=c(0,15), expand=c(0,0))+
  scale_fill_manual(
    values = colour_bar,
    labels = c("By date of 1st release ", "By date of publication")
  ) +
  geom_text(aes(
    label = "Hydrology packages",
    x = 2013, y = 15* 0.9
  ),
  size = 5, fontface = "bold", color = "black", hjust = 0
  ) +
  ylab("") + #Number of Packages
  xlab("Year") +
  guides(fill = F) +
  theme_bw()

# ************************************************************************************
# Produce a composite figure with 2 panels
# ====================================================================================

# Create the figure
png(filename = paste0("./Figure_1_",todaysdate,".png"), 
    width = 30, height = 10, units = "cm", res = 800)
fig1 <- plot_grid(panel_a, panel_b, ncol = 2)
print(fig1)
dev.off()

# ************************************************************************************
# ************************************************************************************