# ------------------------------------------------------------------------------
# PF Analysis Pipeline - Isotopic Depth Clustering & Preferential Flow Detection
# Author: Jonas Pyschik
# Email: jonas.pyschik@hydrology.uni-freiburg.de
# Description:
#   This script processes isotopic depth profiles (δ18O and δ2H) from soil cores,
#   applies clustering, builds seasonal reference profiles using LOESS smoothing,
#   and identifies deviations as potential indicators of preferential flow (PF).
# ------------------------------------------------------------------------------

# ------------------------------------------------------------------------------
# 1. Load Required Libraries
# ------------------------------------------------------------------------------
library(tidyverse)    # Data wrangling and visualization
library(caret)        # Cross-validation (LOESS smoothing)
library(cluster)      # Silhouette scoring
library(cowplot)      # Plotting multiple ggplot panels

# ------------------------------------------------------------------------------
# 2. Set Input and Output Paths
# ------------------------------------------------------------------------------
data_path <- ""
output_path <- ""

input_csv <- file.path(data_path, "231220_ALL_calibted.csv") # Soil water isotope data
cutoff_csv <- file.path(data_path, "2501_Core_cutoff_jpyschik.csv") # Drill Core Cutoff depths

output_clustered_csv <- file.path(output_path, "PF_iso.all.clustered.csv")
output_final_csv <- file.path(output_path, "PF_Iso_Identified.csv")

dir.create(output_path, showWarnings = FALSE, recursive = TRUE)

# ------------------------------------------------------------------------------
# 3. Load and Label Isotope Data with Final Group Names
# ------------------------------------------------------------------------------
iso <- read.csv(input_csv)

iso <- iso %>%
  mutate(MD = substr(Date, 1, 7)) %>%
  select(Site, ID, Depth, calib.18O, calib.2H, MD) %>%
  mutate(SampleGroup = case_when(
    Site == "EG" & MD %in% c("2022-07", "2022-08", "2022-09") ~ "OM-summer",
    Site == "EG" & MD == "2023-03" ~ "OM-winter",
    Site == "FR" & MD %in% c("2022-06", "2022-07") ~ "BF-spring",
    Site == "FR" & MD == "2022-09" ~ "BF-summer",
    Site == "IN" & MD %in% c("2022-10", "2022-11") ~ "TA-fall",
    Site == "SL" & MD == "2022-07" ~ "SL-summer",
    Site == "SL" & MD == "2022-11" ~ "SL-fall",
    TRUE ~ NA_character_
  )) %>%
  drop_na(SampleGroup)  # Remove records without group assignment

groups <- unique(iso$SampleGroup)

# ------------------------------------------------------------------------------
# 4. Cluster Profiles Within Each Group Based on Polynomial Fit
# ------------------------------------------------------------------------------
cluster_profiles <- function(group_id, data) {
  df <- data %>% filter(SampleGroup == group_id)
  
  # Regress δ2H on δ18O to align scales
  reg <- lm(calib.2H ~ calib.18O, data = df)
  
  df <- df %>%
    mutate(d18O.regressed = calib.18O * coef(reg)[2] + coef(reg)[1]) %>%
    group_by(ID) %>%
    filter(n() >= 4) %>%
    ungroup() %>%
    group_by(Depth) %>%
    filter(n() >= 4) %>%
    ungroup() %>%
    filter(Depth <= 200)
  
  profiles <- unique(df$ID)
  
  # Fit 3rd-order polynomial to each profile
  poly_data <- map_dfr(profiles, function(pid) {
    long_df <- df %>%
      filter(ID == pid) %>%
      pivot_longer(c(calib.2H, d18O.regressed), names_to = "Iso", values_to = "Values")
    fit <- lm(Values ~ poly(Depth, 3, raw = TRUE), data = long_df)
    tibble(ID = pid,
           fitALL_intercept = coef(fit)[1],
           fitALL_1 = coef(fit)[2],
           fitALL_2 = coef(fit)[3],
           fitALL_3 = coef(fit)[4])
  })
  
  # Silhouette scoring to find optimal cluster number
  sil_scores <- map_dfr(3:10, function(k) {
    clust <- kmeans(scale(poly_data[, -1]), centers = k, nstart = 25)
    sil <- silhouette(clust$cluster, dist(scale(poly_data[, -1])))
    tibble(k = k, silhouette = mean(sil[, 3]))
  })
  
  best_k <- sil_scores %>% filter(silhouette == max(silhouette)) %>% pull(k)
  final_cluster <- kmeans(scale(poly_data[, -1]), centers = best_k, nstart = 25)
  
  clusters <- tibble(ID = poly_data$ID, clu.all = final_cluster$cluster)
  
  df %>%
    left_join(clusters, by = "ID") %>%
    mutate(SampleGroup = group_id)
}

# ------------------------------------------------------------------------------
# 5. Apply Clustering to All Groups
# ------------------------------------------------------------------------------
iso_clustered <- bind_rows(lapply(groups, cluster_profiles, data = iso))
write.csv(iso_clustered, output_clustered_csv, row.names = FALSE)

# ------------------------------------------------------------------------------
# 6. Define Cluster to Use for Reference Profile Modeling (Manually Selected)
# ------------------------------------------------------------------------------
group_clusters <- tibble(
  Groups = c("OM-summer", "OM-winter", "BF-spring", "BF-summer", "TA-fall", "SL-summer", "SL-fall"),
  cluster = c(3, 2, 2, 3, 1, 2, 1)  # Based on prior visual inspection
)

# ------------------------------------------------------------------------------
# 7. Build Reference Profiles and Identify PF Candidates
# ------------------------------------------------------------------------------
build_reference <- function(gid) {
  selected_cluster <- group_clusters %>% filter(Groups == gid) %>% pull(cluster)
  
  cluster_data <- iso_clustered %>%
    filter(SampleGroup == gid, clu.all == selected_cluster)
  
  # SD of profile values per depth
  depth_sd <- cluster_data %>%
    group_by(Depth) %>%
    summarise(sd.Depth = mean(sd(poly.all), na.rm = TRUE), .groups = "drop")
  
  # Fit LOESS model with cross-validated span
  ctrl <- trainControl(method = "cv", number = 10)
  loess_cv <- train(poly.all ~ Depth, data = cluster_data, method = "gamLoess", trControl = ctrl)
  loess_model <- loess(poly.all ~ Depth, data = cluster_data, span = loess_cv$bestTune$span)
  
  # Predict LOESS reference profile
  full_data <- iso_clustered %>%
    filter(SampleGroup == gid) %>%
    mutate(Reference = predict(loess_model, newdata = .)) %>%
    left_join(depth_sd, by = "Depth")
  
  # Topsoil signal range (10–20 cm)
  iso.top <- full_data %>%
    filter(Depth %in% c(10, 20)) %>%
    summarise(min.ref = min(Reference - 2 * sd.Depth, na.rm = TRUE),
              max.ref = max(Reference + 2 * sd.Depth, na.rm = TRUE))
  
  # Detect outliers and PF indicators
  full_data %>%
    mutate(
      d18O.outlier = !(d18O.regressed >= Reference - 2*sd.Depth & d18O.regressed <= Reference + 2*sd.Depth),
      d2H.outlier = !(calib.2H >= Reference - 2*sd.Depth & calib.2H <= Reference + 2*sd.Depth),
      d18O.top.sig = d18O.regressed >= iso.top$min.ref & d18O.regressed <= iso.top$max.ref,
      d2H.top.sig = calib.2H >= iso.top$min.ref & calib.2H <= iso.top$max.ref,
      PF.possible = d18O.outlier & d2H.outlier & d18O.top.sig & d2H.top.sig & !(Depth %in% c(10,20))
    )
}

# ------------------------------------------------------------------------------
# 8. Apply Reference Modeling to All Groups
# ------------------------------------------------------------------------------
iso_referenced <- bind_rows(lapply(groups, build_reference))

# ------------------------------------------------------------------------------
# 9. Remove PF Signals Close to Core Bottoms (±10 cm)
# ------------------------------------------------------------------------------
core_cutoffs <- read.csv(cutoff_csv)

iso_ref_cleaned <- iso_referenced %>%
  rowwise() %>%
  mutate(PF.possible = if_else(
    any(abs(Depth - core_cutoffs$Stop[
      core_cutoffs$Catchment == substr(SampleGroup, 1, 2) &
        core_cutoffs$ID == ID
    ]) <= 10),
    FALSE, PF.possible
  )) %>%
  ungroup()

# ------------------------------------------------------------------------------
# 10. Save Final Output for Analysis and Visualization
# ------------------------------------------------------------------------------
write.csv(iso_ref_cleaned, output_final_csv, row.names = FALSE)

