roclihom / utils /data_processing.R
alexdum's picture
refactor: Centralize and optimize seasonal data aggregation and filtering with a new completeness check.
6120561
#' Tag each monthly record with its meteorological season
#'
#' Works directly on the numeric `year` and `month` columns, so no Date
#' objects are built along the way.
#'
#' @param data Data frame holding numeric `year` and `month` (1-12) columns.
#' @return `data` with two extra columns: `season_id`, a whole-number key
#'   shared by the three months of one season (December is keyed together
#'   with the following January and February), and `season`, one of
#'   "DJF", "MAM", "JJA" or "SON".
add_seasonal_columns <- function(data) {
  # Position i holds the season label of calendar month i:
  # Jan-Feb, Mar-May, Jun-Aug, Sep-Nov and finally Dec.
  month_to_season <- rep(
    c("DJF", "MAM", "JJA", "SON", "DJF"),
    times = c(2, 3, 3, 3, 1)
  )

  mutate(
    data,
    # Integer division of the running month count by 3 puts December of
    # year Y in the same bucket as January and February of year Y + 1.
    season_id = (12 * year + month) %/% 3,
    season = month_to_season[month]
  )
}
#' Filter seasonal data by date range and selected season
#'
#' Keeps the rows whose (`year`, `month`) pair lies inside an inclusive
#' month range and whose `season` is one of the requested seasons. With the
#' default arguments the range is 1901-03 to 2023-11, i.e. it starts on the
#' first month of a MAM season and ends on the last month of a SON season.
#' Rows where `year`, `month` or `season` is `NA` are dropped.
#'
#' @param data Data frame with `year`, `month` and `season` columns.
#' @param selected_season Character vector with the season label(s) to
#'   keep, e.g. "DJF" or c("DJF", "MAM").
#' @param start_year,start_month Year and month (1-12) of the first month
#'   to keep. Default: March 1901.
#' @param end_year,end_month Year and month (1-12) of the last month to
#'   keep. Default: November 2023.
#' @return The rows of `data` that fall inside the range and match
#'   `selected_season`.
filter_seasonal_data <- function(data, selected_season,
                                 start_year = 1901, start_month = 3,
                                 end_year = 2023, end_month = 11) {
  data %>%
    filter(
      # On or after the first month of the range.
      year > start_year | (year == start_year & month >= start_month),
      # On or before the last month of the range.
      year < end_year | (year == end_year & month <= end_month),
      # `%in%` instead of `==`: a vector of seasons would otherwise be
      # silently recycled against the rows.
      season %in% selected_season
    )
}
#' Aggregate data by season with completeness check
#'
#' Collapses monthly rows into one row per station and season. A season is
#' kept only when it contributes exactly three non-missing monthly values;
#' seasons with a missing month (row absent or `value` is `NA`) are dropped
#' instead of being reported as a partial sum or mean.
#'
#' @param data Data frame with the columns `id`, `season_id`, `season`,
#'   `name`, `latitude`, `longitude`, `altitude`, `year` and `value`.
#' @param selected_variable Character; "PREC" aggregates with a sum, any
#'   other value aggregates with a mean.
#' @return Data frame with one row per complete season: the grouping
#'   columns plus `value` (seasonal sum or mean) and `year` (the largest
#'   year in the season, so a DJF season spanning Dec 2022 - Feb 2023 is
#'   labelled 2023).
aggregate_seasonal <- function(data, selected_variable) {
  # Pick the aggregation once instead of re-testing it for every group.
  aggregate_fn <- if (selected_variable == "PREC") sum else mean

  data %>%
    group_by(id, season_id, season, name, latitude, longitude, altitude) %>%
    summarise(
      # Count months that actually carry a value. This must come before
      # `value` is overwritten by its aggregate on the next line.
      n_months = sum(!is.na(value)),
      value = aggregate_fn(value, na.rm = TRUE),
      year = max(year),
      .groups = "drop"
    ) %>%
    # STRICT COMPLETENESS CHECK: only keep seasons with 3 valid months.
    filter(n_months == 3) %>%
    dplyr::select(-n_months)
}