Spaces:

alexdum
/

dwd

Running

App Files Files Community

dwd / fun /parse_dwd.R

alexdum

feat: Add 10-minute DWD data resolution and parallelize granular metadata updates.

1957ae8 12 days ago

raw

history blame contribute delete

26.7 kB

	#' Generate DWD Missing Value Strings
	#'
	#' Builds the set of strings that should be read as missing values: the base
	#' codes themselves, each base code with ".0" and ".00" appended, each base
	#' code prefixed with 1 to `nspace` spaces, and the empty string.
	#'
	#' @param base Basic codes like -999, 8000 (character vector)
	#' @param nspace Maximum number of leading spaces to include. Padded variants
	#'   are generated for the base codes only, not for the decimal variants.
	#'   `nspace = 0` produces no padded variants.
	#' @return Character vector of unique strings, in order of first appearance
	generate_dwd_na_strings <- function(base = c("-999", "-777", "-888", "8000", "9999", "-9999", "999.9", "8000.0", "800.0", "-999.0"), nspace = 10) {
	  # Decimal variations of every base code
	  decimal_variants <- c(paste0(base, ".0"), paste0(base, ".00"))

	  # Leading-space variations. seq_len() yields an empty sequence for
	  # nspace = 0, whereas 1:nspace would iterate over c(1, 0).
	  padded_variants <- unlist(lapply(seq_len(nspace), function(width) {
	    paste0(strrep(" ", width), base)
	  }))

	  # unique() keeps the first occurrence, so the ordering is: base codes,
	  # decimal variants, padded variants (narrowest first), empty string.
	  unique(c(base, decimal_variants, padded_variants, ""))
	}

	#' Parse content of a DWD data zip file
	#'
	#' Extracts the `produkt*.txt` data file from the archive (the largest one
	#' when several match), reads it as character columns with
	#' `data.table::fread()`, optionally restricts it to a date window, renames
	#' DWD column codes to descriptive names, converts the weather columns to
	#' numeric and blanks out missing-value codes and out-of-range values.
	#'
	#' @param zip_path Local path to the .zip file
	#' @param start_date Optional filter start (POSIXct, or anything
	#'   `as.POSIXct()` accepts)
	#' @param end_date Optional filter end; rows up to `end_date` plus one day
	#'   are kept
	#' @param extract_metadata If `TRUE`, also read the first
	#'   `Metadaten_Parameter*.txt` file in the archive and attach it to the
	#'   result as the `granular_meta` attribute (a data.frame with columns
	#'   `param`, `start_date`, `end_date`). Errors while reading the metadata
	#'   are ignored.
	#' @return Tibble with a `datetime` column (parsed as UTC) followed by the
	#'   recognised weather columns, or `NULL` when the archive cannot be listed
	#'   or extracted, contains no `produkt*.txt` file, has no `MESS_DATUM`
	#'   column, has no non-missing date left after the string pre-filter, or
	#'   parsing fails (a warning is raised for unzip and parse failures).
	read_dwd_data <- function(zip_path, start_date = NULL, end_date = NULL, extract_metadata = FALSE) {
	  # Scratch directory for extraction; removed when the function exits
	  exdir <- tempfile()
	  dir.create(exdir)
	  on.exit(unlink(exdir, recursive = TRUE), add = TRUE)

	  # Helper: strptime format for a vector of MESS_DATUM strings, decided by
	  # the width of the first non-missing entry. Returns NA when there is none,
	  # so callers can bail out instead of hitting `if (NA == 8)`.
	  detect_date_format <- function(dates) {
	    usable <- dates[!is.na(dates)]
	    if (length(usable) == 0) {
	      return(NA_character_)
	    }
	    switch(as.character(nchar(usable[1])),
	      "8" = "%Y%m%d",
	      "10" = "%Y%m%d%H",
	      "12" = "%Y%m%d%H%M",
	      "%Y%m%d%H"
	    )
	  }

	  # Helper: best-effort read of the parameter metadata file. Returns a
	  # data.frame, or NULL when the file is absent, lacks a `parameter` column,
	  # or cannot be read (metadata is not critical, so errors are swallowed).
	  read_granular_meta <- function(archive_names) {
	    meta_files <- archive_names[grepl("^Metadaten_Parameter.*\\.txt$", archive_names, ignore.case = TRUE)]
	    if (length(meta_files) == 0) {
	      return(NULL)
	    }
	    meta_file <- meta_files[1]
	    tryCatch(
	      {
	        unzip(zip_path, files = meta_file, exdir = exdir)
	        meta_path <- file.path(exdir, meta_file)
	        if (!file.exists(meta_path)) {
	          NULL
	        } else {
	          # Semicolon-separated metadata file
	          meta_dt <- data.table::fread(meta_path, sep = ";", header = TRUE, fill = TRUE, encoding = "Latin-1")
	          # Normalize column names (lower case, no surrounding whitespace)
	          data.table::setnames(meta_dt, tolower(trimws(names(meta_dt))))
	          if ("parameter" %in% names(meta_dt)) {
	            data.frame(
	              param = trimws(meta_dt$parameter),
	              start_date = if ("von_datum" %in% names(meta_dt)) as.character(meta_dt$von_datum) else NA_character_,
	              end_date = if ("bis_datum" %in% names(meta_dt)) as.character(meta_dt$bis_datum) else NA_character_,
	              stringsAsFactors = FALSE
	            )
	          } else {
	            NULL
	          }
	        }
	      },
	      error = function(e) NULL
	    )
	  }

	  # Unzip: list the archive first, then extract only the data file so the
	  # other files in the archive are never written to disk. The block yields
	  # NULL both when nothing matches and when listing/extraction errors, and
	  # the function stops right after it in either case.
	  extracted <- tryCatch(
	    {
	      file_list <- unzip(zip_path, list = TRUE)

	      # Data file pattern: produkt*.txt
	      data_files <- file_list$Name[grepl("^produkt.*\\.txt$", file_list$Name)]

	      if (length(data_files) == 0) {
	        NULL
	      } else {
	        # Usually there is one match; with several, take the largest
	        target_file <- data_files[1]
	        if (length(data_files) > 1) {
	          sizes <- file_list$Length[match(data_files, file_list$Name)]
	          target_file <- data_files[which.max(sizes)]
	        }

	        unzip(zip_path, files = target_file, exdir = exdir)

	        list(
	          target_file = target_file,
	          data_file = file.path(exdir, target_file),
	          granular_meta_df = if (extract_metadata) read_granular_meta(file_list$Name) else NULL
	        )
	      }
	    },
	    error = function(e) {
	      warning("Unzip failed: ", conditionMessage(e))
	      NULL
	    }
	  )
	  if (is.null(extracted)) {
	    return(NULL)
	  }

	  # Missing-value strings are passed trimmed (the original author notes that
	  # fread strips whitespace by default); the empty string is always included.
	  na_vec_clean <- unique(trimws(generate_dwd_na_strings()))
	  if (!"" %in% na_vec_clean) na_vec_clean <- c(na_vec_clean, "")

	  # NOTE: return(NULL) inside the tryCatch expression below leaves
	  # read_dwd_data itself, because the expression is evaluated in this
	  # function's frame.
	  tryCatch(
	    {
	      # Read every column as character so type inference cannot be confused
	      # by DWD flag codes; numeric conversion happens explicitly further down.
	      dt <- data.table::fread(
	        extracted$data_file,
	        sep = ";",
	        header = TRUE,
	        na.strings = na_vec_clean,
	        colClasses = "character",
	        encoding = "Latin-1",
	        showProgress = FALSE,
	        data.table = TRUE
	      )

	      # Sanitize column names: remove surrounding whitespace
	      data.table::setnames(dt, trimws(names(dt)))

	      # Standardize the date column name
	      if ("MESS_DATUM_BEGINN" %in% names(dt)) {
	        data.table::setnames(dt, "MESS_DATUM_BEGINN", "MESS_DATUM")
	      }
	      if (!"MESS_DATUM" %in% names(dt)) {
	        return(NULL)
	      }

	      # Pre-filter rows by comparing the raw date strings, before any
	      # POSIXct parsing. The request bounds are formatted to the same
	      # fixed-width layout as the file's dates.
	      if (!is.null(start_date) || !is.null(end_date)) {
	        fmt_str <- detect_date_format(dt$MESS_DATUM)
	        if (!is.na(fmt_str)) {
	          if (!is.null(start_date)) {
	            s_val <- format(as.POSIXct(start_date), format = fmt_str, tz = "UTC")
	            dt <- dt[MESS_DATUM >= s_val]
	          }
	          if (!is.null(end_date)) {
	            # One-day buffer so the whole end day is included
	            e_val <- format(as.POSIXct(end_date) + days(1), format = fmt_str, tz = "UTC")
	            dt <- dt[MESS_DATUM <= e_val]
	          }
	        }
	      }

	      # Continue as a tibble for the dplyr pipeline below
	      df <- as_tibble(dt)

	      raw_date <- as.character(df$MESS_DATUM)
	      fmt <- detect_date_format(raw_date)
	      if (is.na(fmt)) {
	        # Nothing left after filtering, or no usable date at all
	        return(NULL)
	      }

	      df$datetime <- as.POSIXct(raw_date, format = fmt, tz = "UTC")

	      # Drop rows whose date did not parse
	      df <- df[!is.na(df$datetime), ]

	      # Second window filter on the parsed values (the string comparison
	      # above is only a cheap first pass)
	      if (!is.null(start_date)) {
	        df <- df[df$datetime >= as.POSIXct(start_date), ]
	      }
	      if (!is.null(end_date)) {
	        df <- df[df$datetime <= as.POSIXct(end_date) + days(1), ]
	      }

	      # Output columns. Renamed columns that are listed in neither vector
	      # (e.g. precip_duration, wind_gust_min, precip_max_day) are dropped by
	      # the final select().
	      weather_cols <- c(
	        "temp", "temp_min", "temp_max", "temp_min_avg", "temp_max_avg", "rh", "dew_point",
	        "abs_humidity", "vapor_pressure", "wet_bulb_temp",
	        "precip", "wind_speed", "wind_dir", "pressure", "station_pressure", "cloud_cover", "wind_gust_max", "solar_global", "sunshine_duration",
	        "temp_5cm",
	        "soil_temp_2cm", "soil_temp_5cm", "soil_temp_10cm", "soil_temp_20cm", "soil_temp_50cm", "soil_temp_100cm",
	        "soil_temp_min_5cm",
	        "snow_depth", "snow_water_equiv", "snow_fresh_sum", "snow_depth_sum",
	        "thunderstorm", "glaze", "graupel", "hail", "fog", "frost", "storm_6", "storm_8", "dew",
	        "precip_net_thunderstorm", "precip_net_graupel", "precip_net_hail", "precip_net_fog",
	        "visibility", "weather_code",
	        "cloud_layer1_code", "cloud_layer1_height", "cloud_layer1_amount",
	        "cloud_layer2_code", "cloud_layer2_height", "cloud_layer2_amount",
	        "cloud_layer3_code", "cloud_layer3_height", "cloud_layer3_amount",
	        "cloud_layer4_code", "cloud_layer4_height", "cloud_layer4_amount"
	      )
	      weather_text_cols <- c(
	        "cloud_cover_indicator", "cloud_layer1_abbrev", "cloud_layer2_abbrev", "cloud_layer3_abbrev", "cloud_layer4_abbrev",
	        "visibility_indicator", "weather_text"
	      )

	      # Column mapping: DWD code -> descriptive name. case_when() takes the
	      # first matching rule; unmatched names pass through unchanged.
	      df <- df %>%
	        rename_with(~ case_when(
	          # Hourly Mappings
	          . == "TT_TU" ~ "temp",
	          . == "RF_TU" ~ "rh",
	          . == "TT" ~ "temp",
	          . == "TD" ~ "dew_point",
	          . == "ABSF_STD" ~ "abs_humidity",
	          . == "VP_STD" ~ "vapor_pressure",
	          . == "TF_STD" ~ "wet_bulb_temp",
	          . == "P_STD" ~ "pressure",
	          . == "TT_STD" ~ "temp",
	          . == "RF_STD" ~ "rh",
	          . == "TD_STD" ~ "dew_point",
	          . == "R1" ~ "precip",
	          . == "F" ~ "wind_speed",
	          . == "D" ~ "wind_dir",
	          . == "P" ~ "pressure",
	          . == "P0" ~ "station_pressure",
	          . %in% c("N_8", "V_N") ~ "cloud_cover",
	          . == "V_N_I" ~ "cloud_cover_indicator",
	          . == "V_S1_CS" ~ "cloud_layer1_code",
	          . == "V_S1_CSA" ~ "cloud_layer1_abbrev",
	          . == "V_S1_HHS" ~ "cloud_layer1_height",
	          . == "V_S1_NS" ~ "cloud_layer1_amount",
	          . == "V_S2_CS" ~ "cloud_layer2_code",
	          . == "V_S2_CSA" ~ "cloud_layer2_abbrev",
	          . == "V_S2_HHS" ~ "cloud_layer2_height",
	          . == "V_S2_NS" ~ "cloud_layer2_amount",
	          . == "V_S3_CS" ~ "cloud_layer3_code",
	          . == "V_S3_CSA" ~ "cloud_layer3_abbrev",
	          . == "V_S3_HHS" ~ "cloud_layer3_height",
	          . == "V_S3_NS" ~ "cloud_layer3_amount",
	          . == "V_S4_CS" ~ "cloud_layer4_code",
	          . == "V_S4_CSA" ~ "cloud_layer4_abbrev",
	          . == "V_S4_HHS" ~ "cloud_layer4_height",
	          . == "V_S4_NS" ~ "cloud_layer4_amount",
	          . == "V_VV" ~ "visibility",
	          . == "V_VV_I" ~ "visibility_indicator",
	          . == "WW" ~ "weather_code",
	          . == "WW_Text" ~ "weather_text",
	          . %in% c("FX_10", "FX_911") ~ "wind_gust_max",
	          . %in% c("FG_LBERG", "FG_STRAHL") ~ "solar_global",
	          . %in% c("SD_LBERG", "SD_STRAHL", "SD_SO") ~ "sunshine_duration",

	          # 10-Minute Mappings
	          . == "TT_10" ~ "temp",
	          . == "RF_10" ~ "rh",
	          . == "TD_10" ~ "dew_point",
	          . == "TM5_10" ~ "temp_5cm", # NOTE(review): presumably air temperature 5 cm above ground - confirm
	          . == "RWS_10" ~ "precip",
	          . == "RWS_DAU_10" ~ "precip_duration",
	          . == "RWS_IND_10" ~ "precip_ind",
	          . == "FF_10" ~ "wind_speed",
	          . == "DD_10" ~ "wind_dir",
	          . == "FX_10" ~ "wind_gust_max",
	          . == "FNX_10" ~ "wind_gust_min",
	          . == "DS_10" ~ "diffuse_radiation",
	          . == "GS_10" ~ "solar_global",
	          . == "SD_10" ~ "sunshine_duration",
	          . == "LS_10" ~ "longwave_radiation", # NOTE(review): presumably atmospheric long-wave radiation - confirm
	          . == "TX_10" ~ "temp_max",
	          . == "TN_10" ~ "temp_min",
	          . == "PP_10" ~ "station_pressure",

	          # Daily Mappings
	          . == "TGK" ~ "soil_temp_min_5cm",
	          . %in% c("TMK", "TM_K") ~ "temp",
	          . == "TNK" ~ "temp_min",
	          . == "TXK" ~ "temp_max",
	          . %in% c("RSK", "RS_K", "RS") ~ "precip",
	          . == "FM" ~ "wind_speed",
	          . == "FX" ~ "wind_gust_max",
	          . %in% c("SDK", "SD_SO") ~ "sunshine_duration",
	          . == "UPM" ~ "rh",
	          . == "PM" ~ "pressure",
	          . == "NM" ~ "cloud_cover",

	          # Monthly Mappings
	          . == "MO_TT" ~ "temp",
	          . == "MX_TX" ~ "temp_max", # Absolute Max
	          . == "MX_TN" ~ "temp_min", # Absolute Min
	          . == "MO_TX" ~ "temp_max_avg", # Average Daily Max
	          . == "MO_TN" ~ "temp_min_avg", # Average Daily Min
	          . == "MO_RR" ~ "precip",
	          . == "MX_RS" ~ "precip_max_day",
	          . == "MO_SD_S" ~ "sunshine_duration",
	          . == "MO_N" ~ "cloud_cover",
	          . == "MO_FK" ~ "wind_speed",
	          . == "MX_FX" ~ "wind_gust_max",
	          . == "MO_NSH" ~ "snow_fresh_sum", # Monthly Fresh Snow Sum
	          . == "MO_SH_S" ~ "snow_depth_sum", # Monthly Snow Depth Sum

	          # Soil Temperature
	          . == "V_TE002" ~ "soil_temp_2cm",
	          . == "V_TE005" ~ "soil_temp_5cm",
	          . == "V_TE010" ~ "soil_temp_10cm",
	          . == "V_TE020" ~ "soil_temp_20cm",
	          . == "V_TE050" ~ "soil_temp_50cm",
	          . == "V_TE100" ~ "soil_temp_100cm",
	          . == "V_TE002M" ~ "soil_temp_2cm",
	          . == "V_TE005M" ~ "soil_temp_5cm",
	          . == "V_TE010M" ~ "soil_temp_10cm", # was mapped to the 100 cm column by mistake
	          . == "V_TE020M" ~ "soil_temp_20cm",
	          . == "V_TE050M" ~ "soil_temp_50cm",

	          # Water Equivalent / Snow
	          . == "ASH_6" ~ "snow_depth_set",
	          . == "SH_TAG" ~ "snow_depth",
	          . == "WASH_6" ~ "snow_water_equiv",
	          . == "WAAS_6" ~ "snow_water_equiv_excavated",

	          # Weather Phenomena
	          . == "GEWITTER" ~ "thunderstorm",
	          . == "GLATTEIS" ~ "glaze",
	          . == "GRAUPEL" ~ "graupel",
	          . == "HAGEL" ~ "hail",
	          . == "NEBEL" ~ "fog",
	          . == "REIF" ~ "frost",
	          . == "STURM_6" ~ "storm_6",
	          . == "STURM_8" ~ "storm_8",
	          . == "TAU" ~ "dew",

	          # Precipitation Network Weather Phenomena
	          . == "RR_GEWITTER" ~ "precip_net_thunderstorm",
	          . == "RR_GRAUPEL" ~ "precip_net_graupel",
	          . == "RR_HAGEL" ~ "precip_net_hail",
	          . == "RR_NEBEL" ~ "precip_net_fog",
	          TRUE ~ .
	        ))

	      # Make the weather columns numeric and turn sentinel codes into NA.
	      # Anything >= 7999 or <= -998 is treated as a flag value.
	      # NOTE(review): this threshold is applied to every weather column, so a
	      # genuine reading >= 7999 in e.g. visibility or cloud_layer*_height is
	      # blanked as well - confirm against the units DWD uses there.
	      df <- df %>%
	        mutate(across(any_of(weather_cols), ~ {
	          # Coercion warnings are expected; unparseable text becomes NA
	          val <- suppressWarnings(as.numeric(as.character(.)))
	          # Indexed assignment keeps the column numeric even with zero rows
	          val[!is.na(val) & (val >= 7999 | val <= -998)] <- NA_real_
	          val
	        }))

	      # Text columns: trim and turn empty strings into NA
	      df <- df %>%
	        mutate(across(any_of(weather_text_cols), ~ {
	          val <- trimws(as.character(.))
	          val[val == ""] <- NA_character_
	          val
	        }))

	      # Parameter-specific sanity bounds: values outside [lower, upper]
	      # become NA.
	      sanity_bounds <- list(
	        temp = c(-80, 60),
	        temp_min = c(-80, 60),
	        temp_max = c(-80, 60),
	        dew_point = c(-90, 60),
	        wet_bulb_temp = c(-80, 60),
	        rh = c(0, 100),
	        pressure = c(800, 1100),
	        station_pressure = c(500, 1100),
	        soil_temp_min_5cm = c(-80, 60),
	        solar_global = c(0, 3500),
	        snow_depth = c(0, Inf) # lower bound only
	      )
	      for (col in intersect(names(sanity_bounds), names(df))) {
	        limits <- sanity_bounds[[col]]
	        vals <- df[[col]]
	        vals[is.na(vals) | vals < limits[1] | vals > limits[2]] <- NA_real_
	        df[[col]] <- vals
	      }

	      # Ghost pressure data: per the original author's note, TF/TU/TD files
	      # can carry a pressure column holding a constant dummy value rather
	      # than measurements, so pressure is dropped for those files. The check
	      # uses the data file name inside the archive (not zip_path, which may
	      # be a temp name). 10-minute files are exempt.
	      fname_lower <- tolower(extracted$target_file)
	      is_tf_tu_td <- any(vapply(
	        c("_tf_", "_tu_", "_td_"),
	        function(p) grepl(p, fname_lower, fixed = TRUE),
	        logical(1)
	      ))
	      is_10min <- grepl("10minutenwerte", fname_lower, fixed = TRUE) ||
	        grepl("zehn_min", fname_lower, fixed = TRUE)

	      if (is_tf_tu_td && !is_10min) {
	        if ("pressure" %in% names(df)) df$pressure <- NULL
	        if ("station_pressure" %in% names(df)) df$station_pressure <- NULL
	      }

	      df <- df %>% select(datetime, any_of(c(weather_cols, weather_text_cols)))

	      result_df <- as_tibble(df)
	      # Attach granular metadata if available
	      granular_meta_df <- extracted$granular_meta_df
	      if (!is.null(granular_meta_df) && nrow(granular_meta_df) > 0) {
	        attr(result_df, "granular_meta") <- granular_meta_df
	      }
	      result_df
	    },
	    error = function(e) {
	      warning("Failed to parse ", basename(zip_path), ": ", conditionMessage(e))
	      NULL
	    }
	  )
	}

	#' Fetch and Parse All Data for a Station
	#'
	#' Downloads every archive listed for the station in `index_df`, parses each
	#' with `read_dwd_data()`, full-joins the results on `datetime` and
	#' coalesces columns that several files provide into one column per
	#' variable.
	#'
	#' @param station_id GHCN/DWD ID (string 5 digits)
	#' @param index_df The global file index; must provide `id` and `url` columns
	#' @param start_date Filter start
	#' @param end_date Filter end
	#' @param status_cb Optional progress callback, called as
	#'   `status_cb(message, detail, value)`; ignored unless it is a function
	#' @return Merged tibble of data for the window (one row per `datetime`,
	#'   sorted), or `NULL` when the station is not in the index or no file
	#'   could be downloaded and parsed
	fetch_and_parse_station_data <- function(station_id, index_df, start_date = NULL, end_date = NULL, status_cb = NULL) {
	  # Forward a status update to the callback, if one was supplied
	  notify <- function(msg, detail = NULL, value = NULL) {
	    if (is.function(status_cb)) status_cb(msg, detail, value)
	  }

	  targets <- index_df %>% filter(id == station_id)
	  n_targets <- nrow(targets)

	  if (n_targets == 0) {
	    notify("No data found in index for this station.", value = 1)
	    return(NULL)
	  }

	  # One slot per file; slots of failed or empty files stay NULL
	  parsed_files <- vector("list", n_targets)
	  last_pct <- -1

	  for (i in seq_len(n_targets)) {
	    url <- targets$url[i]
	    tmp_zip <- tempfile(fileext = ".zip")
	    # Reset per file so the first progress update of a new download is
	    # never suppressed by the final percentage of the previous one
	    last_pct <- -1

	    dl_status <- tryCatch(
	      {
	        # Use curl with a handle to get progress
	        h <- curl::new_handle()
	        curl::handle_setopt(h, noprogress = FALSE, progressfunction = function(down, up) {
	          # down[1] is used as the total size and down[2] as the bytes
	          # received so far
	          if (down[1] > 0) {
	            pct <- round(down[2] / down[1] * 100)
	            # Only notify when the rounded percentage changes
	            if (pct != last_pct) {
	              msg <- sprintf("Downloading file %d/%d...", i, n_targets)
	              detail <- sprintf("%s (%d%%)", format_bytes(down[1]), pct)
	              # Downloading takes the first 80% of this file's progress share
	              val <- ((i - 1) / n_targets) + (pct / 100 * (0.8 / n_targets))
	              notify(msg, detail = detail, value = val)
	              last_pct <<- pct
	            }
	          }
	          TRUE
	        })
	        resp <- curl::curl_fetch_disk(url, tmp_zip, handle = h)
	        # An HTTP error response still completes the transfer, so check the
	        # status instead of handing an error page to the zip parser
	        if (isTRUE(resp$status_code >= 400)) {
	          warning("Download failed for ", url, ": HTTP status ", resp$status_code)
	          FALSE
	        } else {
	          TRUE
	        }
	      },
	      error = function(e) {
	        warning("Download failed for ", url, ": ", conditionMessage(e))
	        FALSE
	      }
	    )

	    if (dl_status) {
	      notify(sprintf("Parsing file %d/%d...", i, n_targets), value = (i - 0.2) / n_targets)
	      parsed <- read_dwd_data(tmp_zip, start_date, end_date)
	      if (!is.null(parsed) && nrow(parsed) > 0) {
	        parsed_files[[i]] <- parsed
	      }
	    }
	    # Remove the temp file whether or not the download succeeded
	    unlink(tmp_zip)
	  }

	  all_data_list <- Filter(Negate(is.null), parsed_files)

	  if (length(all_data_list) == 0) {
	    notify("Failed to parse any data files.", value = 1)
	    return(NULL)
	  }

	  notify("Merging multiple files...", value = 0.9)
	  final_df <- purrr::reduce(all_data_list, full_join, by = "datetime")

	  # Variables kept in the merged output. This mirrors the columns that
	  # read_dwd_data() emits (numeric and text), including temp_5cm.
	  weather_vars <- c(
	    "temp", "temp_min", "temp_max", "temp_min_avg", "temp_max_avg", "rh", "dew_point",
	    "abs_humidity", "vapor_pressure", "wet_bulb_temp",
	    "precip", "wind_speed", "wind_dir", "pressure", "station_pressure", "cloud_cover", "cloud_cover_indicator",
	    "wind_gust_max", "solar_global", "sunshine_duration",
	    "temp_5cm",
	    "soil_temp_2cm", "soil_temp_5cm", "soil_temp_10cm", "soil_temp_20cm", "soil_temp_50cm", "soil_temp_100cm",
	    "soil_temp_min_5cm",
	    "snow_depth", "snow_water_equiv", "snow_fresh_sum", "snow_depth_sum",
	    "thunderstorm", "glaze", "graupel", "hail", "fog", "frost", "storm_6", "storm_8", "dew",
	    "precip_net_thunderstorm", "precip_net_graupel", "precip_net_hail", "precip_net_fog",
	    "visibility", "visibility_indicator", "weather_code", "weather_text",
	    "cloud_layer1_code", "cloud_layer1_abbrev", "cloud_layer1_height", "cloud_layer1_amount",
	    "cloud_layer2_code", "cloud_layer2_abbrev", "cloud_layer2_height", "cloud_layer2_amount",
	    "cloud_layer3_code", "cloud_layer3_abbrev", "cloud_layer3_height", "cloud_layer3_amount",
	    "cloud_layer4_code", "cloud_layer4_abbrev", "cloud_layer4_height", "cloud_layer4_amount"
	  )

	  available_cols <- names(final_df)

	  # Start from the datetime column and add one coalesced column per variable
	  clean_df <- final_df %>% select(datetime)

	  for (v in weather_vars) {
	    # Columns that are exactly 'v' or a join-suffixed copy ('v.x', 'v.y',
	    # 'v.x.x', ...)
	    v_cols <- available_cols[grepl(paste0("^", v, "(\\.|$)"), available_cols)]

	    if (length(v_cols) > 0) {
	      # First non-missing value across all copies; names are dropped so the
	      # columns are passed to coalesce() positionally
	      clean_df[[v]] <- do.call(coalesce, unname(as.list(final_df[v_cols])))
	    }
	  }

	  # Final cleanup: one row per timestamp, in chronological order
	  clean_df <- clean_df %>%
	    distinct(datetime, .keep_all = TRUE) %>%
	    arrange(datetime)

	  if (!is.null(start_date) && !is.null(end_date)) {
	    clean_df <- clean_df %>%
	      filter(datetime >= as.POSIXct(start_date), datetime <= as.POSIXct(end_date) + days(1))
	  }

	  notify(sprintf("Success: %d rows processed.", nrow(clean_df)))

	  clean_df
	}

	# Render a byte count as a short human-readable string.
	# NA gives "Unknown size"; values below 1024 are shown in B, values below
	# 1024^2 in KB and everything else in MB (KB/MB rounded to one decimal).
	format_bytes <- function(x) {
	  if (is.na(x)) {
	    return("Unknown size")
	  }

	  kib <- 1024
	  mib <- kib^2

	  if (x >= mib) {
	    paste(round(x / mib, 1), "MB")
	  } else if (x >= kib) {
	    paste(round(x / kib, 1), "KB")
	  } else {
	    paste(x, "B")
	  }
	}