dwd / scripts /export_seo_metadata.R
alexdum's picture
fix: restore resolution-specific date windows in SEO export script
63efaec
# scripts/export_seo_metadata.R
#
# Exports enriched DWD station metadata from all 5 resolution caches
# into a single JSON file for use by the Netlify Edge Function.
#
# Output: dwd-seo-metadata.json
#
# Usage:
# Rscript scripts/export_seo_metadata.R
# (run from the DWD app root directory)
# Ensure UTF-8 output
invisible(Sys.setlocale("LC_CTYPE", "en_US.UTF-8"))
library(jsonlite)
# ── Slugification ────────────────────────────────────────────────────────
slugify <- function(text) {
  # Turn a display name into a URL slug: lowercase ASCII letters and digits,
  # with single hyphens between words and none at either end.
  #
  # text: value(s) coerced with as.character(); vectorised like gsub().
  # Returns a character vector of slugs.

  # German special characters and their ASCII spellings, position by
  # position: u/o/a-umlaut (lower case), U/O/A-umlaut (upper case), sharp s.
  # These are substituted explicitly before the generic transliteration step.
  special_chars <- c(
    "\u00fc", "\u00f6", "\u00e4",
    "\u00dc", "\u00d6", "\u00c4",
    "\u00df"
  )
  ascii_forms <- c("ue", "oe", "ae", "Ue", "Oe", "Ae", "ss")

  slug <- as.character(text)
  for (k in seq_along(special_chars)) {
    slug <- gsub(special_chars[[k]], ascii_forms[[k]], slug)
  }

  # Lowercase, then transliterate what is left to ASCII; characters that
  # cannot be converted are dropped (sub = "").
  slug <- iconv(tolower(slug), from = "UTF-8", to = "ASCII//TRANSLIT", sub = "")

  # Every run of non-alphanumeric characters becomes one hyphen ...
  slug <- gsub("[^a-z0-9]+", "-", slug)
  # ... and hyphens at the very start or end are removed.
  gsub("^-+|-+$", "", slug)
}
# ── Resolution config ────────────────────────────────────────────────────
# One record per DWD resolution, in processing order. Each record carries:
#   key   - internal resolution identifier
#   label - human-readable name
#   slug  - URL path segment
#   file  - path to the cached enriched station table (.rds), relative to
#           the working directory
# The four vectors below are parallel columns; Map() zips them into records
# and unname() keeps the result a plain positional list.
resolution_config <- unname(Map(
  function(key, label, slug, file) {
    list(key = key, label = label, slug = slug, file = file)
  },
  c("daily", "hourly", "monthly", "annual", "10_minutes"),
  c("Daily", "Hourly", "Monthly", "Annual", "10 Minutes"),
  c("daily", "hourly", "monthly", "annual", "10-minutes"),
  c(
    "www/tabs/dwd_stations_enriched_daily.rds",
    "www/tabs/dwd_stations_enriched.rds",
    "www/tabs/dwd_stations_enriched_monthly.rds",
    "www/tabs/dwd_stations_enriched_annual.rds",
    "www/tabs/dwd_stations_enriched_10min.rds"
  )
))
# ── Main export ──────────────────────────────────────────────────────────
# Accumulators filled across all resolutions.
stations_out <- list()     # "<resolution>/<state>/<station>" -> station entry
states_out <- list()       # "<resolution>/<state>" -> per-state counters
resolutions_out <- list()  # resolution slug -> per-resolution summary
slug_map <- list()         # display name (station or state) -> slug

for (rc in resolution_config) {
  # A missing cache file is reported and skipped, not treated as an error.
  if (!file.exists(rc$file)) {
    message("Skipping ", rc$key, ": file not found (", rc$file, ")")
    next
  }
  df <- readRDS(rc$file)
  message(rc$label, ": ", nrow(df), " stations loaded")
  res_slug <- rc$slug

  # Determine the UI's default date range for this resolution.
  # Must match server.R URL param handler for initial load.
  # The window depends only on the resolution, so it is computed once here
  # rather than once per station row.
  today <- Sys.Date()
  default_start_date <- switch(rc$key,
    "10_minutes" = today - 30,
    "hourly" = today - 366,
    "monthly" = today - (365 * 6),
    "daily" = today - (365 * 6),
    "annual" = as.Date("1991-01-01"),
    today - (365 * 6)
  )
  # Dates are compared as YYYYMMDD numbers, the same form the station
  # start/end values are parsed into below.
  default_start_num <- as.numeric(format(default_start_date, "%Y%m%d"))
  default_end_num <- as.numeric(format(today, "%Y%m%d"))

  # The optional column is the same for every row; check it once.
  has_summary <- "detailed_summary" %in% names(df)

  # Per-state aggregation for this resolution.
  state_counts <- list()

  for (i in seq_len(nrow(df))) {
    row <- df[i, ]
    station_name <- enc2utf8(as.character(row$name))
    station_id <- as.character(row$id)
    state_name <- enc2utf8(as.character(row$state))
    station_slug <- slugify(station_name)
    state_slug <- slugify(state_name)
    overall_start_str <- as.character(row$station_overall_start)
    overall_end_str <- as.character(row$station_overall_end)

    # Build path key: "daily/bayern/muenchen-flughafen"
    path_key <- paste0(res_slug, "/", state_slug, "/", station_slug)

    # Station entry (field order is the order written to the JSON output).
    entry <- list(
      id = station_id,
      name = station_name,
      state = state_name,
      stateSlug = state_slug,
      elevation = as.numeric(row$elevation),
      lat = as.numeric(row$latitude),
      lon = as.numeric(row$longitude),
      resolution = rc$key,
      resolutionLabel = rc$label,
      resolutionSlug = res_slug,
      overallStart = overall_start_str,
      overallEnd = overall_end_str
    )
    # Add detailed_summary if available
    if (has_summary) {
      entry$availableParams <- as.character(row$detailed_summary)
    }
    # A later row that produces the same path key replaces the earlier entry.
    stations_out[[path_key]] <- entry

    # Track state counts
    state_key <- paste0(res_slug, "/", state_slug)
    if (is.null(state_counts[[state_key]])) {
      state_counts[[state_key]] <- list(
        state = state_name,
        stateSlug = state_slug,
        resolution = rc$key,
        resolutionLabel = rc$label,
        resolutionSlug = res_slug,
        stationCount = 0L,
        activeStationCount = 0L
      )
    }
    state_counts[[state_key]]$stationCount <-
      state_counts[[state_key]]$stationCount + 1L

    # Must match server.R filtered_stations():
    #   as.numeric(station_overall_start) <= range_end &
    #   as.numeric(station_overall_end) >= range_start
    start_num <- suppressWarnings(as.numeric(overall_start_str))
    # "99999999" is treated as an open-ended record. isTRUE() keeps a missing
    # end date from aborting the export: `NA == "99999999"` is NA, which a
    # bare if() rejects. Such a station is simply counted as not active.
    end_num <- if (isTRUE(overall_end_str == "99999999")) {
      Inf
    } else {
      suppressWarnings(as.numeric(overall_end_str))
    }
    is_active <- !is.na(start_num) && !is.na(end_num) &&
      start_num <= default_end_num && end_num >= default_start_num
    if (is_active) {
      state_counts[[state_key]]$activeStationCount <-
        state_counts[[state_key]]$activeStationCount + 1L
    }

    # Slug map (only need unique entries; the first slug seen is kept)
    if (is.null(slug_map[[station_name]])) {
      slug_map[[station_name]] <- station_slug
    }
    if (is.null(slug_map[[state_name]])) {
      slug_map[[state_name]] <- state_slug
    }
  }

  # Merge state entries
  for (sk in names(state_counts)) {
    states_out[[sk]] <- state_counts[[sk]]
  }

  # Resolution summary (including active count summed from states).
  # vapply() returns integer(0) for a resolution with no stations, so the
  # sum is 0; sapply() would return an empty list there and sum() would fail.
  total_active <- sum(vapply(
    state_counts,
    function(sc) sc$activeStationCount,
    integer(1)
  ))
  resolutions_out[[res_slug]] <- list(
    key = rc$key,
    label = rc$label,
    slug = res_slug,
    stationCount = nrow(df),
    activeStationCount = total_active
  )
}
# ── Assemble and write JSON ──────────────────────────────────────────────
# Top-level JSON document: one object per accumulator built above.
output <- list(
  stations = stations_out,
  states = states_out,
  resolutions = resolutions_out,
  slugMap = slug_map
)
# Written relative to the current working directory.
output_path <- "dwd-seo-metadata.json"
# auto_unbox = TRUE writes length-1 vectors as JSON scalars, not arrays.
json_str <- toJSON(output, pretty = TRUE, auto_unbox = TRUE)
# useBytes = TRUE writes the string's bytes unchanged. Without it,
# writeLines() re-encodes to the session's native encoding, which corrupts
# non-ASCII station names whenever the UTF-8 locale could not be set.
writeLines(json_str, output_path, useBytes = TRUE)
message("\nExported ", length(stations_out), " station entries, ",
        length(states_out), " state entries, ",
        length(resolutions_out), " resolutions")
message("Output: ", normalizePath(output_path))