# Source: funs/dwd_metadata.R (author: alexdum, commit 3e6528d)
# feat: Add support for 10-minute and annual DWD data resolutions,
# enhancing temporal detection, plotting, and documentation.
# funs/dwd_metadata.R
#' Fetch DWD File Index
#' Scrapes the DWD OpenData server to find available .zip files for each parameter.
#' @param resolution "hourly", "daily", "monthly", "annual", or "10_minutes"
#' Fetch DWD File Index
#'
#' Scrapes the DWD OpenData server directory listings to find available
#' .zip data files for each parameter of the given resolution.
#'
#' @param resolution One of "hourly", "daily", "monthly", "annual", or
#'   "10_minutes"; normalized via `normalize_dwd_resolution()`.
#' @return A data frame with columns `id`, `filename`, `url`, `param`,
#'   `type`, `start_date`, `end_date`, or `NULL` if nothing was indexed.
fetch_dwd_file_index <- function(resolution = "hourly") {
  resolution <- normalize_dwd_resolution(resolution)
  # Select config (base URL + parameter folders) based on resolution
  config <- get_dwd_resolution_config(resolution)
  base_url_use <- config$base_url
  params_use <- config$params
  index_list <- list()
  for (param in names(params_use)) {
    folder <- params_use[[param]]
    # Solar at hourly/daily/monthly/annual resolution lives directly under
    # its folder (no recent/historical split). 10-minute solar DOES have
    # recent/ and historical/ subfolders (verified by inspection), so it
    # follows the standard two-folder layout like every other parameter.
    if (param == "solar" && resolution != "10_minutes") {
      crawl_list <- list(
        list(url = paste0(base_url_use, folder, "/"), type = "solar")
      )
    } else {
      crawl_list <- list(
        list(url = paste0(base_url_use, folder, "/recent/"), type = "recent"),
        list(url = paste0(base_url_use, folder, "/historical/"), type = "historical")
      )
    }
    for (item in crawl_list) {
      url <- item$url
      subtype <- item$type
      tryCatch(
        {
          # Fetch the directory listing, retrying transient failures
          max_retries <- 3
          retry_count <- 0
          success <- FALSE
          resp <- NULL
          while (retry_count < max_retries && !success) {
            retry_count <- retry_count + 1
            tryCatch(
              {
                h <- new_handle()
                handle_setopt(h, followlocation = TRUE, timeout = 60)
                resp <- curl_fetch_memory(url, handle = h)
                success <- TRUE
              },
              error = function(e) {
                # Re-raise on the final attempt so the outer handler warns
                if (retry_count == max_retries) stop(e)
                Sys.sleep(1) # Wait a bit before retry
              }
            )
          }
          if (resp$status_code == 200) {
            content <- rawToChar(resp$content)
            # Pull all hrefs pointing at .zip files out of the HTML listing
            zip_files <- str_extract_all(content, 'href="([^"]+\\.zip)"')[[1]]
            zip_files <- gsub('href="', "", zip_files)
            zip_files <- gsub('"', "", zip_files)
            if (length(zip_files) > 0) {
              # Extract the 5-digit station ID embedded in each filename
              ids <- character(0)
              if (resolution == "10_minutes") {
                # 10-minute files use a 10minutenwerte_ prefix with varying
                # middle tokens, e.g.:
                #   solar:        10minutenwerte_SOLAR_00044_...
                #   air_temp:     10minutenwerte_TU_00151_...
                #   precipitation: 10minutenwerte_nieder_00020_...
                #   extreme_wind: 10minutenwerte_extrema_wind_00011_...
                #   wind:         10minutenwerte_wind_00011_...
                # Robust extraction: 5 digits surrounded by underscores,
                # which handles _TU_00151_, _SOLAR_00044_, _nieder_00020_
                is_match <- grepl("_\\d{5}_", zip_files)
                valid_zips <- zip_files[is_match]
                ids_raw <- str_extract(valid_zips, "_\\d{5}_")
                ids <- gsub("_", "", ids_raw)
                zip_files <- valid_zips
              } else if (param == "solar") {
                # Common pattern for solar filenames is _ST_xxxxx_
                is_match <- grepl("_ST_\\d{5}_", zip_files)
                valid_zips <- zip_files[is_match]
                ids_raw <- str_extract(valid_zips, "_ST_\\d{5}_")
                ids <- gsub("_ST_", "", ids_raw)
                ids <- gsub("_", "", ids)
                zip_files <- valid_zips
              } else {
                # General pattern: 5 digits surrounded by underscores
                is_match <- grepl("_\\d{5}_", zip_files)
                valid_zips <- zip_files[is_match]
                ids_raw <- str_extract(valid_zips, "_\\d{5}_")
                ids <- gsub("_", "", ids_raw)
                zip_files <- valid_zips
              }
              if (length(zip_files) > 0) {
                # Extract the date span when present (_YYYYMMDD_YYYYMMDD_).
                # Years are restricted to 17xx-20xx to avoid matching IDs.
                date_pattern <- "_((?:17|18|19|20)\\d{6})_((?:17|18|19|20)\\d{6})"
                start_dates <- str_match(zip_files, date_pattern)[, 2]
                end_dates <- str_match(zip_files, date_pattern)[, 3]
                # For "recent" files the end date is often missing from the
                # filename (str_match yields NA); callers treat NA as "present"
                df <- data.frame(
                  id = ids,
                  filename = zip_files,
                  url = paste0(url, zip_files), # Absolute URL
                  param = param,
                  type = subtype,
                  start_date = start_dates,
                  end_date = end_dates,
                  stringsAsFactors = FALSE
                )
                index_list[[paste(param, subtype, sep = "_")]] <- df
              }
            }
          }
        },
        error = function(e) {
          warning(paste("Failed to index", url, ":", e$message))
        }
      )
    }
  }
  if (length(index_list) > 0) {
    bind_rows(index_list)
  } else {
    NULL
  }
}
#' Fetch DWD Station List
#' Downloads the 'Beschreibung_Stationen.txt' and parses it.
#' @param resolution "hourly", "daily", "monthly", "annual", or "10_minutes"
#' Fetch DWD Station List
#'
#' Downloads the 'Beschreibung_Stationen.txt' station metadata file for the
#' given resolution, parses the fixed-width format, and standardizes columns.
#'
#' @param resolution One of "hourly", "daily", "monthly", "annual", or
#'   "10_minutes"; normalized via `normalize_dwd_resolution()`.
#' @return A data frame with columns id, name, latitude, longitude,
#'   elevation, state, country_name, start_date, end_date (one row per
#'   station), or `NULL` on failure.
fetch_dwd_stations <- function(resolution = "hourly") {
  resolution <- normalize_dwd_resolution(resolution)
  config <- get_dwd_resolution_config(resolution)
  url <- config$station_url
  desc_pattern <- config$station_desc
  tryCatch(
    {
      h <- new_handle()
      handle_setopt(h, followlocation = TRUE, timeout = 60)
      resp <- curl_fetch_memory(url, handle = h)
      # Fail fast on HTTP errors rather than parsing an error page;
      # the outer handler converts this into a warning + NULL return
      if (resp$status_code != 200) {
        stop("HTTP status ", resp$status_code, " for ", url)
      }
      content <- rawToChar(resp$content)
      # Locate the station description file in the directory listing.
      # The configured pattern differs per resolution (e.g. daily uses
      # KL_Tageswerte_Beschreibung_Stationen.txt, hourly a generic
      # *Beschreibung_Stationen.txt), so escape dots and match the href.
      pat <- paste0('href="([^"]+', gsub("\\.", "\\\\.", desc_pattern), ')"')
      desc_file_match <- str_extract(content, pat)
      if (is.na(desc_file_match)) {
        # Fallback: sometimes the link form differs; scan every href
        # for one containing the pattern
        all_links <- str_extract_all(content, 'href="([^"]+)"')[[1]]
        target_link <- all_links[grepl(desc_pattern, all_links)]
        if (length(target_link) > 0) {
          desc_file_match <- target_link[1]
        } else {
          return(NULL)
        }
      }
      desc_file <- gsub('href="', "", desc_file_match)
      desc_file <- gsub('"', "", desc_file)
      # Clean up if the match grabbed a path or full tag
      desc_file <- basename(desc_file)
      full_url <- paste0(url, desc_file)
      # The description file is fixed-width. Standard DWD layout:
      # Stations_id von_datum bis_datum Stationshoehe GeoBreite GeoLaenge
      # Stationsname Bundesland Abgabe
      # Widths below match the format used for the successful hourly parsing;
      # state is ~40 chars, followed by Abgabe (Frei etc.)
      col_names <- c("id", "start_date", "end_date", "elevation", "lat", "lon", "name", "state", "abgabe")
      st_df <- read_fwf(
        full_url,
        fwf_widths(c(5, 9, 9, 15, 12, 10, 41, 40, NA), col_names),
        skip = 2, # skip the header line and the dashed separator line
        locale = locale(encoding = "ISO-8859-1"), # DWD metadata is Latin-1
        show_col_types = FALSE
      )
      # Clean up and standardize column names/values
      st_df %>%
        mutate(
          id = sprintf("%05d", as.numeric(id)), # zero-pad to match zip filenames
          name = str_trim(name),
          state = str_trim(state),
          country_name = "Germany",
          # DWD metadata uses dot decimals, which read_fwf handles by default
          latitude = lat,
          longitude = lon
        ) %>%
        select(id, name, latitude, longitude, elevation, state, country_name, start_date, end_date) %>%
        distinct(id, .keep_all = TRUE)
    },
    error = function(e) {
      warning("Failed to fetch station list: ", conditionMessage(e), call. = FALSE)
      NULL
    }
  )
}
#' Get DWD Parameter Map
#' Returns a named vector mapping DWD parameter codes to English names.
#' Get DWD Parameter Map
#'
#' @return A named character vector mapping DWD parameter codes (names) to
#'   human-readable English labels (values), covering annual, monthly,
#'   daily, hourly, and auxiliary (cloud layer / soil) parameters.
get_dwd_param_map <- function() {
  # Annual (JA_*) parameters
  annual <- c(
    JA_TT = "Temperature",
    JA_TX = "Avg Max Temp",
    JA_TN = "Avg Min Temp",
    JA_RR = "Precipitation",
    JA_FK = "Wind",
    JA_N = "Cloud Cover",
    JA_SD_S = "Sunshine",
    JA_MX_TX = "Abs Max Temp",
    JA_MX_TN = "Abs Min Temp",
    JA_MX_RS = "Max Daily Precip",
    JA_MX_FX = "Max Wind Gust",
    JA_NSH = "Fresh Snow Sum",
    JA_SH_S = "Snow Depth Sum",
    JA_GEWITTER = "Thunderstorm Days",
    JA_GLATTEIS = "Glaze Days",
    JA_GRAUPEL = "Graupel Days",
    JA_HAGEL = "Hail Days",
    JA_NEBEL = "Fog Days",
    JA_REIF = "Frost Days",
    JA_STURM_6 = "Storm Days (Bft 6)",
    JA_STURM_8 = "Storm Days (Bft 8)",
    JA_TAU = "Dew Days"
  )
  # Monthly (MO_* / MX_*) parameters
  monthly <- c(
    MO_TT = "Temperature",
    MO_TX = "Max Temp",
    MO_TN = "Min Temp",
    MO_RR = "Precipitation",
    MO_FK = "Wind",
    MO_N = "Cloud Cover",
    MO_SD_S = "Sunshine",
    MX_TX = "Abs Max Temp",
    MX_TN = "Abs Min Temp",
    MX_RS = "Max Daily Precip",
    MX_FX = "Max Wind Gust",
    MO_NSH = "Fresh Snow Sum",
    MO_SH_S = "Snow Depth Sum"
  )
  # Daily parameters
  daily <- c(
    TMK = "Temperature",
    TNK = "Min Temp",
    TXK = "Max Temp",
    RSK = "Precipitation",
    RS = "Precipitation (Synop)",
    RSF = "Precipitation Form",
    RSKF = "Precipitation Form",
    FM = "Wind Speed",
    FX = "Wind Gust",
    SDK = "Sunshine",
    NM = "Cloud Cover",
    NSH_TAG = "Fresh Snow",
    PM = "Pressure",
    UPM = "Humidity",
    VPM = "Vapor Pressure",
    SHK_TAG = "Snow Depth",
    SH_TAG = "Snow Depth (Synop)",
    TGK = "Min Ground Temp (5cm)"
  )
  # Hourly parameters
  hourly <- c(
    TT_TU = "Temperature",
    RF_TU = "Humidity",
    P0 = "Pressure (Station)",
    P = "Pressure (Sea Level)",
    V_N = "Cloud Cover",
    SD_SO = "Sunshine",
    F = "Wind Speed",
    D = "Wind Direction",
    R1 = "Precipitation",
    WRTR = "Weather Text",
    WW = "Weather Code"
  )
  # Additional parameters: visibility, cloud layers, soil temperatures, etc.
  auxiliary <- c(
    V_VV = "Visibility",
    V_S1_CS = "Cloud Layer 1 Type",
    V_S1_HHS = "Cloud Layer 1 Height",
    V_S1_NS = "Cloud Layer 1 Amount",
    V_S2_CS = "Cloud Layer 2 Type",
    V_S2_HHS = "Cloud Layer 2 Height",
    V_S2_NS = "Cloud Layer 2 Amount",
    V_S3_CS = "Cloud Layer 3 Type",
    V_S3_HHS = "Cloud Layer 3 Height",
    V_S3_NS = "Cloud Layer 3 Amount",
    V_S4_CS = "Cloud Layer 4 Type",
    V_S4_HHS = "Cloud Layer 4 Height",
    V_S4_NS = "Cloud Layer 4 Amount",
    ST_2 = "Soil Temp (2cm)",
    ST_5 = "Soil Temp (5cm)",
    ST_10 = "Soil Temp (10cm)",
    ST_20 = "Soil Temp (20cm)",
    ST_50 = "Soil Temp (50cm)",
    ST_100 = "Soil Temp (100cm)",
    TS2 = "Soil Temp (2cm)",
    TS5 = "Soil Temp (5cm)",
    TS10 = "Soil Temp (10cm)",
    TS20 = "Soil Temp (20cm)",
    TS50 = "Soil Temp (50cm)",
    TS100 = "Soil Temp (100cm)",
    R_R1 = "Precipitation (Hourly)",
    RS_IND = "Precipitation Indicator"
  )
  c(annual, monthly, daily, hourly, auxiliary)
}