# Source: funs/dwd_metadata.R (author: alexdum, commit 3e6528d)
# feat: Add support for 10-minute and annual DWD data resolutions,
# enhancing temporal detection, plotting, and documentation.
# funs/dwd_metadata.R
#' Fetch DWD File Index
#' Scrapes the DWD OpenData server to find available .zip files for each parameter.
#' @param resolution "hourly", "daily", "monthly", "annual", or "10_minutes"
#' Fetch DWD File Index
#'
#' Scrapes the DWD OpenData server directory listings to find available
#' .zip data files for each parameter of the given resolution.
#'
#' @param resolution One of "hourly", "daily", "monthly", "annual", or
#'   "10_minutes"; normalized via `normalize_dwd_resolution()`.
#' @return A data frame with columns `id`, `filename`, `url`, `param`,
#'   `type`, `start_date`, `end_date`, or `NULL` if nothing was indexed.
fetch_dwd_file_index <- function(resolution = "hourly") {
  resolution <- normalize_dwd_resolution(resolution)
  # Select config (base URL + parameter folders) based on resolution
  config <- get_dwd_resolution_config(resolution)
  base_url_use <- config$base_url
  params_use <- config$params
  index_list <- list()
  for (param in names(params_use)) {
    folder <- params_use[[param]]
    # Solar at hourly/daily/monthly/annual resolution lives directly under
    # its folder (no recent/historical split). 10-minute solar DOES have
    # recent/ and historical/ subfolders (verified by inspection), so it
    # follows the standard two-folder layout like every other parameter.
    if (param == "solar" && resolution != "10_minutes") {
      crawl_list <- list(
        list(url = paste0(base_url_use, folder, "/"), type = "solar")
      )
    } else {
      crawl_list <- list(
        list(url = paste0(base_url_use, folder, "/recent/"), type = "recent"),
        list(url = paste0(base_url_use, folder, "/historical/"), type = "historical")
      )
    }
    for (item in crawl_list) {
      url <- item$url
      subtype <- item$type
      tryCatch(
        {
          # Fetch the directory listing, retrying transient failures
          max_retries <- 3
          retry_count <- 0
          success <- FALSE
          resp <- NULL
          while (retry_count < max_retries && !success) {
            retry_count <- retry_count + 1
            tryCatch(
              {
                h <- new_handle()
                handle_setopt(h, followlocation = TRUE, timeout = 60)
                resp <- curl_fetch_memory(url, handle = h)
                success <- TRUE
              },
              error = function(e) {
                # Re-raise on the final attempt so the outer handler warns
                if (retry_count == max_retries) stop(e)
                Sys.sleep(1) # Wait a bit before retry
              }
            )
          }
          if (resp$status_code == 200) {
            content <- rawToChar(resp$content)
            # Pull all hrefs pointing at .zip files out of the HTML listing
            zip_files <- str_extract_all(content, 'href="([^"]+\\.zip)"')[[1]]
            zip_files <- gsub('href="', "", zip_files)
            zip_files <- gsub('"', "", zip_files)
            if (length(zip_files) > 0) {
              # Extract the 5-digit station ID embedded in each filename
              ids <- character(0)
              if (resolution == "10_minutes") {
                # 10-minute files use a 10minutenwerte_ prefix with varying
                # middle tokens, e.g.:
                #   solar:        10minutenwerte_SOLAR_00044_...
                #   air_temp:     10minutenwerte_TU_00151_...
                #   precipitation: 10minutenwerte_nieder_00020_...
                #   extreme_wind: 10minutenwerte_extrema_wind_00011_...
                #   wind:         10minutenwerte_wind_00011_...
                # Robust extraction: 5 digits surrounded by underscores,
                # which handles _TU_00151_, _SOLAR_00044_, _nieder_00020_
                is_match <- grepl("_\\d{5}_", zip_files)
                valid_zips <- zip_files[is_match]
                ids_raw <- str_extract(valid_zips, "_\\d{5}_")
                ids <- gsub("_", "", ids_raw)
                zip_files <- valid_zips
              } else if (param == "solar") {
                # Common pattern for solar filenames is _ST_xxxxx_
                is_match <- grepl("_ST_\\d{5}_", zip_files)
                valid_zips <- zip_files[is_match]
                ids_raw <- str_extract(valid_zips, "_ST_\\d{5}_")
                ids <- gsub("_ST_", "", ids_raw)
                ids <- gsub("_", "", ids)
                zip_files <- valid_zips
              } else {
                # General pattern: 5 digits surrounded by underscores
                is_match <- grepl("_\\d{5}_", zip_files)
                valid_zips <- zip_files[is_match]
                ids_raw <- str_extract(valid_zips, "_\\d{5}_")
                ids <- gsub("_", "", ids_raw)
                zip_files <- valid_zips
              }
              if (length(zip_files) > 0) {
                # Extract the date span when present (_YYYYMMDD_YYYYMMDD_).
                # Years are restricted to 17xx-20xx to avoid matching IDs.
                date_pattern <- "_((?:17|18|19|20)\\d{6})_((?:17|18|19|20)\\d{6})"
                start_dates <- str_match(zip_files, date_pattern)[, 2]
                end_dates <- str_match(zip_files, date_pattern)[, 3]
                # For "recent" files the end date is often missing from the
                # filename (str_match yields NA); callers treat NA as "present"
                df <- data.frame(
                  id = ids,
                  filename = zip_files,
                  url = paste0(url, zip_files), # Absolute URL
                  param = param,
                  type = subtype,
                  start_date = start_dates,
                  end_date = end_dates,
                  stringsAsFactors = FALSE
                )
                index_list[[paste(param, subtype, sep = "_")]] <- df
              }
            }
          }
        },
        error = function(e) {
          warning(paste("Failed to index", url, ":", e$message))
        }
      )
    }
  }
  if (length(index_list) > 0) {
    bind_rows(index_list)
  } else {
    NULL
  }
}
#' Fetch DWD Station List
#' Downloads the 'Beschreibung_Stationen.txt' and parses it.
#' @param resolution "hourly", "daily", "monthly", "annual", or "10_minutes"
#' Fetch DWD Station List
#'
#' Downloads the 'Beschreibung_Stationen.txt' station metadata file for the
#' given resolution, parses the fixed-width format, and standardizes columns.
#'
#' @param resolution One of "hourly", "daily", "monthly", "annual", or
#'   "10_minutes"; normalized via `normalize_dwd_resolution()`.
#' @return A data frame with columns id, name, latitude, longitude,
#'   elevation, state, country_name, start_date, end_date (one row per
#'   station), or `NULL` on failure.
fetch_dwd_stations <- function(resolution = "hourly") {
  resolution <- normalize_dwd_resolution(resolution)
  config <- get_dwd_resolution_config(resolution)
  url <- config$station_url
  desc_pattern <- config$station_desc
  tryCatch(
    {
      h <- new_handle()
      handle_setopt(h, followlocation = TRUE, timeout = 60)
      resp <- curl_fetch_memory(url, handle = h)
      # Fail fast on HTTP errors rather than parsing an error page;
      # the outer handler converts this into a warning + NULL return
      if (resp$status_code != 200) {
        stop("HTTP status ", resp$status_code, " for ", url)
      }
      content <- rawToChar(resp$content)
      # Locate the station description file in the directory listing.
      # The configured pattern differs per resolution (e.g. daily uses
      # KL_Tageswerte_Beschreibung_Stationen.txt, hourly a generic
      # *Beschreibung_Stationen.txt), so escape dots and match the href.
      pat <- paste0('href="([^"]+', gsub("\\.", "\\\\.", desc_pattern), ')"')
      desc_file_match <- str_extract(content, pat)
      if (is.na(desc_file_match)) {
        # Fallback: sometimes the link form differs; scan every href
        # for one containing the pattern
        all_links <- str_extract_all(content, 'href="([^"]+)"')[[1]]
        target_link <- all_links[grepl(desc_pattern, all_links)]
        if (length(target_link) > 0) {
          desc_file_match <- target_link[1]
        } else {
          return(NULL)
        }
      }
      desc_file <- gsub('href="', "", desc_file_match)
      desc_file <- gsub('"', "", desc_file)
      # Clean up if the match grabbed a path or full tag
      desc_file <- basename(desc_file)
      full_url <- paste0(url, desc_file)
      # The description file is fixed-width. Standard DWD layout:
      # Stations_id von_datum bis_datum Stationshoehe GeoBreite GeoLaenge
      # Stationsname Bundesland Abgabe
      # Widths below match the format used for the successful hourly parsing;
      # state is ~40 chars, followed by Abgabe (Frei etc.)
      col_names <- c("id", "start_date", "end_date", "elevation", "lat", "lon", "name", "state", "abgabe")
      st_df <- read_fwf(
        full_url,
        fwf_widths(c(5, 9, 9, 15, 12, 10, 41, 40, NA), col_names),
        skip = 2, # skip the header line and the dashed separator line
        locale = locale(encoding = "ISO-8859-1"), # DWD metadata is Latin-1
        show_col_types = FALSE
      )
      # Clean up and standardize column names/values
      st_df %>%
        mutate(
          id = sprintf("%05d", as.numeric(id)), # zero-pad to match zip filenames
          name = str_trim(name),
          state = str_trim(state),
          country_name = "Germany",
          # DWD metadata uses dot decimals, which read_fwf handles by default
          latitude = lat,
          longitude = lon
        ) %>%
        select(id, name, latitude, longitude, elevation, state, country_name, start_date, end_date) %>%
        distinct(id, .keep_all = TRUE)
    },
    error = function(e) {
      warning("Failed to fetch station list: ", conditionMessage(e), call. = FALSE)
      NULL
    }
  )
}
#' Get DWD Parameter Map
#' Returns a named vector mapping DWD parameter codes to English names.
#' Get DWD Parameter Map
#'
#' @return A named character vector mapping DWD parameter codes (names) to
#'   human-readable English labels (values), covering annual, monthly,
#'   daily, hourly, and auxiliary (cloud layer / soil) parameters.
get_dwd_param_map <- function() {
  # Annual (JA_*) parameters
  annual <- c(
    JA_TT = "Temperature",
    JA_TX = "Avg Max Temp",
    JA_TN = "Avg Min Temp",
    JA_RR = "Precipitation",
    JA_FK = "Wind",
    JA_N = "Cloud Cover",
    JA_SD_S = "Sunshine",
    JA_MX_TX = "Abs Max Temp",
    JA_MX_TN = "Abs Min Temp",
    JA_MX_RS = "Max Daily Precip",
    JA_MX_FX = "Max Wind Gust",
    JA_NSH = "Fresh Snow Sum",
    JA_SH_S = "Snow Depth Sum",
    JA_GEWITTER = "Thunderstorm Days",
    JA_GLATTEIS = "Glaze Days",
    JA_GRAUPEL = "Graupel Days",
    JA_HAGEL = "Hail Days",
    JA_NEBEL = "Fog Days",
    JA_REIF = "Frost Days",
    JA_STURM_6 = "Storm Days (Bft 6)",
    JA_STURM_8 = "Storm Days (Bft 8)",
    JA_TAU = "Dew Days"
  )
  # Monthly (MO_* / MX_*) parameters
  monthly <- c(
    MO_TT = "Temperature",
    MO_TX = "Max Temp",
    MO_TN = "Min Temp",
    MO_RR = "Precipitation",
    MO_FK = "Wind",
    MO_N = "Cloud Cover",
    MO_SD_S = "Sunshine",
    MX_TX = "Abs Max Temp",
    MX_TN = "Abs Min Temp",
    MX_RS = "Max Daily Precip",
    MX_FX = "Max Wind Gust",
    MO_NSH = "Fresh Snow Sum",
    MO_SH_S = "Snow Depth Sum"
  )
  # Daily parameters
  daily <- c(
    TMK = "Temperature",
    TNK = "Min Temp",
    TXK = "Max Temp",
    RSK = "Precipitation",
    RS = "Precipitation (Synop)",
    RSF = "Precipitation Form",
    RSKF = "Precipitation Form",
    FM = "Wind Speed",
    FX = "Wind Gust",
    SDK = "Sunshine",
    NM = "Cloud Cover",
    NSH_TAG = "Fresh Snow",
    PM = "Pressure",
    UPM = "Humidity",
    VPM = "Vapor Pressure",
    SHK_TAG = "Snow Depth",
    SH_TAG = "Snow Depth (Synop)",
    TGK = "Min Ground Temp (5cm)"
  )
  # Hourly parameters
  hourly <- c(
    TT_TU = "Temperature",
    RF_TU = "Humidity",
    P0 = "Pressure (Station)",
    P = "Pressure (Sea Level)",
    V_N = "Cloud Cover",
    SD_SO = "Sunshine",
    F = "Wind Speed",
    D = "Wind Direction",
    R1 = "Precipitation",
    WRTR = "Weather Text",
    WW = "Weather Code"
  )
  # Additional parameters: visibility, cloud layers, soil temperatures, etc.
  auxiliary <- c(
    V_VV = "Visibility",
    V_S1_CS = "Cloud Layer 1 Type",
    V_S1_HHS = "Cloud Layer 1 Height",
    V_S1_NS = "Cloud Layer 1 Amount",
    V_S2_CS = "Cloud Layer 2 Type",
    V_S2_HHS = "Cloud Layer 2 Height",
    V_S2_NS = "Cloud Layer 2 Amount",
    V_S3_CS = "Cloud Layer 3 Type",
    V_S3_HHS = "Cloud Layer 3 Height",
    V_S3_NS = "Cloud Layer 3 Amount",
    V_S4_CS = "Cloud Layer 4 Type",
    V_S4_HHS = "Cloud Layer 4 Height",
    V_S4_NS = "Cloud Layer 4 Amount",
    ST_2 = "Soil Temp (2cm)",
    ST_5 = "Soil Temp (5cm)",
    ST_10 = "Soil Temp (10cm)",
    ST_20 = "Soil Temp (20cm)",
    ST_50 = "Soil Temp (50cm)",
    ST_100 = "Soil Temp (100cm)",
    TS2 = "Soil Temp (2cm)",
    TS5 = "Soil Temp (5cm)",
    TS10 = "Soil Temp (10cm)",
    TS20 = "Soil Temp (20cm)",
    TS50 = "Soil Temp (50cm)",
    TS100 = "Soil Temp (100cm)",
    R_R1 = "Precipitation (Hourly)",
    RS_IND = "Precipitation Indicator"
  )
  c(annual, monthly, daily, hourly, auxiliary)
}