perf: Optimize DWD data parsing by switching to `data.table::fread` with string-based date pre-filtering and consolidate library imports.
Browse files
- Dockerfile +3 -1
- fun/map_helpers.R +0 -3
- fun/parse_dwd.R +68 -32
- fun/plot_weather_dwd.R +1 -3
- global.R +2 -0
Dockerfile
CHANGED
|
@@ -23,7 +23,9 @@ RUN install2.r --error \
|
|
| 23 |
curl \
|
| 24 |
stringr \
|
| 25 |
purrr \
|
| 26 |
-
shinyjs
|
|
|
|
|
|
|
| 27 |
|
| 28 |
COPY . .
|
| 29 |
|
|
|
|
| 23 |
curl \
|
| 24 |
stringr \
|
| 25 |
purrr \
|
| 26 |
+
shinyjs \
|
| 27 |
+
tibble \
|
| 28 |
+
data.table
|
| 29 |
|
| 30 |
COPY . .
|
| 31 |
|
fun/map_helpers.R
CHANGED
|
@@ -1,6 +1,3 @@
|
|
| 1 |
-
library(leaflet)
|
| 2 |
-
library(dplyr)
|
| 3 |
-
|
| 4 |
#' Highlight Selected Station on Map
|
| 5 |
#'
|
| 6 |
#' @param proxy Leaflet proxy object
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
#' Highlight Selected Station on Map
|
| 2 |
#'
|
| 3 |
#' @param proxy Leaflet proxy object
|
fun/parse_dwd.R
CHANGED
|
@@ -1,9 +1,3 @@
|
|
| 1 |
-
# funs/parse_dwd.R
|
| 2 |
-
library(dplyr)
|
| 3 |
-
library(lubridate)
|
| 4 |
-
library(readr)
|
| 5 |
-
library(tibble)
|
| 6 |
-
|
| 7 |
#' Parse content of a DWD data zip file
|
| 8 |
#'
|
| 9 |
#' @param zip_path Local path to the .zip file
|
|
@@ -27,6 +21,12 @@ generate_dwd_na_strings <- function(base = c("-999", "-777", "-888", "8000", "99
|
|
| 27 |
return(res)
|
| 28 |
}
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
#' Parse content of a DWD data zip file
|
| 31 |
#'
|
| 32 |
#' @param zip_path Local path to the .zip file
|
|
@@ -68,53 +68,89 @@ read_dwd_data <- function(zip_path, start_date = NULL, end_date = NULL) {
|
|
| 68 |
# Read Data
|
| 69 |
# DWD standard: semicolon sep
|
| 70 |
na_vec <- generate_dwd_na_strings()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
tryCatch(
|
| 73 |
{
|
| 74 |
-
# Use
|
| 75 |
-
# Read everything as character first to separate schema from data loading
|
| 76 |
-
|
| 77 |
data_file,
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
na =
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
show_col_types = FALSE
|
| 86 |
)
|
| 87 |
|
| 88 |
-
#
|
| 89 |
-
if (
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
}
|
| 95 |
}
|
| 96 |
|
| 97 |
-
#
|
| 98 |
-
#
|
| 99 |
-
|
| 100 |
-
|
|
|
|
| 101 |
raw_date <- as.character(df$MESS_DATUM)
|
| 102 |
-
|
|
|
|
|
|
|
| 103 |
|
|
|
|
| 104 |
fmt <- if (nch == 8) "%Y%m%d" else if (nch == 10) "%Y%m%d%H" else if (nch == 12) "%Y%m%d%H%M" else "%Y%m%d%H"
|
| 105 |
|
| 106 |
df$datetime <- as.POSIXct(raw_date, format = fmt, tz = "UTC")
|
| 107 |
|
| 108 |
-
# Filter valid dates
|
| 109 |
df <- df[!is.na(df$datetime), ]
|
| 110 |
|
| 111 |
-
#
|
| 112 |
if (!is.null(start_date)) {
|
| 113 |
s_limit <- as.POSIXct(start_date)
|
| 114 |
df <- df[df$datetime >= s_limit, ]
|
| 115 |
}
|
| 116 |
if (!is.null(end_date)) {
|
| 117 |
-
e_limit <- as.POSIXct(end_date) + days(1)
|
| 118 |
df <- df[df$datetime <= e_limit, ]
|
| 119 |
}
|
| 120 |
|
|
@@ -219,7 +255,7 @@ read_dwd_data <- function(zip_path, start_date = NULL, end_date = NULL) {
|
|
| 219 |
. == "V_TE100" ~ "soil_temp_100cm",
|
| 220 |
. == "V_TE002M" ~ "soil_temp_2cm",
|
| 221 |
. == "V_TE005M" ~ "soil_temp_5cm",
|
| 222 |
-
. == "V_TE010M" ~ "
|
| 223 |
. == "V_TE020M" ~ "soil_temp_20cm",
|
| 224 |
. == "V_TE050M" ~ "soil_temp_50cm",
|
| 225 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
#' Parse content of a DWD data zip file
|
| 2 |
#'
|
| 3 |
#' @param zip_path Local path to the .zip file
|
|
|
|
| 21 |
return(res)
|
| 22 |
}
|
| 23 |
|
| 24 |
+
#' Parse content of a DWD data zip file
|
| 25 |
+
#'
|
| 26 |
+
#' @param zip_path Local path to the .zip file
|
| 27 |
+
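#' @param start_date Optional filter start (POSIXct, or a value `as.POSIXct()` can coerce)
|
| 28 |
+
#' @param end_date Optional filter end (POSIXct, or a value `as.POSIXct()` can coerce); one day is added to it before filtering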
|
| 29 |
+
#' @return Tibble with parsed data or NULL
|
| 30 |
#' Parse content of a DWD data zip file
|
| 31 |
#'
|
| 32 |
#' @param zip_path Local path to the .zip file
|
|
|
|
| 68 |
# Read Data
|
| 69 |
# DWD standard: semicolon sep
|
| 70 |
na_vec <- generate_dwd_na_strings()
|
| 71 |
+
# fread strips whitespace by default, so we should pass trimmed NA strings to avoid warnings/errors
|
| 72 |
+
na_vec_clean <- unique(trimws(na_vec))
|
| 73 |
+
# Ensure empty string is included if not already
|
| 74 |
+
if (!"" %in% na_vec_clean) na_vec_clean <- c(na_vec_clean, "")
|
| 75 |
|
| 76 |
tryCatch(
|
| 77 |
{
|
| 78 |
+
# Use data.table::fread for faster parsing
|
| 79 |
+
# Read every column as character first so that data loading stays separate from schema/type handling
|
| 80 |
+
dt <- data.table::fread(
|
| 81 |
data_file,
|
| 82 |
+
sep = ";",
|
| 83 |
+
header = TRUE,
|
| 84 |
+
na.strings = na_vec_clean,
|
| 85 |
+
colClasses = "character", # Force character to avoid type inference issues with weird DWD codes
|
| 86 |
+
encoding = "Latin-1",
|
| 87 |
+
showProgress = FALSE,
|
| 88 |
+
data.table = TRUE
|
|
|
|
| 89 |
)
|
| 90 |
|
| 91 |
+
# Standardize Date Column Name
|
| 92 |
+
if ("MESS_DATUM_BEGINN" %in% names(dt)) {
|
| 93 |
+
data.table::setnames(dt, "MESS_DATUM_BEGINN", "MESS_DATUM")
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
if (!"MESS_DATUM" %in% names(dt)) {
|
| 97 |
+
return(NULL)
|
| 98 |
+
}
|
| 99 |
+
|
| 100 |
+
# --- Optimization: Pre-filter rows by date string BEFORE heavy processing ---
|
| 101 |
+
# DWD timestamps are expected to be fixed-width digit strings (YYYYMMDD, YYYYMMDDHH, ...), so string order matches chronological order.
|
| 102 |
+
# We can filter lexically or numerically without parsing to POSIXct first.
|
| 103 |
+
if (!is.null(start_date) || !is.null(end_date)) {
|
| 104 |
+
# Determine format from first row
|
| 105 |
+
sample_date <- dt$MESS_DATUM[1]
|
| 106 |
+
nch <- nchar(sample_date)
|
| 107 |
+
|
| 108 |
+
# Format mappings correspond to DWD standards
|
| 109 |
+
fmt_str <- if (nch == 8) "%Y%m%d" else if (nch == 10) "%Y%m%d%H" else if (nch == 12) "%Y%m%d%H%M" else "%Y%m%d%H"
|
| 110 |
+
|
| 111 |
+
if (!is.na(nch)) {
|
| 112 |
+
if (!is.null(start_date)) {
|
| 113 |
+
# Format request date to match DWD string format
|
| 114 |
+
# Explicitly use UTC to avoid timezone shifts during string formatting if inputs are UTC
|
| 115 |
+
s_val <- format(as.POSIXct(start_date), format = fmt_str, tz = "UTC")
|
| 116 |
+
dt <- dt[MESS_DATUM >= s_val]
|
| 117 |
+
}
|
| 118 |
+
|
| 119 |
+
if (!is.null(end_date)) {
|
| 120 |
+
# Extend the limit by one day so that the whole of end_date is covered.
|
| 121 |
+
# NOTE: because the comparison is `<=`, rows stamped exactly at end_date + 1 day are kept as well.
|
| 122 |
+
e_limit <- as.POSIXct(end_date) + days(1)
|
| 123 |
+
e_val <- format(e_limit, format = fmt_str, tz = "UTC")
|
| 124 |
+
dt <- dt[MESS_DATUM <= e_val]
|
| 125 |
+
}
|
| 126 |
}
|
| 127 |
}
|
| 128 |
|
| 129 |
+
# Convert to tibble/data.frame for existing dplyr pipeline compatibility
|
| 130 |
+
# (Keeping existing logic for column mapping to minimize regression risk)
|
| 131 |
+
df <- as_tibble(dt)
|
| 132 |
+
|
| 133 |
+
# Parse Date logic (Original)
|
| 134 |
raw_date <- as.character(df$MESS_DATUM)
|
| 135 |
+
if (length(raw_date) == 0) {
|
| 136 |
+
return(NULL)
|
| 137 |
+
} # If filtered to empty
|
| 138 |
|
| 139 |
+
nch <- nchar(raw_date[1])
|
| 140 |
fmt <- if (nch == 8) "%Y%m%d" else if (nch == 10) "%Y%m%d%H" else if (nch == 12) "%Y%m%d%H%M" else "%Y%m%d%H"
|
| 141 |
|
| 142 |
df$datetime <- as.POSIXct(raw_date, format = fmt, tz = "UTC")
|
| 143 |
|
| 144 |
+
# Filter valid dates (Safety check)
|
| 145 |
df <- df[!is.na(df$datetime), ]
|
| 146 |
|
| 147 |
+
# Re-apply the date window on the parsed datetimes, since the string pre-filter and the POSIXct comparison may disagree at the edges
|
| 148 |
if (!is.null(start_date)) {
|
| 149 |
s_limit <- as.POSIXct(start_date)
|
| 150 |
df <- df[df$datetime >= s_limit, ]
|
| 151 |
}
|
| 152 |
if (!is.null(end_date)) {
|
| 153 |
+
e_limit <- as.POSIXct(end_date) + days(1)
|
| 154 |
df <- df[df$datetime <= e_limit, ]
|
| 155 |
}
|
| 156 |
|
|
|
|
| 255 |
. == "V_TE100" ~ "soil_temp_100cm",
|
| 256 |
. == "V_TE002M" ~ "soil_temp_2cm",
|
| 257 |
. == "V_TE005M" ~ "soil_temp_5cm",
|
| 258 |
+
. == "V_TE010M" ~ "soil_temp_10cm",
|
| 259 |
. == "V_TE020M" ~ "soil_temp_20cm",
|
| 260 |
. == "V_TE050M" ~ "soil_temp_50cm",
|
| 261 |
|
fun/plot_weather_dwd.R
CHANGED
|
@@ -1,7 +1,5 @@
|
|
| 1 |
# fun/plot_weather_dwd.R
|
| 2 |
-
|
| 3 |
-
library(plotly)
|
| 4 |
-
library(lubridate)
|
| 5 |
|
| 6 |
#' Create a placeholder plot for missing data
|
| 7 |
#' @param message Message to display
|
|
|
|
| 1 |
# fun/plot_weather_dwd.R
|
| 2 |
+
|
|
|
|
|
|
|
| 3 |
|
| 4 |
#' Create a placeholder plot for missing data
|
| 5 |
#' @param message Message to display
|
global.R
CHANGED
|
@@ -13,6 +13,8 @@ library(curl)
|
|
| 13 |
library(stringr)
|
| 14 |
library(purrr)
|
| 15 |
library(shinyjs)
|
|
|
|
|
|
|
| 16 |
|
| 17 |
|
| 18 |
# --- Configuration ---
|
|
|
|
| 13 |
library(stringr)
|
| 14 |
library(purrr)
|
| 15 |
library(shinyjs)
|
| 16 |
+
library(tibble)
|
| 17 |
+
library(data.table)
|
| 18 |
|
| 19 |
|
| 20 |
# --- Configuration ---
|