Spaces:

alexdum
/

dwd

Running

App Files Files Community

alexdum committed on Jan 28

Commit

8cfb3a4

1 Parent(s): 7cd06eb

perf: Optimize DWD data parsing by switching to `data.table::fread` with string-based date pre-filtering and consolidate library imports.

Browse files

Files changed (5) hide show

Dockerfile +3 -1
fun/map_helpers.R +0 -3
fun/parse_dwd.R +68 -32
fun/plot_weather_dwd.R +1 -3
global.R +2 -0

Dockerfile CHANGED Viewed

@@ -23,7 +23,9 @@ RUN install2.r --error \
     curl \
     stringr \
     purrr \
-    shinyjs
 COPY . .

     curl \
     stringr \
     purrr \
+    shinyjs \
+    tibble \
+    data.table
 COPY . .

fun/map_helpers.R CHANGED Viewed

@@ -1,6 +1,3 @@
-library(leaflet)
-library(dplyr)
 #' Highlight Selected Station on Map
 #'
 #' @param proxy Leaflet proxy object

 #' Highlight Selected Station on Map
 #'
 #' @param proxy Leaflet proxy object

fun/parse_dwd.R CHANGED Viewed

@@ -1,9 +1,3 @@
-# funs/parse_dwd.R
-library(dplyr)
-library(lubridate)
-library(readr)
-library(tibble)
 #' Parse content of a DWD data zip file
 #'
 #' @param zip_path Local path to the .zip file
@@ -27,6 +21,12 @@ generate_dwd_na_strings <- function(base = c("-999", "-777", "-888", "8000", "99
     return(res)
 }
 #' Parse content of a DWD data zip file
 #'
 #' @param zip_path Local path to the .zip file
@@ -68,53 +68,89 @@ read_dwd_data <- function(zip_path, start_date = NULL, end_date = NULL) {
     # Read Data
     # DWD standard: semicolon sep
     na_vec <- generate_dwd_na_strings()
     tryCatch(
         {
-            # Use readr for faster parsing (approx 3-4x faster)
-            # Read everything as character first to separate schema from data loading
-            df <- readr::read_delim(
                 data_file,
-                delim = ";",
-                col_names = TRUE,
-                na = na_vec,
-                trim_ws = TRUE,
-                col_types = readr::cols(.default = readr::col_character()),
-                locale = readr::locale(encoding = "ISO-8859-1"),
-                progress = FALSE,
-                show_col_types = FALSE
             )
-            # Check for date column (MESS_DATUM or MESS_DATUM_BEGINN for monthly)
-            if (!"MESS_DATUM" %in% names(df)) {
-                if ("MESS_DATUM_BEGINN" %in% names(df)) {
-                    df$MESS_DATUM <- df$MESS_DATUM_BEGINN
-                } else {
-                    return(NULL)
                 }
             }
-            # Parse Date with dynamic format detection
-            # 8: YYYYMMDD (Daily)
-            # 10: YYYYMMDDHH (Hourly)
-            # 12: YYYYMMDDHHMM (10-min)
             raw_date <- as.character(df$MESS_DATUM)
-            nch <- nchar(raw_date[1])
             fmt <- if (nch == 8) "%Y%m%d" else if (nch == 10) "%Y%m%d%H" else if (nch == 12) "%Y%m%d%H%M" else "%Y%m%d%H"
             df$datetime <- as.POSIXct(raw_date, format = fmt, tz = "UTC")
-            # Filter valid dates
             df <- df[!is.na(df$datetime), ]
-            # Window Filter - Discard rows outside requested window immediately to save memory/processing
             if (!is.null(start_date)) {
                 s_limit <- as.POSIXct(start_date)
                 df <- df[df$datetime >= s_limit, ]
             }
             if (!is.null(end_date)) {
-                e_limit <- as.POSIXct(end_date) + days(1) # Include the end date fully
                 df <- df[df$datetime <= e_limit, ]
             }
@@ -219,7 +255,7 @@ read_dwd_data <- function(zip_path, start_date = NULL, end_date = NULL) {
                     . == "V_TE100" ~ "soil_temp_100cm",
                     . == "V_TE002M" ~ "soil_temp_2cm",
                     . == "V_TE005M" ~ "soil_temp_5cm",
-                    . == "V_TE010M" ~ "soil_temp_10cm",
                     . == "V_TE020M" ~ "soil_temp_20cm",
                     . == "V_TE050M" ~ "soil_temp_50cm",

 #' Parse content of a DWD data zip file
 #'
 #' @param zip_path Local path to the .zip file
     return(res)
 }
+#' Parse content of a DWD data zip file
+#'
+#' @param zip_path Local path to the .zip file
+#' @param start_date Optional filter start (POSIXct)
+#' @param end_date Optional filter end (POSIXct)
+#' @return Tibble with parsed data or NULL
 #' Parse content of a DWD data zip file
 #'
 #' @param zip_path Local path to the .zip file
     # Read Data
     # DWD standard: semicolon sep
     na_vec <- generate_dwd_na_strings()
+    # fread strips whitespace by default, so we should pass trimmed NA strings to avoid warnings/errors
+    na_vec_clean <- unique(trimws(na_vec))
+    # Ensure empty string is included if not already
+    if (!"" %in% na_vec_clean) na_vec_clean <- c(na_vec_clean, "")
     tryCatch(
         {
+            # Use data.table::fread for faster parsing
+            # Read everything as character first to safely separate schema handling from data loading
+            dt <- data.table::fread(
                 data_file,
+                sep = ";",
+                header = TRUE,
+                na.strings = na_vec_clean,
+                colClasses = "character", # Force character to avoid type inference issues with weird DWD codes
+                encoding = "Latin-1",
+                showProgress = FALSE,
+                data.table = TRUE
             )
+            # Standardize Date Column Name
+            if ("MESS_DATUM_BEGINN" %in% names(dt)) {
+                data.table::setnames(dt, "MESS_DATUM_BEGINN", "MESS_DATUM")
+            }
+            if (!"MESS_DATUM" %in% names(dt)) {
+                return(NULL)
+            }
+            # --- Optimization: Pre-filter rows by date string BEFORE heavy processing ---
+            # DWD dates are usually monotonic strings/numbers: YYYYMMDD, YYYYMMDDHH, etc.
+            # We can filter lexically or numerically without parsing to POSIXct first.
+            if (!is.null(start_date) || !is.null(end_date)) {
+                # Determine format from first row
+                sample_date <- dt$MESS_DATUM[1]
+                nch <- nchar(sample_date)
+                # Format mappings correspond to DWD standards
+                fmt_str <- if (nch == 8) "%Y%m%d" else if (nch == 10) "%Y%m%d%H" else if (nch == 12) "%Y%m%d%H%M" else "%Y%m%d%H"
+                if (!is.na(nch)) {
+                    if (!is.null(start_date)) {
+                        # Format request date to match DWD string format
+                        # Explicitly use UTC to avoid timezone shifts during string formatting if inputs are UTC
+                        s_val <- format(as.POSIXct(start_date), format = fmt_str, tz = "UTC")
+                        dt <- dt[MESS_DATUM >= s_val]
+                    }
+                    if (!is.null(end_date)) {
+                        # Add buffer (end of day) if needed, but usually exact match on string works
+                        # If end_date is 2025-01-01, we want up to 2025-01-01 23:59
+                        e_limit <- as.POSIXct(end_date) + days(1)
+                        e_val <- format(e_limit, format = fmt_str, tz = "UTC")
+                        dt <- dt[MESS_DATUM <= e_val]
+                    }
                 }
             }
+            # Convert to tibble/data.frame for existing dplyr pipeline compatibility
+            # (Keeping existing logic for column mapping to minimize regression risk)
+            df <- as_tibble(dt)
+            # Parse Date logic (Original)
             raw_date <- as.character(df$MESS_DATUM)
+            if (length(raw_date) == 0) {
+                return(NULL)
+            } # If filtered to empty
+            nch <- nchar(raw_date[1])
             fmt <- if (nch == 8) "%Y%m%d" else if (nch == 10) "%Y%m%d%H" else if (nch == 12) "%Y%m%d%H%M" else "%Y%m%d%H"
             df$datetime <- as.POSIXct(raw_date, format = fmt, tz = "UTC")
+            # Filter valid dates (Safety check)
             df <- df[!is.na(df$datetime), ]
+            # Additional safety window filter (parsed datetimes may not compare exactly like the raw date strings)
             if (!is.null(start_date)) {
                 s_limit <- as.POSIXct(start_date)
                 df <- df[df$datetime >= s_limit, ]
             }
             if (!is.null(end_date)) {
+                e_limit <- as.POSIXct(end_date) + days(1)
                 df <- df[df$datetime <= e_limit, ]
             }
                     . == "V_TE100" ~ "soil_temp_100cm",
                     . == "V_TE002M" ~ "soil_temp_2cm",
                     . == "V_TE005M" ~ "soil_temp_5cm",
+                    . == "V_TE010M" ~ "soil_temp_100cm",
                     . == "V_TE020M" ~ "soil_temp_20cm",
                     . == "V_TE050M" ~ "soil_temp_50cm",

fun/plot_weather_dwd.R CHANGED Viewed

@@ -1,7 +1,5 @@
 # funs/plot_weather_dwd.R
-library(dplyr)
-library(plotly)
-library(lubridate)
 #' Create a placeholder plot for missing data
 #' @param message Message to display

 # funs/plot_weather_dwd.R
 #' Create a placeholder plot for missing data
 #' @param message Message to display

global.R CHANGED Viewed

@@ -13,6 +13,8 @@ library(curl)
 library(stringr)
 library(purrr)
 library(shinyjs)
 # --- Configuration ---

 library(stringr)
 library(purrr)
 library(shinyjs)
+library(tibble)
+library(data.table)
 # --- Configuration ---