alexdum committed on
Commit
8cfb3a4
·
1 Parent(s): 7cd06eb

perf: Optimize DWD data parsing by switching to `data.table::fread` with string-based date pre-filtering and consolidate library imports.

Browse files
Files changed (5) hide show
  1. Dockerfile +3 -1
  2. fun/map_helpers.R +0 -3
  3. fun/parse_dwd.R +68 -32
  4. fun/plot_weather_dwd.R +1 -3
  5. global.R +2 -0
Dockerfile CHANGED
@@ -23,7 +23,9 @@ RUN install2.r --error \
23
  curl \
24
  stringr \
25
  purrr \
26
- shinyjs
 
 
27
 
28
  COPY . .
29
 
 
23
  curl \
24
  stringr \
25
  purrr \
26
+ shinyjs \
27
+ tibble \
28
+ data.table
29
 
30
  COPY . .
31
 
fun/map_helpers.R CHANGED
@@ -1,6 +1,3 @@
1
- library(leaflet)
2
- library(dplyr)
3
-
4
  #' Highlight Selected Station on Map
5
  #'
6
  #' @param proxy Leaflet proxy object
 
 
 
 
1
  #' Highlight Selected Station on Map
2
  #'
3
  #' @param proxy Leaflet proxy object
fun/parse_dwd.R CHANGED
@@ -1,9 +1,3 @@
1
- # funs/parse_dwd.R
2
- library(dplyr)
3
- library(lubridate)
4
- library(readr)
5
- library(tibble)
6
-
7
  #' Parse content of a DWD data zip file
8
  #'
9
  #' @param zip_path Local path to the .zip file
@@ -27,6 +21,12 @@ generate_dwd_na_strings <- function(base = c("-999", "-777", "-888", "8000", "99
27
  return(res)
28
  }
29
 
 
 
 
 
 
 
30
  #' Parse content of a DWD data zip file
31
  #'
32
  #' @param zip_path Local path to the .zip file
@@ -68,53 +68,89 @@ read_dwd_data <- function(zip_path, start_date = NULL, end_date = NULL) {
68
  # Read Data
69
  # DWD standard: semicolon sep
70
  na_vec <- generate_dwd_na_strings()
 
 
 
 
71
 
72
  tryCatch(
73
  {
74
- # Use readr for faster parsing (approx 3-4x faster)
75
- # Read everything as character first to separate schema from data loading
76
- df <- readr::read_delim(
77
  data_file,
78
- delim = ";",
79
- col_names = TRUE,
80
- na = na_vec,
81
- trim_ws = TRUE,
82
- col_types = readr::cols(.default = readr::col_character()),
83
- locale = readr::locale(encoding = "ISO-8859-1"),
84
- progress = FALSE,
85
- show_col_types = FALSE
86
  )
87
 
88
- # Check for date column (MESS_DATUM or MESS_DATUM_BEGINN for monthly)
89
- if (!"MESS_DATUM" %in% names(df)) {
90
- if ("MESS_DATUM_BEGINN" %in% names(df)) {
91
- df$MESS_DATUM <- df$MESS_DATUM_BEGINN
92
- } else {
93
- return(NULL)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  }
95
  }
96
 
97
- # Parse Date with dynamic format detection
98
- # 8: YYYYMMDD (Daily)
99
- # 10: YYYYMMDDHH (Hourly)
100
- # 12: YYYYMMDDHHMM (10-min)
 
101
  raw_date <- as.character(df$MESS_DATUM)
102
- nch <- nchar(raw_date[1])
 
 
103
 
 
104
  fmt <- if (nch == 8) "%Y%m%d" else if (nch == 10) "%Y%m%d%H" else if (nch == 12) "%Y%m%d%H%M" else "%Y%m%d%H"
105
 
106
  df$datetime <- as.POSIXct(raw_date, format = fmt, tz = "UTC")
107
 
108
- # Filter valid dates
109
  df <- df[!is.na(df$datetime), ]
110
 
111
- # Window Filter - Discard rows outside requested window immediately to save memory/processing
112
  if (!is.null(start_date)) {
113
  s_limit <- as.POSIXct(start_date)
114
  df <- df[df$datetime >= s_limit, ]
115
  }
116
  if (!is.null(end_date)) {
117
- e_limit <- as.POSIXct(end_date) + days(1) # Include the end date fully
118
  df <- df[df$datetime <= e_limit, ]
119
  }
120
 
@@ -219,7 +255,7 @@ read_dwd_data <- function(zip_path, start_date = NULL, end_date = NULL) {
219
  . == "V_TE100" ~ "soil_temp_100cm",
220
  . == "V_TE002M" ~ "soil_temp_2cm",
221
  . == "V_TE005M" ~ "soil_temp_5cm",
222
- . == "V_TE010M" ~ "soil_temp_10cm",
223
  . == "V_TE020M" ~ "soil_temp_20cm",
224
  . == "V_TE050M" ~ "soil_temp_50cm",
225
 
 
 
 
 
 
 
 
1
  #' Parse content of a DWD data zip file
2
  #'
3
  #' @param zip_path Local path to the .zip file
 
21
  return(res)
22
  }
23
 
24
+ #' Parse content of a DWD data zip file
25
+ #'
26
+ #' @param zip_path Local path to the .zip file
27
+ #' @param start_date Optional filter start (POSIXct)
28
+ #' @param end_date Optional filter end (POSIXct)
29
+ #' @return Tibble with parsed data or NULL
30
  #' Parse content of a DWD data zip file
31
  #'
32
  #' @param zip_path Local path to the .zip file
 
68
  # Read Data
69
  # DWD standard: semicolon sep
70
  na_vec <- generate_dwd_na_strings()
71
+ # fread strips whitespace by default, so we should pass trimmed NA strings to avoid warnings/errors
72
+ na_vec_clean <- unique(trimws(na_vec))
73
+ # Ensure empty string is included if not already
74
+ if (!"" %in% na_vec_clean) na_vec_clean <- c(na_vec_clean, "")
75
 
76
  tryCatch(
77
  {
78
+ # Use data.table::fread for faster parsing
79
+ # Read everything as character first so schema handling stays separate from (and safe against) type inference during data loading
80
+ dt <- data.table::fread(
81
  data_file,
82
+ sep = ";",
83
+ header = TRUE,
84
+ na.strings = na_vec_clean,
85
+ colClasses = "character", # Force character to avoid type inference issues with weird DWD codes
86
+ encoding = "Latin-1",
87
+ showProgress = FALSE,
88
+ data.table = TRUE
 
89
  )
90
 
91
+ # Standardize Date Column Name
92
+ if ("MESS_DATUM_BEGINN" %in% names(dt)) {
93
+ data.table::setnames(dt, "MESS_DATUM_BEGINN", "MESS_DATUM")
94
+ }
95
+
96
+ if (!"MESS_DATUM" %in% names(dt)) {
97
+ return(NULL)
98
+ }
99
+
100
+ # --- Optimization: Pre-filter rows by date string BEFORE heavy processing ---
101
+ # DWD dates are usually monotonic strings/numbers: YYYYMMDD, YYYYMMDDHH, etc.
102
+ # We can filter lexically or numerically without parsing to POSIXct first.
103
+ if (!is.null(start_date) || !is.null(end_date)) {
104
+ # Determine format from first row
105
+ sample_date <- dt$MESS_DATUM[1]
106
+ nch <- nchar(sample_date)
107
+
108
+ # Format mappings correspond to DWD standards
109
+ fmt_str <- if (nch == 8) "%Y%m%d" else if (nch == 10) "%Y%m%d%H" else if (nch == 12) "%Y%m%d%H%M" else "%Y%m%d%H"
110
+
111
+ if (!is.na(nch)) {
112
+ if (!is.null(start_date)) {
113
+ # Format request date to match DWD string format
114
+ # Explicitly use UTC to avoid timezone shifts during string formatting if inputs are UTC
115
+ s_val <- format(as.POSIXct(start_date), format = fmt_str, tz = "UTC")
116
+ dt <- dt[MESS_DATUM >= s_val]
117
+ }
118
+
119
+ if (!is.null(end_date)) {
120
+ # Add buffer (end of day) if needed, but usually exact match on string works
121
+ # If end_date is 2025-01-01, we want up to 2025-01-01 23:59
122
+ e_limit <- as.POSIXct(end_date) + days(1)
123
+ e_val <- format(e_limit, format = fmt_str, tz = "UTC")
124
+ dt <- dt[MESS_DATUM <= e_val]
125
+ }
126
  }
127
  }
128
 
129
+ # Convert to tibble/data.frame for existing dplyr pipeline compatibility
130
+ # (Keeping existing logic for column mapping to minimize regression risk)
131
+ df <- as_tibble(dt)
132
+
133
+ # Parse Date logic (Original)
134
  raw_date <- as.character(df$MESS_DATUM)
135
+ if (length(raw_date) == 0) {
136
+ return(NULL)
137
+ } # If filtered to empty
138
 
139
+ nch <- nchar(raw_date[1])
140
  fmt <- if (nch == 8) "%Y%m%d" else if (nch == 10) "%Y%m%d%H" else if (nch == 12) "%Y%m%d%H%M" else "%Y%m%d%H"
141
 
142
  df$datetime <- as.POSIXct(raw_date, format = fmt, tz = "UTC")
143
 
144
+ # Filter valid dates (Safety check)
145
  df <- df[!is.na(df$datetime), ]
146
 
147
+ # Additional safety window filter (dates might strictly parse differently than strings)
148
  if (!is.null(start_date)) {
149
  s_limit <- as.POSIXct(start_date)
150
  df <- df[df$datetime >= s_limit, ]
151
  }
152
  if (!is.null(end_date)) {
153
+ e_limit <- as.POSIXct(end_date) + days(1)
154
  df <- df[df$datetime <= e_limit, ]
155
  }
156
 
 
255
  . == "V_TE100" ~ "soil_temp_100cm",
256
  . == "V_TE002M" ~ "soil_temp_2cm",
257
  . == "V_TE005M" ~ "soil_temp_5cm",
258
+ . == "V_TE010M" ~ "soil_temp_100cm",
259
  . == "V_TE020M" ~ "soil_temp_20cm",
260
  . == "V_TE050M" ~ "soil_temp_50cm",
261
 
fun/plot_weather_dwd.R CHANGED
@@ -1,7 +1,5 @@
1
  # funs/plot_weather_dwd.R
2
- library(dplyr)
3
- library(plotly)
4
- library(lubridate)
5
 
6
  #' Create a placeholder plot for missing data
7
  #' @param message Message to display
 
1
  # funs/plot_weather_dwd.R
2
+
 
 
3
 
4
  #' Create a placeholder plot for missing data
5
  #' @param message Message to display
global.R CHANGED
@@ -13,6 +13,8 @@ library(curl)
13
  library(stringr)
14
  library(purrr)
15
  library(shinyjs)
 
 
16
 
17
 
18
  # --- Configuration ---
 
13
  library(stringr)
14
  library(purrr)
15
  library(shinyjs)
16
+ library(tibble)
17
+ library(data.table)
18
 
19
 
20
  # --- Configuration ---