nakas committed
Commit e63bfab · 1 Parent(s): 1b909aa

Add RRFS REFC downloader + Gradio app with Leaflet overlay (REFC), NOAA S3 source; ignore data/ and GRIB files

Files changed (5)
  1. .gitignore +11 -0
  2. README.md +25 -13
  3. app.py +241 -0
  4. download_latest_refc.py +116 -0
  5. requirements.txt +9 -0
.gitignore ADDED
@@ -0,0 +1,11 @@
+ # Local data downloads and indices
+ data/
+ *.grib2
+ *.grib2.idx
+
+ # OS cruft
+ .DS_Store
+
+ # Python
+ __pycache__/
+ *.pyc
README.md CHANGED
@@ -1,13 +1,25 @@
- ---
- title: Refs Take5
- emoji: 💻
- colorFrom: yellow
- colorTo: purple
- sdk: gradio
- sdk_version: 5.49.1
- app_file: app.py
- pinned: false
- license: mit
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # RRFS REFC GRIB Downloader (NOAA)
+
+ This workspace contains:
+
+ - A CLI script that fetches a current RRFS GRIB2 file containing REFC (composite reflectivity) from NOAA’s official S3 bucket (`noaa-rrfs-pds`).
+ - A simple Gradio app that does the same interactively.
+
+ Sources are 100% official: the NOAA Big Data Program S3 bucket `noaa-rrfs-pds` under `rrfs_a/rrfs.YYYYMMDD/HH/` (the real-time experimental RRFS prototype). The script verifies that REFC exists via the `.idx` sidecar before downloading.
+
+ ## CLI use
+
+ - Run: `python3 download_latest_refc.py`
+ - Saves to `data/rrfs.tHHz.prslev.2p5km.f000.<domain>.grib2`; the domain is typically `hi` or `pr`, which keeps files small.
+ - Also saves the `.idx` sidecar and prints its REFC lines for verification.
+
+ ## Gradio app
+
+ - Run locally: `python3 app.py`
+ - On Spaces, add `requirements.txt` and set the entrypoint to `app.py`.
+
+ ## Notes
+
+ - The latest available cycle for the current UTC day is found by listing the S3 prefix.
+ - To keep downloads practical, the UI defaults to small domains (Hawaii or Puerto Rico) where REFC is present and files run to tens of MB; larger domains (e.g., the North America natlev files) can reach tens of GB.
+ - No synthetic data or proxies are used; files are fetched directly from NOAA’s S3.
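The `.idx` check described in the README is cheap to run before committing to a download: the sidecar is a small plain-text inventory with one line per GRIB message. A minimal sketch of that check, using a hypothetical object key (real keys come from listing the bucket prefix):

```python
import requests

S3_BUCKET = "https://noaa-rrfs-pds.s3.amazonaws.com"
# Hypothetical key for illustration only; discover real keys by listing the prefix
key = "rrfs_a/rrfs.20240101/00/rrfs.t00z.prslev.2p5km.f000.hi.grib2"

r = requests.get(f"{S3_BUCKET}/{key}.idx", timeout=20)
if r.ok and "REFC:" in r.text:
    print("REFC present; safe to download the full GRIB2 file")
```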
app.py ADDED
@@ -0,0 +1,241 @@
+ import os
+ import re
+ import io
+ import base64
+ import xml.etree.ElementTree as ET
+ from datetime import datetime, timezone
+ from html import escape
+ from typing import List, Tuple
+
+ import gradio as gr
+ import requests
+ import numpy as np
+ import xarray as xr
+ from PIL import Image
+ from matplotlib import colormaps, colors
+ from scipy.interpolate import griddata
+
+ S3_BUCKET = "https://noaa-rrfs-pds.s3.amazonaws.com"
+ PREFIX_ROOT = "rrfs_a"
+
+
+ def list_bucket(prefix: str):
+     params = {"delimiter": "/", "prefix": prefix}
+     r = requests.get(S3_BUCKET + "/", params=params, timeout=20)
+     r.raise_for_status()
+     return ET.fromstring(r.text)
+
+
+ def latest_day_and_cycle() -> Tuple[str, str]:
+     day = datetime.now(timezone.utc).strftime("%Y%m%d")
+     root = list_bucket(f"{PREFIX_ROOT}/rrfs.{day}/")
+     hours = []
+     for cp in root.findall("{http://s3.amazonaws.com/doc/2006-03-01/}CommonPrefixes"):
+         pref = cp.find("{http://s3.amazonaws.com/doc/2006-03-01/}Prefix").text
+         parts = pref.strip("/").split("/")
+         if len(parts) >= 3 and parts[2].isdigit():
+             hours.append(parts[2])
+     if not hours:
+         raise gr.Error(f"No cycles found for {day}")
+     return day, max(hours)
+
+
+ def list_prslev(day: str, hh: str) -> List[str]:
+     root = list_bucket(f"{PREFIX_ROOT}/rrfs.{day}/{hh}/")
+     keys = []
+     for ct in root.findall("{http://s3.amazonaws.com/doc/2006-03-01/}Contents"):
+         key = ct.find("{http://s3.amazonaws.com/doc/2006-03-01/}Key").text
+         if key.endswith(".grib2") and ".prslev" in key:
+             keys.append(key)
+     return sorted(keys)
+
+
+ def parse_domains_and_hours(keys: List[str]) -> Tuple[List[str], List[str]]:
+     domains = set()
+     hours = set()
+     for k in keys:
+         m = re.search(r"\.f(\d{3})\.([a-z]+)\.grib2$", k)
+         if m:
+             hours.add(m.group(1))
+             domains.add(m.group(2))
+     return sorted(domains), sorted(hours)
+
+
+ def build_key(day: str, hh: str, dom: str, fhr: str) -> str:
+     # Prefer the 2.5 km prslev variant if present
+     candidates = [
+         f"{PREFIX_ROOT}/rrfs.{day}/{hh}/rrfs.t{hh}z.prslev.2p5km.f{fhr}.{dom}.grib2",
+         f"{PREFIX_ROOT}/rrfs.{day}/{hh}/rrfs.t{hh}z.prslev.f{fhr}.{dom}.grib2",
+     ]
+     for c in candidates:
+         # Check existence via the small .idx sidecar
+         r = requests.get(f"{S3_BUCKET}/{c}.idx", timeout=15)
+         if r.status_code == 200:
+             return c
+     raise gr.Error("No matching GRIB key found for selection")
+
+
+ def ensure_refc_in_idx(key: str) -> Tuple[bool, str]:
+     idx_url = f"{S3_BUCKET}/{key}.idx"
+     r = requests.get(idx_url, timeout=20)
+     if r.status_code != 200:
+         return False, "Index not found"
+     refc_lines = "\n".join(ln for ln in r.text.splitlines() if "REFC:" in ln)
+     return ("REFC:" in r.text), refc_lines
+
+
+ def fetch_latest(dom: str, fhr: str):
+     day, hh = latest_day_and_cycle()
+     keys = list_prslev(day, hh)
+     if not keys:
+         raise gr.Error("No prslev keys available for latest cycle")
+     key = build_key(day, hh, dom, fhr)
+     ok, refc = ensure_refc_in_idx(key)
+     if not ok:
+         raise gr.Error("Selected file does not contain REFC")
+     url = f"{S3_BUCKET}/{key}"
+     os.makedirs("data", exist_ok=True)
+     out_path = os.path.join("data", os.path.basename(key))
+     with requests.get(url, stream=True, timeout=60) as r:
+         r.raise_for_status()
+         with open(out_path, "wb") as f:
+             for chunk in r.iter_content(chunk_size=1024 * 1024):
+                 if chunk:
+                     f.write(chunk)
+     size_mb = os.path.getsize(out_path) / (1024 * 1024)
+     html = generate_leaflet_overlay(out_path)
+     return (
+         f"Saved: {out_path} (≈ {size_mb:.1f} MiB)\nCycle: {day} {hh}Z\nURL: {url}",
+         refc or "(REFC present; see .idx for details)",
+         url,
+         html,
+     )
+
+
+ def generate_leaflet_overlay(grib_path: str) -> str:
+     # Read the REFC field with cfgrib via xarray; indexpath="" avoids writing
+     # sidecar index files, and filter_by_keys selects only the REFC message
+     ds = xr.open_dataset(
+         grib_path,
+         engine="cfgrib",
+         backend_kwargs={
+             "indexpath": "",
+             "filter_by_keys": {"shortName": "refc"},
+         },
+     )
+     # Pick the first data variable
+     var_name = list(ds.data_vars)[0]
+     da = ds[var_name]
+     # Drop singleton time/step dimensions if present
+     for dim in ["time", "valid_time", "step"]:
+         if dim in da.dims and da.sizes.get(dim, 1) == 1:
+             da = da.isel({dim: 0})
+     # Lat/lon coordinates
+     lat = ds.coords.get("latitude")
+     lon = ds.coords.get("longitude")
+     if lat is None or lon is None:
+         # Some cfgrib versions expose lat/lon on the DataArray instead
+         lat = da.coords.get("latitude")
+         lon = da.coords.get("longitude")
+     if lat is None or lon is None:
+         raise gr.Error("Could not locate latitude/longitude coordinates in GRIB")
+
+     latv = np.array(lat)
+     lonv = np.array(lon)
+     # GRIB longitudes are usually 0..360; shift to -180..180 for Leaflet
+     lonv = np.where(lonv > 180.0, lonv - 360.0, lonv)
+     # Expand 1-D rectilinear coordinates so they pair with every grid point
+     if latv.ndim == 1 and lonv.ndim == 1:
+         lonv, latv = np.meshgrid(lonv, latv)
+     data = np.array(da)
+
+     # Build a target regular lat/lon grid for the Leaflet overlay
+     lat_min = float(np.nanmin(latv))
+     lat_max = float(np.nanmax(latv))
+     lon_min = float(np.nanmin(lonv))
+     lon_max = float(np.nanmax(lonv))
+
+     # Reasonable output grid size for small domains
+     ny, nx = 400, 400
+     tgt_lats = np.linspace(lat_min, lat_max, ny)
+     tgt_lons = np.linspace(lon_min, lon_max, nx)
+     grid_lon, grid_lat = np.meshgrid(tgt_lons, tgt_lats)
+
+     # Interpolate the native (possibly curvilinear) grid to the regular grid
+     points = np.column_stack((lonv.ravel(), latv.ravel()))
+     values = data.ravel()
+     # Mask missing/non-finite values
+     mask = np.isfinite(points[:, 0]) & np.isfinite(points[:, 1]) & np.isfinite(values)
+     points = points[mask]
+     values = values[mask]
+     # Nearest-neighbour for robustness
+     grid = griddata(points, values, (grid_lon, grid_lat), method="nearest")
+
+     # Color mapping for reflectivity (0..75 dBZ); transparent below 5 dBZ
+     vmin, vmax = 0.0, 75.0
+     norm = colors.Normalize(vmin=vmin, vmax=vmax)
+     cmap = colormaps["turbo"]
+     rgba = cmap(norm(np.clip(grid, vmin, vmax)))  # (ny, nx, 4)
+     alpha = np.where(np.isnan(grid) | (grid < 5.0), 0.0, 0.65)
+     rgba[..., 3] = alpha
+
+     # Flip rows so north is at the top, as Leaflet's ImageOverlay expects
+     img = (np.flipud(rgba) * 255).astype(np.uint8)
+     image = Image.fromarray(img, mode="RGBA")
+     buf = io.BytesIO()
+     image.save(buf, format="PNG")
+     encoded = base64.b64encode(buf.getvalue()).decode("ascii")
+
+     # Build the Leaflet page with an ImageOverlay
+     page = f"""
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta charset="utf-8" />
+ <meta name="viewport" content="width=device-width, initial-scale=1.0"/>
+ <link rel="stylesheet" href="https://unpkg.com/leaflet@1.9.4/dist/leaflet.css"/>
+ <style>#map {{ height: 520px; width: 100%; }}</style>
+ </head>
+ <body>
+ <div id="map"></div>
+ <script src="https://unpkg.com/leaflet@1.9.4/dist/leaflet.js"></script>
+ <script>
+ var map = L.map('map').setView([{(lat_min + lat_max) / 2:.4f}, {(lon_min + lon_max) / 2:.4f}], 6);
+ L.tileLayer('https://tile.openstreetmap.org/{{z}}/{{x}}/{{y}}.png', {{
+     maxZoom: 12,
+     attribution: '&copy; OpenStreetMap contributors'
+ }}).addTo(map);
+ var bounds = L.latLngBounds([[{lat_min:.6f}, {lon_min:.6f}], [{lat_max:.6f}, {lon_max:.6f}]]);
+ var img = 'data:image/png;base64,{encoded}';
+ L.imageOverlay(img, bounds, {{opacity: 1.0, interactive: false}}).addTo(map);
+ map.fitBounds(bounds);
+ </script>
+ </body>
+ </html>
+ """
+     # Scripts injected via innerHTML (as gr.HTML does) never execute, so the
+     # page is served through an iframe srcdoc to make the map actually render
+     return f'<iframe style="width:100%;height:540px;border:none;" srcdoc="{escape(page)}"></iframe>'
+
+
+ def build_ui():
+     with gr.Blocks(title="RRFS REFC Downloader (NOAA S3)") as demo:
+         gr.Markdown("""
+ Downloads a current Rapid Refresh Forecast System (RRFS) GRIB2 file that contains REFC from NOAA’s official S3 bucket (noaa-rrfs-pds).
+ """)
+         with gr.Row():
+             dom = gr.Dropdown(label="Domain", choices=["hi", "pr"], value="hi", info="Use a small domain to keep the download size reasonable")
+             fhr = gr.Dropdown(label="Forecast Hour", choices=[f"{i:03d}" for i in range(0, 10)], value="000")
+         run = gr.Button("Fetch Latest RRFS REFC GRIB")
+         status = gr.Textbox(label="Download Status", interactive=False)
+         idx = gr.Textbox(label="REFC lines from .idx", lines=6, interactive=False)
+         link = gr.Textbox(label="Source URL", interactive=False)
+         leaflet = gr.HTML(label="Leaflet Map Overlay")
+
+         run.click(fn=fetch_latest, inputs=[dom, fhr], outputs=[status, idx, link, leaflet])
+     return demo
+
+
+ if __name__ == "__main__":
+     app = build_ui()
+     app.launch()
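As a quick offline check of the cfgrib path used in `generate_leaflet_overlay`, the REFC field can be opened straight from a downloaded file. A sketch, where the path is a hypothetical example of what `fetch_latest` saves under `data/`:

```python
import xarray as xr

# Hypothetical local path; use whatever the app actually saved under data/
ds = xr.open_dataset(
    "data/rrfs.t00z.prslev.2p5km.f000.hi.grib2",
    engine="cfgrib",
    backend_kwargs={"indexpath": "", "filter_by_keys": {"shortName": "refc"}},
)
refc = ds[list(ds.data_vars)[0]]
print(refc.shape, float(refc.min()), float(refc.max()))
```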
download_latest_refc.py ADDED
@@ -0,0 +1,116 @@
+ #!/usr/bin/env python3
+ import os
+ import sys
+ import time
+ import xml.etree.ElementTree as ET
+ from datetime import datetime, timezone
+
+ import requests
+
+ S3_BUCKET = "https://noaa-rrfs-pds.s3.amazonaws.com"
+ PREFIX_ROOT = "rrfs_a"
+
+
+ def list_bucket(prefix: str):
+     params = {"delimiter": "/", "prefix": prefix}
+     r = requests.get(S3_BUCKET + "/", params=params, timeout=20)
+     r.raise_for_status()
+     return ET.fromstring(r.text)
+
+
+ def find_latest_cycle(day_ymd: str) -> str | None:
+     root = list_bucket(f"{PREFIX_ROOT}/rrfs.{day_ymd}/")
+     hours = []
+     for cp in root.findall("{http://s3.amazonaws.com/doc/2006-03-01/}CommonPrefixes"):
+         pref = cp.find("{http://s3.amazonaws.com/doc/2006-03-01/}Prefix").text
+         parts = pref.strip("/").split("/")
+         if len(parts) >= 3:
+             hh = parts[2]
+             if hh.isdigit() and len(hh) == 2:
+                 hours.append(hh)
+     return max(hours) if hours else None
+
+
+ def list_prslev_keys(day_ymd: str, hh: str) -> list[str]:
+     # Returns keys like rrfs_a/rrfs.YYYYMMDD/HH/rrfs.tHHz.prslev.2p5km.fNNN.DOM.grib2
+     root = list_bucket(f"{PREFIX_ROOT}/rrfs.{day_ymd}/{hh}/")
+     keys = []
+     for ct in root.findall("{http://s3.amazonaws.com/doc/2006-03-01/}Contents"):
+         key = ct.find("{http://s3.amazonaws.com/doc/2006-03-01/}Key").text
+         if "/rrfs.t" in key and ".prslev" in key and key.endswith(".grib2"):
+             keys.append(key)
+     return keys
+
+
+ def choose_smallest_refc_candidate(keys: list[str]) -> str | None:
+     # Prefer smaller domains to keep downloads reasonable (hi, pr), then others
+     domain_order = ["hi", "pr", "ak", "conus", "na"]
+     # Prefer f000 first
+     sorted_keys = sorted(keys, key=lambda k: ("f000" not in k, next((i for i, d in enumerate(domain_order) if f".{d}.grib2" in k), 99), k))
+     return sorted_keys[0] if sorted_keys else None
+
+
+ def ensure_refc_in_idx(grib_url: str) -> bool:
+     idx_url = grib_url + ".idx"
+     r = requests.get(idx_url, timeout=20)
+     if r.status_code != 200:
+         return False
+     return "REFC:" in r.text
+
+
+ def download(url: str, out_path: str):
+     with requests.get(url, stream=True, timeout=30) as r:
+         r.raise_for_status()
+         with open(out_path, "wb") as f:
+             for chunk in r.iter_content(chunk_size=1024 * 1024):
+                 if chunk:
+                     f.write(chunk)
+
+
+ def main():
+     day = datetime.now(timezone.utc).strftime("%Y%m%d")
+     latest = find_latest_cycle(day)
+     if latest is None:
+         print(f"No cycles found for {day} under {S3_BUCKET}/{PREFIX_ROOT}", file=sys.stderr)
+         sys.exit(2)
+
+     keys = list_prslev_keys(day, latest)
+     if not keys:
+         print(f"No prslev GRIB2 keys found for {day} {latest}Z", file=sys.stderr)
+         sys.exit(2)
+
+     candidate = choose_smallest_refc_candidate(keys)
+     if candidate is None:
+         print("No candidate GRIB2 key found", file=sys.stderr)
+         sys.exit(2)
+
+     grib_url = f"{S3_BUCKET}/{candidate}"
+     if not ensure_refc_in_idx(grib_url):
+         print("Chosen file does not contain REFC in index; aborting per requirements.", file=sys.stderr)
+         sys.exit(3)
+
+     os.makedirs("data", exist_ok=True)
+     out = os.path.join("data", os.path.basename(candidate))
+     print(f"Downloading: {grib_url}\n -> {out}")
+     t0 = time.time()
+     download(grib_url, out)
+     dt = time.time() - t0
+     size_mb = os.path.getsize(out) / (1024 * 1024)
+     print(f"Done: {size_mb:.1f} MiB in {dt:.1f}s")
+
+     # Save index for quick verification
+     idx_path = out + ".idx"
+     r = requests.get(grib_url + ".idx", timeout=20)
+     r.raise_for_status()
+     with open(idx_path, "wb") as f:
+         f.write(r.content)
+     # Echo REFC lines
+     lines = [ln for ln in r.text.splitlines() if "REFC:" in ln]
+     print("REFC index lines:")
+     for ln in lines[:5]:
+         print(ln)
+
+
+ if __name__ == "__main__":
+     main()
+
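The `.idx` sidecar also records each message's starting byte offset (the second `:`-separated field of every line), so a natural extension of this script would fetch only the REFC message with an HTTP Range request instead of the whole file. A sketch of that idea, not part of this commit:

```python
import requests


def fetch_refc_message(grib_url: str) -> bytes:
    """Download only the REFC GRIB message using byte offsets from the .idx sidecar."""
    idx = requests.get(grib_url + ".idx", timeout=20)
    idx.raise_for_status()
    lines = idx.text.splitlines()
    for i, ln in enumerate(lines):
        if "REFC:" in ln:
            start = int(ln.split(":")[1])
            # The message ends where the next one begins; open-ended for the last one
            end = int(lines[i + 1].split(":")[1]) - 1 if i + 1 < len(lines) else ""
            r = requests.get(grib_url, headers={"Range": f"bytes={start}-{end}"}, timeout=30)
            r.raise_for_status()
            return r.content
    raise ValueError("REFC not found in index")
```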
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ gradio>=4.0.0
+ requests>=2.31.0
+ numpy>=1.23
+ xarray>=2023.1.0
+ cfgrib>=0.9.10.4
+ eccodes>=1.6.1
+ matplotlib>=3.7
+ Pillow>=10.0
+ scipy>=1.10
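One packaging note, offered as an assumption rather than something this commit pins down: `cfgrib` needs the ecCodes C library at runtime. Recent `eccodes` wheels bundle it, but if the import fails in a minimal container (including a Space), installing the OS package, e.g. `libeccodes-dev` via `packages.txt` on Debian-based images, is the usual fix.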