nakas committed
Commit e63bfab · 1 Parent(s): 1b909aa

Add RRFS REFC downloader + Gradio app with Leaflet overlay (REFC), NOAA S3 source; ignore data/ and GRIB files

Files changed (5)
  1. .gitignore +11 -0
  2. README.md +25 -13
  3. app.py +241 -0
  4. download_latest_refc.py +116 -0
  5. requirements.txt +9 -0
.gitignore ADDED
@@ -0,0 +1,11 @@
+ # Local data downloads and indices
+ data/
+ *.grib2
+ *.grib2.idx
+
+ # OS cruft
+ .DS_Store
+
+ # Python
+ __pycache__/
+ *.pyc
README.md CHANGED
@@ -1,13 +1,25 @@
- ---
- title: Refs Take5
- emoji: 💻
- colorFrom: yellow
- colorTo: purple
- sdk: gradio
- sdk_version: 5.49.1
- app_file: app.py
- pinned: false
- license: mit
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # RRFS REFC GRIB Downloader (NOAA)
+
+ This workspace contains:
+
+ - A CLI script that fetches a current RRFS GRIB2 file containing REFC (composite reflectivity) from NOAA’s official S3 bucket (`noaa-rrfs-pds`).
+ - A simple Gradio app that does the same interactively.
+
+ Sources are 100% official: the NOAA Big Data Program S3 bucket `noaa-rrfs-pds` under `rrfs_a/rrfs.YYYYMMDD/HH/` (the real-time experimental RRFS prototype). The script verifies that REFC exists via the `.idx` sidecar before downloading.
+
+ ## CLI use
+
+ - Run: `python3 download_latest_refc.py`
+ - Saves to `data/rrfs.tHHz.prslev.2p5km.f000.<domain>.grib2`; the domain is typically `hi` or `pr`, which keeps files small.
+ - Also saves the `.idx` sidecar and prints its REFC lines for verification.
+
+ ## Gradio app
+
+ - Run locally: `python3 app.py`
+ - On Spaces, add `requirements.txt` and set the entrypoint to `app.py`.
+
+ ## Notes
+
+ - The latest available cycle for the current UTC day is found by listing the S3 prefix.
+ - To keep downloads practical, the UI defaults to small domains (Hawaii or Puerto Rico) where REFC is present and files run to tens of MB; larger domains (e.g., the North America natlev files) can reach tens of GB.
+ - No synthetic data or proxies are used; files are fetched directly from NOAA’s S3.
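The `.idx` check described in the README is cheap to run before committing to a download: the sidecar is a small plain-text inventory with one line per GRIB message. A minimal sketch of that check, using a hypothetical object key (real keys come from listing the bucket prefix):

```python
import requests

S3_BUCKET = "https://noaa-rrfs-pds.s3.amazonaws.com"
# Hypothetical key for illustration only; discover real keys by listing the prefix
key = "rrfs_a/rrfs.20240101/00/rrfs.t00z.prslev.2p5km.f000.hi.grib2"

r = requests.get(f"{S3_BUCKET}/{key}.idx", timeout=20)
if r.ok and "REFC:" in r.text:
    print("REFC present; safe to download the full GRIB2 file")
```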
app.py ADDED
@@ -0,0 +1,241 @@
+ import os
+ import re
+ import io
+ import base64
+ import xml.etree.ElementTree as ET
+ from datetime import datetime, timezone
+ from html import escape
+ from typing import List, Tuple
+
+ import gradio as gr
+ import requests
+ import numpy as np
+ import xarray as xr
+ from PIL import Image
+ from matplotlib import colormaps, colors
+ from scipy.interpolate import griddata
+
+ S3_BUCKET = "https://noaa-rrfs-pds.s3.amazonaws.com"
+ PREFIX_ROOT = "rrfs_a"
+
+
+ def list_bucket(prefix: str):
+     params = {"delimiter": "/", "prefix": prefix}
+     r = requests.get(S3_BUCKET + "/", params=params, timeout=20)
+     r.raise_for_status()
+     return ET.fromstring(r.text)
+
+
+ def latest_day_and_cycle() -> Tuple[str, str]:
+     day = datetime.now(timezone.utc).strftime("%Y%m%d")
+     root = list_bucket(f"{PREFIX_ROOT}/rrfs.{day}/")
+     hours = []
+     for cp in root.findall("{http://s3.amazonaws.com/doc/2006-03-01/}CommonPrefixes"):
+         pref = cp.find("{http://s3.amazonaws.com/doc/2006-03-01/}Prefix").text
+         parts = pref.strip("/").split("/")
+         if len(parts) >= 3 and parts[2].isdigit():
+             hours.append(parts[2])
+     if not hours:
+         raise gr.Error(f"No cycles found for {day}")
+     return day, max(hours)
+
+
+ def list_prslev(day: str, hh: str) -> List[str]:
+     root = list_bucket(f"{PREFIX_ROOT}/rrfs.{day}/{hh}/")
+     keys = []
+     for ct in root.findall("{http://s3.amazonaws.com/doc/2006-03-01/}Contents"):
+         key = ct.find("{http://s3.amazonaws.com/doc/2006-03-01/}Key").text
+         if key.endswith(".grib2") and ".prslev" in key:
+             keys.append(key)
+     return sorted(keys)
+
+
+ def parse_domains_and_hours(keys: List[str]) -> Tuple[List[str], List[str]]:
+     domains = set()
+     hours = set()
+     for k in keys:
+         m = re.search(r"\.f(\d{3})\.([a-z]+)\.grib2$", k)
+         if m:
+             hours.add(m.group(1))
+             domains.add(m.group(2))
+     return sorted(domains), sorted(hours)
+
+
+ def build_key(day: str, hh: str, dom: str, fhr: str) -> str:
+     # Prefer the 2.5 km prslev variant if present
+     candidates = [
+         f"{PREFIX_ROOT}/rrfs.{day}/{hh}/rrfs.t{hh}z.prslev.2p5km.f{fhr}.{dom}.grib2",
+         f"{PREFIX_ROOT}/rrfs.{day}/{hh}/rrfs.t{hh}z.prslev.f{fhr}.{dom}.grib2",
+     ]
+     for c in candidates:
+         # Check existence via the small .idx sidecar
+         r = requests.get(f"{S3_BUCKET}/{c}.idx", timeout=15)
+         if r.status_code == 200:
+             return c
+     raise gr.Error("No matching GRIB key found for selection")
+
+
+ def ensure_refc_in_idx(key: str) -> Tuple[bool, str]:
+     idx_url = f"{S3_BUCKET}/{key}.idx"
+     r = requests.get(idx_url, timeout=20)
+     if r.status_code != 200:
+         return False, "Index not found"
+     refc_lines = "\n".join(ln for ln in r.text.splitlines() if "REFC:" in ln)
+     return ("REFC:" in r.text), refc_lines
+
+
+ def fetch_latest(dom: str, fhr: str):
+     day, hh = latest_day_and_cycle()
+     keys = list_prslev(day, hh)
+     if not keys:
+         raise gr.Error("No prslev keys available for latest cycle")
+     key = build_key(day, hh, dom, fhr)
+     ok, refc = ensure_refc_in_idx(key)
+     if not ok:
+         raise gr.Error("Selected file does not contain REFC")
+     url = f"{S3_BUCKET}/{key}"
+     os.makedirs("data", exist_ok=True)
+     out_path = os.path.join("data", os.path.basename(key))
+     with requests.get(url, stream=True, timeout=60) as r:
+         r.raise_for_status()
+         with open(out_path, "wb") as f:
+             for chunk in r.iter_content(chunk_size=1024 * 1024):
+                 if chunk:
+                     f.write(chunk)
+     size_mb = os.path.getsize(out_path) / (1024 * 1024)
+     html = generate_leaflet_overlay(out_path)
+     return (
+         f"Saved: {out_path} (≈ {size_mb:.1f} MiB)\nCycle: {day} {hh}Z\nURL: {url}",
+         refc or "(REFC present; see .idx for details)",
+         url,
+         html,
+     )
+
+
+ def generate_leaflet_overlay(grib_path: str) -> str:
+     # Read the REFC field with cfgrib via xarray; indexpath="" avoids writing
+     # sidecar index files, and filter_by_keys selects only the REFC message
+     ds = xr.open_dataset(
+         grib_path,
+         engine="cfgrib",
+         backend_kwargs={
+             "indexpath": "",
+             "filter_by_keys": {"shortName": "refc"},
+         },
+     )
+     # Pick the first data variable
+     var_name = list(ds.data_vars)[0]
+     da = ds[var_name]
+     # Drop singleton time/step dimensions if present
+     for dim in ["time", "valid_time", "step"]:
+         if dim in da.dims and da.sizes.get(dim, 1) == 1:
+             da = da.isel({dim: 0})
+     # Lat/lon coordinates
+     lat = ds.coords.get("latitude")
+     lon = ds.coords.get("longitude")
+     if lat is None or lon is None:
+         # Some cfgrib versions expose lat/lon on the DataArray instead
+         lat = da.coords.get("latitude")
+         lon = da.coords.get("longitude")
+     if lat is None or lon is None:
+         raise gr.Error("Could not locate latitude/longitude coordinates in GRIB")
+
+     latv = np.array(lat)
+     lonv = np.array(lon)
+     # GRIB longitudes are usually 0..360; shift to -180..180 for Leaflet
+     lonv = np.where(lonv > 180.0, lonv - 360.0, lonv)
+     # Expand 1-D rectilinear coordinates so they pair with every grid point
+     if latv.ndim == 1 and lonv.ndim == 1:
+         lonv, latv = np.meshgrid(lonv, latv)
+     data = np.array(da)
+
+     # Build a target regular lat/lon grid for the Leaflet overlay
+     lat_min = float(np.nanmin(latv))
+     lat_max = float(np.nanmax(latv))
+     lon_min = float(np.nanmin(lonv))
+     lon_max = float(np.nanmax(lonv))
+
+     # Reasonable output grid size for small domains
+     ny, nx = 400, 400
+     tgt_lats = np.linspace(lat_min, lat_max, ny)
+     tgt_lons = np.linspace(lon_min, lon_max, nx)
+     grid_lon, grid_lat = np.meshgrid(tgt_lons, tgt_lats)
+
+     # Interpolate the native (possibly curvilinear) grid to the regular grid
+     points = np.column_stack((lonv.ravel(), latv.ravel()))
+     values = data.ravel()
+     # Mask missing/non-finite values
+     mask = np.isfinite(points[:, 0]) & np.isfinite(points[:, 1]) & np.isfinite(values)
+     points = points[mask]
+     values = values[mask]
+     # Nearest-neighbour for robustness
+     grid = griddata(points, values, (grid_lon, grid_lat), method="nearest")
+
+     # Color mapping for reflectivity (0..75 dBZ); transparent below 5 dBZ
+     vmin, vmax = 0.0, 75.0
+     norm = colors.Normalize(vmin=vmin, vmax=vmax)
+     cmap = colormaps["turbo"]
+     rgba = cmap(norm(np.clip(grid, vmin, vmax)))  # (ny, nx, 4)
+     alpha = np.where(np.isnan(grid) | (grid < 5.0), 0.0, 0.65)
+     rgba[..., 3] = alpha
+
+     # Flip rows so north is at the top, as Leaflet's ImageOverlay expects
+     img = (np.flipud(rgba) * 255).astype(np.uint8)
+     image = Image.fromarray(img, mode="RGBA")
+     buf = io.BytesIO()
+     image.save(buf, format="PNG")
+     encoded = base64.b64encode(buf.getvalue()).decode("ascii")
+
+     # Build the Leaflet page with an ImageOverlay
+     page = f"""
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta charset="utf-8" />
+ <meta name="viewport" content="width=device-width, initial-scale=1.0"/>
+ <link rel="stylesheet" href="https://unpkg.com/leaflet@1.9.4/dist/leaflet.css"/>
+ <style>#map {{ height: 520px; width: 100%; }}</style>
+ </head>
+ <body>
+ <div id="map"></div>
+ <script src="https://unpkg.com/leaflet@1.9.4/dist/leaflet.js"></script>
+ <script>
+ var map = L.map('map').setView([{(lat_min + lat_max) / 2:.4f}, {(lon_min + lon_max) / 2:.4f}], 6);
+ L.tileLayer('https://tile.openstreetmap.org/{{z}}/{{x}}/{{y}}.png', {{
+     maxZoom: 12,
+     attribution: '&copy; OpenStreetMap contributors'
+ }}).addTo(map);
+ var bounds = L.latLngBounds([[{lat_min:.6f}, {lon_min:.6f}], [{lat_max:.6f}, {lon_max:.6f}]]);
+ var img = 'data:image/png;base64,{encoded}';
+ L.imageOverlay(img, bounds, {{opacity: 1.0, interactive: false}}).addTo(map);
+ map.fitBounds(bounds);
+ </script>
+ </body>
+ </html>
+ """
+     # Scripts injected via innerHTML (as gr.HTML does) never execute, so the
+     # page is served through an iframe srcdoc to make the map actually render
+     return f'<iframe style="width:100%;height:540px;border:none;" srcdoc="{escape(page)}"></iframe>'
+
+
+ def build_ui():
+     with gr.Blocks(title="RRFS REFC Downloader (NOAA S3)") as demo:
+         gr.Markdown("""
+ Downloads a current Rapid Refresh Forecast System (RRFS) GRIB2 file that contains REFC from NOAA’s official S3 bucket (noaa-rrfs-pds).
+ """)
+         with gr.Row():
+             dom = gr.Dropdown(label="Domain", choices=["hi", "pr"], value="hi", info="Use a small domain to keep the download size reasonable")
+             fhr = gr.Dropdown(label="Forecast Hour", choices=[f"{i:03d}" for i in range(0, 10)], value="000")
+         run = gr.Button("Fetch Latest RRFS REFC GRIB")
+         status = gr.Textbox(label="Download Status", interactive=False)
+         idx = gr.Textbox(label="REFC lines from .idx", lines=6, interactive=False)
+         link = gr.Textbox(label="Source URL", interactive=False)
+         leaflet = gr.HTML(label="Leaflet Map Overlay")
+
+         run.click(fn=fetch_latest, inputs=[dom, fhr], outputs=[status, idx, link, leaflet])
+     return demo
+
+
+ if __name__ == "__main__":
+     app = build_ui()
+     app.launch()
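As a quick offline check of the cfgrib path used in `generate_leaflet_overlay`, the REFC field can be opened straight from a downloaded file. A sketch, where the path is a hypothetical example of what `fetch_latest` saves under `data/`:

```python
import xarray as xr

# Hypothetical local path; use whatever the app actually saved under data/
ds = xr.open_dataset(
    "data/rrfs.t00z.prslev.2p5km.f000.hi.grib2",
    engine="cfgrib",
    backend_kwargs={"indexpath": "", "filter_by_keys": {"shortName": "refc"}},
)
refc = ds[list(ds.data_vars)[0]]
print(refc.shape, float(refc.min()), float(refc.max()))
```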
download_latest_refc.py ADDED
@@ -0,0 +1,116 @@
+ #!/usr/bin/env python3
+ import os
+ import sys
+ import time
+ import xml.etree.ElementTree as ET
+ from datetime import datetime, timezone
+
+ import requests
+
+ S3_BUCKET = "https://noaa-rrfs-pds.s3.amazonaws.com"
+ PREFIX_ROOT = "rrfs_a"
+
+
+ def list_bucket(prefix: str):
+     params = {"delimiter": "/", "prefix": prefix}
+     r = requests.get(S3_BUCKET + "/", params=params, timeout=20)
+     r.raise_for_status()
+     return ET.fromstring(r.text)
+
+
+ def find_latest_cycle(day_ymd: str) -> str | None:
+     root = list_bucket(f"{PREFIX_ROOT}/rrfs.{day_ymd}/")
+     hours = []
+     for cp in root.findall("{http://s3.amazonaws.com/doc/2006-03-01/}CommonPrefixes"):
+         pref = cp.find("{http://s3.amazonaws.com/doc/2006-03-01/}Prefix").text
+         parts = pref.strip("/").split("/")
+         if len(parts) >= 3:
+             hh = parts[2]
+             if hh.isdigit() and len(hh) == 2:
+                 hours.append(hh)
+     return max(hours) if hours else None
+
+
+ def list_prslev_keys(day_ymd: str, hh: str) -> list[str]:
+     # Returns keys like rrfs_a/rrfs.YYYYMMDD/HH/rrfs.tHHz.prslev.2p5km.fNNN.DOM.grib2
+     root = list_bucket(f"{PREFIX_ROOT}/rrfs.{day_ymd}/{hh}/")
+     keys = []
+     for ct in root.findall("{http://s3.amazonaws.com/doc/2006-03-01/}Contents"):
+         key = ct.find("{http://s3.amazonaws.com/doc/2006-03-01/}Key").text
+         if "/rrfs.t" in key and ".prslev" in key and key.endswith(".grib2"):
+             keys.append(key)
+     return keys
+
+
+ def choose_smallest_refc_candidate(keys: list[str]) -> str | None:
+     # Prefer smaller domains to keep downloads reasonable (hi, pr), then others
+     domain_order = ["hi", "pr", "ak", "conus", "na"]
+     # Prefer f000 first
+     sorted_keys = sorted(keys, key=lambda k: ("f000" not in k, next((i for i, d in enumerate(domain_order) if f".{d}.grib2" in k), 99), k))
+     return sorted_keys[0] if sorted_keys else None
+
+
+ def ensure_refc_in_idx(grib_url: str) -> bool:
+     idx_url = grib_url + ".idx"
+     r = requests.get(idx_url, timeout=20)
+     if r.status_code != 200:
+         return False
+     return "REFC:" in r.text
+
+
+ def download(url: str, out_path: str):
+     with requests.get(url, stream=True, timeout=30) as r:
+         r.raise_for_status()
+         with open(out_path, "wb") as f:
+             for chunk in r.iter_content(chunk_size=1024 * 1024):
+                 if chunk:
+                     f.write(chunk)
+
+
+ def main():
+     day = datetime.now(timezone.utc).strftime("%Y%m%d")
+     latest = find_latest_cycle(day)
+     if latest is None:
+         print(f"No cycles found for {day} under {S3_BUCKET}/{PREFIX_ROOT}", file=sys.stderr)
+         sys.exit(2)
+
+     keys = list_prslev_keys(day, latest)
+     if not keys:
+         print(f"No prslev GRIB2 keys found for {day} {latest}Z", file=sys.stderr)
+         sys.exit(2)
+
+     candidate = choose_smallest_refc_candidate(keys)
+     if candidate is None:
+         print("No candidate GRIB2 key found", file=sys.stderr)
+         sys.exit(2)
+
+     grib_url = f"{S3_BUCKET}/{candidate}"
+     if not ensure_refc_in_idx(grib_url):
+         print("Chosen file does not contain REFC in index; aborting per requirements.", file=sys.stderr)
+         sys.exit(3)
+
+     os.makedirs("data", exist_ok=True)
+     out = os.path.join("data", os.path.basename(candidate))
+     print(f"Downloading: {grib_url}\n -> {out}")
+     t0 = time.time()
+     download(grib_url, out)
+     dt = time.time() - t0
+     size_mb = os.path.getsize(out) / (1024 * 1024)
+     print(f"Done: {size_mb:.1f} MiB in {dt:.1f}s")
+
+     # Save index for quick verification
+     idx_path = out + ".idx"
+     r = requests.get(grib_url + ".idx", timeout=20)
+     r.raise_for_status()
+     with open(idx_path, "wb") as f:
+         f.write(r.content)
+     # Echo REFC lines
+     lines = [ln for ln in r.text.splitlines() if "REFC:" in ln]
+     print("REFC index lines:")
+     for ln in lines[:5]:
+         print(ln)
+
+
+ if __name__ == "__main__":
+     main()
+
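The `.idx` sidecar also records each message's starting byte offset (the second `:`-separated field of every line), so a natural extension of this script would fetch only the REFC message with an HTTP Range request instead of the whole file. A sketch of that idea, not part of this commit:

```python
import requests


def fetch_refc_message(grib_url: str) -> bytes:
    """Download only the REFC GRIB message using byte offsets from the .idx sidecar."""
    idx = requests.get(grib_url + ".idx", timeout=20)
    idx.raise_for_status()
    lines = idx.text.splitlines()
    for i, ln in enumerate(lines):
        if "REFC:" in ln:
            start = int(ln.split(":")[1])
            # The message ends where the next one begins; open-ended for the last one
            end = int(lines[i + 1].split(":")[1]) - 1 if i + 1 < len(lines) else ""
            r = requests.get(grib_url, headers={"Range": f"bytes={start}-{end}"}, timeout=30)
            r.raise_for_status()
            return r.content
    raise ValueError("REFC not found in index")
```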
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ gradio>=4.0.0
+ requests>=2.31.0
+ numpy>=1.23
+ xarray>=2023.1.0
+ cfgrib>=0.9.10.4
+ eccodes>=1.6.1
+ matplotlib>=3.7
+ Pillow>=10.0
+ scipy>=1.10
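One packaging note, offered as an assumption rather than something this commit pins down: `cfgrib` needs the ecCodes C library at runtime. Recent `eccodes` wheels bundle it, but if the import fails in a minimal container (including a Space), installing the OS package, e.g. `libeccodes-dev` via `packages.txt` on Debian-based images, is the usual fix.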