clarindasusan committed on
Commit
7042517
·
verified ·
1 Parent(s): 10c13cf

Create data_loader.py

Browse files
Files changed (1) hide show
  1. src/data_loader.py +129 -0
src/data_loader.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ CycloneDataLoader
3
+ =================
4
+ Joins the four cyclone CSVs into a single training-ready DataFrame.
5
+ Handles spatial nearest-neighbour merging for SST, moisture, and shear.
6
+ """
7
+
8
+ import pandas as pd
9
+ import numpy as np
10
+ from scipy.spatial import cKDTree
11
+ from pathlib import Path
12
+
13
+
14
+ def _nearest_merge(
15
+ base: pd.DataFrame,
16
+ aux: pd.DataFrame,
17
+ aux_cols: list[str],
18
+ aux_lat: str = "latitude",
19
+ aux_lon: str = "longitude",
20
+ ) -> pd.DataFrame:
21
+ """
22
+ For each row in `base`, find the spatially nearest row in `aux`
23
+ and attach aux_cols. Uses a cKDTree for efficiency.
24
+ """
25
+ tree = cKDTree(aux[[aux_lat, aux_lon]].values)
26
+ dists, idxs = tree.query(base[["latitude", "longitude"]].values)
27
+ for col in aux_cols:
28
+ base[col] = aux[col].iloc[idxs].values
29
+ return base
30
+
31
+
32
def load_cyclone_training_data(data_dir: str = "data") -> pd.DataFrame:
    """
    Join the four cyclone CSVs into a single training-ready DataFrame.

    Returns a clean DataFrame with all 8 cyclone model features plus a
    synthetic `risk_score` label derived from intensity/landfall signals.

    Columns produced (matching CYCLONE_FEATURES exactly):
        wind_speed_kmh, central_pressure_hpa, sea_surface_temp_c,
        track_curvature, distance_to_coast_km, storm_surge_potential,
        atmospheric_moisture, shear_index
    plus: risk_score, date, latitude, longitude, basin, storm_type.

    Parameters
    ----------
    data_dir : str
        Directory containing cyclone_tracks_clean.csv,
        sea_surface_temp.csv, atmospheric_moisture.csv, wind_shear.csv.

    Raises
    ------
    ValueError
        If any required feature column is missing after the merges.
    """
    p = Path(data_dir)

    # ── Load raw tables ────────────────────────────────────────────────────
    tracks = pd.read_csv(p / "cyclone_tracks_clean.csv", parse_dates=["date"])
    sst = pd.read_csv(p / "sea_surface_temp.csv")
    moist = pd.read_csv(p / "atmospheric_moisture.csv", parse_dates=["date"])
    shear = pd.read_csv(p / "wind_shear.csv")

    # ── Normalise column names to lowercase ───────────────────────────────
    # (loop variable renamed from `df` to avoid shadowing the spine below)
    for tbl in (tracks, sst, moist, shear):
        tbl.columns = tbl.columns.str.lower().str.strip()

    # ── Base: use tracks as spine ──────────────────────────────────────────
    # The tracks CSV already uses the model schema names, so no rename is
    # needed (the original code carried an identity rename — dead code).
    df = tracks.copy()

    # ── Merge SST (spatial nearest) ───────────────────────────────────────
    # SST is matched on spatial proximity only (SST changes slowly); any
    # time index in the SST table is deliberately ignored.
    df = _nearest_merge(df, sst, aux_cols=["sea_surface_temp_c"])

    # ── Merge atmospheric moisture (spatial nearest) ───────────────────────
    df = _nearest_merge(df, moist, aux_cols=["atmospheric_moisture"])

    # ── Merge shear (spatial nearest) ─────────────────────────────────────
    df = _nearest_merge(df, shear, aux_cols=["shear_index"])

    # ── Validate all features present ─────────────────────────────────────
    required = [
        "wind_speed_kmh", "central_pressure_hpa", "sea_surface_temp_c",
        "track_curvature", "distance_to_coast_km", "storm_surge_potential",
        "atmospheric_moisture", "shear_index",
    ]
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"Missing columns after merge: {missing}")

    # ── Synthesise risk label ──────────────────────────────────────────────
    # Combines intensity + proximity + surge into a [0,1] score.
    # Replace this with ground-truth labels if you have them.
    df["risk_score"] = _compute_risk_label(df)

    # ── Drop rows with NaN in any model feature ───────────────────────────
    df = df.dropna(subset=required + ["risk_score"])

    return df[required + ["risk_score", "date", "latitude", "longitude",
                          "basin", "storm_type"]].reset_index(drop=True)
99
+
100
+
101
+ def _compute_risk_label(df: pd.DataFrame) -> pd.Series:
102
+ """
103
+ Synthetic risk score [0, 1] derived from physical intensity signals.
104
+ Logic:
105
+ - High wind + low pressure β†’ high base risk
106
+ - Close to coast β†’ landfall amplification
107
+ - High storm surge β†’ direct impact multiplier
108
+ - SST > 28Β°C β†’ intensification bonus
109
+ """
110
+ # Normalise each driver to [0, 1]
111
+ wind_norm = np.clip(df["wind_speed_kmh"] / 350.0, 0, 1)
112
+ pressure_norm = np.clip(
113
+ (1013 - df["central_pressure_hpa"]) / (1013 - 870), 0, 1
114
+ )
115
+ coast_norm = np.clip(1 - df["distance_to_coast_km"] / 500.0, 0, 1)
116
+ surge_norm = np.clip(df["storm_surge_potential"], 0, 1)
117
+ sst_bonus = np.clip((df["sea_surface_temp_c"] - 26) / 9, 0, 1)
118
+
119
+ score = (
120
+ 0.30 * wind_norm +
121
+ 0.25 * pressure_norm +
122
+ 0.20 * coast_norm +
123
+ 0.15 * surge_norm +
124
+ 0.10 * sst_bonus
125
+ )
126
+
127
+ # Add small noise to prevent the model from memorising exact thresholds
128
+ noise = np.random.normal(0, 0.02, len(score))
129
+ return np.clip(score + noise, 0.0, 1.0)