"""
CycloneDataLoader
=================
Joins the four cyclone CSVs into a single training-ready DataFrame.
Handles spatial nearest-neighbour merging for SST, moisture, and shear.
"""

from pathlib import Path
from typing import Optional

import numpy as np
import pandas as pd
from scipy.spatial import cKDTree

# Feature columns the loader guarantees in its output, in output order.
_MODEL_FEATURES = [
    "wind_speed_kmh",
    "central_pressure_hpa",
    "sea_surface_temp_c",
    "track_curvature",
    "distance_to_coast_km",
    "storm_surge_potential",
    "atmospheric_moisture",
    "shear_index",
]

# Non-feature columns carried through from the tracks table.
_METADATA_COLUMNS = ["date", "latitude", "longitude", "basin", "storm_type"]

# Scaling constants used by the synthetic risk label.
_WIND_CEILING_KMH = 350.0        # wind speed that maps to a normalised 1.0
_PRESSURE_AMBIENT_HPA = 1013.0   # pressure that maps to 0.0 (roughly sea-level standard)
_PRESSURE_FLOOR_HPA = 870.0      # pressure that maps to 1.0
_COAST_RANGE_KM = 500.0          # beyond this distance the coast term is 0.0
_SST_BASELINE_C = 26.0           # SST at which the bonus starts
_SST_SPAN_C = 9.0                # bonus reaches 1.0 at baseline + span (35 °C)
_NOISE_STD = 0.02                # std-dev of the label jitter


def _to_unit_sphere(lat_deg, lon_deg) -> np.ndarray:
    """Convert latitude/longitude in degrees to (n, 3) unit-sphere coordinates.

    Straight-line (chord) distance between two such points grows
    monotonically with their great-circle distance, so a Euclidean
    nearest-neighbour search on the result is a true spherical
    nearest-neighbour search. Longitudes that differ by a multiple of 360°
    map to the same point, which removes the antimeridian discontinuity.
    Non-finite inputs yield non-finite rows.
    """
    lat = np.radians(np.asarray(lat_deg, dtype=float))
    lon = np.radians(np.asarray(lon_deg, dtype=float))
    cos_lat = np.cos(lat)
    return np.column_stack(
        (cos_lat * np.cos(lon), cos_lat * np.sin(lon), np.sin(lat))
    )


def _nearest_merge(
    base: pd.DataFrame,
    aux: pd.DataFrame,
    aux_cols: list[str],
    aux_lat: str = "latitude",
    aux_lon: str = "longitude",
) -> pd.DataFrame:
    """Attach `aux_cols` from the spatially nearest `aux` row to each `base` row.

    Nearness is measured on the sphere (see `_to_unit_sphere`) rather than in
    raw degrees, so neighbours across the ±180° meridian and at high
    latitudes are ranked correctly. A cKDTree keeps the lookup efficient.

    Args:
        base: Frame with "latitude" and "longitude" columns (degrees).
        aux: Frame providing `aux_cols` plus its own coordinate columns.
        aux_cols: Columns of `aux` to copy onto the result.
        aux_lat: Name of the latitude column in `aux`.
        aux_lon: Name of the longitude column in `aux`.

    Returns:
        A copy of `base` with `aux_cols` added; `base` itself is not modified.
        Rows of `base` whose coordinates are not finite receive NaN in the
        added columns.

    Raises:
        ValueError: If `aux` has no row with finite coordinates.
        KeyError: If a coordinate column or one of `aux_cols` is missing.
    """
    aux_xyz = _to_unit_sphere(aux[aux_lat], aux[aux_lon])
    aux_ok = np.isfinite(aux_xyz).all(axis=1)
    if not aux_ok.any():
        raise ValueError(
            f"Auxiliary table has no rows with finite '{aux_lat}'/'{aux_lon}' "
            f"coordinates; cannot attach {list(aux_cols)}"
        )

    base_xyz = _to_unit_sphere(base["latitude"], base["longitude"])
    base_ok = np.isfinite(base_xyz).all(axis=1)

    # -1 marks base rows that cannot be matched; reindex() turns them into NaN.
    positions = np.full(len(base), -1, dtype=np.intp)
    if base_ok.any():
        tree = cKDTree(aux_xyz[aux_ok])
        _, nearest = tree.query(base_xyz[base_ok])
        positions[base_ok] = nearest

    # Positional index 0..m-1 over the usable aux rows, matching the tree.
    source = aux.loc[aux_ok, list(aux_cols)].reset_index(drop=True)
    picked = source.reindex(positions)

    merged = base.copy()
    for col in aux_cols:
        # Assign raw arrays so values align by position, not by index label.
        merged[col] = picked[col].to_numpy()
    return merged


def load_cyclone_training_data(
    data_dir: str = "data",
    random_state: Optional[int] = None,
) -> pd.DataFrame:
    """Build the cyclone training frame from the four CSVs in `data_dir`.

    Reads ``cyclone_tracks_clean.csv``, ``sea_surface_temp.csv``,
    ``atmospheric_moisture.csv`` and ``wind_shear.csv``, lower-cases and
    strips their column names, and uses the tracks table as the spine. SST,
    moisture and shear are attached by spatial nearest neighbour. A synthetic
    ``risk_score`` label is then added and rows with NaN in any feature or
    in the label are dropped.

    Columns produced (matching CYCLONE_FEATURES exactly):
        wind_speed_kmh, central_pressure_hpa, sea_surface_temp_c,
        track_curvature, distance_to_coast_km, storm_surge_potential,
        atmospheric_moisture, shear_index
    followed by ``risk_score``, ``date``, ``latitude``, ``longitude``,
    ``basin`` and ``storm_type``.

    Args:
        data_dir: Directory containing the four CSV files.
        random_state: Optional seed for the label noise. When None the noise
            comes from NumPy's global random state, as before.

    Returns:
        DataFrame with a fresh 0..n-1 index.

    Raises:
        ValueError: If any model feature is missing after the merges, or an
            auxiliary table has no usable coordinates.
        KeyError: If a coordinate or metadata column is missing.
    """
    root = Path(data_dir)

    # ── Load raw tables ────────────────────────────────────────────────────
    tracks = pd.read_csv(root / "cyclone_tracks_clean.csv", parse_dates=["date"])
    sst = pd.read_csv(root / "sea_surface_temp.csv")
    moist = pd.read_csv(root / "atmospheric_moisture.csv", parse_dates=["date"])
    shear = pd.read_csv(root / "wind_shear.csv")

    # ── Normalise column names to lowercase ───────────────────────────────
    for table in (tracks, sst, moist, shear):
        table.columns = table.columns.str.lower().str.strip()

    # ── Merge SST (spatial nearest) ───────────────────────────────────────
    # SST has time_index — match on spatial proximity only (SST changes slowly).
    # Tracks are the spine; _nearest_merge returns a copy, so `tracks` is untouched.
    df = _nearest_merge(tracks, sst, aux_cols=["sea_surface_temp_c"])

    # ── Merge atmospheric moisture (spatial nearest) ──────────────────────
    # NOTE(review): the moisture table's date column is parsed but not used in
    # the match — confirm that spatial-only matching is intended here.
    df = _nearest_merge(df, moist, aux_cols=["atmospheric_moisture"])

    # ── Merge shear (spatial nearest) ─────────────────────────────────────
    df = _nearest_merge(df, shear, aux_cols=["shear_index"])

    # ── Validate all features present ─────────────────────────────────────
    missing = [c for c in _MODEL_FEATURES if c not in df.columns]
    if missing:
        raise ValueError(f"Missing columns after merge: {missing}")

    # ── Synthesise risk label ─────────────────────────────────────────────
    # Combines intensity + proximity + surge into a [0,1] score.
    # Replace this with ground-truth labels if you have them.
    df["risk_score"] = _compute_risk_label(df, random_state=random_state)

    # ── Drop rows with NaN in any model feature ───────────────────────────
    labelled = _MODEL_FEATURES + ["risk_score"]
    df = df.dropna(subset=labelled)

    return df[labelled + _METADATA_COLUMNS].reset_index(drop=True)


def _compute_risk_label(
    df: pd.DataFrame,
    random_state: Optional[int] = None,
) -> pd.Series:
    """Synthetic risk score in [0, 1] derived from physical intensity signals.

    Each driver is scaled to [0, 1] and combined with fixed weights:
      - wind speed / 350 km/h ......................... weight 0.30
      - pressure deficit below 1013 hPa, over 143 hPa . weight 0.25
      - coastal proximity, 1 at the coast, 0 at 500 km  weight 0.20
      - storm surge potential (clipped to [0, 1]) ..... weight 0.15
      - SST bonus, ramping from 26 °C to 35 °C ........ weight 0.10

    Gaussian noise (std 0.02) is added to prevent the model from memorising
    exact thresholds, and the result is clipped back to [0, 1].

    Args:
        df: Frame containing the five driver columns.
        random_state: Optional seed for the noise. When None the noise is
            drawn from NumPy's global random state.

    Returns:
        Series of scores aligned with `df`'s index.
    """
    wind_norm = np.clip(df["wind_speed_kmh"] / _WIND_CEILING_KMH, 0, 1)
    pressure_norm = np.clip(
        (_PRESSURE_AMBIENT_HPA - df["central_pressure_hpa"])
        / (_PRESSURE_AMBIENT_HPA - _PRESSURE_FLOOR_HPA),
        0,
        1,
    )
    coast_norm = np.clip(1 - df["distance_to_coast_km"] / _COAST_RANGE_KM, 0, 1)
    surge_norm = np.clip(df["storm_surge_potential"], 0, 1)
    sst_bonus = np.clip(
        (df["sea_surface_temp_c"] - _SST_BASELINE_C) / _SST_SPAN_C, 0, 1
    )

    score = (
        0.30 * wind_norm
        + 0.25 * pressure_norm
        + 0.20 * coast_norm
        + 0.15 * surge_norm
        + 0.10 * sst_bonus
    )

    if random_state is None:
        # Legacy path: honours any np.random.seed() the caller already set.
        noise = np.random.normal(0, _NOISE_STD, len(score))
    else:
        noise = np.random.default_rng(random_state).normal(
            0, _NOISE_STD, len(score)
        )
    return np.clip(score + noise, 0.0, 1.0)