Spaces:
Sleeping
Sleeping
| """ | |
| CycloneDataLoader | |
| ================= | |
| Joins the four cyclone CSVs into a single training-ready DataFrame. | |
| Handles spatial nearest-neighbour merging for SST, moisture, and shear. | |
| """ | |
| import pandas as pd | |
| import numpy as np | |
| from scipy.spatial import cKDTree | |
| from pathlib import Path | |
def _nearest_merge(
    base: pd.DataFrame,
    aux: pd.DataFrame,
    aux_cols: list[str],
    aux_lat: str = "latitude",
    aux_lon: str = "longitude",
    base_lat: str = "latitude",
    base_lon: str = "longitude",
) -> pd.DataFrame:
    """
    Attach columns from the spatially nearest `aux` row to each `base` row.

    Nearness is the Euclidean distance between (lat, lon) pairs as computed
    by a cKDTree; no geodesic correction is applied.

    Parameters
    ----------
    base : DataFrame providing the rows to enrich. It is NOT modified;
        a copy carrying the extra columns is returned.
    aux : DataFrame to look values up in.
    aux_cols : names of the `aux` columns to copy onto the result.
    aux_lat, aux_lon : coordinate column names in `aux`.
    base_lat, base_lon : coordinate column names in `base`.

    Returns
    -------
    A copy of `base` with one extra column per entry of `aux_cols`.

    Raises
    ------
    ValueError
        If `aux` has no rows while `base` does, so no neighbour exists.
    """
    # Work on a copy so the caller's frame is never mutated.
    merged = base.copy()

    if len(merged) == 0:
        # Nothing to look up: just add the (empty) columns.
        for col in aux_cols:
            merged[col] = aux[col].iloc[:0].to_numpy()
        return merged

    if len(aux) == 0:
        raise ValueError(
            "Cannot perform nearest-neighbour merge: `aux` has no rows"
        )

    tree = cKDTree(aux[[aux_lat, aux_lon]].to_numpy())
    # Only the neighbour positions are needed; distances are discarded.
    _, nearest = tree.query(merged[[base_lat, base_lon]].to_numpy())

    for col in aux_cols:
        # Positional lookup + raw values so aux's index does not interfere.
        merged[col] = aux[col].iloc[nearest].to_numpy()
    return merged
def load_cyclone_training_data(data_dir: str = "data") -> pd.DataFrame:
    """
    Build the cyclone training DataFrame from the four CSVs in `data_dir`.

    Reads `cyclone_tracks_clean.csv`, `sea_surface_temp.csv`,
    `atmospheric_moisture.csv` and `wind_shear.csv`, lower-cases and strips
    their column names, and uses the tracks table as the spine. SST,
    moisture and shear values are attached by spatial nearest neighbour,
    then a synthetic `risk_score` label is added.

    Columns produced (matching CYCLONE_FEATURES exactly):
        wind_speed_kmh, central_pressure_hpa, sea_surface_temp_c,
        track_curvature, distance_to_coast_km, storm_surge_potential,
        atmospheric_moisture, shear_index
    followed by `risk_score` and the metadata columns
        date, latitude, longitude, basin, storm_type.

    Parameters
    ----------
    data_dir : directory containing the four CSV files.

    Returns
    -------
    DataFrame with a fresh 0..n-1 index; rows with NaN in any model
    feature or in `risk_score` are dropped.

    Raises
    ------
    ValueError
        If any model feature or metadata column is missing after merging.
    """
    data_path = Path(data_dir)

    # -- Load raw tables ---------------------------------------------------
    tracks = pd.read_csv(
        data_path / "cyclone_tracks_clean.csv", parse_dates=["date"]
    )
    sst = pd.read_csv(data_path / "sea_surface_temp.csv")
    moist = pd.read_csv(
        data_path / "atmospheric_moisture.csv", parse_dates=["date"]
    )
    shear = pd.read_csv(data_path / "wind_shear.csv")

    # -- Normalise column names to lowercase -------------------------------
    for table in (tracks, sst, moist, shear):
        table.columns = table.columns.str.lower().str.strip()

    # -- Base: use tracks as spine -----------------------------------------
    # The track columns already carry the model-schema names, so no rename
    # is needed. Copy so the raw `tracks` table stays untouched by merging.
    df = tracks.copy()

    # -- Merge SST (spatial nearest) ---------------------------------------
    # SST has time_index; match on spatial proximity only (SST changes slowly)
    df = _nearest_merge(df, sst, aux_cols=["sea_surface_temp_c"])

    # -- Merge atmospheric moisture (spatial nearest) ----------------------
    df = _nearest_merge(df, moist, aux_cols=["atmospheric_moisture"])

    # -- Merge shear (spatial nearest) -------------------------------------
    df = _nearest_merge(df, shear, aux_cols=["shear_index"])

    # -- Validate all output columns present -------------------------------
    required = [
        "wind_speed_kmh", "central_pressure_hpa", "sea_surface_temp_c",
        "track_curvature", "distance_to_coast_km", "storm_surge_potential",
        "atmospheric_moisture", "shear_index",
    ]
    metadata = ["date", "latitude", "longitude", "basin", "storm_type"]
    # Check metadata too: it is selected in the return below, and a missing
    # column there would otherwise surface as a bare KeyError.
    missing = [c for c in required + metadata if c not in df.columns]
    if missing:
        raise ValueError(f"Missing columns after merge: {missing}")

    # -- Synthesise risk label ---------------------------------------------
    # Combines intensity + proximity + surge into a [0,1] score.
    # Replace this with ground-truth labels if you have them.
    df["risk_score"] = _compute_risk_label(df)

    # -- Drop rows with NaN in any model feature ---------------------------
    df = df.dropna(subset=required + ["risk_score"])

    return df[required + ["risk_score"] + metadata].reset_index(drop=True)
def _compute_risk_label(
    df: pd.DataFrame,
    noise_std: float = 0.02,
    seed: "int | None" = None,
) -> pd.Series:
    """
    Synthetic risk score in [0, 1] derived from physical intensity signals.

    Each driver is scaled to [0, 1] and combined as a weighted sum:
      - wind speed, saturating at 350 km/h               (weight 0.30)
      - pressure deficit below 1013 hPa, saturating at
        870 hPa                                          (weight 0.25)
      - coastal proximity, 1 at the coast falling to 0
        at 500 km                                        (weight 0.20)
      - storm surge potential, clipped to [0, 1]         (weight 0.15)
      - SST bonus, ramping from 26 °C up to 35 °C        (weight 0.10)
    Gaussian noise is then added and the result clipped back to [0, 1].

    Parameters
    ----------
    df : DataFrame with columns `wind_speed_kmh`, `central_pressure_hpa`,
        `distance_to_coast_km`, `storm_surge_potential` and
        `sea_surface_temp_c`.
    noise_std : standard deviation of the added noise; 0 disables it.
    seed : when given, noise comes from a dedicated generator seeded with
        this value so the labels are reproducible. When None, the global
        NumPy random state is used.

    Returns
    -------
    One score per row of `df`, aligned with its index.
    """
    # Normalise each driver to [0, 1]
    wind_norm = np.clip(df["wind_speed_kmh"] / 350.0, 0, 1)
    pressure_norm = np.clip(
        (1013 - df["central_pressure_hpa"]) / (1013 - 870), 0, 1
    )
    coast_norm = np.clip(1 - df["distance_to_coast_km"] / 500.0, 0, 1)
    surge_norm = np.clip(df["storm_surge_potential"], 0, 1)
    sst_bonus = np.clip((df["sea_surface_temp_c"] - 26) / 9, 0, 1)

    score = (
        0.30 * wind_norm
        + 0.25 * pressure_norm
        + 0.20 * coast_norm
        + 0.15 * surge_norm
        + 0.10 * sst_bonus
    )

    # Add small noise to prevent the model from memorising exact thresholds
    if seed is None:
        noise = np.random.normal(0, noise_std, len(score))
    else:
        noise = np.random.default_rng(seed).normal(0, noise_std, len(score))
    return np.clip(score + noise, 0.0, 1.0)