"""
CycloneDataLoader
=================

Joins the four cyclone CSVs into a single training-ready DataFrame.
Handles spatial nearest-neighbour merging for SST, moisture, and shear.
"""
import pandas as pd
import numpy as np
from scipy.spatial import cKDTree
from pathlib import Path
def _nearest_merge(
base: pd.DataFrame,
aux: pd.DataFrame,
aux_cols: list[str],
aux_lat: str = "latitude",
aux_lon: str = "longitude",
) -> pd.DataFrame:
"""
For each row in `base`, find the spatially nearest row in `aux`
and attach aux_cols. Uses a cKDTree for efficiency.
"""
tree = cKDTree(aux[[aux_lat, aux_lon]].values)
dists, idxs = tree.query(base[["latitude", "longitude"]].values)
for col in aux_cols:
base[col] = aux[col].iloc[idxs].values
return base
def load_cyclone_training_data(data_dir: str = "data") -> pd.DataFrame:
    """
    Build a training-ready DataFrame with all 8 cyclone model features plus
    a synthetic `risk_score` label derived from intensity/landfall signals.

    Feature columns produced (matching CYCLONE_FEATURES exactly):
        wind_speed_kmh, central_pressure_hpa, sea_surface_temp_c,
        track_curvature, distance_to_coast_km, storm_surge_potential,
        atmospheric_moisture, shear_index

    Parameters
    ----------
    data_dir : directory containing the four cyclone CSV files.

    Returns
    -------
    DataFrame with the 8 features, `risk_score`, and the bookkeeping columns
    date / latitude / longitude / basin / storm_type, NaN rows dropped.

    Raises
    ------
    ValueError : if any required feature column is missing after the merges.
    """
    p = Path(data_dir)

    # -- Load raw tables ----------------------------------------------------
    tracks = pd.read_csv(p / "cyclone_tracks_clean.csv", parse_dates=["date"])
    sst = pd.read_csv(p / "sea_surface_temp.csv")
    moist = pd.read_csv(p / "atmospheric_moisture.csv", parse_dates=["date"])
    shear = pd.read_csv(p / "wind_shear.csv")

    # -- Normalise column names to lowercase --------------------------------
    for df in (tracks, sst, moist, shear):
        df.columns = df.columns.str.lower().str.strip()

    # -- Base: use tracks as the spine --------------------------------------
    # The tracks CSV already uses the model schema names, so no renaming is
    # needed (a previous identity-rename step here was a no-op and was removed).
    df = tracks.copy()

    # -- Merge SST (spatial nearest) ----------------------------------------
    # SST is matched on spatial proximity only, since it changes slowly over
    # the timescales covered here.
    df = _nearest_merge(df, sst, aux_cols=["sea_surface_temp_c"])

    # -- Merge atmospheric moisture (spatial nearest) -----------------------
    df = _nearest_merge(df, moist, aux_cols=["atmospheric_moisture"])

    # -- Merge shear (spatial nearest) --------------------------------------
    df = _nearest_merge(df, shear, aux_cols=["shear_index"])

    # -- Validate all features present --------------------------------------
    required = [
        "wind_speed_kmh", "central_pressure_hpa", "sea_surface_temp_c",
        "track_curvature", "distance_to_coast_km", "storm_surge_potential",
        "atmospheric_moisture", "shear_index",
    ]
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"Missing columns after merge: {missing}")

    # -- Synthesise risk label ----------------------------------------------
    # Combines intensity + proximity + surge into a [0, 1] score.
    # Replace this with ground-truth labels if you have them.
    df["risk_score"] = _compute_risk_label(df)

    # -- Drop rows with NaN in any model feature ----------------------------
    df = df.dropna(subset=required + ["risk_score"])

    # NOTE(review): assumes the tracks CSV also carries basin / storm_type —
    # a KeyError here means the upstream schema changed.
    return df[required + ["risk_score", "date", "latitude", "longitude",
                          "basin", "storm_type"]].reset_index(drop=True)
def _compute_risk_label(df: pd.DataFrame) -> pd.Series:
"""
Synthetic risk score [0, 1] derived from physical intensity signals.
Logic:
- High wind + low pressure β high base risk
- Close to coast β landfall amplification
- High storm surge β direct impact multiplier
- SST > 28Β°C β intensification bonus
"""
# Normalise each driver to [0, 1]
wind_norm = np.clip(df["wind_speed_kmh"] / 350.0, 0, 1)
pressure_norm = np.clip(
(1013 - df["central_pressure_hpa"]) / (1013 - 870), 0, 1
)
coast_norm = np.clip(1 - df["distance_to_coast_km"] / 500.0, 0, 1)
surge_norm = np.clip(df["storm_surge_potential"], 0, 1)
sst_bonus = np.clip((df["sea_surface_temp_c"] - 26) / 9, 0, 1)
score = (
0.30 * wind_norm +
0.25 * pressure_norm +
0.20 * coast_norm +
0.15 * surge_norm +
0.10 * sst_bonus
)
# Add small noise to prevent the model from memorising exact thresholds
noise = np.random.normal(0, 0.02, len(score))
return np.clip(score + noise, 0.0, 1.0) |