Spaces:
Sleeping
Sleeping
Create data_loader.py
Browse files- src/data_loader.py +129 -0
src/data_loader.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
CycloneDataLoader
|
| 3 |
+
=================
|
| 4 |
+
Joins the four cyclone CSVs into a single training-ready DataFrame.
|
| 5 |
+
Handles spatial nearest-neighbour merging for SST, moisture, and shear.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import pandas as pd
|
| 9 |
+
import numpy as np
|
| 10 |
+
from scipy.spatial import cKDTree
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def _nearest_merge(
|
| 15 |
+
base: pd.DataFrame,
|
| 16 |
+
aux: pd.DataFrame,
|
| 17 |
+
aux_cols: list[str],
|
| 18 |
+
aux_lat: str = "latitude",
|
| 19 |
+
aux_lon: str = "longitude",
|
| 20 |
+
) -> pd.DataFrame:
|
| 21 |
+
"""
|
| 22 |
+
For each row in `base`, find the spatially nearest row in `aux`
|
| 23 |
+
and attach aux_cols. Uses a cKDTree for efficiency.
|
| 24 |
+
"""
|
| 25 |
+
tree = cKDTree(aux[[aux_lat, aux_lon]].values)
|
| 26 |
+
dists, idxs = tree.query(base[["latitude", "longitude"]].values)
|
| 27 |
+
for col in aux_cols:
|
| 28 |
+
base[col] = aux[col].iloc[idxs].values
|
| 29 |
+
return base
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def load_cyclone_training_data(data_dir: str = "data") -> pd.DataFrame:
    """
    Build a training-ready DataFrame from the four cyclone CSVs.

    Loads the track, SST, moisture, and shear tables from `data_dir`,
    joins the auxiliary tables onto the track spine by spatial nearest
    neighbour, and synthesises a `risk_score` label in [0, 1].

    Columns produced (matching CYCLONE_FEATURES exactly):
        wind_speed_kmh, central_pressure_hpa, sea_surface_temp_c,
        track_curvature, distance_to_coast_km, storm_surge_potential,
        atmospheric_moisture, shear_index
    plus: risk_score, date, latitude, longitude, basin, storm_type.

    Raises
    ------
    ValueError
        If any required model feature is missing after the merges.
    """
    p = Path(data_dir)

    # -- Load raw tables -------------------------------------------------
    tracks = pd.read_csv(p / "cyclone_tracks_clean.csv", parse_dates=["date"])
    sst = pd.read_csv(p / "sea_surface_temp.csv")
    moist = pd.read_csv(p / "atmospheric_moisture.csv", parse_dates=["date"])
    shear = pd.read_csv(p / "wind_shear.csv")

    # -- Normalise column names to lowercase -----------------------------
    for df in (tracks, sst, moist, shear):
        df.columns = df.columns.str.lower().str.strip()

    # -- Base: use tracks as spine ---------------------------------------
    # Track columns already match the model schema; the previous
    # identity rename (every key mapped to itself) was a no-op and
    # has been removed.
    df = tracks.copy()

    # -- Merge SST (spatial nearest) -------------------------------------
    # SST changes slowly, so match on spatial proximity only.
    df = _nearest_merge(df, sst, aux_cols=["sea_surface_temp_c"])

    # -- Merge atmospheric moisture (spatial nearest) --------------------
    df = _nearest_merge(df, moist, aux_cols=["atmospheric_moisture"])

    # -- Merge shear (spatial nearest) -----------------------------------
    df = _nearest_merge(df, shear, aux_cols=["shear_index"])

    # -- Validate all features present -----------------------------------
    required = [
        "wind_speed_kmh", "central_pressure_hpa", "sea_surface_temp_c",
        "track_curvature", "distance_to_coast_km", "storm_surge_potential",
        "atmospheric_moisture", "shear_index",
    ]
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"Missing columns after merge: {missing}")

    # -- Synthesise risk label -------------------------------------------
    # Combines intensity + proximity + surge into a [0, 1] score.
    # Replace this with ground-truth labels if you have them.
    df["risk_score"] = _compute_risk_label(df)

    # -- Drop rows with NaN in any model feature -------------------------
    df = df.dropna(subset=required + ["risk_score"])

    # NOTE(review): assumes tracks also carries date/latitude/longitude/
    # basin/storm_type columns — confirm against the CSV schema.
    return df[required + ["risk_score", "date", "latitude", "longitude",
                          "basin", "storm_type"]].reset_index(drop=True)
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def _compute_risk_label(df: pd.DataFrame) -> pd.Series:
|
| 102 |
+
"""
|
| 103 |
+
Synthetic risk score [0, 1] derived from physical intensity signals.
|
| 104 |
+
Logic:
|
| 105 |
+
- High wind + low pressure β high base risk
|
| 106 |
+
- Close to coast β landfall amplification
|
| 107 |
+
- High storm surge β direct impact multiplier
|
| 108 |
+
- SST > 28Β°C β intensification bonus
|
| 109 |
+
"""
|
| 110 |
+
# Normalise each driver to [0, 1]
|
| 111 |
+
wind_norm = np.clip(df["wind_speed_kmh"] / 350.0, 0, 1)
|
| 112 |
+
pressure_norm = np.clip(
|
| 113 |
+
(1013 - df["central_pressure_hpa"]) / (1013 - 870), 0, 1
|
| 114 |
+
)
|
| 115 |
+
coast_norm = np.clip(1 - df["distance_to_coast_km"] / 500.0, 0, 1)
|
| 116 |
+
surge_norm = np.clip(df["storm_surge_potential"], 0, 1)
|
| 117 |
+
sst_bonus = np.clip((df["sea_surface_temp_c"] - 26) / 9, 0, 1)
|
| 118 |
+
|
| 119 |
+
score = (
|
| 120 |
+
0.30 * wind_norm +
|
| 121 |
+
0.25 * pressure_norm +
|
| 122 |
+
0.20 * coast_norm +
|
| 123 |
+
0.15 * surge_norm +
|
| 124 |
+
0.10 * sst_bonus
|
| 125 |
+
)
|
| 126 |
+
|
| 127 |
+
# Add small noise to prevent the model from memorising exact thresholds
|
| 128 |
+
noise = np.random.normal(0, 0.02, len(score))
|
| 129 |
+
return np.clip(score + noise, 0.0, 1.0)
|