clarindasusan committed on
Commit
7042517
·
verified ·
1 Parent(s): 10c13cf

Create data_loader.py

Browse files
Files changed (1) hide show
  1. src/data_loader.py +129 -0
src/data_loader.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ CycloneDataLoader
3
+ =================
4
+ Joins the four cyclone CSVs into a single training-ready DataFrame.
5
+ Handles spatial nearest-neighbour merging for SST, moisture, and shear.
6
+ """
7
+
8
+ import pandas as pd
9
+ import numpy as np
10
+ from scipy.spatial import cKDTree
11
+ from pathlib import Path
12
+
13
+
14
+ def _nearest_merge(
15
+ base: pd.DataFrame,
16
+ aux: pd.DataFrame,
17
+ aux_cols: list[str],
18
+ aux_lat: str = "latitude",
19
+ aux_lon: str = "longitude",
20
+ ) -> pd.DataFrame:
21
+ """
22
+ For each row in `base`, find the spatially nearest row in `aux`
23
+ and attach aux_cols. Uses a cKDTree for efficiency.
24
+ """
25
+ tree = cKDTree(aux[[aux_lat, aux_lon]].values)
26
+ dists, idxs = tree.query(base[["latitude", "longitude"]].values)
27
+ for col in aux_cols:
28
+ base[col] = aux[col].iloc[idxs].values
29
+ return base
30
+
31
+
32
def load_cyclone_training_data(data_dir: str = "data") -> pd.DataFrame:
    """
    Join the four cyclone CSVs into a single training-ready DataFrame.

    Returns a clean DataFrame with all 8 cyclone model features plus a
    synthetic `risk_score` label derived from intensity/landfall signals.

    Columns produced (matching CYCLONE_FEATURES exactly):
        wind_speed_kmh, central_pressure_hpa, sea_surface_temp_c,
        track_curvature, distance_to_coast_km, storm_surge_potential,
        atmospheric_moisture, shear_index
    plus: risk_score, date, latitude, longitude, basin, storm_type.

    Parameters
    ----------
    data_dir : str
        Directory containing cyclone_tracks_clean.csv,
        sea_surface_temp.csv, atmospheric_moisture.csv, wind_shear.csv.

    Raises
    ------
    ValueError
        If any required feature column is missing after the merges.
    """
    p = Path(data_dir)

    # ── Load raw tables ────────────────────────────────────────────────────
    tracks = pd.read_csv(p / "cyclone_tracks_clean.csv", parse_dates=["date"])
    sst = pd.read_csv(p / "sea_surface_temp.csv")
    moist = pd.read_csv(p / "atmospheric_moisture.csv", parse_dates=["date"])
    shear = pd.read_csv(p / "wind_shear.csv")

    # ── Normalise column names to lowercase ───────────────────────────────
    # (loop variable renamed from `df` to avoid shadowing the spine below)
    for tbl in (tracks, sst, moist, shear):
        tbl.columns = tbl.columns.str.lower().str.strip()

    # ── Base: use tracks as spine ──────────────────────────────────────────
    # The tracks CSV already uses the model schema names, so no rename is
    # needed (the original code carried an identity rename — dead code).
    df = tracks.copy()

    # ── Merge SST (spatial nearest) ───────────────────────────────────────
    # SST is matched on spatial proximity only (SST changes slowly); any
    # time index in the SST table is deliberately ignored.
    df = _nearest_merge(df, sst, aux_cols=["sea_surface_temp_c"])

    # ── Merge atmospheric moisture (spatial nearest) ───────────────────────
    df = _nearest_merge(df, moist, aux_cols=["atmospheric_moisture"])

    # ── Merge shear (spatial nearest) ─────────────────────────────────────
    df = _nearest_merge(df, shear, aux_cols=["shear_index"])

    # ── Validate all features present ─────────────────────────────────────
    required = [
        "wind_speed_kmh", "central_pressure_hpa", "sea_surface_temp_c",
        "track_curvature", "distance_to_coast_km", "storm_surge_potential",
        "atmospheric_moisture", "shear_index",
    ]
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"Missing columns after merge: {missing}")

    # ── Synthesise risk label ──────────────────────────────────────────────
    # Combines intensity + proximity + surge into a [0,1] score.
    # Replace this with ground-truth labels if you have them.
    df["risk_score"] = _compute_risk_label(df)

    # ── Drop rows with NaN in any model feature ───────────────────────────
    df = df.dropna(subset=required + ["risk_score"])

    return df[required + ["risk_score", "date", "latitude", "longitude",
                          "basin", "storm_type"]].reset_index(drop=True)
99
+
100
+
101
+ def _compute_risk_label(df: pd.DataFrame) -> pd.Series:
102
+ """
103
+ Synthetic risk score [0, 1] derived from physical intensity signals.
104
+ Logic:
105
+ - High wind + low pressure β†’ high base risk
106
+ - Close to coast β†’ landfall amplification
107
+ - High storm surge β†’ direct impact multiplier
108
+ - SST > 28Β°C β†’ intensification bonus
109
+ """
110
+ # Normalise each driver to [0, 1]
111
+ wind_norm = np.clip(df["wind_speed_kmh"] / 350.0, 0, 1)
112
+ pressure_norm = np.clip(
113
+ (1013 - df["central_pressure_hpa"]) / (1013 - 870), 0, 1
114
+ )
115
+ coast_norm = np.clip(1 - df["distance_to_coast_km"] / 500.0, 0, 1)
116
+ surge_norm = np.clip(df["storm_surge_potential"], 0, 1)
117
+ sst_bonus = np.clip((df["sea_surface_temp_c"] - 26) / 9, 0, 1)
118
+
119
+ score = (
120
+ 0.30 * wind_norm +
121
+ 0.25 * pressure_norm +
122
+ 0.20 * coast_norm +
123
+ 0.15 * surge_norm +
124
+ 0.10 * sst_bonus
125
+ )
126
+
127
+ # Add small noise to prevent the model from memorising exact thresholds
128
+ noise = np.random.normal(0, 0.02, len(score))
129
+ return np.clip(score + noise, 0.0, 1.0)