File size: 5,665 Bytes
7042517
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
"""
CycloneDataLoader
=================
Joins the four cyclone CSVs into a single training-ready DataFrame.
Handles spatial nearest-neighbour merging for SST, moisture, and shear.
"""

import pandas as pd
import numpy as np
from scipy.spatial import cKDTree
from pathlib import Path


def _nearest_merge(
    base: pd.DataFrame,
    aux: pd.DataFrame,
    aux_cols: list[str],
    aux_lat: str = "latitude",
    aux_lon: str = "longitude",
) -> pd.DataFrame:
    """
    For each row in `base`, find the spatially nearest row in `aux`
    and attach aux_cols. Uses a cKDTree for efficiency.
    """
    tree = cKDTree(aux[[aux_lat, aux_lon]].values)
    dists, idxs = tree.query(base[["latitude", "longitude"]].values)
    for col in aux_cols:
        base[col] = aux[col].iloc[idxs].values
    return base


def load_cyclone_training_data(data_dir: str = "data") -> pd.DataFrame:
    """
    Join the four cyclone CSVs into a single training-ready DataFrame.

    Returns a clean DataFrame with all 8 cyclone model features + a
    synthetic `risk_score` label derived from intensity/landfall signals.

    Columns produced (matching CYCLONE_FEATURES exactly):
      wind_speed_kmh, central_pressure_hpa, sea_surface_temp_c,
      track_curvature, distance_to_coast_km, storm_surge_potential,
      atmospheric_moisture, shear_index
    plus: risk_score, date, latitude, longitude, basin, storm_type.

    Parameters
    ----------
    data_dir : directory containing the four CSV files.

    Raises
    ------
    ValueError
        If any required feature column is missing after the merges.
    """
    p = Path(data_dir)

    # ── Load raw tables ────────────────────────────────────────────────────
    tracks = pd.read_csv(p / "cyclone_tracks_clean.csv", parse_dates=["date"])
    sst    = pd.read_csv(p / "sea_surface_temp.csv")
    moist  = pd.read_csv(p / "atmospheric_moisture.csv", parse_dates=["date"])
    shear  = pd.read_csv(p / "wind_shear.csv")

    # ── Normalise column names to lowercase ───────────────────────────────
    for df in (tracks, sst, moist, shear):
        df.columns = df.columns.str.lower().str.strip()

    # ── Base: use tracks as spine ──────────────────────────────────────────
    # Track columns already match the model schema, so no renaming needed.
    df = tracks.copy()

    # ── Merge SST (spatial nearest) ───────────────────────────────────────
    # SST has time_index — match on spatial proximity only (SST changes slowly)
    df = _nearest_merge(df, sst, aux_cols=["sea_surface_temp_c"])

    # ── Merge atmospheric moisture (spatial nearest) ───────────────────────
    df = _nearest_merge(df, moist, aux_cols=["atmospheric_moisture"])

    # ── Merge shear (spatial nearest) ─────────────────────────────────────
    df = _nearest_merge(df, shear, aux_cols=["shear_index"])

    # ── Validate all features present ─────────────────────────────────────
    required = [
        "wind_speed_kmh", "central_pressure_hpa", "sea_surface_temp_c",
        "track_curvature", "distance_to_coast_km", "storm_surge_potential",
        "atmospheric_moisture", "shear_index",
    ]
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"Missing columns after merge: {missing}")

    # ── Synthesise risk label ──────────────────────────────────────────────
    # Combines intensity + proximity + surge into a [0,1] score.
    # Replace this with ground-truth labels if you have them.
    df["risk_score"] = _compute_risk_label(df)

    # ── Drop rows with NaN in any model feature ───────────────────────────
    df = df.dropna(subset=required + ["risk_score"])

    return df[required + ["risk_score", "date", "latitude", "longitude",
                          "basin", "storm_type"]].reset_index(drop=True)


def _compute_risk_label(df: pd.DataFrame) -> pd.Series:
    """
    Synthetic risk score [0, 1] derived from physical intensity signals.
    Logic:
      - High wind + low pressure       β†’ high base risk
      - Close to coast                 β†’ landfall amplification
      - High storm surge               β†’ direct impact multiplier
      - SST > 28Β°C                     β†’ intensification bonus
    """
    # Normalise each driver to [0, 1]
    wind_norm     = np.clip(df["wind_speed_kmh"] / 350.0, 0, 1)
    pressure_norm = np.clip(
        (1013 - df["central_pressure_hpa"]) / (1013 - 870), 0, 1
    )
    coast_norm    = np.clip(1 - df["distance_to_coast_km"] / 500.0, 0, 1)
    surge_norm    = np.clip(df["storm_surge_potential"], 0, 1)
    sst_bonus     = np.clip((df["sea_surface_temp_c"] - 26) / 9, 0, 1)

    score = (
        0.30 * wind_norm +
        0.25 * pressure_norm +
        0.20 * coast_norm +
        0.15 * surge_norm +
        0.10 * sst_bonus
    )

    # Add small noise to prevent the model from memorising exact thresholds
    noise = np.random.normal(0, 0.02, len(score))
    return np.clip(score + noise, 0.0, 1.0)