"""
train_model.py
==============
Trains FuzzyNeuralNetwork models for all four disaster types.

Usage:
    python train_model.py                    # Train all
    python train_model.py --disaster flood   # Train one
    python train_model.py --disaster flood --epochs 300

Data Strategy:
  Each generate_*_data() function loads real CSVs from data/, spatially joins
  them, and builds a ground-truth risk label that follows the domain logic
  (e.g., high rainfall + low elevation + poor drainage -> flood risk). Where a
  dataset has no usable severity column, the label is derived as a weighted
  blend of physically-motivated features.

  To swap in a different data source:
    Replace the generate_*_data() functions with your own data loaders.
    The rest of the training pipeline stays identical.
"""
import os

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.path.join(BASE_DIR, "data")

import torch
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
import argparse
from typing import Optional
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, mean_absolute_error
from scipy.spatial import cKDTree
from src.fuzzy_neural_network import FuzzyNeuralNetwork, FNNTrainer, save_model
from src.disaster_predictors import (
    FLOOD_FEATURES, CYCLONE_FEATURES, LANDSLIDE_FEATURES, EARTHQUAKE_FEATURES,
    FEATURE_RANGES,
)

MODEL_DIR = "models"
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)


# ============================================================================
# DATA GENERATORS (real CSV loaders)
# ============================================================================
# Each function returns (X: np.ndarray, y: np.ndarray)
# X shape: (n_samples, n_features), already normalized to [0, 1]
# y shape: (n_samples,), continuous risk score in [0, 1]
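
# A minimal sketch of a drop-in replacement loader, per the module docstring.
# The file name "my_flood_labels.csv" and its "severity" column are
# hypothetical; any function returning this (X, y) contract can be registered
# in DATA_GENERATORS below in place of a generate_*_data() function.
def example_custom_flood_loader(n: int = 0):
    df = pd.read_csv(os.path.join(DATA_DIR, "my_flood_labels.csv"))  # hypothetical file
    X = MinMaxScaler().fit_transform(df[list(FLOOD_FEATURES)])
    y = MinMaxScaler().fit_transform(df[["severity"]]).flatten()     # hypothetical column
    return X.astype(np.float32), y.astype(np.float32)
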

def generate_flood_data(n: int = 5000):
    """
    Loads and spatially joins the flood CSVs (rainfall_clean, flood_history_clean,
    soil_moisture, drainage_capacity, river_network, elevation) and returns
    X (normalized features) and y (normalized severity_score).
    `n` is unused; kept for a uniform generator signature.
    """

    rainfall = pd.read_csv(os.path.join(DATA_DIR, "rainfall_clean.csv"))
    flood_hist = pd.read_csv(os.path.join(DATA_DIR, "flood_history_clean.csv"))
    soil = pd.read_csv(os.path.join(DATA_DIR, "soil_moisture.csv"))
    drainage = pd.read_csv(os.path.join(DATA_DIR, "drainage_capacity.csv"))
    rivers = pd.read_csv(os.path.join(DATA_DIR, "river_network.csv"))
    elevation = pd.read_csv(os.path.join(DATA_DIR, "elevation.csv"))

    # ==============================
    # Prepare flood labels
    # ==============================
    # Ensure proper integer formatting
    flood_hist["year"] = flood_hist["year"].astype(int)
    flood_hist["month"] = flood_hist["month"].astype(int)
    
    flood_hist["date"] = pd.to_datetime(
        dict(
            year=flood_hist["year"],
            month=flood_hist["month"],
            day=1
        )
    )

    rainfall["date"] = pd.to_datetime(rainfall["date"])
    soil["date"] = pd.to_datetime(soil["date"])

    # Aggregate rainfall & soil monthly
    rainfall_monthly = rainfall.groupby(
        [rainfall["date"].dt.to_period("M"), "latitude", "longitude"]
    )["rainfall_mm"].mean().reset_index()

    rainfall_monthly["date"] = rainfall_monthly["date"].dt.to_timestamp()

    soil_monthly = soil.groupby(
        [soil["date"].dt.to_period("M"), "latitude", "longitude"]
    )["soil_saturation_pct"].mean().reset_index()

    soil_monthly["date"] = soil_monthly["date"].dt.to_timestamp()

    # ==============================
    # Spatial Nearest Join Function
    # ==============================
    def spatial_join(base_df, join_df, features):
        """Return `features` from join_df's nearest point for each base row."""
        nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree')
        nbrs.fit(join_df[["latitude", "longitude"]])
        distances, indices = nbrs.kneighbors(base_df[["latitude", "longitude"]])
        joined = join_df.iloc[indices.flatten()][features].reset_index(drop=True)
        return joined

    # ==============================
    # Align all features to flood history
    # ==============================
    base = flood_hist.copy()

    # Rainfall
    rain_features = spatial_join(base, rainfall_monthly,
                                 ["rainfall_mm"])
    base["rainfall_mm"] = rain_features["rainfall_mm"]

    # Soil moisture
    soil_features = spatial_join(base, soil_monthly,
                                 ["soil_saturation_pct"])
    base["soil_saturation_pct"] = soil_features["soil_saturation_pct"]

    # Drainage
    drainage_features = spatial_join(base, drainage,
                                     ["drainage_capacity_index"])
    base["drainage_capacity_index"] = drainage_features["drainage_capacity_index"]

    # Elevation
    elevation_features = spatial_join(base, elevation,
                                      ["elevation_m", 
                                       "flow_accumulation", "twi"])
    for col in elevation_features.columns:
        base[col] = elevation_features[col]

    # Distance to nearest river (in degrees of lat/lon; the unit washes out
    # because MinMaxScaler normalizes this column below)
    river_coords = rivers[["latitude", "longitude"]]
    nbrs = NearestNeighbors(n_neighbors=1).fit(river_coords)
    dist, _ = nbrs.kneighbors(base[["latitude", "longitude"]])
    base["dist_river"] = dist.flatten()  # kneighbors returns shape (n, 1)

    # ==============================
    # Feature Selection
    # ==============================
    features = [
        "rainfall_mm",
        "elevation_m",
        "soil_saturation_pct",
        "dist_river",
        "drainage_capacity_index",
        "flow_accumulation",
        "twi",
    ]
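    # Caution: this column order must match FLOOD_FEATURES. The landslide and
    # earthquake loaders assert this explicitly; the flood loader does not.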

    base = base.dropna(subset=features + ["severity_score"])

    # ==============================
    # Normalize
    # ==============================
    scaler = MinMaxScaler()
    X = scaler.fit_transform(base[features])

    # Normalize target
    y = MinMaxScaler().fit_transform(
        base[["severity_score"]]
    ).flatten()

    return X.astype(np.float32), y.astype(np.float32)



# ── Cyclone data loader ───────────────────────────────────────────────────

def generate_cyclone_data(n: int = 3000):
    """
    Loads and joins the four cyclone CSVs:
      - cyclone_tracks_clean.csv  (spine)
      - sea_surface_temp.csv      (spatial join)
      - atmospheric_moisture.csv  (spatial join)
      - wind_shear.csv            (spatial join)
    Returns X (normalized), y (risk_score) matching CYCLONE_FEATURES order.
    """

    def nearest_merge(base_df, aux_df, cols):
        tree = cKDTree(aux_df[["latitude", "longitude"]].values)
        _, idxs = tree.query(base_df[["latitude", "longitude"]].values)
        base_df = base_df.copy() 
        for col in cols:
            base_df[col] = aux_df[col].iloc[idxs].values
        return base_df

    # ── Load ──────────────────────────────────────────────────────────────
    tracks = pd.read_csv(os.path.join(DATA_DIR, "cyclone_tracks_clean.csv"))
    sst    = pd.read_csv(os.path.join(DATA_DIR, "sea_surface_temp.csv"))
    moist  = pd.read_csv(os.path.join(DATA_DIR, "atmospheric_moisture.csv"))
    shear  = pd.read_csv(os.path.join(DATA_DIR, "wind_shear.csv"))

    # Normalise column names
    for df in (tracks, sst, moist, shear):
        df.columns = df.columns.str.lower().str.strip()

    base = tracks.copy()

    # ── Spatial joins ─────────────────────────────────────────────────────
    base = nearest_merge(base, sst,   ["sea_surface_temp_c"])
    base = nearest_merge(base, moist, ["atmospheric_moisture"])
    base = nearest_merge(base, shear, ["shear_index"])

    # ── Validate required columns present ─────────────────────────────────
    required = [
        "wind_speed_kmh", "central_pressure_hpa", "sea_surface_temp_c",
        "track_curvature", "distance_to_coast_km", "storm_surge_potential",
        "atmospheric_moisture", "shear_index",
    ]
    missing = [c for c in required if c not in base.columns]
    if missing:
        raise ValueError(
            f"Cyclone data missing columns after join: {missing}\n"
            f"Available columns: {list(base.columns)}"
        )

    base = base.dropna(subset=required)

    # ── Build risk label ───────────────────────────────────────────────────
    # Use severity_score if it exists in tracks, otherwise derive it
    if "severity_score" in base.columns and base["severity_score"].notna().sum() > 0:
        base["risk_score"] = MinMaxScaler().fit_transform(
            base[["severity_score"]]
        ).flatten()
    else:
        wind_norm     = np.clip(base["wind_speed_kmh"] / 350.0, 0, 1)
        pressure_norm = np.clip(
            (1013 - base["central_pressure_hpa"]) / (1013 - 870), 0, 1
        )
        coast_norm    = np.clip(1 - base["distance_to_coast_km"] / 500.0, 0, 1)
        surge_norm    = np.clip(base["storm_surge_potential"], 0, 1)
        sst_bonus     = np.clip((base["sea_surface_temp_c"] - 26) / 9, 0, 1)
        shear_penalty = np.clip(base["shear_index"], 0, 1)

        base["risk_score"] = np.clip(
            0.30 * wind_norm +
            0.25 * pressure_norm +
            0.20 * coast_norm +
            0.15 * surge_norm +
            0.10 * sst_bonus -
            0.10 * shear_penalty +          # high shear weakens cyclones
            np.random.normal(0, 0.02, len(base)),
            0.0, 1.0
        )
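        # Note: the positive weights above sum to 1.0; shear subtracts up to
        # 0.10, and the Gaussian term adds small label noise for regularisation.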

    # ── Normalise features (mirrors FEATURE_RANGES in disaster_predictors) ─
    X = np.zeros((len(base), len(required)), dtype=np.float32)
    for i, feat in enumerate(required):
        lo, hi = FEATURE_RANGES[feat]
        X[:, i] = np.clip(
            (base[feat].values - lo) / (hi - lo + 1e-8), 0.0, 1.0
        )

    y = base["risk_score"].values.astype(np.float32)
    return X, y

def generate_landslide_data(n: int = 4000):
    """
    Loads and joins the landslide CSVs:
      - Global_Landslide_Catalog_Export_rows.csv  (spine)
      - vegetation_ndvi_aggregated.csv            (spatial join)
      - fault_lines.csv                           (spatial join)
      - elevation.csv                             (spatial join)
      - rainfall_clean.csv                        (spatial join)
    Returns X (normalized) and y (risk_score) matching LANDSLIDE_FEATURES order.
    `n` is unused; kept for a uniform generator signature.
    """
    print("[Landslide] Using REAL data loader")

    def nearest_merge(base_df, aux_df, cols,
                      base_lat="latitude", base_lon="longitude",
                      aux_lat="latitude",  aux_lon="longitude"):
        if aux_lat not in aux_df.columns or aux_lon not in aux_df.columns:
            raise ValueError(
                f"nearest_merge: aux_df missing lat/lon. Has: {list(aux_df.columns)}"
            )
        tree = cKDTree(aux_df[[aux_lat, aux_lon]].values)
        _, idxs = tree.query(base_df[[base_lat, base_lon]].values)
        base_df = base_df.copy()
        for col in cols:
            if col not in aux_df.columns:
                raise ValueError(
                    f"nearest_merge: '{col}' not in aux_df. Has: {list(aux_df.columns)}"
                )
            base_df[col] = aux_df[col].iloc[idxs].values
        return base_df

    # ── Load ──────────────────────────────────────────────────────────────
    print("[Landslide] Loading CSVs...")
    catalog = pd.read_csv(os.path.join(DATA_DIR, "Global_Landslide_Catalog_Export_rows.csv"))
    veg     = pd.read_csv(os.path.join(DATA_DIR, "vegetation_ndvi_aggregated.csv"))
    faults  = pd.read_csv(os.path.join(DATA_DIR, "fault_lines.csv"))
    elev    = pd.read_csv(os.path.join(DATA_DIR, "elevation.csv"))
    rain    = pd.read_csv(os.path.join(DATA_DIR, "rainfall_clean.csv"))

    for df in (catalog, veg, faults, elev, rain):
        df.columns = df.columns.str.lower().str.strip()

    print(f"[Landslide] Catalog: {len(catalog)} rows, cols: {list(catalog.columns)}")
    print(f"[Landslide] Veg cols: {list(veg.columns)}")
    print(f"[Landslide] Fault cols: {list(faults.columns)}")
    print(f"[Landslide] Elev cols: {list(elev.columns)}")
    print(f"[Landslide] Rain cols: {list(rain.columns)}")

    # ── Clean catalog spine ───────────────────────────────────────────────
    catalog = catalog.dropna(subset=["latitude", "longitude"])
    catalog["event_date"] = pd.to_datetime(
        catalog["event_date"], errors="coerce"
    )
    catalog = catalog.dropna(subset=["event_date"])
    print(f"[Landslide] After date clean: {len(catalog)} rows")

    # ── historical_landslide_freq ─────────────────────────────────────────
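    # Bin events on a 0.5 degree grid, count events per cell, then normalise
    # by the max count so each event gets a historical frequency in [0, 1].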
    catalog["lat_bin"] = (catalog["latitude"]  / 0.5).round() * 0.5
    catalog["lon_bin"] = (catalog["longitude"] / 0.5).round() * 0.5
    freq_map = (
        catalog.groupby(["lat_bin", "lon_bin"])
        .size().reset_index(name="event_count")
    )
    catalog = catalog.merge(freq_map, on=["lat_bin", "lon_bin"], how="left")
    catalog["historical_landslide_freq"] = (
        catalog["event_count"] / catalog["event_count"].max()
    ).clip(0, 1)

    base = catalog.copy()

    # ── Vegetation ────────────────────────────────────────────────────────
    print("[Landslide] Merging vegetation...")
    if "vegetation_cover_pct" not in veg.columns and "ndvi" in veg.columns:
        veg["vegetation_cover_pct"] = ((veg["ndvi"] + 1) / 2 * 100).clip(0, 100)

    if "latitude" not in veg.columns or "longitude" not in veg.columns:
        mean_cover = float(veg["vegetation_cover_pct"].mean())
        print(f"[Landslide] Veg has no coordinates β€” broadcasting mean: {mean_cover:.1f}%")
        base["vegetation_cover_pct"] = mean_cover
    else:
        base = nearest_merge(base, veg, ["vegetation_cover_pct"])

    # ── Fault lines ───────────────────────────────────────────────────────
    print("[Landslide] Merging fault lines...")
    faults = faults.rename(columns={"seismic_hazard_index": "seismic_activity_index"})

    fault_tree   = cKDTree(np.radians(faults[["latitude", "longitude"]].values))
    event_coords = np.radians(base[["latitude", "longitude"]].values)
    dists_rad, idxs = fault_tree.query(event_coords)
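    # Approximation: these are Euclidean distances in radian lat/lon space;
    # multiplying by Earth's radius (6371 km) below approximates great-circle
    # distance for nearby points (not a true haversine calculation).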

    base = base.copy()
    base["distance_to_fault_km"]   = dists_rad * 6371
    base["seismic_activity_index"] = faults["seismic_activity_index"].iloc[idxs].values

    # ── Elevation β†’ aspect_index ──────────────────────────────────────────
    print("[Landslide] Merging elevation...")
    if "aspect_index" not in elev.columns:
        if "aspect_degrees" in elev.columns:
            elev["aspect_index"] = (elev["aspect_degrees"] / 360.0).clip(0, 1)
        else:
            elev["aspect_index"] = 0.5

    base = nearest_merge(base, elev, ["slope_degrees", "aspect_index"])

    # ── Rainfall ──────────────────────────────────────────────────────────
    print("[Landslide] Merging rainfall...")
    rain["date"] = pd.to_datetime(rain["date"], errors="coerce")
    rain_agg = (
        rain.groupby(["latitude", "longitude"])["rainfall_mm"]
        .mean().reset_index()
        .rename(columns={"rainfall_mm": "rainfall_intensity_mmh"})
    )
    rain_agg["rainfall_intensity_mmh"] = rain_agg["rainfall_intensity_mmh"].clip(0, 200)
    base = nearest_merge(base, rain_agg, ["rainfall_intensity_mmh"])

    # ── soil_type_index proxy ─────────────────────────────────────────────
    slope_norm = np.clip(base["slope_degrees"] / 90.0, 0, 1)
    veg_norm  = np.clip(base["vegetation_cover_pct"] / 100.0, 0, 1)
    rain_norm = np.clip(base["rainfall_intensity_mmh"] / 200.0, 0, 1)
    base["soil_type_index"] = np.clip(                          # ← FIX 1: now saved
        1.0 - (0.4 * slope_norm + 0.3 * (1 - veg_norm) + 0.3 * rain_norm), 0, 1
    )

    # ── Risk label ────────────────────────────────────────────────────────
    if "fatality_count" in base.columns:                        # ← FIX 2: no base.get()
        base["fatality_count"] = pd.to_numeric(
            base["fatality_count"], errors="coerce"
        ).fillna(0)
    else:
        base["fatality_count"] = 0.0

    size_map = {"small": 0.2, "medium": 0.5, "large": 0.8,
                "very_large": 1.0, "unknown": 0.3}
    if "landslide_size" in base.columns:                        # ← FIX 2: no base.get()
        base["size_score"] = (
            base["landslide_size"]
            .astype(str).str.lower().str.strip()
            .map(size_map).fillna(0.3)
        )
    else:
        base["size_score"] = 0.3

    max_fatal     = base["fatality_count"].max()
    fatality_norm = (
        np.log1p(base["fatality_count"]) / np.log1p(max_fatal + 1)
    ).clip(0, 1).values
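    # Note: log1p compresses the heavy-tailed fatality counts, and the +1 in
    # the denominator guards against max_fatal == 0 (the ratio then evaluates
    # to 0 for every row instead of dividing by zero).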

    base["risk_score"] = np.clip(
        0.35 * base["size_score"] +
        0.25 * base["historical_landslide_freq"] +
        0.20 * fatality_norm +
        0.15 * slope_norm +
        0.05 * (1 - veg_norm) +
        np.random.normal(0, 0.02, len(base)),
        0.0, 1.0
    )

    print(f"[Landslide] Risk score: mean={base['risk_score'].mean():.3f}, "
          f"std={base['risk_score'].std():.3f}, "
          f">0.5: {(base['risk_score'] > 0.5).sum()} rows")

    # ── Final feature matrix ──────────────────────────────────────────────
    features = [
        "slope_degrees", "rainfall_intensity_mmh", "soil_type_index",
        "vegetation_cover_pct", "seismic_activity_index",
        "distance_to_fault_km", "aspect_index", "historical_landslide_freq",
    ]

    # Verify features match LANDSLIDE_FEATURES exactly
    assert features == list(LANDSLIDE_FEATURES), (
        f"Feature mismatch!\n  train:      {features}\n"
        f"  predictor: {list(LANDSLIDE_FEATURES)}"
    )

    base = base.dropna(subset=features + ["risk_score"])
    print(f"[Landslide] Final training rows: {len(base)}")

    if len(base) < 50:
        raise ValueError(
            f"Only {len(base)} clean rows β€” check CSV paths and column names"
        )

    X = np.zeros((len(base), len(features)), dtype=np.float32)
    for i, feat in enumerate(features):
        lo, hi = FEATURE_RANGES[feat]
        X[:, i] = np.clip(
            (base[feat].values - lo) / (hi - lo + 1e-8), 0, 1
        )

    y = base["risk_score"].values.astype(np.float32)
    return X, y

def generate_earthquake_data(n: int = 3000):
    """
    Loads and joins earthquake datasets:
      - earthquake_history.csv       (spine + historical_seismicity, focal_depth_km, tectonic_stress_index)
      - fault_lines_earthquake.csv   (distance_to_fault_km, seismic_hazard_index)
      - soil_liquefaction.csv        (soil_liquefaction_index)
      - vs30_bedrock.csv             (bedrock_amplification)
      - building_vulnerability.csv   (building_vulnerability)
      - population_earthquake.csv    (population_density_norm)
    """
    print("[Earthquake] Using REAL data loader")

    def nearest_merge(base_df, aux_df, cols,
                      base_lat="latitude", base_lon="longitude",
                      aux_lat="latitude",  aux_lon="longitude"):
        if aux_lat not in aux_df.columns or aux_lon not in aux_df.columns:
            raise ValueError(
                f"nearest_merge: aux_df missing lat/lon. Has: {list(aux_df.columns)}"
            )
        tree = cKDTree(aux_df[[aux_lat, aux_lon]].values)
        _, idxs = tree.query(base_df[[base_lat, base_lon]].values)
        base_df = base_df.copy()
        for col in cols:
            if col not in aux_df.columns:
                raise ValueError(
                    f"nearest_merge: '{col}' not in aux_df. "
                    f"Has: {list(aux_df.columns)}"
                )
            base_df[col] = aux_df[col].iloc[idxs].values
        return base_df

    # ── Load ──────────────────────────────────────────────────────────────
    print("[Earthquake] Loading CSVs...")
    history  = pd.read_csv(os.path.join(DATA_DIR, "earthquake_history.csv"))
    faults   = pd.read_csv(os.path.join(DATA_DIR, "fault_lines_earthquake.csv"))
    liquef   = pd.read_csv(os.path.join(DATA_DIR, "soil_liquefaction.csv"))
    vs30     = pd.read_csv(os.path.join(DATA_DIR, "vs30_bedrock.csv"))
    bldg     = pd.read_csv(os.path.join(DATA_DIR, "building_vulnerability.csv"))
    pop      = pd.read_csv(os.path.join(DATA_DIR, "population_earthquake.csv"))

    for df in (history, faults, liquef, vs30, bldg, pop):
        df.columns = df.columns.str.lower().str.strip()

    print(f"[Earthquake] History: {len(history)} rows, cols: {list(history.columns)}")
    print(f"[Earthquake] Faults cols:  {list(faults.columns)}")
    print(f"[Earthquake] Liquef cols:  {list(liquef.columns)}")
    print(f"[Earthquake] VS30 cols:    {list(vs30.columns)}")
    print(f"[Earthquake] Bldg cols:    {list(bldg.columns)}")
    print(f"[Earthquake] Pop cols:     {list(pop.columns)}")

    # ── Clean history spine ───────────────────────────────────────────────
    history = history.dropna(subset=["latitude", "longitude"])
    history["date"] = pd.to_datetime(history["date"], errors="coerce")
    history = history.dropna(subset=["date"])
    print(f"[Earthquake] After date clean: {len(history)} rows")

    if len(history) == 0:
        raise ValueError(
            "earthquake_history has 0 rows after date parsing. "
            f"Sample raw dates: {pd.read_csv(os.path.join(DATA_DIR, 'earthquake_history.csv'))['date'].head().tolist()}"
        )

    base = history.copy()

    # ── Fault lines β†’ distance_to_fault_km ───────────────────────────────
    # fault_lines_earthquake already has distance_to_fault_km as a column
    # but we still spatial-join to get the nearest fault's values
    print("[Earthquake] Merging fault lines...")
    base = nearest_merge(base, faults, ["distance_to_fault_km"])

    # ── Soil liquefaction ─────────────────────────────────────────────────
    print("[Earthquake] Merging soil liquefaction...")
    base = nearest_merge(base, liquef, ["soil_liquefaction_index"])

    # ── VS30 / bedrock amplification ──────────────────────────────────────
    print("[Earthquake] Merging VS30 bedrock...")
    base = nearest_merge(base, vs30, ["bedrock_amplification"])

    # ── Building vulnerability ────────────────────────────────────────────
    print("[Earthquake] Merging building vulnerability...")
    base = nearest_merge(base, bldg, ["building_vulnerability"])

    # ── Population density ────────────────────────────────────────────────
    print("[Earthquake] Merging population...")
    base = nearest_merge(base, pop, ["population_density_norm"])

    # ── Validate all required columns present ─────────────────────────────
    required = [
        "historical_seismicity", "distance_to_fault_km", "soil_liquefaction_index",
        "focal_depth_km", "tectonic_stress_index", "building_vulnerability",
        "population_density_norm", "bedrock_amplification",
    ]
    missing = [c for c in required if c not in base.columns]
    if missing:
        raise ValueError(
            f"Missing columns after all merges: {missing}\n"
            f"Available: {list(base.columns)}"
        )

    base = base.dropna(subset=required)
    print(f"[Earthquake] Rows after dropna: {len(base)}")

    if len(base) < 50:
        raise ValueError(
            f"Only {len(base)} clean rows β€” check CSV paths and column names"
        )

    # ── Risk label ────────────────────────────────────────────────────────
    # Use magnitude if available, otherwise derive from features
    if "magnitude" in base.columns:
        base["magnitude"] = pd.to_numeric(base["magnitude"], errors="coerce").fillna(0)
        mag_norm = np.clip((base["magnitude"] - 2.0) / 7.0, 0, 1)  # scale 2–9
    else:
        mag_norm = pd.Series(np.zeros(len(base)))

    depth_norm   = np.clip(base["focal_depth_km"] / 700.0, 0, 1)
    fault_norm   = np.clip(base["distance_to_fault_km"] / 200.0, 0, 1)
    liquef_norm  = np.clip(base["soil_liquefaction_index"], 0, 1)
    vuln_norm    = np.clip(base["building_vulnerability"], 0, 1)
    pop_norm     = np.clip(base["population_density_norm"], 0, 1)
    amp_norm     = np.clip(base["bedrock_amplification"], 0, 1)
    stress_norm  = np.clip(base["tectonic_stress_index"], 0, 1)
    seism_norm   = np.clip(base["historical_seismicity"], 0, 1)

    base["risk_score"] = np.clip(
        0.25 * mag_norm.values +
        0.20 * (1 - depth_norm) +        # shallow = more damage
        0.15 * (1 - fault_norm) +        # close to fault = more risk
        0.15 * liquef_norm +
        0.10 * vuln_norm +
        0.05 * pop_norm +
        0.05 * amp_norm +
        0.05 * seism_norm +
        np.random.normal(0, 0.02, len(base)),
        0.0, 1.0
    )
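    # Note: the weights above sum to 1.0 (0.25 + 0.20 + 2*0.15 + 0.10 + 3*0.05);
    # the Gaussian term adds small label noise for regularisation.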

    print(f"[Earthquake] Risk score: mean={base['risk_score'].mean():.3f}, "
          f"std={base['risk_score'].std():.3f}, "
          f">0.5: {(base['risk_score'] > 0.5).sum()} rows")

    # ── Normalise features ────────────────────────────────────────────────
    features = [
        "historical_seismicity", "distance_to_fault_km", "soil_liquefaction_index",
        "focal_depth_km", "tectonic_stress_index", "building_vulnerability",
        "population_density_norm", "bedrock_amplification",
    ]

    assert features == list(EARTHQUAKE_FEATURES), (
        f"Feature mismatch!\n  train:     {features}\n"
        f"  predictor: {list(EARTHQUAKE_FEATURES)}"
    )

    X = np.zeros((len(base), len(features)), dtype=np.float32)
    for i, feat in enumerate(features):
        lo, hi = FEATURE_RANGES[feat]
        X[:, i] = np.clip(
            (base[feat].values - lo) / (hi - lo + 1e-8), 0, 1
        )

    y = base["risk_score"].values.astype(np.float32)
    return X, y


DATA_GENERATORS = {
    "flood":      (generate_flood_data,      FLOOD_FEATURES),
    "cyclone":    (generate_cyclone_data,    CYCLONE_FEATURES),
    "landslide":  (generate_landslide_data,  LANDSLIDE_FEATURES),
    "earthquake": (generate_earthquake_data, EARTHQUAKE_FEATURES),
}
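# To add a new hazard, define a generate_<name>_data() returning the same
# (X, y) contract and register it here alongside its feature tuple, e.g.
# (hypothetical names) "wildfire": (generate_wildfire_data, WILDFIRE_FEATURES).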



# ============================================================================
# TRAINING PIPELINE
# ============================================================================

def evaluate_model(model: FuzzyNeuralNetwork, X: torch.Tensor, y: torch.Tensor) -> dict:
    model.eval()
    with torch.no_grad():
        preds = model(X).numpy().flatten()
    y_np = y.numpy()

    threshold = float(np.median(y_np))
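    # AUC-ROC needs binary labels: binarising the continuous risk score at its
    # median keeps the positive/negative classes roughly balanced.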
    try:
        auc = float(roc_auc_score((y_np > threshold).astype(int), preds))
    except ValueError as e:
        print(f"  [Warning] AUC undefined: {e}")
        auc = float("nan")      # always float, never string

    mae = mean_absolute_error(y_np, preds)

    return {
        "MAE":             round(float(mae), 4),
        "AUC-ROC":         round(auc, 4) if not np.isnan(auc) else float("nan"),
        "Mean Prediction": round(float(preds.mean()), 4),
        "Mean Label":      round(float(y_np.mean()), 4),
        "Std Prediction":  round(float(preds.std()), 4),
    }


def train_disaster_model(disaster_type: str, epochs: int = 1000, n_samples: Optional[int] = None):
    print(f"\n{'='*60}")
    print(f"  Training FNN for: {disaster_type.upper()}")
    print(f"{'='*60}")

    generator_fn, feature_names = DATA_GENERATORS[disaster_type]
    n = n_samples or {
        "flood": 5000, "cyclone": 3000,
        "landslide": 4000, "earthquake": 3000
    }[disaster_type]

    # All four generators currently load real CSVs; the synthetic branch only
    # fires if a synthetic generator is registered in DATA_GENERATORS later.
    REAL_DATA_GENERATORS = {"flood", "cyclone", "landslide", "earthquake"}
    if disaster_type in REAL_DATA_GENERATORS:
        print(f"Loading real data for {disaster_type}...")
    else:
        print(f"Generating {n} synthetic samples...")

    X, y = generator_fn(n)
    print(f"  Data loaded: X={X.shape}, y={y.shape}, "
          f"y_mean={y.mean():.3f}, y_std={y.std():.3f}")

    # Train/val/test split
    X_trainval, X_test, y_trainval, y_test = train_test_split(
        X, y, test_size=0.15, random_state=SEED
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_trainval, y_trainval, test_size=0.15, random_state=SEED
    )
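    # Effective split: 15% test, then 15% of the remainder for validation,
    # i.e. roughly 72% train / 13% val / 15% test.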

    print(f"  Train: {len(X_train)} | Val: {len(X_val)} | Test: {len(X_test)}")

    # Tensors
    X_train_t = torch.tensor(X_train)
    y_train_t = torch.tensor(y_train)
    X_val_t   = torch.tensor(X_val)
    y_val_t   = torch.tensor(y_val)
    X_test_t  = torch.tensor(X_test)
    y_test_t  = torch.tensor(y_test)

    # Model
    n_features = len(feature_names)
    model = FuzzyNeuralNetwork(
        n_features=n_features,
        n_terms=3,
        hidden_dims=[64, 32],
        dropout=0.2
    )

    print(f"  Model: FNN with {n_features} inputs, 3 fuzzy terms, 64β†’32 deep head")
    total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"  Trainable parameters: {total_params:,}")

    # Train
    trainer = FNNTrainer(model, lr=1e-3, weight_decay=1e-4)
    trainer.fit(
        X_train_t, y_train_t,
        X_val_t,   y_val_t,
        epochs=epochs, batch_size=64, patience=50
    )

    # Evaluate
    print("\n  Test set evaluation:")
    metrics = evaluate_model(model, X_test_t, y_test_t)
    for k, v in metrics.items():
        print(f"    {k}: {v}")

    # Save
    os.makedirs(MODEL_DIR, exist_ok=True)
    model_path = os.path.join(MODEL_DIR, f"fnn_{disaster_type}_model.pt")
    save_model(model, model_path, feature_names)

    feat_path = os.path.join(MODEL_DIR, "feature_names", f"{disaster_type}_features.txt")
    os.makedirs(os.path.dirname(feat_path), exist_ok=True)
    with open(feat_path, "w") as f:
        f.write("\n".join(feature_names))

    print(f"\n  Model saved to: {model_path}")
    return metrics


def train_all(epochs: int = 200):
    results = {}
    for disaster_type in DATA_GENERATORS:
        try:
            metrics = train_disaster_model(disaster_type, epochs=epochs)
            if metrics is None:
                raise RuntimeError("train_disaster_model returned None")
            results[disaster_type] = metrics
        except Exception as e:
            print(f"\n  [ERROR] {disaster_type} training failed: {e}")
            import traceback
            traceback.print_exc()
            results[disaster_type] = {
                "MAE": float("nan"),
                "AUC-ROC": float("nan"),
                "Mean Prediction": float("nan"),
                "Mean Label": float("nan"),
                "Std Prediction": float("nan"),
            }

    print("\n" + "="*60)
    print("  TRAINING SUMMARY")
    print("="*60)
    for dt, metrics in results.items():
        auc = metrics["AUC-ROC"]
        mae = metrics["MAE"]
        auc_str = f"{auc:.4f}" if not np.isnan(auc) else "nan"
        mae_str = f"{mae:.4f}" if not np.isnan(mae) else "nan"
        print(f"  {dt.upper():12s} | MAE: {mae_str} | AUC: {auc_str}")
    print("="*60)

# ============================================================================
# ENTRY POINT
# ============================================================================

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Train FNN disaster models")
    parser.add_argument(
        "--disaster",
        choices=list(DATA_GENERATORS.keys()) + ["all"],
        default="all",
        help="Which disaster model to train"
    )
    parser.add_argument("--epochs", type=int, default=200)
    parser.add_argument("--samples", type=int, default=None)

    args = parser.parse_args()

    if args.disaster == "all":
        train_all(epochs=args.epochs)
    else:
        train_disaster_model(args.disaster, epochs=args.epochs, n_samples=args.samples)