"""
CycloneDataLoader
=================
Joins the four cyclone CSVs into a single training-ready DataFrame.
Handles spatial nearest-neighbour merging for SST, moisture, and shear.
"""

import pandas as pd
import numpy as np
from scipy.spatial import cKDTree
from pathlib import Path


def _nearest_merge(
    base: pd.DataFrame,
    aux: pd.DataFrame,
    aux_cols: list[str],
    aux_lat: str = "latitude",
    aux_lon: str = "longitude",
) -> pd.DataFrame:
    """
    Attach `aux_cols` from the spatially nearest `aux` row to every `base` row.

    Coordinates are interpreted as degrees and projected onto the unit
    sphere before the cKDTree lookup. Straight-line (chord) distance between
    unit vectors grows monotonically with great-circle distance, so the
    nearest neighbour in 3-D is the nearest neighbour on the globe. This
    keeps the match correct across the +/-180 degree antimeridian, when the
    two tables use different longitude conventions (0..360 vs -180..180),
    and at high latitudes where a degree of longitude is short.

    Parameters
    ----------
    base : frame with "latitude" and "longitude" columns (degrees).
    aux : frame supplying the values to attach.
    aux_cols : columns of `aux` to copy onto the result.
    aux_lat, aux_lon : names of the coordinate columns in `aux`.

    Returns
    -------
    A copy of `base` with `aux_cols` added (or overwritten). The input
    frames are left untouched.
    """

    def _unit_vectors(frame: pd.DataFrame, lat_col: str, lon_col: str) -> np.ndarray:
        # Degrees -> (x, y, z) points on the unit sphere.
        lat = np.radians(frame[lat_col].to_numpy(dtype=float))
        lon = np.radians(frame[lon_col].to_numpy(dtype=float))
        cos_lat = np.cos(lat)
        return np.column_stack(
            (cos_lat * np.cos(lon), cos_lat * np.sin(lon), np.sin(lat))
        )

    tree = cKDTree(_unit_vectors(aux, aux_lat, aux_lon))
    # Only the neighbour indices are needed; the distances are discarded.
    _, idxs = tree.query(_unit_vectors(base, "latitude", "longitude"))

    # Work on a copy so the caller's frame is not modified behind its back.
    merged = base.copy()
    for col in aux_cols:
        # Positional lookup: idxs are row positions in aux, not index labels.
        merged[col] = aux[col].iloc[idxs].values
    return merged


def load_cyclone_training_data(data_dir: str = "data") -> pd.DataFrame:
    """
    Build the cyclone training table from the four CSVs in `data_dir`.

    Reads cyclone_tracks_clean.csv, sea_surface_temp.csv,
    atmospheric_moisture.csv and wind_shear.csv, lower-cases/strips their
    column names, attaches SST, moisture and shear to every track row by
    spatial nearest neighbour, adds a synthetic `risk_score` label and drops
    rows with NaN in any model feature or in the label.

    Columns produced (matching CYCLONE_FEATURES exactly):
      wind_speed_kmh, central_pressure_hpa, sea_surface_temp_c,
      track_curvature, distance_to_coast_km, storm_surge_potential,
      atmospheric_moisture, shear_index
    followed by risk_score, date, latitude, longitude, basin, storm_type.

    Parameters
    ----------
    data_dir : directory containing the four CSV files.

    Returns
    -------
    DataFrame with a fresh 0..n-1 index and the columns listed above.

    Raises
    ------
    ValueError
        If any input table lacks a column this loader needs; the message
        names the file and the missing columns.
    """
    root = Path(data_dir)

    required = [
        "wind_speed_kmh", "central_pressure_hpa", "sea_surface_temp_c",
        "track_curvature", "distance_to_coast_km", "storm_surge_potential",
        "atmospheric_moisture", "shear_index",
    ]
    passthrough = ["date", "latitude", "longitude", "basin", "storm_type"]

    # ── Load raw tables ────────────────────────────────────────────────────
    tracks = pd.read_csv(root / "cyclone_tracks_clean.csv", parse_dates=["date"])
    sst    = pd.read_csv(root / "sea_surface_temp.csv")
    moist  = pd.read_csv(root / "atmospheric_moisture.csv", parse_dates=["date"])
    shear  = pd.read_csv(root / "wind_shear.csv")

    # ── Normalise column names to lowercase ───────────────────────────────
    for table in (tracks, sst, moist, shear):
        table.columns = table.columns.str.lower().str.strip()

    # ── Validate inputs before merging ─────────────────────────────────────
    # Checking up front turns a bare KeyError from deep inside a merge (or
    # from the final column selection) into one error naming the file.
    expected = [
        ("cyclone_tracks_clean.csv", tracks, [
            "wind_speed_kmh", "central_pressure_hpa", "track_curvature",
            "distance_to_coast_km", "storm_surge_potential",
        ] + passthrough),
        ("sea_surface_temp.csv", sst,
         ["latitude", "longitude", "sea_surface_temp_c"]),
        ("atmospheric_moisture.csv", moist,
         ["latitude", "longitude", "atmospheric_moisture"]),
        ("wind_shear.csv", shear,
         ["latitude", "longitude", "shear_index"]),
    ]
    for file_name, table, columns in expected:
        absent = [c for c in columns if c not in table.columns]
        if absent:
            raise ValueError(f"Missing columns in {file_name}: {absent}")

    # ── Base: use tracks as spine ──────────────────────────────────────────
    # Track columns already carry the model-schema names, so no rename is
    # needed. Copy so the merges never touch the freshly loaded table.
    df = tracks.copy()

    # ── Merge SST (spatial nearest) ───────────────────────────────────────
    # SST has time_index — match on spatial proximity only (SST changes slowly)
    df = _nearest_merge(df, sst, aux_cols=["sea_surface_temp_c"])

    # ── Merge atmospheric moisture (spatial nearest) ───────────────────────
    # NOTE(review): the moisture table's date column is not used for
    # matching; confirm that purely spatial matching is intended here.
    df = _nearest_merge(
        df, moist,
        aux_cols=["atmospheric_moisture"],
    )

    # ── Merge shear (spatial nearest) ─────────────────────────────────────
    df = _nearest_merge(df, shear, aux_cols=["shear_index"])

    # ── Validate all features present ─────────────────────────────────────
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"Missing columns after merge: {missing}")

    # ── Synthesise risk label ──────────────────────────────────────────────
    # Combines intensity + proximity + surge into a [0,1] score.
    # Replace this with ground-truth labels if you have them.
    df["risk_score"] = _compute_risk_label(df)

    # ── Drop rows with NaN in any model feature ───────────────────────────
    df = df.dropna(subset=required + ["risk_score"])

    return df[required + ["risk_score"] + passthrough].reset_index(drop=True)


def _compute_risk_label(
    df: pd.DataFrame,
    noise_std: float = 0.02,
    rng: "np.random.Generator | None" = None,
) -> pd.Series:
    """
    Synthetic risk score in [0, 1] derived from physical intensity signals.

    Each driver is mapped linearly onto [0, 1] and clipped, then combined
    with fixed weights (which sum to 1):
      - wind speed, 0..350 km/h                        weight 0.30
      - pressure deficit, 1013 hPa down to 870 hPa     weight 0.25
      - coastal proximity, 500 km away down to 0 km    weight 0.20
      - storm surge potential, clipped to 0..1         weight 0.15
      - SST bonus, rising from 26 C to 35 C            weight 0.10

    Gaussian noise is added before the final clip so a model trained on the
    label cannot simply memorise the exact thresholds.

    Parameters
    ----------
    df : frame with columns wind_speed_kmh, central_pressure_hpa,
        distance_to_coast_km, storm_surge_potential, sea_surface_temp_c.
    noise_std : standard deviation of the added noise; 0 disables it.
    rng : optional NumPy Generator for reproducible noise. When omitted the
        global `np.random` state is used, as before.

    Returns
    -------
    Series aligned with `df`, values clipped to [0, 1]. NaN inputs
    propagate to NaN scores.
    """
    # Normalise each driver to [0, 1]
    wind_norm     = np.clip(df["wind_speed_kmh"] / 350.0, 0, 1)
    pressure_norm = np.clip(
        (1013 - df["central_pressure_hpa"]) / (1013 - 870), 0, 1
    )
    coast_norm    = np.clip(1 - df["distance_to_coast_km"] / 500.0, 0, 1)
    surge_norm    = np.clip(df["storm_surge_potential"], 0, 1)
    sst_bonus     = np.clip((df["sea_surface_temp_c"] - 26) / 9, 0, 1)

    score = (
        0.30 * wind_norm +
        0.25 * pressure_norm +
        0.20 * coast_norm +
        0.15 * surge_norm +
        0.10 * sst_bonus
    )

    # Fall back to the legacy global RNG so existing np.random.seed(...)
    # call sites keep controlling the noise when no Generator is passed.
    draw = np.random.normal if rng is None else rng.normal
    noise = draw(0, noise_std, len(score))
    return np.clip(score + noise, 0.0, 1.0)