"""
Stress test & real-world verification for the crew sequence risk model.

Test 1 — 2024 holdout backtest
  Reload model, score 2024 rows (never seen during training), check:
  - ROC-AUC, Average Precision
  - Decile calibration: do higher-score deciles have proportionally higher actual bad rates?
  - Monotonicity: is each decile worse than the previous?

Test 2 — Historic storm event replay
  For known catastrophic weather events, download IEM METAR for affected airports,
  compute per-day weather severity, and check whether the model + weather penalty
  would have correctly flagged those sequences as high-risk.

  Events:
    A. Winter Storm Elliott  — Dec 23-26 2022 (ORD, BUF, CLT, DEN, MDW)
    B. DFW summer convective — Jul 10 2023 (DFW-centric, peak convective season)
    C. Southeast ice storm   — Jan 16 2024 (ATL, CLT, RDU, MIA)
"""

import os, sys
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import roc_auc_score, average_precision_score
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

sys.path.insert(0, os.path.dirname(__file__))
from weather import download_historical_weather, _compute_severity

PROC  = os.path.join(os.path.dirname(__file__), "..", "data", "processed")
PLOTS = os.path.join(PROC, "plots")
os.makedirs(PLOTS, exist_ok=True)

FEATURE_COLS = [
    "A_weather_delay_rate", "A_weather_cancel_rate", "A_avg_weather_delay_min",
    "A_p75_weather_delay_min", "A_p95_weather_delay_min", "A_nas_delay_rate",
    "A_overall_weather_delay_rate", "A_overall_avg_weather_delay_min",
    "B_weather_delay_rate", "B_weather_cancel_rate", "B_avg_weather_delay_min",
    "B_p75_weather_delay_min", "B_p95_weather_delay_min", "B_nas_delay_rate",
    "B_overall_weather_delay_rate", "B_overall_avg_weather_delay_min",
    "pair_combined_weather_rate", "pair_max_weather_rate", "pair_min_weather_rate",
    "pair_weather_rate_sum", "pair_avg_weather_delay_min", "both_high_risk",
    "Month", "is_spring_summer", "median_turnaround_min",
]

STORM_EVENTS = {
    "Winter Storm Elliott (Dec 23-26 2022)": {
        "airports": ["ORD", "BUF", "CLT", "DEN", "MDW", "LGA", "DFW"],
        "year": 2022,
        "dates": pd.date_range("2022-12-23", "2022-12-26"),
        "expected": "CRITICAL — massive cancellation event across US",
    },
    "DFW Convective Outbreak (Jul 10 2023)": {
        "airports": ["DFW", "DAL", "AUS", "SAT", "HOU", "MSY", "ORD"],
        "year": 2023,
        "dates": pd.date_range("2023-07-08", "2023-07-12"),
        "expected": "HIGH — peak summer convective at DFW hub",
    },
    "Southeast Ice Storm (Jan 16-17 2024)": {
        "airports": ["ATL", "CLT", "RDU", "GSO", "BNA", "MIA", "DFW"],
        "year": 2024,
        "dates": pd.date_range("2024-01-15", "2024-01-18"),
        "expected": "HIGH — ice/freezing rain across Southeast",
    },
}


# ---------------------------------------------------------------------------
# Test 1: 2024 holdout backtest
# ---------------------------------------------------------------------------

def test1_holdout_backtest():
    print("\n" + "="*65)
    print("TEST 1 — 2024 Holdout Backtest")
    print("="*65)

    df = pd.read_parquet(os.path.join(PROC, "sequence_features.parquet"))
    season_cols = [c for c in df.columns if c.startswith("season_")]
    feat_cols = [c for c in FEATURE_COLS + season_cols if c in df.columns]

    val = df[df["Year"] == 2024].copy()
    X_val = val[feat_cols].astype(float)
    y_val = val["target"].astype(int)

    model = xgb.XGBClassifier()
    model.load_model(os.path.join(PROC, "xgb_model.json"))
    proba = model.predict_proba(X_val)[:, 1]

    roc  = roc_auc_score(y_val, proba)
    ap   = average_precision_score(y_val, proba)
    print(f"\nROC-AUC (2024):          {roc:.4f}")
    print(f"Average Precision (2024):{ap:.4f}")
    print(f"Baseline bad rate:       {y_val.mean():.3f} ({y_val.mean():.1%})")

    # Decile calibration
    val = val.copy()
    val["proba"] = proba
    val["decile"] = pd.qcut(val["proba"], q=10, labels=False, duplicates="drop")

    decile_tbl = (
        val.groupby("decile")
        .agg(
            n=("target", "count"),
            actual_bad_rate=("target", "mean"),
            mean_predicted=("proba", "mean"),
            mean_observed=("observed_bad_rate", "mean"),
        )
        .reset_index()
    )
    print("\nDecile Calibration (0=lowest predicted risk, 9=highest):")
    print(decile_tbl.to_string(index=False, float_format="{:.3f}".format))

    # Monotonicity check
    rates = decile_tbl["actual_bad_rate"].values
    monotone = all(rates[i] <= rates[i+1] for i in range(len(rates)-1))
    print(f"\nMonotonicity (each decile worse than prev): {'PASS' if monotone else 'FAIL'}")

    # Lift: top decile vs bottom decile
    top_rate    = decile_tbl.iloc[-1]["actual_bad_rate"]
    bottom_rate = decile_tbl.iloc[0]["actual_bad_rate"]
    print(f"Lift (top/bottom decile): {top_rate/bottom_rate:.1f}x")

    # Plot calibration
    fig, axes = plt.subplots(1, 2, figsize=(13, 5))

    ax = axes[0]
    ax.plot(decile_tbl["mean_predicted"], decile_tbl["actual_bad_rate"],
            "o-", color="steelblue", label="Actual bad rate")
    ax.plot([0,1],[0,1], "k--", alpha=0.4, label="Perfect calibration")
    ax.set_xlabel("Mean predicted probability")
    ax.set_ylabel("Actual bad rate")
    ax.set_title("Calibration — 2024 holdout\n(by predicted probability decile)")
    ax.legend()
    ax.grid(alpha=0.3)

    ax = axes[1]
    bars = ax.bar(decile_tbl["decile"], decile_tbl["actual_bad_rate"],
                  color=plt.cm.RdYlGn_r(decile_tbl["actual_bad_rate"]))
    ax.axhline(y_val.mean(), color="navy", linestyle="--", label=f"Baseline {y_val.mean():.1%}")
    ax.set_xlabel("Risk score decile (0=lowest, 9=highest)")
    ax.set_ylabel("Actual bad rate")
    ax.set_title(f"Actual bad rate by decile — 2024 holdout\nROC-AUC={roc:.3f}  AP={ap:.3f}")
    ax.legend()
    ax.grid(axis="y", alpha=0.3)

    plt.tight_layout()
    out = os.path.join(PLOTS, "stress_test1_calibration.png")
    plt.savefig(out, dpi=150)
    plt.close()
    print(f"\nCalibration plot → {out}")

    return decile_tbl


# ---------------------------------------------------------------------------
# Test 2: Historic storm event replay
# ---------------------------------------------------------------------------

def test2_storm_replay():
    print("\n" + "="*65)
    print("TEST 2 — Historic Storm Event Replay")
    print("="*65)

    all_results = []

    for event_name, ev in STORM_EVENTS.items():
        print(f"\n--- {event_name} ---")
        print(f"Expected: {ev['expected']}")
        airports  = ev["airports"]
        year      = ev["year"]
        dates     = ev["dates"]

        # Download/cache IEM METAR for these airports × year
        print(f"Fetching IEM METAR for {airports} in {year}...")
        wx = download_historical_weather(airports, [year])

        if wx.empty:
            print("  WARNING: No weather data returned")
            continue

        # Filter to event dates
        wx["date"] = pd.to_datetime(wx["date"])
        ev_wx = wx[wx["date"].isin(dates)].copy()

        if ev_wx.empty:
            print("  WARNING: No data for event dates")
            continue

        print(f"\n  Airport weather during event:")
        cols = ["iata", "date", "weather_severity", "has_thunderstorm",
                "has_snow_ice", "has_fog", "min_visibility_mi",
                "max_wind_kt", "ceiling_ft"]
        cols_present = [c for c in cols if c in ev_wx.columns]
        print(ev_wx[cols_present].sort_values(["date","iata"]).to_string(index=False, float_format="{:.2f}".format))

        # Summary stats per airport
        summary = ev_wx.groupby("iata")["weather_severity"].agg(["mean","max"])
        summary.columns = ["avg_severity", "max_severity"]
        print(f"\n  Severity summary:")
        print(summary.sort_values("max_severity", ascending=False).to_string(float_format="{:.3f}".format))

        # For each airport pair involving DFW, compute what live_score would have been
        # using peak weather day severity
        print(f"\n  Pair-level impact (A→DFW→B, worst day):")
        dfwrow = ev_wx[ev_wx["iata"] == "DFW"]
        dfw_max_sev = dfwrow["weather_severity"].max() if not dfwrow.empty else 0.0

        for iata in airports:
            if iata == "DFW":
                continue
            rows = ev_wx[ev_wx["iata"] == iata]
            if rows.empty:
                continue
            ap_max_sev = rows["weather_severity"].max()
            # Simulate weather penalty formula from predict.py
            penalty = max(dfw_max_sev * 0.6, ap_max_sev * 0.4)
            all_results.append({
                "event":       event_name,
                "airport":     iata,
                "dfw_severity":dfw_max_sev,
                "ap_severity": ap_max_sev,
                "wx_penalty":  penalty,
                "flag":        "HIGH" if penalty > 0.3 else ("MOD" if penalty > 0.15 else "LOW"),
            })
            print(f"    {iata}: airport_sev={ap_max_sev:.2f}  dfw_sev={dfw_max_sev:.2f}  "
                  f"penalty={penalty:.2f}  → {all_results[-1]['flag']}")

    # Summary table
    if all_results:
        res_df = pd.DataFrame(all_results)
        print("\n\n" + "="*65)
        print("STORM REPLAY SUMMARY")
        print("="*65)
        print(res_df.groupby(["event","flag"]).size().unstack(fill_value=0).to_string())

        # Plot
        fig, ax = plt.subplots(figsize=(12, 6))
        events_short = {e: e.split("(")[0].strip() for e in res_df["event"].unique()}
        res_df["event_short"] = res_df["event"].map(events_short)
        colors = {"HIGH": "#d62728", "MOD": "#ff7f0e", "LOW": "#2ca02c"}
        for i, (ev_short, grp) in enumerate(res_df.groupby("event_short")):
            x = range(len(grp))
            bars = ax.bar(
                [j + i*0.28 for j in range(len(grp))],
                grp["wx_penalty"].values,
                width=0.25,
                label=ev_short,
                color=[colors[f] for f in grp["flag"]],
                alpha=0.8,
            )

        ax.axhline(0.30, color="red",    linestyle="--", linewidth=1, label="HIGH threshold (0.30)")
        ax.axhline(0.15, color="orange", linestyle="--", linewidth=1, label="MOD threshold (0.15)")
        ax.set_ylabel("Weather penalty")
        ax.set_title("Storm Event Replay — Weather Penalty per Airport Pair\n(higher = model would have flagged pair as high-risk)")
        ax.legend(loc="upper right", fontsize=8)
        ax.grid(axis="y", alpha=0.3)
        plt.tight_layout()
        out = os.path.join(PLOTS, "stress_test2_storm_replay.png")
        plt.savefig(out, dpi=150)
        plt.close()
        print(f"\nStorm replay plot → {out}")

    return all_results


if __name__ == "__main__":
    t1 = test1_holdout_backtest()
    t2 = test2_storm_replay()

    print("\n" + "="*65)
    print("STRESS TEST COMPLETE")
    print("="*65)
    print("Plots saved to data/processed/plots/")
    print("  stress_test1_calibration.png")
    print("  stress_test2_storm_replay.png")