Spaces:

itaykadosh
/

AA-EPPS-Data-Challenge

Sleeping

File size: 12,218 Bytes

bef09da

"""
Real-time crew sequence risk predictor.

Usage:
  python predict.py ORD LAX          # predict risk for ORD→DFW→LAX right now
  python predict.py ORD LAX --month 7  # predict for July (historical patterns only)
  python predict.py ORD LAX --no-live  # current month, skip the live weather fetch

How it works:
  1. Base risk  — XGBoost model trained on 2018–2024 historical delay patterns
  2. Live weather adjustment — fetches current METAR for airport_A, DFW, airport_B
                               via AWC API and adjusts the base score upward if
                               current conditions are severe.
  3. Returns a combined risk score [0, 1] with full explanation.
"""

import argparse
import datetime
import os
import sys
import numpy as np
import pandas as pd
import xgboost as xgb

sys.path.insert(0, os.path.dirname(__file__))
from weather import fetch_live_metar, _empty_weather

# Directory holding the processed artifacts (model JSON and parquet tables),
# resolved relative to this file: <this dir>/../data/processed.
PROC = os.path.join(os.path.dirname(__file__), "..", "data", "processed")

# Names of the engineered model features, in a fixed order.
# NOTE(review): nothing in the visible code reads this list — prediction takes
# its column order from the loaded booster — confirm whether it is still needed.
FEATURE_COLS = [
    # Inbound airport (A) statistics
    "A_weather_delay_rate",
    "A_weather_cancel_rate",
    "A_avg_weather_delay_min",
    "A_p75_weather_delay_min",
    "A_p95_weather_delay_min",
    "A_nas_delay_rate",
    "A_overall_weather_delay_rate",
    "A_overall_avg_weather_delay_min",
    # Outbound airport (B) statistics
    "B_weather_delay_rate",
    "B_weather_cancel_rate",
    "B_avg_weather_delay_min",
    "B_p75_weather_delay_min",
    "B_p95_weather_delay_min",
    "B_nas_delay_rate",
    "B_overall_weather_delay_rate",
    "B_overall_avg_weather_delay_min",
    # Pair-level aggregates
    "pair_combined_weather_rate",
    "pair_max_weather_rate",
    "pair_min_weather_rate",
    "pair_weather_rate_sum",
    "pair_avg_weather_delay_min",
    "both_high_risk",
    # Calendar and turnaround context
    "Month",
    "is_spring_summer",
    "median_turnaround_min",
]

# Score bands mapped to (label, advice). Each key is a half-open interval
# [low, high); the last band ends at 1.01 so a score of exactly 1.0 is covered.
RISK_LABELS = {
    (0.0, 0.3): ("LOW", "✓ Safe to sequence"),
    (0.3, 0.55): ("MODERATE", "⚠ Use caution — review turnaround buffer"),
    (0.55, 0.75): ("HIGH", "✗ Avoid if possible — weather-prone pair"),
    (0.75, 1.01): ("CRITICAL", "✗✗ Do not sequence — high cascade risk"),
}


def risk_label(score: float) -> tuple[str, str]:
    """Translate a risk score into its ``(label, advice)`` pair.

    The first band of ``RISK_LABELS`` whose half-open interval contains
    ``score`` wins. A score that falls in no band (below 0, at or above
    1.01, or NaN) yields the ``("CRITICAL", "Do not sequence")`` fallback.
    """
    matching_bands = (
        verdict
        for (lower, upper), verdict in RISK_LABELS.items()
        if lower <= score < upper
    )
    return next(matching_bands, ("CRITICAL", "Do not sequence"))


class PairRiskPredictor:
    """Score the delay risk of an ``airport_A → DFW → airport_B`` crew sequence.

    On construction the trained XGBoost classifier (``xgb_model.json``) and two
    parquet tables (``airport_features.parquet``, ``pair_risk_scores.parquet``)
    are loaded from ``PROC``.
    """

    # Per-airport statistics copied into the feature vector, once with an
    # ``A_`` prefix and once with a ``B_`` prefix, in this order.
    _AIRPORT_STATS = (
        "weather_delay_rate",
        "weather_cancel_rate",
        "avg_weather_delay_min",
        "p75_weather_delay_min",
        "p95_weather_delay_min",
        "nas_delay_rate",
        "overall_weather_delay_rate",
        "overall_avg_weather_delay_min",
    )

    # Calendar month -> season name used for the ``season_*`` dummy columns.
    _SEASON_BY_MONTH = {
        3: "spring", 4: "spring", 5: "spring",
        6: "summer", 7: "summer", 8: "summer",
        9: "fall", 10: "fall", 11: "fall",
        12: "winter", 1: "winter", 2: "winter",
    }

    def __init__(self):
        """Load the model and the precomputed feature tables from ``PROC``."""
        # NOTE(review): device="cuda" is hard-coded although this class only
        # runs single-row inference — confirm behaviour on CPU-only hosts.
        self.model = xgb.XGBClassifier(device="cuda", tree_method="hist")
        self.model.load_model(os.path.join(PROC, "xgb_model.json"))
        self.airport_features = pd.read_parquet(os.path.join(PROC, "airport_features.parquet"))
        self.pair_scores      = pd.read_parquet(os.path.join(PROC, "pair_risk_scores.parquet"))

    @staticmethod
    def _zero_if_missing(value):
        """Return ``value`` unchanged, or ``0.0`` when it is ``None`` or NaN.

        ``value or 0`` cannot be used for this: NaN is truthy, so it would
        pass straight through.
        """
        return 0.0 if value is None or pd.isna(value) else value

    def _get_airport_features(self, airport: str, month: int) -> dict:
        """Return the numeric feature means for ``airport`` in ``month``.

        Rows of ``self.airport_features`` matching both the airport and the
        month are averaged column-wise. If no row matches that month, all rows
        for the airport are averaged instead. An airport with no rows at all
        yields an empty dict.
        """
        row = self.airport_features[
            (self.airport_features["airport"] == airport) &
            (self.airport_features["Month"] == month)
        ]
        if row.empty:
            # Fall back to annual average for that airport
            row = self.airport_features[self.airport_features["airport"] == airport]
        if row.empty:
            return {}
        return row.mean(numeric_only=True).to_dict()

    def _build_feature_vector(self, airport_a: str, airport_b: str, month: int) -> pd.DataFrame:
        """Assemble the single-row feature frame for one airport pair and month.

        Per-airport statistics that are unavailable stay NaN in their own
        ``A_*`` / ``B_*`` columns. For the pair-level aggregates a missing or
        NaN rate or delay counts as 0, so the aggregates are always finite and
        do not depend on which of the two airports is the unknown one.

        Raises:
            KeyError: if ``month`` is not an integer in 1..12.
        """
        fa = self._get_airport_features(airport_a, month)
        fb = self._get_airport_features(airport_b, month)

        feat = {}
        for prefix, stats in (("A", fa), ("B", fb)):
            for stat in self._AIRPORT_STATS:
                feat[f"{prefix}_{stat}"] = stats.get(stat, np.nan)
        feat["Month"] = month
        feat["is_spring_summer"] = int(month in (3, 4, 5, 6, 7, 8))
        feat["median_turnaround_min"] = 90.0  # default 90-min turnaround at DFW

        # Pair aggregates: substitute 0 for missing values explicitly. The
        # previous `x or 0` idiom let NaN through, which made max()/min()
        # return different results depending on argument order.
        a_rate = self._zero_if_missing(feat["A_weather_delay_rate"])
        b_rate = self._zero_if_missing(feat["B_weather_delay_rate"])
        a_delay = self._zero_if_missing(feat["A_avg_weather_delay_min"])
        b_delay = self._zero_if_missing(feat["B_avg_weather_delay_min"])
        feat["pair_combined_weather_rate"] = a_rate * b_rate
        feat["pair_max_weather_rate"]      = max(a_rate, b_rate)
        feat["pair_min_weather_rate"]      = min(a_rate, b_rate)
        feat["pair_weather_rate_sum"]      = a_rate + b_rate
        feat["pair_avg_weather_delay_min"] = (a_delay + b_delay) / 2

        # both_high_risk: both airports above 75th percentile of delay rate
        all_rates = self.airport_features["weather_delay_rate"].dropna()
        p75 = all_rates.quantile(0.75)
        feat["both_high_risk"] = int(a_rate > p75 and b_rate > p75)

        # Season dummies
        season = self._SEASON_BY_MONTH[month]
        for s in ("fall", "spring", "summer", "winter"):
            feat[f"season_{s}"] = int(season == s)

        return pd.DataFrame([feat])

    def predict_historical(self, airport_a: str, airport_b: str, month: int = None) -> dict:
        """Predict using historical patterns only (no live weather).

        Args:
            airport_a: Inbound airport code.
            airport_b: Outbound airport code.
            month: Calendar month 1-12; ``None`` means the current month.

        Returns:
            A dict with keys ``airport_a``, ``airport_b``, ``hub``, ``month``,
            ``base_risk_score`` (model probability of the positive class),
            ``risk_label``, ``advice`` and ``observed_bad_rate`` (the value
            from the pair-score table for this exact pair and month, or
            ``None`` when the table has no such row).
        """
        if month is None:
            month = datetime.date.today().month

        fv = self._build_feature_vector(airport_a, airport_b, month)
        # Align with the columns the booster was trained on; any column the
        # model expects but the vector lacks is filled with 0.0.
        model_cols = self.model.get_booster().feature_names
        for col in model_cols:
            if col not in fv.columns:
                fv[col] = 0.0
        fv = fv[model_cols].astype(float)

        base_score = float(self.model.predict_proba(fv)[0, 1])

        # Look up observed rate from historical data if available
        hist = self.pair_scores[
            (self.pair_scores["airport_A"] == airport_a) &
            (self.pair_scores["airport_B"] == airport_b) &
            (self.pair_scores["Month"] == month)
        ]
        observed = float(hist["observed_bad_rate"].iloc[0]) if not hist.empty else None

        label, advice = risk_label(base_score)
        return {
            "airport_a":      airport_a,
            "airport_b":      airport_b,
            "hub":            "DFW",
            "month":          month,
            "base_risk_score": base_score,
            "risk_label":     label,
            "advice":         advice,
            "observed_bad_rate": observed,
        }

    def predict_live(self, airport_a: str, airport_b: str) -> dict:
        """
        Full real-time prediction: historical model + live weather adjustment.
        Fetches current METAR for airport_A, DFW, and airport_B.

        Returns the ``predict_historical`` dict for the current month, with
        ``risk_label`` and ``advice`` recomputed from the live score and the
        extra keys ``live_risk_score``, ``weather_penalty``,
        ``weather_airport_a``, ``weather_dfw`` and ``weather_airport_b``.
        Prints a progress line to stdout before fetching.
        """
        month = datetime.date.today().month
        result = self.predict_historical(airport_a, airport_b, month)

        print(f"  Fetching live METAR for {airport_a}, DFW, {airport_b}...")
        weather = fetch_live_metar([airport_a, "DFW", airport_b])

        # Any station missing from the response falls back to _empty_weather().
        wa  = weather.get(airport_a, _empty_weather())
        wdfw= weather.get("DFW",     _empty_weather())
        wb  = weather.get(airport_b, _empty_weather())

        # DFW weather affects both legs — weight it higher
        weather_penalty = max(
            wdfw["weather_severity"] * 0.6,   # DFW: hub, both legs affected
            wa["weather_severity"]   * 0.4,   # leg 1: A→DFW
            wb["weather_severity"]   * 0.4,   # leg 2: DFW→B
        )

        base = result["base_risk_score"]
        # Blend: bad weather pushes score toward 1; clear weather doesn't reduce below base
        live_score = base + (1.0 - base) * weather_penalty
        live_score = float(np.clip(live_score, 0, 1))

        label, advice = risk_label(live_score)
        result.update({
            "live_risk_score":   live_score,
            "weather_penalty":   weather_penalty,
            "risk_label":        label,
            "advice":            advice,
            "weather_airport_a": wa,
            "weather_dfw":       wdfw,
            "weather_airport_b": wb,
        })
        return result


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------

def _fmt_weather(label: str, w: dict) -> str:
    raw = w.get("raw", "")
    sev = w.get("weather_severity", 0)
    flags = []
    if w.get("has_thunderstorm"): flags.append("THUNDERSTORM")
    if w.get("has_fog"):          flags.append("FOG/MIST")
    if w.get("has_snow_ice"):     flags.append("SNOW/ICE")
    if w.get("has_low_ceiling"):  flags.append(f"LOW CEILING ({w.get('ceiling_ft',0):.0f}ft)")
    vis = w.get("min_visibility_mi", 10)
    if vis < 3:                   flags.append(f"LOW VIS ({vis:.1f}mi)")
    wind = w.get("max_wind_kt", 0)
    if wind > 20:                 flags.append(f"HIGH WIND ({wind:.0f}kt)")
    flag_str = ", ".join(flags) if flags else "Clear"
    return (
        f"  {label:12s}  severity={sev:.2f}  [{flag_str}]\n"
        f"             METAR: {raw[:80] if raw else 'unavailable'}"
    )


def main():
    """CLI entry point: parse arguments, run a prediction, and print a report.

    With ``--no-live`` or an explicit ``--month`` only the historical model is
    used; otherwise a live prediction for the current month is made. An
    out-of-range ``--month`` is rejected with a usage error (exit status 2)
    before any model or data file is loaded.
    """
    parser = argparse.ArgumentParser(description="Predict crew sequence risk for airport_A → DFW → airport_B")
    parser.add_argument("airport_a", help="Inbound airport IATA code (e.g. ORD)")
    parser.add_argument("airport_b", help="Outbound airport IATA code (e.g. LAX)")
    parser.add_argument("--month", type=int, default=None, help="Month 1-12 (default: current month)")
    parser.add_argument("--no-live", action="store_true", help="Skip live weather fetch")
    args = parser.parse_args()

    # Validate up front: a month outside 1-12 would otherwise fail deep inside
    # feature construction with a bare KeyError, and `--month 0` is falsy, so a
    # truthiness check would silently treat it as "not given".
    if args.month is not None and not 1 <= args.month <= 12:
        parser.error(f"--month must be between 1 and 12 (got {args.month})")

    airport_a = args.airport_a.upper()
    airport_b = args.airport_b.upper()

    predictor = PairRiskPredictor()

    print(f"\n{'='*60}")
    print(f"  Crew Sequence Risk:  {airport_a} → DFW → {airport_b}")
    print(f"{'='*60}")

    if args.no_live or args.month is not None:
        month = args.month if args.month is not None else datetime.date.today().month
        result = predictor.predict_historical(airport_a, airport_b, month)
        score  = result["base_risk_score"]
        label  = result["risk_label"]
        print(f"\n  Historical risk score : {score:.3f}")
        print(f"  Risk level            : {label}")
        print(f"  Advice                : {result['advice']}")
        if result["observed_bad_rate"] is not None:
            print(f"  Observed bad rate     : {result['observed_bad_rate']:.1%}  (historical month {month})")
    else:
        result = predictor.predict_live(airport_a, airport_b)
        base   = result["base_risk_score"]
        live   = result["live_risk_score"]
        print(f"\n  Historical base score : {base:.3f}")
        print(f"  Live weather penalty  : +{result['weather_penalty']:.3f}")
        print(f"  FINAL risk score      : {live:.3f}")
        print(f"  Risk level            : {result['risk_label']}")
        print(f"  Advice                : {result['advice']}")
        # `is not None` rather than truthiness, so an observed rate of exactly
        # 0.0 is still reported (matching the historical branch).
        if result.get("observed_bad_rate") is not None:
            print(f"  Historical bad rate   : {result['observed_bad_rate']:.1%}")
        print("\n  Current conditions:")
        print(_fmt_weather(airport_a, result["weather_airport_a"]))
        print(_fmt_weather("DFW (hub)", result["weather_dfw"]))
        print(_fmt_weather(airport_b, result["weather_airport_b"]))

    print(f"\n{'='*60}\n")


if __name__ == "__main__":
    main()