Spaces:
Sleeping
Sleeping
| """ | |
| Out-of-Distribution (OOD) Validation for the crew sequence risk model. | |
| Three OOD tests: | |
| 1. Temporal OOD — 2015 data (before training window, never seen) | |
| 2. Extreme event — Jan/Feb 2015 (record NE blizzards, high disruption) | |
| 3. Carrier OOD — Non-AA carriers (DL, UA, WN) at DFW, 2022–2023 | |
| Tests whether airport-level risk is carrier-agnostic | |
| KEY: OOD data is aggregated to pair×month level (same as training) before | |
| evaluation. Flight-level→aggregate mismatch was the prior bug. | |
| All tests use the model trained on AA/DFW 2018–2023 (2015–2017 never seen). | |
| """ | |
| import os | |
| import io | |
| import zipfile | |
| import requests | |
| import numpy as np | |
| import pandas as pd | |
| import xgboost as xgb | |
| from sklearn.metrics import roc_auc_score, average_precision_score, classification_report | |
| import matplotlib | |
| matplotlib.use("Agg") | |
| import matplotlib.pyplot as plt | |
| RAW_DIR = os.path.join(os.path.dirname(__file__), "..", "data", "raw") | |
| OOD_DIR = os.path.join(os.path.dirname(__file__), "..", "data", "ood") | |
| PROC_DIR = os.path.join(os.path.dirname(__file__), "..", "data", "processed") | |
| os.makedirs(OOD_DIR, exist_ok=True) | |
| BTS_URL = ( | |
| "https://www.transtats.bts.gov/PREZIP/" | |
| "On_Time_Reporting_Carrier_On_Time_Performance_1987_present_{year}_{month}.zip" | |
| ) | |
| KEEP_COLS = [ | |
| "FlightDate", "Reporting_Airline", "Origin", "Dest", | |
| "CRSDepTime", "DepTime", "DepDelay", "DepDelayMinutes", | |
| "TaxiOut", "WheelsOff", "WheelsOn", "TaxiIn", | |
| "CRSArrTime", "ArrTime", "ArrDelay", "ArrDelayMinutes", | |
| "Cancelled", "CancellationCode", "Diverted", | |
| "CarrierDelay", "WeatherDelay", "NASDelay", "SecurityDelay", "LateAircraftDelay", | |
| "DayOfWeek", "Month", "Year", | |
| ] | |
| WEATHER_DELAY_THRESHOLD_MIN = 15 | |
| TURNAROUND_MAX_HRS = 4 | |
| TURNAROUND_MIN_HRS = 0.5 | |
| # Must match training feature cols exactly | |
| FEATURE_COLS = [ | |
| "A_weather_delay_rate", "A_weather_cancel_rate", "A_avg_weather_delay_min", | |
| "A_p75_weather_delay_min", "A_p95_weather_delay_min", "A_nas_delay_rate", | |
| "A_overall_weather_delay_rate", "A_overall_avg_weather_delay_min", | |
| "B_weather_delay_rate", "B_weather_cancel_rate", "B_avg_weather_delay_min", | |
| "B_p75_weather_delay_min", "B_p95_weather_delay_min", "B_nas_delay_rate", | |
| "B_overall_weather_delay_rate", "B_overall_avg_weather_delay_min", | |
| "pair_combined_weather_rate", "pair_max_weather_rate", "pair_min_weather_rate", | |
| "pair_weather_rate_sum", "pair_avg_weather_delay_min", "both_high_risk", | |
| "Month", "is_spring_summer", "median_turnaround_min", | |
| ] | |
| # Global bad-rate threshold from training (from feature_engineering output) | |
| GLOBAL_BAD_RATE_THRESHOLD = 0.167 | |
| # --------------------------------------------------------------------------- | |
| # Core pipeline helpers (mirrors feature_engineering.py logic) | |
| # --------------------------------------------------------------------------- | |
| def add_season(df: pd.DataFrame) -> pd.DataFrame: | |
| df = df.copy() | |
| df["FlightDate"] = pd.to_datetime(df["FlightDate"]) | |
| df["Month"] = df["FlightDate"].dt.month | |
| df["Year"] = df["FlightDate"].dt.year | |
| df["Season"] = df["Month"].map({ | |
| 12:"winter",1:"winter",2:"winter", | |
| 3:"spring",4:"spring",5:"spring", | |
| 6:"summer",7:"summer",8:"summer", | |
| 9:"fall",10:"fall",11:"fall", | |
| }) | |
| return df | |
| def _parse_hhmm(series: pd.Series) -> pd.Series: | |
| s = series.fillna(0).astype(int).astype(str).str.zfill(4) | |
| return s.str[:2].astype(int) * 60 + s.str[2:].astype(int) | |
| def build_sequences_agg(df: pd.DataFrame, hub: str) -> pd.DataFrame: | |
| """ | |
| Mirrors updated feature_engineering.py: aggregate inbound/outbound to | |
| (date, airport) level first, then cross-join → pair×date rows. | |
| Returns flight-level-labeled pairs ready for aggregation. | |
| """ | |
| inbound = df[(df["Dest"] == hub) & (df["Cancelled"] != 1)].copy() | |
| outbound = df[(df["Origin"] == hub) & (df["Cancelled"] != 1)].copy() | |
| inbound["arr_min"] = _parse_hhmm(inbound["ArrTime"]) | |
| outbound["dep_min"] = _parse_hhmm(outbound["DepTime"]) | |
| ib_agg = ( | |
| inbound.groupby(["FlightDate","Origin","Month","Season","Year"]) | |
| .agg( | |
| arr_min_earliest = ("arr_min", "min"), | |
| arr_min_latest = ("arr_min", "max"), | |
| weather_delay_A = ("WeatherDelay", lambda x: x.fillna(0).max()), | |
| arr_delay_A = ("ArrDelay", lambda x: x.fillna(0).max()), | |
| nas_delay_A = ("NASDelay", lambda x: x.fillna(0).max()), | |
| late_aircraft_A = ("LateAircraftDelay",lambda x: x.fillna(0).max()), | |
| ) | |
| .reset_index() | |
| .rename(columns={"Origin":"airport_A"}) | |
| ) | |
| ob_agg = ( | |
| outbound.groupby(["FlightDate","Dest"]) | |
| .agg( | |
| dep_min_earliest = ("dep_min", "min"), | |
| dep_min_latest = ("dep_min", "max"), | |
| weather_delay_B = ("WeatherDelay", lambda x: x.fillna(0).max()), | |
| dep_delay_B = ("DepDelay", lambda x: x.fillna(0).max()), | |
| late_aircraft_B = ("LateAircraftDelay",lambda x: x.fillna(0).max()), | |
| ) | |
| .reset_index() | |
| .rename(columns={"Dest":"airport_B"}) | |
| ) | |
| pairs = ib_agg.merge(ob_agg, on="FlightDate", how="inner") | |
| pairs = pairs[pairs["airport_A"] != pairs["airport_B"]].copy() | |
| turnaround_mid = pairs["dep_min_earliest"] - pairs["arr_min_latest"] | |
| feasible = ( | |
| (pairs["dep_min_latest"] >= pairs["arr_min_earliest"] + TURNAROUND_MIN_HRS * 60) & | |
| (pairs["dep_min_earliest"] <= pairs["arr_min_latest"] + TURNAROUND_MAX_HRS * 60) | |
| ) | |
| pairs = pairs[feasible].copy() | |
| pairs["turnaround_min"] = turnaround_mid[feasible].clip(lower=0) | |
| return pairs | |
| def label_pairs(pairs: pd.DataFrame) -> pd.DataFrame: | |
| weather_A = pairs["weather_delay_A"].fillna(0) >= WEATHER_DELAY_THRESHOLD_MIN | |
| weather_B = pairs["weather_delay_B"].fillna(0) >= WEATHER_DELAY_THRESHOLD_MIN | |
| cascade = ( | |
| (pairs["arr_delay_A"].fillna(0) >= WEATHER_DELAY_THRESHOLD_MIN) & | |
| (pairs["late_aircraft_B"].fillna(0) >= WEATHER_DELAY_THRESHOLD_MIN) | |
| ) | |
| pairs["raw_label"] = (weather_A | weather_B | cascade).astype(int) | |
| return pairs | |
| def aggregate_to_pair_month(pairs: pd.DataFrame) -> pd.DataFrame: | |
| """ | |
| Collapse pair×date rows → pair×month rows, binarize using global threshold. | |
| Mirrors save_features() in feature_engineering.py. | |
| """ | |
| agg = ( | |
| pairs.groupby(["airport_A","airport_B","Month","Year"]) | |
| .agg( | |
| observed_bad_rate = ("raw_label", "mean"), | |
| median_turnaround_min= ("turnaround_min", "median"), | |
| n_sequences = ("raw_label", "count"), | |
| ) | |
| .reset_index() | |
| ) | |
| agg["target"] = (agg["observed_bad_rate"] >= GLOBAL_BAD_RATE_THRESHOLD).astype(int) | |
| return agg | |
| def attach_airport_features(pairs: pd.DataFrame, airport_features: pd.DataFrame) -> pd.DataFrame: | |
| feat_a = airport_features.rename(columns=lambda c: f"A_{c}" if c not in ("airport","Month") else c)\ | |
| .rename(columns={"airport":"airport_A"}) | |
| feat_b = airport_features.rename(columns=lambda c: f"B_{c}" if c not in ("airport","Month") else c)\ | |
| .rename(columns={"airport":"airport_B"}) | |
| pairs = pairs.merge(feat_a, on=["airport_A","Month"], how="left") | |
| pairs = pairs.merge(feat_b, on=["airport_B","Month"], how="left") | |
| pairs["pair_combined_weather_rate"] = pairs["A_weather_delay_rate"] * pairs["B_weather_delay_rate"] | |
| pairs["pair_max_weather_rate"] = pairs[["A_weather_delay_rate","B_weather_delay_rate"]].max(axis=1) | |
| pairs["pair_min_weather_rate"] = pairs[["A_weather_delay_rate","B_weather_delay_rate"]].min(axis=1) | |
| pairs["pair_weather_rate_sum"] = pairs["A_weather_delay_rate"] + pairs["B_weather_delay_rate"] | |
| pairs["pair_avg_weather_delay_min"] = (pairs["A_avg_weather_delay_min"] + pairs["B_avg_weather_delay_min"]) / 2 | |
| pairs["both_high_risk"] = ( | |
| (pairs["A_weather_delay_rate"] > pairs["A_weather_delay_rate"].quantile(0.75)) & | |
| (pairs["B_weather_delay_rate"] > pairs["B_weather_delay_rate"].quantile(0.75)) | |
| ).astype(int) | |
| # Season dummies from Month (no Season col at pair×month level) | |
| pairs["season_spring"] = pairs["Month"].isin([3,4,5]).astype(int) | |
| pairs["season_summer"] = pairs["Month"].isin([6,7,8]).astype(int) | |
| pairs["season_fall"] = pairs["Month"].isin([9,10,11]).astype(int) | |
| pairs["season_winter"] = pairs["Month"].isin([12,1,2]).astype(int) | |
| pairs["is_spring_summer"] = (pairs["season_spring"] | pairs["season_summer"]) | |
| return pairs | |
| def run_ood_pipeline(df: pd.DataFrame, hub: str, airport_features: pd.DataFrame) -> pd.DataFrame: | |
| """Full pipeline: raw flights → labeled pair×month rows with features.""" | |
| df = add_season(df) | |
| pairs = build_sequences_agg(df, hub=hub) | |
| pairs = label_pairs(pairs) | |
| agg = aggregate_to_pair_month(pairs) | |
| agg = attach_airport_features(agg, airport_features) | |
| return agg | |
| def evaluate(name: str, model: xgb.XGBClassifier, | |
| df: pd.DataFrame, feature_cols: list[str]) -> dict: | |
| X = df[feature_cols].astype(float) | |
| y = df["target"].astype(int) | |
| mask = X.notna().all(axis=1) | |
| X, y = X[mask], y[mask] | |
| if len(y) == 0 or y.nunique() < 2: | |
| print(f"[{name}] Skipped — insufficient data or single class.") | |
| return {} | |
| proba = model.predict_proba(X)[:, 1] | |
| pred = model.predict(X) | |
| auc = roc_auc_score(y, proba) | |
| ap = average_precision_score(y, proba) | |
| rate = y.mean() | |
| print(f"\n{'='*55}") | |
| print(f"OOD Test: {name}") | |
| print(f" Rows (pair×month): {len(y):,}") | |
| print(f" Positive rate: {rate:.1%}") | |
| print(f" ROC-AUC: {auc:.4f}") | |
| print(f" Avg Precision: {ap:.4f}") | |
| print(classification_report(y, pred, target_names=["low_risk","high_risk"], zero_division=0)) | |
| return {"name": name, "n": len(y), "positive_rate": rate, "roc_auc": auc, "avg_precision": ap} | |
| # --------------------------------------------------------------------------- | |
| # OOD Test 1: Temporal — all-carrier DFW 2015 (never in training) | |
| # --------------------------------------------------------------------------- | |
| def test_temporal_ood(model, airport_features, feature_cols): | |
| print("\n[OOD 1] Temporal — all-carrier DFW 2015 (before training window)") | |
| raw = os.path.join(RAW_DIR, "bts_all_dfw_2015.parquet") | |
| if not os.path.exists(raw): | |
| print(" bts_all_dfw_2015.parquet not found — skipping"); return {} | |
| df = pd.read_parquet(raw) | |
| agg = run_ood_pipeline(df, hub="DFW", airport_features=airport_features) | |
| agg.to_parquet(os.path.join(OOD_DIR, "ood_temporal_2015.parquet"), index=False) | |
| return evaluate("Temporal OOD — all-carrier DFW 2015", model, agg, feature_cols) | |
| # --------------------------------------------------------------------------- | |
| # OOD Test 2: Extreme event — Jan/Feb 2015 record NE blizzards | |
| # --------------------------------------------------------------------------- | |
| def test_extreme_event_ood(model, airport_features, feature_cols): | |
| print("\n[OOD 2] Extreme event — Jan/Feb 2015 NE blizzards") | |
| raw = os.path.join(RAW_DIR, "bts_all_dfw_2015.parquet") | |
| if not os.path.exists(raw): | |
| print(" bts_all_dfw_2015.parquet not found — skipping"); return {} | |
| df = pd.read_parquet(raw) | |
| df["FlightDate"] = pd.to_datetime(df["FlightDate"]) | |
| df = df[df["FlightDate"].dt.month.isin([1, 2])].copy() | |
| agg = run_ood_pipeline(df, hub="DFW", airport_features=airport_features) | |
| agg.to_parquet(os.path.join(OOD_DIR, "ood_extreme_2015_janfeb.parquet"), index=False) | |
| result = evaluate("Extreme Event OOD — Jan/Feb 2015 blizzards", model, agg, feature_cols) | |
| # Show which pairs the model scored highest during blizzard months | |
| if len(agg) > 0: | |
| X = agg[feature_cols].astype(float) | |
| mask = X.notna().all(axis=1) | |
| agg_clean = agg[mask].copy() | |
| agg_clean["risk_score"] = model.predict_proba(X[mask])[:, 1] | |
| worst = ( | |
| agg_clean.groupby(["airport_A","airport_B"]) | |
| .agg(avg_risk=("risk_score","mean"), n=("risk_score","count"), | |
| actual_bad_rate=("target","mean")) | |
| .sort_values("avg_risk", ascending=False) | |
| .head(10) | |
| ) | |
| print("\nTop 10 riskiest pairs during Jan/Feb 2015 blizzards:") | |
| print(worst.to_string()) | |
| return result | |
| # --------------------------------------------------------------------------- | |
| # OOD Test 3: Carrier OOD — non-AA carriers at DFW (DL, UA, WN, B6) | |
| # --------------------------------------------------------------------------- | |
| def test_carrier_ood(model, airport_features, feature_cols): | |
| """ | |
| Raw files contain all carriers at DFW. Filter out AA → test whether | |
| airport-level weather risk generalises across carriers at same hub. | |
| """ | |
| print("\n[OOD 3] Carrier OOD — non-AA carriers at DFW (2022–2023)") | |
| cached = os.path.join(OOD_DIR, "ood_non_aa_dfw_2022_2023.parquet") | |
| if os.path.exists(cached): | |
| agg = pd.read_parquet(cached) | |
| else: | |
| frames = [] | |
| for year in [2022, 2023]: | |
| raw = os.path.join(RAW_DIR, f"bts_all_dfw_{year}.parquet") | |
| if not os.path.exists(raw): | |
| print(f" {raw} not found — skipping {year}"); continue | |
| chunk = pd.read_parquet(raw) | |
| chunk = chunk[chunk["Reporting_Airline"] != "AA"].copy() | |
| print(f" {year}: {len(chunk):,} non-AA rows ({chunk['Reporting_Airline'].nunique()} carriers)") | |
| frames.append(chunk) | |
| if not frames: | |
| print(" No data — skipping carrier OOD"); return {} | |
| df = pd.concat(frames, ignore_index=True) | |
| agg = run_ood_pipeline(df, hub="DFW", airport_features=airport_features) | |
| agg.to_parquet(cached, index=False) | |
| return evaluate("Carrier OOD — non-AA/DFW 2022–2023", model, agg, feature_cols) | |
| # --------------------------------------------------------------------------- | |
| # Summary plot | |
| # --------------------------------------------------------------------------- | |
| def plot_ood_summary(results: list[dict], in_dist: dict): | |
| valid = [r for r in results if r] | |
| if not valid: | |
| print("No OOD results to plot."); return | |
| all_res = [in_dist] + valid | |
| names = [r["name"] for r in all_res] | |
| auc_vals = [r["roc_auc"] for r in all_res] | |
| ap_vals = [r["avg_precision"] for r in all_res] | |
| colors = ["steelblue"] + ["coral"] * len(valid) | |
| x, w = np.arange(len(names)), 0.35 | |
| fig, ax = plt.subplots(figsize=(13, 5)) | |
| ax.bar(x - w/2, auc_vals, w, label="ROC-AUC", color=colors, alpha=0.85) | |
| ax.bar(x + w/2, ap_vals, w, label="Avg Precision", color=colors, alpha=0.5, hatch="//") | |
| ax.axhline(0.5, color="gray", linestyle="--", linewidth=0.8, label="Random baseline") | |
| ax.set_xticks(x) | |
| ax.set_xticklabels([n.replace(" — ","\n") for n in names], fontsize=8) | |
| ax.set_ylim(0, 1) | |
| ax.set_ylabel("Score") | |
| ax.set_title("In-Distribution vs OOD Model Performance\n(blue = in-dist, coral = OOD)") | |
| ax.legend() | |
| plt.tight_layout() | |
| out = os.path.join(PROC_DIR, "ood_comparison.png") | |
| plt.savefig(out, dpi=150) | |
| print(f"\nOOD comparison plot → {out}") | |
| # --------------------------------------------------------------------------- | |
| # Main | |
| # --------------------------------------------------------------------------- | |
| def main(): | |
| model = xgb.XGBClassifier() | |
| model.load_model(os.path.join(PROC_DIR, "xgb_model.json")) | |
| airport_features = pd.read_parquet(os.path.join(PROC_DIR, "airport_features.parquet")) | |
| # In-distribution: 2024 holdout (already aggregated) | |
| train_data = pd.read_parquet(os.path.join(PROC_DIR, "sequence_features.parquet")) | |
| val_data = train_data[train_data["Year"] == train_data["Year"].max()].copy() | |
| season_cols = [c for c in val_data.columns if c.startswith("season_")] | |
| feature_cols = [c for c in FEATURE_COLS + season_cols if c in val_data.columns] | |
| in_dist = evaluate("In-Distribution (2024 val)", model, val_data, feature_cols) | |
| results = [ | |
| test_temporal_ood(model, airport_features, feature_cols), | |
| test_extreme_event_ood(model, airport_features, feature_cols), | |
| test_carrier_ood(model, airport_features, feature_cols), | |
| ] | |
| plot_ood_summary(results, in_dist) | |
| summary = pd.DataFrame([in_dist] + [r for r in results if r]) | |
| out_csv = os.path.join(PROC_DIR, "ood_summary.csv") | |
| summary.to_csv(out_csv, index=False) | |
| print(f"Summary → {out_csv}") | |
| print(summary.to_string(index=False)) | |
| if __name__ == "__main__": | |
| main() | |