"""
Enhanced XGBoost model using enriched features:
  - Duty/fatigue features (from feature_engineering_duty.py)
  - Actual METAR weather for airports A, B, and DFW hub (from enrich_features.py)
  - Time-of-day signals (afternoon TS window, morning arrivals)

Run after:
  python src/feature_engineering.py
  python src/feature_engineering_duty.py
  python src/enrich_features.py
"""

import os

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import (
    classification_report,
    roc_auc_score,
    average_precision_score,
)
import matplotlib

# Select the non-interactive backend before pyplot is imported so the script
# can render plots to disk without a display.
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from matplotlib.patches import Patch

PROCESSED_DIR = os.path.join(os.path.dirname(__file__), "..", "data", "processed")

# Base features (same as model.py)
BASE_FEATURES = [
    "A_weather_delay_rate",
    "A_weather_cancel_rate",
    "A_avg_weather_delay_min",
    "A_p75_weather_delay_min",
    "A_p95_weather_delay_min",
    "A_nas_delay_rate",
    "A_overall_weather_delay_rate",
    "A_overall_avg_weather_delay_min",
    "B_weather_delay_rate",
    "B_weather_cancel_rate",
    "B_avg_weather_delay_min",
    "B_p75_weather_delay_min",
    "B_p95_weather_delay_min",
    "B_nas_delay_rate",
    "B_overall_weather_delay_rate",
    "B_overall_avg_weather_delay_min",
    "pair_combined_weather_rate",
    "pair_max_weather_rate",
    "pair_min_weather_rate",
    "pair_weather_rate_sum",
    "pair_avg_weather_delay_min",
    "both_high_risk",
    "Month",
    "is_spring_summer",
    "median_turnaround_min",
]

# Duty features (from feature_engineering_duty.py)
DUTY_FEATURES = [
    "A_dep_hour_median",
    "A_late_dep_rate",
    "A_early_dep_rate",
    "A_avg_block_min",
    "A_late_aircraft_delay_rate",
    "A_avg_late_aircraft_min",
    "B_dep_hour_median",
    "B_late_dep_rate",
    "B_early_dep_rate",
    "B_avg_block_min",
    "B_late_aircraft_delay_rate",
    "B_avg_late_aircraft_min",
    "tight_connection_rate",
    "very_tight_rate",
    "cascade_risk",
    "total_duty_block_min",
    "duty_overrun_risk",
    "late_dep_sequence",
]

# Actual METAR weather features (from enrich_features.py)
METAR_FEATURES = [
    # DFW hub weather (all sequences pass through)
    "dfw_ts_rate",
    "dfw_fog_rate",
    "dfw_snow_rate",
    "dfw_low_ceil_rate",
    "dfw_avg_severity",
    "dfw_avg_wind_kt",
    # Airport A observed weather
    "A_wx_ts_rate",
    "A_wx_fog_rate",
    "A_wx_snow_rate",
    "A_wx_avg_severity",
    "A_wx_avg_wind_kt",
    # Airport B observed weather
    "B_wx_ts_rate",
    "B_wx_fog_rate",
    "B_wx_snow_rate",
    "B_wx_avg_severity",
    "B_wx_avg_wind_kt",
    # Pair-level METAR derived
    "pair_wx_ts_rate",
    "pair_wx_severity",
    "dfw_x_A_ts",
    "dfw_x_B_ts",
]

# Time-of-day features (from enrich_features.py)
TOD_FEATURES = [
    "B_afternoon_dep_rate",  # DFW→B departures in TS peak window (14-19h)
    "A_morning_arr_rate",    # A→DFW arrivals in morning (06-12h)
]


def load_features() -> tuple[pd.DataFrame, list[str]]:
    """Load the enhanced feature table written by enrich_features.py.

    Returns:
        A ``(df, season_cols)`` pair where ``season_cols`` lists every column
        whose name starts with ``"season_"``.

    Raises:
        FileNotFoundError: If ``sequence_features_enhanced.parquet`` is not
            present in ``PROCESSED_DIR``.
    """
    path = os.path.join(PROCESSED_DIR, "sequence_features_enhanced.parquet")
    if not os.path.exists(path):
        raise FileNotFoundError(f"Run enrich_features.py first.\nExpected: {path}")
    df = pd.read_parquet(path)
    season_cols = [c for c in df.columns if c.startswith("season_")]
    print(f"Loaded enhanced features: {df.shape}")
    return df, season_cols


def build_feature_cols(df: pd.DataFrame, season_cols: list[str]) -> list[str]:
    """Return the candidate feature names that actually exist in ``df``.

    Candidates are the base, duty, METAR and time-of-day lists followed by
    ``season_cols``, in that order. Candidates absent from ``df`` are reported
    on stdout and skipped rather than treated as an error.
    """
    all_candidates = (
        BASE_FEATURES + DUTY_FEATURES + METAR_FEATURES + TOD_FEATURES + season_cols
    )
    present = [c for c in all_candidates if c in df.columns]
    missing = [c for c in all_candidates if c not in df.columns]
    if missing:
        print(f"  Missing features (skipped): {missing}")
    print(f"  Using {len(present)} features total")
    return present


def train(
    df: pd.DataFrame,
    feature_cols: list[str],
    device: str = "cuda",
) -> tuple[xgb.XGBClassifier, pd.DataFrame]:
    """Fit the enhanced classifier with a latest-year hold-out.

    Rows whose ``Year`` is below the maximum form the training split; rows in
    the maximum ``Year`` form the validation split, which is used both for
    early stopping and for the printed metrics.

    Args:
        df: Feature table containing ``feature_cols`` plus ``target`` and
            ``Year`` columns.
        feature_cols: Names of the model input columns.
        device: Value passed to ``XGBClassifier(device=...)``. Defaults to
            ``"cuda"``; pass ``"cpu"`` on hosts without a GPU.

    Returns:
        ``(model, df_filled)`` where ``df_filled`` is a copy of ``df`` with
        NaNs in ``feature_cols`` imputed, suitable for re-scoring.

    Raises:
        ValueError: If the training split is empty (only one distinct
            ``Year``) or contains no positive examples.
    """
    df = df.copy()

    latest_year = df["Year"].max()
    train_mask = df["Year"] < latest_year
    val_mask = df["Year"] == latest_year
    if not train_mask.any():
        raise ValueError(
            "Training split is empty: need at least two distinct 'Year' values "
            "to hold out the latest year for validation."
        )

    # Fill NaN in new features with the column median (airports without METAR
    # data). The median comes from training rows only so that validation-year
    # statistics do not leak into the fitted model.
    for col in feature_cols:
        if df[col].isna().any():
            df[col] = df[col].fillna(df.loc[train_mask, col].median())

    X = df[feature_cols].astype(float)
    y = df["target"].astype(int)

    X_train, y_train = X[train_mask], y[train_mask]
    X_val, y_val = X[val_mask], y[val_mask]

    # Class weighting is derived from the labels the model is actually fitted
    # on, not from the held-out year.
    neg, pos = int((y_train == 0).sum()), int((y_train == 1).sum())
    if pos == 0:
        raise ValueError(
            "Training split has no positive examples; cannot compute "
            "scale_pos_weight."
        )
    scale_pos_weight = neg / pos
    print(f"Class balance — neg: {neg:,}, pos: {pos:,}, spw: {scale_pos_weight:.2f}")

    model = xgb.XGBClassifier(
        n_estimators=500,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=scale_pos_weight,
        eval_metric="aucpr",
        early_stopping_rounds=30,
        random_state=42,
        n_jobs=-1,
        device=device,
        tree_method="hist",
    )

    print(f"Train: {len(X_train):,} rows | Val: {len(X_val):,} rows")

    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=50)

    y_prob = model.predict_proba(X_val)[:, 1]
    y_pred = model.predict(X_val)

    print("\n--- Enhanced Model Validation ---")
    print(f"ROC-AUC:           {roc_auc_score(y_val, y_prob):.4f}")
    print(f"Average Precision: {average_precision_score(y_val, y_prob):.4f}")
    print(classification_report(y_val, y_pred, target_names=["low_risk", "high_risk"]))

    return model, df  # return df with filled NaN for scoring


def _importance_color(feature: str) -> str:
    """Map a feature name to the bar colour of its feature family."""
    if feature in METAR_FEATURES or feature.startswith("dfw_"):
        return "coral"
    if feature in DUTY_FEATURES:
        return "mediumseagreen"
    if feature in TOD_FEATURES:
        return "mediumpurple"
    return "steelblue"


def plot_feature_importance(model: xgb.XGBClassifier, feature_cols: list[str]) -> None:
    """Save a horizontal bar chart of the 30 largest feature importances.

    Bars are coloured by feature family and the figure is written to
    ``feature_importance_enhanced.png`` in ``PROCESSED_DIR``.
    """
    feat_df = pd.DataFrame({
        "feature": feature_cols,
        "importance": model.feature_importances_,
    }).sort_values("importance", ascending=True).tail(30)

    fig, ax = plt.subplots(figsize=(10, 10))
    colors = [_importance_color(f) for f in feat_df["feature"]]

    ax.barh(feat_df["feature"], feat_df["importance"], color=colors)

    legend_elements = [
        Patch(facecolor="steelblue", label="Base BTS features"),
        Patch(facecolor="mediumseagreen", label="Duty/fatigue features"),
        Patch(facecolor="coral", label="Actual METAR weather"),
        Patch(facecolor="mediumpurple", label="Time-of-day features"),
    ]
    ax.legend(handles=legend_elements, loc="lower right", fontsize=9)
    ax.set_xlabel("Feature Importance (gain)")
    ax.set_title("Top 30 Feature Importances — Enhanced Model\n"
                 "Base + Duty + METAR + Time-of-Day")
    fig.tight_layout()

    out = os.path.join(PROCESSED_DIR, "feature_importance_enhanced.png")
    fig.savefig(out, dpi=150)
    plt.close(fig)
    print(f"Feature importance plot saved → {out}")


def main() -> None:
    """Train the enhanced model, persist it, plot importances, and score pairs.

    Risk scores are computed for every row of the feature table, so scores
    for rows in the training years are in-sample predictions.
    """
    df, season_cols = load_features()
    feature_cols = build_feature_cols(df, season_cols)

    model, df_filled = train(df, feature_cols)

    model_path = os.path.join(PROCESSED_DIR, "xgb_model_enhanced.json")
    model.save_model(model_path)
    print(f"Model saved → {model_path}")

    plot_feature_importance(model, feature_cols)

    # Re-score all pairs with enhanced model. train() already returned a
    # private copy, so the score column can be added to it directly.
    X_all = df_filled[feature_cols].astype(float)
    df_filled["risk_score"] = model.predict_proba(X_all)[:, 1]

    pair_scores = (
        df_filled.groupby(["airport_A", "airport_B", "Month"])
        .agg(
            avg_risk_score=("risk_score", "mean"),
            max_risk_score=("risk_score", "max"),
            n_sequences=("n_sequences", "sum"),
            observed_bad_rate=("observed_bad_rate", "mean"),
        )
        .reset_index()
        .sort_values("avg_risk_score", ascending=False)
    )

    out = os.path.join(PROCESSED_DIR, "pair_risk_scores_enhanced.parquet")
    pair_scores.to_parquet(out, index=False)
    print(f"\nEnhanced pair risk scores saved → {out}")

    print("\nTop 20 riskiest pairs (enhanced model):")
    print(pair_scores.head(20).to_string(index=False))


if __name__ == "__main__":
    main()