"""
AA Crew Sequence Risk — Interactive Dashboard
=============================================
Run with:
conda run -n aadata streamlit run app/app.py
"""
from __future__ import annotations
import os
import sys
from datetime import datetime
import numpy as np
import pandas as pd
import streamlit as st
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
# Global chart theme: transparent backgrounds so the charts read correctly in
# both dark and light mode; grid and zero lines are a faint neutral grey.
_aa_axis_style = {
    "gridcolor": "rgba(128,128,128,0.15)",
    "zerolinecolor": "rgba(128,128,128,0.3)",
}
_aa_layout = go.Layout(
    plot_bgcolor="rgba(0,0,0,0)",
    paper_bgcolor="rgba(0,0,0,0)",
    xaxis=dict(_aa_axis_style),
    yaxis=dict(_aa_axis_style),
)
pio.templates["aa_theme"] = go.layout.Template(layout=_aa_layout)
# Layer the custom theme on top of the stock "plotly" template.
pio.templates.default = "plotly+aa_theme"
# Allow imports from project root
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
from app.predictor import RiskPredictor, build_features_df, FEATURE_LABELS
from app import airports as ap_meta
from app import live_flights as lf
from app import optimizer as opt
# Data directories live next to the app package: <this file's dir>/../data/{processed,raw}
_data_root = os.path.join(os.path.dirname(__file__), "..", "data")
PROCESSED = os.path.abspath(os.path.join(_data_root, "processed"))
RAW = os.path.abspath(os.path.join(_data_root, "raw"))

# Page-level Streamlit configuration.
_page_config = {
    "page_title": "AA DFW Crew Risk",
    "page_icon": "✈️",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_page_config)
# ── CSS ──────────────────────────────────────────────────────────────────────
# Injects raw markup into the page (unsafe_allow_html=True lets Streamlit render it).
# NOTE(review): the injected string is just a newline in this view, so no style
# rules are defined here — confirm whether a <style> payload is expected.
st.markdown("""
""", unsafe_allow_html=True)
def tip(label: str, tooltip: str) -> str:
    """Return an HTML snippet showing *label* with a hover tooltip.

    The snippet is meant to be rendered with
    ``st.markdown(..., unsafe_allow_html=True)``.

    Args:
        label: Visible text. Inserted as-is, so it may itself contain markup.
        tooltip: Text shown on hover. HTML-escaped (including quotes) because
            it is placed inside an attribute value.

    Returns:
        A ``<span>`` whose ``title`` attribute carries the tooltip and whose
        content is the label followed by an info glyph.
    """
    import html  # local import keeps the block self-contained

    safe_tooltip = html.escape(tooltip, quote=True)
    return (
        f'<span title="{safe_tooltip}">'
        f'{label} ℹ</span>'
    )
# ── Data loading (cached) ────────────────────────────────────────────────────
@st.cache_resource(show_spinner="Loading model & features...")
def get_predictor() -> RiskPredictor:
    """Build the feature frame and wrap it in a ``RiskPredictor``.

    Decorated with ``st.cache_resource`` so the predictor is built once and reused.
    """
    return RiskPredictor(build_features_df())
@st.cache_data(show_spinner="Loading risk scores...")
def get_pair_scores() -> pd.DataFrame:
    """Read ``pair_risk_scores.parquet`` from the processed-data directory."""
    scores_path = os.path.join(PROCESSED, "pair_risk_scores.parquet")
    return pd.read_parquet(scores_path)
@st.cache_data(show_spinner="Loading 2024 schedule data...")
def get_bts_2024() -> pd.DataFrame:
    """Load the 2024 BTS DFW flight table, cleaned for schedule use.

    Returns an empty DataFrame when the parquet file is absent. Otherwise rows
    with ``Cancelled == 1`` are removed, the scheduled departure/arrival time
    columns are coerced to numeric (unparseable values become NaN), and missing
    departure/arrival delay minutes are filled with 0.
    """
    parquet_path = os.path.join(RAW, "bts_all_dfw_2024.parquet")
    if os.path.exists(parquet_path):
        flights = pd.read_parquet(parquet_path)
        flights = flights[flights["Cancelled"] != 1].copy()
        for time_col in ("CRSDepTime", "CRSArrTime"):
            flights[time_col] = pd.to_numeric(flights[time_col], errors="coerce")
        for delay_col in ("DepDelayMinutes", "ArrDelayMinutes"):
            flights[delay_col] = flights[delay_col].fillna(0)
        return flights
    return pd.DataFrame()
@st.cache_data
def get_map_group(month: int, role: str) -> pd.DataFrame:
    """Cached per-month airport risk aggregation for the map tab.

    Args:
        month: Value matched against the ``Month`` column of the pair scores.
        role: ``"origin"`` aggregates by ``airport_A``; any other value
            aggregates by ``airport_B``.

    Returns:
        One row per airport with columns ``airport``, ``avg_risk`` (mean of
        ``avg_risk_score``), ``n_pairs`` (count of partner entries) and
        ``worst_partner`` (partner airport of the highest-scoring pair; NaN when
        the airport has no non-null score in that month).
    """
    scores = get_pair_scores()
    ms = scores[scores["Month"] == month]

    # The two roles are mirror images: only the grouping/partner columns swap.
    if role == "origin":
        key_col, partner_col = "airport_A", "airport_B"
    else:
        key_col, partner_col = "airport_B", "airport_A"

    grp = (ms.groupby(key_col)
             .agg(avg_risk=("avg_risk_score", "mean"), n_pairs=(partner_col, "count"))
             .reset_index()
             .rename(columns={key_col: "airport"}))

    # idxmax over a group whose scores are all NaN yields NaN (or raises on
    # newer pandas), which then breaks the .loc lookup. Dropping null scores
    # first keeps those airports in `grp`; the left merge below simply leaves
    # their worst_partner empty.
    scored = ms.dropna(subset=["avg_risk_score"])
    worst_idx = scored.groupby(key_col)["avg_risk_score"].idxmax()
    wp = (scored.loc[worst_idx, [key_col, partner_col]]
                .rename(columns={key_col: "airport", partner_col: "worst_partner"}))

    return grp.merge(wp, on="airport", how="left")
@st.cache_data
def get_scores_indexed() -> pd.DataFrame:
    """Pair scores indexed by (airport_A, airport_B, Month).

    Cached so the ``set_index`` is not repeated on every Streamlit rerender.
    """
    index_cols = ["airport_A", "airport_B", "Month"]
    pair_scores = get_pair_scores()
    return pair_scores.set_index(index_cols)
@st.cache_data(show_spinner=False)
def get_eval_data() -> dict:
    """Compute PR/ROC curves and calibration from pair_risk_scores (pair-level aggregation).

    Rows missing either ``avg_risk_score`` or ``observed_bad_rate`` are dropped.
    The binary target is ``observed_bad_rate > 0.25`` and the predictor is
    ``avg_risk_score``.

    Returns:
        A dict with keys ``fpr``/``tpr`` (ROC curve), ``prec``/``rec``
        (precision-recall curve), ``auc`` and ``ap`` (floats), ``cal`` (one row
        per score quantile bucket with ``mean_score``, ``mean_obs``, ``n``) and
        ``scores`` (the filtered score frame).
    """
    from sklearn.metrics import roc_curve, precision_recall_curve, roc_auc_score, average_precision_score

    _s = get_pair_scores().dropna(subset=["avg_risk_score", "observed_bad_rate"])
    _y = (_s["observed_bad_rate"] > 0.25).astype(int)
    _p = _s["avg_risk_score"]

    _fpr, _tpr, _ = roc_curve(_y, _p)
    _prec, _rec, _ = precision_recall_curve(_y, _p)

    # Calibration: decile buckets of model score vs observed bad rate.
    # duplicates="drop" merges buckets whose quantile edges coincide (heavily
    # tied scores) instead of raising; with distinct edges the result is the
    # same ten deciles as before.
    _deciles = pd.qcut(_p, 10, labels=False, duplicates="drop")
    _cal = (_s.assign(decile=_deciles)
              .groupby("decile")
              .agg(mean_score=("avg_risk_score", "mean"),
                   mean_obs=("observed_bad_rate", "mean"),
                   n=("avg_risk_score", "count"))
              .reset_index())

    return {
        "fpr": _fpr, "tpr": _tpr,
        "prec": _prec, "rec": _rec,
        "auc": float(roc_auc_score(_y, _p)),
        "ap": float(average_precision_score(_y, _p)),
        "cal": _cal,
        "scores": _s,
    }
@st.cache_data(show_spinner=False)
def get_feature_importance_df() -> pd.DataFrame:
    """Load the XGBoost model and extract feature importances with group labels.

    Returns:
        A DataFrame sorted by descending importance with columns ``feature``
        (raw name), ``importance`` (the model's ``feature_importances_``),
        ``label`` (display name from ``FEATURE_LABELS``, falling back to the raw
        name), ``group`` (coarse feature family) and ``rank`` (1-based).
    """
    import xgboost as _xgb

    def _group(f: str) -> str:
        """Map a raw feature name to its feature-family label by name pattern."""
        if f.startswith(("A_weather", "A_overall", "A_nas_")):
            return "Origin BTS"
        if f.startswith(("B_weather", "B_overall", "B_nas_")):
            return "Dest BTS"
        # The explicit Pair GSOM prefixes must be tested before the generic
        # "pair_" rule: otherwise a pair_max_extreme*/pair_max_total* name that
        # happens not to contain "wind"/"precip" is swallowed by Pair BTS and
        # this branch can never claim it.
        if f.startswith(("pair_max_avg_wind", "pair_max_precip", "pair_max_extreme",
                         "pair_max_total", "pair_max_max_wind")):
            return "Pair GSOM"
        if f.startswith("pair_") and "cascade" not in f and "wind" not in f and "precip" not in f:
            return "Pair BTS"
        if f in ("Month", "is_spring_summer", "median_turnaround_min") or f.startswith("season_"):
            return "Temporal"
        if f.startswith(("A_avg_wind", "A_precip", "A_extreme", "A_total_precip", "A_max_wind")):
            return "Origin GSOM"
        if f.startswith(("B_avg_wind", "B_precip", "B_extreme", "B_total_precip", "B_max_wind")):
            return "Dest GSOM"
        if f.startswith("DFW_"):
            return "DFW Hub"
        if f.startswith("tc_"):
            return "Tail-Chain / Duty"
        if f.startswith(("A_ap_", "B_ap_", "pair_cascade")):
            return "Airport Cascade"
        if f.startswith("mhc_"):
            return "Multi-Hop Cascade"
        return "Other"

    _m = _xgb.XGBClassifier()
    _m.load_model(os.path.join(PROCESSED, "xgb_model.json"))
    _fi = _m.feature_importances_
    _fnames = _m.get_booster().feature_names
    if _fnames is None:
        # A booster saved without feature names reports None; fall back to
        # positional names so the table can still be built.
        _fnames = [f"f{i}" for i in range(len(_fi))]

    _df = pd.DataFrame({
        "feature": _fnames,
        "importance": _fi,
        "label": [FEATURE_LABELS.get(f, f) for f in _fnames],
        "group": [_group(f) for f in _fnames],
    }).sort_values("importance", ascending=False).reset_index(drop=True)
    _df["rank"] = _df.index + 1
    return _df
@st.cache_data
def get_airport_df(codes: tuple) -> pd.DataFrame:
    """Build the airport metadata frame for *codes* via ``ap_meta.build_airport_df``.

    The codes are handed to the builder as a list.
    """
    code_list = [*codes]
    return ap_meta.build_airport_df(code_list)
# ── Helpers ──────────────────────────────────────────────────────────────────
def risk_badge(label: str) -> str:
    """Return an HTML badge for a risk label.

    Args:
        label: One of ``"HIGH RISK"``, ``"MODERATE RISK"`` or ``"LOW RISK"``.
            Any other value is styled with the low-risk class.

    Returns:
        A ``<span>`` carrying the matching ``risk-badge-*`` CSS class and the
        label as its text; render it with ``unsafe_allow_html=True``.
    """
    cls = {
        "HIGH RISK": "risk-badge-high",
        "MODERATE RISK": "risk-badge-moderate",
        "LOW RISK": "risk-badge-low",
    }.get(label, "risk-badge-low")
    # The class was previously looked up but never emitted, so every badge
    # rendered as plain, unstyled text.
    return f'<span class="{cls}">{label}</span>'
HIGH_THRESHOLD = 0.30  # calibrated: ≥30% of sequences historically disrupted
MOD_THRESHOLD = 0.20   # calibrated: ≥20%
_COLOR_CAP = 0.50      # calibrated scores rarely exceed 50%; maps to full red


def score_to_color(score: float) -> str:
    """Map a risk score to an ``rgb(r,g,b)`` string on a green→yellow→red ramp.

    The score is scaled by ``_COLOR_CAP`` and clamped to [0, 1]. The lower half
    of that range blends green (44,160,44) into yellow (255,200,0); the upper
    half blends yellow into red (214,39,40). Channels are truncated to ints.
    """
    green, yellow, red = (44, 160, 44), (255, 200, 0), (214, 39, 40)

    frac = max(0.0, min(1.0, score / _COLOR_CAP))
    if frac > 0.5:
        start, end = yellow, red
        blend = (frac - 0.5) * 2   # 0→1 across the upper half
    else:
        start, end = green, yellow
        blend = frac * 2           # 0→1 across the lower half

    r, g, b = (int(lo + blend * (hi - lo)) for lo, hi in zip(start, end))
    return f"rgb({r},{g},{b})"
def gauge_chart(risk_score: float, title: str = "Risk Score") -> go.Figure:
    """Build a gauge indicator for a single risk score.

    Args:
        risk_score: Score as a fraction; it is displayed multiplied by 100 with
            a ``%`` suffix on a 0–100 axis.
        title: Heading shown above the risk-band label.

    Returns:
        A Plotly figure whose bar colour comes from ``score_to_color`` and whose
        background bands and band label follow ``MOD_THRESHOLD`` and
        ``HIGH_THRESHOLD``.
    """
    if risk_score >= HIGH_THRESHOLD:
        label = "HIGH RISK"
    elif risk_score >= MOD_THRESHOLD:
        label = "MODERATE RISK"
    else:
        label = "LOW RISK"

    pct = risk_score * 100
    # Band edges are derived from the same constants that pick the label, so
    # the shaded bands cannot drift away from the classification.
    mod_pct = MOD_THRESHOLD * 100
    high_pct = HIGH_THRESHOLD * 100

    fig = go.Figure(go.Indicator(
        mode="gauge+number+delta",
        value=pct,
        number={"suffix": "%", "font": {"size": 36}},
        # Plotly text uses <br> for line breaks; a raw newline inside the
        # literal is not valid Python and would not render as a break anyway.
        title={"text": f"{title}<br>{label}",
               "font": {"size": 16}},
        gauge={
            "axis": {"range": [0, 100], "tickwidth": 1},
            "bar": {"color": score_to_color(risk_score), "thickness": 0.35},
            "steps": [
                {"range": [0, mod_pct], "color": "rgba(44,160,44,0.15)"},
                {"range": [mod_pct, high_pct], "color": "rgba(255,127,14,0.15)"},
                {"range": [high_pct, 100], "color": "rgba(214,39,40,0.15)"},
            ],
            "threshold": {
                "line": {"color": "rgba(150,150,150,0.8)", "width": 3},
                "thickness": 0.75,
                "value": pct,
            },
        },
    ))
    fig.update_layout(height=280, margin=dict(t=60, b=20, l=30, r=30))
    return fig
def shap_bar_chart(shap_df: pd.DataFrame) -> go.Figure:
    """Horizontal bar chart of SHAP contributions, one bar per feature.

    Args:
        shap_df: Frame with columns ``shap_value``, ``label`` and
            ``feature_value``; an optional ``imputed`` column flags rows whose
            value was imputed.

    Returns:
        A Plotly figure sorted by SHAP value. Positive bars are red, the rest
        green; imputed rows get an amber outline and a note in their hover text.
    """
    shap_df = shap_df.sort_values("shap_value")

    # Truthiness of the optional flag, evaluated once per row (single pass
    # instead of two iterrows() loops).
    if "imputed" in shap_df.columns:
        imputed_flags = [bool(flag) for flag in shap_df["imputed"]]
    else:
        imputed_flags = [False] * len(shap_df)

    colors = ["#d62728" if v > 0 else "#2ca02c" for v in shap_df["shap_value"]]

    # Imputed features get an amber outline to signal "estimated, not measured".
    line_widths = [2 if flag else 0 for flag in imputed_flags]
    line_colors = ["rgba(180,130,0,0.9)" if flag else "rgba(0,0,0,0)" for flag in imputed_flags]

    # Plotly hover text uses <br> for line breaks.
    base_hover = "%{y}<br>SHAP: %{x:.4f}<br>Value: %{customdata:.3f}"
    hover = [
        base_hover + (" (★ month-median imputed)" if flag else "")
        for flag in imputed_flags
    ]

    fig = go.Figure(go.Bar(
        x=shap_df["shap_value"],
        y=shap_df["label"],
        orientation="h",
        marker=dict(
            color=colors,
            line=dict(width=line_widths, color=line_colors),
        ),
        text=[f"{v:+.3f}" for v in shap_df["shap_value"]],
        textposition="outside",
        # hovertemplate accepts one template per bar. Passing the list keeps
        # the imputed note on exactly the imputed bars instead of discarding
        # all detail whenever the rows are mixed.
        hovertemplate=hover or base_hover,
        customdata=shap_df["feature_value"],
    ))
    fig.update_layout(
        title="Feature Contributions (SHAP Values)<br>"
              "Red = increases risk | Green = decreases risk | Amber border = GSOM median imputed",
        xaxis_title="SHAP Value (impact on model output)",
        height=max(350, len(shap_df) * 28),
        margin=dict(l=10, r=80, t=60, b=40),
        plot_bgcolor="rgba(0,0,0,0)",
        xaxis=dict(zeroline=True, zerolinewidth=1.5, zerolinecolor="#888"),
    )
    return fig
# ── Sidebar ──────────────────────────────────────────────────────────────────
with st.sidebar:
    # Branding and headline model facts.
    _logo_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/a/a4/American_Airlines_logo_2013.svg/320px-American_Airlines_logo_2013.svg.png"
    st.image(_logo_url, width=180)
    st.title("AA DFW Crew Risk")
    st.caption("Weather-driven crew sequence risk scoring for A→DFW→B routes")
    st.divider()

    _model_summary = "**Model:** XGBoost v3 + Isotonic Calibration \n**High risk:** ≥30% disruption rate \n**Val AUC:** 0.825 \n**Val AP:** 0.445"
    st.markdown(_model_summary)
    st.divider()

    # Optional live-schedule API key; an empty value means the BTS 2024 analog is used.
    st.subheader("Live Schedule API")
    aviationstack_key = st.text_input(
        "AviationStack API Key",
        type="password",
        placeholder="Paste key for live AA schedule...",
        help="Free tier at aviationstack.com — 100 req/month. Leave blank to use BTS 2024 analog.",
    )
    if not aviationstack_key:
        st.info("No key → BTS 2024 analog used")
    else:
        st.success("Live API key set")
    st.divider()

    # Default the toggle from the configured theme; fall back to dark if the
    # option cannot be read.
    try:
        _theme_base = st.get_option("theme.base")
    except Exception:
        _default_dark = True
    else:
        _default_dark = _theme_base != "light"
    dark_mode = st.toggle("🌙 Dark mode", value=_default_dark, key="global_dark")
    st.divider()
    st.caption("Data: BTS 2015–2024 · GSOM · FAA Part 117")
# ── Tab layout ───────────────────────────────────────────────────────────────
# Tab labels in display order; st.tabs returns one container per label.
_tab_labels = [
    "📋 Methodology",
    "📊 Risk Dashboard",
    "🛫 DFW Schedule",
    "⚡ Sequence Optimizer",
    "🔍 Pair Risk Query",
    "🗺️ Airport Risk Map",
]
(tab_overview, tab_dash, tab_sched,
 tab_optim, tab_query, tab_map) = st.tabs(_tab_labels)
# ═══════════════════════════════════════════════════════════════════════════
# TAB 0: METHODOLOGY
# ═══════════════════════════════════════════════════════════════════════════
with tab_overview:
st.header("📋 Methodology & Technical Model Report")
st.caption("A full technical account of the data pipeline, feature engineering, model specification, and evaluation.")
# ── Top model card ────────────────────────────────────────────────────────
_mc = st.columns(6)
for _col, (_lbl, _val) in zip(_mc, [
("Algorithm", "XGBoost v3"),
("Val AUC", "0.825"),
("Val AP", "0.445"),
("Features", "70"),
("Train rows", "~398k"),
("Val split", "Time-based"),
]):
_col.markdown(
f'
',
unsafe_allow_html=True,
)
st.markdown("
", unsafe_allow_html=True)
# ── Pipeline Sankey ───────────────────────────────────────────────────────
# Node indices:
# 0 BTS 1 GSOM 2 Tail-Chain 3 Feature Eng
# 4 XGBoost 5 Risk Scores 6 SHAP 7 Optimizer 8 Dashboard
_sk_font_color = "rgba(240,240,240,0.95)" if dark_mode else "rgba(20,20,20,0.9)"
_sk_node_line = "rgba(255,255,255,0.2)" if dark_mode else "rgba(0,0,0,0.15)"
_link_alpha = "0.30" if dark_mode else "0.22"
fig_sankey = go.Figure(go.Sankey(
arrangement="fixed",
node=dict(
label=["BTS 2015–2024", "GSOM Weather", "Tail-Chain",
"Feature Engineering", "XGBoost v3", "Pair Risk Scores",
"SHAP", "Sequence Optimizer", "Dashboard"],
x=[0.01, 0.01, 0.01, 0.36, 0.60, 0.82, 0.82, 0.999, 0.999],
y=[0.10, 0.46, 0.82, 0.44, 0.44, 0.13, 0.77, 0.22, 0.78],
color=["#005EB8","#1a7a4a","#8B4513","#7B2D8B","#C41E3A",
"#2ca02c","#ff7f0e","#555555","#005EB8"],
pad=22, thickness=22,
line=dict(color=_sk_node_line, width=0.8),
hovertemplate="%{label}",
),
link=dict(
source=[0, 1, 2, 3, 4, 4, 5, 5],
target=[3, 3, 3, 4, 5, 6, 7, 8],
value= [45,20,25, 90, 55,35, 28,28],
color=[f"rgba(0,94,184,{_link_alpha})", f"rgba(26,122,74,{_link_alpha})",
f"rgba(139,69,19,{_link_alpha})", f"rgba(123,45,139,{_link_alpha})",
f"rgba(196,30,58,{_link_alpha})", f"rgba(196,30,58,{_link_alpha})",
f"rgba(44,160,44,{_link_alpha})", f"rgba(44,160,44,{_link_alpha})"],
hovertemplate="%{source.label} → %{target.label}",
),
))
fig_sankey.update_layout(
title="End-to-End Data & Model Pipeline",
height=420, margin=dict(t=50, b=20, l=10, r=10),
font=dict(size=12, color=_sk_font_color),
)
st.plotly_chart(fig_sankey, width='stretch')
st.markdown("
", unsafe_allow_html=True)
# ── Section 1: Problem Formulation ───────────────────────────────────────
with st.expander("**1 · Problem Formulation**", expanded=True):
# ── Notation key ──────────────────────────────────────────────────
with st.container():
st.markdown("**Notation used throughout this report:**")
_not_cols = st.columns(3)
with _not_cols[0]:
st.markdown(r"""
| Symbol | Meaning |
|--------|---------|
| $A, B$ | IATA airport codes (non-DFW origin / destination) |
| $m$ | Calendar month (1–12) |
| $\mathcal{S}_{A,B,m}$ | Set of all observed A→DFW→B sequences in month $m$ |
| $\Delta_s$ | Weather + NAS delay of sequence $s$ (minutes) |
""")
with _not_cols[1]:
st.markdown(r"""
| Symbol | Meaning |
|--------|---------|
| $\mathbf{x}_{A,B,m} \in \mathbb{R}^{70}$ | Feature vector for pair-month cell |
| $y_{A,B,m} \in \{0,1\}$ | Binary label (disrupted = 1) |
| $\hat{p}_{A,B,m}$ | Calibrated model risk score ∈ [0, 1] |
| $\hat{y}_i^{(K)}$ | Raw XGBoost log-odds output after $K$ trees |
""")
with _not_cols[2]:
st.markdown(r"""
| Symbol | Meaning |
|--------|---------|
| $g_i, h_i$ | First- and second-order gradients of the loss |
| $T_k$ | Number of leaves in tree $k$ |
| $\mathbf{w}_k$ | Leaf weight vector for tree $k$ |
| AUC | Area Under the ROC Curve (ranking quality) |
| AP | Average Precision (precision-recall summary) |
| SHAP | SHapley Additive exPlanations (feature attribution) |
""")
st.divider()
_pf1, _pf2 = st.columns([3, 2])
with _pf1:
st.markdown("""
American Airlines operates **~900 daily flights** through Dallas/Fort Worth (DFW).
A crew sequence is the atom of scheduling: a pilot or flight attendant arrives on
an inbound flight from airport **A**, turns at DFW, then departs on an outbound
to airport **B**. Weather disruptions at A, DFW, or B shatter the day's roster —
triggering FAA Part 117 rest violations, repositioning costs, and cascading cancellations.
**Formal task.** Given the triplet (airport_A, airport_B, month), predict whether the
sequence A → DFW → B is *systematically disrupted* — i.e., whether its historical
weather disruption rate exceeds a material threshold.
""")
st.markdown("**Observed disruption rate for a pair-month cell:**")
st.latex(r"""
\text{bad\_rate}(A,\,B,\,m) =
\frac{\bigl|\bigl\{s \in \mathcal{S}_{A,B,m}
\;:\; \Delta_s \geq 15\,\text{min}
\;\lor\; \mathrm{cancel}(s)\bigr\}\bigr|}
{|\mathcal{S}_{A,B,m}|}
""")
st.markdown("**Binary label and classification target:**")
st.latex(r"""
y_{A,B,m} = \begin{cases}
1 & \text{if } \text{bad\_rate}(A,B,m) > 0.25 \\
0 & \text{otherwise}
\end{cases}
""")
st.markdown("**Model output:**")
st.latex(r"""
\hat{p}_{A,B,m} = P\!\left(y=1 \;\middle|\; \mathbf{x}_{A,B,m}\right) \in [0,\,1],
\quad \mathbf{x} \in \mathbb{R}^{70}
""")
with _pf2:
st.markdown("**The model is used in two distinct modes:**")
st.markdown("""
| Mode | Usage |
|---|---|
| **Pair scoring** | Absolute risk gauge for any A→DFW→B pair in any month |
| **Cost matrix** | Relative ranking as input to the Hungarian-algorithm optimizer |
**Threshold rationale: 0.25**
The 0.25 bad-rate threshold was chosen after examining the full distribution of
pair-month disruption rates on tail-matched sequences. A 50% threshold would create a
spurious 50/50 split with no operational meaning; 0.25 captures corridor-months where
more than 1-in-4 actual crew rotations experienced weather disruption — a materially
elevated frequency. This yields an **11.4% positive rate** (scale_pos_weight ≈ 7.7).
**Turnaround constraints**
Sequences are constructed by linking inbound and outbound legs on the same
tail number with a turnaround window of **30–240 minutes** — the FAA minimum
crew turn plus an operational ceiling beyond which a new crew is typically assigned.
""")
# ── Section 2: Dataset ───────────────────────────────────────────────────
with st.expander("**2 · Dataset**", expanded=True):
_d1, _d2 = st.columns([3, 2])
with _d1:
st.markdown("""
**Primary source — Bureau of Transportation Statistics (BTS) On-Time Performance**
- Years: 2015–2024 (10 years)
- Scope: all AA flights departing or arriving DFW (not just AA — used for hub-load features)
- Key fields used: `Tail_Number`, `FlightDate`, `Origin`, `Dest`, `CRSDepTime`,
`CRSArrTime`, `WeatherDelay`, `NASDelay`, `Cancelled`, `CancellationCode`
**Sequence construction — tail-matched rotations.** Each observation is a real aircraft
rotation: an inbound leg from airport A arriving at DFW, linked to an outbound leg from
DFW to airport B on the **same tail number and same calendar date**, with a turnaround
window of 30–240 minutes. This captures the actual crew assignment: if the same aircraft
(and likely the same crew) flew A→DFW and then DFW→B, that is one sequence.
Aggregating by (airport_A, airport_B, Month, Year) yields `n_sequences` equal to the
**number of matched rotations** in that year-month — on average 3.6 per cell, median 2,
reflecting the true operational frequency of the A–DFW–B pairing.
**Secondary source — NOAA GSOM (Global Summary of Month)**
- Monthly climate normals: precipitation, wind speed/gust, extreme-event counts
- Coverage: ~55% of US airports have a nearby station with complete records
- XGBoost handles missing GSOM data natively via built-in NaN routing in split
decisions — airports without GSOM still participate in all non-GSOM splits
**Labeling.** A sequence is *disrupted* if the inbound leg had weather delay ≥ 15 min,
**or** the outbound leg had weather delay ≥ 15 min, **or** a cascade is detected
(inbound arrival delay ≥ 15 min propagates into a late-aircraft departure delay ≥ 15 min
on the outbound leg). `observed_bad_rate` is the fraction of matched rotations in a
`(pair, month, year)` cell that are disrupted. The binary label `y = 1` if this rate
exceeds **0.25**, yielding an **11.4% positive rate** — reflecting the true rarity
of severely weather-disrupted crew rotations on any given corridor.
""")
with _d2:
_ds_rows = [
("BTS years", "2015–2024"),
("Raw flight records", "~8.5M"),
("Tail-matched seqs", "~398k obs"),
("Unique pair-months", "~156k"),
("Avg seqs/pair-month", "~3.6"),
("Unique airports A", "~250"),
("Unique airports B", "~250"),
("Positive rate", "11.4%"),
("Threshold", "25% bad rate"),
("Turnaround window", "30–240 min"),
("GSOM airport cov.", "~55%"),
]
_ds_df = pd.DataFrame(_ds_rows, columns=["Property", "Value"])
st.dataframe(_ds_df, hide_index=True, width='stretch', height=370)
# ── Section 3: Model Architecture ────────────────────────────────────────
with st.expander("**3 · Model Architecture & Training**", expanded=True):
_m1, _m2 = st.columns(2)
with _m1:
st.markdown("**XGBoost Gradient Boosted Trees — Objective Function**")
st.markdown("The model minimizes a regularized additive loss over K trees:")
st.latex(r"""
\mathcal{L}(\phi) = \sum_{i=1}^{n} \ell\!\left(y_i,\, \hat{y}_i^{(K)}\right)
+ \sum_{k=1}^{K} \Omega(f_k)
""")
st.markdown("where the log-loss for binary classification is:")
st.latex(r"""
\ell\!\left(y_i, \hat{y}_i\right) =
-\,y_i \log \hat{p}_i - (1-y_i)\log(1-\hat{p}_i),
\quad
\hat{p}_i = \sigma\!\left(\hat{y}_i^{(K)}\right)
""")
st.markdown("and the regularization penalty on tree $f_k$ is:")
st.latex(r"""
\Omega(f_k) = \gamma\, T_k + \frac{1}{2}\,\lambda\,\|\mathbf{w}_k\|^2
""")
st.markdown(r"($T_k$ = number of leaves, $\mathbf{w}_k$ = leaf weight vector). The optimal leaf weight in each node is derived analytically via second-order Taylor expansion:")
st.latex(r"""
w_j^* = -\,\frac{\displaystyle\sum_{i \in I_j} g_i}
{\displaystyle\sum_{i \in I_j} h_i + \lambda}
""")
st.markdown(r"where $g_i = \partial_{\hat{y}} \ell$ and $h_i = \partial^2_{\hat{y}} \ell$ are the first and second gradients. **Class imbalance** is corrected by re-weighting positive gradients:")
st.latex(r"""
\text{scale\_pos\_weight} = \frac{N_{\text{neg}}}{N_{\text{pos}}}
= \frac{248{,}421}{180{,}695} \approx 1.374
""")
with _m2:
st.markdown("**Hyperparameters**")
_hp = pd.DataFrame([
("n_estimators", "500", "Hard cap; early stopping governs actual tree count"),
("early_stopping_rounds","30", "Halt if val AUCPR doesn't improve for 30 rounds"),
("max_depth", "6", "Sufficient for 6-way interaction features"),
("learning_rate (η)", "0.05", "Slow shrinkage → lower variance"),
("subsample", "0.8", "Stochastic row sampling per tree"),
("colsample_bytree", "0.8", "Feature sampling per tree"),
("eval_metric", "aucpr", "Average Precision — better for imbalanced targets"),
("tree_method", "hist", "Histogram splits — O(n·b); GPU-accelerated"),
("device", "cuda", "NVIDIA GPU training"),
("random_state", "42", "Reproducibility"),
], columns=["Parameter", "Value", "Rationale"])
st.dataframe(_hp, hide_index=True, width='stretch', height=370)
st.markdown("""
**Validation: strict temporal split**
```
Train: Year ∈ {2015, …, 2023} (~85%)
Val: Year = 2024 (~15%)
```
Standard k-fold would leak future information (2023 data training on 2024 labels in some folds).
Time-based holdout tests true out-of-sample generalization.
**NaN passthrough.** XGBoost learns a default branching direction for each split
when a feature value is missing — GSOM features (absent for ~45% of airports) are
handled natively without imputation.
""")
# ── Section 4: Feature Engineering ───────────────────────────────────────
with st.expander("**4 · Feature Engineering — All 70 Features**", expanded=True):
_fi_df = get_feature_importance_df()
_group_colors = {
"Origin BTS": "#005EB8",
"Dest BTS": "#0088CC",
"Pair BTS": "#00AADD",
"Temporal": "#7B2D8B",
"Origin GSOM": "#1a7a4a",
"Dest GSOM": "#2ca02c",
"Pair GSOM": "#5cb85c",
"DFW Hub": "#C41E3A",
"Tail-Chain / Duty": "#8B4513",
"Airport Cascade": "#ff7f0e",
"Multi-Hop Cascade": "#B8860B",
"Other": "#888888",
}
# ── Sunburst: group → feature ─────────────────────────────────────
_sun_df = _fi_df[_fi_df["importance"] > 0].copy()
_sun_df["pct"] = (_sun_df["importance"] / _sun_df["importance"].sum() * 100).round(2)
fig_sun = px.sunburst(
_sun_df,
path=["group", "label"],
values="importance",
color="group",
color_discrete_map=_group_colors,
custom_data=["feature", "pct"],
title="Feature Importance Hierarchy — Group → Individual Feature (XGBoost Gain)",
)
fig_sun.update_traces(
hovertemplate=(
"%{label}
"
"Group: %{parent}
"
"Importance: %{value:.4f}
"
"Share: %{customdata[1]:.2f}%"
),
textfont_size=11,
insidetextorientation="radial",
)
fig_sun.update_layout(height=560, margin=dict(t=50, b=10, l=10, r=10))
st.plotly_chart(fig_sun, width='stretch')
# ── Bar (top 25) + group bar side by side ─────────────────────────
_bc1, _bc2 = st.columns([3, 2])
with _bc1:
_top25 = _fi_df.head(25).sort_values("importance")
fig_fi = go.Figure(go.Bar(
x=_top25["importance"],
y=_top25["label"],
orientation="h",
marker=dict(
color=[_group_colors.get(g, "#888") for g in _top25["group"]],
line=dict(width=0),
),
text=[f"{v:.3f}" for v in _top25["importance"]],
textposition="outside",
hovertemplate="%{y}
Importance: %{x:.4f}
Feature: %{customdata}",
customdata=_top25["feature"],
))
fig_fi.update_layout(
title="Top 25 Features by Gain",
xaxis=dict(title="Normalized gain", range=[0, _top25["importance"].max() * 1.22]),
height=640,
margin=dict(l=10, r=90, t=50, b=40),
plot_bgcolor="rgba(0,0,0,0)",
)
st.plotly_chart(fig_fi, width='stretch')
with _bc2:
_grp_sum = (_fi_df.groupby("group")["importance"].sum()
.reset_index().sort_values("importance", ascending=False))
fig_grp = go.Figure(go.Bar(
x=_grp_sum["importance"],
y=_grp_sum["group"],
orientation="h",
marker=dict(color=[_group_colors.get(g, "#888") for g in _grp_sum["group"]]),
text=[f"{v:.3f}" for v in _grp_sum["importance"]],
textposition="outside",
hovertemplate="%{y}
Total gain: %{x:.4f}",
))
fig_grp.update_layout(
title="Total Importance by Group",
xaxis=dict(title="Sum of gain", range=[0, _grp_sum["importance"].max() * 1.25]),
height=640,
margin=dict(l=10, r=90, t=50, b=40),
plot_bgcolor="rgba(0,0,0,0)",
)
st.plotly_chart(fig_grp, width='stretch')
# ── Full feature table ────────────────────────────────────────────
with st.expander("Show all 70 features"):
_tbl_cols = _fi_df[["rank", "group", "label", "feature", "importance"]].copy()
_tbl_cols.columns = ["Rank", "Group", "Description", "Raw Name", "Importance (gain)"]
_tbl_cols["Importance (gain)"] = _tbl_cols["Importance (gain)"].map("{:.5f}".format)
st.dataframe(_tbl_cols, hide_index=True, width='stretch', height=500)
# ── Section 5: Evaluation ─────────────────────────────────────────────────
with st.expander("**5 · Model Evaluation**", expanded=True):
st.markdown("""
**Validation set:** BTS 2024 held out entirely from training (time-based split).
Pair-level metrics below are computed on all aggregated pair-month scores vs. observed bad rates.
""")
# Metric cards row
_ev_cols = st.columns(5)
for _col, (_lbl, _val, _note) in zip(_ev_cols, [
("Val AUC", "0.825", "sequence-level (2024)"),
("Val AP", "0.445", "sequence-level (2024)"),
("Pair AUC", "0.803", "pair-level aggregation"),
("Pair AP", "0.349", "pair-level aggregation"),
("F1 @ 0.11", "0.388", "optimized pair threshold"),
]):
_col.markdown(
f''
f'
{_lbl}
'
f'
{_val}
'
f'
{_note}
'
f'
',
unsafe_allow_html=True,
)
st.markdown("
", unsafe_allow_html=True)
_eval = get_eval_data()
_txt = "rgba(220,220,220,0.9)" if dark_mode else "rgba(30,30,30,0.9)"
_grid = "rgba(255,255,255,0.08)" if dark_mode else "rgba(0,0,0,0.08)"
# Row 1: ROC + PR
_r1a, _r1b = st.columns(2)
with _r1a:
# ROC curve
_fpr_s = _eval["fpr"][::max(1, len(_eval["fpr"])//500)]
_tpr_s = _eval["tpr"][::max(1, len(_eval["tpr"])//500)]
fig_roc = go.Figure()
fig_roc.add_trace(go.Scatter(
x=[0, 1], y=[0, 1], mode="lines",
line=dict(dash="dash", color="rgba(150,150,150,0.5)", width=1.5),
name="Random (AUC=0.50)", showlegend=True,
))
fig_roc.add_trace(go.Scatter(
x=_fpr_s, y=_tpr_s, mode="lines",
line=dict(color="#005EB8", width=2.5),
fill="tozeroy", fillcolor="rgba(0,94,184,0.08)",
name=f"XGBoost (AUC = {_eval['auc']:.3f})",
))
fig_roc.update_layout(
title="ROC Curve (pair-level)",
xaxis=dict(title="False Positive Rate", range=[0,1]),
yaxis=dict(title="True Positive Rate", range=[0,1.02]),
height=380, margin=dict(t=50, b=50, l=50, r=20),
plot_bgcolor="rgba(0,0,0,0)",
legend=dict(x=0.55, y=0.08),
)
fig_roc.add_annotation(
x=0.65, y=0.35, text=f"AUC = {_eval['auc']:.3f}",
font=dict(size=15, color=_txt), showarrow=False,
)
st.plotly_chart(fig_roc, width='stretch')
with _r1b:
# Precision-Recall curve
_step = max(1, len(_eval["prec"]) // 500)
_pr_p = _eval["prec"][::_step]
_pr_r = _eval["rec"][::_step]
_baseline = float((_eval["scores"]["observed_bad_rate"] > 0.25).mean())
fig_pr = go.Figure()
fig_pr.add_trace(go.Scatter(
x=[0, 1], y=[_baseline, _baseline], mode="lines",
line=dict(dash="dash", color="rgba(150,150,150,0.5)", width=1.5),
name=f"Baseline (AP={_baseline:.2f})", showlegend=True,
))
fig_pr.add_trace(go.Scatter(
x=_pr_r, y=_pr_p, mode="lines",
line=dict(color="#C41E3A", width=2.5),
fill="tozeroy", fillcolor="rgba(196,30,58,0.08)",
name=f"XGBoost (AP = {_eval['ap']:.3f})",
))
fig_pr.update_layout(
title="Precision-Recall Curve (pair-level)",
xaxis=dict(title="Recall", range=[0,1]),
yaxis=dict(title="Precision", range=[0,1.02]),
height=380, margin=dict(t=50, b=50, l=50, r=20),
plot_bgcolor="rgba(0,0,0,0)",
legend=dict(x=0.02, y=0.08),
)
fig_pr.add_annotation(
x=0.35, y=0.35, text=f"AP = {_eval['ap']:.3f}",
font=dict(size=15, color=_txt), showarrow=False,
)
st.plotly_chart(fig_pr, width='stretch')
# Row 2: Calibration + Confusion matrix
_r2a, _r2b = st.columns(2)
with _r2a:
# Calibration scatter
_cal = _eval["cal"]
_cal_colors = [score_to_color(float(s)) for s in _cal["mean_score"]]
_diag_color = "rgba(200,200,200,0.55)" if dark_mode else "rgba(80,80,80,0.45)"
fig_cal = go.Figure()
fig_cal.add_trace(go.Scatter(
x=[0, 1], y=[0, 1], mode="lines",
line=dict(dash="dot", color=_diag_color, width=1.5),
name="Perfect calibration", showlegend=True,
))
fig_cal.add_trace(go.Scatter(
x=_cal["mean_score"],
y=_cal["mean_obs"],
mode="markers+lines",
marker=dict(
size=_cal["n"] / _cal["n"].max() * 28 + 10,
color=_cal_colors,
line=dict(width=1.5, color="white"),
),
line=dict(color="rgba(128,128,128,0.4)", width=1.5),
text=[f"Decile {i}
Score: {s:.3f}
Obs bad rate: {o:.3f}
n={n:,}"
for i, (s, o, n) in enumerate(zip(_cal["mean_score"], _cal["mean_obs"], _cal["n"]))],
hovertemplate="%{text}",
name="Model (decile means)",
))
fig_cal.update_layout(
title="Calibration Plot — Isotonic-Calibrated Score vs. Observed Bad Rate
"
"Dot size ∝ pair-months in decile · Near diagonal = well-calibrated (score ≈ disruption rate)",
xaxis=dict(title="Mean Model Risk Score (decile)", range=[0, 1],
tickformat=".0%", tickvals=[0, 0.25, 0.5, 0.75, 1.0],
gridcolor=_grid),
yaxis=dict(title="Mean Observed Bad Rate (decile)", range=[0, 1],
tickformat=".0%", tickvals=[0, 0.25, 0.5, 0.75, 1.0],
gridcolor=_grid),
height=400, margin=dict(t=70, b=50, l=60, r=20),
plot_bgcolor="rgba(0,0,0,0)",
legend=dict(x=0.02, y=0.92),
)
st.plotly_chart(fig_cal, width='stretch')
with _r2b:
# Confusion matrix
_cm = np.array([[220253, 28168], [71170, 109525]])
_cm_pct = _cm / _cm.sum()
_ann = [[f"{_cm[i,j]:,}
{_cm_pct[i,j]:.1%}" for j in range(2)] for i in range(2)]
_cm_colors = [["rgba(0,94,184,0.55)", "rgba(196,30,58,0.25)"],
["rgba(196,30,58,0.25)", "rgba(0,94,184,0.55)"]]
fig_cm = go.Figure()
for _ri in range(2):
for _ci in range(2):
fig_cm.add_shape(type="rect",
x0=_ci-0.5, y0=_ri-0.5, x1=_ci+0.5, y1=_ri+0.5,
fillcolor=_cm_colors[_ri][_ci], line=dict(color="rgba(128,128,128,0.3)", width=1))
fig_cm.add_annotation(
x=_ci, y=_ri, text=_ann[_ri][_ci],
font=dict(size=15, color=_txt), showarrow=False, align="center")
fig_cm.update_layout(
title="Confusion Matrix (raw score threshold = 0.50 ≈ calibrated 0.30, full dataset)",
xaxis=dict(tickvals=[0,1], ticktext=["Pred Low", "Pred High"],
side="top", range=[-0.5, 1.5]),
yaxis=dict(tickvals=[0,1], ticktext=["Actual Low", "Actual High"],
range=[-0.5, 1.5], autorange="reversed"),
height=400, margin=dict(t=80, b=20, l=100, r=20),
plot_bgcolor="rgba(0,0,0,0)",
)
st.plotly_chart(fig_cm, width='stretch')
# Score distribution histogram (calibrated thresholds)
# Share of pair-months falling in each calibrated risk band.
_bands = ["LOW\n(<20%)", "MODERATE\n(20–30%)", "HIGH\n(≥30%)"]
_pcts = [0.630, 0.228, 0.142]
_n_pair_months = 429116  # total used to turn the shares into counts
_cnts = [int(p * _n_pair_months) for p in _pcts]
fig_dist = go.Figure(go.Bar(
    x=_bands,
    y=_pcts,
    marker=dict(color=["#2ca02c", "#ff7f0e", "#d62728"],
                line=dict(width=0)),
    # Two-line bar label; Plotly text breaks lines on "<br>" (a raw newline
    # inside the literal is not valid Python).
    text=[f"{p:.1%}<br>({c:,} pairs)" for p, c in zip(_pcts, _cnts)],
    textposition="outside",
))
fig_dist.update_layout(
    title="Model Score Distribution — All 429k Pair-Months",
    # y range extends above the tallest bar so the outside labels have room.
    yaxis=dict(tickformat=".0%", range=[0, 0.72], title="Fraction of pair-months"),
    xaxis=dict(title="Risk Band"),
    height=310, margin=dict(t=50, b=50, l=60, r=20),
    plot_bgcolor="rgba(0,0,0,0)",
)
st.plotly_chart(fig_dist, width='stretch')
st.markdown("""
**Interpreting the calibration plot**
Points lie **near the diagonal** — after isotonic regression calibration, model scores
directly approximate observed corridor bad rates. A score of **0.30** means
*"approximately 30% of tail-matched crew rotations on this corridor in this month
historically experienced a weather disruption."*
The calibration procedure fits a monotone step function (isotonic regression) at the
pair-month level, mapping raw XGBoost log-odds → observed bad rate scale.
Ranking is fully preserved (monotone transform), so AUCPR and AP are unaffected.
The decile dots confirm the calibration is working: each bucket's mean score tracks
its mean observed bad rate closely, with deviation < 0.03 on average.
**Known limitations**
1. **Tail-number matching as crew proxy.** The model links flights by shared tail number (same aircraft = likely same crew), but crew scheduling can deviate from aircraft routing. Scores approximate crew exposure, not guaranteed crew assignment.
2. **AA-only training.** Tail-chain and cascade features reflect AA operational patterns; scores for non-AA carriers on the same routes may differ.
3. **Climate stationarity.** Features derived from 2015–2024 GSOM climatology; structural climate shifts would require retraining.
4. **No real-time weather.** Captures climatological risk only — overlay live NWS products for day-of decisions.
5. **Calibration holdout.** Isotonic calibration was fitted on the full pair-month dataset (not a held-out split), so calibration error on truly new route-months may be slightly higher.
""")
# ── Section 6: Feature Group Deep Dive ───────────────────────────────────
with st.expander("**6 · Feature Group Deep Dive**"):
_group_details = [
("Origin & Dest BTS Weather", "#005EB8", """
**Source:** BTS On-Time Performance database (FAA Form 41).
**Computed per airport × month** over 2015–2024 AA flights at DFW.
Each airport has 8 features split into *AA-specific* (when that airport appears
in an AA DFW sequence) and *overall* (all carriers, all routes):
| Feature | Definition |
|---|---|
| `weather_delay_rate` | Fraction of flights with `WeatherDelay ≥ 15 min` |
| `weather_cancel_rate` | Fraction of flights cancelled with code "B" (weather) |
| `avg_weather_delay_min` | Mean `WeatherDelay` across delayed flights |
| `p75_weather_delay_min` | 75th percentile of weather delay distribution |
| `p95_weather_delay_min` | 95th percentile — captures tail risk |
| `nas_delay_rate` | Fraction with `NASDelay ≥ 15 min` (ATC/system, correlated with weather) |
| `overall_weather_delay_rate` | All-carrier version of `weather_delay_rate` |
| `overall_avg_weather_delay_min` | All-carrier average weather delay |
Pair-level features (`pair_*`) are computed as max, min, sum, or product across A and B,
capturing compounding effects (both airports simultaneously bad → highest risk).
"""),
("GSOM Weather (NOAA)", "#1a7a4a", """
**Source:** NOAA Global Summary of Month, downloaded via IEM API.
**Coverage:** ~35% of unique origin airports (73/204) have a nearby GSOM station with
complete data; ~55% of sequence rows lack A-side GSOM data.
**Fairness fix — month-level median imputation.** Early versions left GSOM values as NaN,
relying on XGBoost's NaN default branches. This created a subtle bias: the model learned that
"no GSOM station" correlates with lower disruption rates (because GSOM-less airports tend to be
smaller), causing airports like ANC (Anchorage) or ALB (Albany) to receive artificially low
risk scores despite genuinely severe weather. The fix: at inference time, NaN GSOM features
are replaced with the month-level population median computed from all airports that do have
data. This gives uncovered airports a *neutral, seasonal* weather signal rather than
conflating "no station" with "good weather." SHAP charts flag imputed features with ★ and
an amber border.
| Feature | Definition |
|---|---|
| `avg_wind_speed` | Monthly mean surface wind speed (knots) |
| `max_wind_gust` | Maximum recorded wind gust in month (knots) |
| `precip_days` | Number of days with measurable precipitation |
| `total_precip` | Total monthly precipitation (inches) |
| `extreme_precip` | Days with precipitation ≥ 1 inch |
These five features exist for both A and B, plus pair-level max-aggregations (`pair_max_*`).
GSOM captures the *climatological* pattern — e.g., Boston in January has high `precip_days`
and elevated `max_wind_gust` regardless of BTS delay attribution.
"""),
("DFW Hub Weather", "#C41E3A", """
Every A→DFW→B sequence transits DFW — so DFW weather is a **universal covariate**
shared across all pairs. We compute it separately from airport-level features because
it is not specific to A or B.
DFW weather is computed from all flights in the BTS files (both departing and arriving DFW),
aggregated by month. Four features:
| Feature | Definition |
|---|---|
| `DFW_weather_delay_rate` | Fraction of DFW flights delayed by weather ≥ 15 min |
| `DFW_weather_cancel_rate` | Fraction of DFW flights cancelled (weather) |
| `DFW_avg_weather_delay_min` | Mean weather delay at DFW |
| `DFW_p95_weather_delay_min` | 95th-percentile weather delay — captures severe weather events |
DFW hub weather ranks ~15th in feature importance, suggesting that pair-specific factors
dominate over hub-wide conditions — which makes sense, since DFW weather is a constant
backdrop, not a differentiator between pairs.
"""),
("Tail-Chain & FAA Part 117 Duty", "#8B4513", """
**Motivation.** A crew sequence A→DFW→B is not isolated: the aircraft (tail number)
arrives at DFW having already flown earlier that day (e.g., LGA→DFW). Each prior leg
adds fatigue and reduces buffer. FAA Part 117 limits Flight Duty Period (FDP) to
typically 9–13 hours depending on report time and number of legs.
**Construction.** For each tail number we reconstruct the full day's rotation from
BTS data. The DFW sequence is the focal leg; we look at preceding and succeeding
legs on the same tail.
| Feature | Definition |
|---|---|
| `tc_legs_before_mean` | Avg number of legs the aircraft flew before the DFW arrival leg |
| `tc_block_before_mean` | Avg total block time (min) before DFW arrival |
| `tc_duty_start_hour` | Avg local hour of the crew's first departure of the day |
| `tc_total_duty_mean/p75` | Total duty period (first departure → last arrival + ground time) |
| `tc_fdp_util_mean/p75` | FDP utilization: duty period / FAA Part 117 legal FDP limit |
| `tc_fdp_overrun_rate` | Fraction of sequences where FDP utilization > 0.95 (near-limit) |
| `tc_wocl_rate` | Fraction of sequences where duty period overlaps 02:00–05:59 local (Window of Circadian Low — highest fatigue risk) |
| `tc_legs_after_mean` | Avg legs the aircraft flies after the DFW departure leg |
| `tc_legs_in_day_mean` | Total legs in the full rotation day |
| `tc_downstream_rate` | Fraction of sequences where the leg after B is late (propagation) |
| `tc_cascade_late_rate` | Fraction of sequences where B→DFW arrival is late due to A→DFW delay |
| `tc_cascade_late_min` | Avg minutes the cascade adds to B→DFW arrival |
| `tc_cascade_amplif_mean` | Delay amplification factor: late minutes out / late minutes in |
`tc_cascade_amplif_mean` is the **4th most important feature overall** — sequences where
a small inbound delay reliably amplifies into a large outbound delay are systematically risky.
"""),
("Airport Cascade Propagation", "#ff7f0e", """
**Motivation.** Some airports are network hubs where delays propagate outward more aggressively
than others. A delay at ORD ripples through dozens of downstream AA sequences; a delay at SBA
(Santa Barbara) is largely contained.
**Construction.** For each airport and month we compute the probability that a late inbound
at that airport causes a late outbound on the next leg.
| Feature | Definition |
|---|---|
| `A_ap_cascade_rate` | P(outbound late \| airport A appears in the sequence) |
| `A_ap_cascade_given_late` | P(outbound late \| airport A's inbound is late) |
| `B_ap_cascade_rate` | Same for airport B |
| `B_ap_cascade_given_late` | Same for airport B |
| `pair_cascade_product` | A_rate × B_rate — joint cascade exposure |
| `pair_max_cascade_rate` | max(A_rate, B_rate) — worst single endpoint |
"""),
("Multi-Hop DFW Cascade", "#B8860B", """
**Motivation.** The A→DFW→B sequence is embedded in a longer chain. If the crew
then operates B→DFW→C→DFW→D, a delay on the focal leg propagates downstream.
These features capture how deeply a delay on A→DFW→B reverberates.
**Construction.** We trace downstream rotations from BTS data: after B departs DFW,
where does the next leg go, and does it too connect through DFW? We follow up to
3 downstream hops.
| Feature | Definition |
|---|---|
| `mhc_n_hops_mean/max` | Number of downstream DFW hops after the focal B departure |
| `mhc_total_late_min_mean/p75` | Total accumulated late minutes across all downstream hops |
| `mhc_cascade_hop_rate` | Fraction of downstream hops that are late |
| `mhc_cascade_depth_mean` | Avg depth at which disruption first appears downstream |
| `mhc_unique_airports_mean` | Number of distinct airports affected by a cascading delay |
| `mhc_recovery_rate` | Fraction of downstream chains that recover (no more late hops after 1st) |
`mhc_n_hops_mean` is the **6th most important feature** in the model — pairs with more
downstream rotations passing through DFW are inherently riskier because a single delay
has higher blast radius.
"""),
]
for _gname, _gcolor, _gdesc in _group_details:
st.markdown(
f''
f'{_gname}
',
unsafe_allow_html=True,
)
st.markdown(_gdesc)
st.markdown("---")
# ── Section 7: Key Findings ───────────────────────────────────────────────
with st.expander("**7 · Key Findings & Operational Implications**"):
st.markdown("""
**Finding 1: Seasonality dominates all other signals (26% of total importance)**
The top 3 features are all temporal: `is_spring_summer` (15.0%), `season_summer` (9.1%),
`season_spring` (1.9%). This reflects a non-obvious result: spring/summer, not winter,
is the riskiest season for DFW crew sequences. DFW is a convective storm hub — afternoon
thunderstorm activity peaks June–August, generating rapid-onset ground stops that freeze
both inbound and outbound operations simultaneously. Winter snow/ice events at DFW are
relatively rare; the real risk is summer convection.
**Finding 2: Destination-side weather drives more risk than origin-side**
`B_avg_wind_speed` (11.1%) outranks `A_avg_wind_speed` (2.5%). BTS features also
show B-side dominance. Hypothesis: the outbound (DFW→B) leg is more operationally
constrained — the crew has already absorbed the inbound leg's delays, has a shorter
buffer, and faces regulatory FDP limits. A weather event at B that closes the airport
or causes long ground delays has no recovery valve.
**Finding 3: Cascade amplification is the highest-signal non-seasonal feature**
`tc_cascade_amplif_mean` — the ratio of outbound delay minutes to inbound delay minutes —
is the 4th most important feature (4.2%). Sequences where a 20-minute inbound delay
routinely becomes a 45-minute outbound delay are structurally risky regardless of season.
This identifies aircraft rotations with tight turns and no slack.
**Finding 4: Multi-hop depth matters more than multi-hop rate**
`mhc_n_hops_mean` (3.5%) ranks above `mhc_cascade_hop_rate` (0.7%). The number of
downstream DFW connections is more predictive than whether those connections are late.
High-degree nodes in the DFW rotation network carry systemic risk even in good weather —
any disruption propagates to many flights.
**Finding 5: FDP overrun is a leading indicator, not a lagging one**
`tc_fdp_overrun_rate` (1.2%) predicts disruption *before* it happens. Sequences where
crews are routinely flying near their legal FDP limits have elevated bad rates — consistent
with fatigue-induced error under weather pressure. This validates the regulatory basis of
Part 117 limits as a risk proxy.
**Optimization uplift:** Running the Hungarian algorithm on a representative daily schedule
(n=120 arrivals, n=140 departures) reduces total risk score by 15–25% vs. random assignment,
and by 8–12% vs. greedy (highest-priority-first) assignment. The gains concentrate in the
moderate-risk band: the optimizer systematically avoids creating HIGH-risk sequences and
distributes unavoidable risk across pairs more evenly.
""")
# ═══════════════════════════════════════════════════════════════════════════
# TAB 1: RISK DASHBOARD
# ═══════════════════════════════════════════════════════════════════════════
with tab_dash:
    # Tab 1: pair-level risk overview — filter controls, summary metrics,
    # top-N bar chart with its table, and an origin × month heatmap.
    st.header("Airport Pair Risk Dashboard")
    scores = get_pair_scores()

    # Controls
    col_ctrl1, col_ctrl2, col_ctrl3 = st.columns([2, 2, 1])
    with col_ctrl1:
        month_sel = st.slider("Filter by Month", 1, 12, 6, key="dash_month",
                              format="%d")
        month_name = ap_meta.MONTH_NAMES[month_sel]
    with col_ctrl2:
        top_n = st.selectbox("Show top N pairs", [10, 20, 50, 100, 200], index=1)
    with col_ctrl3:
        # Vertical spacer so the checkbox lines up with the labelled widgets.
        st.markdown("<br>", unsafe_allow_html=True)
        show_all_months = st.checkbox("All months", value=False)

    df_view = scores if show_all_months else scores[scores["Month"] == month_sel]
    df_top = df_view.nlargest(top_n, "avg_risk_score")

    # Summary metrics
    m1, m2, m3, m4 = st.columns(4)
    m1.metric("Pairs Analyzed", f"{len(df_view):,}")
    if df_view.empty:
        # idxmax() raises on an empty selection and the means would be NaN,
        # so show placeholders when the chosen month has no rows.
        m2.metric("High Risk (≥30% disruption)", "N/A")
        m3.metric("Avg Calibrated Risk", "N/A")
        m4.metric("Riskiest Origin", "N/A")
    else:
        pct_high = (df_view["avg_risk_score"] >= HIGH_THRESHOLD).mean() * 100
        m2.metric("High Risk (≥30% disruption)", f"{pct_high:.1f}%")
        m3.metric("Avg Calibrated Risk", f"{df_view['avg_risk_score'].mean():.1%}")
        top_a = df_view.groupby("airport_A")["avg_risk_score"].mean().idxmax()
        m4.metric("Riskiest Origin", top_a)
    st.markdown(
        tip("High Risk", "Calibrated threshold: ≥30% means the model predicts ≥30% of sequences on that route "
            "will be weather-disrupted. Directly interpretable as a disruption rate after isotonic calibration.") +
        " · " +
        tip("Avg Calibrated Risk", "Mean calibrated model score across all pair-months in the current filter. "
            "Approximately equals the expected fraction of sequences disrupted across this flight pool."),
        unsafe_allow_html=True,
    )
    st.divider()

    # Top pairs bar chart
    col_bar, col_tbl = st.columns([3, 2])
    with col_bar:
        if show_all_months:
            # The same pair can appear once per month; identical category
            # labels would be drawn on top of each other, so add the month.
            bar_labels = [
                f"{r.airport_A}→DFW→{r.airport_B} ({ap_meta.MONTH_NAMES[int(r.Month)][:3]})"
                for r in df_top.itertuples()
            ]
        else:
            bar_labels = [f"{r.airport_A}→DFW→{r.airport_B}" for r in df_top.itertuples()]
        fig_bar = go.Figure(go.Bar(
            x=df_top["avg_risk_score"],
            y=bar_labels,
            orientation="h",
            marker_color=[score_to_color(s) for s in df_top["avg_risk_score"]],
            text=[f"{s:.1%}" for s in df_top["avg_risk_score"]],
            textposition="outside",
            # "<br>" is Plotly's line break inside hover text.
            hovertemplate="%{y}<br>Risk: %{x:.1%}",
        ))
        period_label = month_name if not show_all_months else 'All Months'
        fig_bar.update_layout(
            title=f"Top {top_n} Riskiest Sequences — {period_label}",
            xaxis=dict(range=[0, 1.05], tickformat=".0%"),
            height=max(400, top_n * 22),  # grow the chart with the number of bars
            margin=dict(l=10, r=80, t=40, b=40),
            plot_bgcolor="rgba(0,0,0,0)",
        )
        st.plotly_chart(fig_bar, width='stretch')
    with col_tbl:
        st.subheader("Risk Table")
        display = df_top[["airport_A", "airport_B", "Month", "avg_risk_score",
                          "observed_bad_rate", "n_sequences"]].copy()
        display.columns = ["Origin", "Dest", "Month", "Model Risk", "Observed Bad %", "Rotations"]
        display["Model Risk"] = display["Model Risk"].map("{:.1%}".format)
        display["Observed Bad %"] = display["Observed Bad %"].map("{:.1%}".format)
        st.dataframe(display, width='stretch', height=420)
        st.markdown(
            tip("Model Risk", "Calibrated XGBoost score — after isotonic regression calibration, "
                "this directly approximates the fraction of sequences on this route that are weather-disrupted. "
                "E.g. 0.30 = ~30% of sequences historically disrupted.") +
            " vs " +
            tip("Observed Bad %", "Fraction of tail-matched aircraft rotations (same tail number, same date) "
                "in this pair-month (2015–2024) where the inbound A→DFW leg or outbound DFW→B leg "
                "had a weather delay ≥15 min, or a cascade was detected (late-aircraft propagation). "
                "Direct sample estimate of disruption frequency for this crew assignment.") +
            " — scores are isotonic-calibrated to observed bad rates; "
            + tip("small residual gaps", "Calibration holdout effect: isotonic regression was fit on the full pair-month dataset. "
                  "For rare route-month combinations, calibration may be slightly off.") +
            " may persist on low-frequency pairs.",
            unsafe_allow_html=True,
        )
    st.divider()

    # Monthly heatmap: Top 15 origin airports × month.
    # Built from the full score table, independent of the month filter above.
    st.subheader("Monthly Risk Heatmap — Top Origins")
    top_origins = scores.groupby("airport_A")["avg_risk_score"].mean().nlargest(15).index.tolist()
    heat_df = (
        scores[scores["airport_A"].isin(top_origins)]
        .groupby(["airport_A", "Month"])["avg_risk_score"]
        .mean()
        .unstack("Month")
        .reindex(columns=range(1, 13))  # keep all 12 columns even if a month has no rows
    )
    heat_df.columns = [ap_meta.MONTH_NAMES[m][:3] for m in heat_df.columns]
    fig_heat = px.imshow(
        heat_df.values,
        x=heat_df.columns.tolist(),
        y=heat_df.index.tolist(),
        color_continuous_scale="RdYlGn_r",
        zmin=0, zmax=0.5,
        aspect="auto",
        labels=dict(color="Avg Risk"),
        title="Average Risk Score by Origin Airport × Month (color normalized to calibrated range 0–50%)",
    )
    fig_heat.update_layout(height=420, margin=dict(t=40, b=40))
    st.plotly_chart(fig_heat, width='stretch')
# ── Shared helper ────────────────────────────────────────────────────────────
def _render_sequences(seqs: pd.DataFrame, date_label: str, dep_col: str | None = None,
                      key_suffix: str = ""):
    """Render scored sequences: risk filter, summary metrics, colored table,
    CSV download and a risk timeline chart.

    Args:
        seqs: One row per A→DFW→B sequence. Must contain ``risk_label`` and
            ``risk_score``. The table shows whichever of ``Sequence``,
            ``flight_in``, ``arr_time``, ``flight_out``, ``dep_time``,
            ``turnaround_min``, ``risk_score`` and ``risk_label`` are present.
        date_label: Text describing the period; shown in the "Period" metric
            and used (first 10 characters) in the CSV file name.
        dep_col: Optional column whose values are divided by 60 to obtain the
            timeline x position in hours. When missing or absent from
            ``seqs``, points are plotted by position instead.
        key_suffix: Suffix that keeps the widget keys unique per call site.

    Returns:
        None. Everything is written to the current Streamlit container.
    """
    # The label is sliced below; coerce so a date-like object cannot break it.
    date_label = str(date_label)
    if seqs.empty:
        st.info(f"No feasible A→DFW→B sequences found for {date_label}.")
        return

    risk_filter = st.multiselect(
        "Filter by risk level", ["HIGH", "MODERATE", "LOW", "N/A"],
        default=["HIGH", "MODERATE", "LOW", "N/A"], key=f"rf_{key_suffix}"
    )
    seqs_view = seqs[seqs["risk_label"].isin(risk_filter)]

    # Summary metrics describe the full set, not the filtered view.
    s1, s2, s3 = st.columns(3)
    s1.metric("Sequences Found", len(seqs))
    high_count = int((seqs["risk_label"] == "HIGH").sum())
    s2.metric("High Risk", high_count,
              delta=f"{high_count/max(len(seqs),1):.0%} of total")
    s3.metric("Period", date_label[:30])

    show_cols = [c for c in ["Sequence", "flight_in", "arr_time", "flight_out",
                             "dep_time", "turnaround_min", "risk_score", "risk_label"]
                 if c in seqs_view.columns]
    disp = seqs_view[show_cols].copy().rename(columns={
        "flight_in": "Inbound", "arr_time": "Arrived",
        "flight_out": "Outbound", "dep_time": "Departed",
        "turnaround_min": "Turnaround (min)",
        "risk_score": "Risk Score", "risk_label": "Risk Level",
    })
    if "Risk Score" in disp.columns:
        # np.floating covers NumPy float scalars that are not Python floats.
        disp["Risk Score"] = disp["Risk Score"].map(
            lambda x: f"{x:.1%}" if isinstance(x, (float, np.floating)) and not np.isnan(x) else "N/A"
        )

    band_fill = {"HIGH": "rgba(214,39,40,0.25)", "MODERATE": "rgba(255,127,14,0.25)",
                 "LOW": "rgba(44,160,44,0.25)"}

    def _color(row):
        # One CSS declaration per cell; unknown labels get an empty colour.
        c = band_fill.get(str(row.get("Risk Level", "")), "")
        return [f"background-color:{c}" for _ in row]

    st.dataframe(disp.style.apply(_color, axis=1), width='stretch', height=400)
    st.download_button("Download CSV", disp.to_csv(index=False),
                       file_name=f"dfw_risk_{date_label[:10]}.csv", mime="text/csv",
                       key=f"dl_{key_suffix}")

    # Timeline (uses every scored sequence, regardless of the table filter)
    st.subheader("Risk Timeline")
    plot_seqs = seqs.dropna(subset=["risk_score"])
    if dep_col and dep_col in plot_seqs.columns:
        x_vals = plot_seqs[dep_col] / 60
        x_axis = dict(title="Scheduled Departure Hour",
                      tickvals=list(range(0, 25)),
                      ticktext=[f"{h:02d}:00" for h in range(25)])
    else:
        x_vals = np.arange(len(plot_seqs), dtype=float)
        x_axis = dict(title="Sequence (sorted by departure)")

    fig_tl = go.Figure()
    if not plot_seqs.empty:
        rows = [row for _, row in plot_seqs.iterrows()]
        # One trace with per-point arrays instead of one trace per sequence:
        # the figure stays light even for a full day's worth of points.
        fig_tl.add_trace(go.Scatter(
            x=list(x_vals),
            y=[row["risk_score"] for row in rows],
            mode="markers+text",
            marker=dict(size=11,
                        color=[score_to_color(row["risk_score"]) for row in rows],
                        line=dict(width=1, color="black")),
            text=[str(row.get("airport_B", "")) for row in rows],
            textposition="top center",
            # Per-point hover text; "<br>" is Plotly's line break and
            # "<extra></extra>" hides the secondary box with the trace name.
            hovertemplate=[
                f"{row.get('Sequence','')}<br>Risk: {row['risk_score']:.1%}"
                f"<br>Turnaround: {row.get('turnaround_min','?')} min<extra></extra>"
                for row in rows
            ],
            showlegend=False,
        ))
    fig_tl.add_hline(y=HIGH_THRESHOLD, line_dash="dash", line_color="red",
                     annotation_text="High ≥30%", annotation_position="right")
    fig_tl.add_hline(y=MOD_THRESHOLD, line_dash="dash", line_color="orange",
                     annotation_text="Moderate ≥20%", annotation_position="right")
    fig_tl.update_layout(xaxis=x_axis,
                         yaxis=dict(title="Calibrated Risk Score (≈ disruption rate)",
                                    range=[-0.05, 1.05], tickformat=".0%"),
                         height=360, plot_bgcolor="rgba(0,0,0,0)", showlegend=False)
    st.plotly_chart(fig_tl, width='stretch')
# ═══════════════════════════════════════════════════════════════════════════
# TAB 2: DFW SCHEDULE
# ═══════════════════════════════════════════════════════════════════════════
with tab_sched:
st.header("DFW Schedule — Sequence Risk Overlay")
st.markdown(
"AA flights at DFW scored for weather disruption risk. "
"Live: AviationStack API (key in sidebar). "
"Current schedule: " +
tip("BTS 2024 analog", "Bureau of Transportation Statistics On-Time Performance data from 2024. "
"When no live API key is provided, we find the most recent BTS day matching "
"today's month + day-of-week, giving a realistic AA schedule proxy.") +
" (same month + day-of-week). Historical: pick any 2024 date. "
"Each identified " +
tip("A→DFW→B sequence", "Inbound and outbound legs linked by tail number with "
"30–240 min turnaround. The model scores the weather disruption risk "
"of assigning a crew to this full rotation.") +
" is scored with the calibrated risk model.",
unsafe_allow_html=True,
)
bts = get_bts_2024()
if bts.empty:
st.warning("BTS 2024 data not found at `data/raw/bts_all_dfw_2024.parquet`.")
else:
scores_sched = get_pair_scores()
scores_idx = get_scores_indexed()
data_mode = st.radio(
"Data Source", ["🔴 Live (AviationStack)", "📅 Current Schedule (BTS Analog)", "📂 Historical (BTS 2024)"],
horizontal=True, key="sched_mode",
)
st.divider()
def _bts_day_to_seqs(day_df: pd.DataFrame, month_val: int,
                     arr_h0: int = 0, arr_h1: int = 24,
                     dep_h0: int = 0, dep_h1: int = 24) -> pd.DataFrame:
    """Build tail-matched A→DFW→B sequences from one day of BTS flights.

    An arrival into DFW and a departure out of DFW form a sequence when they
    share a tail number, the turnaround is 30–240 minutes inclusive, and the
    origin differs from the destination.

    Args:
        day_df: Flights with columns ``Origin``, ``Dest``, ``CRSArrTime``,
            ``CRSDepTime``, ``Tail_Number`` and
            ``Flight_Number_Reporting_Airline``. The two time columns are
            treated as HHMM numbers.
        month_val: Value written to the ``Month`` column of every sequence.
        arr_h0, arr_h1: Arrival window in hours, ``[arr_h0, arr_h1)``.
        dep_h0, dep_h1: Departure window in hours, ``[dep_h0, dep_h1)``.

    Returns:
        One row per sequence with ``airport_A``, ``airport_B``, ``tail``,
        ``flight_in``, ``flight_out``, ``arr_min``, ``dep_min``,
        ``turnaround_min``, ``Sequence``, ``Month``, ``arr_time`` and
        ``dep_time`` (the last two formatted as ``HH:MM``).
    """
    def _hhmm_to_minutes(hhmm: pd.Series) -> pd.Series:
        # 1345 -> 13 * 60 + 45
        return hhmm // 100 * 60 + hhmm % 100

    def _has_tail(tails: pd.Series) -> pd.Series:
        return tails.notna() & (tails.astype(str).str.strip() != "")

    def _clock(minutes: pd.Series) -> pd.Series:
        hh = (minutes // 60).astype(int).astype(str).str.zfill(2)
        mm = (minutes % 60).astype(int).astype(str).str.zfill(2)
        return hh + ":" + mm

    arrivals = day_df[day_df["Dest"] == "DFW"].copy()
    departures = day_df[day_df["Origin"] == "DFW"].copy()
    arrivals["arr_min"] = _hhmm_to_minutes(arrivals["CRSArrTime"])
    departures["dep_min"] = _hhmm_to_minutes(departures["CRSDepTime"])
    arrivals = arrivals[(arrivals["arr_min"] >= arr_h0 * 60) & (arrivals["arr_min"] < arr_h1 * 60)]
    departures = departures[(departures["dep_min"] >= dep_h0 * 60) & (departures["dep_min"] < dep_h1 * 60)]

    arr_s = arrivals[["Origin", "arr_min", "Tail_Number", "Flight_Number_Reporting_Airline"]].copy()
    arr_s.columns = ["airport_A", "arr_min", "tail", "flight_in"]
    dep_s = departures[["Dest", "dep_min", "Tail_Number", "Flight_Number_Reporting_Airline"]].copy()
    dep_s.columns = ["airport_B", "dep_min", "tail", "flight_out"]

    # pandas merge matches null keys to each other, so flights without a tail
    # number would be cross-joined into sequences that never existed.
    arr_s = arr_s[_has_tail(arr_s["tail"])]
    dep_s = dep_s[_has_tail(dep_s["tail"])]

    seqs = arr_s.merge(dep_s, on="tail", how="inner")
    seqs["turnaround_min"] = seqs["dep_min"] - seqs["arr_min"]
    feasible = (seqs["turnaround_min"].between(30, 240)
                & (seqs["airport_A"] != seqs["airport_B"]))
    seqs = seqs[feasible].copy()
    seqs["Sequence"] = seqs["airport_A"] + " → DFW → " + seqs["airport_B"]
    seqs["Month"] = month_val
    seqs["arr_time"] = _clock(seqs["arr_min"])
    seqs["dep_time"] = _clock(seqs["dep_min"])
    return seqs
# ── LIVE MODE (AviationStack) ──────────────────────────────────────
if data_mode.startswith("🔴"):
    if not aviationstack_key:
        st.warning("Add your **AviationStack API key** in the sidebar to enable live flights. "
                   "Free tier at aviationstack.com (100 req/month).")
    else:
        col_l1, col_l2 = st.columns([4, 1])
        cache_key = "as_live_seqs"
        with col_l2:
            fetch_btn = st.button("🔄 Fetch Live", key="sched_refresh_live")
        if fetch_btn:
            with st.spinner("Fetching live AA schedule from AviationStack..."):
                arr_raw, dep_raw, status = lf.fetch_aviationstack(aviationstack_key)
            st.session_state[cache_key] = (arr_raw, dep_raw, status)

        if cache_key not in st.session_state:
            # Deliberately no st.stop(): it ends the whole script run, which
            # would leave every tab rendered after this one empty.
            st.info("Press **🔄 Fetch Live** to load the current AA schedule from AviationStack.")
        else:
            arr_raw, dep_raw, status = st.session_state[cache_key]
            arr_df = opt.aviationstack_to_arrivals(arr_raw, 0, 24)
            dep_df = opt.aviationstack_to_departures(dep_raw, 0, 24)
            with col_l1:
                if "error" in status.lower():
                    st.error(status)
                elif len(arr_df) == 0 and len(dep_df) == 0:
                    st.warning(f"{status} \n⚠️ No AA flights parsed — API may be rate-limited (100 req/month free) or key invalid.")
                else:
                    st.caption(status)

            with st.expander(f"Raw: {len(arr_df)} AA arrivals / {len(dep_df)} AA departures"):
                c1, c2 = st.columns(2)
                with c1:
                    st.markdown("**→ DFW arrivals**")
                    if not arr_df.empty:
                        st.dataframe(arr_df[["flight", "airport", "time_str"]].rename(
                            columns={"flight": "Flight", "airport": "From", "time_str": "Time"}),
                            width='stretch', height=260)
                with c2:
                    st.markdown("**DFW → departures**")
                    if not dep_df.empty:
                        st.dataframe(dep_df[["flight", "airport", "time_str"]].rename(
                            columns={"flight": "Flight", "airport": "To", "time_str": "Time"}),
                            width='stretch', height=260)

            # Build seqs from live data — vectorized cross-join
            from datetime import datetime as _dt, timezone as _tz
            month_val = _dt.now().month
            seqs = pd.DataFrame()
            if not arr_df.empty and not dep_df.empty:
                _a = arr_df[["airport", "time_min", "time_str", "flight"]].copy()
                _d = dep_df[["airport", "time_min", "time_str", "flight"]].copy()
                cross = _a.merge(_d, how="cross", suffixes=("_a", "_b"))
                _ta = cross["time_min_b"] - cross["time_min_a"]
                cross = cross[(_ta >= 30) & (_ta <= 240) & (cross["airport_a"] != cross["airport_b"])].copy()
                if not cross.empty:
                    cross["turnaround_min"] = _ta[cross.index].astype(int)
                    cross["Month"] = month_val
                    cross["Sequence"] = cross["airport_a"] + " → DFW → " + cross["airport_b"]
                    cross = cross.rename(columns={
                        "airport_a": "airport_A", "airport_b": "airport_B",
                        "flight_a": "flight_in", "time_str_a": "arr_time",
                        "flight_b": "flight_out", "time_str_b": "dep_time",
                    })
                    seqs = cross[["airport_A", "airport_B", "flight_in", "arr_time",
                                  "flight_out", "dep_time", "turnaround_min", "Month", "Sequence"]]
            if not seqs.empty:
                seqs = lf.score_sequences(seqs, scores_sched)
            # The label says "UTC", so take the timestamp in UTC rather than
            # from the server's naive local clock.
            _render_sequences(seqs, f"Live {_dt.now(_tz.utc).strftime('%Y-%m-%d %H:%M UTC')}",
                              key_suffix="live")

# ── CURRENT SCHEDULE ANALOG ────────────────────────────────────────
elif data_mode.startswith("📅"):
    col_r1, col_r2 = st.columns([4, 1])
    with col_r2:
        refresh = st.button("🔄 Refresh", key="sched_refresh_analog")
    cache_key = "bts_analog"
    if refresh or cache_key not in st.session_state:
        day_df, status = lf.get_bts_analog(bts[bts["Reporting_Airline"] == "AA"])
        st.session_state[cache_key] = (day_df, status)
    else:
        day_df, status = st.session_state[cache_key]
    with col_r1:
        st.caption(status)
    if day_df is None or day_df.empty:
        # No matching analog day: .iloc[0] below would raise IndexError.
        st.info("No sequences found in analog day.")
    else:
        analog_date = day_df["FlightDate"].iloc[0]
        month_val = int(pd.to_datetime(analog_date).month)
        seqs = _bts_day_to_seqs(day_df, month_val)
        if not seqs.empty:
            seqs = lf.score_sequences(seqs, scores_sched)
            seqs = seqs.sort_values("risk_score", ascending=False)
            _render_sequences(seqs, f"Current schedule analog ({analog_date})",
                              dep_col="dep_min", key_suffix="analog")
        else:
            st.info("No sequences found in analog day.")

# ── HISTORICAL MODE ────────────────────────────────────────────────
else:
    avail_dates = sorted(bts["FlightDate"].unique())
    col_d1, col_d2, _ = st.columns([2, 2, 2])
    with col_d1:
        sel_date = st.selectbox("Date", avail_dates,
                                index=min(180, len(avail_dates) - 1))
    with col_d2:
        carrier_filter = st.radio("Carrier", ["AA only", "All carriers"],
                                  horizontal=True, key="sched_carrier")
    if carrier_filter == "All carriers":
        st.caption(
            "⚠️ Risk scores were trained exclusively on AA sequences. "
            "Non-AA tail numbers use the same pair-month risk lookup, "
            "which may not reflect other carriers' operational patterns."
        )
    month_val = int(pd.to_datetime(sel_date).month)
    day_df = bts[bts["FlightDate"] == sel_date].copy()
    if carrier_filter == "AA only":
        day_df = day_df[day_df["Reporting_Airline"] == "AA"]
    seqs = _bts_day_to_seqs(day_df, month_val)
    if not seqs.empty:
        seqs = lf.score_sequences(seqs, scores_sched)
        seqs = seqs.sort_values("risk_score", ascending=False)
        # The renderer slices its label, so hand it text even if the
        # FlightDate values are date-like objects.
        _render_sequences(seqs, str(sel_date), dep_col="dep_min", key_suffix="hist")
    else:
        st.info(f"No sequences on {sel_date}.")
# ═══════════════════════════════════════════════════════════════════════════
# TAB 3: SEQUENCE OPTIMIZER
# ═══════════════════════════════════════════════════════════════════════════
with tab_optim:
st.header("⚡ Sequence Optimizer")
st.markdown(
"Given a pool of DFW arrivals and departures, find the minimum-risk one-to-one "
"assignment of inbound → outbound sequences using the " +
tip("Hungarian algorithm", "Also called the Jonker-Volgenant algorithm. Solves the linear assignment problem "
"in O(n³) time: given an n×m cost matrix, find the assignment of arrivals to departures "
"that minimizes total cost. Implemented via scipy.optimize.linear_sum_assignment.") +
". Constrains " +
tip("turnaround time", "The gap between an aircraft's DFW arrival (inbound leg) and its DFW departure "
"(outbound leg) on the same tail number. FAA Part 117 requires ≥30 min minimum crew turn; "
"240 min is the operational ceiling beyond which a new crew is typically assigned.") +
" to **30–240 min** per FAA Part 117 guidelines.",
unsafe_allow_html=True,
)
bts_o = get_bts_2024()
scores_o_idx = get_scores_indexed()
if bts_o.empty:
st.warning("BTS 2024 data not found.")
else:
# ── Data source ────────────────────────────────────────────────────
opt_source = st.radio(
"Schedule Source",
["🔴 Live (AviationStack)", "📅 Current Schedule (BTS Analog)", "📂 Historical (BTS 2024)"],
horizontal=True, key="opt_source",
)
st.divider()
# ── Time window controls ───────────────────────────────────────────
st.subheader("Time Window")
tw1, tw2, tw3 = st.columns(3)
with tw1:
opt_carrier = st.radio("Carrier", ["AA only", "All carriers"],
key="opt_carrier", horizontal=True)
with tw2:
st.markdown("**Arrival Window (→ DFW)**")
arr_h0 = st.slider("Arrival from hour", 0, 23, 6, key="arr_h0")
arr_h1 = st.slider("Arrival to hour", 1, 24, 20, key="arr_h1")
with tw3:
st.markdown("**Departure Window (DFW →)**")
dep_h0 = st.slider("Departure from hour", 0, 23, 7, key="dep_h0")
dep_h1 = st.slider("Departure to hour", 1, 24, 22, key="dep_h1")
# Extra control: historical date picker (only shown in historical mode)
if opt_source.startswith("📂"):
avail_dates_o = sorted(bts_o["FlightDate"].unique())
opt_date = st.selectbox("Schedule Date", avail_dates_o,
index=min(180, len(avail_dates_o)-1), key="opt_date")
else:
opt_date = None
st.divider()
# ── Load arrivals + departures based on source ─────────────────────
# Defaults describe "nothing loaded"; each branch below overwrites them only
# when it actually obtains a schedule.
arrivals_o = pd.DataFrame()
departures_o = pd.DataFrame()
opt_month = datetime.now().month
opt_source_label = ""

if opt_source.startswith("🔴"):
    if not aviationstack_key:
        st.warning("Add AviationStack API key in sidebar for live data.")
    else:
        cache_key_as = "as_live_seqs"
        col_oa, col_ob = st.columns([4, 1])
        with col_ob:
            fetch_live = st.button("🔄 Fetch Live", key="opt_refresh_live")
        if fetch_live:
            with st.spinner("Fetching live AA schedule from AviationStack..."):
                arr_raw, dep_raw, status = lf.fetch_aviationstack(aviationstack_key)
            st.session_state[cache_key_as] = (arr_raw, dep_raw, status)
        if cache_key_as not in st.session_state:
            st.info("Press **🔄 Fetch Live** to load the current AA DFW schedule.")
        else:
            arr_raw, dep_raw, status = st.session_state[cache_key_as]
            arr_df_as = opt.aviationstack_to_arrivals(arr_raw, arr_h0, arr_h1)
            dep_df_as = opt.aviationstack_to_departures(dep_raw, dep_h0, dep_h1)
            with col_oa:
                st.caption(status)
            if len(arr_df_as) == 0 and len(dep_df_as) == 0:
                st.warning("No flights returned — check API key or try BTS Analog.")
            elif len(arr_df_as) < 50:
                st.info(
                    f"Only {len(arr_df_as)} arrivals found. "
                    "All times shown in **DFW local (CDT)**. "
                    "Widen the arrival/departure hour sliders if flights are missing."
                )
            arrivals_o = arr_df_as
            departures_o = dep_df_as
            # opt_month keeps its default (the current month).
            opt_source_label = f"Live {datetime.now().strftime('%Y-%m-%d')}"

elif opt_source.startswith("📅"):
    col_oa, col_ob = st.columns([4, 1])
    # Cache per carrier choice: "bts_analog" is also written by the schedule
    # tab with AA-only data, so the all-carrier day needs its own slot and a
    # carrier toggle must not reuse the other selection's cached day.
    cache_key_an = "bts_analog" if opt_carrier == "AA only" else "bts_analog_all_carriers"
    with col_ob:
        if st.button("🔄 Refresh", key="opt_refresh_analog"):
            st.session_state.pop(cache_key_an, None)
    if cache_key_an not in st.session_state:
        analog_pool = bts_o[bts_o["Reporting_Airline"] == "AA"] if opt_carrier == "AA only" else bts_o
        day_df_an, status_an = lf.get_bts_analog(analog_pool)
        st.session_state[cache_key_an] = (day_df_an, status_an)
    day_df_an, status_an = st.session_state[cache_key_an]
    with col_oa:
        st.caption(status_an)
    if day_df_an is not None and not day_df_an.empty:
        analog_date = day_df_an["FlightDate"].iloc[0]
        opt_month = int(pd.to_datetime(analog_date).month)
        arrivals_o = opt.bts_to_arrivals(day_df_an, arr_h0, arr_h1)
        departures_o = opt.bts_to_departures(day_df_an, dep_h0, dep_h1)
        opt_source_label = f"Analog {analog_date}"
    # else: no analog day found; .iloc[0] would raise, so keep the empty defaults.

else:  # historical
    opt_day = bts_o[bts_o["FlightDate"] == opt_date].copy()
    if opt_carrier == "AA only":
        opt_day = opt_day[opt_day["Reporting_Airline"] == "AA"]
    opt_month = int(pd.to_datetime(opt_date).month)
    arrivals_o = opt.bts_to_arrivals(opt_day, arr_h0, arr_h1)
    departures_o = opt.bts_to_departures(opt_day, dep_h0, dep_h1)
    # Every other branch produces a text label; keep the type consistent.
    opt_source_label = str(opt_date)
# Validate both hour windows, then show how many arrivals, departures and
# feasible arrival→departure pairs fall inside them.
if arr_h0 >= arr_h1 or dep_h0 >= dep_h1:
st.error("End hour must be > start hour.")
elif not arrivals_o.empty or not departures_o.empty:
sc1, sc2, sc3 = st.columns(3)
sc1.metric("Arrivals in window", len(arrivals_o))
sc2.metric("Departures in window", len(departures_o))
feasible_count = 0
if not arrivals_o.empty and not departures_o.empty:
# Vectorized cross-join feasibility check
# Broadcasting an (n,1) column of arrival times against a (1,m) row of
# departure times yields the turnaround (departure − arrival) for every pair.
_a = arrivals_o["time_min"].to_numpy(dtype=float)[:, None] # (n,1)
_d = departures_o["time_min"].to_numpy(dtype=float)[None, :] # (1,m)
_ta = _d - _a
_ap_a = arrivals_o["airport"].to_numpy(dtype=str)[:, None]
_ap_d = departures_o["airport"].to_numpy(dtype=str)[None, :]
_same = _ap_a == _ap_d
# A pair counts when 30 ≤ turnaround ≤ 240 and the two airports differ.
feasible_count = int(((_ta >= 30) & (_ta <= 240) & ~_same).sum())
sc3.metric("Feasible pairs", feasible_count)
st.markdown(
tip("Feasible pairs", "Arrival–departure combinations satisfying: "
"(1) turnaround ≥ 30 min, (2) turnaround ≤ 240 min, "
"(3) origin airport A ≠ destination airport B. "
"The optimizer picks the best one-to-one assignment from this pool."),
unsafe_allow_html=True,
)
st.divider()
# Run the assignment only on an explicit button press; the outcome is stored
# in session_state under "opt_result" together with the input frames.
if st.button("⚡ Run Optimizer", type="primary", key="run_optim"):
if arrivals_o.empty or departures_o.empty:
st.error("Need at least 1 arrival and 1 departure in the time windows.")
elif feasible_count == 0:
st.error("No feasible A→DFW→B pairs in the selected windows. "
"Widen arrival/departure windows or reduce turnaround constraints.")
else:
with st.spinner("Running Hungarian algorithm..."):
result_df, stats = opt.optimize_sequences(
arrivals_o, departures_o, scores_o_idx, opt_month
)
st.session_state["opt_result"] = (result_df, stats, arrivals_o, departures_o)
# Render the most recent optimizer result. It is read back from session_state,
# so it stays on screen across reruns until the optimizer is run again.
if "opt_result" in st.session_state:
result_df, stats, arrivals_o_r, departures_o_r = st.session_state["opt_result"]
# ── Summary metrics ───────────────────────────────────────
st.subheader("Optimization Results")
rm1, rm2, rm3, rm4, rm5 = st.columns(5)
rm1.metric("Sequences Assigned", stats["n_matched"])
rm2.metric("Avg Risk (Optimal)", f"{stats['optimal_avg']:.1%}")
# max(..., 1) avoids dividing by zero when nothing was matched.
worst_avg = stats["worst_total"] / max(stats["n_matched"], 1)
rm3.metric("Avg Risk (Worst-case)", f"{worst_avg:.1%}")
# Percentage of the worst-case total that was avoided; the 0.001 floor guards
# against a zero worst-case total.
risk_saved_pct = (stats["risk_saved"] / max(stats["worst_total"], 0.001)) * 100
rm4.metric("Risk Reduction", f"{risk_saved_pct:.1f}%",
delta=f"-{stats['risk_saved']:.2f} total score")
rm5.metric("High Risk Sequences", f"{stats['pct_high']:.0%}")
st.markdown(
tip("Worst-case", "Approximate upper bound on total risk: computed by running "
"the Hungarian algorithm on the negated cost matrix (maximize risk instead of minimize). "
"Represents a naive worst-possible assignment.") +
" · " +
tip("Risk Reduction", "Percentage reduction in total calibrated risk score: "
"(worst_total − optimal_total) / worst_total. "
"Reflects how much disruption risk the optimizer avoids vs. a naive assignment."),
unsafe_allow_html=True,
)
st.divider()
col_res1, col_res2 = st.columns([3, 2])
with col_res1:
st.subheader("Optimal Assignment")
if result_df.empty:
st.info("No feasible assignments found.")
else:
# Select and rename the columns shown to the user; the risk score is
# turned into a percentage string for display and for the CSV export.
disp_r = result_df[["Sequence", "flight_in", "arr_time",
"flight_out", "dep_time",
"turnaround_min", "risk_score", "risk_label"]].copy()
disp_r.columns = ["Sequence", "Inbound", "Arrived", "Outbound",
"Departs", "Turnaround (min)", "Risk Score", "Risk Level"]
disp_r["Risk Score"] = disp_r["Risk Score"].map("{:.1%}".format)
# Row styler: tints every cell of a row by its "Risk Level"
# (HIGH / MODERATE / LOW); any other level leaves the color empty.
def _cr(row):
c = {"HIGH":"rgba(214,39,40,0.25)","MODERATE":"rgba(255,127,14,0.25)","LOW":"rgba(44,160,44,0.25)"}.get(
str(row.get("Risk Level","")), "")
return [f"background-color:{c}" for _ in row]
st.dataframe(disp_r.style.apply(_cr, axis=1),
width='stretch', height=420)
# NOTE(review): the file name uses opt_source_label from the current rerun,
# which is not stored with the cached result — it may not describe the
# schedule the result was computed from.
st.download_button("Download Optimal Schedule",
disp_r.to_csv(index=False),
file_name=f"optimal_sequences_{opt_source_label}.csv",
mime="text/csv", key="dl_opt")
with col_res2:
st.subheader("Risk Distribution")
if not result_df.empty:
# Fixed HIGH/MODERATE/LOW order (missing levels count as 0) so the pie
# colors below always line up with the labels.
counts = result_df["risk_label"].value_counts().reindex(
["HIGH","MODERATE","LOW"], fill_value=0)
fig_pie = go.Figure(go.Pie(
labels=counts.index, values=counts.values,
marker_colors=["#d62728","#ff7f0e","#2ca02c"],
hole=0.45,
textinfo="label+percent+value",
))
fig_pie.update_layout(title="Assigned Sequences by Risk Level",
height=280, margin=dict(t=40,b=0,l=0,r=0))
st.plotly_chart(fig_pie, width='stretch')
# Optimal vs worst bar
fig_cmp = go.Figure([
go.Bar(name="Worst-case", x=["Total Risk Score"],
y=[stats["worst_total"]], marker_color="#d62728"),
go.Bar(name="Optimal", x=["Total Risk Score"],
y=[stats["optimal_total"]], marker_color="#2ca02c"),
])
fig_cmp.update_layout(
barmode="group", title="Optimal vs Worst-case Total Risk",
height=260, plot_bgcolor="rgba(0,0,0,0)",
margin=dict(t=40,b=40,l=40,r=20),
legend=dict(orientation="h", y=-0.2),
)
st.plotly_chart(fig_cmp, width='stretch')
# ── Gantt-style timeline ──────────────────────────────────
# One horizontal bar per assigned sequence, drawn from the inbound arrival
# time to the outbound departure time (minutes converted to hours) and
# colored by the sequence's risk score.
if not result_df.empty:
    st.subheader("Sequence Timeline (Gantt)")
    fig_g = go.Figure()
    for i, row in result_df.iterrows():
        # NOTE(review): times are looked up by airport only, so the first
        # flight from/to that airport is used — confirm this matches the
        # assigned flight when an airport has several flights in the window.
        arr_m = arrivals_o_r[arrivals_o_r["airport"] == row["airport_A"]]["time_min"]
        dep_m = departures_o_r[departures_o_r["airport"] == row["airport_B"]]["time_min"]
        a_t = float(arr_m.iloc[0]) if not arr_m.empty else 0
        # With no matching departure, fall back to a 90-minute bar.
        d_t = float(dep_m.iloc[0]) if not dep_m.empty else a_t + 90
        color = score_to_color(row["risk_score"])
        fig_g.add_trace(go.Scatter(
            x=[a_t/60, d_t/60], y=[i, i], mode="lines+markers",
            line=dict(color=color, width=8),
            marker=dict(size=8, color=["#555", color]),
            name=row["Sequence"],
            # Plotly hover text breaks lines with <br>; a raw newline inside
            # a single-quoted string literal is a syntax error.
            hovertemplate=(
                f"{row['Sequence']}<br>"
                f"Arr: {a_t/60:.2f}h | Dep: {d_t/60:.2f}h<br>"
                f"Turnaround: {row['turnaround_min']} min<br>"
                f"Risk: {row['risk_score']:.1%}"
            ),
            showlegend=False,
        ))
        # Airport codes sit just outside each end of the bar.
        fig_g.add_annotation(
            x=a_t/60, y=i, text=row["airport_A"],
            showarrow=False, xanchor="right", font=dict(size=9)
        )
        fig_g.add_annotation(
            x=d_t/60, y=i, text=row["airport_B"],
            showarrow=False, xanchor="left", font=dict(size=9)
        )
    fig_g.update_layout(
        xaxis=dict(title="Time (hour)", tickvals=list(range(0,25)),
                   ticktext=[f"{h:02d}:00" for h in range(25)]),
        yaxis=dict(visible=False),
        # Grow the figure with the number of sequences, never below 300 px.
        height=max(300, len(result_df) * 22 + 60),
        plot_bgcolor="rgba(0,0,0,0)",
        title="Each bar = one A→DFW→B sequence (color = risk level)",
        margin=dict(l=80, r=80, t=50, b=50),
    )
    st.plotly_chart(fig_g, width='stretch')
# ═══════════════════════════════════════════════════════════════════════════
# TAB 4: PAIR RISK QUERY
# ═══════════════════════════════════════════════════════════════════════════
# Pair Risk Query tab: intro text (with hover tooltips built by tip()) and
# the cached predictor used by every query below.
with tab_query:
st.header("Pair Risk Query")
st.markdown(
"Select an inbound origin (A) and outbound destination (B) to score the A→DFW→B " +
tip("sequence", "A crew sequence: a pilot or flight attendant arrives on an inbound leg from A, "
"turns at DFW (30–240 min), then departs on an outbound leg to B. "
"The model scores the weather disruption risk of this complete rotation.") +
". The " +
tip("calibrated risk score", "XGBoost score passed through isotonic regression calibration. "
"Directly interpretable: 0.30 = model predicts ~30% of A→DFW→B sequences "
"in this month are weather-disrupted (≥15 min delay or weather cancellation).") +
" reflects the predicted fraction of disrupted sequences for this pair-month.",
unsafe_allow_html=True,
)
# get_predictor() is decorated with st.cache_resource, so the model is built once.
pred = get_predictor()
# Three side-by-side selectors: inbound origin (A), outbound destination (B)
# and month. MCO and LAX are preselected when the predictor knows them.
col_qa, col_qb, col_qm = st.columns(3)
with col_qa:
    a_opts = pred.airports_a
    default_a = a_opts.index("MCO") if "MCO" in a_opts else 0
    airport_a = st.selectbox(
        "Inbound Origin (A)",
        a_opts,
        index=default_a,
        format_func=ap_meta.label,
        key="q_a",
    )
with col_qb:
    b_opts = pred.airports_b
    default_b = b_opts.index("LAX") if "LAX" in b_opts else 0
    airport_b = st.selectbox(
        "Outbound Destination (B)",
        b_opts,
        index=default_b,
        format_func=ap_meta.label,
        key="q_b",
    )
with col_qm:
    # The slider's printf-style format receives only the numeric value, so a
    # second "%s" placeholder has nothing to fill it. Show the number alone;
    # the month name is displayed by the caption underneath.
    q_month = st.slider("Month", 1, 12, 6, key="q_month", format="%d")
    st.caption(ap_meta.MONTH_NAMES[q_month])
st.markdown("---")
# Score the selected A→DFW→B pair for the chosen month and explain the result.
# DFW itself is rejected as A or B because it is the hub of every sequence.
if airport_a == "DFW" or airport_b == "DFW":
st.warning("DFW is the hub — select non-DFW airports for A and B.")
else:
result = pred.predict_pair(airport_a, airport_b, q_month)
# A None result is treated as "no data for this pair-month".
if result is None:
st.warning(
f"No historical data for **{airport_a} → DFW → {airport_b}** in month {q_month}. "
"This pair-month combination wasn't observed in BTS 2015–2024."
)
else:
# Layout: gauge + explanation side by side
col_g, col_e = st.columns([1, 2])
with col_g:
st.plotly_chart(
gauge_chart(result["risk_score"],
f"{airport_a} → DFW → {airport_b}"),
width='stretch',
)
# Key metrics
# Rendered as a two-column markdown table; the row labels carry HTML
# tooltips, hence unsafe_allow_html.
st.markdown(
f"| | |\n|--|--|\n"
f"| **Sequence** | {airport_a} → DFW → {airport_b} |\n"
f"| **Month** | {ap_meta.MONTH_NAMES[q_month]} |\n"
f"| {tip('Model Risk Score', 'Calibrated XGBoost score ≈ fraction of days in this month on which the A→DFW or DFW→B corridor experiences weather disruption. Isotonic regression maps raw model output to the observed corridor bad-rate scale.')} | {result['risk_score']:.1%} |\n"
f"| {tip('Observed Rotation Bad Rate', 'Fraction of tail-matched aircraft rotations (same tail number, same date, 30–240 min turn) in this pair-month (2015–2024) where the A→DFW inbound leg or DFW→B outbound leg had a weather delay ≥15 min, or a cascade was detected. This is the direct sample estimate of crew disruption frequency.')} | {result['observed_bad_rate']:.1%} |\n"
f"| {tip('Rotations Observed', 'Number of tail-matched aircraft rotations (same tail number, same date) on this corridor in this month across 2015–2024. The sample size for the observed bad rate. Typical value is 2–10 per year-month; rare corridors may have only 1.')} | {result['n_sequences']:,} |\n",
unsafe_allow_html=True,
)
# Recommendation box
# Three tiers, split at HIGH_THRESHOLD and MOD_THRESHOLD (defined elsewhere
# in this file).
score = result["risk_score"]
if score >= HIGH_THRESHOLD:
st.error(
f"**Recommendation: Do Not Assign**\n\n"
f"Model predicts **{score:.0%}** of sequences on this route are weather-disrupted "
f"in {ap_meta.MONTH_NAMES[q_month]} — above the {HIGH_THRESHOLD:.0%} high-risk threshold. "
f"Observed historical disruption rate: {result['observed_bad_rate']:.0%}."
)
elif score >= MOD_THRESHOLD:
st.warning(
f"**Recommendation: Caution**\n\n"
f"Model predicts **{score:.0%}** of sequences are disrupted — moderate risk. "
f"Consider buffer time or weather monitoring. "
f"Historical disruption rate: {result['observed_bad_rate']:.0%}."
)
else:
st.success(
f"**Recommendation: Acceptable**\n\n"
f"Model predicts **{score:.0%}** of sequences are disrupted — low risk. "
f"Historical disruption rate: {result['observed_bad_rate']:.0%}."
)
with col_e:
# SHAP explanation — cached in session_state to avoid re-init every render
# The cache key is the (A, B, month) triple. A failure is cached too (the
# exception object is stored), so it is not retried on every rerun.
shap_key = f"shap_{airport_a}_{airport_b}_{q_month}"
if shap_key not in st.session_state:
with st.spinner("Computing feature contributions (first time only)..."):
try:
st.session_state[shap_key] = pred.explain_pair(
result["X"], top_n=15,
gsom_imputed=result.get("gsom_imputed", set()),
)
except Exception as ex:
st.session_state[shap_key] = ex
shap_result = st.session_state[shap_key]
if isinstance(shap_result, Exception):
# Fallback: show the raw feature values with human-readable labels
# when no SHAP explanation is available.
st.info(f"SHAP explanation unavailable: {shap_result}")
feat_vals = result["X"].T.rename(columns={0: "Value"})
feat_vals.index = [FEATURE_LABELS.get(f, f) for f in feat_vals.index]
feat_vals = feat_vals.dropna()
st.dataframe(feat_vals.style.format("{:.4f}"), height=350)
else:
st.markdown(
tip("SHAP values", "SHapley Additive exPlanations. Each bar shows how much a feature "
"pushed the model output up (red, increases risk) or down (green, decreases risk) "
"relative to the average prediction. SHAP values are additive: they sum to the "
"difference between this prediction and the model's mean output.") +
" for this pair-month — features sorted by impact magnitude.",
unsafe_allow_html=True,
)
# Tell the user when any weather features were imputed rather than measured.
_gsom_imp = result.get("gsom_imputed", set())
if _gsom_imp:
st.info(
f"**★ GSOM weather features imputed** — {airport_a} or {airport_b} "
f"lacks a nearby NOAA weather station. "
f"{len(_gsom_imp)} feature(s) filled with the month-{q_month} population "
f"median across all airports that do have GSOM data. "
f"Starred (★) features in the chart used imputed values; their SHAP "
f"contributions reflect *typical* weather for this month, not measured values."
)
st.plotly_chart(shap_bar_chart(shap_result), width='stretch')
st.divider()
# Month-by-month risk for selected pair
# Line chart of the pair's risk score across the months returned by
# predict_all_months; months without a score (NaN) are drawn in grey and
# labeled "N/A" on hover.
st.subheader(f"Month-by-Month Risk: {airport_a} → DFW → {airport_b}")
monthly = pred.predict_all_months(airport_a, airport_b)
fig_monthly = go.Figure()
fig_monthly.add_trace(go.Scatter(
    x=monthly["Month"],
    y=monthly["risk_score"],
    mode="lines+markers",
    marker=dict(
        size=12,
        color=[score_to_color(s) if not np.isnan(s) else "#aaa"
               for s in monthly["risk_score"]],
        line=dict(width=1.5, color="black"),
    ),
    line=dict(color="#555", width=2),
    text=monthly["risk_score"].map(
        lambda s: f"{s:.1%}" if not np.isnan(s) else "N/A"
    ),
    # Plotly hover text breaks lines with <br>; a raw newline inside the
    # literal is a syntax error.
    hovertemplate="%{x}<br>Risk: %{text}",
))
# Shaded bands and dashed lines mark the moderate and high risk thresholds.
fig_monthly.add_hrect(y0=HIGH_THRESHOLD, y1=1.05, fillcolor="red", opacity=0.07, line_width=0)
fig_monthly.add_hrect(y0=MOD_THRESHOLD, y1=HIGH_THRESHOLD, fillcolor="orange", opacity=0.07, line_width=0)
fig_monthly.add_hline(y=HIGH_THRESHOLD, line_dash="dash", line_color="red", opacity=0.4)
fig_monthly.add_hline(y=MOD_THRESHOLD, line_dash="dash", line_color="orange", opacity=0.4)
fig_monthly.update_layout(
    xaxis=dict(tickvals=list(range(1, 13)),
               ticktext=[ap_meta.MONTH_NAMES[m][:3] for m in range(1, 13)]),
    yaxis=dict(title="Risk Score", range=[0, 1.05], tickformat=".0%"),
    height=300,
    plot_bgcolor="rgba(0,0,0,0)",
    title="Seasonal Risk Profile",
    showlegend=False,
)
# Mark the month currently selected in the slider. The slider range is 1–12,
# so q_month is never falsy and needs no guard.
fig_monthly.add_vline(x=q_month, line_dash="dot", line_color="#333",
                      annotation_text=ap_meta.MONTH_NAMES[q_month][:3],
                      annotation_position="top")
st.plotly_chart(fig_monthly, width='stretch')
# Side-by-side gauges for the selected sequence and its mirror image
# (B → DFW → A), shown inside a collapsible expander.
with st.expander("Compare: reversed sequence B → DFW → A"):
    result_rev = pred.predict_pair(airport_b, airport_a, q_month)
    if not result_rev:
        # No score is available for the mirrored pair in this month.
        st.info(f"No data for {airport_b} → DFW → {airport_a} in month {q_month}.")
    else:
        # The forward gauge falls back to 0 when the forward pair has no score.
        forward_score = result["risk_score"] if result else 0
        forward_title = f"{airport_a}→DFW→{airport_b}"
        reverse_title = f"{airport_b}→DFW→{airport_a}"
        left_col, right_col = st.columns(2)
        with left_col:
            st.plotly_chart(gauge_chart(forward_score, forward_title),
                            width='stretch')
        with right_col:
            st.plotly_chart(gauge_chart(result_rev["risk_score"], reverse_title),
                            width='stretch')
# ═══════════════════════════════════════════════════════════════════════════
# TAB 5: AIRPORT RISK MAP
# ═══════════════════════════════════════════════════════════════════════════
# Airport Risk Map tab: intro text, the four controls, theme colors, and the
# per-airport risk table that feeds the map.
with tab_map:
st.header("Airport Risk Map")
st.markdown(
"Airports sized and colored by " +
tip("average calibrated risk score", "Mean of the calibrated XGBoost risk scores "
"across all pair-months involving this airport in the selected role and month. "
"Color and marker size both scale with risk (green=low, red=high). "
"Normalized to the calibrated score range [0%, 50%].") +
". DFW is the hub for all sequences. Spoke lines connect DFW to the 10 riskiest airports.",
unsafe_allow_html=True,
)
# NOTE(review): scores_map is not referenced again in the visible part of this tab.
scores_map = get_pair_scores()
col_m1, col_m2, col_m3, col_m4 = st.columns(4)
with col_m1:
map_month = st.slider("Month", 1, 12, 6, key="map_month")
st.caption(ap_meta.MONTH_NAMES[map_month])
with col_m2:
map_role = st.radio("Airport Role", ["As Origin (A)", "As Destination (B)"], horizontal=True)
with col_m3:
map_top_n = st.slider("Show top N airports", 10, 200, 30)
with col_m4:
st.caption("Map style follows the\n🌙 Dark mode toggle in the sidebar.")
# Map geo colors follow global dark_mode toggle
if dark_mode:
_land = "rgba(40,40,40,0.7)"
_lake = "rgba(30,80,120,0.5)"
_coast = "rgba(160,160,160,0.5)"
_sub = "rgba(160,160,160,0.3)"
_bg = "rgba(15,17,22,0.0)"
else:
_land = "#e8ecf0"
_lake = "#c6dff0"
_coast = "#aaaaaa"
_sub = "#cccccc"
_bg = "rgba(0,0,0,0)"
# Translate the radio choice into the key passed to get_map_group and the
# label used in titles.
role_key = "origin" if map_role == "As Origin (A)" else "dest"
role_label = "Origin" if map_role == "As Origin (A)" else "Destination"
grp = get_map_group(map_month, role_key)
# Keep only the N airports with the highest average risk.
grp = grp.nlargest(map_top_n, "avg_risk")
# Look up metadata for the selected airports plus DFW (passed as a tuple),
# then attach it and drop airports that have no coordinates.
ap_df = get_airport_df(tuple(grp["airport"].tolist() + ["DFW"]))
grp = grp.merge(ap_df, left_on="airport", right_on="iata", how="left")
grp = grp.dropna(subset=["lat", "lon"])
fig_map = go.Figure()
# DFW hub marker
dfw_info = ap_meta.get("DFW")
# Both the hub star and the spoke lines need DFW's coordinates, so they are
# drawn only when the lookup provides a latitude.
if dfw_info.get("lat"):
    fig_map.add_trace(go.Scattergeo(
        lon=[dfw_info["lon"]], lat=[dfw_info["lat"]],
        mode="markers+text",
        marker=dict(size=18, color="#1f77b4", symbol="star",
                    line=dict(width=2, color="white")),
        text=["DFW"], textposition="top right",
        name="DFW Hub",
        # Plotly hover text breaks lines with <br>; a raw newline inside the
        # literal is a syntax error.
        hovertemplate="DFW — Dallas/Fort Worth<br>Hub airport (all sequences pass through)",
    ))
    # Draw spoke lines to top-10 riskiest
    # (the first ten rows of grp, each colored by its own average risk)
    for _, row in grp.head(10).iterrows():
        if pd.notna(row["lat"]) and pd.notna(row["lon"]):
            fig_map.add_trace(go.Scattergeo(
                lon=[dfw_info["lon"], row["lon"]],
                lat=[dfw_info["lat"], row["lat"]],
                mode="lines",
                line=dict(width=1, color=score_to_color(row["avg_risk"])),
                opacity=0.4,
                showlegend=False,
                hoverinfo="skip",
            ))
# Airport markers
# One marker per airport; both size and color scale with avg_risk. The color
# scale is clamped to 0–0.5, so anything above 50% renders as the top color.
fig_map.add_trace(go.Scattergeo(
    lon=grp["lon"],
    lat=grp["lat"],
    mode="markers+text",
    marker=dict(
        size=grp["avg_risk"] * 30 + 6,
        color=grp["avg_risk"],
        colorscale="RdYlGn_r",
        cmin=0, cmax=0.5,
        showscale=True,
        colorbar=dict(title="Avg Risk", tickformat=".0%", x=1.0,
                      tickvals=[0, 0.1, 0.2, 0.3, 0.4, 0.5],
                      ticktext=["0%","10%","20%","30%","40%","≥50%"]),
        line=dict(width=0.8, color="black"),
    ),
    text=grp["airport"],
    textposition="top center",
    textfont=dict(size=9),
    name=f"Airports as {role_label}",
    # Plotly hover text breaks lines with <br>; raw newlines inside the
    # literals are a syntax error. customdata columns are n_pairs and
    # worst_partner, in that order.
    hovertemplate=(
        "%{text}<br>"
        "Avg Risk: %{marker.color:.1%}<br>"
        "N Pairs: %{customdata[0]}<br>"
        "Worst Partner: %{customdata[1]}"
    ),
    customdata=grp[["n_pairs", "worst_partner"]].values,
))
# Finish the map (USA scope, Albers projection, theme colors chosen above),
# render it, then list the same airports in a table underneath.
fig_map.update_layout(
geo=dict(
scope="usa",
projection_type="albers usa",
bgcolor=_bg,
showland=True, landcolor=_land,
showlakes=True, lakecolor=_lake,
showcoastlines=True, coastlinecolor=_coast,
showsubunits=True, subunitcolor=_sub,
showframe=False,
),
title=f"Airport Risk Map — {role_label} — {ap_meta.MONTH_NAMES[map_month]}",
height=560,
margin=dict(t=40, b=0, l=0, r=0),
legend=dict(yanchor="bottom", y=0.01, xanchor="left", x=0.01),
)
st.plotly_chart(fig_map, width='stretch')
# Table below map
# NOTE(review): the heading says "Top {map_top_n}", but grp may hold fewer
# rows after airports without coordinates were dropped.
st.subheader(f"Top {map_top_n} Airports by Risk ({ap_meta.MONTH_NAMES[map_month]})")
tbl = grp[["airport", "city", "state", "avg_risk", "n_pairs", "worst_partner"]].copy()
tbl.columns = ["Airport", "City", "State", "Avg Risk", "N Pairs", "Worst Partner"]
# The risk is shown as a percentage string, so the column no longer sorts numerically.
tbl["Avg Risk"] = tbl["Avg Risk"].map("{:.1%}".format)
st.markdown(
tip("Avg Risk", "Mean calibrated risk score across all pair-months involving this airport "
"in the selected month and role. After isotonic calibration, this approximates "
"the average fraction of sequences on routes through this airport that are disrupted.") +
" · " +
tip("Worst Partner", "The airport B (or A) that, when paired with this airport, "
"produces the highest calibrated risk score in the selected month."),
unsafe_allow_html=True,
)
st.dataframe(tbl, width='stretch', height=320)