underdog-lab / scripts /backtest_walk_forward.py
Moftah
Tune recency and add ensemble evaluation
b05ddd5
Raw
History Blame Contribute Delete
9.03 kB
from __future__ import annotations
"""Walk-forward backtest: fitted Dixon-Coles Elo model vs baselines.
For each test year Y in TEST_YEARS, the Dixon-Coles Elo model is refit (same
MLE procedure as fit_elo_dixon_coles.py) on only the matches strictly before
Y, then scored on every match played during Y. This mirrors how the model
would actually have been used -- no fold sees data from its own future.
Three candidates are scored on each fold's test matches:
- uniform: 1/3 / 1/3 / 1/3 regardless of teams (no skill baseline).
- current: the pre-remediation model previously shipped in
underdog_lab.world_cup.forecasting.MODEL (independent Poisson,
elo_scale=0.00165, home_advantage_elo=0, rho=0 -- hand-set, never fit).
- fitted: this fold's freshly-fit Dixon-Coles Elo model (intercept,
elo_scale, home_advantage_elo, rho all fit on the training window only).
Metrics: mean log loss, Brier score, and Rank Probability Score (RPS) per
candidate, summed/averaged across all test folds. A basic calibration table
for the fitted model's predicted home-win probability is also produced.
Ship gate: the fitted model must beat both "uniform" and "current" on mean
log loss across all test folds combined and beat "current" on the
neutral-venue subset that most closely matches World Cup inference. Writes
models/backtest_report.json with the full breakdown and the gate verdict.
Does not modify
src/underdog_lab/world_cup/forecasting.py -- that swap is a separate,
human-reviewed step gated on this report's verdict.
Usage:
python scripts/backtest_walk_forward.py
"""
import json
from datetime import date
from types import SimpleNamespace
from underdog_lab.config import MODEL_DIR
from underdog_lab.forecasting.elo_goals import EloGoalModel
from underdog_lab.forecasting.poisson import forecast_from_lambdas
from backtest_common import (
calibration_table,
fit_dixon_coles,
load_matches_with_self_elo,
observed_outcome,
score_candidate,
)
REPORT_PATH = MODEL_DIR / "backtest_report.json"
# Selected by scripts/upgrade_evaluation.py: beats the previous 1095-day
# (3 year) half-life on mean log loss across 2018-2025 selection folds AND
# on the held-out 2026 confirmation fold, both overall and on the
# neutral-venue subset. See models/upgrade_evaluation.json.
HALF_LIFE_DAYS = 180.0
# Test years: the dataset starts 2015-01-03, so 2018 onward leaves at least
# three years of training data for the first fold.
TEST_YEARS = list(range(2018, 2027))
UNIFORM_FORECAST = SimpleNamespace(p_home=1 / 3, p_draw=1 / 3, p_away=1 / 3)
# The pre-remediation model previously shipped in world_cup/forecasting.py:
# independent Poisson (rho=0), hand-set elo_scale, no home advantage.
CURRENT_MODEL = EloGoalModel(
intercept=0.09531017980432493,
elo_scale=0.00165,
home_advantage_elo=0.0,
)
def current_model_forecast(home_elo: float, away_elo: float, neutral: bool):
lambda_home, lambda_away = CURRENT_MODEL.lambdas(home_elo, away_elo, neutral_venue=neutral)
return forecast_from_lambdas(lambda_home, lambda_away)
def run_fold(test_year: int, all_matches: list[dict]) -> dict:
train_cutoff = date(test_year - 1, 12, 31)
train_matches = [m for m in all_matches if m["date"] <= train_cutoff]
test_matches = [m for m in all_matches if m["date"].year == test_year]
if not test_matches:
return {}
fitted_model = fit_dixon_coles(train_matches, train_cutoff, HALF_LIFE_DAYS)
totals = {
scope: {
name: {"log_loss": 0.0, "brier": 0.0, "rps": 0.0}
for name in ("uniform", "current", "fitted")
}
for scope in ("all", "neutral")
}
counts = {"all": 0, "neutral": 0}
calibration_rows = []
for match in test_matches:
outcome = observed_outcome(match["home_goals"], match["away_goals"])
current_forecast = current_model_forecast(match["home_elo"], match["away_elo"], match["neutral"])
fitted_forecast = fitted_model.forecast(match["home_elo"], match["away_elo"], neutral_venue=match["neutral"])
forecasts = {
"uniform": UNIFORM_FORECAST,
"current": current_forecast,
"fitted": fitted_forecast,
}
scopes = ["all"] + (["neutral"] if match["neutral"] else [])
for scope in scopes:
counts[scope] += 1
for candidate, forecast in forecasts.items():
for metric, value in score_candidate(forecast, outcome).items():
totals[scope][candidate][metric] += value
calibration_rows.append((fitted_forecast.p_home, outcome == "home"))
means = {
scope: {
candidate: {
metric: total / counts[scope]
for metric, total in metric_totals.items()
}
for candidate, metric_totals in scope_totals.items()
}
for scope, scope_totals in totals.items()
if counts[scope]
}
return {
"test_year": test_year,
"train_matches": len(train_matches),
"test_matches": len(test_matches),
"neutral_test_matches": counts["neutral"],
"fitted_params": {
"intercept": fitted_model.intercept,
"elo_scale": fitted_model.elo_scale,
"home_advantage_elo": fitted_model.home_advantage_elo,
"rho": fitted_model.rho,
},
"mean_scores": means["all"],
"neutral_mean_scores": means.get("neutral", {}),
"calibration_rows": calibration_rows,
}
def main() -> None:
all_matches = load_matches_with_self_elo(date(2026, 6, 12))
folds = []
for test_year in TEST_YEARS:
fold = run_fold(test_year, all_matches)
if fold:
folds.append(fold)
all_calibration_rows: list[tuple[float, bool]] = []
for fold in folds:
all_calibration_rows.extend(fold.pop("calibration_rows"))
total_test_matches = sum(fold["test_matches"] for fold in folds)
neutral_test_matches = sum(fold["neutral_test_matches"] for fold in folds)
overall = {candidate: {"log_loss": 0.0, "brier": 0.0, "rps": 0.0} for candidate in ("uniform", "current", "fitted")}
neutral_overall = {
candidate: {"log_loss": 0.0, "brier": 0.0, "rps": 0.0}
for candidate in ("uniform", "current", "fitted")
}
for fold in folds:
for candidate, metric_means in fold["mean_scores"].items():
for metric, mean_value in metric_means.items():
overall[candidate][metric] += mean_value * fold["test_matches"]
for candidate, metric_means in fold["neutral_mean_scores"].items():
for metric, mean_value in metric_means.items():
neutral_overall[candidate][metric] += (
mean_value * fold["neutral_test_matches"]
)
for candidate, metric_totals in overall.items():
for metric in metric_totals:
overall[candidate][metric] /= total_test_matches
for candidate, metric_totals in neutral_overall.items():
for metric in metric_totals:
neutral_overall[candidate][metric] /= neutral_test_matches
fitted_beats_uniform = overall["fitted"]["log_loss"] < overall["uniform"]["log_loss"]
fitted_beats_current = overall["fitted"]["log_loss"] < overall["current"]["log_loss"]
fitted_beats_neutral_current = (
neutral_overall["fitted"]["log_loss"]
< neutral_overall["current"]["log_loss"]
)
ship = (
fitted_beats_uniform
and fitted_beats_current
and fitted_beats_neutral_current
)
report = {
"test_years": TEST_YEARS,
"half_life_days": HALF_LIFE_DAYS,
"total_test_matches": total_test_matches,
"neutral_test_matches": neutral_test_matches,
"folds": folds,
"overall_mean_scores": overall,
"neutral_mean_scores": neutral_overall,
"calibration_home_win": calibration_table(all_calibration_rows),
"ship_gate": {
"fitted_beats_uniform_log_loss": fitted_beats_uniform,
"fitted_beats_current_log_loss": fitted_beats_current,
"fitted_beats_current_neutral_log_loss": (
fitted_beats_neutral_current
),
"ship": ship,
"criterion": (
"The fitted Dixon-Coles Elo model must have a lower mean "
"log loss than both the uniform baseline and the model "
"previously shipped in world_cup/forecasting.py, both overall "
"and on neutral-venue matches, across walk-forward test folds "
"(2018-2026, no fold trained on its own test data)."
),
},
}
MODEL_DIR.mkdir(parents=True, exist_ok=True)
REPORT_PATH.write_text(json.dumps(report, indent=2) + "\n", encoding="utf-8")
print(f"Wrote {REPORT_PATH}")
print(json.dumps(report["overall_mean_scores"], indent=2))
print(json.dumps(report["ship_gate"], indent=2))
if __name__ == "__main__":
main()