Spaces:
Running
Running
"""Walk-forward backtest: fitted Dixon-Coles Elo model vs baselines.

For each test year Y in TEST_YEARS, the Dixon-Coles Elo model is refit (same
MLE procedure as fit_elo_dixon_coles.py) on only the matches strictly before
Y, then scored on every match played during Y. This mirrors how the model
would actually have been used -- no fold sees data from its own future.

Three candidates are scored on each fold's test matches:

- uniform: 1/3 / 1/3 / 1/3 regardless of teams (no skill baseline).
- current: the pre-remediation model previously shipped in
  underdog_lab.world_cup.forecasting.MODEL (independent Poisson,
  elo_scale=0.00165, home_advantage_elo=0, rho=0 -- hand-set, never fit).
- fitted: this fold's freshly-fit Dixon-Coles Elo model (intercept,
  elo_scale, home_advantage_elo, rho all fit on the training window only).

Metrics: mean log loss, Brier score, and Ranked Probability Score (RPS) per
candidate, summed/averaged across all test folds. A basic calibration table
for the fitted model's predicted home-win probability is also produced.

Ship gate: the fitted model must beat both "uniform" and "current" on mean
log loss across all test folds combined and beat "current" on the
neutral-venue subset that most closely matches World Cup inference. Writes
models/backtest_report.json with the full breakdown and the gate verdict.
Does not modify
src/underdog_lab/world_cup/forecasting.py -- that swap is a separate,
human-reviewed step gated on this report's verdict.

Usage:
    python scripts/backtest_walk_forward.py
"""
# The docstring must be the first statement in the file to become the
# module's __doc__; a __future__ import is allowed to follow it.
from __future__ import annotations
| import json | |
| from datetime import date | |
| from types import SimpleNamespace | |
| from underdog_lab.config import MODEL_DIR | |
| from underdog_lab.forecasting.elo_goals import EloGoalModel | |
| from underdog_lab.forecasting.poisson import forecast_from_lambdas | |
| from backtest_common import ( | |
| calibration_table, | |
| fit_dixon_coles, | |
| load_matches_with_self_elo, | |
| observed_outcome, | |
| score_candidate, | |
| ) | |
# Destination of the JSON report (per-fold breakdown plus ship-gate verdict).
REPORT_PATH = MODEL_DIR / "backtest_report.json"

# Half-life, in days, handed to fit_dixon_coles for every fold's refit.
# Picked by scripts/upgrade_evaluation.py: it beats the previous 1095-day
# (3 year) half-life on mean log loss across the 2018-2025 selection folds
# and on the held-out 2026 confirmation fold, overall as well as on the
# neutral-venue subset. See models/upgrade_evaluation.json.
HALF_LIFE_DAYS = 180.0

# Years used as walk-forward test folds. The dataset starts 2015-01-03, so
# the first fold (2018) still trains on roughly three years of matches.
TEST_YEARS = [*range(2018, 2027)]

# No-skill baseline: the same probability for home win, draw and away win.
UNIFORM_FORECAST = SimpleNamespace(
    p_home=1 / 3,
    p_draw=1 / 3,
    p_away=1 / 3,
)

# The pre-remediation model previously shipped in world_cup/forecasting.py:
# independent Poisson (rho=0), hand-set elo_scale, no home advantage.
# NOTE(review): the intercept is numerically ln(1.1) -- presumably a base
# rate of 1.1 goals per side at equal Elo; confirm against EloGoalModel.
CURRENT_MODEL = EloGoalModel(
    home_advantage_elo=0.0,
    elo_scale=0.00165,
    intercept=0.09531017980432493,
)
def current_model_forecast(home_elo: float, away_elo: float, neutral: bool):
    """Forecast one match with the previously shipped CURRENT_MODEL.

    Args:
        home_elo: Elo rating of the home side.
        away_elo: Elo rating of the away side.
        neutral: Forwarded to the model as ``neutral_venue``.

    Returns:
        Whatever ``forecast_from_lambdas`` builds from the two goal rates
        that ``CURRENT_MODEL.lambdas`` produces.
    """
    home_rate, away_rate = CURRENT_MODEL.lambdas(
        home_elo,
        away_elo,
        neutral_venue=neutral,
    )
    return forecast_from_lambdas(home_rate, away_rate)
def run_fold(test_year: int, all_matches: list[dict]) -> dict:
    """Refit on matches before ``test_year`` and score that year's matches.

    Args:
        test_year: Calendar year whose matches form the test set.
        all_matches: Match records; each is read for the keys ``date``,
            ``home_goals``, ``away_goals``, ``home_elo``, ``away_elo`` and
            ``neutral``.

    Returns:
        An empty dict when ``test_year`` has no matches. Otherwise a dict
        with the fold sizes, the fitted parameters, per-candidate mean
        scores over all test matches (``mean_scores``) and over the
        neutral-venue ones (``neutral_mean_scores``, empty when there are
        none), and the raw ``calibration_rows`` of
        ``(fitted p_home, home side won)`` pairs.
    """
    # Training window: everything up to 31 December of the previous year.
    train_cutoff = date(test_year - 1, 12, 31)
    train_matches: list[dict] = []
    test_matches: list[dict] = []
    for row in all_matches:
        played_on = row["date"]
        if played_on <= train_cutoff:
            train_matches.append(row)
        elif played_on.year == test_year:
            test_matches.append(row)

    if not test_matches:
        return {}

    fitted_model = fit_dixon_coles(train_matches, train_cutoff, HALF_LIFE_DAYS)

    candidate_names = ("uniform", "current", "fitted")
    sums: dict = {}
    for scope in ("all", "neutral"):
        sums[scope] = {
            name: {"log_loss": 0.0, "brier": 0.0, "rps": 0.0}
            for name in candidate_names
        }
    scored = {"all": 0, "neutral": 0}
    calibration_rows = []

    for row in test_matches:
        result = observed_outcome(row["home_goals"], row["away_goals"])
        current_forecast = current_model_forecast(
            row["home_elo"], row["away_elo"], row["neutral"]
        )
        fitted_forecast = fitted_model.forecast(
            row["home_elo"], row["away_elo"], neutral_venue=row["neutral"]
        )
        by_candidate = {
            "uniform": UNIFORM_FORECAST,
            "current": current_forecast,
            "fitted": fitted_forecast,
        }

        # Every match counts towards "all"; neutral-venue matches are also
        # accumulated into the separate "neutral" scope.
        scopes = ("all", "neutral") if row["neutral"] else ("all",)
        for scope in scopes:
            scored[scope] += 1
            scope_sums = sums[scope]
            for name, forecast in by_candidate.items():
                bucket = scope_sums[name]
                for metric, value in score_candidate(forecast, result).items():
                    bucket[metric] += value

        calibration_rows.append((fitted_forecast.p_home, result == "home"))

    # Turn the sums into means, skipping any scope that scored no matches.
    means: dict = {}
    for scope, scope_sums in sums.items():
        n_scored = scored[scope]
        if not n_scored:
            continue
        means[scope] = {
            name: {
                metric: total / n_scored
                for metric, total in metric_sums.items()
            }
            for name, metric_sums in scope_sums.items()
        }

    fitted_params = {
        "intercept": fitted_model.intercept,
        "elo_scale": fitted_model.elo_scale,
        "home_advantage_elo": fitted_model.home_advantage_elo,
        "rho": fitted_model.rho,
    }
    return {
        "test_year": test_year,
        "train_matches": len(train_matches),
        "test_matches": len(test_matches),
        "neutral_test_matches": scored["neutral"],
        "fitted_params": fitted_params,
        "mean_scores": means["all"],
        "neutral_mean_scores": means.get("neutral", {}),
        "calibration_rows": calibration_rows,
    }
def _pooled_means(folds: list[dict], scores_key: str, count_key: str) -> dict:
    """Pool per-fold mean scores into match-weighted means across all folds.

    Each fold stores per-candidate metric means under ``scores_key`` and the
    number of matches behind them under ``count_key``. Multiplying each mean
    back by its count and dividing by the grand total gives the mean over
    every scored match. When no fold contributed a match, every metric is
    ``None`` instead of a division by zero or a made-up number.
    """
    total_matches = sum(fold[count_key] for fold in folds)
    pooled = {
        candidate: {"log_loss": 0.0, "brier": 0.0, "rps": 0.0}
        for candidate in ("uniform", "current", "fitted")
    }
    for fold in folds:
        weight = fold[count_key]
        for candidate, metric_means in fold[scores_key].items():
            for metric, mean_value in metric_means.items():
                pooled[candidate][metric] += mean_value * weight
    return {
        candidate: {
            metric: (total / total_matches if total_matches else None)
            for metric, total in metric_totals.items()
        }
        for candidate, metric_totals in pooled.items()
    }


def main() -> None:
    """Run every walk-forward fold, evaluate the ship gate, write the report.

    Writes REPORT_PATH as JSON and prints the overall mean scores and the
    gate verdict.

    Raises:
        RuntimeError: If no year in TEST_YEARS has any test match, so there
            is nothing to score.
    """
    # NOTE(review): the hard-coded date is presumably the data snapshot /
    # cutoff understood by the loader -- confirm in backtest_common.
    all_matches = load_matches_with_self_elo(date(2026, 6, 12))

    folds = []
    for test_year in TEST_YEARS:
        fold = run_fold(test_year, all_matches)
        if fold:
            folds.append(fold)
    if not folds:
        raise RuntimeError(
            "No test matches found for any year in TEST_YEARS; "
            "cannot score the backtest."
        )

    # Calibration rows are pooled across folds and removed from each fold so
    # the raw per-match rows are not dumped into the report.
    all_calibration_rows: list[tuple[float, bool]] = []
    for fold in folds:
        all_calibration_rows.extend(fold.pop("calibration_rows"))

    total_test_matches = sum(fold["test_matches"] for fold in folds)
    neutral_test_matches = sum(fold["neutral_test_matches"] for fold in folds)
    overall = _pooled_means(folds, "mean_scores", "test_matches")
    neutral_overall = _pooled_means(
        folds, "neutral_mean_scores", "neutral_test_matches"
    )

    fitted_beats_uniform = (
        overall["fitted"]["log_loss"] < overall["uniform"]["log_loss"]
    )
    fitted_beats_current = (
        overall["fitted"]["log_loss"] < overall["current"]["log_loss"]
    )
    # Fail closed: with no neutral-venue test matches there is no evidence
    # for the neutral criterion, so the gate cannot pass.
    fitted_beats_neutral_current = neutral_test_matches > 0 and (
        neutral_overall["fitted"]["log_loss"]
        < neutral_overall["current"]["log_loss"]
    )
    ship = (
        fitted_beats_uniform
        and fitted_beats_current
        and fitted_beats_neutral_current
    )

    report = {
        "test_years": TEST_YEARS,
        "half_life_days": HALF_LIFE_DAYS,
        "total_test_matches": total_test_matches,
        "neutral_test_matches": neutral_test_matches,
        "folds": folds,
        "overall_mean_scores": overall,
        "neutral_mean_scores": neutral_overall,
        "calibration_home_win": calibration_table(all_calibration_rows),
        "ship_gate": {
            "fitted_beats_uniform_log_loss": fitted_beats_uniform,
            "fitted_beats_current_log_loss": fitted_beats_current,
            "fitted_beats_current_neutral_log_loss": (
                fitted_beats_neutral_current
            ),
            "ship": ship,
            # The year range is derived from TEST_YEARS so the text cannot
            # drift from the folds that were actually configured.
            "criterion": (
                "The fitted Dixon-Coles Elo model must have a lower mean "
                "log loss than both the uniform baseline and the model "
                "previously shipped in world_cup/forecasting.py, both overall "
                "and on neutral-venue matches, across walk-forward test folds "
                f"({min(TEST_YEARS)}-{max(TEST_YEARS)}, no fold trained on "
                "its own test data)."
            ),
        },
    }

    MODEL_DIR.mkdir(parents=True, exist_ok=True)
    REPORT_PATH.write_text(json.dumps(report, indent=2) + "\n", encoding="utf-8")
    print(f"Wrote {REPORT_PATH}")
    print(json.dumps(report["overall_mean_scores"], indent=2))
    print(json.dumps(report["ship_gate"], indent=2))
# Run the backtest only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()