Spaces:

sammoftah
/

underdog-lab

Running

underdog-lab / scripts /backtest_walk_forward.py

Moftah

Tune recency and add ensemble evaluation

b05ddd5 17 days ago

9.03 kB

	from __future__ import annotations

	"""Walk-forward backtest: fitted Dixon-Coles Elo model vs baselines.

	For each test year Y in TEST_YEARS, the Dixon-Coles Elo model is refit (same
	MLE procedure as fit_elo_dixon_coles.py) on only the matches strictly before
	Y, then scored on every match played during Y. This mirrors how the model
	would actually have been used -- no fold sees data from its own future.

	Three candidates are scored on each fold's test matches:

	- uniform: 1/3 / 1/3 / 1/3 regardless of teams (no skill baseline).
	- current: the pre-remediation model previously shipped in
	underdog_lab.world_cup.forecasting.MODEL (independent Poisson,
	elo_scale=0.00165, home_advantage_elo=0, rho=0 -- hand-set, never fit).
	- fitted: this fold's freshly-fit Dixon-Coles Elo model (intercept,
	elo_scale, home_advantage_elo, rho all fit on the training window only).

	Metrics: mean log loss, Brier score, and Rank Probability Score (RPS) per
	candidate, summed/averaged across all test folds. A basic calibration table
	for the fitted model's predicted home-win probability is also produced.

	Ship gate: the fitted model must beat both "uniform" and "current" on mean
	log loss across all test folds combined and beat "current" on the
	neutral-venue subset that most closely matches World Cup inference. Writes
	models/backtest_report.json with the full breakdown and the gate verdict.
	Does not modify
	src/underdog_lab/world_cup/forecasting.py -- that swap is a separate,
	human-reviewed step gated on this report's verdict.

	Usage:
	python scripts/backtest_walk_forward.py
	"""

	import json
	from datetime import date
	from types import SimpleNamespace

	from underdog_lab.config import MODEL_DIR
	from underdog_lab.forecasting.elo_goals import EloGoalModel
	from underdog_lab.forecasting.poisson import forecast_from_lambdas

	from backtest_common import (
	calibration_table,
	fit_dixon_coles,
	load_matches_with_self_elo,
	observed_outcome,
	score_candidate,
	)

	REPORT_PATH = MODEL_DIR / "backtest_report.json"
	# Selected by scripts/upgrade_evaluation.py: beats the previous 1095-day
	# (3 year) half-life on mean log loss across 2018-2025 selection folds AND
	# on the held-out 2026 confirmation fold, both overall and on the
	# neutral-venue subset. See models/upgrade_evaluation.json.
	HALF_LIFE_DAYS = 180.0

	# Test years: the dataset starts 2015-01-03, so 2018 onward leaves at least
	# three years of training data for the first fold.
	TEST_YEARS = list(range(2018, 2027))

	UNIFORM_FORECAST = SimpleNamespace(p_home=1 / 3, p_draw=1 / 3, p_away=1 / 3)

	# The pre-remediation model previously shipped in world_cup/forecasting.py:
	# independent Poisson (rho=0), hand-set elo_scale, no home advantage.
	CURRENT_MODEL = EloGoalModel(
	intercept=0.09531017980432493,
	elo_scale=0.00165,
	home_advantage_elo=0.0,
	)


	def current_model_forecast(home_elo: float, away_elo: float, neutral: bool):
	lambda_home, lambda_away = CURRENT_MODEL.lambdas(home_elo, away_elo, neutral_venue=neutral)
	return forecast_from_lambdas(lambda_home, lambda_away)


	def run_fold(test_year: int, all_matches: list[dict]) -> dict:
	train_cutoff = date(test_year - 1, 12, 31)
	train_matches = [m for m in all_matches if m["date"] <= train_cutoff]
	test_matches = [m for m in all_matches if m["date"].year == test_year]
	if not test_matches:
	return {}

	fitted_model = fit_dixon_coles(train_matches, train_cutoff, HALF_LIFE_DAYS)

	totals = {
	scope: {
	name: {"log_loss": 0.0, "brier": 0.0, "rps": 0.0}
	for name in ("uniform", "current", "fitted")
	}
	for scope in ("all", "neutral")
	}
	counts = {"all": 0, "neutral": 0}
	calibration_rows = []
	for match in test_matches:
	outcome = observed_outcome(match["home_goals"], match["away_goals"])

	current_forecast = current_model_forecast(match["home_elo"], match["away_elo"], match["neutral"])
	fitted_forecast = fitted_model.forecast(match["home_elo"], match["away_elo"], neutral_venue=match["neutral"])
	forecasts = {
	"uniform": UNIFORM_FORECAST,
	"current": current_forecast,
	"fitted": fitted_forecast,
	}
	scopes = ["all"] + (["neutral"] if match["neutral"] else [])
	for scope in scopes:
	counts[scope] += 1
	for candidate, forecast in forecasts.items():
	for metric, value in score_candidate(forecast, outcome).items():
	totals[scope][candidate][metric] += value
	calibration_rows.append((fitted_forecast.p_home, outcome == "home"))

	means = {
	scope: {
	candidate: {
	metric: total / counts[scope]
	for metric, total in metric_totals.items()
	}
	for candidate, metric_totals in scope_totals.items()
	}
	for scope, scope_totals in totals.items()
	if counts[scope]
	}
	return {
	"test_year": test_year,
	"train_matches": len(train_matches),
	"test_matches": len(test_matches),
	"neutral_test_matches": counts["neutral"],
	"fitted_params": {
	"intercept": fitted_model.intercept,
	"elo_scale": fitted_model.elo_scale,
	"home_advantage_elo": fitted_model.home_advantage_elo,
	"rho": fitted_model.rho,
	},
	"mean_scores": means["all"],
	"neutral_mean_scores": means.get("neutral", {}),
	"calibration_rows": calibration_rows,
	}


	def main() -> None:
	all_matches = load_matches_with_self_elo(date(2026, 6, 12))

	folds = []
	for test_year in TEST_YEARS:
	fold = run_fold(test_year, all_matches)
	if fold:
	folds.append(fold)

	all_calibration_rows: list[tuple[float, bool]] = []
	for fold in folds:
	all_calibration_rows.extend(fold.pop("calibration_rows"))

	total_test_matches = sum(fold["test_matches"] for fold in folds)
	neutral_test_matches = sum(fold["neutral_test_matches"] for fold in folds)
	overall = {candidate: {"log_loss": 0.0, "brier": 0.0, "rps": 0.0} for candidate in ("uniform", "current", "fitted")}
	neutral_overall = {
	candidate: {"log_loss": 0.0, "brier": 0.0, "rps": 0.0}
	for candidate in ("uniform", "current", "fitted")
	}
	for fold in folds:
	for candidate, metric_means in fold["mean_scores"].items():
	for metric, mean_value in metric_means.items():
	overall[candidate][metric] += mean_value * fold["test_matches"]
	for candidate, metric_means in fold["neutral_mean_scores"].items():
	for metric, mean_value in metric_means.items():
	neutral_overall[candidate][metric] += (
	mean_value * fold["neutral_test_matches"]
	)
	for candidate, metric_totals in overall.items():
	for metric in metric_totals:
	overall[candidate][metric] /= total_test_matches
	for candidate, metric_totals in neutral_overall.items():
	for metric in metric_totals:
	neutral_overall[candidate][metric] /= neutral_test_matches

	fitted_beats_uniform = overall["fitted"]["log_loss"] < overall["uniform"]["log_loss"]
	fitted_beats_current = overall["fitted"]["log_loss"] < overall["current"]["log_loss"]
	fitted_beats_neutral_current = (
	neutral_overall["fitted"]["log_loss"]
	< neutral_overall["current"]["log_loss"]
	)
	ship = (
	fitted_beats_uniform
	and fitted_beats_current
	and fitted_beats_neutral_current
	)

	report = {
	"test_years": TEST_YEARS,
	"half_life_days": HALF_LIFE_DAYS,
	"total_test_matches": total_test_matches,
	"neutral_test_matches": neutral_test_matches,
	"folds": folds,
	"overall_mean_scores": overall,
	"neutral_mean_scores": neutral_overall,
	"calibration_home_win": calibration_table(all_calibration_rows),
	"ship_gate": {
	"fitted_beats_uniform_log_loss": fitted_beats_uniform,
	"fitted_beats_current_log_loss": fitted_beats_current,
	"fitted_beats_current_neutral_log_loss": (
	fitted_beats_neutral_current
	),
	"ship": ship,
	"criterion": (
	"The fitted Dixon-Coles Elo model must have a lower mean "
	"log loss than both the uniform baseline and the model "
	"previously shipped in world_cup/forecasting.py, both overall "
	"and on neutral-venue matches, across walk-forward test folds "
	"(2018-2026, no fold trained on its own test data)."
	),
	},
	}

	MODEL_DIR.mkdir(parents=True, exist_ok=True)
	REPORT_PATH.write_text(json.dumps(report, indent=2) + "\n", encoding="utf-8")
	print(f"Wrote {REPORT_PATH}")
	print(json.dumps(report["overall_mean_scores"], indent=2))
	print(json.dumps(report["ship_gate"], indent=2))


	if __name__ == "__main__":
	main()