Spaces:

sammoftah
/

underdog-lab

Running

underdog-lab / scripts /vector_calibration_evaluation.py

Moftah

Finalize forecast tracking and result automation

3a6c339 18 days ago

7.15 kB

	from __future__ import annotations

	import json
	import random
	from datetime import date

	from underdog_lab.config import MODEL_DIR
	from underdog_lab.forecasting.calibration import apply_temperature
	from underdog_lab.forecasting.scoring import (
	brier_score,
	log_loss,
	rank_probability_score,
	)
	from underdog_lab.forecasting.vector_calibration import (
	apply_vector_scaling,
	fit_vector_scaling,
	)
	from underdog_lab.world_cup.forecasting import CALIBRATION_TEMPERATURE

	from backtest_common import (
	fit_dixon_coles,
	load_matches_with_self_elo,
	observed_outcome,
	)

	# Destination of the JSON report written by main().
	REPORT_PATH = MODEL_DIR / "vector_calibration_evaluation.json"
	# Passed as the third argument to fit_dixon_coles for every yearly refit.
	HALF_LIFE_DAYS = 180.0
	# Years 2018..2025 inclusive: pooled to fit the final vector-scaling
	# parameters and used for the rolling-origin regularization search.
	SELECTION_YEARS = list(range(2018, 2026))
	# Year reported in the "robustness_2026_*" slices; never used for fitting.
	ROBUSTNESS_YEAR = 2026
	# Candidate values for the `regularization` argument of fit_vector_scaling.
	REGULARIZATION_GRID = (0.0001, 0.001, 0.01, 0.1)


	def collect_rows() -> dict[int, list[tuple]]:
	    """Build forecast rows for every selection year and the robustness year.

	    For each year, the model returned by ``fit_dixon_coles`` is fitted on
	    the matches dated on or before 31 December of the previous year and then
	    asked to forecast the matches whose date falls in that year.

	    Returns:
	        Mapping of year to a list of ``(forecast, outcome, neutral)`` tuples.
	        ``forecast`` is the model forecast after ``apply_temperature`` with
	        ``CALIBRATION_TEMPERATURE``, ``outcome`` comes from
	        ``observed_outcome`` and ``neutral`` is the match's ``"neutral"``
	        field.
	    """
	    matches = load_matches_with_self_elo(date(2026, 6, 12))
	    rows_by_year = {}
	    for year in (*SELECTION_YEARS, ROBUSTNESS_YEAR):
	        cutoff = date(year - 1, 12, 31)
	        history = [fixture for fixture in matches if fixture["date"] <= cutoff]
	        held_out = [fixture for fixture in matches if fixture["date"].year == year]
	        model = fit_dixon_coles(history, cutoff, HALF_LIFE_DAYS)

	        year_rows = []
	        for fixture in held_out:
	            raw_forecast = model.forecast(
	                fixture["home_elo"],
	                fixture["away_elo"],
	                neutral_venue=fixture["neutral"],
	            )
	            year_rows.append(
	                (
	                    apply_temperature(raw_forecast, CALIBRATION_TEMPERATURE),
	                    observed_outcome(
	                        fixture["home_goals"], fixture["away_goals"]
	                    ),
	                    fixture["neutral"],
	                )
	            )
	        rows_by_year[year] = year_rows
	    return rows_by_year


	def metrics(rows: list[tuple], parameters: list[float] | None = None) -> dict:
	    """Score a set of forecasts, optionally after vector scaling.

	    Args:
	        rows: ``(forecast, outcome, extra)`` triples; the third element is
	            ignored here.
	        parameters: Vector-scaling parameters. When given, every forecast is
	            passed through ``apply_vector_scaling`` before scoring; when
	            ``None`` the forecasts are scored unchanged.

	    Returns:
	        A dict with the row count ``n``, the mean ``log_loss``, ``brier`` and
	        ``rps`` over the rows, and the ``ece`` of the scored forecasts.

	    Raises:
	        ZeroDivisionError: If ``rows`` is empty (the means divide by the
	            row count).
	    """
	    if parameters is None:
	        forecasts = [(forecast, outcome) for forecast, outcome, _ in rows]
	    else:
	        forecasts = [
	            (apply_vector_scaling(forecast, parameters), outcome)
	            for forecast, outcome, _ in rows
	        ]
	    count = len(rows)
	    return {
	        "n": count,
	        "log_loss": sum(log_loss(fc, outcome) for fc, outcome in forecasts)
	        / count,
	        "brier": sum(brier_score(fc, outcome) for fc, outcome in forecasts)
	        / count,
	        "rps": sum(
	            rank_probability_score(fc, outcome) for fc, outcome in forecasts
	        )
	        / count,
	        "ece": expected_calibration_error(forecasts),
	    }


	def expected_calibration_error(rows: list[tuple], bins: int = 10) -> float:
	    """Return the expected calibration error of the most likely outcome.

	    Each forecast contributes its largest probability (the "confidence") and
	    whether the corresponding label matched the observed outcome. Rows are
	    grouped into ``bins`` equal-width confidence bins, and the result is the
	    sum over non-empty bins of the bin's share of rows times the absolute gap
	    between its mean confidence and its hit rate.

	    Args:
	        rows: ``(forecast, outcome)`` pairs; ``forecast`` exposes ``p_home``,
	            ``p_draw`` and ``p_away`` and ``outcome`` is compared against
	            ``"home"``, ``"draw"`` or ``"away"``.
	        bins: Number of confidence bins.

	    Returns:
	        The weighted calibration gap; ``0`` when ``rows`` is empty.
	    """
	    labels = ("home", "draw", "away")
	    last_bin = bins - 1
	    binned = [[] for _ in range(bins)]
	    for forecast, outcome in rows:
	        triple = (forecast.p_home, forecast.p_draw, forecast.p_away)
	        # First index wins on ties, matching max() semantics.
	        top = max(range(3), key=lambda position: triple[position])
	        top_probability = triple[top]
	        # A confidence of exactly 1.0 would land one past the end; clamp it.
	        slot = min(last_bin, int(top_probability * bins))
	        binned[slot].append((top_probability, outcome == labels[top]))

	    sample_count = len(rows)

	    def weighted_gap(members):
	        size = len(members)
	        mean_confidence = sum(probability for probability, _ in members) / size
	        hit_rate = sum(hit for _, hit in members) / size
	        return size / sample_count * abs(mean_confidence - hit_rate)

	    return sum(weighted_gap(members) for members in binned if members)


	def blocked_interval(
	    rows: list[tuple],
	    parameters: list[float],
	    *,
	    iterations: int = 3000,
	) -> list[float]:
	    """Block-bootstrap a 95% interval for the mean log-loss difference.

	    The per-row difference is the log loss of the vector-scaled forecast
	    minus the log loss of the unscaled forecast. Differences are cut into
	    consecutive blocks of 20 rows (the last block may be shorter), blocks are
	    resampled with replacement using a generator seeded with 2026, and the
	    mean of each resample is recorded.

	    Args:
	        rows: ``(forecast, outcome, extra)`` triples; the third element is
	            ignored.
	        parameters: Vector-scaling parameters handed to
	            ``apply_vector_scaling``.
	        iterations: Number of bootstrap resamples.

	    Returns:
	        ``[lower, upper]``: the sorted resample means at positions
	        ``int(iterations * 0.025)`` and ``int(iterations * 0.975)``.
	    """
	    block_length = 20

	    deltas = []
	    for forecast, outcome, _ in rows:
	        scaled_loss = log_loss(apply_vector_scaling(forecast, parameters), outcome)
	        deltas.append(scaled_loss - log_loss(forecast, outcome))

	    chunks = [
	        deltas[start : start + block_length]
	        for start in range(0, len(deltas), block_length)
	    ]

	    generator = random.Random(2026)
	    resample_means = []
	    for _ in range(iterations):
	        # One draw per original block, then flatten the drawn blocks in order.
	        pooled = [
	            delta for _ in chunks for delta in generator.choice(chunks)
	        ]
	        resample_means.append(sum(pooled) / len(pooled))

	    ordered = sorted(resample_means)
	    lower = ordered[int(iterations * 0.025)]
	    upper = ordered[int(iterations * 0.975)]
	    return [lower, upper]


	def main() -> None:
	    """Evaluate vector scaling against the shipped calibration and report it.

	    Steps:
	        1. Collect per-year forecast rows with ``collect_rows``.
	        2. For each value in ``REGULARIZATION_GRID``, fit on the selection
	           years before each validation year in 2021..2025 and average the
	           validation log loss.
	        3. Refit on all selection years with the best-scoring value.
	        4. Compare baseline and candidate metrics on the selection slice and
	           the two robustness slices, and evaluate the research gate on the
	           robustness slices.
	        5. Write the report to ``REPORT_PATH`` and print it.
	    """
	    per_year = collect_rows()

	    def without_venue(rows):
	        # fit_vector_scaling receives (forecast, outcome) pairs only.
	        return [(forecast, outcome) for forecast, outcome, _ in rows]

	    rolling_scores = {}
	    for regularization in REGULARIZATION_GRID:
	        fold_losses = []
	        for validation_year in range(2021, 2026):
	            train_rows = []
	            for year in SELECTION_YEARS:
	                if year < validation_year:
	                    train_rows.extend(per_year[year])
	            validation_rows = per_year[validation_year]
	            fold_parameters = fit_vector_scaling(
	                without_venue(train_rows),
	                regularization=regularization,
	            )
	            fold_metrics = metrics(validation_rows, fold_parameters)
	            fold_losses.append(fold_metrics["log_loss"])
	        rolling_scores[str(regularization)] = sum(fold_losses) / len(fold_losses)

	    selected_regularization = min(
	        REGULARIZATION_GRID,
	        key=lambda candidate: rolling_scores[str(candidate)],
	    )

	    selection = []
	    for year in SELECTION_YEARS:
	        selection.extend(per_year[year])
	    parameters = fit_vector_scaling(
	        without_venue(selection),
	        regularization=selected_regularization,
	    )

	    robustness = per_year[ROBUSTNESS_YEAR]
	    slices = {
	        "selection_descriptive": selection,
	        "robustness_2026_viewed": robustness,
	        # The third element of each row is the venue flag.
	        "robustness_2026_neutral_viewed": [row for row in robustness if row[2]],
	    }

	    report_slices = {}
	    for name, rows in slices.items():
	        report_slices[name] = {
	            "baseline": metrics(rows),
	            "candidate": metrics(rows, parameters),
	            "blocked_log_loss_difference_95": blocked_interval(rows, parameters),
	        }

	    def clears_gate(entry):
	        # Strictly better log loss, no worse Brier, small tolerances on RPS
	        # and ECE, and an interval upper bound below zero.
	        baseline = entry["baseline"]
	        candidate = entry["candidate"]
	        return (
	            candidate["log_loss"] < baseline["log_loss"]
	            and candidate["brier"] <= baseline["brier"]
	            and candidate["rps"] <= baseline["rps"] + 0.001
	            and candidate["ece"] <= baseline["ece"] + 0.01
	            and entry["blocked_log_loss_difference_95"][1] < 0
	        )

	    improves_robustness = all(
	        clears_gate(entry)
	        for name, entry in report_slices.items()
	        if name.startswith("robustness")
	    )

	    report = {
	        "baseline": "shipped global temperature calibration",
	        "method": "regularized five-parameter multiclass vector scaling",
	        "rolling_origin_regularization_scores": rolling_scores,
	        "selected_regularization": selected_regularization,
	        "parameters": parameters,
	        "slices": report_slices,
	        "research_gate_passed": improves_robustness,
	        "production_adopted": False,
	        "claim_boundary": (
	            "The 2026 slice has already been viewed and used in prior model "
	            "decisions. It is a robustness diagnostic, not pristine "
	            "confirmation. Production adoption requires a future "
	            "pre-registered evaluation period."
	        ),
	    }

	    # Serialize once and reuse the text for both the file and stdout.
	    rendered = json.dumps(report, indent=2)
	    REPORT_PATH.write_text(rendered + "\n", encoding="utf-8")
	    print(f"Wrote {REPORT_PATH}")
	    print(rendered)


	# Run the evaluation only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()