underdog-lab / scripts /vector_calibration_evaluation.py
Moftah
Finalize forecast tracking and result automation
3a6c339
Raw
History Blame Contribute Delete
7.15 kB
from __future__ import annotations
import json
import random
from datetime import date
from underdog_lab.config import MODEL_DIR
from underdog_lab.forecasting.calibration import apply_temperature
from underdog_lab.forecasting.scoring import (
brier_score,
log_loss,
rank_probability_score,
)
from underdog_lab.forecasting.vector_calibration import (
apply_vector_scaling,
fit_vector_scaling,
)
from underdog_lab.world_cup.forecasting import CALIBRATION_TEMPERATURE
from backtest_common import (
fit_dixon_coles,
load_matches_with_self_elo,
observed_outcome,
)
# Destination of the JSON report that main() writes (and echoes to stdout).
REPORT_PATH = MODEL_DIR / "vector_calibration_evaluation.json"
# Passed unchanged to fit_dixon_coles for every yearly fit in collect_rows().
# NOTE(review): presumably a time-decay half-life in days -- confirm against
# backtest_common.fit_dixon_coles.
HALF_LIFE_DAYS = 180.0
# Years 2018-2025 inclusive. Their rows feed the rolling-origin regularization
# search and the final vector-scaling fit in main().
SELECTION_YEARS = list(range(2018, 2026))
# Year whose rows are only scored (never fitted on) as the "robustness" slices.
ROBUSTNESS_YEAR = 2026
# Candidate values for fit_vector_scaling's ``regularization`` argument.
REGULARIZATION_GRID = (0.0001, 0.001, 0.01, 0.1)
def collect_rows() -> dict[int, list[tuple]]:
    """Build per-year evaluation rows from year-by-year Dixon-Coles refits.

    For each year in ``SELECTION_YEARS`` plus ``ROBUSTNESS_YEAR``, a model is
    fitted on every loaded match dated on or before 31 December of the
    previous year, and then used to forecast each match played in that year.

    Returns:
        A mapping from year to a list of ``(forecast, outcome, neutral)``
        tuples: ``forecast`` is the model forecast after ``apply_temperature``
        with ``CALIBRATION_TEMPERATURE``, ``outcome`` is whatever
        ``observed_outcome`` returns for the final score, and ``neutral`` is
        the match's ``"neutral"`` field.
    """
    # NOTE(review): the date is presumably the data cutoff understood by the
    # loader -- confirm against backtest_common.load_matches_with_self_elo.
    all_matches = load_matches_with_self_elo(date(2026, 6, 12))

    rows_by_year = {}
    for season in (*SELECTION_YEARS, ROBUSTNESS_YEAR):
        train_end = date(season - 1, 12, 31)
        history = [m for m in all_matches if m["date"] <= train_end]
        fixtures = [m for m in all_matches if m["date"].year == season]
        fitted = fit_dixon_coles(history, train_end, HALF_LIFE_DAYS)

        season_rows = []
        for fixture in fixtures:
            raw_forecast = fitted.forecast(
                fixture["home_elo"],
                fixture["away_elo"],
                neutral_venue=fixture["neutral"],
            )
            calibrated = apply_temperature(raw_forecast, CALIBRATION_TEMPERATURE)
            result = observed_outcome(
                fixture["home_goals"], fixture["away_goals"]
            )
            season_rows.append((calibrated, result, fixture["neutral"]))
        rows_by_year[season] = season_rows

    return rows_by_year
def metrics(rows: list[tuple], parameters: list[float] | None = None) -> dict:
    """Summarise forecast quality over *rows*.

    Args:
        rows: ``(forecast, outcome, extra)`` tuples; the third element is
            ignored here.
        parameters: Optional vector-scaling parameters. When not ``None``,
            every forecast is passed through ``apply_vector_scaling`` before
            it is scored; otherwise forecasts are scored as given.

    Returns:
        A dict with the row count under ``"n"`` and the per-row means of
        ``log_loss``, ``brier_score`` and ``rank_probability_score`` under
        ``"log_loss"``, ``"brier"`` and ``"rps"``, plus the value of
        ``expected_calibration_error`` under ``"ece"``.

    Raises:
        ZeroDivisionError: If *rows* is empty.
    """
    if parameters is None:
        scored = [(forecast, outcome) for forecast, outcome, _ in rows]
    else:
        scored = [
            (apply_vector_scaling(forecast, parameters), outcome)
            for forecast, outcome, _ in rows
        ]
    count = len(rows)

    def average(score):
        # Mean of one scoring rule over the (possibly rescaled) forecasts.
        return sum(score(fc, outcome) for fc, outcome in scored) / count

    summary = {"n": count}
    summary["log_loss"] = average(log_loss)
    summary["brier"] = average(brier_score)
    summary["rps"] = average(rank_probability_score)
    summary["ece"] = expected_calibration_error(scored)
    return summary
def expected_calibration_error(rows: list[tuple], bins: int = 10) -> float:
    """Return the expected calibration error of the top-probability pick.

    Each forecast's largest probability among ``p_home``, ``p_draw`` and
    ``p_away`` (first one wins on ties) is its confidence, and the pick counts
    as correct when *outcome* equals the matching label ``"home"``, ``"draw"``
    or ``"away"``. Rows are grouped into *bins* equal-width confidence bins;
    the result is the size-weighted mean of ``|mean confidence - accuracy|``
    over the non-empty bins.

    Args:
        rows: ``(forecast, outcome)`` pairs.
        bins: Number of confidence bins.

    Returns:
        The calibration error; ``0`` when *rows* is empty.
    """
    labels = ("home", "draw", "away")
    bin_confidences = [[] for _ in range(bins)]
    bin_hits = [[] for _ in range(bins)]

    for forecast, outcome in rows:
        triple = (forecast.p_home, forecast.p_draw, forecast.p_away)
        # max() keeps the first maximal entry, so ties favour home, then draw.
        top, top_probability = max(enumerate(triple), key=lambda pair: pair[1])
        # A confidence of exactly 1.0 would index past the end; clamp it.
        slot = min(bins - 1, int(top_probability * bins))
        bin_confidences[slot].append(top_probability)
        bin_hits[slot].append(outcome == labels[top])

    sample_count = len(rows)
    terms = []
    for confidences, hits in zip(bin_confidences, bin_hits):
        if not confidences:
            continue
        size = len(confidences)
        gap = abs(sum(confidences) / size - sum(hits) / size)
        terms.append(size / sample_count * gap)
    return sum(terms)
def blocked_interval(
    rows: list[tuple],
    parameters: list[float],
    *,
    iterations: int = 3000,
) -> list[float]:
    """Bootstrap an interval for the mean log-loss change from vector scaling.

    The per-row difference is ``log_loss`` of the vector-scaled forecast minus
    ``log_loss`` of the original forecast, so negative values favour the
    scaled forecast. Differences are cut into consecutive blocks of 20 rows
    (the last block may be shorter); each iteration draws as many blocks as
    exist, with replacement, and records the mean of the pooled values. The
    generator is seeded with 2026, so repeated calls give the same result.

    Args:
        rows: ``(forecast, outcome, extra)`` tuples; the third element is
            ignored.
        parameters: Vector-scaling parameters for ``apply_vector_scaling``.
        iterations: Number of bootstrap resamples.

    Returns:
        ``[lower, upper]``: the sorted resample means at positions
        ``int(iterations * 0.025)`` and ``int(iterations * 0.975)``.

    Raises:
        ZeroDivisionError: If *rows* is empty and *iterations* is positive.
    """
    block_size = 20

    deltas = []
    for forecast, outcome, _ in rows:
        scaled_loss = log_loss(apply_vector_scaling(forecast, parameters), outcome)
        deltas.append(scaled_loss - log_loss(forecast, outcome))

    chunks = []
    for start in range(0, len(deltas), block_size):
        chunks.append(deltas[start : start + block_size])

    rng = random.Random(2026)
    resample_means = []
    for _ in range(iterations):
        # One draw per existing block, pooled in draw order.
        pooled = []
        for _ in chunks:
            pooled.extend(rng.choice(chunks))
        resample_means.append(sum(pooled) / len(pooled))

    ordered = sorted(resample_means)
    lower = ordered[int(iterations * 0.025)]
    upper = ordered[int(iterations * 0.975)]
    return [lower, upper]
def main() -> None:
    """Evaluate vector scaling against the baseline and write a JSON report.

    Steps:
        1. Collect per-year rows with ``collect_rows``.
        2. For every value in ``REGULARIZATION_GRID``, fit vector scaling on
           the selection years before each validation year (2021-2025) and
           average the validation log loss across those folds.
        3. Refit on all selection years using the value with the lowest
           average.
        4. Score baseline and candidate on the selection slice, the 2026
           slice and its neutral-venue subset, adding a block-bootstrap
           interval for the log-loss difference.
        5. Write the report to ``REPORT_PATH`` and print it.

    The research gate passes only if every slice whose name starts with
    ``"robustness"`` satisfies all candidate-vs-baseline conditions.
    ``production_adopted`` is always written as ``False``.
    """
    per_year = collect_rows()

    # Rolling-origin search over the regularization grid.
    rolling_scores = {}
    for regularization in REGULARIZATION_GRID:
        fold_losses = []
        for validation_year in range(2021, 2026):
            history = []
            for year in SELECTION_YEARS:
                if year < validation_year:
                    history.extend(per_year[year])
            validation_rows = per_year[validation_year]
            fold_parameters = fit_vector_scaling(
                [(forecast, outcome) for forecast, outcome, _ in history],
                regularization=regularization,
            )
            fold_metrics = metrics(validation_rows, fold_parameters)
            fold_losses.append(fold_metrics["log_loss"])
        rolling_scores[str(regularization)] = sum(fold_losses) / len(fold_losses)

    selected_regularization = min(
        REGULARIZATION_GRID,
        key=lambda candidate: rolling_scores[str(candidate)],
    )

    # Final fit on every selection year with the chosen regularization.
    selection = []
    for year in SELECTION_YEARS:
        selection.extend(per_year[year])
    parameters = fit_vector_scaling(
        [(forecast, outcome) for forecast, outcome, _ in selection],
        regularization=selected_regularization,
    )

    robustness = per_year[ROBUSTNESS_YEAR]
    robustness_neutral = [row for row in robustness if row[2]]
    slices = {
        "selection_descriptive": selection,
        "robustness_2026_viewed": robustness,
        "robustness_2026_neutral_viewed": robustness_neutral,
    }

    report_slices = {}
    for name, rows in slices.items():
        report_slices[name] = {
            "baseline": metrics(rows),
            "candidate": metrics(rows, parameters),
            "blocked_log_loss_difference_95": blocked_interval(rows, parameters),
        }

    def passes_gate(result):
        # Strictly better log loss, no worse Brier, RPS/ECE within tolerance,
        # and the upper bootstrap bound of the log-loss difference below zero.
        baseline = result["baseline"]
        candidate = result["candidate"]
        return (
            candidate["log_loss"] < baseline["log_loss"]
            and candidate["brier"] <= baseline["brier"]
            and candidate["rps"] <= baseline["rps"] + 0.001
            and candidate["ece"] <= baseline["ece"] + 0.01
            and result["blocked_log_loss_difference_95"][1] < 0
        )

    improves_robustness = all(
        passes_gate(result)
        for name, result in report_slices.items()
        if name.startswith("robustness")
    )

    report = {
        "baseline": "shipped global temperature calibration",
        "method": "regularized five-parameter multiclass vector scaling",
        "rolling_origin_regularization_scores": rolling_scores,
        "selected_regularization": selected_regularization,
        "parameters": parameters,
        "slices": report_slices,
        "research_gate_passed": improves_robustness,
        "production_adopted": False,
        "claim_boundary": (
            "The 2026 slice has already been viewed and used in prior model "
            "decisions. It is a robustness diagnostic, not pristine "
            "confirmation. Production adoption requires a future "
            "pre-registered evaluation period."
        ),
    }

    serialized = json.dumps(report, indent=2)
    REPORT_PATH.write_text(serialized + "\n", encoding="utf-8")
    print(f"Wrote {REPORT_PATH}")
    print(serialized)


if __name__ == "__main__":
    main()