from __future__ import annotations

"""Walk-forward backtest: fitted Dixon-Coles Elo model vs baselines.

For each test year Y in TEST_YEARS, the Dixon-Coles Elo model is refit (same
MLE procedure as fit_elo_dixon_coles.py) on only the matches strictly before
Y, then scored on every match played during Y. This mirrors how the model
would actually have been used -- no fold sees data from its own future.

Three candidates are scored on each fold's test matches:

  - uniform: 1/3 / 1/3 / 1/3 regardless of teams (no skill baseline).
  - current: the pre-remediation model previously shipped in
    underdog_lab.world_cup.forecasting.MODEL (independent Poisson,
    elo_scale=0.00165, home_advantage_elo=0, rho=0 -- hand-set, never fit).
  - fitted: this fold's freshly-fit Dixon-Coles Elo model (intercept,
    elo_scale, home_advantage_elo, rho all fit on the training window only).

Metrics: mean log loss, Brier score, and Ranked Probability Score (RPS) per
candidate, reported per fold and as a match-weighted mean over all test folds.
A basic calibration table for the fitted model's predicted home-win
probability is also produced.

Ship gate: the fitted model must beat both "uniform" and "current" on mean
log loss across all test folds combined and beat "current" on the
neutral-venue subset that most closely matches World Cup inference. Writes
models/backtest_report.json with the full breakdown and the gate verdict.
Does not modify src/underdog_lab/world_cup/forecasting.py -- that swap is a
separate, human-reviewed step gated on this report's verdict.

Usage:
  python scripts/backtest_walk_forward.py
"""

import json
from datetime import date
from types import SimpleNamespace

from underdog_lab.config import MODEL_DIR
from underdog_lab.forecasting.elo_goals import EloGoalModel
from underdog_lab.forecasting.poisson import forecast_from_lambdas

from backtest_common import (
    calibration_table,
    fit_dixon_coles,
    load_matches_with_self_elo,
    observed_outcome,
    score_candidate,
)

# Destination of the JSON report written by main().
REPORT_PATH = MODEL_DIR / "backtest_report.json"
# Half-life in days, passed unchanged to fit_dixon_coles for every fold
# (presumably the time-decay weighting of training matches -- confirm in
# backtest_common).
# Selected by scripts/upgrade_evaluation.py: beats the previous 1095-day
# (3 year) half-life on mean log loss across 2018-2025 selection folds AND
# on the held-out 2026 confirmation fold, both overall and on the
# neutral-venue subset. See models/upgrade_evaluation.json.
HALF_LIFE_DAYS = 180.0

# Test years: the dataset starts 2015-01-03, so 2018 onward leaves at least
# three years of training data for the first fold.
# range() is end-exclusive, so the folds cover 2018 through 2026 inclusive.
TEST_YEARS = list(range(2018, 2027))

# No-skill baseline: the same 1/3 probability for home win, draw and away win.
# A SimpleNamespace supplies the p_home / p_draw / p_away attributes so the
# baseline can be handed to score_candidate alongside the model forecasts
# (assumes score_candidate reads only these attributes -- TODO confirm).
UNIFORM_FORECAST = SimpleNamespace(p_home=1 / 3, p_draw=1 / 3, p_away=1 / 3)

# The pre-remediation model previously shipped in world_cup/forecasting.py:
# independent Poisson (rho=0), hand-set elo_scale, no home advantage.
# The intercept is approximately ln(1.1). No rho is passed here; this model is
# only used to produce the two lambdas fed to forecast_from_lambdas.
CURRENT_MODEL = EloGoalModel(
    intercept=0.09531017980432493,
    elo_scale=0.00165,
    home_advantage_elo=0.0,
)


def current_model_forecast(home_elo: float, away_elo: float, neutral: bool):
    """Forecast one match with the previously shipped CURRENT_MODEL.

    The two Elo ratings and the neutral-venue flag are turned into a pair of
    goal rates by CURRENT_MODEL.lambdas; those rates are then converted into
    a forecast by forecast_from_lambdas, whose result is returned unchanged.
    """
    goal_rates = CURRENT_MODEL.lambdas(home_elo, away_elo, neutral_venue=neutral)
    home_rate, away_rate = goal_rates
    return forecast_from_lambdas(home_rate, away_rate)


def run_fold(test_year: int, all_matches: list[dict]) -> dict:
    """Fit on matches before ``test_year`` and score every match played in it.

    Training data is every match dated on or before 31 December of the
    previous year; test data is every match whose date falls in
    ``test_year``. Three candidates (uniform, current, fitted) are scored on
    each test match, both over all matches and over the neutral-venue subset.

    Args:
        test_year: Calendar year whose matches form the test set.
        all_matches: Match dicts; this function reads the keys "date",
            "home_goals", "away_goals", "home_elo", "away_elo" and "neutral".

    Returns:
        An empty dict when ``test_year`` has no matches. Otherwise a dict
        with the fold's sizes, the fitted parameters, the mean scores for
        all matches and for neutral-venue matches (``{}`` when the fold has
        none), and the raw (predicted home-win probability, home won) pairs
        under "calibration_rows".
    """
    cutoff = date(test_year - 1, 12, 31)
    history = [row for row in all_matches if row["date"] <= cutoff]
    held_out = [row for row in all_matches if row["date"].year == test_year]
    if len(held_out) == 0:
        return {}

    model = fit_dixon_coles(history, cutoff, HALF_LIFE_DAYS)

    candidate_names = ("uniform", "current", "fitted")
    metric_names = ("log_loss", "brier", "rps")
    # Running metric sums, keyed scope -> candidate -> metric.
    running = {
        scope: {name: dict.fromkeys(metric_names, 0.0) for name in candidate_names}
        for scope in ("all", "neutral")
    }
    seen = {"all": 0, "neutral": 0}
    home_win_pairs = []

    for row in held_out:
        result = observed_outcome(row["home_goals"], row["away_goals"])
        is_neutral = row["neutral"]

        by_candidate = {
            "uniform": UNIFORM_FORECAST,
            "current": current_model_forecast(
                row["home_elo"], row["away_elo"], is_neutral
            ),
            "fitted": model.forecast(
                row["home_elo"], row["away_elo"], neutral_venue=is_neutral
            ),
        }
        fitted_forecast = by_candidate["fitted"]

        # Every match counts towards "all"; neutral-venue matches are also
        # tallied separately.
        active_scopes = ("all", "neutral") if is_neutral else ("all",)
        for scope in active_scopes:
            seen[scope] += 1
            for name, forecast in by_candidate.items():
                bucket = running[scope][name]
                for metric, value in score_candidate(forecast, result).items():
                    bucket[metric] += value

        home_win_pairs.append((fitted_forecast.p_home, result == "home"))

    # Turn sums into means, skipping any scope that saw no matches.
    averaged = {}
    for scope, per_candidate in running.items():
        match_count = seen[scope]
        if not match_count:
            continue
        averaged[scope] = {
            name: {metric: total / match_count for metric, total in sums.items()}
            for name, sums in per_candidate.items()
        }

    fitted_params = {
        "intercept": model.intercept,
        "elo_scale": model.elo_scale,
        "home_advantage_elo": model.home_advantage_elo,
        "rho": model.rho,
    }
    return {
        "test_year": test_year,
        "train_matches": len(history),
        "test_matches": len(held_out),
        "neutral_test_matches": seen["neutral"],
        "fitted_params": fitted_params,
        "mean_scores": averaged["all"],
        "neutral_mean_scores": averaged.get("neutral", {}),
        "calibration_rows": home_win_pairs,
    }


def _pooled_mean_scores(folds: list[dict], scores_key: str, count_key: str) -> dict:
    """Pool per-fold mean scores into one match-weighted mean per metric.

    Each fold mean is multiplied by the fold's match count to recover the
    fold total; totals are summed over folds and divided by the pooled match
    count.

    Args:
        folds: Fold dicts as returned by run_fold.
        scores_key: Fold key holding candidate -> metric -> mean.
        count_key: Fold key holding the number of matches behind those means.

    Returns:
        candidate -> metric -> pooled mean, or ``{}`` when the pooled match
        count is zero (nothing to average).
    """
    pooled_count = sum(fold[count_key] for fold in folds)
    if not pooled_count:
        return {}
    totals = {
        candidate: {"log_loss": 0.0, "brier": 0.0, "rps": 0.0}
        for candidate in ("uniform", "current", "fitted")
    }
    for fold in folds:
        for candidate, metric_means in fold[scores_key].items():
            for metric, mean_value in metric_means.items():
                totals[candidate][metric] += mean_value * fold[count_key]
    return {
        candidate: {
            metric: total / pooled_count
            for metric, total in metric_totals.items()
        }
        for candidate, metric_totals in totals.items()
    }


def main() -> None:
    """Run every walk-forward fold, evaluate the ship gate, write the report.

    Loads the matches, runs run_fold for each year in TEST_YEARS (years with
    no test matches are dropped), pools the per-fold scores into
    match-weighted means, builds the calibration table for the fitted model,
    and writes the JSON report to REPORT_PATH. The overall scores and the
    gate verdict are also printed.

    If no fold contains a neutral-venue match, "neutral_mean_scores" is
    ``{}`` and the neutral-venue gate check fails closed (``False``), since
    the comparison cannot be made.

    Raises:
        RuntimeError: If no year in TEST_YEARS has any test matches, so there
            is nothing to score.
    """
    all_matches = load_matches_with_self_elo(date(2026, 6, 12))

    folds = []
    for test_year in TEST_YEARS:
        fold = run_fold(test_year, all_matches)
        if fold:
            folds.append(fold)
    if not folds:
        # Previously this surfaced as a bare ZeroDivisionError further down.
        raise RuntimeError(
            "No test matches found for any year in TEST_YEARS; "
            "cannot evaluate the ship gate."
        )

    # The raw calibration rows are pooled here and removed from each fold so
    # they are not written into the per-fold section of the report.
    all_calibration_rows: list[tuple[float, bool]] = []
    for fold in folds:
        all_calibration_rows.extend(fold.pop("calibration_rows"))

    total_test_matches = sum(fold["test_matches"] for fold in folds)
    neutral_test_matches = sum(fold["neutral_test_matches"] for fold in folds)
    overall = _pooled_mean_scores(folds, "mean_scores", "test_matches")
    neutral_overall = _pooled_mean_scores(
        folds, "neutral_mean_scores", "neutral_test_matches"
    )

    fitted_beats_uniform = overall["fitted"]["log_loss"] < overall["uniform"]["log_loss"]
    fitted_beats_current = overall["fitted"]["log_loss"] < overall["current"]["log_loss"]
    # Fail closed: without neutral-venue matches the check cannot pass.
    fitted_beats_neutral_current = bool(neutral_overall) and (
        neutral_overall["fitted"]["log_loss"]
        < neutral_overall["current"]["log_loss"]
    )
    ship = (
        fitted_beats_uniform
        and fitted_beats_current
        and fitted_beats_neutral_current
    )

    report = {
        "test_years": TEST_YEARS,
        "half_life_days": HALF_LIFE_DAYS,
        "total_test_matches": total_test_matches,
        "neutral_test_matches": neutral_test_matches,
        "folds": folds,
        "overall_mean_scores": overall,
        "neutral_mean_scores": neutral_overall,
        "calibration_home_win": calibration_table(all_calibration_rows),
        "ship_gate": {
            "fitted_beats_uniform_log_loss": fitted_beats_uniform,
            "fitted_beats_current_log_loss": fitted_beats_current,
            "fitted_beats_current_neutral_log_loss": (
                fitted_beats_neutral_current
            ),
            "ship": ship,
            "criterion": (
                "The fitted Dixon-Coles Elo model must have a lower mean "
                "log loss than both the uniform baseline and the model "
                "previously shipped in world_cup/forecasting.py, both overall "
                "and on neutral-venue matches, across walk-forward test folds "
                "(2018-2026, no fold trained on its own test data)."
            ),
        },
    }

    MODEL_DIR.mkdir(parents=True, exist_ok=True)
    REPORT_PATH.write_text(json.dumps(report, indent=2) + "\n", encoding="utf-8")
    print(f"Wrote {REPORT_PATH}")
    print(json.dumps(report["overall_mean_scores"], indent=2))
    print(json.dumps(report["ship_gate"], indent=2))


# Run the backtest only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()