amarorn / models /wc_collaborative.py
beAnalytic's picture
feat: sync main with feature/superbet-live-inplay
16c19b8 verified
Raw
History Blame Contribute Delete
8.18 kB
from collections.abc import Callable
from dataclasses import dataclass
from datetime import datetime, timezone
from math import log
import pandas as pd
from models.dixon_coles_wc import DixonColesWcModel
from models.logistic_wc import WcLogisticModel
from pipelines.wc_holdout import wc_holdout_test_df, wc_holdout_train_df
from pipelines.wc_hyperparams import get_wc_hyperparams
from pipelines.wc_stats import build_match_features, precompute_elo_timeline
@dataclass
class CollaborativeMetrics:
    """Result of calibrating the ensemble weight on a validation set.

    The two weights describe the selected blend; the remaining fields are the
    scores that blend obtained and the number of validation rows scored.
    """

    # Weight given to the Dixon-Coles component of the blend.
    dixon_coles_weight: float
    # Weight given to the logistic component of the blend.
    logistic_weight: float
    # Fraction of validation rows whose most probable outcome matched the label.
    accuracy: float
    # Multiclass Brier score of the blended probabilities.
    brier_score: float
    # Mean negative log-likelihood of the observed outcomes.
    log_loss: float
    # Number of validation rows that were scored.
    validation_size: int
def _brier_multiclass(rows: list[dict]) -> float:
if not rows:
return 0.0
labels = ("1", "X", "2")
total = 0.0
for row in rows:
y = row["label"]
probs = row["probs"]
for label in labels:
target = 1.0 if y == label else 0.0
total += (probs[label] - target) ** 2
return total / (len(rows) * len(labels))
def _log_loss_multiclass(rows: list[dict], epsilon: float = 1e-12) -> float:
if not rows:
return 0.0
total = 0.0
for row in rows:
p = max(min(row["probs"][row["label"]], 1.0 - epsilon), epsilon)
total += -log(p)
return total / len(rows)
class CollaborativeWcModel:
    """Blend a Dixon-Coles model and a logistic model into one 1/X/2 forecast.

    The blend is a convex combination: the Dixon-Coles probabilities get
    weight ``dixon_coles_weight`` and the logistic probabilities get
    ``1 - dixon_coles_weight``. The weight starts at 0.5 and is replaced by
    :meth:`fit` with the grid value that minimises the Brier score on a
    held-out validation season.
    """

    def __init__(self, dixon_coles: DixonColesWcModel | None = None) -> None:
        """Create the ensemble, optionally reusing an existing Dixon-Coles model."""
        self.logistic = WcLogisticModel()
        self.dixon_coles = dixon_coles or DixonColesWcModel()
        # Populated by fit(); None means the weight has not been calibrated yet.
        self.metrics: CollaborativeMetrics | None = None
        self._dixon_coles_weight = 0.5

    @property
    def dixon_coles_weight(self) -> float:
        """Weight applied to the Dixon-Coles probabilities."""
        return self._dixon_coles_weight

    @property
    def poisson_weight(self) -> float:
        """Same value as ``dixon_coles_weight`` under a different name.

        NOTE(review): presumably kept as an alias for older callers — confirm.
        """
        return self._dixon_coles_weight

    @property
    def logistic_weight(self) -> float:
        """Weight applied to the logistic probabilities (complement of the DC weight)."""
        return 1.0 - self._dixon_coles_weight

    @staticmethod
    def _blend(
        dc_probs: dict[str, float],
        logistic_probs: dict[str, float],
        dw: float,
    ) -> dict[str, float]:
        """Mix two 1/X/2 distributions with weight ``dw`` on the first and renormalise."""
        lw = 1.0 - dw
        mixed = {
            outcome: dw * dc_probs[outcome] + lw * logistic_probs[outcome]
            for outcome in ("1", "X", "2")
        }
        total = mixed["1"] + mixed["X"] + mixed["2"]
        return {outcome: value / total for outcome, value in mixed.items()}

    def _component_probs(
        self,
        history: pd.DataFrame,
        home_team: str,
        away_team: str,
        *,
        features: object,
        before_date: object,
        phase: str,
        is_neutral: bool,
    ) -> tuple[dict[str, float], dict[str, float]]:
        """Query both base models and return their 1/X/2 probabilities as dicts."""
        dc = self.dixon_coles.predict(
            history,
            home_team,
            away_team,
            features=features,
            before_date=before_date,
        )
        logistic = self.logistic.predict_match(
            history,
            home_team,
            away_team,
            phase=phase,
            is_neutral=is_neutral,
        )
        return (
            {"1": dc.prob_home, "X": dc.prob_draw, "2": dc.prob_away},
            {"1": logistic.prob_home, "X": logistic.prob_draw, "2": logistic.prob_away},
        )

    def _collect_validation_rows(
        self,
        df: pd.DataFrame,
        valid_df: pd.DataFrame,
        on_progress: Callable[[int, int, str], None] | None,
    ) -> list[dict]:
        """Predict every validation match with both models using only earlier matches."""
        elo_timeline = precompute_elo_timeline(df)
        # Loop-invariant: convert the date column once instead of once per row.
        match_dates = pd.to_datetime(df["match_date"], utc=True)
        valid_total = len(valid_df)
        base_rows: list[dict] = []
        for valid_index, (_, row) in enumerate(valid_df.iterrows(), start=1):
            before = row["match_date"]
            history = df[match_dates < pd.to_datetime(before, utc=True)]
            # Rows with no earlier match cannot be predicted and are left out.
            if not history.empty:
                phase = row.get("phase", "group")
                is_neutral = bool(row.get("is_neutral", True))
                features = build_match_features(
                    history,
                    row["home_team"],
                    row["away_team"],
                    before_date=before,
                    phase=phase,
                    is_neutral=is_neutral,
                    elo_timeline=elo_timeline,
                )
                dc_probs, logistic_probs = self._component_probs(
                    history,
                    row["home_team"],
                    row["away_team"],
                    features=features,
                    before_date=before,
                    phase=phase,
                    is_neutral=is_neutral,
                )
                base_rows.append(
                    {
                        "label": row["label"],
                        "dixon_coles": dc_probs,
                        "logistic": logistic_probs,
                    }
                )
            # Report progress even for skipped rows so the first and the final
            # tick are never lost.
            if on_progress and (
                valid_index == 1 or valid_index % 5 == 0 or valid_index == valid_total
            ):
                on_progress(valid_index, valid_total, "validacao")
        return base_rows

    @classmethod
    def _score_weight(cls, base_rows: list[dict], dw: float) -> CollaborativeMetrics:
        """Score the blend obtained with Dixon-Coles weight ``dw`` on ``base_rows``."""
        scored_rows: list[dict] = []
        correct = 0
        for item in base_rows:
            probs = cls._blend(item["dixon_coles"], item["logistic"], dw)
            if max(probs, key=probs.get) == item["label"]:
                correct += 1
            scored_rows.append({"label": item["label"], "probs": probs})
        return CollaborativeMetrics(
            dixon_coles_weight=float(dw),
            logistic_weight=float(1.0 - dw),
            accuracy=float(correct / len(scored_rows)),
            brier_score=float(_brier_multiclass(scored_rows)),
            log_loss=float(_log_loss_multiclass(scored_rows)),
            validation_size=int(len(scored_rows)),
        )

    def fit(
        self,
        fixtures_df: pd.DataFrame,
        validation_season: int = 2022,
        logistic_model: WcLogisticModel | None = None,
        on_progress: Callable[[int, int, str], None] | None = None,
    ) -> CollaborativeMetrics:
        """Calibrate the ensemble weight on ``validation_season`` and return the scores.

        The fixtures are sorted by ``match_date`` and split with
        ``wc_holdout_train_df`` / ``wc_holdout_test_df``. Each validation match
        is predicted by both base models from the matches strictly before its
        date. A grid of ``ensemble_weight_steps + 1`` evenly spaced weights in
        [0, 1] is then scored, and the weight with the lowest Brier score is
        kept (the smallest weight wins ties).

        Args:
            fixtures_df: Fixtures with at least ``match_date``, ``home_team``,
                ``away_team`` and ``label`` columns; ``phase`` and
                ``is_neutral`` are read when present.
            validation_season: Season passed to the holdout split helpers.
            logistic_model: When given, it replaces ``self.logistic`` and is
                used as-is; otherwise ``self.logistic`` is fitted on the
                training split.
            on_progress: Optional callback invoked as
                ``on_progress(current, total, stage)`` with stage
                ``"validacao"`` (first, every fifth and last validation row)
                or ``"pesos"`` (every grid step).

        Returns:
            The metrics of the selected weight, also stored in ``self.metrics``.

        Raises:
            ValueError: If the training or validation split is empty, or if no
                validation match could be predicted.
        """
        df = fixtures_df.sort_values("match_date").copy()
        train_df = wc_holdout_train_df(df, validation_season)
        valid_df = wc_holdout_test_df(df, validation_season)
        if train_df.empty or valid_df.empty:
            raise ValueError(
                f"Não foi possível separar treino/validação para a temporada {validation_season}."
            )

        if logistic_model is not None:
            self.logistic = logistic_model
        else:
            self.logistic.fit(train_df, holdout_season=None)
        # A Dixon-Coles model that is already fitted is reused untouched.
        if not self.dixon_coles._fitted:
            self.dixon_coles.fit(df, holdout_season=validation_season)

        base_rows = self._collect_validation_rows(df, valid_df, on_progress)
        if not base_rows:
            raise ValueError("Não foi possível gerar previsões para calibração.")

        hp = get_wc_hyperparams()
        steps = max(hp.ensemble_weight_steps, 1)
        candidates: list[CollaborativeMetrics] = []
        for step in range(steps + 1):
            if on_progress:
                on_progress(step + 1, steps + 1, "pesos")
            candidates.append(self._score_weight(base_rows, step / steps))

        # min() keeps the first of equal minima, i.e. the smallest weight.
        best = min(candidates, key=lambda candidate: candidate.brier_score)
        self._dixon_coles_weight = best.dixon_coles_weight
        self.metrics = best
        return self.metrics

    def predict(
        self,
        fixtures_df: pd.DataFrame,
        home_team: str,
        away_team: str,
        *,
        phase: str = "group",
        is_neutral: bool = True,
        before_date: datetime | None = None,
    ) -> dict[str, float]:
        """Return blended ``{"1", "X", "2"}`` probabilities for one match.

        If the model has not been calibrated yet, :meth:`fit` is run on
        ``fixtures_df`` with its default arguments first. Only fixtures dated
        strictly before the reference instant are used as history; when that
        leaves nothing, the full ``fixtures_df`` is used instead.

        Args:
            fixtures_df: Fixtures with a ``match_date`` column.
            home_team: Home side identifier forwarded to the base models.
            away_team: Away side identifier forwarded to the base models.
            phase: Tournament phase forwarded to the feature builder and the
                logistic model.
            is_neutral: Neutral-venue flag forwarded likewise.
            before_date: Reference instant; defaults to the current UTC time.
                A naive value is treated as UTC when filtering the history.

        Returns:
            A dict keyed by ``"1"``, ``"X"`` and ``"2"`` whose values sum to 1.
        """
        if self.metrics is None:
            self.fit(fixtures_df)

        ref = before_date or datetime.now(timezone.utc)
        dt = pd.to_datetime(fixtures_df["match_date"], utc=True)
        ref_ts = pd.Timestamp(ref if ref.tzinfo else ref.replace(tzinfo=timezone.utc))
        history = fixtures_df[dt < ref_ts]
        if history.empty:
            history = fixtures_df

        features = build_match_features(
            history,
            home_team,
            away_team,
            before_date=ref,
            phase=phase,
            is_neutral=is_neutral,
        )
        dc_probs, logistic_probs = self._component_probs(
            history,
            home_team,
            away_team,
            features=features,
            before_date=ref,
            phase=phase,
            is_neutral=is_neutral,
        )
        return self._blend(dc_probs, logistic_probs, self.dixon_coles_weight)