Spaces:

beAnalytic
/

amarorn

Runtime error

App Files Files Community

amarorn / models /wc_collaborative.py

beAnalytic

feat: sync main with feature/superbet-live-inplay

16c19b8 verified 26 days ago

Raw

History Blame Contribute Delete

8.18 kB

	from collections.abc import Callable
	from dataclasses import dataclass
	from datetime import datetime, timezone
	from math import log

	import pandas as pd

	from models.dixon_coles_wc import DixonColesWcModel
	from models.logistic_wc import WcLogisticModel
	from pipelines.wc_holdout import wc_holdout_test_df, wc_holdout_train_df
	from pipelines.wc_hyperparams import get_wc_hyperparams
	from pipelines.wc_stats import build_match_features, precompute_elo_timeline


	@dataclass
	class CollaborativeMetrics:
	    """Validation summary for one Dixon-Coles / logistic weighting.

	    ``CollaborativeWcModel.fit`` builds an instance of this record from the
	    weight combination that achieved the lowest Brier score on the
	    validation rows.
	    """

	    # Blend weights; ``fit`` derives ``logistic_weight`` as 1 - dixon_coles_weight.
	    dixon_coles_weight: float
	    logistic_weight: float

	    # Share of validation rows whose most probable outcome matched the label.
	    accuracy: float
	    # Value returned by ``_brier_multiclass`` for the chosen weighting.
	    brier_score: float
	    # Value returned by ``_log_loss_multiclass`` for the chosen weighting.
	    log_loss: float
	    # Number of validation rows that were actually scored.
	    validation_size: int


	def _brier_multiclass(rows: list[dict]) -> float:
	if not rows:
	return 0.0
	labels = ("1", "X", "2")
	total = 0.0
	for row in rows:
	y = row["label"]
	probs = row["probs"]
	for label in labels:
	target = 1.0 if y == label else 0.0
	total += (probs[label] - target) ** 2
	return total / (len(rows) * len(labels))


	def _log_loss_multiclass(rows: list[dict], epsilon: float = 1e-12) -> float:
	if not rows:
	return 0.0
	total = 0.0
	for row in rows:
	p = max(min(row["probs"][row["label"]], 1.0 - epsilon), epsilon)
	total += -log(p)
	return total / len(rows)


	class CollaborativeWcModel:
	    """Weighted blend of a Dixon-Coles model and a logistic model (1/X/2).

	    ``fit`` chooses the Dixon-Coles weight on a validation season by
	    minimising the Brier score; ``predict`` applies that weight to a single
	    fixture. Until ``fit`` has run the weight is 0.5.
	    """

	    def __init__(self, dixon_coles: DixonColesWcModel | None = None) -> None:
	        """Create the ensemble, optionally reusing an existing Dixon-Coles model.

	        Args:
	            dixon_coles: Model instance to reuse; a new ``DixonColesWcModel``
	                is created when omitted (or falsy).
	        """
	        self.logistic = WcLogisticModel()
	        self.dixon_coles = dixon_coles or DixonColesWcModel()
	        self.metrics: CollaborativeMetrics | None = None
	        self._dixon_coles_weight = 0.5

	    @property
	    def dixon_coles_weight(self) -> float:
	        """Weight currently applied to the Dixon-Coles probabilities."""
	        return self._dixon_coles_weight

	    @property
	    def poisson_weight(self) -> float:
	        """Same value as ``dixon_coles_weight``.

	        NOTE(review): presumably kept as a legacy alias for callers written
	        against an earlier Poisson-based model - confirm before removing.
	        """
	        return self._dixon_coles_weight

	    @property
	    def logistic_weight(self) -> float:
	        """Weight applied to the logistic probabilities (1 - Dixon-Coles weight)."""
	        return 1.0 - self._dixon_coles_weight

	    @staticmethod
	    def _outcome_probs(prediction) -> dict[str, float]:
	        """Map a prediction's home/draw/away probabilities onto "1"/"X"/"2"."""
	        return {
	            "1": prediction.prob_home,
	            "X": prediction.prob_draw,
	            "2": prediction.prob_away,
	        }

	    @staticmethod
	    def _blend(
	        dixon_coles: dict[str, float],
	        logistic: dict[str, float],
	        dixon_coles_weight: float,
	    ) -> dict[str, float]:
	        """Return the renormalised weighted average of two 1/X/2 distributions."""
	        dw = dixon_coles_weight
	        lw = 1.0 - dw
	        probs = {
	            "1": dw * dixon_coles["1"] + lw * logistic["1"],
	            "X": dw * dixon_coles["X"] + lw * logistic["X"],
	            "2": dw * dixon_coles["2"] + lw * logistic["2"],
	        }
	        # Renormalise so the result sums to 1 even if an input does not.
	        total = probs["1"] + probs["X"] + probs["2"]
	        return {k: v / total for k, v in probs.items()}

	    @staticmethod
	    def _score_weight(base_rows: list[dict], dw: float) -> dict:
	        """Score one Dixon-Coles weight against the cached validation rows."""
	        scored_rows: list[dict] = []
	        correct = 0
	        for item in base_rows:
	            probs = CollaborativeWcModel._blend(
	                item["dixon_coles"], item["logistic"], dw
	            )
	            pred = max(probs, key=probs.get)
	            if pred == item["label"]:
	                correct += 1
	            scored_rows.append({"label": item["label"], "probs": probs})

	        return {
	            "dw": dw,
	            "lw": 1.0 - dw,
	            "accuracy": correct / len(scored_rows),
	            "brier": _brier_multiclass(scored_rows),
	            "log_loss": _log_loss_multiclass(scored_rows),
	            "size": len(scored_rows),
	        }

	    def fit(
	        self,
	        fixtures_df: pd.DataFrame,
	        validation_season: int = 2022,
	        logistic_model: WcLogisticModel | None = None,
	        on_progress: Callable[[int, int, str], None] | None = None,
	    ) -> CollaborativeMetrics:
	        """Select the blend weight that minimises Brier score on a holdout season.

	        The fixtures are sorted by ``match_date`` and split with
	        ``wc_holdout_train_df`` / ``wc_holdout_test_df``. Both base models
	        predict every validation match from the fixtures dated strictly
	        before it, and a grid of ``ensemble_weight_steps + 1`` evenly spaced
	        weights in [0, 1] is scored on those predictions.

	        Args:
	            fixtures_df: Fixture table; this method reads ``match_date``,
	                ``home_team``, ``away_team``, ``label`` and, when present,
	                ``phase`` and ``is_neutral``.
	            validation_season: Season passed to the holdout helpers.
	            logistic_model: Logistic model to use as-is. When omitted, the
	                instance's own logistic model is fitted on the training split.
	            on_progress: Optional callback invoked as
	                ``on_progress(current, total, stage)`` with stage
	                ``"validacao"`` during prediction and ``"pesos"`` during the
	                weight search.

	        Returns:
	            The metrics of the selected weighting (also stored on
	            ``self.metrics``).

	        Raises:
	            ValueError: If either split is empty, or no validation match has
	                any earlier fixtures to predict from.
	        """
	        df = fixtures_df.sort_values("match_date").copy()
	        train_df = wc_holdout_train_df(df, validation_season)
	        valid_df = wc_holdout_test_df(df, validation_season)

	        if train_df.empty or valid_df.empty:
	            raise ValueError(
	                f"Não foi possível separar treino/validação para a temporada {validation_season}."
	            )

	        if logistic_model is not None:
	            self.logistic = logistic_model
	        else:
	            self.logistic.fit(train_df, holdout_season=None)
	        # A Dixon-Coles model that is already fitted is reused untouched.
	        if not self.dixon_coles._fitted:
	            self.dixon_coles.fit(df, holdout_season=validation_season)

	        elo_timeline = precompute_elo_timeline(df)
	        # Parse the date column once; it is identical for every validation row.
	        match_dates = pd.to_datetime(df["match_date"], utc=True)

	        base_rows: list[dict] = []
	        valid_total = len(valid_df)
	        for valid_index, (_, row) in enumerate(valid_df.iterrows(), start=1):
	            before = row["match_date"]
	            history = df[match_dates < pd.to_datetime(before, utc=True)]
	            if history.empty:
	                continue

	            phase = row.get("phase", "group")
	            is_neutral = bool(row.get("is_neutral", True))

	            features = build_match_features(
	                history,
	                row["home_team"],
	                row["away_team"],
	                before_date=before,
	                phase=phase,
	                is_neutral=is_neutral,
	                elo_timeline=elo_timeline,
	            )
	            dc = self.dixon_coles.predict(
	                history,
	                row["home_team"],
	                row["away_team"],
	                features=features,
	                before_date=before,
	            )
	            logistic = self.logistic.predict_match(
	                history,
	                row["home_team"],
	                row["away_team"],
	                phase=phase,
	                is_neutral=is_neutral,
	            )
	            base_rows.append(
	                {
	                    "label": row["label"],
	                    "dixon_coles": self._outcome_probs(dc),
	                    "logistic": self._outcome_probs(logistic),
	                }
	            )
	            # Report on the first row, every fifth row and the last row.
	            if on_progress and (
	                valid_index == 1 or valid_index % 5 == 0 or valid_index == valid_total
	            ):
	                on_progress(valid_index, valid_total, "validacao")

	        if not base_rows:
	            raise ValueError("Não foi possível gerar previsões para calibração.")

	        hp = get_wc_hyperparams()
	        steps = max(hp.ensemble_weight_steps, 1)
	        candidates: list[dict] = []
	        for step in range(steps + 1):
	            if on_progress:
	                on_progress(step + 1, steps + 1, "pesos")
	            candidates.append(self._score_weight(base_rows, step / steps))

	        # ``min`` keeps the first minimum, so ties go to the lower
	        # Dixon-Coles weight, as a strict "<" comparison loop would.
	        best = min(candidates, key=lambda candidate: candidate["brier"])

	        self._dixon_coles_weight = float(best["dw"])
	        self.metrics = CollaborativeMetrics(
	            dixon_coles_weight=float(best["dw"]),
	            logistic_weight=float(best["lw"]),
	            accuracy=float(best["accuracy"]),
	            brier_score=float(best["brier"]),
	            log_loss=float(best["log_loss"]),
	            validation_size=int(best["size"]),
	        )
	        return self.metrics

	    def predict(
	        self,
	        fixtures_df: pd.DataFrame,
	        home_team: str,
	        away_team: str,
	        *,
	        phase: str = "group",
	        is_neutral: bool = True,
	        before_date: datetime | None = None,
	    ) -> dict[str, float]:
	        """Return blended, normalised probabilities for one fixture.

	        If the ensemble has no metrics yet, ``fit(fixtures_df)`` is run first
	        with its default arguments.

	        Args:
	            fixtures_df: Fixture table with a ``match_date`` column.
	            home_team: Home side, forwarded to the base models.
	            away_team: Away side, forwarded to the base models.
	            phase: Forwarded to the feature builder and the logistic model.
	            is_neutral: Forwarded to the feature builder and the logistic model.
	            before_date: Only fixtures dated strictly before this moment are
	                used as history; defaults to the current UTC time. A naive
	                value is treated as UTC for the filtering step.

	        Returns:
	            Mapping with keys ``"1"``, ``"X"`` and ``"2"`` whose values sum to 1.
	        """
	        if self.metrics is None:
	            self.fit(fixtures_df)

	        ref = before_date or datetime.now(timezone.utc)
	        dt = pd.to_datetime(fixtures_df["match_date"], utc=True)
	        ref_ts = pd.Timestamp(ref if ref.tzinfo else ref.replace(tzinfo=timezone.utc))
	        history = fixtures_df[dt < ref_ts]
	        if history.empty:
	            # Nothing precedes the reference date: fall back to every fixture.
	            history = fixtures_df

	        features = build_match_features(
	            history,
	            home_team,
	            away_team,
	            before_date=ref,
	            phase=phase,
	            is_neutral=is_neutral,
	        )
	        dc = self.dixon_coles.predict(
	            history, home_team, away_team, features=features, before_date=ref
	        )
	        logistic = self.logistic.predict_match(
	            history,
	            home_team,
	            away_team,
	            phase=phase,
	            is_neutral=is_neutral,
	        )

	        return self._blend(
	            self._outcome_probs(dc),
	            self._outcome_probs(logistic),
	            self.dixon_coles_weight,
	        )