2026_MLB_Model / analytics /evaluation_metrics.py
Syntrex's picture
Promote strikeout v2 and harden telemetry models
50dc123
raw
history blame
13.5 kB
from __future__ import annotations
import math
import pandas as pd
def _resolve_expected_prob(df: pd.DataFrame) -> pd.Series | None:
    """Select the expected-probability series from an audit frame.

    The probability columns ``fair_prob``, ``model_k_prob`` and
    ``model_hr_prob`` are tried in that order.  Each is coerced to numeric
    (unparseable entries become NaN) and the first one holding at least one
    non-null value is returned.  If none qualifies and a ``fair_hr_odds``
    column exists, it is converted element-wise with ``_american_to_prob``.

    Returns:
        The chosen series, or ``None`` when no usable column is present.
    """
    for column in ("fair_prob", "model_k_prob", "model_hr_prob"):
        if column not in df.columns:
            continue
        candidate = pd.to_numeric(df[column], errors="coerce")
        # A column that is present but entirely null does not count; keep looking.
        if candidate.notna().any():
            return candidate

    if "fair_hr_odds" not in df.columns:
        return None
    # Last resort: derive probabilities from the American-odds column.
    return df["fair_hr_odds"].apply(_american_to_prob)
def _resolve_realized_outcome(df: pd.DataFrame) -> pd.Series | None:
    """Select the realized-outcome series from an audit frame.

    The first column present among ``realized_outcome``, ``realized_hr``,
    ``realized_k_over`` and ``realized_win`` (checked in that order) is
    returned after numeric coercion; unparseable entries become NaN.

    Returns:
        The coerced series, or ``None`` if none of the columns exists.
    """
    candidates = ("realized_outcome", "realized_hr", "realized_k_over", "realized_win")
    present = next((name for name in candidates if name in df.columns), None)
    if present is None:
        return None
    return pd.to_numeric(df[present], errors="coerce")
def build_hr_calibration_table(audit_df: pd.DataFrame) -> pd.DataFrame:
    """
    Compare predicted HR probability vs realized HR rate.

    Rows with a usable ``hr_prob`` are bucketed into fixed probability bands
    (0-2%, 2-4%, 4-6%, 6-8%, 8-10%, 10-15%, 15-20%, 20-100%).  Values that
    fall outside ``[0, 1]`` land in no band and are left out of every group.

    Args:
        audit_df: Audit frame; must contain ``hr_prob`` and ``realized_hr``.

    Returns:
        One row per band (empty bands included) with columns ``hr_prob``
        (the interval), ``predictions``, ``avg_pred_prob`` and
        ``realized_hr_rate``.  An empty DataFrame is returned when the input
        is ``None``, empty, or lacks a required column.
    """
    if audit_df is None or audit_df.empty:
        return pd.DataFrame()
    if "hr_prob" not in audit_df.columns or "realized_hr" not in audit_df.columns:
        return pd.DataFrame()

    df = audit_df.copy()
    # Coerce so string-typed or dirty inputs become NaN instead of raising.
    df["hr_prob"] = pd.to_numeric(df["hr_prob"], errors="coerce")
    df["realized_hr"] = pd.to_numeric(df["realized_hr"], errors="coerce")
    df = df[df["hr_prob"].notna()]

    bins = pd.cut(
        df["hr_prob"],
        bins=[0, .02, .04, .06, .08, .10, .15, .20, 1],
        include_lowest=True,
    )

    table = (
        # observed=False is spelled out so empty bands stay in the table no
        # matter which default the installed pandas version uses.
        df.groupby(bins, observed=False)
        .agg(
            predictions=("hr_prob", "count"),
            avg_pred_prob=("hr_prob", "mean"),
            realized_hr_rate=("realized_hr", "mean"),
        )
        .reset_index()
    )

    # Empty bands have no mean; they are reported as a 0 realized rate.
    table["realized_hr_rate"] = table["realized_hr_rate"].fillna(0)
    return table
def build_edge_bucket_table(audit_df: pd.DataFrame) -> pd.DataFrame:
    """Summarize realized HR rate per ``adjusted_edge`` bucket.

    Rows with a usable ``adjusted_edge`` are bucketed on the fixed edges
    -1, -0.05, -0.02, 0, 0.02, 0.04, 0.06, 0.10, 1.  Edges outside
    ``[-1, 1]`` fall in no bucket and are left out of every group.

    Args:
        audit_df: Audit frame; must contain ``adjusted_edge`` and
            ``realized_hr``.

    Returns:
        One row per bucket (empty buckets included) with columns
        ``adjusted_edge`` (the interval), ``samples``, ``avg_edge`` and
        ``hr_rate``.  An empty DataFrame is returned when the input is
        ``None``, empty, or lacks a required column.
    """
    if audit_df is None or audit_df.empty:
        return pd.DataFrame()
    if "adjusted_edge" not in audit_df.columns or "realized_hr" not in audit_df.columns:
        return pd.DataFrame()

    df = audit_df.copy()
    # Coerce so string-typed or dirty inputs become NaN instead of raising.
    df["adjusted_edge"] = pd.to_numeric(df["adjusted_edge"], errors="coerce")
    df["realized_hr"] = pd.to_numeric(df["realized_hr"], errors="coerce")
    df = df[df["adjusted_edge"].notna()]

    bins = pd.cut(
        df["adjusted_edge"],
        bins=[-1, -.05, -.02, 0, .02, .04, .06, .10, 1],
        include_lowest=True,
    )

    table = (
        # observed=False is spelled out so empty buckets stay in the table no
        # matter which default the installed pandas version uses.
        df.groupby(bins, observed=False)
        .agg(
            samples=("adjusted_edge", "count"),
            avg_edge=("adjusted_edge", "mean"),
            hr_rate=("realized_hr", "mean"),
        )
        .reset_index()
    )

    # Empty buckets have no mean; they are reported as a 0 rate.
    table["hr_rate"] = table["hr_rate"].fillna(0)
    return table
def build_confidence_table(audit_df: pd.DataFrame) -> pd.DataFrame:
    """Summarize realized HR rate per ``confidence`` bucket.

    ``confidence`` is bucketed on the fixed edges 0, 40, 55, 70, 85, 100.
    Rows whose confidence is missing or outside ``[0, 100]`` fall in no
    bucket and are left out of every group.

    NOTE(review): these edges assume a 0-100 confidence scale, whereas
    ``build_ere_by_confidence_bucket_table`` in this module buckets on
    0.4-0.7 -- confirm which scale the ``confidence`` column really uses.

    Args:
        audit_df: Audit frame; must contain ``confidence`` and
            ``realized_hr``.

    Returns:
        One row per bucket (empty buckets included) with columns
        ``confidence`` (the interval), ``samples`` and ``hr_rate``.  An empty
        DataFrame is returned when the input is ``None``, empty, or lacks a
        required column.
    """
    if audit_df is None or audit_df.empty:
        return pd.DataFrame()
    if "confidence" not in audit_df.columns or "realized_hr" not in audit_df.columns:
        return pd.DataFrame()

    df = audit_df.copy()
    # Coerce so string-typed or dirty inputs become NaN instead of raising.
    df["confidence"] = pd.to_numeric(df["confidence"], errors="coerce")
    df["realized_hr"] = pd.to_numeric(df["realized_hr"], errors="coerce")

    bins = pd.cut(
        df["confidence"],
        bins=[0, 40, 55, 70, 85, 100],
        include_lowest=True,
    )

    table = (
        # observed=False is spelled out so empty buckets stay in the table no
        # matter which default the installed pandas version uses.
        df.groupby(bins, observed=False)
        .agg(
            samples=("confidence", "count"),
            hr_rate=("realized_hr", "mean"),
        )
        .reset_index()
    )

    # Empty buckets have no mean; they are reported as a 0 rate.
    table["hr_rate"] = table["hr_rate"].fillna(0)
    return table
def build_tier_performance_table(audit_df: pd.DataFrame) -> pd.DataFrame:
    """Summarize sample count, HR rate, edge and confidence per tier.

    Args:
        audit_df: Audit frame; must contain ``recommendation_tier``,
            ``realized_hr``, ``adjusted_edge`` and ``confidence``.

    Returns:
        One row per ``recommendation_tier`` with columns ``samples``,
        ``hr_rate``, ``avg_edge`` and ``avg_confidence``.  An empty DataFrame
        is returned when the input is ``None``, empty, or lacks a required
        column.
    """
    required = ("recommendation_tier", "realized_hr", "adjusted_edge", "confidence")
    if audit_df is None or audit_df.empty:
        return pd.DataFrame()
    if not set(required).issubset(audit_df.columns):
        return pd.DataFrame()

    # Work on a copy of just the needed columns so the caller's frame is
    # never mutated by the numeric coercion below.
    df = audit_df[list(required)].copy()
    for column in ("realized_hr", "adjusted_edge", "confidence"):
        # Dirty or string-typed values become NaN and are skipped by mean().
        df[column] = pd.to_numeric(df[column], errors="coerce")

    table = (
        df.groupby("recommendation_tier")
        .agg(
            samples=("recommendation_tier", "count"),
            hr_rate=("realized_hr", "mean"),
            avg_edge=("adjusted_edge", "mean"),
            avg_confidence=("confidence", "mean"),
        )
        .reset_index()
    )

    # A tier with no usable outcomes has no mean; report it as a 0 rate.
    table["hr_rate"] = table["hr_rate"].fillna(0)
    return table
def _safe_float(value) -> float | None:
    """Convert ``value`` to ``float``, returning ``None`` when it is unusable.

    ``None``, and anything whose stripped, lower-cased text form is empty,
    ``"nan"`` or ``"none"``, maps to ``None``.  Any exception raised during
    conversion is swallowed and also yields ``None``.
    """
    if value is None:
        return None
    try:
        normalized = str(value).strip().lower()
        # The text form is only used to spot placeholders; the conversion
        # itself is applied to the original value.
        return None if normalized in ("", "nan", "none") else float(value)
    except Exception:
        return None
def _american_to_prob(odds) -> float | None:
    """Convert American odds to the implied probability.

    Positive odds ``+X`` map to ``100 / (X + 100)``; negative odds ``-X`` map
    to ``X / (X + 100)``.  Returns ``None`` when ``odds`` cannot be parsed by
    ``_safe_float`` or is exactly zero.
    """
    line = _safe_float(odds)
    if line is None:
        return None
    if line < 0:
        risk = abs(line)
        return risk / (risk + 100.0)
    if line > 0:
        return 100.0 / (line + 100.0)
    # Zero is not a valid American price.
    return None
def _bucket_series(values: pd.Series, edges: list[float], labels: list[str]) -> pd.Series:
    """Bucket ``values`` into labelled bins with ``pd.cut``.

    The lowest edge is inclusive.  If ``pd.cut`` raises for any reason, a
    series of ``None`` with the same index as ``values`` is returned instead
    (best-effort behaviour; the error is not surfaced).
    """
    try:
        bucketed = pd.cut(values, bins=edges, labels=labels, include_lowest=True)
    except Exception:
        bucketed = pd.Series([None] * len(values), index=values.index)
    return bucketed
def build_ere_table(audit_df: pd.DataFrame) -> pd.DataFrame:
    """
    Global Edge Realization Efficiency.
    ERE = realized outcomes / expected outcomes

    Expected probabilities and realized outcomes are located with
    ``_resolve_expected_prob`` / ``_resolve_realized_outcome``.  Rows without
    an expected probability are dropped; a missing realized outcome is
    counted as 0.

    NOTE(review): counting missing outcomes as 0 lowers ERE if the frame can
    contain unsettled rows -- confirm this is intended.

    Returns:
        A single-row frame with ``bets``, ``expected_hr_total``,
        ``actual_hr_total`` and ``ere`` (``None`` when the expected total is
        not positive), or an empty DataFrame when nothing can be computed.
    """
    if audit_df is None or audit_df.empty:
        return pd.DataFrame()

    frame = audit_df.copy()

    outcome_series = _resolve_realized_outcome(frame)
    if outcome_series is None:
        return pd.DataFrame()

    prob_series = _resolve_expected_prob(frame)
    if prob_series is None:
        return pd.DataFrame()

    frame["expected_prob"] = prob_series.apply(_safe_float)
    frame["realized_outcome"] = outcome_series.apply(_safe_float).fillna(0.0)

    has_prob = frame["expected_prob"].notna()
    frame = frame[has_prob].copy()
    if frame.empty:
        return pd.DataFrame()

    expected_total = float(frame["expected_prob"].sum())
    realized_total = float(frame["realized_outcome"].sum())

    # The ratio is undefined when nothing was expected.
    if expected_total > 0:
        ere_value = round(realized_total / expected_total, 4)
    else:
        ere_value = None

    row = {
        "bets": int(len(frame)),
        "expected_hr_total": round(expected_total, 4),
        "actual_hr_total": round(realized_total, 4),
        "ere": ere_value,
    }
    return pd.DataFrame([row])
def build_ere_by_edge_bucket_table(audit_df: pd.DataFrame) -> pd.DataFrame:
    """Edge Realization Efficiency (realized / expected) per edge bucket.

    ``adjusted_edge`` is bucketed into ``<2%``, ``2-4%``, ``4-6%``, ``6-8%``
    and ``8%+``.  Rows lacking an edge or an expected probability are
    dropped; a missing realized outcome is counted as 0.

    Returns:
        One row per bucket (empty buckets included) with ``edge_bucket``,
        ``bets``, ``expected_hr_total``, ``actual_hr_total`` and ``ere``
        (NaN when the expected total is not positive).  An empty DataFrame is
        returned when the input is ``None``/empty or a required column
        cannot be resolved.
    """
    if audit_df is None or audit_df.empty:
        return pd.DataFrame()

    df = audit_df.copy()
    realized = _resolve_realized_outcome(df)
    if "adjusted_edge" not in df.columns or realized is None:
        return pd.DataFrame()
    expected = _resolve_expected_prob(df)
    if expected is None:
        return pd.DataFrame()

    # _safe_float yields None for unusable values; to_numeric then guarantees
    # a float column (NaN for None) even when every value is unusable.
    df["expected_prob"] = pd.to_numeric(expected.apply(_safe_float), errors="coerce")
    df["adjusted_edge"] = pd.to_numeric(df["adjusted_edge"].apply(_safe_float), errors="coerce")
    df["realized_outcome"] = pd.to_numeric(
        realized.apply(_safe_float), errors="coerce"
    ).fillna(0.0)

    df = df[df["adjusted_edge"].notna() & df["expected_prob"].notna()].copy()
    if df.empty:
        return pd.DataFrame()

    edges = [-math.inf, 0.02, 0.04, 0.06, 0.08, math.inf]
    labels = ["<2%", "2-4%", "4-6%", "6-8%", "8%+"]
    df["edge_bucket"] = _bucket_series(df["adjusted_edge"], edges, labels)

    grouped = (
        # observed=False is spelled out so empty buckets stay in the table no
        # matter which default the installed pandas version uses.
        df.groupby("edge_bucket", dropna=False, observed=False)
        .agg(
            bets=("realized_outcome", "size"),
            expected_hr_total=("expected_prob", "sum"),
            actual_hr_total=("realized_outcome", "sum"),
        )
        .reset_index()
    )

    # Vectorized ratio: a non-positive expected total is masked to NaN so the
    # result is always a float column that can be rounded.
    positive_expected = grouped["expected_hr_total"].where(grouped["expected_hr_total"] > 0)
    grouped["ere"] = (grouped["actual_hr_total"] / positive_expected).round(4)

    grouped["expected_hr_total"] = grouped["expected_hr_total"].round(4)
    grouped["actual_hr_total"] = grouped["actual_hr_total"].round(4)
    return grouped
def build_ere_by_confidence_bucket_table(audit_df: pd.DataFrame) -> pd.DataFrame:
    """Edge Realization Efficiency (realized / expected) per confidence bucket.

    ``confidence`` is bucketed into ``<0.40``, ``0.40-0.50``, ``0.50-0.60``,
    ``0.60-0.70`` and ``0.70+``.  Rows lacking a confidence or an expected
    probability are dropped; a missing realized outcome is counted as 0.

    NOTE(review): these edges assume a 0-1 confidence scale, whereas
    ``build_confidence_table`` in this module buckets on 0-100 -- confirm
    which scale the ``confidence`` column really uses.  On a 0-100 scale
    nearly every row would land in ``0.70+``.

    Returns:
        One row per bucket (empty buckets included) with
        ``confidence_bucket``, ``bets``, ``expected_hr_total``,
        ``actual_hr_total`` and ``ere`` (NaN when the expected total is not
        positive).  An empty DataFrame is returned when the input is
        ``None``/empty or a required column cannot be resolved.
    """
    if audit_df is None or audit_df.empty:
        return pd.DataFrame()

    df = audit_df.copy()
    realized = _resolve_realized_outcome(df)
    if "confidence" not in df.columns or realized is None:
        return pd.DataFrame()
    expected = _resolve_expected_prob(df)
    if expected is None:
        return pd.DataFrame()

    # _safe_float yields None for unusable values; to_numeric then guarantees
    # a float column (NaN for None) even when every value is unusable.
    df["expected_prob"] = pd.to_numeric(expected.apply(_safe_float), errors="coerce")
    df["confidence"] = pd.to_numeric(df["confidence"].apply(_safe_float), errors="coerce")
    df["realized_outcome"] = pd.to_numeric(
        realized.apply(_safe_float), errors="coerce"
    ).fillna(0.0)

    df = df[df["confidence"].notna() & df["expected_prob"].notna()].copy()
    if df.empty:
        return pd.DataFrame()

    edges = [-math.inf, 0.4, 0.5, 0.6, 0.7, math.inf]
    labels = ["<0.40", "0.40-0.50", "0.50-0.60", "0.60-0.70", "0.70+"]
    df["confidence_bucket"] = _bucket_series(df["confidence"], edges, labels)

    grouped = (
        # observed=False is spelled out so empty buckets stay in the table no
        # matter which default the installed pandas version uses.
        df.groupby("confidence_bucket", dropna=False, observed=False)
        .agg(
            bets=("realized_outcome", "size"),
            expected_hr_total=("expected_prob", "sum"),
            actual_hr_total=("realized_outcome", "sum"),
        )
        .reset_index()
    )

    # Vectorized ratio: a non-positive expected total is masked to NaN so the
    # result is always a float column that can be rounded.
    positive_expected = grouped["expected_hr_total"].where(grouped["expected_hr_total"] > 0)
    grouped["ere"] = (grouped["actual_hr_total"] / positive_expected).round(4)

    grouped["expected_hr_total"] = grouped["expected_hr_total"].round(4)
    grouped["actual_hr_total"] = grouped["actual_hr_total"].round(4)
    return grouped
def build_ere_by_tier_table(audit_df: pd.DataFrame) -> pd.DataFrame:
    """Edge Realization Efficiency (realized / expected) per recommendation tier.

    Rows without an expected probability or with a blank/missing tier are
    dropped; a missing realized outcome is counted as 0.

    Returns:
        One row per tier with ``recommendation_tier``, ``bets``,
        ``expected_hr_total``, ``actual_hr_total`` and ``ere``, or an empty
        DataFrame when the input is ``None``/empty or a required column
        cannot be resolved.
    """
    if audit_df is None or audit_df.empty:
        return pd.DataFrame()

    frame = audit_df.copy()

    outcome_series = _resolve_realized_outcome(frame)
    if "recommendation_tier" not in frame.columns or outcome_series is None:
        return pd.DataFrame()

    prob_series = _resolve_expected_prob(frame)
    if prob_series is None:
        return pd.DataFrame()

    frame["expected_prob"] = prob_series.apply(_safe_float)
    frame["realized_outcome"] = outcome_series.apply(_safe_float).fillna(0.0)
    # Missing tiers are normalised to "" so they can be filtered with blanks.
    frame["recommendation_tier"] = frame["recommendation_tier"].fillna("").astype(str)

    keep = frame["expected_prob"].notna() & frame["recommendation_tier"].ne("")
    frame = frame[keep].copy()
    if frame.empty:
        return pd.DataFrame()

    def _row_ere(row):
        # The ratio is undefined unless the expected total is positive.
        expected_sum = row["expected_hr_total"]
        if not expected_sum or not expected_sum > 0:
            return None
        return row["actual_hr_total"] / expected_sum

    per_tier = frame.groupby("recommendation_tier", dropna=False).agg(
        bets=("realized_outcome", "size"),
        expected_hr_total=("expected_prob", "sum"),
        actual_hr_total=("realized_outcome", "sum"),
    )
    summary = per_tier.reset_index()
    summary["ere"] = summary.apply(_row_ere, axis=1)

    for column in ("expected_hr_total", "actual_hr_total", "ere"):
        summary[column] = summary[column].round(4)
    return summary
def build_clv_table(audit_df: pd.DataFrame) -> pd.DataFrame:
    """
    CLV table. Uses closing odds if available.
    Assumes:
    - book_hr_odds = bet-time odds
    - closing_hr_odds = closing odds (if present)

    CLV is computed per row as closing implied probability minus bet-time
    implied probability, using only rows where both odds convert to a
    probability.

    Returns:
        A single-row frame with ``bets``, ``avg_bet_prob``,
        ``avg_close_prob``, ``avg_clv`` and ``beat_closing_pct`` (share of
        rows with positive CLV), or an empty DataFrame when the input is
        ``None``/empty, an odds column is missing, or no row has both prices.
    """
    if audit_df is None or audit_df.empty:
        return pd.DataFrame()

    df = audit_df.copy()
    if "book_hr_odds" not in df.columns or "closing_hr_odds" not in df.columns:
        return pd.DataFrame()

    # _american_to_prob returns None for unusable odds.  When a whole column
    # is unusable (e.g. closing odds not populated yet) apply() leaves an
    # object column of None, so coerce to float before doing arithmetic.
    df["bet_prob"] = pd.to_numeric(df["book_hr_odds"].apply(_american_to_prob), errors="coerce")
    df["close_prob"] = pd.to_numeric(df["closing_hr_odds"].apply(_american_to_prob), errors="coerce")

    df = df[df["bet_prob"].notna() & df["close_prob"].notna()].copy()
    if df.empty:
        return pd.DataFrame()

    # Computed after filtering so only rows with both prices are involved.
    df["clv"] = df["close_prob"] - df["bet_prob"]

    summary = {
        "bets": int(len(df)),
        "avg_bet_prob": round(float(df["bet_prob"].mean()), 4),
        "avg_close_prob": round(float(df["close_prob"].mean()), 4),
        "avg_clv": round(float(df["clv"].mean()), 4),
        "beat_closing_pct": round(float((df["clv"] > 0).mean()), 4),
    }
    return pd.DataFrame([summary])
def build_clv_by_tier_table(audit_df: pd.DataFrame) -> pd.DataFrame:
    """Closing-line value summary per recommendation tier.

    CLV is computed per row as closing implied probability (from
    ``closing_hr_odds``) minus bet-time implied probability (from
    ``book_hr_odds``).  Rows missing either probability, or with a
    blank/missing tier, are dropped.

    Returns:
        One row per tier with ``recommendation_tier``, ``bets``, ``avg_clv``
        and ``beat_closing_pct`` (share of rows with positive CLV), or an
        empty DataFrame when the input is ``None``/empty, a required column
        is missing, or no usable row remains.
    """
    if audit_df is None or audit_df.empty:
        return pd.DataFrame()

    df = audit_df.copy()
    if (
        "book_hr_odds" not in df.columns
        or "closing_hr_odds" not in df.columns
        or "recommendation_tier" not in df.columns
    ):
        return pd.DataFrame()

    # _american_to_prob returns None for unusable odds.  When a whole column
    # is unusable (e.g. closing odds not populated yet) apply() leaves an
    # object column of None, so coerce to float before doing arithmetic.
    df["bet_prob"] = pd.to_numeric(df["book_hr_odds"].apply(_american_to_prob), errors="coerce")
    df["close_prob"] = pd.to_numeric(df["closing_hr_odds"].apply(_american_to_prob), errors="coerce")
    # Missing tiers are normalised to "" so they can be filtered with blanks.
    df["recommendation_tier"] = df["recommendation_tier"].fillna("").astype(str)

    df = df[
        df["bet_prob"].notna()
        & df["close_prob"].notna()
        & df["recommendation_tier"].ne("")
    ].copy()
    if df.empty:
        return pd.DataFrame()

    # Computed after filtering so only rows with both prices are involved.
    df["clv"] = df["close_prob"] - df["bet_prob"]

    grouped = (
        df.groupby("recommendation_tier", dropna=False)
        .agg(
            bets=("clv", "size"),
            avg_clv=("clv", "mean"),
            beat_closing_pct=("clv", lambda s: (s > 0).mean()),
        )
        .reset_index()
    )

    grouped["avg_clv"] = grouped["avg_clv"].round(4)
    grouped["beat_closing_pct"] = grouped["beat_closing_pct"].round(4)
    return grouped
def build_props_calibration_table(audit_df: pd.DataFrame, bins: list[float] | None = None) -> pd.DataFrame:
    """Calibration table: average predicted probability vs realized rate per bin.

    Expected probabilities and realized outcomes are located with
    ``_resolve_expected_prob`` / ``_resolve_realized_outcome``; rows missing
    either value are dropped.

    Args:
        audit_df: Audit frame.
        bins: Bin edges for the expected probability.  When omitted (or
            empty) the edges 0, 0.05, 0.10, 0.20, 0.35, 0.50, 0.65, 0.80,
            1.0 are used.

    Returns:
        One row per bin (empty bins included, plus a NaN-keyed row for
        probabilities outside the edges) with ``expected_prob`` (the
        interval), ``samples``, ``avg_pred_prob`` and ``realized_rate``.  An
        empty DataFrame is returned when the input is ``None``/empty or
        nothing can be resolved.
    """
    if audit_df is None or audit_df.empty:
        return pd.DataFrame()

    df = audit_df.copy()
    expected = _resolve_expected_prob(df)
    realized = _resolve_realized_outcome(df)
    if expected is None or realized is None:
        return pd.DataFrame()

    df["expected_prob"] = expected.apply(_safe_float)
    df["realized_outcome"] = realized.apply(_safe_float)
    df = df[df["expected_prob"].notna() & df["realized_outcome"].notna()].copy()
    if df.empty:
        return pd.DataFrame()

    cut_bins = bins or [0, 0.05, 0.10, 0.20, 0.35, 0.50, 0.65, 0.80, 1.0]
    prob_bin = pd.cut(df["expected_prob"], bins=cut_bins, include_lowest=True)

    grouped = (
        # observed=False is spelled out so empty bins stay in the table no
        # matter which default the installed pandas version uses.
        df.groupby(prob_bin, dropna=False, observed=False)
        .agg(
            samples=("expected_prob", "count"),
            avg_pred_prob=("expected_prob", "mean"),
            realized_rate=("realized_outcome", "mean"),
        )
        .reset_index()
    )

    grouped["realized_rate"] = grouped["realized_rate"].round(4)
    grouped["avg_pred_prob"] = grouped["avg_pred_prob"].round(4)
    return grouped