2026_MLB_Model / models /batter_trend_model.py
Syntrex's picture
Batch 12A-12G: Player Analysis Suite — batter shape taxonomy, trend model, archetype classifier, pitcher contact-shape enrichment
32229d8
raw
history blame
7.57 kB
from __future__ import annotations
"""
Batch 12B — Batter Trend Model
Computes per-batter rolling quality metrics over 7d and 30d windows relative to a
reference date (usually the scheduled game date). Produces trend-direction flags
for use as output/debug fields in the simulator.
Rules:
- descriptive stats return None when the window has < 5 rows
- boolean flags return False when descriptive stats are None
- reference_date in the simulator path MUST come from game context,
not datetime.now()
"""
from datetime import date, datetime
from typing import Any
import pandas as pd
# ---------------------------------------------------------------------------
# helpers (defined locally; intended to mirror the formulas used in batter_baseline)
# ---------------------------------------------------------------------------
def _percentile(series: pd.Series, q: float) -> float | None:
    """Return the *q*-quantile of the numeric values in *series*.

    Entries that cannot be coerced to a number are discarded first. When
    fewer than 5 numeric observations remain the sample is considered too
    small and ``None`` is returned instead of a value.
    """
    values = pd.to_numeric(series, errors="coerce").dropna()
    return float(values.quantile(q)) if len(values) >= 5 else None
def _safe_mean(series: pd.Series) -> float | None:
    """Return the mean of the numeric values in *series*.

    Non-numeric and missing entries are ignored. ``None`` is returned when
    fewer than 5 numeric observations are available.
    """
    values = pd.to_numeric(series, errors="coerce").dropna()
    if len(values) >= 5:
        return float(values.mean())
    return None
def _barrel_rate(launch_speed: pd.Series, launch_angle: pd.Series) -> float | None:
    """Return the fraction of batted balls that fall inside the barrel zone.

    Intended to mirror the barrel approximation of
    ``batter_baseline._build_barrel_mask``. Only rows where both the launch
    speed and the launch angle are numeric are counted; ``None`` is returned
    when fewer than 5 such rows exist.
    """
    pairs = pd.DataFrame(
        {
            "ls": pd.to_numeric(launch_speed, errors="coerce"),
            "la": pd.to_numeric(launch_angle, errors="coerce"),
        }
    ).dropna()
    if len(pairs) < 5:
        return None

    speed = pairs["ls"]
    angle = pairs["la"]
    # (minimum launch speed, lowest angle, highest angle): the accepted angle
    # band widens as the ball is hit harder. Angle bounds are inclusive.
    tiers = (
        (98, 26, 30),
        (99, 25, 31),
        (100, 23, 33),
        (102, 20, 35),
    )
    is_barrel = pd.Series(False, index=pairs.index)
    for min_speed, low_angle, high_angle in tiers:
        is_barrel = is_barrel | (
            (speed >= min_speed) & angle.between(low_angle, high_angle)
        )
    return float(is_barrel.mean())
def _parse_reference_date(reference_date: Any) -> date | None:
    """Normalise a game-context date value to a ``datetime.date``.

    Parameters
    ----------
    reference_date : str | date | datetime | None
        ``date``/``datetime`` objects are reduced to their calendar date.
        Strings must start with a ``YYYY-MM-DD`` date, optionally followed by
        a time (``HH:MM`` or ``HH:MM:SS``) separated by ``T`` or a single
        space. Surrounding whitespace, a trailing ``Z`` and anything beyond
        the seconds field (fractional seconds, UTC offset) are ignored.

    Returns
    -------
    The parsed date, or ``None`` when the value is ``None``, ``pd.NaT``, an
    unparseable string, or an unsupported type.
    """
    # pd.NaT passes the datetime isinstance check below but carries no date.
    if reference_date is None or reference_date is pd.NaT:
        return None
    if isinstance(reference_date, datetime):
        # NOTE(review): a UTC timestamp is truncated to its UTC calendar date,
        # which can differ from the local game date - confirm with callers.
        return reference_date.date()
    if isinstance(reference_date, date):
        return reference_date
    if isinstance(reference_date, str):
        # Tolerate padding and the space-separated form produced by
        # str(datetime); keep at most "YYYY-MM-DDTHH:MM:SS" and drop a "Z".
        text = reference_date.strip().replace(" ", "T", 1)[:19].rstrip("Z")
        for fmt in ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M"):
            try:
                return datetime.strptime(text, fmt).date()
            except ValueError:
                continue
    return None
def _window_df(
    player_df: pd.DataFrame,
    ref: date,
    days: int,
) -> pd.DataFrame:
    """Return the rows of *player_df* dated within ``[ref - days, ref)``.

    The lower bound is inclusive and *ref* itself is excluded. Rows whose
    ``game_date`` cannot be parsed never match. A frame without a
    ``game_date`` column yields an empty frame with the same columns.
    """
    if "game_date" in player_df.columns:
        window_end = pd.Timestamp(ref)
        window_start = window_end - pd.Timedelta(days=days)
        played_on = pd.to_datetime(player_df["game_date"], errors="coerce")
        in_window = (played_on >= window_start) & (played_on < window_end)
        return player_df[in_window]
    # No date information at all: hand back a zero-row slice.
    return player_df.iloc[0:0]
# ---------------------------------------------------------------------------
# public API
# ---------------------------------------------------------------------------
# Template for "no trend information": build_batter_trend_row returns a fresh
# copy of this mapping whenever it cannot compute anything for the player.
_EMPTY_TREND: dict[str, Any] = dict(
    # windowed descriptive stats
    ev90_7d=None,
    ev90_30d=None,
    barrel_7d=None,
    barrel_30d=None,
    xwoba_7d=None,
    xwoba_30d=None,
    # 7-day value minus the all-data baseline
    trend_delta_ev90=None,
    trend_delta_barrel=None,
    # sample sizes are pitch-event counts (~4 per PA), not AB counts
    trend_sample_size_7d=0,
    trend_sample_size_30d=0,
    # flags stay False when the underlying stats are unavailable
    batter_hot_flag=False,
    batter_cold_flag=False,
    batter_trending_up_flag=False,
)
def build_batter_trend_row(
    statcast_df: pd.DataFrame,
    player_name: str,
    reference_date: Any = None,
) -> dict[str, Any]:
    """
    Compute trend fields for *player_name* relative to *reference_date*.

    Parameters
    ----------
    statcast_df : DataFrame
        Full batter Statcast dataset (may contain multiple players).
    player_name : str
        Exact player name as it appears in statcast_df["player_name"].
    reference_date : str | date | datetime | None
        Game date from game_row context. In the simulator path this MUST be
        supplied from game_row["game_datetime_utc"] or game_row["game_date"].
        If it is None or cannot be parsed, no window can be anchored and the
        empty trend row is returned (all stats None, sample sizes 0, flags
        False).

    Returns
    -------
    dict with trend fields. Windowed stats cover [reference_date - 7d/30d,
    reference_date) and are None when fewer than 5 usable rows exist. The
    deltas compare the 7-day value against the baseline computed from every
    row available for the player. Sample sizes are row counts of the
    windows. Boolean flags are False when the underlying delta is None.
    """
    if statcast_df is None or statcast_df.empty:
        return dict(_EMPTY_TREND)

    # Resolve the anchor date first: without it every output is empty, so
    # there is no point filtering rows or computing a baseline.
    ref = _parse_reference_date(reference_date)
    if ref is None:
        return dict(_EMPTY_TREND)

    try:
        player_df = statcast_df[
            statcast_df["player_name"].astype(str) == str(player_name)
        ]
    except Exception:
        # Deliberate best effort: a frame without a usable "player_name"
        # column yields the empty row instead of failing the caller.
        return dict(_EMPTY_TREND)
    if player_df.empty:
        return dict(_EMPTY_TREND)

    def column(frame: pd.DataFrame, name: str) -> pd.Series:
        # A missing column degrades to an empty series, which the stat
        # helpers turn into None.
        return frame.get(name, pd.Series(dtype=float))

    # Baseline over all rows available for the player.
    # NOTE(review): this includes rows on/after reference_date if the frame
    # contains any - confirm that is acceptable for back-testing.
    ev90_season = _percentile(column(player_df, "launch_speed"), 0.90)
    barrel_season = _barrel_rate(
        column(player_df, "launch_speed"),
        column(player_df, "launch_angle"),
    )

    df7 = _window_df(player_df, ref, 7)
    df30 = _window_df(player_df, ref, 30)
    n7, n30 = len(df7), len(df30)

    ev90_7d = _percentile(column(df7, "launch_speed"), 0.90)
    ev90_30d = _percentile(column(df30, "launch_speed"), 0.90)
    barrel_7d = _barrel_rate(
        column(df7, "launch_speed"),
        column(df7, "launch_angle"),
    )
    barrel_30d = _barrel_rate(
        column(df30, "launch_speed"),
        column(df30, "launch_angle"),
    )
    xwoba_7d = _safe_mean(column(df7, "estimated_woba_using_speedangle"))
    xwoba_30d = _safe_mean(column(df30, "estimated_woba_using_speedangle"))

    # Deltas: 7-day value minus the baseline; None if either side is missing.
    trend_delta_ev90 = None
    if ev90_7d is not None and ev90_season is not None:
        trend_delta_ev90 = round(ev90_7d - ev90_season, 3)
    trend_delta_barrel = None
    if barrel_7d is not None and barrel_season is not None:
        trend_delta_barrel = round(barrel_7d - barrel_season, 4)

    # Flags are gated on a minimum window row count so that a handful of
    # events cannot trigger them.
    enough_7d = n7 >= 15
    batter_hot_flag = bool(
        trend_delta_ev90 is not None and trend_delta_ev90 > 1.5 and enough_7d
    )
    batter_cold_flag = bool(
        trend_delta_ev90 is not None and trend_delta_ev90 < -2.0 and enough_7d
    )
    # NOTE(review): the barrel delta is a 7-day figure but the gate uses the
    # 30-day row count - confirm this is intended.
    batter_trending_up_flag = bool(
        trend_delta_barrel is not None
        and trend_delta_barrel > 0.02
        and n30 >= 30
    )

    return {
        "ev90_7d": ev90_7d,
        "ev90_30d": ev90_30d,
        "barrel_7d": barrel_7d,
        "barrel_30d": barrel_30d,
        "xwoba_7d": xwoba_7d,
        "xwoba_30d": xwoba_30d,
        "trend_delta_ev90": trend_delta_ev90,
        "trend_delta_barrel": trend_delta_barrel,
        "trend_sample_size_7d": n7,  # pitch-event count, not AB count
        "trend_sample_size_30d": n30,  # pitch-event count, not AB count
        "batter_hot_flag": batter_hot_flag,
        "batter_cold_flag": batter_cold_flag,
        "batter_trending_up_flag": batter_trending_up_flag,
    }