# Source: 2026_MLB_Model/analytics/batter_realization.py
# Commit abeeae7 (verified) by Syntrex: "Create batter_realization.py"
# File size: 2.95 kB
from __future__ import annotations
from typing import Any
import pandas as pd
def _safe_text(value: Any) -> str:
return str(value or "").strip()
def _event_is_hit(event: str) -> int:
e = str(event or "").strip().lower()
return int(e in {"single", "double", "triple", "home_run"})
def _event_is_hr(event: str) -> int:
e = str(event or "").strip().lower()
return int(e == "home_run")
def _event_total_bases(event: str) -> int:
e = str(event or "").strip().lower()
if e == "single":
return 1
if e == "double":
return 2
if e == "triple":
return 3
if e == "home_run":
return 4
return 0
def build_batter_realization_rows(
    batter_prop_outcomes_df: pd.DataFrame,
    statcast_df: pd.DataFrame,
    graded_at: str,
) -> pd.DataFrame:
    """Grade batter prop outcome rows against the loaded Statcast events.

    First-pass realized batter outcome scaffold. For every outcome row with
    a usable ``batter_name``, all Statcast rows whose stripped
    ``player_name`` equals that name are used to determine whether the
    batter:

    - recorded at least one hit (``realized_hit``)
    - recorded at least one home run (``realized_hr``)
    - recorded 2+ total bases (``realized_tb2p``)

    This is not yet game-perfect by ``game_pk``: events are pooled across
    everything currently loaded in ``statcast_df``. It upgrades outcomes
    from pending to observed values within that loaded window.

    Args:
        batter_prop_outcomes_df: Outcome rows to grade; the batter is read
            from the ``batter_name`` column.
        statcast_df: Event rows; must contain ``player_name`` and ``events``
            columns.
        graded_at: Value written to the ``graded_at`` column of every
            output row.

    Returns:
        A DataFrame with one row per outcome row that has a non-empty
        batter name. Each row carries the original outcome fields plus
        ``graded_at``, ``realized_hit``, ``realized_hr``, ``realized_tb2p``,
        ``grade_status`` and ``outcome_source``. Batters with no matching
        Statcast rows keep ``None`` for the realized fields and are marked
        ``"pending"`` / ``"statcast_no_match"``. An empty DataFrame is
        returned when either input is ``None`` or empty, or when
        ``statcast_df`` lacks a required column.
    """
    if batter_prop_outcomes_df is None or batter_prop_outcomes_df.empty:
        return pd.DataFrame()
    if statcast_df is None or statcast_df.empty:
        return pd.DataFrame()
    if "player_name" not in statcast_df.columns or "events" not in statcast_df.columns:
        return pd.DataFrame()

    # Index events by normalised player name once, instead of re-normalising
    # and re-scanning the whole Statcast frame for every outcome row.
    # Rows with a null player_name are dropped first: ``astype(str)`` would
    # turn them into the text "nan" and make them matchable by name.
    has_name = statcast_df["player_name"].notna()
    player_names = statcast_df.loc[has_name, "player_name"].astype(str).str.strip()
    event_labels = statcast_df.loc[has_name, "events"].fillna("").astype(str)
    events_by_player: dict[str, list[str]] = {}
    for player_name, event in zip(player_names, event_labels):
        events_by_player.setdefault(player_name, []).append(event)

    rows: list[dict[str, Any]] = []
    for _, outcome_row in batter_prop_outcomes_df.iterrows():
        raw_name = outcome_row.get("batter_name")
        # A missing name must be skipped like an empty one, not stringified
        # to "nan" (NaN) and not truth-tested (pd.NA raises TypeError).
        if pd.api.types.is_scalar(raw_name) and pd.isna(raw_name):
            continue
        batter_name = _safe_text(raw_name)
        if not batter_name:
            continue

        player_events = events_by_player.get(batter_name)
        if not player_events:
            realized_hit = None
            realized_hr = None
            realized_tb2p = None
            grade_status = "pending"
            outcome_source = "statcast_no_match"
        else:
            total_bases = sum(_event_total_bases(e) for e in player_events)
            realized_hit = int(any(_event_is_hit(e) for e in player_events))
            realized_hr = int(any(_event_is_hr(e) for e in player_events))
            realized_tb2p = int(total_bases >= 2)
            grade_status = "graded"
            outcome_source = "statcast_loaded_window"

        row_dict = outcome_row.to_dict()
        row_dict["graded_at"] = graded_at
        row_dict["realized_hit"] = realized_hit
        row_dict["realized_hr"] = realized_hr
        row_dict["realized_tb2p"] = realized_tb2p
        row_dict["grade_status"] = grade_status
        row_dict["outcome_source"] = outcome_source
        rows.append(row_dict)

    return pd.DataFrame(rows)