from __future__ import annotations

from typing import Any

import pandas as pd

# Total bases credited for each ``events`` value that counts as a hit.
# Any event not listed here is worth zero bases and is not a hit.
_TOTAL_BASES_BY_EVENT: dict[str, int] = {
    "single": 1,
    "double": 2,
    "triple": 3,
    "home_run": 4,
}


def _is_missing(value: Any) -> bool:
    """Return True when *value* is None or a scalar missing marker (NaN, pd.NA, NaT)."""
    if value is None:
        return True
    # pd.isna() returns an array for array-likes; only scalars can be "missing" here.
    if not pd.api.types.is_scalar(value):
        return False
    return bool(pd.isna(value))


def _safe_text(value: Any) -> str:
    """Return *value* as stripped text, mapping missing and falsy values to ""."""
    # NaN is truthy, so ``str(value or "")`` alone would yield the text "nan",
    # and ``pd.NA or ""`` raises TypeError; handle missing markers up front.
    if _is_missing(value):
        return ""
    return str(value or "").strip()


def _normalize_event(event: Any) -> str:
    """Return the event label stripped and lower-cased ("" when missing)."""
    return _safe_text(event).lower()


def _event_is_hit(event: str) -> int:
    """Return 1 if *event* is a single, double, triple or home run, else 0."""
    return int(_normalize_event(event) in _TOTAL_BASES_BY_EVENT)


def _event_is_hr(event: str) -> int:
    """Return 1 if *event* is a home run, else 0."""
    return int(_normalize_event(event) == "home_run")


def _event_total_bases(event: str) -> int:
    """Return the total bases for *event* (0 for anything that is not a hit)."""
    return _TOTAL_BASES_BY_EVENT.get(_normalize_event(event), 0)


def _summarize_statcast_events(
    statcast_df: pd.DataFrame,
) -> dict[str, tuple[int, int, int]]:
    """Aggregate ``(hits, home_runs, total_bases)`` per stripped ``player_name``.

    Rows whose ``player_name`` is missing are ignored so they can never be
    matched by name. The caller must ensure both columns exist.
    """
    tallies: dict[str, list[int]] = {}
    for raw_name, raw_event in zip(statcast_df["player_name"], statcast_df["events"]):
        if _is_missing(raw_name):
            continue
        tally = tallies.setdefault(str(raw_name).strip(), [0, 0, 0])
        tally[0] += _event_is_hit(raw_event)
        tally[1] += _event_is_hr(raw_event)
        tally[2] += _event_total_bases(raw_event)
    return {name: (t[0], t[1], t[2]) for name, t in tallies.items()}


def build_batter_realization_rows(
    batter_prop_outcomes_df: pd.DataFrame,
    statcast_df: pd.DataFrame,
    graded_at: str,
) -> pd.DataFrame:
    """
    First-pass realized batter outcome scaffold.

    Uses the currently loaded statcast rows to determine whether a batter:
    - recorded at least one hit
    - recorded at least one HR
    - recorded 2+ total bases

    This is not yet game-perfect by game_pk: every loaded statcast row whose
    stripped ``player_name`` equals the stripped ``batter_name`` is counted,
    regardless of which game it belongs to. It upgrades outcomes from pending
    to observed values within the current loaded dataset.

    Args:
        batter_prop_outcomes_df: One row per outcome to grade. Rows whose
            ``batter_name`` is absent, missing (None/NaN) or blank are skipped.
        statcast_df: Loaded statcast rows; must contain ``player_name`` and
            ``events`` columns.
        graded_at: Value copied verbatim into the ``graded_at`` column.

    Returns:
        A DataFrame holding each kept outcome row plus ``graded_at``,
        ``realized_hit``, ``realized_hr``, ``realized_tb2p``, ``grade_status``
        and ``outcome_source``. Batters with no matching statcast rows get
        ``None`` realized values, ``grade_status="pending"`` and
        ``outcome_source="statcast_no_match"``. An empty DataFrame is returned
        when either input is None/empty or the required statcast columns are
        missing.
    """
    if batter_prop_outcomes_df is None or batter_prop_outcomes_df.empty:
        return pd.DataFrame()
    if statcast_df is None or statcast_df.empty:
        return pd.DataFrame()
    if "player_name" not in statcast_df.columns or "events" not in statcast_df.columns:
        return pd.DataFrame()

    # Aggregate once up front instead of re-filtering the whole statcast frame
    # for every outcome row.
    totals_by_player = _summarize_statcast_events(statcast_df)

    rows: list[dict[str, Any]] = []
    for _, outcome_row in batter_prop_outcomes_df.iterrows():
        batter_name = _safe_text(outcome_row.get("batter_name"))
        if not batter_name:
            continue

        totals = totals_by_player.get(batter_name)
        if totals is None:
            realized_hit = None
            realized_hr = None
            realized_tb2p = None
            grade_status = "pending"
            outcome_source = "statcast_no_match"
        else:
            hit_count, hr_count, total_bases = totals
            realized_hit = int(hit_count > 0)
            realized_hr = int(hr_count > 0)
            realized_tb2p = int(total_bases >= 2)
            grade_status = "graded"
            outcome_source = "statcast_loaded_window"

        row_dict = outcome_row.to_dict()
        row_dict["graded_at"] = graded_at
        row_dict["realized_hit"] = realized_hit
        row_dict["realized_hr"] = realized_hr
        row_dict["realized_tb2p"] = realized_tb2p
        row_dict["grade_status"] = grade_status
        row_dict["outcome_source"] = outcome_source
        rows.append(row_dict)

    return pd.DataFrame(rows)