from __future__ import annotations

import math
from typing import Any

import pandas as pd

# Lower-cased pitch descriptions -> coarse pitch family.
PITCH_FAMILY_MAP = {
    "4-seam fastball": "fastball",
    "four-seam fastball": "fastball",
    "fastball": "fastball",
    "sinker": "fastball",
    "cutter": "fastball",
    "slider": "breaking",
    "sweeper": "breaking",
    "curveball": "breaking",
    "knuckle curve": "breaking",
    "slurve": "breaking",
    "changeup": "offspeed",
    "splitter": "offspeed",
    "forkball": "offspeed",
    "split-finger": "offspeed",
    "circle change": "offspeed",
}

# Lower-cased pitch-type abbreviations for the same pitches as above.  Needed
# because the ``pitch_type`` fallback column carries codes ("FF", "SL", ...)
# rather than descriptions, so the name map alone never matches them.
# NOTE(review): codes follow the Statcast abbreviations - confirm against the
# data source actually feeding this module.
PITCH_TYPE_CODE_FAMILY_MAP = {
    "ff": "fastball",
    "fa": "fastball",
    "si": "fastball",
    "fc": "fastball",
    "sl": "breaking",
    "st": "breaking",
    "cu": "breaking",
    "kc": "breaking",
    "sv": "breaking",
    "ch": "offspeed",
    "fs": "offspeed",
    "fo": "offspeed",
}

# Families / location buckets that get their own feature columns.
_PITCH_FAMILIES = ("fastball", "breaking", "offspeed")
_ZONE_BUCKETS = ("heart", "shadow", "chase", "waste")

# Outcome prefixes in the order their keys are inserted into a feature row.
_OUTCOME_KEYS = ("hr", "hit", "tb2p", "whiff", "damage")

# Thresholds for the "damage" proxy (presumably mph exit velocity and xwOBA).
_DAMAGE_LAUNCH_SPEED = 95
_DAMAGE_ESTIMATED_WOBA = 0.500


def _safe_mean(series: pd.Series) -> float | None:
    """Return the mean of the numeric values in *series*, or ``None``.

    Values that cannot be parsed as numbers are dropped; ``None`` is
    returned when nothing numeric remains.
    """
    numeric = pd.to_numeric(series, errors="coerce").dropna()
    if numeric.empty:
        return None
    return float(numeric.mean())


def _safe_rate(series: pd.Series) -> float | None:
    """Return the rate (mean) of a 0/1-style *series*, or ``None`` if empty.

    Same computation as :func:`_safe_mean`; kept as a separate name so call
    sites can express intent.
    """
    return _safe_mean(series)


def _normalize_pitch_family(pitch_name: Any) -> str:
    """Map a pitch description or pitch-type code to a pitch family.

    Returns ``"fastball"``, ``"breaking"``, ``"offspeed"`` or ``"unknown"``.
    Matching ignores case and surrounding whitespace.  Missing values
    (``None``, NaN, ``pd.NA``) and unrecognised labels yield ``"unknown"``.
    """
    # str() instead of ``pitch_name or ""``: truth-testing pd.NA raises.
    text = "" if pitch_name is None else str(pitch_name).strip().lower()
    family = PITCH_FAMILY_MAP.get(text)
    if family is None:
        family = PITCH_TYPE_CODE_FAMILY_MAP.get(text, "unknown")
    return family


def classify_zone_bucket(plate_x: Any, plate_z: Any) -> str:
    """Classify a pitch location into a coarse zone bucket.

    Approximate strike-zone guidance (coordinates presumably in feet):

    * ``"heart"``  - central part of the zone
    * ``"shadow"`` - inside the zone but near its edge
    * ``"chase"``  - just outside the zone
    * ``"waste"``  - clearly outside the zone

    Returns ``"unknown"`` when either coordinate is missing, NaN, or cannot
    be converted to a float.
    """
    try:
        x = float(plate_x)
        z = float(plate_z)
    except (TypeError, ValueError, OverflowError):
        return "unknown"

    # NaN fails every comparison below and would otherwise land in "waste".
    if math.isnan(x) or math.isnan(z):
        return "unknown"

    if -0.83 <= x <= 0.83 and 1.50 <= z <= 3.50:
        if -0.45 <= x <= 0.45 and 1.90 <= z <= 3.10:
            return "heart"
        return "shadow"

    if -1.20 <= x <= 1.20 and 1.10 <= z <= 3.90:
        return "chase"

    return "waste"


def _empty_batter_zone_row(player_name: str) -> dict[str, Any]:
    """Return a feature row with every probability ``None`` and every count 0."""
    out: dict[str, Any] = {
        "player_name": player_name,
        "zone_sample_size": 0,
    }
    for family in _PITCH_FAMILIES:
        for zone in _ZONE_BUCKETS:
            for outcome in _OUTCOME_KEYS:
                out[f"{outcome}_prob_{family}_{zone}"] = None
            out[f"sample_size_{family}_{zone}"] = 0
    return out


def _numeric_column(df: pd.DataFrame, column: str) -> pd.Series:
    """Return *column* coerced to numbers, or an all-NaN Series if absent."""
    if column in df.columns:
        return pd.to_numeric(df[column], errors="coerce")
    return pd.Series(float("nan"), index=df.index, dtype="float64")


def _lowercase_text_column(df: pd.DataFrame, column: str) -> pd.Series:
    """Return *column* as lower-cased strings (missing values become text)."""
    if column in df.columns:
        values = df[column]
    else:
        values = pd.Series(index=df.index, dtype="object")
    return values.astype(str).str.lower()


def build_batter_zone_feature_row(statcast_df: pd.DataFrame, player_name: str) -> dict[str, Any]:
    """Build per-family, per-zone outcome rates for one player.

    Rows of *statcast_df* whose ``player_name`` equals *player_name* are
    bucketed by pitch family (from ``pitch_name``, falling back to
    ``pitch_type``) and by location (from ``plate_x`` / ``plate_z``).  For
    each family/zone cell that has at least one pitch the row records:

    * ``sample_size_<family>_<zone>`` - number of pitches in the cell
    * ``hit_prob_*``    - share of pitches whose event is a hit
    * ``hr_prob_*``     - share whose event is a home run
    * ``tb2p_prob_*``   - share whose event is a double, triple or home run
    * ``whiff_prob_*``  - share described as a swinging strike
    * ``damage_prob_*`` - share with launch speed >= 95, estimated wOBA
      >= 0.500, or a home run

    All rates use every pitch in the cell as the denominator.  Cells with no
    pitches keep ``None`` probabilities and a sample size of 0.
    ``zone_sample_size`` counts all of the player's pitches, including those
    with an unknown family or location.

    An all-empty row is returned when the frame is empty, the player has no
    rows, or the ``player_name`` / ``plate_x`` / ``plate_z`` columns are
    missing.
    """
    if statcast_df.empty or "player_name" not in statcast_df.columns:
        return _empty_batter_zone_row(player_name)

    is_player = statcast_df["player_name"].astype(str) == str(player_name)
    # Fresh positional index: the incoming frame may carry duplicate labels
    # (e.g. after concatenation), which would corrupt label-based selection.
    df = statcast_df[is_player].reset_index(drop=True)
    if df.empty:
        return _empty_batter_zone_row(player_name)

    # Need pitch location for zone modeling.
    if "plate_x" not in df.columns or "plate_z" not in df.columns:
        return _empty_batter_zone_row(player_name)

    if "pitch_name" in df.columns:
        pitch_labels = df["pitch_name"]
    elif "pitch_type" in df.columns:
        pitch_labels = df["pitch_type"]
    else:
        pitch_labels = None

    if pitch_labels is None:
        df["pitch_family"] = "unknown"
    else:
        df["pitch_family"] = [_normalize_pitch_family(label) for label in pitch_labels]

    df["zone_bucket"] = [
        classify_zone_bucket(x, z) for x, z in zip(df["plate_x"], df["plate_z"])
    ]

    events = _lowercase_text_column(df, "events")
    descriptions = _lowercase_text_column(df, "description")
    launch_speed = _numeric_column(df, "launch_speed")
    estimated_woba = _numeric_column(df, "estimated_woba_using_speedangle")

    # Rough hit / total-base / home-run / whiff proxies.
    hr_mask = events.eq("home_run")
    outcome_masks = {
        "hit": events.isin({"single", "double", "triple", "home_run"}),
        "hr": hr_mask,
        "tb2p": events.isin({"double", "triple", "home_run"}),
        "whiff": descriptions.isin({"swinging_strike", "swinging_strike_blocked"}),
        # Damage proxy: quality contact, strong xwOBA, or a home run.
        "damage": (
            (launch_speed >= _DAMAGE_LAUNCH_SPEED)
            | (estimated_woba >= _DAMAGE_ESTIMATED_WOBA)
            | hr_mask
        ),
    }

    out = _empty_batter_zone_row(player_name)
    out["zone_sample_size"] = int(len(df))

    for family in _PITCH_FAMILIES:
        in_family = df["pitch_family"] == family
        for zone in _ZONE_BUCKETS:
            in_cell = in_family & (df["zone_bucket"] == zone)
            sample_size = int(in_cell.sum())
            if sample_size == 0:
                continue

            out[f"sample_size_{family}_{zone}"] = sample_size
            for outcome, mask in outcome_masks.items():
                out[f"{outcome}_prob_{family}_{zone}"] = float(mask[in_cell].mean())

    return out