Spaces:
Running
Running
| from __future__ import annotations | |
| from typing import Any | |
| import pandas as pd | |
# Lookup from a lower-cased pitch name to its coarse pitch family
# ("fastball", "breaking" or "offspeed"). The table is declared grouped by
# family and flattened into a name -> family dict.
PITCH_FAMILY_MAP = {
    pitch_name: family
    for family, pitch_names in (
        (
            "fastball",
            (
                "4-seam fastball",
                "four-seam fastball",
                "fastball",
                "sinker",
                "cutter",
            ),
        ),
        (
            "breaking",
            (
                "slider",
                "sweeper",
                "curveball",
                "knuckle curve",
                "slurve",
            ),
        ),
        (
            "offspeed",
            (
                "changeup",
                "splitter",
                "forkball",
                "split-finger",
                "circle change",
            ),
        ),
    )
    for pitch_name in pitch_names
}
| def _safe_mean(series: pd.Series) -> float | None: | |
| numeric = pd.to_numeric(series, errors="coerce").dropna() | |
| if numeric.empty: | |
| return None | |
| return float(numeric.mean()) | |
| def _safe_rate(series: pd.Series) -> float | None: | |
| numeric = pd.to_numeric(series, errors="coerce").dropna() | |
| if numeric.empty: | |
| return None | |
| return float(numeric.mean()) | |
| def _normalize_pitch_family(pitch_name: Any) -> str: | |
| text = str(pitch_name or "").strip().lower() | |
| if text in {"", "nan", "none"}: | |
| return "unknown" | |
| return PITCH_FAMILY_MAP.get(text, "unknown") | |
def classify_zone_bucket(plate_x: Any, plate_z: Any) -> str:
    """Classify a pitch location into a coarse attack-zone bucket.

    The buckets are nested rectangles around an approximate strike zone
    (all bounds inclusive):

    * ``"heart"``  - central zone: |x| <= 0.45 and 1.90 <= z <= 3.10
    * ``"shadow"`` - rest of the zone: |x| <= 0.83 and 1.50 <= z <= 3.50
    * ``"chase"``  - just outside: |x| <= 1.20 and 1.10 <= z <= 3.90
    * ``"waste"``  - everything further out

    Args:
        plate_x: Horizontal location; anything ``float()`` accepts.
        plate_z: Vertical location; anything ``float()`` accepts.
            NOTE(review): the bounds look like feet, matching Statcast
            ``plate_x``/``plate_z`` - confirm the units with the data source.

    Returns:
        One of ``"heart"``, ``"shadow"``, ``"chase"``, ``"waste"``, or
        ``"unknown"`` when either coordinate is missing, unparseable or not
        a finite number.
    """
    import math  # function-scope import keeps this block self-contained

    try:
        x = float(plate_x)
        z = float(plate_z)
    except (TypeError, ValueError, OverflowError):
        return "unknown"

    # NaN compares False against every bound, so without this guard a missing
    # location would fall through all checks and be reported as "waste".
    if not (math.isfinite(x) and math.isfinite(z)):
        return "unknown"

    # Approximate strike zone.
    zone_left, zone_right = -0.83, 0.83
    zone_bottom, zone_top = 1.50, 3.50
    if zone_left <= x <= zone_right and zone_bottom <= z <= zone_top:
        # Inner box of the zone is the "heart"; the remaining rim is "shadow".
        inner_left, inner_right = -0.45, 0.45
        inner_bottom, inner_top = 1.90, 3.10
        if inner_left <= x <= inner_right and inner_bottom <= z <= inner_top:
            return "heart"
        return "shadow"

    # Band just outside the zone.
    chase_left, chase_right = -1.20, 1.20
    chase_bottom, chase_top = 1.10, 3.90
    if chase_left <= x <= chase_right and chase_bottom <= z <= chase_top:
        return "chase"

    return "waste"
| def _empty_batter_zone_row(player_name: str) -> dict[str, Any]: | |
| out: dict[str, Any] = { | |
| "player_name": player_name, | |
| "zone_sample_size": 0, | |
| } | |
| for family in ["fastball", "breaking", "offspeed"]: | |
| for zone in ["heart", "shadow", "chase", "waste"]: | |
| out[f"hr_prob_{family}_{zone}"] = None | |
| out[f"hit_prob_{family}_{zone}"] = None | |
| out[f"tb2p_prob_{family}_{zone}"] = None | |
| out[f"whiff_prob_{family}_{zone}"] = None | |
| out[f"damage_prob_{family}_{zone}"] = None | |
| out[f"sample_size_{family}_{zone}"] = 0 | |
| return out | |
def build_batter_zone_feature_row(statcast_df: pd.DataFrame, player_name: str) -> dict[str, Any]:
    """Aggregate per-pitch rows into pitch-family x zone outcome rates.

    Rows whose ``player_name`` equals *player_name* (compared as strings) are
    bucketed by pitch family (``_normalize_pitch_family``) and location zone
    (``classify_zone_bucket``). For every non-empty family/zone cell the
    result stores the pitch count and the share of pitches in that cell that
    were a hit, a home run, an extra-base hit, a swinging strike, or
    "damage" (launch speed >= 95, estimated wOBA >= 0.500, or a home run).

    Args:
        statcast_df: Pitch-level frame. ``player_name``, ``plate_x`` and
            ``plate_z`` are required. ``pitch_name`` (or, failing that,
            ``pitch_type``), ``events``, ``description``, ``launch_speed``
            and ``estimated_woba_using_speedangle`` are used when present.
        player_name: Player whose rows are aggregated.

    Returns:
        A dict shaped like ``_empty_batter_zone_row(player_name)``. Cells
        without pitches keep their defaults (None rates, 0 sample size), and
        the fully empty row is returned when the frame is empty, a required
        column is missing, or no row matches the player.
    """
    required_columns = ("player_name", "plate_x", "plate_z")
    if statcast_df.empty or any(col not in statcast_df.columns for col in required_columns):
        return _empty_batter_zone_row(player_name)

    is_player = statcast_df["player_name"].astype(str) == str(player_name)
    # Re-index positionally: every mask below is combined by boolean
    # selection, which must line up row-for-row. Looking rows up by the
    # caller's index labels would select extra rows whenever labels repeat
    # (e.g. frames concatenated without resetting the index) and skew the rates.
    df = statcast_df.loc[is_player.to_numpy()].reset_index(drop=True)
    if df.empty:
        return _empty_batter_zone_row(player_name)

    # Prefer the descriptive pitch name; fall back to the pitch type column.
    if "pitch_name" in df.columns:
        pitch_labels = df["pitch_name"]
    elif "pitch_type" in df.columns:
        pitch_labels = df["pitch_type"]
    else:
        pitch_labels = pd.Series("unknown", index=df.index)
    pitch_family = pitch_labels.apply(_normalize_pitch_family)

    zone_bucket = pd.Series(
        [classify_zone_bucket(x, z) for x, z in zip(df["plate_x"], df["plate_z"])],
        index=df.index,
        dtype="object",
    )

    # Optional columns: a missing one behaves like an all-missing column.
    missing_numeric = pd.Series(float("nan"), index=df.index, dtype="float64")
    missing_text = pd.Series(index=df.index, dtype="object")
    launch_speed = pd.to_numeric(df.get("launch_speed", missing_numeric), errors="coerce")
    estimated_woba = pd.to_numeric(
        df.get("estimated_woba_using_speedangle", missing_numeric), errors="coerce"
    )
    events = df.get("events", missing_text).astype(str).str.lower()
    descriptions = df.get("description", missing_text).astype(str).str.lower()

    # Rough hit / total-base / home-run / whiff proxies, one flag per pitch.
    hr_mask = events.eq("home_run")
    rate_masks = {
        "hit_prob": events.isin({"single", "double", "triple", "home_run"}),
        "hr_prob": hr_mask,
        "tb2p_prob": events.isin({"double", "triple", "home_run"}),
        "whiff_prob": descriptions.isin({"swinging_strike", "swinging_strike_blocked"}),
        # Damage proxy: hard contact, strong expected wOBA, or a home run.
        "damage_prob": (launch_speed >= 95) | (estimated_woba >= 0.500) | hr_mask,
    }

    out = _empty_batter_zone_row(player_name)
    out["zone_sample_size"] = int(len(df))

    for family in ("fastball", "breaking", "offspeed"):
        in_family = pitch_family == family
        for zone in ("heart", "shadow", "chase", "waste"):
            in_cell = in_family & (zone_bucket == zone)
            sample_size = int(in_cell.sum())
            if sample_size == 0:
                continue
            out[f"sample_size_{family}_{zone}"] = sample_size
            for prefix, mask in rate_masks.items():
                out[f"{prefix}_{family}_{zone}"] = float(mask[in_cell].mean())

    return out