Spaces:
Sleeping
Sleeping
Create batter_zone_model.py
Browse files- models/batter_zone_model.py +175 -0
models/batter_zone_model.py
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import Any
|
| 4 |
+
|
| 5 |
+
import pandas as pd
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
# Lookup from a lower-cased pitch label to a coarse pitch family
# ("fastball", "breaking" or "offspeed"). Labels that are not listed are
# treated as "unknown" by _normalize_pitch_family.
#
# Both descriptive names and short pitch-type codes are listed because
# build_batter_zone_feature_row falls back to the "pitch_type" column when
# "pitch_name" is absent; with names only, every pitch on that fallback path
# would end up in the "unknown" family.
# NOTE(review): the short codes follow the Statcast pitch_type convention
# (FF, SI, FC, ...) — confirm against the actual data source.
PITCH_FAMILY_MAP = {
    # Fastballs
    "4-seam fastball": "fastball",
    "four-seam fastball": "fastball",
    "fastball": "fastball",
    "sinker": "fastball",
    "cutter": "fastball",
    "ff": "fastball",
    "si": "fastball",
    "fc": "fastball",
    # Breaking balls
    "slider": "breaking",
    "sweeper": "breaking",
    "curveball": "breaking",
    "knuckle curve": "breaking",
    "slurve": "breaking",
    "sl": "breaking",
    "st": "breaking",
    "cu": "breaking",
    "kc": "breaking",
    "sv": "breaking",
    # Offspeed
    "changeup": "offspeed",
    "splitter": "offspeed",
    "forkball": "offspeed",
    "split-finger": "offspeed",
    "circle change": "offspeed",
    "ch": "offspeed",
    "fs": "offspeed",
    "fo": "offspeed",
}
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def _safe_mean(series: pd.Series) -> float | None:
|
| 30 |
+
numeric = pd.to_numeric(series, errors="coerce").dropna()
|
| 31 |
+
if numeric.empty:
|
| 32 |
+
return None
|
| 33 |
+
return float(numeric.mean())
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def _safe_rate(series: pd.Series) -> float | None:
|
| 37 |
+
numeric = pd.to_numeric(series, errors="coerce").dropna()
|
| 38 |
+
if numeric.empty:
|
| 39 |
+
return None
|
| 40 |
+
return float(numeric.mean())
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def _normalize_pitch_family(pitch_name: Any) -> str:
    """Map a raw pitch label to its pitch family.

    The label is stringified, stripped and lower-cased, then looked up in
    ``PITCH_FAMILY_MAP``.

    Args:
        pitch_name: Raw label of any type; missing markers such as ``None``,
            NaN and ``pd.NA`` are accepted.

    Returns:
        The family stored in ``PITCH_FAMILY_MAP`` for the label, or
        ``"unknown"`` when the label is missing, blank or not in the map.
    """
    # Stringify directly rather than via ``pitch_name or ""``: truth-testing
    # pandas' ``pd.NA`` raises TypeError, which made the whole feature build
    # crash on nullable/string-dtype columns with missing labels.
    text = str(pitch_name).strip().lower()
    # Textual forms of the missing markers: "" (blank), NaN, None and pd.NA.
    if text in {"", "nan", "none", "<na>"}:
        return "unknown"
    return PITCH_FAMILY_MAP.get(text, "unknown")
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def classify_zone_bucket(plate_x: Any, plate_z: Any) -> str:
    """Classify a pitch location into a coarse strike-zone bucket.

    Buckets are nested rectangles, checked from the inside out:

    * ``"heart"``  - central part of the zone
    * ``"shadow"`` - inside the zone but outside the heart (zone edges)
    * ``"chase"``  - just outside the zone
    * ``"waste"``  - everything further out

    Args:
        plate_x: Horizontal pitch location; anything ``float()`` accepts.
        plate_z: Vertical pitch location; anything ``float()`` accepts.
            NOTE(review): the bounds below look like Statcast plate
            coordinates in feet — confirm against the data source.

    Returns:
        One of ``"heart"``, ``"shadow"``, ``"chase"``, ``"waste"``, or
        ``"unknown"`` when either coordinate is missing or not numeric.
    """
    try:
        x = float(plate_x)
        z = float(plate_z)
    except (TypeError, ValueError, OverflowError):
        return "unknown"

    # float() accepts NaN, and NaN fails every bounds comparison below, so
    # without this guard a pitch with no recorded location would be counted
    # as "waste" instead of "unknown".
    if pd.isna(x) or pd.isna(z):
        return "unknown"

    # Approximate rule-book strike zone.
    zone_left, zone_right = -0.83, 0.83
    zone_bottom, zone_top = 1.50, 3.50

    if zone_left <= x <= zone_right and zone_bottom <= z <= zone_top:
        # Heart = central region of the zone; the rest of the zone is shadow.
        inner_left, inner_right = -0.45, 0.45
        inner_bottom, inner_top = 1.90, 3.10
        if inner_left <= x <= inner_right and inner_bottom <= z <= inner_top:
            return "heart"
        return "shadow"

    # Chase = band just outside the zone; anything beyond it is waste.
    chase_left, chase_right = -1.20, 1.20
    chase_bottom, chase_top = 1.10, 3.90
    if chase_left <= x <= chase_right and chase_bottom <= z <= chase_top:
        return "chase"

    return "waste"
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def _empty_batter_zone_row(player_name: str) -> dict[str, Any]:
    """Build the placeholder feature row for *player_name*.

    The row carries ``player_name``, a ``zone_sample_size`` of 0 and, for
    every pitch-family / zone combination, five probability fields set to
    ``None`` plus a ``sample_size_*`` field set to 0.
    """
    probability_prefixes = ("hr_prob", "hit_prob", "tb2p_prob", "whiff_prob", "damage_prob")

    row: dict[str, Any] = {"player_name": player_name, "zone_sample_size": 0}
    for family in ("fastball", "breaking", "offspeed"):
        for zone in ("heart", "shadow", "chase", "waste"):
            cell = f"{family}_{zone}"
            # No data yet: probabilities are undefined, the count is zero.
            row.update({f"{prefix}_{cell}": None for prefix in probability_prefixes})
            row[f"sample_size_{cell}"] = 0
    return row
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def build_batter_zone_feature_row(statcast_df: pd.DataFrame, player_name: str) -> dict[str, Any]:
    """Build one feature row of per-family, per-zone outcome rates for a batter.

    Rows of *statcast_df* whose ``player_name`` equals *player_name* (compared
    as strings) are bucketed by pitch family (``_normalize_pitch_family``) and
    zone (``classify_zone_bucket``). For every non-empty family/zone cell the
    row count and five rates are stored; each rate is the share of ALL rows in
    the cell for which the outcome flag is true (each row is presumably one
    pitch — confirm against the caller).

    Args:
        statcast_df: Input frame. ``player_name``, ``plate_x`` and ``plate_z``
            are required; ``pitch_name`` (or ``pitch_type`` as fallback),
            ``events``, ``description``, ``launch_speed`` and
            ``estimated_woba_using_speedangle`` are used when present.
        player_name: Value to match in the ``player_name`` column.

    Returns:
        A dict shaped like ``_empty_batter_zone_row(player_name)``. It is
        returned unfilled when the frame is empty, a required column is
        missing, or no row matches. Cells with no rows keep ``None`` rates
        and a sample size of 0.
    """
    if statcast_df.empty or "player_name" not in statcast_df.columns:
        return _empty_batter_zone_row(player_name)

    is_player = statcast_df["player_name"].astype(str) == str(player_name)
    # Give the working frame a fresh, unique index. Frames assembled with
    # pd.concat often carry duplicate labels, and label-based selection on
    # such an index pulls in rows that do not belong to the cell.
    df = statcast_df.loc[is_player].reset_index(drop=True)
    if df.empty:
        return _empty_batter_zone_row(player_name)

    # Need pitch location + pitch type for zone modeling
    if "plate_x" not in df.columns or "plate_z" not in df.columns:
        return _empty_batter_zone_row(player_name)

    if "pitch_name" in df.columns:
        pitch_labels = df["pitch_name"]
    elif "pitch_type" in df.columns:
        pitch_labels = df["pitch_type"]
    else:
        pitch_labels = pd.Series(["unknown"] * len(df), index=df.index)

    # Iterate the two coordinate columns directly; a row-wise DataFrame.apply
    # builds a Series per row just to read two values.
    zone_bucket = pd.Series(
        [classify_zone_bucket(x, z) for x, z in zip(df["plate_x"], df["plate_z"])],
        index=df.index,
        dtype="object",
    )
    pitch_family = pitch_labels.apply(_normalize_pitch_family)

    # Optional columns default to all-missing Series so every mask below is
    # a Series aligned to df.index, never a scalar.
    missing_text = pd.Series(index=df.index, dtype="object")
    missing_number = pd.Series(float("nan"), index=df.index, dtype="float64")

    launch_speed = pd.to_numeric(df.get("launch_speed", missing_number), errors="coerce")
    estimated_woba = pd.to_numeric(
        df.get("estimated_woba_using_speedangle", missing_number), errors="coerce"
    )
    events = df.get("events", missing_text).astype(str).str.lower()
    descriptions = df.get("description", missing_text).astype(str).str.lower()

    # rough hit / tb / hr / whiff proxies
    hr_mask = events.eq("home_run")
    outcome_masks = {
        "hit_prob": events.isin({"single", "double", "triple", "home_run"}),
        "hr_prob": hr_mask,
        "tb2p_prob": events.isin({"double", "triple", "home_run"}),
        "whiff_prob": descriptions.isin({"swinging_strike", "swinging_strike_blocked"}),
        # damage proxy: either quality contact or strong xwOBA
        "damage_prob": (launch_speed >= 95) | (estimated_woba >= 0.500) | hr_mask,
    }

    out = _empty_batter_zone_row(player_name)
    out["zone_sample_size"] = int(len(df))

    for family in ("fastball", "breaking", "offspeed"):
        in_family = pitch_family == family
        for zone in ("heart", "shadow", "chase", "waste"):
            in_cell = in_family & (zone_bucket == zone)
            sample_size = int(in_cell.sum())
            if sample_size == 0:
                continue

            out[f"sample_size_{family}_{zone}"] = sample_size
            for prefix, mask in outcome_masks.items():
                out[f"{prefix}_{family}_{zone}"] = float(mask[in_cell].mean())

    return out
|