2026_MLB_Model / batter_zone_model.py
Syntrex's picture
Create batter_zone_model.py
1d55b3f verified
raw
history blame
5.66 kB
from __future__ import annotations

import math
from typing import Any

import pandas as pd
# Lookup from a pitch label to one of three coarse pitch families
# ("fastball", "breaking", "offspeed").
#
# Keys must be lower-case with no surrounding whitespace, because
# _normalize_pitch_family() strips and lower-cases a label before looking it
# up here. Labels that are not listed resolve to "unknown" at lookup time.
PITCH_FAMILY_MAP = {
    # --- Descriptive pitch names -------------------------------------------
    "4-seam fastball": "fastball",
    "four-seam fastball": "fastball",
    "fastball": "fastball",
    "sinker": "fastball",
    "cutter": "fastball",
    "slider": "breaking",
    "sweeper": "breaking",
    "curveball": "breaking",
    "knuckle curve": "breaking",
    "slurve": "breaking",
    "changeup": "offspeed",
    "splitter": "offspeed",
    "forkball": "offspeed",
    "split-finger": "offspeed",
    "circle change": "offspeed",
    # --- Two-letter abbreviations ------------------------------------------
    # build_batter_zone_feature_row() falls back to the ``pitch_type`` column
    # when ``pitch_name`` is absent. These are the Statcast-style
    # abbreviations for the pitches named above (lower-cased); without them
    # every pitch from that fallback would be classified as "unknown".
    # NOTE(review): codes follow Baseball Savant's CSV docs -- confirm they
    # match the feed this project actually ingests.
    "ff": "fastball",  # four-seam fastball
    "si": "fastball",  # sinker
    "fc": "fastball",  # cutter
    "sl": "breaking",  # slider
    "st": "breaking",  # sweeper
    "cu": "breaking",  # curveball
    "kc": "breaking",  # knuckle curve
    "sv": "breaking",  # slurve
    "ch": "offspeed",  # changeup
    "fs": "offspeed",  # split-finger / splitter
    "fo": "offspeed",  # forkball
}
def _safe_mean(series: pd.Series) -> float | None:
numeric = pd.to_numeric(series, errors="coerce").dropna()
if numeric.empty:
return None
return float(numeric.mean())
def _safe_rate(series: pd.Series) -> float | None:
numeric = pd.to_numeric(series, errors="coerce").dropna()
if numeric.empty:
return None
return float(numeric.mean())
def _normalize_pitch_family(pitch_name: Any) -> str:
    """Map a raw pitch label to its pitch family.

    Args:
        pitch_name: Any value taken from a pitch-label column. It is
            converted with ``str()``, stripped and lower-cased before lookup.

    Returns:
        The family stored in ``PITCH_FAMILY_MAP`` for the normalized label,
        or ``"unknown"`` when the value is missing or not in the map.
    """
    if pitch_name is None:
        return "unknown"

    # Deliberately no truthiness test (``pitch_name or ""``) here: evaluating
    # ``bool(pd.NA)`` raises TypeError, which would abort Series.apply() on
    # columns that use pandas' nullable dtypes.
    label = str(pitch_name).strip().lower()

    # String forms of the usual missing-value markers (NaN, None, pd.NA, NaT).
    if label in {"", "nan", "none", "<na>", "nat"}:
        return "unknown"
    return PITCH_FAMILY_MAP.get(label, "unknown")
def classify_zone_bucket(plate_x: Any, plate_z: Any) -> str:
    """Classify a pitch location into a coarse attack-zone bucket.

    The buckets are nested rectangles, all symmetric about ``x == 0`` and
    with inclusive edges:

    * ``"heart"``  -- central part of the strike zone
    * ``"shadow"`` -- inside the strike zone but outside the heart
    * ``"chase"``  -- just outside the strike zone
    * ``"waste"``  -- anything further out

    The rectangle sizes are approximate strike-zone guidance, not a
    per-batter zone.
    NOTE(review): coordinates are presumably in feet, as in Statcast's
    ``plate_x`` / ``plate_z`` -- confirm against the data source.

    Args:
        plate_x: Horizontal location; anything ``float()`` accepts.
        plate_z: Vertical location; anything ``float()`` accepts.

    Returns:
        One of ``"heart"``, ``"shadow"``, ``"chase"``, ``"waste"``, or
        ``"unknown"`` when either coordinate is missing, non-numeric, NaN or
        infinite.
    """
    try:
        x = float(plate_x)
        z = float(plate_z)
    except (TypeError, ValueError, OverflowError):
        return "unknown"

    # float("nan") converts without error and every comparison on it is
    # False, so without this guard a missing location would fall through all
    # the range checks and be reported as "waste".
    if not (math.isfinite(x) and math.isfinite(z)):
        return "unknown"

    def inside(half_width: float, bottom: float, top: float) -> bool:
        """Return True if (x, z) lies in the given rectangle, edges included."""
        return -half_width <= x <= half_width and bottom <= z <= top

    if inside(0.83, 1.50, 3.50):  # strike zone
        return "heart" if inside(0.45, 1.90, 3.10) else "shadow"
    if inside(1.20, 1.10, 3.90):  # band just outside the strike zone
        return "chase"
    return "waste"
def _empty_batter_zone_row(player_name: str) -> dict[str, Any]:
out: dict[str, Any] = {
"player_name": player_name,
"zone_sample_size": 0,
}
for family in ["fastball", "breaking", "offspeed"]:
for zone in ["heart", "shadow", "chase", "waste"]:
out[f"hr_prob_{family}_{zone}"] = None
out[f"hit_prob_{family}_{zone}"] = None
out[f"tb2p_prob_{family}_{zone}"] = None
out[f"whiff_prob_{family}_{zone}"] = None
out[f"damage_prob_{family}_{zone}"] = None
out[f"sample_size_{family}_{zone}"] = 0
return out
def _numeric_column(df: pd.DataFrame, column: str) -> pd.Series:
    """Return ``df[column]`` coerced to numeric, or an all-NaN Series if it is absent."""
    if column not in df.columns:
        return pd.Series(float("nan"), index=df.index, dtype="float64")
    return pd.to_numeric(df[column], errors="coerce")


def _lowercase_text_column(df: pd.DataFrame, column: str) -> pd.Series:
    """Return ``df[column]`` as lower-cased strings (missing values become "nan")."""
    if column in df.columns:
        values = df[column]
    else:
        values = pd.Series(index=df.index, dtype="object")
    return values.astype(str).str.lower()


def build_batter_zone_feature_row(statcast_df: pd.DataFrame, player_name: str) -> dict[str, Any]:
    """Aggregate one batter's pitches into per-(pitch family, zone) outcome rates.

    Rows whose ``player_name`` equals *player_name* (compared as strings) are
    bucketed by pitch family (``_normalize_pitch_family``) and location
    (``classify_zone_bucket``). For every non-empty bucket the result holds
    the pitch count and the share of pitches in that bucket that were:

    * ``hit_prob``    -- ``events`` is single, double, triple or home_run
    * ``hr_prob``     -- ``events`` is home_run
    * ``tb2p_prob``   -- ``events`` is double, triple or home_run
    * ``whiff_prob``  -- ``description`` is swinging_strike or
      swinging_strike_blocked
    * ``damage_prob`` -- ``launch_speed`` >= 95, or
      ``estimated_woba_using_speedangle`` >= 0.500, or a home run

    All rates are rough proxies: the denominator is every pitch in the
    bucket, not swings, balls in play or plate appearances.

    Args:
        statcast_df: Pitch-level data. Requires ``player_name``, ``plate_x``
            and ``plate_z``. The pitch label is read from ``pitch_name`` or,
            failing that, ``pitch_type``. ``events``, ``description``,
            ``launch_speed`` and ``estimated_woba_using_speedangle`` are
            optional; an absent column simply never triggers its outcome.
        player_name: Batter to select.

    Returns:
        A dict shaped like ``_empty_batter_zone_row(player_name)``.
        ``zone_sample_size`` counts all of the player's rows, including those
        whose family or zone is "unknown" (these belong to no bucket).
        Buckets without pitches keep None rates and a sample size of 0. The
        untouched empty row is returned when the frame is empty, a required
        column is missing, or the player has no rows.
    """
    required_columns = {"player_name", "plate_x", "plate_z"}
    if statcast_df.empty or not required_columns.issubset(statcast_df.columns):
        return _empty_batter_zone_row(player_name)

    is_player = statcast_df["player_name"].astype(str) == str(player_name)
    # Reset to a fresh unique index: frames concatenated from several
    # downloads often repeat index labels, and any label-based selection on
    # such an index would return extra rows and corrupt the rates.
    df = statcast_df.loc[is_player].reset_index(drop=True)
    if df.empty:
        return _empty_batter_zone_row(player_name)

    if "pitch_name" in df.columns:
        raw_pitch_labels = df["pitch_name"]
    elif "pitch_type" in df.columns:
        raw_pitch_labels = df["pitch_type"]
    else:
        raw_pitch_labels = pd.Series("unknown", index=df.index, dtype="object")
    pitch_family = raw_pitch_labels.apply(_normalize_pitch_family)

    # Walk the two location columns directly rather than DataFrame.apply
    # (axis=1), which builds a Series object for every row.
    zone_bucket = pd.Series(
        [classify_zone_bucket(x, z) for x, z in zip(df["plate_x"], df["plate_z"])],
        index=df.index,
        dtype="object",
    )

    events = _lowercase_text_column(df, "events")
    descriptions = _lowercase_text_column(df, "description")
    launch_speed = _numeric_column(df, "launch_speed")
    estimated_woba = _numeric_column(df, "estimated_woba_using_speedangle")

    hr_mask = events.eq("home_run")
    # Keyed by output-field prefix; each value is a per-pitch boolean Series.
    outcome_masks = {
        "hit_prob": events.isin({"single", "double", "triple", "home_run"}),
        "hr_prob": hr_mask,
        "tb2p_prob": events.isin({"double", "triple", "home_run"}),
        "whiff_prob": descriptions.isin({"swinging_strike", "swinging_strike_blocked"}),
        # Damage proxy: hard contact or strong xwOBA. NaN compares False, so
        # pitches without a batted-ball reading only count via the HR flag.
        "damage_prob": (launch_speed >= 95) | (estimated_woba >= 0.500) | hr_mask,
    }

    out = _empty_batter_zone_row(player_name)
    out["zone_sample_size"] = len(df)

    for family in ("fastball", "breaking", "offspeed"):
        in_family = pitch_family == family
        for zone in ("heart", "shadow", "chase", "waste"):
            in_cell = in_family & (zone_bucket == zone)
            sample_size = int(in_cell.sum())
            if sample_size == 0:
                continue

            out[f"sample_size_{family}_{zone}"] = sample_size
            for prefix, mask in outcome_masks.items():
                # Boolean selection (not label lookup) keeps exactly the rows
                # of this bucket.
                out[f"{prefix}_{family}_{zone}"] = float(mask[in_cell].mean())

    return out