Syntrex committed on
Commit
4c2f97d
·
verified ·
1 Parent(s): 65a9049

Create batter_zone_model.py

Browse files
Files changed (1) hide show
  1. models/batter_zone_model.py +175 -0
models/batter_zone_model.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ import pandas as pd
6
+
7
+
8
# Maps a lower-cased pitch label to one of three coarse pitch families.
#
# Keys cover both descriptive pitch names (the kind of value found in a
# ``pitch_name`` column) and the short Statcast ``pitch_type`` codes.  The codes
# matter because ``build_batter_zone_feature_row`` falls back to the
# ``pitch_type`` column when ``pitch_name`` is absent; without them that
# fallback would label every pitch "unknown".
#
# All keys must be lower-case: lookups are done on ``str(value).strip().lower()``.
PITCH_FAMILY_MAP = {
    # --- fastballs: descriptive names ---
    "4-seam fastball": "fastball",
    "four-seam fastball": "fastball",
    "fastball": "fastball",
    "sinker": "fastball",
    "cutter": "fastball",
    # --- fastballs: pitch_type codes ---
    "ff": "fastball",  # 4-seam fastball
    "si": "fastball",  # sinker
    "fc": "fastball",  # cutter

    # --- breaking balls: descriptive names ---
    "slider": "breaking",
    "sweeper": "breaking",
    "curveball": "breaking",
    "knuckle curve": "breaking",
    "slurve": "breaking",
    # --- breaking balls: pitch_type codes ---
    "sl": "breaking",  # slider
    "st": "breaking",  # sweeper
    "cu": "breaking",  # curveball
    "kc": "breaking",  # knuckle curve
    "sv": "breaking",  # slurve

    # --- offspeed: descriptive names ---
    "changeup": "offspeed",
    "splitter": "offspeed",
    "forkball": "offspeed",
    "split-finger": "offspeed",
    "circle change": "offspeed",
    # --- offspeed: pitch_type codes ---
    "ch": "offspeed",  # changeup
    "fs": "offspeed",  # splitter / split-finger
    "fo": "offspeed",  # forkball
}
27
+
28
+
29
+ def _safe_mean(series: pd.Series) -> float | None:
30
+ numeric = pd.to_numeric(series, errors="coerce").dropna()
31
+ if numeric.empty:
32
+ return None
33
+ return float(numeric.mean())
34
+
35
+
36
+ def _safe_rate(series: pd.Series) -> float | None:
37
+ numeric = pd.to_numeric(series, errors="coerce").dropna()
38
+ if numeric.empty:
39
+ return None
40
+ return float(numeric.mean())
41
+
42
+
43
def _normalize_pitch_family(pitch_name: Any) -> str:
    """Map a raw pitch label to its pitch family via ``PITCH_FAMILY_MAP``.

    The label is stringified, stripped and lower-cased before lookup.

    Args:
        pitch_name: Raw pitch label; may be any object, including ``None``,
            NaN or ``pd.NA``.

    Returns:
        The family stored in ``PITCH_FAMILY_MAP`` for the normalized label, or
        ``"unknown"`` when the label is missing or not present in the map.
    """
    # Deliberately no ``pitch_name or ""``: truth-testing ``pd.NA`` raises
    # TypeError.  Stringifying directly is equivalent for every other input,
    # because None/NaN/NA render as the sentinel strings checked below and any
    # other falsy value (0, False, ...) is simply not a key of the map.
    text = str(pitch_name).strip().lower()
    if text in {"", "nan", "none", "<na>"}:
        return "unknown"
    return PITCH_FAMILY_MAP.get(text, "unknown")
48
+
49
+
50
def classify_zone_bucket(plate_x: Any, plate_z: Any) -> str:
    """Classify a pitch location into a coarse zone bucket.

    The buckets are nested rectangles, checked from the inside out (all bounds
    inclusive, same units as the inputs):

    * ``"heart"``  - central zone: ``|x| <= 0.45`` and ``1.90 <= z <= 3.10``
    * ``"shadow"`` - rest of the approximate strike zone:
      ``|x| <= 0.83`` and ``1.50 <= z <= 3.50``
    * ``"chase"``  - just outside the zone: ``|x| <= 1.20`` and
      ``1.10 <= z <= 3.90``
    * ``"waste"``  - anything further out

    Args:
        plate_x: Horizontal pitch location; anything ``float()`` accepts.
        plate_z: Vertical pitch location; anything ``float()`` accepts.

    Returns:
        One of ``"heart"``, ``"shadow"``, ``"chase"``, ``"waste"``, or
        ``"unknown"`` when either coordinate is missing, NaN, or cannot be
        converted to a float.
    """
    import math  # local import keeps this block free of new file-level deps

    try:
        x = float(plate_x)
        z = float(plate_z)
    except (TypeError, ValueError, OverflowError):
        # None / pd.NA -> TypeError, unparseable text -> ValueError,
        # oversized ints -> OverflowError.
        return "unknown"

    # float("nan") converts without error, and every comparison against NaN is
    # False, so without this guard a missing location would fall through all
    # the range checks and be mislabelled as "waste".
    if math.isnan(x) or math.isnan(z):
        return "unknown"

    # Approximate strike zone.
    zone_left, zone_right = -0.83, 0.83
    zone_bottom, zone_top = 1.50, 3.50

    if zone_left <= x <= zone_right and zone_bottom <= z <= zone_top:
        # Heart = central part of the zone; the remainder is the shadow (edge).
        inner_left, inner_right = -0.45, 0.45
        inner_bottom, inner_top = 1.90, 3.10

        if inner_left <= x <= inner_right and inner_bottom <= z <= inner_top:
            return "heart"
        return "shadow"

    # Chase = band just outside the zone.
    chase_left, chase_right = -1.20, 1.20
    chase_bottom, chase_top = 1.10, 3.90

    if chase_left <= x <= chase_right and chase_bottom <= z <= chase_top:
        return "chase"

    # Waste = clearly outside the zone.
    return "waste"
86
+
87
+
88
+ def _empty_batter_zone_row(player_name: str) -> dict[str, Any]:
89
+ out: dict[str, Any] = {
90
+ "player_name": player_name,
91
+ "zone_sample_size": 0,
92
+ }
93
+
94
+ for family in ["fastball", "breaking", "offspeed"]:
95
+ for zone in ["heart", "shadow", "chase", "waste"]:
96
+ out[f"hr_prob_{family}_{zone}"] = None
97
+ out[f"hit_prob_{family}_{zone}"] = None
98
+ out[f"tb2p_prob_{family}_{zone}"] = None
99
+ out[f"whiff_prob_{family}_{zone}"] = None
100
+ out[f"damage_prob_{family}_{zone}"] = None
101
+ out[f"sample_size_{family}_{zone}"] = 0
102
+
103
+ return out
104
+
105
+
106
def build_batter_zone_feature_row(statcast_df: pd.DataFrame, player_name: str) -> dict[str, Any]:
    """Build one flat feature row of outcome rates by pitch family and zone.

    Rows of ``statcast_df`` whose ``player_name`` column (compared as strings)
    equals ``player_name`` are bucketed by pitch family
    (``_normalize_pitch_family``) and by location zone
    (``classify_zone_bucket``).  For every non-empty family/zone cell the row
    records the number of pitches in the cell and, as a fraction of those
    pitches, how many were:

    * ``hit_prob``    - ``events`` is single/double/triple/home_run
    * ``hr_prob``     - ``events`` is home_run
    * ``tb2p_prob``   - ``events`` is double/triple/home_run
    * ``whiff_prob``  - ``description`` is swinging_strike(_blocked)
    * ``damage_prob`` - ``launch_speed >= 95`` or
      ``estimated_woba_using_speedangle >= 0.500`` or a home run

    Args:
        statcast_df: Pitch-level data.  ``player_name``, ``plate_x`` and
            ``plate_z`` are required for any rates to be computed;
            ``pitch_name`` (or, failing that, ``pitch_type``), ``events``,
            ``description``, ``launch_speed`` and
            ``estimated_woba_using_speedangle`` are optional.  The frame is
            not modified.
        player_name: Value to match against the ``player_name`` column.

    Returns:
        A dict shaped like ``_empty_batter_zone_row(player_name)``.  Cells with
        no pitches keep ``None`` rates and a sample size of 0.
        ``zone_sample_size`` is the number of matched rows, including pitches
        whose family or zone is "unknown".
    """
    out = _empty_batter_zone_row(player_name)

    if statcast_df.empty or "player_name" not in statcast_df.columns:
        return out

    # Read-only selection: nothing below writes to it, so no defensive copy.
    df = statcast_df[statcast_df["player_name"].astype(str) == str(player_name)]
    if df.empty:
        return out

    # Need pitch location for zone modeling.
    if "plate_x" not in df.columns or "plate_z" not in df.columns:
        return out

    if "pitch_name" in df.columns:
        pitch_labels = df["pitch_name"]
    elif "pitch_type" in df.columns:
        pitch_labels = df["pitch_type"]
    else:
        pitch_labels = pd.Series("unknown", index=df.index, dtype="object")

    pitch_families = pitch_labels.apply(_normalize_pitch_family)
    # zip over the two columns instead of a row-wise DataFrame.apply, which
    # builds a Series object per row.
    zone_buckets = pd.Series(
        [classify_zone_bucket(x, z) for x, z in zip(df["plate_x"], df["plate_z"])],
        index=df.index,
        dtype="object",
    )

    # Optional columns: substitute an explicit all-missing column rather than
    # relying on what pd.to_numeric / astype do with a bare ``None``.
    missing_text = pd.Series(index=df.index, dtype="object")
    missing_number = pd.Series(float("nan"), index=df.index, dtype="float64")

    events = df.get("events", missing_text).astype(str).str.lower()
    descriptions = df.get("description", missing_text).astype(str).str.lower()
    launch_speed = pd.to_numeric(df.get("launch_speed", missing_number), errors="coerce")
    estimated_woba = pd.to_numeric(
        df.get("estimated_woba_using_speedangle", missing_number), errors="coerce"
    )

    # Rough hit / total-base / HR / whiff proxies, one boolean per pitch.
    hr_mask = events.eq("home_run")
    outcome_masks = {
        "hit_prob": events.isin({"single", "double", "triple", "home_run"}),
        "hr_prob": hr_mask,
        "tb2p_prob": events.isin({"double", "triple", "home_run"}),
        "whiff_prob": descriptions.isin({"swinging_strike", "swinging_strike_blocked"}),
        # Damage proxy: quality contact, strong xwOBA, or a home run.
        "damage_prob": (launch_speed >= 95) | (estimated_woba >= 0.500) | hr_mask,
    }

    out["zone_sample_size"] = int(len(df))

    families = ("fastball", "breaking", "offspeed")
    zones = ("heart", "shadow", "chase", "waste")

    # Positional boolean arrays.  Selecting with these (instead of
    # ``mask.loc[subset.index]``) stays correct when the frame's index has
    # repeated labels, e.g. after concatenating frames without resetting it;
    # label-based lookup would return each duplicated label several times and
    # skew the rates.
    in_family = {f: (pitch_families == f).to_numpy(dtype=bool) for f in families}
    in_zone = {z: (zone_buckets == z).to_numpy(dtype=bool) for z in zones}

    for family in families:
        for zone in zones:
            cell = in_family[family] & in_zone[zone]
            sample_size = int(cell.sum())
            if sample_size == 0:
                continue

            out[f"sample_size_{family}_{zone}"] = sample_size
            for metric, mask in outcome_masks.items():
                out[f"{metric}_{family}_{zone}"] = float(mask[cell].mean())

    return out