Syntrex committed on
Commit
09655a8
·
verified ·
1 Parent(s): aa1c2fd

Update models/batter_zone_model.py

Browse files
Files changed (1) hide show
  1. models/batter_zone_model.py +110 -33
models/batter_zone_model.py CHANGED
@@ -133,41 +133,13 @@ def _empty_batter_zone_row(player_name: str) -> dict[str, Any]:
133
 
134
  return out
135
 
136
-
137
- def build_batter_zone_feature_row(statcast_df: pd.DataFrame, player_name: str) -> dict[str, Any]:
138
- if statcast_df.empty or "player_name" not in statcast_df.columns:
139
- return _empty_batter_zone_row(player_name)
140
-
141
- player_names = statcast_df["player_name"].astype(str).fillna("")
142
- normalized_target_variants = _to_last_first_variants(player_name)
143
-
144
- normalized_series = player_names.map(_normalize_name_text)
145
-
146
- mask = normalized_series.isin(normalized_target_variants)
147
-
148
- df = statcast_df[mask].copy()
149
-
150
- if df.empty:
151
- normalized_player_name = _normalize_name_text(player_name)
152
- name_parts = normalized_player_name.split()
153
-
154
- if len(name_parts) >= 2:
155
- first = name_parts[0]
156
- last = name_parts[-1]
157
-
158
- loose_mask = normalized_series.apply(
159
- lambda n: isinstance(n, str) and first in n and last in n
160
- )
161
- df = statcast_df[loose_mask].copy()
162
-
163
  if df.empty:
164
  return _empty_batter_zone_row(player_name)
165
 
166
- # Need pitch location + pitch type for zone modeling
167
  if "plate_x" not in df.columns or "plate_z" not in df.columns:
168
  return _empty_batter_zone_row(player_name)
169
 
170
- pitch_name_series = None
171
  if "pitch_name" in df.columns:
172
  pitch_name_series = df["pitch_name"]
173
  elif "pitch_type" in df.columns:
@@ -189,7 +161,6 @@ def build_batter_zone_feature_row(statcast_df: pd.DataFrame, player_name: str) -
189
  estimated_woba = pd.to_numeric(df.get("estimated_woba_using_speedangle"), errors="coerce")
190
  events = df.get("events", pd.Series(index=df.index, dtype="object")).astype(str).str.lower()
191
 
192
- # rough hit / tb / hr / whiff proxies
193
  hit_mask = events.isin({"single", "double", "triple", "home_run"})
194
  hr_mask = events.eq("home_run")
195
  tb2p_mask = events.isin({"double", "triple", "home_run"})
@@ -197,7 +168,6 @@ def build_batter_zone_feature_row(statcast_df: pd.DataFrame, player_name: str) -
197
  description_series = df.get("description", pd.Series(index=df.index, dtype="object")).astype(str).str.lower()
198
  whiff_mask = description_series.isin({"swinging_strike", "swinging_strike_blocked"})
199
 
200
- # damage proxy: either quality contact or strong xwOBA
201
  damage_mask = (
202
  (launch_speed >= 95)
203
  | (estimated_woba >= 0.500)
@@ -214,14 +184,121 @@ def build_batter_zone_feature_row(statcast_df: pd.DataFrame, player_name: str) -
214
  continue
215
 
216
  subset_idx = subset.index
217
-
218
  sample_size = int(len(subset))
219
  out[f"sample_size_{family}_{zone}"] = sample_size
220
-
221
  out[f"hit_prob_{family}_{zone}"] = float(hit_mask.loc[subset_idx].mean())
222
  out[f"hr_prob_{family}_{zone}"] = float(hr_mask.loc[subset_idx].mean())
223
  out[f"tb2p_prob_{family}_{zone}"] = float(tb2p_mask.loc[subset_idx].mean())
224
  out[f"whiff_prob_{family}_{zone}"] = float(whiff_mask.loc[subset_idx].mean())
225
  out[f"damage_prob_{family}_{zone}"] = float(damage_mask.loc[subset_idx].mean())
226
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
  return out
 
133
 
134
  return out
135
 
136
+ def _build_zone_metrics_from_df(df: pd.DataFrame, player_name: str) -> dict[str, Any]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  if df.empty:
138
  return _empty_batter_zone_row(player_name)
139
 
 
140
  if "plate_x" not in df.columns or "plate_z" not in df.columns:
141
  return _empty_batter_zone_row(player_name)
142
 
 
143
  if "pitch_name" in df.columns:
144
  pitch_name_series = df["pitch_name"]
145
  elif "pitch_type" in df.columns:
 
161
  estimated_woba = pd.to_numeric(df.get("estimated_woba_using_speedangle"), errors="coerce")
162
  events = df.get("events", pd.Series(index=df.index, dtype="object")).astype(str).str.lower()
163
 
 
164
  hit_mask = events.isin({"single", "double", "triple", "home_run"})
165
  hr_mask = events.eq("home_run")
166
  tb2p_mask = events.isin({"double", "triple", "home_run"})
 
168
  description_series = df.get("description", pd.Series(index=df.index, dtype="object")).astype(str).str.lower()
169
  whiff_mask = description_series.isin({"swinging_strike", "swinging_strike_blocked"})
170
 
 
171
  damage_mask = (
172
  (launch_speed >= 95)
173
  | (estimated_woba >= 0.500)
 
184
  continue
185
 
186
  subset_idx = subset.index
 
187
  sample_size = int(len(subset))
188
  out[f"sample_size_{family}_{zone}"] = sample_size
 
189
  out[f"hit_prob_{family}_{zone}"] = float(hit_mask.loc[subset_idx].mean())
190
  out[f"hr_prob_{family}_{zone}"] = float(hr_mask.loc[subset_idx].mean())
191
  out[f"tb2p_prob_{family}_{zone}"] = float(tb2p_mask.loc[subset_idx].mean())
192
  out[f"whiff_prob_{family}_{zone}"] = float(whiff_mask.loc[subset_idx].mean())
193
  out[f"damage_prob_{family}_{zone}"] = float(damage_mask.loc[subset_idx].mean())
194
 
195
+ return out
196
+
197
+ def _blend_rate_with_sample_size(
198
+ current_value: Any,
199
+ current_sample: int,
200
+ previous_value: Any,
201
+ previous_sample: int,
202
+ ) -> float | None:
203
+ try:
204
+ current_sample = int(current_sample or 0)
205
+ except Exception:
206
+ current_sample = 0
207
+
208
+ try:
209
+ previous_sample = int(previous_sample or 0)
210
+ except Exception:
211
+ previous_sample = 0
212
+
213
+ current_exists = current_value is not None
214
+ previous_exists = previous_value is not None
215
+
216
+ if not current_exists and not previous_exists:
217
+ return None
218
+ if current_exists and not previous_exists:
219
+ return float(current_value)
220
+ if previous_exists and not current_exists:
221
+ return float(previous_value)
222
+
223
+ # Current season weight rises with sample size, but does not go to 100% too quickly.
224
+ current_weight = min(0.85, max(0.25, current_sample / (current_sample + 150.0)))
225
+ previous_weight = 1.0 - current_weight
226
+
227
+ return (float(current_value) * current_weight) + (float(previous_value) * previous_weight)
228
+
229
def build_batter_zone_feature_row(
    current_statcast_df: pd.DataFrame,
    player_name: str,
    previous_statcast_df: pd.DataFrame | None = None,
) -> dict[str, Any]:
    """Build one batter's zone feature row from current and previous data.

    Rows for ``player_name`` are selected from each frame, per-frame metrics
    are computed with ``_build_zone_metrics_from_df``, and every
    ``<metric>_<family>_<zone>`` rate is combined with
    ``_blend_rate_with_sample_size``. Sample sizes are summed across the two
    frames.

    Args:
        current_statcast_df: Current-period pitch rows; ``None`` is treated as
            an empty frame.
        player_name: Batter to look up in the ``player_name`` column.
        previous_statcast_df: Optional previous-period pitch rows; ``None`` is
            treated as an empty frame.

    Returns:
        A dict seeded from ``_empty_batter_zone_row(player_name)`` with the
        combined sample sizes and blended rates filled in, plus
        ``current_zone_sample_size`` and ``previous_zone_sample_size``.
    """
    if current_statcast_df is None:
        current_statcast_df = pd.DataFrame()
    if previous_statcast_df is None:
        previous_statcast_df = pd.DataFrame()

    def _match_player(df: pd.DataFrame, player_name: str) -> pd.DataFrame:
        # Return a copy of the rows of ``df`` whose name matches the player.
        if df.empty or "player_name" not in df.columns:
            return pd.DataFrame()

        raw_names = df["player_name"]
        # Blank out missing names explicitly: ``astype(str)`` alone would turn
        # them into the literal text "nan"/"None", which a later ``fillna``
        # can no longer detect.
        player_names = raw_names.astype(str).where(raw_names.notna(), "")
        normalized_target_variants = _to_last_first_variants(player_name)
        normalized_series = player_names.map(_normalize_name_text)

        # First pass: exact match against the normalized name variants.
        mask = normalized_series.isin(normalized_target_variants)
        matched = df[mask].copy()

        if matched.empty:
            normalized_player_name = _normalize_name_text(player_name)
            name_parts = normalized_player_name.split()

            if len(name_parts) >= 2:
                first = name_parts[0]
                last = name_parts[-1]

                # Fallback: both the first and last token must appear somewhere
                # in the normalized name.
                # NOTE(review): this is a substring test, so short tokens may
                # match unrelated players - confirm this is acceptable.
                loose_mask = normalized_series.apply(
                    lambda n: isinstance(n, str) and first in n and last in n
                )
                matched = df[loose_mask].copy()

        return matched

    current_df = _match_player(current_statcast_df, player_name)
    previous_df = _match_player(previous_statcast_df, player_name)

    current_metrics = _build_zone_metrics_from_df(current_df, player_name)
    previous_metrics = _build_zone_metrics_from_df(previous_df, player_name)

    out = _empty_batter_zone_row(player_name)

    # ``or 0`` guards against a stored ``None`` before the int conversion.
    current_total = int(current_metrics.get("zone_sample_size", 0) or 0)
    previous_total = int(previous_metrics.get("zone_sample_size", 0) or 0)
    out["zone_sample_size"] = current_total + previous_total

    families = ("fastball", "breaking", "offspeed")
    zones = ("heart", "shadow", "chase", "waste")
    metric_prefixes = (
        "hit_prob",
        "hr_prob",
        "tb2p_prob",
        "whiff_prob",
        "damage_prob",
    )

    for family in families:
        for zone in zones:
            sample_key = f"sample_size_{family}_{zone}"

            current_sample = int(current_metrics.get(sample_key, 0) or 0)
            previous_sample = int(previous_metrics.get(sample_key, 0) or 0)

            out[sample_key] = current_sample + previous_sample

            for metric_prefix in metric_prefixes:
                key = f"{metric_prefix}_{family}_{zone}"
                out[key] = _blend_rate_with_sample_size(
                    current_value=current_metrics.get(key),
                    current_sample=current_sample,
                    previous_value=previous_metrics.get(key),
                    previous_sample=previous_sample,
                )

    out["current_zone_sample_size"] = current_total
    out["previous_zone_sample_size"] = previous_total

    return out