Spaces:
Sleeping
Sleeping
Update models/batter_zone_model.py
Browse files- models/batter_zone_model.py +110 -33
models/batter_zone_model.py
CHANGED
|
@@ -133,41 +133,13 @@ def _empty_batter_zone_row(player_name: str) -> dict[str, Any]:
|
|
| 133 |
|
| 134 |
return out
|
| 135 |
|
| 136 |
-
|
| 137 |
-
def build_batter_zone_feature_row(statcast_df: pd.DataFrame, player_name: str) -> dict[str, Any]:
|
| 138 |
-
if statcast_df.empty or "player_name" not in statcast_df.columns:
|
| 139 |
-
return _empty_batter_zone_row(player_name)
|
| 140 |
-
|
| 141 |
-
player_names = statcast_df["player_name"].astype(str).fillna("")
|
| 142 |
-
normalized_target_variants = _to_last_first_variants(player_name)
|
| 143 |
-
|
| 144 |
-
normalized_series = player_names.map(_normalize_name_text)
|
| 145 |
-
|
| 146 |
-
mask = normalized_series.isin(normalized_target_variants)
|
| 147 |
-
|
| 148 |
-
df = statcast_df[mask].copy()
|
| 149 |
-
|
| 150 |
-
if df.empty:
|
| 151 |
-
normalized_player_name = _normalize_name_text(player_name)
|
| 152 |
-
name_parts = normalized_player_name.split()
|
| 153 |
-
|
| 154 |
-
if len(name_parts) >= 2:
|
| 155 |
-
first = name_parts[0]
|
| 156 |
-
last = name_parts[-1]
|
| 157 |
-
|
| 158 |
-
loose_mask = normalized_series.apply(
|
| 159 |
-
lambda n: isinstance(n, str) and first in n and last in n
|
| 160 |
-
)
|
| 161 |
-
df = statcast_df[loose_mask].copy()
|
| 162 |
-
|
| 163 |
if df.empty:
|
| 164 |
return _empty_batter_zone_row(player_name)
|
| 165 |
|
| 166 |
-
# Need pitch location + pitch type for zone modeling
|
| 167 |
if "plate_x" not in df.columns or "plate_z" not in df.columns:
|
| 168 |
return _empty_batter_zone_row(player_name)
|
| 169 |
|
| 170 |
-
pitch_name_series = None
|
| 171 |
if "pitch_name" in df.columns:
|
| 172 |
pitch_name_series = df["pitch_name"]
|
| 173 |
elif "pitch_type" in df.columns:
|
|
@@ -189,7 +161,6 @@ def build_batter_zone_feature_row(statcast_df: pd.DataFrame, player_name: str) -
|
|
| 189 |
estimated_woba = pd.to_numeric(df.get("estimated_woba_using_speedangle"), errors="coerce")
|
| 190 |
events = df.get("events", pd.Series(index=df.index, dtype="object")).astype(str).str.lower()
|
| 191 |
|
| 192 |
-
# rough hit / tb / hr / whiff proxies
|
| 193 |
hit_mask = events.isin({"single", "double", "triple", "home_run"})
|
| 194 |
hr_mask = events.eq("home_run")
|
| 195 |
tb2p_mask = events.isin({"double", "triple", "home_run"})
|
|
@@ -197,7 +168,6 @@ def build_batter_zone_feature_row(statcast_df: pd.DataFrame, player_name: str) -
|
|
| 197 |
description_series = df.get("description", pd.Series(index=df.index, dtype="object")).astype(str).str.lower()
|
| 198 |
whiff_mask = description_series.isin({"swinging_strike", "swinging_strike_blocked"})
|
| 199 |
|
| 200 |
-
# damage proxy: either quality contact or strong xwOBA
|
| 201 |
damage_mask = (
|
| 202 |
(launch_speed >= 95)
|
| 203 |
| (estimated_woba >= 0.500)
|
|
@@ -214,14 +184,121 @@ def build_batter_zone_feature_row(statcast_df: pd.DataFrame, player_name: str) -
|
|
| 214 |
continue
|
| 215 |
|
| 216 |
subset_idx = subset.index
|
| 217 |
-
|
| 218 |
sample_size = int(len(subset))
|
| 219 |
out[f"sample_size_{family}_{zone}"] = sample_size
|
| 220 |
-
|
| 221 |
out[f"hit_prob_{family}_{zone}"] = float(hit_mask.loc[subset_idx].mean())
|
| 222 |
out[f"hr_prob_{family}_{zone}"] = float(hr_mask.loc[subset_idx].mean())
|
| 223 |
out[f"tb2p_prob_{family}_{zone}"] = float(tb2p_mask.loc[subset_idx].mean())
|
| 224 |
out[f"whiff_prob_{family}_{zone}"] = float(whiff_mask.loc[subset_idx].mean())
|
| 225 |
out[f"damage_prob_{family}_{zone}"] = float(damage_mask.loc[subset_idx].mean())
|
| 226 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
return out
|
|
|
|
| 133 |
|
| 134 |
return out
|
| 135 |
|
| 136 |
+
def _build_zone_metrics_from_df(df: pd.DataFrame, player_name: str) -> dict[str, Any]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
if df.empty:
|
| 138 |
return _empty_batter_zone_row(player_name)
|
| 139 |
|
|
|
|
| 140 |
if "plate_x" not in df.columns or "plate_z" not in df.columns:
|
| 141 |
return _empty_batter_zone_row(player_name)
|
| 142 |
|
|
|
|
| 143 |
if "pitch_name" in df.columns:
|
| 144 |
pitch_name_series = df["pitch_name"]
|
| 145 |
elif "pitch_type" in df.columns:
|
|
|
|
| 161 |
estimated_woba = pd.to_numeric(df.get("estimated_woba_using_speedangle"), errors="coerce")
|
| 162 |
events = df.get("events", pd.Series(index=df.index, dtype="object")).astype(str).str.lower()
|
| 163 |
|
|
|
|
| 164 |
hit_mask = events.isin({"single", "double", "triple", "home_run"})
|
| 165 |
hr_mask = events.eq("home_run")
|
| 166 |
tb2p_mask = events.isin({"double", "triple", "home_run"})
|
|
|
|
| 168 |
description_series = df.get("description", pd.Series(index=df.index, dtype="object")).astype(str).str.lower()
|
| 169 |
whiff_mask = description_series.isin({"swinging_strike", "swinging_strike_blocked"})
|
| 170 |
|
|
|
|
| 171 |
damage_mask = (
|
| 172 |
(launch_speed >= 95)
|
| 173 |
| (estimated_woba >= 0.500)
|
|
|
|
| 184 |
continue
|
| 185 |
|
| 186 |
subset_idx = subset.index
|
|
|
|
| 187 |
sample_size = int(len(subset))
|
| 188 |
out[f"sample_size_{family}_{zone}"] = sample_size
|
|
|
|
| 189 |
out[f"hit_prob_{family}_{zone}"] = float(hit_mask.loc[subset_idx].mean())
|
| 190 |
out[f"hr_prob_{family}_{zone}"] = float(hr_mask.loc[subset_idx].mean())
|
| 191 |
out[f"tb2p_prob_{family}_{zone}"] = float(tb2p_mask.loc[subset_idx].mean())
|
| 192 |
out[f"whiff_prob_{family}_{zone}"] = float(whiff_mask.loc[subset_idx].mean())
|
| 193 |
out[f"damage_prob_{family}_{zone}"] = float(damage_mask.loc[subset_idx].mean())
|
| 194 |
|
| 195 |
+
return out
|
| 196 |
+
|
| 197 |
+
def _blend_rate_with_sample_size(
|
| 198 |
+
current_value: Any,
|
| 199 |
+
current_sample: int,
|
| 200 |
+
previous_value: Any,
|
| 201 |
+
previous_sample: int,
|
| 202 |
+
) -> float | None:
|
| 203 |
+
try:
|
| 204 |
+
current_sample = int(current_sample or 0)
|
| 205 |
+
except Exception:
|
| 206 |
+
current_sample = 0
|
| 207 |
+
|
| 208 |
+
try:
|
| 209 |
+
previous_sample = int(previous_sample or 0)
|
| 210 |
+
except Exception:
|
| 211 |
+
previous_sample = 0
|
| 212 |
+
|
| 213 |
+
current_exists = current_value is not None
|
| 214 |
+
previous_exists = previous_value is not None
|
| 215 |
+
|
| 216 |
+
if not current_exists and not previous_exists:
|
| 217 |
+
return None
|
| 218 |
+
if current_exists and not previous_exists:
|
| 219 |
+
return float(current_value)
|
| 220 |
+
if previous_exists and not current_exists:
|
| 221 |
+
return float(previous_value)
|
| 222 |
+
|
| 223 |
+
# Current season weight rises with sample size, but does not go to 100% too quickly.
|
| 224 |
+
current_weight = min(0.85, max(0.25, current_sample / (current_sample + 150.0)))
|
| 225 |
+
previous_weight = 1.0 - current_weight
|
| 226 |
+
|
| 227 |
+
return (float(current_value) * current_weight) + (float(previous_value) * previous_weight)
|
| 228 |
+
|
| 229 |
+
def build_batter_zone_feature_row(
    current_statcast_df: pd.DataFrame,
    player_name: str,
    previous_statcast_df: pd.DataFrame | None = None,
) -> dict[str, Any]:
    """Build one batter's zone feature row, blending two Statcast frames.

    Rows for ``player_name`` are selected from each frame, per-frame metrics
    are computed with ``_build_zone_metrics_from_df``, and every
    ``<metric>_<family>_<zone>`` rate is combined with
    ``_blend_rate_with_sample_size``. Sample-size keys hold the sum of the
    current and previous counts.

    Args:
        current_statcast_df: Current-season pitch rows; ``None`` is treated
            as an empty frame.
        player_name: Batter to look up in each frame's ``player_name`` column.
        previous_statcast_df: Optional previous-season pitch rows; ``None``
            is treated as an empty frame.

    Returns:
        A dict seeded from ``_empty_batter_zone_row(player_name)`` with the
        blended rates, the summed sample sizes, and the extra keys
        ``current_zone_sample_size`` and ``previous_zone_sample_size``.
    """
    if current_statcast_df is None:
        current_statcast_df = pd.DataFrame()
    if previous_statcast_df is None:
        previous_statcast_df = pd.DataFrame()

    def _match_player(df: pd.DataFrame, player_name: str) -> pd.DataFrame:
        """Return a copy of the rows of ``df`` whose name matches ``player_name``."""
        if df.empty or "player_name" not in df.columns:
            return pd.DataFrame()

        raw_names = df["player_name"]
        # astype(str) turns missing values into the literal text "nan", so a
        # fillna("") applied afterwards never fires. Blank the originally
        # missing entries explicitly instead.
        player_names = raw_names.astype(str).mask(raw_names.isna(), "")
        normalized_target_variants = _to_last_first_variants(player_name)
        normalized_series = player_names.map(_normalize_name_text)

        # First pass: exact match against the normalized name variants.
        mask = normalized_series.isin(normalized_target_variants)
        matched = df[mask].copy()

        if matched.empty:
            # Fallback: accept any row whose normalized name contains both the
            # first and the last token of the normalized target name.
            normalized_player_name = _normalize_name_text(player_name)
            name_parts = normalized_player_name.split()

            if len(name_parts) >= 2:
                first = name_parts[0]
                last = name_parts[-1]

                loose_mask = normalized_series.apply(
                    lambda n: isinstance(n, str) and first in n and last in n
                )
                matched = df[loose_mask].copy()

        return matched

    current_df = _match_player(current_statcast_df, player_name)
    previous_df = _match_player(previous_statcast_df, player_name)

    current_metrics = _build_zone_metrics_from_df(current_df, player_name)
    previous_metrics = _build_zone_metrics_from_df(previous_df, player_name)

    out = _empty_batter_zone_row(player_name)

    # `or 0` guards against a None placeholder under the key.
    # NOTE(review): assumes the empty row may store None here - confirm.
    current_total = int(current_metrics.get("zone_sample_size", 0) or 0)
    previous_total = int(previous_metrics.get("zone_sample_size", 0) or 0)
    out["zone_sample_size"] = current_total + previous_total

    for family in ["fastball", "breaking", "offspeed"]:
        for zone in ["heart", "shadow", "chase", "waste"]:
            sample_key = f"sample_size_{family}_{zone}"

            current_sample = int(current_metrics.get(sample_key, 0) or 0)
            previous_sample = int(previous_metrics.get(sample_key, 0) or 0)

            out[sample_key] = current_sample + previous_sample

            for metric_prefix in [
                "hit_prob",
                "hr_prob",
                "tb2p_prob",
                "whiff_prob",
                "damage_prob",
            ]:
                key = f"{metric_prefix}_{family}_{zone}"
                out[key] = _blend_rate_with_sample_size(
                    current_value=current_metrics.get(key),
                    current_sample=current_sample,
                    previous_value=previous_metrics.get(key),
                    previous_sample=previous_sample,
                )

    out["current_zone_sample_size"] = current_total
    out["previous_zone_sample_size"] = previous_total

    return out
|