# pitch_dash / src / featurize.py
# rsm-roguchi
# Initial clean commit (no binaries, no venv)
# c75151e
from __future__ import annotations
import numpy as np
import pandas as pd
# Scale factor multiplied into pfx_x / pfx_z in engineer_pitch_features to
# produce the "*_in" movement columns.
INCHES_PER_FOOT = 12.0
def infer_ivb_sign(df_raw: pd.DataFrame) -> int:
    """
    Pick the IVB orientation multiplier (+1 or -1) from the data itself, with
    the intent that 'ride' ends up positive after multiplying.

    Only df_raw['pfx_z'] is consulted; no pitch types are hardcoded.

    Returns -1 when the column is absent, when it holds no non-null values, or
    when its median is negative. Returns +1 in every other case.
    """
    # No vertical-movement column at all: fall back to -1.
    if "pfx_z" not in df_raw.columns:
        return -1

    observed = df_raw["pfx_z"].dropna()
    # Column exists but carries no usable values: same fallback.
    if observed.empty:
        return -1

    # The sign of the median decides the orientation.
    if observed.median() < 0:
        return -1
    return 1
def signed_arm_side(hb_in_raw: pd.Series, p_throws: pd.Series) -> pd.Series:
    """
    Re-sign horizontal break so arm-side movement is positive regardless of
    handedness.

    The input is taken to be in the catcher's-view convention (positive toward
    the catcher's right). Under that convention:
    RHP → -hb_in_raw is arm-side ; LHP → +hb_in_raw is arm-side.

    Args:
        hb_in_raw: horizontal break values, one per pitch.
        p_throws: pitcher handedness per pitch. Only the first character,
            upper-cased, is inspected; "R" means right-handed and anything
            else is treated as left-handed. Missing entries count as "R".
            ``None`` (no handedness data at all) is accepted and handled as if
            every entry were missing.

    Returns:
        Series of arm-side-positive horizontal break, aligned with hb_in_raw.
    """
    if p_throws is None:
        # Callers may pass ``df.get("p_throws")``, which is None when the
        # column is absent. Missing handedness defaults to "R" below, so the
        # whole series gets the right-hander sign. ``* 1.0`` keeps the float
        # result dtype that the multiplied path produces.
        return -hb_in_raw * 1.0

    handed = p_throws.fillna("R").str.upper().str[0]
    sign = np.where(handed == "R", 1.0, -1.0)
    return -hb_in_raw * sign
def _safe_rate(num, den):
    """
    Element-wise ``num / den`` that yields NaN wherever ``den`` is not > 0.

    The result buffer is a float array shaped like ``num`` and pre-filled with
    NaN; only positions with a positive denominator are overwritten.
    """
    result = np.full_like(num, np.nan, dtype=float)
    has_denominator = den > 0
    return np.divide(num, den, out=result, where=has_denominator)
def engineer_pitch_features(df: pd.DataFrame, ivb_sign: int) -> pd.DataFrame:
    """
    Aggregate pitch-level rows into one feature row per
    (player_name, pitch_type, p_throws).

    Args:
        df: pitch-level frame. Required columns: pitch_type, player_name,
            events, description, p_throws, pfx_x, pfx_z, release_speed.
            Optional columns: release_pos_x, release_pos_z, release_spin_rate;
            when absent, the features derived from them are NaN.
            The input frame is not modified.
        ivb_sign: multiplier applied to pfx_z (e.g. the value returned by
            infer_ivb_sign).

    Returns:
        DataFrame with columns player_name, pitch_type, p_throws, n, velo,
        spin, ivb_in, hb_as_in, rel_height, rel_side, csw, whiff_rate,
        gb_rate, zone_pct. Groups whose velo, ivb_in or hb_as_in is NaN are
        dropped.

    Raises:
        KeyError: if any required column is missing; the message lists all of
            them.
    """
    required = [
        "pitch_type",
        "player_name",
        "events",
        "description",
        "p_throws",
        "pfx_x",
        "pfx_z",
        "release_speed",
    ]
    optional = ["release_pos_x", "release_pos_z", "release_spin_rate"]

    # Fail early with one clear message instead of an opaque error from
    # whichever downstream step happens to touch the missing column first.
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise KeyError(
            f"engineer_pitch_features: missing required column(s): {missing}"
        )

    present = required + [c for c in optional if c in df.columns]
    df = df[present].copy()
    # Absent optional measurements become all-NaN so their means are NaN
    # rather than aborting the aggregation.
    for col in optional:
        if col not in df.columns:
            df[col] = np.nan

    # outcomes (0/1 flags derived from the description / events text)
    df["is_called_strike"] = (df["description"] == "called_strike").astype(int)
    df["is_swing"] = (
        df["description"]
        .isin(["swinging_strike", "swinging_strike_blocked", "foul", "hit_into_play"])
        .astype(int)
    )
    df["is_whiff"] = (
        df["description"]
        .isin(["swinging_strike", "swinging_strike_blocked"])
        .astype(int)
    )
    df["is_in_play"] = (df["description"] == "hit_into_play").astype(int)
    # NOTE(review): this flags by event name, not by a batted-ball
    # classification -- confirm this stand-in for "ground ball" is intended.
    df["is_gb"] = (
        df["events"]
        .isin(["groundout", "field_error", "single", "double", "triple"])
        .astype(int)
    )

    # movement (handedness-aware XY)
    df["hb_in_raw"] = df["pfx_x"] * INCHES_PER_FOOT
    df["ivb_in"] = ivb_sign * df["pfx_z"] * INCHES_PER_FOOT  # + = ride, − = drop
    df["hb_as_in"] = signed_arm_side(df["hb_in_raw"], df["p_throws"])

    # groupby's default dropna=True discards rows with a missing value in any
    # of the three key columns.
    grp = df.groupby(["player_name", "pitch_type", "p_throws"], as_index=False)
    agg = grp.agg(
        n=("pitch_type", "size"),
        velo=("release_speed", "mean"),
        spin=("release_spin_rate", "mean"),
        ivb_in=("ivb_in", "mean"),
        hb_as_in=("hb_as_in", "mean"),
        rel_height=("release_pos_z", "mean"),
        rel_side=("release_pos_x", "mean"),
        cs=("is_called_strike", "sum"),
        swings=("is_swing", "sum"),
        whiffs=("is_whiff", "sum"),
        inplay=("is_in_play", "sum"),
        gb=("is_gb", "sum"),
    )

    # Rates are NaN when the denominator is zero.
    agg["csw"] = _safe_rate(agg["cs"] + agg["whiffs"], agg["n"])  # per pitch
    agg["whiff_rate"] = _safe_rate(agg["whiffs"], agg["swings"])  # per swing
    agg["gb_rate"] = _safe_rate(agg["gb"], agg["inplay"])  # per ball in play
    # Computed from called strikes and balls in play over all pitches; no
    # pitch-location column is consulted.
    agg["zone_pct"] = _safe_rate(agg["cs"] + agg["inplay"], agg["n"])

    keep = [
        "player_name",
        "pitch_type",
        "p_throws",
        "n",
        "velo",
        "spin",
        "ivb_in",
        "hb_as_in",
        "rel_height",
        "rel_side",
        "csw",
        "whiff_rate",
        "gb_rate",
        "zone_pct",
    ]
    return agg[keep].dropna(subset=["velo", "ivb_in", "hb_as_in"])