from __future__ import annotations

import numpy as np
import pandas as pd

INCHES_PER_FOOT = 12.0

# Columns copied from the raw frame when they are present.
_CANDIDATE_COLUMNS = (
    "pitch_type",
    "player_name",
    "game_date",
    "events",
    "description",
    "p_throws",
    "stand",
    "release_pos_x",
    "release_pos_z",
    "pfx_x",
    "pfx_z",
    "release_speed",
    "release_spin_rate",
    "plate_x",
    "plate_z",
    "zone",
)

# Columns that engineer_pitch_features reads unconditionally (outcome flags,
# movement, group keys, aggregations). The remaining candidates are optional.
_REQUIRED_COLUMNS = (
    "pitch_type",
    "player_name",
    "events",
    "description",
    "p_throws",
    "release_pos_x",
    "release_pos_z",
    "pfx_x",
    "pfx_z",
    "release_speed",
    "release_spin_rate",
)


def infer_ivb_sign(df_raw: pd.DataFrame) -> int:
    """Return the sign (+1 or -1) that makes the typical ``pfx_z`` positive.

    Data-driven IVB orientation: pick the sign so that 'ride' is positive.
    Uses only ``df_raw['pfx_z']`` (no hardcoding of pitch types).

    Returns:
        -1 when the ``pfx_z`` column is missing or has no non-NaN values,
        -1 when the median of ``pfx_z`` is negative,
        +1 otherwise (including a median of exactly 0).
    """
    if "pfx_z" not in df_raw.columns:
        return -1
    pfx_z = df_raw["pfx_z"].dropna()
    if pfx_z.empty:
        return -1
    return -1 if pfx_z.median() < 0 else 1


def signed_arm_side(hb_in_raw: pd.Series, p_throws: pd.Series) -> pd.Series:
    """Convert raw horizontal break into 'arm-side positive' for any handedness.

    The result is ``-hb_in_raw`` for right-handed pitchers and ``+hb_in_raw``
    for left-handed pitchers.

    NOTE(review): this matches a Statcast ``pfx_x`` convention in which a
    right-hander's arm-side run is negative — confirm against the data source.
    The previous docstring stated the opposite mapping from what the code
    computes.

    Args:
        hb_in_raw: Raw horizontal break (the caller passes ``pfx_x`` in inches).
        p_throws: Pitcher handedness labels. Only the upper-cased first
            character is inspected: ``"R"`` is right-handed, anything else is
            treated as left-handed. Missing values, or ``None`` for the whole
            argument, default to right-handed.

    Returns:
        Series aligned with ``hb_in_raw`` holding arm-side-positive break.
    """
    if p_throws is None:
        # No handedness information at all: same default as a missing value.
        return -hb_in_raw
    handed = p_throws.fillna("R").str.upper().str[0]
    sign = np.where(handed == "R", 1.0, -1.0)
    return -hb_in_raw * sign


def _safe_rate(num, den):
    """Element-wise ``num / den`` as a float ndarray, NaN where ``den`` <= 0.

    Inputs are converted to float arrays first so the result does not depend
    on how pandas dispatches ``np.divide`` with ``out=``/``where=``.
    A NaN denominator also yields NaN, because ``NaN > 0`` is False.
    """
    num_arr = np.asarray(num, dtype=float)
    den_arr = np.asarray(den, dtype=float)
    out = np.full(np.broadcast(num_arr, den_arr).shape, np.nan)
    np.divide(num_arr, den_arr, out=out, where=den_arr > 0)
    return out


def engineer_pitch_features(df: pd.DataFrame, ivb_sign: int) -> pd.DataFrame:
    """Aggregate pitch-level rows into one feature row per pitcher and pitch type.

    Args:
        df: Pitch-level frame. Must contain every column in
            ``_REQUIRED_COLUMNS``; other candidate columns are optional.
        ivb_sign: +1 or -1 multiplier applied to ``pfx_z`` (see
            ``infer_ivb_sign``) so that positive ``ivb_in`` means ride.

    Returns:
        One row per (``player_name``, ``pitch_type``, ``p_throws``) with pitch
        count ``n``, mean velocity/spin/movement/release point, and the rate
        columns ``csw``, ``whiff_rate``, ``gb_rate`` and ``zone_pct``. Rates
        are NaN when their denominator is 0. Groups whose mean ``velo``,
        ``ivb_in`` or ``hb_as_in`` is NaN are dropped. Rows with a NaN group
        key are excluded by ``groupby`` (pandas default ``dropna=True``).

    Raises:
        KeyError: If any required column is missing from ``df``.
    """
    missing = [c for c in _REQUIRED_COLUMNS if c not in df.columns]
    if missing:
        # Fail once, up front, with every missing column named, instead of a
        # KeyError/AttributeError from whichever access happens to run first.
        raise KeyError(
            f"engineer_pitch_features: missing required column(s): {missing}"
        )

    have = [c for c in _CANDIDATE_COLUMNS if c in df.columns]
    df = df[have].copy()

    # outcomes
    description = df["description"]
    df["is_called_strike"] = (description == "called_strike").astype(int)
    # NOTE(review): "foul_tip" is not counted as a swing here — confirm this
    # is intended.
    df["is_swing"] = description.isin(
        ["swinging_strike", "swinging_strike_blocked", "foul", "hit_into_play"]
    ).astype(int)
    df["is_whiff"] = description.isin(
        ["swinging_strike", "swinging_strike_blocked"]
    ).astype(int)
    df["is_in_play"] = (description == "hit_into_play").astype(int)
    # NOTE(review): event-based proxy — every single/double/triple is counted
    # as a ground ball regardless of batted-ball type. Verify this is intended.
    df["is_gb"] = (
        df["events"]
        .isin(["groundout", "field_error", "single", "double", "triple"])
        .astype(int)
    )

    # movement (handedness-aware XY)
    df["hb_in_raw"] = df["pfx_x"] * INCHES_PER_FOOT
    df["ivb_in"] = ivb_sign * df["pfx_z"] * INCHES_PER_FOOT  # + = ride, − = drop
    df["hb_as_in"] = signed_arm_side(df["hb_in_raw"], df["p_throws"])

    grp = df.groupby(["player_name", "pitch_type", "p_throws"], as_index=False)
    agg = grp.agg(
        n=("pitch_type", "size"),
        velo=("release_speed", "mean"),
        spin=("release_spin_rate", "mean"),
        ivb_in=("ivb_in", "mean"),
        hb_as_in=("hb_as_in", "mean"),
        rel_height=("release_pos_z", "mean"),
        rel_side=("release_pos_x", "mean"),
        cs=("is_called_strike", "sum"),
        swings=("is_swing", "sum"),
        whiffs=("is_whiff", "sum"),
        inplay=("is_in_play", "sum"),
        gb=("is_gb", "sum"),
    )

    agg["csw"] = _safe_rate(agg["cs"] + agg["whiffs"], agg["n"])
    agg["whiff_rate"] = _safe_rate(agg["whiffs"], agg["swings"])
    agg["gb_rate"] = _safe_rate(agg["gb"], agg["inplay"])
    # NOTE(review): computed as (called strikes + balls in play) / pitches,
    # not from the "zone" column — confirm this proxy is intended.
    agg["zone_pct"] = _safe_rate(agg["cs"] + agg["inplay"], agg["n"])

    keep = [
        "player_name",
        "pitch_type",
        "p_throws",
        "n",
        "velo",
        "spin",
        "ivb_in",
        "hb_as_in",
        "rel_height",
        "rel_side",
        "csw",
        "whiff_rate",
        "gb_rate",
        "zone_pct",
    ]
    return agg[keep].dropna(subset=["velo", "ivb_in", "hb_as_in"])