File size: 3,515 Bytes
c75151e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
from __future__ import annotations
import numpy as np
import pandas as pd

# Feet-to-inches factor: engineer_pitch_features multiplies the pfx_x / pfx_z
# movement columns by this to build the inch-valued movement features.
INCHES_PER_FOOT = 12.0


def infer_ivb_sign(df_raw: pd.DataFrame) -> int:
    """
    Pick the +1 / -1 multiplier for vertical break so that 'ride' reads positive.

    The choice is driven purely by the data: only the median of
    ``df_raw['pfx_z']`` is inspected, and no pitch types are hard-coded.

    Returns -1 when the column is absent, when it has no non-null values, or
    when its median is negative. Returns +1 otherwise (a median of exactly
    zero yields +1).
    """
    has_vertical_data = (
        "pfx_z" in df_raw.columns and not df_raw["pfx_z"].dropna().empty
    )
    if not has_vertical_data:
        # Nothing to infer from: fall back to the same default as a negative median.
        return -1

    if df_raw["pfx_z"].median() < 0:
        return -1
    return 1


def signed_arm_side(hb_in_raw: pd.Series, p_throws: pd.Series) -> pd.Series:
    """
    Re-sign raw horizontal break (Statcast pfx_x, catcher-right +) by pitcher
    handedness so the result is 'arm-side positive'.

    Handedness is the upper-cased first character of ``p_throws``; missing
    entries are treated as "R". Rows whose character is "R" get the raw value
    negated, every other row keeps the raw value as-is. In other words, the
    implementation treats RHP → -pfx_x as arm-side and LHP → +pfx_x as arm-side.

    The handedness factor is a plain NumPy array, so it is applied to
    ``hb_in_raw`` by position, not by index label.
    """
    throw_initial = p_throws.fillna("R").str.upper().str[0]
    # -1.0 for anything that is not "R" (including a blank entry), +1.0 for "R".
    hand_factor = np.where(throw_initial != "R", -1.0, 1.0)
    negated_break = -hb_in_raw
    return negated_break * hand_factor


def _safe_rate(num, den):
    """
    Element-wise ``num / den`` that yields NaN wherever ``den`` is not > 0.

    A float buffer shaped like ``num`` is pre-filled with NaN and handed to
    ``np.divide`` as ``out``; the division is only written where ``den > 0``,
    so the remaining slots keep their NaN. The value returned is whatever
    ``np.divide`` returns for that call.
    """
    nan_filled = np.full_like(num, np.nan, dtype=float)
    positive_den = den > 0
    return np.divide(num, den, out=nan_filled, where=positive_den)


def engineer_pitch_features(df: pd.DataFrame, ivb_sign: int) -> pd.DataFrame:
    """
    Roll pitch-level rows up to one feature row per
    (player_name, pitch_type, p_throws).

    Parameters:
        df: pitch-level frame. It must contain every column named in
            ``required`` below; extra columns are ignored. The caller's frame
            is not modified (a copy of the needed columns is used).
        ivb_sign: multiplier applied to ``pfx_z`` (e.g. the value returned by
            ``infer_ivb_sign``) so that positive ``ivb_in`` means ride.

    Returns:
        A frame with columns player_name, pitch_type, p_throws, n, velo, spin,
        ivb_in, hb_as_in, rel_height, rel_side, csw, whiff_rate, gb_rate and
        zone_pct. Groups whose mean velo, ivb_in or hb_as_in is NaN are
        dropped. Rates come from ``_safe_rate`` and are NaN when their
        denominator is not positive.

    Raises:
        KeyError: if any required column is missing from ``df``. This is
            checked up front so the failure names the missing columns rather
            than surfacing later as an unrelated-looking error.
    """
    required = [
        "pitch_type",
        "player_name",
        "events",
        "description",
        "p_throws",
        "release_pos_x",
        "release_pos_z",
        "pfx_x",
        "pfx_z",
        "release_speed",
        "release_spin_rate",
    ]
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise KeyError(
            f"engineer_pitch_features: missing required column(s): {missing}"
        )
    # Private copy of just the columns that are read below, so the indicator
    # and movement columns added here never leak into the caller's frame.
    df = df[required].copy()

    # outcomes (0/1 indicators derived from the description / events text)
    df["is_called_strike"] = (df["description"] == "called_strike").astype(int)
    # NOTE(review): "foul_tip" and bunt attempts are not counted as swings
    # here — confirm that is intended.
    df["is_swing"] = (
        df["description"]
        .isin(["swinging_strike", "swinging_strike_blocked", "foul", "hit_into_play"])
        .astype(int)
    )
    df["is_whiff"] = (
        df["description"]
        .isin(["swinging_strike", "swinging_strike_blocked"])
        .astype(int)
    )
    df["is_in_play"] = (df["description"] == "hit_into_play").astype(int)
    # NOTE(review): ground balls are approximated from event names; this
    # counts every single/double/triple and no other out types. If a
    # batted-ball-type column is available it may be a better source — confirm.
    df["is_gb"] = (
        df["events"]
        .isin(["groundout", "field_error", "single", "double", "triple"])
        .astype(int)
    )

    # movement (handedness-aware XY)
    df["hb_in_raw"] = df["pfx_x"] * INCHES_PER_FOOT
    df["ivb_in"] = ivb_sign * df["pfx_z"] * INCHES_PER_FOOT  # + = ride, − = drop
    df["hb_as_in"] = signed_arm_side(df["hb_in_raw"], df["p_throws"])

    # One row per pitcher / pitch type / hand. With groupby's default
    # dropna=True, rows with a missing value in any key column are left out.
    grp = df.groupby(["player_name", "pitch_type", "p_throws"], as_index=False)
    agg = grp.agg(
        n=("pitch_type", "size"),
        velo=("release_speed", "mean"),
        spin=("release_spin_rate", "mean"),
        ivb_in=("ivb_in", "mean"),
        hb_as_in=("hb_as_in", "mean"),
        rel_height=("release_pos_z", "mean"),
        rel_side=("release_pos_x", "mean"),
        cs=("is_called_strike", "sum"),
        swings=("is_swing", "sum"),
        whiffs=("is_whiff", "sum"),
        inplay=("is_in_play", "sum"),
        gb=("is_gb", "sum"),
    )

    # Rates: (called strikes + whiffs) per pitch, whiffs per swing,
    # ground balls per ball in play.
    agg["csw"] = _safe_rate(agg["cs"] + agg["whiffs"], agg["n"])
    agg["whiff_rate"] = _safe_rate(agg["whiffs"], agg["swings"])
    agg["gb_rate"] = _safe_rate(agg["gb"], agg["inplay"])
    # NOTE(review): zone_pct is the share of pitches that were called strikes
    # or put in play; it is not derived from a pitch-location column — confirm
    # this proxy is what downstream consumers expect.
    agg["zone_pct"] = _safe_rate(agg["cs"] + agg["inplay"], agg["n"])

    keep = [
        "player_name",
        "pitch_type",
        "p_throws",
        "n",
        "velo",
        "spin",
        "ivb_in",
        "hb_as_in",
        "rel_height",
        "rel_side",
        "csw",
        "whiff_rate",
        "gb_rate",
        "zone_pct",
    ]
    # Drop groups that lack the core velocity / movement features.
    return agg[keep].dropna(subset=["velo", "ivb_in", "hb_as_in"])