In [1]:
from __future__ import annotations
import os
from datetime import date
from pathlib import Path

import numpy as np
import pandas as pd

from pybaseball import statcast
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors

import plotly.express as px
import plotly.graph_objects as go

In [2]:
pd.set_option("display.max_columns", 200)
CACHE_DIR = Path("data/cache")
CACHE_DIR.mkdir(parents=True, exist_ok=True)
ARTIFACTS_DIR = Path("artifacts")
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)

In [3]:
def default_window():
    """Return a recent season window (approx opening day to today)."""
    today = date.today()
    start = date(today.year if today.month >= 3 else today.year - 1, 3, 1)
    return start.isoformat(), today.isoformat()


def cache_path(start: str, end: str) -> Path:
    return CACHE_DIR / f"statcast_{start}_{end}.parquet"

In [4]:
def load_statcast(start_date: str, end_date: str, force: bool = False) -> pd.DataFrame:
    """Download Statcast data with simple caching to Parquet."""
    cp = cache_path(start_date, end_date)
    if cp.exists() and not force:
        print(f"Loading cached: {cp}")
        return pd.read_parquet(cp)
    print("Downloading from Statcast (pybaseball)... window size affects duration.")
    df = statcast(start_dt=start_date, end_dt=end_date)
    if "pitch_type" in df.columns:
        df = df[df["pitch_type"].notna()]
    df.to_parquet(cp, index=False)
    print(f"Cached to: {cp}")
    return df


start_date, end_date = default_window()
start_date, end_date

('2025-03-01', '2025-10-30')

In [5]:
df_raw = load_statcast(start_date, end_date, force=False)
df_raw.shape, df_raw.head(3)

Loading cached: data/cache/statcast_2025-03-01_2025-10-30.parquet


((752024, 118),
   pitch_type  game_date  release_speed  release_pos_x  release_pos_z  \
 0         SL 2025-10-29           86.7          -2.63           5.58   
 1         SL 2025-10-29           87.0          -2.55            5.6   
 2         SL 2025-10-29           87.2          -2.58           5.59   
 
      player_name  batter  pitcher     events      description  spin_dir  \
 0  Hoffman, Jeff  606192   656546  strikeout  swinging_strike      <NA>   
 1  Hoffman, Jeff  606192   656546       None     blocked_ball      <NA>   
 2  Hoffman, Jeff  606192   656546       None             ball      <NA>   
 
    spin_rate_deprecated  break_angle_deprecated  break_length_deprecated  \
 0                  <NA>                    <NA>                     <NA>   
 1                  <NA>                    <NA>                     <NA>   
 2                  <NA>                    <NA>                     <NA>   
 
    zone                                      des game_type stand p_throws

In [6]:
INCHES_PER_FOOT = 12.0

def _safe_rate(num, den):
    return np.divide(
        num, den, out=np.full_like(num, np.nan, dtype=float), where=den > 0
    )


def signed_arm_side(hb_in_raw: pd.Series, p_throws: pd.Series) -> pd.Series:
    """
    Convert Statcast pfx_x (catcher-right positive) into 'arm-side positive' for both RHP and LHP.
    For RHP: arm-side = +pfx_x.
    For LHP: arm-side = -pfx_x (flip sign).
    """
    handed = p_throws.fillna("R").str.upper().str[0]
    sign = np.where(handed == "R", 1.0, -1.0)
    return -hb_in_raw * sign


def engineer_pitch_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Aggregates to (player_name, pitch_type, p_throws) with handedness-aware XY:
      X = hb_as_in (Arm-Side + / Glove-Side -), Y = ivb_in (Ride + / Drop -)
    Also computes CSW, Whiff, GB, Zone%.
    """
    cols = [
        "pitch_type",
        "player_name",
        "game_date",
        "events",
        "description",
        "p_throws",
        "stand",
        "release_pos_x",
        "release_pos_z",
        "pfx_x",
        "pfx_z",
        "release_speed",
        "release_spin_rate",
        "plate_x",
        "plate_z",
        "zone",
    ]
    have = [c for c in cols if c in df.columns]
    df = df[have].copy()

    # outcomes
    df["is_called_strike"] = (df["description"] == "called_strike").astype(int)
    df["is_swing"] = (
        df["description"]
        .isin(["swinging_strike", "swinging_strike_blocked", "foul", "hit_into_play"])
        .astype(int)
    )
    df["is_whiff"] = (
        df["description"]
        .isin(["swinging_strike", "swinging_strike_blocked"])
        .astype(int)
    )
    df["is_ball"] = (df["description"] == "ball").astype(int)
    df["is_in_play"] = (df["description"] == "hit_into_play").astype(int)
    # quick GB proxy (refine later with launch angle)
    df["is_gb"] = (
        df["events"]
        .isin(["groundout", "field_error", "single", "double", "triple"])
        .astype(int)
    )

    # movement proxies in inches
    df["hb_in_raw"] = df["pfx_x"] * INCHES_PER_FOOT  # + = break to catcher's right
    df["ivb_in"] = df["pfx_z"] * INCHES_PER_FOOT  # + = ride, − = drop
    df["hb_as_in"] = signed_arm_side(
        df["hb_in_raw"], df.get("p_throws")
    )  # + = arm-side, − = glove-side

    grp = df.groupby(["player_name", "pitch_type", "p_throws"], as_index=False)
    agg = grp.agg(
        n=("pitch_type", "size"),
        velo=("release_speed", "mean"),
        spin=("release_spin_rate", "mean"),
        ivb_in=("ivb_in", "mean"),
        hb_as_in=("hb_as_in", "mean"),
        rel_height=("release_pos_z", "mean"),
        rel_side=("release_pos_x", "mean"),
        cs=("is_called_strike", "sum"),
        swings=("is_swing", "sum"),
        whiffs=("is_whiff", "sum"),
        inplay=("is_in_play", "sum"),
        gb=("is_gb", "sum"),
    )

    agg["csw"] = _safe_rate(agg["cs"] + agg["whiffs"], agg["n"])
    agg["whiff_rate"] = _safe_rate(agg["whiffs"], agg["swings"])
    agg["gb_rate"] = _safe_rate(agg["gb"], agg["inplay"])
    # rough in-zone proxy
    agg["zone_pct"] = _safe_rate(agg["cs"] + agg["inplay"], agg["n"])

    keep = [
        "player_name",
        "pitch_type",
        "p_throws",
        "n",
        "velo",
        "spin",
        "ivb_in",
        "hb_as_in",
        "rel_height",
        "rel_side",
        "csw",
        "whiff_rate",
        "gb_rate",
        "zone_pct",
    ]
    out = agg[keep].dropna(subset=["velo", "ivb_in", "hb_as_in"])
    return out

In [7]:
ARCH_FEATURES = [
    "velo",
    "ivb_in",
    "hb_as_in",
    "rel_height",
    "rel_side",
    "spin",
    "csw",
    "whiff_rate",
    "gb_rate",
    "zone_pct",
]


def fit_kmeans(df_feat: pd.DataFrame, k: int = 8, random_state: int = 42):
    df = df_feat.dropna(subset=ARCH_FEATURES).copy()
    X = df[ARCH_FEATURES].values
    scaler = StandardScaler()
    Xs = scaler.fit_transform(X)
    km = KMeans(n_clusters=k, n_init=20, random_state=random_state)
    labels = km.fit_predict(Xs)
    df["cluster"] = labels

    nn = NearestNeighbors(n_neighbors=6, metric="euclidean")
    nn.fit(Xs)
    return df, scaler, km, nn

In [8]:
# --- Replacement: cluster tags using XY + functional flavor ---
def _mag_label(v, q25, q75, small="Subtle", mid="Moderate", big="Heavy"):
    if v >= q75:
        return big
    if v <= q25:
        return small
    return mid


def _side_label(hb_as):
    return "Arm-Side" if hb_as >= 0 else "Glove-Side"


def _vert_label(ivb):
    return "Ride" if ivb >= 0 else "Drop"


def xy_cluster_tags(df_with_clusters: pd.DataFrame) -> dict[int, str]:
    """
    Returns {cluster_id: label} like:
    Slider: Glove-Side • Moderate Sweep, Subtle Ride • Strike-Throwing
    Changeup: Arm-Side • Subtle Ride, Heavy Run • Grounder-First
    """
    df = df_with_clusters.copy()

    q_abs_ivb25 = np.nanquantile(np.abs(df["ivb_in"]), 0.25)
    q_abs_ivb75 = np.nanquantile(np.abs(df["ivb_in"]), 0.75)
    q_abs_hb25 = np.nanquantile(np.abs(df["hb_as_in"]), 0.25)
    q_abs_hb75 = np.nanquantile(np.abs(df["hb_as_in"]), 0.75)

    q_wh75 = np.nanquantile(df["whiff_rate"], 0.75)
    q_gb75 = np.nanquantile(df["gb_rate"], 0.75)
    q_zn75 = np.nanquantile(df["zone_pct"], 0.75)

    q_wh50 = np.nanquantile(df["whiff_rate"], 0.50)
    q_gb50 = np.nanquantile(df["gb_rate"], 0.50)
    q_zn50 = np.nanquantile(df["zone_pct"], 0.50)

    tags = {}
    for c, sub in df.groupby("cluster"):
        row = sub.mean(numeric_only=True)
        dom_pt = (
            sub["pitch_type"].mode().iloc[0]
            if not sub["pitch_type"].mode().empty
            else "Pitch"
        )

        side = _side_label(row["hb_as_in"])
        vert = _vert_label(row["ivb_in"])

        mag_side = _mag_label(
            abs(row["hb_as_in"]), q_abs_hb25, q_abs_hb75
        )  # "Subtle/Moderate/Heavy"
        mag_vert = _mag_label(abs(row["ivb_in"]), q_abs_ivb25, q_abs_ivb75)

        # Functional flavor
        flavor = []
        if row["whiff_rate"] >= q_wh75:
            flavor.append("Whiff-First")
        if row["gb_rate"] >= q_gb75:
            flavor.append("Grounder-First")
        if row["zone_pct"] >= q_zn75:
            flavor.append("Strike-Throwing")
        if not flavor:
            diffs = {
                "Whiff-First": row["whiff_rate"] - q_wh50,
                "Grounder-First": row["gb_rate"] - q_gb50,
                "Strike-Throwing": row["zone_pct"] - q_zn50,
            }
            flavor.append(max(diffs, key=diffs.get))

        # Compose label: "<PitchType>: <Side> • <Magnitude Side>, <Magnitude Vert> <Vert> • <Flavor>"
        # Examples:
        #   "Slider: Glove-Side • Heavy Sweep, Subtle Drop • Strike-Throwing"
        #   "Changeup: Arm-Side • Moderate Run, Subtle Drop • Grounder-First"
        side_noun = "Sweep" if side == "Glove-Side" else "Run"
        vert_noun = "Ride" if vert == "Ride" else "Drop"
        shape = f"{side} • {mag_side} {side_noun}, {mag_vert} {vert_noun}"
        tags[c] = f"{dom_pt}: {shape} • " + " / ".join(flavor)

    return tags

In [9]:
def nearest_comps(
    row: pd.Series, df_fit: pd.DataFrame, scaler, nn, within_pitch_type=True, k=6
):
    X_all = df_fit[ARCH_FEATURES].values
    xq = scaler.transform(row[ARCH_FEATURES].values.reshape(1, -1))
    dists, idxs = nn.kneighbors(xq, n_neighbors=k)
    comps = df_fit.iloc[idxs[0]].copy()
    if within_pitch_type:
        comps = comps[comps["pitch_type"] == row["pitch_type"]]
    return comps[
        [
            "player_name",
            "pitch_type",
            "p_throws",
            "velo",
            "ivb_in",
            "hb_as_in",
            "whiff_rate",
            "gb_rate",
            "cluster_name",
        ]
    ].head(k - 1)

In [10]:
def movement_scatter_xy(
    df: pd.DataFrame, color="pitch_type", facet_by_handedness=False
):
    df_plot = df.copy()
    if facet_by_handedness:
        fig = px.scatter(
            df_plot,
            x="hb_as_in",
            y="ivb_in",
            color=color,
            facet_col="p_throws",
            hover_data=[
                "player_name",
                "pitch_type",
                "p_throws",
                "velo",
                "whiff_rate",
                "gb_rate",
                "csw",
            ],
        )
    else:
        fig = px.scatter(
            df_plot,
            x="hb_as_in",
            y="ivb_in",
            color=color,
            hover_data=[
                "player_name",
                "pitch_type",
                "p_throws",
                "velo",
                "whiff_rate",
                "gb_rate",
                "csw",
            ],
        )

    fig.update_layout(
        xaxis_title="Horizontal: Arm-Side (+)  |  Glove-Side (−)",
        yaxis_title="Vertical: Ride (+)  |  Drop (−)",
        legend_title_text=color,
    )
    # quadrant guides (for every subplot if faceted)
    for ax in fig.select_yaxes():
        fig.add_hline(
            y=0,
            line_dash="dot",
            row=ax.anchor.split("y")[-1] if facet_by_handedness else None,
            col=None,
        )
    for ax in fig.select_xaxes():
        fig.add_vline(
            x=0,
            line_dash="dot",
            row=ax.anchor.split("x")[-1] if facet_by_handedness else None,
            col=None,
        )

    # helpful annotations (single-plot case)
    if not facet_by_handedness:
        fig.add_annotation(
            x=1,
            y=0,
            xref="paper",
            yref="paper",
            xanchor="left",
            yanchor="top",
            text="AS (+)",
            showarrow=False,
            font=dict(size=10),
        )
        fig.add_annotation(
            x=0,
            y=0,
            xref="paper",
            yref="paper",
            xanchor="right",
            yanchor="top",
            text="GS (−)",
            showarrow=False,
            font=dict(size=10),
        )
        fig.add_annotation(
            x=0.5,
            y=1,
            xref="paper",
            yref="paper",
            xanchor="center",
            yanchor="bottom",
            text="Ride (+)",
            showarrow=False,
            font=dict(size=10),
        )
        fig.add_annotation(
            x=0.5,
            y=0,
            xref="paper",
            yref="paper",
            xanchor="center",
            yanchor="top",
            text="Drop (−)",
            showarrow=False,
            font=dict(size=10),
        )
    return fig


def radar_quality(row: pd.Series):
    cats = ["csw", "whiff_rate", "gb_rate", "zone_pct"]
    vals = [row[c] for c in cats]
    fig = go.Figure(data=go.Scatterpolar(r=vals, theta=cats, fill="toself"))
    fig.update_layout(
        polar=dict(radialaxis=dict(visible=True, range=[0, 1])), showlegend=False
    )
    return fig


def xy_blurb(row: pd.Series) -> str:
    side = "Arm-Side" if row["hb_as_in"] >= 0 else "Glove-Side"
    vert = "Ride" if row["ivb_in"] >= 0 else "Drop"
    return (
        f"{row['pitch_type']} ({row['p_throws']}): {side}, {vert}. "
        f"Velo {row['velo']:.1f} mph | AS {row['hb_as_in']:.1f}\" | "
        f"Ride {row['ivb_in']:.1f}\" | CSW {row['csw']:.2f} | "
        f"Whiff {row['whiff_rate']:.2f} | GB {row['gb_rate']:.2f} | Zone {row['zone_pct']:.2f}"
    )

In [11]:
start_date, end_date = default_window()
# For a fast first run, you can narrow the window, e.g.:
# start_date, end_date = "2024-04-01", "2024-04-07"

df_raw = load_statcast(start_date, end_date, force=False)
df_feat = engineer_pitch_features(df_raw)

df_fit, scaler, km, nn = fit_kmeans(df_feat, k=8)
cluster_names = xy_cluster_tags(df_fit)
df_fit["cluster_name"] = df_fit["cluster"].map(cluster_names)

# Save artifacts
df_feat.to_parquet(ARTIFACTS_DIR / "pitch_features.parquet", index=False)
df_fit.to_parquet(ARTIFACTS_DIR / "pitch_features_clusters.parquet", index=False)

Loading cached: data/cache/statcast_2025-03-01_2025-10-30.parquet


In [48]:
pitchers = sorted(df_fit["player_name"].unique().tolist())
SELECTED_PITCHER = pitchers[1041] if pitchers else None
SELECTED_PITCHER

'Williams, Devin'

In [49]:
if SELECTED_PITCHER:
    df_p = df_fit[df_fit["player_name"] == SELECTED_PITCHER].sort_values("pitch_type")
    try:
        display(df_p)
    except NameError:
        print(df_p)

    try:
        display(movement_scatter_xy(df_p, color="pitch_type"))
    except NameError:
        movement_scatter_xy(df_p, color="pitch_type").show()

    for _, row in df_p.iterrows():
        try:
            display(radar_quality(row))
        except NameError:
            radar_quality(row).show()
        print(xy_blurb(row))
else:
    print("No pitchers found. Try a different date window.")

Unnamed: 0,player_name,pitch_type,p_throws,n,velo,spin,ivb_in,hb_as_in,rel_height,rel_side,csw,whiff_rate,gb_rate,zone_pct,cluster,cluster_name
4547,"Williams, Devin",CH,R,636,83.724057,2734.992126,-4.718491,19.226792,5.147233,-2.511509,0.320755,0.356725,0.25,0.279874,1,"SL: Glove-Side • Moderate Sweep, Subtle Drop •..."
4548,"Williams, Devin",FC,R,6,89.266667,2411.833333,13.26,-1.28,5.521667,-2.38,0.5,0.666667,0.0,0.333333,1,"SL: Glove-Side • Moderate Sweep, Subtle Drop •..."
4549,"Williams, Devin",FF,R,584,94.123973,2365.510274,15.680548,12.834452,5.391387,-2.346592,0.289384,0.333333,0.385965,0.248288,7,"FF: Arm-Side • Moderate Run, Moderate Ride • S..."


CH (R): Arm-Side, Drop. Velo 83.7 mph | AS 19.2" | Ride -4.7" | CSW 0.32 | Whiff 0.36 | GB 0.25 | Zone 0.28


FC (R): Glove-Side, Ride. Velo 89.3 mph | AS -1.3" | Ride 13.3" | CSW 0.50 | Whiff 0.67 | GB 0.00 | Zone 0.33


FF (R): Arm-Side, Ride. Velo 94.1 mph | AS 12.8" | Ride 15.7" | CSW 0.29 | Whiff 0.33 | GB 0.39 | Zone 0.25


In [50]:
if SELECTED_PITCHER:
    for _, row in df_p.iterrows():
        comps = nearest_comps(row, df_fit, scaler, nn, within_pitch_type=True, k=6)
        print(f"\nNearest comps — {row['pitch_type']} ({row['cluster_name']}):")
        try:
            display(comps)
        except NameError:
            print(comps.to_string(index=False))


Nearest comps — CH (SL: Glove-Side • Moderate Sweep, Subtle Drop • Whiff-First):


Unnamed: 0,player_name,pitch_type,p_throws,velo,ivb_in,hb_as_in,whiff_rate,gb_rate,cluster_name
4547,"Williams, Devin",CH,R,83.724057,-4.718491,19.226792,0.356725,0.25,"SL: Glove-Side • Moderate Sweep, Subtle Drop •..."
4685,"Yoho, Craig",CH,R,77.122892,-2.949398,17.998554,0.289474,0.181818,"CH: Arm-Side • Moderate Run, Moderate Ride • W..."
3773,"Seabold, Connor",CH,R,81.337931,-2.695862,16.121379,0.259259,0.285714,"CH: Arm-Side • Moderate Run, Moderate Ride • W..."
4530,"Whitlock, Garrett",CH,R,84.306513,0.177471,17.017011,0.309211,0.361702,"CH: Arm-Side • Moderate Run, Moderate Ride • W..."
1768,"Henderson, Logan",CH,R,81.867222,5.488667,18.700667,0.252874,0.25,"CH: Arm-Side • Moderate Run, Moderate Ride • W..."



Nearest comps — FC (SL: Glove-Side • Moderate Sweep, Subtle Drop • Whiff-First):


Unnamed: 0,player_name,pitch_type,p_throws,velo,ivb_in,hb_as_in,whiff_rate,gb_rate,cluster_name
4548,"Williams, Devin",FC,R,89.266667,13.26,-1.28,0.666667,0.0,"SL: Glove-Side • Moderate Sweep, Subtle Drop •..."
3116,"Overton, Connor",FC,R,88.685714,8.64,0.377143,0.5,0.0,"SL: Glove-Side • Moderate Sweep, Subtle Drop •..."
471,"Boyer, Logan",FC,R,90.741667,4.76,-7.16,0.8,0.0,"SL: Glove-Side • Moderate Sweep, Subtle Drop •..."
2360,"Liranzo, Jesús",FC,R,86.06,1.68,-5.184,0.5,0.0,"SL: Glove-Side • Moderate Sweep, Subtle Drop •..."



Nearest comps — FF (FF: Arm-Side • Moderate Run, Moderate Ride • Strike-Throwing):


Unnamed: 0,player_name,pitch_type,p_throws,velo,ivb_in,hb_as_in,whiff_rate,gb_rate,cluster_name
4549,"Williams, Devin",FF,R,94.123973,15.680548,12.834452,0.333333,0.385965,"FF: Arm-Side • Moderate Run, Moderate Ride • S..."
4090,"Strzelecki, Peter",FF,R,93.960606,15.116364,6.883636,0.315789,0.333333,"FF: Arm-Side • Moderate Run, Moderate Ride • S..."
14,"Abreu, Bryan",FF,R,97.282255,16.629453,6.517015,0.344569,0.304348,"FF: Arm-Side • Moderate Run, Moderate Ride • S..."
2620,"McCullers Jr., Lance",FF,R,91.788406,11.806957,7.490435,0.375,0.444444,"FF: Arm-Side • Moderate Run, Moderate Ride • S..."
4506,"Wheeler, Zack",FF,R,96.008189,14.729133,10.002312,0.270169,0.253731,"FF: Arm-Side • Moderate Run, Moderate Ride • S..."
