pitch_dash / src /matchups.py
rsm-roguchi
boah
cbe015e
# src/matchups.py
from __future__ import annotations
import numpy as np
import pandas as pd
# `description` values counted as a swing-and-miss.
OUTCOME_DESCS_WHIFF = {"swinging_strike", "swinging_strike_blocked"}

# `description` values counted as a swing: every whiff plus fouls and
# balls put in play.
OUTCOME_DESCS_SWING = OUTCOME_DESCS_WHIFF | {"foul", "hit_into_play"}

# `events` values counted as ground balls. This is only a crude GB proxy on
# balls in play: hits are included regardless of how the ball was struck.
EVENTS_GB = set(
    (
        "groundout",
        "field_error",
        "single",
        "double",
        "triple",
    )
)
# Name resolution for MLBAM batter IDs → "First Last"
def ensure_batter_names(df_raw: pd.DataFrame) -> pd.DataFrame:
    """Return a frame that carries a ``batter_name`` column.

    If ``df_raw`` already has ``batter_name`` it is returned unchanged (the
    same object, not a copy). Otherwise a copy is returned with the column
    added:

    * no ``batter`` column, or no non-null batter IDs -> every name is None;
    * otherwise the IDs are resolved through pybaseball's
      ``playerid_reverse_lookup`` (``key_type="mlbam"``) into
      "First Last" strings;
    * if anything in that lookup raises (e.g. pybaseball missing or no
      network), names fall back to the placeholder ``"ID <batter>"``.
    """
    if "batter_name" in df_raw.columns:
        return df_raw

    named = df_raw.copy()

    has_batter_ids = (
        "batter" in named.columns and not named["batter"].dropna().empty
    )
    if not has_batter_ids:
        named["batter_name"] = None
        return named

    def _placeholder(batter_id):
        # Readable stand-in used when the real name cannot be resolved.
        if pd.isna(batter_id):
            return None
        return f"ID {int(batter_id)}"

    try:
        # Imported lazily so the module still works without pybaseball.
        from pybaseball import playerid_reverse_lookup

        mlbam_ids = named["batter"].dropna().astype(int).unique().tolist()
        lookup = playerid_reverse_lookup(mlbam_ids, key_type="mlbam")
        lookup = lookup[["key_mlbam", "name_first", "name_last"]]
        full_names = (
            lookup["name_first"].str.title()
            + " "
            + lookup["name_last"].str.title()
        )
        id_to_name = dict(zip(lookup["key_mlbam"].astype(int), full_names))
        named["batter_name"] = named["batter"].map(id_to_name)
    except Exception:
        # Deliberate best-effort: any lookup failure degrades to placeholders.
        named["batter_name"] = named["batter"].apply(_placeholder)

    return named
def _safe_rate(num, den):
    """Element-wise ``num / den`` that is NaN wherever ``den`` is not > 0.

    Both inputs are cast to float via ``.astype(float)``. The result is
    whatever ``np.where`` returns (a NumPy array), not a pandas Series.
    """
    numerator = num.astype(float)
    denominator = den.astype(float)
    # Silence divide-by-zero / 0-over-0 warnings; those slots become NaN anyway.
    with np.errstate(divide="ignore", invalid="ignore"):
        has_denominator = denominator > 0
        ratio = numerator / denominator
        return np.where(has_denominator, ratio, np.nan)
def best_matchups_for_pitcher(
    df_raw: pd.DataFrame,
    pitcher_name: str,
    min_pitches: int = 10,
    top_n: int = 10,
    w_whiff: float = 0.6,
    w_gb: float = 0.4,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Summarize batter-vs-selected-pitcher outcomes and rank by a 'pitcher-friendly' score.

    score = w_whiff * whiff_rate + w_gb * gb_rate_on_contact

    where ``whiff_rate`` is whiffs / swings and ``gb_rate_on_contact`` is
    ground-ball events / balls in play. A rate is NaN when its denominator is
    zero, and the score is NaN whenever either rate is NaN.

    Args:
        df_raw: Pitch-level rows. The matching path reads the columns
            ``player_name``, ``description``, ``events``, ``pitch_type``,
            ``batter`` and ``stand``; ``batter_name`` is used when present.
        pitcher_name: Value of ``player_name`` to keep.
        min_pitches: Minimum pitches seen for a batter/stand group to be ranked.
        top_n: Number of rows returned in each of the two tables.
        w_whiff: Weight on ``whiff_rate``.
        w_gb: Weight on ``gb_rate_on_contact``.

    Returns:
        ``(best_df, worst_df)``: the highest-scoring groups (descending) and the
        lowest-scoring groups (ascending). Groups with an undefined (NaN)
        score sort after every scored group in both tables, so they only
        appear when fewer than ``top_n`` scored groups exist. Two empty
        frames are returned when ``player_name`` is missing or no row
        matches ``pitcher_name``.
    """
    # Without the pitcher column nothing can match; treat that the same as
    # "no rows for this pitcher" instead of failing on the boolean mask.
    if "player_name" not in df_raw.columns:
        return pd.DataFrame(), pd.DataFrame()

    # Filter to the one pitcher
    dfp = df_raw[df_raw["player_name"] == pitcher_name].copy()
    if dfp.empty:
        return pd.DataFrame(), pd.DataFrame()

    # Derive per-pitch outcomes as 0/1 flags so they can simply be summed
    dfp["is_swing"] = dfp["description"].isin(OUTCOME_DESCS_SWING).astype(int)
    dfp["is_whiff"] = dfp["description"].isin(OUTCOME_DESCS_WHIFF).astype(int)
    dfp["is_in_play"] = (dfp["description"] == "hit_into_play").astype(int)
    dfp["is_gb_event"] = dfp["events"].isin(EVENTS_GB).astype(int)

    # Name columns vary across pybaseball versions; prefer 'batter_name' if present
    name_col = "batter_name" if "batter_name" in dfp.columns else None
    group_cols = ["batter", "stand"]
    if name_col:
        group_cols = [name_col, *group_cols]

    # dropna=False keeps groups whose name/ID/stand is missing
    agg = (
        dfp.groupby(group_cols, dropna=False)
        .agg(
            pitches=("pitch_type", "size"),
            swings=("is_swing", "sum"),
            whiffs=("is_whiff", "sum"),
            inplay=("is_in_play", "sum"),
            gb_events=("is_gb_event", "sum"),
        )
        .reset_index()
    )

    # Rates
    agg["whiff_rate"] = _safe_rate(agg["whiffs"], agg["swings"])
    agg["gb_rate_on_contact"] = _safe_rate(agg["gb_events"], agg["inplay"])

    # Pitcher-friendly score
    agg["pm_score"] = w_whiff * agg["whiff_rate"] + w_gb * agg["gb_rate_on_contact"]

    # Filtering
    agg = agg[agg["pitches"] >= min_pitches]

    # Nicely ordered columns
    display_cols = [name_col] if name_col else []
    display_cols += [
        "batter",
        "stand",
        "pitches",
        "whiff_rate",
        "gb_rate_on_contact",
        "pm_score",
    ]

    # Rank each direction separately with NaN scores last. Taking the tail of
    # the descending sort would hand the "worst" table to the NaN-scored
    # groups instead of the genuinely lowest scores.
    best = agg.sort_values("pm_score", ascending=False, na_position="last")
    worst = agg.sort_values("pm_score", ascending=True, na_position="last")
    return (
        best.head(top_n)[display_cols].copy(),
        worst.head(top_n)[display_cols].copy(),
    )