Spaces:
Sleeping
Sleeping
| # src/matchups.py | |
| from __future__ import annotations | |
| import numpy as np | |
| import pandas as pd | |
# Pitch descriptions counted as a swing-and-miss.
OUTCOME_DESCS_WHIFF = {"swinging_strike", "swinging_strike_blocked"}

# Pitch descriptions counted as a swing: every whiff description plus
# "foul" and "hit_into_play".
OUTCOME_DESCS_SWING = OUTCOME_DESCS_WHIFF | {"foul", "hit_into_play"}

# Event labels counted as ground balls -- a crude GB proxy on balls in play.
EVENTS_GB = {"groundout", "field_error", "single", "double", "triple"}
| # Name resolution for MLBAM batter IDs → "First Last" | |
def ensure_batter_names(df_raw: pd.DataFrame) -> pd.DataFrame:
    """Return a frame that has a ``batter_name`` column.

    If ``df_raw`` already has ``batter_name`` it is returned unchanged (the
    same object, not a copy). Otherwise a copy is made and the column is
    filled as follows:

    * no ``batter`` column, or no non-null batter IDs -> ``None``;
    * otherwise the IDs are resolved through
      ``pybaseball.playerid_reverse_lookup`` (``key_type="mlbam"``) and
      rendered as title-cased ``"First Last"``;
    * if anything in that lookup raises (for example the package or the
      network is unavailable), each ID becomes the placeholder ``"ID <n>"``
      and null IDs become ``None``.
    """
    if "batter_name" in df_raw.columns:
        return df_raw

    out = df_raw.copy()

    has_ids = "batter" in out.columns and not out["batter"].dropna().empty
    if not has_ids:
        out["batter_name"] = None
        return out

    def _placeholder(batter_id):
        # Readable stand-in used when the real name cannot be resolved.
        if pd.notna(batter_id):
            return f"ID {int(batter_id)}"
        return None

    try:
        # Imported lazily so a missing pybaseball lands in the fallback below.
        from pybaseball import playerid_reverse_lookup

        batter_ids = out["batter"].dropna().astype(int).unique().tolist()
        lookup = playerid_reverse_lookup(batter_ids, key_type="mlbam")
        lookup = lookup[["key_mlbam", "name_first", "name_last"]]

        first = lookup["name_first"].str.title()
        last = lookup["name_last"].str.title()
        lookup["batter_name"] = first + " " + last

        id_to_name = dict(
            zip(lookup["key_mlbam"].astype(int), lookup["batter_name"])
        )
        out["batter_name"] = out["batter"].map(id_to_name)
    except Exception:
        # Fallback: readable placeholder if lookup fails / no internet
        out["batter_name"] = out["batter"].apply(_placeholder)

    return out
| def _safe_rate(num, den): | |
| num = num.astype(float) | |
| den = den.astype(float) | |
| with np.errstate(divide="ignore", invalid="ignore"): | |
| r = np.where(den > 0, num / den, np.nan) | |
| return r | |
def best_matchups_for_pitcher(
    df_raw: pd.DataFrame,
    pitcher_name: str,
    min_pitches: int = 10,
    top_n: int = 10,
    w_whiff: float = 0.6,
    w_gb: float = 0.4,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Rank batters faced by one pitcher with a 'pitcher-friendly' score.

    score = w_whiff * whiff_rate + w_gb * gb_rate_on_contact

    where ``whiff_rate`` is whiffs / swings and ``gb_rate_on_contact`` is
    ground-ball events / balls in play, both per (batter, stand) group.

    Args:
        df_raw: Pitch-level data. Rows are selected via the ``player_name``
            column; the columns ``description``, ``events``, ``pitch_type``,
            ``batter`` and ``stand`` are read for the selected rows. A
            ``batter_name`` column is used for grouping/display if present.
        pitcher_name: Value of ``player_name`` to keep.
        min_pitches: Minimum pitches seen for a group to be ranked.
        top_n: Number of rows in each returned frame.
        w_whiff: Weight applied to ``whiff_rate``.
        w_gb: Weight applied to ``gb_rate_on_contact``.

    Returns:
        ``(best_df, worst_df)``: the ``top_n`` highest-scoring groups in
        descending order and the ``top_n`` lowest-ranked groups in ascending
        order. Two empty frames are returned when ``df_raw`` has no
        ``player_name`` column or no row matches ``pitcher_name``.
    """
    # A missing column used to surface as ``KeyError: False`` because
    # ``df_raw.get(...)`` returned None and ``None == name`` is a plain bool.
    if "player_name" not in df_raw.columns:
        return pd.DataFrame(), pd.DataFrame()

    # Filter to the one pitcher
    dfp = df_raw[df_raw["player_name"] == pitcher_name].copy()
    if dfp.empty:
        return pd.DataFrame(), pd.DataFrame()

    # Derive per-pitch outcomes as 0/1 flags so they can simply be summed
    dfp["is_swing"] = dfp["description"].isin(OUTCOME_DESCS_SWING).astype(int)
    dfp["is_whiff"] = dfp["description"].isin(OUTCOME_DESCS_WHIFF).astype(int)
    dfp["is_in_play"] = (dfp["description"] == "hit_into_play").astype(int)
    dfp["is_gb_event"] = dfp["events"].isin(EVENTS_GB).astype(int)

    # Name columns vary across pybaseball versions; prefer 'batter_name' if present
    name_col = "batter_name" if "batter_name" in dfp.columns else None
    group_cols = ["batter", "stand"]
    if name_col:
        group_cols = [name_col, "batter", "stand"]

    # dropna=False keeps groups whose key (e.g. an unresolved name) is null
    g = dfp.groupby(group_cols, dropna=False)
    agg = g.agg(
        pitches=("pitch_type", "size"),
        swings=("is_swing", "sum"),
        whiffs=("is_whiff", "sum"),
        inplay=("is_in_play", "sum"),
        gb_events=("is_gb_event", "sum"),
    ).reset_index()

    # Rates (NaN when the denominator is zero)
    agg["whiff_rate"] = _safe_rate(agg["whiffs"], agg["swings"])
    agg["gb_rate_on_contact"] = _safe_rate(agg["gb_events"], agg["inplay"])

    # Pitcher-friendly score
    agg["pm_score"] = w_whiff * agg["whiff_rate"] + w_gb * agg["gb_rate_on_contact"]

    # Filtering
    # NOTE: a NaN rate makes pm_score NaN; sort_values puts NaN last, so such
    # groups sit at the tail and can therefore show up in ``worst``.
    agg = agg[agg["pitches"] >= min_pitches].sort_values("pm_score", ascending=False)

    # Nicely ordered columns
    display_cols = []
    if name_col:
        display_cols.append(name_col)
    display_cols += [
        "batter",
        "stand",
        "pitches",
        "whiff_rate",
        "gb_rate_on_contact",
        "pm_score",
    ]

    best = agg.head(top_n)[display_cols].copy()
    worst = agg.tail(top_n).sort_values("pm_score", ascending=True)[display_cols].copy()
    return best, worst