Spaces:

roguchi
/

pitch_dash

Sleeping

App Files Files Community

rsm-roguchi committed on Oct 30, 2025

Commit

cbe015e

1 Parent(s): 752a595

boah

Browse files

Files changed (2) hide show

app.py +50 -1
src/matchups.py +130 -0

app.py CHANGED Viewed

@@ -15,6 +15,7 @@ from featurize import infer_ivb_sign, engineer_pitch_features
 from model import fit_kmeans, nearest_comps
 from tags import xy_cluster_tags
 from plots import movement_scatter_xy, radar_quality
 try:
     from huggingface_hub import hf_hub_download
@@ -86,6 +87,7 @@ with st.sidebar:
 with st.spinner("Loading data…"):
     df_raw = safe_load_data(start, end, force)
 if df_raw.empty:
     st.warning(
@@ -123,7 +125,7 @@ with st.spinner("Clustering & tagging…"):
 pitcher = st.selectbox("Pitcher", sorted(df_fit["player_name"].dropna().unique()))
 df_p = df_fit[df_fit["player_name"] == pitcher].sort_values("pitch_type")
-tab1, tab2, tab3 = st.tabs(["Movement", "Scouting Card", "Comps"])
 with tab1:
     view = st.radio("View", ["Selected pitcher", "All pitchers"], horizontal=True)
@@ -168,3 +170,50 @@ with tab3:
         # ⬇️ Old signature again
         comps = nearest_comps(row, df_fit, scaler, nn, within_pitch_type=True, k=6)
         st.dataframe(comps, use_container_width=True)

 from model import fit_kmeans, nearest_comps
 from tags import xy_cluster_tags
 from plots import movement_scatter_xy, radar_quality
+from matchups import best_matchups_for_pitcher, ensure_batter_names
 try:
     from huggingface_hub import hf_hub_download
 with st.spinner("Loading data…"):
     df_raw = safe_load_data(start, end, force)
+    df_raw = ensure_batter_names(df_raw)
 if df_raw.empty:
     st.warning(
 pitcher = st.selectbox("Pitcher", sorted(df_fit["player_name"].dropna().unique()))
 df_p = df_fit[df_fit["player_name"] == pitcher].sort_values("pitch_type")
+tab1, tab2, tab3, tab4 = st.tabs(["Movement", "Scouting Card", "Comps", 'Best Matchups'])
 with tab1:
     view = st.radio("View", ["Selected pitcher", "All pitchers"], horizontal=True)
         # ⬇️ Old signature again
         comps = nearest_comps(row, df_fit, scaler, nn, within_pitch_type=True, k=6)
         st.dataframe(comps, use_container_width=True)
with tab4:
    st.subheader(f"Best Matchups — {pitcher}")

    # One layout column per control, passed to st.columns as relative widths.
    min_col, top_col, whiff_col, gb_col = st.columns([1, 1, 1, 2])
    with min_col:
        min_pitches = st.number_input("Min pitches vs batter", 5, 200, 10, step=5)
    with top_col:
        top_n = st.number_input("Top N", 5, 50, 10, step=5)
    with whiff_col:
        w_whiff = st.slider("Weight: Whiff", 0.0, 1.0, 0.6, 0.05)
    with gb_col:
        w_gb = st.slider("Weight: GB on contact", 0.0, 1.0, 0.4, 0.05)

    # Rescale the two weights so they sum to 1; the tiny floor keeps the
    # division defined when both sliders sit at 0.
    total_w = max(w_whiff + w_gb, 1e-6)
    w_whiff, w_gb = w_whiff / total_w, w_gb / total_w

    # Rank this pitcher's batter matchups with the chosen filters and weights.
    best, worst = best_matchups_for_pitcher(
        df_raw,
        pitcher,
        min_pitches=min_pitches,
        top_n=int(top_n),
        w_whiff=float(w_whiff),
        w_gb=float(w_gb),
    )

    if not (best.empty and worst.empty):
        # Side-by-side tables: most pitcher-friendly on the left, least on the right.
        best_col, worst_col = st.columns(2)
        with best_col:
            st.markdown("### ✅ Best (Pitcher-Friendly)")
            st.dataframe(best, use_container_width=True)
        with worst_col:
            st.markdown("### ⚠️ Tough (Least Pitcher-Friendly)")
            st.dataframe(worst, use_container_width=True)
    else:
        st.info(
            "No batter matchups for this pitcher within the current window / filters."
        )

    st.caption(
        "Score = w_whiff × whiff_rate + w_gb × ground-ball-rate-on-contact. "
        "Adjust weights to emphasize strikeouts vs. weak contact."
    )

src/matchups.py ADDED Viewed

	@@ -0,0 +1,130 @@

+# src/matchups.py
+from __future__ import annotations
+import numpy as np
+import pandas as pd
# Pitch `description` values tallied as a whiff (swing and miss).
OUTCOME_DESCS_WHIFF = {"swinging_strike", "swinging_strike_blocked"}
# Pitch `description` values tallied as a swing: every whiff plus contact.
OUTCOME_DESCS_SWING = OUTCOME_DESCS_WHIFF | {"foul", "hit_into_play"}
# `events` values tallied as ground balls -- a crude GB proxy on balls in play.
EVENTS_GB = {"groundout", "field_error", "single", "double", "triple"}
def ensure_batter_names(df_raw: pd.DataFrame) -> pd.DataFrame:
    """Return a frame that has a ``batter_name`` column.

    Name resolution maps MLBAM batter IDs in the ``batter`` column to
    "First Last" via ``pybaseball.playerid_reverse_lookup``.

    * If ``batter_name`` already exists, ``df_raw`` itself is returned untouched.
    * If there is no ``batter`` column, or it holds no non-null IDs, a copy is
      returned with ``batter_name`` set to ``None`` for every row.
    * Otherwise a copy is returned with resolved names. Any ID the lookup does
      not cover -- or every ID, when the lookup itself fails (package missing,
      no internet, ...) -- gets the readable placeholder ``"ID <id>"``. Rows
      with a null ``batter`` keep a null name.

    The input frame is never modified.
    """
    if "batter_name" in df_raw.columns:
        return df_raw

    df = df_raw.copy()
    if "batter" not in df.columns or df["batter"].dropna().empty:
        df["batter_name"] = None
        return df

    # Readable stand-in used wherever a real name is unavailable.
    placeholders = df["batter"].apply(
        lambda x: f"ID {int(x)}" if pd.notna(x) else None
    )

    try:
        # Imported lazily so the module still loads without pybaseball.
        from pybaseball import playerid_reverse_lookup

        ids = df["batter"].dropna().astype(int).unique().tolist()
        lut = playerid_reverse_lookup(ids, key_type="mlbam")[
            ["key_mlbam", "name_first", "name_last"]
        ]
        # Build the display names as a separate Series rather than assigning a
        # new column onto the sliced lookup frame.
        full_names = (
            lut["name_first"].str.title() + " " + lut["name_last"].str.title()
        )
        name_map = dict(zip(lut["key_mlbam"].astype(int), full_names))
        names = df["batter"].map(name_map).astype(object)
    except Exception:
        # Deliberate best-effort boundary: any lookup failure degrades to
        # placeholders instead of breaking the caller.
        names = placeholders
    else:
        # A successful lookup can still miss individual IDs; give those rows
        # the same placeholder a total failure would have produced.
        names = names.where(names.notna(), placeholders)

    df["batter_name"] = names
    return df
+def _safe_rate(num, den):
+    num = num.astype(float)
+    den = den.astype(float)
+    with np.errstate(divide="ignore", invalid="ignore"):
+        r = np.where(den > 0, num / den, np.nan)
+    return r
def best_matchups_for_pitcher(
    df_raw: pd.DataFrame,
    pitcher_name: str,
    min_pitches: int = 10,
    top_n: int = 10,
    w_whiff: float = 0.6,
    w_gb: float = 0.4,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Summarize batter-vs-selected-pitcher outcomes and rank by a 'pitcher-friendly' score.

      score = w_whiff * whiff_rate + w_gb * gb_rate_on_contact

    Rows of ``df_raw`` whose ``player_name`` equals ``pitcher_name`` are grouped
    by batter (``batter_name`` when present, plus ``batter`` and ``stand``).
    ``whiff_rate`` is whiffs / swings and ``gb_rate_on_contact`` is ground-ball
    events / balls in play, using the module's description/event sets.

    Only batters with at least ``min_pitches`` pitches are kept. Batters whose
    score is undefined (no swings or no balls in play, so a rate is NaN) are
    left out of the ranking; otherwise they would sort last and crowd the
    "worst" list despite having no comparable score.

    Returns (best_df, worst_df): the ``top_n`` highest-scoring batters in
    descending order and the ``top_n`` lowest-scoring in ascending order. Two
    empty frames are returned when ``df_raw`` has no ``player_name`` column or
    no rows for the pitcher.
    """
    # Without a pitcher-name column there is nothing to filter on. Comparing a
    # missing column to a string would yield a bare False and break indexing.
    if "player_name" not in df_raw.columns:
        return pd.DataFrame(), pd.DataFrame()

    # Filter to the one pitcher
    dfp = df_raw[df_raw["player_name"] == pitcher_name].copy()
    if dfp.empty:
        return pd.DataFrame(), pd.DataFrame()

    # Derive per-pitch 0/1 outcome flags
    dfp["is_swing"] = dfp["description"].isin(OUTCOME_DESCS_SWING).astype(int)
    dfp["is_whiff"] = dfp["description"].isin(OUTCOME_DESCS_WHIFF).astype(int)
    dfp["is_in_play"] = (dfp["description"] == "hit_into_play").astype(int)
    dfp["is_gb_event"] = dfp["events"].isin(EVENTS_GB).astype(int)

    # Prefer 'batter_name' for display when the frame carries it
    name_col = "batter_name" if "batter_name" in dfp.columns else None
    group_cols = ["batter", "stand"]
    if name_col:
        group_cols = [name_col, "batter", "stand"]

    # dropna=False keeps batters whose name or handedness is missing
    g = dfp.groupby(group_cols, dropna=False)
    agg = g.agg(
        pitches=("pitch_type", "size"),
        swings=("is_swing", "sum"),
        whiffs=("is_whiff", "sum"),
        inplay=("is_in_play", "sum"),
        gb_events=("is_gb_event", "sum"),
    ).reset_index()

    # Rates (NaN where the denominator is zero)
    agg["whiff_rate"] = _safe_rate(agg["whiffs"], agg["swings"])
    agg["gb_rate_on_contact"] = _safe_rate(agg["gb_events"], agg["inplay"])

    # Pitcher-friendly score; NaN if either rate is undefined
    agg["pm_score"] = w_whiff * agg["whiff_rate"] + w_gb * agg["gb_rate_on_contact"]

    # Sample-size filter, then rank only batters with a defined score
    agg = agg[agg["pitches"] >= min_pitches]
    ranked = agg.dropna(subset=["pm_score"]).sort_values("pm_score", ascending=False)

    # Nicely ordered columns
    display_cols = []
    if name_col:
        display_cols.append(name_col)
    display_cols += [
        "batter",
        "stand",
        "pitches",
        "whiff_rate",
        "gb_rate_on_contact",
        "pm_score",
    ]

    best = ranked.head(top_n)[display_cols].copy()
    worst = (
        ranked.tail(top_n).sort_values("pm_score", ascending=True)[display_cols].copy()
    )
    return best, worst