Spaces:
Sleeping
Sleeping
rsm-roguchi
committed on
Commit
·
cbe015e
1
Parent(s):
752a595
boah
Browse files- app.py +50 -1
- src/matchups.py +130 -0
app.py
CHANGED
|
@@ -15,6 +15,7 @@ from featurize import infer_ivb_sign, engineer_pitch_features
|
|
| 15 |
from model import fit_kmeans, nearest_comps
|
| 16 |
from tags import xy_cluster_tags
|
| 17 |
from plots import movement_scatter_xy, radar_quality
|
|
|
|
| 18 |
|
| 19 |
try:
|
| 20 |
from huggingface_hub import hf_hub_download
|
|
@@ -86,6 +87,7 @@ with st.sidebar:
|
|
| 86 |
|
| 87 |
with st.spinner("Loading data…"):
|
| 88 |
df_raw = safe_load_data(start, end, force)
|
|
|
|
| 89 |
|
| 90 |
if df_raw.empty:
|
| 91 |
st.warning(
|
|
@@ -123,7 +125,7 @@ with st.spinner("Clustering & tagging…"):
|
|
| 123 |
pitcher = st.selectbox("Pitcher", sorted(df_fit["player_name"].dropna().unique()))
|
| 124 |
df_p = df_fit[df_fit["player_name"] == pitcher].sort_values("pitch_type")
|
| 125 |
|
| 126 |
-
tab1, tab2, tab3 = st.tabs(["Movement", "Scouting Card", "Comps"])
|
| 127 |
|
| 128 |
with tab1:
|
| 129 |
view = st.radio("View", ["Selected pitcher", "All pitchers"], horizontal=True)
|
|
@@ -168,3 +170,50 @@ with tab3:
|
|
| 168 |
# ⬇️ Old signature again
|
| 169 |
comps = nearest_comps(row, df_fit, scaler, nn, within_pitch_type=True, k=6)
|
| 170 |
st.dataframe(comps, use_container_width=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
from model import fit_kmeans, nearest_comps
|
| 16 |
from tags import xy_cluster_tags
|
| 17 |
from plots import movement_scatter_xy, radar_quality
|
| 18 |
+
from matchups import best_matchups_for_pitcher, ensure_batter_names
|
| 19 |
|
| 20 |
try:
|
| 21 |
from huggingface_hub import hf_hub_download
|
|
|
|
| 87 |
|
| 88 |
with st.spinner("Loading data…"):
|
| 89 |
df_raw = safe_load_data(start, end, force)
|
| 90 |
+
df_raw = ensure_batter_names(df_raw)
|
| 91 |
|
| 92 |
if df_raw.empty:
|
| 93 |
st.warning(
|
|
|
|
| 125 |
pitcher = st.selectbox("Pitcher", sorted(df_fit["player_name"].dropna().unique()))
|
| 126 |
df_p = df_fit[df_fit["player_name"] == pitcher].sort_values("pitch_type")
|
| 127 |
|
| 128 |
+
tab1, tab2, tab3, tab4 = st.tabs(["Movement", "Scouting Card", "Comps", 'Best Matchups'])
|
| 129 |
|
| 130 |
with tab1:
|
| 131 |
view = st.radio("View", ["Selected pitcher", "All pitchers"], horizontal=True)
|
|
|
|
| 170 |
# ⬇️ Old signature again
|
| 171 |
comps = nearest_comps(row, df_fit, scaler, nn, within_pitch_type=True, k=6)
|
| 172 |
st.dataframe(comps, use_container_width=True)
|
| 173 |
+
|
| 174 |
+
with tab4:
|
| 175 |
+
st.subheader(f"Best Matchups — {pitcher}")
|
| 176 |
+
|
| 177 |
+
# Controls
|
| 178 |
+
colA, colB, colC, colD = st.columns([1, 1, 1, 2])
|
| 179 |
+
with colA:
|
| 180 |
+
min_pitches = st.number_input("Min pitches vs batter", 5, 200, 10, step=5)
|
| 181 |
+
with colB:
|
| 182 |
+
top_n = st.number_input("Top N", 5, 50, 10, step=5)
|
| 183 |
+
with colC:
|
| 184 |
+
w_whiff = st.slider("Weight: Whiff", 0.0, 1.0, 0.6, 0.05)
|
| 185 |
+
with colD:
|
| 186 |
+
w_gb = st.slider("Weight: GB on contact", 0.0, 1.0, 0.4, 0.05)
|
| 187 |
+
|
| 188 |
+
# Normalize weights (optional)
|
| 189 |
+
total_w = max(w_whiff + w_gb, 1e-6)
|
| 190 |
+
w_whiff /= total_w
|
| 191 |
+
w_gb /= total_w
|
| 192 |
+
|
| 193 |
+
# Compute
|
| 194 |
+
best, worst = best_matchups_for_pitcher(
|
| 195 |
+
df_raw,
|
| 196 |
+
pitcher,
|
| 197 |
+
min_pitches=min_pitches,
|
| 198 |
+
top_n=int(top_n),
|
| 199 |
+
w_whiff=float(w_whiff),
|
| 200 |
+
w_gb=float(w_gb),
|
| 201 |
+
)
|
| 202 |
+
|
| 203 |
+
if best.empty and worst.empty:
|
| 204 |
+
st.info(
|
| 205 |
+
"No batter matchups for this pitcher within the current window / filters."
|
| 206 |
+
)
|
| 207 |
+
else:
|
| 208 |
+
c1, c2 = st.columns(2)
|
| 209 |
+
with c1:
|
| 210 |
+
st.markdown("### ✅ Best (Pitcher-Friendly)")
|
| 211 |
+
st.dataframe(best, use_container_width=True)
|
| 212 |
+
with c2:
|
| 213 |
+
st.markdown("### ⚠️ Tough (Least Pitcher-Friendly)")
|
| 214 |
+
st.dataframe(worst, use_container_width=True)
|
| 215 |
+
|
| 216 |
+
st.caption(
|
| 217 |
+
"Score = w_whiff × whiff_rate + w_gb × ground-ball-rate-on-contact. "
|
| 218 |
+
"Adjust weights to emphasize strikeouts vs. weak contact."
|
| 219 |
+
)
|
src/matchups.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# src/matchups.py
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
import numpy as np
|
| 4 |
+
import pandas as pd
|
| 5 |
+
|
| 6 |
+
# Values of the per-pitch `description` column that count as a swing-and-miss.
OUTCOME_DESCS_WHIFF = {"swinging_strike", "swinging_strike_blocked"}

# Values of `description` that count as a swing: every whiff, plus fouls and
# balls put in play.
OUTCOME_DESCS_SWING = OUTCOME_DESCS_WHIFF | {"foul", "hit_into_play"}

# Values of the `events` column counted as a ground ball.
# crude GB proxy on balls in play
EVENTS_GB = set(("groundout", "field_error", "single", "double", "triple"))
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# Name resolution for MLBAM batter IDs → "First Last"
|
| 24 |
+
def ensure_batter_names(df_raw: pd.DataFrame) -> pd.DataFrame:
    """Return *df_raw* with a ``batter_name`` column.

    Behavior:
      * If ``batter_name`` already exists, the input frame is returned as-is
        (same object, no copy).
      * If there is no ``batter`` column, or it holds no non-null values, a
        copy is returned with ``batter_name`` set to ``None`` for every row.
      * Otherwise the IDs in ``batter`` are resolved with
        ``pybaseball.playerid_reverse_lookup`` and formatted as
        "First Last". Any ID that cannot be resolved — including every ID
        when the lookup itself fails — gets the placeholder ``"ID <n>"``.
        Rows with a null ``batter`` keep a null name.

    The input frame is never mutated.
    """
    if "batter_name" in df_raw.columns:
        return df_raw

    df = df_raw.copy()
    if "batter" not in df.columns or df["batter"].dropna().empty:
        df["batter_name"] = None
        return df

    # Readable stand-in used for every ID that does not end up with a name.
    placeholder = df["batter"].apply(
        lambda x: f"ID {int(x)}" if pd.notna(x) else None
    )

    try:
        # Imported lazily so the module still loads when pybaseball is absent.
        from pybaseball import playerid_reverse_lookup

        ids = df["batter"].dropna().astype(int).unique().tolist()
        # .copy() so the new column is added to an independent frame rather
        # than to a column slice of the lookup result.
        lut = playerid_reverse_lookup(ids, key_type="mlbam")[
            ["key_mlbam", "name_first", "name_last"]
        ].copy()
        lut["batter_name"] = (
            lut["name_first"].str.title() + " " + lut["name_last"].str.title()
        )
        name_map = dict(zip(lut["key_mlbam"].astype(int), lut["batter_name"]))
        resolved = df["batter"].map(name_map)
    except Exception:
        # Deliberate best-effort: lookup failed / no internet / pybaseball
        # missing. Fall back to placeholders instead of failing the caller.
        df["batter_name"] = placeholder
    else:
        # Keep resolved names; use the placeholder where the lookup had no
        # entry (or an incomplete name) for that ID.
        df["batter_name"] = resolved.where(resolved.notna(), placeholder)

    return df
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def _safe_rate(num, den):
|
| 55 |
+
num = num.astype(float)
|
| 56 |
+
den = den.astype(float)
|
| 57 |
+
with np.errstate(divide="ignore", invalid="ignore"):
|
| 58 |
+
r = np.where(den > 0, num / den, np.nan)
|
| 59 |
+
return r
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def best_matchups_for_pitcher(
    df_raw: pd.DataFrame,
    pitcher_name: str,
    min_pitches: int = 10,
    top_n: int = 10,
    w_whiff: float = 0.6,
    w_gb: float = 0.4,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Summarize batter-vs-pitcher outcomes and rank them by a pitcher-friendly score.

    For the rows of *df_raw* whose ``player_name`` equals *pitcher_name*,
    pitches are grouped per batter (``batter``, ``stand`` and, when present,
    ``batter_name``) and scored as::

        score = w_whiff * whiff_rate + w_gb * gb_rate_on_contact

    where ``whiff_rate`` is whiffs / swings and ``gb_rate_on_contact`` is
    ground-ball events / balls in play.

    Args:
        df_raw: Pitch-level data. Reads the columns ``player_name``,
            ``description``, ``events``, ``pitch_type``, ``batter`` and
            ``stand``; ``batter_name`` is used when available.
        pitcher_name: Value of ``player_name`` to select.
        min_pitches: Minimum number of pitches a batter group must have seen
            to be ranked.
        top_n: Maximum number of rows in each returned table.
        w_whiff: Weight applied to the whiff rate.
        w_gb: Weight applied to the ground-ball rate on contact.

    Returns:
        ``(best_df, worst_df)``: the highest-scoring groups in descending
        order and the lowest-scoring groups in ascending order. Two empty
        frames are returned when ``player_name`` is missing or no row matches
        the pitcher. With fewer than ``2 * top_n`` ranked groups the two
        tables overlap.

    Groups whose score is undefined (no swings, or no balls in play, so one
    of the rates is NaN) are left out of both tables: NaN sorts last and
    would otherwise fill the "worst" table with unscored batters.
    NOTE(review): this also drops batters who swung but never made contact;
    confirm whether those should be scored on whiff rate alone instead.
    """

    # Filter to the one pitcher. Guard the column explicitly: comparing a
    # missing column (None) to a string yields a bare False, and indexing the
    # frame with that raises a confusing KeyError.
    if "player_name" not in df_raw.columns:
        return pd.DataFrame(), pd.DataFrame()

    dfp = df_raw[df_raw["player_name"] == pitcher_name].copy()
    if dfp.empty:
        return pd.DataFrame(), pd.DataFrame()

    # Derive per-pitch outcomes as 0/1 flags so they can simply be summed.
    dfp["is_swing"] = dfp["description"].isin(OUTCOME_DESCS_SWING).astype(int)
    dfp["is_whiff"] = dfp["description"].isin(OUTCOME_DESCS_WHIFF).astype(int)
    dfp["is_in_play"] = (dfp["description"] == "hit_into_play").astype(int)
    dfp["is_gb_event"] = dfp["events"].isin(EVENTS_GB).astype(int)

    # Prefer a readable name column when the data carries one.
    name_col = "batter_name" if "batter_name" in dfp.columns else None

    group_cols = ["batter", "stand"]
    if name_col:
        group_cols = [name_col, "batter", "stand"]

    # dropna=False keeps batters whose name (or handedness) is missing.
    g = dfp.groupby(group_cols, dropna=False)

    agg = g.agg(
        pitches=("pitch_type", "size"),
        swings=("is_swing", "sum"),
        whiffs=("is_whiff", "sum"),
        inplay=("is_in_play", "sum"),
        gb_events=("is_gb_event", "sum"),
    ).reset_index()

    # Rates (NaN where the denominator is zero)
    agg["whiff_rate"] = _safe_rate(agg["whiffs"], agg["swings"])
    agg["gb_rate_on_contact"] = _safe_rate(agg["gb_events"], agg["inplay"])

    # Pitcher-friendly score
    agg["pm_score"] = w_whiff * agg["whiff_rate"] + w_gb * agg["gb_rate_on_contact"]

    # Filtering: enough pitches, and a score that can actually be ranked.
    ranked = (
        agg[agg["pitches"] >= min_pitches]
        .dropna(subset=["pm_score"])
        .sort_values("pm_score", ascending=False)
    )

    # Nicely ordered columns
    display_cols = []
    if name_col:
        display_cols.append(name_col)
    display_cols += [
        "batter",
        "stand",
        "pitches",
        "whiff_rate",
        "gb_rate_on_contact",
        "pm_score",
    ]

    best = ranked.head(top_n)[display_cols].copy()
    worst = ranked.tail(top_n).sort_values("pm_score", ascending=True)[display_cols].copy()
    return best, worst
|