rsm-roguchi committed on
Commit
cbe015e
·
1 Parent(s): 752a595
Files changed (2) hide show
  1. app.py +50 -1
  2. src/matchups.py +130 -0
app.py CHANGED
@@ -15,6 +15,7 @@ from featurize import infer_ivb_sign, engineer_pitch_features
15
  from model import fit_kmeans, nearest_comps
16
  from tags import xy_cluster_tags
17
  from plots import movement_scatter_xy, radar_quality
 
18
 
19
  try:
20
  from huggingface_hub import hf_hub_download
@@ -86,6 +87,7 @@ with st.sidebar:
86
 
87
  with st.spinner("Loading data…"):
88
  df_raw = safe_load_data(start, end, force)
 
89
 
90
  if df_raw.empty:
91
  st.warning(
@@ -123,7 +125,7 @@ with st.spinner("Clustering & tagging…"):
123
  pitcher = st.selectbox("Pitcher", sorted(df_fit["player_name"].dropna().unique()))
124
  df_p = df_fit[df_fit["player_name"] == pitcher].sort_values("pitch_type")
125
 
126
- tab1, tab2, tab3 = st.tabs(["Movement", "Scouting Card", "Comps"])
127
 
128
  with tab1:
129
  view = st.radio("View", ["Selected pitcher", "All pitchers"], horizontal=True)
@@ -168,3 +170,50 @@ with tab3:
168
  # ⬇️ Old signature again
169
  comps = nearest_comps(row, df_fit, scaler, nn, within_pitch_type=True, k=6)
170
  st.dataframe(comps, use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  from model import fit_kmeans, nearest_comps
16
  from tags import xy_cluster_tags
17
  from plots import movement_scatter_xy, radar_quality
18
+ from matchups import best_matchups_for_pitcher, ensure_batter_names
19
 
20
  try:
21
  from huggingface_hub import hf_hub_download
 
87
 
88
  with st.spinner("Loading data…"):
89
  df_raw = safe_load_data(start, end, force)
90
+ df_raw = ensure_batter_names(df_raw)
91
 
92
  if df_raw.empty:
93
  st.warning(
 
125
  pitcher = st.selectbox("Pitcher", sorted(df_fit["player_name"].dropna().unique()))
126
  df_p = df_fit[df_fit["player_name"] == pitcher].sort_values("pitch_type")
127
 
128
+ tab1, tab2, tab3, tab4 = st.tabs(["Movement", "Scouting Card", "Comps", 'Best Matchups'])
129
 
130
  with tab1:
131
  view = st.radio("View", ["Selected pitcher", "All pitchers"], horizontal=True)
 
170
  # ⬇️ Old signature again
171
  comps = nearest_comps(row, df_fit, scaler, nn, within_pitch_type=True, k=6)
172
  st.dataframe(comps, use_container_width=True)
173
+
174
with tab4:
    # "Best Matchups" tab: rank the batters the selected pitcher has faced
    # using best_matchups_for_pitcher and show the two ends of the ranking.
    # NOTE(review): indentation was lost in the view this block was recovered
    # from; the nesting below (in particular st.caption living inside the
    # else branch) is the most plausible reading — confirm against the repo.
    st.subheader(f"Best Matchups — {pitcher}")

    # Controls: four columns, the last one twice as wide as the others.
    colA, colB, colC, colD = st.columns([1, 1, 1, 2])
    with colA:
        min_pitches = st.number_input("Min pitches vs batter", 5, 200, 10, step=5)
    with colB:
        top_n = st.number_input("Top N", 5, 50, 10, step=5)
    with colC:
        w_whiff = st.slider("Weight: Whiff", 0.0, 1.0, 0.6, 0.05)
    with colD:
        w_gb = st.slider("Weight: GB on contact", 0.0, 1.0, 0.4, 0.05)

    # Normalize the two weights so they sum to 1. The 1e-6 floor avoids a
    # division by zero when both sliders are at 0 (both weights then stay 0).
    total_w = max(w_whiff + w_gb, 1e-6)
    w_whiff /= total_w
    w_gb /= total_w

    # Compute the rankings from the raw pitch-level frame (df_raw), not the
    # per-pitch-type frame used by the other tabs.
    best, worst = best_matchups_for_pitcher(
        df_raw,
        pitcher,
        min_pitches=min_pitches,
        top_n=int(top_n),
        w_whiff=float(w_whiff),
        w_gb=float(w_gb),
    )

    if best.empty and worst.empty:
        # Nothing to rank: say so instead of rendering two empty tables.
        st.info(
            "No batter matchups for this pitcher within the current window / filters."
        )
    else:
        # Side-by-side tables: highest scores on the left, lowest on the right.
        c1, c2 = st.columns(2)
        with c1:
            st.markdown("### ✅ Best (Pitcher-Friendly)")
            st.dataframe(best, use_container_width=True)
        with c2:
            st.markdown("### ⚠️ Tough (Least Pitcher-Friendly)")
            st.dataframe(worst, use_container_width=True)

        st.caption(
            "Score = w_whiff × whiff_rate + w_gb × ground-ball-rate-on-contact. "
            "Adjust weights to emphasize strikeouts vs. weak contact."
        )
src/matchups.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/matchups.py
2
+ from __future__ import annotations
3
+ import numpy as np
4
+ import pandas as pd
5
+
6
# Pitch `description` values counted as a whiff (a swing that misses).
OUTCOME_DESCS_WHIFF = {"swinging_strike", "swinging_strike_blocked"}

# Pitch `description` values counted as a swing: every whiff plus the two
# contact outcomes.
OUTCOME_DESCS_SWING = OUTCOME_DESCS_WHIFF | {"foul", "hit_into_play"}

# `events` values treated as a ground ball — a crude GB proxy on balls in play.
# NOTE(review): if the data is Statcast, ground outs may be reported as
# "field_out" rather than "groundout", and `bb_type` may be a more direct
# signal — confirm against the actual `events` values.
EVENTS_GB = {"groundout", "field_error", "single", "double", "triple"}
21
+
22
+
23
+ # Name resolution for MLBAM batter IDs → "First Last"
24
def ensure_batter_names(df_raw: pd.DataFrame) -> pd.DataFrame:
    """Return ``df_raw`` with a ``batter_name`` column derived from ``batter`` IDs.

    Behavior:
      * If ``batter_name`` already exists, ``df_raw`` itself is returned
        unchanged (no copy).
      * Otherwise a copy is returned. When there is no ``batter`` column, or
        it holds no non-null values, ``batter_name`` is filled with ``None``.
      * Otherwise IDs are resolved to "First Last" through pybaseball's
        ``playerid_reverse_lookup`` (``key_type="mlbam"``). Any ID that cannot
        be resolved — because the lookup fails as a whole or because the ID
        is missing from the returned table — gets the readable placeholder
        ``"ID <n>"``. Rows with a null ``batter`` keep a null name.

    Args:
        df_raw: Pitch-level frame; never mutated.

    Returns:
        A frame that is guaranteed to have a ``batter_name`` column.
    """
    if "batter_name" in df_raw.columns:
        return df_raw

    df = df_raw.copy()
    if "batter" not in df.columns or df["batter"].dropna().empty:
        df["batter_name"] = None
        return df

    def _placeholder(batter_id):
        # Readable stand-in for an unresolved ID; null IDs stay null.
        if pd.isna(batter_id):
            return None
        try:
            return f"ID {int(batter_id)}"
        except (TypeError, ValueError):
            # Non-numeric ID: show it verbatim rather than crash.
            return f"ID {batter_id}"

    placeholders = df["batter"].apply(_placeholder)

    try:
        # Imported lazily so the module still imports without pybaseball.
        from pybaseball import playerid_reverse_lookup

        ids = df["batter"].dropna().astype(int).unique().tolist()
        lut = playerid_reverse_lookup(ids, key_type="mlbam")
        # Build the names as a standalone Series instead of assigning a new
        # column onto a column-subset of the lookup frame.
        full_names = lut["name_first"].str.title() + " " + lut["name_last"].str.title()
        name_map = dict(zip(lut["key_mlbam"].astype(int), full_names))
        resolved = df["batter"].map(name_map)
    except Exception:
        # Deliberate best-effort: lookup fails / no internet / pybaseball is
        # not installed. Every row then falls back to its placeholder.
        df["batter_name"] = placeholders
        return df

    # IDs absent from the lookup table get the same placeholder as the
    # failure path instead of being left as NaN.
    df["batter_name"] = np.where(resolved.notna(), resolved, placeholders)
    return df
52
+
53
+
54
+ def _safe_rate(num, den):
55
+ num = num.astype(float)
56
+ den = den.astype(float)
57
+ with np.errstate(divide="ignore", invalid="ignore"):
58
+ r = np.where(den > 0, num / den, np.nan)
59
+ return r
60
+
61
+
62
def best_matchups_for_pitcher(
    df_raw: pd.DataFrame,
    pitcher_name: str,
    min_pitches: int = 10,
    top_n: int = 10,
    w_whiff: float = 0.6,
    w_gb: float = 0.4,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Rank the batters one pitcher has faced by a 'pitcher-friendly' score.

    score = w_whiff * whiff_rate + w_gb * gb_rate_on_contact, where

      * whiff_rate = whiffs / swings, with ``description`` classified through
        OUTCOME_DESCS_WHIFF and OUTCOME_DESCS_SWING;
      * gb_rate_on_contact = pitches whose ``events`` is in EVENTS_GB divided
        by pitches whose ``description`` is "hit_into_play" (a crude proxy).

    A component whose weight is exactly 0 is left out of the sum, so an
    undefined rate (no swings / no balls in play) cannot turn the score into
    NaN when that rate carries no weight. Batters whose score is still
    undefined are not ranked; otherwise their NaN scores would sort last and
    be reported as the "worst" matchups.

    Args:
        df_raw: Pitch-level frame. Rows are matched on ``player_name``; the
            columns ``description``, ``events``, ``batter``, ``stand`` and
            ``pitch_type`` are read, and ``batter_name`` is used when present.
        pitcher_name: Value of ``player_name`` to keep.
        min_pitches: Minimum pitches seen by a batter/stand group to be ranked.
        top_n: Maximum number of rows in each returned frame.
        w_whiff: Weight on whiff_rate.
        w_gb: Weight on gb_rate_on_contact.

    Returns:
        ``(best_df, worst_df)``: the ``top_n`` highest-scoring groups (sorted
        descending) and the ``top_n`` lowest-scoring groups (sorted
        ascending). Both are empty frames when ``df_raw`` has no
        ``player_name`` column or no rows for ``pitcher_name``. With fewer
        than ``2 * top_n`` ranked groups the two frames overlap.
    """
    # A missing column must yield "no matchups": indexing with the scalar that
    # `df_raw.get("player_name") == pitcher_name` degrades to would raise.
    if "player_name" not in df_raw.columns:
        return pd.DataFrame(), pd.DataFrame()

    # Filter to the one pitcher (copy: indicator columns are added below).
    dfp = df_raw[df_raw["player_name"] == pitcher_name].copy()
    if dfp.empty:
        return pd.DataFrame(), pd.DataFrame()

    # Derive per-pitch 0/1 outcome indicators.
    dfp["is_swing"] = dfp["description"].isin(OUTCOME_DESCS_SWING).astype(int)
    dfp["is_whiff"] = dfp["description"].isin(OUTCOME_DESCS_WHIFF).astype(int)
    dfp["is_in_play"] = (dfp["description"] == "hit_into_play").astype(int)
    dfp["is_gb_event"] = dfp["events"].isin(EVENTS_GB).astype(int)

    # Group per batter and batting side; carry the readable name when present.
    name_col = "batter_name" if "batter_name" in dfp.columns else None
    group_cols = ["batter", "stand"]
    if name_col:
        group_cols = [name_col, *group_cols]

    # dropna=False keeps batters whose name (or another key) is missing.
    agg = (
        dfp.groupby(group_cols, dropna=False)
        .agg(
            pitches=("pitch_type", "size"),
            swings=("is_swing", "sum"),
            whiffs=("is_whiff", "sum"),
            inplay=("is_in_play", "sum"),
            gb_events=("is_gb_event", "sum"),
        )
        .reset_index()
    )

    # Rates (NaN when the denominator is 0).
    agg["whiff_rate"] = _safe_rate(agg["whiffs"], agg["swings"])
    agg["gb_rate_on_contact"] = _safe_rate(agg["gb_events"], agg["inplay"])

    # Pitcher-friendly score; zero-weight components are skipped entirely.
    score = pd.Series(0.0, index=agg.index)
    for weight, rate_col in ((w_whiff, "whiff_rate"), (w_gb, "gb_rate_on_contact")):
        if weight != 0:
            score = score + weight * agg[rate_col]
    agg["pm_score"] = score

    # Rank only groups with enough pitches and a defined score.
    eligible = (agg["pitches"] >= min_pitches) & agg["pm_score"].notna()
    ranked = agg[eligible].sort_values("pm_score", ascending=False)

    # Display order: identifying columns first, then volume, rates and score.
    display_cols = [
        *group_cols,
        "pitches",
        "whiff_rate",
        "gb_rate_on_contact",
        "pm_score",
    ]

    best = ranked.head(top_n)[display_cols].copy()
    worst = ranked.tail(top_n).sort_values("pm_score", ascending=True)[display_cols].copy()
    return best, worst