rsm-roguchi committed on
Commit
c5c0f3e
·
1 Parent(s): 4c2852d
Files changed (4) hide show
  1. app.py +125 -13
  2. build.ipynb +0 -0
  3. requirements.txt +7 -3
  4. src/tags.py +108 -18
app.py CHANGED
@@ -1,39 +1,151 @@
 
1
  import os, sys
 
2
 
3
- sys.path.append(os.path.join(os.path.dirname(__file__), "src"))
 
 
4
 
5
  import streamlit as st
6
  import pandas as pd
 
 
7
  from data import load_statcast, default_window
8
  from featurize import infer_ivb_sign, engineer_pitch_features
9
  from model import fit_kmeans, nearest_comps
10
  from tags import xy_cluster_tags
11
  from plots import movement_scatter_xy, radar_quality
12
 
 
 
 
 
 
 
13
 
14
  st.set_page_config(page_title="PitchXY (Handedness-Aware)", layout="wide")
15
  st.title("⚾ PitchXY — Handedness-Aware Pitch Archetypes & Scouting Cards")
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  with st.sidebar:
18
  st.header("Data Window")
19
  dstart, dend = default_window()
20
  start = st.text_input("Start YYYY-MM-DD", dstart)
21
  end = st.text_input("End YYYY-MM-DD", dend)
22
  k = st.slider("Clusters (k)", 5, 12, 8)
23
- force = st.checkbox("Force re-download", value=False)
 
 
 
 
 
 
24
 
25
- df_raw = load_statcast(start, end, force=force)
26
  if df_raw.empty:
27
- st.warning("No data for that window.")
 
 
 
 
28
  st.stop()
29
 
30
- ivb_sign = infer_ivb_sign(df_raw)
31
- df_feat = engineer_pitch_features(df_raw, ivb_sign)
32
- df_fit, scaler, km, nn = fit_kmeans(df_feat, k=k)
33
- cluster_names = xy_cluster_tags(df_fit)
34
- df_fit["cluster_name"] = df_fit["cluster"].map(cluster_names)
35
 
36
- pitcher = st.selectbox("Pitcher", sorted(df_fit["player_name"].unique()))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  df_p = df_fit[df_fit["player_name"] == pitcher].sort_values("pitch_type")
38
 
39
  tab1, tab2, tab3 = st.tabs(["Movement", "Scouting Card", "Comps"])
@@ -51,7 +163,6 @@ with tab1:
51
  movement_scatter_xy(df_fit, color="cluster_name"), use_container_width=True
52
  )
53
 
54
-
55
  with tab2:
56
  st.subheader(f"Scouting Card — {pitcher}")
57
  st.dataframe(
@@ -69,7 +180,8 @@ with tab2:
69
  "zone_pct",
70
  "cluster_name",
71
  ]
72
- ]
 
73
  )
74
  for _, row in df_p.iterrows():
75
  st.markdown(f"### {row['pitch_type']} — {row['cluster_name']}")
@@ -79,4 +191,4 @@ with tab3:
79
  for _, row in df_p.iterrows():
80
  st.markdown(f"#### {row['pitch_type']} comps")
81
  comps = nearest_comps(row, df_fit, scaler, nn, within_pitch_type=True, k=6)
82
- st.dataframe(comps)
 
1
+ # app.py
2
  import os, sys
3
+ from datetime import datetime
4
 
5
+ # Ensure we can import from ./src even on HF Spaces
6
+ BASE_DIR = os.path.dirname(__file__)
7
+ sys.path.append(os.path.join(BASE_DIR, "src"))
8
 
9
  import streamlit as st
10
  import pandas as pd
11
+
12
+ # Your local modules
13
  from data import load_statcast, default_window
14
  from featurize import infer_ivb_sign, engineer_pitch_features
15
  from model import fit_kmeans, nearest_comps
16
  from tags import xy_cluster_tags
17
  from plots import movement_scatter_xy, radar_quality
18
 
19
+ try:
20
+ from huggingface_hub import hf_hub_download
21
+
22
+ HF_HUB_OK = True
23
+ except Exception:
24
+ HF_HUB_OK = False
25
 
26
  st.set_page_config(page_title="PitchXY (Handedness-Aware)", layout="wide")
27
  st.title("⚾ PitchXY — Handedness-Aware Pitch Archetypes & Scouting Cards")
28
 
29
+ # ---- Helpers
30
+
31
+
32
@st.cache_data(show_spinner=False, ttl=24 * 3600)
def load_statcast_cached(start: str, end: str, force: bool = False) -> pd.DataFrame:
    """Return the Statcast frame for ``[start, end]`` through Streamlit's data cache.

    Delegates to ``load_statcast`` and memoises the result for 24 hours,
    keyed on ``(start, end, force)``, so reruns of the script do not repeat
    the underlying load.

    Args:
        start: Window start, passed through to ``load_statcast`` unchanged.
        end: Window end, passed through to ``load_statcast`` unchanged.
        force: Forwarded as the ``force`` keyword of ``load_statcast``.

    Returns:
        Whatever ``load_statcast`` returns for the given arguments.
    """
    frame = load_statcast(start, end, force=force)
    return frame
39
+
40
+
41
@st.cache_data(show_spinner=False)
def load_sample_fallback() -> pd.DataFrame:
    """Load bundled or Hub-hosted sample data for when live data is unavailable.

    Lookup order:
      1. ``data/sample_statcast.parquet`` next to this file (``BASE_DIR``).
      2. A Hugging Face *dataset* repo named by the ``SAMPLE_DATA_REPO`` env
         var, file named by ``SAMPLE_DATA_FILE`` — only when
         ``huggingface_hub`` imported successfully (``HF_HUB_OK``).
      3. An empty frame that carries the expected column names.

    Returns:
        The sample DataFrame, or an empty DataFrame with the expected columns
        when no sample source is configured.
    """
    default_file = "sample_statcast.parquet"

    local_path = os.path.join(BASE_DIR, "data", default_file)
    if os.path.exists(local_path):
        return pd.read_parquet(local_path)

    # If not bundled locally, try HF Hub (if available)
    repo_id = os.getenv("SAMPLE_DATA_REPO", "").strip()
    # A variable that is set but blank/whitespace would otherwise yield an
    # empty filename; treat it the same as "unset" and use the default name.
    file_name = os.getenv("SAMPLE_DATA_FILE", "").strip() or default_file
    if HF_HUB_OK and repo_id:
        path = hf_hub_download(repo_id=repo_id, filename=file_name, repo_type="dataset")
        return pd.read_parquet(path)

    # Give a tiny empty frame with expected columns to keep UI alive
    expected_columns = [
        "game_date",
        "player_name",
        "pitch_type",
        "p_throws",
        "n",
        "velo",
        "ivb_in",
        "hb_as_in",
        "csw",
        "whiff_rate",
        "gb_rate",
        "zone_pct",
        "cluster",
        "cluster_name",
        "x_mvt",
        "y_mvt",
    ]
    return pd.DataFrame(columns=expected_columns)
81
+
82
+
83
def safe_load_data(start: str, end: str, force: bool) -> pd.DataFrame:
    """Load live data, falling back to sample data without ever raising.

    Tries ``load_statcast_cached`` first. If that raises or yields no rows,
    a notice is shown and ``load_sample_fallback`` is used instead. If the
    fallback itself fails, a warning is shown and an empty DataFrame is
    returned so the caller's ``.empty`` check can handle it.

    Args:
        start: Window start, forwarded to ``load_statcast_cached``.
        end: Window end, forwarded to ``load_statcast_cached``.
        force: Forwarded to ``load_statcast_cached``.

    Returns:
        The live frame when it has rows; otherwise the sample frame; otherwise
        an empty DataFrame.
    """
    try:
        df = load_statcast_cached(start, end, force)
        # Basic sanity check – empty windows are common; handle gracefully
        if df is not None and not df.empty:
            return df
        st.info("No live data returned for that window — showing sample data instead.")
    except Exception as e:  # UI boundary: report and degrade instead of crashing
        st.warning(f"Live data failed: {e}\nUsing sample data instead.")

    # The fallback reads a parquet file and may hit the network; guard it here
    # (outside the cached function) so a transient failure is reported but is
    # not memoised as the cached result.
    try:
        return load_sample_fallback()
    except Exception as e:
        st.warning(f"Sample data failed to load: {e}")
        return pd.DataFrame()
96
+
97
+
98
+ # ---- Sidebar
99
+
100
  with st.sidebar:
101
  st.header("Data Window")
102
  dstart, dend = default_window()
103
  start = st.text_input("Start YYYY-MM-DD", dstart)
104
  end = st.text_input("End YYYY-MM-DD", dend)
105
  k = st.slider("Clusters (k)", 5, 12, 8)
106
+ force = st.checkbox("Force re-download (discouraged on Spaces)", value=False)
107
+ st.caption("Tip: avoid 'Force re-download' on Spaces to keep startup snappy.")
108
+
109
+ # ---- Data pipeline
110
+
111
+ with st.spinner("Loading data…"):
112
+ df_raw = safe_load_data(start, end, force)
113
 
 
114
  if df_raw.empty:
115
+ st.warning(
116
+ "No data available (live and sample were both empty). "
117
+ "Upload a small sample file to ./data/sample_statcast.parquet or set "
118
+ "env vars SAMPLE_DATA_REPO + SAMPLE_DATA_FILE to a HF dataset."
119
+ )
120
  st.stop()
121
 
 
 
 
 
 
122
 
123
+ # Feature engineering (cache stable steps)
124
@st.cache_data(show_spinner=False)
def _featurize(df_raw_in: pd.DataFrame):
    """Build the per-pitch feature frame from raw data (cached by Streamlit).

    Infers the IVB sign from the raw frame with ``infer_ivb_sign`` and passes
    both the frame and that sign to ``engineer_pitch_features``.
    """
    return engineer_pitch_features(df_raw_in, infer_ivb_sign(df_raw_in))
129
+
130
+
131
+ df_feat = _featurize(df_raw)
132
+
133
+
134
@st.cache_data(show_spinner=False)
def _fit_model(df_feat_in: pd.DataFrame, k_val: int):
    """Cluster the feature frame and attach a readable name to every cluster.

    Runs ``fit_kmeans`` with ``k=k_val``, derives a cluster-id -> tag mapping
    with ``xy_cluster_tags``, and returns a copy of the fitted frame with a
    ``cluster_name`` column mapped from ``cluster``.

    Returns:
        ``(fitted_frame_with_names, scaler, km, nn)`` — the last three are
        passed through from ``fit_kmeans`` untouched.
    """
    fitted, scaler, km, nn = fit_kmeans(df_feat_in, k=k_val)
    names_by_cluster = xy_cluster_tags(fitted)
    # assign() returns a new frame, so the object handed back by fit_kmeans
    # is left unmodified.
    labelled = fitted.assign(cluster_name=fitted["cluster"].map(names_by_cluster))
    return labelled, scaler, km, nn
141
+
142
+
143
+ with st.spinner("Clustering & tagging…"):
144
+ df_fit, scaler, km, nn = _fit_model(df_feat, k)
145
+
146
+ # ---- UI
147
+
148
+ pitcher = st.selectbox("Pitcher", sorted(df_fit["player_name"].dropna().unique()))
149
  df_p = df_fit[df_fit["player_name"] == pitcher].sort_values("pitch_type")
150
 
151
  tab1, tab2, tab3 = st.tabs(["Movement", "Scouting Card", "Comps"])
 
163
  movement_scatter_xy(df_fit, color="cluster_name"), use_container_width=True
164
  )
165
 
 
166
  with tab2:
167
  st.subheader(f"Scouting Card — {pitcher}")
168
  st.dataframe(
 
180
  "zone_pct",
181
  "cluster_name",
182
  ]
183
+ ],
184
+ use_container_width=True,
185
  )
186
  for _, row in df_p.iterrows():
187
  st.markdown(f"### {row['pitch_type']} — {row['cluster_name']}")
 
191
  for _, row in df_p.iterrows():
192
  st.markdown(f"#### {row['pitch_type']} comps")
193
  comps = nearest_comps(row, df_fit, scaler, nn, within_pitch_type=True, k=6)
194
+ st.dataframe(comps, use_container_width=True)
build.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
requirements.txt CHANGED
@@ -1,3 +1,7 @@
1
- altair
2
- pandas
3
- streamlit
 
 
 
 
 
1
+ streamlit==1.38.0
2
+ pandas==2.2.2
3
+ numpy==1.26.4
4
+ plotly==5.24.1
5
+ scikit-learn==1.5.1
6
+ pyarrow==16.1.0
7
+ huggingface_hub==0.25.2
src/tags.py CHANGED
@@ -4,6 +4,8 @@ import pandas as pd
4
 
5
 
6
  def _mag_label(v, q25, q75, small="Subtle", mid="Moderate", big="Heavy"):
 
 
7
  if v >= q75:
8
  return big
9
  if v <= q25:
@@ -11,22 +13,65 @@ def _mag_label(v, q25, q75, small="Subtle", mid="Moderate", big="Heavy"):
11
  return mid
12
 
13
 
14
- def _side_label(hb_as):
15
- return "Arm-Side" if hb_as >= 0 else "Glove-Side"
16
-
17
-
18
  def _vert_label(ivb):
 
 
19
  return "Ride" if ivb >= 0 else "Drop"
20
 
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  def xy_cluster_tags(df_with_clusters: pd.DataFrame) -> dict[int, str]:
23
  df = df_with_clusters.copy()
24
 
 
25
  q_abs_ivb25 = np.nanquantile(np.abs(df["ivb_in"]), 0.25)
26
  q_abs_ivb75 = np.nanquantile(np.abs(df["ivb_in"]), 0.75)
27
  q_abs_hb25 = np.nanquantile(np.abs(df["hb_as_in"]), 0.25)
28
  q_abs_hb75 = np.nanquantile(np.abs(df["hb_as_in"]), 0.75)
29
 
 
30
  q_wh75 = np.nanquantile(df["whiff_rate"], 0.75)
31
  q_gb75 = np.nanquantile(df["gb_rate"], 0.75)
32
  q_zn75 = np.nanquantile(df["zone_pct"], 0.75)
@@ -36,36 +81,81 @@ def xy_cluster_tags(df_with_clusters: pd.DataFrame) -> dict[int, str]:
36
 
37
  tags = {}
38
  for c, sub in df.groupby("cluster"):
39
- row = sub.mean(numeric_only=True)
 
 
 
40
  dom_pt = (
41
  sub["pitch_type"].mode().iloc[0]
42
- if not sub["pitch_type"].mode().empty
43
  else "Pitch"
44
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
- side = _side_label(row["hb_as_in"])
47
- vert = _vert_label(row["ivb_in"])
48
- mag_side = _mag_label(abs(row["hb_as_in"]), q_abs_hb25, q_abs_hb75)
49
- mag_vert = _mag_label(abs(row["ivb_in"]), q_abs_ivb25, q_abs_ivb75)
50
 
 
51
  flavor = []
52
- if row["whiff_rate"] >= q_wh75:
53
  flavor.append("Whiff-First")
54
- if row["gb_rate"] >= q_gb75:
55
  flavor.append("Grounder-First")
56
- if row["zone_pct"] >= q_zn75:
57
  flavor.append("Strike-Throwing")
58
  if not flavor:
59
  diffs = {
60
- "Whiff-First": row["whiff_rate"] - q_wh50,
61
- "Grounder-First": row["gb_rate"] - q_gb50,
62
- "Strike-Throwing": row["zone_pct"] - q_zn50,
63
  }
64
  flavor.append(max(diffs, key=diffs.get))
65
 
66
- side_noun = "Run" if side == "Arm-Side" else "Sweep"
67
- vert_noun = "Ride" if vert == "Ride" else "Drop"
 
 
 
 
 
 
68
  shape = f"{side} • {mag_side} {side_noun}, {mag_vert} {vert_noun}"
 
69
  tags[c] = f"{dom_pt}: {shape} • " + " / ".join(flavor)
70
 
71
  return tags
 
4
 
5
 
6
  def _mag_label(v, q25, q75, small="Subtle", mid="Moderate", big="Heavy"):
7
+ if pd.isna(v):
8
+ return mid
9
  if v >= q75:
10
  return big
11
  if v <= q25:
 
13
  return mid
14
 
15
 
 
 
 
 
16
def _vert_label(ivb):
    """Name the vertical movement direction for an induced-vertical-break value.

    Returns "Neutral" for a missing value, "Drop" for a negative value, and
    "Ride" otherwise (zero counts as "Ride").
    """
    if pd.isna(ivb):
        return "Neutral"
    if ivb < 0:
        return "Drop"
    return "Ride"
20
 
21
 
22
def _armside_from_raw_hb(hb_raw: float, throws: str) -> str:
    """Classify a raw horizontal break as 'Arm-Side' or 'Glove-Side'.

    Sign convention used by this function:
      - right-handed ("R"): negative ``hb_raw`` is arm-side
      - left-handed  ("L"): positive ``hb_raw`` is arm-side
    A break of exactly zero is reported as 'Glove-Side'.

    Returns 'Neutral' when ``hb_raw`` is missing or ``throws`` is neither
    "R" nor "L".
    """
    if pd.isna(hb_raw) or throws not in ("R", "L"):
        return "Neutral"

    if throws == "R":
        toward_arm = hb_raw < 0
    else:
        toward_arm = hb_raw > 0
    return "Arm-Side" if toward_arm else "Glove-Side"
34
+
35
+
36
def _infer_side_series(sub: pd.DataFrame) -> pd.Series:
    """Label every row of ``sub`` as 'Arm-Side', 'Glove-Side' or 'Neutral'.

    Uses the raw horizontal break column ``hb_in`` when present; otherwise
    reconstructs a raw-like value from ``hb_as_in`` (kept as-is for "L",
    negated for "R"). Missing ``p_throws`` values are filled with the most
    common hand in ``sub`` (or "R" if there is none) *before* any
    reconstruction, so the reconstruction and the classification always
    agree on the hand.

    Args:
        sub: Frame with ``p_throws`` and either ``hb_in`` or ``hb_as_in``.

    Returns:
        A string Series aligned to ``sub.index``. Rows are 'Neutral' when the
        break is missing or the hand is not "R"/"L"; the whole Series is
        'Neutral' when the required columns are absent.
    """
    neutral = pd.Series("Neutral", index=sub.index, dtype=object)

    # Handedness is required on every path; without it nothing can be inferred.
    if "p_throws" not in sub.columns:
        return neutral

    hand_modes = sub["p_throws"].mode()
    fallback_hand = hand_modes.iloc[0] if not hand_modes.empty else "R"
    throws = sub["p_throws"].fillna(fallback_hand)

    if "hb_in" in sub.columns:
        hb_raw = sub["hb_in"]
    elif "hb_as_in" in sub.columns:
        # raw ≈ +hb_as for LHP, raw ≈ -hb_as for RHP
        hb_raw = sub["hb_as_in"].where(throws == "L", -sub["hb_as_in"])
    else:
        return neutral

    arm_side = ((throws == "R") & (hb_raw < 0)) | ((throws == "L") & (hb_raw > 0))
    # NaN comparisons are False, which would otherwise masquerade as
    # 'Glove-Side'; report those rows (and unknown hands) as 'Neutral'.
    undetermined = hb_raw.isna() | ~throws.isin(["R", "L"])

    labels = np.where(
        undetermined,
        "Neutral",
        np.where(arm_side, "Arm-Side", "Glove-Side"),
    )
    return pd.Series(labels, index=sub.index)
63
+
64
+
65
  def xy_cluster_tags(df_with_clusters: pd.DataFrame) -> dict[int, str]:
66
  df = df_with_clusters.copy()
67
 
68
+ # Quantiles for magnitude bucketing
69
  q_abs_ivb25 = np.nanquantile(np.abs(df["ivb_in"]), 0.25)
70
  q_abs_ivb75 = np.nanquantile(np.abs(df["ivb_in"]), 0.75)
71
  q_abs_hb25 = np.nanquantile(np.abs(df["hb_as_in"]), 0.25)
72
  q_abs_hb75 = np.nanquantile(np.abs(df["hb_as_in"]), 0.75)
73
 
74
+ # Quality quantiles
75
  q_wh75 = np.nanquantile(df["whiff_rate"], 0.75)
76
  q_gb75 = np.nanquantile(df["gb_rate"], 0.75)
77
  q_zn75 = np.nanquantile(df["zone_pct"], 0.75)
 
81
 
82
  tags = {}
83
  for c, sub in df.groupby("cluster"):
84
+ # Robust central tendency
85
+ row = sub.median(numeric_only=True)
86
+
87
+ # Dominant metadata
88
  dom_pt = (
89
  sub["pitch_type"].mode().iloc[0]
90
+ if "pitch_type" in sub and not sub["pitch_type"].mode().empty
91
  else "Pitch"
92
  )
93
+ dom_throw = (
94
+ sub["p_throws"].mode().iloc[0]
95
+ if "p_throws" in sub and not sub["p_throws"].mode().empty
96
+ else "R"
97
+ )
98
+
99
+ # Robust side inference
100
+ per_pitch_side = _infer_side_series(sub)
101
+ side_counts = per_pitch_side.value_counts(dropna=False)
102
+ side = side_counts.idxmax() if not side_counts.empty else "Neutral"
103
+
104
+ # If nearly tied or Neutral, fall back to median raw
105
+ if side in ("Neutral",) or (
106
+ len(side_counts) > 1 and (side_counts.max() - side_counts.min()) <= 2
107
+ ):
108
+ # Use hb_raw median logic
109
+ if "hb_in" in sub.columns:
110
+ hb_raw_med = sub["hb_in"].median()
111
+ else:
112
+ # Reconstruct raw-ish median from hb_as_in + throws
113
+ if "hb_as_in" in sub.columns:
114
+ hb_raw_med = sub.apply(
115
+ lambda r: (
116
+ r["hb_as_in"]
117
+ if r.get("p_throws", dom_throw) == "L"
118
+ else -r["hb_as_in"]
119
+ ),
120
+ axis=1,
121
+ ).median()
122
+ else:
123
+ hb_raw_med = np.nan
124
+ side = _armside_from_raw_hb(hb_raw_med, dom_throw)
125
+
126
+ # Vertical shape from ivb sign (already handedness-invariant)
127
+ vert = _vert_label(row.get("ivb_in", np.nan))
128
 
129
+ # Magnitudes from absolute, handedness-invariant features
130
+ mag_side = _mag_label(abs(row.get("hb_as_in", np.nan)), q_abs_hb25, q_abs_hb75)
131
+ mag_vert = _mag_label(abs(row.get("ivb_in", np.nan)), q_abs_ivb25, q_abs_ivb75)
 
132
 
133
+ # Flavor tags
134
  flavor = []
135
+ if row.get("whiff_rate", 0) >= q_wh75:
136
  flavor.append("Whiff-First")
137
+ if row.get("gb_rate", 0) >= q_gb75:
138
  flavor.append("Grounder-First")
139
+ if row.get("zone_pct", 0) >= q_zn75:
140
  flavor.append("Strike-Throwing")
141
  if not flavor:
142
  diffs = {
143
+ "Whiff-First": row.get("whiff_rate", 0) - q_wh50,
144
+ "Grounder-First": row.get("gb_rate", 0) - q_gb50,
145
+ "Strike-Throwing": row.get("zone_pct", 0) - q_zn50,
146
  }
147
  flavor.append(max(diffs, key=diffs.get))
148
 
149
+ side_noun = (
150
+ "Run"
151
+ if side == "Arm-Side"
152
+ else ("Sweep" if side == "Glove-Side" else "Run/Sweep")
153
+ )
154
+ vert_noun = (
155
+ "Ride" if vert == "Ride" else ("Drop" if vert == "Drop" else "Ride/Drop")
156
+ )
157
  shape = f"{side} • {mag_side} {side_noun}, {mag_vert} {vert_noun}"
158
+
159
  tags[c] = f"{dom_pt}: {shape} • " + " / ".join(flavor)
160
 
161
  return tags