# app.py
import os, sys
from datetime import datetime
BASE_DIR = os.path.dirname(__file__)
sys.path.append(os.path.join(BASE_DIR, "src"))
import streamlit as st
import pandas as pd
from data import load_statcast, default_window
from featurize import infer_ivb_sign, engineer_pitch_features
# ⬇️ Revert to older API
from model import fit_kmeans, nearest_comps
from tags import xy_cluster_tags
from plots import movement_scatter_xy, radar_quality
from matchups import best_matchups_for_pitcher, ensure_batter_names
try:
from huggingface_hub import hf_hub_download
HF_HUB_OK = True
except Exception:
HF_HUB_OK = False
# Page-level chrome. Per Streamlit's docs, set_page_config must be the first
# st.* command executed in the script, so it stays ahead of all other UI calls.
st.set_page_config(page_title="PitchXY (Handedness-Aware)", layout="wide")
st.title("⚾ PitchXY — Handedness-Aware Pitch Archetypes & Scouting Cards")
@st.cache_data(show_spinner=False, ttl=24 * 3600)
def load_statcast_cached(start: str, end: str, force: bool = False) -> pd.DataFrame:
    """Fetch Statcast pitch data for [start, end], memoized for 24h.

    Thin wrapper so st.cache_data can key on (start, end, force) and avoid
    re-downloading on every rerun; `force` is forwarded to load_statcast,
    which presumably bypasses its own on-disk cache — confirm in data.py.
    """
    return load_statcast(start, end, force=force)
@st.cache_data(show_spinner=False)
def load_sample_fallback() -> pd.DataFrame:
    """Return bundled sample pitch data as a last-resort fallback.

    Resolution order:
      1. Local parquet at ./data/sample_statcast.parquet.
      2. A Hugging Face dataset named by env vars SAMPLE_DATA_REPO /
         SAMPLE_DATA_FILE (only when huggingface_hub imported successfully).
      3. An empty DataFrame carrying the expected column schema, so
         downstream column selections still work.

    All loads are best-effort: this function IS the fallback path (called
    outside the try in safe_load_data), so a corrupt file or a network/auth
    failure must degrade to the empty schema instead of crashing the app.

    Returns:
        pd.DataFrame: sample pitch-level data, possibly empty.
    """
    local_path = os.path.join(BASE_DIR, "data", "sample_statcast.parquet")
    if os.path.exists(local_path):
        try:
            return pd.read_parquet(local_path)
        except Exception:
            pass  # unreadable local sample — fall through to the HF dataset
    repo_id = os.getenv("SAMPLE_DATA_REPO", "").strip()
    file_name = os.getenv("SAMPLE_DATA_FILE", "sample_statcast.parquet").strip()
    if HF_HUB_OK and repo_id:
        try:
            path = hf_hub_download(repo_id=repo_id, filename=file_name, repo_type="dataset")
            return pd.read_parquet(path)
        except Exception:
            pass  # download/read failure — degrade to the empty schema below
    # Empty frame with the full expected schema (matches the scouting-card
    # columns used later in the app).
    return pd.DataFrame(
        columns=[
            "game_date",
            "player_name",
            "pitch_type",
            "p_throws",
            "n",
            "velo",
            "ivb_in",
            "hb_as_in",
            "csw",
            "whiff_rate",
            "gb_rate",
            "zone_pct",
            "cluster",
            "cluster_name",
            "x_mvt",
            "y_mvt",
        ]
    )
def safe_load_data(start: str, end: str, force: bool) -> pd.DataFrame:
    """Load live Statcast data, falling back to the bundled sample.

    Any failure (or an empty live result) is reported in the UI and the
    sample dataset is returned instead, so the app always has a frame.
    """
    try:
        live = load_statcast_cached(start, end, force)
    except Exception as e:
        st.warning(f"Live data failed: {e}\nUsing sample data instead.")
    else:
        if live is not None and not live.empty:
            return live
        st.info("No live data returned for that window — showing sample data instead.")
    return load_sample_fallback()
# ---- Sidebar: date window, cluster count, and cache-bypass controls ----
with st.sidebar:
    st.header("Data Window")
    dstart, dend = default_window()
    start = st.text_input("Start YYYY-MM-DD", dstart)
    end = st.text_input("End YYYY-MM-DD", dend)
    k = st.slider("Clusters (k)", 5, 40, 25)
    force = st.checkbox("Force re-download (discouraged on Spaces)", value=False)
    st.caption("Tip: avoid 'Force re-download' on Spaces to keep startup snappy.")
# Load pitch-level data (live with sample fallback) and attach batter names.
with st.spinner("Loading data…"):
    df_raw = safe_load_data(start, end, force)
    df_raw = ensure_batter_names(df_raw)
# Both live and sample sources were empty: explain how to supply a sample
# dataset, then halt the script so downstream code never sees an empty frame.
if df_raw.empty:
    st.warning(
        "No data available (live and sample were both empty). "
        "Upload a small sample file to ./data/sample_statcast.parquet or set "
        "env vars SAMPLE_DATA_REPO + SAMPLE_DATA_FILE to a HF dataset."
    )
    st.stop()
@st.cache_data(show_spinner=False)
def _featurize(df_raw_in: pd.DataFrame):
    """Engineer pitch-level features (cached across reruns).

    The IVB sign convention is inferred from the raw data itself and fed
    into feature engineering.
    """
    return engineer_pitch_features(df_raw_in, infer_ivb_sign(df_raw_in))
# Cached feature engineering over the raw pitch data.
df_feat = _featurize(df_raw)
# ✅ Cache the fitted artifacts from the older API
@st.cache_resource(show_spinner=False)
def _fit_model(df_feat_in: pd.DataFrame, k_val: int):
    """Cluster the featurized pitches and label each cluster.

    Returns the fitted frame (with readable `cluster_name` attached) plus
    the scaler, k-means model, and nearest-neighbor index from fit_kmeans.
    Cached as a resource so the fitted artifacts survive reruns.
    """
    fitted, scaler, km, nn = fit_kmeans(df_feat_in, k=k_val)
    # Map numeric cluster ids to human-readable tags on a fresh copy.
    tags = xy_cluster_tags(fitted)
    fitted = fitted.assign(cluster_name=fitted["cluster"].map(tags))
    return fitted, scaler, km, nn
# Fit (or retrieve cached) clustering artifacts for the chosen k.
with st.spinner("Clustering & tagging…"):
    df_fit, scaler, km, nn = _fit_model(df_feat, k)

# Pitcher picker drives every tab; df_p is that pitcher's arsenal.
pitcher = st.selectbox("Pitcher", sorted(df_fit["player_name"].dropna().unique()))
df_p = df_fit[df_fit["player_name"] == pitcher].sort_values("pitch_type")

tab1, tab2, tab3, tab4 = st.tabs(["Movement", "Scouting Card", "Comps", "Best Matchups"])
with tab1:
    # Movement plot: either the selected pitcher alone, or every pitcher
    # colored by cluster for league-wide context.
    view = st.radio("View", ["Selected pitcher", "All pitchers"], horizontal=True)
    if view == "All pitchers":
        st.subheader("Movement — All pitchers (cluster context)")
        fig = movement_scatter_xy(df_fit, color="cluster_name")
    else:
        st.subheader(f"Movement — {pitcher}")
        fig = movement_scatter_xy(df_p, color="pitch_type")
    st.plotly_chart(fig, use_container_width=True)
with tab2:
    st.subheader(f"Scouting Card — {pitcher}")
    # Summary table of the arsenal, then one quality radar per pitch type.
    card_cols = [
        "pitch_type",
        "p_throws",
        "n",
        "velo",
        "ivb_in",
        "hb_as_in",
        "csw",
        "whiff_rate",
        "gb_rate",
        "zone_pct",
        "cluster_name",
    ]
    st.dataframe(df_p[card_cols], use_container_width=True)
    for _, pitch_row in df_p.iterrows():
        st.markdown(f"### {pitch_row['pitch_type']} — {pitch_row['cluster_name']}")
        st.plotly_chart(radar_quality(pitch_row), use_container_width=True)
with tab3:
    # Nearest-neighbor comparables for each pitch in the arsenal.
    # ⬇️ Old signature again
    for _, pitch_row in df_p.iterrows():
        st.markdown(f"#### {pitch_row['pitch_type']} comps")
        st.dataframe(
            nearest_comps(pitch_row, df_fit, scaler, nn, within_pitch_type=True, k=6),
            use_container_width=True,
        )
with tab4:
    st.subheader(f"Best Matchups — {pitcher}")
    # Controls: sample-size floor, list length, and the two scoring weights.
    col_min, col_top, col_ww, col_wg = st.columns([1, 1, 1, 2])
    with col_min:
        min_pitches = st.number_input("Min pitches vs batter", 5, 200, 10, step=5)
    with col_top:
        top_n = st.number_input("Top N", 5, 50, 10, step=5)
    with col_ww:
        w_whiff = st.slider("Weight: Whiff", 0.0, 1.0, 0.6, 0.05)
    with col_wg:
        w_gb = st.slider("Weight: GB on contact", 0.0, 1.0, 0.4, 0.05)

    # Normalize the weights to sum to 1; the epsilon guards both sliders at 0.
    weight_sum = max(w_whiff + w_gb, 1e-6)
    w_whiff, w_gb = w_whiff / weight_sum, w_gb / weight_sum

    # Rank batters by the weighted whiff / ground-ball score.
    best, worst = best_matchups_for_pitcher(
        df_raw,
        pitcher,
        min_pitches=min_pitches,
        top_n=int(top_n),
        w_whiff=float(w_whiff),
        w_gb=float(w_gb),
    )

    if best.empty and worst.empty:
        st.info(
            "No batter matchups for this pitcher within the current window / filters."
        )
    else:
        c1, c2 = st.columns(2)
        with c1:
            st.markdown("### ✅ Best (Pitcher-Friendly)")
            st.dataframe(best, use_container_width=True)
        with c2:
            st.markdown("### ⚠️ Tough (Least Pitcher-Friendly)")
            st.dataframe(worst, use_container_width=True)
    st.caption(
        "Score = w_whiff × whiff_rate + w_gb × ground-ball-rate-on-contact. "
        "Adjust weights to emphasize strikeouts vs. weak contact."
    )