pitch_dash / app.py
rsm-roguchi
boah
cbe015e
# app.py
import os, sys
from datetime import datetime
BASE_DIR = os.path.dirname(__file__)
sys.path.append(os.path.join(BASE_DIR, "src"))
import streamlit as st
import pandas as pd
from data import load_statcast, default_window
from featurize import infer_ivb_sign, engineer_pitch_features
# ⬇️ Revert to older API
from model import fit_kmeans, nearest_comps
from tags import xy_cluster_tags
from plots import movement_scatter_xy, radar_quality
from matchups import best_matchups_for_pitcher, ensure_batter_names
# Optional dependency: the Hugging Face Hub client is only used to fetch the
# sample dataset remotely. Any failure while importing it simply switches that
# code path off via the HF_HUB_OK flag.
try:
    from huggingface_hub import hf_hub_download
except Exception:
    HF_HUB_OK = False
else:
    HF_HUB_OK = True

# Page-level configuration and headline.
st.set_page_config(page_title="PitchXY (Handedness-Aware)", layout="wide")
st.title("⚾ PitchXY — Handedness-Aware Pitch Archetypes & Scouting Cards")
@st.cache_data(show_spinner=False, ttl=24 * 3600)
def load_statcast_cached(start: str, end: str, force: bool = False) -> pd.DataFrame:
    """Fetch Statcast rows for a date window through Streamlit's data cache.

    Thin wrapper around ``load_statcast``; the decorator keeps results for
    24 hours (``ttl`` is given in seconds).

    Args:
        start: Window start, passed straight through to ``load_statcast``.
        end: Window end, passed straight through to ``load_statcast``.
        force: Forwarded as the ``force`` keyword of ``load_statcast``.

    Returns:
        Whatever ``load_statcast`` returns for the given window.
    """
    statcast_frame = load_statcast(start, end, force=force)
    return statcast_frame
@st.cache_data(show_spinner=False)
def load_sample_fallback() -> pd.DataFrame:
    """Return sample data from local disk, a Hugging Face dataset, or nothing.

    Resolution order:
      1. ``<BASE_DIR>/data/sample_statcast.parquet`` when that file exists.
      2. The file named by env var ``SAMPLE_DATA_FILE`` (default
         ``sample_statcast.parquet``) inside the dataset repo named by env var
         ``SAMPLE_DATA_REPO``, provided ``huggingface_hub`` imported and the
         repo name is non-blank.
      3. Otherwise an empty frame that only carries the column names below.
    """
    bundled_file = os.path.join(BASE_DIR, "data", "sample_statcast.parquet")
    if os.path.exists(bundled_file):
        return pd.read_parquet(bundled_file)

    hf_repo = os.getenv("SAMPLE_DATA_REPO", "").strip()
    hf_file = os.getenv("SAMPLE_DATA_FILE", "sample_statcast.parquet").strip()

    if not (HF_HUB_OK and hf_repo):
        # Nothing local and no usable remote source: hand back a zero-row frame.
        placeholder_columns = [
            "game_date",
            "player_name",
            "pitch_type",
            "p_throws",
            "n",
            "velo",
            "ivb_in",
            "hb_as_in",
            "csw",
            "whiff_rate",
            "gb_rate",
            "zone_pct",
            "cluster",
            "cluster_name",
            "x_mvt",
            "y_mvt",
        ]
        return pd.DataFrame(columns=placeholder_columns)

    downloaded_file = hf_hub_download(
        repo_id=hf_repo, filename=hf_file, repo_type="dataset"
    )
    return pd.read_parquet(downloaded_file)
def safe_load_data(start: str, end: str, force: bool) -> pd.DataFrame:
    """Load live data for the window, dropping back to sample data on trouble.

    A non-empty live frame is returned as-is. If the live loader returns
    ``None`` or an empty frame, an info banner is shown; if it raises, a
    warning banner with the error text is shown. In both cases the result of
    ``load_sample_fallback()`` is returned instead.
    """
    try:
        live_frame = load_statcast_cached(start, end, force)
        has_rows = live_frame is not None and not live_frame.empty
        if has_rows:
            return live_frame
        st.info("No live data returned for that window — showing sample data instead.")
    except Exception as err:
        st.warning(f"Live data failed: {err}\nUsing sample data instead.")

    return load_sample_fallback()
# Sidebar controls: date window, cluster count and the re-download switch.
with st.sidebar:
    st.header("Data Window")
    dstart, dend = default_window()
    start = st.text_input("Start YYYY-MM-DD", dstart)
    end = st.text_input("End YYYY-MM-DD", dend)
    k = st.slider("Clusters (k)", 5, 40, 25)
    force = st.checkbox("Force re-download (discouraged on Spaces)", value=False)
    st.caption("Tip: avoid 'Force re-download' on Spaces to keep startup snappy.")

with st.spinner("Loading data…"):
    df_raw = safe_load_data(start, end, force)
    # Enrich with batter names only when there are rows to enrich. The empty
    # fallback frame built by load_sample_fallback() has no batter columns, so
    # calling the helper on it risks failing before the guidance below is shown.
    if not df_raw.empty:
        df_raw = ensure_batter_names(df_raw)

# Nothing to work with: explain how to supply sample data and end this run.
if df_raw.empty:
    st.warning(
        "No data available (live and sample were both empty). "
        "Upload a small sample file to ./data/sample_statcast.parquet or set "
        "env vars SAMPLE_DATA_REPO + SAMPLE_DATA_FILE to a HF dataset."
    )
    st.stop()
@st.cache_data(show_spinner=False)
def _featurize(df_raw_in: pd.DataFrame):
    """Build the per-pitch feature table from raw rows (cached by Streamlit).

    The IVB sign is inferred from the raw frame first and then handed to
    ``engineer_pitch_features`` together with that same frame.
    """
    return engineer_pitch_features(df_raw_in, infer_ivb_sign(df_raw_in))


df_feat = _featurize(df_raw)
# Fitted artifacts from the older model API are kept in Streamlit's resource cache.
@st.cache_resource(show_spinner=False)
def _fit_model(df_feat_in: pd.DataFrame, k_val: int):
    """Fit the k-means model and attach a readable name to every cluster.

    Args:
        df_feat_in: Feature table handed to ``fit_kmeans``.
        k_val: Number of clusters, forwarded as ``k``.

    Returns:
        A 4-tuple ``(df_fit, scaler, km, nn)``: a copy of the fitted frame with
        an added ``cluster_name`` column, followed by the three objects
        returned by ``fit_kmeans`` unchanged.
    """
    fitted, scaler, km, nn = fit_kmeans(df_feat_in, k=k_val)

    # Tags are derived from the fitted frame before it is copied.
    names_by_cluster = xy_cluster_tags(fitted)

    # Work on a copy so the frame returned by fit_kmeans is left untouched.
    labelled = fitted.copy()
    labelled["cluster_name"] = labelled["cluster"].map(names_by_cluster)
    return labelled, scaler, km, nn
# Fit (or fetch from cache) the clustering for the chosen k.
with st.spinner("Clustering & tagging…"):
    df_fit, scaler, km, nn = _fit_model(df_feat, k)

# Pitcher picker: unique, non-null names in sorted order.
_pitcher_options = sorted(df_fit["player_name"].dropna().unique())
pitcher = st.selectbox("Pitcher", _pitcher_options)

# Rows belonging to the selected pitcher, ordered by pitch type.
_is_selected_pitcher = df_fit["player_name"] == pitcher
df_p = df_fit[_is_selected_pitcher].sort_values("pitch_type")

_tab_titles = ["Movement", "Scouting Card", "Comps", "Best Matchups"]
tab1, tab2, tab3, tab4 = st.tabs(_tab_titles)
with tab1:
    view = st.radio("View", ["Selected pitcher", "All pitchers"], horizontal=True)

    # Choose heading, data and colour key first; the drawing calls are shared.
    if view == "Selected pitcher":
        _heading = f"Movement — {pitcher}"
        _plot_frame, _color_key = df_p, "pitch_type"
    else:
        _heading = "Movement — All pitchers (cluster context)"
        _plot_frame, _color_key = df_fit, "cluster_name"

    st.subheader(_heading)
    st.plotly_chart(
        movement_scatter_xy(_plot_frame, color=_color_key),
        use_container_width=True,
    )
with tab2:
    st.subheader(f"Scouting Card — {pitcher}")

    # Summary table: one row per pitch type for the selected pitcher.
    card_columns = [
        "pitch_type",
        "p_throws",
        "n",
        "velo",
        "ivb_in",
        "hb_as_in",
        "csw",
        "whiff_rate",
        "gb_rate",
        "zone_pct",
        "cluster_name",
    ]
    st.dataframe(df_p[card_columns], use_container_width=True)

    # One radar chart per pitch, headed by "<pitch_type> — <cluster_name>".
    for _, row in df_p.iterrows():
        heading = f"### {row['pitch_type']}"
        cluster_label = row["cluster_name"]
        # cluster_name comes from a .map() lookup in _fit_model and can be
        # missing; leave it out rather than rendering "nan" in the heading.
        if pd.notna(cluster_label):
            heading += f" — {cluster_label}"
        st.markdown(heading)
        st.plotly_chart(radar_quality(row), use_container_width=True)
with tab3:
    # One comparables table per pitch in the selected pitcher's arsenal.
    for _, pitch_row in df_p.iterrows():
        st.markdown(f"#### {pitch_row['pitch_type']} comps")
        # Older nearest_comps signature: row, fitted frame, scaler, neighbour index.
        st.dataframe(
            nearest_comps(
                pitch_row, df_fit, scaler, nn, within_pitch_type=True, k=6
            ),
            use_container_width=True,
        )
with tab4:
    st.subheader(f"Best Matchups — {pitcher}")

    # Filter and weighting widgets share one row; the last column is double width.
    min_col, top_col, whiff_col, gb_col = st.columns([1, 1, 1, 2])
    with min_col:
        min_pitches = st.number_input("Min pitches vs batter", 5, 200, 10, step=5)
    with top_col:
        top_n = st.number_input("Top N", 5, 50, 10, step=5)
    with whiff_col:
        w_whiff = st.slider("Weight: Whiff", 0.0, 1.0, 0.6, 0.05)
    with gb_col:
        w_gb = st.slider("Weight: GB on contact", 0.0, 1.0, 0.4, 0.05)

    # Rescale the two weights by their sum; the floor avoids dividing by zero
    # when both sliders sit at 0.
    weight_sum = max(w_whiff + w_gb, 1e-6)
    w_whiff, w_gb = w_whiff / weight_sum, w_gb / weight_sum

    # Rank batters for this pitcher from the raw rows.
    best, worst = best_matchups_for_pitcher(
        df_raw,
        pitcher,
        min_pitches=min_pitches,
        top_n=int(top_n),
        w_whiff=float(w_whiff),
        w_gb=float(w_gb),
    )

    if not (best.empty and worst.empty):
        best_col, tough_col = st.columns(2)
        with best_col:
            st.markdown("### ✅ Best (Pitcher-Friendly)")
            st.dataframe(best, use_container_width=True)
        with tough_col:
            st.markdown("### ⚠️ Tough (Least Pitcher-Friendly)")
            st.dataframe(worst, use_container_width=True)
    else:
        st.info(
            "No batter matchups for this pitcher within the current window / filters."
        )

    st.caption(
        "Score = w_whiff × whiff_rate + w_gb × ground-ball-rate-on-contact. "
        "Adjust weights to emphasize strikeouts vs. weak contact."
    )