# app.py
#
# PitchXY — Streamlit app for handedness-aware pitch archetypes and scouting
# cards. Loads Statcast data (live, cached, or bundled sample), engineers
# pitch features, clusters them with k-means, and renders four tabs:
# movement plots, scouting cards, nearest comps, and pitcher-vs-batter
# matchup rankings.
import os
import sys
from datetime import datetime

# Make the local ./src package importable before the project imports below.
BASE_DIR = os.path.dirname(__file__)
sys.path.append(os.path.join(BASE_DIR, "src"))

import streamlit as st
import pandas as pd

from data import load_statcast, default_window
from featurize import infer_ivb_sign, engineer_pitch_features
# ⬇️ Revert to older API
from model import fit_kmeans, nearest_comps
from tags import xy_cluster_tags
from plots import movement_scatter_xy, radar_quality
from matchups import best_matchups_for_pitcher, ensure_batter_names

# huggingface_hub is optional: only needed for the remote sample-data fallback.
try:
    from huggingface_hub import hf_hub_download
    HF_HUB_OK = True
except Exception:
    HF_HUB_OK = False

st.set_page_config(page_title="PitchXY (Handedness-Aware)", layout="wide")
st.title("⚾ PitchXY — Handedness-Aware Pitch Archetypes & Scouting Cards")


@st.cache_data(show_spinner=False, ttl=24 * 3600)
def load_statcast_cached(start: str, end: str, force: bool = False) -> pd.DataFrame:
    """Cached (24h TTL) wrapper around load_statcast for the given date window."""
    return load_statcast(start, end, force=force)


@st.cache_data(show_spinner=False)
def load_sample_fallback() -> pd.DataFrame:
    """Return sample data: local parquet, then a HF dataset, else an empty frame.

    Lookup order:
      1. ./data/sample_statcast.parquet next to this file;
      2. a Hugging Face dataset named by SAMPLE_DATA_REPO / SAMPLE_DATA_FILE
         env vars (only if huggingface_hub imported successfully);
      3. an empty DataFrame carrying the expected column schema so downstream
         code can still select columns without KeyErrors.
    """
    local_path = os.path.join(BASE_DIR, "data", "sample_statcast.parquet")
    if os.path.exists(local_path):
        return pd.read_parquet(local_path)
    repo_id = os.getenv("SAMPLE_DATA_REPO", "").strip()
    file_name = os.getenv("SAMPLE_DATA_FILE", "sample_statcast.parquet").strip()
    if HF_HUB_OK and repo_id:
        path = hf_hub_download(repo_id=repo_id, filename=file_name, repo_type="dataset")
        return pd.read_parquet(path)
    return pd.DataFrame(
        columns=[
            "game_date",
            "player_name",
            "pitch_type",
            "p_throws",
            "n",
            "velo",
            "ivb_in",
            "hb_as_in",
            "csw",
            "whiff_rate",
            "gb_rate",
            "zone_pct",
            "cluster",
            "cluster_name",
            "x_mvt",
            "y_mvt",
        ]
    )


def safe_load_data(start: str, end: str, force: bool) -> pd.DataFrame:
    """Try live Statcast for [start, end]; fall back to sample data on error/empty."""
    try:
        df = load_statcast_cached(start, end, force)
        if df is not None and not df.empty:
            return df
        st.info("No live data returned for that window — showing sample data instead.")
    except Exception as e:
        # Best-effort: surface the failure in the UI but keep the app usable.
        st.warning(f"Live data failed: {e}\nUsing sample data instead.")
    return load_sample_fallback()


with st.sidebar:
    st.header("Data Window")
    dstart, dend = default_window()
    start = st.text_input("Start YYYY-MM-DD", dstart)
    end = st.text_input("End YYYY-MM-DD", dend)
    k = st.slider("Clusters (k)", 5, 40, 25)
    force = st.checkbox("Force re-download (discouraged on Spaces)", value=False)
    st.caption("Tip: avoid 'Force re-download' on Spaces to keep startup snappy.")

with st.spinner("Loading data…"):
    df_raw = safe_load_data(start, end, force)
    df_raw = ensure_batter_names(df_raw)

if df_raw.empty:
    st.warning(
        "No data available (live and sample were both empty). "
        "Upload a small sample file to ./data/sample_statcast.parquet or set "
        "env vars SAMPLE_DATA_REPO + SAMPLE_DATA_FILE to a HF dataset."
    )
    st.stop()


@st.cache_data(show_spinner=False)
def _featurize(df_raw_in: pd.DataFrame):
    """Infer the IVB sign convention and engineer per-pitch features (cached)."""
    ivb_sign = infer_ivb_sign(df_raw_in)
    df_feat_local = engineer_pitch_features(df_raw_in, ivb_sign)
    return df_feat_local


df_feat = _featurize(df_raw)


# ✅ Cache the fitted artifacts from the older API
@st.cache_resource(show_spinner=False)
def _fit_model(df_feat_in: pd.DataFrame, k_val: int):
    """Fit k-means on the feature frame and attach readable cluster names.

    Returns (df_fit, scaler, km, nn) as produced by the older fit_kmeans API,
    with a new 'cluster_name' column mapped from xy_cluster_tags.
    """
    df_fit_local, scaler, km, nn = fit_kmeans(df_feat_in, k=k_val)
    # Tag clusters with readable names
    cluster_names_local = xy_cluster_tags(df_fit_local)
    df_fit_local = df_fit_local.copy()
    df_fit_local["cluster_name"] = df_fit_local["cluster"].map(cluster_names_local)
    return df_fit_local, scaler, km, nn


with st.spinner("Clustering & tagging…"):
    df_fit, scaler, km, nn = _fit_model(df_feat, k)

pitcher = st.selectbox("Pitcher", sorted(df_fit["player_name"].dropna().unique()))
df_p = df_fit[df_fit["player_name"] == pitcher].sort_values("pitch_type")

tab1, tab2, tab3, tab4 = st.tabs(["Movement", "Scouting Card", "Comps", "Best Matchups"])

with tab1:
    view = st.radio("View", ["Selected pitcher", "All pitchers"], horizontal=True)
    if view == "Selected pitcher":
        st.subheader(f"Movement — {pitcher}")
        st.plotly_chart(
            movement_scatter_xy(df_p, color="pitch_type"), use_container_width=True
        )
    else:
        st.subheader("Movement — All pitchers (cluster context)")
        st.plotly_chart(
            movement_scatter_xy(df_fit, color="cluster_name"), use_container_width=True
        )

with tab2:
    st.subheader(f"Scouting Card — {pitcher}")
    st.dataframe(
        df_p[
            [
                "pitch_type",
                "p_throws",
                "n",
                "velo",
                "ivb_in",
                "hb_as_in",
                "csw",
                "whiff_rate",
                "gb_rate",
                "zone_pct",
                "cluster_name",
            ]
        ],
        use_container_width=True,
    )
    # One radar chart per pitch type in the selected pitcher's arsenal.
    for _, row in df_p.iterrows():
        st.markdown(f"### {row['pitch_type']} — {row['cluster_name']}")
        st.plotly_chart(radar_quality(row), use_container_width=True)

with tab3:
    for _, row in df_p.iterrows():
        st.markdown(f"#### {row['pitch_type']} comps")
        # ⬇️ Old signature again
        comps = nearest_comps(row, df_fit, scaler, nn, within_pitch_type=True, k=6)
        st.dataframe(comps, use_container_width=True)

with tab4:
    st.subheader(f"Best Matchups — {pitcher}")

    # Controls
    colA, colB, colC, colD = st.columns([1, 1, 1, 2])
    with colA:
        min_pitches = st.number_input("Min pitches vs batter", 5, 200, 10, step=5)
    with colB:
        top_n = st.number_input("Top N", 5, 50, 10, step=5)
    with colC:
        w_whiff = st.slider("Weight: Whiff", 0.0, 1.0, 0.6, 0.05)
    with colD:
        w_gb = st.slider("Weight: GB on contact", 0.0, 1.0, 0.4, 0.05)

    # Normalize weights (optional)
    total_w = max(w_whiff + w_gb, 1e-6)
    w_whiff /= total_w
    w_gb /= total_w

    # Compute
    best, worst = best_matchups_for_pitcher(
        df_raw,
        pitcher,
        min_pitches=min_pitches,
        top_n=int(top_n),
        w_whiff=float(w_whiff),
        w_gb=float(w_gb),
    )

    if best.empty and worst.empty:
        st.info(
            "No batter matchups for this pitcher within the current window / filters."
        )
    else:
        c1, c2 = st.columns(2)
        with c1:
            st.markdown("### ✅ Best (Pitcher-Friendly)")
            st.dataframe(best, use_container_width=True)
        with c2:
            st.markdown("### ⚠️ Tough (Least Pitcher-Friendly)")
            st.dataframe(worst, use_container_width=True)

    st.caption(
        "Score = w_whiff × whiff_rate + w_gb × ground-ball-rate-on-contact. "
        "Adjust weights to emphasize strikeouts vs. weak contact."
    )