Spaces:

roguchi
/

pitch_dash

Sleeping

File size: 7,076 Bytes

# app.py
import os, sys
from datetime import datetime

# Directory containing this script. Its "src" sub-folder is appended to the
# import path — presumably that is where the local modules imported further
# down (data, featurize, model, ...) live; confirm against the repo layout.
BASE_DIR = os.path.dirname(__file__)
_SRC_DIR = os.path.join(BASE_DIR, "src")
sys.path.append(_SRC_DIR)

import streamlit as st
import pandas as pd

from data import load_statcast, default_window
from featurize import infer_ivb_sign, engineer_pitch_features

# ⬇️ Reverted to the older model API (fit_kmeans / nearest_comps)
from model import fit_kmeans, nearest_comps
from tags import xy_cluster_tags
from plots import movement_scatter_xy, radar_quality
from matchups import best_matchups_for_pitcher, ensure_batter_names

# Optional dependency: HF_HUB_OK records whether huggingface_hub imported
# cleanly, so later code can check the flag before calling hf_hub_download.
try:
    from huggingface_hub import hf_hub_download
except Exception:
    HF_HUB_OK = False
else:
    HF_HUB_OK = True

# Page-level configuration (page title, wide layout) followed by the on-page heading.
# NOTE(review): Streamlit has historically required set_page_config to run before
# any other st.* command — confirm before adding st.* calls above this line.
st.set_page_config(page_title="PitchXY (Handedness-Aware)", layout="wide")
st.title("⚾ PitchXY — Handedness-Aware Pitch Archetypes & Scouting Cards")


@st.cache_data(show_spinner=False, ttl=24 * 3600)
def load_statcast_cached(start: str, end: str, force: bool = False) -> pd.DataFrame:
    """Fetch Statcast rows for a date window through Streamlit's data cache.

    Thin wrapper around ``load_statcast``. The cache decorator is configured
    with ``ttl=24 * 3600`` and with the built-in spinner disabled.

    Args:
        start: Window start, forwarded unchanged to ``load_statcast``.
        end: Window end, forwarded unchanged to ``load_statcast``.
        force: Forwarded as the ``force`` keyword of ``load_statcast``.

    Returns:
        Whatever ``load_statcast`` returns (annotated as a DataFrame).
    """
    statcast_frame = load_statcast(start, end, force=force)
    return statcast_frame


@st.cache_data(show_spinner=False)
def load_sample_fallback() -> pd.DataFrame:
    """Return sample data to use when live data is unavailable.

    Sources are tried in this order:

    1. ``<BASE_DIR>/data/sample_statcast.parquet`` if that file exists.
    2. A parquet file downloaded with ``hf_hub_download`` from the dataset
       repo named by the ``SAMPLE_DATA_REPO`` environment variable (file name
       from ``SAMPLE_DATA_FILE``, default ``sample_statcast.parquet``). This
       is only attempted when ``HF_HUB_OK`` is true and the repo id is
       non-empty after stripping whitespace.
    3. Otherwise an empty DataFrame that carries only column names.

    Returns:
        The parquet contents, or the empty placeholder frame.
    """
    bundled_file = os.path.join(BASE_DIR, "data", "sample_statcast.parquet")
    if os.path.exists(bundled_file):
        return pd.read_parquet(bundled_file)

    hub_repo = os.getenv("SAMPLE_DATA_REPO", "").strip()
    hub_file = os.getenv("SAMPLE_DATA_FILE", "sample_statcast.parquet").strip()

    if not (HF_HUB_OK and hub_repo):
        # Nothing to read from: hand back an empty frame with named columns.
        placeholder_columns = (
            "game_date",
            "player_name",
            "pitch_type",
            "p_throws",
            "n",
            "velo",
            "ivb_in",
            "hb_as_in",
            "csw",
            "whiff_rate",
            "gb_rate",
            "zone_pct",
            "cluster",
            "cluster_name",
            "x_mvt",
            "y_mvt",
        )
        return pd.DataFrame(columns=list(placeholder_columns))

    downloaded_file = hf_hub_download(
        repo_id=hub_repo, filename=hub_file, repo_type="dataset"
    )
    return pd.read_parquet(downloaded_file)


def safe_load_data(start: str, end: str, force: bool) -> pd.DataFrame:
    """Load live data for the window, falling back to sample data on any failure.

    The live loader is tried first. If it raises, or returns ``None`` or an
    empty frame, a notice is shown and the sample loader is used instead.
    The sample loader is guarded as well: it may read a parquet file or
    download from the Hub, and either can fail. On such a failure a warning
    is shown and an empty DataFrame is returned, so the caller can handle it
    through its ``.empty`` check rather than the app dying with a traceback.

    Args:
        start: Window start, passed to ``load_statcast_cached``.
        end: Window end, passed to ``load_statcast_cached``.
        force: Passed to ``load_statcast_cached`` as its ``force`` argument.

    Returns:
        The live frame when it is non-empty, otherwise the sample frame, or
        an empty DataFrame when the sample source also failed.
    """
    try:
        live = load_statcast_cached(start, end, force)
    except Exception as e:
        # Deliberately broad: any live-data failure should degrade to sample data.
        st.warning(f"Live data failed: {e}\nUsing sample data instead.")
    else:
        if live is not None and not live.empty:
            return live
        st.info("No live data returned for that window — showing sample data instead.")

    try:
        return load_sample_fallback()
    except Exception as e:
        # The exception leaves the cached loader, so nothing is stored and the
        # next rerun retries the sample source.
        st.warning(f"Sample data failed to load: {e}")
        return pd.DataFrame()


# Sidebar inputs: date window, number of clusters, and the re-download toggle.
with st.sidebar:
    st.header("Data Window")
    dstart, dend = default_window()
    start = st.text_input("Start YYYY-MM-DD", value=dstart)
    end = st.text_input("End YYYY-MM-DD", value=dend)
    k = st.slider("Clusters (k)", min_value=5, max_value=40, value=25)
    force = st.checkbox("Force re-download (discouraged on Spaces)", value=False)
    st.caption("Tip: avoid 'Force re-download' on Spaces to keep startup snappy.")

# Load the raw frame (live or sample) and pass it through ensure_batter_names.
with st.spinner("Loading data…"):
    df_raw = ensure_batter_names(safe_load_data(start, end, force))

# Nothing to work with: explain how to supply sample data and halt this run.
if df_raw.empty:
    no_data_message = (
        "No data available (live and sample were both empty). "
        "Upload a small sample file to ./data/sample_statcast.parquet or set "
        "env vars SAMPLE_DATA_REPO + SAMPLE_DATA_FILE to a HF dataset."
    )
    st.warning(no_data_message)
    st.stop()


@st.cache_data(show_spinner=False)
def _featurize(df_raw_in: pd.DataFrame):
    """Build the engineered feature frame from raw pitch data (cached).

    Calls ``infer_ivb_sign`` on the raw frame and passes its result, together
    with the raw frame, to ``engineer_pitch_features``.

    Args:
        df_raw_in: Raw input frame.

    Returns:
        The value returned by ``engineer_pitch_features``.
    """
    return engineer_pitch_features(df_raw_in, infer_ivb_sign(df_raw_in))


# Feature frame derived from the loaded raw data; consumed by the model fit below.
df_feat = _featurize(df_raw)


# ✅ Fitted artifacts from the older API are held in Streamlit's resource cache
@st.cache_resource(show_spinner=False)
def _fit_model(df_feat_in: pd.DataFrame, k_val: int):
    """Fit the clustering model and label each row with a cluster name.

    Args:
        df_feat_in: Feature frame handed to ``fit_kmeans``.
        k_val: Forwarded to ``fit_kmeans`` as ``k``.

    Returns:
        A 4-tuple ``(df_fit, scaler, km, nn)``. ``df_fit`` is a copy of the
        frame returned by ``fit_kmeans`` with an added ``cluster_name``
        column, produced by mapping its ``cluster`` column through the
        result of ``xy_cluster_tags``. The other three items are returned
        exactly as ``fit_kmeans`` produced them.
    """
    clustered, scaler, km, nn = fit_kmeans(df_feat_in, k=k_val)
    # Readable names for the clusters, looked up per row below.
    names_by_cluster = xy_cluster_tags(clustered)
    tagged = clustered.assign(
        cluster_name=clustered["cluster"].map(names_by_cluster)
    )
    return tagged, scaler, km, nn


# Fit (or fetch from cache) the clustering artifacts for the chosen k.
with st.spinner("Clustering & tagging…"):
    df_fit, scaler, km, nn = _fit_model(df_feat, k)

# Pitcher picker: sorted distinct non-null names; df_p holds that pitcher's rows.
pitcher_options = sorted(df_fit["player_name"].dropna().unique())
pitcher = st.selectbox("Pitcher", pitcher_options)
is_selected_pitcher = df_fit["player_name"] == pitcher
df_p = df_fit.loc[is_selected_pitcher].sort_values("pitch_type")

tab_labels = ["Movement", "Scouting Card", "Comps", "Best Matchups"]
tab1, tab2, tab3, tab4 = st.tabs(tab_labels)

with tab1:
    view = st.radio("View", ["Selected pitcher", "All pitchers"], horizontal=True)
    # Choose heading, data and colour column first, then render once.
    if view == "Selected pitcher":
        movement_heading = f"Movement — {pitcher}"
        movement_frame = df_p
        color_column = "pitch_type"
    else:
        movement_heading = "Movement — All pitchers (cluster context)"
        movement_frame = df_fit
        color_column = "cluster_name"
    st.subheader(movement_heading)
    movement_fig = movement_scatter_xy(movement_frame, color=color_column)
    st.plotly_chart(movement_fig, use_container_width=True)

with tab2:
    st.subheader(f"Scouting Card — {pitcher}")

    # Columns shown in the summary table, in display order.
    card_columns = [
        "pitch_type",
        "p_throws",
        "n",
        "velo",
        "ivb_in",
        "hb_as_in",
        "csw",
        "whiff_rate",
        "gb_rate",
        "zone_pct",
        "cluster_name",
    ]
    st.dataframe(df_p[card_columns], use_container_width=True)

    # One heading and radar chart per row of the selected pitcher's frame.
    for _, pitch_row in df_p.iterrows():
        st.markdown(f"### {pitch_row['pitch_type']} — {pitch_row['cluster_name']}")
        radar_fig = radar_quality(pitch_row)
        st.plotly_chart(radar_fig, use_container_width=True)

with tab3:
    # One comps table per row of the selected pitcher's frame.
    for _, pitch_row in df_p.iterrows():
        st.markdown(f"#### {pitch_row['pitch_type']} comps")
        # ⬇️ Called with the older nearest_comps signature (scaler + nn passed in)
        comps = nearest_comps(
            pitch_row,
            df_fit,
            scaler,
            nn,
            within_pitch_type=True,
            k=6,
        )
        st.dataframe(comps, use_container_width=True)

with tab4:
    st.subheader(f"Best Matchups — {pitcher}")

    # Four controls side by side; the last column is twice as wide.
    col_min, col_top, col_whiff, col_gb = st.columns([1, 1, 1, 2])
    with col_min:
        min_pitches = st.number_input(
            "Min pitches vs batter", min_value=5, max_value=200, value=10, step=5
        )
    with col_top:
        top_n = st.number_input("Top N", min_value=5, max_value=50, value=10, step=5)
    with col_whiff:
        w_whiff = st.slider(
            "Weight: Whiff", min_value=0.0, max_value=1.0, value=0.6, step=0.05
        )
    with col_gb:
        w_gb = st.slider(
            "Weight: GB on contact", min_value=0.0, max_value=1.0, value=0.4, step=0.05
        )

    # Rescale the two weights by their sum; the 1e-6 floor avoids dividing by
    # zero when both sliders sit at 0.
    total_w = max(w_whiff + w_gb, 1e-6)
    w_whiff, w_gb = w_whiff / total_w, w_gb / total_w

    # Rank batters for the selected pitcher using the raw frame.
    best, worst = best_matchups_for_pitcher(
        df_raw,
        pitcher,
        min_pitches=min_pitches,
        top_n=int(top_n),
        w_whiff=float(w_whiff),
        w_gb=float(w_gb),
    )

    has_matchups = not (best.empty and worst.empty)
    if has_matchups:
        c1, c2 = st.columns(2)
        with c1:
            st.markdown("### ✅ Best (Pitcher-Friendly)")
            st.dataframe(best, use_container_width=True)
        with c2:
            st.markdown("### ⚠️ Tough (Least Pitcher-Friendly)")
            st.dataframe(worst, use_container_width=True)
    else:
        st.info(
            "No batter matchups for this pitcher within the current window / filters."
        )

    st.caption(
        "Score = w_whiff × whiff_rate + w_gb × ground-ball-rate-on-contact. "
        "Adjust weights to emphasize strikeouts vs. weak contact."
    )