Spaces:

roguchi
/

pitch_dash

Sleeping

App Files Community

rsm-roguchi committed on Oct 30, 2025

Commit

c5c0f3e

1 Parent(s): 4c2852d

update

Browse files

Files changed (4) hide show

app.py +125 -13
build.ipynb +0 -0
requirements.txt +7 -3
src/tags.py +108 -18

app.py CHANGED Viewed

@@ -1,39 +1,151 @@
 import os, sys
-sys.path.append(os.path.join(os.path.dirname(__file__), "src"))
 import streamlit as st
 import pandas as pd
 from data import load_statcast, default_window
 from featurize import infer_ivb_sign, engineer_pitch_features
 from model import fit_kmeans, nearest_comps
 from tags import xy_cluster_tags
 from plots import movement_scatter_xy, radar_quality
 st.set_page_config(page_title="PitchXY (Handedness-Aware)", layout="wide")
 st.title("⚾ PitchXY — Handedness-Aware Pitch Archetypes & Scouting Cards")
 with st.sidebar:
     st.header("Data Window")
     dstart, dend = default_window()
     start = st.text_input("Start YYYY-MM-DD", dstart)
     end = st.text_input("End YYYY-MM-DD", dend)
     k = st.slider("Clusters (k)", 5, 12, 8)
-    force = st.checkbox("Force re-download", value=False)
-df_raw = load_statcast(start, end, force=force)
 if df_raw.empty:
-    st.warning("No data for that window.")
     st.stop()
-ivb_sign = infer_ivb_sign(df_raw)
-df_feat = engineer_pitch_features(df_raw, ivb_sign)
-df_fit, scaler, km, nn = fit_kmeans(df_feat, k=k)
-cluster_names = xy_cluster_tags(df_fit)
-df_fit["cluster_name"] = df_fit["cluster"].map(cluster_names)
-pitcher = st.selectbox("Pitcher", sorted(df_fit["player_name"].unique()))
 df_p = df_fit[df_fit["player_name"] == pitcher].sort_values("pitch_type")
 tab1, tab2, tab3 = st.tabs(["Movement", "Scouting Card", "Comps"])
@@ -51,7 +163,6 @@ with tab1:
             movement_scatter_xy(df_fit, color="cluster_name"), use_container_width=True
         )
 with tab2:
     st.subheader(f"Scouting Card — {pitcher}")
     st.dataframe(
@@ -69,7 +180,8 @@ with tab2:
                 "zone_pct",
                 "cluster_name",
             ]
-        ]
     )
     for _, row in df_p.iterrows():
         st.markdown(f"### {row['pitch_type']} — {row['cluster_name']}")
@@ -79,4 +191,4 @@ with tab3:
     for _, row in df_p.iterrows():
         st.markdown(f"#### {row['pitch_type']} comps")
         comps = nearest_comps(row, df_fit, scaler, nn, within_pitch_type=True, k=6)
-        st.dataframe(comps)

+# app.py
 import os, sys
+from datetime import datetime
+# Ensure we can import from ./src even on HF Spaces
+BASE_DIR = os.path.dirname(__file__)
+sys.path.append(os.path.join(BASE_DIR, "src"))
 import streamlit as st
 import pandas as pd
+# Your local modules
 from data import load_statcast, default_window
 from featurize import infer_ivb_sign, engineer_pitch_features
 from model import fit_kmeans, nearest_comps
 from tags import xy_cluster_tags
 from plots import movement_scatter_xy, radar_quality
+try:
+    from huggingface_hub import hf_hub_download
+    HF_HUB_OK = True
+except Exception:
+    HF_HUB_OK = False
 st.set_page_config(page_title="PitchXY (Handedness-Aware)", layout="wide")
 st.title("⚾ PitchXY — Handedness-Aware Pitch Archetypes & Scouting Cards")
+# ---- Helpers
+@st.cache_data(ttl=24 * 3600, show_spinner=False)
+def load_statcast_cached(start: str, end: str, force: bool = False) -> pd.DataFrame:
+    """Fetch a Statcast window through Streamlit's data cache.
+
+    Results are memoized for 24 hours per ``(start, end, force)`` combination,
+    so script reruns with unchanged inputs do not call the loader again.
+
+    NOTE(review): ``force`` is part of the cache key, so a repeated
+    ``force=True`` call inside the TTL is served from cache as well -- confirm
+    that this is the intended meaning of "force".
+    """
+    frame = load_statcast(start, end, force=force)
+    return frame
+@st.cache_data(show_spinner=False)
+def load_sample_fallback() -> pd.DataFrame:
+    """Return sample pitch data for when the live source yields nothing.
+
+    Sources are tried in this order:
+      1. ``data/sample_statcast.parquet`` under ``BASE_DIR``.
+      2. A file downloaded from a Hugging Face *dataset* repo named by the
+         ``SAMPLE_DATA_REPO`` / ``SAMPLE_DATA_FILE`` environment variables
+         (only when ``huggingface_hub`` imported and a repo id is set).
+      3. An empty frame carrying only the column headers listed below.
+    """
+    bundled = os.path.join(BASE_DIR, "data", "sample_statcast.parquet")
+    if os.path.exists(bundled):
+        return pd.read_parquet(bundled)
+    # Nothing bundled with the app: see whether a Hub dataset was configured.
+    hub_repo = os.getenv("SAMPLE_DATA_REPO", "").strip()
+    hub_file = os.getenv("SAMPLE_DATA_FILE", "sample_statcast.parquet").strip()
+    if HF_HUB_OK and hub_repo:
+        downloaded = hf_hub_download(
+            repo_id=hub_repo, filename=hub_file, repo_type="dataset"
+        )
+        return pd.read_parquet(downloaded)
+    # Last resort: a zero-row frame with named columns.
+    placeholder_columns = [
+        "game_date",
+        "player_name",
+        "pitch_type",
+        "p_throws",
+        "n",
+        "velo",
+        "ivb_in",
+        "hb_as_in",
+        "csw",
+        "whiff_rate",
+        "gb_rate",
+        "zone_pct",
+        "cluster",
+        "cluster_name",
+        "x_mvt",
+        "y_mvt",
+    ]
+    return pd.DataFrame(columns=placeholder_columns)
+def safe_load_data(start: str, end: str, force: bool) -> pd.DataFrame:
+    """Load the requested window, degrading to sample data instead of failing.
+
+    The cached live loader is tried first. A ``None`` or empty result shows an
+    info notice; any exception shows a warning that includes the error text.
+    In both cases the return value is whatever ``load_sample_fallback()`` gives.
+    """
+    try:
+        live = load_statcast_cached(start, end, force)
+        got_rows = live is not None and not live.empty
+        if got_rows:
+            return live
+        # An empty window is an expected outcome, not an error.
+        st.info("No live data returned for that window — showing sample data instead.")
+    except Exception as exc:
+        # Deliberately broad: any loader failure should leave the UI usable.
+        st.warning(f"Live data failed: {exc}\nUsing sample data instead.")
+    return load_sample_fallback()
+# ---- Sidebar
+# Collects the date window, cluster count and refresh flag used by the steps below.
 with st.sidebar:
     st.header("Data Window")
+    # Pre-fill both date boxes with the window returned by default_window().
     dstart, dend = default_window()
     start = st.text_input("Start YYYY-MM-DD", dstart)
     end = st.text_input("End YYYY-MM-DD", dend)
+    # k is selectable from 5 to 12 and starts at 8.
     k = st.slider("Clusters (k)", 5, 12, 8)
+    force = st.checkbox("Force re-download (discouraged on Spaces)", value=False)
+    st.caption("Tip: avoid 'Force re-download' on Spaces to keep startup snappy.")
+# ---- Data pipeline
+# safe_load_data falls back to sample data when the live load fails or is empty.
+with st.spinner("Loading data…"):
+    df_raw = safe_load_data(start, end, force)
+# Nothing to cluster or plot: explain how to supply sample data and halt this run.
 if df_raw.empty:
+    st.warning(
+        "No data available (live and sample were both empty). "
+        "Upload a small sample file to ./data/sample_statcast.parquet or set "
+        "env vars SAMPLE_DATA_REPO + SAMPLE_DATA_FILE to a HF dataset."
+    )
     st.stop()
+# Feature engineering (cache stable steps)
+@st.cache_data(show_spinner=False)
+def _featurize(df_raw_in: pd.DataFrame):
+    """Build the per-pitch feature table from raw rows (cached by Streamlit).
+
+    The IVB sign is inferred from the same raw frame and passed straight into
+    ``engineer_pitch_features``; that function's result is returned unchanged.
+    """
+    return engineer_pitch_features(df_raw_in, infer_ivb_sign(df_raw_in))
+df_feat = _featurize(df_raw)
+@st.cache_data(show_spinner=False)
+def _fit_model(df_feat_in: pd.DataFrame, k_val: int):
+    """Cluster the feature table and attach a readable name to every cluster.
+
+    Returns ``(frame, scaler, km, nn)``: the fitted frame from ``fit_kmeans``
+    with an added ``cluster_name`` column, followed by the three fitted
+    objects exactly as ``fit_kmeans`` returned them.
+    """
+    clustered, scaler, km, nn = fit_kmeans(df_feat_in, k=k_val)
+    names_by_cluster = xy_cluster_tags(clustered)
+    # assign() works on a copy, so the frame returned by fit_kmeans is untouched.
+    labelled = clustered.assign(
+        cluster_name=clustered["cluster"].map(names_by_cluster)
+    )
+    return labelled, scaler, km, nn
+with st.spinner("Clustering & tagging…"):
+    df_fit, scaler, km, nn = _fit_model(df_feat, k)
+# ---- UI
+pitcher = st.selectbox("Pitcher", sorted(df_fit["player_name"].dropna().unique()))
 df_p = df_fit[df_fit["player_name"] == pitcher].sort_values("pitch_type")
 tab1, tab2, tab3 = st.tabs(["Movement", "Scouting Card", "Comps"])
             movement_scatter_xy(df_fit, color="cluster_name"), use_container_width=True
         )
 with tab2:
     st.subheader(f"Scouting Card — {pitcher}")
     st.dataframe(
                 "zone_pct",
                 "cluster_name",
             ]
+        ],
+        use_container_width=True,
     )
     for _, row in df_p.iterrows():
         st.markdown(f"### {row['pitch_type']} — {row['cluster_name']}")
     for _, row in df_p.iterrows():
         st.markdown(f"#### {row['pitch_type']} comps")
         comps = nearest_comps(row, df_fit, scaler, nn, within_pitch_type=True, k=6)
+        st.dataframe(comps, use_container_width=True)

build.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

requirements.txt CHANGED Viewed

@@ -1,3 +1,7 @@
-altair
-pandas
-streamlit

+streamlit==1.38.0
+pandas==2.2.2
+numpy==1.26.4
+plotly==5.24.1
+scikit-learn==1.5.1
+pyarrow==16.1.0
+huggingface_hub==0.25.2

src/tags.py CHANGED Viewed

@@ -4,6 +4,8 @@ import pandas as pd
 def _mag_label(v, q25, q75, small="Subtle", mid="Moderate", big="Heavy"):
     if v >= q75:
         return big
     if v <= q25:
@@ -11,22 +13,65 @@ def _mag_label(v, q25, q75, small="Subtle", mid="Moderate", big="Heavy"):
     return mid
-def _side_label(hb_as):
-    return "Arm-Side" if hb_as >= 0 else "Glove-Side"
 def _vert_label(ivb):
     return "Ride" if ivb >= 0 else "Drop"
 def xy_cluster_tags(df_with_clusters: pd.DataFrame) -> dict[int, str]:
     df = df_with_clusters.copy()
     q_abs_ivb25 = np.nanquantile(np.abs(df["ivb_in"]), 0.25)
     q_abs_ivb75 = np.nanquantile(np.abs(df["ivb_in"]), 0.75)
     q_abs_hb25 = np.nanquantile(np.abs(df["hb_as_in"]), 0.25)
     q_abs_hb75 = np.nanquantile(np.abs(df["hb_as_in"]), 0.75)
     q_wh75 = np.nanquantile(df["whiff_rate"], 0.75)
     q_gb75 = np.nanquantile(df["gb_rate"], 0.75)
     q_zn75 = np.nanquantile(df["zone_pct"], 0.75)
@@ -36,36 +81,81 @@ def xy_cluster_tags(df_with_clusters: pd.DataFrame) -> dict[int, str]:
     tags = {}
     for c, sub in df.groupby("cluster"):
-        row = sub.mean(numeric_only=True)
         dom_pt = (
             sub["pitch_type"].mode().iloc[0]
-            if not sub["pitch_type"].mode().empty
             else "Pitch"
         )
-        side = _side_label(row["hb_as_in"])
-        vert = _vert_label(row["ivb_in"])
-        mag_side = _mag_label(abs(row["hb_as_in"]), q_abs_hb25, q_abs_hb75)
-        mag_vert = _mag_label(abs(row["ivb_in"]), q_abs_ivb25, q_abs_ivb75)
         flavor = []
-        if row["whiff_rate"] >= q_wh75:
             flavor.append("Whiff-First")
-        if row["gb_rate"] >= q_gb75:
             flavor.append("Grounder-First")
-        if row["zone_pct"] >= q_zn75:
             flavor.append("Strike-Throwing")
         if not flavor:
             diffs = {
-                "Whiff-First": row["whiff_rate"] - q_wh50,
-                "Grounder-First": row["gb_rate"] - q_gb50,
-                "Strike-Throwing": row["zone_pct"] - q_zn50,
             }
             flavor.append(max(diffs, key=diffs.get))
-        side_noun = "Run" if side == "Arm-Side" else "Sweep"
-        vert_noun = "Ride" if vert == "Ride" else "Drop"
         shape = f"{side} • {mag_side} {side_noun}, {mag_vert} {vert_noun}"
         tags[c] = f"{dom_pt}: {shape} • " + " / ".join(flavor)
     return tags

 def _mag_label(v, q25, q75, small="Subtle", mid="Moderate", big="Heavy"):
+    if pd.isna(v):
+        return mid
     if v >= q75:
         return big
     if v <= q25:
     return mid
 def _vert_label(ivb):
+    """Name the vertical-break direction: "Ride" (>= 0), "Drop" (< 0), "Neutral" (missing)."""
+    if pd.isna(ivb):
+        return "Neutral"
+    if ivb >= 0:
+        return "Ride"
+    return "Drop"
+def _armside_from_raw_hb(hb_raw: float, throws: str) -> str:
+    """Label a raw horizontal-break value relative to the pitcher's throwing arm.
+
+    Mapping implemented here:
+      - throws == "R": negative hb_raw -> "Arm-Side"
+      - throws == "L": positive hb_raw -> "Arm-Side"
+      - any other non-missing value, including exactly 0 -> "Glove-Side"
+      - missing hb_raw, or throws other than "R"/"L" -> "Neutral"
+
+    NOTE(review): hb_raw is presumably Statcast catcher-view horizontal break;
+    confirm which direction that convention treats as positive.
+    """
+    if pd.isna(hb_raw) or throws not in ("R", "L"):
+        return "Neutral"
+    if throws == "R":
+        toward_arm = hb_raw < 0
+    else:
+        toward_arm = hb_raw > 0
+    return "Arm-Side" if toward_arm else "Glove-Side"
+def _infer_side_series(sub: pd.DataFrame) -> pd.Series:
+    """Label every row of ``sub`` as "Arm-Side", "Glove-Side" or "Neutral".
+
+    Raw horizontal break (``hb_in``) is used when present; otherwise a raw-like
+    value is rebuilt from ``hb_as_in`` (kept as-is for "L", negated for "R").
+    Missing ``p_throws`` values are filled with the group's most common value
+    ("R" if there is none) before anything else, so the rebuild and the final
+    classification always agree on handedness.
+
+    Returns a Series aligned to ``sub.index``. Rows are "Neutral" when the side
+    cannot be determined: no ``p_throws`` column, no break column, a missing
+    break value, or a throws value other than "R"/"L".
+    """
+    neutral = pd.Series("Neutral", index=sub.index, dtype=object)
+    if "p_throws" not in sub.columns:
+        # Both paths need handedness; without it no side can be assigned.
+        return neutral
+    throws_mode = sub["p_throws"].mode()
+    throws = sub["p_throws"].fillna(
+        throws_mode.iloc[0] if not throws_mode.empty else "R"
+    )
+    if "hb_in" in sub.columns:
+        hb_raw = sub["hb_in"]
+    elif "hb_as_in" in sub.columns:
+        # raw ≈ +hb_as for LHP, raw ≈ -hb_as for RHP
+        hb_raw = sub["hb_as_in"].where(throws == "L", -sub["hb_as_in"])
+    else:
+        return neutral
+    is_arm = ((throws == "R") & (hb_raw < 0)) | ((throws == "L") & (hb_raw > 0))
+    # A missing break or unexpected handedness is unknown, not glove-side.
+    unknown = hb_raw.isna() | ~throws.isin(["R", "L"])
+    labels = np.select(
+        [unknown, is_arm], ["Neutral", "Arm-Side"], default="Glove-Side"
+    )
+    return pd.Series(labels, index=sub.index)
 def xy_cluster_tags(df_with_clusters: pd.DataFrame) -> dict[int, str]:
     df = df_with_clusters.copy()
+    # Quantiles for magnitude bucketing
     q_abs_ivb25 = np.nanquantile(np.abs(df["ivb_in"]), 0.25)
     q_abs_ivb75 = np.nanquantile(np.abs(df["ivb_in"]), 0.75)
     q_abs_hb25 = np.nanquantile(np.abs(df["hb_as_in"]), 0.25)
     q_abs_hb75 = np.nanquantile(np.abs(df["hb_as_in"]), 0.75)
+    # Quality quantiles
     q_wh75 = np.nanquantile(df["whiff_rate"], 0.75)
     q_gb75 = np.nanquantile(df["gb_rate"], 0.75)
     q_zn75 = np.nanquantile(df["zone_pct"], 0.75)
     tags = {}
     for c, sub in df.groupby("cluster"):
+        # Robust central tendency
+        row = sub.median(numeric_only=True)
+        # Dominant metadata
         dom_pt = (
             sub["pitch_type"].mode().iloc[0]
+            if "pitch_type" in sub and not sub["pitch_type"].mode().empty
             else "Pitch"
         )
+        dom_throw = (
+            sub["p_throws"].mode().iloc[0]
+            if "p_throws" in sub and not sub["p_throws"].mode().empty
+            else "R"
+        )
+        # Robust side inference
+        per_pitch_side = _infer_side_series(sub)
+        side_counts = per_pitch_side.value_counts(dropna=False)
+        side = side_counts.idxmax() if not side_counts.empty else "Neutral"
+        # If nearly tied or Neutral, fall back to median raw
+        if side in ("Neutral",) or (
+            len(side_counts) > 1 and (side_counts.max() - side_counts.min()) <= 2
+        ):
+            # Use hb_raw median logic
+            if "hb_in" in sub.columns:
+                hb_raw_med = sub["hb_in"].median()
+            else:
+                # Reconstruct raw-ish median from hb_as_in + throws
+                if "hb_as_in" in sub.columns:
+                    hb_raw_med = sub.apply(
+                        lambda r: (
+                            r["hb_as_in"]
+                            if r.get("p_throws", dom_throw) == "L"
+                            else -r["hb_as_in"]
+                        ),
+                        axis=1,
+                    ).median()
+                else:
+                    hb_raw_med = np.nan
+            side = _armside_from_raw_hb(hb_raw_med, dom_throw)
+        # Vertical shape from ivb sign (already handedness-invariant)
+        vert = _vert_label(row.get("ivb_in", np.nan))
+        # Magnitudes from absolute, handedness-invariant features
+        mag_side = _mag_label(abs(row.get("hb_as_in", np.nan)), q_abs_hb25, q_abs_hb75)
+        mag_vert = _mag_label(abs(row.get("ivb_in", np.nan)), q_abs_ivb25, q_abs_ivb75)
+        # Flavor tags
         flavor = []
+        if row.get("whiff_rate", 0) >= q_wh75:
             flavor.append("Whiff-First")
+        if row.get("gb_rate", 0) >= q_gb75:
             flavor.append("Grounder-First")
+        if row.get("zone_pct", 0) >= q_zn75:
             flavor.append("Strike-Throwing")
         if not flavor:
             diffs = {
+                "Whiff-First": row.get("whiff_rate", 0) - q_wh50,
+                "Grounder-First": row.get("gb_rate", 0) - q_gb50,
+                "Strike-Throwing": row.get("zone_pct", 0) - q_zn50,
             }
             flavor.append(max(diffs, key=diffs.get))
+        side_noun = (
+            "Run"
+            if side == "Arm-Side"
+            else ("Sweep" if side == "Glove-Side" else "Run/Sweep")
+        )
+        vert_noun = (
+            "Ride" if vert == "Ride" else ("Drop" if vert == "Drop" else "Ride/Drop")
+        )
         shape = f"{side} • {mag_side} {side_noun}, {mag_vert} {vert_noun}"
         tags[c] = f"{dom_pt}: {shape} • " + " / ".join(flavor)
     return tags