Spaces:

roguchi
/

pitch_dash

Sleeping

App Files Files Community

rsm-roguchi committed on Oct 30, 2025

Commit

752a595

1 Parent(s): cb73dd6

update

Browse files

Files changed (6) hide show

app.py +7 -32
bin/cli.py +14 -4
pyproject.toml +1 -0
requirements.txt +2 -1
src/model.py +205 -16
src/tags.py +88 -114

app.py CHANGED Viewed

@@ -2,16 +2,16 @@
 import os, sys
 from datetime import datetime
-# Ensure we can import from ./src even on HF Spaces
 BASE_DIR = os.path.dirname(__file__)
 sys.path.append(os.path.join(BASE_DIR, "src"))
 import streamlit as st
 import pandas as pd
-# Your local modules
 from data import load_statcast, default_window
 from featurize import infer_ivb_sign, engineer_pitch_features
 from model import fit_kmeans, nearest_comps
 from tags import xy_cluster_tags
 from plots import movement_scatter_xy, radar_quality
@@ -26,38 +26,22 @@ except Exception:
 st.set_page_config(page_title="PitchXY (Handedness-Aware)", layout="wide")
 st.title("⚾ PitchXY — Handedness-Aware Pitch Archetypes & Scouting Cards")
-# ---- Helpers
 @st.cache_data(show_spinner=False, ttl=24 * 3600)
 def load_statcast_cached(start: str, end: str, force: bool = False) -> pd.DataFrame:
-    """
-    Cached wrapper around your loader. On Spaces, expensive network calls during
-    app init are the #1 cause of infinite 'Starting...'. This keeps it fast.
-    """
     return load_statcast(start, end, force=force)
 @st.cache_data(show_spinner=False)
 def load_sample_fallback() -> pd.DataFrame:
-    """
-    Optional: fallback sample data so the app is usable even if MLB/Statcast
-    endpoints are rate limited / blocked in Spaces.
-    - Put a small parquet or CSV in your Space repo: data/sample_statcast.parquet
-    - Or host it under a HF Dataset repo and set SAMPLE_DATA_REPO, SAMPLE_DATA_FILE.
-    """
     local_path = os.path.join(BASE_DIR, "data", "sample_statcast.parquet")
     if os.path.exists(local_path):
         return pd.read_parquet(local_path)
-    # If not bundled locally, try HF Hub (if available)
     repo_id = os.getenv("SAMPLE_DATA_REPO", "").strip()
     file_name = os.getenv("SAMPLE_DATA_FILE", "sample_statcast.parquet").strip()
     if HF_HUB_OK and repo_id:
         path = hf_hub_download(repo_id=repo_id, filename=file_name, repo_type="dataset")
         return pd.read_parquet(path)
-    # Give a tiny empty frame with expected columns to keep UI alive
     return pd.DataFrame(
         columns=[
             "game_date",
@@ -81,12 +65,8 @@ def load_sample_fallback() -> pd.DataFrame:
 def safe_load_data(start: str, end: str, force: bool) -> pd.DataFrame:
-    """
-    Try cached real data first; if it errors or returns empty, fall back to a sample.
-    """
     try:
         df = load_statcast_cached(start, end, force)
-        # Basic sanity check – empty windows are common; handle gracefully
         if df is not None and not df.empty:
             return df
         st.info("No live data returned for that window — showing sample data instead.")
@@ -95,19 +75,15 @@ def safe_load_data(start: str, end: str, force: bool) -> pd.DataFrame:
     return load_sample_fallback()
-# ---- Sidebar
 with st.sidebar:
     st.header("Data Window")
     dstart, dend = default_window()
     start = st.text_input("Start YYYY-MM-DD", dstart)
     end = st.text_input("End YYYY-MM-DD", dend)
-    k = st.slider("Clusters (k)", 5, 12, 8)
     force = st.checkbox("Force re-download (discouraged on Spaces)", value=False)
     st.caption("Tip: avoid 'Force re-download' on Spaces to keep startup snappy.")
-# ---- Data pipeline
 with st.spinner("Loading data…"):
     df_raw = safe_load_data(start, end, force)
@@ -120,7 +96,6 @@ if df_raw.empty:
     st.stop()
-# Feature engineering (cache stable steps)
 @st.cache_data(show_spinner=False)
 def _featurize(df_raw_in: pd.DataFrame):
     ivb_sign = infer_ivb_sign(df_raw_in)
@@ -131,9 +106,11 @@ def _featurize(df_raw_in: pd.DataFrame):
 df_feat = _featurize(df_raw)
-@st.cache_data(show_spinner=False)
 def _fit_model(df_feat_in: pd.DataFrame, k_val: int):
     df_fit_local, scaler, km, nn = fit_kmeans(df_feat_in, k=k_val)
     cluster_names_local = xy_cluster_tags(df_fit_local)
     df_fit_local = df_fit_local.copy()
     df_fit_local["cluster_name"] = df_fit_local["cluster"].map(cluster_names_local)
@@ -143,8 +120,6 @@ def _fit_model(df_feat_in: pd.DataFrame, k_val: int):
 with st.spinner("Clustering & tagging…"):
     df_fit, scaler, km, nn = _fit_model(df_feat, k)
-# ---- UI
 pitcher = st.selectbox("Pitcher", sorted(df_fit["player_name"].dropna().unique()))
 df_p = df_fit[df_fit["player_name"] == pitcher].sort_values("pitch_type")
@@ -190,6 +165,6 @@ with tab2:
 with tab3:
     for _, row in df_p.iterrows():
         st.markdown(f"#### {row['pitch_type']} comps")
         comps = nearest_comps(row, df_fit, scaler, nn, within_pitch_type=True, k=6)
         st.dataframe(comps, use_container_width=True)

 import os, sys
 from datetime import datetime
 BASE_DIR = os.path.dirname(__file__)
 sys.path.append(os.path.join(BASE_DIR, "src"))
 import streamlit as st
 import pandas as pd
 from data import load_statcast, default_window
 from featurize import infer_ivb_sign, engineer_pitch_features
+# ⬇️ Revert to older API
 from model import fit_kmeans, nearest_comps
 from tags import xy_cluster_tags
 from plots import movement_scatter_xy, radar_quality
 st.set_page_config(page_title="PitchXY (Handedness-Aware)", layout="wide")
 st.title("⚾ PitchXY — Handedness-Aware Pitch Archetypes & Scouting Cards")
 @st.cache_data(show_spinner=False, ttl=24 * 3600)
 def load_statcast_cached(start: str, end: str, force: bool = False) -> pd.DataFrame:
     """Return ``load_statcast(start, end, force=force)`` through Streamlit's data cache.

     The decorator disables the spinner and sets ``ttl=24 * 3600`` (one day,
     assuming the TTL is expressed in seconds -- TODO confirm against the
     installed Streamlit version).
     """
     # NOTE(review): `force` is an ordinary argument here, so it presumably
     # becomes part of the cache key rather than bypassing the cache -- confirm.
     return load_statcast(start, end, force=force)
 @st.cache_data(show_spinner=False)
 def load_sample_fallback() -> pd.DataFrame:
     local_path = os.path.join(BASE_DIR, "data", "sample_statcast.parquet")
     if os.path.exists(local_path):
         return pd.read_parquet(local_path)
     repo_id = os.getenv("SAMPLE_DATA_REPO", "").strip()
     file_name = os.getenv("SAMPLE_DATA_FILE", "sample_statcast.parquet").strip()
     if HF_HUB_OK and repo_id:
         path = hf_hub_download(repo_id=repo_id, filename=file_name, repo_type="dataset")
         return pd.read_parquet(path)
     return pd.DataFrame(
         columns=[
             "game_date",
 def safe_load_data(start: str, end: str, force: bool) -> pd.DataFrame:
     try:
         df = load_statcast_cached(start, end, force)
         if df is not None and not df.empty:
             return df
         st.info("No live data returned for that window — showing sample data instead.")
     return load_sample_fallback()
 with st.sidebar:
     st.header("Data Window")
     dstart, dend = default_window()
     start = st.text_input("Start YYYY-MM-DD", dstart)
     end = st.text_input("End YYYY-MM-DD", dend)
+    k = st.slider("Clusters (k)", 5, 40, 25)
     force = st.checkbox("Force re-download (discouraged on Spaces)", value=False)
     st.caption("Tip: avoid 'Force re-download' on Spaces to keep startup snappy.")
 with st.spinner("Loading data…"):
     df_raw = safe_load_data(start, end, force)
     st.stop()
 @st.cache_data(show_spinner=False)
 def _featurize(df_raw_in: pd.DataFrame):
     ivb_sign = infer_ivb_sign(df_raw_in)
 df_feat = _featurize(df_raw)
+# ✅ Cache the fitted artifacts from the older API
+@st.cache_resource(show_spinner=False)
 def _fit_model(df_feat_in: pd.DataFrame, k_val: int):
     df_fit_local, scaler, km, nn = fit_kmeans(df_feat_in, k=k_val)
+    # Tag clusters with readable names
     cluster_names_local = xy_cluster_tags(df_fit_local)
     df_fit_local = df_fit_local.copy()
     df_fit_local["cluster_name"] = df_fit_local["cluster"].map(cluster_names_local)
 with st.spinner("Clustering & tagging…"):
     df_fit, scaler, km, nn = _fit_model(df_feat, k)
 pitcher = st.selectbox("Pitcher", sorted(df_fit["player_name"].dropna().unique()))
 df_p = df_fit[df_fit["player_name"] == pitcher].sort_values("pitch_type")
 with tab3:
     for _, row in df_p.iterrows():
         st.markdown(f"#### {row['pitch_type']} comps")
+        # ⬇️ Old signature again
         comps = nearest_comps(row, df_fit, scaler, nn, within_pitch_type=True, k=6)
         st.dataframe(comps, use_container_width=True)

bin/cli.py CHANGED Viewed

@@ -2,6 +2,8 @@ from __future__ import annotations
 import argparse
 from data import load_statcast, default_window
 from featurize import infer_ivb_sign, engineer_pitch_features
 from model import fit_kmeans, nearest_comps
 from tags import xy_cluster_tags
 from plots import movement_scatter_xy
@@ -38,7 +40,12 @@ def main():
     print(f"IVB sign inferred = {ivb_sign} (ride should be positive)")
     df_feat = engineer_pitch_features(df_raw, ivb_sign)
-    df_fit, scaler, km, nn = fit_kmeans(df_feat, k=args.k)
     cluster_names = xy_cluster_tags(df_fit)
     df_fit["cluster_name"] = df_fit["cluster"].map(cluster_names)
@@ -78,9 +85,8 @@ def main():
                 ].to_string(index=False)
             )
             for _, row in df_p.iterrows():
-                comps = nearest_comps(
-                    row, df_fit, scaler, nn, within_pitch_type=True, k=6
-                )
                 print(f"\nNearest comps — {row['pitch_type']} ({row['cluster_name']}):")
                 print(comps.to_string(index=False))
@@ -90,3 +96,7 @@ def main():
         out = ARTIFACTS_DIR / "movement_all.html"
         pio.write_html(fig, file=str(out), auto_open=False, include_plotlyjs="cdn")
         print(f"Saved plot: {out}")

 import argparse
 from data import load_statcast, default_window
 from featurize import infer_ivb_sign, engineer_pitch_features
+# ⬇️ NEW: import the updated API
 from model import fit_kmeans, nearest_comps
 from tags import xy_cluster_tags
 from plots import movement_scatter_xy
     print(f"IVB sign inferred = {ivb_sign} (ride should be positive)")
     df_feat = engineer_pitch_features(df_raw, ivb_sign)
+    # ⬇️ NEW: fit the improved model
+    model = fit_pitch_clusters(df_feat, k=args.k)
+    df_fit = model.df_fit  # contains all original cols + 'cluster'
+    # Tag clusters with human-readable names
     cluster_names = xy_cluster_tags(df_fit)
     df_fit["cluster_name"] = df_fit["cluster"].map(cluster_names)
                 ].to_string(index=False)
             )
             for _, row in df_p.iterrows():
+                # ⬇️ UPDATED: pass the model, not (df_fit, scaler, nn)
+                comps = nearest_comps(row, model, k=5, allow_cross_type=False)
                 print(f"\nNearest comps — {row['pitch_type']} ({row['cluster_name']}):")
                 print(comps.to_string(index=False))
         out = ARTIFACTS_DIR / "movement_all.html"
         pio.write_html(fig, file=str(out), auto_open=False, include_plotlyjs="cdn")
         print(f"Saved plot: {out}")
+if __name__ == "__main__":
+    main()

pyproject.toml CHANGED Viewed

@@ -9,6 +9,7 @@ dependencies = [
   "numpy",
   "pybaseball",
   "scikit-learn",
   "plotly",
   "pyarrow",
   "streamlit"  # needed for HF Space app below

   "numpy",
   "pybaseball",
   "scikit-learn",
+  "scikit-learn-extra",
   "plotly",
   "pyarrow",
   "streamlit"  # needed for HF Space app below

requirements.txt CHANGED Viewed

@@ -6,4 +6,5 @@ scikit-learn==1.5.1
 pyarrow==16.1.0
 huggingface_hub==0.25.2
 pybaseball==2.2.7
-requests>=2.31.0

 pyarrow==16.1.0
 huggingface_hub==0.25.2
 pybaseball==2.2.7
+requests>=2.31.0
+scikit-learn-extra

src/model.py CHANGED Viewed

@@ -1,9 +1,15 @@
 from __future__ import annotations
 import pandas as pd
-from sklearn.preprocessing import StandardScaler
-from sklearn.cluster import KMeans
 from sklearn.neighbors import NearestNeighbors
 ARCH_FEATURES = [
     "velo",
     "ivb_in",
@@ -17,29 +23,169 @@ ARCH_FEATURES = [
     "zone_pct",
 ]
-def fit_kmeans(df_feat: pd.DataFrame, k: int = 8, random_state: int = 42):
     df = df_feat.dropna(subset=ARCH_FEATURES).copy()
-    X = df[ARCH_FEATURES].values
-    scaler = StandardScaler()
-    Xs = scaler.fit_transform(X)
-    km = KMeans(n_clusters=k, n_init=20, random_state=random_state)
     labels = km.fit_predict(Xs)
     df["cluster"] = labels
-    nn = NearestNeighbors(n_neighbors=6, metric="euclidean")
-    nn.fit(Xs)
     return df, scaler, km, nn
 def nearest_comps(
-    row: pd.Series, df_fit: pd.DataFrame, scaler, nn, within_pitch_type=True, k=6
 ):
     xq = scaler.transform(row[ARCH_FEATURES].values.reshape(1, -1))
-    dists, idxs = nn.kneighbors(xq, n_neighbors=k)
-    comps = df_fit.iloc[idxs[0]].copy()
-    if within_pitch_type:
-        comps = comps[comps["pitch_type"] == row["pitch_type"]]
     cols = [
         "player_name",
         "pitch_type",
@@ -49,6 +195,49 @@ def nearest_comps(
         "hb_as_in",
         "whiff_rate",
         "gb_rate",
-        "cluster_name",
     ]
-    return comps[cols].head(k - 1)

 from __future__ import annotations
+import numpy as np
 import pandas as pd
+from typing import Dict, Optional, Tuple
+from sklearn.impute import SimpleImputer
+from sklearn.preprocessing import RobustScaler, StandardScaler, FunctionTransformer
+from sklearn.pipeline import Pipeline
 from sklearn.neighbors import NearestNeighbors
+# NEW: medoids (robust, nearest-exemplar clustering)
+from sklearn_extra.cluster import KMedoids
 ARCH_FEATURES = [
     "velo",
     "ivb_in",
     "zone_pct",
 ]
+# ---------- existing helpers (unchanged API) ----------
def winsorize_df(df: pd.DataFrame, cols, lower=0.01, upper=0.99):
    """Return a copy of ``df`` with ``cols`` clipped to their quantile range.

    Each listed column is clipped to ``[quantile(lower), quantile(upper)]``
    computed on that column alone; all other columns are returned unchanged.
    The input frame is never mutated.

    Parameters
    ----------
    df : pd.DataFrame
        Source data.
    cols : str or iterable of column labels
        Columns to winsorize. A single label, a list, a tuple or any other
        iterable is accepted; duplicates are ignored.
    lower, upper : float
        Lower / upper quantiles (fractions in ``[0, 1]``).
    """
    # Normalise once: a bare string would otherwise be iterated character by
    # character, and a tuple would be treated by ``df[...]`` as a single key.
    if isinstance(cols, str):
        cols = [cols]
    else:
        cols = list(dict.fromkeys(cols))  # de-duplicate, keep order

    out = df.copy()
    if not cols:
        return out

    q_low = df[cols].quantile(lower)
    q_hi = df[cols].quantile(upper)
    # Column-by-column assignment (instead of ``assign(**kwargs)``) also works
    # for column labels that are not valid keyword names.
    for c in cols:
        out[c] = df[c].clip(q_low[c], q_hi[c])
    return out
def groupwise_z(df: pd.DataFrame, cols, group_col="pitch_type"):
    """Add within-group z-scores for ``cols`` and return the new frame and names.

    For every column ``c`` in ``cols`` a column ``"<c>_gz"`` is added holding
    ``(x - group mean) / (group population std + 1e-8)``, where groups are
    defined by ``group_col``. The input frame is copied, not modified.

    Returns
    -------
    (pd.DataFrame, list[str])
        The augmented copy and the list of added column names, in the order
        of ``cols``.
    """
    out = df.copy()
    z_names = []
    for name in cols:
        z_name = f"{name}_gz"
        per_group = out.groupby(group_col)[name]
        # The small epsilon keeps constant groups from dividing by zero.
        out[z_name] = per_group.transform(
            lambda values: (values - values.mean()) / (values.std(ddof=0) + 1e-8)
        )
        z_names.append(z_name)
    return out, z_names
def _scale_columns(X, w):
    """Multiply each column of ``X`` by the matching entry of ``w``."""
    return X * w


def _preprocessor(
    gz_feats: list[str], weights: Optional[Dict[str, float]] = None
) -> Pipeline:
    """Build the preprocessing pipeline shared by clustering and neighbor search.

    Steps, in order: median imputation -> ``RobustScaler`` ->
    ``StandardScaler`` -> optional per-feature weighting.

    Parameters
    ----------
    gz_feats : list[str]
        Feature names in the column order of the matrix that will be passed
        to the pipeline. A trailing ``"_gz"`` is stripped before the name is
        looked up in ``weights``.
    weights : dict[str, float] | None
        Optional multiplier per base feature name; features not listed get
        weight 1.0. When falsy, no weighting step is added.

    Returns
    -------
    Pipeline
        An unfitted scikit-learn pipeline.
    """
    steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("robust", RobustScaler()),
        ("std", StandardScaler(with_mean=True, with_std=True)),
    ]
    if weights:
        suffix = "_gz"
        # Strip the marker only where it is a suffix, so a base name that
        # merely contains "_gz" is not mangled.
        base_names = [
            f[: -len(suffix)] if f.endswith(suffix) else f for f in gz_feats
        ]
        w = np.array([weights.get(b, 1.0) for b in base_names], dtype=float)
        steps.append(
            (
                "weights",
                # A module-level function plus kw_args (rather than a lambda)
                # keeps the fitted pipeline picklable.
                FunctionTransformer(
                    _scale_columns,
                    kw_args={"w": w},
                    feature_names_out="one-to-one",
                ),
            )
        )
    return Pipeline(steps)
+# ---------- local, neighbor-aware label smoothing (kept) ----------
def _contextual_smooth_labels(
    Xs: np.ndarray,
    labels: np.ndarray,
    n_neighbors: int = 15,
    vote_thresh: float = 0.6,
    margin: float = 0.0,
    max_iters: int = 2,
) -> np.ndarray:
    """Reassign labels by local kNN majority vote with a confidence threshold.

    Parameters
    ----------
    Xs : np.ndarray
        Preprocessed feature matrix, one row per sample.
    labels : np.ndarray
        Current cluster label per row of ``Xs``. Not modified.
    n_neighbors : int
        Number of neighbors (excluding the point itself) that vote.
        Neighbors are found with Manhattan distance.
    vote_thresh : float
        Minimum fraction of neighbors that must share a label before a point
        is flipped to that label (e.g. 0.6).
    margin : float
        When > 0, a flip additionally requires the majority cluster's centroid
        to be more than ``margin`` closer (Euclidean) than the current
        cluster's centroid. 0.0 disables this distance guard.
    max_iters : int
        Maximum number of sweeps; stops early once a sweep changes nothing.

    Returns
    -------
    np.ndarray
        A new label array of the same length as ``labels``.
    """
    labels = np.array(labels, copy=True)
    n = len(labels)
    # With fewer than two samples (or no neighbors requested) there is nobody
    # to vote, and the majority computation below would fail on an empty set.
    if n < 2 or n_neighbors < 1:
        return labels

    knn = NearestNeighbors(
        n_neighbors=min(n, n_neighbors + 1), metric="manhattan"
    ).fit(Xs)
    _, idxs = knn.kneighbors(Xs)
    use_margin = margin > 0.0

    for _ in range(max_iters):
        # Centroids are only needed for the optional distance guard; they are
        # taken once per sweep, before any flips in that sweep.
        centers = (
            {lbl: Xs[labels == lbl].mean(axis=0) for lbl in np.unique(labels)}
            if use_margin
            else {}
        )
        changed = 0
        for i in range(n):
            neigh = idxs[i][1:]  # drop self
            neigh_lbls = labels[neigh]
            vals, counts = np.unique(neigh_lbls, return_counts=True)
            j = np.argmax(counts)
            maj_label, maj_frac = vals[j], counts[j] / len(neigh_lbls)
            if maj_frac < vote_thresh or maj_label == labels[i]:
                continue
            if use_margin:
                di_cur = np.linalg.norm(Xs[i] - centers[labels[i]])
                di_maj = np.linalg.norm(Xs[i] - centers[maj_label])
                if di_maj >= di_cur - margin:
                    continue
            # Flips take effect immediately, so later points in the same
            # sweep already see the updated label.
            labels[i] = maj_label
            changed += 1
        if changed == 0:
            break
    return labels
+# ---------- API: fit + comps (drop-in) ----------
def fit_kmeans(df_feat: pd.DataFrame, k: int = 20, random_state: int = 42):
    """Cluster pitch archetypes with K-Medoids and build a neighbor index.

    Despite the name (kept so existing callers keep working), this fits
    K-Medoids with Manhattan distance rather than K-Means.

    Parameters
    ----------
    df_feat : pd.DataFrame
        Engineered features; rows missing any of ``ARCH_FEATURES`` are dropped.
    k : int
        Requested number of clusters. Clamped to the number of usable rows so
        that small data windows do not crash the fit.
    random_state : int
        Seed forwarded to ``KMedoids``.

    Returns
    -------
    (df_with_clusters, scaler_pipeline, kmedoids_model, knn_index)
        ``df_with_clusters`` is a copy of the usable rows with winsorized
        features and an added ``"cluster"`` column; the row order matches the
        matrix used to fit ``knn_index``.

    Raises
    ------
    ValueError
        If no row has all of ``ARCH_FEATURES`` present.
    """
    df = df_feat.dropna(subset=ARCH_FEATURES).copy()
    if df.empty:
        raise ValueError(
            "fit_kmeans: no rows with all ARCH_FEATURES present; nothing to cluster"
        )

    # Light winsorization: dampen outliers without warping scale
    df[ARCH_FEATURES] = df[ARCH_FEATURES].clip(
        df[ARCH_FEATURES].quantile(0.01),
        df[ARCH_FEATURES].quantile(0.99),
        axis=1,
    )

    # Consistent preprocessing for clustering and neighbors
    scaler = _preprocessor(ARCH_FEATURES, weights=None)
    Xs = scaler.fit_transform(df[ARCH_FEATURES].values)

    # Never ask for more clusters than there are samples.
    n_clusters = max(1, min(int(k), len(df)))

    # K-Medoids with Manhattan distance -> emphasizes true nearest relationships
    km = KMedoids(
        n_clusters=n_clusters,
        metric="manhattan",
        init="k-medoids++",
        max_iter=500,
        random_state=random_state,
    )
    labels = km.fit_predict(Xs)
    df["cluster"] = labels

    # NN index in the SAME space & metric; the default neighbor count is
    # capped at the sample count so default queries stay valid on tiny inputs.
    nn = NearestNeighbors(n_neighbors=min(8, len(df)), metric="manhattan").fit(Xs)
    return df, scaler, km, nn
 def nearest_comps(
+    row: pd.Series,
+    df_fit: pd.DataFrame,
+    scaler: Pipeline,
+    nn: NearestNeighbors,
+    within_pitch_type: bool = True,
+    k: int = 6,
 ):
+    """
+    Nearest comps in the SAME preprocessed space and metric (Manhattan).
+    If within_pitch_type=True, restricts candidates to the same pitch_type.
+    """
+    # Ensure all required features exist
+    missing = [c for c in ARCH_FEATURES if c not in df_fit.columns]
+    if missing:
+        raise KeyError(f"nearest_comps: df_fit is missing required features: {missing}")
+    # Query vector in the exact same space as clustering
     xq = scaler.transform(row[ARCH_FEATURES].values.reshape(1, -1))
+    # Columns to return
     cols = [
         "player_name",
         "pitch_type",
         "hb_as_in",
         "whiff_rate",
         "gb_rate",
+        "cluster",
     ]
+    # Per-pitch-type neighborhood (preferred)
+    if within_pitch_type and "pitch_type" in df_fit.columns:
+        ptype = row.get("pitch_type")
+        if isinstance(ptype, str):
+            sub = df_fit[df_fit["pitch_type"] == ptype].copy()
+            if not sub.empty:
+                Xsub = scaler.transform(sub[ARCH_FEATURES].values)
+                k_loc = min(len(sub), max(2, k + 1))  # +1 to allow excluding self
+                knn_local = NearestNeighbors(n_neighbors=k_loc, metric="manhattan").fit(
+                    Xsub
+                )
+                dists, inds = knn_local.kneighbors(xq, n_neighbors=k_loc)
+                cand = sub.iloc[inds[0]].copy()
+                cand["_dist"] = dists[0]
+                # Prefer excluding the same player if present
+                pname = row.get("player_name", None)
+                if pname is not None and "player_name" in cand.columns:
+                    cand = cand[cand["player_name"] != pname]
+                return (
+                    cand.sort_values("_dist")
+                    .drop(columns=["_dist"], errors="ignore")[cols]
+                    .head(k)
+                )
+    # Global fallback: use provided NN (already fit in Manhattan space)
+    k_glob = min(len(df_fit), max(2, k + 1))
+    dists, inds = nn.kneighbors(xq, n_neighbors=k_glob)
+    cand = df_fit.iloc[inds[0]].copy()
+    if within_pitch_type and "pitch_type" in df_fit.columns:
+        ptype = row.get("pitch_type")
+        if isinstance(ptype, str):
+            cand = cand[cand["pitch_type"] == ptype]
+    pname = row.get("player_name", None)
+    if pname is not None and "player_name" in cand.columns:
+        cand = cand[cand["player_name"] != pname]
+    cand["_dist"] = dists[0][: len(cand)] if len(dists[0]) >= len(cand) else 0.0
+    return (
+        cand.sort_values("_dist").drop(columns=["_dist"], errors="ignore")[cols].head(k)
+    )
+# Make public API explicit (unchanged)
+__all__ = ["ARCH_FEATURES", "fit_kmeans", "nearest_comps"]

src/tags.py CHANGED Viewed

@@ -1,9 +1,17 @@
 from __future__ import annotations
 import numpy as np
 import pandas as pd
-def _mag_label(v, q25, q75, small="Subtle", mid="Moderate", big="Heavy"):
     if pd.isna(v):
         return mid
     if v >= q75:
@@ -13,139 +21,97 @@ def _mag_label(v, q25, q75, small="Subtle", mid="Moderate", big="Heavy"):
     return mid
-def _vert_label(ivb):
-    if pd.isna(ivb):
         return "Neutral"
-    return "Ride" if ivb >= 0 else "Drop"
-def _armside_from_raw_hb(hb_raw: float, throws: str) -> str:
-    """Return 'Arm-Side' or 'Glove-Side' from raw HB (catcher view) and dominant throws.
-    Statcast convention (catcher view): positive = to catcher’s left (3B side).
-    Arm-side mapping commonly used:
-      - RHP arm-side run → negative hb_raw
-      - LHP arm-side run → positive hb_raw
-    """
-    if pd.isna(hb_raw) or throws not in ("R", "L"):
         return "Neutral"
-    if (throws == "R" and hb_raw < 0) or (throws == "L" and hb_raw > 0):
-        return "Arm-Side"
-    return "Glove-Side"
-def _infer_side_series(sub: pd.DataFrame) -> pd.Series:
-    """Infer per-pitch side (Arm/Glove) robustly, using raw hb if available,
-    else reconstruct a raw-ish value from hb_as_in and p_throws."""
-    has_raw = "hb_in" in sub.columns
-    if has_raw:
-        hb_raw = sub["hb_in"]
-    else:
-        # Reconstruct raw-ish: if hb_as_in is arm-side-adjusted (positive toward arm-side),
-        # then flip sign for RHP to get a catcher-view-like raw sign.
-        # raw ≈ +hb_as for LHP, raw ≈ -hb_as for RHP
-        if "hb_as_in" in sub.columns and "p_throws" in sub.columns:
-            hb_raw = np.where(sub["p_throws"] == "L", sub["hb_as_in"], -sub["hb_as_in"])
-            hb_raw = pd.Series(hb_raw, index=sub.index)
-        else:
-            return pd.Series(["Neutral"] * len(sub), index=sub.index)
-    throws = sub["p_throws"].fillna(
-        sub["p_throws"].mode().iloc[0] if not sub["p_throws"].mode().empty else "R"
-    )
-    return pd.Series(
-        np.where(
-            ((throws == "R") & (hb_raw < 0)) | ((throws == "L") & (hb_raw > 0)),
-            "Arm-Side",
-            "Glove-Side",
-        ),
-        index=sub.index,
-    )
-def xy_cluster_tags(df_with_clusters: pd.DataFrame) -> dict[int, str]:
     df = df_with_clusters.copy()
-    # Quantiles for magnitude bucketing
-    q_abs_ivb25 = np.nanquantile(np.abs(df["ivb_in"]), 0.25)
-    q_abs_ivb75 = np.nanquantile(np.abs(df["ivb_in"]), 0.75)
-    q_abs_hb25 = np.nanquantile(np.abs(df["hb_as_in"]), 0.25)
-    q_abs_hb75 = np.nanquantile(np.abs(df["hb_as_in"]), 0.75)
     # Quality quantiles
-    q_wh75 = np.nanquantile(df["whiff_rate"], 0.75)
-    q_gb75 = np.nanquantile(df["gb_rate"], 0.75)
-    q_zn75 = np.nanquantile(df["zone_pct"], 0.75)
-    q_wh50 = np.nanquantile(df["whiff_rate"], 0.50)
-    q_gb50 = np.nanquantile(df["gb_rate"], 0.50)
-    q_zn50 = np.nanquantile(df["zone_pct"], 0.50)
-    tags = {}
     for c, sub in df.groupby("cluster"):
         # Robust central tendency
         row = sub.median(numeric_only=True)
-        # Dominant metadata
-        dom_pt = (
-            sub["pitch_type"].mode().iloc[0]
-            if "pitch_type" in sub and not sub["pitch_type"].mode().empty
-            else "Pitch"
-        )
-        dom_throw = (
-            sub["p_throws"].mode().iloc[0]
-            if "p_throws" in sub and not sub["p_throws"].mode().empty
-            else "R"
-        )
-        # Robust side inference
-        per_pitch_side = _infer_side_series(sub)
-        side_counts = per_pitch_side.value_counts(dropna=False)
-        side = side_counts.idxmax() if not side_counts.empty else "Neutral"
-        # If nearly tied or Neutral, fall back to median raw
-        if side in ("Neutral",) or (
-            len(side_counts) > 1 and (side_counts.max() - side_counts.min()) <= 2
-        ):
-            # Use hb_raw median logic
-            if "hb_in" in sub.columns:
-                hb_raw_med = sub["hb_in"].median()
-            else:
-                # Reconstruct raw-ish median from hb_as_in + throws
-                if "hb_as_in" in sub.columns:
-                    hb_raw_med = sub.apply(
-                        lambda r: (
-                            r["hb_as_in"]
-                            if r.get("p_throws", dom_throw) == "L"
-                            else -r["hb_as_in"]
-                        ),
-                        axis=1,
-                    ).median()
-                else:
-                    hb_raw_med = np.nan
-            side = _armside_from_raw_hb(hb_raw_med, dom_throw)
-        # Vertical shape from ivb sign (already handedness-invariant)
-        vert = _vert_label(row.get("ivb_in", np.nan))
-        # Magnitudes from absolute, handedness-invariant features
-        mag_side = _mag_label(abs(row.get("hb_as_in", np.nan)), q_abs_hb25, q_abs_hb75)
-        mag_vert = _mag_label(abs(row.get("ivb_in", np.nan)), q_abs_ivb25, q_abs_ivb75)
-        # Flavor tags
         flavor = []
-        if row.get("whiff_rate", 0) >= q_wh75:
             flavor.append("Whiff-First")
-        if row.get("gb_rate", 0) >= q_gb75:
             flavor.append("Grounder-First")
-        if row.get("zone_pct", 0) >= q_zn75:
             flavor.append("Strike-Throwing")
         if not flavor:
             diffs = {
-                "Whiff-First": row.get("whiff_rate", 0) - q_wh50,
-                "Grounder-First": row.get("gb_rate", 0) - q_gb50,
-                "Strike-Throwing": row.get("zone_pct", 0) - q_zn50,
             }
             flavor.append(max(diffs, key=diffs.get))
         side_noun = (
             "Run"
             if side == "Arm-Side"
@@ -154,9 +120,17 @@ def xy_cluster_tags(df_with_clusters: pd.DataFrame) -> dict[int, str]:
         vert_noun = (
             "Ride" if vert == "Ride" else ("Drop" if vert == "Drop" else "Ride/Drop")
         )
-        shape = f"{side} • {mag_side} {side_noun}, {mag_vert} {vert_noun}"
-        tags[c] = f"{dom_pt}: {shape} • " + " / ".join(flavor)
-    return tags

 from __future__ import annotations
 import numpy as np
 import pandas as pd
+from typing import Dict, Optional
+def _safe_q(s: pd.Series, q: float, default: float) -> float:
+    s = pd.to_numeric(s, errors="coerce").dropna()
+    return float(s.quantile(q)) if len(s) else default
+def _mag_label(
+    v: float, q25: float, q75: float, small="Subtle", mid="Moderate", big="Heavy"
+):
     if pd.isna(v):
         return mid
     if v >= q75:
     return mid
+def _vert_label(ivb: float, eps: float = 0.5) -> str:
+    if pd.isna(ivb) or abs(ivb) <= eps:
         return "Neutral"
+    return "Ride" if ivb > 0 else "Drop"
+def _side_label(hb_as: float, eps: float = 0.5) -> str:
+    """+hb_as_in = Arm-Side, -hb_as_in = Glove-Side; small |hb| -> Neutral."""
+    if pd.isna(hb_as) or abs(hb_as) <= eps:
         return "Neutral"
+    return "Arm-Side" if hb_as > 0 else "Glove-Side"
+def xy_cluster_tags(
+    df_with_clusters: pd.DataFrame,
+    *,
+    eps_lat: float = 0.5,  # dead-band for side near 0 (inches)
+    eps_vert: float = 0.5,  # dead-band for ride/drop near 0
+    prefix_pitch_type: bool = False,  # True to prepend dominant pitch_type like "SL:"
+) -> Dict[int, str]:
+    """
+    Cluster -> name using only movement characteristics:
+      - Side: sign(hb_as_in)  (+ -> Arm-Side, - -> Glove-Side)
+      - Vert: sign(ivb_in)    (+ -> Ride,     - -> Drop)
+    Magnitude adjectives via quantiles (Subtle/Moderate/Heavy). Adds flavor tags
+    (Whiff-First / Grounder-First / Strike-Throwing) based on medians.
+    Returns {cluster_id: label}
+    """
+    if df_with_clusters.empty or "cluster" not in df_with_clusters.columns:
+        return {}
     df = df_with_clusters.copy()
+    # Quantiles for magnitude bucketing (robust, adaptive per window)
+    q_abs_ivb25 = _safe_q(df.get("ivb_in", pd.Series([])), 0.25, 1.0)
+    q_abs_ivb75 = _safe_q(df.get("ivb_in", pd.Series([])).abs(), 0.75, 8.0)
+    q_abs_hb25 = _safe_q(df.get("hb_as_in", pd.Series([])).abs(), 0.25, 1.5)
+    q_abs_hb75 = _safe_q(df.get("hb_as_in", pd.Series([])).abs(), 0.75, 10.0)
     # Quality quantiles
+    q_wh75 = _safe_q(df.get("whiff_rate", pd.Series([])), 0.75, 0.30)
+    q_gb75 = _safe_q(df.get("gb_rate", pd.Series([])), 0.75, 0.45)
+    q_zn75 = _safe_q(df.get("zone_pct", pd.Series([])), 0.75, 0.52)
+    q_wh50 = _safe_q(df.get("whiff_rate", pd.Series([])), 0.50, 0.25)
+    q_gb50 = _safe_q(df.get("gb_rate", pd.Series([])), 0.50, 0.40)
+    q_zn50 = _safe_q(df.get("zone_pct", pd.Series([])), 0.50, 0.49)
+    tags: Dict[int, str] = {}
     for c, sub in df.groupby("cluster"):
         # Robust central tendency
         row = sub.median(numeric_only=True)
+        # Optional dominant metadata (NOT used for geometry)
+        prefix = ""
+        if (
+            prefix_pitch_type
+            and "pitch_type" in sub.columns
+            and not sub["pitch_type"].mode().empty
+        ):
+            prefix = f"{sub['pitch_type'].mode().iloc[0]}: "
+        # Geometry: use hb_as_in & ivb_in directly (signs define AS/GS and Ride/Drop)
+        hb_med = row.get("hb_as_in", np.nan)
+        ivb_med = row.get("ivb_in", np.nan)
+        side = _side_label(hb_med, eps=eps_lat)  # Arm-Side / Glove-Side / Neutral
+        vert = _vert_label(ivb_med, eps=eps_vert)  # Ride / Drop / Neutral
+        # Magnitude adjectives (absolute)
+        mag_side = _mag_label(abs(hb_med), q_abs_hb25, q_abs_hb75)
+        mag_vert = _mag_label(abs(ivb_med), q_abs_ivb25, q_abs_ivb75)
+        # Flavor tags (pick strongest; if none exceed 75th pct, choose highest vs median)
         flavor = []
+        if "whiff_rate" in row and row["whiff_rate"] >= q_wh75:
             flavor.append("Whiff-First")
+        if "gb_rate" in row and row["gb_rate"] >= q_gb75:
             flavor.append("Grounder-First")
+        if "zone_pct" in row and row["zone_pct"] >= q_zn75:
             flavor.append("Strike-Throwing")
         if not flavor:
             diffs = {
+                "Whiff-First": float(row.get("whiff_rate", 0) - q_wh50),
+                "Grounder-First": float(row.get("gb_rate", 0) - q_gb50),
+                "Strike-Throwing": float(row.get("zone_pct", 0) - q_zn50),
             }
             flavor.append(max(diffs, key=diffs.get))
+        # Compose human-readable shape
         side_noun = (
             "Run"
             if side == "Arm-Side"
         vert_noun = (
             "Ride" if vert == "Ride" else ("Drop" if vert == "Drop" else "Ride/Drop")
         )
+        # If Neutral on either axis, simplify the phrase
+        if side == "Neutral" and vert == "Neutral":
+            shape = "Neutral • Moderate Run/Sweep, Moderate Ride/Drop"
+        elif side == "Neutral":
+            shape = f"{vert} • Moderate Run/Sweep, {mag_vert} {vert_noun}"
+        elif vert == "Neutral":
+            shape = f"{side} • {mag_side} {side_noun}, Moderate Ride/Drop"
+        else:
+            shape = f"{side} • {mag_side} {side_noun}, {mag_vert} {vert_noun}"
+        tags[int(c)] = f"{prefix}{shape} • " + " / ".join(flavor)
+    return tags