File size: 7,076 Bytes
c5c0f3e
58c5edf
c5c0f3e
58c5edf
c5c0f3e
 
58c5edf
c75151e
 
c5c0f3e
c75151e
 
752a595
 
c75151e
 
 
cbe015e
c75151e
c5c0f3e
 
 
 
 
 
c75151e
 
 
 
c5c0f3e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c75151e
 
 
 
 
752a595
c5c0f3e
 
 
 
 
cbe015e
c75151e
 
c5c0f3e
 
 
 
 
c75151e
 
 
c5c0f3e
 
 
 
 
 
 
 
 
 
752a595
 
c5c0f3e
 
752a595
c5c0f3e
 
 
 
 
 
 
 
 
 
c75151e
 
cbe015e
c75151e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c5c0f3e
 
c75151e
 
 
 
 
 
 
 
752a595
c75151e
c5c0f3e
cbe015e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
# app.py
import os, sys
from datetime import datetime

BASE_DIR = os.path.dirname(__file__)
sys.path.append(os.path.join(BASE_DIR, "src"))

import streamlit as st
import pandas as pd

from data import load_statcast, default_window
from featurize import infer_ivb_sign, engineer_pitch_features

# ⬇️ Revert to older API
from model import fit_kmeans, nearest_comps
from tags import xy_cluster_tags
from plots import movement_scatter_xy, radar_quality
from matchups import best_matchups_for_pitcher, ensure_batter_names

try:
    from huggingface_hub import hf_hub_download

    HF_HUB_OK = True
except Exception:
    HF_HUB_OK = False

# Browser-tab title and wide layout first, then the on-page heading.
_PAGE_SETTINGS = {"page_title": "PitchXY (Handedness-Aware)", "layout": "wide"}
st.set_page_config(**_PAGE_SETTINGS)
st.title("⚾ PitchXY — Handedness-Aware Pitch Archetypes & Scouting Cards")


@st.cache_data(show_spinner=False, ttl=24 * 3600)
def load_statcast_cached(start: str, end: str, force: bool = False) -> pd.DataFrame:
    """Fetch Statcast rows for the given window through Streamlit's data cache.

    Thin wrapper around ``load_statcast``; the decorator caches the result
    with the ``ttl`` shown above and without Streamlit's built-in spinner.

    Args:
        start: Window start string (the sidebar asks for ``YYYY-MM-DD``).
        end: Window end string, same format.
        force: Forwarded to ``load_statcast`` as its ``force`` keyword.

    Returns:
        Whatever ``load_statcast`` returns (annotated as a DataFrame).
    """
    statcast_frame = load_statcast(start, end, force=force)
    return statcast_frame


@st.cache_data(show_spinner=False)
def load_sample_fallback() -> pd.DataFrame:
    """Return sample data for when live Statcast data is unavailable.

    Sources are tried in order:

    1. ``<BASE_DIR>/data/sample_statcast.parquet`` on local disk.
    2. The Hugging Face *dataset* repo named by the ``SAMPLE_DATA_REPO``
       environment variable, reading the file named by ``SAMPLE_DATA_FILE``
       (only when ``huggingface_hub`` imported successfully).
    3. An empty frame that carries only the expected column names; callers
       detect this case through ``DataFrame.empty``.

    Returns:
        The sample DataFrame, or an empty DataFrame when no source is
        available.
    """
    default_name = "sample_statcast.parquet"

    bundled_path = os.path.join(BASE_DIR, "data", default_name)
    if os.path.exists(bundled_path):
        return pd.read_parquet(bundled_path)

    repo_id = os.getenv("SAMPLE_DATA_REPO", "").strip()
    # A blank or whitespace-only SAMPLE_DATA_FILE used to be stripped to ""
    # and handed to hf_hub_download as the filename; treat it as "unset".
    file_name = os.getenv("SAMPLE_DATA_FILE", "").strip() or default_name
    if HF_HUB_OK and repo_id:
        downloaded = hf_hub_download(
            repo_id=repo_id, filename=file_name, repo_type="dataset"
        )
        return pd.read_parquet(downloaded)

    # Nothing configured: hand back a schema-only frame so `.empty` is True.
    return pd.DataFrame(
        columns=[
            "game_date",
            "player_name",
            "pitch_type",
            "p_throws",
            "n",
            "velo",
            "ivb_in",
            "hb_as_in",
            "csw",
            "whiff_rate",
            "gb_rate",
            "zone_pct",
            "cluster",
            "cluster_name",
            "x_mvt",
            "y_mvt",
        ]
    )


def safe_load_data(start: str, end: str, force: bool) -> pd.DataFrame:
    """Load live data for the window, dropping back to sample data on trouble.

    Args:
        start: Window start string passed to ``load_statcast_cached``.
        end: Window end string passed to ``load_statcast_cached``.
        force: Passed through to ``load_statcast_cached``.

    Returns:
        The live frame when it is non-empty; otherwise the result of
        ``load_sample_fallback()``. An info banner is shown when the live
        result is empty/None, a warning banner when the live load raised.
    """
    try:
        live = load_statcast_cached(start, end, force)
        has_rows = live is not None and not live.empty
        if has_rows:
            return live
        st.info("No live data returned for that window — showing sample data instead.")
    except Exception as exc:
        # Any failure of the live path is reported and then treated as "no data".
        st.warning(f"Live data failed: {exc}\nUsing sample data instead.")
    return load_sample_fallback()


# Sidebar controls: date window, cluster count and the re-download toggle.
st.sidebar.header("Data Window")
dstart, dend = default_window()
start = st.sidebar.text_input("Start YYYY-MM-DD", dstart)
end = st.sidebar.text_input("End YYYY-MM-DD", dend)
k = st.sidebar.slider("Clusters (k)", 5, 40, 25)
force = st.sidebar.checkbox(
    "Force re-download (discouraged on Spaces)",
    value=False,
)
st.sidebar.caption("Tip: avoid 'Force re-download' on Spaces to keep startup snappy.")

# Load (live first, sample otherwise) and pass the frame through
# ensure_batter_names while the spinner is visible.
with st.spinner("Loading data…"):
    df_raw = ensure_batter_names(safe_load_data(start, end, force))

# Nothing to work with: explain how to provide sample data and end this run.
if df_raw.empty:
    no_data_msg = (
        "No data available (live and sample were both empty). "
        "Upload a small sample file to ./data/sample_statcast.parquet or set "
        "env vars SAMPLE_DATA_REPO + SAMPLE_DATA_FILE to a HF dataset."
    )
    st.warning(no_data_msg)
    st.stop()


@st.cache_data(show_spinner=False)
def _featurize(df_raw_in: pd.DataFrame):
    """Build the per-pitch feature table from the raw frame (cached).

    The IVB sign is inferred from the same raw frame and handed to
    ``engineer_pitch_features`` as its second argument.

    Args:
        df_raw_in: Raw data as loaded for the selected window.

    Returns:
        The result of ``engineer_pitch_features``.
    """
    return engineer_pitch_features(df_raw_in, infer_ivb_sign(df_raw_in))


df_feat = _featurize(df_raw)


# ✅ Cache the fitted artifacts from the older API
@st.cache_resource(show_spinner=False)
def _fit_model(df_feat_in: pd.DataFrame, k_val: int):
    """Fit the k-means artifacts and attach a readable name to every cluster.

    Args:
        df_feat_in: Feature table produced by ``_featurize``.
        k_val: Number of clusters, forwarded to ``fit_kmeans`` as ``k``.

    Returns:
        A 4-tuple ``(frame, scaler, km, nn)``: the frame returned by
        ``fit_kmeans`` (copied, with a ``cluster_name`` column added) followed
        by the other three objects ``fit_kmeans`` returns, unchanged.
    """
    fitted, scaler, km, nn = fit_kmeans(df_feat_in, k=k_val)
    # Look up a display name per cluster id, then label a copy of the frame.
    name_by_cluster = xy_cluster_tags(fitted)
    labelled = fitted.copy()
    labelled["cluster_name"] = labelled["cluster"].map(name_by_cluster)
    return labelled, scaler, km, nn


with st.spinner("Clustering & tagging…"):
    df_fit, scaler, km, nn = _fit_model(df_feat, k)

# Pitcher picker: alphabetical list of the distinct, non-null names.
pitcher_options = sorted(df_fit["player_name"].dropna().unique())
pitcher = st.selectbox("Pitcher", pitcher_options)

# Rows for the chosen pitcher, ordered by pitch type.
is_selected = df_fit["player_name"] == pitcher
df_p = df_fit[is_selected].sort_values("pitch_type")

tab_labels = ["Movement", "Scouting Card", "Comps", "Best Matchups"]
tab1, tab2, tab3, tab4 = st.tabs(tab_labels)

with tab1:
    view = st.radio("View", ["Selected pitcher", "All pitchers"], horizontal=True)
    # Decide heading, data and colour key first so the chart is drawn in one place.
    if view == "Selected pitcher":
        heading, frame, color_by = f"Movement — {pitcher}", df_p, "pitch_type"
    else:
        heading = "Movement — All pitchers (cluster context)"
        frame, color_by = df_fit, "cluster_name"
    st.subheader(heading)
    movement_fig = movement_scatter_xy(frame, color=color_by)
    st.plotly_chart(movement_fig, use_container_width=True)

with tab2:
    # Scouting card: one summary table, then a radar chart per pitch type.
    st.subheader(f"Scouting Card — {pitcher}")
    st.dataframe(
        df_p[
            [
                "pitch_type",
                "p_throws",
                "n",
                "velo",
                "ivb_in",
                "hb_as_in",
                "csw",
                "whiff_rate",
                "gb_rate",
                "zone_pct",
                "cluster_name",
            ]
        ],
        use_container_width=True,
    )
    for _, row in df_p.iterrows():
        pitch_label = row["pitch_type"]
        cluster_label = row["cluster_name"]
        # The heading used to glue pitch type and cluster name together with
        # no separator; use the same " — " separator as the other headings.
        # cluster_name comes from a .map() lookup, so it may be NaN for an
        # unmapped cluster; show just the pitch type instead of "nan".
        if pd.isna(cluster_label):
            st.markdown(f"### {pitch_label}")
        else:
            st.markdown(f"### {pitch_label} — {cluster_label}")
        st.plotly_chart(radar_quality(row), use_container_width=True)

with tab3:
    # One comps table per pitch in the selected pitcher's arsenal.
    for _idx, pitch_row in df_p.iterrows():
        pitch_label = pitch_row["pitch_type"]
        st.markdown(f"#### {pitch_label} comps")
        # nearest_comps is called with the fitted frame, scaler and nn index;
        # within_pitch_type=True and k=6 are passed exactly as before.
        comps = nearest_comps(
            pitch_row,
            df_fit,
            scaler,
            nn,
            within_pitch_type=True,
            k=6,
        )
        st.dataframe(comps, use_container_width=True)

with tab4:
    st.subheader(f"Best Matchups — {pitcher}")

    # Filter and weighting controls, laid out in four columns (last one wider).
    colA, colB, colC, colD = st.columns([1, 1, 1, 2])
    min_pitches = colA.number_input("Min pitches vs batter", 5, 200, 10, step=5)
    top_n = colB.number_input("Top N", 5, 50, 10, step=5)
    w_whiff = colC.slider("Weight: Whiff", 0.0, 1.0, 0.6, 0.05)
    w_gb = colD.slider("Weight: GB on contact", 0.0, 1.0, 0.4, 0.05)

    # Rescale the two weights by their sum; the 1e-6 floor avoids dividing
    # by zero when both sliders sit at 0.
    total_w = max(w_whiff + w_gb, 1e-6)
    w_whiff, w_gb = w_whiff / total_w, w_gb / total_w

    # Rank batters for the selected pitcher with the chosen filters/weights.
    matchup_kwargs = {
        "min_pitches": min_pitches,
        "top_n": int(top_n),
        "w_whiff": float(w_whiff),
        "w_gb": float(w_gb),
    }
    best, worst = best_matchups_for_pitcher(df_raw, pitcher, **matchup_kwargs)

    have_results = not (best.empty and worst.empty)
    if have_results:
        c1, c2 = st.columns(2)
        with c1:
            st.markdown("### ✅ Best (Pitcher-Friendly)")
            st.dataframe(best, use_container_width=True)
        with c2:
            st.markdown("### ⚠️ Tough (Least Pitcher-Friendly)")
            st.dataframe(worst, use_container_width=True)
    else:
        st.info(
            "No batter matchups for this pitcher within the current window / filters."
        )

    st.caption(
        "Score = w_whiff × whiff_rate + w_gb × ground-ball-rate-on-contact. "
        "Adjust weights to emphasize strikeouts vs. weak contact."
    )