pitch_dash / src /model.py
rsm-roguchi
Initial clean commit (no binaries, no venv)
c75151e
raw
history blame
1.41 kB
from __future__ import annotations
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
# Feature columns, in the order they are stacked into the model matrix.
# fit_kmeans fits the scaler on df[ARCH_FEATURES] and nearest_comps transforms
# row[ARCH_FEATURES] with that same scaler, so the order here must stay stable
# between fitting and querying.
ARCH_FEATURES = [
    "velo", "ivb_in", "hb_as_in",
    "rel_height", "rel_side", "spin",
    "csw", "whiff_rate", "gb_rate", "zone_pct",
]
def fit_kmeans(df_feat: pd.DataFrame, k: int = 8, random_state: int = 42):
    """Cluster rows on the standardised ``ARCH_FEATURES`` and index them.

    Rows with a missing value in any ``ARCH_FEATURES`` column are dropped
    first. The remaining feature matrix is standardised, and both the KMeans
    model and the nearest-neighbour index are fit on that standardised matrix.

    Args:
        df_feat: Frame containing every column listed in ``ARCH_FEATURES``.
        k: Number of KMeans clusters.
        random_state: Seed forwarded to KMeans.

    Returns:
        A 4-tuple ``(df, scaler, km, nn)``:
        a copy of the retained rows with an added ``cluster`` label column,
        the fitted ``StandardScaler``, the fitted ``KMeans`` model, and a
        Euclidean ``NearestNeighbors`` index. Position ``i`` in the index
        corresponds to row ``i`` (positional, not label) of the returned frame.
    """
    complete_rows = df_feat.dropna(subset=ARCH_FEATURES).copy()

    scaler = StandardScaler()
    scaled = scaler.fit_transform(complete_rows[ARCH_FEATURES].to_numpy())

    km = KMeans(n_clusters=k, n_init=20, random_state=random_state)
    complete_rows["cluster"] = km.fit_predict(scaled)

    # The default neighbour count set here can be overridden per query.
    nn = NearestNeighbors(n_neighbors=6, metric="euclidean")
    nn.fit(scaled)

    return complete_rows, scaler, km, nn
def nearest_comps(
    row: pd.Series, df_fit: pd.DataFrame, scaler, nn, within_pitch_type=True, k=6
):
    """Return the fitted rows closest to ``row`` in scaled feature space.

    Args:
        row: Query record. Must carry every ``ARCH_FEATURES`` value and, when
            ``within_pitch_type`` is true, a ``pitch_type`` value.
        df_fit: Frame whose positional row order matches the data ``nn`` was
            fit on (the frame returned by ``fit_kmeans``). It must contain
            all of the display columns selected below; note that
            ``cluster_name`` is not added by ``fit_kmeans`` and has to be
            attached by the caller beforehand.
        scaler: Fitted scaler exposing ``transform``.
        nn: Fitted neighbour index exposing ``kneighbors``.
        within_pitch_type: If true, keep only comps whose ``pitch_type``
            equals the query's.
        k: Neighbour budget; at most ``k - 1`` rows are returned.

    Returns:
        DataFrame of up to ``k - 1`` rows, nearest first, restricted to the
        display columns.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")

    display_cols = [
        "player_name",
        "pitch_type",
        "p_throws",
        "velo",
        "ivb_in",
        "hb_as_in",
        "whiff_rate",
        "gb_rate",
        "cluster_name",
    ]

    query = scaler.transform(row[ARCH_FEATURES].values.reshape(1, -1))

    n_fit = getattr(nn, "n_samples_fit_", len(df_fit))
    if within_pitch_type:
        # Filtering only the k nearest rows can leave few or no comps when the
        # closest neighbours are other pitch types. Rank every fitted row so
        # the pitch-type filter still has k - 1 candidates to pick from.
        n_query = n_fit
    else:
        # Never ask for more neighbours than the index holds.
        n_query = min(k, n_fit)

    _, idxs = nn.kneighbors(query, n_neighbors=n_query)
    comps = df_fit.iloc[idxs[0]]

    if within_pitch_type:
        comps = comps[comps["pitch_type"] == row["pitch_type"]]

    # NOTE(review): if `row` is itself one of the fitted rows it comes back as
    # its own nearest comp; `k - 1` suggests the self-match was meant to be
    # excluded -- confirm with callers before changing.
    return comps[display_cols].head(k - 1).copy()