# Clustering and nearest-neighbour helpers built on the ARCH_FEATURES columns.
from __future__ import annotations

import pandas as pd
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

# Columns that are standardized and fed to both KMeans and NearestNeighbors.
ARCH_FEATURES = [
    "velo",
    "ivb_in",
    "hb_as_in",
    "rel_height",
    "rel_side",
    "spin",
    "csw",
    "whiff_rate",
    "gb_rate",
    "zone_pct",
]

# Distance (in standardized feature space) at or below which a neighbour that
# also shares the query's index label is treated as the query row itself.
_SELF_MATCH_TOL = 1e-6


def fit_kmeans(df_feat: pd.DataFrame, k: int = 8, random_state: int = 42):
    """Standardize ARCH_FEATURES, cluster the rows and build a neighbour index.

    Args:
        df_feat: Frame containing every column listed in ``ARCH_FEATURES``.
            Rows with a missing value in any of those columns are dropped.
        k: Number of k-means clusters.
        random_state: Seed forwarded to ``KMeans``.

    Returns:
        A tuple ``(df, scaler, km, nn)`` where ``df`` is a copy of the complete
        rows with an added integer ``"cluster"`` column, ``scaler`` is the
        fitted ``StandardScaler``, ``km`` the fitted ``KMeans`` and ``nn`` a
        ``NearestNeighbors`` index fitted on the scaled features. Row ``i`` of
        ``df`` (positionally) corresponds to sample ``i`` of ``nn``.

    Raises:
        KeyError: If ``df_feat`` lacks one of the ``ARCH_FEATURES`` columns.
        ValueError: If no row has a complete set of features, or if
            scikit-learn rejects the input (e.g. fewer rows than clusters).
    """
    df = df_feat.dropna(subset=ARCH_FEATURES).copy()
    if df.empty:
        raise ValueError(
            "no rows left after dropping missing ARCH_FEATURES values"
        )

    features = df[ARCH_FEATURES].to_numpy(dtype=float)
    scaler = StandardScaler()
    scaled = scaler.fit_transform(features)

    km = KMeans(n_clusters=k, n_init=20, random_state=random_state)
    df["cluster"] = km.fit_predict(scaled)

    # n_neighbors here is only the default; nearest_comps passes its own value.
    nn = NearestNeighbors(n_neighbors=6, metric="euclidean")
    nn.fit(scaled)
    return df, scaler, km, nn


def nearest_comps(
    row: pd.Series, df_fit: pd.DataFrame, scaler, nn, within_pitch_type=True, k=6
):
    """Return up to ``k - 1`` rows of ``df_fit`` closest to ``row``.

    Closeness is Euclidean distance over the scaled ``ARCH_FEATURES``. The
    query row itself is excluded when it is found among the neighbours (same
    index label and near-zero distance), so a row taken from ``df_fit`` and a
    row from elsewhere both yield up to ``k - 1`` genuine comparisons.

    Args:
        row: Query record; must provide every ``ARCH_FEATURES`` value and,
            when ``within_pitch_type`` is true, ``"pitch_type"``.
        df_fit: The frame returned by ``fit_kmeans``; it must be positionally
            aligned with the samples ``nn`` was fitted on.
        scaler: Fitted scaler exposing ``transform``.
        nn: Fitted neighbour index exposing ``kneighbors``.
        within_pitch_type: If true, only rows whose ``"pitch_type"`` equals
            the query's are returned.
        k: Neighbourhood size; at most ``k - 1`` rows are returned.

    Returns:
        A DataFrame of the nearest rows, nearest first, restricted to a fixed
        set of display columns. The cluster column is ``"cluster_name"`` when
        ``df_fit`` has it, otherwise the ``"cluster"`` column that
        ``fit_kmeans`` creates.

    Raises:
        ValueError: If ``k`` is smaller than 1.
        KeyError: If a required column is missing from ``row`` or ``df_fit``.
    """
    if k < 1:
        raise ValueError(f"k must be >= 1, got {k}")

    n_fit = len(df_fit)
    # When filtering by pitch type, rank every fitted row first: asking for
    # only k neighbours and filtering afterwards can leave few or no comps.
    # Otherwise clamp k so a small fitted set does not make kneighbors raise.
    n_query = n_fit if within_pitch_type else min(k, n_fit)

    query = row[ARCH_FEATURES].to_numpy(dtype=float).reshape(1, -1)
    dists, idxs = nn.kneighbors(scaler.transform(query), n_neighbors=n_query)
    comps = df_fit.iloc[idxs[0]].copy()

    # A query supplied as a neighbour search input is returned as its own
    # nearest neighbour; drop it so it does not use up one of the k - 1 slots.
    if row.name is not None:
        is_self = (dists[0] <= _SELF_MATCH_TOL) & comps.index.isin([row.name])
        comps = comps.loc[~is_self]

    if within_pitch_type:
        comps = comps[comps["pitch_type"] == row["pitch_type"]]

    # fit_kmeans only adds "cluster"; "cluster_name" exists only if the caller
    # attached readable labels, so fall back rather than raise KeyError.
    label_col = "cluster_name" if "cluster_name" in comps.columns else "cluster"
    cols = [
        "player_name",
        "pitch_type",
        "p_throws",
        "velo",
        "ivb_in",
        "hb_as_in",
        "whiff_rate",
        "gb_rate",
        label_col,
    ]
    return comps[cols].head(k - 1)