Spaces:

roguchi
/

pitch_dash

Sleeping

pitch_dash / src /model.py

rsm-roguchi

Initial clean commit (no binaries, no venv)

c75151e 3 months ago

1.41 kB

	from __future__ import annotations
	import pandas as pd
	from sklearn.preprocessing import StandardScaler
	from sklearn.cluster import KMeans
	from sklearn.neighbors import NearestNeighbors

	# Columns that make up the feature vector used for scaling, clustering and
	# nearest-neighbour lookup in this module. Order matters: it fixes the column
	# order of the matrix handed to the scaler, so the same list must be used for
	# fitting and for querying. Must remain a list (a tuple would be treated as a
	# single key when used to index a DataFrame).
	ARCH_FEATURES: list[str] = [
	    "velo", "ivb_in", "hb_as_in", "rel_height", "rel_side", "spin",
	    "csw", "whiff_rate", "gb_rate", "zone_pct",
	]


	def fit_kmeans(
	    df_feat: pd.DataFrame,
	    k: int = 8,
	    random_state: int = 42,
	    n_neighbors: int = 6,
	):
	    """Standardise the archetype features, cluster them, and build a lookup index.

	    Rows with a missing value in any ``ARCH_FEATURES`` column are dropped
	    before fitting. The input frame is not modified; a copy is returned.

	    Args:
	        df_feat: Frame containing every column named in ``ARCH_FEATURES``.
	        k: Number of KMeans clusters.
	        random_state: Seed passed to KMeans.
	        n_neighbors: Default neighbour count stored on the returned
	            ``NearestNeighbors`` index. Capped at the number of fitted rows
	            so that a default ``kneighbors`` call cannot ask for more
	            neighbours than exist.

	    Returns:
	        A tuple ``(df, scaler, km, nn)``:
	        the filtered copy of ``df_feat`` with a ``cluster`` label column,
	        the fitted ``StandardScaler``, the fitted ``KMeans`` model, and a
	        ``NearestNeighbors`` index fitted on the scaled features. Positional
	        row ``i`` of ``df`` corresponds to sample ``i`` of the index.

	    Raises:
	        ValueError: If fewer than ``k`` complete rows remain after dropping
	            rows with missing feature values.
	    """
	    df = df_feat.dropna(subset=ARCH_FEATURES).copy()

	    n_rows = len(df)
	    if n_rows < k:
	        # Fail early with an actionable message instead of an opaque
	        # scikit-learn error from the scaler or from KMeans.
	        raise ValueError(
	            f"fit_kmeans needs at least k={k} rows with complete "
	            f"ARCH_FEATURES values, got {n_rows}"
	        )

	    X = df[ARCH_FEATURES].to_numpy(dtype=float)
	    scaler = StandardScaler()
	    Xs = scaler.fit_transform(X)

	    km = KMeans(n_clusters=k, n_init=20, random_state=random_state)
	    df["cluster"] = km.fit_predict(Xs)

	    # The index is fitted on the *scaled* matrix, so queries must be passed
	    # through the same scaler before calling kneighbors.
	    nn = NearestNeighbors(n_neighbors=min(n_neighbors, n_rows), metric="euclidean")
	    nn.fit(Xs)
	    return df, scaler, km, nn


	def nearest_comps(
	    row: pd.Series, df_fit: pd.DataFrame, scaler, nn, within_pitch_type=True, k=6
	):
	    """Return the closest comparable rows of ``df_fit`` to ``row``.

	    The query is scaled with ``scaler`` and looked up in ``nn``; neighbour
	    positions are then mapped back onto ``df_fit`` positionally.

	    Args:
	        row: Series holding a value for every name in ``ARCH_FEATURES`` and,
	            when ``within_pitch_type`` is true, a ``"pitch_type"`` entry.
	        df_fit: Frame whose positional row order matches the samples ``nn``
	            was fitted on. It must contain all output columns, including
	            ``"cluster_name"``; ``fit_kmeans`` only adds ``"cluster"``, so the
	            caller has to attach ``"cluster_name"`` beforehand.
	        scaler: Fitted transformer exposing ``transform``.
	        nn: Fitted neighbour index exposing ``kneighbors``.
	        within_pitch_type: Keep only neighbours whose ``"pitch_type"`` equals
	            the query row's.
	        k: Neighbour budget; at most ``k - 1`` rows are returned.

	    Returns:
	        A DataFrame of at most ``k - 1`` rows, nearest first, restricted to
	        the display columns.

	    Raises:
	        ValueError: If ``k`` is smaller than 1.
	        KeyError: If ``df_fit`` lacks one of the output columns.
	    """
	    if k < 1:
	        raise ValueError(f"k must be >= 1, got {k}")

	    # A row sliced out of a mixed-dtype frame is an object Series; cast so the
	    # scaler always receives a float matrix.
	    query = row[ARCH_FEATURES].to_numpy(dtype=float).reshape(1, -1)
	    xq = scaler.transform(query)

	    n_fit = getattr(nn, "n_samples_fit_", len(df_fit))
	    # When filtering by pitch type, rank *all* fitted samples first: filtering
	    # only the k overall-nearest rows can leave few or no comps even though
	    # plenty of same-type rows exist. Otherwise just clamp k to what is
	    # available so kneighbors does not raise on small fits.
	    n_query = n_fit if within_pitch_type else min(k, n_fit)
	    _, idxs = nn.kneighbors(xq, n_neighbors=n_query)

	    comps = df_fit.iloc[idxs[0]]
	    if within_pitch_type:
	        comps = comps[comps["pitch_type"] == row["pitch_type"]]

	    cols = [
	        "player_name",
	        "pitch_type",
	        "p_throws",
	        "velo",
	        "ivb_in",
	        "hb_as_in",
	        "whiff_rate",
	        "gb_rate",
	        "cluster_name",
	    ]
	    # NOTE(review): if `row` is itself one of the fitted samples it is its own
	    # nearest neighbour and stays in the output; head(k - 1) trims the far end,
	    # not the query row. Confirm whether the query row should be excluded.
	    return comps[cols].head(k - 1).copy()