Spaces:

Hatman
/

NBA-Fantasy-Game

Sleeping

NBA-Fantasy-Game / scripts / compile_model.py

Hatmanstack

Refactor app with security fixes, error handling, and type safety

6424951 about 1 month ago

6.4 kB

	#!/usr/bin/env python3
	"""NBA game winner prediction model training script.

	This script trains a neural network to predict game winners based on
	team statistics. It uses RandomizedSearchCV to find optimal hyperparameters.

	Usage:
	python scripts/compile_model.py
	"""

	import logging
	from pathlib import Path

	import numpy as np
	import pandas as pd
	from scikeras.wrappers import KerasClassifier
	from sklearn.model_selection import RandomizedSearchCV, train_test_split
	from tensorflow import keras
	from tensorflow.keras import layers
	from tensorflow.keras.losses import BinaryCrossentropy

	# Root logging setup: INFO level, each record prefixed with time and level.
	logging.basicConfig(
	    format="%(asctime)s - %(levelname)s - %(message)s",
	    level=logging.INFO,
	)
	logger = logging.getLogger(__name__)

	# Input data files and the output model file. All three are relative paths,
	# so they resolve against the current working directory.
	ROSTER_FILE: Path = Path("player_stats.txt")
	SCHEDULE_FILE: Path = Path("schedule.txt")
	OUTPUT_MODEL: Path = Path("winner.keras")

	# Roster columns read when building features. "TEAM" comes first because it
	# is the lookup key; the remaining ten are the per-player statistics.
	FEATURE_COLS: list[str] = [
	    "TEAM",
	    "PTS/G", "ORB", "DRB", "AST", "STL",
	    "BLK", "TOV", "3P%", "FT%", "2P",
	]

	# Candidate values the randomized hyperparameter search draws from.
	OPTIMIZERS: list[str] = [
	    "SGD", "RMSprop", "Adagrad", "Adadelta", "Adam", "Adamax", "Nadam",
	]
	INITIALIZERS: list[str] = [
	    "uniform", "lecun_uniform", "normal", "zero",
	    "glorot_normal", "glorot_uniform", "he_normal", "he_uniform",
	]
	EPOCHS: list[int] = [500, 1000, 1500]
	BATCH_SIZES: list[int] = [50, 100, 200]


	def create_stats(
	    roster: pd.DataFrame,
	    schedule: pd.DataFrame,
	    feature_cols: "list[str] | None" = None,
	) -> list[np.ndarray]:
	    """Create one flat feature vector per scheduled game.

	    Every roster row belonging to a team contributes its statistic columns
	    (all of ``feature_cols`` except ``"TEAM"``), in roster order. A game's
	    vector is the home team's flattened rows followed by the away team's,
	    passed through ``np.nan_to_num`` so that NaN becomes 0.

	    Args:
	        roster: DataFrame with player statistics; must contain ``"TEAM"``
	            and the statistic columns.
	        schedule: DataFrame with ``"Home/Neutral"`` and
	            ``"Visitor/Neutral"`` team columns, one row per game.
	        feature_cols: Roster columns to use. Defaults to the module-level
	            ``FEATURE_COLS``.

	    Returns:
	        List of 1-D float arrays, one per game, in schedule order. A team
	        with no roster rows contributes nothing, so that game's vector is
	        shorter than the others.

	    Raises:
	        KeyError: If a required column is missing.
	        ValueError: If a statistic column holds non-numeric data.
	    """
	    columns = FEATURE_COLS if feature_cols is None else feature_cols
	    # Drop the key column by name rather than assuming it sits at index 0.
	    stat_cols = [col for col in columns if col != "TEAM"]

	    # Flatten each team's rows once instead of re-filtering the whole roster
	    # twice for every game on the schedule.
	    team_vectors = {
	        team: np.nan_to_num(rows[stat_cols].to_numpy(dtype=float).ravel())
	        for team, rows in roster.groupby("TEAM", sort=False)
	    }
	    no_rows = np.empty(0, dtype=float)

	    return [
	        np.concatenate(
	            (team_vectors.get(home, no_rows), team_vectors.get(away, no_rows))
	        )
	        for home, away in zip(
	            schedule["Home/Neutral"], schedule["Visitor/Neutral"]
	        )
	    ]


	def create_model(
	    optimizer: str = "rmsprop", init: str = "glorot_uniform"
	) -> keras.Model:
	    """Build and compile the feed-forward network used for prediction.

	    The network takes 100 input features, passes them through two ReLU
	    hidden layers (50 then 64 units) and ends in a single sigmoid unit.
	    It is compiled with binary cross-entropy loss and an accuracy metric.

	    Args:
	        optimizer: Optimizer name handed to ``compile``.
	        init: Kernel initializer name for the two hidden layers. The output
	            layer keeps the Keras default initializer.

	    Returns:
	        Compiled Keras model named ``"nba_model"``.
	    """
	    game_features = keras.Input(shape=(100,))

	    hidden = game_features
	    for units in (50, 64):
	        hidden = layers.Dense(
	            units, activation="relu", kernel_initializer=init
	        )(hidden)

	    prediction = layers.Dense(1, activation="sigmoid")(hidden)

	    network = keras.Model(
	        inputs=game_features, outputs=prediction, name="nba_model"
	    )
	    network.compile(
	        optimizer=optimizer,
	        loss=BinaryCrossentropy(from_logits=False),
	        metrics=["accuracy"],
	    )
	    return network


	def train_model(
	    x_train: np.ndarray,
	    y_train: np.ndarray,
	    x_test: np.ndarray,
	    y_test: np.ndarray,
	    n_iterations: int = 100,
	    random_state: int = 42,
	) -> tuple[keras.Model, dict, float]:
	    """Train the model with a randomized hyperparameter search.

	    Wraps ``create_model`` in a ``KerasClassifier`` and lets
	    ``RandomizedSearchCV`` (with scikit-learn's default cross-validation)
	    sample optimizer, epoch count, batch size and initializer from the
	    module-level candidate lists. The refitted best estimator is then
	    scored on the held-out test data.

	    Args:
	        x_train: Training features
	        y_train: Training labels
	        x_test: Test features
	        y_test: Test labels
	        n_iterations: Number of random search iterations
	        random_state: Seed for sampling hyperparameter combinations, so the
	            same candidates are tried on every run. It does not seed the
	            network training itself.

	    Returns:
	        Tuple of (best_model, best_params, test_accuracy)
	    """
	    classifier = KerasClassifier(
	        model=create_model,
	        verbose=0,
	        init="glorot_uniform",
	    )

	    search_space = {
	        "optimizer": OPTIMIZERS,
	        "epochs": EPOCHS,
	        "batch_size": BATCH_SIZES,
	        "init": INITIALIZERS,
	    }

	    logger.info("Starting randomized search with %s iterations", n_iterations)

	    search = RandomizedSearchCV(
	        estimator=classifier,
	        param_distributions=search_space,
	        n_iter=n_iterations,
	        random_state=random_state,
	        verbose=3,
	    )
	    search.fit(x_train, y_train)

	    best_estimator = search.best_estimator_
	    test_accuracy = best_estimator.score(x_test, y_test)

	    # ``model_`` is the fitted Keras model held by the scikeras wrapper.
	    return best_estimator.model_, search.best_params_, test_accuracy


	def main() -> None:
	    """Run the training pipeline: load data, build features, train, save.

	    Reads ``ROSTER_FILE`` and ``SCHEDULE_FILE`` as comma-separated files,
	    derives a binary label per game from the ``PTS`` / ``PTS.1`` columns,
	    builds feature vectors with ``create_stats``, holds out 20% of the
	    games for testing, runs ``train_model`` and saves the best network to
	    ``OUTPUT_MODEL``.

	    Raises:
	        FileNotFoundError: If either input file does not exist.
	        ValueError: If the games' feature vectors differ in length.
	    """
	    logger.info("Loading data files")

	    for label, path in (("Roster", ROSTER_FILE), ("Schedule", SCHEDULE_FILE)):
	        if not path.exists():
	            logger.error("%s file not found: %s", label, path)
	            raise FileNotFoundError(f"Missing {path}")

	    roster = pd.read_csv(ROSTER_FILE, delimiter=",")
	    schedule = pd.read_csv(SCHEDULE_FILE, delimiter=",")

	    logger.info("Loaded %s players and %s games", len(roster), len(schedule))

	    # A game with a missing score has no outcome yet. Comparing NaN scores
	    # is always False, which would silently label such a game 1, so drop it.
	    has_score = schedule["PTS"].notna() & schedule["PTS.1"].notna()
	    if not has_score.all():
	        logger.warning(
	            "Skipping %s games without a final score", int((~has_score).sum())
	        )
	        schedule = schedule[has_score]

	    # Target variable: 0 when PTS > PTS.1, otherwise 1. The original comment
	    # read "0 = home wins, 1 = away wins".
	    # NOTE(review): confirm which side each column belongs to. If the file
	    # lists the visitor's PTS first (as Basketball-Reference-style exports
	    # do), the labels mean the opposite of that comment.
	    y = np.where(schedule["PTS"] > schedule["PTS.1"], 0, 1)

	    # Create feature arrays
	    logger.info("Creating feature arrays")
	    game_vectors = create_stats(roster, schedule)

	    # A team with missing or extra roster rows yields a vector of a different
	    # length; fail with a clear message instead of building a ragged array.
	    vector_lengths = {len(vector) for vector in game_vectors}
	    if len(vector_lengths) > 1:
	        raise ValueError(
	            "Games have feature vectors of different lengths "
	            f"{sorted(vector_lengths)}; check that every scheduled team has "
	            "the same number of roster rows"
	        )
	    X = np.array(game_vectors)

	    logger.info("Feature shape: %s, Target shape: %s", X.shape, y.shape)

	    # Split data
	    X_train, X_test, y_train, y_test = train_test_split(
	        X, y, test_size=0.2, random_state=42
	    )

	    logger.info("Train size: %s, Test size: %s", len(X_train), len(X_test))

	    # Train model
	    best_model, best_params, test_accuracy = train_model(
	        X_train, y_train, X_test, y_test
	    )

	    # Save model
	    logger.info("Saving model to %s", OUTPUT_MODEL)
	    best_model.save(OUTPUT_MODEL)

	    logger.info("Best parameters: %s", best_params)
	    logger.info("Test accuracy: %.4f", test_accuracy)

	# Start the pipeline only when this file is run as a script, so importing
	# the module does not kick off training.
	if __name__ == "__main__":
	    main()