NBA-Fantasy-Game / scripts /compile_model.py
Hatmanstack
Refactor app with security fixes, error handling, and type safety
6424951
#!/usr/bin/env python3
"""NBA game winner prediction model training script.
This script trains a neural network to predict game winners based on
team statistics. It uses RandomizedSearchCV to find optimal hyperparameters.
Usage:
python scripts/compile_model.py
"""
import logging
from pathlib import Path
import numpy as np
import pandas as pd
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.losses import BinaryCrossentropy
# Root-logger setup: INFO and above, each record stamped with time and level.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
# Module-level logger for this script.
logger = logging.getLogger(__name__)
# Input data files and the trained-model artifact. All three are relative
# paths, so they resolve against the current working directory.
ROSTER_FILE = Path("player_stats.txt")
SCHEDULE_FILE = Path("schedule.txt")
OUTPUT_MODEL = Path("winner.keras")

# Roster columns kept for modelling. "TEAM" is the lookup key and must stay
# first: create_stats() strips the leading column of every row before the
# remaining values are used as features.
FEATURE_COLS: list[str] = [
    "TEAM",
    "PTS/G", "ORB", "DRB", "AST", "STL",
    "BLK", "TOV", "3P%", "FT%", "2P",
]
# Hyperparameter search space: candidate values sampled by the randomized
# search in train_model().
OPTIMIZERS: list[str] = [
    "SGD", "RMSprop", "Adagrad", "Adadelta", "Adam", "Adamax", "Nadam",
]
INITIALIZERS: list[str] = [
    "uniform", "lecun_uniform", "normal", "zero",
    "glorot_normal", "glorot_uniform", "he_normal", "he_uniform",
]
EPOCHS: list[int] = [500, 1000, 1500]
BATCH_SIZES: list[int] = [50, 100, 200]
def create_stats(
    roster: pd.DataFrame, schedule: pd.DataFrame
) -> list[np.ndarray]:
    """Create feature arrays from roster and schedule data.

    For every game in ``schedule`` the values of all roster rows belonging to
    the home team are flattened row by row (in roster order), followed by the
    rows of the visiting team. The leading ``TEAM`` value of each row is
    dropped, and NaN values are replaced with 0.

    Args:
        roster: DataFrame with player statistics; must contain every column
            listed in ``FEATURE_COLS``.
        schedule: DataFrame with game schedule; must contain the
            ``Home/Neutral`` and ``Visitor/Neutral`` columns.

    Returns:
        List of numpy arrays, one per game, with the home team's stats
        followed by the away team's stats. A team that has no rows in the
        roster contributes no values, so that game's array is shorter.

    Raises:
        KeyError: If a required column is missing from either DataFrame.
    """
    new_roster = roster[FEATURE_COLS]
    team_names = new_roster["TEAM"]

    # Each team appears in many games; flatten its rows once and reuse the
    # result instead of re-filtering the whole roster for every game.
    vectors: dict = {}

    def stats_for(team) -> list:
        """Return the flattened stat values for ``team``, without the name."""
        if team not in vectors:
            rows = new_roster[team_names == team].values.tolist()
            # row[0] is the team name (first entry of FEATURE_COLS).
            vectors[team] = [value for row in rows for value in row[1:]]
        return vectors[team]

    home_teams = schedule["Home/Neutral"]
    away_teams = schedule["Visitor/Neutral"]

    # List concatenation builds a new list per game, so the cached per-team
    # vectors are never mutated.
    return [
        np.nan_to_num(np.array(stats_for(home) + stats_for(away)), copy=False)
        for home, away in zip(home_teams, away_teams)
    ]
def create_model(
    optimizer: str = "rmsprop",
    init: str = "glorot_uniform",
    input_dim: int = 100,
) -> keras.Model:
    """Create the neural network model architecture.

    The network is a feed-forward binary classifier: two hidden ReLU layers
    (50 and 64 units) followed by a single sigmoid output unit. It is compiled
    with binary cross-entropy loss and the accuracy metric.

    Args:
        optimizer: Optimizer name passed to ``model.compile``.
        init: Weight initializer name used for the two hidden layers. The
            output layer keeps the Keras default initializer.
        input_dim: Number of input features per game. The default of 100
            preserves the previously hard-coded width (with the 10 stat
            columns in FEATURE_COLS this presumably means 5 roster rows for
            each of the 2 teams -- confirm against the roster data).

    Returns:
        Compiled Keras model
    """
    inputs = keras.Input(shape=(input_dim,))
    hidden = layers.Dense(50, activation="relu", kernel_initializer=init)(inputs)
    hidden = layers.Dense(64, activation="relu", kernel_initializer=init)(hidden)
    # Sigmoid output yields a probability, hence from_logits=False below.
    outputs = layers.Dense(1, activation="sigmoid")(hidden)

    model = keras.Model(inputs=inputs, outputs=outputs, name="nba_model")
    model.compile(
        loss=BinaryCrossentropy(from_logits=False),
        optimizer=optimizer,
        metrics=["accuracy"],
    )
    return model
def train_model(
    x_train: np.ndarray,
    y_train: np.ndarray,
    x_test: np.ndarray,
    y_test: np.ndarray,
    n_iterations: int = 100,
) -> tuple[keras.Model, dict, float]:
    """Run a randomized hyperparameter search and evaluate the winner.

    A ``KerasClassifier`` wrapping ``create_model`` is tuned with
    ``RandomizedSearchCV`` (default cross-validation settings) over the
    optimizer, epoch count, batch size and weight initializer. The best
    estimator found is then scored on the held-out test data.

    Args:
        x_train: Training features
        y_train: Training labels
        x_test: Test features
        y_test: Test labels
        n_iterations: Number of parameter settings sampled by the search

    Returns:
        Tuple of (best_model, best_params, test_accuracy), where best_model
        is the underlying Keras model of the best estimator.
    """
    search_space = {
        "optimizer": OPTIMIZERS,
        "epochs": EPOCHS,
        "batch_size": BATCH_SIZES,
        "init": INITIALIZERS,
    }
    classifier = KerasClassifier(
        model=create_model, verbose=0, init="glorot_uniform"
    )

    logger.info(f"Starting randomized search with {n_iterations} iterations")
    # fit() returns the fitted search object itself.
    search = RandomizedSearchCV(
        estimator=classifier,
        param_distributions=search_space,
        n_iter=n_iterations,
        verbose=3,
    ).fit(x_train, y_train)

    winner = search.best_estimator_
    winning_params = search.best_params_
    accuracy = winner.score(x_test, y_test)

    # Hand back the raw Keras model so the caller can save it directly.
    return winner.model_, winning_params, accuracy
def main() -> None:
    """Main training pipeline.

    Loads the roster and schedule files, labels each scored game, builds the
    per-game feature arrays, trains a model via ``train_model`` and saves the
    best model to ``OUTPUT_MODEL``.

    Raises:
        FileNotFoundError: If the roster or schedule file does not exist.
        ValueError: If the games do not all produce feature arrays of the
            same length.
    """
    logger.info("Loading data files")
    if not ROSTER_FILE.exists():
        logger.error(f"Roster file not found: {ROSTER_FILE}")
        raise FileNotFoundError(f"Missing {ROSTER_FILE}")
    if not SCHEDULE_FILE.exists():
        logger.error(f"Schedule file not found: {SCHEDULE_FILE}")
        raise FileNotFoundError(f"Missing {SCHEDULE_FILE}")

    roster = pd.read_csv(ROSTER_FILE, delimiter=",")
    schedule = pd.read_csv(SCHEDULE_FILE, delimiter=",")
    logger.info(f"Loaded {len(roster)} players and {len(schedule)} games")

    # Games without a final score (e.g. not yet played) have no outcome to
    # learn from. Comparing NaN scores is always False, which would silently
    # label such games as class 1, so drop them first.
    scored = schedule["PTS"].notna() & schedule["PTS.1"].notna()
    if not scored.all():
        logger.warning(
            f"Dropping {int((~scored).sum())} games without a final score"
        )
        schedule = schedule[scored].copy()

    # Create target variable: 0 when PTS > PTS.1, otherwise 1. The intended
    # meaning is 0 = home wins, 1 = away wins, which holds only if "PTS" is
    # the home team's score.
    # NOTE(review): in Basketball-Reference style schedule exports the first
    # PTS column belongs to the visitor -- confirm the column order in
    # schedule.txt; if so, the labels are inverted relative to that intent.
    schedule["winner"] = np.where(schedule["PTS"] > schedule["PTS.1"], 0, 1)

    # Create feature arrays
    logger.info("Creating feature arrays")
    features = create_stats(roster, schedule)
    # Every game must yield the same number of values, otherwise the rows
    # cannot be stacked into a 2-D feature matrix.
    widths = {len(row) for row in features}
    if len(widths) > 1:
        raise ValueError(
            f"Inconsistent feature lengths across games: {sorted(widths)}"
        )
    X = np.array(features)
    y = np.array(schedule["winner"])
    logger.info(f"Feature shape: {X.shape}, Target shape: {y.shape}")

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    logger.info(f"Train size: {len(X_train)}, Test size: {len(X_test)}")

    # Train model
    best_model, best_params, test_accuracy = train_model(
        X_train, y_train, X_test, y_test
    )

    # Save model
    logger.info(f"Saving model to {OUTPUT_MODEL}")
    best_model.save(OUTPUT_MODEL)
    logger.info(f"Best parameters: {best_params}")
    logger.info(f"Test accuracy: {test_accuracy:.4f}")


if __name__ == "__main__":
    main()