Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """NBA game winner prediction model training script. | |
| This script trains a neural network to predict game winners based on | |
| team statistics. It uses RandomizedSearchCV to find optimal hyperparameters. | |
| Usage: | |
| python scripts/compile_model.py | |
| """ | |
| import logging | |
| from pathlib import Path | |
| import numpy as np | |
| import pandas as pd | |
| from scikeras.wrappers import KerasClassifier | |
| from sklearn.model_selection import RandomizedSearchCV, train_test_split | |
| from tensorflow import keras | |
| from tensorflow.keras import layers | |
| from tensorflow.keras.losses import BinaryCrossentropy | |
# ---------------------------------------------------------------------------
# Module-level configuration
# ---------------------------------------------------------------------------

# Logging: timestamped INFO-level messages.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# Input/output locations, resolved relative to the working directory.
ROSTER_FILE = Path("player_stats.txt")
SCHEDULE_FILE = Path("schedule.txt")
OUTPUT_MODEL = Path("winner.keras")

# Per-player columns read from the roster file. "TEAM" is kept first: it is
# used to group players by team and is stripped from the numeric features.
FEATURE_COLS: list[str] = [
    "TEAM",
    "PTS/G",
    "ORB",
    "DRB",
    "AST",
    "STL",
    "BLK",
    "TOV",
    "3P%",
    "FT%",
    "2P",
]

# Candidate values explored by the randomized hyperparameter search.
OPTIMIZERS: list[str] = [
    "SGD",
    "RMSprop",
    "Adagrad",
    "Adadelta",
    "Adam",
    "Adamax",
    "Nadam",
]
INITIALIZERS: list[str] = [
    "uniform",
    "lecun_uniform",
    "normal",
    "zero",
    "glorot_normal",
    "glorot_uniform",
    "he_normal",
    "he_uniform",
]
EPOCHS: list[int] = [500, 1000, 1500]
BATCH_SIZES: list[int] = [50, 100, 200]
| def create_stats( | |
| roster: pd.DataFrame, schedule: pd.DataFrame | |
| ) -> list[np.ndarray]: | |
| """Create feature arrays from roster and schedule data. | |
| Args: | |
| roster: DataFrame with player statistics | |
| schedule: DataFrame with game schedule and scores | |
| Returns: | |
| List of numpy arrays, one per game with combined team stats | |
| """ | |
| home_stats: list[list] = [] | |
| away_stats: list[list] = [] | |
| features: list[np.ndarray] = [] | |
| new_roster = roster[FEATURE_COLS] | |
| # Get stats for each team in each game | |
| for team in schedule["Home/Neutral"]: | |
| home_stats.append(new_roster[new_roster["TEAM"] == team].values.tolist()) | |
| for team in schedule["Visitor/Neutral"]: | |
| away_stats.append(new_roster[new_roster["TEAM"] == team].values.tolist()) | |
| # Combine home and away stats for each game | |
| for i in range(len(home_stats)): | |
| arr: list[float] = [] | |
| for j in range(len(home_stats[i])): | |
| del home_stats[i][j][0] # Remove team name | |
| arr.extend(home_stats[i][j]) | |
| for j in range(len(away_stats[i])): | |
| del away_stats[i][j][0] # Remove team name | |
| arr.extend(away_stats[i][j]) | |
| # Handle NaN values | |
| features.append(np.nan_to_num(np.array(arr), copy=False)) | |
| return features | |
def create_model(
    optimizer: str = "rmsprop",
    init: str = "glorot_uniform",
    input_dim: int = 100,
) -> keras.Model:
    """Create and compile the winner-prediction network.

    Two ReLU hidden layers (50 and 64 units) feeding a single sigmoid
    output trained with binary cross-entropy.

    Args:
        optimizer: Keras optimizer name to compile with.
        init: Weight-initializer name for the hidden layers.
        input_dim: Length of the flattened per-game feature vector.
            Defaults to 100 for backward compatibility with the previous
            hard-coded shape; pass the real feature length (e.g. X.shape[1])
            when it differs.

    Returns:
        Compiled Keras model; the sigmoid output is the predicted
        probability of label 1 (away win, per the training target).
    """
    inputs = keras.Input(shape=(input_dim,))
    x = layers.Dense(50, activation="relu", kernel_initializer=init)(inputs)
    x = layers.Dense(64, activation="relu", kernel_initializer=init)(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)
    model = keras.Model(inputs=inputs, outputs=outputs, name="nba_model")
    model.compile(
        loss=BinaryCrossentropy(from_logits=False),
        optimizer=optimizer,
        metrics=["accuracy"],
    )
    return model
def train_model(
    x_train: np.ndarray,
    y_train: np.ndarray,
    x_test: np.ndarray,
    y_test: np.ndarray,
    n_iterations: int = 100,
) -> tuple[keras.Model, dict, float]:
    """Run a randomized hyperparameter search and score the best model.

    Args:
        x_train: Training features.
        y_train: Training labels.
        x_test: Held-out test features.
        y_test: Held-out test labels.
        n_iterations: Number of parameter settings sampled by the search.

    Returns:
        Tuple of (best underlying Keras model, best parameter dict,
        accuracy of the best estimator on the test split).
    """
    estimator = KerasClassifier(
        model=create_model,
        verbose=0,
        init="glorot_uniform",
    )
    # Search space drawn from the module-level candidate lists.
    search_space = {
        "optimizer": OPTIMIZERS,
        "epochs": EPOCHS,
        "batch_size": BATCH_SIZES,
        "init": INITIALIZERS,
    }
    logger.info(f"Starting randomized search with {n_iterations} iterations")
    search = RandomizedSearchCV(
        estimator=estimator,
        param_distributions=search_space,
        n_iter=n_iterations,
        verbose=3,
    ).fit(x_train, y_train)

    best = search.best_estimator_
    return best.model_, search.best_params_, best.score(x_test, y_test)
def main() -> None:
    """Main training pipeline: load data, train, evaluate, save.

    Raises:
        FileNotFoundError: If either required input data file is missing.
    """
    logger.info("Loading data files")
    if not ROSTER_FILE.exists():
        logger.error(f"Roster file not found: {ROSTER_FILE}")
        raise FileNotFoundError(f"Missing {ROSTER_FILE}")
    if not SCHEDULE_FILE.exists():
        logger.error(f"Schedule file not found: {SCHEDULE_FILE}")
        raise FileNotFoundError(f"Missing {SCHEDULE_FILE}")

    roster = pd.read_csv(ROSTER_FILE, delimiter=",")
    schedule = pd.read_csv(SCHEDULE_FILE, delimiter=",")
    logger.info(f"Loaded {len(roster)} players and {len(schedule)} games")

    # Create target variable: 0 = home wins, 1 = away wins.
    # Vectorized comparison instead of a row-wise apply(); a tie
    # (PTS == PTS.1) is labeled 1, matching the original
    # `0 if PTS > PTS.1 else 1` rule.
    schedule["winner"] = (schedule["PTS"] <= schedule["PTS.1"]).astype(int)

    # Create feature arrays.
    logger.info("Creating feature arrays")
    X = np.array(create_stats(roster, schedule))
    y = np.array(schedule["winner"])
    logger.info(f"Feature shape: {X.shape}, Target shape: {y.shape}")

    # Fixed random_state keeps the train/test split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    logger.info(f"Train size: {len(X_train)}, Test size: {len(X_test)}")

    # Hyperparameter search + training.
    best_model, best_params, test_accuracy = train_model(
        X_train, y_train, X_test, y_test
    )

    # Persist the best underlying Keras model.
    logger.info(f"Saving model to {OUTPUT_MODEL}")
    best_model.save(OUTPUT_MODEL)
    logger.info(f"Best parameters: {best_params}")
    logger.info(f"Test accuracy: {test_accuracy:.4f}")


if __name__ == "__main__":
    main()