| """ | |
| NBA ML Prediction System - Game Predictor | |
| ========================================== | |
| XGBoost + LightGBM ensemble for game win prediction. | |
| """ | |
| import numpy as np | |
| import pandas as pd | |
| from pathlib import Path | |
| from typing import Dict, List, Tuple, Optional | |
| from sklearn.metrics import accuracy_score, brier_score_loss, log_loss | |
| from sklearn.calibration import calibration_curve | |
| import xgboost as xgb | |
| import lightgbm as lgb | |
| import joblib | |
| import logging | |
| from src.config import MODEL_CONFIG, MODELS_DIR | |
| # Import preprocessing so pickle can find DataPreprocessor class | |
| from src.preprocessing import DataPreprocessor, GameDatasetBuilder | |
| logger = logging.getLogger(__name__) | |
| # ============================================================================= | |
| # GAME PREDICTOR MODEL | |
| # ============================================================================= | |

class GamePredictor:
    """
    Ensemble model for predicting game outcomes.

    Uses XGBoost + LightGBM with weighted averaging.
    """

    def __init__(self,
                 xgb_weight: float = 0.5,
                 lgb_weight: float = 0.5):
        self.xgb_weight = xgb_weight
        self.lgb_weight = lgb_weight
        self.xgb_model = None
        self.lgb_model = None
        self.feature_columns = None
        self.trained = False

    def train(self, X_train: np.ndarray, y_train: np.ndarray,
              X_val: Optional[np.ndarray] = None,
              y_val: Optional[np.ndarray] = None,
              feature_columns: Optional[List[str]] = None):
        """
        Train both the XGBoost and LightGBM models.

        If a validation set is provided, it is passed as `eval_set` so the
        boosters can track validation metrics during fitting.
        """
        self.feature_columns = feature_columns

        logger.info("Training XGBoost model...")
        self.xgb_model = xgb.XGBClassifier(**MODEL_CONFIG.xgb_params)
        if X_val is not None:
            self.xgb_model.fit(
                X_train, y_train,
                eval_set=[(X_val, y_val)],
                verbose=False
            )
        else:
            self.xgb_model.fit(X_train, y_train)

        logger.info("Training LightGBM model...")
        self.lgb_model = lgb.LGBMClassifier(**MODEL_CONFIG.lgb_params)
        if X_val is not None:
            self.lgb_model.fit(
                X_train, y_train,
                eval_set=[(X_val, y_val)]
            )
        else:
            self.lgb_model.fit(X_train, y_train)

        self.trained = True
        logger.info("Training complete!")

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """
        Predict win probabilities using the ensemble.

        Returns:
            Array of shape (n_samples, 2) with [loss_prob, win_prob]
        """
        if not self.trained:
            raise ValueError("Model not trained. Call train() first.")

        xgb_proba = self.xgb_model.predict_proba(X)
        lgb_proba = self.lgb_model.predict_proba(X)

        # Weighted average of the two models' probability estimates
        ensemble_proba = (
            self.xgb_weight * xgb_proba +
            self.lgb_weight * lgb_proba
        )
        return ensemble_proba
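
    # Worked example of the blend (illustrative numbers, not real output):
    # with the default weights of 0.5 each, per-model win probabilities of
    # 0.62 (XGB) and 0.70 (LGBM) combine to 0.5 * 0.62 + 0.5 * 0.70 = 0.66.
    # The weights are assumed to sum to 1.0 so the result stays a valid
    # probability.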

    def predict(self, X: np.ndarray, threshold: float = 0.5) -> np.ndarray:
        """Predict win/loss (1/0)."""
        proba = self.predict_proba(X)
        return (proba[:, 1] >= threshold).astype(int)

    def predict_with_confidence(self, X: np.ndarray) -> List[Dict]:
        """
        Predict with detailed confidence information.

        Reports each model's individual probability and uses the disagreement
        between the two models as a confidence signal.
        """
        # predict_proba raises a clear error if the model is not trained yet,
        # so compute the ensemble probabilities first
        ensemble_proba = self.predict_proba(X)[:, 1]
        xgb_proba = self.xgb_model.predict_proba(X)[:, 1]
        lgb_proba = self.lgb_model.predict_proba(X)[:, 1]

        results = []
        for i in range(len(X)):
            # Disagreement between the two models' win probabilities
            disagreement = abs(xgb_proba[i] - lgb_proba[i])
            results.append({
                "win_probability": ensemble_proba[i],
                "xgb_probability": xgb_proba[i],
                "lgb_probability": lgb_proba[i],
                "model_disagreement": disagreement,
                "confidence": "high" if disagreement < 0.1 else ("medium" if disagreement < 0.2 else "low"),
                "prediction": "WIN" if ensemble_proba[i] >= 0.5 else "LOSS"
            })
        return results
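
    # Shape of one result dict (values illustrative only; 0.08 disagreement
    # falls under the 0.1 cutoff, hence "high" confidence):
    #
    #     {"win_probability": 0.66, "xgb_probability": 0.62,
    #      "lgb_probability": 0.70, "model_disagreement": 0.08,
    #      "confidence": "high", "prediction": "WIN"}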

    def evaluate(self, X: np.ndarray, y: np.ndarray) -> Dict[str, float]:
        """
        Comprehensive model evaluation.

        Returns:
            Dict with accuracy, Brier score, log loss, and per-model accuracy.
        """
        y_pred = self.predict(X)
        y_proba = self.predict_proba(X)[:, 1]

        metrics = {
            "accuracy": accuracy_score(y, y_pred),
            "brier_score": brier_score_loss(y, y_proba),
            "log_loss": log_loss(y, y_proba)
        }

        # Individual model metrics, to see whether the ensemble actually helps
        xgb_pred = self.xgb_model.predict(X)
        lgb_pred = self.lgb_model.predict(X)
        metrics["xgb_accuracy"] = accuracy_score(y, xgb_pred)
        metrics["lgb_accuracy"] = accuracy_score(y, lgb_pred)

        return metrics
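
    # The Brier score is the mean squared error of the predicted win
    # probability: a 0.70 forecast on a game that was actually won contributes
    # (0.70 - 1.0)**2 = 0.09. Lower is better; a constant 0.5 forecast scores
    # 0.25, so anything at or above that is no better than guessing.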

    def get_feature_importance(self) -> pd.DataFrame:
        """Get feature importance from both models."""
        if not self.trained or self.feature_columns is None:
            return pd.DataFrame()

        # NOTE: the two libraries report importances on different scales
        # (XGBoost normalizes its importances to sum to 1, while LightGBM
        # reports raw split counts by default), so the raw average below is
        # only a rough ranking signal, not a calibrated score.
        xgb_importance = self.xgb_model.feature_importances_
        lgb_importance = self.lgb_model.feature_importances_

        df = pd.DataFrame({
            "feature": self.feature_columns,
            "xgb_importance": xgb_importance,
            "lgb_importance": lgb_importance,
            "avg_importance": (xgb_importance + lgb_importance) / 2
        })
        return df.sort_values("avg_importance", ascending=False)
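
    # Resulting frame layout (feature name and numbers are hypothetical,
    # purely to show the column shape and the scale mismatch noted above):
    #
    #     feature        xgb_importance  lgb_importance  avg_importance
    #     net_rating_10        0.081           412.0          206.04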

    def explain_prediction(self, X: np.ndarray, top_n: int = 5) -> List[Dict]:
        """
        Explain predictions using global feature importance.

        Returns the top N most important features, with their values, for each
        prediction. (This is a global-importance explanation, not a per-sample
        attribution such as SHAP.)
        """
        if not self.trained or self.feature_columns is None:
            return []

        importance = self.get_feature_importance()
        top_features = importance.head(top_n)["feature"].tolist()
        predictions = self.predict(X)  # predict once for all rows

        explanations = []
        for i in range(len(X)):
            feature_contributions = []
            for j, feat in enumerate(self.feature_columns):
                if feat in top_features:
                    feature_contributions.append({
                        "feature": feat,
                        "value": X[i, j],
                        "importance": importance[importance["feature"] == feat]["avg_importance"].values[0]
                    })
            # Sort by importance, most important first
            feature_contributions.sort(key=lambda x: x["importance"], reverse=True)
            explanations.append({
                "top_features": feature_contributions[:top_n],
                "prediction": predictions[i]
            })
        return explanations

    def save(self, path: Optional[Path] = None):
        """Save the model to disk."""
        if path is None:
            path = MODELS_DIR / "game_predictor.joblib"

        joblib.dump({
            "xgb_model": self.xgb_model,
            "lgb_model": self.lgb_model,
            "xgb_weight": self.xgb_weight,
            "lgb_weight": self.lgb_weight,
            "feature_columns": self.feature_columns,
            "trained": self.trained
        }, path)
        logger.info(f"Saved model to {path}")

    def load(self, path: Optional[Path] = None):
        """Load the model from disk."""
        if path is None:
            path = MODELS_DIR / "game_predictor.joblib"

        data = joblib.load(path)
        self.xgb_model = data["xgb_model"]
        self.lgb_model = data["lgb_model"]
        self.xgb_weight = data["xgb_weight"]
        self.lgb_weight = data["lgb_weight"]
        self.feature_columns = data["feature_columns"]
        self.trained = data["trained"]
        logger.info(f"Loaded model from {path}")

# =============================================================================
# TRAINING PIPELINE
# =============================================================================

def train_game_predictor(dataset: Dict) -> GamePredictor:
    """
    Full training pipeline for the game predictor: train, evaluate on all
    three splits, report feature importance, and save the model.
    """
    logger.info("Starting game predictor training...")

    model = GamePredictor()
    model.train(
        X_train=dataset["X_train"],
        y_train=dataset["y_train"],
        X_val=dataset["X_val"],
        y_val=dataset["y_val"],
        feature_columns=dataset["feature_columns"]
    )

    # Evaluate on all splits
    logger.info("\n=== Training Metrics ===")
    train_metrics = model.evaluate(dataset["X_train"], dataset["y_train"])
    logger.info(f"Train Accuracy: {train_metrics['accuracy']:.4f}")

    logger.info("\n=== Validation Metrics ===")
    val_metrics = model.evaluate(dataset["X_val"], dataset["y_val"])
    logger.info(f"Val Accuracy: {val_metrics['accuracy']:.4f}")
    logger.info(f"Val Brier Score: {val_metrics['brier_score']:.4f}")

    logger.info("\n=== Test Metrics ===")
    test_metrics = model.evaluate(dataset["X_test"], dataset["y_test"])
    logger.info(f"Test Accuracy: {test_metrics['accuracy']:.4f}")
    logger.info(f"Test Brier Score: {test_metrics['brier_score']:.4f}")

    # Check whether we meet the accuracy target
    if test_metrics["accuracy"] >= 0.65:
        logger.info("✓ Target accuracy (>=65%) achieved!")
    else:
        logger.warning(f"✗ Below target accuracy. Got {test_metrics['accuracy']:.2%}")

    # Feature importance
    logger.info("\n=== Top Features ===")
    importance = model.get_feature_importance()
    logger.info("\n%s", importance.head(10).to_string())

    # Save model
    model.save()
    return model

# =============================================================================
# CLI INTERFACE
# =============================================================================

if __name__ == "__main__":
    import argparse
    import sys

    parser = argparse.ArgumentParser(description="Game Predictor Training")
    parser.add_argument("--train", action="store_true", help="Train model")
    parser.add_argument("--evaluate", action="store_true", help="Evaluate existing model")
    args = parser.parse_args()

    if args.train:
        logging.basicConfig(level=logging.INFO)

        print("Loading dataset...")
        builder = GameDatasetBuilder()
        try:
            dataset = builder.load_dataset()
            print(f"Loaded dataset with {len(dataset['feature_columns'])} features")
        except FileNotFoundError:
            print("No dataset found. Please run 'python -m src.preprocessing --build' first.")
            sys.exit(1)
        except Exception as e:
            print(f"Error loading dataset: {e}")
            import traceback
            traceback.print_exc()
            sys.exit(1)

        model = train_game_predictor(dataset)
        print("\nTraining complete!")

    elif args.evaluate:
        model = GamePredictor()
        model.load()

        builder = GameDatasetBuilder()
        dataset = builder.load_dataset()
        metrics = model.evaluate(dataset["X_test"], dataset["y_test"])

        print("\n=== Test Metrics ===")
        for k, v in metrics.items():
            print(f"{k}: {v:.4f}")
    else:
        print("Use --train to train or --evaluate to evaluate")