# NBA_PREDICTOR/src/models/game_predictor.py
# Uploaded by: jashdoshi77
# Commit message: Initial commit: NBA Sage Predictor for Hugging Face Spaces (with LFS for large files)
# Commit: c095e08
"""
NBA ML Prediction System - Game Predictor
==========================================
XGBoost + LightGBM ensemble for game win prediction.
"""
import numpy as np
import pandas as pd
from pathlib import Path
from typing import Dict, List, Tuple, Optional
from sklearn.metrics import accuracy_score, brier_score_loss, log_loss
from sklearn.calibration import calibration_curve
import xgboost as xgb
import lightgbm as lgb
import joblib
import logging
from src.config import MODEL_CONFIG, MODELS_DIR
# Import preprocessing so pickle can find DataPreprocessor class
from src.preprocessing import DataPreprocessor, GameDatasetBuilder
logger = logging.getLogger(__name__)
# =============================================================================
# GAME PREDICTOR MODEL
# =============================================================================
class GamePredictor:
    """
    Ensemble model for predicting game outcomes.

    Wraps one XGBoost classifier and one LightGBM classifier and combines
    their class probabilities with a weighted average. The weights are
    normalised at prediction time, so only their ratio matters.
    """

    def __init__(self,
                 xgb_weight: float = 0.5,
                 lgb_weight: float = 0.5):
        # Relative contribution of each base model to the ensemble.
        self.xgb_weight = xgb_weight
        self.lgb_weight = lgb_weight
        # Fitted estimators; populated by train() or load().
        self.xgb_model = None
        self.lgb_model = None
        # Column names matching the feature axis of X (optional).
        self.feature_columns = None
        # True once both base models are available for prediction.
        self.trained = False

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _require_trained(self) -> None:
        """Raise ValueError if the base models are not available yet."""
        if not self.trained:
            raise ValueError("Model not trained. Call train() first.")

    def _blend(self, xgb_proba: np.ndarray, lgb_proba: np.ndarray) -> np.ndarray:
        """Return the weighted average of two probability arrays.

        Dividing by the weight total keeps each output row a valid
        probability distribution even when the weights do not sum to 1.
        """
        total = self.xgb_weight + self.lgb_weight
        if total <= 0:
            raise ValueError("Ensemble weights must sum to a positive value.")
        return (self.xgb_weight * xgb_proba + self.lgb_weight * lgb_proba) / total

    @staticmethod
    def _normalize(values) -> np.ndarray:
        """Scale a non-negative vector so it sums to 1 (all-zero stays zero)."""
        arr = np.asarray(values, dtype=float)
        total = arr.sum()
        return arr / total if total > 0 else arr

    @staticmethod
    def _confidence_label(disagreement: float) -> str:
        """Map the absolute gap between the two models to a label."""
        if disagreement < 0.1:
            return "high"
        if disagreement < 0.2:
            return "medium"
        return "low"

    # ------------------------------------------------------------------
    # Training
    # ------------------------------------------------------------------
    def train(self, X_train: np.ndarray, y_train: np.ndarray,
              X_val: Optional[np.ndarray] = None,
              y_val: Optional[np.ndarray] = None,
              feature_columns: Optional[List[str]] = None):
        """
        Train both XGBoost and LightGBM models.

        Hyperparameters come from MODEL_CONFIG.xgb_params and
        MODEL_CONFIG.lgb_params. The validation split is passed to each
        model as ``eval_set`` only when BOTH X_val and y_val are supplied.
        """
        self.feature_columns = feature_columns
        # A failed (re)train must not leave the instance flagged as usable.
        self.trained = False
        has_validation = X_val is not None and y_val is not None

        logger.info("Training XGBoost model...")
        self.xgb_model = xgb.XGBClassifier(**MODEL_CONFIG.xgb_params)
        if has_validation:
            self.xgb_model.fit(
                X_train, y_train,
                eval_set=[(X_val, y_val)],
                verbose=False
            )
        else:
            self.xgb_model.fit(X_train, y_train)

        logger.info("Training LightGBM model...")
        self.lgb_model = lgb.LGBMClassifier(**MODEL_CONFIG.lgb_params)
        if has_validation:
            self.lgb_model.fit(
                X_train, y_train,
                eval_set=[(X_val, y_val)]
            )
        else:
            self.lgb_model.fit(X_train, y_train)

        self.trained = True
        logger.info("Training complete!")

    # ------------------------------------------------------------------
    # Prediction
    # ------------------------------------------------------------------
    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """
        Predict win probabilities using ensemble.

        Returns:
            Array of shape (n_samples, 2) with [loss_prob, win_prob]

        Raises:
            ValueError: if the model is not trained or the ensemble
                weights do not sum to a positive value.
        """
        self._require_trained()
        return self._blend(
            self.xgb_model.predict_proba(X),
            self.lgb_model.predict_proba(X),
        )

    def predict(self, X: np.ndarray, threshold: float = 0.5) -> np.ndarray:
        """Predict win/loss (1/0): 1 where win probability >= threshold."""
        proba = self.predict_proba(X)
        return (proba[:, 1] >= threshold).astype(int)

    def predict_with_confidence(self, X: np.ndarray) -> List[Dict]:
        """
        Predict with detailed confidence information.

        Shows individual model predictions and disagreement. Each result
        dict holds the ensemble win probability, each model's win
        probability, their absolute difference, a confidence label
        ("high" < 0.1 <= "medium" < 0.2 <= "low") and "WIN"/"LOSS".
        Numeric values are plain Python floats.

        Raises:
            ValueError: if the model is not trained.
        """
        self._require_trained()
        # Run each base model once and reuse the output for the ensemble.
        xgb_full = self.xgb_model.predict_proba(X)
        lgb_full = self.lgb_model.predict_proba(X)
        ensemble_win = self._blend(xgb_full, lgb_full)[:, 1]

        results = []
        for ens_p, xgb_p, lgb_p in zip(ensemble_win, xgb_full[:, 1], lgb_full[:, 1]):
            ens_p, xgb_p, lgb_p = float(ens_p), float(xgb_p), float(lgb_p)
            disagreement = abs(xgb_p - lgb_p)
            results.append({
                "win_probability": ens_p,
                "xgb_probability": xgb_p,
                "lgb_probability": lgb_p,
                "model_disagreement": disagreement,
                "confidence": self._confidence_label(disagreement),
                "prediction": "WIN" if ens_p >= 0.5 else "LOSS"
            })
        return results

    # ------------------------------------------------------------------
    # Evaluation / introspection
    # ------------------------------------------------------------------
    def evaluate(self, X: np.ndarray, y: np.ndarray) -> Dict[str, float]:
        """
        Comprehensive model evaluation.

        Returns:
            Dict with ensemble accuracy, brier score and log loss, plus
            the accuracy of each individual model.
        """
        # One ensemble pass: the hard labels are derived from the same
        # probabilities using the default 0.5 threshold of predict().
        y_proba = self.predict_proba(X)[:, 1]
        y_pred = (y_proba >= 0.5).astype(int)
        metrics = {
            "accuracy": accuracy_score(y, y_pred),
            "brier_score": brier_score_loss(y, y_proba),
            # Explicit labels so a split containing a single class
            # (e.g. every game won) does not make log_loss raise.
            "log_loss": log_loss(y, y_proba, labels=[0, 1])
        }
        # Individual model metrics
        metrics["xgb_accuracy"] = accuracy_score(y, self.xgb_model.predict(X))
        metrics["lgb_accuracy"] = accuracy_score(y, self.lgb_model.predict(X))
        return metrics

    def get_feature_importance(self) -> pd.DataFrame:
        """
        Get feature importance from both models.

        Returns:
            DataFrame sorted by ``avg_importance`` (descending) with the
            raw per-model importances and their average. Empty when the
            model is untrained or no feature names were provided.
        """
        if not self.trained or self.feature_columns is None:
            return pd.DataFrame()
        xgb_importance = self.xgb_model.feature_importances_
        lgb_importance = self.lgb_model.feature_importances_
        # NOTE(review): the two libraries are not guaranteed to report
        # importances on the same scale (LightGBM's default is reportedly
        # raw split counts - confirm against lgb_params). Average the
        # normalised shares so neither model dominates the ranking.
        avg_importance = (
            self._normalize(xgb_importance) + self._normalize(lgb_importance)
        ) / 2
        df = pd.DataFrame({
            "feature": self.feature_columns,
            "xgb_importance": xgb_importance,
            "lgb_importance": lgb_importance,
            "avg_importance": avg_importance
        })
        return df.sort_values("avg_importance", ascending=False)

    def explain_prediction(self, X: np.ndarray, top_n: int = 5) -> List[Dict]:
        """
        Explain predictions using feature importance.

        Returns top N contributing features for each prediction. The
        features are the globally most important ones (the same set for
        every row); only the reported ``value`` differs per row.
        """
        if not self.trained or self.feature_columns is None:
            return []
        n_rows = len(X)
        if n_rows == 0:
            return []

        top = self.get_feature_importance().head(top_n)
        score = {
            feat: float(imp)
            for feat, imp in zip(top["feature"], top["avg_importance"])
        }
        # Resolve (column index, name) once, most important first; the
        # stable sort keeps column order among equally important features.
        ranked = sorted(
            ((j, feat) for j, feat in enumerate(self.feature_columns) if feat in score),
            key=lambda pair: score[pair[1]],
            reverse=True,
        )[:top_n]

        values = np.asarray(X)  # positional indexing also for DataFrames
        predictions = self.predict(X)  # one batched call instead of per row
        return [
            {
                "top_features": [
                    {"feature": feat, "value": values[i, j], "importance": score[feat]}
                    for j, feat in ranked
                ],
                "prediction": int(predictions[i])
            }
            for i in range(n_rows)
        ]

    # ------------------------------------------------------------------
    # Persistence
    # ------------------------------------------------------------------
    def save(self, path: Path = None):
        """Save model to disk (default: MODELS_DIR / "game_predictor.joblib")."""
        if path is None:
            path = MODELS_DIR / "game_predictor.joblib"
        # Create the target directory so a fresh checkout can save.
        Path(path).parent.mkdir(parents=True, exist_ok=True)
        joblib.dump({
            "xgb_model": self.xgb_model,
            "lgb_model": self.lgb_model,
            "xgb_weight": self.xgb_weight,
            "lgb_weight": self.lgb_weight,
            "feature_columns": self.feature_columns,
            "trained": self.trained
        }, path)
        logger.info("Saved model to %s", path)

    def load(self, path: Path = None):
        """Load model from disk (default: MODELS_DIR / "game_predictor.joblib")."""
        if path is None:
            path = MODELS_DIR / "game_predictor.joblib"
        # SECURITY: joblib.load unpickles arbitrary objects; only load
        # model files that come from a trusted source.
        data = joblib.load(path)
        self.xgb_model = data["xgb_model"]
        self.lgb_model = data["lgb_model"]
        self.xgb_weight = data["xgb_weight"]
        self.lgb_weight = data["lgb_weight"]
        self.feature_columns = data["feature_columns"]
        self.trained = data["trained"]
        logger.info("Loaded model from %s", path)
# =============================================================================
# TRAINING PIPELINE
# =============================================================================
def train_game_predictor(dataset: Dict, target_accuracy: float = 0.65) -> GamePredictor:
    """
    Full training pipeline for game predictor.

    Trains a GamePredictor, logs metrics for every available split,
    compares test accuracy against ``target_accuracy``, logs the top
    features and saves the model to its default location.

    Args:
        dataset: Dict with "X_train", "y_train", "X_test", "y_test" and
            "feature_columns". "X_val" / "y_val" are optional; when
            either is missing, training runs without an eval set and
            validation metrics are skipped.
        target_accuracy: Test accuracy regarded as success (default 0.65).

    Returns:
        The trained (and saved) GamePredictor.
    """
    logger.info("Starting game predictor training...")
    X_val = dataset.get("X_val")
    y_val = dataset.get("y_val")
    has_validation = X_val is not None and y_val is not None

    model = GamePredictor()
    model.train(
        X_train=dataset["X_train"],
        y_train=dataset["y_train"],
        X_val=X_val,
        y_val=y_val,
        feature_columns=dataset["feature_columns"]
    )

    # Evaluate on all splits
    logger.info("\n=== Training Metrics ===")
    train_metrics = model.evaluate(dataset["X_train"], dataset["y_train"])
    logger.info(f"Train Accuracy: {train_metrics['accuracy']:.4f}")

    if has_validation:
        logger.info("\n=== Validation Metrics ===")
        val_metrics = model.evaluate(X_val, y_val)
        logger.info(f"Val Accuracy: {val_metrics['accuracy']:.4f}")
        logger.info(f"Val Brier Score: {val_metrics['brier_score']:.4f}")

    logger.info("\n=== Test Metrics ===")
    test_metrics = model.evaluate(dataset["X_test"], dataset["y_test"])
    logger.info(f"Test Accuracy: {test_metrics['accuracy']:.4f}")
    logger.info(f"Test Brier Score: {test_metrics['brier_score']:.4f}")

    # Check if we meet target
    if test_metrics["accuracy"] >= target_accuracy:
        logger.info(f"✓ Target accuracy (>{target_accuracy:.0%}) achieved!")
    else:
        logger.warning(f"✗ Below target accuracy. Got {test_metrics['accuracy']:.2%}")

    # Feature importance: routed through the logger like every other
    # message here, so library callers control where it goes.
    logger.info("\n=== Top Features ===")
    importance = model.get_feature_importance()
    logger.info("\n%s", importance.head(10))

    # Save model
    model.save()
    return model
# =============================================================================
# CLI INTERFACE
# =============================================================================
if __name__ == "__main__":
    # Command-line entry point:
    #   --train     load the prepared dataset, train, report and save
    #   --evaluate  load the saved model and print its test-split metrics
    import argparse
    import sys
    import traceback

    parser = argparse.ArgumentParser(description="Game Predictor Training")
    parser.add_argument("--train", action="store_true", help="Train model")
    parser.add_argument("--evaluate", action="store_true", help="Evaluate existing model")
    args = parser.parse_args()

    if args.train:
        logging.basicConfig(level=logging.INFO)
        print("Loading dataset...")
        # GameDatasetBuilder is already imported at module level.
        builder = GameDatasetBuilder()
        try:
            dataset = builder.load_dataset()
            print(f"Loaded dataset with {len(dataset['feature_columns'])} features")
        except FileNotFoundError:
            print("No dataset found. Please run 'python -m src.preprocessing --build' first.")
            # sys.exit rather than the interactive-only exit() helper.
            sys.exit(1)
        except Exception as e:
            # Top-level boundary: report, show the traceback, exit non-zero.
            print(f"Error loading dataset: {e}")
            traceback.print_exc()
            sys.exit(1)
        model = train_game_predictor(dataset)
        print("\nTraining complete!")
    elif args.evaluate:
        model = GamePredictor()
        try:
            model.load()
        except FileNotFoundError:
            print("No trained model found. Please run with --train first.")
            sys.exit(1)
        builder = GameDatasetBuilder()
        try:
            dataset = builder.load_dataset()
        except FileNotFoundError:
            print("No dataset found. Please run 'python -m src.preprocessing --build' first.")
            sys.exit(1)
        metrics = model.evaluate(dataset["X_test"], dataset["y_test"])
        print("\n=== Test Metrics ===")
        for k, v in metrics.items():
            print(f"{k}: {v:.4f}")
    else:
        print("Use --train to train or --evaluate to evaluate")