""" NBA ML Prediction System - MVP Predictor ========================================= Model to predict MVP based on player performance, team success, and historical MVP similarity. """ import numpy as np import pandas as pd from pathlib import Path from typing import Dict, List, Optional from sklearn.metrics.pairwise import cosine_similarity from sklearn.preprocessing import StandardScaler import xgboost as xgb import joblib import logging from src.config import MODELS_DIR, RAW_DATA_DIR logger = logging.getLogger(__name__) # ============================================================================= # HISTORICAL MVP PROFILES # ============================================================================= # Historical MVP seasons (approximate stats for similarity comparison) HISTORICAL_MVP_PROFILES = { "2023-24": {"player": "Nikola Jokic", "ppg": 26.4, "rpg": 12.4, "apg": 9.0, "ws": 17.8, "team_wins": 57}, "2022-23": {"player": "Joel Embiid", "ppg": 33.1, "rpg": 10.2, "apg": 4.2, "ws": 14.3, "team_wins": 54}, "2021-22": {"player": "Nikola Jokic", "ppg": 27.1, "rpg": 13.8, "apg": 7.9, "ws": 15.2, "team_wins": 48}, "2020-21": {"player": "Nikola Jokic", "ppg": 26.4, "rpg": 10.8, "apg": 8.3, "ws": 15.6, "team_wins": 47}, "2019-20": {"player": "Giannis Antetokounmpo", "ppg": 29.5, "rpg": 13.6, "apg": 5.6, "ws": 14.4, "team_wins": 56}, "2018-19": {"player": "Giannis Antetokounmpo", "ppg": 27.7, "rpg": 12.5, "apg": 5.9, "ws": 14.4, "team_wins": 60}, "2017-18": {"player": "James Harden", "ppg": 30.4, "rpg": 5.4, "apg": 8.8, "ws": 15.4, "team_wins": 65}, "2016-17": {"player": "Russell Westbrook", "ppg": 31.6, "rpg": 10.7, "apg": 10.4, "ws": 13.1, "team_wins": 47}, "2015-16": {"player": "Stephen Curry", "ppg": 30.1, "rpg": 5.4, "apg": 6.7, "ws": 17.9, "team_wins": 73}, } # ============================================================================= # MVP PREDICTOR # ============================================================================= class MVPPredictor: """ Predicts MVP vote share using gradient boosting with narrative features. """ def __init__(self): self.model = None self.scaler = StandardScaler() self.feature_columns = None self.trained = False def calculate_mvp_similarity(self, player_stats: Dict) -> float: """ Calculate cosine similarity to historical MVP profiles. Captures voter psychology by finding players who "look like" past MVPs. """ # Create feature vector for player player_vector = np.array([ player_stats.get("ppg", 0), player_stats.get("rpg", 0), player_stats.get("apg", 0), player_stats.get("ws", 0), player_stats.get("team_wins", 0) / 82 # Normalize to 0-1 ]).reshape(1, -1) # Create matrix of historical MVP profiles mvp_vectors = [] for season, profile in HISTORICAL_MVP_PROFILES.items(): mvp_vectors.append([ profile["ppg"], profile["rpg"], profile["apg"], profile["ws"], profile["team_wins"] / 82 ]) mvp_matrix = np.array(mvp_vectors) # Normalize if len(mvp_matrix) > 0: mvp_matrix_normalized = self.scaler.fit_transform(mvp_matrix) player_normalized = self.scaler.transform(player_vector) # Calculate similarity to each MVP season similarities = cosine_similarity(player_normalized, mvp_matrix_normalized)[0] # Return max similarity (closest to any MVP) return float(np.max(similarities)) return 0.0 def calculate_narrative_features(self, player_stats: Dict, prev_season_stats: Optional[Dict] = None) -> Dict: """ Calculate narrative momentum features that voters care about. """ features = {} # Stat improvement year-over-year if prev_season_stats: features["ppg_improvement"] = player_stats.get("ppg", 0) - prev_season_stats.get("ppg", 0) features["rpg_improvement"] = player_stats.get("rpg", 0) - prev_season_stats.get("rpg", 0) features["apg_improvement"] = player_stats.get("apg", 0) - prev_season_stats.get("apg", 0) else: features["ppg_improvement"] = 0 features["rpg_improvement"] = 0 features["apg_improvement"] = 0 # Team success features["team_wins"] = player_stats.get("team_wins", 0) features["team_win_pct"] = player_stats.get("team_wins", 41) / 82 # Games played (durability matters) features["games_played"] = player_stats.get("gp", 0) features["games_played_pct"] = player_stats.get("gp", 0) / 82 return features def prepare_features(self, player_df: pd.DataFrame) -> pd.DataFrame: """Prepare all features for MVP prediction.""" features = player_df.copy() # Calculate MVP similarity for each player features["mvp_similarity"] = features.apply( lambda row: self.calculate_mvp_similarity({ "ppg": row.get("PTS", 0), "rpg": row.get("REB", 0), "apg": row.get("AST", 0), "ws": row.get("WS", 10), # Default if not available "team_wins": row.get("TEAM_WINS", 41) }), axis=1 ) return features def train(self, X: np.ndarray, y: np.ndarray, feature_columns: List[str]): """Train the MVP prediction model.""" self.feature_columns = feature_columns self.model = xgb.XGBRegressor( n_estimators=200, max_depth=5, learning_rate=0.1, random_state=42 ) self.model.fit(X, y) self.trained = True logger.info("MVP model trained") def predict_vote_share(self, X: np.ndarray) -> np.ndarray: """Predict MVP vote share (0-1 scale).""" if not self.trained: raise ValueError("Model not trained") return self.model.predict(X) def rank_candidates(self, player_df: pd.DataFrame, top_n: int = 10) -> pd.DataFrame: """ Rank MVP candidates and return top N. Uses real stats-based scoring formula. """ df = player_df.copy() # MVP score based on stats available from NBA API # Weighted formula considering: # - Scoring (30%): Points per game # - Playmaking (20%): Assists per game # - Rebounding (15%): Rebounds per game # - Defense (10%): Steals + Blocks # - Efficiency (10%): Plus/Minus and FG% # - Team Success (15%): Team win percentage pts = df.get("PTS", pd.Series([0]*len(df))).fillna(0) ast = df.get("AST", pd.Series([0]*len(df))).fillna(0) reb = df.get("REB", pd.Series([0]*len(df))).fillna(0) stl = df.get("STL", pd.Series([0]*len(df))).fillna(0) blk = df.get("BLK", pd.Series([0]*len(df))).fillna(0) plus_minus = df.get("PLUS_MINUS", pd.Series([0]*len(df))).fillna(0) fg_pct = df.get("FG_PCT", pd.Series([0.45]*len(df))).fillna(0.45) team_win_pct = df.get("TEAM_WIN_PCT", pd.Series([0.5]*len(df))).fillna(0.5) df["mvp_score"] = ( pts * 1.0 + # Points (raw weight) ast * 2.0 + # Assists (weighted more for playmaking) reb * 1.0 + # Rebounds (stl + blk) * 1.5 + # Defense plus_minus * 0.3 + # Impact metric fg_pct * 20 + # Efficiency bonus team_win_pct * 30 # Team success (big factor for MVP) ) # Add MVP similarity if we can calculate it if "mvp_similarity" not in df.columns: df = self.prepare_features(df) if "mvp_similarity" in df.columns: df["mvp_score"] = df["mvp_score"] + df["mvp_similarity"].fillna(0) * 10 # Sort and return top candidates df = df.sort_values("mvp_score", ascending=False) # Ensure columns exist for return if "mvp_similarity" not in df.columns: df["mvp_similarity"] = 0.0 return df.head(top_n)[["PLAYER_NAME", "PTS", "REB", "AST", "mvp_score", "mvp_similarity"]] def save(self, path: Path = None): """Save model to disk.""" if path is None: path = MODELS_DIR / "mvp_predictor.joblib" joblib.dump({ "model": self.model, "scaler": self.scaler, "feature_columns": self.feature_columns, "trained": self.trained }, path) logger.info(f"Saved MVP model to {path}") def load(self, path: Path = None): """Load model from disk.""" if path is None: path = MODELS_DIR / "mvp_predictor.joblib" data = joblib.load(path) self.model = data["model"] self.scaler = data["scaler"] self.feature_columns = data["feature_columns"] self.trained = data["trained"] # ============================================================================= # CLI INTERFACE # ============================================================================= if __name__ == "__main__": print("Testing MVP Similarity Calculator...") predictor = MVPPredictor() # Test with a hypothetical MVP-caliber season test_stats = { "ppg": 28.5, "rpg": 12.0, "apg": 8.5, "ws": 15.0, "team_wins": 55 } similarity = predictor.calculate_mvp_similarity(test_stats) print(f"MVP Similarity Score: {similarity:.3f}") # Test narrative features prev_stats = {"ppg": 25.0, "rpg": 10.0, "apg": 7.0} narrative = predictor.calculate_narrative_features(test_stats, prev_stats) print(f"Narrative Features: {narrative}")