File size: 10,246 Bytes
c095e08
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
"""
NBA ML Prediction System - MVP Predictor
=========================================
Model to predict MVP based on player performance, team success, 
and historical MVP similarity.
"""

import numpy as np
import pandas as pd
from pathlib import Path
from typing import Dict, List, Optional
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import joblib
import logging

from src.config import MODELS_DIR, RAW_DATA_DIR

logger = logging.getLogger(__name__)

# =============================================================================
# HISTORICAL MVP PROFILES
# =============================================================================
# Historical MVP seasons (approximate stats for similarity comparison)
# Keys of every profile dict, in the column order of the rows below
# (after the leading season label).
_MVP_PROFILE_FIELDS = ("player", "ppg", "rpg", "apg", "ws", "team_wins")

# One row per MVP season, newest first:
# (season, player, ppg, rpg, apg, ws, team_wins)
_MVP_SEASON_ROWS = (
    ("2023-24", "Nikola Jokic", 26.4, 12.4, 9.0, 17.8, 57),
    ("2022-23", "Joel Embiid", 33.1, 10.2, 4.2, 14.3, 54),
    ("2021-22", "Nikola Jokic", 27.1, 13.8, 7.9, 15.2, 48),
    ("2020-21", "Nikola Jokic", 26.4, 10.8, 8.3, 15.6, 47),
    ("2019-20", "Giannis Antetokounmpo", 29.5, 13.6, 5.6, 14.4, 56),
    ("2018-19", "Giannis Antetokounmpo", 27.7, 12.5, 5.9, 14.4, 60),
    ("2017-18", "James Harden", 30.4, 5.4, 8.8, 15.4, 65),
    ("2016-17", "Russell Westbrook", 31.6, 10.7, 10.4, 13.1, 47),
    ("2015-16", "Stephen Curry", 30.1, 5.4, 6.7, 17.9, 73),
)

# Season label -> stat profile dict, in the same order as the rows above.
HISTORICAL_MVP_PROFILES = {
    season: dict(zip(_MVP_PROFILE_FIELDS, values))
    for season, *values in _MVP_SEASON_ROWS
}


# =============================================================================
# MVP PREDICTOR
# =============================================================================
class MVPPredictor:
    """
    Scores and ranks MVP candidates.

    Offers three independent pieces:
    - a cosine-similarity score against ``HISTORICAL_MVP_PROFILES``,
    - hand-built narrative features (year-over-year deltas, team wins,
      games played),
    - an XGBoost regressor for vote share, plus a formula-based ranking
      (``rank_candidates``) that does not need the trained model.
    """

    # Divisor used to turn win and games-played counts into fractions.
    _GAMES_PER_SEASON = 82

    # (DataFrame column, fallback value) for each similarity input, in the
    # order ppg, rpg, apg, ws, team_wins.
    _SIMILARITY_COLUMNS = (
        ("PTS", 0),
        ("REB", 0),
        ("AST", 0),
        ("WS", 10),
        ("TEAM_WINS", 41),
    )

    def __init__(self):
        self.model = None
        self.scaler = StandardScaler()
        self.feature_columns = None
        self.trained = False

    def _fit_mvp_reference(self) -> Optional[np.ndarray]:
        """
        Fit ``self.scaler`` on the historical MVP profiles.

        Returns:
            The standardized MVP matrix (one row per season in
            ``HISTORICAL_MVP_PROFILES``), or ``None`` when that table is empty.
        """
        mvp_rows = [
            [
                profile["ppg"],
                profile["rpg"],
                profile["apg"],
                profile["ws"],
                profile["team_wins"] / self._GAMES_PER_SEASON,
            ]
            for profile in HISTORICAL_MVP_PROFILES.values()
        ]
        if not mvp_rows:
            return None

        # NOTE(review): this refits the same scaler object that save()
        # persists, so self.scaler always reflects the historical profiles
        # and must not be reused for model features.
        return self.scaler.fit_transform(np.array(mvp_rows))

    def calculate_mvp_similarity(self, player_stats: Dict) -> float:
        """
        Calculate cosine similarity to historical MVP profiles.

        Args:
            player_stats: Mapping with optional keys "ppg", "rpg", "apg",
                "ws" and "team_wins"; a missing key counts as 0.

        Returns:
            The highest cosine similarity between the standardized player
            vector and any standardized historical MVP season, or 0.0 when
            there are no historical profiles.
        """
        player_vector = np.array([
            player_stats.get("ppg", 0),
            player_stats.get("rpg", 0),
            player_stats.get("apg", 0),
            player_stats.get("ws", 0),
            player_stats.get("team_wins", 0) / self._GAMES_PER_SEASON,
        ]).reshape(1, -1)

        mvp_normalized = self._fit_mvp_reference()
        if mvp_normalized is None:
            return 0.0

        player_normalized = self.scaler.transform(player_vector)
        similarities = cosine_similarity(player_normalized, mvp_normalized)[0]

        # Closest match to any single MVP season.
        return float(np.max(similarities))

    def calculate_narrative_features(self, player_stats: Dict,
                                      prev_season_stats: Optional[Dict] = None) -> Dict:
        """
        Calculate narrative momentum features.

        Args:
            player_stats: Current-season mapping; reads "ppg", "rpg", "apg",
                "team_wins" and "gp".
            prev_season_stats: Previous-season mapping with "ppg", "rpg",
                "apg". When ``None`` or empty, all improvements are 0.

        Returns:
            Dict with "ppg_improvement", "rpg_improvement",
            "apg_improvement", "team_wins", "team_win_pct", "games_played"
            and "games_played_pct".
        """
        features = {}

        # Stat improvement year-over-year
        for stat in ("ppg", "rpg", "apg"):
            if prev_season_stats:
                delta = player_stats.get(stat, 0) - prev_season_stats.get(stat, 0)
            else:
                delta = 0
            features[f"{stat}_improvement"] = delta

        # Team success
        # NOTE(review): a missing "team_wins" yields team_wins == 0 but
        # team_win_pct == 0.5 (41 / 82); kept as-is — confirm which default
        # is intended.
        features["team_wins"] = player_stats.get("team_wins", 0)
        features["team_win_pct"] = player_stats.get("team_wins", 41) / self._GAMES_PER_SEASON

        # Games played (durability matters)
        games_played = player_stats.get("gp", 0)
        features["games_played"] = games_played
        features["games_played_pct"] = games_played / self._GAMES_PER_SEASON

        return features

    def prepare_features(self, player_df: pd.DataFrame) -> pd.DataFrame:
        """
        Return a copy of ``player_df`` with an added "mvp_similarity" column.

        The similarity inputs are read from the PTS, REB, AST, WS and
        TEAM_WINS columns. A missing column or a NaN cell falls back to
        0, 0, 0, 10 and 41 respectively. The scaler is fitted once and all
        rows are scored in a single call; an empty frame simply gets an
        empty column.

        Args:
            player_df: One row per player.

        Returns:
            A new DataFrame; ``player_df`` itself is not modified.
        """
        features = player_df.copy()
        row_count = len(features)

        # 0.0 is also what calculate_mvp_similarity reports when there are
        # no historical profiles to compare against.
        similarity = np.zeros(row_count, dtype=float)

        if row_count > 0:
            mvp_normalized = self._fit_mvp_reference()
            if mvp_normalized is not None:
                stat_columns = []
                for column, default in self._SIMILARITY_COLUMNS:
                    if column in features.columns:
                        values = features[column].fillna(default).to_numpy(dtype=float)
                    else:
                        values = np.full(row_count, float(default))
                    stat_columns.append(values)

                player_matrix = np.column_stack(stat_columns)
                # Team wins is the last input; scale it like the profiles.
                player_matrix[:, -1] /= self._GAMES_PER_SEASON

                player_normalized = self.scaler.transform(player_matrix)
                similarity = cosine_similarity(
                    player_normalized, mvp_normalized
                ).max(axis=1)

        # Assign positionally (plain array) so the frame's index, whatever
        # it is, plays no part in the assignment.
        features["mvp_similarity"] = similarity

        return features

    def train(self, X: np.ndarray, y: np.ndarray, feature_columns: List[str]):
        """
        Train the MVP prediction model.

        Args:
            X: Feature matrix passed to ``XGBRegressor.fit``.
            y: Regression target passed to ``XGBRegressor.fit``.
            feature_columns: Names stored on the instance as
                ``feature_columns``; they are not checked against ``X``.
        """
        self.feature_columns = feature_columns

        self.model = xgb.XGBRegressor(
            n_estimators=200,
            max_depth=5,
            learning_rate=0.1,
            random_state=42
        )

        self.model.fit(X, y)
        self.trained = True
        logger.info("MVP model trained")

    def predict_vote_share(self, X: np.ndarray) -> np.ndarray:
        """
        Predict MVP vote share.

        Returns the regressor's raw predictions; they are not clipped here.

        Raises:
            ValueError: If the model has not been trained, or the trained
                flag is set but no model object is present.
        """
        if not self.trained or self.model is None:
            raise ValueError("Model not trained")
        return self.model.predict(X)

    def rank_candidates(self, player_df: pd.DataFrame, top_n: int = 10) -> pd.DataFrame:
        """
        Rank MVP candidates with a fixed scoring formula and return the top N.

        The trained model is not used. The score is:

            PTS * 1.0 + AST * 2.0 + REB * 1.0 + (STL + BLK) * 1.5
            + PLUS_MINUS * 0.3 + FG_PCT * 20 + TEAM_WIN_PCT * 30
            + mvp_similarity * 10

        A missing column or NaN cell counts as 0, except FG_PCT (0.45) and
        TEAM_WIN_PCT (0.5). "mvp_similarity" is computed via
        ``prepare_features`` when the column is absent.

        Args:
            player_df: One row per player; must contain "PLAYER_NAME".
            top_n: Number of rows to return.

        Returns:
            The ``top_n`` highest-scoring rows with columns PLAYER_NAME, PTS,
            REB, AST, mvp_score and mvp_similarity, sorted by score
            descending.

        Raises:
            KeyError: If "PLAYER_NAME" is missing from ``player_df``.
        """
        df = player_df.copy()

        # The result always reports these three stats. Create a missing one
        # with the same 0 the score uses, instead of failing with KeyError
        # at the final column selection.
        for column in ("PTS", "REB", "AST"):
            if column not in df.columns:
                df[column] = 0

        def stat(column: str, default: float) -> pd.Series:
            """Return the stat column with gaps filled, or a constant column."""
            if column in df.columns:
                return df[column].fillna(default)
            # Build the fallback on df's own index: a default RangeIndex
            # would misalign with a filtered frame and turn every score
            # into NaN.
            return pd.Series(default, index=df.index, dtype=float)

        df["mvp_score"] = (
            stat("PTS", 0) * 1.0                        # Scoring
            + stat("AST", 0) * 2.0                      # Playmaking
            + stat("REB", 0) * 1.0                      # Rebounding
            + (stat("STL", 0) + stat("BLK", 0)) * 1.5   # Defense
            + stat("PLUS_MINUS", 0) * 0.3               # Impact metric
            + stat("FG_PCT", 0.45) * 20                 # Efficiency bonus
            + stat("TEAM_WIN_PCT", 0.5) * 30            # Team success
        )

        # prepare_features always adds the column, so it is present below.
        if "mvp_similarity" not in df.columns:
            df = self.prepare_features(df)

        df["mvp_score"] = df["mvp_score"] + df["mvp_similarity"].fillna(0) * 10

        # Sort and return top candidates
        df = df.sort_values("mvp_score", ascending=False)

        return df.head(top_n)[["PLAYER_NAME", "PTS", "REB", "AST", "mvp_score", "mvp_similarity"]]

    def save(self, path: Optional[Path] = None):
        """
        Save model, scaler, feature columns and trained flag to disk.

        Args:
            path: Destination file; defaults to
                ``MODELS_DIR / "mvp_predictor.joblib"``.
        """
        if path is None:
            path = MODELS_DIR / "mvp_predictor.joblib"

        joblib.dump({
            "model": self.model,
            "scaler": self.scaler,
            "feature_columns": self.feature_columns,
            "trained": self.trained
        }, path)
        logger.info("Saved MVP model to %s", path)

    def load(self, path: Optional[Path] = None):
        """
        Load model, scaler, feature columns and trained flag from disk.

        Args:
            path: Source file; defaults to
                ``MODELS_DIR / "mvp_predictor.joblib"``.

        Raises:
            KeyError: If the stored dict lacks one of the expected keys.
        """
        if path is None:
            path = MODELS_DIR / "mvp_predictor.joblib"

        # SECURITY: joblib.load unpickles arbitrary objects; only load files
        # from a trusted source.
        data = joblib.load(path)
        self.model = data["model"]
        self.scaler = data["scaler"]
        self.feature_columns = data["feature_columns"]
        self.trained = data["trained"]


# =============================================================================
# CLI INTERFACE
# =============================================================================
def _run_similarity_demo():
    """Print a similarity score and narrative features for a made-up season."""
    print("Testing MVP Similarity Calculator...")

    demo_predictor = MVPPredictor()

    # Hypothetical MVP-caliber stat line
    current_season = {
        "ppg": 28.5,
        "rpg": 12.0,
        "apg": 8.5,
        "ws": 15.0,
        "team_wins": 55
    }
    score = demo_predictor.calculate_mvp_similarity(current_season)
    print(f"MVP Similarity Score: {score:.3f}")

    # Narrative features measured against the previous season's averages
    last_season = {"ppg": 25.0, "rpg": 10.0, "apg": 7.0}
    momentum = demo_predictor.calculate_narrative_features(current_season, last_season)
    print(f"Narrative Features: {momentum}")


if __name__ == "__main__":
    _run_similarity_demo()