Spaces:
Running
Running
| """ | |
| NBA ML Prediction System - Prediction Pipeline | |
| =============================================== | |
| End-to-end pipeline for generating predictions with live data integration. | |
| """ | |
| import pandas as pd | |
| import numpy as np | |
| from pathlib import Path | |
| from datetime import datetime, timedelta | |
| from typing import Dict, List, Optional | |
| import logging | |
| from nba_api.stats.endpoints import leaguegamefinder | |
| from nba_api.stats.static import teams | |
| from src.config import ( | |
| API_CACHE_DIR, | |
| MODELS_DIR, | |
| NBA_TEAMS, | |
| API_CONFIG | |
| ) | |
| from src.data_collector import CacheManager, retry_with_backoff | |
| from src.feature_engineering import FeatureGenerator | |
| from src.injury_collector import InjuryCollector | |
| from src.models.game_predictor import GamePredictor | |
| from src.models.mvp_predictor import MVPPredictor | |
| from src.models.championship_predictor import ChampionshipPredictor | |
| from src.preprocessing import DataPreprocessor | |
| from src.live_data_collector import LiveDataCollector | |
| from src.prediction_tracker import PredictionTracker | |
| logger = logging.getLogger(__name__) | |
| # ============================================================================= | |
| # PREDICTION PIPELINE | |
| # ============================================================================= | |
| class PredictionPipeline: | |
| """ | |
| End-to-end prediction pipeline for: | |
| - Today's games (with live scores) | |
| - Upcoming games with predictions | |
| - MVP race | |
| - Championship odds | |
| - Prediction tracking and accuracy | |
| """ | |
def __init__(self):
    """
    Create the pipeline's collaborators and seed the ELO ratings.

    Models and the preprocessor start as ``None`` placeholders. The
    standings cache is also created empty here so the lazy loader does not
    have to probe for the attribute's existence.
    """
    # Data access and feature helpers
    self.cache = CacheManager()
    self.feature_gen = FeatureGenerator()
    self.injury_collector = InjuryCollector()

    # Live data and tracking
    self.live_collector = LiveDataCollector()
    self.prediction_tracker = PredictionTracker()

    # Models (loaded on demand)
    self._game_model = None
    self._mvp_model = None
    self._champ_model = None
    self._preprocessor = None

    # Filled on first call to _get_current_standings_cache()
    self._standings_cache = None

    # Initialize ELO ratings from historical games.
    # Must run last: it updates self.feature_gen.elo.
    self._initialize_elo_from_history()
def _initialize_elo_from_history(self):
    """
    Replay cached historical games in date order to build ELO ratings.

    Reads ``all_games_summary.parquet`` from ``API_CACHE_DIR``. The file is
    treated as holding one row per team per game, so each GAME_ID is applied
    once, from whichever of its rows is the first usable one. Ratings are
    regressed to the mean every time SEASON_ID changes between games.

    Rows without a string MATCHUP or without a decided result
    (WL not "W"/"L", including NaN/None) are skipped without consuming the
    game, so the mirror row of the same game can still be used.

    Any failure is logged as a warning; nothing is raised.
    """
    try:
        games_path = API_CACHE_DIR / "all_games_summary.parquet"
        logger.info(f"Looking for ELO data at: {games_path}")
        logger.info(f"API_CACHE_DIR exists: {API_CACHE_DIR.exists()}")
        if API_CACHE_DIR.exists():
            logger.info(f"API_CACHE_DIR contents: {list(API_CACHE_DIR.glob('*.parquet'))[:5]}")
        if not games_path.exists():
            logger.warning(f"No historical game data found for ELO initialization at {games_path}")
            return

        # Sort by date to process games chronologically
        games_df = pd.read_parquet(games_path).sort_values("GAME_DATE")

        # Abbreviation -> team id, built once instead of scanning NBA_TEAMS
        # for every row. setdefault keeps the first id for an abbreviation,
        # matching the previous first-match lookup.
        abbrev_to_id = {}
        for tid, abbr in NBA_TEAMS.items():
            abbrev_to_id.setdefault(abbr, tid)

        # Track processed game IDs to avoid double-counting (home & away)
        processed_games = set()
        current_season = None

        for _, row in games_df.iterrows():
            game_id = row["GAME_ID"]
            if game_id in processed_games:
                continue

            # Regress ELO at season changes
            season = row.get("SEASON_ID", "")
            if season != current_season:
                if current_season is not None:
                    self.feature_gen.elo.regress_to_mean()
                current_season = season

            matchup = row.get("MATCHUP", "")
            wl = row.get("WL", "")
            # NaN is truthy, so test for the valid values explicitly rather
            # than relying on ``not wl``.
            if not isinstance(matchup, str) or not matchup or wl not in ("W", "L"):
                continue

            # Only a usable row consumes the game.
            processed_games.add(game_id)

            # Parse opponent from matchup (e.g., "LAL vs. BOS" or "LAL @ BOS")
            is_home = "vs." in matchup
            opponent_id = abbrev_to_id.get(matchup.split(" ")[-1])
            if not opponent_id:
                continue

            self.feature_gen.elo.update_ratings(
                row["TEAM_ID"], opponent_id, wl == "W", is_home
            )

        logger.info(f"Initialized ELO ratings from {len(processed_games)} games")

        # Log some example ratings for verification
        for abbrev in ["LAL", "BOS", "GSW", "MIL", "DEN"]:
            team_id = abbrev_to_id.get(abbrev)
            if team_id:
                rating = self.feature_gen.elo.get_rating(team_id)
                logger.info(f" {abbrev}: {rating:.0f}")
    except Exception as e:
        logger.warning(f"Could not initialize ELO from history: {e}")
def game_model(self) -> GamePredictor:
    """
    Return the game predictor, creating and loading it on first use.

    If ``load()`` fails, the newly constructed predictor is kept as-is and
    a warning is logged; later calls return that same instance.
    """
    if self._game_model is None:
        self._game_model = GamePredictor()
        try:
            self._game_model.load()
        except Exception:
            # Narrowed from a bare ``except:`` so KeyboardInterrupt and
            # SystemExit are no longer swallowed here.
            logger.warning("Game model not found, using untrained model")
            logger.debug("Game model load failure details", exc_info=True)
    return self._game_model
def mvp_model(self) -> MVPPredictor:
    """
    Return the MVP predictor, creating and loading it on first use.

    If ``load()`` fails, the newly constructed predictor is kept as-is and
    a warning is logged; later calls return that same instance.
    """
    if self._mvp_model is None:
        self._mvp_model = MVPPredictor()
        try:
            self._mvp_model.load()
        except Exception:
            # Narrowed from a bare ``except:`` so KeyboardInterrupt and
            # SystemExit are no longer swallowed here.
            logger.warning("MVP model not found, using untrained model")
            logger.debug("MVP model load failure details", exc_info=True)
    return self._mvp_model
def champ_model(self) -> ChampionshipPredictor:
    """
    Return the championship predictor, creating and loading it on first use.

    If ``load()`` fails, the newly constructed predictor is kept as-is and
    a warning is logged; later calls return that same instance.
    """
    if self._champ_model is None:
        self._champ_model = ChampionshipPredictor()
        try:
            self._champ_model.load()
        except Exception:
            # Narrowed from a bare ``except:`` so KeyboardInterrupt and
            # SystemExit are no longer swallowed here.
            logger.warning("Championship model not found, using untrained model")
            logger.debug("Championship model load failure details", exc_info=True)
    return self._champ_model
def get_todays_games(self) -> List[Dict]:
    """Return the live collector's scoreboard for today's games."""
    scoreboard = self.live_collector.get_live_scoreboard()
    return scoreboard
def get_live_games(self) -> List[Dict]:
    """Return the games the live collector reports as in progress."""
    in_progress = self.live_collector.get_live_games()
    return in_progress
def get_final_games(self) -> List[Dict]:
    """Return the games the live collector reports as finished today."""
    finished = self.live_collector.get_final_games()
    return finished
def get_upcoming_games(self, days_ahead: int = 7) -> List[Dict]:
    """
    Return today's not-yet-started games from the live data collector.

    Only today's games are returned; no future-day schedule is fetched by
    this method.

    Args:
        days_ahead: Currently unused. Kept so existing callers keep working.

    Returns:
        One dict per game with keys ``game_id``, ``date``, ``time``,
        ``day_name``, ``home_team``, ``away_team``, ``home_record`` and
        ``away_record``. ``date`` falls back to today's date and ``time``
        to ``"TBD"`` when the collector gives no value.
    """
    now = datetime.now()
    today_str = now.strftime("%Y-%m-%d")
    today_name = now.strftime("%A")

    # Note: NBA API doesn't reliably provide future game schedules
    # Today's games from live scoreboard are accurate
    # Future schedule requires web scraping or third-party API
    return [
        {
            "game_id": game["game_id"],
            "date": game.get("game_date") or today_str,
            "time": game.get("status_text") or "TBD",
            "day_name": today_name,
            "home_team": game["home_team"],
            "away_team": game["away_team"],
            "home_record": game.get("home_record", ""),
            "away_record": game.get("away_record", ""),
        }
        for game in self.live_collector.get_upcoming_games()
    ]
def get_team_roster(self, team_abbrev: str) -> List[Dict]:
    """
    Look up a team's projected starting five in a built-in table.

    The table is hard-coded below (the original comments label it as
    2025-26 starters) and every entry carries ``name``, ``position`` and
    ``pts``. No network or cache access happens here.

    Args:
        team_abbrev: Team abbreviation used as the table key (e.g. "LAL").

    Returns:
        Five player dicts for a known team, otherwise five placeholder
        entries named "Starter 1" .. "Starter 5" with ``pts`` of 0.
    """
    # 'pts' is the field name kept for the consumers of this data
    starters_by_team = {
        "ATL": [{"name": "Trae Young", "position": "G", "pts": 23.5}, {"name": "Jalen Johnson", "position": "F", "pts": 19.1}, {"name": "De'Andre Hunter", "position": "F", "pts": 15.2}, {"name": "Clint Capela", "position": "C", "pts": 8.5}, {"name": "Dyson Daniels", "position": "G", "pts": 11.2}],
        "BOS": [{"name": "Jayson Tatum", "position": "F", "pts": 27.5}, {"name": "Jaylen Brown", "position": "G", "pts": 24.1}, {"name": "Derrick White", "position": "G", "pts": 16.2}, {"name": "Kristaps Porzingis", "position": "C", "pts": 18.8}, {"name": "Jrue Holiday", "position": "G", "pts": 12.5}],
        "BKN": [{"name": "Cam Thomas", "position": "G", "pts": 24.8}, {"name": "Cameron Johnson", "position": "F", "pts": 14.5}, {"name": "Nic Claxton", "position": "C", "pts": 11.2}, {"name": "Dennis Schroder", "position": "G", "pts": 17.1}, {"name": "Dorian Finney-Smith", "position": "F", "pts": 9.5}],
        "CHA": [{"name": "LaMelo Ball", "position": "G", "pts": 22.5}, {"name": "Brandon Miller", "position": "F", "pts": 18.2}, {"name": "Miles Bridges", "position": "F", "pts": 16.8}, {"name": "Mark Williams", "position": "C", "pts": 11.5}, {"name": "Tre Mann", "position": "G", "pts": 10.2}],
        "CHI": [{"name": "Zach LaVine", "position": "G", "pts": 22.1}, {"name": "Coby White", "position": "G", "pts": 19.5}, {"name": "Patrick Williams", "position": "F", "pts": 12.8}, {"name": "Nikola Vucevic", "position": "C", "pts": 17.5}, {"name": "Josh Giddey", "position": "G", "pts": 13.2}],
        "CLE": [{"name": "Donovan Mitchell", "position": "G", "pts": 26.5}, {"name": "Darius Garland", "position": "G", "pts": 21.2}, {"name": "Evan Mobley", "position": "F", "pts": 18.1}, {"name": "Jarrett Allen", "position": "C", "pts": 16.5}, {"name": "Max Strus", "position": "G", "pts": 11.2}],
        "DAL": [{"name": "Luka Doncic", "position": "G", "pts": 33.5}, {"name": "Kyrie Irving", "position": "G", "pts": 25.2}, {"name": "Klay Thompson", "position": "G", "pts": 14.1}, {"name": "Daniel Gafford", "position": "C", "pts": 12.5}, {"name": "P.J. Washington", "position": "F", "pts": 13.8}],
        "DEN": [{"name": "Nikola Jokic", "position": "C", "pts": 29.5}, {"name": "Jamal Murray", "position": "G", "pts": 21.2}, {"name": "Michael Porter Jr.", "position": "F", "pts": 17.5}, {"name": "Aaron Gordon", "position": "F", "pts": 14.1}, {"name": "Russell Westbrook", "position": "G", "pts": 10.5}],
        "DET": [{"name": "Cade Cunningham", "position": "G", "pts": 24.2}, {"name": "Jaden Ivey", "position": "G", "pts": 17.5}, {"name": "Ausar Thompson", "position": "F", "pts": 11.2}, {"name": "Jalen Duren", "position": "C", "pts": 13.8}, {"name": "Tobias Harris", "position": "F", "pts": 12.5}],
        "GSW": [{"name": "Stephen Curry", "position": "G", "pts": 26.8}, {"name": "Andrew Wiggins", "position": "F", "pts": 16.5}, {"name": "Jonathan Kuminga", "position": "F", "pts": 14.2}, {"name": "Draymond Green", "position": "F", "pts": 9.1}, {"name": "Kevon Looney", "position": "C", "pts": 7.5}],
        "HOU": [{"name": "Jalen Green", "position": "G", "pts": 22.5}, {"name": "Alperen Sengun", "position": "C", "pts": 19.2}, {"name": "Fred VanVleet", "position": "G", "pts": 15.8}, {"name": "Jabari Smith Jr.", "position": "F", "pts": 14.5}, {"name": "Dillon Brooks", "position": "F", "pts": 12.2}],
        "IND": [{"name": "Tyrese Haliburton", "position": "G", "pts": 20.5}, {"name": "Pascal Siakam", "position": "F", "pts": 21.2}, {"name": "Myles Turner", "position": "C", "pts": 17.1}, {"name": "Andrew Nembhard", "position": "G", "pts": 11.5}, {"name": "Bennedict Mathurin", "position": "G", "pts": 15.2}],
        "LAC": [{"name": "James Harden", "position": "G", "pts": 21.5}, {"name": "Kawhi Leonard", "position": "F", "pts": 23.8}, {"name": "Norman Powell", "position": "G", "pts": 18.2}, {"name": "Ivica Zubac", "position": "C", "pts": 12.5}, {"name": "Terance Mann", "position": "G", "pts": 9.8}],
        "LAL": [{"name": "LeBron James", "position": "F", "pts": 25.5}, {"name": "Anthony Davis", "position": "C", "pts": 27.2}, {"name": "Austin Reaves", "position": "G", "pts": 18.1}, {"name": "D'Angelo Russell", "position": "G", "pts": 14.5}, {"name": "Rui Hachimura", "position": "F", "pts": 12.8}],
        "MEM": [{"name": "Ja Morant", "position": "G", "pts": 25.8}, {"name": "Desmond Bane", "position": "G", "pts": 21.2}, {"name": "Jaren Jackson Jr.", "position": "F", "pts": 22.5}, {"name": "Zach Edey", "position": "C", "pts": 10.5}, {"name": "Marcus Smart", "position": "G", "pts": 9.2}],
        "MIA": [{"name": "Jimmy Butler", "position": "F", "pts": 20.5}, {"name": "Tyler Herro", "position": "G", "pts": 21.2}, {"name": "Bam Adebayo", "position": "C", "pts": 19.8}, {"name": "Terry Rozier", "position": "G", "pts": 16.5}, {"name": "Jaime Jaquez Jr.", "position": "F", "pts": 12.2}],
        "MIL": [{"name": "Giannis Antetokounmpo", "position": "F", "pts": 30.5}, {"name": "Damian Lillard", "position": "G", "pts": 25.2}, {"name": "Khris Middleton", "position": "F", "pts": 14.1}, {"name": "Brook Lopez", "position": "C", "pts": 12.5}, {"name": "Gary Trent Jr.", "position": "G", "pts": 11.8}],
        "MIN": [{"name": "Anthony Edwards", "position": "G", "pts": 27.5}, {"name": "Julius Randle", "position": "F", "pts": 20.2}, {"name": "Rudy Gobert", "position": "C", "pts": 14.5}, {"name": "Mike Conley", "position": "G", "pts": 10.1}, {"name": "Jaden McDaniels", "position": "F", "pts": 12.2}],
        "NOP": [{"name": "Zion Williamson", "position": "F", "pts": 22.5}, {"name": "Brandon Ingram", "position": "F", "pts": 21.8}, {"name": "CJ McCollum", "position": "G", "pts": 18.5}, {"name": "Dejounte Murray", "position": "G", "pts": 14.2}, {"name": "Trey Murphy III", "position": "F", "pts": 15.1}],
        "NYK": [{"name": "Jalen Brunson", "position": "G", "pts": 28.5}, {"name": "Karl-Anthony Towns", "position": "C", "pts": 25.2}, {"name": "Mikal Bridges", "position": "F", "pts": 18.1}, {"name": "OG Anunoby", "position": "F", "pts": 15.5}, {"name": "Josh Hart", "position": "G", "pts": 12.2}],
        "OKC": [{"name": "Shai Gilgeous-Alexander", "position": "G", "pts": 32.5}, {"name": "Jalen Williams", "position": "F", "pts": 20.2}, {"name": "Chet Holmgren", "position": "C", "pts": 18.1}, {"name": "Lu Dort", "position": "G", "pts": 11.5}, {"name": "Isaiah Hartenstein", "position": "C", "pts": 9.8}],
        "ORL": [{"name": "Paolo Banchero", "position": "F", "pts": 24.5}, {"name": "Franz Wagner", "position": "F", "pts": 22.2}, {"name": "Jalen Suggs", "position": "G", "pts": 14.1}, {"name": "Wendell Carter Jr.", "position": "C", "pts": 12.5}, {"name": "Anthony Black", "position": "G", "pts": 8.2}],
        "PHI": [{"name": "Tyrese Maxey", "position": "G", "pts": 26.5}, {"name": "Paul George", "position": "F", "pts": 22.2}, {"name": "Joel Embiid", "position": "C", "pts": 28.5}, {"name": "Kelly Oubre Jr.", "position": "F", "pts": 12.1}, {"name": "Kyle Lowry", "position": "G", "pts": 8.5}],
        "PHX": [{"name": "Kevin Durant", "position": "F", "pts": 27.5}, {"name": "Devin Booker", "position": "G", "pts": 26.2}, {"name": "Bradley Beal", "position": "G", "pts": 18.5}, {"name": "Jusuf Nurkic", "position": "C", "pts": 11.2}, {"name": "Tyus Jones", "position": "G", "pts": 10.1}],
        "POR": [{"name": "Anfernee Simons", "position": "G", "pts": 22.5}, {"name": "Scoot Henderson", "position": "G", "pts": 16.2}, {"name": "Shaedon Sharpe", "position": "G", "pts": 14.8}, {"name": "Jerami Grant", "position": "F", "pts": 18.1}, {"name": "Deandre Ayton", "position": "C", "pts": 17.5}],
        "SAC": [{"name": "De'Aaron Fox", "position": "G", "pts": 27.5}, {"name": "Domantas Sabonis", "position": "C", "pts": 21.2}, {"name": "DeMar DeRozan", "position": "F", "pts": 18.5}, {"name": "Keegan Murray", "position": "F", "pts": 15.1}, {"name": "Malik Monk", "position": "G", "pts": 14.2}],
        "SAS": [{"name": "Victor Wembanyama", "position": "C", "pts": 24.5}, {"name": "Devin Vassell", "position": "G", "pts": 18.2}, {"name": "Chris Paul", "position": "G", "pts": 10.5}, {"name": "Harrison Barnes", "position": "F", "pts": 12.1}, {"name": "Jeremy Sochan", "position": "F", "pts": 14.8}],
        "TOR": [{"name": "Scottie Barnes", "position": "F", "pts": 22.5}, {"name": "RJ Barrett", "position": "G", "pts": 18.2}, {"name": "Immanuel Quickley", "position": "G", "pts": 16.5}, {"name": "Jakob Poeltl", "position": "C", "pts": 14.1}, {"name": "Gradey Dick", "position": "G", "pts": 12.8}],
        "UTA": [{"name": "Lauri Markkanen", "position": "F", "pts": 23.5}, {"name": "Collin Sexton", "position": "G", "pts": 17.2}, {"name": "Jordan Clarkson", "position": "G", "pts": 16.5}, {"name": "Walker Kessler", "position": "C", "pts": 10.1}, {"name": "John Collins", "position": "F", "pts": 14.2}],
        "WAS": [{"name": "Jordan Poole", "position": "G", "pts": 18.5}, {"name": "Kyle Kuzma", "position": "F", "pts": 17.2}, {"name": "Bilal Coulibaly", "position": "F", "pts": 11.5}, {"name": "Jonas Valanciunas", "position": "C", "pts": 12.8}, {"name": "Malcolm Brogdon", "position": "G", "pts": 14.1}],
    }

    if team_abbrev in starters_by_team:
        return starters_by_team[team_abbrev]

    # Unknown team: generic two guards, two forwards and a center
    placeholder_positions = ("G", "G", "F", "F", "C")
    return [
        {"name": f"Starter {slot}", "position": pos, "pts": 0}
        for slot, pos in enumerate(placeholder_positions, start=1)
    ]
def get_team_record(self, team_id: int, season: str = "2024-25") -> Dict:
    """
    Fetch a team's win/loss record for a season via LeagueGameFinder.

    Args:
        team_id: Team id passed to the endpoint as ``team_id_nullable``.
        season: Season string passed as ``season_nullable``.

    Returns:
        Dict with ``wins`` and ``losses`` as plain ints and ``win_pct`` as a
        plain float. When no games are found, or the lookup fails for any
        reason, a neutral record (0 wins, 0 losses, 0.5) is returned and the
        failure is logged.
    """
    neutral = {"wins": 0, "losses": 0, "win_pct": 0.5}
    try:
        games = leaguegamefinder.LeagueGameFinder(
            team_id_nullable=team_id,
            season_nullable=season
        ).get_data_frames()[0]
        if games.empty:
            return neutral

        # .sum() yields numpy integers; convert so the result is made of
        # built-in types only.
        wins = int((games["WL"] == "W").sum())
        losses = int((games["WL"] == "L").sum())
        decided = wins + losses
        return {
            "wins": wins,
            "losses": losses,
            "win_pct": wins / decided if decided > 0 else 0.5
        }
    except Exception as exc:
        # Narrowed from a bare ``except:``; still best-effort, but the
        # reason is no longer lost.
        logger.warning(f"Could not fetch record for team {team_id} ({season}): {exc}")
        return neutral
| def _get_current_standings_cache(self) -> Dict[str, Dict]: | |
| """Get cached current season standings with win percentages.""" | |
| if not hasattr(self, '_standings_cache') or self._standings_cache is None: | |
| self._standings_cache = {} | |
| try: | |
| # Try to load from cached standings file for current season | |
| standings_path = API_CACHE_DIR / "standings_2025-26.parquet" | |
| if standings_path.exists(): | |
| df = pd.read_parquet(standings_path) | |
| for _, row in df.iterrows(): | |
| team_name = row.get('TeamName', row.get('TEAM_NAME', '')) | |
| team_id = row.get('TeamID', row.get('TEAM_ID', 0)) | |
| # Get team abbreviation from ID | |
| abbrev = NBA_TEAMS.get(team_id, '') | |
| if not abbrev and team_name: | |
| # Try to match by city/name | |
| for tid, abb in NBA_TEAMS.items(): | |
| if abb in team_name or team_name.split()[-1][:3].upper() == abb: | |
| abbrev = abb | |
| break | |
| if abbrev: | |
| wins = row.get('WINS', row.get('W', 0)) | |
| losses = row.get('LOSSES', row.get('L', 0)) | |
| total = wins + losses | |
| win_pct = wins / total if total > 0 else 0.5 | |
| self._standings_cache[abbrev] = { | |
| 'wins': wins, | |
| 'losses': losses, | |
| 'win_pct': win_pct, | |
| 'games_played': total | |
| } | |
| logger.info(f"Loaded standings for {len(self._standings_cache)} teams") | |
| except Exception as e: | |
| logger.warning(f"Could not load standings cache: {e}") | |
| return self._standings_cache | |
def _get_recent_form(self, team_abbrev: str, n_games: int = 10) -> float:
    """
    Return a team's win fraction over its most recent decided games.

    Reads ``games_2025-26.parquet`` from ``API_CACHE_DIR``. Only rows whose
    WL is "W" or "L" are considered, so games without a result do not
    count as losses.

    Args:
        team_abbrev: Team abbreviation looked up in ``NBA_TEAMS``.
        n_games: Maximum number of most recent games to use.

    Returns:
        Win fraction as a plain float, or the neutral value 0.5 when the
        file is missing, the team is unknown, fewer than 3 decided games
        are available, or anything goes wrong while reading.
    """
    try:
        games_path = API_CACHE_DIR / "games_2025-26.parquet"
        if not games_path.exists():
            return 0.5

        # Resolve the team before paying for the parquet read.
        team_id = next((tid for tid, abbr in NBA_TEAMS.items() if abbr == team_abbrev), None)
        if not team_id:
            return 0.5

        df = pd.read_parquet(games_path)
        decided = df[(df['TEAM_ID'] == team_id) & df['WL'].isin(['W', 'L'])]
        recent = decided.sort_values('GAME_DATE', ascending=False).head(n_games)
        if len(recent) < 3:
            return 0.5

        wins = int((recent['WL'] == 'W').sum())
        return wins / len(recent)
    except Exception:
        # Best-effort by design: fall back to neutral form, but leave a trace.
        logger.debug(f"Could not compute recent form for {team_abbrev}", exc_info=True)
        return 0.5
def predict_game(self, home_team: str, away_team: str) -> Dict:
    """
    Generate a prediction for a single game using a multi-factor blend.

    Each team gets a rating on a 0-1 scale:
        40% current season win %, 30% recent form (last 10 games),
        20% ELO win expectancy against a 1500-rated team, 10% fixed 0.5.
    The home rating receives a +0.035 home-court bonus (capped at 0.95),
    then each rating is reduced by 0.02 per injury-impact point and kept
    within [0.05, 0.95]. The two ratings are combined with the Log5 formula
        P(home) = h(1 - a) / (h(1 - a) + a(1 - h))
    and the result is clamped to [0.05, 0.95].

    Args:
        home_team: Home team abbreviation (e.g., "LAL")
        away_team: Away team abbreviation (e.g., "BOS")

    Returns:
        ``{"error": "Unknown team"}`` when either abbreviation is not in
        ``NBA_TEAMS``. Otherwise a dict with probabilities, predicted
        winner, confidence ("high" / "medium" / "low"), ELO values,
        records, form strings, injury data and a list of explanatory
        ``factors``.
    """
    RATING_FLOOR, RATING_CEILING = 0.05, 0.95
    HOME_COURT_ADVANTAGE = 0.035
    INJURY_PENALTY_PER_POINT = 0.02

    def clamp(value):
        # Keeps ratings strictly inside (0, 1) so the Log5 denominator
        # below can never be zero.
        return min(RATING_CEILING, max(RATING_FLOOR, value))

    def blended_rating(win_pct, form, elo):
        # ELO -> win expectancy versus an average (1500) team
        elo_strength = 1.0 / (1.0 + 10 ** (-(elo - 1500) / 400))
        return (
            0.40 * win_pct +       # Season record (most important)
            0.30 * form +          # Recent form (10 games)
            0.20 * elo_strength +  # Historical ELO
            0.10 * 0.5             # Baseline
        )

    # Get team IDs
    home_id = next((tid for tid, abbr in NBA_TEAMS.items() if abbr == home_team), None)
    away_id = next((tid for tid, abbr in NBA_TEAMS.items() if abbr == away_team), None)
    if not home_id or not away_id:
        return {"error": "Unknown team"}

    # 1. Current season standings
    standings = self._get_current_standings_cache()
    home_standings = standings.get(home_team, {'win_pct': 0.5, 'wins': 0, 'losses': 0})
    away_standings = standings.get(away_team, {'win_pct': 0.5, 'wins': 0, 'losses': 0})
    home_win_pct = home_standings['win_pct']
    away_win_pct = away_standings['win_pct']

    # 2. ELO features (historical context)
    elo_features = self.feature_gen.elo.calculate_game_features(
        home_id, away_id, is_home=True
    )

    # 3. Recent form (momentum)
    home_form = self._get_recent_form(home_team, 10)
    away_form = self._get_recent_form(away_team, 10)

    # 4. Injury impact
    home_injuries = self.injury_collector.get_injury_summary(home_team)
    away_injuries = self.injury_collector.get_injury_summary(away_team)
    home_injury_impact = self.injury_collector.calculate_injury_impact(home_team)
    away_injury_impact = self.injury_collector.calculate_injury_impact(away_team)

    # ===== CALCULATE WIN PROBABILITY =====
    home_talent = blended_rating(home_win_pct, home_form, elo_features["team_elo"])
    away_talent = blended_rating(away_win_pct, away_form, elo_features["opponent_elo"])

    # Home court bonus is capped before injuries are subtracted.
    home_talent = min(RATING_CEILING, home_talent + HOME_COURT_ADVANTAGE)

    # Injuries lower the rating; both sides are bounded the same way.
    home_talent = clamp(home_talent - home_injury_impact * INJURY_PENALTY_PER_POINT)
    away_talent = clamp(away_talent - away_injury_impact * INJURY_PENALTY_PER_POINT)

    # Log5 head-to-head probability
    home_edge = home_talent * (1 - away_talent)
    away_edge = away_talent * (1 - home_talent)
    win_prob = float(clamp(home_edge / (home_edge + away_edge)))

    # ===== DETERMINE CONFIDENCE LEVEL =====
    prob_diff = abs(win_prob - 0.5)
    if prob_diff > 0.25:
        confidence = "high"
    elif prob_diff > 0.10:
        confidence = "medium"
    else:
        confidence = "low"

    # ===== BUILD RESULT =====
    result = {
        "home_team": home_team,
        "away_team": away_team,
        "home_win_probability": round(win_prob, 3),
        "away_win_probability": round(1 - win_prob, 3),
        # An exact 0.5 goes to the away team
        "predicted_winner": home_team if win_prob > 0.5 else away_team,
        "confidence": confidence,
        "home_elo": elo_features["team_elo"],
        "away_elo": elo_features["opponent_elo"],
        "elo_diff": elo_features["elo_diff"],
        "home_record": f"{home_standings.get('wins', 0)}-{home_standings.get('losses', 0)}",
        "away_record": f"{away_standings.get('wins', 0)}-{away_standings.get('losses', 0)}",
        "home_form": f"{home_form:.1%}",
        "away_form": f"{away_form:.1%}",
        "home_injuries": home_injuries,
        "away_injuries": away_injuries,
        "home_injury_impact": home_injury_impact,
        "away_injury_impact": away_injury_impact,
        "factors": []
    }
    factors = result["factors"]

    # ===== ADD EXPLAINING FACTORS =====
    # Record comparison
    if home_win_pct > away_win_pct + 0.1:
        factors.append(f"{home_team} has better record ({home_win_pct:.1%} vs {away_win_pct:.1%})")
    elif away_win_pct > home_win_pct + 0.1:
        factors.append(f"{away_team} has better record ({away_win_pct:.1%} vs {home_win_pct:.1%})")

    # Momentum
    if home_form > away_form + 0.15:
        factors.append(f"{home_team} in better recent form (L10: {home_form:.0%})")
    elif away_form > home_form + 0.15:
        factors.append(f"{away_team} in better recent form (L10: {away_form:.0%})")

    # Home court
    factors.append(f"Home court advantage for {home_team}")

    # Injuries
    if home_injuries["total_injuries"] > 0:
        factors.append(f"{home_team} has {home_injuries['total_injuries']} injuries")
    if away_injuries["total_injuries"] > 0:
        factors.append(f"{away_team} has {away_injuries['total_injuries']} injuries")

    return result
def predict_todays_games(self, save_predictions: bool = True) -> List[Dict]:
    """
    Generate predictions for all of today's games.

    Every game that has both team names gets a prediction dict enriched
    with ``game_id``, ``game_date``, ``game_status`` and the current
    scores. A prediction is persisted only when the game has status
    ``"NOT_STARTED"``, has a game id, and the prediction is not an error
    result (e.g. unknown team).

    Args:
        save_predictions: If True, save eligible predictions through
            ``save_prediction_for_game``.

    Returns:
        List of prediction dicts (error results included), or an empty
        list when there are no games.
    """
    games = self.get_todays_games()
    if not games:
        logger.info("No games today")
        return []

    predictions = []
    for game in games:
        home_team = game.get("home_team", "")
        away_team = game.get("away_team", "")
        if not (home_team and away_team):
            continue

        game_id = game.get("game_id", "")
        status = game.get("status", "")

        pred = self.predict_game(home_team, away_team)
        pred["game_id"] = game_id
        pred["game_date"] = game.get("game_date", "")
        pred["game_status"] = status
        pred["current_home_score"] = game.get("home_score", 0)
        pred["current_away_score"] = game.get("away_score", 0)

        # Only track real predictions for games that have not started;
        # an error dict has no winner to score later.
        should_save = (
            save_predictions
            and status == "NOT_STARTED"
            and bool(game_id)
            and "error" not in pred
        )
        if should_save:
            self.save_prediction_for_game(game_id, pred)

        predictions.append(pred)
    return predictions
def save_prediction_for_game(self, game_id: str, prediction: Dict) -> bool:
    """Hand a prediction to the tracker under ``game_id`` and return the tracker's result."""
    outcome = self.prediction_tracker.save_prediction(game_id, prediction)
    return outcome
def check_prediction_results(self) -> List[Dict]:
    """
    Record final scores against tracked predictions.

    For each finished game the winner is the home team when its score is
    strictly higher, otherwise the away team. The tracker is asked to
    update the matching prediction; games the tracker does not update, or
    for which it returns no prediction, are left out of the result.

    Returns:
        The updated predictions, each extended with ``actual_winner``,
        ``home_score`` and ``away_score``.
    """
    resolved = []
    for finished in self.get_final_games():
        gid = finished["game_id"]
        home_pts = finished["home_score"]
        away_pts = finished["away_score"]
        if home_pts > away_pts:
            winner = finished["home_team"]
        else:
            winner = finished["away_team"]

        # Update the prediction in tracker
        was_updated = self.prediction_tracker.update_result(
            game_id=gid,
            actual_winner=winner,
            home_score=home_pts,
            away_score=away_pts
        )
        if not was_updated:
            continue

        record = self.prediction_tracker.get_prediction(gid)
        if not record:
            continue

        record["actual_winner"] = winner
        record["home_score"] = home_pts
        record["away_score"] = away_pts
        resolved.append(record)
    return resolved
def get_accuracy_stats(self) -> Dict:
    """Return the accuracy statistics reported by the prediction tracker."""
    stats = self.prediction_tracker.get_accuracy_stats()
    return stats
def get_recent_predictions(self, n: int = 20) -> List[Dict]:
    """Return the tracker's most recent predictions, asking it for ``n`` of them."""
    latest = self.prediction_tracker.get_recent_predictions(n)
    return latest
def get_pending_predictions(self) -> List[Dict]:
    """Return the predictions the tracker still lists as pending."""
    pending = self.prediction_tracker.get_pending_predictions()
    return pending
def get_games_with_predictions(self) -> List[Dict]:
    """
    Return today's games, each enriched with a prediction and its outcome.

    Every game dict is copied and extended with:
        ``prediction``: the result of ``predict_game`` for the matchup.
        ``actual_winner``: set only for games with status ``"FINAL"``
            (home team when its score is strictly higher, else away team).
        ``prediction_correct``: True/False for final games that have a
            predicted winner; None for unfinished games and for final
            games whose prediction carries no ``predicted_winner`` (for
            example an error result for an unknown team).
    """
    enriched = []
    for game in self.get_todays_games():
        game_data = dict(game)  # Copy, so the collector's dict is untouched

        pred = self.predict_game(game["home_team"], game["away_team"])
        game_data["prediction"] = pred

        actual_winner = None
        prediction_correct = None
        if game["status"] == "FINAL":
            if game["home_score"] > game["away_score"]:
                actual_winner = game["home_team"]
            else:
                actual_winner = game["away_team"]
            # predict_game may return {"error": ...} with no winner;
            # leave correctness undetermined instead of raising KeyError.
            predicted = pred.get("predicted_winner")
            if predicted is not None:
                prediction_correct = predicted == actual_winner

        game_data["actual_winner"] = actual_winner
        game_data["prediction_correct"] = prediction_correct
        enriched.append(game_data)
    return enriched
def get_mvp_race(self, player_df: pd.DataFrame = None) -> pd.DataFrame:
    """
    Return the top 10 MVP candidates ranked by a hand-tuned score.

    Player per-game stats and team standings for season '2025-26' are
    requested from the NBA stats endpoints. Players are kept when they
    average at least 25 minutes over at least 15 games, then scored as

        PTS + 2*AST + REB + 1.5*(STL + BLK) + 0.3*PLUS_MINUS
        + 20*FG_PCT + 30*TEAM_WIN_PCT

    with a separate 0-1 ``mvp_similarity`` profile score. If the request
    fails, a built-in fallback table is returned instead, ranked by
    ``mvp_score`` like the live result.

    Args:
        player_df: Currently unused; kept so existing callers keep working.

    Returns:
        DataFrame with columns PLAYER_NAME, PTS, REB, AST, mvp_score and
        mvp_similarity, highest mvp_score first.
    """
    import time

    max_retries = 1  # Fail fast and use fallback
    for attempt in range(max_retries):
        try:
            from nba_api.stats.endpoints import leaguedashplayerstats, leaguestandings

            # Short pause before each request
            time.sleep(0.5)
            stats = leaguedashplayerstats.LeagueDashPlayerStats(
                season='2025-26',
                per_mode_detailed='PerGame',
                timeout=30  # 30 second timeout
            )
            df = stats.get_data_frames()[0]

            # Get team standings for team win percentage
            time.sleep(1.0)
            standings = leaguestandings.LeagueStandings(
                season='2025-26',
                timeout=60
            )
            standings_df = standings.get_data_frames()[0]

            # Map team win% to players by TEAM_ID; teams with no games
            # are left out and default to 0.5 below
            team_win_pct = {}
            for _, row in standings_df.iterrows():
                wins = row.get('WINS', 0)
                losses = row.get('LOSSES', 0)
                total = wins + losses
                if total > 0:
                    team_win_pct[row.get('TeamID', 0)] = wins / total
            df['TEAM_WIN_PCT'] = df['TEAM_ID'].map(team_win_pct).fillna(0.5)

            # Filter to players with significant minutes (starters/key players)
            df = df[(df['MIN'] >= 25) & (df['GP'] >= 15)].copy()

            # Calculate MVP score directly (no model dependency)
            df['mvp_score'] = (
                df['PTS'].fillna(0) * 1.0 +  # Points
                df['AST'].fillna(0) * 2.0 +  # Assists (playmaking)
                df['REB'].fillna(0) * 1.0 +  # Rebounds
                (df['STL'].fillna(0) + df['BLK'].fillna(0)) * 1.5 +  # Defense
                df['PLUS_MINUS'].fillna(0) * 0.3 +  # Impact
                df['FG_PCT'].fillna(0.45) * 20 +  # Efficiency
                df['TEAM_WIN_PCT'].fillna(0.5) * 30  # Team success
            )

            # Add similarity score (simplified - based on stats profile)
            df['mvp_similarity'] = (
                (df['PTS'] / 30.0).clip(0, 1) * 0.4 +  # Elite scorer
                (df['REB'] / 12.0).clip(0, 1) * 0.2 +  # Elite rebounder
                (df['AST'] / 10.0).clip(0, 1) * 0.2 +  # Elite playmaker
                df['TEAM_WIN_PCT'] * 0.2  # Winning team
            ).fillna(0)

            df = df.sort_values('mvp_score', ascending=False)
            logger.info(f"Successfully fetched MVP data on attempt {attempt + 1}")

            # Return top 10 MVP candidates
            return df.head(10)[['PLAYER_NAME', 'PTS', 'REB', 'AST', 'mvp_score', 'mvp_similarity']]
        except Exception as e:
            logger.warning(f"MVP data fetch attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)  # Exponential backoff

    logger.error("All MVP data fetch attempts failed, returning fallback data")

    # Fallback mock data, used only when the live fetch fails
    fallback = pd.DataFrame({
        'PLAYER_NAME': [
            'Nikola Jokić', 'Shai Gilgeous-Alexander', 'Luka Dončić',
            'Giannis Antetokounmpo', 'Jayson Tatum', 'Anthony Davis',
            'Victor Wembanyama', 'LeBron James', 'Kevin Durant', 'Tyrese Maxey'
        ],
        'PTS': [29.6, 31.8, 33.6, 28.8, 27.2, 26.5, 24.5, 23.8, 27.1, 30.3],
        'REB': [12.2, 4.4, 7.7, 9.5, 8.1, 11.8, 10.9, 7.2, 6.4, 4.4],
        'AST': [11.0, 6.2, 8.7, 5.5, 5.4, 3.2, 3.0, 8.4, 4.2, 6.7],
        'mvp_score': [102.8, 90.6, 89.5, 78.7, 77.4, 76.2, 80.1, 75.8, 74.3, 79.1],
        'mvp_similarity': [0.933, 0.760, 0.822, 0.735, 0.720, 0.705, 0.706, 0.698, 0.685, 0.717]
    })
    # The literal above is not in score order; rank it the same way the
    # live path does so row order always means MVP rank.
    return fallback.sort_values('mvp_score', ascending=False).reset_index(drop=True)
def get_championship_odds(self, team_df: Optional[pd.DataFrame] = None) -> pd.DataFrame:
    """Rank championship contenders, fetching live standings if none are given.

    Args:
        team_df: Optional pre-built team table. When supplied it is passed to
            the championship model unchanged and no API call is made. When
            omitted, the 2025-26 league standings are requested from the NBA
            stats API and converted into a table with the columns
            ``TEAM_ABBREVIATION``, ``W_PCT``, ``NET_RATING`` and ``ELO``.
            If that request fails or returns no rows, a hard-coded 16-team
            table (without an ``ELO`` column) is used instead.

    Returns:
        The result of ``self.champ_model.get_top_contenders`` for the table.
    """
    if team_df is None:

        def _guess_abbreviation(city) -> str:
            """Best-effort match of a city name against known abbreviations.

            Substring matching is unreliable (a code may occur inside an
            unrelated city name, and a city such as "New York" never contains
            "nyk"), so this is only the last resort for rows that could not
            be resolved by team id.
            """
            city_lc = str(city).lower()
            if not city_lc:
                # An empty string is a substring of everything; do not let it
                # silently match the first team.
                return 'UNK'
            return next(
                (
                    abbr for abbr in NBA_TEAMS.values()
                    if city_lc in abbr.lower() or abbr.lower() in city_lc
                ),
                'UNK',
            )

        # Fetch real current season standings from NBA API
        max_retries = 1  # Fail fast and use fallback
        for attempt in range(max_retries):
            try:
                from nba_api.stats.endpoints import leaguestandings
                import time

                time.sleep(0.5)
                standings = leaguestandings.LeagueStandings(
                    season='2025-26',
                    timeout=30,
                )
                df = standings.get_data_frames()[0]

                if df.empty:
                    logger.warning("NBA API returned empty standings data")
                    continue

                logger.info(f"Got standings for {len(df)} teams from NBA API")

                # Resolve abbreviations from the most to the least reliable
                # source: explicit column, exact team-id lookup, city guess.
                if 'TeamAbbreviation' in df.columns:
                    abbreviations = df['TeamAbbreviation']
                else:
                    if 'TeamID' in df.columns:
                        # NBA_TEAMS is keyed by team id; ids that are not in
                        # it come back as NaN and are handled below.
                        abbreviations = df['TeamID'].map(NBA_TEAMS)
                    else:
                        abbreviations = pd.Series(index=df.index, dtype=object)

                    unresolved = abbreviations.isna()
                    if unresolved.any():
                        # astype(object) so string codes can be written into
                        # a column that may be all-NaN (float) at this point.
                        abbreviations = abbreviations.astype(object)
                        abbreviations.loc[unresolved] = (
                            df.loc[unresolved, 'TeamCity'].map(_guess_abbreviation)
                        )

                # Build into a local frame so team_df is only set once every
                # column, including ELO, has been produced without error.
                live_df = pd.DataFrame({
                    'TEAM_ABBREVIATION': abbreviations,
                    'W_PCT': df['WinPCT'].fillna(0.5),
                    'NET_RATING': df['NetRating'].fillna(0) if 'NetRating' in df.columns else 0,
                })

                # Add ELO ratings from our feature generator
                elo_ratings = {
                    abbrev: self.feature_gen.elo.get_rating(team_id)
                    for team_id, abbrev in NBA_TEAMS.items()
                }
                live_df['ELO'] = live_df['TEAM_ABBREVIATION'].map(elo_ratings).fillna(1500)

                team_df = live_df
                logger.info(f"Successfully built championship data for {len(team_df)} teams")
                break
            except Exception as e:
                # Deliberate best-effort: any failure (import, network,
                # unexpected schema) falls through to the static table.
                logger.warning(f"Championship standings fetch attempt {attempt + 1} failed: {e}")
                continue
        else:
            # All retries failed - use fallback mock data
            # NOTE(review): unlike the live table this one has no ELO column;
            # confirm the championship model tolerates its absence.
            logger.warning("Using fallback championship odds data")
            team_df = pd.DataFrame({
                "TEAM_ABBREVIATION": ["OKC", "CLE", "BOS", "DEN", "MEM", "HOU", "NYK", "GSW",
                                      "MIN", "LAL", "MIL", "PHX", "DAL", "MIA", "SAC", "IND"],
                "W_PCT": [0.74, 0.70, 0.66, 0.62, 0.60, 0.58, 0.56, 0.54,
                          0.52, 0.50, 0.48, 0.46, 0.44, 0.42, 0.40, 0.38],
                "NET_RATING": [10.5, 8.2, 7.5, 6.0, 5.5, 4.5, 4.0, 3.5,
                               3.0, 2.5, 2.0, 1.5, 1.0, 0.5, 0.0, -0.5],
            })

    return self.champ_model.get_top_contenders(team_df)
| # ============================================================================= | |
| # CLI INTERFACE | |
| # ============================================================================= | |
if __name__ == "__main__":
    import argparse

    # Command-line entry point. When several switches are given, the first
    # one set wins, checked in the order --test, --today, --game.
    cli = argparse.ArgumentParser(description="NBA Prediction Pipeline")
    cli.add_argument("--test", action="store_true", help="Run test prediction")
    cli.add_argument("--today", action="store_true", help="Predict today's games")
    cli.add_argument("--game", nargs=2, help="Predict single game: HOME AWAY")
    options = cli.parse_args()

    # The pipeline is created up front, even if only the usage hint is shown.
    pipeline = PredictionPipeline()

    if options.test:
        # Fixed smoke-test matchup.
        print("Testing prediction pipeline...")
        for field, value in pipeline.predict_game("LAL", "BOS").items():
            print(f" {field}: {value}")
    elif options.today:
        print("Today's game predictions:")
        for game in pipeline.predict_todays_games():
            print(f"\n{game['away_team']} @ {game['home_team']}")
            print(f" Predicted winner: {game['predicted_winner']}")
            print(f" Win probability: {game['home_win_probability']:.1%}")
    elif options.game:
        # Team codes are upper-cased once and reused for the call and header.
        home_code, away_code = (code.upper() for code in options.game)
        outcome = pipeline.predict_game(home_code, away_code)
        print(f"\n{away_code} @ {home_code}")
        for field, value in outcome.items():
            print(f" {field}: {value}")
    else:
        print("Use --test, --today, or --game HOME AWAY")