# NBA_PREDICTOR / src / prediction_pipeline.py
# jashdoshi77's picture
# Fix auto-training and add dynamic MVP/Championship predictions
# dfac64b
"""
NBA ML Prediction System - Prediction Pipeline
===============================================
End-to-end pipeline for generating predictions with live data integration.
"""
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime, timedelta
from typing import Dict, List, Optional
import logging
from nba_api.stats.endpoints import leaguegamefinder
from nba_api.stats.static import teams
from src.config import (
API_CACHE_DIR,
MODELS_DIR,
NBA_TEAMS,
API_CONFIG
)
from src.data_collector import CacheManager, retry_with_backoff
from src.feature_engineering import FeatureGenerator
from src.injury_collector import InjuryCollector
from src.models.game_predictor import GamePredictor
from src.models.mvp_predictor import MVPPredictor
from src.models.championship_predictor import ChampionshipPredictor
from src.preprocessing import DataPreprocessor
from src.live_data_collector import LiveDataCollector
from src.prediction_tracker import PredictionTracker
logger = logging.getLogger(__name__)
# =============================================================================
# PREDICTION PIPELINE
# =============================================================================
class PredictionPipeline:
"""
End-to-end prediction pipeline for:
- Today's games (with live scores)
- Upcoming games with predictions
- MVP race
- Championship odds
- Prediction tracking and accuracy
"""
def __init__(self):
    """Create the pipeline's collaborators and seed the ELO ratings.

    Model objects are not loaded here; the ``game_model``, ``mvp_model``
    and ``champ_model`` properties create them on first access.
    """
    # Shared helpers: API cache, feature generation and injury data.
    self.cache = CacheManager()
    self.feature_gen = FeatureGenerator()
    self.injury_collector = InjuryCollector()

    # Live scoreboard access and prediction tracking.
    self.live_collector = LiveDataCollector()
    self.prediction_tracker = PredictionTracker()

    # Lazily filled slots; ``None`` means "not created yet".
    self._game_model = self._mvp_model = self._champ_model = None
    self._preprocessor = None

    # Replay cached historical games so ELO features start from real results.
    self._initialize_elo_from_history()
def _initialize_elo_from_history(self):
    """
    Replay cached historical games to build ELO ratings.

    Reads ``all_games_summary.parquet`` from ``API_CACHE_DIR``, walks the
    rows in ``GAME_DATE`` order and feeds at most one result per ``GAME_ID``
    into ``self.feature_gen.elo``. Ratings are regressed to the mean every
    time ``SEASON_ID`` changes.

    The method is best-effort: a missing file or any unexpected error is
    logged as a warning and never propagates to the caller.
    """
    try:
        games_path = API_CACHE_DIR / "all_games_summary.parquet"
        logger.info(f"Looking for ELO data at: {games_path}")
        logger.info(f"API_CACHE_DIR exists: {API_CACHE_DIR.exists()}")
        if API_CACHE_DIR.exists():
            logger.info(f"API_CACHE_DIR contents: {list(API_CACHE_DIR.glob('*.parquet'))[:5]}")
        if not games_path.exists():
            logger.warning(f"No historical game data found for ELO initialization at {games_path}")
            return

        # Sort by date so games are processed chronologically.
        games_df = pd.read_parquet(games_path).sort_values("GAME_DATE")

        # Abbreviation -> team id, built once instead of scanning NBA_TEAMS
        # for every row. ``setdefault`` keeps the first id per abbreviation,
        # matching the previous first-match lookup.
        id_by_abbrev = {}
        for tid, abbr in NBA_TEAMS.items():
            id_by_abbrev.setdefault(abbr, tid)

        # Each game appears once per team; only the first row seen is used.
        processed_games = set()
        applied_games = 0
        current_season = None

        for _, row in games_df.iterrows():
            game_id = row["GAME_ID"]
            if game_id in processed_games:
                continue
            processed_games.add(game_id)

            # Regress ELO at season changes.
            season = row.get("SEASON_ID", "")
            if season != current_season:
                if current_season is not None:
                    self.feature_gen.elo.regress_to_mean()
                current_season = season

            matchup = row.get("MATCHUP", "")
            wl = row.get("WL", "")
            # Skip rows without a decided result. A NaN ``WL`` used to be
            # scored as a loss, and a non-string ``MATCHUP`` raised and
            # aborted the whole replay.
            if not isinstance(matchup, str) or not matchup or wl not in ("W", "L"):
                continue

            # Matchup looks like "LAL vs. BOS" (home) or "LAL @ BOS" (away).
            is_home = "vs." in matchup
            opponent_id = id_by_abbrev.get(matchup.split(" ")[-1])
            if opponent_id:
                self.feature_gen.elo.update_ratings(
                    row["TEAM_ID"], opponent_id, wl == "W", is_home
                )
                applied_games += 1

        logger.info(f"Initialized ELO ratings from {applied_games} games")

        # Log a few example ratings for verification.
        for abbrev in ("LAL", "BOS", "GSW", "MIL", "DEN"):
            team_id = id_by_abbrev.get(abbrev)
            if team_id:
                rating = self.feature_gen.elo.get_rating(team_id)
                logger.info(f" {abbrev}: {rating:.0f}")
    except Exception as e:
        logger.warning(f"Could not initialize ELO from history: {e}")
@property
def game_model(self) -> GamePredictor:
    """Return the game predictor, creating and loading it on first access.

    If ``load()`` fails, the freshly constructed (untrained) predictor is
    kept and a warning is logged, so callers always receive an object.
    """
    if self._game_model is None:
        self._game_model = GamePredictor()
        try:
            self._game_model.load()
        except Exception as exc:
            # Previously a bare ``except:``, which also swallowed
            # KeyboardInterrupt/SystemExit and hid the failure reason.
            logger.warning("Game model not found, using untrained model")
            logger.debug("Game model load failed: %s", exc)
    return self._game_model
@property
def mvp_model(self) -> MVPPredictor:
    """Return the MVP predictor, creating and loading it on first access.

    If ``load()`` fails, the freshly constructed (untrained) predictor is
    kept and a warning is logged, so callers always receive an object.
    """
    if self._mvp_model is None:
        self._mvp_model = MVPPredictor()
        try:
            self._mvp_model.load()
        except Exception as exc:
            # Previously a bare ``except:``, which also swallowed
            # KeyboardInterrupt/SystemExit and hid the failure reason.
            logger.warning("MVP model not found, using untrained model")
            logger.debug("MVP model load failed: %s", exc)
    return self._mvp_model
@property
def champ_model(self) -> ChampionshipPredictor:
    """Return the championship predictor, creating and loading it on first access.

    If ``load()`` fails, the freshly constructed (untrained) predictor is
    kept and a warning is logged, so callers always receive an object.
    """
    if self._champ_model is None:
        self._champ_model = ChampionshipPredictor()
        try:
            self._champ_model.load()
        except Exception as exc:
            # Previously a bare ``except:``, which also swallowed
            # KeyboardInterrupt/SystemExit and hid the failure reason.
            logger.warning("Championship model not found, using untrained model")
            logger.debug("Championship model load failed: %s", exc)
    return self._champ_model
def get_todays_games(self) -> List[Dict]:
    """Return today's scoreboard as reported by the live data collector."""
    scoreboard = self.live_collector.get_live_scoreboard()
    return scoreboard
def get_live_games(self) -> List[Dict]:
    """Return the games the live data collector reports as in progress."""
    in_progress = self.live_collector.get_live_games()
    return in_progress
def get_final_games(self) -> List[Dict]:
    """Return the games the live data collector reports as finished."""
    finished = self.live_collector.get_final_games()
    return finished
def get_upcoming_games(self, days_ahead: int = 7) -> List[Dict]:
    """
    Return today's not-yet-started games in a display-friendly shape.

    Only the games returned by ``self.live_collector.get_upcoming_games()``
    are included; no schedule for future days is fetched here.

    Args:
        days_ahead: Accepted for backward compatibility; currently unused
            because only today's games are returned.

    Returns:
        One dict per game with the keys ``game_id``, ``date``, ``time``,
        ``day_name``, ``home_team``, ``away_team``, ``home_record`` and
        ``away_record``. ``date`` falls back to today's date and ``time``
        to ``"TBD"`` when the collector leaves them empty or omits them.

    Raises:
        KeyError: If a game lacks ``game_id``, ``home_team`` or ``away_team``.
    """
    today = datetime.now()
    default_date = today.strftime("%Y-%m-%d")
    day_name = today.strftime("%A")

    # ``or []`` tolerates a collector that returns None when nothing is scheduled.
    todays_upcoming = self.live_collector.get_upcoming_games() or []

    return [
        {
            "game_id": game["game_id"],
            # ``.get`` so a missing optional key falls back like an empty one.
            "date": game.get("game_date") or default_date,
            "time": game.get("status_text") or "TBD",
            "day_name": day_name,
            "home_team": game["home_team"],
            "away_team": game["away_team"],
            "home_record": game.get("home_record", ""),
            "away_record": game.get("away_record", ""),
        }
        for game in todays_upcoming
    ]
def get_team_roster(self, team_abbrev: str) -> List[Dict]:
    """
    Return a hardcoded projected starting five for ``team_abbrev``.

    This is the instant, no-network fallback: the lineups below are a
    static table (labelled as 2025-26 starters), not live data. Every
    player is a dict with ``name``, ``position`` and ``pts`` keys. An
    unknown abbreviation yields five placeholder starters with 0 points.
    """
    # (name, position, points per game) for each of the 30 teams.
    starters_by_team = {
        "ATL": (("Trae Young", "G", 23.5), ("Jalen Johnson", "F", 19.1), ("De'Andre Hunter", "F", 15.2), ("Clint Capela", "C", 8.5), ("Dyson Daniels", "G", 11.2)),
        "BOS": (("Jayson Tatum", "F", 27.5), ("Jaylen Brown", "G", 24.1), ("Derrick White", "G", 16.2), ("Kristaps Porzingis", "C", 18.8), ("Jrue Holiday", "G", 12.5)),
        "BKN": (("Cam Thomas", "G", 24.8), ("Cameron Johnson", "F", 14.5), ("Nic Claxton", "C", 11.2), ("Dennis Schroder", "G", 17.1), ("Dorian Finney-Smith", "F", 9.5)),
        "CHA": (("LaMelo Ball", "G", 22.5), ("Brandon Miller", "F", 18.2), ("Miles Bridges", "F", 16.8), ("Mark Williams", "C", 11.5), ("Tre Mann", "G", 10.2)),
        "CHI": (("Zach LaVine", "G", 22.1), ("Coby White", "G", 19.5), ("Patrick Williams", "F", 12.8), ("Nikola Vucevic", "C", 17.5), ("Josh Giddey", "G", 13.2)),
        "CLE": (("Donovan Mitchell", "G", 26.5), ("Darius Garland", "G", 21.2), ("Evan Mobley", "F", 18.1), ("Jarrett Allen", "C", 16.5), ("Max Strus", "G", 11.2)),
        "DAL": (("Luka Doncic", "G", 33.5), ("Kyrie Irving", "G", 25.2), ("Klay Thompson", "G", 14.1), ("Daniel Gafford", "C", 12.5), ("P.J. Washington", "F", 13.8)),
        "DEN": (("Nikola Jokic", "C", 29.5), ("Jamal Murray", "G", 21.2), ("Michael Porter Jr.", "F", 17.5), ("Aaron Gordon", "F", 14.1), ("Russell Westbrook", "G", 10.5)),
        "DET": (("Cade Cunningham", "G", 24.2), ("Jaden Ivey", "G", 17.5), ("Ausar Thompson", "F", 11.2), ("Jalen Duren", "C", 13.8), ("Tobias Harris", "F", 12.5)),
        "GSW": (("Stephen Curry", "G", 26.8), ("Andrew Wiggins", "F", 16.5), ("Jonathan Kuminga", "F", 14.2), ("Draymond Green", "F", 9.1), ("Kevon Looney", "C", 7.5)),
        "HOU": (("Jalen Green", "G", 22.5), ("Alperen Sengun", "C", 19.2), ("Fred VanVleet", "G", 15.8), ("Jabari Smith Jr.", "F", 14.5), ("Dillon Brooks", "F", 12.2)),
        "IND": (("Tyrese Haliburton", "G", 20.5), ("Pascal Siakam", "F", 21.2), ("Myles Turner", "C", 17.1), ("Andrew Nembhard", "G", 11.5), ("Bennedict Mathurin", "G", 15.2)),
        "LAC": (("James Harden", "G", 21.5), ("Kawhi Leonard", "F", 23.8), ("Norman Powell", "G", 18.2), ("Ivica Zubac", "C", 12.5), ("Terance Mann", "G", 9.8)),
        "LAL": (("LeBron James", "F", 25.5), ("Anthony Davis", "C", 27.2), ("Austin Reaves", "G", 18.1), ("D'Angelo Russell", "G", 14.5), ("Rui Hachimura", "F", 12.8)),
        "MEM": (("Ja Morant", "G", 25.8), ("Desmond Bane", "G", 21.2), ("Jaren Jackson Jr.", "F", 22.5), ("Zach Edey", "C", 10.5), ("Marcus Smart", "G", 9.2)),
        "MIA": (("Jimmy Butler", "F", 20.5), ("Tyler Herro", "G", 21.2), ("Bam Adebayo", "C", 19.8), ("Terry Rozier", "G", 16.5), ("Jaime Jaquez Jr.", "F", 12.2)),
        "MIL": (("Giannis Antetokounmpo", "F", 30.5), ("Damian Lillard", "G", 25.2), ("Khris Middleton", "F", 14.1), ("Brook Lopez", "C", 12.5), ("Gary Trent Jr.", "G", 11.8)),
        "MIN": (("Anthony Edwards", "G", 27.5), ("Julius Randle", "F", 20.2), ("Rudy Gobert", "C", 14.5), ("Mike Conley", "G", 10.1), ("Jaden McDaniels", "F", 12.2)),
        "NOP": (("Zion Williamson", "F", 22.5), ("Brandon Ingram", "F", 21.8), ("CJ McCollum", "G", 18.5), ("Dejounte Murray", "G", 14.2), ("Trey Murphy III", "F", 15.1)),
        "NYK": (("Jalen Brunson", "G", 28.5), ("Karl-Anthony Towns", "C", 25.2), ("Mikal Bridges", "F", 18.1), ("OG Anunoby", "F", 15.5), ("Josh Hart", "G", 12.2)),
        "OKC": (("Shai Gilgeous-Alexander", "G", 32.5), ("Jalen Williams", "F", 20.2), ("Chet Holmgren", "C", 18.1), ("Lu Dort", "G", 11.5), ("Isaiah Hartenstein", "C", 9.8)),
        "ORL": (("Paolo Banchero", "F", 24.5), ("Franz Wagner", "F", 22.2), ("Jalen Suggs", "G", 14.1), ("Wendell Carter Jr.", "C", 12.5), ("Anthony Black", "G", 8.2)),
        "PHI": (("Tyrese Maxey", "G", 26.5), ("Paul George", "F", 22.2), ("Joel Embiid", "C", 28.5), ("Kelly Oubre Jr.", "F", 12.1), ("Kyle Lowry", "G", 8.5)),
        "PHX": (("Kevin Durant", "F", 27.5), ("Devin Booker", "G", 26.2), ("Bradley Beal", "G", 18.5), ("Jusuf Nurkic", "C", 11.2), ("Tyus Jones", "G", 10.1)),
        "POR": (("Anfernee Simons", "G", 22.5), ("Scoot Henderson", "G", 16.2), ("Shaedon Sharpe", "G", 14.8), ("Jerami Grant", "F", 18.1), ("Deandre Ayton", "C", 17.5)),
        "SAC": (("De'Aaron Fox", "G", 27.5), ("Domantas Sabonis", "C", 21.2), ("DeMar DeRozan", "F", 18.5), ("Keegan Murray", "F", 15.1), ("Malik Monk", "G", 14.2)),
        "SAS": (("Victor Wembanyama", "C", 24.5), ("Devin Vassell", "G", 18.2), ("Chris Paul", "G", 10.5), ("Harrison Barnes", "F", 12.1), ("Jeremy Sochan", "F", 14.8)),
        "TOR": (("Scottie Barnes", "F", 22.5), ("RJ Barrett", "G", 18.2), ("Immanuel Quickley", "G", 16.5), ("Jakob Poeltl", "C", 14.1), ("Gradey Dick", "G", 12.8)),
        "UTA": (("Lauri Markkanen", "F", 23.5), ("Collin Sexton", "G", 17.2), ("Jordan Clarkson", "G", 16.5), ("Walker Kessler", "C", 10.1), ("John Collins", "F", 14.2)),
        "WAS": (("Jordan Poole", "G", 18.5), ("Kyle Kuzma", "F", 17.2), ("Bilal Coulibaly", "F", 11.5), ("Jonas Valanciunas", "C", 12.8), ("Malcolm Brogdon", "G", 14.1)),
    }
    # Returned for any abbreviation that is not in the table.
    placeholder_five = (
        ("Starter 1", "G", 0),
        ("Starter 2", "G", 0),
        ("Starter 3", "F", 0),
        ("Starter 4", "F", 0),
        ("Starter 5", "C", 0),
    )

    lineup = starters_by_team.get(team_abbrev, placeholder_five)
    # 'pts' is the field name the server API and frontend expect.
    return [
        {"name": player, "position": slot, "pts": scoring}
        for player, slot, scoring in lineup
    ]
def get_team_record(self, team_id: int, season: str = "2024-25") -> Dict:
    """
    Fetch a team's win/loss record for ``season`` via ``LeagueGameFinder``.

    Args:
        team_id: NBA team id passed as ``team_id_nullable``.
        season: Season string passed as ``season_nullable``.

    Returns:
        Dict with ``wins``, ``losses`` (plain ints) and ``win_pct`` (float).
        When the lookup fails or returns no games, the neutral record
        ``{"wins": 0, "losses": 0, "win_pct": 0.5}`` is returned.
    """
    try:
        games = leaguegamefinder.LeagueGameFinder(
            team_id_nullable=team_id,
            season_nullable=season,
        ).get_data_frames()[0]
        if games.empty:
            return {"wins": 0, "losses": 0, "win_pct": 0.5}

        # Cast to built-in types: pandas sums are numpy scalars, which are
        # not JSON-serializable.
        wins = int((games["WL"] == "W").sum())
        losses = int((games["WL"] == "L").sum())
        decided = wins + losses
        return {
            "wins": wins,
            "losses": losses,
            "win_pct": wins / decided if decided > 0 else 0.5,
        }
    except Exception as exc:
        # Previously a bare ``except:`` that hid the reason and also caught
        # KeyboardInterrupt/SystemExit. Still best-effort, but now logged.
        logger.warning("Could not fetch record for team %s (%s): %s", team_id, season, exc)
        return {"wins": 0, "losses": 0, "win_pct": 0.5}
def _get_current_standings_cache(self) -> Dict[str, Dict]:
    """
    Return standings keyed by team abbreviation, loading them at most once.

    On the first call the cache is filled from ``standings_2025-26.parquet``
    in ``API_CACHE_DIR`` (when that file exists). Each entry holds ``wins``,
    ``losses``, ``win_pct`` and ``games_played``. Load errors are logged and
    whatever was collected so far is kept.
    """
    existing = getattr(self, '_standings_cache', None)
    if existing is not None:
        return existing

    self._standings_cache = {}
    try:
        # Cached standings file for the current season.
        standings_path = API_CACHE_DIR / "standings_2025-26.parquet"
        if standings_path.exists():
            table = pd.read_parquet(standings_path)
            for _, entry in table.iterrows():
                name = entry.get('TeamName', entry.get('TEAM_NAME', ''))
                tid = entry.get('TeamID', entry.get('TEAM_ID', 0))

                # Preferred path: resolve the abbreviation from the team id.
                code = NBA_TEAMS.get(tid, '')
                if not code and name:
                    # Fallback: first abbreviation that matches the name text.
                    code = next(
                        (
                            abb for abb in NBA_TEAMS.values()
                            if abb in name or name.split()[-1][:3].upper() == abb
                        ),
                        code,
                    )
                if not code:
                    continue

                won = entry.get('WINS', entry.get('W', 0))
                lost = entry.get('LOSSES', entry.get('L', 0))
                played = won + lost
                self._standings_cache[code] = {
                    'wins': won,
                    'losses': lost,
                    'win_pct': won / played if played > 0 else 0.5,
                    'games_played': played
                }
            logger.info(f"Loaded standings for {len(self._standings_cache)} teams")
    except Exception as e:
        logger.warning(f"Could not load standings cache: {e}")
    return self._standings_cache
def _get_recent_form(self, team_abbrev: str, n_games: int = 10) -> float:
    """
    Return the team's win rate over its most recent decided games.

    Args:
        team_abbrev: Team abbreviation looked up in ``NBA_TEAMS``.
        n_games: How many of the latest games (by ``GAME_DATE``) to use.

    Returns:
        Wins divided by games considered, as a plain float. The neutral
        value 0.5 is returned when the team is unknown, the cached
        ``games_2025-26.parquet`` file is missing, fewer than 3 games are
        available, or anything goes wrong while reading the data.
    """
    try:
        # Resolve the team first so an unknown abbreviation costs no disk I/O.
        team_id = next((tid for tid, abbr in NBA_TEAMS.items() if abbr == team_abbrev), None)
        if not team_id:
            return 0.5

        games_path = API_CACHE_DIR / "games_2025-26.parquet"
        if not games_path.exists():
            return 0.5

        df = pd.read_parquet(games_path)
        team_games = df[df['TEAM_ID'] == team_id]
        # Only decided games count; rows without a result used to be
        # treated as losses.
        team_games = team_games[team_games['WL'].isin(['W', 'L'])]
        team_games = team_games.sort_values('GAME_DATE', ascending=False).head(n_games)
        if len(team_games) < 3:
            return 0.5

        wins = (team_games['WL'] == 'W').sum()
        return float(wins / len(team_games))
    except Exception as exc:
        # Best-effort: fall back to neutral form, but leave a trace.
        logger.debug("Could not compute recent form for %s: %s", team_abbrev, exc)
        return 0.5
def predict_game(self, home_team: str, away_team: str) -> Dict:
    """
    Predict the winner of a single game with a blended, multi-factor rating.

    Each side gets a "talent" score built from:
    - season win % from the cached standings (40%)
    - win % over the last 10 games (30%)
    - ELO win expectancy against a 1500-rated team (20%)
    - a neutral 0.5 baseline (10%)
    The home side then gains a fixed 0.035 bonus (capped at 0.95) and both
    sides lose 0.02 per injury-impact point (floored at 0.05). The two
    scores are combined with the Log5 formula and the result is clamped to
    the 5%-95% range.

    Args:
        home_team: Home team abbreviation (e.g., "LAL")
        away_team: Away team abbreviation (e.g., "BOS")

    Returns:
        ``{"error": "Unknown team"}`` when either abbreviation is not in
        ``NBA_TEAMS``; otherwise a dict with the win probabilities, the
        predicted winner, a confidence label, ELO/record/form/injury
        details and a list of human-readable ``factors``.
    """
    # Get team IDs
    home_id = next((tid for tid, abbr in NBA_TEAMS.items() if abbr == home_team), None)
    away_id = next((tid for tid, abbr in NBA_TEAMS.items() if abbr == away_team), None)
    if not home_id or not away_id:
        return {"error": "Unknown team"}

    # ===== GATHER INPUTS =====
    # 1. Current season standings
    neutral_record = {'win_pct': 0.5, 'wins': 0, 'losses': 0}
    standings = self._get_current_standings_cache()
    home_standings = standings.get(home_team, neutral_record)
    away_standings = standings.get(away_team, neutral_record)
    home_win_pct = home_standings['win_pct']
    away_win_pct = away_standings['win_pct']

    # 2. ELO features (historical context)
    elo_features = self.feature_gen.elo.calculate_game_features(
        home_id, away_id, is_home=True
    )

    # 3. Recent form (momentum)
    home_form = self._get_recent_form(home_team, 10)
    away_form = self._get_recent_form(away_team, 10)

    # 4. Injury impact
    home_injuries = self.injury_collector.get_injury_summary(home_team)
    away_injuries = self.injury_collector.get_injury_summary(away_team)
    home_injury_impact = self.injury_collector.calculate_injury_impact(home_team)
    away_injury_impact = self.injury_collector.calculate_injury_impact(away_team)

    # ===== CALCULATE WIN PROBABILITY =====
    # ELO -> win expectancy against an average (1500-rated) team.
    home_elo_strength = 1.0 / (1.0 + 10 ** (-(elo_features["team_elo"] - 1500) / 400))
    away_elo_strength = 1.0 / (1.0 + 10 ** (-(elo_features["opponent_elo"] - 1500) / 400))

    # Blend factors into a "true talent" rating for each team.
    home_talent = (
        0.40 * home_win_pct +       # Season record (most important)
        0.30 * home_form +          # Recent form (10 games)
        0.20 * home_elo_strength +  # Historical ELO
        0.10 * 0.5                  # Baseline
    )
    away_talent = (
        0.40 * away_win_pct +
        0.30 * away_form +
        0.20 * away_elo_strength +
        0.10 * 0.5
    )

    # Home court advantage (typically 3-4% in NBA)
    HOME_COURT_ADVANTAGE = 0.035
    home_talent = min(0.95, home_talent + HOME_COURT_ADVANTAGE)

    # Each injury-impact point reduces talent by ~2%; never below 0.05.
    home_talent = max(0.05, home_talent - home_injury_impact * 0.02)
    away_talent = max(0.05, away_talent - away_injury_impact * 0.02)

    # Log5: P(A beats B) = pA(1 - pB) / (pA(1 - pB) + pB(1 - pA)).
    # Both talents are floored at 0.05 above, so the former "== 0" special
    # cases were unreachable; a single guard covers a degenerate denominator.
    home_edge = home_talent * (1 - away_talent)
    away_edge = away_talent * (1 - home_talent)
    denominator = home_edge + away_edge
    win_prob = home_edge / denominator if denominator > 0 else 0.5

    # Clamp to reasonable range (5% - 95%)
    win_prob = max(0.05, min(0.95, win_prob))

    # ===== DETERMINE CONFIDENCE LEVEL =====
    prob_diff = abs(win_prob - 0.5)
    if prob_diff > 0.25:
        confidence = "high"
    elif prob_diff > 0.10:
        confidence = "medium"
    else:
        confidence = "low"

    # Derive the away probability from the rounded home value so the two
    # reported numbers always sum to exactly 1.
    home_win_probability = round(win_prob, 3)
    away_win_probability = round(1 - home_win_probability, 3)

    # ===== BUILD RESULT =====
    result = {
        "home_team": home_team,
        "away_team": away_team,
        "home_win_probability": home_win_probability,
        "away_win_probability": away_win_probability,
        "predicted_winner": home_team if win_prob > 0.5 else away_team,
        "confidence": confidence,
        "home_elo": elo_features["team_elo"],
        "away_elo": elo_features["opponent_elo"],
        "elo_diff": elo_features["elo_diff"],
        "home_record": f"{home_standings.get('wins', 0)}-{home_standings.get('losses', 0)}",
        "away_record": f"{away_standings.get('wins', 0)}-{away_standings.get('losses', 0)}",
        "home_form": f"{home_form:.1%}",
        "away_form": f"{away_form:.1%}",
        "home_injuries": home_injuries,
        "away_injuries": away_injuries,
        "home_injury_impact": home_injury_impact,
        "away_injury_impact": away_injury_impact,
        "factors": []
    }

    # ===== ADD EXPLAINING FACTORS =====
    factors = result["factors"]
    # Record comparison
    if home_win_pct > away_win_pct + 0.1:
        factors.append(f"{home_team} has better record ({home_win_pct:.1%} vs {away_win_pct:.1%})")
    elif away_win_pct > home_win_pct + 0.1:
        factors.append(f"{away_team} has better record ({away_win_pct:.1%} vs {home_win_pct:.1%})")
    # Momentum
    if home_form > away_form + 0.15:
        factors.append(f"{home_team} in better recent form (L10: {home_form:.0%})")
    elif away_form > home_form + 0.15:
        factors.append(f"{away_team} in better recent form (L10: {away_form:.0%})")
    # Home court
    factors.append(f"Home court advantage for {home_team}")
    # Injuries
    if home_injuries["total_injuries"] > 0:
        factors.append(f"{home_team} has {home_injuries['total_injuries']} injuries")
    if away_injuries["total_injuries"] > 0:
        factors.append(f"{away_team} has {away_injuries['total_injuries']} injuries")
    return result
def predict_todays_games(self, save_predictions: bool = True) -> List[Dict]:
    """
    Generate predictions for all of today's games.

    Games without both team abbreviations are skipped. Each prediction is
    annotated with ``game_id``, ``game_date``, ``game_status`` and the
    current scores taken from the scoreboard entry.

    Args:
        save_predictions: If True, predictions for games whose status is
            ``"NOT_STARTED"`` are handed to ``save_prediction_for_game``.
            Error results (e.g. unknown team) and games without an id are
            never saved.

    Returns:
        One prediction dict per game with both teams present (error
        results included), or an empty list when there are no games.
    """
    games = self.get_todays_games()
    if not games:
        logger.info("No games today")
        return []

    predictions = []
    for game in games:
        home_team = game.get("home_team", "")
        away_team = game.get("away_team", "")
        if not (home_team and away_team):
            continue

        pred = self.predict_game(home_team, away_team)
        pred["game_id"] = game.get("game_id", "")
        pred["game_date"] = game.get("game_date", "")
        pred["game_status"] = game.get("status", "")
        pred["current_home_score"] = game.get("home_score", 0)
        pred["current_away_score"] = game.get("away_score", 0)

        # Persist only real predictions for games that have not started.
        # An {"error": ...} result is not a prediction, and a missing id
        # previously raised KeyError at this point.
        should_save = (
            save_predictions
            and game.get("status") == "NOT_STARTED"
            and "error" not in pred
            and bool(pred["game_id"])
        )
        if should_save:
            self.save_prediction_for_game(pred["game_id"], pred)
        predictions.append(pred)
    return predictions
def save_prediction_for_game(self, game_id: str, prediction: Dict) -> bool:
    """Hand ``prediction`` for ``game_id`` to the tracker and return its result."""
    tracker = self.prediction_tracker
    return tracker.save_prediction(game_id, prediction)
def check_prediction_results(self) -> List[Dict]:
    """
    Record final scores for today's finished games in the tracker.

    For every finished game the winner is the home team when its score is
    strictly higher, otherwise the away team. Games the tracker could not
    update, or for which it returns no stored prediction, are left out.

    Returns:
        The stored predictions that were updated, each extended with
        ``actual_winner``, ``home_score`` and ``away_score``.
    """
    resolved = []
    for finished in self.get_final_games():
        gid = finished["game_id"]
        home_pts = finished["home_score"]
        away_pts = finished["away_score"]
        if home_pts > away_pts:
            winner = finished["home_team"]
        else:
            winner = finished["away_team"]

        # Push the outcome into the tracker; skip games it does not know.
        recorded = self.prediction_tracker.update_result(
            game_id=gid,
            actual_winner=winner,
            home_score=home_pts,
            away_score=away_pts
        )
        if not recorded:
            continue

        stored = self.prediction_tracker.get_prediction(gid)
        if not stored:
            continue

        stored["actual_winner"] = winner
        stored["home_score"] = home_pts
        stored["away_score"] = away_pts
        resolved.append(stored)
    return resolved
def get_accuracy_stats(self) -> Dict:
    """Return the accuracy statistics computed by the prediction tracker."""
    stats = self.prediction_tracker.get_accuracy_stats()
    return stats
def get_recent_predictions(self, n: int = 20) -> List[Dict]:
    """Return the tracker's ``n`` most recent predictions (default 20)."""
    tracker = self.prediction_tracker
    return tracker.get_recent_predictions(n)
def get_pending_predictions(self) -> List[Dict]:
    """Return the predictions the tracker still lists as pending."""
    pending = self.prediction_tracker.get_pending_predictions()
    return pending
def get_games_with_predictions(self) -> List[Dict]:
    """
    Return today's games, each enriched with a prediction and its outcome.

    Every returned dict is a copy of the scoreboard entry plus:
    - ``prediction``: the result of ``predict_game`` for the matchup
    - ``actual_winner``: the winning abbreviation once the status is
      ``"FINAL"`` (home team only when its score is strictly higher),
      otherwise ``None``
    - ``prediction_correct``: ``True``/``False`` for finished games with
      a usable prediction, otherwise ``None``
    """
    enriched = []
    for game in self.get_todays_games():
        game_data = dict(game)  # Copy so the scoreboard entry is not mutated
        pred = self.predict_game(game["home_team"], game["away_team"])
        game_data["prediction"] = pred

        actual_winner = None
        prediction_correct = None
        if game["status"] == "FINAL":
            if game["home_score"] > game["away_score"]:
                actual_winner = game["home_team"]
            else:
                actual_winner = game["away_team"]
            # ``predict_game`` may return {"error": ...} without a winner;
            # indexing it directly used to raise KeyError.
            predicted = pred.get("predicted_winner")
            if predicted is not None:
                prediction_correct = predicted == actual_winner

        game_data["actual_winner"] = actual_winner
        game_data["prediction_correct"] = prediction_correct
        enriched.append(game_data)
    return enriched
def get_mvp_race(self, player_df: Optional[pd.DataFrame] = None) -> pd.DataFrame:
    """
    Return the top-10 MVP candidates for the 2025-26 season.

    Per-game player stats and league standings are fetched from the NBA
    stats API. Players averaging at least 25 minutes over at least 15 games
    are scored with a fixed weighted sum of box-score stats, plus/minus,
    field-goal percentage and team win percentage.

    Args:
        player_df: Accepted for backward compatibility; not used, because
            stats are always fetched from the API.

    Returns:
        DataFrame with the columns ``PLAYER_NAME``, ``PTS``, ``REB``,
        ``AST``, ``mvp_score`` and ``mvp_similarity``, sorted by
        ``mvp_score`` descending. If the fetch fails or the API returns no
        player rows, a hardcoded fallback table with the same columns is
        returned.
    """
    # Imported once here so the except branch can use it even when the
    # nba_api import inside the try block fails.
    import time

    max_retries = 1  # Fail fast and use fallback
    for attempt in range(max_retries):
        try:
            from nba_api.stats.endpoints import leaguedashplayerstats, leaguestandings

            # Short pause before each request to the stats API.
            time.sleep(0.5)
            stats = leaguedashplayerstats.LeagueDashPlayerStats(
                season='2025-26',
                per_mode_detailed='PerGame',
                timeout=30  # 30 second timeout
            )
            df = stats.get_data_frames()[0]
            if df.empty:
                # Same handling as get_championship_odds: an empty response
                # counts as a failed attempt and leads to the fallback.
                logger.warning("NBA API returned empty player stats data")
                continue

            # Get team standings for team win percentage
            time.sleep(1.0)
            standings = leaguestandings.LeagueStandings(
                season='2025-26',
                timeout=60
            )
            standings_df = standings.get_data_frames()[0]

            # Map team win% to players by TEAM_ID
            team_win_pct = {}
            for _, row in standings_df.iterrows():
                team_id = row.get('TeamID', 0)
                wins = row.get('WINS', 0)
                losses = row.get('LOSSES', 0)
                total = wins + losses
                if total > 0:
                    team_win_pct[team_id] = wins / total

            # Teams missing from the standings get a neutral 0.5.
            df['TEAM_WIN_PCT'] = df['TEAM_ID'].map(team_win_pct).fillna(0.5)

            # Filter to players with significant minutes (starters/key players)
            df = df[
                (df['MIN'] >= 25) &
                (df['GP'] >= 15)
            ].copy()

            # Calculate MVP score directly (no model dependency)
            df['mvp_score'] = (
                df['PTS'].fillna(0) * 1.0 +                            # Points
                df['AST'].fillna(0) * 2.0 +                            # Assists (playmaking)
                df['REB'].fillna(0) * 1.0 +                            # Rebounds
                (df['STL'].fillna(0) + df['BLK'].fillna(0)) * 1.5 +    # Defense
                df['PLUS_MINUS'].fillna(0) * 0.3 +                     # Impact
                df['FG_PCT'].fillna(0.45) * 20 +                       # Efficiency
                df['TEAM_WIN_PCT'].fillna(0.5) * 30                    # Team success
            )

            # Simplified similarity score based on the stats profile.
            df['mvp_similarity'] = (
                (df['PTS'] / 30.0).clip(0, 1) * 0.4 +   # Elite scorer
                (df['REB'] / 12.0).clip(0, 1) * 0.2 +   # Elite rebounder
                (df['AST'] / 10.0).clip(0, 1) * 0.2 +   # Elite playmaker
                df['TEAM_WIN_PCT'] * 0.2                # Winning team
            ).fillna(0)

            df = df.sort_values('mvp_score', ascending=False)
            logger.info(f"Successfully fetched MVP data on attempt {attempt + 1}")
            # Return top 10 MVP candidates
            return df.head(10)[['PLAYER_NAME', 'PTS', 'REB', 'AST', 'mvp_score', 'mvp_similarity']]
        except Exception as e:
            logger.warning(f"MVP data fetch attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)  # Exponential backoff
            continue

    logger.error("All MVP data fetch attempts failed, returning fallback data")
    # Hardcoded fallback table of MVP candidates (not live data).
    return pd.DataFrame({
        'PLAYER_NAME': [
            'Nikola Jokić', 'Shai Gilgeous-Alexander', 'Luka Dončić',
            'Giannis Antetokounmpo', 'Jayson Tatum', 'Anthony Davis',
            'Victor Wembanyama', 'LeBron James', 'Kevin Durant', 'Tyrese Maxey'
        ],
        'PTS': [29.6, 31.8, 33.6, 28.8, 27.2, 26.5, 24.5, 23.8, 27.1, 30.3],
        'REB': [12.2, 4.4, 7.7, 9.5, 8.1, 11.8, 10.9, 7.2, 6.4, 4.4],
        'AST': [11.0, 6.2, 8.7, 5.5, 5.4, 3.2, 3.0, 8.4, 4.2, 6.7],
        'mvp_score': [102.8, 90.6, 89.5, 78.7, 77.4, 76.2, 80.1, 75.8, 74.3, 79.1],
        'mvp_similarity': [0.933, 0.760, 0.822, 0.735, 0.720, 0.705, 0.706, 0.698, 0.685, 0.717]
    })
def get_championship_odds(self, team_df: Optional[pd.DataFrame] = None) -> pd.DataFrame:
    """
    Return championship contenders ranked by the championship model.

    Args:
        team_df: Optional pre-built team table, passed to the model as-is.
            When omitted, 2025-26 standings are fetched from the NBA stats
            API and turned into a table with ``TEAM_ABBREVIATION``,
            ``W_PCT``, ``NET_RATING`` and ``ELO`` columns. If that fetch
            fails or returns nothing, a hardcoded fallback table (without
            ``ELO``) is used.

    Returns:
        Whatever ``self.champ_model.get_top_contenders(team_df)`` returns.
    """
    if team_df is None:
        def _abbrev_from_city(city):
            # Last-resort fuzzy match between city text and abbreviation;
            # only used for rows the team-id lookup could not resolve.
            city_text = str(city).lower()
            for abbr in NBA_TEAMS.values():
                if city_text in abbr.lower() or abbr.lower() in city_text:
                    return abbr
            return 'UNK'

        # Fetch real current season standings from NBA API
        max_retries = 1  # Fail fast and use fallback
        for attempt in range(max_retries):
            try:
                from nba_api.stats.endpoints import leaguestandings
                import time

                time.sleep(0.5)
                standings = leaguestandings.LeagueStandings(
                    season='2025-26',
                    timeout=30
                )
                df = standings.get_data_frames()[0]
                if df.empty:
                    logger.warning("NBA API returned empty standings data")
                    continue
                logger.info(f"Got standings for {len(df)} teams from NBA API")

                # Resolve abbreviations. An explicit column wins; otherwise
                # map by team id, because matching city names against
                # three-letter codes yields 'UNK' for many teams
                # (e.g. "Los Angeles", "Golden State", "New York").
                if 'TeamAbbreviation' in df.columns:
                    abbrevs = df['TeamAbbreviation']
                else:
                    if 'TeamID' in df.columns:
                        abbrevs = df['TeamID'].map(NBA_TEAMS)
                    else:
                        abbrevs = pd.Series(None, index=df.index, dtype=object)
                    if abbrevs.isna().any():
                        if 'TeamCity' in df.columns:
                            abbrevs = abbrevs.fillna(df['TeamCity'].apply(_abbrev_from_city))
                        abbrevs = abbrevs.fillna('UNK')

                # Build team DataFrame with required columns
                team_df = pd.DataFrame({
                    'TEAM_ABBREVIATION': abbrevs,
                    'W_PCT': df['WinPCT'].fillna(0.5),
                    'NET_RATING': df['NetRating'].fillna(0) if 'NetRating' in df.columns else 0,
                })

                # Add ELO ratings from our feature generator
                elo_ratings = {
                    abbrev: self.feature_gen.elo.get_rating(team_id)
                    for team_id, abbrev in NBA_TEAMS.items()
                }
                team_df['ELO'] = team_df['TEAM_ABBREVIATION'].map(elo_ratings).fillna(1500)
                logger.info(f"Successfully built championship data for {len(team_df)} teams")
                break
            except Exception as e:
                logger.warning(f"Championship standings fetch attempt {attempt + 1} failed: {e}")
                continue
        else:
            # Loop finished without ``break``: every attempt failed.
            logger.warning("Using fallback championship odds data")
            team_df = pd.DataFrame({
                "TEAM_ABBREVIATION": ["OKC", "CLE", "BOS", "DEN", "MEM", "HOU", "NYK", "GSW",
                                      "MIN", "LAL", "MIL", "PHX", "DAL", "MIA", "SAC", "IND"],
                "W_PCT": [0.74, 0.70, 0.66, 0.62, 0.60, 0.58, 0.56, 0.54,
                          0.52, 0.50, 0.48, 0.46, 0.44, 0.42, 0.40, 0.38],
                "NET_RATING": [10.5, 8.2, 7.5, 6.0, 5.5, 4.5, 4.0, 3.5,
                               3.0, 2.5, 2.0, 1.5, 1.0, 0.5, 0.0, -0.5]
            })
    return self.champ_model.get_top_contenders(team_df)
# =============================================================================
# CLI INTERFACE
# =============================================================================
if __name__ == "__main__":
    # Command-line entry point: run a sample prediction, predict today's
    # games, or predict one explicit HOME/AWAY matchup.
    import argparse

    parser = argparse.ArgumentParser(description="NBA Prediction Pipeline")
    parser.add_argument("--test", action="store_true", help="Run test prediction")
    parser.add_argument("--today", action="store_true", help="Predict today's games")
    parser.add_argument("--game", nargs=2, help="Predict single game: HOME AWAY")
    args = parser.parse_args()

    if not (args.test or args.today or args.game):
        # Nothing to do: print usage without building the pipeline, whose
        # constructor replays the cached game history.
        print("Use --test, --today, or --game HOME AWAY")
    else:
        pipeline = PredictionPipeline()
        if args.test:
            print("Testing prediction pipeline...")
            result = pipeline.predict_game("LAL", "BOS")
            for k, v in result.items():
                print(f" {k}: {v}")
        elif args.today:
            print("Today's game predictions:")
            for pred in pipeline.predict_todays_games():
                if "error" in pred:
                    # Error results carry no team/probability keys; report
                    # them instead of failing with KeyError.
                    print(f"\nGame {pred.get('game_id', '')}: {pred['error']}")
                    continue
                print(f"\n{pred['away_team']} @ {pred['home_team']}")
                print(f" Predicted winner: {pred['predicted_winner']}")
                print(f" Win probability: {pred['home_win_probability']:.1%}")
        else:
            home, away = (code.upper() for code in args.game)
            result = pipeline.predict_game(home, away)
            print(f"\n{away} @ {home}")
            for k, v in result.items():
                print(f" {k}: {v}")