File size: 6,548 Bytes
c095e08
 
 
 
 
 
 
 
 
 
 
 
 
fd577ad
c095e08
fd577ad
 
c095e08
 
 
fd577ad
c095e08
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f475b4e
 
c095e08
 
f475b4e
 
 
 
 
 
c095e08
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
"""
NBA ML Prediction System - Configuration
=========================================
Central configuration for data collection, model training, and predictions.
"""

from pathlib import Path
from dataclasses import dataclass, field
from typing import List

# =============================================================================
# PATHS
# =============================================================================
import os
# Parent of the directory that contains this file.
PROJECT_ROOT = Path(__file__).parent.parent

# NBA_ML_DATA_DIR, when set, overrides the default "<PROJECT_ROOT>/data"
# location (used for HF Spaces persistent storage).
DATA_DIR = Path(os.getenv("NBA_ML_DATA_DIR", str(PROJECT_ROOT.joinpath("data"))))
RAW_DATA_DIR = DATA_DIR.joinpath("raw")
PROCESSED_DATA_DIR = DATA_DIR.joinpath("processed")
API_CACHE_DIR = DATA_DIR.joinpath("api_data")

# NBA_ML_MODELS_DIR, when set, overrides the default "<PROJECT_ROOT>/models".
MODELS_DIR = Path(os.getenv("NBA_ML_MODELS_DIR", str(PROJECT_ROOT.joinpath("models"))))

# Import-time side effect: make sure every working directory exists.
# Missing parents are created and already-existing directories are accepted.
for dir_path in (RAW_DATA_DIR, PROCESSED_DATA_DIR, API_CACHE_DIR, MODELS_DIR):
    dir_path.mkdir(parents=True, exist_ok=True)

# =============================================================================
# SEASONS
# =============================================================================
# Extended dataset: 23 years (2003-2026) for comprehensive training
INITIAL_SEASON_START = 2003  # Start year of the first season ("2003-04")
INITIAL_SEASON_END = 2026  # Exclusive bound; last generated season is the current 2025-26
FULL_SEASON_START = 2003  # Full dataset starts from 2003


def get_season_strings(
    start_year: int = INITIAL_SEASON_START,
    end_year: int = INITIAL_SEASON_END,
) -> List[str]:
    """Build season labels such as '2003-04', '2004-05', ...

    Args:
        start_year: Calendar year in which the first season begins.
        end_year: Exclusive upper bound; the last label produced is the
            season that begins in ``end_year - 1``.

    Returns:
        One label per start year in ``range(start_year, end_year)``, in
        ascending order. Empty when ``start_year >= end_year``.
    """
    seasons: List[str] = []
    for first_year in range(start_year, end_year):
        # The second half of the label is the last two digits of the next year.
        closing_suffix = str(first_year + 1)[-2:]
        seasons.append(f"{first_year}-{closing_suffix}")
    return seasons


# Labels for the default range, computed once at import time.
SEASON_STRINGS = get_season_strings()

# =============================================================================
# CHROMADB CONFIGURATION
# =============================================================================
import os

@dataclass
class ChromaDBConfig:
    """Connection settings for ChromaDB prediction tracking.

    ``tenant``, ``database`` and ``api_key`` are taken from the
    ``CHROMADB_TENANT``, ``CHROMADB_DATABASE`` and ``CHROMADB_API_KEY``
    environment variables. The lookup happens every time an instance is
    created (not once when the class is defined), so variables set after
    this module is imported are still honoured by new instances. When a
    variable is unset the built-in fallback is used.

    Attributes:
        tenant: ChromaDB tenant identifier.
        database: ChromaDB database name.
        api_key: ChromaDB API key. Excluded from ``repr()`` so the secret
            does not end up in logs or tracebacks.
        collection_name: Name of the collection holding predictions.
    """
    tenant: str = field(
        default_factory=lambda: os.environ.get(
            "CHROMADB_TENANT", "70e82e68-9fa7-4224-9975-d49d355f6328"
        )
    )
    database: str = field(
        default_factory=lambda: os.environ.get("CHROMADB_DATABASE", "NBA_ML")
    )
    # SECURITY(review): a credential is committed here as the fallback value.
    # It is kept only so deployments relying on the fallback keep working;
    # rotate this key and provide CHROMADB_API_KEY through the environment.
    api_key: str = field(
        default_factory=lambda: os.environ.get(
            "CHROMADB_API_KEY", "ck-2bXunZK4X3BFSPHtwLG2Ki9xr5r6ZPxzADESDperHweT"
        ),
        repr=False,
    )
    collection_name: str = "predictions"


# Shared module-level instance, built from the environment at import time.
CHROMADB_CONFIG = ChromaDBConfig()

# =============================================================================
# LIVE DATA CONFIGURATION
# =============================================================================
# Interval between live score refreshes, expressed in seconds.
LIVE_REFRESH_INTERVAL = 15  # Seconds between live score refreshes

# =============================================================================
# API CONFIGURATION
# =============================================================================
@dataclass
class APIConfig:
    """Tunable settings for NBA API requests: pacing, retries, backoff, timeout.

    This class only declares the values; the request code that consumes them
    is not visible in this section of the file.

    Attributes:
        base_delay: Base delay between requests, in seconds.
        max_retries: Maximum number of retry attempts.
        initial_backoff: Initial backoff, in seconds.
        max_backoff: Maximum backoff, in seconds.
        backoff_multiplier: Exponential backoff multiplier.
        timeout: Request timeout, in seconds.
    """
    base_delay: float = 0.6  # Base delay between requests (seconds)
    max_retries: int = 3  # Maximum retry attempts
    # NOTE(review): presumably the wait grows from initial_backoff by
    # backoff_multiplier per retry and is capped at max_backoff - confirm
    # against the code that performs the requests.
    initial_backoff: float = 2.0  # Initial backoff in seconds
    max_backoff: float = 60.0  # Maximum backoff in seconds
    backoff_multiplier: float = 2.0  # Exponential backoff multiplier
    timeout: int = 30  # Request timeout in seconds

# Shared module-level instance built from the defaults above.
API_CONFIG = APIConfig()

# =============================================================================
# ELO CONFIGURATION
# =============================================================================
@dataclass
class ELOConfig:
    """Parameters for ELO rating calculations.

    This class only declares the values; the rating update code that reads
    them is not visible in this section of the file.

    Attributes:
        initial_rating: Starting rating value.
        k_factor: How much ratings change per game.
        home_advantage: ELO points added for the home team.
        season_regression: Regression-to-mean factor applied at season start.
    """
    initial_rating: float = 1500.0
    k_factor: float = 20.0  # How much ratings change per game
    home_advantage: float = 100.0  # ELO points added for home team
    # NOTE(review): the inline comment below is ambiguous about whether 0.85 is
    # the share of the previous rating that is kept or the share pulled toward
    # the mean - confirm against the code that applies it.
    season_regression: float = 0.85  # Regress to mean at season start (85% = only 15% carryover)

# Shared module-level instance built from the defaults above.
ELO_CONFIG = ELOConfig()

# =============================================================================
# FEATURE CONFIGURATION
# =============================================================================
@dataclass
class FeatureConfig:
    """Settings for feature engineering.

    Attributes:
        rolling_windows: Rolling window sizes; defaults to ``[5, 10, 20]``.
            Presumably measured in games - confirm against the feature code.
        min_games_for_features: Minimum games before generating features.
    """
    # default_factory gives every instance its own list instead of a shared one.
    rolling_windows: List[int] = field(default_factory=lambda: [5, 10, 20])
    min_games_for_features: int = 5  # Minimum games before generating features

# Shared module-level instance built from the defaults above.
FEATURE_CONFIG = FeatureConfig()

# =============================================================================
# MODEL CONFIGURATION
# =============================================================================
@dataclass
class ModelConfig:
    """Settings for model training.

    Attributes:
        test_seasons: Season labels used as the test split.
        val_seasons: Season labels used as the validation split.
        random_state: Seed value stored on the config.
        xgb_params: Default XGBoost keyword parameters.
        lgb_params: Default LightGBM keyword parameters.

    The list and dict defaults are produced by factories, so every instance
    receives its own fresh copies.
    """
    test_seasons: List[str] = field(default_factory=lambda: ["2024-25"])
    val_seasons: List[str] = field(default_factory=lambda: ["2023-24"])
    random_state: int = 42

    # XGBoost defaults
    xgb_params: dict = field(
        default_factory=lambda: dict(
            n_estimators=500,
            max_depth=6,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
        )
    )

    # LightGBM defaults (same values as XGBoost plus a "verbose" entry)
    lgb_params: dict = field(
        default_factory=lambda: dict(
            n_estimators=500,
            max_depth=6,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            verbose=-1,
        )
    )


# Shared module-level instance built from the defaults above.
MODEL_CONFIG = ModelConfig()

# =============================================================================
# TEAM MAPPINGS
# =============================================================================
# NBA Team IDs (for reference)
# Maps each numeric NBA team ID to its three-letter abbreviation.
# Entries are listed in ascending ID order, one per line.
NBA_TEAMS = {
    1610612737: "ATL",
    1610612738: "BOS",
    1610612739: "CLE",
    1610612740: "NOP",
    1610612741: "CHI",
    1610612742: "DAL",
    1610612743: "DEN",
    1610612744: "GSW",
    1610612745: "HOU",
    1610612746: "LAC",
    1610612747: "LAL",
    1610612748: "MIA",
    1610612749: "MIL",
    1610612750: "MIN",
    1610612751: "BKN",
    1610612752: "NYK",
    1610612753: "ORL",
    1610612754: "IND",
    1610612755: "PHI",
    1610612756: "PHX",
    1610612757: "POR",
    1610612758: "SAC",
    1610612759: "SAS",
    1610612760: "OKC",
    1610612761: "TOR",
    1610612762: "UTA",
    1610612763: "MEM",
    1610612764: "WAS",
    1610612765: "DET",
    1610612766: "CHA",
}

# =============================================================================
# INJURY STATUS WEIGHTS
# =============================================================================
# Maps an injury-report status string to a weight between 0.0 and 1.0.
# Entries are listed from the largest weight ("Out") to the smallest
# ("Available"). How the weight is applied is decided by the consuming code,
# which is not visible in this section of the file.
INJURY_IMPACT = {
    "Out": 1.0,  # Full impact - player not available
    "Doubtful": 0.8,
    "Questionable": 0.5,
    "Probable": 0.2,
    "Available": 0.0  # Zero weight
}