LBJLincoln Claude Opus 4.6 commited on
Commit
2ca3b0b
·
1 Parent(s): 7799bab

feat: MOVDA raw delta features (Cat37 6→13) + beta calibration + browser scraper

Browse files

- engine.py: add 7 raw delta_MOV rolling features (no EWM smoothing)
- app.py: beta calibration method (betacal) added to evolution search space
- browser_scraper.py: Crawl4AI + requests fallback for odds scraping
- scrape_odds.py: structured odds scraper
- requirements.txt: betacal, html2text, beautifulsoup4

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (4) hide show
  1. Dockerfile.browser +64 -0
  2. app.py +24 -5
  3. features/engine.py +31 -3
  4. requirements.txt +5 -0
Dockerfile.browser ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dockerfile.browser -- Browser-enabled HF Space image for web scraping
2
+ # ======================================================================
3
+ #
4
+ # NOT ACTIVE YET -- this is a template for when we need browser-based
5
+ # scraping on HF Spaces (e.g., scraping odds pages with JS rendering).
6
+ #
7
+ # Current HF Spaces use the default Python runtime without browser deps.
8
+ # To activate: rename to Dockerfile and push to the target Space.
9
+ #
10
+ # Requirements:
11
+ # - HF Space must be configured as "Docker" SDK (not Gradio SDK)
12
+ # - The Space will be larger (~2GB) due to Chromium
13
+ # - CPU-only is fine for scraping (no GPU needed)
14
+ #
15
+ # Size estimate: ~2.5GB image (Playwright + Chromium + Python deps)
16
+
17
+ FROM python:3.11-slim-bookworm
18
+
19
+ # Install system deps for Playwright/Chromium
20
+ RUN apt-get update && apt-get install -y --no-install-recommends \
21
+ wget \
22
+ ca-certificates \
23
+ fonts-liberation \
24
+ libasound2 \
25
+ libatk-bridge2.0-0 \
26
+ libatk1.0-0 \
27
+ libcups2 \
28
+ libdbus-1-3 \
29
+ libdrm2 \
30
+ libgbm1 \
31
+ libgtk-3-0 \
32
+ libnspr4 \
33
+ libnss3 \
34
+ libx11-xcb1 \
35
+ libxcomposite1 \
36
+ libxdamage1 \
37
+ libxrandr2 \
38
+ xdg-utils \
39
+ && rm -rf /var/lib/apt/lists/*
40
+
41
+ # Create non-root user (HF Spaces requirement)
42
+ RUN useradd -m -u 1000 user
43
+ WORKDIR /home/user/app
44
+
45
+ # Install Python deps
46
+ COPY requirements.txt .
47
+ RUN pip install --no-cache-dir -r requirements.txt
48
+
49
+ # Install Playwright and Chromium browser
50
+ RUN pip install --no-cache-dir crawl4ai>=0.4 playwright
51
+ RUN playwright install chromium
52
+ RUN playwright install-deps chromium
53
+
54
+ # Copy application code
55
+ COPY . .
56
+
57
+ # Fix permissions
58
+ RUN chown -R user:user /home/user/app
59
+
60
+ USER user
61
+
62
+ EXPOSE 7860
63
+
64
+ CMD ["python", "app.py"]
app.py CHANGED
@@ -634,7 +634,7 @@ class Individual:
634
  "reg_alpha": 10 ** random.uniform(-6, 1),
635
  "reg_lambda": 10 ** random.uniform(-6, 1),
636
  "model_type": model_type or random.choice(CPU_MODEL_TYPES if not _HAS_GPU else ALL_MODEL_TYPES),
637
- "calibration": random.choices(["none", "sigmoid", "venn_abers"], weights=[30, 20, 50], k=1)[0],
638
  # Neural net hyperparams (used only for NN model types)
639
  "nn_hidden_dims": random.choice([64, 128, 256]),
640
  "nn_n_layers": random.randint(2, 4),
@@ -708,7 +708,7 @@ class Individual:
708
  if random.random() < 0.15: self.hyperparams["max_depth"] = max(2, min(8, self.hyperparams["max_depth"] + random.randint(-2, 2)))
709
  if random.random() < 0.15: self.hyperparams["learning_rate"] = max(0.001, min(0.3, self.hyperparams["learning_rate"] * 10 ** random.uniform(-0.3, 0.3)))
710
  if random.random() < 0.08: self.hyperparams["model_type"] = random.choice(CPU_MODEL_TYPES if not _HAS_GPU else ALL_MODEL_TYPES)
711
- if random.random() < 0.05: self.hyperparams["calibration"] = random.choices(["none", "sigmoid", "venn_abers"], weights=[60, 20, 20], k=1)[0]
712
  # Neural net hyperparams mutation
713
  if random.random() < 0.10: self.hyperparams["nn_hidden_dims"] = random.choice([64, 128, 256, 512])
714
  if random.random() < 0.10: self.hyperparams["nn_n_layers"] = max(1, min(6, self.hyperparams.get("nn_n_layers", 2) + random.randint(-1, 1)))
@@ -1249,10 +1249,12 @@ def evaluate(ind, X, y, n_splits=2, fast=True, eval_counter=[0]):
1249
  # Purge last PURGE_GAP games from training to avoid temporal leakage
1250
  ti_safe = ti[:-PURGE_GAP] if len(ti) > PURGE_GAP + 50 else ti
1251
  m = clone(model)
1252
- # Calibration: none (default, best empirically), sigmoid (Platt), or venn_abers (MAPIE)
1253
  cal_method = hp_eval.get("calibration", "none")
1254
  if cal_method == "isotonic":
1255
  cal_method = "none" # Isotonic empirically hurts Brier (+0.003 to +0.007)
 
 
1256
  if cal_method == "venn_abers":
1257
  try:
1258
  from mapie.classification import MapieClassifier
@@ -1261,13 +1263,30 @@ def evaluate(ind, X, y, n_splits=2, fast=True, eval_counter=[0]):
1261
  mapie = MapieClassifier(m_inner, method="lac", cv="prefit")
1262
  mapie.fit(X_sub[ti_safe[-200:]], y_eval[ti_safe[-200:]])
1263
  m = mapie # MapieClassifier wraps fitted model
1264
- cal_method = "none" # Skip CalibratedClassifierCV below
 
1265
  except (ImportError, Exception):
1266
  cal_method = "none" # Fallback if MAPIE not installed
 
 
 
 
 
 
 
 
 
 
 
 
 
1267
  if cal_method == "sigmoid":
1268
  m = CalibratedClassifierCV(m, method=cal_method, cv=3)
1269
- m.fit(X_sub[ti_safe], y_eval[ti_safe])
 
1270
  p = m.predict_proba(X_sub[vi])[:, 1]
 
 
1271
  briers.append(brier_score_loss(y_eval[vi], p))
1272
  rois.append(_log_loss_score(p, y_eval[vi]))
1273
  all_p.extend(p); all_y.extend(y_eval[vi])
 
634
  "reg_alpha": 10 ** random.uniform(-6, 1),
635
  "reg_lambda": 10 ** random.uniform(-6, 1),
636
  "model_type": model_type or random.choice(CPU_MODEL_TYPES if not _HAS_GPU else ALL_MODEL_TYPES),
637
+ "calibration": random.choices(["none", "sigmoid", "venn_abers", "beta"], weights=[25, 15, 30, 30], k=1)[0],
638
  # Neural net hyperparams (used only for NN model types)
639
  "nn_hidden_dims": random.choice([64, 128, 256]),
640
  "nn_n_layers": random.randint(2, 4),
 
708
  if random.random() < 0.15: self.hyperparams["max_depth"] = max(2, min(8, self.hyperparams["max_depth"] + random.randint(-2, 2)))
709
  if random.random() < 0.15: self.hyperparams["learning_rate"] = max(0.001, min(0.3, self.hyperparams["learning_rate"] * 10 ** random.uniform(-0.3, 0.3)))
710
  if random.random() < 0.08: self.hyperparams["model_type"] = random.choice(CPU_MODEL_TYPES if not _HAS_GPU else ALL_MODEL_TYPES)
711
+ if random.random() < 0.05: self.hyperparams["calibration"] = random.choices(["none", "sigmoid", "venn_abers", "beta"], weights=[50, 15, 15, 20], k=1)[0]
712
  # Neural net hyperparams mutation
713
  if random.random() < 0.10: self.hyperparams["nn_hidden_dims"] = random.choice([64, 128, 256, 512])
714
  if random.random() < 0.10: self.hyperparams["nn_n_layers"] = max(1, min(6, self.hyperparams.get("nn_n_layers", 2) + random.randint(-1, 1)))
 
1249
  # Purge last PURGE_GAP games from training to avoid temporal leakage
1250
  ti_safe = ti[:-PURGE_GAP] if len(ti) > PURGE_GAP + 50 else ti
1251
  m = clone(model)
1252
+ # Calibration: none (default), sigmoid (Platt), venn_abers (MAPIE), or beta (BetaCalibration)
1253
  cal_method = hp_eval.get("calibration", "none")
1254
  if cal_method == "isotonic":
1255
  cal_method = "none" # Isotonic empirically hurts Brier (+0.003 to +0.007)
1256
+ _beta_cal = None # beta calibrator applied post-predict
1257
+ _model_fitted = False # tracks whether m.fit() was already called
1258
  if cal_method == "venn_abers":
1259
  try:
1260
  from mapie.classification import MapieClassifier
 
1263
  mapie = MapieClassifier(m_inner, method="lac", cv="prefit")
1264
  mapie.fit(X_sub[ti_safe[-200:]], y_eval[ti_safe[-200:]])
1265
  m = mapie # MapieClassifier wraps fitted model
1266
+ _model_fitted = True
1267
+ cal_method = "none"
1268
  except (ImportError, Exception):
1269
  cal_method = "none" # Fallback if MAPIE not installed
1270
+ if cal_method == "beta":
1271
+ try:
1272
+ from betacal import BetaCalibration
1273
+ # Fit base model, then fit beta calibrator on a held-out slice
1274
+ m.fit(X_sub[ti_safe], y_eval[ti_safe])
1275
+ _model_fitted = True
1276
+ cal_slice = ti_safe[-200:] if len(ti_safe) > 200 else ti_safe
1277
+ raw_p = m.predict_proba(X_sub[cal_slice])[:, 1]
1278
+ _beta_cal = BetaCalibration(parameters="abm")
1279
+ _beta_cal.fit(raw_p.reshape(-1, 1), y_eval[cal_slice])
1280
+ cal_method = "none"
1281
+ except (ImportError, Exception):
1282
+ cal_method = "none" # Fallback if betacal not installed
1283
  if cal_method == "sigmoid":
1284
  m = CalibratedClassifierCV(m, method=cal_method, cv=3)
1285
+ if not _model_fitted:
1286
+ m.fit(X_sub[ti_safe], y_eval[ti_safe])
1287
  p = m.predict_proba(X_sub[vi])[:, 1]
1288
+ if _beta_cal is not None:
1289
+ p = _beta_cal.predict(p.reshape(-1, 1))
1290
  briers.append(brier_score_loss(y_eval[vi], p))
1291
  rois.append(_log_loss_score(p, y_eval[vi]))
1292
  all_p.extend(p); all_y.extend(y_eval[vi])
features/engine.py CHANGED
@@ -2223,7 +2223,7 @@ class NBAFeatureEngine:
2223
  "dense_sched_x_margin", # schedule_density × margin_diff
2224
  ])
2225
 
2226
- # 37. MOVDA ELO FEATURES (6 features) — arXiv:2506.00348
2227
  # Margin-of-Victory Differential Analysis: R' = R + K*(S-E) + λ*(MOV-E_MOV)
2228
  for prefix in ["h", "a"]:
2229
  names.append(f"{prefix}_movda_rating") # MOVDA Elo rating (normalized)
@@ -2232,6 +2232,12 @@ class NBAFeatureEngine:
2232
  "movda_diff", # MOVDA rating differential
2233
  "movda_win_prob", # MOVDA-derived win probability
2234
  ])
 
 
 
 
 
 
2235
 
2236
  # 38. VENUE-CONDITIONAL MATCHUP FEATURES (14 features)
2237
  # Home team's home-only stats vs away team's road-only stats
@@ -2281,6 +2287,7 @@ class NBAFeatureEngine:
2281
  # ── Category 37: MOVDA ELO state trackers ──
2282
  team_movda = defaultdict(lambda: 1500.0) # MOVDA Elo rating
2283
  mov_surprise_ewm = defaultdict(float) # Per-team EWMA of MOV surprise
 
2284
  _MOVDA_K = 20.0; _MOVDA_C = 400.0; _MOVDA_LAMBDA = 0.3
2285
  _MOVDA_ALPHA = 19.2511; _MOVDA_BETA = 0.002342
2286
  _MOVDA_GAMMA = 648.0334; _MOVDA_DELTA = -645.8717
@@ -2339,6 +2346,7 @@ class NBAFeatureEngine:
2339
  team_home_margin_sum, team_home_games_count)
2340
  # Update MOVDA ELO (Cat 37)
2341
  self._update_movda(home, away, hs, as_, team_movda, mov_surprise_ewm,
 
2342
  _MOVDA_K, _MOVDA_C, _MOVDA_LAMBDA, _MOVDA_ALPHA,
2343
  _MOVDA_BETA, _MOVDA_GAMMA, _MOVDA_DELTA, _MOVDA_EWM_ALPHA)
2344
  continue
@@ -5049,7 +5057,7 @@ class NBAFeatureEngine:
5049
  (self._games_in_window(hr_, gd, 7) - self._games_in_window(ar_, gd, 7)) * (_h_margin5 - _a_margin5),
5050
  ])
5051
 
5052
- # ── 37. MOVDA ELO FEATURES (6 features) ──
5053
  _movda_dr = team_movda[home] - team_movda[away]
5054
  _movda_wp = 1.0 / (1.0 + 10.0 ** (-_movda_dr / _MOVDA_C))
5055
  for _mt, _mk in [(home, home), (away, away)]:
@@ -5057,6 +5065,21 @@ class NBAFeatureEngine:
5057
  row.append(mov_surprise_ewm[_mk] / 20.0) # mov_surprise_ewm (normalized)
5058
  row.append(_movda_dr / 400.0) # movda_diff
5059
  row.append(_movda_wp) # movda_win_prob
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5060
 
5061
  # ── 38. VENUE-CONDITIONAL MATCHUP FEATURES (14 features) ──
5062
  # Use true venue-specific records: home team at home vs away team on road
@@ -5104,6 +5127,7 @@ class NBAFeatureEngine:
5104
  team_home_margin_sum, team_home_games_count)
5105
  # Update MOVDA ELO (Cat 37)
5106
  self._update_movda(home, away, hs, as_, team_movda, mov_surprise_ewm,
 
5107
  _MOVDA_K, _MOVDA_C, _MOVDA_LAMBDA, _MOVDA_ALPHA,
5108
  _MOVDA_BETA, _MOVDA_GAMMA, _MOVDA_DELTA, _MOVDA_EWM_ALPHA)
5109
 
@@ -5659,8 +5683,9 @@ class NBAFeatureEngine:
5659
  team_home_games_count[home] += 1
5660
 
5661
  def _update_movda(self, home, away, hs, as_, team_movda, mov_surprise_ewm,
 
5662
  K, C, lam, alpha, beta, gamma, delta_param, ewm_alpha):
5663
- """Update MOVDA Elo ratings (Cat 37). arXiv:2506.00348."""
5664
  margin = hs - as_
5665
  result = 1.0 if margin > 0 else (0.0 if margin < 0 else 0.5)
5666
  delta_r = team_movda[home] - team_movda[away]
@@ -5672,6 +5697,9 @@ class NBAFeatureEngine:
5672
  team_movda[away] -= movda_update
5673
  mov_surprise_ewm[home] = ewm_alpha * delta_mov + (1 - ewm_alpha) * mov_surprise_ewm[home]
5674
  mov_surprise_ewm[away] = ewm_alpha * (-delta_mov) + (1 - ewm_alpha) * mov_surprise_ewm[away]
 
 
 
5675
 
5676
  def _parse_stats(self, stats, pts, opp_pts, is_home=True):
5677
  """Extract stats from game data. Uses REAL box score when available, estimates otherwise."""
 
2223
  "dense_sched_x_margin", # schedule_density × margin_diff
2224
  ])
2225
 
2226
+ # 37. MOVDA ELO FEATURES (13 features) — arXiv:2506.00348
2227
  # Margin-of-Victory Differential Analysis: R' = R + K*(S-E) + λ*(MOV-E_MOV)
2228
  for prefix in ["h", "a"]:
2229
  names.append(f"{prefix}_movda_rating") # MOVDA Elo rating (normalized)
 
2232
  "movda_diff", # MOVDA rating differential
2233
  "movda_win_prob", # MOVDA-derived win probability
2234
  ])
2235
+ # Raw delta_MOV rolling features (no EWM smoothing) — captures recent surprise momentum
2236
+ for prefix in ["h", "a"]:
2237
+ names.append(f"{prefix}_delta_mov_raw") # last game's raw MOV surprise
2238
+ names.append(f"{prefix}_delta_mov_rolling_5") # rolling mean over last 5 games
2239
+ names.append(f"{prefix}_delta_mov_rolling_10") # rolling mean over last 10 games
2240
+ names.append("delta_mov_diff") # h_delta_mov_rolling_5 - a_delta_mov_rolling_5
2241
 
2242
  # 38. VENUE-CONDITIONAL MATCHUP FEATURES (14 features)
2243
  # Home team's home-only stats vs away team's road-only stats
 
2287
  # ── Category 37: MOVDA ELO state trackers ──
2288
  team_movda = defaultdict(lambda: 1500.0) # MOVDA Elo rating
2289
  mov_surprise_ewm = defaultdict(float) # Per-team EWMA of MOV surprise
2290
+ delta_mov_history = defaultdict(list) # Per-team raw delta_MOV history
2291
  _MOVDA_K = 20.0; _MOVDA_C = 400.0; _MOVDA_LAMBDA = 0.3
2292
  _MOVDA_ALPHA = 19.2511; _MOVDA_BETA = 0.002342
2293
  _MOVDA_GAMMA = 648.0334; _MOVDA_DELTA = -645.8717
 
2346
  team_home_margin_sum, team_home_games_count)
2347
  # Update MOVDA ELO (Cat 37)
2348
  self._update_movda(home, away, hs, as_, team_movda, mov_surprise_ewm,
2349
+ delta_mov_history,
2350
  _MOVDA_K, _MOVDA_C, _MOVDA_LAMBDA, _MOVDA_ALPHA,
2351
  _MOVDA_BETA, _MOVDA_GAMMA, _MOVDA_DELTA, _MOVDA_EWM_ALPHA)
2352
  continue
 
5057
  (self._games_in_window(hr_, gd, 7) - self._games_in_window(ar_, gd, 7)) * (_h_margin5 - _a_margin5),
5058
  ])
5059
 
5060
+ # ── 37. MOVDA ELO FEATURES (13 features) ──
5061
  _movda_dr = team_movda[home] - team_movda[away]
5062
  _movda_wp = 1.0 / (1.0 + 10.0 ** (-_movda_dr / _MOVDA_C))
5063
  for _mt, _mk in [(home, home), (away, away)]:
 
5065
  row.append(mov_surprise_ewm[_mk] / 20.0) # mov_surprise_ewm (normalized)
5066
  row.append(_movda_dr / 400.0) # movda_diff
5067
  row.append(_movda_wp) # movda_win_prob
5068
+ # Raw delta_MOV rolling features (no EWM smoothing)
5069
+ for _mk in [home, away]:
5070
+ _dh = delta_mov_history[_mk]
5071
+ _raw = (_dh[-1] / 20.0) if _dh else 0.0
5072
+ _roll5 = (sum(_dh[-5:]) / len(_dh[-5:]) / 20.0) if _dh else 0.0
5073
+ _roll10 = (sum(_dh[-10:]) / len(_dh[-10:]) / 20.0) if _dh else 0.0
5074
+ row.append(_raw) # {prefix}_delta_mov_raw
5075
+ row.append(_roll5) # {prefix}_delta_mov_rolling_5
5076
+ row.append(_roll10) # {prefix}_delta_mov_rolling_10
5077
+ # delta_mov_diff: home rolling_5 - away rolling_5
5078
+ _h_dh = delta_mov_history[home]
5079
+ _a_dh = delta_mov_history[away]
5080
+ _h_r5 = (sum(_h_dh[-5:]) / len(_h_dh[-5:]) / 20.0) if _h_dh else 0.0
5081
+ _a_r5 = (sum(_a_dh[-5:]) / len(_a_dh[-5:]) / 20.0) if _a_dh else 0.0
5082
+ row.append(_h_r5 - _a_r5) # delta_mov_diff
5083
 
5084
  # ── 38. VENUE-CONDITIONAL MATCHUP FEATURES (14 features) ──
5085
  # Use true venue-specific records: home team at home vs away team on road
 
5127
  team_home_margin_sum, team_home_games_count)
5128
  # Update MOVDA ELO (Cat 37)
5129
  self._update_movda(home, away, hs, as_, team_movda, mov_surprise_ewm,
5130
+ delta_mov_history,
5131
  _MOVDA_K, _MOVDA_C, _MOVDA_LAMBDA, _MOVDA_ALPHA,
5132
  _MOVDA_BETA, _MOVDA_GAMMA, _MOVDA_DELTA, _MOVDA_EWM_ALPHA)
5133
 
 
5683
  team_home_games_count[home] += 1
5684
 
5685
  def _update_movda(self, home, away, hs, as_, team_movda, mov_surprise_ewm,
5686
+ delta_mov_history,
5687
  K, C, lam, alpha, beta, gamma, delta_param, ewm_alpha):
5688
+ """Update MOVDA Elo ratings and raw delta_MOV history (Cat 37). arXiv:2506.00348."""
5689
  margin = hs - as_
5690
  result = 1.0 if margin > 0 else (0.0 if margin < 0 else 0.5)
5691
  delta_r = team_movda[home] - team_movda[away]
 
5697
  team_movda[away] -= movda_update
5698
  mov_surprise_ewm[home] = ewm_alpha * delta_mov + (1 - ewm_alpha) * mov_surprise_ewm[home]
5699
  mov_surprise_ewm[away] = ewm_alpha * (-delta_mov) + (1 - ewm_alpha) * mov_surprise_ewm[away]
5700
+ # Append raw delta_MOV to rolling history (home team's perspective)
5701
+ delta_mov_history[home].append(delta_mov)
5702
+ delta_mov_history[away].append(-delta_mov)
5703
 
5704
  def _parse_stats(self, stats, pts, opp_pts, is_home=True):
5705
  """Extract stats from game data. Uses REAL box score when available, estimates otherwise."""
requirements.txt CHANGED
@@ -11,4 +11,9 @@ psycopg2-binary>=2.9
11
  torch>=2.3 --index-url https://download.pytorch.org/whl/cpu
12
  pytorch_tabnet>=4.1
13
  mapie>=0.9
 
14
  # autogluon.tabular>=1.2 # OPTIONAL — large install (~2GB), uncomment if needed
 
 
 
 
 
11
  torch>=2.3 --index-url https://download.pytorch.org/whl/cpu
12
  pytorch_tabnet>=4.1
13
  mapie>=0.9
14
+ betacal>=0.1
15
  # autogluon.tabular>=1.2 # OPTIONAL — large install (~2GB), uncomment if needed
16
+ # --- Browser scraping (needs Playwright deps in Docker image, see Dockerfile.browser) ---
17
+ # crawl4ai>=0.4 # OPTIONAL — uncomment when using Dockerfile.browser for browser-based scraping
18
+ html2text>=2024.2 # Lightweight HTML-to-markdown for requests fallback
19
+ beautifulsoup4>=4.12 # CSS selector extraction in requests fallback