LBJLincoln commited on
Commit
89fc340
Β·
1 Parent(s): 1325036

sync: engine.py to hf-space (auto-deploy)

Browse files
Files changed (1) hide show
  1. features/engine.py +248 -2
features/engine.py CHANGED
@@ -66,6 +66,7 @@ Categories:
66
  64. OPPONENT-ELO-WEIGHTED PERFORMANCE (10 features — quality-adjusted rolling stats, trend)
67
  65. STYLE MATCHUP ADVANTAGE (12 features — 4-factor offense vs defense matchup edges)
68
  66. PACE-NORMALIZED PER-100 BOX-SCORE DIFFERENTIALS (12 features — pts/ast/tov/reb per 100 poss)
 
69
  ≈ 6400+ feature candidates
70
 
71
  Architecture inspired by:
@@ -90,7 +91,7 @@ import csv
90
  import os
91
 
92
  # ── Engine Version ──
93
- ENGINE_VERSION = "v3.1-66cat" # Cat66: Pace-Normalized Per-100 Box-Score Differentials (MDPI Jan 2026)
94
 
95
  # ── Team mappings ──
96
  TEAM_MAP = {
@@ -360,6 +361,197 @@ def load_historical_odds(csv_path=None):
360
  return lookup
361
 
362
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
363
  class NBAFeatureEngine:
364
  """
365
  Generates 6000+ features for each game from historical data.
@@ -370,9 +562,27 @@ class NBAFeatureEngine:
370
  # X.shape = (n_games, ~6000)
371
  """
372
 
373
- def __init__(self, include_market=True, skip_placeholder=False):
 
 
 
 
 
 
 
 
 
 
374
  self.include_market = include_market
375
  self.skip_placeholder = skip_placeholder
 
 
 
 
 
 
 
 
376
  self.feature_names = []
377
  self._build_feature_names()
378
 
@@ -2948,6 +3158,22 @@ class NBAFeatureEngine:
2948
  "p100_66_diff_reb", # Home - Away reb/100 differential
2949
  ])
2950
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2951
  self.feature_names = names
2952
 
2953
  def build(self, games, market_data=None, referee_data=None, player_data=None, quarter_data=None, tracking_data=None, odds_data=None):
@@ -7180,6 +7406,26 @@ class NBAFeatureEngine:
7180
  except Exception:
7181
  row.extend([0.0] * 12)
7182
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7183
  X.append(row)
7184
  y.append(1 if hs > as_ else 0)
7185
  _y_margin.append(hs - as_)
 
66
  64. OPPONENT-ELO-WEIGHTED PERFORMANCE (10 features — quality-adjusted rolling stats, trend)
67
  65. STYLE MATCHUP ADVANTAGE (12 features — 4-factor offense vs defense matchup edges)
68
  66. PACE-NORMALIZED PER-100 BOX-SCORE DIFFERENTIALS (12 features — pts/ast/tov/reb per 100 poss)
69
+ 67. YOUTUBE FINBERT SENTIMENT (6 features — rolling 3/7/14d polarity + volatility, sim_cutoff gated)
70
  ≈ 6400+ feature candidates
71
 
72
  Architecture inspired by:
 
91
  import os
92
 
93
  # ── Engine Version ──
94
+ ENGINE_VERSION = "v3.2-67cat" # Cat67: YouTube FinBERT rolling sentiment (HAWKEYE 2026-04-21, sim_cutoff-gated)
95
 
96
  # ── Team mappings ──
97
  TEAM_MAP = {
 
361
  return lookup
362
 
363
 
364
+ # ── Cat 67: YouTube FinBERT Sentiment Loader (HAWKEYE 2026-04-21) ──
365
+ # Loads precomputed finBERT sentiment from data/youtube/sentiment.parquet
366
+ # and exposes per-game rolling aggregates with a hard sim_date_cutoff gate.
367
+ # Leakage precedent: 2026-04-18 POL excess_return, 2026-04-21 market_narrative
368
+ # stripper. We MUST refuse any published_at > sim_cutoff at compute time.
369
+
370
+ _YT_SENT_DEFAULT_PATH = os.path.join(
371
+ os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
372
+ "data", "youtube", "sentiment.parquet",
373
+ )
374
+
375
+
376
def _load_youtube_sentiment(path=None):
    """Load the precomputed YouTube sentiment table, or return None.

    The file location is resolved in priority order: the explicit ``path``
    argument, then the ``NOMOS_YT_SENT_PATH`` environment variable, then the
    module-level ``_YT_SENT_DEFAULT_PATH``.

    Only the ``published_at`` and ``polarity`` columns are required by this
    loader. ``published_at`` is coerced to tz-aware UTC timestamps and rows
    whose timestamp cannot be parsed are dropped. (The original author
    documented the expected schema as: id, published_at, channel, sent_pos,
    sent_neu, sent_neg, polarity.)

    Args:
        path: Optional filesystem path of the parquet file to read.

    Returns:
        A pandas DataFrame, or None when pandas is not installed, the file
        does not exist, a required column is missing, or reading/parsing the
        file fails. Failures of an *existing* file are logged as warnings so a
        corrupt or mis-shaped file does not silently zero the whole category.
    """
    try:
        import pandas as pd
    except ImportError:
        # Optional dependency: without pandas the category is simply disabled.
        return None

    resolved = path or os.environ.get("NOMOS_YT_SENT_PATH") or _YT_SENT_DEFAULT_PATH
    if not resolved or not os.path.exists(resolved):
        return None

    # Local import keeps this block self-contained regardless of file imports.
    import logging
    log = logging.getLogger(__name__)

    try:
        df = pd.read_parquet(resolved)
        missing = [c for c in ("published_at", "polarity") if c not in df.columns]
        if missing:
            log.warning(
                "YouTube sentiment file %s is missing required column(s) %s; "
                "sentiment features disabled", resolved, missing,
            )
            return None
        df["published_at"] = pd.to_datetime(
            df["published_at"], utc=True, errors="coerce"
        )
        # Rows with unparseable timestamps cannot be date-gated, so drop them.
        return df.dropna(subset=["published_at"]).copy()
    except Exception:
        # Deliberate best-effort boundary: a bad file must never break feature
        # building, but it should leave a trace in the logs.
        log.warning(
            "Failed to load YouTube sentiment from %s; sentiment features disabled",
            resolved, exc_info=True,
        )
        return None
399
+
400
+
401
def _youtube_sentiment_features(df, game_date_str, sim_cutoff=None):
    """Compute the six rolling-window sentiment scalars for one game.

    Join rule: a video contributes to window ``W`` when
    ``published_at <= game_date`` and ``game_date - published_at <= W`` days,
    where ``game_date`` is midnight UTC of the first 10 characters of
    ``game_date_str``.

    ``sim_cutoff`` is a hard leakage gate: when given, only rows with
    ``published_at <= cutoff`` (midnight UTC of its first 10 characters) are
    used. The gate fails CLOSED: if ``sim_cutoff`` is supplied but cannot be
    parsed, all features are zero rather than silently ignoring the gate.

    Args:
        df: DataFrame with a tz-aware ``published_at`` column and a
            ``polarity`` column, or None.
        game_date_str: Game date; anything whose ``str()`` starts with
            ``YYYY-MM-DD``.
        sim_cutoff: Optional date string or datetime-like cutoff.

    Returns:
        dict with keys ``yt_pol_mean_{3,7,14}`` and
        ``yt_abs_pol_mean_{3,7,14}``. All values are 0.0 when ``df`` is
        None/empty, a date cannot be parsed, the computation fails, or a
        window holds no usable (non-NaN, numeric) polarity values.
    """
    windows = (3, 7, 14)
    out = {f"yt_pol_mean_{w}": 0.0 for w in windows}
    out.update({f"yt_abs_pol_mean_{w}": 0.0 for w in windows})
    if df is None or len(df) == 0 or not game_date_str:
        return out

    try:
        import pandas as pd
        gd = pd.Timestamp(str(game_date_str)[:10], tz="UTC")
        cutoff = None
        if sim_cutoff is not None:
            # Parsed inside the same guard on purpose: an unusable cutoff must
            # disable the features, never widen the data that is visible.
            cutoff = pd.Timestamp(str(sim_cutoff)[:10], tz="UTC")
    except Exception:
        return out
    if pd.isna(gd) or (sim_cutoff is not None and pd.isna(cutoff)):
        return out

    try:
        published = df["published_at"]
        visible = published <= gd
        if cutoff is not None:
            visible = visible & (published <= cutoff)
        sub = df[visible]
        if len(sub) == 0:
            return out

        age_days = (gd - sub["published_at"]).dt.total_seconds() / 86400.0
        # Coerce once, outside the loop; non-numeric/missing values become NaN
        # and are excluded so they cannot leak NaN into the feature row.
        polarity = pd.to_numeric(sub["polarity"], errors="coerce").astype(float)
        usable = polarity.notna()

        computed = {}
        for w in windows:
            pol = polarity[usable & (age_days <= float(w))]
            if pol.empty:
                continue
            computed[f"yt_pol_mean_{w}"] = float(pol.mean())
            computed[f"yt_abs_pol_mean_{w}"] = float(pol.abs().mean())
    except Exception:
        # Best-effort by design: any unexpected frame shape or dtype yields the
        # all-zero fallback instead of breaking the per-game build loop.
        return out

    # Merge only after every window succeeded (all-or-nothing result).
    out.update(computed)
    return out
440
+
441
+
442
+ # ── GENERALIZED VENN-ABERS CALIBRATION WRAPPER (2026-04-21, proposal #4) ────
443
+ # Source: arXiv:2502.05676 + github.com/ip200/venn-abers
444
+ #
445
+ # Wraps any probability output with a Venn-Abers Inductive Calibrator. Additive
446
+ # — gated behind the VENN_ABERS_CALIBRATION env flag (default on = "1"). When
447
+ # disabled or when the `venn_abers` package is absent, `calibrate_probs`
448
+ # returns the input unchanged so islands can ship without the dep upgrade.
449
+ #
450
+ # Usage in a scorer / evaluator:
451
+ # from features.engine import VennAbersProbabilityCalibrator
452
+ # cal = VennAbersProbabilityCalibrator()
453
+ # cal.fit(p_valid_2col, y_valid) # p shape (n,2) β€” [p_no, p_yes]
454
+ # p_cal = cal.transform(p_test_2col) # same shape
455
+ # # OR one-shot:
456
+ # p_cal = calibrate_probs(p_train, y_train, p_test)
457
+ #
458
+ # Brier-delta expectation per the paper: 2-5% drop on binary classifiers with
459
+ # moderate miscalibration (our island fleet fits this — ECE ~0.05 on held-out).
460
+ import os as _os_va
461
+
462
+
463
def _venn_abers_enabled() -> bool:
    """Report whether Venn-Abers calibration is switched on.

    Reads the ``VENN_ABERS_CALIBRATION`` environment variable on every call.
    The flag defaults to on (``"1"``); it is off only when the value, after
    stripping whitespace and lower-casing, is one of ``0``, ``false``, ``no``
    or ``off``. Any other value (including an empty string) counts as on.

    Returns:
        True when calibration is enabled, False otherwise.
    """
    # Uses the module's regular ``os`` import, like the other env lookups in
    # this file, instead of a second aliased import of the same module.
    raw = os.environ.get("VENN_ABERS_CALIBRATION", "1")
    return raw.strip().lower() not in ("0", "false", "no", "off")
466
+
467
+
468
class VennAbersProbabilityCalibrator:
    """Post-hoc Venn-Abers calibrator for binary-classifier probabilities.

    Thin wrapper around ``venn_abers.VennAbers`` that degrades to an identity
    passthrough when the feature flag is off or the ``venn_abers`` package is
    not installed, so training loops never break on a missing dependency.

    Intended use is on a held-out calibration set: ``fit(p_cal, y_cal)`` then
    ``transform(p_test)``. ``transform`` always returns a 1-D array of
    P(class=1), whether or not calibration actually ran.
    """

    def __init__(self, inductive: bool = True):
        """Create the calibrator and try to bind the optional backend.

        Args:
            inductive: Stored on the instance; the visible code does not
                consult it (kept for interface compatibility).
        """
        self._enabled = _venn_abers_enabled()
        self._inductive = inductive
        self._inner = None
        self._fitted = False
        # Retained for backward compatibility; nothing in this class populates
        # them (the backend keeps whatever state it needs after ``fit``).
        self._p_cal = None
        self._y_cal = None
        # Text of the ImportError when the backend is missing, else None.
        self._import_err = None
        if self._enabled:
            try:
                # The manual VennAbers class needs no estimator: it calibrates
                # scores produced by any upstream classifier.
                from venn_abers import VennAbers as _VA  # noqa: N811
            except ImportError as e:
                self._import_err = str(e)
                self._enabled = False
            else:
                self._inner = _VA()

    @property
    def enabled(self) -> bool:
        """True when the flag is on and the backend imported successfully."""
        return self._enabled

    @staticmethod
    def _two_column(p):
        """Return ``p`` as a float array; 1-D input becomes [1 - p, p] columns."""
        import numpy as _np
        arr = _np.asarray(p, dtype=float)
        if arr.ndim == 1:
            # Single-column input is interpreted as P(class=1).
            arr = _np.column_stack([1.0 - arr, arr])
        return arr

    @staticmethod
    def _positive_class(p):
        """Identity passthrough: column 1 of an (n, 2) array, else ``p`` as floats."""
        import numpy as _np
        arr = _np.asarray(p, dtype=float)
        if arr.ndim == 2 and arr.shape[1] == 2:
            return arr[:, 1]
        return arr

    def fit(self, p_cal, y_cal):
        """Fit the backend on a calibration set.

        Args:
            p_cal: array-like of shape (n, 2) as [p(class=0), p(class=1)], or
                shape (n,) holding P(class=1).
            y_cal: array-like of ints in {0, 1}.

        Returns:
            self. A no-op when calibration is disabled or when the calibration
            set is empty (the instance then stays unfit and ``transform``
            passes probabilities through unchanged).
        """
        if not self._enabled or self._inner is None:
            return self
        import numpy as _np
        p = self._two_column(p_cal)
        y = _np.asarray(y_cal, dtype=int)
        if p.size == 0:
            # Nothing to calibrate on; do not hand an empty set to the backend.
            return self
        self._inner.fit(p, y)
        self._fitted = True
        return self

    def transform(self, p_test):
        """Return calibrated P(class=1); identity passthrough if disabled/unfit.

        Args:
            p_test: array-like of shape (n, 2) or (n,), as for ``fit``.

        Returns:
            numpy array of P(class=1).
        """
        if not self._enabled or self._inner is None or not self._fitted:
            return self._positive_class(p_test)
        import numpy as _np
        out = self._inner.predict_proba(self._two_column(p_test))
        # The backend may return a tuple (p_prime, p0_p1); the first element is
        # the calibrated 2-column probability matrix.
        calibrated = out[0] if isinstance(out, tuple) else out
        p_prime = _np.asarray(calibrated, dtype=float)
        return p_prime[:, 1] if p_prime.ndim == 2 else p_prime

    def fit_transform(self, p_cal, y_cal, p_test):
        """Fit on (p_cal, y_cal), then return ``transform(p_test)``."""
        self.fit(p_cal, y_cal)
        return self.transform(p_test)
539
+
540
+
541
def calibrate_probs(p_train, y_train, p_test, inductive: bool = True):
    """Calibrate ``p_test`` with Venn-Abers in a single call.

    Builds a ``VennAbersProbabilityCalibrator``, fits it on
    ``(p_train, y_train)`` and applies it to ``p_test``.

    Args:
        p_train: calibration-set probabilities, shape (n, 2) or (n,).
        y_train: calibration-set labels in {0, 1}.
        p_test: probabilities to calibrate, shape (m, 2) or (m,).
        inductive: forwarded to the calibrator constructor.

    Returns:
        Array of P(class=1) for ``p_test``. When the feature flag is off or
        the ``venn_abers`` package is missing this is a safe no-op: column 1
        of an (m, 2) input, otherwise the input converted to a float array.
    """
    calibrator = VennAbersProbabilityCalibrator(inductive=inductive)
    if calibrator.enabled:
        return calibrator.fit_transform(p_train, y_train, p_test)

    # Calibration unavailable: hand back the raw positive-class probability.
    import numpy as _np
    raw = _np.asarray(p_test, dtype=float)
    has_two_columns = raw.ndim == 2 and raw.shape[1] == 2
    return raw[:, 1] if has_two_columns else raw
553
+
554
+
555
  class NBAFeatureEngine:
556
  """
557
  Generates 6000+ features for each game from historical data.
 
562
  # X.shape = (n_games, ~6000)
563
  """
564
 
565
+ def __init__(self, include_market=True, skip_placeholder=False,
566
+ youtube_sentiment_path=None, sim_date_cutoff=None,
567
+ enable_youtube=False):
568
+ """
569
+ Args:
570
+ enable_youtube: default False β€” corpus currently has only 20.7% NBA keyword
571
+ hits (audit 2026-04-21). HAWKEYE's Tier-1 proposal explicitly scoped FinBERT
572
+ rolling sentiment as POL-first; NBA path is dark until corpus mature.
573
+ Flip to True on sandbox island (S14) for A/B.
574
+ sim_date_cutoff: hard leakage gate β€” drops videos published after this date.
575
+ """
576
  self.include_market = include_market
577
  self.skip_placeholder = skip_placeholder
578
+ # Cat 67: YouTube FinBERT rolling sentiment (HAWKEYE 2026-04-21)
579
+ # sim_date_cutoff enforces leakage-gate β€” videos published after this
580
+ # are dropped. Default None → falls back to per-game date only.
581
+ self.enable_youtube = enable_youtube
582
+ self.sim_date_cutoff = sim_date_cutoff
583
+ self._yt_sent_cache = None # pandas DataFrame, lazy-loaded below
584
+ if enable_youtube:
585
+ self._yt_sent_cache = _load_youtube_sentiment(youtube_sentiment_path)
586
  self.feature_names = []
587
  self._build_feature_names()
588
 
 
3158
  "p100_66_diff_reb", # Home - Away reb/100 differential
3159
  ])
3160
 
3161
+ # ── Cat 67: YouTube FinBERT Sentiment (6 features) ──
3162
+ # Source: HAWKEYE proposal 2026-04-21, Yang MDPI 2025, arXiv 2306.02136.
3163
+ # ProsusAI/finBERT → per-video (pos, neu, neg, polarity=pos-neg).
3164
+ # Per game: rolling mean over 3/7/14 day window of polarity AND |polarity|.
3165
+ # Gated by sim_date_cutoff (published_at <= cutoff) to avoid the same
3166
+ # class as 2026-04-18 POL excess_return / 2026-04-21 market_narrative leak.
3167
+ # All-zero fallback when sentiment.parquet missing or window empty.
3168
+ names.extend([
3169
+ "yt_pol_mean_3", # mean polarity last 3 days (signed: pos - neg)
3170
+ "yt_pol_mean_7", # mean polarity last 7 days
3171
+ "yt_pol_mean_14", # mean polarity last 14 days
3172
+ "yt_abs_pol_mean_3", # mean |polarity| last 3d (volatility/intensity proxy)
3173
+ "yt_abs_pol_mean_7", # mean |polarity| last 7d
3174
+ "yt_abs_pol_mean_14",# mean |polarity| last 14d
3175
+ ])
3176
+
3177
  self.feature_names = names
3178
 
3179
  def build(self, games, market_data=None, referee_data=None, player_data=None, quarter_data=None, tracking_data=None, odds_data=None):
 
7406
  except Exception:
7407
  row.extend([0.0] * 12)
7408
 
7409
+ # ── Cat 67: YouTube FinBERT Sentiment (6 features) ──
7410
+ # Per-game rolling aggregates. sim_date_cutoff gates all leakage.
7411
+ try:
7412
+ if self.enable_youtube and self._yt_sent_cache is not None:
7413
+ _yt67 = _youtube_sentiment_features(
7414
+ self._yt_sent_cache, gd, sim_cutoff=self.sim_date_cutoff
7415
+ )
7416
+ row.extend([
7417
+ _yt67["yt_pol_mean_3"],
7418
+ _yt67["yt_pol_mean_7"],
7419
+ _yt67["yt_pol_mean_14"],
7420
+ _yt67["yt_abs_pol_mean_3"],
7421
+ _yt67["yt_abs_pol_mean_7"],
7422
+ _yt67["yt_abs_pol_mean_14"],
7423
+ ])
7424
+ else:
7425
+ row.extend([0.0] * 6)
7426
+ except Exception:
7427
+ row.extend([0.0] * 6)
7428
+
7429
  X.append(row)
7430
  y.append(1 if hs > as_ else 0)
7431
  _y_margin.append(hs - as_)