Commit ·
89fc340
1
Parent(s): 1325036
sync: engine.py to hf-space (auto-deploy)
Browse files- features/engine.py +248 -2
features/engine.py
CHANGED
|
@@ -66,6 +66,7 @@ Categories:
|
|
| 66 |
64. OPPONENT-ELO-WEIGHTED PERFORMANCE (10 features — quality-adjusted rolling stats, trend)
|
| 67 |
65. STYLE MATCHUP ADVANTAGE (12 features — 4-factor offense vs defense matchup edges)
|
| 68 |
66. PACE-NORMALIZED PER-100 BOX-SCORE DIFFERENTIALS (12 features — pts/ast/tov/reb per 100 poss)
|
|
|
|
| 69 |
→ 6400+ feature candidates
|
| 70 |
|
| 71 |
Architecture inspired by:
|
|
@@ -90,7 +91,7 @@ import csv
|
|
| 90 |
import os
|
| 91 |
|
| 92 |
# ── Engine Version ──
|
| 93 |
-
ENGINE_VERSION = "v3.
|
| 94 |
|
| 95 |
# ── Team mappings ──
|
| 96 |
TEAM_MAP = {
|
|
@@ -360,6 +361,197 @@ def load_historical_odds(csv_path=None):
|
|
| 360 |
return lookup
|
| 361 |
|
| 362 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 363 |
class NBAFeatureEngine:
|
| 364 |
"""
|
| 365 |
Generates 6000+ features for each game from historical data.
|
|
@@ -370,9 +562,27 @@ class NBAFeatureEngine:
|
|
| 370 |
# X.shape = (n_games, ~6000)
|
| 371 |
"""
|
| 372 |
|
| 373 |
-
def __init__(self, include_market=True, skip_placeholder=False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 374 |
self.include_market = include_market
|
| 375 |
self.skip_placeholder = skip_placeholder
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 376 |
self.feature_names = []
|
| 377 |
self._build_feature_names()
|
| 378 |
|
|
@@ -2948,6 +3158,22 @@ class NBAFeatureEngine:
|
|
| 2948 |
"p100_66_diff_reb", # Home - Away reb/100 differential
|
| 2949 |
])
|
| 2950 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2951 |
self.feature_names = names
|
| 2952 |
|
| 2953 |
def build(self, games, market_data=None, referee_data=None, player_data=None, quarter_data=None, tracking_data=None, odds_data=None):
|
|
@@ -7180,6 +7406,26 @@ class NBAFeatureEngine:
|
|
| 7180 |
except Exception:
|
| 7181 |
row.extend([0.0] * 12)
|
| 7182 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7183 |
X.append(row)
|
| 7184 |
y.append(1 if hs > as_ else 0)
|
| 7185 |
_y_margin.append(hs - as_)
|
|
|
|
| 66 |
64. OPPONENT-ELO-WEIGHTED PERFORMANCE (10 features — quality-adjusted rolling stats, trend)
|
| 67 |
65. STYLE MATCHUP ADVANTAGE (12 features — 4-factor offense vs defense matchup edges)
|
| 68 |
66. PACE-NORMALIZED PER-100 BOX-SCORE DIFFERENTIALS (12 features — pts/ast/tov/reb per 100 poss)
|
| 69 |
+
67. YOUTUBE FINBERT SENTIMENT (6 features — rolling 3/7/14d polarity + volatility, sim_cutoff gated)
|
| 70 |
→ 6400+ feature candidates
|
| 71 |
|
| 72 |
Architecture inspired by:
|
|
|
|
| 91 |
import os
|
| 92 |
|
| 93 |
# ── Engine Version ──
|
| 94 |
+
ENGINE_VERSION = "v3.2-67cat" # Cat67: YouTube FinBERT rolling sentiment (HAWKEYE 2026-04-21, sim_cutoff-gated)
|
| 95 |
|
| 96 |
# ── Team mappings ──
|
| 97 |
TEAM_MAP = {
|
|
|
|
| 361 |
return lookup
|
| 362 |
|
| 363 |
|
| 364 |
+
# ── Cat 67: YouTube FinBERT Sentiment Loader (HAWKEYE 2026-04-21) ──
|
| 365 |
+
# Loads precomputed finBERT sentiment from data/youtube/sentiment.parquet
|
| 366 |
+
# and exposes per-game rolling aggregates with a hard sim_date_cutoff gate.
|
| 367 |
+
# Leakage precedent: 2026-04-18 POL excess_return, 2026-04-21 market_narrative
|
| 368 |
+
# stripper. We MUST refuse any published_at > sim_cutoff at compute time.
|
| 369 |
+
|
| 370 |
+
_YT_SENT_DEFAULT_PATH = os.path.join(
|
| 371 |
+
os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
|
| 372 |
+
"data", "youtube", "sentiment.parquet",
|
| 373 |
+
)
|
| 374 |
+
|
| 375 |
+
|
| 376 |
+
def _load_youtube_sentiment(path=None):
    """Load the precomputed YouTube finBERT sentiment table.

    The file location is resolved in this order: the explicit ``path``
    argument, the ``NOMOS_YT_SENT_PATH`` environment variable, then
    ``_YT_SENT_DEFAULT_PATH``.

    Columns expected: id, published_at (UTC tz-aware), channel,
    sent_pos, sent_neu, sent_neg, polarity.  Only ``published_at`` and
    ``polarity`` are checked for here.

    Args:
        path: Optional path to the parquet file.

    Returns:
        A pandas DataFrame whose ``published_at`` column has been parsed to
        tz-aware UTC timestamps (rows with unparseable timestamps dropped),
        or ``None`` when pandas is not installed, the file does not exist,
        a required column is missing, or the file cannot be read.
    """
    try:
        import pandas as pd
    except ImportError:
        return None

    p = path or os.environ.get("NOMOS_YT_SENT_PATH") or _YT_SENT_DEFAULT_PATH
    if not p or not os.path.exists(p):
        return None

    try:
        df = pd.read_parquet(p)
        if "published_at" not in df.columns or "polarity" not in df.columns:
            return None
        df["published_at"] = pd.to_datetime(df["published_at"], utc=True, errors="coerce")
        return df.dropna(subset=["published_at"]).copy()
    except Exception as exc:
        # Best-effort loader: callers treat None as "no sentiment data", but
        # an existing file that cannot be read is worth surfacing, otherwise
        # the sentiment features silently degrade to their zero fallback.
        import logging

        logging.getLogger(__name__).warning(
            "YouTube sentiment file %s could not be loaded: %s", p, exc
        )
        return None
|
| 399 |
+
|
| 400 |
+
|
| 401 |
+
def _youtube_sentiment_features(df, game_date_str, sim_cutoff=None):
    """Compute 6 rolling-window sentiment scalars for one game.

    For each window W in (3, 7, 14) days the result holds the mean polarity
    (``yt_pol_mean_W``) and the mean absolute polarity
    (``yt_abs_pol_mean_W``) of the rows whose ``published_at`` satisfies
    ``published_at <= game_date`` and ``game_date - published_at <= W`` days.
    Both dates are taken at 00:00 UTC of their first 10 characters
    (``YYYY-MM-DD``).

    Args:
        df: DataFrame with a tz-aware ``published_at`` column and a numeric
            ``polarity`` column, or ``None``.
        game_date_str: Game date; anything whose ``str()`` starts with
            ``YYYY-MM-DD``.
        sim_cutoff: Optional date string or datetime.  Hard leakage gate:
            rows published after the cutoff are never used.

    Returns:
        Dict with the six keys above.  Every value is 0.0 when ``df`` is
        None or empty, when a date cannot be parsed, or when a window has no
        usable rows.  A ``sim_cutoff`` that is supplied but cannot be parsed
        also yields the all-zero result: the gate fails closed rather than
        being skipped.
    """
    windows = (3, 7, 14)
    out = {f"yt_pol_mean_{w}": 0.0 for w in windows}
    out.update({f"yt_abs_pol_mean_{w}": 0.0 for w in windows})
    if df is None or len(df) == 0 or not game_date_str:
        return out

    try:
        import pandas as pd

        gd = pd.Timestamp(str(game_date_str)[:10], tz="UTC")
        cutoff = None
        if sim_cutoff is not None:
            cutoff = pd.Timestamp(str(sim_cutoff)[:10], tz="UTC")
    except Exception:
        # Unparseable game date or cutoff (or pandas missing): refuse to
        # compute anything rather than run without the leakage gate.
        return out
    if pd.isna(gd) or (cutoff is not None and pd.isna(cutoff)):
        return out

    try:
        sub = df
        if cutoff is not None:
            sub = sub[sub["published_at"] <= cutoff]
        sub = sub[sub["published_at"] <= gd]
        if len(sub) == 0:
            return out
        age_days = (gd - sub["published_at"]).dt.total_seconds() / 86400.0
        for w in windows:
            # Drop NaN polarity so a window made only of missing values keeps
            # the 0.0 fallback instead of emitting NaN into the feature row.
            pol = sub.loc[age_days <= float(w), "polarity"].astype(float).dropna()
            if pol.empty:
                continue
            out[f"yt_pol_mean_{w}"] = float(pol.mean())
            out[f"yt_abs_pol_mean_{w}"] = float(pol.abs().mean())
    except Exception:
        # Best-effort: keep whatever windows were computed before the failure.
        pass
    return out
|
| 440 |
+
|
| 441 |
+
|
| 442 |
+
# ── GENERALIZED VENN-ABERS CALIBRATION WRAPPER (2026-04-21, proposal #4) ────
|
| 443 |
+
# Source: arXiv:2502.05676 + github.com/ip200/venn-abers
|
| 444 |
+
#
|
| 445 |
+
# Wraps any probability output with a Venn-Abers Inductive Calibrator. Additive
|
| 446 |
+
# — gated behind the VENN_ABERS_CALIBRATION env flag (default on = "1"). When
|
| 447 |
+
# disabled or when the `venn_abers` package is absent, `calibrate_probs`
|
| 448 |
+
# returns the input unchanged so islands can ship without the dep upgrade.
|
| 449 |
+
#
|
| 450 |
+
# Usage in a scorer / evaluator:
|
| 451 |
+
# from features.engine import VennAbersProbabilityCalibrator
|
| 452 |
+
# cal = VennAbersProbabilityCalibrator()
|
| 453 |
+
# cal.fit(p_valid_2col, y_valid) # p shape (n,2) — [p_no, p_yes]
|
| 454 |
+
# p_cal = cal.transform(p_test_2col) # same shape
|
| 455 |
+
# # OR one-shot:
|
| 456 |
+
# p_cal = calibrate_probs(p_train, y_train, p_test)
|
| 457 |
+
#
|
| 458 |
+
# Brier-delta expectation per the paper: 2-5% drop on binary classifiers with
|
| 459 |
+
# moderate miscalibration (our island fleet fits this — ECE ~0.05 on held-out).
|
| 460 |
+
import os as _os_va
|
| 461 |
+
|
| 462 |
+
|
| 463 |
+
def _venn_abers_enabled() -> bool:
    """Report whether Venn-Abers calibration is switched on.

    Reads the ``VENN_ABERS_CALIBRATION`` environment variable (default
    ``"1"``).  After trimming whitespace and lower-casing, the values
    ``0``, ``false``, ``no`` and ``off`` disable the feature; anything else
    leaves it enabled.
    """
    raw_value = _os_va.environ.get("VENN_ABERS_CALIBRATION", "1")
    normalized = raw_value.strip().lower()
    disabling_values = ("0", "false", "no", "off")
    return normalized not in disabling_values
|
| 466 |
+
|
| 467 |
+
|
| 468 |
+
class VennAbersProbabilityCalibrator:
    """Thin wrapper around ``venn_abers.VennAbers`` that swallows ImportError
    so island training loops never break on a missing dep.

    Use for post-hoc calibration of any binary-classifier probability output.
    Works on a held-out calibration set, hence ``fit(p_cal, y_cal)`` then
    ``transform(p_test)``.

    The wrapper is a passthrough (``transform`` returns the P(class=1) column
    of its input unchanged) when the ``VENN_ABERS_CALIBRATION`` flag is off,
    when the ``venn_abers`` package cannot be imported, or before a
    successful ``fit``.
    """

    def __init__(self, inductive: bool = True):
        """Create the calibrator and try to instantiate the backend.

        Args:
            inductive: Stored on the instance; not consulted by this class.
        """
        self._enabled = _venn_abers_enabled()
        self._inductive = inductive
        self._inner = None
        self._fitted = False
        # Kept for attribute compatibility; this class never populates them.
        self._p_cal = None
        self._y_cal = None
        # str(ImportError) when the backend import failed, else None.
        self._import_err = None
        if self._enabled:
            try:
                # Use the manual VennAbers class (no estimator required: we
                # post-hoc-calibrate scores from any upstream classifier).
                from venn_abers import VennAbers as _VA  # noqa: N811
                self._inner = _VA()
            except ImportError as e:
                self._import_err = str(e)
                self._enabled = False

    @staticmethod
    def _positive_class(p):
        """Return P(class=1): column 1 of an (n, 2) input, else the input as a float array."""
        import numpy as _np
        arr = _np.asarray(p, dtype=float)
        if arr.ndim == 2 and arr.shape[1] == 2:
            return arr[:, 1]
        return arr

    @staticmethod
    def _two_column(p):
        """Return ``p`` as a float array, expanding 1-D P(class=1) to [1 - p, p] columns."""
        import numpy as _np
        arr = _np.asarray(p, dtype=float)
        if arr.ndim == 1:
            arr = _np.column_stack([1.0 - arr, arr])
        return arr

    @property
    def enabled(self) -> bool:
        """True when the flag is on and the backend imported successfully."""
        return self._enabled

    def fit(self, p_cal, y_cal):
        """Fit the backend on a calibration set.

        Args:
            p_cal: array-like shape (n, 2) as [p(class=0), p(class=1)], or a
                1-D array-like of P(class=1).
            y_cal: array-like of ints in {0, 1}.

        Returns:
            ``self``.  Nothing is fitted when the calibrator is disabled or
            when the calibration set is empty; ``transform`` then stays a
            passthrough.
        """
        if not self._enabled or self._inner is None:
            return self
        import numpy as _np
        p = self._two_column(p_cal)
        y = _np.asarray(y_cal, dtype=int)
        if p.size == 0 or y.size == 0:
            # Nothing to calibrate on; do not hand an empty set to the backend.
            return self
        self._inner.fit(p, y)
        self._fitted = True
        return self

    def transform(self, p_test):
        """Return calibrated P(class=1). Identity passthrough if disabled/unfit."""
        if not self._enabled or self._inner is None or not self._fitted:
            return self._positive_class(p_test)
        import numpy as _np
        p = self._two_column(p_test)
        if p.size == 0:
            return _np.empty(0, dtype=float)
        out = self._inner.predict_proba(p)
        # The backend may return a tuple whose first item holds the calibrated
        # 2-col probs; take column 1 = P(class=1).
        calibrated = out[0] if isinstance(out, tuple) else out
        p_prime = _np.asarray(calibrated, dtype=float)
        return p_prime[:, 1] if p_prime.ndim == 2 else p_prime

    def fit_transform(self, p_cal, y_cal, p_test):
        """Fit on ``(p_cal, y_cal)`` and return ``transform(p_test)``."""
        self.fit(p_cal, y_cal)
        return self.transform(p_test)
|
| 539 |
+
|
| 540 |
+
|
| 541 |
+
def calibrate_probs(p_train, y_train, p_test, inductive: bool = True):
    """One-shot Venn-Abers calibration.

    Builds a ``VennAbersProbabilityCalibrator``, fits it on
    ``(p_train, y_train)`` and returns the calibrated P(class=1) for
    ``p_test``.  When the feature flag is off or ``venn_abers`` is missing,
    the calibrator's fit is a no-op and its transform is a passthrough, so
    the result is the P(class=1) column of ``p_test`` as a float array.
    """
    calibrator = VennAbersProbabilityCalibrator(inductive=inductive)
    # A disabled calibrator already passes p_test through unchanged, so no
    # separate branch is needed for that case.
    return calibrator.fit_transform(p_train, y_train, p_test)
|
| 553 |
+
|
| 554 |
+
|
| 555 |
class NBAFeatureEngine:
|
| 556 |
"""
|
| 557 |
Generates 6000+ features for each game from historical data.
|
|
|
|
| 562 |
# X.shape = (n_games, ~6000)
|
| 563 |
"""
|
| 564 |
|
| 565 |
+
def __init__(self, include_market=True, skip_placeholder=False,
|
| 566 |
+
youtube_sentiment_path=None, sim_date_cutoff=None,
|
| 567 |
+
enable_youtube=False):
|
| 568 |
+
"""
|
| 569 |
+
Args:
|
| 570 |
+
enable_youtube: default False — corpus currently has only 20.7% NBA keyword
|
| 571 |
+
hits (audit 2026-04-21). HAWKEYE's Tier-1 proposal explicitly scoped FinBERT
|
| 572 |
+
rolling sentiment as POL-first; NBA path is dark until corpus mature.
|
| 573 |
+
Flip to True on sandbox island (S14) for A/B.
|
| 574 |
+
sim_date_cutoff: hard leakage gate — drops videos published after this date.
|
| 575 |
+
"""
|
| 576 |
self.include_market = include_market
|
| 577 |
self.skip_placeholder = skip_placeholder
|
| 578 |
+
# Cat 67: YouTube FinBERT rolling sentiment (HAWKEYE 2026-04-21)
|
| 579 |
+
# sim_date_cutoff enforces leakage-gate — videos published after this
|
| 580 |
+
# are dropped. Default None → falls back to per-game date only.
|
| 581 |
+
self.enable_youtube = enable_youtube
|
| 582 |
+
self.sim_date_cutoff = sim_date_cutoff
|
| 583 |
+
self._yt_sent_cache = None # pandas DataFrame, lazy-loaded below
|
| 584 |
+
if enable_youtube:
|
| 585 |
+
self._yt_sent_cache = _load_youtube_sentiment(youtube_sentiment_path)
|
| 586 |
self.feature_names = []
|
| 587 |
self._build_feature_names()
|
| 588 |
|
|
|
|
| 3158 |
"p100_66_diff_reb", # Home - Away reb/100 differential
|
| 3159 |
])
|
| 3160 |
|
| 3161 |
+
# ── Cat 67: YouTube FinBERT Sentiment (6 features) ──
|
| 3162 |
+
# Source: HAWKEYE proposal 2026-04-21, Yang MDPI 2025, arXiv 2306.02136.
|
| 3163 |
+
# ProsusAI/finBERT → per-video (pos, neu, neg, polarity=pos-neg).
|
| 3164 |
+
# Per game: rolling mean over 3/7/14 day window of polarity AND |polarity|.
|
| 3165 |
+
# Gated by sim_date_cutoff (published_at <= cutoff) to avoid the same
|
| 3166 |
+
# class as 2026-04-18 POL excess_return / 2026-04-21 market_narrative leak.
|
| 3167 |
+
# All-zero fallback when sentiment.parquet missing or window empty.
|
| 3168 |
+
names.extend([
|
| 3169 |
+
"yt_pol_mean_3", # mean polarity last 3 days (signed: pos - neg)
|
| 3170 |
+
"yt_pol_mean_7", # mean polarity last 7 days
|
| 3171 |
+
"yt_pol_mean_14", # mean polarity last 14 days
|
| 3172 |
+
"yt_abs_pol_mean_3", # mean |polarity| last 3d (volatility/intensity proxy)
|
| 3173 |
+
"yt_abs_pol_mean_7", # mean |polarity| last 7d
|
| 3174 |
+
"yt_abs_pol_mean_14",# mean |polarity| last 14d
|
| 3175 |
+
])
|
| 3176 |
+
|
| 3177 |
self.feature_names = names
|
| 3178 |
|
| 3179 |
def build(self, games, market_data=None, referee_data=None, player_data=None, quarter_data=None, tracking_data=None, odds_data=None):
|
|
|
|
| 7406 |
except Exception:
|
| 7407 |
row.extend([0.0] * 12)
|
| 7408 |
|
| 7409 |
+
# ── Cat 67: YouTube FinBERT Sentiment (6 features) ──
|
| 7410 |
+
# Per-game rolling aggregates. sim_date_cutoff gates all leakage.
|
| 7411 |
+
try:
|
| 7412 |
+
if self.enable_youtube and self._yt_sent_cache is not None:
|
| 7413 |
+
_yt67 = _youtube_sentiment_features(
|
| 7414 |
+
self._yt_sent_cache, gd, sim_cutoff=self.sim_date_cutoff
|
| 7415 |
+
)
|
| 7416 |
+
row.extend([
|
| 7417 |
+
_yt67["yt_pol_mean_3"],
|
| 7418 |
+
_yt67["yt_pol_mean_7"],
|
| 7419 |
+
_yt67["yt_pol_mean_14"],
|
| 7420 |
+
_yt67["yt_abs_pol_mean_3"],
|
| 7421 |
+
_yt67["yt_abs_pol_mean_7"],
|
| 7422 |
+
_yt67["yt_abs_pol_mean_14"],
|
| 7423 |
+
])
|
| 7424 |
+
else:
|
| 7425 |
+
row.extend([0.0] * 6)
|
| 7426 |
+
except Exception:
|
| 7427 |
+
row.extend([0.0] * 6)
|
| 7428 |
+
|
| 7429 |
X.append(row)
|
| 7430 |
y.append(1 if hs > as_ else 0)
|
| 7431 |
_y_margin.append(hs - as_)
|