"""
sf-ml-baseline v0.1 — inference helpers.

Load pre-trained LightGBM + XGBoost + CatBoost ensembles and predict
24h prediction-market outcomes from 5 engineered indicator features.

Model weights are in `./weights/`. Licensed CC-BY-4.0 with attribution
— see LICENSE.

Example:
    from sf_ml_baseline import SFBaseline
    model = SFBaseline()
    p_up = model.predict_direction(price_cents=55, delta_cents=3,
                                   iy=12.5, cri=0.6, cvr=0.8)
"""
|
|
from __future__ import annotations

import logging
from pathlib import Path
from typing import Iterable, Sequence

import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from catboost import CatBoostClassifier
|
|
# Column names, in order, that make up the V1 (direction-forecast) feature
# matrix. The order matters: it is the column order handed to the T1 models.
FEATURE_COLS_V1 = ('price_cents', 'delta_cents', 'iy', 'cri', 'cvr')

# Seed suffixes used in the weight filenames (`..._seed{N}.*`); one model per
# architecture is loaded for each seed.
SEEDS = (42, 137, 2026)
|
|
class SFBaseline:
    """9-model ensemble (3 architectures × 3 seeds) for 24h direction forecasting.

    On construction the T1 (direction) weights are loaded eagerly and must be
    present; the T4 (resolution) weights are loaded on a best-effort basis.

    Attributes set by ``__init__``:
        weights_dir: directory the weight files are read from.
        lgb_t1, xgb_t1, cat_t1: lists of loaded T1 models, one per seed.
        lgb_t4, xgb_t4, cat_t4: lists of loaded T4 models, or ``None`` when
            the T4 weights could not be loaded.
        t4_features: feature names reported by the first T4 LightGBM model,
            or ``None`` when the T4 weights could not be loaded.
        t4_load_error: the exception that prevented T4 loading, else ``None``.
    """

    def __init__(self, weights_dir: str | Path | None = None):
        """Load all model weights.

        Args:
            weights_dir: directory containing the weight files. Defaults to
                the ``weights`` directory next to this module.

        Raises:
            FileNotFoundError: if ``weights_dir`` is not an existing directory.
        """
        if weights_dir is None:
            weights_dir = Path(__file__).parent / 'weights'
        self.weights_dir = Path(weights_dir)
        if not self.weights_dir.is_dir():
            raise FileNotFoundError(f'weights dir not found: {self.weights_dir}')
        self._load_t1_models()
        self._load_t4_models()

    def _load_ensemble(self, tag: str) -> tuple[list, list, list]:
        """Load one LightGBM, XGBoost and CatBoost model per seed for ``tag``.

        Files are looked up in ``self.weights_dir`` as ``lgbm_{tag}_seed{N}.txt``,
        ``xgb_{tag}_seed{N}.json`` and ``cat_{tag}_seed{N}.cbm``.

        Returns:
            ``(lgb_models, xgb_models, cat_models)``, each ordered like SEEDS.
            Any exception raised by a backend loader propagates to the caller.
        """
        lgb_models = [
            lgb.Booster(model_file=str(self.weights_dir / f'lgbm_{tag}_seed{s}.txt'))
            for s in SEEDS
        ]
        xgb_models = []
        for s in SEEDS:
            model = xgb.XGBClassifier()
            model.load_model(str(self.weights_dir / f'xgb_{tag}_seed{s}.json'))
            xgb_models.append(model)
        cat_models = []
        for s in SEEDS:
            model = CatBoostClassifier()
            model.load_model(str(self.weights_dir / f'cat_{tag}_seed{s}.cbm'))
            cat_models.append(model)
        return lgb_models, xgb_models, cat_models

    def _load_t1_models(self):
        """V1 × T1 — direction forecast. A loading failure propagates."""
        self.lgb_t1, self.xgb_t1, self.cat_t1 = self._load_ensemble('v1_t1')

    def _load_t4_models(self):
        """V2 × T4 — resolution forecast. Requires 35 features (price + rolling stats) —
        see docs/ml/phase-a-results.md. This helper loads them but the user must
        supply the full 35-feature vector (via the training-data pipeline).

        Loading is best-effort: on any failure the T4 attributes are set to
        ``None`` so direction forecasting keeps working, and the cause is kept
        on ``self.t4_load_error`` instead of being discarded.
        """
        self.t4_load_error = None
        try:
            lgb_models, xgb_models, cat_models = self._load_ensemble('v2_t4')
            features = list(lgb_models[0].feature_name())
        except Exception as exc:
            # Deliberately broad: each backend raises its own exception types
            # and T4 is optional, so a failure must not break construction.
            self.lgb_t4 = self.xgb_t4 = self.cat_t4 = None
            self.t4_features = None
            self.t4_load_error = exc
            logging.getLogger(__name__).debug(
                'T4 weights could not be loaded from %s; '
                'resolution forecasting is disabled',
                self.weights_dir,
                exc_info=True,
            )
        else:
            self.lgb_t4, self.xgb_t4, self.cat_t4 = lgb_models, xgb_models, cat_models
            self.t4_features = features

    @staticmethod
    def _ensemble_mean(lgb_models, xgb_models, cat_models, X: np.ndarray) -> np.ndarray:
        """Return the equal-weight mean of every model's prediction for ``X``.

        LightGBM boosters contribute ``predict(X)`` as-is; the XGBoost and
        CatBoost classifiers contribute column 1 of ``predict_proba(X)``.
        A zero-row ``X`` yields an empty float64 array without calling any
        model.
        """
        if X.shape[0] == 0:
            # Nothing to score; avoid handing a zero-row matrix to the backends.
            return np.empty(0, dtype=np.float64)
        preds = [m.predict(X) for m in lgb_models]
        preds += [m.predict_proba(X)[:, 1] for m in xgb_models]
        preds += [m.predict_proba(X)[:, 1] for m in cat_models]
        return np.mean(preds, axis=0)

    def predict_direction_batch(self, df: pd.DataFrame) -> np.ndarray:
        """Return P(up-move in 24h) for each row in df.

        df must have columns: price_cents, delta_cents, iy, cri, cvr. Extra
        columns are ignored. An empty df returns an empty array.

        Raises:
            ValueError: if any required column is missing.
        """
        missing = [c for c in FEATURE_COLS_V1 if c not in df.columns]
        if missing:
            raise ValueError(f'missing feature columns: {missing}')
        X = df[list(FEATURE_COLS_V1)].astype('float32').to_numpy()
        return self._predict_t1(X)

    def predict_direction(
        self,
        price_cents: float,
        delta_cents: float,
        iy: float,
        cri: float,
        cvr: float,
    ) -> float:
        """Single-row prediction. Returns scalar probability."""
        X = np.array([[price_cents, delta_cents, iy, cri, cvr]], dtype='float32')
        return float(self._predict_t1(X)[0])

    def _predict_t1(self, X: np.ndarray) -> np.ndarray:
        """9-model ensemble: 3 LGBM + 3 XGB + 3 Cat, equal weight."""
        return self._ensemble_mean(self.lgb_t1, self.xgb_t1, self.cat_t1, X)

    def predict_resolution_batch(self, df: pd.DataFrame) -> np.ndarray:
        """Return P(YES resolution in 24h) for each row in df.

        df must have all 35 V2 features (base 5 + rolling stats). See
        self.t4_features. An empty df returns an empty array.

        Raises:
            RuntimeError: if the T4 weights were not loaded; the original
                loading failure, when recorded, is attached as ``__cause__``.
            ValueError: if any column in self.t4_features is missing.
        """
        if self.lgb_t4 is None:
            raise RuntimeError('T4 weights not loaded') from getattr(
                self, 't4_load_error', None
            )
        missing = [c for c in self.t4_features if c not in df.columns]
        if missing:
            raise ValueError(f'missing V2 feature columns: {missing[:5]}... ({len(missing)} total)')
        X = df[self.t4_features].astype('float32').to_numpy()
        return self._ensemble_mean(self.lgb_t4, self.xgb_t4, self.cat_t4, X)
|
|
|
|
if __name__ == '__main__':
    # Smoke demo: load the bundled weights, then score one single-row input
    # and one two-row batch and print the results.
    model = SFBaseline()
    p = model.predict_direction(price_cents=55, delta_cents=3, iy=12.5, cri=0.6, cvr=0.8)
    print(f'P(price rises in 24h | market at 55c, +3c delta, iy=12.5%, cri=0.6, cvr=0.8) = {p:.3f}')
    p_batch = model.predict_direction_batch(pd.DataFrame([
        {'price_cents': 55, 'delta_cents': 3, 'iy': 12.5, 'cri': 0.6, 'cvr': 0.8},
        {'price_cents': 82, 'delta_cents': -1, 'iy': 4.5, 'cri': 0.3, 'cvr': 0.9},
    ]))
    print(f'Batch predictions: {p_batch}')
|
|