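# NOTE(review): the lines below look like web file-viewer residue rather than
# module code; kept verbatim as comments so the file parses as Python.
# sf-ml-baseline / sf_ml_baseline.py
# Patrickspf's picture
# Initial release: sf-ml-baseline v0.1 (LightGBM + XGBoost + CatBoost ensemble)
# 4a2ce27 verified
# Raw
# History Blame Contribute Delete
# 5.67 kB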
"""
sf-ml-baseline v0.1 — inference helpers.

Load pre-trained LightGBM + XGBoost + CatBoost ensembles and predict
24h prediction-market outcomes from 5 engineered indicator features.

Model weights are in `./weights/`. Licensed CC-BY-4.0 with attribution
— see LICENSE.

Example:
    from sf_ml_baseline import SFBaseline

    model = SFBaseline()
    p_up = model.predict_direction(price_cents=55, delta_cents=3,
                                   iy=12.5, cri=0.6, cvr=0.8)
"""
from __future__ import annotations
from pathlib import Path
from typing import Iterable, Sequence
import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
# Columns read (in this order) to build the feature matrix for the V1
# direction models.
FEATURE_COLS_V1 = (
    'price_cents',
    'delta_cents',
    'iy',
    'cri',
    'cvr',
)

# Seed values that appear in the weight filenames; one model per architecture
# is loaded for each seed.
SEEDS = (
    42,
    137,
    2026,
)
class SFBaseline:
    """9-model ensemble (3 architectures × 3 seeds) for 24h direction forecasting.

    Two model families are loaded from ``weights_dir``:

    * ``v1_t1`` — direction forecast over the 5 V1 feature columns. Required;
      a load failure propagates out of ``__init__``.
    * ``v2_t4`` — resolution forecast over the feature set stored in the
      LightGBM model file. Optional; on a load failure the ``*_t4`` attributes
      and ``t4_features`` are set to ``None`` and the cause is remembered so
      that ``predict_resolution_batch`` can report it.
    """

    def __init__(self, weights_dir: str | Path | None = None):
        """Load all model weights.

        Args:
            weights_dir: Directory holding the weight files. Defaults to the
                ``weights`` directory next to this module.

        Raises:
            FileNotFoundError: If ``weights_dir`` is not an existing directory.
        """
        if weights_dir is None:
            weights_dir = Path(__file__).parent / 'weights'
        self.weights_dir = Path(weights_dir)
        if not self.weights_dir.is_dir():
            raise FileNotFoundError(f'weights dir not found: {self.weights_dir}')
        self._load_t1_models()
        self._load_t4_models()

    def _load_family(self, tag: str) -> tuple[list, list, list]:
        """Load one LightGBM, XGBoost and CatBoost model per seed for ``tag``.

        ``tag`` is the filename infix (``'v1_t1'`` or ``'v2_t4'``). Returns the
        three model lists in (LightGBM, XGBoost, CatBoost) order.
        """
        lgb_models = [
            lgb.Booster(model_file=str(self.weights_dir / f'lgbm_{tag}_seed{seed}.txt'))
            for seed in SEEDS
        ]

        xgb_models = []
        for seed in SEEDS:
            clf = xgb.XGBClassifier()
            clf.load_model(str(self.weights_dir / f'xgb_{tag}_seed{seed}.json'))
            xgb_models.append(clf)

        cat_models = []
        for seed in SEEDS:
            clf = CatBoostClassifier()
            clf.load_model(str(self.weights_dir / f'cat_{tag}_seed{seed}.cbm'))
            cat_models.append(clf)

        return lgb_models, xgb_models, cat_models

    def _load_t1_models(self):
        """V1 × T1 — direction forecast. Any load error propagates."""
        self.lgb_t1, self.xgb_t1, self.cat_t1 = self._load_family('v1_t1')

    def _load_t4_models(self):
        """V2 × T4 — resolution forecast. Requires 35 features (price + rolling stats) —
        see docs/ml/phase-a-results.md. This helper loads them but the user must
        supply the full 35-feature vector (via the training-data pipeline).

        Loading is best-effort: on any failure the T4 attributes are set to
        ``None`` and the exception is kept in ``self._t4_load_error`` instead
        of being silently discarded.
        """
        self._t4_load_error = None
        try:
            lgb_t4, xgb_t4, cat_t4 = self._load_family('v2_t4')
            t4_features = list(lgb_t4[0].feature_name())
        except Exception as exc:
            # The three libraries raise unrelated exception types, so the
            # catch stays broad; the cause is recorded for later reporting.
            self.lgb_t4 = self.xgb_t4 = self.cat_t4 = None
            self.t4_features = None
            self._t4_load_error = exc
        else:
            self.lgb_t4, self.xgb_t4, self.cat_t4 = lgb_t4, xgb_t4, cat_t4
            self.t4_features = t4_features

    @staticmethod
    def _ensemble_mean(lgb_models, xgb_models, cat_models, X: np.ndarray) -> np.ndarray:
        """Equal-weight mean of every model's positive-class score for ``X``.

        LightGBM models are queried with ``predict``; XGBoost and CatBoost
        models with ``predict_proba`` (column 1).
        """
        if X.shape[0] == 0:
            # Nothing to score; avoid handing zero-row input to the libraries.
            return np.zeros(0, dtype='float64')
        scores = [m.predict(X) for m in lgb_models]
        scores.extend(m.predict_proba(X)[:, 1] for m in (*xgb_models, *cat_models))
        return np.mean(scores, axis=0)

    # --- direction forecast (T1) ---
    def predict_direction_batch(self, df: pd.DataFrame) -> np.ndarray:
        """Return P(up-move in 24h) for each row in df.

        df must have columns: price_cents, delta_cents, iy, cri, cvr.

        Raises:
            ValueError: If any of those columns is missing.
        """
        missing = [c for c in FEATURE_COLS_V1 if c not in df.columns]
        if missing:
            raise ValueError(f'missing feature columns: {missing}')
        X = df[list(FEATURE_COLS_V1)].astype('float32').values
        return self._predict_t1(X)

    def predict_direction(
        self,
        price_cents: float,
        delta_cents: float,
        iy: float,
        cri: float,
        cvr: float,
    ) -> float:
        """Single-row prediction. Returns scalar probability."""
        X = np.array([[price_cents, delta_cents, iy, cri, cvr]], dtype='float32')
        return float(self._predict_t1(X)[0])

    def _predict_t1(self, X: np.ndarray) -> np.ndarray:
        """9-model ensemble: 3 LGBM + 3 XGB + 3 Cat, equal weight."""
        return self._ensemble_mean(self.lgb_t1, self.xgb_t1, self.cat_t1, X)

    # --- resolution forecast (T4) ---
    def predict_resolution_batch(self, df: pd.DataFrame) -> np.ndarray:
        """Return P(YES resolution in 24h) for each row in df.

        df must have all 35 V2 features (base 5 + rolling stats). See self.t4_features.

        Raises:
            RuntimeError: If the T4 weights could not be loaded; the original
                load failure, when recorded, is attached as ``__cause__``.
            ValueError: If any column listed in ``self.t4_features`` is missing.
        """
        if self.lgb_t4 is None:
            # getattr: instances created without _load_t4_models have no record.
            raise RuntimeError('T4 weights not loaded') from getattr(
                self, '_t4_load_error', None
            )
        missing = [c for c in self.t4_features if c not in df.columns]
        if missing:
            raise ValueError(f'missing V2 feature columns: {missing[:5]}... ({len(missing)} total)')
        X = df[self.t4_features].astype('float32').values
        return self._ensemble_mean(self.lgb_t4, self.xgb_t4, self.cat_t4, X)
if __name__ == '__main__':
    # Smoke test: load the default weights, then score one market and a
    # two-row batch through the direction ensemble.
    baseline = SFBaseline()

    p = baseline.predict_direction(
        price_cents=55,
        delta_cents=3,
        iy=12.5,
        cri=0.6,
        cvr=0.8,
    )
    print(f'P(price rises in 24h | market at 55c, +3c delta, iy=12.5%, cri=0.6, cvr=0.8) = {p:.3f}')

    sample_rows = [
        {'price_cents': 55, 'delta_cents': 3, 'iy': 12.5, 'cri': 0.6, 'cvr': 0.8},
        {'price_cents': 82, 'delta_cents': -1, 'iy': 4.5, 'cri': 0.3, 'cvr': 0.9},
    ]
    sample_frame = pd.DataFrame(sample_rows)
    p_batch = baseline.predict_direction_batch(sample_frame)
    print(f'Batch predictions: {p_batch}')