sf_ml_baseline.py · SimpleFunctions/sf-ml-baseline at main

sf-ml-baseline / sf_ml_baseline.py

Initial release: sf-ml-baseline v0.1 (LightGBM + XGBoost + CatBoost ensemble)

4a2ce27 verified 2 months ago

5.67 kB

	"""
	sf-ml-baseline v0.1 — inference helpers.

	Load pre-trained LightGBM + XGBoost + CatBoost ensembles and predict
	24h prediction-market outcomes from 5 engineered indicator features.

	Model weights are in `./weights/`. Licensed CC-BY-4.0 with attribution
	— see LICENSE.

	Example:
	    from sf_ml_baseline import SFBaseline
	    model = SFBaseline()
	    p_up = model.predict_direction(price_cents=55, delta_cents=3,
	                                   iy=12.5, cri=0.6, cvr=0.8)
	"""

	from __future__ import annotations
	from pathlib import Path
	from typing import Iterable, Sequence
	import numpy as np
	import pandas as pd
	import lightgbm as lgb
	import xgboost as xgb
	from catboost import CatBoostClassifier

	# Column names of the five V1 indicator features, in the order the T1 models
	# receive them (the batch predictor selects DataFrame columns in this order).
	FEATURE_COLS_V1 = (
	    'price_cents',
	    'delta_cents',
	    'iy',
	    'cri',
	    'cvr',
	)

	# Seed suffixes used in the weight file names; one model per architecture per seed.
	SEEDS = (
	    42,
	    137,
	    2026,
	)

	class SFBaseline:
	    """9-model ensemble (3 architectures × 3 seeds) for 24h direction forecasting.

	    On construction the T1 (direction) models are loaded and must be present;
	    the T4 (resolution) models are loaded on a best-effort basis.

	    Attributes:
	        weights_dir: Directory the weight files are read from.
	        lgb_t1, xgb_t1, cat_t1: Lists of loaded T1 models, one per seed.
	        lgb_t4, xgb_t4, cat_t4: Lists of loaded T4 models, or ``None`` when
	            loading the T4 weights failed.
	        t4_features: Feature names reported by the first T4 LightGBM booster,
	            or ``None`` when the T4 weights are not loaded.
	    """

	    def __init__(self, weights_dir: str | Path | None = None):
	        """Load all model weights.

	        Args:
	            weights_dir: Directory containing the weight files. Defaults to the
	                ``weights`` directory next to this module.

	        Raises:
	            FileNotFoundError: If ``weights_dir`` is not an existing directory.
	        """
	        if weights_dir is None:
	            weights_dir = Path(__file__).parent / 'weights'
	        self.weights_dir = Path(weights_dir)
	        if not self.weights_dir.is_dir():
	            raise FileNotFoundError(f'weights dir not found: {self.weights_dir}')
	        self._load_t1_models()
	        self._load_t4_models()

	    def _load_family(self, tag: str) -> tuple[list, list, list]:
	        """Load the LightGBM, XGBoost and CatBoost models for one weight tag.

	        Args:
	            tag: Middle part of the weight file names, e.g. ``'v1_t1'``.

	        Returns:
	            ``(lgb_models, xgb_models, cat_models)``, each with one model per
	            entry of ``SEEDS``.
	        """
	        lgb_models = [
	            lgb.Booster(model_file=str(self.weights_dir / f'lgbm_{tag}_seed{s}.txt'))
	            for s in SEEDS
	        ]
	        xgb_models = []
	        for s in SEEDS:
	            m = xgb.XGBClassifier()
	            m.load_model(str(self.weights_dir / f'xgb_{tag}_seed{s}.json'))
	            xgb_models.append(m)
	        cat_models = []
	        for s in SEEDS:
	            m = CatBoostClassifier()
	            m.load_model(str(self.weights_dir / f'cat_{tag}_seed{s}.cbm'))
	            cat_models.append(m)
	        return lgb_models, xgb_models, cat_models

	    def _load_t1_models(self):
	        """V1 × T1 — direction forecast. Any load error propagates to the caller."""
	        self.lgb_t1, self.xgb_t1, self.cat_t1 = self._load_family('v1_t1')

	    def _load_t4_models(self):
	        """V2 × T4 — resolution forecast. Requires 35 features (price + rolling stats) —
	        see docs/ml/phase-a-results.md. This helper loads them but the user must
	        supply the full 35-feature vector (via the training-data pipeline).

	        Loading is best-effort: on any failure the T4 attributes are set to
	        ``None`` and the exception is kept in ``self._t4_load_error`` so that
	        ``predict_resolution_batch`` can report why the models are unavailable.
	        """
	        self._t4_load_error = None
	        try:
	            lgb_t4, xgb_t4, cat_t4 = self._load_family('v2_t4')
	            t4_features = list(lgb_t4[0].feature_name())
	        except Exception as exc:
	            # Deliberately broad: T4 is optional, so a missing or unreadable
	            # weight file must not prevent T1 inference. Keep the reason
	            # instead of discarding it.
	            self.lgb_t4 = self.xgb_t4 = self.cat_t4 = None
	            self.t4_features = None
	            self._t4_load_error = exc
	        else:
	            # Assign only after everything loaded, so the attributes are
	            # never left half-populated.
	            self.lgb_t4, self.xgb_t4, self.cat_t4 = lgb_t4, xgb_t4, cat_t4
	            self.t4_features = t4_features

	    @staticmethod
	    def _ensemble_mean(lgb_models, xgb_models, cat_models, X: np.ndarray) -> np.ndarray:
	        """Equal-weight average of every model's positive-class score for ``X``.

	        LightGBM boosters contribute ``predict(X)``; the XGBoost and CatBoost
	        classifiers contribute column 1 of ``predict_proba(X)``.
	        """
	        preds = [m.predict(X) for m in lgb_models]
	        preds.extend(m.predict_proba(X)[:, 1] for m in xgb_models)
	        preds.extend(m.predict_proba(X)[:, 1] for m in cat_models)
	        return np.mean(preds, axis=0)

	    # --- direction forecast (T1) ---

	    def predict_direction_batch(self, df: pd.DataFrame) -> np.ndarray:
	        """Return P(up-move in 24h) for each row in df.

	        Args:
	            df: Must have columns price_cents, delta_cents, iy, cri, cvr.
	                Extra columns are ignored.

	        Raises:
	            ValueError: If any of the required columns is missing.
	        """
	        missing = [c for c in FEATURE_COLS_V1 if c not in df.columns]
	        if missing:
	            raise ValueError(f'missing feature columns: {missing}')
	        # Select by name so the column order always matches FEATURE_COLS_V1.
	        X = df[list(FEATURE_COLS_V1)].astype('float32').values
	        return self._predict_t1(X)

	    def predict_direction(
	        self,
	        price_cents: float,
	        delta_cents: float,
	        iy: float,
	        cri: float,
	        cvr: float,
	    ) -> float:
	        """Single-row prediction. Returns scalar probability."""
	        X = np.array([[price_cents, delta_cents, iy, cri, cvr]], dtype='float32')
	        return float(self._predict_t1(X)[0])

	    def _predict_t1(self, X: np.ndarray) -> np.ndarray:
	        """9-model ensemble: 3 LGBM + 3 XGB + 3 Cat, equal weight."""
	        return self._ensemble_mean(self.lgb_t1, self.xgb_t1, self.cat_t1, X)

	    # --- resolution forecast (T4) ---

	    def predict_resolution_batch(self, df: pd.DataFrame) -> np.ndarray:
	        """Return P(YES resolution in 24h) for each row in df.

	        Args:
	            df: Must have all 35 V2 features (base 5 + rolling stats).
	                See self.t4_features.

	        Raises:
	            RuntimeError: If the T4 weights were not loaded; the original
	                load failure, when recorded, is attached as ``__cause__``.
	            ValueError: If any column named in ``self.t4_features`` is missing.
	        """
	        if self.lgb_t4 is None:
	            # getattr: instances created without _load_t4_models have no record.
	            cause = getattr(self, '_t4_load_error', None)
	            raise RuntimeError('T4 weights not loaded') from cause
	        missing = [c for c in self.t4_features if c not in df.columns]
	        if missing:
	            raise ValueError(f'missing V2 feature columns: {missing[:5]}... ({len(missing)} total)')
	        # Select by name so the column order matches what the boosters report.
	        X = df[self.t4_features].astype('float32').values
	        return self._ensemble_mean(self.lgb_t4, self.xgb_t4, self.cat_t4, X)


	if __name__ == '__main__':
	    # Smoke test: load the default weights, then run one single-row and one
	    # two-row direction prediction and print the results.
	    model = SFBaseline()
	    p = model.predict_direction(price_cents=55, delta_cents=3, iy=12.5, cri=0.6, cvr=0.8)
	    # Plain '|' here: the previous '\|' was an invalid escape sequence.
	    print(f'P(price rises in 24h | market at 55c, +3c delta, iy=12.5%, cri=0.6, cvr=0.8) = {p:.3f}')
	    p_batch = model.predict_direction_batch(pd.DataFrame([
	        {'price_cents': 55, 'delta_cents': 3, 'iy': 12.5, 'cri': 0.6, 'cvr': 0.8},
	        {'price_cents': 82, 'delta_cents': -1, 'iy': 4.5, 'cri': 0.3, 'cvr': 0.9},
	    ]))
	    print(f'Batch predictions: {p_batch}')