# Provenance: nananie143 — "feat: Add 1002 real data features - no dummy values" (commit 8366ee8, verified)
"""
Stacking Ensemble Model
=======================
Implements stacking ensemble that combines multiple base models using a meta-learner.
"""
import numpy as np
import pandas as pd
from typing import Dict, List, Optional, Any, Tuple
from dataclasses import dataclass
import logging
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import joblib
from pathlib import Path
logger = logging.getLogger(__name__)
@dataclass
class StackingConfig:
    """Configuration for stacking ensemble."""

    # Number of CV folds used to generate out-of-fold meta-features.
    n_folds: int = 5
    # Append the original feature matrix to the meta-features.
    use_features_in_meta: bool = True
    # Meta-learner type: 'logistic', 'ridge', 'rf', or 'gbm'.
    meta_model_type: str = 'logistic'
    # Seed shared by the CV splitter and the meta-model.
    random_state: int = 42
class StackingEnsemble:
    """
    Stacking ensemble that combines multiple base models.

    Out-of-fold (OOF) predictions from the base models are used as
    meta-features for a meta-learner, which makes the final prediction.
    """

    def __init__(self, config: Optional["StackingConfig"] = None):
        """Initialize an unfitted ensemble.

        Args:
            config: Stacking configuration; a default StackingConfig is
                created when None.
        """
        self.config = config or StackingConfig()
        # Maps model name -> {'model': estimator, 'fitted': bool}.
        self.base_models: Dict[str, Dict[str, Any]] = {}
        self.meta_model: Any = None
        self.is_fitted = False
        self.feature_names: List[str] = []

    def add_base_model(self, name: str, model: Any) -> None:
        """Register a base model (any estimator with fit/predict) under *name*."""
        self.base_models[name] = {
            'model': model,
            'fitted': False
        }
        logger.info(f"Added base model: {name}")

    def _create_meta_model(self):
        """Create the meta-learner selected by config.meta_model_type."""
        if self.config.meta_model_type == 'logistic':
            return LogisticRegression(
                max_iter=1000,
                random_state=self.config.random_state
            )
        elif self.config.meta_model_type == 'ridge':
            return Ridge(alpha=1.0, random_state=self.config.random_state)
        elif self.config.meta_model_type == 'rf':
            return RandomForestClassifier(
                n_estimators=100,
                max_depth=5,
                random_state=self.config.random_state
            )
        elif self.config.meta_model_type == 'gbm':
            return GradientBoostingClassifier(
                n_estimators=100,
                max_depth=3,
                random_state=self.config.random_state
            )
        else:
            # Unknown type: fall back to a plain logistic regression.
            return LogisticRegression(max_iter=1000)

    def _get_oof_predictions(
        self,
        X: np.ndarray,
        y: np.ndarray
    ) -> np.ndarray:
        """
        Get out-of-fold predictions from all base models.

        Each model contributes its predict_proba columns (one per class) or,
        when it lacks predict_proba, a single predict() column.

        Returns:
            Array of shape (n_samples, total_meta_columns); the original
            features are appended when config.use_features_in_meta is set.
        """
        oof_predictions = []
        kfold = KFold(
            n_splits=self.config.n_folds,
            shuffle=True,
            random_state=self.config.random_state
        )
        for name, model_info in self.base_models.items():
            model = model_info['model']
            if hasattr(model, 'predict_proba'):
                oof_pred = cross_val_predict(
                    model, X, y,
                    cv=kfold,
                    method='predict_proba'
                )
            else:
                # Hard predictions become a single meta-feature column.
                oof_pred = cross_val_predict(
                    model, X, y,
                    cv=kfold
                ).reshape(-1, 1)
            oof_predictions.append(oof_pred)
            logger.info(f"Generated OOF predictions for {name}: shape {oof_pred.shape}")
        stacked = np.hstack(oof_predictions)
        if self.config.use_features_in_meta:
            stacked = np.hstack([stacked, X])
        return stacked

    def fit(
        self,
        X: np.ndarray,
        y: np.ndarray,
        feature_names: Optional[List[str]] = None
    ) -> 'StackingEnsemble':
        """
        Fit the stacking ensemble.

        Args:
            X: Training features.
            y: Training labels.
            feature_names: Names of the features; auto-generated when None.

        Returns:
            self, for chaining.

        Raises:
            ValueError: If no base models have been added.
        """
        if len(self.base_models) == 0:
            raise ValueError("No base models added. Use add_base_model() first.")
        self.feature_names = feature_names or [f"f_{i}" for i in range(X.shape[1])]
        logger.info(f"Fitting stacking ensemble with {len(self.base_models)} base models")
        # Step 1: Get OOF predictions for meta-features.
        meta_features = self._get_oof_predictions(X, y)
        logger.info(f"Meta features shape: {meta_features.shape}")
        # Step 2: Refit all base models on the full training data.
        for name, model_info in self.base_models.items():
            model_info['model'].fit(X, y)
            model_info['fitted'] = True
            logger.info(f"Fitted base model: {name}")
        # Step 3: Fit the meta-model on the OOF meta-features.
        self.meta_model = self._create_meta_model()
        self.meta_model.fit(meta_features, y)
        self.is_fitted = True
        logger.info("Stacking ensemble fitted successfully")
        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict class labels.

        Raises:
            ValueError: If the ensemble has not been fitted.
        """
        if not self.is_fitted:
            raise ValueError("Model not fitted. Call fit() first.")
        meta_features = self._get_meta_features(X)
        return self.meta_model.predict(meta_features)

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Predict class probabilities of shape (n_samples, n_classes).

        Raises:
            ValueError: If the ensemble has not been fitted.
        """
        if not self.is_fitted:
            raise ValueError("Model not fitted. Call fit() first.")
        meta_features = self._get_meta_features(X)
        if hasattr(self.meta_model, 'predict_proba'):
            return self.meta_model.predict_proba(meta_features)
        # Regression-based meta-models (e.g. Ridge): interpret the raw
        # prediction as P(positive class). Clip to [0, 1] because a linear
        # regressor's output is unbounded and would otherwise yield
        # probabilities outside the valid range.
        pred = np.clip(self.meta_model.predict(meta_features), 0.0, 1.0)
        return np.column_stack([1 - pred, pred])

    def _get_meta_features(self, X: np.ndarray) -> np.ndarray:
        """Build meta-features from the fitted base models' predictions.

        Mirrors the column layout produced by _get_oof_predictions.
        """
        base_predictions = []
        for name, model_info in self.base_models.items():
            model = model_info['model']
            if hasattr(model, 'predict_proba'):
                pred = model.predict_proba(X)
            else:
                pred = model.predict(X).reshape(-1, 1)
            base_predictions.append(pred)
        stacked = np.hstack(base_predictions)
        if self.config.use_features_in_meta:
            stacked = np.hstack([stacked, X])
        return stacked

    def predict_match(
        self,
        home_features: Dict[str, float],
        away_features: Dict[str, float]
    ) -> Dict[str, float]:
        """
        Predict match outcome.

        Feature values are looked up by stripping the 'home_'/'away_'
        prefix from each trained feature name; missing keys default to 0.

        Args:
            home_features: Features for home team.
            away_features: Features for away team.

        Returns:
            Probabilities for home/draw/away (3-class model), or
            home_win/not_home_win (binary model).
        """
        features = []
        for fname in self.feature_names:
            if fname.startswith('home_'):
                key = fname.replace('home_', '')
                features.append(home_features.get(key, 0))
            elif fname.startswith('away_'):
                key = fname.replace('away_', '')
                features.append(away_features.get(key, 0))
            else:
                # Unprefixed feature: prefer the home-side value.
                features.append(home_features.get(fname, away_features.get(fname, 0)))
        X = np.array(features).reshape(1, -1)
        probs = self.predict_proba(X)[0]
        if len(probs) == 3:
            return {
                'home_win': float(probs[0]),
                'draw': float(probs[1]),
                'away_win': float(probs[2])
            }
        else:
            return {
                'home_win': float(probs[1]),
                'not_home_win': float(probs[0])
            }

    def save(self, path: str) -> None:
        """Serialize the ensemble (models + config + state) to *path* via joblib."""
        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)
        save_data = {
            'config': self.config,
            'base_models': self.base_models,
            'meta_model': self.meta_model,
            'feature_names': self.feature_names,
            'is_fitted': self.is_fitted
        }
        joblib.dump(save_data, path)
        logger.info(f"Saved stacking ensemble to {path}")

    @classmethod
    def load(cls, path: str) -> 'StackingEnsemble':
        """Load an ensemble previously written by save()."""
        save_data = joblib.load(path)
        ensemble = cls(config=save_data['config'])
        ensemble.base_models = save_data['base_models']
        ensemble.meta_model = save_data['meta_model']
        ensemble.feature_names = save_data['feature_names']
        ensemble.is_fitted = save_data['is_fitted']
        logger.info(f"Loaded stacking ensemble from {path}")
        return ensemble

    def _meta_block_width(self, model: Any) -> int:
        """Number of meta-feature columns contributed by one base model."""
        if hasattr(model, 'predict_proba'):
            # Probabilistic models contribute one column per class.
            classes = getattr(model, 'classes_', None)
            if classes is not None:
                return len(classes)
        # Models without predict_proba contribute a single predict() column.
        return 1

    def get_feature_importance(self) -> Dict[str, float]:
        """Aggregate meta-model weights per base model and per original feature.

        Returns:
            Mapping with one 'base_<name>' entry per base model, plus one
            entry per original feature when config.use_features_in_meta is
            set. Empty when unfitted or the meta-model exposes no weights.
        """
        if not self.is_fitted:
            return {}
        if hasattr(self.meta_model, 'coef_'):
            coefs = self.meta_model.coef_
            # Multiclass coef_ is (n_classes, n_features): average over classes.
            weights = np.abs(coefs).mean(axis=0) if coefs.ndim > 1 else np.abs(coefs)
        elif hasattr(self.meta_model, 'feature_importances_'):
            weights = self.meta_model.feature_importances_
        else:
            return {}
        importance: Dict[str, float] = {}
        idx = 0
        for name, model_info in self.base_models.items():
            # Use each model's actual meta-feature width rather than assuming
            # three columns per model (wrong for binary or non-proba models).
            width = self._meta_block_width(model_info['model'])
            importance[f"base_{name}"] = float(weights[idx:idx + width].sum())
            idx += width
        if self.config.use_features_in_meta:
            for i, fname in enumerate(self.feature_names):
                if idx + i < len(weights):
                    importance[fname] = float(weights[idx + i])
        return importance
# Module-level singleton instance.
_ensemble: Optional[StackingEnsemble] = None


def get_ensemble() -> StackingEnsemble:
    """Return the shared StackingEnsemble, creating it lazily on first use."""
    global _ensemble
    if _ensemble is None:
        _ensemble = StackingEnsemble()
    return _ensemble
def create_default_ensemble() -> StackingEnsemble:
    """Build a StackingEnsemble pre-populated with four standard base models."""
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
    from sklearn.naive_bayes import GaussianNB

    config = StackingConfig(
        n_folds=5,
        use_features_in_meta=True,
        meta_model_type='logistic'
    )
    ensemble = StackingEnsemble(config)

    # Diverse base learners: linear, bagged trees, boosted trees, naive Bayes.
    defaults = {
        'logistic': LogisticRegression(max_iter=1000),
        'rf': RandomForestClassifier(n_estimators=100, max_depth=10),
        'gbm': GradientBoostingClassifier(n_estimators=100, max_depth=5),
        'nb': GaussianNB(),
    }
    for model_name, estimator in defaults.items():
        ensemble.add_base_model(model_name, estimator)
    return ensemble