# Provenance: nananie143 — "feat: Add 1002 real data features - no dummy values" (commit 8366ee8, verified)
"""
Stacking Ensemble Model
=======================
Implements stacking ensemble that combines multiple base models using a meta-learner.
"""
import numpy as np
import pandas as pd
from typing import Dict, List, Optional, Any, Tuple
from dataclasses import dataclass
import logging
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import joblib
from pathlib import Path
logger = logging.getLogger(__name__)
@dataclass
class StackingConfig:
    """Configuration for stacking ensemble."""

    # Number of CV folds used to generate out-of-fold meta-features.
    n_folds: int = 5
    # Append the original feature matrix to the meta-features.
    use_features_in_meta: bool = True
    # Meta-learner type: 'logistic', 'ridge', 'rf', or 'gbm'.
    meta_model_type: str = 'logistic'
    # Seed shared by the CV splitter and the meta-model.
    random_state: int = 42
class StackingEnsemble:
    """
    Stacking ensemble that combines multiple base models.

    Out-of-fold (OOF) predictions from the base models are used as
    meta-features for a meta-learner, which makes the final prediction.
    """

    def __init__(self, config: Optional["StackingConfig"] = None):
        """Initialize an unfitted ensemble.

        Args:
            config: Stacking configuration; a default StackingConfig is
                created when None.
        """
        self.config = config or StackingConfig()
        # Maps model name -> {'model': estimator, 'fitted': bool}.
        self.base_models: Dict[str, Dict[str, Any]] = {}
        self.meta_model: Any = None
        self.is_fitted = False
        self.feature_names: List[str] = []

    def add_base_model(self, name: str, model: Any) -> None:
        """Register a base model (any estimator with fit/predict) under *name*."""
        self.base_models[name] = {
            'model': model,
            'fitted': False
        }
        logger.info(f"Added base model: {name}")

    def _create_meta_model(self):
        """Create the meta-learner selected by config.meta_model_type."""
        if self.config.meta_model_type == 'logistic':
            return LogisticRegression(
                max_iter=1000,
                random_state=self.config.random_state
            )
        elif self.config.meta_model_type == 'ridge':
            return Ridge(alpha=1.0, random_state=self.config.random_state)
        elif self.config.meta_model_type == 'rf':
            return RandomForestClassifier(
                n_estimators=100,
                max_depth=5,
                random_state=self.config.random_state
            )
        elif self.config.meta_model_type == 'gbm':
            return GradientBoostingClassifier(
                n_estimators=100,
                max_depth=3,
                random_state=self.config.random_state
            )
        else:
            # Unknown type: fall back to a plain logistic regression.
            return LogisticRegression(max_iter=1000)

    def _get_oof_predictions(
        self,
        X: np.ndarray,
        y: np.ndarray
    ) -> np.ndarray:
        """
        Get out-of-fold predictions from all base models.

        Each model contributes its predict_proba columns (one per class) or,
        when it lacks predict_proba, a single predict() column.

        Returns:
            Array of shape (n_samples, total_meta_columns); the original
            features are appended when config.use_features_in_meta is set.
        """
        oof_predictions = []
        kfold = KFold(
            n_splits=self.config.n_folds,
            shuffle=True,
            random_state=self.config.random_state
        )
        for name, model_info in self.base_models.items():
            model = model_info['model']
            if hasattr(model, 'predict_proba'):
                oof_pred = cross_val_predict(
                    model, X, y,
                    cv=kfold,
                    method='predict_proba'
                )
            else:
                # Hard predictions become a single meta-feature column.
                oof_pred = cross_val_predict(
                    model, X, y,
                    cv=kfold
                ).reshape(-1, 1)
            oof_predictions.append(oof_pred)
            logger.info(f"Generated OOF predictions for {name}: shape {oof_pred.shape}")
        stacked = np.hstack(oof_predictions)
        if self.config.use_features_in_meta:
            stacked = np.hstack([stacked, X])
        return stacked

    def fit(
        self,
        X: np.ndarray,
        y: np.ndarray,
        feature_names: Optional[List[str]] = None
    ) -> 'StackingEnsemble':
        """
        Fit the stacking ensemble.

        Args:
            X: Training features.
            y: Training labels.
            feature_names: Names of the features; auto-generated when None.

        Returns:
            self, for chaining.

        Raises:
            ValueError: If no base models have been added.
        """
        if len(self.base_models) == 0:
            raise ValueError("No base models added. Use add_base_model() first.")
        self.feature_names = feature_names or [f"f_{i}" for i in range(X.shape[1])]
        logger.info(f"Fitting stacking ensemble with {len(self.base_models)} base models")
        # Step 1: Get OOF predictions for meta-features.
        meta_features = self._get_oof_predictions(X, y)
        logger.info(f"Meta features shape: {meta_features.shape}")
        # Step 2: Refit all base models on the full training data.
        for name, model_info in self.base_models.items():
            model_info['model'].fit(X, y)
            model_info['fitted'] = True
            logger.info(f"Fitted base model: {name}")
        # Step 3: Fit the meta-model on the OOF meta-features.
        self.meta_model = self._create_meta_model()
        self.meta_model.fit(meta_features, y)
        self.is_fitted = True
        logger.info("Stacking ensemble fitted successfully")
        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict class labels.

        Raises:
            ValueError: If the ensemble has not been fitted.
        """
        if not self.is_fitted:
            raise ValueError("Model not fitted. Call fit() first.")
        meta_features = self._get_meta_features(X)
        return self.meta_model.predict(meta_features)

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Predict class probabilities of shape (n_samples, n_classes).

        Raises:
            ValueError: If the ensemble has not been fitted.
        """
        if not self.is_fitted:
            raise ValueError("Model not fitted. Call fit() first.")
        meta_features = self._get_meta_features(X)
        if hasattr(self.meta_model, 'predict_proba'):
            return self.meta_model.predict_proba(meta_features)
        # Regression-based meta-models (e.g. Ridge): interpret the raw
        # prediction as P(positive class). Clip to [0, 1] because a linear
        # regressor's output is unbounded and would otherwise yield
        # probabilities outside the valid range.
        pred = np.clip(self.meta_model.predict(meta_features), 0.0, 1.0)
        return np.column_stack([1 - pred, pred])

    def _get_meta_features(self, X: np.ndarray) -> np.ndarray:
        """Build meta-features from the fitted base models' predictions.

        Mirrors the column layout produced by _get_oof_predictions.
        """
        base_predictions = []
        for name, model_info in self.base_models.items():
            model = model_info['model']
            if hasattr(model, 'predict_proba'):
                pred = model.predict_proba(X)
            else:
                pred = model.predict(X).reshape(-1, 1)
            base_predictions.append(pred)
        stacked = np.hstack(base_predictions)
        if self.config.use_features_in_meta:
            stacked = np.hstack([stacked, X])
        return stacked

    def predict_match(
        self,
        home_features: Dict[str, float],
        away_features: Dict[str, float]
    ) -> Dict[str, float]:
        """
        Predict match outcome.

        Feature values are looked up by stripping the 'home_'/'away_'
        prefix from each trained feature name; missing keys default to 0.

        Args:
            home_features: Features for home team.
            away_features: Features for away team.

        Returns:
            Probabilities for home/draw/away (3-class model), or
            home_win/not_home_win (binary model).
        """
        features = []
        for fname in self.feature_names:
            if fname.startswith('home_'):
                key = fname.replace('home_', '')
                features.append(home_features.get(key, 0))
            elif fname.startswith('away_'):
                key = fname.replace('away_', '')
                features.append(away_features.get(key, 0))
            else:
                # Unprefixed feature: prefer the home-side value.
                features.append(home_features.get(fname, away_features.get(fname, 0)))
        X = np.array(features).reshape(1, -1)
        probs = self.predict_proba(X)[0]
        if len(probs) == 3:
            return {
                'home_win': float(probs[0]),
                'draw': float(probs[1]),
                'away_win': float(probs[2])
            }
        else:
            return {
                'home_win': float(probs[1]),
                'not_home_win': float(probs[0])
            }

    def save(self, path: str) -> None:
        """Serialize the ensemble (models + config + state) to *path* via joblib."""
        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)
        save_data = {
            'config': self.config,
            'base_models': self.base_models,
            'meta_model': self.meta_model,
            'feature_names': self.feature_names,
            'is_fitted': self.is_fitted
        }
        joblib.dump(save_data, path)
        logger.info(f"Saved stacking ensemble to {path}")

    @classmethod
    def load(cls, path: str) -> 'StackingEnsemble':
        """Load an ensemble previously written by save()."""
        save_data = joblib.load(path)
        ensemble = cls(config=save_data['config'])
        ensemble.base_models = save_data['base_models']
        ensemble.meta_model = save_data['meta_model']
        ensemble.feature_names = save_data['feature_names']
        ensemble.is_fitted = save_data['is_fitted']
        logger.info(f"Loaded stacking ensemble from {path}")
        return ensemble

    def _meta_block_width(self, model: Any) -> int:
        """Number of meta-feature columns contributed by one base model."""
        if hasattr(model, 'predict_proba'):
            # Probabilistic models contribute one column per class.
            classes = getattr(model, 'classes_', None)
            if classes is not None:
                return len(classes)
        # Models without predict_proba contribute a single predict() column.
        return 1

    def get_feature_importance(self) -> Dict[str, float]:
        """Aggregate meta-model weights per base model and per original feature.

        Returns:
            Mapping with one 'base_<name>' entry per base model, plus one
            entry per original feature when config.use_features_in_meta is
            set. Empty when unfitted or the meta-model exposes no weights.
        """
        if not self.is_fitted:
            return {}
        if hasattr(self.meta_model, 'coef_'):
            coefs = self.meta_model.coef_
            # Multiclass coef_ is (n_classes, n_features): average over classes.
            weights = np.abs(coefs).mean(axis=0) if coefs.ndim > 1 else np.abs(coefs)
        elif hasattr(self.meta_model, 'feature_importances_'):
            weights = self.meta_model.feature_importances_
        else:
            return {}
        importance: Dict[str, float] = {}
        idx = 0
        for name, model_info in self.base_models.items():
            # Use each model's actual meta-feature width rather than assuming
            # three columns per model (wrong for binary or non-proba models).
            width = self._meta_block_width(model_info['model'])
            importance[f"base_{name}"] = float(weights[idx:idx + width].sum())
            idx += width
        if self.config.use_features_in_meta:
            for i, fname in enumerate(self.feature_names):
                if idx + i < len(weights):
                    importance[fname] = float(weights[idx + i])
        return importance
# Module-level singleton instance.
_ensemble: Optional[StackingEnsemble] = None


def get_ensemble() -> StackingEnsemble:
    """Return the shared StackingEnsemble, creating it lazily on first use."""
    global _ensemble
    if _ensemble is None:
        _ensemble = StackingEnsemble()
    return _ensemble
def create_default_ensemble() -> StackingEnsemble:
    """Build a StackingEnsemble pre-populated with four standard base models."""
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
    from sklearn.naive_bayes import GaussianNB

    config = StackingConfig(
        n_folds=5,
        use_features_in_meta=True,
        meta_model_type='logistic'
    )
    ensemble = StackingEnsemble(config)

    # Diverse base learners: linear, bagged trees, boosted trees, naive Bayes.
    defaults = {
        'logistic': LogisticRegression(max_iter=1000),
        'rf': RandomForestClassifier(n_estimators=100, max_depth=10),
        'gbm': GradientBoostingClassifier(n_estimators=100, max_depth=5),
        'nb': GaussianNB(),
    }
    for model_name, estimator in defaults.items():
        ensemble.add_base_model(model_name, estimator)
    return ensemble