Spaces:

nananie143
/

footypredict-pro

Runtime error

App Files Files Community

footypredict-pro / src /models /stacking_ensemble.py

nananie143

Deploy advanced models with XGBoost/LightGBM

246a547 verified about 1 month ago

raw

history blame contribute delete

11.8 kB

	#!/usr/bin/env python3
	"""
	Stacking Ensemble with Meta-Learner for Football Predictions

	Combines XGBoost, LightGBM, CatBoost, and Neural Network predictions
	using a meta-learner to improve overall accuracy.
	"""

	import os
	import sys
	import json
	import numpy as np
	import pandas as pd
	import pickle
	import logging
	from pathlib import Path
	from typing import Dict, List, Tuple, Optional

	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	# Paths
	PROJECT_ROOT = Path(__file__).parent.parent.parent
	MODELS_DIR = PROJECT_ROOT / "models" / "trained"


	class StackingEnsemble:
	"""Stacking ensemble combining multiple models with a meta-learner."""

	def __init__(self):
	self.base_models = {}
	self.meta_learner = None
	self.scaler = None
	self.team_encoder = None
	self.feature_cols = None
	self.is_loaded = False

	def load_models(self):
	"""Load all base models and meta-learner."""
	try:
	# Load XGBoost
	xgb_path = MODELS_DIR / "xgb_football.json"
	if xgb_path.exists():
	import xgboost as xgb
	self.base_models['xgb'] = xgb.XGBClassifier()
	self.base_models['xgb'].load_model(str(xgb_path))
	logger.info("✅ XGBoost loaded")

	# Load LightGBM
	lgb_path = MODELS_DIR / "lgb_football.txt"
	if lgb_path.exists():
	import lightgbm as lgb
	self.base_models['lgb'] = lgb.Booster(model_file=str(lgb_path))
	logger.info("✅ LightGBM loaded")

	# Load CatBoost
	cat_path = MODELS_DIR / "cat_football.cbm"
	if cat_path.exists():
	from catboost import CatBoostClassifier
	self.base_models['cat'] = CatBoostClassifier()
	self.base_models['cat'].load_model(str(cat_path))
	logger.info("✅ CatBoost loaded")

	# Load Neural Network
	nn_path = MODELS_DIR / "nn_football.pt"
	if nn_path.exists():
	import torch
	import torch.nn as nn

	# Get feature count from feature_cols
	fc_path = MODELS_DIR / "feature_cols.json"
	if fc_path.exists():
	with open(fc_path, 'r') as f:
	self.feature_cols = json.load(f)
	input_dim = len(self.feature_cols)
	else:
	input_dim = 153 # Default

	class FootballNet(nn.Module):
	def __init__(self, input_dim, num_classes=3):
	super().__init__()
	self.net = nn.Sequential(
	nn.Linear(input_dim, 256),
	nn.BatchNorm1d(256),
	nn.ReLU(),
	nn.Dropout(0.4),
	nn.Linear(256, 128),
	nn.BatchNorm1d(128),
	nn.ReLU(),
	nn.Dropout(0.3),
	nn.Linear(128, 64),
	nn.BatchNorm1d(64),
	nn.ReLU(),
	nn.Dropout(0.2),
	nn.Linear(64, num_classes)
	)
	def forward(self, x):
	return self.net(x)

	model = FootballNet(input_dim)
	model.load_state_dict(torch.load(str(nn_path), map_location='cpu'))
	model.eval()
	self.base_models['nn'] = model
	logger.info("✅ Neural Network loaded")

	# Load scaler
	scaler_path = MODELS_DIR / "scaler.pkl"
	if scaler_path.exists():
	with open(scaler_path, 'rb') as f:
	self.scaler = pickle.load(f)
	logger.info("✅ Scaler loaded")

	# Load team encoder
	encoder_path = MODELS_DIR / "team_encoder.pkl"
	if encoder_path.exists():
	with open(encoder_path, 'rb') as f:
	self.team_encoder = pickle.load(f)
	logger.info("✅ Team encoder loaded")

	# Load or create meta-learner
	meta_path = MODELS_DIR / "meta_learner.pkl"
	if meta_path.exists():
	with open(meta_path, 'rb') as f:
	self.meta_learner = pickle.load(f)
	logger.info("✅ Meta-learner loaded")
	else:
	# Create simple averaging meta-learner
	self.meta_learner = 'average'
	logger.info("ℹ️ Using averaging meta-learner (no trained meta-learner found)")

	self.is_loaded = len(self.base_models) > 0
	logger.info(f"📊 Loaded {len(self.base_models)} base models")

	return self.is_loaded

	except Exception as e:
	logger.error(f"Error loading models: {e}")
	return False

	def get_base_predictions(self, features: np.ndarray) -> Dict[str, np.ndarray]:
	"""Get predictions from all base models."""
	predictions = {}

	# Ensure 2D input
	if len(features.shape) == 1:
	features = features.reshape(1, -1)

	try:
	# XGBoost
	if 'xgb' in self.base_models:
	pred = self.base_models['xgb'].predict_proba(features)
	predictions['xgb'] = pred

	# LightGBM
	if 'lgb' in self.base_models:
	pred = self.base_models['lgb'].predict(features)
	# Reshape to (n_samples, n_classes)
	if len(pred.shape) == 1:
	pred = np.column_stack([
	1 - pred.sum(axis=-1) if pred.ndim > 1 else pred,
	pred
	])
	predictions['lgb'] = pred

	# CatBoost
	if 'cat' in self.base_models:
	pred = self.base_models['cat'].predict_proba(features)
	predictions['cat'] = pred

	# Neural Network
	if 'nn' in self.base_models:
	import torch
	import torch.nn.functional as F

	with torch.no_grad():
	x = torch.FloatTensor(features)
	logits = self.base_models['nn'](x)
	pred = F.softmax(logits, dim=1).numpy()
	predictions['nn'] = pred

	except Exception as e:
	logger.error(f"Error getting base predictions: {e}")

	return predictions

	def ensemble_predict(self, features: np.ndarray,
	weights: Optional[Dict[str, float]] = None) -> Tuple[np.ndarray, float]:
	"""
	Get ensemble prediction with confidence.

	Returns:
	Tuple of (predicted_class, confidence)
	"""
	if not self.is_loaded:
	self.load_models()

	# Default weights (based on expected accuracy)
	if weights is None:
	weights = {
	'xgb': 0.20,
	'lgb': 0.20,
	'cat': 0.25,
	'nn': 0.35
	}

	# Get base predictions
	base_preds = self.get_base_predictions(features)

	if not base_preds:
	return np.array([0]), 0.33

	# Weighted average
	ensemble_probs = np.zeros((features.shape[0] if len(features.shape) > 1 else 1, 3))
	total_weight = 0

	for model_name, probs in base_preds.items():
	if model_name in weights:
	w = weights[model_name]
	if probs.shape[-1] == 3:
	ensemble_probs += w * probs
	total_weight += w

	if total_weight > 0:
	ensemble_probs /= total_weight

	# Get prediction and confidence
	predicted_class = np.argmax(ensemble_probs, axis=1)
	confidence = np.max(ensemble_probs, axis=1)

	return predicted_class, confidence, ensemble_probs

	def predict_with_confidence(self, home_team: str, away_team: str,
	league: str = "Premier League") -> Dict:
	"""
	Predict match outcome with confidence.

	Returns:
	Dict with prediction, confidence, probabilities
	"""
	if not self.is_loaded:
	self.load_models()

	# Create dummy features for demonstration
	# In production, this would use actual feature engineering
	np.random.seed(hash(home_team + away_team) % 2**32)

	if self.feature_cols and self.scaler:
	n_features = len(self.feature_cols)
	features = np.random.randn(1, n_features)
	features = self.scaler.transform(features)
	else:
	features = np.random.randn(1, 153)

	predicted_class, confidence, probs = self.ensemble_predict(features)

	result_map = {0: 'Home Win', 1: 'Draw', 2: 'Away Win'}

	return {
	'home_team': home_team,
	'away_team': away_team,
	'league': league,
	'prediction': result_map[predicted_class[0]],
	'prediction_code': int(predicted_class[0]),
	'confidence': float(confidence[0]),
	'probabilities': {
	'home': float(probs[0][0]),
	'draw': float(probs[0][1]),
	'away': float(probs[0][2])
	},
	'model': 'Stacking Ensemble'
	}


	# Global instance
	ensemble = StackingEnsemble()


	def predict_with_ensemble(home_team: str, away_team: str,
	league: str = "Premier League") -> Dict:
	"""Convenience function for predictions."""
	return ensemble.predict_with_confidence(home_team, away_team, league)


	def get_high_confidence_predictions(matches: List[Dict],
	threshold: float = 0.70) -> List[Dict]:
	"""
	Filter matches for high-confidence predictions only.

	Args:
	matches: List of match dicts with home_team, away_team, league
	threshold: Minimum confidence (0.0 to 1.0)

	Returns:
	List of high-confidence predictions
	"""
	if not ensemble.is_loaded:
	ensemble.load_models()

	high_conf = []

	for match in matches:
	pred = predict_with_ensemble(
	match.get('home_team', match.get('home')),
	match.get('away_team', match.get('away')),
	match.get('league', 'Unknown')
	)

	if pred['confidence'] >= threshold:
	pred['threshold_met'] = True
	high_conf.append(pred)

	# Sort by confidence (highest first)
	high_conf.sort(key=lambda x: x['confidence'], reverse=True)

	return high_conf


	if __name__ == "__main__":
	# Test the ensemble
	print("\n" + "="*60)
	print("🧪 Testing Stacking Ensemble")
	print("="*60)

	result = predict_with_ensemble("Arsenal", "Chelsea", "Premier League")

	print(f"\n📊 Prediction: Arsenal vs Chelsea")
	print(f" Result: {result['prediction']}")
	print(f" Confidence: {result['confidence']:.1%}")
	print(f" Probabilities: H={result['probabilities']['home']:.1%}, "
	f"D={result['probabilities']['draw']:.1%}, "
	f"A={result['probabilities']['away']:.1%}")