File size: 10,893 Bytes
6f7e932
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
"""
Model Ensemble - Combines multiple ML models for robust predictions

Weighted averaging of predictions from:
- Podos Transformer (30%)
- XGBoost (35%)
- FootballerModel (20%)
- LSTM Form (15%)

Includes confidence calibration and model disagreement detection.
"""

import numpy as np
from typing import Dict, List, Optional, Any, Tuple
from dataclasses import dataclass, asdict
from pathlib import Path
import json
import logging

logger = logging.getLogger(__name__)


@dataclass
class EnsemblePrediction:
    """Combined three-way match prediction returned by an ensemble."""

    # Probabilities for the three possible results.
    home_win_prob: float
    draw_prob: float
    away_win_prob: float

    # Human-readable label of the selected result.
    predicted_outcome: str

    # Confidence attached to the selected result.
    confidence: float

    # Consensus score among the contributing models.
    model_agreement: float

    # Per-model breakdown, keyed by model name.
    individual_predictions: Dict[str, Dict]

    # Flag stating whether the confidence went through calibration.
    calibrated: bool = True

    def to_dict(self) -> Dict:
        """Return every field as a plain dict (nested containers are copied)."""
        as_mapping = asdict(self)
        return as_mapping


class ModelEnsemble:
    """
    Ensemble predictor combining multiple ML models.

    Uses weighted averaging with agreement-based confidence adjustment.

    Every registered model must expose
    ``predict(home_team, away_team, **features)`` and return either an object
    with ``home_win_prob`` / ``draw_prob`` / ``away_win_prob`` attributes or a
    dict with those keys. Any other return type is skipped with a warning.
    """

    # Default model weights (sum to 1.0)
    DEFAULT_WEIGHTS = {
        'podos': 0.30,
        'xgboost': 0.35,
        'footballer': 0.20,
        'lstm': 0.15
    }

    # Weight applied to a registered model that has no entry in ``self.weights``.
    _FALLBACK_WEIGHT = 0.1

    # Confidence recorded for a prediction that does not carry its own.
    _DEFAULT_CONFIDENCE = 0.5

    def __init__(self, weights: Optional[Dict[str, float]] = None):
        """
        Initialize ensemble.

        Args:
            weights: Custom model weights. Default weights used if not
                provided (or empty). The mapping is copied, so later weight
                changes on the ensemble never touch the caller's dict.
        """
        self.weights = dict(weights) if weights else self.DEFAULT_WEIGHTS.copy()
        self.models: Dict[str, Any] = {}
        self._calibration_params: Dict[str, float] = {}
        self._load_calibration()

    def _load_calibration(self):
        """
        Load calibration parameters if available.

        The file is optional: a missing, unreadable or malformed file leaves
        the parameters empty and only logs a warning, so constructing the
        ensemble never fails because of it.
        """
        config_path = Path(__file__).parent.parent.parent / "models" / "config" / "calibration.json"
        if not config_path.exists():
            return

        try:
            with open(config_path, 'r', encoding='utf-8') as f:
                params = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            logger.warning("Could not load calibration from %s: %s", config_path, e)
            return

        if isinstance(params, dict):
            self._calibration_params = params
        else:
            logger.warning(
                "Ignoring calibration file %s: expected a JSON object, got %s",
                config_path, type(params).__name__,
            )

    def register_model(self, name: str, model: Any, weight: Optional[float] = None):
        """
        Register a model with the ensemble.

        Args:
            name: Model identifier
            model: Model object with predict() method
            weight: Model weight (optional). When given, the weights of all
                registered models are re-normalized to sum to 1.0.
        """
        self.models[name] = model
        if weight is not None:
            self.weights[name] = weight
            self._normalize_weights()
        # Log the weight that predict() will actually apply to this model.
        logger.info(
            "Registered model: %s (weight: %s)",
            name, self.weights.get(name, self._FALLBACK_WEIGHT),
        )

    def _normalize_weights(self):
        """Rescale the weights of registered models so they sum to 1.0.

        Weights of models that are not registered are left untouched, and
        nothing happens when the registered weights do not sum to a positive
        value.
        """
        # Only consider weights for registered models
        active_weights = {k: v for k, v in self.weights.items() if k in self.models}
        total = sum(active_weights.values())
        if total > 0:
            for k, v in active_weights.items():
                self.weights[k] = v / total

    @staticmethod
    def _extract_prediction(pred: Any) -> Optional[Tuple[np.ndarray, float]]:
        """
        Pull probabilities and confidence out of one model's raw prediction.

        Args:
            pred: Object with ``home_win_prob`` / ``draw_prob`` /
                ``away_win_prob`` attributes, or a dict with those keys
                (missing dict keys default to 0.33 / 0.33 / 0.34).

        Returns:
            ``(probs, confidence)`` where ``probs`` is a float array ordered
            home, draw, away; ``None`` for an unsupported prediction type.
        """
        if hasattr(pred, 'home_win_prob'):
            probs = np.array(
                [pred.home_win_prob, pred.draw_prob, pred.away_win_prob],
                dtype=float,
            )
            confidence = getattr(pred, 'confidence', ModelEnsemble._DEFAULT_CONFIDENCE)
        elif isinstance(pred, dict):
            probs = np.array(
                [
                    pred.get('home_win_prob', 0.33),
                    pred.get('draw_prob', 0.33),
                    pred.get('away_win_prob', 0.34),
                ],
                dtype=float,
            )
            # Dicts have no attributes, so the confidence must be read by key.
            confidence = pred.get('confidence', ModelEnsemble._DEFAULT_CONFIDENCE)
        else:
            return None
        return probs, confidence

    def _collect_predictions(self, home_team: str, away_team: str,
                             **features) -> Tuple[Dict[str, Dict], np.ndarray]:
        """
        Query every registered model and blend the results.

        A model that raises, or returns an unsupported type, is logged and
        left out of both the blend and the per-model breakdown.

        Returns:
            ``(individual_preds, final_probs)``: the per-model breakdown and
            the weighted-average probabilities (home, draw, away) normalized
            to sum to 1.

        Raises:
            ValueError: If no model produced a usable prediction, the applied
                weights do not sum to a positive value, or the blended
                probabilities cannot be normalized.
        """
        individual_preds: Dict[str, Dict] = {}
        weighted_probs = np.zeros(3)  # home, draw, away
        total_weight = 0.0

        for name, model in self.models.items():
            # Broad catch is deliberate: one broken model must not take the
            # whole ensemble down.
            try:
                pred = model.predict(home_team, away_team, **features)
                extracted = self._extract_prediction(pred)
                if extracted is None:
                    logger.warning(
                        "Model %s returned unsupported prediction type %s; skipping",
                        name, type(pred).__name__,
                    )
                    continue
                probs, confidence = extracted

                weight = self.weights.get(name, self._FALLBACK_WEIGHT)
                weighted_probs += probs * weight
                total_weight += weight

                individual_preds[name] = {
                    'home_win_prob': float(probs[0]),
                    'draw_prob': float(probs[1]),
                    'away_win_prob': float(probs[2]),
                    'confidence': confidence
                }
            except Exception as e:
                logger.warning("Model %s prediction failed: %s", name, e)
                continue

        if not individual_preds:
            raise ValueError("All model predictions failed")
        if total_weight <= 0:
            raise ValueError("Model weights do not sum to a positive value")

        # Weighted average, then renormalize so the three outcomes sum to 1.
        final_probs = weighted_probs / total_weight
        prob_sum = final_probs.sum()
        if not np.isfinite(prob_sum) or prob_sum <= 0:
            # Dividing would yield NaN/inf probabilities; fail loudly instead.
            raise ValueError("Combined probabilities cannot be normalized")
        final_probs = final_probs / prob_sum

        return individual_preds, final_probs

    def predict(self, home_team: str, away_team: str, **features) -> "EnsemblePrediction":
        """
        Get ensemble prediction from all models.

        Args:
            home_team: Home team name
            away_team: Away team name
            **features: Additional features (form, odds, etc.)

        Returns:
            EnsemblePrediction with combined probabilities

        Raises:
            ValueError: If no models are registered, every model failed, or
                the combined probabilities cannot be normalized.
        """
        if not self.models:
            raise ValueError("No models registered in ensemble")

        individual_preds, final_probs = self._collect_predictions(
            home_team, away_team, **features
        )

        # Calculate model agreement (measure of consensus)
        agreement = self._calculate_agreement(individual_preds)

        # Determine predicted outcome
        outcome_idx = int(np.argmax(final_probs))
        outcomes = ['Home Win', 'Draw', 'Away Win']
        predicted_outcome = outcomes[outcome_idx]

        # Confidence = winning probability adjusted by model agreement
        base_confidence = float(final_probs[outcome_idx])
        confidence = self._calibrate_confidence(base_confidence, agreement)

        return EnsemblePrediction(
            home_win_prob=float(final_probs[0]),
            draw_prob=float(final_probs[1]),
            away_win_prob=float(final_probs[2]),
            predicted_outcome=predicted_outcome,
            confidence=confidence,
            model_agreement=agreement,
            individual_predictions=individual_preds,
            calibrated=True
        )

    def _calculate_agreement(self, predictions: Dict[str, Dict]) -> float:
        """
        Calculate how much models agree on the outcome.

        Args:
            predictions: Per-model dicts holding ``home_win_prob``,
                ``draw_prob`` and ``away_win_prob``.

        Returns:
            Fraction of models whose most likely outcome matches the most
            common one; 1.0 when fewer than two predictions are given.
        """
        if len(predictions) < 2:
            return 1.0

        from collections import Counter

        # Most likely outcome index (0=home, 1=draw, 2=away) per model
        top_picks = [
            int(np.argmax([p['home_win_prob'], p['draw_prob'], p['away_win_prob']]))
            for p in predictions.values()
        ]
        majority_count = Counter(top_picks).most_common(1)[0][1]

        return majority_count / len(top_picks)

    def _calibrate_confidence(self, raw_confidence: float, agreement: float) -> float:
        """
        Calibrate confidence based on model agreement.

        High agreement → boost confidence
        Low agreement → reduce confidence

        Returns:
            The adjusted confidence, clamped to the range [0.3, 0.95].
        """
        # Agreement multiplier
        if agreement >= 0.8:
            multiplier = 1.1  # Boost for high agreement
        elif agreement >= 0.6:
            multiplier = 1.0  # No change
        elif agreement >= 0.4:
            multiplier = 0.9  # Slight reduction
        else:
            multiplier = 0.8  # Significant reduction for disagreement

        calibrated = raw_confidence * multiplier

        # Clamp to valid range
        return max(0.3, min(0.95, calibrated))

    def get_model_contributions(self, home_team: str, away_team: str,
                                  **features) -> Dict[str, Dict]:
        """
        Get detailed breakdown of each model's contribution.

        Useful for debugging and understanding predictions. Probabilities and
        weights are resolved exactly as in predict(), so the breakdown matches
        what the ensemble actually combines.

        Returns:
            Mapping of model name to ``weight``, ``prediction`` and
            ``weighted_contribution``; a model that raised or returned an
            unsupported type maps to ``{'error': <message>}`` instead.
        """
        contributions = {}

        for name, model in self.models.items():
            try:
                pred = model.predict(home_team, away_team, **features)
                extracted = self._extract_prediction(pred)
                if extracted is None:
                    # predict() skips such models, so they contribute nothing.
                    contributions[name] = {
                        'error': f"Unsupported prediction type: {type(pred).__name__}"
                    }
                    continue
                probs, _ = extracted
                weight = self.weights.get(name, self._FALLBACK_WEIGHT)

                contributions[name] = {
                    'weight': weight,
                    'prediction': pred.to_dict() if hasattr(pred, 'to_dict') else pred,
                    'weighted_contribution': {
                        'home': float(probs[0] * weight),
                        'draw': float(probs[1] * weight),
                        'away': float(probs[2] * weight)
                    }
                }
            except Exception as e:
                contributions[name] = {'error': str(e)}

        return contributions

    def save_weights(self, path: Optional[Path] = None):
        """Save current weights to config file.

        Args:
            path: Destination file (``Path`` or string). Defaults to
                ``models/config/ensemble_weights.json`` three directories
                above this module. Parent directories are created as needed.
        """
        default_path = Path(__file__).parent.parent.parent / "models" / "config" / "ensemble_weights.json"
        path = Path(path) if path else default_path
        path.parent.mkdir(parents=True, exist_ok=True)

        with open(path, 'w', encoding='utf-8') as f:
            json.dump(self.weights, f, indent=2)

        logger.info("Saved weights to %s", path)

    def load_weights(self, path: Optional[Path] = None):
        """Load weights from config file.

        Args:
            path: Source file (``Path`` or string). Defaults to
                ``models/config/ensemble_weights.json`` three directories
                above this module. A missing file keeps the current weights
                and logs a warning.

        Raises:
            ValueError: If the file does not contain a JSON object (this also
                covers malformed JSON, via ``json.JSONDecodeError``).
        """
        default_path = Path(__file__).parent.parent.parent / "models" / "config" / "ensemble_weights.json"
        path = Path(path) if path else default_path

        if path.exists():
            with open(path, 'r', encoding='utf-8') as f:
                loaded = json.load(f)
            if not isinstance(loaded, dict):
                raise ValueError(
                    f"Weights file {path} must contain a JSON object, "
                    f"got {type(loaded).__name__}"
                )
            self.weights = loaded
            logger.info("Loaded weights from %s", path)
        else:
            logger.warning("No weights file found at %s, using defaults", path)


class SimpleVotingEnsemble:
    """
    Simple voting ensemble - each model gets one vote.
    Good for when you want equal weighting.

    Each model votes for its most likely outcome. Ties are resolved in the
    order Home Win, Draw, Away Win.
    """

    # Outcome labels, indexed home / draw / away.
    _OUTCOME_LABELS = ('Home Win', 'Draw', 'Away Win')

    def __init__(self):
        self.models: Dict[str, Any] = {}

    def register_model(self, name: str, model: Any):
        """Register ``model`` under ``name``, replacing any previous entry."""
        self.models[name] = model

    def predict(self, home_team: str, away_team: str, **features) -> str:
        """
        Get majority vote for outcome.

        Args:
            home_team: Home team name
            away_team: Away team name
            **features: Additional features forwarded to every model

        Returns:
            'Home Win', 'Draw' or 'Away Win'.

        Raises:
            ValueError: If no models are registered. Without voters there is
                no majority, and returning a label would be misleading.
        """
        if not self.models:
            raise ValueError("No models registered in ensemble")

        votes = [0, 0, 0]  # home, draw, away

        for model in self.models.values():
            pred = model.predict(home_team, away_team, **features)
            if isinstance(pred, dict):
                # Dict-style predictions; missing keys get near-uniform defaults.
                probs = [
                    pred.get('home_win_prob', 0.33),
                    pred.get('draw_prob', 0.33),
                    pred.get('away_win_prob', 0.34),
                ]
            else:
                probs = [pred.home_win_prob, pred.draw_prob, pred.away_win_prob]
            votes[int(np.argmax(probs))] += 1

        # list.index returns the first maximum, which gives the tie order above.
        return self._OUTCOME_LABELS[votes.index(max(votes))]