File size: 11,266 Bytes
6f7e932
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2e6fc13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6f7e932
2e6fc13
 
 
 
 
 
 
 
 
 
 
 
6f7e932
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34643b5
6f7e932
34643b5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6f7e932
34643b5
6f7e932
34643b5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6f7e932
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
"""
Trained Model Loader

Loads models trained on Kaggle and exported to models/trained/
Supports: XGBoost, LightGBM, CatBoost, PyTorch, ONNX
"""

import os
import json
import pickle
import logging
from pathlib import Path
from typing import Dict, Optional, Any
import numpy as np

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# The "models" directory sits two directories above the one containing this
# file; trained model files and config artifacts live in its sub-directories.
MODELS_DIR = Path(__file__).parent.parent.parent.joinpath("models")
TRAINED_DIR = MODELS_DIR.joinpath("trained")
CONFIG_DIR = MODELS_DIR.joinpath("config")


class TrainedModelLoader:
    """Load models trained on Kaggle and blend them into one prediction.

    Model files are looked up in ``TRAINED_DIR`` and auxiliary artifacts
    (encoders, Elo ratings, metadata) in ``CONFIG_DIR``. Every artifact is
    optional: a missing file or a missing third-party library is skipped,
    and a broken artifact does not prevent the remaining ones from loading.
    """

    # Rating returned when a team cannot be matched to a known Elo entry.
    _DEFAULT_ELO = 1500.0
    # Per-model weights used when metadata does not define a weight.
    _DEFAULT_WEIGHTS = {'xgb': 0.3, 'lgb': 0.3, 'cat': 0.25, 'nn': 0.15}
    # Class order assumed when metadata does not provide 'classes'.
    _DEFAULT_CLASSES = ('A', 'D', 'H')
    # Human-readable names for the single-letter outcome classes.
    _OUTCOME_LABELS = {'H': 'Home Win', 'D': 'Draw', 'A': 'Away Win'}
    # (model key, name used in log messages), in blending order.
    _ENSEMBLE_MEMBERS = (
        ('xgb', 'XGBoost'),
        ('lgb', 'LightGBM'),
        ('cat', 'CatBoost'),
        ('nn', 'Neural Net'),
    )

    def __init__(self):
        self.models: Dict[str, Any] = {}
        self.encoders: Dict[str, Any] = {}
        self.elo_ratings: Dict[str, float] = {}
        self.metadata: Dict[str, Any] = {}
        self.scaler = None
        self._loaded = False

    def load_all(self) -> bool:
        """Load every available artifact.

        Each loading step runs independently, so a corrupt or incompatible
        file only disables that one artifact instead of aborting the rest.

        Returns:
            True when at least one model was loaded, False otherwise.
        """
        steps = (
            self._load_config,
            self._load_xgboost,
            self._load_lightgbm,
            self._load_catboost,
            self._load_neural_net,
            self._load_onnx,
        )
        for step in steps:
            try:
                step()
            except Exception as e:
                logger.error(f"Error loading models ({step.__name__}): {e}")

        self._loaded = len(self.models) > 0
        logger.info(f"Loaded {len(self.models)} trained models")
        return self._loaded

    def _load_config(self):
        """Load encoders (and scaler), Elo ratings, and model metadata."""
        # Encoders
        enc_path = CONFIG_DIR / "encoders.pkl"
        if enc_path.exists():
            with open(enc_path, 'rb') as f:
                # SECURITY: unpickling executes arbitrary code. This file must
                # only ever come from a trusted training export.
                data = pickle.load(f)
            if isinstance(data, dict):
                self.encoders = data
                self.scaler = data.get('scaler')
                logger.info("Loaded encoders")
            else:
                logger.warning("encoders.pkl does not contain a dict; ignoring it")

        # Elo ratings
        elo_path = CONFIG_DIR / "elo_ratings.json"
        if elo_path.exists():
            with open(elo_path, 'r', encoding='utf-8') as f:
                ratings = json.load(f)
            if isinstance(ratings, dict):
                self.elo_ratings = ratings
                logger.info(f"Loaded {len(self.elo_ratings)} team Elo ratings")
            else:
                logger.warning("elo_ratings.json does not contain an object; ignoring it")

        # Metadata
        meta_path = CONFIG_DIR / "model_meta.json"
        if meta_path.exists():
            with open(meta_path, 'r', encoding='utf-8') as f:
                meta = json.load(f)
            if isinstance(meta, dict):
                self.metadata = meta
                logger.info("Loaded model metadata")
            else:
                logger.warning("model_meta.json does not contain an object; ignoring it")

    def _load_xgboost(self):
        """Load the XGBoost classifier if its file and library are present."""
        path = TRAINED_DIR / "xgb_football.json"
        if not path.exists():
            return
        try:
            from xgboost import XGBClassifier
        except ImportError:
            logger.warning("XGBoost not installed")
            return
        model = XGBClassifier()
        model.load_model(str(path))
        self.models['xgb'] = model
        logger.info("Loaded XGBoost model")

    def _load_lightgbm(self):
        """Load the LightGBM booster if its file and library are present."""
        path = TRAINED_DIR / "lgb_football.txt"
        if not path.exists():
            return
        try:
            import lightgbm as lgb
        except ImportError:
            logger.warning("LightGBM not installed")
            return
        self.models['lgb'] = lgb.Booster(model_file=str(path))
        logger.info("Loaded LightGBM model")

    def _load_catboost(self):
        """Load the CatBoost classifier if its file and library are present."""
        path = TRAINED_DIR / "cat_football.cbm"
        if not path.exists():
            return
        try:
            from catboost import CatBoostClassifier
        except ImportError:
            logger.warning("CatBoost not installed")
            return
        model = CatBoostClassifier()
        model.load_model(str(path))
        self.models['cat'] = model
        logger.info("Loaded CatBoost model")

    def _load_neural_net(self):
        """Load the PyTorch network if its file and library are present.

        The layer sizes are read from the checkpoint's first linear layer
        when available, so the network matches whatever feature count it was
        trained with; otherwise the original defaults (8 inputs, 128 hidden
        units) are used.
        """
        path = TRAINED_DIR / "nn_football.pt"
        if not path.exists():
            return
        try:
            import torch
            import torch.nn as nn
        except ImportError:
            logger.warning("PyTorch not installed")
            return

        class FootballNet(nn.Module):
            def __init__(self, input_dim=8, hidden=128):
                super().__init__()
                self.net = nn.Sequential(
                    nn.Linear(input_dim, hidden),
                    nn.ReLU(), nn.Dropout(0.3),
                    nn.Linear(hidden, 64),
                    nn.ReLU(), nn.Dropout(0.2),
                    nn.Linear(64, 3)
                )

            def forward(self, x):
                return self.net(x)

        state = torch.load(path, map_location='cpu')

        input_dim, hidden = 8, 128
        # nn.Linear stores its weight as (out_features, in_features).
        first_weight = state.get('net.0.weight') if isinstance(state, dict) else None
        if first_weight is not None and getattr(first_weight, 'ndim', 0) == 2:
            hidden, input_dim = (int(dim) for dim in first_weight.shape)

        model = FootballNet(input_dim=input_dim, hidden=hidden)
        model.load_state_dict(state)
        model.eval()
        self.models['nn'] = model
        logger.info("Loaded PyTorch neural network")

    def _load_onnx(self):
        """Load the ONNX session if its file and runtime are present.

        The session is stored under ``models['onnx']`` but is not part of the
        ensemble computed by ``predict``.
        """
        path = TRAINED_DIR / "football_transformer.onnx"
        if not path.exists():
            return
        try:
            import onnxruntime as ort
        except ImportError:
            logger.warning("ONNX Runtime not installed")
            return
        self.models['onnx'] = ort.InferenceSession(str(path))
        logger.info("Loaded ONNX transformer")

    def get_elo(self, team: str) -> float:
        """Return the Elo rating for a team.

        An exact key match wins. Otherwise the first known team whose
        lower-cased name contains, or is contained in, the lower-cased query
        is used. Blank queries and unknown teams get the default rating.

        Args:
            team: Team name to look up.

        Returns:
            The stored rating, or 1500.0 when no match is found.
        """
        if team in self.elo_ratings:
            return self.elo_ratings[team]

        query = team.strip().lower() if team else ''
        if not query:
            # An empty string is a substring of every name; without this
            # guard a blank query would return an arbitrary team's rating.
            return self._DEFAULT_ELO

        # Fuzzy match
        for known, elo in self.elo_ratings.items():
            known_lower = known.lower()
            if known_lower and (known_lower in query or query in known_lower):
                return elo
        return self._DEFAULT_ELO

    def build_features(self, home_team: str, away_team: str, league: str = 'premier_league') -> np.ndarray:
        """Build the feature matrix (a single row) for one match.

        The comprehensive feature builder is tried first (its exact width is
        defined in ``comprehensive_features`` and is not visible here). If it
        fails for any reason, an 8-value fallback row is returned: encoded
        home/away team, home/away Elo, Elo difference, and the current year,
        month and weekday.

        Args:
            home_team: Home team name.
            away_team: Away team name.
            league: League identifier forwarded to the comprehensive builder.

        Returns:
            A 2-D array with one row.
        """
        try:
            # Use comprehensive feature builder
            from .comprehensive_features import build_match_features
            features = build_match_features(home_team, away_team, league)
            logger.debug(f"Built {features.shape[1]} features for {home_team} vs {away_team}")
            return features
        except Exception as e:
            logger.warning(f"Comprehensive features failed, using fallback: {e}")

        # Fallback to basic features
        home_elo = self.get_elo(home_team)
        away_elo = self.get_elo(away_team)

        # Encode teams; teams the encoder cannot transform are encoded as 0.
        home_enc, away_enc = 0, 0
        team_enc = self.encoders.get('team_enc')
        if team_enc is not None:
            try:
                home_enc = team_enc.transform([home_team])[0]
                away_enc = team_enc.transform([away_team])[0]
            except Exception as e:
                logger.debug(f"Team encoding failed, using 0: {e}")
                home_enc, away_enc = 0, 0

        # Build basic feature vector
        import datetime
        now = datetime.datetime.now()
        features = np.array([
            home_enc, away_enc,
            home_elo, away_elo,
            home_elo - away_elo,
            now.year, now.month, now.weekday()
        ], dtype=np.float32)

        return features.reshape(1, -1)

    def _model_probabilities(self, name: str, features: np.ndarray) -> Optional[np.ndarray]:
        """Return one model's class probabilities for the first feature row.

        Returns None when the model's output cannot be interpreted as at
        least three class scores. Exceptions raised by the model propagate to
        the caller, which treats them as "skip this model".
        """
        model = self.models[name]

        if name in ('xgb', 'cat'):
            return np.asarray(model.predict_proba(features)[0], dtype=float)

        if name == 'lgb':
            raw = np.asarray(model.predict(features), dtype=float)
            if raw.ndim == 2:
                row = raw[0]
            elif raw.ndim == 1:
                row = raw
            else:
                return None
            if len(row) < 3:
                return None
            row = row[:3]
            total = row.sum()
            if not np.isfinite(total) or total <= 0:
                return None
            return row / total

        if name == 'nn':
            import torch
            scaled = self.scaler.transform(features) if self.scaler is not None else features
            with torch.no_grad():
                logits = model(torch.FloatTensor(scaled))
                return np.asarray(torch.softmax(logits, dim=1).numpy()[0], dtype=float)

        return None

    def predict(self, home_team: str, away_team: str, league: str = 'premier_league') -> Dict:
        """Return the weighted ensemble prediction for a match.

        Each loaded model (XGBoost, LightGBM, CatBoost, neural net) is asked
        for three class probabilities. A model that raises or returns an
        unusable output is skipped. The remaining outputs are blended with
        the weights from metadata (or the built-in defaults) and normalised.

        Args:
            home_team: Home team name.
            away_team: Away team name.
            league: League identifier forwarded to ``build_features``.

        Returns:
            A dict with the teams, the three outcome probabilities, the
            predicted outcome, its confidence and the models that actually
            contributed; or ``{'error': ...}`` when no model is loaded or
            none produced a usable prediction.
        """
        if not self._loaded:
            self.load_all()

        if not self.models:
            return {'error': 'No trained models available'}

        features = self.build_features(home_team, away_team, league)

        # Ensemble weights
        weights = self.metadata.get('ensemble_weights') or self._DEFAULT_WEIGHTS

        probs = np.zeros(3)
        total_weight = 0.0
        models_used = []

        for key, label in self._ENSEMBLE_MEMBERS:
            if key not in self.models:
                continue
            try:
                weight = float(weights.get(key, self._DEFAULT_WEIGHTS[key]))
                if weight <= 0:
                    continue
                model_probs = self._model_probabilities(key, features)
            except Exception as e:
                # Typically a feature-count mismatch - skip this model
                logger.debug(f"{label} skipped: {e}")
                continue

            if (
                model_probs is None
                or model_probs.shape != (3,)
                or not np.all(np.isfinite(model_probs))
            ):
                logger.debug(f"{label} skipped: unusable output")
                continue

            probs += weight * model_probs
            total_weight += weight
            models_used.append(key)

        prob_sum = probs.sum()
        if total_weight <= 0 or not np.isfinite(prob_sum) or prob_sum <= 0:
            # Every model was skipped; dividing here would yield NaNs.
            return {'error': 'No trained model produced a prediction'}

        # Normalize
        probs = probs / prob_sum

        # Get classes
        classes = self.metadata.get('classes')
        if not isinstance(classes, (list, tuple)) or len(classes) != 3:
            classes = self._DEFAULT_CLASSES
        classes = list(classes)
        pred_idx = int(probs.argmax())

        def class_prob(label, fallback_idx):
            idx = classes.index(label) if label in classes else fallback_idx
            return float(probs[idx])

        predicted = classes[pred_idx]

        return {
            'home_team': home_team,
            'away_team': away_team,
            'home_win_prob': class_prob('H', 0),
            'draw_prob': class_prob('D', 1),
            'away_win_prob': class_prob('A', 2),
            'predicted_outcome': self._OUTCOME_LABELS.get(predicted, str(predicted)),
            'confidence': float(probs[pred_idx]),
            'models_used': models_used
        }


# Process-wide loader, created on first request.
_loader: Optional[TrainedModelLoader] = None

def get_trained_loader() -> TrainedModelLoader:
    """Return the shared TrainedModelLoader, creating and loading it on first use."""
    global _loader
    if _loader is not None:
        return _loader
    # The global is bound before loading, so it stays set even if loading is
    # interrupted part-way.
    _loader = TrainedModelLoader()
    _loader.load_all()
    return _loader

def predict_with_trained(home: str, away: str) -> Dict:
    """Predict a match between ``home`` and ``away`` using the shared loader."""
    loader = get_trained_loader()
    return loader.predict(home, away)