nananie143 committed on
Commit
ac15a01
·
verified ·
1 Parent(s): 76d714e

feat: Add src/models/comprehensive_features.py

Browse files
Files changed (1) hide show
  1. src/models/comprehensive_features.py +349 -0
src/models/comprehensive_features.py ADDED
@@ -0,0 +1,349 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Comprehensive Feature Builder
3
+
4
+ Builds all 153 features required by the trained models.
5
+ Features include: Elo ratings, form, H2H, betting odds, match stats.
6
+ """
7
+
8
+ import json
9
+ import logging
10
+ from pathlib import Path
11
+ from typing import Dict, List, Optional, Tuple
12
+ from datetime import datetime, timedelta
13
+ import numpy as np
14
+
15
# Module-level logger; handlers and levels are left to the application.
logger = logging.getLogger(__name__)

# Data directories
# Both are anchored three directory levels above this file, so for
# src/models/comprehensive_features.py they resolve to <repo>/data and
# <repo>/models regardless of the current working directory.
DATA_DIR = Path(__file__).parent.parent.parent / "data"
MODELS_DIR = Path(__file__).parent.parent.parent / "models"
20
+
21
+
22
class ComprehensiveFeatureBuilder:
    """Build the 153-feature vector expected by the trained models.

    Only the team/league encodings, the Elo ratings and the per-team
    goal/win statistics are data-driven.  Head-to-head, goals-market and
    match-stat features are filled with fixed constants, and bookmaker odds
    are synthesised from the Elo ratings (see ``build_features``).
    """

    # Feature order must match training exactly
    FEATURE_COLS = [
        "HomeTeamEnc", "AwayTeamEnc", "LeagueEnc", "HomeElo", "AwayElo", "EloDiff",
        "HomeEloNorm", "AwayEloNorm", "EloRatio", "HomeMomentum", "AwayMomentum",
        "MomentumDiff", "HomeStreak", "AwayStreak", "HomeUnbeatenStreak", "AwayUnbeatenStreak",
        "HomeScoringStreak", "AwayScoringStreak", "HomeGoalsTrend", "AwayGoalsTrend",
        "H2HHomeWinRate", "H2HAwayWinRate", "H2HDrawRate", "H2HAvgGoals", "H2HAvgHomeGoals",
        "H2HAvgAwayGoals", "H2HBTTSRate", "H2HOver25Rate", "H2HMatches",
        "HomeExpGoals", "AwayExpGoals", "ExpTotalGoals", "PoissonHome", "PoissonDraw", "PoissonAway",
        "HomeForm3", "AwayForm3", "HomeGoalsAvg3", "AwayGoalsAvg3", "HomeConcededAvg3", "AwayConcededAvg3",
        "HomeAttackStrength3", "AwayAttackStrength3", "HomeDefenseStrength3", "AwayDefenseStrength3",
        "HomeForm5", "AwayForm5", "HomeGoalsAvg5", "AwayGoalsAvg5", "HomeConcededAvg5", "AwayConcededAvg5",
        "HomeAttackStrength5", "AwayAttackStrength5", "HomeDefenseStrength5", "AwayDefenseStrength5",
        "HomeForm10", "AwayForm10", "HomeGoalsAvg10", "AwayGoalsAvg10", "HomeConcededAvg10", "AwayConcededAvg10",
        "HomeAttackStrength10", "AwayAttackStrength10", "HomeDefenseStrength10", "AwayDefenseStrength10",
        "HomeForm15", "AwayForm15", "HomeGoalsAvg15", "AwayGoalsAvg15", "HomeConcededAvg15", "AwayConcededAvg15",
        "HomeAttackStrength15", "AwayAttackStrength15", "HomeDefenseStrength15", "AwayDefenseStrength15",
        "HomeBTTSRate5", "AwayBTTSRate5", "HomeO15Rate5", "AwayO15Rate5", "HomeO25Rate5", "AwayO25Rate5",
        "HomeO35Rate5", "AwayO35Rate5", "HomeCSRate5", "AwayCSRate5", "HomeFTSRate5", "AwayFTSRate5",
        "HomeBTTSRate10", "AwayBTTSRate10", "HomeO15Rate10", "AwayO15Rate10", "HomeO25Rate10", "AwayO25Rate10",
        "HomeO35Rate10", "AwayO35Rate10", "HomeCSRate10", "AwayCSRate10", "HomeFTSRate10", "AwayFTSRate10",
        "B365H", "B365D", "B365A", "B365_HomeProb", "B365_DrawProb", "B365_AwayProb",
        "BWH", "BWD", "BWA", "BW_HomeProb", "BW_DrawProb", "BW_AwayProb",
        "PSH", "PSD", "PSA", "PS_HomeProb", "PS_DrawProb", "PS_AwayProb",
        "WHH", "WHD", "WHA", "WH_HomeProb", "WH_DrawProb", "WH_AwayProb",
        "IWH", "IWD", "IWA", "IW_HomeProb", "IW_DrawProb", "IW_AwayProb",
        "VCH", "VCD", "VCA", "VC_HomeProb", "VC_DrawProb", "VC_AwayProb",
        "AvgH", "AvgD", "AvgA", "Avg_HomeProb", "Avg_DrawProb", "Avg_AwayProb",
        "HS", "AS", "HST", "AST", "HF", "AF", "HC", "AC", "HY", "AY", "HR", "AR"
    ]

    # Statistics returned for a team that cannot be found in ``team_stats``.
    DEFAULT_TEAM_STATS = {
        'home_goals_avg': 1.5, 'away_goals_avg': 1.0,
        'home_conceded_avg': 1.2, 'away_conceded_avg': 1.5,
        'home_wins': 5, 'away_wins': 3, 'matches_played': 10
    }

    def __init__(self):
        """Create empty lookup tables, then populate them from disk."""
        self.team_stats: Dict[str, Dict] = {}
        self.elo_ratings: Dict[str, float] = {}
        self.h2h_cache: Dict[str, Dict] = {}  # not used by any method of this class
        self.league_encodings: Dict[str, int] = {}
        self.team_encodings: Dict[str, int] = {}
        self._load_historical_data()

    @staticmethod
    def _load_json_dict(path):
        """Return the JSON object stored at *path*, or ``None`` if unusable.

        A missing file is silently ignored; an unreadable file, invalid JSON
        or a top-level value that is not an object is logged and ignored.
        """
        try:
            if not path.exists():
                return None
            with open(path, encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # json.JSONDecodeError and UnicodeDecodeError are ValueErrors.
            logger.warning(f"Error loading historical data: {e}")
            return None
        if not isinstance(data, dict):
            logger.warning(f"Error loading historical data: {path} does not contain a JSON object")
            return None
        return data

    def _load_historical_data(self):
        """Load Elo ratings, cached team stats and the league encodings.

        Every source is loaded independently so that one missing or corrupt
        file does not prevent the others from being used.  Failures are
        logged and never raised; the builder then falls back to defaults.
        """
        # Static table: assign it first so it is available even when every
        # file-based source fails.
        self.league_encodings = {
            'premier_league': 0, 'bundesliga': 1, 'la_liga': 2,
            'serie_a': 3, 'ligue_1': 4, 'eredivisie': 5,
            'primeira_liga': 6, 'championship': 7, 'scottish_premiership': 8
        }

        # Load Elo ratings
        elo_ratings = self._load_json_dict(MODELS_DIR / "config" / "elo_ratings.json")
        if elo_ratings is not None:
            self.elo_ratings = elo_ratings
            logger.info(f"Loaded {len(self.elo_ratings)} Elo ratings")

        # Load team stats from cache
        team_stats = self._load_json_dict(DATA_DIR / "team_stats_cache.json")
        if team_stats is not None:
            self.team_stats = team_stats
            logger.info(f"Loaded stats for {len(self.team_stats)} teams")

        # Build team stats from historical data if not cached
        if not self.team_stats:
            try:
                self._build_team_stats_from_history()
            except Exception as e:
                # Best-effort by design (pandas may be missing, CSVs may be
                # malformed): keep going with the default statistics.
                logger.warning(f"Error loading historical data: {e}")

    def _build_team_stats_from_history(self):
        """Build team stats from the CSV files under ``DATA_DIR / "raw"``.

        At most 50 files are read.  Files that cannot be parsed, or that lack
        the ``HomeTeam`` / ``AwayTeam`` columns, are skipped.
        """
        import pandas as pd

        # Sorted so that the 50-file cap always selects the same files;
        # raw glob order depends on the filesystem.
        csv_files = sorted((DATA_DIR / "raw").glob("**/*.csv"))

        all_matches = []
        for csv_file in csv_files[:50]:  # Limit to avoid memory issues
            try:
                df = pd.read_csv(csv_file, encoding='latin1', low_memory=False)
            except Exception as e:
                # One bad file must not abort the whole build, but say so.
                logger.warning(f"Skipping unreadable CSV {csv_file}: {e}")
                continue
            if 'HomeTeam' in df.columns and 'AwayTeam' in df.columns:
                all_matches.append(df)

        if all_matches:
            combined = pd.concat(all_matches, ignore_index=True)
            self._compute_team_stats(combined)
            logger.info(f"Built stats from {len(combined)} historical matches")

    def _compute_team_stats(self, df):
        """Fill ``self.team_stats`` from a match table.

        For every team the last 15 rows where it is the home side and the
        last 15 rows where it is the away side are summarised.  Rows are
        taken in table order.
        NOTE(review): nothing here sorts by date, so "last 15" is only the
        most recent 15 if the input is already chronological - confirm.

        Args:
            df: DataFrame with ``HomeTeam`` and ``AwayTeam`` columns and,
                optionally, ``FTHG``, ``FTAG`` and ``FTR``.
        """
        import pandas as pd

        def column_mean(frame, column, default):
            # mean() of an empty or all-missing column is NaN, which would
            # later break int() in build_features; use the default instead.
            if column not in frame:
                return default
            value = pd.to_numeric(frame[column], errors='coerce').mean()
            return default if pd.isna(value) else float(value)

        def result_count(frame, outcome, default):
            if 'FTR' not in frame:
                return default
            return int((frame['FTR'] == outcome).sum())

        teams = pd.concat([df['HomeTeam'], df['AwayTeam']]).dropna().unique()
        for team in teams:
            # Home matches
            home_matches = df[df['HomeTeam'] == team].tail(15)
            # Away matches
            away_matches = df[df['AwayTeam'] == team].tail(15)

            self.team_stats[team] = {
                'home_goals_avg': column_mean(home_matches, 'FTHG', 1.5),
                'away_goals_avg': column_mean(away_matches, 'FTAG', 1.0),
                'home_conceded_avg': column_mean(home_matches, 'FTAG', 1.2),
                'away_conceded_avg': column_mean(away_matches, 'FTHG', 1.5),
                'home_wins': result_count(home_matches, 'H', 5),
                'away_wins': result_count(away_matches, 'A', 3),
                'matches_played': len(home_matches) + len(away_matches)
            }

    @staticmethod
    def _fuzzy_lookup(table, team):
        """Return the value of the first key that fuzzily matches *team*.

        A key matches when, ignoring case, it contains *team* or *team*
        contains it.  Blank names never match: an empty string is a
        substring of every key and would otherwise return an arbitrary
        entry.  Returns ``None`` when nothing matches.
        """
        team_lower = team.lower()
        if not team_lower.strip():
            return None
        for name, value in table.items():
            name_lower = name.lower()
            if not name_lower.strip():
                continue
            if name_lower in team_lower or team_lower in name_lower:
                return value
        return None

    @staticmethod
    def _finite_stat(stats, key, default):
        """Return ``stats[key]`` as a float, or *default* if missing or not finite.

        Cached JSON may contain ``NaN`` or ``null`` for teams with no home
        or no away matches; those must not reach the arithmetic below.
        """
        from math import isfinite

        try:
            value = float(stats.get(key, default))
        except (TypeError, ValueError):
            return default
        return value if isfinite(value) else default

    def get_elo(self, team: str) -> float:
        """Get the Elo rating for *team*, with fuzzy matching.

        Tries an exact key first, then a case-insensitive substring match
        in either direction, and finally falls back to 1500.0.
        """
        if team in self.elo_ratings:
            return self.elo_ratings[team]

        # Fuzzy match
        elo = self._fuzzy_lookup(self.elo_ratings, team)
        if elo is not None:
            return elo

        return 1500.0  # Default

    def get_team_encoding(self, team: str) -> int:
        """Get or create the integer code for *team*.

        Codes are handed out in first-seen order for this instance.
        NOTE(review): nothing visible here loads the encoding used during
        training - confirm these codes agree with it.
        """
        if team not in self.team_encodings:
            self.team_encodings[team] = len(self.team_encodings)
        return self.team_encodings[team]

    def get_team_stats(self, team: str) -> Dict:
        """Get the statistics dict for *team*, with fuzzy matching.

        Tries an exact key first, then a case-insensitive substring match
        in either direction, and finally returns a fresh copy of
        ``DEFAULT_TEAM_STATS``.
        """
        if team in self.team_stats:
            return self.team_stats[team]

        # Fuzzy match
        stats = self._fuzzy_lookup(self.team_stats, team)
        if stats is not None:
            return stats

        # Return sensible defaults (a copy, so callers cannot mutate them)
        return dict(self.DEFAULT_TEAM_STATS)

    def compute_poisson_probs(self, home_xg: float, away_xg: float) -> Tuple[float, float, float]:
        """Compute home-win / draw / away-win probabilities from two Poisson rates.

        Both score lines are modelled as independent Poisson variables
        truncated to 0-9 goals; the three outcome masses are renormalised
        so they sum to 1.

        Args:
            home_xg: Expected goals of the home side (Poisson rate).
            away_xg: Expected goals of the away side (Poisson rate).

        Returns:
            ``(home_win, draw, away_win)``.

        Raises:
            ValueError: If a rate is negative or not finite, or so large
                that the truncated grid carries no probability mass.
        """
        from math import exp, isfinite

        max_goals = 10

        def truncated_pmf(lam):
            if not isfinite(lam) or lam < 0:
                raise ValueError(f"Expected goals must be finite and non-negative, got {lam!r}")
            # P(k) = P(k-1) * lam / k avoids recomputing lam**k and k!.
            pmf = [exp(-lam)]
            for k in range(1, max_goals):
                pmf.append(pmf[-1] * lam / k)
            return pmf

        home_pmf = truncated_pmf(home_xg)
        away_pmf = truncated_pmf(away_xg)

        home_win = 0.0
        draw = 0.0
        away_win = 0.0
        for i, p_home in enumerate(home_pmf):
            for j, p_away in enumerate(away_pmf):
                prob = p_home * p_away
                if i > j:
                    home_win += prob
                elif i == j:
                    draw += prob
                else:
                    away_win += prob

        total = home_win + draw + away_win
        if total <= 0.0:
            raise ValueError(
                f"Expected goals too large for a {max_goals}-goal grid: {home_xg!r}, {away_xg!r}"
            )
        return home_win / total, draw / total, away_win / total

    def build_features(self, home_team: str, away_team: str, league: str = 'premier_league') -> np.ndarray:
        """Build the complete 153-feature vector for one fixture.

        Args:
            home_team: Home team name (exact or fuzzy match against the
                loaded Elo/stat tables).
            away_team: Away team name.
            league: League key; unknown keys are encoded as 0.

        Returns:
            ``float32`` array of shape ``(1, len(FEATURE_COLS))`` ordered as
            ``FEATURE_COLS``.  The result is deterministic for a given
            fixture and builder state.
        """
        import zlib

        stat = self._finite_stat
        features = {}

        # 1. Team Encodings (3 features)
        features['HomeTeamEnc'] = self.get_team_encoding(home_team)
        features['AwayTeamEnc'] = self.get_team_encoding(away_team)
        features['LeagueEnc'] = self.league_encodings.get(league, 0)

        # 2. Elo Ratings (6 features)
        home_elo = self.get_elo(home_team)
        away_elo = self.get_elo(away_team)
        features['HomeElo'] = home_elo
        features['AwayElo'] = away_elo
        features['EloDiff'] = home_elo - away_elo
        features['HomeEloNorm'] = (home_elo - 1000) / 1000
        features['AwayEloNorm'] = (away_elo - 1000) / 1000
        features['EloRatio'] = home_elo / away_elo if away_elo > 0 else 1.0

        # 3. Get team stats (missing / NaN entries fall back to defaults)
        home_stats = self.get_team_stats(home_team)
        away_stats = self.get_team_stats(away_team)
        home_goals_avg = stat(home_stats, 'home_goals_avg', 1.5)
        away_goals_avg = stat(away_stats, 'away_goals_avg', 1.0)
        home_conceded_avg = stat(home_stats, 'home_conceded_avg', 1.2)
        away_conceded_avg = stat(away_stats, 'away_conceded_avg', 1.5)

        # 4. Momentum & Streaks (11 features)
        features['HomeMomentum'] = stat(home_stats, 'home_wins', 5) / max(stat(home_stats, 'matches_played', 10), 1)
        features['AwayMomentum'] = stat(away_stats, 'away_wins', 3) / max(stat(away_stats, 'matches_played', 10), 1)
        features['MomentumDiff'] = features['HomeMomentum'] - features['AwayMomentum']
        # The streak features are proxies derived from win counts and goal
        # averages, not real streak lengths.
        features['HomeStreak'] = min(stat(home_stats, 'home_wins', 3), 5)
        features['AwayStreak'] = min(stat(away_stats, 'away_wins', 2), 5)
        features['HomeUnbeatenStreak'] = min(stat(home_stats, 'home_wins', 3) + 2, 8)
        features['AwayUnbeatenStreak'] = min(stat(away_stats, 'away_wins', 2) + 2, 8)
        features['HomeScoringStreak'] = min(int(home_goals_avg * 3), 10)
        features['AwayScoringStreak'] = min(int(away_goals_avg * 3), 10)
        features['HomeGoalsTrend'] = home_goals_avg - 1.3
        features['AwayGoalsTrend'] = away_goals_avg - 1.0

        # 5. H2H Stats (9 features) - Use reasonable defaults
        features['H2HHomeWinRate'] = 0.45
        features['H2HAwayWinRate'] = 0.30
        features['H2HDrawRate'] = 0.25
        features['H2HAvgGoals'] = 2.5
        features['H2HAvgHomeGoals'] = 1.4
        features['H2HAvgAwayGoals'] = 1.1
        features['H2HBTTSRate'] = 0.55
        features['H2HOver25Rate'] = 0.50
        features['H2HMatches'] = 10

        # 6. Expected Goals & Poisson (6 features)
        home_xg = home_goals_avg * 0.9 + 0.15
        away_xg = away_goals_avg * 0.9 + 0.1
        features['HomeExpGoals'] = home_xg
        features['AwayExpGoals'] = away_xg
        features['ExpTotalGoals'] = home_xg + away_xg

        poisson_h, poisson_d, poisson_a = self.compute_poisson_probs(home_xg, away_xg)
        features['PoissonHome'] = poisson_h
        features['PoissonDraw'] = poisson_d
        features['PoissonAway'] = poisson_a

        # 7. Form Features for windows 3, 5, 10, 15 (40 features)
        # Every window reuses the same aggregate stats, scaled by a factor
        # that shrinks as the window grows (1.0, 0.9, 0.65, 0.4).
        for window in [3, 5, 10, 15]:
            decay = 1.0 - (window - 3) * 0.05
            features[f'HomeForm{window}'] = features['HomeMomentum'] * decay
            features[f'AwayForm{window}'] = features['AwayMomentum'] * decay
            features[f'HomeGoalsAvg{window}'] = home_goals_avg * decay
            features[f'AwayGoalsAvg{window}'] = away_goals_avg * decay
            features[f'HomeConcededAvg{window}'] = home_conceded_avg * decay
            features[f'AwayConcededAvg{window}'] = away_conceded_avg * decay
            features[f'HomeAttackStrength{window}'] = features[f'HomeGoalsAvg{window}'] / 1.3
            features[f'AwayAttackStrength{window}'] = features[f'AwayGoalsAvg{window}'] / 1.1
            features[f'HomeDefenseStrength{window}'] = 1.3 / max(features[f'HomeConcededAvg{window}'], 0.5)
            features[f'AwayDefenseStrength{window}'] = 1.1 / max(features[f'AwayConcededAvg{window}'], 0.5)

        # 8. Goals Market Features (24 features)
        for window in [5, 10]:
            decay = 1.0 if window == 5 else 0.95
            features[f'HomeBTTSRate{window}'] = 0.55 * decay
            features[f'AwayBTTSRate{window}'] = 0.50 * decay
            features[f'HomeO15Rate{window}'] = 0.75 * decay
            features[f'AwayO15Rate{window}'] = 0.65 * decay
            features[f'HomeO25Rate{window}'] = 0.50 * decay
            features[f'AwayO25Rate{window}'] = 0.40 * decay
            features[f'HomeO35Rate{window}'] = 0.30 * decay
            features[f'AwayO35Rate{window}'] = 0.20 * decay
            features[f'HomeCSRate{window}'] = 0.30 * decay
            features[f'AwayCSRate{window}'] = 0.25 * decay
            features[f'HomeFTSRate{window}'] = 0.70 * decay
            features[f'AwayFTSRate{window}'] = 0.60 * decay

        # 9. Betting Odds Features (42 features) - Use implied from Elo
        # The +/-100 term shifts the Elo difference in favour of the home side.
        elo_home_prob = 1 / (1 + 10 ** ((away_elo - home_elo - 100) / 400))
        elo_away_prob = 1 / (1 + 10 ** ((home_elo - away_elo + 100) / 400))
        elo_draw_prob = max(0.15, 1 - elo_home_prob - elo_away_prob)

        # Normalize
        total = elo_home_prob + elo_draw_prob + elo_away_prob
        home_prob = elo_home_prob / total
        draw_prob = elo_draw_prob / total
        away_prob = elo_away_prob / total

        # Convert to odds (with margin)
        margin = 1.05
        home_odds = margin / max(home_prob, 0.05)
        draw_odds = margin / max(draw_prob, 0.05)
        away_odds = margin / max(away_prob, 0.05)

        # Per-bookmaker jitter is seeded from the fixture so that the same
        # match always yields the same features (and hence the same
        # prediction).  crc32 is used because the built-in hash() of a str
        # differs between interpreter runs.
        seed = zlib.crc32(f"{home_team}|{away_team}|{league}".encode("utf-8"))
        rng = np.random.default_rng(seed)

        for bookie in ['B365', 'BW', 'PS', 'WH', 'IW', 'VC', 'Avg']:
            # Up to +/-2% spread per bookmaker; the 'Avg' columns carry none.
            if bookie != 'Avg':
                jitter_h, jitter_d, jitter_a = rng.uniform(-0.02, 0.02, size=3)
            else:
                jitter_h = jitter_d = jitter_a = 0.0
            features[f'{bookie}H'] = home_odds + jitter_h * home_odds
            features[f'{bookie}D'] = draw_odds + jitter_d * draw_odds
            features[f'{bookie}A'] = away_odds + jitter_a * away_odds
            features[f'{bookie}_HomeProb'] = home_prob
            features[f'{bookie}_DrawProb'] = draw_prob
            features[f'{bookie}_AwayProb'] = away_prob

        # 10. Match Stats Features (12 features) - Use averages
        features['HS'] = 12  # Home shots
        features['AS'] = 10  # Away shots
        features['HST'] = 5  # Home shots on target
        features['AST'] = 4  # Away shots on target
        features['HF'] = 12  # Home fouls
        features['AF'] = 11  # Away fouls
        features['HC'] = 5  # Home corners
        features['AC'] = 4  # Away corners
        features['HY'] = 2  # Home yellow cards
        features['AY'] = 2  # Away yellow cards
        features['HR'] = 0  # Home red cards
        features['AR'] = 0  # Away red cards

        # Build ordered array (any column not filled above defaults to 0.0)
        feature_array = np.array([features.get(col, 0.0) for col in self.FEATURE_COLS], dtype=np.float32)

        return feature_array.reshape(1, -1)
333
+
334
+
335
# Global instance
# Created lazily by get_feature_builder() so that importing this module does
# not trigger the file loading done in the builder's constructor.
_builder: Optional[ComprehensiveFeatureBuilder] = None


def get_feature_builder() -> ComprehensiveFeatureBuilder:
    """Get or create the feature builder singleton.

    The first call constructs a ``ComprehensiveFeatureBuilder`` and stores
    it in the module-level ``_builder``; later calls return that same
    instance.
    """
    global _builder
    if _builder is None:
        _builder = ComprehensiveFeatureBuilder()
    return _builder
345
+
346
+
347
def build_match_features(home: str, away: str, league: str = 'premier_league') -> np.ndarray:
    """Build features for a match.

    Convenience wrapper that forwards to ``build_features`` on the shared
    builder returned by ``get_feature_builder()``.

    Args:
        home: Home team name.
        away: Away team name.
        league: League key passed through unchanged.

    Returns:
        Whatever ``build_features`` returns for these arguments.
    """
    return get_feature_builder().build_features(home, away, league)