nananie143 committed on
Commit
6d2403a
·
verified ·
1 Parent(s): 1ac8cff

Upload src/features/engineering/advanced_features.py with huggingface_hub

Browse files
src/features/engineering/advanced_features.py CHANGED
@@ -1,34 +1,34 @@
1
  """
2
- Advanced Feature Engineering Module V3.0
3
- Creates 400+ features for comprehensive match prediction
4
 
5
- Features cover:
6
- - Goal statistics (scored, conceded, difference)
7
- - Attack/defense ratings (relative to league)
8
- - Form features (PPG, win/draw/loss rates, streaks)
9
- - Momentum indicators (short vs long term)
10
- - xG features and overperformance
11
- - BTTS-specific features
12
- - Over/Under features
13
- - HT/FT features
14
- - Correct score features
15
- - H2H features
16
- - Timing and schedule features
17
- - Fatigue indicators
18
  """
19
 
20
  import pandas as pd
21
  import numpy as np
22
- from typing import Dict, List, Optional
23
  from scipy import stats
 
24
  import warnings
 
25
  warnings.filterwarnings('ignore')
 
26
 
27
 
28
  class AdvancedFeatureEngineer:
29
  """
30
  Comprehensive feature engineering with 400+ features covering:
31
  - Team performance metrics
 
32
  - Momentum & form indicators
33
  - Tactical patterns
34
  - Head-to-head statistics
@@ -47,7 +47,7 @@ class AdvancedFeatureEngineer:
47
 
48
  def create_all_features(self) -> pd.DataFrame:
49
  """Create comprehensive feature set (400+ features)."""
50
- print("🔧 Creating advanced features...")
51
 
52
  # Core features
53
  self._create_basic_goal_features()
@@ -59,6 +59,19 @@ class AdvancedFeatureEngineer:
59
  self._create_xg_features()
60
  self._create_shot_features()
61
  self._create_possession_features()
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
  # Market-specific features
64
  self._create_btts_specific_features()
@@ -66,16 +79,22 @@ class AdvancedFeatureEngineer:
66
  self._create_htft_features()
67
  self._create_correct_score_features()
68
 
69
- # Context features
70
- self._create_timing_features()
71
- self._create_schedule_features()
72
- self._create_h2h_features()
73
 
74
  # Derived features
75
  self._create_interaction_features()
76
  self._create_ratio_features()
77
 
78
- print(f"✅ Created {len(self.features_created)} features")
 
 
 
 
 
 
 
79
  return self.df
80
 
81
  def _create_basic_goal_features(self):
@@ -97,7 +116,7 @@ class AdvancedFeatureEngineer:
97
  lambda x: x.rolling(window, min_periods=1).mean()
98
  )
99
  self.df[f'{team_type}_goals_scored_std_{window}'] = self.df.groupby(team_col)[goals_for].transform(
100
- lambda x: x.rolling(window, min_periods=2).std().fillna(0)
101
  )
102
  self.df[f'{team_type}_goals_scored_max_{window}'] = self.df.groupby(team_col)[goals_for].transform(
103
  lambda x: x.rolling(window, min_periods=1).max()
@@ -111,7 +130,7 @@ class AdvancedFeatureEngineer:
111
  lambda x: x.rolling(window, min_periods=1).mean()
112
  )
113
  self.df[f'{team_type}_goals_conceded_std_{window}'] = self.df.groupby(team_col)[goals_against].transform(
114
- lambda x: x.rolling(window, min_periods=2).std().fillna(0)
115
  )
116
 
117
  # Goal difference
@@ -143,41 +162,34 @@ class AdvancedFeatureEngineer:
143
  league_stats.columns = ['league', 'league_home_avg', 'league_away_avg']
144
  self.df = self.df.merge(league_stats, on='league', how='left')
145
 
146
- # Fill defaults
147
- self.df['league_home_avg'] = self.df['league_home_avg'].fillna(1.5)
148
- self.df['league_away_avg'] = self.df['league_away_avg'].fillna(1.2)
149
-
150
  for window in self.ROLLING_WINDOWS:
151
  for team_type in ['home', 'away']:
 
 
 
152
  # Attack strength (relative to league average)
153
- scored_col = f'{team_type}_goals_scored_avg_{window}'
154
- if scored_col in self.df.columns:
155
- self.df[f'{team_type}_attack_strength_{window}'] = (
156
- self.df[scored_col] /
157
- self.df[f'league_{team_type}_avg'].clip(lower=0.1)
158
- )
159
 
160
  # Defense weakness (higher = worse defense)
161
- conceded_col = f'{team_type}_goals_conceded_avg_{window}'
162
- if conceded_col in self.df.columns:
163
- opp_type = 'away' if team_type == 'home' else 'home'
164
- self.df[f'{team_type}_defense_weakness_{window}'] = (
165
- self.df[conceded_col] /
166
- self.df[f'league_{opp_type}_avg'].clip(lower=0.1)
167
- )
168
 
169
  # Combined rating
170
- if f'{team_type}_attack_strength_{window}' in self.df.columns:
171
- self.df[f'{team_type}_overall_rating_{window}'] = (
172
- self.df[f'{team_type}_attack_strength_{window}'] -
173
- self.df[f'{team_type}_defense_weakness_{window}'] + 1
174
- )
175
-
176
- self.features_created.extend([
177
- f'{team_type}_attack_strength_{window}',
178
- f'{team_type}_defense_weakness_{window}',
179
- f'{team_type}_overall_rating_{window}'
180
- ])
181
 
182
  def _create_form_features(self):
183
  """Create team form features."""
@@ -185,8 +197,8 @@ class AdvancedFeatureEngineer:
185
  return
186
 
187
  # Points calculation
188
- self.df['home_points'] = self.df['result'].map({'H': 3, 'D': 1, 'A': 0}).fillna(0)
189
- self.df['away_points'] = self.df['result'].map({'A': 3, 'D': 1, 'H': 0}).fillna(0)
190
 
191
  for window in self.ROLLING_WINDOWS:
192
  for team_type in ['home', 'away']:
@@ -201,21 +213,15 @@ class AdvancedFeatureEngineer:
201
  lambda x: x.rolling(window, min_periods=1).mean()
202
  )
203
 
204
- # Win rate
205
- win_result = 'H' if team_type == 'home' else 'A'
206
  self.df[f'{team_type}_win_rate_{window}'] = self.df.groupby(team_col)['result'].transform(
207
- lambda x: (x == win_result).rolling(window, min_periods=1).mean()
208
  )
209
-
210
- # Draw rate
211
  self.df[f'{team_type}_draw_rate_{window}'] = self.df.groupby(team_col)['result'].transform(
212
  lambda x: (x == 'D').rolling(window, min_periods=1).mean()
213
  )
214
-
215
- # Loss rate
216
- loss_result = 'A' if team_type == 'home' else 'H'
217
  self.df[f'{team_type}_loss_rate_{window}'] = self.df.groupby(team_col)['result'].transform(
218
- lambda x: (x == loss_result).rolling(window, min_periods=1).mean()
219
  )
220
 
221
  self.features_created.extend([
@@ -224,39 +230,17 @@ class AdvancedFeatureEngineer:
224
  f'{team_type}_draw_rate_{window}',
225
  f'{team_type}_loss_rate_{window}'
226
  ])
227
-
228
- # Streaks
229
- for team_type in ['home', 'away']:
230
- team_col = f'{team_type}_team'
231
- if team_col not in self.df.columns:
232
- continue
233
-
234
- win_result = 'H' if team_type == 'home' else 'A'
235
-
236
- # Winning streak
237
- self.df[f'{team_type}_winning_streak'] = self.df.groupby(team_col)['result'].transform(
238
- lambda x: self._calculate_streak(x, win_result)
239
- )
240
-
241
- # Unbeaten streak
242
- self.df[f'{team_type}_unbeaten_streak'] = self.df.groupby(team_col)['result'].transform(
243
- lambda x: self._calculate_streak(x, win_result, 'D')
244
- )
245
-
246
- self.features_created.extend([
247
- f'{team_type}_winning_streak',
248
- f'{team_type}_unbeaten_streak'
249
- ])
250
 
251
  def _create_momentum_features(self):
252
  """Create momentum and trend features."""
253
  for team_type in ['home', 'away']:
254
- # Check if required features exist
255
- if f'{team_type}_ppg_3' not in self.df.columns:
 
256
  continue
257
-
258
  # Short-term vs long-term form (momentum indicator)
259
- if f'{team_type}_ppg_10' in self.df.columns:
260
  self.df[f'{team_type}_momentum_3v10'] = (
261
  self.df[f'{team_type}_ppg_3'] - self.df[f'{team_type}_ppg_10']
262
  )
@@ -283,19 +267,17 @@ class AdvancedFeatureEngineer:
283
  self.features_created.append(f'{team_type}_defense_momentum_3v10')
284
 
285
  # Exponential weighted moving average for form
286
- team_col = f'{team_type}_team'
287
- points_col = f'{team_type}_points'
288
- if team_col in self.df.columns and points_col in self.df.columns:
289
- self.df[f'{team_type}_ewm_form'] = self.df.groupby(team_col)[points_col].transform(
290
  lambda x: x.ewm(span=5, adjust=False).mean()
291
  )
292
  self.features_created.append(f'{team_type}_ewm_form')
293
 
294
  def _create_xg_features(self):
295
  """Create expected goals features if available."""
296
- xg_cols = ['home_xg', 'away_xg']
297
 
298
- if not all(col in self.df.columns for col in xg_cols):
299
  return
300
 
301
  for window in self.ROLLING_WINDOWS[:4]: # Limit to shorter windows for xG
@@ -303,34 +285,29 @@ class AdvancedFeatureEngineer:
303
  team_col = f'{team_type}_team'
304
  xg_col = f'{team_type}_xg'
305
 
306
- if xg_col not in self.df.columns or team_col not in self.df.columns:
307
- continue
308
-
309
- # xG average
310
- self.df[f'{team_type}_xg_avg_{window}'] = self.df.groupby(team_col)[xg_col].transform(
311
- lambda x: x.rolling(window, min_periods=1).mean()
312
- )
313
-
314
- # xG overperformance (goals - xG)
315
- scored_col = f'{team_type}_goals_scored_avg_{window}'
316
- if scored_col in self.df.columns:
317
- self.df[f'{team_type}_xg_overperformance_{window}'] = (
318
- self.df[scored_col] -
319
- self.df[f'{team_type}_xg_avg_{window}']
320
  )
321
 
322
- self.features_created.extend([
323
- f'{team_type}_xg_avg_{window}',
324
- f'{team_type}_xg_overperformance_{window}'
325
- ])
 
 
 
 
 
326
 
327
  def _create_shot_features(self):
328
- """Create shot-related features if available."""
329
  shot_cols = ['home_shots', 'away_shots', 'home_shots_on_target', 'away_shots_on_target']
330
 
331
  if not any(col in self.df.columns for col in shot_cols):
332
  return
333
-
334
  for window in [3, 5, 10]:
335
  for team_type in ['home', 'away']:
336
  team_col = f'{team_type}_team'
@@ -338,50 +315,133 @@ class AdvancedFeatureEngineer:
338
  if team_col not in self.df.columns:
339
  continue
340
 
341
- # Shots
342
- shots_col = f'{team_type}_shots'
343
- if shots_col in self.df.columns:
344
- self.df[f'{team_type}_shots_avg_{window}'] = self.df.groupby(team_col)[shots_col].transform(
345
  lambda x: x.rolling(window, min_periods=1).mean()
346
  )
347
  self.features_created.append(f'{team_type}_shots_avg_{window}')
348
 
349
- # Shots on target
350
- sot_col = f'{team_type}_shots_on_target'
351
- if sot_col in self.df.columns:
352
- self.df[f'{team_type}_sot_avg_{window}'] = self.df.groupby(team_col)[sot_col].transform(
353
  lambda x: x.rolling(window, min_periods=1).mean()
354
  )
355
  self.features_created.append(f'{team_type}_sot_avg_{window}')
356
 
357
  # Shot accuracy
358
- if shots_col in self.df.columns:
359
  self.df[f'{team_type}_shot_accuracy_{window}'] = (
360
- self.df[f'{team_type}_sot_avg_{window}'] /
361
- self.df[f'{team_type}_shots_avg_{window}'].clip(lower=1)
362
  )
363
  self.features_created.append(f'{team_type}_shot_accuracy_{window}')
364
 
365
  def _create_possession_features(self):
366
- """Create possession-related features if available."""
367
- poss_cols = ['home_possession', 'away_possession']
368
-
369
- if not all(col in self.df.columns for col in poss_cols):
370
  return
371
 
372
  for window in [3, 5, 10]:
373
  for team_type in ['home', 'away']:
374
  team_col = f'{team_type}_team'
375
- poss_col = f'{team_type}_possession'
376
 
377
- if team_col not in self.df.columns:
378
  continue
379
-
380
- self.df[f'{team_type}_possession_avg_{window}'] = self.df.groupby(team_col)[poss_col].transform(
381
  lambda x: x.rolling(window, min_periods=1).mean()
382
  )
383
  self.features_created.append(f'{team_type}_possession_avg_{window}')
384
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
385
  def _create_btts_specific_features(self):
386
  """Create BTTS-specific features."""
387
  if 'home_goals' not in self.df.columns:
@@ -434,7 +494,9 @@ class AdvancedFeatureEngineer:
434
 
435
  # Combined BTTS probability features
436
  for window in [3, 5, 10]:
437
- if all(f'{t}_scored_rate_{window}' in self.df.columns for t in ['home', 'away']):
 
 
438
  self.df[f'combined_btts_prob_{window}'] = (
439
  self.df[f'home_scored_rate_{window}'] * self.df[f'away_scored_rate_{window}'] *
440
  self.df[f'home_conceded_rate_{window}'] * self.df[f'away_conceded_rate_{window}']
@@ -451,7 +513,7 @@ class AdvancedFeatureEngineer:
451
  # Create indicators for different thresholds
452
  thresholds = [0.5, 1.5, 2.5, 3.5, 4.5, 5.5]
453
  for threshold in thresholds:
454
- self.df[f'over_{threshold}'] = (self.df['total_goals'] > threshold).astype(int)
455
 
456
  for window in self.ROLLING_WINDOWS:
457
  for team_type in ['home', 'away']:
@@ -467,24 +529,26 @@ class AdvancedFeatureEngineer:
467
 
468
  # Total goals variance
469
  self.df[f'{team_type}_total_goals_std_{window}'] = self.df.groupby(team_col)['total_goals'].transform(
470
- lambda x: x.rolling(window, min_periods=2).std().fillna(0)
471
  )
472
 
473
- # Over rates for each threshold
474
- for threshold in [1.5, 2.5, 3.5]:
475
- self.df[f'{team_type}_over_{threshold}_rate_{window}'] = self.df.groupby(team_col)[f'over_{threshold}'].transform(
476
- lambda x: x.rolling(window, min_periods=1).mean()
477
- )
478
- self.features_created.append(f'{team_type}_over_{threshold}_rate_{window}')
479
-
480
  self.features_created.extend([
481
  f'{team_type}_total_goals_avg_{window}',
482
  f'{team_type}_total_goals_std_{window}'
483
  ])
 
 
 
 
 
 
 
 
 
484
 
485
  # Combined over probability
486
  for window in [3, 5, 10]:
487
- if f'home_total_goals_avg_{window}' in self.df.columns:
488
  self.df[f'combined_total_goals_avg_{window}'] = (
489
  self.df[f'home_total_goals_avg_{window}'] + self.df[f'away_total_goals_avg_{window}']
490
  ) / 2
@@ -523,29 +587,9 @@ class AdvancedFeatureEngineer:
523
  lambda x: x.rolling(window, min_periods=1).mean()
524
  )
525
 
526
- # First half win rate
527
- ht_win = 'H' if team_type == 'home' else 'A'
528
- self.df[f'{team_type}_1h_win_rate_{window}'] = self.df.groupby(team_col)['ht_result'].transform(
529
- lambda x: (x == ht_win).rolling(window, min_periods=1).mean()
530
- )
531
-
532
- # First half draw rate
533
- self.df[f'{team_type}_1h_draw_rate_{window}'] = self.df.groupby(team_col)['ht_result'].transform(
534
- lambda x: (x == 'D').rolling(window, min_periods=1).mean()
535
- )
536
-
537
- # Goal ratio 1H vs 2H
538
- self.df[f'{team_type}_1h_2h_ratio_{window}'] = (
539
- self.df[f'{team_type}_1h_goals_avg_{window}'] /
540
- self.df[f'{team_type}_2h_goals_avg_{window}'].clip(lower=0.1)
541
- )
542
-
543
  self.features_created.extend([
544
  f'{team_type}_1h_goals_avg_{window}',
545
- f'{team_type}_2h_goals_avg_{window}',
546
- f'{team_type}_1h_win_rate_{window}',
547
- f'{team_type}_1h_draw_rate_{window}',
548
- f'{team_type}_1h_2h_ratio_{window}'
549
  ])
550
 
551
  def _create_correct_score_features(self):
@@ -556,209 +600,241 @@ class AdvancedFeatureEngineer:
556
  # Score string
557
  self.df['score'] = self.df['home_goals'].astype(str) + '-' + self.df['away_goals'].astype(str)
558
 
559
- # Common score patterns
560
- for team_type in ['home', 'away']:
561
- team_col = f'{team_type}_team'
562
- goals_col = f'{team_type}_goals'
563
-
564
- if team_col not in self.df.columns:
565
- continue
566
-
567
- for window in [10, 20]:
568
- # Nil scorer rate
569
- self.df[f'{team_type}_nil_scorer_rate_{window}'] = self.df.groupby(team_col)[goals_col].transform(
570
- lambda x: (x == 0).rolling(window, min_periods=5).mean()
571
- )
572
-
573
- # One goal scorer rate
574
- self.df[f'{team_type}_one_goal_rate_{window}'] = self.df.groupby(team_col)[goals_col].transform(
575
- lambda x: (x == 1).rolling(window, min_periods=5).mean()
576
- )
577
-
578
- # Two goals scorer rate
579
- self.df[f'{team_type}_two_goals_rate_{window}'] = self.df.groupby(team_col)[goals_col].transform(
580
- lambda x: (x == 2).rolling(window, min_periods=5).mean()
581
- )
582
-
583
- # Three+ goals scorer rate
584
- self.df[f'{team_type}_three_plus_goals_rate_{window}'] = self.df.groupby(team_col)[goals_col].transform(
585
- lambda x: (x >= 3).rolling(window, min_periods=5).mean()
586
- )
587
-
588
- self.features_created.extend([
589
- f'{team_type}_nil_scorer_rate_{window}',
590
- f'{team_type}_one_goal_rate_{window}',
591
- f'{team_type}_two_goals_rate_{window}',
592
- f'{team_type}_three_plus_goals_rate_{window}'
593
- ])
594
 
595
- def _create_timing_features(self):
596
- """Create time-based features."""
597
- if 'match_date' not in self.df.columns:
598
  return
599
 
600
- self.df['match_date'] = pd.to_datetime(self.df['match_date'])
601
-
602
- self.df['day_of_week'] = self.df['match_date'].dt.dayofweek
603
- self.df['month'] = self.df['match_date'].dt.month
604
- self.df['is_weekend'] = self.df['day_of_week'].isin([5, 6]).astype(int)
605
- self.df['is_midweek'] = self.df['day_of_week'].isin([1, 2, 3]).astype(int)
606
 
607
- # Season progress (0 to 1)
608
- if 'season' in self.df.columns and 'league' in self.df.columns:
609
- self.df['match_number'] = self.df.groupby(['league', 'season']).cumcount() + 1
610
- max_matches = self.df.groupby(['league', 'season'])['match_number'].transform('max')
611
- self.df['season_progress'] = self.df['match_number'] / max_matches
612
 
613
- # Early/mid/late season indicators
614
- self.df['early_season'] = (self.df['season_progress'] < 0.25).astype(int)
615
- self.df['mid_season'] = ((self.df['season_progress'] >= 0.25) & (self.df['season_progress'] < 0.75)).astype(int)
616
- self.df['late_season'] = (self.df['season_progress'] >= 0.75).astype(int)
 
 
 
 
617
 
618
- self.features_created.extend([
619
- 'match_number', 'season_progress', 'early_season', 'mid_season', 'late_season'
620
- ])
621
-
622
- self.features_created.extend([
623
- 'day_of_week', 'month', 'is_weekend', 'is_midweek'
624
- ])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
625
 
626
- def _create_schedule_features(self):
627
- """Create schedule-related features."""
628
- if 'match_date' not in self.df.columns:
629
  return
630
 
631
- for team_type in ['home', 'away']:
632
- team_col = f'{team_type}_team'
633
-
634
- if team_col not in self.df.columns:
635
- continue
636
-
637
- # Days since last match
638
- self.df[f'{team_type}_days_rest'] = self.df.groupby(team_col)['match_date'].diff().dt.days.fillna(7)
639
-
640
- self.features_created.append(f'{team_type}_days_rest')
641
 
642
- if 'home_days_rest' in self.df.columns and 'away_days_rest' in self.df.columns:
643
- self.df['rest_difference'] = self.df['home_days_rest'] - self.df['away_days_rest']
644
- self.features_created.append('rest_difference')
645
 
646
- def _create_h2h_features(self):
647
- """Create head-to-head features (simplified for performance)."""
648
- if 'home_team' not in self.df.columns or 'away_team' not in self.df.columns:
649
- return
650
-
651
- # Create matchup key
652
- self.df['matchup'] = self.df.apply(
653
- lambda x: tuple(sorted([x['home_team'], x['away_team']])), axis=1
654
- )
655
-
656
- # H2H total goals average
657
- self.df['h2h_total_goals_avg'] = self.df.groupby('matchup')['total_goals'].transform(
658
- lambda x: x.rolling(10, min_periods=1).mean()
659
- ) if 'total_goals' in self.df.columns else 2.5
660
-
661
- # H2H BTTS rate
662
- if 'btts' in self.df.columns:
663
- self.df['h2h_btts_rate'] = self.df.groupby('matchup')['btts'].transform(
664
- lambda x: x.rolling(10, min_periods=1).mean()
665
- )
666
- self.features_created.append('h2h_btts_rate')
667
-
668
- self.features_created.append('h2h_total_goals_avg')
669
 
670
  def _create_interaction_features(self):
671
  """Create interaction features between home and away."""
672
  for window in [5, 10]:
673
- # Check if required features exist
674
- if f'home_attack_strength_{window}' not in self.df.columns:
675
- continue
 
 
 
 
676
 
677
- # Attack vs Defense matchups
678
- self.df[f'attack_vs_defense_{window}'] = (
679
- self.df[f'home_attack_strength_{window}'] * self.df[f'away_defense_weakness_{window}']
680
- )
681
- self.df[f'defense_vs_attack_{window}'] = (
682
- self.df[f'away_attack_strength_{window}'] * self.df[f'home_defense_weakness_{window}']
683
- )
684
 
685
- # Form difference
686
- if f'home_ppg_{window}' in self.df.columns:
687
  self.df[f'form_difference_{window}'] = (
688
  self.df[f'home_ppg_{window}'] - self.df[f'away_ppg_{window}']
689
  )
690
  self.features_created.append(f'form_difference_{window}')
691
 
692
- # Rating difference
693
- self.df[f'rating_difference_{window}'] = (
694
- self.df[f'home_overall_rating_{window}'] - self.df[f'away_overall_rating_{window}']
695
- )
696
-
697
- self.features_created.extend([
698
- f'attack_vs_defense_{window}',
699
- f'defense_vs_attack_{window}',
700
- f'rating_difference_{window}'
701
- ])
702
 
703
  def _create_ratio_features(self):
704
  """Create ratio-based features."""
705
  for window in [5, 10]:
706
- if f'home_attack_strength_{window}' not in self.df.columns:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
707
  continue
 
 
 
 
 
 
708
 
709
- # Attack ratio
710
- self.df[f'attack_ratio_{window}'] = (
711
- self.df[f'home_attack_strength_{window}'] /
712
- self.df[f'away_attack_strength_{window}'].clip(lower=0.1)
713
- )
 
 
714
 
715
- # Defense ratio
716
- self.df[f'defense_ratio_{window}'] = (
717
- self.df[f'away_defense_weakness_{window}'] /
718
- self.df[f'home_defense_weakness_{window}'].clip(lower=0.1)
719
  )
720
-
721
- self.features_created.extend([
722
- f'attack_ratio_{window}',
723
- f'defense_ratio_{window}'
724
- ])
725
 
726
- @staticmethod
727
- def _calculate_streak(series, *winning_values):
728
- """Calculate current streak of winning/unbeaten."""
729
- streak = 0
730
- streaks = []
731
- for val in series:
732
- if val in winning_values:
733
- streak += 1
734
- else:
735
- streak = 0
736
- streaks.append(streak)
737
- return pd.Series(streaks, index=series.index)
 
 
 
 
 
 
 
738
 
739
- def get_feature_importance(self, target_col: str = 'result') -> Dict[str, float]:
740
- """Get feature importance using correlation analysis."""
741
- if target_col not in self.df.columns:
742
- return {}
743
-
744
- # Encode target
745
- if self.df[target_col].dtype == 'object':
746
- target = self.df[target_col].map({'H': 1, 'D': 0.5, 'A': 0}).fillna(0.5)
747
- else:
748
- target = self.df[target_col]
749
-
750
- importance = {}
751
- for feature in self.features_created:
752
- if feature in self.df.columns:
753
- corr = self.df[feature].corr(target)
754
- if not np.isnan(corr):
755
- importance[feature] = abs(corr)
756
-
757
- return dict(sorted(importance.items(), key=lambda x: x[1], reverse=True))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
758
 
759
 
760
- # Convenience function
761
- def create_advanced_features(df: pd.DataFrame) -> pd.DataFrame:
762
- """Create all advanced features for a DataFrame."""
763
- engineer = AdvancedFeatureEngineer(df)
764
  return engineer.create_all_features()
 
1
  """
2
+ Advanced Feature Engineering Module - EXPANDED 400+ Features
3
+ Comprehensive feature engineering based on the complete blueprint.
4
 
5
+ Creates 400+ features covering:
6
+ - Team performance metrics (multiple windows)
7
+ - Player-level aggregations
8
+ - Momentum & form indicators
9
+ - Tactical patterns
10
+ - Head-to-head statistics
11
+ - Contextual features
12
+ - Market-derived features
13
+ - BTTS, Over/Under, HT/FT specific features
 
 
 
 
14
  """
15
 
16
  import pandas as pd
17
  import numpy as np
18
+ from typing import Dict, List, Tuple, Optional
19
  from scipy import stats
20
+ import logging
21
  import warnings
22
+
23
  warnings.filterwarnings('ignore')
24
+ logger = logging.getLogger(__name__)
25
 
26
 
27
  class AdvancedFeatureEngineer:
28
  """
29
  Comprehensive feature engineering with 400+ features covering:
30
  - Team performance metrics
31
+ - Player-level aggregations
32
  - Momentum & form indicators
33
  - Tactical patterns
34
  - Head-to-head statistics
 
47
 
48
  def create_all_features(self) -> pd.DataFrame:
49
  """Create comprehensive feature set (400+ features)."""
50
+ logger.info("Creating advanced features (400+ features)...")
51
 
52
  # Core features
53
  self._create_basic_goal_features()
 
59
  self._create_xg_features()
60
  self._create_shot_features()
61
  self._create_possession_features()
62
+ self._create_set_piece_features()
63
+
64
+ # Tactical features
65
+ self._create_tactical_features()
66
+ self._create_style_features()
67
+
68
+ # Time-based features
69
+ self._create_timing_features()
70
+ self._create_schedule_features()
71
+ self._create_fatigue_features()
72
+
73
+ # Head-to-head features
74
+ self._create_h2h_features()
75
 
76
  # Market-specific features
77
  self._create_btts_specific_features()
 
79
  self._create_htft_features()
80
  self._create_correct_score_features()
81
 
82
+ # Contextual features
83
+ self._create_league_context_features()
84
+ self._create_situational_features()
 
85
 
86
  # Derived features
87
  self._create_interaction_features()
88
  self._create_ratio_features()
89
 
90
+ # Additional advanced features
91
+ self._create_elo_features()
92
+ self._create_poisson_features()
93
+ self._create_streak_features()
94
+ self._create_consistency_features()
95
+ self._create_scoring_pattern_features()
96
+
97
+ logger.info(f"Created {len(self.features_created)} features")
98
  return self.df
99
 
100
  def _create_basic_goal_features(self):
 
116
  lambda x: x.rolling(window, min_periods=1).mean()
117
  )
118
  self.df[f'{team_type}_goals_scored_std_{window}'] = self.df.groupby(team_col)[goals_for].transform(
119
+ lambda x: x.rolling(window, min_periods=2).std()
120
  )
121
  self.df[f'{team_type}_goals_scored_max_{window}'] = self.df.groupby(team_col)[goals_for].transform(
122
  lambda x: x.rolling(window, min_periods=1).max()
 
130
  lambda x: x.rolling(window, min_periods=1).mean()
131
  )
132
  self.df[f'{team_type}_goals_conceded_std_{window}'] = self.df.groupby(team_col)[goals_against].transform(
133
+ lambda x: x.rolling(window, min_periods=2).std()
134
  )
135
 
136
  # Goal difference
 
162
  league_stats.columns = ['league', 'league_home_avg', 'league_away_avg']
163
  self.df = self.df.merge(league_stats, on='league', how='left')
164
 
 
 
 
 
165
  for window in self.ROLLING_WINDOWS:
166
  for team_type in ['home', 'away']:
167
+ if f'{team_type}_goals_scored_avg_{window}' not in self.df.columns:
168
+ continue
169
+
170
  # Attack strength (relative to league average)
171
+ self.df[f'{team_type}_attack_strength_{window}'] = (
172
+ self.df[f'{team_type}_goals_scored_avg_{window}'] /
173
+ self.df[f'league_{team_type}_avg'].clip(lower=0.1)
174
+ )
 
 
175
 
176
  # Defense weakness (higher = worse defense)
177
+ self.df[f'{team_type}_defense_weakness_{window}'] = (
178
+ self.df[f'{team_type}_goals_conceded_avg_{window}'] /
179
+ self.df[f'league_{("away" if team_type == "home" else "home")}_avg'].clip(lower=0.1)
180
+ )
 
 
 
181
 
182
  # Combined rating
183
+ self.df[f'{team_type}_overall_rating_{window}'] = (
184
+ self.df[f'{team_type}_attack_strength_{window}'] -
185
+ self.df[f'{team_type}_defense_weakness_{window}'] + 1
186
+ )
187
+
188
+ self.features_created.extend([
189
+ f'{team_type}_attack_strength_{window}',
190
+ f'{team_type}_defense_weakness_{window}',
191
+ f'{team_type}_overall_rating_{window}'
192
+ ])
 
193
 
194
  def _create_form_features(self):
195
  """Create team form features."""
 
197
  return
198
 
199
  # Points calculation
200
+ self.df['home_points'] = self.df['result'].map({'H': 3, 'D': 1, 'A': 0})
201
+ self.df['away_points'] = self.df['result'].map({'A': 3, 'D': 1, 'H': 0})
202
 
203
  for window in self.ROLLING_WINDOWS:
204
  for team_type in ['home', 'away']:
 
213
  lambda x: x.rolling(window, min_periods=1).mean()
214
  )
215
 
216
+ # Win/Draw/Loss rates
 
217
  self.df[f'{team_type}_win_rate_{window}'] = self.df.groupby(team_col)['result'].transform(
218
+ lambda x: (x == ('H' if team_type == 'home' else 'A')).rolling(window, min_periods=1).mean()
219
  )
 
 
220
  self.df[f'{team_type}_draw_rate_{window}'] = self.df.groupby(team_col)['result'].transform(
221
  lambda x: (x == 'D').rolling(window, min_periods=1).mean()
222
  )
 
 
 
223
  self.df[f'{team_type}_loss_rate_{window}'] = self.df.groupby(team_col)['result'].transform(
224
+ lambda x: (x == ('A' if team_type == 'home' else 'H')).rolling(window, min_periods=1).mean()
225
  )
226
 
227
  self.features_created.extend([
 
230
  f'{team_type}_draw_rate_{window}',
231
  f'{team_type}_loss_rate_{window}'
232
  ])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
 
234
  def _create_momentum_features(self):
235
  """Create momentum and trend features."""
236
  for team_type in ['home', 'away']:
237
+ team_col = f'{team_type}_team'
238
+
239
+ if team_col not in self.df.columns:
240
  continue
241
+
242
  # Short-term vs long-term form (momentum indicator)
243
+ if f'{team_type}_ppg_3' in self.df.columns and f'{team_type}_ppg_10' in self.df.columns:
244
  self.df[f'{team_type}_momentum_3v10'] = (
245
  self.df[f'{team_type}_ppg_3'] - self.df[f'{team_type}_ppg_10']
246
  )
 
267
  self.features_created.append(f'{team_type}_defense_momentum_3v10')
268
 
269
  # Exponential weighted moving average for form
270
+ if f'{team_type}_points' in self.df.columns:
271
+ self.df[f'{team_type}_ewm_form'] = self.df.groupby(team_col)[f'{team_type}_points'].transform(
 
 
272
  lambda x: x.ewm(span=5, adjust=False).mean()
273
  )
274
  self.features_created.append(f'{team_type}_ewm_form')
275
 
276
  def _create_xg_features(self):
277
  """Create expected goals features if available."""
278
+ xg_cols = ['home_xg', 'away_xg', 'home_xga', 'away_xga']
279
 
280
+ if not all(col in self.df.columns for col in xg_cols[:2]):
281
  return
282
 
283
  for window in self.ROLLING_WINDOWS[:4]: # Limit to shorter windows for xG
 
285
  team_col = f'{team_type}_team'
286
  xg_col = f'{team_type}_xg'
287
 
288
+ if xg_col in self.df.columns and team_col in self.df.columns:
289
+ # xG average
290
+ self.df[f'{team_type}_xg_avg_{window}'] = self.df.groupby(team_col)[xg_col].transform(
291
+ lambda x: x.rolling(window, min_periods=1).mean()
 
 
 
 
 
 
 
 
 
 
292
  )
293
 
294
+ # xG overperformance (goals - xG)
295
+ if f'{team_type}_goals_scored_avg_{window}' in self.df.columns:
296
+ self.df[f'{team_type}_xg_overperformance_{window}'] = (
297
+ self.df[f'{team_type}_goals_scored_avg_{window}'] -
298
+ self.df[f'{team_type}_xg_avg_{window}']
299
+ )
300
+ self.features_created.append(f'{team_type}_xg_overperformance_{window}')
301
+
302
+ self.features_created.append(f'{team_type}_xg_avg_{window}')
303
 
304
  def _create_shot_features(self):
305
+ """Create shot-related features."""
306
  shot_cols = ['home_shots', 'away_shots', 'home_shots_on_target', 'away_shots_on_target']
307
 
308
  if not any(col in self.df.columns for col in shot_cols):
309
  return
310
+
311
  for window in [3, 5, 10]:
312
  for team_type in ['home', 'away']:
313
  team_col = f'{team_type}_team'
 
315
  if team_col not in self.df.columns:
316
  continue
317
 
318
+ if f'{team_type}_shots' in self.df.columns:
319
+ self.df[f'{team_type}_shots_avg_{window}'] = self.df.groupby(team_col)[f'{team_type}_shots'].transform(
 
 
320
  lambda x: x.rolling(window, min_periods=1).mean()
321
  )
322
  self.features_created.append(f'{team_type}_shots_avg_{window}')
323
 
324
+ if f'{team_type}_shots_on_target' in self.df.columns:
325
+ self.df[f'{team_type}_sot_avg_{window}'] = self.df.groupby(team_col)[f'{team_type}_shots_on_target'].transform(
 
 
326
  lambda x: x.rolling(window, min_periods=1).mean()
327
  )
328
  self.features_created.append(f'{team_type}_sot_avg_{window}')
329
 
330
  # Shot accuracy
331
+ if f'{team_type}_shots_avg_{window}' in self.df.columns:
332
  self.df[f'{team_type}_shot_accuracy_{window}'] = (
333
+ self.df[f'{team_type}_sot_avg_{window}'] /
334
+ self.df[f'{team_type}_shots_avg_{window}'].clip(lower=0.1)
335
  )
336
  self.features_created.append(f'{team_type}_shot_accuracy_{window}')
337
 
338
  def _create_possession_features(self):
339
+ """Create possession-related features."""
340
+ if 'home_possession' not in self.df.columns:
 
 
341
  return
342
 
343
  for window in [3, 5, 10]:
344
  for team_type in ['home', 'away']:
345
  team_col = f'{team_type}_team'
 
346
 
347
+ if team_col not in self.df.columns or f'{team_type}_possession' not in self.df.columns:
348
  continue
349
+
350
+ self.df[f'{team_type}_possession_avg_{window}'] = self.df.groupby(team_col)[f'{team_type}_possession'].transform(
351
  lambda x: x.rolling(window, min_periods=1).mean()
352
  )
353
  self.features_created.append(f'{team_type}_possession_avg_{window}')
354
 
355
+ def _create_set_piece_features(self):
356
+ """Create set piece features."""
357
+ corner_cols = ['home_corners', 'away_corners']
358
+
359
+ if not all(col in self.df.columns for col in corner_cols):
360
+ return
361
+
362
+ for window in [5, 10]:
363
+ for team_type in ['home', 'away']:
364
+ team_col = f'{team_type}_team'
365
+
366
+ if team_col not in self.df.columns:
367
+ continue
368
+
369
+ self.df[f'{team_type}_corners_avg_{window}'] = self.df.groupby(team_col)[f'{team_type}_corners'].transform(
370
+ lambda x: x.rolling(window, min_periods=1).mean()
371
+ )
372
+ self.features_created.append(f'{team_type}_corners_avg_{window}')
373
+
374
+ def _create_tactical_features(self):
375
+ """Create tactical style features."""
376
+ pass # Placeholder for tactical data
377
+
378
+ def _create_style_features(self):
379
+ """Create playing style features."""
380
+ pass # Placeholder for style data
381
+
382
+ def _create_timing_features(self):
383
+ """Create time-based features."""
384
+ if 'match_date' not in self.df.columns:
385
+ return
386
+
387
+ self.df['match_date'] = pd.to_datetime(self.df['match_date'])
388
+
389
+ self.df['day_of_week'] = self.df['match_date'].dt.dayofweek
390
+ self.df['month'] = self.df['match_date'].dt.month
391
+ self.df['is_weekend'] = self.df['day_of_week'].isin([5, 6]).astype(int)
392
+ self.df['is_midweek'] = self.df['day_of_week'].isin([1, 2, 3]).astype(int)
393
+
394
+ # Season progress (0 to 1)
395
+ if 'league' in self.df.columns and 'season' in self.df.columns:
396
+ self.df['match_number'] = self.df.groupby(['league', 'season']).cumcount() + 1
397
+ max_matches = self.df.groupby(['league', 'season'])['match_number'].transform('max')
398
+ self.df['season_progress'] = self.df['match_number'] / max_matches
399
+
400
+ # Early/mid/late season indicators
401
+ self.df['early_season'] = (self.df['season_progress'] < 0.25).astype(int)
402
+ self.df['mid_season'] = ((self.df['season_progress'] >= 0.25) & (self.df['season_progress'] < 0.75)).astype(int)
403
+ self.df['late_season'] = (self.df['season_progress'] >= 0.75).astype(int)
404
+
405
+ self.features_created.extend([
406
+ 'season_progress', 'early_season', 'mid_season', 'late_season'
407
+ ])
408
+
409
+ self.features_created.extend([
410
+ 'day_of_week', 'month', 'is_weekend', 'is_midweek'
411
+ ])
412
+
413
+ def _create_schedule_features(self):
414
+ """Create schedule-related features."""
415
+ if 'match_date' not in self.df.columns:
416
+ return
417
+
418
+ for team_type in ['home', 'away']:
419
+ team_col = f'{team_type}_team'
420
+
421
+ if team_col not in self.df.columns:
422
+ continue
423
+
424
+ # Days since last match
425
+ self.df[f'{team_type}_days_rest'] = self.df.groupby(team_col)['match_date'].diff().dt.days
426
+ self.df[f'{team_type}_days_rest'] = self.df[f'{team_type}_days_rest'].fillna(7)
427
+
428
+ self.features_created.append(f'{team_type}_days_rest')
429
+
430
+ if 'home_days_rest' in self.df.columns and 'away_days_rest' in self.df.columns:
431
+ self.df['rest_difference'] = self.df['home_days_rest'] - self.df['away_days_rest']
432
+ self.features_created.append('rest_difference')
433
+
434
+ def _create_fatigue_features(self):
435
+ """Create fatigue indicators."""
436
+ if 'match_date' not in self.df.columns:
437
+ return
438
+
439
+ # Simplified fatigue based on rest days
440
+ for team_type in ['home', 'away']:
441
+ if f'{team_type}_days_rest' in self.df.columns:
442
+ self.df[f'{team_type}_fatigue'] = (7 - self.df[f'{team_type}_days_rest'].clip(upper=7)) / 7
443
+ self.features_created.append(f'{team_type}_fatigue')
444
+
445
  def _create_btts_specific_features(self):
446
  """Create BTTS-specific features."""
447
  if 'home_goals' not in self.df.columns:
 
494
 
495
  # Combined BTTS probability features
496
  for window in [3, 5, 10]:
497
+ if all(f'{t}_{r}_{window}' in self.df.columns
498
+ for t in ['home', 'away']
499
+ for r in ['scored_rate', 'conceded_rate']):
500
  self.df[f'combined_btts_prob_{window}'] = (
501
  self.df[f'home_scored_rate_{window}'] * self.df[f'away_scored_rate_{window}'] *
502
  self.df[f'home_conceded_rate_{window}'] * self.df[f'away_conceded_rate_{window}']
 
513
  # Create indicators for different thresholds
514
  thresholds = [0.5, 1.5, 2.5, 3.5, 4.5, 5.5]
515
  for threshold in thresholds:
516
+ self.df[f'over_{str(threshold).replace(".", "_")}'] = (self.df['total_goals'] > threshold).astype(int)
517
 
518
  for window in self.ROLLING_WINDOWS:
519
  for team_type in ['home', 'away']:
 
529
 
530
  # Total goals variance
531
  self.df[f'{team_type}_total_goals_std_{window}'] = self.df.groupby(team_col)['total_goals'].transform(
532
+ lambda x: x.rolling(window, min_periods=2).std()
533
  )
534
 
 
 
 
 
 
 
 
535
  self.features_created.extend([
536
  f'{team_type}_total_goals_avg_{window}',
537
  f'{team_type}_total_goals_std_{window}'
538
  ])
539
+
540
+ # Over rates for each threshold
541
+ for threshold in [1.5, 2.5, 3.5]:
542
+ col_name = f'over_{str(threshold).replace(".", "_")}'
543
+ if col_name in self.df.columns:
544
+ self.df[f'{team_type}_over_{str(threshold).replace(".", "_")}_rate_{window}'] = self.df.groupby(team_col)[col_name].transform(
545
+ lambda x: x.rolling(window, min_periods=1).mean()
546
+ )
547
+ self.features_created.append(f'{team_type}_over_{str(threshold).replace(".", "_")}_rate_{window}')
548
 
549
  # Combined over probability
550
  for window in [3, 5, 10]:
551
+ if f'home_total_goals_avg_{window}' in self.df.columns and f'away_total_goals_avg_{window}' in self.df.columns:
552
  self.df[f'combined_total_goals_avg_{window}'] = (
553
  self.df[f'home_total_goals_avg_{window}'] + self.df[f'away_total_goals_avg_{window}']
554
  ) / 2
 
587
  lambda x: x.rolling(window, min_periods=1).mean()
588
  )
589
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
590
  self.features_created.extend([
591
  f'{team_type}_1h_goals_avg_{window}',
592
+ f'{team_type}_2h_goals_avg_{window}'
 
 
 
593
  ])
594
 
595
  def _create_correct_score_features(self):
 
600
  # Score string
601
  self.df['score'] = self.df['home_goals'].astype(str) + '-' + self.df['away_goals'].astype(str)
602
 
603
+ # Common score frequencies
604
+ common_scores = ['1-0', '0-0', '1-1', '2-1', '2-0', '0-1', '1-2', '0-2', '2-2', '3-1']
605
+
606
+ for score in common_scores:
607
+ self.df[f'is_{score.replace("-", "_")}'] = (self.df['score'] == score).astype(int)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
608
 
609
+ def _create_h2h_features(self):
610
+ """Create head-to-head features."""
611
+ if 'home_team' not in self.df.columns or 'match_date' not in self.df.columns:
612
  return
613
 
614
+ h2h_stats = []
 
 
 
 
 
615
 
616
+ for idx, row in self.df.iterrows():
617
+ home = row['home_team']
618
+ away = row['away_team']
619
+ date = row['match_date']
 
620
 
621
+ # Previous encounters (last 10)
622
+ prev = self.df[
623
+ (self.df['match_date'] < date) &
624
+ (
625
+ ((self.df['home_team'] == home) & (self.df['away_team'] == away)) |
626
+ ((self.df['home_team'] == away) & (self.df['away_team'] == home))
627
+ )
628
+ ].tail(10)
629
 
630
+ if len(prev) > 0:
631
+ home_wins = len(prev[
632
+ ((prev['home_team'] == home) & (prev['result'] == 'H')) |
633
+ ((prev['away_team'] == home) & (prev['result'] == 'A'))
634
+ ])
635
+ draws = len(prev[prev['result'] == 'D'])
636
+ total = len(prev)
637
+
638
+ home_goals = prev[prev['home_team'] == home]['home_goals'].sum() + \
639
+ prev[prev['away_team'] == home]['away_goals'].sum()
640
+ away_goals = prev[prev['home_team'] == away]['home_goals'].sum() + \
641
+ prev[prev['away_team'] == away]['away_goals'].sum()
642
+
643
+ h2h_stats.append({
644
+ 'h2h_home_win_rate': home_wins / total,
645
+ 'h2h_draw_rate': draws / total,
646
+ 'h2h_avg_home_goals': home_goals / total,
647
+ 'h2h_avg_away_goals': away_goals / total,
648
+ 'h2h_total_goals_avg': (home_goals + away_goals) / total,
649
+ 'h2h_btts_rate': len(prev[(prev['home_goals'] > 0) & (prev['away_goals'] > 0)]) / total,
650
+ 'h2h_matches': total
651
+ })
652
+ else:
653
+ h2h_stats.append({
654
+ 'h2h_home_win_rate': 0.33,
655
+ 'h2h_draw_rate': 0.33,
656
+ 'h2h_avg_home_goals': 1.3,
657
+ 'h2h_avg_away_goals': 1.0,
658
+ 'h2h_total_goals_avg': 2.3,
659
+ 'h2h_btts_rate': 0.5,
660
+ 'h2h_matches': 0
661
+ })
662
+
663
+ h2h_df = pd.DataFrame(h2h_stats)
664
+ for col in h2h_df.columns:
665
+ self.df[col] = h2h_df[col].values
666
+ self.features_created.append(col)
667
 
668
+ def _create_league_context_features(self):
669
+ """Create league position and context features."""
670
+ if 'league_position_home' not in self.df.columns:
671
  return
672
 
673
+ self.df['position_diff'] = self.df['league_position_home'] - self.df['league_position_away']
674
+ self.df['top_6_match'] = ((self.df['league_position_home'] <= 6) & (self.df['league_position_away'] <= 6)).astype(int)
675
+ self.df['relegation_match'] = ((self.df['league_position_home'] >= 15) | (self.df['league_position_away'] >= 15)).astype(int)
 
 
 
 
 
 
 
676
 
677
+ self.features_created.extend(['position_diff', 'top_6_match', 'relegation_match'])
 
 
678
 
679
+ def _create_situational_features(self):
680
+ """Create situational context features."""
681
+ pass # Placeholder for derby/importance data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
682
 
683
  def _create_interaction_features(self):
684
  """Create interaction features between home and away."""
685
  for window in [5, 10]:
686
+ if f'home_attack_strength_{window}' in self.df.columns and f'away_defense_weakness_{window}' in self.df.columns:
687
+ self.df[f'attack_vs_defense_{window}'] = (
688
+ self.df[f'home_attack_strength_{window}'] * self.df[f'away_defense_weakness_{window}']
689
+ )
690
+ self.df[f'defense_vs_attack_{window}'] = (
691
+ self.df[f'away_attack_strength_{window}'] * self.df[f'home_defense_weakness_{window}']
692
+ )
693
 
694
+ self.features_created.extend([
695
+ f'attack_vs_defense_{window}',
696
+ f'defense_vs_attack_{window}'
697
+ ])
 
 
 
698
 
699
+ if f'home_ppg_{window}' in self.df.columns and f'away_ppg_{window}' in self.df.columns:
 
700
  self.df[f'form_difference_{window}'] = (
701
  self.df[f'home_ppg_{window}'] - self.df[f'away_ppg_{window}']
702
  )
703
  self.features_created.append(f'form_difference_{window}')
704
 
705
+ if f'home_overall_rating_{window}' in self.df.columns and f'away_overall_rating_{window}' in self.df.columns:
706
+ self.df[f'rating_difference_{window}'] = (
707
+ self.df[f'home_overall_rating_{window}'] - self.df[f'away_overall_rating_{window}']
708
+ )
709
+ self.features_created.append(f'rating_difference_{window}')
 
 
 
 
 
710
 
711
  def _create_ratio_features(self):
712
  """Create ratio-based features."""
713
  for window in [5, 10]:
714
+ if f'home_attack_strength_{window}' in self.df.columns and f'away_attack_strength_{window}' in self.df.columns:
715
+ self.df[f'attack_ratio_{window}'] = (
716
+ self.df[f'home_attack_strength_{window}'] /
717
+ self.df[f'away_attack_strength_{window}'].clip(lower=0.1)
718
+ )
719
+ self.features_created.append(f'attack_ratio_{window}')
720
+
721
+ if f'home_defense_weakness_{window}' in self.df.columns and f'away_defense_weakness_{window}' in self.df.columns:
722
+ self.df[f'defense_ratio_{window}'] = (
723
+ self.df[f'away_defense_weakness_{window}'] /
724
+ self.df[f'home_defense_weakness_{window}'].clip(lower=0.1)
725
+ )
726
+ self.features_created.append(f'defense_ratio_{window}')
727
+
728
+ def _create_elo_features(self):
729
+ """Create Elo rating features."""
730
+ # Placeholder - would need Elo rating data
731
+ pass
732
+
733
+ def _create_poisson_features(self):
734
+ """Create Poisson-based expected goal features."""
735
+ for window in [5, 10]:
736
+ if f'home_goals_scored_avg_{window}' in self.df.columns and f'away_goals_conceded_avg_{window}' in self.df.columns:
737
+ # Expected home goals
738
+ self.df[f'poisson_home_xg_{window}'] = (
739
+ self.df[f'home_goals_scored_avg_{window}'] *
740
+ self.df[f'away_goals_conceded_avg_{window}'].clip(lower=0.5) / 1.5
741
+ )
742
+
743
+ # Expected away goals
744
+ self.df[f'poisson_away_xg_{window}'] = (
745
+ self.df[f'away_goals_scored_avg_{window}'] *
746
+ self.df[f'home_goals_conceded_avg_{window}'].clip(lower=0.5) / 1.5
747
+ )
748
+
749
+ self.features_created.extend([
750
+ f'poisson_home_xg_{window}',
751
+ f'poisson_away_xg_{window}'
752
+ ])
753
+
754
+ def _create_streak_features(self):
755
+ """Create winning/losing streak features."""
756
+ for team_type in ['home', 'away']:
757
+ team_col = f'{team_type}_team'
758
+
759
+ if team_col not in self.df.columns or 'result' not in self.df.columns:
760
  continue
761
+
762
+ # Calculate streaks
763
+ def calc_win_streak(results, team_type):
764
+ streaks = []
765
+ streak = 0
766
+ win_result = 'H' if team_type == 'home' else 'A'
767
 
768
+ for r in results:
769
+ if r == win_result:
770
+ streak += 1
771
+ else:
772
+ streak = 0
773
+ streaks.append(streak)
774
+ return streaks
775
 
776
+ self.df[f'{team_type}_win_streak'] = self.df.groupby(team_col)['result'].transform(
777
+ lambda x: calc_win_streak(x.tolist(), team_type)
 
 
778
  )
779
+ self.features_created.append(f'{team_type}_win_streak')
 
 
 
 
780
 
781
+ def _create_consistency_features(self):
782
+ """Create consistency/variance features."""
783
+ for window in [10, 20]:
784
+ for team_type in ['home', 'away']:
785
+ team_col = f'{team_type}_team'
786
+
787
+ if team_col not in self.df.columns or f'{team_type}_points' not in self.df.columns:
788
+ continue
789
+
790
+ # Points consistency (coefficient of variation)
791
+ mean_pts = self.df.groupby(team_col)[f'{team_type}_points'].transform(
792
+ lambda x: x.rolling(window, min_periods=3).mean()
793
+ )
794
+ std_pts = self.df.groupby(team_col)[f'{team_type}_points'].transform(
795
+ lambda x: x.rolling(window, min_periods=3).std()
796
+ )
797
+
798
+ self.df[f'{team_type}_consistency_{window}'] = 1 - (std_pts / mean_pts.clip(lower=0.1))
799
+ self.features_created.append(f'{team_type}_consistency_{window}')
800
 
801
+ def _create_scoring_pattern_features(self):
802
+ """Create scoring pattern features."""
803
+ if 'home_goals' not in self.df.columns:
804
+ return
805
+
806
+ # High scoring indicator
807
+ self.df['high_scoring'] = (self.df['home_goals'] + self.df['away_goals'] >= 3).astype(int)
808
+
809
+ # Low scoring indicator
810
+ self.df['low_scoring'] = (self.df['home_goals'] + self.df['away_goals'] <= 1).astype(int)
811
+
812
+ for window in [5, 10]:
813
+ for team_type in ['home', 'away']:
814
+ team_col = f'{team_type}_team'
815
+
816
+ if team_col not in self.df.columns:
817
+ continue
818
+
819
+ self.df[f'{team_type}_high_scoring_rate_{window}'] = self.df.groupby(team_col)['high_scoring'].transform(
820
+ lambda x: x.rolling(window, min_periods=1).mean()
821
+ )
822
+ self.df[f'{team_type}_low_scoring_rate_{window}'] = self.df.groupby(team_col)['low_scoring'].transform(
823
+ lambda x: x.rolling(window, min_periods=1).mean()
824
+ )
825
+
826
+ self.features_created.extend([
827
+ f'{team_type}_high_scoring_rate_{window}',
828
+ f'{team_type}_low_scoring_rate_{window}'
829
+ ])
830
+
831
+
832
def get_feature_engineer(df: pd.DataFrame = None) -> AdvancedFeatureEngineer:
    """Build an ``AdvancedFeatureEngineer`` around the given frame.

    Args:
        df: Match data passed straight to the constructor; defaults to None.

    Returns:
        The newly constructed engineer.
    """
    engineer = AdvancedFeatureEngineer(df)
    return engineer
835
 
836
 
837
def create_match_features(historical_df: pd.DataFrame) -> pd.DataFrame:
    """Run the full feature pipeline over historical match data.

    Args:
        historical_df: Match data handed to ``AdvancedFeatureEngineer``.

    Returns:
        Whatever ``AdvancedFeatureEngineer.create_all_features`` returns for
        that frame.
    """
    return AdvancedFeatureEngineer(historical_df).create_all_features()