Spaces:
Runtime error
Runtime error
| """ | |
| Advanced Feature Engineering Module - EXPANDED 400+ Features | |
| Comprehensive feature engineering based on the complete blueprint. | |
| Creates 400+ features covering: | |
| - Team performance metrics (multiple windows) | |
| - Player-level aggregations | |
| - Momentum & form indicators | |
| - Tactical patterns | |
| - Head-to-head statistics | |
| - Contextual features | |
| - Market-derived features | |
| - BTTS, Over/Under, HT/FT specific features | |
| """ | |
| import pandas as pd | |
| import numpy as np | |
| from typing import Dict, List, Tuple, Optional | |
| from scipy import stats | |
| import logging | |
| import warnings | |
| warnings.filterwarnings('ignore') | |
| logger = logging.getLogger(__name__) | |
| class AdvancedFeatureEngineer: | |
| """ | |
| Comprehensive feature engineering with 400+ features covering: | |
| - Team performance metrics | |
| - Player-level aggregations | |
| - Momentum & form indicators | |
| - Tactical patterns | |
| - Head-to-head statistics | |
| - Contextual features | |
| - Market-derived features | |
| """ | |
| ROLLING_WINDOWS = [3, 5, 10, 15, 20, 38] # Various lookback periods | |
| def __init__(self, df: pd.DataFrame = None): | |
| self.df = df.copy() if df is not None else pd.DataFrame() | |
| if len(self.df) > 0: | |
| if 'match_date' in self.df.columns: | |
| self.df = self.df.sort_values('match_date').reset_index(drop=True) | |
| self.features_created = [] | |
| def create_all_features(self) -> pd.DataFrame: | |
| """Create comprehensive feature set (400+ features).""" | |
| logger.info("Creating advanced features (400+ features)...") | |
| # Core features | |
| self._create_basic_goal_features() | |
| self._create_attack_defense_ratings() | |
| self._create_form_features() | |
| self._create_momentum_features() | |
| # Advanced features | |
| self._create_xg_features() | |
| self._create_shot_features() | |
| self._create_possession_features() | |
| self._create_set_piece_features() | |
| # Tactical features | |
| self._create_tactical_features() | |
| self._create_style_features() | |
| # Time-based features | |
| self._create_timing_features() | |
| self._create_schedule_features() | |
| self._create_fatigue_features() | |
| # Head-to-head features | |
| self._create_h2h_features() | |
| # Market-specific features | |
| self._create_btts_specific_features() | |
| self._create_over_under_features() | |
| self._create_htft_features() | |
| self._create_correct_score_features() | |
| # Contextual features | |
| self._create_league_context_features() | |
| self._create_situational_features() | |
| # Derived features | |
| self._create_interaction_features() | |
| self._create_ratio_features() | |
| # Additional advanced features | |
| self._create_elo_features() | |
| self._create_poisson_features() | |
| self._create_streak_features() | |
| self._create_consistency_features() | |
| self._create_scoring_pattern_features() | |
| logger.info(f"Created {len(self.features_created)} features") | |
| return self.df | |
| def _create_basic_goal_features(self): | |
| """Create basic goal-related features.""" | |
| if 'home_goals' not in self.df.columns: | |
| return | |
| for window in self.ROLLING_WINDOWS: | |
| for team_type in ['home', 'away']: | |
| team_col = f'{team_type}_team' | |
| goals_for = f'{team_type}_goals' | |
| goals_against = 'away_goals' if team_type == 'home' else 'home_goals' | |
| if team_col not in self.df.columns: | |
| continue | |
| # Goals scored statistics | |
| self.df[f'{team_type}_goals_scored_avg_{window}'] = self.df.groupby(team_col)[goals_for].transform( | |
| lambda x: x.rolling(window, min_periods=1).mean() | |
| ) | |
| self.df[f'{team_type}_goals_scored_std_{window}'] = self.df.groupby(team_col)[goals_for].transform( | |
| lambda x: x.rolling(window, min_periods=2).std() | |
| ) | |
| self.df[f'{team_type}_goals_scored_max_{window}'] = self.df.groupby(team_col)[goals_for].transform( | |
| lambda x: x.rolling(window, min_periods=1).max() | |
| ) | |
| self.df[f'{team_type}_goals_scored_min_{window}'] = self.df.groupby(team_col)[goals_for].transform( | |
| lambda x: x.rolling(window, min_periods=1).min() | |
| ) | |
| # Goals conceded statistics | |
| self.df[f'{team_type}_goals_conceded_avg_{window}'] = self.df.groupby(team_col)[goals_against].transform( | |
| lambda x: x.rolling(window, min_periods=1).mean() | |
| ) | |
| self.df[f'{team_type}_goals_conceded_std_{window}'] = self.df.groupby(team_col)[goals_against].transform( | |
| lambda x: x.rolling(window, min_periods=2).std() | |
| ) | |
| # Goal difference | |
| self.df[f'{team_type}_goal_diff_avg_{window}'] = ( | |
| self.df[f'{team_type}_goals_scored_avg_{window}'] - | |
| self.df[f'{team_type}_goals_conceded_avg_{window}'] | |
| ) | |
| self.features_created.extend([ | |
| f'{team_type}_goals_scored_avg_{window}', | |
| f'{team_type}_goals_scored_std_{window}', | |
| f'{team_type}_goals_scored_max_{window}', | |
| f'{team_type}_goals_scored_min_{window}', | |
| f'{team_type}_goals_conceded_avg_{window}', | |
| f'{team_type}_goals_conceded_std_{window}', | |
| f'{team_type}_goal_diff_avg_{window}' | |
| ]) | |
| def _create_attack_defense_ratings(self): | |
| """Create attack and defense strength ratings.""" | |
| if 'league' not in self.df.columns or 'home_goals' not in self.df.columns: | |
| return | |
| # League averages | |
| league_stats = self.df.groupby('league').agg({ | |
| 'home_goals': 'mean', | |
| 'away_goals': 'mean' | |
| }).reset_index() | |
| league_stats.columns = ['league', 'league_home_avg', 'league_away_avg'] | |
| self.df = self.df.merge(league_stats, on='league', how='left') | |
| for window in self.ROLLING_WINDOWS: | |
| for team_type in ['home', 'away']: | |
| if f'{team_type}_goals_scored_avg_{window}' not in self.df.columns: | |
| continue | |
| # Attack strength (relative to league average) | |
| self.df[f'{team_type}_attack_strength_{window}'] = ( | |
| self.df[f'{team_type}_goals_scored_avg_{window}'] / | |
| self.df[f'league_{team_type}_avg'].clip(lower=0.1) | |
| ) | |
| # Defense weakness (higher = worse defense) | |
| self.df[f'{team_type}_defense_weakness_{window}'] = ( | |
| self.df[f'{team_type}_goals_conceded_avg_{window}'] / | |
| self.df[f'league_{("away" if team_type == "home" else "home")}_avg'].clip(lower=0.1) | |
| ) | |
| # Combined rating | |
| self.df[f'{team_type}_overall_rating_{window}'] = ( | |
| self.df[f'{team_type}_attack_strength_{window}'] - | |
| self.df[f'{team_type}_defense_weakness_{window}'] + 1 | |
| ) | |
| self.features_created.extend([ | |
| f'{team_type}_attack_strength_{window}', | |
| f'{team_type}_defense_weakness_{window}', | |
| f'{team_type}_overall_rating_{window}' | |
| ]) | |
| def _create_form_features(self): | |
| """Create team form features.""" | |
| if 'result' not in self.df.columns: | |
| return | |
| # Points calculation | |
| self.df['home_points'] = self.df['result'].map({'H': 3, 'D': 1, 'A': 0}) | |
| self.df['away_points'] = self.df['result'].map({'A': 3, 'D': 1, 'H': 0}) | |
| for window in self.ROLLING_WINDOWS: | |
| for team_type in ['home', 'away']: | |
| team_col = f'{team_type}_team' | |
| points_col = f'{team_type}_points' | |
| if team_col not in self.df.columns: | |
| continue | |
| # Points per game | |
| self.df[f'{team_type}_ppg_{window}'] = self.df.groupby(team_col)[points_col].transform( | |
| lambda x: x.rolling(window, min_periods=1).mean() | |
| ) | |
| # Win/Draw/Loss rates | |
| self.df[f'{team_type}_win_rate_{window}'] = self.df.groupby(team_col)['result'].transform( | |
| lambda x: (x == ('H' if team_type == 'home' else 'A')).rolling(window, min_periods=1).mean() | |
| ) | |
| self.df[f'{team_type}_draw_rate_{window}'] = self.df.groupby(team_col)['result'].transform( | |
| lambda x: (x == 'D').rolling(window, min_periods=1).mean() | |
| ) | |
| self.df[f'{team_type}_loss_rate_{window}'] = self.df.groupby(team_col)['result'].transform( | |
| lambda x: (x == ('A' if team_type == 'home' else 'H')).rolling(window, min_periods=1).mean() | |
| ) | |
| self.features_created.extend([ | |
| f'{team_type}_ppg_{window}', | |
| f'{team_type}_win_rate_{window}', | |
| f'{team_type}_draw_rate_{window}', | |
| f'{team_type}_loss_rate_{window}' | |
| ]) | |
| def _create_momentum_features(self): | |
| """Create momentum and trend features.""" | |
| for team_type in ['home', 'away']: | |
| team_col = f'{team_type}_team' | |
| if team_col not in self.df.columns: | |
| continue | |
| # Short-term vs long-term form (momentum indicator) | |
| if f'{team_type}_ppg_3' in self.df.columns and f'{team_type}_ppg_10' in self.df.columns: | |
| self.df[f'{team_type}_momentum_3v10'] = ( | |
| self.df[f'{team_type}_ppg_3'] - self.df[f'{team_type}_ppg_10'] | |
| ) | |
| self.features_created.append(f'{team_type}_momentum_3v10') | |
| if f'{team_type}_ppg_5' in self.df.columns and f'{team_type}_ppg_20' in self.df.columns: | |
| self.df[f'{team_type}_momentum_5v20'] = ( | |
| self.df[f'{team_type}_ppg_5'] - self.df[f'{team_type}_ppg_20'] | |
| ) | |
| self.features_created.append(f'{team_type}_momentum_5v20') | |
| # Goal scoring momentum | |
| if f'{team_type}_goals_scored_avg_3' in self.df.columns and f'{team_type}_goals_scored_avg_10' in self.df.columns: | |
| self.df[f'{team_type}_scoring_momentum_3v10'] = ( | |
| self.df[f'{team_type}_goals_scored_avg_3'] - self.df[f'{team_type}_goals_scored_avg_10'] | |
| ) | |
| self.features_created.append(f'{team_type}_scoring_momentum_3v10') | |
| # Defense momentum | |
| if f'{team_type}_goals_conceded_avg_3' in self.df.columns and f'{team_type}_goals_conceded_avg_10' in self.df.columns: | |
| self.df[f'{team_type}_defense_momentum_3v10'] = ( | |
| self.df[f'{team_type}_goals_conceded_avg_10'] - self.df[f'{team_type}_goals_conceded_avg_3'] | |
| ) | |
| self.features_created.append(f'{team_type}_defense_momentum_3v10') | |
| # Exponential weighted moving average for form | |
| if f'{team_type}_points' in self.df.columns: | |
| self.df[f'{team_type}_ewm_form'] = self.df.groupby(team_col)[f'{team_type}_points'].transform( | |
| lambda x: x.ewm(span=5, adjust=False).mean() | |
| ) | |
| self.features_created.append(f'{team_type}_ewm_form') | |
| def _create_xg_features(self): | |
| """Create expected goals features if available.""" | |
| xg_cols = ['home_xg', 'away_xg', 'home_xga', 'away_xga'] | |
| if not all(col in self.df.columns for col in xg_cols[:2]): | |
| return | |
| for window in self.ROLLING_WINDOWS[:4]: # Limit to shorter windows for xG | |
| for team_type in ['home', 'away']: | |
| team_col = f'{team_type}_team' | |
| xg_col = f'{team_type}_xg' | |
| if xg_col in self.df.columns and team_col in self.df.columns: | |
| # xG average | |
| self.df[f'{team_type}_xg_avg_{window}'] = self.df.groupby(team_col)[xg_col].transform( | |
| lambda x: x.rolling(window, min_periods=1).mean() | |
| ) | |
| # xG overperformance (goals - xG) | |
| if f'{team_type}_goals_scored_avg_{window}' in self.df.columns: | |
| self.df[f'{team_type}_xg_overperformance_{window}'] = ( | |
| self.df[f'{team_type}_goals_scored_avg_{window}'] - | |
| self.df[f'{team_type}_xg_avg_{window}'] | |
| ) | |
| self.features_created.append(f'{team_type}_xg_overperformance_{window}') | |
| self.features_created.append(f'{team_type}_xg_avg_{window}') | |
| def _create_shot_features(self): | |
| """Create shot-related features.""" | |
| shot_cols = ['home_shots', 'away_shots', 'home_shots_on_target', 'away_shots_on_target'] | |
| if not any(col in self.df.columns for col in shot_cols): | |
| return | |
| for window in [3, 5, 10]: | |
| for team_type in ['home', 'away']: | |
| team_col = f'{team_type}_team' | |
| if team_col not in self.df.columns: | |
| continue | |
| if f'{team_type}_shots' in self.df.columns: | |
| self.df[f'{team_type}_shots_avg_{window}'] = self.df.groupby(team_col)[f'{team_type}_shots'].transform( | |
| lambda x: x.rolling(window, min_periods=1).mean() | |
| ) | |
| self.features_created.append(f'{team_type}_shots_avg_{window}') | |
| if f'{team_type}_shots_on_target' in self.df.columns: | |
| self.df[f'{team_type}_sot_avg_{window}'] = self.df.groupby(team_col)[f'{team_type}_shots_on_target'].transform( | |
| lambda x: x.rolling(window, min_periods=1).mean() | |
| ) | |
| self.features_created.append(f'{team_type}_sot_avg_{window}') | |
| # Shot accuracy | |
| if f'{team_type}_shots_avg_{window}' in self.df.columns: | |
| self.df[f'{team_type}_shot_accuracy_{window}'] = ( | |
| self.df[f'{team_type}_sot_avg_{window}'] / | |
| self.df[f'{team_type}_shots_avg_{window}'].clip(lower=0.1) | |
| ) | |
| self.features_created.append(f'{team_type}_shot_accuracy_{window}') | |
| def _create_possession_features(self): | |
| """Create possession-related features.""" | |
| if 'home_possession' not in self.df.columns: | |
| return | |
| for window in [3, 5, 10]: | |
| for team_type in ['home', 'away']: | |
| team_col = f'{team_type}_team' | |
| if team_col not in self.df.columns or f'{team_type}_possession' not in self.df.columns: | |
| continue | |
| self.df[f'{team_type}_possession_avg_{window}'] = self.df.groupby(team_col)[f'{team_type}_possession'].transform( | |
| lambda x: x.rolling(window, min_periods=1).mean() | |
| ) | |
| self.features_created.append(f'{team_type}_possession_avg_{window}') | |
| def _create_set_piece_features(self): | |
| """Create set piece features.""" | |
| corner_cols = ['home_corners', 'away_corners'] | |
| if not all(col in self.df.columns for col in corner_cols): | |
| return | |
| for window in [5, 10]: | |
| for team_type in ['home', 'away']: | |
| team_col = f'{team_type}_team' | |
| if team_col not in self.df.columns: | |
| continue | |
| self.df[f'{team_type}_corners_avg_{window}'] = self.df.groupby(team_col)[f'{team_type}_corners'].transform( | |
| lambda x: x.rolling(window, min_periods=1).mean() | |
| ) | |
| self.features_created.append(f'{team_type}_corners_avg_{window}') | |
| def _create_tactical_features(self): | |
| """Create tactical style features.""" | |
| pass # Placeholder for tactical data | |
| def _create_style_features(self): | |
| """Create playing style features.""" | |
| pass # Placeholder for style data | |
| def _create_timing_features(self): | |
| """Create time-based features.""" | |
| if 'match_date' not in self.df.columns: | |
| return | |
| self.df['match_date'] = pd.to_datetime(self.df['match_date']) | |
| self.df['day_of_week'] = self.df['match_date'].dt.dayofweek | |
| self.df['month'] = self.df['match_date'].dt.month | |
| self.df['is_weekend'] = self.df['day_of_week'].isin([5, 6]).astype(int) | |
| self.df['is_midweek'] = self.df['day_of_week'].isin([1, 2, 3]).astype(int) | |
| # Season progress (0 to 1) | |
| if 'league' in self.df.columns and 'season' in self.df.columns: | |
| self.df['match_number'] = self.df.groupby(['league', 'season']).cumcount() + 1 | |
| max_matches = self.df.groupby(['league', 'season'])['match_number'].transform('max') | |
| self.df['season_progress'] = self.df['match_number'] / max_matches | |
| # Early/mid/late season indicators | |
| self.df['early_season'] = (self.df['season_progress'] < 0.25).astype(int) | |
| self.df['mid_season'] = ((self.df['season_progress'] >= 0.25) & (self.df['season_progress'] < 0.75)).astype(int) | |
| self.df['late_season'] = (self.df['season_progress'] >= 0.75).astype(int) | |
| self.features_created.extend([ | |
| 'season_progress', 'early_season', 'mid_season', 'late_season' | |
| ]) | |
| self.features_created.extend([ | |
| 'day_of_week', 'month', 'is_weekend', 'is_midweek' | |
| ]) | |
| def _create_schedule_features(self): | |
| """Create schedule-related features.""" | |
| if 'match_date' not in self.df.columns: | |
| return | |
| for team_type in ['home', 'away']: | |
| team_col = f'{team_type}_team' | |
| if team_col not in self.df.columns: | |
| continue | |
| # Days since last match | |
| self.df[f'{team_type}_days_rest'] = self.df.groupby(team_col)['match_date'].diff().dt.days | |
| self.df[f'{team_type}_days_rest'] = self.df[f'{team_type}_days_rest'].fillna(7) | |
| self.features_created.append(f'{team_type}_days_rest') | |
| if 'home_days_rest' in self.df.columns and 'away_days_rest' in self.df.columns: | |
| self.df['rest_difference'] = self.df['home_days_rest'] - self.df['away_days_rest'] | |
| self.features_created.append('rest_difference') | |
| def _create_fatigue_features(self): | |
| """Create fatigue indicators.""" | |
| if 'match_date' not in self.df.columns: | |
| return | |
| # Simplified fatigue based on rest days | |
| for team_type in ['home', 'away']: | |
| if f'{team_type}_days_rest' in self.df.columns: | |
| self.df[f'{team_type}_fatigue'] = (7 - self.df[f'{team_type}_days_rest'].clip(upper=7)) / 7 | |
| self.features_created.append(f'{team_type}_fatigue') | |
| def _create_btts_specific_features(self): | |
| """Create BTTS-specific features.""" | |
| if 'home_goals' not in self.df.columns: | |
| return | |
| # BTTS indicator | |
| self.df['btts'] = ((self.df['home_goals'] > 0) & (self.df['away_goals'] > 0)).astype(int) | |
| for window in self.ROLLING_WINDOWS: | |
| for team_type in ['home', 'away']: | |
| team_col = f'{team_type}_team' | |
| goals_for = f'{team_type}_goals' | |
| goals_against = 'away_goals' if team_type == 'home' else 'home_goals' | |
| if team_col not in self.df.columns: | |
| continue | |
| # Team scored rate | |
| self.df[f'{team_type}_scored_rate_{window}'] = self.df.groupby(team_col)[goals_for].transform( | |
| lambda x: (x > 0).rolling(window, min_periods=1).mean() | |
| ) | |
| # Team conceded rate | |
| self.df[f'{team_type}_conceded_rate_{window}'] = self.df.groupby(team_col)[goals_against].transform( | |
| lambda x: (x > 0).rolling(window, min_periods=1).mean() | |
| ) | |
| # Clean sheet rate | |
| self.df[f'{team_type}_clean_sheet_rate_{window}'] = self.df.groupby(team_col)[goals_against].transform( | |
| lambda x: (x == 0).rolling(window, min_periods=1).mean() | |
| ) | |
| # Failed to score rate | |
| self.df[f'{team_type}_failed_to_score_rate_{window}'] = self.df.groupby(team_col)[goals_for].transform( | |
| lambda x: (x == 0).rolling(window, min_periods=1).mean() | |
| ) | |
| # BTTS involvement rate | |
| self.df[f'{team_type}_btts_rate_{window}'] = self.df.groupby(team_col)['btts'].transform( | |
| lambda x: x.rolling(window, min_periods=1).mean() | |
| ) | |
| self.features_created.extend([ | |
| f'{team_type}_scored_rate_{window}', | |
| f'{team_type}_conceded_rate_{window}', | |
| f'{team_type}_clean_sheet_rate_{window}', | |
| f'{team_type}_failed_to_score_rate_{window}', | |
| f'{team_type}_btts_rate_{window}' | |
| ]) | |
| # Combined BTTS probability features | |
| for window in [3, 5, 10]: | |
| if all(f'{t}_{r}_{window}' in self.df.columns | |
| for t in ['home', 'away'] | |
| for r in ['scored_rate', 'conceded_rate']): | |
| self.df[f'combined_btts_prob_{window}'] = ( | |
| self.df[f'home_scored_rate_{window}'] * self.df[f'away_scored_rate_{window}'] * | |
| self.df[f'home_conceded_rate_{window}'] * self.df[f'away_conceded_rate_{window}'] | |
| ) | |
| self.features_created.append(f'combined_btts_prob_{window}') | |
| def _create_over_under_features(self): | |
| """Create Over/Under specific features.""" | |
| if 'home_goals' not in self.df.columns: | |
| return | |
| self.df['total_goals'] = self.df['home_goals'] + self.df['away_goals'] | |
| # Create indicators for different thresholds | |
| thresholds = [0.5, 1.5, 2.5, 3.5, 4.5, 5.5] | |
| for threshold in thresholds: | |
| self.df[f'over_{str(threshold).replace(".", "_")}'] = (self.df['total_goals'] > threshold).astype(int) | |
| for window in self.ROLLING_WINDOWS: | |
| for team_type in ['home', 'away']: | |
| team_col = f'{team_type}_team' | |
| if team_col not in self.df.columns: | |
| continue | |
| # Total goals average | |
| self.df[f'{team_type}_total_goals_avg_{window}'] = self.df.groupby(team_col)['total_goals'].transform( | |
| lambda x: x.rolling(window, min_periods=1).mean() | |
| ) | |
| # Total goals variance | |
| self.df[f'{team_type}_total_goals_std_{window}'] = self.df.groupby(team_col)['total_goals'].transform( | |
| lambda x: x.rolling(window, min_periods=2).std() | |
| ) | |
| self.features_created.extend([ | |
| f'{team_type}_total_goals_avg_{window}', | |
| f'{team_type}_total_goals_std_{window}' | |
| ]) | |
| # Over rates for each threshold | |
| for threshold in [1.5, 2.5, 3.5]: | |
| col_name = f'over_{str(threshold).replace(".", "_")}' | |
| if col_name in self.df.columns: | |
| self.df[f'{team_type}_over_{str(threshold).replace(".", "_")}_rate_{window}'] = self.df.groupby(team_col)[col_name].transform( | |
| lambda x: x.rolling(window, min_periods=1).mean() | |
| ) | |
| self.features_created.append(f'{team_type}_over_{str(threshold).replace(".", "_")}_rate_{window}') | |
| # Combined over probability | |
| for window in [3, 5, 10]: | |
| if f'home_total_goals_avg_{window}' in self.df.columns and f'away_total_goals_avg_{window}' in self.df.columns: | |
| self.df[f'combined_total_goals_avg_{window}'] = ( | |
| self.df[f'home_total_goals_avg_{window}'] + self.df[f'away_total_goals_avg_{window}'] | |
| ) / 2 | |
| self.features_created.append(f'combined_total_goals_avg_{window}') | |
| def _create_htft_features(self): | |
| """Create HT/FT specific features.""" | |
| if 'home_goals_ht' not in self.df.columns: | |
| return | |
| # HT result | |
| self.df['ht_result'] = self.df.apply( | |
| lambda x: 'H' if x['home_goals_ht'] > x['away_goals_ht'] | |
| else ('A' if x['home_goals_ht'] < x['away_goals_ht'] else 'D'), | |
| axis=1 | |
| ) | |
| # Second half goals | |
| self.df['home_goals_2h'] = self.df['home_goals'] - self.df['home_goals_ht'] | |
| self.df['away_goals_2h'] = self.df['away_goals'] - self.df['away_goals_ht'] | |
| for window in [3, 5, 10]: | |
| for team_type in ['home', 'away']: | |
| team_col = f'{team_type}_team' | |
| if team_col not in self.df.columns: | |
| continue | |
| # First half goals average | |
| self.df[f'{team_type}_1h_goals_avg_{window}'] = self.df.groupby(team_col)[f'{team_type}_goals_ht'].transform( | |
| lambda x: x.rolling(window, min_periods=1).mean() | |
| ) | |
| # Second half goals average | |
| self.df[f'{team_type}_2h_goals_avg_{window}'] = self.df.groupby(team_col)[f'{team_type}_goals_2h'].transform( | |
| lambda x: x.rolling(window, min_periods=1).mean() | |
| ) | |
| self.features_created.extend([ | |
| f'{team_type}_1h_goals_avg_{window}', | |
| f'{team_type}_2h_goals_avg_{window}' | |
| ]) | |
| def _create_correct_score_features(self): | |
| """Create correct score prediction features.""" | |
| if 'home_goals' not in self.df.columns: | |
| return | |
| # Score string | |
| self.df['score'] = self.df['home_goals'].astype(str) + '-' + self.df['away_goals'].astype(str) | |
| # Common score frequencies | |
| common_scores = ['1-0', '0-0', '1-1', '2-1', '2-0', '0-1', '1-2', '0-2', '2-2', '3-1'] | |
| for score in common_scores: | |
| self.df[f'is_{score.replace("-", "_")}'] = (self.df['score'] == score).astype(int) | |
| def _create_h2h_features(self): | |
| """Create head-to-head features.""" | |
| if 'home_team' not in self.df.columns or 'match_date' not in self.df.columns: | |
| return | |
| h2h_stats = [] | |
| for idx, row in self.df.iterrows(): | |
| home = row['home_team'] | |
| away = row['away_team'] | |
| date = row['match_date'] | |
| # Previous encounters (last 10) | |
| prev = self.df[ | |
| (self.df['match_date'] < date) & | |
| ( | |
| ((self.df['home_team'] == home) & (self.df['away_team'] == away)) | | |
| ((self.df['home_team'] == away) & (self.df['away_team'] == home)) | |
| ) | |
| ].tail(10) | |
| if len(prev) > 0: | |
| home_wins = len(prev[ | |
| ((prev['home_team'] == home) & (prev['result'] == 'H')) | | |
| ((prev['away_team'] == home) & (prev['result'] == 'A')) | |
| ]) | |
| draws = len(prev[prev['result'] == 'D']) | |
| total = len(prev) | |
| home_goals = prev[prev['home_team'] == home]['home_goals'].sum() + \ | |
| prev[prev['away_team'] == home]['away_goals'].sum() | |
| away_goals = prev[prev['home_team'] == away]['home_goals'].sum() + \ | |
| prev[prev['away_team'] == away]['away_goals'].sum() | |
| h2h_stats.append({ | |
| 'h2h_home_win_rate': home_wins / total, | |
| 'h2h_draw_rate': draws / total, | |
| 'h2h_avg_home_goals': home_goals / total, | |
| 'h2h_avg_away_goals': away_goals / total, | |
| 'h2h_total_goals_avg': (home_goals + away_goals) / total, | |
| 'h2h_btts_rate': len(prev[(prev['home_goals'] > 0) & (prev['away_goals'] > 0)]) / total, | |
| 'h2h_matches': total | |
| }) | |
| else: | |
| h2h_stats.append({ | |
| 'h2h_home_win_rate': 0.33, | |
| 'h2h_draw_rate': 0.33, | |
| 'h2h_avg_home_goals': 1.3, | |
| 'h2h_avg_away_goals': 1.0, | |
| 'h2h_total_goals_avg': 2.3, | |
| 'h2h_btts_rate': 0.5, | |
| 'h2h_matches': 0 | |
| }) | |
| h2h_df = pd.DataFrame(h2h_stats) | |
| for col in h2h_df.columns: | |
| self.df[col] = h2h_df[col].values | |
| self.features_created.append(col) | |
| def _create_league_context_features(self): | |
| """Create league position and context features.""" | |
| if 'league_position_home' not in self.df.columns: | |
| return | |
| self.df['position_diff'] = self.df['league_position_home'] - self.df['league_position_away'] | |
| self.df['top_6_match'] = ((self.df['league_position_home'] <= 6) & (self.df['league_position_away'] <= 6)).astype(int) | |
| self.df['relegation_match'] = ((self.df['league_position_home'] >= 15) | (self.df['league_position_away'] >= 15)).astype(int) | |
| self.features_created.extend(['position_diff', 'top_6_match', 'relegation_match']) | |
| def _create_situational_features(self): | |
| """Create situational context features.""" | |
| pass # Placeholder for derby/importance data | |
| def _create_interaction_features(self): | |
| """Create interaction features between home and away.""" | |
| for window in [5, 10]: | |
| if f'home_attack_strength_{window}' in self.df.columns and f'away_defense_weakness_{window}' in self.df.columns: | |
| self.df[f'attack_vs_defense_{window}'] = ( | |
| self.df[f'home_attack_strength_{window}'] * self.df[f'away_defense_weakness_{window}'] | |
| ) | |
| self.df[f'defense_vs_attack_{window}'] = ( | |
| self.df[f'away_attack_strength_{window}'] * self.df[f'home_defense_weakness_{window}'] | |
| ) | |
| self.features_created.extend([ | |
| f'attack_vs_defense_{window}', | |
| f'defense_vs_attack_{window}' | |
| ]) | |
| if f'home_ppg_{window}' in self.df.columns and f'away_ppg_{window}' in self.df.columns: | |
| self.df[f'form_difference_{window}'] = ( | |
| self.df[f'home_ppg_{window}'] - self.df[f'away_ppg_{window}'] | |
| ) | |
| self.features_created.append(f'form_difference_{window}') | |
| if f'home_overall_rating_{window}' in self.df.columns and f'away_overall_rating_{window}' in self.df.columns: | |
| self.df[f'rating_difference_{window}'] = ( | |
| self.df[f'home_overall_rating_{window}'] - self.df[f'away_overall_rating_{window}'] | |
| ) | |
| self.features_created.append(f'rating_difference_{window}') | |
| def _create_ratio_features(self): | |
| """Create ratio-based features.""" | |
| for window in [5, 10]: | |
| if f'home_attack_strength_{window}' in self.df.columns and f'away_attack_strength_{window}' in self.df.columns: | |
| self.df[f'attack_ratio_{window}'] = ( | |
| self.df[f'home_attack_strength_{window}'] / | |
| self.df[f'away_attack_strength_{window}'].clip(lower=0.1) | |
| ) | |
| self.features_created.append(f'attack_ratio_{window}') | |
| if f'home_defense_weakness_{window}' in self.df.columns and f'away_defense_weakness_{window}' in self.df.columns: | |
| self.df[f'defense_ratio_{window}'] = ( | |
| self.df[f'away_defense_weakness_{window}'] / | |
| self.df[f'home_defense_weakness_{window}'].clip(lower=0.1) | |
| ) | |
| self.features_created.append(f'defense_ratio_{window}') | |
| def _create_elo_features(self): | |
| """Create Elo rating features.""" | |
| # Placeholder - would need Elo rating data | |
| pass | |
| def _create_poisson_features(self): | |
| """Create Poisson-based expected goal features.""" | |
| for window in [5, 10]: | |
| if f'home_goals_scored_avg_{window}' in self.df.columns and f'away_goals_conceded_avg_{window}' in self.df.columns: | |
| # Expected home goals | |
| self.df[f'poisson_home_xg_{window}'] = ( | |
| self.df[f'home_goals_scored_avg_{window}'] * | |
| self.df[f'away_goals_conceded_avg_{window}'].clip(lower=0.5) / 1.5 | |
| ) | |
| # Expected away goals | |
| self.df[f'poisson_away_xg_{window}'] = ( | |
| self.df[f'away_goals_scored_avg_{window}'] * | |
| self.df[f'home_goals_conceded_avg_{window}'].clip(lower=0.5) / 1.5 | |
| ) | |
| self.features_created.extend([ | |
| f'poisson_home_xg_{window}', | |
| f'poisson_away_xg_{window}' | |
| ]) | |
| def _create_streak_features(self): | |
| """Create winning/losing streak features.""" | |
| for team_type in ['home', 'away']: | |
| team_col = f'{team_type}_team' | |
| if team_col not in self.df.columns or 'result' not in self.df.columns: | |
| continue | |
| # Calculate streaks | |
| def calc_win_streak(results, team_type): | |
| streaks = [] | |
| streak = 0 | |
| win_result = 'H' if team_type == 'home' else 'A' | |
| for r in results: | |
| if r == win_result: | |
| streak += 1 | |
| else: | |
| streak = 0 | |
| streaks.append(streak) | |
| return streaks | |
| self.df[f'{team_type}_win_streak'] = self.df.groupby(team_col)['result'].transform( | |
| lambda x: calc_win_streak(x.tolist(), team_type) | |
| ) | |
| self.features_created.append(f'{team_type}_win_streak') | |
| def _create_consistency_features(self): | |
| """Create consistency/variance features.""" | |
| for window in [10, 20]: | |
| for team_type in ['home', 'away']: | |
| team_col = f'{team_type}_team' | |
| if team_col not in self.df.columns or f'{team_type}_points' not in self.df.columns: | |
| continue | |
| # Points consistency (coefficient of variation) | |
| mean_pts = self.df.groupby(team_col)[f'{team_type}_points'].transform( | |
| lambda x: x.rolling(window, min_periods=3).mean() | |
| ) | |
| std_pts = self.df.groupby(team_col)[f'{team_type}_points'].transform( | |
| lambda x: x.rolling(window, min_periods=3).std() | |
| ) | |
| self.df[f'{team_type}_consistency_{window}'] = 1 - (std_pts / mean_pts.clip(lower=0.1)) | |
| self.features_created.append(f'{team_type}_consistency_{window}') | |
| def _create_scoring_pattern_features(self): | |
| """Create scoring pattern features.""" | |
| if 'home_goals' not in self.df.columns: | |
| return | |
| # High scoring indicator | |
| self.df['high_scoring'] = (self.df['home_goals'] + self.df['away_goals'] >= 3).astype(int) | |
| # Low scoring indicator | |
| self.df['low_scoring'] = (self.df['home_goals'] + self.df['away_goals'] <= 1).astype(int) | |
| for window in [5, 10]: | |
| for team_type in ['home', 'away']: | |
| team_col = f'{team_type}_team' | |
| if team_col not in self.df.columns: | |
| continue | |
| self.df[f'{team_type}_high_scoring_rate_{window}'] = self.df.groupby(team_col)['high_scoring'].transform( | |
| lambda x: x.rolling(window, min_periods=1).mean() | |
| ) | |
| self.df[f'{team_type}_low_scoring_rate_{window}'] = self.df.groupby(team_col)['low_scoring'].transform( | |
| lambda x: x.rolling(window, min_periods=1).mean() | |
| ) | |
| self.features_created.extend([ | |
| f'{team_type}_high_scoring_rate_{window}', | |
| f'{team_type}_low_scoring_rate_{window}' | |
| ]) | |
| def get_feature_engineer(df: pd.DataFrame = None) -> AdvancedFeatureEngineer: | |
| """Get feature engineer instance.""" | |
| return AdvancedFeatureEngineer(df) | |
| def create_match_features(historical_df: pd.DataFrame) -> pd.DataFrame: | |
| """Create all features from historical data.""" | |
| engineer = AdvancedFeatureEngineer(historical_df) | |
| return engineer.create_all_features() | |
| # Alias for backward compatibility | |
| create_advanced_features = create_match_features | |