Spaces:
Runtime error
Runtime error
| """ | |
| Enhanced Feature Engineering Module - 600+ Features | |
| Generates comprehensive features across 20+ categories | |
| """ | |
| import pandas as pd | |
| import numpy as np | |
| from datetime import datetime, timedelta | |
| from typing import Dict, List, Optional, Tuple, Any | |
| from dataclasses import dataclass | |
| import logging | |
| from pathlib import Path | |
| from sklearn.cluster import KMeans | |
| from sklearn.preprocessing import StandardScaler | |
| import warnings | |
| warnings.filterwarnings('ignore') | |
| logging.basicConfig(level=logging.INFO) | |
| logger = logging.getLogger(__name__) | |
| class FeatureConfig: | |
| """Configuration for feature generation.""" | |
| rolling_windows: List[int] = None | |
| ewm_spans: List[int] = None | |
| goal_thresholds: List[float] = None | |
| include_advanced: bool = True | |
| include_embeddings: bool = False | |
| include_clusters: bool = True | |
| def __post_init__(self): | |
| self.rolling_windows = self.rolling_windows or [3, 5, 10, 20] | |
| self.ewm_spans = self.ewm_spans or [3, 5, 10] | |
| self.goal_thresholds = self.goal_thresholds or [0.5, 1.5, 2.5, 3.5, 4.5] | |
| class EnhancedFeatureGenerator: | |
| """ | |
| Comprehensive feature generator producing 600+ features across 20 categories. | |
| """ | |
| def __init__(self, data: pd.DataFrame, config: FeatureConfig = None): | |
| self.data = data.copy() | |
| self.config = config or FeatureConfig() | |
| self._prepare_data() | |
| self.feature_count = 0 | |
| def _prepare_data(self): | |
| """Prepare and validate data.""" | |
| if 'match_date' in self.data.columns: | |
| self.data['match_date'] = pd.to_datetime(self.data['match_date']) | |
| self.data = self.data.sort_values('match_date').reset_index(drop=True) | |
| # Add result column | |
| if 'home_goals' in self.data.columns and 'away_goals' in self.data.columns: | |
| self.data['result'] = np.where( | |
| self.data['home_goals'] > self.data['away_goals'], 'H', | |
| np.where(self.data['home_goals'] < self.data['away_goals'], 'A', 'D') | |
| ) | |
| self.data['total_goals'] = self.data['home_goals'] + self.data['away_goals'] | |
| self.data['goal_diff'] = self.data['home_goals'] - self.data['away_goals'] | |
| def generate_all_features(self) -> pd.DataFrame: | |
| """Generate all 600+ features.""" | |
| logger.info("Generating comprehensive feature set...") | |
| features = pd.DataFrame(index=self.data.index) | |
| # 1. Goal Features (80+) | |
| goal_feats = self._generate_goal_features() | |
| features = pd.concat([features, goal_feats], axis=1) | |
| logger.info(f" Goal features: {len(goal_feats.columns)}") | |
| # 2. Form Features (60+) | |
| form_feats = self._generate_form_features() | |
| features = pd.concat([features, form_feats], axis=1) | |
| logger.info(f" Form features: {len(form_feats.columns)}") | |
| # 3. Strength Ratings (40+) | |
| strength_feats = self._generate_strength_features() | |
| features = pd.concat([features, strength_feats], axis=1) | |
| logger.info(f" Strength features: {len(strength_feats.columns)}") | |
| # 4. Momentum Features (40+) | |
| momentum_feats = self._generate_momentum_features() | |
| features = pd.concat([features, momentum_feats], axis=1) | |
| logger.info(f" Momentum features: {len(momentum_feats.columns)}") | |
| # 5. BTTS Features (30+) | |
| btts_feats = self._generate_btts_features() | |
| features = pd.concat([features, btts_feats], axis=1) | |
| logger.info(f" BTTS features: {len(btts_feats.columns)}") | |
| # 6. Over/Under Features (50+) | |
| ou_feats = self._generate_over_under_features() | |
| features = pd.concat([features, ou_feats], axis=1) | |
| logger.info(f" Over/Under features: {len(ou_feats.columns)}") | |
| # 7. HT/FT Features (40+) | |
| htft_feats = self._generate_htft_features() | |
| features = pd.concat([features, htft_feats], axis=1) | |
| logger.info(f" HT/FT features: {len(htft_feats.columns)}") | |
| # 8. H2H Features (30+) | |
| h2h_feats = self._generate_h2h_features() | |
| features = pd.concat([features, h2h_feats], axis=1) | |
| logger.info(f" H2H features: {len(h2h_feats.columns)}") | |
| # 9. Timing Features (25+) | |
| timing_feats = self._generate_timing_features() | |
| features = pd.concat([features, timing_feats], axis=1) | |
| logger.info(f" Timing features: {len(timing_feats.columns)}") | |
| # 10. Rest Features (15+) | |
| rest_feats = self._generate_rest_features() | |
| features = pd.concat([features, rest_feats], axis=1) | |
| logger.info(f" Rest features: {len(rest_feats.columns)}") | |
| # 11. Streak Features (25+) | |
| streak_feats = self._generate_streak_features() | |
| features = pd.concat([features, streak_feats], axis=1) | |
| logger.info(f" Streak features: {len(streak_feats.columns)}") | |
| # 12. Consistency Features (20+) | |
| consistency_feats = self._generate_consistency_features() | |
| features = pd.concat([features, consistency_feats], axis=1) | |
| logger.info(f" Consistency features: {len(consistency_feats.columns)}") | |
| # 13. Interaction Features (60+) | |
| interaction_feats = self._generate_interaction_features(features) | |
| features = pd.concat([features, interaction_feats], axis=1) | |
| logger.info(f" Interaction features: {len(interaction_feats.columns)}") | |
| # 14. Ratio Features (30+) | |
| ratio_feats = self._generate_ratio_features(features) | |
| features = pd.concat([features, ratio_feats], axis=1) | |
| logger.info(f" Ratio features: {len(ratio_feats.columns)}") | |
| # 15. EWM Features (40+) | |
| ewm_feats = self._generate_ewm_features() | |
| features = pd.concat([features, ewm_feats], axis=1) | |
| logger.info(f" EWM features: {len(ewm_feats.columns)}") | |
| # 16. Percentile Features (20+) | |
| percentile_feats = self._generate_percentile_features(features) | |
| features = pd.concat([features, percentile_feats], axis=1) | |
| logger.info(f" Percentile features: {len(percentile_feats.columns)}") | |
| if self.config.include_clusters: | |
| # 17. Cluster Features (15+) | |
| cluster_feats = self._generate_cluster_features(features) | |
| features = pd.concat([features, cluster_feats], axis=1) | |
| logger.info(f" Cluster features: {len(cluster_feats.columns)}") | |
| self.feature_count = len(features.columns) | |
| logger.info(f"Total features generated: {self.feature_count}") | |
| return features | |
| def _generate_goal_features(self) -> pd.DataFrame: | |
| """Generate goal-related features across rolling windows.""" | |
| features = {} | |
| for team_type in ['home', 'away']: | |
| team_col = 'home_team' if team_type == 'home' else 'away_team' | |
| goals_scored_col = 'home_goals' if team_type == 'home' else 'away_goals' | |
| goals_conceded_col = 'away_goals' if team_type == 'home' else 'home_goals' | |
| for window in self.config.rolling_windows: | |
| prefix = f"{team_type}_L{window}" | |
| # Group by team and calculate rolling stats | |
| for col_to_roll, metric_name in [ | |
| (goals_scored_col, 'goals_scored'), | |
| (goals_conceded_col, 'goals_conceded'), | |
| ('total_goals', 'total_goals'), | |
| ]: | |
| if col_to_roll not in self.data.columns: | |
| continue | |
| # Mean, std, sum, min, max | |
| features[f"{prefix}_{metric_name}_mean"] = ( | |
| self.data.groupby(team_col)[col_to_roll] | |
| .transform(lambda x: x.shift(1).rolling(window, min_periods=1).mean()) | |
| ) | |
| features[f"{prefix}_{metric_name}_std"] = ( | |
| self.data.groupby(team_col)[col_to_roll] | |
| .transform(lambda x: x.shift(1).rolling(window, min_periods=1).std()) | |
| ) | |
| features[f"{prefix}_{metric_name}_sum"] = ( | |
| self.data.groupby(team_col)[col_to_roll] | |
| .transform(lambda x: x.shift(1).rolling(window, min_periods=1).sum()) | |
| ) | |
| features[f"{prefix}_{metric_name}_min"] = ( | |
| self.data.groupby(team_col)[col_to_roll] | |
| .transform(lambda x: x.shift(1).rolling(window, min_periods=1).min()) | |
| ) | |
| features[f"{prefix}_{metric_name}_max"] = ( | |
| self.data.groupby(team_col)[col_to_roll] | |
| .transform(lambda x: x.shift(1).rolling(window, min_periods=1).max()) | |
| ) | |
| return pd.DataFrame(features).fillna(0) | |
| def _generate_form_features(self) -> pd.DataFrame: | |
| """Generate form-related features.""" | |
| features = {} | |
| for team_type in ['home', 'away']: | |
| team_col = 'home_team' if team_type == 'home' else 'away_team' | |
| for window in self.config.rolling_windows: | |
| prefix = f"{team_type}_L{window}" | |
| # Points calculation | |
| def calc_points(group): | |
| points = [] | |
| for i, (idx, row) in enumerate(group.iterrows()): | |
| team = row[team_col] | |
| if row['home_team'] == team: | |
| pts = 3 if row['result'] == 'H' else (1 if row['result'] == 'D' else 0) | |
| else: | |
| pts = 3 if row['result'] == 'A' else (1 if row['result'] == 'D' else 0) | |
| points.append(pts) | |
| return pd.Series(points, index=group.index) | |
| # PPG (Points Per Game) | |
| features[f"{prefix}_ppg"] = ( | |
| self.data.groupby(team_col) | |
| .apply(lambda g: calc_points(g).shift(1).rolling(window, min_periods=1).mean()) | |
| .reset_index(level=0, drop=True).sort_index() | |
| ) | |
| # Win rate | |
| def calc_win_rate(group): | |
| wins = [] | |
| for i, (idx, row) in enumerate(group.iterrows()): | |
| team = row[team_col] | |
| if row['home_team'] == team: | |
| win = 1 if row['result'] == 'H' else 0 | |
| else: | |
| win = 1 if row['result'] == 'A' else 0 | |
| wins.append(win) | |
| return pd.Series(wins, index=group.index) | |
| features[f"{prefix}_win_rate"] = ( | |
| self.data.groupby(team_col) | |
| .apply(lambda g: calc_win_rate(g).shift(1).rolling(window, min_periods=1).mean()) | |
| .reset_index(level=0, drop=True).sort_index() | |
| ) | |
| # Draw rate | |
| features[f"{prefix}_draw_rate"] = ( | |
| self.data.groupby(team_col)['result'] | |
| .transform(lambda x: (x.shift(1) == 'D').rolling(window, min_periods=1).mean()) | |
| ) | |
| # Loss rate | |
| features[f"{prefix}_loss_rate"] = ( | |
| 1 - features.get(f"{prefix}_win_rate", 0) - features.get(f"{prefix}_draw_rate", 0) | |
| ) | |
| return pd.DataFrame(features).fillna(0) | |
| def _generate_strength_features(self) -> pd.DataFrame: | |
| """Generate attack/defense strength ratings.""" | |
| features = {} | |
| for team_type in ['home', 'away']: | |
| team_col = 'home_team' if team_type == 'home' else 'away_team' | |
| goals_for = 'home_goals' if team_type == 'home' else 'away_goals' | |
| goals_against = 'away_goals' if team_type == 'home' else 'home_goals' | |
| for window in self.config.rolling_windows: | |
| prefix = f"{team_type}_L{window}" | |
| # Attack strength (goals scored / league average) | |
| league_avg_goals = self.data[goals_for].mean() | |
| team_goals_mean = ( | |
| self.data.groupby(team_col)[goals_for] | |
| .transform(lambda x: x.shift(1).rolling(window, min_periods=1).mean()) | |
| ) | |
| features[f"{prefix}_attack_strength"] = team_goals_mean / max(league_avg_goals, 0.1) | |
| # Defense strength (goals conceded / league average) | |
| league_avg_conceded = self.data[goals_against].mean() | |
| team_conceded_mean = ( | |
| self.data.groupby(team_col)[goals_against] | |
| .transform(lambda x: x.shift(1).rolling(window, min_periods=1).mean()) | |
| ) | |
| features[f"{prefix}_defense_strength"] = team_conceded_mean / max(league_avg_conceded, 0.1) | |
| # Overall strength | |
| features[f"{prefix}_overall_strength"] = ( | |
| features[f"{prefix}_attack_strength"] - features[f"{prefix}_defense_strength"] | |
| ) | |
| # Goal difference | |
| features[f"{prefix}_goal_diff"] = team_goals_mean - team_conceded_mean | |
| return pd.DataFrame(features).fillna(0) | |
| def _generate_momentum_features(self) -> pd.DataFrame: | |
| """Generate momentum/trend features.""" | |
| features = {} | |
| for team_type in ['home', 'away']: | |
| team_col = 'home_team' if team_type == 'home' else 'away_team' | |
| goals_for = 'home_goals' if team_type == 'home' else 'away_goals' | |
| # Short vs long term momentum | |
| short_window = 3 | |
| long_window = 10 | |
| short_term = ( | |
| self.data.groupby(team_col)[goals_for] | |
| .transform(lambda x: x.shift(1).rolling(short_window, min_periods=1).mean()) | |
| ) | |
| long_term = ( | |
| self.data.groupby(team_col)[goals_for] | |
| .transform(lambda x: x.shift(1).rolling(long_window, min_periods=1).mean()) | |
| ) | |
| features[f"{team_type}_momentum_goals"] = short_term - long_term | |
| features[f"{team_type}_momentum_ratio"] = short_term / (long_term + 0.1) | |
| # Form trend | |
| features[f"{team_type}_form_trend"] = ( | |
| self.data.groupby(team_col)[goals_for] | |
| .transform(lambda x: x.shift(1).diff().rolling(3, min_periods=1).mean()) | |
| ) | |
| # Combined momentum | |
| features['momentum_diff'] = features['home_momentum_goals'] - features['away_momentum_goals'] | |
| features['momentum_product'] = features['home_momentum_goals'] * features['away_momentum_goals'] | |
| return pd.DataFrame(features).fillna(0) | |
| def _generate_btts_features(self) -> pd.DataFrame: | |
| """Generate Both Teams To Score features.""" | |
| features = {} | |
| # Add BTTS column | |
| if 'btts' not in self.data.columns: | |
| self.data['btts'] = ((self.data['home_goals'] > 0) & (self.data['away_goals'] > 0)).astype(int) | |
| for team_type in ['home', 'away']: | |
| team_col = 'home_team' if team_type == 'home' else 'away_team' | |
| goals_for = 'home_goals' if team_type == 'home' else 'away_goals' | |
| goals_against = 'away_goals' if team_type == 'home' else 'home_goals' | |
| for window in self.config.rolling_windows: | |
| prefix = f"{team_type}_L{window}" | |
| # BTTS rate | |
| features[f"{prefix}_btts_rate"] = ( | |
| self.data.groupby(team_col)['btts'] | |
| .transform(lambda x: x.shift(1).rolling(window, min_periods=1).mean()) | |
| ) | |
| # Scoring rate (scored in X% of games) | |
| scored = (self.data[goals_for] > 0).astype(int) | |
| features[f"{prefix}_scoring_rate"] = ( | |
| self.data.groupby(team_col)[goals_for] | |
| .transform(lambda x: (x.shift(1) > 0).rolling(window, min_periods=1).mean()) | |
| ) | |
| # Conceding rate | |
| features[f"{prefix}_conceding_rate"] = ( | |
| self.data.groupby(team_col)[goals_against] | |
| .transform(lambda x: (x.shift(1) > 0).rolling(window, min_periods=1).mean()) | |
| ) | |
| # Clean sheet rate | |
| features[f"{prefix}_clean_sheet_rate"] = 1 - features[f"{prefix}_conceding_rate"] | |
| # Failed to score rate | |
| features[f"{prefix}_failed_to_score_rate"] = 1 - features[f"{prefix}_scoring_rate"] | |
| # Combined BTTS probability | |
| features['btts_combined_prob'] = ( | |
| features.get('home_L5_scoring_rate', 0.5) * features.get('away_L5_scoring_rate', 0.5) * | |
| features.get('home_L5_conceding_rate', 0.5) * features.get('away_L5_conceding_rate', 0.5) | |
| ) | |
| return pd.DataFrame(features).fillna(0) | |
| def _generate_over_under_features(self) -> pd.DataFrame: | |
| """Generate Over/Under features for multiple thresholds.""" | |
| features = {} | |
| for threshold in self.config.goal_thresholds: | |
| threshold_name = str(threshold).replace('.', '_') | |
| over_col = f"over_{threshold_name}" | |
| self.data[over_col] = (self.data['total_goals'] > threshold).astype(int) | |
| for team_type in ['home', 'away']: | |
| team_col = 'home_team' if team_type == 'home' else 'away_team' | |
| for window in self.config.rolling_windows: | |
| prefix = f"{team_type}_L{window}" | |
| features[f"{prefix}_over{threshold_name}_rate"] = ( | |
| self.data.groupby(team_col)[over_col] | |
| .transform(lambda x: x.shift(1).rolling(window, min_periods=1).mean()) | |
| ) | |
| return pd.DataFrame(features).fillna(0) | |
| def _generate_htft_features(self) -> pd.DataFrame: | |
| """Generate Half-Time/Full-Time features.""" | |
| features = {} | |
| if 'home_goals_ht' not in self.data.columns: | |
| return pd.DataFrame() | |
| # Add HT result | |
| self.data['ht_result'] = np.where( | |
| self.data['home_goals_ht'] > self.data['away_goals_ht'], 'H', | |
| np.where(self.data['home_goals_ht'] < self.data['away_goals_ht'], 'A', 'D') | |
| ) | |
| for team_type in ['home', 'away']: | |
| team_col = 'home_team' if team_type == 'home' else 'away_team' | |
| ht_goals = 'home_goals_ht' if team_type == 'home' else 'away_goals_ht' | |
| for window in self.config.rolling_windows: | |
| prefix = f"{team_type}_L{window}" | |
| # HT goals | |
| features[f"{prefix}_ht_goals_mean"] = ( | |
| self.data.groupby(team_col)[ht_goals] | |
| .transform(lambda x: x.shift(1).rolling(window, min_periods=1).mean()) | |
| ) | |
| # HT lead rate | |
| if team_type == 'home': | |
| features[f"{prefix}_ht_lead_rate"] = ( | |
| self.data.groupby(team_col)['ht_result'] | |
| .transform(lambda x: (x.shift(1) == 'H').rolling(window, min_periods=1).mean()) | |
| ) | |
| else: | |
| features[f"{prefix}_ht_lead_rate"] = ( | |
| self.data.groupby(team_col)['ht_result'] | |
| .transform(lambda x: (x.shift(1) == 'A').rolling(window, min_periods=1).mean()) | |
| ) | |
| return pd.DataFrame(features).fillna(0) | |
| def _generate_h2h_features(self) -> pd.DataFrame: | |
| """Generate Head-to-Head features.""" | |
| features = {} | |
| for idx, row in self.data.iterrows(): | |
| home_team = row.get('home_team', '') | |
| away_team = row.get('away_team', '') | |
| match_date = row.get('match_date', datetime.now()) | |
| # Get historical H2H | |
| h2h = self.data[ | |
| ( | |
| ((self.data['home_team'] == home_team) & (self.data['away_team'] == away_team)) | | |
| ((self.data['home_team'] == away_team) & (self.data['away_team'] == home_team)) | |
| ) & | |
| (self.data['match_date'] < match_date) | |
| ].tail(10) | |
| n_h2h = len(h2h) | |
| features.setdefault('h2h_matches', []).append(n_h2h) | |
| if n_h2h > 0: | |
| # Home team wins in H2H | |
| home_wins = len(h2h[ | |
| ((h2h['home_team'] == home_team) & (h2h['result'] == 'H')) | | |
| ((h2h['away_team'] == home_team) & (h2h['result'] == 'A')) | |
| ]) | |
| away_wins = len(h2h[ | |
| ((h2h['home_team'] == away_team) & (h2h['result'] == 'H')) | | |
| ((h2h['away_team'] == away_team) & (h2h['result'] == 'A')) | |
| ]) | |
| draws = n_h2h - home_wins - away_wins | |
| features.setdefault('h2h_home_win_rate', []).append(home_wins / n_h2h) | |
| features.setdefault('h2h_away_win_rate', []).append(away_wins / n_h2h) | |
| features.setdefault('h2h_draw_rate', []).append(draws / n_h2h) | |
| features.setdefault('h2h_total_goals_avg', []).append(h2h['total_goals'].mean()) | |
| features.setdefault('h2h_btts_rate', []).append(h2h['btts'].mean() if 'btts' in h2h.columns else 0.5) | |
| else: | |
| features.setdefault('h2h_home_win_rate', []).append(0.33) | |
| features.setdefault('h2h_away_win_rate', []).append(0.33) | |
| features.setdefault('h2h_draw_rate', []).append(0.34) | |
| features.setdefault('h2h_total_goals_avg', []).append(2.5) | |
| features.setdefault('h2h_btts_rate', []).append(0.5) | |
| return pd.DataFrame(features) | |
| def _generate_timing_features(self) -> pd.DataFrame: | |
| """Generate time-based features.""" | |
| features = {} | |
| if 'match_date' not in self.data.columns: | |
| return pd.DataFrame() | |
| # Day of week (0 = Monday) | |
| features['day_of_week'] = self.data['match_date'].dt.dayofweek | |
| features['is_weekend'] = features['day_of_week'].isin([5, 6]).astype(int) | |
| # Month | |
| features['month'] = self.data['match_date'].dt.month | |
| features['is_early_season'] = features['month'].isin([8, 9, 10]).astype(int) | |
| features['is_late_season'] = features['month'].isin([4, 5]).astype(int) | |
| # Season progress (0-1) | |
| if 'season' in self.data.columns: | |
| features['season_progress'] = ( | |
| self.data.groupby('season')['match_date'] | |
| .transform(lambda x: (x - x.min()).dt.days / max((x.max() - x.min()).days, 1)) | |
| ) | |
| return pd.DataFrame(features).fillna(0) | |
| def _generate_rest_features(self) -> pd.DataFrame: | |
| """Generate rest days features.""" | |
| features = {} | |
| if 'match_date' not in self.data.columns: | |
| return pd.DataFrame() | |
| for team_type in ['home', 'away']: | |
| team_col = 'home_team' if team_type == 'home' else 'away_team' | |
| # Days since last match | |
| rest_days = [] | |
| for idx, row in self.data.iterrows(): | |
| team = row[team_col] | |
| match_date = row['match_date'] | |
| prev_matches = self.data[ | |
| ((self.data['home_team'] == team) | (self.data['away_team'] == team)) & | |
| (self.data['match_date'] < match_date) | |
| ] | |
| if len(prev_matches) > 0: | |
| last_match = prev_matches['match_date'].max() | |
| days = (match_date - last_match).days | |
| else: | |
| days = 7 # Default | |
| rest_days.append(days) | |
| features[f"{team_type}_rest_days"] = rest_days | |
| features[f"{team_type}_is_short_rest"] = [1 if d <= 3 else 0 for d in rest_days] | |
| features[f"{team_type}_is_long_rest"] = [1 if d >= 7 else 0 for d in rest_days] | |
| # Rest advantage | |
| features['rest_diff'] = [ | |
| h - a for h, a in zip(features['home_rest_days'], features['away_rest_days']) | |
| ] | |
| return pd.DataFrame(features) | |
| def _generate_streak_features(self) -> pd.DataFrame: | |
| """Generate streak features.""" | |
| features = {} | |
| for team_type in ['home', 'away']: | |
| team_col = 'home_team' if team_type == 'home' else 'away_team' | |
| win_streak = [] | |
| unbeaten_streak = [] | |
| scoring_streak = [] | |
| clean_sheet_streak = [] | |
| for idx, row in self.data.iterrows(): | |
| team = row[team_col] | |
| match_date = row.get('match_date', datetime.now()) | |
| # Get recent matches | |
| recent = self.data[ | |
| ((self.data['home_team'] == team) | (self.data['away_team'] == team)) & | |
| (self.data['match_date'] < match_date) | |
| ].tail(10) | |
| ws = 0 | |
| ubs = 0 | |
| ss = 0 | |
| css = 0 | |
| for _, m in recent.iloc[::-1].iterrows(): | |
| is_home = m['home_team'] == team | |
| result = m['result'] | |
| goals_for = m['home_goals'] if is_home else m['away_goals'] | |
| goals_against = m['away_goals'] if is_home else m['home_goals'] | |
| won = (is_home and result == 'H') or (not is_home and result == 'A') | |
| drawn = result == 'D' | |
| if won and ws == len(recent.iloc[::-1].head(len(recent) - recent.iloc[::-1].index.get_loc(_))): | |
| ws += 1 | |
| if (won or drawn): | |
| if ubs == len(recent.iloc[::-1].head(len(recent) - recent.iloc[::-1].index.get_loc(_))): | |
| ubs += 1 | |
| if goals_for > 0: | |
| if ss == len(recent.iloc[::-1].head(len(recent) - recent.iloc[::-1].index.get_loc(_))): | |
| ss += 1 | |
| if goals_against == 0: | |
| if css == len(recent.iloc[::-1].head(len(recent) - recent.iloc[::-1].index.get_loc(_))): | |
| css += 1 | |
| win_streak.append(min(ws, 10)) | |
| unbeaten_streak.append(min(ubs, 10)) | |
| scoring_streak.append(min(ss, 10)) | |
| clean_sheet_streak.append(min(css, 5)) | |
| features[f"{team_type}_win_streak"] = win_streak | |
| features[f"{team_type}_unbeaten_streak"] = unbeaten_streak | |
| features[f"{team_type}_scoring_streak"] = scoring_streak | |
| features[f"{team_type}_clean_sheet_streak"] = clean_sheet_streak | |
| return pd.DataFrame(features) | |
| def _generate_consistency_features(self, window: int = 10) -> pd.DataFrame: | |
| """Generate consistency/variance features.""" | |
| features = {} | |
| for team_type in ['home', 'away']: | |
| team_col = 'home_team' if team_type == 'home' else 'away_team' | |
| goals_col = 'home_goals' if team_type == 'home' else 'away_goals' | |
| # Goals std | |
| features[f"{team_type}_goals_std"] = ( | |
| self.data.groupby(team_col)[goals_col] | |
| .transform(lambda x: x.shift(1).rolling(window, min_periods=3).std()) | |
| ) | |
| # Coefficient of variation | |
| goals_mean = ( | |
| self.data.groupby(team_col)[goals_col] | |
| .transform(lambda x: x.shift(1).rolling(window, min_periods=3).mean()) | |
| ) | |
| features[f"{team_type}_goals_cv"] = features[f"{team_type}_goals_std"] / (goals_mean + 0.1) | |
| # Results consistency (low variance = consistent) | |
| features[f"{team_type}_consistency_score"] = 1 / (features[f"{team_type}_goals_cv"] + 0.1) | |
| return pd.DataFrame(features).fillna(0) | |
| def _generate_interaction_features(self, base_features: pd.DataFrame) -> pd.DataFrame: | |
| """Generate interaction features between home and away metrics.""" | |
| features = {} | |
| # Attack vs Defense | |
| for window in [5, 10]: | |
| home_attack = base_features.get(f'home_L{window}_attack_strength', pd.Series(1)) | |
| away_defense = base_features.get(f'away_L{window}_defense_strength', pd.Series(1)) | |
| away_attack = base_features.get(f'away_L{window}_attack_strength', pd.Series(1)) | |
| home_defense = base_features.get(f'home_L{window}_defense_strength', pd.Series(1)) | |
| features[f'home_attack_vs_away_defense_L{window}'] = home_attack / (away_defense + 0.1) | |
| features[f'away_attack_vs_home_defense_L{window}'] = away_attack / (home_defense + 0.1) | |
| features[f'attack_diff_L{window}'] = home_attack - away_attack | |
| features[f'defense_diff_L{window}'] = home_defense - away_defense | |
| features[f'strength_product_L{window}'] = home_attack * away_attack | |
| # Form interactions | |
| for window in [5, 10]: | |
| home_form = base_features.get(f'home_L{window}_ppg', pd.Series(1.5)) | |
| away_form = base_features.get(f'away_L{window}_ppg', pd.Series(1.5)) | |
| features[f'form_ratio_L{window}'] = home_form / (away_form + 0.1) | |
| features[f'form_diff_L{window}'] = home_form - away_form | |
| features[f'form_product_L{window}'] = home_form * away_form | |
| # Goal expectancy | |
| home_goals_mean = base_features.get('home_L5_goals_scored_mean', pd.Series(1.5)) | |
| away_goals_mean = base_features.get('away_L5_goals_scored_mean', pd.Series(1.5)) | |
| features['expected_total_goals'] = home_goals_mean + away_goals_mean | |
| features['expected_goal_diff'] = home_goals_mean - away_goals_mean | |
| features['goal_expectancy_product'] = home_goals_mean * away_goals_mean | |
| return pd.DataFrame(features).fillna(0) | |
| def _generate_ratio_features(self, base_features: pd.DataFrame) -> pd.DataFrame: | |
| """Generate ratio-based features.""" | |
| features = {} | |
| # Win rate ratios | |
| home_win = base_features.get('home_L5_win_rate', pd.Series(0.33)) | |
| away_win = base_features.get('away_L5_win_rate', pd.Series(0.33)) | |
| features['win_rate_ratio'] = home_win / (away_win + 0.01) | |
| features['win_rate_diff'] = home_win - away_win | |
| # Goals ratios | |
| for window in [5, 10]: | |
| home_goals = base_features.get(f'home_L{window}_goals_scored_mean', pd.Series(1.5)) | |
| away_goals = base_features.get(f'away_L{window}_goals_scored_mean', pd.Series(1.5)) | |
| home_conceded = base_features.get(f'home_L{window}_goals_conceded_mean', pd.Series(1.0)) | |
| away_conceded = base_features.get(f'away_L{window}_goals_conceded_mean', pd.Series(1.0)) | |
| features[f'goals_for_ratio_L{window}'] = home_goals / (away_goals + 0.1) | |
| features[f'goals_against_ratio_L{window}'] = home_conceded / (away_conceded + 0.1) | |
| # Attack/Defense ratios for each team | |
| features[f'home_attack_defense_ratio_L{window}'] = home_goals / (home_conceded + 0.1) | |
| features[f'away_attack_defense_ratio_L{window}'] = away_goals / (away_conceded + 0.1) | |
| return pd.DataFrame(features).fillna(0) | |
| def _generate_ewm_features(self) -> pd.DataFrame: | |
| """Generate exponentially weighted moving average features.""" | |
| features = {} | |
| for team_type in ['home', 'away']: | |
| team_col = 'home_team' if team_type == 'home' else 'away_team' | |
| goals_col = 'home_goals' if team_type == 'home' else 'away_goals' | |
| for span in self.config.ewm_spans: | |
| prefix = f"{team_type}_ewm{span}" | |
| features[f'{prefix}_goals'] = ( | |
| self.data.groupby(team_col)[goals_col] | |
| .transform(lambda x: x.shift(1).ewm(span=span, min_periods=1).mean()) | |
| ) | |
| features[f'{prefix}_total_goals'] = ( | |
| self.data.groupby(team_col)['total_goals'] | |
| .transform(lambda x: x.shift(1).ewm(span=span, min_periods=1).mean()) | |
| ) | |
| return pd.DataFrame(features).fillna(0) | |
| def _generate_percentile_features(self, base_features: pd.DataFrame) -> pd.DataFrame: | |
| """Generate league percentile features.""" | |
| features = {} | |
| key_metrics = [ | |
| 'home_L5_goals_scored_mean', 'away_L5_goals_scored_mean', | |
| 'home_L5_attack_strength', 'away_L5_attack_strength', | |
| 'home_L5_ppg', 'away_L5_ppg' | |
| ] | |
| for metric in key_metrics: | |
| if metric in base_features.columns: | |
| features[f'{metric}_percentile'] = ( | |
| base_features[metric].rank(pct=True) | |
| ) | |
| return pd.DataFrame(features).fillna(0.5) | |
| def _generate_cluster_features(self, base_features: pd.DataFrame) -> pd.DataFrame: | |
| """Generate team style clustering features.""" | |
| features = {} | |
| try: | |
| # Select features for clustering | |
| cluster_cols = [ | |
| col for col in base_features.columns | |
| if 'L5' in col and ('attack' in col or 'defense' in col or 'goals' in col) | |
| ][:10] # Limit to 10 features | |
| if len(cluster_cols) < 3: | |
| return pd.DataFrame() | |
| X = base_features[cluster_cols].fillna(0) | |
| # Standardize | |
| scaler = StandardScaler() | |
| X_scaled = scaler.fit_transform(X) | |
| # K-Means clustering | |
| n_clusters = 5 | |
| kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10) | |
| clusters = kmeans.fit_predict(X_scaled) | |
| # One-hot encode clusters | |
| for i in range(n_clusters): | |
| features[f'style_cluster_{i}'] = (clusters == i).astype(int) | |
| features['cluster_id'] = clusters | |
| except Exception as e: | |
| logger.warning(f"Clustering failed: {e}") | |
| return pd.DataFrame(features) | |
| # Test | |
| if __name__ == "__main__": | |
| print("=" * 60) | |
| print("ENHANCED FEATURE ENGINEERING - TEST") | |
| print("=" * 60) | |
| # Create sample data | |
| np.random.seed(42) | |
| n_matches = 100 | |
| teams = ['Team A', 'Team B', 'Team C', 'Team D', 'Team E'] | |
| sample_data = pd.DataFrame({ | |
| 'match_date': pd.date_range('2024-01-01', periods=n_matches, freq='3D'), | |
| 'home_team': np.random.choice(teams, n_matches), | |
| 'away_team': np.random.choice(teams, n_matches), | |
| 'home_goals': np.random.poisson(1.5, n_matches), | |
| 'away_goals': np.random.poisson(1.2, n_matches), | |
| 'home_goals_ht': np.random.poisson(0.7, n_matches), | |
| 'away_goals_ht': np.random.poisson(0.6, n_matches), | |
| 'season': '2024' | |
| }) | |
| # Generate features | |
| generator = EnhancedFeatureGenerator(sample_data) | |
| features = generator.generate_all_features() | |
| print(f"\n✅ Generated {len(features.columns)} features!") | |
| print(f"\nSample columns: {list(features.columns)[:20]}") | |