Spaces:
Runtime error
Runtime error
| """ | |
| Advanced Feature Engineering Module | |
| Generates roughly 100 features per match (101 with the current feature groups) for improved prediction accuracy: | |
| - Core statistics (shots, corners, cards) | |
| - Form features with time decay | |
| - Head-to-head history | |
| - xG-based features | |
| - Market/odds features | |
| - Contextual features | |
| """ | |
| import pandas as pd | |
| import numpy as np | |
| from pathlib import Path | |
| from typing import Dict, List, Optional, Tuple | |
| from datetime import datetime, timedelta | |
| import logging | |
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Base paths
# DATA_DIR is the "data" directory two levels above the directory that
# contains this file.
DATA_DIR = Path(__file__).parent.parent.parent / "data"
class AdvancedFeatureEngine:
    """Build a flat numeric feature vector for a football match.

    The vector concatenates, in this order: home-team statistics, away-team
    statistics, home rolling form, away rolling form, head-to-head history,
    odds-derived features, contextual (calendar) features and a handful of
    derived differences. With the current groups that is 101 features.

    Historical data may use either snake_case column names (``home_team``,
    ``home_goals``, ``result`` ...) or the short upper-case names
    (``HomeTeam``, ``FTHG``, ``FTR`` ...); the snake_case name wins when both
    are present. Without historical data every data-driven feature falls back
    to a fixed default.
    """

    _HOME_TEAM_COLUMNS = ('home_team', 'HomeTeam')
    _AWAY_TEAM_COLUMNS = ('away_team', 'AwayTeam')
    _RESULT_COLUMNS = ('result', 'FTR')
    _HOME_GOAL_COLUMNS = ('home_goals', 'FTHG')
    _AWAY_GOAL_COLUMNS = ('away_goals', 'FTAG')

    # (stat key, venue whose matches are averaged, candidate columns, default)
    _MEAN_STATS = (
        # Goals
        ('goals_scored_home', 'home', ('home_goals', 'FTHG'), 1.5),
        ('goals_conceded_home', 'home', ('away_goals', 'FTAG'), 1.2),
        ('goals_scored_away', 'away', ('away_goals', 'FTAG'), 1.1),
        ('goals_conceded_away', 'away', ('home_goals', 'FTHG'), 1.4),
        # Shots
        ('shots_home', 'home', ('home_shots', 'HS'), 12),
        ('shots_away', 'away', ('away_shots', 'AS'), 10),
        ('shots_target_home', 'home', ('home_shots_target', 'HST'), 4),
        ('shots_target_away', 'away', ('away_shots_target', 'AST'), 3),
        # Corners
        ('corners_home', 'home', ('home_corners', 'HC'), 5),
        ('corners_away', 'away', ('away_corners', 'AC'), 4),
        # Cards
        ('yellows_home', 'home', ('home_yellows', 'HY'), 1.5),
        ('yellows_away', 'away', ('away_yellows', 'AY'), 1.7),
        ('reds_home', 'home', ('home_reds', 'HR'), 0.05),
        ('reds_away', 'away', ('away_reds', 'AR'), 0.05),
        # Fouls
        ('fouls_home', 'home', ('home_fouls', 'HF'), 11),
        ('fouls_away', 'away', ('away_fouls', 'AF'), 12),
    )
    # xG (if available); emitted after the match counts and win rates.
    _XG_STATS = (
        ('xg_home', 'home', ('home_xg',), 0),
        ('xg_away', 'away', ('away_xg',), 0),
    )

    def __init__(self, historical_data: Optional[pd.DataFrame] = None):
        """Store the historical matches and pre-compute per-team statistics.

        Args:
            historical_data: One row per past match, or ``None`` to run on
                defaults only.
        """
        self.historical_data = historical_data
        self.team_stats_cache = {}
        self.h2h_cache = {}
        if historical_data is not None:
            self._build_caches()

    # ------------------------------------------------------------------
    # Small private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _first_present(available, candidates) -> Optional[str]:
        """Return the first name in *candidates* found in *available*, else None."""
        for name in candidates:
            if name in available:
                return name
        return None

    def _resolve_columns(self, df: pd.DataFrame) -> Tuple[Optional[str], Optional[str], Optional[str]]:
        """Return the (home team, away team, result) column names present in *df*."""
        columns = df.columns
        return (
            self._first_present(columns, self._HOME_TEAM_COLUMNS),
            self._first_present(columns, self._AWAY_TEAM_COLUMNS),
            self._first_present(columns, self._RESULT_COLUMNS),
        )

    @staticmethod
    def _team_mask(df: pd.DataFrame, column: Optional[str], team_lower: str) -> pd.Series:
        """Boolean mask, aligned to ``df.index``, of rows where *column* names the team."""
        if column is None:
            # Aligned to df.index so it can be OR-ed with a real mask safely.
            return pd.Series(False, index=df.index)
        return df[column].str.lower() == team_lower

    @staticmethod
    def _mean_or_default(matches: pd.DataFrame, candidates, default: float) -> float:
        """Mean of the first candidate column present, or *default* when unavailable.

        The default is used when no candidate column exists or when the mean
        is NaN (no rows / no numeric values). A genuine mean of 0 is kept.
        """
        for name in candidates:
            if name in matches.columns:
                value = pd.to_numeric(matches[name], errors='coerce').mean()
                return float(default) if pd.isna(value) else float(value)
        return float(default)

    @staticmethod
    def _match_number(match: pd.Series, candidates) -> float:
        """Numeric value of the first candidate field present in *match*; 0 if missing/NaN."""
        for name in candidates:
            if name in match.index:
                value = match[name]
                return 0.0 if pd.isna(value) else float(value)
        return 0.0

    @staticmethod
    def _to_float(value) -> float:
        """Coerce a feature value to float; non-numeric values and NaN become 0.0."""
        if isinstance(value, (int, float, np.integer, np.floating)):
            number = float(value)
            if not np.isnan(number):
                return number
        return 0.0

    # ------------------------------------------------------------------
    # Team statistics
    # ------------------------------------------------------------------
    def _build_caches(self) -> None:
        """Build the team statistics cache from historical data.

        Every string value found in a home-team or away-team column gets an
        entry keyed by its lower-cased name, so teams that only ever appear
        as the away side are cached too.
        """
        df = self.historical_data
        if df is None or df.empty:
            return

        teams = set()
        for column in self._HOME_TEAM_COLUMNS + self._AWAY_TEAM_COLUMNS:
            if column in df.columns:
                teams.update(name for name in df[column].unique() if isinstance(name, str))

        for team in teams:
            self.team_stats_cache[team.lower()] = self._calculate_team_stats(team)

        logging.getLogger(__name__).info("Built cache for %d teams", len(self.team_stats_cache))

    def _team_stats(self, team: str) -> Dict:
        """Return cached statistics for *team*, computing them only on a cache miss."""
        cached = self.team_stats_cache.get(team.lower())
        if cached is not None:
            return cached
        return self._calculate_team_stats(team)

    def _calculate_team_stats(self, team: str) -> Dict:
        """Calculate historical statistics for a team.

        Team names are matched case-insensitively. When there is no
        historical data, or the team columns are absent, every statistic
        takes its default value.
        """
        df = self.historical_data
        home_matches = pd.DataFrame()
        away_matches = pd.DataFrame()
        if df is not None:
            team_lower = team.lower()
            home_col, away_col, _ = self._resolve_columns(df)
            if home_col is not None:
                home_matches = df[self._team_mask(df, home_col, team_lower)]
            if away_col is not None:
                away_matches = df[self._team_mask(df, away_col, team_lower)]
        return self._stats_from_matches(home_matches, away_matches)

    def _stats_from_matches(self, home_matches: pd.DataFrame,
                            away_matches: pd.DataFrame) -> Dict:
        """Aggregate a team's home and away match frames into the statistics dict."""
        frames = {'home': home_matches, 'away': away_matches}
        stats = {
            key: self._mean_or_default(frames[venue], columns, default)
            for key, venue, columns, default in self._MEAN_STATS
        }
        # Match counts
        stats['home_matches'] = len(home_matches)
        stats['away_matches'] = len(away_matches)
        stats['total_matches'] = len(home_matches) + len(away_matches)
        # Win rates
        stats['home_win_rate'] = self._calculate_win_rate(home_matches, 'home')
        stats['away_win_rate'] = self._calculate_win_rate(away_matches, 'away')
        # xG (if available)
        for key, venue, columns, default in self._XG_STATS:
            stats[key] = self._mean_or_default(frames[venue], columns, default)
        return stats

    def _calculate_win_rate(self, matches: pd.DataFrame, team_type: str) -> float:
        """Calculate win rate from matches.

        Args:
            matches: Matches the team played at one venue.
            team_type: ``'home'`` counts ``'H'`` results as wins; any other
                value counts ``'A'`` results.

        Returns:
            Fraction of matches won, or 0.33 when there are no matches or no
            result column.
        """
        if matches.empty:
            return 0.33
        result_col = self._first_present(matches.columns, self._RESULT_COLUMNS)
        if result_col is None:
            return 0.33
        winning_code = 'H' if team_type == 'home' else 'A'
        return float((matches[result_col] == winning_code).sum() / len(matches))

    # ------------------------------------------------------------------
    # Form
    # ------------------------------------------------------------------
    def rolling_form(self, team: str, n_matches: int = 5,
                     decay: float = 0.9) -> Dict[str, float]:
        """Calculate rolling form with exponential decay.

        The i-th selected match (0-based) is weighted ``decay ** i``. Points
        and goals are weighted averages; wins, draws and losses are plain
        counts of match outcomes, independent of the weights.

        Args:
            team: Team name (case-insensitive).
            n_matches: Maximum number of matches to include.
            decay: Per-match weight multiplier.

        Returns:
            Dict of ``form_*`` features; defaults when no data or no matches.
        """
        df = self.historical_data
        if df is None:
            return self._default_form()

        team_lower = team.lower()
        home_col, away_col, result_col = self._resolve_columns(df)
        involved = (self._team_mask(df, home_col, team_lower)
                    | self._team_mask(df, away_col, team_lower))
        # NOTE(review): head() takes the first rows in stored order, so the
        # data presumably needs to be sorted newest-first for these to be the
        # most recent matches - confirm against the data pipeline.
        team_matches = df[involved].head(n_matches)
        if team_matches.empty:
            return self._default_form()

        total_weight = 0.0
        weighted_points = 0.0
        weighted_for = 0.0
        weighted_against = 0.0
        wins = draws = losses = 0

        for i, (_, match) in enumerate(team_matches.iterrows()):
            weight = decay ** i
            total_weight += weight

            is_home = home_col is not None and str(match[home_col]).lower() == team_lower
            # A missing result column is treated as a draw.
            result = match[result_col] if result_col is not None else 'D'

            if result == ('H' if is_home else 'A'):
                wins += 1
                weighted_points += 3 * weight
            elif result == 'D':
                draws += 1
                weighted_points += weight
            else:
                losses += 1

            home_goals = self._match_number(match, self._HOME_GOAL_COLUMNS)
            away_goals = self._match_number(match, self._AWAY_GOAL_COLUMNS)
            scored, conceded = (home_goals, away_goals) if is_home else (away_goals, home_goals)
            weighted_for += scored * weight
            weighted_against += conceded * weight

        usable = total_weight > 0  # can be False for negative decay values
        return {
            'form_points': weighted_points / total_weight if usable else 1.0,
            'form_goals_scored': weighted_for / total_weight if usable else 1.0,
            'form_goals_conceded': weighted_against / total_weight if usable else 1.0,
            'form_goal_diff': (weighted_for - weighted_against) / total_weight if usable else 0,
            'form_matches': len(team_matches),
            'form_wins': wins,
            'form_draws': draws,
            'form_losses': losses,
        }

    def _default_form(self) -> Dict[str, float]:
        """Form features used when no matches are available for a team."""
        return {
            'form_points': 1.5, 'form_goals_scored': 1.2, 'form_goals_conceded': 1.2,
            'form_goal_diff': 0, 'form_matches': 0, 'form_wins': 0,
            'form_draws': 0, 'form_losses': 0
        }

    # ------------------------------------------------------------------
    # Head-to-head
    # ------------------------------------------------------------------
    def head_to_head(self, home_team: str, away_team: str,
                     n_matches: int = 5) -> Dict[str, float]:
        """Get head-to-head statistics.

        Matches between the two teams are considered regardless of venue;
        "home" and "away" in the returned keys refer to the *home_team* and
        *away_team* arguments, not to the venue of each past match.

        Args:
            home_team: Team treated as the home side (case-insensitive).
            away_team: Team treated as the away side (case-insensitive).
            n_matches: Maximum number of meetings to include.

        Returns:
            Dict of ``h2h_*`` features; defaults when there is no data, the
            team columns are missing, or the teams never met.
        """
        df = self.historical_data
        if df is None:
            return self._default_h2h()

        home_lower = home_team.lower()
        away_lower = away_team.lower()
        # n_matches is part of the key so different windows do not collide.
        cache_key = (home_lower, away_lower, n_matches)
        cached = self.h2h_cache.get(cache_key)
        if cached is not None:
            return dict(cached)

        home_col, away_col, result_col = self._resolve_columns(df)
        if home_col is None or away_col is None:
            return self._default_h2h()

        home_names = df[home_col].str.lower()
        away_names = df[away_col].str.lower()
        same_venue = (home_names == home_lower) & (away_names == away_lower)
        swapped_venue = (home_names == away_lower) & (away_names == home_lower)
        h2h_matches = df[same_venue | swapped_venue].head(n_matches)
        if h2h_matches.empty:
            return self._default_h2h()

        home_wins = away_wins = draws = 0
        home_goals = 0.0
        away_goals = 0.0
        for _, match in h2h_matches.iterrows():
            result = match[result_col] if result_col is not None else 'D'
            hg = self._match_number(match, self._HOME_GOAL_COLUMNS)
            ag = self._match_number(match, self._AWAY_GOAL_COLUMNS)

            if str(match[home_col]).lower() == home_lower:
                home_goals += hg
                away_goals += ag
                win_code, loss_code = 'H', 'A'
            else:
                # Venue was swapped in this meeting: flip goals and result codes.
                home_goals += ag
                away_goals += hg
                win_code, loss_code = 'A', 'H'

            if result == win_code:
                home_wins += 1
            elif result == loss_code:
                away_wins += 1
            else:
                draws += 1

        n = len(h2h_matches)  # >= 1 here: the empty case returned above
        h2h_stats = {
            'h2h_matches': n,
            'h2h_home_wins': home_wins,
            'h2h_away_wins': away_wins,
            'h2h_draws': draws,
            'h2h_home_win_rate': home_wins / n,
            'h2h_away_win_rate': away_wins / n,
            'h2h_draw_rate': draws / n,
            'h2h_home_goals_avg': home_goals / n,
            'h2h_away_goals_avg': away_goals / n,
            'h2h_total_goals_avg': (home_goals + away_goals) / n,
        }
        self.h2h_cache[cache_key] = h2h_stats
        return dict(h2h_stats)

    def _default_h2h(self) -> Dict[str, float]:
        """Head-to-head features used when no meetings are available."""
        return {
            'h2h_matches': 0, 'h2h_home_wins': 0, 'h2h_away_wins': 0, 'h2h_draws': 0,
            'h2h_home_win_rate': 0.4, 'h2h_away_win_rate': 0.3, 'h2h_draw_rate': 0.3,
            'h2h_home_goals_avg': 1.2, 'h2h_away_goals_avg': 1.0, 'h2h_total_goals_avg': 2.2
        }

    # ------------------------------------------------------------------
    # Odds and context
    # ------------------------------------------------------------------
    def odds_features(self, home_odds: float = 2.0, draw_odds: float = 3.3,
                      away_odds: float = 3.5) -> Dict[str, float]:
        """Extract features from betting odds.

        Implied probabilities are the reciprocal odds normalised to sum to 1;
        the overround is the amount by which the raw reciprocals exceed 1.

        Raises:
            ZeroDivisionError: If any of the odds is zero.
        """
        raw_home = 1 / home_odds
        raw_draw = 1 / draw_odds
        raw_away = 1 / away_odds
        total_prob = (raw_home + raw_draw + raw_away)

        home_prob = raw_home / total_prob
        draw_prob = raw_draw / total_prob
        away_prob = raw_away / total_prob

        return {
            'odds_home': home_odds,
            'odds_draw': draw_odds,
            'odds_away': away_odds,
            'implied_home_prob': home_prob,
            'implied_draw_prob': draw_prob,
            'implied_away_prob': away_prob,
            'odds_overround': total_prob - 1,  # bookmaker margin
            'odds_favorite_margin': max(home_prob, away_prob) - min(home_prob, away_prob),
            'odds_is_home_favorite': 1 if home_prob > away_prob else 0,
            'odds_is_away_favorite': 1 if away_prob > home_prob else 0,
            'odds_home_value': home_odds * home_prob,  # EV indicator
            'odds_away_value': away_odds * away_prob,
        }

    def contextual_features(self, home_team: str, away_team: str,
                            match_date: Optional[datetime] = None,
                            is_cup: bool = False,
                            is_derby: bool = False) -> Dict[str, float]:
        """Extract contextual features about the match.

        Args:
            home_team: Accepted for interface symmetry; not used here.
            away_team: Accepted for interface symmetry; not used here.
            match_date: Kick-off date; defaults to the current local time.
            is_cup: Whether the match is a cup tie.
            is_derby: Whether the match is a derby.
        """
        if match_date is None:
            match_date = datetime.now()

        day_of_week = match_date.weekday()
        month = match_date.month

        # Rough season position on an August-to-May calendar: August maps to
        # 0.0 and the value grows by 0.1 per month, clamped to [0, 1] below.
        months_since_august = month - 8 if month >= 8 else month + 4
        season_progress = months_since_august / 10

        return {
            'ctx_day_of_week': day_of_week,
            'ctx_is_weekend': 1 if day_of_week >= 5 else 0,
            'ctx_month': month,
            'ctx_season_progress': min(1.0, max(0.0, season_progress)),
            'ctx_is_cup': 1 if is_cup else 0,
            'ctx_is_derby': 1 if is_derby else 0,
            'ctx_end_of_season': 1 if month in [4, 5] else 0,
            'ctx_start_of_season': 1 if month in [8, 9] else 0,
        }

    # ------------------------------------------------------------------
    # Assembly
    # ------------------------------------------------------------------
    @staticmethod
    def _assemble_features(home_stats: Dict, away_stats: Dict,
                           home_form: Dict, away_form: Dict,
                           h2h: Dict, odds_feats: Dict, ctx_feats: Dict) -> Dict:
        """Merge all feature groups, in vector order, and add the derived features."""
        features = {}
        # 1. Team statistics, 2. rolling form (home block first, then away)
        for prefix, group in (('home_', home_stats), ('away_', away_stats),
                              ('home_', home_form), ('away_', away_form)):
            for key, value in group.items():
                features[f'{prefix}{key}'] = value
        # 3. Head-to-head, 4. odds, 5. context
        features.update(h2h)
        features.update(odds_feats)
        features.update(ctx_feats)

        # 6. Derived features
        home_attack = home_stats.get('goals_scored_home', 1.5)
        away_attack = away_stats.get('goals_scored_away', 1.1)
        features['diff_goals_scored'] = home_attack - away_attack
        features['diff_goals_conceded'] = (away_stats.get('goals_conceded_away', 1.4)
                                           - home_stats.get('goals_conceded_home', 1.2))
        features['diff_shots'] = home_stats.get('shots_home', 12) - away_stats.get('shots_away', 10)
        features['diff_shots_target'] = (home_stats.get('shots_target_home', 4)
                                         - away_stats.get('shots_target_away', 3))
        features['diff_corners'] = home_stats.get('corners_home', 5) - away_stats.get('corners_away', 4)
        features['diff_form_points'] = home_form.get('form_points', 1.5) - away_form.get('form_points', 1.5)
        features['diff_win_rate'] = home_stats.get('home_win_rate', 0.4) - away_stats.get('away_win_rate', 0.3)
        # Expected total goals
        features['expected_total_goals'] = home_attack + away_attack
        # BTTS indicator (each term is algebraically goals / 3; the original
        # expression form is kept so the numeric output is unchanged)
        features['btts_indicator'] = min(
            1 - (1 - home_attack / 3),
            1 - (1 - away_attack / 3)
        )
        return features

    def extract_all_features(self, home_team: str, away_team: str,
                             home_odds: float = 2.0, draw_odds: float = 3.3,
                             away_odds: float = 3.5,
                             match_date: Optional[datetime] = None) -> np.ndarray:
        """Extract the full feature vector for a match.

        Returns:
            1-D float array whose positions correspond one-to-one to
            ``get_feature_names()``. Non-numeric or NaN values become 0.0.
        """
        features = self._assemble_features(
            self._team_stats(home_team),
            self._team_stats(away_team),
            self.rolling_form(home_team),
            self.rolling_form(away_team),
            self.head_to_head(home_team, away_team),
            self.odds_features(home_odds, draw_odds, away_odds),
            self.contextual_features(home_team, away_team, match_date),
        )
        return np.array([self._to_float(value) for value in features.values()], dtype=float)

    def get_feature_names(self) -> List[str]:
        """Get the feature names in the same order as ``extract_all_features``.

        The names are produced by assembling the default value of every
        feature group, so they cannot drift from the extraction order.
        """
        default_stats = self._stats_from_matches(pd.DataFrame(), pd.DataFrame())
        default_form = self._default_form()
        features = self._assemble_features(
            default_stats, default_stats,
            default_form, default_form,
            self._default_h2h(),
            self.odds_features(),
            self.contextual_features('', ''),
        )
        return list(features)
def create_feature_engine(data_path: Optional[Path] = None) -> AdvancedFeatureEngine:
    """Create a feature engine, loading historical data from disk when available.

    Candidate CSV files are tried in order: *data_path* (or
    ``DATA_DIR/processed/master_training_data.csv`` when it is ``None``),
    then ``DATA_DIR/comprehensive_training_data.csv``. The first one that
    exists is loaded. If neither exists, an engine without historical data
    is returned.
    """
    primary = data_path
    if primary is None:
        primary = DATA_DIR / "processed" / "master_training_data.csv"

    # Existing training data serves as the fallback source.
    fallback = DATA_DIR / "comprehensive_training_data.csv"

    for candidate in (primary, fallback):
        if candidate.exists():
            return AdvancedFeatureEngine(pd.read_csv(candidate))

    return AdvancedFeatureEngine()
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # Smoke test: build an engine from whatever data is on disk and extract
    # the feature vector for a single fixture.
    demo_engine = create_feature_engine()
    demo_odds = {"home_odds": 2.1, "draw_odds": 3.4, "away_odds": 3.2}
    demo_vector = demo_engine.extract_all_features(
        home_team="Arsenal", away_team="Chelsea", **demo_odds
    )

    vector_size = len(demo_vector)
    name_count = len(demo_engine.get_feature_names())
    print(f"Generated {vector_size} features")
    print(f"Feature names: {name_count}")