# footypredict-pro / src / features / advanced_feature_engine.py
# nananie143: Deploy advanced models with XGBoost/LightGBM
# 246a547 (verified)
"""
Advanced Feature Engineering Module
Generates roughly 100 features per match (101 at present) for improved prediction accuracy:
- Core statistics (shots, corners, cards)
- Form features with time decay
- Head-to-head history
- xG-based features
- Market/odds features
- Contextual features
"""
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from datetime import datetime, timedelta
import logging
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Base paths
# "data" directory three parent levels above this file's location.
DATA_DIR = Path(__file__).parent.parent.parent / "data"
class AdvancedFeatureEngine:
    """Build a fixed-order numeric feature vector (101 values) for a match.

    Feature groups, in output order:

    1. per-team historical statistics (23 per team, ``home_``/``away_`` prefix)
    2. rolling form with exponential decay (8 per team)
    3. head-to-head record (10)
    4. betting-odds features (12)
    5. contextual/calendar flags (8)
    6. derived differences and indicators (9)

    The historical frame may use snake_case columns (``home_team``,
    ``home_goals``, ``result``, ...) or the short codes (``HomeTeam``,
    ``FTHG``, ``FTR``, ...); the snake_case name wins when both exist.
    Every method degrades to neutral defaults when data is missing, so the
    engine is usable without any historical data at all.
    """

    # (stat key, venue whose matches are averaged, candidate columns in
    # priority order, fallback used when no usable value is available)
    _MEAN_STATS = (
        # Goals
        ('goals_scored_home', 'home', ('home_goals', 'FTHG'), 1.5),
        ('goals_conceded_home', 'home', ('away_goals', 'FTAG'), 1.2),
        ('goals_scored_away', 'away', ('away_goals', 'FTAG'), 1.1),
        ('goals_conceded_away', 'away', ('home_goals', 'FTHG'), 1.4),
        # Shots
        ('shots_home', 'home', ('home_shots', 'HS'), 12.0),
        ('shots_away', 'away', ('away_shots', 'AS'), 10.0),
        ('shots_target_home', 'home', ('home_shots_target', 'HST'), 4.0),
        ('shots_target_away', 'away', ('away_shots_target', 'AST'), 3.0),
        # Corners
        ('corners_home', 'home', ('home_corners', 'HC'), 5.0),
        ('corners_away', 'away', ('away_corners', 'AC'), 4.0),
        # Cards
        ('yellows_home', 'home', ('home_yellows', 'HY'), 1.5),
        ('yellows_away', 'away', ('away_yellows', 'AY'), 1.7),
        ('reds_home', 'home', ('home_reds', 'HR'), 0.05),
        ('reds_away', 'away', ('away_reds', 'AR'), 0.05),
        # Fouls
        ('fouls_home', 'home', ('home_fouls', 'HF'), 11.0),
        ('fouls_away', 'away', ('away_fouls', 'AF'), 12.0),
    )

    # xG averages come last in the stats dict (only present in some datasets).
    _XG_STATS = (
        ('xg_home', 'home', ('home_xg',), 0.0),
        ('xg_away', 'away', ('away_xg',), 0.0),
    )

    # Names of the derived features, in the order extract_all_features adds them.
    _DERIVED_FEATURES = (
        'diff_goals_scored', 'diff_goals_conceded', 'diff_shots',
        'diff_shots_target', 'diff_corners', 'diff_form_points',
        'diff_win_rate', 'expected_total_goals', 'btts_indicator',
    )

    def __init__(self, historical_data: Optional[pd.DataFrame] = None):
        """Store the historical frame and pre-compute per-team statistics.

        Args:
            historical_data: One row per past match, or None to run on
                defaults only.
        """
        self.historical_data = historical_data
        self.team_stats_cache = {}
        self.h2h_cache = {}
        if historical_data is not None:
            self._build_caches()

    # ------------------------------------------------------------------
    # Small shared helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _match_columns(df: pd.DataFrame) -> Tuple[str, str, str]:
        """Return the (home team, away team, result) column names for ``df``."""
        home_col = 'home_team' if 'home_team' in df.columns else 'HomeTeam'
        away_col = 'away_team' if 'away_team' in df.columns else 'AwayTeam'
        result_col = 'result' if 'result' in df.columns else 'FTR'
        return home_col, away_col, result_col

    @staticmethod
    def _team_mask(df: pd.DataFrame, column: str, team_lower: str) -> pd.Series:
        """Boolean mask (aligned to ``df.index``) of rows where ``column`` is the team.

        The comparison is case-insensitive. A missing column yields an
        all-False mask instead of raising.
        """
        if column not in df.columns:
            return pd.Series(False, index=df.index)
        return df[column].astype(str).str.lower() == team_lower

    @staticmethod
    def _column_mean(matches: pd.DataFrame, columns: Tuple[str, ...],
                     default: float) -> float:
        """Mean of the first column in ``columns`` that exists in ``matches``.

        Returns ``default`` when none of the columns exist or the mean is NaN
        (no rows, or no numeric values). A genuine mean of 0 is kept: the
        fallback is only for missing data.
        """
        for column in columns:
            if column in matches.columns:
                mean = pd.to_numeric(matches[column], errors='coerce').mean()
                return default if pd.isna(mean) else float(mean)
        return default

    @staticmethod
    def _goal_value(match: pd.Series, primary: str, fallback: str) -> float:
        """Read a goal count from a match row, treating missing/NaN/junk as 0."""
        raw = match.get(primary, match.get(fallback, 0))
        try:
            goals = float(raw)
        except (TypeError, ValueError):
            return 0.0
        return 0.0 if np.isnan(goals) else goals

    @staticmethod
    def _to_float(value) -> float:
        """Coerce a feature value to float; non-numeric or NaN becomes 0.0."""
        if isinstance(value, (int, float, np.integer, np.floating)):
            number = float(value)
            if not np.isnan(number):
                return number
        return 0.0

    @classmethod
    def _default_team_stats(cls) -> Dict:
        """Neutral team statistics, also the canonical key order of the stats dict."""
        stats = {key: default for key, _venue, _cols, default in cls._MEAN_STATS}
        stats.update({
            'home_matches': 0,
            'away_matches': 0,
            'total_matches': 0,
            'home_win_rate': 0.33,
            'away_win_rate': 0.33,
        })
        stats.update({key: default for key, _venue, _cols, default in cls._XG_STATS})
        return stats

    # ------------------------------------------------------------------
    # Team statistics
    # ------------------------------------------------------------------
    def _build_caches(self) -> None:
        """Build the team statistics cache from the historical data.

        Every team name found in a home or away team column gets an entry,
        keyed by its lower-cased name. Does nothing when there is no data.
        """
        df = self.historical_data
        if df is None or df.empty:
            return

        team_names = set()
        for column in ('home_team', 'HomeTeam', 'away_team', 'AwayTeam'):
            if column in df.columns:
                # Non-string entries (e.g. NaN for missing names) are skipped.
                team_names.update(
                    name for name in df[column].unique() if isinstance(name, str)
                )

        cache = {}
        for team in team_names:
            key = team.lower()
            if key not in cache:
                cache[key] = self._calculate_team_stats(team)
        self.team_stats_cache = cache

        logger.info(f"Built cache for {len(self.team_stats_cache)} teams")

    def _calculate_team_stats(self, team: str) -> Dict:
        """Calculate historical statistics for a team.

        Args:
            team: Team name; matched case-insensitively.

        Returns:
            Dict with per-venue averages, match counts, win rates and xG
            averages. Values fall back to neutral defaults wherever the data
            is missing, including when no historical data is loaded at all.
        """
        stats = self._default_team_stats()
        df = self.historical_data
        if df is None or df.empty:
            return stats

        team_lower = team.lower()
        home_col, away_col, _ = self._match_columns(df)
        by_venue = {
            'home': df[self._team_mask(df, home_col, team_lower)],
            'away': df[self._team_mask(df, away_col, team_lower)],
        }

        # Overwriting existing keys keeps the canonical key order intact.
        for key, venue, columns, default in self._MEAN_STATS + self._XG_STATS:
            stats[key] = self._column_mean(by_venue[venue], columns, default)

        stats['home_matches'] = len(by_venue['home'])
        stats['away_matches'] = len(by_venue['away'])
        stats['total_matches'] = stats['home_matches'] + stats['away_matches']
        stats['home_win_rate'] = self._calculate_win_rate(by_venue['home'], 'home')
        stats['away_win_rate'] = self._calculate_win_rate(by_venue['away'], 'away')
        return stats

    def _calculate_win_rate(self, matches: pd.DataFrame, team_type: str) -> float:
        """Share of ``matches`` won by the home ('home') or away (other) side.

        Returns 0.33 when there are no matches or no result column.
        """
        if matches.empty:
            return 0.33

        result_col = 'result' if 'result' in matches.columns else 'FTR'
        if result_col not in matches.columns:
            return 0.33

        winning_code = 'H' if team_type == 'home' else 'A'
        wins = (matches[result_col] == winning_code).sum()
        return float(wins) / len(matches)

    def _team_stats(self, team: str) -> Dict:
        """Return cached stats for ``team``, computing (and caching) on a miss."""
        key = team.lower()
        stats = self.team_stats_cache.get(key)
        if stats is None:
            stats = self._calculate_team_stats(team)
            # Do not pin defaults in the cache while no data is loaded.
            if self.historical_data is not None:
                self.team_stats_cache[key] = stats
        return stats

    # ------------------------------------------------------------------
    # Form
    # ------------------------------------------------------------------
    def rolling_form(self, team: str, n_matches: int = 5,
                     decay: float = 0.9) -> Dict[str, float]:
        """Calculate rolling form with exponential time decay.

        The i-th selected match gets weight ``decay ** i``. Points and goals
        are weighted averages; win/draw/loss values are plain counts.

        NOTE(review): matches are taken with ``head(n_matches)`` in frame
        order, so this reflects *recent* form only if the frame is sorted
        newest-first -- confirm against the data loader.

        Args:
            team: Team name; matched case-insensitively.
            n_matches: Maximum number of matches to consider.
            decay: Per-match weight multiplier.

        Returns:
            Dict of ``form_*`` values; defaults when the team has no matches.
        """
        df = self.historical_data
        if df is None:
            return self._default_form()

        team_lower = team.lower()
        home_col, away_col, result_col = self._match_columns(df)

        involved = (self._team_mask(df, home_col, team_lower)
                    | self._team_mask(df, away_col, team_lower))
        team_matches = df[involved].head(n_matches)
        if team_matches.empty:
            return self._default_form()

        weighted_points = 0.0
        weighted_for = 0.0
        weighted_against = 0.0
        total_weight = 0.0
        wins = draws = losses = 0

        for i, (_, match) in enumerate(team_matches.iterrows()):
            weight = decay ** i
            total_weight += weight
            is_home = str(match.get(home_col, '')).lower() == team_lower
            result = match.get(result_col, 'D')

            # Outcomes are counted directly; deriving them from the weighted
            # points would misclassify older matches.
            if (is_home and result == 'H') or (not is_home and result == 'A'):
                wins += 1
                weighted_points += 3 * weight
            elif result == 'D':
                draws += 1
                weighted_points += 1 * weight
            else:
                losses += 1

            home_goals = self._goal_value(match, 'home_goals', 'FTHG')
            away_goals = self._goal_value(match, 'away_goals', 'FTAG')
            scored, conceded = (home_goals, away_goals) if is_home else (away_goals, home_goals)
            weighted_for += scored * weight
            weighted_against += conceded * weight

        has_weight = total_weight > 0
        return {
            'form_points': weighted_points / total_weight if has_weight else 1.0,
            'form_goals_scored': weighted_for / total_weight if has_weight else 1.0,
            'form_goals_conceded': weighted_against / total_weight if has_weight else 1.0,
            'form_goal_diff': (weighted_for - weighted_against) / total_weight if has_weight else 0,
            'form_matches': len(team_matches),
            'form_wins': wins,
            'form_draws': draws,
            'form_losses': losses,
        }

    def _default_form(self) -> Dict[str, float]:
        """Neutral form values used when a team has no usable history."""
        return {
            'form_points': 1.5, 'form_goals_scored': 1.2, 'form_goals_conceded': 1.2,
            'form_goal_diff': 0, 'form_matches': 0, 'form_wins': 0,
            'form_draws': 0, 'form_losses': 0
        }

    # ------------------------------------------------------------------
    # Head-to-head
    # ------------------------------------------------------------------
    def head_to_head(self, home_team: str, away_team: str,
                     n_matches: int = 5) -> Dict[str, float]:
        """Get head-to-head statistics between two teams.

        Meetings at either venue are included. "home"/"away" in the returned
        keys refer to the ``home_team``/``away_team`` arguments, not to the
        venue of each historical match. Results are cached per
        (home, away, n_matches).

        Returns:
            Dict of ``h2h_*`` values; defaults when there is no data or the
            teams have never met.
        """
        df = self.historical_data
        if df is None:
            return self._default_h2h()

        home_lower = home_team.lower()
        away_lower = away_team.lower()
        # n_matches is part of the key so a different window is never served
        # a stale result.
        cache_key = (home_lower, away_lower, n_matches)
        cached = self.h2h_cache.get(cache_key)
        if cached is not None:
            return dict(cached)

        home_col, away_col, result_col = self._match_columns(df)
        same_venue = (self._team_mask(df, home_col, home_lower)
                      & self._team_mask(df, away_col, away_lower))
        swapped_venue = (self._team_mask(df, home_col, away_lower)
                         & self._team_mask(df, away_col, home_lower))
        h2h_matches = df[same_venue | swapped_venue].head(n_matches)
        if h2h_matches.empty:
            return self._default_h2h()

        home_wins = away_wins = draws = 0
        home_goals = 0.0
        away_goals = 0.0

        for _, match in h2h_matches.iterrows():
            hosted_by_home_team = str(match.get(home_col, '')).lower() == home_lower
            result = match.get(result_col, 'D')
            hg = self._goal_value(match, 'home_goals', 'FTHG')
            ag = self._goal_value(match, 'away_goals', 'FTAG')

            # Translate venue-relative codes/goals into the perspective of
            # the requested home_team.
            if hosted_by_home_team:
                win_code, loss_code = 'H', 'A'
                home_goals += hg
                away_goals += ag
            else:
                win_code, loss_code = 'A', 'H'
                home_goals += ag
                away_goals += hg

            if result == win_code:
                home_wins += 1
            elif result == loss_code:
                away_wins += 1
            else:
                draws += 1

        n = len(h2h_matches)
        h2h_stats = {
            'h2h_matches': n,
            'h2h_home_wins': home_wins,
            'h2h_away_wins': away_wins,
            'h2h_draws': draws,
            'h2h_home_win_rate': home_wins / n if n > 0 else 0.33,
            'h2h_away_win_rate': away_wins / n if n > 0 else 0.33,
            'h2h_draw_rate': draws / n if n > 0 else 0.33,
            'h2h_home_goals_avg': home_goals / n if n > 0 else 1.2,
            'h2h_away_goals_avg': away_goals / n if n > 0 else 1.0,
            'h2h_total_goals_avg': (home_goals + away_goals) / n if n > 0 else 2.2,
        }

        self.h2h_cache[cache_key] = h2h_stats
        # Hand out a copy so callers cannot mutate the cached entry.
        return dict(h2h_stats)

    def _default_h2h(self) -> Dict[str, float]:
        """Neutral head-to-head values used when the teams have no shared history."""
        return {
            'h2h_matches': 0, 'h2h_home_wins': 0, 'h2h_away_wins': 0, 'h2h_draws': 0,
            'h2h_home_win_rate': 0.4, 'h2h_away_win_rate': 0.3, 'h2h_draw_rate': 0.3,
            'h2h_home_goals_avg': 1.2, 'h2h_away_goals_avg': 1.0, 'h2h_total_goals_avg': 2.2
        }

    # ------------------------------------------------------------------
    # Odds and context
    # ------------------------------------------------------------------
    def odds_features(self, home_odds: float = 2.0, draw_odds: float = 3.3,
                      away_odds: float = 3.5) -> Dict[str, float]:
        """Extract features from decimal betting odds.

        Implied probabilities are the inverse odds normalised to sum to 1;
        the overround is the amount by which the raw inverse odds exceed 1.

        Raises:
            ZeroDivisionError: If any of the odds is 0.
        """
        inv_home = 1 / home_odds
        inv_draw = 1 / draw_odds
        inv_away = 1 / away_odds
        total_prob = (inv_home + inv_draw + inv_away)

        home_prob = inv_home / total_prob
        draw_prob = inv_draw / total_prob
        away_prob = inv_away / total_prob

        return {
            'odds_home': home_odds,
            'odds_draw': draw_odds,
            'odds_away': away_odds,
            'implied_home_prob': home_prob,
            'implied_draw_prob': draw_prob,
            'implied_away_prob': away_prob,
            'odds_overround': total_prob - 1,  # bookmaker margin
            'odds_favorite_margin': max(home_prob, away_prob) - min(home_prob, away_prob),
            'odds_is_home_favorite': 1 if home_prob > away_prob else 0,
            'odds_is_away_favorite': 1 if away_prob > home_prob else 0,
            'odds_home_value': home_odds * home_prob,  # EV indicator
            'odds_away_value': away_odds * away_prob,
        }

    def contextual_features(self, home_team: str, away_team: str,
                            match_date: Optional[datetime] = None,
                            is_cup: bool = False,
                            is_derby: bool = False) -> Dict[str, float]:
        """Extract contextual (calendar and fixture-type) features.

        ``home_team`` and ``away_team`` are accepted for interface symmetry
        but are not used. ``match_date`` defaults to the current time.
        """
        if match_date is None:
            match_date = datetime.now()

        day_of_week = match_date.weekday()
        month = match_date.month

        # Rough season position on an August-start calendar:
        # Aug -> 0.0, Dec -> 0.4, Jan -> 0.5, May -> 0.9; later months clamp to 1.0.
        if month >= 8:
            season_progress = (month - 8) / 10
        else:
            season_progress = (month + 4) / 10

        return {
            'ctx_day_of_week': day_of_week,
            'ctx_is_weekend': 1 if day_of_week >= 5 else 0,
            'ctx_month': month,
            'ctx_season_progress': min(1.0, max(0.0, season_progress)),
            'ctx_is_cup': 1 if is_cup else 0,
            'ctx_is_derby': 1 if is_derby else 0,
            'ctx_end_of_season': 1 if month in [4, 5] else 0,
            'ctx_start_of_season': 1 if month in [8, 9] else 0,
        }

    # ------------------------------------------------------------------
    # Full feature vector
    # ------------------------------------------------------------------
    def extract_all_features(self, home_team: str, away_team: str,
                             home_odds: float = 2.0, draw_odds: float = 3.3,
                             away_odds: float = 3.5,
                             match_date: Optional[datetime] = None) -> np.ndarray:
        """Extract the full feature vector for a match.

        Returns:
            1-D float array whose positions correspond one-to-one to
            ``get_feature_names()``. Missing or NaN values are emitted as 0.0.
        """
        features = {}

        # 1. Team statistics (46 features: 23 per team)
        home_stats = self._team_stats(home_team)
        away_stats = self._team_stats(away_team)
        for key, value in home_stats.items():
            features[f'home_{key}'] = value
        for key, value in away_stats.items():
            features[f'away_{key}'] = value

        # 2. Rolling form (16 features: 8 per team)
        home_form = self.rolling_form(home_team)
        away_form = self.rolling_form(away_team)
        for key, value in home_form.items():
            features[f'home_{key}'] = value
        for key, value in away_form.items():
            features[f'away_{key}'] = value

        # 3. Head-to-head (10 features)
        features.update(self.head_to_head(home_team, away_team))

        # 4. Odds features (12 features)
        features.update(self.odds_features(home_odds, draw_odds, away_odds))

        # 5. Contextual features (8 features)
        features.update(self.contextual_features(home_team, away_team, match_date))

        # 6. Derived features (9 features); home side at home vs away side away
        home_attack = home_stats.get('goals_scored_home', 1.5)
        away_attack = away_stats.get('goals_scored_away', 1.1)
        features['diff_goals_scored'] = home_attack - away_attack
        features['diff_goals_conceded'] = away_stats.get('goals_conceded_away', 1.4) - home_stats.get('goals_conceded_home', 1.2)
        features['diff_shots'] = home_stats.get('shots_home', 12) - away_stats.get('shots_away', 10)
        features['diff_shots_target'] = home_stats.get('shots_target_home', 4) - away_stats.get('shots_target_away', 3)
        features['diff_corners'] = home_stats.get('corners_home', 5) - away_stats.get('corners_away', 4)
        features['diff_form_points'] = home_form.get('form_points', 1.5) - away_form.get('form_points', 1.5)
        features['diff_win_rate'] = home_stats.get('home_win_rate', 0.4) - away_stats.get('away_win_rate', 0.3)

        # Expected total goals
        features['expected_total_goals'] = home_attack + away_attack

        # BTTS indicator: the smaller of the two scoring averages scaled by 3
        # (1 - (1 - x / 3) is x / 3; the original expression is kept as-is).
        features['btts_indicator'] = min(
            1 - (1 - home_attack / 3),
            1 - (1 - away_attack / 3)
        )

        return np.array([self._to_float(value) for value in features.values()])

    def get_feature_names(self) -> List[str]:
        """Get feature names in exactly the order ``extract_all_features`` emits values.

        The names are derived from the same default dictionaries and helper
        methods that produce the values, so the two cannot drift apart.
        """
        stat_keys = list(self._default_team_stats())
        form_keys = list(self._default_form())

        names = [f'home_{key}' for key in stat_keys]
        names += [f'away_{key}' for key in stat_keys]
        names += [f'home_{key}' for key in form_keys]
        names += [f'away_{key}' for key in form_keys]
        names.extend(self._default_h2h())
        names.extend(self.odds_features())
        names.extend(self.contextual_features('', ''))
        names.extend(self._DERIVED_FEATURES)
        return names
def create_feature_engine(data_path: Optional[Path] = None) -> AdvancedFeatureEngine:
    """Build a feature engine from the first training CSV that exists.

    Candidates are tried in order: ``data_path`` (or, when it is None, the
    processed master training file under ``DATA_DIR``), then the
    comprehensive training file directly under ``DATA_DIR``. If neither
    exists, an engine without historical data is returned.
    """
    primary = data_path
    if primary is None:
        primary = DATA_DIR / "processed" / "master_training_data.csv"

    # Fall back to the existing training data when the primary file is absent.
    secondary = DATA_DIR / "comprehensive_training_data.csv"

    for candidate in (primary, secondary):
        if candidate.exists():
            history = pd.read_csv(candidate)
            return AdvancedFeatureEngine(history)

    return AdvancedFeatureEngine()
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # Smoke test: build an engine from whatever data is available, extract
    # features for one sample fixture and report the vector/name counts.
    demo_engine = create_feature_engine()
    sample_fixture = {
        "home_team": "Arsenal",
        "away_team": "Chelsea",
        "home_odds": 2.1,
        "draw_odds": 3.4,
        "away_odds": 3.2,
    }
    demo_vector = demo_engine.extract_all_features(**sample_fixture)

    vector_length = len(demo_vector)
    name_count = len(demo_engine.get_feature_names())
    print(f"Generated {vector_length} features")
    print(f"Feature names: {name_count}")