Spaces:
Runtime error
Runtime error
"""
Data Merger

Combines all collected datasets into a unified training dataset:
- Standardizes team names across sources
- Aligns column schemas
- Merges historical data with xG and odds
"""
| import pandas as pd | |
| import numpy as np | |
| from pathlib import Path | |
| from typing import Dict, List, Optional, Tuple | |
| import logging | |
| from fuzzywuzzy import fuzz, process | |
| logger = logging.getLogger(__name__) | |
# Base paths, resolved relative to this module (three levels up to the
# project root — TODO confirm against the repo layout).
DATA_DIR = Path(__file__).parent.parent.parent / "data"
RAW_DATA_DIR = DATA_DIR / "raw"          # per-collector subdirs: kaggle/, huggingface/, github/
PROCESSED_DIR = DATA_DIR / "processed"   # default output location for merged datasets
class DataMerger:
    """Merges all collected datasets into unified training data.

    Loads per-source CSVs (Kaggle, HuggingFace, GitHub, plus any existing
    training file), maps each source's columns onto one canonical schema,
    canonicalizes team names, de-duplicates on (date, home_team, away_team),
    and writes the combined result to ``master_training_data.csv``.
    """

    # Maps raw source-specific column names -> canonical schema names.
    COLUMN_MAPPING = {
        # Date columns
        'Date': 'date', 'date': 'date', 'datetime': 'date', 'match_date': 'date',
        # Team columns
        'HomeTeam': 'home_team', 'home_team': 'home_team', 'home': 'home_team',
        'AwayTeam': 'away_team', 'away_team': 'away_team', 'away': 'away_team',
        'h': 'home_team', 'a': 'away_team',
        # Goals
        'FTHG': 'home_goals', 'FTAG': 'away_goals',
        'home_goals': 'home_goals', 'away_goals': 'away_goals',
        'HG': 'home_goals', 'AG': 'away_goals',
        # Half time
        'HTHG': 'ht_home_goals', 'HTAG': 'ht_away_goals',
        # Result
        'FTR': 'result', 'result': 'result',
        # xG
        'home_xg': 'home_xg', 'away_xg': 'away_xg',
        'xG_home': 'home_xg', 'xG_away': 'away_xg',
        # Shots
        'HS': 'home_shots', 'AS': 'away_shots',
        'HST': 'home_shots_target', 'AST': 'away_shots_target',
        # Other stats
        'HF': 'home_fouls', 'AF': 'away_fouls',
        'HC': 'home_corners', 'AC': 'away_corners',
        'HY': 'home_yellows', 'AY': 'away_yellows',
        'HR': 'home_reds', 'AR': 'away_reds',
        # Odds
        'B365H': 'odds_home', 'B365D': 'odds_draw', 'B365A': 'odds_away',
        'PSH': 'odds_home_ps', 'PSD': 'odds_draw_ps', 'PSA': 'odds_away_ps',
        # League
        'Div': 'league_code', 'LeagueName': 'league', 'league': 'league',
        'Season': 'season', 'season': 'season'
    }

    # Known team name variations (lowercased alias -> canonical name).
    TEAM_ALIASES = {
        'man united': 'manchester united',
        'man utd': 'manchester united',
        'manchester utd': 'manchester united',
        'man city': 'manchester city',
        'manchester c': 'manchester city',
        'spurs': 'tottenham',
        'tottenham hotspur': 'tottenham',
        'wolves': 'wolverhampton',
        'wolverhampton wanderers': 'wolverhampton',
        'west ham': 'west ham united',
        'brighton': 'brighton and hove albion',
        'brighton hove': 'brighton and hove albion',
        'nottm forest': 'nottingham forest',
        "nottingham": "nottingham forest",
        'newcastle utd': 'newcastle united',
        'sheffield utd': 'sheffield united',
        'leicester': 'leicester city',
        'crystal palace': 'crystal palace',
        'bournemouth': 'afc bournemouth',
        'bayern': 'bayern munich',
        'bayern münchen': 'bayern munich',
        'dortmund': 'borussia dortmund',
        'borussia m.gladbach': 'borussia monchengladbach',
        'gladbach': 'borussia monchengladbach',
        'atletico': 'atletico madrid',
        'atlético madrid': 'atletico madrid',
        'real': 'real madrid',
        'barca': 'barcelona',
        'milan': 'ac milan',
        'inter': 'inter milan',
        'internazionale': 'inter milan',
        'napoli': 'ssc napoli',
        'juventus': 'juventus fc',
        'roma': 'as roma',
        'psg': 'paris saint-germain',
        'paris sg': 'paris saint-germain',
        'lyon': 'olympique lyon',
        'marseille': 'olympique marseille'
    }

    def __init__(self, output_dir: Optional[Path] = None):
        """Create a merger writing to *output_dir* (default: PROCESSED_DIR)."""
        self.output_dir = output_dir or PROCESSED_DIR
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.team_index: Dict[str, str] = {}  # cache of fuzzy-match results

    def standardize_team_name(self, name: str) -> str:
        """Standardize a team name to its canonical form.

        Resolution order: exact alias lookup, cached fuzzy result, fresh
        fuzzy match against the alias keys, then a title-cased fallback.
        Non-string input is coerced with ``str``.
        """
        if not isinstance(name, str):
            return str(name)
        name_lower = name.lower().strip()
        # Check aliases first
        if name_lower in self.TEAM_ALIASES:
            return self.TEAM_ALIASES[name_lower]
        # Check fuzzy match cache
        if name_lower in self.team_index:
            return self.team_index[name_lower]
        # Try fuzzy match against known aliases.
        # NOTE(review): the default WRatio scorer rewards partial matches, so a
        # short alias like 'real' can score > 85 against longer unrelated names
        # (e.g. 'real sociedad') — consider fuzz.ratio or a higher threshold.
        match, score = process.extractOne(name_lower, list(self.TEAM_ALIASES.keys()))
        if score > 85:
            canonical = self.TEAM_ALIASES[match]
            self.team_index[name_lower] = canonical
            return canonical
        # Return title case version
        return name.strip().title()

    def standardize_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """Rename columns to the canonical schema names.

        Returns a new DataFrame; unknown columns pass through unchanged.
        """
        rename_map = {}
        for old_name in df.columns:
            if old_name in self.COLUMN_MAPPING:
                rename_map[old_name] = self.COLUMN_MAPPING[old_name]
        if rename_map:
            df = df.rename(columns=rename_map)
        # FIX: two raw columns may map to the same canonical name (e.g. 'Date'
        # and 'date'); duplicate labels would make the later
        # drop_duplicates(subset=...) raise, so keep only the first occurrence.
        if df.columns.duplicated().any():
            df = df.loc[:, ~df.columns.duplicated()]
        return df

    def load_kaggle_data(self) -> pd.DataFrame:
        """Load the combined Kaggle CSV; empty DataFrame if absent."""
        kaggle_dir = RAW_DATA_DIR / "kaggle"
        combined_file = kaggle_dir / "football_data_all_leagues.csv"
        if combined_file.exists():
            df = pd.read_csv(combined_file)
            df['source'] = 'kaggle'
            logger.info(f"Loaded {len(df)} matches from Kaggle data")
            return df
        return pd.DataFrame()

    def load_huggingface_data(self) -> pd.DataFrame:
        """Load and concatenate all HuggingFace CSVs; empty DataFrame if none."""
        hf_dir = RAW_DATA_DIR / "huggingface"
        all_dfs = []
        for csv_file in hf_dir.glob("*.csv"):
            try:
                df = pd.read_csv(csv_file)
                df['source'] = 'huggingface'
                all_dfs.append(df)
            except Exception as e:
                # Best-effort: skip unreadable files but keep merging the rest.
                logger.warning(f"Failed to load {csv_file}: {e}")
        if all_dfs:
            combined = pd.concat(all_dfs, ignore_index=True)
            logger.info(f"Loaded {len(combined)} rows from HuggingFace data")
            return combined
        return pd.DataFrame()

    def load_github_data(self) -> pd.DataFrame:
        """Load and concatenate all GitHub CSVs; empty DataFrame if none."""
        github_dir = RAW_DATA_DIR / "github"
        all_dfs = []
        for csv_file in github_dir.glob("*.csv"):
            try:
                df = pd.read_csv(csv_file)
                df['source'] = 'github'
                all_dfs.append(df)
            except Exception as e:
                # Best-effort: skip unreadable files but keep merging the rest.
                logger.warning(f"Failed to load {csv_file}: {e}")
        if all_dfs:
            combined = pd.concat(all_dfs, ignore_index=True)
            logger.info(f"Loaded {len(combined)} rows from GitHub data")
            return combined
        return pd.DataFrame()

    def load_existing_data(self) -> pd.DataFrame:
        """Load the pre-existing training CSV; empty DataFrame if absent."""
        existing_file = DATA_DIR / "comprehensive_training_data.csv"
        if existing_file.exists():
            df = pd.read_csv(existing_file)
            df['source'] = 'existing'
            logger.info(f"Loaded {len(df)} matches from existing training data")
            return df
        return pd.DataFrame()

    def merge_all_sources(self) -> pd.DataFrame:
        """Merge all data sources into one unified, de-duplicated dataset.

        Returns an empty DataFrame when no source yields data.
        """
        sources = []
        # Load from each source
        kaggle = self.load_kaggle_data()
        if not kaggle.empty:
            sources.append(('kaggle', kaggle))
        hf = self.load_huggingface_data()
        if not hf.empty:
            sources.append(('huggingface', hf))
        github = self.load_github_data()
        if not github.empty:
            sources.append(('github', github))
        existing = self.load_existing_data()
        if not existing.empty:
            sources.append(('existing', existing))
        if not sources:
            logger.warning("No data sources found")
            return pd.DataFrame()
        # Process each source
        processed = []
        for name, df in sources:
            logger.info(f"Processing {name}: {len(df)} rows")
            # Standardize columns
            df = self.standardize_columns(df)
            # Standardize team names
            if 'home_team' in df.columns:
                df['home_team'] = df['home_team'].apply(self.standardize_team_name)
            if 'away_team' in df.columns:
                df['away_team'] = df['away_team'].apply(self.standardize_team_name)
            processed.append(df)
        # Combine all
        combined = pd.concat(processed, ignore_index=True)
        # FIX: sources use different date string formats, so comparing or
        # sorting raw strings is wrong and cross-source duplicates are missed.
        # Normalize to datetime first; unparseable values become NaT.
        if 'date' in combined.columns:
            combined['date'] = pd.to_datetime(combined['date'], errors='coerce')
        # Remove duplicates (same date + teams); 'first' keeps the earliest-
        # loaded source (kaggle -> hf -> github -> existing).
        if all(col in combined.columns for col in ['date', 'home_team', 'away_team']):
            before = len(combined)
            combined = combined.drop_duplicates(subset=['date', 'home_team', 'away_team'], keep='first')
            logger.info(f"Removed {before - len(combined)} duplicates")
        # Sort by date, newest first
        if 'date' in combined.columns:
            combined = combined.sort_values('date', ascending=False)
        return combined

    def create_master_dataset(self) -> Tuple[pd.DataFrame, Dict]:
        """Create and save the master training dataset.

        Returns the merged DataFrame plus a stats dict; on failure the
        DataFrame is empty and the dict carries an "error" key.
        """
        logger.info("Merging all data sources...")
        combined = self.merge_all_sources()
        if combined.empty:
            return pd.DataFrame(), {"error": "No data to merge"}
        # Save master dataset
        output_file = self.output_dir / "master_training_data.csv"
        combined.to_csv(output_file, index=False)
        # Calculate stats
        stats = {
            "total_matches": len(combined),
            "sources": combined['source'].value_counts().to_dict() if 'source' in combined.columns else {},
            "teams": len(set(combined.get('home_team', [])) | set(combined.get('away_team', []))),
            "columns": len(combined.columns),
            "output_file": str(output_file)
        }
        logger.info(f"✓ Created master dataset: {len(combined)} matches")
        logger.info(f" Saved to: {output_file}")
        return combined, stats
def merge_all_data() -> pd.DataFrame:
    """Convenience function to merge all collected data"""
    df, stats = DataMerger().create_master_dataset()
    match_count = stats.get('total_matches', 0)
    source_count = len(stats.get('sources', {}))
    print(f"Merged {match_count} matches from {source_count} sources")
    return df
if __name__ == "__main__":
    # Script entry point: merge everything and report the resulting stats.
    logging.basicConfig(level=logging.INFO)
    df, stats = DataMerger().create_master_dataset()
    print("\nMaster Dataset Stats:")
    for key, value in stats.items():
        print(f" {key}: {value}")