"""
Data Cleaner Module

Standardizes and cleans football match data from various sources.
Part of the complete blueprint implementation.
"""

import logging
import re
from datetime import datetime
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd

logger = logging.getLogger(__name__)


class DataCleaner:
    """
    Standardizes data from multiple sources into a common format.

    Handles:
    - Team name normalization
    - Date parsing
    - Missing value imputation
    - Outlier detection
    - Column standardization
    """

    # Team name aliases (common variations) -> canonical lowercase name
    TEAM_ALIASES = {
        # England
        'man united': 'manchester united',
        'man utd': 'manchester united',
        'man city': 'manchester city',
        'wolves': 'wolverhampton',
        'spurs': 'tottenham',
        'brighton': 'brighton and hove albion',
        'west ham': 'west ham united',
        'newcastle': 'newcastle united',
        'nott\'m forest': 'nottingham forest',
        'nottingham': 'nottingham forest',
        'leicester': 'leicester city',
        # Germany
        'bayern': 'bayern munich',
        'bayern munchen': 'bayern munich',
        'dortmund': 'borussia dortmund',
        'bvb': 'borussia dortmund',
        'm\'gladbach': 'borussia monchengladbach',
        'gladbach': 'borussia monchengladbach',
        'leverkusen': 'bayer leverkusen',
        'rb leipzig': 'leipzig',
        'wolfsburg': 'vfl wolfsburg',
        # Spain
        'real': 'real madrid',
        'barca': 'barcelona',
        'atleti': 'atletico madrid',
        'atletico': 'atletico madrid',
        # Italy
        'inter': 'inter milan',
        'internazionale': 'inter milan',
        'juve': 'juventus',
        'ac milan': 'milan',
        'napoli': 'ssc napoli',
        # France
        'psg': 'paris saint-germain',
        'paris': 'paris saint-germain',
        'monaco': 'as monaco',
        'marseille': 'olympique marseille',
        'lyon': 'olympique lyon',
    }

    # Target column name -> pandas dtype string for the common format
    STANDARD_COLUMNS = {
        'match_date': 'datetime64[ns]',
        'home_team': 'string',
        'away_team': 'string',
        'home_goals': 'int64',
        'away_goals': 'int64',
        'result': 'string',
        'league': 'string',
        'season': 'string',
    }

    # Standard team column -> alternative (already lowercased) source column
    # names, tried in order when the standard column is absent.
    _TEAM_COLUMN_ALIASES = {
        'home_team': ('hometeam', 'home'),
        'away_team': ('awayteam', 'away'),
    }

    def __init__(self):
        self.team_mapping = {}
        self._build_team_mapping()

    def _build_team_mapping(self):
        """Build the lowercase alias -> canonical lookup from TEAM_ALIASES."""
        for alias, canonical in self.TEAM_ALIASES.items():
            self.team_mapping[alias.lower()] = canonical.lower()

    def normalize_team_name(self, team: str) -> str:
        """Normalize a team name to its standard, title-cased form.

        Args:
            team: Raw team name. Missing values (None, NaN, NaT, pd.NA) and
                other non-string scalars are tolerated because this method is
                applied to raw DataFrame cells.

        Returns:
            The canonical name for known aliases, otherwise the cleaned name
            with a leading/trailing standalone "FC" removed. Missing or blank
            input yields an empty string.
        """
        if not isinstance(team, str):
            if team is None or (pd.api.types.is_scalar(team) and pd.isna(team)):
                return ''
            team = str(team)

        # Collapse whitespace first so "man   utd" still matches its alias.
        name = re.sub(r'\s+', ' ', team.lower()).strip()
        if not name:
            return ''
        if name in self.team_mapping:
            return self.team_mapping[name].title()

        # \b keeps suffixes such as "AFC" intact; only a standalone "fc"
        # token at either end of the name is removed.
        stripped = re.sub(r'\bfc$|^fc\s', '', name).strip()

        # Retry the alias lookup so "Man United FC" resolves as well.
        if stripped in self.team_mapping:
            return self.team_mapping[stripped].title()
        return stripped.title()

    def parse_date(self, date_str: str) -> Optional[datetime]:
        """Parse a date from various formats.

        Explicit day-first / ISO formats are tried before falling back to
        pandas' parser.

        Args:
            date_str: Date value to parse (typically a string).

        Returns:
            A datetime (possibly a pandas Timestamp), or None when the value
            is missing or cannot be parsed.
        """
        if pd.isna(date_str):
            return None

        text = str(date_str).strip()
        formats = (
            '%Y-%m-%d',
            '%d/%m/%Y',
            '%d-%m-%Y',
            '%Y/%m/%d',
            '%d %b %Y',
            '%b %d, %Y',
        )
        for fmt in formats:
            try:
                return datetime.strptime(text, fmt)
            except ValueError:
                continue

        # Fall back to pandas' more permissive parser.
        try:
            parsed = pd.to_datetime(date_str)
        except (ValueError, TypeError, OverflowError):
            return None
        # pandas may yield NaT (e.g. for an empty string); report it as None
        # so callers only ever see a real datetime or None.
        if pd.isna(parsed):
            return None
        return parsed

    def clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Clean and standardize a DataFrame.

        Steps:
        1. Normalize column names
        2. Parse dates
        3. Normalize team names
        4. Handle missing values
        5. Calculate derived columns

        Args:
            df: Raw match data. The input is not modified.

        Returns:
            A cleaned copy of ``df``.
        """
        df = df.copy()

        # 1. Lowercase column names
        df.columns = df.columns.str.lower().str.strip()

        # 2. Parse dates (first matching source column wins)
        for col in ('date', 'match_date', 'matchdate', 'date_time'):
            if col in df.columns:
                df['match_date'] = pd.to_datetime(df[col], errors='coerce')
                break

        # 3. Normalize team names, filling the standard column from the first
        #    available alias when it is absent.
        for target, aliases in self._TEAM_COLUMN_ALIASES.items():
            if target not in df.columns:
                for alias in aliases:
                    if alias in df.columns:
                        df[target] = df[alias]
                        break
            if target in df.columns:
                df[target] = df[target].apply(self.normalize_team_name)

        # 4. Handle missing values: numeric gaps become 0
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        if len(numeric_cols):
            df[numeric_cols] = df[numeric_cols].fillna(0)

        # 5. Calculate result if missing (H = home win, A = away win, D = draw)
        if 'result' not in df.columns and all(
            c in df.columns for c in ('home_goals', 'away_goals')
        ):
            home = df['home_goals']
            away = df['away_goals']
            # Undefined comparisons (missing values) count as False, so such
            # rows fall through to 'D', as in a row-by-row comparison.
            home_win = (home > away).to_numpy(dtype=bool, na_value=False)
            away_win = (home < away).to_numpy(dtype=bool, na_value=False)
            df['result'] = np.where(home_win, 'H', np.where(away_win, 'A', 'D'))

        return df

    def remove_duplicates(
        self,
        df: pd.DataFrame,
        subset: Optional[List[str]] = None
    ) -> pd.DataFrame:
        """Remove duplicate matches, keeping the last occurrence.

        Args:
            df: Match data.
            subset: Columns identifying a match. Defaults to match_date,
                home_team and away_team; columns absent from ``df`` are
                ignored.

        Returns:
            ``df`` without duplicates, or ``df`` itself when none of the
            subset columns are present.
        """
        subset = subset or ['match_date', 'home_team', 'away_team']
        available_subset = [c for c in subset if c in df.columns]

        if not available_subset:
            return df

        original_len = len(df)
        df = df.drop_duplicates(subset=available_subset, keep='last')

        removed = original_len - len(df)
        if removed > 0:
            logger.info("Removed %d duplicate rows", removed)

        return df

    def detect_outliers(
        self,
        df: pd.DataFrame,
        column: str,
        method: str = 'iqr',
        threshold: float = 1.5
    ) -> pd.Series:
        """
        Detect outliers in a column.

        Args:
            df: Data to inspect.
            column: Column to test.
            method: 'iqr' (Tukey fences) or 'zscore'.
            threshold: IQR multiplier, or absolute z-score cut-off.

        Returns:
            Boolean Series aligned with ``df.index``; True marks an outlier.
            All False when the column is missing, has no values, has no
            spread (zscore), or the method is unknown.
        """
        # Built on df.index so the mask can always be used to index df,
        # whatever its index looks like.
        no_outliers = pd.Series(False, index=df.index)

        if column not in df.columns:
            return no_outliers

        values = df[column].dropna()
        if values.empty:
            return no_outliers

        if method == 'iqr':
            q1 = values.quantile(0.25)
            q3 = values.quantile(0.75)
            iqr = q3 - q1
            lower = q1 - threshold * iqr
            upper = q3 + threshold * iqr
            return (df[column] < lower) | (df[column] > upper)

        if method == 'zscore':
            mean = values.mean()
            std = values.std()
            # A single value (std is NaN) or a constant column (std == 0)
            # has no meaningful z-score: nothing is an outlier.
            if pd.isna(std) or std == 0:
                return no_outliers
            return ((df[column] - mean) / std).abs() > threshold

        logger.warning("Unknown outlier detection method: %s", method)
        return no_outliers

    def impute_missing(
        self,
        df: pd.DataFrame,
        column: str,
        method: str = 'mean'
    ) -> pd.DataFrame:
        """Impute missing values in one column.

        Args:
            df: Data to impute. The input is not modified.
            column: Column whose missing values are filled.
            method: 'mean', 'median', 'mode' or 'zero'. Unknown methods leave
                the data unchanged.

        Returns:
            A copy of ``df`` with the column imputed.
        """
        df = df.copy()

        if column not in df.columns:
            return df

        if method == 'mean':
            df[column] = df[column].fillna(df[column].mean())
        elif method == 'median':
            df[column] = df[column].fillna(df[column].median())
        elif method == 'mode':
            modes = df[column].mode()
            # mode() is empty when every value is missing; fall back to 0.
            fill_value = modes.iloc[0] if not modes.empty else 0
            df[column] = df[column].fillna(fill_value)
        elif method == 'zero':
            df[column] = df[column].fillna(0)
        else:
            logger.warning("Unknown imputation method: %s", method)

        return df


# Global instance
_cleaner: Optional[DataCleaner] = None


def get_cleaner() -> DataCleaner:
    """Get or create the shared data cleaner."""
    global _cleaner
    if _cleaner is None:
        _cleaner = DataCleaner()
    return _cleaner


def clean_match_data(df: pd.DataFrame) -> pd.DataFrame:
    """Quick function to clean match data with the shared cleaner."""
    return get_cleaner().clean_dataframe(df)