""" GitHub Data Collector Fetches football datasets from GitHub repositories and web sources: - football.csv - Open public domain football data - jokecamp/FootballData - JSON/CSV odds data - understat.com - xG data - fbref.com - Advanced statistics """ import requests import pandas as pd from pathlib import Path from typing import Optional, Dict, List import logging import json import time logger = logging.getLogger(__name__) # Base paths DATA_DIR = Path(__file__).parent.parent.parent / "data" RAW_DATA_DIR = DATA_DIR / "raw" / "github" class GitHubCollector: """Clones and processes GitHub football datasets""" # GitHub raw file URLs for direct download GITHUB_SOURCES = { "football_csv": { "base_url": "https://raw.githubusercontent.com/openfootball/football.json/master", "files": ["2023-24/en.1.json", "2023-24/de.1.json", "2023-24/es.1.json"], "format": "json" }, "jokecamp_football": { "base_url": "https://raw.githubusercontent.com/jokecamp/FootballData/master", "files": ["openFootballData/stadiums.json", "openFootballData/countries.json"], "format": "json" } } def __init__(self, output_dir: Optional[Path] = None): self.output_dir = output_dir or RAW_DATA_DIR self.output_dir.mkdir(parents=True, exist_ok=True) def download_github_file(self, url: str, output_name: str) -> bool: """Download a single file from GitHub""" try: response = requests.get(url, timeout=30) response.raise_for_status() output_path = self.output_dir / output_name with open(output_path, 'wb') as f: f.write(response.content) logger.info(f"✓ Downloaded: {output_name}") return True except Exception as e: logger.error(f"Failed to download {url}: {e}") return False def fetch_football_csv(self) -> pd.DataFrame: """Fetch data from football.csv / openfootball project""" all_data = [] for source_name, config in self.GITHUB_SOURCES.items(): base_url = config["base_url"] for file_path in config["files"]: url = f"{base_url}/{file_path}" output_name = f"{source_name}_{file_path.replace('/', '_')}" if self.download_github_file(url, output_name): # Parse based on format file_path = self.output_dir / output_name if config["format"] == "json": try: with open(file_path) as f: data = json.load(f) # Convert to DataFrame if it's match data if isinstance(data, dict) and "matches" in data: df = pd.DataFrame(data["matches"]) all_data.append(df) except Exception as e: logger.warning(f"Failed to parse {file_path}: {e}") if all_data: return pd.concat(all_data, ignore_index=True) return pd.DataFrame() def fetch_understat_xg(self, leagues: List[str] = None, seasons: List[str] = None) -> pd.DataFrame: """Fetch xG data from understat.com API""" try: from understatapi import UnderstatClient if leagues is None: leagues = ["EPL", "La_Liga", "Bundesliga", "Serie_A", "Ligue_1"] if seasons is None: seasons = ["2024", "2023", "2022", "2021", "2020"] all_data = [] with UnderstatClient() as client: for league in leagues: for season in seasons: try: logger.info(f"Fetching xG: {league} {season}") # Get league fixtures fixtures = client.league(league).get_match_data(season) for match in fixtures: all_data.append({ 'league': league, 'season': season, 'home_team': match.get('h', {}).get('title', ''), 'away_team': match.get('a', {}).get('title', ''), 'home_goals': match.get('goals', {}).get('h', 0), 'away_goals': match.get('goals', {}).get('a', 0), 'home_xg': float(match.get('xG', {}).get('h', 0)), 'away_xg': float(match.get('xG', {}).get('a', 0)), 'date': match.get('datetime', '') }) time.sleep(0.5) # Rate limiting except Exception as e: logger.warning(f"Failed to get {league} {season}: {e}") if all_data: df = pd.DataFrame(all_data) output_file = self.output_dir / "understat_xg_data.csv" df.to_csv(output_file, index=False) logger.info(f"✓ Saved {len(df)} xG records to {output_file}") return df except ImportError: logger.warning("understatapi not installed, skipping xG data") except Exception as e: logger.error(f"Error fetching xG data: {e}") return pd.DataFrame() def fetch_fbref_stats(self, league_url: str = None) -> pd.DataFrame: """Fetch advanced stats from fbref.com""" try: # Use pandas read_html to scrape tables if league_url is None: league_url = "https://fbref.com/en/comps/9/Premier-League-Stats" logger.info(f"Fetching stats from {league_url}") tables = pd.read_html(league_url) # Usually the main stats table is one of the first if tables: df = tables[0] output_file = self.output_dir / "fbref_stats.csv" df.to_csv(output_file, index=False) logger.info(f"✓ Saved {len(df)} rows to {output_file}") return df except Exception as e: logger.error(f"Error fetching fbref stats: {e}") return pd.DataFrame() def download_all(self) -> Dict[str, pd.DataFrame]: """Download all GitHub and web data sources""" results = {} # GitHub sources logger.info("Fetching GitHub data...") results["github"] = self.fetch_football_csv() # Understat xG logger.info("Fetching Understat xG data...") results["understat_xg"] = self.fetch_understat_xg() # FBRef stats (optional, may require more handling) # results["fbref"] = self.fetch_fbref_stats() return results def get_combined_data(self) -> pd.DataFrame: """Get all GitHub data combined""" all_dfs = [] for csv_file in self.output_dir.glob("*.csv"): try: df = pd.read_csv(csv_file) all_dfs.append(df) logger.info(f"Loaded {len(df)} rows from {csv_file.name}") except Exception as e: logger.warning(f"Failed to load {csv_file}: {e}") if all_dfs: return pd.concat(all_dfs, ignore_index=True) return pd.DataFrame() # Convenience function def collect_github_data() -> pd.DataFrame: """Download and return all GitHub football data""" collector = GitHubCollector() collector.download_all() return collector.get_combined_data() if __name__ == "__main__": logging.basicConfig(level=logging.INFO) collector = GitHubCollector() print("Downloading data from GitHub and web sources...") results = collector.download_all() for name, df in results.items(): if not df.empty: print(f" {name}: {len(df)} rows") else: print(f" {name}: No data")