Spaces:
Runtime error
Runtime error
| """ | |
| GitHub Data Collector | |
| Fetches football datasets from GitHub repositories and web sources: | |
| - football.csv - Open public domain football data | |
| - jokecamp/FootballData - JSON/CSV odds data | |
| - understat.com - xG data | |
| - fbref.com - Advanced statistics | |
| """ | |
| import requests | |
| import pandas as pd | |
| from pathlib import Path | |
| from typing import Optional, Dict, List | |
| import logging | |
| import json | |
| import time | |
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Base paths: "data" sits three directory levels above this file, and the
# raw downloads handled by this module go under data/raw/github.
DATA_DIR = Path(__file__).parent.parent.parent.joinpath("data")
RAW_DATA_DIR = DATA_DIR.joinpath("raw", "github")
class GitHubCollector:
    """Download football datasets from GitHub raw URLs and related web sources.

    Files listed in ``GITHUB_SOURCES`` are saved verbatim under
    ``output_dir``. The Understat and FBRef fetchers additionally write CSV
    snapshots into the same directory, which ``get_combined_data`` reads back.
    """

    # GitHub raw file URLs for direct download
    GITHUB_SOURCES = {
        "football_csv": {
            "base_url": "https://raw.githubusercontent.com/openfootball/football.json/master",
            "files": ["2023-24/en.1.json", "2023-24/de.1.json", "2023-24/es.1.json"],
            "format": "json",
        },
        "jokecamp_football": {
            "base_url": "https://raw.githubusercontent.com/jokecamp/FootballData/master",
            "files": ["openFootballData/stadiums.json", "openFootballData/countries.json"],
            "format": "json",
        },
    }

    def __init__(self, output_dir: Optional[Path] = None):
        """Create the collector and make sure the output directory exists.

        Args:
            output_dir: Directory for downloaded files. Falls back to the
                module-level ``RAW_DATA_DIR`` when omitted. A plain string
                path is accepted as well.
        """
        self.output_dir = Path(output_dir) if output_dir else RAW_DATA_DIR
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def download_github_file(self, url: str, output_name: str) -> bool:
        """Download a single file and store it under ``output_dir``.

        Args:
            url: Full URL of the file to fetch.
            output_name: File name (relative to ``output_dir``) to write to.

        Returns:
            True when the file was fetched and written, False on any failure
            (the error is logged, not raised).
        """
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            (self.output_dir / output_name).write_bytes(response.content)
        except Exception as e:
            # Best-effort by design: callers only look at the boolean result.
            logger.error("Failed to download %s: %s", url, e)
            return False
        logger.info("✓ Downloaded: %s", output_name)
        return True

    @staticmethod
    def _load_matches_json(json_path: Path) -> Optional[pd.DataFrame]:
        """Return the ``"matches"`` list of a JSON file as a DataFrame.

        Returns None when the file cannot be parsed or when its top level is
        not a dict containing a ``"matches"`` key.
        """
        try:
            # Explicit UTF-8: JSON is UTF-8 by specification, whereas the
            # default text encoding depends on the platform locale.
            with open(json_path, encoding="utf-8") as f:
                data = json.load(f)
            if isinstance(data, dict) and "matches" in data:
                return pd.DataFrame(data["matches"])
        except Exception as e:
            logger.warning("Failed to parse %s: %s", json_path, e)
        return None

    def fetch_football_csv(self) -> pd.DataFrame:
        """Download every file in ``GITHUB_SOURCES`` and collect match data.

        Each file is saved to ``output_dir``; JSON files that contain a
        top-level ``"matches"`` list contribute their rows to the result.

        Returns:
            All match rows concatenated, or an empty DataFrame when nothing
            could be downloaded or parsed.
        """
        frames = []
        for source_name, config in self.GITHUB_SOURCES.items():
            base_url = config["base_url"]
            for file_path in config["files"]:
                url = f"{base_url}/{file_path}"
                output_name = f"{source_name}_{file_path.replace('/', '_')}"
                if not self.download_github_file(url, output_name):
                    continue
                # Parse based on format
                if config["format"] != "json":
                    continue
                frame = self._load_matches_json(self.output_dir / output_name)
                if frame is not None:
                    frames.append(frame)
        if frames:
            return pd.concat(frames, ignore_index=True)
        return pd.DataFrame()

    @staticmethod
    def _to_float(value) -> Optional[float]:
        """Convert ``value`` to float, returning None when it is not numeric."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return None

    @classmethod
    def _understat_match_to_record(cls, league: str, season: str, match: dict) -> dict:
        """Flatten one Understat match dict into a row for the xG table.

        Missing or null sub-dicts are treated as empty. xG values that are
        null or non-numeric become None instead of raising, so a single
        incomplete fixture does not abort the rest of its league/season.
        """
        home = match.get('h') or {}
        away = match.get('a') or {}
        goals = match.get('goals') or {}
        xg = match.get('xG') or {}
        return {
            'league': league,
            'season': season,
            'home_team': home.get('title', ''),
            'away_team': away.get('title', ''),
            'home_goals': goals.get('h', 0),
            'away_goals': goals.get('a', 0),
            'home_xg': cls._to_float(xg.get('h', 0)),
            'away_xg': cls._to_float(xg.get('a', 0)),
            'date': match.get('datetime', ''),
        }

    def fetch_understat_xg(
        self,
        leagues: Optional[List[str]] = None,
        seasons: Optional[List[str]] = None,
    ) -> pd.DataFrame:
        """Fetch per-match xG data through the optional ``understatapi`` package.

        Args:
            leagues: Understat league identifiers; defaults to the five
                leagues listed below.
            seasons: Season identifiers as strings; defaults to 2020-2024.

        Returns:
            One row per match (also written to ``understat_xg_data.csv`` in
            ``output_dir``), or an empty DataFrame when the package is not
            installed, nothing was fetched, or an error occurred.
        """
        try:
            from understatapi import UnderstatClient
        except ImportError:
            logger.warning("understatapi not installed, skipping xG data")
            return pd.DataFrame()

        if leagues is None:
            leagues = ["EPL", "La_Liga", "Bundesliga", "Serie_A", "Ligue_1"]
        if seasons is None:
            seasons = ["2024", "2023", "2022", "2021", "2020"]

        records = []
        try:
            with UnderstatClient() as client:
                for league in leagues:
                    for season in seasons:
                        try:
                            logger.info("Fetching xG: %s %s", league, season)
                            # Get league fixtures
                            fixtures = client.league(league).get_match_data(season)
                            records.extend(
                                self._understat_match_to_record(league, season, match)
                                for match in fixtures
                            )
                            time.sleep(0.5)  # Rate limiting between requests
                        except Exception as e:
                            # One failing league/season must not stop the others.
                            logger.warning("Failed to get %s %s: %s", league, season, e)
            if records:
                df = pd.DataFrame(records)
                output_file = self.output_dir / "understat_xg_data.csv"
                df.to_csv(output_file, index=False)
                logger.info("✓ Saved %s xG records to %s", len(df), output_file)
                return df
        except Exception as e:
            logger.error("Error fetching xG data: %s", e)
        return pd.DataFrame()

    def fetch_fbref_stats(self, league_url: Optional[str] = None) -> pd.DataFrame:
        """Scrape the first HTML table of an fbref.com page.

        Args:
            league_url: Page to scrape; defaults to the Premier League stats
                page.

        Returns:
            The first table found (also written to ``fbref_stats.csv`` in
            ``output_dir``), or an empty DataFrame on any failure.
        """
        try:
            if league_url is None:
                league_url = "https://fbref.com/en/comps/9/Premier-League-Stats"
            logger.info("Fetching stats from %s", league_url)
            # Use pandas read_html to scrape tables
            tables = pd.read_html(league_url)
            # Usually the main stats table is one of the first
            if tables:
                df = tables[0]
                output_file = self.output_dir / "fbref_stats.csv"
                df.to_csv(output_file, index=False)
                logger.info("✓ Saved %s rows to %s", len(df), output_file)
                return df
        except Exception as e:
            logger.error("Error fetching fbref stats: %s", e)
        return pd.DataFrame()

    def download_all(self) -> Dict[str, pd.DataFrame]:
        """Run the GitHub and Understat fetchers.

        Returns:
            Dict with keys ``"github"`` and ``"understat_xg"`` mapping to the
            DataFrame each fetcher returned (possibly empty).
        """
        results = {}
        # GitHub sources
        logger.info("Fetching GitHub data...")
        results["github"] = self.fetch_football_csv()
        # Understat xG
        logger.info("Fetching Understat xG data...")
        results["understat_xg"] = self.fetch_understat_xg()
        # FBRef stats are optional and may require more handling, so they are
        # not fetched here; call fetch_fbref_stats() explicitly when needed.
        return results

    def get_combined_data(self) -> pd.DataFrame:
        """Load every ``*.csv`` in ``output_dir`` into a single DataFrame.

        Files are read in sorted name order so the row order is reproducible.
        Files that fail to load are skipped with a warning. The CSVs are
        concatenated as-is, with no attempt to align differing columns.

        Returns:
            The concatenated rows, or an empty DataFrame when no CSV loads.
        """
        all_dfs = []
        for csv_file in sorted(self.output_dir.glob("*.csv")):
            try:
                df = pd.read_csv(csv_file)
                all_dfs.append(df)
                logger.info("Loaded %s rows from %s", len(df), csv_file.name)
            except Exception as e:
                logger.warning("Failed to load %s: %s", csv_file, e)
        if all_dfs:
            return pd.concat(all_dfs, ignore_index=True)
        return pd.DataFrame()
| # Convenience function | |
def collect_github_data() -> pd.DataFrame:
    """Run every download step, then return the combined on-disk CSV data.

    The DataFrames returned by ``download_all`` are discarded here; the
    result is whatever ``get_combined_data`` loads afterwards.
    """
    gh = GitHubCollector()
    gh.download_all()
    combined = gh.get_combined_data()
    return combined
if __name__ == "__main__":
    # Script entry point: download everything and print a per-source row count.
    logging.basicConfig(level=logging.INFO)
    collector = GitHubCollector()
    print("Downloading data from GitHub and web sources...")
    results = collector.download_all()
    for source, frame in results.items():
        if frame.empty:
            print(f" {source}: No data")
            continue
        print(f" {source}: {len(frame)} rows")