# footypredict-pro / src / data / github_collector.py
# nananie143's picture
# Deploy advanced models with XGBoost/LightGBM
# 246a547 verified
"""
GitHub Data Collector
Fetches football datasets from GitHub repositories and web sources:
- openfootball/football.json (football.csv project) - Open public domain football data
- jokecamp/FootballData - JSON/CSV odds data
- understat.com - xG data
- fbref.com - Advanced statistics
"""
import requests
import pandas as pd
from pathlib import Path
from typing import Optional, Dict, List
import logging
import json
import time
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Base paths: the "data" directory sits three levels above this file, and the
# raw GitHub downloads go under data/raw/github.
DATA_DIR = Path(__file__).parent.parent.parent.joinpath("data")
RAW_DATA_DIR = DATA_DIR.joinpath("raw", "github")
class GitHubCollector:
    """Download and parse football datasets from GitHub raw URLs and web sources.

    Nothing is cloned: individual files are fetched over HTTP and written to
    ``output_dir``, together with the CSV files derived by the ``fetch_*``
    methods.
    """

    # GitHub raw file URLs for direct download. Each key is also used as the
    # prefix of the local file name, so renaming a key renames the saved files.
    GITHUB_SOURCES = {
        "football_csv": {
            "base_url": "https://raw.githubusercontent.com/openfootball/football.json/master",
            "files": ["2023-24/en.1.json", "2023-24/de.1.json", "2023-24/es.1.json"],
            "format": "json"
        },
        "jokecamp_football": {
            "base_url": "https://raw.githubusercontent.com/jokecamp/FootballData/master",
            "files": ["openFootballData/stadiums.json", "openFootballData/countries.json"],
            "format": "json"
        }
    }

    def __init__(self, output_dir: Optional[Path] = None):
        """Create the collector and make sure its output directory exists.

        Args:
            output_dir: Directory for downloaded and derived files. Falls back
                to the module-level ``RAW_DATA_DIR`` when not given.
        """
        self.output_dir = output_dir or RAW_DATA_DIR
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def download_github_file(self, url: str, output_name: str) -> bool:
        """Download a single file and store it as ``output_dir / output_name``.

        Args:
            url: Full URL of the file to fetch.
            output_name: File name to write inside the output directory.

        Returns:
            True when the file was fetched and written, False on any failure
            (the failure is logged, never raised).
        """
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            output_path = self.output_dir / output_name
            with open(output_path, 'wb') as f:
                f.write(response.content)
        except Exception as e:
            # Best-effort download: one bad URL must not stop the whole run.
            logger.error(f"Failed to download {url}: {e}")
            return False
        logger.info(f"✓ Downloaded: {output_name}")
        return True

    @staticmethod
    def _load_matches(path: Path) -> Optional[pd.DataFrame]:
        """Parse one downloaded JSON file into a DataFrame of its "matches" list.

        Returns None when the file cannot be parsed (a warning is logged) or
        when the document is not a dict with a "matches" key.
        """
        try:
            # JSON is read as UTF-8 explicitly so that names with non-ASCII
            # characters do not depend on the platform's default encoding.
            with open(path, encoding="utf-8") as f:
                data = json.load(f)
            # Convert to DataFrame only if it's match data
            if isinstance(data, dict) and "matches" in data:
                return pd.DataFrame(data["matches"])
        except Exception as e:
            logger.warning(f"Failed to parse {path}: {e}")
        return None

    def fetch_football_csv(self) -> pd.DataFrame:
        """Download every file in ``GITHUB_SOURCES`` and combine the match data.

        Each file is saved to the output directory as
        ``<source_name>_<path with "/" replaced by "_">``. Files whose JSON has
        no "matches" key are downloaded but contribute no rows.

        Returns:
            All match rows concatenated with a fresh index, or an empty
            DataFrame when nothing could be parsed.
        """
        frames = []
        for source_name, config in self.GITHUB_SOURCES.items():
            base_url = config["base_url"]
            for file_path in config["files"]:
                url = f"{base_url}/{file_path}"
                output_name = f"{source_name}_{file_path.replace('/', '_')}"
                if not self.download_github_file(url, output_name):
                    continue
                # Parse based on format
                if config["format"] != "json":
                    continue
                # Separate name for the local path so the remote path held by
                # the loop variable is not overwritten.
                local_path = self.output_dir / output_name
                matches = self._load_matches(local_path)
                if matches is not None:
                    frames.append(matches)
        if frames:
            return pd.concat(frames, ignore_index=True)
        return pd.DataFrame()

    @staticmethod
    def _to_float(value, default: float = float("nan")) -> float:
        """Convert ``value`` to float, returning ``default`` for None or non-numeric input."""
        if value is None:
            return default
        try:
            return float(value)
        except (TypeError, ValueError):
            return default

    @classmethod
    def _match_to_record(cls, league: str, season: str, match: dict) -> Dict[str, object]:
        """Flatten one fixture dict from the understat client into a flat row.

        Missing keys keep the defaults of the original implementation ('' for
        names and date, 0 for goals and xG). A key that is present but None is
        treated like an empty dict for the nested lookups, and a None or
        non-numeric xG value is stored as NaN instead of raising.
        """
        home = match.get('h') or {}
        away = match.get('a') or {}
        goals = match.get('goals') or {}
        xg = match.get('xG') or {}
        return {
            'league': league,
            'season': season,
            'home_team': home.get('title', ''),
            'away_team': away.get('title', ''),
            'home_goals': goals.get('h', 0),
            'away_goals': goals.get('a', 0),
            'home_xg': cls._to_float(xg.get('h', 0)),
            'away_xg': cls._to_float(xg.get('a', 0)),
            'date': match.get('datetime', ''),
        }

    def fetch_understat_xg(
        self,
        leagues: Optional[List[str]] = None,
        seasons: Optional[List[str]] = None,
    ) -> pd.DataFrame:
        """Fetch per-match xG data through the optional ``understatapi`` package.

        Args:
            leagues: League identifiers to query. Defaults to EPL, La_Liga,
                Bundesliga, Serie_A and Ligue_1.
            seasons: Season identifiers to query. Defaults to 2020 through 2024.

        Returns:
            One row per fixture, also written to ``understat_xg_data.csv`` in
            the output directory. An empty DataFrame is returned when the
            package is missing, nothing was fetched, or an error occurred.
        """
        try:
            from understatapi import UnderstatClient
            if leagues is None:
                leagues = ["EPL", "La_Liga", "Bundesliga", "Serie_A", "Ligue_1"]
            if seasons is None:
                seasons = ["2024", "2023", "2022", "2021", "2020"]
            all_data = []
            with UnderstatClient() as client:
                for league in leagues:
                    for season in seasons:
                        try:
                            logger.info(f"Fetching xG: {league} {season}")
                            # Get league fixtures
                            fixtures = client.league(league).get_match_data(season)
                            # Row building tolerates None values, so a single
                            # incomplete fixture no longer raises and drops the
                            # remaining fixtures of this league/season.
                            all_data.extend(
                                self._match_to_record(league, season, match)
                                for match in fixtures
                            )
                            time.sleep(0.5)  # Rate limiting
                        except Exception as e:
                            logger.warning(f"Failed to get {league} {season}: {e}")
            if all_data:
                df = pd.DataFrame(all_data)
                output_file = self.output_dir / "understat_xg_data.csv"
                df.to_csv(output_file, index=False)
                logger.info(f"✓ Saved {len(df)} xG records to {output_file}")
                return df
        except ImportError:
            logger.warning("understatapi not installed, skipping xG data")
        except Exception as e:
            logger.error(f"Error fetching xG data: {e}")
        return pd.DataFrame()

    def fetch_fbref_stats(self, league_url: Optional[str] = None) -> pd.DataFrame:
        """Scrape the first HTML table from an fbref.com page.

        Args:
            league_url: Page to scrape. Defaults to the Premier League stats page.

        Returns:
            The first table found, also written to ``fbref_stats.csv`` in the
            output directory, or an empty DataFrame on any failure.
        """
        try:
            # Use pandas read_html to scrape tables
            if league_url is None:
                league_url = "https://fbref.com/en/comps/9/Premier-League-Stats"
            logger.info(f"Fetching stats from {league_url}")
            tables = pd.read_html(league_url)
            # Usually the main stats table is one of the first
            if tables:
                df = tables[0]
                output_file = self.output_dir / "fbref_stats.csv"
                df.to_csv(output_file, index=False)
                logger.info(f"✓ Saved {len(df)} rows to {output_file}")
                return df
        except Exception as e:
            logger.error(f"Error fetching fbref stats: {e}")
        return pd.DataFrame()

    def download_all(self) -> Dict[str, pd.DataFrame]:
        """Run the GitHub and Understat fetchers.

        Returns:
            A dict with the keys "github" and "understat_xg", each mapping to
            the DataFrame returned by the corresponding fetcher.
        """
        results = {}
        # GitHub sources
        logger.info("Fetching GitHub data...")
        results["github"] = self.fetch_football_csv()
        # Understat xG
        logger.info("Fetching Understat xG data...")
        results["understat_xg"] = self.fetch_understat_xg()
        # FBRef stats (optional, may require more handling)
        # results["fbref"] = self.fetch_fbref_stats()
        return results

    def get_combined_data(self) -> pd.DataFrame:
        """Concatenate every ``*.csv`` file in the output directory.

        Files are read in sorted name order so the row order of the result does
        not depend on the filesystem. Files that fail to load are skipped with
        a warning.

        NOTE: only CSV files are read. The raw JSON files saved by
        ``fetch_football_csv`` are not included in the result.

        Returns:
            All rows concatenated with a fresh index, or an empty DataFrame
            when no CSV file could be loaded.
        """
        all_dfs = []
        for csv_file in sorted(self.output_dir.glob("*.csv")):
            try:
                df = pd.read_csv(csv_file)
                all_dfs.append(df)
                logger.info(f"Loaded {len(df)} rows from {csv_file.name}")
            except Exception as e:
                logger.warning(f"Failed to load {csv_file}: {e}")
        if all_dfs:
            return pd.concat(all_dfs, ignore_index=True)
        return pd.DataFrame()
# Convenience function
def collect_github_data() -> pd.DataFrame:
    """Run all downloads with a default collector, then return its combined data.

    The frames returned by ``download_all`` are not used here; the result is
    whatever ``get_combined_data`` produces afterwards.
    """
    gh_collector = GitHubCollector()
    gh_collector.download_all()
    combined = gh_collector.get_combined_data()
    return combined
if __name__ == "__main__":
    # Script entry point: download everything and print a per-source row count.
    logging.basicConfig(level=logging.INFO)

    collector = GitHubCollector()
    print("Downloading data from GitHub and web sources...")
    results = collector.download_all()

    for name, df in results.items():
        if df.empty:
            print(f" {name}: No data")
        else:
            print(f" {name}: {len(df)} rows")