Spaces:

nananie143
/

footypredict-pro

Runtime error

App Files Files Community

footypredict-pro / src /data /github_collector.py

nananie143

Deploy advanced models with XGBoost/LightGBM

246a547 verified about 1 month ago

raw

history blame contribute delete

8.59 kB

	"""
	GitHub Data Collector

	Fetches football datasets from GitHub repositories and web sources:
	- openfootball/football.json - Open public domain football data (JSON)
	- jokecamp/FootballData - JSON/CSV odds data
	- understat.com - xG data
	- fbref.com - Advanced statistics
	"""

	import requests
	import pandas as pd
	from pathlib import Path
	from typing import Optional, Dict, List
	import logging
	import json
	import time

	# Module-level logger, named after this module.
	logger = logging.getLogger(__name__)

	# Base paths
	# DATA_DIR is the "data" directory found by walking three `.parent` steps up
	# from this file (presumably the repository root for src/data/ — TODO confirm).
	DATA_DIR = Path(__file__).parent.parent.parent / "data"
	# Default download directory used by GitHubCollector when no output_dir is given.
	RAW_DATA_DIR = DATA_DIR / "raw" / "github"


	class GitHubCollector:
	    """Download football datasets from GitHub and selected web sources.

	    Raw downloads and derived CSV files are written to ``output_dir``.
	    The ``fetch_*`` methods log failures and return an empty DataFrame
	    rather than propagating them.
	    """

	    # GitHub raw file URLs for direct download
	    GITHUB_SOURCES = {
	        "football_csv": {
	            "base_url": "https://raw.githubusercontent.com/openfootball/football.json/master",
	            "files": ["2023-24/en.1.json", "2023-24/de.1.json", "2023-24/es.1.json"],
	            "format": "json",
	        },
	        "jokecamp_football": {
	            "base_url": "https://raw.githubusercontent.com/jokecamp/FootballData/master",
	            "files": ["openFootballData/stadiums.json", "openFootballData/countries.json"],
	            "format": "json",
	        },
	    }

	    def __init__(self, output_dir: Optional[Path] = None):
	        """Create the collector and make sure its output directory exists.

	        Args:
	            output_dir: Directory for downloaded files. Falls back to
	                ``RAW_DATA_DIR`` when omitted (or falsy). A plain string
	                path is accepted as well.
	        """
	        # Coerce to Path so a str argument works; Path(Path) is harmless.
	        self.output_dir = Path(output_dir) if output_dir else RAW_DATA_DIR
	        self.output_dir.mkdir(parents=True, exist_ok=True)

	    @staticmethod
	    def _to_float(value) -> Optional[float]:
	        """Return ``float(value)``, or None when value is None or not numeric."""
	        if value is None:
	            return None
	        try:
	            return float(value)
	        except (TypeError, ValueError):
	            return None

	    def download_github_file(self, url: str, output_name: str) -> bool:
	        """Download a single file and store it under ``output_dir``.

	        Args:
	            url: Full URL of the file to fetch.
	            output_name: File name (relative to ``output_dir``) to write.

	        Returns:
	            True when the file was fetched and written, False on any error
	            (the error is logged, not raised).
	        """
	        try:
	            response = requests.get(url, timeout=30)
	            response.raise_for_status()

	            output_path = self.output_dir / output_name
	            with open(output_path, 'wb') as f:
	                f.write(response.content)
	        except Exception as e:
	            # Best-effort download: callers only need a success flag.
	            logger.error(f"Failed to download {url}: {e}")
	            return False

	        logger.info(f"✓ Downloaded: {output_name}")
	        return True

	    def fetch_football_csv(self) -> pd.DataFrame:
	        """Download every file in ``GITHUB_SOURCES`` and collect match data.

	        Each file is saved to ``output_dir``. JSON files whose top level is
	        a dict with a ``"matches"`` key contribute one DataFrame; other
	        files are downloaded but not parsed into the result.

	        Returns:
	            All match frames concatenated, or an empty DataFrame when no
	            match data was obtained.
	        """
	        frames = []

	        for source_name, config in self.GITHUB_SOURCES.items():
	            base_url = config["base_url"]

	            for file_path in config["files"]:
	                url = f"{base_url}/{file_path}"
	                output_name = f"{source_name}_{file_path.replace('/', '_')}"

	                if not self.download_github_file(url, output_name):
	                    continue
	                if config["format"] != "json":
	                    continue

	                local_path = self.output_dir / output_name
	                try:
	                    # Read explicitly as UTF-8 so non-ASCII names decode the
	                    # same way regardless of the platform's locale encoding.
	                    with open(local_path, encoding="utf-8") as f:
	                        data = json.load(f)
	                    # Convert to DataFrame if it's match data
	                    if isinstance(data, dict) and "matches" in data:
	                        frames.append(pd.DataFrame(data["matches"]))
	                except Exception as e:
	                    logger.warning(f"Failed to parse {local_path}: {e}")

	        if frames:
	            return pd.concat(frames, ignore_index=True)
	        return pd.DataFrame()

	    def fetch_understat_xg(
	        self,
	        leagues: Optional[List[str]] = None,
	        seasons: Optional[List[str]] = None,
	    ) -> pd.DataFrame:
	        """Fetch per-match xG data through the optional ``understatapi`` package.

	        Args:
	            leagues: League identifiers to query; defaults to the five
	                leagues listed in the body.
	            seasons: Season identifiers to query; defaults to 2020-2024.

	        Returns:
	            One row per match (also saved as ``understat_xg_data.csv`` in
	            ``output_dir``), or an empty DataFrame when the package is
	            missing, an error occurs, or nothing was returned. xG values
	            that are None or not numeric are stored as None.
	        """
	        try:
	            from understatapi import UnderstatClient

	            if leagues is None:
	                leagues = ["EPL", "La_Liga", "Bundesliga", "Serie_A", "Ligue_1"]

	            if seasons is None:
	                seasons = ["2024", "2023", "2022", "2021", "2020"]

	            rows = []

	            with UnderstatClient() as client:
	                for league in leagues:
	                    for season in seasons:
	                        try:
	                            logger.info(f"Fetching xG: {league} {season}")

	                            # Get league fixtures
	                            fixtures = client.league(league).get_match_data(season)

	                            for match in fixtures:
	                                # `or {}` also covers keys present with a None value.
	                                home = match.get('h') or {}
	                                away = match.get('a') or {}
	                                goals = match.get('goals') or {}
	                                xg = match.get('xG') or {}
	                                rows.append({
	                                    'league': league,
	                                    'season': season,
	                                    'home_team': home.get('title', ''),
	                                    'away_team': away.get('title', ''),
	                                    'home_goals': goals.get('h', 0),
	                                    'away_goals': goals.get('a', 0),
	                                    # A bare float() would raise on None and
	                                    # drop the rest of this league/season.
	                                    'home_xg': self._to_float(xg.get('h', 0)),
	                                    'away_xg': self._to_float(xg.get('a', 0)),
	                                    'date': match.get('datetime', ''),
	                                })

	                            time.sleep(0.5)  # Rate limiting

	                        except Exception as e:
	                            logger.warning(f"Failed to get {league} {season}: {e}")

	            if rows:
	                df = pd.DataFrame(rows)
	                output_file = self.output_dir / "understat_xg_data.csv"
	                df.to_csv(output_file, index=False)
	                logger.info(f"✓ Saved {len(df)} xG records to {output_file}")
	                return df

	        except ImportError:
	            logger.warning("understatapi not installed, skipping xG data")
	        except Exception as e:
	            logger.error(f"Error fetching xG data: {e}")

	        return pd.DataFrame()

	    def fetch_fbref_stats(self, league_url: Optional[str] = None) -> pd.DataFrame:
	        """Scrape the first HTML table from an fbref.com page.

	        Args:
	            league_url: Page to scrape; defaults to the Premier League
	                stats page.

	        Returns:
	            The first table found (also saved as ``fbref_stats.csv`` in
	            ``output_dir``), or an empty DataFrame on failure.
	        """
	        try:
	            # Use pandas read_html to scrape tables
	            if league_url is None:
	                league_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

	            logger.info(f"Fetching stats from {league_url}")

	            tables = pd.read_html(league_url)

	            # Usually the main stats table is one of the first
	            if tables:
	                df = tables[0]
	                output_file = self.output_dir / "fbref_stats.csv"
	                df.to_csv(output_file, index=False)
	                logger.info(f"✓ Saved {len(df)} rows to {output_file}")
	                return df

	        except Exception as e:
	            logger.error(f"Error fetching fbref stats: {e}")

	        return pd.DataFrame()

	    def download_all(self) -> Dict[str, pd.DataFrame]:
	        """Run the enabled fetchers and return their frames keyed by source.

	        Returns:
	            A dict with keys ``"github"`` and ``"understat_xg"``. The fbref
	            fetcher is not invoked here.
	        """
	        results = {}

	        # GitHub sources
	        logger.info("Fetching GitHub data...")
	        results["github"] = self.fetch_football_csv()

	        # Understat xG
	        logger.info("Fetching Understat xG data...")
	        results["understat_xg"] = self.fetch_understat_xg()

	        # FBRef stats (optional, may require more handling)
	        # results["fbref"] = self.fetch_fbref_stats()

	        return results

	    def get_combined_data(self) -> pd.DataFrame:
	        """Load every ``*.csv`` file in ``output_dir`` and stack them row-wise.

	        Files that fail to load are logged and skipped. Only CSV files are
	        read; raw JSON downloads in the same directory are not included.

	        Returns:
	            The concatenation of all loaded CSVs, or an empty DataFrame
	            when none could be loaded.
	        """
	        loaded = []

	        # Sort so row order does not depend on filesystem listing order.
	        for csv_file in sorted(self.output_dir.glob("*.csv")):
	            try:
	                df = pd.read_csv(csv_file)
	                loaded.append(df)
	                logger.info(f"Loaded {len(df)} rows from {csv_file.name}")
	            except Exception as e:
	                logger.warning(f"Failed to load {csv_file}: {e}")

	        if loaded:
	            return pd.concat(loaded, ignore_index=True)
	        return pd.DataFrame()


	# Convenience function
	def collect_github_data() -> pd.DataFrame:
	    """Run every enabled download, then return the collector's combined data.

	    A default-configured ``GitHubCollector`` is used; the frames returned by
	    ``download_all`` are discarded and the result comes from
	    ``get_combined_data``.
	    """
	    gh = GitHubCollector()
	    gh.download_all()
	    combined = gh.get_combined_data()
	    return combined


	if __name__ == "__main__":
	    # Script entry point: download everything and print a per-source summary.
	    logging.basicConfig(level=logging.INFO)

	    gh_collector = GitHubCollector()

	    print("Downloading data from GitHub and web sources...")
	    downloaded = gh_collector.download_all()

	    for name in downloaded:
	        df = downloaded[name]
	        if df.empty:
	            print(f" {name}: No data")
	            continue
	        print(f" {name}: {len(df)} rows")