Spaces:
Running
Running
#!/usr/bin/env python3
"""Fetch women's football player stats from the Sofascore API.

Scrapes season-level player statistics (seasons 2021-2022 through 2025-2026)
for multiple women's competitions:
- FAWSL (England)
- Liga F (Spain)
- Division 1 Féminine (France)
- Frauen-Bundesliga (Germany)
- UWCL (Women's Champions League)
- Serie A Femminile (Italy)

Writes: data/sofascore/<league>_<season>.csv

Usage:
    python scripts/fetch_sofascore.py
"""
| from __future__ import annotations | |
| import csv | |
| import time | |
| from pathlib import Path | |
| import requests | |
# Directory holding this script; the project root is assumed to be its parent.
_SCRIPT_DIR = Path(__file__).resolve().parent
ROOT = _SCRIPT_DIR.parent

# Destination for the per-league, per-season CSV files.
OUT = ROOT.joinpath("data", "sofascore")

# Created at import time so later writes never hit a missing directory.
OUT.mkdir(parents=True, exist_ok=True)
# Browser-style User-Agent string presented to the API.
_BROWSER_USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36"
)

# HTTP headers attached to every request issued by _get().
HEADERS = {
    "User-Agent": _BROWSER_USER_AGENT,
    "Accept": "application/json",
}
# Competitions to scrape, keyed by the league label used in output file names
# and in the "league" CSV column. Each entry carries the Sofascore
# unique-tournament ID plus a {season label: Sofascore season ID} mapping.
# An entry whose "seasons" mapping is empty is filled in by main() through
# discover_seasons().
TOURNAMENTS = {
    "fawsl": dict(
        tournament_id=1044,
        seasons={
            "2025-2026": 79227,
            "2024-2025": 64370,
            "2023-2024": 53244,
            "2022-2023": 42953,
            "2021-2022": 37669,
        },
    ),
    "liga_f": dict(
        tournament_id=1127,
        seasons={
            "2025-2026": 77723,
            "2024-2025": 65687,
            "2023-2024": 53464,
            "2022-2023": 45147,
            "2021-2022": 37805,
        },
    ),
    "d1_feminine": dict(
        tournament_id=1139,
        seasons={
            "2025-2026": 78460,
            "2024-2025": 64322,
            "2023-2024": 53225,
            "2022-2023": 44862,
            "2021-2022": 37668,
        },
    ),
    "frauen_bundesliga": dict(
        tournament_id=232,
        seasons={
            "2025-2026": 78297,
            "2024-2025": 64294,
            "2023-2024": 53133,
            "2022-2023": 42673,
            "2021-2022": 37476,
        },
    ),
    "uwcl": dict(
        tournament_id=696,
        seasons={
            "2025-2026": 77328,
            "2024-2025": 63572,
            "2023-2024": 52831,
            "2022-2023": 42445,
            "2021-2022": 37590,
        },
    ),
    "serie_a_fem": dict(
        tournament_id=556,
        seasons={
            "2025-2026": 79548,
            "2024-2025": 65034,
            "2023-2024": 53676,
            "2022-2023": 45041,
            "2021-2022": 38047,
        },
    ),
}
# Comma-separated list of statistic names passed verbatim as the "fields"
# query parameter of the season statistics endpoint.
STAT_FIELDS = ",".join(
    (
        "goals",
        "expectedGoals",
        "assists",
        "bigChancesCreated",
        "bigChancesMissed",
        "shotsOnTarget",
        "shotsOffTarget",
        "keyPasses",
        "successfulDribbles",
        "accuratePasses",
        "totalPasses",
        "accurateLongBalls",
        "totalLongBalls",
        "accurateCrosses",
        "totalCrosses",
        "tacklesWon",
        "interceptions",
        "blockedShots",
        "clearancesTotal",
        "minutesPlayed",
        "appearances",
        "foulsDrawn",
        "foulsCommitted",
        "dispossessed",
        "dribbledPast",
        "totalDuels",
        "duelsWon",
        "aerialDuelsWon",
        "rating",
    )
)
# Header row (and column order) of every CSV written by main(). The names
# match the keys of the row dicts built in fetch_player_stats().
CSV_COLS = [
    # Who and where
    "player_id",
    "player_name",
    "team_id",
    "team_name",
    "league",
    "season",
    # Playing time and overall rating
    "appearances",
    "minutes_played",
    "rating",
    # Scoring and chance creation
    "goals",
    "expected_goals",
    "assists",
    "big_chances_created",
    "big_chances_missed",
    "shots_on_target",
    "shots_off_target",
    "key_passes",
    "successful_dribbles",
    # Passing
    "accurate_passes",
    "total_passes",
    "accurate_long_balls",
    "total_long_balls",
    "accurate_crosses",
    "total_crosses",
    # Defending
    "tackles_won",
    "interceptions",
    "blocked_shots",
    "clearances",
    # Fouls, losses of possession and duels
    "fouls_drawn",
    "fouls_committed",
    "dispossessed",
    "dribbled_past",
    "total_duels",
    "duels_won",
    "aerial_duels_won",
]
def _get(url: str, retries: int = 3) -> requests.Response | None:
    """GET ``url`` with the shared headers, retrying transient network errors.

    Args:
        url: Absolute URL to request.
        retries: Maximum number of attempts before giving up.

    Returns:
        The response of the first attempt that completes, whatever its HTTP
        status code, or ``None`` when every attempt failed with a network
        error. Callers are responsible for checking ``status_code``.
    """
    transient_errors = (
        requests.exceptions.SSLError,
        requests.exceptions.ConnectionError,
        # A read timeout is not a ConnectionError, so it has to be listed
        # explicitly; otherwise the 15 s timeout below would crash the run.
        requests.exceptions.Timeout,
    )
    for attempt in range(1, retries + 1):
        try:
            return requests.get(url, headers=HEADERS, timeout=15)
        except transient_errors as exc:
            if attempt >= retries:
                # Nothing left to retry: report and stop without sleeping.
                print(f" Connection error (attempt {attempt}/{retries}), giving up: {exc}")
                break
            wait = 5 * attempt  # linear back-off: 5 s, 10 s, ...
            print(f" Connection error (attempt {attempt}/{retries}), waiting {wait}s...")
            time.sleep(wait)
    return None
def _season_start_year(name: str) -> int | None:
    """Return the starting year encoded at the end of a season name.

    Looks at the last whitespace-separated token before any ``/`` and, if
    that token is hyphenated, at its part before the first ``-``. Returns
    ``None`` when no integer can be read.
    """
    tokens = name.split("/")[0].split()
    if not tokens:
        return None
    try:
        year = int(tokens[-1].split("-")[0])
    except ValueError:
        return None
    # NOTE(review): season labels are presumably abbreviated as "24/25" —
    # confirm against a live response. Two-digit years are expanded so they
    # can be compared with the four-digit cut-off.
    if 0 <= year < 100:
        year += (2000 if year < 70 else 1900)
    return year


def discover_seasons(tournament_id: int) -> dict[str, int]:
    """Fetch the recent seasons (starting 2021 or later) of a tournament.

    Args:
        tournament_id: Sofascore unique-tournament ID.

    Returns:
        Mapping of season name to Sofascore season ID. Empty when the request
        fails, the body is not valid JSON, or no season qualifies.
    """
    url = f"https://www.sofascore.com/api/v1/unique-tournament/{tournament_id}/seasons"
    r = _get(url)
    if r is None or r.status_code != 200:
        # Compare with None explicitly: a requests.Response is falsy for any
        # 4xx/5xx status, so a truthiness test would hide the real status
        # code behind "no response".
        code = r.status_code if r is not None else "no response"
        print(f" Failed to get seasons for tournament {tournament_id}: {code}")
        return {}

    try:
        payload = r.json()
    except ValueError:
        print(f" Failed to get seasons for tournament {tournament_id}: invalid JSON")
        return {}
    if not isinstance(payload, dict):
        return {}

    seasons: dict[str, int] = {}
    for s in payload.get("seasons", []):
        name = s.get("name")
        sid = s.get("id")
        # Skip malformed entries instead of aborting the whole discovery.
        if not isinstance(name, str) or sid is None:
            continue
        # Only recent seasons (roughly 2021+)
        year = _season_start_year(name)
        if year is not None and year >= 2021:
            seasons[name] = sid
    return seasons
# (CSV column, Sofascore field, value used when the field is absent).
# NOTE(review): rating, expected goals and the two cross counts fall back to
# None (an empty CSV cell) rather than 0; this mirrors the previous behaviour —
# confirm that is intended for the cross counts.
_STAT_COLUMNS = (
    ("appearances", "appearances", 0),
    ("minutes_played", "minutesPlayed", 0),
    ("rating", "rating", None),
    ("goals", "goals", 0),
    ("expected_goals", "expectedGoals", None),
    ("assists", "assists", 0),
    ("big_chances_created", "bigChancesCreated", 0),
    ("big_chances_missed", "bigChancesMissed", 0),
    ("shots_on_target", "shotsOnTarget", 0),
    ("shots_off_target", "shotsOffTarget", 0),
    ("key_passes", "keyPasses", 0),
    ("successful_dribbles", "successfulDribbles", 0),
    ("accurate_passes", "accuratePasses", 0),
    ("total_passes", "totalPasses", 0),
    ("accurate_long_balls", "accurateLongBalls", 0),
    ("total_long_balls", "totalLongBalls", 0),
    ("accurate_crosses", "accurateCrosses", None),
    ("total_crosses", "totalCrosses", None),
    ("tackles_won", "tacklesWon", 0),
    ("interceptions", "interceptions", 0),
    ("blocked_shots", "blockedShots", 0),
    ("clearances", "clearancesTotal", 0),
    ("fouls_drawn", "foulsDrawn", 0),
    ("fouls_committed", "foulsCommitted", 0),
    ("dispossessed", "dispossessed", 0),
    ("dribbled_past", "dribbledPast", 0),
    ("total_duels", "totalDuels", 0),
    ("duels_won", "duelsWon", 0),
    ("aerial_duels_won", "aerialDuelsWon", 0),
)

# Consecutive HTTP 403 responses tolerated for one page before giving up.
_MAX_RATE_LIMIT_RETRIES = 5


def _entry_to_row(entry: dict, league: str, season: str) -> dict:
    """Flatten one API result entry into a CSV row dict."""
    # "or {}" also covers an explicit null for player/team.
    player = entry.get("player") or {}
    team = entry.get("team") or {}
    row = {
        "player_id": player.get("id"),
        "player_name": player.get("name"),
        "team_id": team.get("id"),
        "team_name": team.get("name"),
        "league": league,
        "season": season,
    }
    for column, api_field, default in _STAT_COLUMNS:
        row[column] = entry.get(api_field, default)
    return row


def fetch_player_stats(
    tournament_id: int, season_id: int, league: str, season: str
) -> list[dict]:
    """Fetch all player stats for a season, paginating through results.

    Args:
        tournament_id: Sofascore unique-tournament ID.
        season_id: Sofascore season ID within that tournament.
        league: League label copied into every row.
        season: Season label copied into every row.

    Returns:
        One row dict per player. Pagination stops at the first empty page.
        If a request fails part-way through (connection failure, non-200
        status, invalid JSON, or repeated 403s), the rows collected so far
        are returned, so the result may be incomplete.
    """
    all_rows: list[dict] = []
    offset = 0
    page_size = 100
    rate_limit_hits = 0

    while True:
        url = (
            f"https://www.sofascore.com/api/v1/unique-tournament/{tournament_id}"
            f"/season/{season_id}/statistics"
            f"?limit={page_size}&offset={offset}"
            f"&accumulation=total&fields={STAT_FIELDS}"
        )
        r = _get(url)
        if r is None:
            print(f" Failed to connect at offset {offset}")
            break

        if r.status_code == 403:
            # Bounded retry: a permanently blocked client must not spin here
            # forever.
            rate_limit_hits += 1
            if rate_limit_hits > _MAX_RATE_LIMIT_RETRIES:
                print(
                    f" Still rate limited at offset {offset} after "
                    f"{_MAX_RATE_LIMIT_RETRIES} waits, giving up"
                )
                break
            print(f" Rate limited at offset {offset}, waiting 30s...")
            time.sleep(30)
            continue

        if r.status_code != 200:
            print(f" Error at offset {offset}: {r.status_code}")
            break

        try:
            data = r.json()
        except ValueError:
            print(f" Invalid JSON at offset {offset}")
            break

        results = data.get("results", []) if isinstance(data, dict) else []
        if not results:
            break

        rate_limit_hits = 0  # only consecutive 403s count towards the cap
        all_rows.extend(_entry_to_row(entry, league, season) for entry in results)
        offset += page_size
        time.sleep(1.5)  # Rate limit

    return all_rows
def main() -> None:
    """Scrape every configured league/season and write one CSV per season.

    Seasons whose CSV already exists in ``OUT`` are skipped and their existing
    row count is added to the reported total. Leagues with no configured
    seasons are first filled in through ``discover_seasons``.
    """
    # Auto-discover seasons for leagues without hardcoded IDs
    for league_key, info in TOURNAMENTS.items():
        if info["seasons"]:
            continue
        print(f"Discovering seasons for {league_key} (tournament {info['tournament_id']})...")
        info["seasons"] = discover_seasons(info["tournament_id"])
        if info["seasons"]:
            for name, sid in info["seasons"].items():
                print(f" {name} (id={sid})")
        else:
            print(" No seasons found")
        time.sleep(1)

    total_players = 0
    for league_key, info in TOURNAMENTS.items():
        tid = info["tournament_id"]
        for season_name, season_id in info["seasons"].items():
            out_file = OUT / f"{league_key}_{season_name.replace('/', '-')}.csv"

            if out_file.exists():
                # Count lines in binary mode: independent of the encoding the
                # file was written with, and the handle is closed promptly.
                with open(out_file, "rb") as existing_file:
                    line_count = sum(1 for _ in existing_file)
                existing = max(line_count - 1, 0)  # exclude the header row
                print(f"SKIP {league_key} {season_name} — already have {existing} players")
                total_players += existing
                continue

            print(f"Fetching {league_key} {season_name} (tid={tid}, sid={season_id})...")
            rows = fetch_player_stats(tid, season_id, league_key, season_name)
            if rows:
                # Write to a temporary file and rename it afterwards, so an
                # interrupted run never leaves a truncated CSV that the
                # skip-if-exists check above would treat as complete.
                tmp_file = out_file.with_name(out_file.name + ".tmp")
                # Explicit UTF-8: player names contain non-ASCII characters
                # and the platform default encoding may not cover them.
                with open(tmp_file, "w", newline="", encoding="utf-8") as f:
                    writer = csv.DictWriter(f, fieldnames=CSV_COLS)
                    writer.writeheader()
                    writer.writerows(rows)
                tmp_file.replace(out_file)
                print(f" Saved {len(rows)} players to {out_file.name}")
                total_players += len(rows)
            else:
                print(" No data found")
            time.sleep(2)  # Between seasons

    print(f"\nTotal: {total_players} player-season records")


if __name__ == "__main__":
    main()