#!/usr/bin/env python3
"""Fetch women's football player stats from the Sofascore API.

Scrapes season-level player statistics for multiple women's leagues:
  - FAWSL (2021-2026)
  - Liga F (Spain)
  - Division 1 Féminine (France)
  - Frauen-Bundesliga (Germany)
  - UWCL (Women's Champions League)
  - Serie A Femminile (configured below as ``serie_a_fem``)

Writes: data/sofascore/{league}_{season}.csv

Usage:
    python scripts/fetch_sofascore.py
"""

from __future__ import annotations

import csv
import time
from pathlib import Path

import requests

ROOT = Path(__file__).resolve().parent.parent
OUT = ROOT / "data" / "sofascore"
OUT.mkdir(parents=True, exist_ok=True)

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    "Accept": "application/json",
}

# Sofascore tournament IDs and seasons to scrape
TOURNAMENTS = {
    "fawsl": {
        "tournament_id": 1044,
        "seasons": {
            "2025-2026": 79227,
            "2024-2025": 64370,
            "2023-2024": 53244,
            "2022-2023": 42953,
            "2021-2022": 37669,
        },
    },
    "liga_f": {
        "tournament_id": 1127,
        "seasons": {
            "2025-2026": 77723,
            "2024-2025": 65687,
            "2023-2024": 53464,
            "2022-2023": 45147,
            "2021-2022": 37805,
        },
    },
    "d1_feminine": {
        "tournament_id": 1139,
        "seasons": {
            "2025-2026": 78460,
            "2024-2025": 64322,
            "2023-2024": 53225,
            "2022-2023": 44862,
            "2021-2022": 37668,
        },
    },
    "frauen_bundesliga": {
        "tournament_id": 232,
        "seasons": {
            "2025-2026": 78297,
            "2024-2025": 64294,
            "2023-2024": 53133,
            "2022-2023": 42673,
            "2021-2022": 37476,
        },
    },
    "uwcl": {
        "tournament_id": 696,
        "seasons": {
            "2025-2026": 77328,
            "2024-2025": 63572,
            "2023-2024": 52831,
            "2022-2023": 42445,
            "2021-2022": 37590,
        },
    },
    "serie_a_fem": {
        "tournament_id": 556,
        "seasons": {
            "2025-2026": 79548,
            "2024-2025": 65034,
            "2023-2024": 53676,
            "2022-2023": 45041,
            "2021-2022": 38047,
        },
    },
}

# Comma-separated stat names sent as the ``fields`` query parameter.
STAT_FIELDS = (
    "goals,expectedGoals,assists,bigChancesCreated,bigChancesMissed,"
    "shotsOnTarget,shotsOffTarget,keyPasses,successfulDribbles,"
    "accuratePasses,totalPasses,accurateLongBalls,totalLongBalls,"
    "accurateCrosses,totalCrosses,tacklesWon,interceptions,"
    "blockedShots,clearancesTotal,minutesPlayed,appearances,"
    "foulsDrawn,foulsCommitted,dispossessed,dribbledPast,"
    "totalDuels,duelsWon,aerialDuelsWon,rating"
)

# Column order of every CSV written by this script.
CSV_COLS = [
    "player_id", "player_name", "team_id", "team_name", "league", "season",
    "appearances", "minutes_played", "rating",
    "goals", "expected_goals", "assists",
    "big_chances_created", "big_chances_missed",
    "shots_on_target", "shots_off_target",
    "key_passes", "successful_dribbles",
    "accurate_passes", "total_passes",
    "accurate_long_balls", "total_long_balls",
    "accurate_crosses", "total_crosses",
    "tackles_won", "interceptions", "blocked_shots", "clearances",
    "fouls_drawn", "fouls_committed", "dispossessed", "dribbled_past",
    "total_duels", "duels_won", "aerial_duels_won",
]

# (CSV column, key in an API result entry, value used when the key is absent).
# A default of None leaves the CSV cell empty instead of writing 0.
_STAT_COLUMNS: tuple[tuple[str, str, int | None], ...] = (
    ("appearances", "appearances", 0),
    ("minutes_played", "minutesPlayed", 0),
    ("rating", "rating", None),
    ("goals", "goals", 0),
    ("expected_goals", "expectedGoals", None),
    ("assists", "assists", 0),
    ("big_chances_created", "bigChancesCreated", 0),
    ("big_chances_missed", "bigChancesMissed", 0),
    ("shots_on_target", "shotsOnTarget", 0),
    ("shots_off_target", "shotsOffTarget", 0),
    ("key_passes", "keyPasses", 0),
    ("successful_dribbles", "successfulDribbles", 0),
    ("accurate_passes", "accuratePasses", 0),
    ("total_passes", "totalPasses", 0),
    ("accurate_long_balls", "accurateLongBalls", 0),
    ("total_long_balls", "totalLongBalls", 0),
    ("accurate_crosses", "accurateCrosses", None),
    ("total_crosses", "totalCrosses", None),
    ("tackles_won", "tacklesWon", 0),
    ("interceptions", "interceptions", 0),
    ("blocked_shots", "blockedShots", 0),
    ("clearances", "clearancesTotal", 0),
    ("fouls_drawn", "foulsDrawn", 0),
    ("fouls_committed", "foulsCommitted", 0),
    ("dispossessed", "dispossessed", 0),
    ("dribbled_past", "dribbledPast", 0),
    ("total_duels", "totalDuels", 0),
    ("duels_won", "duelsWon", 0),
    ("aerial_duels_won", "aerialDuelsWon", 0),
)

# Upper bound on consecutive 403 responses tolerated for a single page, so a
# persistent block cannot keep the script waiting forever.
MAX_RATE_LIMIT_RETRIES = 5


def _get(url: str, retries: int = 3) -> requests.Response | None:
    """GET *url* with retries on transport-level failures.

    Args:
        url: Absolute URL to request.
        retries: Maximum number of attempts.

    Returns:
        The response (whatever its HTTP status), or None when every attempt
        failed with an SSL, connection, or timeout error.
    """
    for attempt in range(1, retries + 1):
        try:
            return requests.get(url, headers=HEADERS, timeout=15)
        except (
            requests.exceptions.SSLError,
            requests.exceptions.ConnectionError,
            # Read timeouts are not ConnectionError subclasses; without this
            # the timeout=15 above would surface as an unhandled exception.
            requests.exceptions.Timeout,
        ):
            if attempt >= retries:
                # Nothing left to retry, so do not sleep before giving up.
                print(f"  Connection error (attempt {attempt}/{retries}), giving up")
                break
            wait = 5 * attempt
            print(f"  Connection error (attempt {attempt}/{retries}), waiting {wait}s...")
            time.sleep(wait)
    return None


def _json_object(response: requests.Response) -> dict | None:
    """Return the response body decoded as a JSON object, or None if it is not one."""
    try:
        payload = response.json()
    except ValueError:
        return None
    return payload if isinstance(payload, dict) else None


def _season_start_year(name: str) -> int | None:
    """Extract the starting year from a season label, or None if there is none.

    The year is the last whitespace-separated token before the first "/"
    (or of the whole label when it contains no "/").
    """
    head = name.split("/")[0] if "/" in name else name
    tokens = head.split()
    if not tokens:
        return None
    try:
        year = int(tokens[-1])
    except ValueError:
        return None
    if 0 <= year < 100:
        # NOTE(review): assumes labels may use two-digit years ("23/24");
        # confirm against real API output. Pivot follows the POSIX %y rule.
        year += 2000 if year < 69 else 1900
    return year


def discover_seasons(tournament_id: int) -> dict[str, int]:
    """Fetch available seasons for a tournament.

    Args:
        tournament_id: Sofascore unique-tournament id.

    Returns:
        Mapping of season name to season id for seasons whose starting year
        is 2021 or later. Empty when the request fails or cannot be parsed.
    """
    url = f"https://www.sofascore.com/api/v1/unique-tournament/{tournament_id}/seasons"
    r = _get(url)
    if r is None or r.status_code != 200:
        # Compare against None explicitly: a Response is falsy for any
        # 4xx/5xx status, which would hide the real status code here.
        code = r.status_code if r is not None else "no response"
        print(f"  Failed to get seasons for tournament {tournament_id}: {code}")
        return {}

    payload = _json_object(r)
    if payload is None:
        print(f"  Failed to get seasons for tournament {tournament_id}: invalid JSON")
        return {}

    seasons: dict[str, int] = {}
    for item in payload.get("seasons") or []:
        name = item.get("name")
        sid = item.get("id")
        if not isinstance(name, str) or sid is None:
            continue
        # Only recent seasons (roughly 2021+)
        year = _season_start_year(name)
        if year is not None and year >= 2021:
            seasons[name] = sid
    return seasons


def _entry_to_row(entry: dict, league: str, season: str) -> dict:
    """Flatten one API result entry into a dict keyed by CSV_COLS."""
    # ``or {}`` also covers a key that is present but null.
    player = entry.get("player") or {}
    team = entry.get("team") or {}
    row = {
        "player_id": player.get("id"),
        "player_name": player.get("name"),
        "team_id": team.get("id"),
        "team_name": team.get("name"),
        "league": league,
        "season": season,
    }
    for column, api_key, default in _STAT_COLUMNS:
        row[column] = entry.get(api_key, default)
    return row


def fetch_player_stats(
    tournament_id: int, season_id: int, league: str, season: str
) -> list[dict]:
    """Fetch all player stats for a season, paginating through results.

    Args:
        tournament_id: Sofascore unique-tournament id.
        season_id: Sofascore season id within that tournament.
        league: League key copied into every row.
        season: Season label copied into every row.

    Returns:
        One dict per result entry, keyed by CSV_COLS. Pagination stops at the
        first empty page; on a connection failure, a non-200 status, an
        unparsable body, or a persistent 403 it stops early and returns the
        rows collected so far.
    """
    all_rows: list[dict] = []
    offset = 0
    page_size = 100
    rate_limit_hits = 0

    while True:
        url = (
            f"https://www.sofascore.com/api/v1/unique-tournament/{tournament_id}"
            f"/season/{season_id}/statistics"
            f"?limit={page_size}&offset={offset}"
            f"&accumulation=total&fields={STAT_FIELDS}"
        )
        r = _get(url)
        if r is None:
            print(f"  Failed to connect at offset {offset}")
            break

        if r.status_code == 403:
            rate_limit_hits += 1
            if rate_limit_hits > MAX_RATE_LIMIT_RETRIES:
                print(
                    f"  Still rate limited at offset {offset} after "
                    f"{MAX_RATE_LIMIT_RETRIES} waits, giving up"
                )
                break
            print(f"  Rate limited at offset {offset}, waiting 30s...")
            time.sleep(30)
            continue
        rate_limit_hits = 0

        if r.status_code != 200:
            print(f"  Error at offset {offset}: {r.status_code}")
            break

        data = _json_object(r)
        if data is None:
            print(f"  Invalid JSON at offset {offset}")
            break

        results = data.get("results") or []
        if not results:
            break

        all_rows.extend(_entry_to_row(entry, league, season) for entry in results)

        offset += page_size
        time.sleep(1.5)  # Rate limit

    return all_rows


def _count_data_rows(path: Path) -> int:
    """Return the number of CSV records in *path*, excluding the header."""
    # errors="replace": only the record count matters, so a file written with
    # a different encoding must not abort the run.
    with open(path, newline="", encoding="utf-8", errors="replace") as f:
        return max(sum(1 for _ in csv.reader(f)) - 1, 0)


def _write_rows(path: Path, rows: list[dict]) -> None:
    """Write *rows* to *path* as UTF-8 CSV with a CSV_COLS header."""
    # Write to a sibling temp file and rename it into place, so an interrupted
    # run never leaves a truncated CSV that later runs would skip as complete.
    tmp_path = path.with_name(path.name + ".tmp")
    with open(tmp_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=CSV_COLS)
        writer.writeheader()
        writer.writerows(rows)
    tmp_path.replace(path)


def main() -> None:
    """Scrape every configured league/season that has no CSV on disk yet."""
    # Auto-discover seasons for leagues without hardcoded IDs
    for league_key, info in TOURNAMENTS.items():
        if info["seasons"]:
            continue
        print(f"Discovering seasons for {league_key} (tournament {info['tournament_id']})...")
        info["seasons"] = discover_seasons(info["tournament_id"])
        if info["seasons"]:
            for name, sid in info["seasons"].items():
                print(f"  {name} (id={sid})")
        else:
            print("  No seasons found")
        time.sleep(1)

    total_players = 0

    for league_key, info in TOURNAMENTS.items():
        tid = info["tournament_id"]
        for season_name, season_id in info["seasons"].items():
            out_file = OUT / f"{league_key}_{season_name.replace('/', '-')}.csv"
            if out_file.exists():
                existing = _count_data_rows(out_file)
                print(f"SKIP {league_key} {season_name} — already have {existing} players")
                total_players += existing
                continue

            print(f"Fetching {league_key} {season_name} (tid={tid}, sid={season_id})...")
            rows = fetch_player_stats(tid, season_id, league_key, season_name)

            if rows:
                _write_rows(out_file, rows)
                print(f"  Saved {len(rows)} players to {out_file.name}")
                total_players += len(rows)
            else:
                print("  No data found")

            time.sleep(2)  # Between seasons

    print(f"\nTotal: {total_players} player-season records")


if __name__ == "__main__":
    main()