WIFX / scripts /fetch_sofascore.py
amadabhu
updates to the UI
d7a255c
#!/usr/bin/env python3
"""Fetch women's football player stats from Sofascore API.
Scrapes season-level player statistics for multiple women's leagues:
- FAWSL (2021-2026)
- Liga F (Spain)
- Division 1 Féminine (France)
- Frauen-Bundesliga (Germany)
- UWCL (Women's Champions League)
- Serie A Femminile (Italy)
Writes: data/sofascore/<league>_<season>.csv
Usage:
python scripts/fetch_sofascore.py
"""
from __future__ import annotations
import csv
import time
from pathlib import Path
import requests
# Project root: two directory levels above this script's resolved location.
ROOT = Path(__file__).resolve().parent.parent
# Directory that receives the per-league, per-season CSV files.
OUT = ROOT / "data" / "sofascore"
# NOTE: executed at import time, so merely importing this module creates the
# output directory (and any missing parents).
OUT.mkdir(parents=True, exist_ok=True)
# HTTP headers sent with every request: a browser-like User-Agent and a JSON
# Accept header.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    "Accept": "application/json",
}
# Sofascore tournament IDs and seasons to scrape.
#
# Maps a league key (used in the output file name and in the "league" CSV
# column) to:
#   - "tournament_id": the id placed in the unique-tournament API URL
#   - "seasons": season label -> Sofascore season id
# A league whose "seasons" mapping is empty gets its seasons auto-discovered
# by main() via discover_seasons().
TOURNAMENTS = {
    "fawsl": {
        "tournament_id": 1044,
        "seasons": {
            "2025-2026": 79227,
            "2024-2025": 64370,
            "2023-2024": 53244,
            "2022-2023": 42953,
            "2021-2022": 37669,
        },
    },
    "liga_f": {
        "tournament_id": 1127,
        "seasons": {
            "2025-2026": 77723,
            "2024-2025": 65687,
            "2023-2024": 53464,
            "2022-2023": 45147,
            "2021-2022": 37805,
        },
    },
    "d1_feminine": {
        "tournament_id": 1139,
        "seasons": {
            "2025-2026": 78460,
            "2024-2025": 64322,
            "2023-2024": 53225,
            "2022-2023": 44862,
            "2021-2022": 37668,
        },
    },
    "frauen_bundesliga": {
        "tournament_id": 232,
        "seasons": {
            "2025-2026": 78297,
            "2024-2025": 64294,
            "2023-2024": 53133,
            "2022-2023": 42673,
            "2021-2022": 37476,
        },
    },
    "uwcl": {
        "tournament_id": 696,
        "seasons": {
            "2025-2026": 77328,
            "2024-2025": 63572,
            "2023-2024": 52831,
            "2022-2023": 42445,
            "2021-2022": 37590,
        },
    },
    # NOTE(review): not listed in the module docstring's league list.
    "serie_a_fem": {
        "tournament_id": 556,
        "seasons": {
            "2025-2026": 79548,
            "2024-2025": 65034,
            "2023-2024": 53676,
            "2022-2023": 45041,
            "2021-2022": 38047,
        },
    },
}
# Statistic names requested from the season statistics endpoint. They are sent
# as one comma-separated value in the ``fields`` query parameter.
STAT_FIELDS = ",".join(
    (
        "goals",
        "expectedGoals",
        "assists",
        "bigChancesCreated",
        "bigChancesMissed",
        "shotsOnTarget",
        "shotsOffTarget",
        "keyPasses",
        "successfulDribbles",
        "accuratePasses",
        "totalPasses",
        "accurateLongBalls",
        "totalLongBalls",
        "accurateCrosses",
        "totalCrosses",
        "tacklesWon",
        "interceptions",
        "blockedShots",
        "clearancesTotal",
        "minutesPlayed",
        "appearances",
        "foulsDrawn",
        "foulsCommitted",
        "dispossessed",
        "dribbledPast",
        "totalDuels",
        "duelsWon",
        "aerialDuelsWon",
        "rating",
    )
)
# Column order of every CSV file written by this script.
CSV_COLS = [
    # Identity and context
    "player_id",
    "player_name",
    "team_id",
    "team_name",
    "league",
    "season",
    # Playing time and overall rating
    "appearances",
    "minutes_played",
    "rating",
    # Scoring and chance creation
    "goals",
    "expected_goals",
    "assists",
    "big_chances_created",
    "big_chances_missed",
    "shots_on_target",
    "shots_off_target",
    "key_passes",
    "successful_dribbles",
    # Passing
    "accurate_passes",
    "total_passes",
    "accurate_long_balls",
    "total_long_balls",
    "accurate_crosses",
    "total_crosses",
    # Defending
    "tackles_won",
    "interceptions",
    "blocked_shots",
    "clearances",
    # Discipline and ball retention
    "fouls_drawn",
    "fouls_committed",
    "dispossessed",
    "dribbled_past",
    # Duels
    "total_duels",
    "duels_won",
    "aerial_duels_won",
]
def _get(url: str, retries: int = 3) -> requests.Response | None:
    """GET *url* with the shared headers, retrying on transient network errors.

    Args:
        url: Absolute URL to request.
        retries: Maximum number of attempts.

    Returns:
        The response of the first attempt that completes, whatever its HTTP
        status (callers inspect ``status_code`` themselves), or ``None`` when
        every attempt failed with a connection, SSL or timeout error.
    """
    for attempt in range(1, retries + 1):
        try:
            return requests.get(url, headers=HEADERS, timeout=15)
        except (
            requests.exceptions.SSLError,
            requests.exceptions.ConnectionError,
            # A read timeout is not a ConnectionError; without this clause the
            # ``timeout=15`` above could abort the whole script.
            requests.exceptions.Timeout,
        ) as exc:
            if attempt >= retries:
                # No further attempt follows, so waiting would be pointless.
                print(f" Connection error (attempt {attempt}/{retries}), giving up: {exc}")
                break
            wait = 5 * attempt  # linear back-off: 5s, 10s, ...
            print(f" Connection error (attempt {attempt}/{retries}), waiting {wait}s...")
            time.sleep(wait)
    return None
def _season_start_year(name: str) -> int | None:
    """Return the starting year encoded at the end of a season name, if any."""
    # For "Some League 2023/2024" the start year is the last word before the
    # slash; for names without a slash it is simply the last word.
    words = name.split("/")[0].split()
    if not words:
        return None
    try:
        year = int(words[-1])
    except ValueError:
        return None
    # Two-digit years (e.g. "24/25") would otherwise never pass a ">= 2021"
    # filter. NOTE(review): assumes such names refer to the 2000s — confirm
    # against the season names the API actually returns.
    if 0 <= year < 100:
        year += 2000
    return year


def discover_seasons(tournament_id: int) -> dict[str, int]:
    """Fetch the recent seasons (start year 2021 or later) of a tournament.

    Args:
        tournament_id: Sofascore unique-tournament id.

    Returns:
        Mapping of season name to season id. Empty when the request fails,
        the response is not valid JSON, or no season name yields a start
        year of 2021 or later.
    """
    url = f"https://www.sofascore.com/api/v1/unique-tournament/{tournament_id}/seasons"
    r = _get(url)
    if r is None or r.status_code != 200:
        # Compare with None explicitly: a requests.Response is falsy for any
        # 4xx/5xx status, so ``if r`` would report "no response" instead of
        # the actual status code.
        code = r.status_code if r is not None else "no response"
        print(f" Failed to get seasons for tournament {tournament_id}: {code}")
        return {}
    try:
        payload = r.json()
    except ValueError:
        print(f" Failed to get seasons for tournament {tournament_id}: invalid JSON")
        return {}
    if not isinstance(payload, dict):
        return {}
    seasons: dict[str, int] = {}
    for s in payload.get("seasons", []):
        name = s.get("name")
        sid = s.get("id")
        if not name or sid is None:
            # Skip malformed entries instead of crashing on a missing key.
            continue
        # Only recent seasons (roughly 2021+)
        year = _season_start_year(name)
        if year is not None and year >= 2021:
            seasons[name] = sid
    return seasons
# (CSV column, API field, default used when the field is absent from an entry).
# A default of None leaves the CSV cell empty instead of writing 0.
_STAT_COLUMNS = (
    ("appearances", "appearances", 0),
    ("minutes_played", "minutesPlayed", 0),
    ("rating", "rating", None),
    ("goals", "goals", 0),
    ("expected_goals", "expectedGoals", None),
    ("assists", "assists", 0),
    ("big_chances_created", "bigChancesCreated", 0),
    ("big_chances_missed", "bigChancesMissed", 0),
    ("shots_on_target", "shotsOnTarget", 0),
    ("shots_off_target", "shotsOffTarget", 0),
    ("key_passes", "keyPasses", 0),
    ("successful_dribbles", "successfulDribbles", 0),
    ("accurate_passes", "accuratePasses", 0),
    ("total_passes", "totalPasses", 0),
    ("accurate_long_balls", "accurateLongBalls", 0),
    ("total_long_balls", "totalLongBalls", 0),
    ("accurate_crosses", "accurateCrosses", None),
    ("total_crosses", "totalCrosses", None),
    ("tackles_won", "tacklesWon", 0),
    ("interceptions", "interceptions", 0),
    ("blocked_shots", "blockedShots", 0),
    ("clearances", "clearancesTotal", 0),
    ("fouls_drawn", "foulsDrawn", 0),
    ("fouls_committed", "foulsCommitted", 0),
    ("dispossessed", "dispossessed", 0),
    ("dribbled_past", "dribbledPast", 0),
    ("total_duels", "totalDuels", 0),
    ("duels_won", "duelsWon", 0),
    ("aerial_duels_won", "aerialDuelsWon", 0),
)


def _row_from_entry(entry: dict, league: str, season: str) -> dict:
    """Flatten one API result entry into a CSV row keyed by column name."""
    # ``or {}`` also covers a key that is present but null, which a plain
    # ``.get(key, {})`` default would not.
    player = entry.get("player") or {}
    team = entry.get("team") or {}
    row = {
        "player_id": player.get("id"),
        "player_name": player.get("name"),
        "team_id": team.get("id"),
        "team_name": team.get("name"),
        "league": league,
        "season": season,
    }
    for column, api_field, default in _STAT_COLUMNS:
        row[column] = entry.get(api_field, default)
    return row


def fetch_player_stats(
    tournament_id: int,
    season_id: int,
    league: str,
    season: str,
    max_rate_limit_retries: int = 5,
) -> list[dict]:
    """Fetch all player stats for a season, paginating through results.

    Args:
        tournament_id: Sofascore unique-tournament id.
        season_id: Sofascore season id within that tournament.
        league: League key copied into every row's ``league`` column.
        season: Season label copied into every row's ``season`` column.
        max_rate_limit_retries: How many consecutive HTTP 403 responses to
            wait out for a single page before giving up.

    Returns:
        One dict per player. Pagination stops at the first empty page. If a
        request fails part-way through, the rows collected so far are
        returned, so the result may be incomplete.
    """
    all_rows: list[dict] = []
    offset = 0
    page_size = 100
    rate_limit_hits = 0
    while True:
        url = (
            f"https://www.sofascore.com/api/v1/unique-tournament/{tournament_id}"
            f"/season/{season_id}/statistics"
            f"?limit={page_size}&offset={offset}"
            f"&accumulation=total&fields={STAT_FIELDS}"
        )
        r = _get(url)
        if r is None:
            print(f" Failed to connect at offset {offset}")
            break
        if r.status_code == 403:
            # Bounded retry: an unconditional ``continue`` here would loop
            # forever if the server keeps answering 403.
            rate_limit_hits += 1
            if rate_limit_hits > max_rate_limit_retries:
                print(
                    f" Still rate limited at offset {offset} after "
                    f"{max_rate_limit_retries} retries, giving up"
                )
                break
            print(f" Rate limited at offset {offset}, waiting 30s...")
            time.sleep(30)
            continue
        if r.status_code != 200:
            print(f" Error at offset {offset}: {r.status_code}")
            break
        rate_limit_hits = 0  # the cap applies per page, not per season
        try:
            data = r.json()
        except ValueError:
            print(f" Invalid JSON at offset {offset}")
            break
        results = data.get("results", []) if isinstance(data, dict) else []
        if not results:
            break
        all_rows.extend(_row_from_entry(entry, league, season) for entry in results)
        offset += page_size
        time.sleep(1.5)  # Rate limit
    return all_rows
def main() -> None:
    """Scrape every configured league/season and write one CSV per season.

    Leagues whose ``seasons`` mapping is empty get their seasons discovered
    first. A season whose CSV already exists is skipped and its existing row
    count is added to the total; otherwise its stats are fetched and written
    to ``OUT/<league>_<season>.csv``. Nothing is written when no rows come
    back.
    """
    # Auto-discover seasons for leagues without hardcoded IDs
    for league_key, info in TOURNAMENTS.items():
        if info["seasons"]:
            continue
        print(f"Discovering seasons for {league_key} (tournament {info['tournament_id']})...")
        info["seasons"] = discover_seasons(info["tournament_id"])
        if info["seasons"]:
            for name, sid in info["seasons"].items():
                print(f" {name} (id={sid})")
        else:
            print(" No seasons found")
        time.sleep(1)

    total_players = 0
    for league_key, info in TOURNAMENTS.items():
        tid = info["tournament_id"]
        for season_name, season_id in info["seasons"].items():
            out_file = OUT / f"{league_key}_{season_name.replace('/', '-')}.csv"
            if out_file.exists():
                # Count data rows (lines minus header). Binary mode avoids
                # decoding errors, the ``with`` closes the handle, and max()
                # keeps an empty file from counting as -1 players.
                with open(out_file, "rb") as fh:
                    existing = max(sum(1 for _ in fh) - 1, 0)
                print(f"SKIP {league_key} {season_name} — already have {existing} players")
                total_players += existing
                continue
            print(f"Fetching {league_key} {season_name} (tid={tid}, sid={season_id})...")
            rows = fetch_player_stats(tid, season_id, league_key, season_name)
            if rows:
                # Explicit UTF-8: player names contain non-ASCII characters
                # that a platform default encoding may be unable to encode.
                with open(out_file, "w", newline="", encoding="utf-8") as f:
                    writer = csv.DictWriter(f, fieldnames=CSV_COLS)
                    writer.writeheader()
                    writer.writerows(rows)
                print(f" Saved {len(rows)} players to {out_file.name}")
                total_players += len(rows)
            else:
                print(" No data found")
            time.sleep(2)  # Between seasons
    print(f"\nTotal: {total_players} player-season records")
if __name__ == "__main__":
main()