#!/usr/bin/env python3 """Pre-aggregate raw data into small JSON files for the interactive dashboard. Usage: python scripts/build_dashboard_data.py Reads from data/ and writes JSON files to data/dashboard/. """ from __future__ import annotations import json import warnings from pathlib import Path import numpy as np import pandas as pd warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning) ROOT = Path(__file__).resolve().parent.parent DATA = ROOT / "data" OUT = ROOT / "output" OUT.mkdir(parents=True, exist_ok=True) COMPETITIONS = { "FA_Womens_Super_League_2018-2019": {"type": "league", "label": "FAWSL 2018-19"}, "FA_Womens_Super_League_2019-2020": {"type": "league", "label": "FAWSL 2019-20"}, "FA_Womens_Super_League_2020-2021": {"type": "league", "label": "FAWSL 2020-21"}, "NWSL_2018": {"type": "league", "label": "NWSL 2018"}, "UEFA_Womens_Euro_2022": {"type": "tournament", "label": "Euros 2022"}, "UEFA_Womens_Euro_2025": {"type": "tournament", "label": "Euros 2025"}, "Womens_World_Cup_2019": {"type": "tournament", "label": "WWC 2019"}, "Womens_World_Cup_2023": {"type": "tournament", "label": "WWC 2023"}, } EVENT_COLS = [ "type", "player", "player_id", "team", "match_id", "minute", "shot_outcome", "shot_statsbomb_xg", "pass_goal_assist", "pass_shot_assist", "pass_through_ball", "pass_cross", "pass_switch", "pass_outcome", "dribble_outcome", "interception_outcome", "duel_type", "duel_outcome", "position", ] def load_events(comp_dir: str) -> pd.DataFrame: path = DATA / "statsbomb" / comp_dir / "events.csv" df = pd.read_csv(path, usecols=lambda c: c in EVENT_COLS, low_memory=False) df["competition"] = comp_dir return df def load_matches(comp_dir: str) -> pd.DataFrame: path = DATA / "statsbomb" / comp_dir / "matches.csv" df = pd.read_csv(path) df["competition"] = comp_dir return df def load_lineups(comp_dir: str) -> pd.DataFrame: path = DATA / "statsbomb" / comp_dir / "lineups.csv" df = pd.read_csv(path) df["competition"] = comp_dir return df def 
percentile_rank(series: pd.Series) -> pd.Series: return series.rank(pct=True) * 100 # --------------------------------------------------------------------------- # StatsBomb Player Aggregates # --------------------------------------------------------------------------- def build_sb_players(all_events: pd.DataFrame, all_lineups: pd.DataFrame) -> dict: ev = all_events[all_events["player"].notna()].copy() # Goal threat components goals = ev[ev["shot_outcome"] == "Goal"].groupby("player").size().rename("goals") shots_on = ev[ev["shot_outcome"].isin(["Goal", "Saved", "Saved Off Target", "Saved to Post"])].groupby("player").size().rename("shots_on_target") xg = ev[ev["shot_statsbomb_xg"].notna()].groupby("player")["shot_statsbomb_xg"].sum().rename("xg") assists = ev[ev["pass_goal_assist"].notna()].groupby("player").size().rename("assists") key_passes = ev[ev["pass_shot_assist"].notna()].groupby("player").size().rename("key_passes") # Playmaker components through_balls = ev[ev["pass_through_ball"].notna()].groupby("player").size().rename("through_balls") crosses = ev[ev["pass_cross"].notna()].groupby("player").size().rename("crosses") switches = ev[ev["pass_switch"].notna()].groupby("player").size().rename("switches") dribbles_ok = ev[(ev["type"] == "Dribble") & (ev["dribble_outcome"] == "Complete")].groupby("player").size().rename("dribbles") # Defensive components interceptions = ev[ev["type"] == "Interception"].groupby("player").size().rename("interceptions") tackles_won = ev[(ev["duel_type"] == "Tackle") & (ev["duel_outcome"].isin(["Won", "Success In Play", "Success Out"]))].groupby("player").size().rename("tackles_won") blocks = ev[ev["type"] == "Block"].groupby("player").size().rename("blocks") clearances = ev[ev["type"] == "Clearance"].groupby("player").size().rename("clearances") pressures = ev[ev["type"] == "Pressure"].groupby("player").size().rename("pressures") recoveries = ev[ev["type"] == "Ball Recovery"].groupby("player").size().rename("recoveries") 
fouls_won = ev[ev["type"] == "Foul Won"].groupby("player").size().rename("fouls_won") fouls_committed = ev[ev["type"] == "Foul Committed"].groupby("player").size().rename("fouls_committed") # Get primary team and position per player player_team = ev.groupby("player")["team"].agg(lambda x: x.value_counts().index[0]).rename("team") player_comp = ev.groupby("player")["competition"].agg(lambda x: x.value_counts().index[0]) player_comp_label = player_comp.map(lambda c: COMPETITIONS.get(c, {}).get("label", c)).rename("competition") # Position from lineups pos_df = all_lineups[["player_name", "positions"]].copy() pos_df = pos_df[pos_df["positions"].notna()] def extract_primary_pos(pos_str): try: import ast positions = ast.literal_eval(pos_str) if positions and isinstance(positions, list): return positions[0].get("position", "Unknown") if isinstance(positions[0], dict) else str(positions[0]) except Exception: pass return "Unknown" pos_df["primary_position"] = pos_df["positions"].apply(extract_primary_pos) player_positions = pos_df.groupby("player_name")["primary_position"].agg( lambda x: x.value_counts().index[0] ).rename("position") def simplify_position(pos): pos = str(pos).lower() if "goalkeeper" in pos or pos == "gk": return "GK" elif "back" in pos or "defender" in pos or pos in ("cb", "lb", "rb", "lwb", "rwb"): return "DF" elif "midfield" in pos or pos in ("cm", "cdm", "cam", "lm", "rm", "dm", "am"): return "MF" elif "forward" in pos or "wing" in pos or "striker" in pos or pos in ("st", "cf", "lw", "rw", "ss"): return "FW" return "MF" player_pos_simple = player_positions.map(simplify_position).rename("position_group") # Combine all stats stats = pd.DataFrame({ "team": player_team, "competition": player_comp_label, }) for s in [goals, shots_on, xg, assists, key_passes, through_balls, crosses, switches, dribbles_ok, interceptions, tackles_won, blocks, clearances, pressures, recoveries, fouls_won, fouls_committed]: stats = stats.join(s, how="left") stats = 
stats.join(player_pos_simple, how="left") stats = stats.fillna(0) stats["position_group"] = stats["position_group"].replace(0, "MF") # Compute scores as percentile ranks stats["goal_threat"] = percentile_rank( stats[["goals", "shots_on_target", "xg", "assists", "key_passes"]].sum(axis=1) ) stats["playmaker"] = percentile_rank( stats[["assists", "key_passes", "through_balls", "crosses", "switches", "dribbles"]].sum(axis=1) ) stats["defensive"] = percentile_rank( stats[["interceptions", "tackles_won", "blocks", "clearances", "pressures", "recoveries"]].sum(axis=1) ) stats["composite"] = (stats["goal_threat"] + stats["playmaker"] + stats["defensive"]) / 3 stats = stats.reset_index().rename(columns={"index": "player"}) # Top 30 per metric result = {} for metric in ["goal_threat", "playmaker", "defensive", "composite"]: top = stats.nlargest(30, metric) result[metric] = top[["player", "team", "competition", "position_group", "goals", "assists", "xg", "key_passes", "interceptions", "tackles_won", "blocks", metric]].to_dict(orient="records") # League vs tournament split ev_with_type = ev.copy() ev_with_type["comp_type"] = ev_with_type["competition"].map( lambda c: COMPETITIONS.get(c, {}).get("type", "unknown") ) league_goals = ev_with_type[(ev_with_type["shot_outcome"] == "Goal") & (ev_with_type["comp_type"] == "league")].groupby("player").size().rename("league_goals") tourn_goals = ev_with_type[(ev_with_type["shot_outcome"] == "Goal") & (ev_with_type["comp_type"] == "tournament")].groupby("player").size().rename("tournament_goals") league_assists = ev_with_type[(ev_with_type["pass_goal_assist"].notna()) & (ev_with_type["comp_type"] == "league")].groupby("player").size().rename("league_assists") tourn_assists = ev_with_type[(ev_with_type["pass_goal_assist"].notna()) & (ev_with_type["comp_type"] == "tournament")].groupby("player").size().rename("tournament_assists") lvt = pd.DataFrame({"league_goals": league_goals, "tournament_goals": tourn_goals, "league_assists": 
league_assists, "tournament_assists": tourn_assists}).fillna(0) lvt["total"] = lvt.sum(axis=1) lvt = lvt.nlargest(25, "total").reset_index().rename(columns={"index": "player"}) result["league_vs_tournament"] = lvt.to_dict(orient="records") # Top 10 by position by_pos = {} for pos in ["FW", "MF", "DF", "GK"]: subset = stats[stats["position_group"] == pos].nlargest(10, "composite") by_pos[pos] = subset[["player", "team", "composite", "goal_threat", "playmaker", "defensive"]].to_dict(orient="records") result["by_position"] = by_pos return result # --------------------------------------------------------------------------- # StatsBomb Club/Country Aggregates # --------------------------------------------------------------------------- def compute_team_rankings(all_matches: pd.DataFrame, all_events: pd.DataFrame, comp_type: str) -> dict: comps = [c for c, info in COMPETITIONS.items() if info["type"] == comp_type] matches = all_matches[all_matches["competition"].isin(comps)].copy() events = all_events[all_events["competition"].isin(comps)] if matches.empty: return {"teams": []} matches = matches.sort_values("match_date") # xG per team per match xg_by_match = events[events["shot_statsbomb_xg"].notna()].groupby( ["match_id", "team"] )["shot_statsbomb_xg"].sum().reset_index() # Build team stats records = [] for _, m in matches.iterrows(): home, away = m["home_team"], m["away_team"] hs, as_ = m["home_score"], m["away_score"] mid = m["match_id"] comp_label = COMPETITIONS.get(m["competition"], {}).get("label", m["competition"]) for team, opp, gs, gc in [(home, away, hs, as_), (away, home, as_, hs)]: xg_team = xg_by_match[(xg_by_match["match_id"] == mid) & (xg_by_match["team"] == team)] xg_opp = xg_by_match[(xg_by_match["match_id"] == mid) & (xg_by_match["team"] == opp)] records.append({ "team": team, "match_id": mid, "match_date": m["match_date"], "competition": comp_label, "goals_scored": gs, "goals_conceded": gc, "points": 3 if gs > gc else (1 if gs == gc else 0), "xg_for": 
float(xg_team["shot_statsbomb_xg"].values[0]) if len(xg_team) else 0.0, "xg_against": float(xg_opp["shot_statsbomb_xg"].values[0]) if len(xg_opp) else 0.0, }) df = pd.DataFrame(records) # Aggregate across all competitions team_stats = df.groupby("team").agg( matches=("match_id", "count"), total_points=("points", "sum"), goals_scored=("goals_scored", "sum"), goals_conceded=("goals_conceded", "sum"), xg_for=("xg_for", "sum"), xg_against=("xg_against", "sum"), competition=("competition", lambda x: ", ".join(x.unique()[:3])), # List multiple comps ).reset_index() team_stats["ppg"] = (team_stats["total_points"] / team_stats["matches"]).round(2) team_stats["gd_per_game"] = ((team_stats["goals_scored"] - team_stats["goals_conceded"]) / team_stats["matches"]).round(2) team_stats["xg_dominance"] = ((team_stats["xg_for"] - team_stats["xg_against"]) / team_stats["matches"]).round(3) # Elo (across all matches) elo = {} for _, m in matches.iterrows(): home, away = m["home_team"], m["away_team"] hs, as_ = m["home_score"], m["away_score"] eh = elo.get(home, 1500) ea = elo.get(away, 1500) exp_h = 1 / (1 + 10 ** ((ea - eh) / 400)) actual_h = 1.0 if hs > as_ else (0.5 if hs == as_ else 0.0) K = 40 elo[home] = eh + K * (actual_h - exp_h) elo[away] = ea + K * ((1 - actual_h) - (1 - exp_h)) team_stats["elo"] = team_stats["team"].map(elo).round(0) # Composite for col in ["ppg", "elo", "xg_dominance"]: team_stats[f"{col}_pct"] = percentile_rank(team_stats[col]) team_stats["composite"] = ((team_stats["ppg_pct"] + team_stats["elo_pct"] + team_stats["xg_dominance_pct"]) / 3).round(1) team_stats = team_stats.sort_values("composite", ascending=False) cols = ["team", "competition", "matches", "ppg", "elo", "xg_dominance", "gd_per_game", "composite"] return {"teams": team_stats[cols].to_dict(orient="records")} # --------------------------------------------------------------------------- # StatsBomb Player Comparisons # --------------------------------------------------------------------------- 
def build_sb_player_comparisons(all_events: pd.DataFrame) -> dict:
    """Build paired player-score comparison tables across contexts.

    Compares per-player percentile scores between (1) historical tournaments
    and Euro 2025, (2) league vs tournament play, and (3) Euro 2025 group
    stage vs knockout rounds (only when the Euro 2025 matches CSV exists).
    """
    ev = all_events[all_events["player"].notna()].copy()
    ev["comp_type"] = ev["competition"].map(lambda c: COMPETITIONS.get(c, {}).get("type", "unknown"))
    ev["comp_label"] = ev["competition"].map(lambda c: COMPETITIONS.get(c, {}).get("label", c))

    def player_scores(subset: pd.DataFrame) -> pd.DataFrame:
        """Per-player percentile scores computed within *subset* only."""
        goals = subset[subset["shot_outcome"] == "Goal"].groupby("player").size().rename("goals")
        assists = subset[subset["pass_goal_assist"].notna()].groupby("player").size().rename("assists")
        key_passes = subset[subset["pass_shot_assist"].notna()].groupby("player").size().rename("key_passes")
        xg = subset[subset["shot_statsbomb_xg"].notna()].groupby("player")["shot_statsbomb_xg"].sum().rename("xg")
        through_balls = subset[subset["pass_through_ball"].notna()].groupby("player").size().rename("through_balls")
        crosses = subset[subset["pass_cross"].notna()].groupby("player").size().rename("crosses")
        interceptions = subset[subset["type"] == "Interception"].groupby("player").size().rename("interceptions")
        tackles = subset[(subset["duel_type"] == "Tackle")
                         & (subset["duel_outcome"].isin(["Won", "Success In Play", "Success Out"]))
                         ].groupby("player").size().rename("tackles_won")
        blocks = subset[subset["type"] == "Block"].groupby("player").size().rename("blocks")
        recoveries = subset[subset["type"] == "Ball Recovery"].groupby("player").size().rename("recoveries")
        stats = pd.DataFrame({
            "goals": goals, "assists": assists, "key_passes": key_passes, "xg": xg,
            "through_balls": through_balls, "crosses": crosses,
            "interceptions": interceptions, "tackles_won": tackles,
            "blocks": blocks, "recoveries": recoveries,
        }).fillna(0)
        if len(stats) == 0:
            return stats
        stats["goal_threat"] = percentile_rank(stats[["goals", "xg", "assists", "key_passes"]].sum(axis=1))
        stats["playmaker"] = percentile_rank(stats[["assists", "key_passes", "through_balls", "crosses"]].sum(axis=1))
        stats["defensive"] = percentile_rank(stats[["interceptions", "tackles_won", "blocks", "recoveries"]].sum(axis=1))
        stats["composite"] = (stats["goal_threat"] + stats["playmaker"] + stats["defensive"]) / 3
        return stats

    def compare(scores_a: pd.DataFrame, scores_b: pd.DataFrame,
                name_a: str, name_b: str, metrics: list) -> list:
        """Pair two score tables on their common players; for each metric,
        keep the top 15 ranked by the *name_b* side."""
        common = scores_a.index.intersection(scores_b.index)
        out = []
        for metric in metrics:
            merged = pd.DataFrame({
                name_a: scores_a.loc[common, metric] if metric in scores_a.columns else 0,
                name_b: scores_b.loc[common, metric] if metric in scores_b.columns else 0,
            }).dropna()
            top = merged.nlargest(15, name_b).reset_index().rename(columns={"index": "player"})
            out.append({"metric": metric, "players": top.to_dict(orient="records")})
        return out

    result = {}

    # 1. Historical tournaments vs Euros 2025
    hist_tourn = ev[(ev["comp_type"] == "tournament") & (ev["competition"] != "UEFA_Womens_Euro_2025")]
    euros25 = ev[ev["competition"] == "UEFA_Womens_Euro_2025"]
    result["historical_vs_euros2025"] = compare(
        player_scores(hist_tourn), player_scores(euros25),
        "historical", "euros_2025",
        ["goal_threat", "playmaker", "defensive", "composite"],
    )

    # 2. League vs Tournament
    result["league_vs_tournament"] = compare(
        player_scores(ev[ev["comp_type"] == "league"]),
        player_scores(ev[ev["comp_type"] == "tournament"]),
        "league", "tournament",
        ["goal_threat", "playmaker", "defensive", "composite"],
    )

    # 3. Euros 2025 group stage vs knockout (needs stage info from matches).
    e25_matches_path = DATA / "statsbomb" / "UEFA_Womens_Euro_2025" / "matches.csv"
    if e25_matches_path.exists():
        e25m = pd.read_csv(e25_matches_path)
        is_group = e25m["competition_stage"].str.contains("Group", case=False, na=False)
        group_match_ids = e25m[is_group]["match_id"].tolist()
        ko_match_ids = e25m[~is_group]["match_id"].tolist()
        result["euros2025_group_vs_knockout"] = compare(
            player_scores(euros25[euros25["match_id"].isin(group_match_ids)]),
            player_scores(euros25[euros25["match_id"].isin(ko_match_ids)]),
            "group_stage", "knockout",
            ["goal_threat", "playmaker", "composite"],
        )

    return result


# ---------------------------------------------------------------------------
# FIFA Rankings
# ---------------------------------------------------------------------------

def build_fifa_rankings() -> dict:
    """Aggregate the quarterly FIFA women's world ranking CSVs.

    Produces top-25 averages, confederation averages, biggest movers, an
    H1-vs-H2 comparison, and point/rank trajectories for the top 10.
    Returns {} when no ranking file is present.
    """
    # (suffix, label) pairs in chronological order — this order is load-bearing.
    quarters = [
        ("2025_03_06", "Mar 2025"),
        ("2025_06_12", "Jun 2025"),
        ("2025_08_07", "Aug 2025"),
        ("2025_12_11", "Dec 2025"),
    ]
    frames = {}
    for suffix, label in quarters:
        path = DATA / f"fifa_womens_world_ranking_{suffix}.csv"
        if path.exists():
            frames[label] = pd.read_csv(path)
    if not frames:
        return {}

    # Per-country point/rank trajectory across quarters.
    countries = {}
    for label, df in frames.items():
        for _, row in df.iterrows():
            c = row["Country"]
            if c not in countries:
                countries[c] = {
                    "country": c,
                    "code": row.get("Country_Code", ""),
                    "confederation": row.get("Confederation", ""),
                    "points": {},
                    "ranks": {},
                }
            if pd.notna(row["Total_Points"]):
                countries[c]["points"][label] = float(row["Total_Points"])
            if pd.notna(row["Rank"]):
                countries[c]["ranks"][label] = int(row["Rank"])
    all_countries = list(countries.values())

    # Average points across all quarters; top 25.
    all_df = pd.concat(frames.values(), ignore_index=True)
    avg_points = all_df.groupby(
        ["Country", "Country_Code", "Confederation"]
    )["Total_Points"].mean().reset_index()
    avg_points = avg_points.sort_values("Total_Points", ascending=False)
    top25 = avg_points.head(25).rename(columns={"Total_Points": "Avg_Points"}).to_dict(orient="records")

    # Confederation average points per quarter.
    conf_avg = {label: df.groupby("Confederation")["Total_Points"].mean().round(1).to_dict()
                for label, df in frames.items()}

    # BUG FIX: month labels must be ordered chronologically via the quarters
    # list; sorted() ordered them alphabetically (Aug, Dec, Jun, Mar), making
    # "earliest"/"latest" and the H1/H2 split wrong.
    ordered_labels = [label for _, label in quarters if label in frames]
    first_label = ordered_labels[0]
    latest_label = ordered_labels[-1]

    # Movers: change from earliest to latest available quarter.
    movers = []
    for c in all_countries:
        if first_label in c["ranks"] and latest_label in c["ranks"]:
            rank_change = c["ranks"][first_label] - c["ranks"][latest_label]
            point_change = c["points"].get(latest_label, 0) - c["points"].get(first_label, 0)
            movers.append({
                "country": c["country"],
                "code": c["code"],
                "confederation": c["confederation"],
                "rank_change": rank_change,
                "point_change": round(point_change, 1),
            })
    movers_df = pd.DataFrame(movers)
    top_climbers = movers_df.nlargest(15, "rank_change").to_dict(orient="records")
    top_fallers = movers_df.nsmallest(15, "rank_change").to_dict(orient="records")
    top_point_gainers = movers_df.nlargest(15, "point_change").to_dict(orient="records")

    # H1 vs H2: first half vs second half of the available quarters.
    mid = len(ordered_labels) // 2
    h1_labels = ordered_labels[:mid]
    h2_labels = ordered_labels[mid:]
    h1h2 = []
    for c in all_countries:
        pts = c["points"]
        rnk = c["ranks"]
        if all(l in pts for l in h1_labels + h2_labels) and all(l in rnk for l in h1_labels + h2_labels):
            h1_point_delta = sum(pts[l] for l in h1_labels[1:]) - sum(pts[l] for l in h1_labels[:-1])
            h2_point_delta = sum(pts[l] for l in h2_labels[1:]) - sum(pts[l] for l in h2_labels[:-1])
            h1_rank_delta = rnk[h1_labels[0]] - rnk[h1_labels[-1]]
            h2_rank_delta = rnk[h2_labels[0]] - rnk[h2_labels[-1]]
            h1h2.append({
                "country": c["country"],
                "code": c["code"],
                "confederation": c["confederation"],
                "h1_point_delta": round(h1_point_delta, 1),
                "h2_point_delta": round(h2_point_delta, 1),
                "h1_rank_delta": h1_rank_delta,
                "h2_rank_delta": h2_rank_delta,
            })
    h1h2_df = pd.DataFrame(h1h2)

    # Trajectories for the top 10 by average points.
    top10_countries = avg_points.head(10)["Country"].tolist()
    trajectories = [c for c in all_countries if c["country"] in top10_countries]

    return {
        "top25": top25,
        "confederation_avg": conf_avg,
        "top_climbers": top_climbers,
        "top_fallers": top_fallers,
        "top_point_gainers": top_point_gainers,
        "h1_vs_h2": h1h2_df.nlargest(20, "h2_point_delta").to_dict(orient="records") if len(h1h2_df) else [],
        "h1_vs_h2_risers": h1h2_df.assign(
            h2_improvement=h1h2_df["h2_point_delta"] - h1h2_df["h1_point_delta"]
        ).nlargest(15, "h2_improvement").to_dict(orient="records") if len(h1h2_df) else [],
        "trajectories": trajectories,
        "quarters": [q[1] for q in quarters],
    }


# ---------------------------------------------------------------------------
# WIFXScore (aggregated across years)
# ---------------------------------------------------------------------------

def aggregate_wifx_by_player(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse multi-season WIFX rows into one row per player.

    Keeps each player's best WIFXScore (so merging entries never penalises
    players — sources differ in feature richness) together with the component
    scores from that same row, sums events, and takes the modal team. Players
    with fewer than 50 total events or a missing score are dropped.

    Shared by build_wifx_scores and build_wifx_historical_scores.
    """
    player_agg = df.groupby("player").apply(
        lambda g: pd.Series({
            "WIFXScore": g["WIFXScore"].max(),
            "epm_raw": g.loc[g["WIFXScore"].idxmax(), "epm_raw"],
            "offensive_score": g.loc[g["WIFXScore"].idxmax(), "offensive_score"],
            "creative_score": g.loc[g["WIFXScore"].idxmax(), "creative_score"],
            "defensive_score": g.loc[g["WIFXScore"].idxmax(), "defensive_score"],
            "total_events": g["total_events"].sum(),
            "team": g["team"].value_counts().index[0] if len(g["team"].value_counts()) > 0 else "Unknown",
            "primary_comp": ", ".join(g["primary_comp"].unique()[:3]) if len(g["primary_comp"].unique()) > 0 else "Unknown",
        })
    ).reset_index()
    player_agg = player_agg[player_agg["total_events"] >= 50]
    return player_agg[player_agg["WIFXScore"].notna()].copy()


def build_wifx_scores() -> dict:
    """Aggregate WIFXScore data across years/competitions for the dashboard."""
    df = pd.read_csv(DATA / "wifx_scores.csv")
    player_agg = aggregate_wifx_by_player(df)

    cols = ["player", "team", "primary_comp", "WIFXScore", "epm_raw",
            "offensive_score", "creative_score", "defensive_score", "total_events"]
    top25 = player_agg.nlargest(25, "WIFXScore")[cols].to_dict(orient="records")
    bottom25 = player_agg.nsmallest(25, "WIFXScore")[cols].to_dict(orient="records")
    all_players = player_agg.sort_values("WIFXScore", ascending=False)[cols].to_dict(orient="records")

    # Score distribution histogram.
    hist_counts, hist_edges = np.histogram(player_agg["WIFXScore"], bins=30)
    distribution = {
        "counts": hist_counts.tolist(),
        "edges": [round(float(e), 2) for e in hist_edges.tolist()],
        "mean": round(float(player_agg["WIFXScore"].mean()), 2),
        "std": round(float(player_agg["WIFXScore"].std()), 2),
    }

    # Per-competition summary (computed on the raw, un-aggregated rows).
    by_comp = df.groupby("primary_comp")["WIFXScore"].agg(
        ["mean", "median", "std", "count", "min", "max"]
    ).round(2)
    by_comp_list = []
    for comp, row in by_comp.iterrows():
        scores = df[df["primary_comp"] == comp]["WIFXScore"].tolist()
        by_comp_list.append({
            "competition": comp,
            "mean": row["mean"],
            "median": row["median"],
            "std": row["std"],
            "count": int(row["count"]),
            "scores": [round(s, 2) for s in scores],
        })

    return {
        "top25": top25,
        "bottom25": bottom25,
        "all_players": all_players,
        "distribution": distribution,
        "by_competition": by_comp_list,
    }


# ---------------------------------------------------------------------------
# WIFXScore Historical (retired/legend players)
# ---------------------------------------------------------------------------

def build_wifx_historical_scores() -> dict:
    """Aggregate WIFX scores for retired/legend players, tagged by category."""
    df = pd.read_csv(DATA / "wifx_historical_scores.csv")
    retired_df = pd.read_csv(DATA / "retired_players.csv")
    category_map = dict(zip(retired_df["player"], retired_df["category"]))

    player_agg = aggregate_wifx_by_player(df)
    player_agg["category"] = player_agg["player"].map(category_map).fillna("retired")

    cols = ["player", "team", "primary_comp", "WIFXScore", "epm_raw", "offensive_score",
            "creative_score", "defensive_score", "total_events", "category"]
    all_players = player_agg.sort_values("WIFXScore", ascending=False)[cols].to_dict(orient="records")
    top25 = player_agg.nlargest(25, "WIFXScore")[cols].to_dict(orient="records")
    return {
        "top25": top25,
        "all_players": all_players,
    }
# ---------------------------------------------------------------------------
# Historical Match Results
# ---------------------------------------------------------------------------

def build_match_results() -> dict:
    """Summarise the historical international results dataset.

    Builds all-time team rankings (points-per-game, Elo, goal difference,
    composite percentile) for teams with at least 10 matches, plus the
    top 30 goalscorers.
    """
    results = pd.read_csv(DATA / "versions" / "36" / "results.csv")
    goalscorers = pd.read_csv(DATA / "versions" / "36" / "goalscorers.csv")

    # One record per team per match, seen from both perspectives.
    per_team_rows = []
    for _, match in results.iterrows():
        h, a = match["home_team"], match["away_team"]
        h_goals, a_goals = match["home_score"], match["away_score"]
        for side, scored, conceded in ((h, h_goals, a_goals), (a, a_goals, h_goals)):
            per_team_rows.append({
                "team": side,
                "date": match["date"],
                "goals_scored": scored,
                "goals_conceded": conceded,
                "points": 3 if scored > conceded else (1 if scored == conceded else 0),
            })
    per_team = pd.DataFrame(per_team_rows)

    team_stats = per_team.groupby("team").agg(
        matches=("points", "count"),
        total_points=("points", "sum"),
        goals_scored=("goals_scored", "sum"),
        goals_conceded=("goals_conceded", "sum"),
    ).reset_index()
    team_stats = team_stats[team_stats["matches"] >= 10]  # minimum sample size
    team_stats["ppg"] = (team_stats["total_points"] / team_stats["matches"]).round(2)
    team_stats["gd_per_game"] = (
        (team_stats["goals_scored"] - team_stats["goals_conceded"]) / team_stats["matches"]
    ).round(2)

    # Elo ratings, matches processed in date order (base 1500, K=40).
    ratings = {}
    for _, match in results.sort_values("date").iterrows():
        h, a = match["home_team"], match["away_team"]
        h_goals, a_goals = match["home_score"], match["away_score"]
        r_home = ratings.get(h, 1500)
        r_away = ratings.get(a, 1500)
        expected_home = 1 / (1 + 10 ** ((r_away - r_home) / 400))
        score_home = 1.0 if h_goals > a_goals else (0.5 if h_goals == a_goals else 0.0)
        K = 40
        ratings[h] = r_home + K * (score_home - expected_home)
        ratings[a] = r_away + K * ((1 - score_home) - (1 - expected_home))
    team_stats["elo"] = team_stats["team"].map(ratings).round(0)

    # Composite score = mean percentile across the three metrics.
    for col in ["ppg", "elo", "gd_per_game"]:
        team_stats[f"{col}_pct"] = percentile_rank(team_stats[col])
    team_stats["composite"] = (
        (team_stats["ppg_pct"] + team_stats["elo_pct"] + team_stats["gd_per_game_pct"]) / 3
    ).round(1)
    team_stats = team_stats.sort_values("composite", ascending=False)
    top_teams = team_stats.head(30)[
        ["team", "matches", "ppg", "elo", "gd_per_game", "composite"]
    ].to_dict(orient="records")

    # Top scorers across the whole dataset.
    scorer_counts = goalscorers.groupby("scorer").agg(
        goals=("scorer", "count"),
        teams=("team", lambda x: ", ".join(x.unique())),
        penalties=("penalty", "sum"),
    ).reset_index().sort_values("goals", ascending=False)
    top_scorers = scorer_counts.head(30).to_dict(orient="records")

    return {
        "top_teams": top_teams,
        "top_scorers": top_scorers,
    }


# ---------------------------------------------------------------------------
# WIFX National Team Scores (aggregated across all years)
# ---------------------------------------------------------------------------

def build_wifx_national_team_scores():
    """Aggregate national-team WIFX ratings and write them to JSON.

    Adds a per-team championship bonus to the mean net rating to form the
    'wifx_global_ranking' column, then writes the full sorted table via
    write_json. No return value.
    """
    df = pd.read_csv(DATA / "wifx_national_team_scores.csv")

    # Major-tournament titles credited per team (both name variants mapped).
    # NOTE(review): some values disagree with the win lists in the original
    # inline comments (e.g. Germany listed four Euro titles but credited 2)
    # — confirm the intended weighting before changing any number.
    CHAMPIONSHIP_WINS = {
        "United States Women's": 4,  # WWC: 1991, 1999, 2015, 2019
        "United States": 4,
        "Germany Women's": 2,
        "Germany": 2,
        "Norway Women's": 1,
        "Norway": 1,
        "Japan Women's": 1,  # WWC: 2011
        "Japan": 1,
        "Spain Women's": 2,  # Euro: 2022, WWC: 2023
        "Spain": 2,
        "England Women's": 1,  # Euro: 2022
        "England": 1,
        "Netherlands Women's": 1,  # Euro: 2017
        "Netherlands": 1,
        "France Women's": 0,
        "France": 0,
        "Sweden Women's": 0,
        "Sweden": 0,
        "Canada Women's": 1,  # Olympics
        "Canada": 1,
        "Brazil Women's": 0,
        "Brazil": 0,
        "Australia Women's": 0,
        "Australia": 0,
    }
    df["championship_wins"] = df["team"].map(CHAMPIONSHIP_WINS).fillna(0)

    # Aggregate per team: mean ratings, summed totals, max championships.
    agg_cols = {
        "offensive_rating": "mean",
        "defensive_rating": "mean",
        "net_rating": "mean",
        "composite_rating": "mean",
        "matches": "sum",
        "goals_scored": "sum",
        "championship_wins": "max",
    }
    if "goals_conceded" in df.columns:
        agg_cols["goals_conceded"] = "sum"
    agg = df.groupby("team").agg(agg_cols).reset_index()

    # Global ranking = mean net rating plus one point per championship.
    agg["wifx_global_ranking"] = agg["net_rating"] + agg["championship_wins"]
    agg = agg.sort_values("wifx_global_ranking", ascending=False)

    result = {
        "all_teams": agg.to_dict(orient="records"),
    }
    write_json("wifx_national_team_scores.json", result)
rating by championship wins (add number of championships) agg["wifx_global_ranking"] = agg["net_rating"] + agg["championship_wins"] # Sort by WIFX Global Ranking agg = agg.sort_values("wifx_global_ranking", ascending=False) # Rename net_rating to wifx_global_ranking for output result = { "all_teams": agg.to_dict(orient="records"), } write_json("wifx_national_team_scores.json", result) # --------------------------------------------------------------------------- # WIFX Club Team Scores (aggregated across all years) # --------------------------------------------------------------------------- def build_wifx_club_team_scores(): # First, load existing StatsBomb data path = DATA / "wifx_club_team_scores.csv" df = pd.read_csv(path) # Proper weighted average aggregation for StatsBomb agg = {} for _, row in df.iterrows(): team = row['team'] matches = row['matches'] if team not in agg: agg[team] = { 'team': team, 'matches': 0, 'goals_scored': 0, 'offensive_rating_sum': 0, 'defensive_rating_sum': 0, 'net_rating_sum': 0, 'composite_rating_sum': 0, 'comps': set() } agg[team]['matches'] += matches agg[team]['goals_scored'] += int(row.get('goals_scored', 0) or 0) if 'goals_conceded' in row and pd.notna(row.get('goals_conceded')): if 'goals_conceded' not in agg[team]: agg[team]['goals_conceded'] = 0 agg[team]['goals_conceded'] += int(row['goals_conceded']) agg[team]['offensive_rating_sum'] += (row['offensive_rating'] or 0) * matches agg[team]['defensive_rating_sum'] += (row['defensive_rating'] or 0) * matches agg[team]['net_rating_sum'] += (row['net_rating'] or 0) * matches agg[team]['composite_rating_sum'] += (row['composite_rating'] or 0) * matches if pd.notna(row.get('comp_label')): agg[team]['comps'].add(row['comp_label']) # Compute StatsBomb averages sb_result = [] for team, data in agg.items(): comps_str = ", ".join(sorted(data['comps'])) if data['comps'] else "FAWSL" result = { 'team': team, 'offensive_rating': round(data['offensive_rating_sum'] / data['matches'], 1), 
'defensive_rating': round(data['defensive_rating_sum'] / data['matches'], 1), 'net_rating': round(data['net_rating_sum'] / data['matches'], 1), 'composite_rating': round(data['composite_rating_sum'] / data['matches'], 1), 'matches': data['matches'], 'goals_scored': data['goals_scored'], 'comp_label': comps_str, 'source': 'statsbomb' } if 'goals_conceded' in data: result['goals_conceded'] = data['goals_conceded'] sb_result.append(result) # Normalize StatsBomb to 0-100 scale (was 0-30) sb_off_min = min(t['offensive_rating'] for t in sb_result) sb_off_max = max(t['offensive_rating'] for t in sb_result) sb_def_min = min(t['defensive_rating'] for t in sb_result) sb_def_max = max(t['defensive_rating'] for t in sb_result) if sb_off_max > sb_off_min: for t in sb_result: t['offensive_rating'] = round((t['offensive_rating'] - sb_off_min) / (sb_off_max - sb_off_min) * 100, 1) t['defensive_rating'] = round((t['defensive_rating'] - sb_def_min) / (sb_def_max - sb_def_min) * 100, 1) t['net_rating'] = round(t['offensive_rating'] - t['defensive_rating'], 1) t['composite_rating'] = round((t['offensive_rating'] + t['defensive_rating']) / 2, 1) TEAM_MAP = { 'KPqjw8PQ6v': 'Portland Thorns', 'aDQ0lzvQEv': 'OL Reign', '4JMAk47qKg': 'Chicago Red Stars', 'XVqKeVKM01': 'Washington Spirit', 'raMyrr25d2': 'Houston Dash', 'zeQZeazqKw': 'Orlando Pride', '7vQ7BBzqD1': 'FC Kansas City', '4wM4rZdqjB': 'North Carolina Courage', 'Pk5LeeNqOW': 'Kansas City Current', '4wM4Ezg5jB': 'Sky Blue FC', '7VqG1lYMvW': 'NJ/NY Gotham', 'eV5DR6YQKn': 'Angel City', 'kRQa8JOqKZ': 'San Diego Wave', 'eV5D2w9QKn': 'Bay FC', '315VnJ759x': 'Racing Louisville', 'xW5pwDBMg1': 'Boston Breakers', 'kRQaWa15KZ': 'Western New York Flash', } ga_path = DATA / "asa_nwsl" / "goals_added.csv" if ga_path.exists(): ga = pd.read_csv(ga_path) team_year = ga.groupby(['team_id_ga', 'season']).agg({ 'minutes_played_ga': 'sum', 'ga_shooting_raw': 'sum', 'ga_passing_raw': 'sum', 'ga_dribbling_raw': 'sum', 'ga_interrupting_raw': 'sum', 
'ga_receiving_raw': 'sum', 'player_id': 'count', }).reset_index() team_year.columns = ['team_id', 'season', 'minutes', 'shooting', 'passing', 'dribbling', 'interrupting', 'receiving', 'players'] team_year['team'] = team_year['team_id'].map(TEAM_MAP).fillna('Unknown') team_year = team_year[(team_year['team'] != 'Unknown') & (team_year['minutes'] > 5000)] # Percentile ranking within each season team_year['offensive_rating'] = team_year.groupby('season')['shooting'].transform(lambda x: (x.rank(pct=True) * 100).round(1)) team_year['defensive_rating'] = team_year.groupby('season')['interrupting'].transform(lambda x: (x.rank(pct=True) * 100).round(1)) team_year['net_rating'] = (team_year['offensive_rating'] - team_year['defensive_rating']).round(1) team_year['composite_rating'] = ((team_year['offensive_rating'] + team_year['defensive_rating']) / 2).round(1) # Convert minutes to matches (approx 90 min = 1 match) team_year['matches'] = (team_year['minutes'] / 90).astype(int) team_year['comp_label'] = 'NWSL ' + team_year['season'].astype(str) # Aggregate across all years asa_agg = {} for _, row in team_year.iterrows(): team = row['team'] matches = row['matches'] if team not in asa_agg: asa_agg[team] = { 'team': team, 'matches': 0, 'offensive_rating_sum': 0, 'defensive_rating_sum': 0, 'net_rating_sum': 0, 'composite_rating_sum': 0, } asa_agg[team]['matches'] += matches asa_agg[team]['offensive_rating_sum'] += row['offensive_rating'] * matches asa_agg[team]['defensive_rating_sum'] += row['defensive_rating'] * matches asa_agg[team]['net_rating_sum'] += row['net_rating'] * matches asa_agg[team]['composite_rating_sum'] += row['composite_rating'] * matches asa_result = [] for team, data in asa_agg.items(): asa_result.append({ 'team': team, 'offensive_rating': round(data['offensive_rating_sum'] / data['matches'], 1), 'defensive_rating': round(data['defensive_rating_sum'] / data['matches'], 1), 'net_rating': round(data['net_rating_sum'] / data['matches'], 1), 'composite_rating': 
round(data['composite_rating_sum'] / data['matches'], 1), 'matches': data['matches'], 'goals_scored': 0, # Not available in ASA 'goals_conceded': 0, 'comp_label': 'NWSL 2016-2025', 'source': 'asa' }) else: asa_result = [] # Combine both (deduplicate by team name - prefer ASA if available as it has more data) combined = {} # Championship wins mapping for clubs (NWSL weighted slightly higher) CLUB_CHAMPIONSHIPS = { # NWSL (weighted 1.5x) "Portland Thorns": 3, # 2017, 2022, 2024 "North Carolina Courage": 3, # 2018, 2019, 2023 "Kansas City Current": 1, # 2024 (as Current) "FC Kansas City": 2, # 2014, 2015 "Western New York Flash": 1, # 2016 "OL Reign": 1, # 2020 "Seattle Reign": 1, # 2020 "Chicago Red Stars": 0, "Washington Spirit": 1, # 2021 "Houston Dash": 0, "Angel City": 0, "NJ/NY Gotham": 0, "Boston Breakers": 0, "Sky Blue FC": 0, # FAWSL "Chelsea": 4, # 2015-16, 2017-18, 2019-20, 2020-21 "Manchester City Women": 2, # 2016-17, 2020-21 "Arsenal Women": 1, # 2022-23 "Liverpool FFC": 1, # 2013-14 "Everton Ladies": 0, "Bristol City WFC": 0, "Brighton & Hove Albion Women": 0, "Reading FC Women": 0, "Tottenham Hotspur Women": 0, "West Ham United LFC": 0, "Aston Villa": 0, "Yeovil Town LFC": 0, # UWCL "Lyon": 8, "OL Lyonnes": 8, # 2016-2020 (5), 2021-22, 2022-23, 2023-24 "Barcelona": 3, "FĂștbol Club Barcelona": 3, # 2020-21, 2021-22, 2022-23 "Wolfsburg": 2, "VfL Wolfsburg": 2, # 2013-14, 2015-16 "Paris Saint-Germain": 0, "Olympique Lyonnais": 8, # Other leagues "Bay FC": 0, "Racing Louisville": 0, "San Diego Wave": 0, "FC Barcelona": 3, } # First add StatsBomb teams for t in sb_result: combined[t['team']] = t # Then add ASA teams (will overwrite StatsBomb if exists) for t in asa_result: if t['team'] in combined: # Merge - keep statsbomb goals data, use ASA ratings weighted by matches existing = combined[t['team']] total_matches = existing['matches'] + t['matches'] combined[t['team']] = { 'team': t['team'], 'offensive_rating': round((existing['offensive_rating'] * 
existing['matches'] + t['offensive_rating'] * t['matches']) / total_matches, 1), 'defensive_rating': round((existing['defensive_rating'] * existing['matches'] + t['defensive_rating'] * t['matches']) / total_matches, 1), 'net_rating': round((existing['net_rating'] * existing['matches'] + t['net_rating'] * t['matches']) / total_matches, 1), 'composite_rating': round((existing['composite_rating'] * existing['matches'] + t['composite_rating'] * t['matches']) / total_matches, 1), 'matches': total_matches, 'goals_scored': existing.get('goals_scored', 0), 'goals_conceded': existing.get('goals_conceded', 0), 'comp_label': 'NWSL + FAWSL', } else: combined[t['team']] = t # Add championship wins and WIFX Global Club Ranking for team, data in combined.items(): wins = CLUB_CHAMPIONSHIPS.get(team, 0) # NWSL championships weighted 1.5x nwsl_teams = ["Portland Thorns", "North Carolina Courage", "Kansas City Current", "FC Kansas City", "Western New York Flash", "OL Reign", "Seattle Reign", "Chicago Red Stars", "Washington Spirit", "Houston Dash", "Angel City", "NJ/NY Gotham", "Boston Breakers", "Sky Blue FC", "Bay FC", "Racing Louisville", "San Diego Wave"] if team in nwsl_teams: data['championship_wins'] = wins data['wifx_global_club_ranking'] = data['net_rating'] + (wins * 1.5) else: data['championship_wins'] = wins data['wifx_global_club_ranking'] = data['net_rating'] + wins all_teams = list(combined.values()) all_teams.sort(key=lambda x: x.get('wifx_global_club_ranking', x.get('composite_rating', 0)), reverse=True) write_json("wifx_club_team_scores.json", {"all_teams": all_teams}) # --------------------------------------------------------------------------- # WIFX Confederation Scores (aggregated across years) # --------------------------------------------------------------------------- def build_wifx_confederation_scores(): path = DATA / "wifx_club_confederation_scores.csv" df = pd.read_csv(path) # Aggregate by team agg = df.groupby("team").agg({ "wifx_club_score": "mean", 
"country": "first", "confederation": "first", "championships_won": "sum", "finals_reached": "sum", }).reset_index() agg = agg.sort_values("wifx_club_score", ascending=False) agg = agg.assign(rank=range(1, len(agg) + 1)) result = { "club_confederation_scores": agg.to_dict(orient="records"), } write_json("wifx_club_confederation_scores.json", result) # --------------------------------------------------------------------------- # Main # --------------------------------------------------------------------------- def main(): print("Loading StatsBomb events (this may take a minute)...") all_events = pd.concat([load_events(c) for c in COMPETITIONS], ignore_index=True) print(f" Loaded {len(all_events):,} events") all_matches = pd.concat([load_matches(c) for c in COMPETITIONS], ignore_index=True) print(f" Loaded {len(all_matches):,} matches") all_lineups = pd.concat([load_lineups(c) for c in COMPETITIONS], ignore_index=True) print(f" Loaded {len(all_lineups):,} lineup entries") # Build WIFX dashboards only print("Building WIFX scores...") wifx = build_wifx_scores() write_json("wifx_scores.json", wifx) print("Building WIFX historical scores...") wifx_hist = build_wifx_historical_scores() write_json("wifx_historical_scores.json", wifx_hist) print("Building aggregated WIFX national team scores...") build_wifx_national_team_scores() print("Building aggregated WIFX club team scores...") build_wifx_club_team_scores() print("Building aggregated WIFX confederation scores...") build_wifx_confederation_scores() print("Done! 
All JSON files written to data/dashboard/") def write_json(filename: str, data: dict): import math path = OUT / filename def clean_nan(obj): if isinstance(obj, dict): return {k: clean_nan(v) for k, v in obj.items()} elif isinstance(obj, list): return [clean_nan(v) for v in obj] elif isinstance(obj, float) and (math.isnan(obj) or math.isinf(obj)): return None elif obj == "NaN": return None return obj data = clean_nan(data) with path.open("w", encoding="utf-8") as f: json.dump(data, f, ensure_ascii=False, default=str) size = path.stat().st_size print(f" Wrote {path} ({size / 1024:.1f} KB)") if __name__ == "__main__": main()