Spaces:

AMadabhushi
/

WIFX

Running

File size: 48,758 Bytes

#!/usr/bin/env python3
"""Pre-aggregate raw data into small JSON files for the interactive dashboard.

Usage:
    python scripts/build_dashboard_data.py

Reads from data/ and writes JSON files to data/dashboard/.
"""
from __future__ import annotations

import json
import warnings
from pathlib import Path

import numpy as np
import pandas as pd

# Suppress pandas' DtypeWarning for every CSV read performed by this script.
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

# ROOT is two directory levels above this file's resolved location.
ROOT = Path(__file__).resolve().parent.parent
# Directory the loaders below read their input CSVs from.
DATA = ROOT / "data"
# Created at import time if it does not exist.
# NOTE(review): the module docstring says JSON is written to data/dashboard/,
# but this constant points at ROOT/output -- confirm which one is intended.
OUT = ROOT / "output"
OUT.mkdir(parents=True, exist_ok=True)

# Maps a competition directory name (as used under data/statsbomb/) to its
# kind ("league" or "tournament") and the short label used in the output.
COMPETITIONS = {
    "FA_Womens_Super_League_2018-2019": {"type": "league", "label": "FAWSL 2018-19"},
    "FA_Womens_Super_League_2019-2020": {"type": "league", "label": "FAWSL 2019-20"},
    "FA_Womens_Super_League_2020-2021": {"type": "league", "label": "FAWSL 2020-21"},
    "NWSL_2018": {"type": "league", "label": "NWSL 2018"},
    "UEFA_Womens_Euro_2022": {"type": "tournament", "label": "Euros 2022"},
    "UEFA_Womens_Euro_2025": {"type": "tournament", "label": "Euros 2025"},
    "Womens_World_Cup_2019": {"type": "tournament", "label": "WWC 2019"},
    "Womens_World_Cup_2023": {"type": "tournament", "label": "WWC 2023"},
}

# Columns kept when loading events.csv; load_events() ignores every other
# column and tolerates files in which some of these are absent.
EVENT_COLS = [
    "type", "player", "player_id", "team", "match_id", "minute",
    "shot_outcome", "shot_statsbomb_xg",
    "pass_goal_assist", "pass_shot_assist", "pass_through_ball",
    "pass_cross", "pass_switch", "pass_outcome",
    "dribble_outcome",
    "interception_outcome",
    "duel_type", "duel_outcome",
    "position",
]


def load_events(comp_dir: str) -> pd.DataFrame:
    """Read one competition's ``events.csv``, restricted to ``EVENT_COLS``.

    Args:
        comp_dir: Competition directory name under ``DATA / "statsbomb"``.

    Returns:
        The events frame with a ``competition`` column set to ``comp_dir``.
    """
    events_csv = DATA / "statsbomb" / comp_dir / "events.csv"

    def is_wanted(column_name):
        # A callable filter lets files that lack some EVENT_COLS still load.
        return column_name in EVENT_COLS

    frame = pd.read_csv(events_csv, usecols=is_wanted, low_memory=False)
    frame["competition"] = comp_dir
    return frame


def load_matches(comp_dir: str) -> pd.DataFrame:
    """Read one competition's ``matches.csv`` and tag it with its directory name.

    Args:
        comp_dir: Competition directory name under ``DATA / "statsbomb"``.

    Returns:
        The matches frame with a ``competition`` column set to ``comp_dir``.
    """
    matches_csv = DATA.joinpath("statsbomb", comp_dir, "matches.csv")
    tagged = pd.read_csv(matches_csv)
    tagged["competition"] = comp_dir
    return tagged


def load_lineups(comp_dir: str) -> pd.DataFrame:
    """Read one competition's ``lineups.csv`` and tag it with its directory name.

    Args:
        comp_dir: Competition directory name under ``DATA / "statsbomb"``.

    Returns:
        The lineups frame with a ``competition`` column set to ``comp_dir``.
    """
    competition_dir = DATA / "statsbomb" / comp_dir
    lineups = pd.read_csv(competition_dir / "lineups.csv")
    lineups["competition"] = comp_dir
    return lineups


def percentile_rank(series: pd.Series) -> pd.Series:
    """Return each value's percentile rank within ``series`` on a 0-100 scale."""
    fractional_rank = series.rank(pct=True)
    return fractional_rank * 100


# ---------------------------------------------------------------------------
# StatsBomb Player Aggregates
# ---------------------------------------------------------------------------
def build_sb_players(all_events: pd.DataFrame, all_lineups: pd.DataFrame) -> dict:
    """Aggregate per-player event counts into percentile scores and leaderboards.

    Args:
        all_events: Event rows; rows without a ``player`` are ignored.
        all_lineups: Lineup rows providing ``player_name`` and ``positions``.

    Returns:
        A dict with:
        - ``goal_threat`` / ``playmaker`` / ``defensive`` / ``composite``:
          the 30 highest-scoring players for that metric, as records;
        - ``league_vs_tournament``: the 25 players with the most goals plus
          assists, split by competition type;
        - ``by_position``: the 10 best composite scores per position group.
    """
    import ast

    ev = all_events[all_events["player"].notna()].copy()

    def tally(mask, label):
        # Number of selected event rows per player, named ``label``.
        return ev[mask].groupby("player").size().rename(label)

    def of_type(event_type):
        return ev["type"] == event_type

    def most_frequent(values):
        return values.value_counts().index[0]

    is_goal = ev["shot_outcome"] == "Goal"
    is_assist = ev["pass_goal_assist"].notna()
    on_target_outcomes = ["Goal", "Saved", "Saved Off Target", "Saved to Post"]
    tackle_win_outcomes = ["Won", "Success In Play", "Success Out"]

    # Kept in the order the columns are joined onto the stats table.
    per_player_stats = [
        # Goal threat components
        tally(is_goal, "goals"),
        tally(ev["shot_outcome"].isin(on_target_outcomes), "shots_on_target"),
        ev[ev["shot_statsbomb_xg"].notna()].groupby("player")["shot_statsbomb_xg"].sum().rename("xg"),
        tally(is_assist, "assists"),
        tally(ev["pass_shot_assist"].notna(), "key_passes"),
        # Playmaker components
        tally(ev["pass_through_ball"].notna(), "through_balls"),
        tally(ev["pass_cross"].notna(), "crosses"),
        tally(ev["pass_switch"].notna(), "switches"),
        tally(of_type("Dribble") & (ev["dribble_outcome"] == "Complete"), "dribbles"),
        # Defensive components
        tally(of_type("Interception"), "interceptions"),
        tally((ev["duel_type"] == "Tackle") & (ev["duel_outcome"].isin(tackle_win_outcomes)), "tackles_won"),
        tally(of_type("Block"), "blocks"),
        tally(of_type("Clearance"), "clearances"),
        tally(of_type("Pressure"), "pressures"),
        tally(of_type("Ball Recovery"), "recoveries"),
        tally(of_type("Foul Won"), "fouls_won"),
        tally(of_type("Foul Committed"), "fouls_committed"),
    ]

    # Each player's most frequent team and competition (shown by its label).
    events_by_player = ev.groupby("player")
    primary_team = events_by_player["team"].agg(most_frequent).rename("team")
    primary_comp = events_by_player["competition"].agg(most_frequent)
    primary_comp_label = primary_comp.map(
        lambda c: COMPETITIONS.get(c, {}).get("label", c)
    ).rename("competition")

    def extract_primary_pos(pos_str):
        # ``positions`` is stored as the repr of a list; anything that cannot
        # be parsed falls through to "Unknown".
        try:
            parsed = ast.literal_eval(pos_str)
            if parsed and isinstance(parsed, list):
                first = parsed[0]
                if isinstance(first, dict):
                    return first.get("position", "Unknown")
                return str(first)
        except Exception:
            pass
        return "Unknown"

    def simplify_position(pos):
        name = str(pos).lower()
        if "goalkeeper" in name or name == "gk":
            return "GK"
        if "back" in name or "defender" in name or name in ("cb", "lb", "rb", "lwb", "rwb"):
            return "DF"
        if "midfield" in name or name in ("cm", "cdm", "cam", "lm", "rm", "dm", "am"):
            return "MF"
        if "forward" in name or "wing" in name or "striker" in name or name in ("st", "cf", "lw", "rw", "ss"):
            return "FW"
        return "MF"

    # Position from lineups: most frequent primary position per player name.
    lineup_pos = all_lineups[["player_name", "positions"]].copy()
    lineup_pos = lineup_pos[lineup_pos["positions"].notna()]
    lineup_pos["primary_position"] = lineup_pos["positions"].apply(extract_primary_pos)
    primary_position = lineup_pos.groupby("player_name")["primary_position"].agg(
        most_frequent
    ).rename("position")
    position_group = primary_position.map(simplify_position).rename("position_group")

    # Combine everything into one row per player.
    stats = pd.DataFrame({"team": primary_team, "competition": primary_comp_label})
    for column in per_player_stats:
        stats = stats.join(column, how="left")
    stats = stats.join(position_group, how="left")
    stats = stats.fillna(0)
    # Players without lineup data got 0 from fillna; treat them as midfielders.
    stats["position_group"] = stats["position_group"].replace(0, "MF")

    # Scores are percentile ranks of the summed component counts.
    score_components = {
        "goal_threat": ["goals", "shots_on_target", "xg", "assists", "key_passes"],
        "playmaker": ["assists", "key_passes", "through_balls", "crosses", "switches", "dribbles"],
        "defensive": ["interceptions", "tackles_won", "blocks", "clearances", "pressures", "recoveries"],
    }
    for score_name, components in score_components.items():
        stats[score_name] = percentile_rank(stats[components].sum(axis=1))
    stats["composite"] = (stats["goal_threat"] + stats["playmaker"] + stats["defensive"]) / 3

    stats = stats.reset_index().rename(columns={"index": "player"})

    # Top 30 per metric
    leaderboard_cols = ["player", "team", "competition", "position_group",
                        "goals", "assists", "xg", "key_passes",
                        "interceptions", "tackles_won", "blocks"]
    result = {
        metric: stats.nlargest(30, metric)[leaderboard_cols + [metric]].to_dict(orient="records")
        for metric in ["goal_threat", "playmaker", "defensive", "composite"]
    }

    # League vs tournament split
    comp_kind = ev["competition"].map(
        lambda c: COMPETITIONS.get(c, {}).get("type", "unknown")
    )
    in_league = comp_kind == "league"
    in_tournament = comp_kind == "tournament"
    lvt = pd.DataFrame({
        "league_goals": tally(is_goal & in_league, "league_goals"),
        "tournament_goals": tally(is_goal & in_tournament, "tournament_goals"),
        "league_assists": tally(is_assist & in_league, "league_assists"),
        "tournament_assists": tally(is_assist & in_tournament, "tournament_assists"),
    }).fillna(0)
    lvt["total"] = lvt.sum(axis=1)
    lvt = lvt.nlargest(25, "total").reset_index().rename(columns={"index": "player"})
    result["league_vs_tournament"] = lvt.to_dict(orient="records")

    # Top 10 by position
    position_cols = ["player", "team", "composite", "goal_threat", "playmaker", "defensive"]
    result["by_position"] = {
        pos: stats[stats["position_group"] == pos]
        .nlargest(10, "composite")[position_cols]
        .to_dict(orient="records")
        for pos in ["FW", "MF", "DF", "GK"]
    }

    return result


# ---------------------------------------------------------------------------
# StatsBomb Club/Country Aggregates
# ---------------------------------------------------------------------------
def compute_team_rankings(all_matches: pd.DataFrame, all_events: pd.DataFrame, comp_type: str) -> dict:
    """Rank teams across every competition of one type.

    Args:
        all_matches: Match rows with ``competition``, ``match_id``,
            ``match_date``, ``home_team``, ``away_team``, ``home_score`` and
            ``away_score``.
        all_events: Event rows with ``competition``, ``match_id``, ``team``
            and ``shot_statsbomb_xg``.
        comp_type: Value matched against ``COMPETITIONS[...]["type"]``.

    Returns:
        ``{"teams": [...]}``: one record per team, sorted by composite score
        (best first). The list is empty when no match belongs to ``comp_type``.
    """
    comps = [c for c, info in COMPETITIONS.items() if info["type"] == comp_type]
    matches = all_matches[all_matches["competition"].isin(comps)].copy()
    events = all_events[all_events["competition"].isin(comps)]

    if matches.empty:
        return {"teams": []}

    matches = matches.sort_values("match_date")

    # Total xG per (match_id, team), held in a dict so each lookup in the
    # match loop is O(1) instead of re-filtering the whole table.
    xg_lookup = (
        events[events["shot_statsbomb_xg"].notna()]
        .groupby(["match_id", "team"])["shot_statsbomb_xg"]
        .sum()
        .to_dict()
    )

    # Build team stats: two records per match, one from each side's view.
    records = []
    for _, m in matches.iterrows():
        home, away = m["home_team"], m["away_team"]
        hs, as_ = m["home_score"], m["away_score"]
        mid = m["match_id"]
        comp_label = COMPETITIONS.get(m["competition"], {}).get("label", m["competition"])

        for team, opp, gs, gc in [(home, away, hs, as_), (away, home, as_, hs)]:
            records.append({
                "team": team,
                "match_id": mid,
                "match_date": m["match_date"],
                "competition": comp_label,
                "goals_scored": gs,
                "goals_conceded": gc,
                "points": 3 if gs > gc else (1 if gs == gc else 0),
                # A side with no recorded shots in a match contributes 0 xG.
                "xg_for": float(xg_lookup.get((mid, team), 0.0)),
                "xg_against": float(xg_lookup.get((mid, opp), 0.0)),
            })

    df = pd.DataFrame(records)

    # Aggregate across all competitions
    team_stats = df.groupby("team").agg(
        matches=("match_id", "count"),
        total_points=("points", "sum"),
        goals_scored=("goals_scored", "sum"),
        goals_conceded=("goals_conceded", "sum"),
        xg_for=("xg_for", "sum"),
        xg_against=("xg_against", "sum"),
        competition=("competition", lambda x: ", ".join(x.unique()[:3])),  # List multiple comps
    ).reset_index()

    team_stats["ppg"] = (team_stats["total_points"] / team_stats["matches"]).round(2)
    team_stats["gd_per_game"] = ((team_stats["goals_scored"] - team_stats["goals_conceded"]) / team_stats["matches"]).round(2)
    team_stats["xg_dominance"] = ((team_stats["xg_for"] - team_stats["xg_against"]) / team_stats["matches"]).round(3)

    # Elo (across all matches, in match_date order); everyone starts at 1500.
    K = 40
    elo = {}
    for _, m in matches.iterrows():
        home, away = m["home_team"], m["away_team"]
        hs, as_ = m["home_score"], m["away_score"]
        eh = elo.get(home, 1500)
        ea = elo.get(away, 1500)
        exp_h = 1 / (1 + 10 ** ((ea - eh) / 400))
        actual_h = 1.0 if hs > as_ else (0.5 if hs == as_ else 0.0)
        elo[home] = eh + K * (actual_h - exp_h)
        elo[away] = ea + K * ((1 - actual_h) - (1 - exp_h))

    team_stats["elo"] = team_stats["team"].map(elo).round(0)

    # Composite: mean of the percentile ranks of ppg, Elo and xG dominance.
    for col in ["ppg", "elo", "xg_dominance"]:
        team_stats[f"{col}_pct"] = percentile_rank(team_stats[col])
    team_stats["composite"] = ((team_stats["ppg_pct"] + team_stats["elo_pct"] + team_stats["xg_dominance_pct"]) / 3).round(1)

    team_stats = team_stats.sort_values("composite", ascending=False)
    cols = ["team", "competition", "matches", "ppg", "elo", "xg_dominance", "gd_per_game", "composite"]
    return {"teams": team_stats[cols].to_dict(orient="records")}


# ---------------------------------------------------------------------------
# StatsBomb Player Comparisons
# ---------------------------------------------------------------------------
def build_sb_player_comparisons(all_events: pd.DataFrame) -> dict:
    """Compare player percentile scores between pairs of event subsets.

    Three comparisons are produced, each limited to players present in both
    subsets and to the 15 highest scorers in the second subset:

    - ``historical_vs_euros2025``: other tournaments vs Euros 2025;
    - ``league_vs_tournament``: league vs tournament competitions;
    - ``euros2025_group_vs_knockout``: only emitted when the Euros 2025
      ``matches.csv`` exists under ``DATA``.

    Args:
        all_events: Event rows; rows without a ``player`` are ignored.

    Returns:
        A dict mapping each comparison name to a list of
        ``{"metric": ..., "players": [...]}`` entries.
    """
    ev = all_events[all_events["player"].notna()].copy()
    ev["comp_type"] = ev["competition"].map(lambda c: COMPETITIONS.get(c, {}).get("type", "unknown"))
    ev["comp_label"] = ev["competition"].map(lambda c: COMPETITIONS.get(c, {}).get("label", c))

    all_metrics = ["goal_threat", "playmaker", "defensive", "composite"]

    def player_scores(subset):
        # Per-player counts plus percentile scores computed within ``subset``.
        goals = subset[subset["shot_outcome"] == "Goal"].groupby("player").size().rename("goals")
        assists = subset[subset["pass_goal_assist"].notna()].groupby("player").size().rename("assists")
        key_passes = subset[subset["pass_shot_assist"].notna()].groupby("player").size().rename("key_passes")
        xg = subset[subset["shot_statsbomb_xg"].notna()].groupby("player")["shot_statsbomb_xg"].sum().rename("xg")
        through_balls = subset[subset["pass_through_ball"].notna()].groupby("player").size().rename("through_balls")
        crosses = subset[subset["pass_cross"].notna()].groupby("player").size().rename("crosses")
        interceptions = subset[subset["type"] == "Interception"].groupby("player").size().rename("interceptions")
        tackles = subset[(subset["duel_type"] == "Tackle") & (subset["duel_outcome"].isin(["Won", "Success In Play", "Success Out"]))].groupby("player").size().rename("tackles_won")
        blocks = subset[subset["type"] == "Block"].groupby("player").size().rename("blocks")
        recoveries = subset[subset["type"] == "Ball Recovery"].groupby("player").size().rename("recoveries")

        stats = pd.DataFrame({"goals": goals, "assists": assists, "key_passes": key_passes,
                               "xg": xg, "through_balls": through_balls, "crosses": crosses,
                               "interceptions": interceptions, "tackles_won": tackles,
                               "blocks": blocks, "recoveries": recoveries}).fillna(0)

        # An empty subset has no score columns; compare() copes with that.
        if len(stats) == 0:
            return stats

        stats["goal_threat"] = percentile_rank(stats[["goals", "xg", "assists", "key_passes"]].sum(axis=1))
        stats["playmaker"] = percentile_rank(stats[["assists", "key_passes", "through_balls", "crosses"]].sum(axis=1))
        stats["defensive"] = percentile_rank(stats[["interceptions", "tackles_won", "blocks", "recoveries"]].sum(axis=1))
        stats["composite"] = (stats["goal_threat"] + stats["playmaker"] + stats["defensive"]) / 3
        return stats

    def compare(left, right, left_key, right_key, metrics):
        # Top 15 shared players per metric, ranked by their ``right`` score.
        common = left.index.intersection(right.index)
        comparison = []
        for metric in metrics:
            # index=common keeps construction valid when BOTH sides lack the
            # metric column: two scalar zeros with no index raise ValueError.
            merged = pd.DataFrame(
                {
                    left_key: left.loc[common, metric] if metric in left.columns else 0,
                    right_key: right.loc[common, metric] if metric in right.columns else 0,
                },
                index=common,
            ).dropna()
            top = merged.nlargest(15, right_key).reset_index().rename(columns={"index": "player"})
            comparison.append({"metric": metric, "players": top.to_dict(orient="records")})
        return comparison

    result = {}

    # 1. Historical tournaments vs Euros 2025
    hist_tourn = ev[(ev["comp_type"] == "tournament") & (ev["competition"] != "UEFA_Womens_Euro_2025")]
    euros25 = ev[ev["competition"] == "UEFA_Womens_Euro_2025"]
    result["historical_vs_euros2025"] = compare(
        player_scores(hist_tourn), player_scores(euros25),
        "historical", "euros_2025", all_metrics,
    )

    # 2. League vs Tournament
    league_ev = ev[ev["comp_type"] == "league"]
    tourn_ev = ev[ev["comp_type"] == "tournament"]
    result["league_vs_tournament"] = compare(
        player_scores(league_ev), player_scores(tourn_ev),
        "league", "tournament", all_metrics,
    )

    # 3. Euros 2025 Group vs Knockout
    # Need match stage info from matches
    e25_matches_path = DATA / "statsbomb" / "UEFA_Womens_Euro_2025" / "matches.csv"
    if e25_matches_path.exists():
        e25m = pd.read_csv(e25_matches_path)
        is_group = e25m["competition_stage"].str.contains("Group", case=False, na=False)
        group_match_ids = e25m[is_group]["match_id"].tolist()
        ko_match_ids = e25m[~is_group]["match_id"].tolist()

        group_ev = euros25[euros25["match_id"].isin(group_match_ids)]
        ko_ev = euros25[euros25["match_id"].isin(ko_match_ids)]
        # The defensive metric is not reported for this comparison.
        result["euros2025_group_vs_knockout"] = compare(
            player_scores(group_ev), player_scores(ko_ev),
            "group_stage", "knockout", ["goal_threat", "playmaker", "composite"],
        )

    return result


# ---------------------------------------------------------------------------
# FIFA Rankings
# ---------------------------------------------------------------------------
def build_fifa_rankings() -> dict:
    """Summarise the FIFA women's world-ranking snapshots found under ``DATA``.

    Each snapshot listed in ``quarters`` is read from
    ``fifa_womens_world_ranking_<suffix>.csv`` when that file exists.

    Returns:
        An empty dict when no snapshot file exists. Otherwise a dict with
        ``top25`` (by average points), ``confederation_avg``,
        ``top_climbers`` / ``top_fallers`` / ``top_point_gainers`` (first vs
        latest available snapshot), ``h1_vs_h2`` and ``h1_vs_h2_risers``
        (first half vs second half of the available snapshots),
        ``trajectories`` (top 10 by average points) and ``quarters``.
    """
    # Listed chronologically; later steps rely on this order.
    quarters = [
        ("2025_03_06", "Mar 2025"),
        ("2025_06_12", "Jun 2025"),
        ("2025_08_07", "Aug 2025"),
        ("2025_12_11", "Dec 2025"),
    ]

    frames = {}
    for suffix, label in quarters:
        path = DATA / f"fifa_womens_world_ranking_{suffix}.csv"
        if path.exists():
            frames[label] = pd.read_csv(path)

    if not frames:
        return {}

    # Build per-country trajectory
    countries = {}
    for label, df in frames.items():
        for _, row in df.iterrows():
            c = row["Country"]
            if c not in countries:
                countries[c] = {
                    "country": c,
                    "code": row.get("Country_Code", ""),
                    "confederation": row.get("Confederation", ""),
                    "points": {},
                    "ranks": {},
                }
            if pd.notna(row["Total_Points"]):
                countries[c]["points"][label] = float(row["Total_Points"])
            if pd.notna(row["Rank"]):
                countries[c]["ranks"][label] = int(row["Rank"])

    all_countries = list(countries.values())

    # Average points across all snapshots, for the top 25
    all_df = pd.concat(frames.values(), ignore_index=True)
    avg_points = all_df.groupby(["Country", "Country_Code", "Confederation"])["Total_Points"].mean().reset_index()
    avg_points = avg_points.sort_values("Total_Points", ascending=False)
    top25 = avg_points.head(25).rename(columns={"Total_Points": "Avg_Points"}).to_dict(orient="records")

    # Confederation breakdown (per snapshot)
    conf_avg = {}
    for label, df in frames.items():
        conf_avg[label] = df.groupby("Confederation")["Total_Points"].mean().round(1).to_dict()

    # Chronological labels. ``frames`` was filled in ``quarters`` order, so its
    # insertion order is already chronological. Sorting the labels as strings
    # would give "Aug", "Dec", "Jun", "Mar" and compare the wrong snapshots.
    all_quarters_sorted = list(frames)
    first_label = all_quarters_sorted[0]
    latest_label = all_quarters_sorted[-1]

    # Movers: change from the earliest to the latest available snapshot.
    # A positive rank_change means the country climbed.
    movers = []
    for c in all_countries:
        if first_label in c["ranks"] and latest_label in c["ranks"]:
            rank_change = c["ranks"][first_label] - c["ranks"][latest_label]
            point_change = c["points"].get(latest_label, 0) - c["points"].get(first_label, 0)
            movers.append({
                "country": c["country"],
                "code": c["code"],
                "confederation": c["confederation"],
                "rank_change": rank_change,
                "point_change": round(point_change, 1),
            })

    if movers:
        movers_df = pd.DataFrame(movers)
        top_climbers = movers_df.nlargest(15, "rank_change").to_dict(orient="records")
        top_fallers = movers_df.nsmallest(15, "rank_change").to_dict(orient="records")
        top_point_gainers = movers_df.nlargest(15, "point_change").to_dict(orient="records")
    else:
        # An empty frame has no "rank_change" column to rank on.
        top_climbers, top_fallers, top_point_gainers = [], [], []

    # H1 vs H2 (first half vs second half of the available snapshots)
    mid = len(all_quarters_sorted) // 2
    h1_labels = all_quarters_sorted[:mid]
    h2_labels = all_quarters_sorted[mid:]
    h1h2 = []
    # With a single snapshot there is no first half to compare against.
    if h1_labels:
        needed = h1_labels + h2_labels
        for c in all_countries:
            pts = c["points"]
            rnk = c["ranks"]
            if all(l in pts for l in needed) and all(l in rnk for l in needed):
                h1_point_delta = sum(pts[l] for l in h1_labels[1:]) - sum(pts[l] for l in h1_labels[:-1])
                h2_point_delta = sum(pts[l] for l in h2_labels[1:]) - sum(pts[l] for l in h2_labels[:-1])
                h1_rank_delta = rnk[h1_labels[0]] - rnk[h1_labels[-1]]
                h2_rank_delta = rnk[h2_labels[0]] - rnk[h2_labels[-1]]
                h1h2.append({
                    "country": c["country"],
                    "code": c["code"],
                    "confederation": c["confederation"],
                    "h1_point_delta": round(h1_point_delta, 1),
                    "h2_point_delta": round(h2_point_delta, 1),
                    "h1_rank_delta": h1_rank_delta,
                    "h2_rank_delta": h2_rank_delta,
                })
    h1h2_df = pd.DataFrame(h1h2)

    # Top 10 trajectories (by average points across all snapshots)
    top10_countries = avg_points.head(10)["Country"].tolist()
    trajectories = [c for c in all_countries if c["country"] in top10_countries]

    return {
        "top25": top25,
        "confederation_avg": conf_avg,
        "top_climbers": top_climbers,
        "top_fallers": top_fallers,
        "top_point_gainers": top_point_gainers,
        "h1_vs_h2": h1h2_df.nlargest(20, "h2_point_delta").to_dict(orient="records") if len(h1h2_df) else [],
        "h1_vs_h2_risers": h1h2_df.assign(
            h2_improvement=h1h2_df["h2_point_delta"] - h1h2_df["h1_point_delta"]
        ).nlargest(15, "h2_improvement").to_dict(orient="records") if len(h1h2_df) else [],
        "trajectories": trajectories,
        "quarters": [q[1] for q in quarters],
    }


# ---------------------------------------------------------------------------
# WIFXScore (aggregated across years)
# ---------------------------------------------------------------------------
def build_wifx_scores() -> dict:
    """Aggregate ``wifx_scores.csv`` into per-player and per-competition views.

    Each player is represented by their best-scoring row: the maximum
    ``WIFXScore`` plus the component scores from that same row. Event counts
    are summed over all of the player's rows. Players with fewer than 50
    events in total, or with no score at all, are dropped.

    Returns:
        A dict with ``top25``, ``bottom25``, ``all_players`` (every remaining
        player, best first), ``distribution`` (30-bin histogram plus mean and
        std) and ``by_competition``.
    """
    path = DATA / "wifx_scores.csv"
    df = pd.read_csv(path)

    component_cols = ["epm_raw", "offensive_score", "creative_score", "defensive_score"]

    def summarise(g):
        # Use max score so merging entries never penalises players
        # (different sources have different feature richness)
        scores = g["WIFXScore"]
        summary = {"WIFXScore": scores.max()}
        if scores.notna().any():
            best_idx = scores.idxmax()  # computed once, reused for every component
            for col in component_cols:
                summary[col] = g.loc[best_idx, col]
        else:
            # No usable score: idxmax() has no valid label here. Emit NaN so the
            # NaN filter below drops the player instead of crashing.
            for col in component_cols:
                summary[col] = float("nan")
        summary["total_events"] = g["total_events"].sum()
        team_counts = g["team"].value_counts()
        summary["team"] = team_counts.index[0] if len(team_counts) > 0 else "Unknown"
        comps = g["primary_comp"].unique()
        summary["primary_comp"] = ", ".join(comps[:3]) if len(comps) > 0 else "Unknown"
        return pd.Series(summary)

    # Aggregate by player across all years/competitions
    player_agg = df.groupby("player").apply(summarise).reset_index()

    # Filter for minimum events threshold
    player_agg = player_agg[player_agg["total_events"] >= 50]  # At least 50 events total
    player_agg = player_agg[player_agg["WIFXScore"].notna()]  # Remove NaN scores

    out_cols = ["player", "team", "primary_comp", "WIFXScore", "epm_raw",
                "offensive_score", "creative_score", "defensive_score", "total_events"]

    # Top 25 by best (max) WIFXScore - include all metrics
    top25 = player_agg.nlargest(25, "WIFXScore")[out_cols].to_dict(orient="records")

    # Bottom 25 by WIFXScore
    bottom25 = player_agg.nsmallest(25, "WIFXScore")[out_cols].to_dict(orient="records")

    # Every remaining player, best first (for the component breakdown)
    all_players = player_agg.sort_values("WIFXScore", ascending=False)[out_cols].to_dict(orient="records")

    # Distribution histogram
    hist_counts, hist_edges = np.histogram(player_agg["WIFXScore"], bins=30)
    distribution = {
        "counts": hist_counts.tolist(),
        "edges": [round(float(e), 2) for e in hist_edges.tolist()],
        "mean": round(float(player_agg["WIFXScore"].mean()), 2),
        "std": round(float(player_agg["WIFXScore"].std()), 2),
    }

    # By competition (computed on the raw rows, not the per-player aggregate)
    by_comp = df.groupby("primary_comp")["WIFXScore"].agg(["mean", "median", "std", "count", "min", "max"]).round(2)
    by_comp_list = []
    for comp, row in by_comp.iterrows():
        scores = df[df["primary_comp"] == comp]["WIFXScore"].tolist()
        by_comp_list.append({
            "competition": comp,
            "mean": row["mean"],
            "median": row["median"],
            "std": row["std"],
            "count": int(row["count"]),
            "scores": [round(s, 2) for s in scores],
        })

    return {
        "top25": top25,
        "bottom25": bottom25,
        "all_players": all_players,
        "distribution": distribution,
        "by_competition": by_comp_list,
    }


# ---------------------------------------------------------------------------
# WIFXScore Historical (retired/legend players)
# ---------------------------------------------------------------------------
def build_wifx_historical_scores() -> dict:
    """Aggregate ``wifx_historical_scores.csv`` into per-player records.

    Each player is represented by their best-scoring row: the maximum
    ``WIFXScore`` plus the component scores from that same row. Event counts
    are summed over all of the player's rows. Players with fewer than 50
    events in total, or with no score at all, are dropped. ``category`` is
    looked up in ``retired_players.csv`` and defaults to ``"retired"``.

    Returns:
        A dict with ``top25`` and ``all_players`` (best first).
    """
    path = DATA / "wifx_historical_scores.csv"
    retired_path = DATA / "retired_players.csv"
    df = pd.read_csv(path)
    retired_df = pd.read_csv(retired_path)
    category_map = dict(zip(retired_df["player"], retired_df["category"]))

    component_cols = ["epm_raw", "offensive_score", "creative_score", "defensive_score"]

    def summarise(g):
        scores = g["WIFXScore"]
        summary = {"WIFXScore": scores.max()}
        if scores.notna().any():
            best_idx = scores.idxmax()  # computed once, reused for every component
            for col in component_cols:
                summary[col] = g.loc[best_idx, col]
        else:
            # No usable score: idxmax() has no valid label here. Emit NaN so the
            # NaN filter below drops the player instead of crashing.
            for col in component_cols:
                summary[col] = float("nan")
        summary["total_events"] = g["total_events"].sum()
        team_counts = g["team"].value_counts()
        summary["team"] = team_counts.index[0] if len(team_counts) > 0 else "Unknown"
        comps = g["primary_comp"].unique()
        summary["primary_comp"] = ", ".join(comps[:3]) if len(comps) > 0 else "Unknown"
        return pd.Series(summary)

    player_agg = df.groupby("player").apply(summarise).reset_index()

    player_agg = player_agg[player_agg["total_events"] >= 50]
    player_agg = player_agg[player_agg["WIFXScore"].notna()]
    player_agg["category"] = player_agg["player"].map(category_map).fillna("retired")

    cols = ["player", "team", "primary_comp", "WIFXScore", "epm_raw",
            "offensive_score", "creative_score", "defensive_score",
            "total_events", "category"]

    all_players = player_agg.sort_values("WIFXScore", ascending=False)[cols].to_dict(orient="records")
    top25 = player_agg.nlargest(25, "WIFXScore")[cols].to_dict(orient="records")

    return {
        "top25": top25,
        "all_players": all_players,
    }


# ---------------------------------------------------------------------------
# Historical Match Results
# ---------------------------------------------------------------------------
def build_match_results() -> dict:
    """Rank teams and scorers from the historical results dataset.

    Reads ``results.csv`` and ``goalscorers.csv`` from ``DATA/versions/36``.

    Returns:
        A dict with ``top_teams`` (the 30 best composite scores among teams
        with at least 10 matches) and ``top_scorers`` (the 30 players with the
        most goals).
    """
    dataset_dir = DATA / "versions" / "36"
    results = pd.read_csv(dataset_dir / "results.csv")
    goalscorers = pd.read_csv(dataset_dir / "goalscorers.csv")

    def points_for(scored, conceded):
        if scored > conceded:
            return 3
        if scored == conceded:
            return 1
        return 0

    # Team aggregates (min 10 matches): one record per team per match.
    records = []
    for _, match in results.iterrows():
        home_goals, away_goals = match["home_score"], match["away_score"]
        sides = (
            (match["home_team"], home_goals, away_goals),
            (match["away_team"], away_goals, home_goals),
        )
        for side, scored, conceded in sides:
            records.append({
                "team": side,
                "date": match["date"],
                "goals_scored": scored,
                "goals_conceded": conceded,
                "points": points_for(scored, conceded),
            })

    per_team_match = pd.DataFrame(records)
    team_stats = per_team_match.groupby("team").agg(
        matches=("points", "count"),
        total_points=("points", "sum"),
        goals_scored=("goals_scored", "sum"),
        goals_conceded=("goals_conceded", "sum"),
    ).reset_index()
    team_stats = team_stats[team_stats["matches"] >= 10]

    played = team_stats["matches"]
    goal_diff = team_stats["goals_scored"] - team_stats["goals_conceded"]
    team_stats["ppg"] = (team_stats["total_points"] / played).round(2)
    team_stats["gd_per_game"] = (goal_diff / played).round(2)

    # Elo, replayed in date order; every team starts at 1500.
    K = 40
    ratings = {}
    for _, match in results.sort_values("date").iterrows():
        home, away = match["home_team"], match["away_team"]
        hs, as_ = match["home_score"], match["away_score"]
        eh = ratings.get(home, 1500)
        ea = ratings.get(away, 1500)
        exp_h = 1 / (1 + 10 ** ((ea - eh) / 400))
        if hs > as_:
            actual_h = 1.0
        elif hs == as_:
            actual_h = 0.5
        else:
            actual_h = 0.0
        ratings[home] = eh + K * (actual_h - exp_h)
        ratings[away] = ea + K * ((1 - actual_h) - (1 - exp_h))

    team_stats["elo"] = team_stats["team"].map(ratings).round(0)

    # Composite: mean percentile rank of ppg, Elo and goal difference per game.
    for col in ["ppg", "elo", "gd_per_game"]:
        team_stats[f"{col}_pct"] = percentile_rank(team_stats[col])
    pct_total = team_stats["ppg_pct"] + team_stats["elo_pct"] + team_stats["gd_per_game_pct"]
    team_stats["composite"] = (pct_total / 3).round(1)
    team_stats = team_stats.sort_values("composite", ascending=False)

    team_cols = ["team", "matches", "ppg", "elo", "gd_per_game", "composite"]
    top_teams = team_stats.head(30)[team_cols].to_dict(orient="records")

    # Top scorers
    scorer_counts = goalscorers.groupby("scorer").agg(
        goals=("scorer", "count"),
        teams=("team", lambda x: ", ".join(x.unique())),
        penalties=("penalty", "sum"),
    ).reset_index()
    scorer_counts = scorer_counts.sort_values("goals", ascending=False)
    top_scorers = scorer_counts.head(30).to_dict(orient="records")

    return {
        "top_teams": top_teams,
        "top_scorers": top_scorers,
    }


# ---------------------------------------------------------------------------
# WIFX National Team Scores (aggregated across all years)
# ---------------------------------------------------------------------------
def build_wifx_national_team_scores():
    """Aggregate per-row national-team ratings into one record per team.

    Reads ``wifx_national_team_scores.csv`` from ``DATA``, attaches a
    hand-maintained championship count to every row, collapses the rows to
    one per team (ratings averaged, matches and goals summed, championship
    count taken as the maximum) and ranks teams by
    ``net_rating + championship_wins``.  The result is written to
    ``wifx_national_team_scores.json`` through ``write_json``.
    """
    # Title counts keyed by the bare nation name.
    # NOTE(review): these counts are hand-entered; confirm them against the
    # official tournament records before relying on the ranking.
    titles_by_nation = {
        "United States": 4,
        "Germany": 2,
        "Norway": 1,
        "Japan": 1,
        "Spain": 2,
        "England": 1,
        "Netherlands": 1,
        "France": 0,
        "Sweden": 0,
        "Canada": 1,
        "Brazil": 0,
        "Australia": 0,
    }
    # Team names may or may not carry the "Women's" suffix, so register both
    # spellings with the same count.
    wins_lookup = {}
    for nation, titles in titles_by_nation.items():
        wins_lookup[f"{nation} Women's"] = titles
        wins_lookup[nation] = titles

    scores = pd.read_csv(DATA / "wifx_national_team_scores.csv")
    # Teams missing from the lookup get zero titles.
    scores["championship_wins"] = scores["team"].map(wins_lookup).fillna(0)

    # Ratings are averaged across rows; counting stats are summed.
    how = {
        rating: "mean"
        for rating in ("offensive_rating", "defensive_rating", "net_rating", "composite_rating")
    }
    how.update(matches="sum", goals_scored="sum", championship_wins="max")
    if "goals_conceded" in scores.columns:
        how["goals_conceded"] = "sum"

    ranked = (
        scores.groupby("team")
        .agg(how)
        .reset_index()
        # Each title adds one point on top of the averaged net rating.
        .assign(wifx_global_ranking=lambda t: t["net_rating"] + t["championship_wins"])
        .sort_values("wifx_global_ranking", ascending=False)
    )

    write_json(
        "wifx_national_team_scores.json",
        {"all_teams": ranked.to_dict(orient="records")},
    )


# ---------------------------------------------------------------------------
# WIFX Club Team Scores (aggregated across all years)
# ---------------------------------------------------------------------------
# Display names for the opaque team ids found in the ASA goals-added file.
_CLUB_ASA_TEAM_NAMES = {
    'KPqjw8PQ6v': 'Portland Thorns',
    'aDQ0lzvQEv': 'OL Reign',
    '4JMAk47qKg': 'Chicago Red Stars',
    'XVqKeVKM01': 'Washington Spirit',
    'raMyrr25d2': 'Houston Dash',
    'zeQZeazqKw': 'Orlando Pride',
    '7vQ7BBzqD1': 'FC Kansas City',
    '4wM4rZdqjB': 'North Carolina Courage',
    'Pk5LeeNqOW': 'Kansas City Current',
    '4wM4Ezg5jB': 'Sky Blue FC',
    '7VqG1lYMvW': 'NJ/NY Gotham',
    'eV5DR6YQKn': 'Angel City',
    'kRQa8JOqKZ': 'San Diego Wave',
    'eV5D2w9QKn': 'Bay FC',
    '315VnJ759x': 'Racing Louisville',
    'xW5pwDBMg1': 'Boston Breakers',
    'kRQaWa15KZ': 'Western New York Flash',
}

# Hand-maintained title counts per club.  Some clubs are listed under several
# spellings so that whichever name a data source uses resolves to a count.
# NOTE(review): counts and year annotations are hand-entered and unverified.
_CLUB_CHAMPIONSHIPS = {
    # NWSL (weighted 1.5x)
    "Portland Thorns": 3,  # 2017, 2022, 2024
    "North Carolina Courage": 3,  # 2018, 2019, 2023
    "Kansas City Current": 1,  # 2024 (as Current)
    "FC Kansas City": 2,  # 2014, 2015
    "Western New York Flash": 1,  # 2016
    "OL Reign": 1,  # 2020
    "Seattle Reign": 1,  # 2020
    "Chicago Red Stars": 0,
    "Washington Spirit": 1,  # 2021
    "Houston Dash": 0,
    "Angel City": 0,
    "NJ/NY Gotham": 0,
    "Boston Breakers": 0,
    "Sky Blue FC": 0,

    # FAWSL
    "Chelsea": 4,  # 2015-16, 2017-18, 2019-20, 2020-21
    "Manchester City Women": 2,  # 2016-17, 2020-21
    "Arsenal Women": 1,  # 2022-23
    "Liverpool FFC": 1,  # 2013-14
    "Everton Ladies": 0,
    "Bristol City WFC": 0,
    "Brighton & Hove Albion Women": 0,
    "Reading FC Women": 0,
    "Tottenham Hotspur Women": 0,
    "West Ham United LFC": 0,
    "Aston Villa": 0,
    "Yeovil Town LFC": 0,

    # UWCL
    "Lyon": 8,
    "OL Lyonnes": 8,  # 2016-2020 (5), 2021-22, 2022-23, 2023-24
    "Barcelona": 3,
    "Fútbol Club Barcelona": 3,  # 2020-21, 2021-22, 2022-23
    "Wolfsburg": 2,
    "VfL Wolfsburg": 2,  # 2013-14, 2015-16
    "Paris Saint-Germain": 0,
    "Olympique Lyonnais": 8,

    # Other leagues
    "Bay FC": 0,
    "Racing Louisville": 0,
    "San Diego Wave": 0,
    "FC Barcelona": 3,
}

# Clubs whose titles receive the 1.5x NWSL weighting in the final ranking.
_CLUB_NWSL_TEAMS = frozenset({
    "Portland Thorns", "North Carolina Courage", "Kansas City Current", "FC Kansas City",
    "Western New York Flash", "OL Reign", "Seattle Reign", "Chicago Red Stars",
    "Washington Spirit", "Houston Dash", "Angel City", "NJ/NY Gotham", "Boston Breakers",
    "Sky Blue FC", "Bay FC", "Racing Louisville", "San Diego Wave",
})

# The four rating fields that are averaged with matches as the weight.
_CLUB_RATING_KEYS = ("offensive_rating", "defensive_rating", "net_rating", "composite_rating")


def _club_zero_if_missing(value):
    """Return 0 when *value* is None/NaN, otherwise *value* unchanged.

    ``value or 0`` is not enough here: NaN is truthy, so it would pass
    through and poison every sum it is added to.
    """
    return 0 if pd.isna(value) else value


def _club_weighted_means(entry):
    """Return the per-match average of each rating sum in *entry*, rounded to 0.1."""
    return {
        key: round(entry[f'{key}_sum'] / entry['matches'], 1)
        for key in _CLUB_RATING_KEYS
    }


def _aggregate_statsbomb_clubs(df):
    """Collapse the StatsBomb club CSV rows into one match-weighted record per team."""
    totals = {}
    for _, row in df.iterrows():
        team = row['team']
        matches = _club_zero_if_missing(row['matches'])
        if team not in totals:
            totals[team] = {
                'matches': 0,
                'goals_scored': 0,
                'offensive_rating_sum': 0,
                'defensive_rating_sum': 0,
                'net_rating_sum': 0,
                'composite_rating_sum': 0,
                'comps': set(),
            }
        entry = totals[team]
        entry['matches'] += matches
        entry['goals_scored'] += int(_club_zero_if_missing(row.get('goals_scored', 0)))
        # goals_conceded is optional; only create the key once a real value is seen.
        conceded = row.get('goals_conceded')
        if pd.notna(conceded):
            entry['goals_conceded'] = entry.get('goals_conceded', 0) + int(conceded)
        for key in _CLUB_RATING_KEYS:
            entry[f'{key}_sum'] += _club_zero_if_missing(row[key]) * matches
        label = row.get('comp_label')
        if pd.notna(label):
            entry['comps'].add(label)

    teams = []
    for team, entry in totals.items():
        if not entry['matches']:
            # No recorded matches means there is nothing to average; skip the
            # team rather than divide by zero.
            continue
        record = {'team': team}
        record.update(_club_weighted_means(entry))
        record['matches'] = entry['matches']
        record['goals_scored'] = entry['goals_scored']
        record['comp_label'] = ", ".join(sorted(entry['comps'])) if entry['comps'] else "FAWSL"
        record['source'] = 'statsbomb'
        if 'goals_conceded' in entry:
            record['goals_conceded'] = entry['goals_conceded']
        teams.append(record)
    return teams


def _rescale_statsbomb_ratings(teams):
    """Min-max rescale StatsBomb offensive/defensive ratings to 0-100 in place.

    Net and composite ratings are recomputed from the rescaled values.
    Nothing is changed when *teams* is empty or when either rating has no
    spread, because the rescaling would divide by zero.
    """
    if not teams:
        return
    offensive = [t['offensive_rating'] for t in teams]
    defensive = [t['defensive_rating'] for t in teams]
    off_min, off_max = min(offensive), max(offensive)
    def_min, def_max = min(defensive), max(defensive)
    if not (off_max > off_min and def_max > def_min):
        return
    for t in teams:
        t['offensive_rating'] = round((t['offensive_rating'] - off_min) / (off_max - off_min) * 100, 1)
        t['defensive_rating'] = round((t['defensive_rating'] - def_min) / (def_max - def_min) * 100, 1)
        t['net_rating'] = round(t['offensive_rating'] - t['defensive_rating'], 1)
        t['composite_rating'] = round((t['offensive_rating'] + t['defensive_rating']) / 2, 1)


def _aggregate_asa_clubs(ga_path):
    """Build one match-weighted record per NWSL team from the ASA goals-added CSV.

    Returns an empty list when *ga_path* does not exist or no team-season
    survives the filters.
    """
    if not ga_path.exists():
        return []

    ga = pd.read_csv(ga_path)
    team_year = ga.groupby(['team_id_ga', 'season']).agg({
        'minutes_played_ga': 'sum',
        'ga_shooting_raw': 'sum',
        'ga_passing_raw': 'sum',
        'ga_dribbling_raw': 'sum',
        'ga_interrupting_raw': 'sum',
        'ga_receiving_raw': 'sum',
        'player_id': 'count',
    }).reset_index()
    team_year.columns = ['team_id', 'season', 'minutes', 'shooting', 'passing', 'dribbling', 'interrupting', 'receiving', 'players']
    team_year['team'] = team_year['team_id'].map(_CLUB_ASA_TEAM_NAMES).fillna('Unknown')
    keep = (team_year['team'] != 'Unknown') & (team_year['minutes'] > 5000)
    # Copy so the column assignments below write to an independent frame
    # instead of a slice of the unfiltered one.
    team_year = team_year[keep].copy()
    if team_year.empty:
        return []

    # Percentile ranking within each season
    team_year['offensive_rating'] = team_year.groupby('season')['shooting'].transform(lambda x: (x.rank(pct=True) * 100).round(1))
    team_year['defensive_rating'] = team_year.groupby('season')['interrupting'].transform(lambda x: (x.rank(pct=True) * 100).round(1))
    team_year['net_rating'] = (team_year['offensive_rating'] - team_year['defensive_rating']).round(1)
    team_year['composite_rating'] = ((team_year['offensive_rating'] + team_year['defensive_rating']) / 2).round(1)

    # Convert summed player minutes to a match count at 90 minutes per match.
    team_year['matches'] = (team_year['minutes'] / 90).astype(int)

    totals = {}
    for _, row in team_year.iterrows():
        matches = row['matches']
        entry = totals.setdefault(row['team'], {
            'matches': 0,
            'offensive_rating_sum': 0,
            'defensive_rating_sum': 0,
            'net_rating_sum': 0,
            'composite_rating_sum': 0,
        })
        entry['matches'] += matches
        for key in _CLUB_RATING_KEYS:
            entry[f'{key}_sum'] += row[key] * matches

    teams = []
    for team, entry in totals.items():
        if not entry['matches']:
            continue
        record = {'team': team}
        record.update(_club_weighted_means(entry))
        record.update({
            'matches': entry['matches'],
            'goals_scored': 0,  # Not available in ASA
            'goals_conceded': 0,
            'comp_label': 'NWSL 2016-2025',
            'source': 'asa',
        })
        teams.append(record)
    return teams


def _merge_club_sources(sb_teams, asa_teams):
    """Combine both sources into ``{team: record}``, merging teams present in both.

    A team found in both sources keeps the StatsBomb goal totals and gets
    ratings averaged across the two sources with matches as the weight.
    """
    combined = {t['team']: t for t in sb_teams}
    for t in asa_teams:
        existing = combined.get(t['team'])
        if existing is None:
            combined[t['team']] = t
            continue
        total_matches = existing['matches'] + t['matches']
        merged = {'team': t['team']}
        for key in _CLUB_RATING_KEYS:
            merged[key] = round(
                (existing[key] * existing['matches'] + t[key] * t['matches']) / total_matches, 1
            )
        merged['matches'] = total_matches
        merged['goals_scored'] = existing.get('goals_scored', 0)
        merged['goals_conceded'] = existing.get('goals_conceded', 0)
        # NOTE(review): the label is fixed regardless of which competitions
        # the StatsBomb record actually covered.
        merged['comp_label'] = 'NWSL + FAWSL'
        combined[t['team']] = merged
    return combined


def _apply_club_championships(combined):
    """Add ``championship_wins`` and ``wifx_global_club_ranking`` to every record in place."""
    for team, data in combined.items():
        wins = _CLUB_CHAMPIONSHIPS.get(team, 0)
        # NWSL championships weighted 1.5x
        bonus = wins * 1.5 if team in _CLUB_NWSL_TEAMS else wins
        data['championship_wins'] = wins
        data['wifx_global_club_ranking'] = data['net_rating'] + bonus


def build_wifx_club_team_scores():
    """Build the club ranking from the StatsBomb and ASA sources and write it as JSON.

    Steps:

    1. Read ``wifx_club_team_scores.csv`` from ``DATA`` and average each
       team's ratings across rows, weighted by matches.  Missing (NaN)
       numeric cells count as 0, and teams with no matches are skipped.
    2. Min-max rescale the StatsBomb offensive/defensive ratings to 0-100
       (skipped when either rating has no spread).
    3. If ``asa_nwsl/goals_added.csv`` exists, derive per-season percentile
       ratings for the mapped NWSL teams and average them across seasons,
       again weighted by matches.
    4. Merge the two sources by team name, add championship counts and the
       ``wifx_global_club_ranking`` (net rating plus titles, NWSL titles
       counting 1.5x), and sort descending by that ranking.

    The result is written to ``wifx_club_team_scores.json`` via ``write_json``.
    """
    sb_teams = _aggregate_statsbomb_clubs(pd.read_csv(DATA / "wifx_club_team_scores.csv"))
    _rescale_statsbomb_ratings(sb_teams)
    asa_teams = _aggregate_asa_clubs(DATA / "asa_nwsl" / "goals_added.csv")

    combined = _merge_club_sources(sb_teams, asa_teams)
    _apply_club_championships(combined)

    all_teams = sorted(
        combined.values(),
        key=lambda t: t['wifx_global_club_ranking'],
        reverse=True,
    )
    write_json("wifx_club_team_scores.json", {"all_teams": all_teams})


# ---------------------------------------------------------------------------
# WIFX Confederation Scores (aggregated across years)
# ---------------------------------------------------------------------------
def build_wifx_confederation_scores():
    """Collapse the club confederation CSV to one ranked record per team.

    Reads ``wifx_club_confederation_scores.csv`` from ``DATA``.  Per team the
    WIFX club score is averaged, the first country and confederation values
    are kept, and championships won / finals reached are summed.  Teams are
    ordered by average score (highest first) and numbered from 1 in that
    order.  The result is written to ``wifx_club_confederation_scores.json``
    through ``write_json``.
    """
    scores = pd.read_csv(DATA / "wifx_club_confederation_scores.csv")

    per_team = (
        scores.groupby("team")
        .agg(
            wifx_club_score=("wifx_club_score", "mean"),
            country=("country", "first"),
            confederation=("confederation", "first"),
            championships_won=("championships_won", "sum"),
            finals_reached=("finals_reached", "sum"),
        )
        .reset_index()
        .sort_values("wifx_club_score", ascending=False)
    )
    # Rank follows the sorted row order, starting at 1.
    per_team["rank"] = range(1, len(per_team) + 1)

    write_json(
        "wifx_club_confederation_scores.json",
        {"club_confederation_scores": per_team.to_dict(orient="records")},
    )


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
    """Build every WIFX dashboard JSON file and write it under ``OUT``.

    Only the WIFX builders run here.  None of them takes the raw StatsBomb
    event, match or lineup frames as an argument, so those large CSVs are
    not loaded up front.
    """
    print("Building WIFX scores...")
    write_json("wifx_scores.json", build_wifx_scores())

    print("Building WIFX historical scores...")
    write_json("wifx_historical_scores.json", build_wifx_historical_scores())

    # The remaining builders write their own JSON files.
    print("Building aggregated WIFX national team scores...")
    build_wifx_national_team_scores()
    print("Building aggregated WIFX club team scores...")
    build_wifx_club_team_scores()
    print("Building aggregated WIFX confederation scores...")
    build_wifx_confederation_scores()

    # Report the directory write_json actually writes to.
    print(f"Done! All JSON files written to {OUT}")


def write_json(filename: str, data: dict):
    """Write *data* to ``OUT / filename`` as UTF-8 JSON and print the file size.

    Before serializing, the structure is walked recursively (dicts, lists
    and tuples) and made JSON-safe:

    * numpy scalars are converted to native Python values, so they are
      written as numbers rather than being stringified by ``default=str``;
    * NaN and +/-infinity floats, ``pd.NA`` and the literal string ``"NaN"``
      become ``null``.

    Anything else that ``json`` cannot encode is still written as ``str(obj)``.

    Args:
        filename: File name relative to ``OUT``.
        data: JSON-like structure to serialize.
    """
    import math

    def to_json_safe(obj):
        if isinstance(obj, dict):
            return {k: to_json_safe(v) for k, v in obj.items()}
        if isinstance(obj, (list, tuple)):
            # json writes tuples as arrays anyway; recurse so values inside
            # them are cleaned too.
            return [to_json_safe(v) for v in obj]
        if isinstance(obj, np.generic):
            obj = obj.item()
        if isinstance(obj, float) and not math.isfinite(obj):
            return None
        # Compare against "NaN" only for real strings: `==` on arbitrary
        # objects (e.g. pd.NA) may not yield a plain bool.
        if obj is pd.NA or (isinstance(obj, str) and obj == "NaN"):
            return None
        return obj

    path = OUT / filename
    with path.open("w", encoding="utf-8") as f:
        json.dump(to_json_safe(data), f, ensure_ascii=False, default=str)
    size = path.stat().st_size
    print(f"  Wrote {path} ({size / 1024:.1f} KB)")


# Run the build only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()