File size: 3,458 Bytes
d64ffef
 
43752dc
 
d64ffef
 
 
 
 
 
 
 
 
 
 
 
7901bf8
 
 
d64ffef
 
 
 
 
 
 
 
 
 
7901bf8
d64ffef
7901bf8
d64ffef
 
 
 
 
 
 
7901bf8
d64ffef
 
 
 
 
 
 
 
 
 
7901bf8
d64ffef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7901bf8
 
d64ffef
43752dc
 
 
7901bf8
43752dc
 
 
 
 
 
 
d64ffef
 
 
 
46dbc41
 
 
d64ffef
7901bf8
d64ffef
7901bf8
d64ffef
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import pandas as pd

# Medal emoji prepended to the top-three models in the final leaderboard.
MEDALS = {0: "🥇", 1: "🥈", 2: "🥉"}


def compute_leaderboard(df: pd.DataFrame) -> pd.DataFrame:
    """Compute average rank and median value per model for each metric.

    Ranking procedure:
    1. Rank models within each (metric, subdataset, frequency, cutoff) group
       (lower value = better; ties share the minimum rank).
    2. Average ranks across cutoff dates for each
       (metric, subdataset, frequency, model).
    3. Average across all (subdataset, frequency) combos for each
       (metric, model).

    Side effect: prints a per-subdataset rank table for each metric, for
    manual inspection.

    Args:
        df: Long-format results with columns ``metric``, ``subdataset``,
            ``frequency``, ``cutoff``, ``model`` and ``value``.

    Returns:
        A dataframe with columns ``model``, ``avg <METRIC>`` (median metric
        value) and ``rank <METRIC>`` (average rank) for every metric present
        in ``df``, sorted best-to-worst, with medal emoji prepended to the
        top-three model names.
    """
    ranked = df.copy()
    # Step 1: rank within each evaluation cell. method="min" gives tied
    # models the same (best available) rank.
    ranked["rank"] = ranked.groupby(["metric", "subdataset", "frequency", "cutoff"])[
        "value"
    ].rank(method="min")

    # Step 2: average ranks across cutoffs per (metric, subdataset, frequency, model)
    per_subdataset = (
        ranked.groupby(["metric", "subdataset", "frequency", "model"])["rank"]
        .mean()
        .reset_index()
    )

    # Print per-subdataset ranks for manual inspection
    for metric in sorted(per_subdataset["metric"].unique()):
        print(f"\n{'=' * 60}")
        print(f"Metric: {metric}")
        print(f"{'=' * 60}")
        sub = per_subdataset[per_subdataset["metric"] == metric]
        pivot = sub.pivot_table(
            index=["subdataset", "frequency"], columns="model", values="rank"
        )
        print(pivot.to_string())

    # Step 3: average across all (subdataset, frequency) combos
    overall = per_subdataset.groupby(["metric", "model"])["rank"].mean().reset_index()

    # Pivot so each metric becomes a column, e.g. "rank CRPS"
    leaderboard = overall.pivot(index="model", columns="metric", values="rank")
    leaderboard = leaderboard.rename(
        columns={m: f"rank {m.upper()}" for m in leaderboard.columns}
    )

    # Median metric value across all (subdataset, frequency, cutoff) rows per
    # (metric, model). Median rather than mean so outlier series don't
    # dominate the displayed "avg" columns.
    avg_values = (
        df.groupby(["metric", "model"])["value"]
        .median()
        .reset_index()
        .pivot(index="model", columns="metric", values="value")
    )
    avg_values = avg_values.rename(
        columns={m: f"avg {m.upper()}" for m in avg_values.columns}
    )
    leaderboard = leaderboard.join(avg_values)

    # Order models by the mean of their per-metric ranks (best first), then
    # drop the helper column; reset_index turns "model" back into a column
    # and renumbers rows 0..n-1, which the medal loop below relies on.
    rank_cols = [c for c in leaderboard.columns if c.startswith("rank ")]
    leaderboard["avg_rank"] = leaderboard[rank_cols].mean(axis=1)
    leaderboard = leaderboard.sort_values("avg_rank")
    leaderboard = leaderboard.drop(columns="avg_rank").reset_index()

    # Round rank/value columns for display
    for col in leaderboard.columns:
        if col.startswith("rank ") or col.startswith("avg "):
            leaderboard[col] = leaderboard[col].round(3)

    # Prepend medal emoji to the top-three model names (positions past the
    # medals get an empty prefix, stripped away).
    leaderboard["model"] = [
        f"{MEDALS.get(i, '')} {m}".strip() for i, m in enumerate(leaderboard["model"])
    ]

    # Final column order: model, then avg columns, then rank columns
    avg_cols = sorted(c for c in leaderboard.columns if c.startswith("avg "))
    rank_cols = sorted(c for c in leaderboard.columns if c.startswith("rank "))
    leaderboard = leaderboard[["model"] + avg_cols + rank_cols]

    return leaderboard


if __name__ == "__main__":
    from data import load_data

    # Build the leaderboard from the loaded results and print it with a
    # banner matching the per-metric tables above.
    leaderboard = compute_leaderboard(load_data())
    separator = "=" * 60
    print(f"\n{separator}")
    print("LEADERBOARD")
    print(f"{separator}")
    print(leaderboard.to_string(index=False))