import pandas as pd

# Medal emoji prepended to the top-three models, keyed by leaderboard position.
MEDALS = {0: "🥇", 1: "🥈", 2: "🥉"}


def _print_per_subdataset_ranks(per_subdataset: pd.DataFrame) -> None:
    """Print a model-by-(subdataset, frequency) rank pivot for each metric.

    Purely diagnostic output for manual inspection; the input frame is not
    modified.
    """
    for metric in sorted(per_subdataset["metric"].unique()):
        print(f"\n{'='*60}")
        print(f"Metric: {metric}")
        print(f"{'='*60}")
        sub = per_subdataset[per_subdataset["metric"] == metric]
        pivot = sub.pivot_table(
            index=["subdataset", "frequency"], columns="model", values="rank"
        )
        print(pivot.to_string())


def compute_leaderboard(df: pd.DataFrame) -> pd.DataFrame:
    """Compute average rank and average value per model for each metric.

    Ranking procedure:
    1. Rank models within each (metric, subdataset, frequency, cutoff) group
       (ascending by value, so lower metric values rank better; ties share
       the minimum rank).
    2. Average ranks across cutoff dates for each
       (metric, subdataset, frequency, model).
    3. Average across all (subdataset, frequency) combos for each
       (metric, model).

    Args:
        df: Long-format results with columns ``metric``, ``subdataset``,
            ``frequency``, ``cutoff``, ``model`` and ``value``.

    Returns:
        One row per model, sorted best-first by mean rank, with columns:
        ``model``, then one ``avg <METRIC>`` column per metric (mean raw
        value across all groups, rounded to 4 decimals), then one
        ``rank <METRIC>`` column per metric (mean rank, rounded to
        2 decimals). The top three model names are prefixed with medal
        emoji.

    Side effects:
        Prints per-subdataset rank tables for manual inspection.
    """
    # Step 1: rank within each (metric, subdataset, frequency, cutoff) group.
    ranked = df.copy()
    ranked["rank"] = ranked.groupby(
        ["metric", "subdataset", "frequency", "cutoff"]
    )["value"].rank(method="min")

    # Step 2: average ranks across cutoffs per (metric, subdataset,
    # frequency, model).
    per_subdataset = (
        ranked.groupby(["metric", "subdataset", "frequency", "model"])["rank"]
        .mean()
        .reset_index()
    )
    _print_per_subdataset_ranks(per_subdataset)

    # Step 3: average across all (subdataset, frequency) combos.
    overall = (
        per_subdataset.groupby(["metric", "model"])["rank"]
        .mean()
        .reset_index()
    )

    # Pivot so each metric becomes a "rank <METRIC>" column.
    leaderboard = overall.pivot(index="model", columns="metric", values="rank")
    leaderboard = leaderboard.rename(
        columns={m: f"rank {m.upper()}" for m in leaderboard.columns}
    )

    # Average metric values: mean across all (subdataset, frequency, cutoff)
    # per (metric, model), joined on the model index.
    avg_values = (
        df.groupby(["metric", "model"])["value"]
        .mean()
        .reset_index()
        .pivot(index="model", columns="metric", values="value")
    )
    avg_values = avg_values.rename(
        columns={m: f"avg {m.upper()}" for m in avg_values.columns}
    )
    leaderboard = leaderboard.join(avg_values)

    # Order models by the mean of their per-metric ranks (best first); the
    # temporary avg_rank column is dropped once the order is fixed.
    rank_cols = [c for c in leaderboard.columns if c.startswith("rank ")]
    leaderboard["avg_rank"] = leaderboard[rank_cols].mean(axis=1)
    leaderboard = leaderboard.sort_values("avg_rank")
    leaderboard = leaderboard.drop(columns="avg_rank").reset_index()

    # Round for display.
    for col in leaderboard.columns:
        if col.startswith("rank "):
            leaderboard[col] = leaderboard[col].round(2)
        elif col.startswith("avg "):
            leaderboard[col] = leaderboard[col].round(4)

    # Prefix medals onto the top-three model names (positions 0-2).
    leaderboard["model"] = [
        f"{MEDALS.get(i, '')} {m}".strip()
        for i, m in enumerate(leaderboard["model"])
    ]

    # Reorder: model, avg columns, rank columns.
    avg_cols = sorted(c for c in leaderboard.columns if c.startswith("avg "))
    rank_cols = sorted(c for c in leaderboard.columns if c.startswith("rank "))
    return leaderboard[["model"] + avg_cols + rank_cols]


if __name__ == "__main__":
    df = pd.read_csv("mock_evaluation_results.csv")
    lb = compute_leaderboard(df)
    print(f"\n{'='*60}")
    print("LEADERBOARD")
    print(f"{'='*60}")
    print(lb.to_string(index=False))