# mlip-arena/benchmarks/ev/aggregate.py
# github-actions[ci]
# Clean sync from main branch - 2026-03-27 03:03:42
# 1508879
from pathlib import Path
import numpy as np
import pandas as pd
# Absolute path of the directory that holds this script. The per-run
# ``*_results.parquet`` files are looked up here, and the combined results and
# leaderboard files are written back to the same place.
DATA_DIR = Path(__file__).absolute().parent
def summarize(data_dir: "str | Path | None" = None) -> "pd.DataFrame | None":
    """
    Summarize all benchmark results (*_results.parquet) in a directory.

    Loads every ``*_results.parquet`` file (``all_results.parquet`` itself is
    skipped), concatenates them, writes the combined table to
    ``all_results.parquet``, aggregates the scalar metrics per model, ranks
    the models, and exports ``leaderboard.csv`` and ``leaderboard.tex``.

    Args:
        data_dir: Directory to read result files from and to write the
            outputs to. Defaults to ``DATA_DIR``.

    Returns:
        The leaderboard DataFrame sorted by rank, or ``None`` when no result
        file is found or none of them can be loaded.
    """
    data_dir = DATA_DIR if data_dir is None else Path(data_dir)

    # 1. Find and load all result parquet files
    df_all = _load_results(data_dir)
    if df_all is None:
        return None

    # 2. Ensure consistent dtypes before saving and summarizing
    df_all = _coerce_dtypes(df_all)

    # Expose combined results
    results_fpath = data_dir / "all_results.parquet"
    df_all.to_parquet(results_fpath)
    print(f"Combined {len(df_all)} results into {results_fpath.name}")

    # 3. Calculate summary metrics per model, 4. rank the models
    leaderboard = _rank_models(_build_leaderboard(df_all))

    # 5. Clean up and export
    # reindex() both orders the columns and creates (as NaN) any that are absent.
    leaderboard = leaderboard.reindex(columns=list(_LEADERBOARD_COLUMNS)).sort_values("rank")

    csv_path = data_dir / "leaderboard.csv"
    tex_path = data_dir / "leaderboard.tex"
    leaderboard.to_csv(csv_path, index=False)
    leaderboard.to_latex(tex_path, index=False, float_format="%.3f")

    print("\nBenchmark Leaderboard:")
    print(leaderboard[["model", "rank", "missing"]].to_string(index=False))
    print(f"\nFinal results exported to {csv_path} and {tex_path}")
    return leaderboard


# Columns that are forced to ``str`` before saving.
_STRING_COLUMNS = ("model", "structure", "formula", "id")

# Single-value metrics: coerced to numeric, then averaged per model.
_SCALAR_METRICS = (
    "energy-diff-flip-times",
    "tortuosity",
    "spearman-compression-energy",
    "spearman-compression-derivative",
    "spearman-tension-energy",
)

# Columns whose cells hold sequences that are converted to float64 arrays.
_ARRAY_COLUMNS = ("volume-ratio", "energy-delta-per-atom")

# Column order of the exported leaderboard.
_LEADERBOARD_COLUMNS = (
    "model",
    "rank",
    "rank-aggregation",
    *_SCALAR_METRICS,
    "missing",
    *(f"{metric}-std" for metric in _SCALAR_METRICS),
)


def _load_results(data_dir):
    """Load and concatenate the result files in *data_dir*; return None if nothing loads."""
    result_files = sorted(
        f for f in data_dir.glob("*_results.parquet") if f.name != "all_results.parquet"
    )
    if not result_files:
        print("No result parquet files found (*_results.parquet).")
        return None

    print(f"Found {len(result_files)} result files. Loading metrics...")
    dfs = []
    for f in result_files:
        try:
            dfs.append(pd.read_parquet(f))
        except Exception as e:
            # Deliberately best-effort: one unreadable file must not abort
            # the whole aggregation, so report it and carry on.
            print(f"Error loading {f.name}: {e}")

    if not dfs:
        print("No data could be loaded.")
        return None

    # Combine all individual results
    return pd.concat(dfs, ignore_index=True)


def _as_float64_array(value):
    """Return list/ndarray cells as float64 arrays; pass anything else through."""
    # Tuple form of isinstance works on every Python 3 version.
    if isinstance(value, (list, np.ndarray)):
        return np.array(value, dtype=np.float64)
    return value


def _coerce_dtypes(df_all):
    """Normalize column dtypes in place (and return the frame).

    This prevents PyArrow errors (mixed float32/float64 or object types).
    """
    for col in df_all.columns:
        if col in _STRING_COLUMNS:
            df_all[col] = df_all[col].astype(str)
        elif col == "missing":
            # NOTE(review): a NaN flag (e.g. rows from a file that lacks this
            # column) becomes True under astype(bool) - confirm this is intended.
            df_all[col] = df_all[col].astype(bool)
        elif col in _SCALAR_METRICS:
            # Convert single-value metrics to float64
            df_all[col] = pd.to_numeric(df_all[col], errors="coerce")
        elif col in _ARRAY_COLUMNS:
            # Ensure array contents are consistent float64
            df_all[col] = df_all[col].apply(_as_float64_array)

    # The aggregation below filters and counts on "missing"; without this
    # default, inputs that never recorded the flag would raise a KeyError.
    if "missing" not in df_all.columns:
        df_all["missing"] = False
    return df_all


def _build_leaderboard(df_all):
    """Aggregate per-model metric means, stds and missing counts into one table."""
    metrics = list(_SCALAR_METRICS)

    # Only include valid (not missing) results for means and stds
    valid_by_model = df_all[~df_all["missing"]].groupby("model")[metrics]
    summary_means = valid_by_model.mean()
    summary_stds = valid_by_model.std().rename(columns={m: f"{m}-std" for m in metrics})

    # Missing count per model is taken over all attempts, so models with no
    # valid result at all still get a leaderboard row (with NaN metrics).
    summary_missing = df_all.groupby("model")["missing"].sum().astype(int).to_frame("missing")

    leaderboard = pd.concat([summary_means, summary_stds, summary_missing], axis=1)
    return leaderboard.reset_index()


def _rank_models(leaderboard):
    """Add the per-metric rank columns, ``rank-aggregation`` and ``rank``."""

    def rank(values, ascending=True):
        # na_option="bottom" ranks models without a usable metric last instead
        # of giving them a NaN rank, which would make the integer cast of the
        # aggregated rank below raise.
        return values.rank(ascending=ascending, method="min", na_option="bottom")

    # flip_rank: smaller absolute difference from 1 is better
    leaderboard["flip_rank"] = rank((leaderboard["energy-diff-flip-times"] - 1).abs())
    # tortuosity_rank: smaller is better (minimum is 1)
    leaderboard["tortuosity_rank"] = rank(leaderboard["tortuosity"])
    # spearman_compression_energy_rank: smaller (more negative) is better
    leaderboard["spearman_compression_energy_rank"] = rank(leaderboard["spearman-compression-energy"])
    # spearman_compression_derivative_rank: larger is better
    leaderboard["spearman_compression_derivative_rank"] = rank(
        leaderboard["spearman-compression-derivative"], ascending=False
    )
    # spearman_tension_energy_rank: larger is better
    leaderboard["spearman_tension_energy_rank"] = rank(
        leaderboard["spearman-tension-energy"], ascending=False
    )
    # missing_rank: fewer failures/missing data is better
    leaderboard["missing_rank"] = rank(leaderboard["missing"])

    # Aggregate rank: plain sum of the individual ranks (lower is better)
    rank_columns = [
        "flip_rank",
        "tortuosity_rank",
        "spearman_compression_energy_rank",
        "spearman_compression_derivative_rank",
        "spearman_tension_energy_rank",
        "missing_rank",
    ]
    leaderboard["rank-aggregation"] = leaderboard[rank_columns].sum(axis=1).astype(int)
    leaderboard["rank"] = leaderboard["rank-aggregation"].rank(method="min").astype(int)
    return leaderboard
# Script entry point: rebuild the combined results and the leaderboard files
# when this module is executed directly (not when it is imported).
if __name__ == "__main__":
    summarize()