Spaces:
Running
Running
File size: 5,917 Bytes
1508879 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 | from pathlib import Path
import numpy as np
import pandas as pd
# Absolute path of the directory containing this script. summarize() looks for
# *_results.parquet files here and writes its output files here.
DATA_DIR = Path(__file__).parent.absolute()
# Scalar per-row metrics that are averaged per model for the leaderboard.
_SCALAR_METRICS = (
    "energy-diff-flip-times",
    "tortuosity",
    "spearman-compression-energy",
    "spearman-compression-derivative",
    "spearman-tension-energy",
)
# Identifier-like columns that are stored as strings.
_TEXT_COLUMNS = ("model", "structure", "formula", "id")
# Columns whose cells hold a sequence of numbers per row.
_ARRAY_COLUMNS = ("volume-ratio", "energy-delta-per-atom")
# Column order of the exported leaderboard.
_LEADERBOARD_COLUMNS = (
    "model",
    "rank",
    "rank-aggregation",
    *_SCALAR_METRICS,
    "missing",
    *(f"{metric}-std" for metric in _SCALAR_METRICS),
)


def _load_result_frames(result_files):
    """Read each parquet file, reporting and skipping any that fail to load."""
    frames = []
    for fpath in result_files:
        try:
            frames.append(pd.read_parquet(fpath))
        except Exception as e:  # best effort: one unreadable file must not abort the run
            print(f"Error loading {fpath.name}: {e}")
    return frames


def _normalize_dtypes(df_all):
    """Coerce the known columns of ``df_all`` to consistent dtypes, in place.

    Consistent dtypes avoid PyArrow errors (mixed float32/float64 or object
    types) when the combined frame is written back to parquet.
    """
    for col in df_all.columns:
        if col in _TEXT_COLUMNS:
            df_all[col] = df_all[col].astype(str)
        elif col == "missing":
            df_all[col] = df_all[col].astype(bool)
        elif col in _SCALAR_METRICS:
            # Single-value metrics become numeric; unparsable entries become NaN.
            df_all[col] = pd.to_numeric(df_all[col], errors="coerce")
        elif col in _ARRAY_COLUMNS:
            # Only list/ndarray cells are converted; anything else is kept as is.
            df_all[col] = df_all[col].apply(
                lambda x: np.array(x, dtype=np.float64) if isinstance(x, (list, np.ndarray)) else x
            )
    return df_all


def _rank(values, *, ascending):
    """Rank ``values`` with ties sharing the lowest rank and NaN ranked last.

    ``na_option="bottom"`` keeps every rank finite, so a model without any
    valid value for a metric is ranked worst instead of producing a NaN that
    later breaks the integer conversion of the aggregated rank.
    """
    return values.rank(ascending=ascending, method="min", na_option="bottom")


def _build_leaderboard(df_all):
    """Aggregate the combined results per model and compute the ranking."""
    metrics = list(_SCALAR_METRICS)

    # Means and stds only use valid (not missing) rows ...
    valid_by_model = df_all[~df_all["missing"]].groupby("model")[metrics]
    means = valid_by_model.mean()
    stds = valid_by_model.std().rename(columns={m: f"{m}-std" for m in metrics})
    # ... while the missing count covers every row of a model.
    missing = df_all.groupby("model")["missing"].sum().astype(int).to_frame("missing")

    # Models without any valid row only appear in `missing`; their metric
    # columns are NaN after this outer alignment.
    leaderboard = pd.concat([means, stds, missing], axis=1).rename_axis("model").reset_index()

    rank_parts = [
        # smaller absolute difference from 1 is better
        _rank((leaderboard["energy-diff-flip-times"] - 1).abs(), ascending=True),
        # smaller is better
        _rank(leaderboard["tortuosity"], ascending=True),
        # smaller (more negative) is better
        _rank(leaderboard["spearman-compression-energy"], ascending=True),
        # larger is better
        _rank(leaderboard["spearman-compression-derivative"], ascending=False),
        # larger is better
        _rank(leaderboard["spearman-tension-energy"], ascending=False),
        # fewer missing results is better
        _rank(leaderboard["missing"], ascending=True),
    ]
    leaderboard["rank-aggregation"] = sum(rank_parts).astype(int)
    leaderboard["rank"] = leaderboard["rank-aggregation"].rank(method="min").astype(int)

    # reindex both orders the columns and creates any absent one filled with NaN.
    return leaderboard.reindex(columns=list(_LEADERBOARD_COLUMNS)).sort_values("rank")


def summarize(data_dir: "Path | str | None" = None) -> "pd.DataFrame | None":
    """Summarize all benchmark results (``*_results.parquet``) in a directory.

    Loads every ``*_results.parquet`` file except ``all_results.parquet``,
    writes the combined rows to ``all_results.parquet``, aggregates the
    metrics per model, ranks the models, and exports ``leaderboard.csv`` and
    ``leaderboard.tex`` to the same directory.

    Args:
        data_dir: Directory to read results from and write outputs to.
            Defaults to the module-level ``DATA_DIR``.

    Returns:
        The leaderboard sorted by rank, or ``None`` when no result file is
        found or none of them can be loaded.
    """
    base_dir = DATA_DIR if data_dir is None else Path(data_dir)

    # 1. Find and load all result parquet files (the combined output is excluded)
    result_files = sorted(
        f for f in base_dir.glob("*_results.parquet") if f.name != "all_results.parquet"
    )
    if not result_files:
        print("No result parquet files found (*_results.parquet).")
        return None

    print(f"Found {len(result_files)} result files. Loading metrics...")
    frames = _load_result_frames(result_files)
    if not frames:
        print("No data could be loaded.")
        return None

    # 2. Combine the individual results and make dtypes consistent before saving
    df_all = _normalize_dtypes(pd.concat(frames, ignore_index=True))

    results_fpath = base_dir / "all_results.parquet"
    df_all.to_parquet(results_fpath)
    print(f"Combined {len(df_all)} results into {results_fpath.name}")

    # 3. Aggregate per model and rank
    leaderboard = _build_leaderboard(df_all)

    # 4. Export to CSV and LaTeX
    csv_path = base_dir / "leaderboard.csv"
    tex_path = base_dir / "leaderboard.tex"
    leaderboard.to_csv(csv_path, index=False)
    leaderboard.to_latex(tex_path, index=False, float_format="%.3f")

    print("\nBenchmark Leaderboard:")
    print(leaderboard[["model", "rank", "missing"]].to_string(index=False))
    print(f"\nFinal results exported to {csv_path} and {tex_path}")
    return leaderboard
# Run the summary when this file is executed as a script.
if __name__ == "__main__":
    summarize()
|