Spaces:
Running
Running
| from pathlib import Path | |
| import numpy as np | |
| import pandas as pd | |
# Absolute path of the directory that contains this script. The result parquet
# files are globbed from here and the combined/leaderboard files are written here.
DATA_DIR = Path(__file__).absolute().parent
# Scalar metrics (one value per result row). They are coerced to numeric,
# averaged per model and used for ranking. Kept in one place so the dtype
# normalization and the aggregation cannot drift apart.
_SCALAR_METRICS = [
    "energy-diff-flip-times",
    "tortuosity",
    "spearman-compression-energy",
    "spearman-compression-derivative",
    "spearman-tension-energy",
]

# Columns that are stored as strings.
_STRING_COLUMNS = ("model", "structure", "formula", "id")

# Columns whose cells hold list/array values that are converted to float64 arrays.
_ARRAY_COLUMNS = ("volume-ratio", "energy-delta-per-atom")


def _load_result_frames(result_files):
    """Read every parquet file in *result_files*; report and skip files that fail to load."""
    frames = []
    for fpath in result_files:
        try:
            frames.append(pd.read_parquet(fpath))
        except Exception as e:  # deliberate best-effort: one bad file must not abort the run
            print(f"Error loading {fpath.name}: {e}")
    return frames


def _normalize_dtypes(df_all):
    """Coerce the known columns of *df_all* to consistent dtypes (in place) and return it."""
    for col in df_all.columns:
        if col in _STRING_COLUMNS:
            df_all[col] = df_all[col].astype(str)
        elif col == "missing":
            df_all[col] = df_all[col].astype(bool)
        elif col in _SCALAR_METRICS:
            # Single-value metrics: anything non-numeric becomes NaN.
            df_all[col] = pd.to_numeric(df_all[col], errors="coerce")
        elif col in _ARRAY_COLUMNS:
            # Make the array contents consistently float64; other cell values are left as-is.
            df_all[col] = df_all[col].apply(
                lambda x: np.array(x, dtype=np.float64) if isinstance(x, (list, np.ndarray)) else x
            )
    return df_all


def _build_leaderboard(df_all):
    """Aggregate per-model means, stds, missing counts and ranks into the leaderboard table."""
    metrics = _SCALAR_METRICS

    # Only valid (not missing) rows contribute to the means and stds.
    df_valid = df_all[~df_all["missing"]]
    summary_means = df_valid.groupby("model")[metrics].mean()
    summary_stds = df_valid.groupby("model")[metrics].std().rename(columns={m: f"{m}-std" for m in metrics})

    # The missing count is computed over all rows, so every model appears here
    # even if it has no valid row at all.
    summary_missing = df_all.groupby("model")["missing"].sum().astype(int).to_frame("missing")

    leaderboard = pd.concat([summary_means, summary_stds, summary_missing], axis=1).reset_index()

    def rank_of(values, ascending=True):
        # na_option="bottom": a model without any valid result has NaN means.
        # Ranking it last (instead of NaN) keeps the rank sum finite, so the
        # integer casts below cannot fail.
        return values.rank(ascending=ascending, method="min", na_option="bottom")

    # flip_rank: smaller absolute difference from 1 is better
    leaderboard["flip_rank"] = rank_of((leaderboard["energy-diff-flip-times"] - 1).abs())
    # tortuosity_rank: smaller is better
    leaderboard["tortuosity_rank"] = rank_of(leaderboard["tortuosity"])
    # spearman_compression_energy_rank: smaller (more negative) is better
    leaderboard["spearman_compression_energy_rank"] = rank_of(leaderboard["spearman-compression-energy"])
    # spearman_compression_derivative_rank: larger is better
    leaderboard["spearman_compression_derivative_rank"] = rank_of(
        leaderboard["spearman-compression-derivative"], ascending=False
    )
    # spearman_tension_energy_rank: larger is better
    leaderboard["spearman_tension_energy_rank"] = rank_of(leaderboard["spearman-tension-energy"], ascending=False)
    # missing_rank: fewer missing results is better
    leaderboard["missing_rank"] = rank_of(leaderboard["missing"])

    rank_columns = [
        "flip_rank",
        "tortuosity_rank",
        "spearman_compression_energy_rank",
        "spearman_compression_derivative_rank",
        "spearman_tension_energy_rank",
        "missing_rank",
    ]
    # Aggregate rank = sum of the individual ranks; the final rank orders that sum.
    leaderboard["rank-aggregation"] = leaderboard[rank_columns].sum(axis=1).astype(int)
    leaderboard["rank"] = leaderboard["rank-aggregation"].rank(method="min").astype(int)

    cols_ordered = [
        "model",
        "rank",
        "rank-aggregation",
        *metrics,
        "missing",
        *[f"{m}-std" for m in metrics],
    ]
    # reindex drops the helper rank columns and adds any absent column as NaN.
    return leaderboard.reindex(columns=cols_ordered).sort_values("rank")


def summarize():
    """
    Summarize all benchmark results (*_results.parquet) found in DATA_DIR.

    Steps:
      1. Load every ``*_results.parquet`` file except ``all_results.parquet``.
      2. Concatenate them, normalize column dtypes and write the combined
         table to ``all_results.parquet``.
      3. Aggregate the scalar metrics per model (mean and std over rows that
         are not flagged ``missing``) and count missing rows per model.
      4. Rank the models per metric, sum the ranks and derive the final rank.
         Models without any valid row are ranked last on each metric.
      5. Write ``leaderboard.csv`` and ``leaderboard.tex`` and print a short
         overview.

    Returns:
        The leaderboard DataFrame sorted by rank, or None when no result
        file is found or none of them can be loaded.
    """
    # 1. Find and load all result parquet files (the combined output file is excluded).
    result_files = sorted(f for f in DATA_DIR.glob("*_results.parquet") if f.name != "all_results.parquet")
    if not result_files:
        print("No result parquet files found (*_results.parquet).")
        return None

    print(f"Found {len(result_files)} result files. Loading metrics...")
    frames = _load_result_frames(result_files)
    if not frames:
        print("No data could be loaded.")
        return None

    # 2. Combine, then make dtypes consistent before saving and summarizing
    #    (intended to avoid PyArrow errors from mixed float32/float64 or object columns).
    df_all = _normalize_dtypes(pd.concat(frames, ignore_index=True))

    results_fpath = DATA_DIR / "all_results.parquet"
    df_all.to_parquet(results_fpath)
    print(f"Combined {len(df_all)} results into {results_fpath.name}")

    # 3./4. Per-model aggregation and ranking.
    leaderboard = _build_leaderboard(df_all)

    # 5. Export to CSV and LaTeX.
    leaderboard.to_csv(DATA_DIR / "leaderboard.csv", index=False)
    leaderboard.to_latex(DATA_DIR / "leaderboard.tex", index=False, float_format="%.3f")

    print("\nBenchmark Leaderboard:")
    print(leaderboard[["model", "rank", "missing"]].to_string(index=False))
    print(f"\nFinal results exported to {DATA_DIR / 'leaderboard.csv'} and {DATA_DIR / 'leaderboard.tex'}")
    return leaderboard
# Script entry point: build the leaderboard only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    summarize()