File size: 5,917 Bytes
1508879
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
from pathlib import Path

import numpy as np
import pandas as pd

# Absolute path of the directory that holds this script; the result parquet
# files are looked up here and the combined/leaderboard files are written here.
DATA_DIR = Path(__file__).absolute().parent


# Scalar per-result metrics that are averaged (mean/std) per model.
# Defined once so dtype coercion and aggregation cannot drift apart.
_SCALAR_METRICS = [
    "energy-diff-flip-times",
    "tortuosity",
    "spearman-compression-energy",
    "spearman-compression-derivative",
    "spearman-tension-energy",
]


def _load_result_frames(result_files):
    """Read every parquet file, reporting and skipping the ones that fail to load."""
    frames = []
    for path in result_files:
        try:
            frames.append(pd.read_parquet(path))
        except Exception as exc:  # deliberate best-effort: one bad file must not abort the run
            print(f"Error loading {path.name}: {exc}")
    return frames


def _normalize_dtypes(df_all):
    """Coerce the known columns of the combined frame, in place, to consistent dtypes."""
    for col in df_all.columns:
        if col in ("model", "structure", "formula", "id"):
            df_all[col] = df_all[col].astype(str)
        elif col == "missing":
            df_all[col] = df_all[col].astype(bool)
        elif col in _SCALAR_METRICS:
            # Single-value metrics become float64; unparsable entries become NaN.
            df_all[col] = pd.to_numeric(df_all[col], errors="coerce")
        elif col in ("volume-ratio", "energy-delta-per-atom"):
            # Array-valued cells: force float64 contents, leave other cell values untouched.
            df_all[col] = df_all[col].apply(
                lambda x: np.array(x, dtype=np.float64) if isinstance(x, (list, np.ndarray)) else x
            )


def _build_leaderboard(df_all):
    """Aggregate per-model means, stds and missing counts and derive the rank columns."""
    if "missing" in df_all.columns:
        is_missing = df_all["missing"].astype(bool)
    else:
        # No "missing" flag recorded at all: treat every row as a valid result.
        is_missing = pd.Series(False, index=df_all.index)

    # reindex adds any absent metric column as NaN so the groupby below cannot KeyError.
    metric_frame = df_all.reindex(columns=["model", *_SCALAR_METRICS])
    valid_by_model = metric_frame[~is_missing].groupby("model")[_SCALAR_METRICS]

    # Means and stds only use valid (not missing) results.
    summary_means = valid_by_model.mean()
    summary_stds = valid_by_model.std().rename(columns={m: f"{m}-std" for m in _SCALAR_METRICS})
    # Missing count is taken over all attempts, so every model appears here.
    summary_missing = is_missing.groupby(df_all["model"]).sum().astype(int).to_frame("missing")

    leaderboard = pd.concat([summary_means, summary_stds, summary_missing], axis=1).reset_index()

    # A model without any valid result has NaN metrics. na_option="bottom" ranks it
    # last instead of producing NaN ranks, which would make the int casts below raise.
    rank_kwargs = {"method": "min", "na_option": "bottom"}

    # flip_rank: smaller absolute difference from 1 is better
    leaderboard["flip_rank"] = (leaderboard["energy-diff-flip-times"] - 1).abs().rank(ascending=True, **rank_kwargs)
    # tortuosity_rank: smaller is better
    leaderboard["tortuosity_rank"] = leaderboard["tortuosity"].rank(ascending=True, **rank_kwargs)
    # spearman_compression_energy_rank: smaller (more negative) is better
    leaderboard["spearman_compression_energy_rank"] = leaderboard["spearman-compression-energy"].rank(
        ascending=True, **rank_kwargs
    )
    # spearman_compression_derivative_rank: larger is better
    leaderboard["spearman_compression_derivative_rank"] = leaderboard["spearman-compression-derivative"].rank(
        ascending=False, **rank_kwargs
    )
    # spearman_tension_energy_rank: larger is better
    leaderboard["spearman_tension_energy_rank"] = leaderboard["spearman-tension-energy"].rank(
        ascending=False, **rank_kwargs
    )
    # missing_rank: fewer failures/missing data is better
    leaderboard["missing_rank"] = leaderboard["missing"].rank(ascending=True, **rank_kwargs)

    # Aggregate rank is the plain sum of the individual ranks; lower is better.
    rank_columns = [
        "flip_rank",
        "tortuosity_rank",
        "spearman_compression_energy_rank",
        "spearman_compression_derivative_rank",
        "spearman_tension_energy_rank",
        "missing_rank",
    ]
    leaderboard["rank-aggregation"] = leaderboard[rank_columns].sum(axis=1).astype(int)
    leaderboard["rank"] = leaderboard["rank-aggregation"].rank(method="min").astype(int)

    # Final column order of the exported table; the helper *_rank columns are dropped.
    cols_ordered = [
        "model",
        "rank",
        "rank-aggregation",
        *_SCALAR_METRICS,
        "missing",
        *(f"{m}-std" for m in _SCALAR_METRICS),
    ]
    return leaderboard.reindex(columns=cols_ordered).sort_values("rank")


def summarize() -> pd.DataFrame | None:
    """
    Summarize all benchmark results (*_results.parquet) found in DATA_DIR.

    Steps:
      1. Load every ``*_results.parquet`` file except ``all_results.parquet``;
         files that fail to load are reported and skipped.
      2. Normalize column dtypes and write the combined table to
         ``all_results.parquet``.
      3. Aggregate the scalar metrics per model (mean and std over results not
         flagged ``missing``) and count missing results per model.
      4. Rank the models per metric, sum the ranks into ``rank-aggregation`` and
         derive the final ``rank``. Models with no valid value for a metric are
         ranked last on that metric.
      5. Export ``leaderboard.csv`` and ``leaderboard.tex`` and print a short table.

    Returns:
        The leaderboard DataFrame sorted by ``rank``, or ``None`` when no result
        file was found or none could be loaded.
    """
    # 1. Find and load all result parquet files (skip our own combined output)
    result_files = sorted(f for f in DATA_DIR.glob("*_results.parquet") if f.name != "all_results.parquet")

    if not result_files:
        print("No result parquet files found (*_results.parquet).")
        return None

    print(f"Found {len(result_files)} result files. Loading metrics...")

    frames = _load_result_frames(result_files)
    if not frames:
        print("No data could be loaded.")
        return None

    df_all = pd.concat(frames, ignore_index=True)

    # 2. Ensure consistent dtypes before saving and summarizing
    # This prevents PyArrow errors (mixed float32/float64 or object types)
    _normalize_dtypes(df_all)

    results_fpath = DATA_DIR / "all_results.parquet"
    df_all.to_parquet(results_fpath)
    print(f"Combined {len(df_all)} results into {results_fpath.name}")

    # 3./4. Per-model summary metrics and ranking
    leaderboard = _build_leaderboard(df_all)

    # 5. Export to CSV and LaTeX
    leaderboard.to_csv(DATA_DIR / "leaderboard.csv", index=False)
    leaderboard.to_latex(DATA_DIR / "leaderboard.tex", index=False, float_format="%.3f")

    print("\nBenchmark Leaderboard:")
    print(leaderboard[["model", "rank", "missing"]].to_string(index=False))
    print(f"\nFinal results exported to {DATA_DIR / 'leaderboard.csv'} and {DATA_DIR / 'leaderboard.tex'}")

    return leaderboard


# Script entry point: build the leaderboard only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    summarize()