import os
import json
from collections import OrderedDict, defaultdict

def insert_sorted_acc_fields(result_dict):
    """Return an OrderedDict with "model" first, then the *_acc keys in sorted order, then all remaining keys."""
    # Pop every *_acc key so the keys can be re-inserted in sorted order
    acc_fields = {
        k: result_dict.pop(k) for k in list(result_dict.keys())
        if k != "model" and k.endswith("_acc")
    }

    # Sort acc keys
    sorted_acc_fields = dict(sorted(acc_fields.items()))

    # Rebuild the OrderedDict with model first, then sorted accs, then the rest
    reordered = OrderedDict()
    reordered["model"] = result_dict["model"]
    reordered.update(sorted_acc_fields)
    reordered.update(result_dict)  # remaining keys are details

    return reordered
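

# A tiny illustrative check (values are made up): the reordered dict should
# list "model" first, then the *_acc keys alphabetically, then the rest.
def _demo_insert_sorted_acc_fields():
    row = OrderedDict(
        [("model", "7B"), ("math500_acc", 80.2), ("aime24_acc", 30.0), ("avg_metadata", {})]
    )
    print(list(insert_sorted_acc_fields(row).keys()))
    # -> ['model', 'aime24_acc', 'math500_acc', 'avg_metadata']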

def convert_latex_table(data, selected_data=None):
    """
    Convert a list of dicts into a LaTeX table, sorted by descending average accuracy.

    Args:
        data (List[Dict]): your JSON‐like list.
        selected_data (List[str], optional):
            List of metric names _without_ the '_acc' suffix to include.
            E.g. ['aime24', 'amc23', 'hmmt_2024'].
            Defaults to all metrics found in data except 'avg_acc'.
    Returns:
        str: the LaTeX code for a table.
    """
    # 1. Infer all available metrics (minus the pre-computed avg_acc) if none specified
    if selected_data is None:
        selected_data = sorted(
            k[:-4] for k in data[0].keys()
            if k.endswith('_acc') and k != "avg_acc"
        )

    # 2. Build rows: clean model name, grab each metric, compute new average
    rows = []
    for item in data:
        model_name = item["model"].replace("_temp0_n1_seed2", "")
        vals = []
        for metric in selected_data:
            key = f"{metric}_acc"
            vals.append(float(item.get(key, 0.0)))
        avg_selected = sum(vals) / len(vals) if vals else 0.0
        # Shorten fine-tuned variant names and escape underscores for LaTeX;
        # the bare baseline name "Qwen2.5-7B" needs neither.
        if model_name != "Qwen2.5-7B":
            model_name = model_name.replace("Qwen2.5-7B", "7B").replace("_stage1", "").replace("qwen2.5-7b", "7B")
            model_name = model_name.replace("_", "\\_")
        rows.append((model_name, vals, avg_selected))

    # 3. Sort rows by avg_selected descending
    rows.sort(key=lambda x: x[2], reverse=True)

    # 4. Start LaTeX
    col_spec = "l" + "r" * (len(selected_data) + 1)
    header = ["Model"] + [m.replace("_", r"\_") for m in selected_data] + ["Avg"]
    header = " & ".join(header) + r" \\"
    # Abbreviate long benchmark names so the header stays compact
    header = header.replace("livemathbench", "livemath").replace("olympiadbench", "olympiad").replace("minerva\\_math", "minerva").replace("hmmt\\_2024", "hmmt24")

    lines = []
    lines.append(r"\begin{table}[ht]")
    lines.append(r"\centering")
    lines.append(rf"\begin{{tabular}}{{{col_spec}}}")
    lines.append(r"\toprule")
    lines.append(header)
    lines.append(r"\midrule")
    for model, vals, avg in rows:
        formatted = [f"{v:.1f}" for v in vals] + [f"{avg:.1f}"]
        lines.append(" & ".join([model] + formatted) + r" \\")
    lines.append(r"\bottomrule")
    lines.append(r"\end{tabular}")
    lines.append(r"\caption{Model accuracies on selected benchmarks, sorted by average}")
    lines.append(r"\label{tab:acc_sorted}")
    lines.append(r"\end{table}")

    return "\n".join(lines)
def compute_method_ranks(data, selected_models=None, selected_data=None):
    """
    Compute, for each metric, the rank of each model (1 = best accuracy).

    Args:
        data (List[Dict]): your JSON‐like list of dicts.
        selected_models (List[str], optional):
            List of clean model names (with "_temp0_n1_seed2" already stripped)
            whose ranks you care about.  If None, returns ranks for _all_ models.
        selected_data (List[str], optional):
            List of metric names _without_ the "_acc" suffix.  If None,
            defaults to all keys ending in "_acc" except "avg_acc".

    Returns:
        Dict[str, Dict[str,int]]:  
            Outer: metric →  
            Inner: model_name → rank (1 = highest accuracy)
    """
    # 1. Determine which metrics to rank
    if selected_data is None:
        selected_data = sorted(
            k[:-4] for k in data[0].keys()
            if k.endswith("_acc") and k != "avg_acc"
        )

    # 2. Prepare clean model names + parsed accuracies
    models = []
    for item in data:
        clean_name = item["model"].replace("_temp0_n1_seed2", "")
        models.append((clean_name, item))

    # 3. For each metric, sort and assign ranks
    all_ranks = {}
    for metric in selected_data:
        key = f"{metric}_acc"
        # build list of (model, float(acc))
        vals = [
            (name, float(item.get(key, 0.0)))
            for name, item in models
        ]
        # sort desc by accuracy
        vals.sort(key=lambda x: x[1], reverse=True)
        # Assign 1-based competition ranks: tied scores share a rank, and the
        # next distinct score takes its 1-based position (e.g. 1, 2, 2, 4).
        ranks = {}
        prev_score = None
        prev_rank = 0
        for idx, (name, score) in enumerate(vals, start=1):
            if score == prev_score:
                rank = prev_rank
            else:
                rank = idx
            ranks[name] = rank
            prev_score, prev_rank = score, rank

        # if user only wants a subset, filter
        if selected_models is not None:
            ranks = {m: ranks[m] for m in selected_models if m in ranks}

        all_ranks[metric] = ranks

    return all_ranks
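

# Hypothetical follow-up: average each model's per-metric rank into a single
# score (lower is better). The metric names are illustrative assumptions.
def _demo_mean_ranks(data):
    ranks = compute_method_ranks(data, selected_data=["aime24", "amc23"])
    per_model = defaultdict(list)
    for metric_ranks in ranks.values():
        for model, rank in metric_ranks.items():
            per_model[model].append(rank)
    for model, rs in sorted(per_model.items(), key=lambda kv: sum(kv[1]) / len(kv[1])):
        print(f"{model}: mean rank {sum(rs) / len(rs):.2f}")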

def collect_eval_results_by_prefix(root):
    """
    Walk every model directory under `root`, read each benchmark accuracy
    from eval_results/<global_step>/<benchmark>/*_metrics.json, and write
    the combined, model-sorted list to <root>/combined_eval_results.json.
    """
    all_results = []

    for model_dir in os.listdir(root):
        model_path = os.path.join(root, model_dir)
        if not os.path.isdir(model_path):
            continue

        # Look for the eval_results directory and its subdirectories
        eval_results_dir = os.path.join(model_path, "eval_results")
        if not os.path.isdir(eval_results_dir):
            print(f"⚠️ Missing eval_results directory for: {model_dir}")
            continue
            
        # Find the global_step subdirectories (typically there is exactly one)
        global_step_dirs = [d for d in os.listdir(eval_results_dir) if os.path.isdir(os.path.join(eval_results_dir, d))]
        if not global_step_dirs:
            print(f"⚠️ No global step directories found in: {eval_results_dir}")
            continue

        # Use the lexicographically first global step directory (usually global_step_0);
        # sorting makes the pick deterministic, since os.listdir() order is platform-dependent
        global_step_dir = os.path.join(eval_results_dir, sorted(global_step_dirs)[0])
        
        # Create a new result entry for this model
        result = OrderedDict()
        result["model"] = model_dir
        
        # Collect accuracies from each benchmark directory
        benchmark_dirs = [d for d in os.listdir(global_step_dir) if os.path.isdir(os.path.join(global_step_dir, d))]
        
        for benchmark in benchmark_dirs:
            if "livemath" in benchmark :
                # skip livemathbench or "aime25" in benchmark
                continue
            benchmark_path = os.path.join(global_step_dir, benchmark)
            
            # Look for the metrics json file
            metrics_files = [f for f in os.listdir(benchmark_path) if f.endswith('_metrics.json')]
            if not metrics_files:
                print(f"⚠️ No metrics file found for {model_dir}/{benchmark}")
                continue
                
            # Use the first metrics file found
            metrics_file = os.path.join(benchmark_path, metrics_files[0])
            
            try:
                with open(metrics_file, 'r') as f:
                    metrics_data = json.load(f)
                    
                # Extract the accuracy value
                if 'acc' in metrics_data:
                    result[f"{benchmark}_acc"] = metrics_data['acc']
                else:
                    print(f"⚠️ No accuracy found in {metrics_file}")
            except Exception as e:
                print(f"⚠️ Error reading {metrics_file}: {e}")
        
        # Only add results if we have some accuracies
        if len(result) > 1:  # More than just the "model" key
            # Calculate average accuracy
            acc_values = [v for k, v in result.items() if k.endswith('_acc')]
            if acc_values:
                avg_acc = sum(acc_values) / len(acc_values)
                result["avg_acc"] = round(avg_acc, 1)
                
                # Add metadata about how many benchmarks were averaged
                result["avg_metadata"] = {
                    "num_benchmarks": len(acc_values),
                    "benchmarks": [k[:-4] for k in result.keys() if k.endswith('_acc') and k != "avg_acc"]
                }
            
            result = insert_sorted_acc_fields(result)
            all_results.append(result)
        else:
            print(f"⚠️ No accuracies found for {model_dir}")

    # sort by model name
    all_results.sort(key=lambda x: x["model"])
    output_path = os.path.join(root, "combined_eval_results.json")
    with open(output_path, "w") as f:
        json.dump(all_results, f, indent=2)

    print(f"✅ Saved structured JSON to {output_path}")
# Example usage: build combined_eval_results.json for every checkpoint.
if __name__ == "__main__":
    collect_eval_results_by_prefix("./EVAL/checkpoints")