import os
import csv
import json
from collections import OrderedDict, defaultdict


def insert_sorted_acc_fields(result_dict):
    """Reorder a result dict: "model" first, then the *_acc keys sorted, then the rest."""
    # Extract and remove all *_acc keys except "model"
    acc_fields = {
        k: result_dict.pop(k) for k in list(result_dict.keys())
        if k != "model" and k.endswith("_acc")
    }
    # Sort acc keys
    sorted_acc_fields = dict(sorted(acc_fields.items()))
    # Rebuild the OrderedDict with model first, then sorted accs, then the rest
    reordered = OrderedDict()
    reordered["model"] = result_dict["model"]
    reordered.update(sorted_acc_fields)
    reordered.update(result_dict)  # remaining keys are details
    return reordered
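

# Illustrative usage sketch (not part of the original pipeline; the model name
# and accuracy values below are made up): shows how insert_sorted_acc_fields
# moves the *_acc keys directly after "model" and sorts them alphabetically,
# keeping any remaining keys (such as metadata) at the end.
def _demo_insert_sorted_acc_fields():
    demo = OrderedDict([
        ("model", "demo-7B"),
        ("math500_acc", 71.2),
        ("avg_metadata", {"num_benchmarks": 2}),
        ("aime24_acc", 13.3),
    ])
    reordered = insert_sorted_acc_fields(demo)
    # Expected key order: model, aime24_acc, math500_acc, avg_metadata
    print(list(reordered.keys()))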


def convert_latex_table(data, selected_data=None):
    """
    Convert a list of dicts into a LaTeX table, sorted by descending average accuracy.

    Args:
        data (List[Dict]): your JSON-like list.
        selected_data (List[str], optional):
            List of metric names _without_ the '_acc' suffix to include.
            E.g. ['aime24', 'amc23', 'hmmt_2024'].
            Defaults to all metrics found in data except 'avg_acc'.

    Returns:
        str: the LaTeX code for a table.
    """
    # 1. Infer all available metrics (minus the pre-computed avg_acc) if none specified
    if selected_data is None:
        selected_data = sorted(
            k[:-4] for k in data[0].keys()
            if k.endswith('_acc') and k != "avg_acc"
        )

    # 2. Build rows: clean model name, grab each metric, compute new average
    rows = []
    for item in data:
        model_name = item["model"].replace("_temp0_n1_seed2", "")
        vals = []
        for metric in selected_data:
            key = f"{metric}_acc"
            vals.append(float(item.get(key, 0.0)))
        avg_selected = sum(vals) / len(vals) if vals else 0.0
        if model_name != "Qwen2.5-7B":
            model_name = model_name.replace("Qwen2.5-7B", "7B").replace("_stage1", "").replace("qwen2.5-7b", "7B")
        model_name = model_name.replace("_", "\\_")
        rows.append((model_name, vals, avg_selected))

    # 3. Sort rows by avg_selected descending
    rows.sort(key=lambda x: x[2], reverse=True)

    # 4. Start LaTeX
    col_spec = "l" + "r" * (len(selected_data) + 1)
    header = ["Model"] + [m.replace("_", r"\_") for m in selected_data] + ["Avg"]
    header = " & ".join(header) + r" \\"
    header = (
        header.replace("livemathbench", "livemath")
        .replace("olympiadbench", "olympiad")
        .replace("minerva\\_math", "minerva")
        .replace("hmmt\\_2024", "hmmt24")
    )

    lines = []
    lines.append(r"\begin{table}[ht]")
    lines.append(r"\centering")
    lines.append(rf"\begin{{tabular}}{{{col_spec}}}")
    lines.append(r"\toprule")
    lines.append(header)
    lines.append(r"\midrule")
    for model, vals, avg in rows:
        formatted = [f"{v:.1f}" for v in vals] + [f"{avg:.1f}"]
        lines.append(" & ".join([model] + formatted) + r" \\")
    lines.append(r"\bottomrule")
    lines.append(r"\end{tabular}")
    lines.append(r"\caption{Model accuracies on selected benchmarks, sorted by average}")
    lines.append(r"\label{tab:acc_sorted}")
    lines.append(r"\end{table}")
    return "\n".join(lines)


def compute_method_ranks(data, selected_models=None, selected_data=None):
    """
    Compute, for each metric, the rank of each model (1 = best accuracy).

    Args:
        data (List[Dict]): your JSON-like list of dicts.
        selected_models (List[str], optional):
            List of clean model names (with "_temp0_n1_seed2" already stripped)
            whose ranks you care about. If None, returns ranks for _all_ models.
        selected_data (List[str], optional):
            List of metric names _without_ the "_acc" suffix. If None,
            defaults to all keys ending in "_acc" except "avg_acc".

    Returns:
        Dict[str, Dict[str, int]]:
            Outer: metric →
            Inner: model_name → rank (1 = highest accuracy)
    """
    # 1. Determine which metrics to rank
    if selected_data is None:
        selected_data = sorted(
            k[:-4] for k in data[0].keys()
            if k.endswith("_acc") and k != "avg_acc"
        )

    # 2. Prepare clean model names + parsed accuracies
    models = []
    for item in data:
        clean_name = item["model"].replace("_temp0_n1_seed2", "")
        models.append((clean_name, item))

    # 3. For each metric, sort and assign ranks
    all_ranks = {}
    for metric in selected_data:
        key = f"{metric}_acc"
        # build list of (model, float(acc))
        vals = [
            (name, float(item.get(key, 0.0)))
            for name, item in models
        ]
        # sort desc by accuracy
        vals.sort(key=lambda x: x[1], reverse=True)
        # assign ranks (1-based). Ties get the same rank.
        ranks = {}
        prev_score = None
        prev_rank = 0
        for idx, (name, score) in enumerate(vals, start=1):
            if score == prev_score:
                rank = prev_rank
            else:
                rank = idx
            ranks[name] = rank
            prev_score, prev_rank = score, rank
        # if user only wants a subset, filter
        if selected_models is not None:
            ranks = {m: ranks[m] for m in selected_models if m in ranks}
        all_ranks[metric] = ranks
    return all_ranks
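

# Hedged usage sketch (again with fabricated numbers): demonstrates that
# compute_method_ranks returns {metric: {model: rank}}, with rank 1 for the
# highest accuracy and tied scores sharing a rank.
def _demo_compute_method_ranks():
    fake_results = [
        {"model": "base_temp0_n1_seed2", "aime24_acc": 13.3, "amc23_acc": 52.5},
        {"model": "rl_temp0_n1_seed2", "aime24_acc": 23.3, "amc23_acc": 52.5},
    ]
    ranks = compute_method_ranks(fake_results, selected_data=["aime24", "amc23"])
    # e.g. {"aime24": {"rl": 1, "base": 2}, "amc23": {"base": 1, "rl": 1}}
    print(ranks)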


def collect_eval_results_by_prefix(root):
    all_results = []
    for model_dir in os.listdir(root):
        model_path = os.path.join(root, model_dir)
        if not os.path.isdir(model_path):
            continue

        # Look for the eval_results directory and its subdirectories
        eval_results_dir = os.path.join(model_path, "eval_results")
        if not os.path.isdir(eval_results_dir):
            print(f"⚠️ Missing eval_results directory for: {model_dir}")
            continue

        # Find the global_step directory (assuming there might be only one)
        global_step_dirs = [d for d in os.listdir(eval_results_dir) if os.path.isdir(os.path.join(eval_results_dir, d))]
        if not global_step_dirs:
            print(f"⚠️ No global step directories found in: {eval_results_dir}")
            continue

        # Use the first global step directory (usually global_step_0)
        global_step_dir = os.path.join(eval_results_dir, global_step_dirs[0])

        # Create a new result entry for this model
        result = OrderedDict()
        result["model"] = model_dir

        # Collect accuracies from each benchmark directory
        benchmark_dirs = [d for d in os.listdir(global_step_dir) if os.path.isdir(os.path.join(global_step_dir, d))]
        for benchmark in benchmark_dirs:
            if "livemath" in benchmark:
                # skip livemathbench (an "aime25" check could be added here in the same way)
                continue
            benchmark_path = os.path.join(global_step_dir, benchmark)

            # Look for the metrics json file
            metrics_files = [f for f in os.listdir(benchmark_path) if f.endswith('_metrics.json')]
            if not metrics_files:
                print(f"⚠️ No metrics file found for {model_dir}/{benchmark}")
                continue

            # Use the first metrics file found
            metrics_file = os.path.join(benchmark_path, metrics_files[0])
            try:
                with open(metrics_file, 'r') as f:
                    metrics_data = json.load(f)
                # Extract the accuracy value
                if 'acc' in metrics_data:
                    result[f"{benchmark}_acc"] = metrics_data['acc']
                else:
                    print(f"⚠️ No accuracy found in {metrics_file}")
            except Exception as e:
                print(f"⚠️ Error reading {metrics_file}: {e}")

        # Only add results if we have some accuracies
        if len(result) > 1:  # More than just the "model" key
            # Calculate average accuracy
            acc_values = [v for k, v in result.items() if k.endswith('_acc')]
            if acc_values:
                avg_acc = sum(acc_values) / len(acc_values)
                result["avg_acc"] = round(avg_acc, 1)
                # Add metadata about how many benchmarks were averaged
                result["avg_metadata"] = {
                    "num_benchmarks": len(acc_values),
                    "benchmarks": [k[:-4] for k in result.keys() if k.endswith('_acc') and k != "avg_acc"]
                }
            result = insert_sorted_acc_fields(result)
            all_results.append(result)
        else:
            print(f"⚠️ No accuracies found for {model_dir}")

    # sort by model name
    all_results.sort(key=lambda x: x["model"])

    output_path = os.path.join(root, "combined_eval_results.json")
    with open(output_path, "w") as f:
        json.dump(all_results, f, indent=2)
    print(f"✅ Saved structured JSON to {output_path}")


# Example usage
collect_eval_results_by_prefix("./EVAL/checkpoints")
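
# Hedged follow-up sketch (assumption: the call above has written
# combined_eval_results.json under the same root): the combined results can
# then be fed back into the helpers defined earlier, for example:
#
#     with open("./EVAL/checkpoints/combined_eval_results.json") as f:
#         combined = json.load(f)
#     print(convert_latex_table(combined))
#     print(compute_method_ranks(combined))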