| """ |
| 比较 bf16 / cts / dts_exp 三种模型在各 benchmark 上的精度, |
| 打印精度对比表及相对于 bf16 基线的精度损失,并判断是否满足比赛要求(< 1.0%)。 |
| |
| 用法: |
| python compare_results.py |
| """ |
|
|
| import json |
| import os |
| from pathlib import Path |
|
|
| RESULTS_DIR = Path(__file__).parent / "results" |
|
|
| |
| MODELS = ["bf16_lr1e5", "max_quant_lr1e5", "bf16", "cts", "max_quant", "dts_exp"] |
| BASELINE = "bf16_lr1e5" |
|
|
| |
| TASK_METRICS = { |
| "mmlu": "acc,none", |
| "gsm8k": "exact_match,flexible-extract", |
| "minerva_math": "math_verify,none", |
| "hellaswag": "acc_norm,none", |
| "arc_easy": "acc_norm,none", |
| "arc_challenge": "acc_norm,none", |
| } |
|
|
| TASK_DISPLAY = { |
| "mmlu": "MMLU (5-shot)", |
| "gsm8k": "GSM8K (5-shot)", |
| "minerva_math": "MATH500 (4-shot)", |
| "hellaswag": "HellaSwag (10-shot)", |
| "arc_easy": "ARC-Easy (25-shot)", |
| "arc_challenge": "ARC-Challenge(25-shot)", |
| } |
|
|
| ACCURACY_LOSS_THRESHOLD = 1.0 |
|
|
|
|
| def load_results(model_name: str) -> dict: |
| """从 results/<model>/ 下所有最新的 lm-eval 结果文件中合并结果。""" |
| result_dir = RESULTS_DIR / model_name |
| candidates = sorted(result_dir.glob("**/results*.json"), key=os.path.getmtime) |
| if not candidates: |
| return {} |
| |
| |
| latest_mtime = os.path.getmtime(candidates[-1]) |
| cutoff = latest_mtime - 24 * 3600 |
| recent = [f for f in candidates if os.path.getmtime(f) >= cutoff] |
| merged = {"results": {}, "configs": {}} |
| for f in recent: |
| with open(f) as fp: |
| data = json.load(fp) |
| merged["results"].update(data.get("results", {})) |
| merged["configs"].update(data.get("configs", {})) |
| return merged |
|
|
|
|
| def extract_score(results: dict, task: str, metric: str) -> float | None: |
| """从 lm-eval 结果 JSON 中提取指定任务和 metric 的分数。""" |
| task_results = results.get("results", {}).get(task, {}) |
| if not task_results: |
| return None |
| return task_results.get(metric) |
|
|
|
|
| def main(): |
| import datetime |
| |
| all_results = {} |
| for model in MODELS: |
| data = load_results(model) |
| if not data: |
| print(f"[WARN] No results found for model: {model}") |
| all_results[model] = data |
|
|
| lines = [] |
|
|
| |
| col_w = 26 |
| score_w = 12 |
| drop_w = 14 |
|
|
| header = f"{'Task':<{col_w}}" |
| for m in MODELS: |
| header += f"{m:>{score_w}}" |
| for m in MODELS: |
| if m == BASELINE: |
| continue |
| header += f"{'vs '+BASELINE+' drop':>{drop_w}}" |
| sep = "=" * (col_w + score_w * len(MODELS) + drop_w * (len(MODELS) - 1)) |
| lines.append("") |
| lines.append(sep) |
| lines.append(header) |
| lines.append("-" * (col_w + score_w * len(MODELS) + drop_w * (len(MODELS) - 1))) |
|
|
| |
| fail_cases = [] |
| na_cases = [] |
|
|
| for task, metric in TASK_METRICS.items(): |
| scores = {} |
| for model in MODELS: |
| scores[model] = extract_score(all_results[model], task, metric) |
|
|
| row = f"{TASK_DISPLAY[task]:<{col_w}}" |
| for m in MODELS: |
| s = scores[m] |
| row += f"{s*100:>{score_w}.2f}%" if s is not None else f"{'N/A':>{score_w}}" |
|
|
| for m in MODELS: |
| if m == BASELINE: |
| continue |
| base = scores[BASELINE] |
| s = scores[m] |
| if base is None or s is None: |
| row += f"{'N/A':>{drop_w}}" |
| na_cases.append((m, task)) |
| else: |
| drop = (base - s) / base * 100.0 if base != 0 else 0.0 |
| flag = " x" if drop > ACCURACY_LOSS_THRESHOLD else " " |
| row += f"{drop:>{drop_w-2}.4f}%{flag}" |
| if drop > ACCURACY_LOSS_THRESHOLD: |
| fail_cases.append((m, task, drop)) |
|
|
| lines.append(row) |
|
|
| lines.append(sep) |
|
|
| |
| lines.append("") |
| lines.append("-- Average accuracy drop vs bf16 baseline --") |
| for m in MODELS: |
| if m == BASELINE: |
| continue |
| drops = [] |
| for task, metric in TASK_METRICS.items(): |
| base = extract_score(all_results[BASELINE], task, metric) |
| s = extract_score(all_results[m], task, metric) |
| if base is not None and s is not None: |
| drops.append((base - s) / base * 100.0 if base != 0 else 0.0) |
| if drops: |
| avg = sum(drops) / len(drops) |
| status = "PASS" if avg < ACCURACY_LOSS_THRESHOLD else "FAIL" |
| lines.append(f" {m:<12}: avg drop = {avg:.4f}% [{status} < {ACCURACY_LOSS_THRESHOLD}%]") |
| else: |
| lines.append(f" {m:<12}: N/A") |
|
|
| |
| if fail_cases: |
| lines.append(f"\n[!] Tasks exceeding {ACCURACY_LOSS_THRESHOLD}% accuracy loss threshold:") |
| for m, task, drop in fail_cases: |
| lines.append(f" {m} / {task}: {drop:.4f}%") |
|
|
| if na_cases: |
| lines.append("\n[!] Missing results (run run_eval.sh first):") |
| for m, task in na_cases: |
| lines.append(f" {m} / {task}") |
|
|
| |
| output = "\n".join(lines) |
| print(output) |
|
|
| timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") |
| out_path = RESULTS_DIR / f"summary_{timestamp}.txt" |
| with open(out_path, "w") as f: |
| f.write(f"Evaluation Summary ({datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')})\n") |
| f.write(output + "\n") |
| print(f"\nResults saved to: {out_path}") |
|
|
|
|
| if __name__ == "__main__": |
| main() |