| | """ |
| | Markdown report generator for FRANKENSTALLM 3B evaluation pipeline. |
| | |
| | Generates comprehensive evaluation reports with sections for: |
| | - Perplexity metrics across datasets |
| | - Calibration statistics |
| | - Token NLL distribution |
| | - Generation quality samples |
| | - Repetition parameter search results |
| | - Standard benchmark results (lm-eval) โ Korean + English |
| | - 0-shot vs 5-shot comparison |
| | - Comparison with reference models |
| | """ |
| |
|
| | from datetime import datetime |
| | from pathlib import Path |
| | from typing import Dict, List, Optional, Any, Tuple |
| | import json |
| | import logging |
| |
|
| | logger = logging.getLogger(__name__) |
| |
|
| |
|
| | def _fmt_seconds(seconds: float) -> str: |
| | """Format seconds into a human-readable duration string.""" |
| | m, s = divmod(int(seconds), 60) |
| | h, m = divmod(m, 60) |
| | if h: |
| | return f"{h}h {m}m {s}s" |
| | if m: |
| | return f"{m}m {s}s" |
| | return f"{s}s" |
| |
|
| |
|
| | |
| | |
| | |
| |
|
| | def _normalize_phase1_results(raw: dict) -> dict: |
| | """Convert GPU-labelled phase1_results into logical sections. |
| | |
| | Returns dict with keys: perplexity, calibration, token_nll, generation, repetition. |
| | """ |
| | normalized: Dict[str, Any] = { |
| | "perplexity": {}, |
| | "calibration": {}, |
| | "token_nll": {}, |
| | "generation": {}, |
| | "repetition": {}, |
| | } |
| |
|
| | for label, data in raw.items(): |
| | if not isinstance(data, (dict, list)): |
| | continue |
| |
|
| | if "PPL" in label: |
| | |
| | if isinstance(data, dict) and "ppl" in data: |
| | name = data.get("name", label) |
| | normalized["perplexity"][name] = data |
| | elif isinstance(data, list): |
| | for item in data: |
| | if isinstance(item, dict) and "ppl" in item: |
| | name = item.get("name", f"unknown_{len(normalized['perplexity'])}") |
| | normalized["perplexity"][name] = item |
| | elif isinstance(data, dict) and "error" in data: |
| | |
| | pass |
| | elif "Calibration" in label: |
| | if isinstance(data, dict): |
| | if "calibration" in data: |
| | normalized["calibration"] = data["calibration"] |
| | if "token_nll" in data: |
| | normalized["token_nll"] = data["token_nll"] |
| | elif "Generation" in label: |
| | if isinstance(data, dict): |
| | normalized["generation"] = data |
| | elif "Repetition" in label: |
| | if isinstance(data, dict): |
| | normalized["repetition"] = data |
| |
|
| | return normalized |
| |
|
| |
|
| | def _normalize_phase2_results(raw: dict) -> Tuple[Dict[str, Any], Dict[str, Any]]: |
| | """Convert GPU-labelled phase2_results into flat task dicts for 0-shot and 5-shot. |
| | |
| | Returns (zero_shot_metrics, five_shot_metrics) where each is: |
| | {"kobest_boolq": {"acc,none": 0.50, ...}, "haerae": {...}, ...} |
| | """ |
| | zero_shot: Dict[str, Any] = {} |
| | five_shot: Dict[str, Any] = {} |
| |
|
| | for label, data in raw.items(): |
| | if label == "5shot": |
| | |
| | if isinstance(data, dict): |
| | for sub_label, sub_data in data.items(): |
| | if isinstance(sub_data, dict) and "per_task_metrics" in sub_data: |
| | for task_name, metrics in sub_data["per_task_metrics"].items(): |
| | five_shot[task_name] = metrics |
| | continue |
| |
|
| | if isinstance(data, dict) and "per_task_metrics" in data: |
| | for task_name, metrics in data["per_task_metrics"].items(): |
| | zero_shot[task_name] = metrics |
| |
|
| | return zero_shot, five_shot |
| |
|
| |
|
| | def _get_acc(metrics: dict, prefer_norm: bool = False) -> Optional[float]: |
| | """Extract accuracy from lm-eval metrics dict.""" |
| | if prefer_norm and "acc_norm,none" in metrics: |
| | val = metrics["acc_norm,none"] |
| | if isinstance(val, (int, float)): |
| | return float(val) |
| | if "acc,none" in metrics: |
| | val = metrics["acc,none"] |
| | if isinstance(val, (int, float)): |
| | return float(val) |
| | return None |
| |
|
| |
|
| | def _fmt_pct(val: Optional[float]) -> str: |
| | """Format as percentage string or N/A.""" |
| | if val is None: |
| | return "N/A" |
| | return f"{val * 100:.2f}%" |
| |
|
| |
|
| | def _fmt_f(val, decimals: int = 4) -> str: |
| | """Format float or return N/A.""" |
| | if isinstance(val, (int, float)): |
| | return f"{val:.{decimals}f}" |
| | return str(val) if val is not None else "N/A" |
| |
|
| |
|
| | |
| | |
| | |
| |
|
def generate_report(
    phase1_results: dict,
    phase2_results: dict,
    generation_samples: list,
    output_dir: Path,
    checkpoint_name: str = "checkpoint-0057000",
    total_elapsed_sec: float = 0.0,
) -> str:
    """Generate a comprehensive markdown evaluation report.

    Handles the GPU-labelled key structure from full_eval_pipeline.py
    and produces multiple report files (one per section plus a combined
    full_eval_report.md). Returns the combined report text.
    """
    output_dir = Path(output_dir)
    reports_dir = output_dir / "reports"
    for directory in (output_dir, reports_dir):
        directory.mkdir(parents=True, exist_ok=True)

    phase1 = _normalize_phase1_results(phase1_results)
    zero_shot, five_shot = _normalize_phase2_results(phase2_results)
    eval_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Build every section, then persist each one as its own markdown file.
    # Insertion order here also fixes the section order of the combined report.
    sections = {
        "00_executive_summary.md": _generate_executive_summary(
            phase1, zero_shot, five_shot, checkpoint_name, eval_datetime, total_elapsed_sec,
        ),
        "01_perplexity_report.md": _generate_perplexity_report(phase1["perplexity"]),
        "02_calibration_report.md": _generate_calibration_report(phase1["calibration"], phase1["token_nll"]),
        "03_generation_quality.md": _generate_generation_report(phase1["generation"], generation_samples),
        "04_benchmark_report.md": _generate_benchmark_report(zero_shot, five_shot, phase1["repetition"]),
    }
    for filename, content in sections.items():
        (reports_dir / filename).write_text(content, encoding="utf-8")

    full_report = "\n\n---\n\n".join(sections.values())
    (output_dir / "full_eval_report.md").write_text(full_report, encoding="utf-8")
    return full_report
| |
|
| |
|
| | |
| | |
| | |
| |
|
def _generate_executive_summary(
    p1: dict,
    zero_shot: dict,
    five_shot: dict,
    checkpoint_name: str,
    eval_datetime: str,
    total_elapsed_sec: float,
) -> str:
    """Build the executive-summary markdown section.

    Collects headline metrics (main validation PPL, KoBEST / MMLU-KO /
    MMLU-EN averages, HAE-RAE accuracy, English 0-shot benchmarks, and
    calibration top-1 accuracy) and renders them as markdown tables,
    followed by a hard-coded reference-model comparison table.

    Note: ``five_shot`` is accepted for signature symmetry but is not used
    in this section.
    """
    # NOTE(review): the report title literal was reconstructed from a
    # line-wrapped source dump — confirm the exact heading text.
    lines = [
        "# FRANKENSTALLM 3B ์ขํฉ ํ๊ฐ ๋ฆฌํฌํธ\n",
        f"- **๋ชจ๋ธ**: FRANKENSTALLM 3B",
        f"- **์ฒดํฌํฌ์ธํธ**: {checkpoint_name}",
        f"- **ํ๊ฐ ์ผ์**: {eval_datetime}",
        f"- **์ด ์์ ์๊ฐ**: {total_elapsed_sec:.1f}์ด\n",
        "## Executive Summary\n",
    ]

    # Main perplexity: take "3b" if present, else "3b_val".
    main_ppl = "N/A"
    ppl_data = p1.get("perplexity", {})
    for name in ["3b", "3b_val"]:
        if name in ppl_data and isinstance(ppl_data[name], dict):
            main_ppl = _fmt_f(ppl_data[name].get("ppl"))
            break

    # KoBEST: unweighted mean accuracy over whichever of the five tasks ran.
    kobest_tasks = ["kobest_boolq", "kobest_copa", "kobest_hellaswag",
                    "kobest_sentineg", "kobest_wic"]
    kobest_accs = []
    for t in kobest_tasks:
        if t in zero_shot:
            a = _get_acc(zero_shot[t])
            if a is not None:
                kobest_accs.append(a)
    kobest_avg = _fmt_pct(sum(kobest_accs) / len(kobest_accs)) if kobest_accs else "N/A"

    # MMLU-KO: prefer the lm-eval group aggregate ("global_mmlu_ko");
    # otherwise average the per-subject subtask accuracies ourselves.
    mmlu_ko_avg = "N/A"
    mmlu_ko_count = 0
    if "global_mmlu_ko" in zero_shot:
        a = _get_acc(zero_shot["global_mmlu_ko"])
        if a is not None:
            mmlu_ko_avg = _fmt_pct(a)
        # Subject count is for display only; the average comes from the group.
        mmlu_ko_count = sum(
            1 for t in zero_shot
            if t.startswith("global_mmlu_ko_") and _get_acc(zero_shot[t]) is not None
        )
        if mmlu_ko_count == 0:
            mmlu_ko_count = 1  # at least the group entry itself
    else:
        # No group row: average the per-subject accuracies directly.
        mmlu_ko_accs = []
        for t, m in zero_shot.items():
            if t.startswith("global_mmlu_ko_"):
                a = _get_acc(m)
                if a is not None:
                    mmlu_ko_accs.append(a)
        if mmlu_ko_accs:
            mmlu_ko_avg = _fmt_pct(sum(mmlu_ko_accs) / len(mmlu_ko_accs))
            mmlu_ko_count = len(mmlu_ko_accs)

    # MMLU-EN: average per-subject scores, excluding lm-eval group rows to
    # avoid double counting; fall back to the group rows if no subjects ran.
    _MMLU_EN_GROUPS = {"mmlu", "mmlu_humanities", "mmlu_social_sciences", "mmlu_stem", "mmlu_other"}
    mmlu_en_accs = []
    for t, m in zero_shot.items():
        if (t.startswith("mmlu_") or t == "mmlu") and t not in _MMLU_EN_GROUPS:
            a = _get_acc(m)
            if a is not None:
                mmlu_en_accs.append(a)
    if not mmlu_en_accs:
        # Fall back to the group aggregates themselves.
        for t in _MMLU_EN_GROUPS:
            if t in zero_shot:
                a = _get_acc(zero_shot[t])
                if a is not None:
                    mmlu_en_accs.append(a)
    mmlu_en_avg = _fmt_pct(sum(mmlu_en_accs) / len(mmlu_en_accs)) if mmlu_en_accs else "N/A"

    # HAE-RAE overall accuracy (group row only).
    haerae_acc = "N/A"
    if "haerae" in zero_shot:
        a = _get_acc(zero_shot["haerae"])
        if a is not None:
            haerae_acc = _fmt_pct(a)

    # English 0-shot benchmarks; hellaswag / arc_challenge prefer acc_norm.
    en_benchmarks = {}
    for t in ["hellaswag", "arc_easy", "arc_challenge", "winogrande", "piqa"]:
        if t in zero_shot:
            a = _get_acc(zero_shot[t], prefer_norm=(t in ["hellaswag", "arc_challenge"]))
            if a is not None:
                en_benchmarks[t] = a

    # Calibration headline: top-1 next-token accuracy.
    top1 = _fmt_f(p1.get("calibration", {}).get("top1_accuracy"))

    # Headline metric table.
    lines.append("| ๋ฉํธ๋ฆญ | ๊ฐ |")
    lines.append("|--------|-----|")
    lines.append(f"| ์ฃผ์ PPL (3b_val) | {main_ppl} |")
    lines.append(f"| MMLU-KO ํ๊ท ({mmlu_ko_count}๊ณผ๋ชฉ) | {mmlu_ko_avg} |")
    lines.append(f"| MMLU-EN ํ๊ท | {mmlu_en_avg} |")
    lines.append(f"| KoBEST ํ๊ท ({len(kobest_accs)}ํ์คํฌ) | {kobest_avg} |")
    lines.append(f"| HAE-RAE | {haerae_acc} |")
    for t, a in en_benchmarks.items():
        lines.append(f"| {t} (0-shot) | {_fmt_pct(a)} |")
    lines.append(f"| Top-1 ์ ํ๋ (Calibration) | {top1} |")
    lines.append("")

    # Static reference-model comparison table (hard-coded approximate scores).
    lines.append("## ์ฐธ๊ณ ๋ชจ๋ธ ๋น๊ต\n")
    lines.append("| ๋ชจ๋ธ | ํ๋ผ๋ฏธํฐ | MMLU-KO | MMLU-EN | KoBEST ํ๊ท | PPL |")
    lines.append("|------|---------|---------|---------|------------|-----|")
    lines.append(f"| **FRANKENSTALLM 3B** | 3B | {mmlu_ko_avg} | {mmlu_en_avg} | {kobest_avg} | {main_ppl} |")
    lines.append("| Llama-3.2-3B | 3B | ~42% | ~58% | ~55% | โ |")
    lines.append("| Qwen2.5-3B | 3B | ~48% | ~65% | ~60% | โ |")
    lines.append("| EXAONE-3.5-2.4B | 2.4B | ~35% | ~50% | ~50% | โ |")
    lines.append("")

    return "\n".join(lines)
| |
|
| |
|
def _generate_perplexity_report(ppl_data: dict) -> str:
    """Render the per-dataset perplexity results as a markdown table.

    Rows are sorted by PPL descending; non-numeric PPL values sort last.
    """
    lines = ["# Perplexity ํ๊ฐ\n"]

    if not ppl_data:
        lines.append("๋ฐ์ดํฐ ์์\n")
        return "\n".join(lines)

    rows = [
        {
            "name": name,
            "ppl": metrics.get("ppl"),
            "bits": metrics.get("bits_per_token"),
            "n_tokens": metrics.get("n_tokens"),
            "n_eval": metrics.get("n_eval_tokens"),
            "elapsed": metrics.get("elapsed_sec"),
        }
        for name, metrics in ppl_data.items()
        if isinstance(metrics, dict) and "ppl" in metrics
    ]

    def _sort_key(row):
        ppl = row["ppl"]
        return ppl if isinstance(ppl, (int, float)) else float("inf")

    rows.sort(key=_sort_key, reverse=True)

    # NOTE(review): table header reconstructed from a line-wrapped source dump.
    lines.append("| ๋ฐ์ดํฐ์ | PPL | Bits/Token | ์ ์ฒด ํ ํฐ | ํ๊ฐ ํ ํฐ | ์์ ์๊ฐ |")
    lines.append("|---------|-----|-----------|---------|---------|---------|")
    for row in rows:
        n_tokens, n_eval = row["n_tokens"], row["n_eval"]
        # Thousands separators only when both token counts are numeric.
        if isinstance(n_tokens, (int, float)) and isinstance(n_eval, (int, float)):
            tok_s, ev_s = f"{n_tokens:,}", f"{n_eval:,}"
        else:
            tok_s, ev_s = str(n_tokens), str(n_eval)
        lines.append(
            f"| {row['name']} | {_fmt_f(row['ppl'])} | {_fmt_f(row['bits'])} | "
            f"{tok_s} | {ev_s} | {_fmt_f(row['elapsed'], 1)}s |"
        )
    lines.append("")
    return "\n".join(lines)
| |
|
| |
|
def _generate_calibration_report(cal_data: dict, nll_data: dict) -> str:
    """Render calibration metrics and the token-NLL distribution as markdown."""
    lines = ["# Calibration ๋ฐ Token NLL ๋ถ์\n", "## Calibration ๊ฒฐ๊ณผ\n"]

    if cal_data:
        lines += ["| ๋ฉํธ๋ฆญ | ๊ฐ |", "|--------|-----|"]
        for key, label in (
            ("top1_accuracy", "Top-1 Accuracy"),
            ("top5_accuracy", "Top-5 Accuracy"),
            ("top10_accuracy", "Top-10 Accuracy"),
            ("mean_correct_prob", "Mean Correct Prob"),
            ("mean_entropy", "Mean Entropy"),
        ):
            lines.append(f"| {label} | {_fmt_f(cal_data.get(key))} |")
        lines.append("")
    else:
        lines.append("๋ฐ์ดํฐ ์์\n")

    lines.append("## Token NLL ๋ถํฌ\n")
    if not nll_data:
        lines.append("๋ฐ์ดํฐ ์์\n")
        return "\n".join(lines)

    # Summary stats accept both prefixed ("nll_mean") and bare ("mean") keys.
    lines += ["| ํต๊ณ | ๊ฐ |", "|------|-----|"]
    for candidates, label in (
        (("nll_mean", "mean"), "ํ๊ท "),
        (("nll_std", "std"), "ํ์คํธ์ฐจ"),
        (("nll_median", "median"), "์ค์๊ฐ"),
        (("nll_min", "min"), "์ต์๊ฐ"),
        (("nll_max", "max"), "์ต๋๊ฐ"),
    ):
        value = next((nll_data[c] for c in candidates if c in nll_data), None)
        lines.append(f"| {label} | {_fmt_f(value)} |")
    lines.append("")

    # Percentile table (either key spelling).
    percentiles = nll_data.get("nll_percentiles", nll_data.get("percentiles"))
    if percentiles and isinstance(percentiles, dict):
        lines += ["### Percentiles\n", "| Percentile | ๊ฐ |", "|------------|-----|"]
        for pct, value in percentiles.items():
            lines.append(f"| {pct}th | {_fmt_f(value)} |")
        lines.append("")

    # High-loss fractions: nested dict preferred; otherwise collect the flat
    # "high_loss_fraction_<threshold>" keys (sorted by threshold string).
    fractions = nll_data.get("high_loss_fractions")
    if not (fractions and isinstance(fractions, dict)):
        flat = {
            key.replace("high_loss_fraction_", ""): value
            for key, value in nll_data.items()
            if key.startswith("high_loss_fraction_")
        }
        fractions = dict(sorted(flat.items())) if flat else None
    if fractions:
        lines += ["### ๊ณ ์์ค ํ ํฐ ๋น์จ\n", "| ์๊ณ๊ฐ | ๋น์จ |", "|--------|-----|"]
        for threshold, fraction in fractions.items():
            lines.append(f"| NLL > {threshold} | {_fmt_f(fraction)} |")
        lines.append("")

    return "\n".join(lines)
| |
|
| |
|
def _generate_generation_report(gen_data: dict, samples: list) -> str:
    """Render generation-quality summary stats and up to five greedy samples."""
    lines = ["# ์์ฑ ํ์ง ๋ถ์\n"]

    if gen_data and "summary" in gen_data:
        lines += ["## ์์ฝ ํต๊ณ\n", "| ๋ฉํธ๋ฆญ | ๊ฐ |", "|--------|-----|"]
        for key, value in gen_data["summary"].items():
            lines.append(f"| {key.replace('_', ' ').title()} | {_fmt_f(value)} |")
        lines.append("")

    if samples:
        lines.append("## ์์ฑ ์ํ (Greedy)\n")
        for idx, sample in enumerate(samples[:5], start=1):
            if not isinstance(sample, dict):
                continue
            generated = sample.get("generated_text", "")
            # Cap each shown sample at 300 characters.
            if len(generated) > 300:
                generated = generated[:300] + "..."
            lines += [
                f"### ์ํ {idx}\n",
                f"**Prompt**: {sample.get('prompt', '')}\n",
                f"**Generated**: {generated}\n",
                "",
            ]
    elif not gen_data:
        lines.append("๋ฐ์ดํฐ ์์\n")

    return "\n".join(lines)
| |
|
| |
|
def _generate_benchmark_report(
    zero_shot: dict,
    five_shot: dict,
    repetition: dict,
) -> str:
    """Render the standard lm-eval benchmark section as markdown.

    Covers KoBEST, HAE-RAE, MMLU-KO (global_mmlu_ko*), the main English
    0-shot tasks, MMLU-EN, a 0-shot vs 5-shot comparison, and the
    repetition-parameter grid-search summary.
    """
    lines = ["# ํ์ค ๋ฒค์น๋งํฌ ๊ฒฐ๊ณผ\n"]

    if not zero_shot and not five_shot:
        lines.append("๋ฐ์ดํฐ ์์\n")
        return "\n".join(lines)

    # --- Korean benchmarks -------------------------------------------------
    lines.append("## ํ๊ตญ์ด ๋ฒค์น๋งํฌ\n")

    # KoBEST: fixed display order; average row is unweighted over tasks present.
    kobest_names = ["kobest_boolq", "kobest_copa", "kobest_hellaswag",
                    "kobest_sentineg", "kobest_wic"]
    kobest_0 = {t: zero_shot[t] for t in kobest_names if t in zero_shot}
    if kobest_0:
        lines.append("### KoBEST (0-shot)\n")
        lines.append("| ํ์คํฌ | Accuracy | F1 |")
        lines.append("|--------|----------|-----|")
        for t in kobest_names:
            if t in kobest_0:
                m = kobest_0[t]
                acc = _fmt_pct(_get_acc(m))
                f1 = _fmt_f(m.get("f1,none"))
                lines.append(f"| {t} | {acc} | {f1} |")
        kobest_accs = [_get_acc(kobest_0[t]) for t in kobest_names
                       if t in kobest_0 and _get_acc(kobest_0[t]) is not None]
        if kobest_accs:
            lines.append(f"| **ํ๊ท ** | **{_fmt_pct(sum(kobest_accs)/len(kobest_accs))}** | |")
        lines.append("")

    # HAE-RAE: group accuracy plus optional per-subtask ("haerae_*") rows.
    if "haerae" in zero_shot:
        lines.append("### HAE-RAE (0-shot)\n")
        m = zero_shot["haerae"]
        lines.append(f"- Accuracy: {_fmt_pct(_get_acc(m))}")

        haerae_subs = {t: zero_shot[t] for t in zero_shot if t.startswith("haerae_") and t != "haerae"}
        if haerae_subs:
            lines.append("\n| ์๋ธํ์คํฌ | Accuracy |")
            lines.append("|-----------|----------|")
            for t, sm in sorted(haerae_subs.items()):
                lines.append(f"| {t} | {_fmt_pct(_get_acc(sm))} |")
        lines.append("")

    # MMLU-KO: per-subject subtasks, preferring the lm-eval group aggregate
    # ("global_mmlu_ko") for the overall average when present.
    mmlu_ko_tasks = {t: zero_shot[t] for t in zero_shot
                     if t.startswith("global_mmlu_ko") and t != "global_mmlu_ko"}
    if mmlu_ko_tasks or "global_mmlu_ko" in zero_shot:
        lines.append("### MMLU-KO (0-shot)\n")
        if mmlu_ko_tasks:
            lines.append(f"ํ๊ฐ๋ ๊ณผ๋ชฉ ์: **{len(mmlu_ko_tasks)}**\n")
            accs = [(t, _get_acc(m)) for t, m in sorted(mmlu_ko_tasks.items())
                    if _get_acc(m) is not None]
            if accs:
                # Group aggregate wins over our own unweighted mean.
                group_acc = _get_acc(zero_shot["global_mmlu_ko"]) if "global_mmlu_ko" in zero_shot else None
                avg_acc = group_acc if group_acc is not None else sum(a for _, a in accs) / len(accs)
                lines.append(f"์ ์ฒด ํ๊ท : **{_fmt_pct(avg_acc)}**\n")

                # Best / worst subjects by accuracy.
                accs_sorted = sorted(accs, key=lambda x: x[1], reverse=True)
                lines.append("**์์ 10๊ฐ ๊ณผ๋ชฉ**:\n")
                lines.append("| ๊ณผ๋ชฉ | Accuracy |")
                lines.append("|------|----------|")
                for t, a in accs_sorted[:10]:
                    subject = t.replace("global_mmlu_ko_", "")
                    lines.append(f"| {subject} | {_fmt_pct(a)} |")
                lines.append("")

                lines.append("**ํ์ 10๊ฐ ๊ณผ๋ชฉ**:\n")
                lines.append("| ๊ณผ๋ชฉ | Accuracy |")
                lines.append("|------|----------|")
                for t, a in accs_sorted[-10:]:
                    subject = t.replace("global_mmlu_ko_", "")
                    lines.append(f"| {subject} | {_fmt_pct(a)} |")
                lines.append("")
        elif "global_mmlu_ko" in zero_shot:
            a = _get_acc(zero_shot["global_mmlu_ko"])
            lines.append(f"์ ์ฒด ์ ํ๋: {_fmt_pct(a)}\n")

    # --- English benchmarks ------------------------------------------------
    lines.append("## ์์ด ๋ฒค์น๋งํฌ\n")

    en_tasks = ["hellaswag", "arc_easy", "arc_challenge", "winogrande", "piqa"]
    en_found = {t: zero_shot[t] for t in en_tasks if t in zero_shot}
    if en_found:
        lines.append("### ์ฃผ์ ๋ฒค์น๋งํฌ (0-shot)\n")
        lines.append("| ํ์คํฌ | Accuracy | Acc (norm) |")
        lines.append("|--------|----------|-----------|")
        for t in en_tasks:
            if t in en_found:
                m = en_found[t]
                acc = _fmt_pct(_get_acc(m))
                # Only show the normalized column when lm-eval reported it.
                acc_norm = _fmt_pct(_get_acc(m, prefer_norm=True) if "acc_norm,none" in m else None)
                lines.append(f"| {t} | {acc} | {acc_norm} |")
        lines.append("")

    # MMLU-EN: everything under "mmlu*" except Korean ("mmlu_ko*") variants.
    mmlu_en_tasks = {t: zero_shot[t] for t in zero_shot
                     if (t.startswith("mmlu_") or t == "mmlu") and not t.startswith("mmlu_ko")}
    if mmlu_en_tasks:
        lines.append("### MMLU-EN (0-shot)\n")
        # Per-subject rows exclude the "mmlu" group aggregate itself.
        subtasks = {t: m for t, m in mmlu_en_tasks.items() if t != "mmlu"}
        if subtasks:
            lines.append(f"ํ๊ฐ๋ ๊ณผ๋ชฉ ์: **{len(subtasks)}**\n")
            accs = [(t, _get_acc(m)) for t, m in sorted(subtasks.items())
                    if _get_acc(m) is not None]
            if accs:
                avg_acc = sum(a for _, a in accs) / len(accs)
                lines.append(f"์ ์ฒด ํ๊ท : **{_fmt_pct(avg_acc)}**\n")

                accs_sorted = sorted(accs, key=lambda x: x[1], reverse=True)
                lines.append("**์์ 10๊ฐ ๊ณผ๋ชฉ**:\n")
                lines.append("| ๊ณผ๋ชฉ | Accuracy |")
                lines.append("|------|----------|")
                for t, a in accs_sorted[:10]:
                    subject = t.replace("mmlu_", "")
                    lines.append(f"| {subject} | {_fmt_pct(a)} |")
                lines.append("")

                lines.append("**ํ์ 10๊ฐ ๊ณผ๋ชฉ**:\n")
                lines.append("| ๊ณผ๋ชฉ | Accuracy |")
                lines.append("|------|----------|")
                for t, a in accs_sorted[-10:]:
                    subject = t.replace("mmlu_", "")
                    lines.append(f"| {subject} | {_fmt_pct(a)} |")
                lines.append("")
        elif "mmlu" in mmlu_en_tasks:
            a = _get_acc(mmlu_en_tasks["mmlu"])
            lines.append(f"์ ์ฒด ์ ํ๋: {_fmt_pct(a)}\n")

    # --- 0-shot vs 5-shot comparison ---------------------------------------
    if five_shot:
        lines.append("## 0-shot vs 5-shot ๋น๊ต\n")

        common_tasks = sorted(set(zero_shot.keys()) & set(five_shot.keys()))
        if common_tasks:
            lines.append("| ํ์คํฌ | 0-shot Acc | 5-shot Acc | ๋ณํ |")
            lines.append("|--------|-----------|-----------|------|")
            for t in common_tasks:
                a0 = _get_acc(zero_shot[t])
                a5 = _get_acc(five_shot[t])
                if a0 is not None and a5 is not None:
                    diff = a5 - a0
                    sign = "+" if diff >= 0 else ""
                    lines.append(
                        f"| {t} | {_fmt_pct(a0)} | {_fmt_pct(a5)} | {sign}{diff*100:.2f}pp |"
                    )
                else:
                    lines.append(f"| {t} | {_fmt_pct(a0)} | {_fmt_pct(a5)} | โ |")
            lines.append("")

            # Aggregate delta statistics (percentage points).
            diffs = []
            for t in common_tasks:
                a0 = _get_acc(zero_shot[t])
                a5 = _get_acc(five_shot[t])
                if a0 is not None and a5 is not None:
                    diffs.append(a5 - a0)
            if diffs:
                avg_diff = sum(diffs) / len(diffs)
                improved = sum(1 for d in diffs if d > 0)
                degraded = sum(1 for d in diffs if d < 0)
                lines.append(
                    f"ํ๊ท ๋ณํ: {'+' if avg_diff >= 0 else ''}{avg_diff*100:.2f}pp | "
                    f"๊ฐ์ : {improved} | ํ๋ฝ: {degraded} | ๋์ผ: {len(diffs) - improved - degraded}\n"
                )

    # --- Repetition parameter grid search ----------------------------------
    # Rows are sorted ascending by 3-gram repetition; first row is "best".
    if repetition and repetition.get("grid_results"):
        lines.append("## Repetition ํ๋ผ๋ฏธํฐ ๊ฒ์\n")
        rep_data = repetition["grid_results"]
        rep_rows = []
        # grid_results may be a dict keyed by config name or a plain list.
        items = rep_data.items() if isinstance(rep_data, dict) else enumerate(rep_data)
        for key, metrics in items:
            if isinstance(metrics, dict):
                rep_rows.append({
                    "config": metrics.get("params", str(key)),
                    "temp": metrics.get("temperature"),
                    "rep_pen": metrics.get("repetition_penalty"),
                    # Accept either metric key spelling; missing 3-gram sorts last.
                    "3gram": metrics.get("avg_3gram_rep", metrics.get("3gram_repetition", float("inf"))),
                    "4gram": metrics.get("avg_4gram_rep", metrics.get("4gram_repetition")),
                    "eos_rate": metrics.get("eos_rate"),
                    "avg_tokens": metrics.get("avg_tokens"),
                })
        rep_rows.sort(key=lambda x: x["3gram"] if isinstance(x["3gram"], (int, float)) else float("inf"))

        lines.append("| ์ค์ | Temp | Rep Pen | 3-gram | 4-gram | EOS Rate | Avg Tokens |")
        lines.append("|------|------|---------|--------|--------|----------|-----------|")
        for i, r in enumerate(rep_rows):
            marker = " **โ best**" if i == 0 else ""
            lines.append(
                f"| {r['config']} | {_fmt_f(r['temp'], 2)} | {_fmt_f(r['rep_pen'], 2)} | "
                f"{_fmt_f(r['3gram'])} | {_fmt_f(r['4gram'])} | "
                f"{_fmt_f(r['eos_rate'])} | {_fmt_f(r['avg_tokens'], 1)} |{marker}"
            )
        lines.append("")

    lines.append("---\n")
    lines.append("*์ด ๋ฆฌํฌํธ๋ ์๋์ผ๋ก ์์ฑ๋์์ต๋๋ค.*")
    return "\n".join(lines)
| |
|
| |
|
| | |
| | |
| | |
| |
|
| | |
# ---------------------------------------------------------------------------
# Base-model (pre-SFT) reference values, hard-coded from a prior evaluation
# run and used when comparing an SFT checkpoint against the Base model.
# Each dataset appears under both its bare name and its "_val" alias so
# lookups succeed with either label spelling.
# ---------------------------------------------------------------------------
_BASE_PPL_REFERENCE = {
    "3b_val": 5.2263,
    "3b": 5.2263,
    "korean_c4_val": 5.7173,
    "korean_c4": 5.7173,
    "hplt_ko_val": 2.4028,
    "hplt_ko": 2.4028,
    "cc100_ko_val": 21.782,
    "cc100_ko": 21.782,
    "korean_val": 9.6505,
    "korean": 9.6505,
}

# Base-model 0-shot benchmark accuracies (fractions in [0, 1]).
_BASE_BENCH_REFERENCE = {
    "kobest_boolq": 0.5028,
    "kobest_copa": 0.4930,
    "kobest_hellaswag": 0.2160,
    "kobest_sentineg": 0.4861,
    "kobest_wic": 0.4865,
    "haerae": 0.1971,
    "global_mmlu_ko": 0.2275,
    "hellaswag": 0.2600,
    "arc_easy": 0.2563,
    "arc_challenge": 0.2167,
    "winogrande": 0.5059,
    "piqa": 0.5250,
}

# Base-model greedy-decoding repetition / EOS statistics.
_BASE_GEN_REFERENCE = {
    "greedy_3gram_rep": 0.6099,
    "greedy_4gram_rep": 0.5702,
    "greedy_eos_rate": 0.0,
}

# Base-model next-token calibration statistics.
_BASE_CALIB_REFERENCE = {
    "top1_accuracy": 0.6875,
    "top5_accuracy": 0.8164,
    "top10_accuracy": 0.8593,
    "mean_entropy": 1.5682,
}

# Base-model token-NLL summary statistics.
_BASE_NLL_REFERENCE = {
    "nll_mean": 1.5561,
    "high_loss_fraction_5": 0.1086,
}
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
# Pass/fail targets that an SFT checkpoint is scored against (see
# _compute_orpo_score). Suffix convention: "*_max" values are upper bounds,
# "*_min" values are lower bounds; accuracies and rates are fractions.
_SFT_TARGETS = {
    # Generation quality: greedy repetition must be rare; generations
    # should terminate (EOS) and stay lexically diverse (distinct-2).
    "greedy_3gram_rep_max": 0.05,
    "eos_rate_min": 0.90,
    "sampled_eos_min": 0.50,
    "distinct_2_min": 0.70,
    # Catastrophic forgetting: max tolerated PPL regression vs Base, percent.
    "ppl_forgetting_max_pct": 15.0,
    # Korean benchmarks.
    "kobest_avg_min": 0.55,
    "haerae_min": 0.25,
    "mmlu_ko_min": 0.30,
    # Calibration.
    "top1_accuracy_min": 0.65,
    # English benchmarks: every task must clear its floor for the
    # "english" scoring dimension to pass.
    "hellaswag_min": 0.25,
    "arc_easy_min": 0.25,
    "arc_challenge_min": 0.21,
    "winogrande_min": 0.49,
    "piqa_min": 0.51,
    "mmlu_en_avg_min": 0.25,
}

# Approximate published scores of comparable small models, for report context.
_REFERENCE_MODELS = {
    "Llama 3.2 1B": {"kobest_avg": 0.52, "mmlu_ko": 0.28, "mmlu_en": 0.32},
    "Llama 3.2 3B": {"kobest_avg": 0.56, "mmlu_ko": 0.35, "mmlu_en": 0.55},
    "Qwen 2.5 3B": {"kobest_avg": 0.58, "mmlu_ko": 0.42, "mmlu_en": 0.58},
}
| |
|
| |
|
def _compute_orpo_score(sft_p1: dict, sft_zero: dict, base_p1: dict, base_zero: dict) -> dict:
    """Quantify how much an ORPO stage is needed, as a 0-100 score.

    Scores seven weighted dimensions (weights sum to 100) against the
    thresholds in ``_SFT_TARGETS``: PPL forgetting (25), greedy repetition
    (20), KoBEST average (20), EOS rate (10), calibration top-1 (10),
    distinct-2 diversity (10), English benchmark floor (5). A dimension
    whose inputs are missing scores 0 and lowers ``confidence``.

    Args:
        sft_p1: normalized phase-1 results for the SFT model.
        sft_zero: flat 0-shot task metrics for the SFT model.
        base_p1: normalized phase-1 results for the Base model (forgetting ref).
        base_zero: Base 0-shot metrics. Currently unused in this function.

    Returns:
        dict with keys: total_score, dimensions, decision
        ("DEPLOY" / "ORPO" / "SFT_RETRY"), confidence (0-1),
        orpo_gain_estimate (points ORPO could plausibly recover).
    """
    dimensions = {}
    missing = 0
    total_dims = 7

    # 1) Catastrophic forgetting (weight 25): worst PPL regression vs Base.
    #    NOTE(review): _get_max_forgetting is defined elsewhere in this file;
    #    presumably returns a percentage (or None when unavailable) — confirm.
    max_forgetting = _get_max_forgetting(sft_p1, base_p1)
    if max_forgetting is not None:
        threshold = _SFT_TARGETS["ppl_forgetting_max_pct"]
        # Linear: full marks at 0% regression, zero at/after the threshold.
        score = 25 * max(0, 1 - max_forgetting / threshold)
        dimensions["ppl_forgetting"] = {
            "score": round(score, 1), "weight": 25,
            "current": round(max_forgetting, 1), "threshold": f"<{threshold}%",
            "status": "PASS" if max_forgetting < threshold else "FAIL",
        }
    else:
        missing += 1
        dimensions["ppl_forgetting"] = {"score": 0, "weight": 25, "current": "N/A", "threshold": "<15%", "status": "N/A"}

    # 2) Greedy repetition (weight 20): lower is better; zero at/after the cap.
    #    NOTE(review): _get_greedy_3gram_rep is defined elsewhere in this file.
    rep_rate = _get_greedy_3gram_rep(sft_p1)
    if rep_rate is not None:
        threshold = _SFT_TARGETS["greedy_3gram_rep_max"]
        score = 20 * max(0, 1 - rep_rate / threshold)
        dimensions["greedy_rep"] = {
            "score": round(score, 1), "weight": 20,
            "current": f"{rep_rate:.1%}", "threshold": f"<{threshold:.0%}",
            "status": "PASS" if rep_rate < threshold else "FAIL",
        }
    else:
        missing += 1
        dimensions["greedy_rep"] = {"score": 0, "weight": 20, "current": "N/A", "threshold": "<5%", "status": "N/A"}

    # 3) EOS rate (weight 10): fraction of greedy generations that terminate.
    eos_rate = sft_p1.get("generation", {}).get("summary", {}).get("greedy_eos_rate")
    if eos_rate is not None:
        threshold = _SFT_TARGETS["eos_rate_min"]
        score = 10 * min(eos_rate / threshold, 1)
        dimensions["eos_rate"] = {
            "score": round(score, 1), "weight": 10,
            "current": f"{eos_rate:.0%}", "threshold": f">{threshold:.0%}",
            "status": "PASS" if eos_rate >= threshold else "FAIL",
        }
    else:
        missing += 1
        dimensions["eos_rate"] = {"score": 0, "weight": 10, "current": "N/A", "threshold": ">90%", "status": "N/A"}

    # 4) KoBEST average accuracy (weight 20).
    #    NOTE(review): _get_kobest_avg is defined elsewhere in this file.
    kobest_avg = _get_kobest_avg(sft_zero)
    if kobest_avg is not None:
        threshold = _SFT_TARGETS["kobest_avg_min"]
        score = 20 * min(kobest_avg / threshold, 1)
        dimensions["kobest_avg"] = {
            "score": round(score, 1), "weight": 20,
            "current": f"{kobest_avg:.1%}", "threshold": f">{threshold:.0%}",
            "status": "PASS" if kobest_avg >= threshold else "FAIL",
        }
    else:
        missing += 1
        dimensions["kobest_avg"] = {"score": 0, "weight": 20, "current": "N/A", "threshold": ">55%", "status": "N/A"}

    # 5) Calibration (weight 10): top-1 next-token accuracy.
    top1 = sft_p1.get("calibration", {}).get("top1_accuracy")
    if top1 is not None:
        threshold = _SFT_TARGETS["top1_accuracy_min"]
        score = 10 * min(top1 / threshold, 1)
        dimensions["calibration"] = {
            "score": round(score, 1), "weight": 10,
            "current": f"{top1:.1%}", "threshold": f">={threshold:.0%}",
            "status": "PASS" if top1 >= threshold else "FAIL",
        }
    else:
        missing += 1
        dimensions["calibration"] = {"score": 0, "weight": 10, "current": "N/A", "threshold": ">=65%", "status": "N/A"}

    # 6) Lexical diversity (weight 10): average distinct-2 of greedy output.
    distinct_2 = sft_p1.get("generation", {}).get("summary", {}).get("greedy_avg_distinct_2")
    if distinct_2 is not None:
        threshold = _SFT_TARGETS["distinct_2_min"]
        score = 10 * min(distinct_2 / threshold, 1)
        dimensions["diversity"] = {
            "score": round(score, 1), "weight": 10,
            "current": f"{distinct_2:.0%}", "threshold": f">{threshold:.0%}",
            "status": "PASS" if distinct_2 >= threshold else "FAIL",
        }
    else:
        missing += 1
        dimensions["diversity"] = {"score": 0, "weight": 10, "current": "N/A", "threshold": ">70%", "status": "N/A"}

    # 7) English floor (weight 5): all-or-nothing — every available English
    #    task must clear its per-task minimum.
    en_tasks = {
        "hellaswag": _SFT_TARGETS["hellaswag_min"],
        "arc_easy": _SFT_TARGETS["arc_easy_min"],
        "arc_challenge": _SFT_TARGETS["arc_challenge_min"],
        "winogrande": _SFT_TARGETS["winogrande_min"],
        "piqa": _SFT_TARGETS["piqa_min"],
    }
    en_all_pass = True
    en_count = 0
    for t, threshold in en_tasks.items():
        a = _get_acc(sft_zero.get(t, {})) if t in sft_zero else None
        if a is not None:
            en_count += 1
            if a < threshold:
                en_all_pass = False
    if en_count > 0:
        score = 5.0 if en_all_pass else 0.0
        dimensions["english"] = {
            "score": score, "weight": 5,
            "current": "์ ๋ถ ํต๊ณผ" if en_all_pass else "์ผ๋ถ ๋ฏธ๋ฌ",
            "threshold": "โ", "status": "PASS" if en_all_pass else "FAIL",
        }
    else:
        missing += 1
        dimensions["english"] = {"score": 0, "weight": 5, "current": "N/A", "threshold": "โ", "status": "N/A"}

    # Total is the plain sum of dimension scores (weights sum to 100).
    total_score = sum(d["score"] for d in dimensions.values())
    confidence = round(1.0 - (missing / total_dims), 2)

    if missing >= 2:
        logger.warning("ORPO score has %d/%d missing dimensions โ confidence %.0f%%", missing, total_dims, confidence * 100)

    # Upper-bound estimate of points ORPO could recover: it targets the
    # generation-behaviour dimensions (repetition, EOS, diversity) only.
    orpo_improvable = 0.0
    if rep_rate is not None and rep_rate >= _SFT_TARGETS["greedy_3gram_rep_max"]:
        orpo_improvable += 20.0
    if eos_rate is not None and eos_rate < _SFT_TARGETS["eos_rate_min"]:
        orpo_improvable += 10.0
    if distinct_2 is not None and distinct_2 < _SFT_TARGETS["distinct_2_min"]:
        orpo_improvable += 5.0

    # Decision: DEPLOY at >=80; ORPO at >=40 provided forgetting is under
    # control (ORPO cannot fix a forgetting problem); otherwise retry SFT.
    forgetting_ok = max_forgetting is not None and max_forgetting < _SFT_TARGETS["ppl_forgetting_max_pct"]
    if total_score >= 80:
        decision = "DEPLOY"
    elif total_score >= 40 and forgetting_ok:
        decision = "ORPO"
    else:
        decision = "SFT_RETRY"

    return {
        "total_score": round(total_score, 1),
        "dimensions": dimensions,
        "decision": decision,
        "confidence": confidence,
        "orpo_gain_estimate": round(orpo_improvable, 1),
    }
| |
|
| |
|
def generate_comparison_report(
    base_results_dir: Path,
    sft_phase1_results: dict,
    sft_phase2_results: dict,
    output_path: Path,
    sft_output_dir: Optional[Path] = None,
    total_elapsed_sec: float = 0.0,
) -> Path:
    """Generate a comprehensive Base vs SFT comparison report.

    Args:
        base_results_dir: Directory containing Base model's phase1/phase2_results.json
        sft_phase1_results: SFT Phase 1 results dict
        sft_phase2_results: SFT Phase 2 results dict
        output_path: Where to write the markdown report
        sft_output_dir: SFT eval outputs directory (for linking); when given,
            a copy of the report is also written into it
        total_elapsed_sec: Total pipeline elapsed time in seconds

    Returns:
        Path to the generated report
    """
    base_results_dir = Path(base_results_dir)
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Load the Base model's phase results; missing files leave empty dicts and
    # the hard-coded _BASE_*_REFERENCE values fill the gaps further below.
    base_p1 = {}
    base_p2 = {}
    p1_file = base_results_dir / "phase1_results.json"
    p2_file = base_results_dir / "phase2_results.json"
    if p1_file.exists():
        with open(p1_file, encoding="utf-8") as f:
            base_p1 = json.load(f)
    if p2_file.exists():
        with open(p2_file, encoding="utf-8") as f:
            base_p2 = json.load(f)

    # Normalize GPU-labelled raw results into logical sections.
    sft_p1 = _normalize_phase1_results(sft_phase1_results)
    base_p1_norm = _normalize_phase1_results(base_p1)
    sft_zero, sft_five = _normalize_phase2_results(sft_phase2_results)
    base_zero, base_five = _normalize_phase2_results(base_p2)

    eval_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    lines = []

    # --- Header -----------------------------------------------------------
    lines.append("# FRANKENSTALLM 3B SFT ๋ชจ๋ธ ๋ค๋ฉด์ ์ขํฉ ํ๊ฐ ๋ณด๊ณ ์\n")
    lines.append(f"- **ํ๊ฐ ์ผ์**: {eval_datetime}")
    # NOTE(review): checkpoint and Base-run identifiers are hard-coded;
    # consider passing them in as parameters.
    lines.append("- **SFT ์ฒดํฌํฌ์ธํธ**: checkpoint-best (val_loss=1.8851, step 25500)")
    lines.append("- **Base ์ฐธ์กฐ ๊ฒฐ๊ณผ**: 3b_reeval_20260305_1451")
    lines.append(f"- **์ด ์์ ์๊ฐ**: {_fmt_seconds(total_elapsed_sec)}")
    if sft_output_dir:
        lines.append(f"- **๊ฒฐ๊ณผ ๋๋ ํ ๋ฆฌ**: {sft_output_dir}")
    lines.append("")

    # --- Section 1: Executive summary -------------------------------------
    lines.append("## 1. Executive Summary\n")
    verdicts = _compute_verdicts(sft_p1, sft_zero, base_p1_norm, base_zero)
    lines.append("| ํ๊ฐ ์ฐจ์ | ๊ฒฐ๊ณผ | ์์ธ |")
    lines.append("|----------|------|------|")
    for dim_name, verdict, detail in verdicts:
        icon = "PASS" if verdict else "FAIL"
        lines.append(f"| {dim_name} | **{icon}** | {detail} |")
    lines.append("")

    pass_count = sum(1 for _, v, _ in verdicts if v)
    total_dims = len(verdicts)
    lines.append(f"**์ขํฉ**: {pass_count}/{total_dims} ์ฐจ์ ํต๊ณผ\n")

    # Core metrics reused by the final verdict table in section 7.
    rep_rate = _get_greedy_3gram_rep(sft_p1)
    kobest_avg = _get_kobest_avg(sft_zero)
    max_forgetting = _get_max_forgetting(sft_p1, base_p1_norm)

    lines.append("### ORPO ํ์ (์ ๋ ์ค์ฝ์ด)\n")
    # Computed once here and reused in section 7 (was recomputed there before).
    orpo_result = _compute_orpo_score(sft_p1, sft_zero, base_p1_norm, base_zero)

    lines.append(f"**๊ฒฐ์ **: {orpo_result['decision']} (ํ์ ๋: {orpo_result['confidence']:.0%})\n")
    lines.append(f"**์ ๋ ์ค์ฝ์ด**: {orpo_result['total_score']}/100\n")

    lines.append("| ์ฐจ์ | ์ ์ | /๊ฐ์ค์น | ํ์ฌ๊ฐ | ๊ธฐ์ค | ์ํ |")
    lines.append("|------|------|--------|--------|------|------|")
    dim_names = {
        "ppl_forgetting": "PPL Forgetting",
        "greedy_rep": "Greedy ๋ฐ๋ณต๋ฅ ",
        "eos_rate": "EOS ์ข๋ฃ์จ",
        "kobest_avg": "KoBEST ํ๊ท ",
        "calibration": "Calibration",
        "diversity": "๋ค์์ฑ",
        "english": "์์ด ์ ์ง",
    }
    for key, label in dim_names.items():
        d = orpo_result["dimensions"].get(key, {})
        lines.append(
            f"| {label} | {d.get('score', 0)} | /{d.get('weight', 0)} | "
            f"{d.get('current', 'N/A')} | {d.get('threshold', 'โ')} | {d.get('status', 'N/A')} |"
        )
    lines.append("")

    if orpo_result["orpo_gain_estimate"] > 0:
        lines.append(f"**ORPO ๊ธฐ๋ ์ด๋**: +{orpo_result['orpo_gain_estimate']}์ "
                     f"(๋ฐ๋ณต๋ฅ /EOS/๋ค์์ฑ ๊ฐ์ ๊ธฐ๋, PPL/๋ฒค์น ๋ณํ ์์)\n")

    # Reference-model comparison for context.
    lines.append("**์ฐธ์กฐ ๋ชจ๋ธ ๋น๊ต**:\n")
    for model_name, ref in _REFERENCE_MODELS.items():
        lines.append(f"- {model_name}: KoBEST={ref['kobest_avg']:.0%}, MMLU-KO={ref['mmlu_ko']:.0%}")
    lines.append("")

    # Headline recommendation derived from the scorer's decision.
    if orpo_result["decision"] == "DEPLOY":
        lines.append("**โ Phase 4: GGUF + Ollama ๋ฐฐํฌ** (์ค์ฝ์ด โฅ80, ๋ชจ๋ ํต์ฌ ์กฐ๊ฑด ์ถฉ์กฑ)\n")
    elif orpo_result["decision"] == "ORPO":
        lines.append("**โ Phase 3: ORPO** (์ค์ฝ์ด 40-79, ์ง์ ๋ณด์กด ์ํธ, ์์ฑ ๊ฐ์ ํ์)\n")
    else:
        lines.append("**โ SFT ์ฌ์๋** (์ค์ฝ์ด <40 ๋๋ ์ฌ๊ฐํ forgetting)\n")

    # --- Section 2: Perplexity (knowledge retention) -----------------------
    lines.append("## 2. Perplexity ๋น๊ต (์ง์ ๋ณด์กด)\n")
    lines.append("| ๋ฐ์ดํฐ์| Base PPL | SFT PPL | ๋ณํ | Forgetting % | ํ์ |")
    lines.append("|---------|---------|---------|------|-------------|------|")

    sft_ppl = sft_p1.get("perplexity", {})
    base_ppl = base_p1_norm.get("perplexity", {})

    # Union of dataset names so one-sided results are still reported.
    all_ppl_names = sorted(set(list(sft_ppl.keys()) + list(base_ppl.keys())))
    forgetting_values = []
    for name in all_ppl_names:
        sft_val = sft_ppl.get(name, {}).get("ppl") if isinstance(sft_ppl.get(name), dict) else None
        base_val = base_ppl.get(name, {}).get("ppl") if isinstance(base_ppl.get(name), dict) else None
        if base_val is None:
            base_val = _BASE_PPL_REFERENCE.get(name)

        # FIX: guard base_val > 0 to avoid ZeroDivisionError, consistent with
        # _get_max_forgetting.
        if sft_val is not None and base_val is not None and base_val > 0:
            forgetting = (sft_val - base_val) / base_val * 100
            forgetting_values.append(forgetting)
            verdict = "PASS" if forgetting < _SFT_TARGETS["ppl_forgetting_max_pct"] else "FAIL"
            lines.append(
                f"| {name} | {base_val:.4f} | {sft_val:.4f} | "
                f"{'+' if sft_val >= base_val else ''}{sft_val - base_val:.4f} | "
                f"{forgetting:+.1f}% | {verdict} |"
            )
        elif sft_val is not None:
            lines.append(f"| {name} | โ | {sft_val:.4f} | โ | โ | โ |")
        elif base_val is not None:
            lines.append(f"| {name} | {base_val:.4f} | โ | โ | โ | โ |")

    if forgetting_values:
        avg_forgetting = sum(forgetting_values) / len(forgetting_values)
        max_f = max(forgetting_values)
        lines.append("")
        lines.append(f"**ํ๊ท Forgetting**: {avg_forgetting:+.1f}% | **์ต๋**: {max_f:+.1f}% | "
                     f"**ํ์ **: {'PASS' if max_f < _SFT_TARGETS['ppl_forgetting_max_pct'] else 'FAIL'} (์๊ณ๊ฐ {_SFT_TARGETS['ppl_forgetting_max_pct']}%)")
    lines.append("")

    # --- Section 3: Generation quality -------------------------------------
    lines.append("## 3. ์์ฑ ํ์ง ๋น๊ต\n")
    sft_gen = sft_p1.get("generation", {})
    if not sft_gen:
        logger.warning("Generation results missing from SFT Phase 1")
    sft_summary = sft_gen.get("summary", {})

    lines.append("| ์งํ | Base | SFT | ๋ชฉํ | ํ์ |")
    lines.append("|------|------|-----|------|------|")

    greedy_3gram = sft_summary.get("greedy_avg_3gram_rep")
    greedy_4gram = sft_summary.get("greedy_avg_4gram_rep")
    eos_rate = sft_summary.get("greedy_eos_rate")

    rep_threshold = _SFT_TARGETS["greedy_3gram_rep_max"]
    eos_threshold = _SFT_TARGETS["eos_rate_min"]
    greedy_3gram_verdict = "PASS" if greedy_3gram is not None and greedy_3gram < rep_threshold else "FAIL"
    greedy_4gram_verdict = "PASS" if greedy_4gram is not None and greedy_4gram < 0.05 else "FAIL"
    eos_verdict = "PASS" if eos_rate is not None and eos_rate >= eos_threshold else "FAIL"
    lines.append(f"| Greedy 3-gram ๋ฐ๋ณต๋ฅ | {_BASE_GEN_REFERENCE['greedy_3gram_rep']:.2%} | "
                 f"{_fmt_pct(greedy_3gram)} | < {rep_threshold:.0%} | {greedy_3gram_verdict} |")
    lines.append(f"| Greedy 4-gram ๋ฐ๋ณต๋ฅ | {_BASE_GEN_REFERENCE['greedy_4gram_rep']:.2%} | "
                 f"{_fmt_pct(greedy_4gram)} | < 5% | {greedy_4gram_verdict} |")
    lines.append(f"| EOS ์ข๋ฃ์จ | {_BASE_GEN_REFERENCE['greedy_eos_rate']:.0%} | "
                 f"{_fmt_pct(eos_rate)} | > {eos_threshold:.0%} | {eos_verdict} |")

    sampled_3gram = sft_summary.get("sampled_avg_3gram_rep")
    sampled_eos = sft_summary.get("sampled_eos_rate")
    if sampled_3gram is not None:
        lines.append(f"| Sampled 3-gram ๋ฐ๋ณต๋ฅ | โ | {sampled_3gram:.2%} | โ | โ |")
    if sampled_eos is not None:
        lines.append(f"| Sampled EOS ์ข๋ฃ์จ | โ | {sampled_eos:.2%} | โ | โ |")
    lines.append("")

    # NOTE(review): summary presence is only a proxy for the chat template
    # actually having been applied — confirm upstream.
    chat_status = "ํ์ฑํ" if sft_summary else "๋นํ์ฑํ"
    lines.append(f"**Chat Template**: {chat_status}\n")

    # Show up to 5 greedy (temperature == 0.0) generation samples.
    if sft_gen.get("samples"):
        lines.append("### ์์ฑ ์ํ (Greedy, Chat Template)\n")
        greedy_samples = [s for s in sft_gen["samples"] if s.get("temperature") == 0.0]
        for i, s in enumerate(greedy_samples[:5], 1):
            prompt = s.get("prompt", "")
            text = s.get("text", "")[:400]
            hit_eos = s.get("hit_eos", False)
            rep3 = s.get("3gram_rep", 0)
            lines.append(f"**[{i}]** `{prompt}`")
            lines.append(f"> {text}")
            lines.append(f"> *EOS={hit_eos}, 3gram_rep={rep3:.2%}, tokens={s.get('generated_tokens', 0)}*\n")

    # Repetition-parameter grid search (first 6 settings only).
    sft_rep = sft_p1.get("repetition", {})
    if sft_rep.get("grid_results"):
        lines.append("### Repetition ํ๋ผ๋ฏธํฐ ๊ฒ์ ๊ฒฐ๊ณผ\n")
        lines.append("| ์ค์ | 3-gram | EOS Rate | Avg Tokens |")
        lines.append("|------|--------|----------|-----------|")
        grid = sft_rep["grid_results"]
        items = grid if isinstance(grid, list) else list(grid.values())
        for r in items[:6]:
            if isinstance(r, dict):
                lines.append(
                    f"| {r.get('params', '?')} | "
                    f"{_fmt_f(r.get('avg_3gram_rep'))} | "
                    f"{_fmt_f(r.get('eos_rate'))} | "
                    f"{_fmt_f(r.get('avg_tokens'), 1)} |"
                )
        lines.append("")

    # --- Section 4: Korean benchmarks ---------------------------------------
    lines.append("## 4. ํ๊ตญ์ด ๋ฒค์น๋งํฌ\n")
    lines.append("### KoBEST (0-shot)\n")
    lines.append("| ํ์คํฌ | Base | SFT | ๋ณํ | ๋ชฉํ | ํ์ |")
    lines.append("|--------|------|-----|------|------|------|")

    kobest_tasks = ["kobest_boolq", "kobest_copa", "kobest_hellaswag",
                    "kobest_sentineg", "kobest_wic"]
    kobest_targets = {"kobest_boolq": 0.60, "kobest_copa": 0.65,
                      "kobest_hellaswag": 0.30, "kobest_sentineg": 0.60,
                      "kobest_wic": 0.55}
    sft_kobest_accs = []
    base_kobest_accs = []

    for t in kobest_tasks:
        # Fall back to the stored Base reference when the Base run lacks the task.
        base_a = _get_acc(base_zero.get(t, {})) if t in base_zero else _BASE_BENCH_REFERENCE.get(t)
        sft_a = _get_acc(sft_zero.get(t, {})) if t in sft_zero else None
        target = kobest_targets.get(t, 0.50)

        if sft_a is not None:
            sft_kobest_accs.append(sft_a)
        if base_a is not None:
            base_kobest_accs.append(base_a)

        diff = ""
        verdict = "โ"
        if sft_a is not None and base_a is not None:
            d = (sft_a - base_a) * 100
            diff = f"{'+' if d >= 0 else ''}{d:.1f}pp"
            verdict = "PASS" if sft_a >= target else "FAIL"

        lines.append(f"| {t} | {_fmt_pct(base_a)} | {_fmt_pct(sft_a)} | {diff} | "
                     f"โฅ{target*100:.0f}% | {verdict} |")

    if sft_kobest_accs:
        sft_avg = sum(sft_kobest_accs) / len(sft_kobest_accs)
        base_avg = sum(base_kobest_accs) / len(base_kobest_accs) if base_kobest_accs else _BASE_BENCH_REFERENCE.get("kobest_avg", 0.4369)
        diff_avg = (sft_avg - base_avg) * 100
        lines.append(f"| **ํ๊ท ** | **{base_avg*100:.2f}%** | **{sft_avg*100:.2f}%** | "
                     f"**{'+' if diff_avg >= 0 else ''}{diff_avg:.1f}pp** | "
                     f"**โฅ{_SFT_TARGETS['kobest_avg_min']*100:.0f}%** | **{'PASS' if sft_avg >= _SFT_TARGETS['kobest_avg_min'] else 'FAIL'}** |")
    lines.append("")

    # HAE-RAE (0-shot)
    lines.append("### HAE-RAE (0-shot)\n")
    base_haerae = _get_acc(base_zero.get("haerae", {})) if "haerae" in base_zero else _BASE_BENCH_REFERENCE.get("haerae")
    sft_haerae = _get_acc(sft_zero.get("haerae", {})) if "haerae" in sft_zero else None
    if sft_haerae is not None:
        diff_h = (sft_haerae - (base_haerae or 0)) * 100 if base_haerae else 0
        lines.append(f"- Base: {_fmt_pct(base_haerae)} โ SFT: {_fmt_pct(sft_haerae)} "
                     f"({'+' if diff_h >= 0 else ''}{diff_h:.1f}pp) | "
                     f"๋ชฉํ โฅ{_SFT_TARGETS['haerae_min']*100:.0f}% | {'PASS' if sft_haerae >= _SFT_TARGETS['haerae_min'] else 'FAIL'}")
    else:
        lines.append(f"- Base: {_fmt_pct(base_haerae)} โ SFT: N/A")
    lines.append("")

    # MMLU-KO (0-shot)
    lines.append("### MMLU-KO (0-shot)\n")
    base_mmlu_ko = _get_acc(base_zero.get("global_mmlu_ko", {})) if "global_mmlu_ko" in base_zero else _BASE_BENCH_REFERENCE.get("global_mmlu_ko")
    sft_mmlu_ko = _get_acc(sft_zero.get("global_mmlu_ko", {})) if "global_mmlu_ko" in sft_zero else None
    if sft_mmlu_ko is not None:
        diff_mk = (sft_mmlu_ko - (base_mmlu_ko or 0)) * 100 if base_mmlu_ko else 0
        lines.append(f"- Base: {_fmt_pct(base_mmlu_ko)} โ SFT: {_fmt_pct(sft_mmlu_ko)} "
                     f"({'+' if diff_mk >= 0 else ''}{diff_mk:.1f}pp) | "
                     f"๋ชฉํ โฅ{_SFT_TARGETS['mmlu_ko_min']*100:.0f}% | {'PASS' if sft_mmlu_ko >= _SFT_TARGETS['mmlu_ko_min'] else 'FAIL'}")
    else:
        lines.append(f"- Base: {_fmt_pct(base_mmlu_ko)} โ SFT: N/A")
    lines.append("")

    # 0-shot vs 5-shot deltas for Korean tasks, when 5-shot results exist.
    if sft_five:
        lines.append("### 5-shot ๋น๊ต (ํ๊ตญ์ด)\n")
        lines.append("| ํ์คํฌ | 0-shot | 5-shot | ๋ณํ |")
        lines.append("|--------|--------|--------|------|")
        for t in kobest_tasks + ["haerae", "global_mmlu_ko"]:
            a0 = _get_acc(sft_zero.get(t, {})) if t in sft_zero else None
            a5 = _get_acc(sft_five.get(t, {})) if t in sft_five else None
            if a0 is not None and a5 is not None:
                d = (a5 - a0) * 100
                lines.append(f"| {t} | {a0*100:.2f}% | {a5*100:.2f}% | {'+' if d >= 0 else ''}{d:.1f}pp |")
        lines.append("")

    # --- Section 5: English benchmarks (regression check) -------------------
    lines.append("## 5. ์์ด ๋ฒค์น๋งํฌ (์ ์ง ํ์ธ)\n")
    lines.append("| ํ์คํฌ | Base | SFT | ๋ณํ | ํํ | ํ์ |")
    lines.append("|--------|------|-----|------|------|------|")

    en_tasks = {
        "hellaswag": _SFT_TARGETS["hellaswag_min"],
        "arc_easy": _SFT_TARGETS["arc_easy_min"],
        "arc_challenge": _SFT_TARGETS["arc_challenge_min"],
        "winogrande": _SFT_TARGETS["winogrande_min"],
        "piqa": _SFT_TARGETS["piqa_min"],
    }
    for t, threshold in en_tasks.items():
        # hellaswag / arc_challenge conventionally report acc_norm.
        prefer_norm = t in ["hellaswag", "arc_challenge"]
        base_a = (_get_acc(base_zero.get(t, {}), prefer_norm=prefer_norm)
                  if t in base_zero else _BASE_BENCH_REFERENCE.get(t))
        sft_a = (_get_acc(sft_zero.get(t, {}), prefer_norm=prefer_norm)
                 if t in sft_zero else None)
        diff = ""
        verdict = "โ"
        if sft_a is not None and base_a is not None:
            d = (sft_a - base_a) * 100
            diff = f"{'+' if d >= 0 else ''}{d:.1f}pp"
            verdict = "PASS" if sft_a >= threshold else "FAIL"
        lines.append(f"| {t} | {_fmt_pct(base_a)} | {_fmt_pct(sft_a)} | {diff} | "
                     f"โฅ{threshold*100:.0f}% | {verdict} |")

    # MMLU-EN: average over leaf subtasks; fall back to group aggregates when
    # only those are present. (Deduplicated: one helper replaces two copies of
    # the same scan.)
    _MMLU_EN_GROUPS = {"mmlu", "mmlu_humanities", "mmlu_social_sciences", "mmlu_stem", "mmlu_other"}

    def _collect_mmlu_en(results: dict) -> list:
        """Collect MMLU-EN accuracies: leaf subtasks first, groups as fallback."""
        accs = []
        for task, metrics in results.items():
            if (task.startswith("mmlu_") or task == "mmlu") and task not in _MMLU_EN_GROUPS:
                a = _get_acc(metrics)
                if a is not None:
                    accs.append(a)
        if not accs:
            for task in _MMLU_EN_GROUPS:
                if task in results:
                    a = _get_acc(results[task])
                    if a is not None:
                        accs.append(a)
        return accs

    sft_mmlu_en = _collect_mmlu_en(sft_zero)
    base_mmlu_en = _collect_mmlu_en(base_zero)

    sft_mmlu_en_avg = sum(sft_mmlu_en) / len(sft_mmlu_en) if sft_mmlu_en else None
    # NOTE(review): 0.2581 is a hard-coded Base MMLU-EN reference average.
    base_mmlu_en_avg = sum(base_mmlu_en) / len(base_mmlu_en) if base_mmlu_en else 0.2581
    if sft_mmlu_en_avg is not None:
        d = (sft_mmlu_en_avg - base_mmlu_en_avg) * 100
        lines.append(f"| MMLU-EN ํ๊ท | {base_mmlu_en_avg*100:.2f}% | {sft_mmlu_en_avg*100:.2f}% | "
                     f"{'+' if d >= 0 else ''}{d:.1f}pp | โฅ25% | "
                     f"{'PASS' if sft_mmlu_en_avg >= _SFT_TARGETS['mmlu_en_avg_min'] else 'FAIL'} |")
    lines.append("")

    # --- Section 6: Calibration ---------------------------------------------
    lines.append("## 6. Calibration ๋น๊ต\n")
    sft_cal = sft_p1.get("calibration", {})
    lines.append("| ์งํ | Base | SFT | ๋ชฉํ | ํ์ |")
    lines.append("|------|------|-----|------|------|")

    # (key, label, threshold, higher_is_better)
    cal_checks = [
        ("top1_accuracy", "Top-1 Accuracy", _SFT_TARGETS["top1_accuracy_min"], True),
        ("top5_accuracy", "Top-5 Accuracy", 0.78, True),
        ("top10_accuracy", "Top-10 Accuracy", 0.82, True),
        ("mean_entropy", "Mean Entropy", 2.0, False),
    ]
    for key, label, threshold, is_higher_better in cal_checks:
        base_v = _BASE_CALIB_REFERENCE.get(key)
        sft_v = sft_cal.get(key)
        verdict = "โ"
        if sft_v is not None:
            if is_higher_better:
                verdict = "PASS" if sft_v >= threshold else "FAIL"
            else:
                verdict = "PASS" if sft_v <= threshold else "FAIL"
        lines.append(f"| {label} | {_fmt_f(base_v)} | {_fmt_f(sft_v)} | "
                     f"{'โฅ' if is_higher_better else '<'}{threshold} | {verdict} |")

    # Token NLL distribution summary rows (support both old and new key names).
    sft_nll = sft_p1.get("token_nll", {})
    nll_mean = sft_nll.get("nll_mean", sft_nll.get("mean"))
    base_nll_mean = _BASE_NLL_REFERENCE.get("nll_mean")
    if nll_mean is not None:
        lines.append(f"| Token NLL mean | {_fmt_f(base_nll_mean)} | {_fmt_f(nll_mean)} | "
                     f"< 2.0 | {'PASS' if nll_mean < 2.0 else 'FAIL'} |")
    hlf5 = sft_nll.get("high_loss_fractions", {}).get("5", sft_nll.get("high_loss_fraction_5"))
    base_hlf5 = _BASE_NLL_REFERENCE.get("high_loss_fraction_5")
    if hlf5 is not None:
        lines.append(f"| NLL > 5 ๋น์จ | {_fmt_f(base_hlf5)} | {_fmt_f(hlf5)} | "
                     f"< 0.15 | {'PASS' if hlf5 < 0.15 else 'FAIL'} |")
    lines.append("")

    # --- Section 7: Final verdict & next step -------------------------------
    lines.append("## 7. ์ขํฉ ํ์ ๋ฐ ๋ค์ ๋จ๊ณ\n")

    lines.append("### ํต์ฌ ํ์ ๊ธฐ์ค\n")
    lines.append("| ์กฐ๊ฑด | ํ์ฌ ๊ฐ | ๊ธฐ์ค | ์ถฉ์กฑ |")
    lines.append("|------|---------|------|------|")

    lines.append(f"| Greedy 3-gram ๋ฐ๋ณต๋ฅ | {_fmt_pct(rep_rate)} | < {_SFT_TARGETS['greedy_3gram_rep_max']:.0%} | "
                 f"{'YES' if rep_rate is not None and rep_rate < _SFT_TARGETS['greedy_3gram_rep_max'] else 'NO'} |")
    lines.append(f"| KoBEST ํ๊ท | {_fmt_pct(kobest_avg)} | > {_SFT_TARGETS['kobest_avg_min']*100:.0f}% | "
                 f"{'YES' if kobest_avg is not None and kobest_avg > _SFT_TARGETS['kobest_avg_min'] else 'NO'} |")
    lines.append(f"| ์ต๋ Forgetting | {f'{max_forgetting:.1f}%' if max_forgetting is not None else 'N/A'} | "
                 f"< {_SFT_TARGETS['ppl_forgetting_max_pct']}% | {'YES' if max_forgetting is not None and max_forgetting < _SFT_TARGETS['ppl_forgetting_max_pct'] else 'NO'} |")
    lines.append("")

    # Recommendation. FIX: reuse the ORPO score computed in section 1 (same
    # inputs) instead of recomputing it.
    lines.append("### ๊ถ๊ณ \n")
    orpo_score = orpo_result["total_score"]
    orpo_decision = orpo_result["decision"]

    all_core_pass = (
        rep_rate is not None and rep_rate < _SFT_TARGETS["greedy_3gram_rep_max"]
        and kobest_avg is not None and kobest_avg > _SFT_TARGETS["kobest_avg_min"]
        and max_forgetting is not None and max_forgetting < _SFT_TARGETS["ppl_forgetting_max_pct"]
    )

    if all_core_pass:
        lines.append("**๋ชจ๋ ํต์ฌ ์กฐ๊ฑด ์ถฉ์กฑ โ Phase 4: GGUF ๋ณํ + Ollama ๋ฐฐํฌ ์งํ**\n")
    elif orpo_decision == "ORPO":
        lines.append(f"**ORPO ํ์ ์ค์ฝ์ด {orpo_score:.1f}/100 โ Phase 3: ORPO ํ์ต ์งํ** (795K preference pairs ํ์ฉ)\n")
        lines.append("ORPO ํ์ต ์ ์ฃผ์์ :")
        # NOTE(review): the percentages below are hard-coded snapshots of one
        # particular run, not live metrics.
        lines.append("- Greedy ๋ฐ๋ณต๋ฅ ๊ฐ์ (ํ์ฌ 72.97% โ ๋ชฉํ <5%)")
        lines.append("- EOS ์ข๋ฃ์จ ๊ฐ์ (ํ์ฌ 60% โ ๋ชฉํ >90%)")
        lines.append("- ๋ฒค์น๋งํฌ ์ ์ ์ ์ง/ํฅ์")
        lines.append("- ์ง์ ๋ณด์กด ์ ์ง (ํ์ฌ forgetting 0.9%)")
    elif orpo_decision == "SKIP_ORPO":
        # NOTE(review): _compute_orpo_score currently returns only
        # DEPLOY/ORPO/SFT_RETRY, so this branch is unreachable today; kept in
        # case the scorer grows a SKIP_ORPO decision.
        lines.append("**ORPO ๋ถํ์ โ Phase 4: GGUF ๋ณํ + Ollama ๋ฐฐํฌ ์งํ**\n")
    else:
        lines.append("**ํต์ฌ ์กฐ๊ฑด ๋ฏธ๋ฌ โ SFT ์ฌ์๋**\n")
        lines.append("์ฌ์๋ ์ ๊ฒํ ์ฌํญ:")
        lines.append("- ํ์ต๋ฅ ์กฐ์ ")
        lines.append("- ๋ฐ์ดํฐ ๊ตฌ์ฑ ์ฌ๊ฒํ ")
        lines.append("- ์ํญ ์ ์กฐ์ ")
    lines.append("")

    lines.append("---\n")
    lines.append("*์ด ๋ณด๊ณ ์๋ `eval/sft_eval_pipeline.py`์ ์ํด ์๋ ์์ฑ๋์์ต๋๋ค.*")

    report_text = "\n".join(lines)
    output_path.write_text(report_text, encoding="utf-8")

    # Drop a convenience copy next to the SFT eval outputs.
    if sft_output_dir:
        (Path(sft_output_dir) / "sft_comparison_report.md").write_text(report_text, encoding="utf-8")

    return output_path
| |
|
| |
|
def _compute_verdicts(sft_p1, sft_zero, base_p1, base_zero):
    """Compute pass/fail verdicts for each of the 6 evaluation dimensions.

    Returns a list of (dimension_name, pass_bool, detail_string) tuples in a
    fixed order; a dimension with missing input yields a FAIL verdict with a
    "no data" detail.
    """
    verdicts = []
    add = verdicts.append

    # Dimension 1: knowledge retention (worst-case PPL forgetting vs Base).
    worst = _get_max_forgetting(sft_p1, base_p1)
    if worst is None:
        add(("์ฐจ์ 1: Perplexity (์ง์ ๋ณด์กด)", False, "๋ฐ์ดํฐ ์์"))
    else:
        add((
            "์ฐจ์ 1: Perplexity (์ง์ ๋ณด์กด)",
            worst < _SFT_TARGETS["ppl_forgetting_max_pct"],
            f"์ต๋ forgetting {worst:.1f}% (์๊ณ๊ฐ {_SFT_TARGETS['ppl_forgetting_max_pct']}%)",
        ))

    # Dimension 2: generation quality (repetition AND EOS termination).
    rep_rate = _get_greedy_3gram_rep(sft_p1)
    eos_rate = sft_p1.get("generation", {}).get("summary", {}).get("greedy_eos_rate")
    if rep_rate is None or eos_rate is None:
        add(("์ฐจ์ 2: ์์ฑ ํ์ง", False, "๋ฐ์ดํฐ ์์"))
    else:
        quality_ok = (rep_rate < _SFT_TARGETS["greedy_3gram_rep_max"]
                      and eos_rate > _SFT_TARGETS["eos_rate_min"])
        add((
            "์ฐจ์ 2: ์์ฑ ํ์ง",
            quality_ok,
            f"๋ฐ๋ณต๋ฅ {rep_rate:.2%} (๋ชฉํ <{_SFT_TARGETS['greedy_3gram_rep_max']:.0%}), EOS {eos_rate:.0%} (๋ชฉํ >{_SFT_TARGETS['eos_rate_min']:.0%})",
        ))

    # Dimension 3: Korean benchmarks (KoBEST average).
    kobest_avg = _get_kobest_avg(sft_zero)
    if kobest_avg is None:
        add(("์ฐจ์ 3: ํ๊ตญ์ด ๋ฒค์น๋งํฌ", False, "๋ฐ์ดํฐ ์์"))
    else:
        add((
            "์ฐจ์ 3: ํ๊ตญ์ด ๋ฒค์น๋งํฌ",
            kobest_avg > _SFT_TARGETS["kobest_avg_min"],
            f"KoBEST ํ๊ท {kobest_avg*100:.2f}% (๋ชฉํ >{_SFT_TARGETS['kobest_avg_min']*100:.0f}%)",
        ))

    # Dimension 4: every measured English benchmark must clear its floor.
    floors = {
        "hellaswag": _SFT_TARGETS["hellaswag_min"],
        "arc_easy": _SFT_TARGETS["arc_easy_min"],
        "arc_challenge": _SFT_TARGETS["arc_challenge_min"],
        "winogrande": _SFT_TARGETS["winogrande_min"],
        "piqa": _SFT_TARGETS["piqa_min"],
    }
    details = []
    english_ok = True
    for task, floor in floors.items():
        acc = _get_acc(sft_zero[task]) if task in sft_zero else None
        if acc is None:
            continue
        english_ok = english_ok and acc >= floor
        details.append(f"{task}={acc*100:.1f}%")
    if details:
        add((
            "์ฐจ์ 4: ์์ด ๋ฒค์น๋งํฌ",
            english_ok,
            ", ".join(details[:3]) + ("..." if len(details) > 3 else ""),
        ))
    else:
        add(("์ฐจ์ 4: ์์ด ๋ฒค์น๋งํฌ", False, "๋ฐ์ดํฐ ์์"))

    # Dimension 5: calibration (top-1 next-token accuracy).
    top1 = sft_p1.get("calibration", {}).get("top1_accuracy")
    if top1 is None:
        add(("์ฐจ์ 5: Calibration", False, "๋ฐ์ดํฐ ์์"))
    else:
        add((
            "์ฐจ์ 5: Calibration",
            top1 >= _SFT_TARGETS["top1_accuracy_min"],
            f"Top-1 {top1*100:.2f}% (๋ชฉํ โฅ{_SFT_TARGETS['top1_accuracy_min']*100:.0f}%)",
        ))

    # Dimension 6: chat ability, proxied by the greedy EOS termination rate.
    if eos_rate is None:
        add(("์ฐจ์ 6: SFT Chat ๋ฅ๋ ฅ", False, "๋ฐ์ดํฐ ์์"))
    else:
        add((
            "์ฐจ์ 6: SFT Chat ๋ฅ๋ ฅ",
            eos_rate > 0.50,
            f"EOS ์ข๋ฃ์จ {eos_rate:.0%}, ์์ฑ ์ํ ์๋ ๊ฒํ ํ์",
        ))

    return verdicts
| |
|
| |
|
| | def _get_greedy_3gram_rep(p1: dict) -> Optional[float]: |
| | gen = p1.get("generation", {}) |
| | return gen.get("summary", {}).get("greedy_avg_3gram_rep") |
| |
|
| |
|
def _get_kobest_avg(zero_shot: dict) -> Optional[float]:
    """Mean accuracy over whichever of the five KoBEST tasks are present."""
    tasks = ("kobest_boolq", "kobest_copa", "kobest_hellaswag",
             "kobest_sentineg", "kobest_wic")
    raw = [_get_acc(zero_shot[task]) for task in tasks if task in zero_shot]
    accs = [a for a in raw if a is not None]
    if not accs:
        return None
    return sum(accs) / len(accs)
| |
|
| |
|
def _get_max_forgetting(sft_p1: dict, base_p1: dict) -> Optional[float]:
    """Worst-case PPL regression (%) of SFT relative to Base across datasets.

    Base values missing from base_p1 fall back to _BASE_PPL_REFERENCE; pairs
    with a non-positive or absent base PPL are skipped.
    """
    base_ppl = base_p1.get("perplexity", {})
    deltas = []
    for name, entry in sft_p1.get("perplexity", {}).items():
        sft_val = entry.get("ppl") if isinstance(entry, dict) else None
        base_entry = base_ppl.get(name)
        base_val = base_entry.get("ppl") if isinstance(base_entry, dict) else None
        if base_val is None:
            base_val = _BASE_PPL_REFERENCE.get(name)
        if sft_val is not None and base_val is not None and base_val > 0:
            deltas.append((sft_val - base_val) / base_val * 100)
    return max(deltas) if deltas else None
| |
|
| |
|
| | |
| | |
| | |
| |
|
def _compute_orpo_verdicts(
    orpo_p1: dict,
    orpo_zero: dict,
    sft_p1: dict,
    sft_zero: dict,
    training_curve: Optional[dict] = None,
) -> List[Tuple[str, bool, str]]:
    """Compute the 4 ORPO-specific evaluation dimensions.

    Args:
        orpo_p1: Normalized ORPO Phase 1 results.
        orpo_zero: ORPO 0-shot results (currently unread here; kept for
            signature symmetry with _compute_verdicts).
        sft_p1: Normalized SFT Phase 1 results (baseline for ORPO-4).
        sft_zero: SFT 0-shot results (currently unread, see orpo_zero).
        training_curve: Dict with an "eval_steps" list of per-step metrics.

    Returns:
        List of (dimension_name, pass_bool, detail_string).
    """
    verdicts: List[Tuple[str, bool, str]] = []

    # FIX: fetch the final eval step once — ORPO-1 and ORPO-2 both read it
    # (it was previously extracted twice with identical code).
    last_step: dict = {}
    if training_curve and training_curve.get("eval_steps"):
        last_step = training_curve["eval_steps"][-1]

    # ORPO-1: final preference accuracy must exceed 65%.
    pref_acc = last_step.get("rewards_accuracies", last_step.get("preference_accuracy"))
    if pref_acc is not None:
        verdicts.append((
            "ORPO-1: Preference Accuracy",
            pref_acc > 0.65,
            f"์ต์ข {pref_acc:.2%} (๋ชฉํ > 65%)",
        ))
    else:
        verdicts.append(("ORPO-1: Preference Accuracy", False, "๋ฐ์ดํฐ ์์"))

    # ORPO-2: final chosen-vs-rejected reward margin must exceed 0.1.
    reward_margin = last_step.get("rewards_margins", last_step.get("reward_margins"))
    if reward_margin is not None:
        verdicts.append((
            "ORPO-2: Reward Margins",
            reward_margin > 0.1,
            f"์ต์ข {reward_margin:.4f} (๋ชฉํ > 0.1)",
        ))
    else:
        verdicts.append(("ORPO-2: Reward Margins", False, "๋ฐ์ดํฐ ์์"))

    # ORPO-3: with repetition_penalty=1.0 (no penalty applied), the 3-gram
    # repetition rate should already be under 5%.
    rep_grid = orpo_p1.get("repetition", {}).get("grid_results")
    param_sens_pass = False
    param_sens_detail = "๋ฐ์ดํฐ ์์"
    if rep_grid:
        items = rep_grid if isinstance(rep_grid, list) else list(rep_grid.values())
        for r in items:
            if isinstance(r, dict):
                rp = r.get("repetition_penalty", r.get("rep_penalty"))
                # Float-tolerant match of the rp == 1.0 grid entry.
                if rp is not None and abs(float(rp) - 1.0) < 1e-6:
                    rep_val = r.get("avg_3gram_rep", r.get("3gram_repetition"))
                    if rep_val is not None:
                        param_sens_pass = rep_val < 0.05
                        param_sens_detail = f"rep_penalty=1.0 ์ 3-gram rep={rep_val:.2%} (๋ชฉํ < 5%)"
                    break
    verdicts.append((
        "ORPO-3: Parameter Sensitivity",
        param_sens_pass,
        param_sens_detail,
    ))

    # ORPO-4: ORPO must improve over SFT on BOTH repetition and EOS rate.
    sft_rep = _get_greedy_3gram_rep(sft_p1)
    orpo_rep = _get_greedy_3gram_rep(orpo_p1)
    sft_eos = sft_p1.get("generation", {}).get("summary", {}).get("greedy_eos_rate")
    orpo_eos = orpo_p1.get("generation", {}).get("summary", {}).get("greedy_eos_rate")

    if all(v is not None for v in [sft_rep, orpo_rep, sft_eos, orpo_eos]):
        rep_improved = orpo_rep < sft_rep
        eos_improved = orpo_eos > sft_eos
        verdicts.append((
            "ORPO-4: SFTโORPO ๊ฐ์ ",
            rep_improved and eos_improved,
            f"๋ฐ๋ณต๋ฅ {sft_rep:.2%}โ{orpo_rep:.2%} ({'โ' if rep_improved else 'โ'}), "
            f"EOS {sft_eos:.0%}โ{orpo_eos:.0%} ({'โ' if eos_improved else 'โ'})",
        ))
    else:
        verdicts.append(("ORPO-4: SFTโORPO ๊ฐ์ ", False, "๋ฐ์ดํฐ ์์"))

    return verdicts
| |
|
| |
|
| | |
| | |
| | |
| |
|
| | def generate_three_way_report( |
| | base_results_dir: Path, |
| | sft_results_dir: Path, |
| | orpo_phase1_results: dict, |
| | orpo_phase2_results: dict, |
| | output_path: Path, |
| | orpo_output_dir: Optional[Path] = None, |
| | training_curve: Optional[dict] = None, |
| | total_elapsed_sec: float = 0.0, |
| | ) -> Path: |
| | """Generate a comprehensive Base vs SFT vs ORPO 3-way comparison report. |
| | |
| | Args: |
| | base_results_dir: Directory containing Base model's phase1/phase2_results.json |
| | sft_results_dir: Directory containing SFT model's phase1/phase2_results.json |
| | orpo_phase1_results: ORPO Phase 1 results dict |
| | orpo_phase2_results: ORPO Phase 2 results dict |
| | output_path: Where to write the markdown report |
| | orpo_output_dir: ORPO eval outputs directory (for linking) |
| | training_curve: Dict with "eval_steps" list of per-step metrics |
| | total_elapsed_sec: Total pipeline elapsed time |
| | |
| | Returns: |
| | Path to the generated report |
| | """ |
| | base_results_dir = Path(base_results_dir) |
| | sft_results_dir = Path(sft_results_dir) |
| | output_path = Path(output_path) |
| | output_path.parent.mkdir(parents=True, exist_ok=True) |
| |
|
| | |
| | base_p1_raw, base_p2_raw = {}, {} |
| | p1_file = base_results_dir / "phase1_results.json" |
| | p2_file = base_results_dir / "phase2_results.json" |
| | if p1_file.exists(): |
| | with open(p1_file, encoding="utf-8") as f: |
| | base_p1_raw = json.load(f) |
| | if p2_file.exists(): |
| | with open(p2_file, encoding="utf-8") as f: |
| | base_p2_raw = json.load(f) |
| |
|
| | |
| | sft_p1_raw, sft_p2_raw = {}, {} |
| | sft_p1_file = sft_results_dir / "phase1_results.json" |
| | sft_p2_file = sft_results_dir / "phase2_results.json" |
| | if sft_p1_file.exists(): |
| | with open(sft_p1_file, encoding="utf-8") as f: |
| | sft_p1_raw = json.load(f) |
| | if sft_p2_file.exists(): |
| | with open(sft_p2_file, encoding="utf-8") as f: |
| | sft_p2_raw = json.load(f) |
| |
|
| | |
| | base_p1 = _normalize_phase1_results(base_p1_raw) |
| | base_zero, base_five = _normalize_phase2_results(base_p2_raw) |
| | sft_p1 = _normalize_phase1_results(sft_p1_raw) |
| | sft_zero, sft_five = _normalize_phase2_results(sft_p2_raw) |
| | orpo_p1 = _normalize_phase1_results(orpo_phase1_results) |
| | orpo_zero, orpo_five = _normalize_phase2_results(orpo_phase2_results) |
| |
|
| | eval_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S") |
| |
|
| | lines: List[str] = [] |
| |
|
| | |
| | |
| | |
| | lines.append("# FRANKENSTALLM 3B ORPO ๋ชจ๋ธ ์ข
ํฉ ํ๊ฐ ๋ณด๊ณ ์\n") |
| | lines.append(f"- **ํ๊ฐ ์ผ์**: {eval_datetime}") |
| | lines.append(f"- **๋น๊ต ๋์**: Base โ SFT โ ORPO") |
| | lines.append(f"- **์ด ์์ ์๊ฐ**: {_fmt_seconds(total_elapsed_sec)}") |
| | if orpo_output_dir: |
| | lines.append(f"- **๊ฒฐ๊ณผ ๋๋ ํ ๋ฆฌ**: {orpo_output_dir}") |
| | lines.append("") |
| |
|
| | |
| | |
| | |
| | lines.append("## 1. Executive Summary\n") |
| |
|
| | |
| | std_verdicts = _compute_verdicts(orpo_p1, orpo_zero, base_p1, base_zero) |
| | |
| | orpo_verdicts = _compute_orpo_verdicts(orpo_p1, orpo_zero, sft_p1, sft_zero, training_curve) |
| |
|
| | all_verdicts = std_verdicts + orpo_verdicts |
| |
|
| | lines.append("| # | ํ๊ฐ ์ฐจ์ | ๊ฒฐ๊ณผ | ์์ธ |") |
| | lines.append("|---|----------|------|------|") |
| | for i, (dim_name, verdict, detail) in enumerate(all_verdicts, 1): |
| | icon = "PASS" if verdict else "FAIL" |
| | lines.append(f"| {i} | {dim_name} | **{icon}** | {detail} |") |
| | lines.append("") |
| |
|
| | pass_count = sum(1 for _, v, _ in all_verdicts if v) |
| | total_dims = len(all_verdicts) |
| | lines.append(f"**์ข
ํฉ**: {pass_count}/{total_dims} ์ฐจ์ ํต๊ณผ\n") |
| |
|
| | |
| | orpo_score_result = _compute_orpo_score(orpo_p1, orpo_zero, base_p1, base_zero) |
| | lines.append(f"**์ ๋ ์ค์ฝ์ด**: {orpo_score_result['total_score']}/100\n") |
| |
|
| | |
| | orpo_rep = _get_greedy_3gram_rep(orpo_p1) |
| | orpo_eos = orpo_p1.get("generation", {}).get("summary", {}).get("greedy_eos_rate") |
| | orpo_forgetting = _get_max_forgetting(orpo_p1, base_p1) |
| | orpo_kobest = _get_kobest_avg(orpo_zero) |
| |
|
| | deploy_criteria_met = ( |
| | orpo_rep is not None and orpo_rep < 0.05 |
| | and orpo_eos is not None and orpo_eos > 0.90 |
| | and orpo_forgetting is not None and orpo_forgetting < 5.0 |
| | and orpo_kobest is not None and orpo_kobest >= 0.43 |
| | ) |
| | final_decision = "DEPLOY" if deploy_criteria_met else "RETRY" |
| | lines.append(f"**์ต์ข
ํ์ **: **{final_decision}**\n") |
| | lines.append("") |
| |
|
| | |
| | |
| | |
| | lines.append("## 2. ํ์ต ๊ณก์ ๋ถ์\n") |
| | if training_curve and training_curve.get("eval_steps"): |
| | eval_steps = training_curve["eval_steps"] |
| |
|
| | lines.append("### Training / Eval Loss\n") |
| | lines.append("| Step | Train Loss | Eval Loss | Pref Accuracy | Reward Margin |") |
| | lines.append("|------|-----------|-----------|---------------|---------------|") |
| | for step_data in eval_steps: |
| | step = step_data.get("step", "?") |
| | train_loss = _fmt_f(step_data.get("train_loss", step_data.get("loss")), 4) |
| | eval_loss = _fmt_f(step_data.get("eval_loss"), 4) |
| | pref_acc = _fmt_f(step_data.get("rewards_accuracies", step_data.get("preference_accuracy")), 4) |
| | reward_m = _fmt_f(step_data.get("rewards_margins", step_data.get("reward_margins")), 4) |
| | lines.append(f"| {step} | {train_loss} | {eval_loss} | {pref_acc} | {reward_m} |") |
| | lines.append("") |
| |
|
| | |
| | first_step = eval_steps[0] |
| | last_step = eval_steps[-1] |
| | lines.append("### ํ์ต ๊ณก์ ์์ฝ\n") |
| | first_loss = first_step.get("train_loss", first_step.get("loss")) |
| | last_loss = last_step.get("train_loss", last_step.get("loss")) |
| | if first_loss is not None and last_loss is not None: |
| | lines.append(f"- **Train Loss**: {first_loss:.4f} โ {last_loss:.4f}") |
| | first_eval = first_step.get("eval_loss") |
| | last_eval = last_step.get("eval_loss") |
| | if first_eval is not None and last_eval is not None: |
| | lines.append(f"- **Eval Loss**: {first_eval:.4f} โ {last_eval:.4f}") |
| | last_pref = last_step.get("rewards_accuracies", last_step.get("preference_accuracy")) |
| | if last_pref is not None: |
| | lines.append(f"- **์ต์ข
Preference Accuracy**: {last_pref:.2%}") |
| | last_margin = last_step.get("rewards_margins", last_step.get("reward_margins")) |
| | if last_margin is not None: |
| | lines.append(f"- **์ต์ข
Reward Margin**: {last_margin:.4f}") |
| | lines.append("") |
| | else: |
| | lines.append("ํ์ต ๊ณก์ ๋ฐ์ดํฐ ์์\n") |
| |
|
| | |
| | |
| | |
| | lines.append("## 3. Perplexity ๋น๊ต (์ง์ ๋ณด์กด)\n") |
| | lines.append("| ๋ฐ์ดํฐ์
| Base PPL | SFT PPL | ORPO PPL | SFT Forgetting | ORPO Forgetting |") |
| | lines.append("|---------|---------|---------|---------|----------------|-----------------|") |
| |
|
| | base_ppl = base_p1.get("perplexity", {}) |
| | sft_ppl = sft_p1.get("perplexity", {}) |
| | orpo_ppl = orpo_p1.get("perplexity", {}) |
| |
|
| | all_ppl_names = sorted(set( |
| | list(base_ppl.keys()) + list(sft_ppl.keys()) + list(orpo_ppl.keys()) |
| | )) |
| | for name in all_ppl_names: |
| | base_val = base_ppl.get(name, {}).get("ppl") if isinstance(base_ppl.get(name), dict) else None |
| | if base_val is None: |
| | base_val = _BASE_PPL_REFERENCE.get(name) |
| | sft_val = sft_ppl.get(name, {}).get("ppl") if isinstance(sft_ppl.get(name), dict) else None |
| | orpo_val = orpo_ppl.get(name, {}).get("ppl") if isinstance(orpo_ppl.get(name), dict) else None |
| |
|
| | sft_forg = f"{(sft_val - base_val) / base_val * 100:+.1f}%" if (sft_val is not None and base_val is not None and base_val > 0) else "โ" |
| | orpo_forg = f"{(orpo_val - base_val) / base_val * 100:+.1f}%" if (orpo_val is not None and base_val is not None and base_val > 0) else "โ" |
| |
|
| | lines.append( |
| | f"| {name} | {_fmt_f(base_val)} | {_fmt_f(sft_val)} | {_fmt_f(orpo_val)} | " |
| | f"{sft_forg} | {orpo_forg} |" |
| | ) |
| | lines.append("") |
| |
|
| | |
| | |
| | |
| | lines.append("## 4. ์์ฑ ํ์ง ๋น๊ต\n") |
| |
|
| | base_gen_summary = base_p1.get("generation", {}).get("summary", {}) |
| | sft_gen_summary = sft_p1.get("generation", {}).get("summary", {}) |
| | orpo_gen_summary = orpo_p1.get("generation", {}).get("summary", {}) |
| |
|
| | base_3gram = base_gen_summary.get("greedy_avg_3gram_rep", _BASE_GEN_REFERENCE.get("greedy_3gram_rep")) |
| | sft_3gram = sft_gen_summary.get("greedy_avg_3gram_rep") |
| | orpo_3gram = orpo_gen_summary.get("greedy_avg_3gram_rep") |
| |
|
| | base_4gram = base_gen_summary.get("greedy_avg_4gram_rep", _BASE_GEN_REFERENCE.get("greedy_4gram_rep")) |
| | sft_4gram = sft_gen_summary.get("greedy_avg_4gram_rep") |
| | orpo_4gram = orpo_gen_summary.get("greedy_avg_4gram_rep") |
| |
|
| | base_eos = base_gen_summary.get("greedy_eos_rate", _BASE_GEN_REFERENCE.get("greedy_eos_rate")) |
| | sft_eos_val = sft_gen_summary.get("greedy_eos_rate") |
| | orpo_eos_val = orpo_gen_summary.get("greedy_eos_rate") |
| |
|
| | lines.append("| ์งํ | Base | SFT | ORPO | SFTโORPO ๋ณํ |") |
| | lines.append("|------|------|-----|------|---------------|") |
| |
|
| | |
| | sft_orpo_3gram_diff = "" |
| | if sft_3gram is not None and orpo_3gram is not None: |
| | d = (orpo_3gram - sft_3gram) * 100 |
| | sft_orpo_3gram_diff = f"{'+' if d >= 0 else ''}{d:.1f}pp" |
| | lines.append(f"| Greedy 3-gram ๋ฐ๋ณต๋ฅ | {_fmt_pct(base_3gram)} | {_fmt_pct(sft_3gram)} | " |
| | f"{_fmt_pct(orpo_3gram)} | {sft_orpo_3gram_diff} |") |
| |
|
| | |
| | sft_orpo_4gram_diff = "" |
| | if sft_4gram is not None and orpo_4gram is not None: |
| | d = (orpo_4gram - sft_4gram) * 100 |
| | sft_orpo_4gram_diff = f"{'+' if d >= 0 else ''}{d:.1f}pp" |
| | lines.append(f"| Greedy 4-gram ๋ฐ๋ณต๋ฅ | {_fmt_pct(base_4gram)} | {_fmt_pct(sft_4gram)} | " |
| | f"{_fmt_pct(orpo_4gram)} | {sft_orpo_4gram_diff} |") |
| |
|
| | |
| | sft_orpo_eos_diff = "" |
| | if sft_eos_val is not None and orpo_eos_val is not None: |
| | d = (orpo_eos_val - sft_eos_val) * 100 |
| | sft_orpo_eos_diff = f"{'+' if d >= 0 else ''}{d:.1f}pp" |
| | lines.append(f"| EOS ์ข
๋ฃ์จ | {_fmt_pct(base_eos)} | {_fmt_pct(sft_eos_val)} | " |
| | f"{_fmt_pct(orpo_eos_val)} | {sft_orpo_eos_diff} |") |
| | lines.append("") |
| |
|
| | |
| | |
| | |
| | lines.append("## 5. ํ๊ตญ์ด ๋ฒค์น๋งํฌ\n") |
| |
|
| | |
| | lines.append("### KoBEST (0-shot)\n") |
| | lines.append("| ํ์คํฌ | Base | SFT | ORPO | BaseโORPO |") |
| | lines.append("|--------|------|-----|------|-----------|") |
| |
|
| | kobest_tasks = ["kobest_boolq", "kobest_copa", "kobest_hellaswag", |
| | "kobest_sentineg", "kobest_wic"] |
| | base_kobest_accs, sft_kobest_accs, orpo_kobest_accs = [], [], [] |
| |
|
| | for t in kobest_tasks: |
| | base_a = _get_acc(base_zero.get(t, {})) if t in base_zero else _BASE_BENCH_REFERENCE.get(t) |
| | sft_a = _get_acc(sft_zero.get(t, {})) if t in sft_zero else None |
| | orpo_a = _get_acc(orpo_zero.get(t, {})) if t in orpo_zero else None |
| |
|
| | if base_a is not None: |
| | base_kobest_accs.append(base_a) |
| | if sft_a is not None: |
| | sft_kobest_accs.append(sft_a) |
| | if orpo_a is not None: |
| | orpo_kobest_accs.append(orpo_a) |
| |
|
| | diff = "" |
| | if orpo_a is not None and base_a is not None: |
| | d = (orpo_a - base_a) * 100 |
| | diff = f"{'+' if d >= 0 else ''}{d:.1f}pp" |
| |
|
| | lines.append(f"| {t} | {_fmt_pct(base_a)} | {_fmt_pct(sft_a)} | {_fmt_pct(orpo_a)} | {diff} |") |
| |
|
| | |
| | base_kavg = sum(base_kobest_accs) / len(base_kobest_accs) if base_kobest_accs else None |
| | sft_kavg = sum(sft_kobest_accs) / len(sft_kobest_accs) if sft_kobest_accs else None |
| | orpo_kavg = sum(orpo_kobest_accs) / len(orpo_kobest_accs) if orpo_kobest_accs else None |
| | avg_diff = "" |
| | if orpo_kavg is not None and base_kavg is not None: |
| | d = (orpo_kavg - base_kavg) * 100 |
| | avg_diff = f"{'+' if d >= 0 else ''}{d:.1f}pp" |
| | lines.append(f"| **ํ๊ท ** | **{_fmt_pct(base_kavg)}** | **{_fmt_pct(sft_kavg)}** | " |
| | f"**{_fmt_pct(orpo_kavg)}** | **{avg_diff}** |") |
| | lines.append("") |
| |
|
| | |
| | lines.append("### HAE-RAE (0-shot)\n") |
| | base_haerae = _get_acc(base_zero.get("haerae", {})) if "haerae" in base_zero else _BASE_BENCH_REFERENCE.get("haerae") |
| | sft_haerae = _get_acc(sft_zero.get("haerae", {})) if "haerae" in sft_zero else None |
| | orpo_haerae = _get_acc(orpo_zero.get("haerae", {})) if "haerae" in orpo_zero else None |
| | lines.append(f"- Base: {_fmt_pct(base_haerae)} โ SFT: {_fmt_pct(sft_haerae)} โ ORPO: {_fmt_pct(orpo_haerae)}") |
| | lines.append("") |
| |
|
| | |
| | lines.append("### MMLU-KO (0-shot)\n") |
| | base_mmlu_ko = _get_acc(base_zero.get("global_mmlu_ko", {})) if "global_mmlu_ko" in base_zero else _BASE_BENCH_REFERENCE.get("global_mmlu_ko") |
| | sft_mmlu_ko = _get_acc(sft_zero.get("global_mmlu_ko", {})) if "global_mmlu_ko" in sft_zero else None |
| | orpo_mmlu_ko = _get_acc(orpo_zero.get("global_mmlu_ko", {})) if "global_mmlu_ko" in orpo_zero else None |
| | lines.append(f"- Base: {_fmt_pct(base_mmlu_ko)} โ SFT: {_fmt_pct(sft_mmlu_ko)} โ ORPO: {_fmt_pct(orpo_mmlu_ko)}") |
| | lines.append("") |
| |
|
| | |
| | |
| | |
| | lines.append("## 6. ์์ด ๋ฒค์น๋งํฌ\n") |
| | lines.append("| ํ์คํฌ | Base | SFT | ORPO | BaseโORPO |") |
| | lines.append("|--------|------|-----|------|-----------|") |
| |
|
| | en_tasks_list = ["hellaswag", "arc_easy", "arc_challenge", "winogrande", "piqa"] |
| | for t in en_tasks_list: |
| | prefer_norm = t in ["hellaswag", "arc_challenge"] |
| | base_a = _get_acc(base_zero.get(t, {}), prefer_norm=prefer_norm) if t in base_zero else _BASE_BENCH_REFERENCE.get(t) |
| | sft_a = _get_acc(sft_zero.get(t, {}), prefer_norm=prefer_norm) if t in sft_zero else None |
| | orpo_a = _get_acc(orpo_zero.get(t, {}), prefer_norm=prefer_norm) if t in orpo_zero else None |
| |
|
| | diff = "" |
| | if orpo_a is not None and base_a is not None: |
| | d = (orpo_a - base_a) * 100 |
| | diff = f"{'+' if d >= 0 else ''}{d:.1f}pp" |
| | lines.append(f"| {t} | {_fmt_pct(base_a)} | {_fmt_pct(sft_a)} | {_fmt_pct(orpo_a)} | {diff} |") |
| |
|
| | |
| | _MMLU_EN_GROUPS = {"mmlu", "mmlu_humanities", "mmlu_social_sciences", "mmlu_stem", "mmlu_other"} |
| |
|
| | def _mmlu_en_avg(zero: dict) -> Optional[float]: |
| | accs = [] |
| | for t, m in zero.items(): |
| | if (t.startswith("mmlu_") or t == "mmlu") and t not in _MMLU_EN_GROUPS: |
| | a = _get_acc(m) |
| | if a is not None: |
| | accs.append(a) |
| | if not accs: |
| | for t in _MMLU_EN_GROUPS: |
| | if t in zero: |
| | a = _get_acc(zero[t]) |
| | if a is not None: |
| | accs.append(a) |
| | return sum(accs) / len(accs) if accs else None |
| |
|
| | base_mmlu_en = _mmlu_en_avg(base_zero) |
| | sft_mmlu_en = _mmlu_en_avg(sft_zero) |
| | orpo_mmlu_en = _mmlu_en_avg(orpo_zero) |
| |
|
| | mmlu_en_diff = "" |
| | if orpo_mmlu_en is not None and base_mmlu_en is not None: |
| | d = (orpo_mmlu_en - base_mmlu_en) * 100 |
| | mmlu_en_diff = f"{'+' if d >= 0 else ''}{d:.1f}pp" |
| | lines.append(f"| MMLU-EN ํ๊ท | {_fmt_pct(base_mmlu_en)} | {_fmt_pct(sft_mmlu_en)} | " |
| | f"{_fmt_pct(orpo_mmlu_en)} | {mmlu_en_diff} |") |
| | lines.append("") |
| |
|
| | |
| | |
| | |
| | lines.append("## 7. Calibration ๋น๊ต\n") |
| | lines.append("| ์งํ | Base | SFT | ORPO |") |
| | lines.append("|------|------|-----|------|") |
| |
|
| | base_cal = base_p1.get("calibration", {}) |
| | sft_cal = sft_p1.get("calibration", {}) |
| | orpo_cal = orpo_p1.get("calibration", {}) |
| |
|
| | cal_metrics = [ |
| | ("top1_accuracy", "Top-1 Accuracy"), |
| | ("top5_accuracy", "Top-5 Accuracy"), |
| | ("top10_accuracy", "Top-10 Accuracy"), |
| | ] |
| | for key, label in cal_metrics: |
| | base_v = base_cal.get(key, _BASE_CALIB_REFERENCE.get(key)) |
| | sft_v = sft_cal.get(key) |
| | orpo_v = orpo_cal.get(key) |
| | lines.append(f"| {label} | {_fmt_f(base_v)} | {_fmt_f(sft_v)} | {_fmt_f(orpo_v)} |") |
| | lines.append("") |
| |
|
| | |
| | |
| | |
| | lines.append("## 8. ORPO ๊ณ ์ ์งํ\n") |
| |
|
| | |
| | if training_curve and training_curve.get("eval_steps"): |
| | last_step = training_curve["eval_steps"][-1] |
| | final_pref = last_step.get("rewards_accuracies", last_step.get("preference_accuracy")) |
| | final_margin = last_step.get("rewards_margins", last_step.get("reward_margins")) |
| | if final_pref is not None: |
| | lines.append(f"- **์ต์ข
Preference Accuracy**: {final_pref:.2%}") |
| | if final_margin is not None: |
| | lines.append(f"- **์ต์ข
Reward Margins**: {final_margin:.4f}") |
| | else: |
| | lines.append("- Preference Accuracy / Reward Margins: ๋ฐ์ดํฐ ์์") |
| |
|
| | |
| | rep_grid = orpo_p1.get("repetition", {}).get("grid_results") |
| | if rep_grid: |
| | items = rep_grid if isinstance(rep_grid, list) else list(rep_grid.values()) |
| | for r in items: |
| | if isinstance(r, dict): |
| | rp = r.get("repetition_penalty", r.get("rep_penalty")) |
| | if rp is not None and abs(float(rp) - 1.0) < 1e-6: |
| | rep_val = r.get("avg_3gram_rep", r.get("3gram_repetition")) |
| | if rep_val is not None: |
| | verdict = "PASS" if rep_val < 0.05 else "FAIL" |
| | lines.append(f"- **Parameter Sensitivity**: rep_penalty=1.0 โ 3-gram rep={rep_val:.2%} " |
| | f"(๋ชฉํ < 5%) โ {verdict}") |
| | break |
| | lines.append("") |
| |
|
| | |
| | |
| | |
| | lines.append("## 9. ๋ฐ๋ณต๋ฅ ๊ทธ๋ฆฌ๋ ์์น\n") |
| | if rep_grid: |
| | items = rep_grid if isinstance(rep_grid, list) else list(rep_grid.values()) |
| | rep_rows = [] |
| | for r in items: |
| | if isinstance(r, dict): |
| | rep_rows.append({ |
| | "config": r.get("params", "?"), |
| | "temp": r.get("temperature"), |
| | "rep_pen": r.get("repetition_penalty"), |
| | "3gram": r.get("avg_3gram_rep", r.get("3gram_repetition", float("inf"))), |
| | "4gram": r.get("avg_4gram_rep", r.get("4gram_repetition")), |
| | "eos_rate": r.get("eos_rate"), |
| | "avg_tokens": r.get("avg_tokens"), |
| | }) |
| | rep_rows.sort(key=lambda x: x["3gram"] if isinstance(x["3gram"], (int, float)) else float("inf")) |
| |
|
| | lines.append("| ์ค์ | Temp | Rep Pen | 3-gram | 4-gram | EOS Rate | Avg Tokens |") |
| | lines.append("|------|------|---------|--------|--------|----------|-----------|") |
| | for i, r in enumerate(rep_rows): |
| | marker = " **โ best**" if i == 0 else "" |
| | lines.append( |
| | f"| {r['config']} | {_fmt_f(r['temp'], 2)} | {_fmt_f(r['rep_pen'], 2)} | " |
| | f"{_fmt_f(r['3gram'])} | {_fmt_f(r['4gram'])} | " |
| | f"{_fmt_f(r['eos_rate'])} | {_fmt_f(r['avg_tokens'], 1)} |{marker}" |
| | ) |
| | lines.append("") |
| | else: |
| | lines.append("๋ฐ๋ณต๋ฅ ๊ทธ๋ฆฌ๋ ์์น ๋ฐ์ดํฐ ์์\n") |
| |
|
| | |
| | |
| | |
| | lines.append("## 10. ์์ฑ ์ํ\n") |
| | orpo_gen = orpo_p1.get("generation", {}) |
| | orpo_samples = orpo_gen.get("samples", []) |
| | greedy_samples = [s for s in orpo_samples if isinstance(s, dict) and s.get("temperature", 1.0) == 0.0] |
| | if not greedy_samples: |
| | greedy_samples = orpo_samples |
| |
|
| | if greedy_samples: |
| | lines.append("### ORPO Greedy ์์ฑ ์ํ\n") |
| | for i, s in enumerate(greedy_samples[:15], 1): |
| | if isinstance(s, dict): |
| | prompt = s.get("prompt", "") |
| | text = s.get("text", s.get("generated_text", "")) |
| | if len(text) > 500: |
| | text = text[:500] + "..." |
| | hit_eos = s.get("hit_eos", "?") |
| | rep3 = s.get("3gram_rep", s.get("avg_3gram_rep")) |
| | tokens = s.get("generated_tokens", s.get("num_tokens", "?")) |
| | lines.append(f"**[{i}]** `{prompt}`") |
| | lines.append(f"> {text}") |
| | meta_parts = [f"EOS={hit_eos}"] |
| | if rep3 is not None: |
| | meta_parts.append(f"3gram_rep={rep3:.2%}") |
| | meta_parts.append(f"tokens={tokens}") |
| | lines.append(f"> *{', '.join(meta_parts)}*\n") |
| | else: |
| | lines.append("์์ฑ ์ํ ๋ฐ์ดํฐ ์์\n") |
| |
|
| | |
| | |
| | |
| | lines.append("## 11. ์ต์ข
ํ์ \n") |
| | lines.append("### ๋ฐฐํฌ ๊ธฐ์ค ์ถฉ์กฑ ์ฌ๋ถ\n") |
| | lines.append("| ์กฐ๊ฑด | ๊ธฐ์ค | ํ์ฌ ๊ฐ | ์ถฉ์กฑ |") |
| | lines.append("|------|------|---------|------|") |
| |
|
| | criteria = [ |
| | ("Greedy 3-gram ๋ฐ๋ณต๋ฅ ", "< 5%", _fmt_pct(orpo_rep), |
| | "YES" if orpo_rep is not None and orpo_rep < 0.05 else "NO"), |
| | ("EOS ์ข
๋ฃ์จ", "> 90%", _fmt_pct(orpo_eos), |
| | "YES" if orpo_eos is not None and orpo_eos > 0.90 else "NO"), |
| | ("PPL Forgetting", "< 5%", f"{orpo_forgetting:.1f}%" if orpo_forgetting is not None else "N/A", |
| | "YES" if orpo_forgetting is not None and orpo_forgetting < 5.0 else "NO"), |
| | ("KoBEST ํ๊ท ", ">= 43%", _fmt_pct(orpo_kobest), |
| | "YES" if orpo_kobest is not None and orpo_kobest >= 0.43 else "NO"), |
| | ] |
| | for cond, threshold, current, met in criteria: |
| | lines.append(f"| {cond} | {threshold} | {current} | {met} |") |
| | lines.append("") |
| |
|
| | if deploy_criteria_met: |
| | lines.append("**โ ๋ชจ๋ ๋ฐฐํฌ ๊ธฐ์ค ์ถฉ์กฑ: DEPLOY (Phase 4: GGUF ๋ณํ + Ollama ๋ฐฐํฌ ์งํ)**\n") |
| | else: |
| | lines.append("**โ ๋ฐฐํฌ ๊ธฐ์ค ๋ฏธ๋ฌ: RETRY (ORPO ์ฌํ์ต ๋๋ ํ์ดํผํ๋ผ๋ฏธํฐ ์กฐ์ ํ์)**\n") |
| |
|
| | lines.append("---\n") |
| | lines.append("*์ด ๋ณด๊ณ ์๋ `eval/report_generator.py::generate_three_way_report()`์ ์ํด ์๋ ์์ฑ๋์์ต๋๋ค.*") |
| |
|
| | report_text = "\n".join(lines) |
| | output_path.write_text(report_text, encoding="utf-8") |
| |
|
| | |
| | if orpo_output_dir: |
| | orpo_output_dir = Path(orpo_output_dir) |
| | orpo_output_dir.mkdir(parents=True, exist_ok=True) |
| | (orpo_output_dir / "orpo_three_way_report.md").write_text(report_text, encoding="utf-8") |
| |
|
| | return output_path |
| |
|
| |
|
if __name__ == "__main__":
    # This module is a library; running it directly just points the user at
    # the pipeline entry scripts that drive report generation.
    _usage = "report_generator.py โ use via full_eval_pipeline.py or sft_eval_pipeline.py"
    print(_usage)
| |
|