# frankenstallm/source/eval/report_generator.py
"""
Markdown report generator for FRANKENSTALLM 3B evaluation pipeline.
Generates comprehensive evaluation reports with sections for:
- Perplexity metrics across datasets
- Calibration statistics
- Token NLL distribution
- Generation quality samples
- Repetition parameter search results
- Standard benchmark results (lm-eval) — Korean + English
- 0-shot vs 5-shot comparison
- Comparison with reference models
"""
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Any, Tuple
import json
import logging
logger = logging.getLogger(__name__)
def _fmt_seconds(seconds: float) -> str:
"""Format seconds into a human-readable duration string."""
m, s = divmod(int(seconds), 60)
h, m = divmod(m, 60)
if h:
return f"{h}h {m}m {s}s"
if m:
return f"{m}m {s}s"
return f"{s}s"
# =========================================================================
# Normalization helpers โ€” map GPU-label keys to logical sections
# =========================================================================
def _normalize_phase1_results(raw: dict) -> dict:
"""Convert GPU-labelled phase1_results into logical sections.
Returns dict with keys: perplexity, calibration, token_nll, generation, repetition.
"""
normalized: Dict[str, Any] = {
"perplexity": {},
"calibration": {},
"token_nll": {},
"generation": {},
"repetition": {},
}
for label, data in raw.items():
if not isinstance(data, (dict, list)):
continue
if "PPL" in label:
# PPL entries: single dict or list of dicts
if isinstance(data, dict) and "ppl" in data:
name = data.get("name", label)
normalized["perplexity"][name] = data
elif isinstance(data, list):
for item in data:
if isinstance(item, dict) and "ppl" in item:
name = item.get("name", f"unknown_{len(normalized['perplexity'])}")
normalized["perplexity"][name] = item
elif isinstance(data, dict) and "error" in data:
# Task failed โ€” skip
pass
elif "Calibration" in label:
if isinstance(data, dict):
if "calibration" in data:
normalized["calibration"] = data["calibration"]
if "token_nll" in data:
normalized["token_nll"] = data["token_nll"]
elif "Generation" in label:
if isinstance(data, dict):
normalized["generation"] = data
elif "Repetition" in label:
if isinstance(data, dict):
normalized["repetition"] = data
return normalized
def _normalize_phase2_results(raw: dict) -> Tuple[Dict[str, Any], Dict[str, Any]]:
"""Convert GPU-labelled phase2_results into flat task dicts for 0-shot and 5-shot.
Returns (zero_shot_metrics, five_shot_metrics) where each is:
{"kobest_boolq": {"acc,none": 0.50, ...}, "haerae": {...}, ...}
"""
zero_shot: Dict[str, Any] = {}
five_shot: Dict[str, Any] = {}
for label, data in raw.items():
if label == "5shot":
# Recurse into 5-shot sub-dict
if isinstance(data, dict):
for sub_label, sub_data in data.items():
if isinstance(sub_data, dict) and "per_task_metrics" in sub_data:
for task_name, metrics in sub_data["per_task_metrics"].items():
five_shot[task_name] = metrics
continue
if isinstance(data, dict) and "per_task_metrics" in data:
for task_name, metrics in data["per_task_metrics"].items():
zero_shot[task_name] = metrics
return zero_shot, five_shot
def _get_acc(metrics: dict, prefer_norm: bool = False) -> Optional[float]:
"""Extract accuracy from lm-eval metrics dict."""
if prefer_norm and "acc_norm,none" in metrics:
val = metrics["acc_norm,none"]
if isinstance(val, (int, float)):
return float(val)
if "acc,none" in metrics:
val = metrics["acc,none"]
if isinstance(val, (int, float)):
return float(val)
return None
def _fmt_pct(val: Optional[float]) -> str:
"""Format as percentage string or N/A."""
if val is None:
return "N/A"
return f"{val * 100:.2f}%"
def _fmt_f(val, decimals: int = 4) -> str:
"""Format float or return N/A."""
if isinstance(val, (int, float)):
return f"{val:.{decimals}f}"
return str(val) if val is not None else "N/A"
# =========================================================================
# Main report generator
# =========================================================================
def generate_report(
    phase1_results: dict,
    phase2_results: dict,
    generation_samples: list,
    output_dir: Path,
    checkpoint_name: str = "checkpoint-0057000",
    total_elapsed_sec: float = 0.0,
) -> str:
    """Generate the full markdown evaluation report and write it to disk.

    Normalizes the GPU-labelled result dicts from full_eval_pipeline.py,
    writes one markdown file per section under <output_dir>/reports/,
    plus a combined full_eval_report.md in output_dir.

    Returns:
        The combined report as a single markdown string.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    reports_dir = output_dir / "reports"
    reports_dir.mkdir(parents=True, exist_ok=True)

    # Normalize the raw GPU-labelled structures into logical sections.
    p1 = _normalize_phase1_results(phase1_results)
    zero_shot, five_shot = _normalize_phase2_results(phase2_results)
    eval_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Build every section once; reuse for both the per-section files and
    # the combined report.
    sections = [
        ("00_executive_summary.md", _generate_executive_summary(
            p1, zero_shot, five_shot, checkpoint_name, eval_datetime, total_elapsed_sec)),
        ("01_perplexity_report.md", _generate_perplexity_report(p1["perplexity"])),
        ("02_calibration_report.md", _generate_calibration_report(p1["calibration"], p1["token_nll"])),
        ("03_generation_quality.md", _generate_generation_report(p1["generation"], generation_samples)),
        ("04_benchmark_report.md", _generate_benchmark_report(zero_shot, five_shot, p1["repetition"])),
    ]
    for filename, content in sections:
        (reports_dir / filename).write_text(content, encoding="utf-8")

    full_report = "\n\n---\n\n".join(content for _, content in sections)
    (output_dir / "full_eval_report.md").write_text(full_report, encoding="utf-8")
    return full_report
# =========================================================================
# Individual report sections
# =========================================================================
def _generate_executive_summary(
    p1: dict,
    zero_shot: dict,
    five_shot: dict,
    checkpoint_name: str,
    eval_datetime: str,
    total_elapsed_sec: float,
) -> str:
    """Build the executive-summary markdown section.

    Aggregates headline metrics (main validation PPL, MMLU-KO/EN averages,
    KoBEST average, HAE-RAE, English 0-shot benchmarks, calibration top-1)
    plus a comparison table against published reference models.
    """
    lines = [
        "# FRANKENSTALLM 3B ์ข…ํ•ฉ ํ‰๊ฐ€ ๋ฆฌํฌํŠธ\n",
        "- **๋ชจ๋ธ**: FRANKENSTALLM 3B",
        f"- **์ฒดํฌํฌ์ธํŠธ**: {checkpoint_name}",
        f"- **ํ‰๊ฐ€ ์ผ์‹œ**: {eval_datetime}",
        # FIX: use _fmt_seconds for consistency with generate_comparison_report.
        f"- **์ด ์†Œ์š” ์‹œ๊ฐ„**: {_fmt_seconds(total_elapsed_sec)}\n",
        "## Executive Summary\n",
    ]
    # Main PPL.
    # FIX: prefer the validation split "3b_val" first so the value matches
    # the "(3b_val)" table label below; fall back to "3b".
    main_ppl = "N/A"
    ppl_data = p1.get("perplexity", {})
    for name in ["3b_val", "3b"]:
        if name in ppl_data and isinstance(ppl_data[name], dict):
            main_ppl = _fmt_f(ppl_data[name].get("ppl"))
            break
    # KoBEST average over whichever tasks produced a numeric accuracy.
    kobest_tasks = ["kobest_boolq", "kobest_copa", "kobest_hellaswag",
                    "kobest_sentineg", "kobest_wic"]
    kobest_accs = []
    for t in kobest_tasks:
        if t in zero_shot:
            a = _get_acc(zero_shot[t])
            if a is not None:
                kobest_accs.append(a)
    kobest_avg = _fmt_pct(sum(kobest_accs) / len(kobest_accs)) if kobest_accs else "N/A"
    # MMLU-KO -- prefer group-level weighted average from lm-eval
    mmlu_ko_avg = "N/A"
    mmlu_ko_count = 0
    if "global_mmlu_ko" in zero_shot:
        a = _get_acc(zero_shot["global_mmlu_ko"])
        if a is not None:
            mmlu_ko_avg = _fmt_pct(a)
        # Count subtasks for display
        mmlu_ko_count = sum(
            1 for t in zero_shot
            if t.startswith("global_mmlu_ko_") and _get_acc(zero_shot[t]) is not None
        )
        if mmlu_ko_count == 0:
            mmlu_ko_count = 1  # group-level only
    else:
        # Fallback: average subtask-level metrics
        mmlu_ko_accs = []
        for t, m in zero_shot.items():
            if t.startswith("global_mmlu_ko_"):
                a = _get_acc(m)
                if a is not None:
                    mmlu_ko_accs.append(a)
        if mmlu_ko_accs:
            mmlu_ko_avg = _fmt_pct(sum(mmlu_ko_accs) / len(mmlu_ko_accs))
            mmlu_ko_count = len(mmlu_ko_accs)
    # MMLU-EN -- exclude group-level keys to avoid double-counting
    _MMLU_EN_GROUPS = {"mmlu", "mmlu_humanities", "mmlu_social_sciences", "mmlu_stem", "mmlu_other"}
    mmlu_en_accs = []
    for t, m in zero_shot.items():
        if (t.startswith("mmlu_") or t == "mmlu") and t not in _MMLU_EN_GROUPS:
            a = _get_acc(m)
            if a is not None:
                mmlu_en_accs.append(a)
    if not mmlu_en_accs:
        # Fallback to group-level if no subtasks
        for t in _MMLU_EN_GROUPS:
            if t in zero_shot:
                a = _get_acc(zero_shot[t])
                if a is not None:
                    mmlu_en_accs.append(a)
    mmlu_en_avg = _fmt_pct(sum(mmlu_en_accs) / len(mmlu_en_accs)) if mmlu_en_accs else "N/A"
    # HAE-RAE
    haerae_acc = "N/A"
    if "haerae" in zero_shot:
        a = _get_acc(zero_shot["haerae"])
        if a is not None:
            haerae_acc = _fmt_pct(a)
    # English benchmarks (acc_norm preferred where lm-eval reports it)
    en_benchmarks = {}
    for t in ["hellaswag", "arc_easy", "arc_challenge", "winogrande", "piqa"]:
        if t in zero_shot:
            a = _get_acc(zero_shot[t], prefer_norm=(t in ["hellaswag", "arc_challenge"]))
            if a is not None:
                en_benchmarks[t] = a
    # Top-1 accuracy
    top1 = _fmt_f(p1.get("calibration", {}).get("top1_accuracy"))
    lines.append("| ๋ฉ”ํŠธ๋ฆญ | ๊ฐ’ |")
    lines.append("|--------|-----|")
    lines.append(f"| ์ฃผ์š” PPL (3b_val) | {main_ppl} |")
    lines.append(f"| MMLU-KO ํ‰๊ท  ({mmlu_ko_count}๊ณผ๋ชฉ) | {mmlu_ko_avg} |")
    lines.append(f"| MMLU-EN ํ‰๊ท  | {mmlu_en_avg} |")
    lines.append(f"| KoBEST ํ‰๊ท  ({len(kobest_accs)}ํƒœ์Šคํฌ) | {kobest_avg} |")
    lines.append(f"| HAE-RAE | {haerae_acc} |")
    for t, a in en_benchmarks.items():
        lines.append(f"| {t} (0-shot) | {_fmt_pct(a)} |")
    lines.append(f"| Top-1 ์ •ํ™•๋„ (Calibration) | {top1} |")
    lines.append("")
    # Reference comparison (static literature/vendor numbers)
    lines.append("## ์ฐธ๊ณ  ๋ชจ๋ธ ๋น„๊ต\n")
    lines.append("| ๋ชจ๋ธ | ํŒŒ๋ผ๋ฏธํ„ฐ | MMLU-KO | MMLU-EN | KoBEST ํ‰๊ท  | PPL |")
    lines.append("|------|---------|---------|---------|------------|-----|")
    lines.append(f"| **FRANKENSTALLM 3B** | 3B | {mmlu_ko_avg} | {mmlu_en_avg} | {kobest_avg} | {main_ppl} |")
    lines.append("| Llama-3.2-3B | 3B | ~42% | ~58% | ~55% | โ€” |")
    lines.append("| Qwen2.5-3B | 3B | ~48% | ~65% | ~60% | โ€” |")
    lines.append("| EXAONE-3.5-2.4B | 2.4B | ~35% | ~50% | ~50% | โ€” |")
    lines.append("")
    return "\n".join(lines)
def _generate_perplexity_report(ppl_data: dict) -> str:
    """Build the markdown perplexity section from normalized PPL entries."""
    lines = ["# Perplexity ํ‰๊ฐ€\n"]
    if not ppl_data:
        lines.append("๋ฐ์ดํ„ฐ ์—†์Œ\n")
        return "\n".join(lines)

    rows = [
        {
            "name": name,
            "ppl": metrics.get("ppl"),
            "bits": metrics.get("bits_per_token"),
            "n_tokens": metrics.get("n_tokens"),
            "n_eval": metrics.get("n_eval_tokens"),
            "elapsed": metrics.get("elapsed_sec"),
        }
        for name, metrics in ppl_data.items()
        if isinstance(metrics, dict) and "ppl" in metrics
    ]

    def _ppl_key(row):
        value = row["ppl"]
        return value if isinstance(value, (int, float)) else float("inf")

    # Highest perplexity first; non-numeric entries sort to the top.
    rows.sort(key=_ppl_key, reverse=True)

    lines.append("| ๋ฐ์ดํ„ฐ์…‹ | PPL | Bits/Token | ์ „์ฒด ํ† ํฐ | ํ‰๊ฐ€ ํ† ํฐ | ์†Œ์š” ์‹œ๊ฐ„ |")
    lines.append("|---------|-----|-----------|---------|---------|---------|")
    for row in rows:
        # Thousands separators only when both token counts are numeric.
        if isinstance(row["n_tokens"], (int, float)) and isinstance(row["n_eval"], (int, float)):
            tokens_cell, eval_cell = f"{row['n_tokens']:,}", f"{row['n_eval']:,}"
        else:
            tokens_cell, eval_cell = f"{row['n_tokens']}", f"{row['n_eval']}"
        lines.append(
            f"| {row['name']} | {_fmt_f(row['ppl'])} | {_fmt_f(row['bits'])} | "
            f"{tokens_cell} | {eval_cell} | {_fmt_f(row['elapsed'], 1)}s |"
        )
    lines.append("")
    return "\n".join(lines)
def _generate_calibration_report(cal_data: dict, nll_data: dict) -> str:
    """Build the markdown calibration + token-NLL distribution section."""
    lines = ["# Calibration ๋ฐ Token NLL ๋ถ„์„\n"]

    # --- Calibration metrics ---
    lines.append("## Calibration ๊ฒฐ๊ณผ\n")
    if cal_data:
        lines += ["| ๋ฉ”ํŠธ๋ฆญ | ๊ฐ’ |", "|--------|-----|"]
        for key, label in (
            ("top1_accuracy", "Top-1 Accuracy"),
            ("top5_accuracy", "Top-5 Accuracy"),
            ("top10_accuracy", "Top-10 Accuracy"),
            ("mean_correct_prob", "Mean Correct Prob"),
            ("mean_entropy", "Mean Entropy"),
        ):
            lines.append(f"| {label} | {_fmt_f(cal_data.get(key))} |")
        lines.append("")
    else:
        lines.append("๋ฐ์ดํ„ฐ ์—†์Œ\n")

    # --- Token NLL distribution ---
    lines.append("## Token NLL ๋ถ„ํฌ\n")
    if nll_data:
        # Summary stats may be stored under "nll_*" or bare names.
        lines += ["| ํ†ต๊ณ„ | ๊ฐ’ |", "|------|-----|"]
        for candidates, label in (
            (("nll_mean", "mean"), "ํ‰๊ท "),
            (("nll_std", "std"), "ํ‘œ์ค€ํŽธ์ฐจ"),
            (("nll_median", "median"), "์ค‘์•™๊ฐ’"),
            (("nll_min", "min"), "์ตœ์†Ÿ๊ฐ’"),
            (("nll_max", "max"), "์ตœ๋Œ“๊ฐ’"),
        ):
            value = next((nll_data[c] for c in candidates if c in nll_data), None)
            lines.append(f"| {label} | {_fmt_f(value)} |")
        lines.append("")

        # Percentiles: "nll_percentiles" (dict) or "percentiles" (dict)
        pct_data = nll_data.get("nll_percentiles", nll_data.get("percentiles"))
        if pct_data and isinstance(pct_data, dict):
            lines += ["### Percentiles\n", "| Percentile | ๊ฐ’ |", "|------------|-----|"]
            for pct, value in pct_data.items():
                lines.append(f"| {pct}th | {_fmt_f(value)} |")
            lines.append("")

        # High-loss fractions: a "high_loss_fractions" dict, or flat
        # "high_loss_fraction_N" keys as a fallback.
        hlf = nll_data.get("high_loss_fractions")
        if hlf and isinstance(hlf, dict):
            lines += ["### ๊ณ ์†์‹ค ํ† ํฐ ๋น„์œจ\n", "| ์ž„๊ณ„๊ฐ’ | ๋น„์œจ |", "|--------|-----|"]
            for threshold, fraction in hlf.items():
                lines.append(f"| NLL > {threshold} | {_fmt_f(fraction)} |")
            lines.append("")
        else:
            flat = {k.replace("high_loss_fraction_", ""): v
                    for k, v in nll_data.items()
                    if k.startswith("high_loss_fraction_")}
            if flat:
                lines += ["### ๊ณ ์†์‹ค ํ† ํฐ ๋น„์œจ\n", "| ์ž„๊ณ„๊ฐ’ | ๋น„์œจ |", "|--------|-----|"]
                for threshold, fraction in sorted(flat.items()):
                    lines.append(f"| NLL > {threshold} | {_fmt_f(fraction)} |")
                lines.append("")
    else:
        lines.append("๋ฐ์ดํ„ฐ ์—†์Œ\n")
    return "\n".join(lines)
def _generate_generation_report(gen_data: dict, samples: list) -> str:
    """Build the markdown generation-quality section (summary stats + up to 5 samples)."""
    lines = ["# ์ƒ์„ฑ ํ’ˆ์งˆ ๋ถ„์„\n"]
    if gen_data and "summary" in gen_data:
        lines += ["## ์š”์•ฝ ํ†ต๊ณ„\n", "| ๋ฉ”ํŠธ๋ฆญ | ๊ฐ’ |", "|--------|-----|"]
        for key, value in gen_data["summary"].items():
            display = key.replace("_", " ").title()
            lines.append(f"| {display} | {_fmt_f(value)} |")
        lines.append("")
    if samples:
        lines.append("## ์ƒ์„ฑ ์ƒ˜ํ”Œ (Greedy)\n")
        for idx, sample in enumerate(samples[:5], 1):
            if not isinstance(sample, dict):
                continue
            prompt = sample.get("prompt", "")
            generated = sample.get("generated_text", "")
            # Truncate long generations for report readability.
            if len(generated) > 300:
                generated = generated[:300] + "..."
            lines += [
                f"### ์ƒ˜ํ”Œ {idx}\n",
                f"**Prompt**: {prompt}\n",
                f"**Generated**: {generated}\n",
                "",
            ]
    elif not gen_data:
        lines.append("๋ฐ์ดํ„ฐ ์—†์Œ\n")
    return "\n".join(lines)
def _generate_benchmark_report(
    zero_shot: dict,
    five_shot: dict,
    repetition: dict,
) -> str:
    """Build the standard-benchmark markdown section.

    Covers Korean benchmarks (KoBEST, HAE-RAE, MMLU-KO), English benchmarks
    (core 0-shot tasks + MMLU-EN), a 0-shot vs 5-shot comparison, and the
    repetition parameter grid search.

    Args:
        zero_shot: Flat {task: metrics} dict of 0-shot lm-eval results.
        five_shot: Flat {task: metrics} dict of 5-shot lm-eval results.
        repetition: Repetition grid-search results (expects "grid_results").

    Returns:
        The section as a markdown string.
    """
    lines = ["# ํ‘œ์ค€ ๋ฒค์น˜๋งˆํฌ ๊ฒฐ๊ณผ\n"]
    if not zero_shot and not five_shot:
        lines.append("๋ฐ์ดํ„ฐ ์—†์Œ\n")
        return "\n".join(lines)
    # --- Korean Benchmarks ---
    lines.append("## ํ•œ๊ตญ์–ด ๋ฒค์น˜๋งˆํฌ\n")
    # KoBEST
    kobest_names = ["kobest_boolq", "kobest_copa", "kobest_hellaswag",
                    "kobest_sentineg", "kobest_wic"]
    kobest_0 = {t: zero_shot[t] for t in kobest_names if t in zero_shot}
    if kobest_0:
        lines.append("### KoBEST (0-shot)\n")
        lines.append("| ํƒœ์Šคํฌ | Accuracy | F1 |")
        lines.append("|--------|----------|-----|")
        for t in kobest_names:
            if t in kobest_0:
                m = kobest_0[t]
                acc = _fmt_pct(_get_acc(m))
                f1 = _fmt_f(m.get("f1,none"))
                lines.append(f"| {t} | {acc} | {f1} |")
        kobest_accs = [_get_acc(kobest_0[t]) for t in kobest_names
                       if t in kobest_0 and _get_acc(kobest_0[t]) is not None]
        if kobest_accs:
            lines.append(f"| **ํ‰๊ท ** | **{_fmt_pct(sum(kobest_accs)/len(kobest_accs))}** | |")
        lines.append("")
    # HAE-RAE
    if "haerae" in zero_shot:
        lines.append("### HAE-RAE (0-shot)\n")
        m = zero_shot["haerae"]
        lines.append(f"- Accuracy: {_fmt_pct(_get_acc(m))}")
        # Check for sub-tasks
        haerae_subs = {t: zero_shot[t] for t in zero_shot if t.startswith("haerae_") and t != "haerae"}
        if haerae_subs:
            lines.append("\n| ์„œ๋ธŒํƒœ์Šคํฌ | Accuracy |")
            lines.append("|-----------|----------|")
            for t, sm in sorted(haerae_subs.items()):
                lines.append(f"| {t} | {_fmt_pct(_get_acc(sm))} |")
        lines.append("")
    # MMLU-KO
    mmlu_ko_tasks = {t: zero_shot[t] for t in zero_shot
                     if t.startswith("global_mmlu_ko") and t != "global_mmlu_ko"}
    if mmlu_ko_tasks or "global_mmlu_ko" in zero_shot:
        lines.append("### MMLU-KO (0-shot)\n")
        if mmlu_ko_tasks:
            lines.append(f"ํ‰๊ฐ€๋œ ๊ณผ๋ชฉ ์ˆ˜: **{len(mmlu_ko_tasks)}**\n")
            accs = [(t, _get_acc(m)) for t, m in sorted(mmlu_ko_tasks.items())
                    if _get_acc(m) is not None]
            if accs:
                # Prefer group-level weighted average from lm-eval
                group_acc = _get_acc(zero_shot["global_mmlu_ko"]) if "global_mmlu_ko" in zero_shot else None
                avg_acc = group_acc if group_acc is not None else sum(a for _, a in accs) / len(accs)
                lines.append(f"์ „์ฒด ํ‰๊ท : **{_fmt_pct(avg_acc)}**\n")
                # Top 10 / bottom 10 subjects
                accs_sorted = sorted(accs, key=lambda x: x[1], reverse=True)
                lines.append("**์ƒ์œ„ 10๊ฐœ ๊ณผ๋ชฉ**:\n")
                lines.append("| ๊ณผ๋ชฉ | Accuracy |")
                lines.append("|------|----------|")
                for t, a in accs_sorted[:10]:
                    subject = t.replace("global_mmlu_ko_", "")
                    lines.append(f"| {subject} | {_fmt_pct(a)} |")
                lines.append("")
                lines.append("**ํ•˜์œ„ 10๊ฐœ ๊ณผ๋ชฉ**:\n")
                lines.append("| ๊ณผ๋ชฉ | Accuracy |")
                lines.append("|------|----------|")
                for t, a in accs_sorted[-10:]:
                    subject = t.replace("global_mmlu_ko_", "")
                    lines.append(f"| {subject} | {_fmt_pct(a)} |")
                lines.append("")
        elif "global_mmlu_ko" in zero_shot:
            a = _get_acc(zero_shot["global_mmlu_ko"])
            lines.append(f"์ „์ฒด ์ •ํ™•๋„: {_fmt_pct(a)}\n")
    # --- English Benchmarks ---
    lines.append("## ์˜์–ด ๋ฒค์น˜๋งˆํฌ\n")
    en_tasks = ["hellaswag", "arc_easy", "arc_challenge", "winogrande", "piqa"]
    en_found = {t: zero_shot[t] for t in en_tasks if t in zero_shot}
    if en_found:
        lines.append("### ์ฃผ์š” ๋ฒค์น˜๋งˆํฌ (0-shot)\n")
        lines.append("| ํƒœ์Šคํฌ | Accuracy | Acc (norm) |")
        lines.append("|--------|----------|-----------|")
        for t in en_tasks:
            if t in en_found:
                m = en_found[t]
                acc = _fmt_pct(_get_acc(m))
                acc_norm = _fmt_pct(_get_acc(m, prefer_norm=True) if "acc_norm,none" in m else None)
                lines.append(f"| {t} | {acc} | {acc_norm} |")
        lines.append("")
    # MMLU-EN
    mmlu_en_tasks = {t: zero_shot[t] for t in zero_shot
                     if (t.startswith("mmlu_") or t == "mmlu") and not t.startswith("mmlu_ko")}
    if mmlu_en_tasks:
        lines.append("### MMLU-EN (0-shot)\n")
        # BUGFIX: exclude ALL lm-eval group-level aggregates (not just "mmlu")
        # from the subtask average, so group rows don't double-count their
        # subtasks -- consistent with _generate_executive_summary.
        mmlu_en_groups = {"mmlu", "mmlu_humanities", "mmlu_social_sciences",
                          "mmlu_stem", "mmlu_other"}
        subtasks = {t: m for t, m in mmlu_en_tasks.items() if t not in mmlu_en_groups}
        if subtasks:
            lines.append(f"ํ‰๊ฐ€๋œ ๊ณผ๋ชฉ ์ˆ˜: **{len(subtasks)}**\n")
            accs = [(t, _get_acc(m)) for t, m in sorted(subtasks.items())
                    if _get_acc(m) is not None]
            if accs:
                avg_acc = sum(a for _, a in accs) / len(accs)
                lines.append(f"์ „์ฒด ํ‰๊ท : **{_fmt_pct(avg_acc)}**\n")
                accs_sorted = sorted(accs, key=lambda x: x[1], reverse=True)
                lines.append("**์ƒ์œ„ 10๊ฐœ ๊ณผ๋ชฉ**:\n")
                lines.append("| ๊ณผ๋ชฉ | Accuracy |")
                lines.append("|------|----------|")
                for t, a in accs_sorted[:10]:
                    subject = t.replace("mmlu_", "")
                    lines.append(f"| {subject} | {_fmt_pct(a)} |")
                lines.append("")
                lines.append("**ํ•˜์œ„ 10๊ฐœ ๊ณผ๋ชฉ**:\n")
                lines.append("| ๊ณผ๋ชฉ | Accuracy |")
                lines.append("|------|----------|")
                for t, a in accs_sorted[-10:]:
                    subject = t.replace("mmlu_", "")
                    lines.append(f"| {subject} | {_fmt_pct(a)} |")
                lines.append("")
        elif "mmlu" in mmlu_en_tasks:
            a = _get_acc(mmlu_en_tasks["mmlu"])
            lines.append(f"์ „์ฒด ์ •ํ™•๋„: {_fmt_pct(a)}\n")
    # --- 0-shot vs 5-shot Comparison ---
    if five_shot:
        lines.append("## 0-shot vs 5-shot ๋น„๊ต\n")
        # Only tasks evaluated in both regimes can be compared.
        common_tasks = sorted(set(zero_shot.keys()) & set(five_shot.keys()))
        if common_tasks:
            lines.append("| ํƒœ์Šคํฌ | 0-shot Acc | 5-shot Acc | ๋ณ€ํ™” |")
            lines.append("|--------|-----------|-----------|------|")
            for t in common_tasks:
                a0 = _get_acc(zero_shot[t])
                a5 = _get_acc(five_shot[t])
                if a0 is not None and a5 is not None:
                    diff = a5 - a0
                    sign = "+" if diff >= 0 else ""
                    lines.append(
                        f"| {t} | {_fmt_pct(a0)} | {_fmt_pct(a5)} | {sign}{diff*100:.2f}pp |"
                    )
                else:
                    lines.append(f"| {t} | {_fmt_pct(a0)} | {_fmt_pct(a5)} | โ€” |")
            lines.append("")
            # Summary of the few-shot deltas
            diffs = []
            for t in common_tasks:
                a0 = _get_acc(zero_shot[t])
                a5 = _get_acc(five_shot[t])
                if a0 is not None and a5 is not None:
                    diffs.append(a5 - a0)
            if diffs:
                avg_diff = sum(diffs) / len(diffs)
                improved = sum(1 for d in diffs if d > 0)
                degraded = sum(1 for d in diffs if d < 0)
                lines.append(
                    f"ํ‰๊ท  ๋ณ€ํ™”: {'+' if avg_diff >= 0 else ''}{avg_diff*100:.2f}pp | "
                    f"๊ฐœ์„ : {improved} | ํ•˜๋ฝ: {degraded} | ๋™์ผ: {len(diffs) - improved - degraded}\n"
                )
    # --- Repetition ---
    if repetition and repetition.get("grid_results"):
        lines.append("## Repetition ํŒŒ๋ผ๋ฏธํ„ฐ ๊ฒ€์ƒ‰\n")
        rep_data = repetition["grid_results"]
        rep_rows = []
        # grid_results can be a list of dicts or a dict of dicts
        items = rep_data.items() if isinstance(rep_data, dict) else enumerate(rep_data)
        for key, metrics in items:
            if isinstance(metrics, dict):
                rep_rows.append({
                    "config": metrics.get("params", str(key)),
                    "temp": metrics.get("temperature"),
                    "rep_pen": metrics.get("repetition_penalty"),
                    "3gram": metrics.get("avg_3gram_rep", metrics.get("3gram_repetition", float("inf"))),
                    "4gram": metrics.get("avg_4gram_rep", metrics.get("4gram_repetition")),
                    "eos_rate": metrics.get("eos_rate"),
                    "avg_tokens": metrics.get("avg_tokens"),
                })
        # Lowest 3-gram repetition is best; non-numeric values sort last.
        rep_rows.sort(key=lambda x: x["3gram"] if isinstance(x["3gram"], (int, float)) else float("inf"))
        lines.append("| ์„ค์ • | Temp | Rep Pen | 3-gram | 4-gram | EOS Rate | Avg Tokens |")
        lines.append("|------|------|---------|--------|--------|----------|-----------|")
        for i, r in enumerate(rep_rows):
            marker = " **โ† best**" if i == 0 else ""
            lines.append(
                f"| {r['config']} | {_fmt_f(r['temp'], 2)} | {_fmt_f(r['rep_pen'], 2)} | "
                f"{_fmt_f(r['3gram'])} | {_fmt_f(r['4gram'])} | "
                f"{_fmt_f(r['eos_rate'])} | {_fmt_f(r['avg_tokens'], 1)} |{marker}"
            )
        lines.append("")
    lines.append("---\n")
    lines.append("*์ด ๋ฆฌํฌํŠธ๋Š” ์ž๋™์œผ๋กœ ์ƒ์„ฑ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.*")
    return "\n".join(lines)
# =========================================================================
# Base vs SFT Comparison Report -- reference values
# =========================================================================
# Base model reference values, snapshotted from the 3b_reeval_20260305_1451
# evaluation run. NOTE(review): hard-coded; regenerate if the base checkpoint
# is ever re-evaluated. Each metric is keyed under both its bare name and
# its "_val" alias, mapping to the same measured value.
_BASE_PPL_REFERENCE = {
    "3b_val": 5.2263,
    "3b": 5.2263,
    "korean_c4_val": 5.7173,
    "korean_c4": 5.7173,
    "hplt_ko_val": 2.4028,
    "hplt_ko": 2.4028,
    "cc100_ko_val": 21.782,
    "cc100_ko": 21.782,
    "korean_val": 9.6505,
    "korean": 9.6505,
}
# Base model 0-shot benchmark accuracies (fractions, not percentages).
_BASE_BENCH_REFERENCE = {
    "kobest_boolq": 0.5028,
    "kobest_copa": 0.4930,
    "kobest_hellaswag": 0.2160,
    "kobest_sentineg": 0.4861,
    "kobest_wic": 0.4865,
    "haerae": 0.1971,
    "global_mmlu_ko": 0.2275,
    "hellaswag": 0.2600,
    "arc_easy": 0.2563,
    "arc_challenge": 0.2167,
    "winogrande": 0.5059,
    "piqa": 0.5250,
}
# Base model greedy-generation quality reference.
_BASE_GEN_REFERENCE = {
    "greedy_3gram_rep": 0.6099,
    "greedy_4gram_rep": 0.5702,
    "greedy_eos_rate": 0.0,
}
# Base model calibration reference.
_BASE_CALIB_REFERENCE = {
    "top1_accuracy": 0.6875,
    "top5_accuracy": 0.8164,
    "top10_accuracy": 0.8593,
    "mean_entropy": 1.5682,
}
# Base model token-NLL reference.
_BASE_NLL_REFERENCE = {
    "nll_mean": 1.5561,
    "high_loss_fraction_5": 0.1086,
}
# =========================================================================
# Threshold Justification
# =========================================================================
# PPL forgetting 15%: Kirkpatrick et al. (2017) continual-learning criterion of 10-20%
# KoBEST avg 55%: random baseline ~40%, Llama 3.2 1B ~52%, Qwen 2.5 3B ~58%
# MMLU-KO 30%: random 25%, Llama 3.2 3B ~35%
# Greedy 3-gram rep <5%: human Korean text at 256 tokens runs ~1-3%; base model is 61%
# EOS rate >90%: a chat model must terminate its responses; some long-form output tolerated
# Calibration top1 65%: base is 68.75%; a small drop from SFT is tolerated
# Distinct-2 >70%: Li et al. (2016), minimum diversity floor
# =========================================================================
_SFT_TARGETS = {
    # generation quality
    "greedy_3gram_rep_max": 0.05,
    "eos_rate_min": 0.90,
    "sampled_eos_min": 0.50,
    "distinct_2_min": 0.70,
    # knowledge retention
    "ppl_forgetting_max_pct": 15.0,
    # Korean benchmarks
    "kobest_avg_min": 0.55,
    "haerae_min": 0.25,
    "mmlu_ko_min": 0.30,
    # calibration
    "top1_accuracy_min": 0.65,
    # English retention
    "hellaswag_min": 0.25,
    "arc_easy_min": 0.25,
    "arc_challenge_min": 0.21,
    "winogrande_min": 0.49,
    "piqa_min": 0.51,
    "mmlu_en_avg_min": 0.25,
}
# Published/approximate scores of comparable open models, used for context
# tables in the comparison report.
_REFERENCE_MODELS = {
    "Llama 3.2 1B": {"kobest_avg": 0.52, "mmlu_ko": 0.28, "mmlu_en": 0.32},
    "Llama 3.2 3B": {"kobest_avg": 0.56, "mmlu_ko": 0.35, "mmlu_en": 0.55},
    "Qwen 2.5 3B": {"kobest_avg": 0.58, "mmlu_ko": 0.42, "mmlu_en": 0.58},
}
def _compute_orpo_score(sft_p1: dict, sft_zero: dict, base_p1: dict, base_zero: dict) -> dict:
    """Quantitative ORPO-necessity score (0-100 points).

    Scores the SFT model on 7 weighted dimensions (PPL forgetting 25,
    greedy repetition 20, EOS termination 10, KoBEST 20, calibration 10,
    diversity 10, English retention 5) and derives a decision:
    DEPLOY (>=80), ORPO (>=40 with forgetting under threshold), else
    SFT_RETRY.

    Args:
        sft_p1: Normalized SFT phase-1 results (see _normalize_phase1_results).
        sft_zero: Flat SFT 0-shot task metrics (see _normalize_phase2_results).
        base_p1: Normalized Base phase-1 results (used for forgetting deltas).
        base_zero: Base 0-shot metrics. NOTE(review): currently unused in this
            body -- presumably reserved for base-relative benchmark deltas.

    Returns:
        dict with keys: total_score, dimensions, decision, confidence,
        orpo_gain_estimate. Each dimensions[...] entry carries score, weight,
        current, threshold, status ("PASS"/"FAIL"/"N/A").
    """
    dimensions = {}
    missing = 0
    total_dims = 7
    # Dim 1: PPL forgetting (25 pts) -- scaled linearly: 0% forgetting earns
    # full points, the threshold (15%) or worse earns 0.
    # _get_max_forgetting is a helper defined elsewhere in this module.
    max_forgetting = _get_max_forgetting(sft_p1, base_p1)
    if max_forgetting is not None:
        threshold = _SFT_TARGETS["ppl_forgetting_max_pct"]
        score = 25 * max(0, 1 - max_forgetting / threshold)
        dimensions["ppl_forgetting"] = {
            "score": round(score, 1), "weight": 25,
            "current": round(max_forgetting, 1), "threshold": f"<{threshold}%",
            "status": "PASS" if max_forgetting < threshold else "FAIL",
        }
    else:
        missing += 1
        dimensions["ppl_forgetting"] = {"score": 0, "weight": 25, "current": "N/A", "threshold": "<15%", "status": "N/A"}
    # Dim 2: greedy repetition rate (20 pts) -- lower is better; 0 at or
    # beyond the 5% threshold. Helper defined elsewhere in this module.
    rep_rate = _get_greedy_3gram_rep(sft_p1)
    if rep_rate is not None:
        threshold = _SFT_TARGETS["greedy_3gram_rep_max"]
        score = 20 * max(0, 1 - rep_rate / threshold)
        dimensions["greedy_rep"] = {
            "score": round(score, 1), "weight": 20,
            "current": f"{rep_rate:.1%}", "threshold": f"<{threshold:.0%}",
            "status": "PASS" if rep_rate < threshold else "FAIL",
        }
    else:
        missing += 1
        dimensions["greedy_rep"] = {"score": 0, "weight": 20, "current": "N/A", "threshold": "<5%", "status": "N/A"}
    # Dim 3: EOS termination rate (10 pts) -- higher is better, capped at
    # full credit once the 90% threshold is reached.
    eos_rate = sft_p1.get("generation", {}).get("summary", {}).get("greedy_eos_rate")
    if eos_rate is not None:
        threshold = _SFT_TARGETS["eos_rate_min"]
        score = 10 * min(eos_rate / threshold, 1)
        dimensions["eos_rate"] = {
            "score": round(score, 1), "weight": 10,
            "current": f"{eos_rate:.0%}", "threshold": f">{threshold:.0%}",
            "status": "PASS" if eos_rate >= threshold else "FAIL",
        }
    else:
        missing += 1
        dimensions["eos_rate"] = {"score": 0, "weight": 10, "current": "N/A", "threshold": ">90%", "status": "N/A"}
    # Dim 4: KoBEST average (20 pts). Helper defined elsewhere in this module.
    kobest_avg = _get_kobest_avg(sft_zero)
    if kobest_avg is not None:
        threshold = _SFT_TARGETS["kobest_avg_min"]
        score = 20 * min(kobest_avg / threshold, 1)
        dimensions["kobest_avg"] = {
            "score": round(score, 1), "weight": 20,
            "current": f"{kobest_avg:.1%}", "threshold": f">{threshold:.0%}",
            "status": "PASS" if kobest_avg >= threshold else "FAIL",
        }
    else:
        missing += 1
        dimensions["kobest_avg"] = {"score": 0, "weight": 20, "current": "N/A", "threshold": ">55%", "status": "N/A"}
    # Dim 5: calibration top-1 accuracy (10 pts)
    top1 = sft_p1.get("calibration", {}).get("top1_accuracy")
    if top1 is not None:
        threshold = _SFT_TARGETS["top1_accuracy_min"]
        score = 10 * min(top1 / threshold, 1)
        dimensions["calibration"] = {
            "score": round(score, 1), "weight": 10,
            "current": f"{top1:.1%}", "threshold": f">={threshold:.0%}",
            "status": "PASS" if top1 >= threshold else "FAIL",
        }
    else:
        missing += 1
        dimensions["calibration"] = {"score": 0, "weight": 10, "current": "N/A", "threshold": ">=65%", "status": "N/A"}
    # Dim 6: diversity via distinct-2 (10 pts)
    distinct_2 = sft_p1.get("generation", {}).get("summary", {}).get("greedy_avg_distinct_2")
    if distinct_2 is not None:
        threshold = _SFT_TARGETS["distinct_2_min"]
        score = 10 * min(distinct_2 / threshold, 1)
        dimensions["diversity"] = {
            "score": round(score, 1), "weight": 10,
            "current": f"{distinct_2:.0%}", "threshold": f">{threshold:.0%}",
            "status": "PASS" if distinct_2 >= threshold else "FAIL",
        }
    else:
        missing += 1
        dimensions["diversity"] = {"score": 0, "weight": 10, "current": "N/A", "threshold": ">70%", "status": "N/A"}
    # Dim 7: English retention (5 pts) -- all-or-nothing: every evaluated
    # English task must clear its per-task floor.
    en_tasks = {
        "hellaswag": _SFT_TARGETS["hellaswag_min"],
        "arc_easy": _SFT_TARGETS["arc_easy_min"],
        "arc_challenge": _SFT_TARGETS["arc_challenge_min"],
        "winogrande": _SFT_TARGETS["winogrande_min"],
        "piqa": _SFT_TARGETS["piqa_min"],
    }
    en_all_pass = True
    en_count = 0
    for t, threshold in en_tasks.items():
        a = _get_acc(sft_zero.get(t, {})) if t in sft_zero else None
        if a is not None:
            en_count += 1
            if a < threshold:
                en_all_pass = False
    if en_count > 0:
        score = 5.0 if en_all_pass else 0.0
        dimensions["english"] = {
            "score": score, "weight": 5,
            "current": "์ „๋ถ€ ํ†ต๊ณผ" if en_all_pass else "์ผ๋ถ€ ๋ฏธ๋‹ฌ",
            "threshold": "โ€”", "status": "PASS" if en_all_pass else "FAIL",
        }
    else:
        missing += 1
        dimensions["english"] = {"score": 0, "weight": 5, "current": "N/A", "threshold": "โ€”", "status": "N/A"}
    # Missing dimensions score 0 and lower confidence proportionally.
    total_score = sum(d["score"] for d in dimensions.values())
    confidence = round(1.0 - (missing / total_dims), 2)
    if missing >= 2:
        logger.warning("ORPO score has %d/%d missing dimensions โ€” confidence %.0f%%", missing, total_dims, confidence * 100)
    # ORPO gain estimate: sum the weights of failing dimensions that
    # preference optimization can plausibly fix (repetition, EOS, partial
    # diversity) -- knowledge/calibration failures are not ORPO-fixable.
    orpo_improvable = 0.0
    if rep_rate is not None and rep_rate >= _SFT_TARGETS["greedy_3gram_rep_max"]:
        orpo_improvable += 20.0  # repetition
    if eos_rate is not None and eos_rate < _SFT_TARGETS["eos_rate_min"]:
        orpo_improvable += 10.0  # eos
    if distinct_2 is not None and distinct_2 < _SFT_TARGETS["distinct_2_min"]:
        orpo_improvable += 5.0  # partial diversity improvement
    # Decision: ORPO only makes sense when forgetting is under control;
    # otherwise the SFT itself needs to be redone.
    forgetting_ok = max_forgetting is not None and max_forgetting < _SFT_TARGETS["ppl_forgetting_max_pct"]
    if total_score >= 80:
        decision = "DEPLOY"
    elif total_score >= 40 and forgetting_ok:
        decision = "ORPO"
    else:
        decision = "SFT_RETRY"
    return {
        "total_score": round(total_score, 1),
        "dimensions": dimensions,
        "decision": decision,
        "confidence": confidence,
        "orpo_gain_estimate": round(orpo_improvable, 1),
    }
def generate_comparison_report(
    base_results_dir: Path,
    sft_phase1_results: dict,
    sft_phase2_results: dict,
    output_path: Path,
    sft_output_dir: Optional[Path] = None,
    total_elapsed_sec: float = 0.0,
) -> Path:
    """Generate a comprehensive Base vs SFT comparison report.

    Sections: executive summary (with the quantitative ORPO decision score),
    PPL comparison (knowledge preservation), generation quality, Korean and
    English benchmarks, calibration, and a final verdict with the
    recommended next step.

    Args:
        base_results_dir: Directory containing Base model's phase1/phase2_results.json
        sft_phase1_results: SFT Phase 1 results dict
        sft_phase2_results: SFT Phase 2 results dict
        output_path: Where to write the markdown report
        sft_output_dir: SFT eval outputs directory (for linking); when set,
            a copy of the report is also saved there
        total_elapsed_sec: Total pipeline elapsed time

    Returns:
        Path to the generated report
    """
    base_results_dir = Path(base_results_dir)
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Load Base results; missing files are tolerated — the report then falls
    # back to the hard-coded _BASE_*_REFERENCE tables.
    base_p1 = {}
    base_p2 = {}
    p1_file = base_results_dir / "phase1_results.json"
    p2_file = base_results_dir / "phase2_results.json"
    if p1_file.exists():
        with open(p1_file, encoding="utf-8") as f:
            base_p1 = json.load(f)
    if p2_file.exists():
        with open(p2_file, encoding="utf-8") as f:
            base_p2 = json.load(f)
    # Normalize GPU-labelled raw results into logical sections.
    sft_p1 = _normalize_phase1_results(sft_phase1_results)
    base_p1_norm = _normalize_phase1_results(base_p1)
    sft_zero, sft_five = _normalize_phase2_results(sft_phase2_results)
    base_zero, base_five = _normalize_phase2_results(base_p2)
    eval_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    lines = []
    # === Header ===
    lines.append("# FRANKENSTALLM 3B SFT ๋ชจ๋ธ ๋‹ค๋ฉด์  ์ข…ํ•ฉ ํ‰๊ฐ€ ๋ณด๊ณ ์„œ\n")
    lines.append(f"- **ํ‰๊ฐ€ ์ผ์‹œ**: {eval_datetime}")
    # NOTE(review): checkpoint/reference identifiers below are hard-coded
    # metadata; update them when a different checkpoint is evaluated.
    lines.append("- **SFT ์ฒดํฌํฌ์ธํŠธ**: checkpoint-best (val_loss=1.8851, step 25500)")
    lines.append("- **Base ์ฐธ์กฐ ๊ฒฐ๊ณผ**: 3b_reeval_20260305_1451")
    lines.append(f"- **์ด ์†Œ์š” ์‹œ๊ฐ„**: {_fmt_seconds(total_elapsed_sec)}")
    if sft_output_dir:
        lines.append(f"- **๊ฒฐ๊ณผ ๋””๋ ‰ํ† ๋ฆฌ**: {sft_output_dir}")
    lines.append("")
    # === 1. Executive Summary ===
    lines.append("## 1. Executive Summary\n")
    verdicts = _compute_verdicts(sft_p1, sft_zero, base_p1_norm, base_zero)
    lines.append("| ํ‰๊ฐ€ ์ฐจ์› | ๊ฒฐ๊ณผ | ์ƒ์„ธ |")
    lines.append("|----------|------|------|")
    for dim_name, verdict, detail in verdicts:
        icon = "PASS" if verdict else "FAIL"
        lines.append(f"| {dim_name} | **{icon}** | {detail} |")
    lines.append("")
    pass_count = sum(1 for _, v, _ in verdicts if v)
    total_dims = len(verdicts)
    lines.append(f"**์ข…ํ•ฉ**: {pass_count}/{total_dims} ์ฐจ์› ํ†ต๊ณผ\n")
    # ORPO verdict — quantitative scoring. These three metrics are reused in
    # the final verdict section (7).
    rep_rate = _get_greedy_3gram_rep(sft_p1)
    kobest_avg = _get_kobest_avg(sft_zero)
    max_forgetting = _get_max_forgetting(sft_p1, base_p1_norm)
    lines.append("### ORPO ํŒ์ • (์ •๋Ÿ‰ ์Šค์ฝ”์–ด)\n")
    # Computed once here and reused throughout the report (sections 1 and 7).
    orpo_result = _compute_orpo_score(sft_p1, sft_zero, base_p1_norm, base_zero)
    lines.append(f"**๊ฒฐ์ •**: {orpo_result['decision']} (ํ™•์‹ ๋„: {orpo_result['confidence']:.0%})\n")
    lines.append(f"**์ •๋Ÿ‰ ์Šค์ฝ”์–ด**: {orpo_result['total_score']}/100\n")
    lines.append("| ์ฐจ์› | ์ ์ˆ˜ | /๊ฐ€์ค‘์น˜ | ํ˜„์žฌ๊ฐ’ | ๊ธฐ์ค€ | ์ƒํƒœ |")
    lines.append("|------|------|--------|--------|------|------|")
    dim_names = {
        "ppl_forgetting": "PPL Forgetting",
        "greedy_rep": "Greedy ๋ฐ˜๋ณต๋ฅ ",
        "eos_rate": "EOS ์ข…๋ฃŒ์œจ",
        "kobest_avg": "KoBEST ํ‰๊ท ",
        "calibration": "Calibration",
        "diversity": "๋‹ค์–‘์„ฑ",
        "english": "์˜์–ด ์œ ์ง€",
    }
    for key, label in dim_names.items():
        d = orpo_result["dimensions"].get(key, {})
        lines.append(
            f"| {label} | {d.get('score', 0)} | /{d.get('weight', 0)} | "
            f"{d.get('current', 'N/A')} | {d.get('threshold', 'โ€”')} | {d.get('status', 'N/A')} |"
        )
    lines.append("")
    if orpo_result["orpo_gain_estimate"] > 0:
        lines.append(f"**ORPO ๊ธฐ๋Œ€ ์ด๋“**: +{orpo_result['orpo_gain_estimate']}์  "
                     f"(๋ฐ˜๋ณต๋ฅ /EOS/๋‹ค์–‘์„ฑ ๊ฐœ์„  ๊ธฐ๋Œ€, PPL/๋ฒค์น˜ ๋ณ€ํ™” ์—†์Œ)\n")
    # Reference model comparison
    lines.append("**์ฐธ์กฐ ๋ชจ๋ธ ๋น„๊ต**:\n")
    for model_name, ref in _REFERENCE_MODELS.items():
        lines.append(f"- {model_name}: KoBEST={ref['kobest_avg']:.0%}, MMLU-KO={ref['mmlu_ko']:.0%}")
    lines.append("")
    # Decision explanation (decision is one of DEPLOY / ORPO / SFT_RETRY)
    if orpo_result["decision"] == "DEPLOY":
        lines.append("**โ†’ Phase 4: GGUF + Ollama ๋ฐฐํฌ** (์Šค์ฝ”์–ด โ‰ฅ80, ๋ชจ๋“  ํ•ต์‹ฌ ์กฐ๊ฑด ์ถฉ์กฑ)\n")
    elif orpo_result["decision"] == "ORPO":
        lines.append("**โ†’ Phase 3: ORPO** (์Šค์ฝ”์–ด 40-79, ์ง€์‹ ๋ณด์กด ์–‘ํ˜ธ, ์ƒ์„ฑ ๊ฐœ์„  ํ•„์š”)\n")
    else:
        lines.append("**โ†’ SFT ์žฌ์‹œ๋„** (์Šค์ฝ”์–ด <40 ๋˜๋Š” ์‹ฌ๊ฐํ•œ forgetting)\n")
    # === 2. PPL Comparison ===
    lines.append("## 2. Perplexity ๋น„๊ต (์ง€์‹ ๋ณด์กด)\n")
    lines.append("| ๋ฐ์ดํ„ฐ์…‹ | Base PPL | SFT PPL | ๋ณ€ํ™” | Forgetting % | ํŒ์ • |")
    lines.append("|---------|---------|---------|------|-------------|------|")
    sft_ppl = sft_p1.get("perplexity", {})
    base_ppl = base_p1_norm.get("perplexity", {})
    # Merge all dataset names so one-sided results are still shown.
    all_ppl_names = sorted(set(list(sft_ppl.keys()) + list(base_ppl.keys())))
    forgetting_values = []
    for name in all_ppl_names:
        sft_val = sft_ppl.get(name, {}).get("ppl") if isinstance(sft_ppl.get(name), dict) else None
        base_val = base_ppl.get(name, {}).get("ppl") if isinstance(base_ppl.get(name), dict) else None
        # Try reference table if base results not available
        if base_val is None:
            base_val = _BASE_PPL_REFERENCE.get(name)
        if sft_val is not None and base_val is not None:
            forgetting = (sft_val - base_val) / base_val * 100
            forgetting_values.append(forgetting)
            verdict = "PASS" if forgetting < _SFT_TARGETS["ppl_forgetting_max_pct"] else "FAIL"
            lines.append(
                f"| {name} | {base_val:.4f} | {sft_val:.4f} | "
                f"{'+' if sft_val >= base_val else ''}{sft_val - base_val:.4f} | "
                f"{forgetting:+.1f}% | {verdict} |"
            )
        elif sft_val is not None:
            lines.append(f"| {name} | โ€” | {sft_val:.4f} | โ€” | โ€” | โ€” |")
        elif base_val is not None:
            lines.append(f"| {name} | {base_val:.4f} | โ€” | โ€” | โ€” | โ€” |")
    if forgetting_values:
        avg_forgetting = sum(forgetting_values) / len(forgetting_values)
        max_f = max(forgetting_values)
        lines.append("")
        lines.append(f"**ํ‰๊ท  Forgetting**: {avg_forgetting:+.1f}% | **์ตœ๋Œ€**: {max_f:+.1f}% | "
                     f"**ํŒ์ •**: {'PASS' if max_f < _SFT_TARGETS['ppl_forgetting_max_pct'] else 'FAIL'} (์ž„๊ณ„๊ฐ’ {_SFT_TARGETS['ppl_forgetting_max_pct']}%)")
    lines.append("")
    # === 3. Generation Quality ===
    lines.append("## 3. ์ƒ์„ฑ ํ’ˆ์งˆ ๋น„๊ต\n")
    sft_gen = sft_p1.get("generation", {})
    if not sft_gen:
        logger.warning("Generation results missing from SFT Phase 1")
    sft_summary = sft_gen.get("summary", {})
    lines.append("| ์ง€ํ‘œ | Base | SFT | ๋ชฉํ‘œ | ํŒ์ • |")
    lines.append("|------|------|-----|------|------|")
    greedy_3gram = sft_summary.get("greedy_avg_3gram_rep")
    greedy_4gram = sft_summary.get("greedy_avg_4gram_rep")
    eos_rate = sft_summary.get("greedy_eos_rate")
    rep_threshold = _SFT_TARGETS["greedy_3gram_rep_max"]
    eos_threshold = _SFT_TARGETS["eos_rate_min"]
    greedy_3gram_verdict = "PASS" if greedy_3gram is not None and greedy_3gram < rep_threshold else "FAIL"
    greedy_4gram_verdict = "PASS" if greedy_4gram is not None and greedy_4gram < 0.05 else "FAIL"
    eos_verdict = "PASS" if eos_rate is not None and eos_rate >= eos_threshold else "FAIL"
    lines.append(f"| Greedy 3-gram ๋ฐ˜๋ณต๋ฅ  | {_BASE_GEN_REFERENCE['greedy_3gram_rep']:.2%} | "
                 f"{_fmt_pct(greedy_3gram)} | < {rep_threshold:.0%} | {greedy_3gram_verdict} |")
    lines.append(f"| Greedy 4-gram ๋ฐ˜๋ณต๋ฅ  | {_BASE_GEN_REFERENCE['greedy_4gram_rep']:.2%} | "
                 f"{_fmt_pct(greedy_4gram)} | < 5% | {greedy_4gram_verdict} |")
    lines.append(f"| EOS ์ข…๋ฃŒ์œจ | {_BASE_GEN_REFERENCE['greedy_eos_rate']:.0%} | "
                 f"{_fmt_pct(eos_rate)} | > {eos_threshold:.0%} | {eos_verdict} |")
    sampled_3gram = sft_summary.get("sampled_avg_3gram_rep")
    sampled_eos = sft_summary.get("sampled_eos_rate")
    if sampled_3gram is not None:
        lines.append(f"| Sampled 3-gram ๋ฐ˜๋ณต๋ฅ  | โ€” | {sampled_3gram:.2%} | โ€” | โ€” |")
    if sampled_eos is not None:
        lines.append(f"| Sampled EOS ์ข…๋ฃŒ์œจ | โ€” | {sampled_eos:.2%} | โ€” | โ€” |")
    lines.append("")
    # Chat template status — presence of a summary is used as the proxy;
    # TODO confirm there is no explicit chat-template flag in the results.
    chat_status = "ํ™œ์„ฑํ™”" if sft_summary else "๋น„ํ™œ์„ฑํ™”"
    lines.append(f"**Chat Template**: {chat_status}\n")
    # Generation samples (greedy decoding only, first 5, text truncated)
    if sft_gen.get("samples"):
        lines.append("### ์ƒ์„ฑ ์ƒ˜ํ”Œ (Greedy, Chat Template)\n")
        greedy_samples = [s for s in sft_gen["samples"] if s.get("temperature") == 0.0]
        for i, s in enumerate(greedy_samples[:5], 1):
            prompt = s.get("prompt", "")
            text = s.get("text", "")[:400]
            hit_eos = s.get("hit_eos", False)
            rep3 = s.get("3gram_rep", 0)
            lines.append(f"**[{i}]** `{prompt}`")
            lines.append(f"> {text}")
            lines.append(f"> *EOS={hit_eos}, 3gram_rep={rep3:.2%}, tokens={s.get('generated_tokens', 0)}*\n")
    # Repetition grid (first 6 configurations)
    sft_rep = sft_p1.get("repetition", {})
    if sft_rep.get("grid_results"):
        lines.append("### Repetition ํŒŒ๋ผ๋ฏธํ„ฐ ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ\n")
        lines.append("| ์„ค์ • | 3-gram | EOS Rate | Avg Tokens |")
        lines.append("|------|--------|----------|-----------|")
        grid = sft_rep["grid_results"]
        items = grid if isinstance(grid, list) else list(grid.values())
        for r in items[:6]:
            if isinstance(r, dict):
                lines.append(
                    f"| {r.get('params', '?')} | "
                    f"{_fmt_f(r.get('avg_3gram_rep'))} | "
                    f"{_fmt_f(r.get('eos_rate'))} | "
                    f"{_fmt_f(r.get('avg_tokens'), 1)} |"
                )
        lines.append("")
    # === 4. Korean Benchmarks ===
    lines.append("## 4. ํ•œ๊ตญ์–ด ๋ฒค์น˜๋งˆํฌ\n")
    lines.append("### KoBEST (0-shot)\n")
    lines.append("| ํƒœ์Šคํฌ | Base | SFT | ๋ณ€ํ™” | ๋ชฉํ‘œ | ํŒ์ • |")
    lines.append("|--------|------|-----|------|------|------|")
    kobest_tasks = ["kobest_boolq", "kobest_copa", "kobest_hellaswag",
                    "kobest_sentineg", "kobest_wic"]
    kobest_targets = {"kobest_boolq": 0.60, "kobest_copa": 0.65,
                      "kobest_hellaswag": 0.30, "kobest_sentineg": 0.60,
                      "kobest_wic": 0.55}
    sft_kobest_accs = []
    base_kobest_accs = []
    for t in kobest_tasks:
        base_a = _get_acc(base_zero.get(t, {})) if t in base_zero else _BASE_BENCH_REFERENCE.get(t)
        sft_a = _get_acc(sft_zero.get(t, {})) if t in sft_zero else None
        target = kobest_targets.get(t, 0.50)
        if sft_a is not None:
            sft_kobest_accs.append(sft_a)
        if base_a is not None:
            base_kobest_accs.append(base_a)
        diff = ""
        verdict = "โ€”"
        if sft_a is not None and base_a is not None:
            d = (sft_a - base_a) * 100
            diff = f"{'+' if d >= 0 else ''}{d:.1f}pp"
            verdict = "PASS" if sft_a >= target else "FAIL"
        lines.append(f"| {t} | {_fmt_pct(base_a)} | {_fmt_pct(sft_a)} | {diff} | "
                     f"โ‰ฅ{target*100:.0f}% | {verdict} |")
    if sft_kobest_accs:
        sft_avg = sum(sft_kobest_accs) / len(sft_kobest_accs)
        base_avg = sum(base_kobest_accs) / len(base_kobest_accs) if base_kobest_accs else _BASE_BENCH_REFERENCE.get("kobest_avg", 0.4369)
        diff_avg = (sft_avg - base_avg) * 100
        lines.append(f"| **ํ‰๊ท ** | **{base_avg*100:.2f}%** | **{sft_avg*100:.2f}%** | "
                     f"**{'+' if diff_avg >= 0 else ''}{diff_avg:.1f}pp** | "
                     f"**โ‰ฅ{_SFT_TARGETS['kobest_avg_min']*100:.0f}%** | **{'PASS' if sft_avg >= _SFT_TARGETS['kobest_avg_min'] else 'FAIL'}** |")
    lines.append("")
    # HAE-RAE
    lines.append("### HAE-RAE (0-shot)\n")
    base_haerae = _get_acc(base_zero.get("haerae", {})) if "haerae" in base_zero else _BASE_BENCH_REFERENCE.get("haerae")
    sft_haerae = _get_acc(sft_zero.get("haerae", {})) if "haerae" in sft_zero else None
    if sft_haerae is not None:
        diff_h = (sft_haerae - (base_haerae or 0)) * 100 if base_haerae else 0
        lines.append(f"- Base: {_fmt_pct(base_haerae)} โ†’ SFT: {_fmt_pct(sft_haerae)} "
                     f"({'+' if diff_h >= 0 else ''}{diff_h:.1f}pp) | "
                     f"๋ชฉํ‘œ โ‰ฅ{_SFT_TARGETS['haerae_min']*100:.0f}% | {'PASS' if sft_haerae >= _SFT_TARGETS['haerae_min'] else 'FAIL'}")
    else:
        lines.append(f"- Base: {_fmt_pct(base_haerae)} โ†’ SFT: N/A")
    lines.append("")
    # MMLU-KO
    lines.append("### MMLU-KO (0-shot)\n")
    base_mmlu_ko = _get_acc(base_zero.get("global_mmlu_ko", {})) if "global_mmlu_ko" in base_zero else _BASE_BENCH_REFERENCE.get("global_mmlu_ko")
    sft_mmlu_ko = _get_acc(sft_zero.get("global_mmlu_ko", {})) if "global_mmlu_ko" in sft_zero else None
    if sft_mmlu_ko is not None:
        diff_mk = (sft_mmlu_ko - (base_mmlu_ko or 0)) * 100 if base_mmlu_ko else 0
        lines.append(f"- Base: {_fmt_pct(base_mmlu_ko)} โ†’ SFT: {_fmt_pct(sft_mmlu_ko)} "
                     f"({'+' if diff_mk >= 0 else ''}{diff_mk:.1f}pp) | "
                     f"๋ชฉํ‘œ โ‰ฅ{_SFT_TARGETS['mmlu_ko_min']*100:.0f}% | {'PASS' if sft_mmlu_ko >= _SFT_TARGETS['mmlu_ko_min'] else 'FAIL'}")
    else:
        lines.append(f"- Base: {_fmt_pct(base_mmlu_ko)} โ†’ SFT: N/A")
    lines.append("")
    # 5-shot comparison (only when 5-shot results exist)
    if sft_five:
        lines.append("### 5-shot ๋น„๊ต (ํ•œ๊ตญ์–ด)\n")
        lines.append("| ํƒœ์Šคํฌ | 0-shot | 5-shot | ๋ณ€ํ™” |")
        lines.append("|--------|--------|--------|------|")
        for t in kobest_tasks + ["haerae", "global_mmlu_ko"]:
            a0 = _get_acc(sft_zero.get(t, {})) if t in sft_zero else None
            a5 = _get_acc(sft_five.get(t, {})) if t in sft_five else None
            if a0 is not None and a5 is not None:
                d = (a5 - a0) * 100
                lines.append(f"| {t} | {a0*100:.2f}% | {a5*100:.2f}% | {'+' if d >= 0 else ''}{d:.1f}pp |")
        lines.append("")
    # === 5. English Benchmarks ===
    lines.append("## 5. ์˜์–ด ๋ฒค์น˜๋งˆํฌ (์œ ์ง€ ํ™•์ธ)\n")
    lines.append("| ํƒœ์Šคํฌ | Base | SFT | ๋ณ€ํ™” | ํ•˜ํ•œ | ํŒ์ • |")
    lines.append("|--------|------|-----|------|------|------|")
    en_tasks = {
        "hellaswag": _SFT_TARGETS["hellaswag_min"],
        "arc_easy": _SFT_TARGETS["arc_easy_min"],
        "arc_challenge": _SFT_TARGETS["arc_challenge_min"],
        "winogrande": _SFT_TARGETS["winogrande_min"],
        "piqa": _SFT_TARGETS["piqa_min"],
    }
    for t, threshold in en_tasks.items():
        # hellaswag/arc_challenge conventionally report acc_norm
        base_a = _get_acc(base_zero.get(t, {}), prefer_norm=(t in ["hellaswag", "arc_challenge"])) \
            if t in base_zero else _BASE_BENCH_REFERENCE.get(t)
        sft_a = _get_acc(sft_zero.get(t, {}), prefer_norm=(t in ["hellaswag", "arc_challenge"])) \
            if t in sft_zero else None
        diff = ""
        verdict = "โ€”"
        if sft_a is not None and base_a is not None:
            d = (sft_a - base_a) * 100
            diff = f"{'+' if d >= 0 else ''}{d:.1f}pp"
            verdict = "PASS" if sft_a >= threshold else "FAIL"
        lines.append(f"| {t} | {_fmt_pct(base_a)} | {_fmt_pct(sft_a)} | {diff} | "
                     f"โ‰ฅ{threshold*100:.0f}% | {verdict} |")
    # MMLU-EN: average per-subject scores; fall back to the group aggregates
    # when only those are present.
    _MMLU_EN_GROUPS = {"mmlu", "mmlu_humanities", "mmlu_social_sciences", "mmlu_stem", "mmlu_other"}
    sft_mmlu_en = []
    base_mmlu_en = []
    for t, m in sft_zero.items():
        if (t.startswith("mmlu_") or t == "mmlu") and t not in _MMLU_EN_GROUPS:
            a = _get_acc(m)
            if a is not None:
                sft_mmlu_en.append(a)
    if not sft_mmlu_en:
        for t in _MMLU_EN_GROUPS:
            if t in sft_zero:
                a = _get_acc(sft_zero[t])
                if a is not None:
                    sft_mmlu_en.append(a)
    for t, m in base_zero.items():
        if (t.startswith("mmlu_") or t == "mmlu") and t not in _MMLU_EN_GROUPS:
            a = _get_acc(m)
            if a is not None:
                base_mmlu_en.append(a)
    if not base_mmlu_en:
        for t in _MMLU_EN_GROUPS:
            if t in base_zero:
                a = _get_acc(base_zero[t])
                if a is not None:
                    base_mmlu_en.append(a)
    sft_mmlu_en_avg = sum(sft_mmlu_en) / len(sft_mmlu_en) if sft_mmlu_en else None
    # 0.2581 is the hard-coded Base MMLU-EN reference fallback
    base_mmlu_en_avg = sum(base_mmlu_en) / len(base_mmlu_en) if base_mmlu_en else 0.2581
    if sft_mmlu_en_avg is not None:
        d = (sft_mmlu_en_avg - base_mmlu_en_avg) * 100
        lines.append(f"| MMLU-EN ํ‰๊ท  | {base_mmlu_en_avg*100:.2f}% | {sft_mmlu_en_avg*100:.2f}% | "
                     f"{'+' if d >= 0 else ''}{d:.1f}pp | โ‰ฅ25% | "
                     f"{'PASS' if sft_mmlu_en_avg >= _SFT_TARGETS['mmlu_en_avg_min'] else 'FAIL'} |")
    lines.append("")
    # === 6. Calibration ===
    lines.append("## 6. Calibration ๋น„๊ต\n")
    sft_cal = sft_p1.get("calibration", {})
    lines.append("| ์ง€ํ‘œ | Base | SFT | ๋ชฉํ‘œ | ํŒ์ • |")
    lines.append("|------|------|-----|------|------|")
    # (result_key, table_label, threshold, higher_is_better)
    cal_checks = [
        ("top1_accuracy", "Top-1 Accuracy", _SFT_TARGETS["top1_accuracy_min"], True),
        ("top5_accuracy", "Top-5 Accuracy", 0.78, True),
        ("top10_accuracy", "Top-10 Accuracy", 0.82, True),
        ("mean_entropy", "Mean Entropy", 2.0, False),
    ]
    for key, label, threshold, is_higher_better in cal_checks:
        base_v = _BASE_CALIB_REFERENCE.get(key)
        sft_v = sft_cal.get(key)
        verdict = "โ€”"
        if sft_v is not None:
            if is_higher_better:
                verdict = "PASS" if sft_v >= threshold else "FAIL"
            else:
                verdict = "PASS" if sft_v <= threshold else "FAIL"
        lines.append(f"| {label} | {_fmt_f(base_v)} | {_fmt_f(sft_v)} | "
                     f"{'โ‰ฅ' if is_higher_better else '<'}{threshold} | {verdict} |")
    # Token NLL — supports both the nested and the flat result-key layouts.
    sft_nll = sft_p1.get("token_nll", {})
    nll_mean = sft_nll.get("nll_mean", sft_nll.get("mean"))
    base_nll_mean = _BASE_NLL_REFERENCE.get("nll_mean")
    if nll_mean is not None:
        lines.append(f"| Token NLL mean | {_fmt_f(base_nll_mean)} | {_fmt_f(nll_mean)} | "
                     f"< 2.0 | {'PASS' if nll_mean < 2.0 else 'FAIL'} |")
    hlf5 = sft_nll.get("high_loss_fractions", {}).get("5", sft_nll.get("high_loss_fraction_5"))
    base_hlf5 = _BASE_NLL_REFERENCE.get("high_loss_fraction_5")
    if hlf5 is not None:
        lines.append(f"| NLL > 5 ๋น„์œจ | {_fmt_f(base_hlf5)} | {_fmt_f(hlf5)} | "
                     f"< 0.15 | {'PASS' if hlf5 < 0.15 else 'FAIL'} |")
    lines.append("")
    # === 7. Final Verdict ===
    lines.append("## 7. ์ข…ํ•ฉ ํŒ์ • ๋ฐ ๋‹ค์Œ ๋‹จ๊ณ„\n")
    lines.append("### ํ•ต์‹ฌ ํŒ์ • ๊ธฐ์ค€\n")
    lines.append("| ์กฐ๊ฑด | ํ˜„์žฌ ๊ฐ’ | ๊ธฐ์ค€ | ์ถฉ์กฑ |")
    lines.append("|------|---------|------|------|")
    lines.append(f"| Greedy 3-gram ๋ฐ˜๋ณต๋ฅ  | {_fmt_pct(rep_rate)} | < {_SFT_TARGETS['greedy_3gram_rep_max']:.0%} | "
                 f"{'YES' if rep_rate is not None and rep_rate < _SFT_TARGETS['greedy_3gram_rep_max'] else 'NO'} |")
    lines.append(f"| KoBEST ํ‰๊ท  | {_fmt_pct(kobest_avg)} | > {_SFT_TARGETS['kobest_avg_min']*100:.0f}% | "
                 f"{'YES' if kobest_avg is not None and kobest_avg > _SFT_TARGETS['kobest_avg_min'] else 'NO'} |")
    lines.append(f"| ์ตœ๋Œ€ Forgetting | {f'{max_forgetting:.1f}%' if max_forgetting is not None else 'N/A'} | "
                 f"< {_SFT_TARGETS['ppl_forgetting_max_pct']}% | {'YES' if max_forgetting is not None and max_forgetting < _SFT_TARGETS['ppl_forgetting_max_pct'] else 'NO'} |")
    lines.append("")
    # Final recommendation — reuse the quantitative ORPO score computed in
    # section 1 (the inputs have not changed, so recomputing was redundant).
    lines.append("### ๊ถŒ๊ณ \n")
    orpo_score = orpo_result["total_score"]
    orpo_decision = orpo_result["decision"]
    all_core_pass = (
        rep_rate is not None and rep_rate < _SFT_TARGETS["greedy_3gram_rep_max"]
        and kobest_avg is not None and kobest_avg > _SFT_TARGETS["kobest_avg_min"]
        and max_forgetting is not None and max_forgetting < _SFT_TARGETS["ppl_forgetting_max_pct"]
    )
    # orpo_decision is one of DEPLOY / ORPO / SFT_RETRY; the former
    # "SKIP_ORPO" branch was unreachable and has been removed.
    if all_core_pass:
        lines.append("**๋ชจ๋“  ํ•ต์‹ฌ ์กฐ๊ฑด ์ถฉ์กฑ โ†’ Phase 4: GGUF ๋ณ€ํ™˜ + Ollama ๋ฐฐํฌ ์ง„ํ–‰**\n")
    elif orpo_decision == "ORPO":
        lines.append(f"**ORPO ํŒ์ • ์Šค์ฝ”์–ด {orpo_score:.1f}/100 โ†’ Phase 3: ORPO ํ•™์Šต ์ง„ํ–‰** (795K preference pairs ํ™œ์šฉ)\n")
        lines.append("ORPO ํ•™์Šต ์‹œ ์ฃผ์•ˆ์ :")
        lines.append("- Greedy ๋ฐ˜๋ณต๋ฅ  ๊ฐ์†Œ (ํ˜„์žฌ 72.97% โ†’ ๋ชฉํ‘œ <5%)")
        lines.append("- EOS ์ข…๋ฃŒ์œจ ๊ฐœ์„  (ํ˜„์žฌ 60% โ†’ ๋ชฉํ‘œ >90%)")
        lines.append("- ๋ฒค์น˜๋งˆํฌ ์ ์ˆ˜ ์œ ์ง€/ํ–ฅ์ƒ")
        lines.append("- ์ง€์‹ ๋ณด์กด ์œ ์ง€ (ํ˜„์žฌ forgetting 0.9%)")
    else:
        lines.append("**ํ•ต์‹ฌ ์กฐ๊ฑด ๋ฏธ๋‹ฌ โ†’ SFT ์žฌ์‹œ๋„**\n")
        lines.append("์žฌ์‹œ๋„ ์‹œ ๊ฒ€ํ†  ์‚ฌํ•ญ:")
        lines.append("- ํ•™์Šต๋ฅ  ์กฐ์ •")
        lines.append("- ๋ฐ์ดํ„ฐ ๊ตฌ์„ฑ ์žฌ๊ฒ€ํ† ")
        lines.append("- ์—ํญ ์ˆ˜ ์กฐ์ •")
    lines.append("")
    lines.append("---\n")
    lines.append("*์ด ๋ณด๊ณ ์„œ๋Š” `eval/sft_eval_pipeline.py`์— ์˜ํ•ด ์ž๋™ ์ƒ์„ฑ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.*")
    report_text = "\n".join(lines)
    output_path.write_text(report_text, encoding="utf-8")
    # Also save a copy next to the SFT eval outputs if a directory was given.
    if sft_output_dir:
        (Path(sft_output_dir) / "sft_comparison_report.md").write_text(report_text, encoding="utf-8")
    return output_path
def _compute_verdicts(sft_p1, sft_zero, base_p1, base_zero):
    """Evaluate the six report dimensions; return (name, passed, detail) tuples."""
    out: List[Tuple[str, bool, str]] = []

    def add(name: str, passed: bool, detail: str) -> None:
        out.append((name, passed, detail))

    # Dim 1: knowledge preservation — worst-case PPL regression vs Base.
    worst = _get_max_forgetting(sft_p1, base_p1)
    fgt_limit = _SFT_TARGETS["ppl_forgetting_max_pct"]
    if worst is None:
        add("์ฐจ์› 1: Perplexity (์ง€์‹ ๋ณด์กด)", False, "๋ฐ์ดํ„ฐ ์—†์Œ")
    else:
        add("์ฐจ์› 1: Perplexity (์ง€์‹ ๋ณด์กด)",
            worst < fgt_limit,
            f"์ตœ๋Œ€ forgetting {worst:.1f}% (์ž„๊ณ„๊ฐ’ {fgt_limit}%)")

    # Dim 2: generation quality — greedy 3-gram repetition and EOS rate.
    rep_rate = _get_greedy_3gram_rep(sft_p1)
    eos_rate = sft_p1.get("generation", {}).get("summary", {}).get("greedy_eos_rate")
    if rep_rate is None or eos_rate is None:
        add("์ฐจ์› 2: ์ƒ์„ฑ ํ’ˆ์งˆ", False, "๋ฐ์ดํ„ฐ ์—†์Œ")
    else:
        rep_max = _SFT_TARGETS["greedy_3gram_rep_max"]
        eos_min = _SFT_TARGETS["eos_rate_min"]
        add("์ฐจ์› 2: ์ƒ์„ฑ ํ’ˆ์งˆ",
            rep_rate < rep_max and eos_rate > eos_min,
            f"๋ฐ˜๋ณต๋ฅ  {rep_rate:.2%} (๋ชฉํ‘œ <{rep_max:.0%}), EOS {eos_rate:.0%} (๋ชฉํ‘œ >{eos_min:.0%})")

    # Dim 3: Korean benchmarks — KoBEST average above target.
    kobest_avg = _get_kobest_avg(sft_zero)
    if kobest_avg is None:
        add("์ฐจ์› 3: ํ•œ๊ตญ์–ด ๋ฒค์น˜๋งˆํฌ", False, "๋ฐ์ดํ„ฐ ์—†์Œ")
    else:
        ko_min = _SFT_TARGETS["kobest_avg_min"]
        add("์ฐจ์› 3: ํ•œ๊ตญ์–ด ๋ฒค์น˜๋งˆํฌ",
            kobest_avg > ko_min,
            f"KoBEST ํ‰๊ท  {kobest_avg*100:.2f}% (๋ชฉํ‘œ >{ko_min*100:.0f}%)")

    # Dim 4: English benchmarks — every available task must clear its floor.
    en_floors = {
        "hellaswag": _SFT_TARGETS["hellaswag_min"],
        "arc_easy": _SFT_TARGETS["arc_easy_min"],
        "arc_challenge": _SFT_TARGETS["arc_challenge_min"],
        "winogrande": _SFT_TARGETS["winogrande_min"],
        "piqa": _SFT_TARGETS["piqa_min"],
    }
    en_ok = True
    en_details: List[str] = []
    for task, floor in en_floors.items():
        acc = _get_acc(sft_zero.get(task, {})) if task in sft_zero else None
        if acc is None:
            continue
        if acc < floor:
            en_ok = False
        en_details.append(f"{task}={acc*100:.1f}%")
    if en_details:
        # Show at most three task scores; indicate truncation with an ellipsis.
        suffix = "..." if len(en_details) > 3 else ""
        add("์ฐจ์› 4: ์˜์–ด ๋ฒค์น˜๋งˆํฌ", en_ok, ", ".join(en_details[:3]) + suffix)
    else:
        add("์ฐจ์› 4: ์˜์–ด ๋ฒค์น˜๋งˆํฌ", False, "๋ฐ์ดํ„ฐ ์—†์Œ")

    # Dim 5: calibration — next-token Top-1 accuracy.
    top1 = sft_p1.get("calibration", {}).get("top1_accuracy")
    if top1 is None:
        add("์ฐจ์› 5: Calibration", False, "๋ฐ์ดํ„ฐ ์—†์Œ")
    else:
        top1_min = _SFT_TARGETS["top1_accuracy_min"]
        add("์ฐจ์› 5: Calibration",
            top1 >= top1_min,
            f"Top-1 {top1*100:.2f}% (๋ชฉํ‘œ โ‰ฅ{top1_min*100:.0f}%)")

    # Dim 6: chat capability — deliberately relaxed 50% EOS bar; generation
    # samples still require manual review.
    if eos_rate is None:
        add("์ฐจ์› 6: SFT Chat ๋Šฅ๋ ฅ", False, "๋ฐ์ดํ„ฐ ์—†์Œ")
    else:
        add("์ฐจ์› 6: SFT Chat ๋Šฅ๋ ฅ",
            eos_rate > 0.50,
            f"EOS ์ข…๋ฃŒ์œจ {eos_rate:.0%}, ์ƒ์„ฑ ์ƒ˜ํ”Œ ์ˆ˜๋™ ๊ฒ€ํ†  ํ•„์š”")
    return out
def _get_greedy_3gram_rep(p1: dict) -> Optional[float]:
gen = p1.get("generation", {})
return gen.get("summary", {}).get("greedy_avg_3gram_rep")
def _get_kobest_avg(zero_shot: dict) -> Optional[float]:
kobest_tasks = ["kobest_boolq", "kobest_copa", "kobest_hellaswag",
"kobest_sentineg", "kobest_wic"]
accs = []
for t in kobest_tasks:
if t in zero_shot:
a = _get_acc(zero_shot[t])
if a is not None:
accs.append(a)
return sum(accs) / len(accs) if accs else None
def _get_max_forgetting(sft_p1: dict, base_p1: dict) -> Optional[float]:
sft_ppl = sft_p1.get("perplexity", {})
base_ppl = base_p1.get("perplexity", {})
forgetting_values = []
for name in sft_ppl:
sft_val = sft_ppl[name].get("ppl") if isinstance(sft_ppl[name], dict) else None
base_val = base_ppl.get(name, {}).get("ppl") if isinstance(base_ppl.get(name), dict) else None
if base_val is None:
base_val = _BASE_PPL_REFERENCE.get(name)
if sft_val is not None and base_val is not None and base_val > 0:
forgetting_values.append((sft_val - base_val) / base_val * 100)
return max(forgetting_values) if forgetting_values else None
# =========================================================================
# ORPO-specific verdict helpers
# =========================================================================
def _compute_orpo_verdicts(
    orpo_p1: dict,
    orpo_zero: dict,
    sft_p1: dict,
    sft_zero: dict,
    training_curve: Optional[dict] = None,
) -> List[Tuple[str, bool, str]]:
    """Compute the 4 ORPO-specific evaluation dimensions.

    Returns list of (dimension_name, pass_bool, detail_string).
    """
    out: List[Tuple[str, bool, str]] = []
    # Final training-eval step, if a curve was recorded.
    last = None
    if training_curve and training_curve.get("eval_steps"):
        last = training_curve["eval_steps"][-1]

    # ORPO Dim 1: preference accuracy at the final eval step (> 0.65).
    pref = None if last is None else last.get("rewards_accuracies", last.get("preference_accuracy"))
    if pref is None:
        out.append(("ORPO-1: Preference Accuracy", False, "๋ฐ์ดํ„ฐ ์—†์Œ"))
    else:
        out.append((
            "ORPO-1: Preference Accuracy",
            pref > 0.65,
            f"์ตœ์ข… {pref:.2%} (๋ชฉํ‘œ > 65%)",
        ))

    # ORPO Dim 2: reward margin at the final eval step (> 0.1).
    margin = None if last is None else last.get("rewards_margins", last.get("reward_margins"))
    if margin is None:
        out.append(("ORPO-2: Reward Margins", False, "๋ฐ์ดํ„ฐ ์—†์Œ"))
    else:
        out.append((
            "ORPO-2: Reward Margins",
            margin > 0.1,
            f"์ตœ์ข… {margin:.4f} (๋ชฉํ‘œ > 0.1)",
        ))

    # ORPO Dim 3: with repetition_penalty=1.0 the greedy 3-gram repetition
    # should already be under 5% — quality must not hinge on the penalty.
    grid = orpo_p1.get("repetition", {}).get("grid_results")
    sens_ok = False
    sens_detail = "๋ฐ์ดํ„ฐ ์—†์Œ"
    if grid:
        rows = grid if isinstance(grid, list) else list(grid.values())
        for row in rows:
            if not isinstance(row, dict):
                continue
            penalty = row.get("repetition_penalty", row.get("rep_penalty"))
            if penalty is None or abs(float(penalty) - 1.0) >= 1e-6:
                continue
            rep3 = row.get("avg_3gram_rep", row.get("3gram_repetition"))
            if rep3 is not None:
                sens_ok = rep3 < 0.05
                sens_detail = f"rep_penalty=1.0 ์‹œ 3-gram rep={rep3:.2%} (๋ชฉํ‘œ < 5%)"
            break  # only the first rep_penalty=1.0 row is considered
    out.append(("ORPO-3: Parameter Sensitivity", sens_ok, sens_detail))

    # ORPO Dim 4: ORPO must beat SFT on BOTH repetition (lower) and EOS (higher).
    sft_rep = _get_greedy_3gram_rep(sft_p1)
    orpo_rep = _get_greedy_3gram_rep(orpo_p1)
    sft_eos = sft_p1.get("generation", {}).get("summary", {}).get("greedy_eos_rate")
    orpo_eos = orpo_p1.get("generation", {}).get("summary", {}).get("greedy_eos_rate")
    if any(v is None for v in (sft_rep, orpo_rep, sft_eos, orpo_eos)):
        out.append(("ORPO-4: SFTโ†’ORPO ๊ฐœ์„ ", False, "๋ฐ์ดํ„ฐ ์—†์Œ"))
    else:
        rep_down = orpo_rep < sft_rep
        eos_up = orpo_eos > sft_eos
        out.append((
            "ORPO-4: SFTโ†’ORPO ๊ฐœ์„ ",
            rep_down and eos_up,
            f"๋ฐ˜๋ณต๋ฅ  {sft_rep:.2%}โ†’{orpo_rep:.2%} ({'โ†“' if rep_down else 'โ†‘'}), "
            f"EOS {sft_eos:.0%}โ†’{orpo_eos:.0%} ({'โ†‘' if eos_up else 'โ†“'})",
        ))
    return out
# =========================================================================
# Base vs SFT vs ORPO 3-way Comparison Report
# =========================================================================
def generate_three_way_report(
    base_results_dir: Path,
    sft_results_dir: Path,
    orpo_phase1_results: dict,
    orpo_phase2_results: dict,
    output_path: Path,
    orpo_output_dir: Optional[Path] = None,
    training_curve: Optional[dict] = None,
    total_elapsed_sec: float = 0.0,
) -> Path:
    """Generate a comprehensive Base vs SFT vs ORPO 3-way comparison report.

    Assembles a Korean-language markdown report with 11 sections (executive
    summary, training curve, perplexity, generation quality, Korean/English
    benchmarks, calibration, ORPO-specific metrics, repetition grid search,
    generation samples, final DEPLOY/RETRY verdict) and writes it to
    ``output_path``. When ``orpo_output_dir`` is given, an identical copy
    named ``orpo_three_way_report.md`` is also written there.

    Args:
        base_results_dir: Directory containing Base model's phase1/phase2_results.json
        sft_results_dir: Directory containing SFT model's phase1/phase2_results.json
        orpo_phase1_results: ORPO Phase 1 results dict
        orpo_phase2_results: ORPO Phase 2 results dict
        output_path: Where to write the markdown report
        orpo_output_dir: ORPO eval outputs directory (for linking)
        training_curve: Dict with "eval_steps" list of per-step metrics
        total_elapsed_sec: Total pipeline elapsed time
    Returns:
        Path to the generated report (``output_path`` as a ``Path``)
    """
    base_results_dir = Path(base_results_dir)
    sft_results_dir = Path(sft_results_dir)
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # --- Load Base results ---
    # Missing result files are tolerated: the raw dicts stay empty and the
    # corresponding report cells fall back to reference values or "—".
    base_p1_raw, base_p2_raw = {}, {}
    p1_file = base_results_dir / "phase1_results.json"
    p2_file = base_results_dir / "phase2_results.json"
    if p1_file.exists():
        with open(p1_file, encoding="utf-8") as f:
            base_p1_raw = json.load(f)
    if p2_file.exists():
        with open(p2_file, encoding="utf-8") as f:
            base_p2_raw = json.load(f)
    # --- Load SFT results ---
    sft_p1_raw, sft_p2_raw = {}, {}
    sft_p1_file = sft_results_dir / "phase1_results.json"
    sft_p2_file = sft_results_dir / "phase2_results.json"
    if sft_p1_file.exists():
        with open(sft_p1_file, encoding="utf-8") as f:
            sft_p1_raw = json.load(f)
    if sft_p2_file.exists():
        with open(sft_p2_file, encoding="utf-8") as f:
            sft_p2_raw = json.load(f)
    # --- Normalize all ---
    # Phase 1 raw dicts are GPU-labelled; collapse them into logical sections.
    # Phase 2 normalization splits results into (0-shot, 5-shot) task maps.
    base_p1 = _normalize_phase1_results(base_p1_raw)
    base_zero, base_five = _normalize_phase2_results(base_p2_raw)
    sft_p1 = _normalize_phase1_results(sft_p1_raw)
    sft_zero, sft_five = _normalize_phase2_results(sft_p2_raw)
    orpo_p1 = _normalize_phase1_results(orpo_phase1_results)
    orpo_zero, orpo_five = _normalize_phase2_results(orpo_phase2_results)
    eval_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    lines: List[str] = []
    # =====================================================================
    # Header
    # =====================================================================
    lines.append("# FRANKENSTALLM 3B ORPO ๋ชจ๋ธ ์ข…ํ•ฉ ํ‰๊ฐ€ ๋ณด๊ณ ์„œ\n")
    lines.append(f"- **ํ‰๊ฐ€ ์ผ์‹œ**: {eval_datetime}")
    lines.append(f"- **๋น„๊ต ๋Œ€์ƒ**: Base โ†’ SFT โ†’ ORPO")
    lines.append(f"- **์ด ์†Œ์š” ์‹œ๊ฐ„**: {_fmt_seconds(total_elapsed_sec)}")
    if orpo_output_dir:
        lines.append(f"- **๊ฒฐ๊ณผ ๋””๋ ‰ํ† ๋ฆฌ**: {orpo_output_dir}")
    lines.append("")
    # =====================================================================
    # 1. Executive Summary
    # =====================================================================
    lines.append("## 1. Executive Summary\n")
    # 6 standard verdicts (reuse existing)
    std_verdicts = _compute_verdicts(orpo_p1, orpo_zero, base_p1, base_zero)
    # 4 ORPO-specific verdicts
    orpo_verdicts = _compute_orpo_verdicts(orpo_p1, orpo_zero, sft_p1, sft_zero, training_curve)
    all_verdicts = std_verdicts + orpo_verdicts
    lines.append("| # | ํ‰๊ฐ€ ์ฐจ์› | ๊ฒฐ๊ณผ | ์ƒ์„ธ |")
    lines.append("|---|----------|------|------|")
    # Each verdict is a (dimension_name, bool, detail) triple.
    for i, (dim_name, verdict, detail) in enumerate(all_verdicts, 1):
        icon = "PASS" if verdict else "FAIL"
        lines.append(f"| {i} | {dim_name} | **{icon}** | {detail} |")
    lines.append("")
    pass_count = sum(1 for _, v, _ in all_verdicts if v)
    total_dims = len(all_verdicts)
    lines.append(f"**์ข…ํ•ฉ**: {pass_count}/{total_dims} ์ฐจ์› ํ†ต๊ณผ\n")
    # Quantitative score (reuse _compute_orpo_score with ORPO results)
    orpo_score_result = _compute_orpo_score(orpo_p1, orpo_zero, base_p1, base_zero)
    lines.append(f"**์ •๋Ÿ‰ ์Šค์ฝ”์–ด**: {orpo_score_result['total_score']}/100\n")
    # Final decision
    # Deployment gate: all four thresholds must hold simultaneously
    # (rep < 5%, EOS > 90%, forgetting < 5%, KoBEST avg >= 43%).
    orpo_rep = _get_greedy_3gram_rep(orpo_p1)
    orpo_eos = orpo_p1.get("generation", {}).get("summary", {}).get("greedy_eos_rate")
    orpo_forgetting = _get_max_forgetting(orpo_p1, base_p1)
    orpo_kobest = _get_kobest_avg(orpo_zero)
    deploy_criteria_met = (
        orpo_rep is not None and orpo_rep < 0.05
        and orpo_eos is not None and orpo_eos > 0.90
        and orpo_forgetting is not None and orpo_forgetting < 5.0
        and orpo_kobest is not None and orpo_kobest >= 0.43
    )
    final_decision = "DEPLOY" if deploy_criteria_met else "RETRY"
    lines.append(f"**์ตœ์ข… ํŒ์ •**: **{final_decision}**\n")
    lines.append("")
    # =====================================================================
    # 2. Training curve analysis
    # =====================================================================
    lines.append("## 2. ํ•™์Šต ๊ณก์„  ๋ถ„์„\n")
    if training_curve and training_curve.get("eval_steps"):
        eval_steps = training_curve["eval_steps"]
        lines.append("### Training / Eval Loss\n")
        lines.append("| Step | Train Loss | Eval Loss | Pref Accuracy | Reward Margin |")
        lines.append("|------|-----------|-----------|---------------|---------------|")
        for step_data in eval_steps:
            step = step_data.get("step", "?")
            # Each metric has two possible key spellings (trainer-dependent).
            train_loss = _fmt_f(step_data.get("train_loss", step_data.get("loss")), 4)
            eval_loss = _fmt_f(step_data.get("eval_loss"), 4)
            pref_acc = _fmt_f(step_data.get("rewards_accuracies", step_data.get("preference_accuracy")), 4)
            reward_m = _fmt_f(step_data.get("rewards_margins", step_data.get("reward_margins")), 4)
            lines.append(f"| {step} | {train_loss} | {eval_loss} | {pref_acc} | {reward_m} |")
        lines.append("")
        # Summary stats: compare first vs last eval step only.
        first_step = eval_steps[0]
        last_step = eval_steps[-1]
        lines.append("### ํ•™์Šต ๊ณก์„  ์š”์•ฝ\n")
        first_loss = first_step.get("train_loss", first_step.get("loss"))
        last_loss = last_step.get("train_loss", last_step.get("loss"))
        if first_loss is not None and last_loss is not None:
            lines.append(f"- **Train Loss**: {first_loss:.4f} โ†’ {last_loss:.4f}")
        first_eval = first_step.get("eval_loss")
        last_eval = last_step.get("eval_loss")
        if first_eval is not None and last_eval is not None:
            lines.append(f"- **Eval Loss**: {first_eval:.4f} โ†’ {last_eval:.4f}")
        last_pref = last_step.get("rewards_accuracies", last_step.get("preference_accuracy"))
        if last_pref is not None:
            lines.append(f"- **์ตœ์ข… Preference Accuracy**: {last_pref:.2%}")
        last_margin = last_step.get("rewards_margins", last_step.get("reward_margins"))
        if last_margin is not None:
            lines.append(f"- **์ตœ์ข… Reward Margin**: {last_margin:.4f}")
        lines.append("")
    else:
        lines.append("ํ•™์Šต ๊ณก์„  ๋ฐ์ดํ„ฐ ์—†์Œ\n")
    # =====================================================================
    # 3. Perplexity comparison (knowledge retention)
    # =====================================================================
    lines.append("## 3. Perplexity ๋น„๊ต (์ง€์‹ ๋ณด์กด)\n")
    lines.append("| ๋ฐ์ดํ„ฐ์…‹ | Base PPL | SFT PPL | ORPO PPL | SFT Forgetting | ORPO Forgetting |")
    lines.append("|---------|---------|---------|---------|----------------|-----------------|")
    base_ppl = base_p1.get("perplexity", {})
    sft_ppl = sft_p1.get("perplexity", {})
    orpo_ppl = orpo_p1.get("perplexity", {})
    # Union of dataset names across all three models so no row is dropped.
    all_ppl_names = sorted(set(
        list(base_ppl.keys()) + list(sft_ppl.keys()) + list(orpo_ppl.keys())
    ))
    for name in all_ppl_names:
        base_val = base_ppl.get(name, {}).get("ppl") if isinstance(base_ppl.get(name), dict) else None
        if base_val is None:
            # Fall back to the module-level reference table when the Base
            # eval did not cover this dataset (presumably precomputed Base
            # scores — defined elsewhere in this file).
            base_val = _BASE_PPL_REFERENCE.get(name)
        sft_val = sft_ppl.get(name, {}).get("ppl") if isinstance(sft_ppl.get(name), dict) else None
        orpo_val = orpo_ppl.get(name, {}).get("ppl") if isinstance(orpo_ppl.get(name), dict) else None
        # Forgetting = relative PPL increase vs Base; guarded against
        # missing values and non-positive base PPL.
        sft_forg = f"{(sft_val - base_val) / base_val * 100:+.1f}%" if (sft_val is not None and base_val is not None and base_val > 0) else "โ€”"
        orpo_forg = f"{(orpo_val - base_val) / base_val * 100:+.1f}%" if (orpo_val is not None and base_val is not None and base_val > 0) else "โ€”"
        lines.append(
            f"| {name} | {_fmt_f(base_val)} | {_fmt_f(sft_val)} | {_fmt_f(orpo_val)} | "
            f"{sft_forg} | {orpo_forg} |"
        )
    lines.append("")
    # =====================================================================
    # 4. Generation quality comparison
    # =====================================================================
    lines.append("## 4. ์ƒ์„ฑ ํ’ˆ์งˆ ๋น„๊ต\n")
    base_gen_summary = base_p1.get("generation", {}).get("summary", {})
    sft_gen_summary = sft_p1.get("generation", {}).get("summary", {})
    orpo_gen_summary = orpo_p1.get("generation", {}).get("summary", {})
    # Base metrics fall back to reference constants when absent.
    base_3gram = base_gen_summary.get("greedy_avg_3gram_rep", _BASE_GEN_REFERENCE.get("greedy_3gram_rep"))
    sft_3gram = sft_gen_summary.get("greedy_avg_3gram_rep")
    orpo_3gram = orpo_gen_summary.get("greedy_avg_3gram_rep")
    base_4gram = base_gen_summary.get("greedy_avg_4gram_rep", _BASE_GEN_REFERENCE.get("greedy_4gram_rep"))
    sft_4gram = sft_gen_summary.get("greedy_avg_4gram_rep")
    orpo_4gram = orpo_gen_summary.get("greedy_avg_4gram_rep")
    base_eos = base_gen_summary.get("greedy_eos_rate", _BASE_GEN_REFERENCE.get("greedy_eos_rate"))
    sft_eos_val = sft_gen_summary.get("greedy_eos_rate")
    orpo_eos_val = orpo_gen_summary.get("greedy_eos_rate")
    lines.append("| ์ง€ํ‘œ | Base | SFT | ORPO | SFTโ†’ORPO ๋ณ€ํ™” |")
    lines.append("|------|------|-----|------|---------------|")
    # 3-gram rep (delta shown in percentage points)
    sft_orpo_3gram_diff = ""
    if sft_3gram is not None and orpo_3gram is not None:
        d = (orpo_3gram - sft_3gram) * 100
        sft_orpo_3gram_diff = f"{'+' if d >= 0 else ''}{d:.1f}pp"
    lines.append(f"| Greedy 3-gram ๋ฐ˜๋ณต๋ฅ  | {_fmt_pct(base_3gram)} | {_fmt_pct(sft_3gram)} | "
                 f"{_fmt_pct(orpo_3gram)} | {sft_orpo_3gram_diff} |")
    # 4-gram rep
    sft_orpo_4gram_diff = ""
    if sft_4gram is not None and orpo_4gram is not None:
        d = (orpo_4gram - sft_4gram) * 100
        sft_orpo_4gram_diff = f"{'+' if d >= 0 else ''}{d:.1f}pp"
    lines.append(f"| Greedy 4-gram ๋ฐ˜๋ณต๋ฅ  | {_fmt_pct(base_4gram)} | {_fmt_pct(sft_4gram)} | "
                 f"{_fmt_pct(orpo_4gram)} | {sft_orpo_4gram_diff} |")
    # EOS rate
    sft_orpo_eos_diff = ""
    if sft_eos_val is not None and orpo_eos_val is not None:
        d = (orpo_eos_val - sft_eos_val) * 100
        sft_orpo_eos_diff = f"{'+' if d >= 0 else ''}{d:.1f}pp"
    lines.append(f"| EOS ์ข…๋ฃŒ์œจ | {_fmt_pct(base_eos)} | {_fmt_pct(sft_eos_val)} | "
                 f"{_fmt_pct(orpo_eos_val)} | {sft_orpo_eos_diff} |")
    lines.append("")
    # =====================================================================
    # 5. Korean benchmarks
    # =====================================================================
    lines.append("## 5. ํ•œ๊ตญ์–ด ๋ฒค์น˜๋งˆํฌ\n")
    # KoBEST
    lines.append("### KoBEST (0-shot)\n")
    lines.append("| ํƒœ์Šคํฌ | Base | SFT | ORPO | Baseโ†’ORPO |")
    lines.append("|--------|------|-----|------|-----------|")
    kobest_tasks = ["kobest_boolq", "kobest_copa", "kobest_hellaswag",
                    "kobest_sentineg", "kobest_wic"]
    base_kobest_accs, sft_kobest_accs, orpo_kobest_accs = [], [], []
    for t in kobest_tasks:
        # Base falls back to reference benchmark scores; SFT/ORPO do not.
        base_a = _get_acc(base_zero.get(t, {})) if t in base_zero else _BASE_BENCH_REFERENCE.get(t)
        sft_a = _get_acc(sft_zero.get(t, {})) if t in sft_zero else None
        orpo_a = _get_acc(orpo_zero.get(t, {})) if t in orpo_zero else None
        if base_a is not None:
            base_kobest_accs.append(base_a)
        if sft_a is not None:
            sft_kobest_accs.append(sft_a)
        if orpo_a is not None:
            orpo_kobest_accs.append(orpo_a)
        diff = ""
        if orpo_a is not None and base_a is not None:
            d = (orpo_a - base_a) * 100
            diff = f"{'+' if d >= 0 else ''}{d:.1f}pp"
        lines.append(f"| {t} | {_fmt_pct(base_a)} | {_fmt_pct(sft_a)} | {_fmt_pct(orpo_a)} | {diff} |")
    # Averages (over whatever tasks each model actually has scores for)
    base_kavg = sum(base_kobest_accs) / len(base_kobest_accs) if base_kobest_accs else None
    sft_kavg = sum(sft_kobest_accs) / len(sft_kobest_accs) if sft_kobest_accs else None
    orpo_kavg = sum(orpo_kobest_accs) / len(orpo_kobest_accs) if orpo_kobest_accs else None
    avg_diff = ""
    if orpo_kavg is not None and base_kavg is not None:
        d = (orpo_kavg - base_kavg) * 100
        avg_diff = f"{'+' if d >= 0 else ''}{d:.1f}pp"
    lines.append(f"| **ํ‰๊ท ** | **{_fmt_pct(base_kavg)}** | **{_fmt_pct(sft_kavg)}** | "
                 f"**{_fmt_pct(orpo_kavg)}** | **{avg_diff}** |")
    lines.append("")
    # HAE-RAE
    lines.append("### HAE-RAE (0-shot)\n")
    base_haerae = _get_acc(base_zero.get("haerae", {})) if "haerae" in base_zero else _BASE_BENCH_REFERENCE.get("haerae")
    sft_haerae = _get_acc(sft_zero.get("haerae", {})) if "haerae" in sft_zero else None
    orpo_haerae = _get_acc(orpo_zero.get("haerae", {})) if "haerae" in orpo_zero else None
    lines.append(f"- Base: {_fmt_pct(base_haerae)} โ†’ SFT: {_fmt_pct(sft_haerae)} โ†’ ORPO: {_fmt_pct(orpo_haerae)}")
    lines.append("")
    # MMLU-KO
    lines.append("### MMLU-KO (0-shot)\n")
    base_mmlu_ko = _get_acc(base_zero.get("global_mmlu_ko", {})) if "global_mmlu_ko" in base_zero else _BASE_BENCH_REFERENCE.get("global_mmlu_ko")
    sft_mmlu_ko = _get_acc(sft_zero.get("global_mmlu_ko", {})) if "global_mmlu_ko" in sft_zero else None
    orpo_mmlu_ko = _get_acc(orpo_zero.get("global_mmlu_ko", {})) if "global_mmlu_ko" in orpo_zero else None
    lines.append(f"- Base: {_fmt_pct(base_mmlu_ko)} โ†’ SFT: {_fmt_pct(sft_mmlu_ko)} โ†’ ORPO: {_fmt_pct(orpo_mmlu_ko)}")
    lines.append("")
    # =====================================================================
    # 6. English benchmarks
    # =====================================================================
    lines.append("## 6. ์˜์–ด ๋ฒค์น˜๋งˆํฌ\n")
    lines.append("| ํƒœ์Šคํฌ | Base | SFT | ORPO | Baseโ†’ORPO |")
    lines.append("|--------|------|-----|------|-----------|")
    en_tasks_list = ["hellaswag", "arc_easy", "arc_challenge", "winogrande", "piqa"]
    for t in en_tasks_list:
        # hellaswag/arc_challenge conventionally report acc_norm.
        prefer_norm = t in ["hellaswag", "arc_challenge"]
        base_a = _get_acc(base_zero.get(t, {}), prefer_norm=prefer_norm) if t in base_zero else _BASE_BENCH_REFERENCE.get(t)
        sft_a = _get_acc(sft_zero.get(t, {}), prefer_norm=prefer_norm) if t in sft_zero else None
        orpo_a = _get_acc(orpo_zero.get(t, {}), prefer_norm=prefer_norm) if t in orpo_zero else None
        diff = ""
        if orpo_a is not None and base_a is not None:
            d = (orpo_a - base_a) * 100
            diff = f"{'+' if d >= 0 else ''}{d:.1f}pp"
        lines.append(f"| {t} | {_fmt_pct(base_a)} | {_fmt_pct(sft_a)} | {_fmt_pct(orpo_a)} | {diff} |")
    # MMLU-EN averages
    _MMLU_EN_GROUPS = {"mmlu", "mmlu_humanities", "mmlu_social_sciences", "mmlu_stem", "mmlu_other"}
    def _mmlu_en_avg(zero: dict) -> Optional[float]:
        """Average MMLU-EN accuracy: per-subject tasks first, falling back
        to the aggregate group scores when no subject tasks are present."""
        accs = []
        for t, m in zero.items():
            if (t.startswith("mmlu_") or t == "mmlu") and t not in _MMLU_EN_GROUPS:
                a = _get_acc(m)
                if a is not None:
                    accs.append(a)
        if not accs:
            for t in _MMLU_EN_GROUPS:
                if t in zero:
                    a = _get_acc(zero[t])
                    if a is not None:
                        accs.append(a)
        return sum(accs) / len(accs) if accs else None
    base_mmlu_en = _mmlu_en_avg(base_zero)
    sft_mmlu_en = _mmlu_en_avg(sft_zero)
    orpo_mmlu_en = _mmlu_en_avg(orpo_zero)
    mmlu_en_diff = ""
    if orpo_mmlu_en is not None and base_mmlu_en is not None:
        d = (orpo_mmlu_en - base_mmlu_en) * 100
        mmlu_en_diff = f"{'+' if d >= 0 else ''}{d:.1f}pp"
    lines.append(f"| MMLU-EN ํ‰๊ท  | {_fmt_pct(base_mmlu_en)} | {_fmt_pct(sft_mmlu_en)} | "
                 f"{_fmt_pct(orpo_mmlu_en)} | {mmlu_en_diff} |")
    lines.append("")
    # =====================================================================
    # 7. Calibration comparison
    # =====================================================================
    lines.append("## 7. Calibration ๋น„๊ต\n")
    lines.append("| ์ง€ํ‘œ | Base | SFT | ORPO |")
    lines.append("|------|------|-----|------|")
    base_cal = base_p1.get("calibration", {})
    sft_cal = sft_p1.get("calibration", {})
    orpo_cal = orpo_p1.get("calibration", {})
    cal_metrics = [
        ("top1_accuracy", "Top-1 Accuracy"),
        ("top5_accuracy", "Top-5 Accuracy"),
        ("top10_accuracy", "Top-10 Accuracy"),
    ]
    for key, label in cal_metrics:
        # Base falls back to reference calibration values when missing.
        base_v = base_cal.get(key, _BASE_CALIB_REFERENCE.get(key))
        sft_v = sft_cal.get(key)
        orpo_v = orpo_cal.get(key)
        lines.append(f"| {label} | {_fmt_f(base_v)} | {_fmt_f(sft_v)} | {_fmt_f(orpo_v)} |")
    lines.append("")
    # =====================================================================
    # 8. ORPO-specific metrics
    # =====================================================================
    lines.append("## 8. ORPO ๊ณ ์œ  ์ง€ํ‘œ\n")
    # Final preference accuracy & reward margins (last eval step only)
    if training_curve and training_curve.get("eval_steps"):
        last_step = training_curve["eval_steps"][-1]
        final_pref = last_step.get("rewards_accuracies", last_step.get("preference_accuracy"))
        final_margin = last_step.get("rewards_margins", last_step.get("reward_margins"))
        if final_pref is not None:
            lines.append(f"- **์ตœ์ข… Preference Accuracy**: {final_pref:.2%}")
        if final_margin is not None:
            lines.append(f"- **์ตœ์ข… Reward Margins**: {final_margin:.4f}")
    else:
        lines.append("- Preference Accuracy / Reward Margins: ๋ฐ์ดํ„ฐ ์—†์Œ")
    # Parameter sensitivity: find the grid entry with repetition_penalty == 1.0
    # and report its 3-gram repetition against the 5% threshold.
    rep_grid = orpo_p1.get("repetition", {}).get("grid_results")
    if rep_grid:
        items = rep_grid if isinstance(rep_grid, list) else list(rep_grid.values())
        for r in items:
            if isinstance(r, dict):
                rp = r.get("repetition_penalty", r.get("rep_penalty"))
                # Float-tolerant equality check for rp == 1.0.
                if rp is not None and abs(float(rp) - 1.0) < 1e-6:
                    rep_val = r.get("avg_3gram_rep", r.get("3gram_repetition"))
                    if rep_val is not None:
                        verdict = "PASS" if rep_val < 0.05 else "FAIL"
                        lines.append(f"- **Parameter Sensitivity**: rep_penalty=1.0 โ†’ 3-gram rep={rep_val:.2%} "
                                     f"(๋ชฉํ‘œ < 5%) โ†’ {verdict}")
                    break
    lines.append("")
    # =====================================================================
    # 9. Repetition grid search
    # =====================================================================
    lines.append("## 9. ๋ฐ˜๋ณต๋ฅ  ๊ทธ๋ฆฌ๋“œ ์„œ์น˜\n")
    if rep_grid:
        items = rep_grid if isinstance(rep_grid, list) else list(rep_grid.values())
        rep_rows = []
        for r in items:
            if isinstance(r, dict):
                rep_rows.append({
                    "config": r.get("params", "?"),
                    "temp": r.get("temperature"),
                    "rep_pen": r.get("repetition_penalty"),
                    # inf default pushes entries missing 3-gram data to the bottom
                    "3gram": r.get("avg_3gram_rep", r.get("3gram_repetition", float("inf"))),
                    "4gram": r.get("avg_4gram_rep", r.get("4gram_repetition")),
                    "eos_rate": r.get("eos_rate"),
                    "avg_tokens": r.get("avg_tokens"),
                })
        # Sort ascending by 3-gram repetition; first row is the best config.
        rep_rows.sort(key=lambda x: x["3gram"] if isinstance(x["3gram"], (int, float)) else float("inf"))
        lines.append("| ์„ค์ • | Temp | Rep Pen | 3-gram | 4-gram | EOS Rate | Avg Tokens |")
        lines.append("|------|------|---------|--------|--------|----------|-----------|")
        for i, r in enumerate(rep_rows):
            marker = " **โ† best**" if i == 0 else ""
            lines.append(
                f"| {r['config']} | {_fmt_f(r['temp'], 2)} | {_fmt_f(r['rep_pen'], 2)} | "
                f"{_fmt_f(r['3gram'])} | {_fmt_f(r['4gram'])} | "
                f"{_fmt_f(r['eos_rate'])} | {_fmt_f(r['avg_tokens'], 1)} |{marker}"
            )
        lines.append("")
    else:
        lines.append("๋ฐ˜๋ณต๋ฅ  ๊ทธ๋ฆฌ๋“œ ์„œ์น˜ ๋ฐ์ดํ„ฐ ์—†์Œ\n")
    # =====================================================================
    # 10. Generation samples
    # =====================================================================
    lines.append("## 10. ์ƒ์„ฑ ์ƒ˜ํ”Œ\n")
    orpo_gen = orpo_p1.get("generation", {})
    orpo_samples = orpo_gen.get("samples", [])
    # Prefer greedy (temperature == 0.0) samples; fall back to everything.
    greedy_samples = [s for s in orpo_samples if isinstance(s, dict) and s.get("temperature", 1.0) == 0.0]
    if not greedy_samples:
        greedy_samples = orpo_samples  # fallback: use all samples
    if greedy_samples:
        lines.append("### ORPO Greedy ์ƒ์„ฑ ์ƒ˜ํ”Œ\n")
        # Show at most 15 samples, each truncated to 500 characters.
        for i, s in enumerate(greedy_samples[:15], 1):
            if isinstance(s, dict):
                prompt = s.get("prompt", "")
                text = s.get("text", s.get("generated_text", ""))
                if len(text) > 500:
                    text = text[:500] + "..."
                hit_eos = s.get("hit_eos", "?")
                rep3 = s.get("3gram_rep", s.get("avg_3gram_rep"))
                tokens = s.get("generated_tokens", s.get("num_tokens", "?"))
                lines.append(f"**[{i}]** `{prompt}`")
                lines.append(f"> {text}")
                meta_parts = [f"EOS={hit_eos}"]
                if rep3 is not None:
                    meta_parts.append(f"3gram_rep={rep3:.2%}")
                meta_parts.append(f"tokens={tokens}")
                lines.append(f"> *{', '.join(meta_parts)}*\n")
    else:
        lines.append("์ƒ์„ฑ ์ƒ˜ํ”Œ ๋ฐ์ดํ„ฐ ์—†์Œ\n")
    # =====================================================================
    # 11. Final verdict
    # =====================================================================
    lines.append("## 11. ์ตœ์ข… ํŒ์ •\n")
    lines.append("### ๋ฐฐํฌ ๊ธฐ์ค€ ์ถฉ์กฑ ์—ฌ๋ถ€\n")
    lines.append("| ์กฐ๊ฑด | ๊ธฐ์ค€ | ํ˜„์žฌ ๊ฐ’ | ์ถฉ์กฑ |")
    lines.append("|------|------|---------|------|")
    # Same four thresholds used for deploy_criteria_met above, rendered per-row.
    criteria = [
        ("Greedy 3-gram ๋ฐ˜๋ณต๋ฅ ", "< 5%", _fmt_pct(orpo_rep),
         "YES" if orpo_rep is not None and orpo_rep < 0.05 else "NO"),
        ("EOS ์ข…๋ฃŒ์œจ", "> 90%", _fmt_pct(orpo_eos),
         "YES" if orpo_eos is not None and orpo_eos > 0.90 else "NO"),
        ("PPL Forgetting", "< 5%", f"{orpo_forgetting:.1f}%" if orpo_forgetting is not None else "N/A",
         "YES" if orpo_forgetting is not None and orpo_forgetting < 5.0 else "NO"),
        ("KoBEST ํ‰๊ท ", ">= 43%", _fmt_pct(orpo_kobest),
         "YES" if orpo_kobest is not None and orpo_kobest >= 0.43 else "NO"),
    ]
    for cond, threshold, current, met in criteria:
        lines.append(f"| {cond} | {threshold} | {current} | {met} |")
    lines.append("")
    if deploy_criteria_met:
        lines.append("**โ†’ ๋ชจ๋“  ๋ฐฐํฌ ๊ธฐ์ค€ ์ถฉ์กฑ: DEPLOY (Phase 4: GGUF ๋ณ€ํ™˜ + Ollama ๋ฐฐํฌ ์ง„ํ–‰)**\n")
    else:
        lines.append("**โ†’ ๋ฐฐํฌ ๊ธฐ์ค€ ๋ฏธ๋‹ฌ: RETRY (ORPO ์žฌํ•™์Šต ๋˜๋Š” ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ์กฐ์ • ํ•„์š”)**\n")
    lines.append("---\n")
    lines.append("*์ด ๋ณด๊ณ ์„œ๋Š” `eval/report_generator.py::generate_three_way_report()`์— ์˜ํ•ด ์ž๋™ ์ƒ์„ฑ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.*")
    report_text = "\n".join(lines)
    output_path.write_text(report_text, encoding="utf-8")
    # Also save to orpo_output_dir if provided
    if orpo_output_dir:
        orpo_output_dir = Path(orpo_output_dir)
        orpo_output_dir.mkdir(parents=True, exist_ok=True)
        (orpo_output_dir / "orpo_three_way_report.md").write_text(report_text, encoding="utf-8")
    return output_path
if __name__ == "__main__":
    # This module is a library; direct invocation only prints usage guidance.
    usage_message = "report_generator.py โ€” use via full_eval_pipeline.py or sft_eval_pipeline.py"
    print(usage_message)