"""
Markdown report generator for FRANKENSTALLM 3B evaluation pipeline.

Generates comprehensive evaluation reports with sections for:
- Perplexity metrics across datasets
- Calibration statistics
- Token NLL distribution
- Generation quality samples
- Repetition parameter search results
- Standard benchmark results (lm-eval) — Korean + English
- 0-shot vs 5-shot comparison
- Comparison with reference models
"""

from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Any, Tuple
import json
import logging

logger = logging.getLogger(__name__)


def _fmt_seconds(seconds: float) -> str:
    """Format seconds into a human-readable duration string."""
    m, s = divmod(int(seconds), 60)
    h, m = divmod(m, 60)
    if h:
        return f"{h}h {m}m {s}s"
    if m:
        return f"{m}m {s}s"
    return f"{s}s"


# =========================================================================
# Normalization helpers — map GPU-label keys to logical sections
# =========================================================================

def _normalize_phase1_results(raw: dict) -> dict:
    """Convert GPU-labelled phase1_results into logical sections.

    Returns dict with keys: perplexity, calibration, token_nll, generation, repetition.
    """
    normalized: Dict[str, Any] = {
        "perplexity": {},
        "calibration": {},
        "token_nll": {},
        "generation": {},
        "repetition": {},
    }

    for label, data in raw.items():
        if not isinstance(data, (dict, list)):
            continue

        if "PPL" in label:
            # PPL entries: single dict or list of dicts
            if isinstance(data, dict) and "ppl" in data:
                name = data.get("name", label)
                normalized["perplexity"][name] = data
            elif isinstance(data, list):
                for item in data:
                    if isinstance(item, dict) and "ppl" in item:
                        name = item.get("name", f"unknown_{len(normalized['perplexity'])}")
                        normalized["perplexity"][name] = item
            elif isinstance(data, dict) and "error" in data:
                # Task failed — skip
                pass
        elif "Calibration" in label:
            if isinstance(data, dict):
                if "calibration" in data:
                    normalized["calibration"] = data["calibration"]
                if "token_nll" in data:
                    normalized["token_nll"] = data["token_nll"]
        elif "Generation" in label:
            if isinstance(data, dict):
                normalized["generation"] = data
        elif "Repetition" in label:
            if isinstance(data, dict):
                normalized["repetition"] = data

    return normalized


def _normalize_phase2_results(raw: dict) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """Convert GPU-labelled phase2_results into flat task dicts for 0-shot and 5-shot.

    Returns (zero_shot_metrics, five_shot_metrics) where each is:
      {"kobest_boolq": {"acc,none": 0.50, ...}, "haerae": {...}, ...}
    """
    zero_shot: Dict[str, Any] = {}
    five_shot: Dict[str, Any] = {}

    for label, data in raw.items():
        if label == "5shot":
            # Recurse into 5-shot sub-dict
            if isinstance(data, dict):
                for sub_label, sub_data in data.items():
                    if isinstance(sub_data, dict) and "per_task_metrics" in sub_data:
                        for task_name, metrics in sub_data["per_task_metrics"].items():
                            five_shot[task_name] = metrics
            continue

        if isinstance(data, dict) and "per_task_metrics" in data:
            for task_name, metrics in data["per_task_metrics"].items():
                zero_shot[task_name] = metrics

    return zero_shot, five_shot


def _get_acc(metrics: dict, prefer_norm: bool = False) -> Optional[float]:
    """Extract accuracy from lm-eval metrics dict."""
    if prefer_norm and "acc_norm,none" in metrics:
        val = metrics["acc_norm,none"]
        if isinstance(val, (int, float)):
            return float(val)
    if "acc,none" in metrics:
        val = metrics["acc,none"]
        if isinstance(val, (int, float)):
            return float(val)
    return None


def _fmt_pct(val: Optional[float]) -> str:
    """Format as percentage string or N/A."""
    if val is None:
        return "N/A"
    return f"{val * 100:.2f}%"


def _fmt_f(val, decimals: int = 4) -> str:
    """Format float or return N/A."""
    if isinstance(val, (int, float)):
        return f"{val:.{decimals}f}"
    return str(val) if val is not None else "N/A"


# =========================================================================
# Main report generator
# =========================================================================

def generate_report(
    phase1_results: dict,
    phase2_results: dict,
    generation_samples: list,
    output_dir: Path,
    checkpoint_name: str = "checkpoint-0057000",
    total_elapsed_sec: float = 0.0,
) -> str:
    """Render every report section, write them to disk and return the full text.

    The GPU-labelled result dicts are normalized first. Each section is then
    written to ``<output_dir>/reports/NN_*.md`` and the concatenation of all
    sections to ``<output_dir>/full_eval_report.md``. Missing directories
    are created.

    Returns:
        The combined markdown report.
    """
    root = Path(output_dir)
    root.mkdir(parents=True, exist_ok=True)
    sections_dir = root / "reports"
    sections_dir.mkdir(parents=True, exist_ok=True)

    # Map GPU labels onto logical sections / flat task tables.
    phase1 = _normalize_phase1_results(phase1_results)
    zero_shot, five_shot = _normalize_phase2_results(phase2_results)

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Render the detail sections first, the summary last.
    perplexity_md = _generate_perplexity_report(phase1["perplexity"])
    calibration_md = _generate_calibration_report(
        phase1["calibration"], phase1["token_nll"]
    )
    generation_md = _generate_generation_report(
        phase1["generation"], generation_samples
    )
    benchmark_md = _generate_benchmark_report(
        zero_shot, five_shot, phase1["repetition"]
    )
    summary_md = _generate_executive_summary(
        phase1, zero_shot, five_shot, checkpoint_name, timestamp, total_elapsed_sec,
    )

    # File order doubles as the section order of the combined report.
    sections = [
        ("00_executive_summary.md", summary_md),
        ("01_perplexity_report.md", perplexity_md),
        ("02_calibration_report.md", calibration_md),
        ("03_generation_quality.md", generation_md),
        ("04_benchmark_report.md", benchmark_md),
    ]
    for filename, body in sections:
        (sections_dir / filename).write_text(body, encoding="utf-8")

    combined = "\n\n---\n\n".join(body for _, body in sections)
    (root / "full_eval_report.md").write_text(combined, encoding="utf-8")

    return combined


# =========================================================================
# Individual report sections
# =========================================================================

def _generate_executive_summary(
    p1: dict,
    zero_shot: dict,
    five_shot: dict,
    checkpoint_name: str,
    eval_datetime: str,
    total_elapsed_sec: float,
) -> str:
    """Build the executive-summary section of the evaluation report.

    The section holds a header, a table of headline metrics and a comparison
    table against hard-coded reference-model figures.

    Args:
        p1: Normalized phase-1 results; only the "perplexity" and
            "calibration" entries are read here.
        zero_shot: Mapping of task name to lm-eval metrics for 0-shot runs.
        five_shot: Mapping of task name to metrics for 5-shot runs. Accepted
            for signature symmetry; this section does not read it.
        checkpoint_name: Checkpoint label shown in the header.
        eval_datetime: Already-formatted evaluation timestamp.
        total_elapsed_sec: Total elapsed time in seconds.

    Returns:
        The section as one markdown string (lines joined with newlines).
    """
    lines = [
        "# FRANKENSTALLM 3B 종합 평가 리포트\n",
        "- **모델**: FRANKENSTALLM 3B",
        f"- **체크포인트**: {checkpoint_name}",
        f"- **평가 일시**: {eval_datetime}",
        f"- **총 소요 시간**: {total_elapsed_sec:.1f}초\n",
        "## Executive Summary\n",
    ]

    def _usable_accs(task_names):
        # Accuracies of the named 0-shot tasks, skipping absent or unscored ones.
        found = (_get_acc(zero_shot[name]) for name in task_names if name in zero_shot)
        return [acc for acc in found if acc is not None]

    def _mean_pct(values):
        # Unweighted mean as a percentage string, "N/A" when there is no data.
        return _fmt_pct(sum(values) / len(values)) if values else "N/A"

    # Main PPL: first of the known validation-set names that is present.
    main_ppl = "N/A"
    ppl_data = p1.get("perplexity", {})
    for name in ("3b", "3b_val"):
        entry = ppl_data.get(name)
        if isinstance(entry, dict):
            main_ppl = _fmt_f(entry.get("ppl"))
            break

    # KoBEST average over whichever of the five tasks were evaluated.
    kobest_accs = _usable_accs([
        "kobest_boolq", "kobest_copa", "kobest_hellaswag",
        "kobest_sentineg", "kobest_wic",
    ])
    kobest_avg = _mean_pct(kobest_accs)

    # MMLU-KO: prefer the group-level figure reported by lm-eval; fall back
    # to the unweighted subtask mean when the group entry is missing OR has
    # no usable accuracy (same fallback as the benchmark section).
    mmlu_ko_sub_accs = []
    for task, metrics in zero_shot.items():
        if task.startswith("global_mmlu_ko_"):
            acc = _get_acc(metrics)
            if acc is not None:
                mmlu_ko_sub_accs.append(acc)
    mmlu_ko_group = _usable_accs(["global_mmlu_ko"])
    if mmlu_ko_group:
        mmlu_ko_avg = _fmt_pct(mmlu_ko_group[0])
        # Subtask count is for display only; 1 means "group-level only".
        mmlu_ko_count = len(mmlu_ko_sub_accs) or 1
    else:
        mmlu_ko_avg = _mean_pct(mmlu_ko_sub_accs)
        mmlu_ko_count = len(mmlu_ko_sub_accs)

    # MMLU-EN: average the per-subject tasks, excluding group-level keys so
    # that aggregates are not counted alongside their own members.
    mmlu_en_categories = (
        "mmlu_humanities", "mmlu_social_sciences", "mmlu_stem", "mmlu_other",
    )
    mmlu_en_accs = []
    for task, metrics in zero_shot.items():
        if task.startswith("mmlu_") and task not in mmlu_en_categories:
            acc = _get_acc(metrics)
            if acc is not None:
                mmlu_en_accs.append(acc)
    if not mmlu_en_accs:
        # No subjects available: use the overall "mmlu" aggregate if present,
        # otherwise the mean of the category aggregates. Mixing "mmlu" with
        # its own categories would double-count.
        mmlu_en_accs = _usable_accs(["mmlu"]) or _usable_accs(mmlu_en_categories)
    mmlu_en_avg = _mean_pct(mmlu_en_accs)

    # HAE-RAE (single group-level figure).
    haerae = _usable_accs(["haerae"])
    haerae_acc = _fmt_pct(haerae[0]) if haerae else "N/A"

    # English benchmarks; normalized accuracy is preferred for these two.
    norm_preferred = ("hellaswag", "arc_challenge")
    en_benchmarks = {}
    for task in ("hellaswag", "arc_easy", "arc_challenge", "winogrande", "piqa"):
        if task in zero_shot:
            acc = _get_acc(zero_shot[task], prefer_norm=task in norm_preferred)
            if acc is not None:
                en_benchmarks[task] = acc

    # Top-1 accuracy from the calibration run.
    top1 = _fmt_f(p1.get("calibration", {}).get("top1_accuracy"))

    lines.append("| 메트릭 | 값 |")
    lines.append("|--------|-----|")
    lines.append(f"| 주요 PPL (3b_val) | {main_ppl} |")
    lines.append(f"| MMLU-KO 평균 ({mmlu_ko_count}과목) | {mmlu_ko_avg} |")
    lines.append(f"| MMLU-EN 평균 | {mmlu_en_avg} |")
    lines.append(f"| KoBEST 평균 ({len(kobest_accs)}태스크) | {kobest_avg} |")
    lines.append(f"| HAE-RAE | {haerae_acc} |")
    for task, acc in en_benchmarks.items():
        lines.append(f"| {task} (0-shot) | {_fmt_pct(acc)} |")
    lines.append(f"| Top-1 정확도 (Calibration) | {top1} |")
    lines.append("")

    # Reference comparison.
    # NOTE(review): these approximate figures are hard-coded here and differ
    # from the values in _REFERENCE_MODELS further down the file — confirm
    # which set is authoritative.
    lines.append("## 참고 모델 비교\n")
    lines.append("| 모델 | 파라미터 | MMLU-KO | MMLU-EN | KoBEST 평균 | PPL |")
    lines.append("|------|---------|---------|---------|------------|-----|")
    lines.append(f"| **FRANKENSTALLM 3B** | 3B | {mmlu_ko_avg} | {mmlu_en_avg} | {kobest_avg} | {main_ppl} |")
    lines.append("| Llama-3.2-3B | 3B | ~42% | ~58% | ~55% | — |")
    lines.append("| Qwen2.5-3B | 3B | ~48% | ~65% | ~60% | — |")
    lines.append("| EXAONE-3.5-2.4B | 2.4B | ~35% | ~50% | ~50% | — |")
    lines.append("")

    return "\n".join(lines)


def _generate_perplexity_report(ppl_data: dict) -> str:
    lines = ["# Perplexity 평가\n"]

    if not ppl_data:
        lines.append("데이터 없음\n")
        return "\n".join(lines)

    rows = []
    for name, metrics in ppl_data.items():
        if isinstance(metrics, dict) and "ppl" in metrics:
            rows.append({
                "name": name,
                "ppl": metrics.get("ppl"),
                "bits": metrics.get("bits_per_token"),
                "n_tokens": metrics.get("n_tokens"),
                "n_eval": metrics.get("n_eval_tokens"),
                "elapsed": metrics.get("elapsed_sec"),
            })

    rows.sort(key=lambda x: x["ppl"] if isinstance(x["ppl"], (int, float)) else float("inf"),
              reverse=True)

    lines.append("| 데이터셋 | PPL | Bits/Token | 전체 토큰 | 평가 토큰 | 소요 시간 |")
    lines.append("|---------|-----|-----------|---------|---------|---------|")
    for r in rows:
        lines.append(
            f"| {r['name']} | {_fmt_f(r['ppl'])} | {_fmt_f(r['bits'])} | "
            f"{r['n_tokens']:,} | {r['n_eval']:,} | {_fmt_f(r['elapsed'], 1)}s |"
            if isinstance(r['n_tokens'], (int, float)) and isinstance(r['n_eval'], (int, float))
            else f"| {r['name']} | {_fmt_f(r['ppl'])} | {_fmt_f(r['bits'])} | "
                 f"{r['n_tokens']} | {r['n_eval']} | {_fmt_f(r['elapsed'], 1)}s |"
        )
    lines.append("")
    return "\n".join(lines)


def _generate_calibration_report(cal_data: dict, nll_data: dict) -> str:
    lines = ["# Calibration 및 Token NLL 분석\n"]

    # Calibration
    lines.append("## Calibration 결과\n")
    if cal_data:
        lines.append("| 메트릭 | 값 |")
        lines.append("|--------|-----|")
        metrics_map = {
            "top1_accuracy": "Top-1 Accuracy",
            "top5_accuracy": "Top-5 Accuracy",
            "top10_accuracy": "Top-10 Accuracy",
            "mean_correct_prob": "Mean Correct Prob",
            "mean_entropy": "Mean Entropy",
        }
        for key, label in metrics_map.items():
            lines.append(f"| {label} | {_fmt_f(cal_data.get(key))} |")
        lines.append("")
    else:
        lines.append("데이터 없음\n")

    # Token NLL
    lines.append("## Token NLL 분포\n")
    if nll_data:
        # Keys may be "mean"/"std" or "nll_mean"/"nll_std"
        stats_map = [
            (["nll_mean", "mean"], "평균"),
            (["nll_std", "std"], "표준편차"),
            (["nll_median", "median"], "중앙값"),
            (["nll_min", "min"], "최솟값"),
            (["nll_max", "max"], "최댓값"),
        ]
        lines.append("| 통계 | 값 |")
        lines.append("|------|-----|")
        for candidates, label in stats_map:
            val = None
            for c in candidates:
                if c in nll_data:
                    val = nll_data[c]
                    break
            lines.append(f"| {label} | {_fmt_f(val)} |")
        lines.append("")

        # Percentiles: "nll_percentiles" (dict) or "percentiles" (dict)
        pct_data = nll_data.get("nll_percentiles", nll_data.get("percentiles"))
        if pct_data and isinstance(pct_data, dict):
            lines.append("### Percentiles\n")
            lines.append("| Percentile | 값 |")
            lines.append("|------------|-----|")
            for pct, value in pct_data.items():
                lines.append(f"| {pct}th | {_fmt_f(value)} |")
            lines.append("")

        # High loss: "high_loss_fractions" (dict) or flat "high_loss_fraction_N" keys
        hlf = nll_data.get("high_loss_fractions")
        if hlf and isinstance(hlf, dict):
            lines.append("### 고손실 토큰 비율\n")
            lines.append("| 임계값 | 비율 |")
            lines.append("|--------|-----|")
            for threshold, fraction in hlf.items():
                lines.append(f"| NLL > {threshold} | {_fmt_f(fraction)} |")
            lines.append("")
        else:
            # Check flat keys: high_loss_fraction_5, high_loss_fraction_10, ...
            hlf_flat = {k.replace("high_loss_fraction_", ""): v
                        for k, v in nll_data.items()
                        if k.startswith("high_loss_fraction_")}
            if hlf_flat:
                lines.append("### 고손실 토큰 비율\n")
                lines.append("| 임계값 | 비율 |")
                lines.append("|--------|-----|")
                for threshold, fraction in sorted(hlf_flat.items()):
                    lines.append(f"| NLL > {threshold} | {_fmt_f(fraction)} |")
                lines.append("")
    else:
        lines.append("데이터 없음\n")

    return "\n".join(lines)


def _generate_generation_report(gen_data: dict, samples: list) -> str:
    lines = ["# 생성 품질 분석\n"]

    if gen_data and "summary" in gen_data:
        lines.append("## 요약 통계\n")
        lines.append("| 메트릭 | 값 |")
        lines.append("|--------|-----|")
        for key, value in gen_data["summary"].items():
            display = key.replace("_", " ").title()
            lines.append(f"| {display} | {_fmt_f(value)} |")
        lines.append("")

    if samples:
        lines.append("## 생성 샘플 (Greedy)\n")
        for i, sample in enumerate(samples[:5], 1):
            if isinstance(sample, dict):
                prompt = sample.get("prompt", "")
                generated = sample.get("generated_text", "")
                if len(generated) > 300:
                    generated = generated[:300] + "..."
                lines.append(f"### 샘플 {i}\n")
                lines.append(f"**Prompt**: {prompt}\n")
                lines.append(f"**Generated**: {generated}\n")
        lines.append("")
    elif not gen_data:
        lines.append("데이터 없음\n")

    return "\n".join(lines)


def _generate_benchmark_report(
    zero_shot: dict,
    five_shot: dict,
    repetition: dict,
) -> str:
    lines = ["# 표준 벤치마크 결과\n"]

    if not zero_shot and not five_shot:
        lines.append("데이터 없음\n")
        return "\n".join(lines)

    # --- Korean Benchmarks ---
    lines.append("## 한국어 벤치마크\n")

    # KoBEST
    kobest_names = ["kobest_boolq", "kobest_copa", "kobest_hellaswag",
                    "kobest_sentineg", "kobest_wic"]
    kobest_0 = {t: zero_shot[t] for t in kobest_names if t in zero_shot}
    if kobest_0:
        lines.append("### KoBEST (0-shot)\n")
        lines.append("| 태스크 | Accuracy | F1 |")
        lines.append("|--------|----------|-----|")
        for t in kobest_names:
            if t in kobest_0:
                m = kobest_0[t]
                acc = _fmt_pct(_get_acc(m))
                f1 = _fmt_f(m.get("f1,none"))
                lines.append(f"| {t} | {acc} | {f1} |")
        kobest_accs = [_get_acc(kobest_0[t]) for t in kobest_names
                       if t in kobest_0 and _get_acc(kobest_0[t]) is not None]
        if kobest_accs:
            lines.append(f"| **평균** | **{_fmt_pct(sum(kobest_accs)/len(kobest_accs))}** | |")
        lines.append("")

    # HAE-RAE
    if "haerae" in zero_shot:
        lines.append("### HAE-RAE (0-shot)\n")
        m = zero_shot["haerae"]
        lines.append(f"- Accuracy: {_fmt_pct(_get_acc(m))}")
        # Check for sub-tasks
        haerae_subs = {t: zero_shot[t] for t in zero_shot if t.startswith("haerae_") and t != "haerae"}
        if haerae_subs:
            lines.append("\n| 서브태스크 | Accuracy |")
            lines.append("|-----------|----------|")
            for t, sm in sorted(haerae_subs.items()):
                lines.append(f"| {t} | {_fmt_pct(_get_acc(sm))} |")
        lines.append("")

    # MMLU-KO
    mmlu_ko_tasks = {t: zero_shot[t] for t in zero_shot
                     if t.startswith("global_mmlu_ko") and t != "global_mmlu_ko"}
    if mmlu_ko_tasks or "global_mmlu_ko" in zero_shot:
        lines.append("### MMLU-KO (0-shot)\n")
        if mmlu_ko_tasks:
            lines.append(f"평가된 과목 수: **{len(mmlu_ko_tasks)}**\n")
            accs = [(t, _get_acc(m)) for t, m in sorted(mmlu_ko_tasks.items())
                    if _get_acc(m) is not None]
            if accs:
                # Prefer group-level weighted average from lm-eval
                group_acc = _get_acc(zero_shot["global_mmlu_ko"]) if "global_mmlu_ko" in zero_shot else None
                avg_acc = group_acc if group_acc is not None else sum(a for _, a in accs) / len(accs)
                lines.append(f"전체 평균: **{_fmt_pct(avg_acc)}**\n")

                # Top 10
                accs_sorted = sorted(accs, key=lambda x: x[1], reverse=True)
                lines.append("**상위 10개 과목**:\n")
                lines.append("| 과목 | Accuracy |")
                lines.append("|------|----------|")
                for t, a in accs_sorted[:10]:
                    subject = t.replace("global_mmlu_ko_", "")
                    lines.append(f"| {subject} | {_fmt_pct(a)} |")
                lines.append("")

                lines.append("**하위 10개 과목**:\n")
                lines.append("| 과목 | Accuracy |")
                lines.append("|------|----------|")
                for t, a in accs_sorted[-10:]:
                    subject = t.replace("global_mmlu_ko_", "")
                    lines.append(f"| {subject} | {_fmt_pct(a)} |")
                lines.append("")
        elif "global_mmlu_ko" in zero_shot:
            a = _get_acc(zero_shot["global_mmlu_ko"])
            lines.append(f"전체 정확도: {_fmt_pct(a)}\n")

    # --- English Benchmarks ---
    lines.append("## 영어 벤치마크\n")

    en_tasks = ["hellaswag", "arc_easy", "arc_challenge", "winogrande", "piqa"]
    en_found = {t: zero_shot[t] for t in en_tasks if t in zero_shot}
    if en_found:
        lines.append("### 주요 벤치마크 (0-shot)\n")
        lines.append("| 태스크 | Accuracy | Acc (norm) |")
        lines.append("|--------|----------|-----------|")
        for t in en_tasks:
            if t in en_found:
                m = en_found[t]
                acc = _fmt_pct(_get_acc(m))
                acc_norm = _fmt_pct(_get_acc(m, prefer_norm=True) if "acc_norm,none" in m else None)
                lines.append(f"| {t} | {acc} | {acc_norm} |")
        lines.append("")

    # MMLU-EN
    mmlu_en_tasks = {t: zero_shot[t] for t in zero_shot
                     if (t.startswith("mmlu_") or t == "mmlu") and not t.startswith("mmlu_ko")}
    if mmlu_en_tasks:
        lines.append("### MMLU-EN (0-shot)\n")
        # Filter out the group-level "mmlu" if sub-tasks exist
        subtasks = {t: m for t, m in mmlu_en_tasks.items() if t != "mmlu"}
        if subtasks:
            lines.append(f"평가된 과목 수: **{len(subtasks)}**\n")
            accs = [(t, _get_acc(m)) for t, m in sorted(subtasks.items())
                    if _get_acc(m) is not None]
            if accs:
                avg_acc = sum(a for _, a in accs) / len(accs)
                lines.append(f"전체 평균: **{_fmt_pct(avg_acc)}**\n")

                accs_sorted = sorted(accs, key=lambda x: x[1], reverse=True)
                lines.append("**상위 10개 과목**:\n")
                lines.append("| 과목 | Accuracy |")
                lines.append("|------|----------|")
                for t, a in accs_sorted[:10]:
                    subject = t.replace("mmlu_", "")
                    lines.append(f"| {subject} | {_fmt_pct(a)} |")
                lines.append("")

                lines.append("**하위 10개 과목**:\n")
                lines.append("| 과목 | Accuracy |")
                lines.append("|------|----------|")
                for t, a in accs_sorted[-10:]:
                    subject = t.replace("mmlu_", "")
                    lines.append(f"| {subject} | {_fmt_pct(a)} |")
                lines.append("")
        elif "mmlu" in mmlu_en_tasks:
            a = _get_acc(mmlu_en_tasks["mmlu"])
            lines.append(f"전체 정확도: {_fmt_pct(a)}\n")

    # --- 0-shot vs 5-shot Comparison ---
    if five_shot:
        lines.append("## 0-shot vs 5-shot 비교\n")

        # Collect all tasks that have both 0-shot and 5-shot results
        common_tasks = sorted(set(zero_shot.keys()) & set(five_shot.keys()))
        if common_tasks:
            lines.append("| 태스크 | 0-shot Acc | 5-shot Acc | 변화 |")
            lines.append("|--------|-----------|-----------|------|")
            for t in common_tasks:
                a0 = _get_acc(zero_shot[t])
                a5 = _get_acc(five_shot[t])
                if a0 is not None and a5 is not None:
                    diff = a5 - a0
                    sign = "+" if diff >= 0 else ""
                    lines.append(
                        f"| {t} | {_fmt_pct(a0)} | {_fmt_pct(a5)} | {sign}{diff*100:.2f}pp |"
                    )
                else:
                    lines.append(f"| {t} | {_fmt_pct(a0)} | {_fmt_pct(a5)} | — |")
            lines.append("")

            # Summary
            diffs = []
            for t in common_tasks:
                a0 = _get_acc(zero_shot[t])
                a5 = _get_acc(five_shot[t])
                if a0 is not None and a5 is not None:
                    diffs.append(a5 - a0)
            if diffs:
                avg_diff = sum(diffs) / len(diffs)
                improved = sum(1 for d in diffs if d > 0)
                degraded = sum(1 for d in diffs if d < 0)
                lines.append(
                    f"평균 변화: {'+' if avg_diff >= 0 else ''}{avg_diff*100:.2f}pp | "
                    f"개선: {improved} | 하락: {degraded} | 동일: {len(diffs) - improved - degraded}\n"
                )

    # --- Repetition ---
    if repetition and repetition.get("grid_results"):
        lines.append("## Repetition 파라미터 검색\n")
        rep_data = repetition["grid_results"]
        rep_rows = []
        # grid_results can be a list of dicts or a dict of dicts
        items = rep_data.items() if isinstance(rep_data, dict) else enumerate(rep_data)
        for key, metrics in items:
            if isinstance(metrics, dict):
                rep_rows.append({
                    "config": metrics.get("params", str(key)),
                    "temp": metrics.get("temperature"),
                    "rep_pen": metrics.get("repetition_penalty"),
                    "3gram": metrics.get("avg_3gram_rep", metrics.get("3gram_repetition", float("inf"))),
                    "4gram": metrics.get("avg_4gram_rep", metrics.get("4gram_repetition")),
                    "eos_rate": metrics.get("eos_rate"),
                    "avg_tokens": metrics.get("avg_tokens"),
                })
        rep_rows.sort(key=lambda x: x["3gram"] if isinstance(x["3gram"], (int, float)) else float("inf"))

        lines.append("| 설정 | Temp | Rep Pen | 3-gram | 4-gram | EOS Rate | Avg Tokens |")
        lines.append("|------|------|---------|--------|--------|----------|-----------|")
        for i, r in enumerate(rep_rows):
            marker = " **← best**" if i == 0 else ""
            lines.append(
                f"| {r['config']} | {_fmt_f(r['temp'], 2)} | {_fmt_f(r['rep_pen'], 2)} | "
                f"{_fmt_f(r['3gram'])} | {_fmt_f(r['4gram'])} | "
                f"{_fmt_f(r['eos_rate'])} | {_fmt_f(r['avg_tokens'], 1)} |{marker}"
            )
        lines.append("")

    lines.append("---\n")
    lines.append("*이 리포트는 자동으로 생성되었습니다.*")
    return "\n".join(lines)


# =========================================================================
# Base vs SFT Comparison Report
# =========================================================================

# Base model reference values (from 3b_reeval_20260305_1451)
# Perplexity per dataset. Every dataset appears twice with the same value:
# once under its "*_val" name and once under the bare name.
_BASE_PPL_REFERENCE = {
    "3b_val": 5.2263,
    "3b": 5.2263,
    "korean_c4_val": 5.7173,
    "korean_c4": 5.7173,
    "hplt_ko_val": 2.4028,
    "hplt_ko": 2.4028,
    "cc100_ko_val": 21.782,
    "cc100_ko": 21.782,
    "korean_val": 9.6505,
    "korean": 9.6505,
}

# Base-model benchmark scores per task, stored as fractions.
# NOTE(review): presumably 0-shot accuracy — confirm against the re-eval run.
_BASE_BENCH_REFERENCE = {
    "kobest_boolq": 0.5028,
    "kobest_copa": 0.4930,
    "kobest_hellaswag": 0.2160,
    "kobest_sentineg": 0.4861,
    "kobest_wic": 0.4865,
    "haerae": 0.1971,
    "global_mmlu_ko": 0.2275,
    "hellaswag": 0.2600,
    "arc_easy": 0.2563,
    "arc_challenge": 0.2167,
    "winogrande": 0.5059,
    "piqa": 0.5250,
}

# Base-model greedy-decoding statistics (n-gram repetition and EOS rate).
_BASE_GEN_REFERENCE = {
    "greedy_3gram_rep": 0.6099,
    "greedy_4gram_rep": 0.5702,
    "greedy_eos_rate": 0.0,
}

# Base-model calibration statistics.
_BASE_CALIB_REFERENCE = {
    "top1_accuracy": 0.6875,
    "top5_accuracy": 0.8164,
    "top10_accuracy": 0.8593,
    "mean_entropy": 1.5682,
}

# Base-model token-NLL statistics.
_BASE_NLL_REFERENCE = {
    "nll_mean": 1.5561,
    "high_loss_fraction_5": 0.1086,
}

# =========================================================================
# Threshold Justification
# =========================================================================
# PPL forgetting 15%: 10-20% by the continual-learning criterion of Kirkpatrick et al. (2017)
# KoBEST avg 55%: Random baseline ~40%, Llama 3.2 1B ~52%, Qwen 2.5 3B ~58%
# MMLU-KO 30%: Random 25%, Llama 3.2 3B ~35%
# Greedy 3-gram rep <5%: human Korean text is 1-3% over 256 tokens; the base model is 61%
# EOS rate >90%: a chat model must end its responses; some long-form outputs are tolerated
# Calibration top1 65%: base is 68.75%; a small drop caused by SFT is allowed
# Distinct-2 >70%: Li et al. (2016), minimum level to guarantee diversity
# =========================================================================

# SFT acceptance thresholds. "*_max" entries are upper bounds and "*_min"
# entries are lower bounds. "ppl_forgetting_max_pct" is expressed in percent;
# the other values are fractions.
_SFT_TARGETS = {
    # Generation quality
    "greedy_3gram_rep_max": 0.05,
    "eos_rate_min": 0.90,
    "sampled_eos_min": 0.50,
    "distinct_2_min": 0.70,
    # Knowledge retention
    "ppl_forgetting_max_pct": 15.0,
    # Korean benchmarks
    "kobest_avg_min": 0.55,
    "haerae_min": 0.25,
    "mmlu_ko_min": 0.30,
    # Calibration
    "top1_accuracy_min": 0.65,
    # English retention
    "hellaswag_min": 0.25,
    "arc_easy_min": 0.25,
    "arc_challenge_min": 0.21,
    "winogrande_min": 0.49,
    "piqa_min": 0.51,
    "mmlu_en_avg_min": 0.25,
}

# Scores of external reference models, stored as fractions.
# NOTE(review): the source of these figures is not shown in this file — confirm.
_REFERENCE_MODELS = {
    "Llama 3.2 1B":  {"kobest_avg": 0.52, "mmlu_ko": 0.28, "mmlu_en": 0.32},
    "Llama 3.2 3B":  {"kobest_avg": 0.56, "mmlu_ko": 0.35, "mmlu_en": 0.55},
    "Qwen 2.5 3B":   {"kobest_avg": 0.58, "mmlu_ko": 0.42, "mmlu_en": 0.58},
}


def _compute_orpo_score(sft_p1, sft_zero, base_p1, base_zero):
    """Score the SFT model on a 0-100 scale to decide whether ORPO is needed.

    Seven weighted dimensions are scored: PPL forgetting (25), greedy
    repetition (20), EOS rate (10), KoBEST average (20), calibration (10),
    distinct-2 diversity (10) and English retention (5). A dimension whose
    metric is missing scores 0 and lowers ``confidence``.

    Args:
        sft_p1: SFT phase-1 results; reads the "perplexity", "generation"
            and "calibration" sections.
        sft_zero: SFT 0-shot benchmark results keyed by task name.
        base_p1: Base phase-1 results, used only for PPL forgetting.
        base_zero: Not read by this function; kept for call compatibility.

    Returns:
        dict with keys: total_score, dimensions, decision
        ("DEPLOY" / "ORPO" / "SFT_RETRY"), confidence, orpo_gain_estimate.
    """
    dimensions = {}
    missing = 0
    total_dims = 7

    def _not_available(weight, threshold_label):
        """Entry for a dimension whose metric is absent (contributes 0 points)."""
        return {"score": 0, "weight": weight, "current": "N/A",
                "threshold": threshold_label, "status": "N/A"}

    def _floor_dimension(value, weight, threshold, value_fmt, threshold_op):
        """Score a higher-is-better metric as weight * min(value / threshold, 1)."""
        return {
            "score": round(weight * min(value / threshold, 1), 1), "weight": weight,
            "current": format(value, value_fmt), "threshold": f"{threshold_op}{threshold:.0%}",
            "status": "PASS" if value >= threshold else "FAIL",
        }

    # Dim 1: PPL forgetting (25 pts)
    max_forgetting = _get_max_forgetting(sft_p1, base_p1)
    if max_forgetting is not None:
        threshold = _SFT_TARGETS["ppl_forgetting_max_pct"]
        # Clamp the ratio to [0, 1]: negative forgetting (SFT PPL below Base)
        # must not award more than the 25-point weight of this dimension.
        score = 25 * min(max(0, 1 - max_forgetting / threshold), 1)
        dimensions["ppl_forgetting"] = {
            "score": round(score, 1), "weight": 25,
            "current": round(max_forgetting, 1), "threshold": f"<{threshold}%",
            "status": "PASS" if max_forgetting < threshold else "FAIL",
        }
    else:
        missing += 1
        dimensions["ppl_forgetting"] = _not_available(25, "<15%")

    # Dim 2: greedy repetition rate (20 pts) — lower is better
    rep_rate = _get_greedy_3gram_rep(sft_p1)
    if rep_rate is not None:
        threshold = _SFT_TARGETS["greedy_3gram_rep_max"]
        score = 20 * max(0, 1 - rep_rate / threshold)
        dimensions["greedy_rep"] = {
            "score": round(score, 1), "weight": 20,
            "current": f"{rep_rate:.1%}", "threshold": f"<{threshold:.0%}",
            "status": "PASS" if rep_rate < threshold else "FAIL",
        }
    else:
        missing += 1
        dimensions["greedy_rep"] = _not_available(20, "<5%")

    # Dim 3: EOS termination rate (10 pts)
    gen_summary = sft_p1.get("generation", {}).get("summary", {})
    eos_rate = gen_summary.get("greedy_eos_rate")
    if eos_rate is not None:
        dimensions["eos_rate"] = _floor_dimension(
            eos_rate, 10, _SFT_TARGETS["eos_rate_min"], ".0%", ">")
    else:
        missing += 1
        dimensions["eos_rate"] = _not_available(10, ">90%")

    # Dim 4: KoBEST average (20 pts)
    kobest_avg = _get_kobest_avg(sft_zero)
    if kobest_avg is not None:
        dimensions["kobest_avg"] = _floor_dimension(
            kobest_avg, 20, _SFT_TARGETS["kobest_avg_min"], ".1%", ">")
    else:
        missing += 1
        dimensions["kobest_avg"] = _not_available(20, ">55%")

    # Dim 5: calibration, top-1 accuracy (10 pts)
    top1 = sft_p1.get("calibration", {}).get("top1_accuracy")
    if top1 is not None:
        dimensions["calibration"] = _floor_dimension(
            top1, 10, _SFT_TARGETS["top1_accuracy_min"], ".1%", ">=")
    else:
        missing += 1
        dimensions["calibration"] = _not_available(10, ">=65%")

    # Dim 6: diversity, distinct-2 (10 pts)
    distinct_2 = gen_summary.get("greedy_avg_distinct_2")
    if distinct_2 is not None:
        dimensions["diversity"] = _floor_dimension(
            distinct_2, 10, _SFT_TARGETS["distinct_2_min"], ".0%", ">")
    else:
        missing += 1
        dimensions["diversity"] = _not_available(10, ">70%")

    # Dim 7: English retention (5 pts) — all-or-nothing over the tasks present
    en_tasks = {
        "hellaswag": _SFT_TARGETS["hellaswag_min"],
        "arc_easy": _SFT_TARGETS["arc_easy_min"],
        "arc_challenge": _SFT_TARGETS["arc_challenge_min"],
        "winogrande": _SFT_TARGETS["winogrande_min"],
        "piqa": _SFT_TARGETS["piqa_min"],
    }
    en_all_pass = True
    en_count = 0
    for t, threshold in en_tasks.items():
        # NOTE(review): the report's English table calls _get_acc with
        # prefer_norm for hellaswag/arc_challenge; this call does not —
        # confirm which metric the thresholds are meant for.
        a = _get_acc(sft_zero.get(t, {})) if t in sft_zero else None
        if a is not None:
            en_count += 1
            if a < threshold:
                en_all_pass = False
    if en_count > 0:
        score = 5.0 if en_all_pass else 0.0
        dimensions["english"] = {
            "score": score, "weight": 5,
            "current": "전부 통과" if en_all_pass else "일부 미달",
            "threshold": "—", "status": "PASS" if en_all_pass else "FAIL",
        }
    else:
        missing += 1
        dimensions["english"] = _not_available(5, "—")

    total_score = sum(d["score"] for d in dimensions.values())
    confidence = round(1.0 - (missing / total_dims), 2)

    if missing >= 2:
        logger.warning("ORPO score has %d/%d missing dimensions — confidence %.0f%%", missing, total_dims, confidence * 100)

    # ORPO gain estimate: points of the failing dimensions ORPO can improve
    orpo_improvable = 0.0
    if rep_rate is not None and rep_rate >= _SFT_TARGETS["greedy_3gram_rep_max"]:
        orpo_improvable += 20.0  # repetition
    if eos_rate is not None and eos_rate < _SFT_TARGETS["eos_rate_min"]:
        orpo_improvable += 10.0  # eos
    if distinct_2 is not None and distinct_2 < _SFT_TARGETS["distinct_2_min"]:
        orpo_improvable += 5.0  # partial diversity improvement

    # Decision: ORPO is only recommended when knowledge retention is measured
    # and within the forgetting limit; otherwise a mid score means SFT retry.
    forgetting_ok = max_forgetting is not None and max_forgetting < _SFT_TARGETS["ppl_forgetting_max_pct"]
    if total_score >= 80:
        decision = "DEPLOY"
    elif total_score >= 40 and forgetting_ok:
        decision = "ORPO"
    else:
        decision = "SFT_RETRY"

    return {
        "total_score": round(total_score, 1),
        "dimensions": dimensions,
        "decision": decision,
        "confidence": confidence,
        "orpo_gain_estimate": round(orpo_improvable, 1),
    }


def generate_comparison_report(
    base_results_dir: Path,
    sft_phase1_results: dict,
    sft_phase2_results: dict,
    output_path: Path,
    sft_output_dir: Optional[Path] = None,
    total_elapsed_sec: float = 0.0,
) -> Path:
    """Generate a comprehensive Base vs SFT comparison report.

    The report has seven sections: executive summary (including the ORPO
    score), perplexity, generation quality, Korean benchmarks, English
    benchmarks, calibration, and the final recommendation. Base values that
    are missing from ``base_results_dir`` fall back to the module's built-in
    reference tables where the code provides one.

    Args:
        base_results_dir: Directory containing Base model's phase1/phase2_results.json
        sft_phase1_results: SFT Phase 1 results dict
        sft_phase2_results: SFT Phase 2 results dict
        output_path: Where to write the markdown report
        sft_output_dir: SFT eval outputs directory (for linking); when given,
            a copy named ``sft_comparison_report.md`` is also written there
        total_elapsed_sec: Total pipeline elapsed time

    Returns:
        Path to the generated report
    """
    base_results_dir = Path(base_results_dir)
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Load Base results
    base_p1 = {}
    base_p2 = {}
    p1_file = base_results_dir / "phase1_results.json"
    p2_file = base_results_dir / "phase2_results.json"
    if p1_file.exists():
        with open(p1_file, encoding="utf-8") as f:
            base_p1 = json.load(f)
    else:
        logger.warning("Base phase1 results not found at %s — using built-in reference values", p1_file)
    if p2_file.exists():
        with open(p2_file, encoding="utf-8") as f:
            base_p2 = json.load(f)
    else:
        logger.warning("Base phase2 results not found at %s — using built-in reference values", p2_file)

    # Normalize both
    sft_p1 = _normalize_phase1_results(sft_phase1_results)
    base_p1_norm = _normalize_phase1_results(base_p1)
    sft_zero, sft_five = _normalize_phase2_results(sft_phase2_results)
    base_zero, base_five = _normalize_phase2_results(base_p2)

    eval_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    lines = []

    # === Header ===
    lines.append("# FRANKENSTALLM 3B SFT 모델 다면적 종합 평가 보고서\n")
    lines.append(f"- **평가 일시**: {eval_datetime}")
    # NOTE(review): the checkpoint and Base-reference labels below are fixed
    # strings, not derived from the arguments — confirm they match the run.
    lines.append("- **SFT 체크포인트**: checkpoint-best (val_loss=1.8851, step 25500)")
    lines.append("- **Base 참조 결과**: 3b_reeval_20260305_1451")
    lines.append(f"- **총 소요 시간**: {_fmt_seconds(total_elapsed_sec)}")
    if sft_output_dir:
        lines.append(f"- **결과 디렉토리**: {sft_output_dir}")
    lines.append("")

    # === 1. Executive Summary ===
    lines.append("## 1. Executive Summary\n")
    verdicts = _compute_verdicts(sft_p1, sft_zero, base_p1_norm, base_zero)
    lines.append("| 평가 차원 | 결과 | 상세 |")
    lines.append("|----------|------|------|")
    for dim_name, verdict, detail in verdicts:
        icon = "PASS" if verdict else "FAIL"
        lines.append(f"| {dim_name} | **{icon}** | {detail} |")
    lines.append("")

    pass_count = sum(1 for _, v, _ in verdicts if v)
    total_dims = len(verdicts)
    lines.append(f"**종합**: {pass_count}/{total_dims} 차원 통과\n")

    # ORPO verdict — quantitative scoring
    rep_rate = _get_greedy_3gram_rep(sft_p1)
    kobest_avg = _get_kobest_avg(sft_zero)
    max_forgetting = _get_max_forgetting(sft_p1, base_p1_norm)

    lines.append("### ORPO 판정 (정량 스코어)\n")
    # Computed once; the final recommendation in section 7 reuses this result.
    orpo_result = _compute_orpo_score(sft_p1, sft_zero, base_p1_norm, base_zero)

    lines.append(f"**결정**: {orpo_result['decision']} (확신도: {orpo_result['confidence']:.0%})\n")
    lines.append(f"**정량 스코어**: {orpo_result['total_score']}/100\n")

    lines.append("| 차원 | 점수 | /가중치 | 현재값 | 기준 | 상태 |")
    lines.append("|------|------|--------|--------|------|------|")
    dim_names = {
        "ppl_forgetting": "PPL Forgetting",
        "greedy_rep": "Greedy 반복률",
        "eos_rate": "EOS 종료율",
        "kobest_avg": "KoBEST 평균",
        "calibration": "Calibration",
        "diversity": "다양성",
        "english": "영어 유지",
    }
    for key, label in dim_names.items():
        d = orpo_result["dimensions"].get(key, {})
        lines.append(
            f"| {label} | {d.get('score', 0)} | /{d.get('weight', 0)} | "
            f"{d.get('current', 'N/A')} | {d.get('threshold', '—')} | {d.get('status', 'N/A')} |"
        )
    lines.append("")

    if orpo_result["orpo_gain_estimate"] > 0:
        lines.append(f"**ORPO 기대 이득**: +{orpo_result['orpo_gain_estimate']}점 "
                     "(반복률/EOS/다양성 개선 기대, PPL/벤치 변화 없음)\n")

    # Reference model comparison
    lines.append("**참조 모델 비교**:\n")
    for model_name, ref in _REFERENCE_MODELS.items():
        lines.append(f"- {model_name}: KoBEST={ref['kobest_avg']:.0%}, MMLU-KO={ref['mmlu_ko']:.0%}")
    lines.append("")

    # Decision explanation
    if orpo_result["decision"] == "DEPLOY":
        lines.append("**→ Phase 4: GGUF + Ollama 배포** (스코어 ≥80, 모든 핵심 조건 충족)\n")
    elif orpo_result["decision"] == "ORPO":
        lines.append("**→ Phase 3: ORPO** (스코어 40-79, 지식 보존 양호, 생성 개선 필요)\n")
    else:
        lines.append("**→ SFT 재시도** (스코어 <40 또는 심각한 forgetting)\n")

    # === 2. PPL Comparison ===
    lines.append("## 2. Perplexity 비교 (지식 보존)\n")
    lines.append("| 데이터셋 | Base PPL | SFT PPL | 변화 | Forgetting % | 판정 |")
    lines.append("|---------|---------|---------|------|-------------|------|")

    sft_ppl = sft_p1.get("perplexity", {})
    base_ppl = base_p1_norm.get("perplexity", {})
    forgetting_limit = _SFT_TARGETS["ppl_forgetting_max_pct"]

    # Merge all dataset names
    all_ppl_names = sorted(set(sft_ppl) | set(base_ppl))
    forgetting_values = []
    for name in all_ppl_names:
        sft_val = sft_ppl.get(name, {}).get("ppl") if isinstance(sft_ppl.get(name), dict) else None
        base_val = base_ppl.get(name, {}).get("ppl") if isinstance(base_ppl.get(name), dict) else None
        # Try reference table if base results not available
        if base_val is None:
            base_val = _BASE_PPL_REFERENCE.get(name)

        if sft_val is not None and base_val is not None and base_val > 0:
            forgetting = (sft_val - base_val) / base_val * 100
            forgetting_values.append(forgetting)
            verdict = "PASS" if forgetting < forgetting_limit else "FAIL"
            lines.append(
                f"| {name} | {base_val:.4f} | {sft_val:.4f} | "
                f"{'+' if sft_val >= base_val else ''}{sft_val - base_val:.4f} | "
                f"{forgetting:+.1f}% | {verdict} |"
            )
        elif sft_val is not None and base_val is not None:
            # Non-positive base PPL: a relative change is undefined, so show
            # both values without a forgetting figure (same guard as
            # _get_max_forgetting) instead of dividing by zero.
            lines.append(f"| {name} | {base_val:.4f} | {sft_val:.4f} | — | — | — |")
        elif sft_val is not None:
            lines.append(f"| {name} | — | {sft_val:.4f} | — | — | — |")
        elif base_val is not None:
            lines.append(f"| {name} | {base_val:.4f} | — | — | — | — |")

    if forgetting_values:
        avg_forgetting = sum(forgetting_values) / len(forgetting_values)
        max_f = max(forgetting_values)
        lines.append("")
        lines.append(f"**평균 Forgetting**: {avg_forgetting:+.1f}% | **최대**: {max_f:+.1f}% | "
                      f"**판정**: {'PASS' if max_f < forgetting_limit else 'FAIL'} (임계값 {forgetting_limit}%)")
    lines.append("")

    # === 3. Generation Quality ===
    lines.append("## 3. 생성 품질 비교\n")
    sft_gen = sft_p1.get("generation", {})
    if not sft_gen:
        logger.warning("Generation results missing from SFT Phase 1")
    sft_summary = sft_gen.get("summary", {})

    lines.append("| 지표 | Base | SFT | 목표 | 판정 |")
    lines.append("|------|------|-----|------|------|")

    greedy_3gram = sft_summary.get("greedy_avg_3gram_rep")
    greedy_4gram = sft_summary.get("greedy_avg_4gram_rep")
    eos_rate = sft_summary.get("greedy_eos_rate")

    rep_threshold = _SFT_TARGETS["greedy_3gram_rep_max"]
    eos_threshold = _SFT_TARGETS["eos_rate_min"]
    greedy_3gram_verdict = "PASS" if greedy_3gram is not None and greedy_3gram < rep_threshold else "FAIL"
    greedy_4gram_verdict = "PASS" if greedy_4gram is not None and greedy_4gram < 0.05 else "FAIL"
    eos_verdict = "PASS" if eos_rate is not None and eos_rate >= eos_threshold else "FAIL"
    lines.append(f"| Greedy 3-gram 반복률 | {_BASE_GEN_REFERENCE['greedy_3gram_rep']:.2%} | "
                 f"{_fmt_pct(greedy_3gram)} | < {rep_threshold:.0%} | {greedy_3gram_verdict} |")
    lines.append(f"| Greedy 4-gram 반복률 | {_BASE_GEN_REFERENCE['greedy_4gram_rep']:.2%} | "
                 f"{_fmt_pct(greedy_4gram)} | < 5% | {greedy_4gram_verdict} |")
    lines.append(f"| EOS 종료율 | {_BASE_GEN_REFERENCE['greedy_eos_rate']:.0%} | "
                 f"{_fmt_pct(eos_rate)} | > {eos_threshold:.0%} | {eos_verdict} |")

    sampled_3gram = sft_summary.get("sampled_avg_3gram_rep")
    sampled_eos = sft_summary.get("sampled_eos_rate")
    if sampled_3gram is not None:
        lines.append(f"| Sampled 3-gram 반복률 | — | {sampled_3gram:.2%} | — | — |")
    if sampled_eos is not None:
        lines.append(f"| Sampled EOS 종료율 | — | {sampled_eos:.2%} | — | — |")
    lines.append("")

    # Chat template status (reported as enabled whenever a summary exists)
    chat_status = "활성화" if sft_summary else "비활성화"
    lines.append(f"**Chat Template**: {chat_status}\n")

    # Generation samples
    if sft_gen.get("samples"):
        lines.append("### 생성 샘플 (Greedy, Chat Template)\n")
        greedy_samples = [s for s in sft_gen["samples"] if s.get("temperature") == 0.0]
        for i, s in enumerate(greedy_samples[:5], 1):
            prompt = s.get("prompt", "")
            text = s.get("text", "")[:400]
            hit_eos = s.get("hit_eos", False)
            rep3 = s.get("3gram_rep", 0)
            lines.append(f"**[{i}]** `{prompt}`")
            lines.append(f"> {text}")
            lines.append(f"> *EOS={hit_eos}, 3gram_rep={rep3:.2%}, tokens={s.get('generated_tokens', 0)}*\n")

    # Repetition grid
    sft_rep = sft_p1.get("repetition", {})
    if sft_rep.get("grid_results"):
        lines.append("### Repetition 파라미터 검색 결과\n")
        lines.append("| 설정 | 3-gram | EOS Rate | Avg Tokens |")
        lines.append("|------|--------|----------|-----------|")
        grid = sft_rep["grid_results"]
        items = grid if isinstance(grid, list) else list(grid.values())
        for r in items[:6]:
            if isinstance(r, dict):
                lines.append(
                    f"| {r.get('params', '?')} | "
                    f"{_fmt_f(r.get('avg_3gram_rep'))} | "
                    f"{_fmt_f(r.get('eos_rate'))} | "
                    f"{_fmt_f(r.get('avg_tokens'), 1)} |"
                )
        lines.append("")

    # === 4. Korean Benchmarks ===
    lines.append("## 4. 한국어 벤치마크\n")
    lines.append("### KoBEST (0-shot)\n")
    lines.append("| 태스크 | Base | SFT | 변화 | 목표 | 판정 |")
    lines.append("|--------|------|-----|------|------|------|")

    kobest_tasks = ["kobest_boolq", "kobest_copa", "kobest_hellaswag",
                    "kobest_sentineg", "kobest_wic"]
    kobest_targets = {"kobest_boolq": 0.60, "kobest_copa": 0.65,
                      "kobest_hellaswag": 0.30, "kobest_sentineg": 0.60,
                      "kobest_wic": 0.55}
    sft_kobest_accs = []
    base_kobest_accs = []

    for t in kobest_tasks:
        base_a = _get_acc(base_zero.get(t, {})) if t in base_zero else _BASE_BENCH_REFERENCE.get(t)
        sft_a = _get_acc(sft_zero.get(t, {})) if t in sft_zero else None
        target = kobest_targets.get(t, 0.50)

        if sft_a is not None:
            sft_kobest_accs.append(sft_a)
        if base_a is not None:
            base_kobest_accs.append(base_a)

        diff = ""
        verdict = "—"
        if sft_a is not None and base_a is not None:
            d = (sft_a - base_a) * 100
            diff = f"{'+' if d >= 0 else ''}{d:.1f}pp"
            verdict = "PASS" if sft_a >= target else "FAIL"

        lines.append(f"| {t} | {_fmt_pct(base_a)} | {_fmt_pct(sft_a)} | {diff} | "
                     f"≥{target*100:.0f}% | {verdict} |")

    if sft_kobest_accs:
        sft_avg = sum(sft_kobest_accs) / len(sft_kobest_accs)
        base_avg = sum(base_kobest_accs) / len(base_kobest_accs) if base_kobest_accs else _BASE_BENCH_REFERENCE.get("kobest_avg", 0.4369)
        diff_avg = (sft_avg - base_avg) * 100
        lines.append(f"| **평균** | **{base_avg*100:.2f}%** | **{sft_avg*100:.2f}%** | "
                     f"**{'+' if diff_avg >= 0 else ''}{diff_avg:.1f}pp** | "
                     f"**≥{_SFT_TARGETS['kobest_avg_min']*100:.0f}%** | **{'PASS' if sft_avg >= _SFT_TARGETS['kobest_avg_min'] else 'FAIL'}** |")
    lines.append("")

    # HAE-RAE
    lines.append("### HAE-RAE (0-shot)\n")
    base_haerae = _get_acc(base_zero.get("haerae", {})) if "haerae" in base_zero else _BASE_BENCH_REFERENCE.get("haerae")
    sft_haerae = _get_acc(sft_zero.get("haerae", {})) if "haerae" in sft_zero else None
    if sft_haerae is not None:
        diff_h = (sft_haerae - (base_haerae or 0)) * 100 if base_haerae else 0
        lines.append(f"- Base: {_fmt_pct(base_haerae)} → SFT: {_fmt_pct(sft_haerae)} "
                     f"({'+' if diff_h >= 0 else ''}{diff_h:.1f}pp) | "
                     f"목표 ≥{_SFT_TARGETS['haerae_min']*100:.0f}% | {'PASS' if sft_haerae >= _SFT_TARGETS['haerae_min'] else 'FAIL'}")
    else:
        lines.append(f"- Base: {_fmt_pct(base_haerae)} → SFT: N/A")
    lines.append("")

    # MMLU-KO
    lines.append("### MMLU-KO (0-shot)\n")
    base_mmlu_ko = _get_acc(base_zero.get("global_mmlu_ko", {})) if "global_mmlu_ko" in base_zero else _BASE_BENCH_REFERENCE.get("global_mmlu_ko")
    sft_mmlu_ko = _get_acc(sft_zero.get("global_mmlu_ko", {})) if "global_mmlu_ko" in sft_zero else None
    if sft_mmlu_ko is not None:
        diff_mk = (sft_mmlu_ko - (base_mmlu_ko or 0)) * 100 if base_mmlu_ko else 0
        lines.append(f"- Base: {_fmt_pct(base_mmlu_ko)} → SFT: {_fmt_pct(sft_mmlu_ko)} "
                     f"({'+' if diff_mk >= 0 else ''}{diff_mk:.1f}pp) | "
                     f"목표 ≥{_SFT_TARGETS['mmlu_ko_min']*100:.0f}% | {'PASS' if sft_mmlu_ko >= _SFT_TARGETS['mmlu_ko_min'] else 'FAIL'}")
    else:
        lines.append(f"- Base: {_fmt_pct(base_mmlu_ko)} → SFT: N/A")
    lines.append("")

    # 5-shot comparison
    if sft_five:
        lines.append("### 5-shot 비교 (한국어)\n")
        lines.append("| 태스크 | 0-shot | 5-shot | 변화 |")
        lines.append("|--------|--------|--------|------|")
        for t in kobest_tasks + ["haerae", "global_mmlu_ko"]:
            a0 = _get_acc(sft_zero.get(t, {})) if t in sft_zero else None
            a5 = _get_acc(sft_five.get(t, {})) if t in sft_five else None
            if a0 is not None and a5 is not None:
                d = (a5 - a0) * 100
                lines.append(f"| {t} | {a0*100:.2f}% | {a5*100:.2f}% | {'+' if d >= 0 else ''}{d:.1f}pp |")
        lines.append("")

    # === 5. English Benchmarks ===
    lines.append("## 5. 영어 벤치마크 (유지 확인)\n")
    lines.append("| 태스크 | Base | SFT | 변화 | 하한 | 판정 |")
    lines.append("|--------|------|-----|------|------|------|")

    en_tasks = {
        "hellaswag": _SFT_TARGETS["hellaswag_min"],
        "arc_easy": _SFT_TARGETS["arc_easy_min"],
        "arc_challenge": _SFT_TARGETS["arc_challenge_min"],
        "winogrande": _SFT_TARGETS["winogrande_min"],
        "piqa": _SFT_TARGETS["piqa_min"],
    }
    for t, threshold in en_tasks.items():
        base_a = _get_acc(base_zero.get(t, {}), prefer_norm=(t in ["hellaswag", "arc_challenge"])) \
                 if t in base_zero else _BASE_BENCH_REFERENCE.get(t)
        sft_a = _get_acc(sft_zero.get(t, {}), prefer_norm=(t in ["hellaswag", "arc_challenge"])) \
                if t in sft_zero else None
        diff = ""
        verdict = "—"
        if sft_a is not None and base_a is not None:
            d = (sft_a - base_a) * 100
            diff = f"{'+' if d >= 0 else ''}{d:.1f}pp"
            verdict = "PASS" if sft_a >= threshold else "FAIL"
        lines.append(f"| {t} | {_fmt_pct(base_a)} | {_fmt_pct(sft_a)} | {diff} | "
                     f"≥{threshold*100:.0f}% | {verdict} |")

    # MMLU-EN
    # Ordered tuple (not a set) so the fallback average is summed in a
    # deterministic order.
    _MMLU_EN_GROUPS = ("mmlu", "mmlu_humanities", "mmlu_social_sciences", "mmlu_stem", "mmlu_other")

    def _mmlu_en_accs(results: dict) -> list:
        """Collect per-subject MMLU-EN accuracies; fall back to the group rows."""
        accs = []
        for t, m in results.items():
            if t.startswith("mmlu_") and t not in _MMLU_EN_GROUPS:
                a = _get_acc(m)
                if a is not None:
                    accs.append(a)
        if not accs:
            for t in _MMLU_EN_GROUPS:
                if t in results:
                    a = _get_acc(results[t])
                    if a is not None:
                        accs.append(a)
        return accs

    sft_mmlu_en = _mmlu_en_accs(sft_zero)
    base_mmlu_en = _mmlu_en_accs(base_zero)

    sft_mmlu_en_avg = sum(sft_mmlu_en) / len(sft_mmlu_en) if sft_mmlu_en else None
    base_mmlu_en_avg = sum(base_mmlu_en) / len(base_mmlu_en) if base_mmlu_en else 0.2581
    if sft_mmlu_en_avg is not None:
        d = (sft_mmlu_en_avg - base_mmlu_en_avg) * 100
        lines.append(f"| MMLU-EN 평균 | {base_mmlu_en_avg*100:.2f}% | {sft_mmlu_en_avg*100:.2f}% | "
                     f"{'+' if d >= 0 else ''}{d:.1f}pp | ≥{_SFT_TARGETS['mmlu_en_avg_min']*100:.0f}% | "
                     f"{'PASS' if sft_mmlu_en_avg >= _SFT_TARGETS['mmlu_en_avg_min'] else 'FAIL'} |")
    lines.append("")

    # === 6. Calibration ===
    lines.append("## 6. Calibration 비교\n")
    sft_cal = sft_p1.get("calibration", {})
    lines.append("| 지표 | Base | SFT | 목표 | 판정 |")
    lines.append("|------|------|-----|------|------|")

    # (result key, row label, threshold, higher-is-better)
    cal_checks = [
        ("top1_accuracy", "Top-1 Accuracy", _SFT_TARGETS["top1_accuracy_min"], True),
        ("top5_accuracy", "Top-5 Accuracy", 0.78, True),
        ("top10_accuracy", "Top-10 Accuracy", 0.82, True),
        ("mean_entropy", "Mean Entropy", 2.0, False),
    ]
    for key, label, threshold, is_higher_better in cal_checks:
        base_v = _BASE_CALIB_REFERENCE.get(key)
        sft_v = sft_cal.get(key)
        verdict = "—"
        if sft_v is not None:
            if is_higher_better:
                verdict = "PASS" if sft_v >= threshold else "FAIL"
            else:
                verdict = "PASS" if sft_v <= threshold else "FAIL"
        lines.append(f"| {label} | {_fmt_f(base_v)} | {_fmt_f(sft_v)} | "
                     f"{'≥' if is_higher_better else '<'}{threshold} | {verdict} |")

    # Token NLL
    sft_nll = sft_p1.get("token_nll", {})
    nll_mean = sft_nll.get("nll_mean", sft_nll.get("mean"))
    base_nll_mean = _BASE_NLL_REFERENCE.get("nll_mean")
    if nll_mean is not None:
        lines.append(f"| Token NLL mean | {_fmt_f(base_nll_mean)} | {_fmt_f(nll_mean)} | "
                     f"< 2.0 | {'PASS' if nll_mean < 2.0 else 'FAIL'} |")
    hlf5 = sft_nll.get("high_loss_fractions", {}).get("5", sft_nll.get("high_loss_fraction_5"))
    base_hlf5 = _BASE_NLL_REFERENCE.get("high_loss_fraction_5")
    if hlf5 is not None:
        lines.append(f"| NLL > 5 비율 | {_fmt_f(base_hlf5)} | {_fmt_f(hlf5)} | "
                     f"< 0.15 | {'PASS' if hlf5 < 0.15 else 'FAIL'} |")
    lines.append("")

    # === 7. Final Verdict ===
    lines.append("## 7. 종합 판정 및 다음 단계\n")

    lines.append("### 핵심 판정 기준\n")
    lines.append("| 조건 | 현재 값 | 기준 | 충족 |")
    lines.append("|------|---------|------|------|")

    forgetting_text = f"{max_forgetting:.1f}%" if max_forgetting is not None else "N/A"
    lines.append(f"| Greedy 3-gram 반복률 | {_fmt_pct(rep_rate)} | < {rep_threshold:.0%} | "
                 f"{'YES' if rep_rate is not None and rep_rate < rep_threshold else 'NO'} |")
    lines.append(f"| KoBEST 평균 | {_fmt_pct(kobest_avg)} | > {_SFT_TARGETS['kobest_avg_min']*100:.0f}% | "
                 f"{'YES' if kobest_avg is not None and kobest_avg > _SFT_TARGETS['kobest_avg_min'] else 'NO'} |")
    lines.append(f"| 최대 Forgetting | {forgetting_text} | "
                 f"< {forgetting_limit}% | {'YES' if max_forgetting is not None and max_forgetting < forgetting_limit else 'NO'} |")
    lines.append("")

    # Final recommendation — use ORPO quantitative score for decision
    lines.append("### 권고\n")
    orpo_score = orpo_result["total_score"]
    orpo_decision = orpo_result["decision"]

    all_core_pass = (
        rep_rate is not None and rep_rate < rep_threshold
        and kobest_avg is not None and kobest_avg > _SFT_TARGETS["kobest_avg_min"]
        and max_forgetting is not None and max_forgetting < forgetting_limit
    )

    if all_core_pass:
        lines.append("**모든 핵심 조건 충족 → Phase 4: GGUF 변환 + Ollama 배포 진행**\n")
    elif orpo_decision == "ORPO":
        lines.append(f"**ORPO 판정 스코어 {orpo_score:.1f}/100 → Phase 3: ORPO 학습 진행** (795K preference pairs 활용)\n")
        lines.append("ORPO 학습 시 주안점:")
        # Report the values measured in this run, not fixed numbers.
        lines.append(f"- Greedy 반복률 감소 (현재 {_fmt_pct(rep_rate)} → 목표 <{rep_threshold:.0%})")
        lines.append(f"- EOS 종료율 개선 (현재 {_fmt_pct(eos_rate)} → 목표 >{eos_threshold:.0%})")
        lines.append("- 벤치마크 점수 유지/향상")
        lines.append(f"- 지식 보존 유지 (현재 forgetting {forgetting_text})")
    elif orpo_decision in ("DEPLOY", "SKIP_ORPO"):
        # _compute_orpo_score returns "DEPLOY" for a score >= 80; "SKIP_ORPO"
        # is still accepted in case an older scorer emits it.
        lines.append("**ORPO 불필요 → Phase 4: GGUF 변환 + Ollama 배포 진행**\n")
    else:
        lines.append("**핵심 조건 미달 → SFT 재시도**\n")
        lines.append("재시도 시 검토 사항:")
        lines.append("- 학습률 조정")
        lines.append("- 데이터 구성 재검토")
        lines.append("- 에폭 수 조정")
    lines.append("")

    lines.append("---\n")
    lines.append("*이 보고서는 `eval/sft_eval_pipeline.py`에 의해 자동 생성되었습니다.*")

    report_text = "\n".join(lines)
    output_path.write_text(report_text, encoding="utf-8")

    # Also save to sft_output_dir if provided
    if sft_output_dir:
        sft_dir = Path(sft_output_dir)
        # Create the directory first so the copy does not fail when it is new.
        sft_dir.mkdir(parents=True, exist_ok=True)
        (sft_dir / "sft_comparison_report.md").write_text(report_text, encoding="utf-8")

    return output_path


def _compute_verdicts(sft_p1, sft_zero, base_p1, base_zero):
    """Build the pass/fail verdicts for the six evaluation dimensions.

    Returns a list of ``(dimension label, passed, detail text)`` tuples in
    dimension order. A dimension whose metric is unavailable is reported as
    failed with the detail "데이터 없음". ``base_zero`` is accepted but not read.
    """
    no_data = "데이터 없음"
    results = []

    # --- Dimension 1: perplexity forgetting ---
    label = "차원 1: Perplexity (지식 보존)"
    worst_forgetting = _get_max_forgetting(sft_p1, base_p1)
    if worst_forgetting is None:
        results.append((label, False, no_data))
    else:
        results.append((
            label,
            worst_forgetting < _SFT_TARGETS["ppl_forgetting_max_pct"],
            f"최대 forgetting {worst_forgetting:.1f}% (임계값 {_SFT_TARGETS['ppl_forgetting_max_pct']}%)",
        ))

    # --- Dimension 2: generation quality (repetition and EOS together) ---
    label = "차원 2: 생성 품질"
    repetition = _get_greedy_3gram_rep(sft_p1)
    eos = sft_p1.get("generation", {}).get("summary", {}).get("greedy_eos_rate")
    if repetition is None or eos is None:
        results.append((label, False, no_data))
    else:
        rep_limit = _SFT_TARGETS["greedy_3gram_rep_max"]
        eos_floor = _SFT_TARGETS["eos_rate_min"]
        passed = repetition < rep_limit and eos > eos_floor
        results.append((
            label,
            passed,
            f"반복률 {repetition:.2%} (목표 <{rep_limit:.0%}), EOS {eos:.0%} (목표 >{eos_floor:.0%})",
        ))

    # --- Dimension 3: Korean benchmarks (KoBEST average) ---
    label = "차원 3: 한국어 벤치마크"
    kobest = _get_kobest_avg(sft_zero)
    if kobest is None:
        results.append((label, False, no_data))
    else:
        kobest_floor = _SFT_TARGETS["kobest_avg_min"]
        results.append((
            label,
            kobest > kobest_floor,
            f"KoBEST 평균 {kobest*100:.2f}% (목표 >{kobest_floor*100:.0f}%)",
        ))

    # --- Dimension 4: English benchmarks (every measured task must clear its floor) ---
    label = "차원 4: 영어 벤치마크"
    floors = {
        "hellaswag": _SFT_TARGETS["hellaswag_min"],
        "arc_easy": _SFT_TARGETS["arc_easy_min"],
        "arc_challenge": _SFT_TARGETS["arc_challenge_min"],
        "winogrande": _SFT_TARGETS["winogrande_min"],
        "piqa": _SFT_TARGETS["piqa_min"],
    }
    measured = []
    for task, floor in floors.items():
        acc = _get_acc(sft_zero.get(task, {})) if task in sft_zero else None
        if acc is not None:
            measured.append((task, acc, floor))
    if not measured:
        results.append((label, False, no_data))
    else:
        english_ok = not any(acc < floor for _, acc, floor in measured)
        shown = [f"{task}={acc*100:.1f}%" for task, acc, _ in measured]
        # Only the first three tasks are listed; an ellipsis marks the rest.
        suffix = "..." if len(shown) > 3 else ""
        results.append((label, english_ok, ", ".join(shown[:3]) + suffix))

    # --- Dimension 5: calibration (top-1 accuracy) ---
    label = "차원 5: Calibration"
    top1_acc = sft_p1.get("calibration", {}).get("top1_accuracy")
    if top1_acc is None:
        results.append((label, False, no_data))
    else:
        top1_floor = _SFT_TARGETS["top1_accuracy_min"]
        results.append((
            label,
            top1_acc >= top1_floor,
            f"Top-1 {top1_acc*100:.2f}% (목표 ≥{top1_floor*100:.0f}%)",
        ))

    # --- Dimension 6: SFT chat ability, judged by a relaxed EOS threshold ---
    label = "차원 6: SFT Chat 능력"
    if eos is None:
        results.append((label, False, no_data))
    else:
        results.append((
            label,
            eos > 0.50,
            f"EOS 종료율 {eos:.0%}, 생성 샘플 수동 검토 필요",
        ))

    return results


def _get_greedy_3gram_rep(p1: dict) -> Optional[float]:
    """Return ``generation.summary.greedy_avg_3gram_rep`` from phase-1 results.

    Yields None when the "generation" section, its "summary", or the metric
    itself is absent.
    """
    summary = p1.get("generation", {}).get("summary", {})
    return summary.get("greedy_avg_3gram_rep")


def _get_kobest_avg(zero_shot: dict) -> Optional[float]:
    """Average the accuracies of the KoBEST tasks present in ``zero_shot``.

    Tasks that are missing, or for which ``_get_acc`` yields None, are left
    out of the mean. Returns None when no task contributes a value.
    """
    total = 0
    counted = 0
    for task in ("kobest_boolq", "kobest_copa", "kobest_hellaswag",
                 "kobest_sentineg", "kobest_wic"):
        if task not in zero_shot:
            continue
        acc = _get_acc(zero_shot[task])
        if acc is None:
            continue
        total += acc
        counted += 1
    if not counted:
        return None
    return total / counted


def _get_max_forgetting(sft_p1: dict, base_p1: dict) -> Optional[float]:
    sft_ppl = sft_p1.get("perplexity", {})
    base_ppl = base_p1.get("perplexity", {})
    forgetting_values = []
    for name in sft_ppl:
        sft_val = sft_ppl[name].get("ppl") if isinstance(sft_ppl[name], dict) else None
        base_val = base_ppl.get(name, {}).get("ppl") if isinstance(base_ppl.get(name), dict) else None
        if base_val is None:
            base_val = _BASE_PPL_REFERENCE.get(name)
        if sft_val is not None and base_val is not None and base_val > 0:
            forgetting_values.append((sft_val - base_val) / base_val * 100)
    return max(forgetting_values) if forgetting_values else None


# =========================================================================
# ORPO-specific verdict helpers
# =========================================================================

def _compute_orpo_verdicts(
    orpo_p1: dict,
    orpo_zero: dict,
    sft_p1: dict,
    sft_zero: dict,
    training_curve: Optional[dict] = None,
) -> List[Tuple[str, bool, str]]:
    """Evaluate the four ORPO-only pass/fail dimensions.

    The dimensions, in output order, are:

    1. Preference accuracy of the last eval step (pass when > 0.65).
    2. Reward margin of the last eval step (pass when > 0.1).
    3. Parameter sensitivity: 3-gram repetition of the first grid entry whose
       repetition penalty is 1.0 (pass when < 0.05).
    4. SFT→ORPO improvement: greedy repetition went down AND greedy EOS rate
       went up.

    A dimension whose inputs are missing is reported as a failure with the
    "no data" detail string.

    Args:
        orpo_p1: Normalized phase-1 results of the ORPO model.
        orpo_zero: Zero-shot benchmark results of the ORPO model (accepted
            for signature symmetry; not read by this function).
        sft_p1: Normalized phase-1 results of the SFT model.
        sft_zero: Zero-shot benchmark results of the SFT model (not read by
            this function).
        training_curve: Optional dict whose ``"eval_steps"`` list holds
            per-step metric dicts.

    Returns:
        List of ``(dimension_name, passed, detail)`` tuples.
    """
    no_data = "데이터 없음"
    verdicts: List[Tuple[str, bool, str]] = []

    # Dimensions 1 and 2 both read the final entry of the eval curve; an
    # empty dict stands in when there is no curve so the lookups yield None.
    eval_steps = training_curve.get("eval_steps") if training_curve else None
    final_step = eval_steps[-1] if eval_steps else {}

    # ORPO Dim 1: Preference Accuracy (final > 0.65)
    pref_acc = final_step.get("rewards_accuracies", final_step.get("preference_accuracy"))
    if pref_acc is None:
        verdicts.append(("ORPO-1: Preference Accuracy", False, no_data))
    else:
        verdicts.append((
            "ORPO-1: Preference Accuracy",
            pref_acc > 0.65,
            f"최종 {pref_acc:.2%} (목표 > 65%)",
        ))

    # ORPO Dim 2: Reward Margins (final > 0.1)
    reward_margin = final_step.get("rewards_margins", final_step.get("reward_margins"))
    if reward_margin is None:
        verdicts.append(("ORPO-2: Reward Margins", False, no_data))
    else:
        verdicts.append((
            "ORPO-2: Reward Margins",
            reward_margin > 0.1,
            f"최종 {reward_margin:.4f} (목표 > 0.1)",
        ))

    # ORPO Dim 3: Parameter Sensitivity (3-gram rep < 5% with rep_penalty=1.0)
    sens_pass, sens_detail = False, no_data
    grid = orpo_p1.get("repetition", {}).get("grid_results")
    if grid:
        entries = grid if isinstance(grid, list) else list(grid.values())
        for entry in entries:
            if not isinstance(entry, dict):
                continue
            penalty = entry.get("repetition_penalty", entry.get("rep_penalty"))
            is_unpenalized = penalty is not None and abs(float(penalty) - 1.0) < 1e-6
            if not is_unpenalized:
                continue
            rate = entry.get("avg_3gram_rep", entry.get("3gram_repetition"))
            if rate is not None:
                sens_pass = rate < 0.05
                sens_detail = f"rep_penalty=1.0 시 3-gram rep={rate:.2%} (목표 < 5%)"
            # Only the first penalty-1.0 entry is consulted, value or not.
            break
    verdicts.append(("ORPO-3: Parameter Sensitivity", sens_pass, sens_detail))

    # ORPO Dim 4: SFT→ORPO Improvement (rep decreased AND EOS increased)
    sft_rep = _get_greedy_3gram_rep(sft_p1)
    orpo_rep = _get_greedy_3gram_rep(orpo_p1)
    sft_eos = sft_p1.get("generation", {}).get("summary", {}).get("greedy_eos_rate")
    orpo_eos = orpo_p1.get("generation", {}).get("summary", {}).get("greedy_eos_rate")

    if any(value is None for value in (sft_rep, orpo_rep, sft_eos, orpo_eos)):
        verdicts.append(("ORPO-4: SFT→ORPO 개선", False, no_data))
    else:
        rep_improved = orpo_rep < sft_rep
        eos_improved = orpo_eos > sft_eos
        rep_arrow = "↓" if rep_improved else "↑"
        eos_arrow = "↑" if eos_improved else "↓"
        verdicts.append((
            "ORPO-4: SFT→ORPO 개선",
            rep_improved and eos_improved,
            f"반복률 {sft_rep:.2%}→{orpo_rep:.2%} ({rep_arrow}), "
            f"EOS {sft_eos:.0%}→{orpo_eos:.0%} ({eos_arrow})",
        ))

    return verdicts


# =========================================================================
# Base vs SFT vs ORPO 3-way Comparison Report
# =========================================================================

def generate_three_way_report(
    base_results_dir: Path,
    sft_results_dir: Path,
    orpo_phase1_results: dict,
    orpo_phase2_results: dict,
    output_path: Path,
    orpo_output_dir: Optional[Path] = None,
    training_curve: Optional[dict] = None,
    total_elapsed_sec: float = 0.0,
) -> Path:
    """Generate a comprehensive Base vs SFT vs ORPO 3-way comparison report.

    Base and SFT results are read from ``phase1_results.json`` /
    ``phase2_results.json`` in their directories (a missing file is treated
    as empty results); ORPO results are passed in directly. The Markdown
    report is written to *output_path* and, when *orpo_output_dir* is given,
    also to ``<orpo_output_dir>/orpo_three_way_report.md``.

    Args:
        base_results_dir: Directory containing Base model's phase1/phase2_results.json
        sft_results_dir: Directory containing SFT model's phase1/phase2_results.json
        orpo_phase1_results: ORPO Phase 1 results dict
        orpo_phase2_results: ORPO Phase 2 results dict
        output_path: Where to write the markdown report
        orpo_output_dir: ORPO eval outputs directory (for linking)
        training_curve: Dict with "eval_steps" list of per-step metrics
        total_elapsed_sec: Total pipeline elapsed time

    Returns:
        Path to the generated report

    Raises:
        json.JSONDecodeError: If an existing results file is not valid JSON.
    """
    base_results_dir = Path(base_results_dir)
    sft_results_dir = Path(sft_results_dir)
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # ------------------------------------------------------------------
    # Local helpers (nested so this function adds no module-level names)
    # ------------------------------------------------------------------
    def _load_json_if_present(path: Path) -> dict:
        """Parse *path* as JSON, or return ``{}`` when the file is absent."""
        if not path.exists():
            return {}
        with open(path, encoding="utf-8") as f:
            return json.load(f)

    def _pp_delta(new, old) -> str:
        """Format ``new - old`` as signed percentage points ("" if unknown)."""
        if new is None or old is None:
            return ""
        # "+" sign flag matches the PPL forgetting columns and never yields
        # a doubled sign such as "+-0.0pp".
        return f"{(new - old) * 100:+.1f}pp"

    def _mean(values: List[float]) -> Optional[float]:
        """Arithmetic mean of *values*, or ``None`` for an empty list."""
        return sum(values) / len(values) if values else None

    def _task_acc(zero: dict, task: str, fallback=None, **acc_kwargs):
        """Accuracy of *task* via ``_get_acc``, or *fallback* if not evaluated."""
        return _get_acc(zero[task], **acc_kwargs) if task in zero else fallback

    def _ppl_of(table: dict, name: str):
        """``ppl`` value of dataset *name*, or ``None`` without a dict entry."""
        entry = table.get(name)
        return entry.get("ppl") if isinstance(entry, dict) else None

    def _forgetting_pct(value, base_value) -> str:
        """Relative PPL change versus base as a signed percent, or an em dash."""
        if value is None or base_value is None or not base_value > 0:
            return "—"
        return f"{(value - base_value) / base_value * 100:+.1f}%"

    def _train_loss(step: dict):
        return step.get("train_loss", step.get("loss"))

    def _pref_accuracy(step: dict):
        return step.get("rewards_accuracies", step.get("preference_accuracy"))

    def _reward_margin(step: dict):
        return step.get("rewards_margins", step.get("reward_margins"))

    def _baseline_rep_rate(entries: list):
        """3-gram rep of the first grid entry with repetition penalty 1.0."""
        for entry in entries:
            if not isinstance(entry, dict):
                continue
            penalty = entry.get("repetition_penalty", entry.get("rep_penalty"))
            if penalty is not None and abs(float(penalty) - 1.0) < 1e-6:
                return entry.get("avg_3gram_rep", entry.get("3gram_repetition"))
        return None

    # --- Load and normalize all three models ---
    base_p1_raw = _load_json_if_present(base_results_dir / "phase1_results.json")
    base_p2_raw = _load_json_if_present(base_results_dir / "phase2_results.json")
    sft_p1_raw = _load_json_if_present(sft_results_dir / "phase1_results.json")
    sft_p2_raw = _load_json_if_present(sft_results_dir / "phase2_results.json")

    # Only the zero-shot half of phase 2 is used by this report.
    base_p1 = _normalize_phase1_results(base_p1_raw)
    base_zero, _ = _normalize_phase2_results(base_p2_raw)
    sft_p1 = _normalize_phase1_results(sft_p1_raw)
    sft_zero, _ = _normalize_phase2_results(sft_p2_raw)
    orpo_p1 = _normalize_phase1_results(orpo_phase1_results)
    orpo_zero, _ = _normalize_phase2_results(orpo_phase2_results)

    base_gen_summary = base_p1.get("generation", {}).get("summary", {})
    sft_gen_summary = sft_p1.get("generation", {}).get("summary", {})
    orpo_gen_summary = orpo_p1.get("generation", {}).get("summary", {})

    eval_steps = training_curve.get("eval_steps") if training_curve else None
    final_step = eval_steps[-1] if eval_steps else None

    rep_grid = orpo_p1.get("repetition", {}).get("grid_results")
    if rep_grid:
        grid_entries = rep_grid if isinstance(rep_grid, list) else list(rep_grid.values())
    else:
        grid_entries = []

    eval_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    lines: List[str] = []

    # =====================================================================
    # Header
    # =====================================================================
    lines.append("# FRANKENSTALLM 3B ORPO 모델 종합 평가 보고서\n")
    lines.append(f"- **평가 일시**: {eval_datetime}")
    lines.append("- **비교 대상**: Base → SFT → ORPO")
    lines.append(f"- **총 소요 시간**: {_fmt_seconds(total_elapsed_sec)}")
    if orpo_output_dir:
        lines.append(f"- **결과 디렉토리**: {orpo_output_dir}")
    lines.append("")

    # =====================================================================
    # 1. Executive Summary
    # =====================================================================
    lines.append("## 1. Executive Summary\n")

    # 6 standard verdicts (ORPO measured against Base) + 4 ORPO-specific ones
    std_verdicts = _compute_verdicts(orpo_p1, orpo_zero, base_p1, base_zero)
    orpo_verdicts = _compute_orpo_verdicts(orpo_p1, orpo_zero, sft_p1, sft_zero, training_curve)
    all_verdicts = std_verdicts + orpo_verdicts

    lines.append("| # | 평가 차원 | 결과 | 상세 |")
    lines.append("|---|----------|------|------|")
    for i, (dim_name, verdict, detail) in enumerate(all_verdicts, 1):
        icon = "PASS" if verdict else "FAIL"
        lines.append(f"| {i} | {dim_name} | **{icon}** | {detail} |")
    lines.append("")

    pass_count = sum(1 for _, v, _ in all_verdicts if v)
    total_dims = len(all_verdicts)
    lines.append(f"**종합**: {pass_count}/{total_dims} 차원 통과\n")

    # Quantitative score
    orpo_score_result = _compute_orpo_score(orpo_p1, orpo_zero, base_p1, base_zero)
    lines.append(f"**정량 스코어**: {orpo_score_result['total_score']}/100\n")

    # Final decision. Each criterion is evaluated once here and reused by the
    # table in section 11, so the two places cannot drift apart.
    orpo_rep = _get_greedy_3gram_rep(orpo_p1)
    orpo_eos = orpo_gen_summary.get("greedy_eos_rate")
    orpo_forgetting = _get_max_forgetting(orpo_p1, base_p1)
    orpo_kobest = _get_kobest_avg(orpo_zero)

    rep_ok = orpo_rep is not None and orpo_rep < 0.05
    eos_ok = orpo_eos is not None and orpo_eos > 0.90
    forgetting_ok = orpo_forgetting is not None and orpo_forgetting < 5.0
    kobest_ok = orpo_kobest is not None and orpo_kobest >= 0.43
    deploy_criteria_met = rep_ok and eos_ok and forgetting_ok and kobest_ok

    final_decision = "DEPLOY" if deploy_criteria_met else "RETRY"
    lines.append(f"**최종 판정**: **{final_decision}**\n")
    lines.append("")

    # =====================================================================
    # 2. Training curve analysis
    # =====================================================================
    lines.append("## 2. 학습 곡선 분석\n")
    if eval_steps:
        lines.append("### Training / Eval Loss\n")
        lines.append("| Step | Train Loss | Eval Loss | Pref Accuracy | Reward Margin |")
        lines.append("|------|-----------|-----------|---------------|---------------|")
        for step_data in eval_steps:
            step = step_data.get("step", "?")
            train_loss = _fmt_f(_train_loss(step_data), 4)
            eval_loss = _fmt_f(step_data.get("eval_loss"), 4)
            pref_acc = _fmt_f(_pref_accuracy(step_data), 4)
            reward_m = _fmt_f(_reward_margin(step_data), 4)
            lines.append(f"| {step} | {train_loss} | {eval_loss} | {pref_acc} | {reward_m} |")
        lines.append("")

        # Summary stats: first vs. last recorded step
        first_step = eval_steps[0]
        lines.append("### 학습 곡선 요약\n")
        first_loss = _train_loss(first_step)
        last_loss = _train_loss(final_step)
        if first_loss is not None and last_loss is not None:
            lines.append(f"- **Train Loss**: {first_loss:.4f} → {last_loss:.4f}")
        first_eval = first_step.get("eval_loss")
        last_eval = final_step.get("eval_loss")
        if first_eval is not None and last_eval is not None:
            lines.append(f"- **Eval Loss**: {first_eval:.4f} → {last_eval:.4f}")
        last_pref = _pref_accuracy(final_step)
        if last_pref is not None:
            lines.append(f"- **최종 Preference Accuracy**: {last_pref:.2%}")
        last_margin = _reward_margin(final_step)
        if last_margin is not None:
            lines.append(f"- **최종 Reward Margin**: {last_margin:.4f}")
        lines.append("")
    else:
        lines.append("학습 곡선 데이터 없음\n")

    # =====================================================================
    # 3. Perplexity comparison (knowledge retention)
    # =====================================================================
    lines.append("## 3. Perplexity 비교 (지식 보존)\n")
    lines.append("| 데이터셋 | Base PPL | SFT PPL | ORPO PPL | SFT Forgetting | ORPO Forgetting |")
    lines.append("|---------|---------|---------|---------|----------------|-----------------|")

    base_ppl = base_p1.get("perplexity", {})
    sft_ppl = sft_p1.get("perplexity", {})
    orpo_ppl = orpo_p1.get("perplexity", {})

    for name in sorted(set(base_ppl) | set(sft_ppl) | set(orpo_ppl)):
        base_val = _ppl_of(base_ppl, name)
        if base_val is None:
            base_val = _BASE_PPL_REFERENCE.get(name)
        sft_val = _ppl_of(sft_ppl, name)
        orpo_val = _ppl_of(orpo_ppl, name)

        lines.append(
            f"| {name} | {_fmt_f(base_val)} | {_fmt_f(sft_val)} | {_fmt_f(orpo_val)} | "
            f"{_forgetting_pct(sft_val, base_val)} | {_forgetting_pct(orpo_val, base_val)} |"
        )
    lines.append("")

    # =====================================================================
    # 4. Generation quality comparison
    # =====================================================================
    lines.append("## 4. 생성 품질 비교\n")
    lines.append("| 지표 | Base | SFT | ORPO | SFT→ORPO 변화 |")
    lines.append("|------|------|-----|------|---------------|")

    # (row label, key in the generation summary, key in _BASE_GEN_REFERENCE)
    gen_metrics = [
        ("Greedy 3-gram 반복률", "greedy_avg_3gram_rep", "greedy_3gram_rep"),
        ("Greedy 4-gram 반복률", "greedy_avg_4gram_rep", "greedy_4gram_rep"),
        ("EOS 종료율", "greedy_eos_rate", "greedy_eos_rate"),
    ]
    for label, key, ref_key in gen_metrics:
        base_v = base_gen_summary.get(key, _BASE_GEN_REFERENCE.get(ref_key))
        sft_v = sft_gen_summary.get(key)
        orpo_v = orpo_gen_summary.get(key)
        lines.append(
            f"| {label} | {_fmt_pct(base_v)} | {_fmt_pct(sft_v)} | "
            f"{_fmt_pct(orpo_v)} | {_pp_delta(orpo_v, sft_v)} |"
        )
    lines.append("")

    # =====================================================================
    # 5. Korean benchmarks
    # =====================================================================
    lines.append("## 5. 한국어 벤치마크\n")

    # KoBEST
    lines.append("### KoBEST (0-shot)\n")
    lines.append("| 태스크 | Base | SFT | ORPO | Base→ORPO |")
    lines.append("|--------|------|-----|------|-----------|")

    kobest_tasks = ["kobest_boolq", "kobest_copa", "kobest_hellaswag",
                    "kobest_sentineg", "kobest_wic"]
    base_kobest_accs, sft_kobest_accs, orpo_kobest_accs = [], [], []

    for t in kobest_tasks:
        base_a = _task_acc(base_zero, t, _BASE_BENCH_REFERENCE.get(t))
        sft_a = _task_acc(sft_zero, t)
        orpo_a = _task_acc(orpo_zero, t)

        for bucket, acc in (
            (base_kobest_accs, base_a),
            (sft_kobest_accs, sft_a),
            (orpo_kobest_accs, orpo_a),
        ):
            if acc is not None:
                bucket.append(acc)

        lines.append(
            f"| {t} | {_fmt_pct(base_a)} | {_fmt_pct(sft_a)} | {_fmt_pct(orpo_a)} | "
            f"{_pp_delta(orpo_a, base_a)} |"
        )

    # Averages
    base_kavg = _mean(base_kobest_accs)
    sft_kavg = _mean(sft_kobest_accs)
    orpo_kavg = _mean(orpo_kobest_accs)
    lines.append(f"| **평균** | **{_fmt_pct(base_kavg)}** | **{_fmt_pct(sft_kavg)}** | "
                 f"**{_fmt_pct(orpo_kavg)}** | **{_pp_delta(orpo_kavg, base_kavg)}** |")
    lines.append("")

    # HAE-RAE and MMLU-KO share the same one-line layout
    for heading, task in (
        ("### HAE-RAE (0-shot)\n", "haerae"),
        ("### MMLU-KO (0-shot)\n", "global_mmlu_ko"),
    ):
        lines.append(heading)
        base_a = _task_acc(base_zero, task, _BASE_BENCH_REFERENCE.get(task))
        sft_a = _task_acc(sft_zero, task)
        orpo_a = _task_acc(orpo_zero, task)
        lines.append(f"- Base: {_fmt_pct(base_a)} → SFT: {_fmt_pct(sft_a)} → ORPO: {_fmt_pct(orpo_a)}")
        lines.append("")

    # =====================================================================
    # 6. English benchmarks
    # =====================================================================
    lines.append("## 6. 영어 벤치마크\n")
    lines.append("| 태스크 | Base | SFT | ORPO | Base→ORPO |")
    lines.append("|--------|------|-----|------|-----------|")

    for t in ["hellaswag", "arc_easy", "arc_challenge", "winogrande", "piqa"]:
        prefer_norm = t in ("hellaswag", "arc_challenge")
        base_a = _task_acc(base_zero, t, _BASE_BENCH_REFERENCE.get(t), prefer_norm=prefer_norm)
        sft_a = _task_acc(sft_zero, t, prefer_norm=prefer_norm)
        orpo_a = _task_acc(orpo_zero, t, prefer_norm=prefer_norm)
        lines.append(
            f"| {t} | {_fmt_pct(base_a)} | {_fmt_pct(sft_a)} | {_fmt_pct(orpo_a)} | "
            f"{_pp_delta(orpo_a, base_a)} |"
        )

    # MMLU-EN averages. A tuple keeps the fallback iteration order stable.
    mmlu_en_groups = ("mmlu", "mmlu_humanities", "mmlu_social_sciences", "mmlu_stem", "mmlu_other")

    def _mmlu_en_avg(zero: dict) -> Optional[float]:
        """Mean over individual ``mmlu_*`` subjects, else over group entries."""
        accs = []
        for t, m in zero.items():
            # "mmlu" itself is a group entry, so only "mmlu_*" names can be subjects.
            if t.startswith("mmlu_") and t not in mmlu_en_groups:
                a = _get_acc(m)
                if a is not None:
                    accs.append(a)
        if not accs:
            for t in mmlu_en_groups:
                if t in zero:
                    a = _get_acc(zero[t])
                    if a is not None:
                        accs.append(a)
        return _mean(accs)

    base_mmlu_en = _mmlu_en_avg(base_zero)
    sft_mmlu_en = _mmlu_en_avg(sft_zero)
    orpo_mmlu_en = _mmlu_en_avg(orpo_zero)
    lines.append(f"| MMLU-EN 평균 | {_fmt_pct(base_mmlu_en)} | {_fmt_pct(sft_mmlu_en)} | "
                 f"{_fmt_pct(orpo_mmlu_en)} | {_pp_delta(orpo_mmlu_en, base_mmlu_en)} |")
    lines.append("")

    # =====================================================================
    # 7. Calibration
    # =====================================================================
    lines.append("## 7. Calibration 비교\n")
    lines.append("| 지표 | Base | SFT | ORPO |")
    lines.append("|------|------|-----|------|")

    base_cal = base_p1.get("calibration", {})
    sft_cal = sft_p1.get("calibration", {})
    orpo_cal = orpo_p1.get("calibration", {})

    cal_metrics = [
        ("top1_accuracy", "Top-1 Accuracy"),
        ("top5_accuracy", "Top-5 Accuracy"),
        ("top10_accuracy", "Top-10 Accuracy"),
    ]
    for key, label in cal_metrics:
        base_v = base_cal.get(key, _BASE_CALIB_REFERENCE.get(key))
        sft_v = sft_cal.get(key)
        orpo_v = orpo_cal.get(key)
        lines.append(f"| {label} | {_fmt_f(base_v)} | {_fmt_f(sft_v)} | {_fmt_f(orpo_v)} |")
    lines.append("")

    # =====================================================================
    # 8. ORPO-specific metrics
    # =====================================================================
    lines.append("## 8. ORPO 고유 지표\n")

    # Final preference accuracy & reward margins
    final_pref = _pref_accuracy(final_step) if final_step is not None else None
    final_margin = _reward_margin(final_step) if final_step is not None else None
    if final_pref is not None:
        lines.append(f"- **최종 Preference Accuracy**: {final_pref:.2%}")
    if final_margin is not None:
        lines.append(f"- **최종 Reward Margins**: {final_margin:.4f}")
    if final_pref is None and final_margin is None:
        # Also covers a curve whose last step carries neither metric, which
        # would otherwise leave this part of the section blank.
        lines.append("- Preference Accuracy / Reward Margins: 데이터 없음")

    # Parameter sensitivity
    rep_val = _baseline_rep_rate(grid_entries)
    if rep_val is not None:
        verdict = "PASS" if rep_val < 0.05 else "FAIL"
        lines.append(f"- **Parameter Sensitivity**: rep_penalty=1.0 → 3-gram rep={rep_val:.2%} "
                     f"(목표 < 5%) → {verdict}")
    lines.append("")

    # =====================================================================
    # 9. Repetition grid search
    # =====================================================================
    lines.append("## 9. 반복률 그리드 서치\n")
    if rep_grid:
        rep_rows = [
            {
                "config": r.get("params", "?"),
                "temp": r.get("temperature"),
                # Accept the same key alias as the sensitivity lookup above.
                "rep_pen": r.get("repetition_penalty", r.get("rep_penalty")),
                # Missing values stay None so the table does not print the
                # sort sentinel; the sort key below pushes them to the end.
                "3gram": r.get("avg_3gram_rep", r.get("3gram_repetition")),
                "4gram": r.get("avg_4gram_rep", r.get("4gram_repetition")),
                "eos_rate": r.get("eos_rate"),
                "avg_tokens": r.get("avg_tokens"),
            }
            for r in grid_entries
            if isinstance(r, dict)
        ]
        rep_rows.sort(key=lambda x: x["3gram"] if isinstance(x["3gram"], (int, float)) else float("inf"))

        lines.append("| 설정 | Temp | Rep Pen | 3-gram | 4-gram | EOS Rate | Avg Tokens |")
        lines.append("|------|------|---------|--------|--------|----------|-----------|")
        for i, r in enumerate(rep_rows):
            is_best = i == 0 and isinstance(r["3gram"], (int, float))
            # The marker lives inside the first cell: GFM discards any cell
            # beyond the header's column count, so a trailing one is not rendered.
            marker = " **← best**" if is_best else ""
            lines.append(
                f"| {r['config']}{marker} | {_fmt_f(r['temp'], 2)} | {_fmt_f(r['rep_pen'], 2)} | "
                f"{_fmt_f(r['3gram'])} | {_fmt_f(r['4gram'])} | "
                f"{_fmt_f(r['eos_rate'])} | {_fmt_f(r['avg_tokens'], 1)} |"
            )
        lines.append("")
    else:
        lines.append("반복률 그리드 서치 데이터 없음\n")

    # =====================================================================
    # 10. Generation samples
    # =====================================================================
    lines.append("## 10. 생성 샘플\n")
    orpo_samples = orpo_p1.get("generation", {}).get("samples", [])
    greedy_samples = [s for s in orpo_samples if isinstance(s, dict) and s.get("temperature", 1.0) == 0.0]
    if not greedy_samples:
        greedy_samples = orpo_samples  # fallback: use all samples

    if greedy_samples:
        lines.append("### ORPO Greedy 생성 샘플\n")
        for i, s in enumerate(greedy_samples[:15], 1):
            if not isinstance(s, dict):
                continue
            prompt = s.get("prompt", "")
            text = s.get("text", s.get("generated_text", ""))
            if len(text) > 500:
                text = text[:500] + "..."
            hit_eos = s.get("hit_eos", "?")
            rep3 = s.get("3gram_rep", s.get("avg_3gram_rep"))
            tokens = s.get("generated_tokens", s.get("num_tokens", "?"))
            lines.append(f"**[{i}]** `{prompt}`")
            lines.append(f"> {text}")
            meta_parts = [f"EOS={hit_eos}"]
            if rep3 is not None:
                meta_parts.append(f"3gram_rep={rep3:.2%}")
            meta_parts.append(f"tokens={tokens}")
            lines.append(f"> *{', '.join(meta_parts)}*\n")
    else:
        lines.append("생성 샘플 데이터 없음\n")

    # =====================================================================
    # 11. Final decision
    # =====================================================================
    lines.append("## 11. 최종 판정\n")
    lines.append("### 배포 기준 충족 여부\n")
    lines.append("| 조건 | 기준 | 현재 값 | 충족 |")
    lines.append("|------|------|---------|------|")

    forgetting_text = f"{orpo_forgetting:.1f}%" if orpo_forgetting is not None else "N/A"
    criteria = [
        ("Greedy 3-gram 반복률", "< 5%", _fmt_pct(orpo_rep), rep_ok),
        ("EOS 종료율", "> 90%", _fmt_pct(orpo_eos), eos_ok),
        ("PPL Forgetting", "< 5%", forgetting_text, forgetting_ok),
        ("KoBEST 평균", ">= 43%", _fmt_pct(orpo_kobest), kobest_ok),
    ]
    for cond, threshold, current, met in criteria:
        lines.append(f"| {cond} | {threshold} | {current} | {'YES' if met else 'NO'} |")
    lines.append("")

    if deploy_criteria_met:
        lines.append("**→ 모든 배포 기준 충족: DEPLOY (Phase 4: GGUF 변환 + Ollama 배포 진행)**\n")
    else:
        lines.append("**→ 배포 기준 미달: RETRY (ORPO 재학습 또는 하이퍼파라미터 조정 필요)**\n")

    lines.append("---\n")
    lines.append("*이 보고서는 `eval/report_generator.py::generate_three_way_report()`에 의해 자동 생성되었습니다.*")

    report_text = "\n".join(lines)
    output_path.write_text(report_text, encoding="utf-8")

    # Also save to orpo_output_dir if provided
    if orpo_output_dir:
        orpo_output_dir = Path(orpo_output_dir)
        orpo_output_dir.mkdir(parents=True, exist_ok=True)
        (orpo_output_dir / "orpo_three_way_report.md").write_text(report_text, encoding="utf-8")

    return output_path


if __name__ == "__main__":
    # This module has no command-line interface of its own; running it
    # directly only prints a pointer to the pipelines that drive it.
    usage_hint = "report_generator.py — use via full_eval_pipeline.py or sft_eval_pipeline.py"
    print(usage_hint)