# SignalMod/src/evaluation/golden_baseline_report.py
# Mirae Kang
# feat: implement new models and improve UI, #23
# 46cc63a
"""Briefing-aligned report: Golden Baseline (gap <1%) + Final Hybrid (gap <5%, F1≥0.80)."""
from __future__ import annotations
from pathlib import Path
def _as_float(value: object, default: "float | None" = None) -> "float | None":
    """Return ``value`` converted to ``float``, or ``default`` if that fails.

    ``None`` and anything ``float()`` rejects (for example ``"n/a"``) yield
    ``default`` instead of raising.
    """
    if value is None:
        return default
    try:
        return float(value)
    except (TypeError, ValueError):
        return default


def _shown(value: object) -> object:
    """Return ``value`` for display, substituting an em dash for ``None``."""
    return "—" if value is None else value


def _mark(ok: bool) -> str:
    """Return the check-mark emoji when ``ok`` is true, else the warning emoji."""
    return "✅" if ok else "⚠️"


def write_golden_baseline_report(metrics: dict, path: Path) -> None:
    """Render ``metrics`` as a Markdown report and write it to ``path``.

    The report has three tables (golden baseline, performance squeeze, hybrid
    safety net) followed by an overall verdict line.

    Keys read from ``metrics``:
        run_id: Identifier used in the title and JSON path (default "unknown").
        target_f1_weighted: Minimum hybrid F1 (default 0.80).
        max_train_test_gap_pp: Exclusive upper bound for the hybrid gap
            (default 5.0).
        baseline_gap_target_pp: Exclusive upper bound for the baseline gap
            (default 1.0).
        golden_baseline, performance_squeeze, hybrid_safety_net: Per-step
            mappings; a missing or ``None`` section is treated as empty.

    A gap or F1 value that is missing, ``None`` or not convertible to a number
    is reported as not meeting its target; a missing value is printed as "—".
    Thresholds that are missing or unparseable fall back to their defaults.

    Parent directories of ``path`` are created if needed and an existing file
    is overwritten. The file is written as UTF-8.
    """
    run_id = metrics.get("run_id", "unknown")
    target = _as_float(metrics.get("target_f1_weighted"), 0.80)
    max_gap_pp = _as_float(metrics.get("max_train_test_gap_pp"), 5.0)
    baseline_gap_target_pp = _as_float(metrics.get("baseline_gap_target_pp"), 1.0)

    # `or {}` also covers sections that are present but null in the metrics.
    base = metrics.get("golden_baseline") or {}
    squeeze = metrics.get("performance_squeeze") or {}
    hybrid = metrics.get("hybrid_safety_net") or {}

    # Raw values are kept for display; the coerced copies drive the checks so
    # that None or string values cannot raise during comparison.
    base_gap = base.get("train_test_gap_pp")
    hybrid_gap = hybrid.get("train_test_gap_pp")
    hybrid_f1 = hybrid.get("f1_weighted")
    base_gap_num = _as_float(base_gap)
    hybrid_gap_num = _as_float(hybrid_gap)
    hybrid_f1_num = _as_float(hybrid_f1)

    base_ok = base_gap_num is not None and base_gap_num < baseline_gap_target_pp
    hybrid_gap_ok = hybrid_gap_num is not None and hybrid_gap_num < max_gap_pp
    hybrid_f1_ok = hybrid_f1_num is not None and hybrid_f1_num >= target
    hybrid_ok = hybrid_gap_ok and hybrid_f1_ok

    if base_ok and hybrid_ok:
        overall = "✅ Briefing targets met"
    else:
        overall = "⚠️ Review gaps / F1"

    table_header = ["| Metric | Value | Target |", "|--------|-------|--------|"]

    lines = [
        f"# Golden Baseline Strategy — {run_id}",
        "",
        "Two-step briefing alignment: **Esencial** frozen expert baseline, then **Experto** squeeze + hybrid.",
        "",
        "## Step 1 — Golden Baseline (Esencial)",
        "",
        *table_header,
        f"| F1 weighted (test) | **{_shown(base.get('f1_weighted'))}** | ~0.72 (pretrained expert) |",
        f"| Train–test gap (pp) | **{_shown(base_gap)}** | < {baseline_gap_target_pp}% {_mark(base_ok)} |",
        "| Fine-tuning | None (all layers frozen) | — |",
        f"| Threshold | {_shown(base.get('threshold'))} | val-tuned |",
        "",
        "## Step 2 — Performance Squeeze (Experto)",
        "",
        *table_header,
        f"| F1 weighted (test) | **{_shown(squeeze.get('f1_weighted'))}** | ≥ {target} |",
        f"| Train–test gap (pp) | **{_shown(squeeze.get('train_test_gap_pp'))}** | ≤ 4.9% |",
        f"| R-Drop | {squeeze.get('rdrop_enabled', False)} | enabled |",
        f"| Layers trained | last {_shown(squeeze.get('freeze_mode'))} | 2 + head |",
        "",
        "## Step 3 — Hybrid Safety Net (Final)",
        "",
        *table_header,
        f"| F1 weighted (test) | **{_shown(hybrid_f1)}** | ≥ {target} {_mark(hybrid_f1_ok)} |",
        f"| Train–test gap (pp) | **{_shown(hybrid_gap)}** | < {max_gap_pp}% {_mark(hybrid_gap_ok)} |",
        f"| Weights | BERT {hybrid.get('bert_weight')} / LR {hybrid.get('lr_weight')} | anchor |",
        "| LR regularization | C=0.001, max_features=200 | stability |",
        "",
        f"### Overall: {overall}",
        "",
        f"- JSON: `reports/golden_baseline/golden_baseline_run_{run_id}.json`",
        "",
    ]

    # Accept plain string paths as well as Path objects.
    out_path = Path(path)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text("\n".join(lines), encoding="utf-8")