""" 베이스라인 vs KcELECTRA 성능 비교 스크립트 =========================================== 담당: 경이 목적: 동일한 test 데이터로 두 모델의 성능을 비교하여 CSV·JSON으로 저장. 결과는 02_evaluate_compare.ipynb에서 시각화. 실행: python scripts/evaluate_compare.py # test split 평가 python scripts/evaluate_compare.py --split val # val split 평가 결과 파일: data/eval_results_simple.json data/eval_results_kcelectra.json data/eval_comparison_summary.csv 평가 지표: - Macro F1 : 클래스 불균형 무관 전체 성능 (메인 지표) - Per-class F1, Precision, Recall - Confusion Matrix """ import argparse import json import sys from datetime import datetime from pathlib import Path import pandas as pd from sklearn.metrics import ( classification_report, confusion_matrix, f1_score, precision_score, recall_score, ) _BASE = Path(__file__).parent.parent sys.path.insert(0, str(_BASE)) from src.classifier_simple import load_pipeline, load_data from src.classifier_kcelectra import predict_kcelectra, is_ready as kcelectra_ready LABELS = ["일정", "준비물", "제출", "비용", "건강·안전", "기타"] OUT_DIR = _BASE / "data" def _fill_per_class_from_cm(res: dict) -> dict: """Colab 저장 JSON에 per_class가 없을 때 confusion_matrix에서 보완.""" if res.get("per_class"): return res cm = res.get("confusion_matrix") labels = res.get("labels", LABELS) if not cm: return res per_class = {} for i, label in enumerate(labels): tp = cm[i][i] fp = sum(cm[r][i] for r in range(len(cm))) - tp fn = sum(cm[i]) - tp precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0 recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0 f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0 per_class[label] = { "precision": round(precision, 4), "recall": round(recall, 4), "f1": round(f1, 4), "support": sum(cm[i]), } res["per_class"] = per_class return res def evaluate_simple(split: str = "test") -> dict: texts, true_labels = load_data(split) if not texts: texts, true_labels = load_data("all") pipe = load_pipeline() pred_labels = pipe.predict(texts) return _make_result("simple", true_labels, list(pred_labels)) def evaluate_kcelectra(split: str = "test") -> dict: # Colab에서 이미 평가한 JSON이 있으면 재활용 (torch 없는 환경에서도 비교 가능) cached_json = OUT_DIR / "eval_results_kcelectra.json" if cached_json.exists() and not kcelectra_ready(): print(f"[compare] Colab 결과 파일 사용: {cached_json.name}") with open(cached_json, encoding="utf-8") as f: result = json.load(f) result = _fill_per_class_from_cm(result) # per_class 보완된 내용을 다시 저장 with open(cached_json, "w", encoding="utf-8") as f: json.dump(result, f, ensure_ascii=False, indent=2) return result if not kcelectra_ready(): print("[compare] KcELECTRA 체크포인트 없음. 
        return {}
    texts, true_labels = load_data(split)
    if not texts:
        texts, true_labels = load_data("all")
    pred_labels = [predict_kcelectra(t)["category"] for t in texts]
    return _make_result("kcelectra", true_labels, pred_labels)


def _make_result(model_name: str, true: list, pred: list) -> dict:
    macro_f1 = f1_score(true, pred, labels=LABELS, average="macro", zero_division=0)
    macro_pre = precision_score(true, pred, labels=LABELS, average="macro", zero_division=0)
    macro_rec = recall_score(true, pred, labels=LABELS, average="macro", zero_division=0)
    report = classification_report(
        true, pred, labels=LABELS, output_dict=True, zero_division=0
    )
    cm = confusion_matrix(true, pred, labels=LABELS)

    print(f"\n{'='*50}")
    print(f"[{model_name}] classification report")
    print(classification_report(true, pred, labels=LABELS, zero_division=0))
    print(f"[{model_name}] Macro F1={macro_f1:.4f} Pre={macro_pre:.4f} Rec={macro_rec:.4f}")
    print(f"[{model_name}] Confusion Matrix:\n{cm}")

    return {
        "model": model_name,
        "macro_f1": round(macro_f1, 4),
        "macro_precision": round(macro_pre, 4),
        "macro_recall": round(macro_rec, 4),
        "per_class": {
            label: {
                "precision": round(report[label]["precision"], 4),
                "recall": round(report[label]["recall"], 4),
                "f1": round(report[label]["f1-score"], 4),
                "support": report[label]["support"],
            }
            for label in LABELS
            if label in report
        },
        "confusion_matrix": cm.tolist(),
        "labels": LABELS,
    }


def save_and_compare(simple_res: dict, kcelectra_res: dict) -> None:
    ts = datetime.now().strftime("%Y%m%d")

    def _write_json(data: dict, canonical: str) -> None:
        with open(OUT_DIR / canonical, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        stem = canonical.replace(".json", "")
        with open(OUT_DIR / f"{stem}_{ts}.json", "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

    _write_json(simple_res, "eval_results_simple.json")
    if kcelectra_res:
        _write_json(kcelectra_res, "eval_results_kcelectra.json")

    rows = []
    for res in [simple_res, kcelectra_res]:
        if not res:
            continue
        row = {
            "model": res["model"],
            "macro_f1": res["macro_f1"],
            "macro_precision": res["macro_precision"],
            "macro_recall": res["macro_recall"],
        }
        for label in LABELS:
            if label in res.get("per_class", {}):
                row[f"{label}_f1"] = res["per_class"][label]["f1"]
        rows.append(row)

    summary_df = pd.DataFrame(rows)
    # utf-8-sig (BOM) so the Korean column headers open correctly in Excel
    summary_df.to_csv(OUT_DIR / "eval_comparison_summary.csv", index=False, encoding="utf-8-sig")
    summary_df.to_csv(OUT_DIR / f"eval_comparison_summary_{ts}.csv", index=False, encoding="utf-8-sig")

    print(f"\n[compare] Results saved → {OUT_DIR}")
    print("\n── Performance summary ──")
    print(summary_df[["model", "macro_f1", "macro_precision", "macro_recall"]].to_string(index=False))

    if kcelectra_res:
        delta = kcelectra_res["macro_f1"] - simple_res["macro_f1"]
        print(f"\n KcELECTRA vs Simple ΔMacro F1 = {delta:+.4f}")
        if delta >= 0.05:
            print(" → KcELECTRA improves Macro F1 by 5%p or more: adoption recommended!")
        else:
            print(" → Improvement below 5%p: consider keeping Simple")


def main(split: str = "test") -> None:
    print(f"[compare] Evaluation split: {split}")
    simple_res = evaluate_simple(split)
    kcelectra_res = evaluate_kcelectra(split)
    save_and_compare(simple_res, kcelectra_res)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--split", default="test", choices=["train", "val", "test", "all"])
    args = parser.parse_args()
    main(split=args.split)
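
# ── Reference: reading the saved outputs (sketch, not the notebook's actual code) ──
# 02_evaluate_compare.ipynb visualizes the files written above. A minimal sketch
# of loading them, assuming only pandas/json are available in that notebook;
# the real notebook code may differ. Kept as comments so this script's behavior
# is unchanged.
#
#   import json
#   import pandas as pd
#
#   summary = pd.read_csv("data/eval_comparison_summary.csv")   # per-model macro metrics + per-class F1 columns
#   with open("data/eval_results_kcelectra.json", encoding="utf-8") as f:
#       kc = json.load(f)                                        # full per_class dict + confusion_matrix
#   print(summary[["model", "macro_f1"]])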