""" evaluate_compare_ensemble_20260505.py ====================================== 담당: 경이 (kyeongyi) 작성일: 2026-05-05 목적: Simple + KcELECTRA 소프트 투표 앙상블 평가. KcELECTRA 단독(v3, Macro F1 0.8545)이 목표 +5%에 0.71%p 미달하여 devlog 우선순위 3에 따라 앙상블 시도. [비교 모델] 1. Simple : TF-IDF + Logistic Regression (baseline 0.8116) 2. KcELECTRA: v3 파인튜닝 단독 (0.8545) 3. Ensemble : KcELECTRA×0.7 + Simple×0.3 (목표: 0.8616+) [앙상블 방식] 소프트 투표(Soft Voting) — 두 모델의 확률을 가중 합산 후 argmax combined[i] = weight_kc × kc_prob[i] + (1-weight_kc) × s_prob[i] [출력 파일 - data/20260505/] eval_results_ensemble_20260505.json eval_comparison_summary_ensemble_20260505.csv 실행: cd model/classification python scripts/evaluate_compare_ensemble_20260505.py python scripts/evaluate_compare_ensemble_20260505.py --weight_kc 0.6 python scripts/evaluate_compare_ensemble_20260505.py --search_weights """ import argparse import json import pickle import sys from pathlib import Path import numpy as np import pandas as pd from sklearn.metrics import classification_report, confusion_matrix, f1_score from sklearn.pipeline import Pipeline _BASE = Path(__file__).parent.parent sys.path.insert(0, str(_BASE / "src")) SPLIT_CSV = _BASE / "data" / "split_v5_20260505.csv" SIMPLE_PKL = _BASE / "checkpoints" / "simple_tfidf_logreg_v3_20260505.pkl" KCELECTRA_CKPT = _BASE / "checkpoints" / "kcelectra-category-v3" OUT_DIR = _BASE / "data" / "20260505" # HF Hub fallback (로컬 체크포인트 없을 때) _HF_REPO = "kysophia/kcelectra-category" _HF_SUBFOLDER = "kcelectra-category-v3" LABELS = ["일정", "준비물", "제출", "비용", "건강·안전", "기타"] _LABEL_TO_COL = {lbl: i for i, lbl in enumerate(LABELS)} # ────────────────────────────────────────────────────────────────── # 데이터 로드 # ────────────────────────────────────────────────────────────────── def load_split(split: str = "test") -> tuple[list[str], list[str]]: if not SPLIT_CSV.exists(): raise FileNotFoundError(f"{SPLIT_CSV} 없음") df = pd.read_csv(SPLIT_CSV, encoding="utf-8-sig") df = df[df["split"] == split] df = df[df["category"].isin(LABELS)] return df["text"].tolist(), df["category"].tolist() # ────────────────────────────────────────────────────────────────── # Simple 확률 행렬 (N, 6) # ────────────────────────────────────────────────────────────────── def _load_simple() -> Pipeline: if not SIMPLE_PKL.exists(): raise FileNotFoundError( f"{SIMPLE_PKL.name} 없음.\n" " 먼저 실행: python scripts/evaluate_compare_v3_20260505.py" ) with open(SIMPLE_PKL, "rb") as f: return pickle.load(f) def get_simple_proba(texts: list[str]) -> np.ndarray: """Simple predict_proba → LABELS 순서로 열 정렬한 (N, 6) 행렬.""" pipe = _load_simple() raw = pipe.predict_proba(texts) # (N, K), pipe.classes_ 순서 classes = list(pipe.classes_) out = np.zeros((len(texts), len(LABELS))) for j, lbl in enumerate(LABELS): if lbl in classes: out[:, j] = raw[:, classes.index(lbl)] return out # ────────────────────────────────────────────────────────────────── # KcELECTRA 확률 행렬 (N, 6) # ────────────────────────────────────────────────────────────────── def _load_kcelectra_model(): """(device, tokenizer, model, id2label) 반환. 로컬 우선, HF Hub fallback.""" try: import torch from transformers import AutoModelForSequenceClassification, AutoTokenizer except ImportError: raise ImportError("pip install torch transformers") device = "cuda" if torch.cuda.is_available() else "cpu" local_ready = ( KCELECTRA_CKPT.exists() and (KCELECTRA_CKPT / "config.json").exists() and any( (KCELECTRA_CKPT / f).exists() for f in ("pytorch_model.bin", "model.safetensors") ) ) if local_ready: tokenizer = AutoTokenizer.from_pretrained(str(KCELECTRA_CKPT)) model = AutoModelForSequenceClassification.from_pretrained( str(KCELECTRA_CKPT), num_labels=len(LABELS), ignore_mismatched_sizes=True ) src = str(KCELECTRA_CKPT) else: print(f"[kcelectra] 로컬 없음 → HF Hub 다운로드: {_HF_REPO}/{_HF_SUBFOLDER}") tokenizer = AutoTokenizer.from_pretrained(_HF_REPO, subfolder=_HF_SUBFOLDER) model = AutoModelForSequenceClassification.from_pretrained( _HF_REPO, subfolder=_HF_SUBFOLDER, num_labels=len(LABELS) ) src = f"{_HF_REPO}/{_HF_SUBFOLDER}" model.to(device).eval() print(f"[kcelectra] 모델 로드: {src} → device={device}") labels_file = KCELECTRA_CKPT / "label2id.json" if local_ready and labels_file.exists(): with open(labels_file, encoding="utf-8") as f: label2id: dict[str, int] = json.load(f) id2label = {v: k for k, v in label2id.items()} else: id2label = {i: lbl for i, lbl in enumerate(LABELS)} return device, tokenizer, model, id2label def get_kcelectra_proba(texts: list[str], batch_size: int = 32) -> np.ndarray: """KcELECTRA softmax → LABELS 순서로 열 정렬한 (N, 6) 행렬. 배치 처리.""" import torch device, tokenizer, model, id2label = _load_kcelectra_model() out = np.zeros((len(texts), len(LABELS))) with torch.no_grad(): for start in range(0, len(texts), batch_size): batch = texts[start : start + batch_size] enc = tokenizer( batch, return_tensors="pt", truncation=True, padding=True, max_length=128, ).to(device) probs = torch.softmax(model(**enc).logits, dim=-1).cpu().numpy() for b_i, prob_row in enumerate(probs): for k_i, p in enumerate(prob_row): lbl = id2label.get(k_i, "기타") col = _LABEL_TO_COL.get(lbl, _LABEL_TO_COL["기타"]) out[start + b_i, col] = p return out # ────────────────────────────────────────────────────────────────── # 앙상블 평가 # ────────────────────────────────────────────────────────────────── def evaluate_ensemble( split: str, weight_kc: float, s_proba: np.ndarray, kc_proba: np.ndarray, true_labels: list[str], ) -> dict: """소프트 투표. s_proba / kc_proba를 받아 가중 합산 → 분류 리포트 반환.""" combined = weight_kc * kc_proba + (1.0 - weight_kc) * s_proba pred_idx = combined.argmax(axis=1) pred_labels = [LABELS[i] for i in pred_idx] report = classification_report( true_labels, pred_labels, labels=LABELS, output_dict=True, zero_division=0, ) cm = confusion_matrix(true_labels, pred_labels, labels=LABELS) macro_f1 = f1_score(true_labels, pred_labels, labels=LABELS, average="macro", zero_division=0) return { "model": "ensemble", "version": f"kc{weight_kc:.1f}+s{1-weight_kc:.1f}", "weight_kc": weight_kc, "macro_f1": round(macro_f1, 4), "macro_precision": round(report["macro avg"]["precision"], 4), "macro_recall": round(report["macro avg"]["recall"], 4), "per_class": { lbl: { "precision": round(report[lbl]["precision"], 4), "recall": round(report[lbl]["recall"], 4), "f1": round(report[lbl]["f1-score"], 4), "support": report[lbl]["support"], } for lbl in LABELS }, "confusion_matrix": cm.tolist(), "labels": LABELS, "split_used": split, "data_version": "v5_20260505", } # ────────────────────────────────────────────────────────────────── # val 세트 가중치 그리드 탐색 # ────────────────────────────────────────────────────────────────── def search_best_weight() -> float: """val 세트에서 weight_kc 0.4~0.9 탐색 → 최적 weight 반환.""" print("\n[weight 탐색] val 세트 기준 KcELECTRA 가중치 최적화") val_texts, val_labels = load_split("val") print(f" val {len(val_texts)}건 확률 계산 중...") val_s = get_simple_proba(val_texts) val_kc = get_kcelectra_proba(val_texts) print(f"\n {'weight_kc':>10s} {'val Macro F1':>14s}") print(" " + "-" * 28) best_w, best_f1 = 0.7, 0.0 for w in [0.4, 0.5, 0.6, 0.7, 0.8, 0.9]: res = evaluate_ensemble("val", w, val_s, val_kc, val_labels) mark = " ← best" if res["macro_f1"] > best_f1 else "" print(f" {w:>10.1f} {res['macro_f1']:>14.4f}{mark}") if res["macro_f1"] > best_f1: best_f1, best_w = res["macro_f1"], w print(f"\n 최적 weight_kc = {best_w} (val Macro F1 = {best_f1:.4f})") return best_w # ────────────────────────────────────────────────────────────────── # 저장 + 비교 출력 # ────────────────────────────────────────────────────────────────── def _summary_row(res: dict) -> dict: row = { "macro_f1": res.get("macro_f1", "-"), "macro_precision": res.get("macro_precision", "-"), "macro_recall": res.get("macro_recall", "-"), } for lbl in LABELS: row[f"f1_{lbl}"] = res.get("per_class", {}).get(lbl, {}).get("f1", "-") return row def _build_result_from_proba( proba: np.ndarray, true_labels: list[str], model_name: str, split: str, ) -> dict: """확률 행렬 → 분류 리포트 dict (JSON 없을 때 fallback).""" pred_labels = [LABELS[i] for i in proba.argmax(axis=1)] report = classification_report(true_labels, pred_labels, labels=LABELS, output_dict=True, zero_division=0) macro_f1 = f1_score(true_labels, pred_labels, labels=LABELS, average="macro", zero_division=0) cm = confusion_matrix(true_labels, pred_labels, labels=LABELS) return { "model": model_name, "version": "v3", "macro_f1": round(macro_f1, 4), "macro_precision": round(report["macro avg"]["precision"], 4), "macro_recall": round(report["macro avg"]["recall"], 4), "per_class": { lbl: { "precision": round(report[lbl]["precision"], 4), "recall": round(report[lbl]["recall"], 4), "f1": round(report[lbl]["f1-score"], 4), "support": report[lbl]["support"], } for lbl in LABELS }, "confusion_matrix": cm.tolist(), "labels": LABELS, "split_used": split, "data_version": "v5_20260505", } def save_and_compare(simple_res: dict, kc_res: dict, ens_res: dict) -> None: OUT_DIR.mkdir(parents=True, exist_ok=True) ens_path = OUT_DIR / "eval_results_ensemble_20260505.json" with open(ens_path, "w", encoding="utf-8") as f: json.dump(ens_res, f, ensure_ascii=False, indent=2) print(f"\n[저장] {ens_path.name}") label_e = f"Ensemble(kc{ens_res['weight_kc']:.1f}+s{1-ens_res['weight_kc']:.1f})" rows = [ {"model": "Simple (TF-IDF+LR)", **_summary_row(simple_res)}, {"model": "KcELECTRA v3 (단독)", **_summary_row(kc_res)}, {"model": label_e, **_summary_row(ens_res)}, ] summary_path = OUT_DIR / "eval_comparison_summary_ensemble_20260505.csv" pd.DataFrame(rows).to_csv(summary_path, index=False, encoding="utf-8-sig") print(f"[저장] {summary_path.name}") s_f1 = simple_res["macro_f1"] kc_f1 = kc_res["macro_f1"] e_f1 = ens_res["macro_f1"] print("\n" + "=" * 60) print(" 앙상블 성능 비교 (v5 데이터 - 4992행)") print("=" * 60) print(f" Simple Macro F1 : {s_f1:.4f} (baseline)") print(f" KcELECTRA Macro F1 : {kc_f1:.4f} (Delta vs Simple: {kc_f1-s_f1:+.4f})") print(f" Ensemble Macro F1 : {e_f1:.4f} (Delta vs Simple: {e_f1-s_f1:+.4f})") print() delta = e_f1 - s_f1 if delta >= 0.05: print(" ★ 앙상블 5%+ 향상 달성! 채택 확정") elif delta > 0: print(f" ~ 앙상블 소폭 향상 ({delta:+.4f}) — --search_weights 또는 추가 튜닝 권장") else: print(" ✗ 앙상블이 Simple 미달 — weight_kc 조정 필요") print("\n [카테고리별 F1 비교]") print(f" {'카테고리':10s} {'Simple':>8s} {'KcELEC':>8s} {'Ensemble':>10s}") print(" " + "-" * 42) for lbl in LABELS: s = simple_res["per_class"].get(lbl, {}).get("f1", 0.0) k = kc_res["per_class"].get(lbl, {}).get("f1", 0.0) e = ens_res["per_class"].get(lbl, {}).get("f1", 0.0) best = " ★" if e > max(s, k) else (" " if e >= max(s, k) else " ") print(f" {lbl:10s} {s:>8.4f} {k:>8.4f} {e:>10.4f}{best}") print(f"\n[출력 폴더] {OUT_DIR}") # ────────────────────────────────────────────────────────────────── # CLI # ────────────────────────────────────────────────────────────────── def main() -> None: parser = argparse.ArgumentParser( description="KcELECTRA + Simple 소프트 투표 앙상블 평가" ) parser.add_argument("--split", default="test", choices=["train", "val", "test"]) parser.add_argument("--weight_kc", type=float, default=0.7, help="KcELECTRA 가중치 (0.0~1.0). 기본값 0.7") parser.add_argument("--search_weights", action="store_true", help="val 세트에서 0.4~0.9 그리드 탐색 후 최적값으로 test 평가") args = parser.parse_args() print(f"앙상블 평가 시작 — split: {args.split}, weight_kc: {args.weight_kc}") # ① val 세트 가중치 탐색 (--search_weights) weight_kc = args.weight_kc if args.search_weights: weight_kc = search_best_weight() print(f"\n[test 평가] 최적 weight_kc={weight_kc} 적용") # ② test 확률 행렬 texts, true_labels = load_split(args.split) print(f"\n[데이터] {args.split} 세트 {len(texts)}건") print("\n[Simple] 확률 행렬 계산 중...") s_proba = get_simple_proba(texts) print("[KcELECTRA] 확률 행렬 계산 중... (CPU면 수 분 소요)") kc_proba = get_kcelectra_proba(texts) # ③ 앙상블 평가 ens_res = evaluate_ensemble(args.split, weight_kc, s_proba, kc_proba, true_labels) # ④ 단독 결과 — 기존 JSON 재활용, 없으면 확률 행렬에서 직접 계산 simple_json = OUT_DIR / "eval_results_simple_v3_20260505.json" kc_json = OUT_DIR / "eval_results_kcelectra_v3_20260505.json" if simple_json.exists(): with open(simple_json, encoding="utf-8") as f: simple_res = json.load(f) print(f"[simple] 기존 JSON 재활용: {simple_json.name}") else: simple_res = _build_result_from_proba(s_proba, true_labels, "simple", args.split) if kc_json.exists(): with open(kc_json, encoding="utf-8") as f: kc_res = json.load(f) print(f"[kcelectra] 기존 JSON 재활용: {kc_json.name}") else: kc_res = _build_result_from_proba(kc_proba, true_labels, "kcelectra", args.split) # ⑤ 저장 + 비교 출력 print("\n[앙상블] 분류 리포트") pred_labels = [LABELS[i] for i in (weight_kc * kc_proba + (1 - weight_kc) * s_proba).argmax(axis=1)] print(classification_report(true_labels, pred_labels, labels=LABELS, zero_division=0)) save_and_compare(simple_res, kc_res, ens_res) if __name__ == "__main__": main()