"""
베이스라인 vs KcELECTRA 성능 비교 스크립트
===========================================
담당: 경이
목적: 동일한 test 데이터로 두 모델의 성능을 비교하여 CSV·JSON으로 저장.
      결과는 02_evaluate_compare.ipynb에서 시각화.

실행:
    python scripts/evaluate_compare.py              # test split 평가
    python scripts/evaluate_compare.py --split val  # val split 평가

결과 파일:
    data/eval_results_simple.json
    data/eval_results_kcelectra.json
    data/eval_comparison_summary.csv

평가 지표:
    - Macro F1  : 클래스 불균형 무관 전체 성능 (메인 지표)
    - Per-class F1, Precision, Recall
    - Confusion Matrix
"""

import argparse
import json
import sys
from datetime import datetime
from pathlib import Path

import pandas as pd
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

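# Make the project root importable so the `src` package resolves when this file
# is run directly as a script (python scripts/evaluate_compare.py).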
_BASE = Path(__file__).parent.parent
sys.path.insert(0, str(_BASE))

from src.classifier_simple import load_pipeline, load_data
from src.classifier_kcelectra import predict_kcelectra, is_ready as kcelectra_ready

LABELS = ["일정", "준비물", "제출", "비용", "건강·안전", "기타"]
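# Category labels (schedule, supplies, submissions, cost, health/safety, other).
# These strings must match the dataset labels exactly, so they stay in Korean.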
OUT_DIR = _BASE / "data"


def _fill_per_class_from_cm(res: dict) -> dict:
    """Colab 저장 JSON에 per_class가 없을 때 confusion_matrix에서 보완."""
    if res.get("per_class"):
        return res
    cm = res.get("confusion_matrix")
    labels = res.get("labels", LABELS)
    if not cm:
        return res
    per_class = {}
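    # sklearn's confusion_matrix convention: rows = true labels, columns = predictions,
    # so FP = column-i sum minus the diagonal entry and FN = row-i sum minus it.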
    for i, label in enumerate(labels):
        tp = cm[i][i]
        fp = sum(cm[r][i] for r in range(len(cm))) - tp
        fn = sum(cm[i]) - tp
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall    = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1        = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
        per_class[label] = {
            "precision": round(precision, 4),
            "recall":    round(recall, 4),
            "f1":        round(f1, 4),
            "support":   sum(cm[i]),
        }
    res["per_class"] = per_class
    return res
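
# Toy walk-through of the reconstruction above (comments only, values illustrative):
#   cm = [[8, 2], [1, 9]] with labels ["A", "B"]  (rows = true, columns = predicted)
#   class A: tp = 8, fp = 9 - 8 = 1, fn = 10 - 8 = 2
#   precision = 8/9 ≈ 0.8889, recall = 8/10 = 0.8, f1 ≈ 0.8421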


def evaluate_simple(split: str = "test") -> dict:
    texts, true_labels = load_data(split)
    if not texts:
        texts, true_labels = load_data("all")

    pipe = load_pipeline()
    pred_labels = pipe.predict(texts)

    return _make_result("simple", true_labels, list(pred_labels))


def evaluate_kcelectra(split: str = "test") -> dict:
    # Reuse the JSON already evaluated on Colab if it exists (enables the comparison
    # even in environments without torch). Note: the cached file may have been
    # produced on a different split than the one requested here.
    cached_json = OUT_DIR / "eval_results_kcelectra.json"
    if cached_json.exists() and not kcelectra_ready():
        print(f"[compare] Colab 결과 파일 사용: {cached_json.name}")
        with open(cached_json, encoding="utf-8") as f:
            result = json.load(f)
        result = _fill_per_class_from_cm(result)
        # Re-save so the filled-in per_class section is persisted
        with open(cached_json, "w", encoding="utf-8") as f:
            json.dump(result, f, ensure_ascii=False, indent=2)
        return result

    if not kcelectra_ready():
        print("[compare] KcELECTRA 체크포인트 없음. 01_train_kcelectra.ipynb 먼저 실행하세요.")
        return {}

    texts, true_labels = load_data(split)
    if not texts:
        texts, true_labels = load_data("all")

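    # predict_kcelectra returns a dict per input text; only the "category" field
    # is needed for the comparison here.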
    pred_labels = [predict_kcelectra(t)["category"] for t in texts]
    return _make_result("kcelectra", true_labels, pred_labels)


def _make_result(model_name: str, true: list, pred: list) -> dict:
    macro_f1  = f1_score(true, pred, labels=LABELS, average="macro",  zero_division=0)
    macro_pre = precision_score(true, pred, labels=LABELS, average="macro", zero_division=0)
    macro_rec = recall_score(true, pred, labels=LABELS, average="macro", zero_division=0)

    report = classification_report(
        true, pred, labels=LABELS, output_dict=True, zero_division=0
    )
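    # Passing labels=LABELS fixes the row/column order of the matrix, so both
    # models produce directly comparable confusion matrices.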
    cm = confusion_matrix(true, pred, labels=LABELS)

    print(f"\n{'='*50}")
    print(f"[{model_name}] 분류 리포트")
    print(classification_report(true, pred, labels=LABELS, zero_division=0))
    print(f"[{model_name}] Macro F1={macro_f1:.4f}  Pre={macro_pre:.4f}  Rec={macro_rec:.4f}")
    print(f"[{model_name}] Confusion Matrix:\n{cm}")

    return {
        "model":          model_name,
        "macro_f1":       round(macro_f1, 4),
        "macro_precision":round(macro_pre, 4),
        "macro_recall":   round(macro_rec, 4),
        "per_class":      {
            label: {
                "precision": round(report[label]["precision"], 4),
                "recall":    round(report[label]["recall"], 4),
                "f1":        round(report[label]["f1-score"], 4),
                "support":   report[label]["support"],
            }
            for label in LABELS if label in report
        },
        "confusion_matrix": cm.tolist(),
        "labels":           LABELS,
    }


def save_and_compare(simple_res: dict, kcelectra_res: dict) -> None:
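    # Each artifact is written twice: under a canonical name (overwritten each run)
    # and under a date-stamped name that serves as an archive copy.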
    ts = datetime.now().strftime("%Y%m%d")

    def _write_json(data: dict, canonical: str) -> None:
        with open(OUT_DIR / canonical, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        stem = canonical.replace(".json", "")
        with open(OUT_DIR / f"{stem}_{ts}.json", "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

    _write_json(simple_res, "eval_results_simple.json")
    if kcelectra_res:
        _write_json(kcelectra_res, "eval_results_kcelectra.json")

    rows = []
    for res in [simple_res, kcelectra_res]:
        if not res:
            continue
        row = {
            "model":          res["model"],
            "macro_f1":       res["macro_f1"],
            "macro_precision":res["macro_precision"],
            "macro_recall":   res["macro_recall"],
        }
        for label in LABELS:
            if label in res.get("per_class", {}):
                row[f"{label}_f1"] = res["per_class"][label]["f1"]
        rows.append(row)

    summary_df = pd.DataFrame(rows)
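    # utf-8-sig writes a BOM so that Excel detects the encoding and renders the
    # Korean headers correctly.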
    summary_df.to_csv(OUT_DIR / "eval_comparison_summary.csv", index=False, encoding="utf-8-sig")
    summary_df.to_csv(OUT_DIR / f"eval_comparison_summary_{ts}.csv", index=False, encoding="utf-8-sig")
    print(f"\n[compare] 결과 저장 완료 → {OUT_DIR}")
    print("\n── 성능 요약 ──")
    print(summary_df[["model", "macro_f1", "macro_precision", "macro_recall"]].to_string(index=False))

    if kcelectra_res:
        delta = kcelectra_res["macro_f1"] - simple_res["macro_f1"]
        print(f"\n KcELECTRA vs Simple  ΔMacro F1 = {delta:+.4f}")
        if delta >= 0.05:
            print("  → KcELECTRA gains ≥ 0.05 Macro F1 (5+ points): adoption recommended!")
        else:
            print("  → Gain below 0.05 Macro F1: consider keeping the Simple baseline")


def main(split: str = "test") -> None:
    print(f"[compare] 평가 split: {split}")

    simple_res     = evaluate_simple(split)
    kcelectra_res  = evaluate_kcelectra(split)

    save_and_compare(simple_res, kcelectra_res)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--split", default="test", choices=["train", "val", "test", "all"])
    args = parser.parse_args()
    main(split=args.split)