File size: 6,654 Bytes
7f105c8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
"""
evaluate_model.py
=================
Base 모델 vs v2 Fine-tuned vs v3 Fine-tuned 성능 비교

[실행 전 준비]
  1. test_data.jsonl 준비
       python scripts/export_predict_output.py  # 자동 생성
  2. 체크포인트 배치
       checkpoints/koelectra-binary/       ← 현재(v3) 모델
       checkpoints/koelectra-binary-v2/    ← 이전(v2) 모델 (선택)

[사용법]
  # Base vs v3
  python file/evaluate_model.py

  # Base vs v2 vs v3
  python file/evaluate_model.py --v2_model checkpoints/koelectra-binary-v2

  # 테스트 데이터 직접 지정
  python file/evaluate_model.py --test_data data/train/test_data.jsonl
"""

import argparse
import sys
from pathlib import Path

from sklearn.metrics import accuracy_score, classification_report, f1_score
from transformers import pipeline

# Switch stdout to UTF-8 when it reports some other encoding
# (presumably so the Korean text this script prints is displayed correctly).
if sys.stdout.encoding and sys.stdout.encoding.lower() != "utf-8":
    sys.stdout.reconfigure(encoding="utf-8")

# Directory holding this script, and its parent; the default paths below are
# all built from the parent directory.
_HERE = Path(__file__).resolve().parent
_ROOT = _HERE.parent

# Default JSONL evaluation set (used when --test_data is not given).
DEFAULT_TEST_DATA    = _ROOT / "data" / "train" / "test_data.jsonl"
# Model identifier evaluated as the "Base" row of the comparison.
BASE_MODEL_ID        = "monologg/koelectra-small-v3-discriminator"
# Default value of --v3_model.
V3_MODEL_PATH        = str(_ROOT / "checkpoints" / "koelectra-binary")
# NOTE(review): not referenced in the visible code; --v2_model defaults to None.
V2_MODEL_PATH        = str(_ROOT / "checkpoints" / "koelectra-binary-v2")

# Known label strings and the binary class index each one stands for
# (0 = noise, 1 = to-do). Both the Korean names and the generic
# ``LABEL_<n>`` names are accepted.
_LABEL_MAP = {
    "노이즈":  0,
    "할 일":   1,
    "LABEL_0": 0,
    "LABEL_1": 1,
}


def _parse_label(raw: str) -> int:
    """Convert a predicted label string into its integer class index.

    Labels listed in ``_LABEL_MAP`` are looked up directly. Any other label
    falls back to parsing the text after its last underscore as an integer
    (so ``"CLASS_5"`` yields ``5``).

    Args:
        raw: Label string to convert.

    Returns:
        The integer class index.

    Raises:
        ValueError: If the label is not in the map and has no integer suffix.
            The underlying parse error is attached as ``__cause__``.
    """
    mapped = _LABEL_MAP.get(raw)
    if mapped is not None:
        return mapped
    try:
        return int(raw.rsplit("_", 1)[-1])
    except ValueError as err:
        # Chain explicitly so the traceback shows which suffix failed to parse.
        raise ValueError(f"알 수 없는 라벨: {raw!r}") from err


def evaluate_model(
    model_path: str,
    test_texts: list[str],
    true_labels: list[int],
    model_name: str = "Model",
) -> tuple[float, float]:
    """Classify the test sentences with one model and print its metrics.

    Builds a ``text-classification`` pipeline from ``model_path``, predicts a
    label for every sentence, then prints accuracy, F1 and a per-class report.

    Args:
        model_path: Passed as both ``model`` and ``tokenizer`` to
            ``transformers.pipeline``.
        test_texts: Sentences to classify.
        true_labels: Gold class index for each sentence (0 = noise, 1 = to-do).
        model_name: Tag shown in the progress and result lines.

    Returns:
        ``(accuracy, f1)``, where F1 is computed for the positive class ``1``.
    """
    print(f"\n[{model_name}] 추론 중...")
    clf = pipeline(
        "text-classification",
        model=model_path,
        tokenizer=model_path,
        device=-1,
        truncation=True,
        max_length=128,
    )
    predictions = clf(test_texts, batch_size=16)
    pred_labels = [_parse_label(p["label"]) for p in predictions]

    acc = accuracy_score(true_labels, pred_labels)
    f1  = f1_score(true_labels, pred_labels, pos_label=1, zero_division=0)

    print(f"[{model_name}] 완료")
    print(f"  Accuracy : {acc * 100:.2f}%")
    print(f"  F1-Score : {f1:.4f}")
    print()
    print(classification_report(
        true_labels, pred_labels,
        # Pin both classes explicitly: without `labels`, the report infers the
        # classes from the data and rejects the two `target_names` whenever
        # only one class shows up in the gold and predicted labels combined.
        labels=[0, 1],
        target_names=["노이즈(0)", "할 일(1)"],
        digits=4,
        zero_division=0,
    ))
    return acc, f1


def load_test_data(path: Path) -> tuple[list[str], list[int]]:
    """Load evaluation sentences and binary labels from a JSONL file.

    Each non-blank line is parsed as JSON. Rows that are not JSON objects, or
    that lack a ``"text"`` or ``"is_todo"`` key, are skipped. ``"text"`` is
    converted with ``str`` and ``"is_todo"`` with ``int(bool(...))``.

    Args:
        path: Location of the JSONL file (UTF-8, with or without a BOM).

    Returns:
        ``(texts, labels)``: two lists of equal length, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message is prefixed with ``<path>:<line number>``.
    """
    import json  # kept local: json is not among this module's top-level imports

    texts: list[str] = []
    labels: list[int] = []
    # "utf-8-sig" reads plain UTF-8 and also drops a leading BOM, which
    # json.loads would otherwise reject. Iterating the file handle splits on
    # real newlines only, whereas str.splitlines() also breaks on characters
    # such as U+2028 that may legally appear unescaped inside a JSON string.
    with path.open(encoding="utf-8-sig") as handle:
        for lineno, line in enumerate(handle, start=1):
            if not line.strip():
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError as err:
                # Same exception type, but now pointing at the offending row.
                raise json.JSONDecodeError(
                    f"{path}:{lineno}: {err.msg}", err.doc, err.pos
                ) from err
            # A bare number, string or list is valid JSON but not a record.
            if not isinstance(obj, dict):
                continue
            if "text" not in obj or "is_todo" not in obj:
                continue
            texts.append(str(obj["text"]))
            labels.append(int(bool(obj["is_todo"])))
    return texts, labels


def main() -> None:
    """Evaluate the Base, optional v2 and v3 models and print a comparison.

    Parses the command line, loads the test set, runs ``evaluate_model`` for
    each model and prints a summary table. The last row of the table shows how
    much v3 changed relative to v2 when v2 was evaluated, otherwise relative
    to the Base model.

    Exits with status 1 when the test file does not exist or yields no usable
    samples. A ``--v2_model`` path that does not exist is reported on stderr
    and skipped.
    """
    parser = argparse.ArgumentParser(description="Base / v2 / v3 모델 성능 비교")
    parser.add_argument(
        "--test_data",
        type=Path,
        default=DEFAULT_TEST_DATA,
        help=f"테스트 JSONL 경로 (기본: {DEFAULT_TEST_DATA})",
    )
    parser.add_argument(
        "--v3_model",
        default=V3_MODEL_PATH,
        help=f"v3 Fine-tuned 모델 경로 (기본: {V3_MODEL_PATH})",
    )
    parser.add_argument(
        "--v2_model",
        default=None,
        help="v2 Fine-tuned 모델 경로 (없으면 Base vs v3 비교만 수행)",
    )
    args = parser.parse_args()

    if not args.test_data.exists():
        print(f"[오류] 테스트 파일이 없습니다: {args.test_data}", file=sys.stderr)
        print("  먼저 실행: python scripts/export_predict_output.py", file=sys.stderr)
        sys.exit(1)

    test_texts, true_labels = load_test_data(args.test_data)
    if not test_texts:
        # Nothing to score: stop before loading any model.
        print(f"[오류] 유효한 테스트 샘플이 없습니다: {args.test_data}", file=sys.stderr)
        sys.exit(1)

    todo_count = sum(true_labels)
    print(f"테스트 문장: {len(test_texts)}개")
    print(f"  할 일(1): {todo_count}개  "
          f"노이즈(0): {len(true_labels) - todo_count}개")

    # -- Base model ---------------------------------------------------------
    print("\n" + "=" * 60)
    print("Base 모델 (파인튜닝 전, 랜덤 가중치)")
    print("=" * 60)
    base_acc, base_f1 = evaluate_model(
        BASE_MODEL_ID, test_texts, true_labels, "Base"
    )

    # -- v2 fine-tuned (optional) -------------------------------------------
    v2_acc = v2_f1 = None
    if args.v2_model:
        v2_path = Path(args.v2_model)
        if not v2_path.exists():
            print(f"[경고] v2 모델 경로 없음: {v2_path} — 건너뜀", file=sys.stderr)
        else:
            print("=" * 60)
            print("v2 Fine-tuned 모델")
            print("=" * 60)
            v2_acc, v2_f1 = evaluate_model(
                str(v2_path), test_texts, true_labels, "v2 Fine-tuned"
            )

    # -- v3 fine-tuned ------------------------------------------------------
    print("=" * 60)
    print("v3 Fine-tuned 모델 (v3_dual_labeled_clean.jsonl 학습)")
    print("=" * 60)
    v3_acc, v3_f1 = evaluate_model(
        args.v3_model, test_texts, true_labels, "v3 Fine-tuned"
    )

    # -- Summary table ------------------------------------------------------
    print("\n" + "=" * 60)
    print("[성능 비교 요약]")
    print("=" * 60)
    print(f"{'모델':<22} {'Accuracy':>10} {'F1 (할 일)':>12}")
    print("-" * 48)
    print(f"{'Base 모델':<22} {base_acc*100:>9.2f}% {base_f1:>12.4f}")
    if v2_acc is not None:
        print(f"{'v2 Fine-tuned':<22} {v2_acc*100:>9.2f}% {v2_f1:>12.4f}")
    print(f"{'v3 Fine-tuned':<22} {v3_acc*100:>9.2f}% {v3_f1:>12.4f}")
    print("-" * 48)

    # Compare v3 against v2 when it was evaluated, otherwise against Base.
    ref_acc = v2_acc if v2_acc is not None else base_acc
    ref_f1  = v2_f1  if v2_f1  is not None else base_f1
    ref_name = "v2 대비" if v2_acc is not None else "Base 대비"
    delta_acc = (v3_acc - ref_acc) * 100
    delta_f1  = v3_f1 - ref_f1
    delta_label = f"v3 향상 폭 ({ref_name})"
    # The "+" format flag keeps the sign attached to the number, and the
    # widths match the 10- and 12-character columns of the rows above.
    print(f"{delta_label:<22} {delta_acc:>+8.2f}%p {delta_f1:>+12.4f}")
    print("=" * 60)


if __name__ == "__main__":
    main()