# schoolbridge/model/extraction/file/evaluate_model.py
# tsKim
# feat: schoolbridge spaces deploy (extract-text endpoint added)
# 7f105c8
"""
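evaluate_model.py
=================
Compare the performance of the Base model, the v2 fine-tuned model and the
v3 fine-tuned model.

[Before running]
  1. Prepare test_data.jsonl
       python scripts/export_predict_output.py   # generated automatically
  2. Place the checkpoints
       checkpoints/koelectra-binary/     ← current (v3) model
       checkpoints/koelectra-binary-v2/  ← previous (v2) model (optional)

[Usage]
  (Run from the directory that contains ``file/``. Default paths are resolved
  relative to this script; paths given on the command line are resolved
  relative to the current working directory.)

  # Base vs v3
  python file/evaluate_model.py

  # Base vs v2 vs v3
  python file/evaluate_model.py --v2_model checkpoints/koelectra-binary-v2

  # Specify the test data explicitly
  python file/evaluate_model.py --test_data data/train/test_data.jsonl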
"""
import argparse
import sys
from pathlib import Path
from sklearn.metrics import accuracy_score, classification_report, f1_score
from transformers import pipeline
if sys.stdout.encoding and sys.stdout.encoding.lower() != "utf-8":
sys.stdout.reconfigure(encoding="utf-8")
# Directory containing this script, and the directory one level above it,
# which holds the data/ and checkpoints/ folders referenced below.
_HERE = Path(__file__).resolve().parents[0]
_ROOT = _HERE.parents[0]

# Hub identifier of the model that is not fine-tuned by this project.
BASE_MODEL_ID = "monologg/koelectra-small-v3-discriminator"

# Default evaluation set (JSONL).
DEFAULT_TEST_DATA = _ROOT.joinpath("data", "train", "test_data.jsonl")

# Default checkpoint locations, kept as plain strings.
V3_MODEL_PATH, V2_MODEL_PATH = (
    str(_ROOT.joinpath("checkpoints", checkpoint_name))
    for checkpoint_name in ("koelectra-binary", "koelectra-binary-v2")
)

# Label string -> class id (0 = noise, 1 = to-do).
# NOTE(review): the LABEL_<n> entries are presumably the generic names a
# checkpoint emits when it has no custom id2label mapping — confirm.
_LABEL_MAP = {
    "노이즈": 0,
    "할 일": 1,
    **{f"LABEL_{class_id}": class_id for class_id in (0, 1)},
}
def _parse_label(raw: str) -> int:
    """Convert a predicted label string into its integer class id.

    Args:
        raw: Label string as reported by the classifier.

    Returns:
        The id registered in ``_LABEL_MAP`` when *raw* is a known label;
        otherwise the integer that follows the last underscore in *raw*
        (so ``"LABEL_3"`` yields ``3`` and a bare ``"1"`` yields ``1``).

    Raises:
        ValueError: If *raw* is neither a known label nor ends in an integer.
            The underlying conversion error is attached as ``__cause__``.
    """
    known = _LABEL_MAP.get(raw)
    if known is not None:
        return known
    try:
        return int(raw.split("_")[-1])
    except ValueError as err:
        # Chain explicitly so the traceback reads as a deliberate
        # translation of the failure, not an error raised while handling one.
        raise ValueError(f"알 수 없는 라벨: {raw!r}") from err
def evaluate_model(
    model_path: str,
    test_texts: list[str],
    true_labels: list[int],
    model_name: str = "Model",
    *,
    batch_size: int = 16,
    max_length: int = 128,
    device: int = -1,
) -> tuple[float, float]:
    """Run one text-classification model over the test set and report metrics.

    Builds a ``text-classification`` pipeline from *model_path* (also used as
    the tokenizer), predicts every text, converts the predicted label strings
    with ``_parse_label`` and prints accuracy, F1 for class 1 and a per-class
    classification report.

    Args:
        model_path: Model identifier or checkpoint directory handed to
            ``pipeline``.
        test_texts: Sentences to classify.
        true_labels: Gold class ids (0 or 1), aligned with *test_texts*.
        model_name: Display name used in the printed output.
        batch_size: Batch size used for inference.
        max_length: Truncation length passed to the pipeline.
        device: Device passed to the pipeline (``-1`` is the original default).

    Returns:
        ``(accuracy, f1)`` where F1 is computed for the positive class 1.

    Raises:
        ValueError: If *test_texts* is empty or its length differs from
            *true_labels*. Checked before the model is loaded so the failure
            is immediate.
    """
    if not test_texts:
        raise ValueError("평가할 테스트 문장이 없습니다.")
    if len(test_texts) != len(true_labels):
        raise ValueError(
            f"문장 수({len(test_texts)})와 라벨 수({len(true_labels)})가 "
            "일치하지 않습니다."
        )

    print(f"\n[{model_name}] 추론 중...")
    clf = pipeline(
        "text-classification",
        model=model_path,
        tokenizer=model_path,
        device=device,
        truncation=True,
        max_length=max_length,
    )
    predictions = clf(test_texts, batch_size=batch_size)
    pred_labels = [_parse_label(p["label"]) for p in predictions]

    acc = accuracy_score(true_labels, pred_labels)
    f1 = f1_score(true_labels, pred_labels, pos_label=1, zero_division=0)

    print(f"[{model_name}] 완료")
    print(f" Accuracy : {acc * 100:.2f}%")
    print(f" F1-Score : {f1:.4f}")
    print()
    # Pin labels=[0, 1]: without it the report infers classes from the data
    # and raises when only one class occurs, because two target_names are
    # supplied.
    print(classification_report(
        true_labels, pred_labels,
        labels=[0, 1],
        target_names=["노이즈(0)", "할 일(1)"],
        digits=4,
        zero_division=0,
    ))
    return acc, f1
def load_test_data(path: Path) -> tuple[list[str], list[int]]:
    """Load evaluation sentences and binary labels from a JSONL file.

    Each non-blank line must be a JSON value. Only JSON objects that contain
    both a ``"text"`` and an ``"is_todo"`` key are used; everything else is
    skipped silently.

    Args:
        path: Location of the UTF-8 encoded JSONL file. A leading BOM is
            tolerated.

    Returns:
        ``(texts, labels)`` as two parallel lists. Texts are coerced with
        ``str()``; labels are ``int(bool(is_todo))``, so any truthy value
        (including a non-empty string such as ``"0"``) becomes ``1``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message names the file and the 1-based line number.
    """
    import json

    texts: list[str] = []
    labels: list[int] = []
    # Iterate the file object instead of str.splitlines(): splitlines() also
    # breaks on U+2028/U+2029/U+0085, which may appear unescaped inside JSON
    # strings and would cut a valid record in half.
    # "utf-8-sig" reads plain UTF-8 as well and only drops a leading BOM.
    with path.open(encoding="utf-8-sig") as handle:
        for line_no, line in enumerate(handle, start=1):
            if not line.strip():
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError as err:
                raise json.JSONDecodeError(
                    f"{err.msg} (file {path}, line {line_no})", err.doc, err.pos
                ) from err
            if not isinstance(record, dict):
                continue
            if "text" not in record or "is_todo" not in record:
                continue
            texts.append(str(record["text"]))
            labels.append(int(bool(record["is_todo"])))
    return texts, labels
def _print_banner(title: str, *, leading_blank: bool = False) -> None:
    """Print *title* framed by two 60-character ``=`` rules."""
    rule = "=" * 60
    print(("\n" if leading_blank else "") + rule)
    print(title)
    print(rule)


def main() -> None:
    """Command-line entry point: evaluate Base, optional v2 and v3 models.

    Parses ``--test_data``, ``--v3_model`` and ``--v2_model``, loads the test
    set, evaluates each model with ``evaluate_model`` and prints a summary
    table followed by the v3 improvement over v2 (when v2 was evaluated) or
    over the Base model.

    Exits with status 1 when the test file is missing or contains no usable
    samples. A non-existent ``--v2_model`` path only produces a warning and
    that model is skipped.
    """
    parser = argparse.ArgumentParser(description="Base / v2 / v3 모델 성능 비교")
    parser.add_argument(
        "--test_data",
        type=Path,
        default=DEFAULT_TEST_DATA,
        help=f"테스트 JSONL 경로 (기본: {DEFAULT_TEST_DATA})",
    )
    parser.add_argument(
        "--v3_model",
        default=V3_MODEL_PATH,
        help=f"v3 Fine-tuned 모델 경로 (기본: {V3_MODEL_PATH})",
    )
    parser.add_argument(
        "--v2_model",
        default=None,
        help="v2 Fine-tuned 모델 경로 (없으면 Base vs v3 비교만 수행)",
    )
    args = parser.parse_args()

    # is_file() rather than exists(): a directory at this path cannot be read
    # as a JSONL file either.
    if not args.test_data.is_file():
        print(f"[오류] 테스트 파일이 없습니다: {args.test_data}", file=sys.stderr)
        print(" 먼저 실행: python scripts/export_predict_output.py", file=sys.stderr)
        sys.exit(1)

    test_texts, true_labels = load_test_data(args.test_data)
    if not test_texts:
        # Stop before any model is loaded; metrics on an empty set are
        # meaningless.
        print(
            f"[오류] 유효한 테스트 샘플이 없습니다: {args.test_data}",
            file=sys.stderr,
        )
        sys.exit(1)

    positives = sum(true_labels)
    print(f"테스트 문장: {len(test_texts)}개")
    print(f" 할 일(1): {positives}개 "
          f"노이즈(0): {len(true_labels) - positives}개")

    # -- Base model ----------------------------------------------------------
    _print_banner("Base 모델 (파인튜닝 전, 랜덤 가중치)", leading_blank=True)
    base_acc, base_f1 = evaluate_model(
        BASE_MODEL_ID, test_texts, true_labels, "Base"
    )

    # -- v2 fine-tuned (optional) --------------------------------------------
    v2_acc = v2_f1 = None
    if args.v2_model:
        v2_path = Path(args.v2_model)
        if not v2_path.exists():
            print(f"[경고] v2 모델 경로 없음: {v2_path} — 건너뜀", file=sys.stderr)
        else:
            _print_banner("v2 Fine-tuned 모델")
            v2_acc, v2_f1 = evaluate_model(
                str(v2_path), test_texts, true_labels, "v2 Fine-tuned"
            )

    # -- v3 fine-tuned -------------------------------------------------------
    _print_banner("v3 Fine-tuned 모델 (v3_dual_labeled_clean.jsonl 학습)")
    v3_acc, v3_f1 = evaluate_model(
        args.v3_model, test_texts, true_labels, "v3 Fine-tuned"
    )

    # -- Comparison summary --------------------------------------------------
    _print_banner("[성능 비교 요약]", leading_blank=True)
    print(f"{'모델':<22} {'Accuracy':>10} {'F1 (할 일)':>12}")
    print("-" * 48)
    print(f"{'Base 모델':<22} {base_acc*100:>9.2f}% {base_f1:>12.4f}")
    if v2_acc is not None:
        print(f"{'v2 Fine-tuned':<22} {v2_acc*100:>9.2f}% {v2_f1:>12.4f}")
    print(f"{'v3 Fine-tuned':<22} {v3_acc*100:>9.2f}% {v3_f1:>12.4f}")
    print("-" * 48)

    # Compare v3 against v2 when it was evaluated, otherwise against Base.
    if v2_acc is not None:
        ref_acc, ref_f1, ref_name = v2_acc, v2_f1, "v2 대비"
    else:
        ref_acc, ref_f1, ref_name = base_acc, base_f1, "Base 대비"
    delta_acc = (v3_acc - ref_acc) * 100
    delta_f1 = v3_f1 - ref_f1
    delta_label = f"v3 향상 폭 ({ref_name})"
    # The "+" flag keeps the sign next to the digits and the width constant;
    # a sign emitted before the padded number left a gap and shifted columns.
    print(f"{delta_label:<22} {delta_acc:>+9.2f}%p {delta_f1:>+11.4f}")
    print("=" * 60)


if __name__ == "__main__":
    main()