SillokBert-Scratch / scripts /4_evaluate_model.py
ddokbaro's picture
Upload 15 files
170de4d verified
# SillokBert-Scratch ν”„λ‘œμ νŠΈ 4단계: μ΅œμ’… λͺ¨λΈ 평가
# -----------------------------------------------------------------
# 3λ‹¨κ³„μ—μ„œ μ‚¬μ „ν•™μŠ΅μ„ 마친 μ΅œμ’… λͺ¨λΈμ˜ Perplexity(PPL)λ₯Ό κ³„μ‚°ν•©λ‹ˆλ‹€.
# -----------------------------------------------------------------
import os
import math
from pathlib import Path
from transformers import (
BertForMaskedLM,
PreTrainedTokenizerFast,
DataCollatorForLanguageModeling,
Trainer,
TrainingArguments,
)
from datasets import load_dataset, Dataset
from itertools import chain
def evaluate_sillok_bert():
    """Evaluate the pretrained SillokBert-Scratch model on the held-out corpus.

    Computes masked-LM perplexity (PPL) over fixed 512-token blocks of the
    test corpus and prints the result.

    Side effects:
        - Prints progress and the final eval loss / PPL to stdout.
        - The Trainer writes its artifacts under ``eval_output_dir``.
    """
    # --- Paths (hard-coded for this project's server layout) ---
    project_dir = Path("/home/work/baro/sillok/sillok_scratch_20250626")
    final_model_path = project_dir / "sillokbert_scratch_pretraining_output" / "final_model"
    test_dataset_file = "/home/work/baro/sillok25060103/preprocessed_corpus/test.txt"
    eval_output_dir = project_dir / "evaluation_output"
    print("--- 4. SillokBert-Scratch Model Evaluation ---")

    # --- Load the final pretraining checkpoint ---
    model = BertForMaskedLM.from_pretrained(final_model_path)
    tokenizer = PreTrainedTokenizerFast.from_pretrained(final_model_path)

    # --- Dataset preparation: tokenize, then pack into fixed-size blocks ---
    test_dataset = load_dataset('text', data_files={'test': test_dataset_file})
    block_size = 512

    def tokenize_function(examples):
        # No special tokens: lines are concatenated and re-chunked below, so
        # [CLS]/[SEP] would land at arbitrary positions anyway.
        return tokenizer(examples['text'], add_special_tokens=False,
                         return_special_tokens_mask=False)

    tokenized_datasets = test_dataset.map(
        tokenize_function, batched=True, num_proc=4, remove_columns=['text']
    )

    # Concatenate every line's tokens and split into contiguous 512-token
    # blocks, dropping the trailing remainder so all samples are full-length.
    all_input_ids = list(chain.from_iterable(tokenized_datasets['test']['input_ids']))
    total_length = (len(all_input_ids) // block_size) * block_size
    grouped_input_ids = [
        all_input_ids[i: i + block_size] for i in range(0, total_length, block_size)
    ]
    eval_dataset = Dataset.from_dict({'input_ids': grouped_input_ids})
    print(f"총 {len(eval_dataset)}개의 평가 μƒ˜ν”Œμ΄ μƒμ„±λ˜μ—ˆμŠ΅λ‹ˆλ‹€.")

    # --- Evaluation ---
    # BUG FIX: the model is a *masked* LM, so the collator must actually mask
    # tokens (mlm=True, standard 15% probability). With the previous
    # mlm=False, labels equal the unmasked inputs; a bidirectional encoder
    # sees every label token in its own input and "predicts" it trivially,
    # producing a meaningless near-1.0 perplexity.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15
    )
    training_args = TrainingArguments(
        output_dir=str(eval_output_dir),  # str() for older transformers versions
        per_device_eval_batch_size=8,
        fp16=True,
        seed=42,  # pins the random token masking so the PPL is reproducible
    )
    trainer = Trainer(
        model=model, args=training_args, data_collator=data_collator, eval_dataset=eval_dataset
    )
    metrics = trainer.evaluate()
    eval_loss = metrics["eval_loss"]
    # exp() overflows around loss ~709.8; report inf instead of crashing.
    try:
        perplexity = math.exp(eval_loss)
    except OverflowError:
        perplexity = float("inf")

    # --- Final results ---
    print("\n--- μ΅œμ’… 평가 κ²°κ³Ό ---")
    print(f" - μ΅œμ’… Eval Loss: {eval_loss:.4f}")
    print(f" - μ΅œμ’… Perplexity(PPL): {perplexity:.4f}")
    print("-" * 30)
# Script entry point: run the full evaluation only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    evaluate_sillok_bert()