import math
from itertools import chain
from pathlib import Path

from datasets import Dataset, load_dataset
from transformers import (
    BertForMaskedLM,
    DataCollatorForLanguageModeling,
    PreTrainedTokenizerFast,
    Trainer,
    TrainingArguments,
)


def evaluate_sillok_bert():
    """Evaluates the performance (perplexity) of the pretrained SillokBert-Scratch model."""

    # Paths to the pretrained model, the held-out test corpus, and the evaluation output.
    project_dir = Path("/home/work/baro/sillok/sillok_scratch_20250626")
    final_model_path = project_dir / "sillokbert_scratch_pretraining_output" / "final_model"
    test_dataset_file = "/home/work/baro/sillok25060103/preprocessed_corpus/test.txt"
    eval_output_dir = project_dir / "evaluation_output"
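    # Note: these absolute paths reflect the original training environment;
    # point them at your own checkpoint and corpus locations when reproducing.
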
print("--- 4. SillokBert-Scratch Model Evaluation ---")
|
|
|
|
|
|
|
|
|
    # Load the pretrained model and its tokenizer from the final checkpoint.
    model = BertForMaskedLM.from_pretrained(final_model_path)
    tokenizer = PreTrainedTokenizerFast.from_pretrained(final_model_path)

    # Load the raw test corpus; the 'text' loader yields one example per line.
    test_dataset = load_dataset('text', data_files={'test': test_dataset_file})
    block_size = 512

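    # Tokenizing without special tokens lets all documents be concatenated into
    # one continuous token stream before block splitting, so no [CLS]/[SEP]
    # markers end up in the middle of an evaluation block.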
    def tokenize_function(examples):
        return tokenizer(examples['text'], add_special_tokens=False, return_special_tokens_mask=False)

    tokenized_datasets = test_dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=['text'])

    # Concatenate all token ids and regroup them into fixed-size blocks; the
    # trailing remainder shorter than block_size is dropped.
    all_input_ids = list(chain(*tokenized_datasets['test']['input_ids']))
    total_length = len(all_input_ids)
    total_length = (total_length // block_size) * block_size
    grouped_input_ids = [all_input_ids[i : i + block_size] for i in range(0, total_length, block_size)]
    eval_dataset = Dataset.from_dict({'input_ids': grouped_input_ids})
    print(f"Created {len(eval_dataset)} evaluation samples in total.")

    # For masked-LM perplexity the collator must actually mask inputs: with
    # mlm=False it would emit causal-LM labels and BertForMaskedLM would score
    # each token against its own visible copy. mlm_probability=0.15 is the
    # library default, stated explicitly here.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
    training_args = TrainingArguments(output_dir=str(eval_output_dir), per_device_eval_batch_size=8, fp16=True)
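    # fp16=True assumes a CUDA device; Trainer rejects half-precision evaluation
    # on CPU, so drop the flag (or use bf16 on supported hardware) if needed.
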
    trainer = Trainer(
        model=model, args=training_args, data_collator=data_collator, eval_dataset=eval_dataset
    )
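    # Trainer is used here purely as an evaluation harness: no train_dataset is
    # supplied, and evaluate() only runs the collator and forward passes over
    # eval_dataset.
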
    metrics = trainer.evaluate()
    eval_loss = metrics["eval_loss"]
    perplexity = math.exp(eval_loss)
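    # Since masking is random, exp(eval_loss) is a pseudo-perplexity over the
    # masked positions and fluctuates slightly between runs; calling
    # transformers.set_seed() beforehand makes the number reproducible.
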
print("\n--- μ΅μ’
νκ° κ²°κ³Ό ---")
|
|
|
print(f" - μ΅μ’
Eval Loss: {eval_loss:.4f}")
|
|
|
print(f" - μ΅μ’
Perplexity(PPL): {perplexity:.4f}")
|
|
|
print("-" * 30)
|
|
|
|
|
|
if __name__ == "__main__":
    evaluate_sillok_bert()