# (extraction metadata removed: file size / commit hash / line-number index)
# SillokBert-Scratch Project, Step 4: Final Model Evaluation
# -----------------------------------------------------------------
# Computes the Perplexity (PPL) of the final model pretrained in Step 3.
# -----------------------------------------------------------------
import math
import os
from itertools import chain
from pathlib import Path

import torch
from datasets import Dataset, load_dataset
from transformers import (
    BertForMaskedLM,
    DataCollatorForLanguageModeling,
    PreTrainedTokenizerFast,
    Trainer,
    TrainingArguments,
)
def evaluate_sillok_bert():
    """Evaluate the pretrained SillokBert-Scratch model on the held-out test set.

    Loads the final pretrained checkpoint, tokenizes the test corpus into
    fixed-size blocks, runs a masked-LM evaluation pass, and prints the
    resulting eval loss and Perplexity (PPL = exp(loss)).

    Returns:
        None. Results are printed to stdout; Trainer artifacts go to
        ``eval_output_dir``.
    """
    # --- Paths (hard-coded for this project's environment) ---
    project_dir = Path("/home/work/baro/sillok/sillok_scratch_20250626")
    final_model_path = project_dir / "sillokbert_scratch_pretraining_output" / "final_model"
    test_dataset_file = "/home/work/baro/sillok25060103/preprocessed_corpus/test.txt"
    eval_output_dir = project_dir / "evaluation_output"

    print("--- 4. SillokBert-Scratch Model Evaluation ---")

    # --- Load model and tokenizer from the final pretraining checkpoint ---
    model = BertForMaskedLM.from_pretrained(final_model_path)
    tokenizer = PreTrainedTokenizerFast.from_pretrained(final_model_path)

    # --- Prepare the evaluation dataset ---
    test_dataset = load_dataset('text', data_files={'test': test_dataset_file})
    block_size = 512

    def tokenize_function(examples):
        # No special tokens: documents are concatenated and re-chunked below,
        # so per-line [CLS]/[SEP] would land mid-block.
        return tokenizer(examples['text'], add_special_tokens=False, return_special_tokens_mask=False)

    tokenized_datasets = test_dataset.map(
        tokenize_function, batched=True, num_proc=4, remove_columns=['text']
    )

    # Concatenate all token ids and slice into contiguous block_size chunks,
    # discarding the trailing remainder shorter than one block.
    all_input_ids = list(chain(*tokenized_datasets['test']['input_ids']))
    total_length = (len(all_input_ids) // block_size) * block_size
    grouped_input_ids = [
        all_input_ids[i : i + block_size] for i in range(0, total_length, block_size)
    ]
    eval_dataset = Dataset.from_dict({'input_ids': grouped_input_ids})
    print(f"Created {len(eval_dataset)} evaluation samples.")

    # --- Run evaluation ---
    # BUG FIX: the original passed mlm=False, which labels every token while
    # the bidirectional encoder can still attend to it — the loss (and hence
    # PPL) of a masked LM evaluated that way is meaningless. MLM perplexity
    # must be computed with random masking, matching pretraining.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15
    )
    training_args = TrainingArguments(
        output_dir=str(eval_output_dir),
        per_device_eval_batch_size=8,
        # fp16 requires CUDA; fall back to fp32 so the script also runs on CPU.
        fp16=torch.cuda.is_available(),
    )
    trainer = Trainer(
        model=model, args=training_args, data_collator=data_collator, eval_dataset=eval_dataset
    )
    metrics = trainer.evaluate()

    eval_loss = metrics["eval_loss"]
    # Guard against overflow: exp() of a large loss raises OverflowError.
    try:
        perplexity = math.exp(eval_loss)
    except OverflowError:
        perplexity = float("inf")

    # --- Report final results ---
    print("\n--- Final Evaluation Results ---")
    print(f" - Final Eval Loss: {eval_loss:.4f}")
    print(f" - Final Perplexity (PPL): {perplexity:.4f}")
    print("-" * 30)
# Run the evaluation only when executed as a script, not on import.
if __name__ == "__main__":
    evaluate_sillok_bert()