# SillokBert-Scratch Project, Step 5: Baseline Model Evaluation
# -----------------------------------------------------------------
# Computes the Perplexity (PPL) of a general-purpose BERT model
# (bert-base-multilingual-cased) that has had no training on the
# Sillok corpus.
# -----------------------------------------------------------------
import math
from itertools import chain
from pathlib import Path

from datasets import load_dataset, Dataset
from transformers import (
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)


def evaluate_baseline_bert():
    """Evaluate the performance (Perplexity) of bert-base-multilingual-cased."""
    # --- Model and path configuration ---
    baseline_model_name = "bert-base-multilingual-cased"
    test_dataset_file = "/home/work/baro/sillok25060103/preprocessed_corpus/test.txt"
    eval_output_dir = Path("./baseline_evaluation_output")

    print("--- 5. Baseline BERT Model Evaluation ---")

    # --- Load model and tokenizer ---
    model = AutoModelForMaskedLM.from_pretrained(baseline_model_name)
    tokenizer = AutoTokenizer.from_pretrained(baseline_model_name)
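
    # Context note (assumption based on the project name and this step's header):
    # mBERT was pretrained on Wikipedia text in 104 languages and has never seen
    # the Sillok corpus, which is what makes it a useful "no domain training"
    # baseline for the model trained from scratch in the earlier steps.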

    # --- Dataset preparation ---
    test_dataset = load_dataset('text', data_files={'test': test_dataset_file})
    block_size = 512  # maximum sequence length for BERT

    def tokenize_function(examples):
        return tokenizer(examples['text'], add_special_tokens=False, return_special_tokens_mask=False)

    tokenized_datasets = test_dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=['text'])

    # Concatenate every tokenized line into one token stream, then cut it into
    # fixed-size blocks; the trailing remainder shorter than block_size is dropped.
    all_input_ids = list(chain(*tokenized_datasets['test']['input_ids']))
    total_length = len(all_input_ids)
    total_length = (total_length // block_size) * block_size
    grouped_input_ids = [all_input_ids[i : i + block_size] for i in range(0, total_length, block_size)]
    eval_dataset = Dataset.from_dict({'input_ids': grouped_input_ids})
    print(f"Created {len(eval_dataset)} evaluation samples in total.")
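
    # A quick worked example of the chunking above (numbers are illustrative,
    # not from the actual corpus): with 1,000,000 concatenated token ids and
    # block_size = 512, total_length becomes (1_000_000 // 512) * 512 = 999_936,
    # yielding 1,953 blocks of 512 tokens; the final 64 tokens are discarded.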

    # --- Run evaluation ---
    # mlm=True masks tokens at random (15% here) so the model is scored only on
    # predicting tokens it cannot see; with mlm=False the collator would copy
    # input_ids into labels and BERT would trivially "predict" tokens that are
    # visible to it, making the loss meaningless for a masked LM.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
    training_args = TrainingArguments(output_dir=str(eval_output_dir), per_device_eval_batch_size=8, fp16=True)
    trainer = Trainer(
        model=model, args=training_args, data_collator=data_collator, eval_dataset=eval_dataset
    )
    metrics = trainer.evaluate()
    eval_loss = metrics["eval_loss"]
    perplexity = math.exp(eval_loss)
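
    # Perplexity is exp() of the mean cross-entropy over the masked positions:
    # PPL = exp(eval_loss). Note this is an MLM "pseudo-perplexity" computed on
    # randomly masked tokens, so it is comparable across MLM checkpoints
    # evaluated the same way, but not directly against causal-LM perplexities.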

    # --- Report final results ---
    print(f"\n--- {baseline_model_name} Evaluation Results ---")
    print(f" - Final Eval Loss: {eval_loss:.4f}")
    print(f" - Final Perplexity (PPL): {perplexity:.4f}")
    print("-" * 40)

if __name__ == "__main__":
    evaluate_baseline_bert()
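    # Note: because DataCollatorForLanguageModeling samples masks randomly,
    # eval_loss (and hence PPL) varies slightly between runs. For a reproducible
    # baseline number, call transformers.set_seed(42) (the seed value is an
    # arbitrary choice) before trainer.evaluate().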