# SillokBert-Scratch Project, Step 5: Baseline Model Evaluation
# -----------------------------------------------------------------
# Computes the Perplexity (PPL) of the general-purpose BERT model
# bert-base-multilingual-cased, with no additional training, on the
# Sillok test corpus.
# -----------------------------------------------------------------
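# For a masked LM, the PPL reported here is a pseudo-perplexity:
# PPL = exp(eval_loss), where eval_loss is the mean cross-entropy over the
# masked positions; lower is better. This score is the baseline against
# which the from-scratch SillokBert model will be compared.
# -----------------------------------------------------------------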
import math
from pathlib import Path
from transformers import (
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset, Dataset
from itertools import chain

def evaluate_baseline_bert():
    """bert-base-multilingual-cased ๋ชจ๋ธ์˜ ์„ฑ๋Šฅ(Perplexity)๋ฅผ ํ‰๊ฐ€ํ•ฉ๋‹ˆ๋‹ค."""
    # --- ๋ชจ๋ธ ๋ฐ ๊ฒฝ๋กœ ์„ค์ • ---
    baseline_model_name = "bert-base-multilingual-cased"
    test_dataset_file = "/home/work/baro/sillok25060103/preprocessed_corpus/test.txt"
    eval_output_dir = Path("./baseline_evaluation_output")

    print("--- 5. Baseline BERT Model Evaluation ---")
    
    # --- ๋ชจ๋ธ ๋ฐ ํ† ํฌ๋‚˜์ด์ € ๋กœ๋“œ ---
    model = AutoModelForMaskedLM.from_pretrained(baseline_model_name)
    tokenizer = AutoTokenizer.from_pretrained(baseline_model_name)

    # --- ๋ฐ์ดํ„ฐ์…‹ ์ค€๋น„ ---
    test_dataset = load_dataset('text', data_files={'test': test_dataset_file})
    block_size = 512  # matches BERT's 512-token maximum input length

    def tokenize_function(examples):
        # Special tokens are omitted here because the lines are concatenated
        # and re-chunked into fixed-size blocks below.
        return tokenizer(examples['text'], add_special_tokens=False, return_special_tokens_mask=False)

    tokenized_datasets = test_dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=['text'])
    
    # Concatenate all token IDs and split them into fixed-size blocks,
    # dropping the tail that does not fill a complete block.
    all_input_ids = list(chain(*tokenized_datasets['test']['input_ids']))
    total_length = len(all_input_ids)
    total_length = (total_length // block_size) * block_size
    grouped_input_ids = [all_input_ids[i : i + block_size] for i in range(0, total_length, block_size)]
    eval_dataset = Dataset.from_dict({'input_ids': grouped_input_ids})
    print(f"Created {len(eval_dataset)} evaluation samples in total.")

    # --- Run evaluation ---
    # mlm=True applies BERT's standard masking (15% of tokens) so the loss,
    # and hence the perplexity, is measured over masked positions; mlm=False
    # would label every unmasked token and make the task trivial for an MLM.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
    training_args = TrainingArguments(
        output_dir=str(eval_output_dir),
        per_device_eval_batch_size=8,
        fp16=True,  # requires a CUDA GPU; set to False when evaluating on CPU
    )

    trainer = Trainer(
        model=model, args=training_args, data_collator=data_collator, eval_dataset=eval_dataset
    )
    
    metrics = trainer.evaluate()
    eval_loss = metrics["eval_loss"]
    perplexity = math.exp(eval_loss)  # PPL = exp(mean cross-entropy loss)

    # --- Report final results ---
    print(f"\n--- {baseline_model_name} Evaluation Results ---")
    print(f"  - Final Eval Loss: {eval_loss:.4f}")
    print(f"  - Final Perplexity (PPL): {perplexity:.4f}")
    print("-" * 40)

if __name__ == "__main__":
    evaluate_baseline_bert()
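
# Usage sketch (the filename below is an assumption; adjust to your setup):
#   python step5_baseline_eval.py
# The script expects the preprocessed test corpus at the test_dataset_file
# path above and writes Trainer artifacts to ./baseline_evaluation_output.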