File size: 2,139 Bytes
170de4d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# SillokBert-Scratch project, stage 2: model architecture definition
# -----------------------------------------------------------------
# Defines the skeleton of a from-scratch (not pre-trained) BERT model based on
# the tokenizer produced in stage 1, and prints its details for inspection.
# -----------------------------------------------------------------
from pathlib import Path
from transformers import BertConfig, BertForMaskedLM, PreTrainedTokenizerFast

def define_sillok_bert_architecture(
    project_dir: "Path | str | None" = None,
) -> "BertForMaskedLM":
    """Build a randomly initialised BERT masked-LM sized for the Sillok tokenizer.

    Loads the BPE tokenizer from
    ``<project_dir>/sillok_tokenizer_bpe_preprocessed/tokenizer.json``, builds a
    ``BertConfig`` with BERT-base dimensions whose vocabulary size and PAD id
    come from that tokenizer, instantiates ``BertForMaskedLM`` from the config
    alone (no pre-trained weights are loaded), and prints the vocabulary size,
    PAD id, configuration and total parameter count.

    Args:
        project_dir: Root directory of the project. When omitted, the
            original hard-coded location is used.

    Returns:
        The newly created ``BertForMaskedLM`` instance.
    """
    # --- Path setup ---
    if project_dir is None:
        project_dir = "/home/work/baro/sillok/sillok_scratch_20250626"
    tokenizer_dir = Path(project_dir) / "sillok_tokenizer_bpe_preprocessed"
    tokenizer_file = tokenizer_dir / "tokenizer.json"

    print("--- 2. SillokBert Model Architecture Definition ---")

    # --- Load the tokenizer and make sure it has a PAD token ---
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=str(tokenizer_file))
    if tokenizer.pad_token is None:
        # add_special_tokens registers '[PAD]' as the pad token and, when the
        # token is missing from the vocabulary, appends it so pad_token_id
        # resolves to a real id instead of None / the unknown-token id.
        tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    # len(tokenizer) includes added tokens; tokenizer.vocab_size reports only
    # the base vocabulary, which would under-size the embedding matrix if any
    # token (such as the PAD fallback above) lives outside it.
    vocab_size = len(tokenizer)

    # User-facing messages below are in Korean.
    print(f"로드된 어휘집 크기 (vocab_size): {vocab_size}")
    print(f"PAD 토큰 ID: {tokenizer.pad_token_id}")

    # --- Model configuration (BERT-base dimensions) ---
    config = BertConfig(
        vocab_size=vocab_size,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        max_position_embeddings=512,
        pad_token_id=tokenizer.pad_token_id,
    )
    print("\n생성된 BERT 모델 설정 (Configuration):")
    print(config)

    # --- Instantiate the model from the config only and report its size ---
    model = BertForMaskedLM(config=config)
    print("\n✅ 'From Scratch' BERT 모델이 성공적으로 생성되었습니다.")
    print(f"모델의 총 파라미터 수: {model.num_parameters():,}")

    return model

# Script entry point: build the model and print its summary only when this
# file is executed directly, not when it is imported as a module.
if __name__ == "__main__":
    define_sillok_bert_architecture()