File size: 2,139 Bytes
170de4d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 |
# SillokBert-Scratch Project, Stage 2: Model Architecture Definition
# -----------------------------------------------------------------
# Based on the tokenizer produced in Stage 1, define the skeleton of a
# BERT model with no pretrained weights ("from scratch") and print its
# configuration and parameter count for inspection.
# -----------------------------------------------------------------
from pathlib import Path
from transformers import BertConfig, BertForMaskedLM, PreTrainedTokenizerFast
def define_sillok_bert_architecture():
    """Define an untrained ("from scratch") BERT-base MLM model sized to the
    custom BPE tokenizer's vocabulary, and print its configuration info.

    Loads the Stage-1 tokenizer from disk, guarantees a PAD token exists,
    builds a standard BERT-base ``BertConfig`` whose embedding table covers
    the *full* vocabulary (base vocab + added special tokens), and
    instantiates a randomly initialized ``BertForMaskedLM``.

    Returns:
        tuple: ``(model, config, tokenizer)`` so callers can reuse the
        pieces directly. (Previously returned ``None``; the added return
        value is backward-compatible.)
    """
    # --- Path setup (hard-coded project layout on the training host) ---
    project_dir = Path("/home/work/baro/sillok/sillok_scratch_20250626")
    tokenizer_dir = project_dir / "sillok_tokenizer_bpe_preprocessed"
    tokenizer_file = tokenizer_dir / "tokenizer.json"

    print("--- 2. SillokBert Model Architecture Definition ---")

    # --- Load tokenizer and ensure a PAD token is configured ---
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=str(tokenizer_file))
    if tokenizer.pad_token is None:
        # If '[PAD]' is absent from the trained vocab, this registers it as
        # an *added* special token with a brand-new id.
        tokenizer.pad_token = '[PAD]'

    # BUG FIX: `tokenizer.vocab_size` counts only the base vocabulary and
    # excludes added special tokens. A freshly added '[PAD]' would then get
    # an id >= vocab_size, and the embedding table built from this config
    # would index out of range at train time. `len(tokenizer)` includes
    # added tokens, so the embedding matrix always covers pad_token_id.
    vocab_size = len(tokenizer)
    print(f"๋ก๋๋ ์ดํ์ง ํฌ๊ธฐ (vocab_size): {vocab_size}")
    print(f"PAD ํ ํฐ ID: {tokenizer.pad_token_id}")

    # --- Model configuration: standard BERT-base hyperparameters ---
    config = BertConfig(
        vocab_size=vocab_size,
        hidden_size=768,                 # BERT-base hidden width
        num_hidden_layers=12,            # BERT-base depth
        num_attention_heads=12,
        intermediate_size=3072,          # 4 * hidden_size FFN
        max_position_embeddings=512,
        pad_token_id=tokenizer.pad_token_id,
    )
    print("\n์์ฑ๋ BERT ๋ชจ๋ธ ์ค์ (Configuration):")
    print(config)

    # --- Instantiate the model with random (untrained) weights ---
    model = BertForMaskedLM(config=config)
    print("\nโ 'From Scratch' BERT ๋ชจ๋ธ์ด ์ฑ๊ณต์ ์ผ๋ก ์์ฑ๋์์ต๋๋ค.")
    print(f"๋ชจ๋ธ์ ์ด ํ๋ผ๋ฏธํฐ ์: {model.num_parameters():,}")

    return model, config, tokenizer
# Script entry point: build and inspect the model only when run directly,
# so importing this module for its function has no side effects.
if __name__ == "__main__":
    define_sillok_bert_architecture()
|