SillokBert-Scratch / scripts /2_define_architecture.py
ddokbaro's picture
Upload 15 files
170de4d verified
# SillokBert-Scratch project, step 2: model architecture definition
# -----------------------------------------------------------------
# Builds the skeleton of a BERT model that is not pretrained (from scratch),
# based on the tokenizer produced in step 1, and prints its details.
# -----------------------------------------------------------------
from pathlib import Path
from transformers import BertConfig, BertForMaskedLM, PreTrainedTokenizerFast
def define_sillok_bert_architecture(project_dir=None):
    """
    Define an untrained ("from scratch") BERT-base model and print its details.

    The model is built from a ``BertConfig`` only (no pretrained weights are
    loaded) and its vocabulary size is taken from the custom BPE tokenizer
    stored under the project directory.

    Args:
        project_dir: Directory containing
            ``sillok_tokenizer_bpe_preprocessed/tokenizer.json``. Accepts a
            ``str`` or ``Path``. When omitted, the original hard-coded
            project location is used.

    Returns:
        The newly constructed ``BertForMaskedLM`` instance.
    """
    # --- Path setup ---
    if project_dir is None:
        project_dir = "/home/work/baro/sillok/sillok_scratch_20250626"
    project_dir = Path(project_dir)
    tokenizer_dir = project_dir / "sillok_tokenizer_bpe_preprocessed"
    tokenizer_file = tokenizer_dir / "tokenizer.json"

    print("--- 2. SillokBert Model Architecture Definition ---")

    # --- Load the tokenizer and make sure a PAD token is set ---
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=str(tokenizer_file))
    if tokenizer.pad_token is None:
        tokenizer.pad_token = '[PAD]'

    # len(tokenizer) counts the full vocabulary including added tokens,
    # whereas tokenizer.vocab_size reports only the base vocabulary. The
    # embedding matrix must cover every ID the tokenizer can emit.
    vocab_size = len(tokenizer)
    print(f"๋กœ๋“œ๋œ ์–ดํœ˜์ง‘ ํฌ๊ธฐ (vocab_size): {vocab_size}")
    print(f"PAD ํ† ํฐ ID: {tokenizer.pad_token_id}")

    # --- Model configuration (BERT-base dimensions) ---
    config = BertConfig(
        vocab_size=vocab_size,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        max_position_embeddings=512,
        pad_token_id=tokenizer.pad_token_id,
    )
    print("\n์ƒ์„ฑ๋œ BERT ๋ชจ๋ธ ์„ค์ • (Configuration):")
    print(config)

    # --- Build the model from the config and report its size ---
    model = BertForMaskedLM(config=config)
    print("\nโœ… 'From Scratch' BERT ๋ชจ๋ธ์ด ์„ฑ๊ณต์ ์œผ๋กœ ์ƒ์„ฑ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.")
    print(f"๋ชจ๋ธ์˜ ์ด ํŒŒ๋ผ๋ฏธํ„ฐ ์ˆ˜: {model.num_parameters():,}")

    return model
# Script entry point: build the model and print its details only when this
# file is executed directly, not when it is imported.
if __name__ == "__main__":
    define_sillok_bert_architecture()