|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from pathlib import Path
|
|
|
from transformers import BertConfig, BertForMaskedLM, PreTrainedTokenizerFast
|
|
|
|
|
|
def define_sillok_bert_architecture():
    """Define an empty (untrained) BERT-base model sized to a custom BPE tokenizer.

    Loads the Sillok BPE tokenizer from disk, builds a BERT-base
    configuration (12 layers, 768 hidden units, 12 attention heads) whose
    vocabulary size matches the tokenizer, instantiates a randomly
    initialized ``BertForMaskedLM`` from it (no pre-trained weights), and
    prints a summary.

    Returns:
        tuple: ``(model, tokenizer)`` — the freshly initialized
        ``BertForMaskedLM`` and the ``PreTrainedTokenizerFast`` it was
        sized for.  (Original returned ``None``; returning the objects is
        backward-compatible and lets callers reuse them.)
    """
    project_dir = Path("/home/work/baro/sillok/sillok_scratch_20250626")
    tokenizer_dir = project_dir / "sillok_tokenizer_bpe_preprocessed"
    tokenizer_file = tokenizer_dir / "tokenizer.json"

    print("--- 2. SillokBert Model Architecture Definition ---")

    tokenizer = PreTrainedTokenizerFast(tokenizer_file=str(tokenizer_file))
    if tokenizer.pad_token is None:
        # Assigning pad_token registers '[PAD]' as an *added* special
        # token if the underlying vocabulary does not already contain it.
        tokenizer.pad_token = '[PAD]'

    # BUGFIX: use len(tokenizer), not tokenizer.vocab_size —
    # vocab_size excludes added special tokens (such as a freshly added
    # '[PAD]'), which would make the embedding matrix too small and cause
    # out-of-range index errors during training.
    vocab_size = len(tokenizer)

    print(f"๋ก๋๋ ์ดํ์ง ํฌ๊ธฐ (vocab_size): {vocab_size}")
    print(f"PAD ํ ํฐ ID: {tokenizer.pad_token_id}")

    # Standard BERT-base hyperparameters; only the vocabulary size and
    # pad token id are tied to the custom tokenizer.
    config = BertConfig(
        vocab_size=vocab_size,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        max_position_embeddings=512,
        pad_token_id=tokenizer.pad_token_id,
    )

    print("\n์์ฑ๋ BERT ๋ชจ๋ธ ์ค์ (Configuration):")
    print(config)

    # Randomly initialized weights — this is the 'from scratch' model.
    model = BertForMaskedLM(config=config)

    # BUGFIX: the original string literal was split across two physical
    # lines (an unterminated-string SyntaxError); rejoined into one literal.
    print("\nโ 'From Scratch' BERT ๋ชจ๋ธ์ด ์ฑ๊ณต์ ์ผ๋ก ์์ฑ๋์์ต๋๋ค.")
    print(f"๋ชจ๋ธ์ ์ด ํ๋ผ๋ฏธํฐ ์: {model.num_parameters():,}")

    return model, tokenizer
|
|
if __name__ == "__main__":
    # Script entry point: build and summarize the from-scratch model.
    define_sillok_bert_architecture()