# SillokBert-Scratch / scripts / 1_train_tokenizer.py
# (Hugging Face upload metadata — uploader: ddokbaro, "Upload 15 files", commit 170de4d verified)
# SillokBert-Scratch project, step 1: train a Sillok-specific tokenizer
# -----------------------------------------------------------------
# Trains a tokenizer optimized for the Sillok data using the BPE
# algorithm and a PUA/NFC preprocessing pipeline.
# -----------------------------------------------------------------
import os
import re
from pathlib import Path
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.normalizers import NFC
def train_sillok_bpe_tokenizer(
    corpus_file="/home/work/baro/sillok25060103/preprocessed_corpus/train.txt",
    project_dir="/home/work/baro/sillok/sillok_scratch_20250626",
    vocab_size=500000,
    min_frequency=1,
    pua_to_standard_map=None,
):
    """Train a BPE tokenizer for the Sillok corpus with an NFC + PUA pipeline.

    The corpus is streamed line by line, PUA code points are optionally
    remapped to standard characters, the text is NFC-normalized, and a
    byte-level BPE vocabulary is trained and saved as ``tokenizer.json``.

    Args:
        corpus_file: Path to the plain-text training corpus (one line per
            example). Defaults to the original hard-coded path.
        project_dir: Project root; the tokenizer is written to
            ``<project_dir>/sillok_tokenizer_bpe_preprocessed``.
        vocab_size: Target vocabulary size for the BPE trainer.
        min_frequency: Minimum pair frequency for a merge to be learned.
        pua_to_standard_map: Optional dict mapping PUA characters to their
            standard replacements. NOTE(review): defaults to an empty map —
            replace with the real mapping table for production runs.

    Returns:
        The trained ``Tokenizer`` reloaded from the saved file (callers that
        ignored the previous ``None`` return are unaffected).
    """
    # --- Path setup ---
    output_dir = Path(project_dir) / "sillok_tokenizer_bpe_preprocessed"
    tokenizer_file = output_dir / "tokenizer.json"
    print("--- 1. Sillok-Specific Tokenizer Training ---")
    output_dir.mkdir(parents=True, exist_ok=True)

    # --- PUA conversion logic ---
    if pua_to_standard_map is None:
        pua_to_standard_map = {}
    pua_regex = None
    if pua_to_standard_map:
        # One alternation compiled once, reused for every corpus line.
        pua_regex = re.compile("|".join(map(re.escape, pua_to_standard_map.keys())))

    def normalize_pua(text):
        # Identity pass-through when no PUA mapping is configured.
        if not pua_regex:
            return text
        return pua_regex.sub(lambda m: pua_to_standard_map[m.group(0)], text)

    # --- Tokenizer setup ---
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    tokenizer.normalizer = NFC()
    tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False)
    trainer = BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    )

    # --- Corpus iterator and training ---
    def get_training_corpus():
        # Stream lines lazily so the full corpus never sits in memory.
        with open(corpus_file, "r", encoding="utf-8") as f:
            for line in f:
                yield normalize_pua(line).strip()

    print("BPE ν† ν¬λ‚˜μ΄μ € ν›ˆλ ¨μ„ μ‹œμž‘ν•©λ‹ˆλ‹€...")
    tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)

    # --- Save and verify ---
    tokenizer.save(str(tokenizer_file))
    print(f"\nπŸŽ‰ BPE ν† ν¬λ‚˜μ΄μ € ν›ˆλ ¨ μ™„λ£Œ. κ²°κ³Ό 파일: {tokenizer_file}")
    # Reload from disk to confirm the saved artifact is usable.
    reloaded_tokenizer = Tokenizer.from_file(str(tokenizer_file))
    print(f"μ‹€μ œ μƒμ„±λœ μ–΄νœ˜μ§‘ 크기: {reloaded_tokenizer.get_vocab_size()}")
    return reloaded_tokenizer
# Script entry point: train and save the Sillok BPE tokenizer.
if __name__ == "__main__":
    train_sillok_bpe_tokenizer()