File size: 2,839 Bytes
170de4d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
# SillokBert-Scratch Project, Stage 1: Sillok-specific tokenizer training
# -----------------------------------------------------------------
# Trains a tokenizer optimized for the Sillok corpus using the BPE
# algorithm with a PUA/NFC preprocessing pipeline.
# -----------------------------------------------------------------
import os
import re
from pathlib import Path
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.normalizers import NFC
def train_sillok_bpe_tokenizer():
    """Train a Sillok-specific BPE tokenizer with an NFC + PUA preprocessing pipeline.

    Streams the preprocessed training corpus line by line, maps any Private
    Use Area (PUA) characters to standard codepoints, trains a byte-level
    BPE tokenizer, and saves the result as ``tokenizer.json``. Reloads the
    saved file afterwards to verify it and report the actual vocab size.

    Side effects:
        Creates the output directory if missing and writes tokenizer.json.
    """
    # --- Path configuration ---
    corpus_file = "/home/work/baro/sillok25060103/preprocessed_corpus/train.txt"
    project_dir = Path("/home/work/baro/sillok/sillok_scratch_20250626")
    output_dir = project_dir / "sillok_tokenizer_bpe_preprocessed"
    tokenizer_file = output_dir / "tokenizer.json"

    print("--- 1. Sillok-Specific Tokenizer Training ---")
    output_dir.mkdir(parents=True, exist_ok=True)

    # --- PUA normalization logic ---
    # NOTE(review): placeholder — replace with the complete PUA-to-standard
    # codepoint mapping table built for the actual Sillok data.
    pua_to_standard_map = {}
    pua_regex = None
    if pua_to_standard_map:
        # Single alternation regex matching any mapped PUA character;
        # each key is escaped so it matches literally.
        pua_regex = re.compile("|".join(map(re.escape, pua_to_standard_map.keys())))

    def normalize_pua(text: str) -> str:
        # Replace every mapped PUA character with its standard equivalent.
        # No-op while the mapping table is empty (pua_regex stays None).
        if pua_regex is None:
            return text
        return pua_regex.sub(lambda m: pua_to_standard_map[m.group(0)], text)

    # --- Tokenizer configuration ---
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    tokenizer.normalizer = NFC()  # canonical Unicode composition before tokenizing
    tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False)
    trainer = BpeTrainer(
        vocab_size=500000,
        min_frequency=1,  # keep even single-occurrence merges
        special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    )

    # --- Corpus iterator and training ---
    def get_training_corpus():
        # Stream the corpus lazily so the whole file never sits in memory.
        with open(corpus_file, "r", encoding="utf-8") as f:
            for line in f:
                yield normalize_pua(line).strip()

    print("Starting BPE tokenizer training...")
    tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)

    # --- Save and verify ---
    tokenizer.save(str(tokenizer_file))
    print(f"\nBPE tokenizer training complete. Output file: {tokenizer_file}")
    # Round-trip the saved file to confirm it is loadable and report the
    # vocabulary size actually produced (may be below the requested cap).
    reloaded_tokenizer = Tokenizer.from_file(str(tokenizer_file))
    print(f"Actual generated vocabulary size: {reloaded_tokenizer.get_vocab_size()}")
if __name__ == "__main__":
    # Script entry point: run tokenizer training when executed directly.
    train_sillok_bpe_tokenizer()
|