|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
|
|
|
import re
|
|
|
from pathlib import Path
|
|
|
from tokenizers import Tokenizer
|
|
|
from tokenizers.models import BPE
|
|
|
from tokenizers.trainers import BpeTrainer
|
|
|
from tokenizers.pre_tokenizers import ByteLevel
|
|
|
from tokenizers.normalizers import NFC
|
|
|
|
|
|
def train_sillok_bpe_tokenizer(
    corpus_file: str = "/home/work/baro/sillok25060103/preprocessed_corpus/train.txt",
    project_dir: str = "/home/work/baro/sillok/sillok_scratch_20250626",
    vocab_size: int = 500000,
    min_frequency: int = 1,
    pua_to_standard_map: dict | None = None,
) -> Path:
    """Train a BPE tokenizer on the Sillok corpus with a preprocessing pipeline.

    The corpus is streamed line by line, optionally normalized (Private Use
    Area characters mapped to standard equivalents), NFC-normalized, and
    byte-level pre-tokenized before BPE training. The trained tokenizer is
    saved as ``tokenizer.json`` and reloaded once to verify the artifact.

    Parameters
    ----------
    corpus_file : str
        Path to the preprocessed training corpus (one text per line).
    project_dir : str
        Project root; output goes to
        ``<project_dir>/sillok_tokenizer_bpe_preprocessed``.
    vocab_size : int
        Target vocabulary size for the BPE trainer.
    min_frequency : int
        Minimum pair frequency required for a merge to be learned.
    pua_to_standard_map : dict | None
        Optional mapping of PUA characters to standard replacements, applied
        to every corpus line before training. Empty/None disables the step
        (the original hard-coded map was empty, so the default preserves the
        original no-op behavior).

    Returns
    -------
    Path
        Path of the saved ``tokenizer.json`` file.
    """
    out_root = Path(project_dir)
    output_dir = out_root / "sillok_tokenizer_bpe_preprocessed"
    tokenizer_file = output_dir / "tokenizer.json"

    print("--- 1. Sillok-Specific Tokenizer Training ---")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Compile one alternation pattern covering every PUA key; keep None when
    # there is nothing to replace so normalize_pua can short-circuit cheaply.
    if pua_to_standard_map is None:
        pua_to_standard_map = {}
    pua_regex = None
    if pua_to_standard_map:
        pua_regex = re.compile("|".join(map(re.escape, pua_to_standard_map.keys())))

    def normalize_pua(text: str) -> str:
        # Replace each mapped PUA character with its standard counterpart;
        # no-op when no mapping was supplied.
        if not pua_regex:
            return text
        return pua_regex.sub(lambda m: pua_to_standard_map[m.group(0)], text)

    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    tokenizer.normalizer = NFC()  # canonical Unicode composition before splitting
    tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False)

    trainer = BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    )

    def get_training_corpus():
        # Stream the corpus lazily so the whole file never sits in memory.
        with open(corpus_file, "r", encoding="utf-8") as f:
            for line in f:
                yield normalize_pua(line).strip()

    print("Starting BPE tokenizer training...")
    tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)

    tokenizer.save(str(tokenizer_file))
    print(f"\nBPE tokenizer training complete. Output file: {tokenizer_file}")

    # Reload the saved artifact to verify it round-trips, and report the
    # realized vocab size (may be below vocab_size on a small corpus).
    reloaded_tokenizer = Tokenizer.from_file(str(tokenizer_file))
    print(f"Actual vocabulary size: {reloaded_tokenizer.get_vocab_size()}")

    return tokenizer_file
|
|
def _main() -> None:
    """Script entry point: kick off tokenizer training with the defaults."""
    train_sillok_bpe_tokenizer()


if __name__ == "__main__":
    _main()