File size: 2,839 Bytes
170de4d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# SillokBert-Scratch ν”„λ‘œμ νŠΈ 1단계: 싀둝 νŠΉν™” ν† ν¬λ‚˜μ΄μ € ν›ˆλ ¨
# -----------------------------------------------------------------
# BPE μ•Œκ³ λ¦¬μ¦˜κ³Ό PUA/NFC μ „μ²˜λ¦¬ νŒŒμ΄ν”„λΌμΈμ„ μ‚¬μš©ν•˜μ—¬
# Sillok 데이터에 μ΅œμ ν™”λœ ν† ν¬λ‚˜μ΄μ €λ₯Ό ν›ˆλ ¨ν•©λ‹ˆλ‹€.
# -----------------------------------------------------------------
import os
import re
from pathlib import Path
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.normalizers import NFC

def train_sillok_bpe_tokenizer(
    corpus_file: str = "/home/work/baro/sillok25060103/preprocessed_corpus/train.txt",
    project_dir: str = "/home/work/baro/sillok/sillok_scratch_20250626",
    vocab_size: int = 500000,
    min_frequency: int = 1,
) -> None:
    """Train a Sillok-specific BPE tokenizer with a PUA/NFC preprocessing pipeline.

    Reads the training corpus line by line, normalizes Private Use Area (PUA)
    characters to standard ones (when a mapping table is provided), trains a
    byte-level BPE tokenizer, and saves the result as ``tokenizer.json`` under
    ``<project_dir>/sillok_tokenizer_bpe_preprocessed``.

    Args:
        corpus_file: Path to the preprocessed training corpus (UTF-8 text,
            one sample per line).
        project_dir: Root directory of the project; the tokenizer output
            directory is created beneath it.
        vocab_size: Target vocabulary size for the BPE trainer.
        min_frequency: Minimum pair frequency for a merge to be learned.

    Returns:
        None. Side effects: creates the output directory, writes
        ``tokenizer.json``, and prints progress/verification messages.
    """
    # --- Path setup ---
    project_path = Path(project_dir)
    output_dir = project_path / "sillok_tokenizer_bpe_preprocessed"
    tokenizer_file = output_dir / "tokenizer.json"

    print("--- 1. Sillok-Specific Tokenizer Training ---")
    output_dir.mkdir(parents=True, exist_ok=True)

    # --- PUA normalization logic ---
    # NOTE(review): the mapping table is empty here, so normalization is a
    # no-op until it is replaced with the full PUA-to-standard mapping that
    # matches the actual data.
    pua_to_standard_map: dict[str, str] = {}

    pua_regex = None
    if pua_to_standard_map:
        # One alternation pattern over all mapped PUA code points; escaping
        # guards against regex metacharacters in the keys.
        pua_regex = re.compile("|".join(map(re.escape, pua_to_standard_map.keys())))

    def normalize_pua(text: str) -> str:
        """Replace every mapped PUA character in *text*; identity if no map."""
        if not pua_regex:
            return text
        return pua_regex.sub(lambda m: pua_to_standard_map[m.group(0)], text)

    # --- Tokenizer configuration ---
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    tokenizer.normalizer = NFC()  # canonical Unicode composition before BPE
    tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False)

    trainer = BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    )

    # --- Corpus iterator and training ---
    def get_training_corpus():
        """Yield PUA-normalized, stripped lines from the corpus file."""
        with open(corpus_file, "r", encoding="utf-8") as f:
            for line in f:
                yield normalize_pua(line).strip()

    print("BPE ν† ν¬λ‚˜μ΄μ € ν›ˆλ ¨μ„ μ‹œμž‘ν•©λ‹ˆλ‹€...")
    tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)

    # --- Save and verify ---
    tokenizer.save(str(tokenizer_file))
    print(f"\nπŸŽ‰ BPE ν† ν¬λ‚˜μ΄μ € ν›ˆλ ¨ μ™„λ£Œ. κ²°κ³Ό 파일: {tokenizer_file}")

    # Reload the saved file to confirm it round-trips and report the
    # vocabulary size actually produced (may be below vocab_size for small corpora).
    reloaded_tokenizer = Tokenizer.from_file(str(tokenizer_file))
    print(f"μ‹€μ œ μƒμ„±λœ μ–΄νœ˜μ§‘ 크기: {reloaded_tokenizer.get_vocab_size()}")

def _main() -> None:
    """Script entry point: run the tokenizer training pipeline."""
    train_sillok_bpe_tokenizer()


if __name__ == "__main__":
    _main()