File size: 1,794 Bytes
"""
tokenizer.py - Build and load a custom tokenizer for the XO board.
"""
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers import pre_tokenizers, decoders
from transformers import PreTrainedTokenizerFast
def build_tokenizer(save_path="./xo_tokenizer"):
    """Build the XO-board tokenizer, save it to disk and return it.

    The vocabulary is a fixed symbol-to-id table. The special tokens
    ``<pad>``, ``<eos>`` and ``<unk>`` receive ids 0, 1 and 2, followed by
    the board symbols ``.``, ``X``, ``O`` and the digits ``0``-``9``.

    Args:
        save_path: Directory handed to ``save_pretrained``.

    Returns:
        The ``PreTrainedTokenizerFast`` instance wrapping the word-level
        tokenizer built here.
    """
    special_tokens = ["<pad>", "<eos>", "<unk>"]

    # Every symbol that can appear on the board, plus digits (kept for
    # future use).
    vocab_chars = [".", "X", "O"] + [str(i) for i in range(10)]

    # Vocabulary: each symbol mapped to its integer id, specials first.
    vocab = {tok: i for i, tok in enumerate(special_tokens + vocab_chars)}

    # Core tokenizer: whole-token lookup, anything unknown becomes <unk>.
    base_tokenizer = Tokenizer(WordLevel(vocab=vocab, unk_token="<unk>"))

    # NOTE(review): WhitespaceSplit only cuts on whitespace, so a symbol is
    # looked up on its own only when the input separates symbols with
    # spaces (e.g. "X O ."). Confirm callers format boards that way; an
    # unspaced "XO." would presumably map to a single <unk>.
    base_tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()

    # NOTE(review): the ByteLevel decoder is normally paired with byte-level
    # models rather than WordLevel - confirm this pairing is intended.
    base_tokenizer.decoder = decoders.ByteLevel()

    # Wrap the tokenizer so it is compatible with the transformers library.
    tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=base_tokenizer,
        bos_token=None,
        eos_token="<eos>",
        unk_token="<unk>",
        pad_token="<pad>",
    )
    tokenizer.save_pretrained(save_path)

    # User-facing message, kept in Arabic:
    # "Tokenizer built and saved to <save_path>".
    print(f"✅ تم بناء Tokenizer وحفظه في {save_path}")
    return tokenizer
def load_tokenizer(tokenizer_path="./xo_tokenizer"):
    """Load a previously saved tokenizer.

    Args:
        tokenizer_path: Directory handed to
            ``PreTrainedTokenizerFast.from_pretrained``.

    Returns:
        The loaded ``PreTrainedTokenizerFast`` instance.
    """
    tokenizer = PreTrainedTokenizerFast.from_pretrained(tokenizer_path)

    # User-facing message, kept in Arabic:
    # "Tokenizer loaded from <tokenizer_path>".
    # NOTE(review): the message originally started with an emoji that was
    # lost to an encoding error; re-add it if the exact output matters.
    print(f"تم تحميل Tokenizer من {tokenizer_path}")
    return tokenizer