"""
tokenizer.py - Build and load a custom Tokenizer for the XO board.
"""
|
|
|
| from tokenizers import Tokenizer
|
| from tokenizers.models import WordLevel
|
| from tokenizers import pre_tokenizers, decoders
|
| from transformers import PreTrainedTokenizerFast
|
|
|
|
|
def build_tokenizer(save_path: str = "./xo_tokenizer") -> "PreTrainedTokenizerFast":
    """Build the XO-board tokenizer, save it to disk, and return it.

    The vocabulary consists of the special tokens ``<pad>``, ``<eos>`` and
    ``<unk>`` followed by the board symbols ``.``, ``X``, ``O`` and the digits
    ``0``-``9``. Ids are assigned in exactly that order, starting at 0.

    Args:
        save_path: Directory handed to ``save_pretrained``.

    Returns:
        The ``PreTrainedTokenizerFast`` wrapping the newly built tokenizer.
    """
    special_tokens = ["<pad>", "<eos>", "<unk>"]
    vocab_chars = [".", "X", "O"] + [str(digit) for digit in range(10)]

    # Special tokens come first so they receive the lowest ids (0, 1, 2).
    vocab = {tok: idx for idx, tok in enumerate(special_tokens + vocab_chars)}

    base_tokenizer = Tokenizer(WordLevel(vocab=vocab, unk_token="<unk>"))
    # NOTE(review): with WordLevel + WhitespaceSplit each whitespace-delimited
    # chunk is presumably looked up as one token, so symbols would have to be
    # space-separated in the input to be tokenized one character at a time;
    # confirm against the code that formats the boards.
    base_tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
    base_tokenizer.decoder = decoders.ByteLevel()

    tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=base_tokenizer,
        bos_token=None,
        eos_token="<eos>",
        unk_token="<unk>",
        pad_token="<pad>",
    )

    tokenizer.save_pretrained(save_path)
    # The message text was restored from an encoding-corrupted literal that
    # had been split across several physical lines.
    print(f"✅ تم بناء Tokenizer وحفظه في {save_path}")
    return tokenizer
|
|
|
|
|
def load_tokenizer(tokenizer_path: str = "./xo_tokenizer") -> "PreTrainedTokenizerFast":
    """Load a tokenizer that was previously saved to disk.

    Args:
        tokenizer_path: Location handed to
            ``PreTrainedTokenizerFast.from_pretrained``.

    Returns:
        The loaded ``PreTrainedTokenizerFast``.
    """
    tokenizer = PreTrainedTokenizerFast.from_pretrained(tokenizer_path)
    # NOTE(review): the message was restored from an encoding-corrupted
    # literal split across several lines. The Arabic text is recoverable, but
    # the leading emoji's bytes were lost, so the glyph below is a best guess;
    # confirm against the original file.
    print(f"📂 تم تحميل Tokenizer من {tokenizer_path}")
    return tokenizer