FemtoXO / src /tokenizer.py
abdelkader-dev's picture
Upload 16 files
b287045 verified
Raw
History Blame Contribute Delete
1.79 kB
"""
tokenizer.py - Build and load a custom tokenizer for the XO board.
"""
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers import pre_tokenizers, decoders
from transformers import PreTrainedTokenizerFast
def build_tokenizer(save_path="./xo_tokenizer"):
    """Build, save and return the tokenizer used for XO board strings.

    The vocabulary is fixed: the special tokens ``<pad>``, ``<eos>`` and
    ``<unk>`` take ids 0-2, followed by the board symbols ``.``, ``X``,
    ``O`` (ids 3-5) and the digits ``0``-``9`` (ids 6-15).

    NOTE(review): the original description calls this a character-level
    tokenizer, but the pipeline is ``WordLevel`` + ``WhitespaceSplit``, so
    only whitespace-separated symbols are looked up individually; a
    contiguous string such as ``"X.O"`` is a single word and is expected to
    map to ``<unk>``. Confirm this matches the dataset format.

    Args:
        save_path: Directory handed to ``save_pretrained``.

    Returns:
        The ``PreTrainedTokenizerFast`` wrapping the underlying tokenizer.
    """
    special_tokens = ["<pad>", "<eos>", "<unk>"]
    # Every symbol that can appear on the board, plus digits (reserved for
    # future use).
    vocab_chars = ['.', 'X', 'O'] + [str(i) for i in range(10)]

    # Vocabulary: token -> id, special tokens first so their ids are stable.
    vocab = {tok: i for i, tok in enumerate(special_tokens + vocab_chars)}

    # Base tokenizer from the `tokenizers` library.
    base_tokenizer = Tokenizer(WordLevel(vocab=vocab, unk_token="<unk>"))
    base_tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
    # NOTE(review): a ByteLevel decoder on a word-level vocabulary is unusual;
    # presumably it is used so decoded symbols are joined without separators.
    # Confirm before changing.
    base_tokenizer.decoder = decoders.ByteLevel()

    # Wrap it so it is usable through the `transformers` API.
    tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=base_tokenizer,
        bos_token=None,
        eos_token="<eos>",
        unk_token="<unk>",
        pad_token="<pad>",
    )

    tokenizer.save_pretrained(save_path)
    # Status message restored from its mis-encoded form
    # ("Tokenizer built and saved to <save_path>").
    print(f"✅ تم بناء Tokenizer وحفظه في {save_path}")
    return tokenizer
def load_tokenizer(tokenizer_path="./xo_tokenizer"):
    """Load a previously saved tokenizer.

    Args:
        tokenizer_path: Location passed to
            ``PreTrainedTokenizerFast.from_pretrained``.

    Returns:
        The loaded ``PreTrainedTokenizerFast`` instance.
    """
    tokenizer = PreTrainedTokenizerFast.from_pretrained(tokenizer_path)
    # Status message restored from its mis-encoded form
    # ("Tokenizer loaded from <tokenizer_path>").
    print(f"📂 تم تحميل Tokenizer من {tokenizer_path}")
    return tokenizer