File size: 1,794 Bytes
b287045
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
"""

tokenizer.py - Build and load a custom tokenizer for the XO board.

"""

from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers import pre_tokenizers, decoders
from transformers import PreTrainedTokenizerFast


def build_tokenizer(save_path="./xo_tokenizer"):
    """Build, save and return a character-level tokenizer for XO boards.

    The vocabulary consists of the special tokens ``<pad>``, ``<eos>`` and
    ``<unk>`` (ids 0-2) followed by the board symbols ``.``, ``X``, ``O`` and
    the digits ``0``-``9`` (ids 3-15).

    Text is first split on whitespace and every resulting piece is then split
    into single characters, so ``"X . O"`` and ``"X.O"`` both yield the three
    tokens ``X``, ``.``, ``O``. Characters outside the vocabulary map to
    ``<unk>``.

    Args:
        save_path: Directory handed to ``save_pretrained``.

    Returns:
        The ``PreTrainedTokenizerFast`` instance wrapping the built tokenizer.
    """
    # Local import: ``Regex`` is only needed here, and keeping it inside the
    # function leaves the module-level import block untouched.
    from tokenizers import Regex

    special_tokens = ["<pad>", "<eos>", "<unk>"]
    # Every symbol that can appear on the board, plus digits (reserved for
    # future use).
    vocab_chars = [".", "X", "O"] + [str(digit) for digit in range(10)]

    # Symbol -> id mapping; special tokens come first so they get ids 0-2.
    vocab = {tok: idx for idx, tok in enumerate(special_tokens + vocab_chars)}

    base_tokenizer = Tokenizer(WordLevel(vocab=vocab, unk_token="<unk>"))
    # WordLevel looks up whole pre-tokenized pieces, so the pre-tokenizer must
    # emit one piece per character. Whitespace is dropped first; the "."
    # pattern then isolates every remaining character as its own piece.
    # Without the second step a contiguous board such as "X.O" would be a
    # single unknown word.
    base_tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
        [
            pre_tokenizers.WhitespaceSplit(),
            pre_tokenizers.Split(Regex("."), behavior="isolated"),
        ]
    )
    # Register the special tokens on the backend so they are matched as whole
    # units instead of being broken into characters by the pre-tokenizer.
    # They are already in the vocabulary, so their ids do not change.
    base_tokenizer.add_special_tokens(special_tokens)
    # Every vocabulary entry is plain ASCII, so this decoder simply
    # concatenates the tokens without inserting separators.
    base_tokenizer.decoder = decoders.ByteLevel()

    # Wrap the backend tokenizer so it can be used through transformers.
    tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=base_tokenizer,
        bos_token=None,
        eos_token="<eos>",
        unk_token="<unk>",
        pad_token="<pad>",
    )

    tokenizer.save_pretrained(save_path)
    print(f"โœ… ุชู… ุจู†ุงุก Tokenizer ูˆุญูุธู‡ ููŠ {save_path}")
    return tokenizer


def load_tokenizer(tokenizer_path="./xo_tokenizer"):
    """Load a previously saved tokenizer and return it.

    Args:
        tokenizer_path: Location passed to
            ``PreTrainedTokenizerFast.from_pretrained``.

    Returns:
        The loaded ``PreTrainedTokenizerFast`` instance.
    """
    loaded = PreTrainedTokenizerFast.from_pretrained(tokenizer_path)
    # Report where the tokenizer was loaded from.
    print(f"๐Ÿ“‚ ุชู… ุชุญู…ูŠู„ Tokenizer ู…ู† {tokenizer_path}")
    return loaded