import json
import os
from collections import Counter
from typing import List, Optional

from transformers import PreTrainedTokenizer


class CenturioTokenizer(PreTrainedTokenizer):
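    """Minimal word/character-level tokenizer on top of PreTrainedTokenizer.

    Spaces are replaced by the SentencePiece-style "▁" marker, runs of
    alphanumeric characters become word tokens, and everything else becomes
    single-character tokens. The vocabulary is either loaded from a JSON
    file or built from a corpus via build_vocab_from_corpus().
    """
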
    vocab_files_names = {"vocab_file": "centurio_vocab.json"}
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file=None,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token="<pad>",
        sep_token="<sep>",
        cls_token="<cls>",
        mask_token="<mask>",
        space_token="▁",
        **kwargs,
    ):
        self.space_token = space_token
        self._vocab = {}
        self._inv_vocab = {}

        # Build or load the vocabulary *before* calling super().__init__():
        # recent transformers releases resolve special tokens against
        # get_vocab() during base-class initialization, so an empty vocab at
        # that point would assign them conflicting "added token" ids.
        if vocab_file is not None:
            self._load_vocab(vocab_file)
        else:
            self._build_default_vocab([
                unk_token, bos_token, eos_token, pad_token,
                sep_token, cls_token, mask_token, space_token,
            ])

        super().__init__(
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            sep_token=sep_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs,
        )

    def _build_default_vocab(self, special_tokens: List[str]):
        # Seed the vocabulary with the special tokens only; real entries are
        # added later via build_vocab_from_corpus() or loaded from a file.
        self._vocab = {token: i for i, token in enumerate(special_tokens)}
        self._inv_vocab = {i: token for token, i in self._vocab.items()}

    def _load_vocab(self, vocab_file):
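        # Load a {token: id} mapping from a JSON vocabulary file.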
        with open(vocab_file, "r", encoding="utf-8") as f:
            self._vocab = json.load(f)
        self._inv_vocab = {v: k for k, v in self._vocab.items()}

    def get_vocab(self):
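        # Return a copy so callers cannot mutate the internal mapping.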
        return self._vocab.copy()

    @property
    def vocab_size(self):
        return len(self._vocab)

    def _tokenize(self, text: str) -> List[str]:
        # Mark word boundaries with the SentencePiece-style marker, then
        # group consecutive alphanumeric characters into word tokens.
        # str.isalnum() is Unicode-aware, so Cyrillic letters are covered
        # without an explicit alphabet check.
        text = text.replace(" ", self.space_token)
        tokens = []
        current = ""
        for ch in text:
            if ch.isalnum():
                current += ch
            else:
                if current:
                    tokens.append(current)
                    current = ""
                # Punctuation and the space marker become single-char tokens.
                tokens.append(ch)
        if current:
            tokens.append(current)
        return tokens

    def _convert_token_to_id(self, token: str) -> int:
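        # Unknown tokens fall back to the unk id (0 if unk itself is absent).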
        return self._vocab.get(token, self._vocab.get(self.unk_token, 0))

    def _convert_id_to_token(self, index: int) -> str:
        return self._inv_vocab.get(index, self.unk_token)
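
    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        # Override the base-class behavior (which joins tokens with spaces)
        # so that decode() round-trips: concatenate the tokens and map the
        # space marker back to a plain space.
        return "".join(tokens).replace(self.space_token, " ")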

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
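        # PreTrainedTokenizer hook: write the vocab file(s) and return their
        # paths as a tuple, as expected by save_pretrained().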
        if not os.path.isdir(save_directory):
            os.makedirs(save_directory)
        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + "centurio_vocab.json"
        )
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self._vocab, f, ensure_ascii=False, indent=2)
        return (vocab_file,)

    def build_vocab_from_corpus(self, corpus: List[str], min_freq: int = 2):
        # Count tokens over the corpus; tokens below min_freq are dropped
        # and will map to the unk id at encoding time.
        token_counter = Counter()
        for text in corpus:
            tokens = self._tokenize(text)
            token_counter.update(tokens)

        special_tokens = [
            self.unk_token, self.bos_token, self.eos_token,
            self.pad_token, self.sep_token, self.cls_token,
            self.mask_token, self.space_token
        ]
        new_vocab = {token: i for i, token in enumerate(special_tokens)}
        idx = len(new_vocab)

        for token, freq in token_counter.items():
            if freq >= min_freq and token not in new_vocab:
                new_vocab[token] = idx
                idx += 1

        self._vocab = new_vocab
        self._inv_vocab = {v: k for k, v in self._vocab.items()}


if __name__ == "__main__":
    # Tiny Russian demo corpus (English translations in comments).
    corpus = [
        "Привет, как дела!",  # "Hello, how are you!"
        "Я учу немецкий язык.",  # "I am learning German."
        "Морфемы помогают понять структуру слов.",  # "Morphemes help to understand the structure of words."
    ]

    tokenizer = CenturioTokenizer()
    tokenizer.build_vocab_from_corpus(corpus, min_freq=1)
    tokenizer.save_pretrained("./centurio_model")

    # Round-trip each sentence: tokenize, encode to ids, decode back.
    for text in corpus:
        tokens = tokenizer.tokenize(text)
        ids = tokenizer.encode(text)
        back = tokenizer.decode(ids)
        print(f"\nTEXT   : {text}")
        print(f"TOKENS : {tokens}")
        print(f"IDS    : {ids}")
        print(f"BACK   : {back}")
        print(f"VOCAB  : {tokenizer.vocab_size}")