import json
import os
from collections import Counter
from typing import List, Optional

from transformers import PreTrainedTokenizer


class CenturioTokenizer(PreTrainedTokenizer):
    """A toy word/punctuation-level tokenizer with a SentencePiece-style
    "▁" space marker, built on the transformers slow-tokenizer API."""

    vocab_files_names = {"vocab_file": "centurio_vocab.json"}
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file=None,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token="<pad>",
        sep_token="<sep>",
        cls_token="<cls>",
        mask_token="<mask>",
        space_token="▁",
        **kwargs,
    ):
        self.space_token = space_token
        # The vocabulary must exist before PreTrainedTokenizer.__init__
        # runs: the parent constructor registers the special tokens and
        # queries vocab_size / _convert_token_to_id while doing so. Note
        # that self.unk_token etc. are only set by super().__init__, so
        # the default vocab is built from the constructor arguments.
        if vocab_file is not None:
            self._load_vocab(vocab_file)
        else:
            self._build_default_vocab(
                [unk_token, bos_token, eos_token, pad_token,
                 sep_token, cls_token, mask_token, space_token]
            )
        super().__init__(
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            sep_token=sep_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs,
        )

    def _build_default_vocab(self, special_tokens: List[str]):
        # Special tokens take the first ids; real entries come later from
        # build_vocab_from_corpus or a saved vocabulary file.
        self._vocab = {token: i for i, token in enumerate(special_tokens)}
        self._inv_vocab = {i: token for token, i in self._vocab.items()}

    def _load_vocab(self, vocab_file):
        with open(vocab_file, "r", encoding="utf-8") as f:
            self._vocab = json.load(f)
        self._inv_vocab = {v: k for k, v in self._vocab.items()}

    def get_vocab(self):
        return self._vocab.copy()

    @property
    def vocab_size(self):
        return len(self._vocab)

    def _tokenize(self, text: str) -> List[str]:
        # Mark word boundaries with the SentencePiece-style "▁" token.
        text = text.replace(" ", self.space_token)
        tokens = []
        current = ""
        for ch in text:
            # str.isalnum() is Unicode-aware, so Cyrillic letters are
            # covered without an explicit alphabet list.
            if ch.isalnum():
                current += ch
            else:
                if current:
                    tokens.append(current)
                    current = ""
                # Punctuation and the "▁" marker become single-char tokens.
                tokens.append(ch)
        if current:
            tokens.append(current)
        return tokens

    def _convert_token_to_id(self, token: str) -> int:
        # Fall back to the unk id (or 0 if even unk is missing).
        return self._vocab.get(token, self._vocab.get(self.unk_token, 0))

    def _convert_id_to_token(self, index: int) -> str:
        return self._inv_vocab.get(index, self.unk_token)
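
    # Sketch of an override, assuming the slow-tokenizer default of joining
    # tokens with plain spaces in decode(): without it, decoded text would
    # keep the "▁" markers. Concatenating and mapping "▁" back to a space
    # lets encode/decode round-trip cleanly.
    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        return "".join(tokens).replace(self.space_token, " ")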

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
        os.makedirs(save_directory, exist_ok=True)
        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "")
            + self.vocab_files_names["vocab_file"],
        )
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self._vocab, f, ensure_ascii=False, indent=2)
        return (vocab_file,)

    def build_vocab_from_corpus(self, corpus: List[str], min_freq: int = 2):
        # Count candidate tokens over the corpus and keep those seen at
        # least min_freq times; special tokens always take the first ids.
        token_counter = Counter()
        for text in corpus:
            token_counter.update(self._tokenize(text))
        special_tokens = [
            self.unk_token, self.bos_token, self.eos_token,
            self.pad_token, self.sep_token, self.cls_token,
            self.mask_token, self.space_token,
        ]
        new_vocab = {token: i for i, token in enumerate(special_tokens)}
        for token, freq in token_counter.items():
            if freq >= min_freq and token not in new_vocab:
                new_vocab[token] = len(new_vocab)
        self._vocab = new_vocab
        self._inv_vocab = {v: k for k, v in self._vocab.items()}


if __name__ == "__main__":
    # Tiny demo corpus in Russian ("Hello, how are you!", "I am learning
    # German.", "Morphemes help reveal the structure of words.").
    corpus = [
        "Привет, как дела!",
        "Я учу немецкий язык.",
        "Морфемы помогают понять структуру слов.",
    ]
    tokenizer = CenturioTokenizer()
    tokenizer.build_vocab_from_corpus(corpus, min_freq=1)
    tokenizer.save_pretrained("./centurio_model")
    for text in corpus:
        tokens = tokenizer.tokenize(text)
        ids = tokenizer.encode(text)
        back = tokenizer.decode(ids)
        print(f"\nTEXT   : {text}")
        print(f"TOKENS : {tokens}")
        print(f"IDS    : {ids}")
        print(f"BACK   : {back}")
    print(f"VOCAB  : {tokenizer.vocab_size}")