import json
import os
from collections import Counter
from typing import List, Optional

from transformers import PreTrainedTokenizer


class CenturioTokenizer(PreTrainedTokenizer):
    """A simple word/punctuation-level tokenizer backed by a JSON vocabulary file."""

    vocab_files_names = {"vocab_file": "centurio_vocab.json"}
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file=None,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token="<pad>",
        sep_token="<sep>",
        cls_token="<cls>",
        mask_token="<mask>",
        space_token="▁",
        **kwargs,
    ):
        self.space_token = space_token
        self._vocab = {}
        self._inv_vocab = {}
        # The base class queries the vocabulary while registering special
        # tokens, so the vocab must exist *before* super().__init__() runs.
        if vocab_file is not None:
            self._load_vocab(vocab_file)
        else:
            self._build_default_vocab(
                [unk_token, bos_token, eos_token, pad_token,
                 sep_token, cls_token, mask_token, space_token]
            )
        super().__init__(
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            sep_token=sep_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs,
        )

    def _build_default_vocab(self, special_tokens: List[str]):
        self._vocab = {token: i for i, token in enumerate(special_tokens)}
        self._inv_vocab = {i: token for token, i in self._vocab.items()}

    def _load_vocab(self, vocab_file):
        with open(vocab_file, "r", encoding="utf-8") as f:
            self._vocab = json.load(f)
        self._inv_vocab = {v: k for k, v in self._vocab.items()}

    def get_vocab(self):
        return self._vocab.copy()

    @property
    def vocab_size(self):
        return len(self._vocab)

    def _tokenize(self, text: str) -> List[str]:
        # Mark word boundaries with the SentencePiece-style "▁" symbol, then
        # split into alphanumeric runs and single non-alphanumeric characters.
        # str.isalnum() is Unicode-aware, so Cyrillic letters are covered too.
        text = text.replace(" ", self.space_token)
        tokens = []
        current = ""
        for ch in text:
            if ch.isalnum():
                current += ch
            else:
                if current:
                    tokens.append(current)
                    current = ""
                tokens.append(ch)
        if current:
            tokens.append(current)
        return tokens

    def _convert_token_to_id(self, token: str) -> int:
        return self._vocab.get(token, self._vocab.get(self.unk_token, 0))

    def _convert_id_to_token(self, index: int) -> str:
        return self._inv_vocab.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        # Undo the "▁" word-boundary marking so decode() round-trips;
        # the base-class fallback would join tokens with plain spaces.
        return "".join(tokens).replace(self.space_token, " ").strip()

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
        os.makedirs(save_directory, exist_ok=True)
        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["vocab_file"],
        )
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self._vocab, f, ensure_ascii=False, indent=2)
        return (vocab_file,)

    def build_vocab_from_corpus(self, corpus: List[str], min_freq: int = 2):
        # Count tokens across the corpus, then assign ids to all tokens that
        # occur at least min_freq times, after the reserved special tokens.
        token_counter = Counter()
        for text in corpus:
            token_counter.update(self._tokenize(text))
        special_tokens = [
            self.unk_token, self.bos_token, self.eos_token, self.pad_token,
            self.sep_token, self.cls_token, self.mask_token, self.space_token,
        ]
        new_vocab = {token: i for i, token in enumerate(special_tokens)}
        idx = len(new_vocab)
        for token, freq in token_counter.items():
            if freq >= min_freq and token not in new_vocab:
                new_vocab[token] = idx
                idx += 1
        self._vocab = new_vocab
        self._inv_vocab = {v: k for k, v in self._vocab.items()}


if __name__ == "__main__":
    corpus = [
        "Привет, как дела!",
        "Я учу немецкий язык.",
        "Морфемы помогают понять структуру слов.",
    ]

    tokenizer = CenturioTokenizer()
    tokenizer.build_vocab_from_corpus(corpus, min_freq=1)
    tokenizer.save_pretrained("./centurio_model")

    for text in corpus:
        tokens = tokenizer.tokenize(text)
        ids = tokenizer.encode(text)
        back = tokenizer.decode(ids)
        print(f"\nTEXT   : {text}")
        print(f"TOKENS : {tokens}")
        print(f"IDS    : {ids}")
        print(f"BACK   : {back}")

    print(f"VOCAB  : {tokenizer.vocab_size}")
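
    # A minimal reload sketch, not part of the original demo: because the
    # class defines vocab_files_names, from_pretrained() should locate the
    # centurio_vocab.json written by save_pretrained() above and pass it to
    # __init__ as vocab_file, reproducing the same vocabulary.
    reloaded = CenturioTokenizer.from_pretrained("./centurio_model")
    assert reloaded.get_vocab() == tokenizer.get_vocab()
    print(f"RELOAD : vocab of {reloaded.vocab_size} tokens round-trips")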