Darkester committed on
Commit
fa639b2
·
verified ·
1 Parent(s): b3dd67c

Upload centurio.py

Browse files
Files changed (1) hide show
  1. centurio.py +140 -0
centurio.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from typing import List, Optional
4
+ from transformers import PreTrainedTokenizer
5
+
6
+
7
class CenturioTokenizer(PreTrainedTokenizer):
    """Word/punctuation-level tokenizer backed by an explicit JSON vocabulary.

    Spaces are mapped to ``space_token`` ("▁" by default); runs of
    alphanumeric characters become word tokens and every other character
    becomes its own single-character token.
    """

    vocab_files_names = {"vocab_file": "centurio_vocab.json"}
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file=None,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token="<pad>",
        sep_token="<sep>",
        cls_token="<cls>",
        mask_token="<mask>",
        space_token="▁",
        **kwargs
    ):
        self.space_token = space_token
        # BUGFIX: the vocabulary must exist BEFORE super().__init__() runs.
        # PreTrainedTokenizer registers the special tokens during its own
        # __init__ and consults get_vocab()/vocab_size while doing so;
        # building the vocab afterwards (as the original code did) lets the
        # base class see an empty vocab and assign conflicting ids.
        if vocab_file is not None:
            self._load_vocab(vocab_file)
        else:
            # self.unk_token etc. are only set by super().__init__, so the
            # default vocab is built here from the constructor arguments.
            specials = [
                unk_token, bos_token, eos_token, pad_token,
                sep_token, cls_token, mask_token, space_token,
            ]
            self._vocab = {tok: i for i, tok in enumerate(specials)}
            self._inv_vocab = {i: tok for tok, i in self._vocab.items()}

        super().__init__(
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            sep_token=sep_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs
        )

    def _build_default_vocab(self):
        """(Re)build a vocabulary containing only the special tokens."""
        special_tokens = [
            self.unk_token, self.bos_token, self.eos_token,
            self.pad_token, self.sep_token, self.cls_token,
            self.mask_token, self.space_token,
        ]
        self._vocab = {token: i for i, token in enumerate(special_tokens)}
        self._inv_vocab = {i: token for token, i in self._vocab.items()}

    def _load_vocab(self, vocab_file):
        """Load a token -> id mapping from a JSON file and derive the inverse map."""
        with open(vocab_file, "r", encoding="utf-8") as f:
            self._vocab = json.load(f)
        self._inv_vocab = {v: k for k, v in self._vocab.items()}

    def get_vocab(self):
        """Return a copy of the token -> id mapping."""
        return self._vocab.copy()

    @property
    def vocab_size(self):
        """Number of entries in the base vocabulary."""
        return len(self._vocab)

    def _tokenize(self, text: str) -> List[str]:
        """Split *text* into word tokens and single non-word characters.

        Spaces are first replaced by ``space_token``; consecutive
        alphanumeric characters form one token, every other character is
        emitted on its own.
        """
        text = text.replace(" ", self.space_token)
        tokens = []
        current = ""
        for ch in text:
            # str.isalnum() already returns True for Cyrillic letters, so
            # the original explicit alphabet check was redundant.
            if ch.isalnum():
                current += ch
            else:
                if current:
                    tokens.append(current)
                    current = ""
                # BUGFIX: the original ternary appended the same value on
                # both branches; the non-word character is appended as-is.
                tokens.append(ch)
        if current:
            tokens.append(current)
        return tokens

    def _convert_token_to_id(self, token: str) -> int:
        """Map a token to its id, falling back to the unk token's id (or 0)."""
        return self._vocab.get(token, self._vocab.get(self.unk_token, 0))

    def _convert_id_to_token(self, index: int) -> str:
        """Map an id back to its token, falling back to the unk token."""
        return self._inv_vocab.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Join tokens and restore real spaces from ``space_token``.

        Added so decode() round-trips the whitespace that _tokenize
        replaced; the inherited default joins tokens with " " instead.
        """
        return "".join(tokens).replace(self.space_token, " ")

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
        """Write the vocabulary JSON into *save_directory*; return its path as a 1-tuple."""
        # exist_ok avoids a race between the isdir check and makedirs.
        os.makedirs(save_directory, exist_ok=True)
        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + "centurio_vocab.json"
        )
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self._vocab, f, ensure_ascii=False, indent=2)
        return (vocab_file,)

    def build_vocab_from_corpus(self, corpus: List[str], min_freq: int = 2):
        """Rebuild the vocabulary from *corpus*.

        Special tokens keep the lowest ids; every other token seen at
        least *min_freq* times is appended in first-seen order.
        """
        from collections import Counter
        token_counter = Counter()
        for text in corpus:
            token_counter.update(self._tokenize(text))

        special_tokens = [
            self.unk_token, self.bos_token, self.eos_token,
            self.pad_token, self.sep_token, self.cls_token,
            self.mask_token, self.space_token,
        ]
        new_vocab = {token: i for i, token in enumerate(special_tokens)}
        for token, freq in token_counter.items():
            if freq >= min_freq and token not in new_vocab:
                new_vocab[token] = len(new_vocab)

        self._vocab = new_vocab
        self._inv_vocab = {v: k for k, v in self._vocab.items()}
120
+
121
# Smoke test: build a vocabulary from a tiny Russian corpus, persist the
# tokenizer, and show the tokenize -> encode -> decode round trip.
if __name__ == "__main__":
    sample_texts = [
        "Привет, как дела!",
        "Я учу немецкий язык.",
        "Морфемы помогают понять структуру слов."
    ]

    tok = CenturioTokenizer()
    tok.build_vocab_from_corpus(sample_texts, min_freq=1)
    tok.save_pretrained("./centurio_model")

    for sentence in sample_texts:
        pieces = tok.tokenize(sentence)
        token_ids = tok.encode(sentence)
        restored = tok.decode(token_ids)
        print(f"\nTEXT : {sentence}")
        print(f"TOKENS : {pieces}")
        print(f"IDS : {token_ids}")
        print(f"BACK : {restored}")
    print(f"VOCAB : {tok.vocab_size}")