Kosasih committed on
Commit
5215885
·
verified ·
1 Parent(s): 766da71

Create tokenizer.py

Browse files
Files changed (1) hide show
  1. tokenizer.py +214 -0
tokenizer.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ OmniCoreX Custom Tokenizer
3
+
4
+ A super advanced, ultra high-tech tokenizer utility designed for OmniCoreX to handle
5
+ custom tokenization requirements beyond standard libraries.
6
+
7
+ Features:
8
+ - Subword tokenization using Byte-Pair Encoding (BPE)
9
+ - Efficient vocabulary management with encoding and decoding
10
+ - Support for special tokens and adaptable vocabulary expansion
11
+ - Fast string-to-token and token-to-string translation
12
+ - Serialization and deserialization utilities for tokenizer state
13
+ """
14
+
15
+ import re
16
+ import json
17
+ from collections import defaultdict
18
+ from typing import List, Dict, Optional
19
+
20
+
21
class BPETokenizer:
    """Byte-Pair Encoding (BPE) subword tokenizer.

    Maintains a token -> index vocabulary and an ordered list of merge
    rules (lower index = higher merge priority).  Special tokens
    (<PAD>, <UNK>, <BOS>, <EOS>) are appended to the vocabulary at
    construction (and after ``load``) if absent.

    Note: ``__init__`` mutates the ``vocab`` dict passed in (special
    tokens are added in place), matching the original behavior.
    """

    def __init__(self, vocab: Optional[Dict[str, int]] = None, merges: Optional[List[List[str]]] = None):
        """
        Initialize the BPE tokenizer.

        Args:
            vocab: Dictionary mapping tokens to indices.
            merges: List of token pair merges in priority order.
        """
        self.vocab: Dict[str, int] = vocab or {}
        self.merges: List[List[str]] = merges or []
        # Merge pair -> rank; the lowest-ranked pair present in a word
        # is merged first.
        self.bpe_ranks: Dict[tuple, int] = {tuple(pair): i for i, pair in enumerate(self.merges)}
        # token -> tuple of sub-tokens.  Tuples are stored so a caller
        # mutating the list returned by bpe() cannot corrupt the cache.
        self.cache: Dict[str, tuple] = {}
        # Lazily built index -> token map for decode(); invalidated by load().
        self._inv_vocab: Optional[Dict[int, str]] = None

        # Words (\w+) or single non-space punctuation characters.
        self.pattern = re.compile(r"\w+|[^\w\s]", re.UNICODE)
        self.special_tokens = ["<PAD>", "<UNK>", "<BOS>", "<EOS>"]
        self._ensure_special_tokens()

    def _ensure_special_tokens(self) -> None:
        """Append any missing special tokens to the end of the vocabulary."""
        for token in self.special_tokens:
            if token not in self.vocab:
                self.vocab[token] = len(self.vocab)

    def get_vocab_size(self) -> int:
        """Return the number of entries in the vocabulary."""
        return len(self.vocab)

    def tokenize(self, text: str) -> List[str]:
        """
        Tokenize input text into a list of subword tokens using BPE.

        Args:
            text: Input string.

        Returns:
            List of subword tokens.
        """
        tokens: List[str] = []
        for word in self.pattern.findall(text):
            tokens.extend(self.bpe(word))
        return tokens

    def bpe(self, token: str) -> List[str]:
        """
        Perform Byte-Pair Encoding on a single token.

        Args:
            token: Token string (one word or punctuation mark).

        Returns:
            List of BPE sub-tokens (a fresh list; safe for the caller
            to mutate).
        """
        if token in self.cache:
            return list(self.cache[token])

        # End-of-word sentinel lets merges distinguish word-final symbols.
        word = list(token) + ["</w>"]
        pairs = self.get_pairs(word)

        while pairs:
            # Lowest-rank (highest-priority) pair present in the word.
            min_pair = min(pairs, key=lambda p: self.bpe_ranks.get(p, float("inf")))
            if min_pair not in self.bpe_ranks:
                break  # no remaining pair is a known merge
            first, second = min_pair
            new_word: List[str] = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    new_word.extend(word[i:])
                    break
                new_word.extend(word[i:j])
                if j < len(word) - 1 and word[j + 1] == second:
                    new_word.append(first + second)
                    i = j + 2
                else:
                    new_word.append(word[j])
                    i = j + 1
            word = new_word
            pairs = self.get_pairs(word)

        # Drop a standalone sentinel; a sentinel merged into the last
        # sub-token (e.g. "c</w>") is kept and handled by decode().
        if word and word[-1] == "</w>":
            word = word[:-1]
        self.cache[token] = tuple(word)
        return list(word)

    def get_pairs(self, word: List[str]) -> set:
        """
        Return the set of adjacent symbol pairs in a word.

        Args:
            word: List of symbols.

        Returns:
            Set of adjacent (left, right) pairs; empty for words with
            fewer than two symbols.
        """
        if len(word) < 2:
            return set()
        return set(zip(word, word[1:]))

    def encode(self, text: str) -> List[int]:
        """
        Tokenize text and convert the tokens to vocabulary indices.

        Args:
            text: Input string.

        Returns:
            List of token indices; unknown tokens map to the <UNK> index.
        """
        unk_index = self.vocab["<UNK>"]  # guaranteed by _ensure_special_tokens()
        return [self.vocab.get(token, unk_index) for token in self.tokenize(text)]

    def decode(self, indices: List[int]) -> str:
        """
        Convert token indices back to a string.

        Args:
            indices: List of token indices.

        Returns:
            Decoded string; unknown indices render as "<UNK>".
        """
        # Build the inverse map once and reuse it across calls.
        if self._inv_vocab is None:
            self._inv_vocab = {v: k for k, v in self.vocab.items()}
        inv_vocab = self._inv_vocab
        tokens = [inv_vocab.get(idx, "<UNK>") for idx in indices]
        # End-of-word markers become spaces.
        # NOTE(review): tokens only carry "</w>" if a merge attached it,
        # so word boundaries may be lost on decode — confirm merge table
        # includes "</w>" merges if exact round-trips are required.
        text = "".join(token.replace("</w>", " ") for token in tokens)
        return text.strip()

    def save(self, vocab_path: str, merges_path: str):
        """
        Save vocabulary and merges to JSON files.

        Args:
            vocab_path: Path for vocab JSON.
            merges_path: Path for merges JSON.
        """
        with open(vocab_path, "w", encoding="utf-8") as f:
            json.dump(self.vocab, f, indent=2)
        with open(merges_path, "w", encoding="utf-8") as f:
            json.dump(self.merges, f, indent=2)

    def load(self, vocab_path: str, merges_path: str):
        """
        Load vocabulary and merges from JSON files.

        Rebuilds merge ranks and invalidates the BPE and inverse-vocab
        caches; re-adds any missing special tokens so encode() always
        has a valid <UNK> index.

        Args:
            vocab_path: Path for vocab JSON.
            merges_path: Path for merges JSON.
        """
        with open(vocab_path, "r", encoding="utf-8") as f:
            self.vocab = json.load(f)
        with open(merges_path, "r", encoding="utf-8") as f:
            self.merges = json.load(f)
        self.bpe_ranks = {tuple(pair): i for i, pair in enumerate(self.merges)}
        self.cache = {}
        self._inv_vocab = None
        self._ensure_special_tokens()
189
+
190
def _demo() -> None:
    """Round-trip a sample string through a toy BPE tokenizer, printing each step."""
    toy_vocab = {
        "<PAD>": 0,
        "<UNK>": 1,
        "a": 2,
        "b": 3,
        "c": 4,
        "ab": 5,
        "bc": 6,
        "abc": 7,
        "</w>": 8,
    }
    toy_merges = [["a", "b"], ["b", "c"], ["ab", "c"]]
    demo_tokenizer = BPETokenizer(vocab=toy_vocab, merges=toy_merges)

    sample_text = "abc cab"
    print(f"Encoding text: {sample_text}")
    token_ids = demo_tokenizer.encode(sample_text)
    print(f"Encoded tokens: {token_ids}")
    round_trip = demo_tokenizer.decode(token_ids)
    print(f"Decoded text: '{round_trip}'")


if __name__ == "__main__":
    _demo()