rawcell committed on
Commit
765cac2
·
verified ·
1 Parent(s): 8651e03

Upload tokenization_moonshot.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. tokenization_moonshot.py +301 -0
tokenization_moonshot.py ADDED
@@ -0,0 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tiktoken
3
+
4
+ from logging import getLogger
5
+ from pathlib import Path
6
+ from typing import (
7
+ cast,
8
+ Tuple,
9
+ Dict,
10
+ Iterator,
11
+ List,
12
+ Union,
13
+ Optional,
14
+ )
15
+ from shutil import copyfile
16
+ import numpy as np
17
+ from tiktoken.load import load_tiktoken_bpe
18
+ from tokenizers import AddedToken
19
+ from transformers import PreTrainedTokenizerFast
20
+ from transformers.tokenization_utils import PreTrainedTokenizer
21
+ from transformers.convert_slow_tokenizer import bytes_to_unicode
22
+
23
+
24
+
25
# Maps the `vocab_file` constructor argument to the file name used when the
# vocabulary is saved (see `TikTokenTokenizer.save_vocabulary`).
VOCAB_FILES_NAMES = {"vocab_file": "tiktoken.model"}

# Marker character stripped from joined tokens in `convert_tokens_to_string`.
SPIECE_UNDERLINE = "▁"

# Module-level logger, named after this module.
logger = getLogger(__name__)
28
+
29
class TikTokenTokenizer(PreTrainedTokenizer):
    """
    Tokenizing and encoding/decoding text using the Tiktoken tokenizer. See megatron/tokenizer/tiktoken_tokenizer.py.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            The path to the Tiktoken model file.
        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[BOS]"`):
            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[EOS]"`):
            The end of sequence token.
        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[UNK]"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[PAD]"`):
            The token used for padding, for example when batching sequences of different lengths.
        additional_special_tokens (list of `str`, *optional*):
            A tuple or a list of additional tokens, which will be marked as `special`, meaning that they will be
            skipped when decoding if `skip_special_tokens` is set to `True`. When omitted, the five `<|im_*|>`
            chat markers listed in `__init__` are used.
        added_tokens_decoder (`dict`, *optional*):
            Mapping from token ID to an object exposing a `.content` string (for example `tokenizers.AddedToken`).
            It names the special-token slots placed after the base BPE vocabulary; slots without an entry are
            named `<|reserved_token_{id}|>`. It must contain `bos_token`, `eos_token`, `unk_token` and
            `pad_token`, otherwise construction fails with a `KeyError`.
    """

    vocab_files_names = VOCAB_FILES_NAMES

    model_input_names = ["input_ids", "attention_mask"]

    # Special-token text -> token ID; populated per instance in `__init__`.
    special_tokens: Dict[str, int]

    # Number of special-token slots appended after the base vocabulary
    # (`__init__` allocates this many plus two).
    num_reserved_special_tokens = 256

    # Pre-tokenization pattern handed to `tiktoken.Encoding`; alternatives are tried in order.
    pat_str = "|".join(
        [
            r"""[\p{Han}]+""",
            r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
            r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
            r"""\p{N}{1,3}""",
            r""" ?[^\s\p{L}\p{N}]+[\r\n]*""",
            r"""\s*[\r\n]+""",
            r"""\s+(?!\S)""",
            r"""\s+""",
        ]
    )

    def __init__(
        self,
        vocab_file,
        bos_token: Union[str, AddedToken] = "[BOS]",
        eos_token: Union[str, AddedToken] = "[EOS]",
        unk_token: Union[str, AddedToken] = "[UNK]",
        pad_token: Union[str, AddedToken] = "[PAD]",
        additional_special_tokens: Optional[List[str]] = None,
        added_tokens_decoder: Optional[dict] = None,
        **kwargs,
    ):
        """
        Load the BPE ranks from `vocab_file`, build the tiktoken encoding and the
        token <-> ID tables, then initialise the `PreTrainedTokenizer` base class.

        Raises:
            AssertionError: if `vocab_file` is not an existing file.
            KeyError: if one of `bos_token`, `eos_token`, `pad_token`, `unk_token` has no
                entry in `added_tokens_decoder`.
        """
        assert os.path.isfile(vocab_file), vocab_file
        if additional_special_tokens is None:
            additional_special_tokens = [
                "<|im_end|>",
                "<|im_middle|>",
                "<|im_user|>",
                "<|im_assistant|>",
                "<|im_system|>",
            ]
        # The parameter defaults to None; treat that as "no named special tokens"
        # instead of failing with a TypeError while iterating over None.
        if added_tokens_decoder is None:
            added_tokens_decoder = {}
        special_tokens_mapping = {
            token_id: token.content for token_id, token in added_tokens_decoder.items()
        }

        self.vocab_file = vocab_file
        mergeable_ranks = load_tiktoken_bpe(vocab_file)
        num_base_tokens = len(mergeable_ranks)
        # Special tokens occupy the ID range directly after the base vocabulary.
        # Slots not named by `added_tokens_decoder` get a placeholder name.
        self.special_tokens = {
            special_tokens_mapping.get(i, f"<|reserved_token_{i}|>"): i
            for i in range(num_base_tokens, num_base_tokens + self.num_reserved_special_tokens + 2)
        }

        self.model = tiktoken.Encoding(
            name=Path(vocab_file).name,
            pat_str=self.pat_str,
            mergeable_ranks=mergeable_ranks,
            special_tokens=self.special_tokens,
        )
        logger.info(f"Reloaded tiktoken model from {vocab_file}")

        def special_id(token: Union[str, AddedToken]) -> int:
            # Same KeyError as a plain lookup, but with a message naming the cause.
            text = str(token)
            try:
                return self.special_tokens[text]
            except KeyError:
                raise KeyError(
                    f"special token {text!r} has no ID; it must be present in `added_tokens_decoder`"
                ) from None

        self.n_words: int = self.model.n_vocab
        # BOS / EOS token IDs
        self.bos_id: int = special_id(bos_token)
        self.eos_id: int = special_id(eos_token)
        logger.info(
            f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
        )

        self.pad_id: int = special_id(pad_token)
        self.unk_id: int = special_id(unk_token)

        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}

        # ID -> printable token string: every raw byte of the token is mapped to one
        # character through `byte_encoder`.
        # Taken from https://gist.github.com/xenova/a452a6474428de0182b17605a98631ee
        byte_encoder = self.byte_encoder
        token_bytes = self.model.decode_single_token_bytes
        self.decoder = {}
        for token_id in range(self.n_words):
            self.decoder[token_id] = "".join(byte_encoder[b] for b in token_bytes(token_id))

        # Printable token string -> ID (inverse of `decoder`).
        self.encoder = {token: token_id for token_id, token in self.decoder.items()}

        # `encoder` / `decoder` are filled before this call, as in the original ordering.
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            additional_special_tokens=additional_special_tokens,
            **kwargs,
        )
        self.all_special_ids_set = set(self.all_special_ids)

    def encode(
        self,
        text: str,
        allow_special_tokens: bool = True,
        **kwargs
    ) -> List[int]:
        """
        Encodes a string into a list of token IDs.

        Args:
            text (str): The input string to be encoded.
            allow_special_tokens (bool): When `True`, special-token text found in `text` is
                encoded as the corresponding special token ID. When `False`, it is encoded
                like ordinary text. Ignored when extra keyword arguments are given.
            **kwargs: If any are present, the call is delegated to
                `PreTrainedTokenizer.encode(text, **kwargs)`.

        Returns:
            list[int]: A list of token IDs.
        """
        # If there are other args, we should call super().encode because there is a lot of code
        # to handle those args. super().encode finally will call _tokenize and _convert_token_to_id.
        if kwargs:
            return super().encode(text, **kwargs)

        assert isinstance(text, str), type(text)

        # The tiktoken tokenizer can handle <=400k chars without
        # pyo3_runtime.PanicException.
        TIKTOKEN_MAX_ENCODE_CHARS = 400_000

        # https://github.com/openai/tiktoken/issues/195
        # Here we iterate over subsequences and split if we exceed the limit
        # of max consecutive non-whitespace or whitespace characters.
        MAX_NO_WHITESPACES_CHARS = 25_000

        if allow_special_tokens:
            # Recognise every registered special token in the text.
            special_handling = {"allowed_special": "all"}
        else:
            # Nothing is disallowed and nothing is allowed: special-token text is
            # encoded as plain text instead of raising.
            special_handling = {"disallowed_special": ()}

        token_ids: List[int] = []
        for start in range(0, len(text), TIKTOKEN_MAX_ENCODE_CHARS):
            chunk = text[start : start + TIKTOKEN_MAX_ENCODE_CHARS]
            for piece in self._split_whitespaces_or_nonwhitespaces(chunk, MAX_NO_WHITESPACES_CHARS):
                token_ids.extend(self.model.encode(piece, **special_handling))
        return token_ids

    def decode(
        self,
        token_ids: Union[int, List[int]],
        **kwargs
    ) -> str:
        """
        Decodes a list of token IDs into a string.

        Args:
            token_ids (int or List[int]): A single token ID or the list of token IDs to be decoded.
            **kwargs: If any are present, the call is delegated to
                `PreTrainedTokenizer.decode(token_ids, **kwargs)`.

        Returns:
            str: The decoded string.
        """
        # If there are other args, we should call super().decode because there is a lot of code
        # to handle those args. super().decode finally will call convert_tokens_to_string and _convert_id_to_token.
        if kwargs:
            return super().decode(token_ids, **kwargs)

        if isinstance(token_ids, int):
            token_ids = [token_ids]

        return self.model.decode(cast(List[int], token_ids))

    @staticmethod
    def _split_whitespaces_or_nonwhitespaces(
        s: str, max_consecutive_slice_len: int
    ) -> Iterator[str]:
        """
        Splits the string `s` so that each substring contains no more than `max_consecutive_slice_len`
        consecutive whitespaces or consecutive non-whitespaces.

        The yielded pieces concatenate back to `s`; an empty `s` yields a single empty string.
        """
        current_slice_len = 0
        current_slice_is_space = s[0].isspace() if s else False
        slice_start = 0

        for i, char in enumerate(s):
            is_now_space = char.isspace()

            if current_slice_is_space ^ is_now_space:
                # Switched between whitespace and non-whitespace: start a new run.
                current_slice_len = 1
                current_slice_is_space = is_now_space
            else:
                current_slice_len += 1
                if current_slice_len > max_consecutive_slice_len:
                    # The run got too long: cut just before the current character.
                    yield s[slice_start:i]
                    slice_start = i
                    current_slice_len = 1
        yield s[slice_start:]

    # ----- Below are the abstract methods required by PreTrainedTokenizer -----

    @property
    def vocab_size(self) -> int:
        """Total number of token IDs known to the tiktoken encoding (`n_vocab`)."""
        return self.n_words

    def get_vocab(self) -> Dict[str, int]:
        """Return the token-string -> ID table (the internal dict itself, not a copy)."""
        return self.encoder

    def _tokenize(self, text: str, **kwargs) -> List[str]:
        """Encode `text` and return the printable token string for each resulting ID."""
        return [self.decoder[token_id] for token_id in self.encode(text)]

    def _convert_token_to_id(self, token: str) -> int:
        """Return the ID of `token`, or `unk_id` when the token is not in the vocabulary."""
        return self.encoder.get(token, self.unk_id)

    def _convert_id_to_token(self, index: int) -> Optional[str]:
        """Return the printable token string for `index`, or `None` for an unknown ID."""
        return self.decoder.get(index)

    @staticmethod
    def clean_up_tokenization(out_string: str) -> str:
        """Return `out_string` unchanged (no clean-up is applied)."""
        return out_string

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """
        Join printable token strings and turn them back into text.

        `SPIECE_UNDERLINE` characters are dropped, every remaining character is mapped back
        to its byte through `byte_decoder`, and the bytes are decoded as UTF-8 with invalid
        sequences replaced.
        """
        joined = "".join(tokens).replace(SPIECE_UNDERLINE, "")
        raw = bytearray(self.byte_decoder[char] for char in joined)
        return raw.decode("utf-8", "replace")

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Copy the tiktoken model file into `save_directory`.

        Args:
            save_directory (str): Existing directory to write into.
            filename_prefix (str, *optional*): If given, prepended to the file name with a `-`.

        Returns:
            A one-element tuple with the target path. If `save_directory` is not a directory,
            an error is logged and `None` is returned instead.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        # Skip the copy when source and target are the same path or the source is gone.
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)