xiaoyi1734 committed on
Commit
e67549b
·
verified ·
1 Parent(s): d6fdb78

Upload Kimi-Audio-Reaction/tokenization_kimia.py with huggingface_hub

Browse files
Kimi-Audio-Reaction/tokenization_kimia.py ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2
+
3
+ """Megatron tokenizers."""
4
+ from transformers.tokenization_utils import PreTrainedTokenizer
5
+ from typing import Union
6
+ from typing import (
7
+ AbstractSet,
8
+ cast,
9
+ Collection,
10
+ Dict,
11
+ Iterator,
12
+ List,
13
+ Literal,
14
+ Sequence,
15
+ Union,
16
+ Optional,
17
+ )
18
+ from tiktoken.load import load_tiktoken_bpe
19
+ import tiktoken
20
+ from pathlib import Path
21
+ import os
22
+ import logging
23
+ from tokenizers import AddedToken
24
+
25
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Maps the `vocab_file` constructor argument to its on-disk file name.
# Assigned to `vocab_files_names` on the tokenizer class below.
VOCAB_FILES_NAMES = {"vocab_file": "tiktoken.model"}
27
+
28
+
29
class TikTokenTokenizer(PreTrainedTokenizer):
    """
    Tokenizing and encoding/decoding text using the Tiktoken tokenizer.

    The tokenizer wraps a `tiktoken.Encoding` built from a BPE rank file plus
    a fixed block of `num_reserved_special_tokens` special tokens whose ids
    directly follow the base vocabulary.

    NOTE(review): `__init__` does not call `PreTrainedTokenizer.__init__`, and
    `encode` requires keyword-only `bos`/`eos` arguments. Confirm this is
    intended before relying on base-class behaviour.
    """

    # Special token string -> token id; populated in `__init__`.
    special_tokens: Dict[str, int]

    # Total number of special-token ids appended after the base vocabulary.
    num_reserved_special_tokens = 293 + 128

    # Pre-tokenization split pattern handed to `tiktoken.Encoding`.
    pat_str = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"

    vocab_files_names = VOCAB_FILES_NAMES

    def __init__(
        self,
        vocab_file,
        bos_token: Union[str, AddedToken] = "[BOS]",
        eos_token: Union[str, AddedToken] = "[EOS]",
        unk_token: Union[str, AddedToken] = "[UNK]",
        pad_token: Union[str, AddedToken] = "[PAD]",
        additional_special_tokens: Optional[List[str]] = None,
        added_tokens_decoder: Optional[dict] = None,
        **kwargs,
    ):
        """
        Initializes the Tokenizer with a Tiktoken model.

        Args:
            vocab_file (str): The path to the Tiktoken model file.
            bos_token, eos_token, unk_token, pad_token,
            additional_special_tokens, added_tokens_decoder, **kwargs:
                Accepted but not read by this constructor. BOS/EOS are always
                "[BOS]"/"[EOS]"; PAD and UNK are the last and second-to-last
                special tokens respectively.

        Raises:
            AssertionError: If `vocab_file` is not an existing file.
        """
        assert os.path.isfile(vocab_file), vocab_file

        mergeable_ranks = load_tiktoken_bpe(vocab_file)
        num_base_tokens = len(mergeable_ranks)

        # Explicitly named special tokens. Their ids are assigned by list
        # position, so the order must not change.
        # NOTE(review): the trailing numeric comments are carried over as-is;
        # they are not the list indices and their meaning is unconfirmed.
        used_special_tokens = [
            "[BOS]",
            "[EOS]",
            "<|im_msg_end|>",  # 0
            "<|im_user_msg_start|>",  # 1
            "<|im_assistant_msg_start|>",  # 2
            "<|reserved_token_0|>",  # 3
            "<|reserved_token_1|>",
            "<|reserved_token_2|>",
            "<|reserved_token_3|>",  # 4
            "[EOT]",
            "<|reserved_token_4|>",  # 5
            "<|reserved_token_5|>",  # 6
            "<|reserved_token_6|>",  # 7
            "<|reserved_token_7|>",  # 8
            "<|reserved_token_8|>",  # 9
            "<|reserved_token_9|>",  # 10
            "<|reserved_token_10|>",  # 11
            "<|reserved_token_11|>",  # 12
            "<|im_media_begin|>",  # 13
            "<|reserved_token_12|>",  # 14
            "<|im_media_end|>",  # 15
            "<|reserved_token_13|>",  # 16
            "<|reserved_token_14|>",  # 17
            "<|im_kimia_text_blank|>",  # 18
            "<|im_kimia_text_eos|>",  # 19
            "<|reserved_token_15|>",  # 20
            "<|reserved_token_16|>",  # 21
            "<|im_kimia_user_msg_start|>",  # 22
            "<|im_kimia_assistant_msg_start|>",  # 23
            "<|reserved_token_17|>",  # 24
            "<|reserved_token_18|>",  # 25
            "<|reserved_token_19|>",  # 26
            "<|im_kimia_speech_ct_id|>",  # 27
            "<|im_kimia_speech_ctd_id|>",  # 28
        ]
        # Fill the rest of the reserved block with numbered placeholders,
        # continuing after <|reserved_token_19|>, so the total number of
        # special tokens equals `num_reserved_special_tokens`.
        autoset_special_tokens = [
            f"<|reserved_token_{i}|>"
            for i in range(
                20, self.num_reserved_special_tokens - len(used_special_tokens) + 20
            )
        ]
        special_tokens = used_special_tokens + autoset_special_tokens
        # Special-token ids start right after the last base-vocabulary id.
        self.special_tokens = {
            token: num_base_tokens + i for i, token in enumerate(special_tokens)
        }
        self.model = tiktoken.Encoding(
            name=Path(vocab_file).name,
            pat_str=self.pat_str,
            mergeable_ranks=mergeable_ranks,
            special_tokens=self.special_tokens,
        )
        logger.info(f"Reloaded tiktoken model from {vocab_file}")

        self.n_words: int = self.model.n_vocab
        # BOS / EOS token IDs
        self.bos_token = "[BOS]"
        self.bos_id: int = self.special_tokens["[BOS]"]
        self.eos_token = "[EOS]"
        self.eos_id: int = self.special_tokens["[EOS]"]

        # use last special token as pad token, the last - 1 is unk_token
        self.pad_token: str = special_tokens[-1]
        self.pad_id: int = self.special_tokens[self.pad_token]

        self.unk_token: str = special_tokens[-2]
        # Look up the id with the unk token itself. This was previously keyed
        # by `self.pad_token`, which made `unk_id` equal to `pad_id`.
        self.unk_id: int = self.special_tokens[self.unk_token]

        # Ids of "[EOS]" and "[EOT]".
        self.stop_tokens = {
            self.special_tokens["[EOS]"],
            self.special_tokens["[EOT]"],
        }

        logger.info(
            f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
        )

    def encode(
        self,
        s: str,
        *,
        bos: bool,
        eos: bool,
        allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),
        disallowed_special: Union[Literal["all"], Collection[str]] = (),
    ) -> List[int]:
        """
        Encodes a string into a list of token IDs.

        Args:
            s (str): The input string to be encoded.
            bos (bool): Whether to prepend the beginning-of-sequence token.
            eos (bool): Whether to append the end-of-sequence token.
            allowed_special ("all"|set[str]): allowed special tokens in string
            disallowed_special ("all"|set[str]): special tokens that raise an error when in string

        Returns:
            list[int]: A list of token IDs.

        Raises:
            AssertionError: If `s` is not exactly of type `str`.

        By default, setting disallowed_special=() encodes a string by ignoring
        special tokens. Specifically:
        - Setting `disallowed_special` to () will cause all text corresponding
          to special tokens to be encoded as natural text (instead of raising
          an error).
        - Setting `allowed_special` to "all" will treat all text corresponding
          to special tokens to be encoded as special tokens.
        """
        # The default `allowed_special` set is only forwarded, never mutated.
        assert type(s) is str

        # The tiktoken tokenizer can handle <=400k chars without
        # pyo3_runtime.PanicException.
        TIKTOKEN_MAX_ENCODE_CHARS = 400_000

        # https://github.com/openai/tiktoken/issues/195
        # Here we iterate over subsequences and split if we exceed the limit
        # of max consecutive non-whitespace or whitespace characters.
        MAX_NO_WHITESPACES_CHARS = 25_000

        # Lazily produce the pieces: fixed-size character windows first, then
        # each window split on over-long whitespace / non-whitespace runs.
        substrs = (
            substr
            for i in range(0, len(s), TIKTOKEN_MAX_ENCODE_CHARS)
            for substr in self._split_whitespaces_or_nonwhitespaces(
                s[i : i + TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS
            )
        )
        t: List[int] = []
        for substr in substrs:
            t.extend(
                self.model.encode(
                    substr,
                    allowed_special=allowed_special,
                    disallowed_special=disallowed_special,
                )
            )
        if bos:
            t.insert(0, self.bos_id)
        if eos:
            t.append(self.eos_id)
        return t

    def decode(self, t: Sequence[int]) -> str:
        """
        Decodes a list of token IDs into a string.

        Args:
            t (List[int]): The list of token IDs to be decoded.

        Returns:
            str: The decoded string.
        """
        # Typecast is safe here. Tiktoken doesn't do anything list-related with the sequence.
        return self.model.decode(cast(List[int], t))

    @staticmethod
    def _split_whitespaces_or_nonwhitespaces(
        s: str, max_consecutive_slice_len: int
    ) -> Iterator[str]:
        """
        Splits the string `s` so that each substring contains no more than `max_consecutive_slice_len`
        consecutive whitespaces or consecutive non-whitespaces.

        The yielded pieces concatenate back to `s`. For an empty `s` a single
        empty string is yielded.
        """
        current_slice_len = 0
        current_slice_is_space = s[0].isspace() if len(s) > 0 else False
        slice_start = 0

        for i, char in enumerate(s):
            is_now_space = char.isspace()

            if current_slice_is_space ^ is_now_space:
                # The run type flipped: start counting a new run.
                current_slice_len = 1
                current_slice_is_space = is_now_space
            else:
                current_slice_len += 1
                if current_slice_len > max_consecutive_slice_len:
                    # The run is too long: cut before this character.
                    yield s[slice_start:i]
                    slice_start = i
                    current_slice_len = 1
        yield s[slice_start:]

    # ----- Below are the abstract methods required by megatron -----

    @property
    def vocab_size(self):
        """Return the vocabulary size reported by the tiktoken model (`n_vocab`)."""
        return self.n_words

    @property
    def vocab(self):
        """
        Return a `str -> id` view of the vocabulary, including special tokens.

        The mapping is built on first access and cached on the instance as
        `str_vocab`. Byte sequences that are not valid UTF-8 are rendered with
        `backslashreplace` plus a `_unicode_` suffix.

        Raises:
            AssertionError: If the number of distinct string keys differs from
                `vocab_size`.
        """
        if hasattr(self, "str_vocab"):
            return self.str_vocab

        # Build into a local dict and publish it only after the size check,
        # so a failed check never leaves a partial mapping cached.
        str_vocab = {}

        # convert mergeable_ranks from bytes to string
        utf8_num, unicode_num = 0, 0
        for byte_key, index in self.model._mergeable_ranks.items():
            try:
                str_key = byte_key.decode("utf-8")
            except UnicodeDecodeError:
                # use backslashreplace so we can get num vocab different tokens
                # see: https://docs.python.org/3/howto/unicode.html
                # this vocab is only used for offline processing, so this is fine
                str_key = byte_key.decode("utf-8", "backslashreplace") + "_unicode_"
                unicode_num += 1
            else:
                utf8_num += 1

            str_vocab[str_key] = index
        logger.info(f"num utf8: {utf8_num}, num unicode: {unicode_num}")

        # add all special tokens to the dictionary
        str_vocab.update(self.model._special_tokens)

        assert len(str_vocab) == self.vocab_size
        self.str_vocab = str_vocab
        return self.str_vocab

    @property
    def inv_vocab(self):
        """Return a freshly built `id -> str` mapping, the inverse of `vocab`."""
        return {v: k for k, v in self.vocab.items()}

    def tokenize(self, text, eos=True):
        """
        Encode `text` into token ids, always prepending BOS.

        Args:
            text (str): The text to encode.
            eos (bool): Whether to append the EOS id.

        Returns:
            list[int]: The token ids.
        """
        # BOS: always add bos token
        # EOS:
        # Most cases should be true when we are tokenizing a full sequence
        # Only set to false when we are running inference
        return self.encode(text, bos=True, eos=eos)

    def detokenize(self, tokens):
        """
        Decode token ids back into a string.

        Args:
            tokens: A list of ids, an object exposing `.tolist()` (e.g. a
                tensor or array), or any other iterable of ids.

        Returns:
            str: The decoded string.
        """
        # convert tensor to list if needed...
        if not isinstance(tokens, list):
            # Plain sequences such as tuples have no `.tolist()`.
            tokens = tokens.tolist() if hasattr(tokens, "tolist") else list(tokens)
        return self.decode(tokens)

    @property
    def eod(self):
        """Return the end-of-document id, which is the EOS id."""
        return self.eos_id

    # NOTE(review): unlike `eod`, this is a plain method, not a property.
    # Left unchanged so existing `bod()` callers keep working.
    def bod(self):
        """Return the beginning-of-document id, which is the BOS id."""
        return self.bos_id

    # NOTE(review): nothing in this class assigns `msk_start` or `msk_end`, so
    # the two properties below fail unless the attributes are set externally.
    @property
    def msk_start_id(self):
        """Return `self.msk_start`."""
        return self.msk_start

    @property
    def msk_end_id(self):
        """Return `self.msk_end`."""
        return self.msk_end

    def _get_index_2_bytes(self):
        """
        Return a list mapping token id to the token's length in bytes.

        The table is built on first call and cached on the instance as
        `index_2_bytes`. Every special token is counted as one byte.
        """
        if hasattr(self, "index_2_bytes"):
            return self.index_2_bytes

        # use array rather than dict for faster access
        index_2_bytes = [0] * self.model.n_vocab
        for byte_key, index in self.model._mergeable_ranks.items():
            index_2_bytes[index] = len(byte_key)

        for index in self.model._special_tokens.values():
            # Special tokens have no byte representation in the rank table,
            # so each is counted as a single byte.
            index_2_bytes[index] = 1

        self.index_2_bytes = index_2_bytes
        return self.index_2_bytes

    def get_array_bytes(self, array):
        """Return the summed per-token byte lengths for the ids in `array`."""
        index_2_bytes = self._get_index_2_bytes()
        return sum(index_2_bytes[i] for i in array)

    @property
    def eos_token_id(self):
        """Return the EOS id."""
        return self.eos_id

    @property
    def pad_token_id(self):
        """Return the PAD id."""
        return self.pad_id