Thatphum committed on
Commit 00b915f · verified · 1 Parent(s): bb9e1af

Delete tokenization_qwen.py

Files changed (1)
  1. tokenization_qwen.py +0 -264
tokenization_qwen.py DELETED
@@ -1,264 +0,0 @@
- # Copyright (c) Alibaba Cloud.
- #
- # This source code is licensed under the license found in the
- # LICENSE file in the root directory of this source tree.
-
- """Tokenization classes for QWen."""
-
- import base64
- import logging
- import os
- import unicodedata
- from typing import Collection, Dict, List, Set, Tuple, Union
-
- import tiktoken
- from transformers import PreTrainedTokenizer, AddedToken
-
- logger = logging.getLogger(__name__)
-
-
- VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken"}
-
- PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
- ENDOFTEXT = "<|endoftext|>"
- IMSTART = "<|im_start|>"
- IMEND = "<|im_end|>"
- # as the default behavior is changed to allow special tokens in
- # regular texts, the surface forms of special tokens need to be
- # as different as possible to minimize the impact
- EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
- SPECIAL_TOKENS = (
-     ENDOFTEXT,
-     IMSTART,
-     IMEND,
- ) + EXTRAS
-
-
- def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
-     with open(tiktoken_bpe_file, "rb") as f:
-         contents = f.read()
-     return {
-         base64.b64decode(token): int(rank)
-         for token, rank in (line.split() for line in contents.splitlines() if line)
-     }
-
- class QWenTokenizer(PreTrainedTokenizer):
-     """QWen tokenizer."""
-
-     vocab_files_names = VOCAB_FILES_NAMES
-
-     def __init__(
-         self,
-         vocab_file,
-         errors="replace",
-         image_start_tag='<img>',
-         image_end_tag='</img>',
-         image_pad_tag='<imgpad>',
-         ref_start_tag='<ref>',
-         ref_end_tag='</ref>',
-         box_start_tag='<box>',
-         box_end_tag='</box>',
-         quad_start_tag='<quad>',
-         quad_end_tag='</quad>',
-         **kwargs,
-     ):
-         super().__init__(**kwargs)
-
-         self.image_start_tag = image_start_tag
-         self.image_end_tag = image_end_tag
-         self.image_pad_tag = image_pad_tag
-         self.ref_start_tag = ref_start_tag
-         self.ref_end_tag = ref_end_tag
-         self.box_start_tag = box_start_tag
-         self.box_end_tag = box_end_tag
-         self.quad_start_tag = quad_start_tag
-         self.quad_end_tag = quad_end_tag
-         self.IMAGE_ST = (
-             ref_start_tag, ref_end_tag,
-             box_start_tag, box_end_tag,
-             quad_start_tag, quad_end_tag,
-             image_start_tag, image_end_tag,
-             image_pad_tag
-         )
-
-         self.errors = errors  # how to handle errors in decoding
-
-         self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)  # type: dict[bytes, int]
-         self.special_tokens = {
-             token: index
-             for index, token in enumerate(
-                 SPECIAL_TOKENS + self.IMAGE_ST, start=len(self.mergeable_ranks)
-             )
-         }
-
-         self.img_start_id = self.special_tokens[self.image_start_tag]
-         self.img_end_id = self.special_tokens[self.image_end_tag]
-         self.img_pad_id = self.special_tokens[self.image_pad_tag]
-         self.ref_start_id = self.special_tokens[self.ref_start_tag]
-         self.ref_end_id = self.special_tokens[self.ref_end_tag]
-         self.box_start_id = self.special_tokens[self.box_start_tag]
-         self.box_end_id = self.special_tokens[self.box_end_tag]
-         self.quad_start_id = self.special_tokens[self.quad_start_tag]
-         self.quad_end_id = self.special_tokens[self.quad_end_tag]
-
-         enc = tiktoken.Encoding(
-             "Qwen",
-             pat_str=PAT_STR,
-             mergeable_ranks=self.mergeable_ranks,
-             special_tokens=self.special_tokens,
-         )
-         assert (
-             len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
-         ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"
-
-         self.decoder = {
-             v: k for k, v in self.mergeable_ranks.items()
-         }  # type: dict[int, bytes|str]
-         self.decoder.update({v: k for k, v in self.special_tokens.items()})
-
-         self.tokenizer = enc  # type: tiktoken.Encoding
-
-         self.eod_id = self.tokenizer.eot_token
-         self.im_start_id = self.special_tokens[IMSTART]
-         self.im_end_id = self.special_tokens[IMEND]
-
-     def __len__(self) -> int:
-         return self.tokenizer.n_vocab
-
-     def get_vocab(self) -> Dict[bytes, int]:
-         return self.mergeable_ranks
-
-     def convert_tokens_to_ids(
-         self, tokens: Union[bytes, str, List[Union[bytes, str]]]
-     ) -> List[int]:
-         ids = []
-         if isinstance(tokens, (str, bytes)):
-             if tokens in self.special_tokens:
-                 return self.special_tokens[tokens]
-             else:
-                 return self.mergeable_ranks.get(tokens)
-         for token in tokens:
-             if token in self.special_tokens:
-                 ids.append(self.special_tokens[token])
-             else:
-                 ids.append(self.mergeable_ranks.get(token))
-         return ids
-
-     def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
-         if not special_tokens and new_tokens:
-             raise ValueError('Adding regular tokens is not supported')
-         for token in new_tokens:
-             surface_form = token.content if isinstance(token, AddedToken) else token
-             if surface_form not in SPECIAL_TOKENS:
-                 raise ValueError('Adding unknown special tokens is not supported')
-         return 0
-
-     def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
-         """
-         Save only the vocabulary of the tokenizer (vocabulary).
-
-         Returns:
-             `Tuple(str)`: Paths to the files saved.
-         """
-         file_path = os.path.join(save_directory, "qwen.tiktoken")
-         with open(file_path, "w", encoding="utf8") as w:
-             for k, v in self.mergeable_ranks.items():
-                 line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
-                 w.write(line)
-         return (file_path,)
-
-     def tokenize(
-         self,
-         text: str,
-         allowed_special: Union[Set, str] = "all",
-         disallowed_special: Union[Collection, str] = (),
-         **kwargs,
-     ) -> List[Union[bytes, str]]:
-         """
-         Converts a string in a sequence of tokens.
-
-         Args:
-             text (`str`):
-                 The sequence to be encoded.
-             allowed_special (`Literal["all"]` or `set`):
-                 The surface forms of the tokens to be encoded as special tokens in regular texts.
-                 Default to "all".
-             disallowed_special (`Literal["all"]` or `Collection`):
-                 The surface forms of the tokens that should not be in regular texts and trigger errors.
-                 Default to an empty tuple.
-
-             kwargs (additional keyword arguments, *optional*):
-                 Will be passed to the underlying model specific encode method.
-
-         Returns:
-             `List[bytes|str]`: The list of tokens.
-         """
-         tokens = []
-         text = unicodedata.normalize("NFC", text)
-
-         # this implementation takes a detour: text -> token id -> token surface forms
-         for t in self.tokenizer.encode(
-             text, allowed_special=allowed_special, disallowed_special=disallowed_special
-         ):
-             tokens.append(self.decoder[t])
-         return tokens
-
-     def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
-         """
-         Converts a sequence of tokens in a single string.
-         """
-         text = ""
-         temp = b""
-         for t in tokens:
-             if isinstance(t, str):
-                 if temp:
-                     text += temp.decode("utf-8", errors=self.errors)
-                     temp = b""
-                 text += t
-             elif isinstance(t, bytes):
-                 temp += t
-             else:
-                 raise TypeError("token should only be of type bytes or str")
-         if temp:
-             text += temp.decode("utf-8", errors=self.errors)
-         return text
-
-     @property
-     def vocab_size(self):
-         return self.tokenizer.n_vocab
-
-     def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
-         """Converts an id to a token, special tokens included"""
-         if index in self.decoder:
-             return self.decoder[index]
-         raise ValueError("unknown ids")
-
-     def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
-         """Converts a token to an id using the vocab, special tokens included"""
-         if token in self.special_tokens:
-             return self.special_tokens[token]
-         if token in self.mergeable_ranks:
-             return self.mergeable_ranks[token]
-         raise ValueError("unknown token")
-
-     def _tokenize(self, text: str, **kwargs):
-         """
-         Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
-         vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
-
-         Do NOT take care of added tokens.
-         """
-         raise NotImplementedError
-
-     def _decode(
-         self,
-         token_ids: Union[int, List[int]],
-         skip_special_tokens: bool = False,
-         errors: str = None,
-         **kwargs,
-     ) -> str:
-         if isinstance(token_ids, int):
-             token_ids = [token_ids]
-         if skip_special_tokens:
-             token_ids = [i for i in token_ids if i < self.eod_id]
-         return self.tokenizer.decode(token_ids, errors=errors or self.errors)
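
For reference, a minimal usage sketch of the QWenTokenizer class defined in the deleted file above. It assumes a local copy of tokenization_qwen.py and a qwen.tiktoken vocabulary file (the name declared in VOCAB_FILES_NAMES); the path and sample text are illustrative only, not part of this repository.

# Hypothetical sketch: instantiate the deleted tokenizer class directly.
from tokenization_qwen import QWenTokenizer

tokenizer = QWenTokenizer(vocab_file="qwen.tiktoken")  # illustrative local path

text = "<|im_start|>user\nDescribe the image.<|im_end|>"
tokens = tokenizer.tokenize(text)              # token surface forms (bytes, or str for special tokens)
ids = tokenizer.convert_tokens_to_ids(tokens)  # integer ids from BPE ranks + special tokens
print(len(tokenizer))                          # vocab size reported by the tiktoken Encoding
print(tokenizer.decode(ids))                   # round-trips back to the original text

In practice such custom tokenizer code on the Hub is usually loaded through AutoTokenizer with trust_remote_code=True rather than imported directly; the direct construction above follows only the code visible in the diff.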