Revert convert_tokens_to_string
Browse files — tokenization_qwen.py (+6 −5)
tokenization_qwen.py
CHANGED
|
@@ -198,15 +198,16 @@ class QWenTokenizer(PreTrainedTokenizer):
|
|
| 198 |
|
| 199 |
return tokens
|
| 200 |
|
| 201 |
-
def convert_tokens_to_string(self, tokens: List[
|
| 202 |
"""
|
| 203 |
Converts a sequence of tokens in a single string. The most simple way to do it is `" ".join(tokens)` but we
|
| 204 |
often want to remove sub-word tokenization artifacts at the same time.
|
| 205 |
"""
|
| 206 |
-
text =
|
| 207 |
-
for
|
| 208 |
-
|
| 209 |
-
|
|
|
|
| 210 |
|
| 211 |
@property
|
| 212 |
def vocab_size(self):
|
|
|
|
| 198 |
|
| 199 |
return tokens
|
| 200 |
|
| 201 |
def convert_tokens_to_string(self, tokens: List[str]) -> str:
    """
    Converts a sequence of tokens in a single string. The most simple way to do it is `" ".join(tokens)` but we
    often want to remove sub-word tokenization artifacts at the same time.
    """
    # Merge the byte-level token pieces into one surrogate string, then map
    # each character back to its raw byte via the tokenizer's byte decoder.
    merged = "".join(tokens)
    raw = bytearray(self.byte_decoder[ch] for ch in merged)
    # Decode the recovered bytes as UTF-8, using the configured error policy.
    return raw.decode("utf-8", errors=self.errors)
|
| 211 |
|
| 212 |
@property
|
| 213 |
def vocab_size(self):
|