Commit
·
8609610
1
Parent(s):
14f37f0
Fix tokenization decoding error
Browse files
The model would generate token IDs between 255-511, which failed with the previous decoding method
- tokenizer.py +14 -6
tokenizer.py
CHANGED
|
@@ -71,13 +71,21 @@ class ByteTokenizer(PreTrainedTokenizer):
|
|
| 71 |
|
| 72 |
def _decode(self, token_ids: List[int], **kwargs) -> str:
|
| 73 |
|
| 74 |
-
|
|
|
|
| 75 |
|
| 76 |
-
return (
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
|
| 82 |
def _encode_plus(self, text: str, **kwargs) -> BatchEncoding:
|
| 83 |
|
|
|
|
| 71 |
|
| 72 |
def _decode(self, token_ids: List[int], **kwargs) -> str:
    """Decode a sequence of token IDs back into a string.

    Each ID is mapped to its character via ``self._convert_id_to_token``,
    which implicitly handles clamping based on ``vocab_size``, and the
    characters are joined into one string.

    Args:
        token_ids: Token IDs to decode.
        **kwargs: Ignored; accepted for signature compatibility with the
            base tokenizer's ``_decode``.

    Returns:
        The decoded text.
    """
    # NOTE: an earlier numpy-based implementation
    # (np.asarray(token_ids, dtype=np.uint8).clip(min=32, max=vocab_size)
    #  .tobytes().decode('utf-8')) was removed: np.uint8 silently truncates
    # IDs in the 256-511 range, which broke decoding for those tokens.
    return "".join(self._convert_id_to_token(token_id) for token_id in token_ids)
|
| 89 |
|
| 90 |
def _encode_plus(self, text: str, **kwargs) -> BatchEncoding:
|
| 91 |
|