ishanjmukherjee committed
Commit 8609610 · 1 Parent(s): 14f37f0

Fix tokenization decoding error


The model would generate token IDs between 255 and 511, which failed with the previous decoding method.
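
For context (not part of the commit), here is a minimal sketch of why IDs above 255 break a uint8-based decode; the vocabulary size and token IDs are made-up values for illustration. Forcing such IDs into a single byte wraps them modulo 256, which silently decodes to the wrong characters or produces invalid UTF-8.

import numpy as np

# Hypothetical IDs from a 512-entry vocab; 300 and 400 do not fit in one byte.
token_ids = [72, 101, 300, 400]

# Casting to uint8 (what the old _decode effectively did) wraps values modulo 256,
# so 300 -> 44 and 400 -> 144. (Recent NumPy versions may instead raise OverflowError
# when building a uint8 array directly from out-of-range Python ints.)
wrapped = np.asarray(token_ids).astype(np.uint8)
print(wrapped.tolist())  # [72, 101, 44, 144]

# 0x90 (144) is a lone UTF-8 continuation byte, so decoding the wrapped bytes fails.
try:
    wrapped.tobytes().decode("utf-8")
except UnicodeDecodeError as err:
    print(err)  # 'utf-8' codec can't decode byte 0x90 in position 3: invalid start byte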

Files changed (1)
  1. tokenizer.py +14 -6
tokenizer.py CHANGED
@@ -71,13 +71,21 @@ class ByteTokenizer(PreTrainedTokenizer):
 
     def _decode(self, token_ids: List[int], **kwargs) -> str:
 
-        indices = np.asarray(token_ids, dtype=np.uint8)
+        # Comment this out because tobytes() is interpreted literally
+        # indices = np.asarray(token_ids, dtype=np.uint8)
 
-        return (
-            indices.clip(min=32, max=self.vocab_size, out=indices)
-            .tobytes()
-            .decode('utf-8')
-        )
+        # return (
+        #     indices.clip(min=32, max=self.vocab_size, out=indices)
+        #     .tobytes()
+        #     .decode('utf-8')
+        # )
+
+        # Use the existing method to convert each ID back to its character.
+        # This method implicitly handles clamping based on vocab_size.
+        tokens = [self._convert_id_to_token(token_id) for token_id in token_ids]
+
+        # Join the resulting characters into a single string.
+        return "".join(tokens)
 
     def _encode_plus(self, text: str, **kwargs) -> BatchEncoding: