yehaochen RangiLyu committed on
Commit
5d5ba41
·
verified ·
1 Parent(s): d790aca

fix out of vocab token (#5)

Browse files

- fix out of vocab token (a6c5212a2fb00470a8e6407f7c0def425709d024)


Co-authored-by: RangiLyu <RangiLyu@users.noreply.huggingface.co>

Files changed (1) hide show
  1. tokenization_interns1.py +3 -1
tokenization_interns1.py CHANGED
@@ -893,7 +893,9 @@ class InternS1Tokenizer(Qwen2Tokenizer):
893
 
894
  def convert_tokens_to_string(self, tokens):
895
  """Converts a sequence of tokens (string) in a single string."""
896
- text = "".join(tokens)
 
 
897
  text = text.replace(
898
  "▁", "Ġ"
899
  ) # This discrepancy stems from differing whitespace treatment in SentencePiece versus BPE tokenization.
 
893
 
894
  def convert_tokens_to_string(self, tokens):
895
  """Converts a sequence of tokens (string) in a single string."""
896
+ text = ""
897
+ for token in tokens:
898
+ text += token if token else ""
899
  text = text.replace(
900
  "▁", "Ġ"
901
  ) # This discrepancy stems from differing whitespace treatment in SentencePiece versus BPE tokenization.