Ilikemechuri committed on
Commit
c38f274
·
verified ·
1 Parent(s): 29ed536

Update tokenization_rwkv7.py

Browse files
Files changed (1) hide show
  1. tokenization_rwkv7.py +11 -3
tokenization_rwkv7.py CHANGED
@@ -91,9 +91,17 @@ class Rwkv7Tokenizer(PreTrainedTokenizer):
91
  return {repr(v): k for k, v in self.idx2bytes.items()}
92
 
93
  def _tokenize(self, text: str) -> List[str]:
94
- # str token id 목록 repr str 목록 (HuggingFace 내부 형식)
95
- ids = self._encode_bytes(text.encode("utf-8"))
96
- return [repr(self.idx2bytes[i]) for i in ids]
 
 
 
 
 
 
 
 
97
 
98
  def _convert_token_to_id(self, token: str) -> int:
99
  token_bytes = eval(token) if token.startswith("b'") or token.startswith("b\"") else token.encode("utf-8")
 
91
  return {repr(v): k for k, v in self.idx2bytes.items()}
92
 
93
  def _tokenize(self, text: str) -> List[str]:
94
+ # [수정] 문자열 입력 리스트 반환 보장
95
+ if not text:
96
+ return []
97
+
98
+ try:
99
+ ids = self._encode_bytes(text.encode("utf-8"))
100
+ except Exception:
101
+ return []
102
+
103
+ # [수정] 혹시라도 결과가 None이 섞이지 않도록 처리
104
+ return [repr(self.idx2bytes.get(i, b"<unk>")) for i in ids]
105
 
106
  def _convert_token_to_id(self, token: str) -> int:
107
  token_bytes = eval(token) if token.startswith("b'") or token.startswith("b\"") else token.encode("utf-8")