Update tokenization_rwkv7.py
Browse files- tokenization_rwkv7.py +11 -3
tokenization_rwkv7.py
CHANGED
|
@@ -91,9 +91,17 @@ class Rwkv7Tokenizer(PreTrainedTokenizer):
|
|
| 91 |
return {repr(v): k for k, v in self.idx2bytes.items()}
|
| 92 |
|
| 93 |
def _tokenize(self, text: str) -> List[str]:
|
| 94 |
-
#
|
| 95 |
-
|
| 96 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
|
| 98 |
def _convert_token_to_id(self, token: str) -> int:
|
| 99 |
token_bytes = eval(token) if token.startswith("b'") or token.startswith("b\"") else token.encode("utf-8")
|
|
|
|
| 91 |
return {repr(v): k for k, v in self.idx2bytes.items()}
|
| 92 |
|
| 93 |
def _tokenize(self, text: str) -> List[str]:
|
| 94 |
+
# [Fix] Guarantee that an empty list is returned for empty-string input
|
| 95 |
+
if not text:
|
| 96 |
+
return []
|
| 97 |
+
|
| 98 |
+
try:
|
| 99 |
+
ids = self._encode_bytes(text.encode("utf-8"))
|
| 100 |
+
except Exception:
|
| 101 |
+
return []
|
| 102 |
+
|
| 103 |
+
# [Fix] Guard against None ever ending up in the result
|
| 104 |
+
return [repr(self.idx2bytes.get(i, b"<unk>")) for i in ids]
|
| 105 |
|
| 106 |
def _convert_token_to_id(self, token: str) -> int:
|
| 107 |
token_bytes = eval(token) if token.startswith("b'") or token.startswith("b\"") else token.encode("utf-8")
|