Upload folder using huggingface_hub
Browse files- tokenization_df_arc.py +15 -0
tokenization_df_arc.py
CHANGED
|
@@ -213,7 +213,22 @@ class DFArcTokenizer(PreTrainedTokenizerFast):
|
|
| 213 |
text = self.phrase_helper.merge_phrases(text)
|
| 214 |
return super().encode(text, *args, **kwargs)
|
| 215 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
def convert_tokens_to_string(self, tokens: List[str]) -> str:
|
|
|
|
| 217 |
"""
|
| 218 |
Converts a sequence of tokens (string) in a single string.
|
| 219 |
"""
|
|
|
|
| 213 |
text = self.phrase_helper.merge_phrases(text)
|
| 214 |
return super().encode(text, *args, **kwargs)
|
| 215 |
|
| 216 |
+
def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=None, **kwargs):
    """
    Decode token ids into a readable string via ``convert_tokens_to_string``.

    Overrides the fast-tokenizer ``decode`` so that this tokenizer's custom
    ``convert_tokens_to_string`` logic is always used instead of the backend
    tokenizer's native decoder.

    Args:
        token_ids: A single token id (``int``) or a sequence of token ids.
        skip_special_tokens: Whether to drop special tokens during the
            id-to-token conversion.
        clean_up_tokenization_spaces: Whether to apply the standard
            tokenization-space cleanup to the decoded text. When ``None``,
            falls back to the instance-level setting, matching the upstream
            ``PreTrainedTokenizerBase.decode`` convention.
        **kwargs: Accepted for signature compatibility with the base class;
            not used by this override.

    Returns:
        The decoded string.
    """
    # Normalize a bare int to a one-element list so the conversion helper
    # always receives a sequence.
    if isinstance(token_ids, int):
        token_ids = [token_ids]

    # Convert ids -> tokens, then tokens -> string using our custom logic.
    tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
    text = self.convert_tokens_to_string(tokens)

    # Bug fix: the clean_up_tokenization_spaces argument was accepted but
    # silently ignored. Honor it here; when None, defer to the tokenizer's
    # own setting (getattr guards tokenizers that never set the attribute).
    if clean_up_tokenization_spaces is None:
        clean_up_tokenization_spaces = getattr(self, "clean_up_tokenization_spaces", False)
    if clean_up_tokenization_spaces:
        text = self.clean_up_tokenization(text)
    return text
|
| 229 |
+
|
| 230 |
def convert_tokens_to_string(self, tokens: List[str]) -> str:
|
| 231 |
+
|
| 232 |
"""
|
| 233 |
Converts a sequence of tokens (string) in a single string.
|
| 234 |
"""
|