Upload folder using huggingface_hub
Browse files- tokenization_df_arc.py +14 -1
tokenization_df_arc.py
CHANGED
|
@@ -218,6 +218,19 @@ class DFArcTokenizer(PreTrainedTokenizerFast):
|
|
| 218 |
text = self.normalizer_helper.normalize(text)
|
| 219 |
text = self.morph_helper.segment_text(text)
|
| 220 |
text = self.phrase_helper.merge_phrases(text)
|
| 221 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 222 |
|
| 223 |
|
|
|
|
| 218 |
text = self.normalizer_helper.normalize(text)
|
| 219 |
text = self.morph_helper.segment_text(text)
|
| 220 |
text = self.phrase_helper.merge_phrases(text)
|
| 221 |
def convert_tokens_to_string(self, tokens: List[str]) -> str:
    """Join *tokens* back into a single plain-text string.

    Tokens are concatenated with single spaces (the tokenizer uses a
    Whitespace pre-tokenizer, so a space is the natural separator), and
    every underscore — used as a morphological segmentation marker — is
    then dropped.  Note this removes *all* underscores, including any
    that were present in the original text, so the reconstruction is an
    approximation.
    """
    joined = " ".join(tokens)
    # Strip the morphological markers; this also discards underscores
    # that belonged to the original input (known approximation).
    return joined.replace("_", "")
|
| 235 |
|
| 236 |
|