dataflare
/

df-arc

@@ -218,6 +218,19 @@ class DFArcTokenizer(PreTrainedTokenizerFast):
             text = self.normalizer_helper.normalize(text)
             text = self.morph_helper.segment_text(text)
             text = self.phrase_helper.merge_phrases(text)
-        return super()._encode_plus(text, *args, **kwargs)

             text = self.normalizer_helper.normalize(text)
             text = self.morph_helper.segment_text(text)
             text = self.phrase_helper.merge_phrases(text)
+    def convert_tokens_to_string(self, tokens: List[str]) -> str:
+        """
+        Converts a sequence of tokens (string) in a single string.
+        """
+        # Join with simple space (since we used Whitespace pre-tokenizer)
+        text = " ".join(tokens)
+        # Remove morphological markers (underscores)
+        # We only remove internal underscores.
+        # Note: This is an approximation.
+        text = text.replace("_", "")
+        return text