dataflare
/

df-arc

@@ -1,15 +1,13 @@
----
-tags:
-- arabic
-- tokenizer
-- morphology
-- nlp
-license: apache-2.0
-language:
-- ar
-datasets:
-- dataflare/arabic-dialect-corpus
----
 # DF-Arc: Morphology-Aware Arabic Tokenizer
@@ -34,4 +32,4 @@ print(tokens)
 ## Citation
 If you use DF-Arc, please cite our paper:
-*The Arabic Token Tax: Quantifying Tokenization Inefficiency in Large Language Models* (Dataflare Lab, 2026).

+---
+tags:
+- arabic
+- tokenizer
+- morphology
+- nlp
+license: apache-2.0
+language:
+- ar
+---
 # DF-Arc: Morphology-Aware Arabic Tokenizer
 ## Citation
 If you use DF-Arc, please cite our paper:
+*The Arabic Token Tax: Quantifying Tokenization Inefficiency in Large Language Models* (Dataflare Lab, 2026).

tokenization_df_arc.py CHANGED Viewed

@@ -207,31 +207,17 @@ class DFArcTokenizer(PreTrainedTokenizerFast):
         return super()._batch_encode_plus(batch_text_or_text_pairs, *args, **kwargs)
     def encode(self, text, *args, **kwargs):
-        # We need to intercept single text calls too if they bypass batch_encode_plus
-        # But PreTrainedTokenizerFast usually routes through it.
-        # However, to be safe, we can manually check 'text' if it's the first arg.
-        # NOTE: standard 'encode' calls _encode_plus (slow) or backend (fast).
-        # We are subclassing Fast, so we need to ensure inputs to the backend are pre-processed.
-        # The cleanest way is often to override __call__.
-        pass
-    def __call__(self, text: Union[str, List[str], List[List[str]]], *args, **kwargs):
-        def preprocess(t):
-            if not isinstance(t, str): return t
-            t = self.normalizer_helper.normalize(t)
-            t = self.morph_helper.segment_text(t)
-            t = self.phrase_helper.merge_phrases(t)
-            return t
         if isinstance(text, str):
-            text = preprocess(text)
-        elif isinstance(text, (list, tuple)):
-            if len(text) > 0 and isinstance(text[0], str): # List of strings
-                 text = [preprocess(t) for t in text]
-            elif len(text) > 0 and isinstance(text[0], (list, tuple)): # Pairs
-                 text = [(preprocess(p[0]), preprocess(p[1])) for p in text]
-        return super().__call__(text, *args, **kwargs)

         return super()._batch_encode_plus(batch_text_or_text_pairs, *args, **kwargs)
     def encode(self, text, *args, **kwargs):
         if isinstance(text, str):
+            text = self.normalizer_helper.normalize(text)
+            text = self.morph_helper.segment_text(text)
+            text = self.phrase_helper.merge_phrases(text)
+        return super().encode(text, *args, **kwargs)
+    def _encode_plus(self, text, *args, **kwargs):
+        if isinstance(text, str):
+            text = self.normalizer_helper.normalize(text)
+            text = self.morph_helper.segment_text(text)
+            text = self.phrase_helper.merge_phrases(text)
+        return super()._encode_plus(text, *args, **kwargs)