Arabic
arabic
tokenizer
morphology
nlp
dialect
fr3on committed on
Commit
6443890
·
verified ·
1 Parent(s): 1b8fc9e

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. README.md +11 -13
  2. tokenization_df_arc.py +12 -26
README.md CHANGED
@@ -1,15 +1,13 @@
1
- ---
2
- tags:
3
- - arabic
4
- - tokenizer
5
- - morphology
6
- - nlp
7
- license: apache-2.0
8
- language:
9
- - ar
10
- datasets:
11
- - dataflare/arabic-dialect-corpus
12
- ---
13
 
14
  # DF-Arc: Morphology-Aware Arabic Tokenizer
15
 
@@ -34,4 +32,4 @@ print(tokens)
34
 
35
  ## Citation
36
  If you use DF-Arc, please cite our paper:
37
- *The Arabic Token Tax: Quantifying Tokenization Inefficiency in Large Language Models* (Dataflare Lab, 2026).
 
1
+ ---
2
+ tags:
3
+ - arabic
4
+ - tokenizer
5
+ - morphology
6
+ - nlp
7
+ license: apache-2.0
8
+ language:
9
+ - ar
10
+ ---
 
 
11
 
12
  # DF-Arc: Morphology-Aware Arabic Tokenizer
13
 
 
32
 
33
  ## Citation
34
  If you use DF-Arc, please cite our paper:
35
+ *The Arabic Token Tax: Quantifying Tokenization Inefficiency in Large Language Models* (Dataflare Lab, 2026).
tokenization_df_arc.py CHANGED
@@ -207,31 +207,17 @@ class DFArcTokenizer(PreTrainedTokenizerFast):
207
  return super()._batch_encode_plus(batch_text_or_text_pairs, *args, **kwargs)
208
 
209
  def encode(self, text, *args, **kwargs):
210
- # We need to intercept single text calls too if they bypass batch_encode_plus
211
- # But PreTrainedTokenizerFast usually routes through it.
212
- # However, to be safe, we can manually check 'text' if it's the first arg.
213
-
214
- # NOTE: standard 'encode' calls _encode_plus (slow) or backend (fast).
215
- # We are subclassing Fast, so we need to ensure inputs to the backend are pre-processed.
216
-
217
- # The cleanest way is often to override __call__.
218
- pass
219
-
220
- def __call__(self, text: Union[str, List[str], List[List[str]]], *args, **kwargs):
221
- def preprocess(t):
222
- if not isinstance(t, str): return t
223
- t = self.normalizer_helper.normalize(t)
224
- t = self.morph_helper.segment_text(t)
225
- t = self.phrase_helper.merge_phrases(t)
226
- return t
227
-
228
  if isinstance(text, str):
229
- text = preprocess(text)
230
- elif isinstance(text, (list, tuple)):
231
- if len(text) > 0 and isinstance(text[0], str): # List of strings
232
- text = [preprocess(t) for t in text]
233
- elif len(text) > 0 and isinstance(text[0], (list, tuple)): # Pairs
234
- text = [(preprocess(p[0]), preprocess(p[1])) for p in text]
235
-
236
- return super().__call__(text, *args, **kwargs)
 
 
 
 
237
 
 
207
  return super()._batch_encode_plus(batch_text_or_text_pairs, *args, **kwargs)
208
 
209
  def encode(self, text, *args, **kwargs):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
  if isinstance(text, str):
211
+ text = self.normalizer_helper.normalize(text)
212
+ text = self.morph_helper.segment_text(text)
213
+ text = self.phrase_helper.merge_phrases(text)
214
+ return super().encode(text, *args, **kwargs)
215
+
216
+ def _encode_plus(self, text, *args, **kwargs):
217
+ if isinstance(text, str):
218
+ text = self.normalizer_helper.normalize(text)
219
+ text = self.morph_helper.segment_text(text)
220
+ text = self.phrase_helper.merge_phrases(text)
221
+ return super()._encode_plus(text, *args, **kwargs)
222
+
223