Upload folder using huggingface_hub
Browse files
- README.md +11 -13
- tokenization_df_arc.py +12 -26
README.md
CHANGED
|
@@ -1,15 +1,13 @@
|
|
| 1 |
-
---
|
| 2 |
-
tags:
|
| 3 |
-
- arabic
|
| 4 |
-
- tokenizer
|
| 5 |
-
- morphology
|
| 6 |
-
- nlp
|
| 7 |
-
license: apache-2.0
|
| 8 |
-
language:
|
| 9 |
-
- ar
|
| 10 |
-
|
| 11 |
-
- dataflare/arabic-dialect-corpus
|
| 12 |
-
---
|
| 13 |
|
| 14 |
# DF-Arc: Morphology-Aware Arabic Tokenizer
|
| 15 |
|
|
@@ -34,4 +32,4 @@ print(tokens)
|
|
| 34 |
|
| 35 |
## Citation
|
| 36 |
If you use DF-Arc, please cite our paper:
|
| 37 |
-
*The Arabic Token Tax: Quantifying Tokenization Inefficiency in Large Language Models* (Dataflare Lab, 2026).
|
|
|
|
| 1 |
+
---
|
| 2 |
+
tags:
|
| 3 |
+
- arabic
|
| 4 |
+
- tokenizer
|
| 5 |
+
- morphology
|
| 6 |
+
- nlp
|
| 7 |
+
license: apache-2.0
|
| 8 |
+
language:
|
| 9 |
+
- ar
|
| 10 |
+
---
|
|
|
|
|
|
|
| 11 |
|
| 12 |
# DF-Arc: Morphology-Aware Arabic Tokenizer
|
| 13 |
|
|
|
|
| 32 |
|
| 33 |
## Citation
|
| 34 |
If you use DF-Arc, please cite our paper:
|
| 35 |
+
*The Arabic Token Tax: Quantifying Tokenization Inefficiency in Large Language Models* (Dataflare Lab, 2026).
|
tokenization_df_arc.py
CHANGED
|
@@ -207,31 +207,17 @@ class DFArcTokenizer(PreTrainedTokenizerFast):
|
|
| 207 |
return super()._batch_encode_plus(batch_text_or_text_pairs, *args, **kwargs)
|
| 208 |
|
| 209 |
def encode(self, text, *args, **kwargs):
|
| 210 |
-
# We need to intercept single text calls too if they bypass batch_encode_plus
|
| 211 |
-
# But PreTrainedTokenizerFast usually routes through it.
|
| 212 |
-
# However, to be safe, we can manually check 'text' if it's the first arg.
|
| 213 |
-
|
| 214 |
-
# NOTE: standard 'encode' calls _encode_plus (slow) or backend (fast).
|
| 215 |
-
# We are subclassing Fast, so we need to ensure inputs to the backend are pre-processed.
|
| 216 |
-
|
| 217 |
-
# The cleanest way is often to override __call__.
|
| 218 |
-
pass
|
| 219 |
-
|
| 220 |
-
def __call__(self, text: Union[str, List[str], List[List[str]]], *args, **kwargs):
|
| 221 |
-
def preprocess(t):
|
| 222 |
-
if not isinstance(t, str): return t
|
| 223 |
-
t = self.normalizer_helper.normalize(t)
|
| 224 |
-
t = self.morph_helper.segment_text(t)
|
| 225 |
-
t = self.phrase_helper.merge_phrases(t)
|
| 226 |
-
return t
|
| 227 |
-
|
| 228 |
if isinstance(text, str):
|
| 229 |
-
text =
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 237 |
|
|
|
|
| 207 |
return super()._batch_encode_plus(batch_text_or_text_pairs, *args, **kwargs)
|
| 208 |
|
| 209 |
def encode(self, text, *args, **kwargs):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
if isinstance(text, str):
|
| 211 |
+
text = self.normalizer_helper.normalize(text)
|
| 212 |
+
text = self.morph_helper.segment_text(text)
|
| 213 |
+
text = self.phrase_helper.merge_phrases(text)
|
| 214 |
+
return super().encode(text, *args, **kwargs)
|
| 215 |
+
|
| 216 |
+
def _encode_plus(self, text, *args, **kwargs):
|
| 217 |
+
if isinstance(text, str):
|
| 218 |
+
text = self.normalizer_helper.normalize(text)
|
| 219 |
+
text = self.morph_helper.segment_text(text)
|
| 220 |
+
text = self.phrase_helper.merge_phrases(text)
|
| 221 |
+
return super()._encode_plus(text, *args, **kwargs)
|
| 222 |
+
|
| 223 |
|