Upload folder using huggingface_hub
Browse files- .gitattributes +1 -0
- README.md +35 -0
- phrases.json +0 -0
- tokenization_df_arc.py +237 -0
- tokenizer.json +3 -0
- tokenizer_config.json +21 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
tags:
|
| 3 |
+
- arabic
|
| 4 |
+
- tokenizer
|
| 5 |
+
- morphology
|
| 6 |
+
- nlp
|
| 7 |
+
license: apache-2.0
|
| 8 |
+
language:
|
| 9 |
+
- ar
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# DF-Arc: Morphology-Aware Arabic Tokenizer
|
| 13 |
+
|
| 14 |
+
DF-Arc is a specialized tokenizer for Arabic LLMs that achieves **1.0 fertility** (one token per word) on average, eliminating the "Arabic Token Tax".
|
| 15 |
+
|
| 16 |
+
## Features
|
| 17 |
+
- **Morphological Pre-tokenization**: Splits words into prefix-stem-suffix units.
|
| 18 |
+
- **Phrase Merging**: Automatically merges common multi-word expressions (e.g., "in the name of God") into single tokens.
|
| 19 |
+
- **Dialect Support**: Optimized for Egyptian, Gulf, and Levantine dialects.
|
| 20 |
+
|
| 21 |
+
## Usage
|
| 22 |
+
|
| 23 |
+
```python
|
| 24 |
+
from transformers import AutoTokenizer
|
| 25 |
+
|
| 26 |
+
tokenizer = AutoTokenizer.from_pretrained("dataflare/df-arc", trust_remote_code=True)
|
| 27 |
+
|
| 28 |
+
text = "والكتابة بالعربية ممتعة جدا"
|
| 29 |
+
tokens = tokenizer.tokenize(text)
|
| 30 |
+
print(tokens)
|
| 31 |
+
```
|
| 32 |
+
|
| 33 |
+
## Citation
|
| 34 |
+
If you use DF-Arc, please cite our paper:
|
| 35 |
+
*The Arabic Token Tax: Quantifying Tokenization Inefficiency in Large Language Models* (Dataflare Lab, 2026).
|
phrases.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenization_df_arc.py
ADDED
|
@@ -0,0 +1,237 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
DF-Arc Tokenizer
|
| 3 |
+
Morphology-aware, dialect-inclusive tokenization for Arabic LLMs.
|
| 4 |
+
"""
|
| 5 |
+
import json
|
| 6 |
+
import os
|
| 7 |
+
import re
|
| 8 |
+
import unicodedata
|
| 9 |
+
from typing import List, Dict, Any, Optional, Tuple, Union
|
| 10 |
+
|
| 11 |
+
from transformers import PreTrainedTokenizerFast
|
| 12 |
+
from tokenizers import Tokenizer
|
| 13 |
+
|
| 14 |
+
class ArabicNormalizer:
    """Configurable orthographic normalizer for Arabic text.

    Applies NFKC, strips URLs/e-mails unconditionally, then runs a fixed
    sequence of optional character-level rules, and finally collapses
    whitespace.
    """

    DIACRITICS_PATTERN = re.compile(r'[\u064B-\u0652]')
    TATWEEL_PATTERN = re.compile(r'\u0640')
    ALEF_PATTERN = re.compile(r'[أإآ]')
    YEH_PATTERN = re.compile(r'ى')
    TEH_MARBUTA_PATTERN = re.compile(r'ة')
    REPEATS_PATTERN = re.compile(r'(.)\1{2,}')
    URL_PATTERN = re.compile(r'http\S+|www\S+|https\S+', re.MULTILINE)
    EMAIL_PATTERN = re.compile(r'\S+@\S+')
    WHITESPACE_PATTERN = re.compile(r'\s+')

    def __init__(self,
                 unify_alef: bool = True,
                 unify_yeh: bool = True,
                 unify_teh_marbuta: bool = True,
                 remove_diacritics: bool = True,
                 remove_tatweel: bool = True,
                 remove_repeats: bool = True):
        # Each flag switches one substitution rule in normalize() on or off.
        self.unify_alef = unify_alef
        self.unify_yeh = unify_yeh
        self.unify_teh_marbuta = unify_teh_marbuta
        self.remove_diacritics = remove_diacritics
        self.remove_tatweel = remove_tatweel
        self.remove_repeats = remove_repeats

    def normalize(self, text: str) -> str:
        """Return *text* with Unicode and orthographic normalization applied.

        Empty/falsy input yields "".  URLs and e-mail addresses are always
        removed regardless of the configured flags.
        """
        if not text:
            return ""
        text = unicodedata.normalize("NFKC", text)
        text = self.URL_PATTERN.sub('', text)
        text = self.EMAIL_PATTERN.sub('', text)
        # (enabled, pattern, replacement) triples — applied in this exact
        # order, matching the original rule sequence.
        rules = (
            (self.remove_diacritics, self.DIACRITICS_PATTERN, ''),
            (self.remove_tatweel, self.TATWEEL_PATTERN, ''),
            (self.unify_alef, self.ALEF_PATTERN, 'ا'),
            (self.unify_yeh, self.YEH_PATTERN, 'ي'),
            (self.unify_teh_marbuta, self.TEH_MARBUTA_PATTERN, 'ه'),
            (self.remove_repeats, self.REPEATS_PATTERN, r'\1'),
        )
        for enabled, pattern, replacement in rules:
            if enabled:
                text = pattern.sub(replacement, text)
        return self.WHITESPACE_PATTERN.sub(' ', text).strip()
| 61 |
+
|
| 62 |
+
class MorphologicalPreTokenizer:
    """
    Rule-based Arabic morphological pre-tokenizer.

    Splits each Arabic word into up to three units (prefix, stem, suffix)
    using longest-match affix tables; non-Arabic words pass through whole.
    """

    PREFIXES = ['و', 'ف', 'ب', 'ك', 'ل', 'ال', 'س', 'وال', 'بال', 'كال', 'لل', 'فال']
    SUFFIXES = ['ني', 'نا', 'ك', 'كم', 'ه', 'ها', 'هم', 'هن', 'ي', 'ون', 'ين', 'ان', 'ت', 'وا', 'ة']

    def __init__(self, min_stem_length: int = 2):
        self.min_stem_length = min_stem_length
        # Longest affixes first so e.g. 'وال' is tried before 'و'.
        self.prefixes = sorted(self.PREFIXES, key=len, reverse=True)
        self.suffixes = sorted(self.SUFFIXES, key=len, reverse=True)
        self.arabic_pattern = re.compile(r'[\u0600-\u06FF]+')

    def segment_word(self, word: str) -> List[str]:
        """Return [prefix?, stem, suffix?] for an Arabic word, else [word]."""
        if not word or not self.arabic_pattern.fullmatch(word):
            return [word]

        original, stem = word, word

        # Strip at most one prefix, longest match first; a strip is only
        # allowed if it leaves a stem of at least min_stem_length chars.
        prefix = next(
            (p for p in self.prefixes
             if stem.startswith(p) and len(stem) - len(p) >= self.min_stem_length),
            "")
        if prefix:
            stem = stem[len(prefix):]

        # Likewise at most one suffix.
        suffix = next(
            (s for s in self.suffixes
             if stem.endswith(s) and len(stem) - len(s) >= self.min_stem_length),
            "")
        if suffix:
            stem = stem[:-len(suffix)]

        # Too-short stems fall back to the unsegmented word.
        if len(stem) < self.min_stem_length:
            return [original]
        return [unit for unit in (prefix, stem, suffix) if unit]

    def segment_text(self, text: str) -> str:
        """Segment every whitespace-separated word, joining units with '_'."""
        return ' '.join(
            '_'.join(self.segment_word(token)) for token in text.split()
        )
| 112 |
+
|
| 113 |
+
class PhraseMerger:
    """Greedy longest-first merger of frequent word n-grams into one token."""

    def __init__(self, phrases_file: Optional[str] = None):
        # Maps word-tuple -> frequency; an empty vocab makes merging a no-op.
        self.phrase_vocab = {}
        self.max_ngram = 3
        # Merged phrases are concatenated with no separator.
        self.merge_char = ""
        if phrases_file:
            self.load_phrases(phrases_file)

    def load_phrases(self, path: str) -> None:
        """Load a {"phrase string": freq} JSON file.

        A missing file is silently ignored (the merger stays a no-op);
        other I/O or JSON errors propagate to the caller.
        """
        try:
            with open(path, 'r', encoding='utf-8') as f:
                raw = json.load(f)
        except FileNotFoundError:
            return
        self.phrase_vocab = {}
        for phrase, count in raw.items():
            key = tuple(phrase.split())
            self.phrase_vocab[key] = count
            self.max_ngram = max(self.max_ngram, len(key))

    def merge_phrases(self, text: str) -> str:
        """Replace known n-grams in *text* with their merged single-word form."""
        if not self.phrase_vocab:
            return text

        words = text.split()
        total = len(words)
        merged = []
        position = 0
        while position < total:
            # Longest feasible n-gram first (greedy match, n >= 2).
            for span in range(min(self.max_ngram, total - position), 1, -1):
                candidate = tuple(words[position:position + span])
                if candidate in self.phrase_vocab:
                    merged.append(self.merge_char.join(candidate))
                    position += span
                    break
            else:
                merged.append(words[position])
                position += 1
        return ' '.join(merged)
| 156 |
+
|
| 157 |
+
class DFArcTokenizer(PreTrainedTokenizerFast):
    """
    DF-Arc: Morphology-aware Arabic Tokenizer.

    Wrapper around PreTrainedTokenizerFast that applies custom normalization,
    morphological segmentation, and phrase merging before tokenization.

    Every fast-tokenizer encode path (`__call__`, `encode`, `encode_plus`,
    `batch_encode_plus`, `tokenize`) ultimately routes through
    `_batch_encode_plus`, so preprocessing is applied exactly once there.
    (A previous revision also preprocessed in `__call__`, double-applying the
    pipeline, and overrode `encode` with a stub that returned None; both
    overrides are removed — the inherited methods now work correctly.)
    """

    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        phrases_file=None,
        normalization_config=None,
        min_stem_length=2,
        **kwargs
    ):
        """
        Args:
            vocab_file: optional vocab path forwarded to the fast tokenizer.
            tokenizer_file: serialized `tokenizers` JSON file.
            phrases_file: JSON file of {"phrase": freq} for PhraseMerger.
            normalization_config: kwargs dict forwarded to ArabicNormalizer.
            min_stem_length: minimum stem length for morphological splits.
        """
        # Helpers are created before super().__init__ in case the base class
        # triggers any tokenization during setup.
        self.normalizer_helper = ArabicNormalizer(**(normalization_config or {}))
        self.morph_helper = MorphologicalPreTokenizer(min_stem_length=min_stem_length)
        self.phrase_helper = PhraseMerger(phrases_file=phrases_file)

        super().__init__(
            vocab_file=vocab_file,
            tokenizer_file=tokenizer_file,
            **kwargs
        )

    def _preprocess(self, text):
        """Normalize, morphologically segment, then phrase-merge one string.

        Non-string inputs and empty strings are returned unchanged so batch
        items of any shape can be passed through safely.
        """
        if not isinstance(text, str) or not text:
            return text
        text = self.normalizer_helper.normalize(text)
        text = self.morph_helper.segment_text(text)
        return self.phrase_helper.merge_phrases(text)

    def _preprocess_item(self, item):
        """Preprocess one batch entry: a string, a (text, text_pair) sequence,
        or anything else (passed through untouched)."""
        if isinstance(item, str):
            return self._preprocess(item)
        if isinstance(item, (list, tuple)) and len(item) == 2:
            return (self._preprocess(item[0]), self._preprocess(item[1]))
        return item

    def _batch_encode_plus(self, batch_text_or_text_pairs, *args, **kwargs):
        """Apply DF-Arc preprocessing to every entry, then delegate to the
        fast backend.  This is the single choke point for preprocessing, so
        it is never applied twice to the same text."""
        if isinstance(batch_text_or_text_pairs, str):
            batch_text_or_text_pairs = self._preprocess(batch_text_or_text_pairs)
        elif isinstance(batch_text_or_text_pairs, (list, tuple)):
            batch_text_or_text_pairs = [
                self._preprocess_item(item) for item in batch_text_or_text_pairs
            ]
        return super()._batch_encode_plus(batch_text_or_text_pairs, *args, **kwargs)
|
tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f4cb94e0dd002d6792ceccf5609bc3f739a751f4281d01bbf4c8af58e1544d77
|
| 3 |
+
size 13422799
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"auto_map": {
|
| 3 |
+
"AutoTokenizer": [
|
| 4 |
+
"tokenization_df_arc.DFArcTokenizer",
|
| 5 |
+
null
|
| 6 |
+
]
|
| 7 |
+
},
|
| 8 |
+
"tokenizer_class": "DFArcTokenizer",
|
| 9 |
+
"phrases_file": "phrases.json",
|
| 10 |
+
"normalization": {
|
| 11 |
+
"unify_alef": true,
|
| 12 |
+
"unify_yeh": true,
|
| 13 |
+
"unify_teh_marbuta": true,
|
| 14 |
+
"remove_diacritics": true,
|
| 15 |
+
"remove_tatweel": true,
|
| 16 |
+
"remove_repeats": true
|
| 17 |
+
},
|
| 18 |
+
"min_stem_length": 2,
|
| 19 |
+
"vocab_size": 256000,
|
| 20 |
+
"model_max_length": 4096
|
| 21 |
+
}
|