Arabic
arabic
tokenizer
morphology
nlp
dialect
fr3on commited on
Commit
3b90e9e
·
verified ·
1 Parent(s): 1376fa6

Upload folder using huggingface_hub

Browse files
Files changed (6) hide show
  1. .gitattributes +1 -0
  2. README.md +35 -0
  3. phrases.json +0 -0
  4. tokenization_df_arc.py +237 -0
  5. tokenizer.json +3 -0
  6. tokenizer_config.json +21 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - arabic
4
+ - tokenizer
5
+ - morphology
6
+ - nlp
7
+ license: apache-2.0
8
+ language:
9
+ - ar
10
+ ---
11
+
12
+ # DF-Arc: Morphology-Aware Arabic Tokenizer
13
+
14
+ DF-Arc is a specialized tokenizer for Arabic LLMs that achieves **1.0 fertility** (one token per word) on average, eliminating the "Arabic Token Tax".
15
+
16
+ ## Features
17
+ - **Morphological Pre-tokenization**: Splits words into prefix-stem-suffix units.
18
+ - **Phrase Merging**: Automatically merges common multi-word expressions (e.g., "in the name of God") into single tokens.
19
+ - **Dialect Support**: Optimized for Egyptian, Gulf, and Levantine dialects.
20
+
21
+ ## Usage
22
+
23
+ ```python
24
+ from transformers import AutoTokenizer
25
+
26
+ tokenizer = AutoTokenizer.from_pretrained("dataflare/df-arc", trust_remote_code=True)
27
+
28
+ text = "والكتابة بالعربية ممتعة جدا"
29
+ tokens = tokenizer.tokenize(text)
30
+ print(tokens)
31
+ ```
32
+
33
+ ## Citation
34
+ If you use DF-Arc, please cite our paper:
35
+ *The Arabic Token Tax: Quantifying Tokenization Inefficiency in Large Language Models* (Dataflare Lab, 2026).
phrases.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenization_df_arc.py ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ DF-Arc Tokenizer
3
+ Morphology-aware, dialect-inclusive tokenization for Arabic LLMs.
4
+ """
5
+ import json
6
+ import os
7
+ import re
8
+ import unicodedata
9
+ from typing import List, Dict, Any, Optional, Tuple, Union
10
+
11
+ from transformers import PreTrainedTokenizerFast
12
+ from tokenizers import Tokenizer
13
+
14
class ArabicNormalizer:
    """Configurable normalizer for Arabic text.

    Applies, in this fixed order: Unicode NFKC normalization, URL and
    e-mail stripping, then the optional steps (diacritic removal, tatweel
    removal, alef/yeh/teh-marbuta unification, squashing of 3+ repeated
    characters), and finally whitespace collapsing.
    """

    DIACRITICS_PATTERN = re.compile(r'[\u064B-\u0652]')
    TATWEEL_PATTERN = re.compile(r'\u0640')
    ALEF_PATTERN = re.compile(r'[أإآ]')
    YEH_PATTERN = re.compile(r'ى')
    TEH_MARBUTA_PATTERN = re.compile(r'ة')
    REPEATS_PATTERN = re.compile(r'(.)\1{2,}')
    URL_PATTERN = re.compile(r'http\S+|www\S+|https\S+', re.MULTILINE)
    EMAIL_PATTERN = re.compile(r'\S+@\S+')
    WHITESPACE_PATTERN = re.compile(r'\s+')

    def __init__(self,
                 unify_alef: bool = True,
                 unify_yeh: bool = True,
                 unify_teh_marbuta: bool = True,
                 remove_diacritics: bool = True,
                 remove_tatweel: bool = True,
                 remove_repeats: bool = True):
        # Letter-unification switches.
        self.unify_alef = unify_alef
        self.unify_yeh = unify_yeh
        self.unify_teh_marbuta = unify_teh_marbuta
        # Removal switches.
        self.remove_diacritics = remove_diacritics
        self.remove_tatweel = remove_tatweel
        self.remove_repeats = remove_repeats

    def normalize(self, text: str) -> str:
        """Return *text* normalized according to the configured rules.

        Falsy input (empty string or None) yields "".
        """
        if not text:
            return ""

        text = unicodedata.normalize("NFKC", text)

        # URLs and e-mail addresses are always stripped.
        for noise in (self.URL_PATTERN, self.EMAIL_PATTERN):
            text = noise.sub('', text)

        # Optional rules, applied in the same order as configured flags.
        optional_steps = (
            (self.remove_diacritics, self.DIACRITICS_PATTERN, ''),
            (self.remove_tatweel, self.TATWEEL_PATTERN, ''),
            (self.unify_alef, self.ALEF_PATTERN, 'ا'),
            (self.unify_yeh, self.YEH_PATTERN, 'ي'),
            (self.unify_teh_marbuta, self.TEH_MARBUTA_PATTERN, 'ه'),
            (self.remove_repeats, self.REPEATS_PATTERN, r'\1'),
        )
        for enabled, pattern, replacement in optional_steps:
            if enabled:
                text = pattern.sub(replacement, text)

        # Collapse runs of whitespace and trim the edges.
        return self.WHITESPACE_PATTERN.sub(' ', text).strip()
61
+
62
class MorphologicalPreTokenizer:
    """
    Rule-based Arabic morphological pre-tokenizer.

    Splits each Arabic word into at most three pieces — prefix, stem,
    suffix — using longest-match affix lists. Non-Arabic tokens pass
    through untouched.
    """

    PREFIXES = ['و', 'ف', 'ب', 'ك', 'ل', 'ال', 'س', 'وال', 'بال', 'كال', 'لل', 'فال']
    SUFFIXES = ['ني', 'نا', 'ك', 'كم', 'ه', 'ها', 'هم', 'هن', 'ي', 'ون', 'ين', 'ان', 'ت', 'وا', 'ة']

    def __init__(self, min_stem_length: int = 2):
        self.min_stem_length = min_stem_length
        # Longest-first so multi-letter affixes (e.g. 'وال') win over
        # their single-letter sub-affixes (e.g. 'و').
        self.prefixes = sorted(self.PREFIXES, key=len, reverse=True)
        self.suffixes = sorted(self.SUFFIXES, key=len, reverse=True)
        self.arabic_pattern = re.compile(r'[\u0600-\u06FF]+')

    def segment_word(self, word: str) -> List[str]:
        """Return the [prefix?, stem, suffix?] pieces of *word*.

        A piece is only stripped when the remaining stem would still be
        at least ``min_stem_length`` characters; otherwise the word is
        returned whole.
        """
        if not word or not self.arabic_pattern.fullmatch(word):
            return [word]

        original = word
        prefix = next(
            (p for p in self.prefixes
             if word.startswith(p) and len(word) - len(p) >= self.min_stem_length),
            "",
        )
        stem = word[len(prefix):]

        suffix = next(
            (s for s in self.suffixes
             if stem.endswith(s) and len(stem) - len(s) >= self.min_stem_length),
            "",
        )
        if suffix:
            stem = stem[:-len(suffix)]

        pieces = []
        if prefix:
            pieces.append(prefix)
        pieces.append(stem)
        if suffix:
            pieces.append(suffix)

        # Stem too short (possible only when nothing was stripped):
        # fall back to the unsegmented word.
        if len(stem) < self.min_stem_length:
            return [original]
        return pieces

    def segment_text(self, text: str) -> str:
        """Segment every whitespace-separated word, joining its pieces with '_'."""
        return ' '.join(
            '_'.join(self.segment_word(token)) for token in text.split()
        )
112
+
113
class PhraseMerger:
    """Greedy, longest-first merger of known multi-word expressions."""

    def __init__(self, phrases_file: Optional[str] = None):
        self.phrase_vocab = {}   # tuple of words -> frequency
        self.max_ngram = 3       # longest n-gram length seen so far
        self.merge_char = ""     # glue placed between merged words
        if phrases_file:
            self.load_phrases(phrases_file)

    def load_phrases(self, path: str) -> None:
        """Load a JSON mapping of "word word ..." -> frequency.

        A missing file is ignored (best-effort); other errors propagate.
        """
        try:
            with open(path, 'r', encoding='utf-8') as f:
                raw = json.load(f)
        except FileNotFoundError:
            return
        self.phrase_vocab = {tuple(key.split()): freq for key, freq in raw.items()}
        for ngram in self.phrase_vocab:
            self.max_ngram = max(self.max_ngram, len(ngram))

    def merge_phrases(self, text: str) -> str:
        """Replace known n-grams in *text* with single merged tokens.

        Scans left to right, preferring the longest matching n-gram at
        each position; single words are never merged.
        """
        if not self.phrase_vocab:
            return text

        words = text.split()
        total = len(words)
        merged = []
        pos = 0
        while pos < total:
            # Longest n-gram (down to bigrams) starting at `pos`, else 0.
            span = next(
                (n for n in range(self.max_ngram, 1, -1)
                 if pos + n <= total and tuple(words[pos:pos + n]) in self.phrase_vocab),
                0,
            )
            if span:
                merged.append(self.merge_char.join(words[pos:pos + span]))
                pos += span
            else:
                merged.append(words[pos])
                pos += 1
        return ' '.join(merged)
156
+
157
class DFArcTokenizer(PreTrainedTokenizerFast):
    """
    DF-Arc: Morphology-aware Arabic Tokenizer.

    Wrapper around PreTrainedTokenizerFast that applies custom normalization,
    morphological segmentation, and phrase merging before tokenization.

    On fast tokenizers every encoding entry point (``__call__``, ``encode``,
    ``encode_plus``, ``batch_encode_plus``) funnels through
    ``_batch_encode_plus``, so the DF-Arc pipeline is applied exactly once
    there. (The previous version pre-processed both in ``__call__`` and in
    ``_batch_encode_plus``, running the pipeline twice per call, and its
    ``encode`` override ended in ``pass`` — returning ``None`` for every
    ``encode()`` call.)
    """

    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        phrases_file=None,
        normalization_config=None,
        min_stem_length=2,
        **kwargs
    ):
        # Build the text pre-processing pipeline before the backend
        # tokenizer, so it is available as soon as the instance exists.
        self.normalizer_helper = ArabicNormalizer(**(normalization_config or {}))
        self.morph_helper = MorphologicalPreTokenizer(min_stem_length=min_stem_length)
        self.phrase_helper = PhraseMerger(phrases_file=phrases_file)

        super().__init__(
            vocab_file=vocab_file,
            tokenizer_file=tokenizer_file,
            **kwargs
        )

    def _preprocess(self, text):
        """Run normalize -> morphological segmentation -> phrase merging.

        Non-string inputs (e.g. pre-tokenized word lists) are passed
        through unchanged; falsy strings become "".
        """
        if not isinstance(text, str):
            return text
        if not text:
            return ""
        text = self.normalizer_helper.normalize(text)
        text = self.morph_helper.segment_text(text)
        return self.phrase_helper.merge_phrases(text)

    def _batch_encode_plus(self, batch_text_or_text_pairs, *args, **kwargs):
        """Apply the DF-Arc pipeline to every text (or text pair) in the batch.

        Only 2-element list/tuple items are treated as text pairs; longer
        sequences (pre-tokenized input) are passed through untouched rather
        than being silently truncated to their first two elements, as the
        previous implementation did.
        """
        batch = batch_text_or_text_pairs
        if isinstance(batch, str):
            batch = self._preprocess(batch)
        elif isinstance(batch, (list, tuple)):
            processed = []
            for item in batch:
                if isinstance(item, str):
                    processed.append(self._preprocess(item))
                elif isinstance(item, (list, tuple)) and len(item) == 2:
                    # A (text, text_pair) tuple: pre-process both sides.
                    processed.append((self._preprocess(item[0]), self._preprocess(item[1])))
                else:
                    processed.append(item)
            batch = processed

        return super()._batch_encode_plus(batch, *args, **kwargs)

    def encode(self, text, *args, **kwargs):
        """Encode a single text to token ids.

        Pre-processing happens downstream in ``_batch_encode_plus``, so a
        plain delegation restores the parent's behaviour. This fixes the
        previous override, whose body was only comments + ``pass`` and
        therefore returned ``None``.
        """
        return super().encode(text, *args, **kwargs)

    def __call__(self, text: Union[str, List[str], List[List[str]]], *args, **kwargs):
        """Tokenize *text*.

        Kept as an explicit override for API clarity; pre-processing is
        applied once inside ``_batch_encode_plus`` rather than here, to
        avoid running the pipeline twice.
        """
        return super().__call__(text, *args, **kwargs)
237
+
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4cb94e0dd002d6792ceccf5609bc3f739a751f4281d01bbf4c8af58e1544d77
3
+ size 13422799
tokenizer_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoTokenizer": [
4
+ "tokenization_df_arc.DFArcTokenizer",
5
+ null
6
+ ]
7
+ },
8
+ "tokenizer_class": "DFArcTokenizer",
9
+ "phrases_file": "phrases.json",
10
+ "normalization": {
11
+ "unify_alef": true,
12
+ "unify_yeh": true,
13
+ "unify_teh_marbuta": true,
14
+ "remove_diacritics": true,
15
+ "remove_tatweel": true,
16
+ "remove_repeats": true
17
+ },
18
+ "min_stem_length": 2,
19
+ "vocab_size": 256000,
20
+ "model_max_length": 4096
21
+ }