Upload folder using huggingface_hub
Browse files- .gitattributes +1 -0
- README.md +35 -0
- phrases.json +0 -0
- tokenization_df_arc.py +237 -0
- tokenizer.json +3 -0
- tokenizer_config.json +21 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
tags:
|
| 3 |
+
- arabic
|
| 4 |
+
- tokenizer
|
| 5 |
+
- morphology
|
| 6 |
+
- nlp
|
| 7 |
+
license: apache-2.0
|
| 8 |
+
language:
|
| 9 |
+
- ar
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# DF-Arc: Morphology-Aware Arabic Tokenizer
|
| 13 |
+
|
| 14 |
+
DF-Arc is a specialized tokenizer for Arabic LLMs that achieves **1.0 fertility** (one token per word) on average, eliminating the "Arabic Token Tax".
|
| 15 |
+
|
| 16 |
+
## Features
|
| 17 |
+
- **Morphological Pre-tokenization**: Splits words into prefix-stem-suffix units.
|
| 18 |
+
- **Phrase Merging**: Automatically merges common multi-word expressions (e.g., "in the name of God") into single tokens.
|
| 19 |
+
- **Dialect Support**: Optimized for Egyptian, Gulf, and Levantine dialects.
|
| 20 |
+
|
| 21 |
+
## Usage
|
| 22 |
+
|
| 23 |
+
```python
|
| 24 |
+
from transformers import AutoTokenizer
|
| 25 |
+
|
| 26 |
+
tokenizer = AutoTokenizer.from_pretrained("dataflare/df-arc", trust_remote_code=True)
|
| 27 |
+
|
| 28 |
+
text = "والكتابة بالعربية ممتعة جدا"
|
| 29 |
+
tokens = tokenizer.tokenize(text)
|
| 30 |
+
print(tokens)
|
| 31 |
+
```
|
| 32 |
+
|
| 33 |
+
## Citation
|
| 34 |
+
If you use DF-Arc, please cite our paper:
|
| 35 |
+
*The Arabic Token Tax: Quantifying Tokenization Inefficiency in Large Language Models* (Dataflare Lab, 2026).
|
phrases.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenization_df_arc.py
ADDED
|
@@ -0,0 +1,237 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
DF-Arc Tokenizer
|
| 3 |
+
Morphology-aware, dialect-inclusive tokenization for Arabic LLMs.
|
| 4 |
+
"""
|
| 5 |
+
import json
|
| 6 |
+
import os
|
| 7 |
+
import re
|
| 8 |
+
import unicodedata
|
| 9 |
+
from typing import List, Dict, Any, Optional, Tuple, Union
|
| 10 |
+
|
| 11 |
+
from transformers import PreTrainedTokenizerFast
|
| 12 |
+
from tokenizers import Tokenizer
|
| 13 |
+
|
| 14 |
+
class ArabicNormalizer:
    """Configurable orthographic normalizer for Arabic text.

    Applies NFKC, strips URLs/e-mails unconditionally, then runs a fixed
    sequence of optional character-level rules, and finally collapses
    whitespace.
    """

    DIACRITICS_PATTERN = re.compile(r'[\u064B-\u0652]')
    TATWEEL_PATTERN = re.compile(r'\u0640')
    ALEF_PATTERN = re.compile(r'[أإآ]')
    YEH_PATTERN = re.compile(r'ى')
    TEH_MARBUTA_PATTERN = re.compile(r'ة')
    REPEATS_PATTERN = re.compile(r'(.)\1{2,}')
    URL_PATTERN = re.compile(r'http\S+|www\S+|https\S+', re.MULTILINE)
    EMAIL_PATTERN = re.compile(r'\S+@\S+')
    WHITESPACE_PATTERN = re.compile(r'\s+')

    def __init__(self,
                 unify_alef: bool = True,
                 unify_yeh: bool = True,
                 unify_teh_marbuta: bool = True,
                 remove_diacritics: bool = True,
                 remove_tatweel: bool = True,
                 remove_repeats: bool = True):
        # Each flag switches one substitution rule in normalize() on or off.
        self.unify_alef = unify_alef
        self.unify_yeh = unify_yeh
        self.unify_teh_marbuta = unify_teh_marbuta
        self.remove_diacritics = remove_diacritics
        self.remove_tatweel = remove_tatweel
        self.remove_repeats = remove_repeats

    def normalize(self, text: str) -> str:
        """Return *text* with Unicode and orthographic normalization applied.

        Empty/falsy input yields "".  URLs and e-mail addresses are always
        removed regardless of the configured flags.
        """
        if not text:
            return ""
        text = unicodedata.normalize("NFKC", text)
        text = self.URL_PATTERN.sub('', text)
        text = self.EMAIL_PATTERN.sub('', text)
        # (enabled, pattern, replacement) triples — applied in this exact
        # order, matching the original rule sequence.
        rules = (
            (self.remove_diacritics, self.DIACRITICS_PATTERN, ''),
            (self.remove_tatweel, self.TATWEEL_PATTERN, ''),
            (self.unify_alef, self.ALEF_PATTERN, 'ا'),
            (self.unify_yeh, self.YEH_PATTERN, 'ي'),
            (self.unify_teh_marbuta, self.TEH_MARBUTA_PATTERN, 'ه'),
            (self.remove_repeats, self.REPEATS_PATTERN, r'\1'),
        )
        for enabled, pattern, replacement in rules:
            if enabled:
                text = pattern.sub(replacement, text)
        return self.WHITESPACE_PATTERN.sub(' ', text).strip()
| 61 |
+
|
| 62 |
+
class MorphologicalPreTokenizer:
    """
    Rule-based Arabic morphological pre-tokenizer.

    Splits each Arabic word into up to three units (prefix, stem, suffix)
    using longest-match affix tables; non-Arabic words pass through whole.
    """

    PREFIXES = ['و', 'ف', 'ب', 'ك', 'ل', 'ال', 'س', 'وال', 'بال', 'كال', 'لل', 'فال']
    SUFFIXES = ['ني', 'نا', 'ك', 'كم', 'ه', 'ها', 'هم', 'هن', 'ي', 'ون', 'ين', 'ان', 'ت', 'وا', 'ة']

    def __init__(self, min_stem_length: int = 2):
        self.min_stem_length = min_stem_length
        # Longest affixes first so e.g. 'وال' is tried before 'و'.
        self.prefixes = sorted(self.PREFIXES, key=len, reverse=True)
        self.suffixes = sorted(self.SUFFIXES, key=len, reverse=True)
        self.arabic_pattern = re.compile(r'[\u0600-\u06FF]+')

    def segment_word(self, word: str) -> List[str]:
        """Return [prefix?, stem, suffix?] for an Arabic word, else [word]."""
        if not word or not self.arabic_pattern.fullmatch(word):
            return [word]

        original, stem = word, word

        # Strip at most one prefix, longest match first; a strip is only
        # allowed if it leaves a stem of at least min_stem_length chars.
        prefix = next(
            (p for p in self.prefixes
             if stem.startswith(p) and len(stem) - len(p) >= self.min_stem_length),
            "")
        if prefix:
            stem = stem[len(prefix):]

        # Likewise at most one suffix.
        suffix = next(
            (s for s in self.suffixes
             if stem.endswith(s) and len(stem) - len(s) >= self.min_stem_length),
            "")
        if suffix:
            stem = stem[:-len(suffix)]

        # Too-short stems fall back to the unsegmented word.
        if len(stem) < self.min_stem_length:
            return [original]
        return [unit for unit in (prefix, stem, suffix) if unit]

    def segment_text(self, text: str) -> str:
        """Segment every whitespace-separated word, joining units with '_'."""
        return ' '.join(
            '_'.join(self.segment_word(token)) for token in text.split()
        )
| 112 |
+
|
| 113 |
+
class PhraseMerger:
    """Greedy longest-first merger of frequent word n-grams into one token."""

    def __init__(self, phrases_file: Optional[str] = None):
        # Maps word-tuple -> frequency; an empty vocab makes merging a no-op.
        self.phrase_vocab = {}
        self.max_ngram = 3
        # Merged phrases are concatenated with no separator.
        self.merge_char = ""
        if phrases_file:
            self.load_phrases(phrases_file)

    def load_phrases(self, path: str) -> None:
        """Load a {"phrase string": freq} JSON file.

        A missing file is silently ignored (the merger stays a no-op);
        other I/O or JSON errors propagate to the caller.
        """
        try:
            with open(path, 'r', encoding='utf-8') as f:
                raw = json.load(f)
        except FileNotFoundError:
            return
        self.phrase_vocab = {}
        for phrase, count in raw.items():
            key = tuple(phrase.split())
            self.phrase_vocab[key] = count
            self.max_ngram = max(self.max_ngram, len(key))

    def merge_phrases(self, text: str) -> str:
        """Replace known n-grams in *text* with their merged single-word form."""
        if not self.phrase_vocab:
            return text

        words = text.split()
        total = len(words)
        merged = []
        position = 0
        while position < total:
            # Longest feasible n-gram first (greedy match, n >= 2).
            for span in range(min(self.max_ngram, total - position), 1, -1):
                candidate = tuple(words[position:position + span])
                if candidate in self.phrase_vocab:
                    merged.append(self.merge_char.join(candidate))
                    position += span
                    break
            else:
                merged.append(words[position])
                position += 1
        return ' '.join(merged)
| 156 |
+
|
| 157 |
+
class DFArcTokenizer(PreTrainedTokenizerFast):
    """
    DF-Arc: Morphology-aware Arabic Tokenizer.

    Wrapper around PreTrainedTokenizerFast that applies custom normalization,
    morphological segmentation, and phrase merging before tokenization.

    Every fast-tokenizer encode path (`__call__`, `encode`, `encode_plus`,
    `batch_encode_plus`, `tokenize`) ultimately routes through
    `_batch_encode_plus`, so preprocessing is applied exactly once there.
    (A previous revision also preprocessed in `__call__`, double-applying the
    pipeline, and overrode `encode` with a stub that returned None; both
    overrides are removed — the inherited methods now work correctly.)
    """

    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        phrases_file=None,
        normalization_config=None,
        min_stem_length=2,
        **kwargs
    ):
        """
        Args:
            vocab_file: optional vocab path forwarded to the fast tokenizer.
            tokenizer_file: serialized `tokenizers` JSON file.
            phrases_file: JSON file of {"phrase": freq} for PhraseMerger.
            normalization_config: kwargs dict forwarded to ArabicNormalizer.
            min_stem_length: minimum stem length for morphological splits.
        """
        # Helpers are created before super().__init__ in case the base class
        # triggers any tokenization during setup.
        self.normalizer_helper = ArabicNormalizer(**(normalization_config or {}))
        self.morph_helper = MorphologicalPreTokenizer(min_stem_length=min_stem_length)
        self.phrase_helper = PhraseMerger(phrases_file=phrases_file)

        super().__init__(
            vocab_file=vocab_file,
            tokenizer_file=tokenizer_file,
            **kwargs
        )

    def _preprocess(self, text):
        """Normalize, morphologically segment, then phrase-merge one string.

        Non-string inputs and empty strings are returned unchanged so batch
        items of any shape can be passed through safely.
        """
        if not isinstance(text, str) or not text:
            return text
        text = self.normalizer_helper.normalize(text)
        text = self.morph_helper.segment_text(text)
        return self.phrase_helper.merge_phrases(text)

    def _preprocess_item(self, item):
        """Preprocess one batch entry: a string, a (text, text_pair) sequence,
        or anything else (passed through untouched)."""
        if isinstance(item, str):
            return self._preprocess(item)
        if isinstance(item, (list, tuple)) and len(item) == 2:
            return (self._preprocess(item[0]), self._preprocess(item[1]))
        return item

    def _batch_encode_plus(self, batch_text_or_text_pairs, *args, **kwargs):
        """Apply DF-Arc preprocessing to every entry, then delegate to the
        fast backend.  This is the single choke point for preprocessing, so
        it is never applied twice to the same text."""
        if isinstance(batch_text_or_text_pairs, str):
            batch_text_or_text_pairs = self._preprocess(batch_text_or_text_pairs)
        elif isinstance(batch_text_or_text_pairs, (list, tuple)):
            batch_text_or_text_pairs = [
                self._preprocess_item(item) for item in batch_text_or_text_pairs
            ]
        return super()._batch_encode_plus(batch_text_or_text_pairs, *args, **kwargs)
|
tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f4cb94e0dd002d6792ceccf5609bc3f739a751f4281d01bbf4c8af58e1544d77
|
| 3 |
+
size 13422799
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"auto_map": {
|
| 3 |
+
"AutoTokenizer": [
|
| 4 |
+
"tokenization_df_arc.DFArcTokenizer",
|
| 5 |
+
null
|
| 6 |
+
]
|
| 7 |
+
},
|
| 8 |
+
"tokenizer_class": "DFArcTokenizer",
|
| 9 |
+
"phrases_file": "phrases.json",
|
| 10 |
+
"normalization": {
|
| 11 |
+
"unify_alef": true,
|
| 12 |
+
"unify_yeh": true,
|
| 13 |
+
"unify_teh_marbuta": true,
|
| 14 |
+
"remove_diacritics": true,
|
| 15 |
+
"remove_tatweel": true,
|
| 16 |
+
"remove_repeats": true
|
| 17 |
+
},
|
| 18 |
+
"min_stem_length": 2,
|
| 19 |
+
"vocab_size": 256000,
|
| 20 |
+
"model_max_length": 4096
|
| 21 |
+
}
|