Delete tokenization_df_arc.py
Browse files- tokenization_df_arc.py +0 -279
tokenization_df_arc.py
DELETED
|
@@ -1,279 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
DF-Arc Tokenizer
|
| 3 |
-
Morphology-aware, dialect-inclusive tokenization for Arabic LLMs.
|
| 4 |
-
"""
|
| 5 |
-
import json
|
| 6 |
-
import os
|
| 7 |
-
import re
|
| 8 |
-
import unicodedata
|
| 9 |
-
from typing import List, Dict, Any, Optional, Tuple, Union
|
| 10 |
-
|
| 11 |
-
from transformers import PreTrainedTokenizerFast
|
| 12 |
-
from tokenizers import Tokenizer
|
| 13 |
-
|
| 14 |
-
class ArabicNormalizer:
    """Configurable normalizer for Arabic text.

    Always applies NFKC normalization and strips URLs / e-mail addresses,
    then applies the enabled letter-unification and cleanup rules in a
    fixed order, and finally collapses runs of whitespace.
    """

    # Patterns are compiled once at class level and shared by all instances.
    DIACRITICS_PATTERN = re.compile(r'[\u064B-\u0652]')
    TATWEEL_PATTERN = re.compile(r'\u0640')
    ALEF_PATTERN = re.compile(r'[أإآ]')
    YEH_PATTERN = re.compile(r'ى')
    TEH_MARBUTA_PATTERN = re.compile(r'ة')
    REPEATS_PATTERN = re.compile(r'(.)\1{2,}')
    URL_PATTERN = re.compile(r'http\S+|www\S+|https\S+', re.MULTILINE)
    EMAIL_PATTERN = re.compile(r'\S+@\S+')
    WHITESPACE_PATTERN = re.compile(r'\s+')

    def __init__(self,
                 unify_alef: bool = True,
                 unify_yeh: bool = True,
                 unify_teh_marbuta: bool = True,
                 remove_diacritics: bool = True,
                 remove_tatweel: bool = True,
                 remove_repeats: bool = True):
        # Each flag toggles exactly one substitution rule in normalize().
        self.unify_alef = unify_alef
        self.unify_yeh = unify_yeh
        self.unify_teh_marbuta = unify_teh_marbuta
        self.remove_diacritics = remove_diacritics
        self.remove_tatweel = remove_tatweel
        self.remove_repeats = remove_repeats

    def normalize(self, text: str) -> str:
        """Return *text* with the configured normalization rules applied."""
        if not text:
            return ""
        text = unicodedata.normalize("NFKC", text)
        # URLs and e-mail addresses are removed unconditionally.
        text = self.URL_PATTERN.sub('', text)
        text = self.EMAIL_PATTERN.sub('', text)
        # Optional rules, in a fixed order so output is deterministic.
        rules = (
            (self.remove_diacritics, self.DIACRITICS_PATTERN, ''),
            (self.remove_tatweel, self.TATWEEL_PATTERN, ''),
            (self.unify_alef, self.ALEF_PATTERN, 'ا'),
            (self.unify_yeh, self.YEH_PATTERN, 'ي'),
            (self.unify_teh_marbuta, self.TEH_MARBUTA_PATTERN, 'ه'),
            (self.remove_repeats, self.REPEATS_PATTERN, r'\1'),
        )
        for enabled, pattern, replacement in rules:
            if enabled:
                text = pattern.sub(replacement, text)
        return self.WHITESPACE_PATTERN.sub(' ', text).strip()
|
| 61 |
-
|
| 62 |
-
class MorphologicalPreTokenizer:
    """
    Rule-based Arabic morphological pre-tokenizer.
    Segments Arabic words into prefix-stem-suffix units.
    """

    PREFIXES = ['و', 'ف', 'ب', 'ك', 'ل', 'ال', 'س', 'وال', 'بال', 'كال', 'لل', 'فال']
    SUFFIXES = ['ني', 'نا', 'ك', 'كم', 'ه', 'ها', 'هم', 'هن', 'ي', 'ون', 'ين', 'ان', 'ت', 'وا', 'ة']

    # Common entities/words to protect from segmentation (embedded fallback)
    DEFAULT_EXCEPTIONS = {
        "الله", "محمد", "عبدالله", "عبدالرحمن", "مكة", "بغداد", "دمشق", "القاهرة", "بيروت", "عمان",
        "الرياض", "جدة", "الكويت", "دبي", "أبوظبي", "المنامة", "الدوحة", "مسقط", "ليبيا", "تونس",
        "الجزائر", "المغرب", "فلسطين", "الأردن", "لبنان", "سوريا", "العراق", "مصر", "السودان", "اليمن",
        "أمريكا", "أوروبا", "آسيا", "أفريقيا", "ترامب", "بايدن", "جوجل", "فيسبوك", "أمازون", "مايكروسوفت",
        "أبل", "سامسونج", "سوني", "هواوي", "مرسيدس", "بي إم دبليو", "تويوتا", "هوندا", "فورد", "شيفروليه",
        "تسلا", "ناسا", "إيلون ماسك", "مارك زوكربيرج", "بيل جيتس", "ستيف جوبز", "ألبرت أينشتاين",
        "إسحاق نيوتن", "داروين", "بيتهوفن", "موتزارت", "شكسبير", "دوستويفسكي", "تولستوي", "نجيب محفوظ",
        "طه حسين", "العقاد", "المنفلوطي", "جبران خليل جبران", "محمود درويش", "نزار قباني"
    }

    def __init__(self, min_stem_length: int = 2, exceptions: Optional[List[str]] = None):
        self.min_stem_length = min_stem_length
        # frozenset: immutable and O(1) membership checks; user entries are
        # merged on top of the embedded defaults.
        extra = set(exceptions) if exceptions else set()
        self.exceptions = frozenset(self.DEFAULT_EXCEPTIONS.union(extra))

        # Longest affix first so e.g. 'وال' is preferred over plain 'و'.
        self.prefixes = sorted(self.PREFIXES, key=len, reverse=True)
        self.suffixes = sorted(self.SUFFIXES, key=len, reverse=True)
        self.arabic_pattern = re.compile(r'[\u0600-\u06FF]+')

    def segment_word(self, word: str) -> List[str]:
        """Split one word into at most [prefix, stem, suffix].

        Non-Arabic tokens, protected exceptions, and words whose stem would
        fall below ``min_stem_length`` are returned unsegmented as [word].
        """
        if not word or not self.arabic_pattern.fullmatch(word):
            return [word]

        # NOTE(review): exception entries keep hamza forms (e.g. 'الأردن');
        # if text is alef-normalized upstream these entries will not match —
        # verify against the normalizer configuration.
        if word in self.exceptions:
            return [word]

        original = word

        # Strip at most one prefix (first longest match that leaves a stem).
        prefix = next(
            (p for p in self.prefixes
             if word.startswith(p) and len(word) - len(p) >= self.min_stem_length),
            "")
        stem = word[len(prefix):] if prefix else word

        # Strip at most one suffix under the same stem-length constraint.
        suffix = next(
            (s for s in self.suffixes
             if stem.endswith(s) and len(stem) - len(s) >= self.min_stem_length),
            "")
        if suffix:
            stem = stem[:-len(suffix)]

        parts = []
        if prefix:
            parts.append(prefix)
        parts.append(stem)
        if suffix:
            parts.append(suffix)

        if len(stem) < self.min_stem_length:
            return [original]
        return parts

    def segment_text(self, text: str) -> str:
        """Segment every whitespace-separated word, joining morphemes with '_'."""
        return ' '.join('_'.join(self.segment_word(w)) for w in text.split())
|
| 130 |
-
|
| 131 |
-
class PhraseMerger:
    """Greedy, longest-first merger for frequent word n-grams."""

    def __init__(self, phrases_file: Optional[str] = None):
        # Maps tuple-of-words -> frequency; empty dict makes merging a no-op.
        self.phrase_vocab = {}
        self.max_ngram = 3
        # Joined phrases are concatenated with no separator.
        self.merge_char = ""
        if phrases_file:
            self.load_phrases(phrases_file)

    def load_phrases(self, path: str) -> None:
        """Load a {"w1 w2": freq} JSON vocabulary; a missing file is ignored."""
        try:
            with open(path, 'r', encoding='utf-8') as f:
                raw = json.load(f)
        except FileNotFoundError:
            return
        self.phrase_vocab = {}
        for phrase, freq in raw.items():
            key = tuple(phrase.split())
            self.phrase_vocab[key] = freq
            # Track the longest phrase so merge_phrases scans far enough.
            if len(key) > self.max_ngram:
                self.max_ngram = len(key)

    def merge_phrases(self, text: str) -> str:
        """Collapse known n-grams in *text*; the longest match at each position wins."""
        if not self.phrase_vocab:
            return text

        words = text.split()
        total = len(words)
        out = []
        pos = 0
        while pos < total:
            for size in range(self.max_ngram, 1, -1):
                candidate = tuple(words[pos:pos + size])
                if len(candidate) == size and candidate in self.phrase_vocab:
                    out.append(self.merge_char.join(candidate))
                    pos += size
                    break
            else:
                # No phrase starts here; keep the single word.
                out.append(words[pos])
                pos += 1
        return ' '.join(out)
|
| 174 |
-
|
| 175 |
-
class DFArcTokenizer(PreTrainedTokenizerFast):
    """
    DF-Arc: Morphology-aware Arabic Tokenizer.
    Wrapper around PreTrainedTokenizerFast that applies custom normalization,
    morphological segmentation, and phrase merging before tokenization.
    """

    # Files resolved when loading from a saved directory. NOTE(review):
    # vocab_file and tokenizer_file both point at tokenizer.json — confirm
    # this duplication is intentional for the save/load round-trip.
    vocab_files_names = {
        "vocab_file": "tokenizer.json",
        "tokenizer_file": "tokenizer.json",
        "phrases_file": "phrase_vocab.json"
    }

    def __init__(
        self,
        vocab_file: Optional[str] = None,
        tokenizer_file: Optional[str] = None,
        phrases_file: Optional[str] = None,
        normalization_config: Optional[Dict[str, bool]] = None,
        min_stem_length: int = 2,
        exceptions_file: Optional[str] = None,
        **kwargs
    ):
        # Helpers are assigned before super().__init__() — keep this order;
        # presumably the base class may exercise tokenization paths during
        # setup that need them (TODO confirm against transformers internals).
        self.normalizer_helper = ArabicNormalizer(**(normalization_config or {}))

        # Load user-provided exceptions if file exists
        user_exceptions = []
        if exceptions_file and os.path.exists(exceptions_file):
            try:
                with open(exceptions_file, 'r', encoding='utf-8') as f:
                    # One exception word per line; blank lines skipped.
                    user_exceptions = [line.strip() for line in f if line.strip()]
            except OSError:
                # If file read fails, we just won't have custom exceptions
                # The MorphologicalPreTokenizer has embedded defaults now.
                pass

        self.morph_helper = MorphologicalPreTokenizer(
            min_stem_length=min_stem_length,
            exceptions=user_exceptions
        )
        self.phrase_helper = PhraseMerger(phrases_file=phrases_file)

        super().__init__(
            vocab_file=vocab_file,
            tokenizer_file=tokenizer_file,
            **kwargs
        )

    def _batch_encode_plus(self, batch_text_or_text_pairs: Union[str, List[str], List[Tuple[str, str]]], *args, **kwargs):
        """Apply the normalize -> segment -> merge pipeline to every string
        (or string pair) in the batch before delegating to the fast tokenizer."""
        def preprocess(text: str) -> str:
            # Full DF-Arc preprocessing pipeline for a single string.
            if not text:
                return ""
            t = self.normalizer_helper.normalize(text)
            t = self.morph_helper.segment_text(t)
            t = self.phrase_helper.merge_phrases(t)
            return t

        if isinstance(batch_text_or_text_pairs, str):
            batch_text_or_text_pairs = preprocess(batch_text_or_text_pairs)
        elif isinstance(batch_text_or_text_pairs, (list, tuple)):
            processed = []
            for item in batch_text_or_text_pairs:
                if isinstance(item, str):
                    processed.append(preprocess(item))
                elif isinstance(item, (list, tuple)):
                    # Text pair: preprocess both sides. NOTE(review): assumes
                    # exactly two elements; items beyond index 1 are dropped.
                    processed.append((preprocess(item[0]), preprocess(item[1])))
                else:
                    # Pre-tokenized or other input: passed through untouched.
                    processed.append(item)
            batch_text_or_text_pairs = processed

        return super()._batch_encode_plus(batch_text_or_text_pairs, *args, **kwargs)

    def encode(self, text, *args, **kwargs):
        """Encode a single string, running the DF-Arc preprocessing pipeline
        first; non-string input (e.g. pre-tokenized lists) is passed through."""
        if isinstance(text, str):
            text = self.normalizer_helper.normalize(text)
            text = self.morph_helper.segment_text(text)
            text = self.phrase_helper.merge_phrases(text)
        return super().encode(text, *args, **kwargs)

    def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=None, **kwargs):
        """
        Override decode to force use of convert_tokens_to_string for readable output.

        NOTE(review): clean_up_tokenization_spaces and extra kwargs are
        accepted for signature compatibility but ignored here — confirm that
        callers do not rely on them.
        """
        # Ensure token_ids is a list of ints
        if isinstance(token_ids, int):
            token_ids = [token_ids]

        # Convert to tokens
        tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)

        # Convert to string using our custom logic
        return self.convert_tokens_to_string(tokens)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Converts a sequence of tokens into a single string."""
        text = " ".join(tokens)

        # Remove internal morphological underscores (e.g., 'w_s_y' -> 'wsy')
        # We use a regex to ensure we only remove underscores that are
        # acting as connectors between Arabic segments, preserving snake_case.
        arabic_range = r'[\u0600-\u06FF]'
        return re.sub(rf'(?<={arabic_range})_|_(?={arabic_range})', '', text)
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|