Arabic
arabic
tokenizer
morphology
nlp
dialect
fr3on commited on
Commit
7956c14
·
verified ·
1 Parent(s): 46d7f48

Delete tokenization_df_arc.py

Browse files
Files changed (1) hide show
  1. tokenization_df_arc.py +0 -279
tokenization_df_arc.py DELETED
@@ -1,279 +0,0 @@
1
- """
2
- DF-Arc Tokenizer
3
- Morphology-aware, dialect-inclusive tokenization for Arabic LLMs.
4
- """
5
- import json
6
- import os
7
- import re
8
- import unicodedata
9
- from typing import List, Dict, Any, Optional, Tuple, Union
10
-
11
- from transformers import PreTrainedTokenizerFast
12
- from tokenizers import Tokenizer
13
-
14
class ArabicNormalizer:
    """Normalizes Arabic text with configurable rules.

    Pipeline (in order): NFKC Unicode normalization, unconditional
    URL/email stripping, then the optional steps controlled by the
    constructor flags, and a final whitespace squeeze + strip.
    """

    # Arabic harakat (tanween/fatha/damma/kasra/shadda/sukun, U+064B-U+0652).
    DIACRITICS_PATTERN = re.compile(r'[\u064B-\u0652]')
    # Kashida / elongation character (U+0640).
    TATWEEL_PATTERN = re.compile(r'\u0640')
    # Hamza-above, hamza-below and madda alef variants -> bare alef.
    ALEF_PATTERN = re.compile(r'[أإآ]')
    YEH_PATTERN = re.compile(r'ى')
    TEH_MARBUTA_PATTERN = re.compile(r'ة')
    # Any character repeated 3+ times in a row collapses to one occurrence.
    REPEATS_PATTERN = re.compile(r'(.)\1{2,}')
    # 'http\S+' already matched every 'https\S+' URL, so the redundant
    # alternative (and the no-op re.MULTILINE flag: no anchors) was removed.
    URL_PATTERN = re.compile(r'http\S+|www\S+')
    EMAIL_PATTERN = re.compile(r'\S+@\S+')
    WHITESPACE_PATTERN = re.compile(r'\s+')

    def __init__(self,
                 unify_alef: bool = True,
                 unify_yeh: bool = True,
                 unify_teh_marbuta: bool = True,
                 remove_diacritics: bool = True,
                 remove_tatweel: bool = True,
                 remove_repeats: bool = True):
        self.unify_alef = unify_alef
        self.unify_yeh = unify_yeh
        self.unify_teh_marbuta = unify_teh_marbuta
        self.remove_diacritics = remove_diacritics
        self.remove_tatweel = remove_tatweel
        self.remove_repeats = remove_repeats

    def normalize(self, text: str) -> str:
        """Return the normalized form of ``text`` ('' for falsy input)."""
        if not text:
            return ""
        text = unicodedata.normalize("NFKC", text)
        # URLs and emails are always stripped, regardless of the flags.
        text = self.URL_PATTERN.sub('', text)
        text = self.EMAIL_PATTERN.sub('', text)
        if self.remove_diacritics:
            text = self.DIACRITICS_PATTERN.sub('', text)
        if self.remove_tatweel:
            text = self.TATWEEL_PATTERN.sub('', text)
        if self.unify_alef:
            text = self.ALEF_PATTERN.sub('ا', text)
        if self.unify_yeh:
            text = self.YEH_PATTERN.sub('ي', text)
        if self.unify_teh_marbuta:
            text = self.TEH_MARBUTA_PATTERN.sub('ه', text)
        if self.remove_repeats:
            # Runs after the unification steps, so e.g. 'أاآ' collapses too.
            text = self.REPEATS_PATTERN.sub(r'\1', text)
        text = self.WHITESPACE_PATTERN.sub(' ', text).strip()
        return text
class MorphologicalPreTokenizer:
    """Rule-based Arabic morphological pre-tokenizer.

    Splits an Arabic word into up to three units (prefix, stem, suffix).
    Non-Arabic tokens, protected exception words, and words whose stem
    would fall below ``min_stem_length`` are returned unsegmented.
    """

    PREFIXES = ['و', 'ف', 'ب', 'ك', 'ل', 'ال', 'س', 'وال', 'بال', 'كال', 'لل', 'فال']
    SUFFIXES = ['ني', 'نا', 'ك', 'كم', 'ه', 'ها', 'هم', 'هن', 'ي', 'ون', 'ين', 'ان', 'ت', 'وا', 'ة']

    # Common entities/words to protect from segmentation (embedded fallback)
    DEFAULT_EXCEPTIONS = {
        "الله", "محمد", "عبدالله", "عبدالرحمن", "مكة", "بغداد", "دمشق", "القاهرة", "بيروت", "عمان",
        "الرياض", "جدة", "الكويت", "دبي", "أبوظبي", "المنامة", "الدوحة", "مسقط", "ليبيا", "تونس",
        "الجزائر", "المغرب", "فلسطين", "الأردن", "لبنان", "سوريا", "العراق", "مصر", "السودان", "اليمن",
        "أمريكا", "أوروبا", "آسيا", "أفريقيا", "ترامب", "بايدن", "جوجل", "فيسبوك", "أمازون", "مايكروسوفت",
        "أبل", "سامسونج", "سوني", "هواوي", "مرسيدس", "بي إم دبليو", "تويوتا", "هوندا", "فورد", "شيفروليه",
        "تسلا", "ناسا", "إيلون ماسك", "مارك زوكربيرج", "بيل جيتس", "ستيف جوبز", "ألبرت أينشتاين",
        "إسحاق نيوتن", "داروين", "بيتهوفن", "موتزارت", "شكسبير", "دوستويفسكي", "تولستوي", "نجيب محفوظ",
        "طه حسين", "العقاد", "المنفلوطي", "جبران خليل جبران", "محمود درويش", "نزار قباني"
    }

    def __init__(self, min_stem_length: int = 2, exceptions: Optional[List[str]] = None):
        self.min_stem_length = min_stem_length
        # Merge caller-supplied exceptions with the embedded defaults;
        # frozenset gives immutability and O(1) membership tests.
        extra = set(exceptions) if exceptions else set()
        self.exceptions = frozenset(self.DEFAULT_EXCEPTIONS.union(extra))

        # Longest affixes first so e.g. 'وال' wins over 'و'.
        self.prefixes = sorted(self.PREFIXES, key=len, reverse=True)
        self.suffixes = sorted(self.SUFFIXES, key=len, reverse=True)
        self.arabic_pattern = re.compile(r'[\u0600-\u06FF]+')

    def segment_word(self, word: str) -> List[str]:
        """Return [prefix?, stem, suffix?] for an Arabic word, else [word]."""
        # Guard clauses: empty, non-Arabic, or protected words stay intact.
        if not word:
            return [word]
        if not self.arabic_pattern.fullmatch(word):
            return [word]
        if word in self.exceptions:
            return [word]

        original, stem = word, word

        # Strip at most one prefix, keeping the stem long enough.
        prefix = next(
            (p for p in self.prefixes
             if stem.startswith(p) and len(stem) - len(p) >= self.min_stem_length),
            "",
        )
        if prefix:
            stem = stem[len(prefix):]

        # Strip at most one suffix from what remains.
        suffix = next(
            (s for s in self.suffixes
             if stem.endswith(s) and len(stem) - len(s) >= self.min_stem_length),
            "",
        )
        if suffix:
            stem = stem[:-len(suffix)]

        if len(stem) < self.min_stem_length:
            return [original]

        pieces = []
        if prefix:
            pieces.append(prefix)
        pieces.append(stem)
        if suffix:
            pieces.append(suffix)
        return pieces

    def segment_text(self, text: str) -> str:
        """Segment every whitespace-delimited word, joining units with '_'."""
        return ' '.join(
            '_'.join(self.segment_word(token)) for token in text.split()
        )
class PhraseMerger:
    """Detects and merges common word n-grams.

    ``phrase_vocab`` maps word tuples to frequencies. ``merge_phrases``
    greedily replaces the longest known n-gram at each position, joining
    its words with ``merge_char`` (empty string: the words are fused).
    """

    def __init__(self, phrases_file: Optional[str] = None):
        # tuple-of-words -> frequency
        self.phrase_vocab = {}
        # Longest n-gram to try; raised when a longer phrase is loaded.
        self.max_ngram = 3
        self.merge_char = ""
        if phrases_file:
            self.load_phrases(phrases_file)

    def load_phrases(self, path: str) -> None:
        """Load a {"word word ...": freq} JSON file into ``phrase_vocab``.

        Best-effort: a missing, unreadable, or malformed file leaves the
        current vocabulary untouched. (Previously only FileNotFoundError
        was caught, so a permission error or invalid JSON crashed
        tokenizer construction.)
        """
        try:
            with open(path, 'r', encoding='utf-8') as f:
                loaded_vocab = json.load(f)
        except (OSError, json.JSONDecodeError):
            return
        self.phrase_vocab = {}
        for phrase_str, freq in loaded_vocab.items():
            ngram = tuple(phrase_str.split())
            self.phrase_vocab[ngram] = freq
            self.max_ngram = max(self.max_ngram, len(ngram))

    def merge_phrases(self, text: str) -> str:
        """Merge known n-gram occurrences in ``text``, scanning left to
        right and preferring the longest match at each position."""
        if not self.phrase_vocab:
            return text

        words = text.split()
        result = []
        i = 0
        while i < len(words):
            matched = False
            # Longest n-grams first so a trigram beats a bigram that
            # starts at the same position.
            for n in range(self.max_ngram, 1, -1):
                if i + n <= len(words):
                    ngram = tuple(words[i:i+n])
                    if ngram in self.phrase_vocab:
                        result.append(self.merge_char.join(ngram))
                        i += n
                        matched = True
                        break
            if not matched:
                result.append(words[i])
                i += 1
        return ' '.join(result)
class DFArcTokenizer(PreTrainedTokenizerFast):
    """
    DF-Arc: Morphology-aware Arabic Tokenizer.
    Wrapper around PreTrainedTokenizerFast that applies custom normalization,
    morphological segmentation, and phrase merging before tokenization.
    """

    # "vocab_file" and "tokenizer_file" both resolve to the same
    # tokenizers-JSON file; "phrases_file" names the optional phrase vocab.
    vocab_files_names = {
        "vocab_file": "tokenizer.json",
        "tokenizer_file": "tokenizer.json",
        "phrases_file": "phrase_vocab.json"
    }

    def __init__(
        self,
        vocab_file: Optional[str] = None,
        tokenizer_file: Optional[str] = None,
        phrases_file: Optional[str] = None,
        normalization_config: Optional[Dict[str, bool]] = None,
        min_stem_length: int = 2,
        exceptions_file: Optional[str] = None,
        **kwargs
    ):
        """Build the preprocessing helpers, then defer to the fast base class.

        normalization_config: keyword flags forwarded to ArabicNormalizer.
        min_stem_length / exceptions_file: forwarded to the morphological
        pre-tokenizer; the exceptions file holds one protected word per line.
        """
        self.normalizer_helper = ArabicNormalizer(**(normalization_config or {}))

        # Load user-provided exceptions if file exists
        user_exceptions = []
        if exceptions_file and os.path.exists(exceptions_file):
            try:
                with open(exceptions_file, 'r', encoding='utf-8') as f:
                    user_exceptions = [line.strip() for line in f if line.strip()]
            except OSError:
                # Best-effort: if the file read fails we just won't have
                # custom exceptions — MorphologicalPreTokenizer already
                # carries embedded defaults.
                pass

        self.morph_helper = MorphologicalPreTokenizer(
            min_stem_length=min_stem_length,
            exceptions=user_exceptions
        )
        self.phrase_helper = PhraseMerger(phrases_file=phrases_file)

        super().__init__(
            vocab_file=vocab_file,
            tokenizer_file=tokenizer_file,
            **kwargs
        )

    def _batch_encode_plus(self, batch_text_or_text_pairs: Union[str, List[str], List[Tuple[str, str]]], *args, **kwargs):
        """Run normalize -> segment -> merge on each text (or both members
        of a text pair) before the fast tokenizer encodes the batch.

        NOTE(review): _batch_encode_plus is a private transformers hook;
        confirm it remains the funnel for __call__/batch_encode_plus in the
        transformers version this model pins.
        """
        def preprocess(text: str) -> str:
            # Falsy input (None/"") bypasses the pipeline as "".
            if not text:
                return ""
            t = self.normalizer_helper.normalize(text)
            t = self.morph_helper.segment_text(t)
            t = self.phrase_helper.merge_phrases(t)
            return t

        if isinstance(batch_text_or_text_pairs, str):
            batch_text_or_text_pairs = preprocess(batch_text_or_text_pairs)
        elif isinstance(batch_text_or_text_pairs, (list, tuple)):
            processed = []
            for item in batch_text_or_text_pairs:
                if isinstance(item, str):
                    processed.append(preprocess(item))
                elif isinstance(item, (list, tuple)):
                    # A (text, text_pair) tuple: preprocess both sides.
                    processed.append((preprocess(item[0]), preprocess(item[1])))
                else:
                    # Unknown item type: pass through untouched.
                    processed.append(item)
            batch_text_or_text_pairs = processed

        return super()._batch_encode_plus(batch_text_or_text_pairs, *args, **kwargs)

    def encode(self, text, *args, **kwargs):
        """Apply the same preprocessing pipeline as _batch_encode_plus to a
        single string, then delegate to the base encode."""
        if isinstance(text, str):
            text = self.normalizer_helper.normalize(text)
            text = self.morph_helper.segment_text(text)
            text = self.phrase_helper.merge_phrases(text)
        return super().encode(text, *args, **kwargs)

    def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=None, **kwargs):
        """
        Override decode to force use of convert_tokens_to_string for readable output.

        NOTE(review): clean_up_tokenization_spaces and any extra kwargs are
        accepted for signature compatibility but deliberately ignored here.
        """
        # Ensure token_ids is a list of ints
        if isinstance(token_ids, int):
            token_ids = [token_ids]

        # Convert to tokens
        tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)

        # Convert to string using our custom logic
        return self.convert_tokens_to_string(tokens)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Converts a sequence of tokens into a single string."""
        text = " ".join(tokens)

        # Remove internal morphological underscores (e.g., 'w_s_y' -> 'wsy').
        # The lookaround regex only removes underscores that connect Arabic
        # segments, so Latin snake_case identifiers are preserved.
        arabic_range = r'[\u0600-\u06FF]'
        return re.sub(rf'(?<={arabic_range})_|_(?={arabic_range})', '', text)