Arabic
arabic
tokenizer
morphology
nlp
dialect
fr3on committed on
Commit
dfdf13b
·
verified ·
1 Parent(s): 60eb243

Release v1.1: PMI Phrase Merging & Smart Morphology

Browse files
Files changed (3) hide show
  1. exceptions.txt +71 -0
  2. tokenization_df_arc.py +16 -23
  3. tokenizer_config.json +6 -7
exceptions.txt ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ الله
2
+ محمد
3
+ عبدالله
4
+ عبدالرحمن
5
+ مكة
6
+ بغداد
7
+ دمشق
8
+ القاهرة
9
+ بيروت
10
+ عمان
11
+ الرياض
12
+ جدة
13
+ الكويت
14
+ دبي
15
+ أبوظبي
16
+ المنامة
17
+ الدوحة
18
+ مسقط
19
+ ليبيا
20
+ تونس
21
+ الجزائر
22
+ المغرب
23
+ فلسطين
24
+ الأردن
25
+ لبنان
26
+ سوريا
27
+ العراق
28
+ مصر
29
+ السودان
30
+ اليمن
31
+ أمريكا
32
+ أوروبا
33
+ آسيا
34
+ أفريقيا
35
+ ترامب
36
+ بايدن
37
+ جوجل
38
+ فيسبوك
39
+ أمازون
40
+ مايكروسوفت
41
+ أبل
42
+ سامسونج
43
+ سوني
44
+ هواوي
45
+ مرسيدس
46
+ بي إم دبليو
47
+ تويوتا
48
+ هوندا
49
+ فورد
50
+ شيفروليه
51
+ تسلا
52
+ ناسا
53
+ إيلون ماسك
54
+ مارك زوكربيرج
55
+ بيل جيتس
56
+ ستيف جوبز
57
+ ألبرت أينشتاين
58
+ إسحاق نيوتن
59
+ داروين
60
+ بيتهوفن
61
+ موتزارت
62
+ شكسبير
63
+ دوستويفسكي
64
+ تولستوي
65
+ نجيب محفوظ
66
+ طه حسين
67
+ العقاد
68
+ المنفلوطي
69
+ جبران خليل جبران
70
+ محمود درويش
71
+ نزار قباني
tokenization_df_arc.py CHANGED
@@ -163,14 +163,13 @@ class DFArcTokenizer(PreTrainedTokenizerFast):
163
 
164
  def __init__(
165
  self,
166
- vocab_file=None,
167
- tokenizer_file=None,
168
- phrases_file=None,
169
- normalization_config=None,
170
- min_stem_length=2,
171
  **kwargs
172
  ):
173
- # Initialize helpers
174
  self.normalizer_helper = ArabicNormalizer(**(normalization_config or {}))
175
  self.morph_helper = MorphologicalPreTokenizer(min_stem_length=min_stem_length)
176
  self.phrase_helper = PhraseMerger(phrases_file=phrases_file)
@@ -181,10 +180,10 @@ class DFArcTokenizer(PreTrainedTokenizerFast):
181
  **kwargs
182
  )
183
 
184
- def _batch_encode_plus(self, batch_text_or_text_pairs, *args, **kwargs):
185
- # Pre-process batch
186
- def preprocess(text):
187
- if not text: return ""
188
  t = self.normalizer_helper.normalize(text)
189
  t = self.morph_helper.segment_text(t)
190
  t = self.phrase_helper.merge_phrases(t)
@@ -193,12 +192,11 @@ class DFArcTokenizer(PreTrainedTokenizerFast):
193
  if isinstance(batch_text_or_text_pairs, str):
194
  batch_text_or_text_pairs = preprocess(batch_text_or_text_pairs)
195
  elif isinstance(batch_text_or_text_pairs, (list, tuple)):
196
- # Handle text pairs? For now assume list of strings
197
  processed = []
198
  for item in batch_text_or_text_pairs:
199
  if isinstance(item, str):
200
  processed.append(preprocess(item))
201
- elif isinstance(item, (list, tuple)): # Pairs
202
  processed.append((preprocess(item[0]), preprocess(item[1])))
203
  else:
204
  processed.append(item)
@@ -228,19 +226,14 @@ class DFArcTokenizer(PreTrainedTokenizerFast):
228
  return self.convert_tokens_to_string(tokens)
229
 
230
  def convert_tokens_to_string(self, tokens: List[str]) -> str:
231
-
232
- """
233
- Converts a sequence of tokens (string) in a single string.
234
- """
235
- # Join with simple space (since we used Whitespace pre-tokenizer)
236
  text = " ".join(tokens)
237
 
238
- # Remove morphological markers (underscores)
239
- # We only remove internal underscores.
240
- # Note: This is an approximation.
241
- text = text.replace("_", "")
242
-
243
- return text
244
 
245
 
246
 
 
163
 
164
  def __init__(
165
  self,
166
+ vocab_file: Optional[str] = None,
167
+ tokenizer_file: Optional[str] = None,
168
+ phrases_file: Optional[str] = None,
169
+ normalization_config: Optional[Dict[str, bool]] = None,
170
+ min_stem_length: int = 2,
171
  **kwargs
172
  ):
 
173
  self.normalizer_helper = ArabicNormalizer(**(normalization_config or {}))
174
  self.morph_helper = MorphologicalPreTokenizer(min_stem_length=min_stem_length)
175
  self.phrase_helper = PhraseMerger(phrases_file=phrases_file)
 
180
  **kwargs
181
  )
182
 
183
+ def _batch_encode_plus(self, batch_text_or_text_pairs: Union[str, List[str], List[Tuple[str, str]]], *args, **kwargs):
184
+ def preprocess(text: str) -> str:
185
+ if not text:
186
+ return ""
187
  t = self.normalizer_helper.normalize(text)
188
  t = self.morph_helper.segment_text(t)
189
  t = self.phrase_helper.merge_phrases(t)
 
192
  if isinstance(batch_text_or_text_pairs, str):
193
  batch_text_or_text_pairs = preprocess(batch_text_or_text_pairs)
194
  elif isinstance(batch_text_or_text_pairs, (list, tuple)):
 
195
  processed = []
196
  for item in batch_text_or_text_pairs:
197
  if isinstance(item, str):
198
  processed.append(preprocess(item))
199
+ elif isinstance(item, (list, tuple)):
200
  processed.append((preprocess(item[0]), preprocess(item[1])))
201
  else:
202
  processed.append(item)
 
226
  return self.convert_tokens_to_string(tokens)
227
 
228
  def convert_tokens_to_string(self, tokens: List[str]) -> str:
229
+ """Converts a sequence of tokens into a single string."""
 
 
 
 
230
  text = " ".join(tokens)
231
 
232
+ # Remove internal morphological underscores (e.g., 'w_s_y' -> 'wsy')
233
+ # We use a regex to ensure we only remove underscores that are
234
+ # acting as connectors between Arabic segments, preserving snake_case.
235
+ arabic_range = r'[\u0600-\u06FF]'
236
+ return re.sub(rf'(?<={arabic_range})_|_(?={arabic_range})', '', text)
 
237
 
238
 
239
 
tokenizer_config.json CHANGED
@@ -1,21 +1,20 @@
1
  {
 
2
  "auto_map": {
3
  "AutoTokenizer": [
4
  "tokenization_df_arc.DFArcTokenizer",
5
  null
6
  ]
7
  },
8
- "tokenizer_class": "DFArcTokenizer",
9
- "phrases_file": "phrases.json",
10
- "normalization": {
 
11
  "unify_alef": true,
12
  "unify_yeh": true,
13
  "unify_teh_marbuta": true,
14
  "remove_diacritics": true,
15
  "remove_tatweel": true,
16
  "remove_repeats": true
17
- },
18
- "min_stem_length": 2,
19
- "vocab_size": 256000,
20
- "model_max_length": 4096
21
  }
 
1
  {
2
+ "tokenizer_class": "DFArcTokenizer",
3
  "auto_map": {
4
  "AutoTokenizer": [
5
  "tokenization_df_arc.DFArcTokenizer",
6
  null
7
  ]
8
  },
9
+ "phrases_file": "phrase_vocab.json",
10
+ "vocab_file": "tokenizer.json",
11
+ "min_stem_length": 2,
12
+ "normalization_config": {
13
  "unify_alef": true,
14
  "unify_yeh": true,
15
  "unify_teh_marbuta": true,
16
  "remove_diacritics": true,
17
  "remove_tatweel": true,
18
  "remove_repeats": true
19
+ }
 
 
 
20
  }