Arabic
arabic
tokenizer
morphology
nlp
dialect
fr3on committed on
Commit
b892abd
·
verified ·
1 Parent(s): f1a695e

Release v1.1: PMI Phrase Merging & Smart Morphology

Browse files
Files changed (2) hide show
  1. tokenization_df_arc.py +28 -12
  2. tokenizer_config.json +0 -1
tokenization_df_arc.py CHANGED
@@ -68,9 +68,24 @@ class MorphologicalPreTokenizer:
68
  PREFIXES = ['و', 'ف', 'ب', 'ك', 'ل', 'ال', 'س', 'وال', 'بال', 'كال', 'لل', 'فال']
69
  SUFFIXES = ['ني', 'نا', 'ك', 'كم', 'ه', 'ها', 'هم', 'هن', 'ي', 'ون', 'ين', 'ان', 'ت', 'وا', 'ة']
70
 
71
- def __init__(self, min_stem_length: int = 2, exceptions: List[str] = None):
 
 
 
 
 
 
 
 
 
 
 
 
72
  self.min_stem_length = min_stem_length
73
- self.exceptions = set(exceptions) if exceptions else set()
 
 
 
74
  self.prefixes = sorted(self.PREFIXES, key=len, reverse=True)
75
  self.suffixes = sorted(self.SUFFIXES, key=len, reverse=True)
76
  self.arabic_pattern = re.compile(r'[\u0600-\u06FF]+')
@@ -108,10 +123,9 @@ class MorphologicalPreTokenizer:
108
 
109
  def segment_text(self, text: str) -> str:
110
  words = text.split()
111
- segmented_words = []
112
- for word in words:
113
- segments = self.segment_word(word)
114
- segmented_words.append('_'.join(segments))
115
  return ' '.join(segmented_words)
116
 
117
  class PhraseMerger:
@@ -177,18 +191,20 @@ class DFArcTokenizer(PreTrainedTokenizerFast):
177
  ):
178
  self.normalizer_helper = ArabicNormalizer(**(normalization_config or {}))
179
 
180
- # Load exceptions if provided
181
- exceptions = []
182
- if exceptions_file:
183
  try:
184
  with open(exceptions_file, 'r', encoding='utf-8') as f:
185
- exceptions = [line.strip() for line in f if line.strip()]
186
- except FileNotFoundError:
 
 
187
  pass
188
 
189
  self.morph_helper = MorphologicalPreTokenizer(
190
  min_stem_length=min_stem_length,
191
- exceptions=exceptions
192
  )
193
  self.phrase_helper = PhraseMerger(phrases_file=phrases_file)
194
 
 
68
  PREFIXES = ['و', 'ف', 'ب', 'ك', 'ل', 'ال', 'س', 'وال', 'بال', 'كال', 'لل', 'فال']
69
  SUFFIXES = ['ني', 'نا', 'ك', 'كم', 'ه', 'ها', 'هم', 'هن', 'ي', 'ون', 'ين', 'ان', 'ت', 'وا', 'ة']
70
 
71
+ # Common entities/words to protect from segmentation (embedded fallback)
72
+ DEFAULT_EXCEPTIONS = {
73
+ "الله", "محمد", "عبدالله", "عبدالرحمن", "مكة", "بغداد", "دمشق", "القاهرة", "بيروت", "عمان",
74
+ "الرياض", "جدة", "الكويت", "دبي", "أبوظبي", "المنامة", "الدوحة", "مسقط", "ليبيا", "تونس",
75
+ "الجزائر", "المغرب", "فلسطين", "الأردن", "لبنان", "سوريا", "العراق", "مصر", "السودان", "اليمن",
76
+ "أمريكا", "أوروبا", "آسيا", "أفريقيا", "ترامب", "بايدن", "جوجل", "فيسبوك", "أمازون", "مايكروسوفت",
77
+ "أبل", "سامسونج", "سوني", "هواوي", "مرسيدس", "بي إم دبليو", "تويوتا", "هوندا", "فورد", "شيفروليه",
78
+ "تسلا", "ناسا", "إيلون ماسك", "مارك زوكربيرج", "بيل جيتس", "ستيف جوبز", "ألبرت أينشتاين",
79
+ "إسحاق نيوتن", "داروين", "بيتهوفن", "موتزارت", "شكسبير", "دوستويفسكي", "تولستوي", "نجيب محفوظ",
80
+ "طه حسين", "العقاد", "المنفلوطي", "جبران خليل جبران", "محمود درويش", "نزار قباني"
81
+ }
82
+
83
+ def __init__(self, min_stem_length: int = 2, exceptions: Optional[List[str]] = None):
84
  self.min_stem_length = min_stem_length
85
+ # Merge user exceptions with defaults using frozenset for immutability and O(1) lookups
86
+ user_exceptions = set(exceptions) if exceptions else set()
87
+ self.exceptions = frozenset(self.DEFAULT_EXCEPTIONS.union(user_exceptions))
88
+
89
  self.prefixes = sorted(self.PREFIXES, key=len, reverse=True)
90
  self.suffixes = sorted(self.SUFFIXES, key=len, reverse=True)
91
  self.arabic_pattern = re.compile(r'[\u0600-\u06FF]+')
 
123
 
124
  def segment_text(self, text: str) -> str:
125
  words = text.split()
126
+ segmented_words = [
127
+ '_'.join(self.segment_word(word)) for word in words
128
+ ]
 
129
  return ' '.join(segmented_words)
130
 
131
  class PhraseMerger:
 
191
  ):
192
  self.normalizer_helper = ArabicNormalizer(**(normalization_config or {}))
193
 
194
+ # Load user-provided exceptions if file exists
195
+ user_exceptions = []
196
+ if exceptions_file and os.path.exists(exceptions_file):
197
  try:
198
  with open(exceptions_file, 'r', encoding='utf-8') as f:
199
+ user_exceptions = [line.strip() for line in f if line.strip()]
200
+ except OSError:
201
+ # If file read fails, we just won't have custom exceptions
202
+ # The MorphologicalPreTokenizer has embedded defaults now.
203
  pass
204
 
205
  self.morph_helper = MorphologicalPreTokenizer(
206
  min_stem_length=min_stem_length,
207
+ exceptions=user_exceptions
208
  )
209
  self.phrase_helper = PhraseMerger(phrases_file=phrases_file)
210
 
tokenizer_config.json CHANGED
@@ -7,7 +7,6 @@
7
  ]
8
  },
9
  "phrases_file": "phrase_vocab.json",
10
- "exceptions_file": "exceptions.txt",
11
  "vocab_file": "tokenizer.json",
12
  "min_stem_length": 2,
13
  "normalization_config": {
 
7
  ]
8
  },
9
  "phrases_file": "phrase_vocab.json",
 
10
  "vocab_file": "tokenizer.json",
11
  "min_stem_length": 2,
12
  "normalization_config": {