youssefreda9 committed on
Commit
e68c40c
·
1 Parent(s): 53a22ae

fix: all model bugs — S1 S2 S3 G1 P1 (6 fixes across 4 files)

Browse files

S1 (P0): WordAligner now prefers ة over ه at word end when both forms are in-vocabulary (IV)
- araspell_rules.py _select_best_word: ه→ة preference for feminine nouns

S2 (P0): Gender preservation — reject corrections that drop feminine marker
- app.py _is_small_spelling_change: block بارده→بارد, منخفظه→منخفض

S3 (P1): Hamza whitelist — 50+ common Arabic hamza corrections
- araspell_rules.py HAMZA_WHITELIST + fix_common_hamza()
- Fixes: الي→إلى, انت→أنت, لان→لأن, امس→أمس, الايام→الأيام, etc.

G1 (P1): Verb-subject agreement for SVO word order
- grammar_rules.py fix_subject_verb_agreement()
- Handles: الطلاب ذهب→ذهبوا, الطالبات ذهب→ذهبن

P1 (P2): Punctuation model now only adds marks, no spelling/grammar changes
- punctuation_service.py _strip_non_punctuation_changes()
- Reverts PuncAra's baked-in spelling/grammar corrections, keeps only marks

S4 (P2): Mitigated by S1+S2+S3 — spelling now makes better corrections,
fewer bad locks blocking grammar

31/31 tests passing

src/app.py CHANGED
@@ -751,6 +751,14 @@ def _is_small_spelling_change(orig_word, corr_word):
751
  if re.search(r'[^ء-يآأإىa-zA-Z]', corr_word):
752
  return False
753
 
 
 
 
 
 
 
 
 
754
  dist = _levenshtein(orig_word, corr_word)
755
  max_len = max(len(orig_word), len(corr_word))
756
 
 
751
  if re.search(r'[^ء-يآأإىa-zA-Z]', corr_word):
752
  return False
753
 
754
+ # Fix S2: Reject corrections that drop feminine marker (ه/ة)
755
+ # e.g. بارده→بارد, منخفظه→منخفض — these are WORSE than no correction
756
+ feminine_endings = ('ه', 'ة')
757
+ if orig_word.endswith(feminine_endings) and not corr_word.endswith(feminine_endings):
758
+ # Only reject if the correction is just the word minus the ending
759
+ if corr_word == orig_word[:-1] or len(corr_word) < len(orig_word):
760
+ return False
761
+
762
  dist = _levenshtein(orig_word, corr_word)
763
  max_len = max(len(orig_word), len(corr_word))
764
 
src/nlp/grammar/grammar_rules.py CHANGED
@@ -161,6 +161,85 @@ class ArabicGrammarGuard:
161
  text = re.sub(r'\b([وف]?ل)([أ-ي]{4,})(ون|ان)\b', r'\1\2ين', text)
162
  return text
163
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
  def regex_rules_fallback(self, text):
165
  # إن وأخواتها
166
  text = re.sub(r'\b(إن|أن|كأن|لكن|لعل|ليت)\s+(أبوك|أخوك|ذو|فوك)\b',
@@ -183,6 +262,8 @@ class ArabicGrammarGuard:
183
  text = self.fix_verbs_nasb_and_jazm(text)
184
  text = self.fix_gender_agreement(text)
185
  text = self.fix_prepositions_advanced(text)
 
186
  text = self.regex_rules_fallback(text)
187
  text = re.sub(r'\s+', ' ', text).strip()
188
  return text
 
 
161
  text = re.sub(r'\b([وف]?ل)([أ-ي]{4,})(ون|ان)\b', r'\1\2ين', text)
162
  return text
163
 
164
+ def fix_subject_verb_agreement(self, text):
165
+ """
166
+ Fix G1: When a plural/dual noun PRECEDES a singular verb (SVO order),
167
+ the verb must agree in number and gender.
168
+
169
+ Arabic rule: In VSO order, verb can be singular even with plural subject.
170
+ But in SVO order, subject-verb agreement is required.
171
+ """
172
+ tokens = simple_word_tokenize(text)
173
+ if len(tokens) < 2:
174
+ return text
175
+ disambig_tokens = self.mle.disambiguate(tokens)
176
+ corrected_tokens = list(tokens)
177
+
178
+ # Common plural nouns (masculine sound plural) ending in ون/ين/ات
179
+ # and their expected verb conjugation patterns
180
+ for i in range(len(disambig_tokens) - 1):
181
+ noun_info = disambig_tokens[i].analyses[0] if disambig_tokens[i].analyses else None
182
+ verb_info = disambig_tokens[i+1].analyses[0] if disambig_tokens[i+1].analyses else None
183
+ if not noun_info or not verb_info:
184
+ continue
185
+
186
+ noun_pos = noun_info.analysis.get('pos', 'unknown')
187
+ verb_pos = verb_info.analysis.get('pos', 'unknown')
188
+ noun_word = corrected_tokens[i]
189
+ verb_word = corrected_tokens[i+1]
190
+
191
+ # Only process noun → verb patterns (SVO order)
192
+ if noun_pos != 'noun' or verb_pos != 'verb':
193
+ continue
194
+
195
+ noun_num = noun_info.analysis.get('num', 's')
196
+ noun_gen = noun_info.analysis.get('gen', 'm')
197
+ verb_num = verb_info.analysis.get('num', 's')
198
+
199
+ # Skip if verb is already plural
200
+ if verb_num != 's':
201
+ continue
202
+
203
+ # Detect plural nouns
204
+ is_plural_masc = (noun_word.endswith('ون') or noun_word.endswith('ين')
205
+ or noun_num == 'p')
206
+ is_plural_fem = (noun_word.endswith('ات') or
207
+ (noun_gen == 'f' and noun_num == 'p'))
208
+ # Common broken plurals and collective nouns
209
+ KNOWN_PLURALS_MASC = {
210
+ 'الطلاب', 'طلاب', 'الرجال', 'رجال', 'الأولاد', 'أولاد',
211
+ 'الأطباء', 'أطباء', 'الاطباء', 'اطباء',
212
+ 'العمال', 'عمال', 'الناس', 'الشباب', 'الأبناء',
213
+ }
214
+ KNOWN_PLURALS_FEM = {
215
+ 'الطالبات', 'طالبات', 'النساء', 'نساء', 'البنات', 'بنات',
216
+ 'المعلمات', 'معلمات', 'الأمهات', 'أمهات',
217
+ }
218
+ if noun_word in KNOWN_PLURALS_MASC:
219
+ is_plural_masc = True
220
+ if noun_word in KNOWN_PLURALS_FEM:
221
+ is_plural_fem = True
222
+
223
+ if not is_plural_masc and not is_plural_fem:
224
+ continue
225
+
226
+ # Fix the verb to agree with the plural subject
227
+ # Past tense singular → plural
228
+ if is_plural_fem:
229
+ # Feminine plural: ذهب → ذهبن
230
+ if not verb_word.endswith('ن') and not verb_word.endswith('نَ'):
231
+ # Check if it's a past tense verb (typically 3-5 chars, no prefix)
232
+ if len(verb_word) >= 3 and not verb_word.startswith('ي') and not verb_word.startswith('ت'):
233
+ corrected_tokens[i+1] = verb_word + 'ن'
234
+ elif is_plural_masc:
235
+ # Masculine plural: ذهب → ذهبوا
236
+ if (not verb_word.endswith('وا') and not verb_word.endswith('ون')
237
+ and not verb_word.endswith('ين')):
238
+ if len(verb_word) >= 3 and not verb_word.startswith('ي') and not verb_word.startswith('ت'):
239
+ corrected_tokens[i+1] = verb_word + 'وا'
240
+
241
+ return " ".join(corrected_tokens)
242
+
243
  def regex_rules_fallback(self, text):
244
  # إن وأخواتها
245
  text = re.sub(r'\b(إن|أن|كأن|لكن|لعل|ليت)\s+(أبوك|أخوك|ذو|فوك)\b',
 
262
  text = self.fix_verbs_nasb_and_jazm(text)
263
  text = self.fix_gender_agreement(text)
264
  text = self.fix_prepositions_advanced(text)
265
+ text = self.fix_subject_verb_agreement(text) # Fix G1
266
  text = self.regex_rules_fallback(text)
267
  text = re.sub(r'\s+', ' ', text).strip()
268
  return text
269
+
src/nlp/punctuation/punctuation_service.py CHANGED
@@ -27,14 +27,108 @@ class PunctuationChecker:
27
  Arabic punctuation restoration pipeline:
28
  1. Preprocessing (remove diacritics)
29
  2. Model inference (chunked, windowed — 50 words/chunk)
30
- 3. Postprocessing (typographic cleanup)
 
31
  """
32
 
 
 
 
33
  def __init__(self, model, tokenizer, device):
34
  self.model = model
35
  self.tokenizer = tokenizer
36
  self.device = device
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  def _predict_chunk(self, text_chunk: str) -> str:
39
  """Run model inference on a single chunk (max 128 tokens)."""
40
  from nlp.punctuation.punctuation_rules import arabic_preprocessing
@@ -114,6 +208,8 @@ class PunctuationChecker:
114
 
115
  for paragraph in paragraphs:
116
  punctuated = self._fix_punctuation(paragraph)
 
 
117
  cleaned = arabic_postprocessing(punctuated)
118
  processed_paragraphs.append(cleaned)
119
 
 
27
  Arabic punctuation restoration pipeline:
28
  1. Preprocessing (remove diacritics)
29
  2. Model inference (chunked, windowed — 50 words/chunk)
30
+ 3. Postprocessing: strip non-punctuation changes (Fix P1)
31
+ 4. Typographic cleanup
32
  """
33
 
34
+ # Arabic and common punctuation marks
35
+ PUNCTUATION_CHARS = set('.,;:!?،؛؟!.:«»"\'()-–—…')
36
+
37
  def __init__(self, model, tokenizer, device):
38
  self.model = model
39
  self.tokenizer = tokenizer
40
  self.device = device
41
 
42
+ @staticmethod
43
+ def _strip_punct(word: str) -> str:
44
+ """Remove leading/trailing punctuation from a word."""
45
+ return word.strip('.,;:!?،؛؟!.:«»"\'()-–—…')
46
+
47
+ def _strip_non_punctuation_changes(self, original: str, punctuated: str) -> str:
48
+ """
49
+ Fix P1: The PuncAra model was fine-tuned on data with spelling/grammar
50
+ corrections. We only want punctuation marks from this stage.
51
+
52
+ Strategy: Align original and punctuated word-by-word. For each word,
53
+ if the model changed the BASE text (not just added/moved punctuation),
54
+ revert to the original word but keep any punctuation the model added.
55
+ """
56
+ orig_words = original.split()
57
+ punc_words = punctuated.split()
58
+
59
+ if not orig_words or not punc_words:
60
+ return punctuated
61
+
62
+ # Build result by aligning words
63
+ result = []
64
+ oi = 0 # index into orig_words
65
+ pi = 0 # index into punc_words
66
+
67
+ while oi < len(orig_words) and pi < len(punc_words):
68
+ o_word = orig_words[oi]
69
+ p_word = punc_words[pi]
70
+
71
+ o_base = self._strip_punct(o_word)
72
+ p_base = self._strip_punct(p_word)
73
+
74
+ if o_base == p_base:
75
+ # Same base word — keep punctuation changes from model
76
+ result.append(p_word)
77
+ oi += 1
78
+ pi += 1
79
+ elif self._is_only_punct_difference(o_word, p_word):
80
+ # Words differ only by punctuation — keep model's punctuation
81
+ result.append(p_word)
82
+ oi += 1
83
+ pi += 1
84
+ else:
85
+ # Model changed the actual word content (spelling/grammar/hamza)
86
+ # Revert to original word but transfer any NEW punctuation
87
+ punct_suffix = ''
88
+ punct_prefix = ''
89
+ for ch in reversed(p_word):
90
+ if ch in self.PUNCTUATION_CHARS:
91
+ punct_suffix = ch + punct_suffix
92
+ else:
93
+ break
94
+ for ch in p_word:
95
+ if ch in self.PUNCTUATION_CHARS:
96
+ punct_prefix += ch
97
+ else:
98
+ break
99
+
100
+ # Only add punctuation that wasn't already there
101
+ if not o_word.endswith(punct_suffix) and punct_suffix:
102
+ result.append(o_word + punct_suffix)
103
+ elif punct_prefix and not o_word.startswith(punct_prefix):
104
+ result.append(punct_prefix + o_word)
105
+ else:
106
+ result.append(o_word)
107
+ oi += 1
108
+ pi += 1
109
+
110
+ # Append remaining original words
111
+ while oi < len(orig_words):
112
+ result.append(orig_words[oi])
113
+ oi += 1
114
+
115
+ # Append remaining punctuation-only words from model
116
+ while pi < len(punc_words):
117
+ p_word = punc_words[pi]
118
+ if all(ch in self.PUNCTUATION_CHARS or ch.isspace() for ch in p_word):
119
+ result.append(p_word)
120
+ pi += 1
121
+
122
+ return ' '.join(result)
123
+
124
+ @staticmethod
125
+ def _is_only_punct_difference(word1: str, word2: str) -> bool:
126
+ """Check if two words differ only by punctuation characters."""
127
+ PUNCT = set('.,;:!?،؛؟!.:«»"\'()-–—…')
128
+ base1 = ''.join(c for c in word1 if c not in PUNCT)
129
+ base2 = ''.join(c for c in word2 if c not in PUNCT)
130
+ return base1 == base2
131
+
132
  def _predict_chunk(self, text_chunk: str) -> str:
133
  """Run model inference on a single chunk (max 128 tokens)."""
134
  from nlp.punctuation.punctuation_rules import arabic_preprocessing
 
208
 
209
  for paragraph in paragraphs:
210
  punctuated = self._fix_punctuation(paragraph)
211
+ # Fix P1: Strip spelling/grammar changes, keep only punctuation
212
+ punctuated = self._strip_non_punctuation_changes(paragraph, punctuated)
213
  cleaned = arabic_postprocessing(punctuated)
214
  processed_paragraphs.append(cleaned)
215
 
src/nlp/spelling/araspell_rules.py CHANGED
@@ -114,6 +114,50 @@ class AraSpellPostProcessor:
114
 
115
  # --- Hamza & Ta Marbuta Handling ---
116
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  @staticmethod
118
  def fix_hamza_conservative(text: str) -> str:
119
  """Conservative Hamza normalization — only at word END, not middle."""
@@ -128,34 +172,62 @@ class AraSpellPostProcessor:
128
  result.append(word)
129
  return ' '.join(result)
130
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  @staticmethod
132
  def fix_ha_ta_marbuta(text: str, vocab_manager=None) -> str:
133
  """
134
  Smart ه → ة fix at end of words.
135
- Strategy: Only convert if the ة version is IV (in tokenizer vocab).
 
136
  """
137
  PROTECTED_ENDINGS = ['لله']
 
 
 
 
 
 
138
  words = text.split()
139
  result = []
140
  for word in words:
141
  if any(word.endswith(e) for e in PROTECTED_ENDINGS):
142
  result.append(word)
143
  continue
144
- if len(word) >= 4 and word.endswith('ه'):
 
 
 
145
  if word[-2] in AraSpellPostProcessor.ARABIC_CONSONANTS:
146
  candidate_with_ta = word[:-1] + 'ة'
 
147
  if vocab_manager:
148
  ta_iv = vocab_manager.is_iv(candidate_with_ta)
149
  ha_iv = vocab_manager.is_iv(word)
150
  if ta_iv:
 
151
  result.append(candidate_with_ta)
152
  continue
153
  elif ha_iv:
154
  result.append(word)
155
  continue
156
- else:
157
- result.append(candidate_with_ta)
158
- continue
159
  result.append(word)
160
  return ' '.join(result)
161
 
@@ -263,6 +335,7 @@ class AraSpellPostProcessor:
263
  text = AraSpellPostProcessor.remove_hallucinations(text)
264
  text = AraSpellPostProcessor.unified_collapse_repeated(text)
265
  text = AraSpellPostProcessor.fix_hamza_conservative(text)
 
266
  text = AraSpellPostProcessor.fix_ha_ta_marbuta(text, vocab_manager=vocab_manager)
267
  text = AraSpellPostProcessor.remove_word_repetition_with_wa(text)
268
  text = AraSpellPostProcessor.remove_duplicate_words(text)
@@ -588,6 +661,15 @@ class WordAligner:
588
  if in_iv and not out_iv:
589
  return input_word
590
  if in_iv and out_iv:
 
 
 
 
 
 
 
 
 
591
  return input_word
592
  if len(input_word) == len(output_word) and len(input_word) >= 3:
593
  for i in range(len(input_word)):
 
114
 
115
  # --- Hamza & Ta Marbuta Handling ---
116
 
117
+ # Common Arabic words with hamza errors — covers the most frequent
118
+ # spelling mistakes in informal Arabic writing
119
+ HAMZA_WHITELIST = {
120
+ 'الي': 'إلى', 'الى': 'إلى',
121
+ 'انت': 'أنت', 'انتم': 'أنتم', 'انتي': 'أنتِ',
122
+ 'انتو': 'أنتم', 'انتن': 'أنتن',
123
+ 'انا': 'أنا',
124
+ 'امس': 'أمس',
125
+ 'لان': 'لأن', 'لانه': 'لأنه', 'لانها': 'لأنها',
126
+ 'لانهم': 'لأنهم', 'لانك': 'لأنك',
127
+ 'اذا': 'إذا', 'اذ': 'إذ',
128
+ 'اي': 'أي', 'اين': 'أين',
129
+ 'او': 'أو',
130
+ 'اما': 'أما',
131
+ 'ان': 'أن', 'انه': 'أنه', 'انها': 'أنها', 'انهم': 'أنهم',
132
+ 'اخر': 'آخر', 'اخرى': 'أخرى',
133
+ 'الان': 'الآن',
134
+ 'اول': 'أول', 'اولى': 'أولى',
135
+ 'اصبح': 'أصبح', 'اصبحت': 'أصبحت',
136
+ 'اكثر': 'أكثر', 'اقل': 'أقل',
137
+ 'اعلى': 'أعلى', 'ادنى': 'أدنى',
138
+ 'اسرع': 'أسرع', 'ابطا': 'أبطأ',
139
+ 'اكبر': 'أكبر', 'اصغر': 'أصغر',
140
+ 'احسن': 'أحسن', 'اسوا': 'أسوأ',
141
+ 'امام': 'أمام',
142
+ 'اثناء': 'أثناء',
143
+ 'ايضا': 'أيضاً', 'ايض': 'أيضاً',
144
+ 'اساسي': 'أساسي', 'اساسية': 'أساسية',
145
+ 'اخي': 'أخي', 'اخت': 'أخت', 'اخو': 'أخو',
146
+ 'ابي': 'أبي', 'اب': 'أب', 'ابو': 'أبو',
147
+ 'اهل': 'أهل',
148
+ 'اطفال': 'أطفال',
149
+ 'اصدقاء': 'أصدقاء', 'اصدقائي': 'أصدقائي',
150
+ 'اعتقد': 'أعتقد', 'اريد': 'أريد', 'احب': 'أحب',
151
+ 'اعرف': 'أعرف', 'اعلم': 'أعلم',
152
+ 'اخذ': 'أخذ', 'اكل': 'أكل',
153
+ 'الايام': 'الأيام',
154
+ 'الاطفال': 'الأطفال',
155
+ 'الاسعار': 'الأسعار',
156
+ 'الاولى': 'الأولى',
157
+ 'الاخير': 'الأخير', 'الاخيرة': 'الأخيرة',
158
+ 'واصدقائي': 'وأصدقائي',
159
+ }
160
+
161
  @staticmethod
162
  def fix_hamza_conservative(text: str) -> str:
163
  """Conservative Hamza normalization — only at word END, not middle."""
 
172
  result.append(word)
173
  return ' '.join(result)
174
 
175
+ @staticmethod
176
def fix_common_hamza(text: str) -> str:
    """
    Fix common hamza placement errors using the class whitelist.

    Each whitespace-separated token is looked up in ``HAMZA_WHITELIST``
    after its leading/trailing punctuation is set aside, so a whitelisted
    word is still corrected when a mark is attached to it (e.g. a trailing
    comma). The punctuation is put back around the corrected word.

    Args:
        text: Text to correct.

    Returns:
        The tokens joined with single spaces, with whitelisted words
        replaced by their corrected spelling.
    """
    whitelist = AraSpellPostProcessor.HAMZA_WHITELIST
    edge_marks = '.,;:!?،؛؟«»"\'()-–—…'

    result = []
    for word in text.split():
        core = word.strip(edge_marks)
        replacement = whitelist.get(core) if core else None
        if replacement is None:
            result.append(word)
            continue
        # Everything before the core is punctuation only, so its length
        # locates the core unambiguously inside the token.
        lead_len = len(word) - len(word.lstrip(edge_marks))
        result.append(word[:lead_len] + replacement + word[lead_len + len(core):])
    return ' '.join(result)
190
+
191
  @staticmethod
192
def fix_ha_ta_marbuta(text: str, vocab_manager=None) -> str:
    """
    Smart ه → ة fix at the end of words.

    Strategy: for a word of at least 3 characters ending in ه whose
    previous character is in ``ARABIC_CONSONANTS``, prefer the ة spelling
    UNLESS the ه form is in vocabulary and the ة form is not. Words that
    genuinely end in ه are protected and never rewritten.

    Args:
        text: Text to correct.
        vocab_manager: Optional object exposing ``is_iv(word)``. When it is
            absent, or when it knows neither spelling, the ة form is used.

    Returns:
        The tokens joined with single spaces.
    """
    PROTECTED_ENDINGS = ('لله',)
    # Words that genuinely end in ه (not ة). This includes very common
    # demonstratives and particles/prepositions with the pronoun suffix ـه,
    # which would otherwise be turned into non-words such as هذة or عنة.
    PROTECTED_HA_WORDS = frozenset({
        'الله', 'لله', 'فيه', 'عليه', 'منه', 'به', 'له', 'إليه',
        'وجه', 'نزه', 'سفه', 'فقه', 'نبه', 'شبه', 'مكره', 'تنبه',
        'اتجه', 'توجه', 'تشابه',
        'هذه', 'عنه', 'معه', 'عنده', 'لديه', 'نفسه', 'كله',
        'أنه', 'إنه', 'انه', 'لأنه', 'لانه', 'لكنه', 'كأنه',
    })
    result = []
    for word in text.split():
        if word.endswith(PROTECTED_ENDINGS) or word in PROTECTED_HA_WORDS:
            result.append(word)
            continue

        convertible = (
            len(word) >= 3
            and word.endswith('ه')
            and word[-2] in AraSpellPostProcessor.ARABIC_CONSONANTS
        )
        if not convertible:
            result.append(word)
            continue

        candidate_with_ta = word[:-1] + 'ة'
        if vocab_manager:
            ta_iv = vocab_manager.is_iv(candidate_with_ta)
            ha_iv = vocab_manager.is_iv(word)
            if ta_iv:
                # Always prefer ة when it is a valid word.
                result.append(candidate_with_ta)
                continue
            if ha_iv:
                # Only the ه spelling is known: keep it.
                result.append(word)
                continue
        # No vocab manager, or neither spelling is in vocabulary:
        # default to ة (standard feminine ending).
        result.append(candidate_with_ta)
    return ' '.join(result)
233
 
 
335
  text = AraSpellPostProcessor.remove_hallucinations(text)
336
  text = AraSpellPostProcessor.unified_collapse_repeated(text)
337
  text = AraSpellPostProcessor.fix_hamza_conservative(text)
338
+ text = AraSpellPostProcessor.fix_common_hamza(text) # Fix S3: hamza whitelist
339
  text = AraSpellPostProcessor.fix_ha_ta_marbuta(text, vocab_manager=vocab_manager)
340
  text = AraSpellPostProcessor.remove_word_repetition_with_wa(text)
341
  text = AraSpellPostProcessor.remove_duplicate_words(text)
 
661
  if in_iv and not out_iv:
662
  return input_word
663
  if in_iv and out_iv:
664
+ # Fix S1: When only difference is ه→ة at word end, prefer ة
665
+ # (correct Arabic orthography — ة is the standard feminine ending)
666
+ if (input_word.endswith('ه') and output_word.endswith('ة')
667
+ and input_word[:-1] == output_word[:-1]):
668
+ return output_word
669
+ # Fix S1: Also handle ة→ه (don't regress a correct ة to ه)
670
+ if (input_word.endswith('ة') and output_word.endswith('ه')
671
+ and input_word[:-1] == output_word[:-1]):
672
+ return input_word
673
  return input_word
674
  if len(input_word) == len(output_word) and len(input_word) >= 3:
675
  for i in range(len(input_word)):