youssefreda9 committed on
Commit
a16af4a
·
1 Parent(s): 751ba66

fix(critical): stop spelling from corrupting correct words + fix pronoun agreement + reject hallucinations

Browse files

3 critical fixes:

1. IV PROTECTION: When both the original AND the correction are in-vocabulary
(valid Arabic words), ONLY accept the change if it is a known orthographic fix
(hamza whitelist, or a word-final ه↔ة swap). This blocks وكان→وكأن-type
corruption, where the model changes one correct word into a completely different correct word.

2. PRONOUN EXCLUSION: fix_subject_verb_agreement now excludes pronouns
(أنا, أنت, هو, etc.) from triggering plural verb agreement. Previously
it incorrectly changed أنا ذهبت → أنا ذهبتوا.

3. HALLUCINATION FILTER: Grammar diffs whose character-set Jaccard similarity
is below 0.3 are rejected (e.g. جالس→جاكسون). This prevents model hallucinations
from reaching the user.

Also adds [SPELLING] Accepted/Rejected debug logging for production tracing.

59/59 tests passing

Files changed (2) hide show
  1. src/app.py +57 -5
  2. src/nlp/grammar/grammar_rules.py +39 -15
src/app.py CHANGED
@@ -734,10 +734,14 @@ def _levenshtein(a, b):
734
  return dp[m][n]
735
 
736
 
737
- def _is_small_spelling_change(orig_word, corr_word):
738
  """
739
  Heuristic: only accept small spelling edits and ignore
740
  aggressive changes (to avoid over-editing).
 
 
 
 
741
  """
742
  if not orig_word or not corr_word:
743
  return False
@@ -759,6 +763,36 @@ def _is_small_spelling_change(orig_word, corr_word):
759
  if corr_word == orig_word[:-1] or len(corr_word) < len(orig_word):
760
  return False
761
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
762
  dist = _levenshtein(orig_word, corr_word)
763
  max_len = max(len(orig_word), len(corr_word))
764
 
@@ -954,7 +988,8 @@ def analyze_text():
954
  # 1-word → 1-word: accept only small edits (typos)
955
  o_word = o_segment[0]
956
  c_word = c_segment[0]
957
- if _is_small_spelling_change(o_word, c_word):
 
958
  new_words.append(c_word)
959
  ctx.add_patch(
960
  'spelling', start_idx, end_idx,
@@ -962,6 +997,7 @@ def analyze_text():
962
  alternatives=_get_spelling_alternatives(o_word, c_word, spell_checker),
963
  )
964
  else:
 
965
  new_words.append(current_text[start_idx:end_idx])
966
  elif len(o_segment) == 1 and len(c_segment) > 1:
967
  # 1-word → N words: accept word splits (e.g. فيالمدرسة → في المدرسة)
@@ -989,7 +1025,7 @@ def analyze_text():
989
  if ci < len(c_segment):
990
  c_word = c_segment[ci]
991
  # Check if this is a 1→1 small edit
992
- if _is_small_spelling_change(o_word, c_word):
993
  new_words.append(c_word)
994
  ctx.add_patch(
995
  'spelling', o_start, o_end,
@@ -1058,14 +1094,30 @@ def analyze_text():
1058
  f"'{d.get('original','')}' — locked by previous stage"
1059
  )
1060
  continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1061
  # Re-label: if grammar's change is purely orthographic
1062
  # (hamza, ه→ة, etc.), tag it as 'spelling' for correct UI icon
1063
  stage_label = 'grammar'
1064
- if _is_spelling_only_change(d.get('original', ''), d.get('correction', '')):
1065
  stage_label = 'spelling'
1066
  ctx.add_patch(
1067
  stage_label, d['start'], d['end'],
1068
- d['correction'], confidence=1.0
1069
  )
1070
  ctx.mutate_text(corrected_grammar, OffsetMapper)
1071
  current_text = ctx.current_text
 
734
  return dp[m][n]
735
 
736
 
737
+ def _is_small_spelling_change(orig_word, corr_word, vocab_manager=None):
738
  """
739
  Heuristic: only accept small spelling edits and ignore
740
  aggressive changes (to avoid over-editing).
741
+
742
+ CRITICAL: If both words are in-vocabulary (both are valid Arabic words),
743
+ only accept known orthographic fixes (ه→ة, hamza whitelist).
744
+ This prevents the model from corrupting correct words (e.g. وكان→وكأن).
745
  """
746
  if not orig_word or not corr_word:
747
  return False
 
763
  if corr_word == orig_word[:-1] or len(corr_word) < len(orig_word):
764
  return False
765
 
766
+ # CRITICAL: If both words are valid Arabic words, only accept known fixes.
767
+ # This prevents the spelling model from changing one correct word to another
768
+ # (e.g. وكان→وكأن, which changes "and was" to "as if" — a meaning change).
769
+ if vocab_manager:
770
+ orig_iv = vocab_manager.is_iv(orig_word)
771
+ corr_iv = vocab_manager.is_iv(corr_word)
772
+ if orig_iv and corr_iv:
773
+ # Both are valid words — only accept known orthographic fixes:
774
+ # 1. ه→ة at word end (feminine marker fix)
775
+ if (orig_word.endswith('ه') and corr_word.endswith('ة')
776
+ and orig_word[:-1] == corr_word[:-1]):
777
+ return True
778
+ # 2. ة→ه at word end (less common but valid)
779
+ if (orig_word.endswith('ة') and corr_word.endswith('ه')
780
+ and orig_word[:-1] == corr_word[:-1]):
781
+ return True
782
+ # 3. Word is in the hamza whitelist (known common errors)
783
+ from nlp.spelling.araspell_rules import AraSpellPostProcessor
784
+ if orig_word in AraSpellPostProcessor.HAMZA_WHITELIST:
785
+ return True
786
+ # 4. Check prefixed hamza (و+whitelist word, etc.)
787
+ for prefix in AraSpellPostProcessor.HAMZA_PREFIXES:
788
+ if orig_word.startswith(prefix) and len(orig_word) > len(prefix) + 1:
789
+ remainder = orig_word[len(prefix):]
790
+ if remainder in AraSpellPostProcessor.HAMZA_WHITELIST:
791
+ return True
792
+ # Both are valid words and change is NOT a known fix — REJECT
793
+ # This prevents وكان→وكأن, etc.
794
+ return False
795
+
796
  dist = _levenshtein(orig_word, corr_word)
797
  max_len = max(len(orig_word), len(corr_word))
798
 
 
988
  # 1-word → 1-word: accept only small edits (typos)
989
  o_word = o_segment[0]
990
  c_word = c_segment[0]
991
+ if _is_small_spelling_change(o_word, c_word, spell_checker.vocab_manager):
992
+ logger.info(f"[SPELLING] Accepted: '{o_word}'→'{c_word}'")
993
  new_words.append(c_word)
994
  ctx.add_patch(
995
  'spelling', start_idx, end_idx,
 
997
  alternatives=_get_spelling_alternatives(o_word, c_word, spell_checker),
998
  )
999
  else:
1000
+ logger.info(f"[SPELLING] Rejected: '{o_word}'→'{c_word}' (filter blocked)")
1001
  new_words.append(current_text[start_idx:end_idx])
1002
  elif len(o_segment) == 1 and len(c_segment) > 1:
1003
  # 1-word → N words: accept word splits (e.g. فيالمدرسة → في المدرسة)
 
1025
  if ci < len(c_segment):
1026
  c_word = c_segment[ci]
1027
  # Check if this is a 1→1 small edit
1028
+ if _is_small_spelling_change(o_word, c_word, spell_checker.vocab_manager):
1029
  new_words.append(c_word)
1030
  ctx.add_patch(
1031
  'spelling', o_start, o_end,
 
1094
  f"'{d.get('original','')}' — locked by previous stage"
1095
  )
1096
  continue
1097
+
1098
+ # Reject grammar hallucinations (e.g. جالس→جاكسون)
1099
+ orig_text = d.get('original', '')
1100
+ corr_text = d.get('correction', '')
1101
+ if orig_text and corr_text:
1102
+ orig_chars = set(orig_text.replace(' ', ''))
1103
+ corr_chars = set(corr_text.replace(' ', ''))
1104
+ if orig_chars and corr_chars:
1105
+ jaccard = len(orig_chars & corr_chars) / len(orig_chars | corr_chars)
1106
+ if jaccard < 0.3:
1107
+ logger.info(
1108
+ f"[GRAMMAR] Rejected hallucination: '{orig_text}'→'{corr_text}' "
1109
+ f"(jaccard={jaccard:.2f})"
1110
+ )
1111
+ continue
1112
+
1113
  # Re-label: if grammar's change is purely orthographic
1114
  # (hamza, ه→ة, etc.), tag it as 'spelling' for correct UI icon
1115
  stage_label = 'grammar'
1116
+ if _is_spelling_only_change(orig_text, corr_text):
1117
  stage_label = 'spelling'
1118
  ctx.add_patch(
1119
  stage_label, d['start'], d['end'],
1120
+ corr_text, confidence=1.0
1121
  )
1122
  ctx.mutate_text(corrected_grammar, OffsetMapper)
1123
  current_text = ctx.current_text
src/nlp/grammar/grammar_rules.py CHANGED
@@ -163,11 +163,16 @@ class ArabicGrammarGuard:
163
 
164
  def fix_subject_verb_agreement(self, text):
165
  """
166
- Fix G1: When a plural/dual noun PRECEDES a singular verb (SVO order),
167
  the verb must agree in number and gender.
168
 
169
  Arabic rule: In VSO order, verb can be singular even with plural subject.
170
  But in SVO order, subject-verb agreement is required.
 
 
 
 
 
171
  """
172
  tokens = simple_word_tokenize(text)
173
  if len(tokens) < 2:
@@ -175,8 +180,16 @@ class ArabicGrammarGuard:
175
  disambig_tokens = self.mle.disambiguate(tokens)
176
  corrected_tokens = list(tokens)
177
 
178
- # Common plural nouns (masculine sound plural) ending in ون/ين/ات
179
- # and their expected verb conjugation patterns
 
 
 
 
 
 
 
 
180
  for i in range(len(disambig_tokens) - 1):
181
  noun_info = disambig_tokens[i].analyses[0] if disambig_tokens[i].analyses else None
182
  verb_info = disambig_tokens[i+1].analyses[0] if disambig_tokens[i+1].analyses else None
@@ -188,6 +201,10 @@ class ArabicGrammarGuard:
188
  noun_word = corrected_tokens[i]
189
  verb_word = corrected_tokens[i+1]
190
 
 
 
 
 
191
  # Only process noun → verb patterns (SVO order)
192
  if noun_pos != 'noun' or verb_pos != 'verb':
193
  continue
@@ -200,39 +217,46 @@ class ArabicGrammarGuard:
200
  if verb_num != 's':
201
  continue
202
 
203
- # Detect plural nouns
204
- is_plural_masc = (noun_word.endswith('ون') or noun_word.endswith('ين')
205
- or noun_num == 'p')
206
- is_plural_fem = (noun_word.endswith('ات') or
207
- (noun_gen == 'f' and noun_num == 'p'))
208
- # Common broken plurals and collective nouns
 
 
 
209
  KNOWN_PLURALS_MASC = {
210
  'الطلاب', 'طلاب', 'الرجال', 'رجال', 'الأولاد', 'أولاد',
211
  'الأطباء', 'أطباء', 'الاطباء', 'اطباء',
212
- 'العمال', 'عمال', 'الناس', 'الشباب', 'الأبناء',
 
213
  }
214
  KNOWN_PLURALS_FEM = {
215
  'الطالبات', 'طالبات', 'النساء', 'نساء', 'البنات', 'بنات',
216
  'المعلمات', 'معلمات', 'الأمهات', 'أمهات',
217
  }
 
218
  if noun_word in KNOWN_PLURALS_MASC:
219
  is_plural_masc = True
220
- if noun_word in KNOWN_PLURALS_FEM:
 
 
 
 
 
 
221
  is_plural_fem = True
222
 
223
  if not is_plural_masc and not is_plural_fem:
224
  continue
225
 
226
  # Fix the verb to agree with the plural subject
227
- # Past tense singular → plural
228
  if is_plural_fem:
229
- # Feminine plural: ذهب → ذهبن
230
  if not verb_word.endswith('ن') and not verb_word.endswith('نَ'):
231
- # Check if it's a past tense verb (typically 3-5 chars, no prefix)
232
  if len(verb_word) >= 3 and not verb_word.startswith('ي') and not verb_word.startswith('ت'):
233
  corrected_tokens[i+1] = verb_word + 'ن'
234
  elif is_plural_masc:
235
- # Masculine plural: ذهب → ذهبوا
236
  if (not verb_word.endswith('وا') and not verb_word.endswith('ون')
237
  and not verb_word.endswith('ين')):
238
  if len(verb_word) >= 3 and not verb_word.startswith('ي') and not verb_word.startswith('ت'):
 
163
 
164
  def fix_subject_verb_agreement(self, text):
165
  """
166
+ Fix G1: When a CONFIRMED plural noun PRECEDES a singular verb (SVO order),
167
  the verb must agree in number and gender.
168
 
169
  Arabic rule: In VSO order, verb can be singular even with plural subject.
170
  But in SVO order, subject-verb agreement is required.
171
+
172
+ EXCLUSIONS:
173
+ - Pronouns (أنا, أنت, هو, etc.) — these are NOT plural
174
+ - Proper nouns — don't modify verbs after names
175
+ - Words tagged as singular by the disambiguator
176
  """
177
  tokens = simple_word_tokenize(text)
178
  if len(tokens) < 2:
 
180
  disambig_tokens = self.mle.disambiguate(tokens)
181
  corrected_tokens = list(tokens)
182
 
183
+ # Words that should NEVER trigger plural verb agreement
184
+ EXCLUDED_WORDS = {
185
+ # Pronouns (singular and dual, plus first-person plural نحن)
186
+ 'أنا', 'انا', 'أنت', 'انت', 'أنتِ', 'هو', 'هي',
187
+ 'نحن', 'أنتما', 'هما',
188
+ # Common words that look like nouns but aren't plural
189
+ 'كان', 'وكان', 'كانت', 'وكانت', 'ليس', 'ليست',
190
+ 'هذا', 'هذه', 'ذلك', 'تلك', 'هناك',
191
+ }
192
+
193
  for i in range(len(disambig_tokens) - 1):
194
  noun_info = disambig_tokens[i].analyses[0] if disambig_tokens[i].analyses else None
195
  verb_info = disambig_tokens[i+1].analyses[0] if disambig_tokens[i+1].analyses else None
 
201
  noun_word = corrected_tokens[i]
202
  verb_word = corrected_tokens[i+1]
203
 
204
+ # Skip excluded words
205
+ if noun_word in EXCLUDED_WORDS:
206
+ continue
207
+
208
  # Only process noun → verb patterns (SVO order)
209
  if noun_pos != 'noun' or verb_pos != 'verb':
210
  continue
 
217
  if verb_num != 's':
218
  continue
219
 
220
+ # Only trigger on CONFIRMED plurals:
221
+ # 1. Known broken plural nouns (hardcoded list)
222
+ # 2. Sound masculine plural ending in ون/ين
223
+ # 3. Sound feminine plural ending in ات
224
+ # Do NOT rely on POS tagger alone — it misclassifies too many words
225
+
226
+ is_plural_masc = False
227
+ is_plural_fem = False
228
+
229
  KNOWN_PLURALS_MASC = {
230
  'الطلاب', 'طلاب', 'الرجال', 'رجال', 'الأولاد', 'أولاد',
231
  'الأطباء', 'أطباء', 'الاطباء', 'اطباء',
232
+ 'العمال', 'عمال', 'الشباب', 'الأبناء',
233
+ 'المهندسون', 'المعلمون', 'المهندسين', 'المعلمين',
234
  }
235
  KNOWN_PLURALS_FEM = {
236
  'الطالبات', 'طالبات', 'النساء', 'نساء', 'البنات', 'بنات',
237
  'المعلمات', 'معلمات', 'الأمهات', 'أمهات',
238
  }
239
+
240
  if noun_word in KNOWN_PLURALS_MASC:
241
  is_plural_masc = True
242
+ elif noun_word in KNOWN_PLURALS_FEM:
243
+ is_plural_fem = True
244
+ elif noun_word.endswith('ون') or noun_word.endswith('ين'):
245
+ # Sound masculine plural — but only if 5+ chars (avoid short words)
246
+ if len(noun_word) >= 5:
247
+ is_plural_masc = True
248
+ elif noun_word.endswith('ات') and len(noun_word) >= 5:
249
  is_plural_fem = True
250
 
251
  if not is_plural_masc and not is_plural_fem:
252
  continue
253
 
254
  # Fix the verb to agree with the plural subject
 
255
  if is_plural_fem:
 
256
  if not verb_word.endswith('ن') and not verb_word.endswith('نَ'):
 
257
  if len(verb_word) >= 3 and not verb_word.startswith('ي') and not verb_word.startswith('ت'):
258
  corrected_tokens[i+1] = verb_word + 'ن'
259
  elif is_plural_masc:
 
260
  if (not verb_word.endswith('وا') and not verb_word.endswith('ون')
261
  and not verb_word.endswith('ين')):
262
  if len(verb_word) >= 3 and not verb_word.startswith('ي') and not verb_word.startswith('ت'):