fix(critical): stop spelling from corrupting correct words + fix pronoun agreement + reject hallucinations
Browse files
3 critical fixes:
1. IV PROTECTION: When both original AND correction are in-vocabulary (valid
Arabic words), ONLY accept the change if it's a known orthographic fix
(hamza whitelist or ه→ة). This blocks وكان→وكأن type corruption where
the model changes one correct word to a completely different correct word.
2. PRONOUN EXCLUSION: fix_subject_verb_agreement now excludes pronouns
(أنا, أنت, هو, etc.) from triggering plural verb agreement. Previously
it incorrectly changed أنا ذهبت → أنا ذهبتوا.
3. HALLUCINATION FILTER: Grammar diffs with Jaccard char similarity <0.3
are rejected (e.g. جالس→جاكسون). Prevents model hallucinations from
reaching the user.
Also adds [SPELLING] Accepted/Rejected debug logging for production tracing.
59/59 tests passing
- src/app.py +57 -5
- src/nlp/grammar/grammar_rules.py +39 -15
|
@@ -734,10 +734,14 @@ def _levenshtein(a, b):
|
|
| 734 |
return dp[m][n]
|
| 735 |
|
| 736 |
|
| 737 |
-
def _is_small_spelling_change(orig_word, corr_word):
|
| 738 |
"""
|
| 739 |
Heuristic: only accept small spelling edits and ignore
|
| 740 |
aggressive changes (to avoid over-editing).
|
|
|
|
|
|
|
|
|
|
|
|
|
| 741 |
"""
|
| 742 |
if not orig_word or not corr_word:
|
| 743 |
return False
|
|
@@ -759,6 +763,36 @@ def _is_small_spelling_change(orig_word, corr_word):
|
|
| 759 |
if corr_word == orig_word[:-1] or len(corr_word) < len(orig_word):
|
| 760 |
return False
|
| 761 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 762 |
dist = _levenshtein(orig_word, corr_word)
|
| 763 |
max_len = max(len(orig_word), len(corr_word))
|
| 764 |
|
|
@@ -954,7 +988,8 @@ def analyze_text():
|
|
| 954 |
# 1-word → 1-word: accept only small edits (typos)
|
| 955 |
o_word = o_segment[0]
|
| 956 |
c_word = c_segment[0]
|
| 957 |
-
if _is_small_spelling_change(o_word, c_word):
|
|
|
|
| 958 |
new_words.append(c_word)
|
| 959 |
ctx.add_patch(
|
| 960 |
'spelling', start_idx, end_idx,
|
|
@@ -962,6 +997,7 @@ def analyze_text():
|
|
| 962 |
alternatives=_get_spelling_alternatives(o_word, c_word, spell_checker),
|
| 963 |
)
|
| 964 |
else:
|
|
|
|
| 965 |
new_words.append(current_text[start_idx:end_idx])
|
| 966 |
elif len(o_segment) == 1 and len(c_segment) > 1:
|
| 967 |
# 1-word → N words: accept word splits (e.g. فيالمدرسة → في المدرسة)
|
|
@@ -989,7 +1025,7 @@ def analyze_text():
|
|
| 989 |
if ci < len(c_segment):
|
| 990 |
c_word = c_segment[ci]
|
| 991 |
# Check if this is a 1→1 small edit
|
| 992 |
-
if _is_small_spelling_change(o_word, c_word):
|
| 993 |
new_words.append(c_word)
|
| 994 |
ctx.add_patch(
|
| 995 |
'spelling', o_start, o_end,
|
|
@@ -1058,14 +1094,30 @@ def analyze_text():
|
|
| 1058 |
f"'{d.get('original','')}' — locked by previous stage"
|
| 1059 |
)
|
| 1060 |
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1061 |
# Re-label: if grammar's change is purely orthographic
|
| 1062 |
# (hamza, ه→ة, etc.), tag it as 'spelling' for correct UI icon
|
| 1063 |
stage_label = 'grammar'
|
| 1064 |
-
if _is_spelling_only_change(
|
| 1065 |
stage_label = 'spelling'
|
| 1066 |
ctx.add_patch(
|
| 1067 |
stage_label, d['start'], d['end'],
|
| 1068 |
-
|
| 1069 |
)
|
| 1070 |
ctx.mutate_text(corrected_grammar, OffsetMapper)
|
| 1071 |
current_text = ctx.current_text
|
|
|
|
| 734 |
return dp[m][n]
|
| 735 |
|
| 736 |
|
| 737 |
+
def _is_small_spelling_change(orig_word, corr_word, vocab_manager=None):
|
| 738 |
"""
|
| 739 |
Heuristic: only accept small spelling edits and ignore
|
| 740 |
aggressive changes (to avoid over-editing).
|
| 741 |
+
|
| 742 |
+
CRITICAL: If both words are in-vocabulary (both are valid Arabic words),
|
| 743 |
+
only accept known orthographic fixes (ه→ة, hamza whitelist).
|
| 744 |
+
This prevents the model from corrupting correct words (e.g. وكان→وكأن).
|
| 745 |
"""
|
| 746 |
if not orig_word or not corr_word:
|
| 747 |
return False
|
|
|
|
| 763 |
if corr_word == orig_word[:-1] or len(corr_word) < len(orig_word):
|
| 764 |
return False
|
| 765 |
|
| 766 |
+
# CRITICAL: If both words are valid Arabic words, only accept known fixes.
|
| 767 |
+
# This prevents the spelling model from changing one correct word to another
|
| 768 |
+
# (e.g. وكان→وكأن, which changes "and was" to "as if" — a meaning change).
|
| 769 |
+
if vocab_manager:
|
| 770 |
+
orig_iv = vocab_manager.is_iv(orig_word)
|
| 771 |
+
corr_iv = vocab_manager.is_iv(corr_word)
|
| 772 |
+
if orig_iv and corr_iv:
|
| 773 |
+
# Both are valid words — only accept known orthographic fixes:
|
| 774 |
+
# 1. ه→ة at word end (feminine marker fix)
|
| 775 |
+
if (orig_word.endswith('ه') and corr_word.endswith('ة')
|
| 776 |
+
and orig_word[:-1] == corr_word[:-1]):
|
| 777 |
+
return True
|
| 778 |
+
# 2. ة→ه at word end (less common but valid)
|
| 779 |
+
if (orig_word.endswith('ة') and corr_word.endswith('ه')
|
| 780 |
+
and orig_word[:-1] == corr_word[:-1]):
|
| 781 |
+
return True
|
| 782 |
+
# 3. Word is in the hamza whitelist (known common errors)
|
| 783 |
+
from nlp.spelling.araspell_rules import AraSpellPostProcessor
|
| 784 |
+
if orig_word in AraSpellPostProcessor.HAMZA_WHITELIST:
|
| 785 |
+
return True
|
| 786 |
+
# 4. Check prefixed hamza (و+whitelist word, etc.)
|
| 787 |
+
for prefix in AraSpellPostProcessor.HAMZA_PREFIXES:
|
| 788 |
+
if orig_word.startswith(prefix) and len(orig_word) > len(prefix) + 1:
|
| 789 |
+
remainder = orig_word[len(prefix):]
|
| 790 |
+
if remainder in AraSpellPostProcessor.HAMZA_WHITELIST:
|
| 791 |
+
return True
|
| 792 |
+
# Both are valid words and change is NOT a known fix — REJECT
|
| 793 |
+
# This prevents وكان→وكأن, etc.
|
| 794 |
+
return False
|
| 795 |
+
|
| 796 |
dist = _levenshtein(orig_word, corr_word)
|
| 797 |
max_len = max(len(orig_word), len(corr_word))
|
| 798 |
|
|
|
|
| 988 |
# 1-word → 1-word: accept only small edits (typos)
|
| 989 |
o_word = o_segment[0]
|
| 990 |
c_word = c_segment[0]
|
| 991 |
+
if _is_small_spelling_change(o_word, c_word, spell_checker.vocab_manager):
|
| 992 |
+
logger.info(f"[SPELLING] Accepted: '{o_word}'→'{c_word}'")
|
| 993 |
new_words.append(c_word)
|
| 994 |
ctx.add_patch(
|
| 995 |
'spelling', start_idx, end_idx,
|
|
|
|
| 997 |
alternatives=_get_spelling_alternatives(o_word, c_word, spell_checker),
|
| 998 |
)
|
| 999 |
else:
|
| 1000 |
+
logger.info(f"[SPELLING] Rejected: '{o_word}'→'{c_word}' (filter blocked)")
|
| 1001 |
new_words.append(current_text[start_idx:end_idx])
|
| 1002 |
elif len(o_segment) == 1 and len(c_segment) > 1:
|
| 1003 |
# 1-word → N words: accept word splits (e.g. فيالمدرسة → في المدرسة)
|
|
|
|
| 1025 |
if ci < len(c_segment):
|
| 1026 |
c_word = c_segment[ci]
|
| 1027 |
# Check if this is a 1→1 small edit
|
| 1028 |
+
if _is_small_spelling_change(o_word, c_word, spell_checker.vocab_manager):
|
| 1029 |
new_words.append(c_word)
|
| 1030 |
ctx.add_patch(
|
| 1031 |
'spelling', o_start, o_end,
|
|
|
|
| 1094 |
f"'{d.get('original','')}' — locked by previous stage"
|
| 1095 |
)
|
| 1096 |
continue
|
| 1097 |
+
|
| 1098 |
+
# Reject grammar hallucinations (e.g. جالس→جاكسون)
|
| 1099 |
+
orig_text = d.get('original', '')
|
| 1100 |
+
corr_text = d.get('correction', '')
|
| 1101 |
+
if orig_text and corr_text:
|
| 1102 |
+
orig_chars = set(orig_text.replace(' ', ''))
|
| 1103 |
+
corr_chars = set(corr_text.replace(' ', ''))
|
| 1104 |
+
if orig_chars and corr_chars:
|
| 1105 |
+
jaccard = len(orig_chars & corr_chars) / len(orig_chars | corr_chars)
|
| 1106 |
+
if jaccard < 0.3:
|
| 1107 |
+
logger.info(
|
| 1108 |
+
f"[GRAMMAR] Rejected hallucination: '{orig_text}'→'{corr_text}' "
|
| 1109 |
+
f"(jaccard={jaccard:.2f})"
|
| 1110 |
+
)
|
| 1111 |
+
continue
|
| 1112 |
+
|
| 1113 |
# Re-label: if grammar's change is purely orthographic
|
| 1114 |
# (hamza, ه→ة, etc.), tag it as 'spelling' for correct UI icon
|
| 1115 |
stage_label = 'grammar'
|
| 1116 |
+
if _is_spelling_only_change(orig_text, corr_text):
|
| 1117 |
stage_label = 'spelling'
|
| 1118 |
ctx.add_patch(
|
| 1119 |
stage_label, d['start'], d['end'],
|
| 1120 |
+
corr_text, confidence=1.0
|
| 1121 |
)
|
| 1122 |
ctx.mutate_text(corrected_grammar, OffsetMapper)
|
| 1123 |
current_text = ctx.current_text
|
|
@@ -163,11 +163,16 @@ class ArabicGrammarGuard:
|
|
| 163 |
|
| 164 |
def fix_subject_verb_agreement(self, text):
|
| 165 |
"""
|
| 166 |
-
Fix G1: When a plural
|
| 167 |
the verb must agree in number and gender.
|
| 168 |
|
| 169 |
Arabic rule: In VSO order, verb can be singular even with plural subject.
|
| 170 |
But in SVO order, subject-verb agreement is required.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
"""
|
| 172 |
tokens = simple_word_tokenize(text)
|
| 173 |
if len(tokens) < 2:
|
|
@@ -175,8 +180,16 @@ class ArabicGrammarGuard:
|
|
| 175 |
disambig_tokens = self.mle.disambiguate(tokens)
|
| 176 |
corrected_tokens = list(tokens)
|
| 177 |
|
| 178 |
-
#
|
| 179 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
for i in range(len(disambig_tokens) - 1):
|
| 181 |
noun_info = disambig_tokens[i].analyses[0] if disambig_tokens[i].analyses else None
|
| 182 |
verb_info = disambig_tokens[i+1].analyses[0] if disambig_tokens[i+1].analyses else None
|
|
@@ -188,6 +201,10 @@ class ArabicGrammarGuard:
|
|
| 188 |
noun_word = corrected_tokens[i]
|
| 189 |
verb_word = corrected_tokens[i+1]
|
| 190 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
# Only process noun → verb patterns (SVO order)
|
| 192 |
if noun_pos != 'noun' or verb_pos != 'verb':
|
| 193 |
continue
|
|
@@ -200,39 +217,46 @@ class ArabicGrammarGuard:
|
|
| 200 |
if verb_num != 's':
|
| 201 |
continue
|
| 202 |
|
| 203 |
-
#
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
|
|
|
|
|
|
|
|
|
| 209 |
KNOWN_PLURALS_MASC = {
|
| 210 |
'الطلاب', 'طلاب', 'الرجال', 'رجال', 'الأولاد', 'أولاد',
|
| 211 |
'الأطباء', 'أطباء', 'الاطباء', 'اطباء',
|
| 212 |
-
'العمال', 'عمال', 'ال
|
|
|
|
| 213 |
}
|
| 214 |
KNOWN_PLURALS_FEM = {
|
| 215 |
'الطالبات', 'طالبات', 'النساء', 'نساء', 'البنات', 'بنات',
|
| 216 |
'المعلمات', 'معلمات', 'الأمهات', 'أمهات',
|
| 217 |
}
|
|
|
|
| 218 |
if noun_word in KNOWN_PLURALS_MASC:
|
| 219 |
is_plural_masc = True
|
| 220 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 221 |
is_plural_fem = True
|
| 222 |
|
| 223 |
if not is_plural_masc and not is_plural_fem:
|
| 224 |
continue
|
| 225 |
|
| 226 |
# Fix the verb to agree with the plural subject
|
| 227 |
-
# Past tense singular → plural
|
| 228 |
if is_plural_fem:
|
| 229 |
-
# Feminine plural: ذهب → ذهبن
|
| 230 |
if not verb_word.endswith('ن') and not verb_word.endswith('نَ'):
|
| 231 |
-
# Check if it's a past tense verb (typically 3-5 chars, no prefix)
|
| 232 |
if len(verb_word) >= 3 and not verb_word.startswith('ي') and not verb_word.startswith('ت'):
|
| 233 |
corrected_tokens[i+1] = verb_word + 'ن'
|
| 234 |
elif is_plural_masc:
|
| 235 |
-
# Masculine plural: ذهب → ذهبوا
|
| 236 |
if (not verb_word.endswith('وا') and not verb_word.endswith('ون')
|
| 237 |
and not verb_word.endswith('ين')):
|
| 238 |
if len(verb_word) >= 3 and not verb_word.startswith('ي') and not verb_word.startswith('ت'):
|
|
|
|
| 163 |
|
| 164 |
def fix_subject_verb_agreement(self, text):
|
| 165 |
"""
|
| 166 |
+
Fix G1: When a CONFIRMED plural noun PRECEDES a singular verb (SVO order),
|
| 167 |
the verb must agree in number and gender.
|
| 168 |
|
| 169 |
Arabic rule: In VSO order, verb can be singular even with plural subject.
|
| 170 |
But in SVO order, subject-verb agreement is required.
|
| 171 |
+
|
| 172 |
+
EXCLUSIONS:
|
| 173 |
+
- Pronouns (أنا, أنت, هو, etc.) — these are NOT plural
|
| 174 |
+
- Proper nouns — don't modify verbs after names
|
| 175 |
+
- Words tagged as singular by the disambiguator
|
| 176 |
"""
|
| 177 |
tokens = simple_word_tokenize(text)
|
| 178 |
if len(tokens) < 2:
|
|
|
|
| 180 |
disambig_tokens = self.mle.disambiguate(tokens)
|
| 181 |
corrected_tokens = list(tokens)
|
| 182 |
|
| 183 |
+
# Words that should NEVER trigger plural verb agreement
|
| 184 |
+
EXCLUDED_WORDS = {
|
| 185 |
+
# Pronouns (singular/dual, plus first-person plural نحن) — none should trigger the plural-noun verb fix
|
| 186 |
+
'أنا', 'انا', 'أنت', 'انت', 'أنتِ', 'هو', 'هي',
|
| 187 |
+
'نحن', 'أنتما', 'هما',
|
| 188 |
+
# Common words that look like nouns but aren't plural
|
| 189 |
+
'كان', 'وكان', 'كانت', 'وكانت', 'ليس', 'ليست',
|
| 190 |
+
'هذا', 'هذه', 'ذلك', 'تلك', 'هناك',
|
| 191 |
+
}
|
| 192 |
+
|
| 193 |
for i in range(len(disambig_tokens) - 1):
|
| 194 |
noun_info = disambig_tokens[i].analyses[0] if disambig_tokens[i].analyses else None
|
| 195 |
verb_info = disambig_tokens[i+1].analyses[0] if disambig_tokens[i+1].analyses else None
|
|
|
|
| 201 |
noun_word = corrected_tokens[i]
|
| 202 |
verb_word = corrected_tokens[i+1]
|
| 203 |
|
| 204 |
+
# Skip excluded words
|
| 205 |
+
if noun_word in EXCLUDED_WORDS:
|
| 206 |
+
continue
|
| 207 |
+
|
| 208 |
# Only process noun → verb patterns (SVO order)
|
| 209 |
if noun_pos != 'noun' or verb_pos != 'verb':
|
| 210 |
continue
|
|
|
|
| 217 |
if verb_num != 's':
|
| 218 |
continue
|
| 219 |
|
| 220 |
+
# Only trigger on CONFIRMED plurals:
|
| 221 |
+
# 1. Known broken plural nouns (hardcoded list)
|
| 222 |
+
# 2. Sound masculine plural ending in ون/ين
|
| 223 |
+
# 3. Sound feminine plural ending in ات
|
| 224 |
+
# Do NOT rely on POS tagger alone — it misclassifies too many words
|
| 225 |
+
|
| 226 |
+
is_plural_masc = False
|
| 227 |
+
is_plural_fem = False
|
| 228 |
+
|
| 229 |
KNOWN_PLURALS_MASC = {
|
| 230 |
'الطلاب', 'طلاب', 'الرجال', 'رجال', 'الأولاد', 'أولاد',
|
| 231 |
'الأطباء', 'أطباء', 'الاطباء', 'اطباء',
|
| 232 |
+
'العمال', 'عمال', 'الشباب', 'الأبناء',
|
| 233 |
+
'المهندسون', 'المعلمون', 'المهندسين', 'المعلمين',
|
| 234 |
}
|
| 235 |
KNOWN_PLURALS_FEM = {
|
| 236 |
'الطالبات', 'طالبات', 'النساء', 'نساء', 'البنات', 'بنات',
|
| 237 |
'المعلمات', 'معلمات', 'الأمهات', 'أمهات',
|
| 238 |
}
|
| 239 |
+
|
| 240 |
if noun_word in KNOWN_PLURALS_MASC:
|
| 241 |
is_plural_masc = True
|
| 242 |
+
elif noun_word in KNOWN_PLURALS_FEM:
|
| 243 |
+
is_plural_fem = True
|
| 244 |
+
elif noun_word.endswith('ون') or noun_word.endswith('ين'):
|
| 245 |
+
# Sound masculine plural — but only if 5+ chars (avoid short words)
|
| 246 |
+
if len(noun_word) >= 5:
|
| 247 |
+
is_plural_masc = True
|
| 248 |
+
elif noun_word.endswith('ات') and len(noun_word) >= 5:
|
| 249 |
is_plural_fem = True
|
| 250 |
|
| 251 |
if not is_plural_masc and not is_plural_fem:
|
| 252 |
continue
|
| 253 |
|
| 254 |
# Fix the verb to agree with the plural subject
|
|
|
|
| 255 |
if is_plural_fem:
|
|
|
|
| 256 |
if not verb_word.endswith('ن') and not verb_word.endswith('نَ'):
|
|
|
|
| 257 |
if len(verb_word) >= 3 and not verb_word.startswith('ي') and not verb_word.startswith('ت'):
|
| 258 |
corrected_tokens[i+1] = verb_word + 'ن'
|
| 259 |
elif is_plural_masc:
|
|
|
|
| 260 |
if (not verb_word.endswith('وا') and not verb_word.endswith('ون')
|
| 261 |
and not verb_word.endswith('ين')):
|
| 262 |
if len(verb_word) >= 3 and not verb_word.startswith('ي') and not verb_word.startswith('ت'):
|