Spaces:

bayan10
/

bayan-api

Running

youssefreda9 committed 14 days ago

Commit

a16af4a

1 Parent(s): 751ba66

fix(critical): stop spelling from corrupting correct words + fix pronoun agreement + reject hallucinations

3 critical fixes:

1. IV PROTECTION: When both original AND correction are in-vocabulary (valid
Arabic words), ONLY accept the change if it's a known orthographic fix
(hamza whitelist or ه→ة). This blocks وكان→وكأن type corruption where
the model changes one correct word to a completely different correct word.

2. PRONOUN EXCLUSION: fix_subject_verb_agreement now excludes pronouns
(أنا, أنت, هو, etc.) from triggering plural verb agreement. Previously
it incorrectly changed أنا ذهبت → أنا ذهبتوا.

3. HALLUCINATION FILTER: Grammar diffs with Jaccard char similarity <0.3
are rejected (e.g. جالس→جاكسون). Prevents model hallucinations from
reaching the user.

Also adds [SPELLING] Accepted/Rejected debug logging for production tracing.

59/59 tests passing

Files changed (2) hide show

src/app.py +57 -5
src/nlp/grammar/grammar_rules.py +39 -15

src/app.py CHANGED Viewed

@@ -734,10 +734,14 @@ def _levenshtein(a, b):
     return dp[m][n]
-def _is_small_spelling_change(orig_word, corr_word):
     """
     Heuristic: only accept small spelling edits and ignore
     aggressive changes (to avoid over-editing).
     """
     if not orig_word or not corr_word:
         return False
@@ -759,6 +763,36 @@ def _is_small_spelling_change(orig_word, corr_word):
         if corr_word == orig_word[:-1] or len(corr_word) < len(orig_word):
             return False
     dist = _levenshtein(orig_word, corr_word)
     max_len = max(len(orig_word), len(corr_word))
@@ -954,7 +988,8 @@ def analyze_text():
                                 # 1-word → 1-word: accept only small edits (typos)
                                 o_word = o_segment[0]
                                 c_word = c_segment[0]
-                                if _is_small_spelling_change(o_word, c_word):
                                     new_words.append(c_word)
                                     ctx.add_patch(
                                         'spelling', start_idx, end_idx,
@@ -962,6 +997,7 @@ def analyze_text():
                                         alternatives=_get_spelling_alternatives(o_word, c_word, spell_checker),
                                     )
                                 else:
                                     new_words.append(current_text[start_idx:end_idx])
                             elif len(o_segment) == 1 and len(c_segment) > 1:
                                 # 1-word → N words: accept word splits (e.g. فيالمدرسة → في المدرسة)
@@ -989,7 +1025,7 @@ def analyze_text():
                                     if ci < len(c_segment):
                                         c_word = c_segment[ci]
                                         # Check if this is a 1→1 small edit
-                                        if _is_small_spelling_change(o_word, c_word):
                                             new_words.append(c_word)
                                             ctx.add_patch(
                                                 'spelling', o_start, o_end,
@@ -1058,14 +1094,30 @@ def analyze_text():
                             f"'{d.get('original','')}' — locked by previous stage"
                         )
                         continue
                     # Re-label: if grammar's change is purely orthographic
                     # (hamza, ه→ة, etc.), tag it as 'spelling' for correct UI icon
                     stage_label = 'grammar'
-                    if _is_spelling_only_change(d.get('original', ''), d.get('correction', '')):
                         stage_label = 'spelling'
                     ctx.add_patch(
                         stage_label, d['start'], d['end'],
-                        d['correction'], confidence=1.0
                     )
                 ctx.mutate_text(corrected_grammar, OffsetMapper)
                 current_text = ctx.current_text

     return dp[m][n]
+def _is_small_spelling_change(orig_word, corr_word, vocab_manager=None):
     """
     Heuristic: only accept small spelling edits and ignore
     aggressive changes (to avoid over-editing).
+    CRITICAL: If both words are in-vocabulary (both are valid Arabic words),
+    only accept known orthographic fixes (ه→ة, hamza whitelist).
+    This prevents the model from corrupting correct words (e.g. وكان→وكأن).
     """
     if not orig_word or not corr_word:
         return False
         if corr_word == orig_word[:-1] or len(corr_word) < len(orig_word):
             return False
+    # CRITICAL: If both words are valid Arabic words, only accept known fixes.
+    # This prevents the spelling model from changing one correct word to another
+    # (e.g. وكان→وكأن, which changes "and was" to "as if" — a meaning change).
+    if vocab_manager:
+        orig_iv = vocab_manager.is_iv(orig_word)
+        corr_iv = vocab_manager.is_iv(corr_word)
+        if orig_iv and corr_iv:
+            # Both are valid words — only accept known orthographic fixes:
+            # 1. ه→ة at word end (feminine marker fix)
+            if (orig_word.endswith('ه') and corr_word.endswith('ة')
+                    and orig_word[:-1] == corr_word[:-1]):
+                return True
+            # 2. ة→ه at word end (less common but valid)
+            if (orig_word.endswith('ة') and corr_word.endswith('ه')
+                    and orig_word[:-1] == corr_word[:-1]):
+                return True
+            # 3. Word is in the hamza whitelist (known common errors)
+            from nlp.spelling.araspell_rules import AraSpellPostProcessor
+            if orig_word in AraSpellPostProcessor.HAMZA_WHITELIST:
+                return True
+            # 4. Check prefixed hamza (و+whitelist word, etc.)
+            for prefix in AraSpellPostProcessor.HAMZA_PREFIXES:
+                if orig_word.startswith(prefix) and len(orig_word) > len(prefix) + 1:
+                    remainder = orig_word[len(prefix):]
+                    if remainder in AraSpellPostProcessor.HAMZA_WHITELIST:
+                        return True
+            # Both are valid words and change is NOT a known fix — REJECT
+            # This prevents وكان→وكأن, etc.
+            return False
     dist = _levenshtein(orig_word, corr_word)
     max_len = max(len(orig_word), len(corr_word))
                                 # 1-word → 1-word: accept only small edits (typos)
                                 o_word = o_segment[0]
                                 c_word = c_segment[0]
+                                if _is_small_spelling_change(o_word, c_word, spell_checker.vocab_manager):
+                                    logger.info(f"[SPELLING] Accepted: '{o_word}'→'{c_word}'")
                                     new_words.append(c_word)
                                     ctx.add_patch(
                                         'spelling', start_idx, end_idx,
                                         alternatives=_get_spelling_alternatives(o_word, c_word, spell_checker),
                                     )
                                 else:
+                                    logger.info(f"[SPELLING] Rejected: '{o_word}'→'{c_word}' (filter blocked)")
                                     new_words.append(current_text[start_idx:end_idx])
                             elif len(o_segment) == 1 and len(c_segment) > 1:
                                 # 1-word → N words: accept word splits (e.g. فيالمدرسة → في المدرسة)
                                     if ci < len(c_segment):
                                         c_word = c_segment[ci]
                                         # Check if this is a 1→1 small edit
+                                        if _is_small_spelling_change(o_word, c_word, spell_checker.vocab_manager):
                                             new_words.append(c_word)
                                             ctx.add_patch(
                                                 'spelling', o_start, o_end,
                             f"'{d.get('original','')}' — locked by previous stage"
                         )
                         continue
+                    # Reject grammar hallucinations (e.g. جالس→جاكسون)
+                    orig_text = d.get('original', '')
+                    corr_text = d.get('correction', '')
+                    if orig_text and corr_text:
+                        orig_chars = set(orig_text.replace(' ', ''))
+                        corr_chars = set(corr_text.replace(' ', ''))
+                        if orig_chars and corr_chars:
+                            jaccard = len(orig_chars & corr_chars) / len(orig_chars | corr_chars)
+                            if jaccard < 0.3:
+                                logger.info(
+                                    f"[GRAMMAR] Rejected hallucination: '{orig_text}'→'{corr_text}' "
+                                    f"(jaccard={jaccard:.2f})"
+                                )
+                                continue
                     # Re-label: if grammar's change is purely orthographic
                     # (hamza, ه→ة, etc.), tag it as 'spelling' for correct UI icon
                     stage_label = 'grammar'
+                    if _is_spelling_only_change(orig_text, corr_text):
                         stage_label = 'spelling'
                     ctx.add_patch(
                         stage_label, d['start'], d['end'],
+                        corr_text, confidence=1.0
                     )
                 ctx.mutate_text(corrected_grammar, OffsetMapper)
                 current_text = ctx.current_text

src/nlp/grammar/grammar_rules.py CHANGED Viewed

@@ -163,11 +163,16 @@ class ArabicGrammarGuard:
     def fix_subject_verb_agreement(self, text):
         """
-        Fix G1: When a plural/dual noun PRECEDES a singular verb (SVO order),
         the verb must agree in number and gender.
         Arabic rule: In VSO order, verb can be singular even with plural subject.
         But in SVO order, subject-verb agreement is required.
         """
         tokens = simple_word_tokenize(text)
         if len(tokens) < 2:
@@ -175,8 +180,16 @@ class ArabicGrammarGuard:
         disambig_tokens = self.mle.disambiguate(tokens)
         corrected_tokens = list(tokens)
-        # Common plural nouns (masculine sound plural) ending in ون/ين/ات
-        # and their expected verb conjugation patterns
         for i in range(len(disambig_tokens) - 1):
             noun_info = disambig_tokens[i].analyses[0] if disambig_tokens[i].analyses else None
             verb_info = disambig_tokens[i+1].analyses[0] if disambig_tokens[i+1].analyses else None
@@ -188,6 +201,10 @@ class ArabicGrammarGuard:
             noun_word = corrected_tokens[i]
             verb_word = corrected_tokens[i+1]
             # Only process noun → verb patterns (SVO order)
             if noun_pos != 'noun' or verb_pos != 'verb':
                 continue
@@ -200,39 +217,46 @@ class ArabicGrammarGuard:
             if verb_num != 's':
                 continue
-            # Detect plural nouns
-            is_plural_masc = (noun_word.endswith('ون') or noun_word.endswith('ين')
-                             or noun_num == 'p')
-            is_plural_fem = (noun_word.endswith('ات') or
-                            (noun_gen == 'f' and noun_num == 'p'))
-            # Common broken plurals and collective nouns
             KNOWN_PLURALS_MASC = {
                 'الطلاب', 'طلاب', 'الرجال', 'رجال', 'الأولاد', 'أولاد',
                 'الأطباء', 'أطباء', 'الاطباء', 'اطباء',
-                'العمال', 'عمال', 'الناس', 'الشباب', 'الأبناء',
             }
             KNOWN_PLURALS_FEM = {
                 'الطالبات', 'طالبات', 'النساء', 'نساء', 'البنات', 'بنات',
                 'المعلمات', 'معلمات', 'الأمهات', 'أمهات',
             }
             if noun_word in KNOWN_PLURALS_MASC:
                 is_plural_masc = True
-            if noun_word in KNOWN_PLURALS_FEM:
                 is_plural_fem = True
             if not is_plural_masc and not is_plural_fem:
                 continue
             # Fix the verb to agree with the plural subject
-            # Past tense singular → plural
             if is_plural_fem:
-                # Feminine plural: ذهب → ذهبن
                 if not verb_word.endswith('ن') and not verb_word.endswith('نَ'):
-                    # Check if it's a past tense verb (typically 3-5 chars, no prefix)
                     if len(verb_word) >= 3 and not verb_word.startswith('ي') and not verb_word.startswith('ت'):
                         corrected_tokens[i+1] = verb_word + 'ن'
             elif is_plural_masc:
-                # Masculine plural: ذهب → ذهبوا
                 if (not verb_word.endswith('وا') and not verb_word.endswith('ون')
                         and not verb_word.endswith('ين')):
                     if len(verb_word) >= 3 and not verb_word.startswith('ي') and not verb_word.startswith('ت'):

     def fix_subject_verb_agreement(self, text):
         """
+        Fix G1: When a CONFIRMED plural noun PRECEDES a singular verb (SVO order),
         the verb must agree in number and gender.
         Arabic rule: In VSO order, verb can be singular even with plural subject.
         But in SVO order, subject-verb agreement is required.
+        EXCLUSIONS:
+        - Pronouns (أنا, أنت, هو, etc.) — these are NOT plural
+        - Proper nouns — don't modify verbs after names
+        - Words tagged as singular by the disambiguator
         """
         tokens = simple_word_tokenize(text)
         if len(tokens) < 2:
         disambig_tokens = self.mle.disambiguate(tokens)
         corrected_tokens = list(tokens)
+        # Words that should NEVER trigger plural verb agreement
+        EXCLUDED_WORDS = {
+            # Personal pronouns (singular, dual, and first-person plural نحن)
+            'أنا', 'انا', 'أنت', 'انت', 'أنتِ', 'هو', 'هي',
+            'نحن', 'أنتما', 'هما',
+            # Common words that look like nouns but aren't plural
+            'كان', 'وكان', 'كانت', 'وكانت', 'ليس', 'ليست',
+            'هذا', 'هذه', 'ذلك', 'تلك', 'هناك',
+        }
         for i in range(len(disambig_tokens) - 1):
             noun_info = disambig_tokens[i].analyses[0] if disambig_tokens[i].analyses else None
             verb_info = disambig_tokens[i+1].analyses[0] if disambig_tokens[i+1].analyses else None
             noun_word = corrected_tokens[i]
             verb_word = corrected_tokens[i+1]
+            # Skip excluded words
+            if noun_word in EXCLUDED_WORDS:
+                continue
             # Only process noun → verb patterns (SVO order)
             if noun_pos != 'noun' or verb_pos != 'verb':
                 continue
             if verb_num != 's':
                 continue
+            # Only trigger on CONFIRMED plurals:
+            # 1. Known broken plural nouns (hardcoded list)
+            # 2. Sound masculine plural ending in ون/ين
+            # 3. Sound feminine plural ending in ات
+            # Do NOT rely on POS tagger alone — it misclassifies too many words
+            is_plural_masc = False
+            is_plural_fem = False
             KNOWN_PLURALS_MASC = {
                 'الطلاب', 'طلاب', 'الرجال', 'رجال', 'الأولاد', 'أولاد',
                 'الأطباء', 'أطباء', 'الاطباء', 'اطباء',
+                'العمال', 'عمال', 'الشباب', 'الأبناء',
+                'المهندسون', 'المعلمون', 'المهندسين', 'المعلمين',
             }
             KNOWN_PLURALS_FEM = {
                 'الطالبات', 'طالبات', 'النساء', 'نساء', 'البنات', 'بنات',
                 'المعلمات', 'معلمات', 'الأمهات', 'أمهات',
             }
             if noun_word in KNOWN_PLURALS_MASC:
                 is_plural_masc = True
+            elif noun_word in KNOWN_PLURALS_FEM:
+                is_plural_fem = True
+            elif noun_word.endswith('ون') or noun_word.endswith('ين'):
+                # Sound masculine plural — but only if 5+ chars (avoid short words)
+                if len(noun_word) >= 5:
+                    is_plural_masc = True
+            elif noun_word.endswith('ات') and len(noun_word) >= 5:
                 is_plural_fem = True
             if not is_plural_masc and not is_plural_fem:
                 continue
             # Fix the verb to agree with the plural subject
             if is_plural_fem:
                 if not verb_word.endswith('ن') and not verb_word.endswith('نَ'):
                     if len(verb_word) >= 3 and not verb_word.startswith('ي') and not verb_word.startswith('ت'):
                         corrected_tokens[i+1] = verb_word + 'ن'
             elif is_plural_masc:
                 if (not verb_word.endswith('وا') and not verb_word.endswith('ون')
                         and not verb_word.endswith('ين')):
                     if len(verb_word) >= 3 and not verb_word.startswith('ي') and not verb_word.startswith('ت'):