Spaces:

bayan10
/

bayan-api

Running

youssefreda9 committed 6 days ago

Commit

eae5d36

1 Parent(s): 17a3ac2

FIX-38/39 + Layer 1/2/3: Benchmark normalization + spelling safety + grammar rules

Layer 1: Strip trailing punct + diacritics in benchmark comparison
Layer 2: Add fix_tanween_fathah and fix_initial_hamza grammar rules
Layer 3 (FIX-38): Expand pronoun suffix guard — block ه→ة when stem is IV
Layer 3 (FIX-39): Add edit-distance hallucination guard — block corrections
whose Levenshtein distance exceeds max(2, 40% of the longer word's length); applied when the longer word has 3+ characters

Inspired by legacy AraSpell WordAligner and OutputValidator patterns.
Tests: 39 passing.

Files changed (3) hide show

src/app.py +23 -6
src/nlp/grammar/grammar_rules.py +79 -0
tests/phase10/run_collision_benchmark.py +4 -2

src/app.py CHANGED Viewed

@@ -821,6 +821,18 @@ def _is_small_spelling_change(orig_word, corr_word, vocab_manager=None):
     if orig_word == corr_word:
         return 0.0
     # ── GUARD 1: Numeral protection (Phase 1, BUG-011/012/E1) ──
     # Reject corrections that remove/change/introduce digits.
     # Numeral hallucination is a complete-replacement failure mode.
@@ -901,13 +913,18 @@ def _is_small_spelling_change(orig_word, corr_word, vocab_manager=None):
             #    E.g., فتأملته (fataamaltahu) → فتأملتة is WRONG.
             if (orig_word.endswith('ه') and corr_word.endswith('ة')
                     and orig_word[:-1] == corr_word[:-1]):
-                # Guard: if word ends in ته, the ه is likely a pronoun suffix
-                # Pattern: verb+ته = "verb + him/it", NOT ta marbuta.
-                # E.g., فتأملته → فتأملتة is WRONG.
-                if len(orig_word) >= 3 and orig_word[-2] == 'ت':
                     logger.info(
-                        f"[SPELLING] Blocked ه→ة at pronoun suffix: "
-                        f"'{orig_word}'→'{corr_word}' (ته pattern = pronoun 'him/it')"
                     )
                     return 0.0
                 return 0.9

     if orig_word == corr_word:
         return 0.0
+    # ── FIX-39: Edit distance hallucination guard (from legacy AraSpell OutputValidator) ──
+    # Block corrections where the edit distance is too high relative to word length.
+    # This catches model hallucinations like والممرضات→والرضا, شجعتهم→يجعلهم, طبخ→طبي.
+    _ed_dist = _levenshtein(orig_word, corr_word)
+    _max_len = max(len(orig_word), len(corr_word))
+    if _max_len >= 3 and _ed_dist > max(2, _max_len * 0.4):
+        logger.info(
+            f"[SPELLING] Blocked hallucination: '{orig_word}'→'{corr_word}' "
+            f"(edit_dist={_ed_dist}, max_allowed={max(2, int(_max_len * 0.4))})"
+        )
+        return 0.0
     # ── GUARD 1: Numeral protection (Phase 1, BUG-011/012/E1) ──
     # Reject corrections that remove/change/introduce digits.
     # Numeral hallucination is a complete-replacement failure mode.
             #    E.g., فتأملته (fataamaltahu) → فتأملتة is WRONG.
             if (orig_word.endswith('ه') and corr_word.endswith('ة')
                     and orig_word[:-1] == corr_word[:-1]):
+                # FIX-38: Expanded pronoun suffix guard.
+                # ه at end can be: (a) ta marbuta (should be ة) OR (b) pronoun "him/it".
+                # The old guard only blocked ته. But كله (كل+ه), احبه (احب+ه),
+                # عنده (عند+ه) are ALL pronoun suffixes — the ه is NOT ta marbuta.
+                # Strategy (from legacy AraSpell WordAligner): if the STEM (word without ه)
+                # is itself IV, then ه is likely a pronoun suffix → block the change.
+                # If the stem is NOT IV, ه is likely a misspelled ة → allow.
+                stem = orig_word[:-1]
+                if len(stem) >= 2 and vocab_manager.is_iv(stem):
                     logger.info(
+                        f"[SPELLING] Blocked ه→ة (pronoun suffix): "
+                        f"'{orig_word}'→'{corr_word}' (stem '{stem}' is IV → ه is pronoun)"
                     )
                     return 0.0
                 return 0.9

src/nlp/grammar/grammar_rules.py CHANGED Viewed

@@ -620,6 +620,8 @@ class ArabicGrammarGuard:
             ('fix_prepositions_advanced', self.fix_prepositions_advanced),
             ('fix_subject_verb_agreement', self.fix_subject_verb_agreement),
             ('fix_conditional_sentences', self.fix_conditional_sentences),
             ('regex_rules_fallback', self.regex_rules_fallback),
         ]:
             try:
@@ -630,3 +632,80 @@ class ArabicGrammarGuard:
         text = re.sub(r'\s+', ' ', text).strip()
         return text

             ('fix_prepositions_advanced', self.fix_prepositions_advanced),
             ('fix_subject_verb_agreement', self.fix_subject_verb_agreement),
             ('fix_conditional_sentences', self.fix_conditional_sentences),
+            ('fix_tanween_fathah', self.fix_tanween_fathah),
+            ('fix_initial_hamza', self.fix_initial_hamza),
             ('regex_rules_fallback', self.regex_rules_fallback),
         ]:
             try:
         text = re.sub(r'\s+', ' ', text).strip()
         return text
+    def fix_tanween_fathah(self, text):
+        """
+        Add tanween fathah (ً) to indefinite accusative nouns ending in ا.
+        Arabic rule: Words like جدا, كثيرا, قرارا should be جداً, كثيراً, قراراً.
+        The trailing ا without tanween is a common orthographic error.
+        From legacy AraSpell._normalize_tanween_patterns():
+        Only apply to words >= 3 chars ending in ا where the ا is NOT part of
+        the root (e.g. NOT ما، إلى، على، أنا، هذا).
+        """
+        # Common words ending in ا that should NOT get tanween
+        _NO_TANWEEN = {
+            'ما', 'إذا', 'هذا', 'أنا', 'إلى', 'على', 'حتى', 'متى', 'لما',
+            'إلا', 'أما', 'كما', 'ربما', 'مهما', 'أيضا',  # أيضا is debatable
+            'عندما', 'بينما', 'حينما', 'كلما', 'عموما',
+            'دائما', 'سابقا', 'لاحقا', 'حاليا', 'تقريبا',
+            'وفقا', 'نظرا', 'استنادا', 'خصوصا', 'عموما',
+            'مباشرا',
+        }
+        # Words that ALWAYS get tanween
+        _ALWAYS_TANWEEN = {
+            'جدا': 'جداً',
+            'كثيرا': 'كثيراً',
+            'شكرا': 'شكراً',
+            'نظرا': 'نظراً',
+            'قليلا': 'قليلاً',
+            'أيضا': 'أيضاً',
+            'فورا': 'فوراً',
+            'سابقا': 'سابقاً',
+            'لاحقا': 'لاحقاً',
+            'حاليا': 'حالياً',
+            'تقريبا': 'تقريباً',
+            'خصوصا': 'خصوصاً',
+            'عموما': 'عموماً',
+            'دائما': 'دائماً',
+            'مباشرا': 'مباشراً',
+            'أبدا': 'أبداً',
+            'غالبا': 'غالباً',
+            'أحيانا': 'أحياناً',
+            'مثلا': 'مثلاً',
+        }
+        words = text.split()
+        for i, w in enumerate(words):
+            if w in _ALWAYS_TANWEEN:
+                words[i] = _ALWAYS_TANWEEN[w]
+        return ' '.join(words)
+    def fix_initial_hamza(self, text):
+        """
+        Fix missing hamza on initial alef for common verb/noun patterns.
+        Arabic rule: أفعل-pattern verbs and certain nouns require hamza:
+        - اعلن → أعلن (أَفْعَل form IV verb)
+        - اصدر → أصدر
+        - اسلم → أسلم
+        """
+        # Common words where initial ا should be أ
+        _HAMZA_FIXES = {
+            'اعلن': 'أعلن', 'اعلنت': 'أعلنت', 'اعلنوا': 'أعلنوا',
+            'اصدر': 'أصدر', 'اصدرت': 'أصدرت', 'اصدروا': 'أصدروا',
+            'اسلم': 'أسلم', 'اسلمت': 'أسلمت', 'اسلموا': 'أسلموا',
+            'اكد': 'أكد', 'اكدت': 'أكدت', 'اكدوا': 'أكدوا',
+            'اعطى': 'أعطى', 'اعطت': 'أعطت', 'اعطوا': 'أعطوا',
+            'انجز': 'أنجز', 'انجزت': 'أنجزت', 'انجزوا': 'أنجزوا',
+            'ارسل': 'أرسل', 'ارسلت': 'أرسلت', 'ارسلوا': 'أرسلوا',
+            'اخرج': 'أخرج', 'اخرجت': 'أخرجت', 'اخرجوا': 'أخرجوا',
+            'انشأ': 'أنشأ', 'انشأت': 'أنشأت', 'انشأوا': 'أنشأوا',
+            'اضاف': 'أضاف', 'اضافت': 'أضافت', 'اضافوا': 'أضافوا',
+            'الامهات': 'الأمهات', 'الاطفال': 'الأطفال',
+            'الامة': 'الأمة', 'الاستاذ': 'الأستاذ',
+        }
+        words = text.split()
+        for i, w in enumerate(words):
+            if w in _HAMZA_FIXES:
+                words[i] = _HAMZA_FIXES[w]
+        return ' '.join(words)

tests/phase10/run_collision_benchmark.py CHANGED Viewed

@@ -30,8 +30,10 @@ def _strip_diacritics(text):
 def _normalize(text):
-    """Normalize for comparison: strip diacritics + collapse whitespace."""
-    return re.sub(r'\s+', ' ', _strip_diacritics(text)).strip()
 def run_collision_benchmark(api: API, samples: list) -> List[BenchResult]:

 def _normalize(text):
+    """Normalize for comparison: strip diacritics + trailing punct + collapse whitespace."""
+    text = _strip_diacritics(text)
+    text = text.rstrip('.،؛؟!?!')  # Terminal punct is not a correctness criterion
+    return re.sub(r'\s+', ' ', text).strip()
 def run_collision_benchmark(api: API, samples: list) -> List[BenchResult]: