Spaces:

bayan10
/

bayan-api

Sleeping

youssefreda9 committed 15 days ago

Commit

e68c40c

1 Parent(s): 53a22ae

fix: all model bugs — S1 S2 S3 G1 P1 (6 fixes across 4 files)

S1 (P0): WordAligner now prefers ة over ه at word end when both forms are in-vocabulary (IV)
- araspell_rules.py _select_best_word: ه→ة preference for feminine nouns

S2 (P0): Gender preservation — reject corrections that drop feminine marker
- app.py _is_small_spelling_change: block بارده→بارد, منخفظه→منخفض

S3 (P1): Hamza whitelist — 50+ common Arabic hamza corrections
- araspell_rules.py HAMZA_WHITELIST + fix_common_hamza()
- Fixes: الي→إلى, انت→أنت, لان→لأن, امس→أمس, الايام→الأيام, etc.

G1 (P1): Verb-subject agreement for SVO word order
- grammar_rules.py fix_subject_verb_agreement()
- Handles: الطلاب ذهب→ذهبوا, الطالبات ذهب→ذهبن

P1 (P2): Punctuation model now only adds marks, no spelling/grammar changes
- punctuation_service.py _strip_non_punctuation_changes()
- Reverts PuncAra's baked-in spelling/grammar corrections, keeps only marks

S4 (P2): Mitigated by S1+S2+S3 — spelling now makes better corrections,
fewer bad locks blocking grammar

31/31 tests passing

Files changed (4) hide show

src/app.py +8 -0
src/nlp/grammar/grammar_rules.py +81 -0
src/nlp/punctuation/punctuation_service.py +97 -1
src/nlp/spelling/araspell_rules.py +87 -5

src/app.py CHANGED Viewed

@@ -751,6 +751,14 @@ def _is_small_spelling_change(orig_word, corr_word):
     if re.search(r'[^ء-يآأإىa-zA-Z]', corr_word):
         return False
     dist = _levenshtein(orig_word, corr_word)
     max_len = max(len(orig_word), len(corr_word))

     if re.search(r'[^ء-يآأإىa-zA-Z]', corr_word):
         return False
+    # Fix S2: Reject corrections that drop feminine marker (ه/ة)
+    # e.g. بارده→بارد, منخفظه→منخفض — these are WORSE than no correction
+    feminine_endings = ('ه', 'ة')
+    if orig_word.endswith(feminine_endings) and not corr_word.endswith(feminine_endings):
+        # Reject any correction that is shorter than the original (this includes the word minus its final letter)
+        if corr_word == orig_word[:-1] or len(corr_word) < len(orig_word):
+            return False
     dist = _levenshtein(orig_word, corr_word)
     max_len = max(len(orig_word), len(corr_word))

src/nlp/grammar/grammar_rules.py CHANGED Viewed

@@ -161,6 +161,85 @@ class ArabicGrammarGuard:
         text = re.sub(r'\b([وف]?ل)([أ-ي]{4,})(ون|ان)\b', r'\1\2ين', text)
         return text
     def regex_rules_fallback(self, text):
         # إن وأخواتها
         text = re.sub(r'\b(إن|أن|كأن|لكن|لعل|ليت)\s+(أبوك|أخوك|ذو|فوك)\b',
@@ -183,6 +262,8 @@ class ArabicGrammarGuard:
         text = self.fix_verbs_nasb_and_jazm(text)
         text = self.fix_gender_agreement(text)
         text = self.fix_prepositions_advanced(text)
         text = self.regex_rules_fallback(text)
         text = re.sub(r'\s+', ' ', text).strip()
         return text

         text = re.sub(r'\b([وف]?ل)([أ-ي]{4,})(ون|ان)\b', r'\1\2ين', text)
         return text
+    def fix_subject_verb_agreement(self, text):
+        """
+        Fix G1: When a plural/dual noun PRECEDES a singular verb (SVO order),
+        the verb must agree in number and gender.
+        Arabic rule: In VSO order, verb can be singular even with plural subject.
+        But in SVO order, subject-verb agreement is required.
+        """
+        tokens = simple_word_tokenize(text)
+        if len(tokens) < 2:
+            return text
+        disambig_tokens = self.mle.disambiguate(tokens)
+        corrected_tokens = list(tokens)
+        # Common plural nouns (masculine sound plural) ending in ون/ين/ات
+        # and their expected verb conjugation patterns
+        for i in range(len(disambig_tokens) - 1):
+            noun_info = disambig_tokens[i].analyses[0] if disambig_tokens[i].analyses else None
+            verb_info = disambig_tokens[i+1].analyses[0] if disambig_tokens[i+1].analyses else None
+            if not noun_info or not verb_info:
+                continue
+            noun_pos = noun_info.analysis.get('pos', 'unknown')
+            verb_pos = verb_info.analysis.get('pos', 'unknown')
+            noun_word = corrected_tokens[i]
+            verb_word = corrected_tokens[i+1]
+            # Only process noun → verb patterns (SVO order)
+            if noun_pos != 'noun' or verb_pos != 'verb':
+                continue
+            noun_num = noun_info.analysis.get('num', 's')
+            noun_gen = noun_info.analysis.get('gen', 'm')
+            verb_num = verb_info.analysis.get('num', 's')
+            # Skip if verb is already plural
+            if verb_num != 's':
+                continue
+            # Detect plural nouns
+            is_plural_masc = (noun_word.endswith('ون') or noun_word.endswith('ين')
+                             or noun_num == 'p')
+            is_plural_fem = (noun_word.endswith('ات') or
+                            (noun_gen == 'f' and noun_num == 'p'))
+            # Common broken plurals and collective nouns
+            KNOWN_PLURALS_MASC = {
+                'الطلاب', 'طلاب', 'الرجال', 'رجال', 'الأولاد', 'أولاد',
+                'الأطباء', 'أطباء', 'الاطباء', 'اطباء',
+                'العمال', 'عمال', 'الناس', 'الشباب', 'الأبناء',
+            }
+            KNOWN_PLURALS_FEM = {
+                'الطالبات', 'طالبات', 'النساء', 'نساء', 'البنات', 'بنات',
+                'المعلمات', 'معلمات', 'الأمهات', 'أمهات',
+            }
+            if noun_word in KNOWN_PLURALS_MASC:
+                is_plural_masc = True
+            if noun_word in KNOWN_PLURALS_FEM:
+                is_plural_fem = True
+            if not is_plural_masc and not is_plural_fem:
+                continue
+            # Fix the verb to agree with the plural subject
+            # Past tense singular → plural
+            if is_plural_fem:
+                # Feminine plural: ذهب → ذهبن
+                if not verb_word.endswith('ن') and not verb_word.endswith('نَ'):
+                    # Check if it's a past tense verb (typically 3-5 chars, no prefix)
+                    if len(verb_word) >= 3 and not verb_word.startswith('ي') and not verb_word.startswith('ت'):
+                        corrected_tokens[i+1] = verb_word + 'ن'
+            elif is_plural_masc:
+                # Masculine plural: ذهب → ذهبوا
+                if (not verb_word.endswith('وا') and not verb_word.endswith('ون')
+                        and not verb_word.endswith('ين')):
+                    if len(verb_word) >= 3 and not verb_word.startswith('ي') and not verb_word.startswith('ت'):
+                        corrected_tokens[i+1] = verb_word + 'وا'
+        return " ".join(corrected_tokens)
     def regex_rules_fallback(self, text):
         # إن وأخواتها
         text = re.sub(r'\b(إن|أن|كأن|لكن|لعل|ليت)\s+(أبوك|أخوك|ذو|فوك)\b',
         text = self.fix_verbs_nasb_and_jazm(text)
         text = self.fix_gender_agreement(text)
         text = self.fix_prepositions_advanced(text)
+        text = self.fix_subject_verb_agreement(text)  # Fix G1
         text = self.regex_rules_fallback(text)
         text = re.sub(r'\s+', ' ', text).strip()
         return text

src/nlp/punctuation/punctuation_service.py CHANGED Viewed

@@ -27,14 +27,108 @@ class PunctuationChecker:
     Arabic punctuation restoration pipeline:
       1. Preprocessing (remove diacritics)
       2. Model inference (chunked, windowed — 50 words/chunk)
-      3. Postprocessing (typographic cleanup)
     """
     def __init__(self, model, tokenizer, device):
         self.model = model
         self.tokenizer = tokenizer
         self.device = device
     def _predict_chunk(self, text_chunk: str) -> str:
         """Run model inference on a single chunk (max 128 tokens)."""
         from nlp.punctuation.punctuation_rules import arabic_preprocessing
@@ -114,6 +208,8 @@ class PunctuationChecker:
             for paragraph in paragraphs:
                 punctuated = self._fix_punctuation(paragraph)
                 cleaned = arabic_postprocessing(punctuated)
                 processed_paragraphs.append(cleaned)

     Arabic punctuation restoration pipeline:
       1. Preprocessing (remove diacritics)
       2. Model inference (chunked, windowed — 50 words/chunk)
+      3. Postprocessing: strip non-punctuation changes (Fix P1)
+      4. Typographic cleanup
     """
+    # Arabic and common punctuation marks
+    PUNCTUATION_CHARS = set('.,;:!?،؛؟!.:«»"\'()-–—…')
     def __init__(self, model, tokenizer, device):
         self.model = model
         self.tokenizer = tokenizer
         self.device = device
+    @staticmethod
+    def _strip_punct(word: str) -> str:
+        """Remove leading/trailing punctuation from a word."""
+        return word.strip('.,;:!?،؛؟!.:«»"\'()-–—…')
+    def _strip_non_punctuation_changes(self, original: str, punctuated: str) -> str:
+        """
+        Fix P1: The PuncAra model was fine-tuned on data with spelling/grammar
+        corrections. We only want punctuation marks from this stage.
+        Strategy: Align original and punctuated word-by-word. For each word,
+        if the model changed the BASE text (not just added/moved punctuation),
+        revert to the original word but keep any punctuation the model added.
+        """
+        orig_words = original.split()
+        punc_words = punctuated.split()
+        if not orig_words or not punc_words:
+            return punctuated
+        # Build result by aligning words
+        result = []
+        oi = 0  # index into orig_words
+        pi = 0  # index into punc_words
+        while oi < len(orig_words) and pi < len(punc_words):
+            o_word = orig_words[oi]
+            p_word = punc_words[pi]
+            o_base = self._strip_punct(o_word)
+            p_base = self._strip_punct(p_word)
+            if o_base == p_base:
+                # Same base word — keep punctuation changes from model
+                result.append(p_word)
+                oi += 1
+                pi += 1
+            elif self._is_only_punct_difference(o_word, p_word):
+                # Words differ only by punctuation — keep model's punctuation
+                result.append(p_word)
+                oi += 1
+                pi += 1
+            else:
+                # Model changed the actual word content (spelling/grammar/hamza)
+                # Revert to original word but transfer any NEW punctuation
+                punct_suffix = ''
+                punct_prefix = ''
+                for ch in reversed(p_word):
+                    if ch in self.PUNCTUATION_CHARS:
+                        punct_suffix = ch + punct_suffix
+                    else:
+                        break
+                for ch in p_word:
+                    if ch in self.PUNCTUATION_CHARS:
+                        punct_prefix += ch
+                    else:
+                        break
+                # Only add punctuation that wasn't already there
+                if not o_word.endswith(punct_suffix) and punct_suffix:
+                    result.append(o_word + punct_suffix)
+                elif punct_prefix and not o_word.startswith(punct_prefix):
+                    result.append(punct_prefix + o_word)
+                else:
+                    result.append(o_word)
+                oi += 1
+                pi += 1
+        # Append remaining original words
+        while oi < len(orig_words):
+            result.append(orig_words[oi])
+            oi += 1
+        # Append remaining punctuation-only words from model
+        while pi < len(punc_words):
+            p_word = punc_words[pi]
+            if all(ch in self.PUNCTUATION_CHARS or ch.isspace() for ch in p_word):
+                result.append(p_word)
+            pi += 1
+        return ' '.join(result)
+    @staticmethod
+    def _is_only_punct_difference(word1: str, word2: str) -> bool:
+        """Check if two words differ only by punctuation characters."""
+        PUNCT = set('.,;:!?،؛؟!.:«»"\'()-–—…')
+        base1 = ''.join(c for c in word1 if c not in PUNCT)
+        base2 = ''.join(c for c in word2 if c not in PUNCT)
+        return base1 == base2
     def _predict_chunk(self, text_chunk: str) -> str:
         """Run model inference on a single chunk (max 128 tokens)."""
         from nlp.punctuation.punctuation_rules import arabic_preprocessing
             for paragraph in paragraphs:
                 punctuated = self._fix_punctuation(paragraph)
+                # Fix P1: Strip spelling/grammar changes, keep only punctuation
+                punctuated = self._strip_non_punctuation_changes(paragraph, punctuated)
                 cleaned = arabic_postprocessing(punctuated)
                 processed_paragraphs.append(cleaned)

src/nlp/spelling/araspell_rules.py CHANGED Viewed

@@ -114,6 +114,50 @@ class AraSpellPostProcessor:
     # --- Hamza & Ta Marbuta Handling ---
     @staticmethod
     def fix_hamza_conservative(text: str) -> str:
         """Conservative Hamza normalization — only at word END, not middle."""
@@ -128,34 +172,62 @@ class AraSpellPostProcessor:
             result.append(word)
         return ' '.join(result)
     @staticmethod
     def fix_ha_ta_marbuta(text: str, vocab_manager=None) -> str:
         """
         Smart ه → ة fix at end of words.
-        Strategy: Only convert if the ة version is IV (in tokenizer vocab).
         """
         PROTECTED_ENDINGS = ['لله']
         words = text.split()
         result = []
         for word in words:
             if any(word.endswith(e) for e in PROTECTED_ENDINGS):
                 result.append(word)
                 continue
-            if len(word) >= 4 and word.endswith('ه'):
                 if word[-2] in AraSpellPostProcessor.ARABIC_CONSONANTS:
                     candidate_with_ta = word[:-1] + 'ة'
                     if vocab_manager:
                         ta_iv = vocab_manager.is_iv(candidate_with_ta)
                         ha_iv = vocab_manager.is_iv(word)
                         if ta_iv:
                             result.append(candidate_with_ta)
                             continue
                         elif ha_iv:
                             result.append(word)
                             continue
-                    else:
-                        result.append(candidate_with_ta)
-                        continue
             result.append(word)
         return ' '.join(result)
@@ -263,6 +335,7 @@ class AraSpellPostProcessor:
         text = AraSpellPostProcessor.remove_hallucinations(text)
         text = AraSpellPostProcessor.unified_collapse_repeated(text)
         text = AraSpellPostProcessor.fix_hamza_conservative(text)
         text = AraSpellPostProcessor.fix_ha_ta_marbuta(text, vocab_manager=vocab_manager)
         text = AraSpellPostProcessor.remove_word_repetition_with_wa(text)
         text = AraSpellPostProcessor.remove_duplicate_words(text)
@@ -588,6 +661,15 @@ class WordAligner:
         if in_iv and not out_iv:
             return input_word
         if in_iv and out_iv:
             return input_word
         if len(input_word) == len(output_word) and len(input_word) >= 3:
             for i in range(len(input_word)):

     # --- Hamza & Ta Marbuta Handling ---
+    # Exact-match lookup table for common hamza errors: each key is a spelling without its
+    # hamza/madda, each value the corrected spelling; fix_common_hamza looks up whole words only.
+    HAMZA_WHITELIST = {
+        'الي': 'إلى', 'الى': 'إلى',
+        'انت': 'أنت', 'انتم': 'أنتم', 'انتي': 'أنتِ',  # last value ends with a kasra diacritic
+        'انتو': 'أنتم', 'انتن': 'أنتن',
+        'انا': 'أنا',
+        'امس': 'أمس',
+        'لان': 'لأن', 'لانه': 'لأنه', 'لانها': 'لأنها',
+        'لانهم': 'لأنهم', 'لانك': 'لأنك',
+        'اذا': 'إذا', 'اذ': 'إذ',
+        'اي': 'أي', 'اين': 'أين',
+        'او': 'أو',
+        'اما': 'أما',  # NOTE(review): اما can also be a misspelling of إما — confirm this default
+        'ان': 'أن', 'انه': 'أنه', 'انها': 'أنها', 'انهم': 'أنهم',  # NOTE(review): ان can also stand for إن — confirm
+        'اخر': 'آخر', 'اخرى': 'أخرى',
+        'الان': 'الآن',
+        'اول': 'أول', 'اولى': 'أولى',
+        'اصبح': 'أصبح', 'اصبحت': 'أصبحت',
+        'اكثر': 'أكثر', 'اقل': 'أقل',
+        'اعلى': 'أعلى', 'ادنى': 'أدنى',
+        'اسرع': 'أسرع', 'ابطا': 'أبطأ',
+        'اكبر': 'أكبر', 'اصغر': 'أصغر',
+        'احسن': 'أحسن', 'اسوا': 'أسوأ',
+        'امام': 'أمام',
+        'اثناء': 'أثناء',
+        'ايضا': 'أيضاً', 'ايض': 'أيضاً',  # both values carry a tanween diacritic
+        'اساسي': 'أساسي', 'اساسية': 'أساسية',
+        'اخي': 'أخي', 'اخت': 'أخت', 'اخو': 'أخو',
+        'ابي': 'أبي', 'اب': 'أب', 'ابو': 'أبو',
+        'اهل': 'أهل',
+        'اطفال': 'أطفال',
+        'اصدقاء': 'أصدقاء', 'اصدقائي': 'أصدقائي',
+        'اعتقد': 'أعتقد', 'اريد': 'أريد', 'احب': 'أحب',
+        'اعرف': 'أعرف', 'اعلم': 'أعلم',
+        'اخذ': 'أخذ', 'اكل': 'أكل',
+        'الايام': 'الأيام',
+        'الاطفال': 'الأطفال',
+        'الاسعار': 'الأسعار',
+        'الاولى': 'الأولى',
+        'الاخير': 'الأخير', 'الاخيرة': 'الأخيرة',
+        'واصدقائي': 'وأصدقائي',
+    }
     @staticmethod
     def fix_hamza_conservative(text: str) -> str:
         """Conservative Hamza normalization — only at word END, not middle."""
             result.append(word)
         return ' '.join(result)
+    @staticmethod
+    def fix_common_hamza(text: str) -> str:
+        """
+        Fix common hamza placement errors using a whitelist.
+        These are the most frequent informal Arabic spelling mistakes.
+        """
+        words = text.split()
+        result = []
+        for word in words:
+            # Check exact match first
+            if word in AraSpellPostProcessor.HAMZA_WHITELIST:
+                result.append(AraSpellPostProcessor.HAMZA_WHITELIST[word])
+            else:
+                result.append(word)
+        return ' '.join(result)
     @staticmethod
     def fix_ha_ta_marbuta(text: str, vocab_manager=None) -> str:
         """
         Smart ه → ة fix at end of words.
+        Strategy: Always prefer ة when the previous char is a consonant,
+        UNLESS the ه form is specifically a known word and the ة form is NOT.
         """
         PROTECTED_ENDINGS = ['لله']
+        # Words that genuinely end in ه (not ة)
+        PROTECTED_HA_WORDS = {
+            'الله', 'لله', 'فيه', 'عليه', 'منه', 'به', 'له', 'إليه',
+            'وجه', 'نزه', 'سفه', 'فقه', 'نبه', 'شبه', 'مكره', 'تنبه',
+            'اتجه', 'توجه', 'تشابه',
+        }
         words = text.split()
         result = []
         for word in words:
             if any(word.endswith(e) for e in PROTECTED_ENDINGS):
                 result.append(word)
                 continue
+            if word in PROTECTED_HA_WORDS:
+                result.append(word)
+                continue
+            if len(word) >= 3 and word.endswith('ه'):
                 if word[-2] in AraSpellPostProcessor.ARABIC_CONSONANTS:
                     candidate_with_ta = word[:-1] + 'ة'
+                    # Default: prefer ة (correct Arabic orthography for feminine nouns)
                     if vocab_manager:
                         ta_iv = vocab_manager.is_iv(candidate_with_ta)
                         ha_iv = vocab_manager.is_iv(word)
                         if ta_iv:
+                            # Always prefer ة when it's a valid word
                             result.append(candidate_with_ta)
                             continue
                         elif ha_iv:
                             result.append(word)
                             continue
+                    # No vocab manager — default to ة
+                    result.append(candidate_with_ta)
+                    continue
             result.append(word)
         return ' '.join(result)
         text = AraSpellPostProcessor.remove_hallucinations(text)
         text = AraSpellPostProcessor.unified_collapse_repeated(text)
         text = AraSpellPostProcessor.fix_hamza_conservative(text)
+        text = AraSpellPostProcessor.fix_common_hamza(text)  # Fix S3: hamza whitelist
         text = AraSpellPostProcessor.fix_ha_ta_marbuta(text, vocab_manager=vocab_manager)
         text = AraSpellPostProcessor.remove_word_repetition_with_wa(text)
         text = AraSpellPostProcessor.remove_duplicate_words(text)
         if in_iv and not out_iv:
             return input_word
         if in_iv and out_iv:
+            # Fix S1: When only difference is ه→ة at word end, prefer ة
+            # (correct Arabic orthography — ة is the standard feminine ending)
+            if (input_word.endswith('ه') and output_word.endswith('ة')
+                    and input_word[:-1] == output_word[:-1]):
+                return output_word
+            # Fix S1: Also handle ة→ه (don't regress a correct ة to ه)
+            if (input_word.endswith('ة') and output_word.endswith('ه')
+                    and input_word[:-1] == output_word[:-1]):
+                return input_word
             return input_word
         if len(input_word) == len(output_word) and len(input_word) >= 3:
             for i in range(len(input_word)):