Spaces:

bayan10
/

bayan-api

Running

youssefreda9 committed 9 days ago

Commit

4608bcd

1 Parent(s): 32a135f

FIX-44: OOV cleanup pass between spelling and grammar stages

NEW PIPELINE STEP after spelling, before grammar:
1. Trailing و removal (from legacy AraSpell):
- المصنعو→المصنع, الماهرينوومن→الماهرينوومن
- Catches PC004, PC008, PC010 benchmark failures

2. Edit-distance-1 OOV→IV correction:
- For remaining OOV words, find closest IV word in BERT vocab
- Only replaces when edit-1 candidate exists and first letter matches
- Catches: صممو→صمموا (PC001), حضرو→حضروا (PC042)

Also adds contextual_corrector.py module (MLM-based validation).
Tests: 39 passing.

Files changed (2) hide show

src/app.py +97 -0
src/nlp/spelling/contextual_corrector.py +311 -0

src/app.py CHANGED Viewed

@@ -1967,6 +1967,103 @@ def analyze_text():
                 logger.error(traceback.format_exc())
                 timing_ms['spelling_error'] = f"{type(e).__name__}: {str(e)[:200]}"
         # ── FIX-07: Religious text already detected above (before spelling) ──
         # _is_religious_text was set earlier to skip ALL stages for sacred text

                 logger.error(traceback.format_exc())
                 timing_ms['spelling_error'] = f"{type(e).__name__}: {str(e)[:200]}"
+        # ── FIX-44: OOV Cleanup Pass (between spelling and grammar) ──
+        # After spelling corrections, some OOV words remain because:
+        # 1. The model didn't correct them (missed)
+        # 2. Our guards blocked a bad correction (but word is still OOV)
+        # 3. Trailing و artifacts from model output
+        #
+        # For each remaining OOV word, try to find the closest IV word
+        # using edit-distance-1 candidates from BERT vocabulary.
+        if not _is_religious_text:
+          try:
+            from nlp.spelling.araspell_service import get_spelling_model
+            _oov_checker = get_spelling_model()
+            _oov_text = ctx.current_text
+            _oov_words = _oov_text.split()
+            _oov_changed = False
+            _oov_result = []
+            for _ow_idx, _ow in enumerate(_oov_words):
+                # Skip short words (prepositions etc.)
+                if len(_ow) <= 2:
+                    _oov_result.append(_ow)
+                    continue
+                # Strip trailing punctuation for IV check
+                _ow_clean = _ow.rstrip('.،؛؟!?!')
+                # Skip if already IV
+                if _oov_checker.vocab_manager.is_iv(_ow_clean):
+                    _oov_result.append(_ow)
+                    continue
+                # ── Trailing و removal (from legacy AraSpell L263-267) ──
+                # الماضيةو → الماضية, المصنعو → المصنع, الدروسو → الدروس
+                if (len(_ow_clean) > 4 and _ow_clean.endswith('و')
+                        and _ow_clean[-2] in 'ةهاأإآءين'):
+                    _wo_cand = _ow_clean[:-1]
+                    if _oov_checker.vocab_manager.is_iv(_wo_cand):
+                        _punct_suffix = _ow[len(_ow_clean):]  # preserve punctuation
+                        logger.info(
+                            f"[OOV-CLEANUP] Trailing و fix: '{_ow}'→'{_wo_cand}{_punct_suffix}'"
+                        )
+                        _oov_result.append(_wo_cand + _punct_suffix)
+                        _oov_changed = True
+                        # Create a patch for the UI
+                        _ow_pos = sum(len(w) + 1 for w in _oov_words[:_ow_idx])
+                        if _ow_pos + len(_ow) <= len(_oov_text):
+                            ctx.add_patch(
+                                'spelling', _ow_pos, _ow_pos + len(_ow),
+                                _wo_cand + _punct_suffix, confidence=0.75,
+                            )
+                        continue
+                # ── Edit-distance-1 OOV→IV correction ──
+                # Generate all edit-1 candidates and filter to IV words
+                try:
+                    _ed1_candidates = _oov_checker.edit_corrector.known(
+                        _oov_checker.edit_corrector.edits1(_ow_clean)
+                    )
+                    if _ed1_candidates:
+                        # Pick best: lowest vocab rank (most frequent)
+                        _best_cand = min(
+                            _ed1_candidates,
+                            key=lambda w: _oov_checker.vocab_manager.get_frequency_rank(w)
+                        )
+                        # Safety: don't change first letter (same guard as FIX-42b)
+                        if _best_cand[0] == _ow_clean[0] or (
+                            _best_cand[0] in 'أإآاء' and _ow_clean[0] in 'أإآاء'
+                        ):
+                            _punct_suffix = _ow[len(_ow_clean):]
+                            logger.info(
+                                f"[OOV-CLEANUP] Edit-1 fix: '{_ow}'→'{_best_cand}{_punct_suffix}'"
+                            )
+                            _oov_result.append(_best_cand + _punct_suffix)
+                            _oov_changed = True
+                            _ow_pos = sum(len(w) + 1 for w in _oov_words[:_ow_idx])
+                            if _ow_pos + len(_ow) <= len(_oov_text):
+                                ctx.add_patch(
+                                    'spelling', _ow_pos, _ow_pos + len(_ow),
+                                    _best_cand + _punct_suffix, confidence=0.65,
+                                )
+                            continue
+                except Exception:
+                    pass  # Edit-distance fallback is best-effort
+                _oov_result.append(_ow)
+            if _oov_changed:
+                _oov_new_text = ' '.join(_oov_result)
+                logger.info(f"[OOV-CLEANUP] Applied OOV fixes: '{_oov_text[:80]}' → '{_oov_new_text[:80]}'")
+                ctx.mutate_text(_oov_new_text, OffsetMapper)
+                current_text = ctx.current_text
+          except Exception as e:
+            logger.warning(f"[OOV-CLEANUP] Failed: {type(e).__name__}: {e}")
         # ── FIX-07: Religious text already detected above (before spelling) ──
         # _is_religious_text was set earlier to skip ALL stages for sacred text

src/nlp/spelling/contextual_corrector.py ADDED Viewed

	@@ -0,0 +1,311 @@

+# ContextualCorrector — MLM-based contextual validation for spelling corrections
+# Adapted from legacy AraSpell ContextualCorrector.
+#
+# Purpose: After the spelling model produces corrections, this module validates
+# each OOV word by masking it and asking BERT what word should go there.
+# If BERT's top prediction is very different from the correction, the
+# original word is kept (the model hallucinated).
+#
+# Usage in pipeline: Called AFTER spelling correction, BEFORE grammar.
+# Only processes OOV words (never touches IV words).
+import logging
+import torch
+from typing import List, Tuple, Optional, Dict
+logger = logging.getLogger(__name__)
+# Singleton instance
+_instance = None
+_loading = False
+class ContextualCorrector:
+    """MLM-based contextual validation for spelling corrections.
+    Uses a masked language model to double-check words that the spelling
+    stage changed. For each changed word (skipping corrections that the
+    supplied vocab_manager reports as IV):
+    1. Masks the word and asks the model how likely the correction is there
+    2. If the model clearly prefers the original word, reverts to it
+    3. Otherwise may substitute a model prediction that is close to the original
+    """
+    def __init__(self, model_name: str = 'aubmindlab/bert-base-arabertv02'):
+        """Load the tokenizer and masked-LM weights for ``model_name``.
+        Args:
+            model_name: Hugging Face model id passed to ``from_pretrained``.
+        """
+        from transformers import AutoTokenizer, AutoModelForMaskedLM
+        logger.info(f"[MLM] Loading contextual corrector: {model_name}")
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.model = AutoModelForMaskedLM.from_pretrained(model_name)
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self.model = self.model.to(self.device)
+        self.model.eval()
+        # Score cache keyed by "<masked sentence>|<word>", bounded by _cache_max
+        self._cache: Dict[str, float] = {}
+        self._cache_max = 5000
+        # Vocab for candidate filtering
+        self.vocab = self.tokenizer.get_vocab()
+        logger.info(f"[MLM] Contextual corrector loaded on {self.device}")
+    def _mask_word(self, text: str, position: int) -> Optional[str]:
+        """Return ``text`` with the word at ``position`` replaced by the mask token.
+        Returns None when ``position`` is past the last whitespace-separated word.
+        """
+        words = text.split()
+        if position >= len(words):
+            return None
+        # Use the tokenizer's own mask token so non-default models still work
+        words[position] = self.tokenizer.mask_token
+        return ' '.join(words)
+    def _mask_probs(self, masked_text: str) -> Optional[torch.Tensor]:
+        """Run the model on ``masked_text`` and return the softmax at the mask.
+        Returns None when no mask token is present in the encoded input
+        (for example when truncation to 128 tokens cut it off).
+        """
+        inputs = self.tokenizer(
+            masked_text, return_tensors='pt',
+            padding=True, truncation=True, max_length=128
+        )
+        inputs = {k: v.to(self.device) for k, v in inputs.items()}
+        with torch.no_grad():
+            outputs = self.model(**inputs)
+        mask_idx = (inputs['input_ids'] == self.tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
+        if len(mask_idx) == 0:
+            return None
+        return torch.softmax(outputs.logits[0, mask_idx[0], :], dim=0)
+    def score_word_in_context(self, text: str, position: int, word: str) -> float:
+        """Score how well a word fits in context using the masked LM.
+        Only the probability of the word's FIRST sub-token is used, so the
+        score for multi-token words is an approximation.
+        Args:
+            text: Full sentence
+            position: Word index (0-based) in the sentence
+            word: The word to score
+        Returns:
+            Probability score (0.0 to 1.0) — higher = better fit.
+            0.0 when the position is out of range, the mask is lost to
+            truncation, the word encodes to nothing, or inference fails.
+        """
+        masked_text = self._mask_word(text, position)
+        if masked_text is None:
+            return 0.0
+        # Key on the complete masked sentence: a truncated prefix would make
+        # different long sentences share (and return) each other's scores.
+        cache_key = f"{masked_text}|{word}"
+        cached = self._cache.get(cache_key)
+        if cached is not None:
+            return cached
+        try:
+            probs = self._mask_probs(masked_text)
+            if probs is None:
+                return 0.0
+            word_tokens = self.tokenizer.encode(word, add_special_tokens=False)
+            if not word_tokens:
+                return 0.0
+            score = probs[word_tokens[0]].item()
+        except Exception as e:
+            logger.warning(f"[MLM] Score error for '{word}': {e}")
+            # Do not cache failures: a transient error must not pin 0.0 forever
+            return 0.0
+        # Cache management
+        if len(self._cache) >= self._cache_max:
+            # Remove oldest 20% of entries (dicts preserve insertion order)
+            for stale_key in list(self._cache)[:self._cache_max // 5]:
+                del self._cache[stale_key]
+        self._cache[cache_key] = score
+        return score
+    def validate_corrections(
+        self,
+        original_text: str,
+        corrected_text: str,
+        vocab_manager=None,
+        confidence_threshold: float = 0.001,
+        min_pred_score: float = 0.12,
+        similarity_threshold: float = 0.90,
+    ) -> str:
+        """Validate spelling corrections using MLM context.
+        For each word that changed between original and corrected:
+        - If vocab_manager reports the correction as IV: keep it untouched
+        - If the correction scores above confidence_threshold: keep it
+        - If the original scores more than 10x better (and above 0.01): revert
+        - Otherwise try the model's top-5 predictions and use the first one
+          that is IV, similar enough to the original, and a big improvement
+          (score above 0.2 and more than 50x the correction's score)
+        Args:
+            original_text: Text before spelling correction
+            corrected_text: Text after spelling correction
+            vocab_manager: Object exposing is_iv(word); when None, no IV
+                filtering is applied
+            confidence_threshold: Min score to keep a correction without checking
+            min_pred_score: Min score for a replacement candidate
+            similarity_threshold: Min similarity (Levenshtein) for replacements
+        Returns:
+            Validated text joined with single spaces; corrected_text unchanged
+            when the two texts do not have the same number of words
+        """
+        orig_words = original_text.split()
+        corr_words = corrected_text.split()
+        # Only process when word counts match (1:1 mapping)
+        if len(orig_words) != len(corr_words):
+            return corrected_text
+        has_vocab = vocab_manager is not None
+        result_words = corr_words.copy()
+        changes_made = 0
+        for i, (orig_w, corr_w) in enumerate(zip(orig_words, corr_words)):
+            # Skip unchanged words
+            if orig_w == corr_w:
+                continue
+            # Never touch IV words in correction
+            if has_vocab and vocab_manager.is_iv(corr_w):
+                continue
+            # Score the correction in context
+            corr_score = self.score_word_in_context(corrected_text, i, corr_w)
+            # If correction has decent confidence, keep it
+            if corr_score > confidence_threshold:
+                continue
+            # Score the original word in the corrected context
+            orig_score = self.score_word_in_context(corrected_text, i, orig_w)
+            # If original scores better, revert
+            if orig_score > corr_score * 10 and orig_score > 0.01:
+                logger.info(
+                    f"[MLM] Reverting hallucination: '{corr_w}'→'{orig_w}' "
+                    f"(corr_score={corr_score:.4f}, orig_score={orig_score:.4f})"
+                )
+                result_words[i] = orig_w
+                changes_made += 1
+                continue
+            # Try the model's own top predictions as alternatives
+            predictions = self._predict_top_k(corrected_text, i, top_k=5)
+            for pred_word, pred_score in predictions:
+                if pred_word == corr_w or pred_word == orig_w:
+                    continue
+                # Must be IV
+                if has_vocab and not vocab_manager.is_iv(pred_word):
+                    continue
+                # Must be similar to the original (not a random word)
+                similarity = self._similarity(orig_w, pred_word)
+                if similarity < similarity_threshold:
+                    continue
+                # Must have strong confidence
+                if pred_score < min_pred_score:
+                    continue
+                # Must be a big improvement
+                if pred_score > corr_score * 50 and pred_score > 0.2:
+                    logger.info(
+                        f"[MLM] Replacing with BERT prediction: '{corr_w}'→'{pred_word}' "
+                        f"(pred_score={pred_score:.4f}, corr_score={corr_score:.4f})"
+                    )
+                    result_words[i] = pred_word
+                    changes_made += 1
+                    break
+        if changes_made:
+            logger.info(f"[MLM] Contextual validation: {changes_made} corrections adjusted")
+        return ' '.join(result_words)
+    def _predict_top_k(self, text: str, position: int, top_k: int = 5) -> List[Tuple[str, float]]:
+        """Predict top-k words for a masked position.
+        Returns (token, probability) pairs in descending probability order,
+        with subword ("##") and special tokens removed, so fewer than top_k
+        entries may come back. Returns [] on any failure.
+        """
+        masked_text = self._mask_word(text, position)
+        if masked_text is None:
+            return []
+        try:
+            probs = self._mask_probs(masked_text)
+            if probs is None:
+                return []
+            # Clamp k so an oversized request cannot make torch.topk raise
+            k = max(0, min(top_k, probs.size(0)))
+            top_scores, top_ids = torch.topk(probs, k, sorted=True)
+            special_tokens = set(self.tokenizer.all_special_tokens)
+            results = []
+            for token_id, score in zip(top_ids.tolist(), top_scores.tolist()):
+                token = self.tokenizer.decode([token_id]).strip()
+                # Skip subword tokens and special tokens
+                if not token.startswith("##") and token not in special_tokens:
+                    results.append((token, score))
+            return results
+        except Exception as e:
+            logger.warning(f"[MLM] Prediction error: {e}")
+            return []
+    @staticmethod
+    def _similarity(a: str, b: str) -> float:
+        """Calculate normalized Levenshtein similarity between two strings.
+        Returns 1.0 for identical strings and 0.0 when either string is empty.
+        """
+        if not a or not b:
+            return 0.0
+        # Both strings are non-empty here, so max_len is at least 1
+        m, n = len(a), len(b)
+        max_len = max(m, n)
+        # Inline single-row Levenshtein to avoid extra dependency
+        dp = list(range(n + 1))
+        for i in range(1, m + 1):
+            prev = dp[0]  # value diagonally up-left of the current cell
+            dp[0] = i
+            for j in range(1, n + 1):
+                temp = dp[j]
+                if a[i-1] == b[j-1]:
+                    dp[j] = prev
+                else:
+                    dp[j] = 1 + min(prev, dp[j], dp[j-1])
+                prev = temp
+        dist = dp[n]
+        return 1.0 - (dist / max_len)
+def get_contextual_corrector() -> Optional[ContextualCorrector]:
+    """Return the shared ContextualCorrector, building it on first use.
+    Returns None when construction fails or when a load is already in
+    progress (the _loading flag guards against re-entrant loading).
+    """
+    global _instance, _loading
+    if _instance is None and not _loading:
+        _loading = True
+        try:
+            _instance = ContextualCorrector()
+        except Exception as e:
+            # Graceful degradation: leave _instance as None and report it
+            logger.warning(f"[MLM] Failed to load contextual corrector: {e}")
+        finally:
+            _loading = False
+    # Still None if loading failed or another load was in flight
+    return _instance
+def is_loaded() -> bool:
+    """Report whether the singleton ContextualCorrector has been created."""
+    loaded = _instance is not None
+    return loaded