# PuncAra — Arabic Punctuation Restoration Rules
# Extracted from PuncAra.py — preprocessing + postprocessing + chunking logic.
# All classes are imported by punctuation_service.py.
#
# MERGED: Best of V1 + V2
#   - V2: Threshold >= 1 (not 5) — allows terminal punct on any real text
#   - V2: Fallback to `original` word count when `full_text` is empty
#   - V1: Softened exclamation guard — blocks ؟/! on SHORT texts (< 3 words)
#         without cue words, but allows on longer sentences

import re
import logging

logger = logging.getLogger(__name__)


def arabic_preprocessing(text: str) -> str:
    """Remove Arabic diacritics to normalize input for the model.

    Strips every code point in U+064B..U+0652 and trims surrounding
    whitespace.
    """
    arabic_diacritics = re.compile(r'[\u064B-\u0652]')
    return re.sub(arabic_diacritics, '', text).strip()


def arabic_postprocessing(text: str) -> str:
    """
    Typographic cleanup and punctuation normalization after model inference.

    Handles: bracket spacing, duplicate marks, chunk-join artifacts, colon and
    semicolon context fixes, and spacing around punctuation.

    Args:
        text: Punctuated text produced by the model.

    Returns:
        The cleaned text. Falsy input (empty string / None) is returned as-is.
    """
    if not text:
        return text

    # 1. Protect numbers/fractions/time from incorrect conversion.
    #    Commas and colons between digits are swapped for placeholders so the
    #    Arabization and colon rules below cannot touch them.
    text = re.sub(r'(?<=\d),(?=\d)', '٪TEMP_COMMA٪', text)
    text = re.sub(r'(?<=\d):(?=\d)', '٪TEMP_COLON٪', text)

    # 2. Arabize typographic marks
    text = text.replace(',', '،').replace(';', '؛').replace('?', '؟')

    # 3. Fix internal spacing for brackets and Arabic quotes
    text = re.sub(r'\(\s+', '(', text)
    text = re.sub(r'\s+\)', ')', text)
    text = re.sub(r'\[\s+', '[', text)
    text = re.sub(r'\s+\]', ']', text)
    text = re.sub(r'«\s+', '«', text)
    text = re.sub(r'\s+»', '»', text)

    # 4. Remove repeated emotional marks (except ellipsis)
    text = re.sub(r'([،؛:!؟])\1+', r'\1', text)
    text = re.sub(r'\.{4,}', '...', text)

    # 5. Fix chunk-join contradictions
    text = re.sub(r'[،؛:]+([.!؟])', r'\1', text)
    text = re.sub(r'،؛|؛،', '؛', text)
    text = re.sub(r'([!؟])\.', r'\1', text)

    # 5.5 Syntactic context fixes for model hallucinations
    # Remove colons/semicolons before relative pronouns
    text = re.sub(r'[؛:]\s*(التي|الذي|الذين|اللتان|اللذان|اللاتي|اللواتي)', r' \1', text)

    # 5.5.1 Normalize colons around "speech verb + speaker name".
    # Only applies if a colon is actually present on the verb or the name.
    def _fix_misplaced(m):
        verb, col1, name, col2 = m.groups()
        if col1 == ':':
            # Colon on the verb: keep it there, normalize the whitespace to a
            # single space and drop any second colon after the name.
            # NOTE(review): the original comment gave the example
            # "قال: المعلم -> قال المعلم:" (colon moved after the name), which
            # this branch does not do — confirm the intended behavior.
            return f"{verb}: {name}"
        if col2 == ':':
            return f"{verb} {name}:"
        return m.group(0)

    text = re.sub(
        r'\b([وفلس]?(?:قال|يقول|قالت|تقول|أجاب|أجابت|سأل|سألت|أخبر|أخبرت|صرح|صرحت|أضاف|أضافت|أردف|أردفت))(:?)\s+(ال[أ-ي]+|أحمد|محمد|محمود|علي|عمر|خالد|فاطمة|مريم|عائشة|خديجة)\b(:?)',
        _fix_misplaced,
        text
    )

    # 5.5.2 Smart Colon Guard (looks up to 6 words back)
    _ALLOWED_COLON_CUES = r'^[وفلس]?(قال|يقول|قالت|تقول|أجاب|أجابت|سأل|سألت|أخبر|أخبرت|صرح|صرحت|أضاف|أضافت|أردف|أردفت|وضح|وضحت|أوضح|أوضحت|رد|ردت|التالي|الآتي|مثال|ملاحظة|تنبيه|تحذير|قائلا|قائلة|اسم|العمر|تاريخ|رقم|عاجل|الآتية|التالية)$'

    def _colon_guard(match):
        context = match.group(1)
        words = re.findall(r'[\u0600-\u06FFa-zA-Z]+', context)
        if not words:
            return match.group(0)

        prev_word = words[-1]
        last_6_words = words[-6:]

        # A cue word among the last six words justifies the colon.
        if any(re.match(_ALLOWED_COLON_CUES, w) for w in last_6_words):
            return match.group(0)

        if prev_word.startswith(('ال', 'لل', 'بال', 'فال', 'وال', 'كال')):
            return match.group(0)  # Preserve the colon! Do not delete it.

        # No justification found: replace the colon with a space.
        return context + ' '

    text = re.sub(r'([^:]+)(:)', _colon_guard, text)

    # Remove colons after specific non-speech verbs (fallback for verbs without ال)
    text = re.sub(r'\b(يقدر|يستطيع|يمكن|يجب|ينبغي|يعتبر|يعد|يرى|يعتقد)\s*:', r'\1 ', text)

    # Replace semicolon with comma if followed by "و" (and) or similar
    # conjunctions, as semicolon is for separate clauses
    text = re.sub(r'؛\s*(و|ف|ثم|أو|أم|بل)\b', r'، \1', text)

    # 6. Remove stray leading punctuation
    text = re.sub(r'^[،؛:!؟. \t]+', '', text)

    # 7. Ensure single space after punctuation before text.
    #    A period that sits between two digits is a decimal point ("3.14") and
    #    must not be split into "3. 14", so it is excluded here.
    text = re.sub(r'([،؛:!؟]|(?<!\d)\.|\.(?!\d))(?=\S)', r'\1 ', text)

    # 8. Restore protected numbers
    text = text.replace('٪TEMP_COMMA٪', ',').replace('٪TEMP_COLON٪', ':')

    # 9. Attach punctuation to preceding word
    text = re.sub(r'\s+([،؛:!؟.])', r'\1', text)

    # 10. Collapse horizontal spaces only
    text = re.sub(r'[ \t]+', ' ', text).strip()

    return text


# ══════════════════════════════════════════════════════════════════════════════
# PUNCTUATION SAFETY LAYER — Pipeline Hardening v3.4 (Merged V1+V2)
# ══════════════════════════════════════════════════════════════════════════════

ARABIC_PUNCT_CHARS = set('.,،؛؟!:;?!')
MAX_PUNCT_DELTA = 3
MAX_PUNCT_DELTA_SHORT = 1  # Stricter cap for short texts (≤2 words)
MAX_PUNCT_RATIO = 0.5  # max punctuation delta per word (multi-word diffs)

# Exclamation/question cue words (from V1 FIX-29, used in softened guard)
_EXCL_CUES = {'يا', 'ما', 'كم', 'لا', 'هل', 'أين', 'متى', 'كيف', 'لماذا', 'ماذا', 'أي', 'لعل', 'ليت'}

# Patterns shared by the safety-layer helpers below.
_STRUCTURED_DATA_RE = re.compile(r'[a-zA-Z]|\{|\[|<|#|@|://')
_PUNCT_OR_SPACE_RE = re.compile(r'[.,،؛؟!:;?\s]')
_TRAILING_PUNCT_RE = re.compile(r'[.,،؛؟!:;?!]+$')
_WORD_RE = re.compile(r'[\u0600-\u06FFa-zA-Z]+')


def _normalize_for_comparison(text: str) -> str:
    """
    Normalize Arabic for safe comparison.

    Only removes diacritics (U+064B..U+0652), so a correction that merely
    lost its harakat still compares equal to the original.
    Does NOT fold hamza/ya/ta-marbuta to ensure we catch spelling regressions!
    """
    # Remove diacritics
    text = re.sub(r'[\u064B-\u0652]', '', text)
    return text


def _count_punct(text: str) -> int:
    """Count the characters of `text` that are in ARABIC_PUNCT_CHARS."""
    return sum(1 for c in text if c in ARABIC_PUNCT_CHARS)


def _is_at_text_end(diff: dict, full_text: str) -> bool:
    """Tell whether the diff touches the end of `full_text`.

    Without `full_text` the diff is assumed to be a standalone fragment and
    therefore "at the end". With `full_text`, the diff must carry an 'end'
    offset within two characters of the end of the text.
    """
    if not full_text:
        return True
    if 'end' in diff:
        return diff['end'] >= len(full_text) - 2
    return False


def _passes_structured_data_guard(original: str, correction: str,
                                  diff: dict, full_text: str) -> bool:
    """Protect structured data (English, URLs, emails, hashtags, code/JSON).

    Returns False when the original contains structured-data markers and the
    correction is anything other than a single terminal '.'/'؟' appended at
    the end of the text.
    """
    if not _STRUCTURED_DATA_RE.search(original):
        return True

    is_at_end = _is_at_text_end(diff, full_text)

    # Block mid-sentence punctuation additions (e.g. adding comma after English word)
    if _count_punct(correction) > _count_punct(original) and not is_at_end:
        logger.info(f"[PUNC-SAFETY] Blocked mid-sentence punctuation on structured data: '{original}' -> '{correction}'")
        return False

    if original == correction:
        return True

    # Block spacing corruptions in JSON/Code (e.g. {"name"} -> { "name" }):
    # only allow if the ONLY change is appending a terminal mark at the very end.
    is_terminal_append = (
        is_at_end
        and correction.endswith(('.', '؟'))
        and correction[:-1].rstrip() == original.rstrip()
    )
    if not is_terminal_append:
        logger.info(f"[PUNC-SAFETY] Blocked corruption of JSON/Code/URL: '{original}' -> '{correction}'")
        return False
    return True


def _passes_terminal_punct_guard(original: str, correction: str,
                                 diff: dict, full_text: str) -> bool:
    """Rule 0: vet diffs whose only change is added trailing punctuation.

    Returns True when the rule does not apply or the addition is allowed (the
    caller then continues with the remaining rules), and False when the
    addition must be rejected.
    """
    if not original.rstrip() or not correction.rstrip():
        return True

    # Word content (ignoring punctuation, whitespace, diacritics) must match.
    orig_alpha = _PUNCT_OR_SPACE_RE.sub('', original)
    corr_alpha = _PUNCT_OR_SPACE_RE.sub('', correction)
    if _normalize_for_comparison(orig_alpha) != _normalize_for_comparison(corr_alpha):
        return True

    # Only interested in diffs that ADD punctuation...
    if _count_punct(correction) <= _count_punct(original):
        return True

    # ...and only when everything before the trailing punctuation run is unchanged.
    orig_no_punct = _TRAILING_PUNCT_RE.sub('', original)
    corr_no_punct = _TRAILING_PUNCT_RE.sub('', correction)
    if (_normalize_for_comparison(orig_no_punct.replace(' ', ''))
            != _normalize_for_comparison(corr_no_punct.replace(' ', ''))):
        return True

    if not _is_at_text_end(diff, full_text):
        # Mid-sentence punctuation addition. This is safe to fall through to other rules.
        return True

    # This is a pure terminal-punctuation addition.
    # V2 FIX-30: Fall back to original when full_text is empty
    word_count_source = full_text if full_text else original
    full_word_count = len(_WORD_RE.findall(word_count_source))
    full_already_has_terminal = bool(
        re.search(r'[.،؛؟!?!][\s]*$', full_text)
    ) if full_text else False
    full_has_ellipsis = full_text.rstrip().endswith('...') if full_text else False

    # V2 FIX-30: Allow for 1+ words (not 5)
    if full_word_count < 1 or full_already_has_terminal or full_has_ellipsis:
        # Already has terminal punct or ends in ellipsis → REJECT
        logger.info(
            f"[PUNC-SAFETY] TerminalPunctuationGuard triggered: removing trailing punctuation "
            f"'{original}' → '{correction}'"
        )
        return False

    # ── Softened FIX-29 (Merged): Short-text ؟/! guard ──
    # For short texts (< 3 words), block ؟ and ! unless cue words are present.
    # Prevents "محمد" → "محمد؟" but allows "اليوم" → "اليوم." (period is safe).
    # For 3+ words, allow freely (V2 behavior).
    # The added marks are detected by comparing counts rather than by slicing
    # the correction at len(original): the two strings may differ in length
    # (e.g. the correction lost its diacritics), which would misalign a slice.
    adds_excl_or_question = any(
        correction.count(mark) > original.count(mark) for mark in ('!', '؟')
    )
    if full_word_count < 3 and adds_excl_or_question:
        text_to_scan = full_text if full_text else original
        if not any(w in _EXCL_CUES for w in text_to_scan.split()):
            logger.info(
                f"[PUNC-SAFETY] Blocked !/؟ on short text without cue: "
                f"'{original}' → '{correction}'"
            )
            return False

    logger.info(
        f"[PUNC-SAFETY] Allowed terminal punct for sentence "
        f"({full_word_count} words): "
        f"'{original}' → '{correction}'"
    )
    return True


def validate_punctuation_diff(diff: dict, full_text: str = '') -> bool:
    """
    Return True ONLY if the diff is a safe punctuation-only change.

    Args:
        diff: Mapping with 'original' and 'correction' strings and, optionally,
            an 'end' offset of the diff inside `full_text`.
        full_text: The whole text the diff belongs to. When empty, the diff is
            treated as a standalone fragment located at the end of the text.

    ALLOWED:
      - Inserting 1 punctuation mark (short text) or 1–3 (long text)
      - Replacing one punctuation mark with another
      - Adding terminal punctuation to any text (1+ words) that lacks it
      - Adding ؟/! to short texts (< 3 words) ONLY with cue words

    REJECTED:
      - Adding/deleting/duplicating Arabic words
      - Rewriting phrases
      - Excessive punctuation repetition (3+ consecutive identical)
      - Punctuation spam: delta/word_count > 0.5 (multi-word diffs)
      - Short text (≤2 words): delta > 1
      - Any diff: delta > MAX_PUNCT_DELTA
      - Adding terminal punctuation when text already ends with punct
      - Adding ؟/! to short texts without interrogative/exclamatory cues
      - Any change to text containing structured data (Latin letters, URLs,
        code/JSON markers) other than appending one terminal '.'/'؟' at the end
    """
    original = diff.get('original', '')
    correction = diff.get('correction', '')

    # ── Protect Structured Data (English, URLs, Emails, Hashtags, Code/JSON) ──
    if not _passes_structured_data_guard(original, correction, diff, full_text):
        return False

    # ── Rule 0 (FIX-01 + FIX-30 + Merged Guard): Terminal punctuation ──
    # PuncAra-v1 unconditionally adds . or ؟ to every sentence.
    # This rule catches the pattern: "word" → "word." / "word؟" / "word،"
    # where the ONLY change is appending terminal punctuation marks.
    # A passing result falls through to the remaining rules (no early accept).
    if not _passes_terminal_punct_guard(original, correction, diff, full_text):
        return False

    # ── Rule 1: Alphabetic content must be identical after normalization ──
    orig_alpha = _PUNCT_OR_SPACE_RE.sub('', original)
    corr_alpha = _PUNCT_OR_SPACE_RE.sub('', correction)
    if _normalize_for_comparison(orig_alpha) != _normalize_for_comparison(corr_alpha):
        return False

    # ── Rule 2: Reject excessive repetition (3+ consecutive identical) ──
    if re.search(r'([.,،؛؟!:;?])\1{2,}', correction):
        return False

    # ── Shared computation for Rules 3–5 ──
    punct_delta = max(0, _count_punct(correction) - _count_punct(original))
    word_count = len(_WORD_RE.findall(correction)) or 1

    # ── Rule 3: Short-text hybrid cap (≤2 words → max 1 mark added) ──
    if word_count <= 2 and punct_delta > MAX_PUNCT_DELTA_SHORT:
        return False

    # ── Rule 4: Ratio-based spam protection (multi-word diffs) ──
    if word_count > 2 and punct_delta / word_count > MAX_PUNCT_RATIO:
        return False

    # ── Rule 5: Absolute delta cap ──
    if punct_delta > MAX_PUNCT_DELTA:
        return False

    return True