bayan-api / src /nlp /punctuation /punctuation_rules.py
youssefreda9's picture
HF Deploy: Fix syntax error with smart quotes in popup.js
fe1e225
Raw
History Blame Contribute Delete
15.5 kB
# PuncAra — Arabic Punctuation Restoration Rules
# Extracted from PuncAra.py — preprocessing + postprocessing + chunking logic.
# All classes are imported by punctuation_service.py.
#
# MERGED: Best of V1 + V2
# - V2: Threshold >= 1 (not 5) — allows terminal punct on any real text
# - V2: Fallback to `original` word count when `full_text` is empty
# - V1: Softened exclamation guard — blocks ؟/! on SHORT texts (< 3 words)
# without cue words, but allows on longer sentences
import re
import logging
logger = logging.getLogger(__name__)
def arabic_preprocessing(text: str) -> str:
    """Strip Arabic diacritics from *text* and trim surrounding whitespace.

    Only code points U+064B through U+0652 are removed; every other
    character is passed through unchanged.
    """
    diacritics_pattern = re.compile(r'[\u064B-\u0652]')
    without_marks = diacritics_pattern.sub('', text)
    return without_marks.strip()
def arabic_postprocessing(text: str) -> str:
    """
    Typographic cleanup and punctuation normalization after model inference.

    The passes run in a fixed order and later passes rely on earlier ones:

    1. Separators between two digits (``,``, ``:`` and ``.``) are swapped for
       placeholders so that numbers, times and decimals survive the rewrite.
    2. ASCII ``, ; ?`` are replaced by their Arabic counterparts.
    3. Whitespace just inside ``()``, ``[]`` and ``«»`` is removed.
    4. Repeated marks are collapsed; runs of 4+ dots become ``...``.
    5. Contradictory mark sequences (e.g. ``،.``) are resolved, then a set of
       context rules removes colons/semicolons that are unlikely to be
       correct (before relative pronouns, after non-speech verbs, ...).
    6-10. Leading punctuation is dropped, spacing around punctuation is
       normalized, the protected separators are restored and horizontal
       whitespace is collapsed.

    Args:
        text: Punctuated text as produced by the model. Falsy values
            (``""``, ``None``) are returned unchanged.

    Returns:
        The cleaned-up text.
    """
    if not text:
        return text

    # 1. Protect numbers/fractions/time from incorrect conversion.
    #    The decimal point must be protected too: otherwise step 7 inserts a
    #    space after it and "3.14" comes out as "3. 14".
    text = re.sub(r'(?<=\d),(?=\d)', '٪TEMP_COMMA٪', text)
    text = re.sub(r'(?<=\d):(?=\d)', '٪TEMP_COLON٪', text)
    text = re.sub(r'(?<=\d)\.(?=\d)', '٪TEMP_DOT٪', text)

    # 2. Arabize typographic marks
    text = text.replace(',', '،').replace(';', '؛').replace('?', '؟')

    # 3. Fix internal spacing for brackets and Arabic quotes
    text = re.sub(r'\(\s+', '(', text)
    text = re.sub(r'\s+\)', ')', text)
    text = re.sub(r'\[\s+', '[', text)
    text = re.sub(r'\s+\]', ']', text)
    text = re.sub(r'«\s+', '«', text)
    text = re.sub(r'\s+»', '»', text)

    # 4. Remove repeated emotional marks (except ellipsis)
    text = re.sub(r'([،؛:!؟])\1+', r'\1', text)
    text = re.sub(r'\.{4,}', '...', text)

    # 5. Fix chunk-join contradictions
    text = re.sub(r'[،؛:]+([.!؟])', r'\1', text)
    text = re.sub(r'،؛|؛،', '؛', text)
    text = re.sub(r'([!؟])\.', r'\1', text)

    # 5.5 Syntactic context fixes for model hallucinations
    # Remove colons/semicolons before relative pronouns
    text = re.sub(r'[؛:]\s*(التي|الذي|الذين|اللتان|اللذان|اللاتي|اللواتي)', r' \1', text)

    # 5.5.1 Normalize "<speech verb> <name>" pairs to carry a single colon.
    # NOTE(review): despite the historical comment ("قال: المعلم -> قال المعلم:"),
    # this never moves a colon. A colon after the verb stays after the verb
    # (and a second one after the name is dropped); otherwise a colon after
    # the name is kept. Confirm which placement is actually intended.
    def _fix_misplaced(m):
        verb, col1, name, col2 = m.groups()
        if col1 == ':':
            return f"{verb}: {name}"
        if col2 == ':':
            return f"{verb} {name}:"
        return m.group(0)

    text = re.sub(
        r'\b([وفلس]?(?:قال|يقول|قالت|تقول|أجاب|أجابت|سأل|سألت|أخبر|أخبرت|صرح|صرحت|أضاف|أضافت|أردف|أردفت))(:?)\s+(ال[أ-ي]+|أحمد|محمد|محمود|علي|عمر|خالد|فاطمة|مريم|عائشة|خديجة)\b(:?)',
        _fix_misplaced, text
    )

    # 5.5.2 Smart Colon Guard (looks up to 6 words back)
    _ALLOWED_COLON_CUES = r'^[وفلس]?(قال|يقول|قالت|تقول|أجاب|أجابت|سأل|سألت|أخبر|أخبرت|صرح|صرحت|أضاف|أضافت|أردف|أردفت|وضح|وضحت|أوضح|أوضحت|رد|ردت|التالي|الآتي|مثال|ملاحظة|تنبيه|تحذير|قائلا|قائلة|اسم|العمر|تاريخ|رقم|عاجل|الآتية|التالية)$'

    def _colon_guard(match):
        # group(1) is everything since the previous colon (or text start).
        context = match.group(1)
        words = re.findall(r'[\u0600-\u06FFa-zA-Z]+', context)
        if not words:
            return match.group(0)
        prev_word = words[-1]
        last_6_words = words[-6:]
        # Keep the colon when a speech/listing cue appears in the window.
        if any(re.match(_ALLOWED_COLON_CUES, w) for w in last_6_words):
            return match.group(0)
        # Keep the colon when the preceding word starts with a definite-article prefix.
        if prev_word.startswith(('ال', 'لل', 'بال', 'فال', 'وال', 'كال')):
            return match.group(0)  # Preserve the colon! Do not delete it.
        # Otherwise drop the colon and leave a space in its place.
        return context + ' '

    text = re.sub(r'([^:]+)(:)', _colon_guard, text)

    # Remove colons after specific non-speech verbs (fallback for verbs without ال)
    text = re.sub(r'\b(يقدر|يستطيع|يمكن|يجب|ينبغي|يعتبر|يعد|يرى|يعتقد)\s*:', r'\1 ', text)

    # Replace semicolon with comma if followed by "و" (and) or similar
    # conjunctions, as semicolon is for separate clauses
    text = re.sub(r'؛\s*(و|ف|ثم|أو|أم|بل)\b', r'، \1', text)

    # 6. Remove stray leading punctuation
    text = re.sub(r'^[،؛:!؟. \t]+', '', text)

    # 7. Ensure single space after punctuation before text
    text = re.sub(r'([،؛:!؟.])(?=\S)', r'\1 ', text)

    # 8. Restore protected numbers
    text = (
        text.replace('٪TEMP_COMMA٪', ',')
        .replace('٪TEMP_COLON٪', ':')
        .replace('٪TEMP_DOT٪', '.')
    )

    # 9. Attach punctuation to preceding word
    text = re.sub(r'\s+([،؛:!؟.])', r'\1', text)

    # 10. Collapse horizontal spaces only
    text = re.sub(r'[ \t]+', ' ', text).strip()
    return text
# ══════════════════════════════════════════════════════════════════════════════
# PUNCTUATION SAFETY LAYER — Pipeline Hardening v3.4 (Merged V1+V2)
# ══════════════════════════════════════════════════════════════════════════════
# Every character the safety layer counts as punctuation (ASCII + Arabic).
ARABIC_PUNCT_CHARS = {'.', ',', '،', '؛', '؟', '!', ':', ';', '?'}

# Absolute cap on punctuation marks a single diff may add.
MAX_PUNCT_DELTA = 3
# Stricter cap for short texts (≤2 words).
MAX_PUNCT_DELTA_SHORT = 1
# Max punctuation delta per word (multi-word diffs).
MAX_PUNCT_RATIO = 0.5

# Exclamation/question cue words (from V1 FIX-29, used in softened guard)
_EXCL_CUES = {
    'يا', 'ما', 'كم', 'لا', 'هل', 'أين', 'متى',
    'كيف', 'لماذا', 'ماذا', 'أي', 'لعل', 'ليت',
}
def _normalize_for_comparison(text: str) -> str:
    """
    Return *text* with Arabic diacritics (U+064B–U+0652) removed.

    This is the only normalization applied before two strings are compared:
    hamza, ya and ta-marbuta variants are deliberately NOT folded, so a
    spelling change between original and correction is still detected.
    """
    return re.compile(r'[\u064B-\u0652]').sub('', text)
def _diff_touches_text_end(diff: dict, full_text: str) -> bool:
    """Return True when *diff* ends within 2 characters of the end of *full_text*.

    Without ``full_text`` there is no context, so the diff is treated as a
    standalone fragment (True). With ``full_text`` but no ``'end'`` key the
    position is unknown and the diff is treated as mid-sentence (False).
    """
    if not full_text:
        return True
    if 'end' in diff:
        return diff['end'] >= len(full_text) - 2
    return False


def validate_punctuation_diff(diff: dict, full_text: str = '') -> bool:
    """
    Return True ONLY if the diff is a safe punctuation-only change.

    Args:
        diff: Mapping with ``'original'`` and ``'correction'`` strings and,
            optionally, ``'end'`` (compared against ``len(full_text)`` to
            decide whether the diff sits at the end of the text).
        full_text: The full text the diff belongs to, or ``''`` when no
            context is available (the diff is then treated as standalone).

    ALLOWED:
      - Inserting 1 punctuation mark (short text) or 1–3 (long text)
      - Replacing one punctuation mark with another
      - Adding terminal punctuation to any text (1+ words) that lacks it
      - Adding ؟/?/! to short texts (< 3 words) ONLY with cue words

    REJECTED:
      - Adding/deleting/duplicating Arabic words
      - Rewriting phrases
      - Excessive punctuation repetition (3+ consecutive identical)
      - Punctuation spam: delta/word_count > 0.5 (multi-word diffs)
      - Short text (≤2 words): delta > 1
      - Any diff: delta > MAX_PUNCT_DELTA
      - Adding terminal punctuation when text already ends with punct
      - Adding ؟/?/! to short texts without interrogative/exclamatory cues
      - Any change to text containing Latin letters, ``{ [ < # @`` or
        ``://`` other than appending a single ``.``/``؟`` at the very end
    """
    # ``or ''`` keeps explicit None values from crashing the regex calls below.
    original = diff.get('original', '') or ''
    correction = diff.get('correction', '') or ''

    # Punctuation counts over the WHOLE strings (not just the tail); shared
    # by the structured-data guard, Rule 0 and Rules 3–5.
    orig_punct_count = sum(1 for c in original if c in ARABIC_PUNCT_CHARS)
    corr_punct_count = sum(1 for c in correction if c in ARABIC_PUNCT_CHARS)
    punct_added = corr_punct_count > orig_punct_count

    # ── Protect Structured Data (English, URLs, Emails, Hashtags, Code/JSON) ──
    # Block punctuation modifications near structured data unless it's a
    # valid terminal punctuation.
    if re.search(r'[a-zA-Z]|\{|\[|<|#|@|://', original):
        is_at_end = _diff_touches_text_end(diff, full_text)

        # Block mid-sentence punctuation additions (e.g. comma after English word)
        if punct_added and not is_at_end:
            logger.info(f"[PUNC-SAFETY] Blocked mid-sentence punctuation on structured data: '{original}' -> '{correction}'")
            return False

        # Block spacing corruptions in JSON/Code (e.g. {"name"} -> { "name" }):
        # only allow if the ONLY change is appending a terminal mark at the very end.
        only_terminal_appended = (
            is_at_end
            and correction.endswith(('.', '؟'))
            and correction[:-1].rstrip() == original.rstrip()
        )
        if original != correction and not only_terminal_appended:
            logger.info(f"[PUNC-SAFETY] Blocked corruption of JSON/Code/URL: '{original}' -> '{correction}'")
            return False

    # ── Rule 1: Alphabetic content must be identical after normalization ──
    # Checked before Rule 0: Rule 0 only ever acts when this equality holds,
    # so rejecting here first gives the same verdict without recomputing it.
    orig_alpha = re.sub(r'[.,،؛؟!:;?\s]', '', original)
    corr_alpha = re.sub(r'[.,،؛؟!:;?\s]', '', correction)
    if _normalize_for_comparison(orig_alpha) != _normalize_for_comparison(corr_alpha):
        return False

    # ── Rule 0 (FIX-01 + FIX-30 + Merged Guard): Terminal punctuation ──
    # Catches the pattern "word" → "word." / "word؟" / "word،" where the ONLY
    # change is appending terminal punctuation at the end of the text.
    #   - V2 (FIX-30): allowed for any text with 1+ words; the word count
    #     falls back to `original` when `full_text` is empty.
    #   - V1 (FIX-29, softened): for SHORT texts (< 3 words) a question or
    #     exclamation mark is blocked unless a cue word is present. Prevents
    #     "محمد" → "محمد؟" while still allowing "اليوم" → "اليوم.".
    orig_stripped = original.rstrip()
    if orig_stripped and correction.rstrip() and punct_added:
        # Strip a trailing punctuation run; what is left must match (ignoring
        # spaces and diacritics) for this to be a pure tail addition.
        orig_core = re.sub(r'[.,،؛؟!:;?!]+$', '', original).replace(' ', '')
        corr_core = re.sub(r'[.,،؛؟!:;?!]+$', '', correction).replace(' ', '')
        is_terminal_addition = (
            _normalize_for_comparison(orig_core) == _normalize_for_comparison(corr_core)
            and _diff_touches_text_end(diff, full_text)
        )
        # A mid-sentence addition is not judged here; it falls through to
        # the remaining rules.
        if is_terminal_addition:
            context = full_text if full_text else original
            full_word_count = len(re.findall(r'[\u0600-\u06FFa-zA-Z]+', context))
            already_terminated = bool(full_text) and bool(
                re.search(r'[.،؛؟!?!][\s]*$', full_text)
            )
            ends_with_ellipsis = bool(full_text) and full_text.rstrip().endswith('...')

            if full_word_count < 1 or already_terminated or ends_with_ellipsis:
                # Already has terminal punct or ends in ellipsis → REJECT
                logger.info(
                    f"[PUNC-SAFETY] TerminalPunctuationGuard triggered: removing trailing punctuation "
                    f"'{original}' → '{correction}'"
                )
                return False

            # Softened FIX-29: short-text question/exclamation guard. ASCII
            # '?' is checked alongside '؟' because both are terminal marks.
            added_punct = correction[len(orig_stripped):]
            if full_word_count < 3 and any(mark in added_punct for mark in '!؟?'):
                has_cue = any(w in _EXCL_CUES for w in context.split())
                if not has_cue:
                    logger.info(
                        f"[PUNC-SAFETY] Blocked !/؟ on short text without cue: "
                        f"'{original}' → '{correction}'"
                    )
                    return False

            logger.info(
                f"[PUNC-SAFETY] Allowed terminal punct for sentence "
                f"({full_word_count} words): "
                f"'{original}' → '{correction}'"
            )
            # Fall through to remaining rules (don't return yet)

    # ── Rule 2: Reject excessive repetition (3+ consecutive identical) ──
    if re.search(r'([.,،؛؟!:;?])\1{2,}', correction):
        return False

    # ── Shared computation for Rules 3–5 ──
    punct_delta = max(0, corr_punct_count - orig_punct_count)
    word_count = len(re.findall(r'[\u0600-\u06FFa-zA-Z]+', correction)) or 1

    # ── Rule 3: Short-text hybrid cap (≤2 words → max 1 mark added) ──
    if word_count <= 2 and punct_delta > MAX_PUNCT_DELTA_SHORT:
        return False

    # ── Rule 4: Ratio-based spam protection (multi-word diffs) ──
    if word_count > 2 and punct_delta / word_count > MAX_PUNCT_RATIO:
        return False

    # ── Rule 5: Absolute delta cap ──
    if punct_delta > MAX_PUNCT_DELTA:
        return False

    return True