# PuncAra — Arabic Punctuation Restoration Rules
# Extracted from PuncAra.py — preprocessing + postprocessing + chunking logic.
# All classes are imported by punctuation_service.py.
#
# MERGED: Best of V1 + V2
# - V2: Threshold >= 1 (not 5) — allows terminal punct on any real text
# - V2: Fallback to `original` word count when `full_text` is empty
# - V1: Softened exclamation guard — blocks ؟/! on SHORT texts (< 3 words)
#        without cue words, but allows on longer sentences

import re
import logging

logger = logging.getLogger(__name__)


def arabic_preprocessing(text: str) -> str:
    """Normalize model input by dropping Arabic diacritics.

    Every code point in the range U+064B–U+0652 is deleted, then leading and
    trailing whitespace is stripped from the result.

    Args:
        text: Raw input text.

    Returns:
        The text without those diacritic code points and without surrounding
        whitespace.
    """
    diacritic_pattern = re.compile(r'[\u064B-\u0652]')
    without_diacritics = diacritic_pattern.sub('', text)
    return without_diacritics.strip()


def arabic_postprocessing(text: str) -> str:
    """
    Typographic cleanup and punctuation normalization after model inference.
    Handles: bracket spacing, duplicate marks, chunk-join artifacts, etc.

    The steps are order-dependent regex passes:
      1.  Shield ``,`` and ``:`` that sit between two digits behind placeholders.
      2.  Convert Latin ``,`` ``;`` ``?`` to their Arabic forms.
      3.  Remove padding just inside ``()``, ``[]`` and ``«»``.
      4.  Collapse repeated marks; cap runs of 4+ dots to ``...``.
      5.  Resolve contradictory adjacent marks left by chunk joins.
      5.5–5.9 Context-based colon/semicolon fixes.
      6.  Drop leading punctuation/whitespace.
      7.  Force a space after punctuation that is glued to the next token
          (a dot between two digits, e.g. ``3.14``, is left untouched).
      8.  Restore the placeholders from step 1.
      9.  Attach punctuation to the preceding word.
      10. Collapse runs of spaces/tabs and strip.

    Args:
        text: Punctuated text as produced by the model.

    Returns:
        The cleaned text. Falsy input (``''`` or ``None``) is returned as-is.
    """
    if not text:
        return text

    # 1. Protect numbers/fractions/time from incorrect conversion
    text = re.sub(r'(?<=\d),(?=\d)', '٪TEMP_COMMA٪', text)
    text = re.sub(r'(?<=\d):(?=\d)', '٪TEMP_COLON٪', text)

    # 2. Arabize typographic marks
    text = text.replace(',', '،').replace(';', '؛').replace('?', '؟')

    # 3. Fix internal spacing for brackets and Arabic quotes
    text = re.sub(r'\(\s+', '(', text)
    text = re.sub(r'\s+\)', ')', text)
    text = re.sub(r'\[\s+', '[', text)
    text = re.sub(r'\s+\]', ']', text)
    text = re.sub(r'«\s+', '«', text)
    text = re.sub(r'\s+»', '»', text)

    # 4. Remove repeated emotional marks (except ellipsis)
    text = re.sub(r'([،؛:!؟])\1+', r'\1', text)
    text = re.sub(r'\.{4,}', '...', text)

    # 5. Fix chunk-join contradictions
    text = re.sub(r'[،؛:]+([.!؟])', r'\1', text)
    text = re.sub(r'،؛|؛،', '؛', text)
    text = re.sub(r'([!؟])\.', r'\1', text)

    # 5.5 Syntactic context fixes for model hallucinations
    # Remove colons/semicolons before relative pronouns
    text = re.sub(r'[؛:]\s*(التي|الذي|الذين|اللتان|اللذان|اللاتي|اللواتي)', r' \1', text)

    # 5.6 Normalize "<speech verb> <name>" pairs that carry a colon.
    # NOTE(review): the colon is NOT moved between verb and name; it stays
    # where it was found and only the whitespace between the two words is
    # normalized to a single space. If both positions carry a colon, only the
    # one on the verb is kept.
    def _fix_misplaced(m):
        verb, col1, name, col2 = m.groups()
        if col1 == ':':
            return f"{verb}: {name}"
        if col2 == ':':
            return f"{verb} {name}:"
        # No colon on either word: leave the match untouched.
        return m.group(0)

    text = re.sub(
        r'\b([وفلس]?(?:قال|يقول|قالت|تقول|أجاب|أجابت|سأل|سألت|أخبر|أخبرت|صرح|صرحت|أضاف|أضافت|أردف|أردفت))(:?)\s+(ال[أ-ي]+|أحمد|محمد|محمود|علي|عمر|خالد|فاطمة|مريم|عائشة|خديجة)\b(:?)',
        _fix_misplaced, text
    )

    # 5.7 Smart Colon Guard (looks up to 6 words back)
    # A colon survives only if one of the last six words before it is a known
    # cue (speech verb, list/label word, ...) or if the word right before it
    # starts with a definite-article prefix. Otherwise it becomes a space.
    _ALLOWED_COLON_CUES = r'^[وفلس]?(قال|يقول|قالت|تقول|أجاب|أجابت|سأل|سألت|أخبر|أخبرت|صرح|صرحت|أضاف|أضافت|أردف|أردفت|وضح|وضحت|أوضح|أوضحت|رد|ردت|التالي|الآتي|مثال|ملاحظة|تنبيه|تحذير|قائلا|قائلة|اسم|العمر|تاريخ|رقم|عاجل|الآتية|التالية)$'

    def _colon_guard(match):
        context = match.group(1)

        words = re.findall(r'[\u0600-\u06FFa-zA-Z]+', context)
        if not words:
            # Nothing word-like before the colon: keep it as-is.
            return match.group(0)

        prev_word = words[-1]
        last_6_words = words[-6:]

        if any(re.match(_ALLOWED_COLON_CUES, w) for w in last_6_words):
            return match.group(0)

        if prev_word.startswith(('ال', 'لل', 'بال', 'فال', 'وال', 'كال')):
            return match.group(0)  # Preserve the colon! Do not delete it.

        # Unjustified colon: replace it with a space (collapsed in step 10).
        return context + ' '

    # Each match is "everything since the previous colon" + the colon itself.
    text = re.sub(r'([^:]+)(:)', _colon_guard, text)

    # 5.8 Remove colons after specific non-speech verbs (fallback for verbs without ال)
    text = re.sub(r'\b(يقدر|يستطيع|يمكن|يجب|ينبغي|يعتبر|يعد|يرى|يعتقد)\s*:', r'\1 ', text)
    # 5.9 Replace semicolon with comma if followed by "و" (and) or similar
    # conjunctions, as semicolon is for separate clauses
    text = re.sub(r'؛\s*(و|ف|ثم|أو|أم|بل)\b', r'، \1', text)

    # 6. Remove stray leading punctuation
    text = re.sub(r'^[،؛:!؟. \t]+', '', text)

    # 7. Ensure single space after punctuation before text.
    # A dot is skipped only when it has a digit on BOTH sides, so decimals
    # such as "3.14" are not torn apart into "3. 14". Dots in an ellipsis are
    # still spaced here and re-joined by step 9.
    text = re.sub(r'([،؛:!؟]|(?<!\d)\.|\.(?!\d))(?=\S)', r'\1 ', text)

    # 8. Restore protected numbers
    text = text.replace('٪TEMP_COMMA٪', ',').replace('٪TEMP_COLON٪', ':')

    # 9. Attach punctuation to preceding word
    text = re.sub(r'\s+([،؛:!؟.])', r'\1', text)

    # 10. Collapse horizontal spaces only
    text = re.sub(r'[ \t]+', ' ', text).strip()
    return text


# ══════════════════════════════════════════════════════════════════════════════
# PUNCTUATION SAFETY LAYER — Pipeline Hardening v3.4 (Merged V1+V2)
# ══════════════════════════════════════════════════════════════════════════════

# Characters counted as punctuation when measuring how many marks a diff adds
# (Latin and Arabic forms).
ARABIC_PUNCT_CHARS = {'.', ',', '،', '؛', '؟', '!', ':', ';', '?'}

# Absolute cap on the number of punctuation marks a single diff may add.
MAX_PUNCT_DELTA = 3
# Stricter cap for short texts (≤2 words).
MAX_PUNCT_DELTA_SHORT = 1
# Max punctuation delta per word (multi-word diffs).
MAX_PUNCT_RATIO = 0.5

# Exclamation/question cue words (from V1 FIX-29, used in softened guard).
# A short text must contain one of these before ؟ or ! may be appended to it.
_EXCL_CUES = {
    'يا', 'ما', 'كم', 'لا', 'هل', 'أين', 'متى',
    'كيف', 'لماذا', 'ماذا', 'أي', 'لعل', 'ليت',
}


def _normalize_for_comparison(text: str) -> str:
    """
    Return *text* with Arabic diacritics (U+064B–U+0652) removed.

    This is the only normalization applied before two strings are compared,
    so a correction that merely drops harakat is not treated as a word change.
    Hamza, ya and ta-marbuta variants are deliberately NOT folded, which keeps
    genuine spelling changes visible to the caller.
    """
    harakat = re.compile(r'[\u064B-\u0652]')
    return harakat.sub('', text)


def _diff_is_at_text_end(diff: dict, full_text: str) -> bool:
    """Return True when *diff* may be treated as sitting at the end of the text.

    Without ``full_text`` the diff is assumed to be a standalone fragment.
    With ``full_text`` the diff must carry an ``end`` offset that lies within
    two characters of the end of ``full_text``; a diff without ``end`` is
    treated as mid-sentence.
    """
    if not full_text:
        return True
    if 'end' in diff:
        return diff['end'] >= len(full_text) - 2
    return False


def validate_punctuation_diff(diff: dict, full_text: str = '') -> bool:
    """
    Return True ONLY if the diff is a safe punctuation-only change.

    ALLOWED:
        - Inserting 1 punctuation mark (short text) or 1–3 (long text)
        - Replacing one punctuation mark with another
        - Adding terminal punctuation to any text (1+ words) that lacks it
        - Adding ؟/! to short texts (< 3 words) ONLY with cue words

    REJECTED:
        - Adding/deleting/duplicating Arabic words
        - Rewriting phrases
        - Excessive punctuation repetition (3+ consecutive identical)
        - Punctuation spam: delta/word_count > 0.5 (multi-word diffs)
        - Short text (≤2 words): delta > 1
        - Any diff: delta > MAX_PUNCT_DELTA
        - Adding terminal punctuation when text already ends with punct
        - Adding ؟/! to short texts without interrogative/exclamatory cues

    Args:
        diff: Mapping read via the keys ``'original'`` and ``'correction'``
            (both default to ``''``) and, optionally, ``'end'`` (compared
            against ``len(full_text)`` to decide whether the diff is at the
            end of the text).
        full_text: The complete text the diff belongs to. When empty, the
            diff is treated as a standalone fragment.

    Returns:
        True if every rule accepts the diff, False as soon as one rejects it.
    """
    original = diff.get('original', '')
    correction = diff.get('correction', '')

    # ── Protect Structured Data (English, URLs, Emails, Hashtags, Code/JSON) ──
    # Block punctuation modifications near structured data unless it's a valid
    # terminal punctuation.
    if re.search(r'[a-zA-Z]|\{|\[|<|#|@|://', original):
        is_at_end = _diff_is_at_text_end(diff, full_text)

        orig_punct = sum(1 for c in original if c in '.,،؛؟!:;?!')
        corr_punct = sum(1 for c in correction if c in '.,،؛؟!:;?!')

        # Block mid-sentence punctuation additions (e.g. adding comma after English word)
        if corr_punct > orig_punct and not is_at_end:
            logger.info(f"[PUNC-SAFETY] Blocked mid-sentence punctuation on structured data: '{original}' -> '{correction}'")
            return False

        # Block spacing corruptions in JSON/Code (e.g. {"name"} -> { "name" }).
        # Only allow if the ONLY change is appending a terminal mark at the very end.
        appends_terminal_only = (
            is_at_end
            and correction.endswith(('.', '؟'))
            and correction[:-1].rstrip() == original.rstrip()
        )
        if original != correction and not appends_terminal_only:
            logger.info(f"[PUNC-SAFETY] Blocked corruption of JSON/Code/URL: '{original}' -> '{correction}'")
            return False

    # ── Rule 1: Alphabetic content must be identical after normalization ──
    # Checked before Rule 0 because Rule 0 only ever acts on diffs whose word
    # content already matches; a mismatch is rejected here either way.
    orig_alpha = re.sub(r'[.,،؛؟!:;?\s]', '', original)
    corr_alpha = re.sub(r'[.,،؛؟!:;?\s]', '', correction)

    if _normalize_for_comparison(orig_alpha) != _normalize_for_comparison(corr_alpha):
        return False

    # ── Rule 0 (FIX-01 + FIX-30 + Merged Guard): Terminal punctuation ──
    # PuncAra-v1 unconditionally adds . or ؟ to every sentence.
    # This rule catches the pattern: "word" → "word." / "word؟" / "word،"
    # where the ONLY change is appending terminal punctuation marks.
    #
    # From V2 (FIX-30): Threshold lowered from 5 → 1. Even single-word
    # fragments deserve terminal punctuation (e.g. "اليوم" → "اليوم.").
    #
    # From V2 (FIX-30): When full_text isn't provided, fall back to
    # counting words in `original` instead of returning 0.
    #
    # From V1 (FIX-29, softened): For SHORT texts (< 3 words), block ؟/!
    # unless text contains interrogative/exclamatory cue words. For longer
    # texts (3+ words), allow any terminal punct freely. This prevents
    # "محمد" → "محمد؟" while still allowing "اليوم" → "اليوم.".
    TERMINAL_PUNCT = set('.,،؛؟!:;?!')
    if original.rstrip() and correction.rstrip():
        orig_mark_count = sum(1 for c in original if c in TERMINAL_PUNCT)
        corr_mark_count = sum(1 for c in correction if c in TERMINAL_PUNCT)

        # Body of each side with its trailing punctuation run removed.
        orig_no_punct = re.sub(r'[.,،؛؟!:;?!]+$', '', original)
        corr_no_punct = re.sub(r'[.,،؛؟!:;?!]+$', '', correction)
        only_tail_differs = (
            _normalize_for_comparison(orig_no_punct.replace(' ', ''))
            == _normalize_for_comparison(corr_no_punct.replace(' ', ''))
        )

        # Mid-sentence additions are safe to fall through to the other rules;
        # only a pure addition at the end of the text is judged here.
        if (corr_mark_count > orig_mark_count
                and only_tail_differs
                and _diff_is_at_text_end(diff, full_text)):
            # V2 FIX-30: Fall back to original when full_text is empty
            word_count_source = full_text if full_text else original
            full_word_count = len(re.findall(
                r'[\u0600-\u06FFa-zA-Z]+', word_count_source
            ))
            full_already_has_terminal = bool(
                re.search(r'[.،؛؟!?!][\s]*$', full_text)
            ) if full_text else False
            full_has_ellipsis = full_text.rstrip().endswith('...') if full_text else False

            # V2 FIX-30: Allow for 1+ words (not 5)
            if full_word_count < 1 or full_already_has_terminal or full_has_ellipsis:
                # Already has terminal punct or ends in ellipsis → REJECT
                logger.info(
                    f"[PUNC-SAFETY] TerminalPunctuationGuard triggered: removing trailing punctuation "
                    f"'{original}' → '{correction}'"
                )
                return False

            # ── Softened FIX-29 (Merged): Short-text ؟/! guard ──
            # For short texts (< 3 words), block ؟ and ! unless cue words are
            # present. Prevents "محمد" → "محمد؟" but allows "اليوم" → "اليوم."
            # (period is safe). For 3+ words, allow freely (V2 behavior).
            #
            # The added marks are found by comparing the trailing punctuation
            # runs of both sides. Slicing the correction at len(original)
            # would miss them whenever the correction dropped diacritics and
            # is therefore shorter than the original.
            tail_run = r'[.,،؛؟!:;?!\s]*\Z'
            orig_tail = re.search(tail_run, original).group(0)
            corr_tail = re.search(tail_run, correction).group(0)
            adds_emphatic_mark = any(
                corr_tail.count(mark) > orig_tail.count(mark)
                for mark in ('!', '؟')
            )
            if full_word_count < 3 and adds_emphatic_mark:
                text_to_scan = full_text if full_text else original
                has_cue = any(w in _EXCL_CUES for w in text_to_scan.split())
                if not has_cue:
                    logger.info(
                        f"[PUNC-SAFETY] Blocked !/؟ on short text without cue: "
                        f"'{original}' → '{correction}'"
                    )
                    return False

            logger.info(
                f"[PUNC-SAFETY] Allowed terminal punct for sentence "
                f"({full_word_count} words): "
                f"'{original}' → '{correction}'"
            )
            # Fall through to remaining rules (don't return yet)

    # ── Rule 2: Reject excessive repetition (3+ consecutive identical) ──
    if re.search(r'([.,،؛؟!:;?])\1{2,}', correction):
        return False

    # ── Shared computation for Rules 3–5 ──
    orig_punct_count = sum(1 for c in original if c in ARABIC_PUNCT_CHARS)
    corr_punct_count = sum(1 for c in correction if c in ARABIC_PUNCT_CHARS)
    # Removing marks never counts against the diff, hence the floor at 0.
    punct_delta = max(0, corr_punct_count - orig_punct_count)
    # "or 1" keeps the ratio in Rule 4 well-defined for word-less corrections.
    word_count = len(re.findall(r'[\u0600-\u06FFa-zA-Z]+', correction)) or 1

    # ── Rule 3: Short-text hybrid cap (≤2 words → max 1 mark added) ──
    if word_count <= 2 and punct_delta > MAX_PUNCT_DELTA_SHORT:
        return False

    # ── Rule 4: Ratio-based spam protection (multi-word diffs) ──
    if word_count > 2 and punct_delta / word_count > MAX_PUNCT_RATIO:
        return False

    # ── Rule 5: Absolute delta cap ──
    if punct_delta > MAX_PUNCT_DELTA:
        return False

    return True