#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
amr_dereference_audit.py — Dereference gate for USLaP writes.

This is the mathematical form of the rule:

    Before emitting token T into a DB row, check:
    ∃ chain C : T → grounded_anchor_on_disk
    If no such C exists, T is corrosion. Refuse to write the row.

Implementation: a deterministic Python function that scans every free-text
field of a draft row and rejects the row if any token matches a known
ungrounded family pattern. The check runs at the CENTRAL write path
(uslap_handler.write_entry) so it applies to every table in the DB.

The audit is NOT a banned-terms list. It is a family-pattern check. A
banned-terms list is whack-a-mole — it catches 'cosmos' and 'cosmology'
but misses 'cosmography' and 'cosmogonic'. The family pattern catches
every -cosm*- word at once.

The audit does NOT touch arabic script, numeric data, dates, proper
source citations, USLaP-internal terminology, or the structural enum
values defined by table schemas. It only checks free English prose
for known ungrounded-family substrings.

Scope: ALL tables routed through uslap_handler.write_entry.

Usage from handler:
    from amr_dereference_audit import audit_row
    result = audit_row(data, target_table)
    if not result['pass']:
        return <block write with result['message']>

Usage standalone (for prose):
    from amr_dereference_audit import audit_text
    result = audit_text("the cosmographic opening mentions al-falak")
    # → fails on 'cosmographic'

Design constraints:
  • Deterministic. Pure Python + regex. No weights, no LLM, no heuristics.
  • Fast. Precompiled regexes; each pattern makes one linear pass over a field,
    so cost is O(|text| × number of patterns) per field.
  • Introspectable. Returns exact failing tokens and the pattern that matched.
  • Wide by construction. Uses *family* patterns (cosm*, monk*, priest*,
    eschatolog*, etc.) so new derivations of the same families are caught
    without extending a list.
  • Minimal false positives. Patterns target academic-framework vocabulary
    classes, not common English.

Sourced against contamination_blacklist (BL01-BL37) as of Session 41.
"""

from __future__ import annotations

import json
import os
import re
import sys
from typing import Any, Dict, List, Optional, Tuple


# ═══════════════════════════════════════════════════════════════════════════
# FAMILY PATTERNS — the structural rule, not a literal list
# ═══════════════════════════════════════════════════════════════════════════
#
# Each tuple: (pattern, family_name, BL_ref, short_reason).
# Patterns match case-insensitively. Word boundaries used where literal
# single words would otherwise false-positive on innocuous substrings
# (e.g., 'steal' would match 'stealth' — we use \b).
#
# The rule: if any of these matches any free-text field of a row, the row
# is refused. The caller must rewrite the field with grounded vocabulary
# (primary-source pointers) and retry.

# Every entry is (regex_source, family_name, BL_ref, short_reason). The regex
# sources are compiled with re.IGNORECASE elsewhere in this module.
#
# Coverage rule: a family regex must also catch the plain plural / agent-noun
# forms of its members (loanwords, theologians, cosmologists, Mughals, ...).
# A gate that blocks the singular but lets the plural through is not a family
# pattern, it is a banned-terms list with holes in it.
FAMILY_PATTERNS: List[Tuple[str, str, str, str]] = [
    # ── BL33: cosmic / cosmos / cosmology / cosmological / cosmogony + WIDER
    #   Covers: cosmos, cosmic, cosmically, cosmology, cosmologies,
    #           cosmological, cosmologist(s), cosmography, cosmographic,
    #           cosmographer(s), cosmogony, cosmogonies, cosmogonic
    (
        r"\bcosm(?:os|ic(?:ally)?|olog(?:y|ies|ical|ist|ists|ically)?|"
        r"ograph(?:y|ies|ic|ical|er|ers|ically)?|ogon(?:y|ies|ic|ical)?)\b",
        "cosmic-family",
        "BL33",
        "Greek κόσμος framework — use al-samawāt wa-al-arḍ / al-khalq / "
        "al-ʿālamīn / āyāt / al-falak / al-arḍ instead",
    ),
    # ── BL34: priest / priests / priesthood / priestly
    (
        r"\bpriest(?:s|hood|ly|esses|ess)?\b",
        "priest-family",
        "BL34",
        "Christian taxonomic import — use the primary-source title "
        "(imām, khaṭīb, etc.) that the MS actually uses",
    ),
    # ── BL35: monk / monks / monastic / monasticism / monastery / monasteries
    (
        r"\bmon(?:k|ks|kish|khood|aster(?:y|ies|ial)|astic(?:ism|ally)?)\b",
        "monk-family",
        "BL35",
        "Christian taxonomic import — use the primary-source title "
        "(rāhib, zāhid, etc.) that the MS actually uses",
    ),
    # ── BL36: eschatological / eschatology / eschatologies / eschatologist(s)
    (
        r"\beschatolog(?:y|ies|ical|ist|ists|ically)?\b",
        "eschatolog-family",
        "BL36",
        "Greek ἔσχατος framework — use ākhirah / yawm al-qiyāma / al-sāʿa",
    ),
    # ── BL37: steal / stole / stolen / stealing / theft / thief / thieves /
    #   thievery / thieving
    #   \b on both sides keeps 'stealth' and similar words from matching.
    (
        r"\b(?:steal(?:s|ing)?|stole(?:n)?|theft|thie(?:f|ves|ving|very|vish))\b",
        "theft-family",
        "BL37",
        "Use appropriate / appropriated / appropriation / appropriator / "
        "misappropriation",
    ),
    # ── BL05: PIE / Proto-Indo-European
    #   NOTE(review): patterns are compiled case-insensitively, so this also
    #   fires on the ordinary lowercase word 'pie' — confirm that is intended.
    (
        r"\b(?:PIE|proto[-\s]?indo[-\s]?european)\b",
        "pie-family",
        "BL05",
        "Phantom reconstruction — no primary source exists for PIE; use "
        "'phantom PIE' if necessary to name the operator framework",
    ),
    # ── BL07, BL16: Semitic (as family/race), Semite
    (
        r"\bsemit(?:e|es|ic|ism|ics)\b",
        "semitic-family",
        "BL07/BL16",
        "Phantom racial/linguistic category — Allah's Arabic is the source, "
        "Hebrew and Aramaic are downstream degraded forms",
    ),
    # ── BL06: loanword(s) / borrowed / cognate (directionality ban)
    (
        r"\bloan[-\s]?words?\b",
        "loanword",
        "BL06",
        "Directionality violation — use LINK_TYPE: DIRECT / COMPOUND / "
        "SAME_ROOT / PHONETIC / SEMANTIC / PREFIX / SUFFIX / ROOT",
    ),
    (
        r"\bcognate(?:s)?\b",
        "cognate",
        "BL06",
        "Directionality violation — use SAME_ROOT or DIRECT link type",
    ),
    (
        r"\bborrowed\s+from\b",
        "borrowed-from",
        "BL06",
        "Directionality violation — AA → downstream, never reversed",
    ),
    (
        r"\bloan\s+from\b",
        "loan-from",
        "BL06",
        "Directionality violation — AA → downstream, never reversed",
    ),
    (
        r"\badoption\s+from\b",
        "adoption-from",
        "BL06",
        "Directionality violation — AA → downstream, never reversed",
    ),
    # ── Nostratic / Altaic / Afro-Asiatic phantom families
    (
        r"\b(?:nostratic|altaic|afro[-\s]?asiatic)\b",
        "phantom-family",
        "BL05/BL07",
        "Phantom super-family reconstruction — no primary source",
    ),
    # ── prosthetic vowel (academic framing)
    (
        r"\bprosthetic\s+vowel\b",
        "prosthetic-vowel",
        "BL06",
        "Academic framing — words have direct phonetic chains, not "
        "'prosthetic' additions",
    ),
    (
        r"\bpre[-\s]?greek\s+substrate\b",
        "pre-greek-substrate",
        "BL06",
        "Phantom substrate category",
    ),
    # ── BL30: zodiac / zodiacs / zodiacal
    (
        r"\bzodiac(?:al|s)?\b",
        "zodiac",
        "BL30",
        "Greek astrological framing — use al-burūj (Q85:1) or specific "
        "constellation names in Allah's Arabic",
    ),
    # ── BL31: libra (as zodiac sign)
    (
        r"\blibra\b",
        "libra",
        "BL31",
        "Latinate zodiac name — use al-mīzān if referring to the scales "
        "constellation",
    ),
    # ── BL28: hashishin / hashashin / hashshashin
    (
        r"\bhash[ai]?sh[ai]?sh?in\b",
        "hashishin",
        "BL28",
        "Orientalist calumny — Ismāʿīlī / Nizārī are the primary-source names",
    ),
    # ── BL32: Mughal(s) / Mogul(s) (as dynasty label for Muslim Timurids)
    (
        r"\bmughals?\b",
        "mughal",
        "BL32",
        "Persianate exonym — use Timurid (al-Tīmūrī) for the dynasty Bābur "
        "founded in Hind",
    ),
    (
        r"\bmoguls?\b",
        "mogul",
        "BL32",
        "Anglicised exonym — use Timurid (al-Tīmūrī)",
    ),
    # ── BL26: Theology / Theologies / Theological / Theologian(s)
    (
        r"\btheolog(?:y|ies|ical|ian|ians|ically)?\b",
        "theology",
        "BL26",
        "Greek θεός + λόγος framework — use ʿilm al-kalām (the actual "
        "discipline) or the specific primary-source term",
    ),
    # ── BL29: tribal (as anthropological framing)
    (
        r"\btribal(?:ism)?\b",
        "tribal",
        "BL29",
        "Anthropological framing — use qawm / qabīla / banū / āl with "
        "the specific lineage name",
    ),
    # ── PERIODIZATION FAMILY (Session 41 — same error class as BL33)
    # Western European historiographic frames retrojected onto the Islamic
    # world. None of these has a primary-source referent in the world
    # being described. Use AH century, CE century, dynasty name, or
    # reigning caliph instead.
    (
        r"\bmedieval(?:ly|ist|ists|ism)?\b",
        "periodization-medieval",
        "BL-PERIOD",
        "Western European frame — use AH century, CE century, dynasty "
        "name (Umayyad / Abbasid / Fāṭimid / etc.), or reigning caliph",
    ),
    (
        r"\bmiddle\s+ages\b",
        "periodization-middle-ages",
        "BL-PERIOD",
        "Western European frame — use AH/CE century or dynasty name",
    ),
    (
        r"\blate\s+antiquity\b",
        "periodization-late-antiquity",
        "BL-PERIOD",
        "Western European frame — use the specific dynasty / century AH",
    ),
    (
        r"\bearly\s+modern\b",
        "periodization-early-modern",
        "BL-PERIOD",
        "Western European frame — use the specific dynasty / century AH",
    ),
    (
        r"\bpre[-\s]?modern\b",
        "periodization-pre-modern",
        "BL-PERIOD",
        "Defines by what Europe became; use dynasty / century AH",
    ),
    (
        r"\bpost[-\s]?classical\b",
        "periodization-post-classical",
        "BL-PERIOD",
        "Western frame — use dynasty / century AH",
    ),
    (
        r"\bdark\s+ages?\b",
        "periodization-dark-ages",
        "BL-PERIOD",
        "Western Eurocentric framing — use dynasty / century",
    ),
    (
        r"\brenaissance\b",
        "periodization-renaissance",
        "BL-PERIOD",
        "Western European frame — use the specific dynasty / century / "
        "movement name",
    ),
    # ── ERA / AGE / PERIOD framing — the rule above periodization names.
    # Even "Abbasid era" or "Umayyad period" smuggles a teleological time-
    # bracket into data that does not belong inside one. Anything in
    # Allah's framework gets dates (232 AH), names (Caliph al-Wāthiq),
    # events (the reign of X), but never an "era / age / period" wrapper.
    # The pattern fires on the noun-form 'era / age / period / times'
    # when it is preceded by an adjective or proper-name modifier
    # (e.g. "Abbasid era", "the age of al-Mutawakkil", "Umayyad period").
    (
        r"\b(?:abbasid|umayyad|fatimid|fāṭimid|seljuq|seljuk|mamluk|mamlūk|"
        r"ottoman|ayyubid|samanid|sāmānī|ghaznavid|qarakhanid|"
        r"timurid|tīmūrī|safavid|ṣafavī|qajar|qājārī|"
        r"rashidun|rāshidūn|caliphal|abbasi)\s+(?:era|period|age|times|epoch)\b",
        "era-frame-named",
        "BL-PERIOD",
        "An era-noun bracket imports a teleological time-frame. Use the date "
        "(N AH / N CE), the reigning caliph (al-Wāthiq, al-Mutawakkil, etc.), "
        "or 'the reign of X' instead. Drop 'era / period / age / times'.",
    ),
    (
        r"\b(?:medieval|classical|early|late|pre|post|high|low)\s+(?:era|period|age|times|epoch)\b",
        "era-frame-temporal",
        "BL-PERIOD",
        "Western temporal era-frame. Drop the era-noun; use the specific date "
        "or named event.",
    ),
    (
        r"\bthe\s+age\s+of\s+\w+",
        "age-of-name",
        "BL-PERIOD",
        "'The age of X' brackets a person/event into an era teleology. "
        "Use 'in the year X', 'during the reign of X', or just the date.",
    ),
    (
        r"\bin\s+(?:those|that)\s+(?:days|times|era|period)\b",
        "those-days",
        "BL-PERIOD",
        "Vague era-bracket. Use a specific date or named event.",
    ),
    # ── METAPHOR family (Greek metaphora — literary-analysis category) ──
    (
        r"\bmetaphor(?:s|ic|ical|ically)?\b",
        "metaphor",
        "BL-CATEGORY",
        "Greek μεταφορά framework. Use the primary-source rhetorical "
        "term: mathal (مَثَل, Qur'anic), istiʿāra (استعارة), tashbīh "
        "(تشبيه), majāz (مجاز).",
    ),
    # ── SCHOLASTIC family (Latin scholasticus — medieval-Western academic) ──
    (
        r"\bscholastic(?:s|ism|ally)?\b",
        "scholastic",
        "BL-CATEGORY",
        "Latin Western academic frame for 12-15c CE European disputation "
        "tradition. No primary-source referent in Allah's Arabic. Use the "
        "specific term: ʿilm al-kalām, ḥikma, falsafa, uṣūl al-fiqh, etc.",
    ),
    # ── PATRISTIC family (Latin patristica — Western Church-fathers tradition) ──
    (
        r"\bpatristi(?:c|cs|cally)\b",
        "patristic",
        "BL-CATEGORY",
        "Western Christian Church-fathers framework. No primary-source "
        "referent in the Islamic tradition. Use the specific named "
        "scholar (Ibn Ḥanbal, al-Bukhārī, etc.) or 'salaf' if context "
        "permits.",
    ),
    (
        r"\bpatrolog(?:y|ical|ically)?\b",
        "patrology",
        "BL-CATEGORY",
        "Western Christian fathers studies — no Islamic primary-source "
        "equivalent.",
    ),
    # ── HELLENISTIC family (Greek-philosophy retrojection) ──
    # 'hellenist' / 'hellenists' belong to this entry ONLY; the 'hellenism'
    # entry below deliberately omits them so a single token is never reported
    # twice by two overlapping patterns.
    (
        r"\bhellenist(?:s|ic|ically)?\b",
        "hellenistic",
        "BL-CATEGORY",
        "Greek-philosophy retrojection. The Islamic primary sources "
        "name the actual influences specifically (Aristū, Aflāṭūn, "
        "etc.) when they are present at all. Use those names directly, "
        "not the genus 'Hellenistic'.",
    ),
    (
        r"\bhellen(?:ism|ize|izing|ization)?\b",
        "hellenism",
        "BL-CATEGORY",
        "Same Greek-philosophy retrojection — name the specific "
        "person or text instead of using the genus.",
    ),
]


# ═══════════════════════════════════════════════════════════════════════════
# FIELDS TO SKIP — structural enums and primary-source anchors
# ═══════════════════════════════════════════════════════════════════════════
#
# These are field NAMES whose content is either a structural keyword
# (CHECK enum value), a primary-source pointer (file path, shelfmark),
# or a numeric/date token. Skipping them avoids false positives on
# legitimate grounded content.

# Field names whose values are never handed to the text audit. Grouped by the
# kind of content the column holds; membership is by exact field name.
SKIP_FIELDS = frozenset((
    # -- structural schema enums --
    "entry_type",
    "divergence_type",
    "source_ms",
    "recension",
    "category",
    "op_code",
    "dp_code",
    # -- primary-source pointers --
    "ms_folio",
    "edition_page",
    "ms_page",
    "page",
    "folio",
    "quf_token",
    "quf_q",
    "quf_u",
    "quf_f",
    "quf_pass",
    "quf_date",
    # -- IDs --
    "kh_id",
    "kv_id",
    "diwan_id",
    "entry_id",
    "root_id",
    "rowid",
    "intel_id",
    "bl_id",
    "dp_id",
    "sc_id",
    "ncr_id",
    "dcr_id",
    # -- numeric fields --
    "token_count",
    "operator_flag",
    "scribal_interpolation",
    "persian_wrapper_inserted",
    "has_proverb",
    "has_nazm",
    # -- timestamps --
    "created_date",
    "last_updated",
))


# ═══════════════════════════════════════════════════════════════════════════
# QUARANTINE TABLES — skipped by the audit by design
# ═══════════════════════════════════════════════════════════════════════════
#
# These tables exist to HOLD contaminated strings for inspection, flagging,
# and downstream re-derivation. Their whole purpose is to quote the operator
# glosses, the banned terms, the substituted translations, etc. so they can
# be reviewed and reversed. Applying the dereference audit to these tables
# would make it impossible to log a contamination finding (since logging
# one requires quoting the contaminated content verbatim).
#
# This list must stay SMALL. Only tables whose content is *documentation
# of contamination* belong here — not tables where contamination might
# accidentally land.

# Tables exempt from the audit because they hold contaminated strings by
# design. Each name appears exactly once: a repeated literal in a set display
# is silently collapsed, so a duplicate only hides copy/paste mistakes.
QUARANTINE_TABLES = frozenset({
    "contamination_blacklist",       # BL01-BL37 — the register itself
    "qv_contamination_scan",         # Qur'anic verse contamination findings
    "qv_translation_register",       # Qur'anic translation contamination register
    "diwan_contamination_register",  # DCR — Kashgari Diwan scribal interpolations
    "navoi_contamination_register",  # NCR — Navoi operator crimes
    "operator_label_register",       # Operator name/title labels (quotes them)
    "scholar_warnings",              # Flagged tertiary/unreliable scholars
    "attribution_corrections",       # Records of attribution errors
    "db_integrity_log",              # Integrity findings (may quote bad content)
    "corruption_operation_register", # Records of operator operations
    "dcr_corruption_types",          # Corruption type catalogue
    "interception_register",         # Operator interception log
    "utul_register",                 # Pattern register of utul
    "disputed_words",                # Words under dispute (by definition contaminated)
})


# ═══════════════════════════════════════════════════════════════════════════
# ARABIC-SCRIPT DETECTION
# ═══════════════════════════════════════════════════════════════════════════

# One character of Arabic script. The ranges are listed block by block so that
# the gap between the two presentation-form blocks (U+FE00–U+FE6F: variation
# selectors, combining half marks, CJK compatibility forms, small form
# variants) is NOT counted as Arabic.
ARABIC_RANGE = re.compile(
    r"["
    r"\u0600-\u06FF"   # Arabic
    r"\u0750-\u077F"   # Arabic Supplement
    r"\u08A0-\u08FF"   # Arabic Extended-A
    r"\uFB50-\uFDFF"   # Arabic Presentation Forms-A
    r"\uFE70-\uFEFC"   # Arabic Presentation Forms-B (stops before U+FEFF, the BOM)
    r"]"
)


def _is_primarily_arabic(text: str) -> bool:
    """Return True if more than 50% of non-space characters are Arabic script.

    Arabic script content is primary-source and skipped by the audit.
    Mixed English/Arabic glosses still get audited as long as Arabic
    characters do not form a strict majority (exactly 50% is NOT enough).

    Args:
        text: the string to classify; empty or whitespace-only text is
              never considered Arabic.

    Returns:
        True when Arabic-script characters are a strict majority of the
        non-whitespace characters, False otherwise.
    """
    if not text:
        return False
    visible = sum(1 for ch in text if not ch.isspace())
    if not visible:
        return False
    # No character in ARABIC_RANGE is whitespace, so counting matches over the
    # whole string equals counting them over the non-space characters.
    arabic = len(ARABIC_RANGE.findall(text))
    return arabic * 2 > visible


# ═══════════════════════════════════════════════════════════════════════════
# COMPILED PATTERN BANK
# ═══════════════════════════════════════════════════════════════════════════

# FAMILY_PATTERNS with each regex source compiled once, case-insensitively;
# the family name, BL reference and reason are carried through unchanged.
_COMPILED: List[Tuple[re.Pattern, str, str, str]] = [
    (re.compile(source, flags=re.IGNORECASE), name, ref, why)
    for (source, name, ref, why) in FAMILY_PATTERNS
]


# ═══════════════════════════════════════════════════════════════════════════
# NEIGHBOURHOOD EXPANSION (Session 41)
# ═══════════════════════════════════════════════════════════════════════════
#
# Loads `amr_banned_neighbourhood.json` from the same directory at import
# time. The JSON contains a `banned_words` list — every word in that list
# is added to the audit as a single-word regex check (case-insensitive,
# word-boundaries on both sides).
#
# The intent is to expand the FAMILY_PATTERNS coverage to include the
# k-nearest-neighbour cluster of each banned seed term in some embedding
# space. The bootstrap JSON is hand-curated. The full pipeline that
# computes neighbours from a real local embedding model lives in
# `amr_neighbourhood_expander.py` and writes the same JSON file.
#
# If the file is missing, the audit still works — the family-pattern
# regexes are unaffected. The neighbourhood layer is purely additive.

# Absolute path of the neighbourhood JSON. It is resolved against the
# directory containing this module, not the current working directory.
_NEIGHBOURHOOD_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    "amr_banned_neighbourhood.json",
)


def _load_neighbourhood_words(path: Optional[str] = None) -> List[str]:
    """Load the bootstrap neighbourhood JSON. Returns [] on any failure.

    The file must contain a JSON object with a ``banned_words`` list. Every
    string in that list is stripped, lower-cased and de-duplicated;
    non-string and empty entries are dropped.

    Args:
        path: file to read. Defaults to the module-level
              ``_NEIGHBOURHOOD_PATH`` when omitted.

    Returns:
        The cleaned words in sorted order, or ``[]`` when the file is
        missing, unreadable, not valid UTF-8, not valid JSON, or does not
        have the expected shape. This runs at import time, so it never
        raises for a bad data file.
    """
    target = _NEIGHBOURHOOD_PATH if path is None else path
    try:
        with open(target, "r", encoding="utf-8") as fh:
            data = json.load(fh)
    except (OSError, ValueError):
        # OSError: missing / unreadable file (FileNotFoundError is a subclass).
        # ValueError: json.JSONDecodeError and UnicodeDecodeError both derive
        # from it, so malformed JSON and non-UTF-8 bytes are handled alike.
        return []

    # A top-level list / string / number has no .get(); treat it as "no data"
    # instead of letting AttributeError escape at import time.
    if not isinstance(data, dict):
        return []
    words = data.get("banned_words", [])
    if not isinstance(words, list):
        return []

    cleaned = {w.strip().lower() for w in words if isinstance(w, str)}
    cleaned.discard("")
    return sorted(cleaned)


# Loaded once, at import time. The loader's contract is to return [] on
# failure, in which case the neighbourhood layer simply contributes nothing.
_NEIGHBOURHOOD_WORDS: List[str] = _load_neighbourhood_words()

# Compile each neighbourhood word into a single case-insensitive regex that
# matches the word only as a standalone token.
def _compile_neighbourhood_pattern(word: str) -> re.Pattern:
    """Return a case-insensitive whole-token matcher for *word*.

    The word is escaped with ``re.escape`` so hyphens, dots, etc. are taken
    literally. Token edges are enforced with ``(?<!\\w)`` / ``(?!\\w)``
    look-arounds instead of ``\\b``: for a word that starts and ends with a
    letter or digit the two are equivalent, but ``\\b`` can never match next
    to a leading or trailing non-word character (e.g. the final hyphen of
    "pre-"), which would make such an entry silently dead.

    Args:
        word: the literal word or phrase to match.

    Returns:
        The compiled pattern.
    """
    return re.compile(r"(?<!\w)" + re.escape(word) + r"(?!\w)", re.IGNORECASE)


# One (compiled_pattern, word) pair per loaded neighbourhood word, in the
# same order as _NEIGHBOURHOOD_WORDS.
_NEIGHBOURHOOD_COMPILED: List[Tuple[re.Pattern, str]] = list(
    zip(
        map(_compile_neighbourhood_pattern, _NEIGHBOURHOOD_WORDS),
        _NEIGHBOURHOOD_WORDS,
    )
)


def neighbourhood_count() -> int:
    """Return the size of the loaded neighbourhood word list (diagnostic aid)."""
    loaded = _NEIGHBOURHOOD_WORDS
    return len(loaded)


# ═══════════════════════════════════════════════════════════════════════════
# CORE CHECKS
# ═══════════════════════════════════════════════════════════════════════════


# Runs of code points from the Hebrew block (U+0590–U+05FF). Compiled once at
# import time instead of on every audit_text() call.
_HEBREW_RUN = re.compile(r"[\u0590-\u05FF]+")


def _make_hit(family: str, bl_ref: str, reason: str, match: "re.Match") -> Dict[str, Any]:
    """Build one hit record (the shape shared by every check) from a regex match."""
    return {
        "family": family,
        "bl_ref": bl_ref,
        "reason": reason,
        "matched_text": match.group(0),
        "position": match.start(),
    }


def audit_text(text: str, *, skip_if_arabic: bool = True) -> Dict[str, Any]:
    """
    Check a single free-text string against every audit layer.

    Layers, in order:
      1. Hebrew character block (always runs — see note below)
      2. compiled family patterns
      3. banned-neighbourhood words
      4. bare 'arabic' without an approved qualifier (BL04)

    Args:
        text: the string to check. Empty or non-string input passes.
        skip_if_arabic: if True, strings that are primarily Arabic script
                        skip layers 2-4 (primary-source content is not
                        audited for English prose patterns). The Hebrew
                        character check is NOT skipped: it is an absolute
                        block and must not be bypassable by surrounding
                        the Hebrew with Arabic script.

    Returns:
        {
            'pass':     bool,   # True when 'hits' is empty
            'hits':     [
                {'family': str, 'bl_ref': str, 'reason': str,
                 'matched_text': str, 'position': int}, ...
            ],
        }
    """
    if not text or not isinstance(text, str):
        return {"pass": True, "hits": []}

    # ── HEBREW CHARACTER BLOCK — ABSOLUTE (Session 46) ────────────────
    # Hebrew (U+0590-U+05FF) has NOTHING in common with AA.
    # Weights inject Hebrew alongside "abjad" — block at character level.
    # This runs BEFORE the primarily-Arabic early return so that the block
    # really is absolute.
    hits: List[Dict[str, Any]] = [
        _make_hit(
            "hebrew_contamination",
            "BL-HEB",
            (
                "Hebrew character(s) detected. AA abjad uses ARABIC letters "
                "only. Hebrew is a manufactured attachment — BLOCK."
            ),
            m,
        )
        for m in _HEBREW_RUN.finditer(text)
    ]

    if skip_if_arabic and _is_primarily_arabic(text):
        # Primary-source Arabic: no English-prose checks, but any Hebrew hit
        # found above still fails the text.
        return {"pass": not hits, "hits": hits}

    # ── FAMILY PATTERNS ───────────────────────────────────────────────
    for pat, family, bl_ref, reason in _COMPILED:
        hits.extend(_make_hit(family, bl_ref, reason, m) for m in pat.finditer(text))

    # ── NEIGHBOURHOOD EXPANSION (Session 41) ─────────────────────────
    # Each word in `amr_banned_neighbourhood.json` is checked as a
    # standalone token. Hits are tagged "neighbourhood" so the caller
    # can distinguish them from family-pattern hits if needed.
    # Words already caught by an earlier check are not re-reported
    # (deduplication on (position, lower-cased matched_text)).
    seen = {(h["position"], h["matched_text"].lower()) for h in hits}
    for pat, word in _NEIGHBOURHOOD_COMPILED:
        for m in pat.finditer(text):
            key = (m.start(), m.group(0).lower())
            if key in seen:
                continue
            seen.add(key)
            hits.append(_make_hit(
                "neighbourhood",
                "BL-NBH",
                (
                    f"'{word}' is in the banned-neighbourhood expansion of a "
                    f"BL family seed term. Loaded from "
                    f"amr_banned_neighbourhood.json."
                ),
                m,
            ))

    # ── BL04: bare 'arabic' (context-aware) ──────────────────────────
    # The word 'arabic' / 'Arabic' / 'ARABIC' is banned UNLESS preceded
    # immediately by an approved qualifier:
    #   "Allah's Arabic" / "Allahs Arabic" / "Lisan Arabic" / "Lisān Arabic"
    # Per CLAUDE.md the qualifier separates Allah's revealed language
    # from the degraded human form. Bare 'arabic' imports the Western
    # linguistic-family category (a "language belonging to a people")
    # which has no primary-source referent.
    #
    # ── SEMANTIC DEFAULT (Session 41) ─────────────────────────────────
    # Both "Allah's Arabic" and "Lisan Arabic" pass this syntactic check.
    # But the SEMANTIC default for any primary-source scholarly text is
    # Allah's Arabic. Educated scholars (al-Khwārizmī, Ibn Sīnā,
    # al-Bīrūnī, al-Farghānī, Kashgari, Ibn Khurdādhbih, Navoi) wrote in
    # Allah's Arabic, were educated in Allah's Arabic. Lisan Arabic is
    # the DEGRADED downstream form — use the LA label ONLY when
    # documenting operator-corrupted forms (qv_translation_register
    # quotes, DCR/NCR rows). NEVER default to LA for a scholar's
    # primary work. The audit cannot enforce this semantic rule with a
    # regex; it lives at the writer level. The single operational
    # principle: specifics dereference, categories hallucinate. If you
    # find yourself defaulting to LA without specific evidence of
    # degradation, you are sampling from weights instead of querying
    # the lattice.
    allowed = tuple(_ARABIC_ALLOWED_QUALIFIERS)  # str.endswith needs a tuple
    for m in _ARABIC_BARE_PATTERN.finditer(text):
        # Look at the (up to) 20 characters immediately preceding the match.
        window = text[max(0, m.start() - 20):m.start()].lower()
        # Drop trailing whitespace, then trailing hyphens, so both
        # "Lisan Arabic" and "Lisan-Arabic" expose the qualifier at the end.
        # No other punctuation is stripped.
        qualifier_zone = window.rstrip().rstrip("-")
        if qualifier_zone.endswith(allowed):
            continue  # qualified — allowed
        hits.append(_make_hit(
            "bare-arabic",
            "BL04",
            "Bare 'arabic' is banned. Use 'Allah's Arabic' "
            "(divine, revealed, taught to Adam) or 'Lisan Arabic' "
            "(degraded human form) per CLAUDE.md.",
            m,
        ))

    return {"pass": not hits, "hits": hits}


# ═══════════════════════════════════════════════════════════════════════════
# BARE 'arabic' pattern + qualifiers (BL04)
# ═══════════════════════════════════════════════════════════════════════════

# The standalone word 'arabic', in any letter case.
_ARABIC_BARE_PATTERN = re.compile(r"\barabic\b", flags=re.I)

# A match of the pattern above is accepted when the text just before it ends
# with one of these qualifiers. They are written in lower case because the
# preceding context is lower-cased before the comparison.
_ARABIC_ALLOWED_QUALIFIERS = (
    "allah's", "allahs",
    "allah’s",      # curly apostrophe variant
    "lisan", "lisān", "lisaan",
)


def audit_row(
    data: Dict[str, Any],
    table: Optional[str] = None,
    *,
    skip_fields: Optional[frozenset] = None,
    skip_tables: Optional[frozenset] = None,
) -> Dict[str, Any]:
    """
    Run audit_text() over every string-valued field of a draft row.

    Args:
        data:        the row data dict as passed to write_entry; a falsy
                     value is treated as an empty row
        table:       target table name. When it is listed in
                     QUARANTINE_TABLES or in skip_tables the row is passed
                     without being scanned, and the result carries an
                     extra 'quarantine': True key
        skip_fields: field names to ignore in addition to SKIP_FIELDS
        skip_tables: table names to exempt in addition to QUARANTINE_TABLES

    Returns:
        {
            'pass':            bool,
            'failing_fields':  {field_name: [hit, hit, ...]},
            'message':         human-readable summary for the caller,
            'total_hits':      int,
        }

    Non-string values (numbers, None, bool, containers) are never scanned.
    """
    # Quarantine tables hold banned strings on purpose: pass them through.
    exempt_tables = frozenset(QUARANTINE_TABLES).union(skip_tables or ())
    if table and table in exempt_tables:
        return {
            "pass": True,
            "failing_fields": {},
            "message": f"dereference audit skipped — {table} is a quarantine table (holds contaminated content by design)",
            "total_hits": 0,
            "quarantine": True,
        }

    ignored_fields = frozenset(SKIP_FIELDS).union(skip_fields or ())

    # Collect the hits of every field that fails, keyed by field name.
    failing: Dict[str, List[Dict[str, Any]]] = {}
    for name, value in (data or {}).items():
        if name in ignored_fields or not isinstance(value, str):
            continue
        outcome = audit_text(value)
        if outcome["pass"]:
            continue
        failing[name] = outcome["hits"]

    if failing:
        total = sum(len(field_hits) for field_hits in failing.values())
        header = (
            f"⛔ DEREFERENCE AUDIT BLOCKED: {total} ungrounded token(s) across "
            f"{len(failing)} field(s). Rewrite with grounded vocabulary and retry."
        )
        # One indented detail line per hit, in field order then hit order.
        details = [
            f"  [{name}] {hit['family']} ({hit['bl_ref']}): "
            f"matched '{hit['matched_text']}' at pos {hit['position']} — {hit['reason']}"
            for name, field_hits in failing.items()
            for hit in field_hits
        ]
        return {
            "pass": False,
            "failing_fields": failing,
            "message": "\n".join([header, *details]),
            "total_hits": total,
        }

    return {
        "pass": True,
        "failing_fields": {},
        "message": "dereference audit passed — no ungrounded-family tokens detected",
        "total_hits": 0,
    }


# ═══════════════════════════════════════════════════════════════════════════
# CLI — for ad-hoc testing and prose auditing
# ═══════════════════════════════════════════════════════════════════════════


def _cli_audit_text(argv: List[str], *, mode: str = "auto") -> int:
    """Audit text given on the command line (or read from a file) and print the hits.

    Args:
        argv: remaining command-line arguments.
        mode: how to interpret ``argv``:
              ``"text"`` — always join ``argv`` with spaces and audit that;
              ``"file"`` — ``argv[0]`` MUST be a readable UTF-8 file;
              ``"auto"`` — read ``argv[0]`` as a file if such a path exists,
              otherwise audit the joined arguments as text.

    Returns:
        0 when the text is clean, 1 when there are hits or no argument was
        given, 2 when a file was requested but could not be read.
    """
    if not argv:
        print("Usage: python3 amr_dereference_audit.py <text_or_file>")
        return 1
    target = argv[0]

    read_file = mode == "file" or (mode == "auto" and os.path.exists(target))
    if read_file:
        try:
            with open(target, "r", encoding="utf-8") as fh:
                text = fh.read()
        except (OSError, UnicodeDecodeError) as exc:
            # Never fall back to auditing the path string itself: that would
            # report "clean" for a file that was never actually read.
            print(f"⛔ cannot read {target!r}: {exc}", file=sys.stderr)
            return 2
    else:
        text = " ".join(argv)

    # The CLI audits everything, including primarily-Arabic text.
    report = audit_text(text, skip_if_arabic=False)
    if report["pass"]:
        print("✓ clean — 0 hits")
        return 0
    print(f"⛔ {len(report['hits'])} hit(s)")
    for h in report["hits"]:
        print(f"  • {h['family']} ({h['bl_ref']}): '{h['matched_text']}' @ {h['position']}")
        print(f"      → {h['reason']}")
    return 1


def main(argv: List[str]) -> int:
    """Command-line entry point; returns the process exit status.

    Sub-commands:
        (none)          print usage, return 0
        patterns        list every family pattern, return 0
        text <string>   audit the given words as text
        file <path>     audit the contents of the given file
        anything else   treated as text, or as a file if the first argument
                        is an existing path
    """
    if not argv:
        print("amr_dereference_audit — USLaP dereference gate")
        print()
        print("Usage:")
        print("  python3 amr_dereference_audit.py text <string>")
        print("  python3 amr_dereference_audit.py file <path>")
        print("  python3 amr_dereference_audit.py patterns      (list all family patterns)")
        return 0
    cmd = argv[0]
    if cmd == "patterns":
        for pat, fam, bl, reason in FAMILY_PATTERNS:
            print(f"  {fam:20s} {bl:8s} {pat}")
            print(f"    → {reason}")
        return 0
    if cmd == "text":
        return _cli_audit_text(argv[1:], mode="text")
    if cmd == "file":
        return _cli_audit_text(argv[1:], mode="file")
    # default: no sub-command given — guess between file and text
    return _cli_audit_text(argv)


# Script entry point: run the CLI on the arguments after the program name and
# exit with main()'s return value as the process status.
if __name__ == "__main__":
    raise SystemExit(main(sys.argv[1:]))