#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ amr_dereference_audit.py — Dereference gate for USLaP writes. This is the mathematical form of the rule: Before emitting token T into a DB row, check: ∃ chain C : T → grounded_anchor_on_disk If no such C exists, T is corrosion. Refuse to write the row. Implementation: a deterministic Python function that scans every free-text field of a draft row and rejects the row if any token matches a known ungrounded family pattern. The check runs at the CENTRAL write path (uslap_handler.write_entry) so it applies to every table in the DB. The audit is NOT a banned-terms list. It is a family-pattern check. A banned-terms list is whack-a-mole — it catches 'cosmos' and 'cosmology' but misses 'cosmography' and 'cosmogonic'. The family pattern catches every -cosm*- word at once. The audit does NOT touch arabic script, numeric data, dates, proper source citations, USLaP-internal terminology, or the structural enum values defined by table schemas. It only checks free English prose for known ungrounded-family substrings. Scope: ALL tables routed through uslap_handler.write_entry. Usage from handler: from amr_dereference_audit import audit_row result = audit_row(data, target_table) if not result['pass']: return Usage standalone (for prose): from amr_dereference_audit import audit_text result = audit_text("the cosmographic opening mentions al-falak") # → fails on 'cosmographic' Design constraints: • Deterministic. Pure Python + regex. No weights, no LLM, no heuristics. • Fast. O(|text|) per field via compiled regex. • Introspectable. Returns exact failing tokens and the pattern that matched. • Wide by construction. Uses *family* patterns (cosm*, monk*, priest*, eschatolog*, etc.) so new derivations of the same families are caught without extending a list. • Minimal false positives. Patterns target academic-framework vocabulary classes, not common English. Sourced against contamination_blacklist (BL01-BL37) as of Session 41. """ from __future__ import annotations import json import os import re import sys from typing import Any, Dict, List, Optional, Tuple # ═══════════════════════════════════════════════════════════════════════════ # FAMILY PATTERNS — the structural rule, not a literal list # ═══════════════════════════════════════════════════════════════════════════ # # Each tuple: (pattern, family_name, BL_ref, short_reason). # Patterns match case-insensitively. Word boundaries used where literal # single words would otherwise false-positive on innocuous substrings # (e.g., 'steal' would match 'stealth' — we use \b). # # The rule: if any of these matches any free-text field of a row, the row # is refused. The caller must rewrite the field with grounded vocabulary # (primary-source pointers) and retry. FAMILY_PATTERNS: List[Tuple[str, str, str, str]] = [ # ── BL33: cosmic / cosmos / cosmology / cosmological / cosmogony + WIDER # Covers: cosmos, cosmic, cosmos, cosmology, cosmological, cosmologist, # cosmography, cosmographic, cosmographer, cosmogony, cosmogonic ( r"\bcosm(?:os|ic|olog(?:y|ical|ist|ically)?|ograph(?:y|ic|er|ically)?|" r"ogon(?:y|ic|ical)?)\b", "cosmic-family", "BL33", "Greek κόσμος framework — use al-samawāt wa-al-arḍ / al-khalq / " "al-ʿālamīn / āyāt / al-falak / al-arḍ instead", ), # ── BL34: priest / priests / priesthood / priestly ( r"\bpriest(?:s|hood|ly|esses|ess)?\b", "priest-family", "BL34", "Christian taxonomic import — use the primary-source title " "(imām, khaṭīb, etc.) that the MS actually uses", ), # ── BL35: monk / monks / monastic / monasticism / monastery / monasteries ( r"\bmon(?:k|ks|kish|khood|aster(?:y|ies|ial)|astic(?:ism|ally)?)\b", "monk-family", "BL35", "Christian taxonomic import — use the primary-source title " "(rāhib, zāhid, etc.) that the MS actually uses", ), # ── BL36: eschatological / eschatology ( r"\beschatolog(?:y|ical|ist|ically)?\b", "eschatolog-family", "BL36", "Greek ἔσχατος framework — use ākhirah / yawm al-qiyāma / al-sāʿa", ), # ── BL37: steal / stole / stolen / stealing / theft / thief / thieves / # thievery / thieving ( r"\b(?:steal(?:s|ing)?|stole(?:n)?|theft|thie(?:f|ves|ving|very|vish))\b", "theft-family", "BL37", "Use appropriate / appropriated / appropriation / appropriator / " "misappropriation", ), # ── BL05: PIE / Proto-Indo-European ( r"\b(?:PIE|proto[-\s]?indo[-\s]?european)\b", "pie-family", "BL05", "Phantom reconstruction — no primary source exists for PIE; use " "'phantom PIE' if necessary to name the operator framework", ), # ── BL07, BL16: Semitic (as family/race), Semite ( r"\bsemit(?:e|es|ic|ism|ics)\b", "semitic-family", "BL07/BL16", "Phantom racial/linguistic category — Allah's Arabic is the source, " "Hebrew and Aramaic are downstream degraded forms", ), # ── BL06: loanword / borrowed / cognate (directionality ban) ( r"\bloan[-\s]?word\b", "loanword", "BL06", "Directionality violation — use LINK_TYPE: DIRECT / COMPOUND / " "SAME_ROOT / PHONETIC / SEMANTIC / PREFIX / SUFFIX / ROOT", ), ( r"\bcognate(?:s)?\b", "cognate", "BL06", "Directionality violation — use SAME_ROOT or DIRECT link type", ), ( r"\bborrowed\s+from\b", "borrowed-from", "BL06", "Directionality violation — AA → downstream, never reversed", ), ( r"\bloan\s+from\b", "loan-from", "BL06", "Directionality violation — AA → downstream, never reversed", ), ( r"\badoption\s+from\b", "adoption-from", "BL06", "Directionality violation — AA → downstream, never reversed", ), # ── Nostratic / Altaic / Afro-Asiatic phantom families ( r"\b(?:nostratic|altaic|afro[-\s]?asiatic)\b", "phantom-family", "BL05/BL07", "Phantom super-family reconstruction — no primary source", ), # ── prosthetic vowel (academic framing) ( r"\bprosthetic\s+vowel\b", "prosthetic-vowel", "BL06", "Academic framing — words have direct phonetic chains, not " "'prosthetic' additions", ), ( r"\bpre[-\s]?greek\s+substrate\b", "pre-greek-substrate", "BL06", "Phantom substrate category", ), # ── BL30: zodiac ( r"\bzodiac(?:al)?\b", "zodiac", "BL30", "Greek astrological framing — use al-burūj (Q85:1) or specific " "constellation names in Allah's Arabic", ), # ── BL31: libra (as zodiac sign) ( r"\blibra\b", "libra", "BL31", "Latinate zodiac name — use al-mīzān if referring to the scales " "constellation", ), # ── BL28: hashishin / hashshashin / hashshashin ( r"\bhash[ai]?sh[ai]?sh?in\b", "hashishin", "BL28", "Orientalist calumny — Ismāʿīlī / Nizārī are the primary-source names", ), # ── BL32: Mughal / Mogul (as dynasty label for Muslim Timurids) ( r"\bmughal\b", "mughal", "BL32", "Persianate exonym — use Timurid (al-Tīmūrī) for the dynasty Bābur " "founded in Hind", ), ( r"\bmogul\b", "mogul", "BL32", "Anglicised exonym — use Timurid (al-Tīmūrī)", ), # ── BL26: Theology / Theological ( r"\btheolog(?:y|ical|ian|ically)?\b", "theology", "BL26", "Greek θεός + λόγος framework — use ʿilm al-kalām (the actual " "discipline) or the specific primary-source term", ), # ── BL29: tribal (as anthropological framing) ( r"\btribal(?:ism)?\b", "tribal", "BL29", "Anthropological framing — use qawm / qabīla / banū / āl with " "the specific lineage name", ), # ── PERIODIZATION FAMILY (Session 41 — same error class as BL33) # Western European historiographic frames retrojected onto the Islamic # world. None of these has a primary-source referent in the world # being described. Use AH century, CE century, dynasty name, or # reigning caliph instead. ( r"\bmedieval(?:ly|ist|ism)?\b", "periodization-medieval", "BL-PERIOD", "Western European frame — use AH century, CE century, dynasty " "name (Umayyad / Abbasid / Fāṭimid / etc.), or reigning caliph", ), ( r"\bmiddle\s+ages\b", "periodization-middle-ages", "BL-PERIOD", "Western European frame — use AH/CE century or dynasty name", ), ( r"\blate\s+antiquity\b", "periodization-late-antiquity", "BL-PERIOD", "Western European frame — use the specific dynasty / century AH", ), ( r"\bearly\s+modern\b", "periodization-early-modern", "BL-PERIOD", "Western European frame — use the specific dynasty / century AH", ), ( r"\bpre[-\s]?modern\b", "periodization-pre-modern", "BL-PERIOD", "Defines by what Europe became; use dynasty / century AH", ), ( r"\bpost[-\s]?classical\b", "periodization-post-classical", "BL-PERIOD", "Western frame — use dynasty / century AH", ), ( r"\bdark\s+ages?\b", "periodization-dark-ages", "BL-PERIOD", "Western Eurocentric framing — use dynasty / century", ), ( r"\brenaissance\b", "periodization-renaissance", "BL-PERIOD", "Western European frame — use the specific dynasty / century / " "movement name", ), # ── ERA / AGE / PERIOD framing — the rule above periodization names. # Even "Abbasid era" or "Umayyad period" smuggles a teleological time- # bracket into data that does not belong inside one. Anything in # Allah's framework gets dates (232 AH), names (Caliph al-Wāthiq), # events (the reign of X), but never an "era / age / period" wrapper. # The pattern fires on the noun-form 'era / age / period / times' # when it is preceded by an adjective or proper-name modifier # (e.g. "Abbasid era", "the age of al-Mutawakkil", "Umayyad period"). ( r"\b(?:abbasid|umayyad|fatimid|fāṭimid|seljuq|seljuk|mamluk|mamlūk|" r"ottoman|ayyubid|samanid|sāmānī|ghaznavid|qarakhanid|" r"timurid|tīmūrī|safavid|ṣafavī|qajar|qājārī|" r"rashidun|rāshidūn|caliphal|abbasi|umayyad)\s+(?:era|period|age|times|epoch)\b", "era-frame-named", "BL-PERIOD", "An era-noun bracket imports a teleological time-frame. Use the date " "(N AH / N CE), the reigning caliph (al-Wāthiq, al-Mutawakkil, etc.), " "or 'the reign of X' instead. Drop 'era / period / age / times'.", ), ( r"\b(?:medieval|classical|early|late|pre|post|high|low)\s+(?:era|period|age|times|epoch)\b", "era-frame-temporal", "BL-PERIOD", "Western temporal era-frame. Drop the era-noun; use the specific date " "or named event.", ), ( r"\bthe\s+age\s+of\s+\w+", "age-of-name", "BL-PERIOD", "'The age of X' brackets a person/event into an era teleology. " "Use 'in the year X', 'during the reign of X', or just the date.", ), ( r"\bin\s+(?:those|that)\s+(?:days|times|era|period)\b", "those-days", "BL-PERIOD", "Vague era-bracket. Use a specific date or named event.", ), # ── METAPHOR family (Greek metaphora — literary-analysis category) ── ( r"\bmetaphor(?:s|ic|ical|ically)?\b", "metaphor", "BL-CATEGORY", "Greek μεταφορά framework. Use the primary-source rhetorical " "term: mathal (مَثَل, Qur'anic), istiʿāra (استعارة), tashbīh " "(تشبيه), majāz (مجاز).", ), # ── SCHOLASTIC family (Latin scholasticus — medieval-Western academic) ── ( r"\bscholastic(?:s|ism|ally)?\b", "scholastic", "BL-CATEGORY", "Latin Western academic frame for 12-15c CE European disputation " "tradition. No primary-source referent in Allah's Arabic. Use the " "specific term: ʿilm al-kalām, ḥikma, falsafa, uṣūl al-fiqh, etc.", ), # ── PATRISTIC family (Latin patristica — Western Church-fathers tradition) ── ( r"\bpatristi(?:c|cs|cally)\b", "patristic", "BL-CATEGORY", "Western Christian Church-fathers framework. No primary-source " "referent in the Islamic tradition. Use the specific named " "scholar (Ibn Ḥanbal, al-Bukhārī, etc.) or 'salaf' if context " "permits.", ), ( r"\bpatrolog(?:y|ical|ically)?\b", "patrology", "BL-CATEGORY", "Western Christian fathers studies — no Islamic primary-source " "equivalent.", ), # ── HELLENISTIC family (Greek-philosophy retrojection) ── ( r"\bhellenist(?:ic|ically|ic)?\b", "hellenistic", "BL-CATEGORY", "Greek-philosophy retrojection. The Islamic primary sources " "name the actual influences specifically (Aristū, Aflāṭūn, " "etc.) when they are present at all. Use those names directly, " "not the genus 'Hellenistic'.", ), ( r"\bhellen(?:ism|ize|izing|ization|ist)?\b", "hellenism", "BL-CATEGORY", "Same Greek-philosophy retrojection — name the specific " "person or text instead of using the genus.", ), ] # ═══════════════════════════════════════════════════════════════════════════ # FIELDS TO SKIP — structural enums and primary-source anchors # ═══════════════════════════════════════════════════════════════════════════ # # These are field NAMES whose content is either a structural keyword # (CHECK enum value), a primary-source pointer (file path, shelfmark), # or a numeric/date token. Skipping them avoids false positives on # legitimate grounded content. SKIP_FIELDS = frozenset({ # structural schema enums "entry_type", "divergence_type", "source_ms", "recension", "category", "op_code", "dp_code", # primary-source pointers "ms_folio", "edition_page", "ms_page", "page", "folio", "quf_token", "quf_q", "quf_u", "quf_f", "quf_pass", "quf_date", # IDs "kh_id", "kv_id", "diwan_id", "entry_id", "root_id", "rowid", "intel_id", "bl_id", "dp_id", "sc_id", "ncr_id", "dcr_id", # numeric fields "token_count", "operator_flag", "scribal_interpolation", "persian_wrapper_inserted", "has_proverb", "has_nazm", # timestamps "created_date", "last_updated", }) # ═══════════════════════════════════════════════════════════════════════════ # QUARANTINE TABLES — skipped by the audit by design # ═══════════════════════════════════════════════════════════════════════════ # # These tables exist to HOLD contaminated strings for inspection, flagging, # and downstream re-derivation. Their whole purpose is to quote the operator # glosses, the banned terms, the substituted translations, etc. so they can # be reviewed and reversed. Applying the dereference audit to these tables # would make it impossible to log a contamination finding (since logging # one requires quoting the contaminated content verbatim). # # This list must stay SMALL. Only tables whose content is *documentation # of contamination* belong here — not tables where contamination might # accidentally land. QUARANTINE_TABLES = frozenset({ "contamination_blacklist", # BL01-BL37 — the register itself "qv_contamination_scan", # Qur'anic verse contamination findings "qv_translation_register", # Qur'anic translation contamination register "diwan_contamination_register", # DCR — Kashgari Diwan scribal interpolations "navoi_contamination_register", # NCR — Navoi operator crimes "operator_label_register", # Operator name/title labels (quotes them) "scholar_warnings", # Flagged tertiary/unreliable scholars "attribution_corrections", # Records of attribution errors "db_integrity_log", # Integrity findings (may quote bad content) "corruption_operation_register", # Records of operator operations "dcr_corruption_types", # Corruption type catalogue "interception_register", # Operator interception log "utul_register", # Pattern register of utul "disputed_words", # Words under dispute (by definition contaminated) "qv_contamination_scan", # redundant (kept for clarity) }) # ═══════════════════════════════════════════════════════════════════════════ # ARABIC-SCRIPT DETECTION # ═══════════════════════════════════════════════════════════════════════════ ARABIC_RANGE = re.compile(r"[\u0600-\u06FF\uFB50-\uFEFC\u0750-\u077F]") def _is_primarily_arabic(text: str) -> bool: """True if more than 50% of non-space characters are Arabic script. Arabic script content is primary-source and skipped by the audit. Mixed English/Arabic glosses still get audited (the English portion is the only part that could leak).""" if not text: return False non_space = [c for c in text if not c.isspace()] if not non_space: return False ar_count = sum(1 for c in non_space if ARABIC_RANGE.match(c)) return ar_count * 2 > len(non_space) # ═══════════════════════════════════════════════════════════════════════════ # COMPILED PATTERN BANK # ═══════════════════════════════════════════════════════════════════════════ _COMPILED: List[Tuple[re.Pattern, str, str, str]] = [ (re.compile(pattern, re.IGNORECASE), family, bl_ref, reason) for pattern, family, bl_ref, reason in FAMILY_PATTERNS ] # ═══════════════════════════════════════════════════════════════════════════ # NEIGHBOURHOOD EXPANSION (Session 41) # ═══════════════════════════════════════════════════════════════════════════ # # Loads `amr_banned_neighbourhood.json` from the same directory at import # time. The JSON contains a `banned_words` list — every word in that list # is added to the audit as a single-word regex check (case-insensitive, # word-boundaries on both sides). # # The intent is to expand the FAMILY_PATTERNS coverage to include the # k-nearest-neighbour cluster of each banned seed term in some embedding # space. The bootstrap JSON is hand-curated. The full pipeline that # computes neighbours from a real local embedding model lives in # `amr_neighbourhood_expander.py` and writes the same JSON file. # # If the file is missing, the audit still works — the family-pattern # regexes are unaffected. The neighbourhood layer is purely additive. _NEIGHBOURHOOD_PATH = os.path.join( os.path.dirname(os.path.abspath(__file__)), "amr_banned_neighbourhood.json", ) def _load_neighbourhood_words() -> List[str]: """Load the bootstrap neighbourhood JSON. Returns [] on any failure.""" try: with open(_NEIGHBOURHOOD_PATH, "r", encoding="utf-8") as fh: data = json.load(fh) words = data.get("banned_words", []) if not isinstance(words, list): return [] # Filter out anything that isn't a string and dedupe out = sorted({w.lower() for w in words if isinstance(w, str) and w}) return out except (FileNotFoundError, json.JSONDecodeError, OSError): return [] _NEIGHBOURHOOD_WORDS: List[str] = _load_neighbourhood_words() # Compile each neighbourhood word into a single regex with word boundaries. # Words containing internal hyphens (e.g., "pre-modern") need the hyphen # escaped and word-boundaries that recognise the hyphen as a separator. def _compile_neighbourhood_pattern(word: str) -> re.Pattern: # re.escape handles the hyphen safely return re.compile(r"\b" + re.escape(word) + r"\b", re.IGNORECASE) _NEIGHBOURHOOD_COMPILED: List[Tuple[re.Pattern, str]] = [ (_compile_neighbourhood_pattern(w), w) for w in _NEIGHBOURHOOD_WORDS ] def neighbourhood_count() -> int: """How many neighbourhood words are loaded (for diagnostics).""" return len(_NEIGHBOURHOOD_WORDS) # ═══════════════════════════════════════════════════════════════════════════ # CORE CHECKS # ═══════════════════════════════════════════════════════════════════════════ def audit_text(text: str, *, skip_if_arabic: bool = True) -> Dict[str, Any]: """ Check a single free-text string against the family patterns. Args: text: the string to check skip_if_arabic: if True, skip strings that are primarily Arabic script (primary-source content is not audited) Returns: { 'pass': bool, 'hits': [ {'family': str, 'bl_ref': str, 'reason': str, 'matched_text': str, 'position': int}, ... ], } """ if not text or not isinstance(text, str): return {"pass": True, "hits": []} if skip_if_arabic and _is_primarily_arabic(text): return {"pass": True, "hits": []} hits: List[Dict[str, Any]] = [] # ── HEBREW CHARACTER BLOCK — ABSOLUTE (Session 46) ──────────────── # Hebrew (U+0590-U+05FF) has NOTHING in common with AA. # Weights inject Hebrew alongside "abjad" — block at character level. import re as _re_heb _heb_matches = list(_re_heb.finditer(r'[\u0590-\u05FF]+', text)) for m in _heb_matches: hits.append({ "family": "hebrew_contamination", "bl_ref": "BL-HEB", "reason": ( "Hebrew character(s) detected. AA abjad uses ARABIC letters " "only. Hebrew is a manufactured attachment — BLOCK." ), "matched_text": m.group(0), "position": m.start(), }) for pat, family, bl_ref, reason in _COMPILED: for m in pat.finditer(text): hits.append({ "family": family, "bl_ref": bl_ref, "reason": reason, "matched_text": m.group(0), "position": m.start(), }) # ── NEIGHBOURHOOD EXPANSION (Session 41) ───────────────────────── # Each word in `amr_banned_neighbourhood.json` is checked as a # standalone token. Hits are tagged "neighbourhood" so the caller # can distinguish them from family-pattern hits if needed. # Words already caught by a family regex above are not re-reported # (deduplication on (position, matched_text)). _seen_positions = {(h["position"], h["matched_text"].lower()) for h in hits} for pat, word in _NEIGHBOURHOOD_COMPILED: for m in pat.finditer(text): key = (m.start(), m.group(0).lower()) if key in _seen_positions: continue _seen_positions.add(key) hits.append({ "family": "neighbourhood", "bl_ref": "BL-NBH", "reason": ( f"'{word}' is in the banned-neighbourhood expansion of a " f"BL family seed term. Loaded from " f"amr_banned_neighbourhood.json." ), "matched_text": m.group(0), "position": m.start(), }) # ── BL04: bare 'arabic' (context-aware) ────────────────────────── # The word 'arabic' / 'Arabic' / 'ARABIC' is banned UNLESS preceded # immediately by an approved qualifier: # "Allah's Arabic" / "Allahs Arabic" / "Lisan Arabic" / "Lisān Arabic" # Per CLAUDE.md the qualifier separates Allah's revealed language # from the degraded human form. Bare 'arabic' imports the Western # linguistic-family category (a "language belonging to a people") # which has no primary-source referent. # # ── SEMANTIC DEFAULT (Session 41) ───────────────────────────────── # Both "Allah's Arabic" and "Lisan Arabic" pass this syntactic check. # But the SEMANTIC default for any primary-source scholarly text is # Allah's Arabic. Educated scholars (al-Khwārizmī, Ibn Sīnā, # al-Bīrūnī, al-Farghānī, Kashgari, Ibn Khurdādhbih, Navoi) wrote in # Allah's Arabic, were educated in Allah's Arabic. Lisan Arabic is # the DEGRADED downstream form — use the LA label ONLY when # documenting operator-corrupted forms (qv_translation_register # quotes, DCR/NCR rows). NEVER default to LA for a scholar's # primary work. The audit cannot enforce this semantic rule with a # regex; it lives at the writer level. The single operational # principle: specifics dereference, categories hallucinate. If you # find yourself defaulting to LA without specific evidence of # degradation, you are sampling from weights instead of querying # the lattice. for m in _ARABIC_BARE_PATTERN.finditer(text): # Look at the ~20 characters immediately preceding the match start = max(0, m.start() - 20) prefix_context = text[start:m.start()].lower() # Strip trailing whitespace/punctuation for the qualifier check prefix_stripped = prefix_context.rstrip().rstrip("-") if any(prefix_stripped.endswith(q) for q in _ARABIC_ALLOWED_QUALIFIERS): continue # qualified — allowed hits.append({ "family": "bare-arabic", "bl_ref": "BL04", "reason": "Bare 'arabic' is banned. Use 'Allah's Arabic' " "(divine, revealed, taught to Adam) or 'Lisan Arabic' " "(degraded human form) per CLAUDE.md.", "matched_text": m.group(0), "position": m.start(), }) return {"pass": len(hits) == 0, "hits": hits} # ═══════════════════════════════════════════════════════════════════════════ # BARE 'arabic' pattern + qualifiers (BL04) # ═══════════════════════════════════════════════════════════════════════════ _ARABIC_BARE_PATTERN = re.compile(r"\barabic\b", re.IGNORECASE) # Qualifiers that, when they immediately precede 'Arabic', make the # usage allowed. Each qualifier is matched as the END of the preceding # context (case-insensitive, after stripping trailing whitespace). _ARABIC_ALLOWED_QUALIFIERS = ( "allah's", "allahs", "allah’s", # curly apostrophe variant "lisan", "lisān", "lisaan", ) def audit_row( data: Dict[str, Any], table: Optional[str] = None, *, skip_fields: Optional[frozenset] = None, skip_tables: Optional[frozenset] = None, ) -> Dict[str, Any]: """ Check every free-text field of a draft row against the family patterns. Args: data: the row data dict as passed to write_entry table: target table name; if in QUARANTINE_TABLES or the caller-supplied skip_tables, the audit passes through unchanged (those tables hold contamination BY DESIGN) skip_fields: additional field names to skip beyond the default SKIP_FIELDS set skip_tables: additional tables to skip beyond QUARANTINE_TABLES Returns: { 'pass': bool, 'failing_fields': {field_name: [hit, hit, ...]}, 'message': human-readable summary for the caller, 'total_hits': int, } """ # ── QUARANTINE SKIP ─────────────────────────────────────────────── # Tables that exist to document/quote contamination pass through # the audit unchanged. They NEED to hold the banned strings. effective_skip_tables = set(QUARANTINE_TABLES) if skip_tables: effective_skip_tables.update(skip_tables) if table and table in effective_skip_tables: return { "pass": True, "failing_fields": {}, "message": f"dereference audit skipped — {table} is a quarantine table (holds contaminated content by design)", "total_hits": 0, "quarantine": True, } effective_skip = set(SKIP_FIELDS) if skip_fields: effective_skip.update(skip_fields) failing: Dict[str, List[Dict[str, Any]]] = {} total = 0 for field, value in (data or {}).items(): if field in effective_skip: continue if not isinstance(value, str): # numbers, None, bool, etc. — nothing to scan continue result = audit_text(value) if not result["pass"]: failing[field] = result["hits"] total += len(result["hits"]) if not failing: return { "pass": True, "failing_fields": {}, "message": "dereference audit passed — no ungrounded-family tokens detected", "total_hits": 0, } # Build a concise human-readable summary lines = [f"⛔ DEREFERENCE AUDIT BLOCKED: {total} ungrounded token(s) across " f"{len(failing)} field(s). Rewrite with grounded vocabulary and retry."] for field, hits in failing.items(): for h in hits: lines.append( f" [{field}] {h['family']} ({h['bl_ref']}): " f"matched '{h['matched_text']}' at pos {h['position']} — {h['reason']}" ) return { "pass": False, "failing_fields": failing, "message": "\n".join(lines), "total_hits": total, } # ═══════════════════════════════════════════════════════════════════════════ # CLI — for ad-hoc testing and prose auditing # ═══════════════════════════════════════════════════════════════════════════ def _cli_audit_text(argv: List[str]) -> int: if not argv: print("Usage: python3 amr_dereference_audit.py ") return 1 arg = argv[0] import os if os.path.exists(arg): with open(arg, "r", encoding="utf-8") as f: text = f.read() else: text = " ".join(argv) r = audit_text(text, skip_if_arabic=False) if r["pass"]: print("✓ clean — 0 hits") return 0 print(f"⛔ {len(r['hits'])} hit(s)") for h in r["hits"]: print(f" • {h['family']} ({h['bl_ref']}): '{h['matched_text']}' @ {h['position']}") print(f" → {h['reason']}") return 1 def main(argv: List[str]) -> int: if not argv: print("amr_dereference_audit — USLaP dereference gate") print() print("Usage:") print(" python3 amr_dereference_audit.py text ") print(" python3 amr_dereference_audit.py file ") print(" python3 amr_dereference_audit.py patterns (list all family patterns)") return 0 cmd = argv[0] if cmd == "patterns": for pat, fam, bl, reason in FAMILY_PATTERNS: print(f" {fam:20s} {bl:8s} {pat}") print(f" → {reason}") return 0 if cmd == "text": return _cli_audit_text(argv[1:]) if cmd == "file": return _cli_audit_text(argv[1:]) # default: treat argv as text return _cli_audit_text(argv) if __name__ == "__main__": raise SystemExit(main(sys.argv[1:]))