Spaces:

uslap
/

uslap-query

Sleeping

App Files Files Community

uslap-query / Code_files /amr_dereference_audit.py

uslap

Upload folder using huggingface_hub

7cc8e29 verified 3 months ago

Raw

History Blame Contribute Delete

34.9 kB

	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-
	"""
	amr_dereference_audit.py — Dereference gate for USLaP writes.

	This is the mathematical form of the rule:

	Before emitting token T into a DB row, check:
	∃ chain C : T → grounded_anchor_on_disk
	If no such C exists, T is corrosion. Refuse to write the row.

	Implementation: a deterministic Python function that scans every free-text
	field of a draft row and rejects the row if any token matches a known
	ungrounded family pattern. The check runs at the CENTRAL write path
	(uslap_handler.write_entry) so it applies to every table in the DB.

	The audit is NOT a banned-terms list. It is a family-pattern check. A
	banned-terms list is whack-a-mole — it catches 'cosmos' and 'cosmology'
	but misses 'cosmography' and 'cosmogonic'. The family pattern catches
	every -cosm*- word at once.

	The audit does NOT touch arabic script, numeric data, dates, proper
	source citations, USLaP-internal terminology, or the structural enum
	values defined by table schemas. It only checks free English prose
	for known ungrounded-family substrings.

	Scope: ALL tables routed through uslap_handler.write_entry.

	Usage from handler:
	from amr_dereference_audit import audit_row
	result = audit_row(data, target_table)
	if not result['pass']:
	return <block write with result['message']>

	Usage standalone (for prose):
	from amr_dereference_audit import audit_text
	result = audit_text("the cosmographic opening mentions al-falak")
	# → fails on 'cosmographic'

	Design constraints:
	• Deterministic. Pure Python + regex. No weights, no LLM, no heuristics.
	• Fast. O(|text|) per field via compiled regex.
	• Introspectable. Returns exact failing tokens and the pattern that matched.
	• Wide by construction. Uses family patterns (cosm*, monk*, priest*,
	eschatolog*, etc.) so new derivations of the same families are caught
	without extending a list.
	• Minimal false positives. Patterns target academic-framework vocabulary
	classes, not common English.

	Sourced against contamination_blacklist (BL01-BL37) as of Session 41.
	"""

	from __future__ import annotations

	import json
	import os
	import re
	import sys
	from typing import Any, Dict, List, Optional, Tuple


	# ═══════════════════════════════════════════════════════════════════════════
	# FAMILY PATTERNS — the structural rule, not a literal list
	# ═══════════════════════════════════════════════════════════════════════════
	#
	# Each tuple: (pattern, family_name, BL_ref, short_reason).
	# Patterns match case-insensitively. Word boundaries used where literal
	# single words would otherwise false-positive on innocuous substrings
	# (e.g., 'steal' would match 'stealth' — we use \b).
	#
	# The rule: if any of these matches any free-text field of a row, the row
	# is refused. The caller must rewrite the field with grounded vocabulary
	# (primary-source pointers) and retry.

	# Each entry is (regex_source, family_name, BL_ref, short_reason). The regex
	# sources are compiled case-insensitively elsewhere in this module.
	#
	# NOTE: alternation inside every pattern must be a bare `|`. An escaped `\|`
	# is a literal pipe character in a regex and would silently disable the
	# whole family (the pattern would only match text containing '|').
	FAMILY_PATTERNS: List[Tuple[str, str, str, str]] = [
	    # ── BL33: cosmic / cosmos / cosmology / cosmological / cosmogony + WIDER
	    # Covers: cosmos, cosmic, cosmology, cosmological, cosmologist,
	    # cosmography, cosmographic, cosmographer, cosmogony, cosmogonic
	    (
	        r"\bcosm(?:os|ic|olog(?:y|ical|ist|ically)?|ograph(?:y|ic|er|ically)?|"
	        r"ogon(?:y|ic|ical)?)\b",
	        "cosmic-family",
	        "BL33",
	        "Greek κόσμος framework — use al-samawāt wa-al-arḍ / al-khalq / "
	        "al-ʿālamīn / āyāt / al-falak / al-arḍ instead",
	    ),
	    # ── BL34: priest / priests / priesthood / priestly
	    (
	        r"\bpriest(?:s|hood|ly|esses|ess)?\b",
	        "priest-family",
	        "BL34",
	        "Christian taxonomic import — use the primary-source title "
	        "(imām, khaṭīb, etc.) that the MS actually uses",
	    ),
	    # ── BL35: monk / monks / monastic / monasticism / monastery / monasteries
	    (
	        r"\bmon(?:k|ks|kish|khood|aster(?:y|ies|ial)|astic(?:ism|ally)?)\b",
	        "monk-family",
	        "BL35",
	        "Christian taxonomic import — use the primary-source title "
	        "(rāhib, zāhid, etc.) that the MS actually uses",
	    ),
	    # ── BL36: eschatological / eschatology
	    (
	        r"\beschatolog(?:y|ical|ist|ically)?\b",
	        "eschatolog-family",
	        "BL36",
	        "Greek ἔσχατος framework — use ākhirah / yawm al-qiyāma / al-sāʿa",
	    ),
	    # ── BL37: steal / stole / stolen / stealing / theft / thief / thieves /
	    # thievery / thieving
	    (
	        r"\b(?:steal(?:s|ing)?|stole(?:n)?|theft|thie(?:f|ves|ving|very|vish))\b",
	        "theft-family",
	        "BL37",
	        "Use appropriate / appropriated / appropriation / appropriator / "
	        "misappropriation",
	    ),
	    # ── BL05: PIE / Proto-Indo-European
	    (
	        r"\b(?:PIE|proto[-\s]?indo[-\s]?european)\b",
	        "pie-family",
	        "BL05",
	        "Phantom reconstruction — no primary source exists for PIE; use "
	        "'phantom PIE' if necessary to name the operator framework",
	    ),
	    # ── BL07, BL16: Semitic (as family/race), Semite
	    (
	        r"\bsemit(?:e|es|ic|ism|ics)\b",
	        "semitic-family",
	        "BL07/BL16",
	        "Phantom racial/linguistic category — Allah's Arabic is the source, "
	        "Hebrew and Aramaic are downstream degraded forms",
	    ),
	    # ── BL06: loanword / borrowed / cognate (directionality ban)
	    (
	        r"\bloan[-\s]?word\b",
	        "loanword",
	        "BL06",
	        "Directionality violation — use LINK_TYPE: DIRECT / COMPOUND / "
	        "SAME_ROOT / PHONETIC / SEMANTIC / PREFIX / SUFFIX / ROOT",
	    ),
	    (
	        r"\bcognate(?:s)?\b",
	        "cognate",
	        "BL06",
	        "Directionality violation — use SAME_ROOT or DIRECT link type",
	    ),
	    (
	        r"\bborrowed\s+from\b",
	        "borrowed-from",
	        "BL06",
	        "Directionality violation — AA → downstream, never reversed",
	    ),
	    (
	        r"\bloan\s+from\b",
	        "loan-from",
	        "BL06",
	        "Directionality violation — AA → downstream, never reversed",
	    ),
	    (
	        r"\badoption\s+from\b",
	        "adoption-from",
	        "BL06",
	        "Directionality violation — AA → downstream, never reversed",
	    ),
	    # ── Nostratic / Altaic / Afro-Asiatic phantom families
	    (
	        r"\b(?:nostratic|altaic|afro[-\s]?asiatic)\b",
	        "phantom-family",
	        "BL05/BL07",
	        "Phantom super-family reconstruction — no primary source",
	    ),
	    # ── prosthetic vowel (academic framing)
	    (
	        r"\bprosthetic\s+vowel\b",
	        "prosthetic-vowel",
	        "BL06",
	        "Academic framing — words have direct phonetic chains, not "
	        "'prosthetic' additions",
	    ),
	    (
	        r"\bpre[-\s]?greek\s+substrate\b",
	        "pre-greek-substrate",
	        "BL06",
	        "Phantom substrate category",
	    ),
	    # ── BL30: zodiac
	    (
	        r"\bzodiac(?:al)?\b",
	        "zodiac",
	        "BL30",
	        "Greek astrological framing — use al-burūj (Q85:1) or specific "
	        "constellation names in Allah's Arabic",
	    ),
	    # ── BL31: libra (as zodiac sign)
	    (
	        r"\blibra\b",
	        "libra",
	        "BL31",
	        "Latinate zodiac name — use al-mīzān if referring to the scales "
	        "constellation",
	    ),
	    # ── BL28: hashishin / hashashin / hashshashin
	    (
	        r"\bhash[ai]?sh[ai]?sh?in\b",
	        "hashishin",
	        "BL28",
	        "Orientalist calumny — Ismāʿīlī / Nizārī are the primary-source names",
	    ),
	    # ── BL32: Mughal / Mogul (as dynasty label for Muslim Timurids)
	    (
	        r"\bmughal\b",
	        "mughal",
	        "BL32",
	        "Persianate exonym — use Timurid (al-Tīmūrī) for the dynasty Bābur "
	        "founded in Hind",
	    ),
	    (
	        r"\bmogul\b",
	        "mogul",
	        "BL32",
	        "Anglicised exonym — use Timurid (al-Tīmūrī)",
	    ),
	    # ── BL26: Theology / Theological
	    (
	        r"\btheolog(?:y|ical|ian|ically)?\b",
	        "theology",
	        "BL26",
	        "Greek θεός + λόγος framework — use ʿilm al-kalām (the actual "
	        "discipline) or the specific primary-source term",
	    ),
	    # ── BL29: tribal (as anthropological framing)
	    (
	        r"\btribal(?:ism)?\b",
	        "tribal",
	        "BL29",
	        "Anthropological framing — use qawm / qabīla / banū / āl with "
	        "the specific lineage name",
	    ),
	    # ── PERIODIZATION FAMILY (Session 41 — same error class as BL33)
	    # Western European historiographic frames retrojected onto the Islamic
	    # world. None of these has a primary-source referent in the world
	    # being described. Use AH century, CE century, dynasty name, or
	    # reigning caliph instead.
	    (
	        r"\bmedieval(?:ly|ist|ism)?\b",
	        "periodization-medieval",
	        "BL-PERIOD",
	        "Western European frame — use AH century, CE century, dynasty "
	        "name (Umayyad / Abbasid / Fāṭimid / etc.), or reigning caliph",
	    ),
	    (
	        r"\bmiddle\s+ages\b",
	        "periodization-middle-ages",
	        "BL-PERIOD",
	        "Western European frame — use AH/CE century or dynasty name",
	    ),
	    (
	        r"\blate\s+antiquity\b",
	        "periodization-late-antiquity",
	        "BL-PERIOD",
	        "Western European frame — use the specific dynasty / century AH",
	    ),
	    (
	        r"\bearly\s+modern\b",
	        "periodization-early-modern",
	        "BL-PERIOD",
	        "Western European frame — use the specific dynasty / century AH",
	    ),
	    (
	        r"\bpre[-\s]?modern\b",
	        "periodization-pre-modern",
	        "BL-PERIOD",
	        "Defines by what Europe became; use dynasty / century AH",
	    ),
	    (
	        r"\bpost[-\s]?classical\b",
	        "periodization-post-classical",
	        "BL-PERIOD",
	        "Western frame — use dynasty / century AH",
	    ),
	    (
	        r"\bdark\s+ages?\b",
	        "periodization-dark-ages",
	        "BL-PERIOD",
	        "Western Eurocentric framing — use dynasty / century",
	    ),
	    (
	        r"\brenaissance\b",
	        "periodization-renaissance",
	        "BL-PERIOD",
	        "Western European frame — use the specific dynasty / century / "
	        "movement name",
	    ),
	    # ── ERA / AGE / PERIOD framing — the rule above periodization names.
	    # Even "Abbasid era" or "Umayyad period" smuggles a teleological time-
	    # bracket into data that does not belong inside one. Anything in
	    # Allah's framework gets dates (232 AH), names (Caliph al-Wāthiq),
	    # events (the reign of X), but never an "era / age / period" wrapper.
	    # The pattern fires on the noun-form 'era / age / period / times'
	    # when it is preceded by an adjective or proper-name modifier
	    # (e.g. "Abbasid era", "the age of al-Mutawakkil", "Umayyad period").
	    (
	        r"\b(?:abbasid|umayyad|fatimid|fāṭimid|seljuq|seljuk|mamluk|mamlūk|"
	        r"ottoman|ayyubid|samanid|sāmānī|ghaznavid|qarakhanid|"
	        r"timurid|tīmūrī|safavid|ṣafavī|qajar|qājārī|"
	        r"rashidun|rāshidūn|caliphal|abbasi)\s+(?:era|period|age|times|epoch)\b",
	        "era-frame-named",
	        "BL-PERIOD",
	        "An era-noun bracket imports a teleological time-frame. Use the date "
	        "(N AH / N CE), the reigning caliph (al-Wāthiq, al-Mutawakkil, etc.), "
	        "or 'the reign of X' instead. Drop 'era / period / age / times'.",
	    ),
	    (
	        r"\b(?:medieval|classical|early|late|pre|post|high|low)\s+(?:era|period|age|times|epoch)\b",
	        "era-frame-temporal",
	        "BL-PERIOD",
	        "Western temporal era-frame. Drop the era-noun; use the specific date "
	        "or named event.",
	    ),
	    (
	        r"\bthe\s+age\s+of\s+\w+",
	        "age-of-name",
	        "BL-PERIOD",
	        "'The age of X' brackets a person/event into an era teleology. "
	        "Use 'in the year X', 'during the reign of X', or just the date.",
	    ),
	    (
	        r"\bin\s+(?:those|that)\s+(?:days|times|era|period)\b",
	        "those-days",
	        "BL-PERIOD",
	        "Vague era-bracket. Use a specific date or named event.",
	    ),
	    # ── METAPHOR family (Greek metaphora — literary-analysis category) ──
	    (
	        r"\bmetaphor(?:s|ic|ical|ically)?\b",
	        "metaphor",
	        "BL-CATEGORY",
	        "Greek μεταφορά framework. Use the primary-source rhetorical "
	        "term: mathal (مَثَل, Qur'anic), istiʿāra (استعارة), tashbīh "
	        "(تشبيه), majāz (مجاز).",
	    ),
	    # ── SCHOLASTIC family (Latin scholasticus — medieval-Western academic) ──
	    (
	        r"\bscholastic(?:s|ism|ally)?\b",
	        "scholastic",
	        "BL-CATEGORY",
	        "Latin Western academic frame for 12-15c CE European disputation "
	        "tradition. No primary-source referent in Allah's Arabic. Use the "
	        "specific term: ʿilm al-kalām, ḥikma, falsafa, uṣūl al-fiqh, etc.",
	    ),
	    # ── PATRISTIC family (Latin patristica — Western Church-fathers tradition) ──
	    (
	        r"\bpatristi(?:c|cs|cally)\b",
	        "patristic",
	        "BL-CATEGORY",
	        "Western Christian Church-fathers framework. No primary-source "
	        "referent in the Islamic tradition. Use the specific named "
	        "scholar (Ibn Ḥanbal, al-Bukhārī, etc.) or 'salaf' if context "
	        "permits.",
	    ),
	    (
	        r"\bpatrolog(?:y|ical|ically)?\b",
	        "patrology",
	        "BL-CATEGORY",
	        "Western Christian fathers studies — no Islamic primary-source "
	        "equivalent.",
	    ),
	    # ── HELLENISTIC family (Greek-philosophy retrojection) ──
	    # Owns 'hellenist' and its -ic / -ically derivations.
	    (
	        r"\bhellenist(?:ic|ically)?\b",
	        "hellenistic",
	        "BL-CATEGORY",
	        "Greek-philosophy retrojection. The Islamic primary sources "
	        "name the actual influences specifically (Aristū, Aflāṭūn, "
	        "etc.) when they are present at all. Use those names directly, "
	        "not the genus 'Hellenistic'.",
	    ),
	    # 'hellenist' is deliberately absent here: the entry above already
	    # matches it, and listing it twice reports the same token twice.
	    (
	        r"\bhellen(?:ism|ize|izing|ization)?\b",
	        "hellenism",
	        "BL-CATEGORY",
	        "Same Greek-philosophy retrojection — name the specific "
	        "person or text instead of using the genus.",
	    ),
	]


	# ═══════════════════════════════════════════════════════════════════════════
	# FIELDS TO SKIP — structural enums and primary-source anchors
	# ═══════════════════════════════════════════════════════════════════════════
	#
	# These are field NAMES whose content is either a structural keyword
	# (CHECK enum value), a primary-source pointer (file path, shelfmark),
	# or a numeric/date token. Skipping them avoids false positives on
	# legitimate grounded content.

	# Names of row fields that the audit never scans. The groups below follow the
	# section comment above: schema enum values, primary-source pointers,
	# identifiers, numeric flags/counters, and timestamps.
	SKIP_FIELDS = frozenset([
	    # -- structural schema enums
	    "entry_type",
	    "divergence_type",
	    "source_ms",
	    "recension",
	    "category",
	    "op_code",
	    "dp_code",
	    # -- primary-source pointers
	    "ms_folio",
	    "edition_page",
	    "ms_page",
	    "page",
	    "folio",
	    "quf_token",
	    "quf_q",
	    "quf_u",
	    "quf_f",
	    "quf_pass",
	    "quf_date",
	    # -- IDs
	    "kh_id",
	    "kv_id",
	    "diwan_id",
	    "entry_id",
	    "root_id",
	    "rowid",
	    "intel_id",
	    "bl_id",
	    "dp_id",
	    "sc_id",
	    "ncr_id",
	    "dcr_id",
	    # -- numeric fields
	    "token_count",
	    "operator_flag",
	    "scribal_interpolation",
	    "persian_wrapper_inserted",
	    "has_proverb",
	    "has_nazm",
	    # -- timestamps
	    "created_date",
	    "last_updated",
	])


	# ═══════════════════════════════════════════════════════════════════════════
	# QUARANTINE TABLES — skipped by the audit by design
	# ═══════════════════════════════════════════════════════════════════════════
	#
	# These tables exist to HOLD contaminated strings for inspection, flagging,
	# and downstream re-derivation. Their whole purpose is to quote the operator
	# glosses, the banned terms, the substituted translations, etc. so they can
	# be reviewed and reversed. Applying the dereference audit to these tables
	# would make it impossible to log a contamination finding (since logging
	# one requires quoting the contaminated content verbatim).
	#
	# This list must stay SMALL. Only tables whose content is *documentation
	# of contamination* belong here — not tables where contamination might
	# accidentally land.

	# Tables exempt from the audit because their rows exist to quote contaminated
	# content. Stored as a frozenset, so a name listed twice would collapse to a
	# single member anyway; each table therefore appears exactly once here.
	QUARANTINE_TABLES = frozenset((
	    # the register itself (BL01-BL37)
	    "contamination_blacklist",
	    # Qur'anic verse contamination findings
	    "qv_contamination_scan",
	    # Qur'anic translation contamination register
	    "qv_translation_register",
	    # DCR — Kashgari Diwan scribal interpolations
	    "diwan_contamination_register",
	    # NCR — Navoi operator crimes
	    "navoi_contamination_register",
	    # operator name/title labels (quotes them)
	    "operator_label_register",
	    # flagged tertiary/unreliable scholars
	    "scholar_warnings",
	    # records of attribution errors
	    "attribution_corrections",
	    # integrity findings (may quote bad content)
	    "db_integrity_log",
	    # records of operator operations
	    "corruption_operation_register",
	    # corruption type catalogue
	    "dcr_corruption_types",
	    # operator interception log
	    "interception_register",
	    # pattern register of utul
	    "utul_register",
	    # words under dispute (by definition contaminated)
	    "disputed_words",
	))


	# ═══════════════════════════════════════════════════════════════════════════
	# ARABIC-SCRIPT DETECTION
	# ═══════════════════════════════════════════════════════════════════════════

	# Single-character class for Arabic script: Arabic (U+0600-U+06FF), Arabic
	# Supplement (U+0750-U+077F), Arabic Presentation Forms-A (U+FB50-U+FDFF) and
	# Arabic Presentation Forms-B (U+FE70-U+FEFC).
	#
	# The two presentation-form blocks are listed separately on purpose: one
	# contiguous U+FB50-U+FEFC range would also include U+FE00-U+FE6F (variation
	# selectors, combining half marks, CJK compatibility forms, small form
	# variants), letting non-Arabic text be classed as Arabic and skip the audit.
	ARABIC_RANGE = re.compile(
	    r"[\u0600-\u06FF\u0750-\u077F\uFB50-\uFDFF\uFE70-\uFEFC]"
	)


	def _is_primarily_arabic(text: str) -> bool:
	    """Return True if more than 50% of non-space characters are Arabic script.

	    Arabic script content is primary-source and skipped by the audit.
	    Mixed English/Arabic glosses still get audited (the English portion is
	    the only part that could leak).

	    Args:
	        text: the string to classify; empty or whitespace-only input
	            returns False.
	    """
	    if not text:
	        return False
	    visible = [ch for ch in text if not ch.isspace()]
	    if not visible:
	        return False
	    arabic_total = sum(1 for ch in visible if ARABIC_RANGE.match(ch))
	    # Strict majority: exactly half Arabic is NOT "primarily Arabic".
	    return arabic_total * 2 > len(visible)


	# ═══════════════════════════════════════════════════════════════════════════
	# COMPILED PATTERN BANK
	# ═══════════════════════════════════════════════════════════════════════════

	# Compiled form of FAMILY_PATTERNS, built once when the module is imported.
	# Each entry keeps the family name, BL reference and reason next to its
	# case-insensitive compiled regex.
	_COMPILED: List[Tuple[re.Pattern, str, str, str]] = [
	    (
	        re.compile(regex_source, flags=re.IGNORECASE),
	        family_name,
	        blacklist_ref,
	        explanation,
	    )
	    for regex_source, family_name, blacklist_ref, explanation in FAMILY_PATTERNS
	]


	# ═══════════════════════════════════════════════════════════════════════════
	# NEIGHBOURHOOD EXPANSION (Session 41)
	# ═══════════════════════════════════════════════════════════════════════════
	#
	# Loads `amr_banned_neighbourhood.json` from the same directory at import
	# time. The JSON contains a `banned_words` list — every word in that list
	# is added to the audit as a single-word regex check (case-insensitive,
	# word-boundaries on both sides).
	#
	# The intent is to expand the FAMILY_PATTERNS coverage to include the
	# k-nearest-neighbour cluster of each banned seed term in some embedding
	# space. The bootstrap JSON is hand-curated. The full pipeline that
	# computes neighbours from a real local embedding model lives in
	# `amr_neighbourhood_expander.py` and writes the same JSON file.
	#
	# If the file is missing, the audit still works — the family-pattern
	# regexes are unaffected. The neighbourhood layer is purely additive.

	# The neighbourhood JSON is looked up next to this module file.
	_NEIGHBOURHOOD_PATH = os.path.join(
	    os.path.dirname(os.path.abspath(__file__)),
	    "amr_banned_neighbourhood.json",
	)


	def _load_neighbourhood_words() -> List[str]:
	    """Load the bootstrap neighbourhood JSON. Returns [] on any failure.

	    "Failure" covers: file missing or unreadable, bytes that are not valid
	    UTF-8, invalid JSON, a top-level value that is not an object, and a
	    `banned_words` value that is not a list. This runs at import time, so
	    it must never raise — a broken side-car file must not take the whole
	    audit module down.

	    Returns:
	        The sorted, de-duplicated, lower-cased, whitespace-trimmed string
	        entries of `banned_words`. Non-string and blank entries are dropped.
	    """
	    try:
	        with open(_NEIGHBOURHOOD_PATH, "r", encoding="utf-8") as fh:
	            payload = json.load(fh)
	    except (OSError, ValueError):
	        # OSError covers FileNotFoundError / permission problems; ValueError
	        # covers both json.JSONDecodeError and UnicodeDecodeError.
	        return []
	    if not isinstance(payload, dict):
	        # e.g. the file holds a bare JSON list — there is no .get() to call.
	        return []
	    words = payload.get("banned_words", [])
	    if not isinstance(words, list):
	        return []
	    # Trim before lower-casing so padded entries such as "foo " dedupe with
	    # "foo" and whitespace-only entries never become a pattern.
	    return sorted({
	        w.strip().lower()
	        for w in words
	        if isinstance(w, str) and w.strip()
	    })


	_NEIGHBOURHOOD_WORDS: List[str] = _load_neighbourhood_words()

	# Compile one neighbourhood word into a case-insensitive "standalone token"
	# regex. The word is passed through re.escape, so hyphens and any other
	# regex metacharacters are matched literally.
	def _compile_neighbourhood_pattern(word: str) -> re.Pattern:
	    """Return a compiled regex matching `word` as a standalone token.

	    The token edges are expressed as `(?<!\\w)` / `(?!\\w)` lookarounds
	    rather than `\\b`. For a word that starts and ends with a word
	    character the two forms are equivalent. `\\b`, however, requires a
	    word/non-word transition, so a banned entry that begins or ends with
	    punctuation (e.g. "-ism") could never match when standing alone.

	    Args:
	        word: the literal banned word or phrase.

	    Returns:
	        A compiled, case-insensitive pattern.
	    """
	    return re.compile(r"(?<!\w)" + re.escape(word) + r"(?!\w)", re.IGNORECASE)


	# (compiled_pattern, original_word) pairs, one per loaded neighbourhood word,
	# in the same order as _NEIGHBOURHOOD_WORDS.
	_NEIGHBOURHOOD_COMPILED: List[Tuple[re.Pattern, str]] = list(
	    zip(
	        map(_compile_neighbourhood_pattern, _NEIGHBOURHOOD_WORDS),
	        _NEIGHBOURHOOD_WORDS,
	    )
	)


	def neighbourhood_count() -> int:
	    """Report the number of neighbourhood words currently loaded.

	    Intended for diagnostics; the value is the length of the module-level
	    `_NEIGHBOURHOOD_WORDS` list.
	    """
	    loaded_words = _NEIGHBOURHOOD_WORDS
	    return len(loaded_words)


	# ═══════════════════════════════════════════════════════════════════════════
	# CORE CHECKS
	# ═══════════════════════════════════════════════════════════════════════════


	# Hebrew block (U+0590-U+05FF). Compiled once at import instead of being
	# re-imported and re-compiled on every audit_text call.
	_HEBREW_RUN = re.compile(r"[\u0590-\u05FF]+")


	def _has_allowed_arabic_qualifier(preceding: str) -> bool:
	    """Return True if `preceding` ends with an approved qualifier word.

	    `preceding` is the text immediately before a matched 'arabic'. Trailing
	    whitespace and then trailing hyphens are stripped before the check, so
	    "Allah's Arabic" and "Allah's-Arabic" both qualify.

	    The qualifier must be a whole word: the character before it (if any)
	    must not be alphanumeric or an underscore. A plain endswith() would
	    also accept longer words that merely end in a qualifier, e.g.
	    "Abdallah's Arabic".
	    """
	    trimmed = preceding.lower().rstrip().rstrip("-")
	    for qualifier in _ARABIC_ALLOWED_QUALIFIERS:
	        if not trimmed.endswith(qualifier):
	            continue
	        head = trimmed[: len(trimmed) - len(qualifier)]
	        if not head or not (head[-1].isalnum() or head[-1] == "_"):
	            return True
	    return False


	def audit_text(text: str, *, skip_if_arabic: bool = True) -> Dict[str, Any]:
	    """
	    Check a single free-text string against the family patterns.

	    Checks run in a fixed order and their hits are appended in that order:
	    Hebrew characters, family patterns, neighbourhood words, bare 'arabic'.

	    Args:
	        text: the string to check. Non-string or empty input passes.
	        skip_if_arabic: if True, skip strings that are primarily Arabic
	            script (primary-source content is not audited). Note that
	            this skip happens before every check, including the Hebrew
	            character check.

	    Returns:
	        {
	            'pass': bool,
	            'hits': [
	                {'family': str, 'bl_ref': str, 'reason': str,
	                 'matched_text': str, 'position': int}, ...
	            ],
	        }
	    """
	    # Type check first: some objects raise when their truth value is taken,
	    # and a non-string can never contain a banned token anyway.
	    if not isinstance(text, str) or not text:
	        return {"pass": True, "hits": []}
	    if skip_if_arabic and _is_primarily_arabic(text):
	        return {"pass": True, "hits": []}

	    hits: List[Dict[str, Any]] = []

	    # ── HEBREW CHARACTER BLOCK — ABSOLUTE (Session 46) ────────────────
	    # Hebrew (U+0590-U+05FF) has NOTHING in common with AA.
	    # Weights inject Hebrew alongside "abjad" — block at character level.
	    for m in _HEBREW_RUN.finditer(text):
	        hits.append({
	            "family": "hebrew_contamination",
	            "bl_ref": "BL-HEB",
	            "reason": (
	                "Hebrew character(s) detected. AA abjad uses ARABIC letters "
	                "only. Hebrew is a manufactured attachment — BLOCK."
	            ),
	            "matched_text": m.group(0),
	            "position": m.start(),
	        })

	    # ── FAMILY PATTERNS ───────────────────────────────────────────────
	    for pat, family, bl_ref, reason in _COMPILED:
	        for m in pat.finditer(text):
	            hits.append({
	                "family": family,
	                "bl_ref": bl_ref,
	                "reason": reason,
	                "matched_text": m.group(0),
	                "position": m.start(),
	            })

	    # ── NEIGHBOURHOOD EXPANSION (Session 41) ─────────────────────────
	    # Each word in `amr_banned_neighbourhood.json` is checked as a
	    # standalone token. Hits are tagged "neighbourhood" so the caller
	    # can distinguish them from family-pattern hits if needed.
	    # Words already caught above are not re-reported (deduplication on
	    # (position, lower-cased matched_text)).
	    seen = {(h["position"], h["matched_text"].lower()) for h in hits}
	    for pat, word in _NEIGHBOURHOOD_COMPILED:
	        for m in pat.finditer(text):
	            key = (m.start(), m.group(0).lower())
	            if key in seen:
	                continue
	            seen.add(key)
	            hits.append({
	                "family": "neighbourhood",
	                "bl_ref": "BL-NBH",
	                "reason": (
	                    f"'{word}' is in the banned-neighbourhood expansion of a "
	                    f"BL family seed term. Loaded from "
	                    f"amr_banned_neighbourhood.json."
	                ),
	                "matched_text": m.group(0),
	                "position": m.start(),
	            })

	    # ── BL04: bare 'arabic' (context-aware) ──────────────────────────
	    # The word 'arabic' / 'Arabic' / 'ARABIC' is banned UNLESS preceded
	    # immediately by an approved qualifier:
	    # "Allah's Arabic" / "Allahs Arabic" / "Lisan Arabic" / "Lisān Arabic"
	    # Per CLAUDE.md the qualifier separates Allah's revealed language
	    # from the degraded human form. Bare 'arabic' imports the Western
	    # linguistic-family category (a "language belonging to a people")
	    # which has no primary-source referent.
	    #
	    # ── SEMANTIC DEFAULT (Session 41) ─────────────────────────────────
	    # Both "Allah's Arabic" and "Lisan Arabic" pass this syntactic check.
	    # The SEMANTIC default for any primary-source scholarly text is
	    # Allah's Arabic; the Lisan Arabic label is reserved for documenting
	    # operator-corrupted forms (qv_translation_register quotes, DCR/NCR
	    # rows) and must never be the default for a scholar's primary work.
	    # The audit cannot enforce this semantic rule with a regex; it lives
	    # at the writer level. Operational principle: specifics dereference,
	    # categories hallucinate.
	    for m in _ARABIC_BARE_PATTERN.finditer(text):
	        # Only the ~20 characters immediately before the match are examined.
	        window_start = max(0, m.start() - 20)
	        if _has_allowed_arabic_qualifier(text[window_start:m.start()]):
	            continue  # qualified — allowed
	        hits.append({
	            "family": "bare-arabic",
	            "bl_ref": "BL04",
	            "reason": "Bare 'arabic' is banned. Use 'Allah's Arabic' "
	                      "(divine, revealed, taught to Adam) or 'Lisan Arabic' "
	                      "(degraded human form) per CLAUDE.md.",
	            "matched_text": m.group(0),
	            "position": m.start(),
	        })

	    return {"pass": len(hits) == 0, "hits": hits}


	# ═══════════════════════════════════════════════════════════════════════════
	# BARE 'arabic' pattern + qualifiers (BL04)
	# ═══════════════════════════════════════════════════════════════════════════

	# Matches the standalone word 'arabic' in any letter case.
	_ARABIC_BARE_PATTERN = re.compile(r"\barabic\b", flags=re.IGNORECASE)

	# Lower-case qualifier words that legitimise a following 'Arabic'. The
	# caller compares them against the END of the lower-cased text that
	# precedes the match, after trailing whitespace has been stripped.
	# Spelling variants are listed explicitly: straight apostrophe, no
	# apostrophe, curly apostrophe; plain, macron and doubled-vowel 'lisan'.
	_ARABIC_ALLOWED_QUALIFIERS = (
	    "allah's", "allahs", "allah’s",
	    "lisan", "lisān", "lisaan",
	)


	def audit_row(
	    data: Dict[str, Any],
	    table: Optional[str] = None,
	    *,
	    skip_fields: Optional[frozenset] = None,
	    skip_tables: Optional[frozenset] = None,
	) -> Dict[str, Any]:
	    """
	    Run the text audit over every string-valued field of a draft row.

	    Args:
	        data: the row data dict as passed to write_entry; None or an empty
	            dict is treated as a row with no fields
	        table: target table name; when it is in QUARANTINE_TABLES or in
	            the caller-supplied skip_tables the row is passed without
	            scanning (those tables hold contamination BY DESIGN)
	        skip_fields: field names to skip in addition to SKIP_FIELDS
	        skip_tables: table names to skip in addition to QUARANTINE_TABLES

	    Returns:
	        {
	            'pass': bool,
	            'failing_fields': {field_name: [hit, hit, ...]},
	            'message': human-readable summary for the caller,
	            'total_hits': int,
	        }
	        The quarantine pass-through result additionally carries
	        'quarantine': True.
	    """
	    # ── QUARANTINE SKIP ───────────────────────────────────────────────
	    # Contamination-documenting tables must be able to store the banned
	    # strings, so their rows bypass the scan entirely.
	    exempt_tables = set(QUARANTINE_TABLES).union(skip_tables or ())
	    if table and table in exempt_tables:
	        return {
	            "pass": True,
	            "failing_fields": {},
	            "message": f"dereference audit skipped — {table} is a quarantine table (holds contaminated content by design)",
	            "total_hits": 0,
	            "quarantine": True,
	        }

	    ignored_fields = set(SKIP_FIELDS).union(skip_fields or ())

	    # Collect the hits of every failing field, keeping row order.
	    failing: Dict[str, List[Dict[str, Any]]] = {}
	    for name, value in (data or {}).items():
	        # Skipped names and non-string values (numbers, None, bool, ...)
	        # have nothing to scan.
	        if name in ignored_fields or not isinstance(value, str):
	            continue
	        verdict = audit_text(value)
	        if verdict["pass"]:
	            continue
	        failing[name] = verdict["hits"]

	    if not failing:
	        return {
	            "pass": True,
	            "failing_fields": {},
	            "message": "dereference audit passed — no ungrounded-family tokens detected",
	            "total_hits": 0,
	        }

	    total = sum(len(field_hits) for field_hits in failing.values())

	    # One header line, then one line per hit.
	    report = [f"⛔ DEREFERENCE AUDIT BLOCKED: {total} ungrounded token(s) across "
	              f"{len(failing)} field(s). Rewrite with grounded vocabulary and retry."]
	    report.extend(
	        f" [{name}] {hit['family']} ({hit['bl_ref']}): "
	        f"matched '{hit['matched_text']}' at pos {hit['position']} — {hit['reason']}"
	        for name, field_hits in failing.items()
	        for hit in field_hits
	    )
	    return {
	        "pass": False,
	        "failing_fields": failing,
	        "message": "\n".join(report),
	        "total_hits": total,
	    }


	# ═══════════════════════════════════════════════════════════════════════════
	# CLI — for ad-hoc testing and prose auditing
	# ═══════════════════════════════════════════════════════════════════════════


	def _cli_audit_text(argv: List[str], *, mode: str = "auto") -> int:
	    """Audit text given on the command line and print the result.

	    Args:
	        argv: remaining command-line arguments (without the subcommand).
	        mode: how to interpret `argv`:
	            "auto" — if argv[0] names an existing regular file, audit the
	                     file's contents; otherwise audit the arguments
	                     joined with spaces (the historical default);
	            "text" — always audit the joined arguments, never touch disk;
	            "file" — argv[0] must be a readable regular file.

	    Returns:
	        0 when the text is clean; 1 when there are hits, when no argument
	        was given, or when the requested file cannot be read.
	    """
	    if not argv:
	        print("Usage: python3 amr_dereference_audit.py <text_or_file>")
	        return 1
	    candidate = argv[0]

	    if mode == "text":
	        read_from_file = False
	    elif mode == "file":
	        if not os.path.isfile(candidate):
	            # Do not fall back to auditing the path string itself: that
	            # would report "clean" for a file that was never read.
	            print(f"⛔ not a readable file: {candidate}", file=sys.stderr)
	            return 1
	        read_from_file = True
	    else:
	        # isfile (not exists): a directory name must not be handed to open().
	        read_from_file = os.path.isfile(candidate)

	    if read_from_file:
	        try:
	            with open(candidate, "r", encoding="utf-8") as f:
	                text = f.read()
	        except (OSError, UnicodeDecodeError) as exc:
	            print(f"⛔ cannot read {candidate}: {exc}", file=sys.stderr)
	            return 1
	    else:
	        text = " ".join(argv)

	    # The CLI audits everything, including primarily-Arabic input.
	    r = audit_text(text, skip_if_arabic=False)
	    if r["pass"]:
	        print("✓ clean — 0 hits")
	        return 0
	    print(f"⛔ {len(r['hits'])} hit(s)")
	    for h in r["hits"]:
	        print(f" • {h['family']} ({h['bl_ref']}): '{h['matched_text']}' @ {h['position']}")
	        print(f" → {h['reason']}")
	    return 1


	def main(argv: List[str]) -> int:
	    """Command-line entry point.

	    Subcommands:
	        (none)    print usage, return 0
	        patterns  list every family pattern, return 0
	        text ...  audit the remaining arguments as literal text
	        file P    audit the contents of file P
	        other     treat all arguments as text, or as a file path if the
	                  first argument names an existing file

	    Returns:
	        The process exit code (0 = clean / informational, 1 = hits or error).
	    """
	    if not argv:
	        print("amr_dereference_audit — USLaP dereference gate")
	        print()
	        print("Usage:")
	        print(" python3 amr_dereference_audit.py text <string>")
	        print(" python3 amr_dereference_audit.py file <path>")
	        print(" python3 amr_dereference_audit.py patterns (list all family patterns)")
	        return 0
	    cmd = argv[0]
	    if cmd == "patterns":
	        for pat, fam, bl, reason in FAMILY_PATTERNS:
	            print(f" {fam:20s} {bl:8s} {pat}")
	            print(f" → {reason}")
	        return 0
	    if cmd == "text":
	        return _cli_audit_text(argv[1:], mode="text")
	    if cmd == "file":
	        return _cli_audit_text(argv[1:], mode="file")
	    # default: treat argv as text (or a file path, if it exists)
	    return _cli_audit_text(argv)


	if __name__ == "__main__":
	    raise SystemExit(main(sys.argv[1:]))