uslap-query / Code_files /amr_dereference_audit.py
uslap's picture
Upload folder using huggingface_hub
7cc8e29 verified
Raw
History Blame Contribute Delete
34.9 kB
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
amr_dereference_audit.py — Dereference gate for USLaP writes.
This is the mathematical form of the rule:
Before emitting token T into a DB row, check:
∃ chain C : T → grounded_anchor_on_disk
If no such C exists, T is corrosion. Refuse to write the row.
Implementation: a deterministic Python function that scans every free-text
field of a draft row and rejects the row if any token matches a known
ungrounded family pattern. The check runs at the CENTRAL write path
(uslap_handler.write_entry) so it applies to every table in the DB.
The audit is NOT a banned-terms list. It is a family-pattern check. A
banned-terms list is whack-a-mole — it catches 'cosmos' and 'cosmology'
but misses 'cosmography' and 'cosmogonic'. The family pattern catches
every -cosm*- word at once.
The audit does NOT touch arabic script, numeric data, dates, proper
source citations, USLaP-internal terminology, or the structural enum
values defined by table schemas. It only checks free English prose
for known ungrounded-family substrings.
Scope: ALL tables routed through uslap_handler.write_entry.
Usage from handler:
from amr_dereference_audit import audit_row
result = audit_row(data, target_table)
if not result['pass']:
return <block write with result['message']>
Usage standalone (for prose):
from amr_dereference_audit import audit_text
result = audit_text("the cosmographic opening mentions al-falak")
# → fails on 'cosmographic'
Design constraints:
• Deterministic. Pure Python + regex. No weights, no LLM, no heuristics.
• Fast. O(|text|) per field via compiled regex.
• Introspectable. Returns the exact failing tokens and the family (with its
BL reference) that matched.
• Wide by construction. Uses *family* patterns (cosm*, monk*, priest*,
eschatolog*, etc.) so new derivations of the same families are caught
without extending a list.
• Minimal false positives. Patterns target academic-framework vocabulary
classes, not common English.
Sourced against contamination_blacklist (BL01-BL37) as of Session 41.
"""
from __future__ import annotations
import json
import os
import re
import sys
from typing import Any, Dict, List, Optional, Tuple
# ═══════════════════════════════════════════════════════════════════════════
# FAMILY PATTERNS β€” the structural rule, not a literal list
# ═══════════════════════════════════════════════════════════════════════════
#
# Each tuple: (pattern, family_name, BL_ref, short_reason).
# Patterns match case-insensitively. Word boundaries used where literal
# single words would otherwise false-positive on innocuous substrings
# (e.g., 'steal' would match 'stealth' — we use \b).
#
# The rule: if any of these matches any free-text field of a row, the row
# is refused. The caller must rewrite the field with grounded vocabulary
# (primary-source pointers) and retry.
# Each entry is (regex source, family name, blacklist reference, reason).
# The sources are compiled case-insensitively elsewhere in this module.
#
# Plural / inflected forms of every listed word are covered explicitly
# (e.g. 'loanwords', 'Mughals', 'theologians', 'cosmologies'): a family
# pattern that stops at the singular lets the plural through the gate.
FAMILY_PATTERNS: List[Tuple[str, str, str, str]] = [
    # ── BL33: cosmos / cosmic / cosmology / cosmography / cosmogony + WIDER
    #    Covers: cosmos, cosmic, cosmology, cosmologies, cosmological,
    #    cosmologist(s), cosmography, cosmographic, cosmographer(s),
    #    cosmogony, cosmogonies, cosmogonic
    (
        r"\bcosm(?:os|ic|olog(?:y|ies|ical|ist|ists|ically)?|"
        r"ograph(?:y|ies|ic|er|ers|ically)?|ogon(?:y|ies|ic|ical)?)\b",
        "cosmic-family",
        "BL33",
        "Greek κόσμος framework — use al-samawāt wa-al-arḍ / al-khalq / "
        "al-ʿālamīn / āyāt / al-falak / al-arḍ instead",
    ),
    # ── BL34: priest / priests / priesthood / priestly
    (
        r"\bpriest(?:s|hood|ly|esses|ess)?\b",
        "priest-family",
        "BL34",
        "Christian taxonomic import — use the primary-source title "
        "(imām, khaṭīb, etc.) that the MS actually uses",
    ),
    # ── BL35: monk / monks / monastic(s) / monasticism / monastery / monasteries
    (
        r"\bmon(?:k|ks|kish|khood|aster(?:y|ies|ial)|astic(?:s|ism|ally)?)\b",
        "monk-family",
        "BL35",
        "Christian taxonomic import — use the primary-source title "
        "(rāhib, zāhid, etc.) that the MS actually uses",
    ),
    # ── BL36: eschatological / eschatology / eschatologies / eschatologist(s)
    (
        r"\beschatolog(?:y|ies|ical|ist|ists|ically)?\b",
        "eschatolog-family",
        "BL36",
        "Greek ἔσχατος framework — use ākhirah / yawm al-qiyāma / al-sāʿa",
    ),
    # ── BL37: steal / stole / stolen / stealing / theft(s) / thief / thieves /
    #    thievery / thieving
    (
        r"\b(?:steal(?:s|ing)?|stole(?:n)?|thefts?|thie(?:f|ves|ving|very|vish))\b",
        "theft-family",
        "BL37",
        "Use appropriate / appropriated / appropriation / appropriator / "
        "misappropriation",
    ),
    # ── BL05: PIE / Proto-Indo-European
    (
        r"\b(?:PIE|proto[-\s]?indo[-\s]?european)\b",
        "pie-family",
        "BL05",
        "Phantom reconstruction — no primary source exists for PIE; use "
        "'phantom PIE' if necessary to name the operator framework",
    ),
    # ── BL07, BL16: Semitic (as family/race), Semite
    (
        r"\bsemit(?:e|es|ic|ism|ics)\b",
        "semitic-family",
        "BL07/BL16",
        "Phantom racial/linguistic category — Allah's Arabic is the source, "
        "Hebrew and Aramaic are downstream degraded forms",
    ),
    # ── BL06: loanword(s) / borrowed / cognate(s) (directionality ban)
    (
        r"\bloan[-\s]?words?\b",
        "loanword",
        "BL06",
        "Directionality violation — use LINK_TYPE: DIRECT / COMPOUND / "
        "SAME_ROOT / PHONETIC / SEMANTIC / PREFIX / SUFFIX / ROOT",
    ),
    (
        r"\bcognate(?:s)?\b",
        "cognate",
        "BL06",
        "Directionality violation — use SAME_ROOT or DIRECT link type",
    ),
    (
        r"\bborrowed\s+from\b",
        "borrowed-from",
        "BL06",
        "Directionality violation — AA → downstream, never reversed",
    ),
    (
        r"\bloan\s+from\b",
        "loan-from",
        "BL06",
        "Directionality violation — AA → downstream, never reversed",
    ),
    (
        r"\badoption\s+from\b",
        "adoption-from",
        "BL06",
        "Directionality violation — AA → downstream, never reversed",
    ),
    # ── Nostratic / Altaic / Afro-Asiatic phantom families
    (
        r"\b(?:nostratic|altaic|afro[-\s]?asiatic)\b",
        "phantom-family",
        "BL05/BL07",
        "Phantom super-family reconstruction — no primary source",
    ),
    # ── prosthetic vowel(s) (academic framing)
    (
        r"\bprosthetic\s+vowels?\b",
        "prosthetic-vowel",
        "BL06",
        "Academic framing — words have direct phonetic chains, not "
        "'prosthetic' additions",
    ),
    (
        r"\bpre[-\s]?greek\s+substrates?\b",
        "pre-greek-substrate",
        "BL06",
        "Phantom substrate category",
    ),
    # ── BL30: zodiac
    (
        r"\bzodiac(?:s|al)?\b",
        "zodiac",
        "BL30",
        "Greek astrological framing — use al-burūj (Q85:1) or specific "
        "constellation names in Allah's Arabic",
    ),
    # ── BL31: libra (as zodiac sign)
    (
        r"\blibra\b",
        "libra",
        "BL31",
        "Latinate zodiac name — use al-mīzān if referring to the scales "
        "constellation",
    ),
    # ── BL28: hashishin / hashshashin
    (
        r"\bhash[ai]?sh[ai]?sh?in\b",
        "hashishin",
        "BL28",
        "Orientalist calumny — Ismāʿīlī / Nizārī are the primary-source names",
    ),
    # ── BL32: Mughal(s) / Mogul(s) (as dynasty label for Muslim Timurids)
    (
        r"\bmughals?\b",
        "mughal",
        "BL32",
        "Persianate exonym — use Timurid (al-Tīmūrī) for the dynasty Bābur "
        "founded in Hind",
    ),
    (
        r"\bmoguls?\b",
        "mogul",
        "BL32",
        "Anglicised exonym — use Timurid (al-Tīmūrī)",
    ),
    # ── BL26: Theology / Theological / Theologian(s)
    (
        r"\btheolog(?:y|ies|ical|ian|ians|ically)?\b",
        "theology",
        "BL26",
        "Greek θεός + λόγος framework — use ʿilm al-kalām (the actual "
        "discipline) or the specific primary-source term",
    ),
    # ── BL29: tribal (as anthropological framing)
    (
        r"\btribal(?:ism)?\b",
        "tribal",
        "BL29",
        "Anthropological framing — use qawm / qabīla / banū / āl with "
        "the specific lineage name",
    ),
    # ── PERIODIZATION FAMILY (Session 41 — same error class as BL33)
    # Western European historiographic frames retrojected onto the Islamic
    # world. None of these has a primary-source referent in the world
    # being described. Use AH century, CE century, dynasty name, or
    # reigning caliph instead.
    (
        r"\bmedieval(?:ly|ist|ists|ism)?\b",
        "periodization-medieval",
        "BL-PERIOD",
        "Western European frame — use AH century, CE century, dynasty "
        "name (Umayyad / Abbasid / Fāṭimid / etc.), or reigning caliph",
    ),
    (
        r"\bmiddle\s+ages\b",
        "periodization-middle-ages",
        "BL-PERIOD",
        "Western European frame — use AH/CE century or dynasty name",
    ),
    (
        r"\blate\s+antiquity\b",
        "periodization-late-antiquity",
        "BL-PERIOD",
        "Western European frame — use the specific dynasty / century AH",
    ),
    (
        r"\bearly\s+modern\b",
        "periodization-early-modern",
        "BL-PERIOD",
        "Western European frame — use the specific dynasty / century AH",
    ),
    (
        r"\bpre[-\s]?modern\b",
        "periodization-pre-modern",
        "BL-PERIOD",
        "Defines by what Europe became; use dynasty / century AH",
    ),
    (
        r"\bpost[-\s]?classical\b",
        "periodization-post-classical",
        "BL-PERIOD",
        "Western frame — use dynasty / century AH",
    ),
    (
        r"\bdark\s+ages?\b",
        "periodization-dark-ages",
        "BL-PERIOD",
        "Western Eurocentric framing — use dynasty / century",
    ),
    (
        r"\brenaissance\b",
        "periodization-renaissance",
        "BL-PERIOD",
        "Western European frame — use the specific dynasty / century / "
        "movement name",
    ),
    # ── ERA / AGE / PERIOD framing — the rule above periodization names.
    # Even "Abbasid era" or "Umayyad period" smuggles a teleological time-
    # bracket into data that does not belong inside one. Anything in
    # Allah's framework gets dates (232 AH), names (Caliph al-Wāthiq),
    # events (the reign of X), but never an "era / age / period" wrapper.
    # The pattern fires on the noun-form 'era / age / period / times'
    # when it is preceded by an adjective or proper-name modifier
    # (e.g. "Abbasid era", "the age of al-Mutawakkil", "Umayyad period").
    (
        r"\b(?:abbasid|umayyad|fatimid|fāṭimid|seljuq|seljuk|mamluk|mamlūk|"
        r"ottoman|ayyubid|samanid|sāmānī|ghaznavid|qarakhanid|"
        r"timurid|tīmūrī|safavid|ṣafavī|qajar|qājārī|"
        r"rashidun|rāshidūn|caliphal|abbasi)\s+(?:era|period|age|times|epoch)\b",
        "era-frame-named",
        "BL-PERIOD",
        "An era-noun bracket imports a teleological time-frame. Use the date "
        "(N AH / N CE), the reigning caliph (al-Wāthiq, al-Mutawakkil, etc.), "
        "or 'the reign of X' instead. Drop 'era / period / age / times'.",
    ),
    (
        r"\b(?:medieval|classical|early|late|pre|post|high|low)\s+(?:era|period|age|times|epoch)\b",
        "era-frame-temporal",
        "BL-PERIOD",
        "Western temporal era-frame. Drop the era-noun; use the specific date "
        "or named event.",
    ),
    (
        r"\bthe\s+age\s+of\s+\w+",
        "age-of-name",
        "BL-PERIOD",
        "'The age of X' brackets a person/event into an era teleology. "
        "Use 'in the year X', 'during the reign of X', or just the date.",
    ),
    (
        r"\bin\s+(?:those|that)\s+(?:days|times|era|period)\b",
        "those-days",
        "BL-PERIOD",
        "Vague era-bracket. Use a specific date or named event.",
    ),
    # ── METAPHOR family (Greek metaphora — literary-analysis category) ──
    (
        r"\bmetaphor(?:s|ic|ical|ically)?\b",
        "metaphor",
        "BL-CATEGORY",
        "Greek μεταφορά framework. Use the primary-source rhetorical "
        "term: mathal (مَثَل, Qur'anic), istiʿāra (استعارة), tashbīh "
        "(تشبيه), majāz (مجاز).",
    ),
    # ── SCHOLASTIC family (Latin scholasticus — medieval-Western academic) ──
    (
        r"\bscholastic(?:s|ism|ally)?\b",
        "scholastic",
        "BL-CATEGORY",
        "Latin Western academic frame for 12-15c CE European disputation "
        "tradition. No primary-source referent in Allah's Arabic. Use the "
        "specific term: ʿilm al-kalām, ḥikma, falsafa, uṣūl al-fiqh, etc.",
    ),
    # ── PATRISTIC family (Latin patristica — Western Church-fathers tradition) ──
    (
        r"\bpatristi(?:c|cs|cally)\b",
        "patristic",
        "BL-CATEGORY",
        "Western Christian Church-fathers framework. No primary-source "
        "referent in the Islamic tradition. Use the specific named "
        "scholar (Ibn Ḥanbal, al-Bukhārī, etc.) or 'salaf' if context "
        "permits.",
    ),
    (
        r"\bpatrolog(?:y|ical|ically)?\b",
        "patrology",
        "BL-CATEGORY",
        "Western Christian fathers studies — no Islamic primary-source "
        "equivalent.",
    ),
    # ── HELLENISTIC family (Greek-philosophy retrojection) ──
    # 'hellenist(s)' is owned by this pattern only, so a single occurrence
    # is reported once rather than by both Hellen* entries.
    (
        r"\bhellenist(?:s|ic|ically)?\b",
        "hellenistic",
        "BL-CATEGORY",
        "Greek-philosophy retrojection. The Islamic primary sources "
        "name the actual influences specifically (Aristū, Aflāṭūn, "
        "etc.) when they are present at all. Use those names directly, "
        "not the genus 'Hellenistic'.",
    ),
    (
        r"\bhellen(?:ism|ize|izes|ized|izing|ization)?\b",
        "hellenism",
        "BL-CATEGORY",
        "Same Greek-philosophy retrojection — name the specific "
        "person or text instead of using the genus.",
    ),
]
# ═══════════════════════════════════════════════════════════════════════════
# FIELDS TO SKIP β€” structural enums and primary-source anchors
# ═══════════════════════════════════════════════════════════════════════════
#
# These are field NAMES whose content is either a structural keyword
# (CHECK enum value), a primary-source pointer (file path, shelfmark),
# or a numeric/date token. Skipping them avoids false positives on
# legitimate grounded content.
# Field names that audit_row never scans. Membership is an exact,
# case-sensitive match on the key of the row dict.
SKIP_FIELDS = frozenset({
    # structural schema enums
    "entry_type", "divergence_type", "source_ms", "recension",
    "category", "op_code", "dp_code",
    # primary-source pointers
    "ms_folio", "edition_page", "ms_page", "page", "folio",
    "quf_token", "quf_q", "quf_u", "quf_f", "quf_pass", "quf_date",
    # IDs
    "kh_id", "kv_id", "diwan_id", "entry_id", "root_id", "rowid",
    "intel_id", "bl_id", "dp_id", "sc_id", "ncr_id", "dcr_id",
    # numeric fields (non-string values are skipped by audit_row anyway;
    # listing them also covers the case where they arrive as strings)
    "token_count", "operator_flag", "scribal_interpolation",
    "persian_wrapper_inserted", "has_proverb", "has_nazm",
    # timestamps
    "created_date", "last_updated",
})
# ═══════════════════════════════════════════════════════════════════════════
# QUARANTINE TABLES β€” skipped by the audit by design
# ═══════════════════════════════════════════════════════════════════════════
#
# These tables exist to HOLD contaminated strings for inspection, flagging,
# and downstream re-derivation. Their whole purpose is to quote the operator
# glosses, the banned terms, the substituted translations, etc. so they can
# be reviewed and reversed. Applying the dereference audit to these tables
# would make it impossible to log a contamination finding (since logging
# one requires quoting the contaminated content verbatim).
#
# This list must stay SMALL. Only tables whose content is *documentation
# of contamination* belong here — not tables where contamination might
# accidentally land.
# Tables whose rows are allowed to quote contaminated strings verbatim.
# audit_row returns a pass for these without scanning any field.
# Each table is listed exactly once: a repeated element in a set literal
# has no effect and only hides copy/paste mistakes.
QUARANTINE_TABLES = frozenset({
    "contamination_blacklist",        # BL01-BL37 - the register itself
    "qv_contamination_scan",          # Qur'anic verse contamination findings
    "qv_translation_register",        # Qur'anic translation contamination register
    "diwan_contamination_register",   # DCR - Kashgari Diwan scribal interpolations
    "navoi_contamination_register",   # NCR - Navoi operator crimes
    "operator_label_register",        # Operator name/title labels (quotes them)
    "scholar_warnings",               # Flagged tertiary/unreliable scholars
    "attribution_corrections",        # Records of attribution errors
    "db_integrity_log",               # Integrity findings (may quote bad content)
    "corruption_operation_register",  # Records of operator operations
    "dcr_corruption_types",           # Corruption type catalogue
    "interception_register",          # Operator interception log
    "utul_register",                  # Pattern register of utul
    "disputed_words",                 # Words under dispute (by definition contaminated)
})
# ═══════════════════════════════════════════════════════════════════════════
# ARABIC-SCRIPT DETECTION
# ═══════════════════════════════════════════════════════════════════════════
# One character from the Arabic blocks this module treats as Arabic script:
# U+0600-U+06FF, U+0750-U+077F and U+FB50-U+FEFC.
ARABIC_RANGE = re.compile(r"[\u0600-\u06FF\uFB50-\uFEFC\u0750-\u077F]")


def _is_primarily_arabic(text: str) -> bool:
    """Report whether Arabic script makes up a strict majority of *text*.

    Whitespace is ignored; every other character (letters, digits,
    punctuation) counts towards the total. Returns False for empty or
    whitespace-only input. Exactly 50% Arabic is NOT a majority.
    """
    if not text:
        return False

    visible = 0
    arabic = 0
    for ch in text:
        if ch.isspace():
            continue
        visible += 1
        if ARABIC_RANGE.match(ch):
            arabic += 1

    if visible == 0:
        return False
    # Integer form of "arabic / visible > 0.5".
    return 2 * arabic > visible
# ═══════════════════════════════════════════════════════════════════════════
# COMPILED PATTERN BANK
# ═══════════════════════════════════════════════════════════════════════════
# FAMILY_PATTERNS with each regex source compiled once (case-insensitive);
# the remaining three tuple members are carried through unchanged.
_COMPILED: List[Tuple[re.Pattern, str, str, str]] = [
    (re.compile(regex_src, re.IGNORECASE), family_name, blacklist_ref, why)
    for regex_src, family_name, blacklist_ref, why in FAMILY_PATTERNS
]
# ═══════════════════════════════════════════════════════════════════════════
# NEIGHBOURHOOD EXPANSION (Session 41)
# ═══════════════════════════════════════════════════════════════════════════
#
# Loads `amr_banned_neighbourhood.json` from the same directory at import
# time. The JSON contains a `banned_words` list — every word in that list
# is added to the audit as a single-word regex check (case-insensitive,
# word-boundaries on both sides).
#
# The intent is to expand the FAMILY_PATTERNS coverage to include the
# k-nearest-neighbour cluster of each banned seed term in some embedding
# space. The bootstrap JSON is hand-curated. The full pipeline that
# computes neighbours from a real local embedding model lives in
# `amr_neighbourhood_expander.py` and writes the same JSON file.
#
# If the file is missing, the audit still works — the family-pattern
# regexes are unaffected. The neighbourhood layer is purely additive.
# The neighbourhood JSON is looked up next to this module.
_NEIGHBOURHOOD_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    "amr_banned_neighbourhood.json",
)


def _load_neighbourhood_words() -> List[str]:
    """Load the bootstrap neighbourhood JSON.

    Returns:
        The lower-cased, whitespace-trimmed, de-duplicated and sorted
        string entries of the top-level ``banned_words`` list.
        Non-string and empty entries are dropped.

        ``[]`` is returned on any failure: the file is missing or
        unreadable, is not valid UTF-8 / JSON, its top level is not an
        object, or ``banned_words`` is not a list. This function runs at
        import time, so it must never raise for a bad data file.
    """
    try:
        with open(_NEIGHBOURHOOD_PATH, "r", encoding="utf-8") as fh:
            data = json.load(fh)
    except (OSError, ValueError):
        # OSError covers FileNotFoundError / permission problems;
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return []

    # A JSON array or scalar at the top level has no .get().
    if not isinstance(data, dict):
        return []
    words = data.get("banned_words", [])
    if not isinstance(words, list):
        return []

    cleaned = {w.strip().lower() for w in words if isinstance(w, str)}
    cleaned.discard("")
    return sorted(cleaned)
# Evaluated once, at import time. Holds whatever _load_neighbourhood_words()
# returned, which is an empty list when the JSON file is missing or invalid.
_NEIGHBOURHOOD_WORDS: List[str] = _load_neighbourhood_words()
# Compile each neighbourhood word into a single case-insensitive regex that
# matches the word only as a standalone token.
def _compile_neighbourhood_pattern(word: str) -> re.Pattern:
    """Return a case-insensitive whole-token pattern for *word*.

    The word is escaped literally (hyphens, dots, etc. carry no regex
    meaning). Token edges are enforced with ``(?<!\\w)`` / ``(?!\\w)``
    rather than ``\\b``: for an ordinary word the two are equivalent, but
    ``\\b`` can never match next to an entry that itself starts or ends
    with a non-word character (e.g. ``b.c.``), so such entries would
    silently never fire as standalone tokens.
    """
    return re.compile(r"(?<!\w)" + re.escape(word) + r"(?!\w)", re.IGNORECASE)
# (compiled pattern, original word) for every loaded neighbourhood word,
# in the same order as _NEIGHBOURHOOD_WORDS.
_NEIGHBOURHOOD_COMPILED: List[Tuple[re.Pattern, str]] = [
    (_compile_neighbourhood_pattern(entry), entry)
    for entry in _NEIGHBOURHOOD_WORDS
]
def neighbourhood_count() -> int:
    """Return the number of neighbourhood words currently loaded.

    Diagnostic helper: 0 means the neighbourhood layer contributed nothing.
    """
    loaded = _NEIGHBOURHOOD_WORDS
    return len(loaded)
# ═══════════════════════════════════════════════════════════════════════════
# CORE CHECKS
# ═══════════════════════════════════════════════════════════════════════════
# Hebrew block (U+0590-U+05FF). Compiled once at import time instead of
# being searched from a raw pattern string on every call.
_HEBREW_RUN = re.compile(r"[\u0590-\u05FF]+")


def _make_hit(family: str, bl_ref: str, reason: str, match: "re.Match") -> Dict[str, Any]:
    """Build one hit record (the element type of audit_text's 'hits')."""
    return {
        "family": family,
        "bl_ref": bl_ref,
        "reason": reason,
        "matched_text": match.group(0),
        "position": match.start(),
    }


def audit_text(text: str, *, skip_if_arabic: bool = True) -> Dict[str, Any]:
    """
    Check a single free-text string against the family patterns.

    Checks, in reporting order:
      1. Hebrew characters (BL-HEB). Always enforced, including for text
         that is otherwise skipped as primarily Arabic.
      2. Family patterns (_COMPILED).
      3. Neighbourhood words (_NEIGHBOURHOOD_COMPILED); a match already
         reported at the same position with the same text is not repeated.
      4. Bare 'arabic' (BL04) not immediately preceded by an allowed
         qualifier.

    Args:
        text: the string to check. Non-strings and empty strings pass.
        skip_if_arabic: if True, strings that are primarily Arabic script
            skip checks 2-4 (primary-source content is not audited).

    Returns:
        {
            'pass': bool,
            'hits': [
                {'family': str, 'bl_ref': str, 'reason': str,
                 'matched_text': str, 'position': int}, ...
            ],
        }
    """
    # Type guard first: truthiness of an arbitrary object is not always
    # defined, whereas isinstance() is always safe.
    if not isinstance(text, str) or not text:
        return {"pass": True, "hits": []}

    # ── HEBREW CHARACTER BLOCK — ABSOLUTE (Session 46) ────────────────
    # Hebrew (U+0590-U+05FF) has NOTHING in common with AA.
    # Weights inject Hebrew alongside "abjad" — block at character level.
    # This runs BEFORE the Arabic skip below: Hebrew slipped into a
    # mostly-Arabic string is exactly the case the block exists for, and
    # the skip would otherwise wave it through.
    hebrew_reason = (
        "Hebrew character(s) detected. AA abjad uses ARABIC letters "
        "only. Hebrew is a manufactured attachment — BLOCK."
    )
    hits: List[Dict[str, Any]] = [
        _make_hit("hebrew_contamination", "BL-HEB", hebrew_reason, m)
        for m in _HEBREW_RUN.finditer(text)
    ]

    if skip_if_arabic and _is_primarily_arabic(text):
        return {"pass": not hits, "hits": hits}

    # ── FAMILY PATTERNS ───────────────────────────────────────────────
    for pat, family, bl_ref, reason in _COMPILED:
        for m in pat.finditer(text):
            hits.append(_make_hit(family, bl_ref, reason, m))

    # ── NEIGHBOURHOOD EXPANSION (Session 41) ─────────────────────────
    # Each word in `amr_banned_neighbourhood.json` is checked as a
    # standalone token. Hits are tagged "neighbourhood" so the caller
    # can distinguish them from family-pattern hits if needed.
    # Words already caught above are not re-reported
    # (deduplication on (position, lower-cased matched_text)).
    seen = {(h["position"], h["matched_text"].lower()) for h in hits}
    for pat, word in _NEIGHBOURHOOD_COMPILED:
        for m in pat.finditer(text):
            key = (m.start(), m.group(0).lower())
            if key in seen:
                continue
            seen.add(key)
            hits.append(_make_hit(
                "neighbourhood",
                "BL-NBH",
                (
                    f"'{word}' is in the banned-neighbourhood expansion of a "
                    f"BL family seed term. Loaded from "
                    f"amr_banned_neighbourhood.json."
                ),
                m,
            ))

    # ── BL04: bare 'arabic' (context-aware) ──────────────────────────
    # The word 'arabic' / 'Arabic' / 'ARABIC' is banned UNLESS preceded
    # immediately by an approved qualifier:
    #   "Allah's Arabic" / "Allahs Arabic" / "Lisan Arabic" / "Lisān Arabic"
    # Per CLAUDE.md the qualifier separates Allah's revealed language
    # from the degraded human form. Bare 'arabic' imports the Western
    # linguistic-family category (a "language belonging to a people")
    # which has no primary-source referent.
    #
    # ── SEMANTIC DEFAULT (Session 41) ─────────────────────────────────
    # Both "Allah's Arabic" and "Lisan Arabic" pass this syntactic check.
    # But the SEMANTIC default for any primary-source scholarly text is
    # Allah's Arabic. Lisan Arabic is the DEGRADED downstream form — use
    # the LA label ONLY when documenting operator-corrupted forms
    # (qv_translation_register quotes, DCR/NCR rows). NEVER default to LA
    # for a scholar's primary work. The audit cannot enforce this
    # semantic rule with a regex; it lives at the writer level. The
    # single operational principle: specifics dereference, categories
    # hallucinate.
    for m in _ARABIC_BARE_PATTERN.finditer(text):
        # Look at the (up to) 20 characters immediately preceding the match.
        window_start = max(0, m.start() - 20)
        preceding = text[window_start:m.start()].lower()
        # Drop trailing whitespace, then a joining hyphen ("Lisan-Arabic").
        preceding = preceding.rstrip().rstrip("-")
        if any(preceding.endswith(q) for q in _ARABIC_ALLOWED_QUALIFIERS):
            continue  # qualified — allowed
        hits.append(_make_hit(
            "bare-arabic",
            "BL04",
            "Bare 'arabic' is banned. Use 'Allah's Arabic' "
            "(divine, revealed, taught to Adam) or 'Lisan Arabic' "
            "(degraded human form) per CLAUDE.md.",
            m,
        ))

    return {"pass": not hits, "hits": hits}
# ═══════════════════════════════════════════════════════════════════════════
# BARE 'arabic' pattern + qualifiers (BL04)
# ═══════════════════════════════════════════════════════════════════════════
# The standalone word 'arabic', in any letter case.
_ARABIC_BARE_PATTERN = re.compile(r"\barabic\b", re.IGNORECASE)

# Qualifiers that, when they immediately precede 'Arabic', make the
# usage allowed. Each qualifier is matched as the END of the preceding
# context, which the caller lower-cases and strips of trailing whitespace,
# so every entry here must be lower-case.
_ARABIC_ALLOWED_QUALIFIERS = (
    "allah's",
    "allahs",
    # Curly-apostrophe variant (U+2019 RIGHT SINGLE QUOTATION MARK). Written
    # as an escape so the entry cannot be damaged by an encoding round-trip;
    # the previous literal was mojibake and never matched real text.
    "allah\u2019s",
    "lisan",
    "lisān",
    "lisaan",
)
def audit_row(
    data: Dict[str, Any],
    table: Optional[str] = None,
    *,
    skip_fields: Optional[frozenset] = None,
    skip_tables: Optional[frozenset] = None,
) -> Dict[str, Any]:
    """
    Check every free-text field of a draft row against the family patterns.

    Only top-level values that are ``str`` are scanned; numbers, None,
    booleans and containers are ignored.

    Args:
        data: the row data dict as passed to write_entry (None is treated
            as an empty row)
        table: target table name; if in QUARANTINE_TABLES or the
            caller-supplied skip_tables, the audit passes through
            unchanged (those tables hold contamination BY DESIGN)
        skip_fields: additional field names to skip beyond the default
            SKIP_FIELDS set
        skip_tables: additional tables to skip beyond QUARANTINE_TABLES

    Returns:
        {
            'pass': bool,
            'failing_fields': {field_name: [hit, hit, ...]},
            'message': human-readable summary for the caller,
            'total_hits': int,
            'quarantine': bool,   # True only when the table was skipped
        }
        Every return path carries the same five keys.
    """
    # ── QUARANTINE SKIP ───────────────────────────────────────────────
    # Tables that exist to document/quote contamination pass through
    # the audit unchanged. They NEED to hold the banned strings.
    effective_skip_tables = set(QUARANTINE_TABLES)
    if skip_tables:
        effective_skip_tables.update(skip_tables)
    if table and table in effective_skip_tables:
        return {
            "pass": True,
            "failing_fields": {},
            "message": f"dereference audit skipped — {table} is a quarantine table (holds contaminated content by design)",
            "total_hits": 0,
            "quarantine": True,
        }

    effective_skip = set(SKIP_FIELDS)
    if skip_fields:
        effective_skip.update(skip_fields)

    failing: Dict[str, List[Dict[str, Any]]] = {}
    total = 0
    for field, value in (data or {}).items():
        if field in effective_skip:
            continue
        if not isinstance(value, str):
            # numbers, None, bool, etc. — nothing to scan
            continue
        result = audit_text(value)
        if not result["pass"]:
            failing[field] = result["hits"]
            total += len(result["hits"])

    if not failing:
        return {
            "pass": True,
            "failing_fields": {},
            "message": "dereference audit passed — no ungrounded-family tokens detected",
            "total_hits": 0,
            "quarantine": False,
        }

    # Build a concise human-readable summary: one header line, then one
    # line per hit.
    lines = [f"⛔ DEREFERENCE AUDIT BLOCKED: {total} ungrounded token(s) across "
             f"{len(failing)} field(s). Rewrite with grounded vocabulary and retry."]
    for field, hits in failing.items():
        for h in hits:
            lines.append(
                f" [{field}] {h['family']} ({h['bl_ref']}): "
                f"matched '{h['matched_text']}' at pos {h['position']} — {h['reason']}"
            )
    return {
        "pass": False,
        "failing_fields": failing,
        "message": "\n".join(lines),
        "total_hits": total,
        "quarantine": False,
    }
# ═══════════════════════════════════════════════════════════════════════════
# CLI β€” for ad-hoc testing and prose auditing
# ═══════════════════════════════════════════════════════════════════════════
def _cli_audit_text(argv: List[str]) -> int:
    """Audit CLI input and print the result.

    If the first argument names an existing regular file, that file is read
    (UTF-8) and audited; otherwise all arguments are joined with single
    spaces and audited as text. The Arabic-script skip is disabled so that
    everything supplied is actually scanned.

    Returns:
        0 when the text is clean, 1 when there are hits or no arguments
        were given.
    """
    if not argv:
        print("Usage: python3 amr_dereference_audit.py <text_or_file>")
        return 1

    candidate = argv[0]
    # isfile, not exists: a directory name must fall through to the text
    # branch instead of crashing in open().
    if os.path.isfile(candidate):
        with open(candidate, "r", encoding="utf-8") as fh:
            text = fh.read()
    else:
        text = " ".join(argv)

    r = audit_text(text, skip_if_arabic=False)
    if r["pass"]:
        print("✓ clean — 0 hits")
        return 0

    print(f"⛔ {len(r['hits'])} hit(s)")
    for h in r["hits"]:
        print(f" • {h['family']} ({h['bl_ref']}): '{h['matched_text']}' @ {h['position']}")
        print(f" → {h['reason']}")
    return 1
def main(argv: List[str]) -> int:
    """Command-line entry point.

    Sub-commands:
        (none)          print usage, return 0
        patterns        list every family pattern, return 0
        text <string>   audit the given words as text
        file <path>     audit the contents of <path>
        anything else   the whole argument list is audited as text

    Returns:
        The process exit code: 0 clean / informational, 1 hits or usage
        error (as reported by _cli_audit_text), 2 when ``file`` is given a
        path that is not an existing regular file.
    """
    if not argv:
        print("amr_dereference_audit — USLaP dereference gate")
        print()
        print("Usage:")
        print(" python3 amr_dereference_audit.py text <string>")
        print(" python3 amr_dereference_audit.py file <path>")
        print(" python3 amr_dereference_audit.py patterns (list all family patterns)")
        return 0

    cmd, rest = argv[0], argv[1:]

    if cmd == "patterns":
        for pat, fam, bl, reason in FAMILY_PATTERNS:
            print(f" {fam:20s} {bl:8s} {pat}")
            print(f" → {reason}")
        return 0

    if cmd == "file" and rest and not os.path.isfile(rest[0]):
        # Without this check the path string itself would be audited as
        # prose and could be reported "clean", which is misleading.
        print(f"file not found: {rest[0]}", file=sys.stderr)
        return 2

    if cmd in ("text", "file"):
        return _cli_audit_text(rest)

    # default: treat argv as text
    return _cli_audit_text(argv)


if __name__ == "__main__":
    raise SystemExit(main(sys.argv[1:]))