Spaces:
Sleeping
Sleeping
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
amr_dereference_audit.py — Dereference gate for USLaP writes.

This is the mathematical form of the rule:

    Before emitting token T into a DB row, check:
        ∃ chain C : T → grounded_anchor_on_disk
    If no such C exists, T is corrosion. Refuse to write the row.

Implementation: a deterministic Python function that scans every free-text
field of a draft row and rejects the row if any token matches a known
ungrounded family pattern. The check runs at the CENTRAL write path
(uslap_handler.write_entry) so it applies to every table in the DB.

The audit is NOT a banned-terms list. It is a family-pattern check. A
banned-terms list is whack-a-mole — it catches 'cosmos' and 'cosmology'
but misses 'cosmography' and 'cosmogonic'. The family pattern catches
every -cosm*- word at once.

The audit does NOT touch arabic script, numeric data, dates, proper
source citations, USLaP-internal terminology, or the structural enum
values defined by table schemas. It only checks free English prose
for known ungrounded-family substrings.

Scope: ALL tables routed through uslap_handler.write_entry.

Usage from handler:
    from amr_dereference_audit import audit_row
    result = audit_row(data, target_table)
    if not result['pass']:
        return <block write with result['message']>

Usage standalone (for prose):
    from amr_dereference_audit import audit_text
    result = audit_text("the cosmographic opening mentions al-falak")
    # → fails on 'cosmographic'

Design constraints:
  • Deterministic. Pure Python + regex. No weights, no LLM, no heuristics.
  • Fast. O(|text|) per field via compiled regex.
  • Introspectable. Returns exact failing tokens and the pattern that matched.
  • Wide by construction. Uses *family* patterns (cosm*, monk*, priest*,
    eschatolog*, etc.) so new derivations of the same families are caught
    without extending a list.
  • Minimal false positives. Patterns target academic-framework vocabulary
    classes, not common English.

Sourced against contamination_blacklist (BL01-BL37) as of Session 41.
"""
| from __future__ import annotations | |
| import json | |
| import os | |
| import re | |
| import sys | |
| from typing import Any, Dict, List, Optional, Tuple | |
# ───────────────────────────────────────────────────────────────────────────
# FAMILY PATTERNS — the structural rule, not a literal list
# ───────────────────────────────────────────────────────────────────────────
#
# Each tuple: (pattern, family_name, BL_ref, short_reason).
# Patterns match case-insensitively. Word boundaries used where literal
# single words would otherwise false-positive on innocuous substrings
# (e.g., 'steal' would match 'stealth' — we use \b).
#
# The rule: if any of these matches any free-text field of a row, the row
# is refused. The caller must rewrite the field with grounded vocabulary
# (primary-source pointers) and retry.
FAMILY_PATTERNS: List[Tuple[str, str, str, str]] = [
    # ── BL33: cosmic / cosmos / cosmology / cosmological / cosmogony + WIDER
    #    Covers: cosmos, cosmic, cosmology, cosmological, cosmologist,
    #    cosmography, cosmographic, cosmographer, cosmogony, cosmogonic
    (
        r"\bcosm(?:os|ic|olog(?:y|ical|ist|ically)?|ograph(?:y|ic|er|ically)?|"
        r"ogon(?:y|ic|ical)?)\b",
        "cosmic-family",
        "BL33",
        "Greek κόσμος framework — use al-samawāt wa-al-arḍ / al-khalq / "
        "al-ʿālamīn / āyāt / al-falak / al-arḍ instead",
    ),
    # ── BL34: priest / priests / priesthood / priestly
    (
        r"\bpriest(?:s|hood|ly|esses|ess)?\b",
        "priest-family",
        "BL34",
        "Christian taxonomic import — use the primary-source title "
        "(imām, khaṭīb, etc.) that the MS actually uses",
    ),
    # ── BL35: monk / monks / monastic / monasticism / monastery / monasteries
    (
        r"\bmon(?:k|ks|kish|khood|aster(?:y|ies|ial)|astic(?:ism|ally)?)\b",
        "monk-family",
        "BL35",
        "Christian taxonomic import — use the primary-source title "
        "(rāhib, zāhid, etc.) that the MS actually uses",
    ),
    # ── BL36: eschatological / eschatology
    (
        r"\beschatolog(?:y|ical|ist|ically)?\b",
        "eschatolog-family",
        "BL36",
        "Greek ἔσχατος framework — use ākhirah / yawm al-qiyāma / al-sāʿa",
    ),
    # ── BL37: steal / stole / stolen / stealing / theft / thief / thieves /
    #    thievery / thieving
    (
        r"\b(?:steal(?:s|ing)?|stole(?:n)?|theft|thie(?:f|ves|ving|very|vish))\b",
        "theft-family",
        "BL37",
        "Use appropriate / appropriated / appropriation / appropriator / "
        "misappropriation",
    ),
    # ── BL05: PIE / Proto-Indo-European
    # The acronym is matched case-SENSITIVELY via the scoped flag group
    # (?-i:...): the bank is compiled with re.IGNORECASE, and without the
    # scope the pattern would fire on the ordinary English word "pie".
    (
        r"\b(?:(?-i:PIE)|proto[-\s]?indo[-\s]?european)\b",
        "pie-family",
        "BL05",
        "Phantom reconstruction — no primary source exists for PIE; use "
        "'phantom PIE' if necessary to name the operator framework",
    ),
    # ── BL07, BL16: Semitic (as family/race), Semite
    (
        r"\bsemit(?:e|es|ic|ism|ics)\b",
        "semitic-family",
        "BL07/BL16",
        "Phantom racial/linguistic category — Allah's Arabic is the source, "
        "Hebrew and Aramaic are downstream degraded forms",
    ),
    # ── BL06: loanword / borrowed / cognate (directionality ban)
    (
        r"\bloan[-\s]?word\b",
        "loanword",
        "BL06",
        "Directionality violation — use LINK_TYPE: DIRECT / COMPOUND / "
        "SAME_ROOT / PHONETIC / SEMANTIC / PREFIX / SUFFIX / ROOT",
    ),
    (
        r"\bcognate(?:s)?\b",
        "cognate",
        "BL06",
        "Directionality violation — use SAME_ROOT or DIRECT link type",
    ),
    (
        r"\bborrowed\s+from\b",
        "borrowed-from",
        "BL06",
        "Directionality violation — AA → downstream, never reversed",
    ),
    (
        r"\bloan\s+from\b",
        "loan-from",
        "BL06",
        "Directionality violation — AA → downstream, never reversed",
    ),
    (
        r"\badoption\s+from\b",
        "adoption-from",
        "BL06",
        "Directionality violation — AA → downstream, never reversed",
    ),
    # ── Nostratic / Altaic / Afro-Asiatic phantom families
    (
        r"\b(?:nostratic|altaic|afro[-\s]?asiatic)\b",
        "phantom-family",
        "BL05/BL07",
        "Phantom super-family reconstruction — no primary source",
    ),
    # ── prosthetic vowel (academic framing)
    (
        r"\bprosthetic\s+vowel\b",
        "prosthetic-vowel",
        "BL06",
        "Academic framing — words have direct phonetic chains, not "
        "'prosthetic' additions",
    ),
    (
        r"\bpre[-\s]?greek\s+substrate\b",
        "pre-greek-substrate",
        "BL06",
        "Phantom substrate category",
    ),
    # ── BL30: zodiac
    (
        r"\bzodiac(?:al)?\b",
        "zodiac",
        "BL30",
        "Greek astrological framing — use al-burūj (Q85:1) or specific "
        "constellation names in Allah's Arabic",
    ),
    # ── BL31: libra (as zodiac sign)
    (
        r"\blibra\b",
        "libra",
        "BL31",
        "Latinate zodiac name — use al-mīzān if referring to the scales "
        "constellation",
    ),
    # ── BL28: hashishin / hashashin / hashshashin
    # The third "sh" group is optional; when it was mandatory the pattern
    # could not match the two-"sh" spellings 'hashishin' and 'hashashin'.
    (
        r"\bhash[ai]?sh[ai]?(?:sh?)?in\b",
        "hashishin",
        "BL28",
        "Orientalist calumny — Ismāʿīlī / Nizārī are the primary-source names",
    ),
    # ── BL32: Mughal / Mogul (as dynasty label for Muslim Timurids)
    (
        r"\bmughal\b",
        "mughal",
        "BL32",
        "Persianate exonym — use Timurid (al-Tīmūrī) for the dynasty Bābur "
        "founded in Hind",
    ),
    (
        r"\bmogul\b",
        "mogul",
        "BL32",
        "Anglicised exonym — use Timurid (al-Tīmūrī)",
    ),
    # ── BL26: Theology / Theological
    (
        r"\btheolog(?:y|ical|ian|ically)?\b",
        "theology",
        "BL26",
        "Greek θεός + λόγος framework — use ʿilm al-kalām (the actual "
        "discipline) or the specific primary-source term",
    ),
    # ── BL29: tribal (as anthropological framing)
    (
        r"\btribal(?:ism)?\b",
        "tribal",
        "BL29",
        "Anthropological framing — use qawm / qabīla / banū / āl with "
        "the specific lineage name",
    ),
    # ── PERIODIZATION FAMILY (Session 41 — same error class as BL33)
    # Western European historiographic frames retrojected onto the Islamic
    # world. None of these has a primary-source referent in the world
    # being described. Use AH century, CE century, dynasty name, or
    # reigning caliph instead.
    (
        r"\bmedieval(?:ly|ist|ism)?\b",
        "periodization-medieval",
        "BL-PERIOD",
        "Western European frame — use AH century, CE century, dynasty "
        "name (Umayyad / Abbasid / Fāṭimid / etc.), or reigning caliph",
    ),
    (
        r"\bmiddle\s+ages\b",
        "periodization-middle-ages",
        "BL-PERIOD",
        "Western European frame — use AH/CE century or dynasty name",
    ),
    (
        r"\blate\s+antiquity\b",
        "periodization-late-antiquity",
        "BL-PERIOD",
        "Western European frame — use the specific dynasty / century AH",
    ),
    (
        r"\bearly\s+modern\b",
        "periodization-early-modern",
        "BL-PERIOD",
        "Western European frame — use the specific dynasty / century AH",
    ),
    (
        r"\bpre[-\s]?modern\b",
        "periodization-pre-modern",
        "BL-PERIOD",
        "Defines by what Europe became; use dynasty / century AH",
    ),
    (
        r"\bpost[-\s]?classical\b",
        "periodization-post-classical",
        "BL-PERIOD",
        "Western frame — use dynasty / century AH",
    ),
    (
        r"\bdark\s+ages?\b",
        "periodization-dark-ages",
        "BL-PERIOD",
        "Western Eurocentric framing — use dynasty / century",
    ),
    (
        r"\brenaissance\b",
        "periodization-renaissance",
        "BL-PERIOD",
        "Western European frame — use the specific dynasty / century / "
        "movement name",
    ),
    # ── ERA / AGE / PERIOD framing — the rule above periodization names.
    # Even "Abbasid era" or "Umayyad period" smuggles a teleological time-
    # bracket into data that does not belong inside one. Anything in
    # Allah's framework gets dates (232 AH), names (Caliph al-Wāthiq),
    # events (the reign of X), but never an "era / age / period" wrapper.
    # The pattern fires on the noun-form 'era / age / period / times'
    # when it is preceded by an adjective or proper-name modifier
    # (e.g. "Abbasid era", "the age of al-Mutawakkil", "Umayyad period").
    (
        r"\b(?:abbasid|umayyad|fatimid|fāṭimid|seljuq|seljuk|mamluk|mamlūk|"
        r"ottoman|ayyubid|samanid|sāmānī|ghaznavid|qarakhanid|"
        r"timurid|tīmūrī|safavid|ṣafavī|qajar|qājārī|"
        r"rashidun|rāshidūn|caliphal|abbasi)\s+(?:era|period|age|times|epoch)\b",
        "era-frame-named",
        "BL-PERIOD",
        "An era-noun bracket imports a teleological time-frame. Use the date "
        "(N AH / N CE), the reigning caliph (al-Wāthiq, al-Mutawakkil, etc.), "
        "or 'the reign of X' instead. Drop 'era / period / age / times'.",
    ),
    (
        r"\b(?:medieval|classical|early|late|pre|post|high|low)\s+(?:era|period|age|times|epoch)\b",
        "era-frame-temporal",
        "BL-PERIOD",
        "Western temporal era-frame. Drop the era-noun; use the specific date "
        "or named event.",
    ),
    # NOTE(review): this also fires on biographical phrasing such as
    # "at the age of forty" — confirm that is intended.
    (
        r"\bthe\s+age\s+of\s+\w+",
        "age-of-name",
        "BL-PERIOD",
        "'The age of X' brackets a person/event into an era teleology. "
        "Use 'in the year X', 'during the reign of X', or just the date.",
    ),
    (
        r"\bin\s+(?:those|that)\s+(?:days|times|era|period)\b",
        "those-days",
        "BL-PERIOD",
        "Vague era-bracket. Use a specific date or named event.",
    ),
    # ── METAPHOR family (Greek metaphora — literary-analysis category) ──
    (
        r"\bmetaphor(?:s|ic|ical|ically)?\b",
        "metaphor",
        "BL-CATEGORY",
        "Greek μεταφορά framework. Use the primary-source rhetorical "
        "term: mathal (مَثَل, Qur'anic), istiʿāra (استعارة), tashbīh "
        "(تشبيه), majāz (مجاز).",
    ),
    # ── SCHOLASTIC family (Latin scholasticus — medieval-Western academic) ──
    (
        r"\bscholastic(?:s|ism|ally)?\b",
        "scholastic",
        "BL-CATEGORY",
        "Latin Western academic frame for 12-15c CE European disputation "
        "tradition. No primary-source referent in Allah's Arabic. Use the "
        "specific term: ʿilm al-kalām, ḥikma, falsafa, uṣūl al-fiqh, etc.",
    ),
    # ── PATRISTIC family (Latin patristica — Western Church-fathers tradition) ──
    (
        r"\bpatristi(?:c|cs|cally)\b",
        "patristic",
        "BL-CATEGORY",
        "Western Christian Church-fathers framework. No primary-source "
        "referent in the Islamic tradition. Use the specific named "
        "scholar (Ibn Ḥanbal, al-Bukhārī, etc.) or 'salaf' if context "
        "permits.",
    ),
    (
        r"\bpatrolog(?:y|ical|ically)?\b",
        "patrology",
        "BL-CATEGORY",
        "Western Christian fathers studies — no Islamic primary-source "
        "equivalent.",
    ),
    # ── HELLENISTIC family (Greek-philosophy retrojection) ──
    # 'hellenist' itself is owned by this first pattern only, so a single
    # token is not reported twice by the two hellen* entries.
    (
        r"\bhellenist(?:ic|ically)?\b",
        "hellenistic",
        "BL-CATEGORY",
        "Greek-philosophy retrojection. The Islamic primary sources "
        "name the actual influences specifically (Aristū, Aflāṭūn, "
        "etc.) when they are present at all. Use those names directly, "
        "not the genus 'Hellenistic'.",
    ),
    (
        r"\bhellen(?:ism|ize|izing|ization)?\b",
        "hellenism",
        "BL-CATEGORY",
        "Same Greek-philosophy retrojection — name the specific "
        "person or text instead of using the genus.",
    ),
]
# ───────────────────────────────────────────────────────────────────────────
# FIELDS TO SKIP — structural enums and primary-source anchors
# ───────────────────────────────────────────────────────────────────────────
#
# These are field NAMES whose content is either a structural keyword
# (CHECK enum value), a primary-source pointer (file path, shelfmark),
# or a numeric/date token. Skipping them avoids false positives on
# legitimate grounded content.
SKIP_FIELDS = frozenset({
    # structural schema enums
    "entry_type", "divergence_type", "source_ms", "recension",
    "category", "op_code", "dp_code",
    # primary-source pointers
    "ms_folio", "edition_page", "ms_page", "page", "folio",
    "quf_token", "quf_q", "quf_u", "quf_f", "quf_pass", "quf_date",
    # IDs
    "kh_id", "kv_id", "diwan_id", "entry_id", "root_id", "rowid",
    "intel_id", "bl_id", "dp_id", "sc_id", "ncr_id", "dcr_id",
    # numeric fields
    "token_count", "operator_flag", "scribal_interpolation",
    "persian_wrapper_inserted", "has_proverb", "has_nazm",
    # timestamps
    "created_date", "last_updated",
})
# ───────────────────────────────────────────────────────────────────────────
# QUARANTINE TABLES — skipped by the audit by design
# ───────────────────────────────────────────────────────────────────────────
#
# These tables exist to HOLD contaminated strings for inspection, flagging,
# and downstream re-derivation. Their whole purpose is to quote the operator
# glosses, the banned terms, the substituted translations, etc. so they can
# be reviewed and reversed. Applying the dereference audit to these tables
# would make it impossible to log a contamination finding (since logging
# one requires quoting the contaminated content verbatim).
#
# This list must stay SMALL. Only tables whose content is *documentation
# of contamination* belong here — not tables where contamination might
# accidentally land.
QUARANTINE_TABLES = frozenset({
    "contamination_blacklist",        # BL01-BL37 — the register itself
    "qv_contamination_scan",          # Qur'anic verse contamination findings
    "qv_translation_register",        # Qur'anic translation contamination register
    "diwan_contamination_register",   # DCR — Kashgari Diwan scribal interpolations
    "navoi_contamination_register",   # NCR — Navoi operator crimes
    "operator_label_register",        # Operator name/title labels (quotes them)
    "scholar_warnings",               # Flagged tertiary/unreliable scholars
    "attribution_corrections",        # Records of attribution errors
    "db_integrity_log",               # Integrity findings (may quote bad content)
    "corruption_operation_register",  # Records of operator operations
    "dcr_corruption_types",           # Corruption type catalogue
    "interception_register",          # Operator interception log
    "utul_register",                  # Pattern register of utul
    "disputed_words",                 # Words under dispute (by definition contaminated)
})
# ───────────────────────────────────────────────────────────────────────────
# ARABIC-SCRIPT DETECTION
# ───────────────────────────────────────────────────────────────────────────
# Code points counted as Arabic script:
#   U+0600–U+06FF  Arabic
#   U+0750–U+077F  Arabic Supplement
#   U+08A0–U+08FF  Arabic Extended-A
#   U+FB50–U+FDFF  Arabic Presentation Forms-A
#   U+FE70–U+FEFC  Arabic Presentation Forms-B (stops before U+FEFF, the BOM)
# U+FE00–U+FE6F (variation selectors, CJK compatibility forms, ...) sits
# between the two presentation-form blocks and is deliberately excluded.
ARABIC_RANGE = re.compile(
    r"[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFC]"
)


def _is_primarily_arabic(text: str) -> bool:
    """Return True if more than 50% of non-space characters are Arabic script.

    Arabic script content is primary-source and skipped by the audit.
    Mixed English/Arabic glosses still get audited (the English portion is
    the only part that could leak).

    Empty and whitespace-only strings return False.
    """
    if not text:
        return False
    non_space = sum(1 for ch in text if not ch.isspace())
    if not non_space:
        return False
    # No character in ARABIC_RANGE is whitespace, so counting matches over
    # the whole string equals counting them over the non-space characters.
    arabic = len(ARABIC_RANGE.findall(text))
    return arabic * 2 > non_space
# ───────────────────────────────────────────────────────────────────────────
# COMPILED PATTERN BANK
# ───────────────────────────────────────────────────────────────────────────
# One compiled, case-insensitive regex per FAMILY_PATTERNS entry, built once
# at import time. Each tuple keeps the entry's family name, BL reference and
# reason next to its compiled pattern.
_COMPILED: List[Tuple[re.Pattern, str, str, str]] = [
    (re.compile(pattern, re.IGNORECASE), family, bl_ref, reason)
    for pattern, family, bl_ref, reason in FAMILY_PATTERNS
]
# ───────────────────────────────────────────────────────────────────────────
# NEIGHBOURHOOD EXPANSION (Session 41)
# ───────────────────────────────────────────────────────────────────────────
#
# Loads `amr_banned_neighbourhood.json` from the same directory at import
# time. The JSON contains a `banned_words` list — every word in that list
# is added to the audit as a single-word regex check (case-insensitive,
# word-boundaries on both sides).
#
# The intent is to expand the FAMILY_PATTERNS coverage to include the
# k-nearest-neighbour cluster of each banned seed term in some embedding
# space. The bootstrap JSON is hand-curated. The full pipeline that
# computes neighbours from a real local embedding model lives in
# `amr_neighbourhood_expander.py` and writes the same JSON file.
#
# If the file is missing, the audit still works — the family-pattern
# regexes are unaffected. The neighbourhood layer is purely additive.
_NEIGHBOURHOOD_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    "amr_banned_neighbourhood.json",
)


def _load_neighbourhood_words(path: Optional[str] = None) -> List[str]:
    """Load the bootstrap neighbourhood JSON. Returns [] on any failure.

    Args:
        path: JSON file to read; defaults to ``_NEIGHBOURHOOD_PATH``.

    Returns:
        The sorted, de-duplicated, lower-cased, whitespace-stripped string
        entries of the top-level ``banned_words`` list. An empty list is
        returned when the file is missing or unreadable, is not valid
        UTF-8 / JSON, its root is not an object, or ``banned_words`` is not
        a list. Non-string and empty entries are dropped.
    """
    source = _NEIGHBOURHOOD_PATH if path is None else path
    try:
        with open(source, "r", encoding="utf-8") as fh:
            data = json.load(fh)
    except (OSError, ValueError):
        # OSError covers missing/unreadable files; ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError. This runs at import
        # time, so a bad file must never take the audit module down.
        return []
    if not isinstance(data, dict):
        return []
    words = data.get("banned_words", [])
    if not isinstance(words, list):
        return []
    cleaned = {w.strip().lower() for w in words if isinstance(w, str)}
    cleaned.discard("")
    return sorted(cleaned)


_NEIGHBOURHOOD_WORDS: List[str] = _load_neighbourhood_words()
# Compile each neighbourhood word into a single regex that matches it only
# as a standalone token. Lookarounds are used instead of \b so the anchors
# still work for entries that begin or end with a non-word character; for
# ordinary words (including hyphenated ones such as "pre-modern") the two
# forms match exactly the same text.
def _compile_neighbourhood_pattern(word: str) -> re.Pattern:
    """Return a case-insensitive regex matching ``word`` as a whole token.

    The word is escaped literally. A match requires that the characters
    immediately before and after it, if any, are not word characters.
    """
    return re.compile(r"(?<!\w)" + re.escape(word) + r"(?!\w)", re.IGNORECASE)
# (compiled pattern, source word) pairs, built once at import time from the
# loaded neighbourhood word list.
_NEIGHBOURHOOD_COMPILED: List[Tuple[re.Pattern, str]] = [
    (_compile_neighbourhood_pattern(w), w) for w in _NEIGHBOURHOOD_WORDS
]
def neighbourhood_count() -> int:
    """How many neighbourhood words are loaded (for diagnostics)."""
    return len(_NEIGHBOURHOOD_WORDS)
# ───────────────────────────────────────────────────────────────────────────
# CORE CHECKS
# ───────────────────────────────────────────────────────────────────────────

# Runs of characters in the Hebrew block (U+0590–U+05FF). Compiled once at
# import time instead of on every audit_text call.
_HEBREW_RUN = re.compile(r"[\u0590-\u05FF]+")


def _make_hit(family: str, bl_ref: str, reason: str, match) -> Dict[str, Any]:
    """Build one hit record from a regex match object."""
    return {
        "family": family,
        "bl_ref": bl_ref,
        "reason": reason,
        "matched_text": match.group(0),
        "position": match.start(),
    }


def audit_text(text: str, *, skip_if_arabic: bool = True) -> Dict[str, Any]:
    """
    Check a single free-text string against the family patterns.

    Checks run in this order, and hits are appended in the same order (not
    sorted by position): Hebrew character runs, family patterns,
    neighbourhood words, bare 'arabic'.

    Args:
        text: the string to check; falsy or non-str values pass with no hits
        skip_if_arabic: if True, skip strings that are primarily Arabic
                        script (primary-source content is not audited)

    Returns:
        {
            'pass': bool,
            'hits': [
                {'family': str, 'bl_ref': str, 'reason': str,
                 'matched_text': str, 'position': int}, ...
            ],
        }
    """
    if not text or not isinstance(text, str):
        return {"pass": True, "hits": []}
    # NOTE(review): this early return also skips the Hebrew block below, so
    # a primarily-Arabic string containing Hebrew characters passes. The
    # Hebrew check is described as ABSOLUTE — confirm the ordering is wanted.
    if skip_if_arabic and _is_primarily_arabic(text):
        return {"pass": True, "hits": []}

    hits: List[Dict[str, Any]] = []

    # ── HEBREW CHARACTER BLOCK — ABSOLUTE (Session 46) ────────────────
    # Hebrew (U+0590-U+05FF) has NOTHING in common with AA.
    # Weights inject Hebrew alongside "abjad" — block at character level.
    hebrew_reason = (
        "Hebrew character(s) detected. AA abjad uses ARABIC letters "
        "only. Hebrew is a manufactured attachment — BLOCK."
    )
    hits.extend(
        _make_hit("hebrew_contamination", "BL-HEB", hebrew_reason, m)
        for m in _HEBREW_RUN.finditer(text)
    )

    # ── FAMILY PATTERNS ────────────────────────────────────────────────
    for pat, family, bl_ref, reason in _COMPILED:
        hits.extend(_make_hit(family, bl_ref, reason, m) for m in pat.finditer(text))

    # ── NEIGHBOURHOOD EXPANSION (Session 41) ──────────────────────────
    # Each word in `amr_banned_neighbourhood.json` is checked as a
    # standalone token. Hits are tagged "neighbourhood" so the caller
    # can distinguish them from family-pattern hits if needed.
    # Words already caught by a family regex above are not re-reported
    # (deduplication on (position, matched_text)).
    seen = {(h["position"], h["matched_text"].lower()) for h in hits}
    for pat, word in _NEIGHBOURHOOD_COMPILED:
        for m in pat.finditer(text):
            key = (m.start(), m.group(0).lower())
            if key in seen:
                continue
            seen.add(key)
            hits.append(_make_hit(
                "neighbourhood",
                "BL-NBH",
                (
                    f"'{word}' is in the banned-neighbourhood expansion of a "
                    f"BL family seed term. Loaded from "
                    f"amr_banned_neighbourhood.json."
                ),
                m,
            ))

    # ── BL04: bare 'arabic' (context-aware) ───────────────────────────
    # The word 'arabic' / 'Arabic' / 'ARABIC' is banned UNLESS preceded
    # immediately by an approved qualifier:
    #   "Allah's Arabic" / "Allahs Arabic" / "Lisan Arabic" / "Lisān Arabic"
    # Per CLAUDE.md the qualifier separates Allah's revealed language
    # from the degraded human form. Bare 'arabic' imports the Western
    # linguistic-family category (a "language belonging to a people")
    # which has no primary-source referent.
    #
    # ── SEMANTIC DEFAULT (Session 41) ─────────────────────────────────
    # Both "Allah's Arabic" and "Lisan Arabic" pass this syntactic check.
    # But the SEMANTIC default for any primary-source scholarly text is
    # Allah's Arabic. Educated scholars (al-Khwārizmī, Ibn Sīnā,
    # al-Bīrūnī, al-Farghānī, Kashgari, Ibn Khurdādhbih, Navoi) wrote in
    # Allah's Arabic, were educated in Allah's Arabic. Lisan Arabic is
    # the DEGRADED downstream form — use the LA label ONLY when
    # documenting operator-corrupted forms (qv_translation_register
    # quotes, DCR/NCR rows). NEVER default to LA for a scholar's
    # primary work. The audit cannot enforce this semantic rule with a
    # regex; it lives at the writer level. The single operational
    # principle: specifics dereference, categories hallucinate. If you
    # find yourself defaulting to LA without specific evidence of
    # degradation, you are sampling from weights instead of querying
    # the lattice.
    qualifiers = tuple(_ARABIC_ALLOWED_QUALIFIERS)
    for m in _ARABIC_BARE_PATTERN.finditer(text):
        # Look at the (up to) 20 characters immediately preceding the match,
        # lower-cased, with trailing whitespace and then hyphens removed, so
        # both "Allah's Arabic" and "Allah's-Arabic" count as qualified.
        window = text[max(0, m.start() - 20):m.start()].lower()
        if window.rstrip().rstrip("-").endswith(qualifiers):
            continue  # qualified — allowed
        hits.append(_make_hit(
            "bare-arabic",
            "BL04",
            "Bare 'arabic' is banned. Use 'Allah's Arabic' "
            "(divine, revealed, taught to Adam) or 'Lisan Arabic' "
            "(degraded human form) per CLAUDE.md.",
            m,
        ))

    return {"pass": not hits, "hits": hits}
# ───────────────────────────────────────────────────────────────────────────
# BARE 'arabic' pattern + qualifiers (BL04)
# ───────────────────────────────────────────────────────────────────────────
_ARABIC_BARE_PATTERN = re.compile(r"\barabic\b", re.IGNORECASE)

# Qualifiers that, when they immediately precede 'Arabic', make the
# usage allowed. Each qualifier is matched as the END of the preceding
# context (case-insensitive, after stripping trailing whitespace), so every
# entry here must be lower-case. Non-ASCII characters are written as escapes
# so a mis-encoded copy of this file cannot silently break the match.
_ARABIC_ALLOWED_QUALIFIERS = (
    "allah's",
    "allahs",
    "allah\u2019s",  # curly apostrophe (U+2019) variant
    "lisan",
    "lis\u0101n",    # lisān (a with macron, U+0101)
    "lisaan",
)
def audit_row(
    data: Dict[str, Any],
    table: Optional[str] = None,
    *,
    skip_fields: Optional[frozenset] = None,
    skip_tables: Optional[frozenset] = None,
) -> Dict[str, Any]:
    """
    Check every free-text field of a draft row against the family patterns.

    Only top-level ``str`` values of ``data`` are scanned; every other value
    type is ignored. Each scanned value goes through ``audit_text`` with its
    default settings.

    Args:
        data: the row data dict as passed to write_entry (None is treated
              as an empty row)
        table: target table name; if in QUARANTINE_TABLES or the
               caller-supplied skip_tables, the audit passes through
               unchanged (those tables hold contamination BY DESIGN)
        skip_fields: additional field names to skip beyond the default
                     SKIP_FIELDS set
        skip_tables: additional tables to skip beyond QUARANTINE_TABLES

    Returns:
        {
            'pass': bool,
            'failing_fields': {field_name: [hit, hit, ...]},
            'message': human-readable summary for the caller,
            'total_hits': int,
        }
        The quarantine pass-through result additionally carries
        'quarantine': True.
    """
    # ── QUARANTINE SKIP ───────────────────────────────────────────────
    # Tables that exist to document/quote contamination pass through
    # the audit unchanged. They NEED to hold the banned strings.
    exempt_tables = set(QUARANTINE_TABLES).union(skip_tables or ())
    if table and table in exempt_tables:
        return {
            "pass": True,
            "failing_fields": {},
            "message": f"dereference audit skipped — {table} is a quarantine table (holds contaminated content by design)",
            "total_hits": 0,
            "quarantine": True,
        }

    exempt_fields = set(SKIP_FIELDS).union(skip_fields or ())

    failing: Dict[str, List[Dict[str, Any]]] = {}
    for field, value in (data or {}).items():
        # Non-strings (numbers, None, bool, ...) have nothing to scan.
        if field in exempt_fields or not isinstance(value, str):
            continue
        result = audit_text(value)
        if not result["pass"]:
            failing[field] = result["hits"]

    if not failing:
        return {
            "pass": True,
            "failing_fields": {},
            "message": "dereference audit passed — no ungrounded-family tokens detected",
            "total_hits": 0,
        }

    total = sum(len(hits) for hits in failing.values())

    # Build a concise human-readable summary: one header line, then one
    # line per hit grouped by field.
    lines = [f"✗ DEREFERENCE AUDIT BLOCKED: {total} ungrounded token(s) across "
             f"{len(failing)} field(s). Rewrite with grounded vocabulary and retry."]
    for field, hits in failing.items():
        for h in hits:
            lines.append(
                f" [{field}] {h['family']} ({h['bl_ref']}): "
                f"matched '{h['matched_text']}' at pos {h['position']} — {h['reason']}"
            )
    return {
        "pass": False,
        "failing_fields": failing,
        "message": "\n".join(lines),
        "total_hits": total,
    }
# ───────────────────────────────────────────────────────────────────────────
# CLI — for ad-hoc testing and prose auditing
# ───────────────────────────────────────────────────────────────────────────
def _read_text_file(path: str) -> Optional[str]:
    """Read ``path`` as UTF-8; on failure print to stderr and return None."""
    try:
        with open(path, "r", encoding="utf-8") as fh:
            return fh.read()
    except (OSError, UnicodeDecodeError) as exc:
        print(f"error: cannot read {path!r}: {exc}", file=sys.stderr)
        return None


def _cli_audit_text(argv: List[str], *, mode: Optional[str] = None) -> int:
    """Audit text or a file from the command line and print the hits.

    Args:
        argv: remaining command-line arguments.
        mode: "text" joins all of ``argv`` with spaces and audits the
              result; "file" reads ``argv[0]`` as a UTF-8 file; None keeps
              the legacy guess (read ``argv[0]`` if it exists on disk,
              otherwise treat all of ``argv`` as text).

    Returns:
        0 when the text is clean, 1 when there are hits or no argument was
        given, 2 when the requested file cannot be read.
    """
    if not argv:
        print("Usage: python3 amr_dereference_audit.py <text_or_file>")
        return 1

    if mode == "file" or (mode is None and os.path.exists(argv[0])):
        text = _read_text_file(argv[0])
        if text is None:
            return 2
    else:
        text = " ".join(argv)

    # The CLI audits everything, including primarily-Arabic input.
    r = audit_text(text, skip_if_arabic=False)
    if r["pass"]:
        print("✓ clean — 0 hits")
        return 0
    print(f"✗ {len(r['hits'])} hit(s)")
    for h in r["hits"]:
        print(f" • {h['family']} ({h['bl_ref']}): '{h['matched_text']}' @ {h['position']}")
        print(f" → {h['reason']}")
    return 1


def main(argv: List[str]) -> int:
    """Command-line entry point; returns the process exit code.

    Subcommands: ``text <string>``, ``file <path>``, ``patterns``. Any other
    first argument makes the whole argument list be audited through the
    legacy text-or-file guess. With no arguments the usage is printed and 0
    is returned.
    """
    if not argv:
        print("amr_dereference_audit — USLaP dereference gate")
        print()
        print("Usage:")
        print(" python3 amr_dereference_audit.py text <string>")
        print(" python3 amr_dereference_audit.py file <path>")
        print(" python3 amr_dereference_audit.py patterns (list all family patterns)")
        return 0

    cmd, rest = argv[0], argv[1:]
    if cmd == "patterns":
        for pat, fam, bl, reason in FAMILY_PATTERNS:
            print(f" {fam:20s} {bl:8s} {pat}")
            print(f" → {reason}")
        return 0
    if cmd in ("text", "file"):
        return _cli_audit_text(rest, mode=cmd)
    # default: treat argv as text
    return _cli_audit_text(argv)


if __name__ == "__main__":
    raise SystemExit(main(sys.argv[1:]))