uslap-query / Code_files /archive /batch_runner.py
uslap's picture
Upload folder using huggingface_hub
7cc8e29 verified
Raw
History Blame Contribute Delete
25.9 kB
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
USLaP Batch Runner v2.0
بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ
Runs USLaP_Engine in DRY-RUN mode against a word list.
NO writes to the master Excel file. Discovery only.
Output:
- Batch Reports/BATCH_REPORT_<timestamp>.json (full machine-readable results)
- Batch Reports/BATCH_SUMMARY_<timestamp>.txt (human-readable summary)
Usage:
python3 batch_runner.py # uses built-in 500-word list
python3 batch_runner.py my_words.txt # uses your own word list (one word per line)
THREE-TIER OUTPUT SYSTEM (v2 — corrected per USLaP_BATCH_ENGINE_PROTOCOL):
ALREADY_IN_LATTICE — word already confirmed in lattice (skip — no action needed)
CONFIRMED_HIGH — score >= 8, Q+U pass, no R11 transposition flag
→ highest-confidence candidates, review before writing to A1_ENTRIES
PENDING_REVIEW — score 5–7, OR transposition flag detected (R11),
OR no ORIG1 root found (possible ORIG2 / Kashgari candidate)
→ human judgment required before any write
AUTO_REJECTED — score < 5 OR U-gate fail (consonant(s) unaccounted for)
→ discard; not a lattice candidate at current analysis level
CLUSTER_BACKLOG — words discovered via cluster expansion of CONFIRMED_HIGH roots
(engine found these, not in your input list — bonus discoveries)
Key improvements over v1:
- R11 transposition detection: roots assigned via semantic pull, not phonetic order,
are automatically demoted to PENDING_REVIEW regardless of score.
- N15 skeleton priority (R09): C/G/K-R-N pattern forces ق-ر-ن check first.
- M-prefix parallel path (R08a): words starting with M also tested with مُ stripped.
- Token count weight reduced (was 3 pts, now 1 pt) — eliminates semantic-first bias.
- Positional fidelity now contributes 2 pts β€” correct consonant ORDER rewarded.
"""
import sys
import os
import json
import io
import contextlib
from datetime import datetime
from pathlib import Path
# ─── PATH SETUP ───────────────────────────────────────────────────────────────
# Directory containing this script; USLaP_Engine is imported from here.
THIS_DIR = Path(__file__).parent
# Workspace root that holds the master workbook and the report folder.
# The USLAP_WORKSPACE environment variable, when set, overrides the default
# location so the script is not tied to a single user's home directory.
WORKSPACE = Path(
    os.environ.get("USLAP_WORKSPACE", "/Users/mmsetubal/Documents/USLaP workplace")
)
MASTER_FILE = WORKSPACE / "USLaP_Final_Data_Consolidated_Master_v3.xlsx"
# Derived from WORKSPACE rather than repeating the absolute path, so the two
# locations can never drift apart.
OUTPUT_DIR = WORKSPACE / "Batch Reports"
# Add this folder to sys.path so we can import USLaP_Engine
sys.path.insert(0, str(THIS_DIR))
# ─── SUPPRESS ENGINE STDOUT ───────────────────────────────────────────────────
# The engine prints detailed logs per word. In batch mode we suppress this
# and only capture the structured ProcessResult.
class _Suppress:
"""Context manager: silence stdout from engine, capture to string."""
def __enter__(self):
self._buf = io.StringIO()
self._redirect = contextlib.redirect_stdout(self._buf)
self._redirect.__enter__()
return self
def __exit__(self, *args):
self._redirect.__exit__(*args)
def text(self):
return self._buf.getvalue()
# ─── 500 COMMON ENGLISH WORDS (content words only) ───────────────────────────
# Curated from Oxford 3000. Function words excluded (engine already filters them).
# Covers: law, governance, nature, body, family, time, science, society, trade.
# These are prioritised for QUF discovery β€” they carry high civilisational weight.
WORD_LIST_500 = [
    # GOVERNANCE + LAW
    "rule","govern","law","order","judge","court","justice","crime","punish",
    "king","queen","lord","master","servant","minister","council","senate",
    "nation","state","power","authority","command","control","force","guard",
    "prison","exile","rebel","conquer","empire","colony","treaty","border",
    "tax","tribute","debt","coin","market","trade","merchant","contract",
    # NATURE + COSMOS
    "star","sun","moon","cloud","rain","wind","storm","thunder","lightning",
    "river","sea","ocean","mountain","desert","valley","plain","island","cave",
    "earth","fire","water","air","stone","rock","sand","dust","ice","snow",
    "gold","silver","iron","copper","lead","salt","oil","glass","clay",
    "tree","root","branch","leaf","flower","fruit","seed","grain","corn",
    "forest","field","garden","harvest","soil","shadow","light","dark",
    # BODY + MEDICINE
    "head","face","eye","ear","nose","mouth","tongue","tooth","jaw","neck",
    "shoulder","arm","hand","finger","chest","heart","lung","liver","kidney",
    "stomach","blood","bone","skin","muscle","nerve","brain","spine","heel",
    "wound","fever","cure","medicine","doctor","patient","pain","death",
    "birth","grow","breathe","sleep","dream","wake","hunger","thirst",
    # FAMILY + SOCIETY
    "father","mother","brother","sister","son","daughter","child","family",
    "husband","wife","marriage","widow","orphan","elder","youth","ancestor",
    "tribe","clan","village","city","people","crowd","stranger","neighbor",
    "friend","enemy","guest","host","servant","slave","soldier","priest",
    # FAITH + RITUAL
    "prayer","fast","pilgrimage","sacrifice","offering","altar","temple",
    "sign","miracle","prophet","messenger","angel","spirit","soul","mercy",
    "grace","blessing","curse","judgment","heaven","paradise","fire","torment",
    "faith","trust","peace","truth","wisdom","knowledge","reason","conscience",
    # TIME + CYCLE
    "year","month","week","season","morning","evening","night","dawn","dusk",
    "past","present","future","moment","hour","age","era","generation","century",
    "beginning","end","return","repeat","cycle","calendar","feast","fast",
    # SCIENCE + CRAFT
    "number","count","measure","weight","balance","scale","ratio","angle",
    "circle","square","triangle","center","point","line","surface","volume",
    "medicine","surgery","fever","poison","antidote","herb","compound","formula",
    "metal","forge","weapon","armor","shield","sword","arrow","bow","spear",
    "ship","sail","navigation","compass","horizon","current","anchor","port",
    "road","bridge","gate","wall","tower","palace","prison","market","temple",
    "ink","paper","book","script","seal","letter","word","name","language",
    # TRADE + ECONOMY
    "price","value","profit","loss","interest","loan","pledge","property",
    "buy","sell","exchange","barter","warehouse","caravan","merchant","journey",
    "silk","cotton","wool","leather","spice","pepper","sugar","honey","grain",
    "measure","standard","weight","balance","account","register","archive",
    # COMMON VERBS (high QUF potential)
    "create","form","shape","build","destroy","break","open","close","give",
    "take","bring","send","carry","move","turn","rise","fall","enter","leave",
    "speak","hear","see","know","think","remember","forget","learn","teach",
    "rule","judge","punish","reward","protect","attack","defend","gather",
    "divide","separate","connect","cover","reveal","hide","mark","name","call",
    # QUALITIES + CONDITIONS
    "holy","sacred","pure","clean","corrupt","evil","good","right","wrong",
    "strong","weak","brave","coward","wise","foolish","rich","poor","free","slave",
    "near","far","high","low","deep","shallow","large","small","full","empty",
    "sharp","heavy","light","hard","soft","rough","smooth","clear","dark",
    "new","ancient","first","last","same","different","true","false","certain",
    # MOVEMENT + CONFLICT
    "war","battle","victory","defeat","conquest","siege","retreat","escape",
    "march","advance","surrender","peace","alliance","rebellion","revolution",
    "migration","exile","refuge","settlement","frontier","territory","domain",
    # ADDITIONAL HIGH-PRIORITY TERMS
    "origin","source","root","branch","decay","corruption","restoration",
    "revelation","scripture","verse","chapter","recitation","preservation",
    "throne","crown","scepter","robe","banner","seal","ring","chain",
    "martyr","witness","testimony","covenant","promise","oath","vow",
    "treasury","archive","register","record","history","tradition","custom",
    "census","survey","map","route","distance","direction","boundary",
    "mission","message","envoy","ambassador","spy","agent","network",
    "algebra","algorithm","cipher","secret","code","key","puzzle","riddle",
]
# Remove duplicates while preserving order.  The thematic groups above overlap
# (e.g. "servant", "fire", "fast"), so only the first occurrence of each word
# is kept.  dict.fromkeys keeps first-seen order and avoids both a
# side-effecting comprehension and a stray module-level helper set.
WORD_LIST_500 = list(dict.fromkeys(WORD_LIST_500))
# ─── RESULT SERIALISER ────────────────────────────────────────────────────────
def serialise_result(word: str, result) -> dict:
    """Flatten an engine result object into a plain dict for the JSON report.

    The record always carries the same keys; fields that depend on a
    confirmed root, a gate result or ORIG2 details keep their neutral
    defaults when the corresponding attribute on ``result`` is missing or
    falsy.  ``cluster_members`` is capped at the first 20 items.
    """
    on_orig2_track = getattr(result, 'orig2_track', False)

    # Key order is kept stable because it determines the JSON layout.
    rec = {
        "word": word.upper(),
        "existing_entry_id": result.existing_entry_id,
        "category": _categorise(result),
        "score": None,
        "root_letters": None,
        "ar_word": None,
        "phonetic_chain": None,
        "positional_score": None,      # R11: consonant order fidelity (0.0–1.0)
        "transposition_flag": False,   # R11: True = consonant order inverted
        "extra_consonants": 0,         # Coverage: word consonants not in root
        "q_gate": None,
        "u_gate": None,
        "f_gate": None,
        "orig2_track": on_orig2_track,
        "orig2_details": None,
        "cluster_members": result.cluster_members[:20],
        "log_lines": result.log,
    }

    # ORIG2 details are copied only when the track is active and details exist.
    if on_orig2_track and getattr(result, 'orig2_details', None):
        details = result.orig2_details
        rec["orig2_details"] = {
            "kashgari_translit": details.get('kashgari_translit', ''),
            "kashgari_meaning": details.get('kashgari_meaning', ''),
            "kashgari_line": details.get('kashgari_line', 0),
            "attestation_type": details.get('attestation_type', ''),
            "skeleton": details.get('skeleton', ''),
            "all_hits": details.get('all_hits', 0),
            "bitig_warnings": details.get('bitig_warnings', []),
        }

    root = result.confirmed_root
    if root:
        # update() on existing keys overwrites in place, preserving key order.
        rec.update(
            root_letters=root.letters,
            ar_word=root.ar_word,
            score=root.score,
            phonetic_chain=root.phonetic_chain,
            positional_score=getattr(root, 'positional_score', None),
            transposition_flag=getattr(root, 'transposition_flag', False),
            extra_consonants=getattr(root, 'extra_consonants', 0),
        )

    q_gate = result.q_gate
    if q_gate:
        q_info = q_gate.details
        rec["q_gate"] = {
            "passed": q_gate.passed,
            "token_count": q_info.get("token_count", 0),
            "ar_word": q_info.get("ar_word", ""),
            "verse": q_info.get("verse", ""),
            "orig2_candidate": q_info.get("orig2_candidate", False),
        }

    u_gate = result.u_gate
    if u_gate:
        rec["u_gate"] = {
            "passed": u_gate.passed,
            "phonetic_chain": u_gate.details.get("phonetic_chain", ""),
        }

    f_gate = result.f_gate
    if f_gate:
        f_info = f_gate.details
        rec["f_gate"] = {
            "passed": f_gate.passed,
            "ds_code": f_info.get("ds_code", ""),
            "network_id": f_info.get("network_id", ""),
            "dp_codes": f_info.get("dp_codes", []),
        }

    return rec
def _categorise(result) -> str:
"""
Three-tier classification (v2.2):
ALREADY_IN_LATTICE β€” already confirmed
CONFIRMED_HIGH β€” score >= 8, Q+U pass, no R11 transposition
PENDING_REVIEW β€” score 5–7, or transposition, or no ORIG1 root,
or ORIG2 Kashgari match (always needs human review)
AUTO_REJECTED β€” score < 5, or U-gate fail, or no root at all
"""
if result.existing_entry_id is not None:
return "ALREADY_IN_LATTICE"
# ORIG2 track: always PENDING_REVIEW (human must verify Kashgari attestation)
if getattr(result, 'orig2_track', False):
return "PENDING_REVIEW"
if result.confirmed_root is None:
# No ORIG1 root AND no ORIG2 match β€” flag as PENDING_REVIEW for manual check
return "PENDING_REVIEW"
score = result.confirmed_root.score
q = result.q_gate.passed if result.q_gate else False
u = result.u_gate.passed if result.u_gate else False
trans = getattr(result.confirmed_root, 'transposition_flag', False)
# CONFIRMED_HIGH: strong phonetic + gate evidence, no transposition
if score >= 8 and q and u and not trans:
return "CONFIRMED_HIGH"
# PENDING_REVIEW: partial evidence β€” needs human QUF adjudication
if score >= 5 and (q or u):
return "PENDING_REVIEW"
# AUTO_REJECTED: insufficient evidence or consonant accounting failure
return "AUTO_REJECTED"
# ─── MAIN ─────────────────────────────────────────────────────────────────────
def run_batch(word_list: list, output_dir: Path) -> dict:
    """
    Run the engine in dry_run=True mode on every word in word_list.

    Each word is processed with engine stdout suppressed, serialised, and
    filed under its category.  Words whose processing raises are reported on
    the terminal and recorded under the report's ``errors`` key instead of
    being dropped silently.

    Args:
        word_list: Words to process, in order.
        output_dir: Directory that receives BATCH_REPORT_<timestamp>.json and
            BATCH_SUMMARY_<timestamp>.txt; created if missing.

    Returns:
        The full report dict (the same structure that is written as JSON).

    Raises:
        SystemExit: with status 1 when the engine cannot be initialised.
    """
    # — Import engine (suppress its startup prints)
    print("Importing USLaP_Engine...")
    with _Suppress():
        from USLaP_Engine import USLaPEngine

    print("Initialising engine with master file...")
    init_error = None
    with _Suppress() as s:
        try:
            engine = USLaPEngine(master_file=str(MASTER_FILE), skip_reports=True)
        except Exception as e:
            init_error = e
    # Report outside the redirect: printing inside the `with` block would send
    # the error message into the suppression buffer and the user would see
    # nothing before the exit.
    if init_error is not None:
        print(f"\nERROR: Engine init failed: {init_error}")
        print(s.text())
        sys.exit(1)
    print("Engine ready.\n")

    # — Buckets (three-tier v2)
    results_by_cat = {
        "ALREADY_IN_LATTICE": [],
        "CONFIRMED_HIGH": [],
        "PENDING_REVIEW": [],
        "AUTO_REJECTED": [],
    }
    cluster_backlog = set()  # words discovered via cluster expansion
    errors = []              # words whose processing raised
    total = len(word_list)

    # — Process loop
    for i, word in enumerate(word_list, 1):
        pct = (i / total) * 100
        print(f" [{i:>3}/{total}] {pct:>5.1f}% {word:<20}", end="", flush=True)

        result = None
        failure = None
        with _Suppress():
            try:
                result = engine.process(word, dry_run=True)
            except Exception as e:
                failure = e
        if failure is not None:
            # Printed after the redirect is lifted so the message is visible
            # and terminates the progress line.
            print(f" ERROR: {failure}")
            errors.append({"word": word.upper(), "error": str(failure)})
            continue

        rec = serialise_result(word, result)
        cat = rec["category"]
        results_by_cat[cat].append(rec)

        # Collect cluster discoveries
        cluster_backlog.update(
            candidate.upper()
            for candidate in result.cluster_members
            if isinstance(candidate, str)
        )

        # Inline status.  The record always contains these keys (possibly
        # None), so the placeholder has to be applied on None, not on absence.
        root = "?" if rec["root_letters"] is None else str(rec["root_letters"])
        score = "?" if rec["score"] is None else rec["score"]
        trans = rec.get("transposition_flag", False)
        pos = rec.get("positional_score")
        pos_s = f" pos={pos:.2f}" if pos is not None else ""
        r11 = " ⚠R11" if trans else ""

        if cat == "ALREADY_IN_LATTICE":
            print(f" ✓ EXISTING #{result.existing_entry_id}")
        elif cat == "CONFIRMED_HIGH":
            print(f" ★ CONFIRMED root={root:<12} score={score}/10{pos_s}")
        elif cat == "PENDING_REVIEW":
            if getattr(result, 'orig2_track', False):
                # ORIG2 (Kashgari) match found
                kd = getattr(result, 'orig2_details', {}) or {}
                kt = kd.get('kashgari_translit', '?')
                ka = kd.get('attestation_type', '?')
                print(f" ◆ ORIG2 Kashgari='{kt}' ({ka}) score={score}/10")
            elif result.confirmed_root is None:
                print(" ~ PENDING (no ORIG1 root, no ORIG2 match)")
            else:
                print(f" ~ PENDING root={root:<12} score={score}/10{pos_s}{r11}")
        else:
            print(f" ✗ REJECTED root={root:<12} score={score}/10{r11}")

    # — Remove input words and already-existing terms from cluster backlog
    input_upper = {w.upper() for w in word_list}
    existing_upper = {r["word"] for r in results_by_cat["ALREADY_IN_LATTICE"]}
    cluster_backlog -= input_upper
    cluster_backlog -= existing_upper

    # — Build final report (three-tier v2)
    # One clock reading, so the file-name stamp and run_date always agree.
    now = datetime.now()
    timestamp = now.strftime("%Y%m%d_%H%M%S")
    report = {
        "run_date": now.isoformat(),
        "engine_version": "v2.0 (R09/R10/R11/R08a)",
        "master_file": str(MASTER_FILE),
        "total_words": total,
        "summary": {
            "already_in_lattice": len(results_by_cat["ALREADY_IN_LATTICE"]),
            "confirmed_high": len(results_by_cat["CONFIRMED_HIGH"]),
            "pending_review": len(results_by_cat["PENDING_REVIEW"]),
            "auto_rejected": len(results_by_cat["AUTO_REJECTED"]),
            "cluster_backlog": len(cluster_backlog),
            "errors": len(errors),
        },
        "already_in_lattice": results_by_cat["ALREADY_IN_LATTICE"],
        "confirmed_high": results_by_cat["CONFIRMED_HIGH"],
        "pending_review": results_by_cat["PENDING_REVIEW"],
        "auto_rejected": results_by_cat["AUTO_REJECTED"],
        "cluster_backlog": sorted(cluster_backlog),
        "errors": errors,
    }

    # — Save JSON
    output_dir.mkdir(parents=True, exist_ok=True)
    json_path = output_dir / f"BATCH_REPORT_{timestamp}.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(report, f, ensure_ascii=False, indent=2)
    print(f"\n JSON report saved: {json_path}")

    # — Save human-readable TXT summary
    txt_path = output_dir / f"BATCH_SUMMARY_{timestamp}.txt"
    _write_txt_summary(report, txt_path)
    print(f" TXT summary saved: {txt_path}")
    return report
def _write_txt_summary(report: dict, path: Path):
"""Write a human-readable summary TXT (three-tier v2)."""
s = report["summary"]
lines = [
"═" * 70,
" USLaP Batch Runner v2.0 β€” Discovery Summary",
" بِسْمِ Ψ§Ω„Ω„ΩŽΩ‘Ω‡Ω Ψ§Ω„Ψ±ΩŽΩ‘Ψ­Ω’Ω…ΩŽΩ°Ω†Ω Ψ§Ω„Ψ±ΩŽΩ‘Ψ­ΩΩŠΩ…Ω",
"═" * 70,
f" Run date: {report['run_date']}",
f" Engine: {report.get('engine_version', 'v2.0')}",
f" Words run: {report['total_words']}",
"─" * 70,
f" βœ“ Already in lattice: {s['already_in_lattice']:>4} (no action needed)",
f" β˜… CONFIRMED HIGH: {s['confirmed_high']:>4} ← review & write to A1_ENTRIES",
f" ~ PENDING REVIEW: {s['pending_review']:>4} ← human QUF adjudication required",
f" βœ— AUTO REJECTED: {s['auto_rejected']:>4} (U-gate fail or score < 5)",
f" + Cluster backlog: {s['cluster_backlog']:>4} (new words found via root expansion)",
"─" * 70,
"",
" β˜… CONFIRMED HIGH β€” review these first (score β‰₯ 8, Q+U pass, no R11 transposition):",
"─" * 70,
]
for rec in report["confirmed_high"]:
root = rec.get("root_letters", "?")
score = rec.get("score", "?")
chain = rec.get("phonetic_chain", "?") or "β€”"
tokens = rec.get("q_gate", {}).get("token_count", "?") if rec.get("q_gate") else "?"
pos = rec.get("positional_score")
pos_s = f" pos={pos:.2f}" if pos is not None else ""
net = rec.get("f_gate", {}).get("network_id", "") if rec.get("f_gate") else ""
net_s = f" [{net}]" if net else ""
lines.append(
f" {rec['word']:<22} root={root:<12} score={score}/10 tokens={tokens}{pos_s}{net_s}"
)
lines.append(f" chain: {chain}")
# Split PENDING into ORIG2 matches and regular PENDING
orig2_pending = [r for r in report["pending_review"] if r.get("orig2_track")]
other_pending = [r for r in report["pending_review"] if not r.get("orig2_track")]
if orig2_pending:
lines += [
"",
f" β—† ORIG2 (KASHGARI) MATCHES β€” {len(orig2_pending)} words attested in Bitig:",
"─" * 70,
]
for rec in orig2_pending:
od = rec.get("orig2_details", {}) or {}
kt = od.get("kashgari_translit", "?")
km = od.get("kashgari_meaning", "?")
kl = od.get("kashgari_line", "?")
ka = od.get("attestation_type", "?")
score = rec.get("score", "?")
warns = od.get("bitig_warnings", [])
warn_s = f" ⚠ {'; '.join(warns)}" if warns else ""
# Truncate meaning to 50 chars
km_short = km[:50] + "..." if len(km) > 50 else km
lines.append(
f" {rec['word']:<20} Kashgari='{kt}' ({ka}, line {kl}) score={score}/10{warn_s}"
)
lines.append(f" meaning: \"{km_short}\"")
lines += [
"",
f" ~ PENDING REVIEW β€” {len(other_pending)} words need human QUF adjudication:",
"─" * 70,
]
for rec in other_pending:
root = rec.get("root_letters") or "NO ORIG1 ROOT"
score = rec.get("score", "?")
trans = rec.get("transposition_flag", False)
q_ok = rec.get("q_gate", {}).get("passed", False) if rec.get("q_gate") else False
u_ok = rec.get("u_gate", {}).get("passed", False) if rec.get("u_gate") else False
flags = []
if trans: flags.append("⚠R11-TRANSPOSITION")
if not q_ok: flags.append("Q-FAIL (no ORIG1 or ORIG2)")
if not u_ok: flags.append("U-FAIL")
flag_s = " " + " | ".join(flags) if flags else ""
lines.append(f" {rec['word']:<22} root={root:<12} score={score}/10{flag_s}")
lines += [
"",
" + CLUSTER BACKLOG β€” words discovered via root expansion",
" (not in input list β€” found by the engine itself):",
"─" * 70,
]
for w in sorted(report["cluster_backlog"]):
lines.append(f" {w}")
lines += [
"",
"═" * 70,
" NEXT STEPS:",
" 1. Take CONFIRMED_HIGH entries β†’ confirm ROOT_ID + QUR_MEANING manually",
" 2. Run engine.process(word, dry_run=False) for approved CONFIRMED_HIGH words",
" 3. For PENDING_REVIEW with Q-FAIL: check Kashgari corpus (ORIG2 track)",
" 4. For PENDING_REVIEW with ⚠R11: recheck phonetic chain β€” transposition likely",
" 5. CLUSTER_BACKLOG: run batch_runner again with these as input for next cycle",
"═" * 70,
]
with open(path, "w", encoding="utf-8") as f:
f.write("\n".join(lines))
# ─── ENTRY POINT ──────────────────────────────────────────────────────────────
def main() -> None:
    """Command-line entry point: pick a word list, run the batch, print a summary.

    With a file path as the first argument, words are read from that file
    (one per line; lines that are not purely alphabetic are skipped, and
    repeated words are processed once).  With no argument, or with the
    ``--dry-summary`` flag, the built-in list is used.

    Raises:
        SystemExit: with status 1 when the master file or the given word
            file does not exist.
    """
    # mkdir(exist_ok=True) already tolerates an existing directory, so no
    # separate exists() check is needed.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Check master file
    if not MASTER_FILE.exists():
        print(f"ERROR: Master file not found:\n {MASTER_FILE}")
        sys.exit(1)

    # Word source: CLI arg (custom file) or built-in list.
    # NOTE(review): "--dry-summary" is accepted here only so it is not
    # mistaken for a file name; no other handling of it is visible in this
    # script — confirm whether it is still needed.
    if len(sys.argv) > 1 and sys.argv[1] not in ("--dry-summary",):
        custom_file = Path(sys.argv[1])
        if not custom_file.exists():
            print(f"ERROR: Word file not found: {custom_file}")
            sys.exit(1)
        with open(custom_file, "r", encoding="utf-8") as f:
            stripped = (line.strip() for line in f)
            # dict.fromkeys drops repeats while keeping first-seen order,
            # matching how the built-in list is de-duplicated.
            word_list = list(
                dict.fromkeys(w.lower() for w in stripped if w.isalpha())
            )
        print(f"Loaded {len(word_list)} words from {custom_file.name}")
    else:
        word_list = WORD_LIST_500
        print(f"Using built-in word list: {len(word_list)} words")

    print(f"Output directory: {OUTPUT_DIR}")
    print(f"Master file: {MASTER_FILE.name}")
    print("Mode: DRY RUN (no writes to Excel)\n")
    print("─" * 62)

    report = run_batch(word_list, OUTPUT_DIR)

    # — Print terminal summary
    s = report["summary"]
    print("\n" + "═" * 70)
    print(" BATCH COMPLETE — THREE-TIER SUMMARY (v2)")
    print("═" * 70)
    print(f" Words processed: {report['total_words']}")
    print(f" ✓ Already in lattice: {s['already_in_lattice']}")
    print(f" ★ CONFIRMED HIGH: {s['confirmed_high']} ← review & write to A1_ENTRIES")
    print(f" ~ PENDING REVIEW: {s['pending_review']} ← human QUF adjudication")
    print(f" ✗ AUTO REJECTED: {s['auto_rejected']}")
    print(f" + Cluster backlog: {s['cluster_backlog']} ← bonus discoveries")
    print("═" * 70)
    print("\n Open the TXT summary for the full annotated review list.")
    print(" Open the JSON report for machine-readable detail.")
    print(" PENDING_REVIEW with Q-FAIL → check Kashgari corpus (ORIG2 track).")
    print(" PENDING_REVIEW with ⚠R11 → recheck phonetic chain (transposition).")


if __name__ == "__main__":
    main()