uslap-query / Code_files /archive /batch_runner_ru.py
uslap's picture
Upload folder using huggingface_hub
7cc8e29 verified
Raw
History Blame Contribute Delete
27.7 kB
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
USLaP Russian Batch Runner v1.0
بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ
Runs USLaP_Engine v3.0 (dual-language) in DRY-RUN mode against a Russian word list.
NO writes to the master Excel file. Discovery only.
Output:
- Batch Reports/RU_BATCH_REPORT_<timestamp>.json (full machine-readable results)
- Batch Reports/RU_BATCH_SUMMARY_<timestamp>.txt (human-readable summary)
Usage:
python3 batch_runner_ru.py # uses built-in 300-word list
python3 batch_runner_ru.py my_words.txt # uses your own word list (one word per line)
THREE-TIER OUTPUT SYSTEM:
ALREADY_IN_LATTICE — word already confirmed in A1_ЗАПИСИ (skip)
CONFIRMED_HIGH — score >= 8, Q+U pass, no R11 transposition
→ review before writing to A1_ЗАПИСИ
PENDING_REVIEW — score 5–7, OR transposition flag, OR ORIG2/Kashgari candidate
→ human judgment required
AUTO_REJECTED — score < 5 OR U-gate fail
→ discard at current analysis level
CLUSTER_BACKLOG — words discovered via cluster expansion
NOTE: Russia has >50% Bitig (ORIG2) influence. Many words will route to
PENDING_REVIEW as ORIG2 candidates requiring Kashgari attestation.
This is EXPECTED — not a failure. The Bitig track is the primary discovery
pathway for Russian.
"""
import sys
import os
import json
import io
import contextlib
from datetime import datetime
from pathlib import Path
# ─── PATH SETUP ───────────────────────────────────────────────────────────────
# Filesystem locations used by the runner.
# The default workspace is the original author's local folder; set the
# USLAP_WORKSPACE environment variable to run against a different location
# without editing this file.
_DEFAULT_WORKSPACE = "/Users/mmsetubal/Documents/USLaP workplace"

THIS_DIR = Path(__file__).parent  # directory containing this script
WORKSPACE = Path(os.environ.get("USLAP_WORKSPACE") or _DEFAULT_WORKSPACE)
MASTER_FILE = WORKSPACE / "USLaP_Final_Data_Consolidated_Master_v3.xlsx"
# Derived from WORKSPACE so the two can never drift apart.
OUTPUT_DIR = WORKSPACE / "Batch Reports"

# Put the script directory first on the import path; run_batch() imports
# USLaP_Engine by bare module name (presumably located next to this script).
sys.path.insert(0, str(THIS_DIR))
# ─── SUPPRESS ENGINE STDOUT ───────────────────────────────────────────────────
class _Suppress:
"""Context manager: silence stdout from engine, capture to string."""
def __enter__(self):
self._buf = io.StringIO()
self._redirect = contextlib.redirect_stdout(self._buf)
self._redirect.__enter__()
return self
def __exit__(self, *args):
self._redirect.__exit__(*args)
def text(self):
return self._buf.getvalue()
# ─── RUSSIAN WORD LIST ────────────────────────────────────────────────────────
# ~300 Russian words selected for QUF discovery.
# Covers: governance, military, trade, nature, body, household, food, crafts,
# religion, animals, clothing, science, time, family, society.
# Mix of suspected ORIG1 (Arabic) and ORIG2 (Bitig/Turkic) origins.
# Words already in A1_ЗАПИСИ will be caught by DEDUP and reported as EXISTING.
#
# NOTE: the literals below were restored from mis-decoded (mojibake) text to
# proper lowercase Cyrillic; a few words are intentionally listed in more than
# one section and are collapsed by the de-duplication step after the list.
RUSSIAN_WORD_LIST = [
    # ═══ GOVERNANCE + LAW ═══
    "закон", "власть", "правда", "суд", "правитель", "порядок",
    "держава", "престол", "вождь", "падишах", "султан", "эмир",
    "воевода", "дума", "указ", "ярлык", "грамота", "печать",
    "ханство", "улус", "бек", "мурза", "темник", "нойон",
    # ═══ MILITARY + WARFARE ═══
    "войско", "полк", "стража", "дозор", "засада",
    "кинжал", "сабля", "булат", "кольчуга", "щит",
    "знамя", "набег", "осада", "победа", "пленник",
    "десант", "гарнизон", "крепость", "бастион", "батарея",
    # ═══ TRADE + ECONOMY ═══
    "торговля", "цена", "долг", "прибыль", "рубль",
    "банк", "вексель", "процент", "залог", "пошлина",
    "лавка", "ярмарка", "барыш", "бакшиш", "дукат",
    "серебро", "золото", "жемчуг", "бирюза", "янтарь",
    # ═══ NATURE + GEOGRAPHY ═══
    "степь", "тайга", "тундра", "болото", "пустыня",
    "река", "озеро", "море", "гора", "долина",
    "камень", "глина", "песок", "соль", "нефть",
    "ветер", "буря", "гроза", "молния", "радуга",
    "лес", "поле", "сад", "роща", "овраг",
    # ═══ ANIMALS ═══
    "верблюд", "лошадь", "баран", "бык", "осёл",
    "соловей", "беркут", "сокол", "орёл", "журавль",
    "кабан", "барсук", "волк", "тигр", "рысь",
    "собака", "кошка", "ворон", "змея", "рыба",
    # ═══ BODY + HEALTH ═══
    "голова", "сердце", "кровь", "кость", "кожа",
    "глаз", "ухо", "рука", "нога", "палец",
    "кулак", "горло", "грудь", "живот", "спина",
    "рана", "лекарь", "врач", "больной", "яд",
    "бальзам", "мазь", "целитель", "жар", "смерть",
    # ═══ FOOD + DRINK ═══
    "плов", "лаваш", "шашлык", "хлеб", "мясо",
    "хурма", "нут", "рис", "мёд", "молоко",
    "чай", "вино", "сироп", "масло", "уксус",
    "перец", "тмин", "шафран", "корица", "имбирь",
    "йогурт", "каша", "суп", "соус", "лимон",
    # ═══ HOUSEHOLD + TOOLS ═══
    "ковёр", "диван", "табурет", "подушка", "зеркало",
    "кувшин", "чашка", "блюдо", "ложка", "нож",
    "самовар", "фонарь", "лампа", "свеча", "котёл",
    "замок", "ключ", "пила", "молоток", "топор",
    "балкон", "мансарда", "чердак", "подвал", "забор",
    # ═══ CLOTHING + TEXTILES ═══
    "кафтан", "чалма", "шаровары", "тулуп", "шуба",
    "платок", "пояс", "сапог", "войлок", "бархат",
    "шёлк", "хлопок", "парча", "тесьма", "нить",
    # ═══ RELIGION + FAITH ═══
    "намаз", "минбар", "хадж", "закят", "вакф",
    "муэдзин", "имам", "мулла", "дервиш", "суфий",
    "михраб", "масджид", "минарет", "купол", "мечеть",
    # ═══ SCIENCE + CRAFT ═══
    "алгебра", "цифра", "число", "мера", "весы",
    "зодчий", "каменщик", "гончар", "кузнец", "ткач",
    "чернила", "бумага", "книга", "печать", "буква",
    "астрономия", "химия", "геометрия", "медицина", "хирургия",
    # ═══ TIME + CALENDAR ═══
    "время", "час", "день", "ночь", "утро",
    "рассвет", "закат", "луна", "звезда", "солнце",
    "год", "месяц", "неделя", "пятница", "суббота",
    # ═══ FAMILY + SOCIETY ═══
    "отец", "мать", "брат", "сестра", "сын",
    "дочь", "жена", "муж", "семья", "род",
    "народ", "племя", "община", "сосед", "гость",
    "друг", "враг", "раб", "свободный", "мудрец",
    # ═══ ADDITIONAL HIGH-YIELD TERMS ═══
    # (suspected Arabic/Turkic that aren't in A1_ЗАПИСИ yet)
    "шахта", "маяк", "талант", "рецепт", "тюрбан",
    "гарем", "газета", "журнал", "автомат", "кибитка",
    "тархан", "курултай", "байрам", "аксакал", "батыр",
    "иман", "китаб", "джихад", "шариат", "фетва",
    "масло", "мастер", "ремесло", "рынок", "богатство",
    "душа", "разум", "совесть", "истина", "справедливость",
    "хозяин", "наместник", "посол", "договор", "мир",
    "казарма", "лазарет", "госпиталь", "аптека", "бальзам",
    "табак", "кальян", "хна", "мускус", "амбра",
    "арбалет", "пушка", "порох", "снаряд", "мушкет",
]

# Remove duplicates while preserving first-occurrence order
# (dict keys keep insertion order).
RUSSIAN_WORD_LIST = list(dict.fromkeys(RUSSIAN_WORD_LIST))
# ─── RESULT SERIALISER ────────────────────────────────────────────────────────
def serialise_result(word: str, result) -> dict:
    """Flatten an engine result object into a JSON-serialisable record.

    The record always carries the same set of keys; fields the engine did not
    populate stay at their placeholder values (``None``, ``False`` or ``0``).
    ``word`` is stored upper-cased and at most the first 20 cluster members
    are kept.
    """
    rec = dict(
        word=word.upper(),
        existing_entry_id=result.existing_entry_id,
        category=_categorise(result),
        score=None,
        root_letters=None,
        ar_word=None,
        phonetic_chain=None,
        positional_score=None,
        transposition_flag=False,
        extra_consonants=0,
        q_gate=None,
        u_gate=None,
        f_gate=None,
        orig2_track=getattr(result, 'orig2_track', False),
        orig2_details=None,
        cognate_crossref=None,   # v3.3: English↔Russian cognate data
        compound_parts=None,     # v3.4: compound word analysis
        sem_review=getattr(result, 'sem_review', False),  # v3.4
        cluster_members=result.cluster_members[:20],
        log_lines=result.log,
    )

    # v3.3: cognate cross-reference, copied field by field with defaults.
    cognate = getattr(result, 'cognate_crossref', None)
    if cognate:
        cognate_fields = (
            ("source", ''),
            ("en_cousin", ''),
            ("root_letters", ''),
            ("score", None),
            ("phonetic_chain", ''),
            ("variant_used", ''),
            ("word_form_used", ''),
            ("entry_id", None),
            ("note", ''),
        )
        rec["cognate_crossref"] = {
            key: cognate.get(key, fallback) for key, fallback in cognate_fields
        }

    # v3.4: compound analysis; "prefix" and "root" are passed through as-is
    # (dict or None).
    compound = getattr(result, 'compound_parts', None)
    if compound:
        rec["compound_parts"] = {
            "label": compound.get('label', ''),
            "bridge": compound.get('bridge', ''),
            "prefix": compound.get('prefix'),
            "root": compound.get('root'),
        }

    # ORIG2 details are only emitted when the result is on the ORIG2 track.
    if getattr(result, 'orig2_track', False) and getattr(result, 'orig2_details', None):
        details = result.orig2_details
        orig2_fields = (
            ("kashgari_translit", ''),
            ("kashgari_meaning", ''),
            ("kashgari_line", 0),
            ("attestation_type", ''),
            ("skeleton", ''),
            ("all_hits", 0),
            ("bitig_warnings", []),
        )
        rec["orig2_details"] = {
            key: details.get(key, fallback) for key, fallback in orig2_fields
        }

    if result.confirmed_root:
        root = result.confirmed_root
        rec.update(
            root_letters=root.letters,
            ar_word=root.ar_word,
            score=root.score,
            phonetic_chain=root.phonetic_chain,
            positional_score=getattr(root, 'positional_score', None),
            transposition_flag=getattr(root, 'transposition_flag', False),
            extra_consonants=getattr(root, 'extra_consonants', 0),
        )

    if result.q_gate:
        q_info = result.q_gate.details
        rec["q_gate"] = {
            "passed": result.q_gate.passed,
            "token_count": q_info.get("token_count", 0),
            "ar_word": q_info.get("ar_word", ""),
            "verse": q_info.get("verse", ""),
            "orig2_candidate": q_info.get("orig2_candidate", False),
        }

    if result.u_gate:
        rec["u_gate"] = {
            "passed": result.u_gate.passed,
            "phonetic_chain": result.u_gate.details.get("phonetic_chain", ""),
        }

    if result.f_gate:
        f_info = result.f_gate.details
        rec["f_gate"] = {
            "passed": result.f_gate.passed,
            "ds_code": f_info.get("ds_code", ""),
            "network_id": f_info.get("network_id", ""),
            "dp_codes": f_info.get("dp_codes", []),
        }

    return rec
def _categorise(result) -> str:
"""
Three-tier classification:
ALREADY_IN_LATTICE β€” already in A1_Π—ΠΠŸΠ˜Π‘Π˜
CONFIRMED_HIGH β€” score >= 8, Q+U pass, no R11 transposition
PENDING_REVIEW β€” score 5–7, or transposition, or ORIG2 match
AUTO_REJECTED β€” score < 5, or U-gate fail, or no root at all
"""
if result.existing_entry_id is not None:
return "ALREADY_IN_LATTICE"
# ORIG2 track: always PENDING_REVIEW (needs Kashgari verification)
if getattr(result, 'orig2_track', False):
return "PENDING_REVIEW"
if result.confirmed_root is None:
return "PENDING_REVIEW"
score = result.confirmed_root.score
q = result.q_gate.passed if result.q_gate else False
u = result.u_gate.passed if result.u_gate else False
trans = getattr(result.confirmed_root, 'transposition_flag', False)
if score >= 8 and q and u and not trans:
return "CONFIRMED_HIGH"
if score >= 5 and (q or u):
return "PENDING_REVIEW"
return "AUTO_REJECTED"
# ─── MAIN ─────────────────────────────────────────────────────────────────────
def _inline_status(rec: dict, result) -> str:
    """Build the console status text printed after a word's progress prefix."""

    def shown(value):
        # Record fields always exist but may hold None; show "?" instead.
        return "?" if value is None else value

    cat = rec["category"]
    root = shown(rec.get("root_letters"))
    score = shown(rec.get("score"))
    pos = rec.get("positional_score")
    pos_s = f" pos={pos:.2f}" if pos is not None else ""
    r11 = " ⚠R11" if rec.get("transposition_flag", False) else ""

    # v3.3: cognate suffix
    cog_rec = rec.get("cognate_crossref")
    cog_s = ""
    if cog_rec and cog_rec.get("source") == "EN_PIPELINE":
        cog_score = shown(cog_rec.get("score"))
        cog_s = f" ↔{cog_rec['en_cousin']}→{cog_rec['root_letters']}(s{cog_score})"
    elif cog_rec and cog_rec.get("source") == "LATTICE_ENTRY":
        cog_s = f" ↔LAT#{shown(cog_rec.get('entry_id'))}"

    # v3.4: compound suffix
    cp_rec = rec.get("compound_parts")
    cp_s = f" [{cp_rec['label']}]" if cp_rec and cp_rec.get("label") else ""

    if cat == "ALREADY_IN_LATTICE":
        return f" ✓ EXISTING #{result.existing_entry_id}"
    if cat == "CONFIRMED_HIGH":
        return f" ★ CONFIRMED root={root:<12} score={score}/10{pos_s}{cog_s}{cp_s}"
    if cat == "PENDING_REVIEW":
        if getattr(result, 'orig2_track', False):
            kd = getattr(result, 'orig2_details', {}) or {}
            kt = kd.get('kashgari_translit', '?')
            ka = kd.get('attestation_type', '?')
            return f" ◆ ORIG2 Kashgari='{kt}' ({ka}) score={score}/10{cog_s}{cp_s}"
        if result.confirmed_root is None:
            return " ~ PENDING (no ORIG1 root, no ORIG2 match)"
        return f" ~ PENDING root={root:<12} score={score}/10{pos_s}{r11}{cog_s}{cp_s}"
    return f" ✗ REJECTED root={root:<12} score={score}/10{r11}{cp_s}"


def run_batch(word_list: list, output_dir: Path) -> dict:
    """
    Run the engine in dry_run=True mode on every word and save the reports.

    For each word the engine result is serialised, bucketed by category and
    echoed to the console; cluster members reported by the engine are gathered
    into a backlog (minus the input words). A JSON report and a TXT summary
    are written to ``output_dir``, which is created if missing.

    Args:
        word_list: words to process, in order.
        output_dir: directory that receives the two report files.

    Returns:
        The report dict that was written as JSON. Words whose processing
        raised are listed under ``"errors"`` (and counted in
        ``summary["errors"]``); they are still included in ``total_words``.

    Exits the process with status 1 if the engine cannot be initialised.
    """
    print("Importing USLaP_Engine (v3.0 dual-language)...")
    with _Suppress():
        from USLaP_Engine import USLaPEngine

    print("Initialising engine with master file...")
    init_error = None
    with _Suppress() as captured:
        try:
            engine = USLaPEngine(master_file=str(MASTER_FILE), skip_reports=True)
        except Exception as exc:
            init_error = exc
    # Report the failure only AFTER stdout is restored; printing inside the
    # redirect would send the message into the capture buffer and lose it.
    if init_error is not None:
        print(f"\nERROR: Engine init failed: {init_error}")
        print(captured.text())
        sys.exit(1)
    print("Engine ready (v3.0 — EN+RU dual-language).\n")

    # Buckets
    results_by_cat = {
        "ALREADY_IN_LATTICE": [],
        "CONFIRMED_HIGH": [],
        "PENDING_REVIEW": [],
        "AUTO_REJECTED": [],
    }
    errors = []
    cluster_backlog = set()
    total = len(word_list)

    # Process loop
    for i, word in enumerate(word_list, 1):
        pct = (i / total) * 100
        print(f" [{i:>3}/{total}] {pct:>5.1f}% {word:<20}", end="", flush=True)

        failure = None
        with _Suppress():
            try:
                result = engine.process(word, dry_run=True)
            except Exception as exc:
                failure = exc
        if failure is not None:
            # Printed outside the redirect so the user sees it, and it also
            # terminates the progress line that was started with end="".
            print(f" ERROR: {failure}")
            errors.append({"word": word.upper(), "error": str(failure)})
            continue

        rec = serialise_result(word, result)
        results_by_cat[rec["category"]].append(rec)

        # Collect cluster discoveries
        for candidate in result.cluster_members:
            if isinstance(candidate, str):
                cluster_backlog.add(candidate.upper())

        print(_inline_status(rec, result))

    # Remove input + existing from cluster backlog
    input_upper = {w.upper() for w in word_list}
    existing_upper = {r["word"] for r in results_by_cat["ALREADY_IN_LATTICE"]}
    cluster_backlog -= input_upper
    cluster_backlog -= existing_upper

    # Build report (one clock reading so file names and run_date agree)
    now = datetime.now()
    timestamp = now.strftime("%Y%m%d_%H%M%S")
    report = {
        "run_date": now.isoformat(),
        "engine_version": "v3.0 (EN+RU dual-language + multi-candidate)",
        "language": "Russian (RU)",
        "master_file": str(MASTER_FILE),
        "total_words": total,
        "summary": {
            "already_in_lattice": len(results_by_cat["ALREADY_IN_LATTICE"]),
            "confirmed_high": len(results_by_cat["CONFIRMED_HIGH"]),
            "pending_review": len(results_by_cat["PENDING_REVIEW"]),
            "auto_rejected": len(results_by_cat["AUTO_REJECTED"]),
            "cluster_backlog": len(cluster_backlog),
            "errors": len(errors),
        },
        "already_in_lattice": results_by_cat["ALREADY_IN_LATTICE"],
        "confirmed_high": results_by_cat["CONFIRMED_HIGH"],
        "pending_review": results_by_cat["PENDING_REVIEW"],
        "auto_rejected": results_by_cat["AUTO_REJECTED"],
        "cluster_backlog": sorted(cluster_backlog),
        "errors": errors,
    }

    # Save JSON
    output_dir.mkdir(parents=True, exist_ok=True)
    json_path = output_dir / f"RU_BATCH_REPORT_{timestamp}.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(report, f, ensure_ascii=False, indent=2)
    print(f"\n JSON report saved: {json_path}")

    # Save TXT summary
    txt_path = output_dir / f"RU_BATCH_SUMMARY_{timestamp}.txt"
    _write_txt_summary(report, txt_path)
    print(f" TXT summary saved: {txt_path}")
    return report
def _write_txt_summary(report: dict, path: Path):
"""Write a human-readable Russian batch summary."""
s = report["summary"]
lines = [
"═" * 70,
" USLaP Russian Batch Runner v1.0 β€” Discovery Summary",
" بِسْمِ Ψ§Ω„Ω„ΩŽΩ‘Ω‡Ω Ψ§Ω„Ψ±ΩŽΩ‘Ψ­Ω’Ω…ΩŽΩ°Ω†Ω Ψ§Ω„Ψ±ΩŽΩ‘Ψ­ΩΩŠΩ…Ω",
" Language: Russian (RU) β€” >50% ORIG2 (Bitig/Turkic) expected",
"═" * 70,
f" Run date: {report['run_date']}",
f" Engine: {report.get('engine_version', 'v3.0')}",
f" Words run: {report['total_words']}",
"─" * 70,
f" βœ“ Already in A1_Π—ΠΠŸΠ˜Π‘Π˜: {s['already_in_lattice']:>4} (no action needed)",
f" β˜… CONFIRMED HIGH: {s['confirmed_high']:>4} ← review & write to A1_Π—ΠΠŸΠ˜Π‘Π˜",
f" ~ PENDING REVIEW: {s['pending_review']:>4} ← human QUF adjudication",
f" βœ— AUTO REJECTED: {s['auto_rejected']:>4} (U-gate fail or score < 5)",
f" + Cluster backlog: {s['cluster_backlog']:>4} (new words via root expansion)",
"─" * 70,
"",
" NOTE: High PENDING count is EXPECTED for Russian β€” many words are",
" ORIG2 (Bitig/Turkic) and need Kashgari attestation, not Q-gate.",
"",
" β˜… CONFIRMED HIGH β€” ORIG1 candidates (score β‰₯ 8, Q+U pass):",
"─" * 70,
]
for rec in report["confirmed_high"]:
root = rec.get("root_letters", "?")
score = rec.get("score", "?")
chain = rec.get("phonetic_chain", "?") or "β€”"
tokens = rec.get("q_gate", {}).get("token_count", "?") if rec.get("q_gate") else "?"
pos = rec.get("positional_score")
pos_s = f" pos={pos:.2f}" if pos is not None else ""
net = rec.get("f_gate", {}).get("network_id", "") if rec.get("f_gate") else ""
net_s = f" [{net}]" if net else ""
lines.append(
f" {rec['word']:<22} root={root:<12} score={score}/10 tokens={tokens}{pos_s}{net_s}"
)
lines.append(f" chain: {chain}")
# Split PENDING into ORIG2 and others
orig2_pending = [r for r in report["pending_review"] if r.get("orig2_track")]
other_pending = [r for r in report["pending_review"] if not r.get("orig2_track")]
if orig2_pending:
lines += [
"",
f" β—† ORIG2 (KASHGARI) MATCHES β€” {len(orig2_pending)} words attested in Bitig:",
"─" * 70,
]
for rec in orig2_pending:
od = rec.get("orig2_details", {}) or {}
kt = od.get("kashgari_translit", "?")
km = od.get("kashgari_meaning", "?")
kl = od.get("kashgari_line", "?")
ka = od.get("attestation_type", "?")
score = rec.get("score", "?")
warns = od.get("bitig_warnings", [])
warn_s = f" ⚠ {'; '.join(warns)}" if warns else ""
km_short = km[:50] + "..." if len(km) > 50 else km
lines.append(
f" {rec['word']:<20} Kashgari='{kt}' ({ka}, line {kl}) score={score}/10{warn_s}"
)
lines.append(f" meaning: \"{km_short}\"")
lines += [
"",
f" ~ PENDING REVIEW β€” {len(other_pending)} words need human QUF adjudication:",
"─" * 70,
]
for rec in other_pending:
root = rec.get("root_letters") or "NO ORIG1 ROOT"
score = rec.get("score", "?")
trans = rec.get("transposition_flag", False)
q_ok = rec.get("q_gate", {}).get("passed", False) if rec.get("q_gate") else False
u_ok = rec.get("u_gate", {}).get("passed", False) if rec.get("u_gate") else False
flags = []
if trans: flags.append("⚠R11-TRANSPOSITION")
if not q_ok: flags.append("Q-FAIL")
if not u_ok: flags.append("U-FAIL")
flag_s = " " + " | ".join(flags) if flags else ""
lines.append(f" {rec['word']:<22} root={root:<12} score={score}/10{flag_s}")
# Rejected
lines += [
"",
f" βœ— AUTO REJECTED β€” {len(report['auto_rejected'])} words:",
"─" * 70,
]
for rec in report["auto_rejected"]:
root = rec.get("root_letters") or "?"
score = rec.get("score", "?")
lines.append(f" {rec['word']:<22} root={root:<12} score={score}/10")
# Cluster backlog
lines += [
"",
" + CLUSTER BACKLOG β€” words discovered via root expansion:",
"─" * 70,
]
for w in sorted(report["cluster_backlog"]):
lines.append(f" {w}")
lines += [
"",
"═" * 70,
" NEXT STEPS:",
" 1. CONFIRMED_HIGH β†’ verify ROOT_ID + QUR_MEANING β†’ write to A1_Π—ΠΠŸΠ˜Π‘Π˜",
" 2. ORIG2 matches β†’ verify Kashgari attestation β†’ write to BITIG_A1_ENTRIES",
" 3. PENDING with Q-FAIL β†’ check Kashgari corpus (ORIG2 track)",
" 4. PENDING with ⚠R11 β†’ recheck phonetic chain (transposition)",
" 5. Cross-reference with English A1_ENTRIES for sibling entries",
" 6. CLUSTER_BACKLOG β†’ run batch_runner_ru again with these as input",
"═" * 70,
]
with open(path, "w", encoding="utf-8") as f:
f.write("\n".join(lines))
# ─── ENTRY POINT ──────────────────────────────────────────────────────────────
if __name__ == "__main__":
if not OUTPUT_DIR.exists():
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
if not MASTER_FILE.exists():
print(f"ERROR: Master file not found:\n {MASTER_FILE}")
sys.exit(1)
# Word source: CLI arg (custom file) or built-in list
if len(sys.argv) > 1:
custom_file = Path(sys.argv[1])
if not custom_file.exists():
print(f"ERROR: Word file not found: {custom_file}")
sys.exit(1)
with open(custom_file, "r", encoding="utf-8") as f:
word_list = [line.strip().lower() for line in f if line.strip()]
print(f"Loaded {len(word_list)} words from {custom_file.name}")
else:
word_list = RUSSIAN_WORD_LIST
print(f"Using built-in Russian word list: {len(word_list)} words")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Master file: {MASTER_FILE.name}")
print(f"Language: Russian (RU) β€” ORIG1 + ORIG2 dual-track")
print(f"Mode: DRY RUN (no writes to Excel)\n")
print("─" * 62)
report = run_batch(word_list, OUTPUT_DIR)
# Terminal summary
s = report["summary"]
print("\n" + "═" * 70)
print(" RUSSIAN BATCH COMPLETE β€” THREE-TIER SUMMARY (v1.0)")
print("═" * 70)
print(f" Words processed: {report['total_words']}")
print(f" βœ“ Already in lattice: {s['already_in_lattice']}")
print(f" β˜… CONFIRMED HIGH: {s['confirmed_high']} ← review & write to A1_Π—ΠΠŸΠ˜Π‘Π˜")
print(f" ~ PENDING REVIEW: {s['pending_review']} ← human QUF adjudication")
print(f" βœ— AUTO REJECTED: {s['auto_rejected']}")
print(f" + Cluster backlog: {s['cluster_backlog']} ← bonus discoveries")
print("═" * 70)
print("\n NOTE: For Russian, high PENDING is expected (>50% ORIG2/Bitig).")
print(" ORIG2 matches need Kashgari attestation β€” NOT Q-gate.")
print(" Open TXT summary for annotated review. JSON for machine-readable data.")