Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| # -*- coding: utf-8 -*- | |
"""
USLaP Russian Batch Runner v1.0
بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ

Runs USLaP_Engine v3.0 (dual-language) in DRY-RUN mode against a Russian word list.
NO writes to the master Excel file. Discovery only.

Output:
  - Batch Reports/RU_BATCH_REPORT_<timestamp>.json   (full machine-readable results)
  - Batch Reports/RU_BATCH_SUMMARY_<timestamp>.txt   (human-readable summary)

Usage:
  python3 batch_runner_ru.py                # uses built-in 300-word list
  python3 batch_runner_ru.py my_words.txt   # uses your own word list (one word per line)

THREE-TIER OUTPUT SYSTEM:
  ALREADY_IN_LATTICE — word already confirmed in A1_ЗАПИСИ (skip)
  CONFIRMED_HIGH     — score >= 8, Q+U pass, no R11 transposition
                       → review before writing to A1_ЗАПИСИ
  PENDING_REVIEW     — score 5–7, OR transposition flag, OR ORIG2/Kashgari candidate
                       → human judgment required
  AUTO_REJECTED      — score < 5 OR U-gate fail
                       → discard at current analysis level
  CLUSTER_BACKLOG    — words discovered via cluster expansion

NOTE: Russia has >50% Bitig (ORIG2) influence. Many words will route to
PENDING_REVIEW as ORIG2 candidates requiring Kashgari attestation.
This is EXPECTED — not a failure. The Bitig track is the primary discovery
pathway for Russian.
"""
| import sys | |
| import os | |
| import json | |
| import io | |
| import contextlib | |
| from datetime import datetime | |
| from pathlib import Path | |
# ─── PATH SETUP ──────────────────────────────────────────────────────────────
# Directory that contains this script ("Code_files/"). It is pushed onto
# sys.path below, presumably so USLaP_Engine can be imported from there.
THIS_DIR = Path(__file__).parent

# Workspace root. Defaults to the original author's location; set the
# USLAP_WORKSPACE environment variable to run the batch from another machine
# or folder without editing this file.
WORKSPACE = Path(
    os.environ.get("USLAP_WORKSPACE", "/Users/mmsetubal/Documents/USLaP workplace")
)

# Master workbook read by the engine, and the folder that receives the reports.
# Both are derived from WORKSPACE so the three paths can never drift apart.
MASTER_FILE = WORKSPACE / "USLaP_Final_Data_Consolidated_Master_v3.xlsx"
OUTPUT_DIR = WORKSPACE / "Batch Reports"

sys.path.insert(0, str(THIS_DIR))
# ─── SUPPRESS ENGINE STDOUT ──────────────────────────────────────────────────
| class _Suppress: | |
| """Context manager: silence stdout from engine, capture to string.""" | |
| def __enter__(self): | |
| self._buf = io.StringIO() | |
| self._redirect = contextlib.redirect_stdout(self._buf) | |
| self._redirect.__enter__() | |
| return self | |
| def __exit__(self, *args): | |
| self._redirect.__exit__(*args) | |
| def text(self): | |
| return self._buf.getvalue() | |
# ─── RUSSIAN WORD LIST ───────────────────────────────────────────────────────
# ~300 Russian words selected for QUF discovery.
# Covers: governance, military, trade, nature, body, household, food, crafts,
# religion, animals, clothing, science, time, family, society.
# Mix of suspected ORIG1 (Arabic) and ORIG2 (Bitig/Turkic) origins.
# Words already in A1_ЗАПИСИ will be caught by DEDUP and reported as EXISTING.
| RUSSIAN_WORD_LIST = [ | |
| # βββ GOVERNANCE + LAW βββ | |
| "Π·Π°ΠΊΠΎΠ½", "Π²Π»Π°ΡΡΡ", "ΠΏΡΠ°Π²Π΄Π°", "ΡΡΠ΄", "ΠΏΡΠ°Π²ΠΈΡΠ΅Π»Ρ", "ΠΏΠΎΡΡΠ΄ΠΎΠΊ", | |
| "Π΄Π΅ΡΠΆΠ°Π²Π°", "ΠΏΡΠ΅ΡΡΠΎΠ»", "Π²ΠΎΠΆΠ΄Ρ", "ΠΏΠ°Π΄ΠΈΡΠ°Ρ ", "ΡΡΠ»ΡΠ°Π½", "ΡΠΌΠΈΡ", | |
| "Π²ΠΎΠ΅Π²ΠΎΠ΄Π°", "Π΄ΡΠΌΠ°", "ΡΠΊΠ°Π·", "ΡΡΠ»ΡΠΊ", "Π³ΡΠ°ΠΌΠΎΡΠ°", "ΠΏΠ΅ΡΠ°ΡΡ", | |
| "Ρ Π°Π½ΡΡΠ²ΠΎ", "ΡΠ»ΡΡ", "Π±Π΅ΠΊ", "ΠΌΡΡΠ·Π°", "ΡΠ΅ΠΌΠ½ΠΈΠΊ", "Π½ΠΎΠΉΠΎΠ½", | |
| # βββ MILITARY + WARFARE βββ | |
| "Π²ΠΎΠΉΡΠΊΠΎ", "ΠΏΠΎΠ»ΠΊ", "ΡΡΡΠ°ΠΆΠ°", "Π΄ΠΎΠ·ΠΎΡ", "Π·Π°ΡΠ°Π΄Π°", | |
| "ΠΊΠΈΠ½ΠΆΠ°Π»", "ΡΠ°Π±Π»Ρ", "Π±ΡΠ»Π°Ρ", "ΠΊΠΎΠ»ΡΡΡΠ³Π°", "ΡΠΈΡ", | |
| "Π·Π½Π°ΠΌΡ", "Π½Π°Π±Π΅Π³", "ΠΎΡΠ°Π΄Π°", "ΠΏΠΎΠ±Π΅Π΄Π°", "ΠΏΠ»Π΅Π½Π½ΠΈΠΊ", | |
| "Π΄Π΅ΡΠ°Π½Ρ", "Π³Π°ΡΠ½ΠΈΠ·ΠΎΠ½", "ΠΊΡΠ΅ΠΏΠΎΡΡΡ", "Π±Π°ΡΡΠΈΠΎΠ½", "Π±Π°ΡΠ°ΡΠ΅Ρ", | |
| # βββ TRADE + ECONOMY βββ | |
| "ΡΠΎΡΠ³ΠΎΠ²Π»Ρ", "ΡΠ΅Π½Π°", "Π΄ΠΎΠ»Π³", "ΠΏΡΠΈΠ±ΡΠ»Ρ", "ΡΡΠ±Π»Ρ", | |
| "Π±Π°Π½ΠΊ", "Π²Π΅ΠΊΡΠ΅Π»Ρ", "ΠΏΡΠΎΡΠ΅Π½Ρ", "Π·Π°Π»ΠΎΠ³", "ΠΏΠΎΡΠ»ΠΈΠ½Π°", | |
| "Π»Π°Π²ΠΊΠ°", "ΡΡΠΌΠ°ΡΠΊΠ°", "Π±Π°ΡΡΡ", "Π±Π°ΠΊΡΠΈΡ", "Π΄ΡΠΊΠ°Ρ", | |
| "ΡΠ΅ΡΠ΅Π±ΡΠΎ", "Π·ΠΎΠ»ΠΎΡΠΎ", "ΠΆΠ΅ΠΌΡΡΠ³", "Π±ΠΈΡΡΠ·Π°", "ΡΠ½ΡΠ°ΡΡ", | |
| # βββ NATURE + GEOGRAPHY βββ | |
| "ΡΡΠ΅ΠΏΡ", "ΡΠ°ΠΉΠ³Π°", "ΡΡΠ½Π΄ΡΠ°", "Π±ΠΎΠ»ΠΎΡΠΎ", "ΠΏΡΡΡΡΠ½Ρ", | |
| "ΡΠ΅ΠΊΠ°", "ΠΎΠ·Π΅ΡΠΎ", "ΠΌΠΎΡΠ΅", "Π³ΠΎΡΠ°", "Π΄ΠΎΠ»ΠΈΠ½Π°", | |
| "ΠΊΠ°ΠΌΠ΅Π½Ρ", "Π³Π»ΠΈΠ½Π°", "ΠΏΠ΅ΡΠΎΠΊ", "ΡΠΎΠ»Ρ", "Π½Π΅ΡΡΡ", | |
| "Π²Π΅ΡΠ΅Ρ", "Π±ΡΡΡ", "Π³ΡΠΎΠ·Π°", "ΠΌΠΎΠ»Π½ΠΈΡ", "ΡΠ°Π΄ΡΠ³Π°", | |
| "Π»Π΅Ρ", "ΠΏΠΎΠ»Π΅", "ΡΠ°Π΄", "ΡΠΎΡΠ°", "ΠΎΠ²ΡΠ°Π³", | |
| # βββ ANIMALS βββ | |
| "Π²Π΅ΡΠ±Π»ΡΠ΄", "Π»ΠΎΡΠ°Π΄Ρ", "Π±Π°ΡΠ°Π½", "Π±ΡΠΊ", "ΠΎΡΡΠ»", | |
| "ΡΠΎΠ»ΠΎΠ²Π΅ΠΉ", "Π±Π΅ΡΠΊΡΡ", "ΡΠΎΠΊΠΎΠ»", "ΠΎΡΡΠ»", "ΠΆΡΡΠ°Π²Π»Ρ", | |
| "ΠΊΠ°Π±Π°Π½", "Π±Π°ΡΡΡΠΊ", "Π²ΠΎΠ»ΠΊ", "ΡΠΈΠ³Ρ", "ΡΡΡΡ", | |
| "ΡΠΎΠ±Π°ΠΊΠ°", "ΠΊΠΎΡΠΊΠ°", "Π²ΠΎΡΠΎΠ½", "Π·ΠΌΠ΅Ρ", "ΡΡΠ±Π°", | |
| # βββ BODY + HEALTH βββ | |
| "Π³ΠΎΠ»ΠΎΠ²Π°", "ΡΠ΅ΡΠ΄ΡΠ΅", "ΠΊΡΠΎΠ²Ρ", "ΠΊΠΎΡΡΡ", "ΠΊΠΎΠΆΠ°", | |
| "Π³Π»Π°Π·", "ΡΡ ΠΎ", "ΡΡΠΊΠ°", "Π½ΠΎΠ³Π°", "ΠΏΠ°Π»Π΅Ρ", | |
| "ΠΊΡΠ»Π°ΠΊ", "Π³ΠΎΡΠ»ΠΎ", "Π³ΡΡΠ΄Ρ", "ΠΆΠΈΠ²ΠΎΡ", "ΡΠΏΠΈΠ½Π°", | |
| "ΡΠ°Π½Π°", "Π»Π΅ΠΊΠ°ΡΡ", "Π²ΡΠ°Ρ", "Π±ΠΎΠ»ΡΠ½ΠΎΠΉ", "ΡΠ΄", | |
| "Π±Π°Π»ΡΠ·Π°ΠΌ", "ΠΌΠ°Π·Ρ", "ΡΠ΅Π»ΠΈΡΠ΅Π»Ρ", "ΠΆΠ°Ρ", "ΡΠΌΠ΅ΡΡΡ", | |
| # βββ FOOD + DRINK βββ | |
| "ΠΏΠ»ΠΎΠ²", "Π»Π°Π²Π°Ρ", "ΡΠ°ΡΠ»ΡΠΊ", "Ρ Π»Π΅Π±", "ΠΌΡΡΠΎ", | |
| "Ρ ΡΡΠΌΠ°", "Π½ΡΡ", "ΡΠΈΡ", "ΠΌΡΠ΄", "ΠΌΠΎΠ»ΠΎΠΊΠΎ", | |
| "ΡΠ°ΠΉ", "Π²ΠΈΠ½ΠΎ", "ΡΠΈΡΠΎΠΏ", "ΠΌΠ°ΡΠ»ΠΎ", "ΡΠΊΡΡΡ", | |
| "ΠΏΠ΅ΡΠ΅Ρ", "ΡΠΌΠΈΠ½", "ΡΠ°ΡΡΠ°Π½", "ΠΊΠΎΡΠΈΡΠ°", "ΠΈΠΌΠ±ΠΈΡΡ", | |
| "ΠΉΠΎΠ³ΡΡΡ", "ΠΊΠ°ΡΠ°", "ΡΡΠΏ", "ΡΠΎΡΡ", "Π»ΠΈΠΌΠΎΠ½", | |
| # βββ HOUSEHOLD + TOOLS βββ | |
| "ΠΊΠΎΠ²ΡΡ", "Π΄ΠΈΠ²Π°Π½", "ΡΠ°Π±ΡΡΠ΅Ρ", "ΠΏΠΎΠ΄ΡΡΠΊΠ°", "Π·Π΅ΡΠΊΠ°Π»ΠΎ", | |
| "ΠΊΡΠ²ΡΠΈΠ½", "ΡΠ°ΡΠΊΠ°", "Π±Π»ΡΠ΄ΠΎ", "Π»ΠΎΠΆΠΊΠ°", "Π½ΠΎΠΆ", | |
| "ΡΠ°ΠΌΠΎΠ²Π°Ρ", "ΡΠΎΠ½Π°ΡΡ", "Π»Π°ΠΌΠΏΠ°", "ΡΠ²Π΅ΡΠ°", "ΠΊΠΎΡΡΠ»", | |
| "Π·Π°ΠΌΠΎΠΊ", "ΠΊΠ»ΡΡ", "ΠΏΠΈΠ»Π°", "ΠΌΠΎΠ»ΠΎΡΠΎΠΊ", "ΡΠΎΠΏΠΎΡ", | |
| "Π±Π°Π»ΠΊΠΎΠ½", "ΠΌΠ°Π½ΡΠ°ΡΠ΄Π°", "ΡΠ΅ΡΠ΄Π°ΠΊ", "ΠΏΠΎΠ΄Π²Π°Π»", "Π·Π°Π±ΠΎΡ", | |
| # βββ CLOTHING + TEXTILES βββ | |
| "ΠΊΠ°ΡΡΠ°Π½", "ΡΠ°Π»ΠΌΠ°", "ΡΠ°ΡΠΎΠ²Π°ΡΡ", "ΡΡΠ»ΡΠΏ", "ΡΡΠ±Π°", | |
| "ΠΏΠ»Π°ΡΠΎΠΊ", "ΠΏΠΎΡΡ", "ΡΠ°ΠΏΠΎΠ³", "Π²ΠΎΠΉΠ»ΠΎΠΊ", "Π±Π°ΡΡ Π°Ρ", | |
| "ΡΡΠ»ΠΊ", "Ρ Π»ΠΎΠΏΠΎΠΊ", "ΠΏΠ°ΡΡΠ°", "ΡΠ΅ΡΡΠΌΠ°", "Π½ΠΈΡΡ", | |
| # βββ RELIGION + FAITH βββ | |
| "Π½Π°ΠΌΠ°Π·", "ΠΌΠΈΠ½Π±Π°Ρ", "Ρ Π°Π΄ΠΆ", "Π·Π°ΠΊΡΡ", "Π²Π°ΠΊΡ", | |
| "ΠΌΡΡΠ΄Π·ΠΈΠ½", "ΠΈΠΌΠ°ΠΌ", "ΠΌΡΠ»Π»Π°", "Π΄Π΅ΡΠ²ΠΈΡ", "ΡΡΡΠΈΠΉ", | |
| "ΠΌΠΈΡ ΡΠ°Π±", "ΠΌΠ°ΡΠ΄ΠΆΠΈΠ΄", "ΠΌΠΈΠ½Π°ΡΠ΅Ρ", "ΠΊΡΠΏΠΎΠ»", "ΠΌΠ΅ΡΠ΅ΡΡ", | |
| # βββ SCIENCE + CRAFT βββ | |
| "Π°Π»Π³Π΅Π±ΡΠ°", "ΡΠΈΡΡΠ°", "ΡΠΈΡΠ»ΠΎ", "ΠΌΠ΅ΡΠ°", "Π²Π΅ΡΡ", | |
| "Π·ΠΎΠ΄ΡΠΈΠΉ", "ΠΊΠ°ΠΌΠ΅Π½ΡΠΈΠΊ", "Π³ΠΎΠ½ΡΠ°Ρ", "ΠΊΡΠ·Π½Π΅Ρ", "ΡΠΊΠ°Ρ", | |
| "ΡΠ΅ΡΠ½ΠΈΠ»Π°", "Π±ΡΠΌΠ°Π³Π°", "ΠΊΠ½ΠΈΠ³Π°", "ΠΏΠ΅ΡΠ°ΡΡ", "Π±ΡΠΊΠ²Π°", | |
| "Π°ΡΡΡΠΎΠ½ΠΎΠΌΠΈΡ", "Ρ ΠΈΠΌΠΈΡ", "Π³Π΅ΠΎΠΌΠ΅ΡΡΠΈΡ", "ΠΌΠ΅Π΄ΠΈΡΠΈΠ½Π°", "Ρ ΠΈΡΡΡΠ³ΠΈΡ", | |
| # βββ TIME + CALENDAR βββ | |
| "Π²ΡΠ΅ΠΌΡ", "ΡΠ°Ρ", "Π΄Π΅Π½Ρ", "Π½ΠΎΡΡ", "ΡΡΡΠΎ", | |
| "ΡΠ°ΡΡΠ²Π΅Ρ", "Π·Π°ΠΊΠ°Ρ", "Π»ΡΠ½Π°", "Π·Π²Π΅Π·Π΄Π°", "ΡΠΎΠ»Π½ΡΠ΅", | |
| "Π³ΠΎΠ΄", "ΠΌΠ΅ΡΡΡ", "Π½Π΅Π΄Π΅Π»Ρ", "ΠΏΡΡΠ½ΠΈΡΠ°", "ΡΡΠ±Π±ΠΎΡΠ°", | |
| # βββ FAMILY + SOCIETY βββ | |
| "ΠΎΡΠ΅Ρ", "ΠΌΠ°ΡΡ", "Π±ΡΠ°Ρ", "ΡΠ΅ΡΡΡΠ°", "ΡΡΠ½", | |
| "Π΄ΠΎΡΡ", "ΠΆΠ΅Π½Π°", "ΠΌΡΠΆ", "ΡΠ΅ΠΌΡΡ", "ΡΠΎΠ΄", | |
| "Π½Π°ΡΠΎΠ΄", "ΠΏΠ»Π΅ΠΌΡ", "ΠΎΠ±ΡΠΈΠ½Π°", "ΡΠΎΡΠ΅Π΄", "Π³ΠΎΡΡΡ", | |
| "Π΄ΡΡΠ³", "Π²ΡΠ°Π³", "ΡΠ°Π±", "ΡΠ²ΠΎΠ±ΠΎΠ΄Π½ΡΠΉ", "ΠΌΡΠ΄ΡΠ΅Ρ", | |
| # βββ ADDITIONAL HIGH-YIELD TERMS βββ | |
| # (suspected Arabic/Turkic that aren't in A1_ΠΠΠΠΠ‘Π yet) | |
| "ΡΠ°Ρ ΡΠ°", "ΠΌΠ°ΡΠΊ", "ΡΠ°Π»Π°Π½Ρ", "ΡΠ΅ΡΠ΅ΠΏΡ", "ΡΡΡΠ±Π°Π½", | |
| "Π³Π°ΡΠ΅ΠΌ", "Π³Π°Π·Π΅ΡΠ°", "ΠΆΡΡΠ½Π°Π»", "Π°Π²ΡΠΎΠΌΠ°Ρ", "ΠΊΠΈΠ±ΠΈΡΠΊΠ°", | |
| "ΡΠ°ΡΡ Π°Π½", "ΠΊΡΡΡΠ»ΡΠ°ΠΉ", "Π±Π°ΠΉΡΠ°ΠΌ", "Π°ΠΊΡΠ°ΠΊΠ°Π»", "Π±Π°ΡΡΡ", | |
| "ΠΈΠΌΠ°Π½", "ΠΊΠΈΡΠ°Π±", "Π΄ΠΆΠΈΡ Π°Π΄", "ΡΠ°ΡΠΈΠ°Ρ", "ΡΠ΅ΡΠ²Π°", | |
| "ΠΌΠ°ΡΠ»ΠΎ", "ΠΌΠ°ΡΡΠ΅Ρ", "ΡΠ΅ΠΌΠ΅ΡΠ»ΠΎ", "ΡΡΠ½ΠΎΠΊ", "Π±ΠΎΠ³Π°ΡΡΡΠ²ΠΎ", | |
| "Π΄ΡΡΠ°", "ΡΠ°Π·ΡΠΌ", "ΡΠΎΠ²Π΅ΡΡΡ", "ΠΈΡΡΠΈΠ½Π°", "ΡΠΏΡΠ°Π²Π΅Π΄Π»ΠΈΠ²ΠΎΡΡΡ", | |
| "Ρ ΠΎΠ·ΡΠΈΠ½", "Π½Π°ΠΌΠ΅ΡΡΠ½ΠΈΠΊ", "ΠΏΠΎΡΠΎΠ»", "Π΄ΠΎΠ³ΠΎΠ²ΠΎΡ", "ΠΌΠΈΡ", | |
| "ΠΊΠ°Π·Π°ΡΠΌΠ°", "Π»Π°Π·Π°ΡΠ΅Ρ", "Π³ΠΎΡΠΏΠΈΡΠ°Π»Ρ", "Π°ΠΏΡΠ΅ΠΊΠ°", "Π±Π°Π»ΡΠ·Π°ΠΌ", | |
| "ΡΠ°Π±Π°ΠΊ", "ΠΊΠ°Π»ΡΡΠ½", "Ρ Π½Π°", "ΠΌΡΡΠΊΡΡ", "Π°ΠΌΠ±ΡΠ°", | |
| "Π°ΡΠ±Π°Π»Π΅Ρ", "ΠΏΡΡΠΊΠ°", "ΠΏΠΎΡΠΎΡ ", "ΡΠ½Π°ΡΡΠ΄", "ΠΌΡΡΠΊΠ΅Ρ", | |
| ] | |
# Remove duplicates while preserving first-seen order. dict keys are unique
# and keep insertion order, so this needs no helper set and leaves no extra
# module-level name behind.
RUSSIAN_WORD_LIST = list(dict.fromkeys(RUSSIAN_WORD_LIST))
# ─── RESULT SERIALISER ───────────────────────────────────────────────────────
def serialise_result(word: str, result) -> dict:
    """Flatten one engine result into a record that ``json.dump`` can write.

    Args:
        word: The input word; stored upper-cased under ``"word"``.
        result: The object returned by the engine for ``word``. The attributes
            ``existing_entry_id``, ``confirmed_root``, ``q_gate``, ``u_gate``,
            ``f_gate``, ``cluster_members`` and ``log`` are read directly;
            ``orig2_track``, ``orig2_details``, ``cognate_crossref``,
            ``compound_parts`` and ``sem_review`` are optional.

    Returns:
        A dict with a fixed set of keys. Sections that have no data on
        ``result`` keep their placeholder (``None``, ``False`` or ``0``).
    """
    on_orig2_track = getattr(result, 'orig2_track', False)

    # Every key is created up front so all records share one layout and one
    # key order; the sections below only overwrite placeholders.
    rec = {
        "word": word.upper(),
        "existing_entry_id": result.existing_entry_id,
        "category": _categorise(result),
        "score": None,
        "root_letters": None,
        "ar_word": None,
        "phonetic_chain": None,
        "positional_score": None,
        "transposition_flag": False,
        "extra_consonants": 0,
        "q_gate": None,
        "u_gate": None,
        "f_gate": None,
        "orig2_track": on_orig2_track,
        "orig2_details": None,
        "cognate_crossref": None,   # v3.3: English/Russian cognate data
        "compound_parts": None,     # v3.4: compound word analysis
        "sem_review": getattr(result, 'sem_review', False),  # v3.4
        "cluster_members": result.cluster_members[:20],  # capped at 20 per record
        "log_lines": result.log,
    }

    # v3.3: cognate cross-reference, copied field by field with defaults.
    cognate = getattr(result, 'cognate_crossref', None)
    if cognate:
        cognate_fields = (
            ("source", ''),
            ("en_cousin", ''),
            ("root_letters", ''),
            ("score", None),
            ("phonetic_chain", ''),
            ("variant_used", ''),
            ("word_form_used", ''),
            ("entry_id", None),
            ("note", ''),
        )
        rec["cognate_crossref"] = {
            key: cognate.get(key, fallback) for key, fallback in cognate_fields
        }

    # v3.4: compound analysis; "prefix" and "root" are passed through as-is
    # (a dict or None).
    compound = getattr(result, 'compound_parts', None)
    if compound:
        rec["compound_parts"] = {
            "label": compound.get('label', ''),
            "bridge": compound.get('bridge', ''),
            "prefix": compound.get('prefix'),
            "root": compound.get('root'),
        }

    # ORIG2 details are only emitted when the result is on the ORIG2 track
    # AND actually carries a details mapping.
    orig2 = getattr(result, 'orig2_details', None)
    if on_orig2_track and orig2:
        orig2_fields = (
            ("kashgari_translit", ''),
            ("kashgari_meaning", ''),
            ("kashgari_line", 0),
            ("attestation_type", ''),
            ("skeleton", ''),
            ("all_hits", 0),
            ("bitig_warnings", []),
        )
        rec["orig2_details"] = {
            key: orig2.get(key, fallback) for key, fallback in orig2_fields
        }

    # Confirmed root: four attributes are read directly, the other three are
    # optional and fall back to the placeholder values.
    confirmed = result.confirmed_root
    if confirmed:
        rec.update({
            "root_letters": confirmed.letters,
            "ar_word": confirmed.ar_word,
            "score": confirmed.score,
            "phonetic_chain": confirmed.phonetic_chain,
            "positional_score": getattr(confirmed, 'positional_score', None),
            "transposition_flag": getattr(confirmed, 'transposition_flag', False),
            "extra_consonants": getattr(confirmed, 'extra_consonants', 0),
        })

    # Gate outcomes: pass/fail flag plus a few fields from each gate's details.
    q_gate = result.q_gate
    if q_gate:
        q_info = q_gate.details
        rec["q_gate"] = {
            "passed": q_gate.passed,
            "token_count": q_info.get("token_count", 0),
            "ar_word": q_info.get("ar_word", ""),
            "verse": q_info.get("verse", ""),
            "orig2_candidate": q_info.get("orig2_candidate", False),
        }

    u_gate = result.u_gate
    if u_gate:
        rec["u_gate"] = {
            "passed": u_gate.passed,
            "phonetic_chain": u_gate.details.get("phonetic_chain", ""),
        }

    f_gate = result.f_gate
    if f_gate:
        f_info = f_gate.details
        rec["f_gate"] = {
            "passed": f_gate.passed,
            "ds_code": f_info.get("ds_code", ""),
            "network_id": f_info.get("network_id", ""),
            "dp_codes": f_info.get("dp_codes", []),
        }

    return rec
| def _categorise(result) -> str: | |
| """ | |
| Three-tier classification: | |
| ALREADY_IN_LATTICE β already in A1_ΠΠΠΠΠ‘Π | |
| CONFIRMED_HIGH β score >= 8, Q+U pass, no R11 transposition | |
| PENDING_REVIEW β score 5β7, or transposition, or ORIG2 match | |
| AUTO_REJECTED β score < 5, or U-gate fail, or no root at all | |
| """ | |
| if result.existing_entry_id is not None: | |
| return "ALREADY_IN_LATTICE" | |
| # ORIG2 track: always PENDING_REVIEW (needs Kashgari verification) | |
| if getattr(result, 'orig2_track', False): | |
| return "PENDING_REVIEW" | |
| if result.confirmed_root is None: | |
| return "PENDING_REVIEW" | |
| score = result.confirmed_root.score | |
| q = result.q_gate.passed if result.q_gate else False | |
| u = result.u_gate.passed if result.u_gate else False | |
| trans = getattr(result.confirmed_root, 'transposition_flag', False) | |
| if score >= 8 and q and u and not trans: | |
| return "CONFIRMED_HIGH" | |
| if score >= 5 and (q or u): | |
| return "PENDING_REVIEW" | |
| return "AUTO_REJECTED" | |
# ─── MAIN ────────────────────────────────────────────────────────────────────
def run_batch(word_list: list, output_dir: Path) -> dict:
    """Run the engine in ``dry_run=True`` mode over every word and save reports.

    Args:
        word_list: Words to process, in order.
        output_dir: Folder that receives ``RU_BATCH_REPORT_<timestamp>.json``
            and ``RU_BATCH_SUMMARY_<timestamp>.txt``; created if missing.

    Returns:
        The report dict that was written to the JSON file. Besides the four
        category buckets and the cluster backlog it carries an ``"errors"``
        list (one ``{"word", "error"}`` entry per word whose processing
        raised), so failed words are accounted for instead of disappearing.

    Exits the process with status 1 if the engine cannot be initialised.
    """
    print("Importing USLaP_Engine (v3.0 dual-language)...")
    with _Suppress():
        from USLaP_Engine import USLaPEngine

    print(f"Initialising engine with master file...")
    # The failure is only remembered inside the suppressed block and reported
    # after it: anything printed while stdout is redirected would land in the
    # capture buffer and never reach the user.
    init_error = None
    with _Suppress() as s:
        try:
            engine = USLaPEngine(master_file=str(MASTER_FILE), skip_reports=True)
        except Exception as e:
            init_error = e
    if init_error is not None:
        print(f"\nERROR: Engine init failed: {init_error}")
        print(s.text())
        sys.exit(1)
    print(f"Engine ready (v3.0 β EN+RU dual-language).\n")

    # Buckets
    results_by_cat = {
        "ALREADY_IN_LATTICE": [],
        "CONFIRMED_HIGH": [],
        "PENDING_REVIEW": [],
        "AUTO_REJECTED": [],
    }
    cluster_backlog = set()
    errors = []
    total = len(word_list)

    # Process loop
    for i, word in enumerate(word_list, 1):
        pct = (i / total) * 100
        print(f" [{i:>3}/{total}] {pct:>5.1f}% {word:<20}", end="", flush=True)

        # Same pattern as for engine init: capture the exception while stdout
        # is silenced, then report it once stdout is back.
        failure = None
        with _Suppress():
            try:
                result = engine.process(word, dry_run=True)
            except Exception as e:
                failure = e
        if failure is not None:
            print(f" ERROR: {failure}")
            errors.append({"word": word.upper(), "error": str(failure)})
            continue

        rec = serialise_result(word, result)
        cat = rec["category"]
        results_by_cat[cat].append(rec)

        # Collect cluster discoveries (non-string members are ignored).
        for candidate in result.cluster_members:
            if isinstance(candidate, str):
                cluster_backlog.add(candidate.upper())

        # Inline status. The record always contains these keys (set to None
        # when unknown), so a .get() default would never apply; map None to
        # "?" explicitly so the line never shows "None".
        root = "?" if rec.get("root_letters") is None else rec["root_letters"]
        score = "?" if rec.get("score") is None else rec["score"]
        trans = rec.get("transposition_flag", False)
        pos = rec.get("positional_score")
        pos_s = f" pos={pos:.2f}" if pos is not None else ""
        r11 = " β R11" if trans else ""

        # v3.3: cognate suffix
        cog_rec = rec.get("cognate_crossref")
        cog_s = ""
        if cog_rec and cog_rec.get("source") == "EN_PIPELINE":
            cog_s = f" β{cog_rec['en_cousin']}β{cog_rec['root_letters']}(s{cog_rec.get('score','?')})"
        elif cog_rec and cog_rec.get("source") == "LATTICE_ENTRY":
            cog_s = f" βLAT#{cog_rec.get('entry_id','?')}"

        # v3.4: compound suffix
        cp_rec = rec.get("compound_parts")
        cp_s = ""
        if cp_rec and cp_rec.get("label"):
            cp_s = f" [{cp_rec['label']}]"

        if cat == "ALREADY_IN_LATTICE":
            print(f" β EXISTING #{result.existing_entry_id}")
        elif cat == "CONFIRMED_HIGH":
            print(f" β CONFIRMED root={root:<12} score={score}/10{pos_s}{cog_s}{cp_s}")
        elif cat == "PENDING_REVIEW":
            if getattr(result, 'orig2_track', False):
                kd = getattr(result, 'orig2_details', {}) or {}
                kt = kd.get('kashgari_translit', '?')
                ka = kd.get('attestation_type', '?')
                print(f" β ORIG2 Kashgari='{kt}' ({ka}) score={score}/10{cog_s}{cp_s}")
            elif result.confirmed_root is None:
                print(f" ~ PENDING (no ORIG1 root, no ORIG2 match)")
            else:
                print(f" ~ PENDING root={root:<12} score={score}/10{pos_s}{r11}{cog_s}{cp_s}")
        else:
            print(f" β REJECTED root={root:<12} score={score}/10{r11}{cp_s}")

    # Remove input + existing from cluster backlog
    input_upper = {w.upper() for w in word_list}
    existing_upper = {r["word"] for r in results_by_cat["ALREADY_IN_LATTICE"]}
    cluster_backlog -= input_upper
    cluster_backlog -= existing_upper

    # Build report. One clock reading feeds both the file-name timestamp and
    # the run_date field so the two always agree.
    now = datetime.now()
    timestamp = now.strftime("%Y%m%d_%H%M%S")
    report = {
        "run_date": now.isoformat(),
        "engine_version": "v3.0 (EN+RU dual-language + multi-candidate)",
        "language": "Russian (RU)",
        "master_file": str(MASTER_FILE),
        "total_words": total,
        "summary": {
            "already_in_lattice": len(results_by_cat["ALREADY_IN_LATTICE"]),
            "confirmed_high": len(results_by_cat["CONFIRMED_HIGH"]),
            "pending_review": len(results_by_cat["PENDING_REVIEW"]),
            "auto_rejected": len(results_by_cat["AUTO_REJECTED"]),
            "cluster_backlog": len(cluster_backlog),
            "errors": len(errors),
        },
        "already_in_lattice": results_by_cat["ALREADY_IN_LATTICE"],
        "confirmed_high": results_by_cat["CONFIRMED_HIGH"],
        "pending_review": results_by_cat["PENDING_REVIEW"],
        "auto_rejected": results_by_cat["AUTO_REJECTED"],
        "cluster_backlog": sorted(cluster_backlog),
        "errors": errors,
    }

    # Save JSON
    output_dir.mkdir(parents=True, exist_ok=True)
    json_path = output_dir / f"RU_BATCH_REPORT_{timestamp}.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(report, f, ensure_ascii=False, indent=2)
    print(f"\n JSON report saved: {json_path}")

    # Save TXT summary
    txt_path = output_dir / f"RU_BATCH_SUMMARY_{timestamp}.txt"
    _write_txt_summary(report, txt_path)
    print(f" TXT summary saved: {txt_path}")

    return report
| def _write_txt_summary(report: dict, path: Path): | |
| """Write a human-readable Russian batch summary.""" | |
| s = report["summary"] | |
| lines = [ | |
| "β" * 70, | |
| " USLaP Russian Batch Runner v1.0 β Discovery Summary", | |
| " Ψ¨ΩΨ³ΩΩ Ω Ψ§ΩΩΩΩΩΩ Ψ§ΩΨ±ΩΩΨΩΩ ΩΩ°ΩΩ Ψ§ΩΨ±ΩΩΨΩΩΩ Ω", | |
| " Language: Russian (RU) β >50% ORIG2 (Bitig/Turkic) expected", | |
| "β" * 70, | |
| f" Run date: {report['run_date']}", | |
| f" Engine: {report.get('engine_version', 'v3.0')}", | |
| f" Words run: {report['total_words']}", | |
| "β" * 70, | |
| f" β Already in A1_ΠΠΠΠΠ‘Π: {s['already_in_lattice']:>4} (no action needed)", | |
| f" β CONFIRMED HIGH: {s['confirmed_high']:>4} β review & write to A1_ΠΠΠΠΠ‘Π", | |
| f" ~ PENDING REVIEW: {s['pending_review']:>4} β human QUF adjudication", | |
| f" β AUTO REJECTED: {s['auto_rejected']:>4} (U-gate fail or score < 5)", | |
| f" + Cluster backlog: {s['cluster_backlog']:>4} (new words via root expansion)", | |
| "β" * 70, | |
| "", | |
| " NOTE: High PENDING count is EXPECTED for Russian β many words are", | |
| " ORIG2 (Bitig/Turkic) and need Kashgari attestation, not Q-gate.", | |
| "", | |
| " β CONFIRMED HIGH β ORIG1 candidates (score β₯ 8, Q+U pass):", | |
| "β" * 70, | |
| ] | |
| for rec in report["confirmed_high"]: | |
| root = rec.get("root_letters", "?") | |
| score = rec.get("score", "?") | |
| chain = rec.get("phonetic_chain", "?") or "β" | |
| tokens = rec.get("q_gate", {}).get("token_count", "?") if rec.get("q_gate") else "?" | |
| pos = rec.get("positional_score") | |
| pos_s = f" pos={pos:.2f}" if pos is not None else "" | |
| net = rec.get("f_gate", {}).get("network_id", "") if rec.get("f_gate") else "" | |
| net_s = f" [{net}]" if net else "" | |
| lines.append( | |
| f" {rec['word']:<22} root={root:<12} score={score}/10 tokens={tokens}{pos_s}{net_s}" | |
| ) | |
| lines.append(f" chain: {chain}") | |
| # Split PENDING into ORIG2 and others | |
| orig2_pending = [r for r in report["pending_review"] if r.get("orig2_track")] | |
| other_pending = [r for r in report["pending_review"] if not r.get("orig2_track")] | |
| if orig2_pending: | |
| lines += [ | |
| "", | |
| f" β ORIG2 (KASHGARI) MATCHES β {len(orig2_pending)} words attested in Bitig:", | |
| "β" * 70, | |
| ] | |
| for rec in orig2_pending: | |
| od = rec.get("orig2_details", {}) or {} | |
| kt = od.get("kashgari_translit", "?") | |
| km = od.get("kashgari_meaning", "?") | |
| kl = od.get("kashgari_line", "?") | |
| ka = od.get("attestation_type", "?") | |
| score = rec.get("score", "?") | |
| warns = od.get("bitig_warnings", []) | |
| warn_s = f" β {'; '.join(warns)}" if warns else "" | |
| km_short = km[:50] + "..." if len(km) > 50 else km | |
| lines.append( | |
| f" {rec['word']:<20} Kashgari='{kt}' ({ka}, line {kl}) score={score}/10{warn_s}" | |
| ) | |
| lines.append(f" meaning: \"{km_short}\"") | |
| lines += [ | |
| "", | |
| f" ~ PENDING REVIEW β {len(other_pending)} words need human QUF adjudication:", | |
| "β" * 70, | |
| ] | |
| for rec in other_pending: | |
| root = rec.get("root_letters") or "NO ORIG1 ROOT" | |
| score = rec.get("score", "?") | |
| trans = rec.get("transposition_flag", False) | |
| q_ok = rec.get("q_gate", {}).get("passed", False) if rec.get("q_gate") else False | |
| u_ok = rec.get("u_gate", {}).get("passed", False) if rec.get("u_gate") else False | |
| flags = [] | |
| if trans: flags.append("β R11-TRANSPOSITION") | |
| if not q_ok: flags.append("Q-FAIL") | |
| if not u_ok: flags.append("U-FAIL") | |
| flag_s = " " + " | ".join(flags) if flags else "" | |
| lines.append(f" {rec['word']:<22} root={root:<12} score={score}/10{flag_s}") | |
| # Rejected | |
| lines += [ | |
| "", | |
| f" β AUTO REJECTED β {len(report['auto_rejected'])} words:", | |
| "β" * 70, | |
| ] | |
| for rec in report["auto_rejected"]: | |
| root = rec.get("root_letters") or "?" | |
| score = rec.get("score", "?") | |
| lines.append(f" {rec['word']:<22} root={root:<12} score={score}/10") | |
| # Cluster backlog | |
| lines += [ | |
| "", | |
| " + CLUSTER BACKLOG β words discovered via root expansion:", | |
| "β" * 70, | |
| ] | |
| for w in sorted(report["cluster_backlog"]): | |
| lines.append(f" {w}") | |
| lines += [ | |
| "", | |
| "β" * 70, | |
| " NEXT STEPS:", | |
| " 1. CONFIRMED_HIGH β verify ROOT_ID + QUR_MEANING β write to A1_ΠΠΠΠΠ‘Π", | |
| " 2. ORIG2 matches β verify Kashgari attestation β write to BITIG_A1_ENTRIES", | |
| " 3. PENDING with Q-FAIL β check Kashgari corpus (ORIG2 track)", | |
| " 4. PENDING with β R11 β recheck phonetic chain (transposition)", | |
| " 5. Cross-reference with English A1_ENTRIES for sibling entries", | |
| " 6. CLUSTER_BACKLOG β run batch_runner_ru again with these as input", | |
| "β" * 70, | |
| ] | |
| with open(path, "w", encoding="utf-8") as f: | |
| f.write("\n".join(lines)) | |
# ─── ENTRY POINT ─────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Make sure the report folder is there before anything else happens.
    if not OUTPUT_DIR.exists():
        OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # The batch cannot run without the master workbook.
    if not MASTER_FILE.exists():
        print(f"ERROR: Master file not found:\n {MASTER_FILE}")
        sys.exit(1)

    # Word source: first CLI argument (a custom file, one word per line)
    # or, when no argument is given, the built-in list.
    cli_args = sys.argv[1:]
    if cli_args:
        custom_file = Path(cli_args[0])
        if not custom_file.exists():
            print(f"ERROR: Word file not found: {custom_file}")
            sys.exit(1)
        with open(custom_file, "r", encoding="utf-8") as f:
            stripped_lines = [line.strip() for line in f]
        # Blank lines are dropped; words are lower-cased.
        word_list = [entry.lower() for entry in stripped_lines if entry]
        print(f"Loaded {len(word_list)} words from {custom_file.name}")
    else:
        word_list = RUSSIAN_WORD_LIST
        print(f"Using built-in Russian word list: {len(word_list)} words")

    # Run header.
    print("\n".join([
        f"Output directory: {OUTPUT_DIR}",
        f"Master file: {MASTER_FILE.name}",
        f"Language: Russian (RU) β ORIG1 + ORIG2 dual-track",
        f"Mode: DRY RUN (no writes to Excel)\n",
        "β" * 62,
    ]))

    report = run_batch(word_list, OUTPUT_DIR)

    # Terminal summary
    s = report["summary"]
    rule = "β" * 70
    print("\n".join([
        "\n" + rule,
        " RUSSIAN BATCH COMPLETE β THREE-TIER SUMMARY (v1.0)",
        rule,
        f" Words processed: {report['total_words']}",
        f" β Already in lattice: {s['already_in_lattice']}",
        f" β CONFIRMED HIGH: {s['confirmed_high']} β review & write to A1_ΠΠΠΠΠ‘Π",
        f" ~ PENDING REVIEW: {s['pending_review']} β human QUF adjudication",
        f" β AUTO REJECTED: {s['auto_rejected']}",
        f" + Cluster backlog: {s['cluster_backlog']} β bonus discoveries",
        rule,
        "\n NOTE: For Russian, high PENDING is expected (>50% ORIG2/Bitig).",
        " ORIG2 matches need Kashgari attestation β NOT Q-gate.",
        " Open TXT summary for annotated review. JSON for machine-readable data.",
    ]))