#!/usr/bin/env python3 import re, glob, os, sqlite3 from collections import Counter SESSION_DIR = os.path.expanduser("~/.claude/projects/-Users-mmsetubal-Documents-USLaP-workplace") DB_PATH = "/Users/mmsetubal/Documents/USLaP workplace/Code_files/uslap_database_v3.db" files = sorted(glob.glob(os.path.join(SESSION_DIR, "*.jsonl"))) print(f"Scanning {len(files)} sessions...") corridor_assigns = {} # term -> corridor pat_update = re.compile(r"ds_corridor\s*=\s*'(DS\d+|ORIG2|TYPE[123]|DIRECT)'") pat_term_en = re.compile(r"en_term\s*=\s*'([A-Z][A-Za-z]+)'") pat_term_ru = re.compile(r"ru_term\s*=\s*'([А-ЯЁ][А-ЯЁа-яё]+)'") for filepath in files: session_id = os.path.basename(filepath)[:12] try: with open(filepath, 'r', errors='ignore') as fh: for line in fh: m_corr = pat_update.search(line) if m_corr: corridor = m_corr.group(1) m_en = pat_term_en.search(line) m_ru = pat_term_ru.search(line) if m_en: corridor_assigns[m_en.group(1).upper()] = corridor if m_ru: corridor_assigns[m_ru.group(1).upper()] = corridor except: continue print(f"\nTotal unique term->corridor assignments found in sessions: {len(corridor_assigns)}") by_corridor = Counter(corridor_assigns.values()) for c, cnt in sorted(by_corridor.items(), key=lambda x: -x[1]): print(f" {c:10s} {cnt:5d}") # Now diff against DB conn = _uslap_connect(DB_PATH) if _HAS_WRAPPER else sqlite3.connect(DB_PATH) conn.execute("PRAGMA foreign_keys = ON") c = conn.cursor() # Get current corridors db_corridors = {} c.execute("SELECT en_term, ru_term, ds_corridor FROM entries WHERE ds_corridor IS NOT NULL AND ds_corridor != ''") for row in c.fetchall(): if row[0]: db_corridors[row[0].upper()] = row[2] if row[1]: db_corridors[row[1].upper()] = row[2] # Get entries with NO corridor c.execute("SELECT en_term, ru_term FROM entries WHERE ds_corridor IS NULL OR ds_corridor = ''") no_corridor = set() for row in c.fetchall(): if row[0]: no_corridor.add(row[0].upper()) if row[1]: no_corridor.add(row[1].upper()) conn.close() # Find lost corridors lost = {} for term, corr in corridor_assigns.items(): if term in no_corridor: lost[term] = corr print(f"\n{'='*70}") print(f"LOST CORRIDORS: {len(lost)} terms had corridor in sessions but NOT in DB") print(f"{'='*70}") lost_by_corr = Counter(lost.values()) for c, cnt in sorted(lost_by_corr.items(), key=lambda x: -x[1]): print(f" {c:10s} {cnt:5d}") print(f"\nSample lost assignments:") for term, corr in sorted(list(lost.items()))[:50]: print(f" {term:25s} -> {corr}") if len(lost) > 50: print(f" ... and {len(lost) - 50} more")