Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| USLaP CHAIN EXTRACTION — Extract full phonetic chains, shift IDs, DP codes, | |
| decay levels, and chronology from archived session logs. | |
| Usage: | |
| python3 extract_chains.py scan — extract all chain data | |
| python3 extract_chains.py diff — compare extracted vs live DB | |
| python3 extract_chains.py report — full report with recovery candidates | |
| """ | |
| import re, glob, os, sys, sqlite3 | |
| from collections import defaultdict | |
| SESSION_DIR = os.path.expanduser("~/.claude/projects/-Users-mmsetubal-Documents-USLaP-workplace") | |
| DB_PATH = "/Users/mmsetubal/Documents/USLaP workplace/Code_files/uslap_database_v3.db" | |
| # ============================================================ | |
| # PATTERNS — extract structured data from session text | |
| # ============================================================ | |
| # Phonetic chain: ق→C(S01), ر→R(S15), ن→N(S18) | |
| RE_CHAIN = re.compile( | |
| r'([^\s,;|]+→[^\s,;|]+\(S\d{2}\)(?:\s*,\s*[^\s,;|]+→[^\s,;|]+\(S\d{2}\))*)' | |
| ) | |
| # ORIG2 chain: ORIG2: term → skeleton | |
| RE_ORIG2 = re.compile(r'ORIG2[:\s]+(\S+)\s*→\s*(\S+)') | |
| # Shift IDs: S01, S02, etc | |
| RE_SHIFTS = re.compile(r'\b(S\d{2})\b') | |
| # DP codes: DP01, DP08, etc | |
| RE_DP = re.compile(r'\b(DP\d{2})\b') | |
| # Operations: OP_NASAL, OP_SUFFIX, etc | |
| RE_OPS = re.compile(r'\b(OP_[A-Z_]+)\b') | |
| # Decay level | |
| RE_DECAY = re.compile(r'\b(NEAR|MINIMAL|MEDIUM|HIGH|VERY.HIGH|MAXIMUM|ORGANIC|INSTITUTIONAL)\b') | |
| # Corridor | |
| RE_CORRIDOR = re.compile(r'\b(DS0[1-9]|DS1[0-4]|ORIG2|TYPE[123]|DIRECT)\b') | |
| # Score | |
| RE_SCORE = re.compile(r'score[=:\s]+(\d{1,2})/10|\bscore[=:\s]+(\d{1,2})\b', re.IGNORECASE) | |
| # Pattern (inversion type) | |
| RE_PATTERN = re.compile(r'\bpattern[=:\s]+(A\+?[BCD]?|B\+?[CD]?|C|D)\b', re.IGNORECASE) | |
| # EN term near chain data | |
| RE_EN_TERM = re.compile(r'\b([A-Z]{3,20})\b') | |
| # RU term near chain data | |
| RE_RU_TERM = re.compile(r'\b([А-ЯЁ]{3,20})\b') | |
| # Root assignment | |
| RE_ROOT = re.compile(r'((?:R\d+|T_?(?:BITIG)?\d+))') | |
| # QV reference | |
| RE_QV = re.compile(r'\b(QV\d{1,3})\b') | |
| # Quranic reference | |
| RE_QURAN = re.compile(r'\b(Q\d{1,3}:\d{1,3})\b') | |
| # Chronology reference | |
| RE_CHRONO = re.compile(r'\b(C\d{1,3})\b') | |
| # Network reference | |
| RE_NETWORK = re.compile(r'\b(N\d{2})\b') | |
| # Allah Name reference | |
| RE_ALLAH = re.compile(r'\b(A\d{2})\b') | |
| def extract_entry_data(line): | |
| """Extract all structured data from a line.""" | |
| data = {} | |
| chains = RE_CHAIN.findall(line) | |
| if chains: | |
| data['phonetic_chain'] = chains[0] # Take first/best | |
| orig2 = RE_ORIG2.findall(line) | |
| if orig2: | |
| data['orig2_chain'] = f"ORIG2: {orig2[0][0]} → {orig2[0][1]}" | |
| shifts = list(set(RE_SHIFTS.findall(line))) | |
| if shifts: | |
| data['shifts'] = sorted(shifts) | |
| dps = list(set(RE_DP.findall(line))) | |
| if dps: | |
| data['dp_codes'] = sorted(dps) | |
| ops = list(set(RE_OPS.findall(line))) | |
| if ops: | |
| data['ops_applied'] = sorted(ops) | |
| decay = RE_DECAY.findall(line) | |
| if decay: | |
| data['decay_level'] = decay[0] | |
| corridor = RE_CORRIDOR.findall(line) | |
| if corridor: | |
| data['ds_corridor'] = corridor[0] | |
| score = RE_SCORE.findall(line) | |
| if score: | |
| s = score[0][0] or score[0][1] | |
| if s and 1 <= int(s) <= 10: | |
| data['score'] = int(s) | |
| pattern = RE_PATTERN.findall(line) | |
| if pattern: | |
| data['pattern'] = pattern[0].upper() | |
| roots = RE_ROOT.findall(line) | |
| if roots: | |
| data['root_ids'] = list(set(roots)) | |
| qv = RE_QV.findall(line) | |
| if qv: | |
| data['qv_refs'] = sorted(set(qv)) | |
| quran = RE_QURAN.findall(line) | |
| if quran: | |
| data['qur_refs'] = sorted(set(quran)) | |
| chrono = RE_CHRONO.findall(line) | |
| if chrono: | |
| # Filter out C01 etc that aren't chrono refs (e.g. hex codes) | |
| valid = [c for c in chrono if int(c[1:]) <= 200] | |
| if valid: | |
| data['chrono_refs'] = sorted(set(valid)) | |
| network = RE_NETWORK.findall(line) | |
| if network: | |
| data['network_id'] = sorted(set(network)) | |
| allah = RE_ALLAH.findall(line) | |
| if allah: | |
| valid = [a for a in allah if int(a[1:]) <= 99] | |
| if valid: | |
| data['allah_name_id'] = sorted(set(valid)) | |
| return data | |
| def find_term_for_data(line, data): | |
| """Find which term this data belongs to.""" | |
| terms = [] | |
| en_terms = RE_EN_TERM.findall(line) | |
| # Filter noise | |
| noise = {'INSERT','UPDATE','SELECT','DELETE','FROM','WHERE','INTO','TABLE', | |
| 'CREATE','ALTER','DROP','TEXT','INTEGER','NULL','PASS','FAIL','TRUE', | |
| 'FALSE','ROOT','ORIG','TYPE','NONE','BITIG','KASHGARI','PATTERN', | |
| 'CHAIN','SCORE','NEAR','MEDIUM','HIGH','MAXIMUM','DIRECT','ORGANIC', | |
| 'INSTITUTIONAL','THE','AND','FOR','NOT','WITH','THIS','THAT','HAS', | |
| 'WAS','ARE','BUT','ALL','CAN','HAD','HER','ONE','OUR','OUT','YOU', | |
| 'QURANIC','ARABIC','SHIFT','ENTRY','QUERY','INDEX','COUNT','CHECK', | |
| 'CORRIDOR','DECAY','LEVEL','PHONETIC','SEMANTIC','COMPOUND','PREFIX', | |
| 'SUFFIX','DERIVATIVE','VERIFIED','WRITTEN','ANALYSED','PENDING', | |
| 'SEARCH','FOUND','MATCH','CONSONANT','VOWEL','RADICAL','TRILITERAL'} | |
| en_terms = [t for t in en_terms if t not in noise and len(t) >= 3] | |
| ru_terms = RE_RU_TERM.findall(line) | |
| ru_noise = {'ЗАПИСИ','БУКВЫ','КОРЕНЬ','СЛОВО','ЗНАЧЕНИЕ','ЗАПИСЬ','ИСТОЧНИК'} | |
| ru_terms = [t for t in ru_terms if t not in ru_noise and len(t) >= 3] | |
| return en_terms + ru_terms | |
| def scan_sessions(): | |
| """Extract all chain data from all sessions.""" | |
| files = sorted(glob.glob(os.path.join(SESSION_DIR, "*.jsonl"))) | |
| print(f"Scanning {len(files)} sessions...") | |
| # term -> best chain data (most complete) | |
| all_data = defaultdict(list) | |
| for filepath in files: | |
| session_id = os.path.basename(filepath)[:12] | |
| size_mb = os.path.getsize(filepath) / 1024 / 1024 | |
| session_count = 0 | |
| try: | |
| with open(filepath, 'r', errors='ignore') as fh: | |
| for line in fh: | |
| # Only process lines with chain data | |
| if not RE_CHAIN.search(line) and not RE_ORIG2.search(line): | |
| continue | |
| data = extract_entry_data(line) | |
| if not data.get('phonetic_chain') and not data.get('orig2_chain'): | |
| continue | |
| terms = find_term_for_data(line, data) | |
| for term in terms: | |
| data['session'] = session_id | |
| all_data[term].append(dict(data)) | |
| session_count += 1 | |
| if session_count > 0: | |
| print(f" {session_id} ({size_mb:.1f}MB): {session_count} chain extractions") | |
| except Exception as e: | |
| continue | |
| # Deduplicate: keep the most complete data per term | |
| best = {} | |
| for term, entries in all_data.items(): | |
| # Score each entry by completeness | |
| def completeness(d): | |
| score = 0 | |
| if d.get('phonetic_chain'): score += 10 | |
| if d.get('dp_codes'): score += len(d['dp_codes']) * 3 | |
| if d.get('ops_applied'): score += len(d['ops_applied']) * 2 | |
| if d.get('decay_level'): score += 5 | |
| if d.get('ds_corridor'): score += 5 | |
| if d.get('score'): score += 3 | |
| if d.get('pattern'): score += 3 | |
| if d.get('qur_refs'): score += len(d['qur_refs']) | |
| if d.get('shifts'): score += len(d['shifts']) | |
| return score | |
| entries.sort(key=completeness, reverse=True) | |
| best[term] = entries[0] | |
| print(f"\nTotal unique terms with chain data: {len(best)}") | |
| return best | |
| def diff_against_db(extracted): | |
| """Compare extracted chain data vs live DB.""" | |
| conn = _uslap_connect(DB_PATH) if _HAS_WRAPPER else sqlite3.connect(DB_PATH) | |
| conn.execute("PRAGMA foreign_keys = ON") | |
| c = conn.cursor() | |
| # Get current DB state | |
| c.execute("""SELECT entry_id, en_term, ru_term, root_id, phonetic_chain, | |
| dp_codes, ops_applied, decay_level, ds_corridor, pattern, score | |
| FROM entries""") | |
| db_entries = {} | |
| for row in c.fetchall(): | |
| eid, en, ru, rid, chain, dp, ops, decay, corr, pat, score = row | |
| record = { | |
| 'entry_id': eid, 'root_id': rid, 'phonetic_chain': chain, | |
| 'dp_codes': dp, 'ops_applied': ops, 'decay_level': decay, | |
| 'ds_corridor': corr, 'pattern': pat, 'score': score | |
| } | |
| if en: db_entries[en.upper()] = record | |
| if ru: db_entries[ru.upper()] = record | |
| conn.close() | |
| # Categorize | |
| recoverable = [] # In DB but missing chain data, session has it | |
| conflicts = [] # Both have data but different | |
| not_in_db = [] # Term not in DB | |
| already_complete = [] # DB already has this data | |
| for term, session_data in extracted.items(): | |
| if term in db_entries: | |
| db = db_entries[term] | |
| # Check what's missing in DB that session has | |
| missing_fields = [] | |
| if not db['phonetic_chain'] and session_data.get('phonetic_chain'): | |
| missing_fields.append('phonetic_chain') | |
| if not db['dp_codes'] and session_data.get('dp_codes'): | |
| missing_fields.append('dp_codes') | |
| if not db['ops_applied'] and session_data.get('ops_applied'): | |
| missing_fields.append('ops_applied') | |
| if not db['decay_level'] and session_data.get('decay_level'): | |
| missing_fields.append('decay_level') | |
| if missing_fields: | |
| recoverable.append({ | |
| 'term': term, | |
| 'entry_id': db['entry_id'], | |
| 'missing': missing_fields, | |
| 'session_data': session_data | |
| }) | |
| else: | |
| already_complete.append(term) | |
| else: | |
| not_in_db.append({'term': term, 'session_data': session_data}) | |
| return { | |
| 'recoverable': recoverable, | |
| 'conflicts': conflicts, | |
| 'not_in_db': not_in_db, | |
| 'already_complete': already_complete | |
| } | |
| def report(): | |
| """Full extraction + diff report.""" | |
| print("=" * 80) | |
| print("USLaP CHAIN DATA RECOVERY REPORT") | |
| print("=" * 80) | |
| extracted = scan_sessions() | |
| results = diff_against_db(extracted) | |
| print(f"\n{'='*80}") | |
| print(f"RECOVERABLE: {len(results['recoverable'])} entries have session chain data missing from DB") | |
| print(f"{'='*80}") | |
| # Group by missing field | |
| by_field = defaultdict(int) | |
| for item in results['recoverable']: | |
| for f in item['missing']: | |
| by_field[f] += 1 | |
| print("\nMissing fields breakdown:") | |
| for field, count in sorted(by_field.items(), key=lambda x: -x[1]): | |
| print(f" {field:25s} {count:5d} entries") | |
| print(f"\nSample recoverable entries:") | |
| for item in sorted(results['recoverable'], key=lambda x: len(x['missing']), reverse=True)[:30]: | |
| sd = item['session_data'] | |
| chain = sd.get('phonetic_chain', sd.get('orig2_chain', ''))[:60] | |
| dps = ','.join(sd.get('dp_codes', [])) | |
| ops = ','.join(sd.get('ops_applied', [])) | |
| decay = sd.get('decay_level', '') | |
| print(f" {item['term']:20s} missing={item['missing']}") | |
| if chain: print(f" chain: {chain}") | |
| if dps: print(f" dp: {dps}") | |
| if ops: print(f" ops: {ops}") | |
| if decay: print(f" decay: {decay}") | |
| if len(results['recoverable']) > 30: | |
| print(f" ... and {len(results['recoverable']) - 30} more") | |
| print(f"\n{'='*80}") | |
| print(f"ALREADY COMPLETE: {len(results['already_complete'])} entries already have chain data in DB") | |
| print(f"NOT IN DB: {len(results['not_in_db'])} terms found in sessions but not in DB") | |
| print(f"{'='*80}") | |
| # Generate SQL for recoverable items | |
| print(f"\n{'='*80}") | |
| print("RECOVERY SQL (dry run — not executed)") | |
| print(f"{'='*80}") | |
| sql_count = 0 | |
| for item in results['recoverable']: | |
| sd = item['session_data'] | |
| sets = [] | |
| if 'phonetic_chain' in item['missing'] and sd.get('phonetic_chain'): | |
| chain = sd['phonetic_chain'].replace("'", "''") | |
| sets.append(f"phonetic_chain = '{chain}'") | |
| if 'dp_codes' in item['missing'] and sd.get('dp_codes'): | |
| dps = ','.join(sd['dp_codes']) | |
| sets.append(f"dp_codes = '{dps}'") | |
| if 'ops_applied' in item['missing'] and sd.get('ops_applied'): | |
| ops = ','.join(sd['ops_applied']) | |
| sets.append(f"ops_applied = '{ops}'") | |
| if 'decay_level' in item['missing'] and sd.get('decay_level'): | |
| sets.append(f"decay_level = '{sd['decay_level']}'") | |
| if sets: | |
| sql = f"UPDATE entries SET {', '.join(sets)} WHERE entry_id = {item['entry_id']};" | |
| print(sql) | |
| sql_count += 1 | |
| print(f"\n-- Total: {sql_count} UPDATE statements") | |
| return results | |
| if __name__ == '__main__': | |
| cmd = sys.argv[1] if len(sys.argv) > 1 else 'report' | |
| if cmd == 'scan': | |
| scan_sessions() | |
| elif cmd == 'diff': | |
| extracted = scan_sessions() | |
| results = diff_against_db(extracted) | |
| print(f"Recoverable: {len(results['recoverable'])} | Complete: {len(results['already_complete'])} | Not in DB: {len(results['not_in_db'])}") | |
| elif cmd == 'report': | |
| report() | |
| else: | |
| print(f"Unknown: {cmd}. Use: scan | diff | report") | |