#!/usr/bin/env python3
"""
USLaP CHAIN EXTRACTION — Extract full phonetic chains, shift IDs, DP codes,
decay levels, and chronology from archived session logs.

Usage:
    python3 extract_chains.py scan     — extract all chain data
    python3 extract_chains.py diff     — compare extracted vs live DB
    python3 extract_chains.py report   — full report with recovery candidates
"""

import re, glob, os, sys, sqlite3
from collections import defaultdict

# Locations of the archived session logs and the live database.
# The defaults point at the original author's machine; set USLAP_SESSION_DIR /
# USLAP_DB_PATH in the environment to run the script elsewhere without editing
# it. An empty variable is treated as unset.
SESSION_DIR = os.path.expanduser(
    os.environ.get("USLAP_SESSION_DIR")
    or "~/.claude/projects/-Users-mmsetubal-Documents-USLaP-workplace"
)
DB_PATH = (
    os.environ.get("USLAP_DB_PATH")
    or "/Users/mmsetubal/Documents/USLaP workplace/Code_files/uslap_database_v3.db"
)

# ============================================================
# PATTERNS — extract structured data from session text
# ============================================================

# Phonetic chain, e.g.  ق→C(S01), ر→R(S15), ن→N(S18)
# One or more comma-separated links of the form  <src>→<dst>(Snn), where <src>
# and <dst> are runs of characters other than whitespace, ',', ';' and '|'.
# The whole chain is captured as group 1 (the inner repetition is non-capturing).
RE_CHAIN = re.compile(
    r'([^\s,;|]+→[^\s,;|]+\(S\d{2}\)(?:\s*,\s*[^\s,;|]+→[^\s,;|]+\(S\d{2}\))*)'
)

# ORIG2 chain:  ORIG2: term → skeleton
# Group 1 is the token before the arrow, group 2 the token after it; both are
# whitespace-free runs. "ORIG2" may be followed by any mix of ':' and whitespace.
RE_ORIG2 = re.compile(r'ORIG2[:\s]+(\S+)\s*→\s*(\S+)')

# Shift IDs: "S" + exactly two digits as a whole word (S01, S02, ...)
RE_SHIFTS = re.compile(r'\b(S\d{2})\b')

# DP codes: "DP" + exactly two digits as a whole word (DP01, DP08, ...)
RE_DP = re.compile(r'\b(DP\d{2})\b')

# Operations: "OP_" followed by upper-case letters/underscores (OP_NASAL, OP_SUFFIX, ...)
RE_OPS = re.compile(r'\b(OP_[A-Z_]+)\b')

# Decay level keywords (case-sensitive, whole word).
# NOTE: the '.' in VERY.HIGH matches any single character, so "VERY HIGH",
# "VERY_HIGH" and "VERY-HIGH" are all accepted.
RE_DECAY = re.compile(r'\b(NEAR|MINIMAL|MEDIUM|HIGH|VERY.HIGH|MAXIMUM|ORGANIC|INSTITUTIONAL)\b')

# Corridor: DS01-DS14, ORIG2, TYPE1-TYPE3 or DIRECT (whole word)
RE_CORRIDOR = re.compile(r'\b(DS0[1-9]|DS1[0-4]|ORIG2|TYPE[123]|DIRECT)\b')

# Score (case-insensitive). Two alternatives, so each match fills only one group:
#   group 1 - "score N/10"
#   group 2 - bare "score N"
# NOTE: only the second alternative is anchored with a leading \b.
RE_SCORE = re.compile(r'score[=:\s]+(\d{1,2})/10|\bscore[=:\s]+(\d{1,2})\b', re.IGNORECASE)

# Pattern (inversion type), case-insensitive: "pattern" + '=', ':' or whitespace,
# then a code starting with A or B (optional '+', optional second letter) or a
# lone C / D. Group 1 is the code only.
RE_PATTERN = re.compile(r'\bpattern[=:\s]+(A\+?[BCD]?|B\+?[CD]?|C|D)\b', re.IGNORECASE)

# EN term near chain data: a whole word of 3-20 upper-case ASCII letters
RE_EN_TERM = re.compile(r'\b([A-Z]{3,20})\b')

# RU term near chain data: a whole word of 3-20 upper-case Cyrillic letters (А-Я plus Ё)
RE_RU_TERM = re.compile(r'\b([А-ЯЁ]{3,20})\b')

# Root assignment: "R<digits>", or "T" + optional "_" + optional "BITIG" + digits
# (T12, T_12, TBITIG12, T_BITIG12).
# NOTE(review): unlike the other ID patterns this one has no \b anchors, so it
# also matches inside longer tokens (e.g. the "R404" in "ERROR404").
RE_ROOT = re.compile(r'((?:R\d+|T_?(?:BITIG)?\d+))')

# QV reference: "QV" + 1-3 digits (whole word)
RE_QV = re.compile(r'\b(QV\d{1,3})\b')

# Quranic reference: "Q" + 1-3 digits, ':', 1-3 digits (e.g. Q2:255)
RE_QURAN = re.compile(r'\b(Q\d{1,3}:\d{1,3})\b')

# Chronology reference: "C" + 1-3 digits (whole word)
RE_CHRONO = re.compile(r'\b(C\d{1,3})\b')

# Network reference: "N" + exactly two digits (whole word)
RE_NETWORK = re.compile(r'\b(N\d{2})\b')

# Allah Name reference: "A" + exactly two digits (whole word)
RE_ALLAH = re.compile(r'\b(A\d{2})\b')


def extract_entry_data(line):
    """Extract every structured field the module's RE_* patterns find in a line.

    A key is added to the result only when its pattern matches, so an empty
    dict means nothing was recognised.

    Args:
        line: One line of raw session text.

    Returns:
        dict containing any subset of:
          * ``phonetic_chain`` (str): first RE_CHAIN match.
          * ``orig2_chain`` (str): first RE_ORIG2 match, rebuilt as
            ``"ORIG2: <term> → <skeleton>"``.
          * ``decay_level``, ``ds_corridor`` (str): first match of each.
          * ``score`` (int): first score match, kept only if 1 <= score <= 10.
          * ``pattern`` (str): first pattern code, upper-cased.
          * ``shifts``, ``dp_codes``, ``ops_applied``, ``root_ids``,
            ``qv_refs``, ``qur_refs``, ``chrono_refs``, ``network_id``,
            ``allah_name_id`` (list[str]): de-duplicated and sorted
            lexicographically, so repeated runs give identical output.
    """
    data = {}

    # Single-valued fields take the first (left-most) match only.
    chain = RE_CHAIN.search(line)
    if chain:
        data['phonetic_chain'] = chain.group(1)

    orig2 = RE_ORIG2.search(line)
    if orig2:
        data['orig2_chain'] = f"ORIG2: {orig2.group(1)} → {orig2.group(2)}"

    for key, regex in (('shifts', RE_SHIFTS),
                       ('dp_codes', RE_DP),
                       ('ops_applied', RE_OPS)):
        found = regex.findall(line)
        if found:
            data[key] = sorted(set(found))

    decay = RE_DECAY.search(line)
    if decay:
        data['decay_level'] = decay.group(1)

    corridor = RE_CORRIDOR.search(line)
    if corridor:
        data['ds_corridor'] = corridor.group(1)

    score = RE_SCORE.search(line)
    if score:
        # Exactly one of the two groups participates ("N/10" vs bare "N").
        value = int(score.group(1) or score.group(2))
        if 1 <= value <= 10:
            data['score'] = value

    pattern = RE_PATTERN.search(line)
    if pattern:
        data['pattern'] = pattern.group(1).upper()

    roots = RE_ROOT.findall(line)
    if roots:
        # Sorted like every other list field; a bare list(set(...)) would
        # change order from run to run because str hashing is randomised.
        data['root_ids'] = sorted(set(roots))

    qv = RE_QV.findall(line)
    if qv:
        data['qv_refs'] = sorted(set(qv))

    quran = RE_QURAN.findall(line)
    if quran:
        data['qur_refs'] = sorted(set(quran))

    # "C<digits>" also matches unrelated tokens (e.g. hex codes), so only
    # numbers up to 200 are accepted as chronology references.
    chrono = {c for c in RE_CHRONO.findall(line) if int(c[1:]) <= 200}
    if chrono:
        data['chrono_refs'] = sorted(chrono)

    network = RE_NETWORK.findall(line)
    if network:
        data['network_id'] = sorted(set(network))

    # RE_ALLAH only matches two digits, so this bound cannot currently reject
    # anything; it is kept as a guard in case the pattern is widened.
    allah = {a for a in RE_ALLAH.findall(line) if int(a[1:]) <= 99}
    if allah:
        data['allah_name_id'] = sorted(allah)

    return data


# Upper-case tokens that are SQL keywords, field labels, status words or
# common filler rather than lexicon terms. Built once at import time instead
# of on every call. MINIMAL and VERY are included because they are decay-level
# keywords in RE_DECAY, like NEAR/MEDIUM/HIGH/MAXIMUM which were already listed.
_EN_NOISE = frozenset({
    'INSERT', 'UPDATE', 'SELECT', 'DELETE', 'FROM', 'WHERE', 'INTO', 'TABLE',
    'CREATE', 'ALTER', 'DROP', 'TEXT', 'INTEGER', 'NULL', 'PASS', 'FAIL', 'TRUE',
    'FALSE', 'ROOT', 'ORIG', 'TYPE', 'NONE', 'BITIG', 'KASHGARI', 'PATTERN',
    'CHAIN', 'SCORE', 'NEAR', 'MEDIUM', 'HIGH', 'MAXIMUM', 'DIRECT', 'ORGANIC',
    'INSTITUTIONAL', 'THE', 'AND', 'FOR', 'NOT', 'WITH', 'THIS', 'THAT', 'HAS',
    'WAS', 'ARE', 'BUT', 'ALL', 'CAN', 'HAD', 'HER', 'ONE', 'OUR', 'OUT', 'YOU',
    'QURANIC', 'ARABIC', 'SHIFT', 'ENTRY', 'QUERY', 'INDEX', 'COUNT', 'CHECK',
    'CORRIDOR', 'DECAY', 'LEVEL', 'PHONETIC', 'SEMANTIC', 'COMPOUND', 'PREFIX',
    'SUFFIX', 'DERIVATIVE', 'VERIFIED', 'WRITTEN', 'ANALYSED', 'PENDING',
    'SEARCH', 'FOUND', 'MATCH', 'CONSONANT', 'VOWEL', 'RADICAL', 'TRILITERAL',
    'MINIMAL', 'VERY',
})

# Upper-case Cyrillic label words that are never terms.
_RU_NOISE = frozenset({
    'ЗАПИСИ', 'БУКВЫ', 'КОРЕНЬ', 'СЛОВО', 'ЗНАЧЕНИЕ', 'ЗАПИСЬ', 'ИСТОЧНИК',
})


def find_term_for_data(line, data):
    """Return the candidate terms on ``line`` that extracted data may belong to.

    Candidates are whole upper-case words of 3-20 letters, Latin (RE_EN_TERM)
    or Cyrillic (RE_RU_TERM), minus the noise words above.

    Args:
        line: The session line the data was extracted from.
        data: The extracted data dict. Currently unused; kept so the call
            signature stays unchanged.

    Returns:
        list[str]: Latin-script terms followed by Cyrillic terms, each in
        order of first appearance. A word repeated on the line is returned
        once, so callers do not store duplicate records for it.
    """
    # The regexes already enforce the 3-letter minimum, so no length filter.
    candidates = [t for t in RE_EN_TERM.findall(line) if t not in _EN_NOISE]
    candidates += [t for t in RE_RU_TERM.findall(line) if t not in _RU_NOISE]
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(candidates))


def _chain_completeness(entry):
    """Score how much chain metadata one extracted entry carries (higher = richer)."""
    total = 0
    if entry.get('phonetic_chain'):
        total += 10
    total += 3 * len(entry.get('dp_codes') or ())
    total += 2 * len(entry.get('ops_applied') or ())
    if entry.get('decay_level'):
        total += 5
    if entry.get('ds_corridor'):
        total += 5
    if entry.get('score'):
        total += 3
    if entry.get('pattern'):
        total += 3
    total += len(entry.get('qur_refs') or ())
    total += len(entry.get('shifts') or ())
    return total


def scan_sessions():
    """Extract chain data from every ``*.jsonl`` session file in SESSION_DIR.

    Each line containing a phonetic or ORIG2 chain is parsed with
    ``extract_entry_data`` and recorded under every candidate term returned
    by ``find_term_for_data``. Progress is printed to stdout. A session that
    cannot be read completely is reported on stderr and the scan moves on to
    the next file; records gathered before the failure are kept.

    Returns:
        dict[str, dict]: term -> the most complete record seen for that term
        (see ``_chain_completeness``). Each record also carries a
        ``'session'`` key holding the first 12 characters of the source
        file name. On a tie the earliest record wins.
    """
    files = sorted(glob.glob(os.path.join(SESSION_DIR, "*.jsonl")))
    print(f"Scanning {len(files)} sessions...")

    # term -> every record extracted for it, in scan order
    all_data = defaultdict(list)

    for filepath in files:
        session_id = os.path.basename(filepath)[:12]
        session_count = 0
        size_mb = 0.0

        try:
            size_mb = os.path.getsize(filepath) / 1024 / 1024
            # Decode explicitly as UTF-8 rather than relying on the platform
            # default; undecodable bytes are dropped rather than aborting.
            with open(filepath, 'r', encoding='utf-8', errors='ignore') as fh:
                for line in fh:
                    # Cheap pre-filter: only lines with chain data are parsed.
                    if not RE_CHAIN.search(line) and not RE_ORIG2.search(line):
                        continue

                    data = extract_entry_data(line)
                    if not data.get('phonetic_chain') and not data.get('orig2_chain'):
                        continue

                    data['session'] = session_id
                    for term in find_term_for_data(line, data):
                        # Copy so each term owns its own top-level dict.
                        all_data[term].append(dict(data))
                        session_count += 1
        except Exception as exc:
            # Best effort per file, but never silent: say what was skipped.
            print(f"  WARNING: stopped reading {session_id}: "
                  f"{type(exc).__name__}: {exc}", file=sys.stderr)
            continue

        if session_count > 0:
            print(f"  {session_id} ({size_mb:.1f}MB): {session_count} chain extractions")

    # Deduplicate: keep the most complete data per term. max() returns the
    # first maximal element, matching a stable descending sort followed by [0].
    best = {term: max(entries, key=_chain_completeness)
            for term, entries in all_data.items()}

    print(f"\nTotal unique terms with chain data: {len(best)}")
    return best


def diff_against_db(extracted):
    """Compare extracted chain data against the ``entries`` table of the live DB.

    Args:
        extracted: dict mapping an upper-case term to its extracted record,
            as returned by ``scan_sessions``.

    Returns:
        dict with four lists:
          * ``recoverable``: dicts with ``term``, ``entry_id``, ``missing``
            (field names empty in the DB but present in the session data)
            and ``session_data``.
          * ``conflicts``: always empty; conflict detection is not
            implemented. The key is kept so the result shape is stable.
          * ``not_in_db``: dicts with ``term`` and ``session_data`` for terms
            with no matching ``en_term`` / ``ru_term``.
          * ``already_complete``: terms whose DB row lacks none of the
            checked fields that the session data provides.

    Raises:
        sqlite3.Error: if the database cannot be opened or queried.
    """
    # Fields that can be back-filled from session data.
    checked_fields = ('phonetic_chain', 'dp_codes', 'ops_applied', 'decay_level')

    # The optional connection wrapper (_HAS_WRAPPER / _uslap_connect) is not
    # defined by anything visible in this script, so naming it directly
    # raised NameError. Look it up dynamically and fall back to plain sqlite3.
    wrapper = globals().get('_uslap_connect') if globals().get('_HAS_WRAPPER') else None
    conn = wrapper(DB_PATH) if wrapper else sqlite3.connect(DB_PATH)
    try:
        conn.execute("PRAGMA foreign_keys = ON")
        c = conn.cursor()

        # Get current DB state
        c.execute("""SELECT entry_id, en_term, ru_term, root_id, phonetic_chain,
                        dp_codes, ops_applied, decay_level, ds_corridor, pattern, score
                 FROM entries""")
        rows = c.fetchall()
    finally:
        # Close the connection even when the query fails.
        conn.close()

    # Index rows by upper-cased EN and RU term; a later row with the same
    # term replaces an earlier one.
    db_entries = {}
    for eid, en, ru, rid, chain, dp, ops, decay, corr, pat, score in rows:
        record = {
            'entry_id': eid, 'root_id': rid, 'phonetic_chain': chain,
            'dp_codes': dp, 'ops_applied': ops, 'decay_level': decay,
            'ds_corridor': corr, 'pattern': pat, 'score': score
        }
        if en:
            db_entries[en.upper()] = record
        if ru:
            db_entries[ru.upper()] = record

    recoverable = []       # In DB but missing chain data, session has it
    conflicts = []         # Reserved: never populated (see docstring)
    not_in_db = []         # Term not in DB
    already_complete = []  # DB already has this data

    for term, session_data in extracted.items():
        db = db_entries.get(term)
        if db is None:
            not_in_db.append({'term': term, 'session_data': session_data})
            continue

        missing_fields = [field for field in checked_fields
                          if not db[field] and session_data.get(field)]
        if missing_fields:
            recoverable.append({
                'term': term,
                'entry_id': db['entry_id'],
                'missing': missing_fields,
                'session_data': session_data
            })
        else:
            already_complete.append(term)

    return {
        'recoverable': recoverable,
        'conflicts': conflicts,
        'not_in_db': not_in_db,
        'already_complete': already_complete
    }


def _sql_literal(value):
    """Render ``value`` as a SQL literal: ints bare, anything else quoted with ' doubled."""
    if isinstance(value, int):
        return str(value)
    return "'" + str(value).replace("'", "''") + "'"


def report():
    """Run the scan and DB diff, then print a recovery report to stdout.

    The report lists how many entries are recoverable, a per-field breakdown,
    up to 30 sample entries (those missing the most fields first), the
    already-complete / not-in-DB counts, and one ``UPDATE`` statement per
    recoverable entry. The SQL is only printed, never executed.

    Returns:
        dict: the result of ``diff_against_db`` for the scanned sessions.
    """
    sample_limit = 30
    rule = '=' * 80
    # Same fields, same order, as diff_against_db reports in 'missing'.
    sql_fields = ('phonetic_chain', 'dp_codes', 'ops_applied', 'decay_level')

    print(rule)
    print("USLaP CHAIN DATA RECOVERY REPORT")
    print(rule)

    extracted = scan_sessions()
    results = diff_against_db(extracted)
    recoverable = results['recoverable']

    print(f"\n{rule}")
    print(f"RECOVERABLE: {len(recoverable)} entries have session chain data missing from DB")
    print(rule)

    # Group by missing field
    by_field = defaultdict(int)
    for item in recoverable:
        for field in item['missing']:
            by_field[field] += 1

    print("\nMissing fields breakdown:")
    for field, count in sorted(by_field.items(), key=lambda kv: -kv[1]):
        print(f"  {field:25s} {count:5d} entries")

    print("\nSample recoverable entries:")
    most_missing_first = sorted(recoverable, key=lambda x: len(x['missing']), reverse=True)
    for item in most_missing_first[:sample_limit]:
        sd = item['session_data']
        # Fall back to the ORIG2 chain when no phonetic chain was extracted.
        chain = sd.get('phonetic_chain', sd.get('orig2_chain', ''))[:60]
        dps = ','.join(sd.get('dp_codes', []))
        ops = ','.join(sd.get('ops_applied', []))
        decay = sd.get('decay_level', '')
        print(f"  {item['term']:20s} missing={item['missing']}")
        if chain:
            print(f"    chain: {chain}")
        if dps:
            print(f"    dp: {dps}")
        if ops:
            print(f"    ops: {ops}")
        if decay:
            print(f"    decay: {decay}")

    if len(recoverable) > sample_limit:
        print(f"  ... and {len(recoverable) - sample_limit} more")

    print(f"\n{rule}")
    print(f"ALREADY COMPLETE: {len(results['already_complete'])} entries already have chain data in DB")
    print(f"NOT IN DB: {len(results['not_in_db'])} terms found in sessions but not in DB")
    print(rule)

    # Generate SQL for recoverable items
    print(f"\n{rule}")
    print("RECOVERY SQL (dry run — not executed)")
    print(rule)

    sql_count = 0
    for item in recoverable:
        sd = item['session_data']
        assignments = []
        for field in sql_fields:
            value = sd.get(field)
            if field not in item['missing'] or not value:
                continue
            if isinstance(value, (list, tuple)):
                value = ','.join(value)
            # Every value is escaped the same way. decay_level in particular
            # can contain an arbitrary character via RE_DECAY's "VERY.HIGH".
            assignments.append(f"{field} = {_sql_literal(value)}")

        if assignments:
            # Integer IDs print bare as before; text IDs are quoted so the
            # statement stays valid SQL.
            print(f"UPDATE entries SET {', '.join(assignments)} "
                  f"WHERE entry_id = {_sql_literal(item['entry_id'])};")
            sql_count += 1

    print(f"\n-- Total: {sql_count} UPDATE statements")

    return results


# Command-line entry point: scan | diff | report (default: report).
if __name__ == '__main__':
    cmd = sys.argv[1] if len(sys.argv) > 1 else 'report'
    if cmd == 'scan':
        scan_sessions()
    elif cmd == 'diff':
        extracted = scan_sessions()
        results = diff_against_db(extracted)
        print(f"Recoverable: {len(results['recoverable'])} | Complete: {len(results['already_complete'])} | Not in DB: {len(results['not_in_db'])}")
    elif cmd == 'report':
        report()
    else:
        # Usage errors go to stderr with a non-zero status so shell callers
        # can tell a typo from a successful run.
        print(f"Unknown: {cmd}. Use: scan | diff | report", file=sys.stderr)
        sys.exit(2)