import gradio as gr
import sqlite3
import pandas as pd
from huggingface_hub import hf_hub_download, HfApi
import os
import time
import shutil
from pathlib import Path
import json

# ===== CONFIGURATION =====
TARGET_LANGUAGES = ['de', 'en', 'es', 'fr', 'it', 'ja', 'nl', 'pl', 'pt', 'ru', 'zh']
INDEXED_REPO_ID = "cstr/conceptnet-de-indexed"
INDEXED_DB_FILENAME = "conceptnet-de-indexed.db"
PROGRESS_FILENAME = "indexing_progress.json"
LOCAL_DB_PATH = "/tmp/conceptnet-indexed.db"
CONCEPTNET_BASE = "http://conceptnet.io"  # CRITICAL: Full URL base
# =========================

print(f"🌍 Languages: {', '.join([l.upper() for l in TARGET_LANGUAGES])}")

HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HF_API_TOKEN")

if HF_TOKEN:
    print(f"✅ HF_TOKEN found (length: {len(HF_TOKEN)})")
else:
    print("⚠️  No HF_TOKEN - checkpointing disabled")

ORIGINAL_REPO_ID = "ysenarath/conceptnet-sqlite"
ORIGINAL_DB_FILENAME = "data/conceptnet-v5.7.0.db"

def log_progress(message, level="INFO"):
    """Enhanced logging with timestamp"""
    timestamp = time.strftime("%H:%M:%S")
    prefix = {
        "INFO": "ℹ️ ",
        "SUCCESS": "✅",
        "ERROR": "❌",
        "WARN": "⚠️ ",
        "CHECKPOINT": "💾",
        "DEBUG": "🔍"
    }.get(level, "")
    print(f"[{timestamp}] {prefix} {message}")

def verify_database_has_indices(db_path):
    """Verify database has required indices"""
    log_progress(f"Verifying indices in {os.path.basename(db_path)}...", "DEBUG")
    
    if not os.path.exists(db_path):
        log_progress("Database file does not exist", "ERROR")
        return False, 0
    
    try:
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()
        
        cursor.execute("SELECT name FROM sqlite_master WHERE type='index' AND name LIKE 'idx_%'")
        custom_indices = cursor.fetchall()
        
        conn.close()
        
        has_all = len(custom_indices) >= 4
        log_progress(f"Found {len(custom_indices)} custom indices (need 4+): {has_all}", "SUCCESS" if has_all else "WARN")
        
        return has_all, len(custom_indices)
        
    except Exception as e:
        log_progress(f"Error verifying indices: {e}", "ERROR")
        return False, 0

def check_remote_progress():
    """Check remote progress with detailed logging"""
    log_progress("Checking remote progress...", "DEBUG")
    
    if not HF_TOKEN:
        log_progress("No HF_TOKEN - cannot check remote", "WARN")
        return {
            "completed_indices": [],
            "analyzed_tables": [],
            "database_uploaded": False,
            "indexing_complete": False
        }
    
    try:
        api = HfApi()
        
        try:
            api.repo_info(repo_id=INDEXED_REPO_ID, repo_type="dataset", token=HF_TOKEN)
            log_progress(f"Repository exists: {INDEXED_REPO_ID}", "SUCCESS")
        except:
            log_progress("Repository does not exist yet", "INFO")
            return {
                "completed_indices": [],
                "analyzed_tables": [],
                "database_uploaded": False,
                "indexing_complete": False
            }
        
        try:
            progress_path = hf_hub_download(
                repo_id=INDEXED_REPO_ID,
                filename=PROGRESS_FILENAME,
                repo_type="dataset",
                token=HF_TOKEN
            )
            
            with open(progress_path, 'r') as f:
                progress = json.load(f)
            
            log_progress("Remote progress loaded:", "SUCCESS")
            log_progress(f"  Completed indices: {progress.get('completed_indices', [])}", "INFO")
            log_progress(f"  Analyzed tables: {progress.get('analyzed_tables', [])}", "INFO")
            log_progress(f"  Indexing complete: {progress.get('indexing_complete', False)}", "INFO")
            
            return progress
            
        except Exception as e:
            log_progress("No progress file found (starting fresh)", "INFO")
            return {
                "completed_indices": [],
                "analyzed_tables": [],
                "database_uploaded": False,
                "indexing_complete": False
            }
            
    except Exception as e:
        log_progress(f"Error checking remote: {e}", "ERROR")
        return {
            "completed_indices": [],
            "analyzed_tables": [],
            "database_uploaded": False,
            "indexing_complete": False
        }

def update_remote_progress(completed_indices, analyzed_tables=None, database_uploaded=False, indexing_complete=False):
    """Update remote progress file"""
    log_progress("Updating remote progress...", "DEBUG")
    
    if not HF_TOKEN:
        log_progress("Cannot update progress: No HF_TOKEN", "WARN")
        return False
    
    if analyzed_tables is None:
        analyzed_tables = []
    
    try:
        api = HfApi()
        
        try:
            api.repo_info(repo_id=INDEXED_REPO_ID, repo_type="dataset", token=HF_TOKEN)
        except:
            log_progress(f"Creating repository: {INDEXED_REPO_ID}", "INFO")
            api.create_repo(
                repo_id=INDEXED_REPO_ID,
                repo_type="dataset",
                token=HF_TOKEN,
                private=False
            )
        
        progress = {
            "completed_indices": completed_indices,
            "analyzed_tables": analyzed_tables,
            "database_uploaded": database_uploaded,
            "indexing_complete": indexing_complete,
            "timestamp": time.time(),
            "languages": TARGET_LANGUAGES
        }
        
        progress_path = "/tmp/indexing_progress.json"
        with open(progress_path, 'w') as f:
            json.dump(progress, f, indent=2)
        
        api.upload_file(
            path_or_fileobj=progress_path,
            path_in_repo=PROGRESS_FILENAME,
            repo_id=INDEXED_REPO_ID,
            repo_type="dataset",
            token=HF_TOKEN,
            commit_message=f"Progress: {len(completed_indices)} indices, {len(analyzed_tables)} tables"
        )
        
        log_progress(f"Progress updated: {len(completed_indices)} indices, {len(analyzed_tables)} tables", "CHECKPOINT")
        return True
        
    except Exception as e:
        log_progress(f"Failed to update progress: {e}", "ERROR")
        import traceback
        traceback.print_exc()
        return False

def upload_database_checkpoint(message=""):
    """Upload database with WAL checkpoint"""
    log_progress("Starting database upload...", "CHECKPOINT")
    
    if not HF_TOKEN:
        log_progress("Cannot upload: No HF_TOKEN", "WARN")
        return False
    
    if not os.path.exists(LOCAL_DB_PATH):
        log_progress("Database file doesn't exist", "ERROR")
        return False
    
    try:
        # CRITICAL: Checkpoint WAL to merge changes into main file
        log_progress("Checkpointing WAL...", "DEBUG")
        conn = sqlite3.connect(LOCAL_DB_PATH)
        conn.execute("PRAGMA wal_checkpoint(TRUNCATE)")
        conn.close()
        log_progress("WAL checkpoint complete", "SUCCESS")
        
        # Verify indices are in file
        has_indices, idx_count = verify_database_has_indices(LOCAL_DB_PATH)
        log_progress(f"Pre-upload verification: {idx_count} indices", "SUCCESS" if has_indices else "WARN")
        
        api = HfApi()
        db_size = os.path.getsize(LOCAL_DB_PATH) / (2**30)
        
        log_progress(f"Uploading {db_size:.2f} GB to {INDEXED_REPO_ID}...", "CHECKPOINT")
        if message:
            log_progress(f"  Message: {message}", "INFO")
        log_progress("  This will take 2-5 minutes...", "INFO")
        
        start = time.time()
        
        api.upload_file(
            path_or_fileobj=LOCAL_DB_PATH,
            path_in_repo=INDEXED_DB_FILENAME,
            repo_id=INDEXED_REPO_ID,
            repo_type="dataset",
            token=HF_TOKEN,
            commit_message=message or "Database checkpoint"
        )
        
        elapsed = time.time() - start
        speed_mbps = (db_size * 8) / elapsed if elapsed > 0 else 0
        
        log_progress(f"Upload complete in {elapsed:.1f}s ({speed_mbps:.1f} Mbps)", "SUCCESS")
        log_progress(f"View at: https://huggingface.co/datasets/{INDEXED_REPO_ID}", "INFO")
        
        return True
        
    except Exception as e:
        log_progress(f"Upload failed: {e}", "ERROR")
        import traceback
        traceback.print_exc()
        return False

def create_indexed_database():
    """Create or download indexed database with comprehensive checkpointing"""
    log_progress("="*60, "INFO")
    log_progress("STARTING DATABASE SETUP", "INFO")
    log_progress("="*60, "INFO")
    
    # Check remote progress
    progress = check_remote_progress()
    completed_indices = set(progress.get("completed_indices", []))
    analyzed_tables = set(progress.get("analyzed_tables", []))
    database_uploaded = progress.get("database_uploaded", False)
    indexing_complete = progress.get("indexing_complete", False)
    
    # If fully complete, download and return
    if indexing_complete:
        log_progress("Fully indexed database exists!", "SUCCESS")
        log_progress(f"Downloading from {INDEXED_REPO_ID}...", "INFO")
        
        try:
            indexed_path = hf_hub_download(
                repo_id=INDEXED_REPO_ID,
                filename=INDEXED_DB_FILENAME,
                repo_type="dataset",
                token=HF_TOKEN
            )
            
            log_progress(f"Downloaded to: {indexed_path}", "SUCCESS")
            
            # Verify it actually has indices
            has_indices, idx_count = verify_database_has_indices(indexed_path)
            
            if has_indices:
                log_progress(f"Verified {idx_count} indices present", "SUCCESS")
                return indexed_path
            else:
                log_progress(f"CORRUPTED: Only {idx_count}/4 indices found!", "ERROR")
                log_progress("The database needs to be re-indexed", "WARN")
                
                # Reset and rebuild
                indexing_complete = False
                completed_indices = set()
                analyzed_tables = set()
                database_uploaded = False
                update_remote_progress([], [], False, False)
            
        except Exception as e:
            log_progress(f"Download failed: {e}", "ERROR")
            log_progress("Will create locally", "INFO")
    
    # Download partially indexed DB if checkpoint exists
    if (completed_indices or analyzed_tables or database_uploaded) and not os.path.exists(LOCAL_DB_PATH):
        log_progress("Checkpoint detected - downloading partial DB...", "INFO")
        log_progress(f"  Indices done: {sorted(completed_indices)}", "INFO")
        log_progress(f"  Tables analyzed: {sorted(analyzed_tables)}", "INFO")
        
        try:
            indexed_path = hf_hub_download(
                repo_id=INDEXED_REPO_ID,
                filename=INDEXED_DB_FILENAME,
                repo_type="dataset",
                token=HF_TOKEN
            )
            
            log_progress("Downloaded partial DB", "SUCCESS")
            
            # Verify indices
            has_indices, idx_count = verify_database_has_indices(indexed_path)
            
            if idx_count >= len(completed_indices):
                log_progress(f"Verified {idx_count} indices (expected {len(completed_indices)})", "SUCCESS")
                
                log_progress(f"Copying to {LOCAL_DB_PATH}...", "DEBUG")
                start = time.time()
                shutil.copy2(indexed_path, LOCAL_DB_PATH)
                elapsed = time.time() - start
                log_progress(f"Copied in {elapsed:.1f}s", "SUCCESS")
                log_progress("Resuming from checkpoint ✅", "SUCCESS")
            else:
                log_progress(f"Index mismatch: found {idx_count}, expected {len(completed_indices)}", "ERROR")
                log_progress("Will start from scratch", "WARN")
                completed_indices = set()
                analyzed_tables = set()
            
        except Exception as e:
            log_progress(f"Could not download partial DB: {e}", "WARN")
            log_progress("Will start from original", "INFO")
            completed_indices = set()
            analyzed_tables = set()
    
    # Download original if needed
    if not os.path.exists(LOCAL_DB_PATH):
        if completed_indices or analyzed_tables:
            log_progress("Failed to resume - clearing progress", "WARN")
            update_remote_progress([], [], False, False)
            completed_indices = set()
            analyzed_tables = set()
        
        log_progress("Downloading original ConceptNet database...", "INFO")
        
        original_path = hf_hub_download(
            repo_id=ORIGINAL_REPO_ID,
            filename=ORIGINAL_DB_FILENAME,
            repo_type="dataset"
        )
        
        original_size = os.path.getsize(original_path)
        free_space = shutil.disk_usage("/tmp")[2]
        
        log_progress(f"Original size: {original_size / (2**30):.2f} GB", "INFO")
        log_progress(f"Free space: {free_space / (2**30):.2f} GB", "INFO")
        
        if free_space < original_size * 2:
            raise Exception(f"Insufficient space! Need {original_size * 2 / (2**30):.1f} GB, have {free_space / (2**30):.1f} GB")
        
        log_progress(f"Copying to {LOCAL_DB_PATH}...", "INFO")
        start = time.time()
        shutil.copy2(original_path, LOCAL_DB_PATH)
        elapsed = time.time() - start
        log_progress(f"Copied {original_size / (2**30):.2f} GB in {elapsed:.1f}s ({original_size / elapsed / (2**20):.1f} MB/s)", "SUCCESS")
    
    # Only index if not complete
    if not (len(completed_indices) >= 4 and len(analyzed_tables) >= 4):
        log_progress("Indexing required", "INFO")
        
        # Connect
        log_progress("Opening database connection...", "DEBUG")
        conn = sqlite3.connect(LOCAL_DB_PATH)
        cursor = conn.cursor()
        
        # Optimizations
        log_progress("Setting PRAGMA optimizations...", "DEBUG")
        cursor.execute("PRAGMA journal_mode = WAL")
        cursor.execute("PRAGMA synchronous = NORMAL")
        cursor.execute("PRAGMA cache_size = -512000")
        cursor.execute("PRAGMA temp_store = MEMORY")
        
        # PHASE 1: Indices
        log_progress("="*60, "INFO")
        log_progress("PHASE 1: CREATING INDICES", "INFO")
        log_progress("="*60, "INFO")
        
        indices_to_create = [
            ("idx_edge_start_id", "edge", "start_id"),
            ("idx_edge_end_id", "edge", "end_id"),
            ("idx_edge_rel_id", "edge", "rel_id"),
            ("idx_node_label", "node", "label"),
        ]
        
        for i, (idx_name, table, column) in enumerate(indices_to_create, 1):
            if idx_name in completed_indices:
                log_progress(f"[{i}/{len(indices_to_create)}] {idx_name} - SKIPPED", "INFO")
                continue
            
            log_progress(f"[{i}/{len(indices_to_create)}] Creating {idx_name} on {table}({column})...", "INFO")
            
            start = time.time()
            
            try:
                cursor.execute(f"CREATE INDEX IF NOT EXISTS {idx_name} ON {table}({column})")
                conn.commit()
                
                elapsed = time.time() - start
                log_progress(f"  Created in {elapsed:.1f}s ({elapsed/60:.1f} min)", "SUCCESS")
                
                completed_indices.add(idx_name)
                update_remote_progress(list(completed_indices), list(analyzed_tables), False, False)
                upload_database_checkpoint(f"Checkpoint: {idx_name} ({i}/{len(indices_to_create)})")
                
            except Exception as e:
                log_progress(f"Failed to create {idx_name}: {e}", "ERROR")
                conn.close()
                raise
        
        # PHASE 2: ANALYZE
        log_progress("="*60, "INFO")
        log_progress("PHASE 2: ANALYZING TABLES", "INFO")
        log_progress("="*60, "INFO")
        
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'")
        tables = [row[0] for row in cursor.fetchall()]
        
        log_progress(f"Found {len(tables)} tables: {tables}", "INFO")
        
        for i, table in enumerate(tables, 1):
            if table in analyzed_tables:
                log_progress(f"[{i}/{len(tables)}] {table} - SKIPPED", "INFO")
                continue
            
            log_progress(f"[{i}/{len(tables)}] Analyzing {table}...", "INFO")
            
            try:
                cursor.execute(f"SELECT COUNT(*) FROM {table}")
                row_count = cursor.fetchone()[0]
                log_progress(f"  Rows: {row_count:,}", "INFO")
            except:
                log_progress("  Could not count rows", "WARN")
            
            start = time.time()
            
            try:
                cursor.execute(f"ANALYZE {table}")
                conn.commit()
                
                elapsed = time.time() - start
                log_progress(f"  Analyzed in {elapsed:.1f}s", "SUCCESS")
                
                analyzed_tables.add(table)
                update_remote_progress(list(completed_indices), list(analyzed_tables), False, False)
                upload_database_checkpoint(f"Checkpoint: {table} analyzed ({i}/{len(tables)})")
                
            except Exception as e:
                log_progress(f"Failed to analyze {table}: {e}", "ERROR")
                log_progress("Continuing...", "WARN")
        
        # Final checkpoint
        log_progress("Final WAL checkpoint...", "INFO")
        cursor.execute("PRAGMA wal_checkpoint(TRUNCATE)")
        conn.commit()
        conn.close()
        log_progress("Database closed", "SUCCESS")
        
        # Final upload
        log_progress("="*60, "INFO")
        log_progress("FINAL UPLOAD", "INFO")
        log_progress("="*60, "INFO")
        
        has_indices, idx_count = verify_database_has_indices(LOCAL_DB_PATH)
        log_progress(f"Final check: {idx_count} indices", "SUCCESS" if has_indices else "ERROR")
        
        upload_database_checkpoint("COMPLETE - All indices and analysis done")
        update_remote_progress(list(completed_indices), list(analyzed_tables), True, True)
        
        log_progress("="*60, "SUCCESS")
        log_progress("INDEXING COMPLETE!", "SUCCESS")
        log_progress("="*60, "SUCCESS")
    
    return LOCAL_DB_PATH

# Initialize
DB_PATH = create_indexed_database()

def get_db_connection():
    """Create optimized connection"""
    log_progress("Creating DB connection", "DEBUG")
    conn = sqlite3.connect(DB_PATH, check_same_thread=False)
    conn.execute("PRAGMA cache_size = -256000")
    conn.execute("PRAGMA mmap_size = 4294967296")
    return conn

def run_diagnostics():
    """Run comprehensive diagnostics"""
    log_progress("="*60, "INFO")
    log_progress("RUNNING DIAGNOSTICS", "INFO")
    log_progress("="*60, "INFO")
    
    try:
        with get_db_connection() as conn:
            cursor = conn.cursor()
            
            # 1. Sample nodes
            log_progress("\n1. Sample node IDs:", "INFO")
            cursor.execute("SELECT id, label FROM node LIMIT 10")
            for node_id, label in cursor.fetchall():
                print(f"   {node_id} -> {label}")
            
            # 2. Test correct pattern
            log_progress("\n2. Testing CORRECT pattern (no leading %):", "INFO")
            test_pattern = f"{CONCEPTNET_BASE}/c/en/dog%"
            log_progress(f"   Pattern: {test_pattern}", "DEBUG")
            
            start = time.time()
            cursor.execute("SELECT id, label FROM node WHERE id LIKE ? LIMIT 5", (test_pattern,))
            results = cursor.fetchall()
            elapsed = time.time() - start
            
            log_progress(f"   Found {len(results)} in {elapsed:.3f}s", "SUCCESS" if elapsed < 1 else "WARN")
            for node_id, label in results:
                print(f"     {node_id} -> {label}")
            
            # 3. Check index usage
            log_progress("\n3. Checking index usage:", "INFO")
            cursor.execute(f"EXPLAIN QUERY PLAN SELECT * FROM edge WHERE start_id LIKE '{test_pattern}'")
            plan = cursor.fetchall()
            uses_index = any('INDEX' in str(row).upper() for row in plan)
            log_progress(f"   Uses index: {uses_index}", "SUCCESS" if uses_index else "ERROR")
            for row in plan:
                print(f"     {row}")
            
            # 4. Test wrong pattern
            log_progress("\n4. Testing WRONG pattern (leading %):", "WARN")
            wrong_pattern = f"%/c/en/dog%"
            log_progress(f"   Pattern: {wrong_pattern}", "DEBUG")
            
            start = time.time()
            cursor.execute("SELECT id, label FROM node WHERE id LIKE ? LIMIT 5", (wrong_pattern,))
            results = cursor.fetchall()
            elapsed = time.time() - start
            
            log_progress(f"   Found {len(results)} in {elapsed:.3f}s (SLOW!)", "WARN" if elapsed > 1 else "INFO")
            
            cursor.execute(f"EXPLAIN QUERY PLAN SELECT * FROM node WHERE id LIKE '{wrong_pattern}'")
            plan = cursor.fetchall()
            uses_index = any('INDEX' in str(row).upper() for row in plan)
            log_progress(f"   Uses index: {uses_index} (should be False)", "WARN" if uses_index else "INFO")
            
            log_progress("\n" + "="*60, "INFO")
            log_progress("DIAGNOSTICS COMPLETE", "SUCCESS")
            log_progress("="*60 + "\n", "INFO")
            
    except Exception as e:
        log_progress(f"Diagnostics failed: {e}", "ERROR")
        import traceback
        traceback.print_exc()

# Run diagnostics
run_diagnostics()

def get_semantic_profile(word, lang='en', progress=gr.Progress()):
    """Get semantic profile with CORRECT URL pattern"""
    log_progress(f"Semantic profile request: word='{word}', lang='{lang}'", "DEBUG")
    progress(0, desc="Starting...")
    
    if not word:
        return "⚠️ Please enter a word."
    
    if lang not in TARGET_LANGUAGES:
        return f"⚠️ Language '{lang}' not supported. Available: {', '.join(TARGET_LANGUAGES)}"
    
    word = word.strip().lower().replace(' ', '_')
    
    # CORRECT pattern - no leading % allows index usage!
    like_path = f"{CONCEPTNET_BASE}/c/{lang}/{word}%"
    log_progress(f"Using pattern: {like_path}", "DEBUG")
    
    relations = [
        "/r/IsA", "/r/PartOf", "/r/HasA", "/r/UsedFor", "/r/CapableOf",
        "/r/Causes", "/r/HasProperty", "/r/Synonym", "/r/Antonym", 
        "/r/AtLocation", "/r/RelatedTo", "/r/DerivedFrom", "/r/SimilarTo"
    ]
    
    output_md = f"# 🧠 Semantic Profile: '{word}' ({lang.upper()})\n\n"
    
    try:
        with get_db_connection() as conn:
            cursor = conn.cursor()
            
            progress(0.05, desc="Finding nodes...")
            
            start = time.time()
            cursor.execute("SELECT id, label FROM node WHERE id LIKE ? LIMIT 5", (like_path,))
            nodes = cursor.fetchall()
            elapsed = time.time() - start
            
            log_progress(f"Found {len(nodes)} nodes in {elapsed:.3f}s", "SUCCESS" if nodes else "WARN")
            
            if not nodes:
                return f"# 🧠 Semantic Profile: '{word}'\n\n⚠️ **Not found**\n\nSearched: `{like_path}`"
            
            for node_id, label in nodes[:3]:
                output_md += f"**Node:** `{node_id}`\n"
                output_md += f"**Label:** {label}\n\n"
                log_progress(f"  Found node: {node_id} ({label})", "DEBUG")
            
            total_relations = 0
            
            for i, rel in enumerate(relations):
                progress((i + 1) / len(relations), desc=f"Querying {rel}...")
                log_progress(f"Querying relation: {rel}", "DEBUG")
                
                output_md += f"## {rel}\n\n"
                has_results = False
                
                # Outgoing edges
                start = time.time()
                cursor.execute("""
                    SELECT en.label, e.weight
                    FROM edge e
                    JOIN node en ON e.end_id = en.id
                    JOIN relation r ON e.rel_id = r.id
                    WHERE e.start_id LIKE ? AND r.label = ?
                    ORDER BY e.weight DESC
                    LIMIT 7
                """, (like_path, rel))
                
                out_results = cursor.fetchall()
                elapsed = time.time() - start
                log_progress(f"  Outgoing: {len(out_results)} results in {elapsed:.3f}s", "DEBUG")
                
                for label, weight in out_results:
                    output_md += f"- **{word}** {rel} → *{label}* `[{weight:.3f}]`\n"
                    has_results = True
                    total_relations += 1
                
                # Incoming edges
                start = time.time()
                cursor.execute("""
                    SELECT s.label, e.weight
                    FROM edge e
                    JOIN node s ON e.start_id = s.id
                    JOIN relation r ON e.rel_id = r.id
                    WHERE e.end_id LIKE ? AND r.label = ?
                    ORDER BY e.weight DESC
                    LIMIT 7
                """, (like_path, rel))
                
                in_results = cursor.fetchall()
                elapsed = time.time() - start
                log_progress(f"  Incoming: {len(in_results)} results in {elapsed:.3f}s", "DEBUG")
                
                for label, weight in in_results:
                    output_md += f"- *{label}* {rel} → **{word}** `[{weight:.3f}]`\n"
                    has_results = True
                    total_relations += 1
                
                if not has_results:
                    output_md += "*No results*\n"
                
                output_md += "\n"
            
            progress(1.0, desc="Complete!")
            
            output_md += "---\n"
            output_md += f"**Total relations:** {total_relations}\n"
            
            log_progress(f"Profile complete: {total_relations} relations found", "SUCCESS")
            
            return output_md
            
    except Exception as e:
        log_progress(f"Error in semantic profile: {e}", "ERROR")
        import traceback
        traceback.print_exc()
        return f"**❌ Error:**\n\n```\n{e}\n```"

def run_query(start_node, relation, end_node, limit, progress=gr.Progress()):
    """Query builder with CORRECT patterns"""
    log_progress(f"Query request: start={start_node}, rel={relation}, end={end_node}, limit={limit}", "DEBUG")
    progress(0, desc="Building query...")
    
    query = """
        SELECT
            e.id AS edge_id,
            s.id AS start_id,
            r.label AS relation,
            en.id AS end_id,
            e.weight,
            s.label AS start_label,
            en.label AS end_label
        FROM edge e
        JOIN relation r ON e.rel_id = r.id
        JOIN node s ON e.start_id = s.id
        JOIN node en ON e.end_id = en.id
        WHERE 1=1
    """
    
    params = []
    
    try:
        with get_db_connection() as conn:
            progress(0.3, desc="Adding filters...")
            
            # Language filter - use correct URL pattern!
            lang_conditions = []
            for lang in TARGET_LANGUAGES:
                lang_conditions.append(f"s.id LIKE '{CONCEPTNET_BASE}/c/{lang}/%'")
                lang_conditions.append(f"en.id LIKE '{CONCEPTNET_BASE}/c/{lang}/%'")
            query += f" AND ({' OR '.join(lang_conditions)})"
            
            # Start node filter
            if start_node and start_node.strip():
                if start_node.startswith('http://'):
                    pattern = f"{start_node}%"
                else:
                    # User enters just word, we construct full URL
                    pattern = f"{CONCEPTNET_BASE}/c/%/{start_node}%"
                query += " AND s.id LIKE ?"
                params.append(pattern)
                log_progress(f"Start filter: {pattern}", "DEBUG")
            
            # Relation filter
            if relation and relation.strip():
                rel_value = relation if relation.startswith('/r/') else f"/r/{relation}"
                if '%' in relation:
                    query += " AND r.label LIKE ?"
                else:
                    query += " AND r.label = ?"
                params.append(rel_value)
                log_progress(f"Relation filter: {rel_value}", "DEBUG")
            
            # End node filter
            if end_node and end_node.strip():
                if end_node.startswith('http://'):
                    pattern = f"{end_node}%"
                else:
                    pattern = f"{CONCEPTNET_BASE}/c/%/{end_node}%"
                query += " AND en.id LIKE ?"
                params.append(pattern)
                log_progress(f"End filter: {pattern}", "DEBUG")
            
            query += " ORDER BY e.weight DESC LIMIT ?"
            params.append(limit)
            
            progress(0.6, desc="Executing...")
            log_progress(f"Executing query with {len(params)} params", "DEBUG")
            
            start_time = time.time()
            df = pd.read_sql_query(query, conn, params=params)
            elapsed = time.time() - start_time
            
            log_progress(f"Query complete: {len(df)} results in {elapsed:.2f}s", "SUCCESS")
            
            progress(1.0, desc="Complete!")
            
            if df.empty:
                return pd.DataFrame(), f"⚠️ No results ({elapsed:.2f}s)"
            
            df.columns = ['edge_id', 'start_id', 'relation', 'end_id', 'weight', 'start_label', 'end_label']
            return df, f"✅ {len(df)} results in {elapsed:.2f}s"
            
    except Exception as e:
        log_progress(f"Query error: {e}", "ERROR")
        import traceback
        traceback.print_exc()
        return pd.DataFrame(), f"**❌ Error:** {e}"

def run_raw_query(sql_query):
    """Execute raw SQL with logging"""
    log_progress(f"Raw SQL query: {sql_query[:100]}...", "DEBUG")
    
    if not sql_query.strip().upper().startswith("SELECT"):
        return pd.DataFrame(), "❌ Only SELECT queries allowed"
    
    try:
        with get_db_connection() as conn:
            start = time.time()
            df = pd.read_sql_query(sql_query, conn)
            elapsed = time.time() - start
            
            log_progress(f"Raw query complete: {len(df)} rows in {elapsed:.3f}s", "SUCCESS")
            
            return df, f"✅ {len(df)} rows in {elapsed:.3f}s"
            
    except Exception as e:
        log_progress(f"Raw query error: {e}", "ERROR")
        return pd.DataFrame(), f"❌ Error: {e}"

def get_schema_info():
    """Get schema with sample queries"""
    log_progress("Loading schema info", "DEBUG")
    
    md = f"# 📚 Database Schema\n\n"
    md += f"**Repository:** [{INDEXED_REPO_ID}](https://huggingface.co/datasets/{INDEXED_REPO_ID})\n\n"
    md += f"**Base URL:** `{CONCEPTNET_BASE}`\n\n"
    
    md += "## Sample Queries\n\n"
    md += "**Finding nodes:**\n```sql\n"
    md += f"-- English 'dog'\n"
    md += f"SELECT * FROM node WHERE id LIKE '{CONCEPTNET_BASE}/c/en/dog%';\n\n"
    md += f"-- German 'hund'\n"
    md += f"SELECT * FROM node WHERE id LIKE '{CONCEPTNET_BASE}/c/de/hund%';\n"
    md += "```\n\n"
    
    md += "**Finding edges:**\n```sql\n"
    md += f"-- Edges from 'dog'\n"
    md += f"SELECT * FROM edge WHERE start_id LIKE '{CONCEPTNET_BASE}/c/en/dog%' LIMIT 10;\n"
    md += "```\n\n"
    
    md += "⚠️ **Important:** Do NOT use leading `%` in LIKE queries (prevents index usage!)\n\n"
    md += "✅ **Good:** `LIKE 'http://conceptnet.io/c/en/dog%'`\n"
    md += "❌ **Bad:** `LIKE '%/c/en/dog%'`\n\n"
    
    try:
        with get_db_connection() as conn:
            cursor = conn.cursor()
            
            md += "## Tables\n\n"
            
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'")
            
            for table, in cursor.fetchall():
                cursor.execute(f"SELECT COUNT(*) FROM {table}")
                count = cursor.fetchone()[0]
                
                md += f"### {table} ({count:,} rows)\n\n"
                
                # Show columns
                cursor.execute(f"PRAGMA table_info({table})")
                cols = cursor.fetchall()
                
                md += "| Column | Type |\n|:--|:--|\n"
                for col in cols:
                    md += f"| `{col[1]}` | `{col[2]}` |\n"
                
                # Show indices
                cursor.execute(f"PRAGMA index_list({table})")
                indices = cursor.fetchall()
                
                if indices:
                    md += f"\n**Indices ({len(indices)}):**\n"
                    for idx in indices:
                        custom = " 🆕" if idx[1].startswith("idx_") else ""
                        md += f"- `{idx[1]}`{custom}\n"
                
                md += "\n"
            
            log_progress("Schema loaded successfully", "SUCCESS")
            
    except Exception as e:
        log_progress(f"Schema error: {e}", "ERROR")
        md += f"\n**Error loading schema:** {e}\n"
    
    return md

# UI
with gr.Blocks(title="ConceptNet Explorer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧠 ConceptNet Explorer")
    gr.Markdown(
        f"**Multi-language semantic network explorer** | "
        f"**Languages:** {', '.join([l.upper() for l in TARGET_LANGUAGES])} | "
        f"**Repo:** [{INDEXED_REPO_ID}](https://huggingface.co/datasets/{INDEXED_REPO_ID})"
    )
    gr.Markdown("✅ **Optimized with custom indices** - Fast queries using correct URL patterns")
    
    with gr.Tabs():
        with gr.TabItem("🔍 Semantic Profile"):
            gr.Markdown("**Explore semantic relations for any word**")
            
            with gr.Row():
                word_input = gr.Textbox(
                    label="Word",
                    placeholder="dog",
                    value="dog",
                    info="Enter a word to explore"
                )
                lang_input = gr.Dropdown(
                    choices=TARGET_LANGUAGES,
                    value="en",
                    label="Language",
                    info="Select language"
                )
            
            semantic_btn = gr.Button("🔍 Get Semantic Profile", variant="primary", size="lg")
            semantic_output = gr.Markdown("*Enter a word and click the button to start...*")
            
            gr.Markdown("**Examples:** dog (en), hund (de), perro (es), chien (fr), 犬 (ja)")
        
        with gr.TabItem("⚡ Query Builder"):
            gr.Markdown("**Build custom queries to find specific relationships**")
            
            with gr.Row():
                start_input = gr.Textbox(
                    label="Start Node",
                    placeholder="dog",
                    info="Enter word or full URL"
                )
                rel_input = gr.Textbox(
                    label="Relation",
                    placeholder="IsA",
                    value="IsA",
                    info="e.g., IsA, PartOf, UsedFor"
                )
                end_input = gr.Textbox(
                    label="End Node",
                    placeholder="",
                    info="Leave empty for all"
                )
            
            limit_slider = gr.Slider(
                label="Result Limit",
                minimum=1,
                maximum=200,
                value=50,
                step=1
            )
            
            query_btn = gr.Button("▶️ Run Query", variant="primary", size="lg")
            
            status_output = gr.Markdown("*Ready to query...*")
            results_output = gr.DataFrame(
                label="Results",
                wrap=True,
                interactive=False
            )
        
        with gr.TabItem("💻 Raw SQL"):
            gr.Markdown("**Execute custom SQL queries** (SELECT only)")
            
            raw_sql_input = gr.Textbox(
                label="SQL Query",
                value=f"SELECT * FROM node WHERE id LIKE '{CONCEPTNET_BASE}/c/en/dog%' LIMIT 10",
                lines=5,
                info="Write your SELECT query"
            )
            
            raw_btn = gr.Button("▶️ Execute Query", variant="secondary", size="lg")
            
            raw_status = gr.Markdown()
            raw_results = gr.DataFrame(label="Query Results", wrap=True)
            
            gr.Markdown(
                "**Tips:**\n"
                "- Always use `LIMIT` to prevent timeouts\n"
                f"- Node IDs start with: `{CONCEPTNET_BASE}/c/{{lang}}/{{word}}`\n"
                "- Don't use leading `%` in LIKE queries for best performance"
            )
        
        with gr.TabItem("📊 Schema & Info"):
            gr.Markdown("**Database schema and structure information**")
            
            schema_btn = gr.Button("📊 Load Schema", variant="secondary", size="lg")
            schema_output = gr.Markdown("*Click button to load schema...*")
    
    gr.Markdown(
        "---\n"
        "**Performance:** Custom indices on `edge.start_id`, `edge.end_id`, `edge.rel_id`, `node.label` | "
        "**Check server logs for detailed query timing and diagnostics**"
    )
    
    # Wire up event handlers
    semantic_btn.click(
        fn=get_semantic_profile,
        inputs=[word_input, lang_input],
        outputs=semantic_output
    )
    
    query_btn.click(
        fn=run_query,
        inputs=[start_input, rel_input, end_input, limit_slider],
        outputs=[results_output, status_output]
    )
    
    raw_btn.click(
        fn=run_raw_query,
        inputs=raw_sql_input,
        outputs=[raw_results, raw_status]
    )
    
    schema_btn.click(
        fn=get_schema_info,
        inputs=None,
        outputs=schema_output
    )

if __name__ == "__main__":
    log_progress("="*60, "SUCCESS")
    log_progress("APP READY!", "SUCCESS")
    log_progress("="*60, "SUCCESS")
    log_progress(f"Database: {DB_PATH}", "INFO")
    log_progress(f"Size: {os.path.getsize(DB_PATH) / (2**30):.2f} GB", "INFO")
    log_progress("="*60 + "\n", "SUCCESS")
    
    demo.launch(ssr_mode=False)