import gradio as gr
import sqlite3
import pandas as pd
from huggingface_hub import hf_hub_download, HfApi, HfFolder
import os
import time
import shutil
from pathlib import Path
import json

# ===== CONFIGURATION =====
TARGET_LANGUAGES = ['de']
INDEXED_REPO_ID = "cstr/conceptnet-de-indexed"
INDEXED_DB_FILENAME = "conceptnet-de-indexed.db"
PROGRESS_FILENAME = "indexing_progress.json"
LOCAL_DB_PATH = "/tmp/conceptnet-indexed.db"
# =========================

print(f"🌍 Filtering to: {', '.join([l.upper() for l in TARGET_LANGUAGES])}")

# Get HF token
HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")
if not HF_TOKEN:
    try:
        HF_TOKEN = HfFolder.get_token()
    except:
        pass

if not HF_TOKEN:
    print("⚠️  WARNING: No HF_TOKEN found!")
    print("   Add HF_TOKEN in Space settings to enable checkpointing")

# Original database
ORIGINAL_REPO_ID = "ysenarath/conceptnet-sqlite"
ORIGINAL_DB_FILENAME = "data/conceptnet-v5.7.0.db"

def log_progress(message, level="INFO"):
    """Enhanced logging with timestamp"""
    timestamp = time.strftime("%H:%M:%S")
    prefix = {
        "INFO": "ℹ️ ",
        "SUCCESS": "✅",
        "ERROR": "❌",
        "WARN": "⚠️ ",
        "CHECKPOINT": "💾"
    }.get(level, "")
    print(f"[{timestamp}] {prefix} {message}")

def check_remote_progress():
    """Check remote progress with detailed logging"""
    if not HF_TOKEN:
        log_progress("No HF_TOKEN - cannot check remote progress", "WARN")
        return {
            "completed_indices": [],
            "analyzed_tables": [],
            "database_uploaded": False,
            "indexing_complete": False
        }
    
    try:
        api = HfApi()
        
        # Check if repo exists
        try:
            api.repo_info(repo_id=INDEXED_REPO_ID, repo_type="dataset", token=HF_TOKEN)
            log_progress(f"Repository exists: {INDEXED_REPO_ID}", "SUCCESS")
        except:
            log_progress("Repository doesn't exist yet", "INFO")
            return {
                "completed_indices": [],
                "analyzed_tables": [],
                "database_uploaded": False,
                "indexing_complete": False
            }
        
        # Download progress file
        try:
            progress_path = hf_hub_download(
                repo_id=INDEXED_REPO_ID,
                filename=PROGRESS_FILENAME,
                repo_type="dataset",
                token=HF_TOKEN
            )
            
            with open(progress_path, 'r') as f:
                progress = json.load(f)
            
            log_progress("Remote progress loaded:", "INFO")
            log_progress(f"  Completed indices: {progress.get('completed_indices', [])}", "INFO")
            log_progress(f"  Analyzed tables: {progress.get('analyzed_tables', [])}", "INFO")
            log_progress(f"  Indexing complete: {progress.get('indexing_complete', False)}", "INFO")
            
            return progress
            
        except Exception as e:
            log_progress("No progress file found (starting fresh)", "INFO")
            return {
                "completed_indices": [],
                "analyzed_tables": [],
                "database_uploaded": False,
                "indexing_complete": False
            }
            
    except Exception as e:
        log_progress(f"Error checking remote: {e}", "ERROR")
        return {
            "completed_indices": [],
            "analyzed_tables": [],
            "database_uploaded": False,
            "indexing_complete": False
        }

def update_remote_progress(completed_indices, analyzed_tables=None, database_uploaded=False, indexing_complete=False):
    """Update progress with detailed tracking"""
    if not HF_TOKEN:
        log_progress("Cannot update progress: No HF_TOKEN", "WARN")
        return False
    
    if analyzed_tables is None:
        analyzed_tables = []
    
    try:
        api = HfApi()
        
        # Create repo if needed
        try:
            api.repo_info(repo_id=INDEXED_REPO_ID, repo_type="dataset", token=HF_TOKEN)
        except:
            log_progress(f"Creating repository: {INDEXED_REPO_ID}", "INFO")
            api.create_repo(
                repo_id=INDEXED_REPO_ID,
                repo_type="dataset",
                token=HF_TOKEN,
                private=False
            )
        
        # Create progress file
        progress = {
            "completed_indices": completed_indices,
            "analyzed_tables": analyzed_tables,
            "database_uploaded": database_uploaded,
            "indexing_complete": indexing_complete,
            "timestamp": time.time(),
            "languages": TARGET_LANGUAGES
        }
        
        progress_path = "/tmp/indexing_progress.json"
        with open(progress_path, 'w') as f:
            json.dump(progress, f, indent=2)
        
        # Upload
        api.upload_file(
            path_or_fileobj=progress_path,
            path_in_repo=PROGRESS_FILENAME,
            repo_id=INDEXED_REPO_ID,
            repo_type="dataset",
            token=HF_TOKEN,
            commit_message=f"Progress: {len(completed_indices)} indices, {len(analyzed_tables)} tables analyzed"
        )
        
        log_progress(f"Progress updated: {len(completed_indices)} indices, {len(analyzed_tables)} tables analyzed", "CHECKPOINT")
        return True
        
    except Exception as e:
        log_progress(f"Failed to update progress: {e}", "ERROR")
        return False

def upload_database_checkpoint(message=""):
    """Upload database with progress reporting"""
    if not HF_TOKEN:
        log_progress("Cannot upload: No HF_TOKEN", "WARN")
        return False
    
    if not os.path.exists(LOCAL_DB_PATH):
        log_progress("Database file doesn't exist", "ERROR")
        return False
    
    try:
        api = HfApi()
        
        db_size = os.path.getsize(LOCAL_DB_PATH) / (2**30)
        log_progress(f"Uploading database checkpoint ({db_size:.2f} GB)...", "CHECKPOINT")
        log_progress(f"  {message}", "INFO")
        log_progress(f"  This may take 5-10 minutes...", "INFO")
        
        start = time.time()
        
        api.upload_file(
            path_or_fileobj=LOCAL_DB_PATH,
            path_in_repo=INDEXED_DB_FILENAME,
            repo_id=INDEXED_REPO_ID,
            repo_type="dataset",
            token=HF_TOKEN,
            commit_message=message or "Database checkpoint"
        )
        
        elapsed = time.time() - start
        log_progress(f"Database uploaded in {elapsed:.1f}s ({db_size*8/elapsed:.1f} Mbps)", "SUCCESS")
        
        return True
        
    except Exception as e:
        log_progress(f"Upload failed: {e}", "ERROR")
        import traceback
        traceback.print_exc()
        return False

def create_indexed_database():
    """Create indexed database with comprehensive checkpointing"""
    log_progress("="*60, "INFO")
    log_progress("STARTING INDEXED DATABASE CREATION", "INFO")
    log_progress("="*60, "INFO")
    
    # Check remote progress
    progress = check_remote_progress()
    completed_indices = set(progress.get("completed_indices", []))
    analyzed_tables = set(progress.get("analyzed_tables", []))
    database_uploaded = progress.get("database_uploaded", False)
    indexing_complete = progress.get("indexing_complete", False)
    
    # If fully complete, download and return
    if indexing_complete:
        log_progress("Fully indexed database exists!", "SUCCESS")
        log_progress(f"Downloading from {INDEXED_REPO_ID}...", "INFO")
        
        try:
            indexed_path = hf_hub_download(
                repo_id=INDEXED_REPO_ID,
                filename=INDEXED_DB_FILENAME,
                repo_type="dataset",
                token=HF_TOKEN
            )
            log_progress(f"Downloaded: {indexed_path}", "SUCCESS")
            return indexed_path
            
        except Exception as e:
            log_progress(f"Download failed: {e}", "ERROR")
            log_progress("Will create locally", "INFO")
    
    # Check for partial progress
    if completed_indices or analyzed_tables:
        log_progress("Resuming from checkpoint:", "INFO")
        log_progress(f"  Completed indices: {sorted(completed_indices)}", "INFO")
        log_progress(f"  Analyzed tables: {sorted(analyzed_tables)}", "INFO")
    
    # Get or create local database
    if os.path.exists(LOCAL_DB_PATH) and (completed_indices or analyzed_tables):
        log_progress("Using existing local database", "SUCCESS")
    elif database_uploaded:
        log_progress("Downloading partial database from HF...", "INFO")
        try:
            remote_db = hf_hub_download(
                repo_id=INDEXED_REPO_ID,
                filename=INDEXED_DB_FILENAME,
                repo_type="dataset",
                token=HF_TOKEN
            )
            shutil.copy2(remote_db, LOCAL_DB_PATH)
            log_progress("Downloaded partial database", "SUCCESS")
        except:
            log_progress("No partial database, starting from original", "INFO")
    
    if not os.path.exists(LOCAL_DB_PATH):
        # Download and copy original
        log_progress("Downloading original database...", "INFO")
        original_path = hf_hub_download(
            repo_id=ORIGINAL_REPO_ID,
            filename=ORIGINAL_DB_FILENAME,
            repo_type="dataset"
        )
        
        original_size = os.path.getsize(original_path)
        free_space = shutil.disk_usage("/tmp")[2]
        
        log_progress(f"Original size: {original_size / (2**30):.2f} GB", "INFO")
        log_progress(f"Free space: {free_space / (2**30):.2f} GB", "INFO")
        
        if free_space < original_size * 2:
            raise Exception(f"Not enough space! Need {original_size * 2 / (2**30):.1f} GB")
        
        log_progress(f"Copying to {LOCAL_DB_PATH}...", "INFO")
        start = time.time()
        shutil.copy2(original_path, LOCAL_DB_PATH)
        elapsed = time.time() - start
        log_progress(f"Copied in {elapsed:.1f}s", "SUCCESS")
    
    # Connect to database
    conn = sqlite3.connect(LOCAL_DB_PATH)
    cursor = conn.cursor()
    
    # Enable optimizations
    cursor.execute("PRAGMA journal_mode = WAL")
    cursor.execute("PRAGMA synchronous = NORMAL")
    cursor.execute("PRAGMA cache_size = -512000")
    
    # PHASE 1: Create Indices
    log_progress("="*60, "INFO")
    log_progress("PHASE 1: CREATING INDICES", "INFO")
    log_progress("="*60, "INFO")
    
    indices_to_create = [
        ("idx_edge_start_id", "edge", "start_id"),
        ("idx_edge_end_id", "edge", "end_id"),
        ("idx_edge_rel_id", "edge", "rel_id"),
        ("idx_node_label", "node", "label"),
    ]
    
    for i, (idx_name, table, column) in enumerate(indices_to_create, 1):
        if idx_name in completed_indices:
            log_progress(f"[{i}/{len(indices_to_create)}] {idx_name} - SKIPPED (already complete)", "INFO")
            continue
        
        log_progress(f"[{i}/{len(indices_to_create)}] Creating {idx_name} on {table}({column})...", "INFO")
        
        start = time.time()
        
        try:
            cursor.execute(f"CREATE INDEX IF NOT EXISTS {idx_name} ON {table}({column})")
            conn.commit()
            
            elapsed = time.time() - start
            log_progress(f"  Index created in {elapsed:.1f}s ({elapsed/60:.1f} min)", "SUCCESS")
            
            # Update progress
            completed_indices.add(idx_name)
            update_remote_progress(
                list(completed_indices),
                list(analyzed_tables),
                database_uploaded=False,
                indexing_complete=False
            )
            
            # Upload checkpoint
            upload_database_checkpoint(f"Checkpoint: {idx_name} created")
            
        except Exception as e:
            log_progress(f"Failed to create {idx_name}: {e}", "ERROR")
            conn.close()
            raise
    
    # PHASE 2: Analyze Tables (per-table with checkpoints)
    log_progress("="*60, "INFO")
    log_progress("PHASE 2: ANALYZING TABLES", "INFO")
    log_progress("="*60, "INFO")
    
    # Get list of tables
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'")
    tables = [row[0] for row in cursor.fetchall()]
    
    log_progress(f"Found {len(tables)} tables to analyze: {tables}", "INFO")
    
    for i, table in enumerate(tables, 1):
        if table in analyzed_tables:
            log_progress(f"[{i}/{len(tables)}] {table} - SKIPPED (already analyzed)", "INFO")
            continue
        
        log_progress(f"[{i}/{len(tables)}] Analyzing table: {table}", "INFO")
        
        # Get table size for progress estimation
        cursor.execute(f"SELECT COUNT(*) FROM {table}")
        row_count = cursor.fetchone()[0]
        log_progress(f"  Table has {row_count:,} rows", "INFO")
        
        start = time.time()
        
        try:
            # Run ANALYZE on this specific table
            cursor.execute(f"ANALYZE {table}")
            conn.commit()
            
            elapsed = time.time() - start
            log_progress(f"  Analyzed in {elapsed:.1f}s ({elapsed/60:.1f} min)", "SUCCESS")
            
            # Update progress
            analyzed_tables.add(table)
            update_remote_progress(
                list(completed_indices),
                list(analyzed_tables),
                database_uploaded=False,
                indexing_complete=False
            )
            
            # Upload checkpoint after each table
            log_progress(f"  Uploading checkpoint after analyzing {table}...", "CHECKPOINT")
            upload_database_checkpoint(f"Checkpoint: {table} analyzed ({i}/{len(tables)})")
            
        except Exception as e:
            log_progress(f"Failed to analyze {table}: {e}", "ERROR")
            log_progress("Continuing with next table...", "WARN")
    
    conn.close()
    
    # PHASE 3: Final upload and completion
    log_progress("="*60, "INFO")
    log_progress("PHASE 3: FINAL UPLOAD", "INFO")
    log_progress("="*60, "INFO")
    
    log_progress("All indexing and analysis complete!", "SUCCESS")
    log_progress("Performing final upload...", "INFO")
    
    upload_database_checkpoint("Final indexed database - COMPLETE")
    
    # Mark as complete
    update_remote_progress(
        list(completed_indices),
        list(analyzed_tables),
        database_uploaded=True,
        indexing_complete=True
    )
    
    indexed_size = os.path.getsize(LOCAL_DB_PATH)
    
    log_progress("="*60, "SUCCESS")
    log_progress("INDEXING COMPLETE!", "SUCCESS")
    log_progress("="*60, "SUCCESS")
    log_progress(f"Final size: {indexed_size / (2**30):.2f} GB", "INFO")
    log_progress(f"Indices: {sorted(completed_indices)}", "INFO")
    log_progress(f"Analyzed: {sorted(analyzed_tables)}", "INFO")
    log_progress(f"Saved to: https://huggingface.co/datasets/{INDEXED_REPO_ID}", "INFO")
    log_progress("="*60, "SUCCESS")
    
    return LOCAL_DB_PATH

# Initialize database
DB_PATH = create_indexed_database()

def get_db_connection():
    """Create optimized connection"""
    conn = sqlite3.connect(DB_PATH, check_same_thread=False)
    conn.execute("PRAGMA cache_size = -256000")
    conn.execute("PRAGMA mmap_size = 4294967296")
    return conn

def verify_indices():
    """Verify indices"""
    log_progress("="*60, "INFO")
    log_progress("VERIFYING INDICES", "INFO")
    log_progress("="*60, "INFO")
    
    with get_db_connection() as conn:
        cursor = conn.cursor()
        
        cursor.execute("SELECT name FROM sqlite_master WHERE type='index' AND name LIKE 'idx_%'")
        custom_indices = cursor.fetchall()
        
        log_progress(f"Custom indices: {len(custom_indices)}", "INFO")
        for idx in custom_indices:
            log_progress(f"  ✓ {idx[0]}", "SUCCESS")
        
        # Speed test
        log_progress("Running speed test...", "INFO")
        start = time.time()
        cursor.execute("SELECT COUNT(*) FROM edge WHERE start_id LIKE '/c/de/hund%'")
        count = cursor.fetchone()[0]
        elapsed = time.time() - start
        
        status = "SUCCESS" if elapsed < 1 else "WARN" if elapsed < 5 else "ERROR"
        log_progress(f"Query: {count} results in {elapsed:.3f}s", status)
        
        log_progress("="*60, "INFO")

verify_indices()

def get_semantic_profile(word, lang='de', progress=gr.Progress()):
    """Semantic profile"""
    progress(0, desc="Starting...")
    
    if not word:
        return "⚠️ Please enter a word."
    
    word = word.strip().lower().replace(' ', '_')
    like_path = f"/c/{lang}/{word}%"
    
    relations = [
        "/r/IsA", "/r/PartOf", "/r/HasA", "/r/UsedFor", "/r/CapableOf",
        "/r/Causes", "/r/HasProperty", "/r/Synonym", "/r/Antonym", 
        "/r/AtLocation", "/r/RelatedTo"
    ]
    
    output_md = f"# 🧠 Semantic Profile: '{word}'\n\n"
    
    try:
        with get_db_connection() as conn:
            cursor = conn.cursor()
            
            progress(0.05, desc="Finding nodes...")
            cursor.execute("SELECT id, label FROM node WHERE id LIKE ? LIMIT 5", (like_path,))
            nodes = cursor.fetchall()
            
            if not nodes:
                return f"# 🧠 Semantic Profile: '{word}'\n\n⚠️ **Not found**"
            
            for node_id, label in nodes[:3]:
                output_md += f"**Node:** `{node_id}` ({label})\n"
            output_md += "\n"
            
            total = 0
            
            for i, rel in enumerate(relations):
                progress((i + 1) / len(relations), desc=f"Querying {rel}...")
                
                output_md += f"## {rel}\n\n"
                found = False
                
                # Outgoing
                cursor.execute("""
                    SELECT en.label, e.weight
                    FROM edge e
                    JOIN node en ON e.end_id = en.id
                    JOIN relation r ON e.rel_id = r.id
                    WHERE e.start_id LIKE ? AND r.label = ?
                    ORDER BY e.weight DESC LIMIT 7
                """, (like_path, rel))
                
                for label, weight in cursor.fetchall():
                    output_md += f"- **{word}** {rel} → *{label}* `[{weight:.3f}]`\n"
                    found = True
                    total += 1
                
                # Incoming
                cursor.execute("""
                    SELECT s.label, e.weight
                    FROM edge e
                    JOIN node s ON e.start_id = s.id
                    JOIN relation r ON e.rel_id = r.id
                    WHERE e.end_id LIKE ? AND r.label = ?
                    ORDER BY e.weight DESC LIMIT 7
                """, (like_path, rel))
                
                for label, weight in cursor.fetchall():
                    output_md += f"- *{label}* {rel} → **{word}** `[{weight:.3f}]`\n"
                    found = True
                    total += 1
                
                if not found:
                    output_md += "*No results*\n"
                output_md += "\n"
            
            progress(1.0, desc="Complete!")
            output_md += f"---\n**Total:** {total} relations\n"
            return output_md
            
    except Exception as e:
        import traceback
        traceback.print_exc()
        return f"**❌ Error:** {e}"

def run_query(start_node, relation, end_node, limit, progress=gr.Progress()):
    """Query builder"""
    progress(0, desc="Starting...")
    
    query = """
        SELECT e.id, s.id, r.label, en.id, e.weight, s.label, en.label
        FROM edge e
        JOIN relation r ON e.rel_id = r.id
        JOIN node s ON e.start_id = s.id
        JOIN node en ON e.end_id = en.id
        WHERE 1=1
    """
    
    params = []
    
    try:
        with get_db_connection() as conn:
            progress(0.3, desc="Building...")
            
            # Language filter
            lang_cond = []
            for lang in TARGET_LANGUAGES:
                lang_cond.append(f"s.id LIKE '/c/{lang}/%'")
                lang_cond.append(f"en.id LIKE '/c/{lang}/%'")
            query += f" AND ({' OR '.join(lang_cond)})"
            
            if start_node and start_node.strip():
                pattern = start_node if '%' in start_node else f"%{start_node}%"
                query += " AND s.id LIKE ?"
                params.append(pattern)
            
            if relation and relation.strip():
                rel_value = relation if relation.startswith('/r/') else f"/r/{relation}"
                query += " AND r.label = ?" if '%' not in relation else " AND r.label LIKE ?"
                params.append(rel_value)
            
            if end_node and end_node.strip():
                pattern = end_node if '%' in end_node else f"%{end_node}%"
                query += " AND en.id LIKE ?"
                params.append(pattern)
            
            query += " ORDER BY e.weight DESC LIMIT ?"
            params.append(limit)
            
            progress(0.6, desc="Executing...")
            
            start_time = time.time()
            df = pd.read_sql_query(query, conn, params=params)
            elapsed = time.time() - start_time
            
            progress(1.0, desc="Complete!")
            
            if df.empty:
                return pd.DataFrame(), f"⚠️ No results ({elapsed:.2f}s)"
            
            df.columns = ['edge_id', 'start_id', 'relation', 'end_id', 'weight', 'start_label', 'end_label']
            return df, f"✅ {len(df)} results in {elapsed:.2f}s"
            
    except Exception as e:
        import traceback
        traceback.print_exc()
        return pd.DataFrame(), f"**❌ Error:** {e}"

def run_raw_query(sql_query):
    """Raw SQL"""
    if not sql_query.strip().upper().startswith("SELECT"):
        return pd.DataFrame(), "Only SELECT allowed"
    try:
        with get_db_connection() as conn:
            df = pd.read_sql_query(sql_query, conn)
            return df, f"✅ {len(df)} rows"
    except Exception as e:
        return pd.DataFrame(), f"Error: {e}"

def get_schema_info():
    """Schema info"""
    with get_db_connection() as conn:
        cursor = conn.cursor()
        md = f"# 📚 Schema\n\n**Repo:** [{INDEXED_REPO_ID}](https://huggingface.co/datasets/{INDEXED_REPO_ID})\n\n"
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'")
        for table, in cursor.fetchall():
            cursor.execute(f"SELECT COUNT(*) FROM {table}")
            md += f"## {table} ({cursor.fetchone()[0]:,} rows)\n\n"
        return md

# UI
with gr.Blocks(title="ConceptNet", theme=gr.themes.Soft()) as demo:
    gr.Markdown(f"# 🧠 ConceptNet ({', '.join([l.upper() for l in TARGET_LANGUAGES])})")
    gr.Markdown(f"**Repo:** [{INDEXED_REPO_ID}](https://huggingface.co/datasets/{INDEXED_REPO_ID}) | ✅ Per-table checkpoints")
    
    with gr.Tabs():
        with gr.TabItem("🔍 Profile"):
            with gr.Row():
                word_input = gr.Textbox(label="Word", placeholder="hund", value="hund")
                lang_input = gr.Dropdown(choices=TARGET_LANGUAGES, value=TARGET_LANGUAGES[0], label="Lang")
            semantic_btn = gr.Button("🔍 Get Profile", variant="primary", size="lg")
            semantic_output = gr.Markdown()
        
        with gr.TabItem("⚡ Query"):
            with gr.Row():
                start_input = gr.Textbox(label="Start", placeholder="hund", value="hund")
                rel_input = gr.Textbox(label="Relation", placeholder="IsA", value="IsA")
                end_input = gr.Textbox(label="End", placeholder="")
            limit_slider = gr.Slider(label="Limit", minimum=1, maximum=200, value=50)
            query_btn = gr.Button("▶️ Run", variant="primary", size="lg")
            status_output = gr.Markdown()
            results_output = gr.DataFrame(wrap=True)
        
        with gr.TabItem("💻 SQL"):
            raw_sql_input = gr.Textbox(label="SQL", value="SELECT * FROM edge WHERE start_id LIKE '/c/de/hund%' LIMIT 10", lines=3)
            raw_btn = gr.Button("▶️ Execute")
            raw_status = gr.Markdown()
            raw_results = gr.DataFrame()
        
        with gr.TabItem("📊 Schema"):
            schema_btn = gr.Button("📊 Load")
            schema_output = gr.Markdown()
    
    gr.Markdown("---\n✅ **Per-table ANALYZE with checkpoints!** Check server logs for detailed progress.")
    
    semantic_btn.click(get_semantic_profile, [word_input, lang_input], semantic_output)
    query_btn.click(run_query, [start_input, rel_input, end_input, limit_slider], [results_output, status_output])
    raw_btn.click(run_raw_query, raw_sql_input, [raw_results, raw_status])
    schema_btn.click(get_schema_info, None, schema_output)

if __name__ == "__main__":
    log_progress("App ready with per-table ANALYZE checkpoints!", "SUCCESS")
    demo.launch(ssr_mode=False)