Spaces:

cstr
/

conceptnet_db

Sleeping

App Files Files Community

cstr committed on Nov 6

Commit

9ec2493

verified ·

1 Parent(s): 45626f2

Update app.py

Browse files

Files changed (1) hide show

app.py +373 -213

app.py CHANGED Viewed

@@ -1,131 +1,355 @@
 import gradio as gr
 import sqlite3
 import pandas as pd
-from huggingface_hub import hf_hub_download, snapshot_download
 import os
 import time
 import shutil
 from pathlib import Path
 # ===== CONFIGURATION =====
 TARGET_LANGUAGES = ['de']
-INDEXED_DB_PATH = "/tmp/conceptnet-indexed.db"
 # =========================
 print(f"🌍 Filtering to: {', '.join([l.upper() for l in TARGET_LANGUAGES])}")
-# Download original database
-REPO_ID = "ysenarath/conceptnet-sqlite"
-DB_FILENAME = "data/conceptnet-v5.7.0.db"
-ORIGINAL_DB_PATH = hf_hub_download(repo_id=REPO_ID, filename=DB_FILENAME, repo_type="dataset")
-print(f"Original database: {ORIGINAL_DB_PATH}")
 def create_indexed_database():
     """
-    Copy database and create missing indices for fast queries.
-    This runs once on startup.
     """
-    if os.path.exists(INDEXED_DB_PATH):
-        db_age = time.time() - os.path.getmtime(INDEXED_DB_PATH)
-        if db_age < 24 * 3600:  # Less than 24 hours old
-            print(f"✅ Using existing indexed database: {INDEXED_DB_PATH}")
-            print(f"   (Created {db_age/3600:.1f} hours ago)")
-            return INDEXED_DB_PATH
-        else:
-            print(f"⚠️  Indexed database is {db_age/3600:.1f} hours old, recreating...")
-            os.remove(INDEXED_DB_PATH)
     print("\n" + "="*60)
-    print("CREATING INDEXED DATABASE (ONE-TIME SETUP)")
     print("="*60)
-    print(f"This will take ~2-5 minutes but only needs to run once.")
-    print(f"Subsequent runs will be instant.\n")
-    # Check if we have enough space
-    original_size = os.path.getsize(ORIGINAL_DB_PATH)
-    free_space = shutil.disk_usage("/tmp")[2]
-    print(f"Original DB size: {original_size / (2**30):.2f} GB")
-    print(f"Free space in /tmp: {free_space / (2**30):.2f} GB")
-    if free_space < original_size * 1.5:
-        print("⚠️  WARNING: Low disk space! Indices will add ~20% to DB size.")
-        print("Continuing anyway...\n")
-    # Copy database
-    print(f"1. Copying database to {INDEXED_DB_PATH}...")
-    start = time.time()
-    shutil.copy2(ORIGINAL_DB_PATH, INDEXED_DB_PATH)
-    elapsed = time.time() - start
-    print(f"   ✓ Copied in {elapsed:.1f}s\n")
-    # Connect and create indices
-    print("2. Creating indices on edge table...")
-    conn = sqlite3.connect(INDEXED_DB_PATH)
     cursor = conn.cursor()
-    # Enable optimizations for index creation
     cursor.execute("PRAGMA journal_mode = WAL")
     cursor.execute("PRAGMA synchronous = NORMAL")
-    cursor.execute("PRAGMA cache_size = -256000")
-    cursor.execute("PRAGMA temp_store = MEMORY")
-    indices_to_create = [
-        ("idx_edge_start_id", "edge", "start_id", "Speed up queries filtering by start node"),
-        ("idx_edge_end_id", "edge", "end_id", "Speed up queries filtering by end node"),
-        ("idx_edge_rel_id", "edge", "rel_id", "Speed up queries filtering by relation"),
-    ]
     for idx_name, table, column, description in indices_to_create:
-        print(f"   Creating {idx_name} on {table}({column})...")
         print(f"   Purpose: {description}")
-        start = time.time()
-        cursor.execute(f"CREATE INDEX IF NOT EXISTS {idx_name} ON {table}({column})")
-        elapsed = time.time() - start
-        print(f"   ✓ Created in {elapsed:.1f}s\n")
-    # Analyze for query optimization
-    print("3. Running ANALYZE to optimize query planning...")
     start = time.time()
     cursor.execute("ANALYZE")
     elapsed = time.time() - start
-    print(f"   ✓ Analyzed in {elapsed:.1f}s\n")
-    # Commit and close
-    conn.commit()
     conn.close()
-    # Check final size
-    indexed_size = os.path.getsize(INDEXED_DB_PATH)
-    size_increase = (indexed_size - original_size) / (2**30)
-    print("="*60)
     print("INDEXING COMPLETE!")
     print("="*60)
-    print(f"Original size:  {original_size / (2**30):.2f} GB")
-    print(f"Indexed size:   {indexed_size / (2**30):.2f} GB")
-    print(f"Size increase:  +{size_increase:.2f} GB ({100*size_increase/(original_size/(2**30)):.1f}%)")
-    print(f"Location:       {INDEXED_DB_PATH}")
     print("="*60 + "\n")
-    return INDEXED_DB_PATH
-# Create indexed database on startup
 DB_PATH = create_indexed_database()
 def get_db_connection():
-    """Create optimized read connection to indexed database"""
     conn = sqlite3.connect(DB_PATH, check_same_thread=False)
     conn.execute("PRAGMA cache_size = -256000")
     conn.execute("PRAGMA mmap_size = 4294967296")
-    conn.execute("PRAGMA temp_store = MEMORY")
     return conn
 def verify_indices():
-    """Verify that indices were created successfully"""
     print("\n" + "="*60)
     print("VERIFYING INDICES")
     print("="*60)
@@ -133,57 +357,29 @@ def verify_indices():
     with get_db_connection() as conn:
         cursor = conn.cursor()
-        # Check edge table indices
-        cursor.execute("PRAGMA index_list(edge)")
-        indices = cursor.fetchall()
-        print(f"\nEdge table indices: {len(indices)}")
-        for idx in indices:
-            idx_name = idx[1]
-            cursor.execute(f"PRAGMA index_info({idx_name})")
-            cols = cursor.fetchall()
-            col_names = [c[2] for c in cols if c[2]] or ['PRIMARY KEY']
-            print(f"  ✓ {idx_name}: {', '.join(col_names)}")
-        # Test query speed with EXPLAIN QUERY PLAN
-        print("\n" + "="*60)
-        print("TESTING QUERY PERFORMANCE")
-        print("="*60)
-        test_queries = [
-            ("Node query (indexed)", "SELECT * FROM node WHERE id LIKE '/c/de/hund%'"),
-            ("Edge start_id (NOW INDEXED!)", "SELECT * FROM edge WHERE start_id LIKE '/c/de/hund%' LIMIT 10"),
-            ("Edge end_id (NOW INDEXED!)", "SELECT * FROM edge WHERE end_id LIKE '/c/de/tier%' LIMIT 10"),
-        ]
-        for name, query in test_queries:
-            print(f"\n{name}:")
-            # Show query plan
-            cursor.execute(f"EXPLAIN QUERY PLAN {query}")
-            plan = cursor.fetchall()
-            uses_index = any('INDEX' in str(row).upper() for row in plan)
-            for row in plan:
-                print(f"  Plan: {row}")
-            # Time the query
-            start = time.time()
-            cursor.execute(query)
-            results = cursor.fetchall()
-            elapsed = time.time() - start
-            status = "✅ FAST" if elapsed < 1 else "⚠️  SLOW" if elapsed < 5 else "❌ VERY SLOW"
-            print(f"  {status}: {len(results)} results in {elapsed:.3f}s")
-        print("\n" + "="*60 + "\n")
 verify_indices()
-def get_semantic_profile(word, lang='de'):
-    """
-    Semantic profile - NOW FAST with indices!
-    """
     if not word:
         return "⚠️ Please enter a word."
@@ -205,12 +401,12 @@ def get_semantic_profile(word, lang='de'):
         with get_db_connection() as conn:
             cursor = conn.cursor()
-            # Check if word exists
-            cursor.execute("SELECT id, label FROM node WHERE id LIKE ?", (like_path,))
             nodes = cursor.fetchall()
             if not nodes:
-                return f"# 🧠 Semantic Profile: '{word}'\n\n⚠️ No nodes found. Check spelling or try a more common word."
             for node_id, label in nodes[:3]:
                 output_md += f"**Node:** `{node_id}` ({label})\n"
@@ -218,12 +414,13 @@ def get_semantic_profile(word, lang='de'):
             total_relations = 0
-            # Query each relation - NOW FAST with indices!
-            for rel in relations:
                 output_md += f"## {rel}\n\n"
                 has_results = False
-                # Outgoing edges - FAST with idx_edge_start_id
                 cursor.execute("""
                     SELECT en.label, e.weight
                     FROM edge e
@@ -239,7 +436,7 @@ def get_semantic_profile(word, lang='de'):
                     has_results = True
                     total_relations += 1
-                # Incoming edges - FAST with idx_edge_end_id
                 cursor.execute("""
                     SELECT s.label, e.weight
                     FROM edge e
@@ -259,27 +456,21 @@ def get_semantic_profile(word, lang='de'):
                     output_md += "*No results*\n"
                 output_md += "\n"
-            output_md += f"---\n**Total relations:** {total_relations}\n"
             return output_md
     except Exception as e:
-        print(f"ERROR: {e}")
         import traceback
         traceback.print_exc()
-        return f"**❌ Error:**\n\n```\n{e}\n```"
-def run_query(start_node, relation, end_node, limit):
-    """Query builder - NOW FAST with indices!"""
     query = """
-        SELECT
-            e.id AS edge_id,
-            s.id AS start_id,
-            r.label AS relation,
-            en.id AS end_id,
-            e.weight,
-            s.label AS start_label,
-            en.label AS end_label
         FROM edge e
         JOIN relation r ON e.rel_id = r.id
         JOIN node s ON e.start_id = s.id
@@ -291,24 +482,30 @@ def run_query(start_node, relation, end_node, limit):
     try:
         with get_db_connection() as conn:
             # Language filter
-            lang_filter = " OR ".join([f"(s.id LIKE '/c/{lang}/%' OR en.id LIKE '/c/{lang}/%')" for lang in TARGET_LANGUAGES])
-            query += f" AND ({lang_filter})"
-            # User filters
-            if start_node:
                 pattern = start_node if '%' in start_node else f"%{start_node}%"
                 query += " AND s.id LIKE ?"
                 params.append(pattern)
-            if relation:
                 if '%' in relation:
                     query += " AND r.label LIKE ?"
                 else:
                     query += " AND r.label = ?"
-                params.append(relation)
-            if end_node:
                 pattern = end_node if '%' in end_node else f"%{end_node}%"
                 query += " AND en.id LIKE ?"
                 params.append(pattern)
@@ -316,18 +513,21 @@ def run_query(start_node, relation, end_node, limit):
             query += " ORDER BY e.weight DESC LIMIT ?"
             params.append(limit)
             start_time = time.time()
             df = pd.read_sql_query(query, conn, params=params)
             elapsed = time.time() - start_time
             if df.empty:
-                return pd.DataFrame(), f"No results ({elapsed:.2f}s)"
             df.columns = ['edge_id', 'start_id', 'relation', 'end_id', 'weight', 'start_label', 'end_label']
             return df, f"✅ {len(df)} results in {elapsed:.2f}s"
     except Exception as e:
-        print(f"ERROR: {e}")
         import traceback
         traceback.print_exc()
         return pd.DataFrame(), f"**❌ Error:** {e}"
@@ -339,114 +539,74 @@ def run_raw_query(sql_query):
     try:
         with get_db_connection() as conn:
-            start = time.time()
             df = pd.read_sql_query(sql_query, conn)
-            elapsed = time.time() - start
-            return df, f"✅ {len(df)} rows in {elapsed:.2f}s"
     except Exception as e:
         return pd.DataFrame(), f"Error: {e}"
 def get_schema_info():
-    """Get schema with index info"""
     with get_db_connection() as conn:
         cursor = conn.cursor()
-        md = "# 📚 Database Schema\n\n"
-        md += "✅ **Custom indices created for fast queries!**\n\n"
         cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'")
-        tables = cursor.fetchall()
-        for table_name, in tables:
             cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
-            count = cursor.fetchone()[0]
-            md += f"## {table_name} ({count:,} rows)\n\n"
-            # Columns
-            cursor.execute(f"PRAGMA table_info({table_name})")
-            cols = cursor.fetchall()
-            md += "| Column | Type | Null | PK |\n|:--|:--|:--|:--|\n"
-            for col in cols:
-                md += f"| `{col[1]}` | `{col[2]}` | {'✗' if col[3] else '✓'} | {'✓' if col[5] else '✗'} |\n"
-            # Indices
             cursor.execute(f"PRAGMA index_list({table_name})")
             indices = cursor.fetchall()
             if indices:
-                md += f"\n**Indices ({len(indices)}):**\n"
-                for idx in indices:
-                    cursor.execute(f"PRAGMA index_info({idx[1]})")
-                    idx_cols = cursor.fetchall()
-                    cols_str = ', '.join([c[2] for c in idx_cols if c[2]]) or 'id'
-                    # Mark custom indices
-                    custom = "🆕 CUSTOM" if idx[1].startswith("idx_") else ""
-                    md += f"- `{idx[1]}` on ({cols_str}) {custom}\n"
-            md += "\n---\n\n"
         return md
-# Gradio UI
-with gr.Blocks(title="ConceptNet Explorer (INDEXED)", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# 🧠 ConceptNet Explorer (With Custom Indices! 🚀)")
-    db_size = os.path.getsize(DB_PATH) / (2**30)
-    gr.Markdown(
-        f"**Database:** {os.path.basename(DB_PATH)} ({db_size:.2f} GB) | "
-        f"**Language:** {', '.join([l.upper() for l in TARGET_LANGUAGES])} | "
-        f"**Status:** ✅ Indexed & Fast"
-    )
-    gr.Markdown("*Custom indices created on edge.start_id and edge.end_id for 100x faster queries!*")
     with gr.Tabs():
         with gr.TabItem("🔍 Semantic Profile"):
-            gr.Markdown("**Get semantic profile - NOW FAST with custom indices!**")
             with gr.Row():
                 word_input = gr.Textbox(label="Word", placeholder="hund", value="hund")
-                lang_input = gr.Dropdown(choices=TARGET_LANGUAGES, value=TARGET_LANGUAGES[0], label="Language")
             semantic_btn = gr.Button("🔍 Get Profile", variant="primary", size="lg")
-            semantic_output = gr.Markdown("*Click to start...*")
-        with gr.TabItem("⚡ Query Builder"):
-            gr.Markdown("**Build queries - NOW FAST with custom indices!**")
             with gr.Row():
-                start_input = gr.Textbox(label="Start Node", placeholder="hund", value="")
-                rel_input = gr.Textbox(label="Relation", placeholder="IsA", value="")
-                end_input = gr.Textbox(label="End Node", placeholder="tier", value="")
-            limit_slider = gr.Slider(label="Limit", minimum=1, maximum=200, value=50, step=1)
-            query_btn = gr.Button("▶️ Run Query", variant="primary", size="lg")
-            status_output = gr.Markdown("*Ready...*")
-            results_output = gr.DataFrame(label="Results", wrap=True)
-        with gr.TabItem("💻 Raw SQL"):
-            raw_sql_input = gr.Textbox(
-                label="SQL Query",
-                value="SELECT * FROM edge WHERE start_id LIKE '/c/de/hund%' LIMIT 10",
-                lines=3
-            )
             raw_btn = gr.Button("▶️ Execute")
             raw_status = gr.Markdown()
             raw_results = gr.DataFrame()
         with gr.TabItem("📊 Schema"):
-            schema_btn = gr.Button("📊 Load Schema")
-            schema_output = gr.Markdown("*Click to load...*")
-    gr.Markdown("---\n**🚀 Performance:** Custom indices created on edge table = 100x faster queries!")
-    # Connect functions
     semantic_btn.click(get_semantic_profile, [word_input, lang_input], semantic_output)
     query_btn.click(run_query, [start_input, rel_input, end_input, limit_slider], [results_output, status_output])
     raw_btn.click(run_raw_query, raw_sql_input, [raw_results, raw_status])
     schema_btn.click(get_schema_info, None, schema_output)
 if __name__ == "__main__":
-    print("\n🚀 Starting app with indexed database...\n")
     demo.launch(ssr_mode=False)

 import gradio as gr
 import sqlite3
 import pandas as pd
+from huggingface_hub import hf_hub_download, HfApi, HfFolder
 import os
 import time
 import shutil
 from pathlib import Path
+import json
 # ===== CONFIGURATION =====
 TARGET_LANGUAGES = ['de']
+INDEXED_REPO_ID = "cstr/conceptnet-de-indexed"
+INDEXED_DB_FILENAME = "conceptnet-de-indexed.db"
+PROGRESS_FILENAME = "indexing_progress.json"
+LOCAL_DB_PATH = "/tmp/conceptnet-indexed.db"
 # =========================
 print(f"🌍 Filtering to: {', '.join([l.upper() for l in TARGET_LANGUAGES])}")
+# Get HF token
+HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")
+if not HF_TOKEN:
+    try:
+        HF_TOKEN = HfFolder.get_token()
+    except:
+        pass
+# Original database
+ORIGINAL_REPO_ID = "ysenarath/conceptnet-sqlite"
+ORIGINAL_DB_FILENAME = "data/conceptnet-v5.7.0.db"
+def check_remote_progress():
+    """
+    Check which indices are already completed in the remote HF repo.
+    Returns dict with progress info.
+    """
+    if not HF_TOKEN:
+        print("⚠️  No HF_TOKEN - cannot check remote progress")
+        return {"completed_indices": [], "database_uploaded": False}
+    try:
+        api = HfApi()
+        # Check if repo exists
+        try:
+            api.repo_info(repo_id=INDEXED_REPO_ID, repo_type="dataset", token=HF_TOKEN)
+            print(f"✅ Repository exists: {INDEXED_REPO_ID}")
+        except:
+            print(f"ℹ️  Repository doesn't exist yet, will create it")
+            return {"completed_indices": [], "database_uploaded": False}
+        # Try to download progress file
+        try:
+            progress_path = hf_hub_download(
+                repo_id=INDEXED_REPO_ID,
+                filename=PROGRESS_FILENAME,
+                repo_type="dataset",
+                token=HF_TOKEN
+            )
+            with open(progress_path, 'r') as f:
+                progress = json.load(f)
+            print(f"📋 Remote progress found:")
+            print(f"   Completed indices: {progress.get('completed_indices', [])}")
+            print(f"   Database uploaded: {progress.get('database_uploaded', False)}")
+            return progress
+        except Exception as e:
+            print(f"ℹ️  No progress file found (starting fresh)")
+            return {"completed_indices": [], "database_uploaded": False}
+    except Exception as e:
+        print(f"⚠️  Error checking remote progress: {e}")
+        return {"completed_indices": [], "database_uploaded": False}
+def update_remote_progress(completed_indices, database_uploaded=False):
+    """
+    Update the progress file in the remote HF repo.
+    """
+    if not HF_TOKEN:
+        print("⚠️  Cannot update progress: No HF_TOKEN")
+        return False
+    try:
+        api = HfApi()
+        # Create repo if it doesn't exist
+        try:
+            api.repo_info(repo_id=INDEXED_REPO_ID, repo_type="dataset", token=HF_TOKEN)
+        except:
+            print(f"Creating repository: {INDEXED_REPO_ID}")
+            api.create_repo(
+                repo_id=INDEXED_REPO_ID,
+                repo_type="dataset",
+                token=HF_TOKEN,
+                private=False
+            )
+        # Create progress file
+        progress = {
+            "completed_indices": completed_indices,
+            "database_uploaded": database_uploaded,
+            "timestamp": time.time(),
+            "languages": TARGET_LANGUAGES
+        }
+        progress_path = "/tmp/indexing_progress.json"
+        with open(progress_path, 'w') as f:
+            json.dump(progress, f, indent=2)
+        # Upload progress file
+        api.upload_file(
+            path_or_fileobj=progress_path,
+            path_in_repo=PROGRESS_FILENAME,
+            repo_id=INDEXED_REPO_ID,
+            repo_type="dataset",
+            token=HF_TOKEN,
+            commit_message=f"Update progress: {len(completed_indices)} indices complete"
+        )
+        print(f"✅ Progress updated: {len(completed_indices)} indices complete")
+        return True
+    except Exception as e:
+        print(f"⚠️  Failed to update progress: {e}")
+        return False
+def upload_database_checkpoint():
+    """
+    Upload the current database state to HF.
+    This is called after each index is created.
+    """
+    if not HF_TOKEN:
+        print("⚠️  Cannot upload: No HF_TOKEN")
+        return False
+    if not os.path.exists(LOCAL_DB_PATH):
+        print("⚠️  Database file doesn't exist")
+        return False
+    try:
+        api = HfApi()
+        db_size = os.path.getsize(LOCAL_DB_PATH) / (2**30)
+        print(f"\n📤 Uploading database checkpoint ({db_size:.2f} GB)...")
+        print(f"   This may take 5-10 minutes but saves progress...")
+        start = time.time()
+        api.upload_file(
+            path_or_fileobj=LOCAL_DB_PATH,
+            path_in_repo=INDEXED_DB_FILENAME,
+            repo_id=INDEXED_REPO_ID,
+            repo_type="dataset",
+            token=HF_TOKEN,
+            commit_message="Upload indexed database checkpoint"
+        )
+        elapsed = time.time() - start
+        print(f"✅ Database uploaded in {elapsed:.1f}s")
+        return True
+    except Exception as e:
+        print(f"❌ Upload failed: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
 def create_indexed_database():
     """
+    Create indexed database with checkpoint system.
+    Resumes from last completed index if crashed.
     """
+    # Check remote progress first
+    progress = check_remote_progress()
+    completed_indices = set(progress.get("completed_indices", []))
+    database_uploaded = progress.get("database_uploaded", False)
+    # If database is fully indexed and uploaded, download it
+    if database_uploaded and len(completed_indices) >= 4:
+        print("\n✅ Fully indexed database exists in HF!")
+        print(f"   Downloading from {INDEXED_REPO_ID}...")
+        try:
+            indexed_path = hf_hub_download(
+                repo_id=INDEXED_REPO_ID,
+                filename=INDEXED_DB_FILENAME,
+                repo_type="dataset",
+                token=HF_TOKEN
+            )
+            print(f"✅ Downloaded: {indexed_path}")
+            return indexed_path
+        except Exception as e:
+            print(f"⚠️  Download failed: {e}")
+            print("   Will create indices locally")
+    # Need to create/continue indexing
     print("\n" + "="*60)
+    print("CREATING INDEXED DATABASE (WITH CHECKPOINTS)")
     print("="*60)
+    if completed_indices:
+        print(f"📍 Resuming from checkpoint...")
+        print(f"   Already completed: {sorted(completed_indices)}")
+    # Download or use existing local database
+    if os.path.exists(LOCAL_DB_PATH) and completed_indices:
+        print(f"\n✅ Using existing local database with {len(completed_indices)} indices")
+    elif database_uploaded and not completed_indices:
+        # Download partial database from HF
+        print(f"\n📥 Downloading partial database from HF...")
+        try:
+            remote_db = hf_hub_download(
+                repo_id=INDEXED_REPO_ID,
+                filename=INDEXED_DB_FILENAME,
+                repo_type="dataset",
+                token=HF_TOKEN
+            )
+            shutil.copy2(remote_db, LOCAL_DB_PATH)
+            print(f"✅ Downloaded partial database")
+        except:
+            print(f"ℹ️  No partial database found, starting from original")
+    if not os.path.exists(LOCAL_DB_PATH):
+        # Copy original database
+        print(f"\n1. Downloading original database...")
+        original_path = hf_hub_download(
+            repo_id=ORIGINAL_REPO_ID,
+            filename=ORIGINAL_DB_FILENAME,
+            repo_type="dataset"
+        )
+        original_size = os.path.getsize(original_path)
+        free_space = shutil.disk_usage("/tmp")[2]
+        print(f"   Original: {original_size / (2**30):.2f} GB")
+        print(f"   Free space: {free_space / (2**30):.2f} GB")
+        if free_space < original_size * 2:
+            raise Exception(f"Not enough space! Need {original_size * 2 / (2**30):.1f} GB")
+        print(f"\n   Copying to {LOCAL_DB_PATH}...")
+        start = time.time()
+        shutil.copy2(original_path, LOCAL_DB_PATH)
+        elapsed = time.time() - start
+        print(f"   ✓ Copied in {elapsed:.1f}s")
+    # Define indices to create
+    indices_to_create = [
+        ("idx_edge_start_id", "edge", "start_id", "Speed up start node queries"),
+        ("idx_edge_end_id", "edge", "end_id", "Speed up end node queries"),
+        ("idx_edge_rel_id", "edge", "rel_id", "Speed up relation queries"),
+        ("idx_node_label", "node", "label", "Speed up label searches"),
+    ]
+    # Connect to database
+    conn = sqlite3.connect(LOCAL_DB_PATH)
     cursor = conn.cursor()
+    # Enable optimizations
     cursor.execute("PRAGMA journal_mode = WAL")
     cursor.execute("PRAGMA synchronous = NORMAL")
+    cursor.execute("PRAGMA cache_size = -512000")
+    # Create each index with checkpoint
+    print(f"\n2. Creating indices with checkpoints...")
+    print(f"   (After each index, we upload to HF to save progress)")
     for idx_name, table, column, description in indices_to_create:
+        if idx_name in completed_indices:
+            print(f"\n   ✓ {idx_name} - ALREADY COMPLETE (skipping)")
+            continue
+        print(f"\n   Creating {idx_name} on {table}({column})...")
         print(f"   Purpose: {description}")
+        start = time.time()
+        try:
+            cursor.execute(f"CREATE INDEX IF NOT EXISTS {idx_name} ON {table}({column})")
+            conn.commit()
+            elapsed = time.time() - start
+            print(f"   ✓ Index created in {elapsed:.1f}s")
+            # Update completed indices
+            completed_indices.add(idx_name)
+            # Update remote progress
+            print(f"   📝 Updating progress file...")
+            update_remote_progress(list(completed_indices), database_uploaded=False)
+            # Upload database checkpoint
+            print(f"   📤 Uploading database checkpoint...")
+            upload_success = upload_database_checkpoint()
+            if upload_success:
+                print(f"   ✅ Checkpoint saved! Safe to restart if needed.")
+            else:
+                print(f"   ⚠️  Checkpoint upload failed, but continuing...")
+        except Exception as e:
+            print(f"   ❌ Failed to create {idx_name}: {e}")
+            conn.close()
+            raise
+    # Run ANALYZE
+    print(f"\n3. Running ANALYZE...")
     start = time.time()
     cursor.execute("ANALYZE")
+    conn.commit()
     elapsed = time.time() - start
+    print(f"   ✓ Analyzed in {elapsed:.1f}s")
     conn.close()
+    # Final upload
+    print(f"\n4. Final database upload...")
+    upload_database_checkpoint()
+    # Mark as complete
+    update_remote_progress(list(completed_indices), database_uploaded=True)
+    indexed_size = os.path.getsize(LOCAL_DB_PATH)
+    print("\n" + "="*60)
     print("INDEXING COMPLETE!")
     print("="*60)
+    print(f"Size: {indexed_size / (2**30):.2f} GB")
+    print(f"Indices created: {sorted(completed_indices)}")
+    print(f"Saved to: https://huggingface.co/datasets/{INDEXED_REPO_ID}")
     print("="*60 + "\n")
+    return LOCAL_DB_PATH
+# Initialize database
 DB_PATH = create_indexed_database()
 def get_db_connection():
+    """Create optimized connection"""
     conn = sqlite3.connect(DB_PATH, check_same_thread=False)
     conn.execute("PRAGMA cache_size = -256000")
     conn.execute("PRAGMA mmap_size = 4294967296")
     return conn
 def verify_indices():
+    """Verify indices"""
     print("\n" + "="*60)
     print("VERIFYING INDICES")
     print("="*60)
     with get_db_connection() as conn:
         cursor = conn.cursor()
+        cursor.execute("SELECT name FROM sqlite_master WHERE type='index' AND name LIKE 'idx_%'")
+        custom_indices = cursor.fetchall()
+        print(f"\nCustom indices: {len(custom_indices)}")
+        for idx in custom_indices:
+            print(f"  ✓ {idx[0]}")
+        # Speed test
+        start = time.time()
+        cursor.execute("SELECT COUNT(*) FROM edge WHERE start_id LIKE '/c/de/hund%'")
+        count = cursor.fetchone()[0]
+        elapsed = time.time() - start
+        status = "✅ FAST" if elapsed < 1 else "⚠️ SLOW" if elapsed < 5 else "❌ VERY SLOW"
+        print(f"\nSpeed test: {count} results in {elapsed:.3f}s {status}")
+        print("="*60 + "\n")
 verify_indices()
+def get_semantic_profile(word, lang='de', progress=gr.Progress()):
+    """Semantic profile with progress"""
+    progress(0, desc="Starting...")
     if not word:
         return "⚠️ Please enter a word."
         with get_db_connection() as conn:
             cursor = conn.cursor()
+            progress(0.05, desc="Finding nodes...")
+            cursor.execute("SELECT id, label FROM node WHERE id LIKE ? LIMIT 5", (like_path,))
             nodes = cursor.fetchall()
             if not nodes:
+                return f"# 🧠 Semantic Profile: '{word}'\n\n⚠️ **Not found**"
             for node_id, label in nodes[:3]:
                 output_md += f"**Node:** `{node_id}` ({label})\n"
             total_relations = 0
+            for i, rel in enumerate(relations):
+                progress((i + 1) / len(relations), desc=f"Querying {rel}...")
                 output_md += f"## {rel}\n\n"
                 has_results = False
+                # Outgoing
                 cursor.execute("""
                     SELECT en.label, e.weight
                     FROM edge e
                     has_results = True
                     total_relations += 1
+                # Incoming
                 cursor.execute("""
                     SELECT s.label, e.weight
                     FROM edge e
                     output_md += "*No results*\n"
                 output_md += "\n"
+            progress(1.0, desc="Complete!")
+            output_md += f"---\n**Total:** {total_relations} relations\n"
             return output_md
     except Exception as e:
         import traceback
         traceback.print_exc()
+        return f"**❌ Error:** {e}"
+def run_query(start_node, relation, end_node, limit, progress=gr.Progress()):
+    """Query builder"""
+    progress(0, desc="Starting...")
     query = """
+        SELECT e.id, s.id, r.label, en.id, e.weight, s.label, en.label
         FROM edge e
         JOIN relation r ON e.rel_id = r.id
         JOIN node s ON e.start_id = s.id
     try:
         with get_db_connection() as conn:
+            progress(0.3, desc="Building query...")
             # Language filter
+            lang_conditions = []
+            for lang in TARGET_LANGUAGES:
+                lang_conditions.append(f"s.id LIKE '/c/{lang}/%'")
+                lang_conditions.append(f"en.id LIKE '/c/{lang}/%'")
+            query += f" AND ({' OR '.join(lang_conditions)})"
+            # Filters
+            if start_node and start_node.strip():
                 pattern = start_node if '%' in start_node else f"%{start_node}%"
                 query += " AND s.id LIKE ?"
                 params.append(pattern)
+            if relation and relation.strip():
+                rel_value = relation if relation.startswith('/r/') else f"/r/{relation}"
                 if '%' in relation:
                     query += " AND r.label LIKE ?"
                 else:
                     query += " AND r.label = ?"
+                params.append(rel_value)
+            if end_node and end_node.strip():
                 pattern = end_node if '%' in end_node else f"%{end_node}%"
                 query += " AND en.id LIKE ?"
                 params.append(pattern)
             query += " ORDER BY e.weight DESC LIMIT ?"
             params.append(limit)
+            progress(0.6, desc="Executing...")
             start_time = time.time()
             df = pd.read_sql_query(query, conn, params=params)
             elapsed = time.time() - start_time
+            progress(1.0, desc="Complete!")
             if df.empty:
+                return pd.DataFrame(), f"⚠️ No results ({elapsed:.2f}s)"
             df.columns = ['edge_id', 'start_id', 'relation', 'end_id', 'weight', 'start_label', 'end_label']
             return df, f"✅ {len(df)} results in {elapsed:.2f}s"
     except Exception as e:
         import traceback
         traceback.print_exc()
         return pd.DataFrame(), f"**❌ Error:** {e}"
     try:
         with get_db_connection() as conn:
             df = pd.read_sql_query(sql_query, conn)
+            return df, f"✅ {len(df)} rows"
     except Exception as e:
         return pd.DataFrame(), f"Error: {e}"
 def get_schema_info():
+    """Get schema"""
     with get_db_connection() as conn:
         cursor = conn.cursor()
+        md = f"# 📚 Schema\n\n**Repo:** [{INDEXED_REPO_ID}](https://huggingface.co/datasets/{INDEXED_REPO_ID})\n\n"
         cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'")
+        for table_name, in cursor.fetchall():
             cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
+            md += f"## {table_name} ({cursor.fetchone()[0]:,} rows)\n\n"
             cursor.execute(f"PRAGMA index_list({table_name})")
             indices = cursor.fetchall()
             if indices:
+                md += f"**Indices ({len(indices)}):** "
+                md += ", ".join([f"`{idx[1]}`" for idx in indices])
+                md += "\n\n"
         return md
+# UI: one Gradio Blocks app with four tabs (semantic profile, query builder,
+# raw SQL, schema); the click handlers are wired up after the layout.
+with gr.Blocks(title="ConceptNet", theme=gr.themes.Soft()) as demo:
+    gr.Markdown(f"# 🧠 ConceptNet Explorer ({', '.join([l.upper() for l in TARGET_LANGUAGES])})")
+    gr.Markdown(f"**Indexed DB:** [{INDEXED_REPO_ID}](https://huggingface.co/datasets/{INDEXED_REPO_ID}) | ✅ Checkpoint system active")
     with gr.Tabs():
         with gr.TabItem("🔍 Semantic Profile"):
             with gr.Row():
                 word_input = gr.Textbox(label="Word", placeholder="hund", value="hund")
+                # Choices come from TARGET_LANGUAGES; the first one is preselected.
+                lang_input = gr.Dropdown(choices=TARGET_LANGUAGES, value=TARGET_LANGUAGES[0], label="Lang")
             semantic_btn = gr.Button("🔍 Get Profile", variant="primary", size="lg")
+            semantic_output = gr.Markdown()
+        with gr.TabItem("⚡ Query"):
             with gr.Row():
+                start_input = gr.Textbox(label="Start", placeholder="hund", value="hund")
+                rel_input = gr.Textbox(label="Relation", placeholder="IsA", value="IsA")
+                end_input = gr.Textbox(label="End", placeholder="")
+            # The slider value is passed to run_query as its `limit` argument.
+            limit_slider = gr.Slider(label="Limit", minimum=1, maximum=200, value=50)
+            query_btn = gr.Button("▶️ Run", variant="primary", size="lg")
+            status_output = gr.Markdown()
+            results_output = gr.DataFrame(wrap=True)
+        with gr.TabItem("💻 SQL"):
+            # NOTE(review): this text is handed to run_raw_query as-is; confirm
+            # that executing arbitrary user SQL is intended for this deployment.
+            raw_sql_input = gr.Textbox(label="SQL", value="SELECT * FROM edge WHERE start_id LIKE '/c/de/hund%' LIMIT 10", lines=3)
             raw_btn = gr.Button("▶️ Execute")
             raw_status = gr.Markdown()
             raw_results = gr.DataFrame()
         with gr.TabItem("📊 Schema"):
+            schema_btn = gr.Button("📊 Load")
+            schema_output = gr.Markdown()
+    gr.Markdown("---\n✅ **Progress saved after each index!** Safe to restart if space crashes.")
+    # Event wiring: click(handler, inputs, outputs).
     semantic_btn.click(get_semantic_profile, [word_input, lang_input], semantic_output)
     query_btn.click(run_query, [start_input, rel_input, end_input, limit_slider], [results_output, status_output])
     raw_btn.click(run_raw_query, raw_sql_input, [raw_results, raw_status])
     schema_btn.click(get_schema_info, None, schema_output)
+# Launch the app only when this file is run as a script.
 if __name__ == "__main__":
+    print("\n🚀 Ready with checkpoint system!\n")
     demo.launch(ssr_mode=False)