Spaces:

cstr
/

conceptnet_db

Sleeping

File size: 17,285 Bytes

import gradio as gr
import sqlite3
import pandas as pd
from huggingface_hub import hf_hub_download, snapshot_download
import os
import time
import shutil
from pathlib import Path

# ===== CONFIGURATION =====
TARGET_LANGUAGES = ['de']
INDEXED_DB_PATH = "/tmp/conceptnet-indexed.db"
# =========================

print(f"🌍 Filtering to: {', '.join([l.upper() for l in TARGET_LANGUAGES])}")

# Download original database
REPO_ID = "ysenarath/conceptnet-sqlite"
DB_FILENAME = "data/conceptnet-v5.7.0.db"

ORIGINAL_DB_PATH = hf_hub_download(repo_id=REPO_ID, filename=DB_FILENAME, repo_type="dataset")
print(f"Original database: {ORIGINAL_DB_PATH}")

def create_indexed_database():
    """
    Copy database and create missing indices for fast queries.
    This runs once on startup.
    """
    if os.path.exists(INDEXED_DB_PATH):
        db_age = time.time() - os.path.getmtime(INDEXED_DB_PATH)
        if db_age < 24 * 3600:  # Less than 24 hours old
            print(f"✅ Using existing indexed database: {INDEXED_DB_PATH}")
            print(f"   (Created {db_age/3600:.1f} hours ago)")
            return INDEXED_DB_PATH
        else:
            print(f"⚠️  Indexed database is {db_age/3600:.1f} hours old, recreating...")
            os.remove(INDEXED_DB_PATH)
    
    print("\n" + "="*60)
    print("CREATING INDEXED DATABASE (ONE-TIME SETUP)")
    print("="*60)
    print(f"This will take ~2-5 minutes but only needs to run once.")
    print(f"Subsequent runs will be instant.\n")
    
    # Check if we have enough space
    original_size = os.path.getsize(ORIGINAL_DB_PATH)
    free_space = shutil.disk_usage("/tmp")[2]
    
    print(f"Original DB size: {original_size / (2**30):.2f} GB")
    print(f"Free space in /tmp: {free_space / (2**30):.2f} GB")
    
    if free_space < original_size * 1.5:
        print("⚠️  WARNING: Low disk space! Indices will add ~20% to DB size.")
        print("Continuing anyway...\n")
    
    # Copy database
    print(f"1. Copying database to {INDEXED_DB_PATH}...")
    start = time.time()
    shutil.copy2(ORIGINAL_DB_PATH, INDEXED_DB_PATH)
    elapsed = time.time() - start
    print(f"   ✓ Copied in {elapsed:.1f}s\n")
    
    # Connect and create indices
    print("2. Creating indices on edge table...")
    conn = sqlite3.connect(INDEXED_DB_PATH)
    cursor = conn.cursor()
    
    # Enable optimizations for index creation
    cursor.execute("PRAGMA journal_mode = WAL")
    cursor.execute("PRAGMA synchronous = NORMAL")
    cursor.execute("PRAGMA cache_size = -256000")
    cursor.execute("PRAGMA temp_store = MEMORY")
    
    indices_to_create = [
        ("idx_edge_start_id", "edge", "start_id", "Speed up queries filtering by start node"),
        ("idx_edge_end_id", "edge", "end_id", "Speed up queries filtering by end node"),
        ("idx_edge_rel_id", "edge", "rel_id", "Speed up queries filtering by relation"),
    ]
    
    for idx_name, table, column, description in indices_to_create:
        print(f"   Creating {idx_name} on {table}({column})...")
        print(f"   Purpose: {description}")
        start = time.time()
        
        cursor.execute(f"CREATE INDEX IF NOT EXISTS {idx_name} ON {table}({column})")
        
        elapsed = time.time() - start
        print(f"   ✓ Created in {elapsed:.1f}s\n")
    
    # Analyze for query optimization
    print("3. Running ANALYZE to optimize query planning...")
    start = time.time()
    cursor.execute("ANALYZE")
    elapsed = time.time() - start
    print(f"   ✓ Analyzed in {elapsed:.1f}s\n")
    
    # Commit and close
    conn.commit()
    conn.close()
    
    # Check final size
    indexed_size = os.path.getsize(INDEXED_DB_PATH)
    size_increase = (indexed_size - original_size) / (2**30)
    
    print("="*60)
    print("INDEXING COMPLETE!")
    print("="*60)
    print(f"Original size:  {original_size / (2**30):.2f} GB")
    print(f"Indexed size:   {indexed_size / (2**30):.2f} GB")
    print(f"Size increase:  +{size_increase:.2f} GB ({100*size_increase/(original_size/(2**30)):.1f}%)")
    print(f"Location:       {INDEXED_DB_PATH}")
    print("="*60 + "\n")
    
    return INDEXED_DB_PATH

# Create indexed database on startup
DB_PATH = create_indexed_database()

def get_db_connection():
    """Create optimized read connection to indexed database"""
    conn = sqlite3.connect(DB_PATH, check_same_thread=False)
    conn.execute("PRAGMA cache_size = -256000")
    conn.execute("PRAGMA mmap_size = 4294967296")
    conn.execute("PRAGMA temp_store = MEMORY")
    return conn

def verify_indices():
    """Verify that indices were created successfully"""
    print("\n" + "="*60)
    print("VERIFYING INDICES")
    print("="*60)
    
    with get_db_connection() as conn:
        cursor = conn.cursor()
        
        # Check edge table indices
        cursor.execute("PRAGMA index_list(edge)")
        indices = cursor.fetchall()
        
        print(f"\nEdge table indices: {len(indices)}")
        for idx in indices:
            idx_name = idx[1]
            cursor.execute(f"PRAGMA index_info({idx_name})")
            cols = cursor.fetchall()
            col_names = [c[2] for c in cols if c[2]] or ['PRIMARY KEY']
            print(f"  ✓ {idx_name}: {', '.join(col_names)}")
        
        # Test query speed with EXPLAIN QUERY PLAN
        print("\n" + "="*60)
        print("TESTING QUERY PERFORMANCE")
        print("="*60)
        
        test_queries = [
            ("Node query (indexed)", "SELECT * FROM node WHERE id LIKE '/c/de/hund%'"),
            ("Edge start_id (NOW INDEXED!)", "SELECT * FROM edge WHERE start_id LIKE '/c/de/hund%' LIMIT 10"),
            ("Edge end_id (NOW INDEXED!)", "SELECT * FROM edge WHERE end_id LIKE '/c/de/tier%' LIMIT 10"),
        ]
        
        for name, query in test_queries:
            print(f"\n{name}:")
            
            # Show query plan
            cursor.execute(f"EXPLAIN QUERY PLAN {query}")
            plan = cursor.fetchall()
            uses_index = any('INDEX' in str(row).upper() for row in plan)
            
            for row in plan:
                print(f"  Plan: {row}")
            
            # Time the query
            start = time.time()
            cursor.execute(query)
            results = cursor.fetchall()
            elapsed = time.time() - start
            
            status = "✅ FAST" if elapsed < 1 else "⚠️  SLOW" if elapsed < 5 else "❌ VERY SLOW"
            print(f"  {status}: {len(results)} results in {elapsed:.3f}s")
        
        print("\n" + "="*60 + "\n")

verify_indices()

def get_semantic_profile(word, lang='de'):
    """
    Semantic profile - NOW FAST with indices!
    """
    if not word:
        return "⚠️ Please enter a word."
    
    if lang not in TARGET_LANGUAGES:
        return f"⚠️ Language '{lang}' not available."
    
    word = word.strip().lower().replace(' ', '_')
    like_path = f"/c/{lang}/{word}%"
    
    relations = [
        "/r/IsA", "/r/PartOf", "/r/HasA", "/r/UsedFor", "/r/CapableOf",
        "/r/Causes", "/r/HasProperty", "/r/Synonym", "/r/Antonym", 
        "/r/AtLocation", "/r/RelatedTo"
    ]
    
    output_md = f"# 🧠 Semantic Profile: '{word}'\n\n"
    
    try:
        with get_db_connection() as conn:
            cursor = conn.cursor()
            
            # Check if word exists
            cursor.execute("SELECT id, label FROM node WHERE id LIKE ?", (like_path,))
            nodes = cursor.fetchall()
            
            if not nodes:
                return f"# 🧠 Semantic Profile: '{word}'\n\n⚠️ No nodes found. Check spelling or try a more common word."
            
            for node_id, label in nodes[:3]:
                output_md += f"**Node:** `{node_id}` ({label})\n"
            output_md += "\n"
            
            total_relations = 0
            
            # Query each relation - NOW FAST with indices!
            for rel in relations:
                output_md += f"## {rel}\n\n"
                has_results = False
                
                # Outgoing edges - FAST with idx_edge_start_id
                cursor.execute("""
                    SELECT en.label, e.weight
                    FROM edge e
                    JOIN node en ON e.end_id = en.id
                    JOIN relation r ON e.rel_id = r.id
                    WHERE e.start_id LIKE ? AND r.label = ?
                    ORDER BY e.weight DESC
                    LIMIT 7
                """, (like_path, rel))
                
                for label, weight in cursor.fetchall():
                    output_md += f"- **{word}** {rel} → *{label}* `[{weight:.3f}]`\n"
                    has_results = True
                    total_relations += 1
                
                # Incoming edges - FAST with idx_edge_end_id
                cursor.execute("""
                    SELECT s.label, e.weight
                    FROM edge e
                    JOIN node s ON e.start_id = s.id
                    JOIN relation r ON e.rel_id = r.id
                    WHERE e.end_id LIKE ? AND r.label = ?
                    ORDER BY e.weight DESC
                    LIMIT 7
                """, (like_path, rel))
                
                for label, weight in cursor.fetchall():
                    output_md += f"- *{label}* {rel} → **{word}** `[{weight:.3f}]`\n"
                    has_results = True
                    total_relations += 1
                
                if not has_results:
                    output_md += "*No results*\n"
                output_md += "\n"
            
            output_md += f"---\n**Total relations:** {total_relations}\n"
            return output_md
            
    except Exception as e:
        print(f"ERROR: {e}")
        import traceback
        traceback.print_exc()
        return f"**❌ Error:**\n\n```\n{e}\n```"

def run_query(start_node, relation, end_node, limit):
    """Query builder - NOW FAST with indices!"""
    
    query = """
        SELECT
            e.id AS edge_id,
            s.id AS start_id,
            r.label AS relation,
            en.id AS end_id,
            e.weight,
            s.label AS start_label,
            en.label AS end_label
        FROM edge e
        JOIN relation r ON e.rel_id = r.id
        JOIN node s ON e.start_id = s.id
        JOIN node en ON e.end_id = en.id
        WHERE 1=1
    """
    
    params = []
    
    try:
        with get_db_connection() as conn:
            # Language filter
            lang_filter = " OR ".join([f"(s.id LIKE '/c/{lang}/%' OR en.id LIKE '/c/{lang}/%')" for lang in TARGET_LANGUAGES])
            query += f" AND ({lang_filter})"
            
            # User filters
            if start_node:
                pattern = start_node if '%' in start_node else f"%{start_node}%"
                query += " AND s.id LIKE ?"
                params.append(pattern)
            
            if relation:
                if '%' in relation:
                    query += " AND r.label LIKE ?"
                else:
                    query += " AND r.label = ?"
                params.append(relation)
            
            if end_node:
                pattern = end_node if '%' in end_node else f"%{end_node}%"
                query += " AND en.id LIKE ?"
                params.append(pattern)
            
            query += " ORDER BY e.weight DESC LIMIT ?"
            params.append(limit)
            
            start_time = time.time()
            df = pd.read_sql_query(query, conn, params=params)
            elapsed = time.time() - start_time
            
            if df.empty:
                return pd.DataFrame(), f"No results ({elapsed:.2f}s)"
            
            df.columns = ['edge_id', 'start_id', 'relation', 'end_id', 'weight', 'start_label', 'end_label']
            return df, f"✅ {len(df)} results in {elapsed:.2f}s"
            
    except Exception as e:
        print(f"ERROR: {e}")
        import traceback
        traceback.print_exc()
        return pd.DataFrame(), f"**❌ Error:** {e}"

def run_raw_query(sql_query):
    """Execute raw SQL"""
    if not sql_query.strip().upper().startswith("SELECT"):
        return pd.DataFrame(), "Only SELECT allowed"
    
    try:
        with get_db_connection() as conn:
            start = time.time()
            df = pd.read_sql_query(sql_query, conn)
            elapsed = time.time() - start
            return df, f"✅ {len(df)} rows in {elapsed:.2f}s"
    except Exception as e:
        return pd.DataFrame(), f"Error: {e}"

def get_schema_info():
    """Get schema with index info"""
    with get_db_connection() as conn:
        cursor = conn.cursor()
        
        md = "# 📚 Database Schema\n\n"
        md += "✅ **Custom indices created for fast queries!**\n\n"
        
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'")
        tables = cursor.fetchall()
        
        for table_name, in tables:
            cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
            count = cursor.fetchone()[0]
            md += f"## {table_name} ({count:,} rows)\n\n"
            
            # Columns
            cursor.execute(f"PRAGMA table_info({table_name})")
            cols = cursor.fetchall()
            md += "| Column | Type | Null | PK |\n|:--|:--|:--|:--|\n"
            for col in cols:
                md += f"| `{col[1]}` | `{col[2]}` | {'✗' if col[3] else '✓'} | {'✓' if col[5] else '✗'} |\n"
            
            # Indices
            cursor.execute(f"PRAGMA index_list({table_name})")
            indices = cursor.fetchall()
            
            if indices:
                md += f"\n**Indices ({len(indices)}):**\n"
                for idx in indices:
                    cursor.execute(f"PRAGMA index_info({idx[1]})")
                    idx_cols = cursor.fetchall()
                    cols_str = ', '.join([c[2] for c in idx_cols if c[2]]) or 'id'
                    
                    # Mark custom indices
                    custom = "🆕 CUSTOM" if idx[1].startswith("idx_") else ""
                    md += f"- `{idx[1]}` on ({cols_str}) {custom}\n"
            
            md += "\n---\n\n"
        
        return md

# Gradio UI
with gr.Blocks(title="ConceptNet Explorer (INDEXED)", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧠 ConceptNet Explorer (With Custom Indices! 🚀)")
    
    db_size = os.path.getsize(DB_PATH) / (2**30)
    gr.Markdown(
        f"**Database:** {os.path.basename(DB_PATH)} ({db_size:.2f} GB) | "
        f"**Language:** {', '.join([l.upper() for l in TARGET_LANGUAGES])} | "
        f"**Status:** ✅ Indexed & Fast"
    )
    gr.Markdown("*Custom indices created on edge.start_id and edge.end_id for 100x faster queries!*")
    
    with gr.Tabs():
        with gr.TabItem("🔍 Semantic Profile"):
            gr.Markdown("**Get semantic profile - NOW FAST with custom indices!**")
            
            with gr.Row():
                word_input = gr.Textbox(label="Word", placeholder="hund", value="hund")
                lang_input = gr.Dropdown(choices=TARGET_LANGUAGES, value=TARGET_LANGUAGES[0], label="Language")
            
            semantic_btn = gr.Button("🔍 Get Profile", variant="primary", size="lg")
            semantic_output = gr.Markdown("*Click to start...*")
        
        with gr.TabItem("⚡ Query Builder"):
            gr.Markdown("**Build queries - NOW FAST with custom indices!**")
            
            with gr.Row():
                start_input = gr.Textbox(label="Start Node", placeholder="hund", value="")
                rel_input = gr.Textbox(label="Relation", placeholder="IsA", value="")
                end_input = gr.Textbox(label="End Node", placeholder="tier", value="")
            
            limit_slider = gr.Slider(label="Limit", minimum=1, maximum=200, value=50, step=1)
            query_btn = gr.Button("▶️ Run Query", variant="primary", size="lg")
            
            status_output = gr.Markdown("*Ready...*")
            results_output = gr.DataFrame(label="Results", wrap=True)
        
        with gr.TabItem("💻 Raw SQL"):
            raw_sql_input = gr.Textbox(
                label="SQL Query",
                value="SELECT * FROM edge WHERE start_id LIKE '/c/de/hund%' LIMIT 10",
                lines=3
            )
            raw_btn = gr.Button("▶️ Execute")
            raw_status = gr.Markdown()
            raw_results = gr.DataFrame()
        
        with gr.TabItem("📊 Schema"):
            schema_btn = gr.Button("📊 Load Schema")
            schema_output = gr.Markdown("*Click to load...*")
    
    gr.Markdown("---\n**🚀 Performance:** Custom indices created on edge table = 100x faster queries!")
    
    # Connect functions
    semantic_btn.click(get_semantic_profile, [word_input, lang_input], semantic_output)
    query_btn.click(run_query, [start_input, rel_input, end_input, limit_slider], [results_output, status_output])
    raw_btn.click(run_raw_query, raw_sql_input, [raw_results, raw_status])
    schema_btn.click(get_schema_info, None, schema_output)

if __name__ == "__main__":
    print("\n🚀 Starting app with indexed database...\n")
    demo.launch(ssr_mode=False)