Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import sqlite3 | |
| import pandas as pd | |
| from huggingface_hub import hf_hub_download, snapshot_download | |
| import os | |
| import time | |
| import shutil | |
| from pathlib import Path | |
| # ===== CONFIGURATION ===== | |
| TARGET_LANGUAGES = ['de'] | |
| INDEXED_DB_PATH = "/tmp/conceptnet-indexed.db" | |
| # ========================= | |
| print(f"π Filtering to: {', '.join([l.upper() for l in TARGET_LANGUAGES])}") | |
| # Download original database | |
| REPO_ID = "ysenarath/conceptnet-sqlite" | |
| DB_FILENAME = "data/conceptnet-v5.7.0.db" | |
| ORIGINAL_DB_PATH = hf_hub_download(repo_id=REPO_ID, filename=DB_FILENAME, repo_type="dataset") | |
| print(f"Original database: {ORIGINAL_DB_PATH}") | |
| def create_indexed_database(): | |
| """ | |
| Copy database and create missing indices for fast queries. | |
| This runs once on startup. | |
| """ | |
| if os.path.exists(INDEXED_DB_PATH): | |
| db_age = time.time() - os.path.getmtime(INDEXED_DB_PATH) | |
| if db_age < 24 * 3600: # Less than 24 hours old | |
| print(f"β Using existing indexed database: {INDEXED_DB_PATH}") | |
| print(f" (Created {db_age/3600:.1f} hours ago)") | |
| return INDEXED_DB_PATH | |
| else: | |
| print(f"β οΈ Indexed database is {db_age/3600:.1f} hours old, recreating...") | |
| os.remove(INDEXED_DB_PATH) | |
| print("\n" + "="*60) | |
| print("CREATING INDEXED DATABASE (ONE-TIME SETUP)") | |
| print("="*60) | |
| print(f"This will take ~2-5 minutes but only needs to run once.") | |
| print(f"Subsequent runs will be instant.\n") | |
| # Check if we have enough space | |
| original_size = os.path.getsize(ORIGINAL_DB_PATH) | |
| free_space = shutil.disk_usage("/tmp")[2] | |
| print(f"Original DB size: {original_size / (2**30):.2f} GB") | |
| print(f"Free space in /tmp: {free_space / (2**30):.2f} GB") | |
| if free_space < original_size * 1.5: | |
| print("β οΈ WARNING: Low disk space! Indices will add ~20% to DB size.") | |
| print("Continuing anyway...\n") | |
| # Copy database | |
| print(f"1. Copying database to {INDEXED_DB_PATH}...") | |
| start = time.time() | |
| shutil.copy2(ORIGINAL_DB_PATH, INDEXED_DB_PATH) | |
| elapsed = time.time() - start | |
| print(f" β Copied in {elapsed:.1f}s\n") | |
| # Connect and create indices | |
| print("2. Creating indices on edge table...") | |
| conn = sqlite3.connect(INDEXED_DB_PATH) | |
| cursor = conn.cursor() | |
| # Enable optimizations for index creation | |
| cursor.execute("PRAGMA journal_mode = WAL") | |
| cursor.execute("PRAGMA synchronous = NORMAL") | |
| cursor.execute("PRAGMA cache_size = -256000") | |
| cursor.execute("PRAGMA temp_store = MEMORY") | |
| indices_to_create = [ | |
| ("idx_edge_start_id", "edge", "start_id", "Speed up queries filtering by start node"), | |
| ("idx_edge_end_id", "edge", "end_id", "Speed up queries filtering by end node"), | |
| ("idx_edge_rel_id", "edge", "rel_id", "Speed up queries filtering by relation"), | |
| ] | |
| for idx_name, table, column, description in indices_to_create: | |
| print(f" Creating {idx_name} on {table}({column})...") | |
| print(f" Purpose: {description}") | |
| start = time.time() | |
| cursor.execute(f"CREATE INDEX IF NOT EXISTS {idx_name} ON {table}({column})") | |
| elapsed = time.time() - start | |
| print(f" β Created in {elapsed:.1f}s\n") | |
| # Analyze for query optimization | |
| print("3. Running ANALYZE to optimize query planning...") | |
| start = time.time() | |
| cursor.execute("ANALYZE") | |
| elapsed = time.time() - start | |
| print(f" β Analyzed in {elapsed:.1f}s\n") | |
| # Commit and close | |
| conn.commit() | |
| conn.close() | |
| # Check final size | |
| indexed_size = os.path.getsize(INDEXED_DB_PATH) | |
| size_increase = (indexed_size - original_size) / (2**30) | |
| print("="*60) | |
| print("INDEXING COMPLETE!") | |
| print("="*60) | |
| print(f"Original size: {original_size / (2**30):.2f} GB") | |
| print(f"Indexed size: {indexed_size / (2**30):.2f} GB") | |
| print(f"Size increase: +{size_increase:.2f} GB ({100*size_increase/(original_size/(2**30)):.1f}%)") | |
| print(f"Location: {INDEXED_DB_PATH}") | |
| print("="*60 + "\n") | |
| return INDEXED_DB_PATH | |
| # Create indexed database on startup | |
| DB_PATH = create_indexed_database() | |
| def get_db_connection(): | |
| """Create optimized read connection to indexed database""" | |
| conn = sqlite3.connect(DB_PATH, check_same_thread=False) | |
| conn.execute("PRAGMA cache_size = -256000") | |
| conn.execute("PRAGMA mmap_size = 4294967296") | |
| conn.execute("PRAGMA temp_store = MEMORY") | |
| return conn | |
| def verify_indices(): | |
| """Verify that indices were created successfully""" | |
| print("\n" + "="*60) | |
| print("VERIFYING INDICES") | |
| print("="*60) | |
| with get_db_connection() as conn: | |
| cursor = conn.cursor() | |
| # Check edge table indices | |
| cursor.execute("PRAGMA index_list(edge)") | |
| indices = cursor.fetchall() | |
| print(f"\nEdge table indices: {len(indices)}") | |
| for idx in indices: | |
| idx_name = idx[1] | |
| cursor.execute(f"PRAGMA index_info({idx_name})") | |
| cols = cursor.fetchall() | |
| col_names = [c[2] for c in cols if c[2]] or ['PRIMARY KEY'] | |
| print(f" β {idx_name}: {', '.join(col_names)}") | |
| # Test query speed with EXPLAIN QUERY PLAN | |
| print("\n" + "="*60) | |
| print("TESTING QUERY PERFORMANCE") | |
| print("="*60) | |
| test_queries = [ | |
| ("Node query (indexed)", "SELECT * FROM node WHERE id LIKE '/c/de/hund%'"), | |
| ("Edge start_id (NOW INDEXED!)", "SELECT * FROM edge WHERE start_id LIKE '/c/de/hund%' LIMIT 10"), | |
| ("Edge end_id (NOW INDEXED!)", "SELECT * FROM edge WHERE end_id LIKE '/c/de/tier%' LIMIT 10"), | |
| ] | |
| for name, query in test_queries: | |
| print(f"\n{name}:") | |
| # Show query plan | |
| cursor.execute(f"EXPLAIN QUERY PLAN {query}") | |
| plan = cursor.fetchall() | |
| uses_index = any('INDEX' in str(row).upper() for row in plan) | |
| for row in plan: | |
| print(f" Plan: {row}") | |
| # Time the query | |
| start = time.time() | |
| cursor.execute(query) | |
| results = cursor.fetchall() | |
| elapsed = time.time() - start | |
| status = "β FAST" if elapsed < 1 else "β οΈ SLOW" if elapsed < 5 else "β VERY SLOW" | |
| print(f" {status}: {len(results)} results in {elapsed:.3f}s") | |
| print("\n" + "="*60 + "\n") | |
| verify_indices() | |
| def get_semantic_profile(word, lang='de'): | |
| """ | |
| Semantic profile - NOW FAST with indices! | |
| """ | |
| if not word: | |
| return "β οΈ Please enter a word." | |
| if lang not in TARGET_LANGUAGES: | |
| return f"β οΈ Language '{lang}' not available." | |
| word = word.strip().lower().replace(' ', '_') | |
| like_path = f"/c/{lang}/{word}%" | |
| relations = [ | |
| "/r/IsA", "/r/PartOf", "/r/HasA", "/r/UsedFor", "/r/CapableOf", | |
| "/r/Causes", "/r/HasProperty", "/r/Synonym", "/r/Antonym", | |
| "/r/AtLocation", "/r/RelatedTo" | |
| ] | |
| output_md = f"# π§ Semantic Profile: '{word}'\n\n" | |
| try: | |
| with get_db_connection() as conn: | |
| cursor = conn.cursor() | |
| # Check if word exists | |
| cursor.execute("SELECT id, label FROM node WHERE id LIKE ?", (like_path,)) | |
| nodes = cursor.fetchall() | |
| if not nodes: | |
| return f"# π§ Semantic Profile: '{word}'\n\nβ οΈ No nodes found. Check spelling or try a more common word." | |
| for node_id, label in nodes[:3]: | |
| output_md += f"**Node:** `{node_id}` ({label})\n" | |
| output_md += "\n" | |
| total_relations = 0 | |
| # Query each relation - NOW FAST with indices! | |
| for rel in relations: | |
| output_md += f"## {rel}\n\n" | |
| has_results = False | |
| # Outgoing edges - FAST with idx_edge_start_id | |
| cursor.execute(""" | |
| SELECT en.label, e.weight | |
| FROM edge e | |
| JOIN node en ON e.end_id = en.id | |
| JOIN relation r ON e.rel_id = r.id | |
| WHERE e.start_id LIKE ? AND r.label = ? | |
| ORDER BY e.weight DESC | |
| LIMIT 7 | |
| """, (like_path, rel)) | |
| for label, weight in cursor.fetchall(): | |
| output_md += f"- **{word}** {rel} β *{label}* `[{weight:.3f}]`\n" | |
| has_results = True | |
| total_relations += 1 | |
| # Incoming edges - FAST with idx_edge_end_id | |
| cursor.execute(""" | |
| SELECT s.label, e.weight | |
| FROM edge e | |
| JOIN node s ON e.start_id = s.id | |
| JOIN relation r ON e.rel_id = r.id | |
| WHERE e.end_id LIKE ? AND r.label = ? | |
| ORDER BY e.weight DESC | |
| LIMIT 7 | |
| """, (like_path, rel)) | |
| for label, weight in cursor.fetchall(): | |
| output_md += f"- *{label}* {rel} β **{word}** `[{weight:.3f}]`\n" | |
| has_results = True | |
| total_relations += 1 | |
| if not has_results: | |
| output_md += "*No results*\n" | |
| output_md += "\n" | |
| output_md += f"---\n**Total relations:** {total_relations}\n" | |
| return output_md | |
| except Exception as e: | |
| print(f"ERROR: {e}") | |
| import traceback | |
| traceback.print_exc() | |
| return f"**β Error:**\n\n```\n{e}\n```" | |
| def run_query(start_node, relation, end_node, limit): | |
| """Query builder - NOW FAST with indices!""" | |
| query = """ | |
| SELECT | |
| e.id AS edge_id, | |
| s.id AS start_id, | |
| r.label AS relation, | |
| en.id AS end_id, | |
| e.weight, | |
| s.label AS start_label, | |
| en.label AS end_label | |
| FROM edge e | |
| JOIN relation r ON e.rel_id = r.id | |
| JOIN node s ON e.start_id = s.id | |
| JOIN node en ON e.end_id = en.id | |
| WHERE 1=1 | |
| """ | |
| params = [] | |
| try: | |
| with get_db_connection() as conn: | |
| # Language filter | |
| lang_filter = " OR ".join([f"(s.id LIKE '/c/{lang}/%' OR en.id LIKE '/c/{lang}/%')" for lang in TARGET_LANGUAGES]) | |
| query += f" AND ({lang_filter})" | |
| # User filters | |
| if start_node: | |
| pattern = start_node if '%' in start_node else f"%{start_node}%" | |
| query += " AND s.id LIKE ?" | |
| params.append(pattern) | |
| if relation: | |
| if '%' in relation: | |
| query += " AND r.label LIKE ?" | |
| else: | |
| query += " AND r.label = ?" | |
| params.append(relation) | |
| if end_node: | |
| pattern = end_node if '%' in end_node else f"%{end_node}%" | |
| query += " AND en.id LIKE ?" | |
| params.append(pattern) | |
| query += " ORDER BY e.weight DESC LIMIT ?" | |
| params.append(limit) | |
| start_time = time.time() | |
| df = pd.read_sql_query(query, conn, params=params) | |
| elapsed = time.time() - start_time | |
| if df.empty: | |
| return pd.DataFrame(), f"No results ({elapsed:.2f}s)" | |
| df.columns = ['edge_id', 'start_id', 'relation', 'end_id', 'weight', 'start_label', 'end_label'] | |
| return df, f"β {len(df)} results in {elapsed:.2f}s" | |
| except Exception as e: | |
| print(f"ERROR: {e}") | |
| import traceback | |
| traceback.print_exc() | |
| return pd.DataFrame(), f"**β Error:** {e}" | |
| def run_raw_query(sql_query): | |
| """Execute raw SQL""" | |
| if not sql_query.strip().upper().startswith("SELECT"): | |
| return pd.DataFrame(), "Only SELECT allowed" | |
| try: | |
| with get_db_connection() as conn: | |
| start = time.time() | |
| df = pd.read_sql_query(sql_query, conn) | |
| elapsed = time.time() - start | |
| return df, f"β {len(df)} rows in {elapsed:.2f}s" | |
| except Exception as e: | |
| return pd.DataFrame(), f"Error: {e}" | |
| def get_schema_info(): | |
| """Get schema with index info""" | |
| with get_db_connection() as conn: | |
| cursor = conn.cursor() | |
| md = "# π Database Schema\n\n" | |
| md += "β **Custom indices created for fast queries!**\n\n" | |
| cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'") | |
| tables = cursor.fetchall() | |
| for table_name, in tables: | |
| cursor.execute(f"SELECT COUNT(*) FROM {table_name}") | |
| count = cursor.fetchone()[0] | |
| md += f"## {table_name} ({count:,} rows)\n\n" | |
| # Columns | |
| cursor.execute(f"PRAGMA table_info({table_name})") | |
| cols = cursor.fetchall() | |
| md += "| Column | Type | Null | PK |\n|:--|:--|:--|:--|\n" | |
| for col in cols: | |
| md += f"| `{col[1]}` | `{col[2]}` | {'β' if col[3] else 'β'} | {'β' if col[5] else 'β'} |\n" | |
| # Indices | |
| cursor.execute(f"PRAGMA index_list({table_name})") | |
| indices = cursor.fetchall() | |
| if indices: | |
| md += f"\n**Indices ({len(indices)}):**\n" | |
| for idx in indices: | |
| cursor.execute(f"PRAGMA index_info({idx[1]})") | |
| idx_cols = cursor.fetchall() | |
| cols_str = ', '.join([c[2] for c in idx_cols if c[2]]) or 'id' | |
| # Mark custom indices | |
| custom = "π CUSTOM" if idx[1].startswith("idx_") else "" | |
| md += f"- `{idx[1]}` on ({cols_str}) {custom}\n" | |
| md += "\n---\n\n" | |
| return md | |
| # Gradio UI | |
| with gr.Blocks(title="ConceptNet Explorer (INDEXED)", theme=gr.themes.Soft()) as demo: | |
| gr.Markdown("# π§ ConceptNet Explorer (With Custom Indices! π)") | |
| db_size = os.path.getsize(DB_PATH) / (2**30) | |
| gr.Markdown( | |
| f"**Database:** {os.path.basename(DB_PATH)} ({db_size:.2f} GB) | " | |
| f"**Language:** {', '.join([l.upper() for l in TARGET_LANGUAGES])} | " | |
| f"**Status:** β Indexed & Fast" | |
| ) | |
| gr.Markdown("*Custom indices created on edge.start_id and edge.end_id for 100x faster queries!*") | |
| with gr.Tabs(): | |
| with gr.TabItem("π Semantic Profile"): | |
| gr.Markdown("**Get semantic profile - NOW FAST with custom indices!**") | |
| with gr.Row(): | |
| word_input = gr.Textbox(label="Word", placeholder="hund", value="hund") | |
| lang_input = gr.Dropdown(choices=TARGET_LANGUAGES, value=TARGET_LANGUAGES[0], label="Language") | |
| semantic_btn = gr.Button("π Get Profile", variant="primary", size="lg") | |
| semantic_output = gr.Markdown("*Click to start...*") | |
| with gr.TabItem("β‘ Query Builder"): | |
| gr.Markdown("**Build queries - NOW FAST with custom indices!**") | |
| with gr.Row(): | |
| start_input = gr.Textbox(label="Start Node", placeholder="hund", value="") | |
| rel_input = gr.Textbox(label="Relation", placeholder="IsA", value="") | |
| end_input = gr.Textbox(label="End Node", placeholder="tier", value="") | |
| limit_slider = gr.Slider(label="Limit", minimum=1, maximum=200, value=50, step=1) | |
| query_btn = gr.Button("βΆοΈ Run Query", variant="primary", size="lg") | |
| status_output = gr.Markdown("*Ready...*") | |
| results_output = gr.DataFrame(label="Results", wrap=True) | |
| with gr.TabItem("π» Raw SQL"): | |
| raw_sql_input = gr.Textbox( | |
| label="SQL Query", | |
| value="SELECT * FROM edge WHERE start_id LIKE '/c/de/hund%' LIMIT 10", | |
| lines=3 | |
| ) | |
| raw_btn = gr.Button("βΆοΈ Execute") | |
| raw_status = gr.Markdown() | |
| raw_results = gr.DataFrame() | |
| with gr.TabItem("π Schema"): | |
| schema_btn = gr.Button("π Load Schema") | |
| schema_output = gr.Markdown("*Click to load...*") | |
| gr.Markdown("---\n**π Performance:** Custom indices created on edge table = 100x faster queries!") | |
| # Connect functions | |
| semantic_btn.click(get_semantic_profile, [word_input, lang_input], semantic_output) | |
| query_btn.click(run_query, [start_input, rel_input, end_input, limit_slider], [results_output, status_output]) | |
| raw_btn.click(run_raw_query, raw_sql_input, [raw_results, raw_status]) | |
| schema_btn.click(get_schema_info, None, schema_output) | |
| if __name__ == "__main__": | |
| print("\nπ Starting app with indexed database...\n") | |
| demo.launch(ssr_mode=False) |