import gradio as gr import sqlite3 import pandas as pd from huggingface_hub import hf_hub_download, HfApi import os import time import shutil from pathlib import Path import json # ===== CONFIGURATION ===== TARGET_LANGUAGES = ['de', 'en', 'es', 'fr', 'it', 'ja', 'nl', 'pl', 'pt', 'ru', 'zh'] INDEXED_REPO_ID = "cstr/conceptnet-de-indexed" INDEXED_DB_FILENAME = "conceptnet-de-indexed.db" PROGRESS_FILENAME = "indexing_progress.json" LOCAL_DB_PATH = "/tmp/conceptnet-indexed.db" CONCEPTNET_BASE = "http://conceptnet.io" # CRITICAL: Full URL base # ========================= print(f"šŸŒ Languages: {', '.join([l.upper() for l in TARGET_LANGUAGES])}") HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HF_API_TOKEN") if HF_TOKEN: print(f"āœ… HF_TOKEN found (length: {len(HF_TOKEN)})") else: print("āš ļø No HF_TOKEN - checkpointing disabled") ORIGINAL_REPO_ID = "ysenarath/conceptnet-sqlite" ORIGINAL_DB_FILENAME = "data/conceptnet-v5.7.0.db" def log_progress(message, level="INFO"): """Enhanced logging with timestamp""" timestamp = time.strftime("%H:%M:%S") prefix = { "INFO": "ā„¹ļø ", "SUCCESS": "āœ…", "ERROR": "āŒ", "WARN": "āš ļø ", "CHECKPOINT": "šŸ’¾", "DEBUG": "šŸ”" }.get(level, "") print(f"[{timestamp}] {prefix} {message}") def verify_database_has_indices(db_path): """Verify database has required indices""" log_progress(f"Verifying indices in {os.path.basename(db_path)}...", "DEBUG") if not os.path.exists(db_path): log_progress("Database file does not exist", "ERROR") return False, 0 try: conn = sqlite3.connect(db_path) cursor = conn.cursor() cursor.execute("SELECT name FROM sqlite_master WHERE type='index' AND name LIKE 'idx_%'") custom_indices = cursor.fetchall() conn.close() has_all = len(custom_indices) >= 4 log_progress(f"Found {len(custom_indices)} custom indices (need 4+): {has_all}", "SUCCESS" if has_all else "WARN") return has_all, len(custom_indices) except Exception as e: log_progress(f"Error verifying indices: {e}", "ERROR") return False, 0 def check_remote_progress(): """Check remote progress with detailed logging""" log_progress("Checking remote progress...", "DEBUG") if not HF_TOKEN: log_progress("No HF_TOKEN - cannot check remote", "WARN") return { "completed_indices": [], "analyzed_tables": [], "database_uploaded": False, "indexing_complete": False } try: api = HfApi() try: api.repo_info(repo_id=INDEXED_REPO_ID, repo_type="dataset", token=HF_TOKEN) log_progress(f"Repository exists: {INDEXED_REPO_ID}", "SUCCESS") except: log_progress("Repository does not exist yet", "INFO") return { "completed_indices": [], "analyzed_tables": [], "database_uploaded": False, "indexing_complete": False } try: progress_path = hf_hub_download( repo_id=INDEXED_REPO_ID, filename=PROGRESS_FILENAME, repo_type="dataset", token=HF_TOKEN ) with open(progress_path, 'r') as f: progress = json.load(f) log_progress("Remote progress loaded:", "SUCCESS") log_progress(f" Completed indices: {progress.get('completed_indices', [])}", "INFO") log_progress(f" Analyzed tables: {progress.get('analyzed_tables', [])}", "INFO") log_progress(f" Indexing complete: {progress.get('indexing_complete', False)}", "INFO") return progress except Exception as e: log_progress("No progress file found (starting fresh)", "INFO") return { "completed_indices": [], "analyzed_tables": [], "database_uploaded": False, "indexing_complete": False } except Exception as e: log_progress(f"Error checking remote: {e}", "ERROR") return { "completed_indices": [], "analyzed_tables": [], "database_uploaded": False, "indexing_complete": False } def update_remote_progress(completed_indices, analyzed_tables=None, database_uploaded=False, indexing_complete=False): """Update remote progress file""" log_progress("Updating remote progress...", "DEBUG") if not HF_TOKEN: log_progress("Cannot update progress: No HF_TOKEN", "WARN") return False if analyzed_tables is None: analyzed_tables = [] try: api = HfApi() try: api.repo_info(repo_id=INDEXED_REPO_ID, repo_type="dataset", token=HF_TOKEN) except: log_progress(f"Creating repository: {INDEXED_REPO_ID}", "INFO") api.create_repo( repo_id=INDEXED_REPO_ID, repo_type="dataset", token=HF_TOKEN, private=False ) progress = { "completed_indices": completed_indices, "analyzed_tables": analyzed_tables, "database_uploaded": database_uploaded, "indexing_complete": indexing_complete, "timestamp": time.time(), "languages": TARGET_LANGUAGES } progress_path = "/tmp/indexing_progress.json" with open(progress_path, 'w') as f: json.dump(progress, f, indent=2) api.upload_file( path_or_fileobj=progress_path, path_in_repo=PROGRESS_FILENAME, repo_id=INDEXED_REPO_ID, repo_type="dataset", token=HF_TOKEN, commit_message=f"Progress: {len(completed_indices)} indices, {len(analyzed_tables)} tables" ) log_progress(f"Progress updated: {len(completed_indices)} indices, {len(analyzed_tables)} tables", "CHECKPOINT") return True except Exception as e: log_progress(f"Failed to update progress: {e}", "ERROR") import traceback traceback.print_exc() return False def upload_database_checkpoint(message=""): """Upload database with WAL checkpoint""" log_progress("Starting database upload...", "CHECKPOINT") if not HF_TOKEN: log_progress("Cannot upload: No HF_TOKEN", "WARN") return False if not os.path.exists(LOCAL_DB_PATH): log_progress("Database file doesn't exist", "ERROR") return False try: # CRITICAL: Checkpoint WAL to merge changes into main file log_progress("Checkpointing WAL...", "DEBUG") conn = sqlite3.connect(LOCAL_DB_PATH) conn.execute("PRAGMA wal_checkpoint(TRUNCATE)") conn.close() log_progress("WAL checkpoint complete", "SUCCESS") # Verify indices are in file has_indices, idx_count = verify_database_has_indices(LOCAL_DB_PATH) log_progress(f"Pre-upload verification: {idx_count} indices", "SUCCESS" if has_indices else "WARN") api = HfApi() db_size = os.path.getsize(LOCAL_DB_PATH) / (2**30) log_progress(f"Uploading {db_size:.2f} GB to {INDEXED_REPO_ID}...", "CHECKPOINT") if message: log_progress(f" Message: {message}", "INFO") log_progress(" This will take 2-5 minutes...", "INFO") start = time.time() api.upload_file( path_or_fileobj=LOCAL_DB_PATH, path_in_repo=INDEXED_DB_FILENAME, repo_id=INDEXED_REPO_ID, repo_type="dataset", token=HF_TOKEN, commit_message=message or "Database checkpoint" ) elapsed = time.time() - start speed_mbps = (db_size * 8) / elapsed if elapsed > 0 else 0 log_progress(f"Upload complete in {elapsed:.1f}s ({speed_mbps:.1f} Mbps)", "SUCCESS") log_progress(f"View at: https://huggingface.co/datasets/{INDEXED_REPO_ID}", "INFO") return True except Exception as e: log_progress(f"Upload failed: {e}", "ERROR") import traceback traceback.print_exc() return False def create_indexed_database(): """Create or download indexed database with comprehensive checkpointing""" log_progress("="*60, "INFO") log_progress("STARTING DATABASE SETUP", "INFO") log_progress("="*60, "INFO") # Check remote progress progress = check_remote_progress() completed_indices = set(progress.get("completed_indices", [])) analyzed_tables = set(progress.get("analyzed_tables", [])) database_uploaded = progress.get("database_uploaded", False) indexing_complete = progress.get("indexing_complete", False) # If fully complete, download and return if indexing_complete: log_progress("Fully indexed database exists!", "SUCCESS") log_progress(f"Downloading from {INDEXED_REPO_ID}...", "INFO") try: indexed_path = hf_hub_download( repo_id=INDEXED_REPO_ID, filename=INDEXED_DB_FILENAME, repo_type="dataset", token=HF_TOKEN ) log_progress(f"Downloaded to: {indexed_path}", "SUCCESS") # Verify it actually has indices has_indices, idx_count = verify_database_has_indices(indexed_path) if has_indices: log_progress(f"Verified {idx_count} indices present", "SUCCESS") return indexed_path else: log_progress(f"CORRUPTED: Only {idx_count}/4 indices found!", "ERROR") log_progress("The database needs to be re-indexed", "WARN") # Reset and rebuild indexing_complete = False completed_indices = set() analyzed_tables = set() database_uploaded = False update_remote_progress([], [], False, False) except Exception as e: log_progress(f"Download failed: {e}", "ERROR") log_progress("Will create locally", "INFO") # Download partially indexed DB if checkpoint exists if (completed_indices or analyzed_tables or database_uploaded) and not os.path.exists(LOCAL_DB_PATH): log_progress("Checkpoint detected - downloading partial DB...", "INFO") log_progress(f" Indices done: {sorted(completed_indices)}", "INFO") log_progress(f" Tables analyzed: {sorted(analyzed_tables)}", "INFO") try: indexed_path = hf_hub_download( repo_id=INDEXED_REPO_ID, filename=INDEXED_DB_FILENAME, repo_type="dataset", token=HF_TOKEN ) log_progress("Downloaded partial DB", "SUCCESS") # Verify indices has_indices, idx_count = verify_database_has_indices(indexed_path) if idx_count >= len(completed_indices): log_progress(f"Verified {idx_count} indices (expected {len(completed_indices)})", "SUCCESS") log_progress(f"Copying to {LOCAL_DB_PATH}...", "DEBUG") start = time.time() shutil.copy2(indexed_path, LOCAL_DB_PATH) elapsed = time.time() - start log_progress(f"Copied in {elapsed:.1f}s", "SUCCESS") log_progress("Resuming from checkpoint āœ…", "SUCCESS") else: log_progress(f"Index mismatch: found {idx_count}, expected {len(completed_indices)}", "ERROR") log_progress("Will start from scratch", "WARN") completed_indices = set() analyzed_tables = set() except Exception as e: log_progress(f"Could not download partial DB: {e}", "WARN") log_progress("Will start from original", "INFO") completed_indices = set() analyzed_tables = set() # Download original if needed if not os.path.exists(LOCAL_DB_PATH): if completed_indices or analyzed_tables: log_progress("Failed to resume - clearing progress", "WARN") update_remote_progress([], [], False, False) completed_indices = set() analyzed_tables = set() log_progress("Downloading original ConceptNet database...", "INFO") original_path = hf_hub_download( repo_id=ORIGINAL_REPO_ID, filename=ORIGINAL_DB_FILENAME, repo_type="dataset" ) original_size = os.path.getsize(original_path) free_space = shutil.disk_usage("/tmp")[2] log_progress(f"Original size: {original_size / (2**30):.2f} GB", "INFO") log_progress(f"Free space: {free_space / (2**30):.2f} GB", "INFO") if free_space < original_size * 2: raise Exception(f"Insufficient space! Need {original_size * 2 / (2**30):.1f} GB, have {free_space / (2**30):.1f} GB") log_progress(f"Copying to {LOCAL_DB_PATH}...", "INFO") start = time.time() shutil.copy2(original_path, LOCAL_DB_PATH) elapsed = time.time() - start log_progress(f"Copied {original_size / (2**30):.2f} GB in {elapsed:.1f}s ({original_size / elapsed / (2**20):.1f} MB/s)", "SUCCESS") # Only index if not complete if not (len(completed_indices) >= 4 and len(analyzed_tables) >= 4): log_progress("Indexing required", "INFO") # Connect log_progress("Opening database connection...", "DEBUG") conn = sqlite3.connect(LOCAL_DB_PATH) cursor = conn.cursor() # Optimizations log_progress("Setting PRAGMA optimizations...", "DEBUG") cursor.execute("PRAGMA journal_mode = WAL") cursor.execute("PRAGMA synchronous = NORMAL") cursor.execute("PRAGMA cache_size = -512000") cursor.execute("PRAGMA temp_store = MEMORY") # PHASE 1: Indices log_progress("="*60, "INFO") log_progress("PHASE 1: CREATING INDICES", "INFO") log_progress("="*60, "INFO") indices_to_create = [ ("idx_edge_start_id", "edge", "start_id"), ("idx_edge_end_id", "edge", "end_id"), ("idx_edge_rel_id", "edge", "rel_id"), ("idx_node_label", "node", "label"), ] for i, (idx_name, table, column) in enumerate(indices_to_create, 1): if idx_name in completed_indices: log_progress(f"[{i}/{len(indices_to_create)}] {idx_name} - SKIPPED", "INFO") continue log_progress(f"[{i}/{len(indices_to_create)}] Creating {idx_name} on {table}({column})...", "INFO") start = time.time() try: cursor.execute(f"CREATE INDEX IF NOT EXISTS {idx_name} ON {table}({column})") conn.commit() elapsed = time.time() - start log_progress(f" Created in {elapsed:.1f}s ({elapsed/60:.1f} min)", "SUCCESS") completed_indices.add(idx_name) update_remote_progress(list(completed_indices), list(analyzed_tables), False, False) upload_database_checkpoint(f"Checkpoint: {idx_name} ({i}/{len(indices_to_create)})") except Exception as e: log_progress(f"Failed to create {idx_name}: {e}", "ERROR") conn.close() raise # PHASE 2: ANALYZE log_progress("="*60, "INFO") log_progress("PHASE 2: ANALYZING TABLES", "INFO") log_progress("="*60, "INFO") cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'") tables = [row[0] for row in cursor.fetchall()] log_progress(f"Found {len(tables)} tables: {tables}", "INFO") for i, table in enumerate(tables, 1): if table in analyzed_tables: log_progress(f"[{i}/{len(tables)}] {table} - SKIPPED", "INFO") continue log_progress(f"[{i}/{len(tables)}] Analyzing {table}...", "INFO") try: cursor.execute(f"SELECT COUNT(*) FROM {table}") row_count = cursor.fetchone()[0] log_progress(f" Rows: {row_count:,}", "INFO") except: log_progress(" Could not count rows", "WARN") start = time.time() try: cursor.execute(f"ANALYZE {table}") conn.commit() elapsed = time.time() - start log_progress(f" Analyzed in {elapsed:.1f}s", "SUCCESS") analyzed_tables.add(table) update_remote_progress(list(completed_indices), list(analyzed_tables), False, False) upload_database_checkpoint(f"Checkpoint: {table} analyzed ({i}/{len(tables)})") except Exception as e: log_progress(f"Failed to analyze {table}: {e}", "ERROR") log_progress("Continuing...", "WARN") # Final checkpoint log_progress("Final WAL checkpoint...", "INFO") cursor.execute("PRAGMA wal_checkpoint(TRUNCATE)") conn.commit() conn.close() log_progress("Database closed", "SUCCESS") # Final upload log_progress("="*60, "INFO") log_progress("FINAL UPLOAD", "INFO") log_progress("="*60, "INFO") has_indices, idx_count = verify_database_has_indices(LOCAL_DB_PATH) log_progress(f"Final check: {idx_count} indices", "SUCCESS" if has_indices else "ERROR") upload_database_checkpoint("COMPLETE - All indices and analysis done") update_remote_progress(list(completed_indices), list(analyzed_tables), True, True) log_progress("="*60, "SUCCESS") log_progress("INDEXING COMPLETE!", "SUCCESS") log_progress("="*60, "SUCCESS") return LOCAL_DB_PATH # Initialize DB_PATH = create_indexed_database() def get_db_connection(): """Create optimized connection""" log_progress("Creating DB connection", "DEBUG") conn = sqlite3.connect(DB_PATH, check_same_thread=False) conn.execute("PRAGMA cache_size = -256000") conn.execute("PRAGMA mmap_size = 4294967296") return conn def run_diagnostics(): """Run comprehensive diagnostics""" log_progress("="*60, "INFO") log_progress("RUNNING DIAGNOSTICS", "INFO") log_progress("="*60, "INFO") try: with get_db_connection() as conn: cursor = conn.cursor() # 1. Sample nodes log_progress("\n1. Sample node IDs:", "INFO") cursor.execute("SELECT id, label FROM node LIMIT 10") for node_id, label in cursor.fetchall(): print(f" {node_id} -> {label}") # 2. Test correct pattern log_progress("\n2. Testing CORRECT pattern (no leading %):", "INFO") test_pattern = f"{CONCEPTNET_BASE}/c/en/dog%" log_progress(f" Pattern: {test_pattern}", "DEBUG") start = time.time() cursor.execute("SELECT id, label FROM node WHERE id LIKE ? LIMIT 5", (test_pattern,)) results = cursor.fetchall() elapsed = time.time() - start log_progress(f" Found {len(results)} in {elapsed:.3f}s", "SUCCESS" if elapsed < 1 else "WARN") for node_id, label in results: print(f" {node_id} -> {label}") # 3. Check index usage log_progress("\n3. Checking index usage:", "INFO") cursor.execute(f"EXPLAIN QUERY PLAN SELECT * FROM edge WHERE start_id LIKE '{test_pattern}'") plan = cursor.fetchall() uses_index = any('INDEX' in str(row).upper() for row in plan) log_progress(f" Uses index: {uses_index}", "SUCCESS" if uses_index else "ERROR") for row in plan: print(f" {row}") # 4. Test wrong pattern log_progress("\n4. Testing WRONG pattern (leading %):", "WARN") wrong_pattern = f"%/c/en/dog%" log_progress(f" Pattern: {wrong_pattern}", "DEBUG") start = time.time() cursor.execute("SELECT id, label FROM node WHERE id LIKE ? LIMIT 5", (wrong_pattern,)) results = cursor.fetchall() elapsed = time.time() - start log_progress(f" Found {len(results)} in {elapsed:.3f}s (SLOW!)", "WARN" if elapsed > 1 else "INFO") cursor.execute(f"EXPLAIN QUERY PLAN SELECT * FROM node WHERE id LIKE '{wrong_pattern}'") plan = cursor.fetchall() uses_index = any('INDEX' in str(row).upper() for row in plan) log_progress(f" Uses index: {uses_index} (should be False)", "WARN" if uses_index else "INFO") log_progress("\n" + "="*60, "INFO") log_progress("DIAGNOSTICS COMPLETE", "SUCCESS") log_progress("="*60 + "\n", "INFO") except Exception as e: log_progress(f"Diagnostics failed: {e}", "ERROR") import traceback traceback.print_exc() # Run diagnostics run_diagnostics() def get_semantic_profile(word, lang='en', progress=gr.Progress()): """Get semantic profile with CORRECT URL pattern""" log_progress(f"Semantic profile request: word='{word}', lang='{lang}'", "DEBUG") progress(0, desc="Starting...") if not word: return "āš ļø Please enter a word." if lang not in TARGET_LANGUAGES: return f"āš ļø Language '{lang}' not supported. Available: {', '.join(TARGET_LANGUAGES)}" word = word.strip().lower().replace(' ', '_') # CORRECT pattern - no leading % allows index usage! like_path = f"{CONCEPTNET_BASE}/c/{lang}/{word}%" log_progress(f"Using pattern: {like_path}", "DEBUG") relations = [ "/r/IsA", "/r/PartOf", "/r/HasA", "/r/UsedFor", "/r/CapableOf", "/r/Causes", "/r/HasProperty", "/r/Synonym", "/r/Antonym", "/r/AtLocation", "/r/RelatedTo", "/r/DerivedFrom", "/r/SimilarTo" ] output_md = f"# 🧠 Semantic Profile: '{word}' ({lang.upper()})\n\n" try: with get_db_connection() as conn: cursor = conn.cursor() progress(0.05, desc="Finding nodes...") start = time.time() cursor.execute("SELECT id, label FROM node WHERE id LIKE ? LIMIT 5", (like_path,)) nodes = cursor.fetchall() elapsed = time.time() - start log_progress(f"Found {len(nodes)} nodes in {elapsed:.3f}s", "SUCCESS" if nodes else "WARN") if not nodes: return f"# 🧠 Semantic Profile: '{word}'\n\nāš ļø **Not found**\n\nSearched: `{like_path}`" for node_id, label in nodes[:3]: output_md += f"**Node:** `{node_id}`\n" output_md += f"**Label:** {label}\n\n" log_progress(f" Found node: {node_id} ({label})", "DEBUG") total_relations = 0 for i, rel in enumerate(relations): progress((i + 1) / len(relations), desc=f"Querying {rel}...") log_progress(f"Querying relation: {rel}", "DEBUG") output_md += f"## {rel}\n\n" has_results = False # Outgoing edges start = time.time() cursor.execute(""" SELECT en.label, e.weight FROM edge e JOIN node en ON e.end_id = en.id JOIN relation r ON e.rel_id = r.id WHERE e.start_id LIKE ? AND r.label = ? ORDER BY e.weight DESC LIMIT 7 """, (like_path, rel)) out_results = cursor.fetchall() elapsed = time.time() - start log_progress(f" Outgoing: {len(out_results)} results in {elapsed:.3f}s", "DEBUG") for label, weight in out_results: output_md += f"- **{word}** {rel} → *{label}* `[{weight:.3f}]`\n" has_results = True total_relations += 1 # Incoming edges start = time.time() cursor.execute(""" SELECT s.label, e.weight FROM edge e JOIN node s ON e.start_id = s.id JOIN relation r ON e.rel_id = r.id WHERE e.end_id LIKE ? AND r.label = ? ORDER BY e.weight DESC LIMIT 7 """, (like_path, rel)) in_results = cursor.fetchall() elapsed = time.time() - start log_progress(f" Incoming: {len(in_results)} results in {elapsed:.3f}s", "DEBUG") for label, weight in in_results: output_md += f"- *{label}* {rel} → **{word}** `[{weight:.3f}]`\n" has_results = True total_relations += 1 if not has_results: output_md += "*No results*\n" output_md += "\n" progress(1.0, desc="Complete!") output_md += "---\n" output_md += f"**Total relations:** {total_relations}\n" log_progress(f"Profile complete: {total_relations} relations found", "SUCCESS") return output_md except Exception as e: log_progress(f"Error in semantic profile: {e}", "ERROR") import traceback traceback.print_exc() return f"**āŒ Error:**\n\n```\n{e}\n```" def run_query(start_node, relation, end_node, limit, progress=gr.Progress()): """Query builder with CORRECT patterns""" log_progress(f"Query request: start={start_node}, rel={relation}, end={end_node}, limit={limit}", "DEBUG") progress(0, desc="Building query...") query = """ SELECT e.id AS edge_id, s.id AS start_id, r.label AS relation, en.id AS end_id, e.weight, s.label AS start_label, en.label AS end_label FROM edge e JOIN relation r ON e.rel_id = r.id JOIN node s ON e.start_id = s.id JOIN node en ON e.end_id = en.id WHERE 1=1 """ params = [] try: with get_db_connection() as conn: progress(0.3, desc="Adding filters...") # Language filter - use correct URL pattern! lang_conditions = [] for lang in TARGET_LANGUAGES: lang_conditions.append(f"s.id LIKE '{CONCEPTNET_BASE}/c/{lang}/%'") lang_conditions.append(f"en.id LIKE '{CONCEPTNET_BASE}/c/{lang}/%'") query += f" AND ({' OR '.join(lang_conditions)})" # Start node filter if start_node and start_node.strip(): if start_node.startswith('http://'): pattern = f"{start_node}%" else: # User enters just word, we construct full URL pattern = f"{CONCEPTNET_BASE}/c/%/{start_node}%" query += " AND s.id LIKE ?" params.append(pattern) log_progress(f"Start filter: {pattern}", "DEBUG") # Relation filter if relation and relation.strip(): rel_value = relation if relation.startswith('/r/') else f"/r/{relation}" if '%' in relation: query += " AND r.label LIKE ?" else: query += " AND r.label = ?" params.append(rel_value) log_progress(f"Relation filter: {rel_value}", "DEBUG") # End node filter if end_node and end_node.strip(): if end_node.startswith('http://'): pattern = f"{end_node}%" else: pattern = f"{CONCEPTNET_BASE}/c/%/{end_node}%" query += " AND en.id LIKE ?" params.append(pattern) log_progress(f"End filter: {pattern}", "DEBUG") query += " ORDER BY e.weight DESC LIMIT ?" params.append(limit) progress(0.6, desc="Executing...") log_progress(f"Executing query with {len(params)} params", "DEBUG") start_time = time.time() df = pd.read_sql_query(query, conn, params=params) elapsed = time.time() - start_time log_progress(f"Query complete: {len(df)} results in {elapsed:.2f}s", "SUCCESS") progress(1.0, desc="Complete!") if df.empty: return pd.DataFrame(), f"āš ļø No results ({elapsed:.2f}s)" df.columns = ['edge_id', 'start_id', 'relation', 'end_id', 'weight', 'start_label', 'end_label'] return df, f"āœ… {len(df)} results in {elapsed:.2f}s" except Exception as e: log_progress(f"Query error: {e}", "ERROR") import traceback traceback.print_exc() return pd.DataFrame(), f"**āŒ Error:** {e}" def run_raw_query(sql_query): """Execute raw SQL with logging""" log_progress(f"Raw SQL query: {sql_query[:100]}...", "DEBUG") if not sql_query.strip().upper().startswith("SELECT"): return pd.DataFrame(), "āŒ Only SELECT queries allowed" try: with get_db_connection() as conn: start = time.time() df = pd.read_sql_query(sql_query, conn) elapsed = time.time() - start log_progress(f"Raw query complete: {len(df)} rows in {elapsed:.3f}s", "SUCCESS") return df, f"āœ… {len(df)} rows in {elapsed:.3f}s" except Exception as e: log_progress(f"Raw query error: {e}", "ERROR") return pd.DataFrame(), f"āŒ Error: {e}" def get_schema_info(): """Get schema with sample queries""" log_progress("Loading schema info", "DEBUG") md = f"# šŸ“š Database Schema\n\n" md += f"**Repository:** [{INDEXED_REPO_ID}](https://huggingface.co/datasets/{INDEXED_REPO_ID})\n\n" md += f"**Base URL:** `{CONCEPTNET_BASE}`\n\n" md += "## Sample Queries\n\n" md += "**Finding nodes:**\n```sql\n" md += f"-- English 'dog'\n" md += f"SELECT * FROM node WHERE id LIKE '{CONCEPTNET_BASE}/c/en/dog%';\n\n" md += f"-- German 'hund'\n" md += f"SELECT * FROM node WHERE id LIKE '{CONCEPTNET_BASE}/c/de/hund%';\n" md += "```\n\n" md += "**Finding edges:**\n```sql\n" md += f"-- Edges from 'dog'\n" md += f"SELECT * FROM edge WHERE start_id LIKE '{CONCEPTNET_BASE}/c/en/dog%' LIMIT 10;\n" md += "```\n\n" md += "āš ļø **Important:** Do NOT use leading `%` in LIKE queries (prevents index usage!)\n\n" md += "āœ… **Good:** `LIKE 'http://conceptnet.io/c/en/dog%'`\n" md += "āŒ **Bad:** `LIKE '%/c/en/dog%'`\n\n" try: with get_db_connection() as conn: cursor = conn.cursor() md += "## Tables\n\n" cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'") for table, in cursor.fetchall(): cursor.execute(f"SELECT COUNT(*) FROM {table}") count = cursor.fetchone()[0] md += f"### {table} ({count:,} rows)\n\n" # Show columns cursor.execute(f"PRAGMA table_info({table})") cols = cursor.fetchall() md += "| Column | Type |\n|:--|:--|\n" for col in cols: md += f"| `{col[1]}` | `{col[2]}` |\n" # Show indices cursor.execute(f"PRAGMA index_list({table})") indices = cursor.fetchall() if indices: md += f"\n**Indices ({len(indices)}):**\n" for idx in indices: custom = " šŸ†•" if idx[1].startswith("idx_") else "" md += f"- `{idx[1]}`{custom}\n" md += "\n" log_progress("Schema loaded successfully", "SUCCESS") except Exception as e: log_progress(f"Schema error: {e}", "ERROR") md += f"\n**Error loading schema:** {e}\n" return md # UI with gr.Blocks(title="ConceptNet Explorer", theme=gr.themes.Soft()) as demo: gr.Markdown("# 🧠 ConceptNet Explorer") gr.Markdown( f"**Multi-language semantic network explorer** | " f"**Languages:** {', '.join([l.upper() for l in TARGET_LANGUAGES])} | " f"**Repo:** [{INDEXED_REPO_ID}](https://huggingface.co/datasets/{INDEXED_REPO_ID})" ) gr.Markdown("āœ… **Optimized with custom indices** - Fast queries using correct URL patterns") with gr.Tabs(): with gr.TabItem("šŸ” Semantic Profile"): gr.Markdown("**Explore semantic relations for any word**") with gr.Row(): word_input = gr.Textbox( label="Word", placeholder="dog", value="dog", info="Enter a word to explore" ) lang_input = gr.Dropdown( choices=TARGET_LANGUAGES, value="en", label="Language", info="Select language" ) semantic_btn = gr.Button("šŸ” Get Semantic Profile", variant="primary", size="lg") semantic_output = gr.Markdown("*Enter a word and click the button to start...*") gr.Markdown("**Examples:** dog (en), hund (de), perro (es), chien (fr), 犬 (ja)") with gr.TabItem("⚔ Query Builder"): gr.Markdown("**Build custom queries to find specific relationships**") with gr.Row(): start_input = gr.Textbox( label="Start Node", placeholder="dog", info="Enter word or full URL" ) rel_input = gr.Textbox( label="Relation", placeholder="IsA", value="IsA", info="e.g., IsA, PartOf, UsedFor" ) end_input = gr.Textbox( label="End Node", placeholder="", info="Leave empty for all" ) limit_slider = gr.Slider( label="Result Limit", minimum=1, maximum=200, value=50, step=1 ) query_btn = gr.Button("ā–¶ļø Run Query", variant="primary", size="lg") status_output = gr.Markdown("*Ready to query...*") results_output = gr.DataFrame( label="Results", wrap=True, interactive=False ) with gr.TabItem("šŸ’» Raw SQL"): gr.Markdown("**Execute custom SQL queries** (SELECT only)") raw_sql_input = gr.Textbox( label="SQL Query", value=f"SELECT * FROM node WHERE id LIKE '{CONCEPTNET_BASE}/c/en/dog%' LIMIT 10", lines=5, info="Write your SELECT query" ) raw_btn = gr.Button("ā–¶ļø Execute Query", variant="secondary", size="lg") raw_status = gr.Markdown() raw_results = gr.DataFrame(label="Query Results", wrap=True) gr.Markdown( "**Tips:**\n" "- Always use `LIMIT` to prevent timeouts\n" f"- Node IDs start with: `{CONCEPTNET_BASE}/c/{{lang}}/{{word}}`\n" "- Don't use leading `%` in LIKE queries for best performance" ) with gr.TabItem("šŸ“Š Schema & Info"): gr.Markdown("**Database schema and structure information**") schema_btn = gr.Button("šŸ“Š Load Schema", variant="secondary", size="lg") schema_output = gr.Markdown("*Click button to load schema...*") gr.Markdown( "---\n" "**Performance:** Custom indices on `edge.start_id`, `edge.end_id`, `edge.rel_id`, `node.label` | " "**Check server logs for detailed query timing and diagnostics**" ) # Wire up event handlers semantic_btn.click( fn=get_semantic_profile, inputs=[word_input, lang_input], outputs=semantic_output ) query_btn.click( fn=run_query, inputs=[start_input, rel_input, end_input, limit_slider], outputs=[results_output, status_output] ) raw_btn.click( fn=run_raw_query, inputs=raw_sql_input, outputs=[raw_results, raw_status] ) schema_btn.click( fn=get_schema_info, inputs=None, outputs=schema_output ) if __name__ == "__main__": log_progress("="*60, "SUCCESS") log_progress("APP READY!", "SUCCESS") log_progress("="*60, "SUCCESS") log_progress(f"Database: {DB_PATH}", "INFO") log_progress(f"Size: {os.path.getsize(DB_PATH) / (2**30):.2f} GB", "INFO") log_progress("="*60 + "\n", "SUCCESS") demo.launch(ssr_mode=False)