import gradio as gr
import sqlite3
import pandas as pd
from huggingface_hub import hf_hub_download, HfApi
import os
import time
import shutil
from pathlib import Path
import json

# ===== CONFIGURATION =====
TARGET_LANGUAGES = ['de', 'en', 'es', 'fr', 'it', 'ja', 'nl', 'pl', 'pt', 'ru', 'zh']
INDEXED_REPO_ID = "cstr/conceptnet-de-indexed"
INDEXED_DB_FILENAME = "conceptnet-de-indexed.db"
PROGRESS_FILENAME = "indexing_progress.json"
LOCAL_DB_PATH = "/tmp/conceptnet-indexed.db"
CONCEPTNET_BASE = "http://conceptnet.io"  # CRITICAL: Full URL base
# =========================

| print(f"π Languages: {', '.join([l.upper() for l in TARGET_LANGUAGES])}") | |
| HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HF_API_TOKEN") | |
| if HF_TOKEN: | |
| print(f"β HF_TOKEN found (length: {len(HF_TOKEN)})") | |
| else: | |
| print("β οΈ No HF_TOKEN - checkpointing disabled") | |
| ORIGINAL_REPO_ID = "ysenarath/conceptnet-sqlite" | |
| ORIGINAL_DB_FILENAME = "data/conceptnet-v5.7.0.db" | |
def log_progress(message, level="INFO"):
    """Enhanced logging with timestamp."""
    timestamp = time.strftime("%H:%M:%S")
    prefix = {
        "INFO": "ℹ️",
        "SUCCESS": "✅",
        "ERROR": "❌",
        "WARN": "⚠️",
        "CHECKPOINT": "💾",
        "DEBUG": "🔍"
    }.get(level, "")
    print(f"[{timestamp}] {prefix} {message}")

def verify_database_has_indices(db_path):
    """Verify the database has the required custom indices."""
    log_progress(f"Verifying indices in {os.path.basename(db_path)}...", "DEBUG")
    if not os.path.exists(db_path):
        log_progress("Database file does not exist", "ERROR")
        return False, 0
    try:
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()
        cursor.execute("SELECT name FROM sqlite_master WHERE type='index' AND name LIKE 'idx_%'")
        custom_indices = cursor.fetchall()
        conn.close()
        has_all = len(custom_indices) >= 4
        log_progress(f"Found {len(custom_indices)} custom indices (need 4+): {has_all}", "SUCCESS" if has_all else "WARN")
        return has_all, len(custom_indices)
    except Exception as e:
        log_progress(f"Error verifying indices: {e}", "ERROR")
        return False, 0

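# A possible tightening (sketch only, not wired into the app): counting any
# four rows that match 'idx_%' would also accept stale or renamed indices.
# The helper below checks for the exact names this app creates; the
# EXPECTED_INDICES set mirrors indices_to_create further down and is the only
# assumption here.
EXPECTED_INDICES = {"idx_edge_start_id", "idx_edge_end_id", "idx_edge_rel_id", "idx_node_label"}

def verify_expected_indices(db_path, expected=EXPECTED_INDICES):
    """Return (all_present, missing_names) for a specific set of index names."""
    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute(
            "SELECT name FROM sqlite_master WHERE type='index' AND name LIKE 'idx_%'"
        ).fetchall()
    finally:
        conn.close()
    found = {name for (name,) in rows}
    missing = expected - found
    return not missing, sorted(missing)
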
def check_remote_progress():
    """Check remote progress with detailed logging."""
    log_progress("Checking remote progress...", "DEBUG")
    fresh = {
        "completed_indices": [],
        "analyzed_tables": [],
        "database_uploaded": False,
        "indexing_complete": False
    }
    if not HF_TOKEN:
        log_progress("No HF_TOKEN - cannot check remote", "WARN")
        return fresh
    try:
        api = HfApi()
        try:
            api.repo_info(repo_id=INDEXED_REPO_ID, repo_type="dataset", token=HF_TOKEN)
            log_progress(f"Repository exists: {INDEXED_REPO_ID}", "SUCCESS")
        except Exception:
            log_progress("Repository does not exist yet", "INFO")
            return fresh
        try:
            progress_path = hf_hub_download(
                repo_id=INDEXED_REPO_ID,
                filename=PROGRESS_FILENAME,
                repo_type="dataset",
                token=HF_TOKEN
            )
            with open(progress_path, 'r') as f:
                progress = json.load(f)
            log_progress("Remote progress loaded:", "SUCCESS")
            log_progress(f"  Completed indices: {progress.get('completed_indices', [])}", "INFO")
            log_progress(f"  Analyzed tables: {progress.get('analyzed_tables', [])}", "INFO")
            log_progress(f"  Indexing complete: {progress.get('indexing_complete', False)}", "INFO")
            return progress
        except Exception:
            log_progress("No progress file found (starting fresh)", "INFO")
            return fresh
    except Exception as e:
        log_progress(f"Error checking remote: {e}", "ERROR")
        return fresh

def update_remote_progress(completed_indices, analyzed_tables=None, database_uploaded=False, indexing_complete=False):
    """Update the remote progress file."""
    log_progress("Updating remote progress...", "DEBUG")
    if not HF_TOKEN:
        log_progress("Cannot update progress: No HF_TOKEN", "WARN")
        return False
    if analyzed_tables is None:
        analyzed_tables = []
    try:
        api = HfApi()
        try:
            api.repo_info(repo_id=INDEXED_REPO_ID, repo_type="dataset", token=HF_TOKEN)
        except Exception:
            log_progress(f"Creating repository: {INDEXED_REPO_ID}", "INFO")
            api.create_repo(
                repo_id=INDEXED_REPO_ID,
                repo_type="dataset",
                token=HF_TOKEN,
                private=False
            )
        progress = {
            "completed_indices": completed_indices,
            "analyzed_tables": analyzed_tables,
            "database_uploaded": database_uploaded,
            "indexing_complete": indexing_complete,
            "timestamp": time.time(),
            "languages": TARGET_LANGUAGES
        }
        progress_path = "/tmp/indexing_progress.json"
        with open(progress_path, 'w') as f:
            json.dump(progress, f, indent=2)
        api.upload_file(
            path_or_fileobj=progress_path,
            path_in_repo=PROGRESS_FILENAME,
            repo_id=INDEXED_REPO_ID,
            repo_type="dataset",
            token=HF_TOKEN,
            commit_message=f"Progress: {len(completed_indices)} indices, {len(analyzed_tables)} tables"
        )
        log_progress(f"Progress updated: {len(completed_indices)} indices, {len(analyzed_tables)} tables", "CHECKPOINT")
        return True
    except Exception as e:
        log_progress(f"Failed to update progress: {e}", "ERROR")
        import traceback
        traceback.print_exc()
        return False

def upload_database_checkpoint(message=""):
    """Upload the database after a WAL checkpoint."""
    log_progress("Starting database upload...", "CHECKPOINT")
    if not HF_TOKEN:
        log_progress("Cannot upload: No HF_TOKEN", "WARN")
        return False
    if not os.path.exists(LOCAL_DB_PATH):
        log_progress("Database file doesn't exist", "ERROR")
        return False
    try:
        # CRITICAL: Checkpoint WAL to merge changes into the main file
        log_progress("Checkpointing WAL...", "DEBUG")
        conn = sqlite3.connect(LOCAL_DB_PATH)
        conn.execute("PRAGMA wal_checkpoint(TRUNCATE)")
        conn.close()
        log_progress("WAL checkpoint complete", "SUCCESS")
        # Verify the indices are in the file
        has_indices, idx_count = verify_database_has_indices(LOCAL_DB_PATH)
        log_progress(f"Pre-upload verification: {idx_count} indices", "SUCCESS" if has_indices else "WARN")
        api = HfApi()
        db_size_gb = os.path.getsize(LOCAL_DB_PATH) / (2**30)
        log_progress(f"Uploading {db_size_gb:.2f} GB to {INDEXED_REPO_ID}...", "CHECKPOINT")
        if message:
            log_progress(f"  Message: {message}", "INFO")
        log_progress("  This will take 2-5 minutes...", "INFO")
        start = time.time()
        api.upload_file(
            path_or_fileobj=LOCAL_DB_PATH,
            path_in_repo=INDEXED_DB_FILENAME,
            repo_id=INDEXED_REPO_ID,
            repo_type="dataset",
            token=HF_TOKEN,
            commit_message=message or "Database checkpoint"
        )
        elapsed = time.time() - start
        # Size is in GB, so convert to megabits before dividing by seconds
        speed_mbps = (db_size_gb * 1024 * 8) / elapsed if elapsed > 0 else 0
        log_progress(f"Upload complete in {elapsed:.1f}s ({speed_mbps:.1f} Mbps)", "SUCCESS")
        log_progress(f"View at: https://huggingface.co/datasets/{INDEXED_REPO_ID}", "INFO")
        return True
    except Exception as e:
        log_progress(f"Upload failed: {e}", "ERROR")
        import traceback
        traceback.print_exc()
        return False

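# Multi-GB uploads over a shared connection fail now and then, and a failed
# checkpoint above just logs and returns False. A minimal retry wrapper
# (sketch; the attempt count and backoff are arbitrary choices, and it only
# reuses upload_database_checkpoint above - nothing calls it yet):
def upload_with_retries(message="", attempts=3, backoff_s=30):
    """Retry the checkpoint upload with a fixed pause between attempts."""
    for attempt in range(1, attempts + 1):
        if upload_database_checkpoint(message):
            return True
        if attempt < attempts:
            log_progress(f"Upload attempt {attempt} failed, retrying in {backoff_s}s...", "WARN")
            time.sleep(backoff_s)
    return False
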
def create_indexed_database():
    """Create or download the indexed database, with comprehensive checkpointing."""
    log_progress("=" * 60, "INFO")
    log_progress("STARTING DATABASE SETUP", "INFO")
    log_progress("=" * 60, "INFO")
    # Check remote progress
    progress = check_remote_progress()
    completed_indices = set(progress.get("completed_indices", []))
    analyzed_tables = set(progress.get("analyzed_tables", []))
    database_uploaded = progress.get("database_uploaded", False)
    indexing_complete = progress.get("indexing_complete", False)
    # If fully complete, download and return
    if indexing_complete:
        log_progress("Fully indexed database exists!", "SUCCESS")
        log_progress(f"Downloading from {INDEXED_REPO_ID}...", "INFO")
        try:
            indexed_path = hf_hub_download(
                repo_id=INDEXED_REPO_ID,
                filename=INDEXED_DB_FILENAME,
                repo_type="dataset",
                token=HF_TOKEN
            )
            log_progress(f"Downloaded to: {indexed_path}", "SUCCESS")
            # Verify it actually has indices
            has_indices, idx_count = verify_database_has_indices(indexed_path)
            if has_indices:
                log_progress(f"Verified {idx_count} indices present", "SUCCESS")
                return indexed_path
            else:
                log_progress(f"CORRUPTED: Only {idx_count}/4 indices found!", "ERROR")
                log_progress("The database needs to be re-indexed", "WARN")
                # Reset and rebuild
                indexing_complete = False
                completed_indices = set()
                analyzed_tables = set()
                database_uploaded = False
                update_remote_progress([], [], False, False)
        except Exception as e:
            log_progress(f"Download failed: {e}", "ERROR")
            log_progress("Will create locally", "INFO")
    # Download a partially indexed DB if a checkpoint exists
    if (completed_indices or analyzed_tables or database_uploaded) and not os.path.exists(LOCAL_DB_PATH):
        log_progress("Checkpoint detected - downloading partial DB...", "INFO")
        log_progress(f"  Indices done: {sorted(completed_indices)}", "INFO")
        log_progress(f"  Tables analyzed: {sorted(analyzed_tables)}", "INFO")
        try:
            indexed_path = hf_hub_download(
                repo_id=INDEXED_REPO_ID,
                filename=INDEXED_DB_FILENAME,
                repo_type="dataset",
                token=HF_TOKEN
            )
            log_progress("Downloaded partial DB", "SUCCESS")
            # Verify indices
            has_indices, idx_count = verify_database_has_indices(indexed_path)
            if idx_count >= len(completed_indices):
                log_progress(f"Verified {idx_count} indices (expected {len(completed_indices)})", "SUCCESS")
                log_progress(f"Copying to {LOCAL_DB_PATH}...", "DEBUG")
                start = time.time()
                shutil.copy2(indexed_path, LOCAL_DB_PATH)
                elapsed = time.time() - start
                log_progress(f"Copied in {elapsed:.1f}s", "SUCCESS")
                log_progress("Resuming from checkpoint ✅", "SUCCESS")
            else:
                log_progress(f"Index mismatch: found {idx_count}, expected {len(completed_indices)}", "ERROR")
                log_progress("Will start from scratch", "WARN")
                completed_indices = set()
                analyzed_tables = set()
        except Exception as e:
            log_progress(f"Could not download partial DB: {e}", "WARN")
            log_progress("Will start from original", "INFO")
            completed_indices = set()
            analyzed_tables = set()
    # Download the original if needed
    if not os.path.exists(LOCAL_DB_PATH):
        if completed_indices or analyzed_tables:
            log_progress("Failed to resume - clearing progress", "WARN")
            update_remote_progress([], [], False, False)
            completed_indices = set()
            analyzed_tables = set()
        log_progress("Downloading original ConceptNet database...", "INFO")
        original_path = hf_hub_download(
            repo_id=ORIGINAL_REPO_ID,
            filename=ORIGINAL_DB_FILENAME,
            repo_type="dataset"
        )
        original_size = os.path.getsize(original_path)
        free_space = shutil.disk_usage("/tmp")[2]
        log_progress(f"Original size: {original_size / (2**30):.2f} GB", "INFO")
        log_progress(f"Free space: {free_space / (2**30):.2f} GB", "INFO")
        if free_space < original_size * 2:
            raise Exception(f"Insufficient space! Need {original_size * 2 / (2**30):.1f} GB, have {free_space / (2**30):.1f} GB")
        log_progress(f"Copying to {LOCAL_DB_PATH}...", "INFO")
        start = time.time()
        shutil.copy2(original_path, LOCAL_DB_PATH)
        elapsed = time.time() - start
        log_progress(f"Copied {original_size / (2**30):.2f} GB in {elapsed:.1f}s ({original_size / elapsed / (2**20):.1f} MB/s)", "SUCCESS")
    # Only index if not complete
    if not (len(completed_indices) >= 4 and len(analyzed_tables) >= 4):
        log_progress("Indexing required", "INFO")
        # Connect
        log_progress("Opening database connection...", "DEBUG")
        conn = sqlite3.connect(LOCAL_DB_PATH)
        cursor = conn.cursor()
        # Optimizations
        log_progress("Setting PRAGMA optimizations...", "DEBUG")
        cursor.execute("PRAGMA journal_mode = WAL")
        cursor.execute("PRAGMA synchronous = NORMAL")
        cursor.execute("PRAGMA cache_size = -512000")
        cursor.execute("PRAGMA temp_store = MEMORY")
        # PHASE 1: Indices
        log_progress("=" * 60, "INFO")
        log_progress("PHASE 1: CREATING INDICES", "INFO")
        log_progress("=" * 60, "INFO")
        indices_to_create = [
            ("idx_edge_start_id", "edge", "start_id"),
            ("idx_edge_end_id", "edge", "end_id"),
            ("idx_edge_rel_id", "edge", "rel_id"),
            ("idx_node_label", "node", "label"),
        ]
        for i, (idx_name, table, column) in enumerate(indices_to_create, 1):
            if idx_name in completed_indices:
                log_progress(f"[{i}/{len(indices_to_create)}] {idx_name} - SKIPPED", "INFO")
                continue
            log_progress(f"[{i}/{len(indices_to_create)}] Creating {idx_name} on {table}({column})...", "INFO")
            start = time.time()
            try:
                cursor.execute(f"CREATE INDEX IF NOT EXISTS {idx_name} ON {table}({column})")
                conn.commit()
                elapsed = time.time() - start
                log_progress(f"  Created in {elapsed:.1f}s ({elapsed / 60:.1f} min)", "SUCCESS")
                completed_indices.add(idx_name)
                update_remote_progress(list(completed_indices), list(analyzed_tables), False, False)
                upload_database_checkpoint(f"Checkpoint: {idx_name} ({i}/{len(indices_to_create)})")
            except Exception as e:
                log_progress(f"Failed to create {idx_name}: {e}", "ERROR")
                conn.close()
                raise
        # PHASE 2: ANALYZE
        log_progress("=" * 60, "INFO")
        log_progress("PHASE 2: ANALYZING TABLES", "INFO")
        log_progress("=" * 60, "INFO")
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'")
        tables = [row[0] for row in cursor.fetchall()]
        log_progress(f"Found {len(tables)} tables: {tables}", "INFO")
        for i, table in enumerate(tables, 1):
            if table in analyzed_tables:
                log_progress(f"[{i}/{len(tables)}] {table} - SKIPPED", "INFO")
                continue
            log_progress(f"[{i}/{len(tables)}] Analyzing {table}...", "INFO")
            try:
                cursor.execute(f"SELECT COUNT(*) FROM {table}")
                row_count = cursor.fetchone()[0]
                log_progress(f"  Rows: {row_count:,}", "INFO")
            except Exception:
                log_progress("  Could not count rows", "WARN")
            start = time.time()
            try:
                cursor.execute(f"ANALYZE {table}")
                conn.commit()
                elapsed = time.time() - start
                log_progress(f"  Analyzed in {elapsed:.1f}s", "SUCCESS")
                analyzed_tables.add(table)
                update_remote_progress(list(completed_indices), list(analyzed_tables), False, False)
                upload_database_checkpoint(f"Checkpoint: {table} analyzed ({i}/{len(tables)})")
            except Exception as e:
                log_progress(f"Failed to analyze {table}: {e}", "ERROR")
                log_progress("Continuing...", "WARN")
        # Final checkpoint
        log_progress("Final WAL checkpoint...", "INFO")
        cursor.execute("PRAGMA wal_checkpoint(TRUNCATE)")
        conn.commit()
        conn.close()
        log_progress("Database closed", "SUCCESS")
    # Final upload
    log_progress("=" * 60, "INFO")
    log_progress("FINAL UPLOAD", "INFO")
    log_progress("=" * 60, "INFO")
    has_indices, idx_count = verify_database_has_indices(LOCAL_DB_PATH)
    log_progress(f"Final check: {idx_count} indices", "SUCCESS" if has_indices else "ERROR")
    upload_database_checkpoint("COMPLETE - All indices and analysis done")
    update_remote_progress(list(completed_indices), list(analyzed_tables), True, True)
    log_progress("=" * 60, "SUCCESS")
    log_progress("INDEXING COMPLETE!", "SUCCESS")
    log_progress("=" * 60, "SUCCESS")
    return LOCAL_DB_PATH

# Initialize
DB_PATH = create_indexed_database()

def get_db_connection():
    """Create an optimized connection."""
    log_progress("Creating DB connection", "DEBUG")
    conn = sqlite3.connect(DB_PATH, check_same_thread=False)
    conn.execute("PRAGMA cache_size = -256000")
    conn.execute("PRAGMA mmap_size = 4294967296")
    return conn

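# Note: the callers below use `with get_db_connection() as conn:`. For
# sqlite3, that context manager only wraps a transaction; it does NOT close
# the connection on exit, so each call leaks one handle. One way to get
# close-on-exit semantics without touching the callers' query code (a sketch;
# contextlib.closing is stdlib, and closing_db_connection is a new name):
import contextlib

def closing_db_connection():
    """Usable as `with closing_db_connection() as conn:` so the connection
    is actually closed when the block exits."""
    return contextlib.closing(get_db_connection())
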
def run_diagnostics():
    """Run comprehensive diagnostics."""
    log_progress("=" * 60, "INFO")
    log_progress("RUNNING DIAGNOSTICS", "INFO")
    log_progress("=" * 60, "INFO")
    try:
        with get_db_connection() as conn:
            cursor = conn.cursor()
            # 1. Sample nodes
            log_progress("\n1. Sample node IDs:", "INFO")
            cursor.execute("SELECT id, label FROM node LIMIT 10")
            for node_id, label in cursor.fetchall():
                print(f"  {node_id} -> {label}")
            # 2. Test the correct pattern
            log_progress("\n2. Testing CORRECT pattern (no leading %):", "INFO")
            test_pattern = f"{CONCEPTNET_BASE}/c/en/dog%"
            log_progress(f"  Pattern: {test_pattern}", "DEBUG")
            start = time.time()
            cursor.execute("SELECT id, label FROM node WHERE id LIKE ? LIMIT 5", (test_pattern,))
            results = cursor.fetchall()
            elapsed = time.time() - start
            log_progress(f"  Found {len(results)} in {elapsed:.3f}s", "SUCCESS" if elapsed < 1 else "WARN")
            for node_id, label in results:
                print(f"  {node_id} -> {label}")
            # 3. Check index usage
            log_progress("\n3. Checking index usage:", "INFO")
            cursor.execute(f"EXPLAIN QUERY PLAN SELECT * FROM edge WHERE start_id LIKE '{test_pattern}'")
            plan = cursor.fetchall()
            uses_index = any('INDEX' in str(row).upper() for row in plan)
            log_progress(f"  Uses index: {uses_index}", "SUCCESS" if uses_index else "ERROR")
            for row in plan:
                print(f"  {row}")
            # 4. Test the wrong pattern
            log_progress("\n4. Testing WRONG pattern (leading %):", "WARN")
            wrong_pattern = "%/c/en/dog%"
            log_progress(f"  Pattern: {wrong_pattern}", "DEBUG")
            start = time.time()
            cursor.execute("SELECT id, label FROM node WHERE id LIKE ? LIMIT 5", (wrong_pattern,))
            results = cursor.fetchall()
            elapsed = time.time() - start
            log_progress(f"  Found {len(results)} in {elapsed:.3f}s (SLOW!)", "WARN" if elapsed > 1 else "INFO")
            cursor.execute(f"EXPLAIN QUERY PLAN SELECT * FROM node WHERE id LIKE '{wrong_pattern}'")
            plan = cursor.fetchall()
            uses_index = any('INDEX' in str(row).upper() for row in plan)
            log_progress(f"  Uses index: {uses_index} (should be False)", "WARN" if uses_index else "INFO")
        log_progress("\n" + "=" * 60, "INFO")
        log_progress("DIAGNOSTICS COMPLETE", "SUCCESS")
        log_progress("=" * 60 + "\n", "INFO")
    except Exception as e:
        log_progress(f"Diagnostics failed: {e}", "ERROR")
        import traceback
        traceback.print_exc()

# Run diagnostics at startup
run_diagnostics()

def get_semantic_profile(word, lang='en', progress=gr.Progress()):
    """Get a semantic profile using the CORRECT URL pattern."""
    log_progress(f"Semantic profile request: word='{word}', lang='{lang}'", "DEBUG")
    progress(0, desc="Starting...")
    if not word:
        return "⚠️ Please enter a word."
    if lang not in TARGET_LANGUAGES:
        return f"⚠️ Language '{lang}' not supported. Available: {', '.join(TARGET_LANGUAGES)}"
    word = word.strip().lower().replace(' ', '_')
    # CORRECT pattern - no leading %, which allows index usage!
    like_path = f"{CONCEPTNET_BASE}/c/{lang}/{word}%"
    log_progress(f"Using pattern: {like_path}", "DEBUG")
    relations = [
        "/r/IsA", "/r/PartOf", "/r/HasA", "/r/UsedFor", "/r/CapableOf",
        "/r/Causes", "/r/HasProperty", "/r/Synonym", "/r/Antonym",
        "/r/AtLocation", "/r/RelatedTo", "/r/DerivedFrom", "/r/SimilarTo"
    ]
    output_md = f"# 🧠 Semantic Profile: '{word}' ({lang.upper()})\n\n"
    try:
        with get_db_connection() as conn:
            cursor = conn.cursor()
            progress(0.05, desc="Finding nodes...")
            start = time.time()
            cursor.execute("SELECT id, label FROM node WHERE id LIKE ? LIMIT 5", (like_path,))
            nodes = cursor.fetchall()
            elapsed = time.time() - start
            log_progress(f"Found {len(nodes)} nodes in {elapsed:.3f}s", "SUCCESS" if nodes else "WARN")
            if not nodes:
                return f"# 🧠 Semantic Profile: '{word}'\n\n⚠️ **Not found**\n\nSearched: `{like_path}`"
            for node_id, label in nodes[:3]:
                output_md += f"**Node:** `{node_id}`\n"
                output_md += f"**Label:** {label}\n\n"
                log_progress(f"  Found node: {node_id} ({label})", "DEBUG")
            total_relations = 0
            for i, rel in enumerate(relations):
                progress((i + 1) / len(relations), desc=f"Querying {rel}...")
                log_progress(f"Querying relation: {rel}", "DEBUG")
                output_md += f"## {rel}\n\n"
                has_results = False
                # Outgoing edges
                start = time.time()
                cursor.execute("""
                    SELECT en.label, e.weight
                    FROM edge e
                    JOIN node en ON e.end_id = en.id
                    JOIN relation r ON e.rel_id = r.id
                    WHERE e.start_id LIKE ? AND r.label = ?
                    ORDER BY e.weight DESC
                    LIMIT 7
                """, (like_path, rel))
                out_results = cursor.fetchall()
                elapsed = time.time() - start
                log_progress(f"  Outgoing: {len(out_results)} results in {elapsed:.3f}s", "DEBUG")
                for label, weight in out_results:
                    output_md += f"- **{word}** {rel} → *{label}* `[{weight:.3f}]`\n"
                    has_results = True
                    total_relations += 1
                # Incoming edges
                start = time.time()
                cursor.execute("""
                    SELECT s.label, e.weight
                    FROM edge e
                    JOIN node s ON e.start_id = s.id
                    JOIN relation r ON e.rel_id = r.id
                    WHERE e.end_id LIKE ? AND r.label = ?
                    ORDER BY e.weight DESC
                    LIMIT 7
                """, (like_path, rel))
                in_results = cursor.fetchall()
                elapsed = time.time() - start
                log_progress(f"  Incoming: {len(in_results)} results in {elapsed:.3f}s", "DEBUG")
                for label, weight in in_results:
                    output_md += f"- *{label}* {rel} → **{word}** `[{weight:.3f}]`\n"
                    has_results = True
                    total_relations += 1
                if not has_results:
                    output_md += "*No results*\n"
                output_md += "\n"
        progress(1.0, desc="Complete!")
        output_md += "---\n"
        output_md += f"**Total relations:** {total_relations}\n"
        log_progress(f"Profile complete: {total_relations} relations found", "SUCCESS")
        return output_md
    except Exception as e:
        log_progress(f"Error in semantic profile: {e}", "ERROR")
        import traceback
        traceback.print_exc()
        return f"**❌ Error:**\n\n```\n{e}\n```"

def run_query(start_node, relation, end_node, limit, progress=gr.Progress()):
    """Query builder using CORRECT patterns."""
    log_progress(f"Query request: start={start_node}, rel={relation}, end={end_node}, limit={limit}", "DEBUG")
    progress(0, desc="Building query...")
    query = """
        SELECT
            e.id AS edge_id,
            s.id AS start_id,
            r.label AS relation,
            en.id AS end_id,
            e.weight,
            s.label AS start_label,
            en.label AS end_label
        FROM edge e
        JOIN relation r ON e.rel_id = r.id
        JOIN node s ON e.start_id = s.id
        JOIN node en ON e.end_id = en.id
        WHERE 1=1
    """
    params = []
    try:
        with get_db_connection() as conn:
            progress(0.3, desc="Adding filters...")
            # Language filter - use the correct URL pattern!
            lang_conditions = []
            for lang in TARGET_LANGUAGES:
                lang_conditions.append(f"s.id LIKE '{CONCEPTNET_BASE}/c/{lang}/%'")
                lang_conditions.append(f"en.id LIKE '{CONCEPTNET_BASE}/c/{lang}/%'")
            query += f" AND ({' OR '.join(lang_conditions)})"
            # Start node filter
            if start_node and start_node.strip():
                if start_node.startswith('http://'):
                    pattern = f"{start_node}%"
                else:
                    # The user enters just a word; we construct the full URL
                    pattern = f"{CONCEPTNET_BASE}/c/%/{start_node}%"
                query += " AND s.id LIKE ?"
                params.append(pattern)
                log_progress(f"Start filter: {pattern}", "DEBUG")
            # Relation filter
            if relation and relation.strip():
                rel_value = relation if relation.startswith('/r/') else f"/r/{relation}"
                if '%' in relation:
                    query += " AND r.label LIKE ?"
                else:
                    query += " AND r.label = ?"
                params.append(rel_value)
                log_progress(f"Relation filter: {rel_value}", "DEBUG")
            # End node filter
            if end_node and end_node.strip():
                if end_node.startswith('http://'):
                    pattern = f"{end_node}%"
                else:
                    pattern = f"{CONCEPTNET_BASE}/c/%/{end_node}%"
                query += " AND en.id LIKE ?"
                params.append(pattern)
                log_progress(f"End filter: {pattern}", "DEBUG")
            query += " ORDER BY e.weight DESC LIMIT ?"
            params.append(limit)
            progress(0.6, desc="Executing...")
            log_progress(f"Executing query with {len(params)} params", "DEBUG")
            start_time = time.time()
            df = pd.read_sql_query(query, conn, params=params)
            elapsed = time.time() - start_time
            log_progress(f"Query complete: {len(df)} results in {elapsed:.2f}s", "SUCCESS")
            progress(1.0, desc="Complete!")
            if df.empty:
                return pd.DataFrame(), f"⚠️ No results ({elapsed:.2f}s)"
            df.columns = ['edge_id', 'start_id', 'relation', 'end_id', 'weight', 'start_label', 'end_label']
            return df, f"✅ {len(df)} results in {elapsed:.2f}s"
    except Exception as e:
        log_progress(f"Query error: {e}", "ERROR")
        import traceback
        traceback.print_exc()
        return pd.DataFrame(), f"**❌ Error:** {e}"

def run_raw_query(sql_query):
    """Execute raw SQL with logging (SELECT only)."""
    log_progress(f"Raw SQL query: {sql_query[:100]}...", "DEBUG")
    if not sql_query.strip().upper().startswith("SELECT"):
        return pd.DataFrame(), "❌ Only SELECT queries allowed"
    try:
        with get_db_connection() as conn:
            start = time.time()
            df = pd.read_sql_query(sql_query, conn)
            elapsed = time.time() - start
        log_progress(f"Raw query complete: {len(df)} rows in {elapsed:.3f}s", "SUCCESS")
        return df, f"✅ {len(df)} rows in {elapsed:.3f}s"
    except Exception as e:
        log_progress(f"Raw query error: {e}", "ERROR")
        return pd.DataFrame(), f"❌ Error: {e}"

def get_schema_info():
    """Get the schema, with sample queries."""
    log_progress("Loading schema info", "DEBUG")
    md = "# 📊 Database Schema\n\n"
    md += f"**Repository:** [{INDEXED_REPO_ID}](https://huggingface.co/datasets/{INDEXED_REPO_ID})\n\n"
    md += f"**Base URL:** `{CONCEPTNET_BASE}`\n\n"
    md += "## Sample Queries\n\n"
    md += "**Finding nodes:**\n```sql\n"
    md += "-- English 'dog'\n"
    md += f"SELECT * FROM node WHERE id LIKE '{CONCEPTNET_BASE}/c/en/dog%';\n\n"
    md += "-- German 'hund'\n"
    md += f"SELECT * FROM node WHERE id LIKE '{CONCEPTNET_BASE}/c/de/hund%';\n"
    md += "```\n\n"
    md += "**Finding edges:**\n```sql\n"
    md += "-- Edges from 'dog'\n"
    md += f"SELECT * FROM edge WHERE start_id LIKE '{CONCEPTNET_BASE}/c/en/dog%' LIMIT 10;\n"
    md += "```\n\n"
    md += "⚠️ **Important:** Do NOT use a leading `%` in LIKE queries (it prevents index usage!)\n\n"
    md += "✅ **Good:** `LIKE 'http://conceptnet.io/c/en/dog%'`\n"
    md += "❌ **Bad:** `LIKE '%/c/en/dog%'`\n\n"
    try:
        with get_db_connection() as conn:
            cursor = conn.cursor()
            md += "## Tables\n\n"
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'")
            for table, in cursor.fetchall():
                cursor.execute(f"SELECT COUNT(*) FROM {table}")
                count = cursor.fetchone()[0]
                md += f"### {table} ({count:,} rows)\n\n"
                # Show columns
                cursor.execute(f"PRAGMA table_info({table})")
                cols = cursor.fetchall()
                md += "| Column | Type |\n|:--|:--|\n"
                for col in cols:
                    md += f"| `{col[1]}` | `{col[2]}` |\n"
                # Show indices
                cursor.execute(f"PRAGMA index_list({table})")
                indices = cursor.fetchall()
                if indices:
                    md += f"\n**Indices ({len(indices)}):**\n"
                    for idx in indices:
                        custom = " ⭐" if idx[1].startswith("idx_") else ""
                        md += f"- `{idx[1]}`{custom}\n"
                md += "\n"
        log_progress("Schema loaded successfully", "SUCCESS")
    except Exception as e:
        log_progress(f"Schema error: {e}", "ERROR")
        md += f"\n**Error loading schema:** {e}\n"
    return md

# UI
with gr.Blocks(title="ConceptNet Explorer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧠 ConceptNet Explorer")
    gr.Markdown(
        f"**Multi-language semantic network explorer** | "
        f"**Languages:** {', '.join([l.upper() for l in TARGET_LANGUAGES])} | "
        f"**Repo:** [{INDEXED_REPO_ID}](https://huggingface.co/datasets/{INDEXED_REPO_ID})"
    )
    gr.Markdown("✅ **Optimized with custom indices** - fast queries using correct URL patterns")
    with gr.Tabs():
        with gr.TabItem("🔍 Semantic Profile"):
            gr.Markdown("**Explore semantic relations for any word**")
            with gr.Row():
                word_input = gr.Textbox(
                    label="Word",
                    placeholder="dog",
                    value="dog",
                    info="Enter a word to explore"
                )
                lang_input = gr.Dropdown(
                    choices=TARGET_LANGUAGES,
                    value="en",
                    label="Language",
                    info="Select language"
                )
            semantic_btn = gr.Button("🔍 Get Semantic Profile", variant="primary", size="lg")
            semantic_output = gr.Markdown("*Enter a word and click the button to start...*")
            gr.Markdown("**Examples:** dog (en), hund (de), perro (es), chien (fr), 犬 (ja)")
        with gr.TabItem("⚡ Query Builder"):
            gr.Markdown("**Build custom queries to find specific relationships**")
            with gr.Row():
                start_input = gr.Textbox(
                    label="Start Node",
                    placeholder="dog",
                    info="Enter a word or full URL"
                )
                rel_input = gr.Textbox(
                    label="Relation",
                    placeholder="IsA",
                    value="IsA",
                    info="e.g., IsA, PartOf, UsedFor"
                )
                end_input = gr.Textbox(
                    label="End Node",
                    placeholder="",
                    info="Leave empty for all"
                )
            limit_slider = gr.Slider(
                label="Result Limit",
                minimum=1,
                maximum=200,
                value=50,
                step=1
            )
            query_btn = gr.Button("▶️ Run Query", variant="primary", size="lg")
            status_output = gr.Markdown("*Ready to query...*")
            results_output = gr.DataFrame(
                label="Results",
                wrap=True,
                interactive=False
            )
        with gr.TabItem("💻 Raw SQL"):
            gr.Markdown("**Execute custom SQL queries** (SELECT only)")
            raw_sql_input = gr.Textbox(
                label="SQL Query",
                value=f"SELECT * FROM node WHERE id LIKE '{CONCEPTNET_BASE}/c/en/dog%' LIMIT 10",
                lines=5,
                info="Write your SELECT query"
            )
            raw_btn = gr.Button("▶️ Execute Query", variant="secondary", size="lg")
            raw_status = gr.Markdown()
            raw_results = gr.DataFrame(label="Query Results", wrap=True)
            gr.Markdown(
                "**Tips:**\n"
                "- Always use `LIMIT` to prevent timeouts\n"
                f"- Node IDs start with: `{CONCEPTNET_BASE}/c/{{lang}}/{{word}}`\n"
                "- Don't use a leading `%` in LIKE queries for best performance"
            )
        with gr.TabItem("📊 Schema & Info"):
            gr.Markdown("**Database schema and structure information**")
            schema_btn = gr.Button("📊 Load Schema", variant="secondary", size="lg")
            schema_output = gr.Markdown("*Click the button to load the schema...*")
    gr.Markdown(
        "---\n"
        "**Performance:** Custom indices on `edge.start_id`, `edge.end_id`, `edge.rel_id`, `node.label` | "
        "**Check the server logs for detailed query timing and diagnostics**"
    )
    # Wire up event handlers
    semantic_btn.click(
        fn=get_semantic_profile,
        inputs=[word_input, lang_input],
        outputs=semantic_output
    )
    query_btn.click(
        fn=run_query,
        inputs=[start_input, rel_input, end_input, limit_slider],
        outputs=[results_output, status_output]
    )
    raw_btn.click(
        fn=run_raw_query,
        inputs=raw_sql_input,
        outputs=[raw_results, raw_status]
    )
    schema_btn.click(
        fn=get_schema_info,
        inputs=None,
        outputs=schema_output
    )

if __name__ == "__main__":
    log_progress("=" * 60, "SUCCESS")
    log_progress("APP READY!", "SUCCESS")
    log_progress("=" * 60, "SUCCESS")
    log_progress(f"Database: {DB_PATH}", "INFO")
    log_progress(f"Size: {os.path.getsize(DB_PATH) / (2**30):.2f} GB", "INFO")
    log_progress("=" * 60 + "\n", "SUCCESS")
    demo.launch(ssr_mode=False)