import gradio as gr import sqlite3 import pandas as pd from huggingface_hub import hf_hub_download, HfApi, HfFolder import os import time import shutil from pathlib import Path import json # ===== CONFIGURATION ===== TARGET_LANGUAGES = ['de'] INDEXED_REPO_ID = "cstr/conceptnet-de-indexed" INDEXED_DB_FILENAME = "conceptnet-de-indexed.db" PROGRESS_FILENAME = "indexing_progress.json" LOCAL_DB_PATH = "/tmp/conceptnet-indexed.db" # ========================= print(f"šŸŒ Filtering to: {', '.join([l.upper() for l in TARGET_LANGUAGES])}") # Get HF token HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") if not HF_TOKEN: try: HF_TOKEN = HfFolder.get_token() except: pass if not HF_TOKEN: print("āš ļø WARNING: No HF_TOKEN found!") print(" Add HF_TOKEN in Space settings to enable checkpointing") # Original database ORIGINAL_REPO_ID = "ysenarath/conceptnet-sqlite" ORIGINAL_DB_FILENAME = "data/conceptnet-v5.7.0.db" def log_progress(message, level="INFO"): """Enhanced logging with timestamp""" timestamp = time.strftime("%H:%M:%S") prefix = { "INFO": "ā„¹ļø ", "SUCCESS": "āœ…", "ERROR": "āŒ", "WARN": "āš ļø ", "CHECKPOINT": "šŸ’¾" }.get(level, "") print(f"[{timestamp}] {prefix} {message}") def check_remote_progress(): """Check remote progress with detailed logging""" if not HF_TOKEN: log_progress("No HF_TOKEN - cannot check remote progress", "WARN") return { "completed_indices": [], "analyzed_tables": [], "database_uploaded": False, "indexing_complete": False } try: api = HfApi() # Check if repo exists try: api.repo_info(repo_id=INDEXED_REPO_ID, repo_type="dataset", token=HF_TOKEN) log_progress(f"Repository exists: {INDEXED_REPO_ID}", "SUCCESS") except: log_progress("Repository doesn't exist yet", "INFO") return { "completed_indices": [], "analyzed_tables": [], "database_uploaded": False, "indexing_complete": False } # Download progress file try: progress_path = hf_hub_download( repo_id=INDEXED_REPO_ID, filename=PROGRESS_FILENAME, repo_type="dataset", token=HF_TOKEN ) with open(progress_path, 'r') as f: progress = json.load(f) log_progress("Remote progress loaded:", "INFO") log_progress(f" Completed indices: {progress.get('completed_indices', [])}", "INFO") log_progress(f" Analyzed tables: {progress.get('analyzed_tables', [])}", "INFO") log_progress(f" Indexing complete: {progress.get('indexing_complete', False)}", "INFO") return progress except Exception as e: log_progress("No progress file found (starting fresh)", "INFO") return { "completed_indices": [], "analyzed_tables": [], "database_uploaded": False, "indexing_complete": False } except Exception as e: log_progress(f"Error checking remote: {e}", "ERROR") return { "completed_indices": [], "analyzed_tables": [], "database_uploaded": False, "indexing_complete": False } def update_remote_progress(completed_indices, analyzed_tables=None, database_uploaded=False, indexing_complete=False): """Update progress with detailed tracking""" if not HF_TOKEN: log_progress("Cannot update progress: No HF_TOKEN", "WARN") return False if analyzed_tables is None: analyzed_tables = [] try: api = HfApi() # Create repo if needed try: api.repo_info(repo_id=INDEXED_REPO_ID, repo_type="dataset", token=HF_TOKEN) except: log_progress(f"Creating repository: {INDEXED_REPO_ID}", "INFO") api.create_repo( repo_id=INDEXED_REPO_ID, repo_type="dataset", token=HF_TOKEN, private=False ) # Create progress file progress = { "completed_indices": completed_indices, "analyzed_tables": analyzed_tables, "database_uploaded": database_uploaded, "indexing_complete": indexing_complete, "timestamp": time.time(), "languages": TARGET_LANGUAGES } progress_path = "/tmp/indexing_progress.json" with open(progress_path, 'w') as f: json.dump(progress, f, indent=2) # Upload api.upload_file( path_or_fileobj=progress_path, path_in_repo=PROGRESS_FILENAME, repo_id=INDEXED_REPO_ID, repo_type="dataset", token=HF_TOKEN, commit_message=f"Progress: {len(completed_indices)} indices, {len(analyzed_tables)} tables analyzed" ) log_progress(f"Progress updated: {len(completed_indices)} indices, {len(analyzed_tables)} tables analyzed", "CHECKPOINT") return True except Exception as e: log_progress(f"Failed to update progress: {e}", "ERROR") return False def upload_database_checkpoint(message=""): """Upload database with progress reporting""" if not HF_TOKEN: log_progress("Cannot upload: No HF_TOKEN", "WARN") return False if not os.path.exists(LOCAL_DB_PATH): log_progress("Database file doesn't exist", "ERROR") return False try: api = HfApi() db_size = os.path.getsize(LOCAL_DB_PATH) / (2**30) log_progress(f"Uploading database checkpoint ({db_size:.2f} GB)...", "CHECKPOINT") log_progress(f" {message}", "INFO") log_progress(f" This may take 5-10 minutes...", "INFO") start = time.time() api.upload_file( path_or_fileobj=LOCAL_DB_PATH, path_in_repo=INDEXED_DB_FILENAME, repo_id=INDEXED_REPO_ID, repo_type="dataset", token=HF_TOKEN, commit_message=message or "Database checkpoint" ) elapsed = time.time() - start log_progress(f"Database uploaded in {elapsed:.1f}s ({db_size*8/elapsed:.1f} Mbps)", "SUCCESS") return True except Exception as e: log_progress(f"Upload failed: {e}", "ERROR") import traceback traceback.print_exc() return False def create_indexed_database(): """Create indexed database with comprehensive checkpointing""" log_progress("="*60, "INFO") log_progress("STARTING INDEXED DATABASE CREATION", "INFO") log_progress("="*60, "INFO") # Check remote progress progress = check_remote_progress() completed_indices = set(progress.get("completed_indices", [])) analyzed_tables = set(progress.get("analyzed_tables", [])) database_uploaded = progress.get("database_uploaded", False) indexing_complete = progress.get("indexing_complete", False) # If fully complete, download and return if indexing_complete: log_progress("Fully indexed database exists!", "SUCCESS") log_progress(f"Downloading from {INDEXED_REPO_ID}...", "INFO") try: indexed_path = hf_hub_download( repo_id=INDEXED_REPO_ID, filename=INDEXED_DB_FILENAME, repo_type="dataset", token=HF_TOKEN ) log_progress(f"Downloaded: {indexed_path}", "SUCCESS") return indexed_path except Exception as e: log_progress(f"Download failed: {e}", "ERROR") log_progress("Will create locally", "INFO") # Check for partial progress if completed_indices or analyzed_tables: log_progress("Resuming from checkpoint:", "INFO") log_progress(f" Completed indices: {sorted(completed_indices)}", "INFO") log_progress(f" Analyzed tables: {sorted(analyzed_tables)}", "INFO") # Get or create local database if os.path.exists(LOCAL_DB_PATH) and (completed_indices or analyzed_tables): log_progress("Using existing local database", "SUCCESS") elif database_uploaded: log_progress("Downloading partial database from HF...", "INFO") try: remote_db = hf_hub_download( repo_id=INDEXED_REPO_ID, filename=INDEXED_DB_FILENAME, repo_type="dataset", token=HF_TOKEN ) shutil.copy2(remote_db, LOCAL_DB_PATH) log_progress("Downloaded partial database", "SUCCESS") except: log_progress("No partial database, starting from original", "INFO") if not os.path.exists(LOCAL_DB_PATH): # Download and copy original log_progress("Downloading original database...", "INFO") original_path = hf_hub_download( repo_id=ORIGINAL_REPO_ID, filename=ORIGINAL_DB_FILENAME, repo_type="dataset" ) original_size = os.path.getsize(original_path) free_space = shutil.disk_usage("/tmp")[2] log_progress(f"Original size: {original_size / (2**30):.2f} GB", "INFO") log_progress(f"Free space: {free_space / (2**30):.2f} GB", "INFO") if free_space < original_size * 2: raise Exception(f"Not enough space! Need {original_size * 2 / (2**30):.1f} GB") log_progress(f"Copying to {LOCAL_DB_PATH}...", "INFO") start = time.time() shutil.copy2(original_path, LOCAL_DB_PATH) elapsed = time.time() - start log_progress(f"Copied in {elapsed:.1f}s", "SUCCESS") # Connect to database conn = sqlite3.connect(LOCAL_DB_PATH) cursor = conn.cursor() # Enable optimizations cursor.execute("PRAGMA journal_mode = WAL") cursor.execute("PRAGMA synchronous = NORMAL") cursor.execute("PRAGMA cache_size = -512000") # PHASE 1: Create Indices log_progress("="*60, "INFO") log_progress("PHASE 1: CREATING INDICES", "INFO") log_progress("="*60, "INFO") indices_to_create = [ ("idx_edge_start_id", "edge", "start_id"), ("idx_edge_end_id", "edge", "end_id"), ("idx_edge_rel_id", "edge", "rel_id"), ("idx_node_label", "node", "label"), ] for i, (idx_name, table, column) in enumerate(indices_to_create, 1): if idx_name in completed_indices: log_progress(f"[{i}/{len(indices_to_create)}] {idx_name} - SKIPPED (already complete)", "INFO") continue log_progress(f"[{i}/{len(indices_to_create)}] Creating {idx_name} on {table}({column})...", "INFO") start = time.time() try: cursor.execute(f"CREATE INDEX IF NOT EXISTS {idx_name} ON {table}({column})") conn.commit() elapsed = time.time() - start log_progress(f" Index created in {elapsed:.1f}s ({elapsed/60:.1f} min)", "SUCCESS") # Update progress completed_indices.add(idx_name) update_remote_progress( list(completed_indices), list(analyzed_tables), database_uploaded=False, indexing_complete=False ) # Upload checkpoint upload_database_checkpoint(f"Checkpoint: {idx_name} created") except Exception as e: log_progress(f"Failed to create {idx_name}: {e}", "ERROR") conn.close() raise # PHASE 2: Analyze Tables (per-table with checkpoints) log_progress("="*60, "INFO") log_progress("PHASE 2: ANALYZING TABLES", "INFO") log_progress("="*60, "INFO") # Get list of tables cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'") tables = [row[0] for row in cursor.fetchall()] log_progress(f"Found {len(tables)} tables to analyze: {tables}", "INFO") for i, table in enumerate(tables, 1): if table in analyzed_tables: log_progress(f"[{i}/{len(tables)}] {table} - SKIPPED (already analyzed)", "INFO") continue log_progress(f"[{i}/{len(tables)}] Analyzing table: {table}", "INFO") # Get table size for progress estimation cursor.execute(f"SELECT COUNT(*) FROM {table}") row_count = cursor.fetchone()[0] log_progress(f" Table has {row_count:,} rows", "INFO") start = time.time() try: # Run ANALYZE on this specific table cursor.execute(f"ANALYZE {table}") conn.commit() elapsed = time.time() - start log_progress(f" Analyzed in {elapsed:.1f}s ({elapsed/60:.1f} min)", "SUCCESS") # Update progress analyzed_tables.add(table) update_remote_progress( list(completed_indices), list(analyzed_tables), database_uploaded=False, indexing_complete=False ) # Upload checkpoint after each table log_progress(f" Uploading checkpoint after analyzing {table}...", "CHECKPOINT") upload_database_checkpoint(f"Checkpoint: {table} analyzed ({i}/{len(tables)})") except Exception as e: log_progress(f"Failed to analyze {table}: {e}", "ERROR") log_progress("Continuing with next table...", "WARN") conn.close() # PHASE 3: Final upload and completion log_progress("="*60, "INFO") log_progress("PHASE 3: FINAL UPLOAD", "INFO") log_progress("="*60, "INFO") log_progress("All indexing and analysis complete!", "SUCCESS") log_progress("Performing final upload...", "INFO") upload_database_checkpoint("Final indexed database - COMPLETE") # Mark as complete update_remote_progress( list(completed_indices), list(analyzed_tables), database_uploaded=True, indexing_complete=True ) indexed_size = os.path.getsize(LOCAL_DB_PATH) log_progress("="*60, "SUCCESS") log_progress("INDEXING COMPLETE!", "SUCCESS") log_progress("="*60, "SUCCESS") log_progress(f"Final size: {indexed_size / (2**30):.2f} GB", "INFO") log_progress(f"Indices: {sorted(completed_indices)}", "INFO") log_progress(f"Analyzed: {sorted(analyzed_tables)}", "INFO") log_progress(f"Saved to: https://huggingface.co/datasets/{INDEXED_REPO_ID}", "INFO") log_progress("="*60, "SUCCESS") return LOCAL_DB_PATH # Initialize database DB_PATH = create_indexed_database() def get_db_connection(): """Create optimized connection""" conn = sqlite3.connect(DB_PATH, check_same_thread=False) conn.execute("PRAGMA cache_size = -256000") conn.execute("PRAGMA mmap_size = 4294967296") return conn def verify_indices(): """Verify indices""" log_progress("="*60, "INFO") log_progress("VERIFYING INDICES", "INFO") log_progress("="*60, "INFO") with get_db_connection() as conn: cursor = conn.cursor() cursor.execute("SELECT name FROM sqlite_master WHERE type='index' AND name LIKE 'idx_%'") custom_indices = cursor.fetchall() log_progress(f"Custom indices: {len(custom_indices)}", "INFO") for idx in custom_indices: log_progress(f" āœ“ {idx[0]}", "SUCCESS") # Speed test log_progress("Running speed test...", "INFO") start = time.time() cursor.execute("SELECT COUNT(*) FROM edge WHERE start_id LIKE '/c/de/hund%'") count = cursor.fetchone()[0] elapsed = time.time() - start status = "SUCCESS" if elapsed < 1 else "WARN" if elapsed < 5 else "ERROR" log_progress(f"Query: {count} results in {elapsed:.3f}s", status) log_progress("="*60, "INFO") verify_indices() def get_semantic_profile(word, lang='de', progress=gr.Progress()): """Semantic profile""" progress(0, desc="Starting...") if not word: return "āš ļø Please enter a word." word = word.strip().lower().replace(' ', '_') like_path = f"/c/{lang}/{word}%" relations = [ "/r/IsA", "/r/PartOf", "/r/HasA", "/r/UsedFor", "/r/CapableOf", "/r/Causes", "/r/HasProperty", "/r/Synonym", "/r/Antonym", "/r/AtLocation", "/r/RelatedTo" ] output_md = f"# 🧠 Semantic Profile: '{word}'\n\n" try: with get_db_connection() as conn: cursor = conn.cursor() progress(0.05, desc="Finding nodes...") cursor.execute("SELECT id, label FROM node WHERE id LIKE ? LIMIT 5", (like_path,)) nodes = cursor.fetchall() if not nodes: return f"# 🧠 Semantic Profile: '{word}'\n\nāš ļø **Not found**" for node_id, label in nodes[:3]: output_md += f"**Node:** `{node_id}` ({label})\n" output_md += "\n" total = 0 for i, rel in enumerate(relations): progress((i + 1) / len(relations), desc=f"Querying {rel}...") output_md += f"## {rel}\n\n" found = False # Outgoing cursor.execute(""" SELECT en.label, e.weight FROM edge e JOIN node en ON e.end_id = en.id JOIN relation r ON e.rel_id = r.id WHERE e.start_id LIKE ? AND r.label = ? ORDER BY e.weight DESC LIMIT 7 """, (like_path, rel)) for label, weight in cursor.fetchall(): output_md += f"- **{word}** {rel} → *{label}* `[{weight:.3f}]`\n" found = True total += 1 # Incoming cursor.execute(""" SELECT s.label, e.weight FROM edge e JOIN node s ON e.start_id = s.id JOIN relation r ON e.rel_id = r.id WHERE e.end_id LIKE ? AND r.label = ? ORDER BY e.weight DESC LIMIT 7 """, (like_path, rel)) for label, weight in cursor.fetchall(): output_md += f"- *{label}* {rel} → **{word}** `[{weight:.3f}]`\n" found = True total += 1 if not found: output_md += "*No results*\n" output_md += "\n" progress(1.0, desc="Complete!") output_md += f"---\n**Total:** {total} relations\n" return output_md except Exception as e: import traceback traceback.print_exc() return f"**āŒ Error:** {e}" def run_query(start_node, relation, end_node, limit, progress=gr.Progress()): """Query builder""" progress(0, desc="Starting...") query = """ SELECT e.id, s.id, r.label, en.id, e.weight, s.label, en.label FROM edge e JOIN relation r ON e.rel_id = r.id JOIN node s ON e.start_id = s.id JOIN node en ON e.end_id = en.id WHERE 1=1 """ params = [] try: with get_db_connection() as conn: progress(0.3, desc="Building...") # Language filter lang_cond = [] for lang in TARGET_LANGUAGES: lang_cond.append(f"s.id LIKE '/c/{lang}/%'") lang_cond.append(f"en.id LIKE '/c/{lang}/%'") query += f" AND ({' OR '.join(lang_cond)})" if start_node and start_node.strip(): pattern = start_node if '%' in start_node else f"%{start_node}%" query += " AND s.id LIKE ?" params.append(pattern) if relation and relation.strip(): rel_value = relation if relation.startswith('/r/') else f"/r/{relation}" query += " AND r.label = ?" if '%' not in relation else " AND r.label LIKE ?" params.append(rel_value) if end_node and end_node.strip(): pattern = end_node if '%' in end_node else f"%{end_node}%" query += " AND en.id LIKE ?" params.append(pattern) query += " ORDER BY e.weight DESC LIMIT ?" params.append(limit) progress(0.6, desc="Executing...") start_time = time.time() df = pd.read_sql_query(query, conn, params=params) elapsed = time.time() - start_time progress(1.0, desc="Complete!") if df.empty: return pd.DataFrame(), f"āš ļø No results ({elapsed:.2f}s)" df.columns = ['edge_id', 'start_id', 'relation', 'end_id', 'weight', 'start_label', 'end_label'] return df, f"āœ… {len(df)} results in {elapsed:.2f}s" except Exception as e: import traceback traceback.print_exc() return pd.DataFrame(), f"**āŒ Error:** {e}" def run_raw_query(sql_query): """Raw SQL""" if not sql_query.strip().upper().startswith("SELECT"): return pd.DataFrame(), "Only SELECT allowed" try: with get_db_connection() as conn: df = pd.read_sql_query(sql_query, conn) return df, f"āœ… {len(df)} rows" except Exception as e: return pd.DataFrame(), f"Error: {e}" def get_schema_info(): """Schema info""" with get_db_connection() as conn: cursor = conn.cursor() md = f"# šŸ“š Schema\n\n**Repo:** [{INDEXED_REPO_ID}](https://huggingface.co/datasets/{INDEXED_REPO_ID})\n\n" cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'") for table, in cursor.fetchall(): cursor.execute(f"SELECT COUNT(*) FROM {table}") md += f"## {table} ({cursor.fetchone()[0]:,} rows)\n\n" return md # UI with gr.Blocks(title="ConceptNet", theme=gr.themes.Soft()) as demo: gr.Markdown(f"# 🧠 ConceptNet ({', '.join([l.upper() for l in TARGET_LANGUAGES])})") gr.Markdown(f"**Repo:** [{INDEXED_REPO_ID}](https://huggingface.co/datasets/{INDEXED_REPO_ID}) | āœ… Per-table checkpoints") with gr.Tabs(): with gr.TabItem("šŸ” Profile"): with gr.Row(): word_input = gr.Textbox(label="Word", placeholder="hund", value="hund") lang_input = gr.Dropdown(choices=TARGET_LANGUAGES, value=TARGET_LANGUAGES[0], label="Lang") semantic_btn = gr.Button("šŸ” Get Profile", variant="primary", size="lg") semantic_output = gr.Markdown() with gr.TabItem("⚔ Query"): with gr.Row(): start_input = gr.Textbox(label="Start", placeholder="hund", value="hund") rel_input = gr.Textbox(label="Relation", placeholder="IsA", value="IsA") end_input = gr.Textbox(label="End", placeholder="") limit_slider = gr.Slider(label="Limit", minimum=1, maximum=200, value=50) query_btn = gr.Button("ā–¶ļø Run", variant="primary", size="lg") status_output = gr.Markdown() results_output = gr.DataFrame(wrap=True) with gr.TabItem("šŸ’» SQL"): raw_sql_input = gr.Textbox(label="SQL", value="SELECT * FROM edge WHERE start_id LIKE '/c/de/hund%' LIMIT 10", lines=3) raw_btn = gr.Button("ā–¶ļø Execute") raw_status = gr.Markdown() raw_results = gr.DataFrame() with gr.TabItem("šŸ“Š Schema"): schema_btn = gr.Button("šŸ“Š Load") schema_output = gr.Markdown() gr.Markdown("---\nāœ… **Per-table ANALYZE with checkpoints!** Check server logs for detailed progress.") semantic_btn.click(get_semantic_profile, [word_input, lang_input], semantic_output) query_btn.click(run_query, [start_input, rel_input, end_input, limit_slider], [results_output, status_output]) raw_btn.click(run_raw_query, raw_sql_input, [raw_results, raw_status]) schema_btn.click(get_schema_info, None, schema_output) if __name__ == "__main__": log_progress("App ready with per-table ANALYZE checkpoints!", "SUCCESS") demo.launch(ssr_mode=False)