import gradio as gr import sqlite3 import pandas as pd from huggingface_hub import hf_hub_download, HfApi import os import time import json from typing import Dict, List, Optional # ===== CONFIGURATION ===== TARGET_LANGUAGES = ['de', 'en', 'es', 'fr', 'it', 'ja', 'nl', 'pl', 'pt', 'ru', 'zh'] INDEXED_REPO_ID = "cstr/conceptnet-de-indexed" INDEXED_DB_FILENAME = "conceptnet-de-indexed.db" PROGRESS_FILENAME = "indexing_progress.json" CONCEPTNET_BASE = "http://conceptnet.io" # ========================= # --- All relations MUST be full URLs --- CONCEPTNET_RELATIONS: Dict[str, str] = { "RelatedTo": f"{CONCEPTNET_BASE}/r/RelatedTo", "IsA": f"{CONCEPTNET_BASE}/r/IsA", "PartOf": f"{CONCEPTNET_BASE}/r/PartOf", "HasA": f"{CONCEPTNET_BASE}/r/HasA", "UsedFor": f"{CONCEPTNET_BASE}/r/UsedFor", "CapableOf": f"{CONCEPTNET_BASE}/r/CapableOf", "AtLocation": f"{CONCEPTNET_BASE}/r/AtLocation", "Causes": f"{CONCEPTNET_BASE}/r/Causes", "HasSubevent": f"{CONCEPTNET_BASE}/r/HasSubevent", "HasFirstSubevent": f"{CONCEPTNET_BASE}/r/HasFirstSubevent", "HasLastSubevent": f"{CONCEPTNET_BASE}/r/HasLastSubevent", "HasPrerequisite": f"{CONCEPTNET_BASE}/r/HasPrerequisite", "HasProperty": f"{CONCEPTNET_BASE}/r/HasProperty", "MotivatedByGoal": f"{CONCEPTNET_BASE}/r/MotivatedByGoal", "ObstructedBy": f"{CONCEPTNET_BASE}/r/ObstructedBy", "Desires": f"{CONCEPTNET_BASE}/r/Desires", "CreatedBy": f"{CONCEPTNET_BASE}/r/CreatedBy", "Synonym": f"{CONCEPTNET_BASE}/r/Synonym", "Antonym": f"{CONCEPTNET_BASE}/r/Antonym", "DistinctFrom": f"{CONCEPTNET_BASE}/r/DistinctFrom", "DerivedFrom": f"{CONCEPTNET_BASE}/r/DerivedFrom", "SymbolOf": f"{CONCEPTNET_BASE}/r/SymbolOf", "DefinedAs": f"{CONCEPTNET_BASE}/r/DefinedAs", "MannerOf": f"{CONCEPTNET_BASE}/r/MannerOf", "LocatedNear": f"{CONCEPTNET_BASE}/r/LocatedNear", "HasContext": f"{CONCEPTNET_BASE}/r/HasContext", "SimilarTo": f"{CONCEPTNET_BASE}/r/SimilarTo", "EtymologicallyRelatedTo": f"{CONCEPTNET_BASE}/r/EtymologicallyRelatedTo", "EtymologicallyDerivedFrom": f"{CONCEPTNET_BASE}/r/EtymologicallyDerivedFrom", "CausesDesire": f"{CONCEPTNET_BASE}/r/CausesDesire", "MadeOf": f"{CONCEPTNET_BASE}/r/MadeOf", "ReceivesAction": f"{CONCEPTNET_BASE}/r/ReceivesAction", "ExternalURL": f"{CONCEPTNET_BASE}/r/ExternalURL", "NotDesires": f"{CONCEPTNET_BASE}/r/NotDesires", "NotUsedFor": f"{CONCEPTNET_BASE}/r/NotUsedFor", "NotCapableOf": f"{CONCEPTNET_BASE}/r/NotCapableOf", "NotHasProperty": f"{CONCEPTNET_BASE}/r/NotHasProperty", } # ========================= print(f"šŸŒ Languages: {', '.join([l.upper() for l in TARGET_LANGUAGES])}") print(f"šŸ“š Relations: {len(CONCEPTNET_RELATIONS)} relations loaded") HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HF_API_TOKEN") def log_progress(message, level="INFO"): """Simple logger with timestamp and emoji prefix.""" timestamp = time.strftime("%H:%M:%S") prefix = {"INFO": "ā„¹ļø ", "SUCCESS": "āœ…", "ERROR": "āŒ", "WARN": "āš ļø ", "DEBUG": "šŸ”"}.get(level, "") print(f"[{timestamp}] {prefix} {message}") def check_remote_progress(): """Check Hugging Face Hub for indexing progress.""" if not HF_TOKEN: return {"indexing_complete": False} try: api = HfApi() api.repo_info(repo_id=INDEXED_REPO_ID, repo_type="dataset", token=HF_TOKEN) progress_path = hf_hub_download(repo_id=INDEXED_REPO_ID, filename=PROGRESS_FILENAME, repo_type="dataset", token=HF_TOKEN) with open(progress_path, 'r') as f: return json.load(f) except: return {"indexing_complete": False} def create_indexed_database(): """Download the pre-indexed database from HF Hub if indexing is complete.""" progress = check_remote_progress() if progress.get("indexing_complete"): log_progress("Remote indexing is complete. Downloading DB...", "INFO") try: return hf_hub_download(repo_id=INDEXED_REPO_ID, filename=INDEXED_DB_FILENAME, repo_type="dataset", token=HF_TOKEN) except Exception as e: log_progress(f"Failed to download indexed DB: {e}", "ERROR") pass log_progress("Remote indexing not complete or DB not found.", "WARN") return None DB_PATH = create_indexed_database() if not DB_PATH: log_progress("DATABASE NOT FOUND. App will not function.", "ERROR") else: log_progress(f"Database loaded from: {DB_PATH}", "SUCCESS") def get_db_connection(): """Get a thread-safe connection to the SQLite database.""" if not DB_PATH: raise Exception("Database path is not set. Cannot create connection.") conn = sqlite3.connect(DB_PATH, check_same_thread=False) conn.execute("PRAGMA cache_size = -256000") # 256MB cache conn.execute("PRAGMA temp_store = MEMORY") return conn def get_semantic_profile(word: str, lang: str = 'en', selected_relations: List[str] = None, progress=gr.Progress()): """ Get semantic profile for a word. FIX: This is now a generator to stream updates and show progress. FIX: Uses exact node match first for performance. """ log_progress(f"Profile: {word} ({lang})", "INFO") if not word or lang not in TARGET_LANGUAGES: yield "āš ļø Invalid input" return if not DB_PATH: yield "āŒ **Error:** Database file not found." return # Set default relations if none are selected if not selected_relations: selected_relations = [ "IsA", "RelatedTo", "PartOf", "HasA", "UsedFor", "CapableOf", "Synonym", "Antonym" ] word = word.strip().lower().replace(' ', '_') exact_path = f"{CONCEPTNET_BASE}/c/{lang}/{word}" output_md = f"# 🧠 Semantic Profile: '{word}' ({lang.upper()})\n\n" try: with get_db_connection() as conn: cursor = conn.cursor() progress(0, desc="Starting...") yield output_md progress(0.05, desc="Finding nodes...") # --- PERFORMANCE FIX: Try exact match first --- cursor.execute("SELECT id, label FROM node WHERE id = ?", (exact_path,)) exact_node = cursor.fetchone() query_path = None query_operator = "" nodes = [] if exact_node: log_progress(f"Found exact node: {exact_node[0]}", "SUCCESS") nodes = [exact_node] query_path = exact_path query_operator = "=" # Use fast exact match else: log_progress(f"No exact node, falling back to LIKE...", "WARN") like_path = f"{exact_path}%" cursor.execute("SELECT id, label FROM node WHERE id LIKE ? LIMIT 5", (like_path,)) nodes = cursor.fetchall() if not nodes: yield f"# 🧠 '{word}'\n\nāš ļø Not found" return query_path = like_path # Use slower LIKE match query_operator = "LIKE" log_progress(f"Using path: {query_path} (op: {query_operator})", "INFO") for node_id, label in nodes[:3]: output_md += f"**Node:** `{node_id}` → **{label}**\n" output_md += "\n" yield output_md # Yield after finding nodes total = 0 # Filter relations to query relations_to_query = [(name, CONCEPTNET_RELATIONS[name]) for name in selected_relations if name in CONCEPTNET_RELATIONS] num_relations = len(relations_to_query) if num_relations == 0: output_md += "āš ļø No relations selected." yield output_md return for i, (rel_name, rel_url) in enumerate(relations_to_query): progress((i + 0.1) / num_relations, desc=f"šŸ”Ž {rel_name}...") rel_md = f"## {rel_name}\n\n" found = False # Outgoing start_time = time.time() sql_out = f""" SELECT en.label, e.weight FROM edge e JOIN node en ON e.end_id = en.id WHERE e.start_id {query_operator} ? AND e.rel_id = ? ORDER BY e.weight DESC LIMIT 7 """ cursor.execute(sql_out, (query_path, rel_url)) results_out = cursor.fetchall() elapsed = time.time() - start_time log_progress(f" {rel_name} outgoing: {len(results_out)} in {elapsed:.3f}s", "DEBUG") for label, weight in results_out: rel_md += f"- **{word}** {rel_name} → *{label}* `[{weight:.3f}]`\n" found = True total += 1 # Incoming start_time = time.time() sql_in = f""" SELECT s.label, e.weight FROM edge e JOIN node s ON e.start_id = s.id WHERE e.end_id {query_operator} ? AND e.rel_id = ? ORDER BY e.weight DESC LIMIT 7 """ cursor.execute(sql_in, (query_path, rel_url)) results_in = cursor.fetchall() elapsed = time.time() - start_time log_progress(f" {rel_name} incoming: {len(results_in)} in {elapsed:.3f}s", "DEBUG") for label, weight in results_in: rel_md += f"- *{label}* {rel_name} → **{word}** `[{weight:.3f}]`\n" found = True total += 1 if not found: rel_md += "*No results*\n" rel_md += "\n" output_md += rel_md # Add this relation's results to the full markdown progress((i + 1) / num_relations, desc=f"āœ“ {rel_name}") yield output_md # --- PROGRESS FIX: Yield intermediate results --- output_md += f"---\n**Total relations:** {total}\n" log_progress(f"Profile complete: {total} relations", "SUCCESS") progress(1.0, desc="āœ… Complete!") yield output_md # Yield final result except Exception as e: log_progress(f"Error: {e}", "ERROR") import traceback traceback.print_exc() yield f"**āŒ Error:** {e}" def run_query(start_node, relation, end_node, limit, progress=gr.Progress()): """Query builder - FIXED to use relation names from dropdown.""" log_progress(f"Query: start={start_node}, rel={relation}, end={end_node}", "INFO") progress(0, desc="Building...") if not DB_PATH: return pd.DataFrame(), "āŒ **Error:** Database file not found." query = """ SELECT e.id, s.id, r.label, en.id, e.weight, s.label, en.label FROM edge e JOIN relation r ON e.rel_id = r.id JOIN node s ON e.start_id = s.id JOIN node en ON e.end_id = en.id WHERE 1=1 """ params = [] try: with get_db_connection() as conn: progress(0.3, desc="Adding filters...") # Start node if start_node and start_node.strip(): if start_node.startswith('http://'): pattern = f"{start_node}%" else: pattern = f"{CONCEPTNET_BASE}/c/en/{start_node.strip().lower().replace(' ', '_')}%" query += " AND s.id LIKE ?" params.append(pattern) # Relation - FIX: Use dictionary if relation and relation.strip(): rel_value = CONCEPTNET_RELATIONS.get(relation.strip()) if rel_value: query += " AND r.id = ?" params.append(rel_value) log_progress(f"Relation: {rel_value}", "DEBUG") # If relation is blank or invalid, filter is just not added # End node if end_node and end_node.strip(): if end_node.startswith('http://'): pattern = f"{end_node}%" else: pattern = f"{CONCEPTNET_BASE}/c/en/{end_node.strip().lower().replace(' ', '_')}%" query += " AND en.id LIKE ?" params.append(pattern) query += " ORDER BY e.weight DESC LIMIT ?" params.append(limit) progress(0.6, desc="Executing...") start_time = time.time() df = pd.read_sql_query(query, conn, params=params) elapsed = time.time() - start_time log_progress(f"Query done: {len(df)} rows in {elapsed:.2f}s", "SUCCESS") progress(1.0, desc="Done!") if df.empty: return pd.DataFrame(), f"āš ļø No results ({elapsed:.2f}s)" df.columns = ['edge_id', 'start_id', 'relation', 'end_id', 'weight', 'start_label', 'end_label'] return df, f"āœ… {len(df)} results in {elapsed:.2f}s" except Exception as e: log_progress(f"Error: {e}", "ERROR") import traceback traceback.print_exc() return pd.DataFrame(), f"āŒ {e}" def run_raw_query(sql_query): """Execute a raw SELECT SQL query.""" if not sql_query.strip().upper().startswith("SELECT"): return pd.DataFrame(), "āŒ Only SELECT queries are allowed." if not DB_PATH: return pd.DataFrame(), "āŒ **Error:** Database file not found." try: with get_db_connection() as conn: start = time.time() df = pd.read_sql_query(sql_query, conn) elapsed = time.time() - start return df, f"āœ… {len(df)} rows in {elapsed:.3f}s" except Exception as e: return pd.DataFrame(), f"āŒ {e}" def get_schema_info(): """Get schema information, including tables, row counts, and indices.""" if not DB_PATH: return "āŒ **Error:** Database file not found." md = f"# šŸ“š Schema\n\n" md += f"**Repo:** [{INDEXED_REPO_ID}](https://huggingface.co/datasets/{INDEXED_REPO_ID})\n\n" md += "**Key:** Relations and Nodes use full `http://conceptnet.io/...` URLs.\n\n" try: with get_db_connection() as conn: cursor = conn.cursor() md += "## Tables & Row Counts\n\n" cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'") tables = cursor.fetchall() for (table,) in tables: cursor.execute(f"SELECT COUNT(*) FROM {table}") md += f"- **{table}:** {cursor.fetchone()[0]:,} rows\n" md += "\n## Indices\n\n" cursor.execute("SELECT name, sql FROM sqlite_master WHERE type='index'") for name, sql in cursor.fetchall(): if sql: # Filter out auto-indices md += f"- **{name}:** `{sql}`\n" md += "\n## Common Relations\n\n" cursor.execute("SELECT id, label FROM relation ORDER BY label LIMIT 20") for rel_id, label in cursor.fetchall(): md += f"- **{label}:** `{rel_id}`\n" except Exception as e: md += f"\n**āŒ Error:** {e}\n" return md # ===== Build Gradio UI ===== with gr.Blocks(title="ConceptNet Explorer", theme=gr.themes.Soft()) as demo: gr.Markdown("# 🧠 ConceptNet Explorer") gr.Markdown(f"**Languages:** {', '.join([l.upper() for l in TARGET_LANGUAGES])} | **Relations:** {len(CONCEPTNET_RELATIONS)} types") if not DB_PATH: gr.Markdown("## āŒ ERROR: DATABASE FILE NOT FOUND") gr.Markdown("This app cannot start because the SQLite database file could not be downloaded from Hugging Face Hub. Please check the logs.") else: with gr.Tabs(): with gr.TabItem("šŸ” Semantic Profile"): gr.Markdown("**Explore semantic relations for any word. Progress bar and output will update live.**") with gr.Row(): word_input = gr.Textbox( label="Word", placeholder="e.g., dog, hund, perro", value="dog", scale=3 ) lang_input = gr.Dropdown( choices=TARGET_LANGUAGES, value="en", label="Language", scale=1 ) with gr.Accordion("Select Relations (fewer = faster)", open=False): relation_input = gr.CheckboxGroup( choices=list(CONCEPTNET_RELATIONS.keys()), label="Relations to Query", value=["IsA", "RelatedTo", "PartOf", "HasA", "UsedFor", "CapableOf", "Synonym", "Antonym", "AtLocation", "HasProperty"] ) semantic_btn = gr.Button("šŸ” Get Semantic Profile", variant="primary", size="lg") semantic_output = gr.Markdown(value="Click the button to get the semantic profile.") gr.Examples( examples=[ ["dog", "en"], ["hund", "de"], ["perro", "es"], ["chat", "fr"], ["knowledge", "en"], ], inputs=[word_input, lang_input], label="Examples" ) with gr.TabItem("⚔ Query Builder"): gr.Markdown("**Build custom relationship queries using dropdowns.**") with gr.Row(): start_input = gr.Textbox(label="Start Node", placeholder="dog (optional)") rel_input = gr.Dropdown( choices=[""] + list(CONCEPTNET_RELATIONS.keys()), # Add "" for 'any' label="Relation", value="IsA", info="Leave blank to query all relations" ) end_input = gr.Textbox(label="End Node", placeholder="(optional)") limit_slider = gr.Slider(label="Limit", minimum=1, maximum=500, value=50, step=1) query_btn = gr.Button("ā–¶ļø Run Query", variant="primary", size="lg") status_output = gr.Markdown() # BUG FIX: Removed 'height' argument from constructor as it caused a TypeError. results_output = gr.DataFrame(wrap=True) with gr.TabItem("šŸ’» Raw SQL"): gr.Markdown("**Execute custom `SELECT` SQL queries against the database.**") raw_sql_input = gr.Textbox( label="SQL Query", value=f"SELECT s.label, r.label, en.label, e.weight\nFROM edge e\nJOIN node s ON e.start_id = s.id\nJOIN node en ON e.end_id = en.id\nJOIN relation r ON e.rel_id = r.id\nWHERE s.id = '{CONCEPTNET_BASE}/c/en/dog'\nAND r.id = '{CONCEPTNET_BASE}/r/IsA'\nORDER BY e.weight DESC\nLIMIT 10", lines=5, elem_classes=["font-mono"] # Use a CSS class for styling if needed ) raw_btn = gr.Button("ā–¶ļø Execute") raw_status = gr.Markdown() # BUG FIX: Removed 'height' argument from constructor. raw_results = gr.DataFrame() with gr.TabItem("šŸ“Š Schema"): gr.Markdown("**View database schema, tables, and indices.**") schema_btn = gr.Button("šŸ“Š Load Schema Info") schema_output = gr.Markdown() # --- Button Click Handlers (with API integration) --- semantic_btn.click( get_semantic_profile, inputs=[word_input, lang_input, relation_input], outputs=semantic_output, api_name="get_semantic_profile" # <-- API Integration ) query_btn.click( run_query, inputs=[start_input, rel_input, end_input, limit_slider], outputs=[results_output, status_output], api_name="run_query" # <-- API Integration ) raw_btn.click( run_raw_query, inputs=raw_sql_input, outputs=[raw_results, raw_status], api_name="run_raw_query" # <-- API Integration ) # Load schema info automatically and on button click demo.load( get_schema_info, None, schema_output, api_name="get_schema" # <-- API Integration ) schema_btn.click( get_schema_info, None, schema_output, api_name="get_schema" # <-- API Integration (re-using endpoint is fine) ) if __name__ == "__main__": if DB_PATH: log_progress("APP READY!", "SUCCESS") else: log_progress("APP LAUNCHING WITH ERRORS (DB NOT FOUND)", "ERROR") demo.launch(ssr_mode=False)