Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import sqlite3 | |
| import pandas as pd | |
| from huggingface_hub import hf_hub_download, HfApi | |
| import os | |
| import time | |
| import json | |
| from typing import Dict, List, Optional | |
| # ===== CONFIGURATION ===== | |
| TARGET_LANGUAGES = ['de', 'en', 'es', 'fr', 'it', 'ja', 'nl', 'pl', 'pt', 'ru', 'zh'] | |
| INDEXED_REPO_ID = "cstr/conceptnet-de-indexed" | |
| INDEXED_DB_FILENAME = "conceptnet-de-indexed.db" | |
| PROGRESS_FILENAME = "indexing_progress.json" | |
| CONCEPTNET_BASE = "http://conceptnet.io" | |
| # ========================= | |
| # --- All relations MUST be full URLs --- | |
| CONCEPTNET_RELATIONS: Dict[str, str] = { | |
| "RelatedTo": f"{CONCEPTNET_BASE}/r/RelatedTo", | |
| "IsA": f"{CONCEPTNET_BASE}/r/IsA", | |
| "PartOf": f"{CONCEPTNET_BASE}/r/PartOf", | |
| "HasA": f"{CONCEPTNET_BASE}/r/HasA", | |
| "UsedFor": f"{CONCEPTNET_BASE}/r/UsedFor", | |
| "CapableOf": f"{CONCEPTNET_BASE}/r/CapableOf", | |
| "AtLocation": f"{CONCEPTNET_BASE}/r/AtLocation", | |
| "Causes": f"{CONCEPTNET_BASE}/r/Causes", | |
| "HasSubevent": f"{CONCEPTNET_BASE}/r/HasSubevent", | |
| "HasFirstSubevent": f"{CONCEPTNET_BASE}/r/HasFirstSubevent", | |
| "HasLastSubevent": f"{CONCEPTNET_BASE}/r/HasLastSubevent", | |
| "HasPrerequisite": f"{CONCEPTNET_BASE}/r/HasPrerequisite", | |
| "HasProperty": f"{CONCEPTNET_BASE}/r/HasProperty", | |
| "MotivatedByGoal": f"{CONCEPTNET_BASE}/r/MotivatedByGoal", | |
| "ObstructedBy": f"{CONCEPTNET_BASE}/r/ObstructedBy", | |
| "Desires": f"{CONCEPTNET_BASE}/r/Desires", | |
| "CreatedBy": f"{CONCEPTNET_BASE}/r/CreatedBy", | |
| "Synonym": f"{CONCEPTNET_BASE}/r/Synonym", | |
| "Antonym": f"{CONCEPTNET_BASE}/r/Antonym", | |
| "DistinctFrom": f"{CONCEPTNET_BASE}/r/DistinctFrom", | |
| "DerivedFrom": f"{CONCEPTNET_BASE}/r/DerivedFrom", | |
| "SymbolOf": f"{CONCEPTNET_BASE}/r/SymbolOf", | |
| "DefinedAs": f"{CONCEPTNET_BASE}/r/DefinedAs", | |
| "MannerOf": f"{CONCEPTNET_BASE}/r/MannerOf", | |
| "LocatedNear": f"{CONCEPTNET_BASE}/r/LocatedNear", | |
| "HasContext": f"{CONCEPTNET_BASE}/r/HasContext", | |
| "SimilarTo": f"{CONCEPTNET_BASE}/r/SimilarTo", | |
| "EtymologicallyRelatedTo": f"{CONCEPTNET_BASE}/r/EtymologicallyRelatedTo", | |
| "EtymologicallyDerivedFrom": f"{CONCEPTNET_BASE}/r/EtymologicallyDerivedFrom", | |
| "CausesDesire": f"{CONCEPTNET_BASE}/r/CausesDesire", | |
| "MadeOf": f"{CONCEPTNET_BASE}/r/MadeOf", | |
| "ReceivesAction": f"{CONCEPTNET_BASE}/r/ReceivesAction", | |
| "ExternalURL": f"{CONCEPTNET_BASE}/r/ExternalURL", | |
| "NotDesires": f"{CONCEPTNET_BASE}/r/NotDesires", | |
| "NotUsedFor": f"{CONCEPTNET_BASE}/r/NotUsedFor", | |
| "NotCapableOf": f"{CONCEPTNET_BASE}/r/NotCapableOf", | |
| "NotHasProperty": f"{CONCEPTNET_BASE}/r/NotHasProperty", | |
| } | |
| # ========================= | |
| print(f"π Languages: {', '.join([l.upper() for l in TARGET_LANGUAGES])}") | |
| print(f"π Relations: {len(CONCEPTNET_RELATIONS)} relations loaded") | |
| HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HF_API_TOKEN") | |
| def log_progress(message, level="INFO"): | |
| """Simple logger with timestamp and emoji prefix.""" | |
| timestamp = time.strftime("%H:%M:%S") | |
| prefix = {"INFO": "βΉοΈ ", "SUCCESS": "β ", "ERROR": "β", "WARN": "β οΈ ", "DEBUG": "π"}.get(level, "") | |
| print(f"[{timestamp}] {prefix} {message}") | |
| def check_remote_progress(): | |
| """Check Hugging Face Hub for indexing progress.""" | |
| if not HF_TOKEN: | |
| return {"indexing_complete": False} | |
| try: | |
| api = HfApi() | |
| api.repo_info(repo_id=INDEXED_REPO_ID, repo_type="dataset", token=HF_TOKEN) | |
| progress_path = hf_hub_download(repo_id=INDEXED_REPO_ID, filename=PROGRESS_FILENAME, repo_type="dataset", token=HF_TOKEN) | |
| with open(progress_path, 'r') as f: | |
| return json.load(f) | |
| except: | |
| return {"indexing_complete": False} | |
| def create_indexed_database(): | |
| """Download the pre-indexed database from HF Hub if indexing is complete.""" | |
| progress = check_remote_progress() | |
| if progress.get("indexing_complete"): | |
| log_progress("Remote indexing is complete. Downloading DB...", "INFO") | |
| try: | |
| return hf_hub_download(repo_id=INDEXED_REPO_ID, filename=INDEXED_DB_FILENAME, repo_type="dataset", token=HF_TOKEN) | |
| except Exception as e: | |
| log_progress(f"Failed to download indexed DB: {e}", "ERROR") | |
| pass | |
| log_progress("Remote indexing not complete or DB not found.", "WARN") | |
| return None | |
| DB_PATH = create_indexed_database() | |
| if not DB_PATH: | |
| log_progress("DATABASE NOT FOUND. App will not function.", "ERROR") | |
| else: | |
| log_progress(f"Database loaded from: {DB_PATH}", "SUCCESS") | |
| def get_db_connection(): | |
| """Get a thread-safe connection to the SQLite database.""" | |
| if not DB_PATH: | |
| raise Exception("Database path is not set. Cannot create connection.") | |
| conn = sqlite3.connect(DB_PATH, check_same_thread=False) | |
| conn.execute("PRAGMA cache_size = -256000") # 256MB cache | |
| conn.execute("PRAGMA temp_store = MEMORY") | |
| return conn | |
| def get_semantic_profile(word: str, lang: str = 'en', selected_relations: List[str] = None, progress=gr.Progress()): | |
| """ | |
| Get semantic profile for a word. | |
| FIX: This is now a generator to stream updates and show progress. | |
| FIX: Uses exact node match first for performance. | |
| """ | |
| log_progress(f"Profile: {word} ({lang})", "INFO") | |
| if not word or lang not in TARGET_LANGUAGES: | |
| yield "β οΈ Invalid input" | |
| return | |
| if not DB_PATH: | |
| yield "β **Error:** Database file not found." | |
| return | |
| # Set default relations if none are selected | |
| if not selected_relations: | |
| selected_relations = [ | |
| "IsA", "RelatedTo", "PartOf", "HasA", "UsedFor", | |
| "CapableOf", "Synonym", "Antonym" | |
| ] | |
| word = word.strip().lower().replace(' ', '_') | |
| exact_path = f"{CONCEPTNET_BASE}/c/{lang}/{word}" | |
| output_md = f"# π§ Semantic Profile: '{word}' ({lang.upper()})\n\n" | |
| try: | |
| with get_db_connection() as conn: | |
| cursor = conn.cursor() | |
| progress(0, desc="Starting...") | |
| yield output_md | |
| progress(0.05, desc="Finding nodes...") | |
| # --- PERFORMANCE FIX: Try exact match first --- | |
| cursor.execute("SELECT id, label FROM node WHERE id = ?", (exact_path,)) | |
| exact_node = cursor.fetchone() | |
| query_path = None | |
| query_operator = "" | |
| nodes = [] | |
| if exact_node: | |
| log_progress(f"Found exact node: {exact_node[0]}", "SUCCESS") | |
| nodes = [exact_node] | |
| query_path = exact_path | |
| query_operator = "=" # Use fast exact match | |
| else: | |
| log_progress(f"No exact node, falling back to LIKE...", "WARN") | |
| like_path = f"{exact_path}%" | |
| cursor.execute("SELECT id, label FROM node WHERE id LIKE ? LIMIT 5", (like_path,)) | |
| nodes = cursor.fetchall() | |
| if not nodes: | |
| yield f"# π§ '{word}'\n\nβ οΈ Not found" | |
| return | |
| query_path = like_path # Use slower LIKE match | |
| query_operator = "LIKE" | |
| log_progress(f"Using path: {query_path} (op: {query_operator})", "INFO") | |
| for node_id, label in nodes[:3]: | |
| output_md += f"**Node:** `{node_id}` β **{label}**\n" | |
| output_md += "\n" | |
| yield output_md # Yield after finding nodes | |
| total = 0 | |
| # Filter relations to query | |
| relations_to_query = [(name, CONCEPTNET_RELATIONS[name]) for name in selected_relations if name in CONCEPTNET_RELATIONS] | |
| num_relations = len(relations_to_query) | |
| if num_relations == 0: | |
| output_md += "β οΈ No relations selected." | |
| yield output_md | |
| return | |
| for i, (rel_name, rel_url) in enumerate(relations_to_query): | |
| progress((i + 0.1) / num_relations, desc=f"π {rel_name}...") | |
| rel_md = f"## {rel_name}\n\n" | |
| found = False | |
| # Outgoing | |
| start_time = time.time() | |
| sql_out = f""" | |
| SELECT en.label, e.weight | |
| FROM edge e | |
| JOIN node en ON e.end_id = en.id | |
| WHERE e.start_id {query_operator} ? AND e.rel_id = ? | |
| ORDER BY e.weight DESC | |
| LIMIT 7 | |
| """ | |
| cursor.execute(sql_out, (query_path, rel_url)) | |
| results_out = cursor.fetchall() | |
| elapsed = time.time() - start_time | |
| log_progress(f" {rel_name} outgoing: {len(results_out)} in {elapsed:.3f}s", "DEBUG") | |
| for label, weight in results_out: | |
| rel_md += f"- **{word}** {rel_name} β *{label}* `[{weight:.3f}]`\n" | |
| found = True | |
| total += 1 | |
| # Incoming | |
| start_time = time.time() | |
| sql_in = f""" | |
| SELECT s.label, e.weight | |
| FROM edge e | |
| JOIN node s ON e.start_id = s.id | |
| WHERE e.end_id {query_operator} ? AND e.rel_id = ? | |
| ORDER BY e.weight DESC | |
| LIMIT 7 | |
| """ | |
| cursor.execute(sql_in, (query_path, rel_url)) | |
| results_in = cursor.fetchall() | |
| elapsed = time.time() - start_time | |
| log_progress(f" {rel_name} incoming: {len(results_in)} in {elapsed:.3f}s", "DEBUG") | |
| for label, weight in results_in: | |
| rel_md += f"- *{label}* {rel_name} β **{word}** `[{weight:.3f}]`\n" | |
| found = True | |
| total += 1 | |
| if not found: | |
| rel_md += "*No results*\n" | |
| rel_md += "\n" | |
| output_md += rel_md # Add this relation's results to the full markdown | |
| progress((i + 1) / num_relations, desc=f"β {rel_name}") | |
| yield output_md # --- PROGRESS FIX: Yield intermediate results --- | |
| output_md += f"---\n**Total relations:** {total}\n" | |
| log_progress(f"Profile complete: {total} relations", "SUCCESS") | |
| progress(1.0, desc="β Complete!") | |
| yield output_md # Yield final result | |
| except Exception as e: | |
| log_progress(f"Error: {e}", "ERROR") | |
| import traceback | |
| traceback.print_exc() | |
| yield f"**β Error:** {e}" | |
| def run_query(start_node, relation, end_node, limit, progress=gr.Progress()): | |
| """Query builder - FIXED to use relation names from dropdown.""" | |
| log_progress(f"Query: start={start_node}, rel={relation}, end={end_node}", "INFO") | |
| progress(0, desc="Building...") | |
| if not DB_PATH: | |
| return pd.DataFrame(), "β **Error:** Database file not found." | |
| query = """ | |
| SELECT | |
| e.id, s.id, r.label, en.id, e.weight, s.label, en.label | |
| FROM edge e | |
| JOIN relation r ON e.rel_id = r.id | |
| JOIN node s ON e.start_id = s.id | |
| JOIN node en ON e.end_id = en.id | |
| WHERE 1=1 | |
| """ | |
| params = [] | |
| try: | |
| with get_db_connection() as conn: | |
| progress(0.3, desc="Adding filters...") | |
| # Start node | |
| if start_node and start_node.strip(): | |
| if start_node.startswith('http://'): | |
| pattern = f"{start_node}%" | |
| else: | |
| pattern = f"{CONCEPTNET_BASE}/c/en/{start_node.strip().lower().replace(' ', '_')}%" | |
| query += " AND s.id LIKE ?" | |
| params.append(pattern) | |
| # Relation - FIX: Use dictionary | |
| if relation and relation.strip(): | |
| rel_value = CONCEPTNET_RELATIONS.get(relation.strip()) | |
| if rel_value: | |
| query += " AND r.id = ?" | |
| params.append(rel_value) | |
| log_progress(f"Relation: {rel_value}", "DEBUG") | |
| # If relation is blank or invalid, filter is just not added | |
| # End node | |
| if end_node and end_node.strip(): | |
| if end_node.startswith('http://'): | |
| pattern = f"{end_node}%" | |
| else: | |
| pattern = f"{CONCEPTNET_BASE}/c/en/{end_node.strip().lower().replace(' ', '_')}%" | |
| query += " AND en.id LIKE ?" | |
| params.append(pattern) | |
| query += " ORDER BY e.weight DESC LIMIT ?" | |
| params.append(limit) | |
| progress(0.6, desc="Executing...") | |
| start_time = time.time() | |
| df = pd.read_sql_query(query, conn, params=params) | |
| elapsed = time.time() - start_time | |
| log_progress(f"Query done: {len(df)} rows in {elapsed:.2f}s", "SUCCESS") | |
| progress(1.0, desc="Done!") | |
| if df.empty: | |
| return pd.DataFrame(), f"β οΈ No results ({elapsed:.2f}s)" | |
| df.columns = ['edge_id', 'start_id', 'relation', 'end_id', 'weight', 'start_label', 'end_label'] | |
| return df, f"β {len(df)} results in {elapsed:.2f}s" | |
| except Exception as e: | |
| log_progress(f"Error: {e}", "ERROR") | |
| import traceback | |
| traceback.print_exc() | |
| return pd.DataFrame(), f"β {e}" | |
| def run_raw_query(sql_query): | |
| """Execute a raw SELECT SQL query.""" | |
| if not sql_query.strip().upper().startswith("SELECT"): | |
| return pd.DataFrame(), "β Only SELECT queries are allowed." | |
| if not DB_PATH: | |
| return pd.DataFrame(), "β **Error:** Database file not found." | |
| try: | |
| with get_db_connection() as conn: | |
| start = time.time() | |
| df = pd.read_sql_query(sql_query, conn) | |
| elapsed = time.time() - start | |
| return df, f"β {len(df)} rows in {elapsed:.3f}s" | |
| except Exception as e: | |
| return pd.DataFrame(), f"β {e}" | |
| def get_schema_info(): | |
| """Get schema information, including tables, row counts, and indices.""" | |
| if not DB_PATH: | |
| return "β **Error:** Database file not found." | |
| md = f"# π Schema\n\n" | |
| md += f"**Repo:** [{INDEXED_REPO_ID}](https://huggingface.co/datasets/{INDEXED_REPO_ID})\n\n" | |
| md += "**Key:** Relations and Nodes use full `http://conceptnet.io/...` URLs.\n\n" | |
| try: | |
| with get_db_connection() as conn: | |
| cursor = conn.cursor() | |
| md += "## Tables & Row Counts\n\n" | |
| cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'") | |
| tables = cursor.fetchall() | |
| for (table,) in tables: | |
| cursor.execute(f"SELECT COUNT(*) FROM {table}") | |
| md += f"- **{table}:** {cursor.fetchone()[0]:,} rows\n" | |
| md += "\n## Indices\n\n" | |
| cursor.execute("SELECT name, sql FROM sqlite_master WHERE type='index'") | |
| for name, sql in cursor.fetchall(): | |
| if sql: # Filter out auto-indices | |
| md += f"- **{name}:** `{sql}`\n" | |
| md += "\n## Common Relations\n\n" | |
| cursor.execute("SELECT id, label FROM relation ORDER BY label LIMIT 20") | |
| for rel_id, label in cursor.fetchall(): | |
| md += f"- **{label}:** `{rel_id}`\n" | |
| except Exception as e: | |
| md += f"\n**β Error:** {e}\n" | |
| return md | |
| # ===== Build Gradio UI ===== | |
| with gr.Blocks(title="ConceptNet Explorer", theme=gr.themes.Soft()) as demo: | |
| gr.Markdown("# π§ ConceptNet Explorer") | |
| gr.Markdown(f"**Languages:** {', '.join([l.upper() for l in TARGET_LANGUAGES])} | **Relations:** {len(CONCEPTNET_RELATIONS)} types") | |
| if not DB_PATH: | |
| gr.Markdown("## β ERROR: DATABASE FILE NOT FOUND") | |
| gr.Markdown("This app cannot start because the SQLite database file could not be downloaded from Hugging Face Hub. Please check the logs.") | |
| else: | |
| with gr.Tabs(): | |
| with gr.TabItem("π Semantic Profile"): | |
| gr.Markdown("**Explore semantic relations for any word. Progress bar and output will update live.**") | |
| with gr.Row(): | |
| word_input = gr.Textbox( | |
| label="Word", | |
| placeholder="e.g., dog, hund, perro", | |
| value="dog", | |
| scale=3 | |
| ) | |
| lang_input = gr.Dropdown( | |
| choices=TARGET_LANGUAGES, | |
| value="en", | |
| label="Language", | |
| scale=1 | |
| ) | |
| with gr.Accordion("Select Relations (fewer = faster)", open=False): | |
| relation_input = gr.CheckboxGroup( | |
| choices=list(CONCEPTNET_RELATIONS.keys()), | |
| label="Relations to Query", | |
| value=["IsA", "RelatedTo", "PartOf", "HasA", "UsedFor", "CapableOf", "Synonym", "Antonym", "AtLocation", "HasProperty"] | |
| ) | |
| semantic_btn = gr.Button("π Get Semantic Profile", variant="primary", size="lg") | |
| semantic_output = gr.Markdown(value="Click the button to get the semantic profile.") | |
| gr.Examples( | |
| examples=[ | |
| ["dog", "en"], | |
| ["hund", "de"], | |
| ["perro", "es"], | |
| ["chat", "fr"], | |
| ["knowledge", "en"], | |
| ], | |
| inputs=[word_input, lang_input], | |
| label="Examples" | |
| ) | |
| with gr.TabItem("β‘ Query Builder"): | |
| gr.Markdown("**Build custom relationship queries using dropdowns.**") | |
| with gr.Row(): | |
| start_input = gr.Textbox(label="Start Node", placeholder="dog (optional)") | |
| rel_input = gr.Dropdown( | |
| choices=[""] + list(CONCEPTNET_RELATIONS.keys()), # Add "" for 'any' | |
| label="Relation", | |
| value="IsA", | |
| info="Leave blank to query all relations" | |
| ) | |
| end_input = gr.Textbox(label="End Node", placeholder="(optional)") | |
| limit_slider = gr.Slider(label="Limit", minimum=1, maximum=500, value=50, step=1) | |
| query_btn = gr.Button("βΆοΈ Run Query", variant="primary", size="lg") | |
| status_output = gr.Markdown() | |
| # BUG FIX: Removed 'height' argument from constructor as it caused a TypeError. | |
| results_output = gr.DataFrame(wrap=True) | |
| with gr.TabItem("π» Raw SQL"): | |
| gr.Markdown("**Execute custom `SELECT` SQL queries against the database.**") | |
| raw_sql_input = gr.Textbox( | |
| label="SQL Query", | |
| value=f"SELECT s.label, r.label, en.label, e.weight\nFROM edge e\nJOIN node s ON e.start_id = s.id\nJOIN node en ON e.end_id = en.id\nJOIN relation r ON e.rel_id = r.id\nWHERE s.id = '{CONCEPTNET_BASE}/c/en/dog'\nAND r.id = '{CONCEPTNET_BASE}/r/IsA'\nORDER BY e.weight DESC\nLIMIT 10", | |
| lines=5, | |
| elem_classes=["font-mono"] # Use a CSS class for styling if needed | |
| ) | |
| raw_btn = gr.Button("βΆοΈ Execute") | |
| raw_status = gr.Markdown() | |
| # BUG FIX: Removed 'height' argument from constructor. | |
| raw_results = gr.DataFrame() | |
| with gr.TabItem("π Schema"): | |
| gr.Markdown("**View database schema, tables, and indices.**") | |
| schema_btn = gr.Button("π Load Schema Info") | |
| schema_output = gr.Markdown() | |
| # --- Button Click Handlers (with API integration) --- | |
| semantic_btn.click( | |
| get_semantic_profile, | |
| inputs=[word_input, lang_input, relation_input], | |
| outputs=semantic_output, | |
| api_name="get_semantic_profile" # <-- API Integration | |
| ) | |
| query_btn.click( | |
| run_query, | |
| inputs=[start_input, rel_input, end_input, limit_slider], | |
| outputs=[results_output, status_output], | |
| api_name="run_query" # <-- API Integration | |
| ) | |
| raw_btn.click( | |
| run_raw_query, | |
| inputs=raw_sql_input, | |
| outputs=[raw_results, raw_status], | |
| api_name="run_raw_query" # <-- API Integration | |
| ) | |
| # Load schema info automatically and on button click | |
| demo.load( | |
| get_schema_info, | |
| None, | |
| schema_output, | |
| api_name="get_schema" # <-- API Integration | |
| ) | |
| schema_btn.click( | |
| get_schema_info, | |
| None, | |
| schema_output, | |
| api_name="get_schema" # <-- API Integration (re-using endpoint is fine) | |
| ) | |
| if __name__ == "__main__": | |
| if DB_PATH: | |
| log_progress("APP READY!", "SUCCESS") | |
| else: | |
| log_progress("APP LAUNCHING WITH ERRORS (DB NOT FOUND)", "ERROR") | |
| demo.launch(ssr_mode=False) |