conceptnet_db / app.py
cstr's picture
Update app.py
ee99d6d verified
import gradio as gr
import sqlite3
import pandas as pd
from huggingface_hub import hf_hub_download, HfApi
import os
import time
import json
from typing import Dict, List, Optional
# ===== CONFIGURATION =====
TARGET_LANGUAGES = ['de', 'en', 'es', 'fr', 'it', 'ja', 'nl', 'pl', 'pt', 'ru', 'zh']
INDEXED_REPO_ID = "cstr/conceptnet-de-indexed"
INDEXED_DB_FILENAME = "conceptnet-de-indexed.db"
PROGRESS_FILENAME = "indexing_progress.json"
CONCEPTNET_BASE = "http://conceptnet.io"
# =========================
# --- All relations MUST be full URLs ---
CONCEPTNET_RELATIONS: Dict[str, str] = {
"RelatedTo": f"{CONCEPTNET_BASE}/r/RelatedTo",
"IsA": f"{CONCEPTNET_BASE}/r/IsA",
"PartOf": f"{CONCEPTNET_BASE}/r/PartOf",
"HasA": f"{CONCEPTNET_BASE}/r/HasA",
"UsedFor": f"{CONCEPTNET_BASE}/r/UsedFor",
"CapableOf": f"{CONCEPTNET_BASE}/r/CapableOf",
"AtLocation": f"{CONCEPTNET_BASE}/r/AtLocation",
"Causes": f"{CONCEPTNET_BASE}/r/Causes",
"HasSubevent": f"{CONCEPTNET_BASE}/r/HasSubevent",
"HasFirstSubevent": f"{CONCEPTNET_BASE}/r/HasFirstSubevent",
"HasLastSubevent": f"{CONCEPTNET_BASE}/r/HasLastSubevent",
"HasPrerequisite": f"{CONCEPTNET_BASE}/r/HasPrerequisite",
"HasProperty": f"{CONCEPTNET_BASE}/r/HasProperty",
"MotivatedByGoal": f"{CONCEPTNET_BASE}/r/MotivatedByGoal",
"ObstructedBy": f"{CONCEPTNET_BASE}/r/ObstructedBy",
"Desires": f"{CONCEPTNET_BASE}/r/Desires",
"CreatedBy": f"{CONCEPTNET_BASE}/r/CreatedBy",
"Synonym": f"{CONCEPTNET_BASE}/r/Synonym",
"Antonym": f"{CONCEPTNET_BASE}/r/Antonym",
"DistinctFrom": f"{CONCEPTNET_BASE}/r/DistinctFrom",
"DerivedFrom": f"{CONCEPTNET_BASE}/r/DerivedFrom",
"SymbolOf": f"{CONCEPTNET_BASE}/r/SymbolOf",
"DefinedAs": f"{CONCEPTNET_BASE}/r/DefinedAs",
"MannerOf": f"{CONCEPTNET_BASE}/r/MannerOf",
"LocatedNear": f"{CONCEPTNET_BASE}/r/LocatedNear",
"HasContext": f"{CONCEPTNET_BASE}/r/HasContext",
"SimilarTo": f"{CONCEPTNET_BASE}/r/SimilarTo",
"EtymologicallyRelatedTo": f"{CONCEPTNET_BASE}/r/EtymologicallyRelatedTo",
"EtymologicallyDerivedFrom": f"{CONCEPTNET_BASE}/r/EtymologicallyDerivedFrom",
"CausesDesire": f"{CONCEPTNET_BASE}/r/CausesDesire",
"MadeOf": f"{CONCEPTNET_BASE}/r/MadeOf",
"ReceivesAction": f"{CONCEPTNET_BASE}/r/ReceivesAction",
"ExternalURL": f"{CONCEPTNET_BASE}/r/ExternalURL",
"NotDesires": f"{CONCEPTNET_BASE}/r/NotDesires",
"NotUsedFor": f"{CONCEPTNET_BASE}/r/NotUsedFor",
"NotCapableOf": f"{CONCEPTNET_BASE}/r/NotCapableOf",
"NotHasProperty": f"{CONCEPTNET_BASE}/r/NotHasProperty",
}
# =========================
print(f"🌍 Languages: {', '.join([l.upper() for l in TARGET_LANGUAGES])}")
print(f"πŸ“š Relations: {len(CONCEPTNET_RELATIONS)} relations loaded")
HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HF_API_TOKEN")
def log_progress(message, level="INFO"):
"""Simple logger with timestamp and emoji prefix."""
timestamp = time.strftime("%H:%M:%S")
prefix = {"INFO": "ℹ️ ", "SUCCESS": "βœ…", "ERROR": "❌", "WARN": "⚠️ ", "DEBUG": "πŸ”"}.get(level, "")
print(f"[{timestamp}] {prefix} {message}")
def check_remote_progress():
"""Check Hugging Face Hub for indexing progress."""
if not HF_TOKEN:
return {"indexing_complete": False}
try:
api = HfApi()
api.repo_info(repo_id=INDEXED_REPO_ID, repo_type="dataset", token=HF_TOKEN)
progress_path = hf_hub_download(repo_id=INDEXED_REPO_ID, filename=PROGRESS_FILENAME, repo_type="dataset", token=HF_TOKEN)
with open(progress_path, 'r') as f:
return json.load(f)
except:
return {"indexing_complete": False}
def create_indexed_database():
"""Download the pre-indexed database from HF Hub if indexing is complete."""
progress = check_remote_progress()
if progress.get("indexing_complete"):
log_progress("Remote indexing is complete. Downloading DB...", "INFO")
try:
return hf_hub_download(repo_id=INDEXED_REPO_ID, filename=INDEXED_DB_FILENAME, repo_type="dataset", token=HF_TOKEN)
except Exception as e:
log_progress(f"Failed to download indexed DB: {e}", "ERROR")
pass
log_progress("Remote indexing not complete or DB not found.", "WARN")
return None
DB_PATH = create_indexed_database()
if not DB_PATH:
log_progress("DATABASE NOT FOUND. App will not function.", "ERROR")
else:
log_progress(f"Database loaded from: {DB_PATH}", "SUCCESS")
def get_db_connection():
"""Get a thread-safe connection to the SQLite database."""
if not DB_PATH:
raise Exception("Database path is not set. Cannot create connection.")
conn = sqlite3.connect(DB_PATH, check_same_thread=False)
conn.execute("PRAGMA cache_size = -256000") # 256MB cache
conn.execute("PRAGMA temp_store = MEMORY")
return conn
def get_semantic_profile(word: str, lang: str = 'en', selected_relations: List[str] = None, progress=gr.Progress()):
"""
Get semantic profile for a word.
FIX: This is now a generator to stream updates and show progress.
FIX: Uses exact node match first for performance.
"""
log_progress(f"Profile: {word} ({lang})", "INFO")
if not word or lang not in TARGET_LANGUAGES:
yield "⚠️ Invalid input"
return
if not DB_PATH:
yield "❌ **Error:** Database file not found."
return
# Set default relations if none are selected
if not selected_relations:
selected_relations = [
"IsA", "RelatedTo", "PartOf", "HasA", "UsedFor",
"CapableOf", "Synonym", "Antonym"
]
word = word.strip().lower().replace(' ', '_')
exact_path = f"{CONCEPTNET_BASE}/c/{lang}/{word}"
output_md = f"# 🧠 Semantic Profile: '{word}' ({lang.upper()})\n\n"
try:
with get_db_connection() as conn:
cursor = conn.cursor()
progress(0, desc="Starting...")
yield output_md
progress(0.05, desc="Finding nodes...")
# --- PERFORMANCE FIX: Try exact match first ---
cursor.execute("SELECT id, label FROM node WHERE id = ?", (exact_path,))
exact_node = cursor.fetchone()
query_path = None
query_operator = ""
nodes = []
if exact_node:
log_progress(f"Found exact node: {exact_node[0]}", "SUCCESS")
nodes = [exact_node]
query_path = exact_path
query_operator = "=" # Use fast exact match
else:
log_progress(f"No exact node, falling back to LIKE...", "WARN")
like_path = f"{exact_path}%"
cursor.execute("SELECT id, label FROM node WHERE id LIKE ? LIMIT 5", (like_path,))
nodes = cursor.fetchall()
if not nodes:
yield f"# 🧠 '{word}'\n\n⚠️ Not found"
return
query_path = like_path # Use slower LIKE match
query_operator = "LIKE"
log_progress(f"Using path: {query_path} (op: {query_operator})", "INFO")
for node_id, label in nodes[:3]:
output_md += f"**Node:** `{node_id}` β†’ **{label}**\n"
output_md += "\n"
yield output_md # Yield after finding nodes
total = 0
# Filter relations to query
relations_to_query = [(name, CONCEPTNET_RELATIONS[name]) for name in selected_relations if name in CONCEPTNET_RELATIONS]
num_relations = len(relations_to_query)
if num_relations == 0:
output_md += "⚠️ No relations selected."
yield output_md
return
for i, (rel_name, rel_url) in enumerate(relations_to_query):
progress((i + 0.1) / num_relations, desc=f"πŸ”Ž {rel_name}...")
rel_md = f"## {rel_name}\n\n"
found = False
# Outgoing
start_time = time.time()
sql_out = f"""
SELECT en.label, e.weight
FROM edge e
JOIN node en ON e.end_id = en.id
WHERE e.start_id {query_operator} ? AND e.rel_id = ?
ORDER BY e.weight DESC
LIMIT 7
"""
cursor.execute(sql_out, (query_path, rel_url))
results_out = cursor.fetchall()
elapsed = time.time() - start_time
log_progress(f" {rel_name} outgoing: {len(results_out)} in {elapsed:.3f}s", "DEBUG")
for label, weight in results_out:
rel_md += f"- **{word}** {rel_name} β†’ *{label}* `[{weight:.3f}]`\n"
found = True
total += 1
# Incoming
start_time = time.time()
sql_in = f"""
SELECT s.label, e.weight
FROM edge e
JOIN node s ON e.start_id = s.id
WHERE e.end_id {query_operator} ? AND e.rel_id = ?
ORDER BY e.weight DESC
LIMIT 7
"""
cursor.execute(sql_in, (query_path, rel_url))
results_in = cursor.fetchall()
elapsed = time.time() - start_time
log_progress(f" {rel_name} incoming: {len(results_in)} in {elapsed:.3f}s", "DEBUG")
for label, weight in results_in:
rel_md += f"- *{label}* {rel_name} β†’ **{word}** `[{weight:.3f}]`\n"
found = True
total += 1
if not found:
rel_md += "*No results*\n"
rel_md += "\n"
output_md += rel_md # Add this relation's results to the full markdown
progress((i + 1) / num_relations, desc=f"βœ“ {rel_name}")
yield output_md # --- PROGRESS FIX: Yield intermediate results ---
output_md += f"---\n**Total relations:** {total}\n"
log_progress(f"Profile complete: {total} relations", "SUCCESS")
progress(1.0, desc="βœ… Complete!")
yield output_md # Yield final result
except Exception as e:
log_progress(f"Error: {e}", "ERROR")
import traceback
traceback.print_exc()
yield f"**❌ Error:** {e}"
def run_query(start_node, relation, end_node, limit, progress=gr.Progress()):
"""Query builder - FIXED to use relation names from dropdown."""
log_progress(f"Query: start={start_node}, rel={relation}, end={end_node}", "INFO")
progress(0, desc="Building...")
if not DB_PATH:
return pd.DataFrame(), "❌ **Error:** Database file not found."
query = """
SELECT
e.id, s.id, r.label, en.id, e.weight, s.label, en.label
FROM edge e
JOIN relation r ON e.rel_id = r.id
JOIN node s ON e.start_id = s.id
JOIN node en ON e.end_id = en.id
WHERE 1=1
"""
params = []
try:
with get_db_connection() as conn:
progress(0.3, desc="Adding filters...")
# Start node
if start_node and start_node.strip():
if start_node.startswith('http://'):
pattern = f"{start_node}%"
else:
pattern = f"{CONCEPTNET_BASE}/c/en/{start_node.strip().lower().replace(' ', '_')}%"
query += " AND s.id LIKE ?"
params.append(pattern)
# Relation - FIX: Use dictionary
if relation and relation.strip():
rel_value = CONCEPTNET_RELATIONS.get(relation.strip())
if rel_value:
query += " AND r.id = ?"
params.append(rel_value)
log_progress(f"Relation: {rel_value}", "DEBUG")
# If relation is blank or invalid, filter is just not added
# End node
if end_node and end_node.strip():
if end_node.startswith('http://'):
pattern = f"{end_node}%"
else:
pattern = f"{CONCEPTNET_BASE}/c/en/{end_node.strip().lower().replace(' ', '_')}%"
query += " AND en.id LIKE ?"
params.append(pattern)
query += " ORDER BY e.weight DESC LIMIT ?"
params.append(limit)
progress(0.6, desc="Executing...")
start_time = time.time()
df = pd.read_sql_query(query, conn, params=params)
elapsed = time.time() - start_time
log_progress(f"Query done: {len(df)} rows in {elapsed:.2f}s", "SUCCESS")
progress(1.0, desc="Done!")
if df.empty:
return pd.DataFrame(), f"⚠️ No results ({elapsed:.2f}s)"
df.columns = ['edge_id', 'start_id', 'relation', 'end_id', 'weight', 'start_label', 'end_label']
return df, f"βœ… {len(df)} results in {elapsed:.2f}s"
except Exception as e:
log_progress(f"Error: {e}", "ERROR")
import traceback
traceback.print_exc()
return pd.DataFrame(), f"❌ {e}"
def run_raw_query(sql_query):
"""Execute a raw SELECT SQL query."""
if not sql_query.strip().upper().startswith("SELECT"):
return pd.DataFrame(), "❌ Only SELECT queries are allowed."
if not DB_PATH:
return pd.DataFrame(), "❌ **Error:** Database file not found."
try:
with get_db_connection() as conn:
start = time.time()
df = pd.read_sql_query(sql_query, conn)
elapsed = time.time() - start
return df, f"βœ… {len(df)} rows in {elapsed:.3f}s"
except Exception as e:
return pd.DataFrame(), f"❌ {e}"
def get_schema_info():
"""Get schema information, including tables, row counts, and indices."""
if not DB_PATH:
return "❌ **Error:** Database file not found."
md = f"# πŸ“š Schema\n\n"
md += f"**Repo:** [{INDEXED_REPO_ID}](https://huggingface.co/datasets/{INDEXED_REPO_ID})\n\n"
md += "**Key:** Relations and Nodes use full `http://conceptnet.io/...` URLs.\n\n"
try:
with get_db_connection() as conn:
cursor = conn.cursor()
md += "## Tables & Row Counts\n\n"
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'")
tables = cursor.fetchall()
for (table,) in tables:
cursor.execute(f"SELECT COUNT(*) FROM {table}")
md += f"- **{table}:** {cursor.fetchone()[0]:,} rows\n"
md += "\n## Indices\n\n"
cursor.execute("SELECT name, sql FROM sqlite_master WHERE type='index'")
for name, sql in cursor.fetchall():
if sql: # Filter out auto-indices
md += f"- **{name}:** `{sql}`\n"
md += "\n## Common Relations\n\n"
cursor.execute("SELECT id, label FROM relation ORDER BY label LIMIT 20")
for rel_id, label in cursor.fetchall():
md += f"- **{label}:** `{rel_id}`\n"
except Exception as e:
md += f"\n**❌ Error:** {e}\n"
return md
# ===== Build Gradio UI =====
with gr.Blocks(title="ConceptNet Explorer", theme=gr.themes.Soft()) as demo:
gr.Markdown("# 🧠 ConceptNet Explorer")
gr.Markdown(f"**Languages:** {', '.join([l.upper() for l in TARGET_LANGUAGES])} | **Relations:** {len(CONCEPTNET_RELATIONS)} types")
if not DB_PATH:
gr.Markdown("## ❌ ERROR: DATABASE FILE NOT FOUND")
gr.Markdown("This app cannot start because the SQLite database file could not be downloaded from Hugging Face Hub. Please check the logs.")
else:
with gr.Tabs():
with gr.TabItem("πŸ” Semantic Profile"):
gr.Markdown("**Explore semantic relations for any word. Progress bar and output will update live.**")
with gr.Row():
word_input = gr.Textbox(
label="Word",
placeholder="e.g., dog, hund, perro",
value="dog",
scale=3
)
lang_input = gr.Dropdown(
choices=TARGET_LANGUAGES,
value="en",
label="Language",
scale=1
)
with gr.Accordion("Select Relations (fewer = faster)", open=False):
relation_input = gr.CheckboxGroup(
choices=list(CONCEPTNET_RELATIONS.keys()),
label="Relations to Query",
value=["IsA", "RelatedTo", "PartOf", "HasA", "UsedFor", "CapableOf", "Synonym", "Antonym", "AtLocation", "HasProperty"]
)
semantic_btn = gr.Button("πŸ” Get Semantic Profile", variant="primary", size="lg")
semantic_output = gr.Markdown(value="Click the button to get the semantic profile.")
gr.Examples(
examples=[
["dog", "en"],
["hund", "de"],
["perro", "es"],
["chat", "fr"],
["knowledge", "en"],
],
inputs=[word_input, lang_input],
label="Examples"
)
with gr.TabItem("⚑ Query Builder"):
gr.Markdown("**Build custom relationship queries using dropdowns.**")
with gr.Row():
start_input = gr.Textbox(label="Start Node", placeholder="dog (optional)")
rel_input = gr.Dropdown(
choices=[""] + list(CONCEPTNET_RELATIONS.keys()), # Add "" for 'any'
label="Relation",
value="IsA",
info="Leave blank to query all relations"
)
end_input = gr.Textbox(label="End Node", placeholder="(optional)")
limit_slider = gr.Slider(label="Limit", minimum=1, maximum=500, value=50, step=1)
query_btn = gr.Button("▢️ Run Query", variant="primary", size="lg")
status_output = gr.Markdown()
# BUG FIX: Removed 'height' argument from constructor as it caused a TypeError.
results_output = gr.DataFrame(wrap=True)
with gr.TabItem("πŸ’» Raw SQL"):
gr.Markdown("**Execute custom `SELECT` SQL queries against the database.**")
raw_sql_input = gr.Textbox(
label="SQL Query",
value=f"SELECT s.label, r.label, en.label, e.weight\nFROM edge e\nJOIN node s ON e.start_id = s.id\nJOIN node en ON e.end_id = en.id\nJOIN relation r ON e.rel_id = r.id\nWHERE s.id = '{CONCEPTNET_BASE}/c/en/dog'\nAND r.id = '{CONCEPTNET_BASE}/r/IsA'\nORDER BY e.weight DESC\nLIMIT 10",
lines=5,
elem_classes=["font-mono"] # Use a CSS class for styling if needed
)
raw_btn = gr.Button("▢️ Execute")
raw_status = gr.Markdown()
# BUG FIX: Removed 'height' argument from constructor.
raw_results = gr.DataFrame()
with gr.TabItem("πŸ“Š Schema"):
gr.Markdown("**View database schema, tables, and indices.**")
schema_btn = gr.Button("πŸ“Š Load Schema Info")
schema_output = gr.Markdown()
# --- Button Click Handlers (with API integration) ---
semantic_btn.click(
get_semantic_profile,
inputs=[word_input, lang_input, relation_input],
outputs=semantic_output,
api_name="get_semantic_profile" # <-- API Integration
)
query_btn.click(
run_query,
inputs=[start_input, rel_input, end_input, limit_slider],
outputs=[results_output, status_output],
api_name="run_query" # <-- API Integration
)
raw_btn.click(
run_raw_query,
inputs=raw_sql_input,
outputs=[raw_results, raw_status],
api_name="run_raw_query" # <-- API Integration
)
# Load schema info automatically and on button click
demo.load(
get_schema_info,
None,
schema_output,
api_name="get_schema" # <-- API Integration
)
schema_btn.click(
get_schema_info,
None,
schema_output,
api_name="get_schema" # <-- API Integration (re-using endpoint is fine)
)
if __name__ == "__main__":
if DB_PATH:
log_progress("APP READY!", "SUCCESS")
else:
log_progress("APP LAUNCHING WITH ERRORS (DB NOT FOUND)", "ERROR")
demo.launch(ssr_mode=False)