Spaces:

cstr
/

conceptnet_db

Sleeping

App Files Files Community

conceptnet_db / app.py

cstr

Update app.py

ee99d6d verified about 1 month ago

raw

history blame contribute delete

21.9 kB

	import gradio as gr
	import sqlite3
	import pandas as pd
	from huggingface_hub import hf_hub_download, HfApi
	import os
	import time
	import json
	from typing import Dict, List, Optional

	# ===== CONFIGURATION =====
	TARGET_LANGUAGES = ['de', 'en', 'es', 'fr', 'it', 'ja', 'nl', 'pl', 'pt', 'ru', 'zh']
	INDEXED_REPO_ID = "cstr/conceptnet-de-indexed"
	INDEXED_DB_FILENAME = "conceptnet-de-indexed.db"
	PROGRESS_FILENAME = "indexing_progress.json"
	CONCEPTNET_BASE = "http://conceptnet.io"
	# =========================

	# --- All relations MUST be full URLs ---
	CONCEPTNET_RELATIONS: Dict[str, str] = {
	"RelatedTo": f"{CONCEPTNET_BASE}/r/RelatedTo",
	"IsA": f"{CONCEPTNET_BASE}/r/IsA",
	"PartOf": f"{CONCEPTNET_BASE}/r/PartOf",
	"HasA": f"{CONCEPTNET_BASE}/r/HasA",
	"UsedFor": f"{CONCEPTNET_BASE}/r/UsedFor",
	"CapableOf": f"{CONCEPTNET_BASE}/r/CapableOf",
	"AtLocation": f"{CONCEPTNET_BASE}/r/AtLocation",
	"Causes": f"{CONCEPTNET_BASE}/r/Causes",
	"HasSubevent": f"{CONCEPTNET_BASE}/r/HasSubevent",
	"HasFirstSubevent": f"{CONCEPTNET_BASE}/r/HasFirstSubevent",
	"HasLastSubevent": f"{CONCEPTNET_BASE}/r/HasLastSubevent",
	"HasPrerequisite": f"{CONCEPTNET_BASE}/r/HasPrerequisite",
	"HasProperty": f"{CONCEPTNET_BASE}/r/HasProperty",
	"MotivatedByGoal": f"{CONCEPTNET_BASE}/r/MotivatedByGoal",
	"ObstructedBy": f"{CONCEPTNET_BASE}/r/ObstructedBy",
	"Desires": f"{CONCEPTNET_BASE}/r/Desires",
	"CreatedBy": f"{CONCEPTNET_BASE}/r/CreatedBy",
	"Synonym": f"{CONCEPTNET_BASE}/r/Synonym",
	"Antonym": f"{CONCEPTNET_BASE}/r/Antonym",
	"DistinctFrom": f"{CONCEPTNET_BASE}/r/DistinctFrom",
	"DerivedFrom": f"{CONCEPTNET_BASE}/r/DerivedFrom",
	"SymbolOf": f"{CONCEPTNET_BASE}/r/SymbolOf",
	"DefinedAs": f"{CONCEPTNET_BASE}/r/DefinedAs",
	"MannerOf": f"{CONCEPTNET_BASE}/r/MannerOf",
	"LocatedNear": f"{CONCEPTNET_BASE}/r/LocatedNear",
	"HasContext": f"{CONCEPTNET_BASE}/r/HasContext",
	"SimilarTo": f"{CONCEPTNET_BASE}/r/SimilarTo",
	"EtymologicallyRelatedTo": f"{CONCEPTNET_BASE}/r/EtymologicallyRelatedTo",
	"EtymologicallyDerivedFrom": f"{CONCEPTNET_BASE}/r/EtymologicallyDerivedFrom",
	"CausesDesire": f"{CONCEPTNET_BASE}/r/CausesDesire",
	"MadeOf": f"{CONCEPTNET_BASE}/r/MadeOf",
	"ReceivesAction": f"{CONCEPTNET_BASE}/r/ReceivesAction",
	"ExternalURL": f"{CONCEPTNET_BASE}/r/ExternalURL",
	"NotDesires": f"{CONCEPTNET_BASE}/r/NotDesires",
	"NotUsedFor": f"{CONCEPTNET_BASE}/r/NotUsedFor",
	"NotCapableOf": f"{CONCEPTNET_BASE}/r/NotCapableOf",
	"NotHasProperty": f"{CONCEPTNET_BASE}/r/NotHasProperty",
	}
	# =========================

	print(f"🌍 Languages: {', '.join([l.upper() for l in TARGET_LANGUAGES])}")
	print(f"📚 Relations: {len(CONCEPTNET_RELATIONS)} relations loaded")

	HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HF_API_TOKEN")

	def log_progress(message, level="INFO"):
	"""Simple logger with timestamp and emoji prefix."""
	timestamp = time.strftime("%H:%M:%S")
	prefix = {"INFO": "ℹ️ ", "SUCCESS": "✅", "ERROR": "❌", "WARN": "⚠️ ", "DEBUG": "🔍"}.get(level, "")
	print(f"[{timestamp}] {prefix} {message}")

	def check_remote_progress():
	"""Check Hugging Face Hub for indexing progress."""
	if not HF_TOKEN:
	return {"indexing_complete": False}
	try:
	api = HfApi()
	api.repo_info(repo_id=INDEXED_REPO_ID, repo_type="dataset", token=HF_TOKEN)
	progress_path = hf_hub_download(repo_id=INDEXED_REPO_ID, filename=PROGRESS_FILENAME, repo_type="dataset", token=HF_TOKEN)
	with open(progress_path, 'r') as f:
	return json.load(f)
	except:
	return {"indexing_complete": False}

	def create_indexed_database():
	"""Download the pre-indexed database from HF Hub if indexing is complete."""
	progress = check_remote_progress()
	if progress.get("indexing_complete"):
	log_progress("Remote indexing is complete. Downloading DB...", "INFO")
	try:
	return hf_hub_download(repo_id=INDEXED_REPO_ID, filename=INDEXED_DB_FILENAME, repo_type="dataset", token=HF_TOKEN)
	except Exception as e:
	log_progress(f"Failed to download indexed DB: {e}", "ERROR")
	pass
	log_progress("Remote indexing not complete or DB not found.", "WARN")
	return None

	DB_PATH = create_indexed_database()

	if not DB_PATH:
	log_progress("DATABASE NOT FOUND. App will not function.", "ERROR")
	else:
	log_progress(f"Database loaded from: {DB_PATH}", "SUCCESS")

	def get_db_connection():
	"""Get a thread-safe connection to the SQLite database."""
	if not DB_PATH:
	raise Exception("Database path is not set. Cannot create connection.")
	conn = sqlite3.connect(DB_PATH, check_same_thread=False)
	conn.execute("PRAGMA cache_size = -256000") # 256MB cache
	conn.execute("PRAGMA temp_store = MEMORY")
	return conn

	def get_semantic_profile(word: str, lang: str = 'en', selected_relations: List[str] = None, progress=gr.Progress()):
	"""
	Get semantic profile for a word.
	FIX: This is now a generator to stream updates and show progress.
	FIX: Uses exact node match first for performance.
	"""
	log_progress(f"Profile: {word} ({lang})", "INFO")

	if not word or lang not in TARGET_LANGUAGES:
	yield "⚠️ Invalid input"
	return

	if not DB_PATH:
	yield "❌ Error: Database file not found."
	return

	# Set default relations if none are selected
	if not selected_relations:
	selected_relations = [
	"IsA", "RelatedTo", "PartOf", "HasA", "UsedFor",
	"CapableOf", "Synonym", "Antonym"
	]

	word = word.strip().lower().replace(' ', '_')
	exact_path = f"{CONCEPTNET_BASE}/c/{lang}/{word}"

	output_md = f"# 🧠 Semantic Profile: '{word}' ({lang.upper()})\n\n"

	try:
	with get_db_connection() as conn:
	cursor = conn.cursor()

	progress(0, desc="Starting...")
	yield output_md

	progress(0.05, desc="Finding nodes...")

	# --- PERFORMANCE FIX: Try exact match first ---
	cursor.execute("SELECT id, label FROM node WHERE id = ?", (exact_path,))
	exact_node = cursor.fetchone()

	query_path = None
	query_operator = ""
	nodes = []

	if exact_node:
	log_progress(f"Found exact node: {exact_node[0]}", "SUCCESS")
	nodes = [exact_node]
	query_path = exact_path
	query_operator = "=" # Use fast exact match
	else:
	log_progress(f"No exact node, falling back to LIKE...", "WARN")
	like_path = f"{exact_path}%"
	cursor.execute("SELECT id, label FROM node WHERE id LIKE ? LIMIT 5", (like_path,))
	nodes = cursor.fetchall()
	if not nodes:
	yield f"# 🧠 '{word}'\n\n⚠️ Not found"
	return
	query_path = like_path # Use slower LIKE match
	query_operator = "LIKE"

	log_progress(f"Using path: {query_path} (op: {query_operator})", "INFO")

	for node_id, label in nodes[:3]:
	output_md += f"Node: `{node_id}` → {label}\n"
	output_md += "\n"

	yield output_md # Yield after finding nodes

	total = 0

	# Filter relations to query
	relations_to_query = [(name, CONCEPTNET_RELATIONS[name]) for name in selected_relations if name in CONCEPTNET_RELATIONS]
	num_relations = len(relations_to_query)

	if num_relations == 0:
	output_md += "⚠️ No relations selected."
	yield output_md
	return

	for i, (rel_name, rel_url) in enumerate(relations_to_query):
	progress((i + 0.1) / num_relations, desc=f"🔎 {rel_name}...")

	rel_md = f"## {rel_name}\n\n"
	found = False

	# Outgoing
	start_time = time.time()
	sql_out = f"""
	SELECT en.label, e.weight
	FROM edge e
	JOIN node en ON e.end_id = en.id
	WHERE e.start_id {query_operator} ? AND e.rel_id = ?
	ORDER BY e.weight DESC
	LIMIT 7
	"""
	cursor.execute(sql_out, (query_path, rel_url))
	results_out = cursor.fetchall()
	elapsed = time.time() - start_time
	log_progress(f" {rel_name} outgoing: {len(results_out)} in {elapsed:.3f}s", "DEBUG")

	for label, weight in results_out:
	rel_md += f"- {word} {rel_name} → {label} `[{weight:.3f}]`\n"
	found = True
	total += 1

	# Incoming
	start_time = time.time()
	sql_in = f"""
	SELECT s.label, e.weight
	FROM edge e
	JOIN node s ON e.start_id = s.id
	WHERE e.end_id {query_operator} ? AND e.rel_id = ?
	ORDER BY e.weight DESC
	LIMIT 7
	"""
	cursor.execute(sql_in, (query_path, rel_url))
	results_in = cursor.fetchall()
	elapsed = time.time() - start_time
	log_progress(f" {rel_name} incoming: {len(results_in)} in {elapsed:.3f}s", "DEBUG")

	for label, weight in results_in:
	rel_md += f"- {label} {rel_name} → {word} `[{weight:.3f}]`\n"
	found = True
	total += 1

	if not found:
	rel_md += "No results\n"

	rel_md += "\n"

	output_md += rel_md # Add this relation's results to the full markdown
	progress((i + 1) / num_relations, desc=f"✓ {rel_name}")
	yield output_md # --- PROGRESS FIX: Yield intermediate results ---

	output_md += f"---\nTotal relations: {total}\n"
	log_progress(f"Profile complete: {total} relations", "SUCCESS")
	progress(1.0, desc="✅ Complete!")
	yield output_md # Yield final result

	except Exception as e:
	log_progress(f"Error: {e}", "ERROR")
	import traceback
	traceback.print_exc()
	yield f"❌ Error: {e}"

	def run_query(start_node, relation, end_node, limit, progress=gr.Progress()):
	"""Query builder - FIXED to use relation names from dropdown."""
	log_progress(f"Query: start={start_node}, rel={relation}, end={end_node}", "INFO")
	progress(0, desc="Building...")

	if not DB_PATH:
	return pd.DataFrame(), "❌ Error: Database file not found."

	query = """
	SELECT
	e.id, s.id, r.label, en.id, e.weight, s.label, en.label
	FROM edge e
	JOIN relation r ON e.rel_id = r.id
	JOIN node s ON e.start_id = s.id
	JOIN node en ON e.end_id = en.id
	WHERE 1=1
	"""

	params = []

	try:
	with get_db_connection() as conn:
	progress(0.3, desc="Adding filters...")

	# Start node
	if start_node and start_node.strip():
	if start_node.startswith('http://'):
	pattern = f"{start_node}%"
	else:
	pattern = f"{CONCEPTNET_BASE}/c/en/{start_node.strip().lower().replace(' ', '_')}%"
	query += " AND s.id LIKE ?"
	params.append(pattern)

	# Relation - FIX: Use dictionary
	if relation and relation.strip():
	rel_value = CONCEPTNET_RELATIONS.get(relation.strip())
	if rel_value:
	query += " AND r.id = ?"
	params.append(rel_value)
	log_progress(f"Relation: {rel_value}", "DEBUG")
	# If relation is blank or invalid, filter is just not added

	# End node
	if end_node and end_node.strip():
	if end_node.startswith('http://'):
	pattern = f"{end_node}%"
	else:
	pattern = f"{CONCEPTNET_BASE}/c/en/{end_node.strip().lower().replace(' ', '_')}%"
	query += " AND en.id LIKE ?"
	params.append(pattern)

	query += " ORDER BY e.weight DESC LIMIT ?"
	params.append(limit)

	progress(0.6, desc="Executing...")

	start_time = time.time()
	df = pd.read_sql_query(query, conn, params=params)
	elapsed = time.time() - start_time

	log_progress(f"Query done: {len(df)} rows in {elapsed:.2f}s", "SUCCESS")

	progress(1.0, desc="Done!")

	if df.empty:
	return pd.DataFrame(), f"⚠️ No results ({elapsed:.2f}s)"

	df.columns = ['edge_id', 'start_id', 'relation', 'end_id', 'weight', 'start_label', 'end_label']
	return df, f"✅ {len(df)} results in {elapsed:.2f}s"

	except Exception as e:
	log_progress(f"Error: {e}", "ERROR")
	import traceback
	traceback.print_exc()
	return pd.DataFrame(), f"❌ {e}"

	def run_raw_query(sql_query):
	"""Execute a raw SELECT SQL query."""
	if not sql_query.strip().upper().startswith("SELECT"):
	return pd.DataFrame(), "❌ Only SELECT queries are allowed."

	if not DB_PATH:
	return pd.DataFrame(), "❌ Error: Database file not found."

	try:
	with get_db_connection() as conn:
	start = time.time()
	df = pd.read_sql_query(sql_query, conn)
	elapsed = time.time() - start
	return df, f"✅ {len(df)} rows in {elapsed:.3f}s"
	except Exception as e:
	return pd.DataFrame(), f"❌ {e}"

	def get_schema_info():
	"""Get schema information, including tables, row counts, and indices."""
	if not DB_PATH:
	return "❌ Error: Database file not found."

	md = f"# 📚 Schema\n\n"
	md += f"Repo: [{INDEXED_REPO_ID}](https://huggingface.co/datasets/{INDEXED_REPO_ID})\n\n"
	md += "Key: Relations and Nodes use full `http://conceptnet.io/...` URLs.\n\n"

	try:
	with get_db_connection() as conn:
	cursor = conn.cursor()

	md += "## Tables & Row Counts\n\n"
	cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'")
	tables = cursor.fetchall()
	for (table,) in tables:
	cursor.execute(f"SELECT COUNT(*) FROM {table}")
	md += f"- {table}: {cursor.fetchone()[0]:,} rows\n"

	md += "\n## Indices\n\n"
	cursor.execute("SELECT name, sql FROM sqlite_master WHERE type='index'")
	for name, sql in cursor.fetchall():
	if sql: # Filter out auto-indices
	md += f"- {name}: `{sql}`\n"

	md += "\n## Common Relations\n\n"
	cursor.execute("SELECT id, label FROM relation ORDER BY label LIMIT 20")
	for rel_id, label in cursor.fetchall():
	md += f"- {label}: `{rel_id}`\n"

	except Exception as e:
	md += f"\n❌ Error: {e}\n"

	return md

	# ===== Build Gradio UI =====
	with gr.Blocks(title="ConceptNet Explorer", theme=gr.themes.Soft()) as demo:
	gr.Markdown("# 🧠 ConceptNet Explorer")
	gr.Markdown(f"Languages: {', '.join([l.upper() for l in TARGET_LANGUAGES])} \| Relations: {len(CONCEPTNET_RELATIONS)} types")

	if not DB_PATH:
	gr.Markdown("## ❌ ERROR: DATABASE FILE NOT FOUND")
	gr.Markdown("This app cannot start because the SQLite database file could not be downloaded from Hugging Face Hub. Please check the logs.")

	else:
	with gr.Tabs():
	with gr.TabItem("🔍 Semantic Profile"):
	gr.Markdown("Explore semantic relations for any word. Progress bar and output will update live.")

	with gr.Row():
	word_input = gr.Textbox(
	label="Word",
	placeholder="e.g., dog, hund, perro",
	value="dog",
	scale=3
	)
	lang_input = gr.Dropdown(
	choices=TARGET_LANGUAGES,
	value="en",
	label="Language",
	scale=1
	)

	with gr.Accordion("Select Relations (fewer = faster)", open=False):
	relation_input = gr.CheckboxGroup(
	choices=list(CONCEPTNET_RELATIONS.keys()),
	label="Relations to Query",
	value=["IsA", "RelatedTo", "PartOf", "HasA", "UsedFor", "CapableOf", "Synonym", "Antonym", "AtLocation", "HasProperty"]
	)

	semantic_btn = gr.Button("🔍 Get Semantic Profile", variant="primary", size="lg")
	semantic_output = gr.Markdown(value="Click the button to get the semantic profile.")

	gr.Examples(
	examples=[
	["dog", "en"],
	["hund", "de"],
	["perro", "es"],
	["chat", "fr"],
	["knowledge", "en"],
	],
	inputs=[word_input, lang_input],
	label="Examples"
	)

	with gr.TabItem("⚡ Query Builder"):
	gr.Markdown("Build custom relationship queries using dropdowns.")

	with gr.Row():
	start_input = gr.Textbox(label="Start Node", placeholder="dog (optional)")
	rel_input = gr.Dropdown(
	choices=[""] + list(CONCEPTNET_RELATIONS.keys()), # Add "" for 'any'
	label="Relation",
	value="IsA",
	info="Leave blank to query all relations"
	)
	end_input = gr.Textbox(label="End Node", placeholder="(optional)")

	limit_slider = gr.Slider(label="Limit", minimum=1, maximum=500, value=50, step=1)
	query_btn = gr.Button("▶️ Run Query", variant="primary", size="lg")

	status_output = gr.Markdown()

	# BUG FIX: Removed 'height' argument from constructor as it caused a TypeError.
	results_output = gr.DataFrame(wrap=True)

	with gr.TabItem("💻 Raw SQL"):
	gr.Markdown("Execute custom `SELECT` SQL queries against the database.")

	raw_sql_input = gr.Textbox(
	label="SQL Query",
	value=f"SELECT s.label, r.label, en.label, e.weight\nFROM edge e\nJOIN node s ON e.start_id = s.id\nJOIN node en ON e.end_id = en.id\nJOIN relation r ON e.rel_id = r.id\nWHERE s.id = '{CONCEPTNET_BASE}/c/en/dog'\nAND r.id = '{CONCEPTNET_BASE}/r/IsA'\nORDER BY e.weight DESC\nLIMIT 10",
	lines=5,
	elem_classes=["font-mono"] # Use a CSS class for styling if needed
	)

	raw_btn = gr.Button("▶️ Execute")
	raw_status = gr.Markdown()

	# BUG FIX: Removed 'height' argument from constructor.
	raw_results = gr.DataFrame()

	with gr.TabItem("📊 Schema"):
	gr.Markdown("View database schema, tables, and indices.")
	schema_btn = gr.Button("📊 Load Schema Info")
	schema_output = gr.Markdown()

	# --- Button Click Handlers (with API integration) ---
	semantic_btn.click(
	get_semantic_profile,
	inputs=[word_input, lang_input, relation_input],
	outputs=semantic_output,
	api_name="get_semantic_profile" # <-- API Integration
	)

	query_btn.click(
	run_query,
	inputs=[start_input, rel_input, end_input, limit_slider],
	outputs=[results_output, status_output],
	api_name="run_query" # <-- API Integration
	)

	raw_btn.click(
	run_raw_query,
	inputs=raw_sql_input,
	outputs=[raw_results, raw_status],
	api_name="run_raw_query" # <-- API Integration
	)

	# Load schema info automatically and on button click
	demo.load(
	get_schema_info,
	None,
	schema_output,
	api_name="get_schema" # <-- API Integration
	)
	schema_btn.click(
	get_schema_info,
	None,
	schema_output,
	api_name="get_schema" # <-- API Integration (re-using endpoint is fine)
	)

	if __name__ == "__main__":
	if DB_PATH:
	log_progress("APP READY!", "SUCCESS")
	else:
	log_progress("APP LAUNCHING WITH ERRORS (DB NOT FOUND)", "ERROR")
	demo.launch(ssr_mode=False)