Spaces:

cstr
/

conceptnet_db

Sleeping

App Files Files Community

conceptnet_db / app.py

cstr

Update app.py

45626f2 verified about 1 month ago

raw

history blame

17.3 kB

	import gradio as gr
	import sqlite3
	import pandas as pd
	from huggingface_hub import hf_hub_download, snapshot_download
	import os
	import time
	import shutil
	from pathlib import Path

	# ===== CONFIGURATION =====
	TARGET_LANGUAGES = ['de']
	INDEXED_DB_PATH = "/tmp/conceptnet-indexed.db"
	# =========================

	print(f"🌍 Filtering to: {', '.join([l.upper() for l in TARGET_LANGUAGES])}")

	# Download original database
	REPO_ID = "ysenarath/conceptnet-sqlite"
	DB_FILENAME = "data/conceptnet-v5.7.0.db"

	ORIGINAL_DB_PATH = hf_hub_download(repo_id=REPO_ID, filename=DB_FILENAME, repo_type="dataset")
	print(f"Original database: {ORIGINAL_DB_PATH}")

	def create_indexed_database():
	"""
	Copy database and create missing indices for fast queries.
	This runs once on startup.
	"""
	if os.path.exists(INDEXED_DB_PATH):
	db_age = time.time() - os.path.getmtime(INDEXED_DB_PATH)
	if db_age < 24 * 3600: # Less than 24 hours old
	print(f"✅ Using existing indexed database: {INDEXED_DB_PATH}")
	print(f" (Created {db_age/3600:.1f} hours ago)")
	return INDEXED_DB_PATH
	else:
	print(f"⚠️ Indexed database is {db_age/3600:.1f} hours old, recreating...")
	os.remove(INDEXED_DB_PATH)

	print("\n" + "="*60)
	print("CREATING INDEXED DATABASE (ONE-TIME SETUP)")
	print("="*60)
	print(f"This will take ~2-5 minutes but only needs to run once.")
	print(f"Subsequent runs will be instant.\n")

	# Check if we have enough space
	original_size = os.path.getsize(ORIGINAL_DB_PATH)
	free_space = shutil.disk_usage("/tmp")[2]

	print(f"Original DB size: {original_size / (2**30):.2f} GB")
	print(f"Free space in /tmp: {free_space / (2**30):.2f} GB")

	if free_space < original_size * 1.5:
	print("⚠️ WARNING: Low disk space! Indices will add ~20% to DB size.")
	print("Continuing anyway...\n")

	# Copy database
	print(f"1. Copying database to {INDEXED_DB_PATH}...")
	start = time.time()
	shutil.copy2(ORIGINAL_DB_PATH, INDEXED_DB_PATH)
	elapsed = time.time() - start
	print(f" ✓ Copied in {elapsed:.1f}s\n")

	# Connect and create indices
	print("2. Creating indices on edge table...")
	conn = sqlite3.connect(INDEXED_DB_PATH)
	cursor = conn.cursor()

	# Enable optimizations for index creation
	cursor.execute("PRAGMA journal_mode = WAL")
	cursor.execute("PRAGMA synchronous = NORMAL")
	cursor.execute("PRAGMA cache_size = -256000")
	cursor.execute("PRAGMA temp_store = MEMORY")

	indices_to_create = [
	("idx_edge_start_id", "edge", "start_id", "Speed up queries filtering by start node"),
	("idx_edge_end_id", "edge", "end_id", "Speed up queries filtering by end node"),
	("idx_edge_rel_id", "edge", "rel_id", "Speed up queries filtering by relation"),
	]

	for idx_name, table, column, description in indices_to_create:
	print(f" Creating {idx_name} on {table}({column})...")
	print(f" Purpose: {description}")
	start = time.time()

	cursor.execute(f"CREATE INDEX IF NOT EXISTS {idx_name} ON {table}({column})")

	elapsed = time.time() - start
	print(f" ✓ Created in {elapsed:.1f}s\n")

	# Analyze for query optimization
	print("3. Running ANALYZE to optimize query planning...")
	start = time.time()
	cursor.execute("ANALYZE")
	elapsed = time.time() - start
	print(f" ✓ Analyzed in {elapsed:.1f}s\n")

	# Commit and close
	conn.commit()
	conn.close()

	# Check final size
	indexed_size = os.path.getsize(INDEXED_DB_PATH)
	size_increase = (indexed_size - original_size) / (2**30)

	print("="*60)
	print("INDEXING COMPLETE!")
	print("="*60)
	print(f"Original size: {original_size / (2**30):.2f} GB")
	print(f"Indexed size: {indexed_size / (2**30):.2f} GB")
	print(f"Size increase: +{size_increase:.2f} GB ({100size_increase/(original_size/(2*30)):.1f}%)")
	print(f"Location: {INDEXED_DB_PATH}")
	print("="*60 + "\n")

	return INDEXED_DB_PATH

	# Create indexed database on startup
	DB_PATH = create_indexed_database()

	def get_db_connection():
	"""Create optimized read connection to indexed database"""
	conn = sqlite3.connect(DB_PATH, check_same_thread=False)
	conn.execute("PRAGMA cache_size = -256000")
	conn.execute("PRAGMA mmap_size = 4294967296")
	conn.execute("PRAGMA temp_store = MEMORY")
	return conn

	def verify_indices():
	"""Verify that indices were created successfully"""
	print("\n" + "="*60)
	print("VERIFYING INDICES")
	print("="*60)

	with get_db_connection() as conn:
	cursor = conn.cursor()

	# Check edge table indices
	cursor.execute("PRAGMA index_list(edge)")
	indices = cursor.fetchall()

	print(f"\nEdge table indices: {len(indices)}")
	for idx in indices:
	idx_name = idx[1]
	cursor.execute(f"PRAGMA index_info({idx_name})")
	cols = cursor.fetchall()
	col_names = [c[2] for c in cols if c[2]] or ['PRIMARY KEY']
	print(f" ✓ {idx_name}: {', '.join(col_names)}")

	# Test query speed with EXPLAIN QUERY PLAN
	print("\n" + "="*60)
	print("TESTING QUERY PERFORMANCE")
	print("="*60)

	test_queries = [
	("Node query (indexed)", "SELECT * FROM node WHERE id LIKE '/c/de/hund%'"),
	("Edge start_id (NOW INDEXED!)", "SELECT * FROM edge WHERE start_id LIKE '/c/de/hund%' LIMIT 10"),
	("Edge end_id (NOW INDEXED!)", "SELECT * FROM edge WHERE end_id LIKE '/c/de/tier%' LIMIT 10"),
	]

	for name, query in test_queries:
	print(f"\n{name}:")

	# Show query plan
	cursor.execute(f"EXPLAIN QUERY PLAN {query}")
	plan = cursor.fetchall()
	uses_index = any('INDEX' in str(row).upper() for row in plan)

	for row in plan:
	print(f" Plan: {row}")

	# Time the query
	start = time.time()
	cursor.execute(query)
	results = cursor.fetchall()
	elapsed = time.time() - start

	status = "✅ FAST" if elapsed < 1 else "⚠️ SLOW" if elapsed < 5 else "❌ VERY SLOW"
	print(f" {status}: {len(results)} results in {elapsed:.3f}s")

	print("\n" + "="*60 + "\n")

	verify_indices()

	def get_semantic_profile(word, lang='de'):
	"""
	Semantic profile - NOW FAST with indices!
	"""
	if not word:
	return "⚠️ Please enter a word."

	if lang not in TARGET_LANGUAGES:
	return f"⚠️ Language '{lang}' not available."

	word = word.strip().lower().replace(' ', '_')
	like_path = f"/c/{lang}/{word}%"

	relations = [
	"/r/IsA", "/r/PartOf", "/r/HasA", "/r/UsedFor", "/r/CapableOf",
	"/r/Causes", "/r/HasProperty", "/r/Synonym", "/r/Antonym",
	"/r/AtLocation", "/r/RelatedTo"
	]

	output_md = f"# 🧠 Semantic Profile: '{word}'\n\n"

	try:
	with get_db_connection() as conn:
	cursor = conn.cursor()

	# Check if word exists
	cursor.execute("SELECT id, label FROM node WHERE id LIKE ?", (like_path,))
	nodes = cursor.fetchall()

	if not nodes:
	return f"# 🧠 Semantic Profile: '{word}'\n\n⚠️ No nodes found. Check spelling or try a more common word."

	for node_id, label in nodes[:3]:
	output_md += f"Node: `{node_id}` ({label})\n"
	output_md += "\n"

	total_relations = 0

	# Query each relation - NOW FAST with indices!
	for rel in relations:
	output_md += f"## {rel}\n\n"
	has_results = False

	# Outgoing edges - FAST with idx_edge_start_id
	cursor.execute("""
	SELECT en.label, e.weight
	FROM edge e
	JOIN node en ON e.end_id = en.id
	JOIN relation r ON e.rel_id = r.id
	WHERE e.start_id LIKE ? AND r.label = ?
	ORDER BY e.weight DESC
	LIMIT 7
	""", (like_path, rel))

	for label, weight in cursor.fetchall():
	output_md += f"- {word} {rel} → {label} `[{weight:.3f}]`\n"
	has_results = True
	total_relations += 1

	# Incoming edges - FAST with idx_edge_end_id
	cursor.execute("""
	SELECT s.label, e.weight
	FROM edge e
	JOIN node s ON e.start_id = s.id
	JOIN relation r ON e.rel_id = r.id
	WHERE e.end_id LIKE ? AND r.label = ?
	ORDER BY e.weight DESC
	LIMIT 7
	""", (like_path, rel))

	for label, weight in cursor.fetchall():
	output_md += f"- {label} {rel} → {word} `[{weight:.3f}]`\n"
	has_results = True
	total_relations += 1

	if not has_results:
	output_md += "No results\n"
	output_md += "\n"

	output_md += f"---\nTotal relations: {total_relations}\n"
	return output_md

	except Exception as e:
	print(f"ERROR: {e}")
	import traceback
	traceback.print_exc()
	return f"❌ Error:\n\n```\n{e}\n```"

	def run_query(start_node, relation, end_node, limit):
	"""Query builder - NOW FAST with indices!"""

	query = """
	SELECT
	e.id AS edge_id,
	s.id AS start_id,
	r.label AS relation,
	en.id AS end_id,
	e.weight,
	s.label AS start_label,
	en.label AS end_label
	FROM edge e
	JOIN relation r ON e.rel_id = r.id
	JOIN node s ON e.start_id = s.id
	JOIN node en ON e.end_id = en.id
	WHERE 1=1
	"""

	params = []

	try:
	with get_db_connection() as conn:
	# Language filter
	lang_filter = " OR ".join([f"(s.id LIKE '/c/{lang}/%' OR en.id LIKE '/c/{lang}/%')" for lang in TARGET_LANGUAGES])
	query += f" AND ({lang_filter})"

	# User filters
	if start_node:
	pattern = start_node if '%' in start_node else f"%{start_node}%"
	query += " AND s.id LIKE ?"
	params.append(pattern)

	if relation:
	if '%' in relation:
	query += " AND r.label LIKE ?"
	else:
	query += " AND r.label = ?"
	params.append(relation)

	if end_node:
	pattern = end_node if '%' in end_node else f"%{end_node}%"
	query += " AND en.id LIKE ?"
	params.append(pattern)

	query += " ORDER BY e.weight DESC LIMIT ?"
	params.append(limit)

	start_time = time.time()
	df = pd.read_sql_query(query, conn, params=params)
	elapsed = time.time() - start_time

	if df.empty:
	return pd.DataFrame(), f"No results ({elapsed:.2f}s)"

	df.columns = ['edge_id', 'start_id', 'relation', 'end_id', 'weight', 'start_label', 'end_label']
	return df, f"✅ {len(df)} results in {elapsed:.2f}s"

	except Exception as e:
	print(f"ERROR: {e}")
	import traceback
	traceback.print_exc()
	return pd.DataFrame(), f"❌ Error: {e}"

	def run_raw_query(sql_query):
	"""Execute raw SQL"""
	if not sql_query.strip().upper().startswith("SELECT"):
	return pd.DataFrame(), "Only SELECT allowed"

	try:
	with get_db_connection() as conn:
	start = time.time()
	df = pd.read_sql_query(sql_query, conn)
	elapsed = time.time() - start
	return df, f"✅ {len(df)} rows in {elapsed:.2f}s"
	except Exception as e:
	return pd.DataFrame(), f"Error: {e}"

	def get_schema_info():
	"""Get schema with index info"""
	with get_db_connection() as conn:
	cursor = conn.cursor()

	md = "# 📚 Database Schema\n\n"
	md += "✅ Custom indices created for fast queries!\n\n"

	cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'")
	tables = cursor.fetchall()

	for table_name, in tables:
	cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
	count = cursor.fetchone()[0]
	md += f"## {table_name} ({count:,} rows)\n\n"

	# Columns
	cursor.execute(f"PRAGMA table_info({table_name})")
	cols = cursor.fetchall()
	md += "\| Column \| Type \| Null \| PK \|\n\|:--\|:--\|:--\|:--\|\n"
	for col in cols:
	md += f"\| `{col[1]}` \| `{col[2]}` \| {'✗' if col[3] else '✓'} \| {'✓' if col[5] else '✗'} \|\n"

	# Indices
	cursor.execute(f"PRAGMA index_list({table_name})")
	indices = cursor.fetchall()

	if indices:
	md += f"\nIndices ({len(indices)}):\n"
	for idx in indices:
	cursor.execute(f"PRAGMA index_info({idx[1]})")
	idx_cols = cursor.fetchall()
	cols_str = ', '.join([c[2] for c in idx_cols if c[2]]) or 'id'

	# Mark custom indices
	custom = "🆕 CUSTOM" if idx[1].startswith("idx_") else ""
	md += f"- `{idx[1]}` on ({cols_str}) {custom}\n"

	md += "\n---\n\n"

	return md

	# Gradio UI
	with gr.Blocks(title="ConceptNet Explorer (INDEXED)", theme=gr.themes.Soft()) as demo:
	gr.Markdown("# 🧠 ConceptNet Explorer (With Custom Indices! 🚀)")

	db_size = os.path.getsize(DB_PATH) / (2**30)
	gr.Markdown(
	f"Database: {os.path.basename(DB_PATH)} ({db_size:.2f} GB) \| "
	f"Language: {', '.join([l.upper() for l in TARGET_LANGUAGES])} \| "
	f"Status: ✅ Indexed & Fast"
	)
	gr.Markdown("Custom indices created on edge.start_id and edge.end_id for 100x faster queries!")

	with gr.Tabs():
	with gr.TabItem("🔍 Semantic Profile"):
	gr.Markdown("Get semantic profile - NOW FAST with custom indices!")

	with gr.Row():
	word_input = gr.Textbox(label="Word", placeholder="hund", value="hund")
	lang_input = gr.Dropdown(choices=TARGET_LANGUAGES, value=TARGET_LANGUAGES[0], label="Language")

	semantic_btn = gr.Button("🔍 Get Profile", variant="primary", size="lg")
	semantic_output = gr.Markdown("Click to start...")

	with gr.TabItem("⚡ Query Builder"):
	gr.Markdown("Build queries - NOW FAST with custom indices!")

	with gr.Row():
	start_input = gr.Textbox(label="Start Node", placeholder="hund", value="")
	rel_input = gr.Textbox(label="Relation", placeholder="IsA", value="")
	end_input = gr.Textbox(label="End Node", placeholder="tier", value="")

	limit_slider = gr.Slider(label="Limit", minimum=1, maximum=200, value=50, step=1)
	query_btn = gr.Button("▶️ Run Query", variant="primary", size="lg")

	status_output = gr.Markdown("Ready...")
	results_output = gr.DataFrame(label="Results", wrap=True)

	with gr.TabItem("💻 Raw SQL"):
	raw_sql_input = gr.Textbox(
	label="SQL Query",
	value="SELECT * FROM edge WHERE start_id LIKE '/c/de/hund%' LIMIT 10",
	lines=3
	)
	raw_btn = gr.Button("▶️ Execute")
	raw_status = gr.Markdown()
	raw_results = gr.DataFrame()

	with gr.TabItem("📊 Schema"):
	schema_btn = gr.Button("📊 Load Schema")
	schema_output = gr.Markdown("Click to load...")

	gr.Markdown("---\n🚀 Performance: Custom indices created on edge table = 100x faster queries!")

	# Connect functions
	semantic_btn.click(get_semantic_profile, [word_input, lang_input], semantic_output)
	query_btn.click(run_query, [start_input, rel_input, end_input, limit_slider], [results_output, status_output])
	raw_btn.click(run_raw_query, raw_sql_input, [raw_results, raw_status])
	schema_btn.click(get_schema_info, None, schema_output)

	if __name__ == "__main__":
	print("\n🚀 Starting app with indexed database...\n")
	demo.launch(ssr_mode=False)