# conceptnet_db / app.py
# Source: Hugging Face Space (user: cstr) — commit 7ff8eef (verified), 26.9 kB
import gradio as gr
import sqlite3
import pandas as pd
from huggingface_hub import hf_hub_download, snapshot_download
import os
import traceback
from pathlib import Path
# --- 1. Download and Cache the Database with Indices ---
# Runs at import time: the app downloads multi-GB artifacts before the UI starts.
print("Downloading ConceptNet database and indices...")

# Dataset repo holding the prebuilt ConceptNet 5.7 SQLite artifacts.
REPO_ID = "ysenarath/conceptnet-sqlite"
DB_FILENAME = "data/conceptnet-v5.7.0.db"
INDEX_FOLDER = "data/conceptnet-v5.7.0-index"
VOCAB_DB = "data/conceptnet-v5.7.0-vocab.db"

# Download the main database (required: no fallback path if this raises).
DB_PATH = hf_hub_download(
    repo_id=REPO_ID,
    filename=DB_FILENAME,
    repo_type="dataset"
)
print(f"Main database downloaded to: {DB_PATH}")

# Download the vocabulary database (optional but helpful)
try:
    VOCAB_PATH = hf_hub_download(
        repo_id=REPO_ID,
        filename=VOCAB_DB,
        repo_type="dataset"
    )
    print(f"Vocabulary database downloaded to: {VOCAB_PATH}")
except Exception as e:
    # Best-effort: the app still works without the vocab DB.
    print(f"Could not download vocabulary DB: {e}")
    VOCAB_PATH = None

# Download the entire index folder for better performance
try:
    # Use snapshot_download to get the entire data directory with indices
    CACHE_DIR = snapshot_download(
        repo_id=REPO_ID,
        repo_type="dataset",
        allow_patterns=["data/conceptnet-v5.7.0-index/*"]
    )
    INDEX_PATH = os.path.join(CACHE_DIR, INDEX_FOLDER)
    print(f"Index files downloaded to: {INDEX_PATH}")
    # Count index files
    # NOTE(review): .ldb is a LevelDB file extension — these appear to be
    # auxiliary index files rather than SQLite indices; confirm how the
    # queries actually consume them.
    if os.path.exists(INDEX_PATH):
        index_files = list(Path(INDEX_PATH).glob("*.ldb"))
        print(f"Found {len(index_files)} index files (.ldb)")
except Exception as e:
    # Best-effort: queries still run against the main DB, just possibly slower.
    print(f"Could not download index files: {e}")
    INDEX_PATH = None
# --- 2. Database Helper Functions ---
def get_db_connection():
    """
    Open a new read-only SQLite connection to the ConceptNet database.

    Applies a set of read-performance PRAGMAs (large cache, mmap, no
    journal/sync — all safe because the connection is read-only). Any
    PRAGMA that the build does not support is skipped with a warning.

    Returns:
        sqlite3.Connection on success, or None if the connection fails.
    """
    read_pragmas = (
        "PRAGMA query_only = ON",           # Read-only mode
        "PRAGMA temp_store = MEMORY",       # Use memory for temp tables
        "PRAGMA cache_size = -128000",      # 128MB cache (negative = KB)
        "PRAGMA page_size = 8192",          # No-op on an existing DB, but harmless
        "PRAGMA mmap_size = 2147483648",    # 2GB memory-mapped I/O
        "PRAGMA synchronous = OFF",         # Safe for read-only
        "PRAGMA journal_mode = OFF",        # No journal needed for read-only
        "PRAGMA locking_mode = NORMAL",     # Allow multiple readers
        "PRAGMA threads = 4",               # Use multiple threads
    )
    try:
        conn = sqlite3.connect(f"file:{DB_PATH}?mode=ro", uri=True, check_same_thread=False)
        for pragma in read_pragmas:
            try:
                conn.execute(pragma)
            except sqlite3.OperationalError as e:
                print(f"Warning: Could not apply {pragma}: {e}")
        return conn
    except Exception as e:
        print(f"Error connecting to DB: {e}")
        traceback.print_exc()
        return None
def verify_indices():
    """
    Print a diagnostic report of database indices and settings to stdout.

    Lists every user table's indices (with their columns), FTS compile
    options, page/cache configuration and total database size. Purely
    informational; always returns None and never raises.

    Fix: the previous ``with get_db_connection() as conn`` never closed the
    connection — sqlite3's context manager only manages transactions — so
    every call leaked a connection. The connection is now closed explicitly.
    """
    print("\n=== Database Index Analysis ===")
    conn = None
    try:
        conn = get_db_connection()
        cursor = conn.cursor()
        # Enumerate user tables (skip SQLite's internal tables)
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'")
        tables = cursor.fetchall()
        total_indices = 0
        for table in tables:
            table_name = table[0]
            # Get indices for this table
            cursor.execute(f"PRAGMA index_list({table_name})")
            indices = cursor.fetchall()
            if indices:
                print(f"\n📊 Table: {table_name}")
                for idx in indices:
                    idx_name, unique, origin = idx[1], idx[2], idx[3]
                    # Get the columns covered by this index
                    cursor.execute(f"PRAGMA index_info({idx_name})")
                    idx_cols = cursor.fetchall()
                    cols = [col[2] for col in idx_cols]
                    unique_str = "UNIQUE" if unique else "NON-UNIQUE"
                    print(f"  ├─ {idx_name} ({unique_str}) on columns: {', '.join(cols)}")
                    total_indices += 1
        print(f"\n✅ Total indices found: {total_indices}")
        # Check if FTS (Full Text Search) was compiled into this SQLite build
        cursor.execute("SELECT * FROM pragma_compile_options WHERE compile_options LIKE '%FTS%'")
        fts = cursor.fetchall()
        if fts:
            print(f"✅ Full-Text Search enabled: {[f[0] for f in fts]}")
        # Page size / cache configuration actually in effect
        cursor.execute("PRAGMA page_size")
        page_size = cursor.fetchone()[0]
        cursor.execute("PRAGMA cache_size")
        cache_size = cursor.fetchone()[0]
        print(f"\n📈 Page size: {page_size} bytes")
        # Negative cache_size means KB; positive means pages.
        print(f"📈 Cache size: {abs(cache_size)} KB" if cache_size < 0 else f"📈 Cache size: {cache_size} pages")
        # Total database size = page_count * page_size
        cursor.execute("SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()")
        db_size = cursor.fetchone()[0]
        print(f"📦 Database size: {db_size / 1024 / 1024 / 1024:.2f} GB")
    except Exception as e:
        print(f"Error in verify_indices: {e}")
        traceback.print_exc()
    finally:
        if conn is not None:
            conn.close()  # sqlite3's `with conn:` does NOT close — close explicitly
def get_schema_info():
    """
    Build a markdown report of the database schema.

    For every user table: row count, column details (type / NOT NULL / PK)
    and index details (columns, uniqueness, origin).

    Returns:
        Markdown string, or an error message string on failure.

    Fixes: the connection is now closed explicitly (sqlite3's context
    manager only manages transactions, so the old code leaked a connection
    per call), and the bare ``except:`` around the row count now catches
    only sqlite3 errors.
    """
    print("Getting schema info...")
    schema_md = "# 📚 Database Schema\n\n"
    try:
        conn = get_db_connection()
        try:
            cursor = conn.cursor()
            # Total DB size = page_count * page_size
            cursor.execute("SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()")
            db_size = cursor.fetchone()[0]
            schema_md += f"**Database Size:** {db_size / 1024 / 1024 / 1024:.2f} GB\n\n"
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%';")
            tables = cursor.fetchall()
            if not tables:
                return "Could not find any tables in the database."
            for table in tables:
                table_name = table[0]
                # Row count can be slow or fail on unusual tables; degrade gracefully.
                try:
                    cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
                    row_count = cursor.fetchone()[0]
                    schema_md += f"## Table: `{table_name}` ({row_count:,} rows)\n\n"
                except sqlite3.Error:
                    schema_md += f"## Table: `{table_name}`\n\n"
                schema_md += "### Columns\n\n"
                schema_md += "| Column Name | Data Type | Not Null | Primary Key |\n"
                schema_md += "|:------------|:----------|:---------|:------------|\n"
                cursor.execute(f"PRAGMA table_info({table_name});")
                columns = cursor.fetchall()
                for col in columns:
                    name, dtype, notnull, pk = col[1], col[2], col[3], col[5]
                    schema_md += f"| `{name}` | `{dtype}` | {'✓' if notnull else '✗'} | {'✓' if pk else '✗'} |\n"
                # Show indices with details
                cursor.execute(f"PRAGMA index_list({table_name});")
                indices = cursor.fetchall()
                if indices:
                    schema_md += f"\n### Indices ({len(indices)})\n\n"
                    for idx in indices:
                        idx_name, unique, origin = idx[1], idx[2], idx[3]
                        # Get indexed columns (col[2] is None for expression indices)
                        cursor.execute(f"PRAGMA index_info({idx_name});")
                        idx_cols = cursor.fetchall()
                        cols = [col[2] for col in idx_cols if col[2]]
                        unique_badge = "🔒 UNIQUE" if unique else "📑 INDEX"
                        schema_md += f"- **{idx_name}** {unique_badge}\n"
                        schema_md += f"  - Columns: `{', '.join(cols) if cols else 'N/A'}`\n"
                        schema_md += f"  - Origin: {origin}\n"
                schema_md += "\n---\n\n"
            return schema_md
        finally:
            conn.close()  # sqlite3's context manager does not close connections
    except Exception as e:
        print(f"Error in get_schema_info: {e}")
        traceback.print_exc()
        return f"An error occurred while fetching schema: {e}"
def run_query(start_node, relation, end_node, limit):
    """
    Search ConceptNet edges using indexed JOINs on node / relation / edge.

    Args:
        start_node: Start-node filter. A `%` in the value is treated as a
            user-supplied LIKE wildcard; otherwise a substring match
            (`%value%`) is used.
        relation: Relation label. Exact match unless a `%` wildcard is given.
        end_node: End-node filter; same wildcard rules as start_node.
        limit: Maximum number of rows to return (bound as a parameter).

    Returns:
        (pandas.DataFrame, markdown status string) tuple. On failure the
        DataFrame is empty and the status contains the error.

    Fixes: the connection is now closed explicitly (sqlite3's context
    manager only manages transactions), and the bare ``except:`` around
    the debug EXPLAIN now catches only sqlite3 errors.
    """
    print(f"Running query: start='{start_node}', rel='{relation}', end='{end_node}'")
    # Base query: join edge to its relation and both endpoint nodes.
    query = """
    SELECT
        e.id AS edge_id,
        s.id AS start_id_path,
        r.label AS relation_label,
        en.id AS end_id_path,
        e.weight,
        e.dataset,
        e.surface_text,
        s.label AS start_label_text,
        en.label AS end_label_text
    FROM edge AS e
    INNER JOIN relation AS r ON e.rel_id = r.id
    INNER JOIN node AS s ON e.start_id = s.id
    INNER JOIN node AS en ON e.end_id = en.id
    """
    where_conditions = []
    params = []
    try:
        # Build WHERE conditions; all user values are bound parameters.
        if start_node:
            if "%" in start_node:
                where_conditions.append("s.id LIKE ?")
                params.append(start_node)
            else:
                # No user wildcard: substring match anywhere in the node path
                # (note: a leading % prevents index use, but matches plain
                # words like "dog" against "/c/en/dog").
                where_conditions.append("s.id LIKE ?")
                params.append(f"%{start_node}%")
        if relation:
            if "%" in relation:
                where_conditions.append("r.label LIKE ?")
                params.append(relation)
            else:
                # Exact match is faster (index-friendly)
                where_conditions.append("r.label = ?")
                params.append(relation)
        if end_node:
            if "%" in end_node:
                where_conditions.append("en.id LIKE ?")
                params.append(end_node)
            else:
                where_conditions.append("en.id LIKE ?")
                params.append(f"%{end_node}%")
        if where_conditions:
            query += " WHERE " + " AND ".join(where_conditions)
        # Order by weight to get most relevant results first
        query += " ORDER BY e.weight DESC LIMIT ?"
        params.append(limit)
        print(f"Executing SQL with {len(params)} parameters")
        conn = get_db_connection()
        try:
            # Debug aid only: show the query plan, never let it mask the real query.
            try:
                explain_result = conn.execute("EXPLAIN QUERY PLAN " + query, params).fetchall()
                print("Query Plan:")
                for row in explain_result:
                    print(f"  {row}")
            except sqlite3.Error:
                pass
            # Execute actual query
            df = pd.read_sql_query(query, conn, params=params)
        finally:
            conn.close()  # sqlite3's context manager does not close connections
        if df.empty:
            return pd.DataFrame(), "Query ran successfully but returned no results. Try broader search terms or check spelling."
        return df, f"✅ Query successful! Found {len(df)} results (ordered by relevance)."
    except Exception as e:
        print(f"Error in run_query: {e}")
        traceback.print_exc()
        err_msg = f"**❌ Query Failed!**\n\n```\n{e}\n```"
        return pd.DataFrame(), err_msg
def run_raw_query(sql_query):
    """
    Execute a read-only SQL query and return (DataFrame, status markdown).

    Queries must start with SELECT or WITH (CTEs are read-only too and were
    wrongly rejected before). Defense-in-depth: the connection itself is
    opened with mode=ro and PRAGMA query_only, so writes cannot slip
    through even if the prefix check were bypassed.

    Fixes: accepts WITH-prefixed queries, closes the connection explicitly
    (sqlite3's context manager only manages transactions), and narrows the
    bare ``except:`` around the debug EXPLAIN to sqlite3 errors.
    """
    print(f"Running raw query: {sql_query[:100]}...")
    head = sql_query.strip().upper()
    if not (head.startswith("SELECT") or head.startswith("WITH")):
        return pd.DataFrame(), "**Error:** Only `SELECT` statements are allowed for safety."
    try:
        conn = get_db_connection()
        try:
            # Debug aid only: show the query plan, never let it break the query.
            try:
                explain_result = conn.execute("EXPLAIN QUERY PLAN " + sql_query).fetchall()
                print("Query Plan:")
                for row in explain_result:
                    print(f"  {row}")
            except sqlite3.Error:
                pass
            df = pd.read_sql_query(sql_query, conn)
        finally:
            conn.close()  # sqlite3's context manager does not close connections
        if df.empty:
            return df, "Query ran successfully but returned no results."
        return df, f"✅ Query successful! Returned {len(df)} rows."
    except Exception as e:
        print(f"Error in run_raw_query: {e}")
        traceback.print_exc()
        return pd.DataFrame(), f"**❌ Query Failed!**\n\n```\n{e}\n```"
def get_semantic_profile(word, lang='en'):
    """
    Build a markdown semantic profile of `word` across common relations.

    Runs one compound UNION ALL query collecting, per relation, up to 7
    outgoing and 7 incoming edges (strongest first), then formats them
    grouped by relation.

    Args:
        word: Word or phrase (spaces are converted to underscores).
        lang: ISO language code used to build the `/c/<lang>/<word>%` path.

    Returns:
        Markdown string: the profile, a "no results" help text, or an
        error message.

    Fixes vs. the broken version:
      * Removed ``INDEXED BY (SELECT ...)`` — SQLite's INDEXED BY clause
        only accepts a literal index name, so that query was a syntax
        error and the function always failed.
      * Wrapped every compound member in a subselect: SQLite forbids
        ORDER BY / LIMIT on individual members of a compound SELECT.
      * The connection is now closed explicitly.
    """
    if not word:
        return "⚠️ Please enter a word."
    word = word.strip().lower().replace(' ', '_')
    like_path = f"/c/{lang}/{word}%"
    print(f"Getting semantic profile for: {like_path}")
    # Most important relations for semantic understanding
    relations_to_check = [
        "/r/IsA", "/r/PartOf", "/r/HasA", "/r/UsedFor", "/r/CapableOf",
        "/r/Causes", "/r/HasProperty", "/r/Synonym", "/r/Antonym",
        "/r/AtLocation", "/r/RelatedTo", "/r/DerivedFrom"
    ]
    output_md = f"# 🧠 Semantic Profile: '{word}'\n"
    output_md += f"**Language:** {lang.upper()} | **Search Pattern:** `{like_path}`\n\n"
    try:
        conn = get_db_connection()
        try:
            # Single compound query for all relations. Each member is a
            # subselect so its per-relation ORDER BY / LIMIT is legal.
            union_parts = []
            union_params = []
            for rel in relations_to_check:
                # Outgoing edges (word as subject)
                union_parts.append("""
                SELECT * FROM (
                    SELECT
                        ? AS rel_label,
                        'out' AS dir,
                        en.id AS target_id,
                        en.label AS target_label,
                        e.weight AS weight
                    FROM edge e
                    INNER JOIN node s ON e.start_id = s.id
                    INNER JOIN node en ON e.end_id = en.id
                    INNER JOIN relation r ON e.rel_id = r.id
                    WHERE s.id LIKE ? AND r.label = ?
                    ORDER BY e.weight DESC
                    LIMIT 7
                )
                """)
                union_params.extend([rel, like_path, rel])
                # Incoming edges (word as object)
                union_parts.append("""
                SELECT * FROM (
                    SELECT
                        ? AS rel_label,
                        'in' AS dir,
                        s.id AS target_id,
                        s.label AS target_label,
                        e.weight AS weight
                    FROM edge e
                    INNER JOIN node s ON e.start_id = s.id
                    INNER JOIN node en ON e.end_id = en.id
                    INNER JOIN relation r ON e.rel_id = r.id
                    WHERE en.id LIKE ? AND r.label = ?
                    ORDER BY e.weight DESC
                    LIMIT 7
                )
                """)
                union_params.extend([rel, like_path, rel])
            full_query = " UNION ALL ".join(union_parts)
            print(f"Executing optimized semantic profile query...")
            results = conn.execute(full_query, union_params).fetchall()
        finally:
            conn.close()  # sqlite3's context manager does not close connections
        if not results:
            return f"""# 🧠 Semantic Profile: '{word}'
⚠️ **No results found**
This could mean:
1. The word isn't in ConceptNet for language `{lang}`
2. Try checking spelling: `{word}`
3. Try language code 'en' for English
4. Try a more common/simpler word
**Tip:** Use the Query Builder to search manually."""
        # Group consecutive rows by relation label (rows arrive in the order
        # the UNION ALL members were written, so same-relation rows are
        # adjacent).
        current_rel = None
        rel_results = []
        total_relations = 0
        for rel_label, direction, target_id, target_label, weight in results:
            if rel_label != current_rel:
                if current_rel is not None:
                    # Flush the previous relation group
                    output_md += f"## {current_rel}\n\n"
                    if rel_results:
                        for line in rel_results:
                            output_md += line
                        total_relations += len(rel_results)
                    else:
                        output_md += "*No results*\n"
                    output_md += "\n"
                current_rel = rel_label
                rel_results = []
            # Format one edge, arrow pointing from subject to object
            weight_str = f"{weight:.3f}"
            if direction == 'out':
                rel_results.append(
                    f"- **{word}** {rel_label} → *{target_label}* "
                    f"`[{weight_str}]`\n"
                )
            else:
                rel_results.append(
                    f"- *{target_label}* {rel_label} → **{word}** "
                    f"`[{weight_str}]`\n"
                )
        # Flush the last relation group
        if current_rel is not None:
            output_md += f"## {current_rel}\n\n"
            if rel_results:
                for line in rel_results:
                    output_md += line
                total_relations += len(rel_results)
            else:
                output_md += "*No results*\n"
            output_md += "\n"
        output_md += "---\n"
        output_md += f"**Total relations found:** {total_relations}\n"
        output_md += f"*Weight indicates strength of association (higher = stronger)*\n"
        return output_md
    except Exception as e:
        print(f"Error in get_semantic_profile: {e}")
        traceback.print_exc()
        return f"**❌ An error occurred:**\n\n```\n{e}\n```"
# --- 3. Build the Gradio UI ---
# Verify indices on startup (diagnostic printout only; failures are non-fatal)
verify_indices()
# Gradio UI: four tabs (semantic profile, query builder, raw SQL, schema)
# plus event wiring. Fix: the raw-SQL placeholder example reused alias `e`
# for both the edge table and the end node, which is invalid SQL; the end
# node now uses alias `en`.
with gr.Blocks(title="ConceptNet SQLite Explorer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧠 ConceptNet SQLite Explorer")
    # Header: database file name/size and whether the index cache was found
    gr.Markdown(
        f"**Database:** `{os.path.basename(DB_PATH)}` ({os.path.getsize(DB_PATH) / 1024 / 1024 / 1024:.2f} GB) | "
        f"**Status:** {'✅ Indices Loaded' if INDEX_PATH and os.path.exists(INDEX_PATH) else '⚠️ No Index Cache'}"
    )
    gr.Markdown("*Explore semantic relationships in ConceptNet with optimized indexed queries*")
    with gr.Tabs():
        # --- Tab 1: one-click semantic profile for a single word ---
        with gr.TabItem("🔍 Semantic Profile"):
            gr.Markdown(
                "**Get a comprehensive semantic profile for any word.**\n\n"
                "Queries common relations: IsA, HasA, UsedFor, CapableOf, Causes, HasProperty, and more.\n"
            )
            with gr.Row():
                with gr.Column(scale=2):
                    word_input = gr.Textbox(
                        label="Word",
                        placeholder="dog",
                        info="Single word or phrase (use underscores for phrases)"
                    )
                with gr.Column(scale=1):
                    lang_input = gr.Textbox(
                        label="Language",
                        value="en",
                        placeholder="en",
                        info="ISO code (en, de, es, fr, ja, zh, etc.)"
                    )
            with gr.Row():
                semantic_btn = gr.Button("🔍 Get Semantic Profile", variant="primary", size="lg")
            with gr.Accordion("📚 Example Words", open=False):
                gr.Markdown(
                    "**English (en):** dog, cat, computer, love, happiness, run\n\n"
                    "**German (de):** hund, katze, liebe, glück\n\n"
                    "**Spanish (es):** perro, gato, amor, felicidad\n\n"
                    "**French (fr):** chien, chat, amour, bonheur\n\n"
                    "**Japanese (ja):** 犬, 猫, 愛, 幸せ"
                )
            semantic_output = gr.Markdown("*Click 'Get Semantic Profile' to start...*")
        # --- Tab 2: structured start/relation/end query builder ---
        with gr.TabItem("⚡ Query Builder"):
            gr.Markdown(
                "**Build custom queries using ConceptNet's graph structure.**\n\n"
                "Find edges connecting concepts through specific relations. Leverages database indices for fast lookups."
            )
            with gr.Row():
                start_input = gr.Textbox(
                    label="Start Node",
                    placeholder="dog (or /c/en/dog)",
                    info="Leave empty for any"
                )
                rel_input = gr.Textbox(
                    label="Relation",
                    placeholder="IsA (or /r/IsA)",
                    info="Leave empty for any"
                )
                end_input = gr.Textbox(
                    label="End Node",
                    placeholder="animal (or /c/en/animal)",
                    info="Leave empty for any"
                )
                limit_slider = gr.Slider(
                    label="Results Limit",
                    minimum=1,
                    maximum=500,
                    value=50,
                    step=1,
                    info="Higher limits may be slower"
                )
            query_btn = gr.Button("▶️ Run Query", variant="primary", size="lg")
            with gr.Accordion("💡 Query Tips & Examples", open=False):
                gr.Markdown(
                    "**Tips:**\n"
                    "- Omit `/c/en/` prefix - it's added automatically\n"
                    "- Use `%` as wildcard: `%dog%` matches 'hotdog', 'doghouse'\n"
                    "- More specific = faster queries\n\n"
                    "**Examples:**\n"
                    "1. What can dogs do? → Start: `dog`, Relation: `CapableOf`, End: *empty*\n"
                    "2. What is a dog? → Start: `dog`, Relation: `IsA`, End: *empty*\n"
                    "3. Things at home → Start: *empty*, Relation: `AtLocation`, End: `home`\n"
                    "4. Synonyms of happy → Start: `happy`, Relation: `Synonym`, End: *empty*"
                )
            status_output = gr.Markdown("*Ready to query...*")
            results_output = gr.DataFrame(label="Query Results", interactive=False, wrap=True)
        # --- Tab 3: raw read-only SQL console ---
        with gr.TabItem("💻 Raw SQL"):
            gr.Markdown(
                "**Advanced:** Execute custom SQL queries.\n\n"
                "⚠️ Only `SELECT` statements allowed. Check Schema tab for table structure."
            )
            raw_sql_input = gr.Textbox(
                label="SQL Query",
                placeholder="SELECT s.label, r.label, en.label FROM edge e JOIN node s ON e.start_id = s.id JOIN relation r ON e.rel_id = r.id JOIN node en ON e.end_id = en.id WHERE s.label = 'dog' LIMIT 10",
                lines=6,
                info="Write SELECT query"
            )
            with gr.Accordion("📋 Useful Queries", open=False):
                gr.Markdown(
                    "**Count edges by relation:**\n"
                    "```sql\n"
                    "SELECT r.label, COUNT(*) as count \n"
                    "FROM edge e \n"
                    "JOIN relation r ON e.rel_id = r.id \n"
                    "GROUP BY r.label \n"
                    "ORDER BY count DESC\n"
                    "```\n\n"
                    "**Find strongest connections:**\n"
                    "```sql\n"
                    "SELECT s.label, r.label, e.label, edge.weight\n"
                    "FROM edge \n"
                    "JOIN node s ON edge.start_id = s.id\n"
                    "JOIN relation r ON edge.rel_id = r.id\n"
                    "JOIN node e ON edge.end_id = e.id\n"
                    "ORDER BY weight DESC LIMIT 20\n"
                    "```\n\n"
                    "**Check index usage:**\n"
                    "```sql\n"
                    "EXPLAIN QUERY PLAN\n"
                    "SELECT * FROM edge WHERE start_id = '/c/en/dog'\n"
                    "```"
                )
            raw_query_btn = gr.Button("▶️ Execute SQL", variant="secondary", size="lg")
            raw_status_output = gr.Markdown("*Ready...*")
            raw_results_output = gr.DataFrame(label="Results", interactive=False, wrap=True)
        # --- Tab 4: on-demand schema / index report ---
        with gr.TabItem("📊 Schema & Indices"):
            gr.Markdown(
                "**Database structure, indices, and optimization info.**\n\n"
                "View tables, columns, and index configuration."
            )
            schema_btn = gr.Button("📊 Load Schema", variant="secondary", size="lg")
            schema_output = gr.Markdown("*Click button to load schema...*")
    gr.Markdown("---")
    gr.Markdown(
        "💡 **Performance:** Queries use database indices for fast lookups. "
        "Exact matches are faster than wildcards. "
        f"{'✅ Index files loaded from HuggingFace cache.' if INDEX_PATH else '⚠️ Running without index cache - queries may be slower.'}"
    )
    # Connect UI to functions (must happen inside the Blocks context)
    semantic_btn.click(
        fn=get_semantic_profile,
        inputs=[word_input, lang_input],
        outputs=[semantic_output],
        api_name="get_semantic_profile"
    )
    query_btn.click(
        fn=run_query,
        inputs=[start_input, rel_input, end_input, limit_slider],
        outputs=[results_output, status_output],
        api_name="run_query"
    )
    raw_query_btn.click(
        fn=run_raw_query,
        inputs=[raw_sql_input],
        outputs=[raw_results_output, raw_status_output],
        api_name="run_raw_query"
    )
    schema_btn.click(
        fn=get_schema_info,
        inputs=None,
        outputs=schema_output,
        api_name="get_schema"
    )
if __name__ == "__main__":
    # Startup banner so the Space logs clearly show the boot point.
    banner = "=" * 50
    print(f"\n{banner}")
    print("🚀 Starting ConceptNet SQLite Explorer")
    print(f"{banner}\n")
    demo.launch(ssr_mode=False)