import gradio as gr
import sqlite3
import pandas as pd
from huggingface_hub import hf_hub_download, HfApi
import os
import time
import shutil
from pathlib import Path
import json

# ===== CONFIGURATION =====
TARGET_LANGUAGES = ['de', 'en', 'es', 'fr', 'it', 'ja', 'nl', 'pl', 'pt', 'ru', 'zh']
INDEXED_REPO_ID = "cstr/conceptnet-de-indexed"
INDEXED_DB_FILENAME = "conceptnet-de-indexed.db"
PROGRESS_FILENAME = "indexing_progress.json"
LOCAL_DB_PATH = "/tmp/conceptnet-indexed.db"
CONCEPTNET_BASE = "http://conceptnet.io"  # CRITICAL: Full URL base
# =========================

| print(f"π Languages: {', '.join([l.upper() for l in TARGET_LANGUAGES])}") | |
| HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HF_API_TOKEN") | |
| if HF_TOKEN: | |
| print(f"β HF_TOKEN found (length: {len(HF_TOKEN)})") | |
| else: | |
| print("β οΈ No HF_TOKEN - checkpointing disabled") | |
| ORIGINAL_REPO_ID = "ysenarath/conceptnet-sqlite" | |
| ORIGINAL_DB_FILENAME = "data/conceptnet-v5.7.0.db" | |
def log_progress(message, level="INFO"):
    """Enhanced logging with timestamp."""
    timestamp = time.strftime("%H:%M:%S")
    prefix = {
        "INFO": "ℹ️",
        "SUCCESS": "✅",
        "ERROR": "❌",
        "WARN": "⚠️",
        "CHECKPOINT": "💾",
        "DEBUG": "🔍"
    }.get(level, "")
    print(f"[{timestamp}] {prefix} {message}")

def verify_database_has_indices(db_path):
    """Verify the database has the required custom indices."""
    log_progress(f"Verifying indices in {os.path.basename(db_path)}...", "DEBUG")
    if not os.path.exists(db_path):
        log_progress("Database file does not exist", "ERROR")
        return False, 0
    try:
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()
        cursor.execute("SELECT name FROM sqlite_master WHERE type='index' AND name LIKE 'idx_%'")
        custom_indices = cursor.fetchall()
        conn.close()
        has_all = len(custom_indices) >= 4
        log_progress(f"Found {len(custom_indices)} custom indices (need 4+): {has_all}", "SUCCESS" if has_all else "WARN")
        return has_all, len(custom_indices)
    except Exception as e:
        log_progress(f"Error verifying indices: {e}", "ERROR")
        return False, 0

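# A possible tightening (sketch only, not wired into the app): counting any
# four rows that match 'idx_%' would also accept stale or renamed indices.
# The helper below checks for the exact names this app creates; the
# EXPECTED_INDICES set mirrors indices_to_create further down and is the only
# assumption here.
EXPECTED_INDICES = {"idx_edge_start_id", "idx_edge_end_id", "idx_edge_rel_id", "idx_node_label"}

def verify_expected_indices(db_path, expected=EXPECTED_INDICES):
    """Return (all_present, missing_names) for a specific set of index names."""
    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute(
            "SELECT name FROM sqlite_master WHERE type='index' AND name LIKE 'idx_%'"
        ).fetchall()
    finally:
        conn.close()
    found = {name for (name,) in rows}
    missing = expected - found
    return not missing, sorted(missing)
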
def check_remote_progress():
    """Check remote progress with detailed logging."""
    log_progress("Checking remote progress...", "DEBUG")
    fresh = {
        "completed_indices": [],
        "analyzed_tables": [],
        "database_uploaded": False,
        "indexing_complete": False
    }
    if not HF_TOKEN:
        log_progress("No HF_TOKEN - cannot check remote", "WARN")
        return fresh
    try:
        api = HfApi()
        try:
            api.repo_info(repo_id=INDEXED_REPO_ID, repo_type="dataset", token=HF_TOKEN)
            log_progress(f"Repository exists: {INDEXED_REPO_ID}", "SUCCESS")
        except Exception:
            log_progress("Repository does not exist yet", "INFO")
            return fresh
        try:
            progress_path = hf_hub_download(
                repo_id=INDEXED_REPO_ID,
                filename=PROGRESS_FILENAME,
                repo_type="dataset",
                token=HF_TOKEN
            )
            with open(progress_path, 'r') as f:
                progress = json.load(f)
            log_progress("Remote progress loaded:", "SUCCESS")
            log_progress(f"  Completed indices: {progress.get('completed_indices', [])}", "INFO")
            log_progress(f"  Analyzed tables: {progress.get('analyzed_tables', [])}", "INFO")
            log_progress(f"  Indexing complete: {progress.get('indexing_complete', False)}", "INFO")
            return progress
        except Exception:
            log_progress("No progress file found (starting fresh)", "INFO")
            return fresh
    except Exception as e:
        log_progress(f"Error checking remote: {e}", "ERROR")
        return fresh

def update_remote_progress(completed_indices, analyzed_tables=None, database_uploaded=False, indexing_complete=False):
    """Update the remote progress file."""
    log_progress("Updating remote progress...", "DEBUG")
    if not HF_TOKEN:
        log_progress("Cannot update progress: No HF_TOKEN", "WARN")
        return False
    if analyzed_tables is None:
        analyzed_tables = []
    try:
        api = HfApi()
        try:
            api.repo_info(repo_id=INDEXED_REPO_ID, repo_type="dataset", token=HF_TOKEN)
        except Exception:
            log_progress(f"Creating repository: {INDEXED_REPO_ID}", "INFO")
            api.create_repo(
                repo_id=INDEXED_REPO_ID,
                repo_type="dataset",
                token=HF_TOKEN,
                private=False
            )
        progress = {
            "completed_indices": completed_indices,
            "analyzed_tables": analyzed_tables,
            "database_uploaded": database_uploaded,
            "indexing_complete": indexing_complete,
            "timestamp": time.time(),
            "languages": TARGET_LANGUAGES
        }
        progress_path = "/tmp/indexing_progress.json"
        with open(progress_path, 'w') as f:
            json.dump(progress, f, indent=2)
        api.upload_file(
            path_or_fileobj=progress_path,
            path_in_repo=PROGRESS_FILENAME,
            repo_id=INDEXED_REPO_ID,
            repo_type="dataset",
            token=HF_TOKEN,
            commit_message=f"Progress: {len(completed_indices)} indices, {len(analyzed_tables)} tables"
        )
        log_progress(f"Progress updated: {len(completed_indices)} indices, {len(analyzed_tables)} tables", "CHECKPOINT")
        return True
    except Exception as e:
        log_progress(f"Failed to update progress: {e}", "ERROR")
        import traceback
        traceback.print_exc()
        return False

def upload_database_checkpoint(message=""):
    """Upload the database after a WAL checkpoint."""
    log_progress("Starting database upload...", "CHECKPOINT")
    if not HF_TOKEN:
        log_progress("Cannot upload: No HF_TOKEN", "WARN")
        return False
    if not os.path.exists(LOCAL_DB_PATH):
        log_progress("Database file doesn't exist", "ERROR")
        return False
    try:
        # CRITICAL: Checkpoint WAL to merge changes into the main file
        log_progress("Checkpointing WAL...", "DEBUG")
        conn = sqlite3.connect(LOCAL_DB_PATH)
        conn.execute("PRAGMA wal_checkpoint(TRUNCATE)")
        conn.close()
        log_progress("WAL checkpoint complete", "SUCCESS")
        # Verify the indices are in the file
        has_indices, idx_count = verify_database_has_indices(LOCAL_DB_PATH)
        log_progress(f"Pre-upload verification: {idx_count} indices", "SUCCESS" if has_indices else "WARN")
        api = HfApi()
        db_size_gb = os.path.getsize(LOCAL_DB_PATH) / (2**30)
        log_progress(f"Uploading {db_size_gb:.2f} GB to {INDEXED_REPO_ID}...", "CHECKPOINT")
        if message:
            log_progress(f"  Message: {message}", "INFO")
        log_progress("  This will take 2-5 minutes...", "INFO")
        start = time.time()
        api.upload_file(
            path_or_fileobj=LOCAL_DB_PATH,
            path_in_repo=INDEXED_DB_FILENAME,
            repo_id=INDEXED_REPO_ID,
            repo_type="dataset",
            token=HF_TOKEN,
            commit_message=message or "Database checkpoint"
        )
        elapsed = time.time() - start
        # Size is in GB, so convert to megabits before dividing by seconds
        speed_mbps = (db_size_gb * 1024 * 8) / elapsed if elapsed > 0 else 0
        log_progress(f"Upload complete in {elapsed:.1f}s ({speed_mbps:.1f} Mbps)", "SUCCESS")
        log_progress(f"View at: https://huggingface.co/datasets/{INDEXED_REPO_ID}", "INFO")
        return True
    except Exception as e:
        log_progress(f"Upload failed: {e}", "ERROR")
        import traceback
        traceback.print_exc()
        return False

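# Multi-GB uploads over a shared connection fail now and then, and a failed
# checkpoint above just logs and returns False. A minimal retry wrapper
# (sketch; the attempt count and backoff are arbitrary choices, and it only
# reuses upload_database_checkpoint above - nothing calls it yet):
def upload_with_retries(message="", attempts=3, backoff_s=30):
    """Retry the checkpoint upload with a fixed pause between attempts."""
    for attempt in range(1, attempts + 1):
        if upload_database_checkpoint(message):
            return True
        if attempt < attempts:
            log_progress(f"Upload attempt {attempt} failed, retrying in {backoff_s}s...", "WARN")
            time.sleep(backoff_s)
    return False
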
def create_indexed_database():
    """Create or download the indexed database, with comprehensive checkpointing."""
    log_progress("=" * 60, "INFO")
    log_progress("STARTING DATABASE SETUP", "INFO")
    log_progress("=" * 60, "INFO")
    # Check remote progress
    progress = check_remote_progress()
    completed_indices = set(progress.get("completed_indices", []))
    analyzed_tables = set(progress.get("analyzed_tables", []))
    database_uploaded = progress.get("database_uploaded", False)
    indexing_complete = progress.get("indexing_complete", False)
    # If fully complete, download and return
    if indexing_complete:
        log_progress("Fully indexed database exists!", "SUCCESS")
        log_progress(f"Downloading from {INDEXED_REPO_ID}...", "INFO")
        try:
            indexed_path = hf_hub_download(
                repo_id=INDEXED_REPO_ID,
                filename=INDEXED_DB_FILENAME,
                repo_type="dataset",
                token=HF_TOKEN
            )
            log_progress(f"Downloaded to: {indexed_path}", "SUCCESS")
            # Verify it actually has indices
            has_indices, idx_count = verify_database_has_indices(indexed_path)
            if has_indices:
                log_progress(f"Verified {idx_count} indices present", "SUCCESS")
                return indexed_path
            else:
                log_progress(f"CORRUPTED: Only {idx_count}/4 indices found!", "ERROR")
                log_progress("The database needs to be re-indexed", "WARN")
                # Reset and rebuild
                indexing_complete = False
                completed_indices = set()
                analyzed_tables = set()
                database_uploaded = False
                update_remote_progress([], [], False, False)
        except Exception as e:
            log_progress(f"Download failed: {e}", "ERROR")
            log_progress("Will create locally", "INFO")
    # Download a partially indexed DB if a checkpoint exists
    if (completed_indices or analyzed_tables or database_uploaded) and not os.path.exists(LOCAL_DB_PATH):
        log_progress("Checkpoint detected - downloading partial DB...", "INFO")
        log_progress(f"  Indices done: {sorted(completed_indices)}", "INFO")
        log_progress(f"  Tables analyzed: {sorted(analyzed_tables)}", "INFO")
        try:
            indexed_path = hf_hub_download(
                repo_id=INDEXED_REPO_ID,
                filename=INDEXED_DB_FILENAME,
                repo_type="dataset",
                token=HF_TOKEN
            )
            log_progress("Downloaded partial DB", "SUCCESS")
            # Verify indices
            has_indices, idx_count = verify_database_has_indices(indexed_path)
            if idx_count >= len(completed_indices):
                log_progress(f"Verified {idx_count} indices (expected {len(completed_indices)})", "SUCCESS")
                log_progress(f"Copying to {LOCAL_DB_PATH}...", "DEBUG")
                start = time.time()
                shutil.copy2(indexed_path, LOCAL_DB_PATH)
                elapsed = time.time() - start
                log_progress(f"Copied in {elapsed:.1f}s", "SUCCESS")
                log_progress("Resuming from checkpoint ✅", "SUCCESS")
            else:
                log_progress(f"Index mismatch: found {idx_count}, expected {len(completed_indices)}", "ERROR")
                log_progress("Will start from scratch", "WARN")
                completed_indices = set()
                analyzed_tables = set()
        except Exception as e:
            log_progress(f"Could not download partial DB: {e}", "WARN")
            log_progress("Will start from original", "INFO")
            completed_indices = set()
            analyzed_tables = set()
    # Download the original if needed
    if not os.path.exists(LOCAL_DB_PATH):
        if completed_indices or analyzed_tables:
            log_progress("Failed to resume - clearing progress", "WARN")
            update_remote_progress([], [], False, False)
            completed_indices = set()
            analyzed_tables = set()
        log_progress("Downloading original ConceptNet database...", "INFO")
        original_path = hf_hub_download(
            repo_id=ORIGINAL_REPO_ID,
            filename=ORIGINAL_DB_FILENAME,
            repo_type="dataset"
        )
        original_size = os.path.getsize(original_path)
        free_space = shutil.disk_usage("/tmp")[2]
        log_progress(f"Original size: {original_size / (2**30):.2f} GB", "INFO")
        log_progress(f"Free space: {free_space / (2**30):.2f} GB", "INFO")
        if free_space < original_size * 2:
            raise Exception(f"Insufficient space! Need {original_size * 2 / (2**30):.1f} GB, have {free_space / (2**30):.1f} GB")
        log_progress(f"Copying to {LOCAL_DB_PATH}...", "INFO")
        start = time.time()
        shutil.copy2(original_path, LOCAL_DB_PATH)
        elapsed = time.time() - start
        log_progress(f"Copied {original_size / (2**30):.2f} GB in {elapsed:.1f}s ({original_size / elapsed / (2**20):.1f} MB/s)", "SUCCESS")
    # Only index if not complete
    if not (len(completed_indices) >= 4 and len(analyzed_tables) >= 4):
        log_progress("Indexing required", "INFO")
        # Connect
        log_progress("Opening database connection...", "DEBUG")
        conn = sqlite3.connect(LOCAL_DB_PATH)
        cursor = conn.cursor()
        # Optimizations
        log_progress("Setting PRAGMA optimizations...", "DEBUG")
        cursor.execute("PRAGMA journal_mode = WAL")
        cursor.execute("PRAGMA synchronous = NORMAL")
        cursor.execute("PRAGMA cache_size = -512000")
        cursor.execute("PRAGMA temp_store = MEMORY")
        # PHASE 1: Indices
        log_progress("=" * 60, "INFO")
        log_progress("PHASE 1: CREATING INDICES", "INFO")
        log_progress("=" * 60, "INFO")
        indices_to_create = [
            ("idx_edge_start_id", "edge", "start_id"),
            ("idx_edge_end_id", "edge", "end_id"),
            ("idx_edge_rel_id", "edge", "rel_id"),
            ("idx_node_label", "node", "label"),
        ]
        for i, (idx_name, table, column) in enumerate(indices_to_create, 1):
            if idx_name in completed_indices:
                log_progress(f"[{i}/{len(indices_to_create)}] {idx_name} - SKIPPED", "INFO")
                continue
            log_progress(f"[{i}/{len(indices_to_create)}] Creating {idx_name} on {table}({column})...", "INFO")
            start = time.time()
            try:
                cursor.execute(f"CREATE INDEX IF NOT EXISTS {idx_name} ON {table}({column})")
                conn.commit()
                elapsed = time.time() - start
                log_progress(f"  Created in {elapsed:.1f}s ({elapsed / 60:.1f} min)", "SUCCESS")
                completed_indices.add(idx_name)
                update_remote_progress(list(completed_indices), list(analyzed_tables), False, False)
                upload_database_checkpoint(f"Checkpoint: {idx_name} ({i}/{len(indices_to_create)})")
            except Exception as e:
                log_progress(f"Failed to create {idx_name}: {e}", "ERROR")
                conn.close()
                raise
        # PHASE 2: ANALYZE
        log_progress("=" * 60, "INFO")
        log_progress("PHASE 2: ANALYZING TABLES", "INFO")
        log_progress("=" * 60, "INFO")
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'")
        tables = [row[0] for row in cursor.fetchall()]
        log_progress(f"Found {len(tables)} tables: {tables}", "INFO")
        for i, table in enumerate(tables, 1):
            if table in analyzed_tables:
                log_progress(f"[{i}/{len(tables)}] {table} - SKIPPED", "INFO")
                continue
            log_progress(f"[{i}/{len(tables)}] Analyzing {table}...", "INFO")
            try:
                cursor.execute(f"SELECT COUNT(*) FROM {table}")
                row_count = cursor.fetchone()[0]
                log_progress(f"  Rows: {row_count:,}", "INFO")
            except Exception:
                log_progress("  Could not count rows", "WARN")
            start = time.time()
            try:
                cursor.execute(f"ANALYZE {table}")
                conn.commit()
                elapsed = time.time() - start
                log_progress(f"  Analyzed in {elapsed:.1f}s", "SUCCESS")
                analyzed_tables.add(table)
                update_remote_progress(list(completed_indices), list(analyzed_tables), False, False)
                upload_database_checkpoint(f"Checkpoint: {table} analyzed ({i}/{len(tables)})")
            except Exception as e:
                log_progress(f"Failed to analyze {table}: {e}", "ERROR")
                log_progress("Continuing...", "WARN")
        # Final checkpoint
        log_progress("Final WAL checkpoint...", "INFO")
        cursor.execute("PRAGMA wal_checkpoint(TRUNCATE)")
        conn.commit()
        conn.close()
        log_progress("Database closed", "SUCCESS")
    # Final upload
    log_progress("=" * 60, "INFO")
    log_progress("FINAL UPLOAD", "INFO")
    log_progress("=" * 60, "INFO")
    has_indices, idx_count = verify_database_has_indices(LOCAL_DB_PATH)
    log_progress(f"Final check: {idx_count} indices", "SUCCESS" if has_indices else "ERROR")
    upload_database_checkpoint("COMPLETE - All indices and analysis done")
    update_remote_progress(list(completed_indices), list(analyzed_tables), True, True)
    log_progress("=" * 60, "SUCCESS")
    log_progress("INDEXING COMPLETE!", "SUCCESS")
    log_progress("=" * 60, "SUCCESS")
    return LOCAL_DB_PATH

# Initialize
DB_PATH = create_indexed_database()

def get_db_connection():
    """Create an optimized connection."""
    log_progress("Creating DB connection", "DEBUG")
    conn = sqlite3.connect(DB_PATH, check_same_thread=False)
    conn.execute("PRAGMA cache_size = -256000")
    conn.execute("PRAGMA mmap_size = 4294967296")
    return conn

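# Note: the callers below use `with get_db_connection() as conn:`. For
# sqlite3, that context manager only wraps a transaction; it does NOT close
# the connection on exit, so each call leaks one handle. One way to get
# close-on-exit semantics without touching the callers' query code (a sketch;
# contextlib.closing is stdlib, and closing_db_connection is a new name):
import contextlib

def closing_db_connection():
    """Usable as `with closing_db_connection() as conn:` so the connection
    is actually closed when the block exits."""
    return contextlib.closing(get_db_connection())
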
def run_diagnostics():
    """Run comprehensive diagnostics."""
    log_progress("=" * 60, "INFO")
    log_progress("RUNNING DIAGNOSTICS", "INFO")
    log_progress("=" * 60, "INFO")
    try:
        with get_db_connection() as conn:
            cursor = conn.cursor()
            # 1. Sample nodes
            log_progress("\n1. Sample node IDs:", "INFO")
            cursor.execute("SELECT id, label FROM node LIMIT 10")
            for node_id, label in cursor.fetchall():
                print(f"  {node_id} -> {label}")
            # 2. Test the correct pattern
            log_progress("\n2. Testing CORRECT pattern (no leading %):", "INFO")
            test_pattern = f"{CONCEPTNET_BASE}/c/en/dog%"
            log_progress(f"  Pattern: {test_pattern}", "DEBUG")
            start = time.time()
            cursor.execute("SELECT id, label FROM node WHERE id LIKE ? LIMIT 5", (test_pattern,))
            results = cursor.fetchall()
            elapsed = time.time() - start
            log_progress(f"  Found {len(results)} in {elapsed:.3f}s", "SUCCESS" if elapsed < 1 else "WARN")
            for node_id, label in results:
                print(f"  {node_id} -> {label}")
            # 3. Check index usage
            log_progress("\n3. Checking index usage:", "INFO")
            cursor.execute(f"EXPLAIN QUERY PLAN SELECT * FROM edge WHERE start_id LIKE '{test_pattern}'")
            plan = cursor.fetchall()
            uses_index = any('INDEX' in str(row).upper() for row in plan)
            log_progress(f"  Uses index: {uses_index}", "SUCCESS" if uses_index else "ERROR")
            for row in plan:
                print(f"  {row}")
            # 4. Test the wrong pattern
            log_progress("\n4. Testing WRONG pattern (leading %):", "WARN")
            wrong_pattern = "%/c/en/dog%"
            log_progress(f"  Pattern: {wrong_pattern}", "DEBUG")
            start = time.time()
            cursor.execute("SELECT id, label FROM node WHERE id LIKE ? LIMIT 5", (wrong_pattern,))
            results = cursor.fetchall()
            elapsed = time.time() - start
            log_progress(f"  Found {len(results)} in {elapsed:.3f}s (SLOW!)", "WARN" if elapsed > 1 else "INFO")
            cursor.execute(f"EXPLAIN QUERY PLAN SELECT * FROM node WHERE id LIKE '{wrong_pattern}'")
            plan = cursor.fetchall()
            uses_index = any('INDEX' in str(row).upper() for row in plan)
            log_progress(f"  Uses index: {uses_index} (should be False)", "WARN" if uses_index else "INFO")
        log_progress("\n" + "=" * 60, "INFO")
        log_progress("DIAGNOSTICS COMPLETE", "SUCCESS")
        log_progress("=" * 60 + "\n", "INFO")
    except Exception as e:
        log_progress(f"Diagnostics failed: {e}", "ERROR")
        import traceback
        traceback.print_exc()

# Run diagnostics at startup
run_diagnostics()

def get_semantic_profile(word, lang='en', progress=gr.Progress()):
    """Get a semantic profile using the CORRECT URL pattern."""
    log_progress(f"Semantic profile request: word='{word}', lang='{lang}'", "DEBUG")
    progress(0, desc="Starting...")
    if not word:
        return "⚠️ Please enter a word."
    if lang not in TARGET_LANGUAGES:
        return f"⚠️ Language '{lang}' not supported. Available: {', '.join(TARGET_LANGUAGES)}"
    word = word.strip().lower().replace(' ', '_')
    # CORRECT pattern - no leading %, which allows index usage!
    like_path = f"{CONCEPTNET_BASE}/c/{lang}/{word}%"
    log_progress(f"Using pattern: {like_path}", "DEBUG")
    relations = [
        "/r/IsA", "/r/PartOf", "/r/HasA", "/r/UsedFor", "/r/CapableOf",
        "/r/Causes", "/r/HasProperty", "/r/Synonym", "/r/Antonym",
        "/r/AtLocation", "/r/RelatedTo", "/r/DerivedFrom", "/r/SimilarTo"
    ]
    output_md = f"# 🧠 Semantic Profile: '{word}' ({lang.upper()})\n\n"
    try:
        with get_db_connection() as conn:
            cursor = conn.cursor()
            progress(0.05, desc="Finding nodes...")
            start = time.time()
            cursor.execute("SELECT id, label FROM node WHERE id LIKE ? LIMIT 5", (like_path,))
            nodes = cursor.fetchall()
            elapsed = time.time() - start
            log_progress(f"Found {len(nodes)} nodes in {elapsed:.3f}s", "SUCCESS" if nodes else "WARN")
            if not nodes:
                return f"# 🧠 Semantic Profile: '{word}'\n\n⚠️ **Not found**\n\nSearched: `{like_path}`"
            for node_id, label in nodes[:3]:
                output_md += f"**Node:** `{node_id}`\n"
                output_md += f"**Label:** {label}\n\n"
                log_progress(f"  Found node: {node_id} ({label})", "DEBUG")
            total_relations = 0
            for i, rel in enumerate(relations):
                progress((i + 1) / len(relations), desc=f"Querying {rel}...")
                log_progress(f"Querying relation: {rel}", "DEBUG")
                output_md += f"## {rel}\n\n"
                has_results = False
                # Outgoing edges
                start = time.time()
                cursor.execute("""
                    SELECT en.label, e.weight
                    FROM edge e
                    JOIN node en ON e.end_id = en.id
                    JOIN relation r ON e.rel_id = r.id
                    WHERE e.start_id LIKE ? AND r.label = ?
                    ORDER BY e.weight DESC
                    LIMIT 7
                """, (like_path, rel))
                out_results = cursor.fetchall()
                elapsed = time.time() - start
                log_progress(f"  Outgoing: {len(out_results)} results in {elapsed:.3f}s", "DEBUG")
                for label, weight in out_results:
                    output_md += f"- **{word}** {rel} → *{label}* `[{weight:.3f}]`\n"
                    has_results = True
                    total_relations += 1
                # Incoming edges
                start = time.time()
                cursor.execute("""
                    SELECT s.label, e.weight
                    FROM edge e
                    JOIN node s ON e.start_id = s.id
                    JOIN relation r ON e.rel_id = r.id
                    WHERE e.end_id LIKE ? AND r.label = ?
                    ORDER BY e.weight DESC
                    LIMIT 7
                """, (like_path, rel))
                in_results = cursor.fetchall()
                elapsed = time.time() - start
                log_progress(f"  Incoming: {len(in_results)} results in {elapsed:.3f}s", "DEBUG")
                for label, weight in in_results:
                    output_md += f"- *{label}* {rel} → **{word}** `[{weight:.3f}]`\n"
                    has_results = True
                    total_relations += 1
                if not has_results:
                    output_md += "*No results*\n"
                output_md += "\n"
        progress(1.0, desc="Complete!")
        output_md += "---\n"
        output_md += f"**Total relations:** {total_relations}\n"
        log_progress(f"Profile complete: {total_relations} relations found", "SUCCESS")
        return output_md
    except Exception as e:
        log_progress(f"Error in semantic profile: {e}", "ERROR")
        import traceback
        traceback.print_exc()
        return f"**❌ Error:**\n\n```\n{e}\n```"

def run_query(start_node, relation, end_node, limit, progress=gr.Progress()):
    """Query builder using CORRECT patterns."""
    log_progress(f"Query request: start={start_node}, rel={relation}, end={end_node}, limit={limit}", "DEBUG")
    progress(0, desc="Building query...")
    query = """
        SELECT
            e.id AS edge_id,
            s.id AS start_id,
            r.label AS relation,
            en.id AS end_id,
            e.weight,
            s.label AS start_label,
            en.label AS end_label
        FROM edge e
        JOIN relation r ON e.rel_id = r.id
        JOIN node s ON e.start_id = s.id
        JOIN node en ON e.end_id = en.id
        WHERE 1=1
    """
    params = []
    try:
        with get_db_connection() as conn:
            progress(0.3, desc="Adding filters...")
            # Language filter - use the correct URL pattern!
            lang_conditions = []
            for lang in TARGET_LANGUAGES:
                lang_conditions.append(f"s.id LIKE '{CONCEPTNET_BASE}/c/{lang}/%'")
                lang_conditions.append(f"en.id LIKE '{CONCEPTNET_BASE}/c/{lang}/%'")
            query += f" AND ({' OR '.join(lang_conditions)})"
            # Start node filter
            if start_node and start_node.strip():
                if start_node.startswith('http://'):
                    pattern = f"{start_node}%"
                else:
                    # The user enters just a word; we construct the full URL
                    pattern = f"{CONCEPTNET_BASE}/c/%/{start_node}%"
                query += " AND s.id LIKE ?"
                params.append(pattern)
                log_progress(f"Start filter: {pattern}", "DEBUG")
            # Relation filter
            if relation and relation.strip():
                rel_value = relation if relation.startswith('/r/') else f"/r/{relation}"
                if '%' in relation:
                    query += " AND r.label LIKE ?"
                else:
                    query += " AND r.label = ?"
                params.append(rel_value)
                log_progress(f"Relation filter: {rel_value}", "DEBUG")
            # End node filter
            if end_node and end_node.strip():
                if end_node.startswith('http://'):
                    pattern = f"{end_node}%"
                else:
                    pattern = f"{CONCEPTNET_BASE}/c/%/{end_node}%"
                query += " AND en.id LIKE ?"
                params.append(pattern)
                log_progress(f"End filter: {pattern}", "DEBUG")
            query += " ORDER BY e.weight DESC LIMIT ?"
            params.append(limit)
            progress(0.6, desc="Executing...")
            log_progress(f"Executing query with {len(params)} params", "DEBUG")
            start_time = time.time()
            df = pd.read_sql_query(query, conn, params=params)
            elapsed = time.time() - start_time
            log_progress(f"Query complete: {len(df)} results in {elapsed:.2f}s", "SUCCESS")
            progress(1.0, desc="Complete!")
            if df.empty:
                return pd.DataFrame(), f"⚠️ No results ({elapsed:.2f}s)"
            df.columns = ['edge_id', 'start_id', 'relation', 'end_id', 'weight', 'start_label', 'end_label']
            return df, f"✅ {len(df)} results in {elapsed:.2f}s"
    except Exception as e:
        log_progress(f"Query error: {e}", "ERROR")
        import traceback
        traceback.print_exc()
        return pd.DataFrame(), f"**❌ Error:** {e}"

def run_raw_query(sql_query):
    """Execute raw SQL with logging (SELECT only)."""
    log_progress(f"Raw SQL query: {sql_query[:100]}...", "DEBUG")
    if not sql_query.strip().upper().startswith("SELECT"):
        return pd.DataFrame(), "❌ Only SELECT queries allowed"
    try:
        with get_db_connection() as conn:
            start = time.time()
            df = pd.read_sql_query(sql_query, conn)
            elapsed = time.time() - start
        log_progress(f"Raw query complete: {len(df)} rows in {elapsed:.3f}s", "SUCCESS")
        return df, f"✅ {len(df)} rows in {elapsed:.3f}s"
    except Exception as e:
        log_progress(f"Raw query error: {e}", "ERROR")
        return pd.DataFrame(), f"❌ Error: {e}"

def get_schema_info():
    """Get the schema, with sample queries."""
    log_progress("Loading schema info", "DEBUG")
    md = "# 📊 Database Schema\n\n"
    md += f"**Repository:** [{INDEXED_REPO_ID}](https://huggingface.co/datasets/{INDEXED_REPO_ID})\n\n"
    md += f"**Base URL:** `{CONCEPTNET_BASE}`\n\n"
    md += "## Sample Queries\n\n"
    md += "**Finding nodes:**\n```sql\n"
    md += "-- English 'dog'\n"
    md += f"SELECT * FROM node WHERE id LIKE '{CONCEPTNET_BASE}/c/en/dog%';\n\n"
    md += "-- German 'hund'\n"
    md += f"SELECT * FROM node WHERE id LIKE '{CONCEPTNET_BASE}/c/de/hund%';\n"
    md += "```\n\n"
    md += "**Finding edges:**\n```sql\n"
    md += "-- Edges from 'dog'\n"
    md += f"SELECT * FROM edge WHERE start_id LIKE '{CONCEPTNET_BASE}/c/en/dog%' LIMIT 10;\n"
    md += "```\n\n"
    md += "⚠️ **Important:** Do NOT use a leading `%` in LIKE queries (it prevents index usage!)\n\n"
    md += "✅ **Good:** `LIKE 'http://conceptnet.io/c/en/dog%'`\n"
    md += "❌ **Bad:** `LIKE '%/c/en/dog%'`\n\n"
    try:
        with get_db_connection() as conn:
            cursor = conn.cursor()
            md += "## Tables\n\n"
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'")
            for table, in cursor.fetchall():
                cursor.execute(f"SELECT COUNT(*) FROM {table}")
                count = cursor.fetchone()[0]
                md += f"### {table} ({count:,} rows)\n\n"
                # Show columns
                cursor.execute(f"PRAGMA table_info({table})")
                cols = cursor.fetchall()
                md += "| Column | Type |\n|:--|:--|\n"
                for col in cols:
                    md += f"| `{col[1]}` | `{col[2]}` |\n"
                # Show indices
                cursor.execute(f"PRAGMA index_list({table})")
                indices = cursor.fetchall()
                if indices:
                    md += f"\n**Indices ({len(indices)}):**\n"
                    for idx in indices:
                        custom = " ⭐" if idx[1].startswith("idx_") else ""
                        md += f"- `{idx[1]}`{custom}\n"
                md += "\n"
        log_progress("Schema loaded successfully", "SUCCESS")
    except Exception as e:
        log_progress(f"Schema error: {e}", "ERROR")
        md += f"\n**Error loading schema:** {e}\n"
    return md

# UI
with gr.Blocks(title="ConceptNet Explorer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧠 ConceptNet Explorer")
    gr.Markdown(
        f"**Multi-language semantic network explorer** | "
        f"**Languages:** {', '.join([l.upper() for l in TARGET_LANGUAGES])} | "
        f"**Repo:** [{INDEXED_REPO_ID}](https://huggingface.co/datasets/{INDEXED_REPO_ID})"
    )
    gr.Markdown("✅ **Optimized with custom indices** - fast queries using correct URL patterns")
    with gr.Tabs():
        with gr.TabItem("🔍 Semantic Profile"):
            gr.Markdown("**Explore semantic relations for any word**")
            with gr.Row():
                word_input = gr.Textbox(
                    label="Word",
                    placeholder="dog",
                    value="dog",
                    info="Enter a word to explore"
                )
                lang_input = gr.Dropdown(
                    choices=TARGET_LANGUAGES,
                    value="en",
                    label="Language",
                    info="Select language"
                )
            semantic_btn = gr.Button("🔍 Get Semantic Profile", variant="primary", size="lg")
            semantic_output = gr.Markdown("*Enter a word and click the button to start...*")
            gr.Markdown("**Examples:** dog (en), hund (de), perro (es), chien (fr), 犬 (ja)")
        with gr.TabItem("⚡ Query Builder"):
            gr.Markdown("**Build custom queries to find specific relationships**")
            with gr.Row():
                start_input = gr.Textbox(
                    label="Start Node",
                    placeholder="dog",
                    info="Enter a word or full URL"
                )
                rel_input = gr.Textbox(
                    label="Relation",
                    placeholder="IsA",
                    value="IsA",
                    info="e.g., IsA, PartOf, UsedFor"
                )
                end_input = gr.Textbox(
                    label="End Node",
                    placeholder="",
                    info="Leave empty for all"
                )
            limit_slider = gr.Slider(
                label="Result Limit",
                minimum=1,
                maximum=200,
                value=50,
                step=1
            )
            query_btn = gr.Button("▶️ Run Query", variant="primary", size="lg")
            status_output = gr.Markdown("*Ready to query...*")
            results_output = gr.DataFrame(
                label="Results",
                wrap=True,
                interactive=False
            )
        with gr.TabItem("💻 Raw SQL"):
            gr.Markdown("**Execute custom SQL queries** (SELECT only)")
            raw_sql_input = gr.Textbox(
                label="SQL Query",
                value=f"SELECT * FROM node WHERE id LIKE '{CONCEPTNET_BASE}/c/en/dog%' LIMIT 10",
                lines=5,
                info="Write your SELECT query"
            )
            raw_btn = gr.Button("▶️ Execute Query", variant="secondary", size="lg")
            raw_status = gr.Markdown()
            raw_results = gr.DataFrame(label="Query Results", wrap=True)
            gr.Markdown(
                "**Tips:**\n"
                "- Always use `LIMIT` to prevent timeouts\n"
                f"- Node IDs start with: `{CONCEPTNET_BASE}/c/{{lang}}/{{word}}`\n"
                "- Don't use a leading `%` in LIKE queries for best performance"
            )
        with gr.TabItem("📊 Schema & Info"):
            gr.Markdown("**Database schema and structure information**")
            schema_btn = gr.Button("📊 Load Schema", variant="secondary", size="lg")
            schema_output = gr.Markdown("*Click the button to load the schema...*")
    gr.Markdown(
        "---\n"
        "**Performance:** Custom indices on `edge.start_id`, `edge.end_id`, `edge.rel_id`, `node.label` | "
        "**Check the server logs for detailed query timing and diagnostics**"
    )
    # Wire up event handlers
    semantic_btn.click(
        fn=get_semantic_profile,
        inputs=[word_input, lang_input],
        outputs=semantic_output
    )
    query_btn.click(
        fn=run_query,
        inputs=[start_input, rel_input, end_input, limit_slider],
        outputs=[results_output, status_output]
    )
    raw_btn.click(
        fn=run_raw_query,
        inputs=raw_sql_input,
        outputs=[raw_results, raw_status]
    )
    schema_btn.click(
        fn=get_schema_info,
        inputs=None,
        outputs=schema_output
    )

if __name__ == "__main__":
    log_progress("=" * 60, "SUCCESS")
    log_progress("APP READY!", "SUCCESS")
    log_progress("=" * 60, "SUCCESS")
    log_progress(f"Database: {DB_PATH}", "INFO")
    log_progress(f"Size: {os.path.getsize(DB_PATH) / (2**30):.2f} GB", "INFO")
    log_progress("=" * 60 + "\n", "SUCCESS")
    demo.launch(ssr_mode=False)