# conceptnet_db / app.py
import gradio as gr
import sqlite3
import pandas as pd
from huggingface_hub import hf_hub_download, HfApi
import os
import time
import shutil
from pathlib import Path
import json
# ===== CONFIGURATION =====
TARGET_LANGUAGES = ['de', 'en', 'es', 'fr', 'it', 'ja', 'nl', 'pl', 'pt', 'ru', 'zh']
INDEXED_REPO_ID = "cstr/conceptnet-de-indexed"
INDEXED_DB_FILENAME = "conceptnet-de-indexed.db"
PROGRESS_FILENAME = "indexing_progress.json"
LOCAL_DB_PATH = "/tmp/conceptnet-indexed.db"
CONCEPTNET_BASE = "http://conceptnet.io" # CRITICAL: Full URL base
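# Node IDs are stored as full URLs (e.g. "http://conceptnet.io/c/en/dog"),
# so LIKE patterns must keep this prefix intact to stay index-friendly.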
# =========================
print(f"🌍 Languages: {', '.join([l.upper() for l in TARGET_LANGUAGES])}")
HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HF_API_TOKEN")
if HF_TOKEN:
print(f"βœ… HF_TOKEN found (length: {len(HF_TOKEN)})")
else:
print("⚠️ No HF_TOKEN - checkpointing disabled")
ORIGINAL_REPO_ID = "ysenarath/conceptnet-sqlite"
ORIGINAL_DB_FILENAME = "data/conceptnet-v5.7.0.db"
def log_progress(message, level="INFO"):
"""Enhanced logging with timestamp"""
timestamp = time.strftime("%H:%M:%S")
    prefix = {
        "INFO": "ℹ️ ",
        "SUCCESS": "✅",
        "ERROR": "❌",
        "WARN": "⚠️ ",
        "CHECKPOINT": "💾",
        "DEBUG": "🔍"
    }.get(level, "")
print(f"[{timestamp}] {prefix} {message}")
def verify_database_has_indices(db_path):
"""Verify database has required indices"""
log_progress(f"Verifying indices in {os.path.basename(db_path)}...", "DEBUG")
if not os.path.exists(db_path):
log_progress("Database file does not exist", "ERROR")
return False, 0
try:
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='index' AND name LIKE 'idx_%'")
custom_indices = cursor.fetchall()
conn.close()
has_all = len(custom_indices) >= 4
log_progress(f"Found {len(custom_indices)} custom indices (need 4+): {has_all}", "SUCCESS" if has_all else "WARN")
return has_all, len(custom_indices)
except Exception as e:
log_progress(f"Error verifying indices: {e}", "ERROR")
return False, 0
def check_remote_progress():
"""Check remote progress with detailed logging"""
log_progress("Checking remote progress...", "DEBUG")
if not HF_TOKEN:
log_progress("No HF_TOKEN - cannot check remote", "WARN")
return {
"completed_indices": [],
"analyzed_tables": [],
"database_uploaded": False,
"indexing_complete": False
}
try:
api = HfApi()
try:
api.repo_info(repo_id=INDEXED_REPO_ID, repo_type="dataset", token=HF_TOKEN)
log_progress(f"Repository exists: {INDEXED_REPO_ID}", "SUCCESS")
        except Exception:
log_progress("Repository does not exist yet", "INFO")
return {
"completed_indices": [],
"analyzed_tables": [],
"database_uploaded": False,
"indexing_complete": False
}
try:
progress_path = hf_hub_download(
repo_id=INDEXED_REPO_ID,
filename=PROGRESS_FILENAME,
repo_type="dataset",
token=HF_TOKEN
)
with open(progress_path, 'r') as f:
progress = json.load(f)
log_progress("Remote progress loaded:", "SUCCESS")
log_progress(f" Completed indices: {progress.get('completed_indices', [])}", "INFO")
log_progress(f" Analyzed tables: {progress.get('analyzed_tables', [])}", "INFO")
log_progress(f" Indexing complete: {progress.get('indexing_complete', False)}", "INFO")
return progress
        except Exception:
log_progress("No progress file found (starting fresh)", "INFO")
return {
"completed_indices": [],
"analyzed_tables": [],
"database_uploaded": False,
"indexing_complete": False
}
except Exception as e:
log_progress(f"Error checking remote: {e}", "ERROR")
return {
"completed_indices": [],
"analyzed_tables": [],
"database_uploaded": False,
"indexing_complete": False
}
def update_remote_progress(completed_indices, analyzed_tables=None, database_uploaded=False, indexing_complete=False):
"""Update remote progress file"""
log_progress("Updating remote progress...", "DEBUG")
if not HF_TOKEN:
log_progress("Cannot update progress: No HF_TOKEN", "WARN")
return False
if analyzed_tables is None:
analyzed_tables = []
try:
api = HfApi()
try:
api.repo_info(repo_id=INDEXED_REPO_ID, repo_type="dataset", token=HF_TOKEN)
        except Exception:
log_progress(f"Creating repository: {INDEXED_REPO_ID}", "INFO")
api.create_repo(
repo_id=INDEXED_REPO_ID,
repo_type="dataset",
token=HF_TOKEN,
private=False
)
progress = {
"completed_indices": completed_indices,
"analyzed_tables": analyzed_tables,
"database_uploaded": database_uploaded,
"indexing_complete": indexing_complete,
"timestamp": time.time(),
"languages": TARGET_LANGUAGES
}
progress_path = "/tmp/indexing_progress.json"
with open(progress_path, 'w') as f:
json.dump(progress, f, indent=2)
api.upload_file(
path_or_fileobj=progress_path,
path_in_repo=PROGRESS_FILENAME,
repo_id=INDEXED_REPO_ID,
repo_type="dataset",
token=HF_TOKEN,
commit_message=f"Progress: {len(completed_indices)} indices, {len(analyzed_tables)} tables"
)
log_progress(f"Progress updated: {len(completed_indices)} indices, {len(analyzed_tables)} tables", "CHECKPOINT")
return True
except Exception as e:
log_progress(f"Failed to update progress: {e}", "ERROR")
import traceback
traceback.print_exc()
return False
def upload_database_checkpoint(message=""):
"""Upload database with WAL checkpoint"""
log_progress("Starting database upload...", "CHECKPOINT")
if not HF_TOKEN:
log_progress("Cannot upload: No HF_TOKEN", "WARN")
return False
if not os.path.exists(LOCAL_DB_PATH):
log_progress("Database file doesn't exist", "ERROR")
return False
try:
# CRITICAL: Checkpoint WAL to merge changes into main file
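        # TRUNCATE mode also resets the -wal sidecar file to zero length,
        # so uploading the main .db file alone captures all committed changes.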
log_progress("Checkpointing WAL...", "DEBUG")
conn = sqlite3.connect(LOCAL_DB_PATH)
conn.execute("PRAGMA wal_checkpoint(TRUNCATE)")
conn.close()
log_progress("WAL checkpoint complete", "SUCCESS")
# Verify indices are in file
has_indices, idx_count = verify_database_has_indices(LOCAL_DB_PATH)
log_progress(f"Pre-upload verification: {idx_count} indices", "SUCCESS" if has_indices else "WARN")
api = HfApi()
db_size = os.path.getsize(LOCAL_DB_PATH) / (2**30)
log_progress(f"Uploading {db_size:.2f} GB to {INDEXED_REPO_ID}...", "CHECKPOINT")
if message:
log_progress(f" Message: {message}", "INFO")
log_progress(" This will take 2-5 minutes...", "INFO")
start = time.time()
api.upload_file(
path_or_fileobj=LOCAL_DB_PATH,
path_in_repo=INDEXED_DB_FILENAME,
repo_id=INDEXED_REPO_ID,
repo_type="dataset",
token=HF_TOKEN,
commit_message=message or "Database checkpoint"
)
        elapsed = time.time() - start
        # db_size is in GiB, so scale by 1024 (GiB -> MiB) and 8 (bytes -> bits)
        speed_mbps = (db_size * 1024 * 8) / elapsed if elapsed > 0 else 0
        log_progress(f"Upload complete in {elapsed:.1f}s ({speed_mbps:.1f} Mbps)", "SUCCESS")
log_progress(f"View at: https://huggingface.co/datasets/{INDEXED_REPO_ID}", "INFO")
return True
except Exception as e:
log_progress(f"Upload failed: {e}", "ERROR")
import traceback
traceback.print_exc()
return False
def create_indexed_database():
"""Create or download indexed database with comprehensive checkpointing"""
log_progress("="*60, "INFO")
log_progress("STARTING DATABASE SETUP", "INFO")
log_progress("="*60, "INFO")
# Check remote progress
progress = check_remote_progress()
completed_indices = set(progress.get("completed_indices", []))
analyzed_tables = set(progress.get("analyzed_tables", []))
database_uploaded = progress.get("database_uploaded", False)
indexing_complete = progress.get("indexing_complete", False)
# If fully complete, download and return
if indexing_complete:
log_progress("Fully indexed database exists!", "SUCCESS")
log_progress(f"Downloading from {INDEXED_REPO_ID}...", "INFO")
try:
indexed_path = hf_hub_download(
repo_id=INDEXED_REPO_ID,
filename=INDEXED_DB_FILENAME,
repo_type="dataset",
token=HF_TOKEN
)
log_progress(f"Downloaded to: {indexed_path}", "SUCCESS")
# Verify it actually has indices
has_indices, idx_count = verify_database_has_indices(indexed_path)
if has_indices:
log_progress(f"Verified {idx_count} indices present", "SUCCESS")
return indexed_path
else:
log_progress(f"CORRUPTED: Only {idx_count}/4 indices found!", "ERROR")
log_progress("The database needs to be re-indexed", "WARN")
# Reset and rebuild
indexing_complete = False
completed_indices = set()
analyzed_tables = set()
database_uploaded = False
update_remote_progress([], [], False, False)
except Exception as e:
log_progress(f"Download failed: {e}", "ERROR")
log_progress("Will create locally", "INFO")
# Download partially indexed DB if checkpoint exists
if (completed_indices or analyzed_tables or database_uploaded) and not os.path.exists(LOCAL_DB_PATH):
log_progress("Checkpoint detected - downloading partial DB...", "INFO")
log_progress(f" Indices done: {sorted(completed_indices)}", "INFO")
log_progress(f" Tables analyzed: {sorted(analyzed_tables)}", "INFO")
try:
indexed_path = hf_hub_download(
repo_id=INDEXED_REPO_ID,
filename=INDEXED_DB_FILENAME,
repo_type="dataset",
token=HF_TOKEN
)
log_progress("Downloaded partial DB", "SUCCESS")
# Verify indices
has_indices, idx_count = verify_database_has_indices(indexed_path)
if idx_count >= len(completed_indices):
log_progress(f"Verified {idx_count} indices (expected {len(completed_indices)})", "SUCCESS")
log_progress(f"Copying to {LOCAL_DB_PATH}...", "DEBUG")
start = time.time()
shutil.copy2(indexed_path, LOCAL_DB_PATH)
elapsed = time.time() - start
log_progress(f"Copied in {elapsed:.1f}s", "SUCCESS")
log_progress("Resuming from checkpoint βœ…", "SUCCESS")
else:
log_progress(f"Index mismatch: found {idx_count}, expected {len(completed_indices)}", "ERROR")
log_progress("Will start from scratch", "WARN")
completed_indices = set()
analyzed_tables = set()
except Exception as e:
log_progress(f"Could not download partial DB: {e}", "WARN")
log_progress("Will start from original", "INFO")
completed_indices = set()
analyzed_tables = set()
# Download original if needed
if not os.path.exists(LOCAL_DB_PATH):
if completed_indices or analyzed_tables:
log_progress("Failed to resume - clearing progress", "WARN")
update_remote_progress([], [], False, False)
completed_indices = set()
analyzed_tables = set()
log_progress("Downloading original ConceptNet database...", "INFO")
original_path = hf_hub_download(
repo_id=ORIGINAL_REPO_ID,
filename=ORIGINAL_DB_FILENAME,
repo_type="dataset"
)
original_size = os.path.getsize(original_path)
        free_space = shutil.disk_usage("/tmp").free
log_progress(f"Original size: {original_size / (2**30):.2f} GB", "INFO")
log_progress(f"Free space: {free_space / (2**30):.2f} GB", "INFO")
if free_space < original_size * 2:
            raise RuntimeError(f"Insufficient space! Need {original_size * 2 / (2**30):.1f} GB, have {free_space / (2**30):.1f} GB")
log_progress(f"Copying to {LOCAL_DB_PATH}...", "INFO")
start = time.time()
shutil.copy2(original_path, LOCAL_DB_PATH)
elapsed = time.time() - start
log_progress(f"Copied {original_size / (2**30):.2f} GB in {elapsed:.1f}s ({original_size / elapsed / (2**20):.1f} MB/s)", "SUCCESS")
# Only index if not complete
if not (len(completed_indices) >= 4 and len(analyzed_tables) >= 4):
log_progress("Indexing required", "INFO")
# Connect
log_progress("Opening database connection...", "DEBUG")
conn = sqlite3.connect(LOCAL_DB_PATH)
cursor = conn.cursor()
# Optimizations
log_progress("Setting PRAGMA optimizations...", "DEBUG")
cursor.execute("PRAGMA journal_mode = WAL")
cursor.execute("PRAGMA synchronous = NORMAL")
cursor.execute("PRAGMA cache_size = -512000")
cursor.execute("PRAGMA temp_store = MEMORY")
# PHASE 1: Indices
log_progress("="*60, "INFO")
log_progress("PHASE 1: CREATING INDICES", "INFO")
log_progress("="*60, "INFO")
indices_to_create = [
("idx_edge_start_id", "edge", "start_id"),
("idx_edge_end_id", "edge", "end_id"),
("idx_edge_rel_id", "edge", "rel_id"),
("idx_node_label", "node", "label"),
]
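        # The edge indices serve the start_id/end_id prefix-LIKE filters and
        # rel_id joins used by the query functions below; idx_node_label
        # supports direct label lookups.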
for i, (idx_name, table, column) in enumerate(indices_to_create, 1):
if idx_name in completed_indices:
log_progress(f"[{i}/{len(indices_to_create)}] {idx_name} - SKIPPED", "INFO")
continue
log_progress(f"[{i}/{len(indices_to_create)}] Creating {idx_name} on {table}({column})...", "INFO")
start = time.time()
try:
cursor.execute(f"CREATE INDEX IF NOT EXISTS {idx_name} ON {table}({column})")
conn.commit()
elapsed = time.time() - start
log_progress(f" Created in {elapsed:.1f}s ({elapsed/60:.1f} min)", "SUCCESS")
completed_indices.add(idx_name)
update_remote_progress(list(completed_indices), list(analyzed_tables), False, False)
upload_database_checkpoint(f"Checkpoint: {idx_name} ({i}/{len(indices_to_create)})")
except Exception as e:
log_progress(f"Failed to create {idx_name}: {e}", "ERROR")
conn.close()
raise
# PHASE 2: ANALYZE
log_progress("="*60, "INFO")
log_progress("PHASE 2: ANALYZING TABLES", "INFO")
log_progress("="*60, "INFO")
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'")
tables = [row[0] for row in cursor.fetchall()]
log_progress(f"Found {len(tables)} tables: {tables}", "INFO")
for i, table in enumerate(tables, 1):
if table in analyzed_tables:
log_progress(f"[{i}/{len(tables)}] {table} - SKIPPED", "INFO")
continue
log_progress(f"[{i}/{len(tables)}] Analyzing {table}...", "INFO")
try:
cursor.execute(f"SELECT COUNT(*) FROM {table}")
row_count = cursor.fetchone()[0]
log_progress(f" Rows: {row_count:,}", "INFO")
            except Exception:
log_progress(" Could not count rows", "WARN")
start = time.time()
try:
cursor.execute(f"ANALYZE {table}")
conn.commit()
elapsed = time.time() - start
log_progress(f" Analyzed in {elapsed:.1f}s", "SUCCESS")
analyzed_tables.add(table)
update_remote_progress(list(completed_indices), list(analyzed_tables), False, False)
upload_database_checkpoint(f"Checkpoint: {table} analyzed ({i}/{len(tables)})")
except Exception as e:
log_progress(f"Failed to analyze {table}: {e}", "ERROR")
log_progress("Continuing...", "WARN")
# Final checkpoint
log_progress("Final WAL checkpoint...", "INFO")
cursor.execute("PRAGMA wal_checkpoint(TRUNCATE)")
conn.commit()
conn.close()
log_progress("Database closed", "SUCCESS")
# Final upload
log_progress("="*60, "INFO")
log_progress("FINAL UPLOAD", "INFO")
log_progress("="*60, "INFO")
has_indices, idx_count = verify_database_has_indices(LOCAL_DB_PATH)
log_progress(f"Final check: {idx_count} indices", "SUCCESS" if has_indices else "ERROR")
upload_database_checkpoint("COMPLETE - All indices and analysis done")
update_remote_progress(list(completed_indices), list(analyzed_tables), True, True)
log_progress("="*60, "SUCCESS")
log_progress("INDEXING COMPLETE!", "SUCCESS")
log_progress("="*60, "SUCCESS")
return LOCAL_DB_PATH
# Initialize
DB_PATH = create_indexed_database()
def get_db_connection():
"""Create optimized connection"""
log_progress("Creating DB connection", "DEBUG")
conn = sqlite3.connect(DB_PATH, check_same_thread=False)
conn.execute("PRAGMA cache_size = -256000")
conn.execute("PRAGMA mmap_size = 4294967296")
return conn
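# Note: the "with get_db_connection() as conn:" blocks below use sqlite3's
# transaction context manager, which commits/rolls back on exit but does NOT
# close the connection.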
def run_diagnostics():
"""Run comprehensive diagnostics"""
log_progress("="*60, "INFO")
log_progress("RUNNING DIAGNOSTICS", "INFO")
log_progress("="*60, "INFO")
try:
with get_db_connection() as conn:
cursor = conn.cursor()
# 1. Sample nodes
log_progress("\n1. Sample node IDs:", "INFO")
cursor.execute("SELECT id, label FROM node LIMIT 10")
for node_id, label in cursor.fetchall():
print(f" {node_id} -> {label}")
# 2. Test correct pattern
log_progress("\n2. Testing CORRECT pattern (no leading %):", "INFO")
test_pattern = f"{CONCEPTNET_BASE}/c/en/dog%"
log_progress(f" Pattern: {test_pattern}", "DEBUG")
start = time.time()
cursor.execute("SELECT id, label FROM node WHERE id LIKE ? LIMIT 5", (test_pattern,))
results = cursor.fetchall()
elapsed = time.time() - start
log_progress(f" Found {len(results)} in {elapsed:.3f}s", "SUCCESS" if elapsed < 1 else "WARN")
for node_id, label in results:
print(f" {node_id} -> {label}")
# 3. Check index usage
log_progress("\n3. Checking index usage:", "INFO")
cursor.execute(f"EXPLAIN QUERY PLAN SELECT * FROM edge WHERE start_id LIKE '{test_pattern}'")
plan = cursor.fetchall()
uses_index = any('INDEX' in str(row).upper() for row in plan)
log_progress(f" Uses index: {uses_index}", "SUCCESS" if uses_index else "ERROR")
for row in plan:
print(f" {row}")
# 4. Test wrong pattern
log_progress("\n4. Testing WRONG pattern (leading %):", "WARN")
wrong_pattern = f"%/c/en/dog%"
log_progress(f" Pattern: {wrong_pattern}", "DEBUG")
start = time.time()
cursor.execute("SELECT id, label FROM node WHERE id LIKE ? LIMIT 5", (wrong_pattern,))
results = cursor.fetchall()
elapsed = time.time() - start
log_progress(f" Found {len(results)} in {elapsed:.3f}s (SLOW!)", "WARN" if elapsed > 1 else "INFO")
cursor.execute(f"EXPLAIN QUERY PLAN SELECT * FROM node WHERE id LIKE '{wrong_pattern}'")
plan = cursor.fetchall()
uses_index = any('INDEX' in str(row).upper() for row in plan)
log_progress(f" Uses index: {uses_index} (should be False)", "WARN" if uses_index else "INFO")
log_progress("\n" + "="*60, "INFO")
log_progress("DIAGNOSTICS COMPLETE", "SUCCESS")
log_progress("="*60 + "\n", "INFO")
except Exception as e:
log_progress(f"Diagnostics failed: {e}", "ERROR")
import traceback
traceback.print_exc()
# Run diagnostics
run_diagnostics()
def get_semantic_profile(word, lang='en', progress=gr.Progress()):
"""Get semantic profile with CORRECT URL pattern"""
log_progress(f"Semantic profile request: word='{word}', lang='{lang}'", "DEBUG")
progress(0, desc="Starting...")
    if not word or not word.strip():
        return "⚠️ Please enter a word."
if lang not in TARGET_LANGUAGES:
return f"⚠️ Language '{lang}' not supported. Available: {', '.join(TARGET_LANGUAGES)}"
word = word.strip().lower().replace(' ', '_')
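    # ConceptNet URIs use underscores for multi-word terms (e.g. /c/en/hot_dog)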
# CORRECT pattern - no leading % allows index usage!
like_path = f"{CONCEPTNET_BASE}/c/{lang}/{word}%"
log_progress(f"Using pattern: {like_path}", "DEBUG")
relations = [
"/r/IsA", "/r/PartOf", "/r/HasA", "/r/UsedFor", "/r/CapableOf",
"/r/Causes", "/r/HasProperty", "/r/Synonym", "/r/Antonym",
"/r/AtLocation", "/r/RelatedTo", "/r/DerivedFrom", "/r/SimilarTo"
]
output_md = f"# 🧠 Semantic Profile: '{word}' ({lang.upper()})\n\n"
try:
with get_db_connection() as conn:
cursor = conn.cursor()
progress(0.05, desc="Finding nodes...")
start = time.time()
cursor.execute("SELECT id, label FROM node WHERE id LIKE ? LIMIT 5", (like_path,))
nodes = cursor.fetchall()
elapsed = time.time() - start
log_progress(f"Found {len(nodes)} nodes in {elapsed:.3f}s", "SUCCESS" if nodes else "WARN")
if not nodes:
return f"# 🧠 Semantic Profile: '{word}'\n\n⚠️ **Not found**\n\nSearched: `{like_path}`"
for node_id, label in nodes[:3]:
output_md += f"**Node:** `{node_id}`\n"
output_md += f"**Label:** {label}\n\n"
log_progress(f" Found node: {node_id} ({label})", "DEBUG")
total_relations = 0
for i, rel in enumerate(relations):
progress((i + 1) / len(relations), desc=f"Querying {rel}...")
log_progress(f"Querying relation: {rel}", "DEBUG")
output_md += f"## {rel}\n\n"
has_results = False
# Outgoing edges
start = time.time()
cursor.execute("""
SELECT en.label, e.weight
FROM edge e
JOIN node en ON e.end_id = en.id
JOIN relation r ON e.rel_id = r.id
WHERE e.start_id LIKE ? AND r.label = ?
ORDER BY e.weight DESC
LIMIT 7
""", (like_path, rel))
out_results = cursor.fetchall()
elapsed = time.time() - start
log_progress(f" Outgoing: {len(out_results)} results in {elapsed:.3f}s", "DEBUG")
for label, weight in out_results:
output_md += f"- **{word}** {rel} β†’ *{label}* `[{weight:.3f}]`\n"
has_results = True
total_relations += 1
# Incoming edges
start = time.time()
cursor.execute("""
SELECT s.label, e.weight
FROM edge e
JOIN node s ON e.start_id = s.id
JOIN relation r ON e.rel_id = r.id
WHERE e.end_id LIKE ? AND r.label = ?
ORDER BY e.weight DESC
LIMIT 7
""", (like_path, rel))
in_results = cursor.fetchall()
elapsed = time.time() - start
log_progress(f" Incoming: {len(in_results)} results in {elapsed:.3f}s", "DEBUG")
for label, weight in in_results:
output_md += f"- *{label}* {rel} β†’ **{word}** `[{weight:.3f}]`\n"
has_results = True
total_relations += 1
if not has_results:
output_md += "*No results*\n"
output_md += "\n"
progress(1.0, desc="Complete!")
output_md += "---\n"
output_md += f"**Total relations:** {total_relations}\n"
log_progress(f"Profile complete: {total_relations} relations found", "SUCCESS")
return output_md
except Exception as e:
log_progress(f"Error in semantic profile: {e}", "ERROR")
import traceback
traceback.print_exc()
return f"**❌ Error:**\n\n```\n{e}\n```"
def run_query(start_node, relation, end_node, limit, progress=gr.Progress()):
"""Query builder with CORRECT patterns"""
log_progress(f"Query request: start={start_node}, rel={relation}, end={end_node}, limit={limit}", "DEBUG")
progress(0, desc="Building query...")
query = """
SELECT
e.id AS edge_id,
s.id AS start_id,
r.label AS relation,
en.id AS end_id,
e.weight,
s.label AS start_label,
en.label AS end_label
FROM edge e
JOIN relation r ON e.rel_id = r.id
JOIN node s ON e.start_id = s.id
JOIN node en ON e.end_id = en.id
WHERE 1=1
"""
params = []
try:
with get_db_connection() as conn:
progress(0.3, desc="Adding filters...")
# Language filter - use correct URL pattern!
lang_conditions = []
for lang in TARGET_LANGUAGES:
lang_conditions.append(f"s.id LIKE '{CONCEPTNET_BASE}/c/{lang}/%'")
lang_conditions.append(f"en.id LIKE '{CONCEPTNET_BASE}/c/{lang}/%'")
query += f" AND ({' OR '.join(lang_conditions)})"
# Start node filter
            if start_node and start_node.strip():
                start_node = start_node.strip()
                if start_node.startswith('http://'):
pattern = f"{start_node}%"
else:
# User enters just word, we construct full URL
pattern = f"{CONCEPTNET_BASE}/c/%/{start_node}%"
query += " AND s.id LIKE ?"
params.append(pattern)
log_progress(f"Start filter: {pattern}", "DEBUG")
# Relation filter
            if relation and relation.strip():
                relation = relation.strip()
                rel_value = relation if relation.startswith('/r/') else f"/r/{relation}"
if '%' in relation:
query += " AND r.label LIKE ?"
else:
query += " AND r.label = ?"
params.append(rel_value)
log_progress(f"Relation filter: {rel_value}", "DEBUG")
# End node filter
            if end_node and end_node.strip():
                end_node = end_node.strip()
                if end_node.startswith('http://'):
pattern = f"{end_node}%"
else:
pattern = f"{CONCEPTNET_BASE}/c/%/{end_node}%"
query += " AND en.id LIKE ?"
params.append(pattern)
log_progress(f"End filter: {pattern}", "DEBUG")
query += " ORDER BY e.weight DESC LIMIT ?"
params.append(limit)
progress(0.6, desc="Executing...")
log_progress(f"Executing query with {len(params)} params", "DEBUG")
start_time = time.time()
df = pd.read_sql_query(query, conn, params=params)
elapsed = time.time() - start_time
log_progress(f"Query complete: {len(df)} results in {elapsed:.2f}s", "SUCCESS")
progress(1.0, desc="Complete!")
if df.empty:
return pd.DataFrame(), f"⚠️ No results ({elapsed:.2f}s)"
df.columns = ['edge_id', 'start_id', 'relation', 'end_id', 'weight', 'start_label', 'end_label']
return df, f"βœ… {len(df)} results in {elapsed:.2f}s"
except Exception as e:
log_progress(f"Query error: {e}", "ERROR")
import traceback
traceback.print_exc()
return pd.DataFrame(), f"**❌ Error:** {e}"
def run_raw_query(sql_query):
"""Execute raw SQL with logging"""
log_progress(f"Raw SQL query: {sql_query[:100]}...", "DEBUG")
if not sql_query.strip().upper().startswith("SELECT"):
return pd.DataFrame(), "❌ Only SELECT queries allowed"
try:
with get_db_connection() as conn:
start = time.time()
df = pd.read_sql_query(sql_query, conn)
elapsed = time.time() - start
log_progress(f"Raw query complete: {len(df)} rows in {elapsed:.3f}s", "SUCCESS")
return df, f"βœ… {len(df)} rows in {elapsed:.3f}s"
except Exception as e:
log_progress(f"Raw query error: {e}", "ERROR")
return pd.DataFrame(), f"❌ Error: {e}"
def get_schema_info():
"""Get schema with sample queries"""
log_progress("Loading schema info", "DEBUG")
md = f"# πŸ“š Database Schema\n\n"
md += f"**Repository:** [{INDEXED_REPO_ID}](https://huggingface.co/datasets/{INDEXED_REPO_ID})\n\n"
md += f"**Base URL:** `{CONCEPTNET_BASE}`\n\n"
md += "## Sample Queries\n\n"
md += "**Finding nodes:**\n```sql\n"
md += f"-- English 'dog'\n"
md += f"SELECT * FROM node WHERE id LIKE '{CONCEPTNET_BASE}/c/en/dog%';\n\n"
md += f"-- German 'hund'\n"
md += f"SELECT * FROM node WHERE id LIKE '{CONCEPTNET_BASE}/c/de/hund%';\n"
md += "```\n\n"
md += "**Finding edges:**\n```sql\n"
md += f"-- Edges from 'dog'\n"
md += f"SELECT * FROM edge WHERE start_id LIKE '{CONCEPTNET_BASE}/c/en/dog%' LIMIT 10;\n"
md += "```\n\n"
md += "⚠️ **Important:** Do NOT use leading `%` in LIKE queries (prevents index usage!)\n\n"
md += "βœ… **Good:** `LIKE 'http://conceptnet.io/c/en/dog%'`\n"
md += "❌ **Bad:** `LIKE '%/c/en/dog%'`\n\n"
try:
with get_db_connection() as conn:
cursor = conn.cursor()
md += "## Tables\n\n"
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'")
for table, in cursor.fetchall():
cursor.execute(f"SELECT COUNT(*) FROM {table}")
count = cursor.fetchone()[0]
md += f"### {table} ({count:,} rows)\n\n"
# Show columns
cursor.execute(f"PRAGMA table_info({table})")
cols = cursor.fetchall()
md += "| Column | Type |\n|:--|:--|\n"
for col in cols:
md += f"| `{col[1]}` | `{col[2]}` |\n"
# Show indices
cursor.execute(f"PRAGMA index_list({table})")
indices = cursor.fetchall()
if indices:
md += f"\n**Indices ({len(indices)}):**\n"
for idx in indices:
custom = " πŸ†•" if idx[1].startswith("idx_") else ""
md += f"- `{idx[1]}`{custom}\n"
md += "\n"
log_progress("Schema loaded successfully", "SUCCESS")
except Exception as e:
log_progress(f"Schema error: {e}", "ERROR")
md += f"\n**Error loading schema:** {e}\n"
return md
# UI
with gr.Blocks(title="ConceptNet Explorer", theme=gr.themes.Soft()) as demo:
gr.Markdown("# 🧠 ConceptNet Explorer")
gr.Markdown(
f"**Multi-language semantic network explorer** | "
f"**Languages:** {', '.join([l.upper() for l in TARGET_LANGUAGES])} | "
f"**Repo:** [{INDEXED_REPO_ID}](https://huggingface.co/datasets/{INDEXED_REPO_ID})"
)
gr.Markdown("βœ… **Optimized with custom indices** - Fast queries using correct URL patterns")
with gr.Tabs():
with gr.TabItem("πŸ” Semantic Profile"):
gr.Markdown("**Explore semantic relations for any word**")
with gr.Row():
word_input = gr.Textbox(
label="Word",
placeholder="dog",
value="dog",
info="Enter a word to explore"
)
lang_input = gr.Dropdown(
choices=TARGET_LANGUAGES,
value="en",
label="Language",
info="Select language"
)
semantic_btn = gr.Button("πŸ” Get Semantic Profile", variant="primary", size="lg")
semantic_output = gr.Markdown("*Enter a word and click the button to start...*")
gr.Markdown("**Examples:** dog (en), hund (de), perro (es), chien (fr), 犬 (ja)")
with gr.TabItem("⚑ Query Builder"):
gr.Markdown("**Build custom queries to find specific relationships**")
with gr.Row():
start_input = gr.Textbox(
label="Start Node",
placeholder="dog",
info="Enter word or full URL"
)
rel_input = gr.Textbox(
label="Relation",
placeholder="IsA",
value="IsA",
info="e.g., IsA, PartOf, UsedFor"
)
end_input = gr.Textbox(
label="End Node",
placeholder="",
info="Leave empty for all"
)
limit_slider = gr.Slider(
label="Result Limit",
minimum=1,
maximum=200,
value=50,
step=1
)
query_btn = gr.Button("▢️ Run Query", variant="primary", size="lg")
status_output = gr.Markdown("*Ready to query...*")
results_output = gr.DataFrame(
label="Results",
wrap=True,
interactive=False
)
with gr.TabItem("πŸ’» Raw SQL"):
gr.Markdown("**Execute custom SQL queries** (SELECT only)")
raw_sql_input = gr.Textbox(
label="SQL Query",
value=f"SELECT * FROM node WHERE id LIKE '{CONCEPTNET_BASE}/c/en/dog%' LIMIT 10",
lines=5,
info="Write your SELECT query"
)
raw_btn = gr.Button("▢️ Execute Query", variant="secondary", size="lg")
raw_status = gr.Markdown()
raw_results = gr.DataFrame(label="Query Results", wrap=True)
gr.Markdown(
"**Tips:**\n"
"- Always use `LIMIT` to prevent timeouts\n"
f"- Node IDs start with: `{CONCEPTNET_BASE}/c/{{lang}}/{{word}}`\n"
"- Don't use leading `%` in LIKE queries for best performance"
)
with gr.TabItem("πŸ“Š Schema & Info"):
gr.Markdown("**Database schema and structure information**")
schema_btn = gr.Button("πŸ“Š Load Schema", variant="secondary", size="lg")
schema_output = gr.Markdown("*Click button to load schema...*")
gr.Markdown(
"---\n"
"**Performance:** Custom indices on `edge.start_id`, `edge.end_id`, `edge.rel_id`, `node.label` | "
"**Check server logs for detailed query timing and diagnostics**"
)
# Wire up event handlers
semantic_btn.click(
fn=get_semantic_profile,
inputs=[word_input, lang_input],
outputs=semantic_output
)
query_btn.click(
fn=run_query,
inputs=[start_input, rel_input, end_input, limit_slider],
outputs=[results_output, status_output]
)
raw_btn.click(
fn=run_raw_query,
inputs=raw_sql_input,
outputs=[raw_results, raw_status]
)
schema_btn.click(
fn=get_schema_info,
inputs=None,
outputs=schema_output
)
if __name__ == "__main__":
log_progress("="*60, "SUCCESS")
log_progress("APP READY!", "SUCCESS")
log_progress("="*60, "SUCCESS")
log_progress(f"Database: {DB_PATH}", "INFO")
log_progress(f"Size: {os.path.getsize(DB_PATH) / (2**30):.2f} GB", "INFO")
log_progress("="*60 + "\n", "SUCCESS")
demo.launch(ssr_mode=False)