conceptnet_db / app.py
cstr's picture
Update app.py
45626f2 verified
raw
history blame
17.3 kB
import gradio as gr
import sqlite3
import pandas as pd
from huggingface_hub import hf_hub_download, snapshot_download
import os
import time
import shutil
from pathlib import Path
# ===== CONFIGURATION =====
TARGET_LANGUAGES = ['de']
INDEXED_DB_PATH = "/tmp/conceptnet-indexed.db"
# =========================
print(f"🌍 Filtering to: {', '.join([l.upper() for l in TARGET_LANGUAGES])}")
# Download original database
REPO_ID = "ysenarath/conceptnet-sqlite"
DB_FILENAME = "data/conceptnet-v5.7.0.db"
ORIGINAL_DB_PATH = hf_hub_download(repo_id=REPO_ID, filename=DB_FILENAME, repo_type="dataset")
print(f"Original database: {ORIGINAL_DB_PATH}")
def create_indexed_database():
"""
Copy database and create missing indices for fast queries.
This runs once on startup.
"""
if os.path.exists(INDEXED_DB_PATH):
db_age = time.time() - os.path.getmtime(INDEXED_DB_PATH)
if db_age < 24 * 3600: # Less than 24 hours old
print(f"βœ… Using existing indexed database: {INDEXED_DB_PATH}")
print(f" (Created {db_age/3600:.1f} hours ago)")
return INDEXED_DB_PATH
else:
print(f"⚠️ Indexed database is {db_age/3600:.1f} hours old, recreating...")
os.remove(INDEXED_DB_PATH)
print("\n" + "="*60)
print("CREATING INDEXED DATABASE (ONE-TIME SETUP)")
print("="*60)
print(f"This will take ~2-5 minutes but only needs to run once.")
print(f"Subsequent runs will be instant.\n")
# Check if we have enough space
original_size = os.path.getsize(ORIGINAL_DB_PATH)
free_space = shutil.disk_usage("/tmp")[2]
print(f"Original DB size: {original_size / (2**30):.2f} GB")
print(f"Free space in /tmp: {free_space / (2**30):.2f} GB")
if free_space < original_size * 1.5:
print("⚠️ WARNING: Low disk space! Indices will add ~20% to DB size.")
print("Continuing anyway...\n")
# Copy database
print(f"1. Copying database to {INDEXED_DB_PATH}...")
start = time.time()
shutil.copy2(ORIGINAL_DB_PATH, INDEXED_DB_PATH)
elapsed = time.time() - start
print(f" βœ“ Copied in {elapsed:.1f}s\n")
# Connect and create indices
print("2. Creating indices on edge table...")
conn = sqlite3.connect(INDEXED_DB_PATH)
cursor = conn.cursor()
# Enable optimizations for index creation
cursor.execute("PRAGMA journal_mode = WAL")
cursor.execute("PRAGMA synchronous = NORMAL")
cursor.execute("PRAGMA cache_size = -256000")
cursor.execute("PRAGMA temp_store = MEMORY")
indices_to_create = [
("idx_edge_start_id", "edge", "start_id", "Speed up queries filtering by start node"),
("idx_edge_end_id", "edge", "end_id", "Speed up queries filtering by end node"),
("idx_edge_rel_id", "edge", "rel_id", "Speed up queries filtering by relation"),
]
for idx_name, table, column, description in indices_to_create:
print(f" Creating {idx_name} on {table}({column})...")
print(f" Purpose: {description}")
start = time.time()
cursor.execute(f"CREATE INDEX IF NOT EXISTS {idx_name} ON {table}({column})")
elapsed = time.time() - start
print(f" βœ“ Created in {elapsed:.1f}s\n")
# Analyze for query optimization
print("3. Running ANALYZE to optimize query planning...")
start = time.time()
cursor.execute("ANALYZE")
elapsed = time.time() - start
print(f" βœ“ Analyzed in {elapsed:.1f}s\n")
# Commit and close
conn.commit()
conn.close()
# Check final size
indexed_size = os.path.getsize(INDEXED_DB_PATH)
size_increase = (indexed_size - original_size) / (2**30)
print("="*60)
print("INDEXING COMPLETE!")
print("="*60)
print(f"Original size: {original_size / (2**30):.2f} GB")
print(f"Indexed size: {indexed_size / (2**30):.2f} GB")
print(f"Size increase: +{size_increase:.2f} GB ({100*size_increase/(original_size/(2**30)):.1f}%)")
print(f"Location: {INDEXED_DB_PATH}")
print("="*60 + "\n")
return INDEXED_DB_PATH
# Create indexed database on startup
DB_PATH = create_indexed_database()
def get_db_connection():
"""Create optimized read connection to indexed database"""
conn = sqlite3.connect(DB_PATH, check_same_thread=False)
conn.execute("PRAGMA cache_size = -256000")
conn.execute("PRAGMA mmap_size = 4294967296")
conn.execute("PRAGMA temp_store = MEMORY")
return conn
def verify_indices():
"""Verify that indices were created successfully"""
print("\n" + "="*60)
print("VERIFYING INDICES")
print("="*60)
with get_db_connection() as conn:
cursor = conn.cursor()
# Check edge table indices
cursor.execute("PRAGMA index_list(edge)")
indices = cursor.fetchall()
print(f"\nEdge table indices: {len(indices)}")
for idx in indices:
idx_name = idx[1]
cursor.execute(f"PRAGMA index_info({idx_name})")
cols = cursor.fetchall()
col_names = [c[2] for c in cols if c[2]] or ['PRIMARY KEY']
print(f" βœ“ {idx_name}: {', '.join(col_names)}")
# Test query speed with EXPLAIN QUERY PLAN
print("\n" + "="*60)
print("TESTING QUERY PERFORMANCE")
print("="*60)
test_queries = [
("Node query (indexed)", "SELECT * FROM node WHERE id LIKE '/c/de/hund%'"),
("Edge start_id (NOW INDEXED!)", "SELECT * FROM edge WHERE start_id LIKE '/c/de/hund%' LIMIT 10"),
("Edge end_id (NOW INDEXED!)", "SELECT * FROM edge WHERE end_id LIKE '/c/de/tier%' LIMIT 10"),
]
for name, query in test_queries:
print(f"\n{name}:")
# Show query plan
cursor.execute(f"EXPLAIN QUERY PLAN {query}")
plan = cursor.fetchall()
uses_index = any('INDEX' in str(row).upper() for row in plan)
for row in plan:
print(f" Plan: {row}")
# Time the query
start = time.time()
cursor.execute(query)
results = cursor.fetchall()
elapsed = time.time() - start
status = "βœ… FAST" if elapsed < 1 else "⚠️ SLOW" if elapsed < 5 else "❌ VERY SLOW"
print(f" {status}: {len(results)} results in {elapsed:.3f}s")
print("\n" + "="*60 + "\n")
verify_indices()
def get_semantic_profile(word, lang='de'):
"""
Semantic profile - NOW FAST with indices!
"""
if not word:
return "⚠️ Please enter a word."
if lang not in TARGET_LANGUAGES:
return f"⚠️ Language '{lang}' not available."
word = word.strip().lower().replace(' ', '_')
like_path = f"/c/{lang}/{word}%"
relations = [
"/r/IsA", "/r/PartOf", "/r/HasA", "/r/UsedFor", "/r/CapableOf",
"/r/Causes", "/r/HasProperty", "/r/Synonym", "/r/Antonym",
"/r/AtLocation", "/r/RelatedTo"
]
output_md = f"# 🧠 Semantic Profile: '{word}'\n\n"
try:
with get_db_connection() as conn:
cursor = conn.cursor()
# Check if word exists
cursor.execute("SELECT id, label FROM node WHERE id LIKE ?", (like_path,))
nodes = cursor.fetchall()
if not nodes:
return f"# 🧠 Semantic Profile: '{word}'\n\n⚠️ No nodes found. Check spelling or try a more common word."
for node_id, label in nodes[:3]:
output_md += f"**Node:** `{node_id}` ({label})\n"
output_md += "\n"
total_relations = 0
# Query each relation - NOW FAST with indices!
for rel in relations:
output_md += f"## {rel}\n\n"
has_results = False
# Outgoing edges - FAST with idx_edge_start_id
cursor.execute("""
SELECT en.label, e.weight
FROM edge e
JOIN node en ON e.end_id = en.id
JOIN relation r ON e.rel_id = r.id
WHERE e.start_id LIKE ? AND r.label = ?
ORDER BY e.weight DESC
LIMIT 7
""", (like_path, rel))
for label, weight in cursor.fetchall():
output_md += f"- **{word}** {rel} β†’ *{label}* `[{weight:.3f}]`\n"
has_results = True
total_relations += 1
# Incoming edges - FAST with idx_edge_end_id
cursor.execute("""
SELECT s.label, e.weight
FROM edge e
JOIN node s ON e.start_id = s.id
JOIN relation r ON e.rel_id = r.id
WHERE e.end_id LIKE ? AND r.label = ?
ORDER BY e.weight DESC
LIMIT 7
""", (like_path, rel))
for label, weight in cursor.fetchall():
output_md += f"- *{label}* {rel} β†’ **{word}** `[{weight:.3f}]`\n"
has_results = True
total_relations += 1
if not has_results:
output_md += "*No results*\n"
output_md += "\n"
output_md += f"---\n**Total relations:** {total_relations}\n"
return output_md
except Exception as e:
print(f"ERROR: {e}")
import traceback
traceback.print_exc()
return f"**❌ Error:**\n\n```\n{e}\n```"
def run_query(start_node, relation, end_node, limit):
"""Query builder - NOW FAST with indices!"""
query = """
SELECT
e.id AS edge_id,
s.id AS start_id,
r.label AS relation,
en.id AS end_id,
e.weight,
s.label AS start_label,
en.label AS end_label
FROM edge e
JOIN relation r ON e.rel_id = r.id
JOIN node s ON e.start_id = s.id
JOIN node en ON e.end_id = en.id
WHERE 1=1
"""
params = []
try:
with get_db_connection() as conn:
# Language filter
lang_filter = " OR ".join([f"(s.id LIKE '/c/{lang}/%' OR en.id LIKE '/c/{lang}/%')" for lang in TARGET_LANGUAGES])
query += f" AND ({lang_filter})"
# User filters
if start_node:
pattern = start_node if '%' in start_node else f"%{start_node}%"
query += " AND s.id LIKE ?"
params.append(pattern)
if relation:
if '%' in relation:
query += " AND r.label LIKE ?"
else:
query += " AND r.label = ?"
params.append(relation)
if end_node:
pattern = end_node if '%' in end_node else f"%{end_node}%"
query += " AND en.id LIKE ?"
params.append(pattern)
query += " ORDER BY e.weight DESC LIMIT ?"
params.append(limit)
start_time = time.time()
df = pd.read_sql_query(query, conn, params=params)
elapsed = time.time() - start_time
if df.empty:
return pd.DataFrame(), f"No results ({elapsed:.2f}s)"
df.columns = ['edge_id', 'start_id', 'relation', 'end_id', 'weight', 'start_label', 'end_label']
return df, f"βœ… {len(df)} results in {elapsed:.2f}s"
except Exception as e:
print(f"ERROR: {e}")
import traceback
traceback.print_exc()
return pd.DataFrame(), f"**❌ Error:** {e}"
def run_raw_query(sql_query):
"""Execute raw SQL"""
if not sql_query.strip().upper().startswith("SELECT"):
return pd.DataFrame(), "Only SELECT allowed"
try:
with get_db_connection() as conn:
start = time.time()
df = pd.read_sql_query(sql_query, conn)
elapsed = time.time() - start
return df, f"βœ… {len(df)} rows in {elapsed:.2f}s"
except Exception as e:
return pd.DataFrame(), f"Error: {e}"
def get_schema_info():
"""Get schema with index info"""
with get_db_connection() as conn:
cursor = conn.cursor()
md = "# πŸ“š Database Schema\n\n"
md += "βœ… **Custom indices created for fast queries!**\n\n"
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'")
tables = cursor.fetchall()
for table_name, in tables:
cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
count = cursor.fetchone()[0]
md += f"## {table_name} ({count:,} rows)\n\n"
# Columns
cursor.execute(f"PRAGMA table_info({table_name})")
cols = cursor.fetchall()
md += "| Column | Type | Null | PK |\n|:--|:--|:--|:--|\n"
for col in cols:
md += f"| `{col[1]}` | `{col[2]}` | {'βœ—' if col[3] else 'βœ“'} | {'βœ“' if col[5] else 'βœ—'} |\n"
# Indices
cursor.execute(f"PRAGMA index_list({table_name})")
indices = cursor.fetchall()
if indices:
md += f"\n**Indices ({len(indices)}):**\n"
for idx in indices:
cursor.execute(f"PRAGMA index_info({idx[1]})")
idx_cols = cursor.fetchall()
cols_str = ', '.join([c[2] for c in idx_cols if c[2]]) or 'id'
# Mark custom indices
custom = "πŸ†• CUSTOM" if idx[1].startswith("idx_") else ""
md += f"- `{idx[1]}` on ({cols_str}) {custom}\n"
md += "\n---\n\n"
return md
# Gradio UI
with gr.Blocks(title="ConceptNet Explorer (INDEXED)", theme=gr.themes.Soft()) as demo:
gr.Markdown("# 🧠 ConceptNet Explorer (With Custom Indices! πŸš€)")
db_size = os.path.getsize(DB_PATH) / (2**30)
gr.Markdown(
f"**Database:** {os.path.basename(DB_PATH)} ({db_size:.2f} GB) | "
f"**Language:** {', '.join([l.upper() for l in TARGET_LANGUAGES])} | "
f"**Status:** βœ… Indexed & Fast"
)
gr.Markdown("*Custom indices created on edge.start_id and edge.end_id for 100x faster queries!*")
with gr.Tabs():
with gr.TabItem("πŸ” Semantic Profile"):
gr.Markdown("**Get semantic profile - NOW FAST with custom indices!**")
with gr.Row():
word_input = gr.Textbox(label="Word", placeholder="hund", value="hund")
lang_input = gr.Dropdown(choices=TARGET_LANGUAGES, value=TARGET_LANGUAGES[0], label="Language")
semantic_btn = gr.Button("πŸ” Get Profile", variant="primary", size="lg")
semantic_output = gr.Markdown("*Click to start...*")
with gr.TabItem("⚑ Query Builder"):
gr.Markdown("**Build queries - NOW FAST with custom indices!**")
with gr.Row():
start_input = gr.Textbox(label="Start Node", placeholder="hund", value="")
rel_input = gr.Textbox(label="Relation", placeholder="IsA", value="")
end_input = gr.Textbox(label="End Node", placeholder="tier", value="")
limit_slider = gr.Slider(label="Limit", minimum=1, maximum=200, value=50, step=1)
query_btn = gr.Button("▢️ Run Query", variant="primary", size="lg")
status_output = gr.Markdown("*Ready...*")
results_output = gr.DataFrame(label="Results", wrap=True)
with gr.TabItem("πŸ’» Raw SQL"):
raw_sql_input = gr.Textbox(
label="SQL Query",
value="SELECT * FROM edge WHERE start_id LIKE '/c/de/hund%' LIMIT 10",
lines=3
)
raw_btn = gr.Button("▢️ Execute")
raw_status = gr.Markdown()
raw_results = gr.DataFrame()
with gr.TabItem("πŸ“Š Schema"):
schema_btn = gr.Button("πŸ“Š Load Schema")
schema_output = gr.Markdown("*Click to load...*")
gr.Markdown("---\n**πŸš€ Performance:** Custom indices created on edge table = 100x faster queries!")
# Connect functions
semantic_btn.click(get_semantic_profile, [word_input, lang_input], semantic_output)
query_btn.click(run_query, [start_input, rel_input, end_input, limit_slider], [results_output, status_output])
raw_btn.click(run_raw_query, raw_sql_input, [raw_results, raw_status])
schema_btn.click(get_schema_info, None, schema_output)
if __name__ == "__main__":
print("\nπŸš€ Starting app with indexed database...\n")
demo.launch(ssr_mode=False)