# conceptnet_db / app.py
# Source: Hugging Face Space (user: cstr) — commit 7ff8eef (verified), 26.9 kB
import gradio as gr
import sqlite3
import pandas as pd
from huggingface_hub import hf_hub_download, snapshot_download
import os
import traceback
from pathlib import Path
# --- 1. Download and Cache the Database with Indices ---
# Runs at import time: the app downloads multi-GB artifacts before the UI starts.
print("Downloading ConceptNet database and indices...")

# Dataset repo holding the prebuilt ConceptNet 5.7 SQLite artifacts.
REPO_ID = "ysenarath/conceptnet-sqlite"
DB_FILENAME = "data/conceptnet-v5.7.0.db"
INDEX_FOLDER = "data/conceptnet-v5.7.0-index"
VOCAB_DB = "data/conceptnet-v5.7.0-vocab.db"

# Download the main database (required: no fallback path if this raises).
DB_PATH = hf_hub_download(
    repo_id=REPO_ID,
    filename=DB_FILENAME,
    repo_type="dataset"
)
print(f"Main database downloaded to: {DB_PATH}")

# Download the vocabulary database (optional but helpful)
try:
    VOCAB_PATH = hf_hub_download(
        repo_id=REPO_ID,
        filename=VOCAB_DB,
        repo_type="dataset"
    )
    print(f"Vocabulary database downloaded to: {VOCAB_PATH}")
except Exception as e:
    # Best-effort: the app still works without the vocab DB.
    print(f"Could not download vocabulary DB: {e}")
    VOCAB_PATH = None

# Download the entire index folder for better performance
try:
    # Use snapshot_download to get the entire data directory with indices
    CACHE_DIR = snapshot_download(
        repo_id=REPO_ID,
        repo_type="dataset",
        allow_patterns=["data/conceptnet-v5.7.0-index/*"]
    )
    INDEX_PATH = os.path.join(CACHE_DIR, INDEX_FOLDER)
    print(f"Index files downloaded to: {INDEX_PATH}")
    # Count index files
    # NOTE(review): .ldb is a LevelDB file extension — these appear to be
    # auxiliary index files rather than SQLite indices; confirm how the
    # queries actually consume them.
    if os.path.exists(INDEX_PATH):
        index_files = list(Path(INDEX_PATH).glob("*.ldb"))
        print(f"Found {len(index_files)} index files (.ldb)")
except Exception as e:
    # Best-effort: queries still run against the main DB, just possibly slower.
    print(f"Could not download index files: {e}")
    INDEX_PATH = None
# --- 2. Database Helper Functions ---
def get_db_connection():
    """
    Open a new read-only SQLite connection to the ConceptNet database.

    Applies a set of read-performance PRAGMAs (large cache, mmap, no
    journal/sync — all safe because the connection is read-only). Any
    PRAGMA that the build does not support is skipped with a warning.

    Returns:
        sqlite3.Connection on success, or None if the connection fails.
    """
    read_pragmas = (
        "PRAGMA query_only = ON",           # Read-only mode
        "PRAGMA temp_store = MEMORY",       # Use memory for temp tables
        "PRAGMA cache_size = -128000",      # 128MB cache (negative = KB)
        "PRAGMA page_size = 8192",          # No-op on an existing DB, but harmless
        "PRAGMA mmap_size = 2147483648",    # 2GB memory-mapped I/O
        "PRAGMA synchronous = OFF",         # Safe for read-only
        "PRAGMA journal_mode = OFF",        # No journal needed for read-only
        "PRAGMA locking_mode = NORMAL",     # Allow multiple readers
        "PRAGMA threads = 4",               # Use multiple threads
    )
    try:
        conn = sqlite3.connect(f"file:{DB_PATH}?mode=ro", uri=True, check_same_thread=False)
        for pragma in read_pragmas:
            try:
                conn.execute(pragma)
            except sqlite3.OperationalError as e:
                print(f"Warning: Could not apply {pragma}: {e}")
        return conn
    except Exception as e:
        print(f"Error connecting to DB: {e}")
        traceback.print_exc()
        return None
def verify_indices():
    """
    Print a diagnostic report of database indices and settings to stdout.

    Lists every user table's indices (with their columns), FTS compile
    options, page/cache configuration and total database size. Purely
    informational; always returns None and never raises.

    Fix: the previous ``with get_db_connection() as conn`` never closed the
    connection — sqlite3's context manager only manages transactions — so
    every call leaked a connection. The connection is now closed explicitly.
    """
    print("\n=== Database Index Analysis ===")
    conn = None
    try:
        conn = get_db_connection()
        cursor = conn.cursor()
        # Enumerate user tables (skip SQLite's internal tables)
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'")
        tables = cursor.fetchall()
        total_indices = 0
        for table in tables:
            table_name = table[0]
            # Get indices for this table
            cursor.execute(f"PRAGMA index_list({table_name})")
            indices = cursor.fetchall()
            if indices:
                print(f"\n📊 Table: {table_name}")
                for idx in indices:
                    idx_name, unique, origin = idx[1], idx[2], idx[3]
                    # Get the columns covered by this index
                    cursor.execute(f"PRAGMA index_info({idx_name})")
                    idx_cols = cursor.fetchall()
                    cols = [col[2] for col in idx_cols]
                    unique_str = "UNIQUE" if unique else "NON-UNIQUE"
                    print(f"  ├─ {idx_name} ({unique_str}) on columns: {', '.join(cols)}")
                    total_indices += 1
        print(f"\n✅ Total indices found: {total_indices}")
        # Check if FTS (Full Text Search) was compiled into this SQLite build
        cursor.execute("SELECT * FROM pragma_compile_options WHERE compile_options LIKE '%FTS%'")
        fts = cursor.fetchall()
        if fts:
            print(f"✅ Full-Text Search enabled: {[f[0] for f in fts]}")
        # Page size / cache configuration actually in effect
        cursor.execute("PRAGMA page_size")
        page_size = cursor.fetchone()[0]
        cursor.execute("PRAGMA cache_size")
        cache_size = cursor.fetchone()[0]
        print(f"\n📈 Page size: {page_size} bytes")
        # Negative cache_size means KB; positive means pages.
        print(f"📈 Cache size: {abs(cache_size)} KB" if cache_size < 0 else f"📈 Cache size: {cache_size} pages")
        # Total database size = page_count * page_size
        cursor.execute("SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()")
        db_size = cursor.fetchone()[0]
        print(f"📦 Database size: {db_size / 1024 / 1024 / 1024:.2f} GB")
    except Exception as e:
        print(f"Error in verify_indices: {e}")
        traceback.print_exc()
    finally:
        if conn is not None:
            conn.close()  # sqlite3's `with conn:` does NOT close — close explicitly
def get_schema_info():
    """
    Build a markdown report of the database schema.

    For every user table: row count, column details (type / NOT NULL / PK)
    and index details (columns, uniqueness, origin).

    Returns:
        Markdown string, or an error message string on failure.

    Fixes: the connection is now closed explicitly (sqlite3's context
    manager only manages transactions, so the old code leaked a connection
    per call), and the bare ``except:`` around the row count now catches
    only sqlite3 errors.
    """
    print("Getting schema info...")
    schema_md = "# 📚 Database Schema\n\n"
    try:
        conn = get_db_connection()
        try:
            cursor = conn.cursor()
            # Total DB size = page_count * page_size
            cursor.execute("SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()")
            db_size = cursor.fetchone()[0]
            schema_md += f"**Database Size:** {db_size / 1024 / 1024 / 1024:.2f} GB\n\n"
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%';")
            tables = cursor.fetchall()
            if not tables:
                return "Could not find any tables in the database."
            for table in tables:
                table_name = table[0]
                # Row count can be slow or fail on unusual tables; degrade gracefully.
                try:
                    cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
                    row_count = cursor.fetchone()[0]
                    schema_md += f"## Table: `{table_name}` ({row_count:,} rows)\n\n"
                except sqlite3.Error:
                    schema_md += f"## Table: `{table_name}`\n\n"
                schema_md += "### Columns\n\n"
                schema_md += "| Column Name | Data Type | Not Null | Primary Key |\n"
                schema_md += "|:------------|:----------|:---------|:------------|\n"
                cursor.execute(f"PRAGMA table_info({table_name});")
                columns = cursor.fetchall()
                for col in columns:
                    name, dtype, notnull, pk = col[1], col[2], col[3], col[5]
                    schema_md += f"| `{name}` | `{dtype}` | {'✓' if notnull else '✗'} | {'✓' if pk else '✗'} |\n"
                # Show indices with details
                cursor.execute(f"PRAGMA index_list({table_name});")
                indices = cursor.fetchall()
                if indices:
                    schema_md += f"\n### Indices ({len(indices)})\n\n"
                    for idx in indices:
                        idx_name, unique, origin = idx[1], idx[2], idx[3]
                        # Get indexed columns (col[2] is None for expression indices)
                        cursor.execute(f"PRAGMA index_info({idx_name});")
                        idx_cols = cursor.fetchall()
                        cols = [col[2] for col in idx_cols if col[2]]
                        unique_badge = "🔒 UNIQUE" if unique else "📑 INDEX"
                        schema_md += f"- **{idx_name}** {unique_badge}\n"
                        schema_md += f"  - Columns: `{', '.join(cols) if cols else 'N/A'}`\n"
                        schema_md += f"  - Origin: {origin}\n"
                schema_md += "\n---\n\n"
            return schema_md
        finally:
            conn.close()  # sqlite3's context manager does not close connections
    except Exception as e:
        print(f"Error in get_schema_info: {e}")
        traceback.print_exc()
        return f"An error occurred while fetching schema: {e}"
def run_query(start_node, relation, end_node, limit):
    """
    Search ConceptNet edges using indexed JOINs on node / relation / edge.

    Args:
        start_node: Start-node filter. A `%` in the value is treated as a
            user-supplied LIKE wildcard; otherwise a substring match
            (`%value%`) is used.
        relation: Relation label. Exact match unless a `%` wildcard is given.
        end_node: End-node filter; same wildcard rules as start_node.
        limit: Maximum number of rows to return (bound as a parameter).

    Returns:
        (pandas.DataFrame, markdown status string) tuple. On failure the
        DataFrame is empty and the status contains the error.

    Fixes: the connection is now closed explicitly (sqlite3's context
    manager only manages transactions), and the bare ``except:`` around
    the debug EXPLAIN now catches only sqlite3 errors.
    """
    print(f"Running query: start='{start_node}', rel='{relation}', end='{end_node}'")
    # Base query: join edge to its relation and both endpoint nodes.
    query = """
    SELECT
        e.id AS edge_id,
        s.id AS start_id_path,
        r.label AS relation_label,
        en.id AS end_id_path,
        e.weight,
        e.dataset,
        e.surface_text,
        s.label AS start_label_text,
        en.label AS end_label_text
    FROM edge AS e
    INNER JOIN relation AS r ON e.rel_id = r.id
    INNER JOIN node AS s ON e.start_id = s.id
    INNER JOIN node AS en ON e.end_id = en.id
    """
    where_conditions = []
    params = []
    try:
        # Build WHERE conditions; all user values are bound parameters.
        if start_node:
            if "%" in start_node:
                where_conditions.append("s.id LIKE ?")
                params.append(start_node)
            else:
                # No user wildcard: substring match anywhere in the node path
                # (note: a leading % prevents index use, but matches plain
                # words like "dog" against "/c/en/dog").
                where_conditions.append("s.id LIKE ?")
                params.append(f"%{start_node}%")
        if relation:
            if "%" in relation:
                where_conditions.append("r.label LIKE ?")
                params.append(relation)
            else:
                # Exact match is faster (index-friendly)
                where_conditions.append("r.label = ?")
                params.append(relation)
        if end_node:
            if "%" in end_node:
                where_conditions.append("en.id LIKE ?")
                params.append(end_node)
            else:
                where_conditions.append("en.id LIKE ?")
                params.append(f"%{end_node}%")
        if where_conditions:
            query += " WHERE " + " AND ".join(where_conditions)
        # Order by weight to get most relevant results first
        query += " ORDER BY e.weight DESC LIMIT ?"
        params.append(limit)
        print(f"Executing SQL with {len(params)} parameters")
        conn = get_db_connection()
        try:
            # Debug aid only: show the query plan, never let it mask the real query.
            try:
                explain_result = conn.execute("EXPLAIN QUERY PLAN " + query, params).fetchall()
                print("Query Plan:")
                for row in explain_result:
                    print(f"  {row}")
            except sqlite3.Error:
                pass
            # Execute actual query
            df = pd.read_sql_query(query, conn, params=params)
        finally:
            conn.close()  # sqlite3's context manager does not close connections
        if df.empty:
            return pd.DataFrame(), "Query ran successfully but returned no results. Try broader search terms or check spelling."
        return df, f"✅ Query successful! Found {len(df)} results (ordered by relevance)."
    except Exception as e:
        print(f"Error in run_query: {e}")
        traceback.print_exc()
        err_msg = f"**❌ Query Failed!**\n\n```\n{e}\n```"
        return pd.DataFrame(), err_msg
def run_raw_query(sql_query):
    """
    Execute a read-only SQL query and return (DataFrame, status markdown).

    Queries must start with SELECT or WITH (CTEs are read-only too and were
    wrongly rejected before). Defense-in-depth: the connection itself is
    opened with mode=ro and PRAGMA query_only, so writes cannot slip
    through even if the prefix check were bypassed.

    Fixes: accepts WITH-prefixed queries, closes the connection explicitly
    (sqlite3's context manager only manages transactions), and narrows the
    bare ``except:`` around the debug EXPLAIN to sqlite3 errors.
    """
    print(f"Running raw query: {sql_query[:100]}...")
    head = sql_query.strip().upper()
    if not (head.startswith("SELECT") or head.startswith("WITH")):
        return pd.DataFrame(), "**Error:** Only `SELECT` statements are allowed for safety."
    try:
        conn = get_db_connection()
        try:
            # Debug aid only: show the query plan, never let it break the query.
            try:
                explain_result = conn.execute("EXPLAIN QUERY PLAN " + sql_query).fetchall()
                print("Query Plan:")
                for row in explain_result:
                    print(f"  {row}")
            except sqlite3.Error:
                pass
            df = pd.read_sql_query(sql_query, conn)
        finally:
            conn.close()  # sqlite3's context manager does not close connections
        if df.empty:
            return df, "Query ran successfully but returned no results."
        return df, f"✅ Query successful! Returned {len(df)} rows."
    except Exception as e:
        print(f"Error in run_raw_query: {e}")
        traceback.print_exc()
        return pd.DataFrame(), f"**❌ Query Failed!**\n\n```\n{e}\n```"
def get_semantic_profile(word, lang='en'):
    """
    Build a markdown semantic profile of `word` across common relations.

    Runs one compound UNION ALL query collecting, per relation, up to 7
    outgoing and 7 incoming edges (strongest first), then formats them
    grouped by relation.

    Args:
        word: Word or phrase (spaces are converted to underscores).
        lang: ISO language code used to build the `/c/<lang>/<word>%` path.

    Returns:
        Markdown string: the profile, a "no results" help text, or an
        error message.

    Fixes vs. the broken version:
      * Removed ``INDEXED BY (SELECT ...)`` — SQLite's INDEXED BY clause
        only accepts a literal index name, so that query was a syntax
        error and the function always failed.
      * Wrapped every compound member in a subselect: SQLite forbids
        ORDER BY / LIMIT on individual members of a compound SELECT.
      * The connection is now closed explicitly.
    """
    if not word:
        return "⚠️ Please enter a word."
    word = word.strip().lower().replace(' ', '_')
    like_path = f"/c/{lang}/{word}%"
    print(f"Getting semantic profile for: {like_path}")
    # Most important relations for semantic understanding
    relations_to_check = [
        "/r/IsA", "/r/PartOf", "/r/HasA", "/r/UsedFor", "/r/CapableOf",
        "/r/Causes", "/r/HasProperty", "/r/Synonym", "/r/Antonym",
        "/r/AtLocation", "/r/RelatedTo", "/r/DerivedFrom"
    ]
    output_md = f"# 🧠 Semantic Profile: '{word}'\n"
    output_md += f"**Language:** {lang.upper()} | **Search Pattern:** `{like_path}`\n\n"
    try:
        conn = get_db_connection()
        try:
            # Single compound query for all relations. Each member is a
            # subselect so its per-relation ORDER BY / LIMIT is legal.
            union_parts = []
            union_params = []
            for rel in relations_to_check:
                # Outgoing edges (word as subject)
                union_parts.append("""
                SELECT * FROM (
                    SELECT
                        ? AS rel_label,
                        'out' AS dir,
                        en.id AS target_id,
                        en.label AS target_label,
                        e.weight AS weight
                    FROM edge e
                    INNER JOIN node s ON e.start_id = s.id
                    INNER JOIN node en ON e.end_id = en.id
                    INNER JOIN relation r ON e.rel_id = r.id
                    WHERE s.id LIKE ? AND r.label = ?
                    ORDER BY e.weight DESC
                    LIMIT 7
                )
                """)
                union_params.extend([rel, like_path, rel])
                # Incoming edges (word as object)
                union_parts.append("""
                SELECT * FROM (
                    SELECT
                        ? AS rel_label,
                        'in' AS dir,
                        s.id AS target_id,
                        s.label AS target_label,
                        e.weight AS weight
                    FROM edge e
                    INNER JOIN node s ON e.start_id = s.id
                    INNER JOIN node en ON e.end_id = en.id
                    INNER JOIN relation r ON e.rel_id = r.id
                    WHERE en.id LIKE ? AND r.label = ?
                    ORDER BY e.weight DESC
                    LIMIT 7
                )
                """)
                union_params.extend([rel, like_path, rel])
            full_query = " UNION ALL ".join(union_parts)
            print(f"Executing optimized semantic profile query...")
            results = conn.execute(full_query, union_params).fetchall()
        finally:
            conn.close()  # sqlite3's context manager does not close connections
        if not results:
            return f"""# 🧠 Semantic Profile: '{word}'
⚠️ **No results found**
This could mean:
1. The word isn't in ConceptNet for language `{lang}`
2. Try checking spelling: `{word}`
3. Try language code 'en' for English
4. Try a more common/simpler word
**Tip:** Use the Query Builder to search manually."""
        # Group consecutive rows by relation label (rows arrive in the order
        # the UNION ALL members were written, so same-relation rows are
        # adjacent).
        current_rel = None
        rel_results = []
        total_relations = 0
        for rel_label, direction, target_id, target_label, weight in results:
            if rel_label != current_rel:
                if current_rel is not None:
                    # Flush the previous relation group
                    output_md += f"## {current_rel}\n\n"
                    if rel_results:
                        for line in rel_results:
                            output_md += line
                        total_relations += len(rel_results)
                    else:
                        output_md += "*No results*\n"
                    output_md += "\n"
                current_rel = rel_label
                rel_results = []
            # Format one edge, arrow pointing from subject to object
            weight_str = f"{weight:.3f}"
            if direction == 'out':
                rel_results.append(
                    f"- **{word}** {rel_label} → *{target_label}* "
                    f"`[{weight_str}]`\n"
                )
            else:
                rel_results.append(
                    f"- *{target_label}* {rel_label} → **{word}** "
                    f"`[{weight_str}]`\n"
                )
        # Flush the last relation group
        if current_rel is not None:
            output_md += f"## {current_rel}\n\n"
            if rel_results:
                for line in rel_results:
                    output_md += line
                total_relations += len(rel_results)
            else:
                output_md += "*No results*\n"
            output_md += "\n"
        output_md += "---\n"
        output_md += f"**Total relations found:** {total_relations}\n"
        output_md += f"*Weight indicates strength of association (higher = stronger)*\n"
        return output_md
    except Exception as e:
        print(f"Error in get_semantic_profile: {e}")
        traceback.print_exc()
        return f"**❌ An error occurred:**\n\n```\n{e}\n```"
# --- 3. Build the Gradio UI ---
# Verify indices on startup (diagnostic printout only; failures are non-fatal)
verify_indices()
# Gradio UI: four tabs (semantic profile, query builder, raw SQL, schema)
# plus event wiring. Fix: the raw-SQL placeholder example reused alias `e`
# for both the edge table and the end node, which is invalid SQL; the end
# node now uses alias `en`.
with gr.Blocks(title="ConceptNet SQLite Explorer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧠 ConceptNet SQLite Explorer")
    # Header: database file name/size and whether the index cache was found
    gr.Markdown(
        f"**Database:** `{os.path.basename(DB_PATH)}` ({os.path.getsize(DB_PATH) / 1024 / 1024 / 1024:.2f} GB) | "
        f"**Status:** {'✅ Indices Loaded' if INDEX_PATH and os.path.exists(INDEX_PATH) else '⚠️ No Index Cache'}"
    )
    gr.Markdown("*Explore semantic relationships in ConceptNet with optimized indexed queries*")
    with gr.Tabs():
        # --- Tab 1: one-click semantic profile for a single word ---
        with gr.TabItem("🔍 Semantic Profile"):
            gr.Markdown(
                "**Get a comprehensive semantic profile for any word.**\n\n"
                "Queries common relations: IsA, HasA, UsedFor, CapableOf, Causes, HasProperty, and more.\n"
            )
            with gr.Row():
                with gr.Column(scale=2):
                    word_input = gr.Textbox(
                        label="Word",
                        placeholder="dog",
                        info="Single word or phrase (use underscores for phrases)"
                    )
                with gr.Column(scale=1):
                    lang_input = gr.Textbox(
                        label="Language",
                        value="en",
                        placeholder="en",
                        info="ISO code (en, de, es, fr, ja, zh, etc.)"
                    )
            with gr.Row():
                semantic_btn = gr.Button("🔍 Get Semantic Profile", variant="primary", size="lg")
            with gr.Accordion("📚 Example Words", open=False):
                gr.Markdown(
                    "**English (en):** dog, cat, computer, love, happiness, run\n\n"
                    "**German (de):** hund, katze, liebe, glück\n\n"
                    "**Spanish (es):** perro, gato, amor, felicidad\n\n"
                    "**French (fr):** chien, chat, amour, bonheur\n\n"
                    "**Japanese (ja):** 犬, 猫, 愛, 幸せ"
                )
            semantic_output = gr.Markdown("*Click 'Get Semantic Profile' to start...*")
        # --- Tab 2: structured start/relation/end query builder ---
        with gr.TabItem("⚡ Query Builder"):
            gr.Markdown(
                "**Build custom queries using ConceptNet's graph structure.**\n\n"
                "Find edges connecting concepts through specific relations. Leverages database indices for fast lookups."
            )
            with gr.Row():
                start_input = gr.Textbox(
                    label="Start Node",
                    placeholder="dog (or /c/en/dog)",
                    info="Leave empty for any"
                )
                rel_input = gr.Textbox(
                    label="Relation",
                    placeholder="IsA (or /r/IsA)",
                    info="Leave empty for any"
                )
                end_input = gr.Textbox(
                    label="End Node",
                    placeholder="animal (or /c/en/animal)",
                    info="Leave empty for any"
                )
                limit_slider = gr.Slider(
                    label="Results Limit",
                    minimum=1,
                    maximum=500,
                    value=50,
                    step=1,
                    info="Higher limits may be slower"
                )
            query_btn = gr.Button("▶️ Run Query", variant="primary", size="lg")
            with gr.Accordion("💡 Query Tips & Examples", open=False):
                gr.Markdown(
                    "**Tips:**\n"
                    "- Omit `/c/en/` prefix - it's added automatically\n"
                    "- Use `%` as wildcard: `%dog%` matches 'hotdog', 'doghouse'\n"
                    "- More specific = faster queries\n\n"
                    "**Examples:**\n"
                    "1. What can dogs do? → Start: `dog`, Relation: `CapableOf`, End: *empty*\n"
                    "2. What is a dog? → Start: `dog`, Relation: `IsA`, End: *empty*\n"
                    "3. Things at home → Start: *empty*, Relation: `AtLocation`, End: `home`\n"
                    "4. Synonyms of happy → Start: `happy`, Relation: `Synonym`, End: *empty*"
                )
            status_output = gr.Markdown("*Ready to query...*")
            results_output = gr.DataFrame(label="Query Results", interactive=False, wrap=True)
        # --- Tab 3: raw read-only SQL console ---
        with gr.TabItem("💻 Raw SQL"):
            gr.Markdown(
                "**Advanced:** Execute custom SQL queries.\n\n"
                "⚠️ Only `SELECT` statements allowed. Check Schema tab for table structure."
            )
            raw_sql_input = gr.Textbox(
                label="SQL Query",
                placeholder="SELECT s.label, r.label, en.label FROM edge e JOIN node s ON e.start_id = s.id JOIN relation r ON e.rel_id = r.id JOIN node en ON e.end_id = en.id WHERE s.label = 'dog' LIMIT 10",
                lines=6,
                info="Write SELECT query"
            )
            with gr.Accordion("📋 Useful Queries", open=False):
                gr.Markdown(
                    "**Count edges by relation:**\n"
                    "```sql\n"
                    "SELECT r.label, COUNT(*) as count \n"
                    "FROM edge e \n"
                    "JOIN relation r ON e.rel_id = r.id \n"
                    "GROUP BY r.label \n"
                    "ORDER BY count DESC\n"
                    "```\n\n"
                    "**Find strongest connections:**\n"
                    "```sql\n"
                    "SELECT s.label, r.label, e.label, edge.weight\n"
                    "FROM edge \n"
                    "JOIN node s ON edge.start_id = s.id\n"
                    "JOIN relation r ON edge.rel_id = r.id\n"
                    "JOIN node e ON edge.end_id = e.id\n"
                    "ORDER BY weight DESC LIMIT 20\n"
                    "```\n\n"
                    "**Check index usage:**\n"
                    "```sql\n"
                    "EXPLAIN QUERY PLAN\n"
                    "SELECT * FROM edge WHERE start_id = '/c/en/dog'\n"
                    "```"
                )
            raw_query_btn = gr.Button("▶️ Execute SQL", variant="secondary", size="lg")
            raw_status_output = gr.Markdown("*Ready...*")
            raw_results_output = gr.DataFrame(label="Results", interactive=False, wrap=True)
        # --- Tab 4: on-demand schema / index report ---
        with gr.TabItem("📊 Schema & Indices"):
            gr.Markdown(
                "**Database structure, indices, and optimization info.**\n\n"
                "View tables, columns, and index configuration."
            )
            schema_btn = gr.Button("📊 Load Schema", variant="secondary", size="lg")
            schema_output = gr.Markdown("*Click button to load schema...*")
    gr.Markdown("---")
    gr.Markdown(
        "💡 **Performance:** Queries use database indices for fast lookups. "
        "Exact matches are faster than wildcards. "
        f"{'✅ Index files loaded from HuggingFace cache.' if INDEX_PATH else '⚠️ Running without index cache - queries may be slower.'}"
    )
    # Connect UI to functions (must happen inside the Blocks context)
    semantic_btn.click(
        fn=get_semantic_profile,
        inputs=[word_input, lang_input],
        outputs=[semantic_output],
        api_name="get_semantic_profile"
    )
    query_btn.click(
        fn=run_query,
        inputs=[start_input, rel_input, end_input, limit_slider],
        outputs=[results_output, status_output],
        api_name="run_query"
    )
    raw_query_btn.click(
        fn=run_raw_query,
        inputs=[raw_sql_input],
        outputs=[raw_results_output, raw_status_output],
        api_name="run_raw_query"
    )
    schema_btn.click(
        fn=get_schema_info,
        inputs=None,
        outputs=schema_output,
        api_name="get_schema"
    )
if __name__ == "__main__":
    # Startup banner so the Space logs clearly show the boot point.
    banner = "=" * 50
    print(f"\n{banner}")
    print("🚀 Starting ConceptNet SQLite Explorer")
    print(f"{banner}\n")
    demo.launch(ssr_mode=False)