conceptnet_db / app.py
cstr's picture
Update app.py
9592e20 verified
raw
history blame
14.2 kB
import gradio as gr
import sqlite3
import pandas as pd
from huggingface_hub import hf_hub_download, HfApi
import os
import time
import json
# ===== CONFIGURATION =====
TARGET_LANGUAGES = ['de', 'en', 'es', 'fr', 'it', 'ja', 'nl', 'pl', 'pt', 'ru', 'zh']
INDEXED_REPO_ID = "cstr/conceptnet-de-indexed"
INDEXED_DB_FILENAME = "conceptnet-de-indexed.db"
PROGRESS_FILENAME = "indexing_progress.json"
CONCEPTNET_BASE = "http://conceptnet.io"
# =========================
print(f"🌍 Languages: {', '.join([l.upper() for l in TARGET_LANGUAGES])}")
HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HF_API_TOKEN")
def log_progress(message, level="INFO"):
timestamp = time.strftime("%H:%M:%S")
prefix = {"INFO": "ℹ️ ", "SUCCESS": "βœ…", "ERROR": "❌", "WARN": "⚠️ ", "DEBUG": "πŸ”"}.get(level, "")
print(f"[{timestamp}] {prefix} {message}")
def check_remote_progress():
if not HF_TOKEN:
return {"indexing_complete": False}
try:
api = HfApi()
api.repo_info(repo_id=INDEXED_REPO_ID, repo_type="dataset", token=HF_TOKEN)
progress_path = hf_hub_download(repo_id=INDEXED_REPO_ID, filename=PROGRESS_FILENAME, repo_type="dataset", token=HF_TOKEN)
with open(progress_path, 'r') as f:
return json.load(f)
except:
return {"indexing_complete": False}
def create_indexed_database():
progress = check_remote_progress()
if progress.get("indexing_complete"):
try:
return hf_hub_download(repo_id=INDEXED_REPO_ID, filename=INDEXED_DB_FILENAME, repo_type="dataset", token=HF_TOKEN)
except:
pass
return None
DB_PATH = create_indexed_database()
def get_db_connection():
conn = sqlite3.connect(DB_PATH, check_same_thread=False)
conn.execute("PRAGMA cache_size = -256000")
return conn
log_progress("Database loaded", "SUCCESS")
def get_semantic_profile(word, lang='en', progress=gr.Progress()):
"""FIXED: Use full URL for relations!"""
log_progress(f"Profile: {word} ({lang})", "INFO")
progress(0, desc="Starting...")
if not word or lang not in TARGET_LANGUAGES:
return "⚠️ Invalid input"
word = word.strip().lower().replace(' ', '_')
like_path = f"{CONCEPTNET_BASE}/c/{lang}/{word}%"
# FIXED: Use FULL URLs for relations!
relations = [
("IsA", f"{CONCEPTNET_BASE}/r/IsA"),
("PartOf", f"{CONCEPTNET_BASE}/r/PartOf"),
("HasA", f"{CONCEPTNET_BASE}/r/HasA"),
("UsedFor", f"{CONCEPTNET_BASE}/r/UsedFor"),
("CapableOf", f"{CONCEPTNET_BASE}/r/CapableOf"),
("Causes", f"{CONCEPTNET_BASE}/r/Causes"),
("HasProperty", f"{CONCEPTNET_BASE}/r/HasProperty"),
("Synonym", f"{CONCEPTNET_BASE}/r/Synonym"),
("Antonym", f"{CONCEPTNET_BASE}/r/Antonym"),
("AtLocation", f"{CONCEPTNET_BASE}/r/AtLocation"),
("RelatedTo", f"{CONCEPTNET_BASE}/r/RelatedTo"),
("DerivedFrom", f"{CONCEPTNET_BASE}/r/DerivedFrom"),
]
output_md = f"# 🧠 Semantic Profile: '{word}' ({lang.upper()})\n\n"
try:
with get_db_connection() as conn:
cursor = conn.cursor()
progress(0.05, desc="Finding nodes...")
# Find nodes
cursor.execute("SELECT id, label FROM node WHERE id LIKE ? LIMIT 5", (like_path,))
nodes = cursor.fetchall()
if not nodes:
return f"# 🧠 '{word}'\n\n⚠️ Not found"
log_progress(f"Found {len(nodes)} nodes", "SUCCESS")
for node_id, label in nodes[:3]:
output_md += f"**Node:** `{node_id}` β†’ **{label}**\n"
output_md += "\n"
total = 0
for i, (rel_name, rel_url) in enumerate(relations):
progress((i + 1) / len(relations), desc=f"Querying {rel_name}...")
output_md += f"## {rel_name}\n\n"
found = False
# Outgoing - use exact match on rel_id (FAST!)
start = time.time()
cursor.execute("""
SELECT en.label, e.weight
FROM edge e
JOIN node en ON e.end_id = en.id
WHERE e.start_id LIKE ? AND e.rel_id = ?
ORDER BY e.weight DESC
LIMIT 7
""", (like_path, rel_url))
results = cursor.fetchall()
elapsed = time.time() - start
log_progress(f" {rel_name} outgoing: {len(results)} in {elapsed:.3f}s", "DEBUG")
for label, weight in results:
output_md += f"- **{word}** {rel_name} β†’ *{label}* `[{weight:.3f}]`\n"
found = True
total += 1
# Incoming
start = time.time()
cursor.execute("""
SELECT s.label, e.weight
FROM edge e
JOIN node s ON e.start_id = s.id
WHERE e.end_id LIKE ? AND e.rel_id = ?
ORDER BY e.weight DESC
LIMIT 7
""", (like_path, rel_url))
results = cursor.fetchall()
elapsed = time.time() - start
log_progress(f" {rel_name} incoming: {len(results)} in {elapsed:.3f}s", "DEBUG")
for label, weight in results:
output_md += f"- *{label}* {rel_name} β†’ **{word}** `[{weight:.3f}]`\n"
found = True
total += 1
if not found:
output_md += "*No results*\n"
output_md += "\n"
progress(1.0, desc="Complete!")
output_md += f"---\n**Total relations:** {total}\n"
log_progress(f"Profile complete: {total} relations", "SUCCESS")
return output_md
except Exception as e:
log_progress(f"Error: {e}", "ERROR")
import traceback
traceback.print_exc()
return f"**❌ Error:** {e}"
def run_query(start_node, relation, end_node, limit, progress=gr.Progress()):
"""Query builder - FIXED with full URLs"""
log_progress(f"Query: start={start_node}, rel={relation}, end={end_node}", "INFO")
progress(0, desc="Building...")
query = """
SELECT
e.id, s.id, r.label, en.id, e.weight, s.label, en.label
FROM edge e
JOIN relation r ON e.rel_id = r.id
JOIN node s ON e.start_id = s.id
JOIN node en ON e.end_id = en.id
WHERE 1=1
"""
params = []
try:
with get_db_connection() as conn:
progress(0.3, desc="Adding filters...")
# Start node
if start_node and start_node.strip():
if start_node.startswith('http://'):
pattern = f"{start_node}%"
else:
pattern = f"{CONCEPTNET_BASE}/c/en/{start_node}%"
query += " AND s.id LIKE ?"
params.append(pattern)
# Relation - FIXED: use full URL!
if relation and relation.strip():
if relation.startswith('http://'):
rel_value = relation
elif relation.startswith('/r/'):
rel_value = f"{CONCEPTNET_BASE}{relation}"
else:
rel_value = f"{CONCEPTNET_BASE}/r/{relation}"
query += " AND r.id = ?"
params.append(rel_value)
log_progress(f"Relation: {rel_value}", "DEBUG")
# End node
if end_node and end_node.strip():
if end_node.startswith('http://'):
pattern = f"{end_node}%"
else:
pattern = f"{CONCEPTNET_BASE}/c/en/{end_node}%"
query += " AND en.id LIKE ?"
params.append(pattern)
query += " ORDER BY e.weight DESC LIMIT ?"
params.append(limit)
progress(0.6, desc="Executing...")
start_time = time.time()
df = pd.read_sql_query(query, conn, params=params)
elapsed = time.time() - start_time
log_progress(f"Query done: {len(df)} rows in {elapsed:.2f}s", "SUCCESS")
progress(1.0, desc="Done!")
if df.empty:
return pd.DataFrame(), f"⚠️ No results ({elapsed:.2f}s)"
df.columns = ['edge_id', 'start_id', 'relation', 'end_id', 'weight', 'start_label', 'end_label']
return df, f"βœ… {len(df)} results in {elapsed:.2f}s"
except Exception as e:
log_progress(f"Error: {e}", "ERROR")
import traceback
traceback.print_exc()
return pd.DataFrame(), f"❌ {e}"
def run_raw_query(sql_query):
if not sql_query.strip().upper().startswith("SELECT"):
return pd.DataFrame(), "❌ Only SELECT"
try:
with get_db_connection() as conn:
start = time.time()
df = pd.read_sql_query(sql_query, conn)
elapsed = time.time() - start
return df, f"βœ… {len(df)} rows in {elapsed:.3f}s"
except Exception as e:
return pd.DataFrame(), f"❌ {e}"
def get_schema_info():
md = f"# πŸ“š Schema\n\n"
md += f"**Repo:** [{INDEXED_REPO_ID}](https://huggingface.co/datasets/{INDEXED_REPO_ID})\n\n"
md += "## Key Discovery\n\n"
md += "βœ… **Relations use FULL URLs:** `http://conceptnet.io/r/IsA` (not `/r/IsA`)\n\n"
md += "βœ… **Nodes use FULL URLs:** `http://conceptnet.io/c/en/dog` (not `/c/en/dog`)\n\n"
try:
with get_db_connection() as conn:
cursor = conn.cursor()
md += "## Common Relations\n\n"
cursor.execute("SELECT id, label FROM relation ORDER BY label LIMIT 20")
for rel_id, label in cursor.fetchall():
md += f"- **{label}:** `{rel_id}`\n"
md += "\n## Tables\n\n"
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'")
for table, in cursor.fetchall():
cursor.execute(f"SELECT COUNT(*) FROM {table}")
md += f"- **{table}:** {cursor.fetchone()[0]:,} rows\n"
except Exception as e:
md += f"\nError: {e}\n"
return md
# UI
with gr.Blocks(title="ConceptNet Explorer", theme=gr.themes.Soft()) as demo:
gr.Markdown("# 🧠 ConceptNet Explorer")
gr.Markdown(f"**Languages:** {', '.join([l.upper() for l in TARGET_LANGUAGES])} | βœ… **FIXED:** Using full URLs for relations!")
with gr.Tabs():
with gr.TabItem("πŸ” Semantic Profile"):
gr.Markdown("**Explore semantic relations for any word**")
with gr.Row():
word_input = gr.Textbox(
label="Word",
placeholder="dog",
value="dog",
info="Enter a word"
)
lang_input = gr.Dropdown(
choices=TARGET_LANGUAGES,
value="en",
label="Language"
)
semantic_btn = gr.Button("πŸ” Get Semantic Profile", variant="primary", size="lg")
semantic_output = gr.Markdown()
gr.Examples(
examples=[
["dog", "en"],
["hund", "de"],
["perro", "es"],
["chien", "fr"],
],
inputs=[word_input, lang_input]
)
with gr.TabItem("⚑ Query Builder"):
gr.Markdown("**Build custom relationship queries**")
with gr.Row():
start_input = gr.Textbox(label="Start Node", placeholder="dog")
rel_input = gr.Textbox(label="Relation", placeholder="IsA", value="IsA")
end_input = gr.Textbox(label="End Node", placeholder="")
limit_slider = gr.Slider(label="Limit", minimum=1, maximum=200, value=50)
query_btn = gr.Button("▢️ Run Query", variant="primary", size="lg")
status_output = gr.Markdown()
results_output = gr.DataFrame(wrap=True)
with gr.TabItem("πŸ’» Raw SQL"):
gr.Markdown("**Execute custom SQL queries**")
raw_sql_input = gr.Textbox(
label="SQL Query",
value=f"SELECT e.*, r.label FROM edge e JOIN relation r ON e.rel_id = r.id WHERE e.start_id = '{CONCEPTNET_BASE}/c/en/dog' LIMIT 10",
lines=3
)
raw_btn = gr.Button("▢️ Execute")
raw_status = gr.Markdown()
raw_results = gr.DataFrame()
with gr.TabItem("πŸ“Š Schema"):
schema_btn = gr.Button("πŸ“Š Load Schema")
schema_output = gr.Markdown()
gr.Markdown(
"---\n"
"**Fixed:** Relations now use full URLs (`http://conceptnet.io/r/IsA`) | "
"**Performance:** Exact match on rel_id = fast queries!"
)
semantic_btn.click(get_semantic_profile, [word_input, lang_input], semantic_output)
query_btn.click(run_query, [start_input, rel_input, end_input, limit_slider], [results_output, status_output])
raw_btn.click(run_raw_query, raw_sql_input, [raw_results, raw_status])
schema_btn.click(get_schema_info, None, schema_output)
if __name__ == "__main__":
log_progress("APP READY - Relations use full URLs now!", "SUCCESS")
demo.launch(ssr_mode=False)