Spaces:

cstr
/

conceptnet_db

Running

File size: 9,382 Bytes

import gradio as gr
import sqlite3
import pandas as pd
from huggingface_hub import hf_hub_download
import os
import traceback

# --- 1. Download and Cache the Database ---
print("Downloading database (if not already cached)...")
REPO_ID = "ysenarath/conceptnet-sqlite"
DB_FILENAME = "data/conceptnet-v5.7.0.db"

# This is the critical caching step for HF Spaces
DB_PATH = hf_hub_download(
    repo_id=REPO_ID, 
    filename=DB_FILENAME, 
    repo_type="dataset"
)
print(f"Database is available at: {DB_PATH}")

# --- 2. Database Helper Functions ---

def get_db_connection():
    """
    Creates a new read-only connection to the SQLite database.
    This is safer for Gradio's multi-threading.
    """
    try:
        # Connect in read-only mode
        db_uri = f"file:{DB_PATH}?mode=ro"
        conn = sqlite3.connect(db_uri, uri=True)
        return conn
    except Exception as e:
        print(f"Error connecting to DB: {e}")
        traceback.print_exc()
        return None

def get_schema_info():
    """
    Dynamically queries the SQLite database to get its schema.
    """
    print("Getting schema info...")
    schema_md = "# Database Schema\n\n"
    try:
        with get_db_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%';")
            tables = cursor.fetchall()
            
            if not tables:
                return "Could not find any tables in the database."

            for table in tables:
                table_name = table[0]
                schema_md += f"## Table: `{table_name}`\n\n"
                schema_md += "| Column Name | Data Type | Not Null | Primary Key |\n"
                schema_md += "|:------------|:----------|:---------|:------------|\n"
                
                cursor.execute(f"PRAGMA table_info({table_name});")
                columns = cursor.fetchall()
                for col in columns:
                    # col structure: (cid, name, type, notnull, dflt_value, pk)
                    name, dtype, notnull, pk = col[1], col[2], col[3], col[5]
                    schema_md += f"| `{name}` | `{dtype}` | {bool(notnull)} | {bool(pk)} |\n"
                schema_md += "\n"
            
            return schema_md
    except Exception as e:
        print(f"Error in get_schema_info: {e}")
        traceback.print_exc()
        return f"An error occurred while fetching schema: {e}"

def run_query(start_node, relation, end_node, limit):
    """
    Runs an efficient query by filtering on text paths first,
    then joining by indexed IDs.
    
    *** FIX ***: This now queries the `node.id` column (which contains the full path)
    and the `relation.label` column, based on the raw query results.
    """
    print(f"Running query: start='{start_node}', rel='{relation}', end='{end_node}'")
    
    # --- FIX: Select the `id` column (which has the path) ---
    select_clause = """
        SELECT
            e.id AS edge_id,
            s.id AS start_id_path,
            r.label AS relation_label,
            en.id AS end_id_path,
            e.weight,
            e.dataset,
            e.surface_text,
            s.label AS start_label_text,
            en.label AS end_label_text
        FROM
            edge AS e
        JOIN
            relation AS r ON e.rel_id = r.id
        JOIN
            node AS s ON e.start_id = s.id
        JOIN
            node AS en ON e.end_id = en.id
    """
    
    where_conditions = []
    params = []
    
    try:
        # --- FIX: Query `node.id` for nodes, `relation.label` for relations ---
        if start_node:
            # We add a wildcard to the front to match the full http://... ID
            # unless the user has already added their own wildcard.
            param_val = start_node if "%" in start_node else f"%{start_node}"
            where_conditions.append(f"e.start_id IN (SELECT id FROM node WHERE id LIKE ?)")
            params.append(param_val)
        
        if relation:
            # Relation table uses `label` which is a direct match (e.g., /r/IsA)
            op = "LIKE" if "%" in relation else "="
            where_conditions.append(f"e.rel_id IN (SELECT id FROM relation WHERE label {op} ?)")
            params.append(relation)

        if end_node:
            # We add a wildcard to the front to match the full http://... ID
            param_val = end_node if "%" in end_node else f"%{end_node}"
            where_conditions.append(f"e.end_id IN (SELECT id FROM node WHERE id LIKE ?)")
            params.append(param_val)
        # --- End of Fix ---

        if not where_conditions:
            where_clause = " WHERE 1=1" # Get random edges if no filter
        else:
            where_clause = " WHERE " + " AND ".join(where_conditions)
            
        query = select_clause + where_clause + " LIMIT ?"
        params.append(limit)
        
        print(f"Executing SQL: {query}")
        print(f"With params: {params}")

        with get_db_connection() as conn:
            df = pd.read_sql_query(query, conn, params=params)
        
        if df.empty:
            return pd.DataFrame(), "Query ran successfully but returned no results. (This is common if your query is very specific)."
            
        return df, "Query successful!"

    except Exception as e:
        print(f"Error in run_query: {e}")
        traceback.print_exc()
        err_msg = f"**Query Failed!**\n\n`{e}`"
        return pd.DataFrame(), err_msg

def run_raw_query(sql_query):
    """
    Executes a raw, read-only SQL query against the database.
    DANGEROUS but useful for debugging.
    """
    print(f"Running raw query: {sql_query}")
    
    # Basic safety check for read-only
    if not sql_query.strip().upper().startswith("SELECT"):
        return pd.DataFrame(), "**Error:** Only `SELECT` statements are allowed for safety."
        
    try:
        with get_db_connection() as conn:
            df = pd.read_sql_query(sql_query, conn)
        
        if df.empty:
            return df, "Query ran successfully but returned no results."
        
        return df, "Raw query successful!"
    except Exception as e:
        print(f"Error in run_raw_query: {e}")
        traceback.print_exc()
        return pd.DataFrame(), f"**Query Failed!**\n\n`{e}`"


# --- 3. Build the Gradio UI ---

with gr.Blocks(title="ConceptNet SQLite Explorer") as demo:
    gr.Markdown(f"# ConceptNet SQLite Explorer (DB: `{DB_FILENAME}`)")
    gr.Markdown(f"**Note:** Initial query might take a few seconds as the database warms up.")
    
    with gr.Tabs():
        with gr.TabItem("Query Builder"):
            gr.Markdown(
                "**Run a query against the database.**\n"
                "This query joins the `edge`, `node`, and `relation` tables to find connections.\n"
                "Use the *path part* of the ID (e.g., `/c/en/dog` or `/r/Is%`)."
            )
            with gr.Row():
                # --- FIX: Updated labels for clarity ---
                start_input = gr.Textbox(label="Start Node Path", placeholder="/c/en/dog")
                rel_input = gr.Textbox(label="Relation Path", placeholder="/r/IsA")
                end_input = gr.Textbox(label="End Node Path", placeholder="/c/en/animal")
            
            limit_slider = gr.Slider(label="Limit", minimum=1, maximum=200, value=10, step=1)
            query_btn = gr.Button("Run Query", variant="primary")
            
            status_output = gr.Markdown("Status will appear here...")
            results_output = gr.DataFrame(label="Query Results", interactive=False)
        
        with gr.TabItem("Raw SQL Query"):
            gr.Markdown("**Danger Zone:** Run a raw `SELECT` query against the database. Use the Schema Explorer tab to see table/column names.")
            # --- FIX: Updated placeholder to a useful, correct query ---
            raw_sql_input = gr.Textbox(
                label="Raw SQL Query", 
                placeholder="SELECT id, label, language, path FROM node WHERE id LIKE '%/c/en/dog' LIMIT 5",
                lines=5
            )
            raw_query_btn = gr.Button("Run Raw SQL", variant="secondary")
            raw_status_output = gr.Markdown("Status will appear here...")
            raw_results_output = gr.DataFrame(label="Raw Query Results", interactive=False)

        with gr.TabItem("Schema Explorer"):
            gr.Markdown("Click the button to see all tables and columns in the database. This helps you write correct queries.")
            schema_btn = gr.Button("Show Database Schema", variant="secondary")
            schema_output = gr.Markdown("Schema will appear here...")
            
    # --- 4. Connect UI Elements to Functions ---
    query_btn.click(
        fn=run_query,
        inputs=[start_input, rel_input, end_input, limit_slider],
        outputs=[results_output, status_output],
        api_name="run_query"
    )
    
    raw_query_btn.click(
        fn=run_raw_query,
        inputs=[raw_sql_input],
        outputs=[raw_results_output, raw_status_output],
        api_name="run_raw_query"
    )

    schema_btn.click(
        fn=get_schema_info,
        inputs=None,
        outputs=schema_output,
        api_name="get_schema"
    )

if __name__ == "__main__":
    # Removed experimental ssr_mode=False
    demo.launch(ssr_mode=False)