# Spaces: cstr/conceptnet_db (Running) - File size: 15,045 Bytes

import contextlib
import json
import os
import sqlite3
import time
import traceback

import gradio as gr
import pandas as pd
from huggingface_hub import hf_hub_download, HfApi

# ===== CONFIGURATION =====
# Language codes accepted by the profile lookup and offered in the UI dropdown.
TARGET_LANGUAGES = [
    'de', 'en', 'es', 'fr', 'it', 'ja',
    'nl', 'pl', 'pt', 'ru', 'zh',
]
# Dataset repo and file names of the pre-indexed database and its progress record.
INDEXED_REPO_ID = "cstr/conceptnet-de-indexed"
INDEXED_DB_FILENAME = "conceptnet-de-indexed.db"
PROGRESS_FILENAME = "indexing_progress.json"
# Path used when the indexed database is not downloaded from the hub.
LOCAL_DB_PATH = "/tmp/conceptnet-indexed.db"
# Prefix used to build node-id patterns for queries.
CONCEPTNET_BASE = "http://conceptnet.io"
# =========================

print("🌍 Languages: " + ', '.join(code.upper() for code in TARGET_LANGUAGES))

# First non-empty value among the supported token environment variables.
HF_TOKEN = (
    os.environ.get("HF_TOKEN")
    or os.environ.get("HUGGINGFACE_TOKEN")
    or os.environ.get("HF_API_TOKEN")
)

if HF_TOKEN:
    print("✅ HF_TOKEN found")

# Source repo/file of the un-indexed database (not referenced in the code visible here).
ORIGINAL_REPO_ID = "ysenarath/conceptnet-sqlite"
ORIGINAL_DB_FILENAME = "data/conceptnet-v5.7.0.db"

def log_progress(message, level="INFO"):
    """Print *message* to stdout prefixed with an HH:MM:SS timestamp and a level icon.

    Args:
        message: Text to print.
        level: One of "INFO", "SUCCESS", "ERROR", "WARN" or "DEBUG"; any
            other value prints with an empty icon.
    """
    icons = {
        "INFO": "ℹ️ ",
        "SUCCESS": "✅",
        "ERROR": "❌",
        "WARN": "⚠️ ",
        "DEBUG": "🔍",
    }
    icon = icons.get(level, "")
    now = time.strftime("%H:%M:%S")
    print("[{}] {} {}".format(now, icon, message))

def check_remote_progress():
    """Fetch the indexing progress record from the indexed dataset repo.

    Returns:
        dict: The parsed contents of PROGRESS_FILENAME. Falls back to
        ``{"indexing_complete": False}`` when no token is configured, when
        the repo or file cannot be fetched or parsed, or when the file does
        not contain a JSON object.
    """
    if not HF_TOKEN:
        return {"indexing_complete": False}
    try:
        api = HfApi()
        # Called only for its side effect: it raises if the repo is not reachable.
        api.repo_info(repo_id=INDEXED_REPO_ID, repo_type="dataset", token=HF_TOKEN)
        progress_path = hf_hub_download(
            repo_id=INDEXED_REPO_ID,
            filename=PROGRESS_FILENAME,
            repo_type="dataset",
            token=HF_TOKEN,
        )
        with open(progress_path, 'r', encoding="utf-8") as f:
            progress = json.load(f)
    except Exception as exc:
        # Still best-effort, but no longer swallows KeyboardInterrupt/SystemExit
        # and leaves a trace of why the remote state was unavailable.
        log_progress(f"Could not read remote progress: {exc}", "WARN")
        return {"indexing_complete": False}
    if not isinstance(progress, dict):
        # Callers use .get() on the result, so anything but an object is unusable.
        log_progress("Remote progress file is not a JSON object", "WARN")
        return {"indexing_complete": False}
    return progress

def create_indexed_database():
    """Return the path of the SQLite database the app should open.

    When the remote progress record reports ``indexing_complete``, the
    indexed database is downloaded from INDEXED_REPO_ID and its local path
    is returned. In every other case, including a failed download, the
    function returns LOCAL_DB_PATH.

    Returns:
        str: Filesystem path of the database file.
    """
    progress = check_remote_progress()
    if progress.get("indexing_complete", False):
        try:
            indexed_path = hf_hub_download(
                repo_id=INDEXED_REPO_ID,
                filename=INDEXED_DB_FILENAME,
                repo_type="dataset",
                token=HF_TOKEN,
            )
        except Exception as exc:
            # Keep the fallback, but report the failure instead of hiding it.
            log_progress(f"Indexed DB download failed: {exc}", "WARN")
        else:
            log_progress("Downloaded indexed DB", "SUCCESS")
            return indexed_path
    if not os.path.exists(LOCAL_DB_PATH):
        # Nothing in this function creates the local file, so flag its absence early.
        log_progress(f"Local DB not found at {LOCAL_DB_PATH}", "WARN")
    return LOCAL_DB_PATH

# Resolved once at import time; get_db_connection() opens this path.
DB_PATH = create_indexed_database()

def get_db_connection():
    """Open a new SQLite connection to DB_PATH with an enlarged page cache.

    The connection is created with ``check_same_thread=False`` so it may be
    used from a thread other than the one that opened it. The caller owns
    the connection and is responsible for closing it.

    Returns:
        sqlite3.Connection: A freshly opened connection.
    """
    connection = sqlite3.connect(database=DB_PATH, check_same_thread=False)
    # A negative cache_size is interpreted by SQLite as a size in KiB rather than pages.
    cache_pragma = "PRAGMA cache_size = -256000"
    connection.execute(cache_pragma)
    return connection

def _debug_sample_edges(cursor, pattern):
    """Step 1: fetch and print up to five edges whose start id matches *pattern*."""
    log_progress("\n1. Finding actual edges for 'dog':", "INFO")
    cursor.execute("""
        SELECT e.id, e.start_id, e.rel_id, e.end_id, e.weight
        FROM edge e
        WHERE e.start_id LIKE ?
        LIMIT 5
    """, (pattern,))
    edges = cursor.fetchall()

    log_progress(f"Found {len(edges)} edges:", "SUCCESS")
    for edge_id, start_id, rel_id, end_id, weight in edges:
        print(f"  {edge_id}")
        print(f"    start: {start_id}")
        print(f"    rel:   {rel_id}")
        print(f"    end:   {end_id}")
        print(f"    weight: {weight}")
    return edges


def _debug_list_relations(cursor):
    """Step 2: print up to twenty rows of the relation table."""
    log_progress("\n2. What relations exist?", "INFO")
    cursor.execute("SELECT id, label FROM relation LIMIT 20")
    relations = cursor.fetchall()
    log_progress(f"Found {len(relations)} relations:", "SUCCESS")
    for rel_id, label in relations:
        print(f"  {rel_id} -> {label}")


def _debug_relation_lookup(cursor, rel_id):
    """Step 3: check that an edge's rel_id resolves to a row in the relation table."""
    log_progress("\n3. Testing relation JOIN:", "INFO")
    if rel_id is None:
        log_progress("  Sample edge has no relation ID; skipping lookup", "WARN")
        return
    log_progress(f"Looking up relation ID: {rel_id}", "DEBUG")
    cursor.execute("SELECT id, label FROM relation WHERE id = ?", (rel_id,))
    row = cursor.fetchone()
    if row:
        log_progress(f"  ✅ Found: {row[0]} -> {row[1]}", "SUCCESS")
    else:
        log_progress("  ❌ Relation ID not found in relation table!", "ERROR")


def _debug_full_join(cursor, edge):
    """Step 4: run the edge/node/relation JOIN for the start node of *edge*.

    When the JOIN yields nothing, the start node, end node and relation of
    *edge* are looked up one by one to show which table lacks the row.
    """
    _, start_id, rel_id, end_id, _ = edge
    log_progress(f"\n4. Testing full JOIN on: {start_id}", "INFO")

    query = """
        SELECT
            e.id,
            s.label AS start_label,
            r.label AS relation,
            en.label AS end_label,
            e.weight
        FROM edge e
        JOIN node s ON e.start_id = s.id
        JOIN relation r ON e.rel_id = r.id
        JOIN node en ON e.end_id = en.id
        WHERE e.start_id = ?
        LIMIT 5
    """

    started = time.perf_counter()
    cursor.execute(query, (start_id,))
    rows = cursor.fetchall()
    elapsed = time.perf_counter() - started

    log_progress(f"Full JOIN returned {len(rows)} in {elapsed:.3f}s", "SUCCESS" if rows else "ERROR")

    if rows:
        for _, s_label, r_label, e_label, weight in rows:
            print(f"  {s_label} --{r_label}--> {e_label} [{weight:.3f}]")
        return

    log_progress("JOIN returned nothing! Checking each table...", "ERROR")
    lookups = (
        ("Start node", "SELECT id, label FROM node WHERE id = ?", start_id),
        ("End node", "SELECT id, label FROM node WHERE id = ?", end_id),
        ("Relation", "SELECT id, label FROM relation WHERE id = ?", rel_id),
    )
    for title, sql, key in lookups:
        cursor.execute(sql, (key,))
        log_progress(f"  {title}: {cursor.fetchone()}", "DEBUG")


def _debug_like_join(cursor, pattern, relation):
    """Step 5: run the LIKE + JOIN query and, if it is empty, narrow down why."""
    log_progress("\n5. Testing LIKE + JOIN (what semantic profile does):", "INFO")

    query = """
        SELECT
            en.label,
            e.weight
        FROM edge e
        JOIN node en ON e.end_id = en.id
        JOIN relation r ON e.rel_id = r.id
        WHERE e.start_id LIKE ? AND r.label = ?
        LIMIT 5
    """

    log_progress(f"Pattern: {pattern}", "DEBUG")
    log_progress(f"Relation: {relation}", "DEBUG")

    started = time.perf_counter()
    cursor.execute(query, (pattern, relation))
    rows = cursor.fetchall()
    elapsed = time.perf_counter() - started

    log_progress(f"Result: {len(rows)} rows in {elapsed:.3f}s", "SUCCESS" if rows else "WARN")

    if rows:
        for label, weight in rows:
            print(f"  dog IsA {label} [{weight:.3f}]")
        return

    log_progress("No results! Let's check why...", "WARN")

    # Edges matching the pattern, regardless of relation.
    cursor.execute("SELECT COUNT(*) FROM edge WHERE start_id LIKE ?", (pattern,))
    edge_count = cursor.fetchone()[0]
    log_progress(f"  Edges with pattern: {edge_count}", "DEBUG")

    # Edges carrying the relation, regardless of start node.
    cursor.execute(
        "SELECT COUNT(*) FROM edge e JOIN relation r ON e.rel_id = r.id WHERE r.label = ?",
        (relation,),
    )
    rel_edge_count = cursor.fetchone()[0]
    log_progress(f"  Edges with relation {relation}: {rel_edge_count}", "DEBUG")

    # Both conditions together, without the end-node JOIN used above.
    cursor.execute("""
        SELECT COUNT(*) FROM edge e
        JOIN relation r ON e.rel_id = r.id
        WHERE e.start_id LIKE ? AND r.label = ?
    """, (pattern, relation))
    combo_count = cursor.fetchone()[0]
    log_progress(f"  Combination: {combo_count}", "DEBUG")

    if combo_count != 0:
        return

    log_progress("  ❌ NO edges match pattern + relation!", "ERROR")
    log_progress("  Checking what relations DO exist for 'dog':", "INFO")

    cursor.execute("""
        SELECT DISTINCT r.label, COUNT(*) as cnt
        FROM edge e
        JOIN relation r ON e.rel_id = r.id
        WHERE e.start_id LIKE ?
        GROUP BY r.label
        ORDER BY cnt DESC
        LIMIT 10
    """, (pattern,))

    actual_rels = cursor.fetchall()
    log_progress("  Actual relations for 'dog':", "INFO")
    for rel_label, count in actual_rels:
        print(f"    {rel_label}: {count} edges")


def deep_debug():
    """Run a fixed series of diagnostic queries for the English word 'dog'.

    Results go to stdout via log_progress/print; nothing is returned. The
    steps are: sample edges, list relations, resolve one relation id, run
    the full three-way JOIN for one edge, and run the LIKE + JOIN query used
    by the semantic profile. The session stops after step 1 if no edge is
    found. Any exception is logged with its traceback and not re-raised.
    """
    banner = "=" * 60
    log_progress(banner, "INFO")
    log_progress("DEEP DEBUGGING SESSION", "INFO")
    log_progress(banner, "INFO")

    dog_pattern = f"{CONCEPTNET_BASE}/c/en/dog%"

    try:
        # sqlite3's own context manager only commits/rolls back; closing()
        # makes sure the connection is released when the session ends.
        with contextlib.closing(get_db_connection()) as conn:
            cursor = conn.cursor()

            edges = _debug_sample_edges(cursor, dog_pattern)
            if not edges:
                log_progress("NO EDGES FOUND! Database might be corrupted!", "ERROR")
                return

            first_edge = edges[0]
            _debug_list_relations(cursor)
            _debug_relation_lookup(cursor, first_edge[2])
            _debug_full_join(cursor, first_edge)
            _debug_like_join(cursor, dog_pattern, "/r/IsA")

            log_progress("\n" + banner, "INFO")
            log_progress("DEBUGGING COMPLETE", "INFO")
            log_progress(banner + "\n", "INFO")

    except Exception as e:
        log_progress(f"Debug failed: {e}", "ERROR")
        traceback.print_exc()

# Run deep debugging once at import time, before the UI below is built,
# so its output appears in the server logs.
deep_debug()

def get_semantic_profile(word, lang='en', progress=gr.Progress()):
    """Build a Markdown report of the relations leaving a word's nodes.

    Args:
        word: Term to look up. It is trimmed, lower-cased and its spaces are
            replaced by underscores, then matched as a prefix of node ids.
        lang: Language code; must be one of TARGET_LANGUAGES.
        progress: Gradio progress tracker; kept for the UI callback
            signature but not used in this function.

    Returns:
        str: Markdown listing up to three matching nodes and, for each of
        the 20 most frequent relation types, the five highest-weighted
        edges. A short warning is returned for invalid input or when no
        node matches, and an error string if the lookup raises.
    """
    log_progress(f"Profile request: {word} ({lang})", "INFO")

    # Normalise before validating: whitespace-only input would otherwise
    # become an empty word whose pattern matches every node of the language.
    word = (word or "").strip().lower().replace(' ', '_')
    if not word or lang not in TARGET_LANGUAGES:
        return "⚠️ Invalid input"

    # '_' and '%' are LIKE wildcards; escape them so the word matches literally.
    escaped = word.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')
    like_path = f"{CONCEPTNET_BASE}/c/{lang}/{escaped}%"

    sections = [
        f"# 🧠 Semantic Profile: '{word}' ({lang.upper()})\n\n",
        "*Check server logs for detailed debug information*\n\n",
    ]

    relation_query = """
        SELECT DISTINCT r.label, COUNT(*) as cnt
        FROM edge e
        JOIN relation r ON e.rel_id = r.id
        WHERE e.start_id LIKE ? ESCAPE '\\'
        GROUP BY r.label
        ORDER BY cnt DESC
    """
    sample_query = """
        SELECT en.label, e.weight
        FROM edge e
        JOIN node en ON e.end_id = en.id
        JOIN relation r ON e.rel_id = r.id
        WHERE e.start_id LIKE ? ESCAPE '\\' AND r.label = ?
        ORDER BY e.weight DESC
        LIMIT 5
    """

    try:
        # closing() releases the connection; sqlite3's own context manager
        # only commits or rolls back.
        with contextlib.closing(get_db_connection()) as conn:
            cursor = conn.cursor()

            # Find nodes
            cursor.execute(
                "SELECT id, label FROM node WHERE id LIKE ? ESCAPE '\\' LIMIT 5",
                (like_path,),
            )
            nodes = cursor.fetchall()

            if not nodes:
                return f"# 🧠 '{word}'\n\n⚠️ Not found"

            sections.extend(
                f"**Node:** `{node_id}` → {label}\n" for node_id, label in nodes[:3]
            )
            sections.append("\n## Relations Found\n\n")

            # Relation types present for these start nodes, most frequent first.
            cursor.execute(relation_query, (like_path,))
            relations = cursor.fetchall()

            log_progress(f"Found {len(relations)} relation types", "INFO")

            for rel_label, count in relations[:20]:
                sections.append(f"### {rel_label} ({count} edges)\n\n")

                # Sample edges for this relation, strongest first.
                cursor.execute(sample_query, (like_path, rel_label))
                sections.extend(
                    f"- **{word}** {rel_label} → *{label}* `[{weight:.3f}]`\n"
                    for label, weight in cursor.fetchall()
                )
                sections.append("\n")

            return "".join(sections)

    except Exception as e:
        log_progress(f"Error: {e}", "ERROR")
        traceback.print_exc()
        return f"**❌ Error:** {e}"

def run_raw_query(sql_query):
    """Execute a user-supplied SELECT statement and return its result.

    Args:
        sql_query: SQL text from the UI. ``None`` is treated as empty.

    Returns:
        tuple: ``(DataFrame, status)``. The frame holds the rows on success
        and is empty otherwise; the status is a short message with the row
        count and elapsed time, a rejection notice for statements that do
        not start with SELECT, or the error text.
    """
    statement = (sql_query or "").strip()
    if not statement.upper().startswith("SELECT"):
        return pd.DataFrame(), "❌ Only SELECT"
    try:
        # closing() releases the connection; sqlite3's own context manager
        # only commits or rolls back.
        with contextlib.closing(get_db_connection()) as conn:
            started = time.perf_counter()
            df = pd.read_sql_query(statement, conn)
            elapsed = time.perf_counter() - started
    except Exception as e:
        return pd.DataFrame(), f"❌ {e}"
    return df, f"✅ {len(df)} rows in {elapsed:.3f}s"

def get_schema_info():
    """Return the static Markdown for the Schema tab: a heading and a pointer to the logs."""
    heading = "# Schema"
    body = "Check server logs for detailed debugging output."
    return "\n\n".join((heading, body))

# UI
# Three tabs (Profile, SQL, Schema); each has a button wired to one handler below.
with gr.Blocks(title="ConceptNet Debug", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🔍 ConceptNet Debugger")
    gr.Markdown("**Check server logs for comprehensive debugging information!**")
    
    with gr.Tabs():
        # Profile tab: word + language inputs, Markdown output.
        with gr.TabItem("🔍 Profile"):
            with gr.Row():
                word_input = gr.Textbox(label="Word", value="dog")
                lang_input = gr.Dropdown(choices=TARGET_LANGUAGES, value="en", label="Lang")
            profile_btn = gr.Button("Get Profile")
            profile_out = gr.Markdown()
        
        # SQL tab: free-form query box pre-filled with a sample edge/relation JOIN.
        with gr.TabItem("💻 SQL"):
            sql_input = gr.Textbox(
                label="SQL",
                value="SELECT e.*, r.label FROM edge e JOIN relation r ON e.rel_id = r.id WHERE e.start_id LIKE 'http://conceptnet.io/c/en/dog%' LIMIT 10",
                lines=3
            )
            sql_btn = gr.Button("Execute")
            sql_status = gr.Markdown()
            sql_results = gr.DataFrame()
        
        # Schema tab: button that loads a static Markdown note.
        with gr.TabItem("📊 Schema"):
            schema_btn = gr.Button("Load")
            schema_out = gr.Markdown()
    
    # Event wiring: handler, inputs, outputs.
    profile_btn.click(get_semantic_profile, [word_input, lang_input], profile_out)
    # run_raw_query returns (DataFrame, status), matching the output order here.
    sql_btn.click(run_raw_query, sql_input, [sql_results, sql_status])
    schema_btn.click(get_schema_info, None, schema_out)

# Start the Gradio server only when the file is executed as a script.
if __name__ == "__main__":
    log_progress("DEBUG MODE READY", "SUCCESS")
    # NOTE(review): ssr_mode=False presumably turns off Gradio's server-side
    # rendering; confirm against the installed Gradio version.
    demo.launch(ssr_mode=False)