Spaces:

holistic-ai
/

AgentGraph

Running

File size: 18,961 Bytes

#!/usr/bin/env python
"""
Database initialization and reset utility.
Creates or resets the database with the three required tables:
- knowledge_graphs (with status column)
- entities
- relations
"""

import os
import sys
import sqlite3
import argparse
import logging
import shutil
import time
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Configure logging (backend/database/init_db.py -> backend/database/ -> backend/ -> project_root/ -> logs/)
LOG_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), "logs")
os.makedirs(LOG_DIR, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(os.path.join(LOG_DIR, 'agent_monitoring.log'))
    ]
)

logger = logging.getLogger(__name__)

# Get the absolute path to the project root directory
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Import DB_URI from central config
from utils.config import DB_URI

# Extract path from sqlite URI
if DB_URI.startswith("sqlite:///"):
    DB_PATH = DB_URI.replace("sqlite:///", "")
else:
    # Fallback for non-sqlite databases
    DB_PATH = os.path.join(ROOT_DIR, 'datasets/db/agent_monitoring.db')

def confirm_reset():
    """Ask for user confirmation before force resetting the database."""
    print("\nWARNING: This will DELETE ALL data in the database.")
    print("All knowledge graphs, entities, and relations will be permanently lost.")
    response = input("Are you sure you want to continue? (yes/no): ")
    return response.lower() in ["yes", "y"]

def init_database(reset=False, force=False):
    """
    Initialize the database with the required tables.

    Args:
        reset: If True, drop and recreate the tables
        force: If True, delete the database file completely
    """
    # Make sure the directory exists
    db_dir = os.path.dirname(DB_PATH)
    if db_dir:  # Only try to create if there's a directory path
        try:
            os.makedirs(db_dir, exist_ok=True)
            logger.info(f"Database directory ensured at: {db_dir}")
        except OSError as e:
            logger.warning(f"Could not create database directory {db_dir}: {e}")
            # In HF Spaces, /data might not be available until Persistent Storage is enabled
            if "/data" in db_dir:
                logger.warning("HF Spaces Persistent Storage may not be enabled. "
                             "Database will be stored in ephemeral storage.")
    
    # Check if database exists
    db_exists = os.path.exists(DB_PATH) and os.path.getsize(DB_PATH) > 0
    
    # Handle reset with file deletion if requested
    if db_exists and reset and force:
        logger.info(f"Found existing database at {DB_PATH}")
        
        # Create backup
        backup_path = f"{DB_PATH}.backup_{int(time.time())}"
        logger.info(f"Creating backup at {backup_path}")
        shutil.copy2(DB_PATH, backup_path)
        
        # Delete the database file
        logger.info("Deleting database file")
        os.remove(DB_PATH)
        db_exists = False
        logger.info("Database file deleted")
    
    # Connect to the database
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()
    
    # Drop tables if reset requested
    if reset and db_exists:
        logger.info("Dropping existing tables")
        cursor.execute("DROP TABLE IF EXISTS relations")
        cursor.execute("DROP TABLE IF EXISTS entities")
        cursor.execute("DROP TABLE IF EXISTS knowledge_graphs")
    
    # Create tables
    logger.info("Creating tables")
    cursor.execute('''
    CREATE TABLE IF NOT EXISTS knowledge_graphs (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        filename VARCHAR(255) UNIQUE,
        creation_timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
        update_timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
        creator VARCHAR(255),
        entity_count INTEGER DEFAULT 0,
        relation_count INTEGER DEFAULT 0,
        namespace VARCHAR(255),
        system_name VARCHAR(255),
        system_summary TEXT,
        graph_data TEXT,
        status VARCHAR(50) DEFAULT 'created',
        trace_id VARCHAR(36),
        window_index INTEGER,
        window_total INTEGER,
        window_start_char INTEGER,
        window_end_char INTEGER,
        processing_run_id VARCHAR(255)
    )
    ''')
    
    cursor.execute('''
    CREATE TABLE IF NOT EXISTS traces (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        trace_id VARCHAR(36) UNIQUE,
        filename VARCHAR(255),
        title VARCHAR(255),
        description TEXT,
        content TEXT,
        content_hash VARCHAR(64),
        upload_timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
        update_timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
        uploader VARCHAR(255),
        trace_type VARCHAR(50),
        trace_source VARCHAR(50),
        character_count INTEGER DEFAULT 0,
        turn_count INTEGER DEFAULT 0,
        status VARCHAR(50) DEFAULT 'uploaded',
        processing_method VARCHAR(50),
        tags TEXT,
        trace_metadata TEXT
    )
    ''')
    
    cursor.execute('''
    CREATE TABLE IF NOT EXISTS entities (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        graph_id INTEGER,
        entity_id VARCHAR(255),
        type VARCHAR(255),
        name VARCHAR(255),
        properties TEXT,
        knowledge_graph_namespace VARCHAR(255),
        FOREIGN KEY (graph_id) REFERENCES knowledge_graphs(id),
        UNIQUE (graph_id, entity_id)
    )
    ''')
    
    cursor.execute('''
    CREATE TABLE IF NOT EXISTS relations (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        graph_id INTEGER,
        relation_id VARCHAR(255),
        type VARCHAR(255),
        source_id INTEGER,
        target_id INTEGER,
        properties TEXT,
        knowledge_graph_namespace VARCHAR(255),
        FOREIGN KEY (graph_id) REFERENCES knowledge_graphs(id),
        FOREIGN KEY (source_id) REFERENCES entities(id),
        FOREIGN KEY (target_id) REFERENCES entities(id),
        UNIQUE (graph_id, relation_id)
    )
    ''')
    
    cursor.execute('''
    CREATE TABLE IF NOT EXISTS prompt_reconstructions (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        knowledge_graph_id INTEGER,
        relation_id VARCHAR(255),
        reconstructed_prompt TEXT,
        dependencies TEXT,
        created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
        updated_at DATETIME DEFAULT CURRENT_TIMESTAMP,
        FOREIGN KEY (knowledge_graph_id) REFERENCES knowledge_graphs(id)
    )
    ''')
    
    cursor.execute('''
    CREATE TABLE IF NOT EXISTS perturbation_tests (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        knowledge_graph_id INTEGER NOT NULL,
        prompt_reconstruction_id INTEGER NOT NULL,
        relation_id VARCHAR(255) NOT NULL,
        perturbation_type VARCHAR(50) NOT NULL,
        perturbation_set_id VARCHAR(64),
        test_result JSON,
        perturbation_score FLOAT,
        test_metadata JSON,
        created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
        updated_at DATETIME DEFAULT CURRENT_TIMESTAMP,
        FOREIGN KEY (knowledge_graph_id) REFERENCES knowledge_graphs(id),
        FOREIGN KEY (prompt_reconstruction_id) REFERENCES prompt_reconstructions(id)
    )
    ''')

    cursor.execute('''
    CREATE TABLE IF NOT EXISTS observability_connections (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        connection_id VARCHAR(36) UNIQUE,
        platform VARCHAR(50) NOT NULL,
        public_key TEXT NOT NULL,
        secret_key TEXT,
        host VARCHAR(255),
        projects TEXT,
        status VARCHAR(50) DEFAULT 'connected',
        connected_at DATETIME DEFAULT CURRENT_TIMESTAMP,
        last_sync DATETIME,
        created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
        updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
    )
    ''')
    
    cursor.execute('''
    CREATE TABLE IF NOT EXISTS fetched_traces (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        trace_id VARCHAR(255) NOT NULL,
        name VARCHAR(255) NOT NULL,
        platform VARCHAR(50) NOT NULL,
        project_name VARCHAR(255),
        connection_id VARCHAR(36) NOT NULL,
        data TEXT,
        fetched_at DATETIME DEFAULT CURRENT_TIMESTAMP,
        imported BOOLEAN DEFAULT 0,
        imported_at DATETIME,
        imported_trace_id VARCHAR(36),
        FOREIGN KEY (connection_id) REFERENCES observability_connections(connection_id),
        UNIQUE (trace_id, connection_id)
    )
    ''')

    cursor.execute('''
    CREATE TABLE IF NOT EXISTS causal_analyses (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        knowledge_graph_id INTEGER NOT NULL,
        perturbation_set_id VARCHAR(64) NOT NULL,
        analysis_method VARCHAR(50) NOT NULL,
        analysis_result JSON,
        causal_score FLOAT,
        analysis_metadata JSON,
        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
        updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
        FOREIGN KEY (knowledge_graph_id) REFERENCES knowledge_graphs(id)
    )
    ''')
    

    
    # Create indexes
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_knowledge_graphs_filename ON knowledge_graphs(filename)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_knowledge_graphs_namespace ON knowledge_graphs(namespace)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_knowledge_graphs_trace_id ON knowledge_graphs(trace_id)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_entities_entity_id ON entities(entity_id)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_relations_relation_id ON relations(relation_id)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_traces_trace_id ON traces(trace_id)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_traces_content_hash ON traces(content_hash)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_traces_filename ON traces(filename)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_traces_status ON traces(status)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_prompt_reconstructions_kgid ON prompt_reconstructions(knowledge_graph_id)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_prompt_reconstructions_relation_id ON prompt_reconstructions(relation_id)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_perturbation_tests_kgid ON perturbation_tests(knowledge_graph_id)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_perturbation_tests_prid ON perturbation_tests(prompt_reconstruction_id)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_perturbation_tests_relation ON perturbation_tests(relation_id)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_perturbation_tests_type ON perturbation_tests(perturbation_type)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_causal_analyses_kgid ON causal_analyses(knowledge_graph_id)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_causal_analyses_method ON causal_analyses(analysis_method)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_causal_analyses_setid ON causal_analyses(perturbation_set_id)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_observability_connections_connection_id ON observability_connections(connection_id)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_observability_connections_platform ON observability_connections(platform)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_fetched_traces_trace_id ON fetched_traces(trace_id)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_fetched_traces_connection_id ON fetched_traces(connection_id)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_fetched_traces_platform ON fetched_traces(platform)')
    
    # Commit changes
    conn.commit()
    
    # Handle column migrations for existing tables
    logger.info("Checking for column migrations...")
    
    # Check if project_name column exists in fetched_traces
    cursor.execute("PRAGMA table_info(fetched_traces)")
    fetched_traces_columns = [column[1] for column in cursor.fetchall()]
    
    if 'project_name' not in fetched_traces_columns:
        logger.info("Adding project_name column to fetched_traces table...")
        cursor.execute("ALTER TABLE fetched_traces ADD COLUMN project_name TEXT")
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_fetched_traces_project_name ON fetched_traces(project_name)')
        conn.commit()
        logger.info("Successfully added project_name column")
    
    # Check if projects column exists in observability_connections
    cursor.execute("PRAGMA table_info(observability_connections)")
    obs_conn_columns = [column[1] for column in cursor.fetchall()]
    
    if 'projects' not in obs_conn_columns:
        logger.info("Adding projects column to observability_connections table...")
        cursor.execute("ALTER TABLE observability_connections ADD COLUMN projects TEXT")
        conn.commit()
        logger.info("Successfully added projects column")
    
    # Check if we need to rename preview_data to data in fetched_traces
    if 'preview_data' in fetched_traces_columns and 'data' not in fetched_traces_columns:
        logger.info("Renaming preview_data column to data in fetched_traces table...")
        cursor.execute("ALTER TABLE fetched_traces RENAME COLUMN preview_data TO data")
        conn.commit()
        logger.info("Successfully renamed preview_data column to data")
    
    # Verify tables were created
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
    tables = cursor.fetchall()
    table_names = [table[0] for table in tables]
    
    # Report results
    if reset:
        logger.info(f"Database reset completed. Tables created: {table_names}")
    else:
        logger.info(f"Database initialization completed. Tables created: {table_names}")
    
    # Check if the tables are empty
    cursor.execute("SELECT count(*) FROM knowledge_graphs")
    kg_count = cursor.fetchone()[0]
    cursor.execute("SELECT count(*) FROM entities")
    entity_count = cursor.fetchone()[0]
    cursor.execute("SELECT count(*) FROM relations")
    relation_count = cursor.fetchone()[0]
    
    trace_count = 0
    if 'traces' in table_names:
        cursor.execute("SELECT count(*) FROM traces")
        trace_count = cursor.fetchone()[0]
    
    logger.info(f"Database contains: {kg_count} knowledge graphs, {entity_count} entities, {relation_count} relations, {trace_count} traces")
    
    # Insert sample data for new databases (when trace count is 0)
    if trace_count == 0:
        logger.info("Empty database detected, inserting sample data for better user experience...")
        try:
            # Import here to avoid circular imports
            from .sample_data import insert_sample_data
            from . import SessionLocal
            
            # Use SQLAlchemy session for sample data insertion
            session = SessionLocal()
            try:
                results = insert_sample_data(session, force_insert=True)
                session.commit()
            except Exception as e:
                session.rollback()
                raise
            finally:
                session.close()
                
            if results["traces_inserted"] > 0 or results["knowledge_graphs_inserted"] > 0:
                logger.info(f"✅ Sample data inserted successfully: {results['traces_inserted']} traces, {results['knowledge_graphs_inserted']} knowledge graphs")
            elif results["skipped"] > 0:
                logger.info(f"Sample data already exists, skipped {results['skipped']} items")
            
            if results["errors"]:
                logger.warning(f"Some errors occurred during sample data insertion: {results['errors']}")
                
        except Exception as e:
            logger.warning(f"Failed to insert sample data (non-critical): {str(e)}")
            logger.info("Database initialization completed without sample data")
    else:
        # Check and fix existing sample knowledge graphs if needed
        logger.info("Existing data found, checking for sample data fixes...")
        try:
            from . import SessionLocal
            from .models import KnowledgeGraph, Trace
            
            session = SessionLocal()
            try:
                # Find sample traces
                sample_traces = session.query(Trace).filter(
                    Trace.trace_source == "sample_data"
                ).all()
                
                if sample_traces:
                    # Find knowledge graphs with incorrect window_index
                    trace_ids = [trace.trace_id for trace in sample_traces]
                    broken_kgs = session.query(KnowledgeGraph).filter(
                        KnowledgeGraph.trace_id.in_(trace_ids),
                        KnowledgeGraph.window_index == 0,  # Should be None for final KGs
                        KnowledgeGraph.window_total == 1
                    ).all()
                    
                    if broken_kgs:
                        logger.info(f"Found {len(broken_kgs)} sample KGs that need fixing (window_index=0 -> None)")
                        for kg in broken_kgs:
                            kg.window_index = None  # Fix to make it a final KG
                        session.commit()
                        logger.info(f"✅ Fixed {len(broken_kgs)} sample knowledge graphs to display as final KGs")
                    else:
                        logger.info("All sample knowledge graphs are correctly configured")
                        
            except Exception as e:
                session.rollback()
                logger.warning(f"Error checking sample data fixes: {str(e)}")
            finally:
                session.close()
                
        except Exception as e:
            logger.warning(f"Failed to check sample data fixes (non-critical): {str(e)}")
    
    # Close connection
    conn.close()

def main():
    """Parse arguments and initialize database."""
    parser = argparse.ArgumentParser(description='Initialize or reset the database')
    parser.add_argument('--reset', action='store_true', help='Reset the database by dropping and recreating tables')
    parser.add_argument('--force', action='store_true', help='Force reset by deleting the database file')
    args = parser.parse_args()
    
    try:
        if args.reset and not args.force and not confirm_reset():
            print("Database reset canceled.")
            return 0
            
        if args.force and not confirm_reset():
            print("Database force reset canceled.")
            return 0
        
        init_database(reset=args.reset, force=args.force)
        print("Database operation completed successfully.")
        return 0
    except Exception as e:
        logger.error(f"Error: {str(e)}")
        return 1

if __name__ == "__main__":
    sys.exit(main())