# Commit ea856a6 (wu981526092): Security & HF Spaces fixes — enable CSRF,
# auth middleware, persistent storage.
#!/usr/bin/env python
"""
Database initialization and reset utility.
Creates or resets the database with the three required tables:
- knowledge_graphs (with status column)
- entities
- relations
"""
import os
import sys
import sqlite3
import argparse
import logging
import shutil
import time
from dotenv import load_dotenv
# Load environment variables from a local .env file before anything reads them
load_dotenv()

# Configure logging (backend/database/init_db.py -> backend/database/ -> backend/ -> project_root/ -> logs/)
LOG_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), "logs")
os.makedirs(LOG_DIR, exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        # Mirror log output to stdout and to the shared monitoring log file
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(os.path.join(LOG_DIR, 'agent_monitoring.log'))
    ]
)
logger = logging.getLogger(__name__)

# Get the absolute path to the project root directory
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Import DB_URI from central config.
# NOTE(review): imported mid-file, after load_dotenv() — presumably so
# utils.config can see the .env variables. Keep this ordering.
from utils.config import DB_URI

# Extract the filesystem path from a sqlite:/// URI
if DB_URI.startswith("sqlite:///"):
    DB_PATH = DB_URI.replace("sqlite:///", "")
else:
    # Fallback for non-sqlite databases
    DB_PATH = os.path.join(ROOT_DIR, 'datasets/db/agent_monitoring.db')
def confirm_reset():
    """Prompt the operator before a destructive reset; return True iff confirmed."""
    print("\nWARNING: This will DELETE ALL data in the database.")
    print("All knowledge graphs, entities, and relations will be permanently lost.")
    answer = input("Are you sure you want to continue? (yes/no): ").lower()
    return answer == "yes" or answer == "y"
def _ensure_db_directory():
    """Create the directory that will hold the SQLite file, if one is needed."""
    db_dir = os.path.dirname(DB_PATH)
    if not db_dir:  # DB_PATH may be a bare filename (current directory)
        return
    try:
        os.makedirs(db_dir, exist_ok=True)
        logger.info(f"Database directory ensured at: {db_dir}")
    except OSError as e:
        logger.warning(f"Could not create database directory {db_dir}: {e}")
        # In HF Spaces, /data might not be available until Persistent Storage is enabled
        if "/data" in db_dir:
            logger.warning("HF Spaces Persistent Storage may not be enabled. "
                           "Database will be stored in ephemeral storage.")


def _backup_and_delete_db():
    """Back up the existing database file next to it, then delete the original."""
    logger.info(f"Found existing database at {DB_PATH}")
    backup_path = f"{DB_PATH}.backup_{int(time.time())}"
    logger.info(f"Creating backup at {backup_path}")
    shutil.copy2(DB_PATH, backup_path)
    logger.info("Deleting database file")
    os.remove(DB_PATH)
    logger.info("Database file deleted")


def _create_tables(cursor):
    """Create every application table (idempotent: CREATE TABLE IF NOT EXISTS)."""
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS knowledge_graphs (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            filename VARCHAR(255) UNIQUE,
            creation_timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
            update_timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
            creator VARCHAR(255),
            entity_count INTEGER DEFAULT 0,
            relation_count INTEGER DEFAULT 0,
            namespace VARCHAR(255),
            system_name VARCHAR(255),
            system_summary TEXT,
            graph_data TEXT,
            status VARCHAR(50) DEFAULT 'created',
            trace_id VARCHAR(36),
            window_index INTEGER,
            window_total INTEGER,
            window_start_char INTEGER,
            window_end_char INTEGER,
            processing_run_id VARCHAR(255)
        )
    ''')
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS traces (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            trace_id VARCHAR(36) UNIQUE,
            filename VARCHAR(255),
            title VARCHAR(255),
            description TEXT,
            content TEXT,
            content_hash VARCHAR(64),
            upload_timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
            update_timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
            uploader VARCHAR(255),
            trace_type VARCHAR(50),
            trace_source VARCHAR(50),
            character_count INTEGER DEFAULT 0,
            turn_count INTEGER DEFAULT 0,
            status VARCHAR(50) DEFAULT 'uploaded',
            processing_method VARCHAR(50),
            tags TEXT,
            trace_metadata TEXT
        )
    ''')
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS entities (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            graph_id INTEGER,
            entity_id VARCHAR(255),
            type VARCHAR(255),
            name VARCHAR(255),
            properties TEXT,
            knowledge_graph_namespace VARCHAR(255),
            FOREIGN KEY (graph_id) REFERENCES knowledge_graphs(id),
            UNIQUE (graph_id, entity_id)
        )
    ''')
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS relations (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            graph_id INTEGER,
            relation_id VARCHAR(255),
            type VARCHAR(255),
            source_id INTEGER,
            target_id INTEGER,
            properties TEXT,
            knowledge_graph_namespace VARCHAR(255),
            FOREIGN KEY (graph_id) REFERENCES knowledge_graphs(id),
            FOREIGN KEY (source_id) REFERENCES entities(id),
            FOREIGN KEY (target_id) REFERENCES entities(id),
            UNIQUE (graph_id, relation_id)
        )
    ''')
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS prompt_reconstructions (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            knowledge_graph_id INTEGER,
            relation_id VARCHAR(255),
            reconstructed_prompt TEXT,
            dependencies TEXT,
            created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
            updated_at DATETIME DEFAULT CURRENT_TIMESTAMP,
            FOREIGN KEY (knowledge_graph_id) REFERENCES knowledge_graphs(id)
        )
    ''')
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS perturbation_tests (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            knowledge_graph_id INTEGER NOT NULL,
            prompt_reconstruction_id INTEGER NOT NULL,
            relation_id VARCHAR(255) NOT NULL,
            perturbation_type VARCHAR(50) NOT NULL,
            perturbation_set_id VARCHAR(64),
            test_result JSON,
            perturbation_score FLOAT,
            test_metadata JSON,
            created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
            updated_at DATETIME DEFAULT CURRENT_TIMESTAMP,
            FOREIGN KEY (knowledge_graph_id) REFERENCES knowledge_graphs(id),
            FOREIGN KEY (prompt_reconstruction_id) REFERENCES prompt_reconstructions(id)
        )
    ''')
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS observability_connections (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            connection_id VARCHAR(36) UNIQUE,
            platform VARCHAR(50) NOT NULL,
            public_key TEXT NOT NULL,
            secret_key TEXT,
            host VARCHAR(255),
            projects TEXT,
            status VARCHAR(50) DEFAULT 'connected',
            connected_at DATETIME DEFAULT CURRENT_TIMESTAMP,
            last_sync DATETIME,
            created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
            updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
        )
    ''')
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS fetched_traces (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            trace_id VARCHAR(255) NOT NULL,
            name VARCHAR(255) NOT NULL,
            platform VARCHAR(50) NOT NULL,
            project_name VARCHAR(255),
            connection_id VARCHAR(36) NOT NULL,
            data TEXT,
            fetched_at DATETIME DEFAULT CURRENT_TIMESTAMP,
            imported BOOLEAN DEFAULT 0,
            imported_at DATETIME,
            imported_trace_id VARCHAR(36),
            FOREIGN KEY (connection_id) REFERENCES observability_connections(connection_id),
            UNIQUE (trace_id, connection_id)
        )
    ''')
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS causal_analyses (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            knowledge_graph_id INTEGER NOT NULL,
            perturbation_set_id VARCHAR(64) NOT NULL,
            analysis_method VARCHAR(50) NOT NULL,
            analysis_result JSON,
            causal_score FLOAT,
            analysis_metadata JSON,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            FOREIGN KEY (knowledge_graph_id) REFERENCES knowledge_graphs(id)
        )
    ''')


def _create_indexes(cursor):
    """Create lookup indexes for frequently-queried columns (idempotent)."""
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_knowledge_graphs_filename ON knowledge_graphs(filename)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_knowledge_graphs_namespace ON knowledge_graphs(namespace)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_knowledge_graphs_trace_id ON knowledge_graphs(trace_id)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_entities_entity_id ON entities(entity_id)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_relations_relation_id ON relations(relation_id)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_traces_trace_id ON traces(trace_id)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_traces_content_hash ON traces(content_hash)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_traces_filename ON traces(filename)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_traces_status ON traces(status)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_prompt_reconstructions_kgid ON prompt_reconstructions(knowledge_graph_id)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_prompt_reconstructions_relation_id ON prompt_reconstructions(relation_id)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_perturbation_tests_kgid ON perturbation_tests(knowledge_graph_id)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_perturbation_tests_prid ON perturbation_tests(prompt_reconstruction_id)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_perturbation_tests_relation ON perturbation_tests(relation_id)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_perturbation_tests_type ON perturbation_tests(perturbation_type)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_causal_analyses_kgid ON causal_analyses(knowledge_graph_id)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_causal_analyses_method ON causal_analyses(analysis_method)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_causal_analyses_setid ON causal_analyses(perturbation_set_id)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_observability_connections_connection_id ON observability_connections(connection_id)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_observability_connections_platform ON observability_connections(platform)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_fetched_traces_trace_id ON fetched_traces(trace_id)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_fetched_traces_connection_id ON fetched_traces(connection_id)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_fetched_traces_platform ON fetched_traces(platform)')


def _run_migrations(conn, cursor):
    """Apply in-place column migrations for databases created by older versions."""
    # fetched_traces: add project_name column if missing
    cursor.execute("PRAGMA table_info(fetched_traces)")
    fetched_traces_columns = [column[1] for column in cursor.fetchall()]
    if 'project_name' not in fetched_traces_columns:
        logger.info("Adding project_name column to fetched_traces table...")
        cursor.execute("ALTER TABLE fetched_traces ADD COLUMN project_name TEXT")
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_fetched_traces_project_name ON fetched_traces(project_name)')
        conn.commit()
        logger.info("Successfully added project_name column")

    # observability_connections: add projects column if missing
    cursor.execute("PRAGMA table_info(observability_connections)")
    obs_conn_columns = [column[1] for column in cursor.fetchall()]
    if 'projects' not in obs_conn_columns:
        logger.info("Adding projects column to observability_connections table...")
        cursor.execute("ALTER TABLE observability_connections ADD COLUMN projects TEXT")
        conn.commit()
        logger.info("Successfully added projects column")

    # fetched_traces: rename legacy preview_data column to data
    # (RENAME COLUMN requires SQLite >= 3.25)
    if 'preview_data' in fetched_traces_columns and 'data' not in fetched_traces_columns:
        logger.info("Renaming preview_data column to data in fetched_traces table...")
        cursor.execute("ALTER TABLE fetched_traces RENAME COLUMN preview_data TO data")
        conn.commit()
        logger.info("Successfully renamed preview_data column to data")


def _seed_sample_data():
    """Best-effort: insert demo traces/KGs into an empty database via SQLAlchemy."""
    logger.info("Empty database detected, inserting sample data for better user experience...")
    try:
        # Import here to avoid circular imports
        from .sample_data import insert_sample_data
        from . import SessionLocal
        # Use SQLAlchemy session for sample data insertion
        session = SessionLocal()
        try:
            results = insert_sample_data(session, force_insert=True)
            session.commit()
        except Exception:
            session.rollback()
            raise
        finally:
            session.close()
        if results["traces_inserted"] > 0 or results["knowledge_graphs_inserted"] > 0:
            logger.info(f"✅ Sample data inserted successfully: {results['traces_inserted']} traces, {results['knowledge_graphs_inserted']} knowledge graphs")
        elif results["skipped"] > 0:
            logger.info(f"Sample data already exists, skipped {results['skipped']} items")
        if results["errors"]:
            logger.warning(f"Some errors occurred during sample data insertion: {results['errors']}")
    except Exception as e:
        # Non-critical: the schema itself is already in place
        logger.warning(f"Failed to insert sample data (non-critical): {str(e)}")
        logger.info("Database initialization completed without sample data")


def _fix_sample_kgs():
    """Best-effort: repair sample KGs stored with window_index=0 instead of None."""
    logger.info("Existing data found, checking for sample data fixes...")
    try:
        from . import SessionLocal
        from .models import KnowledgeGraph, Trace
        session = SessionLocal()
        try:
            # Find sample traces
            sample_traces = session.query(Trace).filter(
                Trace.trace_source == "sample_data"
            ).all()
            if sample_traces:
                # Find knowledge graphs with incorrect window_index
                trace_ids = [trace.trace_id for trace in sample_traces]
                broken_kgs = session.query(KnowledgeGraph).filter(
                    KnowledgeGraph.trace_id.in_(trace_ids),
                    KnowledgeGraph.window_index == 0,  # Should be None for final KGs
                    KnowledgeGraph.window_total == 1
                ).all()
                if broken_kgs:
                    logger.info(f"Found {len(broken_kgs)} sample KGs that need fixing (window_index=0 -> None)")
                    for kg in broken_kgs:
                        kg.window_index = None  # Fix to make it a final KG
                    session.commit()
                    logger.info(f"✅ Fixed {len(broken_kgs)} sample knowledge graphs to display as final KGs")
                else:
                    logger.info("All sample knowledge graphs are correctly configured")
        except Exception as e:
            session.rollback()
            logger.warning(f"Error checking sample data fixes: {str(e)}")
        finally:
            session.close()
    except Exception as e:
        logger.warning(f"Failed to check sample data fixes (non-critical): {str(e)}")


def init_database(reset=False, force=False):
    """
    Initialize the database with the required tables.

    Args:
        reset: If True, drop and recreate the core tables.
        force: If True, back up and delete the database file completely
               before recreating the schema.
    """
    _ensure_db_directory()

    # Check if a non-empty database file already exists
    db_exists = os.path.exists(DB_PATH) and os.path.getsize(DB_PATH) > 0

    # BUG FIX: this previously required `reset and force`, so running the CLI
    # with `--force` alone confirmed with the user and then silently did
    # nothing. `force` on its own now deletes the file, matching its help text.
    if db_exists and force:
        _backup_and_delete_db()
        db_exists = False

    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()

        # Drop core tables if a soft reset was requested
        if reset and db_exists:
            logger.info("Dropping existing tables")
            cursor.execute("DROP TABLE IF EXISTS relations")
            cursor.execute("DROP TABLE IF EXISTS entities")
            cursor.execute("DROP TABLE IF EXISTS knowledge_graphs")

        logger.info("Creating tables")
        _create_tables(cursor)
        _create_indexes(cursor)
        conn.commit()

        # Handle column migrations for existing tables
        logger.info("Checking for column migrations...")
        _run_migrations(conn, cursor)

        # Verify tables were created
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
        table_names = [row[0] for row in cursor.fetchall()]
        if reset:
            logger.info(f"Database reset completed. Tables created: {table_names}")
        else:
            logger.info(f"Database initialization completed. Tables created: {table_names}")

        # Report row counts so the operator can see what state the DB is in
        cursor.execute("SELECT count(*) FROM knowledge_graphs")
        kg_count = cursor.fetchone()[0]
        cursor.execute("SELECT count(*) FROM entities")
        entity_count = cursor.fetchone()[0]
        cursor.execute("SELECT count(*) FROM relations")
        relation_count = cursor.fetchone()[0]
        trace_count = 0
        if 'traces' in table_names:
            cursor.execute("SELECT count(*) FROM traces")
            trace_count = cursor.fetchone()[0]
        logger.info(f"Database contains: {kg_count} knowledge graphs, {entity_count} entities, {relation_count} relations, {trace_count} traces")

        # Insert sample data for new databases (when trace count is 0),
        # otherwise repair any previously-seeded sample data
        if trace_count == 0:
            _seed_sample_data()
        else:
            _fix_sample_kgs()
    finally:
        # Close even on failure so the SQLite file is not left locked
        conn.close()
def main():
    """Parse CLI arguments, confirm destructive operations, and run them."""
    parser = argparse.ArgumentParser(description='Initialize or reset the database')
    parser.add_argument('--reset', action='store_true',
                        help='Reset the database by dropping and recreating tables')
    parser.add_argument('--force', action='store_true',
                        help='Force reset by deleting the database file')
    args = parser.parse_args()
    try:
        # At most one confirmation prompt: force takes precedence over reset
        if args.force:
            if not confirm_reset():
                print("Database force reset canceled.")
                return 0
        elif args.reset and not confirm_reset():
            print("Database reset canceled.")
            return 0
        init_database(reset=args.reset, force=args.force)
        print("Database operation completed successfully.")
        return 0
    except Exception as e:
        logger.error(f"Error: {str(e)}")
        return 1


if __name__ == "__main__":
    sys.exit(main())