Spaces:
No application file
No application file
This fixed the docker container health errors. just there is no mcp connection still.
Browse files- README.md +4 -4
- agent/main.py +18 -13
- agent/requirements.txt +0 -1
- docker-compose.yml +1 -3
- mcp/core/config.py +13 -14
- mcp/core/database.py +30 -17
- mcp/core/discovery.py +56 -84
- mcp/core/graph.py +64 -136
- mcp/core/intelligence.py +58 -146
- mcp/main.py +81 -139
- streamlit/requirements.txt +0 -1
README.md
CHANGED
|
@@ -45,7 +45,7 @@ This project implements an intelligent, multi-step GraphRAG-powered agent that u
|
|
| 45 |
|
| 46 |
### Prerequisites
|
| 47 |
- Docker & Docker Compose
|
| 48 |
-
-
|
| 49 |
|
| 50 |
### Setup
|
| 51 |
1. **Clone and configure**:
|
|
@@ -55,9 +55,9 @@ This project implements an intelligent, multi-step GraphRAG-powered agent that u
|
|
| 55 |
touch .env
|
| 56 |
```
|
| 57 |
|
| 58 |
-
2. **Add your
|
| 59 |
```
|
| 60 |
-
|
| 61 |
```
|
| 62 |
|
| 63 |
3. **Start the system**:
|
|
@@ -104,7 +104,7 @@ To test the agent's logic directly without the full Docker stack, you can run it
|
|
| 104 |
|
| 105 |
2. **Set your API key**:
|
| 106 |
```bash
|
| 107 |
-
export
|
| 108 |
```
|
| 109 |
|
| 110 |
3. **Run the agent**:
|
|
|
|
| 45 |
|
| 46 |
### Prerequisites
|
| 47 |
- Docker & Docker Compose
|
| 48 |
+
- LLM API key (e.g., for OpenAI)
|
| 49 |
|
| 50 |
### Setup
|
| 51 |
1. **Clone and configure**:
|
|
|
|
| 55 |
touch .env
|
| 56 |
```
|
| 57 |
|
| 58 |
+
2. **Add your LLM API key** to the `.env` file.
|
| 59 |
```
|
| 60 |
+
LLM_API_KEY="sk-your-llm-api-key-here"
|
| 61 |
```
|
| 62 |
|
| 63 |
3. **Start the system**:
|
|
|
|
| 104 |
|
| 105 |
2. **Set your API key**:
|
| 106 |
```bash
|
| 107 |
+
export LLM_API_KEY="sk-your-llm-api-key-here"
|
| 108 |
```
|
| 109 |
|
| 110 |
3. **Run the agent**:
|
agent/main.py
CHANGED
|
@@ -3,17 +3,18 @@ import sys
|
|
| 3 |
import logging
|
| 4 |
import json
|
| 5 |
from typing import Annotated, List, TypedDict
|
|
|
|
| 6 |
from fastapi import FastAPI
|
| 7 |
from pydantic import BaseModel
|
| 8 |
import uvicorn
|
| 9 |
from fastapi.responses import StreamingResponse
|
| 10 |
|
| 11 |
from langchain_core.messages import BaseMessage, ToolMessage, AIMessage
|
| 12 |
-
from langchain_openai import
|
| 13 |
from langgraph.graph import StateGraph, START, END
|
| 14 |
from langgraph.prebuilt import ToolNode
|
| 15 |
|
| 16 |
-
from
|
| 17 |
|
| 18 |
# --- Configuration & Logging ---
|
| 19 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
|
@@ -21,7 +22,7 @@ logger = logging.getLogger(__name__)
|
|
| 21 |
|
| 22 |
MCP_URL = os.getenv("MCP_URL", "http://mcp:8000/mcp")
|
| 23 |
API_KEY = os.getenv("MCP_API_KEY", "dev-key-123")
|
| 24 |
-
|
| 25 |
|
| 26 |
# --- Agent State Definition ---
|
| 27 |
class AgentState(TypedDict):
|
|
@@ -32,10 +33,10 @@ class GraphRAGAgent:
|
|
| 32 |
"""The core agent for handling GraphRAG queries using LangGraph."""
|
| 33 |
|
| 34 |
def __init__(self):
|
| 35 |
-
if not
|
| 36 |
-
raise ValueError("
|
| 37 |
|
| 38 |
-
llm =
|
| 39 |
|
| 40 |
mcp_client = MCPClient(mcp_url=MCP_URL, api_key=API_KEY)
|
| 41 |
tools = [
|
|
@@ -93,21 +94,25 @@ class GraphRAGAgent:
|
|
| 93 |
yield json.dumps({"type": "final_answer", "content": last_message.content}) + "\\n\\n"
|
| 94 |
|
| 95 |
# --- FastAPI Application ---
|
| 96 |
-
app = FastAPI(title="GraphRAG Agent Server")
|
| 97 |
agent = None
|
| 98 |
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
@app.on_event("startup")
|
| 103 |
-
def startup_event():
|
| 104 |
-
"""Initialize the agent on server startup."""
|
| 105 |
global agent
|
|
|
|
| 106 |
try:
|
| 107 |
agent = GraphRAGAgent()
|
| 108 |
logger.info("GraphRAGAgent initialized successfully.")
|
| 109 |
except ValueError as e:
|
| 110 |
logger.error(f"Agent initialization failed: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
|
| 112 |
@app.post("/query")
|
| 113 |
async def execute_query(request: QueryRequest) -> StreamingResponse:
|
|
|
|
| 3 |
import logging
|
| 4 |
import json
|
| 5 |
from typing import Annotated, List, TypedDict
|
| 6 |
+
from contextlib import asynccontextmanager
|
| 7 |
from fastapi import FastAPI
|
| 8 |
from pydantic import BaseModel
|
| 9 |
import uvicorn
|
| 10 |
from fastapi.responses import StreamingResponse
|
| 11 |
|
| 12 |
from langchain_core.messages import BaseMessage, ToolMessage, AIMessage
|
| 13 |
+
from langchain_openai import ChatOpenAI
|
| 14 |
from langgraph.graph import StateGraph, START, END
|
| 15 |
from langgraph.prebuilt import ToolNode
|
| 16 |
|
| 17 |
+
from tools import MCPClient, SchemaSearchTool, JoinPathFinderTool, QueryExecutorTool
|
| 18 |
|
| 19 |
# --- Configuration & Logging ---
|
| 20 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
|
|
|
| 22 |
|
| 23 |
MCP_URL = os.getenv("MCP_URL", "http://mcp:8000/mcp")
|
| 24 |
API_KEY = os.getenv("MCP_API_KEY", "dev-key-123")
|
| 25 |
+
LLM_API_KEY = os.getenv("LLM_API_KEY")
|
| 26 |
|
| 27 |
# --- Agent State Definition ---
|
| 28 |
class AgentState(TypedDict):
|
|
|
|
| 33 |
"""The core agent for handling GraphRAG queries using LangGraph."""
|
| 34 |
|
| 35 |
def __init__(self):
|
| 36 |
+
if not LLM_API_KEY:
|
| 37 |
+
raise ValueError("LLM_API_KEY environment variable not set.")
|
| 38 |
|
| 39 |
+
llm = ChatOpenAI(api_key=LLM_API_KEY, model="gpt-4o-mini", temperature=0, max_retries=1)
|
| 40 |
|
| 41 |
mcp_client = MCPClient(mcp_url=MCP_URL, api_key=API_KEY)
|
| 42 |
tools = [
|
|
|
|
| 94 |
yield json.dumps({"type": "final_answer", "content": last_message.content}) + "\\n\\n"
|
| 95 |
|
| 96 |
# --- FastAPI Application ---
|
|
|
|
| 97 |
agent = None
|
| 98 |
|
| 99 |
+
@asynccontextmanager
|
| 100 |
+
async def lifespan(app: FastAPI):
|
| 101 |
+
"""Handles agent initialization on startup."""
|
|
|
|
|
|
|
|
|
|
| 102 |
global agent
|
| 103 |
+
logger.info("Agent server startup...")
|
| 104 |
try:
|
| 105 |
agent = GraphRAGAgent()
|
| 106 |
logger.info("GraphRAGAgent initialized successfully.")
|
| 107 |
except ValueError as e:
|
| 108 |
logger.error(f"Agent initialization failed: {e}")
|
| 109 |
+
yield
|
| 110 |
+
logger.info("Agent server shutdown.")
|
| 111 |
+
|
| 112 |
+
app = FastAPI(title="GraphRAG Agent Server", lifespan=lifespan)
|
| 113 |
+
|
| 114 |
+
class QueryRequest(BaseModel):
|
| 115 |
+
question: str
|
| 116 |
|
| 117 |
@app.post("/query")
|
| 118 |
async def execute_query(request: QueryRequest) -> StreamingResponse:
|
agent/requirements.txt
CHANGED
|
@@ -1,5 +1,4 @@
|
|
| 1 |
requests
|
| 2 |
-
python-dotenv
|
| 3 |
langchain
|
| 4 |
langchain-openai
|
| 5 |
pydantic
|
|
|
|
| 1 |
requests
|
|
|
|
| 2 |
langchain
|
| 3 |
langchain-openai
|
| 4 |
pydantic
|
docker-compose.yml
CHANGED
|
@@ -1,5 +1,3 @@
|
|
| 1 |
-
version: '3.8'
|
| 2 |
-
|
| 3 |
services:
|
| 4 |
neo4j:
|
| 5 |
build: ./neo4j
|
|
@@ -50,7 +48,7 @@ services:
|
|
| 50 |
- MCP_URL=http://mcp:8000/mcp
|
| 51 |
- MCP_API_KEY=dev-key-123
|
| 52 |
- AGENT_POLL_INTERVAL=${AGENT_POLL_INTERVAL}
|
| 53 |
-
-
|
| 54 |
depends_on:
|
| 55 |
mcp:
|
| 56 |
condition: service_healthy
|
|
|
|
|
|
|
|
|
|
| 1 |
services:
|
| 2 |
neo4j:
|
| 3 |
build: ./neo4j
|
|
|
|
| 48 |
- MCP_URL=http://mcp:8000/mcp
|
| 49 |
- MCP_API_KEY=dev-key-123
|
| 50 |
- AGENT_POLL_INTERVAL=${AGENT_POLL_INTERVAL}
|
| 51 |
+
- LLM_API_KEY=${LLM_API_KEY}
|
| 52 |
depends_on:
|
| 53 |
mcp:
|
| 54 |
condition: service_healthy
|
mcp/core/config.py
CHANGED
|
@@ -1,22 +1,21 @@
|
|
| 1 |
import os
|
| 2 |
|
| 3 |
# --- Neo4j Configuration ---
|
| 4 |
-
NEO4J_URI = os.getenv("NEO4J_BOLT_URL", "bolt://
|
| 5 |
-
NEO4J_USER =
|
| 6 |
-
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "password")
|
| 7 |
|
| 8 |
-
#
|
| 9 |
-
|
|
|
|
|
|
|
| 10 |
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
db_path = os.path.join(SQLITE_DATA_DIR, db_name)
|
| 19 |
-
return f"sqlite:///{db_path}"
|
| 20 |
|
| 21 |
# --- Application Settings ---
|
| 22 |
# You can add other application-wide settings here
|
|
|
|
| 1 |
import os
|
| 2 |
|
| 3 |
# --- Neo4j Configuration ---
|
| 4 |
+
NEO4J_URI = os.getenv("NEO4J_BOLT_URL", "bolt://localhost:7687")
|
| 5 |
+
NEO4J_USER = "neo4j"
|
|
|
|
| 6 |
|
| 7 |
+
# The NEO4J_AUTH env var is in the format 'neo4j/password'
|
| 8 |
+
# We need to extract the password part.
|
| 9 |
+
neo4j_auth = os.getenv("NEO4J_AUTH", "neo4j/password")
|
| 10 |
+
NEO4J_PASSWORD = neo4j_auth.split('/')[1] if '/' in neo4j_auth else neo4j_auth
|
| 11 |
|
| 12 |
+
# --- Database Configuration ---
|
| 13 |
+
# A dictionary of connection strings for the SQLite databases
|
| 14 |
+
DB_CONNECTIONS = {
|
| 15 |
+
"clinical_trials": f"sqlite:////app/data/clinical_trials.db",
|
| 16 |
+
"drug_discovery": f"sqlite:////app/data/drug_discovery.db",
|
| 17 |
+
"laboratory": f"sqlite:////app/data/laboratory.db",
|
| 18 |
+
}
|
|
|
|
|
|
|
| 19 |
|
| 20 |
# --- Application Settings ---
|
| 21 |
# You can add other application-wide settings here
|
mcp/core/database.py
CHANGED
|
@@ -1,26 +1,39 @@
|
|
| 1 |
from sqlalchemy import create_engine
|
| 2 |
from sqlalchemy.engine import Engine
|
| 3 |
import logging
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
logging.basicConfig(level=logging.INFO)
|
| 6 |
logger = logging.getLogger(__name__)
|
| 7 |
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
Creates a SQLAlchemy engine for a given database connection string.
|
| 11 |
-
|
| 12 |
-
Args:
|
| 13 |
-
connection_string: The database connection string.
|
| 14 |
|
| 15 |
-
|
| 16 |
-
A SQLAlchemy Engine instance, or None if connection fails.
|
| 17 |
"""
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from sqlalchemy import create_engine
|
| 2 |
from sqlalchemy.engine import Engine
|
| 3 |
import logging
|
| 4 |
+
from typing import Dict
|
| 5 |
+
|
| 6 |
+
from . import config
|
| 7 |
|
| 8 |
logging.basicConfig(level=logging.INFO)
|
| 9 |
logger = logging.getLogger(__name__)
|
| 10 |
|
| 11 |
+
# A dictionary to hold the initialized SQLAlchemy engines
|
| 12 |
+
_db_engines: Dict[str, Engine] = {}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
+
def get_db_connections() -> Dict[str, Engine]:
|
|
|
|
| 15 |
"""
|
| 16 |
+
Initializes and returns a dictionary of SQLAlchemy engines for all configured databases.
|
| 17 |
+
This function is idempotent.
|
| 18 |
+
"""
|
| 19 |
+
global _db_engines
|
| 20 |
+
if not _db_engines:
|
| 21 |
+
logger.info("Initializing database connections...")
|
| 22 |
+
for db_name, conn_str in config.DB_CONNECTIONS.items():
|
| 23 |
+
try:
|
| 24 |
+
engine = create_engine(conn_str)
|
| 25 |
+
# Test the connection
|
| 26 |
+
with engine.connect():
|
| 27 |
+
logger.info(f"Successfully connected to {db_name}")
|
| 28 |
+
_db_engines[db_name] = engine
|
| 29 |
+
except Exception as e:
|
| 30 |
+
logger.error(f"Failed to connect to {db_name}: {e}")
|
| 31 |
+
return _db_engines
|
| 32 |
+
|
| 33 |
+
def close_db_connections():
|
| 34 |
+
"""Closes all active database connections."""
|
| 35 |
+
global _db_engines
|
| 36 |
+
logger.info("Closing database connections...")
|
| 37 |
+
for engine in _db_engines.values():
|
| 38 |
+
engine.dispose()
|
| 39 |
+
_db_engines = {}
|
mcp/core/discovery.py
CHANGED
|
@@ -1,98 +1,70 @@
|
|
| 1 |
-
from sqlalchemy import inspect
|
| 2 |
from sqlalchemy.engine import Engine
|
| 3 |
from typing import Dict, Any, List
|
| 4 |
import logging
|
| 5 |
-
import
|
| 6 |
-
from concurrent.futures import TimeoutError, ThreadPoolExecutor
|
| 7 |
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
def get_table_schema(inspector, table_name: str) -> Dict[str, Any]:
|
| 11 |
-
"""Extracts schema for a single table."""
|
| 12 |
-
columns = inspector.get_columns(table_name)
|
| 13 |
-
primary_keys = inspector.get_pk_constraint(table_name)['constrained_columns']
|
| 14 |
-
foreign_keys = inspector.get_foreign_keys(table_name)
|
| 15 |
-
|
| 16 |
-
table_schema = {
|
| 17 |
-
"name": table_name,
|
| 18 |
-
"columns": [],
|
| 19 |
-
"primary_keys": primary_keys,
|
| 20 |
-
"foreign_keys": foreign_keys
|
| 21 |
-
}
|
| 22 |
-
|
| 23 |
-
for col in columns:
|
| 24 |
-
table_schema["columns"].append({
|
| 25 |
-
"name": col['name'],
|
| 26 |
-
"type": str(col['type']),
|
| 27 |
-
"nullable": col['nullable'],
|
| 28 |
-
"default": col.get('default'),
|
| 29 |
-
})
|
| 30 |
-
return table_schema
|
| 31 |
|
| 32 |
-
|
| 33 |
-
"""Fetches sample data and distinct values for each column."""
|
| 34 |
-
sample_data = {}
|
| 35 |
-
with engine.connect() as connection:
|
| 36 |
-
# Get row count
|
| 37 |
-
try:
|
| 38 |
-
result = connection.execute(text(f'SELECT COUNT(*) FROM "{table_name}"'))
|
| 39 |
-
sample_data['row_count'] = result.scalar_one()
|
| 40 |
-
except Exception as e:
|
| 41 |
-
logger.warning(f"Could not get row count for table {table_name}: {e}")
|
| 42 |
-
sample_data['row_count'] = -1 # Indicate error or unknown
|
| 43 |
-
|
| 44 |
-
# Get sample rows
|
| 45 |
-
try:
|
| 46 |
-
result = connection.execute(text(f'SELECT * FROM "{table_name}" LIMIT {sample_size}'))
|
| 47 |
-
rows = [dict(row._mapping) for row in result.fetchall()]
|
| 48 |
-
# Attempt to JSON serialize to handle complex types gracefully
|
| 49 |
-
sample_data['sample_rows'] = json.loads(json.dumps(rows, default=str))
|
| 50 |
-
except Exception as e:
|
| 51 |
-
logger.warning(f"Could not get sample rows for table {table_name}: {e}")
|
| 52 |
-
sample_data['sample_rows'] = []
|
| 53 |
-
|
| 54 |
-
return sample_data
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
def discover_schema(engine: Engine, timeout: int = 30) -> Dict[str, Any] | None:
|
| 58 |
-
"""
|
| 59 |
-
Discovers the full schema of a database using SQLAlchemy's inspection API.
|
| 60 |
-
Includes table schemas and sample data.
|
| 61 |
-
"""
|
| 62 |
-
try:
|
| 63 |
-
with ThreadPoolExecutor() as executor:
|
| 64 |
-
future = executor.submit(_discover_schema_task, engine)
|
| 65 |
-
return future.result(timeout=timeout)
|
| 66 |
-
except TimeoutError:
|
| 67 |
-
logger.error(f"Schema discovery for {engine.url.database} timed out after {timeout} seconds.")
|
| 68 |
-
return None
|
| 69 |
-
except Exception as e:
|
| 70 |
-
logger.error(f"An unexpected error occurred during schema discovery for {engine.url.database}: {e}")
|
| 71 |
-
return None
|
| 72 |
|
| 73 |
-
def
|
| 74 |
-
"""
|
| 75 |
inspector = inspect(engine)
|
| 76 |
db_schema = {
|
| 77 |
-
"database_name":
|
| 78 |
-
"dialect": engine.dialect.name,
|
| 79 |
"tables": []
|
| 80 |
}
|
| 81 |
-
|
| 82 |
table_names = inspector.get_table_names()
|
| 83 |
-
|
| 84 |
for table_name in table_names:
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
table_schema.update(sample_info)
|
| 92 |
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
|
|
|
|
|
|
|
|
|
| 97 |
|
| 98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sqlalchemy import inspect
|
| 2 |
from sqlalchemy.engine import Engine
|
| 3 |
from typing import Dict, Any, List
|
| 4 |
import logging
|
| 5 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
|
|
| 6 |
|
| 7 |
+
from .database import get_db_connections
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
+
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
+
def _discover_single_db_schema(db_name: str, engine: Engine) -> Dict[str, Any]:
|
| 12 |
+
"""Discovers the schema for a single database engine."""
|
| 13 |
inspector = inspect(engine)
|
| 14 |
db_schema = {
|
| 15 |
+
"database_name": db_name,
|
|
|
|
| 16 |
"tables": []
|
| 17 |
}
|
|
|
|
| 18 |
table_names = inspector.get_table_names()
|
|
|
|
| 19 |
for table_name in table_names:
|
| 20 |
+
columns = inspector.get_columns(table_name)
|
| 21 |
+
db_schema["tables"].append({
|
| 22 |
+
"name": table_name,
|
| 23 |
+
"columns": [{"name": c['name'], "type": str(c['type'])} for c in columns]
|
| 24 |
+
})
|
| 25 |
+
return db_schema
|
|
|
|
| 26 |
|
| 27 |
+
async def get_relevant_schemas(query: str) -> List[Dict[str, Any]]:
|
| 28 |
+
"""
|
| 29 |
+
Discovers schemas from all connected databases and performs a simple keyword search.
|
| 30 |
+
A more advanced implementation would use embeddings for semantic search.
|
| 31 |
+
"""
|
| 32 |
+
db_engines = get_db_connections()
|
| 33 |
+
all_schemas = []
|
| 34 |
|
| 35 |
+
with ThreadPoolExecutor() as executor:
|
| 36 |
+
# Discover all schemas in parallel
|
| 37 |
+
future_to_db = {executor.submit(_discover_single_db_schema, name, eng): name for name, eng in db_engines.items()}
|
| 38 |
+
for future in as_completed(future_to_db):
|
| 39 |
+
try:
|
| 40 |
+
all_schemas.append(future.result())
|
| 41 |
+
except Exception as e:
|
| 42 |
+
db_name = future_to_db[future]
|
| 43 |
+
logger.error(f"Failed to discover schema for {db_name}: {e}")
|
| 44 |
+
|
| 45 |
+
if not query:
|
| 46 |
+
return all_schemas
|
| 47 |
+
|
| 48 |
+
# Simple keyword filtering
|
| 49 |
+
keywords = query.lower().split()
|
| 50 |
+
relevant_schemas = []
|
| 51 |
+
for db_schema in all_schemas:
|
| 52 |
+
for table in db_schema.get("tables", []):
|
| 53 |
+
if any(keyword in table['name'].lower() for keyword in keywords):
|
| 54 |
+
relevant_schemas.append({
|
| 55 |
+
"database": db_schema["database_name"],
|
| 56 |
+
"table": table['name'],
|
| 57 |
+
"columns": table['columns']
|
| 58 |
+
})
|
| 59 |
+
else:
|
| 60 |
+
for col in table.get("columns", []):
|
| 61 |
+
if any(keyword in col['name'].lower() for keyword in keywords):
|
| 62 |
+
relevant_schemas.append({
|
| 63 |
+
"database": db_schema["database_name"],
|
| 64 |
+
"table": table['name'],
|
| 65 |
+
"columns": table['columns'] # Return full table if a column matches
|
| 66 |
+
})
|
| 67 |
+
break # Move to next table
|
| 68 |
+
|
| 69 |
+
# Deduplicate results (in case multiple keywords match the same table)
|
| 70 |
+
return [dict(t) for t in {tuple(d.items()) for d in relevant_schemas}]
|
mcp/core/graph.py
CHANGED
|
@@ -1,151 +1,79 @@
|
|
| 1 |
-
from neo4j import GraphDatabase
|
| 2 |
import logging
|
| 3 |
-
import json
|
| 4 |
from typing import List, Dict, Any
|
|
|
|
| 5 |
from . import config
|
| 6 |
|
| 7 |
logger = logging.getLogger(__name__)
|
| 8 |
|
| 9 |
-
|
| 10 |
-
def __init__(self):
|
| 11 |
-
self._driver = GraphDatabase.driver(config.NEO4J_URI, auth=(config.NEO4J_USER, config.NEO4J_PASSWORD))
|
| 12 |
-
self.ensure_constraints()
|
| 13 |
|
| 14 |
-
|
| 15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
def import_schema(self, schema_data: dict):
|
| 26 |
-
"""
|
| 27 |
-
Imports a discovered database schema into the Neo4j graph.
|
| 28 |
-
"""
|
| 29 |
-
db_name = schema_data['database_name']
|
| 30 |
-
|
| 31 |
-
with self._driver.session() as session:
|
| 32 |
-
# Create Database node
|
| 33 |
-
session.run("MERGE (d:Database {name: $db_name})", db_name=db_name)
|
| 34 |
-
|
| 35 |
-
for table in schema_data['tables']:
|
| 36 |
-
table_unique_name = f"{db_name}.{table['name']}"
|
| 37 |
-
table_properties = {
|
| 38 |
-
"name": table['name'],
|
| 39 |
-
"unique_name": table_unique_name,
|
| 40 |
-
"row_count": table.get('row_count', -1),
|
| 41 |
-
"sample_rows": json.dumps(table.get('sample_rows', []))
|
| 42 |
-
}
|
| 43 |
-
|
| 44 |
-
# Create Table node and HAS_TABLE relationship
|
| 45 |
-
session.run(
|
| 46 |
-
"""
|
| 47 |
-
MATCH (d:Database {name: $db_name})
|
| 48 |
-
MERGE (t:Table {unique_name: $unique_name})
|
| 49 |
-
ON CREATE SET t += $props
|
| 50 |
-
ON MATCH SET t += $props
|
| 51 |
-
MERGE (d)-[:HAS_TABLE]->(t)
|
| 52 |
-
""",
|
| 53 |
-
db_name=db_name,
|
| 54 |
-
unique_name=table_unique_name,
|
| 55 |
-
props=table_properties
|
| 56 |
-
)
|
| 57 |
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
"default": str(column.get('default')) # Ensure default is string
|
| 66 |
-
}
|
| 67 |
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
)
|
| 81 |
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
constrained_columns = fk['constrained_columns']
|
| 88 |
-
referred_table = fk['referred_table']
|
| 89 |
-
referred_columns = fk['referred_columns']
|
| 90 |
-
|
| 91 |
-
referred_table_unique_name = f"{db_name}.{referred_table}"
|
| 92 |
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
to_col_unique_name = f"{referred_table_unique_name}.{referred_columns[i]}"
|
| 96 |
-
|
| 97 |
-
session.run(
|
| 98 |
-
"""
|
| 99 |
-
MATCH (from_col:Column {unique_name: $from_col})
|
| 100 |
-
MATCH (to_col:Column {unique_name: $to_col})
|
| 101 |
-
MERGE (from_col)-[:REFERENCES]->(to_col)
|
| 102 |
-
""",
|
| 103 |
-
from_col=from_col_unique_name,
|
| 104 |
-
to_col=to_col_unique_name
|
| 105 |
-
)
|
| 106 |
-
logger.info(f"Successfully imported schema for database: {db_name}")
|
| 107 |
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
query = """
|
| 114 |
-
MATCH (start {unique_name: $start_name}), (end {unique_name: $end_name})
|
| 115 |
-
CALL apoc.path.shortestPath(start, end, 'REFERENCES|HAS_COLUMN|HAS_TABLE', {maxLevel: 10}) YIELD path
|
| 116 |
-
RETURN path
|
| 117 |
-
"""
|
| 118 |
-
with self._driver.session() as session:
|
| 119 |
-
result = session.run(query, start_name=start_node_name, end_name=end_node_name)
|
| 120 |
-
# The result is complex, we need to parse it into a user-friendly format.
|
| 121 |
-
# For now, returning the raw path objects.
|
| 122 |
-
return [record["path"] for record in result]
|
| 123 |
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
"""
|
| 138 |
-
with self._driver.session() as session:
|
| 139 |
-
result = session.run(query, keyword=keyword)
|
| 140 |
-
return [record.data() for record in result]
|
| 141 |
|
| 142 |
-
|
| 143 |
-
"
|
| 144 |
-
query = """
|
| 145 |
-
MATCH (t:Table {unique_name: $unique_name})
|
| 146 |
-
RETURN t.row_count AS row_count
|
| 147 |
-
"""
|
| 148 |
-
with self._driver.session() as session:
|
| 149 |
-
result = session.run(query, unique_name=table_unique_name)
|
| 150 |
-
record = result.single()
|
| 151 |
-
return record['row_count'] if record else -1
|
|
|
|
| 1 |
+
from neo4j import GraphDatabase, Driver
|
| 2 |
import logging
|
|
|
|
| 3 |
from typing import List, Dict, Any
|
| 4 |
+
|
| 5 |
from . import config
|
| 6 |
|
| 7 |
logger = logging.getLogger(__name__)
|
| 8 |
|
| 9 |
+
_driver: Driver = None
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
+
def get_graph_driver() -> Driver:
|
| 12 |
+
"""Initializes and returns the singleton Neo4j driver instance."""
|
| 13 |
+
global _driver
|
| 14 |
+
if _driver is None:
|
| 15 |
+
logger.info("Initializing Neo4j driver...")
|
| 16 |
+
_driver = GraphDatabase.driver(config.NEO4J_URI, auth=(config.NEO4J_USER, config.NEO4J_PASSWORD))
|
| 17 |
+
_ensure_constraints(_driver)
|
| 18 |
+
return _driver
|
| 19 |
|
| 20 |
+
def close_graph_driver():
|
| 21 |
+
"""Closes the Neo4j driver connection."""
|
| 22 |
+
global _driver
|
| 23 |
+
if _driver:
|
| 24 |
+
logger.info("Closing Neo4j driver.")
|
| 25 |
+
_driver.close()
|
| 26 |
+
_driver = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
+
def _ensure_constraints(driver: Driver):
|
| 29 |
+
"""Ensure uniqueness constraints are set up in Neo4j."""
|
| 30 |
+
with driver.session() as session:
|
| 31 |
+
session.run("CREATE CONSTRAINT IF NOT EXISTS FOR (d:Database) REQUIRE d.name IS UNIQUE")
|
| 32 |
+
session.run("CREATE CONSTRAINT IF NOT EXISTS FOR (t:Table) REQUIRE t.unique_name IS UNIQUE")
|
| 33 |
+
session.run("CREATE CONSTRAINT IF NOT EXISTS FOR (c:Column) REQUIRE c.unique_name IS UNIQUE")
|
| 34 |
+
logger.info("Neo4j constraints ensured.")
|
|
|
|
|
|
|
| 35 |
|
| 36 |
+
def _keyword_search(keyword: str) -> List[Dict[str, Any]]:
|
| 37 |
+
"""Internal helper to search for table nodes by keyword."""
|
| 38 |
+
driver = get_graph_driver()
|
| 39 |
+
query = """
|
| 40 |
+
MATCH (d:Database)-[:HAS_TABLE]->(t:Table)
|
| 41 |
+
WHERE t.name CONTAINS $keyword
|
| 42 |
+
RETURN d.name as database, t.name as table
|
| 43 |
+
LIMIT 5
|
| 44 |
+
"""
|
| 45 |
+
with driver.session() as session:
|
| 46 |
+
result = session.run(query, keyword=keyword)
|
| 47 |
+
return [record.data() for record in result]
|
|
|
|
| 48 |
|
| 49 |
+
def find_join_path(table1_name: str, table2_name: str) -> str:
|
| 50 |
+
"""
|
| 51 |
+
Finds a human-readable join path between two tables using the graph's schema.
|
| 52 |
+
"""
|
| 53 |
+
driver = get_graph_driver()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
|
| 55 |
+
t1_nodes = _keyword_search(table1_name)
|
| 56 |
+
t2_nodes = _keyword_search(table2_name)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
+
if not t1_nodes: return f"Could not find a table matching '{table1_name}'."
|
| 59 |
+
if not t2_nodes: return f"Could not find a table matching '{table2_name}'."
|
| 60 |
+
|
| 61 |
+
t1_unique_name = f"{t1_nodes[0]['database']}.{t1_nodes[0]['table']}"
|
| 62 |
+
t2_unique_name = f"{t2_nodes[0]['database']}.{t2_nodes[0]['table']}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
|
| 64 |
+
path_query = """
|
| 65 |
+
MATCH (start:Table {unique_name: $start_name}), (end:Table {unique_name: $end_name})
|
| 66 |
+
CALL apoc.path.shortestPath(start, end, 'HAS_COLUMN|REFERENCES|<HAS_COLUMN', {maxLevel: 5}) YIELD path
|
| 67 |
+
WITH [n in nodes(path) | COALESCE(n.name, '')] as path_nodes
|
| 68 |
+
RETURN FILTER(name in path_nodes WHERE name <> '') as path
|
| 69 |
+
LIMIT 1
|
| 70 |
+
"""
|
| 71 |
+
with driver.session() as session:
|
| 72 |
+
result = session.run(path_query, start_name=t1_unique_name, end_name=t2_unique_name)
|
| 73 |
+
record = result.single()
|
| 74 |
+
|
| 75 |
+
if not record or not record["path"]:
|
| 76 |
+
return f"No join path found between {table1_name} and {table2_name}."
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
+
path_str = " -> ".join(record["path"])
|
| 79 |
+
return f"Found path: {path_str}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
mcp/core/intelligence.py
CHANGED
|
@@ -1,161 +1,73 @@
|
|
| 1 |
import sqlparse
|
| 2 |
import logging
|
| 3 |
from typing import List, Dict, Any
|
| 4 |
-
|
| 5 |
-
from .graph import GraphStore
|
| 6 |
-
from .database import get_db_engine
|
| 7 |
-
from . import config
|
| 8 |
from sqlalchemy import text
|
| 9 |
|
| 10 |
-
|
| 11 |
|
| 12 |
-
|
| 13 |
-
ROW_EXECUTION_THRESHOLD = 100 # Execute queries expected to return fewer rows
|
| 14 |
-
JOIN_CARDINALITY_ESTIMATE = 1000 # A simplistic estimate for joins
|
| 15 |
|
| 16 |
-
|
| 17 |
"""
|
| 18 |
-
|
| 19 |
-
|
| 20 |
"""
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
# Format the path for display
|
| 66 |
-
# This is a complex task. The raw path from Neo4j needs careful parsing.
|
| 67 |
-
# This is a placeholder for that logic.
|
| 68 |
-
return f"Path found (details require parsing): {path_result}"
|
| 69 |
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
Executes a SQL query against the appropriate database if the estimated
|
| 73 |
-
cost is below the threshold.
|
| 74 |
-
"""
|
| 75 |
-
cost_estimate = self.estimate_query_cost(sql)
|
| 76 |
|
| 77 |
-
|
| 78 |
-
|
| 79 |
|
| 80 |
-
|
| 81 |
-
# against is a hard problem (especially for federated queries).
|
| 82 |
-
# We assume the first table found belongs to the correct database.
|
| 83 |
-
parsed_sql = self._parse_sql(sql)
|
| 84 |
-
if not parsed_sql['tables']:
|
| 85 |
-
raise ValueError("No tables found in SQL query.")
|
| 86 |
-
|
| 87 |
-
first_table = parsed_sql['tables'][0]
|
| 88 |
-
search_results = self.graph_store.keyword_search(first_table)
|
| 89 |
-
if not search_results:
|
| 90 |
-
raise ValueError(f"Table '{first_table}' not found in any known database.")
|
| 91 |
-
|
| 92 |
-
db_name = search_results[0]['database']
|
| 93 |
-
engine = self._get_engine_for_db(db_name)
|
| 94 |
-
|
| 95 |
-
if not engine:
|
| 96 |
-
raise ConnectionError(f"Could not connect to database: {db_name}")
|
| 97 |
|
|
|
|
| 98 |
with engine.connect() as connection:
|
| 99 |
-
|
| 100 |
-
safe_sql = f"{sql.strip().rstrip(';')} LIMIT {int(limit)}"
|
| 101 |
-
result = connection.execute(text(safe_sql))
|
| 102 |
return [dict(row._mapping) for row in result.fetchall()]
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
parsed = sqlparse.parse(sql)[0]
|
| 107 |
-
# This is a simplistic parser. A real implementation would need
|
| 108 |
-
# a much more robust SQL parsing library to handle complex queries, CTEs, etc.
|
| 109 |
-
tables = set()
|
| 110 |
-
for token in parsed.tokens:
|
| 111 |
-
if isinstance(token, sqlparse.sql.Identifier):
|
| 112 |
-
tables.add(token.get_real_name())
|
| 113 |
-
elif token.is_group:
|
| 114 |
-
# Look for identifiers within subgroups (e.g., in FROM or JOIN clauses)
|
| 115 |
-
for sub_token in token.tokens:
|
| 116 |
-
if isinstance(sub_token, sqlparse.sql.Identifier):
|
| 117 |
-
tables.add(sub_token.get_real_name())
|
| 118 |
-
|
| 119 |
-
return {"tables": list(tables)}
|
| 120 |
-
|
| 121 |
-
def estimate_query_cost(self, sql: str) -> Dict[str, Any]:
|
| 122 |
-
"""
|
| 123 |
-
Estimates the cost of a query based on row counts from the graph.
|
| 124 |
-
"""
|
| 125 |
-
try:
|
| 126 |
-
parsed_sql = self._parse_sql(sql)
|
| 127 |
-
tables_in_query = parsed_sql['tables']
|
| 128 |
-
|
| 129 |
-
if not tables_in_query:
|
| 130 |
-
return {"estimated_rows": 0, "decision": "execute", "message": "No tables found in query."}
|
| 131 |
-
|
| 132 |
-
# For simplicity, we'll take the max row count of any table in the query.
|
| 133 |
-
# A real system would analyze JOINs and WHERE clauses.
|
| 134 |
-
max_rows = 0
|
| 135 |
-
for table_name in tables_in_query:
|
| 136 |
-
# Need to find the unique name. This assumes table names are unique across DBs for now.
|
| 137 |
-
# A real implementation needs context of which DB is being queried.
|
| 138 |
-
search_result = self.graph_store.keyword_search(table_name)
|
| 139 |
-
if search_result:
|
| 140 |
-
table_unique_name = f"{search_result[0]['database']}.{search_result[0]['table']}"
|
| 141 |
-
row_count = self.graph_store.get_table_row_count(table_unique_name)
|
| 142 |
-
if row_count > max_rows:
|
| 143 |
-
max_rows = row_count
|
| 144 |
-
|
| 145 |
-
estimated_rows = max_rows
|
| 146 |
-
# Crude adjustment for joins
|
| 147 |
-
if len(tables_in_query) > 1:
|
| 148 |
-
# A better estimate would involve graph traversal and statistical models
|
| 149 |
-
estimated_rows *= JOIN_CARDINALITY_ESTIMATE * (len(tables_in_query) - 1)
|
| 150 |
-
|
| 151 |
-
decision = "execute" if estimated_rows < ROW_EXECUTION_THRESHOLD else "return_sql"
|
| 152 |
-
|
| 153 |
-
return {
|
| 154 |
-
"estimated_rows": estimated_rows,
|
| 155 |
-
"decision": decision,
|
| 156 |
-
"tables_found": tables_in_query
|
| 157 |
-
}
|
| 158 |
-
|
| 159 |
-
except Exception as e:
|
| 160 |
-
logger.error(f"Error estimating query cost: {e}")
|
| 161 |
-
return {"estimated_rows": -1, "decision": "error", "message": str(e)}
|
|
|
|
| 1 |
import sqlparse
|
| 2 |
import logging
|
| 3 |
from typing import List, Dict, Any
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
from sqlalchemy import text
|
| 5 |
|
| 6 |
+
from .database import get_db_connections
|
| 7 |
|
| 8 |
+
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
| 9 |
|
| 10 |
+
def _get_database_for_table(table_name: str) -> str | None:
|
| 11 |
"""
|
| 12 |
+
Finds which database a table belongs to by checking the graph.
|
| 13 |
+
(This is a simplified helper; assumes GraphStore is accessible or passed)
|
| 14 |
"""
|
| 15 |
+
# This is a placeholder for the logic to find a table's database.
|
| 16 |
+
# In a real scenario, this would query Neo4j. We'll simulate it.
|
| 17 |
+
# A simple mapping for our known databases:
|
| 18 |
+
if table_name in ["studies", "patients", "adverse_events"]:
|
| 19 |
+
return "clinical_trials"
|
| 20 |
+
if table_name in ["lab_tests", "test_results", "biomarkers"]:
|
| 21 |
+
return "laboratory"
|
| 22 |
+
if table_name in ["compounds", "assay_results", "drug_targets", "compound_targets"]:
|
| 23 |
+
return "drug_discovery"
|
| 24 |
+
return None
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
# Keywords that introduce a table reference in a SQL statement.
_TABLE_INTRO_KEYWORDS = {
    "FROM", "JOIN", "INNER JOIN", "CROSS JOIN",
    "LEFT JOIN", "LEFT OUTER JOIN",
    "RIGHT JOIN", "RIGHT OUTER JOIN",
    "FULL JOIN", "FULL OUTER JOIN",
}


def _extract_target_table(statement) -> str | None:
    """Return the first table name referenced after a FROM/JOIN keyword.

    Bug fix: the previous scan returned the first ``Identifier`` anywhere in
    the statement, so for ``SELECT id FROM studies`` it resolved the *column*
    (``id``) as the target table. Restricting the scan to tokens that follow
    a FROM/JOIN keyword yields the actual table.
    """
    from_seen = False
    for token in statement.tokens:
        if token.is_whitespace:
            continue
        if from_seen:
            if isinstance(token, sqlparse.sql.Identifier):
                return token.get_real_name()
            if isinstance(token, sqlparse.sql.IdentifierList):
                # Comma-separated table list: take the first named entry.
                for ident in token.get_identifiers():
                    real = getattr(ident, "get_real_name", lambda: None)()
                    if real:
                        return real
            if token.is_keyword:
                # e.g. WHERE / GROUP BY terminates the table clause.
                from_seen = False
        if token.is_keyword and token.normalized in _TABLE_INTRO_KEYWORDS:
            from_seen = True
    return None


async def execute_federated_query(sql: str) -> List[Dict[str, Any]]:
    """
    Executes a SQL query against the correct SQLite database.

    This is a simplified version of a federated query engine. It identifies
    the target database from the first table referenced in the FROM/JOIN
    clause of the SQL query and routes the statement to that database's
    SQLAlchemy engine.

    Args:
        sql: the SQL statement to execute.

    Returns:
        The result rows as a list of plain dicts.

    Raises:
        ValueError: if no target table can be identified, or the table does
            not belong to any known database.
        ConnectionError: if there is no active engine for the database.
    """
    parsed = sqlparse.parse(sql)[0]
    target_table = _extract_target_table(parsed)

    if not target_table:
        raise ValueError("Could not identify a target table in the SQL query.")

    logger.info(f"Identified target table: {target_table}")

    # Determine which database engine to use
    db_name = _get_database_for_table(target_table)
    if not db_name:
        raise ValueError(f"Table '{target_table}' not found in any known database.")

    db_engines = get_db_connections()
    engine = db_engines.get(db_name)

    if not engine:
        raise ConnectionError(f"No active connection for database '{db_name}'.")

    logger.info(f"Executing query against database: {db_name}")

    # NOTE(review): engine.connect()/execute is synchronous inside an async
    # function — under load this blocks the event loop; consider running it
    # in a thread pool. Behavior kept as-is here.
    try:
        with engine.connect() as connection:
            result = connection.execute(text(sql))
            return [dict(row._mapping) for row in result.fetchall()]
    except Exception as e:
        logger.error(f"Failed to execute query on {db_name}: {e}", exc_info=True)
        raise
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
mcp/main.py
CHANGED
|
@@ -1,146 +1,88 @@
|
|
| 1 |
-
from fastapi import FastAPI, Header, HTTPException
|
| 2 |
-
from
|
| 3 |
import os
|
| 4 |
-
import
|
| 5 |
-
from
|
| 6 |
-
import psycopg2
|
| 7 |
-
from psycopg2.extras import RealDictCursor
|
| 8 |
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
|
|
|
| 14 |
|
| 15 |
-
|
| 16 |
-
|
|
|
|
| 17 |
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
-
|
| 23 |
-
async def
|
| 24 |
-
# Verify API key
|
| 25 |
if x_api_key not in VALID_API_KEYS:
|
| 26 |
-
raise HTTPException(status_code=401, detail="Invalid API
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
return {"error": str(e), "query": query, "parameters": query_params}
|
| 67 |
-
|
| 68 |
-
elif tool == "write_graph":
|
| 69 |
-
# Structured write operation
|
| 70 |
-
action = params.get("action")
|
| 71 |
-
if action == "create_node":
|
| 72 |
-
label = params.get("label")
|
| 73 |
-
properties = params.get("properties", {})
|
| 74 |
-
with driver.session() as session:
|
| 75 |
-
result = session.run(f"CREATE (n:{label} $props) RETURN n", {"props": properties})
|
| 76 |
-
record = result.single()
|
| 77 |
-
if record:
|
| 78 |
-
node = record["n"]
|
| 79 |
-
return {"created": dict(node) if hasattr(node, 'items') else {"id": str(node.id), "labels": list(node.labels), "properties": dict(node)}}
|
| 80 |
-
return {"created": {}}
|
| 81 |
-
|
| 82 |
-
elif tool == "get_next_instruction":
|
| 83 |
-
# Get next pending instruction
|
| 84 |
-
with driver.session() as session:
|
| 85 |
-
result = session.run("""
|
| 86 |
-
MATCH (i:Instruction {status: 'pending'})
|
| 87 |
-
RETURN i ORDER BY i.sequence LIMIT 1
|
| 88 |
-
""")
|
| 89 |
-
record = result.single()
|
| 90 |
-
return {"instruction": dict(record["i"]) if record else None}
|
| 91 |
-
|
| 92 |
-
elif tool == "query_postgres":
|
| 93 |
-
query = params.get("query")
|
| 94 |
-
try:
|
| 95 |
-
conn = psycopg2.connect(POSTGRES_CONN)
|
| 96 |
-
with conn.cursor(cursor_factory=RealDictCursor) as cur:
|
| 97 |
-
cur.execute(query)
|
| 98 |
-
if cur.description: # SELECT query
|
| 99 |
-
results = cur.fetchall()
|
| 100 |
-
return {"data": results, "row_count": len(results)}
|
| 101 |
-
else: # INSERT/UPDATE/DELETE
|
| 102 |
-
conn.commit()
|
| 103 |
-
return {"affected_rows": cur.rowcount}
|
| 104 |
-
except Exception as e:
|
| 105 |
-
return {"error": str(e)}
|
| 106 |
-
finally:
|
| 107 |
-
if 'conn' in locals():
|
| 108 |
-
conn.close()
|
| 109 |
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
WHERE table_schema = 'public'
|
| 119 |
-
AND table_type = 'BASE TABLE'
|
| 120 |
-
""")
|
| 121 |
-
tables = cur.fetchall()
|
| 122 |
-
|
| 123 |
-
schema_info = {}
|
| 124 |
-
for table in tables:
|
| 125 |
-
table_name = table['table_name']
|
| 126 |
-
|
| 127 |
-
# Get columns for each table
|
| 128 |
-
cur.execute("""
|
| 129 |
-
SELECT column_name, data_type, is_nullable,
|
| 130 |
-
column_default, character_maximum_length
|
| 131 |
-
FROM information_schema.columns
|
| 132 |
-
WHERE table_schema = 'public'
|
| 133 |
-
AND table_name = %s
|
| 134 |
-
ORDER BY ordinal_position
|
| 135 |
-
""", (table_name,))
|
| 136 |
-
|
| 137 |
-
schema_info[table_name] = cur.fetchall()
|
| 138 |
-
|
| 139 |
-
return {"schema": schema_info}
|
| 140 |
-
except Exception as e:
|
| 141 |
-
return {"error": str(e)}
|
| 142 |
-
finally:
|
| 143 |
-
if 'conn' in locals():
|
| 144 |
-
conn.close()
|
| 145 |
-
|
| 146 |
-
return {"error": "Unknown tool"}
|
|
|
|
| 1 |
+
from fastapi import FastAPI, Header, HTTPException, Depends
|
| 2 |
+
from typing import List, Dict, Any
|
| 3 |
import os
|
| 4 |
+
import logging
|
| 5 |
+
from pydantic import BaseModel
|
|
|
|
|
|
|
| 6 |
|
| 7 |
+
# --- Core Logic Imports ---
|
| 8 |
+
# These imports assume your project structure places the core logic correctly.
|
| 9 |
+
from core.database import get_db_connections, close_db_connections
|
| 10 |
+
from core.discovery import get_relevant_schemas
|
| 11 |
+
from core.graph import find_join_path, get_graph_driver, close_graph_driver
|
| 12 |
+
from core.intelligence import execute_federated_query
|
| 13 |
|
| 14 |
+
# --- App Configuration ---
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI(title="MCP Server", version="2.0")

# Comma-separated list of accepted API keys. Falls back to a development
# key so the service works out of the box in local/docker environments;
# set MCP_API_KEYS in production.
VALID_API_KEYS = os.getenv("MCP_API_KEYS", "dev-key-123").split(",")
|
| 21 |
+
|
| 22 |
+
# --- Pydantic Models ---
class ToolRequest(BaseModel):
    """Generic tool-invocation payload.

    NOTE(review): not referenced by any endpoint below — presumably kept
    for backward compatibility; confirm before removing.
    """
    tool: str
    params: Dict[str, Any]

class SchemaQuery(BaseModel):
    """Body for POST /mcp/discovery/get_relevant_schemas."""
    query: str

class JoinPathRequest(BaseModel):
    """Body for POST /mcp/graph/find_join_path: the two tables to connect."""
    table1: str
    table2: str

class SQLQuery(BaseModel):
    """Body for POST /mcp/intelligence/execute_query."""
    sql: str
|
| 36 |
|
| 37 |
+
# --- Dependency for Auth ---
|
| 38 |
+
async def verify_api_key(x_api_key: str = Header(...)):
    """FastAPI dependency validating the ``X-API-Key`` request header.

    Returns the key when it is in the configured list; otherwise rejects
    the request with HTTP 401.
    """
    if x_api_key in VALID_API_KEYS:
        return x_api_key
    raise HTTPException(status_code=401, detail="Invalid API Key")
|
| 42 |
+
|
| 43 |
+
# --- Event Handlers ---
|
| 44 |
+
# NOTE(review): @app.on_event is deprecated in newer FastAPI releases in
# favor of lifespan handlers — consider migrating when upgrading FastAPI.
@app.on_event("startup")
async def startup_event():
    """Initializes the database connection pool on server startup."""
    # Eagerly create the SQL engine pool and the graph driver so the first
    # request does not pay the connection cost and misconfiguration fails fast.
    get_db_connections()
    get_graph_driver()
    logger.info("MCP server started and database connections initialized.")

@app.on_event("shutdown")
def shutdown_event():
    """Closes the database connection pool on server shutdown."""
    close_db_connections()
    close_graph_driver()
    logger.info("MCP server shutting down and database connections closed.")
|
| 57 |
+
|
| 58 |
+
# --- API Endpoints ---
|
| 59 |
+
@app.get("/health")
def health_check():
    """Unauthenticated liveness probe (no API-key dependency).

    Suitable as a container healthcheck target since it needs no
    credentials or database access.
    """
    return {"status": "ok"}
|
| 62 |
+
|
| 63 |
+
@app.post("/mcp/discovery/get_relevant_schemas", dependencies=[Depends(verify_api_key)])
async def discover_schemas(request: SchemaQuery):
    """Return the schemas relevant to the supplied query string.

    Any failure in the discovery layer is logged with its traceback and
    surfaced to the caller as HTTP 500.
    """
    try:
        relevant = await get_relevant_schemas(request.query)
    except Exception as e:
        logger.error(f"Schema discovery failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
    return {"status": "success", "schemas": relevant}
|
| 71 |
+
|
| 72 |
+
@app.post("/mcp/graph/find_join_path", dependencies=[Depends(verify_api_key)])
async def get_join_path(request: JoinPathRequest):
    """Find a join path between two tables via the schema graph.

    NOTE(review): find_join_path is invoked synchronously inside this async
    endpoint — if it performs blocking graph I/O it will stall the event
    loop; confirm against core.graph.
    """
    try:
        join_path = find_join_path(request.table1, request.table2)
    except Exception as e:
        logger.error(f"Join path finding failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
    return {"status": "success", "path": join_path}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
|
| 81 |
+
@app.post("/mcp/intelligence/execute_query", dependencies=[Depends(verify_api_key)])
async def execute_query(request: SQLQuery):
    """Execute a SQL statement through the federated query engine.

    Errors are logged with their traceback and reported as HTTP 500.
    """
    try:
        rows = await execute_federated_query(request.sql)
    except Exception as e:
        logger.error(f"Query execution failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
    return {"status": "success", "results": rows}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
streamlit/requirements.txt
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
streamlit==1.28.0
|
| 2 |
requests==2.31.0
|
| 3 |
pandas==2.1.0
|
| 4 |
-
python-dotenv==1.0.0
|
|
|
|
| 1 |
streamlit==1.28.0
|
| 2 |
requests==2.31.0
|
| 3 |
pandas==2.1.0
|
|
|