Spaces:

nothingworry
/

IntegraChat

Sleeping

App Files Files Community

nothingworry committed on Nov 20, 2025

Commit

9d50a01

1 Parent(s): 4c04529

improve RAG

Browse files

Files changed (5) hide show

backend/api/mcp_clients/mcp_client.py +25 -3
backend/api/mcp_clients/rag_client.py +11 -1
backend/api/services/tool_selector.py +2 -1
backend/mcp_server/common/database.py +31 -112
backend/mcp_server/rag/search.py +23 -7

backend/api/mcp_clients/mcp_client.py CHANGED Viewed

@@ -10,9 +10,31 @@ class MCPClient:
     client: httpx.AsyncClient = field(default_factory=lambda: httpx.AsyncClient(timeout=30))
-    async def call_rag(self, tenant_id: str, query: str):
-        r = await self.client.post(f"{self.rag_url}/search", json={"tenant_id":tenant_id,"query":query})
-        return r.json()
     async def call_web(self, tenant_id: str, query: str):

     client: httpx.AsyncClient = field(default_factory=lambda: httpx.AsyncClient(timeout=30))
+    async def call_rag(self, tenant_id: str, query: str, threshold: float = 0.3):
+        """
+        Calls the RAG search endpoint and returns the unwrapped results.
+        The MCP server wraps responses in a 'data' field, so we extract it.
+        Uses a lower threshold (0.3) by default to ensure we find relevant results
+        even if semantic similarity is moderate.
+        """
+        r = await self.client.post(
+            f"{self.rag_url}/search",
+            json={
+                "tenant_id": tenant_id,
+                "query": query,
+                "threshold": threshold  # Lower threshold for better recall
+            }
+        )
+        if r.status_code != 200:
+            return {"results": [], "error": f"HTTP {r.status_code}"}
+        data = r.json()
+        # MCP server wraps response in a 'data' field
+        # Extract the actual result data
+        if isinstance(data, dict) and "data" in data:
+            return data["data"]
+        # If not wrapped, return as-is (backward compatibility)
+        return data
     async def call_web(self, tenant_id: str, query: str):

backend/api/mcp_clients/rag_client.py CHANGED Viewed

@@ -19,6 +19,7 @@ class RAGClient:
     async def search(self, query: str, tenant_id: str):
         """
         Sends the query to the RAG server and returns document chunks.
         """
         try:
@@ -35,7 +36,16 @@ class RAGClient:
                 return []
             data = response.json()
-            return data.get("results", [])
         except Exception as e:
             print("RAG Client Error:", e)

     async def search(self, query: str, tenant_id: str):
         """
         Sends the query to the RAG server and returns document chunks.
+        Unwraps MCP server responses automatically.
         """
         try:
                 return []
             data = response.json()
+            if isinstance(data, dict) and data.get("status") == "error":
+                print("RAG Client Error:", data.get("message"))
+                return []
+            if isinstance(data, dict) and "data" in data:
+                payload = data["data"]
+                return payload.get("results", []) if isinstance(payload, dict) else payload
+            return data.get("results", []) if isinstance(data, dict) else data
         except Exception as e:
             print("RAG Client Error:", e)

backend/api/services/tool_selector.py CHANGED Viewed

@@ -37,7 +37,8 @@ class ToolSelector:
         # RAG patterns: internal knowledge, company-specific, documentation
         rag_patterns = [
             r"company", r"internal", r"documentation", r"our ", r"your ",
-            r"knowledge base", r"private", r"internal docs", r"corporate"
         ]
         if rag_has_data or rag_score >= 0.55 or any(re.search(p, msg) for p in rag_patterns):
             needs_rag = True

         # RAG patterns: internal knowledge, company-specific, documentation
         rag_patterns = [
             r"company", r"internal", r"documentation", r"our ", r"your ",
+            r"knowledge base", r"private", r"internal docs", r"corporate",
+            r"admin", r"administrator", r"who is", r"what is"  # Add admin and fact lookup patterns
         ]
         if rag_has_data or rag_score >= 0.55 or any(re.search(p, msg) for p in rag_patterns):
             needs_rag = True

backend/mcp_server/common/database.py CHANGED Viewed

@@ -155,11 +155,11 @@ def search_vectors(tenant_id: str, vector: list, limit: int = 5) -> List[Dict[st
             print("DB SEARCH ERROR: tenant_id is empty")
             return []
-        tenant_id = tenant_id.strip()
         conn = get_connection()
         cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
-        # Query with explicit tenant_id filtering
         cur.execute(
             """
             SELECT
@@ -167,11 +167,11 @@ def search_vectors(tenant_id: str, vector: list, limit: int = 5) -> List[Dict[st
                 tenant_id,
                 1 - (embedding <=> %s::vector(384)) AS similarity
             FROM documents
-            WHERE tenant_id = %s
             ORDER BY embedding <=> %s::vector(384)
             LIMIT %s;
             """,
-            (vector, tenant_id, vector, limit),
         )
         rows = cur.fetchall()
@@ -180,9 +180,9 @@ def search_vectors(tenant_id: str, vector: list, limit: int = 5) -> List[Dict[st
         results: List[Dict[str, Any]] = []
         for row in rows:
             row_tenant_id = row.get("tenant_id", "")
-            if row_tenant_id != tenant_id:
                 print(
-                    f"WARNING: Found document with tenant_id '{row_tenant_id}' when searching for '{tenant_id}' - skipping"
                 )
                 continue
@@ -211,58 +211,35 @@ def list_all_documents(
 ) -> Dict[str, Any]:
     """
     List all documents for a tenant with pagination.
-    Handles tenant_id normalization to match documents stored with different formatting.
     """
     try:
-        # Normalize tenant_id to ensure consistency
         tenant_id_normalized = tenant_id.strip()
         conn = get_connection()
         cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
-        # Get all unique tenant_ids that match when normalized
-        cur.execute("SELECT DISTINCT tenant_id FROM documents;")
-        all_tenant_ids = [row[0] for row in cur.fetchall()]
-        # Find tenant_ids that match when normalized
-        matching_tenant_ids = []
-        for stored_tenant_id in all_tenant_ids:
-            if stored_tenant_id and stored_tenant_id.strip() == tenant_id_normalized:
-                matching_tenant_ids.append(stored_tenant_id)
-        if not matching_tenant_ids:
-            # No matching tenant_ids found
-            cur.close()
-            conn.close()
-            return {"documents": [], "total": 0, "limit": limit, "offset": offset}
-        # Build query to match any of the normalized tenant_ids
-        placeholders = ','.join(['%s'] * len(matching_tenant_ids))
         cur.execute(
-            f"""
             SELECT
                 id,
                 chunk_text,
                 created_at
             FROM documents
-            WHERE tenant_id IN ({placeholders})
             ORDER BY created_at DESC
             LIMIT %s OFFSET %s;
             """,
-            tuple(matching_tenant_ids) + (limit, offset),
         )
         rows = cur.fetchall()
-        # Get total count for all matching tenant_ids
-        placeholders = ','.join(['%s'] * len(matching_tenant_ids))
         cur.execute(
-            f"""
             SELECT COUNT(*) as total
             FROM documents
-            WHERE tenant_id IN ({placeholders});
             """,
-            tuple(matching_tenant_ids),
         )
         total_row = cur.fetchone()
         total = total_row["total"] if total_row else 0
@@ -299,56 +276,24 @@ def delete_document(tenant_id: str, document_id: int) -> bool:
     Returns True if document was deleted, False otherwise.
     """
     try:
-        # Normalize tenant_id to ensure consistency
-        tenant_id = tenant_id.strip()
         conn = get_connection()
         cur = conn.cursor()
-        # First, verify the document exists
         cur.execute(
             """
-            SELECT id, tenant_id FROM documents
-            WHERE id = %s;
             """,
-            (document_id,),
         )
-        doc_row = cur.fetchone()
-        if doc_row is None:
-            print(f"DB DELETE: Document {document_id} does not exist")
-            cur.close()
-            conn.close()
-            return False
-        doc_tenant_id = doc_row[1] if len(doc_row) > 1 else None
-        # Normalize both tenant_ids for comparison (handle existing data with whitespace)
-        doc_tenant_id_normalized = doc_tenant_id.strip() if doc_tenant_id else None
-        tenant_id_normalized = tenant_id.strip()
-        # Try to delete with normalized comparison - if normalized match, use stored value for actual delete
-        if doc_tenant_id_normalized == tenant_id_normalized:
-            # Tenant IDs match after normalization - proceed with delete using stored tenant_id
-            cur.execute(
-                """
-                DELETE FROM documents
-                WHERE id = %s AND tenant_id = %s;
-                """,
-                (document_id, doc_tenant_id),
-            )
-            deleted = cur.rowcount > 0
-        else:
-            # Tenant IDs don't match - log the mismatch
-            print(f"DB DELETE: Document {document_id} belongs to tenant '{doc_tenant_id}' (normalized: '{doc_tenant_id_normalized}'), not '{tenant_id}' (normalized: '{tenant_id_normalized}')")
-            print(f"DB DELETE: Tenant ID lengths - stored: {len(doc_tenant_id) if doc_tenant_id else 0}, requested: {len(tenant_id)}")
-            print(f"DB DELETE: Tenant ID repr - stored: {repr(doc_tenant_id)}, requested: {repr(tenant_id)}")
-            deleted = False
         if deleted:
-            print(f"DB DELETE: Successfully deleted document {document_id} for tenant '{tenant_id}'")
         else:
-            print(f"DB DELETE: Failed to delete document {document_id} for tenant '{tenant_id}' (rowcount: {cur.rowcount})")
         conn.commit()
         cur.close()
         conn.close()
@@ -369,47 +314,21 @@ def delete_all_documents(tenant_id: str) -> int:
     Handles tenant_id normalization to match documents stored with different formatting.
     """
     try:
-        # Normalize tenant_id
-        tenant_id = tenant_id.strip()
         conn = get_connection()
         cur = conn.cursor()
-        # First, get all unique tenant_ids that match when normalized
         cur.execute(
             """
-            SELECT DISTINCT tenant_id FROM documents;
-            """
         )
-        all_tenant_ids = [row[0] for row in cur.fetchall()]
-        # Find tenant_ids that match when normalized
-        matching_tenant_ids = []
-        tenant_id_normalized = tenant_id.strip()
-        for stored_tenant_id in all_tenant_ids:
-            if stored_tenant_id and stored_tenant_id.strip() == tenant_id_normalized:
-                matching_tenant_ids.append(stored_tenant_id)
-        if not matching_tenant_ids:
-            print(f"DB DELETE ALL: No documents found for tenant '{tenant_id}' (normalized: '{tenant_id_normalized}')")
-            cur.close()
-            conn.close()
-            return 0
-        # Delete documents matching any of the normalized tenant_ids
-        deleted_count = 0
-        for matching_tenant_id in matching_tenant_ids:
-            cur.execute(
-                """
-                DELETE FROM documents
-                WHERE tenant_id = %s;
-                """,
-                (matching_tenant_id,),
-            )
-            deleted_count += cur.rowcount
-        print(f"DB DELETE ALL: Deleted {deleted_count} document(s) for tenant '{tenant_id}' (matched {len(matching_tenant_ids)} tenant_id variant(s))")
         conn.commit()
         cur.close()
         conn.close()

             print("DB SEARCH ERROR: tenant_id is empty")
             return []
+        tenant_id_normalized = tenant_id.strip()
         conn = get_connection()
         cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
+        # Query with normalized tenant_id filtering
         cur.execute(
             """
             SELECT
                 tenant_id,
                 1 - (embedding <=> %s::vector(384)) AS similarity
             FROM documents
+            WHERE TRIM(tenant_id) = %s
             ORDER BY embedding <=> %s::vector(384)
             LIMIT %s;
             """,
+            (vector, tenant_id_normalized, vector, limit),
         )
         rows = cur.fetchall()
         results: List[Dict[str, Any]] = []
         for row in rows:
             row_tenant_id = row.get("tenant_id", "")
+            if row_tenant_id and row_tenant_id.strip() != tenant_id_normalized:
                 print(
+                    f"WARNING: Found document with tenant_id '{row_tenant_id}' when searching for '{tenant_id_normalized}' - skipping"
                 )
                 continue
 ) -> Dict[str, Any]:
     """
     List all documents for a tenant with pagination.
+    tenant_id comparison is normalized via TRIM() to handle historical data.
     """
     try:
         tenant_id_normalized = tenant_id.strip()
         conn = get_connection()
         cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
         cur.execute(
+            """
             SELECT
                 id,
                 chunk_text,
                 created_at
             FROM documents
+            WHERE TRIM(tenant_id) = %s
             ORDER BY created_at DESC
             LIMIT %s OFFSET %s;
             """,
+            (tenant_id_normalized, limit, offset),
         )
         rows = cur.fetchall()
         cur.execute(
+            """
             SELECT COUNT(*) as total
             FROM documents
+            WHERE TRIM(tenant_id) = %s;
             """,
+            (tenant_id_normalized,),
         )
         total_row = cur.fetchone()
         total = total_row["total"] if total_row else 0
     Returns True if document was deleted, False otherwise.
     """
     try:
+        tenant_id_normalized = tenant_id.strip()
         conn = get_connection()
         cur = conn.cursor()
         cur.execute(
             """
+            DELETE FROM documents
+            WHERE id = %s AND TRIM(tenant_id) = %s;
             """,
+            (document_id, tenant_id_normalized),
         )
+        deleted = cur.rowcount > 0
         if deleted:
+            print(f"DB DELETE: Deleted document {document_id} for tenant '{tenant_id_normalized}'")
         else:
+            print(f"DB DELETE: Document {document_id} not found for tenant '{tenant_id_normalized}'")
         conn.commit()
         cur.close()
         conn.close()
     Handles tenant_id normalization to match documents stored with different formatting.
     """
     try:
+        tenant_id_normalized = tenant_id.strip()
         conn = get_connection()
         cur = conn.cursor()
         cur.execute(
             """
+            DELETE FROM documents
+            WHERE TRIM(tenant_id) = %s;
+            """,
+            (tenant_id_normalized,),
         )
+        deleted_count = cur.rowcount
+        print(f"DB DELETE ALL: Deleted {deleted_count} document(s) for tenant '{tenant_id_normalized}'")
         conn.commit()
         cur.close()
         conn.close()

backend/mcp_server/rag/search.py CHANGED Viewed

@@ -1,7 +1,7 @@
 from __future__ import annotations
 from statistics import mean
-from typing import Mapping
 from backend.mcp_server.common.database import search_vectors
 from backend.mcp_server.common.embeddings import embed_text
@@ -26,7 +26,7 @@ async def rag_search(context: TenantContext, payload: Mapping[str, Any]) -> dict
     except (TypeError, ValueError):
         raise ToolValidationError("limit must be an integer between 1 and 25")
-    threshold = payload.get("threshold", 0.55)
     try:
         threshold_value = max(0.0, min(float(threshold), 1.0))
     except (TypeError, ValueError):
@@ -34,11 +34,27 @@ async def rag_search(context: TenantContext, payload: Mapping[str, Any]) -> dict
     embedding = embed_text(query)
     raw_results = search_vectors(context.tenant_id, embedding, limit=limit_value)
-    filtered = [
-        {"text": chunk.get("text", ""), "relevance": chunk.get("similarity", 0.0)}
-        for chunk in raw_results
-        if chunk.get("similarity", 0.0) >= threshold_value
-    ][:3]
     hits = len(raw_results)
     avg_score = mean([item.get("similarity", 0.0) for item in raw_results]) if raw_results else None

 from __future__ import annotations
 from statistics import mean
+from typing import Any, Mapping
 from backend.mcp_server.common.database import search_vectors
 from backend.mcp_server.common.embeddings import embed_text
     except (TypeError, ValueError):
         raise ToolValidationError("limit must be an integer between 1 and 25")
+    threshold = payload.get("threshold", 0.3)  # Lower default threshold for better recall
     try:
         threshold_value = max(0.0, min(float(threshold), 1.0))
     except (TypeError, ValueError):
     embedding = embed_text(query)
     raw_results = search_vectors(context.tenant_id, embedding, limit=limit_value)
+    # Return top results even if slightly below threshold, but prioritize high-scoring ones
+    filtered = []
+    for chunk in raw_results:
+        similarity = chunk.get("similarity", 0.0)
+        if similarity >= threshold_value:
+            filtered.append({
+                "text": chunk.get("text", ""),
+                "relevance": similarity,
+                "score": similarity  # Add score field for compatibility
+            })
+    # If we have results above threshold, return top 3. Otherwise, return top 1 even if below threshold.
+    if filtered:
+        filtered = sorted(filtered, key=lambda x: x.get("relevance", 0.0), reverse=True)[:3]
+    elif raw_results:
+        # Return the top result even if below threshold, as it might still be relevant
+        top_chunk = raw_results[0]
+        filtered = [{
+            "text": top_chunk.get("text", ""),
+            "relevance": top_chunk.get("similarity", 0.0),
+            "score": top_chunk.get("similarity", 0.0)
+        }]
     hits = len(raw_results)
     avg_score = mean([item.get("similarity", 0.0) for item in raw_results]) if raw_results else None