Spaces:

AdithyaVardan
/

GodSpeed

Sleeping

App Files Files Community

AdithyaVardan committed on May 8

Commit

1fbfa0e

1 Parent(s): cbf8345

Fix Qdrant search API, add Supabase writes to agent pipelines, pin qdrant-client 1.17.1

Browse files

Files changed (5) hide show

agent/tools/doc_search.py +14 -31
requirements.txt +1 -4
src/confluence_agent/pipeline.py +11 -4
src/file_agent/pipeline.py +24 -10
src/jira_agent/pipeline.py +11 -11

agent/tools/doc_search.py CHANGED Viewed

@@ -144,51 +144,34 @@ async def run_doc_search(query: str, team_id: str) -> list[RetrievedChunk]:
     try:
         client = AsyncQdrantClient(host=settings.qdrant_host, port=settings.qdrant_port)
-        results = await client.search(
             collection_name=settings.qdrant_collection,
-            query_vector=qmodels.NamedVector(
-                name=settings.qdrant_dense_vector_name,
-                vector=dense_vector,
-            ),
-            query_filter=qmodels.Filter(
-                must=[
-                    qmodels.FieldCondition(
-                        key="team_id",
-                        match=qmodels.MatchValue(value=team_id),
-                    )
-                ]
-            ),
             limit=settings.rrf_top_k,
             with_payload=True,
         )
-        for hit in results:
             doc_id = hit.payload.get("chunk_id", str(hit.id))
             qdrant_ranked_ids.append(doc_id)
             qdrant_payload_map[doc_id] = hit.payload
             qdrant_score_map[doc_id] = hit.score
-        sparse_results = await client.search(
             collection_name=settings.qdrant_collection,
-            query_vector=qmodels.NamedSparseVector(
-                name=settings.qdrant_sparse_vector_name,
-                vector=qmodels.SparseVector(
-                    indices=sparse_indices,
-                    values=sparse_values,
-                ),
-            ),
-            query_filter=qmodels.Filter(
-                must=[
-                    qmodels.FieldCondition(
-                        key="team_id",
-                        match=qmodels.MatchValue(value=team_id),
-                    )
-                ]
-            ),
             limit=settings.rrf_top_k,
             with_payload=True,
         )
         sparse_ranked_ids: list[str] = []
-        for hit in sparse_results:
             doc_id = hit.payload.get("chunk_id", str(hit.id))
             sparse_ranked_ids.append(doc_id)
             qdrant_payload_map.setdefault(doc_id, hit.payload)

     try:
         client = AsyncQdrantClient(host=settings.qdrant_host, port=settings.qdrant_port)
+        team_filter = qmodels.Filter(
+            must=[qmodels.FieldCondition(key="team_id", match=qmodels.MatchValue(value=team_id))]
+        )
+        dense_response = await client.query_points(
             collection_name=settings.qdrant_collection,
+            query=dense_vector,
+            using=settings.qdrant_dense_vector_name,
+            query_filter=team_filter,
             limit=settings.rrf_top_k,
             with_payload=True,
         )
+        for hit in dense_response.points:
             doc_id = hit.payload.get("chunk_id", str(hit.id))
             qdrant_ranked_ids.append(doc_id)
             qdrant_payload_map[doc_id] = hit.payload
             qdrant_score_map[doc_id] = hit.score
+        sparse_response = await client.query_points(
             collection_name=settings.qdrant_collection,
+            query=qmodels.SparseVector(indices=sparse_indices, values=sparse_values),
+            using=settings.qdrant_sparse_vector_name,
+            query_filter=team_filter,
             limit=settings.rrf_top_k,
             with_payload=True,
         )
         sparse_ranked_ids: list[str] = []
+        for hit in sparse_response.points:
             doc_id = hit.payload.get("chunk_id", str(hit.id))
             sparse_ranked_ids.append(doc_id)
             qdrant_payload_map.setdefault(doc_id, hit.payload)

requirements.txt CHANGED Viewed

@@ -10,7 +10,7 @@ FlagEmbedding==1.2.11
 gliner==0.2.13
 # Vector database
-qdrant-client==1.12.1
 # BM25
 rank-bm25==0.2.2
@@ -82,6 +82,3 @@ watchdog==6.0.0
 # File agent — OCR fallback for scanned PDFs (optional but recommended)
 pytesseract==0.3.13
 Pillow==11.1.0
-# FastAPI file upload support
-python-multipart==0.0.20

 gliner==0.2.13
 # Vector database
+qdrant-client==1.17.1
 # BM25
 rank-bm25==0.2.2
 # File agent — OCR fallback for scanned PDFs (optional but recommended)
 pytesseract==0.3.13
 Pillow==11.1.0

src/confluence_agent/pipeline.py CHANGED Viewed

@@ -4,6 +4,7 @@ import logging
 from ingestion.pipeline.embedder import embed_chunks
 from ingestion.pipeline.pii_masker import mask_chunks
 from ingestion.storage.qdrant_store import delete_chunks_for_doc, upsert_chunks
 from src.confluence_agent.adapter import ConfluenceAdapter
 from src.confluence_agent.chunker import chunk_confluence_page
@@ -12,6 +13,14 @@ from src.confluence_agent.config import confluence_config
 logger = logging.getLogger(__name__)
 async def ingest_page(page_id: str, space_key: str = "", team_id: str = "") -> int:
     team_id = team_id or confluence_config.team_id
     adapter = ConfluenceAdapter(team_id=team_id)
@@ -32,8 +41,7 @@ async def ingest_page(page_id: str, space_key: str = "", team_id: str = "") -> i
         chunk.text = m
     embedded = embed_chunks(chunks)
-    delete_chunks_for_doc(raw_doc.doc_id)
-    upsert_chunks(embedded)
     logger.info("confluence_pipeline: stored %d chunks for page %s", len(embedded), page_id)
     return len(embedded)
@@ -54,8 +62,7 @@ async def ingest_space(space_key: str, team_id: str = "") -> int:
         for chunk, m in zip(chunks, masked):
             chunk.text = m
         embedded = embed_chunks(chunks)
-        delete_chunks_for_doc(raw_doc.doc_id)
-        upsert_chunks(embedded)
         total += len(embedded)
         logger.info("confluence_pipeline: stored %d chunks for page %s", len(embedded), pid)

 from ingestion.pipeline.embedder import embed_chunks
 from ingestion.pipeline.pii_masker import mask_chunks
+from ingestion.storage import supabase_store
 from ingestion.storage.qdrant_store import delete_chunks_for_doc, upsert_chunks
 from src.confluence_agent.adapter import ConfluenceAdapter
 from src.confluence_agent.chunker import chunk_confluence_page
 logger = logging.getLogger(__name__)
+def _store(raw_doc, embedded):
+    supabase_store.upsert_document(raw_doc)
+    supabase_store.delete_chunks_for_doc(raw_doc.doc_id)
+    supabase_store.upsert_chunks(embedded)
+    delete_chunks_for_doc(raw_doc.doc_id)
+    upsert_chunks(embedded)
 async def ingest_page(page_id: str, space_key: str = "", team_id: str = "") -> int:
     team_id = team_id or confluence_config.team_id
     adapter = ConfluenceAdapter(team_id=team_id)
         chunk.text = m
     embedded = embed_chunks(chunks)
+    _store(raw_doc, embedded)
     logger.info("confluence_pipeline: stored %d chunks for page %s", len(embedded), page_id)
     return len(embedded)
         for chunk, m in zip(chunks, masked):
             chunk.text = m
         embedded = embed_chunks(chunks)
+        _store(raw_doc, embedded)
         total += len(embedded)
         logger.info("confluence_pipeline: stored %d chunks for page %s", len(embedded), pid)

src/file_agent/pipeline.py CHANGED Viewed

@@ -1,10 +1,13 @@
 from __future__ import annotations
 import logging
 from pathlib import Path
 from ingestion.pipeline.embedder import embed_chunks
 from ingestion.pipeline.pii_masker import mask_chunks
 from ingestion.storage.qdrant_store import delete_chunks_for_doc, upsert_chunks
 from src.file_agent.chunker import chunk_file_content
 from src.file_agent.config import file_config
@@ -16,11 +19,15 @@ logger = logging.getLogger(__name__)
 _SUPPORTED_FORMATS = {"pdf", "docx", "xml", "text", "csv", "xlsx", "html"}
 def process_file(file_path: str, team_id: str = "") -> int:
-    """
-    Full pipeline: detect → parse → chunk → PII mask → embed → upsert Qdrant.
-    Returns the number of chunks stored. Raises on fatal errors.
-    """
     team_id = team_id or file_config.team_id
     fmt = detect_format(file_path)
@@ -44,11 +51,18 @@ def process_file(file_path: str, team_id: str = "") -> int:
         chunk.text = m
     embedded = embed_chunks(chunks)
-    # Idempotent: all chunks share the same doc_id derived from file name
-    doc_id = embedded[0].doc_id
-    delete_chunks_for_doc(doc_id)
-    upsert_chunks(embedded)
-    logger.info("file_pipeline: stored %d chunks for %s (format=%s)", len(embedded), Path(file_path).name, fmt)
     return len(embedded)

 from __future__ import annotations
+import hashlib
 import logging
 from pathlib import Path
+from ingestion.models import RawDocument
 from ingestion.pipeline.embedder import embed_chunks
 from ingestion.pipeline.pii_masker import mask_chunks
+from ingestion.storage import supabase_store
 from ingestion.storage.qdrant_store import delete_chunks_for_doc, upsert_chunks
 from src.file_agent.chunker import chunk_file_content
 from src.file_agent.config import file_config
 _SUPPORTED_FORMATS = {"pdf", "docx", "xml", "text", "csv", "xlsx", "html"}
+def _store(raw_doc, embedded):
+    supabase_store.upsert_document(raw_doc)
+    supabase_store.delete_chunks_for_doc(raw_doc.doc_id)
+    supabase_store.upsert_chunks(embedded)
+    delete_chunks_for_doc(raw_doc.doc_id)
+    upsert_chunks(embedded)
 def process_file(file_path: str, team_id: str = "") -> int:
     team_id = team_id or file_config.team_id
     fmt = detect_format(file_path)
         chunk.text = m
     embedded = embed_chunks(chunks)
+    file_name = Path(file_path).name
+    doc_id = hashlib.sha256(f"file:{file_name}".encode()).hexdigest()
+    raw_doc = RawDocument(
+        doc_id=doc_id,
+        title=file_name,
+        content="",
+        source_url=f"file://{Path(file_path).resolve().as_posix()}",
+        source_type="file",
+        team_id=team_id,
+        metadata={"file_name": file_name, "format": fmt},
+    )
+    _store(raw_doc, embedded)
+    logger.info("file_pipeline: stored %d chunks for %s (format=%s)", len(embedded), file_name, fmt)
     return len(embedded)

src/jira_agent/pipeline.py CHANGED Viewed

@@ -4,6 +4,7 @@ import logging
 from ingestion.pipeline.embedder import embed_chunks
 from ingestion.pipeline.pii_masker import mask_chunks
 from ingestion.storage.qdrant_store import delete_chunks_for_doc, upsert_chunks
 from src.jira_agent.adapter import JiraAdapter
 from src.jira_agent.chunker import chunk_jira_issue
@@ -12,11 +13,15 @@ from src.jira_agent.config import jira_config
 logger = logging.getLogger(__name__)
 async def ingest_issue(issue_key: str, team_id: str = "") -> int:
-    """
-    Full pipeline for a single Jira issue.
-    Returns the number of chunks stored.
-    """
     team_id = team_id or jira_config.team_id
     adapter = JiraAdapter(team_id=team_id)
@@ -36,17 +41,13 @@ async def ingest_issue(issue_key: str, team_id: str = "") -> int:
         chunk.text = masked
     embedded = embed_chunks(chunks)
-    # Idempotent: remove old vectors before upserting new ones
-    delete_chunks_for_doc(raw_doc.doc_id)
-    upsert_chunks(embedded)
     logger.info("jira_pipeline: stored %d chunks for %s", len(embedded), issue_key)
     return len(embedded)
 async def ingest_project(project_key: str, team_id: str = "") -> int:
-    """Full sync of all issues in a project. Returns total chunks stored."""
     team_id = team_id or jira_config.team_id
     adapter = JiraAdapter(team_id=team_id)
     docs = await adapter.fetch_all(project_key)
@@ -61,8 +62,7 @@ async def ingest_project(project_key: str, team_id: str = "") -> int:
         for chunk, masked in zip(chunks, masked_texts):
             chunk.text = masked
         embedded = embed_chunks(chunks)
-        delete_chunks_for_doc(raw_doc.doc_id)
-        upsert_chunks(embedded)
         total += len(embedded)
         logger.info("jira_pipeline: stored %d chunks for %s", len(embedded), key)

 from ingestion.pipeline.embedder import embed_chunks
 from ingestion.pipeline.pii_masker import mask_chunks
+from ingestion.storage import supabase_store
 from ingestion.storage.qdrant_store import delete_chunks_for_doc, upsert_chunks
 from src.jira_agent.adapter import JiraAdapter
 from src.jira_agent.chunker import chunk_jira_issue
 logger = logging.getLogger(__name__)
+def _store(raw_doc, embedded):
+    supabase_store.upsert_document(raw_doc)
+    supabase_store.delete_chunks_for_doc(raw_doc.doc_id)
+    supabase_store.upsert_chunks(embedded)
+    delete_chunks_for_doc(raw_doc.doc_id)
+    upsert_chunks(embedded)
 async def ingest_issue(issue_key: str, team_id: str = "") -> int:
     team_id = team_id or jira_config.team_id
     adapter = JiraAdapter(team_id=team_id)
         chunk.text = masked
     embedded = embed_chunks(chunks)
+    _store(raw_doc, embedded)
     logger.info("jira_pipeline: stored %d chunks for %s", len(embedded), issue_key)
     return len(embedded)
 async def ingest_project(project_key: str, team_id: str = "") -> int:
     team_id = team_id or jira_config.team_id
     adapter = JiraAdapter(team_id=team_id)
     docs = await adapter.fetch_all(project_key)
         for chunk, masked in zip(chunks, masked_texts):
             chunk.text = masked
         embedded = embed_chunks(chunks)
+        _store(raw_doc, embedded)
         total += len(embedded)
         logger.info("jira_pipeline: stored %d chunks for %s", len(embedded), key)