[NOTICKET] add updated_at metadata & prevent duplicates when adding new embeddings

#10
by rhbt6767 - opened
src/pipeline/db_pipeline/db_pipeline_service.py CHANGED
@@ -10,12 +10,14 @@ async vector writes stay on the event loop.
10
 
11
  import asyncio
12
  from contextlib import contextmanager
 
13
  from typing import Any, Iterator, Optional
14
 
15
  from langchain_core.documents import Document as LangChainDocument
16
- from sqlalchemy import URL, create_engine
17
  from sqlalchemy.engine import Engine
18
 
 
19
  from src.db.postgres.vector_store import get_vector_store
20
  from src.middlewares.logging import get_logger
21
  from src.models.credentials import DbType
@@ -146,7 +148,7 @@ class DbPipelineService:
146
  engine.dispose()
147
 
148
  def _to_document(
149
- self, user_id: str, table_name: str, entry: dict
150
  ) -> LangChainDocument:
151
  col = entry["col"]
152
  return LangChainDocument(
@@ -154,6 +156,7 @@ class DbPipelineService:
154
  metadata={
155
  "user_id": user_id,
156
  "source_type": "database",
 
157
  "data": {
158
  "table_name": table_name,
159
  "column_name": col["name"],
@@ -178,13 +181,28 @@ class DbPipelineService:
178
  vector_store = get_vector_store()
179
  logger.info("db pipeline start", user_id=user_id)
180
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  schema = await asyncio.to_thread(get_schema, engine, exclude_tables)
182
 
 
183
  total = 0
184
  for table_name, columns in schema.items():
185
  logger.info("profiling table", table=table_name, columns=len(columns))
186
  entries = await asyncio.to_thread(profile_table, engine, table_name, columns)
187
- docs = [self._to_document(user_id, table_name, e) for e in entries]
188
  if docs:
189
  await vector_store.aadd_documents(docs)
190
  total += len(docs)
 
10
 
11
  import asyncio
12
  from contextlib import contextmanager
13
+ from datetime import datetime, timezone, timedelta
14
  from typing import Any, Iterator, Optional
15
 
16
  from langchain_core.documents import Document as LangChainDocument
17
+ from sqlalchemy import URL, create_engine, text
18
  from sqlalchemy.engine import Engine
19
 
20
+ from src.db.postgres.connection import _pgvector_engine
21
  from src.db.postgres.vector_store import get_vector_store
22
  from src.middlewares.logging import get_logger
23
  from src.models.credentials import DbType
 
148
  engine.dispose()
149
 
150
  def _to_document(
151
+ self, user_id: str, table_name: str, entry: dict, updated_at: str
152
  ) -> LangChainDocument:
153
  col = entry["col"]
154
  return LangChainDocument(
 
156
  metadata={
157
  "user_id": user_id,
158
  "source_type": "database",
159
+ "updated_at": updated_at,
160
  "data": {
161
  "table_name": table_name,
162
  "column_name": col["name"],
 
181
  vector_store = get_vector_store()
182
  logger.info("db pipeline start", user_id=user_id)
183
 
184
+ async with _pgvector_engine.begin() as conn:
185
+ result = await conn.execute(
186
+ text(
187
+ "DELETE FROM langchain_pg_embedding "
188
+ "WHERE cmetadata->>'user_id' = :user_id "
189
+ " AND cmetadata->>'source_type' = 'database' "
190
+ " AND collection_id = ("
191
+ " SELECT uuid FROM langchain_pg_collection WHERE name = 'document_embeddings'"
192
+ " )"
193
+ ),
194
+ {"user_id": user_id},
195
+ )
196
+ logger.info("cleared old db embeddings", user_id=user_id, deleted=result.rowcount)
197
+
198
  schema = await asyncio.to_thread(get_schema, engine, exclude_tables)
199
 
200
+ updated_at = datetime.now(timezone(timedelta(hours=7))).isoformat()
201
  total = 0
202
  for table_name, columns in schema.items():
203
  logger.info("profiling table", table=table_name, columns=len(columns))
204
  entries = await asyncio.to_thread(profile_table, engine, table_name, columns)
205
+ docs = [self._to_document(user_id, table_name, e, updated_at) for e in entries]
206
  if docs:
207
  await vector_store.aadd_documents(docs)
208
  total += len(docs)