Spaces:

DataEyond
/

Agentic-Service-Data-Eyond

Paused

App Files Community

Rifqi Hafizuddin committed on Apr 15

Commit

e13a901

1 Parent(s): d913315

[NOTICKET][DB] menyesuaikan format struktur db_pipeline sesuai dengan file lain

Browse files

Files changed (4) hide show

src/pipeline/db_pipeline/__init__.py +2 -2
src/pipeline/db_pipeline/connector.py +0 -74
src/pipeline/db_pipeline/db_pipeline.py +0 -68
src/pipeline/db_pipeline/db_pipeline_service.py +148 -0

src/pipeline/db_pipeline/__init__.py CHANGED Viewed

@@ -1,3 +1,3 @@
-from src.pipeline.db_pipeline.pipeline import run_db_pipeline
-__all__ = ["run_db_pipeline"]


1	+ from src.pipeline.db_pipeline.db_pipeline_service import DbPipelineService, db_pipeline_service
2
3	+ __all__ = ["DbPipelineService", "db_pipeline_service"]

src/pipeline/db_pipeline/connector.py DELETED Viewed

@@ -1,74 +0,0 @@
-"""Connectors for user-provided databases.
-The pipeline does not own user credentials — an API layer (outside this folder)
-builds an Engine via `connect(...)` and passes it to `run_db_pipeline`. Use
-`engine_scope(...)` for guaranteed disposal of the connection pool.
-"""
-from contextlib import contextmanager
-from typing import Iterator, Literal
-from sqlalchemy import URL, create_engine
-from sqlalchemy.engine import Engine
-from src.middlewares.logging import get_logger
-logger = get_logger("db_connector")
-DbType = Literal["postgresql", "mysql", "sqlserver"]
-def get_postgres_engine(
-    host: str, port: int, dbname: str, username: str, password: str
-) -> Engine:
-    """Build a Postgres engine with safe URL escaping (handles special chars in password)."""
-    url = URL.create(
-        drivername="postgresql+psycopg2",
-        username=username,
-        password=password,
-        host=host,
-        port=port,
-        database=dbname,
-    )
-    return create_engine(url)
-def connect(
-    db_type: DbType,
-    host: str,
-    port: int,
-    dbname: str,
-    username: str,
-    password: str,
-) -> Engine:
-    """Connect to a user-provided database. Returns a SQLAlchemy engine."""
-    logger.info("connecting to user db", db_type=db_type, host=host, port=port, dbname=dbname)
-    if db_type == "postgresql":
-        return get_postgres_engine(host, port, dbname, username, password)
-    elif db_type == "sqlserver":
-        raise NotImplementedError("SQL Server support coming soon")
-    elif db_type == "mysql":
-        raise NotImplementedError("MySQL support coming soon")
-    else:
-        raise ValueError(f"Unsupported db_type: {db_type}")
-@contextmanager
-def engine_scope(
-    db_type: DbType,
-    host: str,
-    port: int,
-    dbname: str,
-    username: str,
-    password: str,
-) -> Iterator[Engine]:
-    """Yield a connected Engine and dispose its pool on exit.
-    API callers should prefer this over raw `connect(...)` so user DB
-    connection pools do not leak between pipeline runs.
-    """
-    engine = connect(db_type, host, port, dbname, username, password)
-    try:
-        yield engine
-    finally:
-        engine.dispose()

src/pipeline/db_pipeline/db_pipeline.py DELETED Viewed

@@ -1,68 +0,0 @@
-"""End-to-end DB ingestion pipeline: introspect user's DB -> profile columns ->
-build text -> embed + store in the shared PGVector collection.
-Each column becomes one LangChainDocument with metadata tagging user_id and
-source_type='database', so it is retrievable via the existing retriever.
-"""
-import asyncio
-from typing import Optional
-from langchain_core.documents import Document as LangChainDocument
-from sqlalchemy.engine import Engine
-from src.db.postgres.vector_store import get_vector_store
-from src.middlewares.logging import get_logger
-from src.pipeline.db_pipeline.extractor import get_schema, profile_table
-logger = get_logger("db_pipeline")
-def _to_document(user_id: str, table_name: str, entry: dict) -> LangChainDocument:
-    col = entry["col"]
-    return LangChainDocument(
-        page_content=entry["text"],
-        metadata={
-            "user_id": user_id,
-            "source_type": "database",
-            "data": {
-                "table_name": table_name,
-                "column_name": col["name"],
-                "column_type": col["type"],
-                "is_primary_key": col.get("is_primary_key", False),
-                "foreign_key": col.get("foreign_key"),
-            },
-        },
-    )
-async def run_db_pipeline(
-    user_id: str,
-    engine: Engine,
-    exclude_tables: Optional[frozenset[str]] = None,
-) -> int:
-    """Introspect the user's DB, profile columns, embed descriptions, store in PGVector.
-    Sync DB work (SQLAlchemy inspect, pandas read_sql) runs in a threadpool;
-    async vector writes stay on the event loop.
-    Returns:
-        Total number of chunks ingested.
-    """
-    vector_store = get_vector_store()
-    logger.info("db pipeline start", user_id=user_id)
-    schema = await asyncio.to_thread(get_schema, engine, exclude_tables)
-    total = 0
-    for table_name, columns in schema.items():
-        logger.info("profiling table", table=table_name, columns=len(columns))
-        entries = await asyncio.to_thread(profile_table, engine, table_name, columns)
-        docs = [_to_document(user_id, table_name, e) for e in entries]
-        if docs:
-            await vector_store.aadd_documents(docs)
-            total += len(docs)
-            logger.info("ingested chunks", table=table_name, count=len(docs))
-    logger.info("db pipeline complete", user_id=user_id, total=total)
-    return total

src/pipeline/db_pipeline/db_pipeline_service.py ADDED Viewed

	@@ -0,0 +1,148 @@

+"""Service for ingesting a user's external database into the vector store.
+End-to-end flow: connect -> introspect schema -> profile columns -> build text
+-> embed + store in the shared PGVector collection (tagged with
+`source_type="database"`, retrievable via the same retriever used for docs).
+Sync DB work (SQLAlchemy inspect, pandas read_sql) runs in a threadpool;
+async vector writes stay on the event loop.
+"""
+import asyncio
+from contextlib import contextmanager
+from typing import Iterator, Optional
+from langchain_core.documents import Document as LangChainDocument
+from sqlalchemy import URL, create_engine
+from sqlalchemy.engine import Engine
+from src.db.postgres.vector_store import get_vector_store
+from src.middlewares.logging import get_logger
+from src.models.credentials import DbType
+from src.pipeline.db_pipeline.extractor import get_schema, profile_table
+logger = get_logger("db_pipeline")
+class DbPipelineService:
+    """End-to-end DB ingestion: connect -> introspect -> profile -> embed -> store."""
+    def connect(
+        self,
+        db_type: DbType,
+        host: str,
+        port: int,
+        database: str,
+        username: str,
+        password: str,
+        ssl_mode: Optional[str] = None,
+    ) -> Engine:
+        """Build a SQLAlchemy engine for the user's database.
+        Supabase aliases to the Postgres driver (same URL shape). Other
+        engines raise NotImplementedError until their connector is added.
+        `ssl_mode` maps to libpq's `sslmode` query param for postgres/supabase
+        (required for managed DBs like Neon/Supabase: "require", "verify-ca",
+        "verify-full"). Ignored for other db_types until those connectors land.
+        """
+        logger.info(
+            "connecting to user db", db_type=db_type, host=host, port=port, database=database
+        )
+        if db_type in ("postgres", "supabase"):
+            query = {"sslmode": ssl_mode} if ssl_mode else {}
+            url = URL.create(
+                drivername="postgresql+psycopg2",
+                username=username,
+                password=password,
+                host=host,
+                port=port,
+                database=database,
+                query=query,
+            )
+            return create_engine(url)
+        elif db_type == "mysql":
+            raise NotImplementedError("MySQL support coming soon")
+        elif db_type == "sqlserver":
+            raise NotImplementedError("SQL Server support coming soon")
+        elif db_type == "bigquery":
+            raise NotImplementedError("BigQuery support coming soon")
+        elif db_type == "snowflake":
+            raise NotImplementedError("Snowflake support coming soon")
+        else:
+            raise ValueError(f"Unsupported db_type: {db_type}")
+    @contextmanager
+    def engine_scope(
+        self,
+        db_type: DbType,
+        host: str,
+        port: int,
+        database: str,
+        username: str,
+        password: str,
+        ssl_mode: Optional[str] = None,
+    ) -> Iterator[Engine]:
+        """Yield a connected Engine and dispose its pool on exit.
+        API callers should prefer this over raw `connect(...)` so user DB
+        connection pools do not leak between pipeline runs.
+        """
+        engine = self.connect(
+            db_type, host, port, database, username, password, ssl_mode
+        )
+        try:
+            yield engine
+        finally:
+            engine.dispose()
+    def _to_document(
+        self, user_id: str, table_name: str, entry: dict
+    ) -> LangChainDocument:
+        col = entry["col"]
+        return LangChainDocument(
+            page_content=entry["text"],
+            metadata={
+                "user_id": user_id,
+                "source_type": "database",
+                "data": {
+                    "table_name": table_name,
+                    "column_name": col["name"],
+                    "column_type": col["type"],
+                    "is_primary_key": col.get("is_primary_key", False),
+                    "foreign_key": col.get("foreign_key"),
+                },
+            },
+        )
+    async def run(
+        self,
+        user_id: str,
+        engine: Engine,
+        exclude_tables: Optional[frozenset[str]] = None,
+    ) -> int:
+        """Introspect the user's DB, profile columns, embed descriptions, store in PGVector.
+        Returns:
+            Total number of chunks ingested.
+        """
+        vector_store = get_vector_store()
+        logger.info("db pipeline start", user_id=user_id)
+        schema = await asyncio.to_thread(get_schema, engine, exclude_tables)
+        total = 0
+        for table_name, columns in schema.items():
+            logger.info("profiling table", table=table_name, columns=len(columns))
+            entries = await asyncio.to_thread(profile_table, engine, table_name, columns)
+            docs = [self._to_document(user_id, table_name, e) for e in entries]
+            if docs:
+                await vector_store.aadd_documents(docs)
+                total += len(docs)
+                logger.info("ingested chunks", table=table_name, count=len(docs))
+        logger.info("db pipeline complete", user_id=user_id, total=total)
+        return total
+db_pipeline_service = DbPipelineService()