Spaces:

DataEyond
/

Agentic-Service-Data-Eyond

Paused

App Files Files Community

Rifqi Hafizuddin committed on Apr 27

Commit

ba550a5

1 Parent(s): 767625e

[KM-438-439] add retriever feature

Browse files

Files changed (13) hide show

src/query/__init__.py +0 -0
src/query/base.py +32 -0
src/query/executors/__init__.py +0 -0
src/query/executors/db_executor.py +409 -0
src/query/executors/tabular.py +39 -0
src/query/query_executor.py +52 -0
src/rag/base.py +20 -0
src/rag/retriever.py +22 -48
src/rag/retrievers/__init__.py +0 -0
src/rag/retrievers/baseline.py +70 -0
src/rag/retrievers/document.py +32 -0
src/rag/retrievers/schema.py +349 -0
src/rag/router.py +75 -0

src/query/__init__.py ADDED Viewed

File without changes

src/query/base.py ADDED Viewed

	@@ -0,0 +1,32 @@

+"""Shared contract for query executors."""
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from sqlalchemy.ext.asyncio import AsyncSession
+from src.rag.base import RetrievalResult
+@dataclass
+class QueryResult:
+    """Tabular payload returned by a query executor for one source.
+    Field order matters: the dataclass-generated constructor accepts these
+    positionally in the order declared below.
+    """
+    source_type: str        # "database" or "document"
+    source_id: str          # database_client_id or document_id
+    table_or_file: str      # label for the table(s) or file the rows came from
+    columns: list[str]      # column names of the returned rows
+    rows: list[dict]        # one dict per returned row
+    row_count: int          # number of rows, supplied by the producer (not derived here)
+    metadata: dict = field(default_factory=dict)  # fresh dict per instance
+    # metadata should include "column_types": {"col_name": "dtype"} when available
+class BaseExecutor(ABC):
+    """Abstract contract every query executor implements.
+    Subclasses must override ``execute`` with this exact parameter order.
+    """
+    # execute() receives retrieval hits plus the caller's identity, an async DB
+    # session, the natural-language question and a row limit (default 100),
+    # and returns a list of QueryResult objects.
+    @abstractmethod
+    async def execute(
+        self,
+        results: list[RetrievalResult],
+        user_id: str,
+        db: AsyncSession,
+        question: str,
+        limit: int = 100,
+    ) -> list[QueryResult]: ...

src/query/executors/__init__.py ADDED Viewed

File without changes

src/query/executors/db_executor.py ADDED Viewed

	@@ -0,0 +1,409 @@

+"""Executor for registered database sources (source_type="database").
+Flow per (client_id, question):
+  1. Collect all relevant (table_name, column_name) pairs from retrieval results.
+  2. Fetch the FULL schema for those tables from PGVector (not just top-k columns).
+  3. Build a schema context string and send to LLM → structured SQLQuery output.
+  4. Validate via sqlglot: SELECT-only, schema-grounded, LIMIT enforced.
+  5. Execute on the user's DB via engine_scope + asyncio.to_thread.
+  6. Return QueryResult per client_id (may span multiple tables via JOINs).
+Supported db_types: postgres, supabase, mysql.
+Other types are skipped with a warning — they do not raise.
+"""
+import asyncio
+from collections import defaultdict
+from typing import Any
+import sqlglot
+import sqlglot.expressions as exp
+import tiktoken
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_openai import AzureChatOpenAI
+from sqlalchemy import text
+from sqlalchemy.ext.asyncio import AsyncSession
+from src.config.settings import settings
+from src.database_client.database_client_service import database_client_service
+from src.db.postgres.connection import _pgvector_engine
+from src.middlewares.logging import get_logger
+from src.models.sql_query import SQLQuery
+from src.pipeline.db_pipeline import db_pipeline_service
+from src.query.base import BaseExecutor, QueryResult
+from src.rag.base import RetrievalResult
+from src.utils.db_credential_encryption import decrypt_credentials_dict
+logger = get_logger("db_executor")
+# Tokenizer used only to count prompt/response tokens for log lines.
+_enc = tiktoken.get_encoding("cl100k_base")
+# db_type values this executor will run queries against; others are skipped.
+_SUPPORTED_DB_TYPES = {"postgres", "supabase", "mysql"}
+# Number of SQL generation attempts before giving up on a client.
+_MAX_RETRIES = 3
+# Hard ceiling applied to the caller-supplied row limit.
+_MAX_LIMIT = 500
+_SQL_SYSTEM_PROMPT = """\
+You are a SQL data analyst working with a user's database.
+Generate a single SQL SELECT statement that answers the user's question.
+Database dialect: {dialect}
+Rules:
+- ONLY reference tables and columns listed in the schema below. Do not invent names.
+- Always include a LIMIT clause (max {limit}).
+- Do not use DELETE, UPDATE, INSERT, DROP, TRUNCATE, ALTER, CREATE, or any DDL.
+- Prefer explicit JOINs over subqueries when combining tables.
+- For aggregations, always alias the result column (e.g. COUNT(*) AS order_count).
+- For date filtering, use dialect-appropriate functions ({dialect} syntax).
+Schema:
+{schema}
+{error_section}"""
+class DbExecutor(BaseExecutor):
+    def __init__(self) -> None:
+        """Build the Azure OpenAI chat model, the prompt and the structured-output chain."""
+        prompt_messages = [
+            ("system", _SQL_SYSTEM_PROMPT),
+            ("human", "{question}"),
+        ]
+        self._llm = AzureChatOpenAI(
+            azure_deployment=settings.azureai_deployment_name_4o,
+            openai_api_version=settings.azureai_api_version_4o,
+            azure_endpoint=settings.azureai_endpoint_url_4o,
+            api_key=settings.azureai_api_key_4o,
+            temperature=0,
+        )
+        self._prompt = ChatPromptTemplate.from_messages(prompt_messages)
+        # The chain output is parsed into the SQLQuery model.
+        structured_llm = self._llm.with_structured_output(SQLQuery)
+        self._chain = self._prompt | structured_llm
+    # ------------------------------------------------------------------
+    # Public interface
+    # ------------------------------------------------------------------
+    async def execute(
+        self,
+        results: list[RetrievalResult],
+        user_id: str,
+        db: AsyncSession,
+        question: str,
+        limit: int = 100,
+    ) -> list[QueryResult]:
+        """Run one SQL generation + execution pass per database client found in ``results``.
+        Results whose source_type is not "database" are ignored. A failure for
+        one client is logged and does not stop the remaining clients.
+        """
+        # Bucket the database hits by the client they belong to.
+        grouped: dict[str, list[RetrievalResult]] = defaultdict(list)
+        for hit in results:
+            if hit.source_type != "database":
+                continue
+            cid = hit.metadata.get("database_client_id", "")
+            if not cid:
+                logger.warning("db result missing database_client_id, skipping")
+                continue
+            grouped[cid].append(hit)
+        collected: list[QueryResult] = []
+        for cid, hits in grouped.items():
+            try:
+                outcome = await self._execute_for_client(cid, hits, user_id, db, question, limit)
+            except Exception as e:
+                logger.error("db executor failed for client", client_id=cid, error=str(e))
+                continue
+            if outcome:
+                collected.append(outcome)
+        return collected
+    # ------------------------------------------------------------------
+    # Per-client execution
+    # ------------------------------------------------------------------
+    async def _execute_for_client(
+        self,
+        client_id: str,
+        results: list[RetrievalResult],
+        user_id: str,
+        db: AsyncSession,
+        question: str,
+        limit: int,
+    ) -> QueryResult | None:
+        """Generate, validate and run one SQL query for a single database client.
+        Returns None when the client is missing, is not owned by ``user_id``,
+        has an unsupported db_type, has no schema in the vector store, or when
+        no generated SQL passes validation within ``_MAX_RETRIES`` attempts.
+        Errors raised while decrypting credentials or running the query are not
+        caught here and propagate to the caller.
+        """
+        # Guard clauses: every failed precondition is logged and skips this client.
+        client = await database_client_service.get(db, client_id)
+        if not client:
+            logger.warning("database client not found", client_id=client_id)
+            return None
+        if client.user_id != user_id:
+            logger.warning("client ownership mismatch", client_id=client_id)
+            return None
+        if client.db_type not in _SUPPORTED_DB_TYPES:
+            logger.warning("unsupported db_type for query execution", db_type=client.db_type)
+            return None
+        # Distinct table names from retrieval results, expanded via FK relationships
+        table_names = list({
+            r.metadata.get("data", {}).get("table_name")
+            for r in results
+            if r.metadata.get("data", {}).get("table_name")
+        })
+        table_names = await self._expand_with_fk_tables(client_id, user_id, table_names)
+        full_schema = await self._fetch_full_schema(client_id, table_names, user_id)
+        if not full_schema:
+            logger.warning("no schema found in vector store", client_id=client_id, tables=table_names)
+            return None
+        schema_ctx = self._build_schema_context(full_schema)
+        # Never request more than _MAX_LIMIT rows, whatever the caller asked for.
+        capped_limit = min(limit, _MAX_LIMIT)
+        # The raw db_type string is what the prompt receives as the dialect.
+        dialect = client.db_type
+        # SQL generation with retry
+        validated_sql: str | None = None
+        prev_error: str = ""
+        prev_reasoning: str = ""
+        for attempt in range(_MAX_RETRIES):
+            # Feed the previous failure (and the model's reasoning, if any) back into the prompt.
+            if prev_error:
+                error_section = (
+                    f"Previous attempt reasoning: {prev_reasoning}\n"
+                    f"Previous attempt failed: {prev_error}\n"
+                    "Fix the issue above."
+                )
+            else:
+                error_section = ""
+            try:
+                # Used for the token-count log only: concatenates the variable
+                # parts of the prompt, not the full rendered template.
+                prompt_text = schema_ctx + error_section + question
+                input_tokens = len(_enc.encode(prompt_text))
+                logger.info("sql generation input tokens", attempt=attempt + 1, tokens=input_tokens)
+                result: SQLQuery = await self._chain.ainvoke({
+                    "schema": schema_ctx,
+                    "dialect": dialect,
+                    "limit": capped_limit,
+                    "error_section": error_section,
+                    "question": question,
+                })
+                sql = result.sql.strip()
+                validation_error = self._validate(sql, full_schema, capped_limit)
+                if validation_error:
+                    prev_error = validation_error
+                    prev_reasoning = result.reasoning
+                    logger.warning("sql validation failed", attempt=attempt + 1, error=validation_error)
+                    # Retry with the validation error included in the next prompt.
+                    continue
+                validated_sql = self._enforce_limit(sql, capped_limit)
+                output_tokens = len(_enc.encode(result.sql)) + len(_enc.encode(result.reasoning))
+                logger.info(
+                    "sql generated",
+                    attempt=attempt + 1,
+                    input_tokens=input_tokens,
+                    output_tokens=output_tokens,
+                    total_tokens=input_tokens + output_tokens,
+                    reasoning=result.reasoning,
+                )
+                break
+            except Exception as e:
+                # Any failure inside this attempt (LLM call, validation, limit
+                # enforcement) becomes the error message for the next attempt.
+                prev_error = str(e)
+                logger.warning("sql generation error", attempt=attempt + 1, error=prev_error)
+        if not validated_sql:
+            logger.error("sql generation failed after retries", client_id=client_id)
+            return None
+        # Execute on user's DB
+        creds = decrypt_credentials_dict(client.credentials)
+        with db_pipeline_service.engine_scope(client.db_type, creds) as engine:
+            # _run_sql is synchronous, so it runs in a worker thread.
+            rows = await asyncio.to_thread(self._run_sql, engine, validated_sql)
+        # Column name -> stored type across all fetched tables; columns sharing a
+        # name in different tables overwrite each other here.
+        column_types = {
+            col["name"]: col["type"]
+            for cols in full_schema.values()
+            for col in cols
+        }
+        # Column names come from the first row; an empty result yields no columns.
+        columns = list(rows[0].keys()) if rows else []
+        return QueryResult(
+            source_type="database",
+            source_id=client_id,
+            table_or_file=", ".join(table_names),
+            columns=columns,
+            rows=rows,
+            row_count=len(rows),
+            metadata={
+                "db_type": client.db_type,
+                "client_name": client.name,
+                "sql": validated_sql,
+                # Result columns with no schema match (e.g. aliases) are reported as "unknown".
+                "column_types": {c: column_types.get(c, "unknown") for c in columns},
+            },
+        )
+    # ------------------------------------------------------------------
+    # Schema helpers
+    # ------------------------------------------------------------------
+    async def _expand_with_fk_tables(
+        self,
+        client_id: str,
+        user_id: str,
+        table_names: list[str],
+    ) -> list[str]:
+        """Add every table that the given tables point to through a foreign key.
+        Retrieval may surface a child table (e.g. order_items) without the parent
+        it references (e.g. orders); pulling the parent in lets the generated SQL
+        join across the relationship. An empty input is returned unchanged.
+        """
+        if not table_names:
+            return table_names
+        # One bind parameter per table name: t0, t1, ...
+        name_params = {f"t{idx}": tbl for idx, tbl in enumerate(table_names)}
+        placeholders = ", ".join(f":{key}" for key in name_params)
+        sql = text(f"""
+            SELECT DISTINCT lpe.cmetadata->'data'->>'foreign_key' AS fk
+            FROM langchain_pg_embedding lpe
+            JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
+            WHERE lpc.name = 'document_embeddings'
+              AND lpe.cmetadata->>'user_id' = :user_id
+              AND lpe.cmetadata->>'source_type' = 'database'
+              AND lpe.cmetadata->>'database_client_id' = :client_id
+              AND lpe.cmetadata->'data'->>'table_name' IN ({placeholders})
+              AND lpe.cmetadata->'data'->>'foreign_key' IS NOT NULL
+        """)
+        params: dict[str, Any] = {"user_id": user_id, "client_id": client_id}
+        params.update(name_params)
+        async with _pgvector_engine.connect() as conn:
+            fk_rows = (await conn.execute(sql, params)).fetchall()
+        expanded = set(table_names)
+        # fk format: "referred_table.referred_column" — keep the part before the first dot.
+        expanded.update(r.fk.split(".")[0] for r in fk_rows if r.fk)
+        if expanded.difference(table_names):
+            logger.info(
+                "expanded tables via FK",
+                original=sorted(table_names),
+                expanded=sorted(expanded),
+            )
+        return list(expanded)
+    async def _fetch_full_schema(
+        self,
+        client_id: str,
+        table_names: list[str],
+        user_id: str,
+    ) -> dict[str, list[dict[str, Any]]]:
+        """Fetch ALL column chunks for the given tables from PGVector.
+        Args:
+            client_id: database client whose schema chunks are read.
+            table_names: tables to load; an empty list short-circuits to ``{}``.
+            user_id: owner filter applied to the stored chunks.
+        Returns {table_name: [{"name": ..., "type": ..., "is_primary_key": ...,
+                                "foreign_key": ..., "content": ...}]}
+        The dict is empty when no table names are given or nothing matches.
+        """
+        # An empty name list would render "IN ()", which PostgreSQL rejects as a
+        # syntax error, so answer without touching the database.
+        if not table_names:
+            return {}
+        placeholders = ", ".join(f":t{i}" for i in range(len(table_names)))
+        sql = text(f"""
+            SELECT lpe.cmetadata, lpe.document
+            FROM langchain_pg_embedding lpe
+            JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
+            WHERE lpc.name = 'document_embeddings'
+              AND lpe.cmetadata->>'user_id' = :user_id
+              AND lpe.cmetadata->>'source_type' = 'database'
+              AND lpe.cmetadata->>'database_client_id' = :client_id
+              AND lpe.cmetadata->'data'->>'table_name' IN ({placeholders})
+            ORDER BY lpe.cmetadata->'data'->>'table_name', lpe.cmetadata->'data'->>'column_name'
+        """)
+        params: dict[str, Any] = {"user_id": user_id, "client_id": client_id}
+        for i, name in enumerate(table_names):
+            params[f"t{i}"] = name
+        async with _pgvector_engine.connect() as conn:
+            result = await conn.execute(sql, params)
+            rows = result.fetchall()
+        schema: dict[str, list[dict[str, Any]]] = defaultdict(list)
+        for row in rows:
+            # Tolerate a null metadata column or a null "data" entry.
+            data = (row.cmetadata or {}).get("data") or {}
+            table = data.get("table_name")
+            if table:
+                schema[table].append({
+                    "name": data.get("column_name", ""),
+                    "type": data.get("column_type", ""),
+                    "is_primary_key": data.get("is_primary_key", False),
+                    "foreign_key": data.get("foreign_key"),
+                    "content": row.document,  # chunk text includes top values / samples
+                })
+        # Hand back a plain dict so missing-key lookups don't silently create entries.
+        return dict(schema)
+    def _build_schema_context(self, schema: dict[str, list[dict[str, Any]]]) -> str:
+        """Render the schema as the plain-text block inserted into the SQL prompt.
+        Each table gets a "Table:" header, one bullet per column (with PK/FK
+        flags) and, when the column chunk carries one, its first
+        "Top values:" / "Sample values:" line.
+        """
+        value_prefixes = ("Top values:", "Sample values:")
+        out: list[str] = []
+        for table_name in schema:
+            out.append(f"Table: {table_name}")
+            for column in schema[table_name]:
+                markers = []
+                if column["is_primary_key"]:
+                    markers.append("PRIMARY KEY")
+                if column["foreign_key"]:
+                    markers.append(f"FK -> {column['foreign_key']}")
+                suffix = ""
+                if markers:
+                    suffix = f"  [{', '.join(markers)}]"
+                out.append(f"  - {column['name']}  {column['type']}{suffix}")
+                # Only the first matching values line from the chunk text is kept.
+                values_line = next(
+                    (ln for ln in column["content"].splitlines() if ln.startswith(value_prefixes)),
+                    None,
+                )
+                if values_line is not None:
+                    out.append(f"    {values_line}")
+            out.append("")
+        return "\n".join(out).strip()
+    # ------------------------------------------------------------------
+    # Guardrails
+    # ------------------------------------------------------------------
+    def _validate(self, sql: str, schema: dict[str, list[dict]], limit: int) -> str:
+        """Return an error string if validation fails, empty string if OK.
+        Checks, in order: the SQL parses, the top-level statement is a SELECT,
+        no INSERT/UPDATE/DELETE appears anywhere in the tree, and every
+        referenced table is either in ``schema`` or a CTE defined by the query.
+        ``limit`` is accepted for interface stability but not checked here;
+        LIMIT is injected/capped separately before execution.
+        """
+        # Layer 1: sqlglot parse + SELECT-only check
+        try:
+            parsed = sqlglot.parse_one(sql)
+        except sqlglot.errors.SqlglotError as e:
+            # Covers tokenizer errors as well as parse errors, so both are
+            # reported back as a validation message.
+            return f"SQL parse error: {e}"
+        if not isinstance(parsed, exp.Select):
+            return f"Only SELECT statements are allowed. Got: {type(parsed).__name__}"
+        # Check for DML anywhere in the AST (including writeable CTEs)
+        dml = parsed.find(exp.Insert, exp.Update, exp.Delete)
+        if dml is not None:
+            return f"DML ({type(dml).__name__}) is not allowed."
+        # Layer 2: schema grounding — table names
+        known_tables = {t.lower() for t in schema}
+        # Names introduced by WITH clauses are legitimate FROM targets even
+        # though they are not part of the stored schema.
+        cte_names = {cte.alias.lower() for cte in parsed.find_all(exp.CTE) if cte.alias}
+        for tbl in parsed.find_all(exp.Table):
+            name = tbl.name.lower()
+            if not name or name in known_tables:
+                continue
+            # Only an unqualified reference can point at a CTE.
+            if name in cte_names and not tbl.args.get("db"):
+                continue
+            return f"Unknown table '{tbl.name}'. Only use tables from the schema."
+        # Layer 3: LIMIT enforcement (inject if missing — done before execution)
+        return ""
+    # ------------------------------------------------------------------
+    # SQL execution
+    # ------------------------------------------------------------------
+    def _enforce_limit(self, sql: str, limit: int) -> str:
+        """Inject or cap the outermost LIMIT using sqlglot AST manipulation.
+        Only the top-level statement's LIMIT is inspected, so a LIMIT inside a
+        subquery or CTE never stands in for a missing outer one. An outer LIMIT
+        that is larger than ``limit`` or is not a plain integer literal is
+        replaced with ``limit``.
+        Returns the SQL re-rendered by sqlglot.
+        """
+        parsed = sqlglot.parse_one(sql)
+        outer = parsed.args.get("limit")
+        if outer is None:
+            return parsed.limit(limit).sql()
+        value = outer.expression
+        current: int | None = None
+        if isinstance(value, exp.Literal) and not value.is_string:
+            try:
+                current = int(value.this)
+            except (TypeError, ValueError):
+                current = None  # e.g. a non-integer numeric literal
+        if current is None or current > limit:
+            # Replace the whole limit expression with a proper numeric literal.
+            outer.set("expression", exp.Literal.number(limit))
+        return parsed.sql()
+    def _run_sql(self, engine: Any, sql: str) -> list[dict]:
+        """Execute ``sql`` on a connection from ``engine`` and return each row as a plain dict."""
+        # Ensure the user DB connection is a read-only credential — sqlglot validation alone is not sufficient.
+        with engine.connect() as conn:
+            row_mappings = conn.execute(text(sql)).mappings()
+            return list(map(dict, row_mappings))
+db_executor = DbExecutor()

src/query/executors/tabular.py ADDED Viewed

	@@ -0,0 +1,39 @@

+"""Executor for tabular document sources (source_type="document", file_type csv/xlsx).
+Flow:
+  1. Group RetrievalResult chunks by document_id.
+  2. For each document: download bytes from Azure Blob -> read with pandas.
+  3. Filter DataFrame to relevant columns identified by retrieval.
+  4. Return QueryResult per document.
+"""
+from sqlalchemy.ext.asyncio import AsyncSession
+from src.middlewares.logging import get_logger
+from src.query.base import BaseExecutor, QueryResult
+from src.rag.base import RetrievalResult
+logger = get_logger("tabular_executor")
+_TABULAR_FILE_TYPES = ("csv", "xlsx")
+class TabularExecutor(BaseExecutor):
+    """Executor for CSV/XLSX document sources; the implementation is still pending."""
+    async def execute(
+        self,
+        results: list[RetrievalResult],
+        user_id: str,
+        db: AsyncSession,
+        question: str,
+        limit: int = 100,
+    ) -> list[QueryResult]:
+        """Return one QueryResult per tabular document (not implemented yet).
+        The signature mirrors ``BaseExecutor.execute``. ``question`` is part of
+        that contract and is passed positionally by the dispatcher, so omitting
+        it here made the call fail with TypeError before any coroutine existed.
+        Raises:
+            NotImplementedError: always, until the steps below are implemented.
+        """
+        # TODO: implement
+        # 1. filter results where source_type == "document" and file_type in _TABULAR_FILE_TYPES
+        # 2. group by document_id -> list of column_names
+        # 3. per group: look up Document by document_id -> get blob_name
+        # 4. blob_storage.download_file(blob_name) -> pd.read_csv / pd.read_excel
+        # 5. df[relevant_columns].head(limit) -> rows as list[dict]
+        # 6. return QueryResult per document
+        raise NotImplementedError
+tabular_executor = TabularExecutor()

src/query/query_executor.py ADDED Viewed

	@@ -0,0 +1,52 @@

+"""QueryExecutor — dispatches retrieval results to the appropriate executor by source_type."""
+import asyncio
+from sqlalchemy.ext.asyncio import AsyncSession
+from src.middlewares.logging import get_logger
+from src.query.base import QueryResult
+from src.query.executors.db_executor import db_executor
+from src.query.executors.tabular import tabular_executor
+from src.rag.base import RetrievalResult
+logger = get_logger("query_executor")
+class QueryExecutor:
+    """Splits retrieval results by source and runs the matching executors concurrently."""
+    async def execute(
+        self,
+        results: list[RetrievalResult],
+        user_id: str,
+        db: AsyncSession,
+        question: str,
+        limit: int = 100,
+    ) -> list[QueryResult]:
+        """Dispatch ``results`` to the database and tabular executors and merge their output.
+        An executor that raises while being awaited is logged and contributes
+        no rows; the other executor's results are still returned.
+        """
+        database_hits: list[RetrievalResult] = []
+        tabular_hits: list[RetrievalResult] = []
+        for hit in results:
+            if hit.source_type == "database":
+                database_hits.append(hit)
+            elif (
+                hit.source_type == "document"
+                and hit.metadata.get("data", {}).get("file_type") in ("csv", "xlsx")
+            ):
+                tabular_hits.append(hit)
+        async def _nothing() -> list[QueryResult]:
+            return []
+        # An executor is only invoked when it has something to work on.
+        if database_hits:
+            db_job = db_executor.execute(database_hits, user_id, db, question, limit)
+        else:
+            db_job = _nothing()
+        if tabular_hits:
+            tabular_job = tabular_executor.execute(tabular_hits, user_id, db, question, limit)
+        else:
+            tabular_job = _nothing()
+        outcomes = await asyncio.gather(db_job, tabular_job, return_exceptions=True)
+        merged: list[QueryResult] = []
+        for outcome in outcomes:
+            if isinstance(outcome, Exception):
+                logger.error("executor failed", error=str(outcome))
+            else:
+                merged.extend(outcome)
+        logger.info("query execution complete", total=len(merged))
+        return merged
+query_executor = QueryExecutor()

src/rag/base.py ADDED Viewed

	@@ -0,0 +1,20 @@

+"""Shared contract for all retriever implementations."""
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Any
+@dataclass
+class RetrievalResult:
+    """One retrieved chunk, in the shape shared by all retrievers and executors."""
+    content: str              # chunk text
+    metadata: dict[str, Any]  # metadata stored alongside the chunk
+    score: float              # relevance score assigned by the retriever that produced it
+    source_type: str  # "document" | "database"
+class BaseRetriever(ABC):
+    """Abstract contract every retriever implements."""
+    # retrieve() takes the query text, the requesting user's id and a result
+    # count k (default 5), and returns a list of RetrievalResult objects.
+    @abstractmethod
+    async def retrieve(
+        self, query: str, user_id: str, k: int = 5
+    ) -> list[RetrievalResult]: ...

src/rag/retriever.py CHANGED Viewed

@@ -1,69 +1,43 @@
-"""Service for retrieving relevant documents from vector store."""
-import hashlib
-import json
-from src.db.postgres.vector_store import get_vector_store
-from src.db.redis.connection import get_redis
 from sqlalchemy.ext.asyncio import AsyncSession
 from src.middlewares.logging import get_logger
-from typing import List, Dict, Any
 logger = get_logger("retriever")
-_RETRIEVAL_CACHE_TTL = 3600  # 1 hour
 class RetrieverService:
-    """Service for retrieving relevant documents."""
     def __init__(self):
-        self.vector_store = get_vector_store()
     async def retrieve(
         self,
         query: str,
         user_id: str,
         db: AsyncSession,
-        k: int = 5
-    ) -> List[Dict[str, Any]]:
-        """Retrieve relevant chunks for a query, scoped to the user's documents.
-        Returns:
-            List of dicts with keys: content, metadata
-            metadata includes: document_id, user_id, filename, chunk_index, page_label (if PDF)
-        """
         try:
-            redis = await get_redis()
-            query_hash = hashlib.md5(query.encode()).hexdigest()
-            cache_key = f"retrieval:{user_id}:{query_hash}:{k}"
-            cached = await redis.get(cache_key)
-            if cached:
-                logger.info("Returning cached retrieval results")
-                return json.loads(cached)
-            logger.info(f"Retrieving for user {user_id}, query: {query[:50]}...")
-            docs = await self.vector_store.asimilarity_search(
-                query=query,
-                k=k,
-                filter={"user_id": user_id}
-            )
-            results = [
-                {
-                    "content": doc.page_content,
-                    "metadata": doc.metadata,
-                }
-                for doc in docs
-            ]
-            logger.info(f"Retrieved {len(results)} chunks")
-            await redis.setex(cache_key, _RETRIEVAL_CACHE_TTL, json.dumps(results))
-            return results
         except Exception as e:
-            logger.error("Retrieval failed", error=str(e))
             return []

+"""Public retrieval API — thin wrapper around RetrievalRouter."""
+from typing import Any
 from sqlalchemy.ext.asyncio import AsyncSession
 from src.middlewares.logging import get_logger
+from src.rag.retrievers.document import document_retriever
+from src.rag.retrievers.schema import schema_retriever
+from src.rag.router import RetrievalRouter, SourceHint
 logger = get_logger("retriever")
 class RetrieverService:
+    """Public retrieval service used by chat.py and search tools.
+    Delegates to RetrievalRouter which dispatches based on source_hint.
+    Returns List[Dict] to preserve backward compatibility with chat.py.
+    """
     def __init__(self):
+        # The router is built once per service instance from the module-level retrievers.
+        self._router = RetrievalRouter(
+            schema_retriever=schema_retriever,
+            document_retriever=document_retriever,
+        )
     async def retrieve(
         self,
         query: str,
         user_id: str,
         db: AsyncSession,
+        k: int = 5,
+        source_hint: SourceHint = "both",
+    ) -> list[dict[str, Any]]:
+        """Return retrieved chunks as dicts with "content" and "metadata" keys.
+        ``db`` is accepted but not used in this method body. Any exception is
+        logged and turned into an empty list.
+        """
         try:
+            results = await self._router.retrieve(query, user_id, source_hint, k)
+            # Drop score/source_type: only content and metadata are returned.
+            return [{"content": r.content, "metadata": r.metadata} for r in results]
         except Exception as e:
+            logger.error("retrieval failed", error=str(e))
             return []

src/rag/retrievers/__init__.py ADDED Viewed

File without changes

src/rag/retrievers/baseline.py ADDED Viewed

	@@ -0,0 +1,70 @@

+"""Service for retrieving relevant documents from vector store."""
+import hashlib
+import json
+from src.db.postgres.vector_store import get_vector_store
+from src.db.redis.connection import get_redis
+from sqlalchemy.ext.asyncio import AsyncSession
+from src.middlewares.logging import get_logger
+from typing import List, Dict, Any
+logger = get_logger("retriever")
+_RETRIEVAL_CACHE_TTL = 3600  # 1 hour
+class RetrieverService:
+    """Baseline retriever: similarity search over the vector store with a Redis result cache."""
+    def __init__(self):
+        self.vector_store = get_vector_store()
+    async def retrieve(
+        self,
+        query: str,
+        user_id: str,
+        db: AsyncSession,
+        k: int = 5
+    ) -> List[Dict[str, Any]]:
+        """Return up to ``k`` chunks for ``query``, filtered to ``user_id``.
+        Results are cached in Redis under a key built from the user id, an MD5
+        digest of the query and ``k``. ``db`` is not used in this method body.
+        Returns:
+            List of dicts with keys: content, metadata. Any exception is
+            logged and produces an empty list.
+        """
+        try:
+            redis = await get_redis()
+            digest = hashlib.md5(query.encode()).hexdigest()
+            cache_key = "retrieval:{}:{}:{}".format(user_id, digest, k)
+            hit = await redis.get(cache_key)
+            if hit:
+                logger.info("Returning cached retrieval results")
+                return json.loads(hit)
+            logger.info(f"Retrieving for user {user_id}, query: {query[:50]}...")
+            docs = await self.vector_store.asimilarity_search(
+                query=query,
+                k=k,
+                filter={"user_id": user_id}
+            )
+            results = []
+            for doc in docs:
+                results.append({"content": doc.page_content, "metadata": doc.metadata})
+            logger.info(f"Retrieved {len(results)} chunks")
+            serialized = json.dumps(results)
+            await redis.setex(cache_key, _RETRIEVAL_CACHE_TTL, serialized)
+            return results
+        except Exception as e:
+            logger.error("Retrieval failed", error=str(e))
+            return []
+retriever = RetrieverService()

src/rag/retrievers/document.py ADDED Viewed

	@@ -0,0 +1,32 @@

+"""Document retriever — handles PDF, DOCX, TXT chunks (source_type="document", non-tabular).
+TEAMMATE: implement retrieve() below.
+Strategy: MMR (amax_marginal_relevance_search) + score threshold to avoid returning
+near-identical chunks from the same PDF page.
+Filter: source_type="document" AND data->>'file_type' NOT IN ('csv', 'xlsx')
+"""
+from src.db.postgres.vector_store import get_vector_store
+from src.middlewares.logging import get_logger
+from src.rag.base import BaseRetriever, RetrievalResult
+logger = get_logger("document_retriever")
+_SCORE_THRESHOLD = 0.45  # discard chunks with cosine distance above this
+class DocumentRetriever(BaseRetriever):
+    """Placeholder retriever for prose documents; currently always returns no results."""
+    def __init__(self):
+        # Vector store handle kept for the pending implementation; retrieve() does not use it yet.
+        self.vector_store = get_vector_store()
+    async def retrieve(
+        self, query: str, user_id: str, k: int = 5
+    ) -> list[RetrievalResult]:
+        # TODO (teammate): implement MMR retrieval for prose documents
+        # Filter: {"user_id": user_id, "source_type": "document"}
+        # then post-filter to exclude file_type in ("csv", "xlsx")
+        # Until then every call logs a notice and yields an empty list.
+        logger.info("document retriever not yet implemented — returning empty")
+        return []
+document_retriever = DocumentRetriever()

src/rag/retrievers/schema.py ADDED Viewed

	@@ -0,0 +1,349 @@

+"""Schema retriever — handles DB schemas (source_type="database") and tabular file
+columns stored as source_type="document" with file_type in ("csv","xlsx").
+Multiple retrieval strategies are exposed for benchmarking. The active strategy
+used by the router is `retrieve()`, which dispatches to ACTIVE_STRATEGY.
+Change ACTIVE_STRATEGY at module level to switch without touching the router.
+All strategies embed the query exactly once, then fan out to parallel SQL legs.
+Vector distance strategies:
+  dense_no_threshold  — cosine (<=>), no score floor, always returns k chunks
+  dense_dot           — inner product (<#>), equivalent to cosine for normalized embeddings
+  dense_l2            — L2/euclidean (<->), monotonic with cosine on unit-sphere vectors
+  hybrid              — RRF merge of dense + FTS (database + tabular)
+  hybrid_bm25         — RRF merge of dense + FTS (database only)
+"""
+import asyncio
+import time
+from typing import Literal
+from sqlalchemy import text
+from src.db.postgres.connection import _pgvector_engine
+from src.db.postgres.vector_store import get_vector_store
+from src.middlewares.logging import get_logger
+from src.rag.base import BaseRetriever, RetrievalResult
+logger = get_logger("schema_retriever")
+_TABULAR_FILE_TYPES = ("csv", "xlsx")
+Strategy = Literal["dense_no_threshold", "dense_dot", "dense_l2", "hybrid", "hybrid_bm25"]
+ACTIVE_STRATEGY: Strategy = "hybrid_bm25"
+class SchemaRetriever(BaseRetriever):
+    def __init__(self):
+        # Kept so _embed_query can reach the store's embeddings object.
+        self.vector_store = get_vector_store()
+    # ------------------------------------------------------------------
+    # Internal helpers
+    # ------------------------------------------------------------------
+    async def _embed_query(self, query: str) -> list[float]:
+        # embed_query is called as a plain synchronous function, so it is
+        # pushed to a worker thread and awaited from there.
+        return await asyncio.to_thread(self.vector_store.embeddings.embed_query, query)
+    async def _search_db(
+        self, embedding: list[float], user_id: str, k: int, operator: str = "<=>"
+    ) -> list[RetrievalResult]:
+        """Nearest-neighbour search over the user's database chunks using a ready-made embedding.
+        ``operator`` is the pgvector distance operator used for ordering. The
+        score is derived from the distance: negated for "<#>", 1 / (1 + d) for
+        "<->", and 1 - d otherwise. The query asks for ``k * 4`` rows.
+        """
+        emb_str = f"[{','.join(map(str, embedding))}]"
+        vec = f"'{emb_str}'::vector"
+        # Score expression per operator; any other operator gets the cosine form.
+        score_by_operator = {
+            "<#>": f"(lpe.embedding <#> {vec}) * -1",
+            "<->": f"1.0 / (1.0 + (lpe.embedding <-> {vec}))",
+        }
+        score_sql = score_by_operator.get(operator, f"1.0 - (lpe.embedding <=> {vec})")
+        sql = text(f"""
+            SELECT lpe.document, lpe.cmetadata, {score_sql} AS score
+            FROM langchain_pg_embedding lpe
+            JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
+            WHERE lpc.name = 'document_embeddings'
+              AND lpe.cmetadata->>'user_id' = :user_id
+              AND lpe.cmetadata->>'source_type' = 'database'
+            ORDER BY lpe.embedding {operator} {vec} ASC
+            LIMIT :k
+        """)
+        bind_params = {"user_id": user_id, "k": k * 4}
+        async with _pgvector_engine.connect() as conn:
+            fetched = (await conn.execute(sql, bind_params)).fetchall()
+        hits: list[RetrievalResult] = []
+        for record in fetched:
+            hits.append(
+                RetrievalResult(
+                    content=record.document,
+                    metadata=record.cmetadata,
+                    score=float(record.score),
+                    source_type="database",
+                )
+            )
+        return hits
+    async def _search_tabular(
+        self, embedding: list[float], user_id: str, k: int, operator: str = "<=>"
+    ) -> list[RetrievalResult]:
+        """Vector search over tabular document chunks. Accepts a pre-computed embedding.
+        Only chunks with source_type 'document' and file_type 'csv' or 'xlsx' are
+        considered.
+        Args:
+            embedding: query vector; inlined into the SQL as a pgvector literal.
+            user_id: only chunks whose metadata user_id equals this are searched.
+            k: maximum number of results returned.
+            operator: pgvector distance operator, one of "<=>" (cosine distance),
+                "<#>" (inner product, negated by pgvector) or "<->" (L2 distance).
+        Returns:
+            At most k results ordered nearest-first, with a higher-is-better score.
+        Raises:
+            ValueError: if `operator` is not one of the three supported values.
+        """
+        emb_str = "[" + ",".join(str(x) for x in embedding) + "]"
+        # `operator` is spliced into the SQL text, so only whitelisted values may
+        # pass. Previously an unknown operator reached ORDER BY verbatim while the
+        # score silently fell back to the cosine formula.
+        score_by_operator = {
+            "<=>": f"1.0 - (lpe.embedding <=> '{emb_str}'::vector)",
+            "<#>": f"(lpe.embedding <#> '{emb_str}'::vector) * -1",
+            "<->": f"1.0 / (1.0 + (lpe.embedding <-> '{emb_str}'::vector))",
+        }
+        if operator not in score_by_operator:
+            raise ValueError(f"unsupported pgvector operator: {operator!r}")
+        score_sql = score_by_operator[operator]
+        sql = text(f"""
+            SELECT lpe.document, lpe.cmetadata, {score_sql} AS score
+            FROM langchain_pg_embedding lpe
+            JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
+            WHERE lpc.name = 'document_embeddings'
+              AND lpe.cmetadata->>'user_id' = :user_id
+              AND lpe.cmetadata->>'source_type' = 'document'
+              AND (lpe.cmetadata->'data'->>'file_type' = 'csv'
+                OR lpe.cmetadata->'data'->>'file_type' = 'xlsx')
+            ORDER BY lpe.embedding {operator} '{emb_str}'::vector ASC
+            LIMIT :k
+        """)
+        async with _pgvector_engine.connect() as conn:
+            result = await conn.execute(sql, {"user_id": user_id, "k": k * 4})
+            rows = result.fetchall()
+        # NOTE(review): the query fetches up to k * 4 rows but only the first k are
+        # kept here, unlike _search_db which returns every fetched row — confirm
+        # which of the two behaviours is intended.
+        return [
+            RetrievalResult(
+                content=row.document,
+                metadata=row.cmetadata,
+                score=float(row.score),
+                source_type="document",
+            )
+            for row in rows[:k]
+        ]
+    async def _search_fts_db(self, query: str, user_id: str, k: int) -> list[RetrievalResult]:
+        """Keyword (full-text) search over database schema chunks.
+        Selects the user's 'database' chunks whose English tsvector matches
+        plainto_tsquery(query), ordered by ts_rank (best first) and capped at k
+        rows. The rank is exposed as each result's score.
+        Per the original author's note, a GIN index on
+        langchain_pg_embedding.document (created by init_db.py) is expected to
+        back this query — not verifiable from this module.
+        """
+        statement = text("""
+            SELECT lpe.document, lpe.cmetadata,
+                   ts_rank(to_tsvector('english', lpe.document),
+                           plainto_tsquery('english', :query)) AS rank
+            FROM langchain_pg_embedding lpe
+            JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
+            WHERE lpc.name = 'document_embeddings'
+              AND lpe.cmetadata->>'user_id' = :user_id
+              AND lpe.cmetadata->>'source_type' = 'database'
+              AND to_tsvector('english', lpe.document) @@ plainto_tsquery('english', :query)
+            ORDER BY rank DESC
+            LIMIT :k
+        """)
+        params = {"query": query, "user_id": user_id, "k": k}
+        async with _pgvector_engine.connect() as conn:
+            fetched = (await conn.execute(statement, params)).fetchall()
+        hits = []
+        for record in fetched:
+            hit = RetrievalResult(
+                content=record.document,
+                metadata=record.cmetadata,
+                score=float(record.rank),
+                source_type="database",
+            )
+            hits.append(hit)
+        return hits
+    async def _search_fts_tabular(self, query: str, user_id: str, k: int) -> list[RetrievalResult]:
+        """Keyword (full-text) search over tabular document chunks.
+        Selects the user's 'document' chunks with file_type 'csv' or 'xlsx' whose
+        English tsvector matches plainto_tsquery(query), ordered by ts_rank (best
+        first) and capped at k rows. The rank is exposed as each result's score.
+        """
+        statement = text("""
+            SELECT lpe.document, lpe.cmetadata,
+                   ts_rank(to_tsvector('english', lpe.document),
+                           plainto_tsquery('english', :query)) AS rank
+            FROM langchain_pg_embedding lpe
+            JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
+            WHERE lpc.name = 'document_embeddings'
+              AND lpe.cmetadata->>'user_id' = :user_id
+              AND lpe.cmetadata->>'source_type' = 'document'
+              AND (lpe.cmetadata->'data'->>'file_type' = 'csv'
+                OR lpe.cmetadata->'data'->>'file_type' = 'xlsx')
+              AND to_tsvector('english', lpe.document) @@ plainto_tsquery('english', :query)
+            ORDER BY rank DESC
+            LIMIT :k
+        """)
+        params = {"query": query, "user_id": user_id, "k": k}
+        async with _pgvector_engine.connect() as conn:
+            fetched = (await conn.execute(statement, params)).fetchall()
+        hits = []
+        for record in fetched:
+            hit = RetrievalResult(
+                content=record.document,
+                metadata=record.cmetadata,
+                score=float(record.rank),
+                source_type="document",
+            )
+            hits.append(hit)
+        return hits
+    def _rrf_merge(
+        self,
+        *ranked_lists: list[RetrievalResult],
+        k_rrf: int = 60,
+        top_k: int = 5,
+    ) -> list[RetrievalResult]:
+        """Fuse several ranked lists with Reciprocal Rank Fusion (RRF).
+        Every appearance of an item at 0-based position `rank` adds
+        1 / (k_rrf + rank + 1) to that item's fused score. Items are identified by
+        (table_name, column_name or filename) read from metadata["data"]. For each
+        identity the result with the highest raw `score` is the one kept, and the
+        top_k identities by fused score are returned, best first.
+        NOTE(review): returned objects keep their raw per-leg `score`; the fused
+        score only decides the ordering and is not written back.
+        """
+        def identity(item: RetrievalResult) -> tuple:
+            payload = item.metadata.get("data", {})
+            return (payload.get("table_name"), payload.get("column_name") or payload.get("filename"))
+        fused: dict[tuple, float] = {}
+        best: dict[tuple, RetrievalResult] = {}
+        for ranking in ranked_lists:
+            for position, item in enumerate(ranking):
+                ident = identity(item)
+                fused[ident] = fused.get(ident, 0.0) + 1.0 / (k_rrf + position + 1)
+                current = best.get(ident)
+                if current is None or item.score > current.score:
+                    best[ident] = item
+        # Stable descending sort: identities with equal fused score keep
+        # first-seen order.
+        ordered = sorted(fused, key=fused.__getitem__, reverse=True)
+        return [best[ident] for ident in ordered][:top_k]
+    def _dedup(self, results: list[RetrievalResult]) -> list[RetrievalResult]:
+        """Collapse results that share the same (table_name, column_name or filename).
+        The identity is read from metadata["data"]. For each identity the result
+        with the highest score wins (the earlier one on a tie); the survivors are
+        returned sorted by score, highest first.
+        """
+        best: dict[tuple, RetrievalResult] = {}
+        for candidate in results:
+            payload = candidate.metadata.get("data", {})
+            ident = (payload.get("table_name"), payload.get("column_name") or payload.get("filename"))
+            incumbent = best.get(ident)
+            if incumbent is None or candidate.score > incumbent.score:
+                best[ident] = candidate
+        unique = list(best.values())
+        unique.sort(key=lambda item: item.score, reverse=True)
+        return unique
+    # ------------------------------------------------------------------
+    # Named strategies — one embed call each, legs run in parallel
+    # ------------------------------------------------------------------
+    async def dense_no_threshold(self, query: str, user_id: str, k: int = 5) -> list[RetrievalResult]:
+        """Dense retrieval with the cosine operator and no score cutoff.
+        Embeds the query once, searches database and tabular chunks concurrently,
+        deduplicates the union and returns at most k results, best score first.
+        """
+        vector = await self._embed_query(query)
+        legs = (
+            self._search_db(vector, user_id, k),
+            self._search_tabular(vector, user_id, k),
+        )
+        from_db, from_files = await asyncio.gather(*legs)
+        combined = self._dedup(from_db + from_files)
+        return combined[:k]
+    async def dense_dot(self, query: str, user_id: str, k: int = 5) -> list[RetrievalResult]:
+        """Dense retrieval with the inner-product operator (<#>).
+        Assuming unit-normalized embeddings, the ranking matches cosine. The
+        score is the `<#>` value multiplied by -1 (computed in the search
+        helpers) and is not bounded to [0, 1]. Returns at most k results.
+        """
+        vector = await self._embed_query(query)
+        legs = (
+            self._search_db(vector, user_id, k, "<#>"),
+            self._search_tabular(vector, user_id, k, "<#>"),
+        )
+        from_db, from_files = await asyncio.gather(*legs)
+        combined = self._dedup(from_db + from_files)
+        return combined[:k]
+    async def dense_l2(self, query: str, user_id: str, k: int = 5) -> list[RetrievalResult]:
+        """Dense retrieval with the L2 / Euclidean operator (<->).
+        Assuming unit-normalized embeddings, the ranking order matches cosine.
+        The score is 1 / (1 + l2_distance), i.e. within (0, 1]. Returns at most
+        k results.
+        """
+        vector = await self._embed_query(query)
+        legs = (
+            self._search_db(vector, user_id, k, "<->"),
+            self._search_tabular(vector, user_id, k, "<->"),
+        )
+        from_db, from_files = await asyncio.gather(*legs)
+        combined = self._dedup(from_db + from_files)
+        return combined[:k]
+    async def hybrid(self, query: str, user_id: str, k: int = 5) -> list[RetrievalResult]:
+        """Hybrid retrieval: RRF fusion of dense and full-text search.
+        The query is embedded once; four legs then run concurrently in one
+        asyncio.gather: dense over database chunks, dense over tabular chunks,
+        FTS over database chunks and FTS over tabular chunks (FTS legs are asked
+        for k * 4 rows). The deduplicated dense top-k and the deduplicated FTS
+        hits are fused by _rrf_merge, which returns at most k results.
+        """
+        vector = await self._embed_query(query)
+        fts_limit = k * 4
+        dense_db, dense_files, keyword_db, keyword_files = await asyncio.gather(
+            self._search_db(vector, user_id, k),
+            self._search_tabular(vector, user_id, k),
+            self._search_fts_db(query, user_id, fts_limit),
+            self._search_fts_tabular(query, user_id, fts_limit),
+        )
+        dense_ranked = self._dedup(dense_db + dense_files)[:k]
+        keyword_ranked = self._dedup(keyword_db + keyword_files)
+        return self._rrf_merge(dense_ranked, keyword_ranked, top_k=k)
+    async def hybrid_bm25(self, query: str, user_id: str, k: int = 5) -> list[RetrievalResult]:
+        """Hybrid retrieval where the keyword leg covers database chunks only.
+        The query is embedded once; dense search over database chunks, dense
+        search over tabular chunks and FTS over database chunks (k * 4 rows) run
+        concurrently. The deduplicated dense top-k and the deduplicated FTS hits
+        are fused by _rrf_merge, which returns at most k results.
+        Despite the name, the keyword leg is _search_fts_db, which ranks with
+        PostgreSQL ts_rank.
+        """
+        vector = await self._embed_query(query)
+        dense_db, dense_files, keyword_db = await asyncio.gather(
+            self._search_db(vector, user_id, k),
+            self._search_tabular(vector, user_id, k),
+            self._search_fts_db(query, user_id, k * 4),
+        )
+        dense_ranked = self._dedup(dense_db + dense_files)[:k]
+        keyword_ranked = self._dedup(keyword_db)
+        return self._rrf_merge(dense_ranked, keyword_ranked, top_k=k)
+    # ------------------------------------------------------------------
+    # Public interface — called by the router
+    # ------------------------------------------------------------------
+    async def retrieve(self, query: str, user_id: str, k: int = 5) -> list[RetrievalResult]:
+        """Run the strategy named by ACTIVE_STRATEGY and return its results.
+        The strategy method is looked up on this instance by name, awaited with
+        (query, user_id, k), and the number of results is logged.
+        """
+        handler = getattr(self, ACTIVE_STRATEGY)
+        found = await handler(query, user_id, k)
+        logger.info("schema retrieval", strategy=ACTIVE_STRATEGY, count=len(found))
+        return found
+# ------------------------------------------------------------------
+# Benchmark helper — import in test scripts
+# ------------------------------------------------------------------
+async def benchmark(
+    query: str,
+    user_id: str,
+    k: int = 5,
+    strategies: list[Strategy] | None = None,
+) -> dict[str, dict]:
+    """Time several retrieval strategies on one query.
+    Strategies run one after another on a fresh SchemaRetriever. When
+    `strategies` is None or empty, all five strategies are run.
+    Returns a dict keyed by strategy name; each value holds "chunks" (result
+    count), "estimated_tokens" (total content characters // 4), "elapsed_ms"
+    (wall time, rounded) and "results" (the returned chunks).
+    """
+    runner = SchemaRetriever()
+    every_strategy: list[Strategy] = [
+        "dense_no_threshold",
+        "dense_dot",
+        "dense_l2",
+        "hybrid",
+        "hybrid_bm25",
+    ]
+    selected: list[Strategy] = strategies if strategies else every_strategy
+    summary: dict[str, dict] = {}
+    for strategy_name in selected:
+        strategy = getattr(runner, strategy_name)
+        started = time.perf_counter()
+        hits = await strategy(query, user_id, k)
+        duration_ms = round((time.perf_counter() - started) * 1000)
+        char_total = 0
+        for hit in hits:
+            char_total += len(hit.content)
+        summary[strategy_name] = {
+            "chunks": len(hits),
+            "estimated_tokens": char_total // 4,
+            "elapsed_ms": duration_ms,
+            "results": hits,
+        }
+    return summary
+# Shared module-level instance; constructing it calls get_vector_store() at import time.
+schema_retriever = SchemaRetriever()

src/rag/router.py ADDED Viewed

	@@ -0,0 +1,75 @@

+"""Routes retrieval requests to the appropriate retriever based on source_hint."""
+import asyncio
+import hashlib
+import json
+from typing import Literal
+from src.db.redis.connection import get_redis
+from src.middlewares.logging import get_logger
+from src.rag.base import BaseRetriever, RetrievalResult
+# Logger obtained from the project's logging middleware under the name "retrieval_router".
+logger = get_logger("retrieval_router")
+# Expiry, in seconds, passed to redis.setex for cached retrieval results.
+_CACHE_TTL = 3600  # 1 hour
+# Which retriever(s) a request targets; any value other than "schema" or
+# "document" routes to every registered retriever.
+SourceHint = Literal["document", "schema", "both"]
+class RetrievalRouter:
+    """Dispatch a query to the schema and/or document retriever, with a Redis cache.
+    Results are cached per (user_id, source_hint, query hash, k) for _CACHE_TTL
+    seconds.
+    """
+    def __init__(
+        self,
+        schema_retriever: BaseRetriever,
+        document_retriever: BaseRetriever,
+    ):
+        """Register the two retrievers under the keys "schema" and "document"."""
+        self._retrievers: dict[str, BaseRetriever] = {
+            "schema": schema_retriever,
+            "document": document_retriever,
+        }
+    def _route(self, source_hint: SourceHint) -> list[BaseRetriever]:
+        """Return the retriever(s) for `source_hint`; any other value selects all."""
+        if source_hint == "schema":
+            return [self._retrievers["schema"]]
+        if source_hint == "document":
+            return [self._retrievers["document"]]
+        return list(self._retrievers.values())
+    async def retrieve(
+        self,
+        query: str,
+        user_id: str,
+        source_hint: SourceHint = "both",
+        k: int = 10,
+    ) -> list[RetrievalResult]:
+        """Return up to k results for `query`, served from cache when possible.
+        On a cache miss the selected retrievers run concurrently. A retriever
+        that raises is logged and skipped; the remaining results are merged,
+        sorted by score (highest first) and trimmed to k.
+        The merged result is cached only when every retriever succeeded, so a
+        transient failure is not pinned for the whole TTL.
+        NOTE(review): scores from different retrievers are compared directly
+        here — confirm they are on a comparable scale.
+        """
+        redis = await get_redis()
+        # MD5 only shortens the query into a cache key; it is not a security
+        # control, so mark it as such (same digest, also works on FIPS builds).
+        query_hash = hashlib.md5(query.encode(), usedforsecurity=False).hexdigest()
+        cache_key = f"retrieval:{user_id}:{source_hint}:{query_hash}:{k}"
+        cached = await redis.get(cache_key)
+        if cached:
+            logger.info("returning cached retrieval results", source_hint=source_hint)
+            raw = json.loads(cached)
+            return [RetrievalResult(**r) for r in raw]
+        retrievers = self._route(source_hint)
+        batches = await asyncio.gather(
+            *[r.retrieve(query, user_id, k) for r in retrievers],
+            return_exceptions=True,
+        )
+        results: list[RetrievalResult] = []
+        any_failed = False
+        for batch in batches:
+            if isinstance(batch, Exception):
+                logger.error("retriever failed", error=str(batch))
+                any_failed = True
+                continue
+            results.extend(batch)
+        results.sort(key=lambda r: r.score, reverse=True)
+        results = results[:k]
+        logger.info("retrieved chunks", count=len(results), source_hint=source_hint)
+        if any_failed:
+            # The answer is incomplete (possibly empty); caching it would keep
+            # serving the degraded result after the retriever recovers.
+            return results
+        try:
+            payload = json.dumps([vars(r) for r in results])
+        except (TypeError, ValueError) as exc:
+            # The cache is best-effort: a non-serializable metadata value must
+            # not turn a successful retrieval into an error.
+            logger.error("skipping retrieval cache write", error=str(exc))
+            return results
+        await redis.setex(cache_key, _CACHE_TTL, payload)
+        return results