Spaces:

DataEyond
/

Agentic-Service-Data-Eyond

Paused

App Files Files Community

Rifqi Hafizuddin committed 23 days ago

Commit

bb29492

1 Parent(s): 00aa61d

[NOTICKET] Retrieve DB tables first, then fetch columns from the retrieved tables. Reduce k to 5.

Browse files

Files changed (3) hide show

src/api/v1/chat.py +3 -3
src/query/executors/db_executor.py +38 -4
src/rag/retrievers/schema.py +88 -23

src/api/v1/chat.py CHANGED Viewed

@@ -190,11 +190,11 @@ async def chat_stream(request: ChatRequest, db: AsyncSession = Depends(get_db)):
         if intent_result is None:
             # Step 2: Launch retrieval and history loading in parallel, then run orchestrator.
-            # k=10 (not the wrapper default of 5) so the merged top-k spans more
             # tables — db_executor's FK expansion is one-hop and cannot bridge
             # 2-hop gaps (e.g. customers -> order_items -> products) on its own.
             retrieval_task = asyncio.create_task(
-                retriever.retrieve(request.message, request.user_id, db, k=10)
             )
             history_task = asyncio.create_task(
                 load_history(db, request.room_id, limit=6)  # 6 msgs (3 pairs) for orchestrator
@@ -222,7 +222,7 @@ async def chat_stream(request: ChatRequest, db: AsyncSession = Depends(get_db)):
                         query=search_query,
                         user_id=request.user_id,
                         db=db,
-                        k=10,
                         source_hint=intent_result.get("source_hint", "both"),
                     )
                 else:

         if intent_result is None:
             # Step 2: Launch retrieval and history loading in parallel, then run orchestrator.
+            # k=5 (reduced from 10). Retrieval is table-first and returns ranked
             # tables — db_executor's FK expansion is one-hop and cannot bridge
             # 2-hop gaps (e.g. customers -> order_items -> products) on its own.
             retrieval_task = asyncio.create_task(
+                retriever.retrieve(request.message, request.user_id, db, k=5)
             )
             history_task = asyncio.create_task(
                 load_history(db, request.room_id, limit=6)  # 6 msgs (3 pairs) for orchestrator
                         query=search_query,
                         user_id=request.user_id,
                         db=db,
+                        k=5,
                         source_hint=intent_result.get("source_hint", "both"),
                     )
                 else:

src/query/executors/db_executor.py CHANGED Viewed

@@ -193,7 +193,12 @@ class DbExecutor(BaseExecutor):
                 })
                 sql = result.sql.strip()
                 allowed_tables = set(full_schema) | set(related_schema)
-                validation_error = self._validate(sql, allowed_tables, capped_limit)
                 if validation_error:
                     prev_error = validation_error
                     prev_reasoning = result.reasoning
@@ -559,11 +564,21 @@ class DbExecutor(BaseExecutor):
     # Guardrails
     # ------------------------------------------------------------------
-    def _validate(self, sql: str, allowed_tables: set[str], limit: int) -> str:
         """Return an error string if validation fails, empty string if OK.
         `allowed_tables` is the union of hit-table names and FK-related table
         names — both are legal targets for SELECT/JOIN.
         """
         # Layer 1: sqlglot parse + SELECT-only check
         try:
@@ -580,12 +595,31 @@ class DbExecutor(BaseExecutor):
         # Layer 2: schema grounding — table names
         known_tables = {t.lower() for t in allowed_tables}
         for tbl in parsed.find_all(exp.Table):
             name = tbl.name.lower()
             if name and name not in known_tables:
                 return f"Unknown table '{tbl.name}'. Only use tables from the schema."
-        # Layer 3: LIMIT enforcement (inject if missing — done before execution)
         return ""
     # ------------------------------------------------------------------

                 })
                 sql = result.sql.strip()
                 allowed_tables = set(full_schema) | set(related_schema)
+                column_map: dict[str, set[str]] = {
+                    t: {c["name"] for c in cols} for t, cols in full_schema.items()
+                }
+                for t, info in related_schema.items():
+                    column_map[t] = set(info.get("column_names") or [])
+                validation_error = self._validate(sql, allowed_tables, capped_limit, column_map)
                 if validation_error:
                     prev_error = validation_error
                     prev_reasoning = result.reasoning
     # Guardrails
     # ------------------------------------------------------------------
+    def _validate(
+        self,
+        sql: str,
+        allowed_tables: set[str],
+        limit: int,
+        column_map: dict[str, set[str]] | None = None,
+    ) -> str:
         """Return an error string if validation fails, empty string if OK.
         `allowed_tables` is the union of hit-table names and FK-related table
         names — both are legal targets for SELECT/JOIN.
+        `column_map` maps table_name → set of valid column names. When provided,
+        any qualified table.column reference not found in the map triggers a retry
+        with an informative error so the LLM can self-correct without hallucinating.
         """
         # Layer 1: sqlglot parse + SELECT-only check
         try:
         # Layer 2: schema grounding — table names
         known_tables = {t.lower() for t in allowed_tables}
+        alias_to_table: dict[str, str] = {}
         for tbl in parsed.find_all(exp.Table):
             name = tbl.name.lower()
             if name and name not in known_tables:
                 return f"Unknown table '{tbl.name}'. Only use tables from the schema."
+            alias = (tbl.alias or tbl.name).lower()
+            alias_to_table[alias] = name
+        # Layer 3: column grounding — qualified references only (table.column)
+        if column_map:
+            normalized_map = {t.lower(): {c.lower() for c in cols} for t, cols in column_map.items()}
+            for col_node in parsed.find_all(exp.Column):
+                tbl_ref = col_node.table
+                if not tbl_ref:
+                    continue  # unqualified — skip, can't resolve without full alias tracking
+                tbl_name = alias_to_table.get(tbl_ref.lower(), tbl_ref.lower())
+                col_name = col_node.name.lower()
+                if tbl_name in normalized_map and col_name not in normalized_map[tbl_name]:
+                    available = ", ".join(sorted(normalized_map[tbl_name]))
+                    return (
+                        f"Column '{col_node.name}' does not exist on table '{tbl_name}'. "
+                        f"Available columns: {available}."
+                    )
+        # Layer 4: LIMIT enforcement (inject if missing — done before execution)
         return ""
     # ------------------------------------------------------------------

src/rag/retrievers/schema.py CHANGED Viewed

@@ -194,26 +194,6 @@ class SchemaRetriever(BaseRetriever):
             d.get("sheet_name"),
         )
-    def _rrf_merge(
-        self,
-        *ranked_lists: list[RetrievalResult],
-        k_rrf: int = 60,
-        top_k: int = 5,
-    ) -> list[RetrievalResult]:
-        """Reciprocal Rank Fusion — combines ranked lists using rank positions only."""
-        scores: dict[tuple, float] = {}
-        index: dict[tuple, RetrievalResult] = {}
-        for ranked in ranked_lists:
-            for rank, result in enumerate(ranked):
-                key = self._chunk_key(result)
-                scores[key] = scores.get(key, 0.0) + 1.0 / (k_rrf + rank + 1)
-                if key not in index or result.score > index[key].score:
-                    index[key] = result
-        merged = sorted(index.values(), key=lambda r: scores[self._chunk_key(r)], reverse=True)
-        return merged[:top_k]
     def _dedup(self, results: list[RetrievalResult]) -> list[RetrievalResult]:
         """Deduplicate by chunk identity, keeping highest score per unique key."""
         seen: dict[tuple, RetrievalResult] = {}
@@ -223,12 +203,93 @@ class SchemaRetriever(BaseRetriever):
                 seen[key] = r
         return sorted(seen.values(), key=lambda r: r.score, reverse=True)
     # ------------------------------------------------------------------
     # Public interface — called by the router
     # ------------------------------------------------------------------
     async def retrieve(self, query: str, user_id: str, k: int = 5) -> list[RetrievalResult]:
-        """RRF merge of dense (DB columns + DB tables + tabular) and FTS (DB cols only)."""
         embedding = await self._embed_query(query)
         db_col_results, db_tbl_results, tabular_results, fts_results = await asyncio.gather(
             self._search_db(embedding, user_id, k),
@@ -236,11 +297,15 @@ class SchemaRetriever(BaseRetriever):
             self._search_tabular(embedding, user_id, k),
             self._search_fts_db(query, user_id, k * 4),
         )
-        dense = self._dedup(db_col_results + db_tbl_results + tabular_results)[:k]
-        results = self._rrf_merge(dense, self._dedup(fts_results), top_k=k)
         logger.info(
             "schema retrieval",
             count=len(results),
             db_cols=len(db_col_results),
             db_tables=len(db_tbl_results),
             tabular=len(tabular_results),

             d.get("sheet_name"),
         )
     def _dedup(self, results: list[RetrievalResult]) -> list[RetrievalResult]:
         """Deduplicate by chunk identity, keeping highest score per unique key."""
         seen: dict[tuple, RetrievalResult] = {}
                 seen[key] = r
         return sorted(seen.values(), key=lambda r: r.score, reverse=True)
+    def _rank_db_tables(
+        self,
+        tbl_results: list[RetrievalResult],
+        col_results: list[RetrievalResult],
+        fts_results: list[RetrievalResult],
+        top_k: int,
+        k_rrf: int = 60,
+    ) -> list[RetrievalResult]:
+        """Rank DB tables by RRF across three legs:
+          L1 (primary): table-summary chunk similarity
+          L2 (vote):    best column-chunk position per table
+          L3 (vote):    best FTS position per table
+        Returns top-k table-chunk RetrievalResults. For tables surfaced by
+        L2/L3 but missing a table chunk, a minimal stub is returned so that
+        db_executor._fetch_full_schema can seed off data.table_name.
+        """
+        # L1: tables ranked by table-chunk cosine score
+        tbl_index: dict[str, RetrievalResult] = {}
+        tbl_ranked: list[str] = []
+        for r in tbl_results:
+            tname = r.metadata.get("data", {}).get("table_name")
+            if tname and tname not in tbl_index:
+                tbl_index[tname] = r
+                tbl_ranked.append(tname)
+        # L2: tables ranked by first-appearance in column-chunk list (best col score)
+        col_table_ranked: list[str] = []
+        seen: set[str] = set()
+        for r in col_results:
+            tname = r.metadata.get("data", {}).get("table_name")
+            if tname and tname not in seen:
+                col_table_ranked.append(tname)
+                seen.add(tname)
+        # L3: tables ranked by first-appearance in FTS list
+        fts_table_ranked: list[str] = []
+        seen = set()
+        for r in fts_results:
+            tname = r.metadata.get("data", {}).get("table_name")
+            if tname and tname not in seen:
+                fts_table_ranked.append(tname)
+                seen.add(tname)
+        # RRF over table names across the three legs
+        rrf_scores: dict[str, float] = {}
+        for ranked_list in [tbl_ranked, col_table_ranked, fts_table_ranked]:
+            for rank, tname in enumerate(ranked_list):
+                rrf_scores[tname] = rrf_scores.get(tname, 0.0) + 1.0 / (k_rrf + rank + 1)
+        top_tables = sorted(rrf_scores, key=lambda t: rrf_scores[t], reverse=True)[:top_k]
+        results: list[RetrievalResult] = []
+        for tname in top_tables:
+            if tname in tbl_index:
+                r = tbl_index[tname]
+                r.score = rrf_scores[tname]
+                results.append(r)
+            else:
+                # Surfaced by column/FTS votes with no table chunk — minimal stub
+                results.append(RetrievalResult(
+                    content=f"Table: {tname}",
+                    metadata={"data": {"table_name": tname}, "source_type": "database"},
+                    score=rrf_scores[tname],
+                    source_type="database",
+                ))
+        return results
     # ------------------------------------------------------------------
     # Public interface — called by the router
     # ------------------------------------------------------------------
     async def retrieve(self, query: str, user_id: str, k: int = 5) -> list[RetrievalResult]:
+        """Table-first retrieval for DB sources; chunk-level for tabular.
+        DB tables are ranked via RRF across three legs:
+          L1 (primary): table-summary chunk similarity
+          L2 (vote): top-K column-chunk cosine, grouped by table
+          L3 (vote): top-K FTS column hits, grouped by table
+        db_executor downstream fetches the full per-column schema for the
+        ranked table set via _fetch_full_schema — the column chunks returned
+        here are intentionally NOT used as the schema source, only for voting.
+        Tabular (CSV/XLSX) chunks remain at column/sheet level since they have
+        no table-level chunks.
+        """
         embedding = await self._embed_query(query)
         db_col_results, db_tbl_results, tabular_results, fts_results = await asyncio.gather(
             self._search_db(embedding, user_id, k),
             self._search_tabular(embedding, user_id, k),
             self._search_fts_db(query, user_id, k * 4),
         )
+        db_ranked = self._rank_db_tables(db_tbl_results, db_col_results, fts_results, top_k=k)
+        tabular_final = self._dedup(tabular_results)[:k]
+        results = db_ranked + tabular_final
         logger.info(
             "schema retrieval",
             count=len(results),
+            db_tables_ranked=len(db_ranked),
             db_cols=len(db_col_results),
             db_tables=len(db_tbl_results),
             tabular=len(tabular_results),