Spaces:

DataEyond
/

Agentic-Service-Data-Eyond

Sleeping

App Files Files Community

Rifqi Hafizuddin commited on 13 days ago

Commit

de32ab0

1 Parent(s): c9d3b33

[NOTICKET] rrf merge now at router level

Browse files

Files changed (1) hide show

src/rag/router.py +119 -19

src/rag/router.py CHANGED Viewed

@@ -1,8 +1,14 @@
-"""Routes retrieval requests to the appropriate retriever based on source_hint."""
 import asyncio
 import hashlib
 import json
 from typing import Literal
 from src.db.redis.connection import get_redis
@@ -12,9 +18,73 @@ from src.rag.base import BaseRetriever, RetrievalResult
 logger = get_logger("retrieval_router")
 _CACHE_TTL = 3600  # 1 hour
 SourceHint = Literal["document", "schema", "both"]
 class RetrievalRouter:
     def __init__(
         self,
@@ -26,12 +96,12 @@ class RetrievalRouter:
             "document": document_retriever,
         }
-    def _route(self, source_hint: SourceHint) -> list[BaseRetriever]:
         if source_hint == "schema":
-            return [self._retrievers["schema"]]
         if source_hint == "document":
-            return [self._retrievers["document"]]
-        return list(self._retrievers.values())
     async def retrieve(
         self,
@@ -42,7 +112,7 @@ class RetrievalRouter:
     ) -> list[RetrievalResult]:
         redis = await get_redis()
         query_hash = hashlib.md5(query.encode()).hexdigest()
-        cache_key = f"retrieval:{user_id}:{source_hint}:{query_hash}:{k}"
         cached = await redis.get(cache_key)
         if cached:
@@ -50,26 +120,56 @@ class RetrievalRouter:
             raw = json.loads(cached)
             return [RetrievalResult(**r) for r in raw]
-        retrievers = self._route(source_hint)
         batches = await asyncio.gather(
-            *[r.retrieve(query, user_id, k) for r in retrievers],
             return_exceptions=True,
         )
-        results: list[RetrievalResult] = []
-        for batch in batches:
             if isinstance(batch, Exception):
-                logger.error("retriever failed", error=str(batch))
                 continue
-            results.extend(batch)
-        results.sort(key=lambda r: r.score, reverse=True)
-        results = results[:k]
-        logger.info("retrieved chunks", count=len(results), source_hint=source_hint)
-        await redis.setex(
-            cache_key,
-            _CACHE_TTL,
-            json.dumps([vars(r) for r in results]),
         )
         return results

+"""Routes retrieval requests to the appropriate retriever based on source_hint.
+Cross-retriever merging uses Reciprocal Rank Fusion (RRF) on per-retriever
+ranked lists — score scales differ across retrievers (RRF, cosine, distance)
+and aren't directly comparable, so we rank-merge instead of score-merge.
+"""
 import asyncio
 import hashlib
 import json
+from dataclasses import asdict
 from typing import Literal
 from src.db.redis.connection import get_redis
 logger = get_logger("retrieval_router")
 _CACHE_TTL = 3600  # 1 hour
+_CACHE_KEY_PREFIX = "retrieval"
+_RRF_K = 60  # standard RRF constant
 SourceHint = Literal["document", "schema", "both"]
+def _result_dedup_key(r: RetrievalResult) -> tuple:
+    """Cross-retriever dedup key — distinguishes DB columns vs DB tables vs
+    tabular columns vs prose chunks vs sheet-level (future)."""
+    data = r.metadata.get("data", {})
+    return (
+        r.source_type,
+        data.get("table_name"),
+        data.get("column_name"),
+        data.get("filename"),
+        data.get("sheet_name"),
+        data.get("chunk_index"),  # disambiguates multiple prose chunks per doc
+    )
+def _rrf_merge(
+    ranked_lists: list[list[RetrievalResult]],
+    top_k: int,
+    k_rrf: int = _RRF_K,
+) -> list[RetrievalResult]:
+    """Reciprocal Rank Fusion across retriever batches.
+    Each input list is treated as already best-first ordered. Items are
+    deduped via _result_dedup_key and re-ranked by aggregated reciprocal
+    rank across all lists. Score on the returned RetrievalResult is the
+    aggregated RRF score (uniform scale across legs).
+    """
+    scores: dict[tuple, float] = {}
+    index: dict[tuple, RetrievalResult] = {}
+    for ranked in ranked_lists:
+        for rank, result in enumerate(ranked):
+            key = _result_dedup_key(result)
+            scores[key] = scores.get(key, 0.0) + 1.0 / (k_rrf + rank + 1)
+            # Keep the first occurrence; metadata is identical for the same
+            # key across lists, so any copy is fine.
+            if key not in index:
+                index[key] = result
+    merged = sorted(index.values(), key=lambda r: scores[_result_dedup_key(r)], reverse=True)
+    # Overwrite score with RRF score so downstream consumers see a uniform scale.
+    for r in merged:
+        r.score = scores[_result_dedup_key(r)]
+    return merged[:top_k]
+async def invalidate_retrieval_cache(user_id: str) -> int:
+    """Delete every cached retrieval entry for `user_id`.
+    Called by ingest/upload/delete API handlers after a successful write so
+    the next retrieval picks up the new data instead of stale cached top-k.
+    Returns the number of keys removed.
+    """
+    redis = await get_redis()
+    pattern = f"{_CACHE_KEY_PREFIX}:{user_id}:*"
+    keys = [key async for key in redis.scan_iter(match=pattern)]
+    if not keys:
+        return 0
+    deleted = await redis.delete(*keys)
+    logger.info("retrieval cache invalidated", user_id=user_id, deleted=deleted)
+    return int(deleted)
 class RetrievalRouter:
     def __init__(
         self,
             "document": document_retriever,
         }
+    def _route(self, source_hint: SourceHint) -> list[tuple[str, BaseRetriever]]:
         if source_hint == "schema":
+            return [("schema", self._retrievers["schema"])]
         if source_hint == "document":
+            return [("document", self._retrievers["document"])]
+        return list(self._retrievers.items())
     async def retrieve(
         self,
     ) -> list[RetrievalResult]:
         redis = await get_redis()
         query_hash = hashlib.md5(query.encode()).hexdigest()
+        cache_key = f"{_CACHE_KEY_PREFIX}:{user_id}:{source_hint}:{query_hash}:{k}"
         cached = await redis.get(cache_key)
         if cached:
             raw = json.loads(cached)
             return [RetrievalResult(**r) for r in raw]
+        results = await self._retrieve_uncached(query, user_id, source_hint, k)
+        # Empty-result fallback: orchestrator may have misclassified intent.
+        # Retry once with "both" before giving up. No-op when source_hint is
+        # already "both".
+        if not results and source_hint != "both":
+            logger.warning(
+                "empty retrieval, falling back to source_hint='both'",
+                original_source_hint=source_hint,
+            )
+            results = await self._retrieve_uncached(query, user_id, "both", k)
+        await redis.setex(
+            cache_key,
+            _CACHE_TTL,
+            json.dumps([asdict(r) for r in results]),
+        )
+        return results
+    async def _retrieve_uncached(
+        self,
+        query: str,
+        user_id: str,
+        source_hint: SourceHint,
+        k: int,
+    ) -> list[RetrievalResult]:
+        routed = self._route(source_hint)
         batches = await asyncio.gather(
+            *[r.retrieve(query, user_id, k) for _, r in routed],
             return_exceptions=True,
         )
+        valid_lists: list[list[RetrievalResult]] = []
+        per_retriever: dict[str, int | str] = {}
+        for (name, _), batch in zip(routed, batches):
             if isinstance(batch, Exception):
+                logger.error("retriever failed", retriever=name, error=str(batch))
+                per_retriever[name] = "error"
                 continue
+            valid_lists.append(batch)
+            per_retriever[name] = len(batch)
+        results = _rrf_merge(valid_lists, top_k=k)
+        logger.info(
+            "router result",
+            source_hint=source_hint,
+            per_retriever=per_retriever,
+            final_count=len(results),
+            top_score=results[0].score if results else None,
+            bottom_score=results[-1].score if results else None,
         )
         return results