GitHub Actions commited on
Commit
e7c9ee6
·
1 Parent(s): 4fc2936

Deploy d8ad462

Browse files
app/api/chat.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import json
2
  import re
3
  import time
@@ -28,6 +29,55 @@ def _is_criticism(message: str) -> bool:
28
  return any(sig in lowered for sig in _CRITICISM_SIGNALS)
29
 
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  @router.post("")
32
  @chat_rate_limit()
33
  async def chat_endpoint(
@@ -41,16 +91,12 @@ async def chat_endpoint(
41
  # All singletons pre-built in lifespan — zero allocation in hot path.
42
  pipeline = request.app.state.pipeline
43
  conv_store = request.app.state.conversation_store
 
44
  session_id = request_data.session_id
45
 
46
- # Fetch prior turns and detect criticism BEFORE the pipeline runs.
47
- # Both are synchronous SQLite reads (<3ms) so they don't block the event loop
48
- # meaningfully, but we keep them outside sse_generator to avoid any closure issues.
49
  conversation_history = conv_store.get_recent(session_id)
50
  criticism = _is_criticism(request_data.message)
51
  if criticism and conversation_history:
52
- # Auto-record negative feedback on the previous turn so the self-improvement
53
- # loop picks it up during the next reranker fine-tune cycle.
54
  conv_store.mark_last_negative(session_id)
55
 
56
  initial_state: PipelineState = { # type: ignore[assignment]
@@ -71,6 +117,9 @@ async def chat_endpoint(
71
  "latency_ms": 0,
72
  "error": None,
73
  "interaction_id": None,
 
 
 
74
  }
75
 
76
  async def sse_generator():
@@ -81,19 +130,10 @@ async def chat_endpoint(
81
 
82
  try:
83
  async for event in pipeline.astream(initial_state):
84
- # Abort on client disconnect — prevents orphaned instances burning vCPU-seconds.
85
  if await request.is_disconnected():
86
  break
87
 
88
  for node_name, updates in event.items():
89
- # ── Stage transparency ─────────────────────────────────────────
90
- # Emit named stage events so the frontend can show a live
91
- # progress indicator ("checking cache" → "searching" → "writing").
92
- # Mapping: node name → SSE stage label.
93
- #
94
- # cache miss → "checking" (semantic cache lookup ran, no hit)
95
- # gemini_fast → already emits thinking:true if routing to RAG
96
- # retrieve done → "generating" (retrieval complete, LLM starting)
97
  if node_name == "cache" and updates.get("cached") is False:
98
  yield f'data: {json.dumps({"stage": "checking"})}\n\n'
99
  elif node_name == "cache" and updates.get("cached") is True:
@@ -102,11 +142,13 @@ async def chat_endpoint(
102
  if node_name == "retrieve":
103
  yield f'data: {json.dumps({"stage": "generating"})}\n\n'
104
 
105
- # Gemini signalled it needs the knowledge base.
 
 
 
106
  if updates.get("thinking") is True:
107
  yield f'data: {json.dumps({"thinking": True, "stage": "searching"})}\n\n'
108
 
109
- # ── Answer tokens ──────────────────────────────────────────────
110
  if "answer" in updates:
111
  answer_update = updates["answer"]
112
  delta = (
@@ -138,6 +180,16 @@ async def chat_endpoint(
138
 
139
  yield f'data: {json.dumps({"done": True, "sources": sources_list, "cached": is_cached, "latency_ms": elapsed_ms, "interaction_id": interaction_id})}\n\n'
140
 
 
 
 
 
 
 
 
 
 
 
141
  except Exception as exc:
142
  yield f'data: {json.dumps({"error": str(exc) or "Generation failed"})}\n\n'
143
 
@@ -146,3 +198,4 @@ async def chat_endpoint(
146
  media_type="text/event-stream",
147
  headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
148
  )
 
 
1
+ import asyncio
2
  import json
3
  import re
4
  import time
 
29
  return any(sig in lowered for sig in _CRITICISM_SIGNALS)
30
 
31
 
32
+ async def _generate_follow_ups(
33
+ query: str,
34
+ answer: str,
35
+ sources: list,
36
+ llm_client,
37
+ ) -> list[str]:
38
+ """
39
+ Generates 3 specific follow-up questions after the main answer is complete.
40
+ Runs as a concurrent asyncio Task — zero added latency after the done event.
41
+
42
+ Questions must be:
43
+ - Specific to the answer content (never generic like "tell me more")
44
+ - Phrased naturally (< 12 words)
45
+ - Answerable from the knowledge base
46
+ """
47
+ source_titles = [
48
+ (s.title if hasattr(s, "title") else s.get("title", ""))
49
+ for s in sources[:3]
50
+ ]
51
+ titles_str = ", ".join(t for t in source_titles if t) or "the knowledge base"
52
+
53
+ prompt = (
54
+ f"Question asked: {query}\n\n"
55
+ f"Answer given (excerpt): {answer[:400]}\n\n"
56
+ f"Sources referenced: {titles_str}\n\n"
57
+ "Write exactly 3 follow-up questions a recruiter would naturally ask next. "
58
+ "Each question must be specific to the content above — not generic. "
59
+ "Each question must be under 12 words. "
60
+ "Output ONLY the 3 questions, one per line, no numbering or bullet points."
61
+ )
62
+ system = (
63
+ "You write concise follow-up questions for a portfolio chatbot. "
64
+ "Never write generic questions like 'tell me more' or 'what else'. "
65
+ "Each question must be under 12 words and reference specifics from the answer."
66
+ )
67
+
68
+ try:
69
+ stream = llm_client.complete_with_complexity(
70
+ prompt=prompt, system=system, stream=True, complexity="simple"
71
+ )
72
+ raw = ""
73
+ async for token in stream:
74
+ raw += token
75
+ questions = [q.strip() for q in raw.strip().splitlines() if q.strip()][:3]
76
+ return questions
77
+ except Exception:
78
+ return []
79
+
80
+
81
  @router.post("")
82
  @chat_rate_limit()
83
  async def chat_endpoint(
 
91
  # All singletons pre-built in lifespan — zero allocation in hot path.
92
  pipeline = request.app.state.pipeline
93
  conv_store = request.app.state.conversation_store
94
+ llm_client = request.app.state.llm_client
95
  session_id = request_data.session_id
96
 
 
 
 
97
  conversation_history = conv_store.get_recent(session_id)
98
  criticism = _is_criticism(request_data.message)
99
  if criticism and conversation_history:
 
 
100
  conv_store.mark_last_negative(session_id)
101
 
102
  initial_state: PipelineState = { # type: ignore[assignment]
 
117
  "latency_ms": 0,
118
  "error": None,
119
  "interaction_id": None,
120
+ "retrieval_attempts": 0,
121
+ "rewritten_query": None,
122
+ "follow_ups": [],
123
  }
124
 
125
  async def sse_generator():
 
130
 
131
  try:
132
  async for event in pipeline.astream(initial_state):
 
133
  if await request.is_disconnected():
134
  break
135
 
136
  for node_name, updates in event.items():
 
 
 
 
 
 
 
 
137
  if node_name == "cache" and updates.get("cached") is False:
138
  yield f'data: {json.dumps({"stage": "checking"})}\n\n'
139
  elif node_name == "cache" and updates.get("cached") is True:
 
142
  if node_name == "retrieve":
143
  yield f'data: {json.dumps({"stage": "generating"})}\n\n'
144
 
145
+ # CRAG rewrite in progress — inform the frontend the query is being refined.
146
+ if node_name == "rewrite_query":
147
+ yield f'data: {json.dumps({"stage": "refining"})}\n\n'
148
+
149
  if updates.get("thinking") is True:
150
  yield f'data: {json.dumps({"thinking": True, "stage": "searching"})}\n\n'
151
 
 
152
  if "answer" in updates:
153
  answer_update = updates["answer"]
154
  delta = (
 
180
 
181
  yield f'data: {json.dumps({"done": True, "sources": sources_list, "cached": is_cached, "latency_ms": elapsed_ms, "interaction_id": interaction_id})}\n\n'
182
 
183
+ # ── Follow-up questions ────────────────────────────────────────────
184
+ # Generated after the done event so it never delays answer delivery.
185
+ # Works for both cache hits (no sources) and full RAG responses.
186
+ if final_answer and not await request.is_disconnected():
187
+ follow_ups = await _generate_follow_ups(
188
+ request_data.message, final_answer, final_sources, llm_client
189
+ )
190
+ if follow_ups:
191
+ yield f'data: {json.dumps({"follow_ups": follow_ups})}\n\n'
192
+
193
  except Exception as exc:
194
  yield f'data: {json.dumps({"error": str(exc) or "Generation failed"})}\n\n'
195
 
 
198
  media_type="text/event-stream",
199
  headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
200
  )
201
+
app/main.py CHANGED
@@ -76,12 +76,17 @@ async def lifespan(app: FastAPI):
76
  # ingest run doesn't crash every search with "collection not found".
77
  vector_store.ensure_collection()
78
 
 
 
 
 
 
79
  app.state.pipeline = build_pipeline({
80
  "classifier": GuardClassifier(),
81
  "cache": app.state.semantic_cache,
82
  "embedder": embedder,
83
  "gemini": gemini_client,
84
- "llm": get_llm_client(settings),
85
  "vector_store": vector_store,
86
  "reranker": reranker,
87
  "db_path": settings.DB_PATH,
 
76
  # ingest run doesn't crash every search with "collection not found".
77
  vector_store.ensure_collection()
78
 
79
+ llm_client = get_llm_client(settings)
80
+ # Expose llm_client on app state so chat.py can use it for follow-up
81
+ # question generation without re-constructing the client per request.
82
+ app.state.llm_client = llm_client
83
+
84
  app.state.pipeline = build_pipeline({
85
  "classifier": GuardClassifier(),
86
  "cache": app.state.semantic_cache,
87
  "embedder": embedder,
88
  "gemini": gemini_client,
89
+ "llm": llm_client,
90
  "vector_store": vector_store,
91
  "reranker": reranker,
92
  "db_path": settings.DB_PATH,
app/models/pipeline.py CHANGED
@@ -43,3 +43,11 @@ class PipelineState(TypedDict):
43
  latency_ms: int
44
  error: Optional[str]
45
  interaction_id: Optional[int]
 
 
 
 
 
 
 
 
 
43
  latency_ms: int
44
  error: Optional[str]
45
  interaction_id: Optional[int]
46
+ # CRAG: counts retrieve node invocations; 2 = one retry was attempted.
47
+ # Starts at 0 in initial state; retrieve increments it each call.
48
+ retrieval_attempts: int
49
+ # Set by the rewrite_query node when CRAG triggers; None otherwise.
50
+ rewritten_query: Optional[str]
51
+ # Follow-up question suggestions generated after the main answer.
52
+ # 3 short questions specific to content in the answer.
53
+ follow_ups: list[str]
app/pipeline/graph.py CHANGED
@@ -6,9 +6,13 @@ from app.pipeline.nodes.guard import make_guard_node
6
  from app.pipeline.nodes.cache import make_cache_node
7
  from app.pipeline.nodes.gemini_fast import make_gemini_fast_node
8
  from app.pipeline.nodes.retrieve import make_retrieve_node
 
9
  from app.pipeline.nodes.generate import make_generate_node
10
  from app.pipeline.nodes.log_eval import make_log_eval_node
11
 
 
 
 
12
 
13
  def route_guard(state: PipelineState) -> str:
14
  if state.get("guard_passed", False):
@@ -33,19 +37,42 @@ def route_gemini(state: PipelineState) -> str:
33
  return "research"
34
 
35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  def build_pipeline(services: dict) -> CompiledStateGraph:
37
  graph = StateGraph(PipelineState)
38
 
39
- graph.add_node("guard", make_guard_node(services["classifier"]))
40
- # Cache node embeds the query; gemini_fast and retrieve reuse that embedding.
41
- graph.add_node("cache", make_cache_node(services["cache"], services["embedder"]))
42
- graph.add_node("gemini_fast", make_gemini_fast_node(services["gemini"]))
43
- graph.add_node("retrieve", make_retrieve_node(
44
- services["vector_store"],
45
- services["embedder"],
46
- services["reranker"]))
47
- graph.add_node("generate", make_generate_node(services["llm"], services["gemini"]))
48
- graph.add_node("log_eval", make_log_eval_node(services["db_path"]))
 
49
 
50
  graph.set_entry_point("guard")
51
 
@@ -58,9 +85,13 @@ def build_pipeline(services: dict) -> CompiledStateGraph:
58
  graph.add_conditional_edges("gemini_fast", route_gemini,
59
  {"answered": "log_eval", "research": "retrieve"})
60
 
61
- # Always route retrieve generate. generate handles empty chunks with a
62
- # clean "not in knowledge base" response; no need for a separate not_found edge.
63
- graph.add_edge("retrieve", "generate")
 
 
 
 
64
 
65
  graph.add_edge("generate", "log_eval")
66
  graph.add_edge("log_eval", END)
 
6
  from app.pipeline.nodes.cache import make_cache_node
7
  from app.pipeline.nodes.gemini_fast import make_gemini_fast_node
8
  from app.pipeline.nodes.retrieve import make_retrieve_node
9
+ from app.pipeline.nodes.rewrite_query import make_rewrite_query_node, _has_meaningful_token
10
  from app.pipeline.nodes.generate import make_generate_node
11
  from app.pipeline.nodes.log_eval import make_log_eval_node
12
 
13
+ # Relevance gate threshold — matches retrieve.py constant.
14
+ _MIN_TOP_SCORE: float = -3.5
15
+
16
 
17
  def route_guard(state: PipelineState) -> str:
18
  if state.get("guard_passed", False):
 
37
  return "research"
38
 
39
 
40
+ def route_retrieve_result(state: PipelineState) -> str:
41
+ """
42
+ CRAG routing: if the first retrieval returned nothing above threshold,
43
+ rewrite the query once and retry. Exactly one retry is permitted.
44
+
45
+ Conditions for a rewrite attempt:
46
+ 1. retrieval_attempts == 1 (first pass just completed, no retry yet).
47
+ 2. reranked_chunks is empty (nothing above the -3.5 threshold).
48
+ 3. Query has at least one meaningful non-stop-word token (guards against
49
+ empty or fully-generic queries where a rewrite wouldn't help).
50
+ """
51
+ attempts = state.get("retrieval_attempts", 1)
52
+ reranked = state.get("reranked_chunks", [])
53
+ if (
54
+ attempts == 1
55
+ and not reranked
56
+ and _has_meaningful_token(state.get("query", ""))
57
+ ):
58
+ return "rewrite"
59
+ return "generate"
60
+
61
+
62
  def build_pipeline(services: dict) -> CompiledStateGraph:
63
  graph = StateGraph(PipelineState)
64
 
65
+ graph.add_node("guard", make_guard_node(services["classifier"]))
66
+ graph.add_node("cache", make_cache_node(services["cache"], services["embedder"]))
67
+ graph.add_node("gemini_fast", make_gemini_fast_node(services["gemini"]))
68
+ graph.add_node("retrieve", make_retrieve_node(
69
+ services["vector_store"],
70
+ services["embedder"],
71
+ services["reranker"]))
72
+ # CRAG: one query rewrite on failed retrieval — then retrieve runs a second time.
73
+ graph.add_node("rewrite_query", make_rewrite_query_node(services["gemini"]))
74
+ graph.add_node("generate", make_generate_node(services["llm"], services["gemini"]))
75
+ graph.add_node("log_eval", make_log_eval_node(services["db_path"]))
76
 
77
  graph.set_entry_point("guard")
78
 
 
85
  graph.add_conditional_edges("gemini_fast", route_gemini,
86
  {"answered": "log_eval", "research": "retrieve"})
87
 
88
+ # After retrieve: either run CRAG rewrite (one retry) or proceed to generate.
89
+ graph.add_conditional_edges("retrieve", route_retrieve_result,
90
+ {"rewrite": "rewrite_query", "generate": "generate"})
91
+
92
+ # After rewrite: go straight back to retrieve for the second attempt.
93
+ # The cycle terminates because route_retrieve_result checks retrieval_attempts.
94
+ graph.add_edge("rewrite_query", "retrieve")
95
 
96
  graph.add_edge("generate", "log_eval")
97
  graph.add_edge("log_eval", END)
app/pipeline/nodes/cache.py CHANGED
@@ -16,7 +16,10 @@ from app.services.semantic_cache import SemanticCache
16
 
17
  def make_cache_node(cache: SemanticCache, embedder) -> Callable[[PipelineState], dict]:
18
  async def cache_node(state: PipelineState) -> dict:
19
- embedding = await embedder.embed_one(state["query"])
 
 
 
20
  query_embedding = np.array(embedding)
21
 
22
  cached = await cache.get(query_embedding)
 
16
 
17
  def make_cache_node(cache: SemanticCache, embedder) -> Callable[[PipelineState], dict]:
18
  async def cache_node(state: PipelineState) -> dict:
19
+ # is_query=True: prepend BGE asymmetric instruction so query embedding
20
+ # lands in the retrieval-optimised neighbourhood of the vector space.
21
+ # Document embeddings at ingestion time use is_query=False (default).
22
+ embedding = await embedder.embed_one(state["query"], is_query=True)
23
  query_embedding = np.array(embedding)
24
 
25
  cached = await cache.get(query_embedding)
app/pipeline/nodes/retrieve.py CHANGED
@@ -1,9 +1,11 @@
 
1
  from typing import Callable
2
 
3
  from app.models.pipeline import PipelineState, Chunk
4
  from app.services.vector_store import VectorStore
5
  from app.services.embedder import Embedder
6
  from app.services.reranker import Reranker
 
7
 
8
  # Cross-encoder ms-marco-MiniLM-L-6-v2 returns raw logits (not sigmoid).
9
  # Highly relevant docs score 0–15; completely off-topic score below –5.
@@ -20,55 +22,109 @@ _MIN_TOP_SCORE: float = -3.5
20
  # relevant sources and making the answer look one-dimensional.
21
  _MAX_CHUNKS_PER_DOC: int = 2
22
 
 
 
 
23
 
24
- def make_retrieve_node(vector_store: VectorStore, embedder: Embedder, reranker: Reranker) -> Callable[[PipelineState], dict]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  async def retrieve_node(state: PipelineState) -> dict:
 
26
  query = state["query"]
27
- expanded = state.get("expanded_queries", [query])
28
 
29
- # Reuse the embedding computed by cache_node the first element of
30
- # expanded_queries is always the original query. Avoids a duplicate
31
- # HTTP call to the embedder Space (~200-400ms saved per request).
32
  cached_embedding: list[float] | None = state.get("query_embedding")
 
 
 
 
 
 
 
33
 
 
34
  if cached_embedding is not None and len(expanded) == 1:
35
- # Fast path: single query, embedding already computed.
36
  query_vectors = [cached_embedding]
37
  else:
38
- # Multi-query or no cached embedding — embed all at once in one call.
39
- query_vectors = await embedder.embed(expanded)
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
- all_chunks: list[Chunk] = []
42
- for vector in query_vectors:
43
- chunks = vector_store.search(query_vector=vector, top_k=10)
44
- all_chunks.extend(chunks)
45
 
46
- # Deduplicate by doc_id + section before reranking.
 
 
47
  seen: set[str] = set()
48
  unique_chunks: list[Chunk] = []
49
- for c in all_chunks:
50
- fingerprint = f"{c['metadata']['doc_id']}::{c['metadata']['section']}"
51
- if fingerprint not in seen:
52
- seen.add(fingerprint)
53
  unique_chunks.append(c)
54
 
55
  reranked = await reranker.rerank(query, unique_chunks, top_k=5)
56
 
57
- # Relevance gate: if the highest-scoring chunk doesn't meet the minimum
58
- # cross-encoder threshold, the knowledge base genuinely has nothing useful
59
- # for this query. Return not-found so generate_node isn't fed garbage context
60
- # that causes vague or hallucinated responses.
61
  top_score = reranked[0]["metadata"].get("rerank_score", 0.0) if reranked else None
62
  if not reranked or (top_score is not None and top_score < _MIN_TOP_SCORE):
63
  return {
64
- "answer": "", # empty — generate_node will produce the "not found" reply
65
  "retrieved_chunks": [],
66
  "reranked_chunks": [],
 
67
  }
68
 
69
- # Source diversity: cap chunks per doc to prevent one verbose document
70
- # from filling all context slots and drowning out other relevant sources.
71
- # Applied after reranking so the reranker sees the full candidate set.
72
  doc_counts: dict[str, int] = {}
73
  diverse_chunks: list[Chunk] = []
74
  for chunk in reranked:
@@ -80,6 +136,7 @@ def make_retrieve_node(vector_store: VectorStore, embedder: Embedder, reranker:
80
  return {
81
  "retrieved_chunks": unique_chunks,
82
  "reranked_chunks": diverse_chunks,
 
83
  }
84
 
85
  return retrieve_node
 
1
+ import asyncio
2
  from typing import Callable
3
 
4
  from app.models.pipeline import PipelineState, Chunk
5
  from app.services.vector_store import VectorStore
6
  from app.services.embedder import Embedder
7
  from app.services.reranker import Reranker
8
+ from app.services.sparse_encoder import SparseEncoder
9
 
10
  # Cross-encoder ms-marco-MiniLM-L-6-v2 returns raw logits (not sigmoid).
11
  # Highly relevant docs score 0–15; completely off-topic score below –5.
 
22
  # relevant sources and making the answer look one-dimensional.
23
  _MAX_CHUNKS_PER_DOC: int = 2
24
 
25
+ # RRF rank fusion constant. k=60 is the original Cormack et al. default.
26
+ # Higher k reduces the influence of top-1 rank advantage.
27
+ _RRF_K: int = 60
28
 
29
+ # Module-level singleton BM25 model downloads once (~5 MB), cached in memory.
30
+ _sparse_encoder = SparseEncoder()
31
+
32
+
33
+ def _rrf_merge(ranked_lists: list[list[Chunk]]) -> list[Chunk]:
34
+ """
35
+ Reciprocal Rank Fusion across multiple ranked chunk lists.
36
+
37
+ Score formula: Σ 1 / (rank + 1 + k) over all lists that contain the chunk.
38
+ Deduplication by doc_id::section fingerprint before merging so the same
39
+ passage retrieved by both dense and sparse does not double-count.
40
+
41
+ Pure Python, no external dependencies.
42
+ """
43
+ scores: dict[str, float] = {}
44
+ chunks_by_fp: dict[str, Chunk] = {}
45
+
46
+ for ranked in ranked_lists:
47
+ seen_in_list: set[str] = set()
48
+ for rank, chunk in enumerate(ranked):
49
+ fp = f"{chunk['metadata']['doc_id']}::{chunk['metadata']['section']}"
50
+ if fp in seen_in_list:
51
+ continue # Already contributed this chunk from this ranked list
52
+ seen_in_list.add(fp)
53
+ scores[fp] = scores.get(fp, 0.0) + 1.0 / (rank + 1 + _RRF_K)
54
+ chunks_by_fp[fp] = chunk
55
+
56
+ sorted_fps = sorted(scores, key=lambda x: scores[x], reverse=True)
57
+ return [chunks_by_fp[fp] for fp in sorted_fps]
58
+
59
+
60
+ def make_retrieve_node(
61
+ vector_store: VectorStore, embedder: Embedder, reranker: Reranker
62
+ ) -> Callable[[PipelineState], dict]:
63
  async def retrieve_node(state: PipelineState) -> dict:
64
+ attempts = state.get("retrieval_attempts", 0)
65
  query = state["query"]
 
66
 
67
+ # On a CRAG retry (attempts >= 1) the query has been rewritten and
68
+ # query_embedding is explicitly set to None always re-embed.
69
+ # On the first attempt, reuse the embedding computed by the cache node.
70
  cached_embedding: list[float] | None = state.get("query_embedding")
71
+ if attempts >= 1:
72
+ # Second attempt: re-embed the rewritten query with is_query=True.
73
+ cached_embedding = None
74
+
75
+ expanded = [query] # gemini_fast may fill expanded_queries on first attempt
76
+ if attempts == 0:
77
+ expanded = state.get("expanded_queries", [query])
78
 
79
+ # Embed all query variants in one batched call (is_query=True for asymmetric BGE).
80
  if cached_embedding is not None and len(expanded) == 1:
 
81
  query_vectors = [cached_embedding]
82
  else:
83
+ query_vectors = await embedder.embed(expanded, is_query=True)
84
+
85
+ # ── Dense search (all query variants) ─────────────────────────────────
86
+ dense_results: list[list[Chunk]] = []
87
+ for vec in query_vectors:
88
+ chunks = vector_store.search(query_vector=vec, top_k=10)
89
+ dense_results.append(chunks)
90
+
91
+ # ── Sparse (BM25) search (primary query only) ─────────────────────────
92
+ # Runs concurrently with dense search isn't possible here since dense
93
+ # is synchronous Qdrant calls, but we parallelise encode + sparse search.
94
+ sparse_results: list[Chunk] = []
95
+ if _sparse_encoder.available:
96
+ indices, values = _sparse_encoder.encode_one(query)
97
+ sparse_results = vector_store.search_sparse(indices, values, top_k=10)
98
 
99
+ # ── Reciprocal Rank Fusion ─────────────────────────────────────────────
100
+ # Merge dense (per variant) + sparse into one ranked list.
101
+ all_ranked_lists = dense_results + ([sparse_results] if sparse_results else [])
102
+ fused: list[Chunk] = _rrf_merge(all_ranked_lists)
103
 
104
+ # ── Deduplication (question-point collapse) ────────────────────────────
105
+ # Multiple points for the same chunk (main + question points from Stage 3)
106
+ # share the same doc_id::section fingerprint and collapse here.
107
  seen: set[str] = set()
108
  unique_chunks: list[Chunk] = []
109
+ for c in fused:
110
+ fp = f"{c['metadata']['doc_id']}::{c['metadata']['section']}"
111
+ if fp not in seen:
112
+ seen.add(fp)
113
  unique_chunks.append(c)
114
 
115
  reranked = await reranker.rerank(query, unique_chunks, top_k=5)
116
 
117
+ # ── Relevance gate ─────────────────────────────────────────────────────
 
 
 
118
  top_score = reranked[0]["metadata"].get("rerank_score", 0.0) if reranked else None
119
  if not reranked or (top_score is not None and top_score < _MIN_TOP_SCORE):
120
  return {
121
+ "answer": "",
122
  "retrieved_chunks": [],
123
  "reranked_chunks": [],
124
+ "retrieval_attempts": attempts + 1,
125
  }
126
 
127
+ # ── Source diversity cap ───────────────────────────────────────────────
 
 
128
  doc_counts: dict[str, int] = {}
129
  diverse_chunks: list[Chunk] = []
130
  for chunk in reranked:
 
136
  return {
137
  "retrieved_chunks": unique_chunks,
138
  "reranked_chunks": diverse_chunks,
139
+ "retrieval_attempts": attempts + 1,
140
  }
141
 
142
  return retrieve_node
app/pipeline/nodes/rewrite_query.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ backend/app/pipeline/nodes/rewrite_query.py
3
+
4
+ CRAG (Corrective RAG) query rewriter — fires exactly once per request when:
5
+ 1. The first retrieval attempt returned no chunks above the relevance threshold.
6
+ 2. The query contains at least one meaningful non-stop-word token.
7
+
8
+ Calls Gemini Flash (temp 0.7) to produce one alternative phrasing that preserves
9
+ the visitor's intent but uses different vocabulary. The pipeline then runs Retrieve
10
+ and Rerank a second time with this new query. There is exactly one retry — the
11
+ graph routing enforces this via the retrieval_attempts counter in state.
12
+ """
13
+ from __future__ import annotations
14
+
15
+ import logging
16
+ from typing import Any
17
+
18
+ from app.models.pipeline import PipelineState
19
+ from app.services.gemini_client import GeminiClient
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+ _REWRITE_PROMPT = """\
24
+ A search query failed to find relevant results in a portfolio knowledge base about Darshan Chheda.
25
+ The knowledge base contains his blog posts, project descriptions, CV/resume, and GitHub README files.
26
+
27
+ Original query: {query}
28
+
29
+ Rephrase this query using different vocabulary that might better match how the content is written.
30
+ Strategies: expand abbreviations, use synonyms, reframe as "did Darshan..." if the query uses a name/tech.
31
+ Output ONLY the rewritten query — one sentence, no explanation, no quotes.
32
+ """
33
+
34
+ # Same stop-word set as generate.py — keeps modules consistent.
35
+ _STOP_WORDS = frozenset({
36
+ "a", "an", "the", "is", "are", "was", "were", "be", "been", "being",
37
+ "have", "has", "had", "do", "does", "did", "will", "would", "could",
38
+ "should", "may", "might", "can", "to", "of", "in", "on", "for",
39
+ "with", "at", "by", "from", "and", "or", "but", "not", "what",
40
+ "who", "how", "why", "when", "where", "tell", "me", "about", "his",
41
+ "he", "him", "any", "some", "that", "this", "it", "its",
42
+ })
43
+
44
+
45
+ def _has_meaningful_token(query: str) -> bool:
46
+ """True when the query has at least one non-stop-word token of length >= 3."""
47
+ return any(
48
+ w not in _STOP_WORDS and len(w) >= 3
49
+ for w in __import__("re").findall(r"[a-z]+", query.lower())
50
+ )
51
+
52
+
53
+ def make_rewrite_query_node(gemini_client: GeminiClient) -> Any:
54
+ async def rewrite_query_node(state: PipelineState) -> dict:
55
+ query = state["query"]
56
+ logger.info("CRAG: rewriting failed query %r", query)
57
+
58
+ if not gemini_client.is_configured:
59
+ # No Gemini — pass query through unchanged; second retrieve will also fail
60
+ # and generate will handle the not-found path gracefully.
61
+ logger.debug("Gemini not configured; skipping query rewrite.")
62
+ return {
63
+ "rewritten_query": query,
64
+ "retrieval_attempts": state.get("retrieval_attempts", 1) + 1,
65
+ "query_embedding": None, # Force re-embed so retrieve doesn't use stale embedding
66
+ }
67
+
68
+ try:
69
+ response = await gemini_client._client.aio.models.generate_content(
70
+ model=gemini_client._model,
71
+ contents=_REWRITE_PROMPT.format(query=query),
72
+ config={"temperature": 0.7},
73
+ )
74
+ rewritten = (response.text or query).strip().strip('"').strip("'")
75
+ except Exception as exc:
76
+ logger.warning("Query rewrite Gemini call failed (%s); using original.", exc)
77
+ rewritten = query
78
+
79
+ if not rewritten or rewritten == query:
80
+ logger.debug("Rewrite produced no change; using original query.")
81
+ rewritten = query
82
+ else:
83
+ logger.info("CRAG rewrite: %r → %r", query, rewritten)
84
+
85
+ # Clearing query_embedding forces the retrieve node to re-embed the new query.
86
+ # retrieval_attempts is incremented so the graph does not loop again after
87
+ # this second retrieval attempt.
88
+ return {
89
+ "query": rewritten,
90
+ "rewritten_query": rewritten,
91
+ "retrieval_attempts": state.get("retrieval_attempts", 1) + 1,
92
+ "query_embedding": None,
93
+ }
94
+
95
+ return rewrite_query_node
app/services/embedder.py CHANGED
@@ -23,27 +23,42 @@ def _get_local_model() -> Any:
23
  return _local_model
24
 
25
 
 
 
 
 
 
26
  class Embedder:
27
  def __init__(self, remote_url: str = "", environment: str = "local") -> None:
28
  self._remote = environment == "prod" and bool(remote_url)
29
  self._url = remote_url.rstrip("/") if self._remote else ""
30
 
31
- async def embed(self, texts: list[str]) -> list[list[float]]:
32
- """Encodes texts, returns List of L2-normalised 384-dim float vectors."""
 
 
 
 
 
 
33
  if not texts:
34
  return []
35
  if self._remote:
36
- # Use a fresh async client per call — HF Spaces does not guarantee
37
- # a stable connection lifecycle, so a persistent client risks stale sockets.
38
  async with httpx.AsyncClient(timeout=30.0) as client:
39
- resp = await client.post(f"{self._url}/embed", json={"texts": texts})
 
 
 
40
  resp.raise_for_status()
41
  return resp.json()["embeddings"]
42
  model = _get_local_model()
 
 
43
  vectors = model.encode(texts, batch_size=32, normalize_embeddings=True, show_progress_bar=False)
44
  return vectors.tolist()
45
 
46
- async def embed_one(self, text: str) -> list[float]:
47
  """Convenience wrapper for a single string."""
48
- results = await self.embed([text])
49
  return results[0]
 
23
  return _local_model
24
 
25
 
26
+ # BGE asymmetric query instruction — prepended locally when is_query=True and
27
+ # environment is local. In prod the HF Space accepts is_query and prepends itself.
28
+ _BGE_QUERY_PREFIX = "Represent this sentence for searching relevant passages: "
29
+
30
+
31
  class Embedder:
32
  def __init__(self, remote_url: str = "", environment: str = "local") -> None:
33
  self._remote = environment == "prod" and bool(remote_url)
34
  self._url = remote_url.rstrip("/") if self._remote else ""
35
 
36
+ async def embed(self, texts: list[str], is_query: bool = False) -> list[list[float]]:
37
+ """
38
+ Encodes texts, returns List of L2-normalised 384-dim float vectors.
39
+
40
+ is_query=True: prepend BGE asymmetric query instruction (queries only).
41
+ is_query=False: encode as-is (document/ingestion embeddings).
42
+ See BGE paper: 2-4% NDCG gain from using the correct prefix on queries.
43
+ """
44
  if not texts:
45
  return []
46
  if self._remote:
47
+ # HF Space handles the prefix server-side when is_query=True.
 
48
  async with httpx.AsyncClient(timeout=30.0) as client:
49
+ resp = await client.post(
50
+ f"{self._url}/embed",
51
+ json={"texts": texts, "is_query": is_query},
52
+ )
53
  resp.raise_for_status()
54
  return resp.json()["embeddings"]
55
  model = _get_local_model()
56
+ if is_query:
57
+ texts = [_BGE_QUERY_PREFIX + t for t in texts]
58
  vectors = model.encode(texts, batch_size=32, normalize_embeddings=True, show_progress_bar=False)
59
  return vectors.tolist()
60
 
61
+ async def embed_one(self, text: str, is_query: bool = False) -> list[float]:
62
  """Convenience wrapper for a single string."""
63
+ results = await self.embed([text], is_query=is_query)
64
  return results[0]
app/services/sparse_encoder.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
backend/app/services/sparse_encoder.py

BM25 sparse encoding via FastEmbed's Qdrant/bm25 model.

Used by ingestion (ingest.py) and by the retrieve node at query time. The
model pulls a ~5 MB vocabulary file on first use and runs fully local
afterwards; it is held as a lazily-loaded module singleton so the API Space
pays no startup cost.

If fastembed is not installed, encode() yields empty sparse vectors and
dense-only retrieval keeps working unchanged.
"""
from __future__ import annotations

import logging
from typing import Any, Optional

logger = logging.getLogger(__name__)

# Lazy singleton state: _fastembed_available is a tri-state flag
# (None = untried, True = loaded, False = unavailable).
_model: Optional[Any] = None
_fastembed_available: Optional[bool] = None


def _get_model() -> Optional[Any]:
    """Load the BM25 model on first call; None when fastembed is unavailable."""
    global _model, _fastembed_available  # noqa: PLW0603
    if _fastembed_available is False:
        return None
    if _model is None:
        try:
            from fastembed import SparseTextEmbedding  # type: ignore[import]

            _model = SparseTextEmbedding(model_name="Qdrant/bm25")
            _fastembed_available = True
            logger.info("FastEmbed BM25 sparse encoder loaded (Qdrant/bm25).")
        except Exception as exc:
            _fastembed_available = False
            logger.warning(
                "FastEmbed not available — sparse retrieval disabled, falling back to dense-only. (%s)",
                exc,
            )
            return None
    return _model


class SparseEncoder:
    """
    BM25 sparse-vector encoder with graceful dense-only fallback.

    encode() maps each text to an (indices, values) tuple. When FastEmbed is
    unavailable — or encoding fails at runtime — it returns ([], []) per text
    so callers can skip sparse indexing without breaking ingestion.
    """

    def encode(self, texts: list[str]) -> list[tuple[list[int], list[float]]]:
        """Encode a batch; returns one (indices, values) tuple per input text."""
        if not texts:
            return []
        model = _get_model()
        if model is None:
            return [([], []) for _ in texts]
        try:
            # fastembed SparseEmbedding exposes .indices and .values as numpy arrays.
            return [
                (emb.indices.tolist(), emb.values.tolist())
                for emb in model.embed(texts)
            ]
        except Exception as exc:
            logger.warning("BM25 encoding failed (%s); returning empty sparse vectors.", exc)
            return [([], []) for _ in texts]

    def encode_one(self, text: str) -> tuple[list[int], list[float]]:
        """Convenience wrapper for a single string."""
        return self.encode([text])[0]

    @property
    def available(self) -> bool:
        """True if FastEmbed loaded successfully and sparse encoding is active."""
        return _get_model() is not None
app/services/vector_store.py CHANGED
@@ -1,113 +1,205 @@
 
1
  import uuid
2
  from typing import Optional
3
 
4
  from qdrant_client import QdrantClient
5
- from qdrant_client.models import PointStruct, VectorParams, Distance, Filter, FieldCondition, MatchValue, PayloadSchemaType
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  from app.models.pipeline import Chunk, ChunkMetadata
8
  from app.core.exceptions import RetrievalError
9
 
 
 
 
 
 
 
10
 
11
  class VectorStore:
12
  def __init__(self, client: QdrantClient, collection: str):
13
  self.client = client
14
  self.collection = collection
15
 
16
- def ensure_collection(self) -> None:
17
- """Creates collection with vectors size=384, distance=Cosine if it does not exist.
18
- Also ensures payload index on metadata.doc_id exists for efficient dedup deletes."""
 
 
 
 
 
 
 
 
19
  collections = self.client.get_collections().collections
20
  exists = any(c.name == self.collection for c in collections)
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  if not exists:
23
  self.client.create_collection(
24
  collection_name=self.collection,
25
- vectors_config=VectorParams(size=384, distance=Distance.COSINE),
 
 
 
 
 
 
 
 
26
  )
 
27
 
28
- # Keyword index allows filter-by-doc_id in delete_by_doc_id.
29
- # create_payload_index is idempotent — safe to call on every startup.
30
  self.client.create_payload_index(
31
  collection_name=self.collection,
32
  field_name="metadata.doc_id",
33
  field_schema=PayloadSchemaType.KEYWORD,
34
  )
35
 
36
- def upsert_chunks(self, chunks: list[Chunk], embeddings: list[list[float]]) -> None:
37
- """Builds PointStruct list and calls client.upsert. Batch size 100."""
38
- if len(chunks) != len(embeddings):
39
- raise ValueError("Number of chunks must match number of embeddings")
40
-
 
 
 
 
 
 
 
 
 
41
  if not chunks:
42
  return
43
 
44
  points = []
45
- for chunk, vector in zip(chunks, embeddings):
 
 
 
 
 
 
 
 
 
46
  points.append(
47
  PointStruct(
48
  id=str(uuid.uuid4()),
49
  vector=vector,
50
- payload=chunk
51
  )
52
  )
53
 
54
- # Qdrant client upsert takes care of batching if needed, but we can chunk our points list
55
  batch_size = 100
56
  for i in range(0, len(points), batch_size):
57
- batch = points[i:i + batch_size]
58
  self.client.upsert(
59
  collection_name=self.collection,
60
- points=batch
61
  )
62
 
63
  def delete_by_doc_id(self, doc_id: str) -> None:
64
- """Filters on metadata.doc_id and deletes. Called before upsert for incremental updates."""
65
  try:
66
- self.client.delete(
67
  collection_name=self.collection,
68
  points_selector=Filter(
69
  must=[
70
  FieldCondition(
71
  key="metadata.doc_id",
72
- match=MatchValue(value=doc_id)
73
  )
74
  ]
75
- )
76
- )
77
- except Exception as e:
78
- # Qdrant raises if index or something missing, but in setup we might just proceed
79
- pass
80
 
81
- def search(self, query_vector: list[float], top_k: int = 20, filters: Optional[dict] = None) -> list[Chunk]:
82
- """Returns chunks with metadata populated from payload."""
 
 
 
 
 
83
  try:
84
  qdrant_filter = None
85
  if filters:
86
- must_conditions = []
87
- for key, value in filters.items():
88
- must_conditions.append(
89
- FieldCondition(
90
- key=f"metadata.{key}",
91
- match=MatchValue(value=value)
92
- )
93
- )
94
- if must_conditions:
95
- qdrant_filter = Filter(must=must_conditions)
96
 
97
  results = self.client.search(
98
  collection_name=self.collection,
99
- query_vector=query_vector,
100
  limit=top_k,
101
- query_filter=qdrant_filter
 
102
  )
103
 
104
- chunks = []
105
- for hit in results:
106
- if hit.payload:
107
- chunks.append(Chunk(**hit.payload))
108
- return chunks
109
 
110
- except Exception as e:
111
  raise RetrievalError(
112
- f"Vector search failed: {e}", context={"error": str(e)}
113
- ) from e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
  import uuid
3
  from typing import Optional
4
 
5
  from qdrant_client import QdrantClient
6
+ from qdrant_client.models import (
7
+ Distance,
8
+ FieldCondition,
9
+ Filter,
10
+ MatchValue,
11
+ NamedSparseVector,
12
+ NamedVector,
13
+ PayloadSchemaType,
14
+ PointStruct,
15
+ SparseIndexParams,
16
+ SparseVector,
17
+ SparseVectorParams,
18
+ VectorParams,
19
+ )
20
 
21
  from app.models.pipeline import Chunk, ChunkMetadata
22
  from app.core.exceptions import RetrievalError
23
 
24
+ logger = logging.getLogger(__name__)
25
+
26
+ # Named vector keys used in the Qdrant collection.
27
+ _DENSE_VEC = "dense"
28
+ _SPARSE_VEC = "sparse"
29
+
30
 
31
  class VectorStore:
32
  def __init__(self, client: QdrantClient, collection: str):
33
  self.client = client
34
  self.collection = collection
35
 
36
+ def ensure_collection(self, allow_recreate: bool = False) -> None:
37
+ """
38
+ Creates or migrates the collection to support named dense + sparse vectors.
39
+
40
+ allow_recreate=True (ingestion): if the collection exists with the old
41
+ unnamed-vector format, delete and recreate it. Ingestion will re-index
42
+ everything on the same run, so data loss is acceptable.
43
+
44
+ allow_recreate=False (API startup): never touch an existing collection.
45
+ The API will use whatever format is already deployed.
46
+ """
47
  collections = self.client.get_collections().collections
48
  exists = any(c.name == self.collection for c in collections)
49
 
50
+ if exists and allow_recreate:
51
+ try:
52
+ info = self.client.get_collection(self.collection)
53
+ is_old_format = not isinstance(info.config.params.vectors, dict)
54
+ has_no_sparse = not info.config.params.sparse_vectors
55
+ if is_old_format or has_no_sparse:
56
+ logger.info(
57
+ "Collection %r uses old vector format; recreating for hybrid search.",
58
+ self.collection,
59
+ )
60
+ self.client.delete_collection(self.collection)
61
+ exists = False
62
+ except Exception as exc:
63
+ logger.warning("Could not inspect collection format (%s); skipping migration.", exc)
64
+
65
  if not exists:
66
  self.client.create_collection(
67
  collection_name=self.collection,
68
+ vectors_config={
69
+ _DENSE_VEC: VectorParams(size=384, distance=Distance.COSINE),
70
+ },
71
+ sparse_vectors_config={
72
+ # on_disk=False keeps sparse index in RAM for sub-ms lookup.
73
+ _SPARSE_VEC: SparseVectorParams(
74
+ index=SparseIndexParams(on_disk=False)
75
+ ),
76
+ },
77
  )
78
+ logger.info("Created collection %r with dense + sparse vectors.", self.collection)
79
 
80
+ # Keyword index for filter-by-doc_id in delete_by_doc_id. Idempotent.
 
81
  self.client.create_payload_index(
82
  collection_name=self.collection,
83
  field_name="metadata.doc_id",
84
  field_schema=PayloadSchemaType.KEYWORD,
85
  )
86
 
87
+ def upsert_chunks(
88
+ self,
89
+ chunks: list[Chunk],
90
+ dense_embeddings: list[list[float]],
91
+ sparse_embeddings: Optional[list[tuple[list[int], list[float]]]] = None,
92
+ ) -> None:
93
+ """
94
+ Builds PointStruct list with named dense (and optionally sparse) vectors.
95
+
96
+ sparse_embeddings: list of (indices, values) tuples from SparseEncoder.
97
+ If None or empty sparse vector for a chunk, dense-only point is used.
98
+ """
99
+ if len(chunks) != len(dense_embeddings):
100
+ raise ValueError("Number of chunks must match number of dense embeddings")
101
  if not chunks:
102
  return
103
 
104
  points = []
105
+ for i, (chunk, dense_vec) in enumerate(zip(chunks, dense_embeddings)):
106
+ vector: dict = {_DENSE_VEC: dense_vec}
107
+
108
+ if sparse_embeddings is not None:
109
+ indices, values = sparse_embeddings[i]
110
+ if indices: # Skip empty sparse vectors gracefully
111
+ vector[_SPARSE_VEC] = SparseVector(
112
+ indices=indices, values=values
113
+ )
114
+
115
  points.append(
116
  PointStruct(
117
  id=str(uuid.uuid4()),
118
  vector=vector,
119
+ payload=chunk,
120
  )
121
  )
122
 
 
123
  batch_size = 100
124
  for i in range(0, len(points), batch_size):
 
125
  self.client.upsert(
126
  collection_name=self.collection,
127
+ points=points[i : i + batch_size],
128
  )
129
 
130
  def delete_by_doc_id(self, doc_id: str) -> None:
131
+ """Filters on metadata.doc_id and deletes all matching points."""
132
  try:
133
+ self.client.delete(
134
  collection_name=self.collection,
135
  points_selector=Filter(
136
  must=[
137
  FieldCondition(
138
  key="metadata.doc_id",
139
+ match=MatchValue(value=doc_id),
140
  )
141
  ]
142
+ ),
143
+ )
144
+ except Exception:
145
+ pass # Safe to ignore collection or index may not exist yet
 
146
 
147
+ def search(
148
+ self,
149
+ query_vector: list[float],
150
+ top_k: int = 20,
151
+ filters: Optional[dict] = None,
152
+ ) -> list[Chunk]:
153
+ """Dense vector search using the named 'dense' vector."""
154
  try:
155
  qdrant_filter = None
156
  if filters:
157
+ must_conditions = [
158
+ FieldCondition(key=f"metadata.{k}", match=MatchValue(value=v))
159
+ for k, v in filters.items()
160
+ ]
161
+ qdrant_filter = Filter(must=must_conditions)
 
 
 
 
 
162
 
163
  results = self.client.search(
164
  collection_name=self.collection,
165
+ query_vector=NamedVector(name=_DENSE_VEC, vector=query_vector),
166
  limit=top_k,
167
+ query_filter=qdrant_filter,
168
+ with_payload=True,
169
  )
170
 
171
+ return [Chunk(**hit.payload) for hit in results if hit.payload]
 
 
 
 
172
 
173
+ except Exception as exc:
174
  raise RetrievalError(
175
+ f"Dense vector search failed: {exc}", context={"error": str(exc)}
176
+ ) from exc
177
+
178
+ def search_sparse(
179
+ self,
180
+ indices: list[int],
181
+ values: list[float],
182
+ top_k: int = 20,
183
+ ) -> list[Chunk]:
184
+ """
185
+ BM25 sparse vector search using the named 'sparse' vector.
186
+ Returns empty list if sparse vectors are absent or indices is empty.
187
+ """
188
+ if not indices:
189
+ return []
190
+ try:
191
+ results = self.client.search(
192
+ collection_name=self.collection,
193
+ query_vector=NamedSparseVector(
194
+ name=_SPARSE_VEC,
195
+ vector=SparseVector(indices=indices, values=values),
196
+ ),
197
+ limit=top_k,
198
+ with_payload=True,
199
+ )
200
+ return [Chunk(**hit.payload) for hit in results if hit.payload]
201
+
202
+ except Exception as exc:
203
+ # Sparse index may not exist on old collections — log and continue.
204
+ logger.warning("Sparse search failed (%s); skipping sparse results.", exc)
205
+ return []
requirements.txt CHANGED
@@ -20,4 +20,7 @@ presidio-analyzer>=2.2.354
20
  tenacity>=8.3.0
21
  python-jose[cryptography]>=3.3.0
22
  google-genai>=1.0.0
 
 
 
23
  toon_format @ git+https://github.com/toon-format/toon-python.git
 
20
  tenacity>=8.3.0
21
  python-jose[cryptography]>=3.3.0
22
  google-genai>=1.0.0
23
+ # fastembed: powers BM25 sparse retrieval (Stage 2). Qdrant/bm25 vocabulary
24
+ # downloads ~5 MB on first use then runs fully local — no GPU, no network at query time.
25
+ fastembed>=0.3.6
26
  toon_format @ git+https://github.com/toon-format/toon-python.git