GitHub Actions commited on
Commit
4ef165a
Β·
1 Parent(s): f0e94ef

Deploy 583b552

Browse files
app/api/chat.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import json
2
  import time
3
  from fastapi import APIRouter, Request, Depends
@@ -76,6 +77,32 @@ async def _generate_follow_ups(
76
  return []
77
 
78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  @router.post("")
80
  @chat_rate_limit()
81
  async def chat_endpoint(
@@ -104,10 +131,36 @@ async def chat_endpoint(
104
  session_id = request_data.session_id
105
 
106
  conversation_history = conv_store.get_recent(session_id)
 
107
  criticism = _is_criticism(request_data.message)
108
  if criticism and conversation_history:
109
  conv_store.mark_last_negative(session_id)
110
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  initial_state: PipelineState = { # type: ignore[assignment]
112
  "query": request_data.message,
113
  "session_id": request_data.session_id,
@@ -131,6 +184,16 @@ async def chat_endpoint(
131
  "follow_ups": [],
132
  "path": None,
133
  "query_topic": None,
 
 
 
 
 
 
 
 
 
 
134
  }
135
 
136
  async def sse_generator():
@@ -194,6 +257,24 @@ async def chat_endpoint(
194
  if follow_ups:
195
  yield f"event: follow_ups\ndata: {json.dumps({'questions': follow_ups})}\n\n"
196
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  except Exception as exc:
198
  yield f"data: {json.dumps({'error': str(exc) or 'Generation failed'})}\n\n"
199
 
 
1
+ import asyncio
2
  import json
3
  import time
4
  from fastapi import APIRouter, Request, Depends
 
77
  return []
78
 
79
 
80
+ async def _update_summary_async(
81
+ conv_store,
82
+ gemini_client,
83
+ session_id: str,
84
+ previous_summary: str | None,
85
+ query: str,
86
+ answer: str,
87
+ processing_api_key: str | None,
88
+ ) -> None:
89
+ """
90
+ Triggered post-response to update the rolling conversation summary.
91
+ Failures are silently swallowed β€” summary is best-effort context, not critical.
92
+ """
93
+ try:
94
+ new_summary = await gemini_client.update_conversation_summary(
95
+ previous_summary=previous_summary or "",
96
+ new_turn_q=query,
97
+ new_turn_a=answer[:600], # cap answer chars sent to Gemini
98
+ processing_api_key=processing_api_key,
99
+ )
100
+ if new_summary:
101
+ conv_store.set_summary(session_id, new_summary)
102
+ except Exception:
103
+ pass
104
+
105
+
106
  @router.post("")
107
  @chat_rate_limit()
108
  async def chat_endpoint(
 
131
  session_id = request_data.session_id
132
 
133
  conversation_history = conv_store.get_recent(session_id)
134
+ conversation_summary = conv_store.get_summary(session_id)
135
  criticism = _is_criticism(request_data.message)
136
  if criticism and conversation_history:
137
  conv_store.mark_last_negative(session_id)
138
 
139
+ # Stage 2: decontextualize the query concurrently with Guard when we have a
140
+ # rolling summary. Reference-heavy queries like "tell me more about that project"
141
+ # embed poorly; a self-contained rewrite fixes retrieval without added latency
142
+ # because Gemini Flash runs while Guard is classifying the query.
143
+ gemini_client = getattr(request.app.state, "gemini_client", None)
144
+ decontextualized_query: str | None = None
145
+ decontext_task: asyncio.Task | None = None
146
+ if conversation_summary and gemini_client and gemini_client.is_configured:
147
+ decontext_task = asyncio.create_task(
148
+ gemini_client.decontextualize_query(request_data.message, conversation_summary)
149
+ )
150
+
151
+ # Await decontextualization result before the pipeline begins (retrieve node
152
+ # will use it if present; Guard runs first so the latency is masked).
153
+ if decontext_task is not None:
154
+ try:
155
+ result = await asyncio.wait_for(decontext_task, timeout=3.0)
156
+ # Only use the rewritten form when it differs non-trivially from the raw
157
+ # query to avoid polluting retrieval with identical-but-slightly-rephrased
158
+ # versions that waste embedding budget.
159
+ if result and result.strip().lower() != request_data.message.strip().lower():
160
+ decontextualized_query = result.strip()
161
+ except Exception:
162
+ pass # Decontextualization is best-effort; fall back to raw query.
163
+
164
  initial_state: PipelineState = { # type: ignore[assignment]
165
  "query": request_data.message,
166
  "session_id": request_data.session_id,
 
184
  "follow_ups": [],
185
  "path": None,
186
  "query_topic": None,
187
+ # Stage 1: follow-up bypass for Gemini fast-path
188
+ "is_followup": request_data.is_followup,
189
+ # Stage 2: progressive history summarisation
190
+ "conversation_summary": conversation_summary or None,
191
+ "decontextualized_query": decontextualized_query,
192
+ # Stage 3: SELF-RAG critic scores (populated by generate node)
193
+ "critic_groundedness": None,
194
+ "critic_completeness": None,
195
+ "critic_specificity": None,
196
+ "critic_quality": None,
197
  }
198
 
199
  async def sse_generator():
 
257
  if follow_ups:
258
  yield f"event: follow_ups\ndata: {json.dumps({'questions': follow_ups})}\n\n"
259
 
260
+ # Stage 2: update rolling summary asynchronously β€” fired after the
261
+ # response is fully delivered so it adds zero latency to the turn.
262
+ if final_answer and gemini_client and gemini_client.is_configured:
263
+ processing_key = getattr(
264
+ request.app.state, "gemini_processing_api_key", None
265
+ )
266
+ asyncio.create_task(
267
+ _update_summary_async(
268
+ conv_store=conv_store,
269
+ gemini_client=gemini_client,
270
+ session_id=session_id,
271
+ previous_summary=conversation_summary,
272
+ query=request_data.message,
273
+ answer=final_answer,
274
+ processing_api_key=processing_key,
275
+ )
276
+ )
277
+
278
  except Exception as exc:
279
  yield f"data: {json.dumps({'error': str(exc) or 'Generation failed'})}\n\n"
280
 
app/models/chat.py CHANGED
@@ -17,6 +17,10 @@ class ChatRequest(BaseModel):
17
  max_length=64,
18
  pattern=r"^[a-zA-Z0-9_-]+$",
19
  )
 
 
 
 
20
 
21
 
22
  class ChatResponse(BaseModel):
 
17
  max_length=64,
18
  pattern=r"^[a-zA-Z0-9_-]+$",
19
  )
20
+ # True when the query was submitted via a follow-up pill button.
21
+ # Bypasses the Gemini fast-path unconditionally so pill follow-ups
22
+ # always produce cited, chunk-grounded answers rather than TOON summaries.
23
+ is_followup: bool = False
24
 
25
 
26
  class ChatResponse(BaseModel):
app/models/pipeline.py CHANGED
@@ -1,17 +1,34 @@
1
  import operator
2
- from typing import Annotated, Literal, Optional, TypedDict
3
 
4
  from app.models.chat import SourceRef
5
 
6
 
7
- class ChunkMetadata(TypedDict):
 
 
 
 
 
 
 
 
8
  doc_id: str
9
  source_title: str
10
  source_url: str
11
  section: str
12
- source_type: Literal["blog", "project", "github", "bio", "cv"]
13
  date: str
14
  tags: list[str]
 
 
 
 
 
 
 
 
 
15
 
16
 
17
  class Chunk(TypedDict):
@@ -34,33 +51,34 @@ class PipelineState(TypedDict):
34
  guard_passed: bool
35
  thinking: bool # True while Gemini has signalled RAG is needed
36
  # Last N Q/A pairs for this session β€” injected into prompts for follow-up context.
37
- # List of {"q": str, "a": str} dicts, oldest first, answers truncated to 120 chars.
38
  conversation_history: list
 
 
 
 
 
 
 
39
  # True when the current query explicitly criticises the previous answer.
40
- # Triggers automatic negative feedback on the prior interaction and forces
41
- # Gemini editorial reformat regardless of the low-trust heuristic score.
42
  is_criticism: bool
 
 
 
43
  latency_ms: int
44
  error: Optional[str]
45
  interaction_id: Optional[int]
46
  # CRAG: counts retrieve node invocations; 2 = one retry was attempted.
47
- # Starts at 0 in initial state; retrieve increments it each call.
48
  retrieval_attempts: int
49
- # Set by the rewrite_query node when CRAG triggers; None otherwise.
50
  rewritten_query: Optional[str]
51
- # Top cross-encoder score from the last retrieve call.
52
- # Used by route_retrieve_result to trigger a CRAG rewrite on low-confidence
53
- # retrieval (non-empty but weak matches) in addition to the empty-chunk case.
54
  top_rerank_score: Optional[float]
55
- # Follow-up question suggestions generated after the main answer.
56
- # 3 short questions specific to content in the answer.
57
  follow_ups: list[str]
58
  # Which pipeline branch produced the final answer.
59
- # Values: "cache_hit", "gemini_fast", "rag", "blocked".
60
- # Set by cache, gemini_fast, and generate nodes respectively.
61
- # data_prep.py filters to path=="rag" when building reranker triplets because
62
- # only RAG interactions have chunk associations that form valid training pairs.
63
  path: Optional[str]
64
- # 1–3 word topic extracted from the query by the guard node (extract_topic).
65
- # Stored in state so retrieve_node can reuse it without recomputing.
66
  query_topic: Optional[str]
 
 
 
 
 
 
1
  import operator
2
+ from typing import Annotated, Optional, TypedDict
3
 
4
  from app.models.chat import SourceRef
5
 
6
 
7
+ class ChunkMetadata(TypedDict, total=False):
8
+ """
9
+ Per-chunk payload stored in Qdrant.
10
+
11
+ All fields have total=False so new optional fields (raptor_level, parent_id,
12
+ linked_chunks) can coexist with existing points that don't have them.
13
+ Required fields (doc_id, source_title, source_url, section, source_type) are
14
+ always present in practice; callers should .get() with a default for safety.
15
+ """
16
  doc_id: str
17
  source_title: str
18
  source_url: str
19
  section: str
20
+ source_type: str # "blog" | "project" | "github" | "bio" | "cv"
21
  date: str
22
  tags: list[str]
23
+ # RAPTOR hierarchical indexing (Stage 4).
24
+ # 0 = leaf chunk (original content), 1 = cluster summary, 2 = document summary.
25
+ # Absent on pre-RAPTOR points β€” treat as 0.
26
+ raptor_level: int
27
+ # Qdrant point ID of the parent RAPTOR summary node. None for top-level nodes.
28
+ parent_id: str
29
+ # Stage 5: fingerprints (doc_id::section) of semantically linked chunks within
30
+ # the same RAPTOR cluster (cosine similarity > 0.85). Used for context expansion.
31
+ linked_chunks: list[str]
32
 
33
 
34
  class Chunk(TypedDict):
 
51
  guard_passed: bool
52
  thinking: bool # True while Gemini has signalled RAG is needed
53
  # Last N Q/A pairs for this session β€” injected into prompts for follow-up context.
 
54
  conversation_history: list
55
+ # Stage 2: rolling conversation summary (single paragraph, ≀150 tokens).
56
+ # Injected into generate/gemini_fast instead of raw turn list when present.
57
+ conversation_summary: Optional[str]
58
+ # Stage 2: self-contained query rewritten before retrieval when the original
59
+ # contains unresolved pronouns/references. Used for embedding; original query
60
+ # is used for display and system prompt.
61
+ decontextualized_query: Optional[str]
62
  # True when the current query explicitly criticises the previous answer.
 
 
63
  is_criticism: bool
64
+ # Stage 1: True when submitted via a follow-up pill button.
65
+ # Bypasses Gemini fast-path so pill follow-ups always produce cited RAG answers.
66
+ is_followup: bool
67
  latency_ms: int
68
  error: Optional[str]
69
  interaction_id: Optional[int]
70
  # CRAG: counts retrieve node invocations; 2 = one retry was attempted.
 
71
  retrieval_attempts: int
 
72
  rewritten_query: Optional[str]
73
+ # Top cross-encoder score from the last retrieve call. Used by CRAG routing.
 
 
74
  top_rerank_score: Optional[float]
 
 
75
  follow_ups: list[str]
76
  # Which pipeline branch produced the final answer.
 
 
 
 
77
  path: Optional[str]
78
+ # 1–3 word topic extracted from the query by the guard node.
 
79
  query_topic: Optional[str]
80
+ # Stage 3: SELF-RAG critic scores (1–3 each). Logged to SQLite for training.
81
+ critic_groundedness: Optional[int] # all claims supported by a specific chunk
82
+ critic_completeness: Optional[int] # answer uses all relevant available chunks
83
+ critic_specificity: Optional[int] # answer contains specific names/numbers
84
+ critic_quality: Optional[str] # "high" | "medium" | "low"
app/pipeline/nodes/gemini_fast.py CHANGED
@@ -32,13 +32,15 @@ from app.core.quality import is_low_trust
32
  logger = logging.getLogger(__name__)
33
 
34
  # Words that reliably indicate the visitor wants a deep, cited answer.
35
- # Kept intentionally small: false negatives route to Gemini first, then RAG
36
- # on a tool call. False positives here add one Gemini RTT unnecessarily.
37
  _COMPLEX_SIGNALS: frozenset[str] = frozenset({
38
  "how", "why", "explain", "implement", "architecture", "deep",
39
  "detail", "technical", "compare", "difference", "algorithm",
40
  "code", "example", "breakdown", "analysis", "source", "cite",
41
  "reference", "proof", "derive", "calculate", "optimise", "optimize",
 
 
 
 
42
  })
43
 
44
  # Minimum token count for a query to be classified as complex.
@@ -76,6 +78,21 @@ def make_gemini_fast_node(gemini_client: GeminiClient) -> Any:
76
  writer({"type": "status", "label": "Thinking about your question directly..."})
77
 
78
  query = state["query"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  complexity = "complex" if _is_complex(query) else "simple"
80
 
81
  # When Gemini is not configured (GEMINI_API_KEY not set), route all
 
32
  logger = logging.getLogger(__name__)
33
 
34
  # Words that reliably indicate the visitor wants a deep, cited answer.
 
 
35
  _COMPLEX_SIGNALS: frozenset[str] = frozenset({
36
  "how", "why", "explain", "implement", "architecture", "deep",
37
  "detail", "technical", "compare", "difference", "algorithm",
38
  "code", "example", "breakdown", "analysis", "source", "cite",
39
  "reference", "proof", "derive", "calculate", "optimise", "optimize",
40
+ # Follow-up depth signals β€” these phrases appear in pill-generated questions
41
+ # and always indicate the user wants a cited, retrieved answer not a summary.
42
+ "tell me more", "more detail", "more about", "what about",
43
+ "explain that", "go deeper", "expand", "elaborate", "dig into",
44
  })
45
 
46
  # Minimum token count for a query to be classified as complex.
 
78
  writer({"type": "status", "label": "Thinking about your question directly..."})
79
 
80
  query = state["query"]
81
+
82
+ # Stage 1: Follow-up pill submissions bypass Gemini entirely.
83
+ # is_followup=True means the query came from a pill button β€” it is a
84
+ # reference-heavy string like "What technologies did he use for that?" that
85
+ # will produce a TOON summary answer from Gemini. Always route to RAG so
86
+ # the response is cited and chunk-grounded.
87
+ if state.get("is_followup", False):
88
+ logger.debug("is_followup=True β€” forcing RAG, skipping Gemini fast-path.")
89
+ writer({"type": "status", "label": "Needs deep retrieval, checking portfolio..."})
90
+ return {
91
+ "query_complexity": "complex", # force 70B for quality
92
+ "expanded_queries": [query],
93
+ "thinking": False,
94
+ }
95
+
96
  complexity = "complex" if _is_complex(query) else "simple"
97
 
98
  # When Gemini is not configured (GEMINI_API_KEY not set), route all
app/pipeline/nodes/generate.py CHANGED
@@ -96,13 +96,22 @@ No apologies, no padding, vary your phrasing.
96
  """.format(topics=_TOPIC_SUGGESTIONS)
97
 
98
 
99
- def _format_history(history: list[dict]) -> str:
100
  """
101
- Render prior turns as a compact prefix block.
102
- Each turn is one line: "[Tn] Q: ... | A: ..."
103
- Returns empty string when there is no history (first message in session).
104
- Token cost: ~20-35 tokens per turn; max 3 turns β†’ <110 tokens overhead.
 
 
 
 
105
  """
 
 
 
 
 
106
  if not history:
107
  return ""
108
  lines = [
@@ -129,7 +138,7 @@ def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[P
129
  # ── Not-found path ─────────────────────────────────────────────────
130
  if not reranked_chunks:
131
  writer({"type": "status", "label": "Could not find specific information, responding carefully..."})
132
- history_prefix = _format_history(state.get("conversation_history") or [])
133
  stream = llm_client.complete_with_complexity(
134
  prompt=f"{history_prefix}Visitor question: {query}",
135
  system=_NOT_FOUND_SYSTEM,
@@ -167,7 +176,7 @@ def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[P
167
 
168
  context_block = "\n\n".join(context_parts)
169
 
170
- history_prefix = _format_history(state.get("conversation_history") or [])
171
  is_criticism = state.get("is_criticism", False)
172
  criticism_note = (
173
  "NOTE: The visitor says the previous answer was wrong. "
@@ -265,10 +274,46 @@ def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[P
265
  cited_indices = {int(m) for m in re.findall(r"\[(\d+)\]", full_answer)}
266
  cited_sources = [sr for i, sr in enumerate(source_refs, start=1) if i in cited_indices]
267
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
268
  return {
269
  "answer": full_answer,
270
  "sources": cited_sources if cited_sources else source_refs[:2],
271
  "path": "rag",
 
272
  }
273
 
274
  return generate_node
 
96
  """.format(topics=_TOPIC_SUGGESTIONS)
97
 
98
 
99
+ def _format_history(state: "PipelineState") -> str:
100
  """
101
+ Render conversation context as a compact prefix block.
102
+
103
+ Stage 2 β€” progressive history summarisation:
104
+ If a rolling `conversation_summary` is present in state, inject that
105
+ single paragraph instead of the raw 3-turn transcript. The summary is
106
+ ~150 tokens; the raw transcript costs 20-35 tokens per turn but degrades
107
+ at turn 4+ due to pronoun ambiguity and stale context. We keep the raw
108
+ turns as fallback when Gemini hasn't produced a summary yet.
109
  """
110
+ summary = state.get("conversation_summary")
111
+ if summary:
112
+ return f"Running conversation context:\n{summary}\n\n"
113
+
114
+ history = state.get("conversation_history") or []
115
  if not history:
116
  return ""
117
  lines = [
 
138
  # ── Not-found path ─────────────────────────────────────────────────
139
  if not reranked_chunks:
140
  writer({"type": "status", "label": "Could not find specific information, responding carefully..."})
141
+ history_prefix = _format_history(state)
142
  stream = llm_client.complete_with_complexity(
143
  prompt=f"{history_prefix}Visitor question: {query}",
144
  system=_NOT_FOUND_SYSTEM,
 
176
 
177
  context_block = "\n\n".join(context_parts)
178
 
179
+ history_prefix = _format_history(state)
180
  is_criticism = state.get("is_criticism", False)
181
  criticism_note = (
182
  "NOTE: The visitor says the previous answer was wrong. "
 
274
  cited_indices = {int(m) for m in re.findall(r"\[(\d+)\]", full_answer)}
275
  cited_sources = [sr for i, sr in enumerate(source_refs, start=1) if i in cited_indices]
276
 
277
+ # ── Stage 3: SELF-RAG critic ──────────────────────────────────────────
278
+ # Runs after answer is fully streamed β€” zero latency impact on first token.
279
+ # Scores groundedness (stays in passages), completeness (covers the query),
280
+ # and specificity (concrete names/numbers vs vague language) on 1-3 each.
281
+ # Scores are stored in state for log_eval and downstream quality analysis.
282
+ critic_scores: dict[str, int | None] = {
283
+ "critic_groundedness": None,
284
+ "critic_completeness": None,
285
+ "critic_specificity": None,
286
+ "critic_quality": None,
287
+ }
288
+ if gemini_client is not None and full_answer and reranked_chunks:
289
+ try:
290
+ scores = await gemini_client.critique_rag_answer(
291
+ query=query,
292
+ context_block=context_block,
293
+ answer=full_answer,
294
+ decontextualized_query=state.get("decontextualized_query"),
295
+ )
296
+ g = scores.get("groundedness", 3)
297
+ c = scores.get("completeness", 3)
298
+ s = scores.get("specificity", 3)
299
+ # Composite quality label for quick log filtering:
300
+ # 'high' if average >= 2.5, 'medium' if >= 1.5, else 'low'
301
+ avg = (g + c + s) / 3.0
302
+ quality = "high" if avg >= 2.5 else ("medium" if avg >= 1.5 else "low")
303
+ critic_scores = {
304
+ "critic_groundedness": g,
305
+ "critic_completeness": c,
306
+ "critic_specificity": s,
307
+ "critic_quality": quality,
308
+ }
309
+ except Exception as exc:
310
+ logger.debug("SELF-RAG critic failed (non-critical): %s", exc)
311
+
312
  return {
313
  "answer": full_answer,
314
  "sources": cited_sources if cited_sources else source_refs[:2],
315
  "path": "rag",
316
+ **critic_scores,
317
  }
318
 
319
  return generate_node
app/pipeline/nodes/log_eval.py CHANGED
@@ -61,7 +61,11 @@ def make_log_eval_node(db_path: str, github_log=None) -> Callable[[PipelineState
61
  latency_ms INTEGER,
62
  cached BOOLEAN,
63
  feedback INTEGER DEFAULT 0,
64
- path TEXT DEFAULT 'rag'
 
 
 
 
65
  )
66
  """
67
  )
@@ -72,6 +76,11 @@ def make_log_eval_node(db_path: str, github_log=None) -> Callable[[PipelineState
72
  ("session_id", "TEXT DEFAULT ''"),
73
  # path column: old rows default to "rag" β€” they were all RAG interactions.
74
  ("path", "TEXT DEFAULT 'rag'"),
 
 
 
 
 
75
  ]:
76
  try:
77
  conn.execute(f"ALTER TABLE interactions ADD COLUMN {col} {definition}")
@@ -81,8 +90,10 @@ def make_log_eval_node(db_path: str, github_log=None) -> Callable[[PipelineState
81
  cursor = conn.execute(
82
  """
83
  INSERT INTO interactions
84
- (timestamp, session_id, query, answer, chunks_used, rerank_scores, reranked_chunks_json, latency_ms, cached, path)
85
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
 
 
86
  """,
87
  (
88
  datetime.now(tz=timezone.utc).isoformat(),
@@ -95,6 +106,10 @@ def make_log_eval_node(db_path: str, github_log=None) -> Callable[[PipelineState
95
  state.get("latency_ms", 0),
96
  state.get("cached", False),
97
  path,
 
 
 
 
98
  ),
99
  )
100
  return cursor.lastrowid # type: ignore[return-value]
@@ -126,6 +141,10 @@ def make_log_eval_node(db_path: str, github_log=None) -> Callable[[PipelineState
126
  "cached": state.get("cached", False),
127
  "feedback": 0,
128
  "path": path,
 
 
 
 
129
  }
130
  github_log.append(record)
131
 
 
61
  latency_ms INTEGER,
62
  cached BOOLEAN,
63
  feedback INTEGER DEFAULT 0,
64
+ path TEXT DEFAULT 'rag',
65
+ critic_groundedness INTEGER,
66
+ critic_completeness INTEGER,
67
+ critic_specificity INTEGER,
68
+ critic_quality TEXT
69
  )
70
  """
71
  )
 
76
  ("session_id", "TEXT DEFAULT ''"),
77
  # path column: old rows default to "rag" β€” they were all RAG interactions.
78
  ("path", "TEXT DEFAULT 'rag'"),
79
+ # Stage 3 SELF-RAG critic scores
80
+ ("critic_groundedness", "INTEGER"),
81
+ ("critic_completeness", "INTEGER"),
82
+ ("critic_specificity", "INTEGER"),
83
+ ("critic_quality", "TEXT"),
84
  ]:
85
  try:
86
  conn.execute(f"ALTER TABLE interactions ADD COLUMN {col} {definition}")
 
90
  cursor = conn.execute(
91
  """
92
  INSERT INTO interactions
93
+ (timestamp, session_id, query, answer, chunks_used, rerank_scores,
94
+ reranked_chunks_json, latency_ms, cached, path,
95
+ critic_groundedness, critic_completeness, critic_specificity, critic_quality)
96
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
97
  """,
98
  (
99
  datetime.now(tz=timezone.utc).isoformat(),
 
106
  state.get("latency_ms", 0),
107
  state.get("cached", False),
108
  path,
109
+ state.get("critic_groundedness"),
110
+ state.get("critic_completeness"),
111
+ state.get("critic_specificity"),
112
+ state.get("critic_quality"),
113
  ),
114
  )
115
  return cursor.lastrowid # type: ignore[return-value]
 
141
  "cached": state.get("cached", False),
142
  "feedback": 0,
143
  "path": path,
144
+ "critic_groundedness": state.get("critic_groundedness"),
145
+ "critic_completeness": state.get("critic_completeness"),
146
+ "critic_specificity": state.get("critic_specificity"),
147
+ "critic_quality": state.get("critic_quality"),
148
  }
149
  github_log.append(record)
150
 
app/pipeline/nodes/retrieve.py CHANGED
@@ -1,8 +1,11 @@
1
  import asyncio
 
2
  from typing import Callable
3
 
4
  from langgraph.config import get_stream_writer
5
 
 
 
6
  from app.models.pipeline import PipelineState, Chunk
7
  from app.services.vector_store import VectorStore
8
  from app.services.embedder import Embedder
@@ -39,6 +42,20 @@ _SIBLING_EXPAND_TOP_N: int = 5 # expand from the top-N RRF-ranked unique chunk
39
  _SIBLING_FETCH_LIMIT: int = 5 # fetch up to N siblings per document
40
  _SIBLING_TOTAL_CAP: int = 8 # max additional chunks added via sibling expansion
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  # Keywords that imply the visitor wants depth from a specific source type.
43
  # Values are the source_type values set by ingest (ChunkMetadata.source_type).
44
  _FOCUS_KEYWORDS: dict[frozenset[str], str] = {
@@ -118,6 +135,11 @@ def make_retrieve_node(
118
 
119
  attempts = state.get("retrieval_attempts", 0)
120
  query = state["query"]
 
 
 
 
 
121
 
122
  # Reuse the topic computed by the guard node β€” no recomputation needed.
123
  topic = state.get("query_topic") or ""
@@ -136,9 +158,12 @@ def make_retrieve_node(
136
  # Second attempt: re-embed the rewritten query with is_query=True.
137
  cached_embedding = None
138
 
139
- expanded = [query] # gemini_fast may fill expanded_queries on first attempt
140
  if attempts == 0:
141
- expanded = state.get("expanded_queries", [query])
 
 
 
142
 
143
  # Embed all query variants in one batched call (is_query=True for asymmetric BGE).
144
  if cached_embedding is not None and len(expanded) == 1:
@@ -152,12 +177,36 @@ def make_retrieve_node(
152
  chunks = vector_store.search(query_vector=vec, top_k=20)
153
  dense_results.append(chunks)
154
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  # ── Sparse (BM25) search (primary query only) ──────────────────────��──────
156
  # Runs concurrently with dense search isn't possible here since dense
157
  # is synchronous Qdrant calls, but we parallelise encode + sparse search.
158
  sparse_results: list[Chunk] = []
159
  if _sparse_encoder.available:
160
- indices, values = _sparse_encoder.encode_one(query)
161
  sparse_results = vector_store.search_sparse(indices, values, top_k=20)
162
 
163
  # ── Reciprocal Rank Fusion ─────────────────────────────────────────────
@@ -222,7 +271,38 @@ def make_retrieve_node(
222
  if sibling_count >= _SIBLING_TOTAL_CAP:
223
  break
224
 
225
- reranked = await reranker.rerank(query, unique_chunks, top_k=7)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
 
227
  # ── Relevance gate ─────────────────────────────────────────────────────
228
  top_score = reranked[0]["metadata"].get("rerank_score", 0.0) if reranked else None
 
1
  import asyncio
2
+ import logging
3
  from typing import Callable
4
 
5
  from langgraph.config import get_stream_writer
6
 
7
+ logger = logging.getLogger(__name__)
8
+
9
  from app.models.pipeline import PipelineState, Chunk
10
  from app.services.vector_store import VectorStore
11
  from app.services.embedder import Embedder
 
42
  _SIBLING_FETCH_LIMIT: int = 5 # fetch up to N siblings per document
43
  _SIBLING_TOTAL_CAP: int = 8 # max additional chunks added via sibling expansion
44
 
45
+ # RAPTOR (Stage 4): top-N cluster summary hits to expand into child leaf chunks.
46
+ # When a level-1 cluster node appears in the top-3 results, its linked leaf
47
+ # children are fetched from Qdrant and added to the candidate pool before reranking.
48
+ _RAPTOR_CLUSTER_TOP_K: int = 5 # how many cluster nodes to search for
49
+ _RAPTOR_EXPAND_TOP_N: int = 3 # expand children for top-N cluster hits only
50
+ _RAPTOR_CHILD_FETCH_LIMIT: int = 6 # max leaf children fetched per cluster hit
51
+
52
+ # Stage 5 linked_chunks expansion cap. After reranking, each top chunk may link
53
+ # to near-duplicate passages (same skill in resume vs. project README). We expand
54
+ # up to this many additional candidates before the context cap is applied.
55
+ _LINKED_CHUNKS_EXPAND_TOP_N: int = 5 # expand links from top-N reranked chunks
56
+ _LINKED_CHUNKS_PER_CHUNK: int = 2 # max linked neighbours per source chunk
57
+ _LINKED_CHUNKS_TOTAL_CAP: int = 4 # total linked-chunk additions across all seeds
58
+
59
  # Keywords that imply the visitor wants depth from a specific source type.
60
  # Values are the source_type values set by ingest (ChunkMetadata.source_type).
61
  _FOCUS_KEYWORDS: dict[frozenset[str], str] = {
 
135
 
136
  attempts = state.get("retrieval_attempts", 0)
137
  query = state["query"]
138
+ # Stage 2: use the self-contained decontextualized rewrite for embedding
139
+ # when one was produced. "Tell me more about that ML project" has terrible
140
+ # cosine similarity against "PersonaBot RAG pipeline" passages; the rewrite
141
+ # "What ML projects has Darshan built?" dramatically improves recall.
142
+ retrieval_query = state.get("decontextualized_query") or query
143
 
144
  # Reuse the topic computed by the guard node β€” no recomputation needed.
145
  topic = state.get("query_topic") or ""
 
158
  # Second attempt: re-embed the rewritten query with is_query=True.
159
  cached_embedding = None
160
 
161
+ expanded = [retrieval_query] # gemini_fast may fill expanded_queries on first attempt
162
  if attempts == 0:
163
+ expanded = state.get("expanded_queries") or [retrieval_query]
164
+ # Ensure decontextualized form is the primary search query if present.
165
+ if retrieval_query != query and retrieval_query not in expanded:
166
+ expanded = [retrieval_query] + expanded
167
 
168
  # Embed all query variants in one batched call (is_query=True for asymmetric BGE).
169
  if cached_embedding is not None and len(expanded) == 1:
 
177
  chunks = vector_store.search(query_vector=vec, top_k=20)
178
  dense_results.append(chunks)
179
 
180
+ # ── Stage 4: RAPTOR cluster search (parallel to dense leaf search) ──────────
181
+ # Query the level-1 RAPTOR cluster nodes with the primary query vector.
182
+ # If a cluster node scores in the top-_RAPTOR_EXPAND_TOP_N results, we
183
+ # fetch its child leaf chunks (via linked_chunks payload) and add them to
184
+ # the candidate pool before RRF fusion. This gives the retriever a
185
+ # "zoomed-out" structural view that pure cosine over leaves misses.
186
+ primary_vec = query_vectors[0]
187
+ raptor_cluster_hits = vector_store.search_by_raptor_level(
188
+ query_vector=primary_vec, level=1, top_k=_RAPTOR_CLUSTER_TOP_K
189
+ )
190
+ raptor_leaf_expansions: list[Chunk] = []
191
+ for cluster_chunk in raptor_cluster_hits[:_RAPTOR_EXPAND_TOP_N]:
192
+ # linked_chunks stores "doc_id::section" fingerprints of child leaves.
193
+ linked_fps: list[str] = cluster_chunk["metadata"].get("linked_chunks") or []
194
+ for fp in linked_fps[:_RAPTOR_CHILD_FETCH_LIMIT]:
195
+ if "::" not in fp:
196
+ continue
197
+ child_doc_id, _ = fp.split("::", 1)
198
+ siblings = vector_store.fetch_by_doc_id(child_doc_id, limit=3)
199
+ raptor_leaf_expansions.extend(siblings)
200
+ if raptor_leaf_expansions:
201
+ logger.debug("RAPTOR: added %d child leaf candidates.", len(raptor_leaf_expansions))
202
+ dense_results.append(raptor_leaf_expansions)
203
+
204
  # ── Sparse (BM25) search (primary query only) ──────────────────────��──────
205
  # Runs concurrently with dense search isn't possible here since dense
206
  # is synchronous Qdrant calls, but we parallelise encode + sparse search.
207
  sparse_results: list[Chunk] = []
208
  if _sparse_encoder.available:
209
+ indices, values = _sparse_encoder.encode_one(retrieval_query)
210
  sparse_results = vector_store.search_sparse(indices, values, top_k=20)
211
 
212
  # ── Reciprocal Rank Fusion ─────────────────────────────────────────────
 
271
  if sibling_count >= _SIBLING_TOTAL_CAP:
272
  break
273
 
274
+ reranked = await reranker.rerank(retrieval_query, unique_chunks, top_k=7)
275
+
276
+ # ── Stage 5: linked_chunks expansion ─────────────────────────────────────
277
+ # After reranking, inspect the top-N chunks for linked_chunks edges set by
278
+ # RaptorBuilder. These link near-duplicate passages from different source
279
+ # documents (e.g. the same skill mentioned in CV and a project README).
280
+ # Expanding them ensures the LLM can cross-cite both authoritative sources.
281
+ if reranked:
282
+ linked_fps_seen: set[str] = {
283
+ f"{c['metadata']['doc_id']}::{c['metadata']['section']}" for c in reranked
284
+ }
285
+ linked_added = 0
286
+ for seed in reranked[:_LINKED_CHUNKS_EXPAND_TOP_N]:
287
+ if linked_added >= _LINKED_CHUNKS_TOTAL_CAP:
288
+ break
289
+ linked_fps: list[str] = seed["metadata"].get("linked_chunks") or []
290
+ for fp in linked_fps[:_LINKED_CHUNKS_PER_CHUNK]:
291
+ if fp in linked_fps_seen or "::" not in fp:
292
+ continue
293
+ linked_fps_seen.add(fp)
294
+ child_doc_id, _ = fp.split("::", 1)
295
+ siblings = vector_store.fetch_by_doc_id(child_doc_id, limit=2)
296
+ for sib in siblings:
297
+ sib_fp = f"{sib['metadata']['doc_id']}::{sib['metadata']['section']}"
298
+ if sib_fp not in linked_fps_seen:
299
+ linked_fps_seen.add(sib_fp)
300
+ reranked.append(sib)
301
+ linked_added += 1
302
+ if linked_added >= _LINKED_CHUNKS_TOTAL_CAP:
303
+ break
304
+ if linked_added >= _LINKED_CHUNKS_TOTAL_CAP:
305
+ break
306
 
307
  # ── Relevance gate ─────────────────────────────────────────────────────
308
  top_score = reranked[0]["metadata"].get("rerank_score", 0.0) if reranked else None
app/services/conversation_store.py CHANGED
@@ -1,22 +1,17 @@
1
  """
2
  backend/app/services/conversation_store.py
3
 
4
- SQLite-backed per-session conversation history.
5
-
6
- Reads the last N completed turns for a session from the existing `interactions`
7
- table so the LLM has conversational context without a separate store.
8
- Answers are truncated to 120 chars before injection β€” enough context for
9
- referential follow-ups ("tell me more", "what else?", "that's wrong") without
10
- wasting significant token budget on verbatim prior answers.
11
-
12
- All reads/writes are synchronous sqlite3 (<3ms on SSD) β€” acceptable because:
13
- 1. The call happens once at request start, outside the model call path.
14
- 2. SQLite WAL mode allows concurrent readers and one writer without blocking.
15
-
16
- Issue 1: mark_last_negative() now also fires github_log.append_feedback() so
17
- negative labels persist across HF Space restarts. Without this, negative
18
- examples accumulated during a session are lost on the next restart, and
19
- data_prep.py cannot produce accurate hard-negative training triplets.
20
  """
21
  from __future__ import annotations
22
 
@@ -27,32 +22,75 @@ from datetime import datetime, timezone
27
 
28
  logger = logging.getLogger(__name__)
29
 
30
- # Visible answer length per turn injected into context.
31
- # 120 chars β‰ˆ 25 tokens β€” plenty to resolve pronouns and follow-up references.
32
  _ANSWER_PREVIEW_LEN = 120
33
-
34
- # Default number of prior turns to surface. Three covers the typical "yes,
35
- # but what about X?", "and Y?", "ok fix the previous answer" pattern.
36
  _DEFAULT_MAX_TURNS = 3
37
 
38
 
39
  class ConversationStore:
40
  """
41
- Thin read/write layer over the `interactions` SQLite table for session history.
42
  One instance is created at startup and shared across all requests via app.state.
43
  """
44
 
45
  def __init__(self, db_path: str, github_log=None) -> None:
46
  self._db_path = db_path
47
  self._github_log = github_log
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
  def get_recent(self, session_id: str, max_turns: int = _DEFAULT_MAX_TURNS) -> list[dict]:
50
  """
51
  Return the last `max_turns` completed Q/A pairs for `session_id`,
52
- oldest first (so LLMs read them in chronological order).
53
-
54
- Returns an empty list if there is no history or the table doesn't exist yet.
55
- Each entry: {"q": str, "a": str} β€” `a` is truncated to _ANSWER_PREVIEW_LEN.
56
  """
57
  try:
58
  with sqlite3.connect(self._db_path) as conn:
@@ -66,30 +104,21 @@ class ConversationStore:
66
  (session_id, max_turns),
67
  ).fetchall()
68
  except sqlite3.OperationalError:
69
- # Table doesn't exist yet (first ever request) β€” not an error.
70
  return []
71
  except Exception as exc:
72
  logger.warning("ConversationStore.get_recent failed: %s", exc)
73
  return []
74
 
75
- # Reverse so oldest is first (chronological order for the LLM).
76
  turns = []
77
  for query, answer in reversed(rows):
78
  a_preview = answer[:_ANSWER_PREVIEW_LEN]
79
  if len(answer) > _ANSWER_PREVIEW_LEN:
80
- a_preview += "…"
81
  turns.append({"q": query, "a": a_preview})
82
  return turns
83
 
84
  def mark_last_negative(self, session_id: str) -> None:
85
- """
86
- Set feedback=-1 on the most recent interaction for `session_id` in SQLite,
87
- then durably record the correction in the GitHub JSONL log so the negative
88
- label survives a HF Space restart.
89
-
90
- data_prep.py reads {type:"feedback", feedback:-1} correction records from
91
- the durable log and applies them when building reranker training triplets.
92
- """
93
  try:
94
  with sqlite3.connect(self._db_path) as conn:
95
  conn.execute(
@@ -107,18 +136,13 @@ class ConversationStore:
107
  except Exception as exc:
108
  logger.warning("ConversationStore.mark_last_negative SQLite failed: %s", exc)
109
 
110
- # Durable correction record β€” survives Space restart; not in SQLite only.
111
  if self._github_log is not None:
112
  self._github_log.append_feedback(session_id, feedback=-1)
113
 
114
  def populate_from_records(self, records: list[dict]) -> None:
115
  """
116
  Replay interaction records from the durable GitHub log into SQLite.
117
- Called at startup when SQLite is empty after a Space restart so conversation
118
- history is available without requiring a full log replay on every request.
119
-
120
- Only inserts rows for path='rag'|'gemini_fast'|'cache_hit' interactions;
121
- skips feedback correction records (type='feedback') which are not interactions.
122
  """
123
  import os
124
  db_dir = os.path.dirname(self._db_path)
@@ -152,8 +176,6 @@ class ConversationStore:
152
  )
153
  """
154
  )
155
- # Apply feedback corrections: build a map session_id -> feedback
156
- # so they can be applied when inserting the matching interactions.
157
  feedback_corrections: dict[str, int] = {}
158
  for r in records:
159
  if r.get("type") == "feedback":
 
1
  """
2
  backend/app/services/conversation_store.py
3
 
4
+ SQLite-backed per-session conversation history with progressive summarisation.
5
+
6
+ Stage 2 additions:
7
+ - A `conversation_summaries` table stores one rolling summary paragraph per
8
+ session. After each completed turn, GeminiClient.update_conversation_summary()
9
+ is called asynchronously and the result is persisted here.
10
+ - get_recent() is unchanged (raw turns still available for the 3-turn fallback).
11
+ - get_summary() / set_summary() are thin wrappers on the new table.
12
+
13
+ The raw `interactions` table is still the source of truth for reranker training.
14
+ Summaries are only for live context injection and have no training significance.
 
 
 
 
 
15
  """
16
  from __future__ import annotations
17
 
 
22
 
23
  logger = logging.getLogger(__name__)
24
 
 
 
25
  _ANSWER_PREVIEW_LEN = 120
 
 
 
26
  _DEFAULT_MAX_TURNS = 3
27
 
28
 
29
  class ConversationStore:
30
  """
31
+ Thin read/write layer over SQLite for session history and rolling summaries.
32
  One instance is created at startup and shared across all requests via app.state.
33
  """
34
 
35
    def __init__(self, db_path: str, github_log=None) -> None:
        """
        Bind the SQLite database path and an optional durable log sink.

        Parameters:
            db_path: filesystem path of the SQLite database file.
            github_log: optional durable feedback log; when set,
                mark_last_negative() mirrors corrections into it.
        """
        self._db_path = db_path
        self._github_log = github_log
        # Eagerly create the summaries table so later get_summary()/set_summary()
        # calls never race table creation.
        self._ensure_summary_table()
39
+
40
+ def _ensure_summary_table(self) -> None:
41
+ """Create the conversation_summaries table idempotently at startup."""
42
+ import os
43
+ db_dir = os.path.dirname(self._db_path)
44
+ if db_dir:
45
+ os.makedirs(db_dir, exist_ok=True)
46
+ try:
47
+ with sqlite3.connect(self._db_path) as conn:
48
+ conn.execute(
49
+ """
50
+ CREATE TABLE IF NOT EXISTS conversation_summaries (
51
+ session_id TEXT PRIMARY KEY,
52
+ summary TEXT NOT NULL DEFAULT '',
53
+ updated_at TEXT NOT NULL
54
+ )
55
+ """
56
+ )
57
+ except Exception as exc:
58
+ logger.warning("Could not create conversation_summaries table: %s", exc)
59
+
60
+ def get_summary(self, session_id: str) -> str:
61
+ """Return the rolling summary for this session, or '' if none exists."""
62
+ try:
63
+ with sqlite3.connect(self._db_path) as conn:
64
+ row = conn.execute(
65
+ "SELECT summary FROM conversation_summaries WHERE session_id = ?",
66
+ (session_id,),
67
+ ).fetchone()
68
+ return row[0] if row else ""
69
+ except Exception as exc:
70
+ logger.warning("get_summary failed: %s", exc)
71
+ return ""
72
+
73
+ def set_summary(self, session_id: str, summary: str) -> None:
74
+ """Upsert the rolling summary for this session."""
75
+ try:
76
+ with sqlite3.connect(self._db_path) as conn:
77
+ conn.execute(
78
+ """
79
+ INSERT INTO conversation_summaries (session_id, summary, updated_at)
80
+ VALUES (?, ?, ?)
81
+ ON CONFLICT(session_id) DO UPDATE SET
82
+ summary = excluded.summary,
83
+ updated_at = excluded.updated_at
84
+ """,
85
+ (session_id, summary, datetime.now(tz=timezone.utc).isoformat()),
86
+ )
87
+ except Exception as exc:
88
+ logger.warning("set_summary failed: %s", exc)
89
 
90
  def get_recent(self, session_id: str, max_turns: int = _DEFAULT_MAX_TURNS) -> list[dict]:
91
  """
92
  Return the last `max_turns` completed Q/A pairs for `session_id`,
93
+ oldest first. Each entry: {"q": str, "a": str}.
 
 
 
94
  """
95
  try:
96
  with sqlite3.connect(self._db_path) as conn:
 
104
  (session_id, max_turns),
105
  ).fetchall()
106
  except sqlite3.OperationalError:
 
107
  return []
108
  except Exception as exc:
109
  logger.warning("ConversationStore.get_recent failed: %s", exc)
110
  return []
111
 
 
112
  turns = []
113
  for query, answer in reversed(rows):
114
  a_preview = answer[:_ANSWER_PREVIEW_LEN]
115
  if len(answer) > _ANSWER_PREVIEW_LEN:
116
+ a_preview += "\u2026"
117
  turns.append({"q": query, "a": a_preview})
118
  return turns
119
 
120
  def mark_last_negative(self, session_id: str) -> None:
121
+ """Set feedback=-1 on the most recent interaction for this session."""
 
 
 
 
 
 
 
122
  try:
123
  with sqlite3.connect(self._db_path) as conn:
124
  conn.execute(
 
136
  except Exception as exc:
137
  logger.warning("ConversationStore.mark_last_negative SQLite failed: %s", exc)
138
 
 
139
  if self._github_log is not None:
140
  self._github_log.append_feedback(session_id, feedback=-1)
141
 
142
  def populate_from_records(self, records: list[dict]) -> None:
143
  """
144
  Replay interaction records from the durable GitHub log into SQLite.
145
+ Called at startup when SQLite is empty after a Space restart.
 
 
 
 
146
  """
147
  import os
148
  db_dir = os.path.dirname(self._db_path)
 
176
  )
177
  """
178
  )
 
 
179
  feedback_corrections: dict[str, int] = {}
180
  for r in records:
181
  if r.get("type") == "feedback":
app/services/gemini_client.py CHANGED
@@ -145,6 +145,171 @@ class GeminiClient:
145
  logger.warning("Gemini reformat failed (%s); keeping Groq draft.", exc)
146
  return None
147
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  @property
149
  def is_configured(self) -> bool:
150
  return self._client is not None
 
145
  logger.warning("Gemini reformat failed (%s); keeping Groq draft.", exc)
146
  return None
147
 
148
+ async def decontextualize_query(
149
+ self,
150
+ query: str,
151
+ summary: str,
152
+ ) -> str:
153
+ """
154
+ Rewrite a reference-heavy follow-up query into a self-contained question.
155
+
156
+ Called on the live request path (runs concurrently with Guard) when the
157
+ session has a rolling summary and the query contains pronouns/references.
158
+ Returns the rewritten query, or the original if Gemini is unavailable or
159
+ the call fails.
160
+
161
+ Example:
162
+ query: "What about his caching approach?"
163
+ summary: "Discussed Darshan's RAG system using Qdrant and semantic cache."
164
+ output: "What caching strategy does Darshan use in his RAG system?"
165
+ """
166
+ if not self._client:
167
+ return query
168
+
169
+ prompt = (
170
+ f"Conversation so far:\n{summary}\n\n"
171
+ f"Current question: {query}\n\n"
172
+ "Rewrite the current question as a fully self-contained question that "
173
+ "can be understood without any prior context. Replace all pronouns and "
174
+ "references ('it', 'that', 'this', 'the same', 'his', etc.) with the "
175
+ "specific subject they refer to. Output ONLY the rewritten question β€” "
176
+ "no explanation, no quotes, one sentence."
177
+ )
178
+ try:
179
+ from google.genai import types # noqa: PLC0415
180
+ response = await self._client.aio.models.generate_content( # type: ignore[attr-defined]
181
+ model=self._model,
182
+ contents=prompt,
183
+ config=types.GenerateContentConfig(temperature=0.1, max_output_tokens=80),
184
+ )
185
+ rewritten = (response.candidates[0].content.parts[0].text or "").strip().strip('"').strip("'")
186
+ if rewritten and rewritten != query:
187
+ logger.debug("Decontextualized %r β†’ %r", query[:50], rewritten[:60])
188
+ return rewritten
189
+ except Exception as exc:
190
+ logger.warning("decontextualize_query failed (%s); using original.", exc)
191
+ return query
192
+
193
+ async def update_conversation_summary(
194
+ self,
195
+ previous_summary: str,
196
+ new_turn_q: str,
197
+ new_turn_a: str,
198
+ processing_api_key: str = "",
199
+ ) -> str:
200
+ """
201
+ Progressive summary update β€” called AFTER the response is delivered
202
+ so it adds zero perceived latency.
203
+
204
+ Takes the previous rolling summary (initially empty) and one new Q/A turn
205
+ and asks Gemini Flash to produce an updated single-paragraph summary of
206
+ the entire conversation, capped at 150 tokens.
207
+
208
+ Uses the GEMINI_PROCESSING_API_KEY when provided so this offline step
209
+ does not consume live API quota. Falls back to the instance's own client
210
+ if no processing key is set.
211
+ """
212
+ if not self._client and not processing_api_key:
213
+ return previous_summary
214
+
215
+ prior_block = (
216
+ f"Previous summary:\n{previous_summary}\n\n" if previous_summary else ""
217
+ )
218
+ prompt = (
219
+ f"{prior_block}"
220
+ f"New turn:\nQ: {new_turn_q}\nA: {new_turn_a[:300]}\n\n"
221
+ "Write an updated summary of the whole conversation in ONE paragraph "
222
+ "of at most 150 tokens. Mention the topics discussed and key facts "
223
+ "established. Be specific (include names, project names, technologies). "
224
+ "Output ONLY the summary paragraph."
225
+ )
226
+ try:
227
+ from google.genai import types # noqa: PLC0415
228
+
229
+ # Use a separate client with the processing key when provided.
230
+ if processing_api_key:
231
+ from google import genai as _genai # noqa: PLC0415
232
+ proc_client = _genai.Client(api_key=processing_api_key)
233
+ client_to_use = proc_client.aio
234
+ else:
235
+ client_to_use = self._client.aio # type: ignore[attr-defined]
236
+
237
+ response = await client_to_use.models.generate_content(
238
+ model=self._model,
239
+ contents=prompt,
240
+ config=types.GenerateContentConfig(temperature=0.0, max_output_tokens=180),
241
+ )
242
+ text = (response.candidates[0].content.parts[0].text or "").strip()
243
+ if text:
244
+ logger.debug("Conversation summary updated (%d chars).", len(text))
245
+ return text
246
+ except Exception as exc:
247
+ logger.warning("update_conversation_summary failed (%s); keeping previous.", exc)
248
+ return previous_summary
249
+
250
+ async def critique_rag_answer(
251
+ self,
252
+ query: str,
253
+ context_block: str,
254
+ answer: str,
255
+ decontextualized_query: str = "",
256
+ ) -> dict[str, int]:
257
+ """
258
+ SELF-RAG critic: score Groq's generated answer on three dimensions (1–3).
259
+
260
+ Dimension 1 β€” Groundedness: Are all factual claims supported by a chunk?
261
+ Dimension 2 β€” Completeness: Does the answer use all relevant available chunks?
262
+ Dimension 3 β€” Specificity: Does the answer give names/numbers/details?
263
+
264
+ Returns {"groundedness": int, "completeness": int, "specificity": int}.
265
+ Defaults to {"groundedness": 3, "completeness": 3, "specificity": 3} when
266
+ Gemini is unavailable (treat as high quality to avoid unnecessary retries).
267
+ """
268
+ _default = {"groundedness": 3, "completeness": 3, "specificity": 3}
269
+ if not self._client:
270
+ return _default
271
+
272
+ display_query = decontextualized_query or query
273
+ prompt = (
274
+ f"Original question: {query}\n"
275
+ + (f"Interpreted as: {decontextualized_query}\n" if decontextualized_query and decontextualized_query != query else "")
276
+ + f"\nRetrieved passages:\n{context_block[:3000]}\n\n"
277
+ f"Generated answer:\n{answer[:1500]}\n\n"
278
+ "Score the answer on three dimensions. Output ONLY three lines in this exact format:\n"
279
+ "groundedness: <1|2|3>\n"
280
+ "completeness: <1|2|3>\n"
281
+ "specificity: <1|2|3>\n\n"
282
+ "Scoring guide:\n"
283
+ "groundedness β€” 3: every claim comes from a passage. 2: most do. 1: claims not in passages.\n"
284
+ "completeness β€” 3: all relevant passages used. 2: partially used. 1: relevant passages ignored.\n"
285
+ "specificity β€” 3: specific details (names, numbers, examples). 2: mixed. 1: entirely generic.\n"
286
+ )
287
+ try:
288
+ from google.genai import types # noqa: PLC0415
289
+ response = await self._client.aio.models.generate_content( # type: ignore[attr-defined]
290
+ model=self._model,
291
+ contents=prompt,
292
+ config=types.GenerateContentConfig(temperature=0.0, max_output_tokens=30),
293
+ )
294
+ text = (response.candidates[0].content.parts[0].text or "").strip()
295
+ scores: dict[str, int] = {}
296
+ for line in text.splitlines():
297
+ if ":" in line:
298
+ k, _, v = line.partition(":")
299
+ k = k.strip().lower()
300
+ try:
301
+ val = int(v.strip())
302
+ if k in ("groundedness", "completeness", "specificity") and 1 <= val <= 3:
303
+ scores[k] = val
304
+ except ValueError:
305
+ pass
306
+ if len(scores) == 3:
307
+ logger.debug("SELF-RAG critic: %s", scores)
308
+ return scores
309
+ except Exception as exc:
310
+ logger.warning("critique_rag_answer failed (%s); defaulting to high quality.", exc)
311
+ return _default
312
+
313
    @property
    def is_configured(self) -> bool:
        """True when a Gemini client instance is present (API key accepted)."""
        return self._client is not None
app/services/vector_store.py CHANGED
@@ -234,3 +234,43 @@ class VectorStore:
234
  except Exception as exc:
235
  logger.warning("fetch_by_doc_id failed for %r: %s", doc_id, exc)
236
  return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
  except Exception as exc:
235
  logger.warning("fetch_by_doc_id failed for %r: %s", doc_id, exc)
236
  return []
237
+
238
+ def search_by_raptor_level(
239
+ self,
240
+ query_vector: list[float],
241
+ level: int,
242
+ top_k: int = 5,
243
+ ) -> list[Chunk]:
244
+ """
245
+ Dense vector search restricted to chunks at a specific RAPTOR hierarchy level.
246
+
247
+ level=0 β†’ leaf chunks (normal passage-level chunks).
248
+ level=1 β†’ cluster summary nodes generated by RaptorBuilder.
249
+ level=2 β†’ reserved for document-level summaries.
250
+
251
+ Filter is applied via Qdrant payload filter on metadata.raptor_level.
252
+ Old chunks that pre-date RAPTOR indexing lack the field and are excluded,
253
+ which is the correct behaviour (they are effectively level-0 leaves already
254
+ returned by the main dense search in retrieve.py).
255
+ """
256
+ try:
257
+ results = self.client.search(
258
+ collection_name=self.collection,
259
+ query_vector=NamedVector(name=_DENSE_VEC, vector=query_vector),
260
+ limit=top_k,
261
+ query_filter=Filter(
262
+ must=[
263
+ FieldCondition(
264
+ key="metadata.raptor_level",
265
+ match=MatchValue(value=level),
266
+ )
267
+ ]
268
+ ),
269
+ with_payload=True,
270
+ )
271
+ return [Chunk(**hit.payload) for hit in results if hit.payload]
272
+ except Exception as exc:
273
+ logger.warning(
274
+ "search_by_raptor_level(level=%d) failed: %s β€” skipping RAPTOR results.", level, exc
275
+ )
276
+ return []