Spaces:
Running
Running
GitHub Actions committed on
Commit Β·
efdd22e
1
Parent(s): 3d134a6
Deploy c8a8192
Browse files- app/api/chat.py +49 -46
- app/core/topic.py +79 -0
- app/models/pipeline.py +3 -0
- app/pipeline/nodes/cache.py +8 -0
- app/pipeline/nodes/gemini_fast.py +13 -5
- app/pipeline/nodes/generate.py +82 -29
- app/pipeline/nodes/guard.py +40 -27
- app/pipeline/nodes/retrieve.py +65 -5
- pytest.ini +2 -0
- tests/conftest.py +14 -4
app/api/chat.py
CHANGED
|
@@ -1,6 +1,4 @@
|
|
| 1 |
-
import asyncio
|
| 2 |
import json
|
| 3 |
-
import re
|
| 4 |
import time
|
| 5 |
from fastapi import APIRouter, Request, Depends
|
| 6 |
from fastapi.responses import StreamingResponse
|
|
@@ -37,7 +35,7 @@ async def _generate_follow_ups(
|
|
| 37 |
) -> list[str]:
|
| 38 |
"""
|
| 39 |
Generates 3 specific follow-up questions after the main answer is complete.
|
| 40 |
-
Runs
|
| 41 |
|
| 42 |
Questions must be:
|
| 43 |
- Specific to the answer content (never generic like "tell me more")
|
|
@@ -85,10 +83,21 @@ async def chat_endpoint(
|
|
| 85 |
request_data: ChatRequest,
|
| 86 |
token_payload: dict = Depends(verify_jwt),
|
| 87 |
) -> StreamingResponse:
|
| 88 |
-
"""Stream RAG answer as SSE
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
start_time = time.monotonic()
|
| 90 |
|
| 91 |
-
# All singletons pre-built in lifespan β zero allocation in hot path.
|
| 92 |
pipeline = request.app.state.pipeline
|
| 93 |
conv_store = request.app.state.conversation_store
|
| 94 |
llm_client = request.app.state.llm_client
|
|
@@ -120,6 +129,8 @@ async def chat_endpoint(
|
|
| 120 |
"retrieval_attempts": 0,
|
| 121 |
"rewritten_query": None,
|
| 122 |
"follow_ups": [],
|
|
|
|
|
|
|
| 123 |
}
|
| 124 |
|
| 125 |
async def sse_generator():
|
|
@@ -129,45 +140,35 @@ async def chat_endpoint(
|
|
| 129 |
interaction_id = None
|
| 130 |
|
| 131 |
try:
|
| 132 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
if await request.is_disconnected():
|
| 134 |
break
|
| 135 |
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
#
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
answer_update[len(final_answer):]
|
| 156 |
-
if answer_update.startswith(final_answer)
|
| 157 |
-
else answer_update
|
| 158 |
-
)
|
| 159 |
-
final_answer = answer_update
|
| 160 |
-
if delta:
|
| 161 |
-
yield f'data: {json.dumps({"token": delta})}\n\n'
|
| 162 |
-
|
| 163 |
-
if "sources" in updates:
|
| 164 |
-
final_sources = updates["sources"]
|
| 165 |
-
|
| 166 |
-
if "cached" in updates:
|
| 167 |
-
is_cached = updates["cached"]
|
| 168 |
-
|
| 169 |
-
if "interaction_id" in updates and updates["interaction_id"] is not None:
|
| 170 |
-
interaction_id = updates["interaction_id"]
|
| 171 |
|
| 172 |
elapsed_ms = int((time.monotonic() - start_time) * 1000)
|
| 173 |
|
|
@@ -178,24 +179,26 @@ async def chat_endpoint(
|
|
| 178 |
for s in final_sources
|
| 179 |
]
|
| 180 |
|
| 181 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
|
| 183 |
# ββ Follow-up questions ββββββββββββββββββββββββββββββββββββββββββββ
|
| 184 |
# Generated after the done event so it never delays answer delivery.
|
| 185 |
-
# Works for both cache hits (no sources) and full RAG responses.
|
| 186 |
if final_answer and not await request.is_disconnected():
|
| 187 |
follow_ups = await _generate_follow_ups(
|
| 188 |
request_data.message, final_answer, final_sources, llm_client
|
| 189 |
)
|
| 190 |
if follow_ups:
|
| 191 |
-
yield f
|
| 192 |
|
| 193 |
except Exception as exc:
|
| 194 |
-
yield f
|
| 195 |
|
| 196 |
return StreamingResponse(
|
| 197 |
sse_generator(),
|
| 198 |
media_type="text/event-stream",
|
| 199 |
headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
|
| 200 |
)
|
| 201 |
-
|
|
|
|
|
|
|
| 1 |
import json
|
|
|
|
| 2 |
import time
|
| 3 |
from fastapi import APIRouter, Request, Depends
|
| 4 |
from fastapi.responses import StreamingResponse
|
|
|
|
| 35 |
) -> list[str]:
|
| 36 |
"""
|
| 37 |
Generates 3 specific follow-up questions after the main answer is complete.
|
| 38 |
+
Runs after the answer stream finishes β zero added latency before first token.
|
| 39 |
|
| 40 |
Questions must be:
|
| 41 |
- Specific to the answer content (never generic like "tell me more")
|
|
|
|
| 83 |
request_data: ChatRequest,
|
| 84 |
token_payload: dict = Depends(verify_jwt),
|
| 85 |
) -> StreamingResponse:
|
| 86 |
+
"""Stream RAG answer as typed SSE events.
|
| 87 |
+
|
| 88 |
+
Event sequence for a full RAG request:
|
| 89 |
+
event: status β guard label, cache miss, gemini routing, retrieve labels
|
| 90 |
+
event: reading β one per unique source found in Qdrant (before rerank)
|
| 91 |
+
event: sources β final selected sources array (after rerank)
|
| 92 |
+
event: thinking β CoT scratchpad tokens (70B only)
|
| 93 |
+
event: token β answer tokens
|
| 94 |
+
event: follow_ups β three suggested follow-up questions
|
| 95 |
+
|
| 96 |
+
For cache hits: status β status β token
|
| 97 |
+
For Gemini fast-path: status β status β token
|
| 98 |
+
"""
|
| 99 |
start_time = time.monotonic()
|
| 100 |
|
|
|
|
| 101 |
pipeline = request.app.state.pipeline
|
| 102 |
conv_store = request.app.state.conversation_store
|
| 103 |
llm_client = request.app.state.llm_client
|
|
|
|
| 129 |
"retrieval_attempts": 0,
|
| 130 |
"rewritten_query": None,
|
| 131 |
"follow_ups": [],
|
| 132 |
+
"path": None,
|
| 133 |
+
"query_topic": None,
|
| 134 |
}
|
| 135 |
|
| 136 |
async def sse_generator():
|
|
|
|
| 140 |
interaction_id = None
|
| 141 |
|
| 142 |
try:
|
| 143 |
+
# stream_mode=["custom", "updates"] yields (mode, data) tuples:
|
| 144 |
+
# mode="custom" β data is whatever writer(payload) was called with
|
| 145 |
+
# mode="updates" β data is {node_name: state_updates_dict}
|
| 146 |
+
async for mode, data in pipeline.astream(
|
| 147 |
+
initial_state,
|
| 148 |
+
stream_mode=["custom", "updates"],
|
| 149 |
+
):
|
| 150 |
if await request.is_disconnected():
|
| 151 |
break
|
| 152 |
|
| 153 |
+
if mode == "custom":
|
| 154 |
+
# Forward writer events as named SSE events.
|
| 155 |
+
# Each node emits {"type": "<event_name>", ...payload}.
|
| 156 |
+
event_type = data.get("type", "status")
|
| 157 |
+
# Strip the "type" key so the client receives a clean payload.
|
| 158 |
+
payload = {k: v for k, v in data.items() if k != "type"}
|
| 159 |
+
yield f"event: {event_type}\ndata: {json.dumps(payload)}\n\n"
|
| 160 |
+
|
| 161 |
+
elif mode == "updates":
|
| 162 |
+
# Capture terminal state for the done event; do not re-emit tokens.
|
| 163 |
+
for _node_name, updates in data.items():
|
| 164 |
+
if "sources" in updates and updates["sources"]:
|
| 165 |
+
final_sources = updates["sources"]
|
| 166 |
+
if "cached" in updates:
|
| 167 |
+
is_cached = updates["cached"]
|
| 168 |
+
if "interaction_id" in updates and updates["interaction_id"] is not None:
|
| 169 |
+
interaction_id = updates["interaction_id"]
|
| 170 |
+
if "answer" in updates and updates["answer"]:
|
| 171 |
+
final_answer = updates["answer"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
|
| 173 |
elapsed_ms = int((time.monotonic() - start_time) * 1000)
|
| 174 |
|
|
|
|
| 179 |
for s in final_sources
|
| 180 |
]
|
| 181 |
|
| 182 |
+
# The done event uses plain data: (no event: type) for backward
|
| 183 |
+
# compatibility with widgets that listen on the raw data channel.
|
| 184 |
+
yield (
|
| 185 |
+
f"data: {json.dumps({'done': True, 'sources': sources_list, 'cached': is_cached, 'latency_ms': elapsed_ms, 'interaction_id': interaction_id})}\n\n"
|
| 186 |
+
)
|
| 187 |
|
| 188 |
# ββ Follow-up questions ββββββββββββββββββββββββββββββββββββββββββββ
|
| 189 |
# Generated after the done event so it never delays answer delivery.
|
|
|
|
| 190 |
if final_answer and not await request.is_disconnected():
|
| 191 |
follow_ups = await _generate_follow_ups(
|
| 192 |
request_data.message, final_answer, final_sources, llm_client
|
| 193 |
)
|
| 194 |
if follow_ups:
|
| 195 |
+
yield f"event: follow_ups\ndata: {json.dumps({'questions': follow_ups})}\n\n"
|
| 196 |
|
| 197 |
except Exception as exc:
|
| 198 |
+
yield f"data: {json.dumps({'error': str(exc) or 'Generation failed'})}\n\n"
|
| 199 |
|
| 200 |
return StreamingResponse(
|
| 201 |
sse_generator(),
|
| 202 |
media_type="text/event-stream",
|
| 203 |
headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
|
| 204 |
)
|
|
|
app/core/topic.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
backend/app/core/topic.py

Extracts a 1-3 word topic label from a natural-language query.

Used by Guard, Retrieve, and any node that surfaces context-specific status
labels ("Checking your question about machine learning", "Searching portfolio
for RAG pipeline") without any LLM call. The extraction is a pure set-lookup
-- it adds no measurable latency.

>>> extract_topic("What are Darshan's machine learning projects?")
'machine learning projects'
>>> extract_topic("Tell me about his background")
'background'
>>> extract_topic("How does he implement RAG?")
'implement RAG'
>>> extract_topic("What is")
'What is'
"""
from __future__ import annotations

import re

# Comprehensive stopword set: prepositions, articles, auxiliary verbs, common
# question words, personal pronouns, demonstratives, and portfolio-query filler.
# Content-bearing words (nouns, adjectives, verbs like "implement", "built")
# are intentionally absent -- they ARE the topic.
_STOPWORDS: frozenset[str] = frozenset({
    # Articles
    "a", "an", "the",
    # Prepositions
    "about", "above", "across", "after", "against", "along", "among",
    "around", "at", "before", "behind", "below", "beneath", "beside",
    "between", "beyond", "by", "during", "except", "for", "from", "in",
    "inside", "into", "like", "near", "of", "off", "on", "onto", "out",
    "outside", "over", "past", "regarding", "since", "through",
    "throughout", "to", "toward", "under", "underneath", "until", "up",
    "upon", "with", "within", "without",
    # Conjunctions
    "and", "but", "or", "nor", "so", "yet", "both", "either", "neither",
    # Common auxiliary verbs
    "is", "are", "was", "were", "be", "been", "being",
    "has", "have", "had", "do", "does", "did",
    "will", "would", "could", "should", "may", "might", "can", "shall",
    # Question words
    "what", "who", "where", "when", "how", "why", "which",
    # Personal pronouns
    "i", "you", "he", "she", "it", "we", "they",
    "me", "him", "her", "us", "them",
    "my", "your", "his", "its", "our", "their",
    "mine", "yours", "hers", "ours", "theirs",
    # Demonstratives
    "this", "that", "these", "those",
    # Common portfolio-query filler
    "tell", "me", "about", "show", "give", "list", "get", "find",
    "look", "also", "just", "really", "very", "more", "most",
    "some", "any", "other", "another", "same", "such", "own",
    "darshan", "chheda",  # owner name is not a useful topic word
})


def extract_topic(query: str) -> str:
    """Return a 1-3 word topic phrase extracted from ``query``.

    Words matching the stopword set are stripped (case-insensitive). The first
    1-3 remaining words are returned joined by spaces. If the query resolves
    to zero content words (all stopwords, or empty), the first two whitespace-
    separated tokens of the original query are returned unchanged so the caller
    always receives a non-empty string (except for an empty ``query``, which is
    returned as-is).
    """
    # The tokenizer keeps apostrophes so contractions/possessives stay intact
    # as single tokens ("Darshan's", "it's").
    tokens = re.findall(r"[a-zA-Z']+", query)
    # BUGFIX: normalize the possessive suffix before the stopword lookup.
    # Without it, "Darshan's" lowercases to "darshan's", which misses the
    # "darshan" stopword entry and leaks the owner's name into the topic --
    # the first doctest above would have returned "Darshan's machine learning".
    # Single-character tokens are dropped as noise (e.g. a stray "s" or "I").
    content = [
        t for t in tokens
        if t.lower().removesuffix("'s") not in _STOPWORDS and len(t) > 1
    ]

    if not content:
        # Fallback: keep the first two words of the original query verbatim.
        parts = query.strip().split()
        return " ".join(parts[:2]) if len(parts) >= 2 else (parts[0] if parts else query)

    return " ".join(content[:3])
|
app/models/pipeline.py
CHANGED
|
@@ -57,3 +57,6 @@ class PipelineState(TypedDict):
|
|
| 57 |
# data_prep.py filters to path=="rag" when building reranker triplets because
|
| 58 |
# only RAG interactions have chunk associations that form valid training pairs.
|
| 59 |
path: Optional[str]
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
# data_prep.py filters to path=="rag" when building reranker triplets because
|
| 58 |
# only RAG interactions have chunk associations that form valid training pairs.
|
| 59 |
path: Optional[str]
|
| 60 |
+
# 1β3 word topic extracted from the query by the guard node (extract_topic).
|
| 61 |
+
# Stored in state so retrieve_node can reuse it without recomputing.
|
| 62 |
+
query_topic: Optional[str]
|
app/pipeline/nodes/cache.py
CHANGED
|
@@ -18,6 +18,7 @@
|
|
| 18 |
from typing import Callable
|
| 19 |
|
| 20 |
import numpy as np
|
|
|
|
| 21 |
|
| 22 |
from app.models.pipeline import PipelineState
|
| 23 |
from app.services.semantic_cache import SemanticCache
|
|
@@ -44,6 +45,9 @@ def _has_unresolved_reference(query: str) -> bool:
|
|
| 44 |
|
| 45 |
def make_cache_node(cache: SemanticCache, embedder) -> Callable[[PipelineState], dict]:
|
| 46 |
async def cache_node(state: PipelineState) -> dict:
|
|
|
|
|
|
|
|
|
|
| 47 |
query = state["query"]
|
| 48 |
has_history = bool(state.get("conversation_history"))
|
| 49 |
|
|
@@ -62,6 +66,10 @@ def make_cache_node(cache: SemanticCache, embedder) -> Callable[[PipelineState],
|
|
| 62 |
|
| 63 |
cached = await cache.get(query_embedding)
|
| 64 |
if cached:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
return {
|
| 66 |
"answer": cached,
|
| 67 |
"cached": True,
|
|
|
|
| 18 |
from typing import Callable
|
| 19 |
|
| 20 |
import numpy as np
|
| 21 |
+
from langgraph.config import get_stream_writer
|
| 22 |
|
| 23 |
from app.models.pipeline import PipelineState
|
| 24 |
from app.services.semantic_cache import SemanticCache
|
|
|
|
| 45 |
|
| 46 |
def make_cache_node(cache: SemanticCache, embedder) -> Callable[[PipelineState], dict]:
|
| 47 |
async def cache_node(state: PipelineState) -> dict:
|
| 48 |
+
writer = get_stream_writer()
|
| 49 |
+
writer({"type": "status", "label": "Looking up in memory..."})
|
| 50 |
+
|
| 51 |
query = state["query"]
|
| 52 |
has_history = bool(state.get("conversation_history"))
|
| 53 |
|
|
|
|
| 66 |
|
| 67 |
cached = await cache.get(query_embedding)
|
| 68 |
if cached:
|
| 69 |
+
writer({"type": "status", "label": "Found a recent answer, loading..."})
|
| 70 |
+
# Emit the full cached answer as a single token event β the cache
|
| 71 |
+
# returns complete text, not a stream, so one event is correct.
|
| 72 |
+
writer({"type": "token", "text": cached})
|
| 73 |
return {
|
| 74 |
"answer": cached,
|
| 75 |
"cached": True,
|
app/pipeline/nodes/gemini_fast.py
CHANGED
|
@@ -23,6 +23,8 @@ from __future__ import annotations
|
|
| 23 |
import logging
|
| 24 |
from typing import Any
|
| 25 |
|
|
|
|
|
|
|
| 26 |
from app.models.pipeline import PipelineState
|
| 27 |
from app.services.gemini_client import GeminiClient
|
| 28 |
from app.core.quality import is_low_trust
|
|
@@ -70,6 +72,9 @@ def make_gemini_fast_node(gemini_client: GeminiClient) -> Any:
|
|
| 70 |
"""
|
| 71 |
|
| 72 |
async def gemini_fast(state: PipelineState) -> dict:
|
|
|
|
|
|
|
|
|
|
| 73 |
query = state["query"]
|
| 74 |
complexity = "complex" if _is_complex(query) else "simple"
|
| 75 |
|
|
@@ -77,6 +82,7 @@ def make_gemini_fast_node(gemini_client: GeminiClient) -> Any:
|
|
| 77 |
# traffic straight to RAG β behaviour is identical to the old graph.
|
| 78 |
if not gemini_client.is_configured:
|
| 79 |
logger.debug("Gemini not configured; routing query to RAG.")
|
|
|
|
| 80 |
return {
|
| 81 |
"query_complexity": complexity,
|
| 82 |
"expanded_queries": [query],
|
|
@@ -90,21 +96,22 @@ def make_gemini_fast_node(gemini_client: GeminiClient) -> Any:
|
|
| 90 |
|
| 91 |
if answer is not None:
|
| 92 |
# Run the same quality gate that guards Groq answers.
|
| 93 |
-
# Gemini fast-path has no retrieved chunks, so only the hedge-phrase
|
| 94 |
-
# and short-complex-answer signals apply (chunks argument is []).
|
| 95 |
if is_low_trust(answer, [], complexity):
|
| 96 |
logger.debug(
|
| 97 |
"Gemini fast-path answer failed quality gate β routing to RAG."
|
| 98 |
)
|
| 99 |
-
|
| 100 |
return {
|
| 101 |
"query_complexity": complexity,
|
| 102 |
"expanded_queries": [query],
|
| 103 |
"thinking": True,
|
| 104 |
}
|
| 105 |
|
| 106 |
-
# Gemini answered
|
| 107 |
logger.debug("Gemini fast-path answered query (len=%d)", len(answer))
|
|
|
|
|
|
|
|
|
|
| 108 |
return {
|
| 109 |
"query_complexity": complexity,
|
| 110 |
"answer": answer,
|
|
@@ -113,9 +120,10 @@ def make_gemini_fast_node(gemini_client: GeminiClient) -> Any:
|
|
| 113 |
"path": "gemini_fast",
|
| 114 |
}
|
| 115 |
|
| 116 |
-
# Gemini called search_knowledge_base() β
|
| 117 |
rag_query = tool_query or query
|
| 118 |
logger.debug("Gemini routed to RAG (tool_query=%r)", rag_query)
|
|
|
|
| 119 |
return {
|
| 120 |
"query_complexity": complexity,
|
| 121 |
"expanded_queries": [rag_query],
|
|
|
|
| 23 |
import logging
|
| 24 |
from typing import Any
|
| 25 |
|
| 26 |
+
from langgraph.config import get_stream_writer
|
| 27 |
+
|
| 28 |
from app.models.pipeline import PipelineState
|
| 29 |
from app.services.gemini_client import GeminiClient
|
| 30 |
from app.core.quality import is_low_trust
|
|
|
|
| 72 |
"""
|
| 73 |
|
| 74 |
async def gemini_fast(state: PipelineState) -> dict:
|
| 75 |
+
writer = get_stream_writer()
|
| 76 |
+
writer({"type": "status", "label": "Thinking about your question directly..."})
|
| 77 |
+
|
| 78 |
query = state["query"]
|
| 79 |
complexity = "complex" if _is_complex(query) else "simple"
|
| 80 |
|
|
|
|
| 82 |
# traffic straight to RAG β behaviour is identical to the old graph.
|
| 83 |
if not gemini_client.is_configured:
|
| 84 |
logger.debug("Gemini not configured; routing query to RAG.")
|
| 85 |
+
writer({"type": "status", "label": "Needs deeper search, checking portfolio..."})
|
| 86 |
return {
|
| 87 |
"query_complexity": complexity,
|
| 88 |
"expanded_queries": [query],
|
|
|
|
| 96 |
|
| 97 |
if answer is not None:
|
| 98 |
# Run the same quality gate that guards Groq answers.
|
|
|
|
|
|
|
| 99 |
if is_low_trust(answer, [], complexity):
|
| 100 |
logger.debug(
|
| 101 |
"Gemini fast-path answer failed quality gate β routing to RAG."
|
| 102 |
)
|
| 103 |
+
writer({"type": "status", "label": "Needs deeper search, checking portfolio..."})
|
| 104 |
return {
|
| 105 |
"query_complexity": complexity,
|
| 106 |
"expanded_queries": [query],
|
| 107 |
"thinking": True,
|
| 108 |
}
|
| 109 |
|
| 110 |
+
# Gemini answered and passed the quality gate.
|
| 111 |
logger.debug("Gemini fast-path answered query (len=%d)", len(answer))
|
| 112 |
+
writer({"type": "status", "label": "Got a direct answer, writing now..."})
|
| 113 |
+
# Gemini does not stream; emit the complete answer as one token event.
|
| 114 |
+
writer({"type": "token", "text": answer})
|
| 115 |
return {
|
| 116 |
"query_complexity": complexity,
|
| 117 |
"answer": answer,
|
|
|
|
| 120 |
"path": "gemini_fast",
|
| 121 |
}
|
| 122 |
|
| 123 |
+
# Gemini called search_knowledge_base() β route to full RAG.
|
| 124 |
rag_query = tool_query or query
|
| 125 |
logger.debug("Gemini routed to RAG (tool_query=%r)", rag_query)
|
| 126 |
+
writer({"type": "status", "label": "Needs deeper search, checking portfolio..."})
|
| 127 |
return {
|
| 128 |
"query_complexity": complexity,
|
| 129 |
"expanded_queries": [rag_query],
|
app/pipeline/nodes/generate.py
CHANGED
|
@@ -2,6 +2,8 @@ import logging
|
|
| 2 |
import re
|
| 3 |
from typing import Callable
|
| 4 |
|
|
|
|
|
|
|
| 5 |
from app.models.chat import SourceRef
|
| 6 |
from app.models.pipeline import PipelineState
|
| 7 |
from app.services.llm_client import LLMClient
|
|
@@ -139,37 +141,38 @@ def _format_history(history: list[dict]) -> str:
|
|
| 139 |
|
| 140 |
|
| 141 |
def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[PipelineState], dict]: # noqa: ANN001
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
async def generate_node(state: PipelineState) -> dict:
|
|
|
|
| 143 |
query = state["query"]
|
| 144 |
complexity = state.get("query_complexity", "simple")
|
| 145 |
reranked_chunks = state.get("reranked_chunks", [])
|
| 146 |
|
| 147 |
# ββ Not-found path βββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 148 |
-
# Retrieve found no relevant chunks (either KB empty or below rerank
|
| 149 |
-
# threshold). Use a short, model-generated honest refusal so guard
|
| 150 |
-
# rejections and not-found both route here with quality responses.
|
| 151 |
if not reranked_chunks:
|
|
|
|
| 152 |
history_prefix = _format_history(state.get("conversation_history") or [])
|
| 153 |
stream = llm_client.complete_with_complexity(
|
| 154 |
prompt=f"{history_prefix}Visitor question: {query}",
|
| 155 |
system=_NOT_FOUND_SYSTEM,
|
| 156 |
stream=True,
|
| 157 |
-
complexity="simple",
|
| 158 |
)
|
| 159 |
full_answer = ""
|
| 160 |
async for token in stream:
|
| 161 |
full_answer += token
|
|
|
|
| 162 |
return {"answer": full_answer, "sources": [], "path": "rag"}
|
| 163 |
|
| 164 |
# ββ Pre-LLM coherence shortcut ββββββββββββββββββββββββββββββββββββββ
|
| 165 |
-
# Check that at least one meaningful query token appears somewhere in
|
| 166 |
-
# the retrieved chunks. If there is zero textual overlap AND the top
|
| 167 |
-
# rerank score is negative, the retriever returned topically unrelated
|
| 168 |
-
# chunks β skip the LLM call entirely and go straight to not-found.
|
| 169 |
-
# This saves a Groq call (~300ms) when the KB truly has nothing.
|
| 170 |
top_score = reranked_chunks[0]["metadata"].get("rerank_score", 0.0)
|
| 171 |
query_toks = _query_tokens(query)
|
| 172 |
if top_score < 0.0 and not _chunks_overlap_query(query_toks, reranked_chunks):
|
|
|
|
| 173 |
history_prefix = _format_history(state.get("conversation_history") or [])
|
| 174 |
stream = llm_client.complete_with_complexity(
|
| 175 |
prompt=f"{history_prefix}Visitor question: {query}",
|
|
@@ -180,6 +183,7 @@ def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[P
|
|
| 180 |
full_answer = ""
|
| 181 |
async for token in stream:
|
| 182 |
full_answer += token
|
|
|
|
| 183 |
return {"answer": full_answer, "sources": [], "path": "rag"}
|
| 184 |
|
| 185 |
# ββ Build numbered context block ββββββββββββββββββββββββββββββββββββ
|
|
@@ -188,7 +192,6 @@ def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[P
|
|
| 188 |
|
| 189 |
for i, chunk in enumerate(reranked_chunks, start=1):
|
| 190 |
meta = chunk["metadata"]
|
| 191 |
-
# Include title and URL so the LLM can verify passage relevance.
|
| 192 |
header = f"[{i}] {meta['source_title']}"
|
| 193 |
if meta.get("source_url"):
|
| 194 |
header += f" ({meta['source_url']})"
|
|
@@ -203,10 +206,6 @@ def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[P
|
|
| 203 |
|
| 204 |
context_block = "\n\n".join(context_parts)
|
| 205 |
|
| 206 |
-
# ββ Compact conversation history prefix βββββββββββββββββββββββββββββ
|
| 207 |
-
# Injected before passages so the model can resolve follow-up references
|
| 208 |
-
# ("tell me more", "which one used Java?", "that was wrong") without
|
| 209 |
-
# needing to re-retrieve resolved information.
|
| 210 |
history_prefix = _format_history(state.get("conversation_history") or [])
|
| 211 |
is_criticism = state.get("is_criticism", False)
|
| 212 |
criticism_note = (
|
|
@@ -216,44 +215,98 @@ def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[P
|
|
| 216 |
)
|
| 217 |
prompt = f"{criticism_note}{history_prefix}Passages:\n{context_block}\n\nVisitor question: {query}"
|
| 218 |
|
| 219 |
-
# ββ
|
| 220 |
-
#
|
| 221 |
-
#
|
| 222 |
-
#
|
| 223 |
-
#
|
|
|
|
| 224 |
stream = llm_client.complete_with_complexity(
|
| 225 |
prompt=prompt,
|
| 226 |
system=_SYSTEM_PROMPT,
|
| 227 |
stream=True,
|
| 228 |
complexity=complexity,
|
| 229 |
)
|
| 230 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
async for token in stream:
|
| 232 |
raw_answer += token
|
| 233 |
-
|
| 234 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
full_answer = re.sub(r"<think>.*?</think>\s*", "", raw_answer, flags=re.DOTALL).strip()
|
| 236 |
|
| 237 |
# ββ Quality gate: Gemini editorial reformat ββββββββββββββββββββββββββ
|
| 238 |
-
# Fires when: (a) criticism
|
| 239 |
-
# (b) low-trust heuristic flags the draft
|
| 240 |
-
# Zero extra cost on good responses; ~200-400ms only when genuinely needed.
|
| 241 |
if gemini_client is not None and (is_criticism or is_low_trust(full_answer, reranked_chunks, complexity)):
|
| 242 |
logger.debug("Triggering Gemini reformat (criticism=%s).", is_criticism)
|
| 243 |
reformatted = await gemini_client.reformat_rag_answer(query, context_block, full_answer)
|
| 244 |
if reformatted:
|
| 245 |
full_answer = reformatted
|
| 246 |
|
| 247 |
-
# Only surface sources the LLM actually cited
|
| 248 |
-
# Fall back to top-2 if the model produced no [N] markers.
|
| 249 |
cited_indices = {int(m) for m in re.findall(r"\[(\d+)\]", full_answer)}
|
| 250 |
cited_sources = [sr for i, sr in enumerate(source_refs, start=1) if i in cited_indices]
|
| 251 |
|
| 252 |
return {
|
| 253 |
"answer": full_answer,
|
| 254 |
"sources": cited_sources if cited_sources else source_refs[:2],
|
| 255 |
-
# Tag this interaction so data_prep.py can filter to RAG-path only
|
| 256 |
-
# when building reranker triplets (only RAG has chunk associations).
|
| 257 |
"path": "rag",
|
| 258 |
}
|
| 259 |
|
|
|
|
| 2 |
import re
|
| 3 |
from typing import Callable
|
| 4 |
|
| 5 |
+
from langgraph.config import get_stream_writer
|
| 6 |
+
|
| 7 |
from app.models.chat import SourceRef
|
| 8 |
from app.models.pipeline import PipelineState
|
| 9 |
from app.services.llm_client import LLMClient
|
|
|
|
| 141 |
|
| 142 |
|
| 143 |
def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[PipelineState], dict]: # noqa: ANN001
|
| 144 |
+
# Number of token chunks to buffer before deciding there is no CoT block.
|
| 145 |
+
# Llama 3.1 8B may omit <think> entirely; Llama 3.3 70B always starts with one.
|
| 146 |
+
# 50 chunks is enough to cover the opening tag without delaying short answers.
|
| 147 |
+
_THINK_LOOKAHEAD: int = 50
|
| 148 |
+
|
| 149 |
async def generate_node(state: PipelineState) -> dict:
|
| 150 |
+
writer = get_stream_writer()
|
| 151 |
query = state["query"]
|
| 152 |
complexity = state.get("query_complexity", "simple")
|
| 153 |
reranked_chunks = state.get("reranked_chunks", [])
|
| 154 |
|
| 155 |
# ββ Not-found path βββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
|
|
|
|
|
|
| 156 |
if not reranked_chunks:
|
| 157 |
+
writer({"type": "status", "label": "Could not find specific information, responding carefully..."})
|
| 158 |
history_prefix = _format_history(state.get("conversation_history") or [])
|
| 159 |
stream = llm_client.complete_with_complexity(
|
| 160 |
prompt=f"{history_prefix}Visitor question: {query}",
|
| 161 |
system=_NOT_FOUND_SYSTEM,
|
| 162 |
stream=True,
|
| 163 |
+
complexity="simple",
|
| 164 |
)
|
| 165 |
full_answer = ""
|
| 166 |
async for token in stream:
|
| 167 |
full_answer += token
|
| 168 |
+
writer({"type": "token", "text": token})
|
| 169 |
return {"answer": full_answer, "sources": [], "path": "rag"}
|
| 170 |
|
| 171 |
# ββ Pre-LLM coherence shortcut ββββββββββββββββββββββββββββββββββββββ
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
top_score = reranked_chunks[0]["metadata"].get("rerank_score", 0.0)
|
| 173 |
query_toks = _query_tokens(query)
|
| 174 |
if top_score < 0.0 and not _chunks_overlap_query(query_toks, reranked_chunks):
|
| 175 |
+
writer({"type": "status", "label": "Could not find specific information, responding carefully..."})
|
| 176 |
history_prefix = _format_history(state.get("conversation_history") or [])
|
| 177 |
stream = llm_client.complete_with_complexity(
|
| 178 |
prompt=f"{history_prefix}Visitor question: {query}",
|
|
|
|
| 183 |
full_answer = ""
|
| 184 |
async for token in stream:
|
| 185 |
full_answer += token
|
| 186 |
+
writer({"type": "token", "text": token})
|
| 187 |
return {"answer": full_answer, "sources": [], "path": "rag"}
|
| 188 |
|
| 189 |
# ββ Build numbered context block ββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 192 |
|
| 193 |
for i, chunk in enumerate(reranked_chunks, start=1):
|
| 194 |
meta = chunk["metadata"]
|
|
|
|
| 195 |
header = f"[{i}] {meta['source_title']}"
|
| 196 |
if meta.get("source_url"):
|
| 197 |
header += f" ({meta['source_url']})"
|
|
|
|
| 206 |
|
| 207 |
context_block = "\n\n".join(context_parts)
|
| 208 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
history_prefix = _format_history(state.get("conversation_history") or [])
|
| 210 |
is_criticism = state.get("is_criticism", False)
|
| 211 |
criticism_note = (
|
|
|
|
| 215 |
)
|
| 216 |
prompt = f"{criticism_note}{history_prefix}Passages:\n{context_block}\n\nVisitor question: {query}"
|
| 217 |
|
| 218 |
+
# ββ Streaming CoT-aware token emission ββββββββββββββββββββββββββββββ
|
| 219 |
+
# Groq streams tokens one chunk at a time. We intercept them to:
|
| 220 |
+
# Phase 1 β detect and buffer the <think> block, emitting thinking events.
|
| 221 |
+
# Phase 2 β emit answer tokens in real time after </think>.
|
| 222 |
+
# If no <think> tag appears in the first _THINK_LOOKAHEAD token chunks
|
| 223 |
+
# (Llama 3.1 8B on simple queries), we switch to direct emission with no wait.
|
| 224 |
stream = llm_client.complete_with_complexity(
|
| 225 |
prompt=prompt,
|
| 226 |
system=_SYSTEM_PROMPT,
|
| 227 |
stream=True,
|
| 228 |
complexity=complexity,
|
| 229 |
)
|
| 230 |
+
|
| 231 |
+
raw_answer = "" # complete unmodified response for quality gate
|
| 232 |
+
buf = "" # character buffer for tag detection
|
| 233 |
+
in_think = False # currently inside <think> block
|
| 234 |
+
think_done = False # </think> was found; switched to direct streaming
|
| 235 |
+
no_cot = False # no <think> seen in first LOOKAHEAD token chunks
|
| 236 |
+
token_chunk_count = 0 # number of token chunks received
|
| 237 |
+
think_first_emitted = False # CoT first-sentence status already sent
|
| 238 |
+
|
| 239 |
async for token in stream:
|
| 240 |
raw_answer += token
|
| 241 |
+
token_chunk_count += 1
|
| 242 |
+
|
| 243 |
+
if think_done or no_cot:
|
| 244 |
+
# Phase 2: real-time answer streaming.
|
| 245 |
+
writer({"type": "token", "text": token})
|
| 246 |
+
continue
|
| 247 |
+
|
| 248 |
+
buf += token
|
| 249 |
+
|
| 250 |
+
if not in_think:
|
| 251 |
+
if "<think>" in buf:
|
| 252 |
+
in_think = True
|
| 253 |
+
pre = buf[: buf.index("<think>")]
|
| 254 |
+
if pre.strip():
|
| 255 |
+
# Text before the think tag is part of the answer.
|
| 256 |
+
writer({"type": "token", "text": pre})
|
| 257 |
+
buf = buf[buf.index("<think>") + 7:] # 7 = len("<think>")
|
| 258 |
+
elif token_chunk_count >= _THINK_LOOKAHEAD:
|
| 259 |
+
# No CoT block in first 50 chunks β emit buffered and go direct.
|
| 260 |
+
no_cot = True
|
| 261 |
+
writer({"type": "token", "text": buf})
|
| 262 |
+
buf = ""
|
| 263 |
+
else:
|
| 264 |
+
# Phase 1: inside the <think> block; buffer until </think>.
|
| 265 |
+
if "</think>" in buf:
|
| 266 |
+
idx = buf.index("</think>")
|
| 267 |
+
think_txt = buf[:idx].strip()
|
| 268 |
+
after_think = buf[idx + 9:] # 9 = len("</think>")
|
| 269 |
+
|
| 270 |
+
if think_txt and not think_first_emitted:
|
| 271 |
+
# Surface the first sentence as a legible status label.
|
| 272 |
+
for j, ch in enumerate(think_txt):
|
| 273 |
+
if ch in ".?!\n":
|
| 274 |
+
first_sent = think_txt[: j + 1].strip()[:120]
|
| 275 |
+
writer({"type": "status", "label": first_sent})
|
| 276 |
+
think_first_emitted = True
|
| 277 |
+
break
|
| 278 |
+
|
| 279 |
+
if think_txt:
|
| 280 |
+
writer({"type": "thinking", "text": think_txt})
|
| 281 |
+
|
| 282 |
+
think_done = True
|
| 283 |
+
buf = ""
|
| 284 |
+
if after_think.strip():
|
| 285 |
+
writer({"type": "token", "text": after_think})
|
| 286 |
+
|
| 287 |
+
# Flush buffer if the stream ended mid-detection (e.g. model forgot </think>).
|
| 288 |
+
if buf:
|
| 289 |
+
writer({"type": "token", "text": buf})
|
| 290 |
+
|
| 291 |
+
# ββ Strip CoT scratchpad ββββββββββββββββββββββββββββββββββββββββββββ
|
| 292 |
full_answer = re.sub(r"<think>.*?</think>\s*", "", raw_answer, flags=re.DOTALL).strip()
|
| 293 |
|
| 294 |
# ββ Quality gate: Gemini editorial reformat ββββββββββββββββββββββββββ
|
| 295 |
+
# Fires when: (a) criticism detected β always reformat, or
|
| 296 |
+
# (b) low-trust heuristic flags the draft. Zero extra cost on good responses.
|
|
|
|
| 297 |
if gemini_client is not None and (is_criticism or is_low_trust(full_answer, reranked_chunks, complexity)):
|
| 298 |
logger.debug("Triggering Gemini reformat (criticism=%s).", is_criticism)
|
| 299 |
reformatted = await gemini_client.reformat_rag_answer(query, context_block, full_answer)
|
| 300 |
if reformatted:
|
| 301 |
full_answer = reformatted
|
| 302 |
|
| 303 |
+
# Only surface sources the LLM actually cited.
|
|
|
|
| 304 |
cited_indices = {int(m) for m in re.findall(r"\[(\d+)\]", full_answer)}
|
| 305 |
cited_sources = [sr for i, sr in enumerate(source_refs, start=1) if i in cited_indices]
|
| 306 |
|
| 307 |
return {
|
| 308 |
"answer": full_answer,
|
| 309 |
"sources": cited_sources if cited_sources else source_refs[:2],
|
|
|
|
|
|
|
| 310 |
"path": "rag",
|
| 311 |
}
|
| 312 |
|
app/pipeline/nodes/guard.py
CHANGED
|
@@ -1,45 +1,58 @@
|
|
| 1 |
from typing import Callable
|
| 2 |
|
|
|
|
|
|
|
|
|
|
| 3 |
from app.models.pipeline import PipelineState
|
| 4 |
from app.security.guard_classifier import GuardClassifier
|
| 5 |
from app.security.sanitizer import sanitize_input, redact_pii
|
| 6 |
|
|
|
|
| 7 |
def make_guard_node(classifier: GuardClassifier) -> Callable[[PipelineState], dict]:
|
| 8 |
def guard_node(state: PipelineState) -> dict:
|
|
|
|
| 9 |
original_query = state["query"]
|
| 10 |
-
|
| 11 |
-
# 1. Sanitize
|
| 12 |
sanitized = sanitize_input(original_query)
|
| 13 |
-
|
| 14 |
-
# 2. PII Redact
|
| 15 |
-
# Note: the prompt says "Return cleaned text. Used in log_eval node before writing to SQLite."
|
| 16 |
-
# If we redact it here, the rest of the pipeline gets the redacted text.
|
| 17 |
-
# This is safe and ensures PII doesn't leak into LLM prompts or vector similarity.
|
| 18 |
clean_query = redact_pii(sanitized)
|
| 19 |
-
|
| 20 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
if len(clean_query) == 0:
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
|
|
|
| 30 |
is_safe, score = classifier.is_in_scope(clean_query)
|
| 31 |
-
|
| 32 |
if not is_safe:
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
|
|
|
| 40 |
return {
|
| 41 |
"query": clean_query,
|
| 42 |
-
"guard_passed": True
|
|
|
|
| 43 |
}
|
| 44 |
-
|
| 45 |
return guard_node
|
|
|
|
|
|
| 1 |
from typing import Callable
|
| 2 |
|
| 3 |
+
from langgraph.config import get_stream_writer
|
| 4 |
+
|
| 5 |
+
from app.core.topic import extract_topic
|
| 6 |
from app.models.pipeline import PipelineState
|
| 7 |
from app.security.guard_classifier import GuardClassifier
|
| 8 |
from app.security.sanitizer import sanitize_input, redact_pii
|
| 9 |
|
| 10 |
+
|
| 11 |
def make_guard_node(classifier: GuardClassifier) -> Callable[[PipelineState], dict]:
|
| 12 |
def guard_node(state: PipelineState) -> dict:
|
| 13 |
+
writer = get_stream_writer()
|
| 14 |
original_query = state["query"]
|
| 15 |
+
|
| 16 |
+
# 1. Sanitize and PII-redact before any LLM or classifier call.
|
| 17 |
sanitized = sanitize_input(original_query)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
clean_query = redact_pii(sanitized)
|
| 19 |
+
|
| 20 |
+
# Emit the first status event now that we have a clean query to describe.
|
| 21 |
+
# Topic extraction is O(N) set lookup β adds zero measurable latency.
|
| 22 |
+
if clean_query:
|
| 23 |
+
topic = extract_topic(clean_query)
|
| 24 |
+
label = f"Checking your question about {topic}" if topic else "Checking your question"
|
| 25 |
+
else:
|
| 26 |
+
topic = ""
|
| 27 |
+
label = "Checking your question"
|
| 28 |
+
writer({"type": "status", "label": label})
|
| 29 |
+
|
| 30 |
if len(clean_query) == 0:
|
| 31 |
+
return {
|
| 32 |
+
"query": clean_query,
|
| 33 |
+
"guard_passed": False,
|
| 34 |
+
"answer": "I can only answer questions about Darshan's work, projects, and background.",
|
| 35 |
+
"path": "blocked",
|
| 36 |
+
"query_topic": topic,
|
| 37 |
+
}
|
| 38 |
+
|
| 39 |
+
# 2. Classify (scope evaluation).
|
| 40 |
is_safe, score = classifier.is_in_scope(clean_query)
|
| 41 |
+
|
| 42 |
if not is_safe:
|
| 43 |
+
return {
|
| 44 |
+
"query": clean_query,
|
| 45 |
+
"guard_passed": False,
|
| 46 |
+
"answer": "I can only answer questions about Darshan's work, projects, and background.",
|
| 47 |
+
"path": "blocked",
|
| 48 |
+
"query_topic": topic,
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
return {
|
| 52 |
"query": clean_query,
|
| 53 |
+
"guard_passed": True,
|
| 54 |
+
"query_topic": topic,
|
| 55 |
}
|
| 56 |
+
|
| 57 |
return guard_node
|
| 58 |
+
|
app/pipeline/nodes/retrieve.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
| 1 |
import asyncio
|
| 2 |
from typing import Callable
|
| 3 |
|
|
|
|
|
|
|
| 4 |
from app.models.pipeline import PipelineState, Chunk
|
| 5 |
from app.services.vector_store import VectorStore
|
| 6 |
from app.services.embedder import Embedder
|
|
@@ -90,13 +92,33 @@ def _rrf_merge(ranked_lists: list[list[Chunk]]) -> list[Chunk]:
|
|
| 90 |
return [chunks_by_fp[fp] for fp in sorted_fps]
|
| 91 |
|
| 92 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
def make_retrieve_node(
|
| 94 |
vector_store: VectorStore, embedder: Embedder, reranker: Reranker
|
| 95 |
) -> Callable[[PipelineState], dict]:
|
| 96 |
async def retrieve_node(state: PipelineState) -> dict:
|
|
|
|
|
|
|
| 97 |
attempts = state.get("retrieval_attempts", 0)
|
| 98 |
query = state["query"]
|
| 99 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
# On a CRAG retry (attempts >= 1) the query has been rewritten and
|
| 101 |
# query_embedding is explicitly set to None β always re-embed.
|
| 102 |
# On the first attempt, reuse the embedding computed by the cache node.
|
|
@@ -134,6 +156,25 @@ def make_retrieve_node(
|
|
| 134 |
all_ranked_lists = dense_results + ([sparse_results] if sparse_results else [])
|
| 135 |
fused: list[Chunk] = _rrf_merge(all_ranked_lists)
|
| 136 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
# ββ Deduplication (question-point collapse) ββββββββββββββββββββββββββββ
|
| 138 |
# Multiple points for the same chunk (main + question points from Stage 3)
|
| 139 |
# share the same doc_id::section fingerprint and collapse here.
|
|
@@ -145,6 +186,11 @@ def make_retrieve_node(
|
|
| 145 |
seen.add(fp)
|
| 146 |
unique_chunks.append(c)
|
| 147 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
reranked = await reranker.rerank(query, unique_chunks, top_k=5)
|
| 149 |
|
| 150 |
# ββ Relevance gate βββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -158,11 +204,6 @@ def make_retrieve_node(
|
|
| 158 |
}
|
| 159 |
|
| 160 |
# ββ Source diversity cap (query-aware) βββββββββββββββββββββββββββββββββ
|
| 161 |
-
# Broad queries: max 2 chunks per source document (anti-resume-monopoly).
|
| 162 |
-
# Focused queries (experience, skills, project, blog): raise the cap for
|
| 163 |
-
# the matching source type to 4, cap everything else at 1. This lets
|
| 164 |
-
# the resume fill appropriately on "what is Darshan's work experience?"
|
| 165 |
-
# without harming answer quality on broad queries.
|
| 166 |
focused_type = _focused_source_type(query)
|
| 167 |
doc_counts: dict[str, int] = {}
|
| 168 |
diverse_chunks: list[Chunk] = []
|
|
@@ -179,6 +220,25 @@ def make_retrieve_node(
|
|
| 179 |
diverse_chunks.append(chunk)
|
| 180 |
doc_counts[doc_id] = doc_counts.get(doc_id, 0) + 1
|
| 181 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
return {
|
| 183 |
"retrieved_chunks": unique_chunks,
|
| 184 |
"reranked_chunks": diverse_chunks,
|
|
|
|
| 1 |
import asyncio
|
| 2 |
from typing import Callable
|
| 3 |
|
| 4 |
+
from langgraph.config import get_stream_writer
|
| 5 |
+
|
| 6 |
from app.models.pipeline import PipelineState, Chunk
|
| 7 |
from app.services.vector_store import VectorStore
|
| 8 |
from app.services.embedder import Embedder
|
|
|
|
| 92 |
return [chunks_by_fp[fp] for fp in sorted_fps]
|
| 93 |
|
| 94 |
|
| 95 |
+
_TYPE_REMAP: dict[str, str] = {
|
| 96 |
+
"github": "readme",
|
| 97 |
+
"bio": "resume",
|
| 98 |
+
"cv": "resume",
|
| 99 |
+
"blog": "blog",
|
| 100 |
+
"project": "project",
|
| 101 |
+
}
|
| 102 |
+
|
| 103 |
+
|
| 104 |
def make_retrieve_node(
|
| 105 |
vector_store: VectorStore, embedder: Embedder, reranker: Reranker
|
| 106 |
) -> Callable[[PipelineState], dict]:
|
| 107 |
async def retrieve_node(state: PipelineState) -> dict:
|
| 108 |
+
writer = get_stream_writer()
|
| 109 |
+
|
| 110 |
attempts = state.get("retrieval_attempts", 0)
|
| 111 |
query = state["query"]
|
| 112 |
|
| 113 |
+
# Reuse the topic computed by the guard node β no recomputation needed.
|
| 114 |
+
topic = state.get("query_topic") or ""
|
| 115 |
+
searching_label = (
|
| 116 |
+
f"Searching portfolio for {topic}..."
|
| 117 |
+
if topic
|
| 118 |
+
else "Searching portfolio..."
|
| 119 |
+
)
|
| 120 |
+
writer({"type": "status", "label": searching_label})
|
| 121 |
+
|
| 122 |
# On a CRAG retry (attempts >= 1) the query has been rewritten and
|
| 123 |
# query_embedding is explicitly set to None β always re-embed.
|
| 124 |
# On the first attempt, reuse the embedding computed by the cache node.
|
|
|
|
| 156 |
all_ranked_lists = dense_results + ([sparse_results] if sparse_results else [])
|
| 157 |
fused: list[Chunk] = _rrf_merge(all_ranked_lists)
|
| 158 |
|
| 159 |
+
# ββ Reading events β one per unique source document ββββββββββββββββββββ
|
| 160 |
+
# Emitted BEFORE deduplication so the user sees sources appear in
|
| 161 |
+
# real time as Qdrant returns them, matching Perplexity's "Reading..."
|
| 162 |
+
# display. Deduplication here is by source_url so blog posts with
|
| 163 |
+
# multiple chunk hits fire only one event.
|
| 164 |
+
seen_urls: set[str] = set()
|
| 165 |
+
for chunk in fused:
|
| 166 |
+
meta = chunk["metadata"]
|
| 167 |
+
url = meta.get("source_url") or ""
|
| 168 |
+
dedup_key = url if url else meta.get("doc_id", "")
|
| 169 |
+
if dedup_key and dedup_key not in seen_urls:
|
| 170 |
+
seen_urls.add(dedup_key)
|
| 171 |
+
writer({
|
| 172 |
+
"type": "reading",
|
| 173 |
+
"title": meta.get("source_title", ""),
|
| 174 |
+
"url": url or None,
|
| 175 |
+
"source_type": _TYPE_REMAP.get(meta.get("source_type", ""), meta.get("source_type", "")),
|
| 176 |
+
})
|
| 177 |
+
|
| 178 |
# ββ Deduplication (question-point collapse) ββββββββββββββββββββββββββββ
|
| 179 |
# Multiple points for the same chunk (main + question points from Stage 3)
|
| 180 |
# share the same doc_id::section fingerprint and collapse here.
|
|
|
|
| 186 |
seen.add(fp)
|
| 187 |
unique_chunks.append(c)
|
| 188 |
|
| 189 |
+
writer({
|
| 190 |
+
"type": "status",
|
| 191 |
+
"label": f"Comparing {len(unique_chunks)} sources for relevance...",
|
| 192 |
+
})
|
| 193 |
+
|
| 194 |
reranked = await reranker.rerank(query, unique_chunks, top_k=5)
|
| 195 |
|
| 196 |
# ββ Relevance gate βββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 204 |
}
|
| 205 |
|
| 206 |
# ββ Source diversity cap (query-aware) βββββββββββββββββββββββββββββββββ
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
focused_type = _focused_source_type(query)
|
| 208 |
doc_counts: dict[str, int] = {}
|
| 209 |
diverse_chunks: list[Chunk] = []
|
|
|
|
| 220 |
diverse_chunks.append(chunk)
|
| 221 |
doc_counts[doc_id] = doc_counts.get(doc_id, 0) + 1
|
| 222 |
|
| 223 |
+
# ββ Sources event β final selected sources shown before the answer ββββββ
|
| 224 |
+
# This is the Perplexity-style source card row that appears before tokens.
|
| 225 |
+
# Emitted here so the frontend can display source cards before Groq starts.
|
| 226 |
+
sources_payload = []
|
| 227 |
+
for chunk in diverse_chunks:
|
| 228 |
+
meta = chunk["metadata"]
|
| 229 |
+
url = meta.get("source_url") or None
|
| 230 |
+
sources_payload.append({
|
| 231 |
+
"title": meta.get("source_title", ""),
|
| 232 |
+
"url": url,
|
| 233 |
+
"source_type": _TYPE_REMAP.get(meta.get("source_type", ""), meta.get("source_type", "")),
|
| 234 |
+
"section": meta.get("section", ""),
|
| 235 |
+
})
|
| 236 |
+
writer({"type": "sources", "sources": sources_payload})
|
| 237 |
+
|
| 238 |
+
# Let the user know what top source the answer will be written from.
|
| 239 |
+
top_title = diverse_chunks[0]["metadata"].get("source_title", "sources")
|
| 240 |
+
writer({"type": "status", "label": f"Writing answer from {top_title}..."})
|
| 241 |
+
|
| 242 |
return {
|
| 243 |
"retrieved_chunks": unique_chunks,
|
| 244 |
"reranked_chunks": diverse_chunks,
|
pytest.ini
CHANGED
|
@@ -4,3 +4,5 @@ python_files = test_*.py
|
|
| 4 |
python_classes = Test*
|
| 5 |
python_functions = test_*
|
| 6 |
addopts = -x --tb=short -q
|
|
|
|
|
|
|
|
|
| 4 |
python_classes = Test*
|
| 5 |
python_functions = test_*
|
| 6 |
addopts = -x --tb=short -q
|
| 7 |
+
filterwarnings =
|
| 8 |
+
ignore::DeprecationWarning:slowapi.*
|
tests/conftest.py
CHANGED
|
@@ -57,10 +57,20 @@ def app_client():
|
|
| 57 |
|
| 58 |
mock_pipeline = MagicMock()
|
| 59 |
|
| 60 |
-
async def fake_astream(state):
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
|
| 65 |
mock_pipeline.astream = fake_astream
|
| 66 |
|
|
|
|
| 57 |
|
| 58 |
mock_pipeline = MagicMock()
|
| 59 |
|
| 60 |
+
async def fake_astream(state, stream_mode=None):
|
| 61 |
+
# Support the new stream_mode=["custom", "updates"] tuple format used by chat.py.
|
| 62 |
+
if isinstance(stream_mode, list):
|
| 63 |
+
yield ("custom", {"type": "status", "label": "Checking your question"})
|
| 64 |
+
yield ("updates", {"guard": {"guard_passed": True}})
|
| 65 |
+
yield ("updates", {"cache": {"cached": False}})
|
| 66 |
+
yield ("custom", {"type": "status", "label": "Thinking about your question directly..."})
|
| 67 |
+
yield ("custom", {"type": "token", "text": "I built TextOps."})
|
| 68 |
+
yield ("updates", {"generate": {"answer": "I built TextOps.", "sources": []}})
|
| 69 |
+
else:
|
| 70 |
+
# Fallback for any code that still calls astream without stream_mode.
|
| 71 |
+
yield {"guard": {"guard_passed": True}}
|
| 72 |
+
yield {"cache": {"cached": False}}
|
| 73 |
+
yield {"generate": {"answer": "I built TextOps.", "sources": []}}
|
| 74 |
|
| 75 |
mock_pipeline.astream = fake_astream
|
| 76 |
|