GitHub Actions committed on
Commit
65543f1
Β·
1 Parent(s): dee57c6

Deploy 493901d

Browse files
app/api/chat.py CHANGED
@@ -1,4 +1,5 @@
1
  import json
 
2
  import time
3
  from fastapi import APIRouter, Request, Depends
4
  from fastapi.responses import StreamingResponse
@@ -10,6 +11,22 @@ from app.security.jwt_auth import verify_jwt
10
 
11
  router = APIRouter()
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  @router.post("")
15
  @chat_rate_limit()
@@ -23,6 +40,18 @@ async def chat_endpoint(
23
 
24
  # All singletons pre-built in lifespan β€” zero allocation in hot path.
25
  pipeline = request.app.state.pipeline
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
  initial_state: PipelineState = { # type: ignore[assignment]
28
  "query": request_data.message,
@@ -37,6 +66,8 @@ async def chat_endpoint(
37
  "cache_key": None,
38
  "guard_passed": False,
39
  "thinking": False,
 
 
40
  "latency_ms": 0,
41
  "error": None,
42
  "interaction_id": None,
 
1
  import json
2
+ import re
3
  import time
4
  from fastapi import APIRouter, Request, Depends
5
  from fastapi.responses import StreamingResponse
 
11
 
12
  router = APIRouter()
13
 
14
+ # Phrases a visitor uses when telling the bot it gave a wrong answer.
15
+ # Matched on the lowercased raw message before any LLM call β€” O(1), zero cost.
16
+ _CRITICISM_SIGNALS: frozenset[str] = frozenset({
17
+ "that's wrong", "thats wrong", "you're wrong", "youre wrong",
18
+ "not right", "wrong answer", "you got it wrong", "that is wrong",
19
+ "that's incorrect", "you're incorrect", "thats incorrect", "youre incorrect",
20
+ "fix that", "fix your answer", "actually no", "no that's", "no thats",
21
+ "that was wrong", "your answer was wrong", "wrong information",
22
+ "incorrect information", "that's not right", "thats not right",
23
+ })
24
+
25
+
26
+ def _is_criticism(message: str) -> bool:
27
+ lowered = message.lower()
28
+ return any(sig in lowered for sig in _CRITICISM_SIGNALS)
29
+
30
 
31
  @router.post("")
32
  @chat_rate_limit()
 
40
 
41
  # All singletons pre-built in lifespan β€” zero allocation in hot path.
42
  pipeline = request.app.state.pipeline
43
+ conv_store = request.app.state.conversation_store
44
+ session_id = request_data.session_id
45
+
46
+ # Fetch prior turns and detect criticism BEFORE the pipeline runs.
47
+ # Both are synchronous SQLite operations (<3ms — the criticism path also performs
48
+ # one tiny write), so they don't block the event loop meaningfully; we keep them outside sse_generator to avoid any closure issues.
49
+ conversation_history = conv_store.get_recent(session_id)
50
+ criticism = _is_criticism(request_data.message)
51
+ if criticism and conversation_history:
52
+ # Auto-record negative feedback on the previous turn so the self-improvement
53
+ # loop picks it up during the next reranker fine-tune cycle.
54
+ conv_store.mark_last_negative(session_id)
55
 
56
  initial_state: PipelineState = { # type: ignore[assignment]
57
  "query": request_data.message,
 
66
  "cache_key": None,
67
  "guard_passed": False,
68
  "thinking": False,
69
+ "conversation_history": conversation_history,
70
+ "is_criticism": criticism,
71
  "latency_ms": 0,
72
  "error": None,
73
  "interaction_id": None,
app/core/config.py CHANGED
@@ -50,7 +50,7 @@ class Settings(BaseSettings):
50
  # script (refresh_gemini_context.py) and MUST NOT appear in any chat logs.
51
  GEMINI_API_KEY: Optional[str] = None
52
  GEMINI_PROCESSING_API_KEY: Optional[str] = None
53
- GEMINI_MODEL: str = "gemini-2.0-flash"
54
  GEMINI_CONTEXT_PATH: str = "backend/app/services/gemini_context.toon"
55
 
56
  # HuggingFace Space model servers.
 
50
  # script (refresh_gemini_context.py) and MUST NOT appear in any chat logs.
51
  GEMINI_API_KEY: Optional[str] = None
52
  GEMINI_PROCESSING_API_KEY: Optional[str] = None
53
+ GEMINI_MODEL: str = "gemini-2.5-flash-lite"
54
  GEMINI_CONTEXT_PATH: str = "backend/app/services/gemini_context.toon"
55
 
56
  # HuggingFace Space model servers.
app/main.py CHANGED
@@ -19,6 +19,7 @@ from app.services.embedder import Embedder
19
  from app.services.gemini_client import GeminiClient
20
  from app.services.reranker import Reranker
21
  from app.services.semantic_cache import SemanticCache
 
22
  from qdrant_client import QdrantClient
23
 
24
  logger = get_logger(__name__)
@@ -35,6 +36,7 @@ async def lifespan(app: FastAPI):
35
  ttl_seconds=settings.SEMANTIC_CACHE_TTL_SECONDS,
36
  similarity_threshold=settings.SEMANTIC_CACHE_SIMILARITY_THRESHOLD,
37
  )
 
38
 
39
  # DagsHub/MLflow experiment tracking β€” optional, only active when token is set.
40
  # In prod with DAGSHUB_TOKEN set, experiments are tracked at dagshub.com.
 
19
  from app.services.gemini_client import GeminiClient
20
  from app.services.reranker import Reranker
21
  from app.services.semantic_cache import SemanticCache
22
+ from app.services.conversation_store import ConversationStore
23
  from qdrant_client import QdrantClient
24
 
25
  logger = get_logger(__name__)
 
36
  ttl_seconds=settings.SEMANTIC_CACHE_TTL_SECONDS,
37
  similarity_threshold=settings.SEMANTIC_CACHE_SIMILARITY_THRESHOLD,
38
  )
39
+ app.state.conversation_store = ConversationStore(settings.DB_PATH)
40
 
41
  # DagsHub/MLflow experiment tracking β€” optional, only active when token is set.
42
  # In prod with DAGSHUB_TOKEN set, experiments are tracked at dagshub.com.
app/models/pipeline.py CHANGED
@@ -33,6 +33,13 @@ class PipelineState(TypedDict):
33
  cache_key: Optional[str]
34
  guard_passed: bool
35
  thinking: bool # True while Gemini has signalled RAG is needed
 
 
 
 
 
 
 
36
  latency_ms: int
37
  error: Optional[str]
38
  interaction_id: Optional[int]
 
33
  cache_key: Optional[str]
34
  guard_passed: bool
35
  thinking: bool # True while Gemini has signalled RAG is needed
36
+ # Last N Q/A pairs for this session β€” injected into prompts for follow-up context.
37
+ # List of {"q": str, "a": str} dicts, oldest first, answers truncated to 120 chars.
38
+ conversation_history: list
39
+ # True when the current query explicitly criticises the previous answer.
40
+ # Triggers automatic negative feedback on the prior interaction and forces
41
+ # Gemini editorial reformat regardless of the low-trust heuristic score.
42
+ is_criticism: bool
43
  latency_ms: int
44
  error: Optional[str]
45
  interaction_id: Optional[int]
app/pipeline/nodes/gemini_fast.py CHANGED
@@ -61,7 +61,10 @@ def make_gemini_fast_node(gemini_client: GeminiClient) -> Any:
61
  "thinking": False,
62
  }
63
 
64
- answer, tool_query = await gemini_client.fast_answer(query)
 
 
 
65
 
66
  if answer is not None:
67
  # Gemini answered from context β€” no RAG needed.
 
61
  "thinking": False,
62
  }
63
 
64
+ answer, tool_query = await gemini_client.fast_answer(
65
+ query,
66
+ history=state.get("conversation_history") or [],
67
+ )
68
 
69
  if answer is not None:
70
  # Gemini answered from context β€” no RAG needed.
app/pipeline/nodes/generate.py CHANGED
@@ -8,12 +8,12 @@ from app.services.llm_client import LLMClient
8
 
9
  logger = logging.getLogger(__name__)
10
 
11
- # Covers known Darshan content areas so the LLM can give a specific redirect
12
- # when the knowledge base has nothing relevant instead of a vague hedge.
 
 
13
  _TOPIC_SUGGESTIONS = (
14
- "projects (assembly donut, AI/ML work, text processing tools, web apps, ESP32 projects), "
15
- "blog posts (he has written on embedded systems, AI, software engineering topics), "
16
- "skills (Python, C/C++, Java, ML frameworks, embedded systems), "
17
  "education, work experience, or general background"
18
  )
19
 
@@ -69,18 +69,19 @@ CRITICAL SAFETY RULES β€” override everything above:
69
  """.format(topics=_TOPIC_SUGGESTIONS)
70
 
71
  # When retrieve found nothing relevant (empty reranked_chunks), give a direct
72
- # honest answer rather than a vague "I don't have information" hedge.
 
73
  _NOT_FOUND_SYSTEM = """\
74
  You are the assistant on Darshan Chheda's portfolio website.
75
- The knowledge base was searched but returned no relevant results for this question.
76
 
77
- Respond in 1–2 sentences:
78
- 1. Confirm this specific topic isn't in the content you can access.
79
- 2. Optionally suggest a related area Darshan HAS covered: {topics}.
80
 
81
- Rules:
82
- - No apologies. No "Unfortunately". No long disclaimers.
83
- - Do not invent details. Be direct and move on.
84
  """.format(topics=_TOPIC_SUGGESTIONS)
85
 
86
  # Tokenise query into a set of normalised words for overlap detection.
@@ -112,6 +113,22 @@ def _chunks_overlap_query(tokens: frozenset[str], chunks: list) -> bool:
112
  return any(tok in combined for tok in tokens)
113
 
114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  # Phrases that indicate the model hedged despite having source passages.
116
  # Gemini reformat is triggered when any of these appear in the Groq draft.
117
  _HEDGE_PHRASES: tuple[str, ...] = (
@@ -161,8 +178,9 @@ def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[P
161
  # threshold). Use a short, model-generated honest refusal so guard
162
  # rejections and not-found both route here with quality responses.
163
  if not reranked_chunks:
 
164
  stream = llm_client.complete_with_complexity(
165
- prompt=f"Visitor question: {query}",
166
  system=_NOT_FOUND_SYSTEM,
167
  stream=True,
168
  complexity="simple", # always lightweight β€” no RAG needed
@@ -181,8 +199,9 @@ def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[P
181
  top_score = reranked_chunks[0]["metadata"].get("rerank_score", 0.0)
182
  query_toks = _query_tokens(query)
183
  if top_score < 0.0 and not _chunks_overlap_query(query_toks, reranked_chunks):
 
184
  stream = llm_client.complete_with_complexity(
185
- prompt=f"Visitor question: {query}",
186
  system=_NOT_FOUND_SYSTEM,
187
  stream=True,
188
  complexity="simple",
@@ -212,7 +231,19 @@ def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[P
212
  )
213
 
214
  context_block = "\n\n".join(context_parts)
215
- prompt = f"Passages:\n{context_block}\n\nVisitor question: {query}"
 
 
 
 
 
 
 
 
 
 
 
 
216
 
217
  # ── Generate with CoT ────────────────────────────────────────────────
218
  # The system prompt instructs the model to write reasoning inside
@@ -233,12 +264,11 @@ def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[P
233
  full_answer = re.sub(r"<think>.*?</think>\s*", "", raw_answer, flags=re.DOTALL).strip()
234
 
235
  # ── Quality gate: Gemini editorial reformat ──────────────────────────
236
- # If the Groq draft is low-trust (hedging survived, citations missing,
237
- # or suspiciously thin for a complex query), ask Gemini Flash to rewrite
238
- # it. This only fires for genuinely bad drafts; normal responses are
239
- # untouched and add zero latency.
240
- if gemini_client is not None and _is_low_trust(full_answer, reranked_chunks, complexity):
241
- logger.debug("Low-trust Groq draft detected β€” requesting Gemini reformat.")
242
  reformatted = await gemini_client.reformat_rag_answer(query, context_block, full_answer)
243
  if reformatted:
244
  full_answer = reformatted
 
8
 
9
  logger = logging.getLogger(__name__)
10
 
11
+ # Generic category labels used only to redirect visitors to valid content areas.
12
+ # IMPORTANT: never list specific project/tech names here. If the model sees
13
+ # "Assembly Donut" or "Java" in its system prompt it will present them as
14
+ # retrieved facts even when Qdrant returned zero chunks (hallucination source).
15
  _TOPIC_SUGGESTIONS = (
16
+ "his projects, blog posts, technical skills, "
 
 
17
  "education, work experience, or general background"
18
  )
19
 
 
69
  """.format(topics=_TOPIC_SUGGESTIONS)
70
 
71
  # When retrieve found nothing relevant (empty reranked_chunks), give a direct
72
+ # honest response. NO specific names or details β€” the model has no retrieved
73
+ # context here, so anything specific it says would be fabricated.
74
  _NOT_FOUND_SYSTEM = """\
75
  You are the assistant on Darshan Chheda's portfolio website.
76
+ The knowledge base search returned no relevant results for this question.
77
 
78
+ Respond in exactly 1-2 sentences:
79
+ - State plainly that you don't have that specific information available right now.
80
+ - Suggest the visitor ask about {topics}, where content is available.
81
 
82
+ CRITICAL: Do NOT name any specific project, technology, company, blog post, or skill.
83
+ You have NO retrieved facts β€” any specific name you produce is fabricated.
84
+ Be brief, honest, and generic. No apologies, no padding.
85
  """.format(topics=_TOPIC_SUGGESTIONS)
86
 
87
  # Tokenise query into a set of normalised words for overlap detection.
 
113
  return any(tok in combined for tok in tokens)
114
 
115
 
116
+ def _format_history(history: list[dict]) -> str:
117
+ """
118
+ Render prior turns as a compact prefix block.
119
+ Each turn is one line: "[Tn] Q: ... | A: ..."
120
+ Returns empty string when there is no history (first message in session).
121
+ Token cost: ~20-35 tokens per turn; max 3 turns β†’ <110 tokens overhead.
122
+ """
123
+ if not history:
124
+ return ""
125
+ lines = [
126
+ f"[T{i + 1}] Q: {t['q']} | A: {t['a']}"
127
+ for i, t in enumerate(history)
128
+ ]
129
+ return "Prior conversation (oldest first):\n" + "\n".join(lines) + "\n\n"
130
+
131
+
132
  # Phrases that indicate the model hedged despite having source passages.
133
  # Gemini reformat is triggered when any of these appear in the Groq draft.
134
  _HEDGE_PHRASES: tuple[str, ...] = (
 
178
  # threshold). Use a short, model-generated honest refusal so guard
179
  # rejections and not-found both route here with quality responses.
180
  if not reranked_chunks:
181
+ history_prefix = _format_history(state.get("conversation_history") or [])
182
  stream = llm_client.complete_with_complexity(
183
+ prompt=f"{history_prefix}Visitor question: {query}",
184
  system=_NOT_FOUND_SYSTEM,
185
  stream=True,
186
  complexity="simple", # always lightweight β€” no RAG needed
 
199
  top_score = reranked_chunks[0]["metadata"].get("rerank_score", 0.0)
200
  query_toks = _query_tokens(query)
201
  if top_score < 0.0 and not _chunks_overlap_query(query_toks, reranked_chunks):
202
+ history_prefix = _format_history(state.get("conversation_history") or [])
203
  stream = llm_client.complete_with_complexity(
204
+ prompt=f"{history_prefix}Visitor question: {query}",
205
  system=_NOT_FOUND_SYSTEM,
206
  stream=True,
207
  complexity="simple",
 
231
  )
232
 
233
  context_block = "\n\n".join(context_parts)
234
+
235
+ # ── Compact conversation history prefix ─────────────────────────────
236
+ # Injected before passages so the model can resolve follow-up references
237
+ # ("tell me more", "which one used Java?", "that was wrong") without
238
+ # needing to re-retrieve resolved information.
239
+ history_prefix = _format_history(state.get("conversation_history") or [])
240
+ is_criticism = state.get("is_criticism", False)
241
+ criticism_note = (
242
+ "NOTE: The visitor says the previous answer was wrong. "
243
+ "Re-examine the passages carefully and correct any errors.\n\n"
244
+ if is_criticism else ""
245
+ )
246
+ prompt = f"{criticism_note}{history_prefix}Passages:\n{context_block}\n\nVisitor question: {query}"
247
 
248
  # ── Generate with CoT ────────────────────────────────────────────────
249
  # The system prompt instructs the model to write reasoning inside
 
264
  full_answer = re.sub(r"<think>.*?</think>\s*", "", raw_answer, flags=re.DOTALL).strip()
265
 
266
  # ── Quality gate: Gemini editorial reformat ──────────────────────────
267
+ # Fires when: (a) criticism was detected β€” always reformat to be safe, or
268
+ # (b) low-trust heuristic flags the draft (hedging / no citations / too short).
269
+ # Zero extra cost on good responses; ~200-400ms only when genuinely needed.
270
+ if gemini_client is not None and (is_criticism or _is_low_trust(full_answer, reranked_chunks, complexity)):
271
+ logger.debug("Triggering Gemini reformat (criticism=%s).", is_criticism)
 
272
  reformatted = await gemini_client.reformat_rag_answer(query, context_block, full_answer)
273
  if reformatted:
274
  full_answer = reformatted
app/pipeline/nodes/log_eval.py CHANGED
@@ -43,6 +43,7 @@ def make_log_eval_node(db_path: str) -> Callable[[PipelineState], dict]:
43
  CREATE TABLE IF NOT EXISTS interactions (
44
  id INTEGER PRIMARY KEY AUTOINCREMENT,
45
  timestamp TEXT,
 
46
  query TEXT,
47
  answer TEXT,
48
  chunks_used TEXT,
@@ -58,6 +59,7 @@ def make_log_eval_node(db_path: str) -> Callable[[PipelineState], dict]:
58
  for col, definition in [
59
  ("reranked_chunks_json", "TEXT DEFAULT '[]'"),
60
  ("feedback", "INTEGER DEFAULT 0"),
 
61
  ]:
62
  try:
63
  conn.execute(f"ALTER TABLE interactions ADD COLUMN {col} {definition}")
@@ -67,11 +69,12 @@ def make_log_eval_node(db_path: str) -> Callable[[PipelineState], dict]:
67
  cursor = conn.execute(
68
  """
69
  INSERT INTO interactions
70
- (timestamp, query, answer, chunks_used, rerank_scores, reranked_chunks_json, latency_ms, cached)
71
- VALUES (?, ?, ?, ?, ?, ?, ?, ?)
72
  """,
73
  (
74
  datetime.utcnow().isoformat() + "Z",
 
75
  state.get("query", ""),
76
  state.get("answer", ""),
77
  chunks_used,
 
43
  CREATE TABLE IF NOT EXISTS interactions (
44
  id INTEGER PRIMARY KEY AUTOINCREMENT,
45
  timestamp TEXT,
46
+ session_id TEXT,
47
  query TEXT,
48
  answer TEXT,
49
  chunks_used TEXT,
 
59
  for col, definition in [
60
  ("reranked_chunks_json", "TEXT DEFAULT '[]'"),
61
  ("feedback", "INTEGER DEFAULT 0"),
62
+ ("session_id", "TEXT DEFAULT ''"),
63
  ]:
64
  try:
65
  conn.execute(f"ALTER TABLE interactions ADD COLUMN {col} {definition}")
 
69
  cursor = conn.execute(
70
  """
71
  INSERT INTO interactions
72
+ (timestamp, session_id, query, answer, chunks_used, rerank_scores, reranked_chunks_json, latency_ms, cached)
73
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
74
  """,
75
  (
76
  datetime.utcnow().isoformat() + "Z",
77
+ state.get("session_id", ""),
78
  state.get("query", ""),
79
  state.get("answer", ""),
80
  chunks_used,
app/services/conversation_store.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ backend/app/services/conversation_store.py
3
+
4
+ SQLite-backed per-session conversation history.
5
+
6
+ Reads the last N completed turns for a session from the existing `interactions`
7
+ table so the LLM has conversational context without a separate store.
8
+ Answers are truncated to 120 chars before injection β€” enough context for
9
+ referential follow-ups ("tell me more", "what else?", "that's wrong") without
10
+ wasting significant token budget on verbatim prior answers.
11
+
12
+ All reads/writes are synchronous sqlite3 (<3ms on SSD) β€” acceptable because:
13
+ 1. The call happens once at request start, outside the model call path.
14
+ 2. SQLite WAL mode allows concurrent readers and one writer without blocking.
15
+ """
16
+ from __future__ import annotations
17
+
18
+ import logging
19
+ import sqlite3
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+ # Visible answer length per turn injected into context.
24
+ # 120 chars β‰ˆ 25 tokens β€” plenty to resolve pronouns and follow-up references.
25
+ _ANSWER_PREVIEW_LEN = 120
26
+
27
+ # Default number of prior turns to surface. Three covers the typical "yes,
28
+ # but what about X?", "and Y?", "ok fix the previous answer" pattern.
29
+ _DEFAULT_MAX_TURNS = 3
30
+
31
+
32
+ class ConversationStore:
33
+ """
34
+ Thin read/write layer over the `interactions` SQLite table for session history.
35
+ One instance is created at startup and shared across all requests via app.state.
36
+ """
37
+
38
+ def __init__(self, db_path: str) -> None:
39
+ self._db_path = db_path
40
+
41
+ def get_recent(self, session_id: str, max_turns: int = _DEFAULT_MAX_TURNS) -> list[dict]:
42
+ """
43
+ Return the last `max_turns` completed Q/A pairs for `session_id`,
44
+ oldest first (so LLMs read them in chronological order).
45
+
46
+ Returns an empty list if there is no history or the table doesn't exist yet.
47
+ Each entry: {"q": str, "a": str} β€” `a` is truncated to _ANSWER_PREVIEW_LEN.
48
+ """
49
+ try:
50
+ with sqlite3.connect(self._db_path) as conn:
51
+ rows = conn.execute(
52
+ """
53
+ SELECT query, answer FROM interactions
54
+ WHERE session_id = ? AND answer != ''
55
+ ORDER BY id DESC
56
+ LIMIT ?
57
+ """,
58
+ (session_id, max_turns),
59
+ ).fetchall()
60
+ except sqlite3.OperationalError:
61
+ # Table doesn't exist yet (first ever request) β€” not an error.
62
+ return []
63
+ except Exception as exc:
64
+ logger.warning("ConversationStore.get_recent failed: %s", exc)
65
+ return []
66
+
67
+ # Reverse so oldest is first (chronological order for the LLM).
68
+ turns = []
69
+ for query, answer in reversed(rows):
70
+ a_preview = answer[:_ANSWER_PREVIEW_LEN]
71
+ if len(answer) > _ANSWER_PREVIEW_LEN:
72
+ a_preview += "…"
73
+ turns.append({"q": query, "a": a_preview})
74
+ return turns
75
+
76
+ def mark_last_negative(self, session_id: str) -> None:
77
+ """
78
+ Set feedback=-1 on the most recent interaction for `session_id`.
79
+ Called when the current user message clearly criticises the previous answer.
80
+ This feeds the self-improvement loop in data_prep.py / purge_bad_chunks.py.
81
+ """
82
+ try:
83
+ with sqlite3.connect(self._db_path) as conn:
84
+ conn.execute(
85
+ """
86
+ UPDATE interactions SET feedback = -1
87
+ WHERE id = (
88
+ SELECT id FROM interactions
89
+ WHERE session_id = ?
90
+ ORDER BY id DESC
91
+ LIMIT 1
92
+ )
93
+ """,
94
+ (session_id,),
95
+ )
96
+ except Exception as exc:
97
+ logger.warning("ConversationStore.mark_last_negative failed: %s", exc)
app/services/gemini_client.py CHANGED
@@ -159,22 +159,35 @@ class GeminiClient:
159
  self._cache.popitem(last=False) # FIFO: remove oldest
160
  self._cache[key] = (answer, tool_query, time.monotonic())
161
 
162
- async def fast_answer(self, query: str) -> tuple[Optional[str], Optional[str]]:
163
  """
164
  Ask Gemini to answer or signal it needs the full knowledge base.
165
 
166
  Returns one of:
167
  (answer: str, None) β€” Gemini answered from context; stream to user, no citations.
168
  (None, tool_query: str) β€” Gemini called search_knowledge_base(); run RAG pipeline.
 
 
 
 
169
  """
170
  if not self._client:
171
  return None, query
172
 
 
173
  cache_key = _normalise(query)
174
- cached = self._cache_get(cache_key)
175
- if cached is not None:
176
- logger.debug("Gemini cache hit for key=%r", cache_key[:40])
177
- return cached
 
 
 
 
 
 
 
 
178
 
179
  from google.genai import types # noqa: PLC0415
180
 
@@ -226,7 +239,7 @@ class GeminiClient:
226
  try:
227
  response = await self._client.aio.models.generate_content( # type: ignore[attr-defined]
228
  model=self._model,
229
- contents=query,
230
  config=types.GenerateContentConfig(
231
  system_instruction=system_prompt,
232
  tools=[search_tool],
@@ -240,7 +253,8 @@ class GeminiClient:
240
  if hasattr(part, "function_call") and part.function_call:
241
  tool_query = (part.function_call.args or {}).get("query", query)
242
  result = None, str(tool_query)
243
- self._cache_set(cache_key, *result)
 
244
  logger.debug("Gemini called search_knowledge_base(query=%r)", tool_query)
245
  return result
246
  if hasattr(part, "text") and part.text:
@@ -248,7 +262,8 @@ class GeminiClient:
248
 
249
  if answer_parts:
250
  answer = "".join(answer_parts).strip()
251
- self._cache_set(cache_key, answer, None)
 
252
  return answer, None
253
 
254
  # Empty response β€” fall back to RAG gracefully.
 
159
  self._cache.popitem(last=False) # FIFO: remove oldest
160
  self._cache[key] = (answer, tool_query, time.monotonic())
161
 
162
+ async def fast_answer(self, query: str, history: list[dict] | None = None) -> tuple[Optional[str], Optional[str]]:
163
  """
164
  Ask Gemini to answer or signal it needs the full knowledge base.
165
 
166
  Returns one of:
167
  (answer: str, None) β€” Gemini answered from context; stream to user, no citations.
168
  (None, tool_query: str) β€” Gemini called search_knowledge_base(); run RAG pipeline.
169
+
170
+ When `history` is provided (non-empty), the cache is bypassed entirely because
171
+ the same question in an active conversation may need a different answer based on
172
+ what was established in earlier turns. Cache only applies to context-free queries.
173
  """
174
  if not self._client:
175
  return None, query
176
 
177
+ use_cache = not history # skip cache when conversation context is present
178
  cache_key = _normalise(query)
179
+ if use_cache:
180
+ cached = self._cache_get(cache_key)
181
+ if cached is not None:
182
+ logger.debug("Gemini cache hit for key=%r", cache_key[:40])
183
+ return cached
184
+
185
+ # Build user message β€” prepend prior turns so Gemini has referential context.
186
+ if history:
187
+ prior = "\n".join(f"Q: {t['q']}\nA: {t['a']}" for t in history)
188
+ user_message = f"[Prior conversation]\n{prior}\n\n[Current question]\n{query}"
189
+ else:
190
+ user_message = query
191
 
192
  from google.genai import types # noqa: PLC0415
193
 
 
239
  try:
240
  response = await self._client.aio.models.generate_content( # type: ignore[attr-defined]
241
  model=self._model,
242
+ contents=user_message,
243
  config=types.GenerateContentConfig(
244
  system_instruction=system_prompt,
245
  tools=[search_tool],
 
253
  if hasattr(part, "function_call") and part.function_call:
254
  tool_query = (part.function_call.args or {}).get("query", query)
255
  result = None, str(tool_query)
256
+ if use_cache:
257
+ self._cache_set(cache_key, *result)
258
  logger.debug("Gemini called search_knowledge_base(query=%r)", tool_query)
259
  return result
260
  if hasattr(part, "text") and part.text:
 
262
 
263
  if answer_parts:
264
  answer = "".join(answer_parts).strip()
265
+ if use_cache:
266
+ self._cache_set(cache_key, answer, None)
267
  return answer, None
268
 
269
  # Empty response β€” fall back to RAG gracefully.