GitHub Actions committed on
Commit
65543f1
Β·
1 Parent(s): dee57c6

Deploy 493901d

Browse files
app/api/chat.py CHANGED
@@ -1,4 +1,5 @@
1
  import json
 
2
  import time
3
  from fastapi import APIRouter, Request, Depends
4
  from fastapi.responses import StreamingResponse
@@ -10,6 +11,22 @@ from app.security.jwt_auth import verify_jwt
10
 
11
  router = APIRouter()
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  @router.post("")
15
  @chat_rate_limit()
@@ -23,6 +40,18 @@ async def chat_endpoint(
23
 
24
  # All singletons pre-built in lifespan β€” zero allocation in hot path.
25
  pipeline = request.app.state.pipeline
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
  initial_state: PipelineState = { # type: ignore[assignment]
28
  "query": request_data.message,
@@ -37,6 +66,8 @@ async def chat_endpoint(
37
  "cache_key": None,
38
  "guard_passed": False,
39
  "thinking": False,
 
 
40
  "latency_ms": 0,
41
  "error": None,
42
  "interaction_id": None,
 
1
  import json
2
+ import re
3
  import time
4
  from fastapi import APIRouter, Request, Depends
5
  from fastapi.responses import StreamingResponse
 
11
 
12
  router = APIRouter()
13
 
14
+ # Phrases a visitor uses when telling the bot it gave a wrong answer.
15
+ # Matched on the lowercased raw message before any LLM call β€” O(1), zero cost.
16
+ _CRITICISM_SIGNALS: frozenset[str] = frozenset({
17
+ "that's wrong", "thats wrong", "you're wrong", "youre wrong",
18
+ "not right", "wrong answer", "you got it wrong", "that is wrong",
19
+ "that's incorrect", "you're incorrect", "thats incorrect", "youre incorrect",
20
+ "fix that", "fix your answer", "actually no", "no that's", "no thats",
21
+ "that was wrong", "your answer was wrong", "wrong information",
22
+ "incorrect information", "that's not right", "thats not right",
23
+ })
24
+
25
+
26
+ def _is_criticism(message: str) -> bool:
27
+ lowered = message.lower()
28
+ return any(sig in lowered for sig in _CRITICISM_SIGNALS)
29
+
30
 
31
  @router.post("")
32
  @chat_rate_limit()
 
40
 
41
  # All singletons pre-built in lifespan β€” zero allocation in hot path.
42
  pipeline = request.app.state.pipeline
43
+ conv_store = request.app.state.conversation_store
44
+ session_id = request_data.session_id
45
+
46
+ # Fetch prior turns and detect criticism BEFORE the pipeline runs.
47
+ # Both are synchronous SQLite operations (<3ms — the criticism path also performs
48
+ # one tiny write), so they don't block the event loop meaningfully; we keep them outside sse_generator to avoid any closure issues.
49
+ conversation_history = conv_store.get_recent(session_id)
50
+ criticism = _is_criticism(request_data.message)
51
+ if criticism and conversation_history:
52
+ # Auto-record negative feedback on the previous turn so the self-improvement
53
+ # loop picks it up during the next reranker fine-tune cycle.
54
+ conv_store.mark_last_negative(session_id)
55
 
56
  initial_state: PipelineState = { # type: ignore[assignment]
57
  "query": request_data.message,
 
66
  "cache_key": None,
67
  "guard_passed": False,
68
  "thinking": False,
69
+ "conversation_history": conversation_history,
70
+ "is_criticism": criticism,
71
  "latency_ms": 0,
72
  "error": None,
73
  "interaction_id": None,
app/core/config.py CHANGED
@@ -50,7 +50,7 @@ class Settings(BaseSettings):
50
  # script (refresh_gemini_context.py) and MUST NOT appear in any chat logs.
51
  GEMINI_API_KEY: Optional[str] = None
52
  GEMINI_PROCESSING_API_KEY: Optional[str] = None
53
- GEMINI_MODEL: str = "gemini-2.0-flash"
54
  GEMINI_CONTEXT_PATH: str = "backend/app/services/gemini_context.toon"
55
 
56
  # HuggingFace Space model servers.
 
50
  # script (refresh_gemini_context.py) and MUST NOT appear in any chat logs.
51
  GEMINI_API_KEY: Optional[str] = None
52
  GEMINI_PROCESSING_API_KEY: Optional[str] = None
53
+ GEMINI_MODEL: str = "gemini-2.5-flash-lite"
54
  GEMINI_CONTEXT_PATH: str = "backend/app/services/gemini_context.toon"
55
 
56
  # HuggingFace Space model servers.
app/main.py CHANGED
@@ -19,6 +19,7 @@ from app.services.embedder import Embedder
19
  from app.services.gemini_client import GeminiClient
20
  from app.services.reranker import Reranker
21
  from app.services.semantic_cache import SemanticCache
 
22
  from qdrant_client import QdrantClient
23
 
24
  logger = get_logger(__name__)
@@ -35,6 +36,7 @@ async def lifespan(app: FastAPI):
35
  ttl_seconds=settings.SEMANTIC_CACHE_TTL_SECONDS,
36
  similarity_threshold=settings.SEMANTIC_CACHE_SIMILARITY_THRESHOLD,
37
  )
 
38
 
39
  # DagsHub/MLflow experiment tracking β€” optional, only active when token is set.
40
  # In prod with DAGSHUB_TOKEN set, experiments are tracked at dagshub.com.
 
19
  from app.services.gemini_client import GeminiClient
20
  from app.services.reranker import Reranker
21
  from app.services.semantic_cache import SemanticCache
22
+ from app.services.conversation_store import ConversationStore
23
  from qdrant_client import QdrantClient
24
 
25
  logger = get_logger(__name__)
 
36
  ttl_seconds=settings.SEMANTIC_CACHE_TTL_SECONDS,
37
  similarity_threshold=settings.SEMANTIC_CACHE_SIMILARITY_THRESHOLD,
38
  )
39
+ app.state.conversation_store = ConversationStore(settings.DB_PATH)
40
 
41
  # DagsHub/MLflow experiment tracking β€” optional, only active when token is set.
42
  # In prod with DAGSHUB_TOKEN set, experiments are tracked at dagshub.com.
app/models/pipeline.py CHANGED
@@ -33,6 +33,13 @@ class PipelineState(TypedDict):
33
  cache_key: Optional[str]
34
  guard_passed: bool
35
  thinking: bool # True while Gemini has signalled RAG is needed
 
 
 
 
 
 
 
36
  latency_ms: int
37
  error: Optional[str]
38
  interaction_id: Optional[int]
 
33
  cache_key: Optional[str]
34
  guard_passed: bool
35
  thinking: bool # True while Gemini has signalled RAG is needed
36
+ # Last N Q/A pairs for this session β€” injected into prompts for follow-up context.
37
+ # List of {"q": str, "a": str} dicts, oldest first, answers truncated to 120 chars.
38
+ conversation_history: list
39
+ # True when the current query explicitly criticises the previous answer.
40
+ # Triggers automatic negative feedback on the prior interaction and forces
41
+ # Gemini editorial reformat regardless of the low-trust heuristic score.
42
+ is_criticism: bool
43
  latency_ms: int
44
  error: Optional[str]
45
  interaction_id: Optional[int]
app/pipeline/nodes/gemini_fast.py CHANGED
@@ -61,7 +61,10 @@ def make_gemini_fast_node(gemini_client: GeminiClient) -> Any:
61
  "thinking": False,
62
  }
63
 
64
- answer, tool_query = await gemini_client.fast_answer(query)
 
 
 
65
 
66
  if answer is not None:
67
  # Gemini answered from context β€” no RAG needed.
 
61
  "thinking": False,
62
  }
63
 
64
+ answer, tool_query = await gemini_client.fast_answer(
65
+ query,
66
+ history=state.get("conversation_history") or [],
67
+ )
68
 
69
  if answer is not None:
70
  # Gemini answered from context β€” no RAG needed.
app/pipeline/nodes/generate.py CHANGED
@@ -8,12 +8,12 @@ from app.services.llm_client import LLMClient
8
 
9
  logger = logging.getLogger(__name__)
10
 
11
- # Covers known Darshan content areas so the LLM can give a specific redirect
12
- # when the knowledge base has nothing relevant instead of a vague hedge.
 
 
13
  _TOPIC_SUGGESTIONS = (
14
- "projects (assembly donut, AI/ML work, text processing tools, web apps, ESP32 projects), "
15
- "blog posts (he has written on embedded systems, AI, software engineering topics), "
16
- "skills (Python, C/C++, Java, ML frameworks, embedded systems), "
17
  "education, work experience, or general background"
18
  )
19
 
@@ -69,18 +69,19 @@ CRITICAL SAFETY RULES β€” override everything above:
69
  """.format(topics=_TOPIC_SUGGESTIONS)
70
 
71
  # When retrieve found nothing relevant (empty reranked_chunks), give a direct
72
- # honest answer rather than a vague "I don't have information" hedge.
 
73
  _NOT_FOUND_SYSTEM = """\
74
  You are the assistant on Darshan Chheda's portfolio website.
75
- The knowledge base was searched but returned no relevant results for this question.
76
 
77
- Respond in 1–2 sentences:
78
- 1. Confirm this specific topic isn't in the content you can access.
79
- 2. Optionally suggest a related area Darshan HAS covered: {topics}.
80
 
81
- Rules:
82
- - No apologies. No "Unfortunately". No long disclaimers.
83
- - Do not invent details. Be direct and move on.
84
  """.format(topics=_TOPIC_SUGGESTIONS)
85
 
86
  # Tokenise query into a set of normalised words for overlap detection.
@@ -112,6 +113,22 @@ def _chunks_overlap_query(tokens: frozenset[str], chunks: list) -> bool:
112
  return any(tok in combined for tok in tokens)
113
 
114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  # Phrases that indicate the model hedged despite having source passages.
116
  # Gemini reformat is triggered when any of these appear in the Groq draft.
117
  _HEDGE_PHRASES: tuple[str, ...] = (
@@ -161,8 +178,9 @@ def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[P
161
  # threshold). Use a short, model-generated honest refusal so guard
162
  # rejections and not-found both route here with quality responses.
163
  if not reranked_chunks:
 
164
  stream = llm_client.complete_with_complexity(
165
- prompt=f"Visitor question: {query}",
166
  system=_NOT_FOUND_SYSTEM,
167
  stream=True,
168
  complexity="simple", # always lightweight β€” no RAG needed
@@ -181,8 +199,9 @@ def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[P
181
  top_score = reranked_chunks[0]["metadata"].get("rerank_score", 0.0)
182
  query_toks = _query_tokens(query)
183
  if top_score < 0.0 and not _chunks_overlap_query(query_toks, reranked_chunks):
 
184
  stream = llm_client.complete_with_complexity(
185
- prompt=f"Visitor question: {query}",
186
  system=_NOT_FOUND_SYSTEM,
187
  stream=True,
188
  complexity="simple",
@@ -212,7 +231,19 @@ def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[P
212
  )
213
 
214
  context_block = "\n\n".join(context_parts)
215
- prompt = f"Passages:\n{context_block}\n\nVisitor question: {query}"
 
 
 
 
 
 
 
 
 
 
 
 
216
 
217
  # ── Generate with CoT ────────────────────────────────────────────────
218
  # The system prompt instructs the model to write reasoning inside
@@ -233,12 +264,11 @@ def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[P
233
  full_answer = re.sub(r"<think>.*?</think>\s*", "", raw_answer, flags=re.DOTALL).strip()
234
 
235
  # ── Quality gate: Gemini editorial reformat ──────────────────────────
236
- # If the Groq draft is low-trust (hedging survived, citations missing,
237
- # or suspiciously thin for a complex query), ask Gemini Flash to rewrite
238
- # it. This only fires for genuinely bad drafts; normal responses are
239
- # untouched and add zero latency.
240
- if gemini_client is not None and _is_low_trust(full_answer, reranked_chunks, complexity):
241
- logger.debug("Low-trust Groq draft detected β€” requesting Gemini reformat.")
242
  reformatted = await gemini_client.reformat_rag_answer(query, context_block, full_answer)
243
  if reformatted:
244
  full_answer = reformatted
 
8
 
9
  logger = logging.getLogger(__name__)
10
 
11
+ # Generic category labels used only to redirect visitors to valid content areas.
12
+ # IMPORTANT: never list specific project/tech names here. If the model sees
13
+ # "Assembly Donut" or "Java" in its system prompt it will present them as
14
+ # retrieved facts even when Qdrant returned zero chunks (hallucination source).
15
  _TOPIC_SUGGESTIONS = (
16
+ "his projects, blog posts, technical skills, "
 
 
17
  "education, work experience, or general background"
18
  )
19
 
 
69
  """.format(topics=_TOPIC_SUGGESTIONS)
70
 
71
  # When retrieve found nothing relevant (empty reranked_chunks), give a direct
72
+ # honest response. NO specific names or details β€” the model has no retrieved
73
+ # context here, so anything specific it says would be fabricated.
74
  _NOT_FOUND_SYSTEM = """\
75
  You are the assistant on Darshan Chheda's portfolio website.
76
+ The knowledge base search returned no relevant results for this question.
77
 
78
+ Respond in exactly 1-2 sentences:
79
+ - State plainly that you don't have that specific information available right now.
80
+ - Suggest the visitor ask about {topics}, where content is available.
81
 
82
+ CRITICAL: Do NOT name any specific project, technology, company, blog post, or skill.
83
+ You have NO retrieved facts β€” any specific name you produce is fabricated.
84
+ Be brief, honest, and generic. No apologies, no padding.
85
  """.format(topics=_TOPIC_SUGGESTIONS)
86
 
87
  # Tokenise query into a set of normalised words for overlap detection.
 
113
  return any(tok in combined for tok in tokens)
114
 
115
 
116
+ def _format_history(history: list[dict]) -> str:
117
+ """
118
+ Render prior turns as a compact prefix block.
119
+ Each turn is one line: "[Tn] Q: ... | A: ..."
120
+ Returns empty string when there is no history (first message in session).
121
+ Token cost: ~20-35 tokens per turn; max 3 turns β†’ <110 tokens overhead.
122
+ """
123
+ if not history:
124
+ return ""
125
+ lines = [
126
+ f"[T{i + 1}] Q: {t['q']} | A: {t['a']}"
127
+ for i, t in enumerate(history)
128
+ ]
129
+ return "Prior conversation (oldest first):\n" + "\n".join(lines) + "\n\n"
130
+
131
+
132
  # Phrases that indicate the model hedged despite having source passages.
133
  # Gemini reformat is triggered when any of these appear in the Groq draft.
134
  _HEDGE_PHRASES: tuple[str, ...] = (
 
178
  # threshold). Use a short, model-generated honest refusal so guard
179
  # rejections and not-found both route here with quality responses.
180
  if not reranked_chunks:
181
+ history_prefix = _format_history(state.get("conversation_history") or [])
182
  stream = llm_client.complete_with_complexity(
183
+ prompt=f"{history_prefix}Visitor question: {query}",
184
  system=_NOT_FOUND_SYSTEM,
185
  stream=True,
186
  complexity="simple", # always lightweight β€” no RAG needed
 
199
  top_score = reranked_chunks[0]["metadata"].get("rerank_score", 0.0)
200
  query_toks = _query_tokens(query)
201
  if top_score < 0.0 and not _chunks_overlap_query(query_toks, reranked_chunks):
202
+ history_prefix = _format_history(state.get("conversation_history") or [])
203
  stream = llm_client.complete_with_complexity(
204
+ prompt=f"{history_prefix}Visitor question: {query}",
205
  system=_NOT_FOUND_SYSTEM,
206
  stream=True,
207
  complexity="simple",
 
231
  )
232
 
233
  context_block = "\n\n".join(context_parts)
234
+
235
+ # ── Compact conversation history prefix ─────────────────────────────
236
+ # Injected before passages so the model can resolve follow-up references
237
+ # ("tell me more", "which one used Java?", "that was wrong") without
238
+ # needing to re-retrieve resolved information.
239
+ history_prefix = _format_history(state.get("conversation_history") or [])
240
+ is_criticism = state.get("is_criticism", False)
241
+ criticism_note = (
242
+ "NOTE: The visitor says the previous answer was wrong. "
243
+ "Re-examine the passages carefully and correct any errors.\n\n"
244
+ if is_criticism else ""
245
+ )
246
+ prompt = f"{criticism_note}{history_prefix}Passages:\n{context_block}\n\nVisitor question: {query}"
247
 
248
  # ── Generate with CoT ────────────────────────────────────────────────
249
  # The system prompt instructs the model to write reasoning inside
 
264
  full_answer = re.sub(r"<think>.*?</think>\s*", "", raw_answer, flags=re.DOTALL).strip()
265
 
266
  # ── Quality gate: Gemini editorial reformat ──────────────────────────
267
+ # Fires when: (a) criticism was detected β€” always reformat to be safe, or
268
+ # (b) low-trust heuristic flags the draft (hedging / no citations / too short).
269
+ # Zero extra cost on good responses; ~200-400ms only when genuinely needed.
270
+ if gemini_client is not None and (is_criticism or _is_low_trust(full_answer, reranked_chunks, complexity)):
271
+ logger.debug("Triggering Gemini reformat (criticism=%s).", is_criticism)
 
272
  reformatted = await gemini_client.reformat_rag_answer(query, context_block, full_answer)
273
  if reformatted:
274
  full_answer = reformatted
app/pipeline/nodes/log_eval.py CHANGED
@@ -43,6 +43,7 @@ def make_log_eval_node(db_path: str) -> Callable[[PipelineState], dict]:
43
  CREATE TABLE IF NOT EXISTS interactions (
44
  id INTEGER PRIMARY KEY AUTOINCREMENT,
45
  timestamp TEXT,
 
46
  query TEXT,
47
  answer TEXT,
48
  chunks_used TEXT,
@@ -58,6 +59,7 @@ def make_log_eval_node(db_path: str) -> Callable[[PipelineState], dict]:
58
  for col, definition in [
59
  ("reranked_chunks_json", "TEXT DEFAULT '[]'"),
60
  ("feedback", "INTEGER DEFAULT 0"),
 
61
  ]:
62
  try:
63
  conn.execute(f"ALTER TABLE interactions ADD COLUMN {col} {definition}")
@@ -67,11 +69,12 @@ def make_log_eval_node(db_path: str) -> Callable[[PipelineState], dict]:
67
  cursor = conn.execute(
68
  """
69
  INSERT INTO interactions
70
- (timestamp, query, answer, chunks_used, rerank_scores, reranked_chunks_json, latency_ms, cached)
71
- VALUES (?, ?, ?, ?, ?, ?, ?, ?)
72
  """,
73
  (
74
  datetime.utcnow().isoformat() + "Z",
 
75
  state.get("query", ""),
76
  state.get("answer", ""),
77
  chunks_used,
 
43
  CREATE TABLE IF NOT EXISTS interactions (
44
  id INTEGER PRIMARY KEY AUTOINCREMENT,
45
  timestamp TEXT,
46
+ session_id TEXT,
47
  query TEXT,
48
  answer TEXT,
49
  chunks_used TEXT,
 
59
  for col, definition in [
60
  ("reranked_chunks_json", "TEXT DEFAULT '[]'"),
61
  ("feedback", "INTEGER DEFAULT 0"),
62
+ ("session_id", "TEXT DEFAULT ''"),
63
  ]:
64
  try:
65
  conn.execute(f"ALTER TABLE interactions ADD COLUMN {col} {definition}")
 
69
  cursor = conn.execute(
70
  """
71
  INSERT INTO interactions
72
+ (timestamp, session_id, query, answer, chunks_used, rerank_scores, reranked_chunks_json, latency_ms, cached)
73
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
74
  """,
75
  (
76
  datetime.utcnow().isoformat() + "Z",
77
+ state.get("session_id", ""),
78
  state.get("query", ""),
79
  state.get("answer", ""),
80
  chunks_used,
app/services/conversation_store.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ backend/app/services/conversation_store.py
3
+
4
+ SQLite-backed per-session conversation history.
5
+
6
+ Reads the last N completed turns for a session from the existing `interactions`
7
+ table so the LLM has conversational context without a separate store.
8
+ Answers are truncated to 120 chars before injection β€” enough context for
9
+ referential follow-ups ("tell me more", "what else?", "that's wrong") without
10
+ wasting significant token budget on verbatim prior answers.
11
+
12
+ All reads/writes are synchronous sqlite3 (<3ms on SSD) β€” acceptable because:
13
+ 1. The call happens once at request start, outside the model call path.
14
+ 2. SQLite WAL mode allows concurrent readers and one writer without blocking.
15
+ """
16
+ from __future__ import annotations
17
+
18
+ import logging
19
+ import sqlite3
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+ # Visible answer length per turn injected into context.
24
+ # 120 chars β‰ˆ 25 tokens β€” plenty to resolve pronouns and follow-up references.
25
+ _ANSWER_PREVIEW_LEN = 120
26
+
27
+ # Default number of prior turns to surface. Three covers the typical "yes,
28
+ # but what about X?", "and Y?", "ok fix the previous answer" pattern.
29
+ _DEFAULT_MAX_TURNS = 3
30
+
31
+
32
+ class ConversationStore:
33
+ """
34
+ Thin read/write layer over the `interactions` SQLite table for session history.
35
+ One instance is created at startup and shared across all requests via app.state.
36
+ """
37
+
38
+ def __init__(self, db_path: str) -> None:
39
+ self._db_path = db_path
40
+
41
+ def get_recent(self, session_id: str, max_turns: int = _DEFAULT_MAX_TURNS) -> list[dict]:
42
+ """
43
+ Return the last `max_turns` completed Q/A pairs for `session_id`,
44
+ oldest first (so LLMs read them in chronological order).
45
+
46
+ Returns an empty list if there is no history or the table doesn't exist yet.
47
+ Each entry: {"q": str, "a": str} β€” `a` is truncated to _ANSWER_PREVIEW_LEN.
48
+ """
49
+ try:
50
+ with sqlite3.connect(self._db_path) as conn:
51
+ rows = conn.execute(
52
+ """
53
+ SELECT query, answer FROM interactions
54
+ WHERE session_id = ? AND answer != ''
55
+ ORDER BY id DESC
56
+ LIMIT ?
57
+ """,
58
+ (session_id, max_turns),
59
+ ).fetchall()
60
+ except sqlite3.OperationalError:
61
+ # Table doesn't exist yet (first ever request) β€” not an error.
62
+ return []
63
+ except Exception as exc:
64
+ logger.warning("ConversationStore.get_recent failed: %s", exc)
65
+ return []
66
+
67
+ # Reverse so oldest is first (chronological order for the LLM).
68
+ turns = []
69
+ for query, answer in reversed(rows):
70
+ a_preview = answer[:_ANSWER_PREVIEW_LEN]
71
+ if len(answer) > _ANSWER_PREVIEW_LEN:
72
+ a_preview += "…"
73
+ turns.append({"q": query, "a": a_preview})
74
+ return turns
75
+
76
+ def mark_last_negative(self, session_id: str) -> None:
77
+ """
78
+ Set feedback=-1 on the most recent interaction for `session_id`.
79
+ Called when the current user message clearly criticises the previous answer.
80
+ This feeds the self-improvement loop in data_prep.py / purge_bad_chunks.py.
81
+ """
82
+ try:
83
+ with sqlite3.connect(self._db_path) as conn:
84
+ conn.execute(
85
+ """
86
+ UPDATE interactions SET feedback = -1
87
+ WHERE id = (
88
+ SELECT id FROM interactions
89
+ WHERE session_id = ?
90
+ ORDER BY id DESC
91
+ LIMIT 1
92
+ )
93
+ """,
94
+ (session_id,),
95
+ )
96
+ except Exception as exc:
97
+ logger.warning("ConversationStore.mark_last_negative failed: %s", exc)
app/services/gemini_client.py CHANGED
@@ -159,22 +159,35 @@ class GeminiClient:
159
  self._cache.popitem(last=False) # FIFO: remove oldest
160
  self._cache[key] = (answer, tool_query, time.monotonic())
161
 
162
- async def fast_answer(self, query: str) -> tuple[Optional[str], Optional[str]]:
163
  """
164
  Ask Gemini to answer or signal it needs the full knowledge base.
165
 
166
  Returns one of:
167
  (answer: str, None) β€” Gemini answered from context; stream to user, no citations.
168
  (None, tool_query: str) β€” Gemini called search_knowledge_base(); run RAG pipeline.
 
 
 
 
169
  """
170
  if not self._client:
171
  return None, query
172
 
 
173
  cache_key = _normalise(query)
174
- cached = self._cache_get(cache_key)
175
- if cached is not None:
176
- logger.debug("Gemini cache hit for key=%r", cache_key[:40])
177
- return cached
 
 
 
 
 
 
 
 
178
 
179
  from google.genai import types # noqa: PLC0415
180
 
@@ -226,7 +239,7 @@ class GeminiClient:
226
  try:
227
  response = await self._client.aio.models.generate_content( # type: ignore[attr-defined]
228
  model=self._model,
229
- contents=query,
230
  config=types.GenerateContentConfig(
231
  system_instruction=system_prompt,
232
  tools=[search_tool],
@@ -240,7 +253,8 @@ class GeminiClient:
240
  if hasattr(part, "function_call") and part.function_call:
241
  tool_query = (part.function_call.args or {}).get("query", query)
242
  result = None, str(tool_query)
243
- self._cache_set(cache_key, *result)
 
244
  logger.debug("Gemini called search_knowledge_base(query=%r)", tool_query)
245
  return result
246
  if hasattr(part, "text") and part.text:
@@ -248,7 +262,8 @@ class GeminiClient:
248
 
249
  if answer_parts:
250
  answer = "".join(answer_parts).strip()
251
- self._cache_set(cache_key, answer, None)
 
252
  return answer, None
253
 
254
  # Empty response β€” fall back to RAG gracefully.
 
159
  self._cache.popitem(last=False) # FIFO: remove oldest
160
  self._cache[key] = (answer, tool_query, time.monotonic())
161
 
162
+ async def fast_answer(self, query: str, history: list[dict] | None = None) -> tuple[Optional[str], Optional[str]]:
163
  """
164
  Ask Gemini to answer or signal it needs the full knowledge base.
165
 
166
  Returns one of:
167
  (answer: str, None) β€” Gemini answered from context; stream to user, no citations.
168
  (None, tool_query: str) β€” Gemini called search_knowledge_base(); run RAG pipeline.
169
+
170
+ When `history` is provided (non-empty), the cache is bypassed entirely because
171
+ the same question in an active conversation may need a different answer based on
172
+ what was established in earlier turns. Cache only applies to context-free queries.
173
  """
174
  if not self._client:
175
  return None, query
176
 
177
+ use_cache = not history # skip cache when conversation context is present
178
  cache_key = _normalise(query)
179
+ if use_cache:
180
+ cached = self._cache_get(cache_key)
181
+ if cached is not None:
182
+ logger.debug("Gemini cache hit for key=%r", cache_key[:40])
183
+ return cached
184
+
185
+ # Build user message β€” prepend prior turns so Gemini has referential context.
186
+ if history:
187
+ prior = "\n".join(f"Q: {t['q']}\nA: {t['a']}" for t in history)
188
+ user_message = f"[Prior conversation]\n{prior}\n\n[Current question]\n{query}"
189
+ else:
190
+ user_message = query
191
 
192
  from google.genai import types # noqa: PLC0415
193
 
 
239
  try:
240
  response = await self._client.aio.models.generate_content( # type: ignore[attr-defined]
241
  model=self._model,
242
+ contents=user_message,
243
  config=types.GenerateContentConfig(
244
  system_instruction=system_prompt,
245
  tools=[search_tool],
 
253
  if hasattr(part, "function_call") and part.function_call:
254
  tool_query = (part.function_call.args or {}).get("query", query)
255
  result = None, str(tool_query)
256
+ if use_cache:
257
+ self._cache_set(cache_key, *result)
258
  logger.debug("Gemini called search_knowledge_base(query=%r)", tool_query)
259
  return result
260
  if hasattr(part, "text") and part.text:
 
262
 
263
  if answer_parts:
264
  answer = "".join(answer_parts).strip()
265
+ if use_cache:
266
+ self._cache_set(cache_key, answer, None)
267
  return answer, None
268
 
269
  # Empty response β€” fall back to RAG gracefully.