GitHub Actions committed on
Commit
3d134a6
·
1 Parent(s): e7c9ee6

Deploy 85f07db

Browse files
app/core/config.py CHANGED
@@ -53,6 +53,14 @@ class Settings(BaseSettings):
53
  GEMINI_MODEL: str = "gemini-2.5-flash-lite"
54
  GEMINI_CONTEXT_PATH: str = "backend/app/services/gemini_context.toon"
55
 
 
 
 
 
 
 
 
 
56
  # HuggingFace Space model servers.
57
  # In local env, embedder/reranker run in-process (these URLs are ignored).
58
  # In prod, the API Space calls the HF embedder/reranker Spaces via HTTP.
 
53
  GEMINI_MODEL: str = "gemini-2.5-flash-lite"
54
  GEMINI_CONTEXT_PATH: str = "backend/app/services/gemini_context.toon"
55
 
56
+ # Durable GitHub interaction log — survives HF Space restarts.
57
+ # PERSONABOT_WRITE_TOKEN: fine-grained PAT with read+write Contents access
58
+ # on the PersonaBot repo. When set, every interaction is appended to
59
+ # data/interactions.jsonl in the repo so training signals persist.
60
+ # Leave unset in local dev (interactions stay in SQLite only).
61
+ PERSONABOT_WRITE_TOKEN: Optional[str] = None
62
+ PERSONABOT_REPO: str = "1337Xcode/PersonaBot"
63
+
64
  # HuggingFace Space model servers.
65
  # In local env, embedder/reranker run in-process (these URLs are ignored).
66
  # In prod, the API Space calls the HF embedder/reranker Spaces via HTTP.
app/core/quality.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ backend/app/core/quality.py
3
+
4
+ Shared quality-gate logic used by both the generate node (Groq responses) and
5
+ the gemini_fast node (Gemini fast-path responses).
6
+
7
+ Centralised here — rather than in generate.py — so the same hedge-detection and
8
+ trust scoring logic runs on every answer regardless of which pipeline branch produced
9
+ it. Duplicating the list of hedge phrases across two modules was the root cause of
10
+ Bug A (Issue 2): Gemini fast-path answers were never checked for hedge phrases.
11
+ """
12
+ from __future__ import annotations
13
+
14
+ import re
15
+
16
+ # Phrases that indicate the model hedged despite having been told not to.
17
+ # Applies to both Groq (generate node) and Gemini (gemini_fast node) outputs.
18
+ _HEDGE_PHRASES: tuple[str, ...] = (
19
+ "unfortunately",
20
+ "limited information",
21
+ "passages only",
22
+ "passages do not",
23
+ "passages don't",
24
+ "you may need to",
25
+ "you may want to",
26
+ "i don't have",
27
+ "i cannot provide",
28
+ "not able to provide",
29
+ "does not provide",
30
+ "does not offer",
31
+ "no detailed information",
32
+ )
33
+
34
+
35
+ def is_low_trust(answer: str, chunks: list, complexity: str) -> bool:
36
+ """
37
+ Return True when the answer is likely poor quality and should be reformatted
38
+ or rerouted to the full RAG pipeline.
39
+
40
+ Three signals, checked in order of cost (cheapest first):
41
+ 1. A hedge phrase survived the system-prompt prohibition.
42
+ 2. Chunks were retrieved but the model cited nothing (no [N] markers).
43
+ Not applicable to Gemini fast-path answers (chunks is always empty there).
44
+ 3. Answer is suspiciously short for a complex query (< 30 words).
45
+ """
46
+ lowered = answer.lower()
47
+ if any(phrase in lowered for phrase in _HEDGE_PHRASES):
48
+ return True
49
+ if chunks and not re.search(r"\[\d+\]", answer):
50
+ return True
51
+ if complexity == "complex" and len(answer.split()) < 30:
52
+ return True
53
+ return False
app/main.py CHANGED
@@ -1,5 +1,6 @@
1
  from contextlib import asynccontextmanager
2
  import os
 
3
 
4
  from fastapi import FastAPI, Request
5
  from fastapi.middleware.cors import CORSMiddleware
@@ -17,6 +18,8 @@ from app.pipeline.graph import build_pipeline
17
  from app.security.rate_limiter import limiter, custom_rate_limit_handler
18
  from app.services.embedder import Embedder
19
  from app.services.gemini_client import GeminiClient
 
 
20
  from app.services.reranker import Reranker
21
  from app.services.semantic_cache import SemanticCache
22
  from app.services.conversation_store import ConversationStore
@@ -25,18 +28,47 @@ from qdrant_client import QdrantClient
25
  logger = get_logger(__name__)
26
 
27
 
 
 
 
 
 
 
 
 
 
 
 
28
  @asynccontextmanager
29
  async def lifespan(app: FastAPI):
30
  settings = get_settings()
31
  logger.info("Starting PersonaBot API | env=%s", settings.ENVIRONMENT)
32
 
 
 
 
 
 
 
 
 
 
33
  # Attach the in-memory semantic cache. No external service required.
34
  app.state.semantic_cache = SemanticCache(
35
  max_size=settings.SEMANTIC_CACHE_SIZE,
36
  ttl_seconds=settings.SEMANTIC_CACHE_TTL_SECONDS,
37
  similarity_threshold=settings.SEMANTIC_CACHE_SIMILARITY_THRESHOLD,
38
  )
39
- app.state.conversation_store = ConversationStore(settings.DB_PATH)
 
 
 
 
 
 
 
 
 
40
 
41
  # DagsHub/MLflow experiment tracking — optional, only active when token is set.
42
  # In prod with DAGSHUB_TOKEN set, experiments are tracked at dagshub.com.
@@ -61,7 +93,6 @@ async def lifespan(app: FastAPI):
61
  )
62
  app.state.gemini_client = gemini_client
63
 
64
- from app.services.llm_client import get_llm_client
65
  from app.services.vector_store import VectorStore
66
  from app.security.guard_classifier import GuardClassifier
67
 
@@ -76,7 +107,11 @@ async def lifespan(app: FastAPI):
76
  # ingest run doesn't crash every search with "collection not found".
77
  vector_store.ensure_collection()
78
 
79
- llm_client = get_llm_client(settings)
 
 
 
 
80
  # Expose llm_client on app state so chat.py can use it for follow-up
81
  # question generation without re-constructing the client per request.
82
  app.state.llm_client = llm_client
@@ -90,6 +125,7 @@ async def lifespan(app: FastAPI):
90
  "vector_store": vector_store,
91
  "reranker": reranker,
92
  "db_path": settings.DB_PATH,
 
93
  })
94
  app.state.settings = settings
95
  app.state.qdrant = qdrant
 
1
  from contextlib import asynccontextmanager
2
  import os
3
+ import sqlite3
4
 
5
  from fastapi import FastAPI, Request
6
  from fastapi.middleware.cors import CORSMiddleware
 
18
  from app.security.rate_limiter import limiter, custom_rate_limit_handler
19
  from app.services.embedder import Embedder
20
  from app.services.gemini_client import GeminiClient
21
+ from app.services.github_log import GithubLog
22
+ from app.services.llm_client import get_llm_client, TpmBucket
23
  from app.services.reranker import Reranker
24
  from app.services.semantic_cache import SemanticCache
25
  from app.services.conversation_store import ConversationStore
 
28
  logger = get_logger(__name__)
29
 
30
 
31
+ def _sqlite_row_count(db_path: str) -> int:
32
+ """Return the current interactions row count, or 0 if the table doesn't exist."""
33
+ try:
34
+ with sqlite3.connect(db_path) as conn:
35
+ return conn.execute("SELECT COUNT(*) FROM interactions").fetchone()[0]
36
+ except sqlite3.OperationalError:
37
+ return 0
38
+ except Exception:
39
+ return 0
40
+
41
+
42
  @asynccontextmanager
43
  async def lifespan(app: FastAPI):
44
  settings = get_settings()
45
  logger.info("Starting PersonaBot API | env=%s", settings.ENVIRONMENT)
46
 
47
+ # Durable GitHub interaction log — survives HF Space restarts.
48
+ # When PERSONABOT_WRITE_TOKEN is not set (local dev), GithubLog.enabled=False
49
+ # and all append calls are silent no-ops.
50
+ github_log = GithubLog(
51
+ write_token=settings.PERSONABOT_WRITE_TOKEN or "",
52
+ repo=settings.PERSONABOT_REPO,
53
+ )
54
+ app.state.github_log = github_log
55
+
56
  # Attach the in-memory semantic cache. No external service required.
57
  app.state.semantic_cache = SemanticCache(
58
  max_size=settings.SEMANTIC_CACHE_SIZE,
59
  ttl_seconds=settings.SEMANTIC_CACHE_TTL_SECONDS,
60
  similarity_threshold=settings.SEMANTIC_CACHE_SIMILARITY_THRESHOLD,
61
  )
62
+ app.state.conversation_store = ConversationStore(settings.DB_PATH, github_log=github_log)
63
+
64
+ # Issue 1: reconstruct SQLite conversation history from the durable GitHub log
65
+ # after an ephemeral HF Space restart. Only triggers when SQLite is empty
66
+ # (<10 rows) so a healthy Space with accumulated data is never overwritten.
67
+ if github_log.enabled and _sqlite_row_count(settings.DB_PATH) < 10:
68
+ logger.info("SQLite appears empty — attempting reconstruction from durable log.")
69
+ recent = await github_log.load_recent(500)
70
+ if recent:
71
+ app.state.conversation_store.populate_from_records(recent)
72
 
73
  # DagsHub/MLflow experiment tracking — optional, only active when token is set.
74
  # In prod with DAGSHUB_TOKEN set, experiments are tracked at dagshub.com.
 
93
  )
94
  app.state.gemini_client = gemini_client
95
 
 
96
  from app.services.vector_store import VectorStore
97
  from app.security.guard_classifier import GuardClassifier
98
 
 
107
  # ingest run doesn't crash every search with "collection not found".
108
  vector_store.ensure_collection()
109
 
110
+ # Issue 7: shared TPM bucket tracks token consumption across the current 60s
111
+ # window. Injected into GroqClient so it can downgrade 70B → 8B automatically
112
+ # when the bucket is above 12,000 tokens, preventing hard rate-limit failures.
113
+ tpm_bucket = TpmBucket()
114
+ llm_client = get_llm_client(settings, tpm_bucket=tpm_bucket)
115
  # Expose llm_client on app state so chat.py can use it for follow-up
116
  # question generation without re-constructing the client per request.
117
  app.state.llm_client = llm_client
 
125
  "vector_store": vector_store,
126
  "reranker": reranker,
127
  "db_path": settings.DB_PATH,
128
+ "github_log": github_log,
129
  })
130
  app.state.settings = settings
131
  app.state.qdrant = qdrant
app/models/pipeline.py CHANGED
@@ -51,3 +51,9 @@ class PipelineState(TypedDict):
51
  # Follow-up question suggestions generated after the main answer.
52
  # 3 short questions specific to content in the answer.
53
  follow_ups: list[str]
 
 
 
 
 
 
 
51
  # Follow-up question suggestions generated after the main answer.
52
  # 3 short questions specific to content in the answer.
53
  follow_ups: list[str]
54
+ # Which pipeline branch produced the final answer.
55
+ # Values: "cache_hit", "gemini_fast", "rag", "blocked".
56
+ # Set by cache, gemini_fast, and generate nodes respectively.
57
+ # data_prep.py filters to path=="rag" when building reranker triplets because
58
+ # only RAG interactions have chunk associations that form valid training pairs.
59
+ path: Optional[str]
app/pipeline/graph.py CHANGED
@@ -72,7 +72,7 @@ def build_pipeline(services: dict) -> CompiledStateGraph:
72
  # CRAG: one query rewrite on failed retrieval — then retrieve runs a second time.
73
  graph.add_node("rewrite_query", make_rewrite_query_node(services["gemini"]))
74
  graph.add_node("generate", make_generate_node(services["llm"], services["gemini"]))
75
- graph.add_node("log_eval", make_log_eval_node(services["db_path"]))
76
 
77
  graph.set_entry_point("guard")
78
 
 
72
  # CRAG: one query rewrite on failed retrieval — then retrieve runs a second time.
73
  graph.add_node("rewrite_query", make_rewrite_query_node(services["gemini"]))
74
  graph.add_node("generate", make_generate_node(services["llm"], services["gemini"]))
75
+ graph.add_node("log_eval", make_log_eval_node(services["db_path"], services.get("github_log")))
76
 
77
  graph.set_entry_point("guard")
78
 
app/pipeline/nodes/cache.py CHANGED
@@ -5,6 +5,15 @@
5
  #
6
  # The computed query embedding is stored in state so the retrieve node can
7
  # reuse it directly — avoiding a second identical HTTP call to the embedder.
 
 
 
 
 
 
 
 
 
8
 
9
  from typing import Callable
10
 
@@ -13,20 +22,55 @@ import numpy as np
13
  from app.models.pipeline import PipelineState
14
  from app.services.semantic_cache import SemanticCache
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  def make_cache_node(cache: SemanticCache, embedder) -> Callable[[PipelineState], dict]:
18
  async def cache_node(state: PipelineState) -> dict:
 
 
 
 
 
 
 
 
 
 
19
  # is_query=True: prepend BGE asymmetric instruction so query embedding
20
  # lands in the retrieval-optimised neighbourhood of the vector space.
21
  # Document embeddings at ingestion time use is_query=False (default).
22
- embedding = await embedder.embed_one(state["query"], is_query=True)
23
  query_embedding = np.array(embedding)
24
 
25
  cached = await cache.get(query_embedding)
26
  if cached:
27
- return {"answer": cached, "cached": True, "query_embedding": embedding}
 
 
 
 
 
28
 
29
  # Store embedding in state so retrieve_node doesn't re-embed the same query.
30
  return {"cached": False, "query_embedding": embedding}
31
 
32
  return cache_node
 
 
5
  #
6
  # The computed query embedding is stored in state so the retrieve node can
7
  # reuse it directly — avoiding a second identical HTTP call to the embedder.
8
+ #
9
+ # Issue 5 (cache bypass narrowing):
10
+ # The previous design bypassed the cache unconditionally for any multi-turn
11
+ # session. This prevented caching self-contained follow-up queries like
12
+ # "what programming languages does Darshan know?" even when they appear after
13
+ # prior turns. The corrected behaviour: check for unresolved reference tokens
14
+ # (pronouns, demonstratives) BEFORE the cache lookup. Only queries that
15
+ # contain such tokens AND have conversation history are cache-bypassed. All
16
+ # other queries in multi-turn sessions go through cache normally.
17
 
18
  from typing import Callable
19
 
 
22
  from app.models.pipeline import PipelineState
23
  from app.services.semantic_cache import SemanticCache
24
 
25
+ # Tokens that indicate the query cannot be understood without prior context:
26
+ # pronouns and demonstratives that refer to something the user said earlier.
27
+ # "his" and "he" are excluded — they almost always refer to Darshan, not a
28
+ # prior turn, and excluding them would bypass cache on most portfolio queries.
29
+ _REFERENCE_TOKENS: frozenset[str] = frozenset({
30
+ "that", "it", "its", "they", "their", "those",
31
+ "this", "these", "them", "there", "then",
32
+ })
33
+
34
+
35
+ def _has_unresolved_reference(query: str) -> bool:
36
+ """
37
+ True when the query contains a pronoun or demonstrative that likely refers
38
+ to something in the prior conversation turn rather than to Darshan or the
39
+ portfolio content.
40
+ """
41
+ tokens = frozenset(query.lower().split())
42
+ return bool(tokens & _REFERENCE_TOKENS)
43
+
44
 
45
def make_cache_node(cache: SemanticCache, embedder) -> Callable[[PipelineState], dict]:
    """
    Build the semantic-cache pipeline node.

    The node embeds the query once, bypasses the cache for genuine follow-up
    queries (unresolved pronoun/demonstrative + existing conversation history),
    and otherwise returns a cached answer when a similar query was seen before.
    """

    async def cache_node(state: PipelineState) -> dict:
        query = state["query"]

        # is_query=True: prepend BGE asymmetric instruction so query embedding
        # lands in the retrieval-optimised neighbourhood of the vector space.
        # Document embeddings at ingestion time use is_query=False (default).
        # Embedded once, up front: both the bypass branch and the cache-lookup
        # branch need it, so this removes the previously duplicated embed call.
        embedding = await embedder.embed_one(query, is_query=True)

        # If the query contains a reference token AND the session has history,
        # the query is a genuine follow-up that cannot be resolved without context.
        # Skip the cache so the pipeline injects history into downstream nodes.
        if state.get("conversation_history") and _has_unresolved_reference(query):
            return {"cached": False, "query_embedding": embedding}

        cached = await cache.get(np.array(embedding))
        if cached:
            return {
                "answer": cached,
                "cached": True,
                "query_embedding": embedding,
                "path": "cache_hit",
            }

        # Store embedding in state so retrieve_node doesn't re-embed the same query.
        return {"cached": False, "query_embedding": embedding}

    return cache_node
76
+
app/pipeline/nodes/gemini_fast.py CHANGED
@@ -9,8 +9,14 @@ Decision logic:
9
  - Gemini calls search_knowledge_base() → state.thinking=True, pipeline
10
  goes to retrieve+generate so the user gets a cited answer.
11
 
12
- The `expand` node is no longer part of the graph; this node carries the
13
- complexity classification it depended on (O(1) heuristic, no LLM call).
 
 
 
 
 
 
14
  """
15
  from __future__ import annotations
16
 
@@ -19,6 +25,7 @@ from typing import Any
19
 
20
  from app.models.pipeline import PipelineState
21
  from app.services.gemini_client import GeminiClient
 
22
 
23
  logger = logging.getLogger(__name__)
24
 
@@ -32,13 +39,28 @@ _COMPLEX_SIGNALS: frozenset[str] = frozenset({
32
  "reference", "proof", "derive", "calculate", "optimise", "optimize",
33
  })
34
 
 
 
 
 
 
 
 
35
 
36
  def _is_complex(query: str) -> bool:
37
- """O(1) heuristic — true when the query signals a need for a cited answer."""
 
 
 
 
 
 
 
38
  tokens = set(query.lower().split())
39
- if len(tokens) > 20:
 
40
  return True
41
- return bool(tokens & _COMPLEX_SIGNALS)
42
 
43
 
44
  def make_gemini_fast_node(gemini_client: GeminiClient) -> Any:
@@ -67,13 +89,28 @@ def make_gemini_fast_node(gemini_client: GeminiClient) -> Any:
67
  )
68
 
69
  if answer is not None:
70
- # Gemini answered from context no RAG needed.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  logger.debug("Gemini fast-path answered query (len=%d)", len(answer))
72
  return {
73
  "query_complexity": complexity,
74
  "answer": answer,
75
  "sources": [],
76
  "thinking": False,
 
77
  }
78
 
79
  # Gemini called search_knowledge_base() — signal RAG via thinking=True.
@@ -86,3 +123,4 @@ def make_gemini_fast_node(gemini_client: GeminiClient) -> Any:
86
  }
87
 
88
  return gemini_fast
 
 
9
  - Gemini calls search_knowledge_base() → state.thinking=True, pipeline
10
  goes to retrieve+generate so the user gets a cited answer.
11
 
12
+ Bug A fix (Issue 2): Gemini fast-path answers now run through the same
13
+ is_low_trust() quality gate as Groq answers. If the gate fires (hedge phrase
14
+ detected, or suspiciously short complex answer), the answer is discarded
15
+ and the pipeline routes to full RAG instead of returning a low-quality answer.
16
+
17
+ Issue 7 fix: _is_complex() now requires BOTH a keyword match AND query length
18
+ > 8 words, eliminating false-positive complex classifications for short
19
+ conversational queries like "How?" or "How many projects?".
20
  """
21
  from __future__ import annotations
22
 
 
25
 
26
  from app.models.pipeline import PipelineState
27
  from app.services.gemini_client import GeminiClient
28
+ from app.core.quality import is_low_trust
29
 
30
  logger = logging.getLogger(__name__)
31
 
 
39
  "reference", "proof", "derive", "calculate", "optimise", "optimize",
40
  })
41
 
42
+ # Minimum token count for a query to be classified as complex.
43
+ # Queries shorter than this are almost always conversational or simple
44
+ # biographical lookups regardless of vocabulary. "How?" alone currently
45
+ # triggers 70B without this gate; "How many projects?" should not.
46
+ # Documented in copilot-instructions.md — do not lower without profiling.
47
+ _COMPLEX_MIN_WORDS: int = 8
48
+
49
 
50
  def _is_complex(query: str) -> bool:
51
+ """
52
+ O(1) heuristic — true when the query signals a need for a cited answer.
53
+
54
+ A query is complex only when BOTH conditions hold:
55
+ 1. It contains a complexity-signal keyword (architecture, explain, etc.)
56
+ 2. Its length exceeds _COMPLEX_MIN_WORDS (eliminates "How?" false positives)
57
+ OR it is extremely long (>20 tokens, reliably indicates detailed request).
58
+ """
59
  tokens = set(query.lower().split())
60
+ token_count = len(tokens)
61
+ if token_count > 20:
62
  return True
63
+ return bool(tokens & _COMPLEX_SIGNALS) and token_count > _COMPLEX_MIN_WORDS
64
 
65
 
66
  def make_gemini_fast_node(gemini_client: GeminiClient) -> Any:
 
89
  )
90
 
91
  if answer is not None:
92
+ # Run the same quality gate that guards Groq answers.
93
+ # Gemini fast-path has no retrieved chunks, so only the hedge-phrase
94
+ # and short-complex-answer signals apply (chunks argument is []).
95
+ if is_low_trust(answer, [], complexity):
96
+ logger.debug(
97
+ "Gemini fast-path answer failed quality gate — routing to RAG."
98
+ )
99
+ # Clear the answer so route_gemini() sends us to RAG.
100
+ return {
101
+ "query_complexity": complexity,
102
+ "expanded_queries": [query],
103
+ "thinking": True,
104
+ }
105
+
106
+ # Gemini answered from context and passed quality gate.
107
  logger.debug("Gemini fast-path answered query (len=%d)", len(answer))
108
  return {
109
  "query_complexity": complexity,
110
  "answer": answer,
111
  "sources": [],
112
  "thinking": False,
113
+ "path": "gemini_fast",
114
  }
115
 
116
  # Gemini called search_knowledge_base() — signal RAG via thinking=True.
 
123
  }
124
 
125
  return gemini_fast
126
+
app/pipeline/nodes/generate.py CHANGED
@@ -5,6 +5,7 @@ from typing import Callable
5
  from app.models.chat import SourceRef
6
  from app.models.pipeline import PipelineState
7
  from app.services.llm_client import LLMClient
 
8
 
9
  logger = logging.getLogger(__name__)
10
 
@@ -135,42 +136,6 @@ def _format_history(history: list[dict]) -> str:
135
  return "Prior conversation (oldest first):\n" + "\n".join(lines) + "\n\n"
136
 
137
 
138
- # Phrases that indicate the model hedged despite having source passages.
139
- # Gemini reformat is triggered when any of these appear in the Groq draft.
140
- _HEDGE_PHRASES: tuple[str, ...] = (
141
- "unfortunately",
142
- "limited information",
143
- "passages only",
144
- "passages do not",
145
- "passages don't",
146
- "you may need to",
147
- "you may want to",
148
- "i don't have",
149
- "i cannot provide",
150
- "not able to provide",
151
- "does not provide",
152
- "does not offer",
153
- "no detailed information",
154
- )
155
-
156
-
157
- def _is_low_trust(answer: str, chunks: list, complexity: str) -> bool:
158
- """
159
- True when the Groq draft is likely poor quality and Gemini should rewrite it.
160
-
161
- Three signals:
162
- 1. Hedging phrase survived the system-prompt prohibition.
163
- 2. Chunks were retrieved but the model cited nothing (no [N] markers).
164
- 3. Answer is suspiciously short for a complex query (< 30 words).
165
- """
166
- lowered = answer.lower()
167
- if any(phrase in lowered for phrase in _HEDGE_PHRASES):
168
- return True
169
- if chunks and not re.search(r"\[\d+\]", answer):
170
- return True
171
- if complexity == "complex" and len(answer.split()) < 30:
172
- return True
173
- return False
174
 
175
 
176
  def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[PipelineState], dict]: # noqa: ANN001
@@ -194,7 +159,7 @@ def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[P
194
  full_answer = ""
195
  async for token in stream:
196
  full_answer += token
197
- return {"answer": full_answer, "sources": []}
198
 
199
  # ── Pre-LLM coherence shortcut ──────────────────────────────────────
200
  # Check that at least one meaningful query token appears somewhere in
@@ -215,7 +180,7 @@ def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[P
215
  full_answer = ""
216
  async for token in stream:
217
  full_answer += token
218
- return {"answer": full_answer, "sources": []}
219
 
220
  # ── Build numbered context block ────────────────────────────────────
221
  context_parts: list[str] = []
@@ -273,7 +238,7 @@ def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[P
273
  # Fires when: (a) criticism was detected — always reformat to be safe, or
274
  # (b) low-trust heuristic flags the draft (hedging / no citations / too short).
275
  # Zero extra cost on good responses; ~200-400ms only when genuinely needed.
276
- if gemini_client is not None and (is_criticism or _is_low_trust(full_answer, reranked_chunks, complexity)):
277
  logger.debug("Triggering Gemini reformat (criticism=%s).", is_criticism)
278
  reformatted = await gemini_client.reformat_rag_answer(query, context_block, full_answer)
279
  if reformatted:
@@ -287,6 +252,9 @@ def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[P
287
  return {
288
  "answer": full_answer,
289
  "sources": cited_sources if cited_sources else source_refs[:2],
 
 
 
290
  }
291
 
292
  return generate_node
 
5
  from app.models.chat import SourceRef
6
  from app.models.pipeline import PipelineState
7
  from app.services.llm_client import LLMClient
8
+ from app.core.quality import is_low_trust
9
 
10
  logger = logging.getLogger(__name__)
11
 
 
136
  return "Prior conversation (oldest first):\n" + "\n".join(lines) + "\n\n"
137
 
138
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
 
140
 
141
  def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[PipelineState], dict]: # noqa: ANN001
 
159
  full_answer = ""
160
  async for token in stream:
161
  full_answer += token
162
+ return {"answer": full_answer, "sources": [], "path": "rag"}
163
 
164
  # ── Pre-LLM coherence shortcut ──────────────────────────────────────
165
  # Check that at least one meaningful query token appears somewhere in
 
180
  full_answer = ""
181
  async for token in stream:
182
  full_answer += token
183
+ return {"answer": full_answer, "sources": [], "path": "rag"}
184
 
185
  # ── Build numbered context block ────────────────────────────────────
186
  context_parts: list[str] = []
 
238
  # Fires when: (a) criticism was detected — always reformat to be safe, or
239
  # (b) low-trust heuristic flags the draft (hedging / no citations / too short).
240
  # Zero extra cost on good responses; ~200-400ms only when genuinely needed.
241
+ if gemini_client is not None and (is_criticism or is_low_trust(full_answer, reranked_chunks, complexity)):
242
  logger.debug("Triggering Gemini reformat (criticism=%s).", is_criticism)
243
  reformatted = await gemini_client.reformat_rag_answer(query, context_block, full_answer)
244
  if reformatted:
 
252
  return {
253
  "answer": full_answer,
254
  "sources": cited_sources if cited_sources else source_refs[:2],
255
+ # Tag this interaction so data_prep.py can filter to RAG-path only
256
+ # when building reranker triplets (only RAG has chunk associations).
257
+ "path": "rag",
258
  }
259
 
260
  return generate_node
app/pipeline/nodes/guard.py CHANGED
@@ -22,7 +22,8 @@ def make_guard_node(classifier: GuardClassifier) -> Callable[[PipelineState], di
22
  return {
23
  "query": clean_query,
24
  "guard_passed": False,
25
- "answer": "I can only answer questions about Darshan's work, projects, and background."
 
26
  }
27
 
28
  # 3. Classify (Scope evaluation)
@@ -32,7 +33,8 @@ def make_guard_node(classifier: GuardClassifier) -> Callable[[PipelineState], di
32
  return {
33
  "query": clean_query,
34
  "guard_passed": False,
35
- "answer": "I can only answer questions about Darshan's work, projects, and background."
 
36
  }
37
 
38
  return {
 
22
  return {
23
  "query": clean_query,
24
  "guard_passed": False,
25
+ "answer": "I can only answer questions about Darshan's work, projects, and background.",
26
+ "path": "blocked",
27
  }
28
 
29
  # 3. Classify (Scope evaluation)
 
33
  return {
34
  "query": clean_query,
35
  "guard_passed": False,
36
+ "answer": "I can only answer questions about Darshan's work, projects, and background.",
37
+ "path": "blocked",
38
  }
39
 
40
  return {
app/pipeline/nodes/log_eval.py CHANGED
@@ -2,7 +2,7 @@ import json
2
  import logging
3
  import sqlite3
4
  import os
5
- from datetime import datetime
6
  from typing import Callable
7
 
8
  from app.models.pipeline import PipelineState
@@ -11,12 +11,20 @@ from app.core.config import get_settings
11
  logger = logging.getLogger(__name__)
12
 
13
 
14
- def make_log_eval_node(db_path: str) -> Callable[[PipelineState], dict]:
15
  """
16
  Writes interaction to SQLite synchronously (<5ms) inside the request lifespan.
17
- Returns the row ID as interaction_id so the API can expose it for feedback.
18
- RAGAS evaluation runs separately in the GitHub Actions eval workflow against
19
- the accumulated SQLite data — not in the request path.
 
 
 
 
 
 
 
 
20
  """
21
 
22
  def _write_to_sqlite(state: PipelineState) -> int:
@@ -36,6 +44,7 @@ def make_log_eval_node(db_path: str) -> Callable[[PipelineState], dict]:
36
  [{"text": c["text"], "doc_id": c["metadata"]["doc_id"]}
37
  for c in state.get("reranked_chunks", [])]
38
  )
 
39
 
40
  with sqlite3.connect(db_path) as conn:
41
  conn.execute(
@@ -51,7 +60,8 @@ def make_log_eval_node(db_path: str) -> Callable[[PipelineState], dict]:
51
  reranked_chunks_json TEXT,
52
  latency_ms INTEGER,
53
  cached BOOLEAN,
54
- feedback INTEGER DEFAULT 0
 
55
  )
56
  """
57
  )
@@ -60,6 +70,8 @@ def make_log_eval_node(db_path: str) -> Callable[[PipelineState], dict]:
60
  ("reranked_chunks_json", "TEXT DEFAULT '[]'"),
61
  ("feedback", "INTEGER DEFAULT 0"),
62
  ("session_id", "TEXT DEFAULT ''"),
 
 
63
  ]:
64
  try:
65
  conn.execute(f"ALTER TABLE interactions ADD COLUMN {col} {definition}")
@@ -69,11 +81,11 @@ def make_log_eval_node(db_path: str) -> Callable[[PipelineState], dict]:
69
  cursor = conn.execute(
70
  """
71
  INSERT INTO interactions
72
- (timestamp, session_id, query, answer, chunks_used, rerank_scores, reranked_chunks_json, latency_ms, cached)
73
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
74
  """,
75
  (
76
- datetime.utcnow().isoformat() + "Z",
77
  state.get("session_id", ""),
78
  state.get("query", ""),
79
  state.get("answer", ""),
@@ -82,6 +94,7 @@ def make_log_eval_node(db_path: str) -> Callable[[PipelineState], dict]:
82
  reranked_chunks_json,
83
  state.get("latency_ms", 0),
84
  state.get("cached", False),
 
85
  ),
86
  )
87
  return cursor.lastrowid # type: ignore[return-value]
@@ -89,6 +102,33 @@ def make_log_eval_node(db_path: str) -> Callable[[PipelineState], dict]:
89
  async def log_eval_node(state: PipelineState) -> dict:
90
  try:
91
  row_id = _write_to_sqlite(state)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  return {"interaction_id": row_id}
93
  except Exception as e:
94
  # Log but never surface to user — this node is a sink.
 
2
  import logging
3
  import sqlite3
4
  import os
5
+ from datetime import datetime, timezone
6
  from typing import Callable
7
 
8
  from app.models.pipeline import PipelineState
 
11
  logger = logging.getLogger(__name__)
12
 
13
 
14
+ def make_log_eval_node(db_path: str, github_log=None) -> Callable[[PipelineState], dict]:
15
  """
16
  Writes interaction to SQLite synchronously (<5ms) inside the request lifespan.
17
+ Also appends to the durable GitHub JSONL log (fire-and-forget background task)
18
+ so training signals survive HuggingFace Space restarts.
19
+
20
+ The `path` field tags which pipeline branch produced the answer:
21
+ "cache_hit" — served from semantic cache, no LLM called.
22
+ "gemini_fast" — Gemini answered directly from context summary.
23
+ "rag" — full retrieve + rerank + Groq path.
24
+ "blocked" — guard rejected the query.
25
+
26
+ data_prep.py filters to path=="rag" when building reranker triplets because
27
+ only RAG interactions have chunk associations for valid training pairs.
28
  """
29
 
30
  def _write_to_sqlite(state: PipelineState) -> int:
 
44
  [{"text": c["text"], "doc_id": c["metadata"]["doc_id"]}
45
  for c in state.get("reranked_chunks", [])]
46
  )
47
+ path = state.get("path") or "rag"
48
 
49
  with sqlite3.connect(db_path) as conn:
50
  conn.execute(
 
60
  reranked_chunks_json TEXT,
61
  latency_ms INTEGER,
62
  cached BOOLEAN,
63
+ feedback INTEGER DEFAULT 0,
64
+ path TEXT DEFAULT 'rag'
65
  )
66
  """
67
  )
 
70
  ("reranked_chunks_json", "TEXT DEFAULT '[]'"),
71
  ("feedback", "INTEGER DEFAULT 0"),
72
  ("session_id", "TEXT DEFAULT ''"),
73
+ # path column: old rows default to "rag" — they were all RAG interactions.
74
+ ("path", "TEXT DEFAULT 'rag'"),
75
  ]:
76
  try:
77
  conn.execute(f"ALTER TABLE interactions ADD COLUMN {col} {definition}")
 
81
  cursor = conn.execute(
82
  """
83
  INSERT INTO interactions
84
+ (timestamp, session_id, query, answer, chunks_used, rerank_scores, reranked_chunks_json, latency_ms, cached, path)
85
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
86
  """,
87
  (
88
+ datetime.now(tz=timezone.utc).isoformat(),
89
  state.get("session_id", ""),
90
  state.get("query", ""),
91
  state.get("answer", ""),
 
94
  reranked_chunks_json,
95
  state.get("latency_ms", 0),
96
  state.get("cached", False),
97
+ path,
98
  ),
99
  )
100
  return cursor.lastrowid # type: ignore[return-value]
 
102
  async def log_eval_node(state: PipelineState) -> dict:
103
  try:
104
  row_id = _write_to_sqlite(state)
105
+
106
+ # Append to durable GitHub log (fire-and-forget — never blocks the response).
107
+ if github_log is not None and github_log.enabled:
108
+ path = state.get("path") or "rag"
109
+ record = {
110
+ "timestamp": datetime.now(tz=timezone.utc).isoformat(),
111
+ "session_id": state.get("session_id", ""),
112
+ "query": state.get("query", ""),
113
+ "answer": state.get("answer", ""),
114
+ "chunks_used": json.loads(
115
+ json.dumps([c["metadata"]["doc_id"] for c in state.get("reranked_chunks", [])])
116
+ ),
117
+ "reranked_chunks_json": [
118
+ {"text": c["text"], "doc_id": c["metadata"]["doc_id"]}
119
+ for c in state.get("reranked_chunks", [])
120
+ ],
121
+ "rerank_scores": [
122
+ c["metadata"].get("rerank_score", 0.0)
123
+ for c in state.get("reranked_chunks", [])
124
+ ],
125
+ "latency_ms": state.get("latency_ms", 0),
126
+ "cached": state.get("cached", False),
127
+ "feedback": 0,
128
+ "path": path,
129
+ }
130
+ github_log.append(record)
131
+
132
  return {"interaction_id": row_id}
133
  except Exception as e:
134
  # Log but never surface to user — this node is a sink.
app/pipeline/nodes/retrieve.py CHANGED
@@ -17,10 +17,27 @@ from app.services.sparse_encoder import SparseEncoder
17
  # passages that answer tech-stack or experience questions.
18
  _MIN_TOP_SCORE: float = -3.5
19
 
20
- # Cap the number of chunks taken from any single source document after reranking.
21
  # Without this, a verbose doc can crowd out all 5 context slots, hiding other
22
  # relevant sources and making the answer look one-dimensional.
23
- _MAX_CHUNKS_PER_DOC: int = 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  # RRF rank fusion constant. k=60 is the original Cormack et al. default.
26
  # Higher k reduces the influence of top-1 rank advantage.
@@ -30,6 +47,22 @@ _RRF_K: int = 60
30
  _sparse_encoder = SparseEncoder()
31
 
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  def _rrf_merge(ranked_lists: list[list[Chunk]]) -> list[Chunk]:
34
  """
35
  Reciprocal Rank Fusion across multiple ranked chunk lists.
@@ -124,12 +157,25 @@ def make_retrieve_node(
124
  "retrieval_attempts": attempts + 1,
125
  }
126
 
127
- # ── Source diversity cap ───────────────────────────────────────────────
 
 
 
 
 
 
128
  doc_counts: dict[str, int] = {}
129
  diverse_chunks: list[Chunk] = []
130
  for chunk in reranked:
131
  doc_id = chunk["metadata"]["doc_id"]
132
- if doc_counts.get(doc_id, 0) < _MAX_CHUNKS_PER_DOC:
 
 
 
 
 
 
 
133
  diverse_chunks.append(chunk)
134
  doc_counts[doc_id] = doc_counts.get(doc_id, 0) + 1
135
 
 
17
  # passages that answer tech-stack or experience questions.
18
  _MIN_TOP_SCORE: float = -3.5
19
 
20
+ # Default cap: max chunks per source document for BROAD queries.
21
  # Without this, a verbose doc can crowd out all 5 context slots, hiding other
22
  # relevant sources and making the answer look one-dimensional.
23
+ _MAX_CHUNKS_PER_DOC_BROAD: int = 2
24
+
25
+ # For FOCUSED queries, the matching source is allowed more depth (4 slots)
26
+ # while all other sources are capped at 1. This prevents the cap from
27
+ # removing the 3rd-most-relevant resume section on an experience question.
28
+ _MAX_CHUNKS_PER_DOC_FOCUSED: int = 4
29
+ _MAX_CHUNKS_OTHER_FOCUSED: int = 1
30
+
31
+ # Keywords that imply the visitor wants depth from a specific source type.
32
+ # Values are the source_type values set by ingest (ChunkMetadata.source_type).
33
+ _FOCUS_KEYWORDS: dict[frozenset[str], str] = {
34
+ frozenset({"experience", "work", "job", "role", "career", "internship",
35
+ "skills", "skill", "education", "degree", "university",
36
+ "certification", "certifications", "qualified", "resume", "cv"}): "cv",
37
+ frozenset({"project", "built", "build", "developed", "architecture",
38
+ "system", "platform", "app", "application"}): "project",
39
+ frozenset({"blog", "post", "article", "wrote", "writing", "published"}): "blog",
40
+ }
41
 
42
  # RRF rank fusion constant. k=60 is the original Cormack et al. default.
43
  # Higher k reduces the influence of top-1 rank advantage.
 
47
  _sparse_encoder = SparseEncoder()
48
 
49
 
50
def _focused_source_type(query: str) -> str | None:
    """
    Return the source_type that the query is focused on, or None for broad queries.

    A query is focused when it contains at least one keyword that strongly implies
    a specific content source (resume, project pages, blog posts). Broad queries
    that don't match any category retain the 2-per-doc default cap so no single
    source dominates the 5 context slots.

    Fix: the previous tokenisation was a bare ``query.lower().split()``, so any
    trailing punctuation defeated the match — "tell me about your skills?" yielded
    the token "skills?" which never intersects the keyword sets. Tokens are now
    stripped of surrounding punctuation before the set intersection.
    """
    tokens = frozenset(
        token.strip(".,!?;:'\"()[]") for token in query.lower().split()
    )
    for keyword_set, source_type in _FOCUS_KEYWORDS.items():
        if tokens & keyword_set:
            return source_type
    return None
64
+
65
+
66
  def _rrf_merge(ranked_lists: list[list[Chunk]]) -> list[Chunk]:
67
  """
68
  Reciprocal Rank Fusion across multiple ranked chunk lists.
 
157
  "retrieval_attempts": attempts + 1,
158
  }
159
 
160
+ # ── Source diversity cap (query-aware) ─────────────────────────────────
161
+ # Broad queries: max 2 chunks per source document (anti-resume-monopoly).
162
+ # Focused queries (experience, skills, project, blog): raise the cap for
163
+ # the matching source type to 4, cap everything else at 1. This lets
164
+ # the resume fill appropriately on "what is Darshan's work experience?"
165
+ # without harming answer quality on broad queries.
166
+ focused_type = _focused_source_type(query)
167
  doc_counts: dict[str, int] = {}
168
  diverse_chunks: list[Chunk] = []
169
  for chunk in reranked:
170
  doc_id = chunk["metadata"]["doc_id"]
171
+ src_type = chunk["metadata"].get("source_type", "")
172
+ if focused_type and src_type == focused_type:
173
+ cap = _MAX_CHUNKS_PER_DOC_FOCUSED
174
+ elif focused_type:
175
+ cap = _MAX_CHUNKS_OTHER_FOCUSED
176
+ else:
177
+ cap = _MAX_CHUNKS_PER_DOC_BROAD
178
+ if doc_counts.get(doc_id, 0) < cap:
179
  diverse_chunks.append(chunk)
180
  doc_counts[doc_id] = doc_counts.get(doc_id, 0) + 1
181
 
app/security/sanitizer.py CHANGED
@@ -1,12 +1,25 @@
1
- import re
2
- from typing import Optional
 
 
 
 
3
 
4
- try:
5
- from presidio_analyzer import AnalyzerEngine
6
- except ImportError:
7
- AnalyzerEngine = None
 
 
 
 
8
 
9
- _analyzer = None
 
 
 
 
 
10
 
11
  # LLM token delimiters that attackers embed in queries to escape the system prompt
12
  # or inject new instructions. Strip them before any further processing.
@@ -20,15 +33,23 @@ _RE_INJECT_TOKENS = re.compile(
20
  re.IGNORECASE,
21
  )
22
 
23
-
24
- def get_analyzer() -> Optional["AnalyzerEngine"]:
25
- global _analyzer
26
- if _analyzer is None and AnalyzerEngine is not None:
27
- try:
28
- _analyzer = AnalyzerEngine()
29
- except Exception:
30
- _analyzer = None
31
- return _analyzer
 
 
 
 
 
 
 
 
32
 
33
 
34
  def sanitize_input(text: str) -> str:
@@ -54,40 +75,19 @@ def sanitize_input(text: str) -> str:
54
 
55
  def redact_pii(text: str) -> str:
56
  """
57
- Use presidio_analyzer.AnalyzerEngine with language="en".
58
- Detect EMAIL_ADDRESS, PHONE_NUMBER, UK_NHS, IBAN_CODE, PERSON.
59
- Replace detected spans with [REDACTED].
 
 
 
 
 
60
  """
61
  if not text:
62
  return text
63
 
64
- analyzer = get_analyzer()
65
- if not analyzer:
66
- # Failsafe if Presidio isn't installed/working
67
- return text
68
-
69
- # PERSON is intentionally excluded: visitors are expected to name Darshan Chheda
70
- # in their queries. Redacting that breaks retrieval and confuses the LLM.
71
- # We only protect against visitor PII that could leak into logs (e-mail, phone, etc.).
72
- entities = ["EMAIL_ADDRESS", "PHONE_NUMBER", "UK_NHS", "IBAN_CODE"]
73
-
74
- try:
75
- results = analyzer.analyze(text=text, entities=entities, language='en')
76
-
77
- if not results:
78
- return text
79
-
80
- # Sort results by start index in reverse order to comfortably replace without shifting
81
- # the remaining string indices.
82
- results.sort(key=lambda x: x.start, reverse=True)
83
-
84
- redacted_text = text
85
- for result in results:
86
- start = result.start
87
- end = result.end
88
- redacted_text = redacted_text[:start] + "[REDACTED]" + redacted_text[end:]
89
-
90
- return redacted_text
91
- except Exception:
92
- # Failsafe fallback
93
- return text
 
1
+ """
2
+ backend/app/security/sanitizer.py
3
+
4
+ Input sanitisation and lightweight PII redaction for user queries.
5
+
6
+ Issue 4 resolution: Presidio was replaced with six compiled regex patterns.
7
 
8
+ WHY Presidio was removed
9
+ ─────────────────────────
10
+ Presidio uses spaCy-based NLP internally: named entity recognition, pattern
11
+ matching, and context analysis. This added 50-100ms to every request before
12
+ any business logic ran. For a personal portfolio chatbot, the realistic PII
13
+ risk is near zero — no legitimate user submits their credit card number or SSN
14
+ to a developer's portfolio assistant. The threat model does not justify the
15
+ latency cost or the large spaCy model in the Docker image.
16
 
17
+ Six regex patterns cover every plausible PII type for this use case and run
18
+ in microseconds, not milliseconds. If Presidio is ever reconsidered, the
19
+ latency cost must be measured and documented before reintroduction.
20
+ DO NOT reintroduce Presidio or spaCy without explicit justification.
21
+ """
22
+ import re
23
 
24
  # LLM token delimiters that attackers embed in queries to escape the system prompt
25
  # or inject new instructions. Strip them before any further processing.
 
33
  re.IGNORECASE,
34
  )
35
 
36
# Six compiled patterns covering plausible PII in portfolio chatbot input.
# None of these patterns use capturing groups — each full match is replaced
# wholesale with "[REDACTED]" by redact_pii() below.
# Patterns are ordered cheapest-first (no backtracking before complex ones).
_PII_PATTERNS: tuple[re.Pattern, ...] = (
    # Email address
    re.compile(r"\b[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}\b"),
    # IPv4 address (before phone to avoid 4-octet false positives in phone patterns)
    re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b"),
    # UK phone: 07xxx xxxxxx, +44 7xxx xxxxxx, 01xxx xxxxxx, etc.
    re.compile(r"\b(?:\+44\s?|0)(?:\d\s?){9,10}\b"),
    # UK National Insurance number: two letters, six digits, one letter (A–D)
    re.compile(r"\b[A-CEGHJ-PR-TW-Z]{2}\d{6}[A-D]\b", re.IGNORECASE),
    # UK sort code: xx-xx-xx or xxxxxx (6 digits).
    # NOTE(review): this also matches any bare 6-digit number (e.g. "123456"
    # in ordinary text) — accepted false-positive risk for this use case.
    re.compile(r"\b\d{2}[-\s]?\d{2}[-\s]?\d{2}\b"),
    # Credit card: 13–19 digit sequences with optional spaces/dashes
    re.compile(r"\b(?:\d[ \-]?){13,19}\b"),
)
53
 
54
 
55
  def sanitize_input(text: str) -> str:
 
75
 
76
def redact_pii(text: str) -> str:
    """
    Replace every PII match in ``text`` with the literal "[REDACTED]".

    Applies the six compiled patterns in _PII_PATTERNS in order: email address,
    IPv4 address, UK phone number, UK National Insurance number, UK sort code,
    and credit card number. Pure regex — no NLP model, no spaCy, no network
    calls — so it runs in microseconds per query.

    PERSON entities are intentionally not redacted: visitors are expected to
    name Darshan Chheda in their queries. Redacting that breaks retrieval.
    """
    if not text:
        return text

    redacted = text
    for rx in _PII_PATTERNS:
        redacted = rx.sub("[REDACTED]", redacted)
    return redacted
93
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app/services/conversation_store.py CHANGED
@@ -12,11 +12,18 @@ wasting significant token budget on verbatim prior answers.
12
  All reads/writes are synchronous sqlite3 (<3ms on SSD) — acceptable because:
13
  1. The call happens once at request start, outside the model call path.
14
  2. SQLite WAL mode allows concurrent readers and one writer without blocking.
 
 
 
 
 
15
  """
16
  from __future__ import annotations
17
 
 
18
  import logging
19
  import sqlite3
 
20
 
21
  logger = logging.getLogger(__name__)
22
 
@@ -35,8 +42,9 @@ class ConversationStore:
35
  One instance is created at startup and shared across all requests via app.state.
36
  """
37
 
38
- def __init__(self, db_path: str) -> None:
39
  self._db_path = db_path
 
40
 
41
  def get_recent(self, session_id: str, max_turns: int = _DEFAULT_MAX_TURNS) -> list[dict]:
42
  """
@@ -75,9 +83,12 @@ class ConversationStore:
75
 
76
  def mark_last_negative(self, session_id: str) -> None:
77
  """
78
- Set feedback=-1 on the most recent interaction for `session_id`.
79
- Called when the current user message clearly criticises the previous answer.
80
- This feeds the self-improvement loop in data_prep.py / purge_bad_chunks.py.
 
 
 
81
  """
82
  try:
83
  with sqlite3.connect(self._db_path) as conn:
@@ -94,4 +105,88 @@ class ConversationStore:
94
  (session_id,),
95
  )
96
  except Exception as exc:
97
- logger.warning("ConversationStore.mark_last_negative failed: %s", exc)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  All reads/writes are synchronous sqlite3 (<3ms on SSD) — acceptable because:
13
  1. The call happens once at request start, outside the model call path.
14
  2. SQLite WAL mode allows concurrent readers and one writer without blocking.
15
+
16
+ Issue 1: mark_last_negative() now also fires github_log.append_feedback() so
17
+ negative labels persist across HF Space restarts. Without this, negative
18
+ examples accumulated during a session are lost on the next restart, and
19
+ data_prep.py cannot produce accurate hard-negative training triplets.
20
  """
21
  from __future__ import annotations
22
 
23
+ import json
24
  import logging
25
  import sqlite3
26
+ from datetime import datetime, timezone
27
 
28
  logger = logging.getLogger(__name__)
29
 
 
42
  One instance is created at startup and shared across all requests via app.state.
43
  """
44
 
45
    def __init__(self, db_path: str, github_log=None) -> None:
        """
        Args:
            db_path: Filesystem path to the SQLite database file.
            github_log: Optional durable GitHub JSONL logger. Must expose an
                ``append_feedback(session_id, feedback)`` method (used by
                mark_last_negative). None in local dev/tests, where
                interactions stay in SQLite only.
        """
        self._db_path = db_path
        # May be None — callers check before use.
        self._github_log = github_log
48
 
49
  def get_recent(self, session_id: str, max_turns: int = _DEFAULT_MAX_TURNS) -> list[dict]:
50
  """
 
83
 
84
  def mark_last_negative(self, session_id: str) -> None:
85
  """
86
+ Set feedback=-1 on the most recent interaction for `session_id` in SQLite,
87
+ then durably record the correction in the GitHub JSONL log so the negative
88
+ label survives a HF Space restart.
89
+
90
+ data_prep.py reads {type:"feedback", feedback:-1} correction records from
91
+ the durable log and applies them when building reranker training triplets.
92
  """
93
  try:
94
  with sqlite3.connect(self._db_path) as conn:
 
105
  (session_id,),
106
  )
107
  except Exception as exc:
108
+ logger.warning("ConversationStore.mark_last_negative SQLite failed: %s", exc)
109
+
110
+ # Durable correction record — survives Space restart; not in SQLite only.
111
+ if self._github_log is not None:
112
+ self._github_log.append_feedback(session_id, feedback=-1)
113
+
114
+ def populate_from_records(self, records: list[dict]) -> None:
115
+ """
116
+ Replay interaction records from the durable GitHub log into SQLite.
117
+ Called at startup when SQLite is empty after a Space restart so conversation
118
+ history is available without requiring a full log replay on every request.
119
+
120
+ Only inserts rows for path='rag'|'gemini_fast'|'cache_hit' interactions;
121
+ skips feedback correction records (type='feedback') which are not interactions.
122
+ """
123
+ import os
124
+ db_dir = os.path.dirname(self._db_path)
125
+ if db_dir:
126
+ os.makedirs(db_dir, exist_ok=True)
127
+
128
+ interaction_records = [
129
+ r for r in records
130
+ if r.get("type") != "feedback" and r.get("query")
131
+ ]
132
+ if not interaction_records:
133
+ return
134
+
135
+ try:
136
+ with sqlite3.connect(self._db_path) as conn:
137
+ conn.execute(
138
+ """
139
+ CREATE TABLE IF NOT EXISTS interactions (
140
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
141
+ timestamp TEXT,
142
+ session_id TEXT,
143
+ query TEXT,
144
+ answer TEXT,
145
+ chunks_used TEXT,
146
+ rerank_scores TEXT,
147
+ reranked_chunks_json TEXT,
148
+ latency_ms INTEGER,
149
+ cached BOOLEAN,
150
+ feedback INTEGER DEFAULT 0,
151
+ path TEXT DEFAULT 'rag'
152
+ )
153
+ """
154
+ )
155
+ # Apply feedback corrections: build a map session_id -> feedback
156
+ # so they can be applied when inserting the matching interactions.
157
+ feedback_corrections: dict[str, int] = {}
158
+ for r in records:
159
+ if r.get("type") == "feedback":
160
+ feedback_corrections[r["session_id"]] = r.get("feedback", 0)
161
+
162
+ for r in interaction_records:
163
+ sid = r.get("session_id", "")
164
+ feedback = feedback_corrections.get(sid, r.get("feedback", 0))
165
+ conn.execute(
166
+ """
167
+ INSERT INTO interactions
168
+ (timestamp, session_id, query, answer, chunks_used,
169
+ rerank_scores, reranked_chunks_json, latency_ms, cached, feedback, path)
170
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
171
+ """,
172
+ (
173
+ r.get("timestamp", datetime.now(tz=timezone.utc).isoformat()),
174
+ sid,
175
+ r.get("query", ""),
176
+ r.get("answer", ""),
177
+ json.dumps(r.get("chunks_used", [])),
178
+ json.dumps(r.get("rerank_scores", [])),
179
+ json.dumps(r.get("reranked_chunks_json", [])),
180
+ r.get("latency_ms", 0),
181
+ r.get("cached", False),
182
+ feedback,
183
+ r.get("path", "rag"),
184
+ ),
185
+ )
186
+ logger.info(
187
+ "Reconstructed %d interactions from durable GitHub log into SQLite.",
188
+ len(interaction_records),
189
+ )
190
+ except Exception as exc:
191
+ logger.warning("ConversationStore.populate_from_records failed: %s", exc)
192
+
app/services/github_log.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ backend/app/services/github_log.py
3
+
4
+ Durable interaction log backed by a JSONL file in the PersonaBot GitHub repo.
5
+
6
+ HuggingFace Spaces free tier destroys in-Space storage (SQLite, /data/) on every
7
+ restart, maintenance window, and idle reclamation. Every interaction written only
8
+ to SQLite is silently reset to zero — the self-improvement loop accumulates nothing
9
+ across restarts.
10
+
11
+ This service appends each interaction as a single JSON line to a committed file in
12
+ the PersonaBot repo via the GitHub Contents API, using PERSONABOT_WRITE_TOKEN. The
13
+ file survives Space restarts because it lives in Git, not on the Space filesystem.
14
+
15
+ On Space startup, if SQLite is empty (< 10 rows), the last 500 lines are fetched from
16
+ this file and replayed into SQLite so conversation history and training signals are
17
+ available immediately without a full log replay on every request.
18
+
19
+ Negative feedback (mark_last_negative) is durably recorded by appending a correction
20
+ record {type:"feedback", feedback:-1, session_id:...} that data_prep.py interprets when
21
+ building training triplets.
22
+
23
+ Failure modes
24
+ ─────────────
25
+ If the GitHub API call fails (rate limit, network error, 409 SHA conflict), the error
26
+ is logged at WARNING level and the interaction is NOT lost — it is always written to
27
+ SQLite first. The durable log is a best-effort durability layer, not a primary store.
28
+ """
29
+ from __future__ import annotations
30
+
31
+ import asyncio
32
+ import base64
33
+ import json
34
+ import logging
35
+ from datetime import datetime, timezone
36
+
37
+ import httpx
38
+
39
+ logger = logging.getLogger(__name__)
40
+
41
# Fixed path inside the PersonaBot repository. The retrain workflow reads this
# file directly from the repo checkout — no admin endpoint download required.
_LOG_PATH = "data/interactions.jsonl"
# Per-call timeout in seconds for GitHub Contents API requests (httpx timeout).
_API_TIMEOUT = 20


class GithubLog:
    """
    Append-only JSONL log backed by the PersonaBot GitHub repo.

    All writes are fire-and-forget background tasks so they never add latency
    to the SSE stream. This object is created once at startup and shared
    across all requests via app.state.github_log.

    Fix: tasks created with asyncio.create_task() are now held in
    ``self._bg_tasks`` until they finish. The event loop keeps only weak
    references to tasks, so a task whose result is discarded can be
    garbage-collected mid-flight — silently dropping the append. The asyncio
    docs explicitly require saving a reference to the created task.
    """

    def __init__(self, write_token: str, repo: str) -> None:
        """
        Args:
            write_token: Fine-grained PAT with read+write Contents access on
                `repo`. Empty string disables durable logging (see `enabled`).
            repo: "owner/name" slug of the PersonaBot repository.
        """
        self._token = write_token
        self._repo = repo
        self._api_url = f"https://api.github.com/repos/{repo}/contents/{_LOG_PATH}"
        self._headers = {
            "Authorization": f"Bearer {write_token}",
            "Accept": "application/vnd.github+json",
        }
        # Strong references to in-flight background tasks (see class docstring).
        self._bg_tasks: set = set()

    @property
    def enabled(self) -> bool:
        # Durable logging is active only when a write token was configured.
        return bool(self._token)

    def _schedule(self, record: dict) -> None:
        """Create a background append task and hold a strong ref until done."""
        task = asyncio.create_task(self._append_bg(record))
        self._bg_tasks.add(task)
        task.add_done_callback(self._bg_tasks.discard)

    def append(self, record: dict) -> None:
        """
        Schedule a background task to append `record` to the durable JSONL log.
        Returns immediately — never blocks the request path.
        """
        if not self.enabled:
            return
        # asyncio.create_task requires a running event loop; log_eval is async so this is safe.
        self._schedule(record)

    async def _append_bg(self, record: dict) -> None:
        """
        Read-modify-write append of one JSON line via the GitHub Contents API.
        Failures are logged at WARNING and swallowed — the interaction is
        already safe in SQLite; this layer is best-effort durability only.
        """
        try:
            async with httpx.AsyncClient(timeout=_API_TIMEOUT) as client:
                get_r = await client.get(self._api_url, headers=self._headers)
                if get_r.status_code == 200:
                    data = get_r.json()
                    sha: str | None = data["sha"]
                    # NOTE(review): the Contents API omits inline content for
                    # files over ~1 MB — confirm rotation happens before that.
                    current = base64.b64decode(
                        data["content"].replace("\n", "")
                    ).decode("utf-8")
                elif get_r.status_code == 404:
                    # First-ever append: file does not exist yet.
                    sha = None
                    current = ""
                else:
                    logger.warning(
                        "GithubLog GET failed (%d) — interaction not logged durably.",
                        get_r.status_code,
                    )
                    return

                new_content = current.rstrip("\n") + "\n" + json.dumps(record) + "\n"
                encoded = base64.b64encode(new_content.encode("utf-8")).decode("ascii")
                payload: dict = {
                    "message": "log: append interaction [skip ci]",
                    "content": encoded,
                }
                if sha:
                    payload["sha"] = sha

                put_r = await client.put(
                    self._api_url, headers=self._headers, json=payload
                )
                if put_r.status_code not in (200, 201):
                    # 409 = SHA conflict (two concurrent appends) — rare for a portfolio bot.
                    # The interaction is safe in SQLite; this is a best-effort durability layer.
                    logger.warning(
                        "GithubLog PUT failed (%d) — interaction not logged durably.",
                        put_r.status_code,
                    )
        except Exception as exc:
            logger.warning("GithubLog.append error: %s", exc)

    async def load_recent(self, n: int = 500) -> list[dict]:
        """
        Fetch the last `n` interaction records from the durable log.
        Used at Space startup to reconstruct SQLite after an ephemeral restart.
        Returns [] if the file doesn't exist or if the token is not configured.
        """
        if not self.enabled:
            return []
        try:
            async with httpx.AsyncClient(timeout=_API_TIMEOUT) as client:
                r = await client.get(self._api_url, headers=self._headers)
                if r.status_code == 404:
                    return []
                if r.status_code != 200:
                    logger.warning("GithubLog.load_recent GET failed (%d).", r.status_code)
                    return []
                content = base64.b64decode(
                    r.json()["content"].replace("\n", "")
                ).decode("utf-8")
                lines = [ln.strip() for ln in content.splitlines() if ln.strip()]
                records: list[dict] = []
                for line in lines[-n:]:
                    try:
                        records.append(json.loads(line))
                    except json.JSONDecodeError:
                        # Skip corrupt lines rather than abort the whole replay.
                        pass
                return records
        except Exception as exc:
            logger.warning("GithubLog.load_recent error: %s", exc)
            return []

    def append_feedback(self, session_id: str, feedback: int) -> None:
        """
        Durably record a feedback update without rewriting an existing line.
        data_prep.py applies these correction records when building triplets.
        """
        if not self.enabled:
            return
        record = {
            "type": "feedback",
            "session_id": session_id,
            "feedback": feedback,
            "timestamp": datetime.now(tz=timezone.utc).isoformat(),
        }
        self._schedule(record)
app/services/llm_client.py CHANGED
@@ -1,5 +1,6 @@
1
  import json
2
- from typing import AsyncIterator, Literal, Protocol
 
3
 
4
  import httpx
5
  from groq import AsyncGroq
@@ -9,6 +10,41 @@ from app.core.config import Settings
9
  from app.core.exceptions import GenerationError
10
 
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  class LLMClient(Protocol):
13
  async def complete(self, prompt: str, system: str, stream: bool) -> AsyncIterator[str]:
14
  ...
@@ -21,7 +57,7 @@ class LLMClient(Protocol):
21
 
22
 
23
  class GroqClient:
24
- def __init__(self, api_key: str, model_default: str, model_large: str):
25
  if not api_key or api_key == "gsk_placeholder":
26
  # We might be initialized in a test context without a real key
27
  self.client = None
@@ -30,6 +66,8 @@ class GroqClient:
30
 
31
  self.model_default = model_default
32
  self.model_large = model_large
 
 
33
 
34
  @retry(stop=stop_after_attempt(2), wait=wait_fixed(1.0), retry=retry_if_exception_type((httpx.RequestError, httpx.TimeoutException)))
35
  async def classify_complexity(self, query: str) -> Literal["simple", "complex"]:
@@ -86,12 +124,22 @@ class GroqClient:
86
 
87
 
88
  async def complete_with_complexity(self, prompt: str, system: str, stream: bool, complexity: str) -> AsyncIterator[str]:
89
- # Helper to allow pipeline nodes to pass the pre-classified complexity
 
 
90
  if not self.client:
91
  raise GenerationError("GroqClient not configured with an API Key.")
92
-
93
- model = self.model_large if complexity == "complex" else self.model_default
94
-
 
 
 
 
 
 
 
 
95
  try:
96
  stream_response = await self.client.chat.completions.create(
97
  messages=[
@@ -99,16 +147,22 @@ class GroqClient:
99
  {"role": "user", "content": prompt}
100
  ],
101
  model=model,
102
- stream=stream # Instruct strictly said stream=True yields token chunks.
103
  )
104
 
105
  if stream:
106
  async for chunk in stream_response:
107
  content = chunk.choices[0].delta.content
108
  if content:
 
 
 
109
  yield content
110
  else:
111
- yield stream_response.choices[0].message.content
 
 
 
112
 
113
  except Exception as e:
114
  raise GenerationError("Groq completion failed", context={"error": str(e)}) from e
@@ -186,7 +240,7 @@ class OllamaClient:
186
  yield token
187
 
188
 
189
- def get_llm_client(settings: Settings) -> LLMClient:
190
  if settings.LLM_PROVIDER == "ollama":
191
  if not settings.OLLAMA_BASE_URL or not settings.OLLAMA_MODEL:
192
  raise ValueError("OLLAMA_BASE_URL and OLLAMA_MODEL must be explicitly set when LLM_PROVIDER is 'ollama'")
@@ -199,5 +253,6 @@ def get_llm_client(settings: Settings) -> LLMClient:
199
  return GroqClient(
200
  api_key=settings.GROQ_API_KEY or "",
201
  model_default=settings.GROQ_MODEL_DEFAULT,
202
- model_large=settings.GROQ_MODEL_LARGE
 
203
  )
 
1
  import json
2
+ import time
3
+ from typing import AsyncIterator, Literal, Optional, Protocol
4
 
5
  import httpx
6
  from groq import AsyncGroq
 
10
  from app.core.exceptions import GenerationError
11
 
12
 
13
class TpmBucket:
    """
    Shared token-consumption tracker over a rolling 60-second window.

    Issue 7: once more than 12,000 estimated tokens have been recorded inside
    the current minute window, complete_with_complexity() silently downgrades
    70B calls to 8B. That leaves 2,400 TPM of headroom and prevents hard
    rate-limit failures (HTTP 429) from degrading the service under load.

    Token counts are coarse estimates (prompt_chars / 4) — good enough for
    load shedding, which is the bucket's only job; this is not accounting.
    """

    _WINDOW_SECONDS: int = 60
    _DOWNGRADE_THRESHOLD: int = 12_000

    def __init__(self) -> None:
        self._count: int = 0
        self._window_start: float = time.monotonic()

    def _window_expired(self, now: float) -> bool:
        """True when the current minute window has fully elapsed."""
        return now - self._window_start >= self._WINDOW_SECONDS

    def add(self, estimated_tokens: int) -> None:
        """Record `estimated_tokens` consumed now, rolling the window if stale."""
        now = time.monotonic()
        if self._window_expired(now):
            # Stale window: start a fresh minute before accumulating.
            self._count = 0
            self._window_start = now
        self._count += estimated_tokens

    @property
    def should_downgrade(self) -> bool:
        """True when the live window already holds more than the threshold."""
        if self._window_expired(time.monotonic()):
            return False
        return self._count > self._DOWNGRADE_THRESHOLD
46
+
47
+
48
  class LLMClient(Protocol):
49
  async def complete(self, prompt: str, system: str, stream: bool) -> AsyncIterator[str]:
50
  ...
 
57
 
58
 
59
  class GroqClient:
60
+ def __init__(self, api_key: str, model_default: str, model_large: str, tpm_bucket: Optional[TpmBucket] = None):
61
  if not api_key or api_key == "gsk_placeholder":
62
  # We might be initialized in a test context without a real key
63
  self.client = None
 
66
 
67
  self.model_default = model_default
68
  self.model_large = model_large
69
+ # Shared TPM bucket — injected at startup, None in test contexts.
70
+ self._tpm_bucket = tpm_bucket
71
 
72
  @retry(stop=stop_after_attempt(2), wait=wait_fixed(1.0), retry=retry_if_exception_type((httpx.RequestError, httpx.TimeoutException)))
73
  async def classify_complexity(self, query: str) -> Literal["simple", "complex"]:
 
124
 
125
 
126
  async def complete_with_complexity(self, prompt: str, system: str, stream: bool, complexity: str) -> AsyncIterator[str]:
127
+ # Helper to allow pipeline nodes to pass the pre-classified complexity.
128
+ # Issue 7: if the shared TPM bucket is above 12,000 tokens in the current
129
+ # minute window, downgrade 70B to 8B to prevent hard rate-limit failures.
130
  if not self.client:
131
  raise GenerationError("GroqClient not configured with an API Key.")
132
+
133
+ if complexity == "complex" and self._tpm_bucket is not None and self._tpm_bucket.should_downgrade:
134
+ model = self.model_default
135
+ else:
136
+ model = self.model_large if complexity == "complex" else self.model_default
137
+
138
+ # Estimate input tokens before the call so the bucket reflects the full
139
+ # cost even when the response is long. 4 chars ≈ 1 token (rough heuristic).
140
+ if self._tpm_bucket is not None:
141
+ self._tpm_bucket.add((len(prompt) + len(system)) // 4)
142
+
143
  try:
144
  stream_response = await self.client.chat.completions.create(
145
  messages=[
 
147
  {"role": "user", "content": prompt}
148
  ],
149
  model=model,
150
+ stream=stream # Instruct strictly said stream=True yields token chunks.
151
  )
152
 
153
  if stream:
154
  async for chunk in stream_response:
155
  content = chunk.choices[0].delta.content
156
  if content:
157
+ # Accumulate estimated response tokens in the bucket.
158
+ if self._tpm_bucket is not None:
159
+ self._tpm_bucket.add(len(content) // 4 or 1)
160
  yield content
161
  else:
162
+ full = stream_response.choices[0].message.content
163
+ if self._tpm_bucket is not None and full:
164
+ self._tpm_bucket.add(len(full) // 4)
165
+ yield full
166
 
167
  except Exception as e:
168
  raise GenerationError("Groq completion failed", context={"error": str(e)}) from e
 
240
  yield token
241
 
242
 
243
+ def get_llm_client(settings: Settings, tpm_bucket: Optional[TpmBucket] = None) -> LLMClient:
244
  if settings.LLM_PROVIDER == "ollama":
245
  if not settings.OLLAMA_BASE_URL or not settings.OLLAMA_MODEL:
246
  raise ValueError("OLLAMA_BASE_URL and OLLAMA_MODEL must be explicitly set when LLM_PROVIDER is 'ollama'")
 
253
  return GroqClient(
254
  api_key=settings.GROQ_API_KEY or "",
255
  model_default=settings.GROQ_MODEL_DEFAULT,
256
+ model_large=settings.GROQ_MODEL_LARGE,
257
+ tpm_bucket=tpm_bucket,
258
  )
requirements.txt CHANGED
@@ -16,7 +16,9 @@ groq>=0.5.0
16
  httpx>=0.27.0
17
  numpy>=1.26.0
18
  slowapi>=0.1.9
19
- presidio-analyzer>=2.2.354
 
 
20
  tenacity>=8.3.0
21
  python-jose[cryptography]>=3.3.0
22
  google-genai>=1.0.0
 
16
  httpx>=0.27.0
17
  numpy>=1.26.0
18
  slowapi>=0.1.9
19
+ # presidio-analyzer was removed (Issue 4): spaCy-based NLP added 50-100ms to every
20
+ # request for near-zero real-world PII risk. Replaced with six compiled regex
21
+ # patterns in sanitizer.py that run in microseconds. See copilot-instructions.md.
22
  tenacity>=8.3.0
23
  python-jose[cryptography]>=3.3.0
24
  google-genai>=1.0.0