Spaces:

XQ
/

Dokumentassistent

Running

App Files Files

XQ committed on Apr 13

Commit

4d2a2da

1 Parent(s): 38d8c65

Add LLM Provider Fallback

Browse files

Files changed (24) hide show

.env.example +26 -0
src/agent/intent_classifier.py +2 -10
src/agent/plan_and_execute.py +10 -52
src/agent/prompts/__init__.py +10 -0
src/agent/prompts/broaden_query.v1.yaml +8 -0
src/agent/prompts/detect_language_and_intent.v1.yaml +13 -0
src/agent/prompts/detect_languages.v1.yaml +8 -0
src/agent/prompts/executor_system.v1.yaml +10 -0
src/agent/prompts/intent_classify.v1.yaml +13 -0
src/agent/prompts/multi_query_decompose.v1.yaml +9 -0
src/agent/prompts/planner.v1.yaml +35 -0
src/agent/prompts/registry.py +165 -0
src/agent/prompts/summarize_document.v1.yaml +16 -0
src/agent/prompts/synthesizer.v1.yaml +14 -0
src/agent/prompts/translate_query.v1.yaml +7 -0
src/agent/router.py +15 -21
src/agent/token_budget.py +87 -0
src/agent/tools.py +10 -24
src/api/main.py +14 -2
src/config.py +27 -0
src/provider.py +92 -0
tests/test_llm_fallback.py +329 -0
tests/test_prompts_registry.py +199 -0
tests/test_token_budget.py +42 -0

.env.example CHANGED Viewed

@@ -135,3 +135,29 @@ LOG_LEVEL=INFO
 # --- Inter-service Communication (bare-metal defaults) -----------------------
 API_BASE_URL=http://localhost:8000  # Docker overrides to http://api:8000

 # --- Inter-service Communication (bare-metal defaults) -----------------------
 API_BASE_URL=http://localhost:8000  # Docker overrides to http://api:8000
+# --- Token Budget (measure-only) ---------------------------------------------
+# When true, the routers log estimated prompt token sizes at the three known
+# generation points (generate_answer, planner, synthesizer). No truncation is
+# applied — this is purely observability. Counts use tiktoken cl100k as a
+# baseline with a 1.5x safety factor for non-OpenAI multilingual tokenizers.
+# TOKEN_BUDGET_ENABLED=false
+# --- LLM Provider Fallback ---------------------------------------------------
+# When enabled, the primary LLM is wrapped with LangChain with_fallbacks so
+# requests that fail on the primary are retried against each provider in the
+# chain (left to right). DEFAULT OFF. Switching from a local privacy-aware
+# provider (Ollama) to a cloud provider (OpenAI / Anthropic / ...) has both
+# COST and DATA-EXFILTRATION implications.
+# Your requests may leave the tenant when switching from local to cloud.
+#
+# Limitations to be aware of:
+#  - Disabled automatically when AGENT_MODE=react (RunnableWithFallbacks is
+#    incompatible with bind_tools used by the react sub-agent).
+#  - Mid-stream failures are NOT covered: with_fallbacks only catches errors
+#    raised before the first token; a connection drop mid-generation will
+#    surface as an exception to the caller.
+#  - Each fallback activation is logged at WARNING level naming the destination
+#    provider — check application logs for unexpected switches.
+# LLM_FALLBACK_ENABLED=false
+# LLM_FALLBACK_PROVIDERS=openai,anthropic    # Comma-separated provider chain

src/agent/intent_classifier.py CHANGED Viewed

@@ -7,6 +7,7 @@ from langchain_core.language_models.chat_models import BaseChatModel
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import ChatPromptTemplate
 from src.models import IntentType
 logger = logging.getLogger(__name__)
@@ -16,16 +17,7 @@ _THINK_UNCLOSED_RE = re.compile(r"<think>.*", re.DOTALL)
 _VALID_INTENTS = {intent.value for intent in IntentType}
-_SYSTEM_PROMPT = (
-    "You are an intent classifier. Given a user query, classify it into exactly "
-    "one of the following categories: factual, summary, comparison, procedural, unknown.\n\n"
-    "- factual: the user asks for a specific fact or piece of information.\n"
-    "- summary: the user wants a summary or overview of a topic.\n"
-    "- comparison: the user wants to compare two or more things.\n"
-    "- procedural: the user asks how to do something step by step.\n"
-    "- unknown: the query does not fit any of the above.\n\n"
-    "Respond with ONLY the category name in lowercase, nothing else."
-)
 class IntentClassifier:

 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import ChatPromptTemplate
+from src.agent.prompts import get_prompt
 from src.models import IntentType
 logger = logging.getLogger(__name__)
 _VALID_INTENTS = {intent.value for intent in IntentType}
+_SYSTEM_PROMPT = get_prompt("intent_classify").template
 class IntentClassifier:

src/agent/plan_and_execute.py CHANGED Viewed

@@ -26,6 +26,8 @@ from langgraph.graph import END, StateGraph
 from langgraph.prebuilt import create_react_agent
 from src.agent.memory import ConversationMemory
 from src.agent.tools import ToolResultStore, detect_document_languages, make_retrieval_tools
 from src.models import GenerationResponse, IntentType, PipelineDetails
 from src.retrieval.hybrid import HybridRetriever
@@ -37,60 +39,12 @@ logger = logging.getLogger(__name__)
 _MAX_STEPS = 6
 # ------------------------------------------------------------------
-# Prompts
 # ------------------------------------------------------------------
-_PLANNER_PROMPT = (
-    "You are a planning assistant for the University of Copenhagen (KU) document system.\n\n"
-    "Given a user question, produce a JSON list of 1–4 steps needed to answer it.\n"
-    "Each step is an object with:\n"
-    '  - "action": one of "search", "search_within", "multi_search", '
-    '"summarize", "list_docs", "fetch_doc"\n'
-    '  - "detail": a short description of what to do (e.g. the search query, document ID)\n\n'
-    "Rules:\n"
-    "- IMPORTANT: Most questions probably only need 1 step. Only use 2+ steps when the question explicitly asks about multiple distinct topics.\n"
-    "- For simple factual questions: 1 search step is enough.\n"
-    "- For comparison questions: use multi_search or separate search steps.\n"
-    "- For document overview requests: use summarize.\n"
-    "- For questions with multiple aspects: use 2–4 separate steps.\n"
-    "- Always end with the steps needed; do NOT include a final 'answer' step.\n\n"
-    "Reply with ONLY the JSON array, nothing else. No explanation, no thinking.\n\n"
-    "Examples:\n"
-    'Question: "What is the exam policy?"\n'
-    '[{"action": "search", "detail": "KU eksamensregler"}]\n\n'
-    'Question: "Compare vacation rules for academic vs administrative staff"\n'
-    '[{"action": "search", "detail": "ferieregler videnskabeligt personale"}, '
-    '{"action": "search", "detail": "ferieregler administrativt personale"}]\n\n'
-    'Question: "Summarize the AI policy document"\n'
-    '[{"action": "summarize", "detail": "ku_ai_policy.pdf"}]\n\n'
-    'Question: "Which documents are about AI? Summarize and find the rules for written exams"\n'
-    '[{"action": "list_docs", "detail": "list all available documents"}, '
-    '{"action": "search", "detail": "AI dokumenter KU"}, '
-    '{"action": "search", "detail": "regler skriftlige opgaver eksamen GAI"}]\n\n'
-    "Now plan for this question:\n"
-)
-_EXECUTOR_SYSTEM = (
-    "/no_think\n"
-    "You are executing ONE step of a plan to answer a user's question about "
-    "University of Copenhagen (KU) documents.\n\n"
-    "You have retrieval tools available. Execute the step described below, "
-    "then summarise what you found in 2-3 sentences. If you find nothing "
-    "relevant, say so clearly.\n\n"
-    "Do NOT produce a final answer — just report what you found for this step."
-)
-_SYNTHESIZER_PROMPT = (
-    "You are a helpful assistant for administrative staff at the University "
-    "of Copenhagen (KU).\n\n"
-    "Below are the results gathered from multiple research steps. "
-    "Synthesize them into a single coherent answer to the user's original question.\n\n"
-    "Guidelines:\n"
-    "- Cite document sources using [1], [2], etc.\n"
-    "- Answer in the same language as the user's question.\n"
-    "- Be concise but thorough.\n"
-    "- If some steps found no results, acknowledge gaps honestly.\n\n"
-)
 # ------------------------------------------------------------------
@@ -146,6 +100,7 @@ class PlanAndExecuteRouter:
         default_top_k: int = 5,
         memory: ConversationMemory | None = None,
         document_languages: list[str] | None = None,
     ) -> None:
         """Initialise the Plan-and-Execute router.
@@ -172,6 +127,7 @@ class PlanAndExecuteRouter:
         self._document_languages: list[str] | None = (
             list(document_languages) if document_languages else None
         )
     def _ensure_document_languages(self) -> list[str]:
         """Lazily detect and cache the document corpus languages via the LLM.
@@ -202,6 +158,7 @@ class PlanAndExecuteRouter:
                 f"{history}\n\n"
             )
         prompt = _PLANNER_PROMPT + history_section + f'Question: "{state["query"]}"'
         raw = _extract_content(self._llm.invoke(prompt))
         logger.info("Planner raw output: %s", raw)
@@ -284,6 +241,7 @@ class PlanAndExecuteRouter:
             f"Research results:\n{gathered}\n\n"
             f"Answer:"
         )
         answer = _extract_content(self._llm.invoke(prompt))
         logger.info("Synthesized final answer (%d chars)", len(answer))
         return {"answer": answer}

 from langgraph.prebuilt import create_react_agent
 from src.agent.memory import ConversationMemory
+from src.agent.prompts import get_prompt
+from src.agent.token_budget import measure as _measure_tokens
 from src.agent.tools import ToolResultStore, detect_document_languages, make_retrieval_tools
 from src.models import GenerationResponse, IntentType, PipelineDetails
 from src.retrieval.hybrid import HybridRetriever
 _MAX_STEPS = 6
 # ------------------------------------------------------------------
+# Prompts (loaded from src/agent/prompts/*.yaml)
 # ------------------------------------------------------------------
+_PLANNER_PROMPT = get_prompt("planner").template
+_EXECUTOR_SYSTEM = get_prompt("executor_system").template
+_SYNTHESIZER_PROMPT = get_prompt("synthesizer").template
 # ------------------------------------------------------------------
         default_top_k: int = 5,
         memory: ConversationMemory | None = None,
         document_languages: list[str] | None = None,
+        token_budget_enabled: bool = False,
     ) -> None:
         """Initialise the Plan-and-Execute router.
         self._document_languages: list[str] | None = (
             list(document_languages) if document_languages else None
         )
+        self._token_budget_enabled = token_budget_enabled
     def _ensure_document_languages(self) -> list[str]:
         """Lazily detect and cache the document corpus languages via the LLM.
                 f"{history}\n\n"
             )
         prompt = _PLANNER_PROMPT + history_section + f'Question: "{state["query"]}"'
+        _measure_tokens("planner", prompt, enabled=self._token_budget_enabled)
         raw = _extract_content(self._llm.invoke(prompt))
         logger.info("Planner raw output: %s", raw)
             f"Research results:\n{gathered}\n\n"
             f"Answer:"
         )
+        _measure_tokens("synthesizer", prompt, enabled=self._token_budget_enabled)
         answer = _extract_content(self._llm.invoke(prompt))
         logger.info("Synthesized final answer (%d chars)", len(answer))
         return {"answer": answer}

src/agent/prompts/__init__.py ADDED Viewed

	@@ -0,0 +1,10 @@

+"""Centralised prompt registry.
+All user-visible LLM prompts live in ``*.yaml`` files in this directory.
+The registry loads them once, on first access, and serves ``PromptSpec``
+objects whose raw ``template`` strings callers format with
+``str.format(**kwargs)`` (or render directly via ``render_prompt``).
+"""
+from src.agent.prompts.registry import PromptRegistry, get_prompt, render_prompt
+__all__ = ["PromptRegistry", "get_prompt", "render_prompt"]

src/agent/prompts/broaden_query.v1.yaml ADDED Viewed

	@@ -0,0 +1,8 @@

+name: broaden_query
+version: v1
+description: Rewrite a retrieval query when the reranker confidence is low.
+template: |-
+  The following search query did not return good results from the document database. Rewrite it to be broader or use different keywords while keeping the same meaning. Reply with ONLY the rewritten query, nothing else.
+  Original question: {query}
+  Failed search query: {retrieval_query}

src/agent/prompts/detect_language_and_intent.v1.yaml ADDED Viewed

	@@ -0,0 +1,13 @@

+name: detect_language_and_intent
+version: v1
+description: Single-call prompt for the router that detects query language and classifies intent.
+template: |-
+  You are given a user query. Do TWO things:
+  1. Detect the language of the query (reply with the language name in English, e.g. 'Danish', 'English', 'German', 'Chinese', 'Japanese').
+  2. Classify the intent into exactly one of: {valid_intents}.
+  Reply with EXACTLY two lines, nothing else:
+  language: <language>
+  intent: <intent>
+  Query: {query}

src/agent/prompts/detect_languages.v1.yaml ADDED Viewed

	@@ -0,0 +1,8 @@

+name: detect_languages
+version: v1
+description: Ask the LLM to enumerate every language present across sampled corpus chunks.
+template: |-
+  You are a language detector. The text samples below come from different documents in a knowledge base. Identify ALL distinct languages present across the samples (do not list a language more than once). Reply with ONLY the language names in English, one per line, no explanation.
+  Samples:
+  {sample_text}

src/agent/prompts/executor_system.v1.yaml ADDED Viewed

	@@ -0,0 +1,10 @@

+name: executor_system
+version: v1
+description: System prompt for the per-step ReAct executor sub-agent in Plan-and-Execute.
+template: |-
+  /no_think
+  You are executing ONE step of a plan to answer a user's question about University of Copenhagen (KU) documents.
+  You have retrieval tools available. Execute the step described below, then summarise what you found in 2-3 sentences. If you find nothing relevant, say so clearly.
+  Do NOT produce a final answer — just report what you found for this step.

src/agent/prompts/intent_classify.v1.yaml ADDED Viewed

	@@ -0,0 +1,13 @@

+name: intent_classify
+version: v1
+description: System prompt for IntentClassifier — classifies a user query into one of five intent categories.
+template: |-
+  You are an intent classifier. Given a user query, classify it into exactly one of the following categories: factual, summary, comparison, procedural, unknown.
+  - factual: the user asks for a specific fact or piece of information.
+  - summary: the user wants a summary or overview of a topic.
+  - comparison: the user wants to compare two or more things.
+  - procedural: the user asks how to do something step by step.
+  - unknown: the query does not fit any of the above.
+  Respond with ONLY the category name in lowercase, nothing else.

src/agent/prompts/multi_query_decompose.v1.yaml ADDED Viewed

	@@ -0,0 +1,9 @@

+name: multi_query_decompose
+version: v1
+description: Decompose a complex user question into 2–4 independent sub-queries for multi_query_search.
+template: |-
+  You are a search query planner. Given a complex question, decompose it into 2-4 simple, independent search queries that together cover all aspects of the question. {lang_clause}
+  Reply with ONLY the queries, one per line, nothing else.
+  Question: {question}

src/agent/prompts/planner.v1.yaml ADDED Viewed

	@@ -0,0 +1,35 @@

+name: planner
+version: v1
+description: Planner prompt for the Plan-and-Execute router. Consumed as a raw prefix; caller appends history and question.
+template: |
+  You are a planning assistant for the University of Copenhagen (KU) document system.
+  Given a user question, produce a JSON list of 1–4 steps needed to answer it.
+  Each step is an object with:
+    - "action": one of "search", "search_within", "multi_search", "summarize", "list_docs", "fetch_doc"
+    - "detail": a short description of what to do (e.g. the search query, document ID)
+  Rules:
+  - IMPORTANT: Most questions probably only need 1 step. Only use 2+ steps when the question explicitly asks about multiple distinct topics.
+  - For simple factual questions: 1 search step is enough.
+  - For comparison questions: use multi_search or separate search steps.
+  - For document overview requests: use summarize.
+  - For questions with multiple aspects: use 2–4 separate steps.
+  - Always end with the steps needed; do NOT include a final 'answer' step.
+  Reply with ONLY the JSON array, nothing else. No explanation, no thinking.
+  Examples:
+  Question: "What is the exam policy?"
+  [{"action": "search", "detail": "KU eksamensregler"}]
+  Question: "Compare vacation rules for academic vs administrative staff"
+  [{"action": "search", "detail": "ferieregler videnskabeligt personale"}, {"action": "search", "detail": "ferieregler administrativt personale"}]
+  Question: "Summarize the AI policy document"
+  [{"action": "summarize", "detail": "ku_ai_policy.pdf"}]
+  Question: "Which documents are about AI? Summarize and find the rules for written exams"
+  [{"action": "list_docs", "detail": "list all available documents"}, {"action": "search", "detail": "AI dokumenter KU"}, {"action": "search", "detail": "regler skriftlige opgaver eksamen GAI"}]
+  Now plan for this question:

src/agent/prompts/registry.py ADDED Viewed

	@@ -0,0 +1,165 @@

+"""Prompt registry that loads YAML prompt definitions from this package.
+Each ``*.yaml`` file in this directory defines one prompt version with
+frontmatter-style fields (``name``, ``version``, ``description``,
+``template``). The registry is a read-only singleton populated at first
+access; tests can call :func:`reload` to force a refresh.
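+Templates are plain Python ``str.format`` templates. Prompts containing
+literal braces (e.g. the JSON examples in ``planner``) must be read raw via
+``PromptSpec.template`` rather than passed to ``render``. The registry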
+deliberately does NOT wrap them in a ``PromptTemplate`` — callers already
+have their own composition logic and snapshot tests guarantee that the
+rendered output is byte-identical to the pre-migration strings.
+"""
+from __future__ import annotations
+import logging
+from dataclasses import dataclass
+from pathlib import Path
+from threading import Lock
+import yaml
+logger = logging.getLogger(__name__)
+_PROMPTS_DIR = Path(__file__).parent
+@dataclass(frozen=True)
+class PromptSpec:
+    """Single loaded prompt definition.
+    Attributes:
+        name: Logical prompt name (e.g. ``intent_classify``).
+        version: Version string (e.g. ``v1``).
+        description: Short human-readable description.
+        template: The raw template string with ``{var}`` placeholders.
+        source_path: Absolute path of the YAML file this spec came from.
+    """
+    name: str
+    version: str
+    description: str
+    template: str
+    source_path: Path
+class PromptRegistry:
+    """Loads and serves prompt templates from YAML files."""
+    _instance: "PromptRegistry | None" = None
+    _lock = Lock()
+    def __init__(self, prompts_dir: Path = _PROMPTS_DIR) -> None:
+        """Initialise the registry.
+        Args:
+            prompts_dir: Directory containing prompt YAML files.
+        """
+        self._prompts_dir = prompts_dir
+        self._by_name: dict[str, dict[str, PromptSpec]] = {}
+        self._latest: dict[str, PromptSpec] = {}
+        self._load()
+    def _load(self) -> None:
+        """Scan ``prompts_dir`` and populate in-memory indices."""
+        self._by_name.clear()
+        self._latest.clear()
+        for path in sorted(self._prompts_dir.glob("*.yaml")):
+            with path.open("r", encoding="utf-8") as fh:
+                data = yaml.safe_load(fh)
+            if not isinstance(data, dict):
+                raise ValueError(f"Prompt file {path} must contain a YAML mapping")
+            missing = {"name", "version", "template"} - data.keys()
+            if missing:
+                raise ValueError(
+                    f"Prompt file {path} is missing fields: {sorted(missing)}"
+                )
+            spec = PromptSpec(
+                name=str(data["name"]),
+                version=str(data["version"]),
+                description=str(data.get("description", "")),
+                template=str(data["template"]),
+                source_path=path,
+            )
+            versions = self._by_name.setdefault(spec.name, {})
+            if spec.version in versions:
+                raise ValueError(
+                    f"Duplicate prompt {spec.name}@{spec.version} in {path}"
+                )
+            versions[spec.version] = spec
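+            # Latest wins by lexicographic version comparison — "v2" > "v1".
+            # NOTE: plain string ordering means "v10" < "v2"; zero-pad versions
+            # or compare numerically before any prompt reaches double digits.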
+            current_latest = self._latest.get(spec.name)
+            if current_latest is None or spec.version > current_latest.version:
+                self._latest[spec.name] = spec
+        logger.info(
+            "PromptRegistry loaded %d prompts from %s",
+            len(self._by_name), self._prompts_dir,
+        )
+    def get(self, name: str, version: str | None = None) -> PromptSpec:
+        """Return the :class:`PromptSpec` for ``name`` / ``version``.
+        Args:
+            name: Logical prompt name.
+            version: Specific version, or ``None`` for the latest.
+        Returns:
+            The matching :class:`PromptSpec`.
+        Raises:
+            KeyError: When no prompt / version matches.
+        """
+        if version is None:
+            spec = self._latest.get(name)
+            if spec is None:
+                raise KeyError(f"Unknown prompt: {name}")
+            return spec
+        versions = self._by_name.get(name, {})
+        if version not in versions:
+            raise KeyError(f"Unknown prompt version: {name}@{version}")
+        return versions[version]
+    def render(self, name: str, version: str | None = None, /, **kwargs: object) -> str:
+        """Fetch ``name`` and format its template with ``**kwargs``.
+        Args:
+            name: Logical prompt name.
+            version: Specific version, or ``None`` for the latest.
+            **kwargs: Template variables.
+        Returns:
+            The rendered prompt string.
+        """
+        spec = self.get(name, version)
+        return spec.template.format(**kwargs)
+    def names(self) -> list[str]:
+        """Return all registered prompt names."""
+        return sorted(self._by_name.keys())
+def _singleton() -> PromptRegistry:
+    """Return the process-wide registry, constructing it on first call."""
+    with PromptRegistry._lock:
+        if PromptRegistry._instance is None:
+            PromptRegistry._instance = PromptRegistry()
+        return PromptRegistry._instance
+def get_prompt(name: str, version: str | None = None) -> PromptSpec:
+    """Shortcut for ``_singleton().get(name, version)``."""
+    return _singleton().get(name, version)
+def render_prompt(name: str, version: str | None = None, /, **kwargs: object) -> str:
+    """Shortcut for ``_singleton().render(name, version, **kwargs)``."""
+    return _singleton().render(name, version, **kwargs)
+def reload() -> None:
+    """Force a reload of the registry — intended for tests."""
+    with PromptRegistry._lock:
+        PromptRegistry._instance = PromptRegistry()

src/agent/prompts/summarize_document.v1.yaml ADDED Viewed

	@@ -0,0 +1,16 @@

+name: summarize_document
+version: v1
+description: Produce a structured summary of a single document in the knowledge base.
+template: |-
+  Produce a structured summary of the following document. Include:
+  1. Document title/topic
+  2. Key points (3-7 bullet points)
+  3. Important rules, deadlines, or requirements mentioned
+  4. Who the document applies to
+  Write the summary in the same language as the document.
+  Document ID: {document_id}
+  Document text:
+  {full_text}

src/agent/prompts/synthesizer.v1.yaml ADDED Viewed

	@@ -0,0 +1,14 @@

+name: synthesizer
+version: v1
+description: System prefix for the Plan-and-Execute synthesizer. Caller appends history, original question, and gathered step results.
+template: |+
+  You are a helpful assistant for administrative staff at the University of Copenhagen (KU).
+  Below are the results gathered from multiple research steps. Synthesize them into a single coherent answer to the user's original question.
+  Guidelines:
+  - Cite document sources using [1], [2], etc.
+  - Answer in the same language as the user's question.
+  - Be concise but thorough.
+  - If some steps found no results, acknowledge gaps honestly.

src/agent/prompts/translate_query.v1.yaml ADDED Viewed

	@@ -0,0 +1,7 @@

+name: translate_query
+version: v1
+description: Translate the user query into the corpus language so BM25 can match.
+template: |-
+  Translate the following text to {target}. Reply with ONLY the translated text, nothing else.
+  Text: {query}

src/agent/router.py CHANGED Viewed

@@ -20,6 +20,8 @@ from langgraph.graph import END, StateGraph
 from src.models import IntentType, GenerationResponse, PipelineDetails, QueryResult
 from src.agent.intent_classifier import IntentClassifier
 from src.agent.tools import detect_document_languages
 from src.retrieval.hybrid import HybridRetriever
 from src.retrieval.reranker import Reranker
@@ -140,6 +142,7 @@ class QueryRouter:
         *,
         translate_query: bool = True,
         document_languages: list[str] | None = None,
     ) -> None:
         """Initialize the query router.
@@ -166,6 +169,7 @@ class QueryRouter:
         self._document_languages: list[str] | None = (
             list(document_languages) if document_languages else None
         )
         self._graph = self._build_graph()
     def _ensure_document_languages(self) -> list[str]:
@@ -195,16 +199,10 @@ class QueryRouter:
             Tuple of (detected_language, intent).
         """
         valid_intents = "factual, summary, comparison, procedural, unknown"
-        prompt = (
-            "You are given a user query. Do TWO things:\n"
-            "1. Detect the language of the query (reply with the language name in English, "
-            "e.g. 'Danish', 'English', 'German', 'Chinese', 'Japanese').\n"
-            "2. Classify the intent into exactly one of: "
-            f"{valid_intents}.\n\n"
-            "Reply with EXACTLY two lines, nothing else:\n"
-            "language: <language>\n"
-            "intent: <intent>\n\n"
-            f"Query: {query}"
         )
         raw = _extract_content(self._llm_chain.invoke(prompt))
         logger.debug("Combined detection raw response: %s", raw)
@@ -266,10 +264,8 @@ class QueryRouter:
             return query
         target = doc_langs[0]
-        translate_prompt = (
-            f"Translate the following text to {target}. "
-            "Reply with ONLY the translated text, nothing else.\n\n"
-            f"Text: {query}"
         )
         translated = _extract_content(self._llm_chain.invoke(translate_prompt))
         logger.info("Translated query to %s: %s", target, translated)
@@ -324,13 +320,10 @@ class QueryRouter:
         Uses the LLM to generate alternative search terms while preserving
         the original meaning, then increments the retry counter.
         """
-        prompt = (
-            "The following search query did not return good results from "
-            "the document database. Rewrite it to be broader or use "
-            "different keywords while keeping the same meaning. "
-            "Reply with ONLY the rewritten query, nothing else.\n\n"
-            f"Original question: {state['query']}\n"
-            f"Failed search query: {state['retrieval_query']}"
         )
         broadened = _extract_content(self._llm_chain.invoke(prompt))
         logger.info(
@@ -380,6 +373,7 @@ class QueryRouter:
         prompt = self._build_prompt(
             state["query"], state["intent"], context, state["user_language"]
         )
         answer = _extract_content(self._llm_chain.invoke(prompt))
         logger.info("Generated answer for intent=%s", state["intent"].value)
         return {"answer": answer}

 from src.models import IntentType, GenerationResponse, PipelineDetails, QueryResult
 from src.agent.intent_classifier import IntentClassifier
+from src.agent.prompts import render_prompt
+from src.agent.token_budget import measure as _measure_tokens
 from src.agent.tools import detect_document_languages
 from src.retrieval.hybrid import HybridRetriever
 from src.retrieval.reranker import Reranker
         *,
         translate_query: bool = True,
         document_languages: list[str] | None = None,
+        token_budget_enabled: bool = False,
     ) -> None:
         """Initialize the query router.
         self._document_languages: list[str] | None = (
             list(document_languages) if document_languages else None
         )
+        self._token_budget_enabled = token_budget_enabled
         self._graph = self._build_graph()
     def _ensure_document_languages(self) -> list[str]:
             Tuple of (detected_language, intent).
         """
         valid_intents = "factual, summary, comparison, procedural, unknown"
+        prompt = render_prompt(
+            "detect_language_and_intent",
+            valid_intents=valid_intents,
+            query=query,
         )
         raw = _extract_content(self._llm_chain.invoke(prompt))
         logger.debug("Combined detection raw response: %s", raw)
             return query
         target = doc_langs[0]
+        translate_prompt = render_prompt(
+            "translate_query", target=target, query=query
         )
         translated = _extract_content(self._llm_chain.invoke(translate_prompt))
         logger.info("Translated query to %s: %s", target, translated)
         Uses the LLM to generate alternative search terms while preserving
         the original meaning, then increments the retry counter.
         """
+        prompt = render_prompt(
+            "broaden_query",
+            query=state["query"],
+            retrieval_query=state["retrieval_query"],
         )
         broadened = _extract_content(self._llm_chain.invoke(prompt))
         logger.info(
         prompt = self._build_prompt(
             state["query"], state["intent"], context, state["user_language"]
         )
+        _measure_tokens("generate_answer", prompt, enabled=self._token_budget_enabled)
         answer = _extract_content(self._llm_chain.invoke(prompt))
         logger.info("Generated answer for intent=%s", state["intent"].value)
         return {"answer": answer}

src/agent/token_budget.py ADDED Viewed

	@@ -0,0 +1,87 @@

+"""Token counting and budget tracking for LLM prompts.
+Stage 1 (current): measure-only — counts tokens at known prompt injection
+points and logs them. No truncation is performed.
+Token counts are estimates: tiktoken's cl100k tokenizer is used as a
+provider-agnostic baseline, multiplied by a safety factor because non-OpenAI
+multilingual tokenizers (Llama, Gemma, Mistral) typically tokenize Danish /
+mixed-language text 20-40% more aggressively than cl100k.
+Provider-specific tokenizers (Ollama's /api/tokenize, HuggingFace AutoTokenizer)
+are intentionally not used here to keep this module dependency-free and
+process-local. When real usage data exposes the gap, swap in a
+provider-aware backend.
+"""
+from __future__ import annotations
+import logging
+logger = logging.getLogger(__name__)
+# Conservative scaling: cl100k under-counts multilingual / Danish text.
+# 1.5× keeps us on the safe side for budget decisions.
+_DEFAULT_SAFETY_FACTOR = 1.5
+# Fallback when tiktoken is unavailable: ~4 characters per token is the
+# common rule of thumb for English; multiplied by safety factor it's
+# usable as a coarse upper bound.
+_CHARS_PER_TOKEN_FALLBACK = 4
+try:
+    import tiktoken
+    _ENCODER = tiktoken.get_encoding("cl100k_base")
+except Exception:  # noqa: BLE001 — any tiktoken failure → heuristic
+    _ENCODER = None
+    logger.warning("tiktoken unavailable; falling back to character-based token estimation")
+def count_tokens(text: str, *, safety_factor: float = _DEFAULT_SAFETY_FACTOR) -> int:
+    """Estimate the token count of ``text``.
+    Args:
+        text: Text to measure. Empty / None-ish input returns 0.
+        safety_factor: Multiplier applied to the raw count to compensate
+            for non-OpenAI tokenizers being more aggressive on multilingual
+            text. Defaults to 1.5×.
+    Returns:
+        Estimated token count, rounded to the nearest int (halves round up).
+    """
+    if not text:
+        return 0
+    if _ENCODER is not None:
+        raw = len(_ENCODER.encode(text, disallowed_special=()))
+    else:
+        raw = max(1, len(text) // _CHARS_PER_TOKEN_FALLBACK)
+    return int(raw * safety_factor + 0.5)
+def measure(
+    prompt_name: str,
+    text: str,
+    *,
+    enabled: bool = True,
+    safety_factor: float = _DEFAULT_SAFETY_FACTOR,
+) -> int:
+    """Count tokens for ``text`` and log the result.
+    Args:
+        prompt_name: Logical name of the prompt being measured (used in
+            log lines so different injection points are easy to grep).
+        text: The fully-rendered prompt string.
+        enabled: When False, returns 0 immediately and logs nothing —
+            lets callers gate on the ``TOKEN_BUDGET_ENABLED`` flag without
+            duplicating the check.
+        safety_factor: See :func:`count_tokens`.
+    Returns:
+        Estimated token count, or 0 when ``enabled`` is False.
+    """
+    if not enabled:
+        return 0
+    count = count_tokens(text, safety_factor=safety_factor)
+    logger.info("token_budget prompt=%s tokens~=%d", prompt_name, count)
+    return count

src/agent/tools.py CHANGED Viewed

@@ -7,6 +7,7 @@ from dataclasses import dataclass, field
 from langchain_core.runnables import Runnable
 from langchain_core.tools import tool
 from src.models import QueryResult
 from src.retrieval.hybrid import HybridRetriever
 from src.retrieval.reranker import Reranker
@@ -121,14 +122,7 @@ def detect_document_languages(
     if not sample_text:
         return []
-    prompt = (
-        "You are a language detector. The text samples below come from "
-        "different documents in a knowledge base. Identify ALL distinct "
-        "languages present across the samples (do not list a language more "
-        "than once). Reply with ONLY the language names in English, one per "
-        "line, no explanation.\n\n"
-        f"Samples:\n{sample_text}"
-    )
     raw = _extract_content(llm.invoke(prompt))
     seen: set[str] = set()
@@ -414,12 +408,10 @@ def make_retrieval_tools(
             store.tool_calls.append(("multi_query_search", question))
             # Step 1: Ask LLM to decompose the question
-            decompose_prompt = (
-                "You are a search query planner. Given a complex question, "
-                "decompose it into 2-4 simple, independent search queries that "
-                f"together cover all aspects of the question. {_lang_clause}\n\n"
-                "Reply with ONLY the queries, one per line, nothing else.\n\n"
-                f"Question: {question}"
             )
             raw = _extract_content(llm_chain.invoke(decompose_prompt))
             sub_queries = [q.strip().lstrip("0123456789.-) ") for q in raw.splitlines() if q.strip()]
@@ -490,16 +482,10 @@ def make_retrieval_tools(
             if len(full_text) > max_chars:
                 full_text = full_text[:max_chars] + "\n\n[... teksten er forkortet ... (text truncated)]"
-            summary_prompt = (
-                "Produce a structured summary of the following document. "
-                "Include:\n"
-                "1. Document title/topic\n"
-                "2. Key points (3-7 bullet points)\n"
-                "3. Important rules, deadlines, or requirements mentioned\n"
-                "4. Who the document applies to\n\n"
-                "Write the summary in the same language as the document.\n\n"
-                f"Document ID: {document_id}\n\n"
-                f"Document text:\n{full_text}"
             )
             summary = _extract_content(llm_chain.invoke(summary_prompt))
             return f"Resumé af {document_id}:\n\n{summary}"

 from langchain_core.runnables import Runnable
 from langchain_core.tools import tool
+from src.agent.prompts import render_prompt
 from src.models import QueryResult
 from src.retrieval.hybrid import HybridRetriever
 from src.retrieval.reranker import Reranker
     if not sample_text:
         return []
+    prompt = render_prompt("detect_languages", sample_text=sample_text)
     raw = _extract_content(llm.invoke(prompt))
     seen: set[str] = set()
             store.tool_calls.append(("multi_query_search", question))
             # Step 1: Ask LLM to decompose the question
+            decompose_prompt = render_prompt(
+                "multi_query_decompose",
+                lang_clause=_lang_clause,
+                question=question,
             )
             raw = _extract_content(llm_chain.invoke(decompose_prompt))
             sub_queries = [q.strip().lstrip("0123456789.-) ") for q in raw.splitlines() if q.strip()]
             if len(full_text) > max_chars:
                 full_text = full_text[:max_chars] + "\n\n[... teksten er forkortet ... (text truncated)]"
+            summary_prompt = render_prompt(
+                "summarize_document",
+                document_id=document_id,
+                full_text=full_text,
             )
             summary = _extract_content(llm_chain.invoke(summary_prompt))
             return f"Resumé af {document_id}:\n\n{summary}"

src/api/main.py CHANGED Viewed

@@ -9,7 +9,7 @@ from fastapi import FastAPI
 from langchain_core.output_parsers import StrOutputParser
 from src.config import load_settings
-from src.provider import create_llm, create_embeddings, create_reranker
 from src.retrieval.embedder import Embedder
 from src.retrieval.vector_store import VectorStore
 from src.retrieval.bm25_search import BM25Search
@@ -36,7 +36,17 @@ def create_app() -> FastAPI:
     logging.basicConfig(level=getattr(logging, settings.log_level, logging.INFO))
-    llm = create_llm(settings)
     embeddings = create_embeddings(settings)
     embedder = Embedder(embeddings=embeddings)
@@ -83,6 +93,7 @@ def create_app() -> FastAPI:
             vector_store=vector_store,
             default_top_k=settings.top_k,
             memory=ConversationMemory(),
         )
     else:
         logger.info("Agent mode: pipeline (fixed DAG)")
@@ -94,6 +105,7 @@ def create_app() -> FastAPI:
             reranker=reranker,
             llm_chain=llm_chain,
             translate_query=settings.translate_query,
         )
     session_store = SessionStore(db_path=os.environ.get("SESSION_DB_PATH", "./data/sessions.db"))

 from langchain_core.output_parsers import StrOutputParser
 from src.config import load_settings
+from src.provider import create_llm, create_llm_with_fallback, create_embeddings, create_reranker
 from src.retrieval.embedder import Embedder
 from src.retrieval.vector_store import VectorStore
 from src.retrieval.bm25_search import BM25Search
     logging.basicConfig(level=getattr(logging, settings.log_level, logging.INFO))
+    # React mode's ReAct sub-agent calls llm.bind_tools(...) internally, which
+    # RunnableWithFallbacks does not support. The fallback chain is therefore only
+    # applied in pipeline mode; in react mode we warn and use the primary only.
+    if settings.llm_fallback_enabled and settings.agent_mode == "react":
+        logger.warning(
+            "LLM_FALLBACK_ENABLED is set but AGENT_MODE=react; fallback chain "
+            "is incompatible with tool-calling and will be DISABLED for this run."
+        )
+        llm = create_llm(settings)
+    else:
+        llm = create_llm_with_fallback(settings)
     embeddings = create_embeddings(settings)
     embedder = Embedder(embeddings=embeddings)
             vector_store=vector_store,
             default_top_k=settings.top_k,
             memory=ConversationMemory(),
+            token_budget_enabled=settings.token_budget_enabled,
         )
     else:
         logger.info("Agent mode: pipeline (fixed DAG)")
             reranker=reranker,
             llm_chain=llm_chain,
             translate_query=settings.translate_query,
+            token_budget_enabled=settings.token_budget_enabled,
         )
     session_store = SessionStore(db_path=os.environ.get("SESSION_DB_PATH", "./data/sessions.db"))

src/config.py CHANGED Viewed

@@ -84,6 +84,18 @@ class Settings:
     # Agent mode: "pipeline" (fixed DAG) or "react" (tool-calling ReAct loop)
     agent_mode: str
 def _parse_bool(value: str, *, default: bool) -> bool:
     """Parse a boolean environment variable string.
@@ -180,4 +192,19 @@ def load_settings() -> Settings:
         # Agent mode: "pipeline" keeps the existing fixed DAG; "react" enables
         # the multi-step ReAct loop (requires an LLM with tool-calling support).
         agent_mode=os.environ.get("AGENT_MODE", "pipeline"),
     )

     # Agent mode: "pipeline" (fixed DAG) or "react" (tool-calling ReAct loop)
     agent_mode: str
+    # Token budget (Stage 1: measure-only). When True, prompt sizes are
+    # logged at known injection points; truncation is NOT applied yet.
+    token_budget_enabled: bool
+    # LLM provider fallback. When enabled, the primary generation LLM is
+    # wrapped with LangChain's with_fallbacks across ``llm_fallback_providers``
+    # in order. DEFAULT OFF because an automatic switch from a local
+    # privacy-preserving provider (e.g. Ollama) to a cloud provider (e.g.
+    # OpenAI) has both cost and data-exfiltration implications.
+    llm_fallback_enabled: bool
+    llm_fallback_providers: tuple[str, ...]
 def _parse_bool(value: str, *, default: bool) -> bool:
     """Parse a boolean environment variable string.
         # Agent mode: "pipeline" keeps the existing fixed DAG; "react" enables
         # the multi-step ReAct loop (requires an LLM with tool-calling support).
         agent_mode=os.environ.get("AGENT_MODE", "pipeline"),
+        # Token budget — measure-only logging, off by default.
+        token_budget_enabled=_parse_bool(
+            os.environ.get("TOKEN_BUDGET_ENABLED", ""), default=False
+        ),
+        # LLM fallback chain — off by default for privacy / cost reasons.
+        llm_fallback_enabled=_parse_bool(
+            os.environ.get("LLM_FALLBACK_ENABLED", ""), default=False
+        ),
+        llm_fallback_providers=tuple(
+            p.strip().lower()
+            for p in os.environ.get("LLM_FALLBACK_PROVIDERS", "").split(",")
+            if p.strip()
+        ),
     )

src/provider.py CHANGED Viewed

@@ -110,6 +110,98 @@ def create_llm(settings: Settings) -> BaseChatModel:
             )
 _EVALUATOR_MODEL_FIELD: dict[str, str] = {
     "groq": "groq_model",
     "openai": "openai_model",

             )
+# Exceptions that engage the fallback chain. Set to the broad ``Exception``
+# because real-world LLM SDK errors (openai.RateLimitError,
+# openai.APIConnectionError, httpx.ConnectError, anthropic.APIError, ...)
+# do NOT inherit from stdlib ``ConnectionError`` / ``TimeoutError`` / ``OSError``.
+# A narrower set would silently let the most common transient failures bypass
+# the fallback. Safety relies on three layers instead:
+#   1. The whole feature is opt-in via ``LLM_FALLBACK_ENABLED`` (default off).
+#   2. Every fallback activation logs a WARNING naming the destination provider.
+#   3. Startup logs the full chain at WARNING with cost / privacy reminders.
+_FALLBACK_EXCEPTIONS: tuple[type[BaseException], ...] = (Exception,)
+def _wrap_with_fallback_logging(llm: BaseChatModel, provider: str) -> BaseChatModel:
+    """Wrap ``llm`` so every invocation logs a WARNING naming the provider.
+    The wrapper only fires when the underlying Runnable is actually invoked,
+    which for a fallback entry means the primary (and any earlier fallbacks)
+    already failed. This gives operators a clear trail showing when data
+    leaves the primary provider — critical for the privacy-aware default of
+    this project.
+    Args:
+        llm: The chat model to wrap.
+        provider: Provider label shown in the log message.
+    Returns:
+        A Runnable that transparently delegates to ``llm``.
+    """
+    def _on_start(_run_obj, _config=None) -> None:  # noqa: ANN001
+        logger.warning(
+            "LLM fallback activated: routing request to provider '%s'. "
+            "Check cost / privacy implications.",
+            provider,
+        )
+    return llm.with_listeners(on_start=_on_start)
+def create_llm_with_fallback(settings: Settings) -> BaseChatModel:
+    """Create the generation LLM, optionally wrapping it in a fallback chain.
+    When ``settings.llm_fallback_enabled`` is False OR the fallback list is
+    empty, this is a drop-in equivalent of :func:`create_llm`. Otherwise the
+    primary LLM is wrapped via LangChain's ``with_fallbacks`` so that when
+    the primary raises a transient failure (network / timeout / connection),
+    each fallback provider is tried in order.
+    Args:
+        settings: Application settings.
+    Returns:
+        A BaseChatModel (primary on its own, or primary-with-fallbacks).
+    """
+    primary = create_llm(settings)
+    if not settings.llm_fallback_enabled or not settings.llm_fallback_providers:
+        return primary
+    fallbacks: list[BaseChatModel] = []
+    for provider in settings.llm_fallback_providers:
+        try:
+            fallback_settings = replace(settings, llm_provider=provider)
+            raw = create_llm(fallback_settings)
+        except Exception as exc:  # noqa: BLE001 — log and skip broken fallbacks
+            logger.error(
+                "Skipping LLM fallback provider '%s' due to construction error: %s",
+                provider, exc,
+            )
+            continue
+        fallbacks.append(_wrap_with_fallback_logging(raw, provider))
+    if not fallbacks:
+        logger.warning(
+            "LLM_FALLBACK_ENABLED is true but no fallback providers could be "
+            "constructed; running without fallback."
+        )
+        return primary
+    chain_repr = " -> ".join([settings.llm_provider, *settings.llm_fallback_providers])
+    logger.warning(
+        "LLM fallback chain is ACTIVE: %s. "
+        "On transient failure of the primary, requests will be routed to the "
+        "next provider. This may incur API costs and send data to third-party "
+        "providers.",
+        chain_repr,
+    )
+    return primary.with_fallbacks(
+        fallbacks, exceptions_to_handle=_FALLBACK_EXCEPTIONS
+    )
 _EVALUATOR_MODEL_FIELD: dict[str, str] = {
     "groq": "groq_model",
     "openai": "openai_model",

tests/test_llm_fallback.py ADDED Viewed

	@@ -0,0 +1,329 @@

+"""Tests for create_llm_with_fallback and the fallback chain runtime behaviour."""
+from dataclasses import replace
+from unittest.mock import patch
+import pytest
+from langchain_core.language_models.fake_chat_models import FakeListChatModel
+from langchain_core.runnables import RunnableLambda
+import src.provider as provider_module
+from src.config import load_settings
+def _base_settings():  # noqa: ANN202
+    """Return a Settings instance with fallback fields overridable for tests."""
+    return load_settings()
+def test_fallback_disabled_returns_plain_llm() -> None:
+    settings = replace(
+        _base_settings(),
+        llm_fallback_enabled=False,
+        llm_fallback_providers=("openai",),
+    )
+    fake = FakeListChatModel(responses=["hello"])
+    with patch.object(provider_module, "create_llm", return_value=fake) as m:
+        result = provider_module.create_llm_with_fallback(settings)
+    # Exactly one LLM constructed: the primary.
+    assert m.call_count == 1
+    # The returned object is the plain fake — no with_fallbacks wrapper.
+    assert result is fake
+def test_fallback_enabled_but_empty_list_returns_plain_llm() -> None:
+    settings = replace(
+        _base_settings(),
+        llm_fallback_enabled=True,
+        llm_fallback_providers=(),
+    )
+    fake = FakeListChatModel(responses=["hello"])
+    with patch.object(provider_module, "create_llm", return_value=fake):
+        result = provider_module.create_llm_with_fallback(settings)
+    assert result is fake
+def test_fallback_chain_invokes_fallback_on_transient_error(caplog) -> None:  # noqa: ANN001
+    """Primary raises ConnectionError → fallback is used and success is returned."""
+    settings = replace(
+        _base_settings(),
+        llm_provider="primary_stub",
+        llm_fallback_enabled=True,
+        llm_fallback_providers=("fallback_stub",),
+    )
+    primary = RunnableLambda(lambda _x: (_ for _ in ()).throw(ConnectionError("down")))
+    fallback = FakeListChatModel(responses=["rescued"])
+    def fake_create(s):  # noqa: ANN001, ANN202
+        if s.llm_provider == "primary_stub":
+            return primary
+        return fallback
+    import logging
+    caplog.set_level(logging.WARNING, logger="src.provider")
+    with patch.object(provider_module, "create_llm", side_effect=fake_create):
+        chain = provider_module.create_llm_with_fallback(settings)
+    # Startup warning about the chain must be emitted.
+    assert any("LLM fallback chain is ACTIVE" in r.message for r in caplog.records)
+    # Invoking the chain transparently recovers via the fallback.
+    result = chain.invoke("hi")
+    # FakeListChatModel returns an AIMessage whose content is the response.
+    assert getattr(result, "content", result) == "rescued"
+    # Trigger-time warning must have fired when the fallback was used.
+    assert any("fallback activated" in r.message.lower() for r in caplog.records)
+def test_broken_fallback_provider_is_skipped(caplog) -> None:  # noqa: ANN001
+    settings = replace(
+        _base_settings(),
+        llm_provider="primary_stub",
+        llm_fallback_enabled=True,
+        llm_fallback_providers=("broken", "good"),
+    )
+    primary = FakeListChatModel(responses=["primary"])
+    good_fallback = FakeListChatModel(responses=["good"])
+    def fake_create(s):  # noqa: ANN001, ANN202
+        if s.llm_provider == "primary_stub":
+            return primary
+        if s.llm_provider == "broken":
+            raise RuntimeError("cannot construct")
+        return good_fallback
+    import logging
+    caplog.set_level(logging.ERROR, logger="src.provider")
+    with patch.object(provider_module, "create_llm", side_effect=fake_create):
+        chain = provider_module.create_llm_with_fallback(settings)
+    assert any("Skipping LLM fallback provider 'broken'" in r.message for r in caplog.records)
+    # Chain should still be usable (wraps primary + good).
+    assert chain is not primary  # with_fallbacks wrapped the result
+def test_streaming_pre_stream_failure_engages_fallback() -> None:
+    """When primary fails before yielding any tokens, fallback streams cleanly.
+    This is the expected happy path: the streaming entry point (e.g. a
+    connection refused at request time) raises before any token leaves the
+    primary, so ``with_fallbacks`` transparently substitutes the fallback
+    and the caller sees exactly one clean stream.
+    """
+    settings = replace(
+        _base_settings(),
+        llm_provider="primary_stub",
+        llm_fallback_enabled=True,
+        llm_fallback_providers=("fallback_stub",),
+    )
+    # Primary throws on both invoke and stream — simulates full outage.
+    primary = RunnableLambda(
+        lambda _x: (_ for _ in ()).throw(ConnectionError("primary down"))
+    )
+    fallback = FakeListChatModel(responses=["rescued"])
+    def fake_create(s):  # noqa: ANN001, ANN202
+        return primary if s.llm_provider == "primary_stub" else fallback
+    with patch.object(provider_module, "create_llm", side_effect=fake_create):
+        chain = provider_module.create_llm_with_fallback(settings)
+    chunks = list(chain.stream("hi"))
+    joined = "".join(getattr(c, "content", str(c)) for c in chunks)
+    # The fallback's single response is streamed character-by-character by
+    # FakeListChatModel, so the joined output must equal the response exactly
+    # — no duplicated tokens from the primary.
+    assert joined == "rescued"
+def test_streaming_mid_stream_failure_is_not_caught_by_fallback() -> None:
+    """Mid-stream failures propagate; fallback is not engaged.
+    This documents a real limitation of ``RunnableWithFallbacks``: it only
+    catches exceptions raised when the stream is OPENED, not exceptions
+    raised DURING iteration. If the primary yields some tokens and then
+    the connection dies, the caller receives those partial tokens followed
+    by the original exception — NOT a seamless switch to the fallback.
+    This guards against silently relying on fallback to cover mid-stream
+    outages; it cannot.
+    """
+    from langchain_core.messages import AIMessageChunk
+    from langchain_core.runnables import Runnable
+    class PartialThenFail(Runnable):
+        def invoke(self, input, config=None, **kwargs):  # noqa: ANN001, A002
+            raise ConnectionError("primary has no invoke")
+        def stream(self, input, config=None, **kwargs):  # noqa: ANN001, A002
+            yield AIMessageChunk(content="partial-")
+            raise ConnectionError("mid-stream outage")
+    settings = replace(
+        _base_settings(),
+        llm_provider="primary_stub",
+        llm_fallback_enabled=True,
+        llm_fallback_providers=("fallback_stub",),
+    )
+    primary = PartialThenFail()
+    fallback = FakeListChatModel(responses=["rescued"])
+    def fake_create(s):  # noqa: ANN001, ANN202
+        return primary if s.llm_provider == "primary_stub" else fallback
+    with patch.object(provider_module, "create_llm", side_effect=fake_create):
+        chain = provider_module.create_llm_with_fallback(settings)
+    observed: list[str] = []
+    with pytest.raises(ConnectionError):
+        for chunk in chain.stream("hi"):
+            observed.append(chunk.content)
+    # Partial token was delivered to the caller before the failure bubbled up.
+    assert observed == ["partial-"]
+def test_streaming_integration_with_query_router_uses_fallback(monkeypatch) -> None:  # noqa: ANN001
+    """End-to-end: QueryRouter.route_stream survives a primary LLM outage.
+    Wires a fallback-wrapped LLM into a real QueryRouter and asserts the
+    SSE event stream completes with a ``done`` event carrying the fallback's
+    answer. This proves the fallback integrates with the generation node's
+    downstream ``StrOutputParser`` and with the router's streaming path.
+    """
+    from langchain_core.messages import AIMessage
+    from langchain_core.output_parsers import StrOutputParser
+    from src.agent.intent_classifier import IntentClassifier
+    from src.agent.router import QueryRouter
+    from src.models import IntentType, QueryResult, DocumentChunk
+    # Build the fallback-wrapped LLM.
+    settings = replace(
+        _base_settings(),
+        llm_provider="primary_stub",
+        llm_fallback_enabled=True,
+        llm_fallback_providers=("fallback_stub",),
+    )
+    primary = RunnableLambda(
+        lambda _x: (_ for _ in ()).throw(ConnectionError("primary down"))
+    )
+    fallback = RunnableLambda(lambda _x: AIMessage(content="rescued answer"))
+    def fake_create(s):  # noqa: ANN001, ANN202
+        return primary if s.llm_provider == "primary_stub" else fallback
+    with patch.object(provider_module, "create_llm", side_effect=fake_create):
+        llm = provider_module.create_llm_with_fallback(settings)
+    # Stub the intent classifier so we don't need to drive language detection.
+    class _StubClassifier:
+        def classify(self, _query: str) -> IntentType:
+            return IntentType.FACTUAL
+    # Stub retriever + reranker so they return a single fake result.
+    chunk = DocumentChunk(
+        chunk_id="c1",
+        document_id="doc1.pdf",
+        text="Sample context.",
+        metadata={"page_number": 1, "chunk_index": 0},
+    )
+    fake_qr = QueryResult(chunk=chunk, score=0.9, source="dense")
+    class _StubHybridResult:
+        def __init__(self):  # noqa: ANN204
+            self.dense_results = [fake_qr]
+            self.sparse_results = [fake_qr]
+            self.fused_results = [fake_qr]
+    class _StubHybrid:
+        def __init__(self):  # noqa: ANN204
+            self.vector_store = _StubVectorStore()
+        def search_detailed(self, _q: str, top_k: int = 5) -> _StubHybridResult:
+            return _StubHybridResult()
+    class _StubVectorStore:
+        def list_document_ids(self) -> list[str]:
+            return []
+    class _StubReranker:
+        def rerank(self, _q: str, results, top_k: int = 5):  # noqa: ANN001, ANN201
+            return list(results)[:top_k]
+    router = QueryRouter(
+        intent_classifier=_StubClassifier(),
+        hybrid_retriever=_StubHybrid(),
+        reranker=_StubReranker(),
+        llm_chain=llm | StrOutputParser(),
+        translate_query=False,
+        document_languages=["Danish"],
+    )
+    events = list(router.route_stream("Hvor mange feriedage?", top_k=3))
+    done = [e for e in events if e["step"] == "done"]
+    assert done, f"expected a 'done' event, got steps={[e['step'] for e in events]}"
+    assert done[0]["result"]["answer"] == "rescued answer"
+def test_fallback_engages_on_sdk_style_exception() -> None:
+    """Fallback must engage on arbitrary Exception subclasses.
+    Real-world LLM SDK exceptions (openai.RateLimitError, httpx.ConnectError,
+    etc.) do not inherit from stdlib ConnectionError / TimeoutError / OSError.
+    Using a narrow exception tuple would silently make the fallback chain a
+    no-op for these cases. This test simulates one of them with a custom
+    Exception subclass that has no relation to ConnectionError.
+    """
+    class FakeRateLimitError(Exception):
+        """Stand-in for openai.RateLimitError / anthropic.APIError."""
+    settings = replace(
+        _base_settings(),
+        llm_provider="primary_stub",
+        llm_fallback_enabled=True,
+        llm_fallback_providers=("fallback_stub",),
+    )
+    primary = RunnableLambda(
+        lambda _x: (_ for _ in ()).throw(FakeRateLimitError("429 Too Many Requests"))
+    )
+    fallback = FakeListChatModel(responses=["rescued"])
+    def fake_create(s):  # noqa: ANN001, ANN202
+        return primary if s.llm_provider == "primary_stub" else fallback
+    with patch.object(provider_module, "create_llm", side_effect=fake_create):
+        chain = provider_module.create_llm_with_fallback(settings)
+    result = chain.invoke("hi")
+    assert getattr(result, "content", result) == "rescued"
+def test_all_fallbacks_broken_returns_primary_only(caplog) -> None:  # noqa: ANN001
+    settings = replace(
+        _base_settings(),
+        llm_provider="primary_stub",
+        llm_fallback_enabled=True,
+        llm_fallback_providers=("broken",),
+    )
+    primary = FakeListChatModel(responses=["primary"])
+    def fake_create(s):  # noqa: ANN001, ANN202
+        if s.llm_provider == "primary_stub":
+            return primary
+        raise RuntimeError("nope")
+    import logging
+    caplog.set_level(logging.WARNING, logger="src.provider")
+    with patch.object(provider_module, "create_llm", side_effect=fake_create):
+        result = provider_module.create_llm_with_fallback(settings)
+    assert result is primary
+    assert any(
+        "no fallback providers could be constructed" in r.message
+        for r in caplog.records
+    )

tests/test_prompts_registry.py ADDED Viewed

	@@ -0,0 +1,199 @@

+"""Snapshot tests ensuring YAML prompts render byte-identical to the legacy inline strings."""
+from src.agent.prompts import registry as _reg
+from src.agent.prompts import get_prompt, render_prompt
+def setup_module(module) -> None:  # noqa: ANN001
+    """Force a fresh registry load before snapshot tests."""
+    _reg.reload()
+# Golden literals — frozen copies of the pre-migration strings. Any future
+# change to the YAML must preserve these byte-for-byte. Do NOT replace these
+# with imports from the source modules; that would make the test tautological
+# after the source modules are migrated to read from the registry.
+_GOLDEN_INTENT_CLASSIFY = (
+    "You are an intent classifier. Given a user query, classify it into exactly "
+    "one of the following categories: factual, summary, comparison, procedural, unknown.\n\n"
+    "- factual: the user asks for a specific fact or piece of information.\n"
+    "- summary: the user wants a summary or overview of a topic.\n"
+    "- comparison: the user wants to compare two or more things.\n"
+    "- procedural: the user asks how to do something step by step.\n"
+    "- unknown: the query does not fit any of the above.\n\n"
+    "Respond with ONLY the category name in lowercase, nothing else."
+)
+_GOLDEN_PLANNER = (
+    "You are a planning assistant for the University of Copenhagen (KU) document system.\n\n"
+    "Given a user question, produce a JSON list of 1–4 steps needed to answer it.\n"
+    "Each step is an object with:\n"
+    '  - "action": one of "search", "search_within", "multi_search", '
+    '"summarize", "list_docs", "fetch_doc"\n'
+    '  - "detail": a short description of what to do (e.g. the search query, document ID)\n\n'
+    "Rules:\n"
+    "- IMPORTANT: Most questions probably only need 1 step. Only use 2+ steps when the question explicitly asks about multiple distinct topics.\n"
+    "- For simple factual questions: 1 search step is enough.\n"
+    "- For comparison questions: use multi_search or separate search steps.\n"
+    "- For document overview requests: use summarize.\n"
+    "- For questions with multiple aspects: use 2–4 separate steps.\n"
+    "- Always end with the steps needed; do NOT include a final 'answer' step.\n\n"
+    "Reply with ONLY the JSON array, nothing else. No explanation, no thinking.\n\n"
+    "Examples:\n"
+    'Question: "What is the exam policy?"\n'
+    '[{"action": "search", "detail": "KU eksamensregler"}]\n\n'
+    'Question: "Compare vacation rules for academic vs administrative staff"\n'
+    '[{"action": "search", "detail": "ferieregler videnskabeligt personale"}, '
+    '{"action": "search", "detail": "ferieregler administrativt personale"}]\n\n'
+    'Question: "Summarize the AI policy document"\n'
+    '[{"action": "summarize", "detail": "ku_ai_policy.pdf"}]\n\n'
+    'Question: "Which documents are about AI? Summarize and find the rules for written exams"\n'
+    '[{"action": "list_docs", "detail": "list all available documents"}, '
+    '{"action": "search", "detail": "AI dokumenter KU"}, '
+    '{"action": "search", "detail": "regler skriftlige opgaver eksamen GAI"}]\n\n'
+    "Now plan for this question:\n"
+)
+_GOLDEN_EXECUTOR_SYSTEM = (
+    "/no_think\n"
+    "You are executing ONE step of a plan to answer a user's question about "
+    "University of Copenhagen (KU) documents.\n\n"
+    "You have retrieval tools available. Execute the step described below, "
+    "then summarise what you found in 2-3 sentences. If you find nothing "
+    "relevant, say so clearly.\n\n"
+    "Do NOT produce a final answer — just report what you found for this step."
+)
+_GOLDEN_SYNTHESIZER = (
+    "You are a helpful assistant for administrative staff at the University "
+    "of Copenhagen (KU).\n\n"
+    "Below are the results gathered from multiple research steps. "
+    "Synthesize them into a single coherent answer to the user's original question.\n\n"
+    "Guidelines:\n"
+    "- Cite document sources using [1], [2], etc.\n"
+    "- Answer in the same language as the user's question.\n"
+    "- Be concise but thorough.\n"
+    "- If some steps found no results, acknowledge gaps honestly.\n\n"
+)
+def test_intent_classify_matches_golden() -> None:
+    assert get_prompt("intent_classify").template == _GOLDEN_INTENT_CLASSIFY
+def test_planner_matches_golden() -> None:
+    assert get_prompt("planner").template == _GOLDEN_PLANNER
+def test_executor_system_matches_golden() -> None:
+    assert get_prompt("executor_system").template == _GOLDEN_EXECUTOR_SYSTEM
+def test_synthesizer_matches_golden() -> None:
+    assert get_prompt("synthesizer").template == _GOLDEN_SYNTHESIZER
+def test_detect_language_and_intent_renders_identically() -> None:
+    valid_intents = "factual, summary, comparison, procedural, unknown"
+    query = "Hvor mange feriedage har jeg?"
+    expected = (
+        "You are given a user query. Do TWO things:\n"
+        "1. Detect the language of the query (reply with the language name in English, "
+        "e.g. 'Danish', 'English', 'German', 'Chinese', 'Japanese').\n"
+        "2. Classify the intent into exactly one of: "
+        f"{valid_intents}.\n\n"
+        "Reply with EXACTLY two lines, nothing else:\n"
+        "language: <language>\n"
+        "intent: <intent>\n\n"
+        f"Query: {query}"
+    )
+    rendered = render_prompt(
+        "detect_language_and_intent", valid_intents=valid_intents, query=query
+    )
+    assert rendered == expected
+def test_translate_query_renders_identically() -> None:
+    target = "Danish"
+    query = "How many vacation days do I have?"
+    expected = (
+        f"Translate the following text to {target}. "
+        "Reply with ONLY the translated text, nothing else.\n\n"
+        f"Text: {query}"
+    )
+    assert render_prompt("translate_query", target=target, query=query) == expected
+def test_broaden_query_renders_identically() -> None:
+    query = "exam rules"
+    retrieval_query = "eksamensregler"
+    expected = (
+        "The following search query did not return good results from "
+        "the document database. Rewrite it to be broader or use "
+        "different keywords while keeping the same meaning. "
+        "Reply with ONLY the rewritten query, nothing else.\n\n"
+        f"Original question: {query}\n"
+        f"Failed search query: {retrieval_query}"
+    )
+    assert render_prompt("broaden_query", query=query, retrieval_query=retrieval_query) == expected
+def test_detect_languages_renders_identically() -> None:
+    sample_text = "Dette er en test.\n---\nThis is a test."
+    expected = (
+        "You are a language detector. The text samples below come from "
+        "different documents in a knowledge base. Identify ALL distinct "
+        "languages present across the samples (do not list a language more "
+        "than once). Reply with ONLY the language names in English, one per "
+        "line, no explanation.\n\n"
+        f"Samples:\n{sample_text}"
+    )
+    assert render_prompt("detect_languages", sample_text=sample_text) == expected
+def test_multi_query_decompose_renders_identically() -> None:
+    lang_clause = "The queries should be in Danish (the document base is Danish)."
+    question = "Compare rules between master and bachelor exams."
+    expected = (
+        "You are a search query planner. Given a complex question, "
+        "decompose it into 2-4 simple, independent search queries that "
+        f"together cover all aspects of the question. {lang_clause}\n\n"
+        "Reply with ONLY the queries, one per line, nothing else.\n\n"
+        f"Question: {question}"
+    )
+    rendered = render_prompt(
+        "multi_query_decompose", lang_clause=lang_clause, question=question
+    )
+    assert rendered == expected
+def test_summarize_document_renders_identically() -> None:
+    document_id = "ku_ai_policy.pdf"
+    full_text = "Section 1.\n\nSection 2."
+    expected = (
+        "Produce a structured summary of the following document. "
+        "Include:\n"
+        "1. Document title/topic\n"
+        "2. Key points (3-7 bullet points)\n"
+        "3. Important rules, deadlines, or requirements mentioned\n"
+        "4. Who the document applies to\n\n"
+        "Write the summary in the same language as the document.\n\n"
+        f"Document ID: {document_id}\n\n"
+        f"Document text:\n{full_text}"
+    )
+    assert render_prompt(
+        "summarize_document", document_id=document_id, full_text=full_text
+    ) == expected
+def test_registry_raises_on_unknown_prompt() -> None:
+    import pytest
+    with pytest.raises(KeyError):
+        get_prompt("does_not_exist")
+def test_registry_raises_on_unknown_version() -> None:
+    import pytest
+    with pytest.raises(KeyError):
+        get_prompt("intent_classify", version="v999")

tests/test_token_budget.py ADDED Viewed

	@@ -0,0 +1,42 @@

+"""Tests for the token_budget measurement helper."""
+import logging
+from src.agent.token_budget import count_tokens, measure
+def test_count_tokens_empty_returns_zero() -> None:
+    assert count_tokens("") == 0
+def test_count_tokens_scales_with_safety_factor() -> None:
+    text = "Hello world, this is a small test sentence."
+    raw = count_tokens(text, safety_factor=1.0)
+    scaled = count_tokens(text, safety_factor=2.0)
+    assert raw > 0
+    # Scaled should be roughly double — allow 1 unit slack from rounding.
+    assert abs(scaled - raw * 2) <= 1
+def test_count_tokens_handles_multilingual() -> None:
+    danish = "Hvad er reglerne for eksamen på Københavns Universitet?"
+    chinese = "学生考试规则是什么？"
+    assert count_tokens(danish) > 0
+    assert count_tokens(chinese) > 0
+def test_measure_disabled_returns_zero_and_no_log(caplog) -> None:  # noqa: ANN001
+    with caplog.at_level(logging.INFO, logger="src.agent.token_budget"):
+        result = measure("planner", "some prompt text", enabled=False)
+    assert result == 0
+    assert not any("token_budget" in rec.message for rec in caplog.records)
+def test_measure_enabled_logs_and_returns_count(caplog) -> None:  # noqa: ANN001
+    with caplog.at_level(logging.INFO, logger="src.agent.token_budget"):
+        result = measure("planner", "Hello world", enabled=True)
+    assert result > 0
+    assert any(
+        "token_budget" in rec.message and "planner" in rec.message
+        for rec in caplog.records
+    )