XQ committed on
Commit
4d2a2da
·
1 Parent(s): 38d8c65

Add LLM Provider Fallback

Browse files
.env.example CHANGED
@@ -135,3 +135,29 @@ LOG_LEVEL=INFO
135
 
136
  # --- Inter-service Communication (bare-metal defaults) -----------------------
137
  API_BASE_URL=http://localhost:8000 # Docker overrides to http://api:8000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
 
136
  # --- Inter-service Communication (bare-metal defaults) -----------------------
137
  API_BASE_URL=http://localhost:8000 # Docker overrides to http://api:8000
138
+
139
+ # --- Token Budget (measure-only) ---------------------------------------------
140
+ # When true, the routers log estimated prompt token sizes at the three known
141
+ # generation points (generate_answer, planner, synthesizer). No truncation is
142
+ # applied — this is purely observability. Counts use tiktoken cl100k as a
143
+ # baseline with a 1.5x safety factor for non-OpenAI multilingual tokenizers.
144
+ # TOKEN_BUDGET_ENABLED=false
145
+
146
+ # --- LLM Provider Fallback ---------------------------------------------------
147
+ # When enabled, the primary LLM is wrapped with LangChain with_fallbacks so
148
+ # requests that fail on the primary are retried against each provider in the
149
+ # chain (left to right). DEFAULT OFF. Switching from a local privacy-aware
150
+ # provider (Ollama) to a cloud provider (OpenAI / Anthropic / ...) has both
151
+ # COST and DATA-EXFILTRATION implications.
152
+ # Your requests may leave the tenant when switching from local to cloud.
153
+ #
154
+ # Limitations to be aware of:
155
+ # - Disabled automatically when AGENT_MODE=react (RunnableWithFallbacks is
156
+ # incompatible with bind_tools used by the react sub-agent).
157
+ # - Mid-stream failures are NOT covered: with_fallbacks only catches errors
158
+ # raised before the first token; a connection drop mid-generation will
159
+ # surface as an exception to the caller.
160
+ # - Each fallback activation is logged at WARNING level naming the destination
161
+ # provider — check application logs for unexpected switches.
162
+ # LLM_FALLBACK_ENABLED=false
163
+ # LLM_FALLBACK_PROVIDERS=openai,anthropic # Comma-separated provider chain
src/agent/intent_classifier.py CHANGED
@@ -7,6 +7,7 @@ from langchain_core.language_models.chat_models import BaseChatModel
7
  from langchain_core.output_parsers import StrOutputParser
8
  from langchain_core.prompts import ChatPromptTemplate
9
 
 
10
  from src.models import IntentType
11
 
12
  logger = logging.getLogger(__name__)
@@ -16,16 +17,7 @@ _THINK_UNCLOSED_RE = re.compile(r"<think>.*", re.DOTALL)
16
 
17
  _VALID_INTENTS = {intent.value for intent in IntentType}
18
 
19
- _SYSTEM_PROMPT = (
20
- "You are an intent classifier. Given a user query, classify it into exactly "
21
- "one of the following categories: factual, summary, comparison, procedural, unknown.\n\n"
22
- "- factual: the user asks for a specific fact or piece of information.\n"
23
- "- summary: the user wants a summary or overview of a topic.\n"
24
- "- comparison: the user wants to compare two or more things.\n"
25
- "- procedural: the user asks how to do something step by step.\n"
26
- "- unknown: the query does not fit any of the above.\n\n"
27
- "Respond with ONLY the category name in lowercase, nothing else."
28
- )
29
 
30
 
31
  class IntentClassifier:
 
7
  from langchain_core.output_parsers import StrOutputParser
8
  from langchain_core.prompts import ChatPromptTemplate
9
 
10
+ from src.agent.prompts import get_prompt
11
  from src.models import IntentType
12
 
13
  logger = logging.getLogger(__name__)
 
17
 
18
  _VALID_INTENTS = {intent.value for intent in IntentType}
19
 
20
+ _SYSTEM_PROMPT = get_prompt("intent_classify").template
 
 
 
 
 
 
 
 
 
21
 
22
 
23
  class IntentClassifier:
src/agent/plan_and_execute.py CHANGED
@@ -26,6 +26,8 @@ from langgraph.graph import END, StateGraph
26
  from langgraph.prebuilt import create_react_agent
27
 
28
  from src.agent.memory import ConversationMemory
 
 
29
  from src.agent.tools import ToolResultStore, detect_document_languages, make_retrieval_tools
30
  from src.models import GenerationResponse, IntentType, PipelineDetails
31
  from src.retrieval.hybrid import HybridRetriever
@@ -37,60 +39,12 @@ logger = logging.getLogger(__name__)
37
  _MAX_STEPS = 6
38
 
39
  # ------------------------------------------------------------------
40
- # Prompts
41
  # ------------------------------------------------------------------
42
 
43
- _PLANNER_PROMPT = (
44
- "You are a planning assistant for the University of Copenhagen (KU) document system.\n\n"
45
- "Given a user question, produce a JSON list of 1–4 steps needed to answer it.\n"
46
- "Each step is an object with:\n"
47
- ' - "action": one of "search", "search_within", "multi_search", '
48
- '"summarize", "list_docs", "fetch_doc"\n'
49
- ' - "detail": a short description of what to do (e.g. the search query, document ID)\n\n'
50
- "Rules:\n"
51
- "- IMPORTANT: Most questions probably only need 1 step. Only use 2+ steps when the question explicitly asks about multiple distinct topics.\n"
52
- "- For simple factual questions: 1 search step is enough.\n"
53
- "- For comparison questions: use multi_search or separate search steps.\n"
54
- "- For document overview requests: use summarize.\n"
55
- "- For questions with multiple aspects: use 2–4 separate steps.\n"
56
- "- Always end with the steps needed; do NOT include a final 'answer' step.\n\n"
57
- "Reply with ONLY the JSON array, nothing else. No explanation, no thinking.\n\n"
58
- "Examples:\n"
59
- 'Question: "What is the exam policy?"\n'
60
- '[{"action": "search", "detail": "KU eksamensregler"}]\n\n'
61
- 'Question: "Compare vacation rules for academic vs administrative staff"\n'
62
- '[{"action": "search", "detail": "ferieregler videnskabeligt personale"}, '
63
- '{"action": "search", "detail": "ferieregler administrativt personale"}]\n\n'
64
- 'Question: "Summarize the AI policy document"\n'
65
- '[{"action": "summarize", "detail": "ku_ai_policy.pdf"}]\n\n'
66
- 'Question: "Which documents are about AI? Summarize and find the rules for written exams"\n'
67
- '[{"action": "list_docs", "detail": "list all available documents"}, '
68
- '{"action": "search", "detail": "AI dokumenter KU"}, '
69
- '{"action": "search", "detail": "regler skriftlige opgaver eksamen GAI"}]\n\n'
70
- "Now plan for this question:\n"
71
- )
72
-
73
- _EXECUTOR_SYSTEM = (
74
- "/no_think\n"
75
- "You are executing ONE step of a plan to answer a user's question about "
76
- "University of Copenhagen (KU) documents.\n\n"
77
- "You have retrieval tools available. Execute the step described below, "
78
- "then summarise what you found in 2-3 sentences. If you find nothing "
79
- "relevant, say so clearly.\n\n"
80
- "Do NOT produce a final answer — just report what you found for this step."
81
- )
82
-
83
- _SYNTHESIZER_PROMPT = (
84
- "You are a helpful assistant for administrative staff at the University "
85
- "of Copenhagen (KU).\n\n"
86
- "Below are the results gathered from multiple research steps. "
87
- "Synthesize them into a single coherent answer to the user's original question.\n\n"
88
- "Guidelines:\n"
89
- "- Cite document sources using [1], [2], etc.\n"
90
- "- Answer in the same language as the user's question.\n"
91
- "- Be concise but thorough.\n"
92
- "- If some steps found no results, acknowledge gaps honestly.\n\n"
93
- )
94
 
95
 
96
  # ------------------------------------------------------------------
@@ -146,6 +100,7 @@ class PlanAndExecuteRouter:
146
  default_top_k: int = 5,
147
  memory: ConversationMemory | None = None,
148
  document_languages: list[str] | None = None,
 
149
  ) -> None:
150
  """Initialise the Plan-and-Execute router.
151
 
@@ -172,6 +127,7 @@ class PlanAndExecuteRouter:
172
  self._document_languages: list[str] | None = (
173
  list(document_languages) if document_languages else None
174
  )
 
175
 
176
  def _ensure_document_languages(self) -> list[str]:
177
  """Lazily detect and cache the document corpus languages via the LLM.
@@ -202,6 +158,7 @@ class PlanAndExecuteRouter:
202
  f"{history}\n\n"
203
  )
204
  prompt = _PLANNER_PROMPT + history_section + f'Question: "{state["query"]}"'
 
205
  raw = _extract_content(self._llm.invoke(prompt))
206
  logger.info("Planner raw output: %s", raw)
207
 
@@ -284,6 +241,7 @@ class PlanAndExecuteRouter:
284
  f"Research results:\n{gathered}\n\n"
285
  f"Answer:"
286
  )
 
287
  answer = _extract_content(self._llm.invoke(prompt))
288
  logger.info("Synthesized final answer (%d chars)", len(answer))
289
  return {"answer": answer}
 
26
  from langgraph.prebuilt import create_react_agent
27
 
28
  from src.agent.memory import ConversationMemory
29
+ from src.agent.prompts import get_prompt
30
+ from src.agent.token_budget import measure as _measure_tokens
31
  from src.agent.tools import ToolResultStore, detect_document_languages, make_retrieval_tools
32
  from src.models import GenerationResponse, IntentType, PipelineDetails
33
  from src.retrieval.hybrid import HybridRetriever
 
39
  _MAX_STEPS = 6
40
 
41
  # ------------------------------------------------------------------
42
+ # Prompts (loaded from src/agent/prompts/*.yaml)
43
  # ------------------------------------------------------------------
44
 
45
+ _PLANNER_PROMPT = get_prompt("planner").template
46
+ _EXECUTOR_SYSTEM = get_prompt("executor_system").template
47
+ _SYNTHESIZER_PROMPT = get_prompt("synthesizer").template
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
 
50
  # ------------------------------------------------------------------
 
100
  default_top_k: int = 5,
101
  memory: ConversationMemory | None = None,
102
  document_languages: list[str] | None = None,
103
+ token_budget_enabled: bool = False,
104
  ) -> None:
105
  """Initialise the Plan-and-Execute router.
106
 
 
127
  self._document_languages: list[str] | None = (
128
  list(document_languages) if document_languages else None
129
  )
130
+ self._token_budget_enabled = token_budget_enabled
131
 
132
  def _ensure_document_languages(self) -> list[str]:
133
  """Lazily detect and cache the document corpus languages via the LLM.
 
158
  f"{history}\n\n"
159
  )
160
  prompt = _PLANNER_PROMPT + history_section + f'Question: "{state["query"]}"'
161
+ _measure_tokens("planner", prompt, enabled=self._token_budget_enabled)
162
  raw = _extract_content(self._llm.invoke(prompt))
163
  logger.info("Planner raw output: %s", raw)
164
 
 
241
  f"Research results:\n{gathered}\n\n"
242
  f"Answer:"
243
  )
244
+ _measure_tokens("synthesizer", prompt, enabled=self._token_budget_enabled)
245
  answer = _extract_content(self._llm.invoke(prompt))
246
  logger.info("Synthesized final answer (%d chars)", len(answer))
247
  return {"answer": answer}
src/agent/prompts/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ """Centralised prompt registry.
2
+
3
+ All user-visible LLM prompts live in ``*.yaml`` files in this directory.
4
+ The registry loads them once, on first access, and serves ``PromptSpec``
5
+ objects whose ``template`` string callers format with ``str.format(**kwargs)``.
6
+ """
7
+
8
+ from src.agent.prompts.registry import PromptRegistry, get_prompt, render_prompt
9
+
10
+ __all__ = ["PromptRegistry", "get_prompt", "render_prompt"]
src/agent/prompts/broaden_query.v1.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ name: broaden_query
2
+ version: v1
3
+ description: Rewrite a retrieval query when the reranker confidence is low.
4
+ template: |-
5
+ The following search query did not return good results from the document database. Rewrite it to be broader or use different keywords while keeping the same meaning. Reply with ONLY the rewritten query, nothing else.
6
+
7
+ Original question: {query}
8
+ Failed search query: {retrieval_query}
src/agent/prompts/detect_language_and_intent.v1.yaml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: detect_language_and_intent
2
+ version: v1
3
+ description: Single-call prompt for the router that detects query language and classifies intent.
4
+ template: |-
5
+ You are given a user query. Do TWO things:
6
+ 1. Detect the language of the query (reply with the language name in English, e.g. 'Danish', 'English', 'German', 'Chinese', 'Japanese').
7
+ 2. Classify the intent into exactly one of: {valid_intents}.
8
+
9
+ Reply with EXACTLY two lines, nothing else:
10
+ language: <language>
11
+ intent: <intent>
12
+
13
+ Query: {query}
src/agent/prompts/detect_languages.v1.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ name: detect_languages
2
+ version: v1
3
+ description: Ask the LLM to enumerate every language present across sampled corpus chunks.
4
+ template: |-
5
+ You are a language detector. The text samples below come from different documents in a knowledge base. Identify ALL distinct languages present across the samples (do not list a language more than once). Reply with ONLY the language names in English, one per line, no explanation.
6
+
7
+ Samples:
8
+ {sample_text}
src/agent/prompts/executor_system.v1.yaml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ name: executor_system
2
+ version: v1
3
+ description: System prompt for the per-step ReAct executor sub-agent in Plan-and-Execute.
4
+ template: |-
5
+ /no_think
6
+ You are executing ONE step of a plan to answer a user's question about University of Copenhagen (KU) documents.
7
+
8
+ You have retrieval tools available. Execute the step described below, then summarise what you found in 2-3 sentences. If you find nothing relevant, say so clearly.
9
+
10
+ Do NOT produce a final answer — just report what you found for this step.
src/agent/prompts/intent_classify.v1.yaml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: intent_classify
2
+ version: v1
3
+ description: System prompt for IntentClassifier — classifies a user query into one of five intent categories.
4
+ template: |-
5
+ You are an intent classifier. Given a user query, classify it into exactly one of the following categories: factual, summary, comparison, procedural, unknown.
6
+
7
+ - factual: the user asks for a specific fact or piece of information.
8
+ - summary: the user wants a summary or overview of a topic.
9
+ - comparison: the user wants to compare two or more things.
10
+ - procedural: the user asks how to do something step by step.
11
+ - unknown: the query does not fit any of the above.
12
+
13
+ Respond with ONLY the category name in lowercase, nothing else.
src/agent/prompts/multi_query_decompose.v1.yaml ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ name: multi_query_decompose
2
+ version: v1
3
+ description: Decompose a complex user question into 2–4 independent sub-queries for multi_query_search.
4
+ template: |-
5
+ You are a search query planner. Given a complex question, decompose it into 2-4 simple, independent search queries that together cover all aspects of the question. {lang_clause}
6
+
7
+ Reply with ONLY the queries, one per line, nothing else.
8
+
9
+ Question: {question}
src/agent/prompts/planner.v1.yaml ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: planner
2
+ version: v1
3
+ description: Planner prompt for the Plan-and-Execute router. Consumed as a raw prefix; caller appends history and question.
4
+ template: |
5
+ You are a planning assistant for the University of Copenhagen (KU) document system.
6
+
7
+ Given a user question, produce a JSON list of 1–4 steps needed to answer it.
8
+ Each step is an object with:
9
+ - "action": one of "search", "search_within", "multi_search", "summarize", "list_docs", "fetch_doc"
10
+ - "detail": a short description of what to do (e.g. the search query, document ID)
11
+
12
+ Rules:
13
+ - IMPORTANT: Most questions probably only need 1 step. Only use 2+ steps when the question explicitly asks about multiple distinct topics.
14
+ - For simple factual questions: 1 search step is enough.
15
+ - For comparison questions: use multi_search or separate search steps.
16
+ - For document overview requests: use summarize.
17
+ - For questions with multiple aspects: use 2–4 separate steps.
18
+ - Always end with the steps needed; do NOT include a final 'answer' step.
19
+
20
+ Reply with ONLY the JSON array, nothing else. No explanation, no thinking.
21
+
22
+ Examples:
23
+ Question: "What is the exam policy?"
24
+ [{"action": "search", "detail": "KU eksamensregler"}]
25
+
26
+ Question: "Compare vacation rules for academic vs administrative staff"
27
+ [{"action": "search", "detail": "ferieregler videnskabeligt personale"}, {"action": "search", "detail": "ferieregler administrativt personale"}]
28
+
29
+ Question: "Summarize the AI policy document"
30
+ [{"action": "summarize", "detail": "ku_ai_policy.pdf"}]
31
+
32
+ Question: "Which documents are about AI? Summarize and find the rules for written exams"
33
+ [{"action": "list_docs", "detail": "list all available documents"}, {"action": "search", "detail": "AI dokumenter KU"}, {"action": "search", "detail": "regler skriftlige opgaver eksamen GAI"}]
34
+
35
+ Now plan for this question:
src/agent/prompts/registry.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Prompt registry that loads YAML prompt definitions from this package.
2
+
3
+ Each ``*.yaml`` file in this directory defines one prompt version with
4
+ frontmatter-style fields (``name``, ``version``, ``description``,
5
+ ``template``). The registry is a read-only singleton populated at first
6
+ access; tests can call :func:`reload` to force a refresh.
7
+
8
+ Templates are plain Python ``str.format`` templates. The registry
9
+ deliberately does NOT wrap them in a ``PromptTemplate`` — callers already
10
+ have their own composition logic and snapshot tests guarantee that the
11
+ rendered output is byte-identical to the pre-migration strings.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import logging
17
+ from dataclasses import dataclass
18
+ from pathlib import Path
19
+ from threading import Lock
20
+
21
+ import yaml
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+ _PROMPTS_DIR = Path(__file__).parent
26
+
27
+
28
@dataclass(frozen=True)
class PromptSpec:
    """Immutable record describing one prompt version loaded from YAML.

    Instances are created by the registry while scanning the prompts
    directory; being frozen, they cannot be modified after construction.
    """

    # Logical prompt name, e.g. ``intent_classify``.
    name: str
    # Version label taken from the YAML file, e.g. ``v1``.
    version: str
    # Short human-readable summary of what the prompt is for.
    description: str
    # Raw template text; may contain ``{var}`` placeholders.
    template: str
    # Path of the YAML file this definition was read from.
    source_path: Path
45
+
46
+
47
class PromptRegistry:
    """Loads and serves prompt templates from YAML files.

    Every ``*.yaml`` file in the prompts directory must be a mapping with
    at least ``name``, ``version`` and ``template`` keys. Specs are indexed
    by name and version, and the highest version of each name is tracked
    as that prompt's "latest".
    """

    # Process-wide instance and the lock guarding its creation/replacement.
    _instance: "PromptRegistry | None" = None
    _lock = Lock()

    def __init__(self, prompts_dir: Path = _PROMPTS_DIR) -> None:
        """Initialise the registry and load every prompt file eagerly.

        Args:
            prompts_dir: Directory containing prompt YAML files.

        Raises:
            ValueError: When a prompt file is malformed or duplicated.
        """
        self._prompts_dir = prompts_dir
        self._by_name: dict[str, dict[str, PromptSpec]] = {}
        self._latest: dict[str, PromptSpec] = {}
        self._load()

    @staticmethod
    def _version_key(version: str) -> tuple[tuple[tuple[int, int, str], ...], str]:
        """Return a natural-order sort key for a version label.

        The label is split into runs of ASCII digits and runs of other
        characters. Digit runs compare numerically, so ``"v10"`` ranks above
        ``"v2"`` (a plain string comparison would rank it below). The raw
        label is appended as a deterministic tie-breaker for labels that
        are numerically equal, such as ``"v1"`` and ``"v01"``.

        Args:
            version: Version label, e.g. ``"v1"`` or ``"v2.10"``.

        Returns:
            A tuple that orders version labels naturally.
        """
        digits = "0123456789"
        chunks: list[tuple[int, int, str]] = []

        def flush(run: str) -> None:
            # Uniform (kind, number, text) triples keep every comparison
            # type-safe even when a digit run meets a text run.
            if run[0] in digits:
                chunks.append((1, int(run), ""))
            else:
                chunks.append((0, 0, run))

        run = ""
        for ch in version:
            if run and (ch in digits) != (run[-1] in digits):
                flush(run)
                run = ""
            run += ch
        if run:
            flush(run)
        return tuple(chunks), version

    def _load(self) -> None:
        """Scan ``prompts_dir`` and populate in-memory indices.

        Raises:
            ValueError: When a file is not a YAML mapping, lacks a required
                field, or repeats an already-seen ``name@version``.
        """
        self._by_name.clear()
        self._latest.clear()

        # Sorted so load order (and therefore error reporting) is stable.
        for path in sorted(self._prompts_dir.glob("*.yaml")):
            with path.open("r", encoding="utf-8") as fh:
                data = yaml.safe_load(fh)
            if not isinstance(data, dict):
                raise ValueError(f"Prompt file {path} must contain a YAML mapping")
            missing = {"name", "version", "template"} - data.keys()
            if missing:
                raise ValueError(
                    f"Prompt file {path} is missing fields: {sorted(missing)}"
                )
            spec = PromptSpec(
                name=str(data["name"]),
                version=str(data["version"]),
                description=str(data.get("description", "")),
                template=str(data["template"]),
                source_path=path,
            )
            versions = self._by_name.setdefault(spec.name, {})
            if spec.version in versions:
                raise ValueError(
                    f"Duplicate prompt {spec.name}@{spec.version} in {path}"
                )
            versions[spec.version] = spec
            # Latest wins by natural version order, so "v10" > "v2" > "v1".
            current_latest = self._latest.get(spec.name)
            if current_latest is None or (
                self._version_key(spec.version)
                > self._version_key(current_latest.version)
            ):
                self._latest[spec.name] = spec

        logger.info(
            "PromptRegistry loaded %d prompts from %s",
            len(self._by_name), self._prompts_dir,
        )

    def get(self, name: str, version: str | None = None) -> PromptSpec:
        """Return the :class:`PromptSpec` for ``name`` / ``version``.

        Args:
            name: Logical prompt name.
            version: Specific version, or ``None`` for the latest.

        Returns:
            The matching :class:`PromptSpec`.

        Raises:
            KeyError: When no prompt / version matches.
        """
        if version is None:
            spec = self._latest.get(name)
            if spec is None:
                raise KeyError(f"Unknown prompt: {name}")
            return spec
        versions = self._by_name.get(name, {})
        if version not in versions:
            raise KeyError(f"Unknown prompt version: {name}@{version}")
        return versions[version]

    def render(self, name: str, version: str | None = None, /, **kwargs: object) -> str:
        """Fetch ``name`` and format its template with ``**kwargs``.

        ``name`` and ``version`` are positional-only so that template
        variables with those names can still be passed as keywords.
        Formatting uses ``str.format``, so a template containing literal
        braces that are not escaped as ``{{`` / ``}}`` cannot be rendered
        this way; read ``get(name).template`` for such prompts instead.

        Args:
            name: Logical prompt name.
            version: Specific version, or ``None`` for the latest.
            **kwargs: Template variables.

        Returns:
            The rendered prompt string.

        Raises:
            KeyError: When the prompt / version is unknown, or a
                placeholder in the template has no matching keyword.
        """
        spec = self.get(name, version)
        return spec.template.format(**kwargs)

    def names(self) -> list[str]:
        """Return all registered prompt names in sorted order."""
        return sorted(self._by_name.keys())
142
+
143
+
144
def _singleton() -> PromptRegistry:
    """Return the shared registry, building it on the first call.

    Creation happens while holding ``PromptRegistry._lock``.
    """
    with PromptRegistry._lock:
        registry = PromptRegistry._instance
        if registry is None:
            registry = PromptRegistry()
            PromptRegistry._instance = registry
        return registry
150
+
151
+
152
def get_prompt(name: str, version: str | None = None) -> PromptSpec:
    """Look up a prompt spec in the shared registry.

    Equivalent to ``_singleton().get(name, version)``; ``version=None``
    selects the latest version.
    """
    registry = _singleton()
    return registry.get(name, version)
155
+
156
+
157
def render_prompt(name: str, version: str | None = None, /, **kwargs: object) -> str:
    """Render a prompt from the shared registry with the given variables.

    Equivalent to ``_singleton().render(name, version, **kwargs)``.
    """
    registry = _singleton()
    return registry.render(name, version, **kwargs)
160
+
161
+
162
def reload() -> None:
    """Replace the shared registry with a freshly loaded one (for tests)."""
    with PromptRegistry._lock:
        fresh = PromptRegistry()
        PromptRegistry._instance = fresh
src/agent/prompts/summarize_document.v1.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: summarize_document
2
+ version: v1
3
+ description: Produce a structured summary of a single document in the knowledge base.
4
+ template: |-
5
+ Produce a structured summary of the following document. Include:
6
+ 1. Document title/topic
7
+ 2. Key points (3-7 bullet points)
8
+ 3. Important rules, deadlines, or requirements mentioned
9
+ 4. Who the document applies to
10
+
11
+ Write the summary in the same language as the document.
12
+
13
+ Document ID: {document_id}
14
+
15
+ Document text:
16
+ {full_text}
src/agent/prompts/synthesizer.v1.yaml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: synthesizer
2
+ version: v1
3
+ description: System prefix for the Plan-and-Execute synthesizer. Caller appends history, original question, and gathered step results.
4
+ template: |+
5
+ You are a helpful assistant for administrative staff at the University of Copenhagen (KU).
6
+
7
+ Below are the results gathered from multiple research steps. Synthesize them into a single coherent answer to the user's original question.
8
+
9
+ Guidelines:
10
+ - Cite document sources using [1], [2], etc.
11
+ - Answer in the same language as the user's question.
12
+ - Be concise but thorough.
13
+ - If some steps found no results, acknowledge gaps honestly.
14
+
src/agent/prompts/translate_query.v1.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ name: translate_query
2
+ version: v1
3
+ description: Translate the user query into the corpus language so BM25 can match.
4
+ template: |-
5
+ Translate the following text to {target}. Reply with ONLY the translated text, nothing else.
6
+
7
+ Text: {query}
src/agent/router.py CHANGED
@@ -20,6 +20,8 @@ from langgraph.graph import END, StateGraph
20
 
21
  from src.models import IntentType, GenerationResponse, PipelineDetails, QueryResult
22
  from src.agent.intent_classifier import IntentClassifier
 
 
23
  from src.agent.tools import detect_document_languages
24
  from src.retrieval.hybrid import HybridRetriever
25
  from src.retrieval.reranker import Reranker
@@ -140,6 +142,7 @@ class QueryRouter:
140
  *,
141
  translate_query: bool = True,
142
  document_languages: list[str] | None = None,
 
143
  ) -> None:
144
  """Initialize the query router.
145
 
@@ -166,6 +169,7 @@ class QueryRouter:
166
  self._document_languages: list[str] | None = (
167
  list(document_languages) if document_languages else None
168
  )
 
169
  self._graph = self._build_graph()
170
 
171
  def _ensure_document_languages(self) -> list[str]:
@@ -195,16 +199,10 @@ class QueryRouter:
195
  Tuple of (detected_language, intent).
196
  """
197
  valid_intents = "factual, summary, comparison, procedural, unknown"
198
- prompt = (
199
- "You are given a user query. Do TWO things:\n"
200
- "1. Detect the language of the query (reply with the language name in English, "
201
- "e.g. 'Danish', 'English', 'German', 'Chinese', 'Japanese').\n"
202
- "2. Classify the intent into exactly one of: "
203
- f"{valid_intents}.\n\n"
204
- "Reply with EXACTLY two lines, nothing else:\n"
205
- "language: <language>\n"
206
- "intent: <intent>\n\n"
207
- f"Query: {query}"
208
  )
209
  raw = _extract_content(self._llm_chain.invoke(prompt))
210
  logger.debug("Combined detection raw response: %s", raw)
@@ -266,10 +264,8 @@ class QueryRouter:
266
  return query
267
 
268
  target = doc_langs[0]
269
- translate_prompt = (
270
- f"Translate the following text to {target}. "
271
- "Reply with ONLY the translated text, nothing else.\n\n"
272
- f"Text: {query}"
273
  )
274
  translated = _extract_content(self._llm_chain.invoke(translate_prompt))
275
  logger.info("Translated query to %s: %s", target, translated)
@@ -324,13 +320,10 @@ class QueryRouter:
324
  Uses the LLM to generate alternative search terms while preserving
325
  the original meaning, then increments the retry counter.
326
  """
327
- prompt = (
328
- "The following search query did not return good results from "
329
- "the document database. Rewrite it to be broader or use "
330
- "different keywords while keeping the same meaning. "
331
- "Reply with ONLY the rewritten query, nothing else.\n\n"
332
- f"Original question: {state['query']}\n"
333
- f"Failed search query: {state['retrieval_query']}"
334
  )
335
  broadened = _extract_content(self._llm_chain.invoke(prompt))
336
  logger.info(
@@ -380,6 +373,7 @@ class QueryRouter:
380
  prompt = self._build_prompt(
381
  state["query"], state["intent"], context, state["user_language"]
382
  )
 
383
  answer = _extract_content(self._llm_chain.invoke(prompt))
384
  logger.info("Generated answer for intent=%s", state["intent"].value)
385
  return {"answer": answer}
 
20
 
21
  from src.models import IntentType, GenerationResponse, PipelineDetails, QueryResult
22
  from src.agent.intent_classifier import IntentClassifier
23
+ from src.agent.prompts import render_prompt
24
+ from src.agent.token_budget import measure as _measure_tokens
25
  from src.agent.tools import detect_document_languages
26
  from src.retrieval.hybrid import HybridRetriever
27
  from src.retrieval.reranker import Reranker
 
142
  *,
143
  translate_query: bool = True,
144
  document_languages: list[str] | None = None,
145
+ token_budget_enabled: bool = False,
146
  ) -> None:
147
  """Initialize the query router.
148
 
 
169
  self._document_languages: list[str] | None = (
170
  list(document_languages) if document_languages else None
171
  )
172
+ self._token_budget_enabled = token_budget_enabled
173
  self._graph = self._build_graph()
174
 
175
  def _ensure_document_languages(self) -> list[str]:
 
199
  Tuple of (detected_language, intent).
200
  """
201
  valid_intents = "factual, summary, comparison, procedural, unknown"
202
+ prompt = render_prompt(
203
+ "detect_language_and_intent",
204
+ valid_intents=valid_intents,
205
+ query=query,
 
 
 
 
 
 
206
  )
207
  raw = _extract_content(self._llm_chain.invoke(prompt))
208
  logger.debug("Combined detection raw response: %s", raw)
 
264
  return query
265
 
266
  target = doc_langs[0]
267
+ translate_prompt = render_prompt(
268
+ "translate_query", target=target, query=query
 
 
269
  )
270
  translated = _extract_content(self._llm_chain.invoke(translate_prompt))
271
  logger.info("Translated query to %s: %s", target, translated)
 
320
  Uses the LLM to generate alternative search terms while preserving
321
  the original meaning, then increments the retry counter.
322
  """
323
+ prompt = render_prompt(
324
+ "broaden_query",
325
+ query=state["query"],
326
+ retrieval_query=state["retrieval_query"],
 
 
 
327
  )
328
  broadened = _extract_content(self._llm_chain.invoke(prompt))
329
  logger.info(
 
373
  prompt = self._build_prompt(
374
  state["query"], state["intent"], context, state["user_language"]
375
  )
376
+ _measure_tokens("generate_answer", prompt, enabled=self._token_budget_enabled)
377
  answer = _extract_content(self._llm_chain.invoke(prompt))
378
  logger.info("Generated answer for intent=%s", state["intent"].value)
379
  return {"answer": answer}
src/agent/token_budget.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Token counting and budget tracking for LLM prompts.
2
+
3
+ Stage 1 (current): measure-only — counts tokens at known prompt injection
4
+ points and logs them. No truncation is performed.
5
+
6
+ Token counts are estimates: tiktoken's cl100k tokenizer is used as a
7
+ provider-agnostic baseline, multiplied by a safety factor because non-OpenAI
8
+ multilingual tokenizers (Llama, Gemma, Mistral) typically tokenize Danish /
9
+ mixed-language text 20-40% more aggressively than cl100k.
10
+
11
+ Provider-specific tokenizers (Ollama's /api/tokenize, HuggingFace AutoTokenizer)
12
+ are intentionally not used here to keep this module dependency-free and
13
+ process-local. When real usage data exposes the gap, swap in a
14
+ provider-aware backend.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import logging
20
+
21
logger = logging.getLogger(__name__)

# Conservative scaling: cl100k under-counts multilingual / Danish text.
# 1.5× keeps us on the safe side for budget decisions.
_DEFAULT_SAFETY_FACTOR = 1.5

# Fallback when tiktoken is unavailable: ~4 characters per token is the
# common rule of thumb for English; multiplied by safety factor it's
# usable as a coarse upper bound.
_CHARS_PER_TOKEN_FALLBACK = 4

try:
    import tiktoken

    _ENCODER = tiktoken.get_encoding("cl100k_base")
except Exception:  # noqa: BLE001 — any tiktoken failure → heuristic
    # Deliberately broad: a missing package or a failure to obtain the
    # encoding must degrade to the heuristic, never break the import.
    _ENCODER = None
    logger.warning("tiktoken unavailable; falling back to character-based token estimation")


def count_tokens(text: str, *, safety_factor: float = _DEFAULT_SAFETY_FACTOR) -> int:
    """Estimate the token count of ``text``.

    Args:
        text: Text to measure. Empty / None-ish input returns 0.
        safety_factor: Multiplier applied to the raw count to compensate
            for non-OpenAI tokenizers being more aggressive on multilingual
            text. Defaults to 1.5×.

    Returns:
        Estimated token count, rounded up (ceiling) so the scaled estimate
        is never understated.
    """
    import math

    if not text:
        return 0
    if _ENCODER is not None:
        # disallowed_special=() makes special-token text encode as plain
        # text instead of raising.
        raw = len(_ENCODER.encode(text, disallowed_special=()))
    else:
        # Any non-empty text counts as at least one token.
        raw = max(1, len(text) // _CHARS_PER_TOKEN_FALLBACK)
    # Ceiling, not round-half-up: a fractional estimate must never round
    # down. Identical to the previous result for the default 1.5 factor.
    return math.ceil(raw * safety_factor)


def measure(
    prompt_name: str,
    text: str,
    *,
    enabled: bool = True,
    safety_factor: float = _DEFAULT_SAFETY_FACTOR,
) -> int:
    """Count tokens for ``text`` and log the result.

    Args:
        prompt_name: Logical name of the prompt being measured (used in
            log lines so different injection points are easy to grep).
        text: The fully-rendered prompt string.
        enabled: When False, returns 0 immediately and logs nothing —
            lets callers gate on the ``TOKEN_BUDGET_ENABLED`` flag without
            duplicating the check.
        safety_factor: See :func:`count_tokens`.

    Returns:
        Estimated token count, or 0 when ``enabled`` is False.
    """
    if not enabled:
        return 0
    count = count_tokens(text, safety_factor=safety_factor)
    logger.info("token_budget prompt=%s tokens~=%d", prompt_name, count)
    return count
src/agent/tools.py CHANGED
@@ -7,6 +7,7 @@ from dataclasses import dataclass, field
7
  from langchain_core.runnables import Runnable
8
  from langchain_core.tools import tool
9
 
 
10
  from src.models import QueryResult
11
  from src.retrieval.hybrid import HybridRetriever
12
  from src.retrieval.reranker import Reranker
@@ -121,14 +122,7 @@ def detect_document_languages(
121
  if not sample_text:
122
  return []
123
 
124
- prompt = (
125
- "You are a language detector. The text samples below come from "
126
- "different documents in a knowledge base. Identify ALL distinct "
127
- "languages present across the samples (do not list a language more "
128
- "than once). Reply with ONLY the language names in English, one per "
129
- "line, no explanation.\n\n"
130
- f"Samples:\n{sample_text}"
131
- )
132
  raw = _extract_content(llm.invoke(prompt))
133
 
134
  seen: set[str] = set()
@@ -414,12 +408,10 @@ def make_retrieval_tools(
414
  store.tool_calls.append(("multi_query_search", question))
415
 
416
  # Step 1: Ask LLM to decompose the question
417
- decompose_prompt = (
418
- "You are a search query planner. Given a complex question, "
419
- "decompose it into 2-4 simple, independent search queries that "
420
- f"together cover all aspects of the question. {_lang_clause}\n\n"
421
- "Reply with ONLY the queries, one per line, nothing else.\n\n"
422
- f"Question: {question}"
423
  )
424
  raw = _extract_content(llm_chain.invoke(decompose_prompt))
425
  sub_queries = [q.strip().lstrip("0123456789.-) ") for q in raw.splitlines() if q.strip()]
@@ -490,16 +482,10 @@ def make_retrieval_tools(
490
  if len(full_text) > max_chars:
491
  full_text = full_text[:max_chars] + "\n\n[... teksten er forkortet ... (text truncated)]"
492
 
493
- summary_prompt = (
494
- "Produce a structured summary of the following document. "
495
- "Include:\n"
496
- "1. Document title/topic\n"
497
- "2. Key points (3-7 bullet points)\n"
498
- "3. Important rules, deadlines, or requirements mentioned\n"
499
- "4. Who the document applies to\n\n"
500
- "Write the summary in the same language as the document.\n\n"
501
- f"Document ID: {document_id}\n\n"
502
- f"Document text:\n{full_text}"
503
  )
504
  summary = _extract_content(llm_chain.invoke(summary_prompt))
505
  return f"Resumé af {document_id}:\n\n{summary}"
 
7
  from langchain_core.runnables import Runnable
8
  from langchain_core.tools import tool
9
 
10
+ from src.agent.prompts import render_prompt
11
  from src.models import QueryResult
12
  from src.retrieval.hybrid import HybridRetriever
13
  from src.retrieval.reranker import Reranker
 
122
  if not sample_text:
123
  return []
124
 
125
+ prompt = render_prompt("detect_languages", sample_text=sample_text)
 
 
 
 
 
 
 
126
  raw = _extract_content(llm.invoke(prompt))
127
 
128
  seen: set[str] = set()
 
408
  store.tool_calls.append(("multi_query_search", question))
409
 
410
  # Step 1: Ask LLM to decompose the question
411
+ decompose_prompt = render_prompt(
412
+ "multi_query_decompose",
413
+ lang_clause=_lang_clause,
414
+ question=question,
 
 
415
  )
416
  raw = _extract_content(llm_chain.invoke(decompose_prompt))
417
  sub_queries = [q.strip().lstrip("0123456789.-) ") for q in raw.splitlines() if q.strip()]
 
482
  if len(full_text) > max_chars:
483
  full_text = full_text[:max_chars] + "\n\n[... teksten er forkortet ... (text truncated)]"
484
 
485
+ summary_prompt = render_prompt(
486
+ "summarize_document",
487
+ document_id=document_id,
488
+ full_text=full_text,
 
 
 
 
 
 
489
  )
490
  summary = _extract_content(llm_chain.invoke(summary_prompt))
491
  return f"Resumé af {document_id}:\n\n{summary}"
src/api/main.py CHANGED
@@ -9,7 +9,7 @@ from fastapi import FastAPI
9
  from langchain_core.output_parsers import StrOutputParser
10
 
11
  from src.config import load_settings
12
- from src.provider import create_llm, create_embeddings, create_reranker
13
  from src.retrieval.embedder import Embedder
14
  from src.retrieval.vector_store import VectorStore
15
  from src.retrieval.bm25_search import BM25Search
@@ -36,7 +36,17 @@ def create_app() -> FastAPI:
36
 
37
  logging.basicConfig(level=getattr(logging, settings.log_level, logging.INFO))
38
 
39
- llm = create_llm(settings)
 
 
 
 
 
 
 
 
 
 
40
  embeddings = create_embeddings(settings)
41
 
42
  embedder = Embedder(embeddings=embeddings)
@@ -83,6 +93,7 @@ def create_app() -> FastAPI:
83
  vector_store=vector_store,
84
  default_top_k=settings.top_k,
85
  memory=ConversationMemory(),
 
86
  )
87
  else:
88
  logger.info("Agent mode: pipeline (fixed DAG)")
@@ -94,6 +105,7 @@ def create_app() -> FastAPI:
94
  reranker=reranker,
95
  llm_chain=llm_chain,
96
  translate_query=settings.translate_query,
 
97
  )
98
 
99
  session_store = SessionStore(db_path=os.environ.get("SESSION_DB_PATH", "./data/sessions.db"))
 
9
  from langchain_core.output_parsers import StrOutputParser
10
 
11
  from src.config import load_settings
12
+ from src.provider import create_llm, create_llm_with_fallback, create_embeddings, create_reranker
13
  from src.retrieval.embedder import Embedder
14
  from src.retrieval.vector_store import VectorStore
15
  from src.retrieval.bm25_search import BM25Search
 
36
 
37
  logging.basicConfig(level=getattr(logging, settings.log_level, logging.INFO))
38
 
39
+ # React mode's ReAct sub-agent calls llm.bind_tools(...) internally, which
40
+ # RunnableWithFallbacks does not support. The fallback chain is therefore only
41
+ # applied in pipeline mode; in react mode we warn and use the primary only.
42
+ if settings.llm_fallback_enabled and settings.agent_mode == "react":
43
+ logger.warning(
44
+ "LLM_FALLBACK_ENABLED is set but AGENT_MODE=react; fallback chain "
45
+ "is incompatible with tool-calling and will be DISABLED for this run."
46
+ )
47
+ llm = create_llm(settings)
48
+ else:
49
+ llm = create_llm_with_fallback(settings)
50
  embeddings = create_embeddings(settings)
51
 
52
  embedder = Embedder(embeddings=embeddings)
 
93
  vector_store=vector_store,
94
  default_top_k=settings.top_k,
95
  memory=ConversationMemory(),
96
+ token_budget_enabled=settings.token_budget_enabled,
97
  )
98
  else:
99
  logger.info("Agent mode: pipeline (fixed DAG)")
 
105
  reranker=reranker,
106
  llm_chain=llm_chain,
107
  translate_query=settings.translate_query,
108
+ token_budget_enabled=settings.token_budget_enabled,
109
  )
110
 
111
  session_store = SessionStore(db_path=os.environ.get("SESSION_DB_PATH", "./data/sessions.db"))
src/config.py CHANGED
@@ -84,6 +84,18 @@ class Settings:
84
  # Agent mode: "pipeline" (fixed DAG) or "react" (tool-calling ReAct loop)
85
  agent_mode: str
86
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
  def _parse_bool(value: str, *, default: bool) -> bool:
89
  """Parse a boolean environment variable string.
@@ -180,4 +192,19 @@ def load_settings() -> Settings:
180
  # Agent mode: "pipeline" keeps the existing fixed DAG; "react" enables
181
  # the multi-step ReAct loop (requires an LLM with tool-calling support).
182
  agent_mode=os.environ.get("AGENT_MODE", "pipeline"),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
  )
 
84
  # Agent mode: "pipeline" (fixed DAG) or "react" (tool-calling ReAct loop)
85
  agent_mode: str
86
 
87
+ # Token budget (Stage 1: measure-only). When True, prompt sizes are
88
+ # logged at known injection points; truncation is NOT applied yet.
89
+ token_budget_enabled: bool
90
+
91
+ # LLM provider fallback. When enabled, the primary generation LLM is
92
+ # wrapped with LangChain's with_fallbacks across ``llm_fallback_providers``
93
+ # in order. DEFAULT OFF because an automatic switch from a local
94
+ # privacy-preserving provider (e.g. Ollama) to a cloud provider (e.g.
95
+ # OpenAI) has both cost and data-exfiltration implications.
96
+ llm_fallback_enabled: bool
97
+ llm_fallback_providers: tuple[str, ...]
98
+
99
 
100
  def _parse_bool(value: str, *, default: bool) -> bool:
101
  """Parse a boolean environment variable string.
 
192
  # Agent mode: "pipeline" keeps the existing fixed DAG; "react" enables
193
  # the multi-step ReAct loop (requires an LLM with tool-calling support).
194
  agent_mode=os.environ.get("AGENT_MODE", "pipeline"),
195
+
196
+ # Token budget — measure-only logging, off by default.
197
+ token_budget_enabled=_parse_bool(
198
+ os.environ.get("TOKEN_BUDGET_ENABLED", ""), default=False
199
+ ),
200
+
201
+ # LLM fallback chain — off by default for privacy / cost reasons.
202
+ llm_fallback_enabled=_parse_bool(
203
+ os.environ.get("LLM_FALLBACK_ENABLED", ""), default=False
204
+ ),
205
+ llm_fallback_providers=tuple(
206
+ p.strip().lower()
207
+ for p in os.environ.get("LLM_FALLBACK_PROVIDERS", "").split(",")
208
+ if p.strip()
209
+ ),
210
  )
src/provider.py CHANGED
@@ -110,6 +110,98 @@ def create_llm(settings: Settings) -> BaseChatModel:
110
  )
111
 
112
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  _EVALUATOR_MODEL_FIELD: dict[str, str] = {
114
  "groq": "groq_model",
115
  "openai": "openai_model",
 
110
  )
111
 
112
 
113
+ # Exceptions that engage the fallback chain. Set to the broad ``Exception``
114
+ # because real-world LLM SDK errors (openai.RateLimitError,
115
+ # openai.APIConnectionError, httpx.ConnectError, anthropic.APIError, ...)
116
+ # do NOT inherit from stdlib ``ConnectionError`` / ``TimeoutError`` / ``OSError``.
117
+ # A narrower set would silently let the most common transient failures bypass
118
+ # the fallback. Safety relies on three layers instead:
119
+ # 1. The whole feature is opt-in via ``LLM_FALLBACK_ENABLED`` (default off).
120
+ # 2. Every fallback activation logs a WARNING naming the destination provider.
121
+ # 3. Startup logs the full chain at WARNING with cost / privacy reminders.
122
+ _FALLBACK_EXCEPTIONS: tuple[type[BaseException], ...] = (Exception,)
123
+
124
+
125
def _wrap_with_fallback_logging(llm: BaseChatModel, provider: str) -> BaseChatModel:
    """Attach a start listener to ``llm`` that warns each time it is invoked.

    The listener runs only when the wrapped Runnable itself is invoked. For
    an entry in a fallback chain that means the primary (and every earlier
    fallback) has already failed, so the warning leaves operators a trail of
    when requests left the primary provider.

    Args:
        llm: The chat model to wrap.
        provider: Provider label interpolated into the log message.

    Returns:
        The result of ``llm.with_listeners(...)``, which delegates to ``llm``.
    """

    def _announce_activation(_run_obj, _config=None) -> None:  # noqa: ANN001
        logger.warning(
            "LLM fallback activated: routing request to provider '%s'. "
            "Check cost / privacy implications.",
            provider,
        )

    wrapped = llm.with_listeners(on_start=_announce_activation)
    return wrapped
150
+
151
+
152
def create_llm_with_fallback(settings: Settings) -> BaseChatModel:
    """Create the generation LLM, optionally wrapping it in a fallback chain.

    When ``settings.llm_fallback_enabled`` is False OR the fallback list is
    empty, this is a drop-in equivalent of :func:`create_llm`. Otherwise the
    primary LLM is wrapped via LangChain's ``with_fallbacks`` so that when
    the primary raises any exception covered by ``_FALLBACK_EXCEPTIONS``,
    each fallback provider is tried in order.

    Fallback providers whose construction raises are logged at ERROR and
    left out of the chain. If none can be constructed the primary is
    returned on its own.

    Args:
        settings: Application settings.

    Returns:
        The primary LLM on its own, or the object returned by
        ``primary.with_fallbacks(...)``.
    """
    primary = create_llm(settings)
    if not settings.llm_fallback_enabled or not settings.llm_fallback_providers:
        return primary

    fallbacks: list[BaseChatModel] = []
    # Providers that actually made it into the chain, so the startup log
    # below does not advertise providers that were skipped.
    active_providers: list[str] = []
    for provider in settings.llm_fallback_providers:
        try:
            fallback_settings = replace(settings, llm_provider=provider)
            raw = create_llm(fallback_settings)
        except Exception as exc:  # noqa: BLE001 — log and skip broken fallbacks
            logger.error(
                "Skipping LLM fallback provider '%s' due to construction error: %s",
                provider, exc,
            )
            continue
        fallbacks.append(_wrap_with_fallback_logging(raw, provider))
        active_providers.append(provider)

    if not fallbacks:
        logger.warning(
            "LLM_FALLBACK_ENABLED is true but no fallback providers could be "
            "constructed; running without fallback."
        )
        return primary

    chain_repr = " -> ".join([settings.llm_provider, *active_providers])
    logger.warning(
        "LLM fallback chain is ACTIVE: %s. "
        "On transient failure of the primary, requests will be routed to the "
        "next provider. This may incur API costs and send data to third-party "
        "providers.",
        chain_repr,
    )

    return primary.with_fallbacks(
        fallbacks, exceptions_to_handle=_FALLBACK_EXCEPTIONS
    )
203
+
204
+
205
  _EVALUATOR_MODEL_FIELD: dict[str, str] = {
206
  "groq": "groq_model",
207
  "openai": "openai_model",
tests/test_llm_fallback.py ADDED
@@ -0,0 +1,329 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for create_llm_with_fallback and the fallback chain runtime behaviour."""
2
+
3
+ from dataclasses import replace
4
+ from unittest.mock import patch
5
+
6
+ import pytest
7
+ from langchain_core.language_models.fake_chat_models import FakeListChatModel
8
+ from langchain_core.runnables import RunnableLambda
9
+
10
+ import src.provider as provider_module
11
+ from src.config import load_settings
12
+
13
+
14
def _base_settings():  # noqa: ANN202
    """Load a baseline Settings object for tests to customise via ``replace``."""
    baseline = load_settings()
    return baseline
17
+
18
+
19
def test_fallback_disabled_returns_plain_llm() -> None:
    """With the flag off, only the primary is built and it is returned unwrapped."""
    stub_llm = FakeListChatModel(responses=["hello"])
    disabled = replace(
        _base_settings(),
        llm_fallback_enabled=False,
        llm_fallback_providers=("openai",),
    )
    with patch.object(provider_module, "create_llm", return_value=stub_llm) as factory:
        built = provider_module.create_llm_with_fallback(disabled)
    # The configured fallback provider must not have been constructed.
    assert factory.call_count == 1
    # No with_fallbacks wrapper: the very same object comes back.
    assert built is stub_llm
32
+
33
+
34
def test_fallback_enabled_but_empty_list_returns_plain_llm() -> None:
    """An enabled flag with no providers listed behaves like the flag being off."""
    stub_llm = FakeListChatModel(responses=["hello"])
    no_providers = replace(
        _base_settings(),
        llm_fallback_enabled=True,
        llm_fallback_providers=(),
    )
    with patch.object(provider_module, "create_llm", return_value=stub_llm):
        built = provider_module.create_llm_with_fallback(no_providers)
    assert built is stub_llm
44
+
45
+
46
def test_fallback_chain_invokes_fallback_on_transient_error(caplog) -> None:  # noqa: ANN001
    """A ``ConnectionError`` from the primary is absorbed and the fallback answers."""
    import logging

    def _always_down(_x):  # noqa: ANN001, ANN202
        raise ConnectionError("down")

    failing_primary = RunnableLambda(_always_down)
    rescuer = FakeListChatModel(responses=["rescued"])
    cfg = replace(
        _base_settings(),
        llm_provider="primary_stub",
        llm_fallback_enabled=True,
        llm_fallback_providers=("fallback_stub",),
    )

    def fake_create(s):  # noqa: ANN001, ANN202
        return failing_primary if s.llm_provider == "primary_stub" else rescuer

    caplog.set_level(logging.WARNING, logger="src.provider")
    with patch.object(provider_module, "create_llm", side_effect=fake_create):
        chain = provider_module.create_llm_with_fallback(cfg)

    # Building the chain must announce it at startup.
    startup_seen = any("LLM fallback chain is ACTIVE" in rec.message for rec in caplog.records)
    assert startup_seen

    # The caller gets the fallback's answer without seeing the primary's error.
    # FakeListChatModel returns an AIMessage whose content is the response.
    reply = chain.invoke("hi")
    assert getattr(reply, "content", reply) == "rescued"

    # Using the fallback must itself have been logged.
    lowered = [rec.message.lower() for rec in caplog.records]
    assert any("fallback activated" in text for text in lowered)
78
+
79
+
80
def test_broken_fallback_provider_is_skipped(caplog) -> None:  # noqa: ANN001
    """A fallback whose construction raises is dropped; the rest of the chain survives.

    Besides checking the ERROR log, this verifies that exactly one fallback
    (the constructible one) ended up in the chain and that the chain still
    answers through its healthy primary.
    """
    import logging

    settings = replace(
        _base_settings(),
        llm_provider="primary_stub",
        llm_fallback_enabled=True,
        llm_fallback_providers=("broken", "good"),
    )
    primary = FakeListChatModel(responses=["primary"])
    good_fallback = FakeListChatModel(responses=["good"])

    def fake_create(s):  # noqa: ANN001, ANN202
        if s.llm_provider == "primary_stub":
            return primary
        if s.llm_provider == "broken":
            raise RuntimeError("cannot construct")
        return good_fallback

    caplog.set_level(logging.ERROR, logger="src.provider")
    with patch.object(provider_module, "create_llm", side_effect=fake_create):
        chain = provider_module.create_llm_with_fallback(settings)

    assert any("Skipping LLM fallback provider 'broken'" in r.message for r in caplog.records)
    # with_fallbacks wrapped the result, so it is no longer the bare primary.
    assert chain is not primary
    # Only the constructible provider may remain in the chain.
    assert len(chain.fallbacks) == 1
    # The chain is usable: a healthy primary still serves the request itself.
    assert chain.invoke("hi").content == "primary"
105
+
106
+
107
def test_streaming_pre_stream_failure_engages_fallback() -> None:
    """A primary that fails before its first token is replaced by the fallback stream.

    The primary raises as soon as it is called, so no token ever leaves it.
    ``with_fallbacks`` is expected to substitute the fallback, and the
    caller should observe a single clean stream.
    """

    def _primary_down(_x):  # noqa: ANN001, ANN202
        raise ConnectionError("primary down")

    # Raises on every call path — simulates a full outage of the primary.
    dead_primary = RunnableLambda(_primary_down)
    rescuer = FakeListChatModel(responses=["rescued"])
    cfg = replace(
        _base_settings(),
        llm_provider="primary_stub",
        llm_fallback_enabled=True,
        llm_fallback_providers=("fallback_stub",),
    )

    def fake_create(s):  # noqa: ANN001, ANN202
        if s.llm_provider == "primary_stub":
            return dead_primary
        return rescuer

    with patch.object(provider_module, "create_llm", side_effect=fake_create):
        chain = provider_module.create_llm_with_fallback(cfg)

    pieces = [getattr(piece, "content", str(piece)) for piece in chain.stream("hi")]
    # The concatenated chunks must equal the fallback's response exactly —
    # nothing duplicated or leaked from the primary.
    assert "".join(pieces) == "rescued"
140
+
141
+
142
def test_streaming_mid_stream_failure_is_not_caught_by_fallback() -> None:
    """An error raised after the first chunk propagates; no fallback takes over.

    Pins down a limitation of the fallback chain: once the primary has
    yielded a chunk, a later exception reaches the caller together with the
    partial output already delivered. The fallback is not consulted, so it
    must not be relied upon to cover mid-stream outages.
    """
    from langchain_core.messages import AIMessageChunk
    from langchain_core.runnables import Runnable

    class PartialThenFail(Runnable):
        """Runnable that streams one chunk and then dies."""

        def invoke(self, input, config=None, **kwargs):  # noqa: ANN001, A002
            raise ConnectionError("primary has no invoke")

        def stream(self, input, config=None, **kwargs):  # noqa: ANN001, A002
            yield AIMessageChunk(content="partial-")
            raise ConnectionError("mid-stream outage")

    flaky_primary = PartialThenFail()
    rescuer = FakeListChatModel(responses=["rescued"])
    cfg = replace(
        _base_settings(),
        llm_provider="primary_stub",
        llm_fallback_enabled=True,
        llm_fallback_providers=("fallback_stub",),
    )

    def fake_create(s):  # noqa: ANN001, ANN202
        if s.llm_provider == "primary_stub":
            return flaky_primary
        return rescuer

    with patch.object(provider_module, "create_llm", side_effect=fake_create):
        chain = provider_module.create_llm_with_fallback(cfg)

    received: list[str] = []
    with pytest.raises(ConnectionError):
        for piece in chain.stream("hi"):
            received.append(piece.content)

    # The partial token reached the caller before the failure bubbled up.
    assert received == ["partial-"]
187
+
188
+
189
def test_streaming_integration_with_query_router_uses_fallback(monkeypatch) -> None:  # noqa: ANN001
    """End-to-end: ``QueryRouter.route_stream`` survives a primary LLM outage.

    A fallback-wrapped LLM is piped through ``StrOutputParser`` and handed to
    a ``QueryRouter`` whose classifier, retriever and reranker are stubs. The
    streamed events must contain a ``done`` event carrying the fallback's
    answer.

    NOTE: ``monkeypatch`` is not used by the body; it is kept only so the
    test's signature stays unchanged.
    """
    from langchain_core.messages import AIMessage
    from langchain_core.output_parsers import StrOutputParser

    from src.agent.router import QueryRouter
    from src.models import DocumentChunk, IntentType, QueryResult

    # Build the fallback-wrapped LLM.
    settings = replace(
        _base_settings(),
        llm_provider="primary_stub",
        llm_fallback_enabled=True,
        llm_fallback_providers=("fallback_stub",),
    )
    primary = RunnableLambda(
        lambda _x: (_ for _ in ()).throw(ConnectionError("primary down"))
    )
    fallback = RunnableLambda(lambda _x: AIMessage(content="rescued answer"))

    def fake_create(s):  # noqa: ANN001, ANN202
        return primary if s.llm_provider == "primary_stub" else fallback

    with patch.object(provider_module, "create_llm", side_effect=fake_create):
        llm = provider_module.create_llm_with_fallback(settings)

    # Stub the intent classifier so we don't need to drive language detection.
    class _StubClassifier:
        def classify(self, _query: str) -> IntentType:
            return IntentType.FACTUAL

    # Stub retriever + reranker so they return a single fake result.
    chunk = DocumentChunk(
        chunk_id="c1",
        document_id="doc1.pdf",
        text="Sample context.",
        metadata={"page_number": 1, "chunk_index": 0},
    )
    fake_qr = QueryResult(chunk=chunk, score=0.9, source="dense")

    class _StubHybridResult:
        def __init__(self):  # noqa: ANN204
            self.dense_results = [fake_qr]
            self.sparse_results = [fake_qr]
            self.fused_results = [fake_qr]

    # Defined before _StubHybrid, which instantiates it.
    class _StubVectorStore:
        def list_document_ids(self) -> list[str]:
            return []

    class _StubHybrid:
        def __init__(self):  # noqa: ANN204
            self.vector_store = _StubVectorStore()

        def search_detailed(self, _q: str, top_k: int = 5) -> _StubHybridResult:
            return _StubHybridResult()

    class _StubReranker:
        def rerank(self, _q: str, results, top_k: int = 5):  # noqa: ANN001, ANN201
            return list(results)[:top_k]

    router = QueryRouter(
        intent_classifier=_StubClassifier(),
        hybrid_retriever=_StubHybrid(),
        reranker=_StubReranker(),
        llm_chain=llm | StrOutputParser(),
        translate_query=False,
        document_languages=["Danish"],
    )

    events = list(router.route_stream("Hvor mange feriedage?", top_k=3))
    done = [e for e in events if e["step"] == "done"]
    assert done, f"expected a 'done' event, got steps={[e['step'] for e in events]}"
    assert done[0]["result"]["answer"] == "rescued answer"
269
+
270
+
271
def test_fallback_engages_on_sdk_style_exception() -> None:
    """The fallback must trigger for any ``Exception`` subclass.

    Errors raised by real LLM SDKs (openai.RateLimitError, httpx.ConnectError,
    and so on) are unrelated to the stdlib ConnectionError / TimeoutError /
    OSError hierarchy. If the chain handled only those stdlib types it would
    silently do nothing for SDK errors. A custom exception with no
    relationship to ConnectionError stands in for them here.
    """

    class FakeRateLimitError(Exception):
        """Stand-in for openai.RateLimitError / anthropic.APIError."""

    def _rate_limited(_x):  # noqa: ANN001, ANN202
        raise FakeRateLimitError("429 Too Many Requests")

    throttled_primary = RunnableLambda(_rate_limited)
    rescuer = FakeListChatModel(responses=["rescued"])
    cfg = replace(
        _base_settings(),
        llm_provider="primary_stub",
        llm_fallback_enabled=True,
        llm_fallback_providers=("fallback_stub",),
    )

    def fake_create(s):  # noqa: ANN001, ANN202
        if s.llm_provider == "primary_stub":
            return throttled_primary
        return rescuer

    with patch.object(provider_module, "create_llm", side_effect=fake_create):
        chain = provider_module.create_llm_with_fallback(cfg)

    reply = chain.invoke("hi")
    assert getattr(reply, "content", reply) == "rescued"
304
+
305
+
306
def test_all_fallbacks_broken_returns_primary_only(caplog) -> None:  # noqa: ANN001
    """If no fallback can be constructed, the bare primary is returned with a warning."""
    import logging

    sole_primary = FakeListChatModel(responses=["primary"])
    cfg = replace(
        _base_settings(),
        llm_provider="primary_stub",
        llm_fallback_enabled=True,
        llm_fallback_providers=("broken",),
    )

    def fake_create(s):  # noqa: ANN001, ANN202
        if s.llm_provider != "primary_stub":
            raise RuntimeError("nope")
        return sole_primary

    caplog.set_level(logging.WARNING, logger="src.provider")
    with patch.object(provider_module, "create_llm", side_effect=fake_create):
        built = provider_module.create_llm_with_fallback(cfg)

    assert built is sole_primary
    logged = [rec.message for rec in caplog.records]
    assert any("no fallback providers could be constructed" in text for text in logged)
tests/test_prompts_registry.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Snapshot tests ensuring YAML prompts render byte-identical to the legacy inline strings."""
2
+
3
+ from src.agent.prompts import registry as _reg
4
+ from src.agent.prompts import get_prompt, render_prompt
5
+
6
+
7
def setup_module(module) -> None:  # noqa: ANN001
    """Reload the prompt registry before the snapshot tests in this module run."""
    reload_registry = _reg.reload
    reload_registry()
10
+
11
+
12
+ # Golden literals — frozen copies of the pre-migration strings. Any future
13
+ # change to the YAML must preserve these byte-for-byte. Do NOT replace these
14
+ # with imports from the source modules; that would make the test tautological
15
+ # after the source modules are migrated to read from the registry.
16
+
17
+ _GOLDEN_INTENT_CLASSIFY = (
18
+ "You are an intent classifier. Given a user query, classify it into exactly "
19
+ "one of the following categories: factual, summary, comparison, procedural, unknown.\n\n"
20
+ "- factual: the user asks for a specific fact or piece of information.\n"
21
+ "- summary: the user wants a summary or overview of a topic.\n"
22
+ "- comparison: the user wants to compare two or more things.\n"
23
+ "- procedural: the user asks how to do something step by step.\n"
24
+ "- unknown: the query does not fit any of the above.\n\n"
25
+ "Respond with ONLY the category name in lowercase, nothing else."
26
+ )
27
+
28
+ _GOLDEN_PLANNER = (
29
+ "You are a planning assistant for the University of Copenhagen (KU) document system.\n\n"
30
+ "Given a user question, produce a JSON list of 1–4 steps needed to answer it.\n"
31
+ "Each step is an object with:\n"
32
+ ' - "action": one of "search", "search_within", "multi_search", '
33
+ '"summarize", "list_docs", "fetch_doc"\n'
34
+ ' - "detail": a short description of what to do (e.g. the search query, document ID)\n\n'
35
+ "Rules:\n"
36
+ "- IMPORTANT: Most questions probably only need 1 step. Only use 2+ steps when the question explicitly asks about multiple distinct topics.\n"
37
+ "- For simple factual questions: 1 search step is enough.\n"
38
+ "- For comparison questions: use multi_search or separate search steps.\n"
39
+ "- For document overview requests: use summarize.\n"
40
+ "- For questions with multiple aspects: use 2–4 separate steps.\n"
41
+ "- Always end with the steps needed; do NOT include a final 'answer' step.\n\n"
42
+ "Reply with ONLY the JSON array, nothing else. No explanation, no thinking.\n\n"
43
+ "Examples:\n"
44
+ 'Question: "What is the exam policy?"\n'
45
+ '[{"action": "search", "detail": "KU eksamensregler"}]\n\n'
46
+ 'Question: "Compare vacation rules for academic vs administrative staff"\n'
47
+ '[{"action": "search", "detail": "ferieregler videnskabeligt personale"}, '
48
+ '{"action": "search", "detail": "ferieregler administrativt personale"}]\n\n'
49
+ 'Question: "Summarize the AI policy document"\n'
50
+ '[{"action": "summarize", "detail": "ku_ai_policy.pdf"}]\n\n'
51
+ 'Question: "Which documents are about AI? Summarize and find the rules for written exams"\n'
52
+ '[{"action": "list_docs", "detail": "list all available documents"}, '
53
+ '{"action": "search", "detail": "AI dokumenter KU"}, '
54
+ '{"action": "search", "detail": "regler skriftlige opgaver eksamen GAI"}]\n\n'
55
+ "Now plan for this question:\n"
56
+ )
57
+
58
+ _GOLDEN_EXECUTOR_SYSTEM = (
59
+ "/no_think\n"
60
+ "You are executing ONE step of a plan to answer a user's question about "
61
+ "University of Copenhagen (KU) documents.\n\n"
62
+ "You have retrieval tools available. Execute the step described below, "
63
+ "then summarise what you found in 2-3 sentences. If you find nothing "
64
+ "relevant, say so clearly.\n\n"
65
+ "Do NOT produce a final answer — just report what you found for this step."
66
+ )
67
+
68
+ _GOLDEN_SYNTHESIZER = (
69
+ "You are a helpful assistant for administrative staff at the University "
70
+ "of Copenhagen (KU).\n\n"
71
+ "Below are the results gathered from multiple research steps. "
72
+ "Synthesize them into a single coherent answer to the user's original question.\n\n"
73
+ "Guidelines:\n"
74
+ "- Cite document sources using [1], [2], etc.\n"
75
+ "- Answer in the same language as the user's question.\n"
76
+ "- Be concise but thorough.\n"
77
+ "- If some steps found no results, acknowledge gaps honestly.\n\n"
78
+ )
79
+
80
+
81
def test_intent_classify_matches_golden() -> None:
    """The ``intent_classify`` template equals its frozen golden literal."""
    loaded = get_prompt("intent_classify")
    assert loaded.template == _GOLDEN_INTENT_CLASSIFY
83
+
84
+
85
def test_planner_matches_golden() -> None:
    """The ``planner`` template equals its frozen golden literal."""
    loaded = get_prompt("planner")
    assert loaded.template == _GOLDEN_PLANNER
87
+
88
+
89
def test_executor_system_matches_golden() -> None:
    """The ``executor_system`` template equals its frozen golden literal."""
    loaded = get_prompt("executor_system")
    assert loaded.template == _GOLDEN_EXECUTOR_SYSTEM
91
+
92
+
93
def test_synthesizer_matches_golden() -> None:
    """The ``synthesizer`` template equals its frozen golden literal."""
    loaded = get_prompt("synthesizer")
    assert loaded.template == _GOLDEN_SYNTHESIZER
95
+
96
+
97
def test_detect_language_and_intent_renders_identically() -> None:
    """Rendering ``detect_language_and_intent`` yields the exact expected text."""
    intents = "factual, summary, comparison, procedural, unknown"
    user_query = "Hvor mange feriedage har jeg?"

    # One entry per output line; empty strings are the blank separator lines.
    wanted_lines = [
        "You are given a user query. Do TWO things:",
        "1. Detect the language of the query (reply with the language name in English, "
        "e.g. 'Danish', 'English', 'German', 'Chinese', 'Japanese').",
        f"2. Classify the intent into exactly one of: {intents}.",
        "",
        "Reply with EXACTLY two lines, nothing else:",
        "language: <language>",
        "intent: <intent>",
        "",
        f"Query: {user_query}",
    ]

    actual = render_prompt(
        "detect_language_and_intent",
        valid_intents=intents,
        query=user_query,
    )
    assert actual == "\n".join(wanted_lines)
115
+
116
+
117
def test_translate_query_renders_identically() -> None:
    """Rendering ``translate_query`` yields the exact expected text."""
    language = "Danish"
    text = "How many vacation days do I have?"

    instruction = (
        f"Translate the following text to {language}. "
        "Reply with ONLY the translated text, nothing else."
    )
    wanted = f"{instruction}\n\nText: {text}"

    actual = render_prompt("translate_query", target=language, query=text)
    assert actual == wanted
126
+
127
+
128
def test_broaden_query_renders_identically() -> None:
    """Rendering ``broaden_query`` yields the exact expected text."""
    original_question = "exam rules"
    failed_search = "eksamensregler"

    preamble = (
        "The following search query did not return good results from "
        "the document database. Rewrite it to be broader or use "
        "different keywords while keeping the same meaning. "
        "Reply with ONLY the rewritten query, nothing else."
    )
    wanted = (
        f"{preamble}\n\n"
        f"Original question: {original_question}\n"
        f"Failed search query: {failed_search}"
    )

    actual = render_prompt(
        "broaden_query",
        query=original_question,
        retrieval_query=failed_search,
    )
    assert actual == wanted
140
+
141
+
142
def test_detect_languages_renders_identically() -> None:
    """Rendering ``detect_languages`` yields the exact expected text."""
    # Two samples separated by the "---" divider line.
    samples = "\n---\n".join(("Dette er en test.", "This is a test."))

    instructions = (
        "You are a language detector. The text samples below come from "
        "different documents in a knowledge base. Identify ALL distinct "
        "languages present across the samples (do not list a language more "
        "than once). Reply with ONLY the language names in English, one per "
        "line, no explanation."
    )
    wanted = instructions + "\n\n" + "Samples:\n" + samples

    actual = render_prompt("detect_languages", sample_text=samples)
    assert actual == wanted
153
+
154
+
155
def test_multi_query_decompose_renders_identically() -> None:
    """Rendering ``multi_query_decompose`` yields the exact expected text."""
    clause = "The queries should be in Danish (the document base is Danish)."
    complex_question = "Compare rules between master and bachelor exams."

    # The prompt is three paragraphs separated by blank lines.
    paragraphs = (
        "You are a search query planner. Given a complex question, "
        "decompose it into 2-4 simple, independent search queries that "
        f"together cover all aspects of the question. {clause}",
        "Reply with ONLY the queries, one per line, nothing else.",
        f"Question: {complex_question}",
    )
    wanted = "\n\n".join(paragraphs)

    actual = render_prompt(
        "multi_query_decompose",
        lang_clause=clause,
        question=complex_question,
    )
    assert actual == wanted
169
+
170
+
171
def test_summarize_document_renders_identically() -> None:
    """Rendering ``summarize_document`` yields the exact expected text."""
    doc_id = "ku_ai_policy.pdf"
    body = "Section 1.\n\nSection 2."

    # First paragraph: the instruction plus its numbered checklist.
    checklist = "\n".join(
        (
            "Produce a structured summary of the following document. Include:",
            "1. Document title/topic",
            "2. Key points (3-7 bullet points)",
            "3. Important rules, deadlines, or requirements mentioned",
            "4. Who the document applies to",
        )
    )
    wanted = "\n\n".join(
        (
            checklist,
            "Write the summary in the same language as the document.",
            f"Document ID: {doc_id}",
            f"Document text:\n{body}",
        )
    )

    actual = render_prompt("summarize_document", document_id=doc_id, full_text=body)
    assert actual == wanted
188
+
189
+
190
def test_registry_raises_on_unknown_prompt() -> None:
    """Looking up a prompt name that is not registered raises ``KeyError``."""
    import pytest

    missing_name = "does_not_exist"
    with pytest.raises(KeyError):
        get_prompt(missing_name)
194
+
195
+
196
def test_registry_raises_on_unknown_version() -> None:
    """Requesting an unknown version of a known prompt raises ``KeyError``."""
    import pytest

    missing_version = "v999"
    with pytest.raises(KeyError):
        get_prompt("intent_classify", version=missing_version)
tests/test_token_budget.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for the token_budget measurement helper."""
2
+
3
+ import logging
4
+
5
+ from src.agent.token_budget import count_tokens, measure
6
+
7
+
8
def test_count_tokens_empty_returns_zero() -> None:
    """An empty string is counted as zero tokens."""
    empty_count = count_tokens("")
    assert empty_count == 0
10
+
11
+
12
def test_count_tokens_scales_with_safety_factor() -> None:
    """Doubling ``safety_factor`` roughly doubles the reported count."""
    sentence = "Hello world, this is a small test sentence."
    baseline = count_tokens(sentence, safety_factor=1.0)
    doubled = count_tokens(sentence, safety_factor=2.0)

    assert baseline > 0
    # Rounding may shift the scaled count by at most one unit either way.
    drift = doubled - baseline * 2
    assert -1 <= drift <= 1
19
+
20
+
21
def test_count_tokens_handles_multilingual() -> None:
    """Non-English text (Danish, Chinese) yields a positive token count."""
    samples = (
        "Hvad er reglerne for eksamen på Københavns Universitet?",
        "学生考试规则是什么?",
    )
    for text in samples:
        assert count_tokens(text) > 0
26
+
27
+
28
def test_measure_disabled_returns_zero_and_no_log(caplog) -> None:  # noqa: ANN001
    """With ``enabled=False``, ``measure`` returns 0 and logs no token_budget line."""
    with caplog.at_level(logging.INFO, logger="src.agent.token_budget"):
        count = measure("planner", "some prompt text", enabled=False)

    assert count == 0
    # Captured records remain available after the at_level context exits.
    captured = [record.message for record in caplog.records]
    assert all("token_budget" not in message for message in captured)
33
+
34
+
35
def test_measure_enabled_logs_and_returns_count(caplog) -> None:  # noqa: ANN001
    """With ``enabled=True``, ``measure`` returns a positive count and logs it."""
    with caplog.at_level(logging.INFO, logger="src.agent.token_budget"):
        count = measure("planner", "Hello world", enabled=True)

    assert count > 0
    # At least one captured message must mention both the tag and the label.
    matching = [
        record
        for record in caplog.records
        if "token_budget" in record.message and "planner" in record.message
    ]
    assert matching