awellis committed on
Commit
cff57a5
·
1 Parent(s): ceb7cfa

Update README and enhance query processing with logging and error handling

Browse files

- Updated README to reflect changes in model names and configuration options.
- Added detailed logging for query processing, document retrieval, and email drafting in `app_simple.py`.
- Improved error handling in email composition and retrieval processes.
- Introduced a new `QueryRewriter` class for enhanced query rewriting capabilities.
- Added test scripts for debugging query processing and retrieval.

README.md CHANGED
@@ -23,7 +23,7 @@ This system helps administrative staff compose accurate, professional email resp
23
  - **In-Memory Document Store**: No Docker required
24
  - **PydanticAI**: Multi-agent orchestration with structured outputs (optional)
25
  - **Gradio**: Clean, light web interface
26
- - **OpenAI GPT-5-nano**: Fast language model
27
 
28
  ## Two Versions Available
29
 
@@ -176,7 +176,7 @@ The app is configured for automatic deployment to Hugging Face Spaces via `app.p
176
  Key configuration options in `.env`:
177
 
178
  ### LLM Configuration
179
- - `LLM_MODEL`: OpenAI model (default: gpt-4o)
180
  - `EMBEDDING_MODEL`: Embedding model (default: text-embedding-3-small)
181
  - `LLM_TEMPERATURE`: Temperature for generation (0-1)
182
  - `LLM_MAX_TOKENS`: Maximum tokens per response
 
23
  - **In-Memory Document Store**: No Docker required
24
  - **PydanticAI**: Multi-agent orchestration with structured outputs (optional)
25
  - **Gradio**: Clean, light web interface
26
+ - **OpenAI GPT-5-mini**: Reliable language model (upgraded from gpt-5-nano)
27
 
28
  ## Two Versions Available
29
 
 
176
  Key configuration options in `.env`:
177
 
178
  ### LLM Configuration
179
+ - `LLM_MODEL`: OpenAI model (default: gpt-4o, recommended: gpt-5-mini)
180
  - `EMBEDDING_MODEL`: Embedding model (default: text-embedding-3-small)
181
  - `LLM_TEMPERATURE`: Temperature for generation (0-1)
182
  - `LLM_MAX_TOKENS`: Maximum tokens per response
app_simple.py CHANGED
@@ -143,14 +143,32 @@ class SimpleFastAssistant:
143
  import time
144
  start = time.time()
145
 
 
 
146
  # Retrieve documents
147
  docs = self.retriever.retrieve(query)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
 
149
- # Build context
150
- context = "\n\n".join([
151
- f"Document {i+1} (from {doc.meta.get('source_file', 'unknown')}):\n{doc.content}"
152
- for i, doc in enumerate(docs[:3])
153
- ])
154
 
155
  # Single LLM call
156
  system_prompt = """You are an email assistant for BFH (Bern University of Applied Sciences) administrative staff.
@@ -176,6 +194,16 @@ Context from knowledge base:
176
 
177
  Compose a professional email response."""
178
 
 
 
 
 
 
 
 
 
 
 
179
  # GPT-5 uses max_completion_tokens instead of max_tokens
180
  completion_params = {
181
  "model": self.config.llm.model_name,
@@ -189,19 +217,45 @@ Compose a professional email response."""
189
  if "gpt-5" in self.config.llm.model_name:
190
  completion_params["max_completion_tokens"] = self.config.llm.max_tokens
191
  # GPT-5-nano only supports temperature=1 (default), so don't set it
 
192
  else:
193
  completion_params["max_tokens"] = self.config.llm.max_tokens
194
  completion_params["temperature"] = self.config.llm.temperature
 
 
 
195
 
196
- response = self.client.chat.completions.create(**completion_params)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
 
198
- email = response.choices[0].message.content
199
  elapsed = time.time() - start
200
 
201
  # Parse subject and body
202
- lines = email.split('\n')
203
- subject = lines[0].replace('Subject:', '').strip() if lines else "Response"
204
- body = '\n'.join(lines[1:]).strip() if len(lines) > 1 else email
 
 
 
 
 
 
205
 
206
  stats = f"**Time:** {elapsed:.1f}s | **Docs:** {len(docs)} | **Model:** {self.config.llm.model_name}"
207
 
@@ -218,6 +272,17 @@ Compose a professional email response."""
218
  'explanation': self._explain_relevance(query, doc, i + 1)
219
  })
220
 
 
 
 
 
 
 
 
 
 
 
 
221
  return subject, body, stats, chunks_info
222
 
223
 
@@ -237,10 +302,16 @@ class QueryProcessor:
237
  if not query or not query.strip():
238
  return "", "", "", ""
239
 
240
- subject, body, stats, chunks_info = self.assistant.process_query(query)
241
- chunks_html = self.formatter.format_chunks_html(chunks_info)
242
-
243
- return subject, body, stats, chunks_html
 
 
 
 
 
 
244
 
245
 
246
  # ============================================================================
 
143
  import time
144
  start = time.time()
145
 
146
+ logger.info(f"[DEBUG] Processing query: '{query}'")
147
+
148
  # Retrieve documents
149
  docs = self.retriever.retrieve(query)
150
+ logger.info(f"[DEBUG] Retrieved {len(docs)} documents")
151
+
152
+ if len(docs) == 0:
153
+ logger.warning(f"[DEBUG] No documents retrieved for query: '{query}'")
154
+
155
+ # Build context - limit for GPT-5-nano's smaller context window
156
+ # GPT-5-nano has limited capacity, so we need to be more conservative
157
+ max_docs = 2 if "gpt-5" in self.config.llm.model_name else 3
158
+ max_chars_per_doc = 800 if "gpt-5" in self.config.llm.model_name else 1500
159
+
160
+ context_parts = []
161
+ for i, doc in enumerate(docs[:max_docs]):
162
+ doc_content = doc.content
163
+ if len(doc_content) > max_chars_per_doc:
164
+ doc_content = doc_content[:max_chars_per_doc] + "..."
165
+ logger.debug(f"[DEBUG] Truncated document {i+1} from {len(doc.content)} to {max_chars_per_doc} chars")
166
+
167
+ context_parts.append(f"Document {i+1} (from {doc.meta.get('source_file', 'unknown')}):\n{doc_content}")
168
 
169
+ context = "\n\n".join(context_parts)
170
+
171
+ logger.info(f"[DEBUG] Context length: {len(context)} chars (using {len(context_parts)} of {len(docs)} docs)")
 
 
172
 
173
  # Single LLM call
174
  system_prompt = """You are an email assistant for BFH (Bern University of Applied Sciences) administrative staff.
 
194
 
195
  Compose a professional email response."""
196
 
197
+ logger.info(f"[DEBUG] User prompt length: {len(user_prompt)} chars")
198
+ logger.info(f"[DEBUG] System prompt length: {len(system_prompt)} chars")
199
+ logger.info(f"[DEBUG] Total prompt length: {len(system_prompt) + len(user_prompt)} chars")
200
+ logger.debug(f"[DEBUG] User prompt preview: {user_prompt[:500]}...")
201
+
202
+ # Check if prompt might be too long for gpt-5-nano
203
+ total_chars = len(system_prompt) + len(user_prompt)
204
+ if "gpt-5" in self.config.llm.model_name and total_chars > 8000:
205
+ logger.warning(f"[DEBUG] Prompt may be too long for gpt-5-nano ({total_chars} chars). Consider using fewer documents or a different model.")
206
+
207
  # GPT-5 uses max_completion_tokens instead of max_tokens
208
  completion_params = {
209
  "model": self.config.llm.model_name,
 
217
  if "gpt-5" in self.config.llm.model_name:
218
  completion_params["max_completion_tokens"] = self.config.llm.max_tokens
219
  # GPT-5-nano only supports temperature=1 (default), so don't set it
220
+ logger.info(f"[DEBUG] Using GPT-5 parameters: max_completion_tokens={self.config.llm.max_tokens}")
221
  else:
222
  completion_params["max_tokens"] = self.config.llm.max_tokens
223
  completion_params["temperature"] = self.config.llm.temperature
224
+ logger.info(f"[DEBUG] Using standard parameters: max_tokens={self.config.llm.max_tokens}, temp={self.config.llm.temperature}")
225
+
226
+ logger.info(f"[DEBUG] Calling LLM with model: {self.config.llm.model_name}")
227
 
228
+ try:
229
+ response = self.client.chat.completions.create(**completion_params)
230
+ email = response.choices[0].message.content
231
+
232
+ # Check for null/empty response
233
+ if email is None or email.strip() == "":
234
+ logger.error(f"[DEBUG] LLM returned null or empty response!")
235
+ logger.error(f"[DEBUG] Full response: {response}")
236
+ email = "Error: The model returned an empty response. Please try again."
237
+ else:
238
+ logger.info(f"[DEBUG] LLM response received: {len(email)} chars")
239
+ logger.debug(f"[DEBUG] LLM response preview: {email[:300]}...")
240
+
241
+ except Exception as e:
242
+ logger.error(f"[DEBUG] LLM call failed: {e}")
243
+ import traceback
244
+ traceback.print_exc()
245
+ raise
246
 
 
247
  elapsed = time.time() - start
248
 
249
  # Parse subject and body
250
+ if email and email.strip():
251
+ lines = email.split('\n')
252
+ subject = lines[0].replace('Subject:', '').strip() if lines else "Response"
253
+ body = '\n'.join(lines[1:]).strip() if len(lines) > 1 else email
254
+ else:
255
+ subject = "Error"
256
+ body = "No response generated from the model."
257
+
258
+ logger.info(f"[DEBUG] Parsed email - Subject: '{subject[:50] if len(subject) > 50 else subject}', Body: {len(body)} chars")
259
 
260
  stats = f"**Time:** {elapsed:.1f}s | **Docs:** {len(docs)} | **Model:** {self.config.llm.model_name}"
261
 
 
272
  'explanation': self._explain_relevance(query, doc, i + 1)
273
  })
274
 
275
+ # Ensure we return non-empty strings
276
+ if not subject or subject.strip() == "":
277
+ subject = "(No subject generated)"
278
+ logger.warning("[DEBUG] Subject was empty, using fallback")
279
+
280
+ if not body or body.strip() == "":
281
+ body = "(No email body generated - please check logs for errors)"
282
+ logger.warning("[DEBUG] Body was empty, using fallback")
283
+
284
+ logger.info(f"[DEBUG] Returning - Subject: '{subject[:30]}...', Body length: {len(body)}, Chunks: {len(chunks_info)}")
285
+
286
  return subject, body, stats, chunks_info
287
 
288
 
 
302
  if not query or not query.strip():
303
  return "", "", "", ""
304
 
305
+ try:
306
+ subject, body, stats, chunks_info = self.assistant.process_query(query)
307
+ chunks_html = self.formatter.format_chunks_html(chunks_info)
308
+ return subject, body, stats, chunks_html
309
+ except Exception as e:
310
+ logger.error(f"[ERROR] Failed to process query: {e}")
311
+ import traceback
312
+ traceback.print_exc()
313
+ error_msg = f"Error processing query: {str(e)}"
314
+ return "Error", error_msg, f"**Status:** Failed", f"<p style='color:red'>{error_msg}</p>"
315
 
316
 
317
  # ============================================================================
src/agents/composer_agent.py CHANGED
@@ -121,12 +121,45 @@ Based on this information, compose a complete email response that addresses the
121
 
122
  except Exception as e:
123
  logger.error(f"Error composing email: {e}")
124
- # Return minimal draft on error
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  return EmailDraft(
126
- subject="Ihre Anfrage / Your Inquiry",
127
- body="Vielen Dank für Ihre Anfrage. Wir werden uns in Kürze bei Ihnen melden.\n\nThank you for your inquiry. We will get back to you shortly.",
128
  tone="professional",
129
- confidence=0.0,
130
  )
131
 
132
  def _build_context(self, documents: List[Document]) -> str:
@@ -140,7 +173,14 @@ Based on this information, compose a complete email response that addresses the
140
  Formatted context text
141
  """
142
  if not documents:
143
- return "No relevant documents found in the knowledge base."
 
 
 
 
 
 
 
144
 
145
  context_parts = []
146
  for i, doc in enumerate(documents, 1):
 
121
 
122
  except Exception as e:
123
  logger.error(f"Error composing email: {e}")
124
+ # Return minimal draft on error based on detected language
125
+ language = intent.language if intent and hasattr(intent, 'language') else "de"
126
+
127
+ if language == "de":
128
+ subject = "Ihre Anfrage zur Studienadministration"
129
+ body = """Guten Tag,
130
+
131
+ Vielen Dank für Ihre Anfrage.
132
+
133
+ Leider konnte ich zu Ihrer spezifischen Frage keine detaillierten Informationen in unserer Wissensdatenbank finden. Ich empfehle Ihnen, sich direkt an die Studienadministration der BFH zu wenden:
134
+
135
+ - Website: https://www.bfh.ch
136
+ - Studienadministration: Kontaktinformationen finden Sie auf der BFH-Website
137
+
138
+ Die Mitarbeitenden können Ihnen bei Ihrer Anfrage persönlich weiterhelfen.
139
+
140
+ Freundliche Grüsse
141
+ BFH Studienadministration"""
142
+ else:
143
+ subject = "Your Inquiry to Student Administration"
144
+ body = """Hello,
145
+
146
+ Thank you for your inquiry.
147
+
148
+ Unfortunately, I could not find detailed information regarding your specific question in our knowledge base. I recommend contacting BFH Student Administration directly:
149
+
150
+ - Website: https://www.bfh.ch
151
+ - Student Administration: Contact information available on the BFH website
152
+
153
+ The staff will be able to assist you personally with your inquiry.
154
+
155
+ Best regards
156
+ BFH Student Administration"""
157
+
158
  return EmailDraft(
159
+ subject=subject,
160
+ body=body,
161
  tone="professional",
162
+ confidence=0.1,
163
  )
164
 
165
  def _build_context(self, documents: List[Document]) -> str:
 
173
  Formatted context text
174
  """
175
  if not documents:
176
+ return """No relevant documents found in the knowledge base.
177
+
178
+ IMPORTANT: Since no specific documentation was found, you should:
179
+ 1. Acknowledge the user's query professionally
180
+ 2. Provide general guidance if you can infer the topic
181
+ 3. Direct them to contact the appropriate administrative office
182
+ 4. Suggest checking the official BFH website for more information
183
+ 5. Do NOT make up specific procedures, deadlines, or requirements"""
184
 
185
  context_parts = []
186
  for i, doc in enumerate(documents, 1):
src/config.py CHANGED
@@ -88,7 +88,7 @@ class RetrievalConfig:
88
  top_k: int = 5 # Number of documents to retrieve
89
  bm25_weight: float = 0.5 # Weight for BM25 score
90
  vector_weight: float = 0.5 # Weight for vector similarity score
91
- min_score: float = 0.3 # Minimum relevance score threshold
92
 
93
  @classmethod
94
  def from_env(cls) -> "RetrievalConfig":
@@ -97,7 +97,7 @@ class RetrievalConfig:
97
  top_k=int(os.getenv("RETRIEVAL_TOP_K", "5")),
98
  bm25_weight=float(os.getenv("BM25_WEIGHT", "0.5")),
99
  vector_weight=float(os.getenv("VECTOR_WEIGHT", "0.5")),
100
- min_score=float(os.getenv("MIN_RELEVANCE_SCORE", "0.3")),
101
  )
102
 
103
 
 
88
  top_k: int = 5 # Number of documents to retrieve
89
  bm25_weight: float = 0.5 # Weight for BM25 score
90
  vector_weight: float = 0.5 # Weight for vector similarity score
91
+ min_score: float = 0.1 # Minimum relevance score threshold (lowered to be more permissive)
92
 
93
  @classmethod
94
  def from_env(cls) -> "RetrievalConfig":
 
97
  top_k=int(os.getenv("RETRIEVAL_TOP_K", "5")),
98
  bm25_weight=float(os.getenv("BM25_WEIGHT", "0.5")),
99
  vector_weight=float(os.getenv("VECTOR_WEIGHT", "0.5")),
100
+ min_score=float(os.getenv("MIN_RELEVANCE_SCORE", "0.1")),
101
  )
102
 
103
 
src/pipeline/orchestrator.py CHANGED
@@ -83,11 +83,13 @@ class RAGOrchestrator:
83
  logger.info("Step 1: Extracting intent...")
84
  intent = await self.intent_agent.extract_intent(query)
85
 
86
- # Step 2: Retrieve relevant documents
87
  logger.info("Step 2: Retrieving relevant documents...")
88
- retrieved_docs = self.retriever.retrieve(query)
89
 
90
  logger.info(f"Retrieved {len(retrieved_docs)} documents")
 
 
91
 
92
  # Step 3: Compose email draft
93
  logger.info("Step 3: Composing email draft...")
 
83
  logger.info("Step 1: Extracting intent...")
84
  intent = await self.intent_agent.extract_intent(query)
85
 
86
+ # Step 2: Retrieve relevant documents (with query rewriting)
87
  logger.info("Step 2: Retrieving relevant documents...")
88
+ retrieved_docs = await self.retriever.retrieve_with_rewriting(query)
89
 
90
  logger.info(f"Retrieved {len(retrieved_docs)} documents")
91
+ if len(retrieved_docs) == 0:
92
+ logger.warning("No documents retrieved - email generation may be limited")
93
 
94
  # Step 3: Compose email draft
95
  logger.info("Step 3: Composing email draft...")
src/retrieval/__init__.py CHANGED
@@ -11,3 +11,10 @@ try:
11
  __all__.append("HybridRetriever")
12
  except ImportError:
13
  pass # OpenSearch dependencies not installed
 
 
 
 
 
 
 
 
11
  __all__.append("HybridRetriever")
12
  except ImportError:
13
  pass # OpenSearch dependencies not installed
14
+
15
+ # Optional query rewriter
16
+ try:
17
+ from .query_rewriter import QueryRewriter, RewrittenQueries
18
+ __all__.extend(["QueryRewriter", "RewrittenQueries"])
19
+ except ImportError:
20
+ pass # PydanticAI not installed
src/retrieval/hybrid_retriever.py CHANGED
@@ -1,6 +1,6 @@
1
  """Hybrid retriever combining BM25 and vector search."""
2
 
3
- from typing import List, Dict, Any
4
  from haystack import Document
5
  from haystack.components.embedders import OpenAITextEmbedder
6
  from haystack_integrations.document_stores.opensearch import OpenSearchDocumentStore
@@ -12,6 +12,18 @@ import logging
12
 
13
  from ..config import RetrievalConfig, LLMConfig
14
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  logger = logging.getLogger(__name__)
16
 
17
 
@@ -23,6 +35,7 @@ class HybridRetriever:
23
  document_store: OpenSearchDocumentStore,
24
  llm_config: LLMConfig,
25
  retrieval_config: RetrievalConfig,
 
26
  ):
27
  """
28
  Initialize the hybrid retriever.
@@ -31,10 +44,12 @@ class HybridRetriever:
31
  document_store: OpenSearch document store
32
  llm_config: LLM configuration for embeddings
33
  retrieval_config: Retrieval configuration
 
34
  """
35
  self.document_store = document_store
36
  self.llm_config = llm_config
37
  self.retrieval_config = retrieval_config
 
38
 
39
  # Initialize BM25 retriever
40
  self.bm25_retriever = OpenSearchBM25Retriever(
@@ -52,12 +67,21 @@ class HybridRetriever:
52
  model=llm_config.embedding_model,
53
  )
54
 
55
- def retrieve(self, query: str) -> List[Document]:
 
 
 
 
 
 
 
 
56
  """
57
- Retrieve documents using hybrid search.
58
 
59
  Args:
60
- query: Search query
 
61
 
62
  Returns:
63
  List of relevant documents with scores
@@ -65,16 +89,23 @@ class HybridRetriever:
65
  logger.info(f"Retrieving documents for query: {query[:100]}...")
66
 
67
  try:
 
 
 
68
  # Get BM25 results
69
- logger.debug("Running BM25 retrieval...")
 
70
  bm25_results = self.bm25_retriever.run(
71
- query=query,
72
  top_k=self.retrieval_config.top_k * 2, # Get more to merge
73
  )
74
  bm25_docs = bm25_results.get("documents", [])
 
 
 
75
  logger.debug(f"BM25 retrieved {len(bm25_docs)} documents")
76
 
77
- # Generate query embedding
78
  logger.debug("Generating query embedding...")
79
  embedding_result = self.text_embedder.run(text=query)
80
  query_embedding = embedding_result.get("embedding")
@@ -101,12 +132,73 @@ class HybridRetriever:
101
 
102
  logger.info(f"Retrieved {len(final_docs)} documents after hybrid ranking")
103
 
 
 
 
 
 
 
 
 
 
104
  return final_docs
105
 
106
  except Exception as e:
107
  logger.error(f"Error during retrieval: {e}")
108
  return []
109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  def _merge_results(
111
  self, bm25_docs: List[Document], vector_docs: List[Document]
112
  ) -> List[Document]:
 
1
  """Hybrid retriever combining BM25 and vector search."""
2
 
3
+ from typing import List, Dict, Any, Optional, TYPE_CHECKING
4
  from haystack import Document
5
  from haystack.components.embedders import OpenAITextEmbedder
6
  from haystack_integrations.document_stores.opensearch import OpenSearchDocumentStore
 
12
 
13
  from ..config import RetrievalConfig, LLMConfig
14
 
15
+ if TYPE_CHECKING:
16
+ from .query_rewriter import QueryRewriter, RewrittenQueries
17
+ else:
18
+ # Optional import for query rewriting at runtime
19
+ try:
20
+ from .query_rewriter import QueryRewriter, RewrittenQueries
21
+ QUERY_REWRITER_AVAILABLE = True
22
+ except ImportError:
23
+ QUERY_REWRITER_AVAILABLE = False
24
+ QueryRewriter = None # type: ignore
25
+ RewrittenQueries = None # type: ignore
26
+
27
  logger = logging.getLogger(__name__)
28
 
29
 
 
35
  document_store: OpenSearchDocumentStore,
36
  llm_config: LLMConfig,
37
  retrieval_config: RetrievalConfig,
38
+ use_query_rewriting: bool = True,
39
  ):
40
  """
41
  Initialize the hybrid retriever.
 
44
  document_store: OpenSearch document store
45
  llm_config: LLM configuration for embeddings
46
  retrieval_config: Retrieval configuration
47
+ use_query_rewriting: Enable LLM-based query rewriting (default: True)
48
  """
49
  self.document_store = document_store
50
  self.llm_config = llm_config
51
  self.retrieval_config = retrieval_config
52
+ self.use_query_rewriting = use_query_rewriting
53
 
54
  # Initialize BM25 retriever
55
  self.bm25_retriever = OpenSearchBM25Retriever(
 
67
  model=llm_config.embedding_model,
68
  )
69
 
70
+ # Initialize query rewriter (uses faster model for speed)
71
+ self.query_rewriter: Optional["QueryRewriter"] = None
72
+ if use_query_rewriting and QUERY_REWRITER_AVAILABLE and QueryRewriter:
73
+ self.query_rewriter = QueryRewriter(
74
+ api_key=llm_config.api_key,
75
+ model="openai:gpt-4o-mini", # Faster model for query rewriting
76
+ )
77
+
78
+ def retrieve(self, query: str, rewritten_query: Optional["RewrittenQueries"] = None) -> List[Document]:
79
  """
80
+ Retrieve documents using hybrid search with optional query rewriting.
81
 
82
  Args:
83
+ query: Search query (original or already rewritten)
84
+ rewritten_query: Pre-computed rewritten queries (optional)
85
 
86
  Returns:
87
  List of relevant documents with scores
 
89
  logger.info(f"Retrieving documents for query: {query[:100]}...")
90
 
91
  try:
92
+ # Use provided query for BM25 (might be rewritten)
93
+ bm25_query = query
94
+
95
  # Get BM25 results
96
+ logger.info(f"[DEBUG] BM25 query string: '{bm25_query}'")
97
+ logger.debug(f"Running BM25 retrieval with query: {bm25_query[:100]}...")
98
  bm25_results = self.bm25_retriever.run(
99
+ query=bm25_query,
100
  top_k=self.retrieval_config.top_k * 2, # Get more to merge
101
  )
102
  bm25_docs = bm25_results.get("documents", [])
103
+ logger.info(f"[DEBUG] BM25 retrieved {len(bm25_docs)} documents")
104
+ if bm25_docs:
105
+ logger.debug(f"[DEBUG] Top BM25 result score: {bm25_docs[0].score if bm25_docs[0].score else 'None'}")
106
  logger.debug(f"BM25 retrieved {len(bm25_docs)} documents")
107
 
108
+ # Generate query embedding (use original query for semantic similarity)
109
  logger.debug("Generating query embedding...")
110
  embedding_result = self.text_embedder.run(text=query)
111
  query_embedding = embedding_result.get("embedding")
 
132
 
133
  logger.info(f"Retrieved {len(final_docs)} documents after hybrid ranking")
134
 
135
+ # If no docs and we have rewritten queries, try fallback with synonyms
136
+ if len(final_docs) == 0 and rewritten_query and rewritten_query.synonyms:
137
+ logger.info("No documents found, trying with synonym queries...")
138
+ for synonym in rewritten_query.synonyms[:2]: # Try top 2 synonyms
139
+ fallback_docs = self._retrieve_with_query(synonym)
140
+ if fallback_docs:
141
+ logger.info(f"Found {len(fallback_docs)} documents with synonym: {synonym}")
142
+ return fallback_docs
143
+
144
  return final_docs
145
 
146
  except Exception as e:
147
  logger.error(f"Error during retrieval: {e}")
148
  return []
149
 
150
+ async def retrieve_with_rewriting(self, query: str) -> List[Document]:
151
+ """
152
+ Retrieve documents with query rewriting.
153
+
154
+ Args:
155
+ query: Original user query
156
+
157
+ Returns:
158
+ List of relevant documents with scores
159
+ """
160
+ if not self.use_query_rewriting or not self.query_rewriter:
161
+ logger.info("[DEBUG] Query rewriting disabled, using original query")
162
+ return self.retrieve(query)
163
+
164
+ # Rewrite query
165
+ logger.info("Rewriting query for better retrieval...")
166
+ rewritten = await self.query_rewriter.rewrite_query(query)
167
+
168
+ # Build optimized query for BM25
169
+ optimized_query = self.query_rewriter.build_expanded_query(rewritten)
170
+
171
+ logger.info(f"[DEBUG] Original query: '{query}'")
172
+ logger.info(f"[DEBUG] Primary rewritten: '{rewritten.primary_query}'")
173
+ logger.info(f"[DEBUG] Key terms: {rewritten.key_terms}")
174
+ logger.info(f"[DEBUG] Optimized for BM25: '{optimized_query}'")
175
+ logger.info(f"Original: {query[:80]}")
176
+ logger.info(f"Optimized: {optimized_query[:80]}")
177
+
178
+ # Retrieve with optimized query
179
+ return self.retrieve(optimized_query, rewritten_query=rewritten)
180
+
181
+ def _retrieve_with_query(self, query: str) -> List[Document]:
182
+ """
183
+ Helper method to retrieve with a specific query string.
184
+
185
+ Args:
186
+ query: Query string
187
+
188
+ Returns:
189
+ List of documents
190
+ """
191
+ try:
192
+ bm25_results = self.bm25_retriever.run(
193
+ query=query,
194
+ top_k=self.retrieval_config.top_k,
195
+ )
196
+ docs = bm25_results.get("documents", [])
197
+ return self._apply_score_threshold(docs)
198
+ except Exception as e:
199
+ logger.error(f"Error in fallback retrieval: {e}")
200
+ return []
201
+
202
  def _merge_results(
203
  self, bm25_docs: List[Document], vector_docs: List[Document]
204
  ) -> List[Document]:
src/retrieval/query_rewriter.py ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Query rewriting for improved retrieval using LLM."""
2
+
3
+ import re
4
+ import logging
5
+ from typing import List, Set
6
+ from pydantic import BaseModel, Field
7
+ from pydantic_ai import Agent
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
class RewrittenQueries(BaseModel):
    """Structured LLM output describing a rewritten search query.

    Produced by ``QueryRewriter`` and consumed by the hybrid retriever to
    build the BM25 query string and synonym fallbacks.
    """

    # Best single query for keyword (BM25) search.
    primary_query: str = Field(
        description="Main rewritten query optimized for keyword search"
    )
    # Alternative phrasings, tried as fallbacks when retrieval comes back empty.
    synonyms: List[str] = Field(
        default_factory=list,
        description="Alternative phrasings and synonyms"
    )
    # Domain terms appended to the primary query to boost relevance.
    key_terms: List[str] = Field(
        default_factory=list,
        description="Important domain-specific terms to boost"
    )
    # Free-text rationale from the LLM; used only for logging/debugging.
    explanation: str = Field(
        default="",
        description="Brief explanation of the rewriting strategy"
    )
30
+
31
+
32
+ class QueryRewriter:
33
+ """Rewrites queries for better retrieval using LLM."""
34
+
35
+ # Stopwords for basic filtering (used as fallback)
36
+ STOPWORDS: Set[str] = {
37
+ # English
38
+ "i", "me", "my", "we", "you", "he", "she", "it", "they", "am", "is", "are",
39
+ "was", "were", "be", "been", "have", "has", "had", "do", "does", "did",
40
+ "a", "an", "the", "and", "but", "if", "or", "as", "of", "at", "by", "for",
41
+ "with", "about", "to", "from", "in", "on", "can", "will", "would", "should",
42
+ # German
43
+ "ich", "mich", "mir", "du", "er", "sie", "es", "wir", "ihr", "der", "die",
44
+ "das", "den", "dem", "des", "ein", "eine", "und", "oder", "aber", "wenn",
45
+ "als", "von", "zu", "mit", "bei", "für", "auf", "an", "in", "ist", "sind",
46
+ "war", "haben", "hat", "kann", "wie", "was", "wo",
47
+ # French
48
+ "je", "tu", "il", "elle", "nous", "vous", "ils", "elles", "le", "la", "les",
49
+ "un", "une", "des", "et", "ou", "mais", "si", "de", "à", "pour", "avec",
50
+ "dans", "est", "sont", "avoir", "peut", "comment", "que", "qui",
51
+ }
52
+
53
+ def __init__(self, api_key: str, model: str = "openai:gpt-4o-mini"):
54
+ """
55
+ Initialize query rewriter.
56
+
57
+ Args:
58
+ api_key: OpenAI API key
59
+ model: Model to use (default: gpt-4o-mini for speed)
60
+ """
61
+ self.agent = Agent[None, RewrittenQueries](
62
+ model,
63
+ system_prompt="""You are an expert at rewriting user queries for optimal document retrieval in a university administrative system (BFH - Bern University of Applied Sciences).
64
+
65
+ Your task is to transform natural language queries into optimized search queries that will find relevant administrative documents.
66
+
67
+ Key strategies:
68
+ 1. **Translate colloquial to formal**: Convert informal phrasing to official administrative terms
69
+ - Example: "drop out" → "Exmatrikulation" (German) or "withdrawal" (English)
70
+ - Example: "change my major" → "Studiengangwechsel"
71
+
72
+ 2. **Add domain-specific keywords**: Include relevant administrative terms
73
+ - Common terms: Antrag (application), Formular (form), Frist (deadline), Anmeldung (registration)
74
+
75
+ 3. **Language consistency**: Keep search terms in the same language as documents (primarily German)
76
+ - German query → German search terms
77
+ - English query → can include German terms if they're official names
78
+
79
+ 4. **Extract key entities**: Identify important terms like:
80
+ - Administrative processes: Exmatrikulation, Immatrikulation, Beurlaubung
81
+ - Documents: Formular, Bestätigung, Bescheinigung
82
+ - Deadlines: Frist, Semester, Anmeldung
83
+
84
+ 5. **Expand with synonyms**: Provide alternative terms that might appear in documents
85
+ - Example: "Abmeldung" and "Exmatrikulation"
86
+
87
+ 6. **Remove filler words**: Focus on content words only
88
+
89
+ Output:
90
+ - primary_query: The best optimized query for keyword search (BM25)
91
+ - synonyms: 2-3 alternative phrasings
92
+ - key_terms: 3-5 critical domain-specific terms to boost in search
93
+ - explanation: Brief note on your strategy
94
+
95
+ Keep queries concise (5-10 words max for primary_query)."""
96
+ )
97
+
98
+ async def rewrite_query(self, original_query: str) -> RewrittenQueries:
99
+ """
100
+ Rewrite a query for better retrieval.
101
+
102
+ Args:
103
+ original_query: Original user query
104
+
105
+ Returns:
106
+ Rewritten queries with metadata
107
+ """
108
+ logger.info(f"Rewriting query: {original_query[:100]}...")
109
+
110
+ try:
111
+ result = await self.agent.run(f"Rewrite this query for optimal document retrieval:\n\n{original_query}")
112
+ rewritten = result.output
113
+
114
+ # Post-process to ensure stopwords are removed
115
+ rewritten = self._clean_stopwords(rewritten)
116
+
117
+ logger.info(f"Rewritten primary query: {rewritten.primary_query}")
118
+ logger.debug(f"Key terms: {', '.join(rewritten.key_terms)}")
119
+ logger.debug(f"Explanation: {rewritten.explanation}")
120
+
121
+ return rewritten
122
+
123
+ except Exception as e:
124
+ logger.error(f"Error rewriting query: {e}")
125
+ # Fallback to basic stopword removal
126
+ return self._fallback_rewrite(original_query)
127
+
128
+ def _clean_stopwords(self, rewritten: RewrittenQueries) -> RewrittenQueries:
129
+ """
130
+ Clean stopwords from rewritten query (post-processing safety net).
131
+
132
+ Args:
133
+ rewritten: Rewritten query from LLM
134
+
135
+ Returns:
136
+ Cleaned rewritten query
137
+ """
138
+ # Clean primary query
139
+ tokens = re.findall(r'\w+', rewritten.primary_query.lower())
140
+ removed_stopwords = [t for t in tokens if t in self.STOPWORDS]
141
+ cleaned_tokens = [t for t in tokens if t not in self.STOPWORDS]
142
+
143
+ logger.info(f"[DEBUG] Stopwords removed from primary query: {removed_stopwords}")
144
+ logger.info(f"[DEBUG] Before stopword cleaning: '{rewritten.primary_query}'")
145
+
146
+ if cleaned_tokens:
147
+ cleaned_primary = " ".join(cleaned_tokens)
148
+ else:
149
+ # If everything was removed, keep original
150
+ cleaned_primary = rewritten.primary_query
151
+
152
+ logger.info(f"[DEBUG] After stopword cleaning: '{cleaned_primary}'")
153
+
154
+ # Clean key terms
155
+ cleaned_key_terms = [term for term in rewritten.key_terms if term.lower() not in self.STOPWORDS]
156
+
157
+ # Clean synonyms
158
+ cleaned_synonyms = []
159
+ for syn in rewritten.synonyms:
160
+ syn_tokens = re.findall(r'\w+', syn.lower())
161
+ syn_cleaned = [t for t in syn_tokens if t not in self.STOPWORDS]
162
+ if syn_cleaned:
163
+ cleaned_synonyms.append(" ".join(syn_cleaned))
164
+
165
+ return RewrittenQueries(
166
+ primary_query=cleaned_primary,
167
+ synonyms=cleaned_synonyms,
168
+ key_terms=cleaned_key_terms,
169
+ explanation=rewritten.explanation + " (stopwords cleaned)",
170
+ )
171
+
172
+ def _fallback_rewrite(self, query: str) -> RewrittenQueries:
173
+ """
174
+ Fallback query rewriting using simple stopword removal.
175
+
176
+ Args:
177
+ query: Original query
178
+
179
+ Returns:
180
+ Basic rewritten query
181
+ """
182
+ logger.info("Using fallback query rewriting (stopword removal)")
183
+
184
+ # Tokenize and filter stopwords
185
+ tokens = re.findall(r'\w+', query.lower())
186
+ filtered = [t for t in tokens if t not in self.STOPWORDS and len(t) >= 3]
187
+
188
+ primary_query = " ".join(filtered) if filtered else query
189
+
190
+ return RewrittenQueries(
191
+ primary_query=primary_query,
192
+ synonyms=[],
193
+ key_terms=filtered[:5],
194
+ explanation="Fallback: removed stopwords only"
195
+ )
196
+
197
+ def build_expanded_query(self, rewritten: RewrittenQueries) -> str:
198
+ """
199
+ Build an expanded query combining primary query and key terms.
200
+
201
+ Args:
202
+ rewritten: Rewritten query data
203
+
204
+ Returns:
205
+ Expanded query string for BM25 search
206
+ """
207
+ # Combine primary query with key terms (weighted more)
208
+ parts = [rewritten.primary_query]
209
+
210
+ # Add key terms (they'll boost relevance if present)
211
+ for term in rewritten.key_terms:
212
+ if term.lower() not in rewritten.primary_query.lower():
213
+ parts.append(term)
214
+
215
+ return " ".join(parts)
216
+
217
+ def get_all_variants(self, rewritten: RewrittenQueries) -> List[str]:
218
+ """
219
+ Get all query variants for multi-query retrieval.
220
+
221
+ Args:
222
+ rewritten: Rewritten query data
223
+
224
+ Returns:
225
+ List of query variants
226
+ """
227
+ variants = [rewritten.primary_query]
228
+
229
+ # Add synonym variants
230
+ variants.extend(rewritten.synonyms[:2]) # Limit to top 2
231
+
232
+ # Add expanded query
233
+ expanded = self.build_expanded_query(rewritten)
234
+ if expanded not in variants:
235
+ variants.append(expanded)
236
+
237
+ return variants
test_query_debug.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Test script to debug query processing for 'Was kostet eine Namensänderung?'"""

import asyncio
import logging
import sys

from src.config import get_config
from src.indexing.indexer import DocumentIndexer
from src.pipeline.orchestrator import RAGOrchestrator

# Configure logging to see all DEBUG messages
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)]
)

# Set specific loggers to INFO to see debug messages
logging.getLogger('src.retrieval.hybrid_retriever').setLevel(logging.INFO)
logging.getLogger('src.retrieval.query_rewriter').setLevel(logging.INFO)


async def test_query():
    """Run the problematic query through the full RAG pipeline and print a summary."""
    print("\n" + "=" * 80)
    print("Testing query: 'Was kostet eine Namensänderung?'")
    print("=" * 80 + "\n")

    # Load config
    config = get_config()

    # Initialize indexer (this connects to document store)
    indexer = DocumentIndexer(config)

    # Initialize orchestrator
    orchestrator = RAGOrchestrator(config, indexer)

    # Test query
    query = "Was kostet eine Namensänderung?"

    print(f"\n>>> Running query: '{query}'\n")

    try:
        result = await orchestrator.process_query(query)

        print("\n" + "=" * 80)
        print("RESULTS SUMMARY")
        print("=" * 80)
        print(f"Documents retrieved: {len(result.retrieved_docs)}")
        print(f"Processing time: {result.processing_time:.2f}s")
        print(f"\nIntent detected:")
        print(f"  - Action: {result.intent.action_type}")
        print(f"  - Topic: {result.intent.topic}")
        print(f"  - Language: {result.intent.language}")

        if result.retrieved_docs:
            print(f"\nTop 3 retrieved documents:")
            for i, doc in enumerate(result.retrieved_docs[:3], 1):
                # BUG FIX: the previous f"{doc.get('score', 'N/A'):.4f}" raised
                # ValueError whenever a document had no score, because the
                # 'N/A' string fallback was pushed through the float spec.
                score = doc.get('score')
                score_str = f"{score:.4f}" if isinstance(score, (int, float)) else "N/A"
                print(f"\n  [{i}] Score: {score_str}")
                print(f"      Source: {doc.get('meta', {}).get('source_file', 'Unknown')}")
                print(f"      Preview: {doc.get('content', '')[:150]}...")

        print(f"\nEmail draft preview:")
        print(f"  Subject: {result.email_draft.subject}")
        print(f"  Body (first 200 chars): {result.email_draft.body[:200]}...")

    except Exception as e:
        print(f"\n❌ Error: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    asyncio.run(test_query())
test_retrieval_simple.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Simple test script to check retrieval for the problematic query."""

import logging
import sys

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s - %(message)s')

# Import after logging is configured
from src.config import get_config
from src.indexing.memory_indexer import MemoryDocumentIndexer
from src.retrieval.memory_retriever import MemoryRetriever


def test_retrieval():
    """Load the indexed documents, run one retrieval, and report the results."""

    query = "Wie kann ich einen Rückzug machen? Wie kann ich mich vom Studium abmelden?"

    print("=" * 80)
    print(f"Testing retrieval for query:")
    print(f"  '{query}'")
    print("=" * 80)

    # Load config
    config = get_config()

    # Initialize indexer and retriever
    print("\nInitializing document store...")
    indexer = MemoryDocumentIndexer(llm_config=config.llm)

    # Load documents from JSON
    import json
    from haystack import Document as HaystackDoc
    from pathlib import Path

    json_path = Path("documents_indexed.json")
    if json_path.exists():
        print(f"Loading documents from {json_path}...")
        # BUG FIX: open explicitly as UTF-8 — the file holds German text, and
        # the platform default codec (e.g. cp1252 on Windows) could fail or
        # silently corrupt it.
        with open(json_path, encoding="utf-8") as f:
            docs_data = json.load(f)

        documents = []
        for doc_data in docs_data:
            doc = HaystackDoc(
                id=doc_data["id"],
                content=doc_data["content"],
                embedding=doc_data.get("embedding"),
                meta=doc_data.get("meta", {})
            )
            documents.append(doc)

        indexer.document_store.write_documents(documents)
        print(f"  Loaded {len(documents)} documents")
    else:
        print(f"ERROR: {json_path} not found!")
        return

    print(f"  Total documents in store: {indexer.document_store.count_documents()}")

    # Initialize retriever
    print("\nInitializing retriever...")
    retriever = MemoryRetriever(
        document_store=indexer.document_store,
        llm_config=config.llm,
        retrieval_config=config.retrieval,
    )

    # Perform retrieval
    print(f"\nPerforming retrieval...")
    docs = retriever.retrieve(query)

    print(f"\nResults: {len(docs)} documents retrieved")
    print("=" * 80)

    if len(docs) == 0:
        print("❌ NO DOCUMENTS RETRIEVED!")
        print("\nThis is the problem - retrieval is returning 0 results.")
    else:
        print("✅ Documents were retrieved successfully\n")
        for i, doc in enumerate(docs, 1):
            score = doc.score if doc.score else 0.0
            source = doc.meta.get('source_file', 'Unknown')
            section = doc.meta.get('section', 'N/A')

            print(f"\n[{i}] Score: {score:.4f}")
            print(f"    Source: {source}")
            print(f"    Section: {section}")
            print(f"    Content preview: {doc.content[:150]}...")

    print("\n" + "=" * 80)


if __name__ == "__main__":
    test_retrieval()