Spaces:
Sleeping
Sleeping
Update README and enhance query processing with logging and error handling
Browse files- Updated README to reflect changes in model names and configuration options.
- Added detailed logging for query processing, document retrieval, and email drafting in `app_simple.py`.
- Improved error handling in email composition and retrieval processes.
- Introduced a new `QueryRewriter` class for enhanced query rewriting capabilities.
- Added test scripts for debugging query processing and retrieval.
- README.md +2 -2
- app_simple.py +85 -14
- src/agents/composer_agent.py +45 -5
- src/config.py +2 -2
- src/pipeline/orchestrator.py +4 -2
- src/retrieval/__init__.py +7 -0
- src/retrieval/hybrid_retriever.py +99 -7
- src/retrieval/query_rewriter.py +237 -0
- test_query_debug.py +73 -0
- test_retrieval_simple.py +93 -0
README.md
CHANGED
|
@@ -23,7 +23,7 @@ This system helps administrative staff compose accurate, professional email resp
|
|
| 23 |
- **In-Memory Document Store**: No Docker required
|
| 24 |
- **PydanticAI**: Multi-agent orchestration with structured outputs (optional)
|
| 25 |
- **Gradio**: Clean, light web interface
|
| 26 |
-
- **OpenAI GPT-5-
|
| 27 |
|
| 28 |
## Two Versions Available
|
| 29 |
|
|
@@ -176,7 +176,7 @@ The app is configured for automatic deployment to Hugging Face Spaces via `app.p
|
|
| 176 |
Key configuration options in `.env`:
|
| 177 |
|
| 178 |
### LLM Configuration
|
| 179 |
-
- `LLM_MODEL`: OpenAI model (default: gpt-4o)
|
| 180 |
- `EMBEDDING_MODEL`: Embedding model (default: text-embedding-3-small)
|
| 181 |
- `LLM_TEMPERATURE`: Temperature for generation (0-1)
|
| 182 |
- `LLM_MAX_TOKENS`: Maximum tokens per response
|
|
|
|
| 23 |
- **In-Memory Document Store**: No Docker required
|
| 24 |
- **PydanticAI**: Multi-agent orchestration with structured outputs (optional)
|
| 25 |
- **Gradio**: Clean, light web interface
|
| 26 |
+
- **OpenAI GPT-5-mini**: Reliable language model (upgraded from gpt-5-nano)
|
| 27 |
|
| 28 |
## Two Versions Available
|
| 29 |
|
|
|
|
| 176 |
Key configuration options in `.env`:
|
| 177 |
|
| 178 |
### LLM Configuration
|
| 179 |
+
- `LLM_MODEL`: OpenAI model (default: gpt-4o, recommended: gpt-5-mini)
|
| 180 |
- `EMBEDDING_MODEL`: Embedding model (default: text-embedding-3-small)
|
| 181 |
- `LLM_TEMPERATURE`: Temperature for generation (0-1)
|
| 182 |
- `LLM_MAX_TOKENS`: Maximum tokens per response
|
app_simple.py
CHANGED
|
@@ -143,14 +143,32 @@ class SimpleFastAssistant:
|
|
| 143 |
import time
|
| 144 |
start = time.time()
|
| 145 |
|
|
|
|
|
|
|
| 146 |
# Retrieve documents
|
| 147 |
docs = self.retriever.retrieve(query)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
for i, doc in enumerate(docs[:3])
|
| 153 |
-
])
|
| 154 |
|
| 155 |
# Single LLM call
|
| 156 |
system_prompt = """You are an email assistant for BFH (Bern University of Applied Sciences) administrative staff.
|
|
@@ -176,6 +194,16 @@ Context from knowledge base:
|
|
| 176 |
|
| 177 |
Compose a professional email response."""
|
| 178 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
# GPT-5 uses max_completion_tokens instead of max_tokens
|
| 180 |
completion_params = {
|
| 181 |
"model": self.config.llm.model_name,
|
|
@@ -189,19 +217,45 @@ Compose a professional email response."""
|
|
| 189 |
if "gpt-5" in self.config.llm.model_name:
|
| 190 |
completion_params["max_completion_tokens"] = self.config.llm.max_tokens
|
| 191 |
# GPT-5-nano only supports temperature=1 (default), so don't set it
|
|
|
|
| 192 |
else:
|
| 193 |
completion_params["max_tokens"] = self.config.llm.max_tokens
|
| 194 |
completion_params["temperature"] = self.config.llm.temperature
|
|
|
|
|
|
|
|
|
|
| 195 |
|
| 196 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
|
| 198 |
-
email = response.choices[0].message.content
|
| 199 |
elapsed = time.time() - start
|
| 200 |
|
| 201 |
# Parse subject and body
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
|
| 206 |
stats = f"**Time:** {elapsed:.1f}s | **Docs:** {len(docs)} | **Model:** {self.config.llm.model_name}"
|
| 207 |
|
|
@@ -218,6 +272,17 @@ Compose a professional email response."""
|
|
| 218 |
'explanation': self._explain_relevance(query, doc, i + 1)
|
| 219 |
})
|
| 220 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 221 |
return subject, body, stats, chunks_info
|
| 222 |
|
| 223 |
|
|
@@ -237,10 +302,16 @@ class QueryProcessor:
|
|
| 237 |
if not query or not query.strip():
|
| 238 |
return "", "", "", ""
|
| 239 |
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 244 |
|
| 245 |
|
| 246 |
# ============================================================================
|
|
|
|
| 143 |
import time
|
| 144 |
start = time.time()
|
| 145 |
|
| 146 |
+
logger.info(f"[DEBUG] Processing query: '{query}'")
|
| 147 |
+
|
| 148 |
# Retrieve documents
|
| 149 |
docs = self.retriever.retrieve(query)
|
| 150 |
+
logger.info(f"[DEBUG] Retrieved {len(docs)} documents")
|
| 151 |
+
|
| 152 |
+
if len(docs) == 0:
|
| 153 |
+
logger.warning(f"[DEBUG] No documents retrieved for query: '{query}'")
|
| 154 |
+
|
| 155 |
+
# Build context - limit for GPT-5-nano's smaller context window
|
| 156 |
+
# GPT-5-nano has limited capacity, so we need to be more conservative
|
| 157 |
+
max_docs = 2 if "gpt-5" in self.config.llm.model_name else 3
|
| 158 |
+
max_chars_per_doc = 800 if "gpt-5" in self.config.llm.model_name else 1500
|
| 159 |
+
|
| 160 |
+
context_parts = []
|
| 161 |
+
for i, doc in enumerate(docs[:max_docs]):
|
| 162 |
+
doc_content = doc.content
|
| 163 |
+
if len(doc_content) > max_chars_per_doc:
|
| 164 |
+
doc_content = doc_content[:max_chars_per_doc] + "..."
|
| 165 |
+
logger.debug(f"[DEBUG] Truncated document {i+1} from {len(doc.content)} to {max_chars_per_doc} chars")
|
| 166 |
+
|
| 167 |
+
context_parts.append(f"Document {i+1} (from {doc.meta.get('source_file', 'unknown')}):\n{doc_content}")
|
| 168 |
|
| 169 |
+
context = "\n\n".join(context_parts)
|
| 170 |
+
|
| 171 |
+
logger.info(f"[DEBUG] Context length: {len(context)} chars (using {len(context_parts)} of {len(docs)} docs)")
|
|
|
|
|
|
|
| 172 |
|
| 173 |
# Single LLM call
|
| 174 |
system_prompt = """You are an email assistant for BFH (Bern University of Applied Sciences) administrative staff.
|
|
|
|
| 194 |
|
| 195 |
Compose a professional email response."""
|
| 196 |
|
| 197 |
+
logger.info(f"[DEBUG] User prompt length: {len(user_prompt)} chars")
|
| 198 |
+
logger.info(f"[DEBUG] System prompt length: {len(system_prompt)} chars")
|
| 199 |
+
logger.info(f"[DEBUG] Total prompt length: {len(system_prompt) + len(user_prompt)} chars")
|
| 200 |
+
logger.debug(f"[DEBUG] User prompt preview: {user_prompt[:500]}...")
|
| 201 |
+
|
| 202 |
+
# Check if prompt might be too long for gpt-5-nano
|
| 203 |
+
total_chars = len(system_prompt) + len(user_prompt)
|
| 204 |
+
if "gpt-5" in self.config.llm.model_name and total_chars > 8000:
|
| 205 |
+
logger.warning(f"[DEBUG] Prompt may be too long for gpt-5-nano ({total_chars} chars). Consider using fewer documents or a different model.")
|
| 206 |
+
|
| 207 |
# GPT-5 uses max_completion_tokens instead of max_tokens
|
| 208 |
completion_params = {
|
| 209 |
"model": self.config.llm.model_name,
|
|
|
|
| 217 |
if "gpt-5" in self.config.llm.model_name:
|
| 218 |
completion_params["max_completion_tokens"] = self.config.llm.max_tokens
|
| 219 |
# GPT-5-nano only supports temperature=1 (default), so don't set it
|
| 220 |
+
logger.info(f"[DEBUG] Using GPT-5 parameters: max_completion_tokens={self.config.llm.max_tokens}")
|
| 221 |
else:
|
| 222 |
completion_params["max_tokens"] = self.config.llm.max_tokens
|
| 223 |
completion_params["temperature"] = self.config.llm.temperature
|
| 224 |
+
logger.info(f"[DEBUG] Using standard parameters: max_tokens={self.config.llm.max_tokens}, temp={self.config.llm.temperature}")
|
| 225 |
+
|
| 226 |
+
logger.info(f"[DEBUG] Calling LLM with model: {self.config.llm.model_name}")
|
| 227 |
|
| 228 |
+
try:
|
| 229 |
+
response = self.client.chat.completions.create(**completion_params)
|
| 230 |
+
email = response.choices[0].message.content
|
| 231 |
+
|
| 232 |
+
# Check for null/empty response
|
| 233 |
+
if email is None or email.strip() == "":
|
| 234 |
+
logger.error(f"[DEBUG] LLM returned null or empty response!")
|
| 235 |
+
logger.error(f"[DEBUG] Full response: {response}")
|
| 236 |
+
email = "Error: The model returned an empty response. Please try again."
|
| 237 |
+
else:
|
| 238 |
+
logger.info(f"[DEBUG] LLM response received: {len(email)} chars")
|
| 239 |
+
logger.debug(f"[DEBUG] LLM response preview: {email[:300]}...")
|
| 240 |
+
|
| 241 |
+
except Exception as e:
|
| 242 |
+
logger.error(f"[DEBUG] LLM call failed: {e}")
|
| 243 |
+
import traceback
|
| 244 |
+
traceback.print_exc()
|
| 245 |
+
raise
|
| 246 |
|
|
|
|
| 247 |
elapsed = time.time() - start
|
| 248 |
|
| 249 |
# Parse subject and body
|
| 250 |
+
if email and email.strip():
|
| 251 |
+
lines = email.split('\n')
|
| 252 |
+
subject = lines[0].replace('Subject:', '').strip() if lines else "Response"
|
| 253 |
+
body = '\n'.join(lines[1:]).strip() if len(lines) > 1 else email
|
| 254 |
+
else:
|
| 255 |
+
subject = "Error"
|
| 256 |
+
body = "No response generated from the model."
|
| 257 |
+
|
| 258 |
+
logger.info(f"[DEBUG] Parsed email - Subject: '{subject[:50] if len(subject) > 50 else subject}', Body: {len(body)} chars")
|
| 259 |
|
| 260 |
stats = f"**Time:** {elapsed:.1f}s | **Docs:** {len(docs)} | **Model:** {self.config.llm.model_name}"
|
| 261 |
|
|
|
|
| 272 |
'explanation': self._explain_relevance(query, doc, i + 1)
|
| 273 |
})
|
| 274 |
|
| 275 |
+
# Ensure we return non-empty strings
|
| 276 |
+
if not subject or subject.strip() == "":
|
| 277 |
+
subject = "(No subject generated)"
|
| 278 |
+
logger.warning("[DEBUG] Subject was empty, using fallback")
|
| 279 |
+
|
| 280 |
+
if not body or body.strip() == "":
|
| 281 |
+
body = "(No email body generated - please check logs for errors)"
|
| 282 |
+
logger.warning("[DEBUG] Body was empty, using fallback")
|
| 283 |
+
|
| 284 |
+
logger.info(f"[DEBUG] Returning - Subject: '{subject[:30]}...', Body length: {len(body)}, Chunks: {len(chunks_info)}")
|
| 285 |
+
|
| 286 |
return subject, body, stats, chunks_info
|
| 287 |
|
| 288 |
|
|
|
|
| 302 |
if not query or not query.strip():
|
| 303 |
return "", "", "", ""
|
| 304 |
|
| 305 |
+
try:
|
| 306 |
+
subject, body, stats, chunks_info = self.assistant.process_query(query)
|
| 307 |
+
chunks_html = self.formatter.format_chunks_html(chunks_info)
|
| 308 |
+
return subject, body, stats, chunks_html
|
| 309 |
+
except Exception as e:
|
| 310 |
+
logger.error(f"[ERROR] Failed to process query: {e}")
|
| 311 |
+
import traceback
|
| 312 |
+
traceback.print_exc()
|
| 313 |
+
error_msg = f"Error processing query: {str(e)}"
|
| 314 |
+
return "Error", error_msg, f"**Status:** Failed", f"<p style='color:red'>{error_msg}</p>"
|
| 315 |
|
| 316 |
|
| 317 |
# ============================================================================
|
src/agents/composer_agent.py
CHANGED
|
@@ -121,12 +121,45 @@ Based on this information, compose a complete email response that addresses the
|
|
| 121 |
|
| 122 |
except Exception as e:
|
| 123 |
logger.error(f"Error composing email: {e}")
|
| 124 |
-
# Return minimal draft on error
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
return EmailDraft(
|
| 126 |
-
subject=
|
| 127 |
-
body=
|
| 128 |
tone="professional",
|
| 129 |
-
confidence=0.
|
| 130 |
)
|
| 131 |
|
| 132 |
def _build_context(self, documents: List[Document]) -> str:
|
|
@@ -140,7 +173,14 @@ Based on this information, compose a complete email response that addresses the
|
|
| 140 |
Formatted context text
|
| 141 |
"""
|
| 142 |
if not documents:
|
| 143 |
-
return "No relevant documents found in the knowledge base.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
|
| 145 |
context_parts = []
|
| 146 |
for i, doc in enumerate(documents, 1):
|
|
|
|
| 121 |
|
| 122 |
except Exception as e:
|
| 123 |
logger.error(f"Error composing email: {e}")
|
| 124 |
+
# Return minimal draft on error based on detected language
|
| 125 |
+
language = intent.language if intent and hasattr(intent, 'language') else "de"
|
| 126 |
+
|
| 127 |
+
if language == "de":
|
| 128 |
+
subject = "Ihre Anfrage zur Studienadministration"
|
| 129 |
+
body = """Guten Tag,
|
| 130 |
+
|
| 131 |
+
Vielen Dank für Ihre Anfrage.
|
| 132 |
+
|
| 133 |
+
Leider konnte ich zu Ihrer spezifischen Frage keine detaillierten Informationen in unserer Wissensdatenbank finden. Ich empfehle Ihnen, sich direkt an die Studienadministration der BFH zu wenden:
|
| 134 |
+
|
| 135 |
+
- Website: https://www.bfh.ch
|
| 136 |
+
- Studienadministration: Kontaktinformationen finden Sie auf der BFH-Website
|
| 137 |
+
|
| 138 |
+
Die Mitarbeitenden können Ihnen bei Ihrer Anfrage persönlich weiterhelfen.
|
| 139 |
+
|
| 140 |
+
Freundliche Grüsse
|
| 141 |
+
BFH Studienadministration"""
|
| 142 |
+
else:
|
| 143 |
+
subject = "Your Inquiry to Student Administration"
|
| 144 |
+
body = """Hello,
|
| 145 |
+
|
| 146 |
+
Thank you for your inquiry.
|
| 147 |
+
|
| 148 |
+
Unfortunately, I could not find detailed information regarding your specific question in our knowledge base. I recommend contacting BFH Student Administration directly:
|
| 149 |
+
|
| 150 |
+
- Website: https://www.bfh.ch
|
| 151 |
+
- Student Administration: Contact information available on the BFH website
|
| 152 |
+
|
| 153 |
+
The staff will be able to assist you personally with your inquiry.
|
| 154 |
+
|
| 155 |
+
Best regards
|
| 156 |
+
BFH Student Administration"""
|
| 157 |
+
|
| 158 |
return EmailDraft(
|
| 159 |
+
subject=subject,
|
| 160 |
+
body=body,
|
| 161 |
tone="professional",
|
| 162 |
+
confidence=0.1,
|
| 163 |
)
|
| 164 |
|
| 165 |
def _build_context(self, documents: List[Document]) -> str:
|
|
|
|
| 173 |
Formatted context text
|
| 174 |
"""
|
| 175 |
if not documents:
|
| 176 |
+
return """No relevant documents found in the knowledge base.
|
| 177 |
+
|
| 178 |
+
IMPORTANT: Since no specific documentation was found, you should:
|
| 179 |
+
1. Acknowledge the user's query professionally
|
| 180 |
+
2. Provide general guidance if you can infer the topic
|
| 181 |
+
3. Direct them to contact the appropriate administrative office
|
| 182 |
+
4. Suggest checking the official BFH website for more information
|
| 183 |
+
5. Do NOT make up specific procedures, deadlines, or requirements"""
|
| 184 |
|
| 185 |
context_parts = []
|
| 186 |
for i, doc in enumerate(documents, 1):
|
src/config.py
CHANGED
|
@@ -88,7 +88,7 @@ class RetrievalConfig:
|
|
| 88 |
top_k: int = 5 # Number of documents to retrieve
|
| 89 |
bm25_weight: float = 0.5 # Weight for BM25 score
|
| 90 |
vector_weight: float = 0.5 # Weight for vector similarity score
|
| 91 |
-
min_score: float = 0.
|
| 92 |
|
| 93 |
@classmethod
|
| 94 |
def from_env(cls) -> "RetrievalConfig":
|
|
@@ -97,7 +97,7 @@ class RetrievalConfig:
|
|
| 97 |
top_k=int(os.getenv("RETRIEVAL_TOP_K", "5")),
|
| 98 |
bm25_weight=float(os.getenv("BM25_WEIGHT", "0.5")),
|
| 99 |
vector_weight=float(os.getenv("VECTOR_WEIGHT", "0.5")),
|
| 100 |
-
min_score=float(os.getenv("MIN_RELEVANCE_SCORE", "0.
|
| 101 |
)
|
| 102 |
|
| 103 |
|
|
|
|
| 88 |
top_k: int = 5 # Number of documents to retrieve
|
| 89 |
bm25_weight: float = 0.5 # Weight for BM25 score
|
| 90 |
vector_weight: float = 0.5 # Weight for vector similarity score
|
| 91 |
+
min_score: float = 0.1 # Minimum relevance score threshold (lowered to be more permissive)
|
| 92 |
|
| 93 |
@classmethod
|
| 94 |
def from_env(cls) -> "RetrievalConfig":
|
|
|
|
| 97 |
top_k=int(os.getenv("RETRIEVAL_TOP_K", "5")),
|
| 98 |
bm25_weight=float(os.getenv("BM25_WEIGHT", "0.5")),
|
| 99 |
vector_weight=float(os.getenv("VECTOR_WEIGHT", "0.5")),
|
| 100 |
+
min_score=float(os.getenv("MIN_RELEVANCE_SCORE", "0.1")),
|
| 101 |
)
|
| 102 |
|
| 103 |
|
src/pipeline/orchestrator.py
CHANGED
|
@@ -83,11 +83,13 @@ class RAGOrchestrator:
|
|
| 83 |
logger.info("Step 1: Extracting intent...")
|
| 84 |
intent = await self.intent_agent.extract_intent(query)
|
| 85 |
|
| 86 |
-
# Step 2: Retrieve relevant documents
|
| 87 |
logger.info("Step 2: Retrieving relevant documents...")
|
| 88 |
-
retrieved_docs = self.retriever.
|
| 89 |
|
| 90 |
logger.info(f"Retrieved {len(retrieved_docs)} documents")
|
|
|
|
|
|
|
| 91 |
|
| 92 |
# Step 3: Compose email draft
|
| 93 |
logger.info("Step 3: Composing email draft...")
|
|
|
|
| 83 |
logger.info("Step 1: Extracting intent...")
|
| 84 |
intent = await self.intent_agent.extract_intent(query)
|
| 85 |
|
| 86 |
+
# Step 2: Retrieve relevant documents (with query rewriting)
|
| 87 |
logger.info("Step 2: Retrieving relevant documents...")
|
| 88 |
+
retrieved_docs = await self.retriever.retrieve_with_rewriting(query)
|
| 89 |
|
| 90 |
logger.info(f"Retrieved {len(retrieved_docs)} documents")
|
| 91 |
+
if len(retrieved_docs) == 0:
|
| 92 |
+
logger.warning("No documents retrieved - email generation may be limited")
|
| 93 |
|
| 94 |
# Step 3: Compose email draft
|
| 95 |
logger.info("Step 3: Composing email draft...")
|
src/retrieval/__init__.py
CHANGED
|
@@ -11,3 +11,10 @@ try:
|
|
| 11 |
__all__.append("HybridRetriever")
|
| 12 |
except ImportError:
|
| 13 |
pass # OpenSearch dependencies not installed
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
__all__.append("HybridRetriever")
|
| 12 |
except ImportError:
|
| 13 |
pass # OpenSearch dependencies not installed
|
| 14 |
+
|
| 15 |
+
# Optional query rewriter
|
| 16 |
+
try:
|
| 17 |
+
from .query_rewriter import QueryRewriter, RewrittenQueries
|
| 18 |
+
__all__.extend(["QueryRewriter", "RewrittenQueries"])
|
| 19 |
+
except ImportError:
|
| 20 |
+
pass # PydanticAI not installed
|
src/retrieval/hybrid_retriever.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
"""Hybrid retriever combining BM25 and vector search."""
|
| 2 |
|
| 3 |
-
from typing import List, Dict, Any
|
| 4 |
from haystack import Document
|
| 5 |
from haystack.components.embedders import OpenAITextEmbedder
|
| 6 |
from haystack_integrations.document_stores.opensearch import OpenSearchDocumentStore
|
|
@@ -12,6 +12,18 @@ import logging
|
|
| 12 |
|
| 13 |
from ..config import RetrievalConfig, LLMConfig
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
logger = logging.getLogger(__name__)
|
| 16 |
|
| 17 |
|
|
@@ -23,6 +35,7 @@ class HybridRetriever:
|
|
| 23 |
document_store: OpenSearchDocumentStore,
|
| 24 |
llm_config: LLMConfig,
|
| 25 |
retrieval_config: RetrievalConfig,
|
|
|
|
| 26 |
):
|
| 27 |
"""
|
| 28 |
Initialize the hybrid retriever.
|
|
@@ -31,10 +44,12 @@ class HybridRetriever:
|
|
| 31 |
document_store: OpenSearch document store
|
| 32 |
llm_config: LLM configuration for embeddings
|
| 33 |
retrieval_config: Retrieval configuration
|
|
|
|
| 34 |
"""
|
| 35 |
self.document_store = document_store
|
| 36 |
self.llm_config = llm_config
|
| 37 |
self.retrieval_config = retrieval_config
|
|
|
|
| 38 |
|
| 39 |
# Initialize BM25 retriever
|
| 40 |
self.bm25_retriever = OpenSearchBM25Retriever(
|
|
@@ -52,12 +67,21 @@ class HybridRetriever:
|
|
| 52 |
model=llm_config.embedding_model,
|
| 53 |
)
|
| 54 |
|
| 55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
"""
|
| 57 |
-
Retrieve documents using hybrid search.
|
| 58 |
|
| 59 |
Args:
|
| 60 |
-
query: Search query
|
|
|
|
| 61 |
|
| 62 |
Returns:
|
| 63 |
List of relevant documents with scores
|
|
@@ -65,16 +89,23 @@ class HybridRetriever:
|
|
| 65 |
logger.info(f"Retrieving documents for query: {query[:100]}...")
|
| 66 |
|
| 67 |
try:
|
|
|
|
|
|
|
|
|
|
| 68 |
# Get BM25 results
|
| 69 |
-
logger.
|
|
|
|
| 70 |
bm25_results = self.bm25_retriever.run(
|
| 71 |
-
query=
|
| 72 |
top_k=self.retrieval_config.top_k * 2, # Get more to merge
|
| 73 |
)
|
| 74 |
bm25_docs = bm25_results.get("documents", [])
|
|
|
|
|
|
|
|
|
|
| 75 |
logger.debug(f"BM25 retrieved {len(bm25_docs)} documents")
|
| 76 |
|
| 77 |
-
# Generate query embedding
|
| 78 |
logger.debug("Generating query embedding...")
|
| 79 |
embedding_result = self.text_embedder.run(text=query)
|
| 80 |
query_embedding = embedding_result.get("embedding")
|
|
@@ -101,12 +132,73 @@ class HybridRetriever:
|
|
| 101 |
|
| 102 |
logger.info(f"Retrieved {len(final_docs)} documents after hybrid ranking")
|
| 103 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
return final_docs
|
| 105 |
|
| 106 |
except Exception as e:
|
| 107 |
logger.error(f"Error during retrieval: {e}")
|
| 108 |
return []
|
| 109 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
def _merge_results(
|
| 111 |
self, bm25_docs: List[Document], vector_docs: List[Document]
|
| 112 |
) -> List[Document]:
|
|
|
|
| 1 |
"""Hybrid retriever combining BM25 and vector search."""
|
| 2 |
|
| 3 |
+
from typing import List, Dict, Any, Optional, TYPE_CHECKING
|
| 4 |
from haystack import Document
|
| 5 |
from haystack.components.embedders import OpenAITextEmbedder
|
| 6 |
from haystack_integrations.document_stores.opensearch import OpenSearchDocumentStore
|
|
|
|
| 12 |
|
| 13 |
from ..config import RetrievalConfig, LLMConfig
|
| 14 |
|
| 15 |
+
if TYPE_CHECKING:
|
| 16 |
+
from .query_rewriter import QueryRewriter, RewrittenQueries
|
| 17 |
+
else:
|
| 18 |
+
# Optional import for query rewriting at runtime
|
| 19 |
+
try:
|
| 20 |
+
from .query_rewriter import QueryRewriter, RewrittenQueries
|
| 21 |
+
QUERY_REWRITER_AVAILABLE = True
|
| 22 |
+
except ImportError:
|
| 23 |
+
QUERY_REWRITER_AVAILABLE = False
|
| 24 |
+
QueryRewriter = None # type: ignore
|
| 25 |
+
RewrittenQueries = None # type: ignore
|
| 26 |
+
|
| 27 |
logger = logging.getLogger(__name__)
|
| 28 |
|
| 29 |
|
|
|
|
| 35 |
document_store: OpenSearchDocumentStore,
|
| 36 |
llm_config: LLMConfig,
|
| 37 |
retrieval_config: RetrievalConfig,
|
| 38 |
+
use_query_rewriting: bool = True,
|
| 39 |
):
|
| 40 |
"""
|
| 41 |
Initialize the hybrid retriever.
|
|
|
|
| 44 |
document_store: OpenSearch document store
|
| 45 |
llm_config: LLM configuration for embeddings
|
| 46 |
retrieval_config: Retrieval configuration
|
| 47 |
+
use_query_rewriting: Enable LLM-based query rewriting (default: True)
|
| 48 |
"""
|
| 49 |
self.document_store = document_store
|
| 50 |
self.llm_config = llm_config
|
| 51 |
self.retrieval_config = retrieval_config
|
| 52 |
+
self.use_query_rewriting = use_query_rewriting
|
| 53 |
|
| 54 |
# Initialize BM25 retriever
|
| 55 |
self.bm25_retriever = OpenSearchBM25Retriever(
|
|
|
|
| 67 |
model=llm_config.embedding_model,
|
| 68 |
)
|
| 69 |
|
| 70 |
+
# Initialize query rewriter (uses faster model for speed)
|
| 71 |
+
self.query_rewriter: Optional["QueryRewriter"] = None
|
| 72 |
+
if use_query_rewriting and QUERY_REWRITER_AVAILABLE and QueryRewriter:
|
| 73 |
+
self.query_rewriter = QueryRewriter(
|
| 74 |
+
api_key=llm_config.api_key,
|
| 75 |
+
model="openai:gpt-4o-mini", # Faster model for query rewriting
|
| 76 |
+
)
|
| 77 |
+
|
| 78 |
+
def retrieve(self, query: str, rewritten_query: Optional["RewrittenQueries"] = None) -> List[Document]:
|
| 79 |
"""
|
| 80 |
+
Retrieve documents using hybrid search with optional query rewriting.
|
| 81 |
|
| 82 |
Args:
|
| 83 |
+
query: Search query (original or already rewritten)
|
| 84 |
+
rewritten_query: Pre-computed rewritten queries (optional)
|
| 85 |
|
| 86 |
Returns:
|
| 87 |
List of relevant documents with scores
|
|
|
|
| 89 |
logger.info(f"Retrieving documents for query: {query[:100]}...")
|
| 90 |
|
| 91 |
try:
|
| 92 |
+
# Use provided query for BM25 (might be rewritten)
|
| 93 |
+
bm25_query = query
|
| 94 |
+
|
| 95 |
# Get BM25 results
|
| 96 |
+
logger.info(f"[DEBUG] BM25 query string: '{bm25_query}'")
|
| 97 |
+
logger.debug(f"Running BM25 retrieval with query: {bm25_query[:100]}...")
|
| 98 |
bm25_results = self.bm25_retriever.run(
|
| 99 |
+
query=bm25_query,
|
| 100 |
top_k=self.retrieval_config.top_k * 2, # Get more to merge
|
| 101 |
)
|
| 102 |
bm25_docs = bm25_results.get("documents", [])
|
| 103 |
+
logger.info(f"[DEBUG] BM25 retrieved {len(bm25_docs)} documents")
|
| 104 |
+
if bm25_docs:
|
| 105 |
+
logger.debug(f"[DEBUG] Top BM25 result score: {bm25_docs[0].score if bm25_docs[0].score else 'None'}")
|
| 106 |
logger.debug(f"BM25 retrieved {len(bm25_docs)} documents")
|
| 107 |
|
| 108 |
+
# Generate query embedding (use original query for semantic similarity)
|
| 109 |
logger.debug("Generating query embedding...")
|
| 110 |
embedding_result = self.text_embedder.run(text=query)
|
| 111 |
query_embedding = embedding_result.get("embedding")
|
|
|
|
| 132 |
|
| 133 |
logger.info(f"Retrieved {len(final_docs)} documents after hybrid ranking")
|
| 134 |
|
| 135 |
+
# If no docs and we have rewritten queries, try fallback with synonyms
|
| 136 |
+
if len(final_docs) == 0 and rewritten_query and rewritten_query.synonyms:
|
| 137 |
+
logger.info("No documents found, trying with synonym queries...")
|
| 138 |
+
for synonym in rewritten_query.synonyms[:2]: # Try top 2 synonyms
|
| 139 |
+
fallback_docs = self._retrieve_with_query(synonym)
|
| 140 |
+
if fallback_docs:
|
| 141 |
+
logger.info(f"Found {len(fallback_docs)} documents with synonym: {synonym}")
|
| 142 |
+
return fallback_docs
|
| 143 |
+
|
| 144 |
return final_docs
|
| 145 |
|
| 146 |
except Exception as e:
|
| 147 |
logger.error(f"Error during retrieval: {e}")
|
| 148 |
return []
|
| 149 |
|
| 150 |
+
async def retrieve_with_rewriting(self, query: str) -> List[Document]:
|
| 151 |
+
"""
|
| 152 |
+
Retrieve documents with query rewriting.
|
| 153 |
+
|
| 154 |
+
Args:
|
| 155 |
+
query: Original user query
|
| 156 |
+
|
| 157 |
+
Returns:
|
| 158 |
+
List of relevant documents with scores
|
| 159 |
+
"""
|
| 160 |
+
if not self.use_query_rewriting or not self.query_rewriter:
|
| 161 |
+
logger.info("[DEBUG] Query rewriting disabled, using original query")
|
| 162 |
+
return self.retrieve(query)
|
| 163 |
+
|
| 164 |
+
# Rewrite query
|
| 165 |
+
logger.info("Rewriting query for better retrieval...")
|
| 166 |
+
rewritten = await self.query_rewriter.rewrite_query(query)
|
| 167 |
+
|
| 168 |
+
# Build optimized query for BM25
|
| 169 |
+
optimized_query = self.query_rewriter.build_expanded_query(rewritten)
|
| 170 |
+
|
| 171 |
+
logger.info(f"[DEBUG] Original query: '{query}'")
|
| 172 |
+
logger.info(f"[DEBUG] Primary rewritten: '{rewritten.primary_query}'")
|
| 173 |
+
logger.info(f"[DEBUG] Key terms: {rewritten.key_terms}")
|
| 174 |
+
logger.info(f"[DEBUG] Optimized for BM25: '{optimized_query}'")
|
| 175 |
+
logger.info(f"Original: {query[:80]}")
|
| 176 |
+
logger.info(f"Optimized: {optimized_query[:80]}")
|
| 177 |
+
|
| 178 |
+
# Retrieve with optimized query
|
| 179 |
+
return self.retrieve(optimized_query, rewritten_query=rewritten)
|
| 180 |
+
|
| 181 |
+
def _retrieve_with_query(self, query: str) -> List[Document]:
|
| 182 |
+
"""
|
| 183 |
+
Helper method to retrieve with a specific query string.
|
| 184 |
+
|
| 185 |
+
Args:
|
| 186 |
+
query: Query string
|
| 187 |
+
|
| 188 |
+
Returns:
|
| 189 |
+
List of documents
|
| 190 |
+
"""
|
| 191 |
+
try:
|
| 192 |
+
bm25_results = self.bm25_retriever.run(
|
| 193 |
+
query=query,
|
| 194 |
+
top_k=self.retrieval_config.top_k,
|
| 195 |
+
)
|
| 196 |
+
docs = bm25_results.get("documents", [])
|
| 197 |
+
return self._apply_score_threshold(docs)
|
| 198 |
+
except Exception as e:
|
| 199 |
+
logger.error(f"Error in fallback retrieval: {e}")
|
| 200 |
+
return []
|
| 201 |
+
|
| 202 |
def _merge_results(
|
| 203 |
self, bm25_docs: List[Document], vector_docs: List[Document]
|
| 204 |
) -> List[Document]:
|
src/retrieval/query_rewriter.py
ADDED
|
@@ -0,0 +1,237 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Query rewriting for improved retrieval using LLM."""
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
import logging
|
| 5 |
+
from typing import List, Set
|
| 6 |
+
from pydantic import BaseModel, Field
|
| 7 |
+
from pydantic_ai import Agent
|
| 8 |
+
|
| 9 |
+
logger = logging.getLogger(__name__)
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class RewrittenQueries(BaseModel):
|
| 13 |
+
"""Structured output for rewritten queries."""
|
| 14 |
+
|
| 15 |
+
primary_query: str = Field(
|
| 16 |
+
description="Main rewritten query optimized for keyword search"
|
| 17 |
+
)
|
| 18 |
+
synonyms: List[str] = Field(
|
| 19 |
+
default_factory=list,
|
| 20 |
+
description="Alternative phrasings and synonyms"
|
| 21 |
+
)
|
| 22 |
+
key_terms: List[str] = Field(
|
| 23 |
+
default_factory=list,
|
| 24 |
+
description="Important domain-specific terms to boost"
|
| 25 |
+
)
|
| 26 |
+
explanation: str = Field(
|
| 27 |
+
default="",
|
| 28 |
+
description="Brief explanation of the rewriting strategy"
|
| 29 |
+
)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class QueryRewriter:
    """Rewrites queries for better retrieval using LLM.

    Uses a pydantic_ai Agent to turn colloquial user questions into
    keyword-friendly search queries, with a pure stopword-removal
    fallback when the LLM call fails.
    """

    # Stopwords for basic filtering (used as fallback)
    STOPWORDS: Set[str] = {
        # English
        "i", "me", "my", "we", "you", "he", "she", "it", "they", "am", "is", "are",
        "was", "were", "be", "been", "have", "has", "had", "do", "does", "did",
        "a", "an", "the", "and", "but", "if", "or", "as", "of", "at", "by", "for",
        "with", "about", "to", "from", "in", "on", "can", "will", "would", "should",
        # German
        "ich", "mich", "mir", "du", "er", "sie", "es", "wir", "ihr", "der", "die",
        "das", "den", "dem", "des", "ein", "eine", "und", "oder", "aber", "wenn",
        "als", "von", "zu", "mit", "bei", "für", "auf", "an", "in", "ist", "sind",
        "war", "haben", "hat", "kann", "wie", "was", "wo",
        # French
        "je", "tu", "il", "elle", "nous", "vous", "ils", "elles", "le", "la", "les",
        "un", "une", "des", "et", "ou", "mais", "si", "de", "à", "pour", "avec",
        "dans", "est", "sont", "avoir", "peut", "comment", "que", "qui",
    }

    def __init__(self, api_key: str, model: str = "openai:gpt-4o-mini"):
        """
        Initialize query rewriter.

        Args:
            api_key: OpenAI API key
            model: Model to use (default: gpt-4o-mini for speed)
        """
        # NOTE(review): api_key is accepted but never used in this class;
        # presumably pydantic_ai resolves credentials from the environment —
        # confirm, or wire the key through explicitly.
        self.agent = Agent[None, RewrittenQueries](
            model,
            system_prompt="""You are an expert at rewriting user queries for optimal document retrieval in a university administrative system (BFH - Bern University of Applied Sciences).

Your task is to transform natural language queries into optimized search queries that will find relevant administrative documents.

Key strategies:
1. **Translate colloquial to formal**: Convert informal phrasing to official administrative terms
- Example: "drop out" → "Exmatrikulation" (German) or "withdrawal" (English)
- Example: "change my major" → "Studiengangwechsel"

2. **Add domain-specific keywords**: Include relevant administrative terms
- Common terms: Antrag (application), Formular (form), Frist (deadline), Anmeldung (registration)

3. **Language consistency**: Keep search terms in the same language as documents (primarily German)
- German query → German search terms
- English query → can include German terms if they're official names

4. **Extract key entities**: Identify important terms like:
- Administrative processes: Exmatrikulation, Immatrikulation, Beurlaubung
- Documents: Formular, Bestätigung, Bescheinigung
- Deadlines: Frist, Semester, Anmeldung

5. **Expand with synonyms**: Provide alternative terms that might appear in documents
- Example: "Abmeldung" and "Exmatrikulation"

6. **Remove filler words**: Focus on content words only

Output:
- primary_query: The best optimized query for keyword search (BM25)
- synonyms: 2-3 alternative phrasings
- key_terms: 3-5 critical domain-specific terms to boost in search
- explanation: Brief note on your strategy

Keep queries concise (5-10 words max for primary_query)."""
        )

    async def rewrite_query(self, original_query: str) -> RewrittenQueries:
        """
        Rewrite a query for better retrieval.

        Args:
            original_query: Original user query

        Returns:
            Rewritten queries with metadata (falls back to plain
            stopword removal if the LLM call raises).
        """
        # Lazy %-style args: the message is only formatted if the level is enabled.
        logger.info("Rewriting query: %s...", original_query[:100])

        try:
            result = await self.agent.run(f"Rewrite this query for optimal document retrieval:\n\n{original_query}")
            rewritten = result.output

            # Post-process to ensure stopwords are removed
            rewritten = self._clean_stopwords(rewritten)

            logger.info("Rewritten primary query: %s", rewritten.primary_query)
            logger.debug("Key terms: %s", ", ".join(rewritten.key_terms))
            logger.debug("Explanation: %s", rewritten.explanation)

            return rewritten

        except Exception as e:
            logger.error("Error rewriting query: %s", e)
            # Fallback to basic stopword removal
            return self._fallback_rewrite(original_query)

    def _clean_stopwords(self, rewritten: RewrittenQueries) -> RewrittenQueries:
        """
        Clean stopwords from rewritten query (post-processing safety net).

        Args:
            rewritten: Rewritten query from LLM

        Returns:
            Cleaned rewritten query
        """
        # NOTE(review): tokens are lowercased here, which lowercases German
        # nouns (e.g. "Exmatrikulation" → "exmatrikulation"). This is fine if
        # the BM25 analyzer also lowercases — confirm against the retriever.
        tokens = re.findall(r'\w+', rewritten.primary_query.lower())
        removed_stopwords = [t for t in tokens if t in self.STOPWORDS]
        cleaned_tokens = [t for t in tokens if t not in self.STOPWORDS]

        # FIX: these diagnostics were logged at INFO with a literal "[DEBUG]"
        # prefix, flooding production logs; they belong at DEBUG level.
        logger.debug("Stopwords removed from primary query: %s", removed_stopwords)
        logger.debug("Before stopword cleaning: '%s'", rewritten.primary_query)

        if cleaned_tokens:
            cleaned_primary = " ".join(cleaned_tokens)
        else:
            # If everything was removed, keep original
            cleaned_primary = rewritten.primary_query

        logger.debug("After stopword cleaning: '%s'", cleaned_primary)

        # Clean key terms (single tokens: drop any that are themselves stopwords)
        cleaned_key_terms = [term for term in rewritten.key_terms if term.lower() not in self.STOPWORDS]

        # Clean synonyms (multi-word: filter stopword tokens, drop empty results)
        cleaned_synonyms = []
        for syn in rewritten.synonyms:
            syn_tokens = re.findall(r'\w+', syn.lower())
            syn_cleaned = [t for t in syn_tokens if t not in self.STOPWORDS]
            if syn_cleaned:
                cleaned_synonyms.append(" ".join(syn_cleaned))

        return RewrittenQueries(
            primary_query=cleaned_primary,
            synonyms=cleaned_synonyms,
            key_terms=cleaned_key_terms,
            explanation=rewritten.explanation + " (stopwords cleaned)",
        )

    def _fallback_rewrite(self, query: str) -> RewrittenQueries:
        """
        Fallback query rewriting using simple stopword removal.

        Args:
            query: Original query

        Returns:
            Basic rewritten query (no synonyms; key_terms are the first
            five surviving tokens).
        """
        logger.info("Using fallback query rewriting (stopword removal)")

        # Tokenize and filter stopwords; very short tokens (< 3 chars) are
        # dropped too, since they rarely carry retrieval signal.
        tokens = re.findall(r'\w+', query.lower())
        filtered = [t for t in tokens if t not in self.STOPWORDS and len(t) >= 3]

        primary_query = " ".join(filtered) if filtered else query

        return RewrittenQueries(
            primary_query=primary_query,
            synonyms=[],
            key_terms=filtered[:5],
            explanation="Fallback: removed stopwords only"
        )

    def build_expanded_query(self, rewritten: RewrittenQueries) -> str:
        """
        Build an expanded query combining primary query and key terms.

        Args:
            rewritten: Rewritten query data

        Returns:
            Expanded query string for BM25 search
        """
        # Combine primary query with key terms (weighted more)
        parts = [rewritten.primary_query]

        # Add key terms only if not already present (case-insensitive substring check)
        for term in rewritten.key_terms:
            if term.lower() not in rewritten.primary_query.lower():
                parts.append(term)

        return " ".join(parts)

    def get_all_variants(self, rewritten: RewrittenQueries) -> List[str]:
        """
        Get all query variants for multi-query retrieval.

        Args:
            rewritten: Rewritten query data

        Returns:
            List of query variants: primary query, up to two synonyms,
            and the expanded query (if distinct).
        """
        variants = [rewritten.primary_query]

        # Add synonym variants
        variants.extend(rewritten.synonyms[:2])  # Limit to top 2

        # Add expanded query
        expanded = self.build_expanded_query(rewritten)
        if expanded not in variants:
            variants.append(expanded)

        return variants
|
test_query_debug.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Test script to debug query processing for 'Was kostet eine Namensänderung?'"""
|
| 2 |
+
|
| 3 |
+
import asyncio
|
| 4 |
+
import logging
|
| 5 |
+
import sys
|
| 6 |
+
from src.config import get_config
|
| 7 |
+
from src.indexing.indexer import DocumentIndexer
|
| 8 |
+
from src.pipeline.orchestrator import RAGOrchestrator
|
| 9 |
+
|
| 10 |
+
# Configure logging to see all DEBUG messages
|
| 11 |
+
logging.basicConfig(
|
| 12 |
+
level=logging.INFO,
|
| 13 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
| 14 |
+
handlers=[logging.StreamHandler(sys.stdout)]
|
| 15 |
+
)
|
| 16 |
+
|
| 17 |
+
# Set specific loggers to INFO to see debug messages
|
| 18 |
+
logging.getLogger('src.retrieval.hybrid_retriever').setLevel(logging.INFO)
|
| 19 |
+
logging.getLogger('src.retrieval.query_rewriter').setLevel(logging.INFO)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
async def test_query():
|
| 23 |
+
"""Test the problematic query."""
|
| 24 |
+
print("\n" + "="*80)
|
| 25 |
+
print("Testing query: 'Was kostet eine Namensänderung?'")
|
| 26 |
+
print("="*80 + "\n")
|
| 27 |
+
|
| 28 |
+
# Load config
|
| 29 |
+
config = get_config()
|
| 30 |
+
|
| 31 |
+
# Initialize indexer (this connects to document store)
|
| 32 |
+
indexer = DocumentIndexer(config)
|
| 33 |
+
|
| 34 |
+
# Initialize orchestrator
|
| 35 |
+
orchestrator = RAGOrchestrator(config, indexer)
|
| 36 |
+
|
| 37 |
+
# Test query
|
| 38 |
+
query = "Was kostet eine Namensänderung?"
|
| 39 |
+
|
| 40 |
+
print(f"\n>>> Running query: '{query}'\n")
|
| 41 |
+
|
| 42 |
+
try:
|
| 43 |
+
result = await orchestrator.process_query(query)
|
| 44 |
+
|
| 45 |
+
print("\n" + "="*80)
|
| 46 |
+
print("RESULTS SUMMARY")
|
| 47 |
+
print("="*80)
|
| 48 |
+
print(f"Documents retrieved: {len(result.retrieved_docs)}")
|
| 49 |
+
print(f"Processing time: {result.processing_time:.2f}s")
|
| 50 |
+
print(f"\nIntent detected:")
|
| 51 |
+
print(f" - Action: {result.intent.action_type}")
|
| 52 |
+
print(f" - Topic: {result.intent.topic}")
|
| 53 |
+
print(f" - Language: {result.intent.language}")
|
| 54 |
+
|
| 55 |
+
if result.retrieved_docs:
|
| 56 |
+
print(f"\nTop 3 retrieved documents:")
|
| 57 |
+
for i, doc in enumerate(result.retrieved_docs[:3], 1):
|
| 58 |
+
print(f"\n [{i}] Score: {doc.get('score', 'N/A'):.4f}")
|
| 59 |
+
print(f" Source: {doc.get('meta', {}).get('source_file', 'Unknown')}")
|
| 60 |
+
print(f" Preview: {doc.get('content', '')[:150]}...")
|
| 61 |
+
|
| 62 |
+
print(f"\nEmail draft preview:")
|
| 63 |
+
print(f" Subject: {result.email_draft.subject}")
|
| 64 |
+
print(f" Body (first 200 chars): {result.email_draft.body[:200]}...")
|
| 65 |
+
|
| 66 |
+
except Exception as e:
|
| 67 |
+
print(f"\n❌ Error: {e}")
|
| 68 |
+
import traceback
|
| 69 |
+
traceback.print_exc()
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
if __name__ == "__main__":
|
| 73 |
+
asyncio.run(test_query())
|
test_retrieval_simple.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Simple test script to check retrieval for the problematic query."""
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
import sys
|
| 5 |
+
|
| 6 |
+
# Set up logging
|
| 7 |
+
logging.basicConfig(level=logging.INFO, format='%(levelname)s - %(message)s')
|
| 8 |
+
|
| 9 |
+
# Import after logging is configured
|
| 10 |
+
from src.config import get_config
|
| 11 |
+
from src.indexing.memory_indexer import MemoryDocumentIndexer
|
| 12 |
+
from src.retrieval.memory_retriever import MemoryRetriever
|
| 13 |
+
|
| 14 |
+
def test_retrieval():
|
| 15 |
+
"""Test retrieval for the problematic query."""
|
| 16 |
+
|
| 17 |
+
query = "Wie kann ich einen Rückzug machen? Wie kann ich mich vom Studium abmelden?"
|
| 18 |
+
|
| 19 |
+
print("="*80)
|
| 20 |
+
print(f"Testing retrieval for query:")
|
| 21 |
+
print(f" '{query}'")
|
| 22 |
+
print("="*80)
|
| 23 |
+
|
| 24 |
+
# Load config
|
| 25 |
+
config = get_config()
|
| 26 |
+
|
| 27 |
+
# Initialize indexer and retriever
|
| 28 |
+
print("\nInitializing document store...")
|
| 29 |
+
indexer = MemoryDocumentIndexer(llm_config=config.llm)
|
| 30 |
+
|
| 31 |
+
# Load documents from JSON
|
| 32 |
+
import json
|
| 33 |
+
from haystack import Document as HaystackDoc
|
| 34 |
+
from pathlib import Path
|
| 35 |
+
|
| 36 |
+
json_path = Path("documents_indexed.json")
|
| 37 |
+
if json_path.exists():
|
| 38 |
+
print(f"Loading documents from {json_path}...")
|
| 39 |
+
with open(json_path) as f:
|
| 40 |
+
docs_data = json.load(f)
|
| 41 |
+
|
| 42 |
+
documents = []
|
| 43 |
+
for doc_data in docs_data:
|
| 44 |
+
doc = HaystackDoc(
|
| 45 |
+
id=doc_data["id"],
|
| 46 |
+
content=doc_data["content"],
|
| 47 |
+
embedding=doc_data.get("embedding"),
|
| 48 |
+
meta=doc_data.get("meta", {})
|
| 49 |
+
)
|
| 50 |
+
documents.append(doc)
|
| 51 |
+
|
| 52 |
+
indexer.document_store.write_documents(documents)
|
| 53 |
+
print(f" Loaded {len(documents)} documents")
|
| 54 |
+
else:
|
| 55 |
+
print(f"ERROR: {json_path} not found!")
|
| 56 |
+
return
|
| 57 |
+
|
| 58 |
+
print(f" Total documents in store: {indexer.document_store.count_documents()}")
|
| 59 |
+
|
| 60 |
+
# Initialize retriever
|
| 61 |
+
print("\nInitializing retriever...")
|
| 62 |
+
retriever = MemoryRetriever(
|
| 63 |
+
document_store=indexer.document_store,
|
| 64 |
+
llm_config=config.llm,
|
| 65 |
+
retrieval_config=config.retrieval,
|
| 66 |
+
)
|
| 67 |
+
|
| 68 |
+
# Perform retrieval
|
| 69 |
+
print(f"\nPerforming retrieval...")
|
| 70 |
+
docs = retriever.retrieve(query)
|
| 71 |
+
|
| 72 |
+
print(f"\nResults: {len(docs)} documents retrieved")
|
| 73 |
+
print("="*80)
|
| 74 |
+
|
| 75 |
+
if len(docs) == 0:
|
| 76 |
+
print("❌ NO DOCUMENTS RETRIEVED!")
|
| 77 |
+
print("\nThis is the problem - retrieval is returning 0 results.")
|
| 78 |
+
else:
|
| 79 |
+
print("✅ Documents were retrieved successfully\n")
|
| 80 |
+
for i, doc in enumerate(docs, 1):
|
| 81 |
+
score = doc.score if doc.score else 0.0
|
| 82 |
+
source = doc.meta.get('source_file', 'Unknown')
|
| 83 |
+
section = doc.meta.get('section', 'N/A')
|
| 84 |
+
|
| 85 |
+
print(f"\n[{i}] Score: {score:.4f}")
|
| 86 |
+
print(f" Source: {source}")
|
| 87 |
+
print(f" Section: {section}")
|
| 88 |
+
print(f" Content preview: {doc.content[:150]}...")
|
| 89 |
+
|
| 90 |
+
print("\n" + "="*80)
|
| 91 |
+
|
| 92 |
+
if __name__ == "__main__":
|
| 93 |
+
test_retrieval()
|