Spaces:

Peterase
/

rag-api-node-1

Running

Peterase committed 23 days ago

Commit

d0fb28f

1 Parent(s): 583c3c6

feat(query-enhancement): upgrade LLM prompt + replace print with logger

Query Enhancement Prompt Upgrades:
- Added 'location' field: extracts specific region/city from query
- Added 'query_type' field: conflict|humanitarian|political|economic|social|general
- Added 'search_keywords' field: 3-5 key terms for keyword overlap filter
- Removed NLI DeBERTa fallback (replaced with simple regex fallback)
- Cleaner regex fallback: temporal + source detection without DeBERTa dependency
- Structured log output: [QE] topic=... type=... days_back=... source=... location=...

Logging Improvements:
- Replaced ALL print(f'DEBUG: ...') with logger.info(f'[RAG] ...')
- Fixed broken line: except block return [] was on same line as logger call
- Consistent [RAG] prefix for all pipeline log messages
- LLM-call and per-language search failures use logger.warning() instead of logger.info() (other failure paths still log at info level)
- Server logs now show clean structured output instead of print spam

Files changed (1) hide show

src/core/use_cases/rag_chat_use_case.py +124 -117

src/core/use_cases/rag_chat_use_case.py CHANGED Viewed

@@ -135,38 +135,52 @@ Document:
     def _extract_intents_and_translate(self, query: str) -> Dict[str, Any]:
         """
-        Single LLM call that does BOTH intent extraction AND multilingual translation.
-        Replaces the previous two-call approach (_extract_intents + _translate_query_to_all_languages).
         Returns:
         {
-            "expanded_query": str,          # cleaned English query
-            "days_back": int | None,        # temporal filter
-            "source": str | None,           # source filter
-            "translations": {               # per-language queries for Qdrant
-                "en": str, "ar": str, "am": str, "so": str, "sw": str, "fr": str
             }
         }
         Falls back gracefully on any LLM/parse failure.
         """
         import re, json
-        prompt = f"""You are a search query processor for a multilingual news system. Analyze the query and output ONLY valid JSON.
-Required fields:
-- "days_back": integer or null
-  * 1=today/now/tonight, 2=yesterday, 7=this week/recently, 30=this month, 365=this year, null=no time
-- "source": string or null — news outlet name if mentioned (e.g. "BBC", "Reuters"), else null
-- "topic": string — the SPECIFIC search topic with typos fixed, time/source words removed.
-  IMPORTANT: Be precise. "Ethiopia peace talks" must stay "Ethiopia peace talks", NOT expand to "Sudan peace talks" or "Horn of Africa conflict".
-  Keep named entities exact. Do NOT broaden or generalize the topic.
-- "translations": object with these 5 keys (translate "topic" into each language):
-  * "ar": Arabic
-  * "am": Amharic (Ethiopian script)
-  * "so": Somali
-  * "sw": Swahili
-  * "fr": French
-  Rules for translations: keep concise and specific, preserve named entities in local spelling, use English term if uncertain.
 Query: "{query}"
@@ -176,6 +190,9 @@ JSON:"""
             "expanded_query": query,
             "days_back": None,
             "source": None,
             "translations": {lang: query for lang in ["ar", "am", "so", "sw", "fr"]},
         }
@@ -187,68 +204,55 @@ JSON:"""
                 if isinstance(parsed.get("days_back"), int):
                     result["days_back"] = parsed["days_back"]
-                    print(f"DEBUG: days_back={parsed['days_back']}")
                 if isinstance(parsed.get("source"), str) and parsed["source"]:
                     result["source"] = parsed["source"]
-                    print(f"DEBUG: source={parsed['source']}")
                 if isinstance(parsed.get("topic"), str) and parsed["topic"].strip():
                     result["expanded_query"] = parsed["topic"].strip()
-                    print(f"DEBUG: expanded_query='{result['expanded_query']}'")
                 translations = parsed.get("translations", {})
                 if isinstance(translations, dict):
                     for lang in ["ar", "am", "so", "sw", "fr"]:
                         val = translations.get(lang, "").strip()
                         result["translations"][lang] = val if val else result["expanded_query"]
-                        print(f"DEBUG: translation [{lang}]: {result['translations'][lang]}")
                 return result
         except Exception as e:
-            print(f"DEBUG: Combined LLM call failed: {e} — using NLI fallback for temporal, English for translations")
-        # ── NLI fallback for temporal only (translations stay as English) ────
-        try:
-            from src.infrastructure.adapters.intent_classifier import intent_classifier
-            pipe = intent_classifier._pipe
-            if pipe is None:
-                intent_classifier._load()
-                pipe = intent_classifier._pipe
-            if pipe is not None:
-                temporal_result = pipe(
-                    query,
-                    candidate_labels=[
-                        "asking about news from today or right now",
-                        "asking about news from yesterday",
-                        "asking about news from this week or past few days",
-                        "asking about news from this month",
-                        "asking about news from this year",
-                        "no specific time period mentioned",
-                    ],
-                    hypothesis_template="The user is {}.",
-                    multi_label=False,
-                )
-                top_label = temporal_result["labels"][0]
-                top_score = temporal_result["scores"][0]
-                if top_score > 0.4:
-                    if "today" in top_label or "right now" in top_label:
-                        result["days_back"] = 1
-                    elif "yesterday" in top_label:
-                        result["days_back"] = 2
-                    elif "week" in top_label or "past few days" in top_label:
-                        result["days_back"] = 7
-                    elif "month" in top_label:
-                        result["days_back"] = 30
-                    elif "year" in top_label:
-                        result["days_back"] = 365
-                    print(f"DEBUG: NLI temporal fallback → days_back={result['days_back']}")
-        except Exception as e:
-            print(f"DEBUG: NLI fallback also failed: {e}")
-        # ── Source regex fallback ─────────────────────────────────────────────
         source_match = re.search(
             r'\b(?:from|on|by|via|source[:\s]+)\s*([A-Z][A-Za-z]+(?:\s[A-Z][A-Za-z]+)?)\b',
             query
@@ -256,6 +260,7 @@ JSON:"""
         if source_match:
             result["source"] = source_match.group(1)
         return result
     def _search_single_language(
@@ -309,11 +314,11 @@ JSON:"""
                     "doc_id": hit.doc_id,
                 })
-            print(f"DEBUG: [{lang_code}] search returned {len(docs)} results")
             return docs
         except Exception as e:
-            print(f"DEBUG: [{lang_code}] search failed: {e}")
             return []
     async def _build_context(self, query: str, top_k: int, source_filter=None, language_filter=None, days_back=None) -> Tuple[str, List[Dict[str, Any]]]:
@@ -327,13 +332,13 @@ JSON:"""
             if language_detector:
                 lang_detection = language_detector.detect(query)
                 query_language = lang_detection.language
-                print(f"DEBUG: Detected language: {query_language} (confidence={lang_detection.confidence:.2f}, method={lang_detection.method})")
                 # If query is not in English, we'll handle it in translation step
                 if query_language != "en":
-                    print(f"DEBUG: Non-English query detected, will translate to English for processing")
         except Exception as e:
-            print(f"DEBUG: Language detection failed: {e}, assuming English")
         # Expand query if needed (typo fix, short query expansion)
         try:
@@ -341,20 +346,20 @@ JSON:"""
             if query_expander:
                 expansion_result = query_expander.expand(query)
                 if expansion_result.was_expanded:
-                    print(f"DEBUG: Query expanded: '{query}' → '{expansion_result.expanded}'")
-                    print(f"DEBUG: Expansion reason: {expansion_result.expansion_reason}")
                     query = expansion_result.expanded
                 else:
-                    print(f"DEBUG: Query not expanded: {expansion_result.expansion_reason}")
         except Exception as e:
-            print(f"DEBUG: Query expansion failed: {e}, using original query")
         # Extract entities for better filtering
         try:
             from src.infrastructure.adapters.entity_extractor import entity_extractor
             if entity_extractor:
                 entities = entity_extractor.extract(query)
-                print(f"DEBUG: Extracted entities:")
                 print(f"  - Locations: {entities.locations}")
                 print(f"  - Organizations: {entities.organizations}")
                 print(f"  - Temporal keywords: {entities.temporal_keywords}")
@@ -364,9 +369,9 @@ JSON:"""
                     auto_source = entity_extractor.get_source_filter(entities)
                     if auto_source:
                         source_filter = auto_source
-                        print(f"DEBUG: Auto-detected source filter: {source_filter}")
         except Exception as e:
-            print(f"DEBUG: Entity extraction failed: {e}")
         # ── Step 1: Single LLM call — intent extraction + multilingual translation ──
         expanded_query = query
@@ -380,7 +385,7 @@ JSON:"""
         use_hybrid = self.orchestrator is not None and self.hybrid_ranker is not None
         if use_hybrid:
-            print(f"DEBUG: Hybrid search enabled - checking intent and strategy")
             # Classify intent using v2 (production-grade) or v1 (fallback)
             # Check Redis cache first to avoid 8-11s DeBERTa inference on repeat queries
@@ -390,7 +395,7 @@ JSON:"""
             if self.cache:
                 cached_intent = self.cache.get(intent_cache_key)
                 if cached_intent:
-                    print(f"DEBUG: Intent cache HIT — skipping DeBERTa inference")
                     # Reconstruct a minimal intent result from cache
                     class _CachedIntent:
                         def __init__(self, d):
@@ -400,17 +405,19 @@ JSON:"""
                             self.inference_time_ms = 0.0
                     intent_result = _CachedIntent(cached_intent)
                     intent = "NEWS" if intent_result.intent != "OTHER" else "OTHER"
-                    print(f"DEBUG: Intent (cached): {intent_result.intent} (confidence={intent_result.confidence:.2f})")
             if intent_result is None:
                 if self.use_v2_classifier and self.intent_classifier_v2:
                     intent_result = self.intent_classifier_v2.classify(query)
                     intent = "NEWS" if intent_result.intent != "OTHER" else "OTHER"
-                    print(f"DEBUG: Intent classification v2: {intent_result.intent} "
-                          f"(confidence={intent_result.confidence:.2f}, "
-                          f"method={intent_result.method}, "
-                          f"time={intent_result.inference_time_ms:.1f}ms)")
                     # Cache intent result for 1 hour (same query = same intent)
                     if self.cache:
@@ -422,27 +429,27 @@ JSON:"""
                 else:
                     intent = self.intent_classifier.classify(query)
                     intent_result = None
-                    print(f"DEBUG: Intent classification v1: {intent}")
             # Decide search strategy (pass full intent_result for v2)
             strategy = self.orchestrator.decide_search_strategy(query, intent, intent_result)
-            print(f"DEBUG: Search strategy: {strategy}")
             # If intent is OTHER (small talk), skip search entirely
             if intent == "OTHER":
-                print(f"DEBUG: Small talk detected - skipping search")
                 return "", []
         else:
-            print(f"DEBUG: Hybrid search disabled - using traditional pipeline")
             use_hybrid = False
             strategy = None
         if actual_language_filter:
             # Explicit language override — single-language mode, no translation needed
-            print(f"DEBUG: Language filter '{actual_language_filter}' — single-language mode")
             lang_sparse_queries: Dict[str, str] = {actual_language_filter: expanded_query}
         else:
-            print(f"DEBUG: Running combined intent extraction + translation...")
             combined = self._extract_intents_and_translate(query)
             if combined.get("days_back") and isinstance(combined["days_back"], int):
@@ -466,7 +473,7 @@ JSON:"""
         # ── HYBRID SEARCH EXECUTION ────────────────────────────────────────────
         if use_hybrid and strategy and (strategy.use_live or strategy.use_db):
-            print(f"DEBUG: Executing hybrid search...")
             # Execute hybrid search (parallel live + DB)
             try:
@@ -480,7 +487,7 @@ JSON:"""
                     top_k=per_lang_limit
                 )
-                print(f"DEBUG: Hybrid search returned {len(db_results)} DB + {len(live_results)} live results")
                 # Merge and rank results
                 all_docs = self.hybrid_ranker.merge_and_rank(
@@ -491,24 +498,24 @@ JSON:"""
                     final_top_n=top_k * 3  # Get more candidates for quality filtering
                 )
-                print(f"DEBUG: After hybrid ranking: {len(all_docs)} results")
             except Exception as e:
-                print(f"DEBUG: Hybrid search failed: {e} - falling back to traditional pipeline")
                 use_hybrid = False
                 all_docs = []
         # ── TRADITIONAL PIPELINE (fallback or when hybrid disabled) ────────────
         if not use_hybrid or not all_docs:
-            print(f"DEBUG: Using traditional multilingual pipeline")
             # ── Step 2: Compute dense vector ONCE from English query ──────────────
             # BGE-M3 dense space is language-agnostic — one English dense vector
             # semantically matches content in all 6 languages.
-            print(f"DEBUG: Computing dense vector for: '{expanded_query}'")
             english_vectors = self.embedder.encode_query(expanded_query)
             dense_vec: List[float] = english_vectors["dense"]
-            print(f"DEBUG: Dense vector ready ({len(dense_vec)} dims)")
             # ── Step 3: Batch sparse encoding — ONE forward pass for all languages ─
             # BGE-M3 holds the GIL during inference. ThreadPoolExecutor gives zero
@@ -518,7 +525,7 @@ JSON:"""
             lang_codes = list(lang_sparse_queries.keys())
             lang_texts  = [lang_sparse_queries[lc] for lc in lang_codes]
-            print(f"DEBUG: Batch sparse encoding {len(lang_texts)} language queries...")
             sparse_results = self.embedder.encode_sparse_batch(lang_texts)
             lang_sparse_vecs: Dict[str, Optional[Dict]] = {}
@@ -526,7 +533,7 @@ JSON:"""
                 sparse = result.get("sparse")
                 lang_sparse_vecs[lc] = sparse
                 token_count = len(sparse["indices"]) if sparse else 0
-                print(f"DEBUG: [{lc}] sparse ready — {token_count} tokens")
             # ── Step 4: Fan out to Qdrant — 6 parallel searches ──────────────────
             # Each lane: shared dense_vec + language-specific sparse_vec + language filter
@@ -559,14 +566,14 @@ JSON:"""
                                 seen_doc_ids.add(doc_id)
                             all_docs.append(doc)
                     except Exception as e:
-                        print(f"DEBUG: [{lc}] future failed: {e}")
-            print(f"DEBUG: Total pooled candidates after dedup: {len(all_docs)}")
             # ── Step 5: Temporal fallback — retry without date filter if zero results ──
             self._temporal_fallback_used = False
             if len(all_docs) == 0 and days_back is not None:
-                print(f"DEBUG: No results with days_back={days_back} — retrying without temporal filter")
                 with concurrent.futures.ThreadPoolExecutor(max_workers=len(lang_sparse_queries)) as executor:
                     futures = {
                         executor.submit(_run_search, lc, None): lc
@@ -583,9 +590,9 @@ JSON:"""
                                     seen_doc_ids.add(doc_id)
                                 all_docs.append(doc)
                         except Exception as e:
-                            print(f"DEBUG: [{lc}] fallback future failed: {e}")
                 self._temporal_fallback_used = True
-                print(f"DEBUG: Fallback returned {len(all_docs)} total candidates")
             # ── Step 6: Multilingual reranking ────────────────────────────────────
             # bge-reranker-v2-m3 scores (English query, any-language content) natively
@@ -610,11 +617,11 @@ JSON:"""
         above_threshold = [d for d in quality_docs if d.get("rerank_score", 1.0) >= RERANK_THRESHOLD]
         if above_threshold:
             quality_docs = above_threshold
-            print(f"DEBUG: {len(quality_docs)} docs above rerank threshold {RERANK_THRESHOLD}")
         else:
             # All scores low — keep top 3 anyway rather than returning nothing
             quality_docs = quality_docs[:3]
-            print(f"DEBUG: All docs below threshold — keeping top 3 by rerank score")
         # ── Keyword overlap filter — soft filter, keeps docs with ANY query term ─
         # Only drops docs with ZERO overlap AND low rerank score.
@@ -643,7 +650,7 @@ JSON:"""
             overlapping = [d for d in quality_docs if _has_overlap(d)]
             if overlapping:
                 quality_docs = overlapping
-                print(f"DEBUG: {len(quality_docs)} docs after keyword overlap filter")
             else:
                 # No overlap at all — keep top 5 by score rather than dropping everything
                 quality_docs = sorted(
@@ -651,7 +658,7 @@ JSON:"""
                     key=lambda d: d.get("rerank_score") or d.get("score", 0),
                     reverse=True
                 )[:5]
-                print(f"DEBUG: No keyword overlap — keeping top 5 by score ({len(quality_docs)} docs)")
         # Guarantee at least 1 non-English result if available
         non_english = [d for d in quality_docs if d.get("metadata", {}).get("_search_lang", "en") != "en"]
@@ -672,7 +679,7 @@ JSON:"""
             deduped_final.append(d)
         langs_in_result = list({d.get("metadata", {}).get("_search_lang", "en") for d in deduped_final})
-        print(f"DEBUG: Final {len(deduped_final)} docs — languages: {langs_in_result}")
         # ── Step 8: Token limitation ──────────────────────────────────────────
         return self._limit_context(query, deduped_final)
@@ -703,12 +710,12 @@ JSON:"""
         }
     async def execute_chat(self, request: ChatRequest) -> Dict[str, Any]:
-        print(f"DEBUG: execute_chat called with query: {request.query}")
         # Generate a unique session ID if none provided — never use a shared fallback
         if not request.session_id:
             import uuid
             request.session_id = str(uuid.uuid4())
-            print(f"DEBUG: Generated new session_id: {request.session_id}")
         session_id = request.session_id
         # ── Layer 1: Full Response Cache (5 min TTL) ──────────────────────────
@@ -869,7 +876,7 @@ Answer:"""
                 result,
                 expiration=settings.CACHE_RESPONSE_TTL
             )
-            print(f"DEBUG: Cached full response (TTL={settings.CACHE_RESPONSE_TTL}s)")
         return result
@@ -878,7 +885,7 @@ Answer:"""
         if not request.session_id:
             import uuid
             request.session_id = str(uuid.uuid4())
-            print(f"DEBUG: Generated new session_id: {request.session_id}")
         session_id = request.session_id
         history_text = "" if is_guest else self._get_history_text(session_id)
         context_text, final_sources = await self._build_context(

     def _extract_intents_and_translate(self, query: str) -> Dict[str, Any]:
         """
+        Single LLM call: query understanding + multilingual translation.
         Returns:
         {
+            "expanded_query": str,       # cleaned, specific English topic
+            "days_back": int | None,     # temporal filter (1=today, 7=week, 30=month, 365=year)
+            "source": str | None,        # news outlet filter if mentioned
+            "location": str | None,      # specific location/region if mentioned
+            "query_type": str,           # "conflict" | "humanitarian" | "political" | "economic" | "social" | "general"
+            "search_keywords": list,     # 3-5 key terms for keyword overlap filter
+            "translations": {            # per-language queries for Qdrant
+                "ar": str, "am": str, "so": str, "sw": str, "fr": str
             }
         }
         Falls back gracefully on any LLM/parse failure.
         """
         import re, json
+        prompt = f"""You are a search query processor for a multilingual Ethiopia/Africa news system.
+Analyze the query and output ONLY valid JSON with these exact fields:
+{{
+  "days_back": <integer or null>,
+  "source": <string or null>,
+  "location": <string or null>,
+  "query_type": <string>,
+  "topic": <string>,
+  "search_keywords": [<string>, ...],
+  "translations": {{"ar": <string>, "am": <string>, "so": <string>, "sw": <string>, "fr": <string>}}
+}}
+Field rules:
+- "days_back": 1=today/now/tonight/breaking, 2=yesterday, 7=this week/recently/past few days, 30=this month, 365=this year, null=no time reference
+- "source": news outlet name ONLY if explicitly mentioned (e.g. "BBC", "Reuters", "Al Jazeera"), else null
+- "location": specific place if mentioned (e.g. "Amhara", "Tigray", "Addis Ababa", "Somalia"), else null
+- "query_type": ONE of: "conflict" | "humanitarian" | "political" | "economic" | "social" | "general"
+  * conflict = fighting, clashes, attacks, military, armed groups
+  * humanitarian = displaced, refugees, aid, famine, drought, flood
+  * political = elections, government, diplomacy, policy, leaders
+  * economic = economy, trade, investment, inflation, development
+  * social = education, health, culture, religion, sports
+  * general = anything else
+- "topic": the SPECIFIC search topic — fix typos, remove time/source words, keep named entities EXACT.
+  "Ethiopia peace talks" stays "Ethiopia peace talks". Do NOT broaden or generalize.
+- "search_keywords": 3-5 key terms from the topic for keyword matching (lowercase, no stopwords)
+- "translations": translate "topic" into each language. Keep named entities in local spelling. Use English if uncertain.
 Query: "{query}"
             "expanded_query": query,
             "days_back": None,
             "source": None,
+            "location": None,
+            "query_type": "general",
+            "search_keywords": [],
             "translations": {lang: query for lang in ["ar", "am", "so", "sw", "fr"]},
         }
                 if isinstance(parsed.get("days_back"), int):
                     result["days_back"] = parsed["days_back"]
                 if isinstance(parsed.get("source"), str) and parsed["source"]:
                     result["source"] = parsed["source"]
+                if isinstance(parsed.get("location"), str) and parsed["location"]:
+                    result["location"] = parsed["location"]
+                if isinstance(parsed.get("query_type"), str) and parsed["query_type"]:
+                    result["query_type"] = parsed["query_type"]
+                if isinstance(parsed.get("search_keywords"), list):
+                    result["search_keywords"] = [k for k in parsed["search_keywords"] if isinstance(k, str)][:5]
                 if isinstance(parsed.get("topic"), str) and parsed["topic"].strip():
                     result["expanded_query"] = parsed["topic"].strip()
                 translations = parsed.get("translations", {})
                 if isinstance(translations, dict):
                     for lang in ["ar", "am", "so", "sw", "fr"]:
                         val = translations.get(lang, "").strip()
                         result["translations"][lang] = val if val else result["expanded_query"]
+                logger.info(
+                    f"[QE] topic='{result['expanded_query']}' "
+                    f"type={result['query_type']} "
+                    f"days_back={result['days_back']} "
+                    f"source={result['source']} "
+                    f"location={result['location']} "
+                    f"keywords={result['search_keywords']}"
+                )
                 return result
         except Exception as e:
+            logger.warning(f"[QE] LLM call failed: {e} — using regex fallback")
+        # ── Regex fallback for temporal + source ─────────────────────────────
+        query_lower = query.lower()
+        # Temporal fallback
+        if any(w in query_lower for w in ("today", "tonight", "now", "breaking", "just")):
+            result["days_back"] = 1
+        elif "yesterday" in query_lower:
+            result["days_back"] = 2
+        elif any(w in query_lower for w in ("this week", "recently", "past few days")):
+            result["days_back"] = 7
+        elif "this month" in query_lower:
+            result["days_back"] = 30
+        # Source fallback
         source_match = re.search(
             r'\b(?:from|on|by|via|source[:\s]+)\s*([A-Z][A-Za-z]+(?:\s[A-Z][A-Za-z]+)?)\b',
             query
         if source_match:
             result["source"] = source_match.group(1)
+        logger.info(f"[QE] Regex fallback: days_back={result['days_back']} source={result['source']}")
         return result
     def _search_single_language(
                     "doc_id": hit.doc_id,
                 })
+            logger.info(f"[RAG] [{lang_code}] search returned {len(docs)} results")
             return docs
         except Exception as e:
+            logger.warning(f"[RAG] [{lang_code}] search failed: {e}")
             return []
     async def _build_context(self, query: str, top_k: int, source_filter=None, language_filter=None, days_back=None) -> Tuple[str, List[Dict[str, Any]]]:
             if language_detector:
                 lang_detection = language_detector.detect(query)
                 query_language = lang_detection.language
+                logger.info(f"[RAG] Detected language: {query_language} (confidence={lang_detection.confidence:.2f}, method={lang_detection.method})")
                 # If query is not in English, we'll handle it in translation step
                 if query_language != "en":
+                    logger.info(f"[RAG] Non-English query detected, will translate to English for processing")
         except Exception as e:
+            logger.info(f"[RAG] Language detection failed: {e}, assuming English")
         # Expand query if needed (typo fix, short query expansion)
         try:
             if query_expander:
                 expansion_result = query_expander.expand(query)
                 if expansion_result.was_expanded:
+                    logger.info(f"[RAG] Query expanded: '{query}' → '{expansion_result.expanded}'")
+                    logger.info(f"[RAG] Expansion reason: {expansion_result.expansion_reason}")
                     query = expansion_result.expanded
                 else:
+                    logger.info(f"[RAG] Query not expanded: {expansion_result.expansion_reason}")
         except Exception as e:
+            logger.info(f"[RAG] Query expansion failed: {e}, using original query")
         # Extract entities for better filtering
         try:
             from src.infrastructure.adapters.entity_extractor import entity_extractor
             if entity_extractor:
                 entities = entity_extractor.extract(query)
+                logger.info(f"[RAG] Extracted entities:")
                 print(f"  - Locations: {entities.locations}")
                 print(f"  - Organizations: {entities.organizations}")
                 print(f"  - Temporal keywords: {entities.temporal_keywords}")
                     auto_source = entity_extractor.get_source_filter(entities)
                     if auto_source:
                         source_filter = auto_source
+                        logger.info(f"[RAG] Auto-detected source filter: {source_filter}")
         except Exception as e:
+            logger.info(f"[RAG] Entity extraction failed: {e}")
         # ── Step 1: Single LLM call — intent extraction + multilingual translation ──
         expanded_query = query
         use_hybrid = self.orchestrator is not None and self.hybrid_ranker is not None
         if use_hybrid:
+            logger.info(f"[RAG] Hybrid search enabled - checking intent and strategy")
             # Classify intent using v2 (production-grade) or v1 (fallback)
             # Check Redis cache first to avoid 8-11s DeBERTa inference on repeat queries
             if self.cache:
                 cached_intent = self.cache.get(intent_cache_key)
                 if cached_intent:
+                    logger.info(f"[RAG] Intent cache HIT — skipping DeBERTa inference")
                     # Reconstruct a minimal intent result from cache
                     class _CachedIntent:
                         def __init__(self, d):
                             self.inference_time_ms = 0.0
                     intent_result = _CachedIntent(cached_intent)
                     intent = "NEWS" if intent_result.intent != "OTHER" else "OTHER"
+                    logger.info(f"[RAG] Intent (cached): {intent_result.intent} (confidence={intent_result.confidence:.2f})")
             if intent_result is None:
                 if self.use_v2_classifier and self.intent_classifier_v2:
                     intent_result = self.intent_classifier_v2.classify(query)
                     intent = "NEWS" if intent_result.intent != "OTHER" else "OTHER"
+                    logger.info(
+                        f"[RAG] Intent v2: {intent_result.intent} "
+                        f"conf={intent_result.confidence:.2f} "
+                        f"method={intent_result.method} "
+                        f"time={intent_result.inference_time_ms:.1f}ms"
+                    )
                     # Cache intent result for 1 hour (same query = same intent)
                     if self.cache:
                 else:
                     intent = self.intent_classifier.classify(query)
                     intent_result = None
+                    logger.info(f"[RAG] Intent classification v1: {intent}")
             # Decide search strategy (pass full intent_result for v2)
             strategy = self.orchestrator.decide_search_strategy(query, intent, intent_result)
+            logger.info(f"[RAG] Search strategy: {strategy}")
             # If intent is OTHER (small talk), skip search entirely
             if intent == "OTHER":
+                logger.info(f"[RAG] Small talk detected - skipping search")
                 return "", []
         else:
+            logger.info(f"[RAG] Hybrid search disabled - using traditional pipeline")
             use_hybrid = False
             strategy = None
         if actual_language_filter:
             # Explicit language override — single-language mode, no translation needed
+            logger.info(f"[RAG] Language filter '{actual_language_filter}' — single-language mode")
             lang_sparse_queries: Dict[str, str] = {actual_language_filter: expanded_query}
         else:
+            logger.info(f"[RAG] Running combined intent extraction + translation...")
             combined = self._extract_intents_and_translate(query)
             if combined.get("days_back") and isinstance(combined["days_back"], int):
         # ── HYBRID SEARCH EXECUTION ────────────────────────────────────────────
         if use_hybrid and strategy and (strategy.use_live or strategy.use_db):
+            logger.info(f"[RAG] Executing hybrid search...")
             # Execute hybrid search (parallel live + DB)
             try:
                     top_k=per_lang_limit
                 )
+                logger.info(f"[RAG] Hybrid search returned {len(db_results)} DB + {len(live_results)} live results")
                 # Merge and rank results
                 all_docs = self.hybrid_ranker.merge_and_rank(
                     final_top_n=top_k * 3  # Get more candidates for quality filtering
                 )
+                logger.info(f"[RAG] After hybrid ranking: {len(all_docs)} results")
             except Exception as e:
+                logger.info(f"[RAG] Hybrid search failed: {e} - falling back to traditional pipeline")
                 use_hybrid = False
                 all_docs = []
         # ── TRADITIONAL PIPELINE (fallback or when hybrid disabled) ────────────
         if not use_hybrid or not all_docs:
+            logger.info(f"[RAG] Using traditional multilingual pipeline")
             # ── Step 2: Compute dense vector ONCE from English query ──────────────
             # BGE-M3 dense space is language-agnostic — one English dense vector
             # semantically matches content in all 6 languages.
+            logger.info(f"[RAG] Computing dense vector for: '{expanded_query}'")
             english_vectors = self.embedder.encode_query(expanded_query)
             dense_vec: List[float] = english_vectors["dense"]
+            logger.info(f"[RAG] Dense vector ready ({len(dense_vec)} dims)")
             # ── Step 3: Batch sparse encoding — ONE forward pass for all languages ─
             # BGE-M3 holds the GIL during inference. ThreadPoolExecutor gives zero
             lang_codes = list(lang_sparse_queries.keys())
             lang_texts  = [lang_sparse_queries[lc] for lc in lang_codes]
+            logger.info(f"[RAG] Batch sparse encoding {len(lang_texts)} language queries...")
             sparse_results = self.embedder.encode_sparse_batch(lang_texts)
             lang_sparse_vecs: Dict[str, Optional[Dict]] = {}
                 sparse = result.get("sparse")
                 lang_sparse_vecs[lc] = sparse
                 token_count = len(sparse["indices"]) if sparse else 0
+                logger.info(f"[RAG] [{lc}] sparse ready — {token_count} tokens")
             # ── Step 4: Fan out to Qdrant — 6 parallel searches ──────────────────
             # Each lane: shared dense_vec + language-specific sparse_vec + language filter
                                 seen_doc_ids.add(doc_id)
                             all_docs.append(doc)
                     except Exception as e:
+                        logger.info(f"[RAG] [{lc}] future failed: {e}")
+            logger.info(f"[RAG] Total pooled candidates after dedup: {len(all_docs)}")
             # ── Step 5: Temporal fallback — retry without date filter if zero results ──
             self._temporal_fallback_used = False
             if len(all_docs) == 0 and days_back is not None:
+                logger.info(f"[RAG] No results with days_back={days_back} — retrying without temporal filter")
                 with concurrent.futures.ThreadPoolExecutor(max_workers=len(lang_sparse_queries)) as executor:
                     futures = {
                         executor.submit(_run_search, lc, None): lc
                                     seen_doc_ids.add(doc_id)
                                 all_docs.append(doc)
                         except Exception as e:
+                            logger.info(f"[RAG] [{lc}] fallback future failed: {e}")
                 self._temporal_fallback_used = True
+                logger.info(f"[RAG] Fallback returned {len(all_docs)} total candidates")
             # ── Step 6: Multilingual reranking ────────────────────────────────────
             # bge-reranker-v2-m3 scores (English query, any-language content) natively
         above_threshold = [d for d in quality_docs if d.get("rerank_score", 1.0) >= RERANK_THRESHOLD]
         if above_threshold:
             quality_docs = above_threshold
+            logger.info(f"[RAG] {len(quality_docs)} docs above rerank threshold {RERANK_THRESHOLD}")
         else:
             # All scores low — keep top 3 anyway rather than returning nothing
             quality_docs = quality_docs[:3]
+            logger.info(f"[RAG] All docs below threshold — keeping top 3 by rerank score")
         # ── Keyword overlap filter — soft filter, keeps docs with ANY query term ─
         # Only drops docs with ZERO overlap AND low rerank score.
             overlapping = [d for d in quality_docs if _has_overlap(d)]
             if overlapping:
                 quality_docs = overlapping
+                logger.info(f"[RAG] {len(quality_docs)} docs after keyword overlap filter")
             else:
                 # No overlap at all — keep top 5 by score rather than dropping everything
                 quality_docs = sorted(
                     key=lambda d: d.get("rerank_score") or d.get("score", 0),
                     reverse=True
                 )[:5]
+                logger.info(f"[RAG] No keyword overlap — keeping top 5 by score ({len(quality_docs)} docs)")
         # Guarantee at least 1 non-English result if available
         non_english = [d for d in quality_docs if d.get("metadata", {}).get("_search_lang", "en") != "en"]
             deduped_final.append(d)
         langs_in_result = list({d.get("metadata", {}).get("_search_lang", "en") for d in deduped_final})
+        logger.info(f"[RAG] Final {len(deduped_final)} docs — languages: {langs_in_result}")
         # ── Step 8: Token limitation ──────────────────────────────────────────
         return self._limit_context(query, deduped_final)
         }
     async def execute_chat(self, request: ChatRequest) -> Dict[str, Any]:
+        logger.info(f"[RAG] execute_chat called with query: {request.query}")
         # Generate a unique session ID if none provided — never use a shared fallback
         if not request.session_id:
             import uuid
             request.session_id = str(uuid.uuid4())
+            logger.info(f"[RAG] Generated new session_id: {request.session_id}")
         session_id = request.session_id
         # ── Layer 1: Full Response Cache (5 min TTL) ──────────────────────────
                 result,
                 expiration=settings.CACHE_RESPONSE_TTL
             )
+            logger.info(f"[RAG] Cached full response (TTL={settings.CACHE_RESPONSE_TTL}s)")
         return result
         if not request.session_id:
             import uuid
             request.session_id = str(uuid.uuid4())
+            logger.info(f"[RAG] Generated new session_id: {request.session_id}")
         session_id = request.session_id
         history_text = "" if is_guest else self._get_history_text(session_id)
         context_text, final_sources = await self._build_context(