""" Query rewriting module for improved retrieval. Transforms user queries before retrieval to improve recall by: - Expanding queries with synonyms and related terms - Reformulating queries to match document terminology - Generating multiple query variants for broader coverage - Decomposing complex queries into sub-queries """ from dataclasses import dataclass from typing import List, Optional, Literal import re # Try to import LLM provider for advanced rewriting try: from src.llm_providers import call_llm LLM_AVAILABLE = True except ImportError: LLM_AVAILABLE = False @dataclass class QueryRewriteResult: """Result of query rewriting operation.""" original_query: str rewritten_queries: List[str] strategy_used: str # Common synonym mappings for rule-based expansion SYNONYMS = { "fix": ["resolve", "troubleshoot", "repair", "solve"], "error": ["issue", "problem", "failure", "bug"], "login": ["sign-in", "authentication", "log in"], "cost": ["price", "pricing", "fee", "rate"], "fast": ["quick", "performance", "speed", "efficient"], "slow": ["performance", "latency", "delay"], "setup": ["install", "configure", "initialization"], "delete": ["remove", "uninstall", "clear"], "create": ["add", "new", "generate", "make"], "update": ["modify", "change", "edit", "upgrade"], "get": ["retrieve", "fetch", "obtain", "access"], "show": ["display", "view", "list"], } # Prompt for LLM-based query rewriting MULTI_QUERY_PROMPT = """You are a query rewriting assistant for a document search system. Given a user query, generate {num_variants} alternative search queries that would help find relevant documents. Rules: - Each variant should use different terminology while preserving the intent - Include both formal/technical and casual phrasings - If the query contains multiple questions, create separate queries for each - Output ONLY the queries, one per line, no numbering or explanations User query: {query} Alternative queries:""" DECOMPOSE_PROMPT = """You are a query analysis assistant. Given a complex user query, break it down into simple, atomic sub-queries that can be searched independently. Rules: - Each sub-query should focus on one specific piece of information - Preserve the key terms from the original query - Output ONLY the sub-queries, one per line, no numbering or explanations - Generate between 2-4 sub-queries User query: {query} Sub-queries:""" def _expand_with_synonyms(query: str) -> List[str]: """ Expand query with synonyms using rule-based matching. Args: query: Original user query Returns: List containing original query plus expanded version """ words = query.lower().split() expansions = [] for word in words: clean_word = re.sub(r'[^\w]', '', word) if clean_word in SYNONYMS: expansions.extend(SYNONYMS[clean_word]) if expansions: expanded = f"{query} {' '.join(expansions)}" return [query, expanded] return [query] def _is_complex_query(query: str) -> bool: """ Determine if a query is complex enough to warrant decomposition. Complex queries typically: - Contain multiple questions (and, also, as well as) - Have comparison words (vs, compare, difference, between) - Are longer than 15 words """ query_lower = query.lower() # Check for conjunctions suggesting multiple intents multi_intent_markers = [" and ", " also ", " as well as ", " plus "] if any(marker in query_lower for marker in multi_intent_markers): return True # Check for comparison queries comparison_markers = [" vs ", " versus ", "compare", "difference", "between"] if any(marker in query_lower for marker in comparison_markers): return True # Long queries are often complex if len(query.split()) > 15: return True return False def _rewrite_with_llm( query: str, num_variants: int = 3, strategy: Literal["multi", "decompose"] = "multi" ) -> List[str]: """ Use LLM to generate query variants. Args: query: Original user query num_variants: Number of variants to generate strategy: "multi" for multi-query, "decompose" for decomposition Returns: List of rewritten queries """ if not LLM_AVAILABLE: return [query] if strategy == "decompose": prompt = DECOMPOSE_PROMPT.format(query=query) else: prompt = MULTI_QUERY_PROMPT.format(query=query, num_variants=num_variants) try: response = call_llm(prompt=prompt, temperature=0.3, max_tokens=256) text = response.get("text", "") # Parse response into individual queries lines = [line.strip() for line in text.strip().split("\n") if line.strip()] # Filter out empty lines and numbering artifacts queries = [] for line in lines: # Remove common numbering patterns cleaned = re.sub(r'^[\d\-\.\)\*]+\s*', '', line).strip() if cleaned and len(cleaned) > 3: queries.append(cleaned) # Always include original query if query not in queries: queries.insert(0, query) return queries[:num_variants + 1] except Exception: # Fallback to original query on any error return [query] def rewrite_query( query: str, num_variants: int = 3, strategy: Optional[Literal["expand", "multi", "decompose", "auto"]] = "auto", use_llm: bool = True ) -> QueryRewriteResult: """ Rewrite a user query to improve retrieval recall. Args: query: Original user query num_variants: Number of query variants to generate strategy: Rewriting strategy - "expand": Rule-based synonym expansion (fast, no LLM) - "multi": LLM generates multiple query variants - "decompose": LLM breaks complex query into sub-queries - "auto": Automatically choose based on query complexity use_llm: Whether to allow LLM-based rewriting Returns: QueryRewriteResult with original query, rewritten queries, and strategy used """ query = query.strip() if not query: return QueryRewriteResult( original_query=query, rewritten_queries=[query], strategy_used="none" ) # Auto-select strategy based on query characteristics if strategy == "auto": if _is_complex_query(query) and use_llm and LLM_AVAILABLE: strategy = "decompose" elif use_llm and LLM_AVAILABLE: strategy = "multi" else: strategy = "expand" # Execute the selected strategy if strategy == "expand": rewritten = _expand_with_synonyms(query) elif strategy == "multi" and use_llm and LLM_AVAILABLE: rewritten = _rewrite_with_llm(query, num_variants, strategy="multi") elif strategy == "decompose" and use_llm and LLM_AVAILABLE: rewritten = _rewrite_with_llm(query, num_variants, strategy="decompose") else: # Fallback to expansion if LLM not available rewritten = _expand_with_synonyms(query) strategy = "expand" return QueryRewriteResult( original_query=query, rewritten_queries=rewritten, strategy_used=strategy )