Spaces:
Sleeping
Sleeping
| """ | |
| Query Expansion System for CogniChat RAG Application | |
| This module implements advanced query expansion techniques to improve retrieval quality: | |
| - QueryAnalyzer: Extracts intent, entities, and keywords | |
| - QueryRephraser: Generates natural language variations | |
| - MultiQueryExpander: Creates diverse query formulations | |
| - MultiHopReasoner: Connects concepts across documents | |
| - FallbackStrategies: Handles edge cases gracefully | |
| Author: CogniChat Team | |
| Date: October 19, 2025 | |
| """ | |
| import re | |
| from typing import List, Dict, Any, Optional | |
| from dataclasses import dataclass | |
| from enum import Enum | |
class QueryStrategy(Enum):
    """Available expansion strategies, ordered by cost and coverage."""

    QUICK = "quick"                    # 2 queries - fast, minimal expansion
    BALANCED = "balanced"              # 3-4 queries - good balance
    COMPREHENSIVE = "comprehensive"    # 5-6 queries - maximum coverage
@dataclass
class QueryAnalysis:
    """Results from query analysis.

    Missing `@dataclass` restored: the analyzer constructs this with keyword
    arguments (QueryAnalysis(intent=..., ...)), which requires the generated
    __init__.
    """
    intent: str                   # question, definition, comparison, explanation, etc.
    entities: List[str]           # Named entities extracted
    keywords: List[str]           # Important keywords
    complexity: str               # simple, medium, complex
    domain: Optional[str] = None  # Technical domain if detected
@dataclass
class ExpandedQuery:
    """Container for expanded query variations.

    Missing `@dataclass` restored: MultiQueryExpander.expand builds this with
    keyword arguments, which requires the generated __init__. Project-type
    annotations are strings so the class does not depend on definition order.
    """
    original: str                    # the user's untouched query
    variations: List[str]            # generated formulations (original included)
    strategy_used: "QueryStrategy"   # strategy that produced the variations
    analysis: "QueryAnalysis"        # analysis metadata for the original query
| class QueryAnalyzer: | |
| """ | |
| Analyzes queries to extract intent, entities, and key information. | |
| Uses LLM-based analysis for intelligent query understanding. | |
| """ | |
| def __init__(self, llm=None): | |
| """ | |
| Initialize QueryAnalyzer. | |
| Args: | |
| llm: Optional LangChain LLM for advanced analysis | |
| """ | |
| self.llm = llm | |
| self.intent_patterns = { | |
| 'definition': r'\b(what is|define|meaning of|definition)\b', | |
| 'how_to': r'\b(how to|how do|how can|steps to)\b', | |
| 'comparison': r'\b(compare|difference|versus|vs|better than)\b', | |
| 'explanation': r'\b(why|explain|reason|cause)\b', | |
| 'listing': r'\b(list|enumerate|what are|types of)\b', | |
| 'example': r'\b(example|instance|sample|case)\b', | |
| } | |
| def analyze(self, query: str) -> QueryAnalysis: | |
| """ | |
| Analyze query to extract intent, entities, and keywords. | |
| Args: | |
| query: User's original query | |
| Returns: | |
| QueryAnalysis object with extracted information | |
| """ | |
| query_lower = query.lower() | |
| # Detect intent | |
| intent = self._detect_intent(query_lower) | |
| # Extract entities (simplified - can be enhanced with NER) | |
| entities = self._extract_entities(query) | |
| # Extract keywords | |
| keywords = self._extract_keywords(query) | |
| # Assess complexity | |
| complexity = self._assess_complexity(query, entities, keywords) | |
| # Detect domain | |
| domain = self._detect_domain(query_lower) | |
| return QueryAnalysis( | |
| intent=intent, | |
| entities=entities, | |
| keywords=keywords, | |
| complexity=complexity, | |
| domain=domain | |
| ) | |
| def _detect_intent(self, query_lower: str) -> str: | |
| """Detect query intent using pattern matching.""" | |
| for intent, pattern in self.intent_patterns.items(): | |
| if re.search(pattern, query_lower): | |
| return intent | |
| return 'general' | |
| def _extract_entities(self, query: str) -> List[str]: | |
| """Extract named entities (simplified version).""" | |
| # Look for capitalized words (potential entities) | |
| words = query.split() | |
| entities = [] | |
| for word in words: | |
| # Skip common words at sentence start | |
| if word[0].isupper() and word.lower() not in ['what', 'how', 'why', 'when', 'where', 'which']: | |
| entities.append(word) | |
| # Look for quoted terms | |
| quoted = re.findall(r'"([^"]+)"', query) | |
| entities.extend(quoted) | |
| return list(set(entities)) | |
| def _extract_keywords(self, query: str) -> List[str]: | |
| """Extract important keywords from query.""" | |
| # Remove stop words (simplified list) | |
| stop_words = { | |
| 'a', 'an', 'the', 'is', 'are', 'was', 'were', 'be', 'been', | |
| 'what', 'how', 'why', 'when', 'where', 'which', 'who', | |
| 'do', 'does', 'did', 'can', 'could', 'should', 'would', | |
| 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by' | |
| } | |
| # Split and filter | |
| words = re.findall(r'\b\w+\b', query.lower()) | |
| keywords = [w for w in words if w not in stop_words and len(w) > 2] | |
| return keywords[:10] # Limit to top 10 | |
| def _assess_complexity(self, query: str, entities: List[str], keywords: List[str]) -> str: | |
| """Assess query complexity.""" | |
| word_count = len(query.split()) | |
| entity_count = len(entities) | |
| keyword_count = len(keywords) | |
| # Simple scoring | |
| score = word_count + (entity_count * 2) + (keyword_count * 1.5) | |
| if score < 15: | |
| return 'simple' | |
| elif score < 30: | |
| return 'medium' | |
| else: | |
| return 'complex' | |
| def _detect_domain(self, query_lower: str) -> Optional[str]: | |
| """Detect technical domain if present.""" | |
| domains = { | |
| 'programming': ['code', 'function', 'class', 'variable', 'algorithm', 'debug'], | |
| 'data_science': ['model', 'dataset', 'training', 'prediction', 'accuracy'], | |
| 'machine_learning': ['neural', 'network', 'learning', 'ai', 'deep learning'], | |
| 'web': ['html', 'css', 'javascript', 'api', 'frontend', 'backend'], | |
| 'database': ['sql', 'query', 'database', 'table', 'index'], | |
| 'security': ['encryption', 'authentication', 'vulnerability', 'attack'], | |
| } | |
| for domain, keywords in domains.items(): | |
| if any(kw in query_lower for kw in keywords): | |
| return domain | |
| return None | |
class QueryRephraser:
    """
    Produces natural-language reformulations of a query through several
    lightweight templating strategies.
    """

    def __init__(self, llm=None):
        """
        Initialize QueryRephraser.

        Args:
            llm: LangChain LLM for generating variations
        """
        self.llm = llm

    def generate_variations(
        self,
        query: str,
        analysis: QueryAnalysis,
        strategy: QueryStrategy = QueryStrategy.BALANCED
    ) -> List[str]:
        """
        Generate query variations based on strategy.

        Args:
            query: Original query
            analysis: Query analysis results
            strategy: Expansion strategy to use

        Returns:
            List of unique, non-empty query variations (original first)
        """
        # Pick the rewriters to run; list order fixes the output order.
        rewriters = [self._synonym_variation]
        if strategy in (QueryStrategy.BALANCED, QueryStrategy.COMPREHENSIVE):
            rewriters += [self._expanded_variation, self._simplified_variation]
        if strategy is QueryStrategy.COMPREHENSIVE:
            rewriters += [self._keyword_focused, self._context_variation]

        candidates = [query]  # always lead with the original
        candidates.extend(rewrite(query, analysis) for rewrite in rewriters)

        if strategy is QueryStrategy.COMPREHENSIVE and analysis.intent in ('how_to', 'explanation'):
            # One extra phrasing for procedural/explanatory questions.
            candidates.append(f"Guide to {' '.join(analysis.keywords[:3])}")

        # Drop falsy entries, then de-duplicate while keeping first-seen order.
        return list(dict.fromkeys(c for c in candidates if c))

    def _synonym_variation(self, query: str, analysis: QueryAnalysis) -> str:
        """Swap the first recognized word for a synonym, then recapitalize."""
        replacements = {
            'error': 'issue',
            'problem': 'issue',
            'fix': 'resolve',
            'use': 'utilize',
            'create': 'generate',
            'make': 'create',
            'get': 'retrieve',
            'show': 'display',
            'find': 'locate',
            'explain': 'describe',
        }
        tokens = query.lower().split()
        for idx, token in enumerate(tokens):
            substitute = replacements.get(token)
            if substitute is not None:
                tokens[idx] = substitute
                break  # one substitution only, to keep phrasing natural
        return ' '.join(tokens).capitalize()

    def _expanded_variation(self, query: str, analysis: QueryAnalysis) -> str:
        """Produce a longer, more detailed phrasing keyed on intent."""
        intent = analysis.intent
        if intent == 'definition':
            return f"Detailed explanation and definition of {' '.join(analysis.keywords)}"
        if intent == 'how_to':
            return f"Step-by-step guide on {query.lower()}"
        if intent == 'comparison':
            return f"Comprehensive comparison: {query}"
        # Fallback: add qualifying words.
        return f"Detailed information about {query.lower()}"

    def _simplified_variation(self, query: str, analysis: QueryAnalysis) -> str:
        """Reduce the query to its top keywords when enough are available."""
        return ' '.join(analysis.keywords[:3]) if len(analysis.keywords) >= 2 else query

    def _keyword_focused(self, query: str, analysis: QueryAnalysis) -> str:
        """Build a bare keyword/entity string, useful for BM25-style search."""
        combined = [*analysis.keywords, *analysis.entities]
        return ' '.join(combined[:5])

    def _context_variation(self, query: str, analysis: QueryAnalysis) -> str:
        """Append the detected domain as context, when one exists."""
        return f"{query} in {analysis.domain} context" if analysis.domain else query
class MultiQueryExpander:
    """
    Orchestrates query expansion: analysis first, then rephrasing.
    """

    def __init__(self, llm=None):
        """
        Initialize MultiQueryExpander.

        Args:
            llm: LangChain LLM for advanced expansions
        """
        self.analyzer = QueryAnalyzer(llm)
        self.rephraser = QueryRephraser(llm)

    def expand(
        self,
        query: str,
        strategy: QueryStrategy = QueryStrategy.BALANCED,
        max_queries: int = 6
    ) -> ExpandedQuery:
        """
        Expand query into multiple variations.

        Args:
            query: Original user query
            strategy: Expansion strategy
            max_queries: Maximum number of queries to generate

        Returns:
            ExpandedQuery object with all variations
        """
        analysis = self.analyzer.analyze(query)
        all_variations = self.rephraser.generate_variations(query, analysis, strategy)
        # Cap the variation count before handing results back.
        return ExpandedQuery(
            original=query,
            variations=all_variations[:max_queries],
            strategy_used=strategy,
            analysis=analysis
        )
class MultiHopReasoner:
    """
    Breaks complex queries into sub-queries so information can be gathered
    across multiple documents (multi-hop reasoning).
    """

    def __init__(self, llm=None):
        """
        Initialize MultiHopReasoner.

        Args:
            llm: LangChain LLM for reasoning
        """
        self.llm = llm

    def generate_sub_queries(self, query: str, analysis: QueryAnalysis) -> List[str]:
        """
        Break complex query into sub-queries for multi-hop reasoning.

        Args:
            query: Original complex query
            analysis: Query analysis

        Returns:
            List of sub-queries (capped at 5, original first)
        """
        results = [query]

        # Comparisons: one lookup per compared thing; entities preferred,
        # keywords as fallback when no entities were detected.
        if analysis.intent == 'comparison':
            if len(analysis.entities) >= 2:
                results.extend(f"Information about {e}" for e in analysis.entities[:2])
            elif len(analysis.keywords) >= 2:
                results.extend(f"Information about {k}" for k in analysis.keywords[:2])

        # How-to questions: prerequisites, then the steps themselves.
        if analysis.intent == 'how_to' and len(analysis.keywords) >= 2:
            topic = ' '.join(analysis.keywords[:2])
            results.append(f"Prerequisites for {topic}")
            results.append(f"Steps to {topic}")

        # Complex questions: split the keyword set into two focused halves.
        if analysis.complexity == 'complex' and len(analysis.keywords) > 3:
            half = len(analysis.keywords) // 2
            results.append(' '.join(analysis.keywords[:half]))
            results.append(' '.join(analysis.keywords[half:]))

        return results[:5]  # Limit to 5 sub-queries
class FallbackStrategies:
    """
    Implements fallback strategies for queries that don't retrieve good results.

    All strategies are stateless, so they are exposed as static methods
    (fix: the originals were defined without `self` and without
    `@staticmethod`, which broke any call made on an instance). They may be
    called on the class or on an instance.
    """

    @staticmethod
    def simplify_query(query: str) -> str:
        """Simplify query by removing modifiers and focusing on core terms."""
        # Remove question words.
        query = re.sub(r'\b(what|how|why|when|where|which|who|can|could|should|would)\b', '', query, flags=re.IGNORECASE)
        # Remove common filler words.
        query = re.sub(r'\b(is|are|was|were|be|been|the|a|an)\b', '', query, flags=re.IGNORECASE)
        # Collapse whitespace left behind by the removals.
        query = re.sub(r'\s+', ' ', query).strip()
        return query

    @staticmethod
    def broaden_query(query: str, analysis: "QueryAnalysis") -> str:
        """Broaden query to increase recall."""
        # Remove narrowing modifiers.
        query = re.sub(r'\b(specific|exactly|precisely|only|just)\b', '', query, flags=re.IGNORECASE)
        # Prefer a generic "<keyword> overview" when keywords are available.
        if analysis.keywords:
            return f"{analysis.keywords[0]} overview"
        return query

    @staticmethod
    def focus_entities(analysis: "QueryAnalysis") -> str:
        """Create entity-focused query as fallback; empty string if nothing usable."""
        if analysis.entities:
            return ' '.join(analysis.entities)
        elif analysis.keywords:
            return ' '.join(analysis.keywords[:3])
        return ""
| # Convenience function for easy integration | |
def expand_query_simple(
    query: str,
    strategy: str = "balanced",
    llm=None
) -> List[str]:
    """
    Expand a query into variations without dealing with the class API.

    Args:
        query: User's query to expand
        strategy: "quick", "balanced", or "comprehensive"
        llm: Optional LangChain LLM

    Returns:
        List of expanded query variations

    Example:
        >>> queries = expand_query_simple("How do I debug Python code?", strategy="balanced")
        >>> print(queries)
        ['How do I debug Python code?', 'How do I resolve Python code?', ...]
    """
    chosen = QueryStrategy(strategy)  # raises ValueError on an unknown name
    return MultiQueryExpander(llm=llm).expand(query, strategy=chosen).variations
| # Example usage and testing | |
| if __name__ == "__main__": | |
| # Example 1: Simple query expansion | |
| print("=" * 60) | |
| print("Example 1: Simple Query Expansion") | |
| print("=" * 60) | |
| query = "What is machine learning?" | |
| queries = expand_query_simple(query, strategy="balanced") | |
| print(f"\nOriginal: {query}") | |
| print(f"\nExpanded queries ({len(queries)}):") | |
| for i, q in enumerate(queries, 1): | |
| print(f" {i}. {q}") | |
| # Example 2: Complex query with full analysis | |
| print("\n" + "=" * 60) | |
| print("Example 2: Complex Query with Analysis") | |
| print("=" * 60) | |
| expander = MultiQueryExpander() | |
| query = "How do I compare the performance of different neural network architectures?" | |
| result = expander.expand(query, strategy=QueryStrategy.COMPREHENSIVE) | |
| print(f"\nOriginal: {result.original}") | |
| print(f"\nAnalysis:") | |
| print(f" Intent: {result.analysis.intent}") | |
| print(f" Entities: {result.analysis.entities}") | |
| print(f" Keywords: {result.analysis.keywords}") | |
| print(f" Complexity: {result.analysis.complexity}") | |
| print(f" Domain: {result.analysis.domain}") | |
| print(f"\nExpanded queries ({len(result.variations)}):") | |
| for i, q in enumerate(result.variations, 1): | |
| print(f" {i}. {q}") | |
| # Example 3: Multi-hop reasoning | |
| print("\n" + "=" * 60) | |
| print("Example 3: Multi-Hop Reasoning") | |
| print("=" * 60) | |
| reasoner = MultiHopReasoner() | |
| analyzer = QueryAnalyzer() | |
| query = "Compare Python and Java for web development" | |
| analysis = analyzer.analyze(query) | |
| sub_queries = reasoner.generate_sub_queries(query, analysis) | |
| print(f"\nOriginal: {query}") | |
| print(f"\nSub-queries for multi-hop reasoning:") | |
| for i, sq in enumerate(sub_queries, 1): | |
| print(f" {i}. {sq}") | |
| # Example 4: Fallback strategies | |
| print("\n" + "=" * 60) | |
| print("Example 4: Fallback Strategies") | |
| print("=" * 60) | |
| query = "What is the specific difference between supervised and unsupervised learning?" | |
| analysis = analyzer.analyze(query) | |