"""Query handler that routes different types of queries appropriately."""

import re
from typing import Any, Dict, List, Tuple


class QueryType:
    """String constants naming the query categories returned by classify_query."""

    GREETING = "greeting"
    SEMANTIC_ANALYSIS = "semantic_analysis"
    KEYWORD_SEARCH = "keyword_search"
    DOCUMENT_QUESTION = "document_question"


# Upper bound (and default) for the number of results a query may request.
_MAX_RESULTS = 100

# Tried in order; the first pattern that matches decides the count.
_COUNT_PATTERNS = (
    re.compile(r'(\d+)\s*(?:documents?|results?|matches?|items?)'),  # "5 documents"
    re.compile(r'(?:top|first|show|find|give me)\s*(\d+)'),  # "top 5"
    re.compile(r'(\d+)'),  # standalone number
)

# Quantity words are matched on word boundaries so that "find me all" is not
# read as "find me a", "someone" as "some", or "call" as "all".
_SINGULAR_RE = re.compile(
    r'\b(?:a document|one document|single document|the document|find me a)\b'
)
_ALL_RE = re.compile(r'\b(?:all|every|everything)\b')
_FEW_RE = re.compile(r'\bfew\b')
_SOME_RE = re.compile(r'\b(?:several|some)\b')

# Only these exact phrases (ignoring case and trailing punctuation) are greetings.
_SIMPLE_GREETINGS = frozenset({
    "hello",
    "hi",
    "hey",
    "greetings",
    "good morning",
    "good afternoon",
    "good evening",
})

# Meta phrases removed from a keyword query before the search term is extracted.
_KEYWORD_META_PHRASES = (
    "keyword search",
    "search for keyword",
    "exact match",
    "find the phrase",
    "search for phrase",
    "contains the word",
)

# Phrases that introduce the text a user wants analysed semantically.
_ANALYSIS_PREFIXES = (
    "semantic analysis of",
    "analyze semantically",
    "semantic meaning of",
)


def detect_result_count(query: str) -> int:
    """Detect how many results the user wants based on their query.

    Explicit numbers win ("find 5 documents", "top 3", or any bare number),
    clamped to the range 1..100. Otherwise quantity words are checked in this
    order: singular phrases -> 1, "all"/"every"/"everything" -> 100,
    "few" -> 3, "several"/"some" -> 10.

    Args:
        query: The raw user query.

    Returns:
        The number of documents to return (default: 100).
    """
    query_lower = query.lower()

    for pattern in _COUNT_PATTERNS:
        match = pattern.search(query_lower)
        if match is None:
            continue
        try:
            count = int(match.group(1))
        except ValueError:
            # int() can reject pathological digit runs (e.g. beyond the
            # interpreter's integer string conversion limit); try the next
            # pattern instead of failing the whole request.
            continue
        # Never return fewer than one result or more than the cap.
        return max(1, min(count, _MAX_RESULTS))

    if _SINGULAR_RE.search(query_lower):
        return 1
    if _ALL_RE.search(query_lower):
        return _MAX_RESULTS
    if _FEW_RE.search(query_lower):
        return 3
    if _SOME_RE.search(query_lower):
        return 10

    # No quantity hint at all: fall back to the maximum.
    return _MAX_RESULTS


def classify_query(query: str) -> str:
    """Classify the type of query based on keywords and patterns.

    Args:
        query: The raw user query.

    Returns:
        One of the QueryType constants. A query is a greeting only when the
        whole query is a simple greeting (case and trailing punctuation are
        ignored); "keyword" anywhere selects keyword search and takes
        precedence over "semantic"; everything else is a document question.
    """
    query_lower = query.lower().strip()

    # "Hello!" or "hi." should still count as a greeting, so trailing
    # punctuation is ignored for this check only.
    if query_lower.rstrip("!.,? ") in _SIMPLE_GREETINGS:
        return QueryType.GREETING

    if "keyword" in query_lower:
        return QueryType.KEYWORD_SEARCH

    if "semantic" in query_lower:
        return QueryType.SEMANTIC_ANALYSIS

    # Default to conversational document question (RAG mode).
    return QueryType.DOCUMENT_QUESTION


def keyword_search_documents(query: str, docs: List[str]) -> List[Tuple[str, float]]:
    """Run a simple case-insensitive exact-phrase search over documents.

    Meta phrases such as "keyword search" are removed from the query, then a
    leading "for " and a leading "the " are dropped; what remains is the
    search term.

    Args:
        query: The raw user query, including any meta phrases.
        docs: The documents to search.

    Returns:
        (document, score) pairs for documents containing the term, sorted by
        score descending (ties keep their input order). The score is
        0.8 * frequency (occurrences / 3, capped at 1.0) plus
        0.2 * position (1.0 when the first match is at the very start).
        Returns an empty list when no search term remains.
    """
    term = query.lower()
    for phrase in _KEYWORD_META_PHRASES:
        term = term.replace(phrase, "")
    term = term.strip()

    # Remove leading "for" and then "the" if present.
    if term.startswith("for "):
        term = term[4:]
    if term.startswith("the "):
        term = term[4:]
    term = term.strip()

    # str.count("") is positive for every string, so an empty term would
    # "match" all documents; treat it as "nothing to search for".
    if not term:
        return []

    results = []
    for doc in docs:
        doc_lower = doc.lower()
        occurrences = doc_lower.count(term)
        if occurrences == 0:
            continue

        # A non-empty term was found, so doc_lower is non-empty here.
        first_position = doc_lower.find(term)
        position_score = 1.0 - (first_position / len(doc_lower))
        frequency_score = min(occurrences / 3.0, 1.0)

        # Prioritize frequency over position for exact matches.
        score = (frequency_score * 0.8) + (position_score * 0.2)
        results.append((doc, score))

    results.sort(key=lambda item: item[1], reverse=True)
    return results


def generate_greeting_response() -> str:
    """Generate a friendly greeting response listing what the bot can do."""
    return (
        "Hello! I'm the KGB Lab Document Chatbot. I can help you:\n\n"
        "• **Search documents** - Ask me any question about the declassified KGB documents\n"
        "• **Keyword search** - Say 'keyword search for [term]' to find exact phrases\n"
        "• **Semantic analysis** - Ask for 'semantic analysis of [text]' to understand meaning\n\n"
        "How can I assist you today?"
    )


def generate_semantic_analysis(query: str, hits: List[Tuple[str, float]]) -> str:
    """Generate semantic analysis of the query and retrieved documents.

    Args:
        query: The raw user query. If it contains an introducing phrase such
            as "semantic analysis of", only the (lower-cased) text after that
            phrase is analysed.
        hits: (document, similarity score) pairs; only the first three are
            reported. A document may end with a "[Source: ...]" tag, which is
            split off and shown separately.

    Returns:
        A Markdown-formatted report, or a short "nothing found" message when
        hits is empty.
    """
    subject = query
    query_lower = query.lower()
    for phrase in _ANALYSIS_PREFIXES:
        if phrase in query_lower:
            subject = query_lower.split(phrase, 1)[1].strip()
            break

    header = "**Semantic Analysis:**\n\n"
    if not hits:
        return header + "No semantically similar documents found for this query."

    # De-duplicate terms while keeping first-seen order so the output is
    # deterministic (a set would reorder terms from run to run).
    concepts = ', '.join(dict.fromkeys(subject.split()))

    pieces = [
        header,
        f"**Query Terms:** {subject}\n\n",
        "**Semantic Interpretation:**\n",
        f"The query relates to concepts around: {concepts}\n\n",
        "**Most Semantically Similar Documents:**\n\n",
    ]

    for rank, (doc, score) in enumerate(hits[:3], 1):
        source = "unknown"
        body = doc
        if "[Source:" in doc:
            # Split on the last tag so earlier occurrences stay in the body.
            body_part, source_part = doc.rsplit("[Source:", 1)
            body = body_part.strip()
            source = source_part.strip("] ")

        ellipsis = '...' if len(body) > 200 else ''
        pieces.append(f"**{rank}.** Similarity: {score:.3f}\n")
        pieces.append(f" {body[:200]}{ellipsis}\n")
        pieces.append(f" *[Source: {source}]*\n\n")

    pieces.append("\n**Semantic Context:**\n")
    pieces.append(
        "These documents were selected based on semantic similarity "
        "(meaning-based) rather than exact keyword matching. "
    )
    pieces.append(
        "The scores represent how conceptually related each document is to your query."
    )
    return "".join(pieces)