Spaces:
Sleeping
Sleeping
| """ | |
| Query handler that routes different types of queries appropriately. | |
| """ | |
| from typing import List, Tuple, Dict, Any | |
| import re | |
class QueryType:
    """Namespace of plain-string labels identifying each kind of query."""

    # The whole message is nothing but a salutation.
    GREETING = "greeting"
    # The user asked for a meaning-based ("semantic") analysis.
    SEMANTIC_ANALYSIS = "semantic_analysis"
    # The user asked for a literal keyword/phrase lookup.
    KEYWORD_SEARCH = "keyword_search"
    # Anything else: an ordinary question about the documents.
    DOCUMENT_QUESTION = "document_question"
def detect_result_count(query: str) -> int:
    """
    Detect how many results the user wants based on their query.

    Checks are applied in this order and the first hit wins:

    1. An explicit number ("5 documents", "top 3", or any bare number),
       clamped to the range 1..100.
    2. Singular phrasing ("a document", "the document", ...) -> 1.
    3. "all" / "every" / "everything" -> 100.
    4. "few" -> 3.
    5. "several" / "some" -> 10.

    Quantity words and phrases are matched as whole words, so "someone",
    "fewer", "the documents" or "find me all" do not trigger the wrong rule.

    Returns the number of documents to return (default: 100).
    """
    query_lower = query.lower()

    # Most specific pattern first; the bare-number pattern is the fallback.
    number_patterns = [
        r'(\d+)\s*(?:documents?|results?|matches?|items?)',  # "5 documents"
        r'(?:top|first|show|find|give me)\s*(\d+)',  # "top 5"
        r'(\d+)',  # standalone number
    ]
    for pattern in number_patterns:
        match = re.search(pattern, query_lower)
        if match is None:
            continue
        try:
            count = int(match.group(1))
        except ValueError:
            # int() may refuse an absurdly long digit run; try the next rule.
            continue
        # Never ask for fewer than 1 or more than 100 results.
        return max(1, min(count, 100))

    def mentions(*phrases: str) -> bool:
        """Return True if any phrase occurs in the query as whole words."""
        return any(
            re.search(r'\b' + re.escape(phrase) + r'\b', query_lower)
            for phrase in phrases
        )

    # Singular indicators = 1 result
    if mentions("a document", "one document", "single document",
                "the document", "find me a"):
        return 1

    # "All" or "every" = 100 (max)
    if mentions("all", "every", "everything"):
        return 100

    # "Few" = 3
    if mentions("few"):
        return 3

    # "Several" or "some" = 10
    if mentions("several", "some"):
        return 10

    # Default: the maximum of 100
    return 100
def classify_query(query: str) -> str:
    """Return the ``QueryType`` label that matches *query*.

    The query is lower-cased and stripped first. A greeting is recognised
    only when it makes up the entire message; otherwise the presence of the
    substring "keyword" or "semantic" (checked in that order) selects the
    corresponding mode, and everything else is a document question.
    """
    normalized = query.lower().strip()

    # Greetings must be the whole message, not merely appear inside it.
    if normalized in (
        "hello", "hi", "hey", "greetings",
        "good morning", "good afternoon", "good evening",
    ):
        return QueryType.GREETING

    # Marker substrings in priority order: "keyword" wins over "semantic".
    marker_to_type = (
        ("keyword", QueryType.KEYWORD_SEARCH),
        ("semantic", QueryType.SEMANTIC_ANALYSIS),
    )
    for marker, query_type in marker_to_type:
        if marker in normalized:
            return query_type

    # Fallback: conversational document question (RAG mode).
    return QueryType.DOCUMENT_QUESTION
| def keyword_search_documents(query: str, docs: List[str]) -> List[Tuple[str, float]]: | |
| """ | |
| Simple keyword/phrase search in documents. | |
| Returns documents that contain the search terms with a relevance score. | |
| """ | |
| # Extract the actual search terms (remove meta phrases like "keyword search for") | |
| query_clean = query.lower() | |
| for phrase in ["keyword search", "search for keyword", "exact match", "find the phrase", | |
| "search for phrase", "contains the word"]: | |
| query_clean = query_clean.replace(phrase, "") | |
| query_clean = query_clean.strip() | |
| # Remove leading "for" or "the" if present | |
| if query_clean.startswith("for "): | |
| query_clean = query_clean[4:] | |
| if query_clean.startswith("the "): | |
| query_clean = query_clean[4:] | |
| query_clean = query_clean.strip() | |
| results = [] | |
| for doc in docs: | |
| doc_lower = doc.lower() | |
| # Count occurrences of search term (exact phrase match) | |
| count = doc_lower.count(query_clean) | |
| # Only include documents that actually contain the search term | |
| if count > 0: | |
| # Score heavily weighted by frequency (exact matches matter most) | |
| first_position = doc_lower.find(query_clean) | |
| # Normalize position score (0-1, earlier is better) | |
| position_score = 1.0 - (first_position / len(doc_lower)) if len(doc_lower) > 0 else 0 | |
| # Frequency score (more occurrences = higher score) | |
| frequency_score = min(count / 3.0, 1.0) # Cap at 1.0, scale faster | |
| # Combined score - prioritize frequency over position for exact matches | |
| score = (frequency_score * 0.8) + (position_score * 0.2) | |
| results.append((doc, score)) | |
| # Sort by score descending | |
| results.sort(key=lambda x: x[1], reverse=True) | |
| return results | |
def generate_greeting_response() -> str:
    """Build the chatbot's welcome message listing its three capabilities."""
    intro = "Hello! I'm the KGB Lab Document Chatbot. I can help you:"
    capabilities = [
        "• **Search documents** - Ask me any question about the declassified KGB documents",
        "• **Keyword search** - Say 'keyword search for [term]' to find exact phrases",
        "• **Semantic analysis** - Ask for 'semantic analysis of [text]' to understand meaning",
    ]
    closing = "How can I assist you today?"

    # Blank line between sections, single newline between bullet points.
    return "\n\n".join([intro, "\n".join(capabilities), closing])
def generate_semantic_analysis(query: str, hits: List[Tuple[str, float]]) -> str:
    """Generate semantic analysis of the query and retrieved documents.

    If the query contains one of the request phrases ("semantic analysis of",
    "analyze semantically", "semantic meaning of"), only the lower-cased text
    after the first such phrase is treated as the subject of the analysis.

    Args:
        query: The user's raw query.
        hits: ``(document, score)`` pairs; only the first three are shown.
            A trailing ``[Source: ...]`` marker in a document, if present,
            is split off and reported as its source.

    Returns:
        A Markdown-formatted report, or a short "nothing found" message when
        ``hits`` is empty.
    """
    # Extract what user wants analyzed
    query_lower = query.lower()
    for phrase in ("semantic analysis of", "analyze semantically", "semantic meaning of"):
        if phrase in query_lower:
            query = query_lower.split(phrase, 1)[1].strip()
            break

    header = "**Semantic Analysis:**\n\n"
    if not hits:
        return header + "No semantically similar documents found for this query."

    # De-duplicate words while keeping first-seen order, so the output is
    # the same on every run (iteration order of a set of strings is not).
    concepts = ", ".join(dict.fromkeys(query.split()))

    pieces = [
        header,
        f"**Query Terms:** {query}\n\n",
        "**Semantic Interpretation:**\n",
        f"The query relates to concepts around: {concepts}\n\n",
        "**Most Semantically Similar Documents:**\n\n",
    ]

    for i, (doc, score) in enumerate(hits[:3], 1):
        source = "unknown"
        body = doc
        if "[Source:" in doc:
            # Split on the last marker so earlier brackets stay in the body.
            text, _, tail = doc.rpartition("[Source:")
            body = text.strip()
            source = tail.strip("] ")

        ellipsis = "..." if len(body) > 200 else ""
        pieces.append(f"**{i}.** Similarity: {score:.3f}\n")
        pieces.append(f" {body[:200]}{ellipsis}\n")
        pieces.append(f" *[Source: {source}]*\n\n")

    pieces.append("\n**Semantic Context:**\n")
    pieces.append(
        "These documents were selected based on semantic similarity "
        "(meaning-based) rather than exact keyword matching. "
    )
    pieces.append(
        "The scores represent how conceptually related each document is to your query."
    )
    return "".join(pieces)