Spaces:
Sleeping
Sleeping
File size: 6,634 Bytes
35913c9 5bc045f 35913c9 fd06dc1 35913c9 5bc045f 35913c9 5bc045f 35913c9 5bc045f 35913c9 5bc045f 35913c9 5bc045f 35913c9 5bc045f 35913c9 5bc045f 35913c9 5bc045f 35913c9 5bc045f 35913c9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 |
"""
Query handler that routes different types of queries appropriately.
"""
from typing import List, Tuple, Dict, Any
import re
class QueryType:
    """String constants naming the query categories produced by classify_query()."""
    GREETING = "greeting"                    # simple salutation, answered without retrieval
    SEMANTIC_ANALYSIS = "semantic_analysis"  # user explicitly asked for semantic analysis
    KEYWORD_SEARCH = "keyword_search"        # exact keyword/phrase lookup mode
    DOCUMENT_QUESTION = "document_question"  # default conversational RAG question
def detect_result_count(query: str) -> int:
    """
    Detect how many results the user wants based on their query.

    Checks, in order: an explicit number ("5 documents", "top 3"),
    singular phrasing ("find me a"), then quantity words ("all",
    "few", "several"), falling back to the maximum.

    Args:
        query: Raw user query text.

    Returns:
        Number of documents to return, clamped to 1-100
        (default: 100 when no quantity is expressed).
    """
    query_lower = query.lower()

    # Explicit numbers take priority over quantity words.
    number_patterns = [
        r'(\d+)\s*(?:documents?|results?|matches?|items?)',  # "5 documents"
        r'(?:top|first|show|find|give me)\s*(\d+)',          # "top 5"
        r'(\d+)',                                            # standalone number
    ]
    for pattern in number_patterns:
        match = re.search(pattern, query_lower)
        if match:
            try:
                # Clamp to 1-100: "0 results" would otherwise return nothing,
                # and huge requests are capped at the maximum.
                return max(1, min(int(match.group(1)), 100))
            except (ValueError, IndexError):
                pass

    # Singular phrasing => exactly one result.
    singular_words = ["a document", "one document", "single document", "the document", "find me a"]
    if any(word in query_lower for word in singular_words):
        return 1

    # Whole-word matching so "recall"/"fewer"/"something" don't
    # accidentally trigger "all"/"few"/"some".
    if re.search(r'\b(?:all|every|everything)\b', query_lower):
        return 100

    if re.search(r'\bfew\b', query_lower):
        return 3

    if re.search(r'\b(?:several|some)\b', query_lower):
        return 10

    # No quantity expressed: return the maximum.
    return 100
def classify_query(query: str) -> str:
    """Classify the type of query based on keywords and patterns."""
    normalized = query.lower().strip()

    # Greetings must match exactly, so "hi, what's in doc 5?" still
    # routes to document search rather than a canned hello.
    if normalized in ("hello", "hi", "hey", "greetings",
                      "good morning", "good afternoon", "good evening"):
        return QueryType.GREETING

    # Mode keywords anywhere in the text switch the handler.
    if "keyword" in normalized:
        return QueryType.KEYWORD_SEARCH
    if "semantic" in normalized:
        return QueryType.SEMANTIC_ANALYSIS

    # Everything else is a conversational document question (RAG mode).
    return QueryType.DOCUMENT_QUESTION
def keyword_search_documents(query: str, docs: List[str]) -> List[Tuple[str, float]]:
    """
    Simple keyword/phrase search in documents.

    Strips meta phrases like "keyword search for" from the query, then
    scores every document containing the remaining phrase.

    Args:
        query: Raw user query, possibly wrapped in meta phrases.
        docs: Documents to search.

    Returns:
        (document, score) pairs sorted by score descending. The score
        combines occurrence frequency (80%) and first-match position
        (20%). Empty list when nothing matches or no search term remains.
    """
    # Remove meta phrases so only the actual search term remains.
    query_clean = query.lower()
    for phrase in ["keyword search", "search for keyword", "exact match", "find the phrase",
                   "search for phrase", "contains the word"]:
        query_clean = query_clean.replace(phrase, "")
    query_clean = query_clean.strip()

    # Drop a leading "for"/"the" left over from e.g. "keyword search for X".
    for prefix in ("for ", "the "):
        if query_clean.startswith(prefix):
            query_clean = query_clean[len(prefix):]
    query_clean = query_clean.strip()

    # Bug fix: an empty search term would otherwise match every document
    # with a near-maximal score, because str.count("") == len(s) + 1 and
    # str.find("") == 0. Bail out instead.
    if not query_clean:
        return []

    results = []
    for doc in docs:
        doc_lower = doc.lower()
        # Count occurrences of the search term (exact phrase match).
        count = doc_lower.count(query_clean)
        # Only include documents that actually contain the search term.
        if count > 0:
            first_position = doc_lower.find(query_clean)
            # Normalized position score (0-1, earlier is better).
            position_score = 1.0 - (first_position / len(doc_lower)) if len(doc_lower) > 0 else 0
            # Frequency score saturates at 3 occurrences.
            frequency_score = min(count / 3.0, 1.0)
            # Prioritize frequency over position for exact matches.
            score = (frequency_score * 0.8) + (position_score * 0.2)
            results.append((doc, score))
    # Sort by score descending.
    results.sort(key=lambda x: x[1], reverse=True)
    return results
def generate_greeting_response() -> str:
    """Return the canned welcome message listing the chatbot's capabilities."""
    parts = [
        "Hello! I'm the KGB Lab Document Chatbot. I can help you:\n\n",
        "• **Search documents** - Ask me any question about the declassified KGB documents\n",
        "• **Keyword search** - Say 'keyword search for [term]' to find exact phrases\n",
        "• **Semantic analysis** - Ask for 'semantic analysis of [text]' to understand meaning\n\n",
        "How can I assist you today?",
    ]
    return "".join(parts)
def generate_semantic_analysis(query: str, hits: List[Tuple[str, float]]) -> str:
    """
    Generate semantic analysis of the query and retrieved documents.

    Args:
        query: User query, possibly prefixed with a meta phrase such as
            "semantic analysis of".
        hits: (document, similarity_score) pairs, best first; documents
            may carry a trailing "[Source: name]" tag.

    Returns:
        Markdown-formatted analysis covering at most the top 3 hits, or a
        "no documents found" message when hits is empty.
    """
    # Strip the meta phrase so only the text to analyze remains.
    query_lower = query.lower()
    for phrase in ["semantic analysis of", "analyze semantically", "semantic meaning of"]:
        if phrase in query_lower:
            query = query_lower.split(phrase, 1)[1].strip()
            break

    analysis = "**Semantic Analysis:**\n\n"
    if not hits:
        analysis += "No semantically similar documents found for this query."
        return analysis

    analysis += f"**Query Terms:** {query}\n\n"
    analysis += "**Semantic Interpretation:**\n"
    # Bug fix: dedupe with dict.fromkeys for deterministic, first-seen order;
    # set() iteration order varies across runs (string hash randomization),
    # making the output nondeterministic.
    unique_terms = dict.fromkeys(query.split())
    analysis += f"The query relates to concepts around: {', '.join(unique_terms)}\n\n"
    analysis += "**Most Semantically Similar Documents:**\n\n"
    for i, (doc, score) in enumerate(hits[:3], 1):
        source = "unknown"
        body = doc
        # Split off the trailing "[Source: name]" tag if present.
        if "[Source:" in doc:
            parts = doc.rsplit("[Source:", 1)
            body = parts[0].strip()
            source = parts[1].strip("] ")
        analysis += f"**{i}.** Similarity: {score:.3f}\n"
        analysis += f"   {body[:200]}{'...' if len(body) > 200 else ''}\n"
        analysis += f"   *[Source: {source}]*\n\n"
    analysis += "\n**Semantic Context:**\n"
    analysis += "These documents were selected based on semantic similarity (meaning-based) rather than exact keyword matching. "
    analysis += "The scores represent how conceptually related each document is to your query."
    return analysis
|