Spaces:

thomascerniglia
/

kgbchatbot

Sleeping

App Files Files Community

kgbchatbot / src /retrieval /query_handler.py

thomascerniglia

Fix greeting detection to only trigger on exact simple greetings

fd06dc1 2 months ago

raw

history blame contribute delete

6.63 kB

	"""
	Query handler that routes different types of queries appropriately.
	"""
	from typing import List, Tuple, Dict, Any
	import re

	class QueryType:
	    """String labels for the routes a user query can be dispatched to."""

	    # Label returned by classify_query for a bare salutation such as "hello".
	    GREETING = "greeting"
	    # Label returned when the query mentions "semantic".
	    SEMANTIC_ANALYSIS = "semantic_analysis"
	    # Label returned when the query mentions "keyword".
	    KEYWORD_SEARCH = "keyword_search"
	    # Fallback label: a conversational question about the documents.
	    DOCUMENT_QUESTION = "document_question"

	def detect_result_count(query: str) -> int:
	    """
	    Detect how many results the user wants based on their query.

	    Resolution order (first hit wins):

	    1. An explicit number, capped at 100. A count attached to a noun
	       ("5 documents") is preferred, then a count after a request word
	       ("top 5"), then any bare number in the query.
	    2. A singular phrase such as "a document" or "find me a" -> 1.
	    3. "all", "every" or "everything" -> 100.
	    4. "few" -> 3.
	    5. "several" or "some" -> 10.
	    6. Otherwise the default of 100.

	    Quantity words are matched as whole words, so "the documents",
	    "someone" or "recall" do not trigger a rule by accident.

	    Note: the bare-number fallback means any number in the query (for
	    example a year) is interpreted as a count when nothing more specific
	    matches.

	    Args:
	        query: Raw user query text.

	    Returns:
	        The number of documents to return, at most 100.
	    """
	    query_lower = query.lower()

	    # Ordered from most to least specific; alternation uses a plain "|"
	    # (an escaped "\|" would only match a literal pipe character).
	    number_patterns = (
	        r'(\d+)\s*(?:documents?|results?|matches?|items?)',  # "5 documents"
	        r'\b(?:top|first|show|find|give me)\s*(\d+)',  # "top 5"
	        r'(\d+)',  # standalone number
	    )
	    for pattern in number_patterns:
	        match = re.search(pattern, query_lower)
	        if not match:
	            continue
	        try:
	            count = int(match.group(1))
	        except ValueError:
	            # int() refuses absurdly long digit strings on newer Pythons;
	            # treat such input as "no usable number" for this pattern.
	            continue
	        return min(count, 100)  # Cap at 100

	    # Singular indicators = 1 result
	    if re.search(
	        r'\b(?:a document|one document|single document|the document|find me a)\b',
	        query_lower,
	    ):
	        return 1

	    # "All" or "every" = 100 (max)
	    if re.search(r'\b(?:all|every|everything)\b', query_lower):
	        return 100

	    # "Few" = 3
	    if re.search(r'\bfew\b', query_lower):
	        return 3

	    # "Several" or "some" = 10
	    if re.search(r'\b(?:several|some)\b', query_lower):
	        return 10

	    # No quantity hint at all: return the maximum.
	    return 100

	def classify_query(query: str) -> str:
	    """
	    Decide which QueryType route a user query belongs to.

	    The query is lower-cased and stripped, then checked in this order:
	    an exact match against a short list of plain greetings, the substring
	    "keyword", the substring "semantic". Anything else is treated as a
	    conversational document question.
	    """
	    normalized = query.lower().strip()

	    # Only a query that is *exactly* one of these counts as a greeting;
	    # "hi, what about Berlin?" must still reach the document route.
	    if normalized in {
	        "hello",
	        "hi",
	        "hey",
	        "greetings",
	        "good morning",
	        "good afternoon",
	        "good evening",
	    }:
	        return QueryType.GREETING

	    # Marker words, checked in priority order: "keyword" wins over "semantic".
	    marker_routes = (
	        ("keyword", QueryType.KEYWORD_SEARCH),
	        ("semantic", QueryType.SEMANTIC_ANALYSIS),
	    )
	    for marker, route in marker_routes:
	        if marker in normalized:
	            return route

	    # Default to conversational document question (RAG mode)
	    return QueryType.DOCUMENT_QUESTION

	def keyword_search_documents(query: str, docs: List[str]) -> List[Tuple[str, float]]:
	    """
	    Simple case-insensitive exact-phrase search over documents.

	    Meta phrases such as "keyword search" are removed from the query, then
	    one leading "for " and one leading "the " are dropped. What remains is
	    the search term, matched as a lower-cased substring of each document.

	    Each matching document is scored as
	    0.8 * frequency + 0.2 * position, where frequency is
	    min(occurrences / 3, 1) and position is 1 minus the relative offset of
	    the first occurrence (earlier is better).

	    Args:
	        query: Raw user query, possibly including meta phrases.
	        docs: Document texts to search.

	    Returns:
	        (document, score) pairs for documents containing the term, sorted
	        by score descending. Empty when no search term is left after
	        cleaning the query.
	    """
	    # Extract the actual search terms (remove meta phrases like "keyword search for")
	    query_clean = query.lower()
	    meta_phrases = (
	        "keyword search",
	        "search for keyword",
	        "exact match",
	        "find the phrase",
	        "search for phrase",
	        "contains the word",
	    )
	    for phrase in meta_phrases:
	        query_clean = query_clean.replace(phrase, "")
	    query_clean = query_clean.strip()

	    # Remove leading "for" or "the" if present
	    if query_clean.startswith("for "):
	        query_clean = query_clean[4:]
	    if query_clean.startswith("the "):
	        query_clean = query_clean[4:]
	    query_clean = query_clean.strip()

	    # An empty term is a substring of every string, so without this guard
	    # a query made only of meta phrases would "match" every document.
	    if not query_clean:
	        return []

	    results = []
	    for doc in docs:
	        doc_lower = doc.lower()

	        # Count occurrences of search term (exact phrase match)
	        count = doc_lower.count(query_clean)
	        if count == 0:
	            continue

	        # A match exists, so the document is non-empty and the division is safe.
	        first_position = doc_lower.find(query_clean)
	        # Normalize position score (0-1, earlier is better)
	        position_score = 1.0 - (first_position / len(doc_lower))
	        # Frequency score saturates at three occurrences
	        frequency_score = min(count / 3.0, 1.0)
	        # Combined score - prioritize frequency over position for exact matches
	        score = (frequency_score * 0.8) + (position_score * 0.2)
	        results.append((doc, score))

	    # Sort by score descending
	    results.sort(key=lambda item: item[1], reverse=True)
	    return results

	def generate_greeting_response() -> str:
	    """Build the fixed welcome message that lists the chatbot's three modes."""
	    intro = "Hello! I'm the KGB Lab Document Chatbot. I can help you:"
	    capabilities = [
	        "• Search documents - Ask me any question about the declassified KGB documents",
	        "• Keyword search - Say 'keyword search for [term]' to find exact phrases",
	        "• Semantic analysis - Ask for 'semantic analysis of [text]' to understand meaning",
	    ]
	    closing = "How can I assist you today?"

	    # Blank line after the intro and before the closing; one bullet per line.
	    return intro + "\n\n" + "\n".join(capabilities) + "\n\n" + closing

	def generate_semantic_analysis(query: str, hits: List[Tuple[str, float]]) -> str:
	    """
	    Generate a plain-text semantic analysis of the query and retrieved documents.

	    If the query contains a lead-in such as "semantic analysis of", only the
	    (lower-cased) text after it is analysed; otherwise the query is used
	    as given. The report lists the query terms, the distinct words of the
	    query in first-seen order, and up to three hits with their similarity
	    score, a preview of at most 200 characters and their source.

	    Args:
	        query: Raw user query.
	        hits: (document, similarity score) pairs; only the first three are
	            shown. A document may end with a "[Source: ...]" tag, which is
	            split off and shown separately.

	    Returns:
	        The formatted report, or a short "nothing found" message when
	        hits is empty.
	    """
	    # Extract what user wants analyzed
	    query_lower = query.lower()
	    for phrase in ("semantic analysis of", "analyze semantically", "semantic meaning of"):
	        if phrase in query_lower:
	            query = query_lower.split(phrase, 1)[1].strip()
	            break

	    if not hits:
	        return "Semantic Analysis:\n\nNo semantically similar documents found for this query."

	    # De-duplicate while keeping first-seen order; a set would make the
	    # order of the listed terms vary from run to run.
	    concepts = ', '.join(dict.fromkeys(query.split()))

	    sections = [
	        "Semantic Analysis:\n\n",
	        f"Query Terms: {query}\n\n",
	        "Semantic Interpretation:\n",
	        f"The query relates to concepts around: {concepts}\n\n",
	        "Most Semantically Similar Documents:\n\n",
	    ]

	    for i, (doc, score) in enumerate(hits[:3], 1):
	        source = "unknown"
	        body = doc
	        if "[Source:" in doc:
	            # Split on the last tag so a "[Source:" inside the text stays in the body.
	            text, _, tag = doc.rpartition("[Source:")
	            body = text.strip()
	            source = tag.strip("] ")

	        preview = body[:200] + ('...' if len(body) > 200 else '')
	        sections.append(f"{i}. Similarity: {score:.3f}\n")
	        sections.append(f" {preview}\n")
	        sections.append(f" [Source: {source}]\n\n")

	    sections.append("\nSemantic Context:\n")
	    sections.append(
	        "These documents were selected based on semantic similarity (meaning-based) rather than exact keyword matching. "
	    )
	    sections.append("The scores represent how conceptually related each document is to your query.")

	    return "".join(sections)