"""
Query handler that routes different types of queries appropriately.
"""
from typing import List, Tuple
import re

class QueryType:
    GREETING = "greeting"
    SEMANTIC_ANALYSIS = "semantic_analysis"
    KEYWORD_SEARCH = "keyword_search"
    DOCUMENT_QUESTION = "document_question"

def detect_result_count(query: str) -> int:
    """
    Detect how many results the user wants based on their query.
    Returns the number of documents to return (default: 100).
    """
    query_lower = query.lower()
    
    # Look for explicit numbers (re is imported at module level)
    # Pattern: "find 5 documents", "show me 10", "top 3", etc.
    number_patterns = [
        r'(\d+)\s*(?:documents?|results?|matches?|items?)',  # "5 documents"
        r'(?:top|first|show|find|give me)\s*(\d+)',          # "top 5"
        r'(\d+)',                                              # standalone number
    ]
    
    for pattern in number_patterns:
        match = re.search(pattern, query_lower)
        if match:
            try:
                count = int(match.group(1))
                return min(count, 100)  # Cap at 100
            except (ValueError, IndexError):
                pass
    
    # Singular indicators = 1 result
    singular_words = ["a document", "one document", "single document", "the document", "find me a"]
    if any(word in query_lower for word in singular_words):
        return 1
    
    # "All" or "every" = 100 (max)
    if any(word in query_lower for word in ["all", "every", "everything"]):
        return 100
    
    # "Few" = 3-5
    if "few" in query_lower:
        return 3
    
    # "Several" or "some" = 10
    if any(word in query_lower for word in ["several", "some"]):
        return 10
    
    # Default: no count indicators found, so return the maximum (100)
    return 100

def classify_query(query: str) -> str:
    """Classify the type of query based on keywords and patterns."""
    query_lower = query.lower().strip()
    
    # Greetings - only exact matches of simple greetings
    simple_greetings = ["hello", "hi", "hey", "greetings", "good morning", "good afternoon", "good evening"]
    if query_lower in simple_greetings:
        return QueryType.GREETING
    
    # Keyword search - trigger if "keyword" appears anywhere
    if "keyword" in query_lower:
        return QueryType.KEYWORD_SEARCH
    
    # Semantic analysis - trigger if "semantic" appears anywhere
    if "semantic" in query_lower:
        return QueryType.SEMANTIC_ANALYSIS
    
    # Default to conversational document question (RAG mode)
    return QueryType.DOCUMENT_QUESTION

def keyword_search_documents(query: str, docs: List[str]) -> List[Tuple[str, float]]:
    """
    Simple keyword/phrase search in documents.
    Returns documents that contain the search terms with a relevance score.
    """
    # Extract the actual search terms (remove meta phrases like "keyword search for")
    query_clean = query.lower()
    for phrase in ["keyword search", "search for keyword", "exact match", "find the phrase", 
                   "search for phrase", "contains the word"]:
        query_clean = query_clean.replace(phrase, "")
    query_clean = query_clean.strip()
    
    # Remove leading "for" or "the" if present
    if query_clean.startswith("for "):
        query_clean = query_clean[4:]
    if query_clean.startswith("the "):
        query_clean = query_clean[4:]
    query_clean = query_clean.strip()
    
    results = []
    for doc in docs:
        doc_lower = doc.lower()
        
        # Count occurrences of search term (exact phrase match)
        count = doc_lower.count(query_clean)
        
        # Only include documents that actually contain the search term
        if count > 0:
            # Score heavily weighted by frequency (exact matches matter most)
            first_position = doc_lower.find(query_clean)
            # Normalize position score (0-1, earlier is better)
            position_score = 1.0 - (first_position / len(doc_lower)) if len(doc_lower) > 0 else 0
            # Frequency score (more occurrences = higher score)
            frequency_score = min(count / 3.0, 1.0)  # Cap at 1.0, scale faster
            # Combined score - prioritize frequency over position for exact matches
            score = (frequency_score * 0.8) + (position_score * 0.2)
            results.append((doc, score))
    
    # Sort by score descending
    results.sort(key=lambda x: x[1], reverse=True)
    return results

def generate_greeting_response() -> str:
    """Generate a friendly greeting response."""
    return (
        "Hello! I'm the KGB Lab Document Chatbot. I can help you:\n\n"
        "• **Search documents** - Ask me any question about the declassified KGB documents\n"
        "• **Keyword search** - Say 'keyword search for [term]' to find exact phrases\n"
        "• **Semantic analysis** - Ask for 'semantic analysis of [text]' to understand meaning\n\n"
        "How can I assist you today?"
    )

def generate_semantic_analysis(query: str, hits: List[Tuple[str, float]]) -> str:
    """Generate semantic analysis of the query and retrieved documents."""
    # Extract what user wants analyzed
    query_lower = query.lower()
    for phrase in ["semantic analysis of", "analyze semantically", "semantic meaning of"]:
        if phrase in query_lower:
            query = query_lower.split(phrase, 1)[1].strip()
            break
    
    analysis = "**Semantic Analysis:**\n\n"
    
    if hits:
        analysis += f"**Query Terms:** {query}\n\n"
        analysis += "**Semantic Interpretation:**\n"
        analysis += f"The query relates to concepts around: {', '.join(set(query.split()))}\n\n"
        
        analysis += "**Most Semantically Similar Documents:**\n\n"
        for i, (doc, score) in enumerate(hits[:3], 1):
            source = "unknown"
            body = doc
            if "[Source:" in doc:
                parts = doc.rsplit("[Source:", 1)
                body = parts[0].strip()
                source = parts[1].strip("] ")
            
            analysis += f"**{i}.** Similarity: {score:.3f}\n"
            analysis += f"   {body[:200]}{'...' if len(body) > 200 else ''}\n"
            analysis += f"   *[Source: {source}]*\n\n"
        
        analysis += "\n**Semantic Context:**\n"
        analysis += "These documents were selected based on semantic similarity (meaning-based) rather than exact keyword matching. "
        analysis += "The scores represent how conceptually related each document is to your query."
    else:
        analysis += "No semantically similar documents found for this query."
    
    return analysis
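
# --- Illustrative usage sketch ---
# A minimal demonstration of how the routing pieces above fit together,
# assuming the caller supplies its own document list; the sample documents
# and queries below are hypothetical, not part of the original module.
if __name__ == "__main__":
    sample_docs = [
        "Operation overview report, 1961. [Source: file_001.txt]",
        "Memo regarding surveillance procedures. [Source: file_002.txt]",
    ]

    for q in ["hello", "keyword search for surveillance", "show me 3 documents about memos"]:
        qtype = classify_query(q)
        count = detect_result_count(q)
        print(f"query={q!r} -> type={qtype}, requested results={count}")

        if qtype == QueryType.GREETING:
            print(generate_greeting_response())
        elif qtype == QueryType.KEYWORD_SEARCH:
            # Exact-phrase matches, highest score first, capped at the requested count
            for doc, score in keyword_search_documents(q, sample_docs)[:count]:
                print(f"  {score:.3f}  {doc}")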