"""
Query handler that routes different types of queries appropriately.
"""
from typing import List, Tuple
import re

class QueryType:
    GREETING = "greeting"
    SEMANTIC_ANALYSIS = "semantic_analysis"
    KEYWORD_SEARCH = "keyword_search"
    DOCUMENT_QUESTION = "document_question"

def detect_result_count(query: str) -> int:
    """
    Detect how many results the user wants based on their query.
    Returns the number of documents to return (default: 100).
    """
    query_lower = query.lower()
    
    # Look for explicit numbers (re is imported at module level)
    # Pattern: "find 5 documents", "show me 10", "top 3", etc.
    number_patterns = [
        r'(\d+)\s*(?:documents?|results?|matches?|items?)',  # "5 documents"
        r'(?:top|first|show|find|give me)\s*(\d+)',          # "top 5"
        r'(\d+)',                                              # standalone number
    ]
    
    for pattern in number_patterns:
        match = re.search(pattern, query_lower)
        if match:
            try:
                count = int(match.group(1))
                return min(count, 100)  # Cap at 100
            except (ValueError, IndexError):
                pass
    
    # Singular indicators = 1 result
    singular_words = ["a document", "one document", "single document", "the document", "find me a"]
    if any(word in query_lower for word in singular_words):
        return 1
    
    # "All" or "every" = 100 (max)
    if any(word in query_lower for word in ["all", "every", "everything"]):
        return 100
    
    # "Few" = 3-5
    if "few" in query_lower:
        return 3
    
    # "Several" or "some" = 10
    if any(word in query_lower for word in ["several", "some"]):
        return 10
    
    # Default: no count indicators found, so return the maximum (100)
    return 100

def classify_query(query: str) -> str:
    """Classify the type of query based on keywords and patterns."""
    query_lower = query.lower().strip()
    
    # Greetings - only exact matches of simple greetings
    simple_greetings = ["hello", "hi", "hey", "greetings", "good morning", "good afternoon", "good evening"]
    if query_lower in simple_greetings:
        return QueryType.GREETING
    
    # Keyword search - trigger if "keyword" appears anywhere
    if "keyword" in query_lower:
        return QueryType.KEYWORD_SEARCH
    
    # Semantic analysis - trigger if "semantic" appears anywhere
    if "semantic" in query_lower:
        return QueryType.SEMANTIC_ANALYSIS
    
    # Default to conversational document question (RAG mode)
    return QueryType.DOCUMENT_QUESTION

def keyword_search_documents(query: str, docs: List[str]) -> List[Tuple[str, float]]:
    """
    Simple keyword/phrase search in documents.
    Returns documents that contain the search terms with a relevance score.
    """
    # Extract the actual search terms (remove meta phrases like "keyword search for")
    query_clean = query.lower()
    for phrase in ["keyword search", "search for keyword", "exact match", "find the phrase", 
                   "search for phrase", "contains the word"]:
        query_clean = query_clean.replace(phrase, "")
    query_clean = query_clean.strip()
    
    # Remove leading "for" or "the" if present
    if query_clean.startswith("for "):
        query_clean = query_clean[4:]
    if query_clean.startswith("the "):
        query_clean = query_clean[4:]
    query_clean = query_clean.strip()
    
    results = []
    for doc in docs:
        doc_lower = doc.lower()
        
        # Count occurrences of search term (exact phrase match)
        count = doc_lower.count(query_clean)
        
        # Only include documents that actually contain the search term
        if count > 0:
            # Score heavily weighted by frequency (exact matches matter most)
            first_position = doc_lower.find(query_clean)
            # Normalize position score (0-1, earlier is better)
            position_score = 1.0 - (first_position / len(doc_lower)) if len(doc_lower) > 0 else 0
            # Frequency score (more occurrences = higher score)
            frequency_score = min(count / 3.0, 1.0)  # Cap at 1.0, scale faster
            # Combined score - prioritize frequency over position for exact matches
            score = (frequency_score * 0.8) + (position_score * 0.2)
            results.append((doc, score))
    
    # Sort by score descending
    results.sort(key=lambda x: x[1], reverse=True)
    return results

def generate_greeting_response() -> str:
    """Generate a friendly greeting response."""
    return (
        "Hello! I'm the KGB Lab Document Chatbot. I can help you:\n\n"
        "• **Search documents** - Ask me any question about the declassified KGB documents\n"
        "• **Keyword search** - Say 'keyword search for [term]' to find exact phrases\n"
        "• **Semantic analysis** - Ask for 'semantic analysis of [text]' to understand meaning\n\n"
        "How can I assist you today?"
    )

def generate_semantic_analysis(query: str, hits: List[Tuple[str, float]]) -> str:
    """Generate semantic analysis of the query and retrieved documents."""
    # Extract what user wants analyzed
    query_lower = query.lower()
    for phrase in ["semantic analysis of", "analyze semantically", "semantic meaning of"]:
        if phrase in query_lower:
            query = query_lower.split(phrase, 1)[1].strip()
            break
    
    analysis = "**Semantic Analysis:**\n\n"
    
    if hits:
        analysis += f"**Query Terms:** {query}\n\n"
        analysis += "**Semantic Interpretation:**\n"
        analysis += f"The query relates to concepts around: {', '.join(set(query.split()))}\n\n"
        
        analysis += "**Most Semantically Similar Documents:**\n\n"
        for i, (doc, score) in enumerate(hits[:3], 1):
            source = "unknown"
            body = doc
            if "[Source:" in doc:
                parts = doc.rsplit("[Source:", 1)
                body = parts[0].strip()
                source = parts[1].strip("] ")
            
            analysis += f"**{i}.** Similarity: {score:.3f}\n"
            analysis += f"   {body[:200]}{'...' if len(body) > 200 else ''}\n"
            analysis += f"   *[Source: {source}]*\n\n"
        
        analysis += "\n**Semantic Context:**\n"
        analysis += "These documents were selected based on semantic similarity (meaning-based) rather than exact keyword matching. "
        analysis += "The scores represent how conceptually related each document is to your query."
    else:
        analysis += "No semantically similar documents found for this query."
    
    return analysis
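
# --- Illustrative usage sketch ---
# A minimal demonstration of how the routing pieces above fit together,
# assuming the caller supplies its own document list; the sample documents
# and queries below are hypothetical, not part of the original module.
if __name__ == "__main__":
    sample_docs = [
        "Operation overview report, 1961. [Source: file_001.txt]",
        "Memo regarding surveillance procedures. [Source: file_002.txt]",
    ]

    for q in ["hello", "keyword search for surveillance", "show me 3 documents about memos"]:
        qtype = classify_query(q)
        count = detect_result_count(q)
        print(f"query={q!r} -> type={qtype}, requested results={count}")

        if qtype == QueryType.GREETING:
            print(generate_greeting_response())
        elif qtype == QueryType.KEYWORD_SEARCH:
            # Exact-phrase matches, highest score first, capped at the requested count
            for doc, score in keyword_search_documents(q, sample_docs)[:count]:
                print(f"  {score:.3f}  {doc}")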