File size: 4,854 Bytes
e820a8a
 
 
8931211
e820a8a
 
 
 
 
 
 
 
 
 
 
 
38a0876
 
 
e820a8a
6971bf3
38a0876
a85e7e2
 
 
 
38a0876
 
 
 
 
 
 
 
 
 
 
 
e820a8a
 
 
38a0876
a85e7e2
 
 
38a0876
e820a8a
 
 
38a0876
e820a8a
 
 
38a0876
 
 
 
e820a8a
 
 
 
38a0876
 
 
e820a8a
38a0876
 
 
 
 
 
 
 
 
 
 
e820a8a
38a0876
 
 
 
e820a8a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38a0876
e820a8a
 
 
 
 
 
 
 
 
 
 
 
 
38a0876
8f0c11b
38a0876
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# main.py
# Entry point for the document search + response generation system.
# Supports an interactive REPL mode (default) and a batch mode (--batch in out).
import os
import sys
from document_processor import load_single_document
from embedding_generator import generate_embeddings
from search_engine import SearchEngine
from response_generator import ResponseGenerator
from config import Config
from utils import setup_logger

# Module-wide logger; setup_logger is a project helper (see utils).
logger = setup_logger('main')

def initialize_system():
    """Initialize the search and response system.

    Loads the configured document, generates embeddings, and wires up the
    search engine and response generator.

    Returns:
        tuple: (SearchEngine, ResponseGenerator) ready to serve queries.

    Raises:
        FileNotFoundError: if no document could be loaded from
            ``Config.DOCUMENT_FILE``.
        ValueError: if the generated embedding matrix is empty.
    """
    logger.info("Initializing the system...")

    # Log configuration parameters so runs are reproducible from the log.
    logger.info(f"Configuration: TOP_K={Config.TOP_K}, MIN_SIMILARITY={Config.MIN_SIMILARITY_SCORE}")

    # Load and process documents (returns a pandas DataFrame of chunks).
    documents = load_single_document(Config.DOCUMENT_FILE)

    # Prevent crash downstream if no document was loaded.
    if documents.empty:
        raise FileNotFoundError(f"❌ No document found at: {Config.DOCUMENT_FILE}. "
                                f"Make sure the .txt file is uploaded in Hugging Face.")

    # Log document statistics; a 'chunk_id' column indicates chunked loading.
    num_chunks = len(documents)
    if 'chunk_id' in documents.columns:
        logger.info(f"Loaded document with {num_chunks} chunks")
        if 'content_length' in documents.columns:
            avg_length = documents['content_length'].mean()
            logger.info(f"Average chunk size: {avg_length:.0f} characters")
    else:
        logger.info(f"Loaded {len(documents)} documents")

    # Generate embeddings for all chunks/documents.
    embeddings = generate_embeddings(documents)
    logger.info(f"Generated embeddings of shape {embeddings.shape}")

    # Prevent a FAISS error later if the embedding matrix is empty.
    if embeddings.shape[0] == 0:
        raise ValueError("❌ Embeddings are empty! Document might be empty or path is wrong.")

    # Initialize search engine over the documents + embeddings.
    search_engine = SearchEngine(documents, embeddings)
    logger.info("Search engine initialized")

    # Initialize response generator.
    response_generator = ResponseGenerator()
    logger.info("Response generator initialized")

    logger.info("✅ System ready for queries!")

    return search_engine, response_generator

def process_query(query, search_engine, response_generator):
    """Run one query through retrieval and response generation.

    Args:
        query: The user's query string.
        search_engine: Retrieval component exposing ``search(query)``.
        response_generator: Component exposing
            ``generate_response(query, docs)``.

    Returns:
        The generated response string.
    """
    logger.info(f"Processing query: '{query}'")

    relevant_docs = search_engine.search(query)
    hit_count = len(relevant_docs)

    # Report retrieval quality: similarity range when scores are available.
    if hit_count == 0:
        logger.warning("⚠️ No relevant documents found above similarity threshold")
    elif 'similarity_score' in relevant_docs.columns:
        scores = relevant_docs['similarity_score'].values
        logger.info(f"Found {hit_count} relevant chunks (similarity: {scores.min():.2f} - {scores.max():.2f})")
    else:
        logger.info(f"Found {hit_count} relevant documents")

    response = response_generator.generate_response(query, relevant_docs)
    logger.info(f"Generated response ({len(response)} characters)")

    return response

def interactive_mode(search_engine, response_generator):
    """Run the system in interactive mode, processing queries from user input.

    Loops on stdin until the user types 'quit', closes stdin (EOF), or
    presses Ctrl-C. Empty input lines are ignored instead of being sent
    to the search engine as empty queries.
    """
    print("Enter your queries. Type 'quit' to exit.")
    while True:
        # FIX: input() raises EOFError when stdin is closed (e.g. piped
        # input) and KeyboardInterrupt on Ctrl-C; exit cleanly instead of
        # crashing with a traceback.
        try:
            query = input("Query: ").strip()
        except (EOFError, KeyboardInterrupt):
            print()  # move past the prompt line before exiting
            break
        if query.lower() == 'quit':
            break
        # FIX: skip empty input rather than running an empty query.
        if not query:
            continue

        response = process_query(query, search_engine, response_generator)
        print(f"Response: {response}\n")

def batch_mode(input_file, output_file, search_engine, response_generator):
    """Process queries from an input file and write responses to an output file.

    Args:
        input_file: Path to a UTF-8 text file with one query per line.
        output_file: Path where "Query/Response" pairs are written.
        search_engine: Retrieval component passed through to process_query.
        response_generator: Generation component passed through to process_query.
    """
    with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
        for line in infile:
            query = line.strip()
            # FIX: skip blank lines instead of running empty queries and
            # emitting empty "Query:" entries in the output.
            if not query:
                continue
            response = process_query(query, search_engine, response_generator)
            outfile.write(f"Query: {query}\nResponse: {response}\n\n")
    logger.info(f"Batch processing completed. Results written to {output_file}")

def main():
    """Dispatch to interactive or batch mode based on command-line args."""
    search_engine, response_generator = initialize_system()

    args = sys.argv[1:]
    if not args:
        # No arguments: default to the interactive REPL.
        interactive_mode(search_engine, response_generator)
        return

    if args[0] == '--batch':
        # Batch mode requires exactly: --batch <input_file> <output_file>
        if len(args) != 3:
            print("Usage for batch mode: python main.py --batch input_file output_file")
            sys.exit(1)
        batch_mode(args[1], args[2], search_engine, response_generator)
    else:
        print("Unknown argument. Use --batch for batch mode or no arguments for interactive mode.")
        sys.exit(1)

# Script entry point: only run main() when executed directly, not on import.
if __name__ == "__main__":
    main()