# main.py
import os
import sys
from document_processor import load_single_document
from embedding_generator import generate_embeddings
from search_engine import SearchEngine
from response_generator import ResponseGenerator
from config import Config
from utils import setup_logger
logger = setup_logger('main')
def initialize_system():
    """Initialize the search and response system.

    Loads the configured document, generates embeddings for it, and builds
    the search engine and response generator.

    Returns:
        tuple: (search_engine, response_generator) ready to serve queries.

    Raises:
        FileNotFoundError: if no document could be loaded from
            Config.DOCUMENT_FILE.
        ValueError: if the generated embeddings are empty.
    """
    logger.info("Initializing the system...")
    # Log configuration parameters for traceability.
    logger.info(f"Configuration: TOP_K={Config.TOP_K}, MIN_SIMILARITY={Config.MIN_SIMILARITY_SCORE}")

    # Load and process documents (presumably a pandas DataFrame of chunks —
    # .empty / .columns are used below).
    documents = load_single_document(Config.DOCUMENT_FILE)
    # Fail fast with a clear message instead of crashing later on empty data.
    if documents.empty:
        raise FileNotFoundError(
            f"❌ No document found at: {Config.DOCUMENT_FILE}. "
            f"Make sure the .txt file is uploaded in Hugging Face."
        )

    # Log document statistics, including chunk info when available.
    num_chunks = len(documents)
    if 'chunk_id' in documents.columns:
        logger.info(f"Loaded document with {num_chunks} chunks")
        if 'content_length' in documents.columns:
            avg_length = documents['content_length'].mean()
            logger.info(f"Average chunk size: {avg_length:.0f} characters")
    else:
        logger.info(f"Loaded {num_chunks} documents")

    # Generate embeddings for all chunks.
    embeddings = generate_embeddings(documents)
    logger.info(f"Generated embeddings of shape {embeddings.shape}")
    # Prevent a FAISS error downstream if the embedding matrix is empty.
    if embeddings.shape[0] == 0:
        raise ValueError("❌ Embeddings are empty! Document might be empty or path is wrong.")

    # Initialize search engine over the documents and their embeddings.
    search_engine = SearchEngine(documents, embeddings)
    logger.info("Search engine initialized")

    # Initialize response generator.
    response_generator = ResponseGenerator()
    logger.info("Response generator initialized")

    logger.info("✅ System ready for queries!")
    return search_engine, response_generator
def process_query(query, search_engine, response_generator):
    """Run one query through retrieval and response generation.

    Args:
        query: The user's query string.
        search_engine: Object exposing search(query) -> table of hits.
        response_generator: Object exposing generate_response(query, docs) -> str.

    Returns:
        The generated response string.
    """
    logger.info(f"Processing query: '{query}'")

    relevant_docs = search_engine.search(query)

    # Log what retrieval found, including similarity range when available.
    if len(relevant_docs) == 0:
        logger.warning("⚠️ No relevant documents found above similarity threshold")
    elif 'similarity_score' in relevant_docs.columns:
        scores = relevant_docs['similarity_score'].values
        logger.info(f"Found {len(relevant_docs)} relevant chunks (similarity: {scores.min():.2f} - {scores.max():.2f})")
    else:
        logger.info(f"Found {len(relevant_docs)} relevant documents")

    response = response_generator.generate_response(query, relevant_docs)
    logger.info(f"Generated response ({len(response)} characters)")
    return response
def interactive_mode(search_engine, response_generator):
    """Run the system as an interactive read-eval-print loop.

    Reads queries from stdin until the user types 'quit' or closes the
    input stream (Ctrl-D / Ctrl-Z), or interrupts with Ctrl-C.
    """
    print("Enter your queries. Type 'quit' to exit.")
    while True:
        try:
            query = input("Query: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Exit cleanly when stdin is closed or the user interrupts,
            # instead of crashing with a traceback.
            print()
            break
        if query.lower() == 'quit':
            break
        if not query:
            # Skip empty input instead of running an empty-string search.
            continue
        response = process_query(query, search_engine, response_generator)
        print(f"Response: {response}\n")
def batch_mode(input_file, output_file, search_engine, response_generator):
    """Process queries from an input file and write responses to an output file.

    Reads one query per line from *input_file* (blank lines are skipped) and
    writes each result to *output_file* as a "Query: .../Response: ..." record.
    """
    with open(input_file, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile:
        for line in infile:
            query = line.strip()
            if not query:
                # Skip blank lines rather than querying with an empty string.
                continue
            response = process_query(query, search_engine, response_generator)
            outfile.write(f"Query: {query}\nResponse: {response}\n\n")
    logger.info(f"Batch processing completed. Results written to {output_file}")
def main():
    """Entry point: initialize the system, then dispatch to batch or interactive mode."""
    search_engine, response_generator = initialize_system()

    args = sys.argv[1:]
    if not args:
        # No CLI arguments: interactive REPL.
        interactive_mode(search_engine, response_generator)
        return

    if args[0] != '--batch':
        print("Unknown argument. Use --batch for batch mode or no arguments for interactive mode.")
        sys.exit(1)

    if len(args) != 3:
        print("Usage for batch mode: python main.py --batch input_file output_file")
        sys.exit(1)

    batch_mode(args[1], args[2], search_engine, response_generator)


if __name__ == "__main__":
    main()
|