| |
| import os |
| import sys |
| from document_processor import load_single_document |
| from embedding_generator import generate_embeddings |
| from search_engine import SearchEngine |
| from response_generator import ResponseGenerator |
| from config import Config |
| from utils import setup_logger |
|
|
# Module-wide logger shared by every function in this entry-point script.
logger = setup_logger('main')
|
|
def initialize_system():
    """Initialize the search and response system.

    Loads the document referenced by ``Config.DOCUMENT_FILE``, generates
    embeddings for it, and builds the search engine and response generator.

    Returns:
        tuple: ``(search_engine, response_generator)``.

    Raises:
        FileNotFoundError: If the loaded document collection is empty.
        ValueError: If the generated embeddings contain zero rows.
    """
    logger.info("Initializing the system...")
    logger.info(f"Configuration: TOP_K={Config.TOP_K}, MIN_SIMILARITY={Config.MIN_SIMILARITY_SCORE}")

    documents = load_single_document(Config.DOCUMENT_FILE)

    # An empty result is reported as a missing file: nothing usable was loaded.
    # NOTE(review): relies on the loader returning an object with `.empty`
    # and `.columns` (presumably a pandas DataFrame) - confirm.
    if documents.empty:
        raise FileNotFoundError(
            f"❌ No document found at: {Config.DOCUMENT_FILE}. "
            f"Make sure the .txt file is uploaded in Hugging Face."
        )

    num_chunks = len(documents)
    if 'chunk_id' in documents.columns:
        # The document was split into chunks; report chunk statistics.
        logger.info(f"Loaded document with {num_chunks} chunks")
        if 'content_length' in documents.columns:
            avg_length = documents['content_length'].mean()
            logger.info(f"Average chunk size: {avg_length:.0f} characters")
    else:
        logger.info(f"Loaded {num_chunks} documents")

    embeddings = generate_embeddings(documents)
    logger.info(f"Generated embeddings of shape {embeddings.shape}")

    # Fail fast: a search engine built on zero embeddings can never match.
    if embeddings.shape[0] == 0:
        raise ValueError("❌ Embeddings are empty! Document might be empty or path is wrong.")

    search_engine = SearchEngine(documents, embeddings)
    logger.info("Search engine initialized")

    response_generator = ResponseGenerator()
    logger.info("Response generator initialized")

    logger.info("✅ System ready for queries!")

    return search_engine, response_generator
|
|
def process_query(query, search_engine, response_generator):
    """Process a single query and return the response.

    Args:
        query: Query text; passed unchanged to ``search_engine.search`` and
            ``response_generator.generate_response``.
        search_engine: Object providing ``search(query)``. The result must
            support ``len()`` and, when non-empty, expose ``.columns``.
        response_generator: Object providing
            ``generate_response(query, relevant_docs)``.

    Returns:
        The value returned by ``response_generator.generate_response``. It
        must support ``len()``, because its length is logged.
    """
    logger.info(f"Processing query: '{query}'")

    relevant_docs = search_engine.search(query)

    # Explicit length check rather than truthiness: the `.columns` access
    # below suggests a DataFrame-like result, whose truth value may be
    # ambiguous (assumption - confirm against SearchEngine.search).
    if len(relevant_docs) == 0:
        logger.warning("⚠️ No relevant documents found above similarity threshold")
    elif 'similarity_score' in relevant_docs.columns:
        scores = relevant_docs['similarity_score'].values
        logger.info(f"Found {len(relevant_docs)} relevant chunks (similarity: {scores.min():.2f} - {scores.max():.2f})")
    else:
        logger.info(f"Found {len(relevant_docs)} relevant documents")

    # The generator is called even when nothing matched, so it decides how to
    # answer a query with no supporting documents.
    response = response_generator.generate_response(query, relevant_docs)

    logger.info(f"Generated response ({len(response)} characters)")

    return response
|
|
def interactive_mode(search_engine, response_generator):
    """Run the system in interactive mode, processing queries from user input.

    Reads queries from standard input until the user types ``quit``
    (case-insensitive) or the input stream ends. Blank input is ignored.

    Args:
        search_engine: Search engine forwarded to ``process_query``.
        response_generator: Response generator forwarded to ``process_query``.
    """
    print("Enter your queries. Type 'quit' to exit.")
    while True:
        try:
            query = input("Query: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D, Ctrl-C, or exhausted piped stdin: leave the loop cleanly
            # instead of crashing with a traceback.
            print()
            break

        if query.lower() == 'quit':
            break

        # An empty line carries no query; do not send it through the pipeline.
        if not query:
            continue

        response = process_query(query, search_engine, response_generator)
        print(f"Response: {response}\n")
|
|
def batch_mode(input_file, output_file, search_engine, response_generator):
    """Process queries from an input file and write responses to an output file.

    Each non-blank line of ``input_file`` is treated as one query. For every
    query a ``Query: ...`` / ``Response: ...`` pair followed by a blank line is
    written to ``output_file``. Both files are handled as UTF-8.

    Args:
        input_file: Path of the text file containing one query per line.
        output_file: Path of the file to write results to (overwritten).
        search_engine: Search engine forwarded to ``process_query``.
        response_generator: Response generator forwarded to ``process_query``.
    """
    with open(input_file, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile:
        for line in infile:
            query = line.strip()
            # Blank lines (e.g. a trailing newline) are not queries; skip them
            # rather than running an empty search.
            if not query:
                continue
            response = process_query(query, search_engine, response_generator)
            outfile.write(f"Query: {query}\nResponse: {response}\n\n")

    # Logged after the files are closed, so the results are already flushed.
    logger.info(f"Batch processing completed. Results written to {output_file}")
|
|
def main():
    """Command-line entry point.

    With no arguments, runs interactive mode. With
    ``--batch input_file output_file``, runs batch mode. Any other argument
    list prints a usage message and exits with status 1.

    The command line is validated before the system is initialized, so a
    usage error is reported immediately instead of after loading the document
    and generating embeddings.
    """
    args = sys.argv[1:]
    batch_files = None

    if args:
        if args[0] != '--batch':
            print("Unknown argument. Use --batch for batch mode or no arguments for interactive mode.")
            sys.exit(1)
        # '--batch' must be followed by exactly an input and an output path.
        if len(args) != 3:
            print("Usage for batch mode: python main.py --batch input_file output_file")
            sys.exit(1)
        batch_files = (args[1], args[2])

    search_engine, response_generator = initialize_system()

    if batch_files is not None:
        input_file, output_file = batch_files
        batch_mode(input_file, output_file, search_engine, response_generator)
    else:
        interactive_mode(search_engine, response_generator)
|
|
# Run the CLI only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()
|
|
|
|
|
|