"""Script entry point for the document-embedding pipeline.

Runs three steps in order: PDF preprocessing, embedding generation and
FAISS index creation.
"""
import logging

from src.docs_embd.preprocessing import preprocess_pdfs_from_csv
from src.docs_embd.embed import generate_embeddings
from src.docs_embd.index import create_faiss_index
from src.utils.helpers import save_chunks_to_disk
from src.configs.config import CHUNKS_FILE, METADATA_FILE, MAX_DOCS

logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')


def _announce(message):
    """Emit *message* through the root logger and echo it with ``print``."""
    logging.info(message)
    print(message)


def run(max_docs=None) -> None:
    """Run the preprocessing, embedding and indexing steps in sequence.

    Args:
        max_docs: Forwarded unchanged to ``preprocess_pdfs_from_csv`` as its
            ``max_docs`` keyword argument. Presumably caps the number of
            documents processed, with ``None`` meaning no cap -- confirm
            against that function.
    """
    _announce("Step 1: Starting PDF preprocessing")
    chunks_data = preprocess_pdfs_from_csv(METADATA_FILE, max_docs=max_docs)
    save_chunks_to_disk(chunks_data, CHUNKS_FILE)
    _announce("Step 1: PDF preprocessing complete")

    # The return values of the next two steps are intentionally not kept:
    # nothing in this script reads them.
    # NOTE(review): both helpers are called without arguments, so they
    # presumably read the artefacts written by the previous step from disk
    # and persist their own output -- confirm in src.docs_embd.
    _announce("Step 2: Generating embeddings")
    generate_embeddings()
    _announce("Step 2: Embeddings generated")

    _announce("Step 3: Creating FAISS index")
    create_faiss_index()
    _announce("Step 3: FAISS index created")


if __name__ == "__main__":
    run(max_docs=MAX_DOCS)