Spaces:
Sleeping
Sleeping
| from src.docs_embd.preprocessing import preprocess_pdfs_from_csv | |
| from src.docs_embd.embed import generate_embeddings | |
| from src.docs_embd.index import create_faiss_index | |
| from src.utils.helpers import save_chunks_to_disk | |
| from src.configs.config import CHUNKS_FILE, METADATA_FILE, MAX_DOCS | |
| from src.utils.helpers import save_chunks_to_disk | |
| import logging | |
# Configure the root logger at import time: INFO and above, with each record
# rendered as "<timestamp> <level> <message>".
logging.basicConfig(
    format='%(asctime)s %(levelname)s %(message)s',
    level=logging.INFO,
)
def _announce(message):
    """Emit *message* via ``logging.info`` and echo it to stdout with ``print``."""
    logging.info(message)
    print(message)


def run(max_docs=None):
    """Run the three pipeline steps in order: preprocess, embed, index.

    Step 1 calls ``preprocess_pdfs_from_csv`` on ``METADATA_FILE`` and hands
    the result to ``save_chunks_to_disk`` together with ``CHUNKS_FILE``.
    Step 2 calls ``generate_embeddings()`` and step 3 calls
    ``create_faiss_index()``. A start and a completion message is logged and
    printed around every step.

    Args:
        max_docs: Forwarded unchanged as the ``max_docs`` keyword of
            ``preprocess_pdfs_from_csv``. Defaults to ``None``.

    Returns:
        None.
    """
    _announce("Step 1: Starting PDF preprocessing")
    chunks_data = preprocess_pdfs_from_csv(METADATA_FILE, max_docs=max_docs)
    save_chunks_to_disk(chunks_data, CHUNKS_FILE)
    _announce("Step 1: PDF preprocessing complete")

    _announce("Step 2: Generating embeddings")
    # The return value is deliberately not bound: nothing below reads it, and
    # a local name would only keep the result alive for the rest of the run.
    # NOTE(review): step 3 takes no arguments, so it presumably picks up what
    # this step produced from somewhere other than a return value -- confirm.
    generate_embeddings()
    _announce("Step 2: Embeddings generated")

    _announce("Step 3: Creating FAISS index")
    # Likewise unused by this function, so the result is not bound.
    create_faiss_index()
    _announce("Step 3: FAISS index created")
# Script entry point: when executed directly (not imported), run the whole
# pipeline, passing the configured MAX_DOCS as max_docs.
if __name__ == "__main__":
    run(max_docs=MAX_DOCS)