# RAG_APP/src/scripts/1_docs_index.py
# Uploaded by sxid003 ("Upload 83 files", commit 3107242, verified).
from src.docs_embd.preprocessing import preprocess_pdfs_from_csv
from src.docs_embd.embed import generate_embeddings
from src.docs_embd.index import create_faiss_index
from src.utils.helpers import save_chunks_to_disk
from src.configs.config import CHUNKS_FILE, METADATA_FILE, MAX_DOCS
from src.utils.helpers import save_chunks_to_disk
import logging
# Configure logging once at import time: INFO level, with each record
# formatted as "<timestamp> <level> <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')
def _announce(message):
    """Report a pipeline milestone through both the logger and stdout."""
    logging.info(message)
    print(message)


def run(max_docs=None):
    """Run the document indexing pipeline end to end.

    The steps execute in this order:

    1. Preprocess the PDFs listed in ``METADATA_FILE`` and save the
       resulting chunks to ``CHUNKS_FILE``.
    2. Call ``generate_embeddings()``.
    3. Call ``create_faiss_index()``.

    Every milestone is both logged at INFO level and printed to stdout.

    Args:
        max_docs: Forwarded unchanged to ``preprocess_pdfs_from_csv`` as
            its ``max_docs`` argument. Defaults to ``None``
            (presumably "no limit" -- confirm against that helper).

    Returns:
        None.
    """
    _announce("Step 1: Starting PDF preprocessing")
    chunks_data = preprocess_pdfs_from_csv(METADATA_FILE, max_docs=max_docs)
    save_chunks_to_disk(chunks_data, CHUNKS_FILE)
    _announce("Step 1: PDF preprocessing complete")

    # NOTE(review): steps 2 and 3 take no arguments and their return values
    # are not used here; presumably they read their inputs from disk and
    # persist their own outputs -- confirm in src.docs_embd.
    _announce("Step 2: Generating embeddings")
    generate_embeddings()
    _announce("Step 2: Embeddings generated")

    _announce("Step 3: Creating FAISS index")
    create_faiss_index()
    _announce("Step 3: FAISS index created")
if __name__ == "__main__":
    # Script entry point: run the full pipeline with the configured MAX_DOCS.
    run(max_docs=MAX_DOCS)