Spaces:
Sleeping
Sleeping
File size: 1,371 Bytes
3107242 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
from src.docs_embd.preprocessing import preprocess_pdfs_from_csv
from src.docs_embd.embed import generate_embeddings
from src.docs_embd.index import create_faiss_index
from src.utils.helpers import save_chunks_to_disk
from src.configs.config import CHUNKS_FILE, METADATA_FILE, MAX_DOCS
from src.utils.helpers import save_chunks_to_disk
import logging
# Configure the root logger once at import time: INFO level, with a timestamp
# and the level name in front of every message.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
def _announce(message):
    """Report *message* on the root logger at INFO level, then on stdout."""
    logging.info(message)
    print(message)


def run(max_docs=None):
    """Run the document pipeline: preprocess PDFs, embed, build the index.

    The three stages run strictly in order, and a progress message is
    emitted (to the log and to stdout) before and after each one:

    1. ``preprocess_pdfs_from_csv`` is called on ``METADATA_FILE`` and its
       result is written to ``CHUNKS_FILE`` via ``save_chunks_to_disk``.
    2. ``generate_embeddings`` is called.
    3. ``create_faiss_index`` is called.

    Args:
        max_docs: Forwarded unchanged to ``preprocess_pdfs_from_csv`` as its
            ``max_docs`` keyword argument. Defaults to ``None``.

    Returns:
        None.
    """
    _announce("Step 1: Starting PDF preprocessing")
    chunks_data = preprocess_pdfs_from_csv(METADATA_FILE, max_docs=max_docs)
    save_chunks_to_disk(chunks_data, CHUNKS_FILE)
    _announce("Step 1: PDF preprocessing complete")

    _announce("Step 2: Generating embeddings")
    # The return value is deliberately not bound: nothing below uses it, and
    # holding a reference would keep it alive while the index is built.
    # NOTE(review): called without arguments, so it presumably reads the
    # chunks saved in step 1 and persists its own output - confirm.
    generate_embeddings()
    _announce("Step 2: Embeddings generated")

    _announce("Step 3: Creating FAISS index")
    # Likewise called without arguments; its result was never used here.
    create_faiss_index()
    _announce("Step 3: FAISS index created")
# Script entry point: when executed directly (not imported), run the whole
# pipeline, passing the configured MAX_DOCS value as ``max_docs``.
if __name__ == "__main__":
    run(max_docs=MAX_DOCS)
|