File size: 1,371 Bytes
3107242
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
from src.docs_embd.preprocessing import preprocess_pdfs_from_csv
from src.docs_embd.embed import generate_embeddings
from src.docs_embd.index import create_faiss_index
from src.utils.helpers import save_chunks_to_disk
from src.configs.config import CHUNKS_FILE, METADATA_FILE, MAX_DOCS
from src.utils.helpers import save_chunks_to_disk
import logging

# Configure root logging at import time: INFO level, with every record
# rendered as "<timestamp> <level name> <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')

def run(max_docs=None):
    """Run the three pipeline steps in order: preprocess, embed, index.

    The start and end of every step is reported twice, first through
    ``logging.info`` and then through ``print``.

    Args:
        max_docs: Forwarded unchanged as the ``max_docs`` keyword of
            ``preprocess_pdfs_from_csv``. Defaults to ``None``.

    Returns:
        None. The results of the embedding and indexing steps are not
        returned to the caller.
    """

    def announce(message):
        # Each progress message goes to the logger first, then to print().
        logging.info(message)
        print(message)

    # Step 1: preprocess using METADATA_FILE, then save the result to CHUNKS_FILE.
    announce("Step 1: Starting PDF preprocessing")
    processed_chunks = preprocess_pdfs_from_csv(METADATA_FILE, max_docs=max_docs)
    save_chunks_to_disk(processed_chunks, CHUNKS_FILE)
    announce("Step 1: PDF preprocessing complete")

    # Step 2: called with no arguments -- presumably it picks up the chunks
    # saved in step 1 (TODO confirm against src.docs_embd.embed).
    announce("Step 2: Generating embeddings")
    _vectors, _chunk_records = generate_embeddings()
    announce("Step 2: Embeddings generated")

    # Step 3: also called with no arguments; the returned index is not used here.
    announce("Step 3: Creating FAISS index")
    _faiss_index = create_faiss_index()
    announce("Step 3: FAISS index created")
    


# Script entry point: when this file is executed directly, run the pipeline,
# passing MAX_DOCS from src.configs.config as max_docs.
if __name__ == "__main__":
    run(max_docs=MAX_DOCS)