"""Build (or reuse) a FAISS vector store from the PDFs under ``DATA_PATH``.

Importing or running this module executes the whole pipeline: it loads the
PDFs, splits them into chunks, instantiates the embedding model and then
either builds and saves a new FAISS index at ``FAISS_PATH`` or loads the one
already saved there. The results are exposed as the module-level names
``documents``, ``text_chunks``, ``embedding_model`` and ``db``.
"""
import os

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

# Define the data path
DATA_PATH = "data/"
FAISS_PATH = "vectorstore/db_faiss"

# Files that FAISS.save_local() / FAISS.load_local() use with their default
# index name ("index"). Both must be present for a saved index to be loadable.
_FAISS_INDEX_FILES = ("index.faiss", "index.pkl")


# Step 1: Load raw PDFs
def load_pdf_files(data: str) -> list:
    """Load every file matching ``*.pdf`` in directory *data*.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``.

    Returns:
        The documents produced by ``DirectoryLoader.load()`` using
        ``PyPDFLoader`` for each matched file.
    """
    loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
    return documents


# NOTE: loading stays eager (even when a saved index exists) so that the
# module-level names ``documents`` and ``text_chunks`` remain available to
# any code that imports them.
documents = load_pdf_files(DATA_PATH)


# Step 2: Create Chunks
def create_chunks(extracted_data: list) -> list:
    """Split *extracted_data* into overlapping chunks.

    Args:
        extracted_data: Documents as returned by :func:`load_pdf_files`.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents`` with
        ``chunk_size=500`` and ``chunk_overlap=50``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
    )
    return text_splitter.split_documents(extracted_data)


text_chunks = create_chunks(documents)


# Step 3: Embeddings
def get_embedding_model() -> HuggingFaceEmbeddings:
    """Return a ``HuggingFaceEmbeddings`` for ``all-MiniLM-L6-v2``."""
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


embedding_model = get_embedding_model()


def _faiss_index_exists(folder: str) -> bool:
    """Return True when *folder* contains every file of a saved FAISS index."""
    return all(
        os.path.isfile(os.path.join(folder, filename))
        for filename in _FAISS_INDEX_FILES
    )


# Step 4: Store or Load FAISS
# Check for the index files themselves rather than just the directory: an
# empty or half-written directory (e.g. after an interrupted save) would pass
# a plain existence check and then make load_local() fail.
if not _faiss_index_exists(FAISS_PATH):
    if not text_chunks:
        # Building an index from zero chunks fails deep inside the vector
        # store with an unhelpful error; report the actual cause instead.
        raise ValueError(
            f"No text chunks were produced from the PDFs in {DATA_PATH!r}; "
            "cannot build the FAISS index."
        )
    db = FAISS.from_documents(text_chunks, embedding_model)
    db.save_local(FAISS_PATH)
else:
    # SECURITY: load_local() unpickles data from FAISS_PATH, which is why the
    # library requires this explicit opt-in. Only acceptable while the index
    # directory is written exclusively by this script; never point FAISS_PATH
    # at files obtained from an untrusted source.
    db = FAISS.load_local(
        FAISS_PATH,
        embedding_model,
        allow_dangerous_deserialization=True,
    )