import os

from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma  # or FAISS


def ingest_data(data_path="/", persist_directory="db/"):
    """Load PDFs from a directory, chunk them, embed them, and index them in Chroma.

    Args:
        data_path: Directory handed to ``PyPDFDirectoryLoader``.
            NOTE(review): the default is the filesystem root ("/"); a
            project-relative folder was presumably intended. Left unchanged
            because the default is part of the public signature -- confirm.
        persist_directory: Directory passed to Chroma as its
            ``persist_directory``.

    Returns:
        The Chroma vector store built from the chunks, or ``None`` when the
        directory produced no documents (nothing is embedded in that case).

    Raises:
        NotADirectoryError: If ``data_path`` is not an existing directory.
    """
    # Fail early with a clear message instead of letting a bad path surface
    # later as an empty document list.
    if not os.path.isdir(data_path):
        raise NotADirectoryError(
            f"PDF source directory does not exist: {data_path!r}"
        )

    documents = PyPDFDirectoryLoader(data_path).load()

    # 1000-character chunks with a 200-character overlap between neighbours.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    )
    texts = text_splitter.split_documents(documents)

    # Do not load the embedding model or touch the store when there is
    # nothing to index; reporting success here would be misleading.
    if not texts:
        print(f"No PDF content found in {data_path!r}; nothing was ingested.")
        return None

    embeddings = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")
    vectordb = Chroma.from_documents(
        documents=texts,
        embedding=embeddings,
        persist_directory=persist_directory,
    )
    # NOTE(review): some older chromadb releases reportedly require an
    # explicit ``vectordb.persist()`` to flush to disk -- verify against the
    # installed version.

    print("Data ingestion complete!")
    return vectordb


if __name__ == "__main__":
    ingest_data()