import os import sys from langchain_text_splitters import RecursiveCharacterTextSplitter from langchain_community.document_loaders import PyPDFDirectoryLoader from langchain_community.embeddings import HuggingFaceEmbeddings from langchain_community.vectorstores import Chroma DATA_DIR = "data" CHROMA_DIR = "chroma_db" EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2" def main() -> None: print("Starting ingestion pipeline...") if not os.path.isdir(DATA_DIR): print(f"Data directory '{DATA_DIR}' does not exist. Please create it and add PDFs.") sys.exit(1) print(f"Loading PDFs from '{DATA_DIR}'...") loader = PyPDFDirectoryLoader(DATA_DIR) documents = loader.load() if not documents: print(f"No PDF documents found in '{DATA_DIR}'. Add PDFs and run again.") sys.exit(0) print(f"Loaded {len(documents)} documents. Splitting into chunks...") text_splitter = RecursiveCharacterTextSplitter( chunk_size=500, chunk_overlap=50, ) splits = text_splitter.split_documents(documents) print(f"Created {len(splits)} text chunks.") print(f"Initializing embeddings model '{EMBEDDING_MODEL_NAME}'...") embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME) print(f"Creating Chroma database in '{CHROMA_DIR}'...") vectorstore = Chroma.from_documents( documents=splits, embedding=embeddings, persist_directory=CHROMA_DIR, ) print("Persisting Chroma database to disk...") vectorstore.persist() print(f"Database successfully created and stored in '{CHROMA_DIR}'.") if __name__ == "__main__": main()