File size: 1,673 Bytes
import os
import sys
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
# Directory that main() checks for existence and hands to PyPDFDirectoryLoader.
DATA_DIR = "data"
# Directory passed to Chroma as its persist_directory.
CHROMA_DIR = "chroma_db"
# Model name given to HuggingFaceEmbeddings to embed the text chunks.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
def main() -> None:
    """Build a Chroma vector store from the PDFs found in ``DATA_DIR``.

    The pipeline loads every PDF under ``DATA_DIR``, splits the loaded
    documents with ``RecursiveCharacterTextSplitter`` (``chunk_size=500``,
    ``chunk_overlap=50``), embeds the chunks with the HuggingFace model named
    by ``EMBEDDING_MODEL_NAME`` and stores them in a Chroma database whose
    persist directory is ``CHROMA_DIR``.

    Progress is reported on stdout. The function terminates the process via
    ``sys.exit`` in these cases:

    * status 1 -- ``DATA_DIR`` is not an existing directory;
    * status 0 -- the loader returned no documents;
    * status 1 -- splitting produced no chunks (nothing to embed).
    """
    print("Starting ingestion pipeline...")

    if not os.path.isdir(DATA_DIR):
        print(f"Data directory '{DATA_DIR}' does not exist. Please create it and add PDFs.")
        sys.exit(1)

    print(f"Loading PDFs from '{DATA_DIR}'...")
    loader = PyPDFDirectoryLoader(DATA_DIR)
    documents = loader.load()

    if not documents:
        print(f"No PDF documents found in '{DATA_DIR}'. Add PDFs and run again.")
        sys.exit(0)

    print(f"Loaded {len(documents)} documents. Splitting into chunks...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
    )
    splits = text_splitter.split_documents(documents)
    print(f"Created {len(splits)} text chunks.")

    # Documents without any extractable text yield zero chunks. Stop here with
    # a clear message instead of handing an empty list to the vector store,
    # which would otherwise fail with an opaque library error.
    if not splits:
        print(
            f"No text chunks were produced from the PDFs in '{DATA_DIR}'. "
            "The files may contain no extractable text."
        )
        sys.exit(1)

    print(f"Initializing embeddings model '{EMBEDDING_MODEL_NAME}'...")
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

    print(f"Creating Chroma database in '{CHROMA_DIR}'...")
    vectorstore = Chroma.from_documents(
        documents=splits,
        embedding=embeddings,
        persist_directory=CHROMA_DIR,
    )

    # Chroma 0.4+ writes to persist_directory automatically and the explicit
    # persist() method is deprecated (and absent from newer integrations), so
    # only call it when the installed version still provides it.
    persist = getattr(vectorstore, "persist", None)
    if callable(persist):
        print("Persisting Chroma database to disk...")
        persist()

    print(f"Database successfully created and stored in '{CHROMA_DIR}'.")
# Run the ingestion pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|