RAG / ingest.py
Adeen
Initial deployment of Source.AI premium platform
f018f6e
import os
import sys
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
# Directory scanned for input PDFs (relative to the current working directory).
DATA_DIR = "data"
# Directory handed to Chroma as its on-disk persistence location.
CHROMA_DIR = "chroma_db"
# Model name passed to HuggingFaceEmbeddings to embed the text chunks.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
def main() -> None:
    """Build the Chroma vector store from the PDFs found in ``DATA_DIR``.

    The pipeline loads every PDF under ``DATA_DIR``, splits the loaded
    documents into overlapping text chunks, embeds the chunks with the
    model named by ``EMBEDDING_MODEL_NAME``, and stores the result in a
    Chroma database located at ``CHROMA_DIR``. Progress is reported on
    standard output.

    The process is terminated through ``sys.exit`` in these cases:

    * status 1 when ``DATA_DIR`` is not an existing directory;
    * status 0 when the loader returns no documents;
    * status 1 when the loaded documents produce no text chunks.
    """
    print("Starting ingestion pipeline...")

    # Fail early with an actionable message rather than letting the loader
    # stumble over a missing input directory.
    if not os.path.isdir(DATA_DIR):
        print(f"Data directory '{DATA_DIR}' does not exist. Please create it and add PDFs.")
        sys.exit(1)

    print(f"Loading PDFs from '{DATA_DIR}'...")
    loader = PyPDFDirectoryLoader(DATA_DIR)
    documents = loader.load()

    # An empty directory is treated as "nothing to do", not as an error.
    if not documents:
        print(f"No PDF documents found in '{DATA_DIR}'. Add PDFs and run again.")
        sys.exit(0)

    print(f"Loaded {len(documents)} documents. Splitting into chunks...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
    )
    splits = text_splitter.split_documents(documents)
    print(f"Created {len(splits)} text chunks.")

    # Documents were loaded but produced nothing to index (for example PDFs
    # without extractable text). Stop here instead of handing an empty list
    # to the vector store and then reporting success.
    if not splits:
        print(f"No text could be extracted from the PDFs in '{DATA_DIR}'. Nothing to index.")
        sys.exit(1)

    print(f"Initializing embeddings model '{EMBEDDING_MODEL_NAME}'...")
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

    print(f"Creating Chroma database in '{CHROMA_DIR}'...")
    vectorstore = Chroma.from_documents(
        documents=splits,
        embedding=embeddings,
        persist_directory=CHROMA_DIR,
    )

    print("Persisting Chroma database to disk...")
    # NOTE(review): recent Chroma releases appear to persist automatically
    # when a persist_directory is given and deprecate this explicit call --
    # confirm against the installed version before removing it.
    vectorstore.persist()
    print(f"Database successfully created and stored in '{CHROMA_DIR}'.")
# Run the ingestion pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()