"""Ingest plain-text files into a persistent Chroma vector store.

Every ``.txt`` file located directly inside ``DATA_DIR`` is loaded, split
into overlapping chunks, embedded with a sentence-transformers model, and
written to the Chroma database stored under ``CHROMA_DIR``.
"""

import os

from langchain.document_loaders import TextLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# Directory scanned (non-recursively) for ``.txt`` source documents.
DATA_DIR = "data"
# Directory in which Chroma persists the resulting collection.
CHROMA_DIR = "chroma_db"

embedding = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

all_docs = []
# os.listdir() yields entries in arbitrary order; sorting keeps the order in
# which chunks are inserted reproducible across runs and machines.
for filename in sorted(os.listdir(DATA_DIR)):
    if not filename.endswith(".txt"):
        continue
    # Pin the encoding so decoding does not depend on the platform locale.
    loader = TextLoader(os.path.join(DATA_DIR, filename), encoding="utf-8")
    docs = loader.load()
    chunks = text_splitter.split_documents(docs)
    all_docs.extend(chunks)

if not all_docs:
    # Building a store from zero chunks either fails with an opaque
    # downstream error or reports success with nothing stored; stop early
    # with an actionable message instead.
    raise SystemExit(f"No .txt content found in {DATA_DIR!r}; nothing to ingest")

db = Chroma.from_documents(all_docs, embedding, persist_directory=CHROMA_DIR)
db.persist()

print("✅ Ingestion complete")