kizfestchat / scripts /ingest_docs.py
Bur3hani's picture
added files
eb888e3
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader
import os
# Ingestion script: load every ``.txt`` file found directly under DATA_DIR,
# split it into overlapping chunks, embed the chunks, and store them in a
# Chroma index persisted at CHROMA_DIR.
DATA_DIR = "data"
CHROMA_DIR = "chroma_db"

embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

all_docs = []
# os.listdir() returns entries in arbitrary order; sorting makes the order in
# which chunks are handed to the vector store reproducible between runs.
for filename in sorted(os.listdir(DATA_DIR)):
    if not filename.endswith(".txt"):
        continue
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, which differs between operating systems.
    # NOTE(review): assumes the corpus is UTF-8 encoded — confirm.
    loader = TextLoader(os.path.join(DATA_DIR, filename), encoding="utf-8")
    docs = loader.load()
    chunks = text_splitter.split_documents(docs)
    all_docs.extend(chunks)

# Fail with a clear message rather than building an index from nothing and
# then reporting success.
if not all_docs:
    raise SystemExit(f"No .txt content found in {DATA_DIR!r}; nothing to ingest.")

db = Chroma.from_documents(all_docs, embedding, persist_directory=CHROMA_DIR)
db.persist()
print("✅ Ingestion complete")