File size: 825 Bytes
eb888e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader
import os

# Ingestion script: split every ``.txt`` file found directly inside DATA_DIR
# into chunks, embed the chunks, and store them in a Chroma collection that is
# persisted under CHROMA_DIR.

DATA_DIR = "data"  # directory scanned (non-recursively) for .txt files
CHROMA_DIR = "chroma_db"  # passed to Chroma as persist_directory

embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

all_docs = []

# os.listdir() returns entries in arbitrary order; sorting makes the order in
# which chunks are inserted reproducible from one run to the next.
for filename in sorted(os.listdir(DATA_DIR)):
    if not filename.endswith(".txt"):
        continue
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # encoding, which differs between operating systems and locales.
    loader = TextLoader(os.path.join(DATA_DIR, filename), encoding="utf-8")
    docs = loader.load()
    chunks = text_splitter.split_documents(docs)
    all_docs.extend(chunks)

# Stop with a clear message rather than handing an empty document list to the
# vector store and then reporting a successful ingestion.
if not all_docs:
    raise SystemExit(f"No .txt documents found in {DATA_DIR!r}; nothing to ingest.")

db = Chroma.from_documents(all_docs, embedding, persist_directory=CHROMA_DIR)
# NOTE(review): newer Chroma releases persist automatically and deprecate
# persist(); kept because the installed version is not visible here — confirm.
db.persist()
print("✅ Ingestion complete")