File size: 812 Bytes
0e5d52e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 |
import os
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
def update_vector_db(data_folder: str = "new_data", db_path: str = "chroma_store") -> None:
    """Ingest every ``.txt`` file in *data_folder* into the Chroma store at *db_path*.

    Each matching file is read as UTF-8, split into chunks of 500 characters
    with a 50-character overlap, embedded with the ``all-MiniLM-L6-v2``
    HuggingFace model, added to the store, and the store is persisted.

    If the folder yields no chunks the function returns without loading the
    embedding model or opening the store.

    Args:
        data_folder: Directory scanned (non-recursively) for ``.txt`` files.
        db_path: Persistence directory handed to Chroma.

    Raises:
        FileNotFoundError: If *data_folder* does not exist.
    """
    docs = []
    # os.listdir returns entries in arbitrary order; sort so that ingestion
    # order is reproducible across runs and platforms.
    for name in sorted(os.listdir(data_folder)):
        path = os.path.join(data_folder, name)
        # Skip other extensions, and directories that merely end in ".txt".
        if not name.endswith(".txt") or not os.path.isfile(path):
            continue
        docs.extend(TextLoader(path, encoding="utf-8").load())

    splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = splitter.split_documents(docs)
    if not chunks:
        # Nothing to ingest: avoid loading the embedding model and avoid
        # handing the vector store an empty batch.
        # NOTE(review): Chroma is believed to reject empty batches - confirm
        # against the installed version.
        return

    embed = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    db = Chroma(persist_directory=db_path, embedding_function=embed)
    db.add_documents(chunks)
    db.persist()