| import os | |
| from langchain.document_loaders import TextLoader | |
| from langchain.text_splitter import CharacterTextSplitter | |
| from langchain.vectorstores import Chroma | |
| from langchain.embeddings import HuggingFaceEmbeddings | |
def update_vector_db(data_folder="new_data", db_path="chroma_store"):
    """Embed the ``.txt`` files in *data_folder* and append them to a Chroma store.

    Each regular file whose name ends in ``.txt`` is read as UTF-8, split
    into chunks (``chunk_size=500``, ``chunk_overlap=50``), embedded with the
    ``all-MiniLM-L6-v2`` HuggingFace model, added to the Chroma store
    persisted at *db_path*, and the store is then persisted.

    When the folder yields nothing to ingest, the function returns without
    loading the embedding model or opening the store.

    Args:
        data_folder: Directory scanned (non-recursively) for ``.txt`` files.
        db_path: Persistence directory of the Chroma store.

    Raises:
        FileNotFoundError: If *data_folder* does not exist (propagated from
            ``os.listdir``).
    """
    # Resolve the inputs first: an empty folder should cost nothing, and the
    # sort makes the ingestion order deterministic (os.listdir order is
    # arbitrary). Directories that merely end in ".txt" are skipped because
    # they cannot be loaded as text.
    txt_paths = sorted(
        path
        for path in (
            os.path.join(data_folder, name) for name in os.listdir(data_folder)
        )
        if path.endswith(".txt") and os.path.isfile(path)
    )
    if not txt_paths:
        return

    docs = []
    for path in txt_paths:
        docs.extend(TextLoader(path, encoding="utf-8").load())

    splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = splitter.split_documents(docs)
    if not chunks:
        # NOTE(review): Chroma appears to reject empty batches in at least
        # some versions, so never hand it an empty chunk list -- confirm
        # against the installed version.
        return

    # Build the (comparatively expensive) embedding model and open the store
    # only once there is something to add.
    embed = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    db = Chroma(persist_directory=db_path, embedding_function=embed)
    db.add_documents(chunks)
    db.persist()