# Ingestion script: load markdown documents, split them into chunks,
# embed them, and persist a Chroma vector store for later querying.
import os
# Core LangChain components
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.text import TextLoader
from langchain_community.document_loaders.directory import DirectoryLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from config import configs
if __name__ == "__main__":
    # --- Load documents ------------------------------------------------------
    print("Loading documents from directory...")
    loader = DirectoryLoader(
        path=configs["DATA_PATH"],
        glob="*.md",                # only top-level markdown files
        loader_cls=TextLoader,
        silent_errors=True,         # skip unreadable files instead of aborting
    )
    raw_documents = loader.load()
    if not raw_documents:
        print(f"Error: No documents found in {configs['DATA_PATH']}. Check your path and file types.")
        # Bug fix: bare exit() returns status 0; signal failure to callers.
        raise SystemExit(1)

    # --- Split documents into overlapping chunks -----------------------------
    print(f"Loaded {len(raw_documents)} raw documents. Splitting into chunks...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,          # overlap preserves context across chunk edges
        separators=["\n\n", "\n", " ", ""],
    )
    documents_to_embed = text_splitter.split_documents(raw_documents)
    print(f"Split into {len(documents_to_embed)} chunks.")

    # --- Embed chunks and persist the vector store ---------------------------
    print(f"Initializing custom embedding model: {configs['EMBEDDING_MODEL_NAME']}...")
    dense_embeddings = HuggingFaceEmbeddings(
        model_name=configs["EMBEDDING_MODEL_NAME"]
    )
    print(f"Creating Chroma vector store and persisting data to {configs['PERSIST_PATH']}...")
    vectorstore = Chroma.from_documents(
        documents=documents_to_embed,       # the prepared Document chunks
        embedding=dense_embeddings,
        collection_name=configs["COLLECTION_NAME"],
        persist_directory=configs["PERSIST_PATH"],
    )
    # Bug fix: the original success message contained a mis-encoded emoji that
    # split the string literal across two lines (a SyntaxError).
    print("Success: Chroma vector store created and data persisted.")
    print(f"The vector database is now ready for query using the collection: '{configs['COLLECTION_NAME']}'")