|
|
import os |
|
|
|
|
|
from langchain_text_splitters import RecursiveCharacterTextSplitter |
|
|
from langchain_community.document_loaders.text import TextLoader |
|
|
from langchain_community.document_loaders.directory import DirectoryLoader |
|
|
from langchain_huggingface import HuggingFaceEmbeddings |
|
|
from langchain_chroma import Chroma |
|
|
|
|
|
from config import configs |
|
|
|
|
|
if __name__ == "__main__":
    # Ingestion script: load markdown documents, split them into overlapping
    # chunks, embed each chunk, and persist everything to a Chroma store so
    # it can later be queried under configs["COLLECTION_NAME"].

    print("Loading documents from directory...")
    # silent_errors=True makes the loader skip unreadable files instead of
    # aborting the whole run; glob="*.md" matches only the top level of
    # DATA_PATH (not subdirectories).
    loader = DirectoryLoader(
        path=configs["DATA_PATH"],
        glob="*.md",
        loader_cls=TextLoader,
        silent_errors=True,
    )

    raw_documents = loader.load()
    if not raw_documents:
        print(f"Error: No documents found in {configs['DATA_PATH']}. Check your path and file types.")
        # Exit with a non-zero status so shells/CI can detect the failure
        # (the bare exit() helper returned status 0 and is only guaranteed
        # to exist in interactive sessions).
        raise SystemExit(1)

    print(f"Loaded {len(raw_documents)} raw documents. Splitting into chunks...")
    # 1000-char chunks with a 200-char overlap keep context across chunk
    # boundaries; separators are tried in order, coarsest split first.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        separators=["\n\n", "\n", " ", ""],
    )

    documents_to_embed = text_splitter.split_documents(raw_documents)
    print(f"Split into {len(documents_to_embed)} chunks.")

    print(f"Initializing custom embedding model: {configs['EMBEDDING_MODEL_NAME']}...")
    dense_embeddings = HuggingFaceEmbeddings(
        model_name=configs["EMBEDDING_MODEL_NAME"]
    )

    print(f"Creating Chroma vector store and persisting data to {configs['PERSIST_PATH']}...")
    # Chroma.from_documents both embeds the chunks and writes the collection
    # to persist_directory, so no separate persist step is required.
    vectorstore = Chroma.from_documents(
        documents=documents_to_embed,
        embedding=dense_embeddings,
        collection_name=configs["COLLECTION_NAME"],
        persist_directory=configs["PERSIST_PATH"],
    )

    # NOTE(review): the original success message contained a mojibake'd
    # character and a literal line break inside the string (a SyntaxError);
    # restored as a single valid line.
    print("Success: Chroma vector store created and data persisted.")
    print(f"The vector database is now ready for query using the collection: '{configs['COLLECTION_NAME']}'")