File size: 1,901 Bytes
067cdc9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
04d4d26
067cdc9
 
 
 
 
 
 
04d4d26
067cdc9
 
 
 
04d4d26
067cdc9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import os
# Core LangChain components
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.text import TextLoader
from langchain_community.document_loaders.directory import DirectoryLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

from config import configs

def main() -> None:
    """Build a Chroma vector store from the Markdown files in the data directory.

    Steps, in order:
      1. Load every file matching ``*.md`` under ``configs["DATA_PATH"]``.
      2. Split the loaded documents into overlapping character chunks.
      3. Embed the chunks with the HuggingFace model named by
         ``configs["EMBEDDING_MODEL_NAME"]``.
      4. Store them in the Chroma collection ``configs["COLLECTION_NAME"]``,
         using ``configs["PERSIST_PATH"]`` as the persist directory.

    Raises:
        SystemExit: with exit status 1 when the loader returns no documents.
    """
    print("Loading documents from directory...")
    loader = DirectoryLoader(
        path=configs["DATA_PATH"],
        glob="*.md",
        loader_cls=TextLoader,
        # NOTE(review): presumably makes the loader skip files it cannot read
        # instead of raising -- confirm against the DirectoryLoader docs.
        silent_errors=True,
    )

    raw_documents = loader.load()
    if not raw_documents:
        print(f"Error: No documents found in {configs['DATA_PATH']}. Check your path and file types.")
        # Raise SystemExit directly: the bare `exit()` helper is injected by
        # the `site` module (absent under `python -S` / frozen builds) and
        # would report exit status 0, hiding the failure from callers.
        raise SystemExit(1)

    # Split Documents into Chunks
    print(f"Loaded {len(raw_documents)} raw documents. Splitting into chunks...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        # Separators are listed from coarsest (blank line) to finest (any character).
        separators=["\n\n", "\n", " ", ""],
    )

    documents_to_embed = text_splitter.split_documents(raw_documents)
    print(f"Split into {len(documents_to_embed)} chunks.")

    print(f"Initializing custom embedding model: {configs['EMBEDDING_MODEL_NAME']}...")
    dense_embeddings = HuggingFaceEmbeddings(
        model_name=configs["EMBEDDING_MODEL_NAME"]
    )

    print(f"Creating Chroma vector store and persisting data to {configs['PERSIST_PATH']}...")
    # The returned store object is not needed afterwards; the call is made for
    # its effect of populating the collection in the persist directory.
    Chroma.from_documents(
        documents=documents_to_embed,  # The prepared Document chunks
        embedding=dense_embeddings,
        collection_name=configs["COLLECTION_NAME"],
        persist_directory=configs["PERSIST_PATH"],
    )

    print("✅ Success: Chroma vector store created and data persisted.")
    print(f"The vector database is now ready for query using the collection: '{configs['COLLECTION_NAME']}'")


if __name__ == "__main__":
    main()