"""Build a persistent Chroma vector store from a directory of markdown files.

Pipeline, driven entirely by the ``configs`` mapping imported from ``config``:

1. Load every ``*.md`` file found in ``configs["DATA_PATH"]``.
2. Split the loaded documents into overlapping text chunks.
3. Embed the chunks with the Hugging Face model named by
   ``configs["EMBEDDING_MODEL_NAME"]``.
4. Store them in the Chroma collection ``configs["COLLECTION_NAME"]``,
   backed by the on-disk directory ``configs["PERSIST_PATH"]``.

Run this file as a script; importing it has no side effects.
"""

import os
import sys

# Core LangChain components
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.text import TextLoader
from langchain_community.document_loaders.directory import DirectoryLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

from config import configs


def main() -> None:
    """Load, split, embed and persist the documents described by ``configs``.

    Exits the process with status 1 if no documents could be loaded.
    """
    # --- 1. Load Documents ---
    print("Loading documents from directory...")
    loader = DirectoryLoader(
        path=configs["DATA_PATH"],
        glob="*.md",
        loader_cls=TextLoader,
        silent_errors=True,  # Set to False if you want to see loader errors
    )
    raw_documents = loader.load()

    if not raw_documents:
        print(f"Error: No documents found in {configs['DATA_PATH']}. Check your path and file types.")
        # sys.exit instead of the bare ``exit()`` helper: ``exit`` is injected
        # by the ``site`` module and is not guaranteed to exist, and a failed
        # run should report a non-zero status to the calling shell.
        sys.exit(1)

    # --- 2. Split Documents into Chunks ---
    print(f"Loaded {len(raw_documents)} raw documents. Splitting into chunks...")

    # Recursive splitting tries each separator in order (paragraph, line,
    # word, then single characters), so chunks break at the coarsest boundary
    # that still fits within chunk_size.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        separators=["\n\n", "\n", " ", ""],
    )
    documents_to_embed = text_splitter.split_documents(raw_documents)
    print(f"Split into {len(documents_to_embed)} chunks.")

    # --- 3. Define Custom Embedding Model ---
    print(f"Initializing custom embedding model: {configs['EMBEDDING_MODEL_NAME']}...")
    dense_embeddings = HuggingFaceEmbeddings(
        model_name=configs["EMBEDDING_MODEL_NAME"]
    )

    # --- 4. Create and Persist the Vector Store ---
    print(f"Creating Chroma vector store and persisting data to {configs['PERSIST_PATH']}...")

    # No explicit persist() call: the langchain_chroma integration does not
    # provide that method, and passing persist_directory is what makes the
    # collection durable on disk.
    Chroma.from_documents(
        documents=documents_to_embed,  # The prepared Document chunks
        embedding=dense_embeddings,
        collection_name=configs["COLLECTION_NAME"],
        persist_directory=configs["PERSIST_PATH"],
    )

    print("✅ Success: Chroma vector store created and data persisted.")
    print(f"The vector database is now ready for query using the collection: '{configs['COLLECTION_NAME']}'")


if __name__ == "__main__":
    main()