Spaces:

kith777
/

rag_agent

Paused

App Files Files Community

rag_agent / knowledge_base /chroma.py

kith777

first commit

067cdc9 17 days ago

raw

history blame

2.29 kB

	import os
	# Core LangChain components
	from langchain_text_splitters import RecursiveCharacterTextSplitter
	from langchain_community.document_loaders.text import TextLoader
	from langchain_community.document_loaders.directory import DirectoryLoader
	from langchain_huggingface import HuggingFaceEmbeddings
	from langchain_chroma import Chroma

	from config import configs

	def main():
	    """Build the Chroma vector store from the markdown knowledge base.

	    Pipeline:
	      1. Load every ``*.md`` file directly under ``configs["DATA_PATH"]``.
	      2. Split the documents into overlapping character chunks.
	      3. Embed the chunks with the HuggingFace model named by
	         ``configs["EMBEDDING_MODEL_NAME"]``.
	      4. Store them in the Chroma collection ``configs["COLLECTION_NAME"]``
	         under ``configs["PERSIST_PATH"]``.

	    Raises:
	        SystemExit: with a non-zero status when no documents are loaded or
	            when splitting yields no chunks, so callers (shell scripts, CI)
	            can detect the failure.
	    """
	    # --- 1. Load Documents ---
	    print("Loading documents from directory...")
	    loader = DirectoryLoader(
	        path=configs["DATA_PATH"],
	        glob="*.md",
	        loader_cls=TextLoader,
	        silent_errors=True,  # Set to False if you want to see loader errors
	    )

	    raw_documents = loader.load()
	    if not raw_documents:
	        # Raise SystemExit instead of calling the interactive `exit()` helper:
	        # the message goes to stderr and the process exits with status 1
	        # rather than reporting success.
	        raise SystemExit(
	            f"Error: No documents found in {configs['DATA_PATH']}. Check your path and file types."
	        )

	    # --- 2. Split Documents into Chunks ---
	    print(f"Loaded {len(raw_documents)} raw documents. Splitting into chunks...")
	    # Recursive splitting tries the coarsest separator first (paragraphs,
	    # then lines, then words) so chunks keep as much context as possible.
	    text_splitter = RecursiveCharacterTextSplitter(
	        chunk_size=1000,
	        chunk_overlap=200,
	        separators=["\n\n", "\n", " ", ""],  # Optimal separators for markdown/text
	    )

	    documents_to_embed = text_splitter.split_documents(raw_documents)
	    print(f"Split into {len(documents_to_embed)} chunks.")
	    if not documents_to_embed:
	        # Nothing to embed (e.g. every file was empty); stop before loading
	        # the embedding model and handing Chroma an empty batch.
	        raise SystemExit(
	            f"Error: Documents in {configs['DATA_PATH']} produced no chunks to embed."
	        )

	    # --- 3. Define Custom Embedding Model ---
	    print(f"Initializing custom embedding model: {configs['EMBEDDING_MODEL_NAME']}...")
	    dense_embeddings = HuggingFaceEmbeddings(
	        model_name=configs["EMBEDDING_MODEL_NAME"]
	    )

	    # --- 4. Create and Persist the Vector Store ---
	    print(f"Creating Chroma vector store and persisting data to {configs['PERSIST_PATH']}...")
	    # NOTE: `langchain_chroma.Chroma` has no `persist()` method; passing
	    # `persist_directory` is enough, the data is written to disk
	    # automatically. Calling `.persist()` here would raise AttributeError.
	    Chroma.from_documents(
	        documents=documents_to_embed,  # The prepared Document chunks
	        embedding=dense_embeddings,
	        collection_name=configs["COLLECTION_NAME"],
	        persist_directory=configs["PERSIST_PATH"],
	    )

	    print("✅ Success: Chroma vector store created and data persisted.")
	    print(f"The vector database is now ready for query using the collection: '{configs['COLLECTION_NAME']}'")


	if __name__ == "__main__":
	    main()