MANIT_Chat / server /classes /VectorStore.py
WizardCoder2007's picture
first commit
2e9afea
Raw
History Blame Contribute Delete
3.22 kB
import chromadb
import os
from typing import List,Dict,Any,Tuple
import numpy as np
from pathlib import Path
# Absolute directory that contains this module (symlinks resolved).
CURRENT_FILE_DIR = Path(__file__).resolve().parent
# One level above this module's directory.
PROJECT_ROOT = CURRENT_FILE_DIR.parent
# Default on-disk location of the vector store, as a plain string:
# <PROJECT_ROOT>/data/vector_store
PERSIST_DIRECTORY = str(PROJECT_ROOT.joinpath("data", "vector_store"))
class VectorStore:
    """Thin wrapper around a persistent ChromaDB collection of document embeddings.

    On construction the persistence directory is created if needed, a
    ``chromadb.PersistentClient`` is opened on it, and the named collection is
    fetched or created. Documents are then stored with ``add_documents``.

    Attributes:
        collection_name: Name of the ChromaDB collection in use.
        persist_directory: Directory where ChromaDB keeps its data.
        client: The ``chromadb.PersistentClient`` (``None`` until initialized).
        collection: The ChromaDB collection handle (``None`` until initialized).
    """

    def __init__(self, collection_name: str = "pdf_directory", persist_directory: str = PERSIST_DIRECTORY):
        """Store the configuration and open (or create) the collection.

        Args:
            collection_name: Name of the collection to fetch or create.
            persist_directory: Directory used by the persistent client.

        Raises:
            Exception: Whatever ``_initialize_store`` re-raises on failure.
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self) -> None:
        """Create the persistence directory, the client and the collection.

        The collection is requested with metadata that carries a description
        and sets ``hnsw:space`` to ``cosine``.

        Raises:
            Exception: Any error from directory creation or ChromaDB is
                reported on stdout and then re-raised unchanged.
        """
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF Document embeddings for RAG", "hnsw:space": "cosine"},
            )
            print(f"Vector embeddings initialized collection: {self.collection_name}")
            print(f"Exisiting documents in collection: {self.collection.count()}")
        except Exception as e:
            # Include the cause in the message; the bare text alone hid it.
            print(f"erorr in initializing vector store: {e}")
            raise

    @staticmethod
    def _clean_metadata(metadata: Dict[str, Any]) -> Dict[str, Any]:
        """Return a copy of ``metadata`` restricted to ChromaDB-safe values.

        ``None`` values are dropped; ``str``/``int``/``float``/``bool`` values
        are kept as they are; anything else (lists, dicts, ...) is stringified.
        """
        cleaned: Dict[str, Any] = {}
        for key, value in metadata.items():
            if value is None:
                continue
            # ChromaDB only accepts str, int, float, bool as metadata values.
            if isinstance(value, (str, int, bool, float)):
                cleaned[key] = value
            else:
                cleaned[key] = str(value)
        return cleaned

    def add_documents(self, documents: List[Any], embeddings: np.ndarray) -> None:
        """Add documents and their embeddings to the collection.

        Each document must expose ``page_content`` (the text) and ``metadata``
        (a dict containing a ``'chunk_id'`` entry, which becomes the record ID).
        The stored metadata is the sanitized document metadata plus ``doc_id``,
        ``doc_index`` (position in this batch) and ``content_length``.

        Args:
            documents: Documents to store.
            embeddings: One embedding per document, in the same order. Rows may
                be numpy arrays or plain sequences of numbers.

        Raises:
            ValueError: If the number of documents and embeddings differ.
            KeyError: If a document's metadata has no ``'chunk_id'``.
            Exception: Any error raised by the collection's ``add`` call is
                reported on stdout and then re-raised unchanged.
        """
        if len(embeddings) != len(documents):
            raise ValueError("Number of documents must match number of embeddings")

        # An empty batch is a no-op; ChromaDB rejects empty ID lists, so do not
        # forward it to the collection.
        if len(documents) == 0:
            print("No documents to add to vector store")
            return

        print(f"Adding {len(embeddings)} documents to vector store...")

        # Prepare the parallel lists expected by ChromaDB.
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # IDs come from the chunk's own 'chunk_id' metadata rather than
            # being generated here; ChromaDB requires string IDs, so coerce.
            doc_id = str(doc.metadata['chunk_id'])
            ids.append(doc_id)

            cleaned_metadata = self._clean_metadata(doc.metadata)
            cleaned_metadata['doc_id'] = doc_id
            cleaned_metadata['doc_index'] = i
            cleaned_metadata['content_length'] = int(len(doc.page_content))
            metadatas.append(cleaned_metadata)

            documents_text.append(doc.page_content)
            # np.asarray lets plain Python sequences through as well as arrays.
            embeddings_list.append(np.asarray(embedding).tolist())

        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text,
            )
            print(f"Success in adding {len(documents)} documents")
            print(f"No. of documents in vector store: {self.collection.count()}")
        except Exception as e:
            # Include the cause in the message before propagating.
            print(f"error in adding document to vector store: {e}")
            raise