Spaces:
Running
Running
| import chromadb | |
| import os | |
| from typing import List,Dict,Any,Tuple | |
| import numpy as np | |
| from pathlib import Path | |
CURRENT_FILE_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = CURRENT_FILE_DIR.parent
# Default on-disk location of the persistent store: <project root>/data/vector_store
PERSIST_DIRECTORY = str(PROJECT_ROOT / "data" / "vector_store")


class VectorStore:
    """Thin wrapper around a persistent ChromaDB collection of document embeddings.

    On construction the persistence directory is created if needed and the
    named collection is opened (or created) with cosine distance configured
    through the ``hnsw:space`` collection metadata.
    """

    def __init__(self, collection_name: str = "pdf_directory", persist_directory: str = PERSIST_DIRECTORY):
        """Open (or create) the collection ``collection_name`` under ``persist_directory``.

        Args:
            collection_name: Name of the ChromaDB collection to use.
            persist_directory: Directory where ChromaDB persists its data.

        Raises:
            Exception: Whatever directory creation or ChromaDB raises during
                initialization is re-raised unchanged.
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Create the persistence directory, the client and the collection."""
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF Document embeddings for RAG", "hnsw:space": "cosine"},
            )
            print(f"Vector embeddings initialized collection: {self.collection_name}")
            print(f"Exisiting documents in collection: {self.collection.count()}")
        except Exception as e:
            # Include the cause in the message; the exception is still re-raised.
            print(f"Error initializing vector store: {e}")
            raise

    @staticmethod
    def _clean_metadata(metadata: Dict[str, Any]) -> Dict[str, Any]:
        """Return a copy of ``metadata`` restricted to str/int/float/bool values.

        ``None`` values are dropped, NumPy integer/float/bool scalars are
        unwrapped to native Python scalars, and every other value (lists,
        dicts, ...) is stringified.
        """
        cleaned = {}
        for key, value in metadata.items():
            if value is None:
                continue
            if isinstance(value, (np.integer, np.floating, np.bool_)):
                # Keep numeric/bool typing instead of turning e.g. np.int64(7) into "7".
                value = value.item()
            if isinstance(value, (str, int, float, bool)):
                cleaned[key] = value
            else:
                cleaned[key] = str(value)
        return cleaned

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """Add documents and their embeddings to the collection.

        Each document must expose ``metadata`` (a dict containing a
        ``chunk_id``) and ``page_content`` (the text). The ``chunk_id`` is
        used, as a string, for the record id. The stored metadata is the
        cleaned document metadata plus ``doc_id``, ``doc_index`` (position in
        this call) and ``content_length``.

        Args:
            documents: Documents to store.
            embeddings: One embedding per document, in the same order. A NumPy
                array or any sequence of array-likes is accepted.

        Raises:
            ValueError: If the number of documents and embeddings differ.
            KeyError: If a document has no (or a ``None``) ``chunk_id``.
            Exception: Any error raised by ``collection.add`` is re-raised.
        """
        if len(embeddings) != len(documents):
            raise ValueError("Number of documents must match number of embeddings")
        if len(documents) == 0:
            # Nothing to store; avoid calling collection.add() with empty lists.
            print("No documents to add to vector store")
            return
        print(f"Adding {len(embeddings)} documents to vector store...")

        # Prepare the parallel lists expected by collection.add().
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            chunk_id = doc.metadata.get("chunk_id")
            if chunk_id is None:
                raise KeyError(f"Document at index {i} has no 'chunk_id' in its metadata")
            doc_id = str(chunk_id)  # record ids are passed to the collection as strings
            ids.append(doc_id)

            cleaned_metadata = self._clean_metadata(doc.metadata)
            cleaned_metadata["doc_id"] = doc_id
            cleaned_metadata["doc_index"] = i
            cleaned_metadata["content_length"] = len(doc.page_content)
            metadatas.append(cleaned_metadata)

            documents_text.append(doc.page_content)
            # np.asarray lets plain lists/tuples work as well as ndarray rows.
            embeddings_list.append(np.asarray(embedding).tolist())

        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text,
            )
            print(f"Success in adding {len(documents)} documents")
            print(f"No. of documents in vector store: {self.collection.count()}")
        except Exception as e:
            # Include the cause in the message; the exception is still re-raised.
            print(f"Error adding documents to vector store: {e}")
            raise