import hashlib
import os
from typing import Any, Dict, List, Optional

import chromadb


class ChromaVectorDB:
    """Thin wrapper around a persistent ChromaDB collection named "documents".

    Each stored chunk carries a ``source`` metadata key (the originating file
    path) and a ``chunk_id`` key (its index within that file), which
    ``delete_document`` relies on to find a file's chunks again.
    """

    def __init__(self, db_path: str = "./data/chroma_db"):
        """Open (creating if needed) the on-disk database at ``db_path``.

        Args:
            db_path: Directory used by the persistent Chroma client. It is
                created if it does not exist.
        """
        os.makedirs(db_path, exist_ok=True)
        self.client = chromadb.PersistentClient(path=db_path)
        self.collection = self.client.get_or_create_collection("documents")

    def add_document(
        self,
        file_path: str,
        text_chunks: List[str],
        metadata: Optional[Dict[str, Any]] = None,
    ) -> List[str]:
        """Add the chunks of one document to the collection.

        Args:
            file_path: Path identifying the source document; stored under the
                ``source`` metadata key and used to derive chunk IDs.
            text_chunks: The chunk texts, in document order.
            metadata: Optional extra metadata merged into every chunk's
                metadata. NOTE: keys given here override the built-in
                ``source`` / ``chunk_id`` keys; overriding ``source`` means
                ``delete_document(file_path)`` will no longer match the chunks.

        Returns:
            The IDs assigned to the chunks, in the same order as
            ``text_chunks`` (an empty list when there are no chunks).
        """
        # The collection's add() is not handed empty lists: with nothing to
        # store there is nothing to do, so return early.
        if not text_chunks:
            return []

        # IDs are a deterministic hash of "<file_path>_<index>", so the same
        # file/index pair always maps to the same ID. MD5 is used purely as a
        # fingerprint here, not for security.
        ids = [
            hashlib.md5(f"{file_path}_{index}".encode()).hexdigest()
            for index, _ in enumerate(text_chunks)
        ]

        extra = metadata or {}
        metadatas = [
            {"source": file_path, "chunk_id": index, **extra}
            for index, _ in enumerate(text_chunks)
        ]

        self.collection.add(
            documents=text_chunks,
            metadatas=metadatas,
            ids=ids,
        )
        return ids

    def search(self, query: str, n_results: int = 5):
        """Query the collection for the chunks most relevant to ``query``.

        Args:
            query: Free-text query string.
            n_results: Maximum number of results requested.

        Returns:
            The result object returned by the collection's ``query`` call,
            unmodified.
        """
        return self.collection.query(
            query_texts=[query],
            n_results=n_results,
        )

    def delete_document(self, file_path: str) -> None:
        """Delete every chunk whose ``source`` metadata equals ``file_path``.

        Does nothing when no chunk matches.
        """
        # Only the IDs are needed, so skip loading documents and metadata.
        matches = self.collection.get(
            where={"source": file_path},
            include=[],
        )
        if matches and matches["ids"]:
            self.collection.delete(ids=matches["ids"])

    def reset_collection(self) -> bool:
        """Remove all documents from the collection.

        First tries to delete every stored ID. If that fails, falls back to
        dropping and recreating the "documents" collection.

        Returns:
            True if the collection was emptied (or recreated), False if both
            the deletion and the fallback recreation failed.
        """
        try:
            # Only the IDs are needed, so skip loading documents and metadata.
            all_ids = self.collection.get(include=[])["ids"]
            if all_ids:
                self.collection.delete(ids=all_ids)
                print(f"Deleted {len(all_ids)} documents from collection")
            else:
                print("Collection is already empty")
            return True
        except Exception as e:
            print(f"Error getting or deleting documents: {str(e)}")

        # Fallback: drop the collection entirely and create a fresh one.
        try:
            self.client.delete_collection("documents")
            self.collection = self.client.get_or_create_collection("documents")
            print("Collection recreated successfully")
            return True
        except Exception as e2:
            print(f"Error recreating collection: {str(e2)}")
            return False