import hashlib
import os
from typing import Any, Dict, Optional

import chromadb
from chromadb.utils import embedding_functions

# Files are hashed in fixed-size chunks so that large documents are never
# loaded into memory in one piece.
_HASH_CHUNK_SIZE = 1024 * 1024


class VectorService:
    """Store document fingerprints in a persistent Chroma collection.

    Each document is recorded with a SHA-256 hash of its bytes (for exact
    duplicate detection) and a short text proxy built from its filename
    (for a rough similarity lookup).
    """

    def __init__(self, db_path="./chroma_db"):
        """Open (or create) the persistent store and its collection.

        Args:
            db_path: Directory passed to ``chromadb.PersistentClient``.
        """
        self.client = chromadb.PersistentClient(path=db_path)
        # Use a simple default embedding function or LLM Services if needed
        self.ef = embedding_functions.DefaultEmbeddingFunction()
        self.collection = self.client.get_or_create_collection(
            name="document_fingerprints",
            embedding_function=self.ef,
        )

    def get_file_hash(self, file_path: str) -> str:
        """Generate a hash for exact match detection.

        Args:
            file_path: Path of the file to hash.

        Returns:
            The hexadecimal SHA-256 digest of the file's bytes.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        hasher = hashlib.sha256()
        with open(file_path, "rb") as f:
            # Feed the hasher chunk by chunk; the digest is identical to
            # hashing the whole content at once, but memory use stays bounded.
            for chunk in iter(lambda: f.read(_HASH_CHUNK_SIZE), b""):
                hasher.update(chunk)
        return hasher.hexdigest()

    def add_document(
        self,
        file_path: str,
        doc_id: str,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Add a document's representation to the vector store.

        Args:
            file_path: Path of the document on disk; it is hashed and also
                stored in the record's metadata.
            doc_id: Identifier under which the record is stored.
            metadata: Optional extra metadata. The mapping is copied, so the
                caller's dict is never modified.
        """
        # For documents, we might want to extract text or just use
        # metadata/hashes. Here we use the filename as a 'content' proxy for
        # now, but ideally we'd use extracted text or visual embeddings.
        content = f"Document: {os.path.basename(file_path)}"

        # Copy before adding our own keys so the caller's dict stays untouched.
        meta = dict(metadata) if metadata else {}
        meta["file_hash"] = self.get_file_hash(file_path)
        meta["file_path"] = file_path

        self.collection.add(
            documents=[content],
            metadatas=[meta],
            ids=[doc_id],
        )

    def find_duplicates(
        self,
        file_path: str,
        threshold: float = 0.1,
    ) -> Optional[Dict[str, Any]]:
        """Find if a document or a very similar one exists.

        Args:
            file_path: Path of the candidate document.
            threshold: Distance below which the nearest stored record counts
                as 'too similar'. Defaults to 0.1.

        Returns:
            ``{"type": "exact", "match": <metadata>}`` when a record with the
            same file hash exists; ``{"type": "semantic", "match": <metadata>,
            "distance": <float>}`` when the nearest record is closer than
            ``threshold``; otherwise ``None``.
        """
        file_hash = self.get_file_hash(file_path)

        # 1. Exact match by hash
        exact = self.collection.get(where={"file_hash": file_hash})
        if exact and exact["ids"]:
            return {"type": "exact", "match": exact["metadatas"][0]}

        # 2. Semantic match (very simple proxy for now)
        content = f"Document: {os.path.basename(file_path)}"
        nearest = self.collection.query(query_texts=[content], n_results=1)
        # Query results are nested one list per query text; an empty inner
        # list means there was nothing to compare against.
        if nearest and nearest["distances"] and nearest["distances"][0]:
            distance = nearest["distances"][0][0]
            if distance < threshold:
                return {
                    "type": "semantic",
                    "match": nearest["metadatas"][0][0],
                    "distance": distance,
                }

        return None

    def get_document(self, doc_id: str) -> Optional[Dict[str, Any]]:
        """Retrieve a document and its metadata by ID.

        Args:
            doc_id: Identifier of the record to fetch.

        Returns:
            A dict with keys ``"id"``, ``"document"`` and ``"metadata"``, or
            ``None`` when no record with that ID exists.
        """
        results = self.collection.get(ids=[doc_id])
        if results and results["ids"]:
            return {
                "id": results["ids"][0],
                "document": results["documents"][0],
                "metadata": results["metadatas"][0],
            }
        return None

    def delete_document(self, doc_id: str) -> bool:
        """Delete a document from the collection by ID.

        If the record's metadata names a file that still exists on disk, that
        file is removed first. A failure to remove the file is reported on
        stdout and does not prevent the record from being deleted.

        Args:
            doc_id: Identifier of the record to delete.

        Returns:
            Always ``True``.
        """
        # Optional: Delete the actual file from storage if you want
        doc = self.get_document(doc_id)
        if doc:
            # A record may carry no metadata at all; treat that as "no file".
            metadata = doc.get("metadata") or {}
            file_path = metadata.get("file_path")
            if file_path and os.path.exists(file_path):
                try:
                    os.remove(file_path)
                except OSError as e:
                    # Best effort: the record is still removed below.
                    print(f"Error deleting file {file_path}: {e}")

        self.collection.delete(ids=[doc_id])
        return True