| import chromadb |
| from chromadb.utils import embedding_functions |
| import os |
| import hashlib |
|
|
class VectorService:
    """Track document fingerprints in a persistent Chroma collection.

    Each stored record consists of a short fingerprint text derived from the
    file's basename plus metadata holding the file's SHA-256 digest
    (``file_hash``) and the path it was registered under (``file_path``).
    """

    # Bytes read per iteration while hashing, so large files are never
    # loaded into memory in one piece.
    _HASH_CHUNK_SIZE = 1024 * 1024

    def __init__(self, db_path="./chroma_db"):
        """Create the Chroma client and the fingerprint collection.

        Args:
            db_path: Path handed to ``chromadb.PersistentClient``.
        """
        self.client = chromadb.PersistentClient(path=db_path)

        self.ef = embedding_functions.DefaultEmbeddingFunction()
        self.collection = self.client.get_or_create_collection(
            name="document_fingerprints",
            embedding_function=self.ef,
        )

    @staticmethod
    def _fingerprint_text(file_path):
        """Return the text that is embedded for *file_path* (basename only)."""
        return f"Document: {os.path.basename(file_path)}"

    def get_file_hash(self, file_path):
        """Generate a hash for exact match detection.

        Args:
            file_path: Path of the file to hash.

        Returns:
            The hexadecimal SHA-256 digest of the file's bytes.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        hasher = hashlib.sha256()
        with open(file_path, "rb") as f:
            # iter() with a sentinel stops at the first empty read (EOF).
            for chunk in iter(lambda: f.read(self._HASH_CHUNK_SIZE), b""):
                hasher.update(chunk)
        return hasher.hexdigest()

    def add_document(self, file_path, doc_id, metadata=None):
        """Add a document's representation to the vector store.

        Args:
            file_path: Path of the file to register; it is hashed and its
                basename is used to build the embedded text.
            doc_id: Identifier under which the record is stored.
            metadata: Optional extra metadata. It is copied, so the caller's
                mapping is never modified. The keys ``file_hash`` and
                ``file_path`` are always overwritten in the stored copy.
        """
        content = self._fingerprint_text(file_path)
        file_hash = self.get_file_hash(file_path)

        # Copy before adding our keys: the original code wrote into the
        # caller's dict, leaking "file_hash"/"file_path" back to the caller.
        meta = dict(metadata) if metadata else {}
        meta["file_hash"] = file_hash
        meta["file_path"] = file_path

        self.collection.add(
            documents=[content],
            metadatas=[meta],
            ids=[doc_id],
        )

    def find_duplicates(self, file_path, max_distance=0.1):
        """Find if a document or a very similar one exists.

        An exact match is looked up first by SHA-256 digest. If none is
        found, the collection is queried with the fingerprint text; note that
        this text is built from the file's basename only, so the "semantic"
        match compares names rather than file contents.

        Args:
            file_path: Path of the candidate file.
            max_distance: A nearest neighbour counts as a semantic match only
                when its reported distance is strictly below this value.

        Returns:
            ``{"type": "exact", "match": <metadata>}`` for a digest match,
            ``{"type": "semantic", "match": <metadata>, "distance": <d>}``
            for a close neighbour, or ``None`` when neither is found.
        """
        file_hash = self.get_file_hash(file_path)

        exact = self.collection.get(where={"file_hash": file_hash})
        if exact and exact["ids"]:
            return {"type": "exact", "match": exact["metadatas"][0]}

        nearest = self.collection.query(
            query_texts=[self._fingerprint_text(file_path)],
            n_results=1,
        )

        # Results are nested per query text; we sent one text and asked for
        # one neighbour, hence the [0][0] indexing.
        if nearest and nearest["distances"] and nearest["distances"][0]:
            distance = nearest["distances"][0][0]
            if distance < max_distance:
                return {
                    "type": "semantic",
                    "match": nearest["metadatas"][0][0],
                    "distance": distance,
                }

        return None

    def get_document(self, doc_id):
        """Retrieve a document and its metadata by ID.

        Args:
            doc_id: Identifier of the record to fetch.

        Returns:
            A dict with the keys ``id``, ``document`` and ``metadata``, or
            ``None`` when the collection returns no record for *doc_id*.
        """
        results = self.collection.get(ids=[doc_id])
        if results and results["ids"]:
            return {
                "id": results["ids"][0],
                "document": results["documents"][0],
                "metadata": results["metadatas"][0],
            }
        return None

    def delete_document(self, doc_id):
        """Delete a document from the collection by ID.

        WARNING: besides removing the record, this also deletes the file on
        disk referenced by the record's ``file_path`` metadata, if any.
        File removal is best-effort: a missing file is ignored and any other
        failure is printed, and the record is deleted regardless.

        Args:
            doc_id: Identifier of the record to delete.

        Returns:
            Always ``True``, including when no record exists for *doc_id*.
        """
        doc = self.get_document(doc_id)
        if doc:
            # A stored record may carry no metadata at all (None).
            metadata = doc.get("metadata") or {}
            file_path = metadata.get("file_path")
            if file_path:
                # EAFP: attempt the removal directly instead of checking
                # os.path.exists first, which is racy.
                try:
                    os.remove(file_path)
                except FileNotFoundError:
                    pass
                except (OSError, ValueError) as e:
                    print(f"Error deleting file {file_path}: {e}")

        self.collection.delete(ids=[doc_id])
        return True
|
|