# fraudoo / vector_service.py
# Uploaded by obaes ("Upload 10 files", commit 71680bc, verified)
import chromadb
from chromadb.utils import embedding_functions
import os
import hashlib
class VectorService:
    """Keep document fingerprints in a persistent Chroma collection.

    Each stored entry consists of a short text proxy built from the file's
    base name (this is what gets embedded) plus metadata holding the file's
    SHA-256 digest and its path. The digest is used for exact-duplicate
    lookups; the embedded proxy text is used for a nearest-neighbour
    "similar document" lookup.
    """

    def __init__(self, db_path="./chroma_db"):
        """Open (or create) the ``document_fingerprints`` collection.

        Args:
            db_path: Directory handed to ``chromadb.PersistentClient``.
        """
        self.client = chromadb.PersistentClient(path=db_path)
        # Use a simple default embedding function or LLM Services if needed
        self.ef = embedding_functions.DefaultEmbeddingFunction()
        self.collection = self.client.get_or_create_collection(
            name="document_fingerprints",
            embedding_function=self.ef,
        )

    @staticmethod
    def _content_proxy(file_path):
        """Return the text that stands in for the document's content.

        Only the base name of the file is used for now; ideally this would be
        extracted text or a visual embedding. Both indexing and querying go
        through this helper so the two sides always embed the same text.
        """
        return f"Document: {os.path.basename(file_path)}"

    def get_file_hash(self, file_path):
        """Return the hex SHA-256 digest of the file at ``file_path``.

        The file is hashed in fixed-size chunks so large files are never
        loaded into memory in one piece.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        chunk_size = 1024 * 1024
        hasher = hashlib.sha256()
        with open(file_path, "rb") as f:
            for chunk in iter(lambda: f.read(chunk_size), b""):
                hasher.update(chunk)
        return hasher.hexdigest()

    def add_document(self, file_path, doc_id, metadata=None):
        """Add a document's representation to the vector store.

        Args:
            file_path: Path of the file to fingerprint.
            doc_id: Identifier under which the entry is stored.
            metadata: Optional extra metadata. It is copied, so the caller's
                dict is left untouched; the keys ``file_hash`` and
                ``file_path`` are set on the stored copy and override any
                caller-supplied values with the same names.
        """
        file_hash = self.get_file_hash(file_path)
        # Copy before adding our own keys so the caller's dict is not mutated.
        meta = dict(metadata) if metadata else {}
        meta["file_hash"] = file_hash
        meta["file_path"] = file_path
        self.collection.add(
            documents=[self._content_proxy(file_path)],
            metadatas=[meta],
            ids=[doc_id],
        )

    def find_duplicates(self, file_path, threshold=0.1):
        """Find whether this document, or a very similar one, is already stored.

        Args:
            file_path: Path of the file to check.
            threshold: A nearest neighbour counts as "too similar" when its
                distance is strictly below this value. The scale of the
                distance depends on how the collection is configured.

        Returns:
            ``{"type": "exact", "match": <metadata>}`` when an entry with the
            same SHA-256 digest exists; otherwise
            ``{"type": "semantic", "match": <metadata>, "distance": <float>}``
            when the closest entry is within ``threshold``; otherwise ``None``.
        """
        file_hash = self.get_file_hash(file_path)

        # 1. Exact match by hash
        exact = self.collection.get(where={"file_hash": file_hash})
        if exact and exact["ids"]:
            return {"type": "exact", "match": exact["metadatas"][0]}

        # 2. Semantic match (very simple proxy for now)
        nearest = self.collection.query(
            query_texts=[self._content_proxy(file_path)],
            n_results=1,
        )
        # Results are nested per query text; an empty inner list means the
        # collection returned no neighbour at all.
        if nearest and nearest["distances"] and nearest["distances"][0]:
            distance = nearest["distances"][0][0]
            if distance < threshold:
                return {
                    "type": "semantic",
                    "match": nearest["metadatas"][0][0],
                    "distance": distance,
                }
        return None

    def get_document(self, doc_id):
        """Retrieve a stored entry by ID.

        Returns:
            A dict with keys ``id``, ``document`` and ``metadata``, or
            ``None`` when no entry with that ID is found.
        """
        results = self.collection.get(ids=[doc_id])
        if results and results["ids"]:
            return {
                "id": results["ids"][0],
                "document": results["documents"][0],
                "metadata": results["metadatas"][0],
            }
        return None

    def delete_document(self, doc_id):
        """Delete an entry from the collection by ID.

        If the entry's metadata carries a ``file_path`` that exists on disk,
        that file is removed as well. File removal is best-effort: an
        ``OSError`` is reported on stdout and the collection entry is still
        deleted.

        Returns:
            Always ``True``.
        """
        doc = self.get_document(doc_id)
        if doc and "metadata" in doc:
            file_path = doc["metadata"].get("file_path")
            if file_path and os.path.exists(file_path):
                try:
                    os.remove(file_path)
                except OSError as e:
                    print(f"Error deleting file {file_path}: {e}")
        self.collection.delete(ids=[doc_id])
        return True