Spaces:

obaes
/

fraudoo

Sleeping

App Files Files Community

fraudoo / vector_service.py

obaes

Upload 10 files

71680bc verified about 2 months ago

raw

history blame contribute delete

3.42 kB

	import chromadb
	from chromadb.utils import embedding_functions
	import os
	import hashlib

	class VectorService:
	    """Duplicate-detection helper backed by a persistent ChromaDB collection.

	    Each stored record pairs a short text proxy (derived from the file name)
	    with metadata holding the file's SHA-256 hash and its path. The hash
	    enables exact-match lookups; the embedded text proxy enables a coarse
	    similarity lookup.
	    """

	    # Files are hashed in chunks of this many bytes so that large files are
	    # never loaded into memory in one piece.
	    _HASH_CHUNK_SIZE = 64 * 1024

	    def __init__(self, db_path="./chroma_db"):
	        """Open (or create) the on-disk store and the fingerprint collection.

	        Args:
	            db_path: Directory passed to ``chromadb.PersistentClient``.
	        """
	        self.client = chromadb.PersistentClient(path=db_path)
	        # Use a simple default embedding function or LLM Services if needed
	        self.ef = embedding_functions.DefaultEmbeddingFunction()
	        self.collection = self.client.get_or_create_collection(
	            name="document_fingerprints",
	            embedding_function=self.ef,
	        )

	    def get_file_hash(self, file_path):
	        """Return the hex SHA-256 digest of the file's bytes.

	        The file is read in fixed-size chunks, so memory use does not grow
	        with file size.

	        Args:
	            file_path: Path of the file to hash.

	        Returns:
	            The digest as a lowercase hexadecimal string.

	        Raises:
	            OSError: If the file cannot be opened or read.
	        """
	        hasher = hashlib.sha256()
	        with open(file_path, 'rb') as f:
	            # iter() with a sentinel stops at the first empty read (EOF).
	            for chunk in iter(lambda: f.read(self._HASH_CHUNK_SIZE), b""):
	                hasher.update(chunk)
	        return hasher.hexdigest()

	    def add_document(self, file_path, doc_id, metadata=None):
	        """Add a document's representation to the vector store.

	        Args:
	            file_path: Path of the file to register; it is hashed and its
	                base name is used as the embedded text.
	            doc_id: ID under which the record is stored.
	            metadata: Optional extra metadata. The mapping is copied, so the
	                caller's object is never modified. The keys ``file_hash``
	                and ``file_path`` are always set by this method.
	        """
	        # For documents, we might want to extract text or just use
	        # metadata/hashes. Here we use the filename as a 'content' proxy for
	        # now, but ideally we'd use extracted text or visual embeddings.
	        content = f"Document: {os.path.basename(file_path)}"
	        file_hash = self.get_file_hash(file_path)

	        # Copy before adding our keys: writing into the caller's dict would
	        # leak file_hash/file_path back into the caller's data.
	        meta = dict(metadata) if metadata else {}
	        meta["file_hash"] = file_hash
	        meta["file_path"] = file_path

	        self.collection.add(
	            documents=[content],
	            metadatas=[meta],
	            ids=[doc_id],
	        )

	    def find_duplicates(self, file_path, threshold=0.1):
	        """Find if a document or a very similar one exists.

	        Args:
	            file_path: Path of the candidate file.
	            threshold: A nearest-neighbour distance strictly below this
	                value counts as a semantic match. Defaults to 0.1.

	        Returns:
	            ``{"type": "exact", "match": <metadata>}`` when a stored record
	            has the same SHA-256 hash; otherwise
	            ``{"type": "semantic", "match": <metadata>, "distance": <d>}``
	            when the closest record is within ``threshold``; otherwise
	            ``None``.
	        """
	        file_hash = self.get_file_hash(file_path)

	        # 1. Exact match by hash
	        results = self.collection.get(where={"file_hash": file_hash})
	        if results and results['ids']:
	            return {"type": "exact", "match": results['metadatas'][0]}

	        # 2. Semantic match (very simple proxy for now)
	        content = f"Document: {os.path.basename(file_path)}"
	        results = self.collection.query(
	            query_texts=[content],
	            n_results=1,
	        )

	        # Query results are nested one list per query text; we sent exactly
	        # one, so index [0] selects its (possibly empty) list of hits.
	        if results and results['distances'] and results['distances'][0]:
	            distance = results['distances'][0][0]
	            if distance < threshold:
	                return {
	                    "type": "semantic",
	                    "match": results['metadatas'][0][0],
	                    "distance": distance,
	                }

	        return None

	    def get_document(self, doc_id):
	        """Retrieve a document and its metadata by ID.

	        Args:
	            doc_id: ID of the record to fetch.

	        Returns:
	            A dict with keys ``id``, ``document`` and ``metadata``, or
	            ``None`` when no record has that ID.
	        """
	        results = self.collection.get(ids=[doc_id])
	        if results and results['ids']:
	            return {
	                "id": results['ids'][0],
	                "document": results['documents'][0],
	                "metadata": results['metadatas'][0],
	            }
	        return None

	    def delete_document(self, doc_id):
	        """Delete a document from the collection by ID.

	        If the stored metadata carries a ``file_path`` that exists on disk,
	        that file is removed as well. File removal is best-effort: a failure
	        is printed and the record is still deleted from the collection.

	        Args:
	            doc_id: ID of the record to delete.

	        Returns:
	            Always ``True``.
	        """
	        doc = self.get_document(doc_id)
	        # A record may have been stored without metadata, in which case the
	        # store hands back None; treat that the same as "no file to remove".
	        metadata = (doc.get('metadata') if doc else None) or {}
	        file_path = metadata.get('file_path')
	        if file_path and os.path.exists(file_path):
	            try:
	                os.remove(file_path)
	            except OSError as e:
	                print(f"Error deleting file {file_path}: {e}")

	        self.collection.delete(ids=[doc_id])
	        return True