import chromadb
from chromadb.utils import embedding_functions
import os
import hashlib

class VectorService:
    """Store document fingerprints in a persistent Chroma collection.

    Each record holds a short text proxy for the document (currently just
    its file name) plus metadata containing the SHA-256 of the file bytes
    and the file path. The hash is used for exact-duplicate detection and
    the embedded text proxy for a rough similarity lookup.
    """

    # Default upper bound on the query distance for a record to be reported
    # as "too similar" by find_duplicates().
    SIMILARITY_THRESHOLD = 0.1

    # Number of bytes read per iteration while hashing, so large files are
    # never loaded into memory in one piece.
    _HASH_CHUNK_SIZE = 1024 * 1024

    def __init__(self, db_path: str = "./chroma_db"):
        """Open (or create) the ``document_fingerprints`` collection.

        Args:
            db_path: Directory handed to ``chromadb.PersistentClient``.
        """
        self.client = chromadb.PersistentClient(path=db_path)
        # Chroma's default embedding function; could be swapped for an
        # LLM-service-backed one if needed.
        self.ef = embedding_functions.DefaultEmbeddingFunction()
        self.collection = self.client.get_or_create_collection(
            name="document_fingerprints",
            embedding_function=self.ef
        )

    @staticmethod
    def _content_proxy(file_path):
        """Return the text that stands in for the document's content.

        Kept in one place so that add_document() and find_duplicates()
        always embed exactly the same string for the same file.
        """
        return f"Document: {os.path.basename(file_path)}"

    def get_file_hash(self, file_path) -> str:
        """Return the SHA-256 hex digest of the file's bytes.

        The file is read in fixed-size chunks rather than all at once.

        Args:
            file_path: Path of the file to hash.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        hasher = hashlib.sha256()
        with open(file_path, 'rb') as f:
            for chunk in iter(lambda: f.read(self._HASH_CHUNK_SIZE), b""):
                hasher.update(chunk)
        return hasher.hexdigest()

    def add_document(self, file_path, doc_id, metadata=None):
        """Add a document's representation to the vector store.

        The embedded content is only a proxy built from the file name;
        ideally it would be extracted text or a visual embedding.

        Args:
            file_path: Path of the document on disk; it is hashed and
                recorded in the metadata.
            doc_id: Identifier under which the record is stored.
            metadata: Optional extra metadata. It is copied, never
                modified; the ``file_hash`` and ``file_path`` keys are
                set on the copy and override any same-named keys.

        Raises:
            OSError: If the file cannot be read for hashing.
        """
        file_hash = self.get_file_hash(file_path)

        # Copy so the caller's dict is not mutated as a side effect.
        meta = dict(metadata) if metadata else {}
        meta["file_hash"] = file_hash
        # os.fspath() leaves str unchanged and converts path-like objects,
        # so the stored value is a plain path rather than a Path instance.
        meta["file_path"] = os.fspath(file_path)

        self.collection.add(
            documents=[self._content_proxy(file_path)],
            metadatas=[meta],
            ids=[doc_id]
        )

    def find_duplicates(self, file_path, threshold=None):
        """Find whether this document, or a very similar one, is stored.

        Args:
            file_path: Path of the candidate document.
            threshold: Distance below which the nearest record counts as
                a match. Defaults to ``SIMILARITY_THRESHOLD``.

        Returns:
            ``{"type": "exact", "match": <metadata>}`` when a record with
            the same file hash exists; otherwise
            ``{"type": "semantic", "match": <metadata>, "distance": d}``
            when the nearest record is closer than the threshold;
            otherwise ``None``.

        Raises:
            OSError: If the file cannot be read for hashing.
        """
        if threshold is None:
            threshold = self.SIMILARITY_THRESHOLD

        file_hash = self.get_file_hash(file_path)

        # 1. Exact match by hash
        exact = self.collection.get(where={"file_hash": file_hash})
        if exact and exact['ids']:
            return {"type": "exact", "match": exact['metadatas'][0]}

        # 2. Similarity match. NOTE: the embedded text is only the file
        # name proxy, so this compares file names, not document content.
        nearest = self.collection.query(
            query_texts=[self._content_proxy(file_path)],
            n_results=1
        )

        if nearest and nearest['distances'] and nearest['distances'][0]:
            distance = nearest['distances'][0][0]
            if distance < threshold:
                return {
                    "type": "semantic",
                    "match": nearest['metadatas'][0][0],
                    "distance": distance,
                }

        return None

    def get_document(self, doc_id):
        """Retrieve a document and its metadata by ID.

        Returns:
            A dict with ``id``, ``document`` and ``metadata`` keys, or
            ``None`` when no record has the given ID.
        """
        results = self.collection.get(ids=[doc_id])
        if results and results['ids']:
            return {
                "id": results['ids'][0],
                "document": results['documents'][0],
                "metadata": results['metadatas'][0]
            }
        return None

    def delete_document(self, doc_id) -> bool:
        """Delete a document from the collection by ID.

        If the record's metadata has a ``file_path`` that exists on disk,
        that file is removed first. A failure to remove the file is
        reported but does not stop the record from being deleted.

        Returns:
            Always ``True``.
        """
        doc = self.get_document(doc_id)
        # A record may carry no metadata at all; treat that as "no file".
        meta = (doc or {}).get('metadata') or {}
        file_path = meta.get('file_path')
        if file_path and os.path.exists(file_path):
            try:
                os.remove(file_path)
            except OSError as e:
                # Best effort: the file is secondary to the index record.
                print(f"Error deleting file {file_path}: {e}")

        self.collection.delete(ids=[doc_id])
        return True