Spaces:

VibecoderMcSwaggins
/

DeepBoner

Paused

File size: 4,322 Bytes

"""Protocol definition for embedding services.

This module defines the common interface that all embedding services must implement.
It uses Protocol (PEP 544) for structural subtyping, so no inheritance is required.

Design Pattern: Strategy Pattern (Gang of Four)
- Each implementation (EmbeddingService, LlamaIndexRAGService) is a concrete strategy
- Protocol defines the strategy interface
- service_loader selects the appropriate strategy at runtime

SOLID Principles:
- Interface Segregation: Protocol includes only methods needed by consumers
- Dependency Inversion: Consumers depend on Protocol (abstraction), not concrete classes
- Liskov Substitution: All implementations are interchangeable
"""

from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable

if TYPE_CHECKING:
    from src.utils.models import Evidence


@runtime_checkable
class EmbeddingServiceProtocol(Protocol):
    """Structural interface shared by every embedding backend.

    EmbeddingService (local/free) and LlamaIndexRAGService (OpenAI/premium)
    both satisfy this protocol, so callers can take whichever one
    get_embedding_service() hands back and use it interchangeably.

    Every method is a coroutine so that the event loop is not blocked by:
    - Embedding computation (CPU-bound when a local model is used)
    - Vector store operations (I/O-bound with persistent storage)
    - API calls (network I/O for OpenAI embeddings)

    Note:
        The class is ``@runtime_checkable``, but an ``isinstance`` check against
        it only verifies that the method names are present on the object; it
        does not validate signatures or that the methods are async.

    Example:
        ```python
        from src.utils.service_loader import get_embedding_service

        # Get best available service (LlamaIndex if OpenAI key, else local)
        service = get_embedding_service()

        # Use via protocol interface
        await service.add_evidence("id", "content", {"source": "pubmed"})
        results = await service.search_similar("query", n_results=5)
        unique = await service.deduplicate(evidence_list)

        # Direct embedding (for MMR/diversity selection)
        embedding = await service.embed("text")
        embeddings = await service.embed_batch(["text1", "text2"])
        ```
    """

    async def embed(
        self,
        text: str,
    ) -> list[float]:
        """Turn one piece of text into its embedding vector.

        Args:
            text: The text to embed.

        Returns:
            The embedding vector, as a list of floats.
        """
        ...

    async def embed_batch(
        self,
        texts: list[str],
    ) -> list[list[float]]:
        """Embed several texts in one batched call.

        Batching makes this more efficient than calling embed() once per text.

        Args:
            texts: The texts to embed.

        Returns:
            A list of embedding vectors.
        """
        ...

    async def add_evidence(
        self,
        evidence_id: str,
        content: str,
        metadata: dict[str, Any],
    ) -> None:
        """Embed a piece of evidence and store it.

        Args:
            evidence_id: Unique identifier for the evidence (typically its URL).
            content: The text content to embed and store.
            metadata: Extra metadata used for retrieval filtering.
                Expected keys: source, title, date, authors, url
        """
        ...

    async def search_similar(
        self,
        query: str,
        n_results: int = 5,
    ) -> list[dict[str, Any]]:
        """Find stored content that is semantically close to a query.

        Args:
            query: The search query text.
            n_results: Upper bound on the number of results returned.

        Returns:
            A list of dicts, each with the keys:
            - id: Evidence identifier
            - content: Original text content
            - metadata: Stored metadata
            - distance: Semantic distance (0 = identical, higher = less similar)
        """
        ...

    async def deduplicate(
        self,
        evidence: list["Evidence"],
        threshold: float = 0.9,
    ) -> list["Evidence"]:
        """Filter out evidence that is semantically a duplicate.

        The embedding service is used to check whether each new item is similar
        to evidence that is already stored. Items found to be unique are stored
        automatically.

        Args:
            evidence: The evidence items to deduplicate.
            threshold: Similarity threshold (0.9 = 90% similar is duplicate).
                ChromaDB cosine distance interpretation:
                - 0 = identical vectors
                - 2 = opposite vectors
                Duplicate if: distance < (1 - threshold)

        Returns:
            The unique evidence items, with duplicates removed.
        """
        ...