# DeepBoner/src/services/embedding_protocol.py
# Commit e70a3b7 — "ci: upgrade to ironclad CI/CD configuration" (VibecoderMcSwaggins)
# (File-viewer metadata: raw / history / blame, 4.32 kB — converted to a comment
#  header so the module parses as valid Python.)
"""Protocol definition for embedding services.
This module defines the common interface that all embedding services must implement.
Using Protocol (PEP 544) for structural subtyping - no inheritance required.
Design Pattern: Strategy Pattern (Gang of Four)
- Each implementation (EmbeddingService, LlamaIndexRAGService) is a concrete strategy
- Protocol defines the strategy interface
- service_loader selects the appropriate strategy at runtime
SOLID Principles:
- Interface Segregation: Protocol includes only methods needed by consumers
- Dependency Inversion: Consumers depend on Protocol (abstraction), not concrete classes
- Liskov Substitution: All implementations are interchangeable
"""
from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable
if TYPE_CHECKING:
from src.utils.models import Evidence
@runtime_checkable
class EmbeddingServiceProtocol(Protocol):
    """Structural interface shared by every embedding backend.

    Implementations (e.g. the local/free EmbeddingService or the
    OpenAI-backed LlamaIndexRAGService) satisfy this Protocol without
    inheriting from it (PEP 544), so ``get_embedding_service()`` can hand
    back whichever concrete strategy is available at runtime and callers
    stay decoupled from the implementation.

    Every method is a coroutine: embedding is CPU-heavy with local models,
    vector-store access is disk I/O, and hosted embeddings require network
    round-trips — none of which should block the event loop.

    Example:
        ```python
        from src.utils.service_loader import get_embedding_service

        # Picks the best available backend (LlamaIndex if an OpenAI key
        # exists, otherwise the local service).
        service = get_embedding_service()

        # Use via the protocol interface.
        await service.add_evidence("id", "content", {"source": "pubmed"})
        results = await service.search_similar("query", n_results=5)
        unique = await service.deduplicate(evidence_list)

        # Raw vectors (useful for MMR / diversity selection).
        embedding = await service.embed("text")
        embeddings = await service.embed_batch(["text1", "text2"])
        ```
    """

    async def embed(self, text: str) -> list[float]:
        """Return the embedding vector for a single piece of text.

        Args:
            text: Input text to embed.

        Returns:
            The embedding as a list of floats.
        """
        ...

    async def embed_batch(self, texts: list[str]) -> list[list[float]]:
        """Return embeddings for several texts in one batched call.

        Batching amortizes model/API overhead, so prefer this over
        calling ``embed()`` repeatedly.

        Args:
            texts: Input texts to embed.

        Returns:
            One embedding vector per input text, in order.
        """
        ...

    async def add_evidence(self, evidence_id: str, content: str, metadata: dict[str, Any]) -> None:
        """Embed ``content`` and persist it in the vector store.

        Args:
            evidence_id: Unique identifier for the item (typically a URL).
            content: Text to embed and store.
            metadata: Extra fields used for retrieval filtering.
                Expected keys: source, title, date, authors, url.
        """
        ...

    async def search_similar(self, query: str, n_results: int = 5) -> list[dict[str, Any]]:
        """Find stored content semantically close to ``query``.

        Args:
            query: Free-text search query.
            n_results: Upper bound on the number of hits returned.

        Returns:
            A list of dicts with keys:
            - id: Evidence identifier
            - content: Original text content
            - metadata: Stored metadata
            - distance: Semantic distance (0 = identical, higher = less similar)
        """
        ...

    async def deduplicate(
        self, evidence: list["Evidence"], threshold: float = 0.9
    ) -> list["Evidence"]:
        """Drop evidence items that are near-duplicates of stored content.

        Each candidate is compared against evidence already in the store;
        items judged unique are persisted automatically as a side effect.

        Args:
            evidence: Candidate evidence items to deduplicate.
            threshold: Similarity cutoff (0.9 means 90%-similar counts as
                a duplicate). Under ChromaDB's cosine-distance convention
                (0 = identical vectors, 2 = opposite vectors), an item is
                a duplicate when ``distance < (1 - threshold)``.

        Returns:
            The evidence items that were not duplicates.
        """
        ...