|
|
"""Protocol definition for embedding services. |
|
|
|
|
|
This module defines the common interface that all embedding services must implement. |
|
|
Using Protocol (PEP 544) for structural subtyping - no inheritance required. |
|
|
|
|
|
Design Pattern: Strategy Pattern (Gang of Four) |
|
|
- Each implementation (EmbeddingService, LlamaIndexRAGService) is a concrete strategy |
|
|
- Protocol defines the strategy interface |
|
|
- service_loader selects the appropriate strategy at runtime |
|
|
|
|
|
SOLID Principles: |
|
|
- Interface Segregation: Protocol includes only methods needed by consumers |
|
|
- Dependency Inversion: Consumers depend on Protocol (abstraction), not concrete classes |
|
|
- Liskov Substitution: All implementations are interchangeable |
|
|
""" |
|
|
|
|
|
from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable |
|
|
|
|
|
if TYPE_CHECKING: |
|
|
from src.utils.models import Evidence |
|
|
|
|
|
|
|
|
@runtime_checkable
class EmbeddingServiceProtocol(Protocol):
    """Structural contract shared by every embedding service.

    EmbeddingService (local/free) and LlamaIndexRAGService (OpenAI/premium)
    both satisfy this interface, which lets get_embedding_service() swap one
    for the other seamlessly.

    Every method is a coroutine so the event loop is not blocked during:
    - Embedding computation (CPU-bound with local models)
    - Vector store operations (I/O-bound with persistent storage)
    - API calls (network I/O with OpenAI embeddings)

    Example:
        ```python
        from src.utils.service_loader import get_embedding_service

        # Pick the best available service (LlamaIndex if OpenAI key, else local)
        service = get_embedding_service()

        # Store, query and deduplicate through the protocol interface
        await service.add_evidence("id", "content", {"source": "pubmed"})
        results = await service.search_similar("query", n_results=5)
        unique = await service.deduplicate(evidence_list)

        # Raw vectors (for MMR/diversity selection)
        embedding = await service.embed("text")
        embeddings = await service.embed_batch(["text1", "text2"])
        ```
    """

    async def embed(
        self,
        text: str,
    ) -> list[float]:
        """Turn one piece of text into its embedding vector.

        Args:
            text: The text to embed

        Returns:
            The embedding vector, as a list of floats
        """
        ...

    async def embed_batch(
        self,
        texts: list[str],
    ) -> list[list[float]]:
        """Embed several texts in a single batched call.

        Batching makes this more efficient than invoking embed() once per
        text.

        Args:
            texts: The texts to embed

        Returns:
            A list of embedding vectors
        """
        ...

    async def add_evidence(
        self,
        evidence_id: str,
        content: str,
        metadata: dict[str, Any],
    ) -> None:
        """Embed a piece of evidence and store it.

        Args:
            evidence_id: Unique identifier (typically the URL)
            content: Text content to embed and store
            metadata: Extra metadata used for retrieval filtering.
                Expected keys: source, title, date, authors, url
        """
        ...

    async def search_similar(
        self,
        query: str,
        n_results: int = 5,
    ) -> list[dict[str, Any]]:
        """Find stored content that is semantically close to a query.

        Args:
            query: The search query text
            n_results: Upper bound on the number of results returned

        Returns:
            A list of dicts, each with the keys:
            - id: Evidence identifier
            - content: Original text content
            - metadata: Stored metadata
            - distance: Semantic distance (0 = identical, higher = less similar)
        """
        ...

    async def deduplicate(
        self,
        evidence: list["Evidence"],
        threshold: float = 0.9,
    ) -> list["Evidence"]:
        """Filter out evidence that is semantically a duplicate.

        The embedding service is used to check whether each new item is
        similar to evidence that is already stored. Items found to be unique
        are stored automatically.

        Args:
            evidence: The evidence items to deduplicate
            threshold: Similarity threshold (0.9 = 90% similar is duplicate).
                Interpreted against ChromaDB cosine distance, where
                0 = identical vectors and 2 = opposite vectors.
                An item is a duplicate if: distance < (1 - threshold)

        Returns:
            The unique evidence items (duplicates removed)
        """
        ...
|
|
|