# DeepBoner/src/services/embedding_protocol.py
# Commit e70a3b7 — "ci: upgrade to ironclad CI/CD configuration" (VibecoderMcSwaggins)
# (File-viewer metadata: raw / history / blame, 4.32 kB — converted to a comment
#  header so the module parses as valid Python.)
"""Protocol definition for embedding services.
This module defines the common interface that all embedding services must implement.
Using Protocol (PEP 544) for structural subtyping - no inheritance required.
Design Pattern: Strategy Pattern (Gang of Four)
- Each implementation (EmbeddingService, LlamaIndexRAGService) is a concrete strategy
- Protocol defines the strategy interface
- service_loader selects the appropriate strategy at runtime
SOLID Principles:
- Interface Segregation: Protocol includes only methods needed by consumers
- Dependency Inversion: Consumers depend on Protocol (abstraction), not concrete classes
- Liskov Substitution: All implementations are interchangeable
"""
from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable
if TYPE_CHECKING:
from src.utils.models import Evidence
@runtime_checkable
class EmbeddingServiceProtocol(Protocol):
    """Structural interface shared by every embedding backend.

    Implementations (e.g. the local/free EmbeddingService or the
    OpenAI-backed LlamaIndexRAGService) satisfy this Protocol without
    inheriting from it (PEP 544), so ``get_embedding_service()`` can hand
    back whichever concrete strategy is available at runtime and callers
    stay decoupled from the implementation.

    Every method is a coroutine: embedding is CPU-heavy with local models,
    vector-store access is disk I/O, and hosted embeddings require network
    round-trips — none of which should block the event loop.

    Example:
        ```python
        from src.utils.service_loader import get_embedding_service

        # Picks the best available backend (LlamaIndex if an OpenAI key
        # exists, otherwise the local service).
        service = get_embedding_service()

        # Use via the protocol interface.
        await service.add_evidence("id", "content", {"source": "pubmed"})
        results = await service.search_similar("query", n_results=5)
        unique = await service.deduplicate(evidence_list)

        # Raw vectors (useful for MMR / diversity selection).
        embedding = await service.embed("text")
        embeddings = await service.embed_batch(["text1", "text2"])
        ```
    """

    async def embed(self, text: str) -> list[float]:
        """Return the embedding vector for a single piece of text.

        Args:
            text: Input text to embed.

        Returns:
            The embedding as a list of floats.
        """
        ...

    async def embed_batch(self, texts: list[str]) -> list[list[float]]:
        """Return embeddings for several texts in one batched call.

        Batching amortizes model/API overhead, so prefer this over
        calling ``embed()`` repeatedly.

        Args:
            texts: Input texts to embed.

        Returns:
            One embedding vector per input text, in order.
        """
        ...

    async def add_evidence(self, evidence_id: str, content: str, metadata: dict[str, Any]) -> None:
        """Embed ``content`` and persist it in the vector store.

        Args:
            evidence_id: Unique identifier for the item (typically a URL).
            content: Text to embed and store.
            metadata: Extra fields used for retrieval filtering.
                Expected keys: source, title, date, authors, url.
        """
        ...

    async def search_similar(self, query: str, n_results: int = 5) -> list[dict[str, Any]]:
        """Find stored content semantically close to ``query``.

        Args:
            query: Free-text search query.
            n_results: Upper bound on the number of hits returned.

        Returns:
            A list of dicts with keys:
            - id: Evidence identifier
            - content: Original text content
            - metadata: Stored metadata
            - distance: Semantic distance (0 = identical, higher = less similar)
        """
        ...

    async def deduplicate(
        self, evidence: list["Evidence"], threshold: float = 0.9
    ) -> list["Evidence"]:
        """Drop evidence items that are near-duplicates of stored content.

        Each candidate is compared against evidence already in the store;
        items judged unique are persisted automatically as a side effect.

        Args:
            evidence: Candidate evidence items to deduplicate.
            threshold: Similarity cutoff (0.9 means 90%-similar counts as
                a duplicate). Under ChromaDB's cosine-distance convention
                (0 = identical vectors, 2 = opposite vectors), an item is
                a duplicate when ``distance < (1 - threshold)``.

        Returns:
            The evidence items that were not duplicates.
        """
        ...