Spaces:
Running
Running
| """ | |
| MediGuard AI — Retriever Interface | |
| Abstract base class defining the common interface for all retriever backends: | |
| - FAISS (local dev and HuggingFace Spaces) | |
| - OpenSearch (production with BM25 + KNN hybrid) | |
| """ | |
| from __future__ import annotations | |
| import logging | |
| from abc import ABC, abstractmethod | |
| from dataclasses import dataclass, field | |
| from typing import Any | |
| logger = logging.getLogger(__name__) | |
@dataclass
class RetrievalResult:
    """Unified result format for retrieval operations.

    Attributes:
        doc_id: Unique identifier for the document chunk.
        content: The actual text content of the chunk.
        score: Relevance score (higher is better, normalized 0-1 where
            possible).
        metadata: Arbitrary metadata (source_file, page, section, etc.).
            Each instance gets its own fresh dict by default.
    """

    doc_id: str
    content: str
    score: float
    metadata: dict[str, Any] = field(default_factory=dict)

    def __repr__(self) -> str:
        """Return a compact one-line representation with a content preview.

        The preview is limited to the first 80 characters of ``content``
        (followed by ``...`` when truncated), with newlines flattened to
        spaces so the representation stays on a single log line.
        """
        truncated = len(self.content) > 80
        # Flatten newlines for short content as well, not only when truncating.
        preview = self.content[:80].replace("\n", " ")
        if truncated:
            preview += "..."
        return f"RetrievalResult(score={self.score:.3f}, content='{preview}')"
class BaseRetriever(ABC):
    """
    Abstract base class for retrieval backends.

    Implementations must provide:
    - retrieve(): Semantic/hybrid search
    - health(): Health check
    - doc_count(): Number of indexed documents

    Optionally:
    - retrieve_bm25(): Keyword-only search
    - retrieve_hybrid(): Combined BM25 + vector search

    The three required methods are abstract, so this class cannot be
    instantiated directly and a subclass that omits any of them fails at
    construction time instead of silently returning ``None``.
    """

    @abstractmethod
    def retrieve(
        self,
        query: str,
        *,
        top_k: int = 5,
        filters: dict[str, Any] | None = None,
    ) -> list[RetrievalResult]:
        """
        Retrieve relevant documents for a query.

        Args:
            query: Natural language query
            top_k: Maximum number of results
            filters: Optional metadata filters (e.g., {"source_file": "guidelines.pdf"})

        Returns:
            List of RetrievalResult objects, ordered by relevance (highest first)
        """
        ...

    @abstractmethod
    def health(self) -> bool:
        """
        Check if the retriever is healthy and ready.

        Returns:
            True if operational, False otherwise
        """
        ...

    @abstractmethod
    def doc_count(self) -> int:
        """
        Return the number of indexed document chunks.

        Returns:
            Total document count, or 0 if unavailable
        """
        ...

    def retrieve_bm25(
        self,
        query: str,
        *,
        top_k: int = 5,
        filters: dict[str, Any] | None = None,
    ) -> list[RetrievalResult]:
        """
        BM25 keyword search (optional, falls back to retrieve()).

        This default implementation logs a warning naming the concrete class
        and delegates to retrieve() with the same arguments.

        Args:
            query: Natural language query
            top_k: Maximum results
            filters: Optional filters

        Returns:
            List of RetrievalResult objects
        """
        logger.warning("%s does not support BM25, falling back to retrieve()", type(self).__name__)
        return self.retrieve(query, top_k=top_k, filters=filters)

    def retrieve_hybrid(
        self,
        query: str,
        embedding: list[float] | None = None,
        *,
        top_k: int = 5,
        filters: dict[str, Any] | None = None,
        bm25_weight: float = 0.4,
        vector_weight: float = 0.6,
    ) -> list[RetrievalResult]:
        """
        Hybrid search combining BM25 and vector search (optional).

        This default implementation logs a warning naming the concrete class
        and delegates to retrieve(); ``embedding``, ``bm25_weight`` and
        ``vector_weight`` are ignored by the fallback.

        Args:
            query: Natural language query
            embedding: Pre-computed embedding (optional)
            top_k: Maximum results
            filters: Optional filters
            bm25_weight: Weight for BM25 component
            vector_weight: Weight for vector component

        Returns:
            List of RetrievalResult objects
        """
        logger.warning("%s does not support hybrid search, falling back to retrieve()", type(self).__name__)
        return self.retrieve(query, top_k=top_k, filters=filters)

    # NOTE(review): exposed as a plain method in the visible source; confirm
    # whether callers expect attribute-style access before changing it.
    def backend_name(self) -> str:
        """Human-readable backend name for logging (the concrete class name)."""
        return type(self).__name__