"""Local text embedding helpers for retrieval. Spec references: - `specs/10_test_plan.md`: deterministic, unit-testable retrieval primitives. Notes: - Embeddings are computed locally with `sentence-transformers`. - This module does not persist embeddings. """ from __future__ import annotations from functools import lru_cache import os from typing import Protocol, cast class EmbedderError(Exception): """Base exception for embedding failures.""" class EmbedderDependencyError(EmbedderError): """Raised when `sentence-transformers` is unavailable.""" class EmbedderModelError(EmbedderError): """Raised when the configured embedding model cannot be loaded.""" class _SentenceTransformerLike(Protocol): """Protocol for the subset of the sentence-transformers API used here.""" def encode( self, sentences: list[str], *, convert_to_numpy: bool, normalize_embeddings: bool, show_progress_bar: bool, ) -> object: """Encode input texts into vector embeddings.""" def _model_name() -> str: """Return the configured local embedding model identifier. Raises: EmbedderModelError: If the configured model identifier is blank. """ model_name: str = os.getenv( "NOTEBOOKLM_EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2" ).strip() if not model_name: raise EmbedderModelError("Embedding model name must be a non-empty string.") return model_name @lru_cache(maxsize=1) def _load_model() -> _SentenceTransformerLike: """Load and cache the local embedding model once per process. Raises: EmbedderDependencyError: If `sentence-transformers` is not installed. EmbedderModelError: If the model cannot be initialized locally. """ try: from sentence_transformers import SentenceTransformer except ImportError as exc: raise EmbedderDependencyError( "Embedding requires the 'sentence-transformers' package to be installed." ) from exc model_name: str = _model_name() try: model = SentenceTransformer(model_name) except Exception as exc: raise EmbedderModelError(f"Failed to load embedding model: {model_name}") from exc return cast(_SentenceTransformerLike, model) def embed_texts(texts: list[str]) -> list[list[float]]: """Embed texts locally and return vectors aligned to input order. Spec references: - User requirement: return embeddings aligned to the original input order. - `specs/10_test_plan.md`: implementation should be explicit and testable. Args: texts: Input strings to embed. Returns: A list of float vectors aligned one-to-one with `texts`. Raises: TypeError: If `texts` is not a list of strings. EmbedderDependencyError: If `sentence-transformers` is unavailable. EmbedderModelError: If the model cannot be loaded. EmbedderError: If encoding fails or the output shape is invalid. """ if not isinstance(texts, list): raise TypeError("texts must be a list of strings.") if any(not isinstance(text, str) for text in texts): raise TypeError("texts must contain only strings.") if not texts: return [] model: _SentenceTransformerLike = _load_model() try: raw_embeddings: object = model.encode( texts, convert_to_numpy=True, normalize_embeddings=False, show_progress_bar=False, ) except Exception as exc: raise EmbedderError("Failed to encode input texts.") from exc if not hasattr(raw_embeddings, "tolist"): raise EmbedderError("Embedding model returned a non-convertible result.") embeddings_object: object = raw_embeddings.tolist() if not isinstance(embeddings_object, list): raise EmbedderError("Embedding model returned an invalid top-level result.") embeddings: list[list[float]] = [] for vector in embeddings_object: if not isinstance(vector, list): raise EmbedderError("Embedding model returned an invalid vector result.") float_vector: list[float] = [] for value in vector: if not isinstance(value, (int, float)): raise EmbedderError("Embedding model returned a non-numeric value.") float_vector.append(float(value)) embeddings.append(float_vector) if len(embeddings) != len(texts): raise EmbedderError("Embedding count does not match input text count.") return embeddings