# Spaces: Running  (appears to be hosting-page status text, not part of the module)
"""Local text embedding helpers for retrieval.

Spec references:
- `specs/10_test_plan.md`: deterministic, unit-testable retrieval primitives.

Notes:
- Embeddings are computed locally with `sentence-transformers`.
- This module does not persist embeddings.
"""
| from __future__ import annotations | |
| from functools import lru_cache | |
| import os | |
| from typing import Protocol, cast | |
class EmbedderError(Exception):
    """Root of this module's exception hierarchy.

    Raised directly for encoding failures and malformed model output; the
    more specific embedding errors in this module derive from it.
    """
class EmbedderDependencyError(EmbedderError):
    """Signals that the `sentence-transformers` package could not be imported."""
class EmbedderModelError(EmbedderError):
    """Signals that the configured embedding model is unusable.

    Raised when the configured model name is blank or when the model cannot
    be loaded.
    """
class _SentenceTransformerLike(Protocol):
    """Structural type for the part of the sentence-transformers API used here.

    Only `encode` is declared because it is the only model method this
    module calls.
    """

    def encode(
        self,
        sentences: list[str],
        *,
        convert_to_numpy: bool,
        normalize_embeddings: bool,
        show_progress_bar: bool,
    ) -> object:
        """Turn the given texts into vector embeddings."""
        ...
def _model_name() -> str:
    """Resolve the identifier of the embedding model to load.

    The value comes from the `NOTEBOOKLM_EMBEDDING_MODEL` environment
    variable, falling back to `sentence-transformers/all-MiniLM-L6-v2` when
    the variable is unset. Surrounding whitespace is stripped.

    Returns:
        The stripped, non-empty model identifier.

    Raises:
        EmbedderModelError: If the configured model identifier is blank.
    """
    configured = os.getenv(
        "NOTEBOOKLM_EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2"
    )
    identifier = configured.strip()
    if identifier:
        return identifier
    # An explicitly empty or whitespace-only value is a configuration error.
    raise EmbedderModelError("Embedding model name must be a non-empty string.")
@lru_cache(maxsize=1)
def _load_model() -> _SentenceTransformerLike:
    """Load the local embedding model, caching it for the rest of the process.

    The result is memoized with `functools.lru_cache`, so only the first
    successful call pays the model-loading cost. A call that raises is not
    cached, so a later call retries. Because the cache takes no arguments,
    the model name returned by `_model_name()` is only consulted on the
    first successful load; call `_load_model.cache_clear()` to force a
    reload.

    Returns:
        The loaded model, typed as the protocol subset used by this module.

    Raises:
        EmbedderDependencyError: If `sentence-transformers` is not installed.
        EmbedderModelError: If the model name is blank or the model cannot
            be initialized.
    """
    # Imported lazily so a missing package is reported as a domain error.
    try:
        from sentence_transformers import SentenceTransformer
    except ImportError as exc:
        raise EmbedderDependencyError(
            "Embedding requires the 'sentence-transformers' package to be installed."
        ) from exc

    model_name: str = _model_name()
    try:
        model = SentenceTransformer(model_name)
    except Exception as exc:
        # Any constructor failure is wrapped in one exception type for callers.
        raise EmbedderModelError(f"Failed to load embedding model: {model_name}") from exc
    return cast(_SentenceTransformerLike, model)
def _to_float_vector(vector: object) -> list[float]:
    """Validate one embedding row and return it as a list of floats."""
    if not isinstance(vector, list):
        raise EmbedderError("Embedding model returned an invalid vector result.")
    converted: list[float] = []
    for component in vector:
        if not isinstance(component, (int, float)):
            raise EmbedderError("Embedding model returned a non-numeric value.")
        converted.append(float(component))
    return converted


def embed_texts(texts: list[str]) -> list[list[float]]:
    """Compute one embedding vector per input text, preserving input order.

    Spec references:
    - User requirement: return embeddings aligned to the original input order.
    - `specs/10_test_plan.md`: implementation should be explicit and testable.

    Args:
        texts: Input strings to embed. An empty list returns an empty result
            without loading the model.

    Returns:
        A list of float vectors aligned one-to-one with `texts`.

    Raises:
        TypeError: If `texts` is not a list of strings.
        EmbedderDependencyError: If `sentence-transformers` is unavailable.
        EmbedderModelError: If the model cannot be loaded.
        EmbedderError: If encoding fails or the output shape is invalid.
    """
    # Input validation happens before any model work.
    if not isinstance(texts, list):
        raise TypeError("texts must be a list of strings.")
    if not all(isinstance(item, str) for item in texts):
        raise TypeError("texts must contain only strings.")
    if not texts:
        return []

    encoder = _load_model()
    try:
        encoded = encoder.encode(
            texts,
            convert_to_numpy=True,
            normalize_embeddings=False,
            show_progress_bar=False,
        )
    except Exception as exc:
        raise EmbedderError("Failed to encode input texts.") from exc

    # The result is only required to expose `tolist()`; its output is then
    # checked element by element rather than trusted.
    if not hasattr(encoded, "tolist"):
        raise EmbedderError("Embedding model returned a non-convertible result.")
    rows = encoded.tolist()
    if not isinstance(rows, list):
        raise EmbedderError("Embedding model returned an invalid top-level result.")

    vectors = [_to_float_vector(row) for row in rows]
    # Checked last so a malformed row is reported before a count mismatch.
    if len(vectors) != len(texts):
        raise EmbedderError("Embedding count does not match input text count.")
    return vectors