aidenv03's picture
Initial deploy
d3a26e1
"""Local text embedding helpers for retrieval.
Spec references:
- `specs/10_test_plan.md`: deterministic, unit-testable retrieval primitives.
Notes:
- Embeddings are computed locally with `sentence-transformers`.
- This module does not persist embeddings.
"""
from __future__ import annotations
from functools import lru_cache
import os
from typing import Protocol, cast
class EmbedderError(Exception):
    """Common base class for the embedding-specific exceptions in this module."""
class EmbedderDependencyError(EmbedderError):
    """Signal that the `sentence-transformers` package could not be imported."""
class EmbedderModelError(EmbedderError):
    """Signal that the embedding model name is blank or the model failed to load."""
class _SentenceTransformerLike(Protocol):
    """Structural type for the single sentence-transformers method used here.

    Any object exposing a compatible `encode` method satisfies this protocol,
    which keeps the rest of the module independent of the concrete model class.
    """

    def encode(
        self,
        sentences: list[str],
        *,
        convert_to_numpy: bool,
        normalize_embeddings: bool,
        show_progress_bar: bool,
    ) -> object:
        """Produce embeddings for `sentences`.

        All option flags are keyword-only. The result is typed as `object`,
        so callers must validate its shape before using it.
        """
        ...
def _model_name() -> str:
    """Resolve the embedding model identifier from the environment.

    Reads `NOTEBOOKLM_EMBEDDING_MODEL`, falling back to
    `sentence-transformers/all-MiniLM-L6-v2` when the variable is unset, and
    strips surrounding whitespace from whichever value is used.

    Returns:
        The whitespace-stripped model identifier.

    Raises:
        EmbedderModelError: If the identifier is empty after stripping.
    """
    configured = os.getenv(
        "NOTEBOOKLM_EMBEDDING_MODEL",
        "sentence-transformers/all-MiniLM-L6-v2",
    )
    identifier: str = configured.strip()
    if identifier:
        return identifier
    # A variable that is set but blank (or whitespace-only) lands here.
    raise EmbedderModelError("Embedding model name must be a non-empty string.")
@lru_cache(maxsize=1)
def _load_model() -> _SentenceTransformerLike:
    """Build the embedding model and memoize it for later calls.

    The function takes no arguments, so the `lru_cache` wrapper holds a single
    entry: the first successfully constructed model is reused afterwards.

    Returns:
        The loaded model, viewed through the `_SentenceTransformerLike` protocol.

    Raises:
        EmbedderDependencyError: If `sentence-transformers` cannot be imported.
        EmbedderModelError: If the model name is blank or the model
            constructor raises.
    """
    # Imported inside the function so a missing package is reported as a
    # typed error when embedding is requested, not when this module is imported.
    try:
        from sentence_transformers import SentenceTransformer
    except ImportError as import_error:
        raise EmbedderDependencyError(
            "Embedding requires the 'sentence-transformers' package to be installed."
        ) from import_error

    identifier: str = _model_name()
    try:
        loaded = SentenceTransformer(identifier)
    except Exception as load_error:
        message = f"Failed to load embedding model: {identifier}"
        raise EmbedderModelError(message) from load_error
    else:
        # `cast` is a no-op at runtime; it only narrows the type for checkers.
        return cast(_SentenceTransformerLike, loaded)
def embed_texts(texts: list[str]) -> list[list[float]]:
    """Compute one embedding vector per input string, preserving input order.

    Spec references:
    - User requirement: embeddings must line up with the original input order.
    - `specs/10_test_plan.md`: implementation should be explicit and testable.

    An empty input list returns an empty list without loading the model.
    Vectors are requested with `normalize_embeddings=False`, so they are
    returned exactly as the model produces them.

    Args:
        texts: Input strings to embed.

    Returns:
        A list of float vectors, one per entry of `texts`, in the same order.

    Raises:
        TypeError: If `texts` is not a list, or contains a non-string item.
        EmbedderDependencyError: If `sentence-transformers` is unavailable.
        EmbedderModelError: If the model cannot be loaded.
        EmbedderError: If encoding fails or the model output has an
            unexpected structure.
    """
    if not isinstance(texts, list):
        raise TypeError("texts must be a list of strings.")
    if not texts:
        return []
    if not all(isinstance(item, str) for item in texts):
        raise TypeError("texts must contain only strings.")

    encoder: _SentenceTransformerLike = _load_model()
    try:
        encoded: object = encoder.encode(
            texts,
            convert_to_numpy=True,
            normalize_embeddings=False,
            show_progress_bar=False,
        )
    except Exception as encode_error:
        raise EmbedderError("Failed to encode input texts.") from encode_error

    # The encoder result is only trusted after being converted to plain
    # Python lists and checked element by element.
    if not hasattr(encoded, "tolist"):
        raise EmbedderError("Embedding model returned a non-convertible result.")
    rows: object = encoded.tolist()
    if not isinstance(rows, list):
        raise EmbedderError("Embedding model returned an invalid top-level result.")

    def _coerce_value(value: object) -> float:
        """Return `value` as a float, rejecting anything non-numeric."""
        if isinstance(value, (int, float)):
            return float(value)
        raise EmbedderError("Embedding model returned a non-numeric value.")

    def _coerce_vector(row: object) -> list[float]:
        """Return `row` as a list of floats, rejecting non-list rows."""
        if isinstance(row, list):
            return [_coerce_value(component) for component in row]
        raise EmbedderError("Embedding model returned an invalid vector result.")

    vectors: list[list[float]] = [_coerce_vector(row) for row in rows]

    # Checked last, after every row has been validated.
    if len(vectors) != len(texts):
        raise EmbedderError("Embedding count does not match input text count.")
    return vectors