""" Shared semantic encoding utilities for backend services. """ from __future__ import annotations from functools import lru_cache from typing import Iterable, List, Optional import hashlib import numpy as np try: from sentence_transformers import SentenceTransformer except ImportError: # pragma: no cover - optional dependency SentenceTransformer = None @lru_cache(maxsize=1) def _get_model() -> Optional[SentenceTransformer]: """ Lazily load the MiniLM encoder once per process. """ if SentenceTransformer is None: return None return SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2") def embed_text(text: str) -> List[float]: """ Generate an embedding for the provided text. """ if not text: text = "" model = _get_model() if model is None: return _fallback_embed(text) vector = model.encode(text) return vector.tolist() def cosine_similarity(vec_a: Iterable[float], vec_b: Iterable[float]) -> float: a = np.array(list(vec_a), dtype=float) b = np.array(list(vec_b), dtype=float) denom = (np.linalg.norm(a) * np.linalg.norm(b)) if denom == 0: return 0.0 return float(np.dot(a, b) / denom) def _fallback_embed(text: str, dim: int = 64) -> List[float]: """ Deterministic hashing-based embedding used when sentence-transformers is not available (e.g., during slim CI environments). """ vector = [0.0] * dim for token in text.lower().split(): digest = hashlib.sha256(token.encode("utf-8")).hexdigest() idx = int(digest, 16) % dim vector[idx] += 1.0 return vector