Spaces:
Sleeping
Sleeping
| """ | |
| Shared semantic encoding utilities for backend services. | |
| """ | |
| from __future__ import annotations | |
| from functools import lru_cache | |
| from typing import Iterable, List, Optional | |
| import hashlib | |
| import numpy as np | |
| try: | |
| from sentence_transformers import SentenceTransformer | |
| except ImportError: # pragma: no cover - optional dependency | |
| SentenceTransformer = None | |
| def _get_model() -> Optional[SentenceTransformer]: | |
| """ | |
| Lazily load the MiniLM encoder once per process. | |
| """ | |
| if SentenceTransformer is None: | |
| return None | |
| return SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2") | |
def embed_text(text: str) -> List[float]:
    """
    Generate an embedding for the provided text.

    Uses the MiniLM encoder when sentence-transformers is installed;
    otherwise falls back to a deterministic hashing-based embedding.
    Falsy input (empty string or None) is treated as the empty string.
    """
    normalized = text or ""
    model = _get_model()
    if model is not None:
        return model.encode(normalized).tolist()
    return _fallback_embed(normalized)
def cosine_similarity(vec_a: Iterable[float], vec_b: Iterable[float]) -> float:
    """
    Return the cosine similarity of two vectors.

    Yields 0.0 when either vector has zero magnitude, avoiding a
    division-by-zero.
    """
    left = np.fromiter(vec_a, dtype=float)
    right = np.fromiter(vec_b, dtype=float)
    magnitude_product = float(np.linalg.norm(left) * np.linalg.norm(right))
    if not magnitude_product:
        return 0.0
    return float(left @ right) / magnitude_product
| def _fallback_embed(text: str, dim: int = 64) -> List[float]: | |
| """ | |
| Deterministic hashing-based embedding used when sentence-transformers | |
| is not available (e.g., during slim CI environments). | |
| """ | |
| vector = [0.0] * dim | |
| for token in text.lower().split(): | |
| digest = hashlib.sha256(token.encode("utf-8")).hexdigest() | |
| idx = int(digest, 16) % dim | |
| vector[idx] += 1.0 | |
| return vector | |