Spaces:
Sleeping
Sleeping
File size: 1,652 Bytes
ef83e66 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
"""
Shared semantic encoding utilities for backend services.
"""
from __future__ import annotations
from functools import lru_cache
from typing import Iterable, List, Optional
import hashlib
import numpy as np
try:
from sentence_transformers import SentenceTransformer
except ImportError: # pragma: no cover - optional dependency
SentenceTransformer = None
@lru_cache(maxsize=1)
def _get_model() -> Optional[SentenceTransformer]:
    """
    Return the shared MiniLM encoder, constructing it on first use.

    The ``lru_cache(maxsize=1)`` wrapper guarantees the model is loaded
    at most once per process.  Yields ``None`` when the optional
    sentence-transformers dependency is not installed.
    """
    return (
        None
        if SentenceTransformer is None
        else SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    )
def embed_text(text: str) -> List[float]:
    """
    Encode *text* into a dense embedding vector.

    Uses the cached MiniLM encoder when available; otherwise degrades to
    the deterministic hash-based fallback so callers always get a vector.
    ``None``-ish / empty input is treated as the empty string.
    """
    normalized = text or ""
    model = _get_model()
    if model is not None:
        return model.encode(normalized).tolist()
    return _fallback_embed(normalized)
def cosine_similarity(vec_a: Iterable[float], vec_b: Iterable[float]) -> float:
    """
    Compute the cosine similarity between two numeric vectors.

    Returns 0.0 when either vector has zero magnitude, avoiding a
    division by zero.
    """
    a = np.asarray(list(vec_a), dtype=float)
    b = np.asarray(list(vec_b), dtype=float)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if not norm_product:
        return 0.0
    return float(a.dot(b) / norm_product)
def _fallback_embed(text: str, dim: int = 64) -> List[float]:
"""
Deterministic hashing-based embedding used when sentence-transformers
is not available (e.g., during slim CI environments).
"""
vector = [0.0] * dim
for token in text.lower().split():
digest = hashlib.sha256(token.encode("utf-8")).hexdigest()
idx = int(digest, 16) % dim
vector[idx] += 1.0
return vector
|