File size: 1,652 Bytes
ef83e66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
"""
Shared semantic encoding utilities for backend services.
"""

from __future__ import annotations

from functools import lru_cache
from typing import Iterable, List, Optional
import hashlib

import numpy as np

try:
    from sentence_transformers import SentenceTransformer
except ImportError:  # pragma: no cover - optional dependency
    SentenceTransformer = None


@lru_cache(maxsize=1)
def _get_model() -> Optional[SentenceTransformer]:
    """
    Return the process-wide MiniLM encoder, loading it on first use.

    The ``lru_cache`` guarantees the model is constructed at most once
    per process. Returns ``None`` when the optional
    sentence-transformers dependency is not installed.
    """
    if SentenceTransformer is not None:
        return SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    return None


def embed_text(text: str) -> List[float]:
    """
    Encode *text* into an embedding vector.

    Uses the MiniLM encoder when sentence-transformers is installed;
    otherwise falls back to the deterministic hash-based embedding.
    Falsy input (``None``, empty string) is treated as the empty string.
    """
    normalized = text or ""
    encoder = _get_model()
    if encoder is not None:
        return encoder.encode(normalized).tolist()
    return _fallback_embed(normalized)


def cosine_similarity(vec_a: Iterable[float], vec_b: Iterable[float]) -> float:
    """
    Return the cosine similarity between two vectors.

    Both inputs are materialized into float64 arrays. Returns 0.0 when
    either vector has zero magnitude, avoiding a division by zero.
    """
    arr_a = np.array(list(vec_a), dtype=float)
    arr_b = np.array(list(vec_b), dtype=float)
    norm_product = np.linalg.norm(arr_a) * np.linalg.norm(arr_b)
    if norm_product == 0:
        return 0.0
    return float(np.dot(arr_a, arr_b) / norm_product)


def _fallback_embed(text: str, dim: int = 64) -> List[float]:
    """
    Deterministic hashing-based embedding used when sentence-transformers
    is not available (e.g., during slim CI environments).
    """
    vector = [0.0] * dim
    for token in text.lower().split():
        digest = hashlib.sha256(token.encode("utf-8")).hexdigest()
        idx = int(digest, 16) % dim
        vector[idx] += 1.0
    return vector