File size: 1,585 Bytes
b657fcc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
"""Pluggable embedding interface.

Provides a simple char-histogram fallback and an optional
sentence-transformers adapter that is used when that package is installed.
"""
from typing import List
import math

try:
    from sentence_transformers import SentenceTransformer
    SBER_AVAILABLE = True
except Exception:
    SBER_AVAILABLE = False


class EmbeddingBackend:
    """Base interface for objects that turn texts into float vectors.

    Subclasses are expected to override :meth:`embed`; the implementation
    here only raises.
    """

    def embed(self, texts: List[str]) -> List[List[float]]:
        """Return one embedding vector per entry of ``texts``.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError


class CharHistogramEmbedding(EmbeddingBackend):
    """Dependency-free embedding built from a character histogram.

    Every counted character increments bucket ``ord(ch) % dim``; the count
    vector is then scaled to unit Euclidean length.
    """

    def __init__(self, dim: int = 32, *, max_chars: int = 4096):
        """Configure the histogram.

        Args:
            dim: Number of buckets, i.e. the length of every returned vector.
                Must be at least 1.
            max_chars: Only the first ``max_chars`` characters of each text
                are counted. Must not be negative.

        Raises:
            ValueError: If ``dim`` is less than 1 or ``max_chars`` is
                negative.
        """
        # Fail at construction time: a zero dim would otherwise surface as a
        # ZeroDivisionError (and a negative one as an IndexError or an empty
        # vector) only later, inside embed().
        if dim < 1:
            raise ValueError(f"dim must be a positive integer, got {dim!r}")
        # A negative slice bound would drop characters from the END of the
        # text instead of capping its length.
        if max_chars < 0:
            raise ValueError(f"max_chars must be non-negative, got {max_chars!r}")
        self.dim = dim
        self.max_chars = max_chars

    def embed(self, texts: List[str]) -> List[List[float]]:
        """Return one unit-length histogram vector of size ``dim`` per text.

        Args:
            texts: Strings to embed.

        Returns:
            A list with one vector per input text, in input order. An empty
            text produces an all-zero vector.
        """
        return [self._embed_one(text) for text in texts]

    def _embed_one(self, text: str) -> List[float]:
        """Build the normalized character histogram for a single text."""
        counts = [0.0] * self.dim
        for ch in text[: self.max_chars]:
            counts[ord(ch) % self.dim] += 1.0
        # `or 1.0` avoids dividing by zero when nothing was counted, leaving
        # the all-zero vector unchanged.
        norm = math.sqrt(sum(c * c for c in counts)) or 1.0
        return [c / norm for c in counts]


class SBERTEmbedding(EmbeddingBackend):
    """Embedding backend that delegates to a sentence-transformers model."""

    def __init__(self, model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
        """Instantiate ``SentenceTransformer`` for ``model_name``.

        Raises:
            RuntimeError: If the module-level import of sentence-transformers
                did not succeed.
        """
        if not SBER_AVAILABLE:
            raise RuntimeError("sentence-transformers not installed")

        self.model = SentenceTransformer(model_name)

    def embed(self, texts: List[str]) -> List[List[float]]:
        """Encode ``texts`` with the model and return plain Python floats.

        Each row produced by ``model.encode`` is converted component by
        component with ``float`` so callers receive nested built-in lists.
        """
        encoded = self.model.encode(texts)
        return [[float(component) for component in row] for row in encoded]


def make_default_backend() -> EmbeddingBackend:
    """Return the best embedding backend that can be constructed.

    Tries ``SBERTEmbedding`` when sentence-transformers was imported
    successfully. Falls back to ``CharHistogramEmbedding`` when it was not,
    or when constructing the SBERT backend raises.

    Returns:
        An ``SBERTEmbedding`` if one could be created, otherwise a
        ``CharHistogramEmbedding`` with default settings.
    """
    # Imported locally so this function does not depend on the module's
    # top-level import list.
    import logging

    if SBER_AVAILABLE:
        try:
            return SBERTEmbedding()
        except Exception as exc:
            # Deliberately best-effort: a construction failure must not stop
            # callers from getting a backend. Log it so the downgrade to the
            # much cruder histogram embedding does not go unnoticed.
            logging.getLogger(__name__).warning(
                "SBERT backend unavailable, falling back to char histogram: %s",
                exc,
            )
    return CharHistogramEmbedding()