File size: 1,585 Bytes
"""Pluggable embedding interface.

Provides a simple character-histogram fallback and an optional
sentence-transformers adapter that is used when that package is available.
"""
from typing import List
import math
# Probe for the optional sentence-transformers dependency and record the
# outcome in SBER_AVAILABLE. Any Exception raised by the import (not only
# ImportError) is treated as "not available".
try:
    from sentence_transformers import SentenceTransformer
except Exception:
    SBER_AVAILABLE = False
else:
    SBER_AVAILABLE = True
class EmbeddingBackend:
    """Base interface for objects that turn texts into float vectors.

    Subclasses override :meth:`embed`; the implementation here always raises.
    """

    def embed(self, texts: List[str]) -> List[List[float]]:
        """Embed ``texts``; subclasses are expected to return a list of vectors.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError
class CharHistogramEmbedding(EmbeddingBackend):
    """Dependency-free embedding built from a character histogram.

    Each character of a text increments the bucket ``ord(ch) % dim``; the
    resulting count vector is then scaled to unit Euclidean length.
    """

    def __init__(self, dim: int = 32, max_chars: int = 4096):
        """Configure the histogram size and the per-text character cap.

        Args:
            dim: Number of buckets, i.e. the length of every returned vector.
            max_chars: Only the first ``max_chars`` characters of each text
                contribute to its histogram.

        Raises:
            ValueError: If ``dim`` is smaller than 1 or ``max_chars`` is
                negative.
        """
        # Fail fast: dim == 0 would otherwise surface as ZeroDivisionError and
        # a negative dim as IndexError, both only later inside embed().
        if dim < 1:
            raise ValueError(f"dim must be a positive integer, got {dim!r}")
        # A negative cap would make the slice drop characters from the end of
        # the text instead of limiting its length.
        if max_chars < 0:
            raise ValueError(f"max_chars must be non-negative, got {max_chars!r}")
        self.dim = dim
        self.max_chars = max_chars

    def embed(self, texts: List[str]) -> List[List[float]]:
        """Return one unit-length histogram vector per entry of ``texts``.

        A text that contributes no characters (e.g. the empty string) maps to
        the all-zero vector.
        """
        return [self._embed_one(text) for text in texts]

    def _embed_one(self, text: str) -> List[float]:
        """Build the normalised character histogram for a single text."""
        dim = self.dim
        counts = [0.0] * dim
        for ch in text[: self.max_chars]:
            counts[ord(ch) % dim] += 1.0
        # "or 1.0" avoids dividing by zero when no character was counted.
        norm = math.sqrt(sum(c * c for c in counts)) or 1.0
        return [c / norm for c in counts]
class SBERTEmbedding(EmbeddingBackend):
    """Embedding backend that delegates to a ``SentenceTransformer`` model."""

    def __init__(self, model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
        """Instantiate ``SentenceTransformer(model_name)`` and keep it on ``self.model``.

        Raises:
            RuntimeError: If the module-level ``SBER_AVAILABLE`` flag is falsy.
        """
        if SBER_AVAILABLE:
            self.model = SentenceTransformer(model_name)
        else:
            raise RuntimeError("sentence-transformers not installed")

    def embed(self, texts: List[str]) -> List[List[float]]:
        """Encode ``texts`` with the model and return plain lists of floats."""
        encoded = self.model.encode(texts)
        # Convert every component to a built-in float so callers receive
        # ordinary Python lists rather than whatever encode() produced.
        return [[float(component) for component in row] for row in encoded]
def make_default_backend() -> EmbeddingBackend:
    """Return the preferred embedding backend for this environment.

    Returns an ``SBERTEmbedding`` when ``SBER_AVAILABLE`` is true and the
    backend can be constructed; otherwise a ``CharHistogramEmbedding``.
    Construction failures never propagate to the caller, but they are logged
    so the fallback to the simpler backend is not silent.
    """
    if not SBER_AVAILABLE:
        return CharHistogramEmbedding()
    try:
        return SBERTEmbedding()
    except Exception:
        # Best-effort: keep the fallback, but record why it happened.
        import logging

        logging.getLogger(__name__).warning(
            "SBERTEmbedding could not be initialised; "
            "falling back to CharHistogramEmbedding",
            exc_info=True,
        )
        return CharHistogramEmbedding()
|