| """Pluggable embedding interface. Provides simple char-histogram fallback and | |
| an optional sentence-transformers adapter if available. | |
| """ | |
| from typing import List | |
| import math | |
# Probe for the optional sentence-transformers dependency at import time.
try:
    from sentence_transformers import SentenceTransformer
except Exception:
    # Any exception raised by the import (not only ImportError) marks the
    # SBERT backend as unavailable.
    SBER_AVAILABLE = False
else:
    SBER_AVAILABLE = True
class EmbeddingBackend:
    """Base interface for objects that turn texts into embedding vectors."""

    def embed(self, texts: List[str]) -> List[List[float]]:
        """Return one vector (a list of floats) per entry in *texts*.

        The base implementation always raises ``NotImplementedError``;
        concrete backends override this method.
        """
        raise NotImplementedError
class CharHistogramEmbedding(EmbeddingBackend):
    """Dependency-free embedding built from a character-code histogram.

    Each counted character increments the bucket ``ord(ch) % dim``; the
    resulting count vector is then scaled to unit Euclidean length.
    """

    # Class-level default so instances created without running __init__
    # (e.g. a subclass that only sets ``dim``) still have a character cap.
    max_chars = 4096

    def __init__(self, dim: int = 32, *, max_chars: int = 4096):
        """Configure the histogram.

        Args:
            dim: Number of buckets, i.e. the length of every returned vector.
            max_chars: Only the first ``max_chars`` characters of each text
                are counted.

        Raises:
            ValueError: If ``dim`` is less than 1 or ``max_chars`` is negative.
        """
        # Reject sizes up front; otherwise they only surface later, inside
        # embed(), as a ZeroDivisionError (dim == 0) or IndexError (dim < 0).
        if dim < 1:
            raise ValueError("dim must be a positive integer, got {!r}".format(dim))
        # A negative cap would silently turn the slice into "drop the tail".
        if max_chars < 0:
            raise ValueError("max_chars must be non-negative, got {!r}".format(max_chars))
        self.dim = dim
        self.max_chars = max_chars

    def _embed_one(self, text: str) -> List[float]:
        """Return the unit-length histogram vector for a single text."""
        dim = self.dim
        counts = [0.0] * dim
        for ch in text[: self.max_chars]:
            counts[ord(ch) % dim] += 1.0
        # An empty text yields an all-zero vector whose norm is 0.0; fall back
        # to 1.0 so the division leaves it as zeros instead of raising.
        norm = math.sqrt(sum(c * c for c in counts)) or 1.0
        return [c / norm for c in counts]

    def embed(self, texts: List[str]) -> List[List[float]]:
        """Return one normalized histogram vector per entry in *texts*."""
        return [self._embed_one(text) for text in texts]
class SBERTEmbedding(EmbeddingBackend):
    """Embedding backend that delegates to a sentence-transformers model."""

    def __init__(self, model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
        """Load the model identified by *model_name*.

        Raises:
            RuntimeError: If the sentence-transformers import failed when this
                module was loaded.
        """
        if not SBER_AVAILABLE:
            raise RuntimeError("sentence-transformers not installed")
        self.model = SentenceTransformer(model_name)

    def embed(self, texts: List[str]) -> List[List[float]]:
        """Encode *texts* with the model and return plain lists of floats."""
        encoded = self.model.encode(texts)
        # Pass every component through float() so callers receive built-in
        # Python floats rather than whatever element type encode() produced.
        return [[float(component) for component in row] for row in encoded]
def make_default_backend() -> EmbeddingBackend:
    """Return the best embedding backend that can be constructed.

    Tries ``SBERTEmbedding`` when sentence-transformers was importable, and
    falls back to ``CharHistogramEmbedding`` when it was not or when building
    the SBERT backend raises.

    Returns:
        An ``SBERTEmbedding`` if one could be built, otherwise a
        ``CharHistogramEmbedding`` with default settings.
    """
    import logging

    if SBER_AVAILABLE:
        try:
            return SBERTEmbedding()
        except Exception:
            # Still best-effort: fall through to the histogram backend, but
            # record the traceback so the downgrade is not silent.
            logging.getLogger(__name__).warning(
                "Could not initialise SBERTEmbedding; "
                "falling back to CharHistogramEmbedding",
                exc_info=True,
            )
    return CharHistogramEmbedding()