Spaces:
Sleeping
Sleeping
File size: 1,926 Bytes
"""Embedding module using sentence-transformers for semantic search."""
import numpy as np
# Model identifier handed to SentenceTransformer when the embedder loads its model.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# Directory passed as ``cache_folder`` when the model is loaded.
MODEL_CACHE_DIR = "/tmp/models"
# Number of texts sent to the model per encode() call in Embedder.embed.
BATCH_SIZE = 64
class Embedder:
"""Lazy-loading sentence-transformer embedder for compliance documents."""
def __init__(self):
self._model = None
def _load_model(self):
if self._model is not None:
return
print(f"[Embedder] Loading model {MODEL_NAME} ...")
from sentence_transformers import SentenceTransformer
self._model = SentenceTransformer(
MODEL_NAME,
cache_folder=MODEL_CACHE_DIR,
device="cpu",
)
print(f"[Embedder] Model loaded. Dimension: {self._model.get_sentence_embedding_dimension()}")
def embed(self, texts: list[str]) -> np.ndarray:
"""Embed a list of texts, returning normalized vectors.
Processes in batches of BATCH_SIZE to avoid OOM on free-tier.
Returns shape (len(texts), dim).
"""
self._load_model()
all_embeddings = []
for start in range(0, len(texts), BATCH_SIZE):
batch = texts[start : start + BATCH_SIZE]
emb = self._model.encode(
batch,
normalize_embeddings=True,
show_progress_bar=False,
)
all_embeddings.append(emb)
return np.vstack(all_embeddings).astype(np.float32)
def embed_single(self, text: str) -> np.ndarray:
"""Embed a single text string. Returns shape (dim,)."""
self._load_model()
emb = self._model.encode(
[text],
normalize_embeddings=True,
show_progress_bar=False,
)
return emb[0].astype(np.float32)
@property
def dimension(self) -> int:
self._load_model()
return self._model.get_sentence_embedding_dimension()
|