File size: 1,926 Bytes
6c54d57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
"""Embedding module using sentence-transformers for semantic search."""

import numpy as np

# Model identifier handed to SentenceTransformer when the model is first loaded.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# Directory passed as ``cache_folder`` when constructing the model.
MODEL_CACHE_DIR = "/tmp/models"
# Maximum number of texts sent to the model in a single encode() call.
BATCH_SIZE = 64


class Embedder:
    """Lazy-loading sentence-transformer embedder for compliance documents.

    The underlying model is not created in ``__init__``; it is loaded on the
    first call that needs it (``embed``, ``embed_single`` or ``dimension``)
    and then reused for the lifetime of the instance.
    """

    def __init__(self):
        # Populated by _load_model() on first use.
        self._model = None

    def _load_model(self):
        """Create the SentenceTransformer model if it has not been loaded yet."""
        if self._model is not None:
            return
        print(f"[Embedder] Loading model {MODEL_NAME} ...")
        # Imported here rather than at module level so that importing this
        # module does not pull in sentence_transformers until it is needed.
        from sentence_transformers import SentenceTransformer

        self._model = SentenceTransformer(
            MODEL_NAME,
            cache_folder=MODEL_CACHE_DIR,
            device="cpu",
        )
        print(f"[Embedder] Model loaded. Dimension: {self._model.get_sentence_embedding_dimension()}")

    def _encode(self, batch):
        """Encode one batch of texts with the settings shared by all callers."""
        return self._model.encode(
            batch,
            normalize_embeddings=True,
            show_progress_bar=False,
        )

    def embed(self, texts: list[str]) -> np.ndarray:
        """Embed a list of texts, returning normalized vectors.

        Processes in batches of BATCH_SIZE to avoid OOM on free-tier.

        Args:
            texts: The strings to embed. May be empty.

        Returns:
            A float32 array of shape ``(len(texts), dim)``. For an empty
            input this is an empty array of shape ``(0, dim)``.
        """
        self._load_model()

        # np.vstack raises ValueError on an empty sequence, so an empty input
        # must be answered explicitly to honour the documented return shape.
        if len(texts) == 0:
            dim = self._model.get_sentence_embedding_dimension()
            return np.empty((0, dim), dtype=np.float32)

        all_embeddings = [
            self._encode(texts[start : start + BATCH_SIZE])
            for start in range(0, len(texts), BATCH_SIZE)
        ]
        return np.vstack(all_embeddings).astype(np.float32)

    def embed_single(self, text: str) -> np.ndarray:
        """Embed a single text string.

        Args:
            text: The string to embed.

        Returns:
            A float32 vector of shape ``(dim,)``.
        """
        self._load_model()
        # The model is given a one-element batch; unwrap the only row.
        emb = self._encode([text])
        return emb[0].astype(np.float32)

    @property
    def dimension(self) -> int:
        """Embedding dimension reported by the model (loads it if needed)."""
        self._load_model()
        return self._model.get_sentence_embedding_dimension()