File size: 3,852 Bytes
f5eb34f
1b447de
f5eb34f
 
 
 
 
1b447de
f5eb34f
 
 
1b447de
f5eb34f
1b447de
f5eb34f
 
1b447de
 
f5eb34f
 
 
 
 
 
 
 
 
 
 
 
 
 
1b447de
f5eb34f
 
 
 
 
 
 
 
 
1b447de
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f5eb34f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1b447de
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
"""
Gerenciamento de modelos de embeddings com cache
"""
from typing import List, Optional
import numpy as np
from sentence_transformers import SentenceTransformer
from .config import EMBEDDING_MODEL_ID
from .cache import EmbeddingCache


class EmbeddingManager:
    """Embedding-model manager with an optional in-memory cache.

    Lazily loads a SentenceTransformer model and memoizes per-text
    embeddings in an EmbeddingCache, keyed by text, model id, and
    normalization setting.
    """

    def __init__(self, model_id: str = EMBEDDING_MODEL_ID, use_cache: bool = True):
        """
        Args:
            model_id: sentence-transformers model identifier to load.
            use_cache: if True, memoize embeddings in an EmbeddingCache.
        """
        self.model_id = model_id
        self.model: Optional[SentenceTransformer] = None  # loaded lazily
        self.use_cache = use_cache
        self.cache = EmbeddingCache(max_size=1000, ttl_seconds=3600) if use_cache else None

    def load_model(self) -> SentenceTransformer:
        """Load the embedding model on first use (lazy loading)."""
        if self.model is None:
            self.model = SentenceTransformer(self.model_id)
        return self.model

    def _cache_namespace(self, normalize: bool) -> str:
        """Build the cache namespace for a given normalization setting.

        BUGFIX: the cache was previously keyed on (text, model_id) only,
        so an embedding cached with normalize=True could be served to a
        later call that requested normalize=False (and vice versa).
        Folding the flag into the namespace keeps the two result spaces
        separate. EmbeddingCache is assumed to treat this value as an
        opaque string key component, so cache semantics are unchanged.
        """
        return f"{self.model_id}|norm={normalize}"

    def encode(
        self,
        texts: List[str],
        normalize: bool = True,
        show_progress: bool = False
    ) -> np.ndarray:
        """
        Generate embeddings for a list of texts, using the cache when enabled.

        Args:
            texts: Texts to embed.
            normalize: If True, normalize embeddings (recommended for cosine).
            show_progress: If True, show a progress bar while encoding.

        Returns:
            Numpy array with one embedding per input text.
        """
        if not self.use_cache or self.cache is None:
            # Cache disabled: encode everything in one pass.
            model = self.load_model()
            return model.encode(
                texts,
                normalize_embeddings=normalize,
                show_progress_bar=show_progress
            )

        namespace = self._cache_namespace(normalize)

        # Probe the cache for each text; remember which still need encoding.
        embeddings_list: list = []
        texts_to_encode: List[str] = []
        indices_to_encode: List[int] = []
        for i, text in enumerate(texts):
            cached_embedding = self.cache.get(text, namespace)
            if cached_embedding is not None:
                embeddings_list.append(cached_embedding)
            else:
                embeddings_list.append(None)  # placeholder, filled below
                texts_to_encode.append(text)
                indices_to_encode.append(i)

        # Encode all cache misses in a single batch.
        if texts_to_encode:
            model = self.load_model()
            new_embeddings = model.encode(
                texts_to_encode,
                normalize_embeddings=normalize,
                show_progress_bar=show_progress
            )

            # Store in the cache and fill the placeholders in order.
            for idx, embedding in zip(indices_to_encode, new_embeddings):
                self.cache.set(texts[idx], namespace, embedding)
                embeddings_list[idx] = embedding

        return np.array(embeddings_list)

    def encode_single(self, text: str, normalize: bool = True) -> List[float]:
        """
        Generate the embedding for a single text.

        Args:
            text: Text to embed.
            normalize: If True, normalize the embedding.

        Returns:
            The embedding as a list of Python floats (float32 precision).
        """
        embeddings = self.encode([text], normalize=normalize)
        return embeddings[0].astype(np.float32).tolist()

    def get_dimension(self) -> int:
        """Return the dimensionality of the model's sentence embeddings."""
        return self.load_model().get_sentence_embedding_dimension()

    def get_cache_stats(self) -> dict:
        """
        Return cache statistics.

        Returns:
            Dict of cache metrics plus a "cache_enabled" flag;
            just {"cache_enabled": False} when caching is disabled.
        """
        if not self.use_cache or self.cache is None:
            return {"cache_enabled": False}

        stats = self.cache.get_stats()
        stats["cache_enabled"] = True
        return stats

    def clear_cache(self) -> None:
        """Clear the embedding cache (no-op when caching is disabled)."""
        if self.cache is not None:
            self.cache.clear()