"""Embedding model wrapper for document vectorization."""

from typing import Optional

import numpy as np
from sentence_transformers import SentenceTransformer

from src.config import settings
from src.document_processor.chunker import DocumentChunk


class EmbeddingModel:
    """Wrapper for sentence-transformers embedding models.

    Provides efficient batch embedding with lazy model loading.
    """

    def __init__(self, model_name: Optional[str] = None):
        """Initialize the embedding model.

        Args:
            model_name: HuggingFace model name. Defaults to settings.embedding_model.
        """
        self.model_name = model_name or settings.embedding_model
        self._model: Optional[SentenceTransformer] = None

    @property
    def model(self) -> SentenceTransformer:
        """Lazy load the embedding model."""
        if self._model is None:
            self._model = SentenceTransformer(self.model_name)
        return self._model

    @property
    def embedding_dimension(self) -> int:
        """Get the dimension of embeddings produced by this model."""
        return self.model.get_sentence_embedding_dimension()

    def embed_text(self, text: str) -> np.ndarray:
        """Embed a single text string.

        Args:
            text: Text to embed.

        Returns:
            Embedding vector as numpy array.
        """
        return self.model.encode(text, convert_to_numpy=True, normalize_embeddings=True)

    def embed_texts(self, texts: list[str], batch_size: int = 32) -> np.ndarray:
        """Embed multiple texts efficiently.

        Args:
            texts: List of texts to embed.
            batch_size: Batch size for processing.

        Returns:
            Array of embedding vectors (num_texts x embedding_dim).
        """
        return self.model.encode(
            texts,
            batch_size=batch_size,
            convert_to_numpy=True,
            normalize_embeddings=True,
            show_progress_bar=len(texts) > 100,
        )

    def embed_chunks(
        self, chunks: list[DocumentChunk], batch_size: int = 32
    ) -> list[tuple[DocumentChunk, np.ndarray]]:
        """Embed document chunks with their metadata.

        Args:
            chunks: List of DocumentChunks to embed.
            batch_size: Batch size for processing.

        Returns:
            List of (chunk, embedding) tuples.
        """
        texts = [chunk.content for chunk in chunks]
        embeddings = self.embed_texts(texts, batch_size=batch_size)

        return list(zip(chunks, embeddings))

    def embed_query(self, query: str) -> np.ndarray:
        """Embed a query for retrieval.

        Some models use different prompting for queries vs documents.

        Args:
            query: Query text to embed.

        Returns:
            Query embedding vector.
        """
        # BGE models benefit from query prefixes
        if "bge" in self.model_name.lower():
            query = f"Represent this sentence for searching relevant passages: {query}"

        return self.embed_text(query)
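

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only, not part of the public API).
    # Assumes settings.embedding_model names a valid sentence-transformers
    # checkpoint; the model is downloaded on first run.
    model = EmbeddingModel()
    doc_vecs = model.embed_texts(["first passage", "second passage"])
    query_vec = model.embed_query("what does the first passage say?")
    # Embeddings are L2-normalized above, so the dot product is cosine similarity.
    print(doc_vecs @ query_vec)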