File size: 3,958 Bytes
bb04c5f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# indexer/embedder.py

import yaml
from sentence_transformers import SentenceTransformer


class Embedder:
    """
    Load a sentence-transformer model and convert text into dense vectors.

    Model upgrade: all-MiniLM-L6-v2 -> BAAI/bge-small-en-v1.5

    Why BGE over MiniLM (figures are the original author's notes):
        - MiniLM   : general purpose, fast, 384-dim, NDCG ~0.65 on SciFact
        - BGE-small: retrieval-specific training, 384-dim, NDCG ~0.72 on SciFact
        - Same dimension (384), same API; only the model name changes
        - BGE uses a special instruction prefix for queries (not for documents):
          "Represent this sentence for searching relevant passages: {query}"
          The prefix is added automatically in embed_single().

    Normalization policy:
        Chunk and query embeddings are L2-normalized if and only if a BGE
        model is in use, so both sides of a search always live on the same
        scale regardless of the configured model.
    """

    # BGE query instruction prefix; improves retrieval accuracy.
    # Applied to queries only, NOT to document chunks during indexing.
    BGE_QUERY_PREFIX = "Represent this sentence for searching relevant passages: "

    def __init__(self, config_path="config.yaml"):
        """
        Load the config and initialize the embedding model.

        Args:
            config_path (str): path to a YAML file containing the key
                "embedding_model" (the model name to load).

        Raises:
            OSError: if the config file cannot be opened.
            KeyError: if "embedding_model" is missing from the config.
        """
        # Explicit encoding so the config parses the same on every platform.
        with open(config_path, "r", encoding="utf-8") as f:
            config = yaml.safe_load(f)

        model_name = config["embedding_model"]
        self.model_name = model_name

        # BGE models are detected by name; they need the query prefix and
        # normalized embeddings.
        self.is_bge = "bge" in model_name.lower()

        print(f"Loading embedding model '{model_name}'...")
        self.model = SentenceTransformer(model_name)
        print(f"Model loaded — BGE mode: {self.is_bge}")

    def embed_chunks(self, chunks):
        """
        Convert a list of text chunks into dense vector embeddings.

        Used during INDEXING; no query prefix is applied here.

        Args:
            chunks (list[str]): text strings to embed.

        Returns:
            numpy.ndarray: shape (num_chunks, embedding_dim);
                384 dimensions for both MiniLM and BGE-small.
        """
        embeddings = self.model.encode(
            chunks,
            batch_size=64,
            show_progress_bar=False,
            normalize_embeddings=self.is_bge,  # BGE needs L2 normalization
        )
        return embeddings

    def embed_single(self, text):
        """
        Embed a single query string.

        Used during SEARCH; the BGE prefix is applied here when a BGE model
        is in use.

        Why the prefix goes only on queries:
            BGE was trained with this asymmetric setup. Documents are indexed
            as-is, while queries get the instruction prefix so the model knows
            it is searching for relevant passages, not matching exact text.

        Args:
            text (str): a single query string.

        Returns:
            numpy.ndarray: one embedding vector (384 dimensions).
        """
        if self.is_bge:
            text = self.BGE_QUERY_PREFIX + text

        return self.model.encode(
            text,
            # Must match embed_chunks(): normalizing the query but not the
            # indexed chunks (or vice versa) puts them on different scales.
            normalize_embeddings=self.is_bge,
        )


if __name__ == "__main__":
    # Manual smoke test: embed a few sample chunks and one query using the
    # default config, then print the resulting vector sizes.
    demo_embedder = Embedder()

    sample_texts = [
        "The quarterly budget report shows increased spending",
        "Machine learning models can understand text semantics",
        "The cat sat on the mat and looked out the window",
    ]

    print("Embedding 3 test chunks...")
    chunk_vectors = demo_embedder.embed_chunks(sample_texts)
    first_vector = chunk_vectors[0]
    print(f"Got {len(chunk_vectors)} vectors")
    print(f"Each vector has {len(first_vector)} dimensions")
    print(f"First vector (first 5 values): {first_vector[:5]}")

    print("\n--- Single query embedding ---")
    query_vector = demo_embedder.embed_single("budget spending report")
    print(f"Query vector: {len(query_vector)} dimensions")