Spaces:
Sleeping
Sleeping
File size: 3,958 Bytes
bb04c5f | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 | # indexer/embedder.py
import yaml
from sentence_transformers import SentenceTransformer
class Embedder:
    """
    Load a sentence-transformer model and convert text into dense
    vector embeddings.

    Model upgrade: all-MiniLM-L6-v2 -> BAAI/bge-small-en-v1.5

    Why BGE over MiniLM:
    - MiniLM   : general purpose, fast, 384-dim, NDCG ~0.65 on SciFact
    - BGE-small: retrieval-specific training, 384-dim, NDCG ~0.72 on SciFact
    - Same dimension (384), same API - only the model name changes
    - BGE uses a special instruction prefix for queries (not for documents):
        "Represent this sentence for searching relevant passages: {query}"
      This is handled automatically in embed_single()

    Attributes set by __init__:
        model_name - the value of "embedding_model" from the config
        is_bge     - True when "bge" occurs in the model name (case-insensitive)
        model      - the loaded SentenceTransformer instance
    """

    # BGE query instruction prefix - improves retrieval accuracy.
    # Applied to queries only, NOT to document chunks during indexing.
    BGE_QUERY_PREFIX = "Represent this sentence for searching relevant passages: "

    def __init__(self, config_path: str = "config.yaml") -> None:
        """
        Load the config and initialize the embedding model.

        Args:
            config_path (str) - path to a YAML file that defines
                                the "embedding_model" key

        Raises:
            KeyError - if the config does not define "embedding_model"
                       (including an empty or non-mapping YAML file)
        """
        # Explicit encoding so the file is read the same way on every platform.
        with open(config_path, "r", encoding="utf-8") as f:
            config = yaml.safe_load(f)

        # safe_load returns None for an empty file and may return a list or
        # scalar; report all of those as a missing key instead of letting an
        # unrelated TypeError escape from the subscript below.
        if not isinstance(config, dict) or "embedding_model" not in config:
            raise KeyError(
                f"'embedding_model' is not defined in config file {config_path!r}"
            )

        model_name = config["embedding_model"]
        self.model_name = model_name

        # Detect whether we are using a BGE model.
        # BGE models need a special prefix on queries (not on documents).
        self.is_bge = "bge" in model_name.lower()

        print(f"Loading embedding model '{model_name}'...")
        self.model = SentenceTransformer(model_name)
        print(f"Model loaded - BGE mode: {self.is_bge}")

    def embed_chunks(self, chunks):
        """
        Convert a list of text chunks into dense vector embeddings.

        Used during INDEXING - no query prefix is applied here.

        Args:
            chunks (list[str]) - list of text strings to embed

        Returns:
            numpy.ndarray - shape (num_chunks, embedding_dim);
                            384 dimensions for both MiniLM and BGE-small
        """
        return self.model.encode(
            chunks,
            batch_size=64,
            show_progress_bar=False,
            normalize_embeddings=self.is_bge,  # BGE needs L2 normalization
        )

    def embed_single(self, text: str):
        """
        Embed a single query string.

        Used during SEARCH - the BGE prefix is applied here when a BGE
        model is loaded.

        Why prefix only on queries:
            BGE was trained with this asymmetric setup.
            Documents are indexed as-is.
            Queries get the instruction prefix so the model knows
            it is searching for relevant passages, not matching exact text.

        Args:
            text (str) - a single query string

        Returns:
            numpy.ndarray - one embedding vector (384 dimensions)
        """
        query = self.BGE_QUERY_PREFIX + text if self.is_bge else text

        # Use the same normalization rule as embed_chunks() so that query
        # vectors live in the same space as the indexed document vectors
        # (always normalized for BGE, left as-is for other models).
        return self.model.encode(
            query,
            normalize_embeddings=self.is_bge,
        )
if __name__ == "__main__":
    # Manual smoke test: embed a few sample sentences plus one query and
    # print the resulting sizes so the setup can be checked by eye.
    demo = Embedder()

    samples = [
        "The quarterly budget report shows increased spending",
        "Machine learning models can understand text semantics",
        "The cat sat on the mat and looked out the window",
    ]

    print("Embedding 3 test chunks...")
    matrix = demo.embed_chunks(samples)
    print(f"Got {len(matrix)} vectors")

    head = matrix[0]
    print(f"Each vector has {len(head)} dimensions")
    print(f"First vector (first 5 values): {head[:5]}")

    print("\n--- Single query embedding ---")
    query_dim = len(demo.embed_single("budget spending report"))
    print(f"Query vector: {query_dim} dimensions")