# ============================================================
# FILE: src/embeddings.py
# ============================================================
# PURPOSE:
# Convert text into embedding vectors.
#
# WHAT IS AN EMBEDDING?
# An embedding is a list of numbers that represents the meaning of text.
#
# Example:
#   "refund policy" -> [0.12, -0.04, 0.88, ...]
#
# Similar text should have similar vectors.
#
# The same embedding model must be used for:
# - indexing documents
# - querying documents
#
# If you change the embedding model, rebuild the vector database.
# ============================================================

from typing import List

from sentence_transformers import SentenceTransformer


class EmbeddingModel:
    """
    Wrapper around SentenceTransformer.

    This keeps embedding logic separate from the vector database logic.

    Attributes:
        model_name: Name passed to SentenceTransformer when loading.
        device: Device string passed to SentenceTransformer (e.g. "cpu").
        model: The loaded SentenceTransformer instance.
    """

    def __init__(self, model_name: str, device: str = "cpu") -> None:
        """
        Load the embedding model.

        Args:
            model_name: Model identifier handed to SentenceTransformer.
            device: Device to load the model on. The default 'cpu'
                needs no GPU and suits a local machine.
        """
        self.model_name = model_name
        self.device = device
        self.model = SentenceTransformer(model_name, device=device)

    def embed_texts(
        self,
        texts: List[str],
        *,
        show_progress_bar: bool = True,
    ) -> List[List[float]]:
        """
        Convert a list of texts into embedding vectors.

        normalize_embeddings=True:
        - useful for cosine similarity style retrieval
        - makes vector comparison more stable

        Args:
            texts: Texts to embed. An empty list short-circuits and
                returns an empty list without calling the model.
            show_progress_bar: Forwarded to the model's ``encode`` call.
                Defaults to True (helpful for bulk indexing); pass False
                for small or interactive calls to keep output quiet.

        Returns:
            One embedding (a list of floats) per input text, in the same
            order as ``texts``.
        """
        if not texts:
            return []

        embeddings = self.model.encode(
            texts,
            normalize_embeddings=True,
            show_progress_bar=show_progress_bar,
        )

        # encode() is expected to return an array-like exposing .tolist()
        # (a NumPy array with sentence-transformers' defaults); convert to
        # plain Python lists so callers need no array dependency.
        return embeddings.tolist()

    def embed_query(self, query: str) -> List[float]:
        """
        Convert one user question into one embedding vector.

        Args:
            query: The question text to embed.

        Returns:
            The embedding of ``query`` as a list of floats.
        """
        # A single query does not need a progress bar; suppress it so
        # interactive querying does not print a bar on every call.
        return self.embed_texts([query], show_progress_bar=False)[0]