""" Embedding generation module Converts text chunks into vector embeddings """ import logging from typing import List import numpy as np try: from sentence_transformers import SentenceTransformer SENTENCE_TRANSFORMERS_AVAILABLE = True except ImportError: SENTENCE_TRANSFORMERS_AVAILABLE = False from .config import EMBEDDING_MODEL, EMBEDDING_BATCH_SIZE logger = logging.getLogger(__name__) class EmbeddingGenerator: """Generates embeddings for text chunks using sentence-transformers""" def __init__(self, model_name: str = EMBEDDING_MODEL): """ Initialize embedding generator Args: model_name: Name of the sentence-transformers model to use """ if not SENTENCE_TRANSFORMERS_AVAILABLE: raise ImportError( "sentence-transformers not installed. " "Install with: pip install sentence-transformers" ) logger.info(f"Loading embedding model: {model_name}") self.model_name = model_name self.model = SentenceTransformer(model_name) self.embedding_dim = self.model.get_sentence_embedding_dimension() logger.info(f"Model loaded successfully. Embedding dimension: {self.embedding_dim}") def generate_embedding(self, text: str) -> np.ndarray: """ Generate embedding for a single text Args: text: Input text Returns: Numpy array of embedding vector """ return self.model.encode(text, convert_to_numpy=True) def generate_embeddings_batch( self, texts: List[str], batch_size: int = EMBEDDING_BATCH_SIZE, show_progress: bool = True ) -> np.ndarray: """ Generate embeddings for multiple texts efficiently Args: texts: List of input texts batch_size: Batch size for processing show_progress: Whether to show progress bar Returns: Numpy array of shape (len(texts), embedding_dim) """ logger.info(f"Generating embeddings for {len(texts)} texts in batches of {batch_size}") embeddings = self.model.encode( texts, batch_size=batch_size, show_progress_bar=show_progress, convert_to_numpy=True ) logger.info(f"Generated {len(embeddings)} embeddings of dimension {self.embedding_dim}") return embeddings def get_embedding_dimension(self) -> int: """Get the dimension of embeddings produced by this model""" return self.embedding_dim