Spaces:
Sleeping
Sleeping
| """ | |
| Embedding generation for RAG system. | |
| Handles text-to-vector conversion using sentence-transformers. | |
| """ | |
| from typing import List, Union | |
| import logging | |
| from sentence_transformers import SentenceTransformer | |
| import numpy as np | |
| logger = logging.getLogger(__name__) | |
| class EmbeddingGenerator: | |
| """ | |
| Generates embeddings for text using sentence-transformers. | |
| Features: | |
| - Batch processing for efficiency | |
| - Caching of model | |
| - Normalized embeddings for cosine similarity | |
| """ | |
| def __init__(self, model_name: str = "sentence-transformers/all-MiniLM-L6-v2"): | |
| """ | |
| Initialize embedding generator. | |
| Args: | |
| model_name: HuggingFace model identifier | |
| """ | |
| self.model_name = model_name | |
| logger.info(f"Loading embedding model: {model_name}") | |
| try: | |
| self.model = SentenceTransformer(model_name) | |
| self.embedding_dim = self.model.get_sentence_embedding_dimension() | |
| logger.info(f"Model loaded. Embedding dimension: {self.embedding_dim}") | |
| except Exception as e: | |
| logger.error(f"Failed to load embedding model: {e}") | |
| raise | |
| def embed_text(self, text: Union[str, List[str]]) -> np.ndarray: | |
| """ | |
| Generate embeddings for text. | |
| Args: | |
| text: Single text string or list of strings | |
| Returns: | |
| Numpy array of embeddings (shape: [n_texts, embedding_dim]) | |
| """ | |
| if isinstance(text, str): | |
| text = [text] | |
| if not text: | |
| raise ValueError("No text provided for embedding") | |
| try: | |
| # Generate embeddings | |
| embeddings = self.model.encode( | |
| text, | |
| normalize_embeddings=True, # For cosine similarity | |
| show_progress_bar=len(text) > 10, | |
| batch_size=32 | |
| ) | |
| logger.debug(f"Generated embeddings for {len(text)} texts") | |
| return embeddings | |
| except Exception as e: | |
| logger.error(f"Embedding generation failed: {e}") | |
| raise | |
| def embed_query(self, query: str) -> np.ndarray: | |
| """ | |
| Generate embedding for a single query. | |
| Args: | |
| query: Query text | |
| Returns: | |
| 1D numpy array of embedding | |
| """ | |
| embedding = self.embed_text(query) | |
| return embedding[0] # Return single embedding | |
| def embed_documents(self, documents: List[str]) -> np.ndarray: | |
| """ | |
| Generate embeddings for a batch of documents. | |
| Args: | |
| documents: List of document texts | |
| Returns: | |
| 2D numpy array of embeddings | |
| """ | |
| return self.embed_text(documents) | |
| def create_embedding_generator(model_name: str = None) -> EmbeddingGenerator: | |
| """ | |
| Factory function to create embedding generator. | |
| Args: | |
| model_name: Optional model name override | |
| Returns: | |
| EmbeddingGenerator instance | |
| """ | |
| from src.config import settings | |
| model = model_name or settings.embedding_model | |
| return EmbeddingGenerator(model_name=model) | |