Spaces:
Runtime error
Runtime error
| """ | |
| Embedding model abstraction for converting text chunks into vector representations. | |
| Provides a pluggable interface for different embedding models, with a default | |
| implementation using sentence-transformers. | |
| """ | |
| from abc import ABC, abstractmethod | |
| from typing import List | |
| import numpy as np | |
class EmbeddingModel(ABC):
    """
    Abstract base class for embedding models.

    This abstraction allows for easy swapping of different embedding models
    without changing the retrieval system. Subclasses must implement
    ``embed`` and ``embedding_dim``; ``embed_single`` has a default
    implementation built on ``embed`` and may be overridden.
    """

    @abstractmethod
    def embed(self, texts: List[str]) -> np.ndarray:
        """
        Embed a list of text strings into vector representations.

        Args:
            texts: List of text strings to embed

        Returns:
            numpy array of shape (len(texts), embedding_dim)
        """

    def embed_single(self, text: str) -> np.ndarray:
        """
        Embed a single text string.

        The default implementation embeds a one-element batch through
        ``embed`` and returns its first row.

        Args:
            text: Text string to embed

        Returns:
            numpy array of shape (embedding_dim,)
        """
        return self.embed([text])[0]

    @abstractmethod
    def embedding_dim(self) -> int:
        """Return the dimensionality of the embeddings."""
class SentenceTransformerEmbedding(EmbeddingModel):
    """
    Embedding model using sentence-transformers library.

    This is a popular choice for semantic similarity tasks and works well
    for code and documentation embedding.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize the sentence transformer model.

        Args:
            model_name: Name of the pre-trained model to use.
                Default is 'all-MiniLM-L6-v2' which is lightweight
                and performs well for general-purpose embeddings.

        Raises:
            ImportError: If the sentence-transformers package cannot be
                imported.
        """
        # Guard only the import: an ImportError raised while constructing
        # the model must surface as-is rather than be reported as a missing
        # sentence-transformers installation.
        try:
            from sentence_transformers import SentenceTransformer
        except ImportError as err:
            raise ImportError(
                "sentence-transformers is required for SentenceTransformerEmbedding. "
                "Install it with: pip install sentence-transformers"
            ) from err

        self.model = SentenceTransformer(model_name)
        # Cached once so embedding_dim() does not have to query the model.
        self._embedding_dim = self.model.get_sentence_embedding_dimension()

    def embed(self, texts: List[str]) -> np.ndarray:
        """
        Embed multiple texts.

        Args:
            texts: List of text strings to embed

        Returns:
            The model's encoding of ``texts`` as a numpy array, one row
            per input text.
        """
        return self.model.encode(texts, convert_to_numpy=True, show_progress_bar=False)

    def embed_single(self, text: str) -> np.ndarray:
        """
        Embed a single text.

        Args:
            text: Text string to embed

        Returns:
            The first (only) row of the encoding of a one-element batch.
        """
        return self.model.encode([text], convert_to_numpy=True, show_progress_bar=False)[0]

    def embedding_dim(self) -> int:
        """Return embedding dimensionality as reported by the loaded model."""
        return self._embedding_dim
class SimpleEmbedding(EmbeddingModel):
    """
    Simple TF-IDF based embedding for testing or lightweight use.

    This implementation avoids heavyweight model dependencies (it relies on
    scikit-learn's TfidfVectorizer) and can be used as a fallback when more
    sophisticated models are not available.
    """

    def __init__(self, max_features: int = 384):
        """
        Initialize TF-IDF based embedding.

        Args:
            max_features: Maximum number of features (upper bound on the
                embedding dimension)
        """
        from sklearn.feature_extraction.text import TfidfVectorizer

        self.vectorizer = TfidfVectorizer(
            max_features=max_features,
            stop_words='english',
            ngram_range=(1, 2)
        )
        # Until the vectorizer is fitted, the best available answer for the
        # dimension is the configured upper bound.
        self._embedding_dim = max_features
        self._is_fitted = False

    def fit(self, texts: List[str]) -> None:
        """
        Fit the TF-IDF vectorizer on a corpus.

        Must be called before embed_single(); embed() fits automatically
        on its first batch if this has not been called.

        Args:
            texts: Corpus of texts to fit the vectorizer
        """
        self.vectorizer.fit(texts)
        # max_features only caps the vocabulary; a small corpus yields fewer
        # features, so report the width of the vectors actually produced.
        self._embedding_dim = len(self.vectorizer.vocabulary_)
        self._is_fitted = True

    def embed(self, texts: List[str]) -> np.ndarray:
        """
        Embed multiple texts using TF-IDF.

        If the vectorizer has not been fitted yet, it is fitted on
        ``texts`` first.

        Args:
            texts: List of text strings to embed

        Returns:
            numpy array of shape (len(texts), embedding_dim)
        """
        # An empty batch offers nothing to fit on or transform; return an
        # empty matrix of the right width instead of erroring.
        if len(texts) == 0:
            return np.empty((0, self._embedding_dim))
        if not self._is_fitted:
            # Auto-fit on the provided texts
            self.fit(texts)
        return self.vectorizer.transform(texts).toarray()

    def embed_single(self, text: str) -> np.ndarray:
        """
        Embed a single text using TF-IDF.

        Args:
            text: Text string to embed

        Returns:
            numpy array of shape (embedding_dim,)

        Raises:
            RuntimeError: If the vectorizer has not been fitted.
        """
        if not self._is_fitted:
            raise RuntimeError("SimpleEmbedding must be fitted before use. Call fit() first.")
        return self.vectorizer.transform([text]).toarray()[0]

    def embedding_dim(self) -> int:
        """
        Return embedding dimensionality.

        Before fitting this is ``max_features``; after fitting it is the
        size of the learned vocabulary.
        """
        return self._embedding_dim