Spaces:
Sleeping
Sleeping
| import time | |
| import numpy as np | |
| from tqdm import tqdm | |
| from openai import OpenAI | |
| from typing import List, Dict, Any, Optional | |
| from src.utils.config import EMBEDDING_MODEL, EMBEDDING_BATCH_SIZE, OPENAI_API_KEY | |
class TextEmbedder:
    """Generate embeddings for document chunks and queries via OpenAI's embeddings API.

    API failures never propagate out of the public embedding methods: the
    error is printed and zero vectors of length ``embedding_dim`` are
    substituted, so the output always lines up one-to-one with the input.
    """

    # Output dimension of each known model; any other model name falls back
    # to _DEFAULT_DIMENSION.
    _MODEL_DIMENSIONS = {
        "text-embedding-3-small": 1536,
        "text-embedding-3-large": 3072,
        "text-embedding-ada-002": 1536,
        # Add other models if needed
    }
    _DEFAULT_DIMENSION = 1536

    # Pause inserted between consecutive batch requests to avoid rate limits.
    _BATCH_PAUSE_SECONDS = 0.2

    def __init__(self, model: str = EMBEDDING_MODEL, batch_size: int = EMBEDDING_BATCH_SIZE):
        """
        Initialize the TextEmbedder with the specified embedding model and batch size.

        Args:
            model: The OpenAI embedding model to use
            batch_size: Number of chunks to embed per API call (must be >= 1)

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        # A negative step makes range() empty (every text silently dropped)
        # and a zero step raises an opaque error mid-run, so fail fast here.
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

        self.model = model
        self.batch_size = batch_size
        self.client = OpenAI(api_key=OPENAI_API_KEY)
        # Default dimension for different models
        self.embedding_dim = self._get_model_dimension(model)
        print(f"Initialized TextEmbedder with model {model}, dimension {self.embedding_dim}")

    def _get_model_dimension(self, model_name: str) -> int:
        """Return the embedding dimension for ``model_name`` (1536 if unknown)."""
        return self._MODEL_DIMENSIONS.get(model_name, self._DEFAULT_DIMENSION)

    def set_dimension(self, dimension: int) -> None:
        """
        Set the embedding dimension explicitly.

        Use this to ensure compatibility with existing FAISS indices.

        Args:
            dimension: Length used for placeholder vectors and for the
                truncate/pad step applied to query embeddings.

        Raises:
            ValueError: If ``dimension`` is smaller than 1.
        """
        if dimension < 1:
            raise ValueError(f"dimension must be a positive integer, got {dimension}")
        self.embedding_dim = dimension
        print(f"Explicitly set embedding dimension to {dimension}")

    def _fit_to_dimension(self, embedding: np.ndarray) -> np.ndarray:
        """Truncate or zero-pad a 1-D vector to ``self.embedding_dim``, printing what was done."""
        actual_dim = embedding.shape[0]
        if actual_dim == self.embedding_dim:
            return embedding

        print(f"Warning: OpenAI returned embedding of dimension {actual_dim}, expected {self.embedding_dim}")
        if actual_dim > self.embedding_dim:
            # NOTE(review): the truncated vector is not re-normalized here;
            # confirm the similarity metric of the target index tolerates that.
            print(f"Truncating embedding from {actual_dim} to {self.embedding_dim}")
            return embedding[:self.embedding_dim]

        print(f"Padding embedding from {actual_dim} to {self.embedding_dim}")
        padding = np.zeros(self.embedding_dim - actual_dim, dtype='float32')
        return np.concatenate([embedding, padding])

    def get_embedding_for_text(self, text: str) -> List[float]:
        """
        Generate embedding for a single text.

        Args:
            text: The text to embed

        Returns:
            The embedding exactly as returned by the API, or a zero vector of
            length ``embedding_dim`` if the request fails.
        """
        try:
            response = self.client.embeddings.create(
                input=[text],
                model=self.model
            )
            return response.data[0].embedding
        except Exception as e:
            # Best-effort by design: report the failure and hand back a placeholder.
            print(f"Error generating embedding: {e}")
            return [0.0] * self.embedding_dim

    def get_embeddings_for_texts(self, texts: List[str]) -> List[List[float]]:
        """
        Compute embeddings for a list of texts using batched API calls.

        Each batch is committed atomically: either every text in it receives
        its embedding, or (on any failure) every text in it receives a zero
        vector of length ``embedding_dim``. The result therefore always has
        exactly ``len(texts)`` entries, in input order.

        Args:
            texts: List of text chunks to embed

        Returns:
            List of embedding vectors
        """
        embeddings: List[List[float]] = []
        total = len(texts)

        for start in tqdm(range(0, total, self.batch_size), desc="Embedding chunks"):
            batch = texts[start:start + self.batch_size]
            try:
                response = self.client.embeddings.create(
                    input=batch,
                    model=self.model
                )
                # Order by the reported input index so vectors line up with
                # `batch`; sorted() is stable, so items lacking an index keep
                # their response order.
                items = sorted(response.data, key=lambda item: getattr(item, "index", 0))
                if len(items) != len(batch):
                    raise ValueError(
                        f"expected {len(batch)} embeddings, received {len(items)}"
                    )
                batch_embeddings = [item.embedding for item in items]
            except Exception as e:
                print(f"Error embedding batch starting at index {start}: {e}")
                # Placeholder zero vectors for every text of the failed batch
                batch_embeddings = [[0.0] * self.embedding_dim for _ in batch]

            embeddings.extend(batch_embeddings)

            # Brief pause to avoid rate limits; pointless after the final batch.
            if start + self.batch_size < total:
                time.sleep(self._BATCH_PAUSE_SECONDS)

        return embeddings

    def get_query_embedding(self, query: str) -> np.ndarray:
        """
        Generate embedding for a query string and return as numpy array.

        The vector is truncated or zero-padded to ``embedding_dim`` when the
        API returns a different length.

        Args:
            query: The query text to embed

        Returns:
            float32 numpy array of shape ``(1, embedding_dim)``; all zeros if
            the request fails.
        """
        try:
            q_response = self.client.embeddings.create(
                input=[query],
                model=self.model
            )
            embedding = np.array(q_response.data[0].embedding, dtype='float32')
            # Return the embedding as a 2D array
            return self._fit_to_dimension(embedding).reshape(1, -1)
        except Exception as e:
            print(f"Error creating embedding for query: {e}")
            import traceback
            traceback.print_exc()
            return np.zeros((1, self.embedding_dim), dtype='float32')