import os
import time
from typing import Any, Dict, List, Optional

import numpy as np
from openai import OpenAI
from tqdm import tqdm

# Get API key from environment variable
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
EMBEDDING_MODEL = "text-embedding-3-small"
EMBEDDING_BATCH_SIZE = 10

# Default output sizes of the OpenAI embedding models, used to size the
# zero-vector placeholders that stand in for failed requests.
_MODEL_DIMENSIONS = {
    "text-embedding-3-small": 1536,
    "text-embedding-3-large": 3072,
    "text-embedding-ada-002": 1536,
}
# Fallback size for models that are not listed above.
_DEFAULT_EMBEDDING_DIM = 1536
# Pause between consecutive batch requests, in seconds.
_BATCH_PAUSE_SECONDS = 0.2


class TextEmbedder:
    """Class for generating embeddings for document chunks using OpenAI's embeddings API.

    All public methods are best-effort: when a request fails, the error is
    printed and zero vectors of length ``embedding_dim`` are returned in place
    of the missing embeddings, so outputs always line up with inputs.
    """

    def __init__(self, model: str = EMBEDDING_MODEL, batch_size: int = EMBEDDING_BATCH_SIZE):
        """
        Initialize the TextEmbedder with the specified embedding model and batch size.

        Args:
            model: The OpenAI embedding model to use
            batch_size: Number of chunks to embed per API call

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        # A zero step makes range() raise later and a negative step silently
        # yields no batches at all, so reject both up front.
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

        self.model = model
        self.batch_size = batch_size
        self.client = OpenAI(api_key=OPENAI_API_KEY)
        # Size of the placeholder vectors; looked up per model instead of
        # assuming every model produces 1536 values. It is refreshed from the
        # first successful response (see _request_embeddings).
        self.embedding_dim = _MODEL_DIMENSIONS.get(model, _DEFAULT_EMBEDDING_DIM)

    def _request_embeddings(self, inputs: List[str]) -> List[List[float]]:
        """Call the embeddings endpoint once and return one vector per input.

        Args:
            inputs: Texts to send in a single request

        Returns:
            The embedding vectors, in the order the response lists them

        Raises:
            ValueError: If the response holds a different number of vectors
                than inputs were sent.
            Exception: Whatever the client raises for a failed request.
        """
        response = self.client.embeddings.create(
            input=inputs,
            model=self.model
        )
        vectors = [item.embedding for item in response.data]
        # A short or long response would shift every later vector onto the
        # wrong text, so treat it as a failure of the whole request.
        if len(vectors) != len(inputs):
            raise ValueError(
                f"expected {len(inputs)} embeddings, received {len(vectors)}"
            )
        if vectors:
            # Keep the placeholder size in sync with what the model really returns.
            self.embedding_dim = len(vectors[0])
        return vectors

    def get_embedding_for_text(self, text: str) -> List[float]:
        """
        Generate embedding for a single text.

        Args:
            text: The text to embed

        Returns:
            The embedding vector, or a zero vector of length
            ``embedding_dim`` if the request fails
        """
        try:
            return self._request_embeddings([text])[0]
        except Exception as e:
            print(f"Error generating embedding: {e}")
            return [0.0] * self.embedding_dim

    def get_embeddings_for_texts(self, texts: List[str]) -> List[List[float]]:
        """
        Compute embeddings for a list of texts using batched API calls.

        Args:
            texts: List of text chunks to embed

        Returns:
            List of embedding vectors, one per input text and in the same
            order; texts belonging to a failed batch get zero vectors
        """
        if not texts:
            return []

        embeddings: List[List[float]] = []
        total = len(texts)
        for i in tqdm(range(0, total, self.batch_size), desc="Embedding chunks"):
            batch = texts[i:i + self.batch_size]
            try:
                embeddings.extend(self._request_embeddings(batch))
            except Exception as e:
                print(f"Error embedding batch starting at index {i}: {e}")
                # Append placeholder zero vectors for failed texts
                embeddings.extend([0.0] * self.embedding_dim for _ in batch)

            # Brief pause to avoid rate limits; pointless after the final batch.
            if i + self.batch_size < total:
                time.sleep(_BATCH_PAUSE_SECONDS)

        return embeddings

    def get_query_embedding(self, query: str) -> np.ndarray:
        """
        Generate embedding for a query string and return as numpy array.

        Args:
            query: The query text to embed

        Returns:
            float32 numpy array of shape (1, dim) holding the embedding, or
            zeros of shape (1, embedding_dim) if the request fails
        """
        try:
            vector = self._request_embeddings([query])[0]
            return np.array(vector, dtype='float32').reshape(1, -1)
        except Exception as e:
            print(f"Error creating embedding for query: {e}")
            return np.zeros((1, self.embedding_dim), dtype='float32')