| | import time |
| | import openai |
| | import numpy as np |
| | from tqdm import tqdm |
| | from typing import List, Mapping, Any |
| | from bertopic.backend import BaseEmbedder |
| |
|
| |
|
class OpenAIBackend(BaseEmbedder):
    """ OpenAI Embedding Model

    Arguments:
        client: A `openai.OpenAI` client.
        embedding_model: An OpenAI model. Default is
                         "text-embedding-ada-002".
                         For an overview of models see:
                         https://platform.openai.com/docs/models/embeddings
        delay_in_seconds: If a `batch_size` is given, use this to set
                          the delay in seconds between batches.
        batch_size: The size of each batch. If `None`, all documents are
                    sent in a single request.
        generator_kwargs: Kwargs passed to `client.embeddings.create`.
                          Can be used to define custom engines or
                          deployment_ids. A "model" entry takes precedence
                          over `embedding_model`; if neither "model" nor
                          "engine" is given, `embedding_model` is passed
                          as "model".

    Examples:

    ```python
    import openai
    from bertopic.backend import OpenAIBackend

    client = openai.OpenAI(api_key="sk-...")
    openai_embedder = OpenAIBackend(client, "text-embedding-ada-002")
    ```
    """
    def __init__(self,
                 client: openai.OpenAI,
                 embedding_model: str = "text-embedding-ada-002",
                 delay_in_seconds: float = None,
                 batch_size: int = None,
                 generator_kwargs: Mapping[str, Any] = None):
        super().__init__()
        self.client = client
        self.embedding_model = embedding_model
        self.delay_in_seconds = delay_in_seconds
        self.batch_size = batch_size

        # Always work on a private copy: "model" may be written into this
        # dict below, and that write must neither leak into other instances
        # (as it would with a shared default dict) nor modify the mapping
        # the caller handed in.
        self.generator_kwargs = dict(generator_kwargs) if generator_kwargs else {}

        if self.generator_kwargs.get("model"):
            # An explicit "model" kwarg wins over `embedding_model`.
            self.embedding_model = self.generator_kwargs.get("model")
        elif not self.generator_kwargs.get("engine"):
            # No "model" and no "engine" supplied: fall back to
            # `embedding_model` so every request names a model.
            self.generator_kwargs["model"] = self.embedding_model

    def embed(self,
              documents: List[str],
              verbose: bool = False) -> np.ndarray:
        """ Embed a list of n documents/words into a matrix of embeddings

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process; the batch
                     progress bar is only shown when `True`

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        # Empty strings are swapped for a single space before being sent
        # (presumably because the API rejects empty inputs -- TODO confirm).
        prepared_documents = [" " if doc == "" else doc for doc in documents]

        # Batch-wise embedding, optionally pausing after every request
        if self.batch_size is not None:
            embeddings = []
            for batch in tqdm(self._chunks(prepared_documents), disable=not verbose):
                response = self.client.embeddings.create(input=batch, **self.generator_kwargs)
                embeddings.extend([r.embedding for r in response.data])

                # Delay between requests, only when a non-zero delay is set
                if self.delay_in_seconds:
                    time.sleep(self.delay_in_seconds)

        # Single request containing all documents
        else:
            response = self.client.embeddings.create(input=prepared_documents, **self.generator_kwargs)
            embeddings = [r.embedding for r in response.data]
        return np.array(embeddings)

    def _chunks(self, documents):
        """ Yield consecutive slices of `documents`, each holding at most
        `self.batch_size` items """
        for i in range(0, len(documents), self.batch_size):
            yield documents[i:i + self.batch_size]
| |
|