| | import time |
| | import numpy as np |
| | from tqdm import tqdm |
| | from typing import Any, List, Mapping |
| | from bertopic.backend import BaseEmbedder |
| |
|
| |
|
class CohereBackend(BaseEmbedder):
    """ Cohere Embedding Model

    Arguments:
        client: A `cohere` client.
        embedding_model: A Cohere model. Default is "large".
                         For an overview of models see:
                         https://docs.cohere.ai/docs/generation-card
        delay_in_seconds: If a `batch_size` is given, use this to set
                          the delay in seconds between batches.
        batch_size: The size of each batch.
        embed_kwargs: Kwargs passed to `cohere.Client.embed`.
                      Can be used to define additional parameters
                      such as `input_type`

    Examples:

    ```python
    import cohere
    from bertopic.backend import CohereBackend

    client = cohere.Client("APIKEY")
    cohere_model = CohereBackend(client)
    ```

    If you want to specify `input_type`:

    ```python
    cohere_model = CohereBackend(
        client,
        embedding_model="embed-english-v3.0",
        embed_kwargs={"input_type": "clustering"}
    )
    ```
    """
    def __init__(self,
                 client,
                 embedding_model: str = "large",
                 delay_in_seconds: float = None,
                 batch_size: int = None,
                 embed_kwargs: Mapping[str, Any] = {}):
        super().__init__()
        self.client = client
        self.embedding_model = embedding_model
        self.delay_in_seconds = delay_in_seconds
        self.batch_size = batch_size
        # Copy defensively: the original reference may be the shared
        # mutable default `{}` (or a caller-owned dict), and we mutate
        # it below by injecting the "model" key. Without the copy, the
        # first instance's model name would leak into every later
        # instance created with the default, and callers' dicts would
        # be modified behind their backs.
        self.embed_kwargs = dict(embed_kwargs)

        # An explicit "model" inside `embed_kwargs` takes precedence over
        # `embedding_model`; otherwise inject the configured model so every
        # `client.embed(...)` call below automatically uses it.
        if self.embed_kwargs.get("model"):
            self.embedding_model = self.embed_kwargs["model"]
        else:
            self.embed_kwargs["model"] = self.embedding_model

    def embed(self,
              documents: List[str],
              verbose: bool = False) -> np.ndarray:
        """ Embed a list of n documents/words into an n-dimensional
        matrix of embeddings

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        if self.batch_size is not None:
            # Batched path: send `batch_size`-sized chunks, optionally
            # sleeping between requests to respect API rate limits.
            embeddings = []
            for batch in tqdm(self._chunks(documents), disable=not verbose):
                response = self.client.embed(batch, **self.embed_kwargs)
                embeddings.extend(response.embeddings)

                # Throttle between batches (applies after every batch,
                # including the last one).
                if self.delay_in_seconds:
                    time.sleep(self.delay_in_seconds)
        else:
            # Single-request path: embed all documents at once.
            response = self.client.embed(documents, **self.embed_kwargs)
            embeddings = response.embeddings
        return np.array(embeddings)

    def _chunks(self, documents):
        """ Yield successive `batch_size`-sized chunks of `documents`. """
        for i in range(0, len(documents), self.batch_size):
            yield documents[i:i + self.batch_size]
| |
|