| import numpy as np | |
| from tqdm import tqdm | |
| from typing import List | |
| from bertopic.backend import BaseEmbedder | |
class USEBackend(BaseEmbedder):
    """ Universal Sentence Encoder

    USE encodes text into high-dimensional vectors that
    are used for semantic similarity in BERTopic.

    Arguments:
        embedding_model: A USE embedding model

    Examples:

    ```python
    import tensorflow_hub
    from bertopic.backend import USEBackend

    embedding_model = tensorflow_hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
    use_embedder = USEBackend(embedding_model)
    ```
    """
    def __init__(self, embedding_model):
        """ Validate and store the embedding model

        The model is called once with a single test sentence; if that call
        raises a `TypeError` the model is rejected.

        Arguments:
            embedding_model: A callable that accepts a list of strings

        Raises:
            ValueError: If calling `embedding_model` raises a `TypeError`.
                        The original error is attached as `__cause__`.
        """
        super().__init__()

        # Keep the try body limited to the probing call so that only a
        # failure of the model itself is translated into a ValueError.
        try:
            embedding_model(["test sentence"])
        except TypeError as err:
            # Chain the original error so the real cause stays visible
            # in the traceback.
            raise ValueError("Please select a correct USE model: \n"
                             "`import tensorflow_hub` \n"
                             "`embedding_model = tensorflow_hub.load(path_to_model)`") from err

        # Only reached when the probe succeeded.
        self.embedding_model = embedding_model

    def embed(self,
              documents: List[str],
              verbose: bool = False) -> np.ndarray:
        """ Embed a list of n documents/words into a matrix of embeddings

        Each document is passed to the model on its own (as a
        one-element list) and the single resulting row is collected.

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process; when True a
                     tqdm progress bar is shown

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`. For an empty
            `documents` list the result is an empty array of shape (0,).
        """
        embeddings = np.array(
            [
                # `.cpu()` is invoked on the model output before converting
                # to NumPy -- presumably to get a host-memory copy; confirm
                # against the tensor type the model returns. `[0]` takes the
                # only row of the one-document batch.
                self.embedding_model([doc]).cpu().numpy()[0]
                for doc in tqdm(documents, disable=not verbose)
            ]
        )
        return embeddings