| import numpy as np |
| from tqdm import tqdm |
| from typing import List |
|
|
| from bertopic.backend import BaseEmbedder |
|
|
|
|
class USEBackend(BaseEmbedder):
    """ Universal Sentence Encoder

    USE encodes text into high-dimensional vectors that
    are used for semantic similarity in BERTopic.

    Arguments:
        embedding_model: An USE embedding model

    Examples:

    ```python
    import tensorflow_hub
    from bertopic.backend import USEBackend

    embedding_model = tensorflow_hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
    use_embedder = USEBackend(embedding_model)
    ```
    """
    def __init__(self, embedding_model):
        super().__init__()

        # Sanity-check that the model is callable on a list of strings
        # before accepting it, so a wrong object fails fast here instead
        # of deep inside `embed`.
        try:
            embedding_model(["test sentence"])
            self.embedding_model = embedding_model
        except TypeError as err:
            # Chain the original TypeError so the root cause stays visible.
            raise ValueError("Please select a correct USE model: \n"
                             "`import tensorflow_hub` \n"
                             "`embedding_model = tensorflow_hub.load(path_to_model)`") from err

    def embed(self,
              documents: List[str],
              verbose: bool = False) -> np.ndarray:
        """ Embed a list of n documents/words into an n-dimensional
        matrix of embeddings

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        # Embed in batches rather than one document per model call:
        # USE accepts a list of strings natively and each embedding is
        # independent of its batch, so the output is identical while the
        # number of (slow) forward passes drops by ~batch_size.
        batch_size = 64
        batches = [documents[i:i + batch_size]
                   for i in range(0, len(documents), batch_size)]

        # Empty input: mirror the original `np.array([])` result.
        if not batches:
            return np.array([])

        embeddings = [
            self.embedding_model(batch).cpu().numpy()
            for batch in tqdm(batches, disable=not verbose)
        ]
        return np.vstack(embeddings)
|
|