| import numpy as np | |
| from typing import List | |
| from bertopic.backend._base import BaseEmbedder | |
| from bertopic.backend._utils import select_backend | |
class WordDocEmbedder(BaseEmbedder):
    """ Embedder that pairs a document-level model with a word-level model

    Documents are embedded with `embedding_model` and individual words
    with `word_embedding_model`. Both models are passed through
    `select_backend` once, at construction time.
    """
    def __init__(self,
                 embedding_model,
                 word_embedding_model):
        super().__init__()

        # Resolve each model into a backend, document-level model first
        document_backend = select_backend(embedding_model)
        self.embedding_model = document_backend

        word_backend = select_backend(word_embedding_model)
        self.word_embedding_model = word_backend

    def embed_words(self,
                    words: List[str],
                    verbose: bool = False) -> np.ndarray:
        """ Embed a list of n words with the word-level backend

        Arguments:
            words: A list of words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Word embeddings with shape (n, m): one row per word,
            each row being an embedding of size `m`
        """
        word_backend = self.word_embedding_model
        word_embeddings = word_backend.embed(words, verbose)
        return word_embeddings

    def embed_documents(self,
                        document: List[str],
                        verbose: bool = False) -> np.ndarray:
        """ Embed a list of n documents with the document-level backend

        Arguments:
            document: A list of documents to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document embeddings with shape (n, m): one row per document,
            each row being an embedding of size `m`
        """
        document_backend = self.embedding_model
        document_embeddings = document_backend.embed(document, verbose)
        return document_embeddings