| import numpy as np |
| from tqdm import tqdm |
| from typing import List |
| from bertopic.backend import BaseEmbedder |
|
|
|
|
class SpacyBackend(BaseEmbedder):
    """ Spacy embedding model

    The Spacy embedding model used for generating document and
    word embeddings.

    Arguments:
        embedding_model: A spacy embedding model. Note that this must be an
                         already-instantiated spacy pipeline (`nlp`) object;
                         a plain model-name string is not accepted.

    Examples:

    To create a Spacy backend, you need to create an nlp object and
    pass it through this backend:

    ```python
    import spacy
    from bertopic.backend import SpacyBackend

    nlp = spacy.load("en_core_web_md", exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
    spacy_model = SpacyBackend(nlp)
    ```

    To load in a transformer model use the following:

    ```python
    import spacy
    from thinc.api import set_gpu_allocator, require_gpu
    from bertopic.backend import SpacyBackend

    nlp = spacy.load("en_core_web_trf", exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
    set_gpu_allocator("pytorch")
    require_gpu(0)
    spacy_model = SpacyBackend(nlp)
    ```

    If you run into gpu/memory-issues, please use:

    ```python
    import spacy
    from bertopic.backend import SpacyBackend

    spacy.prefer_gpu()
    nlp = spacy.load("en_core_web_trf", exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
    spacy_model = SpacyBackend(nlp)
    ```
    """
    def __init__(self, embedding_model):
        super().__init__()

        # Only an instantiated spacy pipeline is accepted. The duck-type check
        # on the type's repr avoids importing spacy at module level.
        if "spacy" in str(type(embedding_model)):
            self.embedding_model = embedding_model
        else:
            # NOTE: the previous message suggested a model-name string was a
            # valid argument, which the check above rejects — corrected here.
            raise ValueError("Please pass an instantiated Spacy pipeline, for example created "
                             "with: `nlp = spacy.load('en_core_web_md')`")

    def embed(self,
              documents: List[str],
              verbose: bool = False) -> np.ndarray:
        """ Embed a list of n documents/words into an n-dimensional
        matrix of embeddings

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        # Spacy models cannot embed an empty string, so substitute a
        # single space for any empty/falsy document.
        empty_document = " "

        embeddings = []
        for doc in tqdm(documents, position=0, leave=True, disable=not verbose):
            embedding = self.embedding_model(doc or empty_document)
            if embedding.has_vector:
                # Non-transformer pipelines expose a document vector directly.
                embedding = embedding.vector
            else:
                # Transformer pipelines have no `.vector`; take the first row of
                # the last tensor in `trf_data` (the pooled document output —
                # assumes a spacy-transformers pipeline; TODO confirm for
                # other pipeline types).
                embedding = embedding._.trf_data.tensors[-1][0]

            if not isinstance(embedding, np.ndarray) and hasattr(embedding, 'get'):
                # GPU (CuPy) arrays expose `.get()` to copy back to host memory.
                embedding = embedding.get()
            embeddings.append(embedding)

        return np.array(embeddings)
|
|