| | import numpy as np |
| | from tqdm import tqdm |
| | from typing import List |
| | from bertopic.backend import BaseEmbedder |
| |
|
| |
|
class SpacyBackend(BaseEmbedder):
    """ Spacy embedding model

    The Spacy embedding model used for generating document and
    word embeddings.

    Arguments:
        embedding_model: A loaded spacy pipeline (nlp) object

    Examples:

    To create a Spacy backend, you need to create an nlp object and
    pass it through this backend:

    ```python
    import spacy
    from bertopic.backend import SpacyBackend

    nlp = spacy.load("en_core_web_md", exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
    spacy_model = SpacyBackend(nlp)
    ```

    To load in a transformer model use the following:

    ```python
    import spacy
    from thinc.api import set_gpu_allocator, require_gpu
    from bertopic.backend import SpacyBackend

    nlp = spacy.load("en_core_web_trf", exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
    set_gpu_allocator("pytorch")
    require_gpu(0)
    spacy_model = SpacyBackend(nlp)
    ```

    If you run into gpu/memory-issues, please use:

    ```python
    import spacy
    from bertopic.backend import SpacyBackend

    spacy.prefer_gpu()
    nlp = spacy.load("en_core_web_trf", exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
    spacy_model = SpacyBackend(nlp)
    ```
    """
    def __init__(self, embedding_model):
        super().__init__()

        # Duck-type check: accept any object whose type originates in the
        # spacy package (e.g. spacy.lang.en.English). Note that a plain model
        # name string is NOT accepted here — the pipeline must be loaded by
        # the caller first. The previous error message wrongly suggested a
        # string would work, which is exactly what this branch rejects.
        if "spacy" in str(type(embedding_model)):
            self.embedding_model = embedding_model
        else:
            raise ValueError("Please pass a loaded spaCy pipeline object, created with, "
                             "for example: `nlp = spacy.load('en_core_web_md')`")

    def embed(self,
              documents: List[str],
              verbose: bool = False) -> np.ndarray:
        """ Embed a list of n documents/words into an n-dimensional
        matrix of embeddings

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        # spaCy cannot meaningfully embed an empty string, so any falsy
        # document (empty string or None) is replaced by a single space.
        empty_document = " "

        embeddings = []
        for doc in tqdm(documents, position=0, leave=True, disable=not verbose):
            processed = self.embedding_model(doc or empty_document)
            if processed.has_vector:
                # Word-vector pipelines (e.g. en_core_web_md) expose a
                # document vector directly.
                embedding = processed.vector
            else:
                # Transformer pipelines: take the pooled output tensor for the
                # document. NOTE(review): relies on the `trf_data.tensors`
                # layout of spacy-transformers — confirm against the installed
                # spaCy version.
                embedding = processed._.trf_data.tensors[-1][0]

            # On GPU the vector may be a cupy array; `.get()` copies it
            # back to a host numpy array.
            if not isinstance(embedding, np.ndarray) and hasattr(embedding, 'get'):
                embedding = embedding.get()
            embeddings.append(embedding)

        return np.array(embeddings)
| |
|