import numpy as np

from tqdm import tqdm
from typing import List
from torch.utils.data import Dataset
from sklearn.preprocessing import normalize
from transformers.pipelines import Pipeline

from bertopic.backend import BaseEmbedder


class HFTransformerBackend(BaseEmbedder):
| """ Hugging Face transformers model |
| |
| This uses the `transformers.pipelines.pipeline` to define and create |
| a feature generation pipeline from which embeddings can be extracted. |
| |
| Arguments: |
| embedding_model: A Hugging Face feature extraction pipeline |
| |
| Examples: |
| |
| To use a Hugging Face transformers model, load in a pipeline and point |
| to any model found on their model hub (https://huggingface.co/models): |
| |
| ```python |
| from bertopic.backend import HFTransformerBackend |
| from transformers.pipelines import pipeline |
| |
| hf_model = pipeline("feature-extraction", model="distilbert-base-cased") |
| embedding_model = HFTransformerBackend(hf_model) |
| ``` |
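
    To run the model on a GPU, the `device` argument can be passed when
    creating the pipeline (this assumes a CUDA device is available):

    ```python
    hf_model = pipeline("feature-extraction", model="distilbert-base-cased", device=0)
    embedding_model = HFTransformerBackend(hf_model)
    ```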
| """ |
    def __init__(self, embedding_model: Pipeline):
        super().__init__()

        if isinstance(embedding_model, Pipeline):
            self.embedding_model = embedding_model
        else:
            raise ValueError("Please select a correct transformers pipeline. For example: "
                             "pipeline('feature-extraction', model='distilbert-base-cased', device=0)")
    def embed(self,
              documents: List[str],
              verbose: bool = False) -> np.ndarray:
        """ Embed a list of n documents/words into an n x m
        matrix of embeddings

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/word embeddings with shape (n, m), where `n` is the number
            of documents/words and `m` the size of each embedding
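
        Examples:

        A minimal sketch, assuming `embedding_model` was built as in the
        class-level example:

        ```python
        embeddings = embedding_model.embed(["some document", "another document"])
        embeddings.shape  # (2, m), with m the model's hidden size
        ```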
| """ |
        dataset = MyDataset(documents)

        # Stream the documents through the pipeline and pool each output
        embeddings = []
        for document, features in tqdm(zip(documents, self.embedding_model(dataset, truncation=True, padding=True)),
                                       total=len(dataset), disable=not verbose):
            embeddings.append(self._embed(document, features))

        return np.array(embeddings)

    def _embed(self,
               document: str,
               features: np.ndarray) -> np.ndarray:
| """ Mean pooling |
| |
| Arguments: |
| document: The document for which to extract the attention mask |
| features: The embeddings for each token |
| |
| Adopted from: |
| https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2#usage-huggingface-transformers |
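
        For illustration, the pooling performed here is equivalent to this
        sketch with toy values (hypothetical shapes: three real tokens plus
        one padding token, hidden size 2):

        ```python
        import numpy as np
        from sklearn.preprocessing import normalize

        token_embeddings = np.array([[[1., 2.], [3., 4.], [5., 6.], [0., 0.]]])
        attention_mask = np.array([[1, 1, 1, 0]])  # 0 marks the padding token
        mask = np.broadcast_to(attention_mask[..., None], token_embeddings.shape)
        pooled = (token_embeddings * mask).sum(1) / mask.sum(1)  # -> [[3., 4.]]
        normalize(pooled)[0]  # unit-length document embedding
        ```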
| """ |
        # Token embeddings from the pipeline, with shape (1, n_tokens, hidden_size)
        token_embeddings = np.array(features)
        # Re-tokenize the document to recover which tokens are padding
        attention_mask = self.embedding_model.tokenizer(document, truncation=True, padding=True, return_tensors="np")["attention_mask"]
        # Zero out padding tokens so they do not contribute to the mean
        input_mask_expanded = np.broadcast_to(np.expand_dims(attention_mask, -1), token_embeddings.shape)
        sum_embeddings = np.sum(token_embeddings * input_mask_expanded, 1)
        sum_mask = np.clip(input_mask_expanded.sum(1), a_min=1e-9, a_max=None)
        # L2-normalize the mean-pooled embedding
        embedding = normalize(sum_embeddings / sum_mask)[0]
        return embedding


class MyDataset(Dataset):
| """ Dataset to pass to `transformers.pipelines.pipeline` """ |
    def __init__(self, docs):
        self.docs = docs

    def __len__(self):
        return len(self.docs)

    def __getitem__(self, idx):
        return self.docs[idx]