TopicModelingRepo / BERTopic /bertopic /backend /_hftransformers.py
kisejin's picture
Upload 261 files
19b102a verified
raw
history blame
3.48 kB
import numpy as np
from tqdm import tqdm
from typing import List
from torch.utils.data import Dataset
from sklearn.preprocessing import normalize
from transformers.pipelines import Pipeline
from bertopic.backend import BaseEmbedder
class HFTransformerBackend(BaseEmbedder):
    """ Hugging Face transformers model

    This uses the `transformers.pipelines.pipeline` to define and create
    a feature generation pipeline from which embeddings can be extracted.

    Arguments:
        embedding_model: A Hugging Face feature extraction pipeline

    Raises:
        ValueError: If `embedding_model` is not a `transformers` `Pipeline`

    Examples:

    To use a Hugging Face transformers model, load in a pipeline and point
    to any model found on their model hub (https://huggingface.co/models):

    ```python
    from bertopic.backend import HFTransformerBackend
    from transformers.pipelines import pipeline

    hf_model = pipeline("feature-extraction", model="distilbert-base-cased")
    embedding_model = HFTransformerBackend(hf_model)
    ```
    """
    def __init__(self, embedding_model: Pipeline):
        super().__init__()

        # Reject anything that is not a transformers Pipeline up front so the
        # failure happens at construction time rather than on the first embed.
        if not isinstance(embedding_model, Pipeline):
            raise ValueError("Please select a correct transformers pipeline. For example: "
                             "pipeline('feature-extraction', model='distilbert-base-cased', device=0)")
        self.embedding_model = embedding_model

    def embed(self,
              documents: List[str],
              verbose: bool = False) -> np.ndarray:
        """ Embed a list of n documents/words into an (n, m) matrix
        of embeddings

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process; when True a
                     tqdm progress bar is shown

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        dataset = MyDataset(documents)

        # The pipeline is expected to yield one set of token features per
        # document, in input order, so it can be zipped with the documents.
        token_features = self.embedding_model(dataset, truncation=True, padding=True)
        progress = tqdm(zip(documents, token_features),
                        total=len(dataset), disable=not verbose)

        embeddings = [self._embed(document, features)
                      for document, features in progress]
        return np.array(embeddings)

    def _embed(self,
               document: str,
               features: np.ndarray) -> np.ndarray:
        """ Mean pooling

        Averages the token embeddings of a single document, weighted by the
        tokenizer's attention mask, and normalizes the result.

        Arguments:
            document: The document for which to extract the attention mask
            features: The embeddings for each token; presumably shaped
                      (1, n_tokens, m) as the code sums over axis 1 and then
                      takes row 0 -- TODO confirm against the pipeline output

        Returns:
            The normalized, mean-pooled embedding of the document

        Adopted from:
            https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2#usage-huggingface-transformers
        """
        token_embeddings = np.array(features)

        # Re-tokenize with the same truncation/padding flags as the pipeline
        # call to recover the attention mask for this document.
        attention_mask = self.embedding_model.tokenizer(
            document, truncation=True, padding=True, return_tensors="np"
        )["attention_mask"]
        input_mask_expanded = np.broadcast_to(np.expand_dims(attention_mask, -1),
                                              token_embeddings.shape)

        sum_embeddings = np.sum(token_embeddings * input_mask_expanded, 1)

        # Only a lower bound is wanted here (guard against division by zero).
        # Passing the maximum of the sums as upper bound would disable the
        # guard exactly when it is needed: if all sums are 0 then
        # a_min > a_max and numpy.clip returns a_max (0) everywhere.
        sum_mask = np.clip(input_mask_expanded.sum(1), a_min=1e-9, a_max=None)

        embedding = normalize(sum_embeddings / sum_mask)[0]
        return embedding
class MyDataset(Dataset):
    """ Dataset to pass to `transformers.pipelines.pipeline`

    Thin wrapper that exposes a sequence of documents through the
    `__len__` / `__getitem__` protocol of `torch.utils.data.Dataset`.

    Arguments:
        docs: The documents to expose; stored as-is on `self.docs`
    """
    def __init__(self, docs: List[str]) -> None:
        # Keep a reference to the caller's sequence; nothing is copied.
        self.docs = docs

    def __len__(self) -> int:
        """ Number of documents held by the dataset """
        n_docs = len(self.docs)
        return n_docs

    def __getitem__(self, idx):
        """ Return the document stored at position `idx` """
        document = self.docs[idx]
        return document