| |
| import torch |
| from transformers import AutoModelForSequenceClassification, AutoTokenizer |
| import numpy |
|
|
class TransformerVectorizer:
    """Embeds raw texts as fixed-size vectors using a pre-trained RoBERTa model.

    The last hidden layer of ``cardiffnlp/twitter-roberta-base-sentiment-latest``
    is mean-pooled over the (non-padding) token positions, giving one
    ``hidden_size``-dimensional vector per input text.
    """

    # Single source of truth for the checkpoint name.
    MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"

    def __init__(self):
        # Downloads (or reads from the local HuggingFace cache) the tokenizer
        # and model weights.
        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_NAME)
        self.transformer_model = AutoModelForSequenceClassification.from_pretrained(
            self.MODEL_NAME
        )
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"

    def text_to_tensor(
        self,
        texts: list,
        batch_size: int = 50,
    ) -> numpy.ndarray:
        """Transform a list of texts into their learned representations.

        Args:
            texts (list): List of texts to be transformed.
            batch_size (int): Number of texts sent through the model per
                forward pass. Defaults to 50.

        Returns:
            numpy.ndarray: Array of shape ``(len(texts), hidden_size)`` with
            one embedding per input text. Also stored in ``self.encodings``.
        """
        # Guard the empty case: numpy.concatenate raises on an empty list.
        if not texts:
            hidden_size = self.transformer_model.config.hidden_size
            self.encodings = numpy.empty((0, hidden_size), dtype=numpy.float32)
            return self.encodings

        # padding=True is required for return_tensors="pt" with variable-length
        # texts; truncation=True caps inputs at the model's maximum length.
        encoded = self.tokenizer.batch_encode_plus(
            texts, padding=True, truncation=True, return_tensors="pt"
        )
        input_id_batches = torch.split(encoded["input_ids"], batch_size)
        mask_batches = torch.split(encoded["attention_mask"], batch_size)

        model = self.transformer_model.to(self.device)
        embedding_batches = []

        # Inference only: no_grad avoids building an autograd graph, which
        # would otherwise hold every activation in memory.
        with torch.no_grad():
            for input_ids, attention_mask in zip(input_id_batches, mask_batches):
                input_ids = input_ids.to(self.device)
                attention_mask = attention_mask.to(self.device)

                # With output_hidden_states=True the model's second output is
                # the tuple of per-layer hidden states; [-1] is the last layer.
                last_hidden = model(
                    input_ids,
                    attention_mask=attention_mask,
                    output_hidden_states=True,
                )[1][-1]

                # Masked mean over token positions so PAD tokens do not skew
                # the pooled vector. When no padding is present the mask is
                # all ones and this equals a plain mean(dim=1).
                mask = attention_mask.unsqueeze(-1).to(last_hidden.dtype)
                pooled = (last_hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)

                embedding_batches.append(pooled.cpu().numpy())

        self.encodings = numpy.concatenate(embedding_batches, axis=0)
        return self.encodings

    def transform(self, texts: list):
        """Sklearn-style alias for :meth:`text_to_tensor`."""
        return self.text_to_tensor(texts)
|
|
|
|