| | |
| | import torch |
| | from transformers import AutoModelForSequenceClassification, AutoTokenizer |
| | import numpy |
| |
|
class TransformerVectorizer:
    """Vectorize texts with a pretrained RoBERTa sentiment model.

    Embeddings are the mean over tokens of the model's last hidden state.
    The model checkpoint is downloaded on construction (network I/O).
    """

    def __init__(self):
        # Tokenizer and model must come from the same checkpoint so that
        # token ids line up with the model's embedding table.
        self.tokenizer = AutoTokenizer.from_pretrained(
            "cardiffnlp/twitter-roberta-base-sentiment-latest"
        )
        self.transformer_model = AutoModelForSequenceClassification.from_pretrained(
            "cardiffnlp/twitter-roberta-base-sentiment-latest"
        )
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"

    def text_to_tensor(self, texts: list, batch_size: int = 50) -> numpy.ndarray:
        """Transform a list of texts into their learned representations.

        Args:
            texts (list): List of texts to be transformed.
            batch_size (int): Number of texts per forward pass (bounds
                peak memory); default 50 preserves the previous behavior.

        Returns:
            numpy.ndarray: Array of shape (len(texts), hidden_dim), where
                each row is the token-mean of the last hidden state.
        """
        # FIX: without padding/truncation, batch_encode_plus cannot stack
        # variable-length texts into one tensor and raises. NOTE(review):
        # mean-pooling now also averages over pad positions; for strict
        # per-token pooling an attention-mask-weighted mean would be needed.
        tokenized = self.tokenizer.batch_encode_plus(
            texts, return_tensors="pt", padding=True, truncation=True
        )["input_ids"]

        # Process in fixed-size batches to bound memory usage.
        batches = torch.split(tokenized, split_size_or_sections=batch_size)

        model = self.transformer_model.to(self.device)
        model.eval()  # FIX: disable dropout so embeddings are deterministic

        hidden_states_list = []
        # FIX: inference only — skip autograd graph construction entirely.
        with torch.no_grad():
            for batch in batches:
                # output_hidden_states=True -> outputs[1] is the tuple of all
                # layers' hidden states; [-1] picks the last layer.
                last_hidden = model(
                    batch.to(self.device), output_hidden_states=True
                )[1][-1]
                # Mean-pool over the token (sequence) dimension.
                pooled = last_hidden.mean(dim=1)
                hidden_states_list.append(pooled.cpu().numpy())

        # Cached on the instance, matching the original behavior.
        self.encodings = numpy.concatenate(hidden_states_list, axis=0)
        return self.encodings

    def transform(self, texts: list):
        """Scikit-learn-style alias for :meth:`text_to_tensor`."""
        return self.text_to_tensor(texts)
| |
|
| |
|