baltimore / Toxonomy /modules /preprocessor.py
rajaatif786's picture
Update Toxonomy/modules/preprocessor.py
91b731b
from transformers import BertTokenizer
import torch
# Module-level BERT tokenizer, created once at import time from the
# relative `./berttok` path and shared by the helpers in this module.
tokenizer = BertTokenizer.from_pretrained('./berttok')
# Create a function to tokenize a set of texts
def preprocessing_for_bert(data, max_len=5000):
    """Perform required preprocessing steps for pretrained BERT.

    Every text is encoded with the module-level ``tokenizer``: the `[CLS]`
    and `[SEP]` special tokens are added, the sequence is truncated or
    padded to exactly ``max_len`` tokens, and tokens are mapped to their ids.

    @param    data (np.array): Array (or any iterable) of texts to be processed.
    @param    max_len (int): Length every sequence is truncated/padded to.
              Defaults to 5000, the value that used to be hard-coded here.
              NOTE(review): standard BERT checkpoints accept at most 512
              positions -- confirm the downstream model supports this length.
    @return   input_ids (torch.Tensor): Tensor of token ids to be fed to a model.
    @return   attention_masks (torch.Tensor): Tensor of indices specifying which
              tokens should be attended to by the model.
    """
    # Per-sentence outputs, collected as plain Python lists first.
    input_ids = []
    attention_masks = []

    for sent in data:
        # `padding='max_length'` replaces the deprecated
        # `pad_to_max_length=True`, and `truncation=True` states explicitly
        # the truncation that the library otherwise only enables implicitly
        # (with a warning) when `max_length` is given.
        encoded_sent = tokenizer.encode_plus(
            text=sent,                    # Preprocess sentence
            add_special_tokens=True,      # Add `[CLS]` and `[SEP]`
            max_length=max_len,           # Max length to truncate/pad
            padding='max_length',         # Pad sentence to max length
            truncation=True,              # Truncate sentence to max length
            return_attention_mask=True,   # Return attention mask
        )

        input_ids.append(encoded_sent['input_ids'])
        attention_masks.append(encoded_sent['attention_mask'])

    # Every row has the same length, so the lists convert to rectangular
    # tensors. The dtype is pinned so that an empty `data` still produces
    # integer tensors instead of torch's default float dtype.
    input_ids = torch.tensor(input_ids, dtype=torch.long)
    attention_masks = torch.tensor(attention_masks, dtype=torch.long)

    return input_ids, attention_masks