Spaces:
Runtime error
Runtime error
from transformers import BertTokenizer
import torch

# Load the pretrained BERT tokenizer from the local ./berttok directory.
# (The original built the path as '.'+'/berttok'; a single literal is clearer
# and evaluates to the identical string.)
tokenizer = BertTokenizer.from_pretrained('./berttok')
| # Create a function to tokenize a set of texts | |
def preprocessing_for_bert(data, max_length=512):
    """Tokenize and encode a collection of texts for a pretrained BERT model.

    Each text is tokenized with the module-level ``tokenizer``, wrapped in
    ``[CLS]``/``[SEP]``, and truncated/padded to a fixed length so the batch
    can be stacked into rectangular tensors.

    @param data (iterable of str): Texts to be processed.
    @param max_length (int): Length every sequence is truncated/padded to.
        Defaults to 512, BERT's maximum supported sequence length — the
        previous hard-coded 5000 exceeded BERT's position embeddings and
        would crash the downstream model.
    @return input_ids (torch.Tensor): Tensor of token ids to be fed to a
        model, shape (num_texts, max_length).
    @return attention_masks (torch.Tensor): Tensor of 0/1 indices specifying
        which tokens should be attended to by the model, same shape.
    """
    input_ids = []
    attention_masks = []

    for sent in data:
        # `encode_plus` will:
        #   (1) tokenize the sentence,
        #   (2) add the `[CLS]` and `[SEP]` tokens,
        #   (3) truncate/pad the sentence to `max_length`,
        #   (4) map tokens to their ids,
        #   (5) create the attention mask,
        #   (6) return a dictionary of outputs.
        encoded_sent = tokenizer.encode_plus(
            text=sent,
            add_special_tokens=True,     # add `[CLS]` and `[SEP]`
            max_length=max_length,
            truncation=True,             # explicitly truncate over-length texts
            padding='max_length',        # `pad_to_max_length=True` is deprecated
            return_attention_mask=True,
        )
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    # Stack the per-sentence id/mask lists into 2-D tensors of shape
    # (num_texts, max_length).
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)

    return input_ids, attention_masks