Spaces:

rajaatif786
/

baltimore

Runtime error

Update Toxonomy/modules/preprocessor.py

91b731b over 2 years ago

1.85 kB

	from transformers import BertTokenizer
	import torch
	# Load the BERT tokenizer once, as a module-level object shared by the
	# functions below. The two literals concatenate to the path './berttok'.
	# NOTE(review): the path is relative, so it presumably resolves against the
	# process working directory rather than this file's location -- confirm the
	# app is always launched from the directory that contains `berttok`.
	tokenizer = BertTokenizer.from_pretrained('.'+'/berttok')
	# Create a function to tokenize a set of texts
	def preprocessing_for_bert(data, max_length=5000):
	    """Perform required preprocessing steps for pretrained BERT.

	    Each text is tokenized with the module-level `tokenizer`, wrapped in the
	    special `[CLS]`/`[SEP]` tokens, truncated or padded to exactly
	    `max_length` token ids, and paired with an attention mask.

	    @param data (iterable of str): Texts to be processed (e.g. an np.array).
	    @param max_length (int): Fixed length every sequence is truncated or
	        padded to. Defaults to 5000, the value previously hard-coded here.
	        NOTE(review): stock BERT checkpoints usually provide only 512
	        position embeddings -- confirm the downstream model accepts
	        sequences this long.
	    @return input_ids (torch.Tensor): Long tensor of token ids with shape
	        (number of texts, max_length), to be fed to a model.
	    @return attention_masks (torch.Tensor): Long tensor of the same shape
	        specifying which tokens should be attended to by the model.
	    """
	    input_ids = []
	    attention_masks = []

	    for sent in data:
	        # `padding`/`truncation` are the explicit replacements for the
	        # deprecated `pad_to_max_length=True` flag and for the implicit
	        # "max_length alone enables truncation" fallback, both of which
	        # emit warnings. Every row comes back exactly `max_length` long,
	        # which the tensor conversion below relies on.
	        encoded_sent = tokenizer.encode_plus(
	            text=sent,                   # Preprocess sentence
	            add_special_tokens=True,     # Add `[CLS]` and `[SEP]`
	            max_length=max_length,       # Length to truncate/pad to
	            padding='max_length',        # Pad short sentences
	            truncation=True,             # Cut long sentences
	            return_attention_mask=True,  # Return attention mask
	        )

	        # Index directly: a missing key should fail here rather than
	        # silently appending None and breaking the tensor conversion.
	        input_ids.append(encoded_sent['input_ids'])
	        attention_masks.append(encoded_sent['attention_mask'])

	    if not input_ids:
	        # `torch.tensor([])` would yield a 1-D float tensor; keep the
	        # documented 2-D integer layout for empty input as well.
	        return (
	            torch.empty((0, max_length), dtype=torch.long),
	            torch.empty((0, max_length), dtype=torch.long),
	        )

	    # Convert lists to tensors
	    input_ids = torch.tensor(input_ids, dtype=torch.long)
	    attention_masks = torch.tensor(attention_masks, dtype=torch.long)

	    return input_ids, attention_masks