import os

# Pin the process to GPU 1; set before torch initializes CUDA.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import pandas as pd
import torch
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertModel

|
# Inference configuration; MAX_LEN matches DistilBERT's 512-token limit.
MAX_LEN = 512
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 1e-05
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# truncation is a call-time argument, not a from_pretrained argument, so it is
# passed to encode_plus in the dataset below instead.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)
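
# A minimal sketch of the tokenizer's output (illustrative only, not part of
# the pipeline): with padding='max_length' and truncation=True, encode_plus
# returns 'input_ids', 'attention_mask' and 'token_type_ids' as lists of
# exactly max_length ints, e.g.
#   enc = tokenizer.encode_plus("hello world", None, add_special_tokens=True,
#                               max_length=8, padding='max_length',
#                               truncation=True, return_token_type_ids=True)
#   len(enc['input_ids'])  # 8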
|
class MultiLabelDataset(Dataset):
    """Wraps a dataframe of comments; unlabelled (new) data carries no targets."""

    def __init__(self, dataframe, tokenizer, max_len, new_data=False):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.comment_text
        self.new_data = new_data

        # New (inference) data has no `labels` column.
        if not new_data:
            self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)
|
    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())  # collapse runs of whitespace

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',  # replaces the deprecated pad_to_max_length=True
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        out = {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
        }

        if not self.new_data:
            out['targets'] = torch.tensor(self.targets[index], dtype=torch.float)

        return out
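
# Usage sketch (the example text is illustrative): a one-row unlabelled
# dataframe yields a single padded example with no 'targets' key.
#   df = pd.DataFrame(["an example comment"], columns=['comment_text'])
#   ds = MultiLabelDataset(df, tokenizer, MAX_LEN, new_data=True)
#   ds[0]['ids'].shape  # torch.Size([512])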
|
class DistilBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistilBERTClass, self).__init__()

        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(768, 768),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.1),
            torch.nn.Linear(768, 6)  # one logit per toxicity label
        )

    def forward(self, input_ids, attention_mask, token_type_ids):
        # DistilBERT has no token-type embeddings; token_type_ids is accepted
        # here only so callers can pass the full batch unchanged.
        output_1 = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        out = hidden_state[:, 0]  # [CLS] token representation
        out = self.classifier(out)
        return out
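
# Shape sketch (B = batch size): hidden_state is (B, MAX_LEN, 768); the [CLS]
# slice is (B, 768); the head maps it to (B, 6) raw logits. Sigmoid is applied
# later at prediction time, since the six labels are not mutually exclusive.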
|
model = DistilBERTClass()
model.to(DEVICE)
|
# map_location lets the checkpoint load on CPU-only machines as well.
model_loaded = torch.load('inference_models_output_4fold_distilbert_fold_best_model.pth',
                          map_location=DEVICE)

model.load_state_dict(model_loaded['model'])
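
# The checkpoint is assumed to be a dict holding the state_dict under a
# 'model' key, i.e. saved along the lines of this hypothetical sketch:
#   torch.save({'model': model.state_dict()}, 'checkpoint.pth')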
|
|
val_params = {
    'batch_size': VALID_BATCH_SIZE,
    'shuffle': False,
}


def give_toxic(text):
    # Wrap the single string in a one-row dataframe so the dataset can be reused.
    test_data = pd.DataFrame([text], columns=['comment_text'])
    test_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN, new_data=True)
    test_loader = DataLoader(test_set, **val_params)

    all_test_pred = []
|
    # Collect sigmoid probabilities for every batch in the loader.
    def test():
        model.eval()

        with torch.inference_mode():
            for _, data in tqdm(enumerate(test_loader, 0)):
                ids = data['ids'].to(DEVICE, dtype=torch.long)
                mask = data['mask'].to(DEVICE, dtype=torch.long)
                token_type_ids = data['token_type_ids'].to(DEVICE, dtype=torch.long)

                outputs = model(ids, mask, token_type_ids)
                probas = torch.sigmoid(outputs)  # per-label probabilities; labels can co-occur

                all_test_pred.append(probas)
|
    test()
|
    all_test_pred = torch.cat(all_test_pred)
|
    label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
|
    # A single input yields a single row of six probabilities.
    preds = all_test_pred.detach().cpu().numpy()[0]
|
    final_dict = dict(zip(label_columns, preds))
    return final_dict
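
# Usage sketch (input string and threshold are illustrative):
#   scores = give_toxic("you are wonderful")
#   flagged = [label for label, p in scores.items() if p > 0.5]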
|
def device():
    return DEVICE
|
# Quick smoke test on a known-toxic input.
print(give_toxic("fuck"))
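
# The call above prints a dict mapping each of the six labels to a sigmoid
# probability in [0, 1]; the exact values depend on the loaded checkpoint.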