|
|
from preprocess_data import preprocess
|
|
|
from load_data import load_dataset_
|
|
|
from model import BERT_Arch
|
|
|
|
|
|
import pandas as pd
|
|
|
import numpy as np
|
|
|
import torch
|
|
|
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
|
|
|
from torch import nn
|
|
|
from torch.optim import AdamW
|
|
|
from sklearn.utils.class_weight import compute_class_weight
|
|
|
|
|
|
|
|
|
|
|
|
from sklearn.model_selection import train_test_split
|
|
|
from sklearn.metrics import classification_report
|
|
|
from imblearn.under_sampling import RandomUnderSampler
|
|
|
|
|
|
|
|
|
# Run on the GPU when PyTorch reports CUDA as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device('cpu')
|
|
|
|
|
|
import transformers
|
|
|
from transformers import AutoModel, BertTokenizerFast
|
|
|
from transformers import AutoTokenizer
|
|
|
|
|
|
|
|
|
|
|
|
# Load the raw dataset and turn it into parallel text / label sequences.
dataset = load_dataset_()
texts, labels = preprocess(dataset)

# Build a two-column frame and discard its last 40,000 rows.
df = pd.DataFrame({"texts": texts, "labels": labels})
df = df.iloc[:-40000][["texts", "labels"]]

# Randomly undersample with a fixed seed; both inputs are passed as
# single-column DataFrames so the resampled outputs keep their column names.
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(df[["texts"]], df[["labels"]])

# First cut: 70% train, 30% held back, stratified on the labels.
train_text, temp_text, train_labels, temp_labels = train_test_split(
    X_res,
    y_res,
    test_size=0.3,
    random_state=2018,
    stratify=y_res,
)

# Second cut: halve the held-back 30% into validation and test sets.
val_text, test_text, val_labels, test_labels = train_test_split(
    temp_text,
    temp_labels,
    test_size=0.5,
    random_state=2018,
    stratify=temp_labels,
)
|
|
|
|
|
|
# Pretrained encoder and its matching fast tokenizer.
bert = AutoModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Every sequence is padded or truncated to exactly this many tokens.
MAX_SEQ_LEN = 25


def _encode_texts(frame):
    """Tokenize the ``texts`` column of *frame* to fixed-length encodings.

    Args:
        frame: DataFrame with a ``texts`` column of strings.

    Returns:
        The tokenizer's batch encoding; its ``input_ids`` and
        ``attention_mask`` entries each hold one ``MAX_SEQ_LEN``-long list
        per input text.
    """
    # ``padding='max_length'`` replaces the deprecated
    # ``pad_to_max_length=True`` flag and produces the same fixed-width output.
    return tokenizer(
        frame['texts'].tolist(),
        max_length=MAX_SEQ_LEN,
        padding='max_length',
        truncation=True,
    )


tokens_train = _encode_texts(train_text)
tokens_val = _encode_texts(val_text)
tokens_test = _encode_texts(test_text)

# Convert token ids, attention masks and labels to tensors for each split.
train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(train_labels['labels'].tolist())

val_seq = torch.tensor(tokens_val['input_ids'])
val_mask = torch.tensor(tokens_val['attention_mask'])
val_y = torch.tensor(val_labels['labels'].tolist())

test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
test_y = torch.tensor(test_labels['labels'].tolist())
|
|
|
|
|
|
# Number of examples per mini-batch for both loaders below.
batch_size = 32

# Training split: (ids, mask, label) triples, drawn in random order.
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    sampler=train_sampler,
)

# Validation split: same triple layout, visited in a fixed sequential order.
val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(
    val_data,
    batch_size=batch_size,
    sampler=val_sampler,
)
|
|
|
|
|
|
# Freeze every parameter of the pretrained encoder so it receives no gradients.
bert.requires_grad_(False)

# Wrap the frozen encoder in the project's classifier and move it to the device.
model = BERT_Arch(bert)
model = model.to(device)

optimizer = AdamW(model.parameters(), lr=1e-5)

# "balanced" class weights computed from the training labels, used to weight
# the loss per class.
class_weights = compute_class_weight(
    "balanced",
    classes=np.unique(train_labels),
    y=train_labels['labels'],
)
weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# NOTE(review): NLLLoss expects log-probabilities as input -- presumably
# BERT_Arch ends in a log-softmax layer; confirm against model.py.
cross_entropy = nn.NLLLoss(weight=weights)

# Reassigned further down in this file before the epoch loop reads it.
epochs = 10
|
|
|
|
|
|
def train():
    """Run one optimisation pass over ``train_dataloader``.

    Uses the module-level ``model``, ``optimizer``, ``cross_entropy``,
    ``device`` and ``train_dataloader``.

    Returns:
        tuple: ``(avg_loss, total_preds)`` where ``avg_loss`` is the summed
        batch loss divided by the number of batches, and ``total_preds`` is a
        NumPy array of the model outputs for every batch, concatenated along
        axis 0 in the order the batches were drawn.
    """
    model.train()

    total_loss = 0
    total_preds = []
    num_batches = len(train_dataloader)

    for step, batch in enumerate(train_dataloader):
        # Progress line every 50 batches, skipping the very first batch.
        if step and step % 50 == 0:
            print(' Batch {:>5,} of {:>5,}.'.format(step, num_batches))

        # Move the (ids, mask, labels) triple to the compute device.
        sent_id, mask, labels = (r.to(device) for r in batch)

        # Clear gradients left over from the previous step.
        model.zero_grad()

        preds = model(sent_id, mask)
        loss = cross_entropy(preds, labels)
        total_loss += loss.item()

        loss.backward()
        # Cap the global gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # Keep a CPU copy of the outputs, detached from the autograd graph.
        total_preds.append(preds.detach().cpu().numpy())

    avg_loss = total_loss / num_batches
    total_preds = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds
|
|
|
|
|
|
def evaluate():
    """Compute loss and outputs over ``val_dataloader`` without training.

    Uses the module-level ``model``, ``cross_entropy``, ``device`` and
    ``val_dataloader``. The model is switched to eval mode and left there.

    Returns:
        tuple: ``(avg_loss, total_preds)`` where ``avg_loss`` is the summed
        batch loss divided by the number of batches, and ``total_preds`` is a
        NumPy array of the model outputs for every batch, concatenated along
        axis 0 in loader order.
    """
    print("\nEvaluating...")

    model.eval()

    total_loss = 0
    total_preds = []

    for batch in val_dataloader:
        # Move the (ids, mask, labels) triple to the compute device.
        sent_id, mask, labels = (t.to(device) for t in batch)

        # No gradients are needed for validation.
        with torch.no_grad():
            preds = model(sent_id, mask)
            loss = cross_entropy(preds, labels)
            total_loss += loss.item()
            total_preds.append(preds.detach().cpu().numpy())

    avg_loss = total_loss / len(val_dataloader)
    total_preds = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds
|
|
|
|
|
|
# Lowest validation loss seen so far; any finite loss beats the initial value.
best_valid_loss = float('inf')

epochs = 50

# Per-epoch loss history.
train_losses = []
valid_losses = []

for epoch in range(epochs):
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

    # One training pass followed by one validation pass; outputs are discarded.
    train_loss, _ = train()
    valid_loss, _ = evaluate()

    # Checkpoint only when validation loss strictly improves.
    improved = valid_loss < best_valid_loss
    if improved:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')

    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')
|
|
|
|
|
|
|
|
|
# Score the held-out test split with the weights currently in memory, i.e.
# those left after the final epoch. This block does not reload
# 'saved_weights.pt'.
# Set eval mode explicitly rather than relying on whichever function ran last.
model.eval()

# Feed the test split in mini-batches instead of one forward pass over the
# whole split, so memory use stays bounded by ``batch_size``.
test_data = TensorDataset(test_seq, test_mask)
test_dataloader = DataLoader(
    test_data,
    sampler=SequentialSampler(test_data),
    batch_size=batch_size,
)

test_outputs = []
with torch.no_grad():
    for seq_batch, mask_batch in test_dataloader:
        batch_out = model(seq_batch.to(device), mask_batch.to(device))
        test_outputs.append(batch_out.detach().cpu().numpy())

# Sequential sampling keeps rows aligned with ``test_y``.
preds = np.concatenate(test_outputs, axis=0)

# Predicted class = index of the highest score in each row.
preds = np.argmax(preds, axis=1)

print(classification_report(test_y, preds))

# Persist the in-memory weights.
torch.save(model.state_dict(), 'model.pth')