from preprocess_data import preprocess
from load_data import load_dataset_
from model import BERT_Arch

import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.optim import AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AutoModel, BertTokenizerFast
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imblearn.under_sampling import RandomUnderSampler

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the raw dataset and preprocess it into parallel lists of texts and labels
dataset = load_dataset_()
texts, labels = preprocess(dataset)

df = pd.DataFrame({"texts": texts, "labels": labels})
# Keep only part of the data: the last 40,000 rows are discarded
df = df.iloc[:-40000][["texts", "labels"]]
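# Balance the class distribution by randomly undersampling the majority class(es)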
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(pd.DataFrame(df['texts']), pd.DataFrame(df['labels']))
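# Stratified 70/15/15 split: 70% train, and the remaining 30% halved into
# validation and test sets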
train_text, temp_text, train_labels, temp_labels = train_test_split(
    X_res, y_res, random_state=2018, test_size=0.3, stratify=y_res)
val_text, test_text, val_labels, test_labels = train_test_split(
    temp_text, temp_labels, random_state=2018, test_size=0.5, stratify=temp_labels)
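# Pre-trained BERT encoder and its matching fast tokenizer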
bert = AutoModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
def tokenize(series):
    # Pad or truncate every sequence to a fixed length of 25 tokens.
    # padding='max_length' is the current name for the deprecated
    # pad_to_max_length=True argument.
    return tokenizer.batch_encode_plus(
        series.tolist(),
        max_length=25,
        padding='max_length',
        truncation=True,
    )

tokens_train = tokenize(train_text['texts'])
tokens_val = tokenize(val_text['texts'])
tokens_test = tokenize(test_text['texts'])
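# Convert input ids, attention masks, and labels to PyTorch tensors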
train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(train_labels['labels'].tolist())
val_seq = torch.tensor(tokens_val['input_ids'])
val_mask = torch.tensor(tokens_val['attention_mask'])
val_y = torch.tensor(val_labels['labels'].tolist())
test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
test_y = torch.tensor(test_labels['labels'].tolist())
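# Wrap the tensors in DataLoaders: shuffle training batches each epoch,
# read validation batches sequentially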
batch_size = 32
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)
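# Freeze the pre-trained BERT encoder so that only the classification head
# added in BERT_Arch is trained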
for param in bert.parameters():
param.requires_grad = False
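# BERT_Arch is defined in model.py. A minimal sketch of the architecture this
# script assumes (pooled BERT output -> linear head -> LogSoftmax, so that
# nn.NLLLoss applies below); this is an illustration, not the actual model.py:
#
#   class BERT_Arch(nn.Module):
#       def __init__(self, bert):
#           super().__init__()
#           self.bert = bert
#           self.fc = nn.Linear(768, n_classes)   # n_classes is hypothetical
#           self.log_softmax = nn.LogSoftmax(dim=1)
#
#       def forward(self, sent_id, mask):
#           out = self.bert(sent_id, attention_mask=mask)
#           return self.log_softmax(self.fc(out.pooler_output))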
model = BERT_Arch(bert)
model = model.to(device)
# AdamW optimizer; only the unfrozen classification-head parameters receive
# gradient updates
optimizer = AdamW(model.parameters(), lr=1e-5)

# Class weights compensate for any residual imbalance in the training labels
class_weights = compute_class_weight(class_weight="balanced",
                                     classes=np.unique(train_labels['labels']),
                                     y=train_labels['labels'])
weights = torch.tensor(class_weights, dtype=torch.float).to(device)
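# nn.NLLLoss expects log-probabilities, which matches the LogSoftmax output
# assumed in BERT_Arch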
cross_entropy = nn.NLLLoss(weight=weights)
def train():
    """Run one training epoch; return the average loss and all predictions."""
    model.train()
    total_loss = 0
    total_preds = []
    for step, batch in enumerate(train_dataloader):
        # Progress report every 50 batches
        if step % 50 == 0 and step != 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))
        # Move the batch to the active device and unpack it
        batch = [r.to(device) for r in batch]
        sent_id, mask, labels = batch
        model.zero_grad()
        preds = model(sent_id, mask)
        loss = cross_entropy(preds, labels)
        total_loss = total_loss + loss.item()
        loss.backward()
        # Clip gradients to stabilize training
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        total_preds.append(preds.detach().cpu().numpy())
    avg_loss = total_loss / len(train_dataloader)
    total_preds = np.concatenate(total_preds, axis=0)
    return avg_loss, total_preds
def evaluate():
    """Run one pass over the validation set without gradient updates."""
    print("\nEvaluating...")
    model.eval()
    total_loss = 0
    total_preds = []
    for step, batch in enumerate(val_dataloader):
        batch = [t.to(device) for t in batch]
        sent_id, mask, labels = batch
        with torch.no_grad():
            preds = model(sent_id, mask)
            loss = cross_entropy(preds, labels)
            total_loss = total_loss + loss.item()
            total_preds.append(preds.detach().cpu().numpy())
    avg_loss = total_loss / len(val_dataloader)
    total_preds = np.concatenate(total_preds, axis=0)
    return avg_loss, total_preds
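# Main training loop: after each epoch, checkpoint the weights whenever the
# validation loss improves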
best_valid_loss = float('inf')
epochs = 50
train_losses=[]
valid_losses=[]
for epoch in range(epochs):
print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))
train_loss, _ = train()
valid_loss, _ = evaluate()
if valid_loss < best_valid_loss:
best_valid_loss = valid_loss
torch.save(model.state_dict(), 'saved_weights.pt')
train_losses.append(train_loss)
valid_losses.append(valid_loss)
print(f'\nTraining Loss: {train_loss:.3f}')
print(f'Validation Loss: {valid_loss:.3f}')
# Reload the best checkpoint (lowest validation loss) before testing
model.load_state_dict(torch.load('saved_weights.pt'))
model.eval()

# Get predictions for the test data in a single forward pass
with torch.no_grad():
    preds = model(test_seq.to(device), test_mask.to(device))
    preds = preds.detach().cpu().numpy()

# Model's performance on the held-out test set
preds = np.argmax(preds, axis=1)
print(classification_report(test_y, preds))
torch.save(model.state_dict(), 'model.pth')