# GoodreadsDataGenrePredictor / train_model.py
from transformers import BertForSequenceClassification, BertTokenizer
import numpy as np
import torch
from process_data import getDF
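# getDF() comes from the local process_data.py (not shown here). The code
# below assumes it returns a pandas DataFrame with a 'summary' text column
# and a 'genre_id' column holding a multi-hot list of length NUM_CLASSES.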
from torch.utils.data import TensorDataset, random_split, DataLoader, SequentialSampler
from transformers import get_linear_schedule_with_warmup
from sampler import BalanceSampler
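# BalanceSampler is defined in the local sampler.py (not shown). A minimal
# sketch of such a sampler, assuming it oversamples under-represented genres
# via per-example weights (illustrative only, not the repo's implementation):
#
#   from torch.utils.data import WeightedRandomSampler
#   def BalanceSampler(dataset):
#       labels = torch.stack([dataset[i][2] for i in range(len(dataset))])
#       class_freq = labels.sum(dim=0).clamp(min=1)  # per-genre counts
#       weights = (labels / class_freq).sum(dim=1)   # rarer genres => larger weight
#       return WeightedRandomSampler(weights, num_samples=len(weights))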
NUM_CLASSES = 13
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Load the pre-trained model and tokenizer. Passing num_labels resizes the
# classification head to NUM_CLASSES before the model is moved to the device
# (replacing model.classifier afterwards would leave the new layer on CPU).
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=NUM_CLASSES,
).to(device)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
print(model)
def get_input_id_and_attention_masks():
    """Tokenize every summary and build the tensors BERT expects."""
    df = getDF()  # from process_data.py
    input_ids = []
    attention_masks = []
    for summ in df['summary']:
        encoded_dict = tokenizer.encode_plus(
            summ,                        # sentence to encode
            add_special_tokens=True,     # add '[CLS]' and '[SEP]'
            max_length=512,              # pad & truncate all sentences
            truncation=True,
            padding='max_length',
            return_attention_mask=True,  # construct attention masks
            return_tensors='pt',         # return PyTorch tensors
        )
        input_ids.append(encoded_dict['input_ids'])
        # The attention mask simply differentiates padding from non-padding.
        attention_masks.append(encoded_dict['attention_mask'])
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    # BCEWithLogitsLoss expects float targets, so cast the multi-hot labels.
    labels = torch.from_numpy(np.array(df['genre_id'].tolist())).float()
    return input_ids, attention_masks, labels
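# Illustrative shape check, assuming getDF() yields N rows:
#   input_ids.shape       -> (N, 512)
#   attention_masks.shape -> (N, 512)
#   labels.shape          -> (N, NUM_CLASSES)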
def createTensorDS(input_ids, attention_masks, labels):
    return TensorDataset(input_ids, attention_masks, labels)
def split(tensorDataset):
    # Hold out 15% of the data for validation.
    train_size = int(0.85 * len(tensorDataset))
    val_size = len(tensorDataset) - train_size
    train_dataset, val_dataset = random_split(tensorDataset, [train_size, val_size])
    return train_dataset, val_dataset
def createDataloaders(train_dataset, val_dataset):
    batch_size = 16
    # BalanceSampler rebalances genre frequencies during training; validation
    # batches are read in fixed order.
    train_dataloader = DataLoader(
        train_dataset,
        sampler=BalanceSampler(train_dataset),
        batch_size=batch_size,
    )
    valid_dataloader = DataLoader(
        val_dataset,
        sampler=SequentialSampler(val_dataset),
        batch_size=batch_size,
    )
    return train_dataloader, valid_dataloader
def calc_accuracy(logits, labels):
    """Count how many of the top-5 predicted genres are true labels,
    normalized by the total number of true labels in the batch."""
    label = []
    num_ones = 0
    acc = 0
    for label_set in labels:
        labs = []
        for ind, res in enumerate(label_set):
            if res.item() == 1:
                labs.append(ind)
        label.append(labs)
        num_ones += len(labs)
    for i, log in enumerate(logits):
        top_out = (-log).argsort()[:5]  # indices of the 5 largest logits
        for ind in top_out:
            if ind.item() in label[i]:
                acc = acc + 1
    return acc / max(num_ones, 1)  # guard against a batch with no positive labels
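# Note: this metric is closer to recall@5 than plain accuracy -- each of the
# top-5 predictions that matches a true genre counts as a hit, and the hit
# count is divided by the batch's total number of true labels.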
def train(model, train_loader, val_loader, epochs):
    total_steps = len(train_loader) * epochs
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=2e-5,   # default is 5e-5; 2e-5 follows the BERT fine-tuning recipe
                                 eps=1e-8)  # default Adam epsilon
    # Linear decay from lr to 0 over the whole run, with no warmup steps.
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=total_steps)
    loss_fn = torch.nn.BCEWithLogitsLoss()
    for epoch in range(epochs):
        total_train_loss = 0
        total_train_acc = 0
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch + 1, epochs))
        print('Training...')
        model.train()
        for step, batch in enumerate(train_loader):
            input_ids = batch[0].to(device)
            input_mask = batch[1].to(device)
            labels = batch[2].to(device)
            optimizer.zero_grad()
            out = model(input_ids, attention_mask=input_mask)
            logits = out['logits']
            loss = loss_fn(logits, labels)
            total_train_acc += calc_accuracy(logits, labels)
            total_train_loss += loss.item()
            loss.backward()
            # Clip gradients to stabilize fine-tuning.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
        avg_train_loss = total_train_loss / len(train_loader)
        avg_train_acc = total_train_acc / len(train_loader)
        print('train_loss: ', avg_train_loss)
        print('train_acc: ', avg_train_acc)
        print("Running Validation...")
        model.eval()
        total_eval_accuracy = 0
        total_eval_loss = 0
        for batch in val_loader:
            input_ids = batch[0].to(device)
            input_mask = batch[1].to(device)
            labels = batch[2].to(device)
            with torch.no_grad():
                out = model(input_ids, attention_mask=input_mask)
                logits = out['logits']
                loss = loss_fn(logits, labels)
            total_eval_loss += loss.item()
            total_eval_accuracy += calc_accuracy(logits, labels)
        avg_loss_eval = total_eval_loss / len(val_loader)
        avg_acc_eval = total_eval_accuracy / len(val_loader)
        print(
            'epoch: ', epoch + 1,
            'train_loss: ', avg_train_loss,
            'valid_loss: ', avg_loss_eval,
            'valid_acc: ', avg_acc_eval,
        )
input_ids, attention_masks, labels = get_input_id_and_attention_masks()
ds = createTensorDS(input_ids, attention_masks, labels)
train_dataset, val_dataset = split(ds)
train_dataloader, valid_dataloader = createDataloaders(train_dataset, val_dataset)
train(model, train_dataloader, valid_dataloader, 3)
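# Optionally persist the fine-tuned weights and tokenizer. The output
# directory below is illustrative; the original script stops after training.
model.save_pretrained("goodreads-genre-bert")
tokenizer.save_pretrained("goodreads-genre-bert")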