from transformers import BertForSequenceClassification, BertTokenizer
from transformers import get_linear_schedule_with_warmup
import torch
import numpy as np
from process_data import getDF
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, SequentialSampler
from sampler import BalanceSampler

NUM_CLASSES = 13
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model = BertForSequenceClassification.from_pretrained("bert-base-uncased").to(device)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Swap the default 2-way head for a 13-way genre classifier; .to(device) keeps the new
# layer on the same device as the rest of the model.
model.classifier = torch.nn.Linear(in_features=768, out_features=NUM_CLASSES).to(device)
print(model)
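
# An alternative setup (a sketch, not what this script does) would let transformers size
# the head and pick the multi-label loss itself via `problem_type`:
#
#   model = BertForSequenceClassification.from_pretrained(
#       "bert-base-uncased",
#       num_labels=NUM_CLASSES,
#       problem_type="multi_label_classification",
#   ).to(device)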

def get_input_id_and_attention_masks():
    df = getDF()

    input_ids = []
    attention_masks = []
    for summ in df['summary']:
        encoded_dict = tokenizer.encode_plus(
            summ,
            add_special_tokens=True,
            max_length=512,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    # BCEWithLogitsLoss expects float targets, so cast the multi-hot genre vectors.
    labels = torch.from_numpy(np.array(df['genre_id'].tolist())).float()
    return input_ids, attention_masks, labels
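
# For reference, with N rows in the DataFrame the returned shapes should be (assuming
# `genre_id` holds 13-element multi-hot vectors, which the BCE loss below requires):
#   input_ids:       [N, 512]
#   attention_masks: [N, 512]
#   labels:          [N, 13]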

def createTensorDS(input_ids, attention_masks, labels):
    return TensorDataset(input_ids, attention_masks, labels)

def split(tensorDataset):
    # Hold out 15% of the examples for validation.
    train_size = int(0.85 * len(tensorDataset))
    val_size = len(tensorDataset) - train_size
    train_dataset, val_dataset = random_split(tensorDataset, [train_size, val_size])
    return train_dataset, val_dataset
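
# random_split draws a fresh permutation each run; for a reproducible split, one could
# pass a seeded generator (an optional tweak, not part of the original script):
#
#   train_ds, val_ds = random_split(tensorDataset, [train_size, val_size],
#                                   generator=torch.Generator().manual_seed(42))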

def createDataloaders(train_dataset, val_dataset):
    batch_size = 16

    train_dataloader = DataLoader(
        train_dataset,
        sampler=BalanceSampler(train_dataset),
        batch_size=batch_size
    )

    valid_dataloader = DataLoader(
        val_dataset,
        sampler=SequentialSampler(val_dataset),
        batch_size=batch_size
    )
    return train_dataloader, valid_dataloader
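
# `BalanceSampler` comes from the local `sampler` module, which is not shown here. If it
# is unavailable, torch's built-in WeightedRandomSampler is a rough stand-in; this is a
# sketch only, since how BalanceSampler actually weights the multi-hot labels is an
# assumption:
#
#   from torch.utils.data import WeightedRandomSampler
#   train_labels = torch.stack([sample[2] for sample in train_dataset])  # [N, 13]
#   class_counts = train_labels.sum(dim=0).clamp(min=1)   # per-genre frequency
#   weights = (train_labels / class_counts).sum(dim=1)    # rarer genres weigh more
#   sampler = WeightedRandomSampler(weights, num_samples=len(weights))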

def calc_accuracy(logits, labels):
    # Top-5 hit rate: the fraction of true (multi-hot) genres that land among each
    # sample's five highest-scoring logits.
    label = []
    num_ones = 0
    acc = 0
    for label_set in labels:
        labs = []
        for ind, res in enumerate(label_set):
            if res.item() == 1:
                labs.append(ind)
        label.append(labs)
        num_ones += len(labs)

    for i, log in enumerate(logits):
        top_out = (-log).argsort()[:5]
        for ind in top_out:
            if ind.item() in label[i]:
                acc = acc + 1
    return acc / max(num_ones, 1)
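
# A tiny worked example with hypothetical values: for one sample whose true genres are
# {2, 5} (so num_ones = 2) and whose five highest logits sit at indices [5, 0, 1, 3, 7],
# exactly one true genre appears in the top 5, so calc_accuracy returns 1/2 = 0.5.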

def train(model, train_dataloader, val_dataloader, epochs):
    total_steps = len(train_dataloader) * epochs
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=2e-5,
                                 eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=total_steps)
    loss_fn = torch.nn.BCEWithLogitsLoss()
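    # BCEWithLogitsLoss scores each of the 13 genre slots as an independent binary
    # decision, which is what lets a sample carry several genres at once. A quick
    # sanity check (illustrative values):
    #   torch.nn.BCEWithLogitsLoss()(torch.zeros(1, 13), torch.zeros(1, 13))  # ln(2) ~= 0.6931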
    for epoch in range(epochs):
        total_train_loss = 0
        acc = 0
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch + 1, epochs))
        print('Training...')
        model.train()
        for step, batch in enumerate(train_dataloader):
            input_ids = batch[0].to(device)
            input_mask = batch[1].to(device)
            labels = batch[2].to(device)

            optimizer.zero_grad()
            out = model(input_ids, attention_mask=input_mask)

            logits = out['logits']
            loss = loss_fn(logits, labels)

            acc += calc_accuracy(logits, labels)
            total_train_loss += loss.item()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

        avg_train_loss = total_train_loss / len(train_dataloader)
        print('train_loss: ', avg_train_loss)
        print('train_acc: ', acc / len(train_dataloader))
| | print("Running Validation...") |
| | model.eval() |
| | total_eval_accuracy=0 |
| | total_eval_loss= 0 |
| | num_Eval_steps= 0 |
| |
|
| | for batch in val: |
| | input_ids= batch[0].to(device) |
| | input_mask=batch[1].to(device) |
| | labels = batch[2].to(device) |
| | with torch.no_grad(): |
| | out = model(input_ids,attention_mask=input_mask) |
| |
|
| |
|
| |
|
| | logits = out['logits'] |
| | loss = loss_fn(logits, labels) |
| | total_eval_loss += loss.item() |
| |
|
| | logits = logits.detach().cpu().numpy() |
| | label_ids = labels.cpu().numpy() |
| |
|
| | |
| | avg_loss_Eval = total_eval_loss/len(val) |
| | print( |
| | 'epoch: ', epoch, |
| | 'train_loss: ', avg_train_loss, |
| | 'valid loss ', avg_loss_Eval, |
| | ) |

input_ids, attention_masks, labels = get_input_id_and_attention_masks()
ds = createTensorDS(input_ids, attention_masks, labels)
train_dataset, val_dataset = split(ds)
train_dataloader, valid_dataloader = createDataloaders(train_dataset, val_dataset)
train(model, train_dataloader, valid_dataloader, 3)
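
# To persist the fine-tuned weights (a sketch; the path is illustrative, and the config
# update is needed because the 13-way head was swapped in manually above):
#
#   model.config.num_labels = NUM_CLASSES
#   model.save_pretrained("bert_genre_model")
#   tokenizer.save_pretrained("bert_genre_model")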