from transformers import BertForSequenceClassification, BertTokenizer
import numpy as np  # used below to build the label tensor; missing from the original imports
import torch
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, SequentialSampler
from transformers import get_linear_schedule_with_warmup

from process_data import getDF
from sampler import BalanceSampler
NUM_CLASSES = 13
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Load the pre-trained model with a classification head sized for NUM_CLASSES.
# Passing num_labels builds the output layer inside from_pretrained, instead of
# swapping in a new Linear after the model has already moved to the GPU (which
# would leave the new layer stranded on the CPU).
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=NUM_CLASSES
).to(device)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
print(model)
def get_input_id_and_attention_masks():
    """Tokenize every summary and build the multi-hot label tensor."""
    df = getDF()  # from process_data.py
    input_ids = []
    attention_masks = []
    for summ in df['summary']:
        encoded_dict = tokenizer.encode_plus(
            summ,                        # sentence to encode
            add_special_tokens=True,     # add '[CLS]' and '[SEP]'
            max_length=512,              # pad & truncate all sentences
            truncation=True,
            padding='max_length',
            return_attention_mask=True,  # construct attention masks
            return_tensors='pt',         # return PyTorch tensors
        )
        input_ids.append(encoded_dict['input_ids'])
        # The attention mask simply differentiates padding from non-padding.
        attention_masks.append(encoded_dict['attention_mask'])
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    # BCEWithLogitsLoss expects float targets, so cast the multi-hot labels.
    labels = torch.tensor(np.array(df['genre_id'].tolist()), dtype=torch.float)
    return input_ids, attention_masks, labels
def createTensorDS(input_ids, attention_masks, labels):
    return TensorDataset(input_ids, attention_masks, labels)

def split(tensorDataset):
    train_size = int(0.85 * len(tensorDataset))
    val_size = len(tensorDataset) - train_size
    train_dataset, val_dataset = random_split(tensorDataset, [train_size, val_size])
    return train_dataset, val_dataset
def createDataloaders(train_dataset, val_dataset):
    batch_size = 16
    # BalanceSampler comes from the local sampler.py; by its name it rebalances
    # genre frequencies during training. Validation batches are read sequentially.
    train_dataloader = DataLoader(
        train_dataset,
        sampler=BalanceSampler(train_dataset),
        batch_size=batch_size,
    )
    valid_dataloader = DataLoader(
        val_dataset,
        sampler=SequentialSampler(val_dataset),
        batch_size=batch_size,
    )
    return train_dataloader, valid_dataloader
def calc_accuracy(logits, labels):
    """Top-5 accuracy: the fraction of true genres that appear among the
    five highest-scoring predictions for each example."""
    label = []
    num_ones = 0
    acc = 0
    # Collect the indices of the active (1) genres for every example.
    for label_set in labels:
        labs = []
        for ind, res in enumerate(label_set):
            if res.item() == 1:
                labs.append(ind)
        label.append(labs)
        num_ones += len(labs)
    # Count how many true genres land in each example's top-5 logits.
    for i, log in enumerate(logits):
        top_out = (-log).argsort()[:5]
        for ind in top_out:
            if ind in label[i]:
                acc = acc + 1
    return acc / num_ones
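# A quick sanity check of calc_accuracy with toy tensors (hypothetical values,
# not part of the original script): genres 0 and 2 are active and both rank in
# the top five logits, so the score is 2/2 = 1.0.
# _logits = torch.tensor([[2.0, -1.0, 1.5, 0.0, -0.5, 0.3, -2.0,
#                          0.1, -0.1, 0.2, -0.3, 0.4, -0.4]])
# _labels = torch.tensor([[1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])
# assert calc_accuracy(_logits, _labels) == 1.0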
def train(model, train_dataloader, val_dataloader, epochs):
    total_steps = len(train_dataloader) * epochs
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=2e-5,  # default is 5e-5; 2e-5 worked better here
                                 eps=1e-8)  # default adam epsilon
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=total_steps)
    # Multi-label setup: one independent sigmoid per genre.
    loss_fn = torch.nn.BCEWithLogitsLoss()
    for epoch in range(epochs):
        total_train_loss = 0
        train_acc = 0
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch + 1, epochs))
        print('Training...')
        model.train()
        for step, batch in enumerate(train_dataloader):
            input_ids = batch[0].to(device)
            input_mask = batch[1].to(device)
            labels = batch[2].to(device)
            optimizer.zero_grad()
            out = model(input_ids, attention_mask=input_mask)
            logits = out['logits']
            loss = loss_fn(logits, labels)
            train_acc += calc_accuracy(logits.detach(), labels)
            total_train_loss += loss.item()
            loss.backward()
            # Clip gradients to guard against exploding updates.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
        avg_train_loss = total_train_loss / len(train_dataloader)
        print('train_loss: ', avg_train_loss)
        print('train_acc: ', train_acc / len(train_dataloader))

        print("Running Validation...")
        model.eval()
        total_eval_accuracy = 0
        total_eval_loss = 0
        for batch in val_dataloader:
            input_ids = batch[0].to(device)
            input_mask = batch[1].to(device)
            labels = batch[2].to(device)
            with torch.no_grad():
                out = model(input_ids, attention_mask=input_mask)
                logits = out['logits']
                loss = loss_fn(logits, labels)
            total_eval_loss += loss.item()
            total_eval_accuracy += calc_accuracy(logits, labels)
        avg_eval_loss = total_eval_loss / len(val_dataloader)
        print('epoch: ', epoch,
              'train_loss: ', avg_train_loss,
              'valid_loss: ', avg_eval_loss,
              'valid_acc: ', total_eval_accuracy / len(val_dataloader))
input_ids, attention_masks, labels = get_input_id_and_attention_masks()
ds = createTensorDS(input_ids, attention_masks, labels)
train_dataset, val_dataset = split(ds)
train_dataloader, valid_dataloader = createDataloaders(train_dataset, val_dataset)
train(model, train_dataloader, valid_dataloader, 3)
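# Illustrative inference sketch (not part of the original script): how the
# trained model could score a new summary. The helper name predict_genres and
# the 0.5 sigmoid threshold are assumptions, not values taken from the code above.
def predict_genres(text, threshold=0.5):
    model.eval()
    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
    )
    with torch.no_grad():
        out = model(encoded['input_ids'].to(device),
                    attention_mask=encoded['attention_mask'].to(device))
    # Training used BCEWithLogitsLoss (one sigmoid per genre), so apply a
    # sigmoid here and keep every genre whose probability clears the threshold.
    probs = torch.sigmoid(out['logits']).squeeze(0)
    return [i for i, p in enumerate(probs) if p.item() >= threshold]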