| | |
| | import numpy as np |
| | import pandas as pd |
| | import csv |
| | import torch.nn as nn |
| | from torch.optim.lr_scheduler import ReduceLROnPlateau |
| | from torch.utils.data import TensorDataset, DataLoader |
| | from transformers import BertTokenizer,BertConfig,AdamW |
| | from sklearn.metrics import accuracy_score |
| | from sklearn.metrics import classification_report |
| | from tqdm import tqdm |
| | import torch |
| | import transformers |
| | from torch.utils.data import Dataset, DataLoader |
| |
|
| | |
| |
|
class MyDataSet(Dataset):
    """Minimal Dataset wrapper over an already-loaded, indexable collection.

    Used here so a pandas DataFrame can be handed to
    torch.utils.data.random_split, which only needs __len__ / __getitem__.
    """

    def __init__(self, loaded_data):
        # Keep a reference only; no copying or preprocessing happens here.
        self.data = loaded_data

    def __len__(self):
        """Number of items in the wrapped collection."""
        return len(self.data)

    def __getitem__(self, idx):
        """Delegate straight to the wrapped collection's indexing."""
        return self.data[idx]
| | |
# ---- Load the raw CSV and split it into train / dev / test files ----------
Data_path = "/kaggle/input/inference/train.csv"
Totle_data = pd.read_csv(Data_path)
# Work on a 10% random sample to keep the run small.
Totle_data = Totle_data.sample(frac=0.1)
# Drop rows whose column named "2" (presumably the label column — TODO confirm
# against the CSV header) is missing.
Totle_data = Totle_data.dropna(axis=0, subset=["2"])
custom_dataset = MyDataSet(Totle_data)

# 60% train / 10% dev / remainder test.
train_size = int(len(custom_dataset) * 0.6)
validate_size = int(len(custom_dataset) * 0.1)
test_size = len(custom_dataset) - validate_size - train_size
train_dataset, validate_dataset, test_dataset = torch.utils.data.random_split(
    custom_dataset, [train_size, validate_size, test_size])

train_data_path = "Bert_Try.csv"
dev_data_path = "Bert_Dev.csv"
test_data_path = "Bert_Test.csv"

# random_split returns Subset objects; use their .indices to materialise each
# split back into a DataFrame so it can be written out as CSV.
train_dataset = Totle_data.iloc[train_dataset.indices]
validate_dataset = Totle_data.iloc[validate_dataset.indices]
test_dataset = Totle_data.iloc[test_dataset.indices]

train_dataset.to_csv(train_data_path, index=False, header=True)
validate_dataset.to_csv(dev_data_path, index=False, header=True)
test_dataset.to_csv(test_data_path, index=False, header=True)

# Sanity-check the written train split.
data = pd.read_csv(train_data_path)
# BUG FIX: original had bare `data.head` (attribute access without the call),
# which is a no-op in a script; actually call and print the preview.
print(data.head())
| |
|
| | |
| |
|
class BertClassificationModel(nn.Module):
    """BERT encoder with a single linear head for 3-way classification.

    The full bert-base-chinese encoder is fine-tuned end to end (all
    parameters have requires_grad=True).
    """

    def __init__(self):
        super(BertClassificationModel, self).__init__()
        # Download/load the pretrained Chinese BERT encoder.
        self.bert = transformers.BertModel.from_pretrained("bert-base-chinese")
        # Explicitly unfreeze everything — full fine-tuning.
        for param in self.bert.parameters():
            param.requires_grad = True
        # 768 = bert-base hidden size; 3 output classes.
        self.dense = nn.Linear(768, 3)

    def forward(self, input_ids, token_type_ids, attention_mask):
        """Return raw (unsoftmaxed) logits of shape (batch, 3)."""
        encoder_out = self.bert(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
        )
        # Index 1 of the BERT output is the pooled [CLS] representation.
        pooled = encoder_out[1]
        return self.dense(pooled)
| |
|
| | |
| |
|
def encoder(max_len, vocab_path, text_list):
    """Tokenize a list of strings into padded/truncated BERT input tensors.

    Args:
        max_len: maximum token length; longer texts are truncated.
        vocab_path: currently UNUSED — the tokenizer is loaded from the
            "bert-base-chinese" hub name instead. Kept for interface
            compatibility; NOTE(review): either wire it up or remove it.
        text_list: list of raw text strings.

    Returns:
        Tuple of (input_ids, token_type_ids, attention_mask) as PyTorch
        tensors, all of shape (len(text_list), padded_len).
    """
    tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
    # FIX: store the encoding under its own name instead of shadowing
    # `tokenizer` with its own call result, as the original did.
    encoded = tokenizer(
        text_list,
        padding=True,
        truncation=True,
        max_length=max_len,
        return_tensors='pt',
    )
    return encoded['input_ids'], encoded['token_type_ids'], encoded['attention_mask']
| |
|
| | |
# Mapping from NLI label strings to integer class ids.
labels2dict = {"neutral":0,"entailment":1,"contradiction":2}


def load_data(path):
    """Load a (label, text) CSV and return a tokenized TensorDataset.

    Expects a header row followed by rows whose column 0 is a label string
    from `labels2dict` and column 1 is the text.

    Returns:
        TensorDataset of (input_ids, token_type_ids, attention_mask, labels).
    """
    text_list = []
    labels = []
    # FIX: the original opened the file and never closed it; use a context
    # manager, and pass newline='' as the csv module documentation requires.
    with open(path, newline='') as csv_file:
        reader = csv.reader(csv_file)
        next(reader, None)  # skip the header row
        for row in reader:
            labels.append(int(labels2dict[row[0]]))
            text_list.append(row[1])

    # vocab_path is currently ignored by encoder(); see its docstring.
    input_ids, token_type_ids, attention_mask = encoder(
        max_len=150,
        vocab_path="/root/Bert/bert-base-chinese/vocab.txt",
        text_list=text_list,
    )
    labels = torch.tensor(labels)
    return TensorDataset(input_ids, token_type_ids, attention_mask, labels)
| |
|
| | |
| | |
# Batch size shared by all three DataLoaders.
batch_size = 16

# NOTE(review): these paths re-declare the identical values assigned earlier
# in this file during the split step — keep the two places in sync.
train_data_path="Bert_Try.csv"
dev_data_path="Bert_Dev.csv"
test_data_path="Bert_Test.csv"

# Tokenize each CSV split into a TensorDataset (see load_data above).
train_data = load_data(train_data_path)
dev_data = load_data(dev_data_path)
test_data = load_data(test_data_path)

# Shuffle train/dev batches; keep test order fixed for reproducible reports.
train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dataset=dev_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=False)
| |
|
| | |
def dev(model, dev_loader):
    """Evaluate `model` on `dev_loader` and return overall accuracy.

    Side effects: moves the model to the module-level `device` and leaves it
    in eval mode — callers that continue training must call model.train().
    """
    model.to(device)
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        # FIX: corrected the typo 'Itreation' in the progress description.
        for step, (input_ids, token_type_ids, attention_mask, labels) in tqdm(
                enumerate(dev_loader), desc='Dev Iteration:'):
            input_ids = input_ids.to(device)
            token_type_ids = token_type_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)
            out_put = model(input_ids, token_type_ids, attention_mask)
            # Predicted class = argmax over the 3 logits.
            _, predict = torch.max(out_put.data, 1)
            correct += (predict == labels).sum().item()
            total += labels.size(0)
        # FIX: guard against an empty loader (original raised ZeroDivisionError).
        return correct / total if total else 0.0
| |
|
| | |
| |
|
# Single global device used by both train() and dev().
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')


def train(model, train_loader, dev_loader):
    """Fine-tune `model` on `train_loader`, validating on `dev_loader`.

    Saves the entire model object to 'bert_model.pkl' whenever dev accuracy
    improves, and steps a ReduceLROnPlateau scheduler on the best dev
    accuracy after every validation.
    """
    model.to(device)
    model.train()
    criterion = nn.CrossEntropyLoss()

    # Apply weight decay to everything except biases and LayerNorm params,
    # per standard BERT fine-tuning practice.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]

    optimizer_params = {'lr': 1e-5, 'eps': 1e-6, 'correct_bias': False}
    optimizer = AdamW(optimizer_grouped_parameters, **optimizer_params)
    # Halve the LR when best dev accuracy plateaus.
    scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.5, min_lr=1e-7,
                                  patience=5, verbose=True, threshold=0.0001,
                                  eps=1e-08)

    total_epochs = 10
    bestAcc = 0
    # NOTE(review): correct/total accumulate across ALL epochs, so the printed
    # train accuracy is a running average since the start of training.
    correct = 0
    total = 0
    print('Training and verification begin!')
    for epoch in range(total_epochs):
        for step, (input_ids, token_type_ids, attention_mask, labels) in enumerate(train_loader):
            optimizer.zero_grad()
            input_ids = input_ids.to(device)
            token_type_ids = token_type_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)
            out_put = model(input_ids, token_type_ids, attention_mask)
            loss = criterion(out_put, labels)
            _, predict = torch.max(out_put.data, 1)
            correct += (predict == labels).sum().item()
            total += labels.size(0)
            loss.backward()
            optimizer.step()

            if (step + 1) % 10 == 0:
                train_acc = correct / total
                print("Train Epoch[{}/{}],step[{}/{}],tra_acc{:.6f} %,loss:{:.6f}".format(
                    epoch + 1, total_epochs, step + 1, len(train_loader),
                    train_acc * 100, loss.item()))

            if (step + 1) % 200 == 0:
                train_acc = correct / total
                acc = dev(model, dev_loader)
                # BUG FIX: dev() leaves the model in eval mode (dropout off);
                # the original never switched back, so all training after the
                # first validation ran in eval mode. Restore train mode here.
                model.train()
                if bestAcc < acc:
                    bestAcc = acc
                    # Persist the full model object on dev improvement.
                    path = 'bert_model.pkl'
                    torch.save(model, path)
                print("DEV Epoch[{}/{}],step[{}/{}],tra_acc{:.6f} %,bestAcc{:.6f}%,dev_acc{:.6f} %,loss:{:.6f}".format(
                    epoch + 1, total_epochs, step + 1, len(train_loader),
                    train_acc * 100, bestAcc * 100, acc * 100, loss.item()))
                scheduler.step(bestAcc)
| |
|
| | |
| |
|
# NOTE(review): `path` is assigned but never used in this file — presumably
# intended for torch.load of a previously saved checkpoint; confirm before
# deleting.
path = '/kaggle/input/inference/bert_model.pkl'

# Build a fresh model and fine-tune it; train() saves the best checkpoint.
model = BertClassificationModel()

train(model,train_loader,dev_loader)
| |
|