import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.optim import AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imblearn.under_sampling import RandomUnderSampler
from transformers import AutoModel, BertTokenizerFast

from load_data import load_dataset_
from preprocess_data import preprocess
from model import BERT_Arch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
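
# Load the raw dataset and preprocess it into parallel lists of texts and labels.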
dataset = load_dataset_()
texts, labels = preprocess(dataset)
df = pd.DataFrame({"texts": texts, "labels": labels})
df = df.iloc[:-40000][["texts", "labels"]]  # drop the last 40,000 rows
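
# Balance the classes by randomly undersampling the majority class.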
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(pd.DataFrame(df["texts"]), pd.DataFrame(df["labels"]))
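
# Stratified split: 70% train, with the remaining 30% divided evenly into validation and test.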
train_text, temp_text, train_labels, temp_labels = train_test_split(
    X_res, y_res, random_state=2018, test_size=0.3, stratify=y_res
)
val_text, test_text, val_labels, test_labels = train_test_split(
    temp_text, temp_labels, random_state=2018, test_size=0.5, stratify=temp_labels
)
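
# Load the pretrained BERT encoder and its fast tokenizer.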
bert = AutoModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
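
# Tokenize each split, padding/truncating every sequence to 25 tokens.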
# Note: `pad_to_max_length=True` is deprecated; `padding='max_length'` is the
# equivalent argument in current transformers releases.
tokens_train = tokenizer.batch_encode_plus(
    train_text['texts'].tolist(),
    max_length=25,
    padding='max_length',
    truncation=True,
)
tokens_val = tokenizer.batch_encode_plus(
    val_text['texts'].tolist(),
    max_length=25,
    padding='max_length',
    truncation=True,
)
tokens_test = tokenizer.batch_encode_plus(
    test_text['texts'].tolist(),
    max_length=25,
    padding='max_length',
    truncation=True,
)
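
# Convert token ids, attention masks, and integer labels to tensors.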
train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(train_labels['labels'].tolist())

val_seq = torch.tensor(tokens_val['input_ids'])
val_mask = torch.tensor(tokens_val['attention_mask'])
val_y = torch.tensor(val_labels['labels'].tolist())

test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
test_y = torch.tensor(test_labels['labels'].tolist())
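
# Random sampling for training batches; sequential sampling for validation.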
batch_size = 32

train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)
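
# Freeze the BERT encoder so only the task head added by BERT_Arch is trained.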
for param in bert.parameters():
    param.requires_grad = False
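
# `BERT_Arch` comes from model.py, which is not shown here. Because the loss
# below is nn.NLLLoss, the model presumably ends in a log-softmax layer.
# Purely as an assumption, a compatible definition would look roughly like:
#
#   class BERT_Arch(nn.Module):
#       def __init__(self, bert):
#           super().__init__()
#           self.bert = bert
#           self.fc = nn.Linear(768, 2)  # bert-base hidden size -> assumed 2 classes
#           self.log_softmax = nn.LogSoftmax(dim=1)
#
#       def forward(self, sent_id, mask):
#           out = self.bert(sent_id, attention_mask=mask)
#           return self.log_softmax(self.fc(out.pooler_output))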
model = BERT_Arch(bert)
model = model.to(device)
optimizer = AdamW(model.parameters(), lr=1e-5)
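
# Weight the loss inversely to class frequency to offset any residual imbalance.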
class_weights = compute_class_weight(
    "balanced", classes=np.unique(train_labels['labels']), y=train_labels['labels']
)
weights = torch.tensor(class_weights, dtype=torch.float).to(device)
cross_entropy = nn.NLLLoss(weight=weights)
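
# One full pass over the training data: forward, loss, backward, clipped
# gradient step; returns the average loss and the stacked predictions.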
def train():
    model.train()
    total_loss = 0
    total_preds = []
    for step, batch in enumerate(train_dataloader):
        if step % 50 == 0 and step != 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))
        batch = [r.to(device) for r in batch]
        sent_id, mask, labels = batch
        model.zero_grad()
        preds = model(sent_id, mask)
        loss = cross_entropy(preds, labels)
        total_loss = total_loss + loss.item()
        loss.backward()
        # Clip gradients to stabilize training.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        total_preds.append(preds.detach().cpu().numpy())
    avg_loss = total_loss / len(train_dataloader)
    total_preds = np.concatenate(total_preds, axis=0)
    return avg_loss, total_preds
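
# Evaluate on the validation set without gradient tracking; returns the
# average loss and the stacked predictions.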
def evaluate():
    print("\nEvaluating...")
    model.eval()
    total_loss = 0
    total_preds = []
    for step, batch in enumerate(val_dataloader):
        batch = [t.to(device) for t in batch]
        sent_id, mask, labels = batch
        with torch.no_grad():
            preds = model(sent_id, mask)
            loss = cross_entropy(preds, labels)
            total_loss = total_loss + loss.item()
            total_preds.append(preds.detach().cpu().numpy())
    avg_loss = total_loss / len(val_dataloader)
    total_preds = np.concatenate(total_preds, axis=0)
    return avg_loss, total_preds
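
# Train for `epochs` epochs, checkpointing whenever validation loss improves.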
best_valid_loss = float('inf')
epochs = 50
train_losses = []
valid_losses = []

for epoch in range(epochs):
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))
    train_loss, _ = train()
    valid_loss, _ = evaluate()
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')
# Get predictions for the test data, restoring the best checkpoint saved
# during training before running inference.
model.load_state_dict(torch.load('saved_weights.pt'))
model.eval()
with torch.no_grad():
    preds = model(test_seq.to(device), test_mask.to(device))
    preds = preds.detach().cpu().numpy()

# Model's performance on the held-out test set.
preds = np.argmax(preds, axis=1)
print(classification_report(test_y, preds))
torch.save(model.state_dict(), 'model.pth')
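
# Note: the single forward pass above moves the whole test split to the GPU at
# once, which can exhaust memory on large datasets. A batched alternative,
# sketched here as a commented-out assumption rather than part of the script:
#
#   test_data = TensorDataset(test_seq, test_mask)
#   test_loader = DataLoader(test_data, sampler=SequentialSampler(test_data), batch_size=batch_size)
#   all_preds = []
#   with torch.no_grad():
#       for seq, mask in test_loader:
#           out = model(seq.to(device), mask.to(device))
#           all_preds.append(out.detach().cpu().numpy())
#   preds = np.argmax(np.concatenate(all_preds, axis=0), axis=1)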