# John Makely
# Fine-tune multi-label toxic-comment classifiers (BERTweet, RoBERTa, DistilBERT)
# ./jigsaw-toxic-comment-classification-challenge/train.csv
# "id","comment_text","toxic","severe_toxic","obscene","threat","insult","identity_hate" [6 label columns]
# 1. Extract text from csv
# 2. Tokenize text (BERTweet, RoBERTa, DistilBERT)
# 3. Pass the tokenized text to a model with a 6-way multi-label head
# 4. Train each model
# 5. Save each model
import os

import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          RobertaTokenizer, Trainer, TrainingArguments)

# Free any cached GPU memory from a previous run (no-op on CPU-only machines)
torch.cuda.empty_cache()
# Create Dataset class
class MultiLabelClassifierDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx])
                for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]).float()
        return item

    def __len__(self):
        return len(self.labels)
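# Note: Trainer expects each item to be a dict with 'input_ids', 'attention_mask'
# and 'labels'. Casting labels to float matters here: with float label vectors and
# problem_type="multi_label_classification", the model applies BCEWithLogitsLoss
# (an independent sigmoid per class) rather than single-label cross-entropy.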
# Set up directories
work_dir = os.path.dirname(os.path.realpath(__file__)) + '/'
dataset_dir = work_dir + 'jigsaw-toxic-comment-classification-challenge/'
# Set up labels
classifiers = ['toxic', 'severe_toxic', 'obscene',
               'threat', 'insult', 'identity_hate']
# Load train.csv; only the first 10% of the shuffled data is actually used for
# training below (no val/test split is carved out here)
print("Loading data...")
df = pd.read_csv(dataset_dir + 'train.csv')
df = df.sample(frac=1).reset_index(drop=True)  # Shuffle
train_df = df[:int(len(df)*0.1)]
# Extract the 6 label columns into a numpy array
train_labels = train_df[classifiers].to_numpy()
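# train_labels has shape (n_samples, 6); each row is a multi-hot vector, e.g. a
# comment flagged as both toxic and insult maps to [1, 0, 0, 0, 1, 0]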
# Report the device (Trainer selects its own device; this print is informational).
# Fall back to CPU so the script does not crash on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device: ", device)
# # # # # # # # # # # # # #
# # # # # BERTweet # # # # #
# # # # # # # # # # # # # #
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    fp16=True
)
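# Note: fp16=True enables mixed-precision training and needs a CUDA GPU;
# TrainingArguments typically raises an error if fp16 is requested on CPU,
# so drop the flag when no GPU is available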
| print("BERT") | |
| bert_dir = work_dir + 'bert/' | |
| print("Tokenizing") | |
| print("Model base: ", "vinai/bertweet-base") | |
| tokenizer = AutoTokenizer.from_pretrained( | |
| "vinai/bertweet-base", model_max_length=128) | |
| print("Creating train encodings...") | |
| train_encodings = tokenizer( | |
| train_df['comment_text'].tolist(), truncation=True, padding=True) | |
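# train_encodings is a dict of parallel lists ('input_ids', 'attention_mask'),
# padded to the longest comment and truncated to model_max_length=128 tokens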
print("Training model to be stored in " + bert_dir)
# Create dataset
print("Creating dataset")
train_dataset = MultiLabelClassifierDataset(train_encodings, train_labels)
# Load model with a 6-label multi-label head so BCEWithLogitsLoss is used
print("Loading model for training...")
model = AutoModelForSequenceClassification.from_pretrained(
    'vinai/bertweet-base', num_labels=6,
    problem_type="multi_label_classification")
# Create Trainer
print("Creating trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)
# Train
print("Training...")
trainer.train()
# Save model
print("Saving model to " + bert_dir + '_bert_model')
trainer.save_model(bert_dir + '_bert_model')
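# Suggested addition (not in the original script): save the tokenizer next to the
# model so the checkpoint can be reloaded later without fetching the base repo:
# tokenizer.save_pretrained(bert_dir + '_bert_model')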
# # # # # # # # # # # #
# # # # RoBERTa # # # #
# # # # # # # # # # # #
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    fp16=True
)
print("RoBERTa")
roberta_dir = work_dir + 'roberta/'
print("Tokenizing")
print("Model base: ", 'roberta-base')
tokenizer = RobertaTokenizer.from_pretrained(
    'roberta-base', model_max_length=128)
train_encodings = tokenizer(
    train_df['comment_text'].tolist(), truncation=True, padding=True)
# Create dataset
print("Creating dataset")
train_dataset = MultiLabelClassifierDataset(train_encodings, train_labels)
# Load model with a multi-label head, as above
print("Loading model for training...")
model = AutoModelForSequenceClassification.from_pretrained(
    'roberta-base', num_labels=6,
    problem_type="multi_label_classification")
# Create Trainer
print("Creating trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)
# Train
print("Training...")
trainer.train()
# Save model
print("Saving model to " + roberta_dir + '_roberta_model')
trainer.save_model(roberta_dir + '_roberta_model')
# # # # # # # # # # # # # #
# # # # DistilBERT # # # #
# # # # # # # # # # # # # #
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    fp16=True
)
print("DISTILBERT")
distilbert_dir = work_dir + 'distilbert/'
print("Tokenizing")
print("Model base: ", 'distilbert-base-cased')
tokenizer = AutoTokenizer.from_pretrained(
    'distilbert-base-cased', model_max_length=128)
print("Creating train encodings...")
train_encodings = tokenizer(
    train_df['comment_text'].tolist(), truncation=True, padding=True)
| print("Training model to be stored in" + distilbert_dir) | |
# Create dataset
print("Creating dataset")
train_dataset = MultiLabelClassifierDataset(train_encodings, train_labels)
# Load model with a multi-label head, as above
print("Loading model for training...")
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-cased', num_labels=6,
    problem_type="multi_label_classification")
# Create Trainer
print("Creating trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)
# Train
print("Training...")
trainer.train()
# Save model
print("Saving model to " + distilbert_dir + '_distilbert_model')
trainer.save_model(distilbert_dir + '_distilbert_model')
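# Illustrative inference sketch (an addition, not part of the original pipeline):
# reload the saved BERTweet checkpoint and score a single comment. For multi-label
# classification the logits are mapped to independent per-class probabilities with
# a sigmoid, not a softmax over the 6 classes.
model = AutoModelForSequenceClassification.from_pretrained(bert_dir + '_bert_model')
model.eval()
tokenizer = AutoTokenizer.from_pretrained(
    'vinai/bertweet-base', model_max_length=128)
inputs = tokenizer("example comment to score", truncation=True,
                   padding=True, return_tensors='pt')
with torch.no_grad():
    logits = model(**inputs).logits
probs = torch.sigmoid(logits).squeeze(0)
for name, p in zip(classifiers, probs.tolist()):
    print(f"{name}: {p:.3f}")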