# Fine-tunes BERT on the Jigsaw toxic-comment dataset (6-way multi-label classification).
# NOTE(review): the original paste was titled "Runtime error".
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import (
    BertForSequenceClassification,
    BertModel,
    BertTokenizerFast,
    Trainer,
    TrainingArguments,
)

# from torch.optim import AdamW
# assignment 3
| model_name = "bert-base-uncased" | |
class ToxicDataset(Dataset):
    """Pairs tokenizer encodings with multi-label targets for the HF Trainer.

    Args:
        encodings: mapping produced by the tokenizer (e.g. ``input_ids``,
            ``attention_mask``); each value is indexable per example.
        labels: per-example list of 0/1 flags, one per toxicity category.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # The encodings are built with return_tensors='pt', so val[idx] is already
        # a tensor; torch.as_tensor avoids the "copy construct" warning that
        # torch.tensor(tensor) emits, and still accepts plain lists.
        item = {key: torch.as_tensor(val[idx]) for key, val in self.encodings.items()}
        # Multi-label classification uses BCEWithLogitsLoss, which requires
        # float targets — integer labels raise at loss time.
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.labels)
| print("Reading data...") | |
| data = pd.read_csv("./data/train.csv") | |
| toxic_data = pd.DataFrame() | |
| toxic_data["text"] = data["comment_text"] | |
| toxic_data["labels"] = data.iloc[:, 2:].values.tolist() | |
| print("Data read. Splitting data...") | |
| train_texts, val_texts, train_labels, val_labels = train_test_split(toxic_data.text.to_list(), toxic_data.labels.to_list(), test_size=.2) | |
| print("Data split. Tokenizing data...") | |
| tokenizer = BertTokenizerFast.from_pretrained(model_name) | |
| train_encodings = tokenizer.batch_encode_plus(train_texts, truncation=True, padding=True, return_tensors='pt') | |
| val_encodings = tokenizer.batch_encode_plus(val_texts, truncation=True, padding=True, return_tensors='pt') | |
| train_dataset = ToxicDataset(train_encodings, train_labels) | |
| val_dataset = ToxicDataset(val_encodings, val_labels) | |
| print("Data tokenized. Beginning training...") | |
# Hyperparameters for the HF Trainer run; checkpoints go to ./results,
# TensorBoard logs to ./logs.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=16,
    warmup_steps=500,          # linear LR warmup before decay
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)
# Device placement is handled by Trainer (it moves the model to GPU when available).
#
# FIX: the script previously built a bare BertModel, which has no classification
# head and returns no loss — Trainer.train() then fails at the first step.
# BertForSequenceClassification with problem_type="multi_label_classification"
# adds a 6-logit head and computes BCEWithLogitsLoss over the float label vector.
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=6,
    problem_type="multi_label_classification",
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()
# Earlier manual training loop, kept for reference (superseded by Trainer above):
# model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=6)
# model.to(device)
# model.train()
# train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
# optim = AdamW(model.parameters(), lr=5e-5)
# num_train_epochs = 2
# for epoch in range(num_train_epochs):
#     for batch in train_loader:
#         optim.zero_grad()
#         input_ids = batch["input_ids"].to(device)
#         attention_mask = batch["attention_mask"].to(device)
#         labels = batch["labels"].to(device)
#         outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#         loss = outputs[0]
#         loss.backward()
#         optim.step()
# model.eval()
| print("Training complete. Saving model...") | |
| save_directory = "./results/model" | |
| model.save_pretrained(save_directory) | |
| print("Model saved.") |