import os

# Keep the Hugging Face cache inside the project directory.
os.environ["HF_HOME"] = "/home/jovyan/work/learn-ml/huggingface"

import torch
from torch.optim import AdamW  # transformers.AdamW is deprecated; torch.optim.AdamW is the recommended replacement
from torch.utils.data import DataLoader
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    get_scheduler,
)
from datasets import load_dataset, load_metric
from accelerate import Accelerator
from tqdm.auto import tqdm

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

raw_datasets = load_dataset("glue", "mrpc")


def tokenize_function(example):
    # Tokenize the sentence pair; padding is deferred to the data collator,
    # which pads each batch to its own longest sequence (dynamic padding).
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)


tokenized_dataset = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Sanity check: sequence lengths vary before the collator pads each batch.
samples = tokenized_dataset["train"][:8]
samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence1", "sentence2"]}
print([len(x) for x in samples["input_ids"]])

# Drop the raw-text columns, rename the label column to the name the model
# expects, and return PyTorch tensors.
tokenized_dataset = tokenized_dataset.remove_columns(["sentence1", "sentence2", "idx"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset.set_format("torch")
print(tokenized_dataset.column_names["train"])

train_dataloader = DataLoader(
    tokenized_dataset["train"],
    shuffle=True,
    batch_size=8,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    tokenized_dataset["validation"],
    batch_size=8,
    collate_fn=data_collator,
)

accelerator = Accelerator()

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
optimizer = AdamW(model.parameters(), lr=3e-5)

# Let Accelerate handle device placement and, if configured, distributed wrapping.
train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
    train_dataloader, eval_dataloader, model, optimizer
)

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)  # replaces loss.backward() under Accelerate
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
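
# --- Evaluation sketch (not in the original script) ---
# The script builds eval_dataloader and imports load_metric but never runs
# evaluation. A minimal sketch completing that step, assuming the (deprecated)
# datasets.load_metric imported above is still available; on newer datasets
# releases the equivalent is `import evaluate` and `evaluate.load("glue", "mrpc")`.
metric = load_metric("glue", "mrpc")
model.eval()
for batch in eval_dataloader:
    with torch.no_grad():
        outputs = model(**batch)
    predictions = torch.argmax(outputs.logits, dim=-1)
    # Gather predictions and labels across processes so the metric sees every
    # sample when running distributed; this is a no-op on a single process.
    metric.add_batch(
        predictions=accelerator.gather(predictions),
        references=accelerator.gather(batch["labels"]),
    )
print(metric.compute())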