import numpy as np
import torch
from torch.utils.data import DataLoader

from accelerate.utils.dataclasses import DistributedType


class RegressionDataset:
    def __init__(self, a=2, b=3, length=64, seed=None):
        rng = np.random.default_rng(seed)
        self.length = length
        self.x = rng.normal(size=(length,)).astype(np.float32)
        self.y = a * self.x + b + rng.normal(scale=0.1, size=(length,)).astype(np.float32)

    def __len__(self):
        return self.length

    def __getitem__(self, i):
        return {"x": self.x[i], "y": self.y[i]}


class RegressionModel(torch.nn.Module):
    def __init__(self, a=0, b=0, double_output=False):
        super().__init__()
        self.a = torch.nn.Parameter(torch.tensor(a).float())
        self.b = torch.nn.Parameter(torch.tensor(b).float())
        self.first_batch = True

    def forward(self, x=None):
        if self.first_batch:
            print(f"Model dtype: {self.a.dtype}, {self.b.dtype}. Input dtype: {x.dtype}")
            self.first_batch = False
        return x * self.a + self.b
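

# Illustrative sketch (not part of the original helpers): a minimal loop that fits
# RegressionModel to RegressionDataset with plain SGD and an MSE loss. The function
# name, hyperparameters, and seed are placeholders chosen for illustration; defining it
# as a function keeps importing this module free of side effects.
def _example_regression_fit(num_epochs=5, batch_size=16, lr=0.1, seed=42):
    dataset = RegressionDataset(length=96, seed=seed)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    model = RegressionModel()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    for _ in range(num_epochs):
        for batch in loader:
            optimizer.zero_grad()
            loss = torch.nn.functional.mse_loss(model(batch["x"]), batch["y"])
            loss.backward()
            optimizer.step()
    # After a few epochs, model.a and model.b should approach the dataset's a=2, b=3.
    return model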


def mocked_dataloaders(accelerator, batch_size: int = 16):
    from datasets import load_dataset
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    data_files = {"train": "tests/test_samples/MRPC/train.csv", "validation": "tests/test_samples/MRPC/dev.csv"}
    datasets = load_dataset("csv", data_files=data_files)
    label_list = datasets["train"].unique("label")

    label_to_id = {v: i for i, v in enumerate(label_list)}

    def tokenize_function(examples):
        # max_length=None lets the tokenizer fall back to the model's maximum length.
        outputs = tokenizer(
            examples["sentence1"], examples["sentence2"], truncation=True, max_length=None, padding="max_length"
        )
        if "label" in examples:
            outputs["labels"] = [label_to_id[l] for l in examples["label"]]
        return outputs

    # Tokenize every split and drop the raw text/label columns.
    tokenized_datasets = datasets.map(
        tokenize_function,
        batched=True,
        remove_columns=["sentence1", "sentence2", "label"],
    )

    def collate_fn(examples):
        # On XLA/TPU, pad everything to a fixed length to avoid recompilation.
        if accelerator.distributed_type == DistributedType.XLA:
            return tokenizer.pad(examples, padding="max_length", max_length=128, return_tensors="pt")
        return tokenizer.pad(examples, padding="longest", return_tensors="pt")

    # Instantiate dataloaders (small, fixed batch sizes keep the mocked run fast).
    train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=2)
    eval_dataloader = DataLoader(tokenized_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=1)

    return train_dataloader, eval_dataloader
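

# Illustrative sketch (assumption, not part of the original file): one way a test
# script might drive the mocked MRPC dataloaders with an Accelerator. The model name,
# optimizer choice, and learning rate are placeholders for illustration.
def _example_mocked_dataloaders_usage():
    from accelerate import Accelerator
    from transformers import AutoModelForSequenceClassification

    accelerator = Accelerator()
    train_dataloader, eval_dataloader = mocked_dataloaders(accelerator)
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader
    )
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        accelerator.backward(outputs.loss)
        optimizer.step()
        optimizer.zero_grad()
        break  # a single step is enough to show the flow
    return model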


def mocked_dataloaders_for_autoregressive_models(accelerator, batch_size: int = 16):
    from datasets import load_dataset
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-360M")
    tokenizer.pad_token = tokenizer.eos_token

    data_files = {"train": "tests/test_samples/MRPC/train.csv", "validation": "tests/test_samples/MRPC/dev.csv"}
    datasets = load_dataset("csv", data_files=data_files)

    def tokenize_function(examples):
        # max_length=None lets the tokenizer fall back to the model's maximum length.
        outputs = tokenizer(examples["sentence1"], truncation=True, max_length=None, return_attention_mask=False)
        return outputs

    # Tokenize on the main process first so the other processes can reuse the cached
    # dataset instead of repeating the work.
    with accelerator.main_process_first():
        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            remove_columns=["sentence1", "sentence2", "label"],
        )

    def collate_fn(examples):
        # On XLA/TPU, pad to a fixed length to avoid recompilation; otherwise pad to the
        # longest sequence in the batch.
        max_length = (
            128
            if accelerator.distributed_type == DistributedType.XLA
            else max([len(e["input_ids"]) for e in examples])
        )
        # In reduced precision, pad to a multiple that keeps tensor-core/FP8 kernels efficient.
        if accelerator.mixed_precision == "fp8":
            pad_to_multiple_of = 16
        elif accelerator.mixed_precision != "no":
            pad_to_multiple_of = 8
        else:
            pad_to_multiple_of = None

        batch = tokenizer.pad(
            examples,
            padding="max_length",
            max_length=max_length + 1,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors="pt",
        )

        # Shift by one position for next-token prediction and mask padding out of the loss.
        batch["labels"] = batch["input_ids"][:, 1:]
        batch["input_ids"] = batch["input_ids"][:, :-1]

        batch["labels"] = torch.where(batch["labels"] == tokenizer.pad_token_id, -100, batch["labels"])

        return batch

    # Instantiate dataloaders (small, fixed batch sizes keep the mocked run fast).
    train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=False, collate_fn=collate_fn, batch_size=2)
    eval_dataloader = DataLoader(tokenized_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=1)

    return train_dataloader, eval_dataloader
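

# Minimal sketch (illustrative, not part of the original helpers): how the collate_fn
# above turns padded token ids into next-token-prediction inputs and labels. Given ids
# [t0, t1, t2, pad], input_ids become [t0, t1, t2] and labels become [t1, t2, -100],
# so position i predicts token i + 1 and padding is ignored by the loss. The pad id and
# token values here are made up for the example.
def _example_autoregressive_label_shift(pad_token_id=0):
    padded_ids = torch.tensor([[11, 12, 13, pad_token_id]])
    labels = padded_ids[:, 1:]
    input_ids = padded_ids[:, :-1]
    labels = torch.where(labels == pad_token_id, -100, labels)
    # input_ids -> tensor([[11, 12, 13]]); labels -> tensor([[12, 13, -100]])
    return input_ids, labels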