| import dataclasses |
| import json |
| import logging |
| import os |
| from dataclasses import dataclass, field |
| from typing import Any, Dict, Optional, Tuple |
|
|
| from transformers.file_utils import cached_property, is_torch_available, is_torch_tpu_available, torch_required |
|
|
|
|
# torch and torch_xla are optional dependencies: import them only when the
# corresponding backend is actually available so this module stays importable
# in environments without PyTorch or TPU support.
if is_torch_available():
    import torch


if is_torch_tpu_available():
    # xla_model provides the XLA device abstraction used for TPU training.
    import torch_xla.core.xla_model as xm




logger = logging.getLogger(__name__)
|
|
|
|
def default_logdir() -> str:
    """
    Build the default TensorBoard log directory, mirroring PyTorch's own
    default: ``runs/<Mon DD_HH-MM-SS>_<hostname>``.
    """
    from datetime import datetime
    import socket

    timestamp = datetime.now().strftime("%b%d_%H-%M-%S")
    run_name = "{}_{}".format(timestamp, socket.gethostname())
    return os.path.join("runs", run_name)
|
|
|
|
@dataclass
class TrainingArguments:
    """
    TrainingArguments is the subset of the arguments we use in our example scripts
    **which relate to the training loop itself**.

    Using `HfArgumentParser` we can turn this class
    into argparse arguments to be able to specify them on
    the command line (each field's ``metadata["help"]`` becomes the argparse
    help text, and each default becomes the argparse default).
    """

    # The only required argument: where checkpoints and predictions are written.
    output_dir: str = field(
        metadata={"help": "The output directory where the model predictions and checkpoints will be written."}
    )
    overwrite_output_dir: bool = field(
        default=False,
        metadata={
            "help": (
                "Overwrite the content of the output directory."
                "Use this to continue training if output_dir points to a checkpoint directory."
            )
        },
    )

    # Which phases of the pipeline to run.
    do_train: bool = field(default=False, metadata={"help": "Whether to run training."})
    do_eval: bool = field(default=False, metadata={"help": "Whether to run eval on the dev set."})
    do_predict: bool = field(default=False, metadata={"help": "Whether to run predictions on the test set."})
    evaluate_during_training: bool = field(
        default=False, metadata={"help": "Run evaluation during training at each logging step."},
    )

    # Per-device batch sizes. The effective total batch size (scaled by the
    # number of GPUs) is exposed via the `train_batch_size` / `eval_batch_size`
    # properties below.
    per_device_train_batch_size: int = field(
        default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for training."}
    )
    per_device_eval_batch_size: int = field(
        default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for evaluation."}
    )

    # Deprecated aliases kept for backward compatibility; when set (truthy)
    # they take precedence and the batch-size properties log a warning.
    per_gpu_train_batch_size: Optional[int] = field(
        default=None,
        metadata={
            "help": "Deprecated, the use of `--per_device_train_batch_size` is preferred. "
            "Batch size per GPU/TPU core/CPU for training."
        },
    )
    per_gpu_eval_batch_size: Optional[int] = field(
        default=None,
        metadata={
            "help": "Deprecated, the use of `--per_device_eval_batch_size` is preferred."
            "Batch size per GPU/TPU core/CPU for evaluation."
        },
    )

    gradient_accumulation_steps: int = field(
        default=1,
        metadata={"help": "Number of updates steps to accumulate before performing a backward/update pass."},
    )

    # Optimizer hyperparameters (Adam) and gradient clipping.
    learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate for Adam."})
    weight_decay: float = field(default=0.0, metadata={"help": "Weight decay if we apply some."})
    adam_epsilon: float = field(default=1e-8, metadata={"help": "Epsilon for Adam optimizer."})
    max_grad_norm: float = field(default=1.0, metadata={"help": "Max gradient norm."})

    # Training duration: max_steps > 0 overrides num_train_epochs.
    num_train_epochs: float = field(default=3.0, metadata={"help": "Total number of training epochs to perform."})
    max_steps: int = field(
        default=-1,
        metadata={"help": "If > 0: set total number of training steps to perform. Override num_train_epochs."},
    )
    warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."})

    logging_tqdm: bool = field(default=False, metadata={"help": "Show tqdm or not."})
    eval_steps: int = field(default=500, metadata={"help": "Run validation every X updates steps."})

    # Logging / checkpointing cadence. logging_dir defaults to a fresh
    # timestamped directory via default_logdir().
    logging_dir: Optional[str] = field(default_factory=default_logdir, metadata={"help": "Tensorboard log dir."})
    logging_first_step: bool = field(default=False, metadata={"help": "Log and eval the first global_step"})
    logging_steps: int = field(default=500, metadata={"help": "Log every X updates steps."})
    save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X updates steps."})
    save_total_limit: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "Limit the total amount of checkpoints."
                "Deletes the older checkpoints in the output_dir. Default is unlimited checkpoints"
            )
        },
    )
    no_cuda: bool = field(default=False, metadata={"help": "Do not use CUDA even when it is available"})
    seed: int = field(default=42, metadata={"help": "random seed for initialization"})

    # Mixed-precision (Apex AMP) configuration.
    fp16: bool = field(
        default=False,
        metadata={"help": "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"},
    )
    fp16_opt_level: str = field(
        default="O1",
        metadata={
            "help": (
                "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
                "See details at https://nvidia.github.io/apex/amp.html"
            )
        },
    )
    # -1 means "not distributed"; see the local_rank branch in _setup_devices.
    local_rank: int = field(default=-1, metadata={"help": "For distributed training: local_rank"})

    # TPU-specific options.
    tpu_num_cores: Optional[int] = field(
        default=None, metadata={"help": "TPU: Number of TPU cores (automatically passed by launcher script)"}
    )
    tpu_metrics_debug: bool = field(default=False, metadata={"help": "TPU: Whether to print debug metrics"})

    dataloader_drop_last: bool = field(
        default=False, metadata={"help": "Drop the last incomplete batch if it is not divisible by the batch size."}
    )

    @property
    def train_batch_size(self) -> int:
        """
        The effective training batch size: the per-device value multiplied by
        the number of GPUs (at least 1). Honors the deprecated
        `per_gpu_train_batch_size` when set, with a warning.
        """
        if self.per_gpu_train_batch_size:
            logger.warning(
                "Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future "
                "version. Using `--per_device_train_batch_size` is preferred."
            )
        per_device_batch_size = self.per_gpu_train_batch_size or self.per_device_train_batch_size
        return per_device_batch_size * max(1, self.n_gpu)

    @property
    def eval_batch_size(self) -> int:
        """
        The effective evaluation batch size: the per-device value multiplied by
        the number of GPUs (at least 1). Honors the deprecated
        `per_gpu_eval_batch_size` when set, with a warning.
        """
        if self.per_gpu_eval_batch_size:
            logger.warning(
                "Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future "
                "version. Using `--per_device_eval_batch_size` is preferred."
            )
        per_device_batch_size = self.per_gpu_eval_batch_size or self.per_device_eval_batch_size
        return per_device_batch_size * max(1, self.n_gpu)

    @cached_property
    @torch_required
    def _setup_devices(self) -> Tuple["torch.device", int]:
        """
        Resolve the torch device and GPU count once (result is cached).

        Returns a ``(device, n_gpu)`` tuple. ``n_gpu`` counts CUDA devices
        only: it is 0 on CPU and TPU, and 1 per process in distributed mode.
        """
        logger.info("PyTorch: setting up devices")
        if self.no_cuda:
            # CUDA explicitly disabled: run on CPU.
            device = torch.device("cpu")
            n_gpu = 0
        elif is_torch_tpu_available():
            # TPU: let torch_xla pick the XLA device.
            device = xm.xla_device()
            n_gpu = 0
        elif self.local_rank == -1:
            # Single-process (non-distributed) run: use the first CUDA device
            # if any is available, otherwise fall back to CPU. All visible
            # GPUs are counted so the batch-size properties can scale by them.
            device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
            n_gpu = torch.cuda.device_count()
        else:
            # Distributed run: initialize the NCCL process group and bind this
            # process to the GPU matching its local rank (one GPU per process).
            torch.distributed.init_process_group(backend="nccl")
            device = torch.device("cuda", self.local_rank)
            n_gpu = 1

        if device.type == "cuda":
            torch.cuda.set_device(device)

        return device, n_gpu

    @property
    @torch_required
    def device(self) -> "torch.device":
        """The torch device to place the model and batches on."""
        return self._setup_devices[0]

    @property
    @torch_required
    def n_gpu(self):
        """Number of CUDA devices in use (0 on CPU/TPU, 1 per distributed process)."""
        return self._setup_devices[1]

    def to_json_string(self):
        """
        Serializes this instance to a JSON string.
        """
        return json.dumps(dataclasses.asdict(self), indent=2)

    def to_sanitized_dict(self) -> Dict[str, Any]:
        """
        Sanitized serialization to use with TensorBoard's hparams: values that
        are not bool/int/float/str (or torch.Tensor, when torch is available)
        are coerced to their string representation.
        """
        d = dataclasses.asdict(self)
        valid_types = [bool, int, float, str]
        if is_torch_available():
            valid_types.append(torch.Tensor)
        return {k: v if type(v) in valid_types else str(v) for k, v in d.items()}
|
|