Buckets:
| <meta charset="utf-8" /><meta name="hf:doc:metadata" content="{"title":"Methods and tools for efficient training on a single GPU","local":"methods-and-tools-for-efficient-training-on-a-single-gpu","sections":[{"title":"Batch size choice","local":"batch-size-choice","sections":[],"depth":2},{"title":"Gradient Accumulation","local":"gradient-accumulation","sections":[],"depth":2},{"title":"Gradient Checkpointing","local":"gradient-checkpointing","sections":[],"depth":2},{"title":"Mixed precision training","local":"mixed-precision-training","sections":[{"title":"fp16","local":"fp16","sections":[],"depth":3},{"title":"BF16","local":"bf16","sections":[],"depth":3},{"title":"TF32","local":"tf32","sections":[],"depth":3}],"depth":2},{"title":"Flash Attention 2","local":"flash-attention-2","sections":[],"depth":2},{"title":"Optimizer choice","local":"optimizer-choice","sections":[{"title":"Adafactor","local":"adafactor","sections":[],"depth":3},{"title":"8-bit Adam","local":"8-bit-adam","sections":[],"depth":3},{"title":"multi_tensor","local":"multitensor","sections":[],"depth":3}],"depth":2},{"title":"Data preloading","local":"data-preloading","sections":[],"depth":2},{"title":"DeepSpeed ZeRO","local":"deepspeed-zero","sections":[],"depth":2},{"title":"Using torch.compile","local":"using-torchcompile","sections":[],"depth":2},{"title":"Using 🤗 PEFT","local":"using--peft","sections":[],"depth":2},{"title":"Using 🤗 Accelerate","local":"using--accelerate","sections":[],"depth":2},{"title":"Efficient Software Prebuilds","local":"efficient-software-prebuilds","sections":[],"depth":2},{"title":"Mixture of Experts","local":"mixture-of-experts","sections":[],"depth":2},{"title":"Using PyTorch native attention and Flash Attention","local":"using-pytorch-native-attention-and-flash-attention","sections":[],"depth":2}],"depth":1}"> | |
| <link href="/docs/transformers/pr_33913/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload"> | |
| <link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/entry/start.b67f883f.js"> | |
| <link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/scheduler.25b97de1.js"> | |
| <link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/singletons.62a184e0.js"> | |
| <link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/index.e188933d.js"> | |
| <link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/paths.51881b9e.js"> | |
| <link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/entry/app.e436b1f2.js"> | |
| <link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/index.d9030fc9.js"> | |
| <link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/nodes/0.05e395f5.js"> | |
| <link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/each.e59479a4.js"> | |
| <link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/nodes/386.88c43315.js"> | |
| <link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/Tip.baa67368.js"> | |
| <link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/CodeBlock.e6cd0d95.js"> | |
| <link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/EditOnGithub.91d95064.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Methods and tools for efficient training on a single GPU","local":"methods-and-tools-for-efficient-training-on-a-single-gpu","sections":[{"title":"Batch size choice","local":"batch-size-choice","sections":[],"depth":2},{"title":"Gradient Accumulation","local":"gradient-accumulation","sections":[],"depth":2},{"title":"Gradient Checkpointing","local":"gradient-checkpointing","sections":[],"depth":2},{"title":"Mixed precision training","local":"mixed-precision-training","sections":[{"title":"fp16","local":"fp16","sections":[],"depth":3},{"title":"BF16","local":"bf16","sections":[],"depth":3},{"title":"TF32","local":"tf32","sections":[],"depth":3}],"depth":2},{"title":"Flash Attention 2","local":"flash-attention-2","sections":[],"depth":2},{"title":"Optimizer choice","local":"optimizer-choice","sections":[{"title":"Adafactor","local":"adafactor","sections":[],"depth":3},{"title":"8-bit Adam","local":"8-bit-adam","sections":[],"depth":3},{"title":"multi_tensor","local":"multitensor","sections":[],"depth":3}],"depth":2},{"title":"Data preloading","local":"data-preloading","sections":[],"depth":2},{"title":"DeepSpeed ZeRO","local":"deepspeed-zero","sections":[],"depth":2},{"title":"Using torch.compile","local":"using-torchcompile","sections":[],"depth":2},{"title":"Using 🤗 PEFT","local":"using--peft","sections":[],"depth":2},{"title":"Using 🤗 Accelerate","local":"using--accelerate","sections":[],"depth":2},{"title":"Efficient Software Prebuilds","local":"efficient-software-prebuilds","sections":[],"depth":2},{"title":"Mixture of Experts","local":"mixture-of-experts","sections":[],"depth":2},{"title":"Using PyTorch native attention and Flash Attention","local":"using-pytorch-native-attention-and-flash-attention","sections":[],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> 
<p></p> <h1 class="relative group"><a id="methods-and-tools-for-efficient-training-on-a-single-gpu" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#methods-and-tools-for-efficient-training-on-a-single-gpu"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Methods and tools for efficient training on a single GPU</span></h1> <p data-svelte-h="svelte-tqdu0u">This guide demonstrates practical techniques that you can use to increase the efficiency of your model’s training by | |
| optimizing memory utilization, speeding up the training, or both. If you’d like to understand how the GPU is utilized during | |
| training, please refer to the <a href="model_memory_anatomy">Model training anatomy</a> conceptual guide first. This guide | |
| focuses on practical techniques.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-198lgun">If you have access to a machine with multiple GPUs, these approaches are still valid, plus you can leverage additional methods outlined in the <a href="perf_train_gpu_many">multi-GPU section</a>.</p></div> <p data-svelte-h="svelte-4s75i8">When training large models, there are two aspects that should be considered at the same time:</p> <ul data-svelte-h="svelte-djgcn1"><li>Data throughput/training time</li> <li>Model performance</li></ul> <p data-svelte-h="svelte-uf18r4">Maximizing the throughput (samples/second) leads to lower training cost. This is generally achieved by utilizing the GPU | |
| as much as possible and thus filling GPU memory to its limit. If the desired batch size exceeds the limits of the GPU memory, | |
| memory optimization techniques, such as gradient accumulation, can help.</p> <p data-svelte-h="svelte-b3xmwb">However, if the preferred batch size fits into memory, there’s no reason to apply memory-optimizing techniques because they can | |
| slow down the training. Just because one can use a large batch size, does not necessarily mean they should. As part of | |
| hyperparameter tuning, you should determine which batch size yields the best results and then optimize resources accordingly.</p> <p data-svelte-h="svelte-1qbc4zp">The methods and tools covered in this guide can be classified based on the effect they have on the training process:</p> <table data-svelte-h="svelte-chf3xh"><thead><tr><th align="left">Method/tool</th> <th align="left">Improves training speed</th> <th align="left">Optimizes memory utilization</th></tr></thead> <tbody><tr><td align="left"><a href="#batch-size-choice">Batch size choice</a></td> <td align="left">Yes</td> <td align="left">Yes</td></tr> <tr><td align="left"><a href="#gradient-accumulation">Gradient accumulation</a></td> <td align="left">No</td> <td align="left">Yes</td></tr> <tr><td align="left"><a href="#gradient-checkpointing">Gradient checkpointing</a></td> <td align="left">No</td> <td align="left">Yes</td></tr> <tr><td align="left"><a href="#mixed-precision-training">Mixed precision training</a></td> <td align="left">Yes</td> <td align="left">Maybe*</td></tr> <tr><td align="left"><a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.torch_empty_cache_steps" rel="nofollow">torch_empty_cache_steps</a></td> <td align="left">No</td> <td align="left">Yes</td></tr> <tr><td align="left"><a href="#optimizer-choice">Optimizer choice</a></td> <td align="left">Yes</td> <td align="left">Yes</td></tr> <tr><td align="left"><a href="#data-preloading">Data preloading</a></td> <td align="left">Yes</td> <td align="left">No</td></tr> <tr><td align="left"><a href="#deepspeed-zero">DeepSpeed Zero</a></td> <td align="left">No</td> <td align="left">Yes</td></tr> <tr><td align="left"><a href="#using-torchcompile">torch.compile</a></td> <td align="left">Yes</td> <td align="left">No</td></tr> <tr><td align="left"><a href="#using--peft">Parameter-Efficient Fine Tuning (PEFT)</a></td> <td align="left">No</td> <td align="left">Yes</td></tr></tbody></table> 
<div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-fh3gpe">*Note: when using mixed precision with a small model and a large batch size, there will be some memory savings but with a | |
| large model and a small batch size, the memory use will be larger.</p></div> <p data-svelte-h="svelte-ilb56e">You can combine the above methods to get a cumulative effect. These techniques are available to you whether you are | |
| training your model with <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a> or writing a pure PyTorch loop, in which case you can <a href="#using--accelerate">configure these optimizations | |
| with 🤗 Accelerate</a>.</p> <p data-svelte-h="svelte-1thjxg4">If these methods do not result in sufficient gains, you can explore the following options:</p> <ul data-svelte-h="svelte-lxno45"><li><a href="#efficient-software-prebuilds">Look into building your own custom Docker container with efficient software prebuilds</a></li> <li><a href="#mixture-of-experts">Consider a model that uses Mixture of Experts (MoE)</a></li> <li><a href="#using-pytorch-native-attention-and-flash-attention">Convert your model to BetterTransformer to leverage PyTorch native attention</a></li></ul> <p data-svelte-h="svelte-1uvbmrd">Finally, if all of the above is still not enough, even after switching to a server-grade GPU like A100, consider moving | |
| to a multi-GPU setup. All these approaches are still valid in a multi-GPU setup, plus you can leverage additional parallelism | |
| techniques outlined in the <a href="perf_train_gpu_many">multi-GPU section</a>.</p> <h2 class="relative group"><a id="batch-size-choice" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#batch-size-choice"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Batch size choice</span></h2> <p data-svelte-h="svelte-1f9wkzb">To achieve optimal performance, start by identifying the appropriate batch size. It is recommended to use batch sizes and | |
| input/output neuron counts that are of size 2^N. Often it’s a multiple of 8, but it can be | |
| higher depending on the hardware being used and the model’s dtype.</p> <p data-svelte-h="svelte-h1ek0g">For reference, check out NVIDIA’s recommendation for <a href="https://docs.nvidia.com/deeplearning/performance/dl-performance-fully-connected/index.html#input-features" rel="nofollow">input/output neuron counts</a> and | |
| <a href="https://docs.nvidia.com/deeplearning/performance/dl-performance-fully-connected/index.html#batch-size" rel="nofollow">batch size</a> for | |
| fully connected layers (which are involved in GEMMs (General Matrix Multiplications)).</p> <p data-svelte-h="svelte-sfn5k1"><a href="https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc" rel="nofollow">Tensor Core Requirements</a> | |
| define the multiplier based on the dtype and the hardware. For instance, for fp16 data type a multiple of 8 is recommended, unless | |
| it’s an A100 GPU, in which case use multiples of 64.</p> <p data-svelte-h="svelte-wvp8kx">For parameters that are small, consider also <a href="https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#dim-quantization" rel="nofollow">Dimension Quantization Effects</a>. | |
| This is where tiling happens and the right multiplier can have a significant speedup.</p> <h2 class="relative group"><a id="gradient-accumulation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#gradient-accumulation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Gradient Accumulation</span></h2> <p data-svelte-h="svelte-1l7rfkq">The <strong>gradient accumulation</strong> method aims to calculate gradients in smaller increments instead of computing them for the | |
| entire batch at once. This approach involves iteratively calculating gradients in smaller batches by performing forward | |
| and backward passes through the model and accumulating the gradients during the process. Once a sufficient number of | |
| gradients have been accumulated, the model’s optimization step is executed. By employing gradient accumulation, it | |
| becomes possible to increase the <strong>effective batch size</strong> beyond the limitations imposed by the GPU’s memory capacity. | |
| However, it is important to note that the additional forward and backward passes introduced by gradient accumulation can | |
| slow down the training process.</p> <p data-svelte-h="svelte-ecils4">You can enable gradient accumulation by adding the <code>gradient_accumulation_steps</code> argument to <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.TrainingArguments">TrainingArguments</a>:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->training_args = TrainingArguments(per_device_train_batch_size=<span class="hljs-number">1</span>, gradient_accumulation_steps=<span class="hljs-number">4</span>, **default_args)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-18ky75y">In the above example, your effective batch size becomes 4.</p> <p data-svelte-h="svelte-n8w6ni">Alternatively, use 🤗 Accelerate to gain full control over the training loop. Find the 🤗 Accelerate example | |
| <a href="#using--accelerate">further down in this guide</a>.</p> <p data-svelte-h="svelte-1bitspm">While it is advised to max out GPU usage as much as possible, a high number of gradient accumulation steps can | |
| result in a more pronounced training slowdown. Consider the following example. Let’s say, the <code>per_device_train_batch_size=4</code> | |
| without gradient accumulation hits the GPU’s limit. If you would like to train with batches of size 64, do not set the | |
| <code>per_device_train_batch_size</code> to 1 and <code>gradient_accumulation_steps</code> to 64. Instead, keep <code>per_device_train_batch_size=4</code> | |
| and set <code>gradient_accumulation_steps=16</code>. This results in the same effective batch size while making better use of | |
| the available GPU resources.</p> <p data-svelte-h="svelte-nbfonf">For additional information, please refer to batch size and gradient accumulation benchmarks for <a href="https://github.com/huggingface/transformers/issues/14608#issuecomment-1004392537" rel="nofollow">RTX-3090</a> | |
| and <a href="https://github.com/huggingface/transformers/issues/15026#issuecomment-1005033957" rel="nofollow">A100</a>.</p> <h2 class="relative group"><a id="gradient-checkpointing" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#gradient-checkpointing"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Gradient Checkpointing</span></h2> <p data-svelte-h="svelte-n1qbw">Some large models may still face memory issues even when the batch size is set to 1 and gradient accumulation is used. | |
| This is because there are other components that also require memory storage.</p> <p data-svelte-h="svelte-1419dmv">Saving all activations from the forward pass in order to compute the gradients during the backward pass can result in | |
| significant memory overhead. The alternative approach of discarding the activations and recalculating them when needed | |
| during the backward pass, would introduce a considerable computational overhead and slow down the training process.</p> <p data-svelte-h="svelte-nfo3gu"><strong>Gradient checkpointing</strong> offers a compromise between these two approaches and saves strategically selected activations | |
| throughout the computational graph so only a fraction of the activations need to be re-computed for the gradients. For | |
| an in-depth explanation of gradient checkpointing, refer to <a href="https://medium.com/tensorflow/fitting-larger-networks-into-memory-583e3c758ff9" rel="nofollow">this great article</a>.</p> <p data-svelte-h="svelte-1o03gkd">To enable gradient checkpointing in the <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a>, pass the corresponding flag to <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.TrainingArguments">TrainingArguments</a>:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->training_args = TrainingArguments( | |
| per_device_train_batch_size=<span class="hljs-number">1</span>, gradient_accumulation_steps=<span class="hljs-number">4</span>, gradient_checkpointing=<span class="hljs-literal">True</span>, **default_args | |
| )<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1lfeqfy">Alternatively, use 🤗 Accelerate - find the 🤗 Accelerate example <a href="#using--accelerate">further in this guide</a>.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1rw5pua">While gradient checkpointing may improve memory efficiency, it slows training by approximately 20%.</p></div> <h2 class="relative group"><a id="mixed-precision-training" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#mixed-precision-training"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Mixed precision training</span></h2> <p data-svelte-h="svelte-7pqnv0"><strong>Mixed precision training</strong> is a technique that aims to optimize the computational efficiency of training models by | |
| utilizing lower-precision numerical formats for certain variables. Traditionally, most models use 32-bit floating point | |
| precision (fp32 or float32) to represent and process variables. However, not all variables require this high precision | |
| level to achieve accurate results. By reducing the precision of certain variables to lower numerical formats like 16-bit | |
| floating point (fp16 or float16), we can speed up the computations. Because in this approach some computations are performed | |
| in half-precision, while some are still in full precision, the approach is called mixed precision training.</p> <p data-svelte-h="svelte-1ttanpy">Most commonly mixed precision training is achieved by using fp16 (float16) data types, however, some GPU architectures | |
| (such as the Ampere architecture) offer bf16 and tf32 (CUDA internal data type) data types. Check | |
| out the <a href="https://developer.nvidia.com/blog/accelerating-ai-training-with-tf32-tensor-cores/" rel="nofollow">NVIDIA Blog</a> to learn more about | |
| the differences between these data types.</p> <h3 class="relative group"><a id="fp16" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#fp16"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>fp16</span></h3> <p data-svelte-h="svelte-h2t8go">The main advantage of mixed precision training comes from saving the activations in half precision (fp16). | |
| Although the gradients are also computed in half precision they are converted back to full precision for the optimization | |
| step so no memory is saved here. | |
| While mixed precision training results in faster computations, it can also lead to more GPU memory being utilized, especially for small batch sizes. | |
| This is because the model is now present on the GPU in both 16-bit and 32-bit precision (1.5x the original model on the GPU).</p> <p data-svelte-h="svelte-wwg09w">To enable mixed precision training, set the <code>fp16</code> flag to <code>True</code>:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->training_args = TrainingArguments(per_device_train_batch_size=<span class="hljs-number">4</span>, fp16=<span class="hljs-literal">True</span>, **default_args)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1flavde">If you prefer to use 🤗 Accelerate, find the 🤗 Accelerate example <a href="#using--accelerate">further in this guide</a>.</p> <h3 class="relative group"><a id="bf16" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 
with-hover:group-hover:opacity-100 with-hover:right-full" href="#bf16"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>BF16</span></h3> <p data-svelte-h="svelte-1itatvv">If you have access to an Ampere or newer hardware you can use bf16 for mixed precision training and evaluation. While | |
| bf16 has a worse precision than fp16, it has a much bigger dynamic range. In fp16 the biggest number you can have | |
| is <code>65504</code> and any number above that will result in an overflow. A bf16 number can be as large as <code>3.39e+38</code> (!) which | |
| is about the same as fp32 - because both have 8-bits used for the numerical range.</p> <p data-svelte-h="svelte-vq639r">You can enable BF16 in the 🤗 Trainer with:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->training_args = TrainingArguments(bf16=<span class="hljs-literal">True</span>, **default_args)<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="tf32" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#tf32"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 
67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>TF32</span></h3> <p data-svelte-h="svelte-7szee7">The Ampere hardware uses a magical data type called tf32. It has the same numerical range as fp32 (8-bits), but instead | |
of 23 bits of precision it has only 10 bits (same as fp16) and uses only 19 bits in total. It’s “magical” in the sense that
| you can use the normal fp32 training and/or inference code and by enabling tf32 support you can get up to 3x throughput | |
| improvement. All you need to do is to add the following to your code:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch | |
| torch.backends.cuda.matmul.allow_tf32 = <span class="hljs-literal">True</span> | |
| torch.backends.cudnn.allow_tf32 = <span class="hljs-literal">True</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1kvnj8w">CUDA will automatically switch to using tf32 instead of fp32 where possible, assuming that the used GPU is from the Ampere series.</p> <p data-svelte-h="svelte-alc3bu">According to <a href="https://developer.nvidia.com/blog/accelerating-ai-training-with-tf32-tensor-cores/" rel="nofollow">NVIDIA research</a>, the | |
| majority of machine learning training workloads show the same perplexity and convergence with tf32 training as with fp32. | |
| If you’re already using fp16 or bf16 mixed precision it may help with the throughput as well.</p> <p data-svelte-h="svelte-d0keh1">You can enable this mode in the 🤗 Trainer:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->TrainingArguments(tf32=<span class="hljs-literal">True</span>, **default_args)<!-- HTML_TAG_END --></pre></div> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-lagf4l">tf32 can’t be accessed directly via <code>tensor.to(dtype=torch.tf32)</code> because it is an internal CUDA data type. 
You need <code>torch>=1.7</code> to use tf32 data types.</p></div> <p data-svelte-h="svelte-dzrbtx">For additional information on tf32 vs other precisions, please refer to the following benchmarks: | |
| <a href="https://github.com/huggingface/transformers/issues/14608#issuecomment-1004390803" rel="nofollow">RTX-3090</a> and | |
| <a href="https://github.com/huggingface/transformers/issues/15026#issuecomment-1004543189" rel="nofollow">A100</a>.</p> <h2 class="relative group"><a id="flash-attention-2" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#flash-attention-2"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Flash Attention 2</span></h2> <p data-svelte-h="svelte-4gzx8m">You can speedup the training throughput by using Flash Attention 2 integration in transformers. 
Check out the appropriate section in the <a href="./perf_infer_gpu_one#Flash-Attention-2">single GPU section</a> to learn more about how to load a model with Flash Attention 2 modules.</p> <h2 class="relative group"><a id="optimizer-choice" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#optimizer-choice"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Optimizer choice</span></h2> <p data-svelte-h="svelte-15ts99c">The most common optimizer used to train transformer models is Adam or AdamW (Adam with weight decay). Adam achieves | |
| good convergence by storing the rolling average of the previous gradients; however, it adds an additional memory | |
| footprint of the order of the number of model parameters. To remedy this, you can use an alternative optimizer. | |
| For example if you have <a href="https://github.com/NVIDIA/apex" rel="nofollow">NVIDIA/apex</a> installed for NVIDIA GPUs, or <a href="https://github.com/ROCmSoftwarePlatform/apex" rel="nofollow">ROCmSoftwarePlatform/apex</a> for AMD GPUs, <code>adamw_apex_fused</code> will give you the | |
fastest training experience among all supported AdamW optimizers.</p> <p data-svelte-h="svelte-15c4jeh"><a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a> integrates a variety of optimizers that can be used out of the box: <code>adamw_hf</code>, <code>adamw_torch</code>, <code>adamw_torch_fused</code>,
| <code>adamw_apex_fused</code>, <code>adamw_anyprecision</code>, <code>adafactor</code>, or <code>adamw_bnb_8bit</code>. More optimizers can be plugged in via a third-party implementation.</p> <p data-svelte-h="svelte-1qfw41">Let’s take a closer look at two alternatives to AdamW optimizer:</p> <ol data-svelte-h="svelte-10f9wto"><li><code>adafactor</code> which is available in <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a></li> <li><code>adamw_bnb_8bit</code> is also available in Trainer, but a third-party integration is provided below for demonstration.</li></ol> <p data-svelte-h="svelte-mpchgx">For comparison, for a 3B-parameter model, like “google-t5/t5-3b”:</p> <ul data-svelte-h="svelte-ajuctj"><li>A standard AdamW optimizer will need 24GB of GPU memory because it uses 8 bytes for each parameter (8*3 => 24GB)</li> <li>Adafactor optimizer will need more than 12GB. It uses slightly more than 4 bytes for each parameter, so 4*3 and then some extra.</li> <li>8bit BNB quantized optimizer will use only (2*3) 6GB if all optimizer states are quantized.</li></ul> <h3 class="relative group"><a id="adafactor" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#adafactor"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 
56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Adafactor</span></h3> <p data-svelte-h="svelte-19jpav8">Adafactor doesn’t store rolling averages for each element in weight matrices. Instead, it keeps aggregated information | |
| (sums of rolling averages row- and column-wise), significantly reducing its footprint. However, compared to Adam, | |
| Adafactor may have slower convergence in certain cases.</p> <p data-svelte-h="svelte-dh41ec">You can switch to Adafactor by setting <code>optim="adafactor"</code> in <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.TrainingArguments">TrainingArguments</a>:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->training_args = TrainingArguments(per_device_train_batch_size=<span class="hljs-number">4</span>, optim=<span class="hljs-string">"adafactor"</span>, **default_args)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1850zf2">Combined with other approaches (gradient accumulation, gradient checkpointing, and mixed precision training) | |
you can notice up to 3x memory improvement while maintaining the throughput! However, as mentioned before, the convergence of
| Adafactor can be worse than Adam.</p> <h3 class="relative group"><a id="8-bit-adam" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#8-bit-adam"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>8-bit Adam</span></h3> <p data-svelte-h="svelte-1ld1y8t">Instead of aggregating optimizer states like Adafactor, 8-bit Adam keeps the full state and quantizes it. Quantization | |
| means that it stores the state with lower precision and dequantizes it only for the optimization. This is similar to the | |
| idea behind mixed precision training.</p> <p data-svelte-h="svelte-9trohq">To use <code>adamw_bnb_8bit</code>, you simply need to set <code>optim="adamw_bnb_8bit"</code> in <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.TrainingArguments">TrainingArguments</a>:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->training_args = TrainingArguments(per_device_train_batch_size=<span class="hljs-number">4</span>, optim=<span class="hljs-string">"adamw_bnb_8bit"</span>, **default_args)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-czmcu6">However, we can also use a third-party implementation of the 8-bit optimizer for demonstration purposes to see how that can be integrated.</p> <p data-svelte-h="svelte-cn7c3a">First, follow the installation guide in the GitHub <a 
href="https://github.com/bitsandbytes-foundation/bitsandbytes" rel="nofollow">repo</a> to install the <code>bitsandbytes</code> library | |
| that implements the 8-bit Adam optimizer.</p> <p data-svelte-h="svelte-19nyc4u">Next you need to initialize the optimizer. This involves two steps:</p> <ul data-svelte-h="svelte-8z8ok2"><li>First, group the model’s parameters into two groups - one where weight decay should be applied, and the other one where it should not. Usually, biases and layer norm parameters are not weight decayed.</li> <li>Then do some argument housekeeping to use the same parameters as the previously used AdamW optimizer.</li></ul> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> bitsandbytes <span class="hljs-keyword">as</span> bnb | |
| <span class="hljs-keyword">from</span> torch <span class="hljs-keyword">import</span> nn | |
| <span class="hljs-keyword">from</span> transformers.trainer_pt_utils <span class="hljs-keyword">import</span> get_parameter_names | |
| training_args = TrainingArguments(per_device_train_batch_size=<span class="hljs-number">4</span>, **default_args) | |
| decay_parameters = get_parameter_names(model, [nn.LayerNorm]) | |
| decay_parameters = [name <span class="hljs-keyword">for</span> name <span class="hljs-keyword">in</span> decay_parameters <span class="hljs-keyword">if</span> <span class="hljs-string">"bias"</span> <span class="hljs-keyword">not</span> <span class="hljs-keyword">in</span> name] | |
| optimizer_grouped_parameters = [ | |
| { | |
| <span class="hljs-string">"params"</span>: [p <span class="hljs-keyword">for</span> n, p <span class="hljs-keyword">in</span> model.named_parameters() <span class="hljs-keyword">if</span> n <span class="hljs-keyword">in</span> decay_parameters], | |
| <span class="hljs-string">"weight_decay"</span>: training_args.weight_decay, | |
| }, | |
| { | |
| <span class="hljs-string">"params"</span>: [p <span class="hljs-keyword">for</span> n, p <span class="hljs-keyword">in</span> model.named_parameters() <span class="hljs-keyword">if</span> n <span class="hljs-keyword">not</span> <span class="hljs-keyword">in</span> decay_parameters], | |
| <span class="hljs-string">"weight_decay"</span>: <span class="hljs-number">0.0</span>, | |
| }, | |
| ] | |
| optimizer_kwargs = { | |
| <span class="hljs-string">"betas"</span>: (training_args.adam_beta1, training_args.adam_beta2), | |
| <span class="hljs-string">"eps"</span>: training_args.adam_epsilon, | |
| } | |
| optimizer_kwargs[<span class="hljs-string">"lr"</span>] = training_args.learning_rate | |
| adam_bnb_optim = bnb.optim.Adam8bit( | |
| optimizer_grouped_parameters, | |
| betas=(training_args.adam_beta1, training_args.adam_beta2), | |
| eps=training_args.adam_epsilon, | |
| lr=training_args.learning_rate, | |
| )<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-n8bkwf">Finally, pass the custom optimizer as an argument to the <code>Trainer</code>:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->trainer = Trainer(model=model, args=training_args, train_dataset=ds, optimizers=(adam_bnb_optim, <span class="hljs-literal">None</span>))<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1egi1g7">Combined with other approaches (gradient accumulation, gradient checkpointing, and mixed precision training), | |
you can expect to get about a 3x memory improvement and even slightly higher throughput than with Adafactor.</p> <h3 class="relative group"><a id="multitensor" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#multitensor"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>multi_tensor</span></h3> <p data-svelte-h="svelte-1cvgqaq">pytorch-nightly introduced <code>torch.optim._multi_tensor</code> which should significantly speed up the optimizers for situations
| with lots of small feature tensors. It should eventually become the default, but if you want to experiment with it sooner, take a look at this GitHub <a href="https://github.com/huggingface/transformers/issues/9965" rel="nofollow">issue</a>.</p> <h2 class="relative group"><a id="data-preloading" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#data-preloading"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Data preloading</span></h2> <p data-svelte-h="svelte-16tninz">One of the important requirements to reach great training speed is the ability to feed the GPU at the maximum speed it | |
| can handle. By default, everything happens in the main process, and it might not be able to read the data from disk fast | |
| enough, and thus create a bottleneck, leading to GPU under-utilization. Configure the following arguments to reduce the bottleneck:</p> <ul data-svelte-h="svelte-1i85v8g"><li><code>DataLoader(pin_memory=True, ...)</code> - ensures the data gets preloaded into the pinned memory on CPU and typically leads to much faster transfers from CPU to GPU memory.</li> <li><code>DataLoader(num_workers=4, ...)</code> - spawn several workers to preload data faster. During training, watch the GPU utilization stats; if it’s far from 100%, experiment with increasing the number of workers. Of course, the problem could be elsewhere, so many workers won’t necessarily lead to better performance.</li></ul> <p data-svelte-h="svelte-ghk363">When using <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a>, the corresponding <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.TrainingArguments">TrainingArguments</a> are: <code>dataloader_pin_memory</code> (<code>True</code> by default), and <code>dataloader_num_workers</code> (defaults to <code>0</code>).</p> <h2 class="relative group"><a id="deepspeed-zero" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#deepspeed-zero"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 
56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>DeepSpeed ZeRO</span></h2> <p data-svelte-h="svelte-1onp6io">DeepSpeed is an open-source deep learning optimization library that is integrated with 🤗 Transformers and 🤗 Accelerate. | |
| It provides a wide range of features and optimizations designed to improve the efficiency and scalability of large-scale | |
| deep learning training.</p> <p data-svelte-h="svelte-18hg7g">If your model fits onto a single GPU and you have enough space to fit a small batch size, you don’t need to use DeepSpeed | |
| as it’ll only slow things down. However, if the model doesn’t fit onto a single GPU or you can’t fit a small batch, you can | |
| leverage DeepSpeed ZeRO + CPU Offload, or NVMe Offload for much larger models. In this case, you need to separately | |
| <a href="main_classes/deepspeed#installation">install the library</a>, then follow one of the guides to create a configuration file | |
| and launch DeepSpeed:</p> <ul data-svelte-h="svelte-j49w9v"><li>For an in-depth guide on DeepSpeed integration with <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a>, review <a href="main_classes/deepspeed">the corresponding documentation</a>, specifically the | |
| <a href="main_classes/deepspeed#deployment-with-one-gpu">section for a single GPU</a>. Some adjustments are required to use DeepSpeed in a notebook; please take a look at the <a href="main_classes/deepspeed#deployment-in-notebooks">corresponding guide</a>.</li> <li>If you prefer to use 🤗 Accelerate, refer to <a href="https://huggingface.co/docs/accelerate/en/usage_guides/deepspeed" rel="nofollow">🤗 Accelerate DeepSpeed guide</a>.</li></ul> <h2 class="relative group"><a id="using-torchcompile" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#using-torchcompile"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Using torch.compile</span></h2> <p data-svelte-h="svelte-5qw0v5">PyTorch 2.0 introduced a new compile function that doesn’t require any modification to existing PyTorch code but can | |
optimize your code by adding a single line of code: <code>model = torch.compile(model)</code>.</p> <p data-svelte-h="svelte-a2bbh0">If using <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a>, you only need to pass the <code>torch_compile</code> option in the <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.TrainingArguments">TrainingArguments</a>:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->training_args = TrainingArguments(torch_compile=<span class="hljs-literal">True</span>, **default_args)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1y4s25e"><code>torch.compile</code> uses Python’s frame evaluation API to automatically create a graph from existing PyTorch programs. After
| capturing the graph, different backends can be deployed to lower the graph to an optimized engine. | |
| You can find more details and benchmarks in <a href="https://pytorch.org/get-started/pytorch-2.0/" rel="nofollow">PyTorch documentation</a>.</p> <p data-svelte-h="svelte-jluuok"><code>torch.compile</code> has a growing list of backends, which can be found by calling <code>torchdynamo.list_backends()</code>, each with its own optional dependencies.</p> <p data-svelte-h="svelte-12ef0m3">Choose which backend to use by specifying it via <code>torch_compile_backend</code> in the <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.TrainingArguments">TrainingArguments</a>. Some of the most commonly used backends are:</p> <p data-svelte-h="svelte-v82kng"><strong>Debugging backends</strong>:</p> <ul data-svelte-h="svelte-1dbgjs7"><li><code>dynamo.optimize("eager")</code> - Uses PyTorch to run the extracted GraphModule. This is quite useful in debugging TorchDynamo issues.</li> <li><code>dynamo.optimize("aot_eager")</code> - Uses AotAutograd with no compiler, i.e., just using PyTorch eager for the AotAutograd’s extracted forward and backward graphs. This is useful for debugging, and unlikely to give speedups.</li></ul> <p data-svelte-h="svelte-m2qf5f"><strong>Training & inference backends</strong>:</p> <ul data-svelte-h="svelte-x9b2wm"><li><code>dynamo.optimize("inductor")</code> - Uses TorchInductor backend with AotAutograd and cudagraphs by leveraging codegened Triton kernels <a href="https://dev-discuss.pytorch.org/t/torchinductor-a-pytorch-native-compiler-with-define-by-run-ir-and-symbolic-shapes/747" rel="nofollow">Read more</a></li> <li><code>dynamo.optimize("nvfuser")</code> - nvFuser with TorchScript. <a href="https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593" rel="nofollow">Read more</a></li> <li><code>dynamo.optimize("aot_nvfuser")</code> - nvFuser with AotAutograd. 
<a href="https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593" rel="nofollow">Read more</a></li> <li><code>dynamo.optimize("aot_cudagraphs")</code> - cudagraphs with AotAutograd. <a href="https://github.com/pytorch/torchdynamo/pull/757" rel="nofollow">Read more</a></li></ul> <p data-svelte-h="svelte-78l3h4"><strong>Inference-only backend</strong>s:</p> <ul data-svelte-h="svelte-1aisp13"><li><code>dynamo.optimize("ofi")</code> - Uses TorchScript optimize_for_inference. <a href="https://pytorch.org/docs/stable/generated/torch.jit.optimize_for_inference.html" rel="nofollow">Read more</a></li> <li><code>dynamo.optimize("fx2trt")</code> - Uses NVIDIA TensorRT for inference optimizations. <a href="https://pytorch.org/TensorRT/tutorials/getting_started_with_fx_path.html" rel="nofollow">Read more</a></li> <li><code>dynamo.optimize("onnxrt")</code> - Uses ONNXRT for inference on CPU/GPU. <a href="https://onnxruntime.ai/" rel="nofollow">Read more</a></li> <li><code>dynamo.optimize("ipex")</code> - Uses IPEX for inference on CPU. 
<a href="https://github.com/intel/intel-extension-for-pytorch" rel="nofollow">Read more</a></li></ul> <p data-svelte-h="svelte-1prqm7s">For an example of using <code>torch.compile</code> with 🤗 Transformers, check out this <a href="https://www.philschmid.de/getting-started-pytorch-2-0-transformers" rel="nofollow">blog post on fine-tuning a BERT model for Text Classification using the newest PyTorch 2.0 features</a></p> <h2 class="relative group"><a id="using--peft" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#using--peft"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Using 🤗 PEFT</span></h2> <p data-svelte-h="svelte-8pfmhp"><a href="https://huggingface.co/blog/peft" rel="nofollow">Parameter-Efficient Fine Tuning (PEFT)</a> methods freeze the pretrained model parameters during fine-tuning and add a small number of trainable parameters (the adapters) on top of it.</p> <p data-svelte-h="svelte-vtpr6t">As a result the <a href="https://huggingface.co/docs/transformers/model_memory_anatomy#anatomy-of-models-memory" rel="nofollow">memory associated to the optimizer states and gradients</a> are greatly reduced.</p> <p 
data-svelte-h="svelte-10pzmck">For example with a vanilla AdamW, the memory requirement for the optimizer state would be:</p> <ul data-svelte-h="svelte-1c6ggig"><li>fp32 copy of parameters: 4 bytes/param</li> <li>Momentum: 4 bytes/param</li> <li>Variance: 4 bytes/param</li></ul> <p data-svelte-h="svelte-o73t8f">Suppose a model with 7B parameters and 200 million parameters injected with <a href="https://huggingface.co/docs/peft/conceptual_guides/lora" rel="nofollow">Low Rank Adapters</a>.</p> <p data-svelte-h="svelte-13ydqxp">The memory requirement for the optimizer state of the plain model would be 12 * 7 = 84 GB (assuming 7B trainable parameters).</p> <p data-svelte-h="svelte-16ojy37">Adding Lora increases slightly the memory associated to the model weights and substantially decreases memory requirement for the optimizer state to 12 * 0.2 = 2.4GB.</p> <p data-svelte-h="svelte-1bqs6c8">Read more about PEFT and its detailed usage in <a href="https://huggingface.co/docs/peft/" rel="nofollow">the PEFT documentation</a> or <a href="https://github.com/huggingface/peft" rel="nofollow">PEFT repository</a>.</p> <h2 class="relative group"><a id="using--accelerate" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#using--accelerate"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 
56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Using 🤗 Accelerate</span></h2> <p data-svelte-h="svelte-d91a9c">With <a href="https://huggingface.co/docs/accelerate/index" rel="nofollow">🤗 Accelerate</a> you can use the above methods while gaining full | |
| control over the training loop and can essentially write the loop in pure PyTorch with some minor modifications.</p> <p data-svelte-h="svelte-z796ng">Suppose you have combined the methods in the <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.TrainingArguments">TrainingArguments</a> like so:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->training_args = TrainingArguments( | |
| per_device_train_batch_size=<span class="hljs-number">1</span>, | |
| gradient_accumulation_steps=<span class="hljs-number">4</span>, | |
| gradient_checkpointing=<span class="hljs-literal">True</span>, | |
| fp16=<span class="hljs-literal">True</span>, | |
| **default_args, | |
| )<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-eja69v">The full example training loop with 🤗 Accelerate is only a handful of lines of code long:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> accelerate <span class="hljs-keyword">import</span> Accelerator | |
| <span class="hljs-keyword">from</span> torch.utils.data.dataloader <span class="hljs-keyword">import</span> DataLoader | |
| dataloader = DataLoader(ds, batch_size=training_args.per_device_train_batch_size) | |
| <span class="hljs-keyword">if</span> training_args.gradient_checkpointing: | |
| model.gradient_checkpointing_enable() | |
| accelerator = Accelerator(fp16=training_args.fp16) | |
| model, optimizer, dataloader = accelerator.prepare(model, adam_bnb_optim, dataloader) | |
| model.train() | |
| <span class="hljs-keyword">for</span> step, batch <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(dataloader, start=<span class="hljs-number">1</span>): | |
| loss = model(**batch).loss | |
| loss = loss / training_args.gradient_accumulation_steps | |
| accelerator.backward(loss) | |
| <span class="hljs-keyword">if</span> step % training_args.gradient_accumulation_steps == <span class="hljs-number">0</span>: | |
| optimizer.step() | |
| optimizer.zero_grad()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1wtx2y4">First we wrap the dataset in a <a href="https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader" rel="nofollow"><code>DataLoader</code></a>. | |
| Then we can enable gradient checkpointing by calling the model’s <a href="/docs/transformers/pr_33913/en/main_classes/model#transformers.PreTrainedModel.gradient_checkpointing_enable">gradient_checkpointing_enable()</a> method. | |
| When we initialize the <a href="https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator" rel="nofollow"><code>Accelerator</code></a> | |
| we can specify if we want to use mixed precision training and it will take care of it for us in the <code>prepare</code> call. | |
| During the <a href="https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.prepare" rel="nofollow"><code>prepare</code></a> | |
| call the dataloader will also be distributed across workers should we use multiple GPUs. We use the same <a href="#8-bit-adam">8-bit optimizer</a> from the earlier example.</p> <p data-svelte-h="svelte-i5733s">Finally, we can add the main training loop. Note that the <code>backward</code> call is handled by 🤗 Accelerate. We can also see | |
| how gradient accumulation works: we normalize the loss, so we get the average at the end of accumulation and once we have | |
| enough steps we run the optimization.</p> <p data-svelte-h="svelte-1caz98g">Implementing these optimization techniques with 🤗 Accelerate only takes a handful of lines of code and comes with the | |
| benefit of more flexibility in the training loop. For a full documentation of all features have a look at the | |
| <a href="https://huggingface.co/docs/accelerate/index" rel="nofollow">Accelerate documentation</a>.</p> <h2 class="relative group"><a id="efficient-software-prebuilds" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#efficient-software-prebuilds"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Efficient Software Prebuilds</span></h2> <p data-svelte-h="svelte-nmhpgl">PyTorch’s <a href="https://pytorch.org/get-started/locally/#start-locally" rel="nofollow">pip and conda builds</a> come prebuilt with the cuda toolkit | |
| which is enough to run PyTorch, but it is insufficient if you need to build cuda extensions.</p> <p data-svelte-h="svelte-kxuhyz">At times, additional efforts may be required to pre-build some components. For instance, if you’re using libraries like <code>apex</code> that | |
| don’t come pre-compiled. In other situations figuring out how to install the right cuda toolkit system-wide can be complicated. | |
| To address these scenarios PyTorch and NVIDIA released a new version of NGC docker container which already comes with | |
| everything prebuilt. You just need to install your programs on it, and it will run out of the box.</p> <p data-svelte-h="svelte-qkuwkx">This approach is also useful if you want to tweak the pytorch source and/or make a new customized build. | |
| To find the docker image version you want, start <a href="https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/" rel="nofollow">with PyTorch release notes</a>, | |
| choose one of the latest monthly releases. Go into the release’s notes for the desired release, check that the environment’s | |
| components are matching your needs (including NVIDIA Driver requirements!) and then at the very top of that document go | |
| to the corresponding NGC page. If for some reason you get lost, here is <a href="https://ngc.nvidia.com/catalog/containers/nvidia:pytorch" rel="nofollow">the index of all PyTorch NGC images</a>.</p> <p data-svelte-h="svelte-16g8p4x">Next follow the instructions to download and deploy the docker image.</p> <h2 class="relative group"><a id="mixture-of-experts" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#mixture-of-experts"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Mixture of Experts</span></h2> <p data-svelte-h="svelte-1aawf57">Some recent papers reported a 4-5x training speedup and a faster inference by integrating | |
| Mixture of Experts (MoE) into the Transformer models.</p> <p data-svelte-h="svelte-ln9ugc">Since it has been discovered that more parameters lead to better performance, this technique allows increasing the | |
| number of parameters by an order of magnitude without increasing training costs.</p> <p data-svelte-h="svelte-1ett6yb">In this approach every other FFN layer is replaced with a MoE Layer which consists of many experts, with a gated function | |
| that trains each expert in a balanced way depending on the input token’s position in a sequence.</p> <p data-svelte-h="svelte-tdgts9"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/perf-moe-transformer.png" alt="MoE Transformer 2x block"></p> <p data-svelte-h="svelte-z2dori">(source: <a href="https://ai.googleblog.com/2021/12/more-efficient-in-context-learning-with.html" rel="nofollow">GLAM</a>)</p> <p data-svelte-h="svelte-1gc8znk">You can find exhaustive details and comparison tables in the papers listed at the end of this section.</p> <p data-svelte-h="svelte-13sa7m0">The main drawback of this approach is that it requires staggering amounts of GPU memory - almost an order of magnitude | |
| larger than its dense equivalent. Various distillation and other approaches have been proposed to overcome the much higher memory requirements.</p> <p data-svelte-h="svelte-dxh8d5">There is a direct trade-off though: you can use just a few experts with a 2-3x smaller base model instead of dozens or | |
| hundreds of experts, leading to a 5x smaller model and thus increase the training speed moderately while increasing the | |
| memory requirements moderately as well.</p> <p data-svelte-h="svelte-1jfhyzg">Most related papers and implementations are built around Tensorflow/TPUs:</p> <ul data-svelte-h="svelte-1fjc55v"><li><a href="https://arxiv.org/abs/2006.16668" rel="nofollow">GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding</a></li> <li><a href="https://arxiv.org/abs/2101.03961" rel="nofollow">Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity</a></li> <li><a href="https://ai.googleblog.com/2021/12/more-efficient-in-context-learning-with.html" rel="nofollow">GLaM: Generalist Language Model (GLaM)</a></li></ul> <p data-svelte-h="svelte-1nxcmqe">And for Pytorch DeepSpeed has built one as well: <a href="https://arxiv.org/abs/2201.05596" rel="nofollow">DeepSpeed-MoE: Advancing Mixture-of-Experts Inference and Training to Power Next-Generation AI Scale</a>, <a href="https://www.deepspeed.ai/tutorials/mixture-of-experts/" rel="nofollow">Mixture of Experts</a> - blog posts: <a href="https://www.microsoft.com/en-us/research/blog/deepspeed-powers-8x-larger-moe-model-training-with-high-performance/" rel="nofollow">1</a>, <a href="https://www.microsoft.com/en-us/research/publication/scalable-and-efficient-moe-training-for-multitask-multilingual-models/" rel="nofollow">2</a> and specific deployment with large transformer-based natural language generation models: <a href="https://www.deepspeed.ai/2021/12/09/deepspeed-moe-nlg.html" rel="nofollow">blog post</a>, <a href="https://github.com/microsoft/Megatron-DeepSpeed/tree/moe-training" rel="nofollow">Megatron-Deepspeed branch</a>.</p> <h2 class="relative group"><a id="using-pytorch-native-attention-and-flash-attention" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#using-pytorch-native-attention-and-flash-attention"><span><svg class="" 
xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Using PyTorch native attention and Flash Attention</span></h2> <p data-svelte-h="svelte-1yxcg0k">PyTorch’s <a href="https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html" rel="nofollow"><code>torch.nn.functional.scaled_dot_product_attention</code></a> (SDPA) can also call FlashAttention and memory-efficient attention kernels under the hood. SDPA support is currently being added natively in Transformers and is used by default for <code>torch>=2.1.1</code> when an implementation is available. 
Please refer to <a href="https://huggingface.co/docs/transformers/perf_infer_gpu_one#pytorch-scaled-dot-product-attention" rel="nofollow">PyTorch scaled dot product attention</a> for a list of supported models and more details.</p> <p data-svelte-h="svelte-1r3sioy">Check out this <a href="https://pytorch.org/blog/out-of-the-box-acceleration/" rel="nofollow">blogpost</a> to learn more about acceleration and memory-savings with SDPA.</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/transformers/blob/main/docs/source/en/perf_train_gpu_one.md" target="_blank"><span data-svelte-h="svelte-1kd6by1"><</span> <span data-svelte-h="svelte-x0xyl0">></span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p> | |
| <script> | |
| { | |
| __sveltekit_z647wz = { | |
| assets: "/docs/transformers/pr_33913/en", | |
| base: "/docs/transformers/pr_33913/en", | |
| env: {} | |
| }; | |
| const element = document.currentScript.parentElement; | |
| const data = [null,null]; | |
| Promise.all([ | |
| import("/docs/transformers/pr_33913/en/_app/immutable/entry/start.b67f883f.js"), | |
| import("/docs/transformers/pr_33913/en/_app/immutable/entry/app.e436b1f2.js") | |
| ]).then(([kit, app]) => { | |
| kit.start(app, element, { | |
| node_ids: [0, 386], | |
| data, | |
| form: null, | |
| error: null | |
| }); | |
| }); | |
| } | |
| </script> | |
Xet Storage Details
- Size:
- 91.4 kB
- Xet hash:
- ab79a09a9e0c426c947d324374f397fe6cf19f9f1f4761b9a75ac55aae566f8e
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.