Buckets:
| <meta charset="utf-8" /><meta name="hf:doc:metadata" content="{"title":"Methods and tools for efficient training on a single GPU","local":"methods-and-tools-for-efficient-training-on-a-single-gpu","sections":[{"title":"Batch size choice","local":"batch-size-choice","sections":[],"depth":2},{"title":"Gradient Accumulation","local":"gradient-accumulation","sections":[],"depth":2},{"title":"Gradient Checkpointing","local":"gradient-checkpointing","sections":[],"depth":2},{"title":"Mixed precision training","local":"mixed-precision-training","sections":[{"title":"fp16","local":"fp16","sections":[],"depth":3},{"title":"BF16","local":"bf16","sections":[],"depth":3},{"title":"TF32","local":"tf32","sections":[],"depth":3}],"depth":2},{"title":"Flash Attention 2","local":"flash-attention-2","sections":[],"depth":2},{"title":"Optimizer choice","local":"optimizer-choice","sections":[{"title":"Adafactor","local":"adafactor","sections":[],"depth":3},{"title":"8-bit Adam","local":"8-bit-adam","sections":[],"depth":3},{"title":"multi_tensor","local":"multitensor","sections":[],"depth":3}],"depth":2},{"title":"Data preloading","local":"data-preloading","sections":[],"depth":2},{"title":"DeepSpeed ZeRO","local":"deepspeed-zero","sections":[],"depth":2},{"title":"Using torch.compile","local":"using-torchcompile","sections":[],"depth":2},{"title":"Using 🤗 PEFT","local":"using--peft","sections":[],"depth":2},{"title":"Using 🤗 Accelerate","local":"using--accelerate","sections":[],"depth":2},{"title":"Efficient Software Prebuilds","local":"efficient-software-prebuilds","sections":[],"depth":2},{"title":"Mixture of Experts","local":"mixture-of-experts","sections":[],"depth":2},{"title":"Using PyTorch native attention and Flash Attention","local":"using-pytorch-native-attention-and-flash-attention","sections":[],"depth":2}],"depth":1}"> | |
| <link href="/docs/transformers/pr_33913/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload"> | |
| <link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/entry/start.b67f883f.js"> | |
| <link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/scheduler.25b97de1.js"> | |
| <link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/singletons.62a184e0.js"> | |
| <link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/index.e188933d.js"> | |
| <link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/paths.51881b9e.js"> | |
| <link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/entry/app.e436b1f2.js"> | |
| <link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/index.d9030fc9.js"> | |
| <link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/nodes/0.05e395f5.js"> | |
| <link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/each.e59479a4.js"> | |
| <link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/nodes/386.88c43315.js"> | |
| <link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/Tip.baa67368.js"> | |
| <link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/CodeBlock.e6cd0d95.js"> | |
| <link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/EditOnGithub.91d95064.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Methods and tools for efficient training on a single GPU","local":"methods-and-tools-for-efficient-training-on-a-single-gpu","sections":[{"title":"Batch size choice","local":"batch-size-choice","sections":[],"depth":2},{"title":"Gradient Accumulation","local":"gradient-accumulation","sections":[],"depth":2},{"title":"Gradient Checkpointing","local":"gradient-checkpointing","sections":[],"depth":2},{"title":"Mixed precision training","local":"mixed-precision-training","sections":[{"title":"fp16","local":"fp16","sections":[],"depth":3},{"title":"BF16","local":"bf16","sections":[],"depth":3},{"title":"TF32","local":"tf32","sections":[],"depth":3}],"depth":2},{"title":"Flash Attention 2","local":"flash-attention-2","sections":[],"depth":2},{"title":"Optimizer choice","local":"optimizer-choice","sections":[{"title":"Adafactor","local":"adafactor","sections":[],"depth":3},{"title":"8-bit Adam","local":"8-bit-adam","sections":[],"depth":3},{"title":"multi_tensor","local":"multitensor","sections":[],"depth":3}],"depth":2},{"title":"Data preloading","local":"data-preloading","sections":[],"depth":2},{"title":"DeepSpeed ZeRO","local":"deepspeed-zero","sections":[],"depth":2},{"title":"Using torch.compile","local":"using-torchcompile","sections":[],"depth":2},{"title":"Using 🤗 PEFT","local":"using--peft","sections":[],"depth":2},{"title":"Using 🤗 Accelerate","local":"using--accelerate","sections":[],"depth":2},{"title":"Efficient Software Prebuilds","local":"efficient-software-prebuilds","sections":[],"depth":2},{"title":"Mixture of Experts","local":"mixture-of-experts","sections":[],"depth":2},{"title":"Using PyTorch native attention and Flash Attention","local":"using-pytorch-native-attention-and-flash-attention","sections":[],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> 
<p></p> <h1 class="relative group"><a id="methods-and-tools-for-efficient-training-on-a-single-gpu" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#methods-and-tools-for-efficient-training-on-a-single-gpu"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Methods and tools for efficient training on a single GPU</span></h1> <p data-svelte-h="svelte-tqdu0u">This guide demonstrates practical techniques that you can use to increase the efficiency of your model’s training by | |
| optimizing memory utilization, speeding up the training, or both. If you’d like to understand how the GPU is utilized during | |
| training, please refer to the <a href="model_memory_anatomy">Model training anatomy</a> conceptual guide first. This guide | |
| focuses on practical techniques.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-198lgun">If you have access to a machine with multiple GPUs, these approaches are still valid, plus you can leverage additional methods outlined in the <a href="perf_train_gpu_many">multi-GPU section</a>.</p></div> <p data-svelte-h="svelte-4s75i8">When training large models, there are two aspects that should be considered at the same time:</p> <ul data-svelte-h="svelte-djgcn1"><li>Data throughput/training time</li> <li>Model performance</li></ul> <p data-svelte-h="svelte-uf18r4">Maximizing the throughput (samples/second) leads to lower training cost. This is generally achieved by utilizing the GPU | |
| as much as possible and thus filling GPU memory to its limit. If the desired batch size exceeds the limits of the GPU memory, | |
| memory optimization techniques, such as gradient accumulation, can help.</p> <p data-svelte-h="svelte-b3xmwb">However, if the preferred batch size fits into memory, there’s no reason to apply memory-optimizing techniques because they can | |
| slow down the training. Just because one can use a large batch size, does not necessarily mean they should. As part of | |
| hyperparameter tuning, you should determine which batch size yields the best results and then optimize resources accordingly.</p> <p data-svelte-h="svelte-1qbc4zp">The methods and tools covered in this guide can be classified based on the effect they have on the training process:</p> <table data-svelte-h="svelte-chf3xh"><thead><tr><th align="left">Method/tool</th> <th align="left">Improves training speed</th> <th align="left">Optimizes memory utilization</th></tr></thead> <tbody><tr><td align="left"><a href="#batch-size-choice">Batch size choice</a></td> <td align="left">Yes</td> <td align="left">Yes</td></tr> <tr><td align="left"><a href="#gradient-accumulation">Gradient accumulation</a></td> <td align="left">No</td> <td align="left">Yes</td></tr> <tr><td align="left"><a href="#gradient-checkpointing">Gradient checkpointing</a></td> <td align="left">No</td> <td align="left">Yes</td></tr> <tr><td align="left"><a href="#mixed-precision-training">Mixed precision training</a></td> <td align="left">Yes</td> <td align="left">Maybe*</td></tr> <tr><td align="left"><a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.torch_empty_cache_steps" rel="nofollow">torch_empty_cache_steps</a></td> <td align="left">No</td> <td align="left">Yes</td></tr> <tr><td align="left"><a href="#optimizer-choice">Optimizer choice</a></td> <td align="left">Yes</td> <td align="left">Yes</td></tr> <tr><td align="left"><a href="#data-preloading">Data preloading</a></td> <td align="left">Yes</td> <td align="left">No</td></tr> <tr><td align="left"><a href="#deepspeed-zero">DeepSpeed Zero</a></td> <td align="left">No</td> <td align="left">Yes</td></tr> <tr><td align="left"><a href="#using-torchcompile">torch.compile</a></td> <td align="left">Yes</td> <td align="left">No</td></tr> <tr><td align="left"><a href="#using--peft">Parameter-Efficient Fine Tuning (PEFT)</a></td> <td align="left">No</td> <td align="left">Yes</td></tr></tbody></table> 
<div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-fh3gpe">*Note: when using mixed precision with a small model and a large batch size, there will be some memory savings but with a | |
| large model and a small batch size, the memory use will be larger.</p></div> <p data-svelte-h="svelte-ilb56e">You can combine the above methods to get a cumulative effect. These techniques are available to you whether you are | |
| training your model with <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a> or writing a pure PyTorch loop, in which case you can <a href="#using--accelerate">configure these optimizations | |
| with 🤗 Accelerate</a>.</p> <p data-svelte-h="svelte-1thjxg4">If these methods do not result in sufficient gains, you can explore the following options:</p> <ul data-svelte-h="svelte-lxno45"><li><a href="#efficient-software-prebuilds">Look into building your own custom Docker container with efficient software prebuilds</a></li> <li><a href="#mixture-of-experts">Consider a model that uses Mixture of Experts (MoE)</a></li> <li><a href="#using-pytorch-native-attention-and-flash-attention">Convert your model to BetterTransformer to leverage PyTorch native attention</a></li></ul> <p data-svelte-h="svelte-1uvbmrd">Finally, if all of the above is still not enough, even after switching to a server-grade GPU like A100, consider moving | |
| to a multi-GPU setup. All these approaches are still valid in a multi-GPU setup, plus you can leverage additional parallelism | |
| techniques outlined in the <a href="perf_train_gpu_many">multi-GPU section</a>.</p> <h2 class="relative group"><a id="batch-size-choice" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#batch-size-choice"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Batch size choice</span></h2> <p data-svelte-h="svelte-1f9wkzb">To achieve optimal performance, start by identifying the appropriate batch size. It is recommended to use batch sizes and | |
| input/output neuron counts that are of size 2^N. Often it’s a multiple of 8, but it can be | |
| higher depending on the hardware being used and the model’s dtype.</p> <p data-svelte-h="svelte-h1ek0g">For reference, check out NVIDIA’s recommendation for <a href="https://docs.nvidia.com/deeplearning/performance/dl-performance-fully-connected/index.html#input-features" rel="nofollow">input/output neuron counts</a> and | |
| <a href="https://docs.nvidia.com/deeplearning/performance/dl-performance-fully-connected/index.html#batch-size" rel="nofollow">batch size</a> for | |
| fully connected layers (which are involved in GEMMs (General Matrix Multiplications)).</p> <p data-svelte-h="svelte-sfn5k1"><a href="https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc" rel="nofollow">Tensor Core Requirements</a> | |
| define the multiplier based on the dtype and the hardware. For instance, for fp16 data type a multiple of 8 is recommended, unless | |
| it’s an A100 GPU, in which case use multiples of 64.</p> <p data-svelte-h="svelte-wvp8kx">For parameters that are small, consider also <a href="https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#dim-quantization" rel="nofollow">Dimension Quantization Effects</a>. | |
| This is where tiling happens and the right multiplier can have a significant speedup.</p> <h2 class="relative group"><a id="gradient-accumulation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#gradient-accumulation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Gradient Accumulation</span></h2> <p data-svelte-h="svelte-1l7rfkq">The <strong>gradient accumulation</strong> method aims to calculate gradients in smaller increments instead of computing them for the | |
| entire batch at once. This approach involves iteratively calculating gradients in smaller batches by performing forward | |
| and backward passes through the model and accumulating the gradients during the process. Once a sufficient number of | |
| gradients have been accumulated, the model’s optimization step is executed. By employing gradient accumulation, it | |
| becomes possible to increase the <strong>effective batch size</strong> beyond the limitations imposed by the GPU’s memory capacity. | |
| However, it is important to note that the additional forward and backward passes introduced by gradient accumulation can | |
| slow down the training process.</p> <p data-svelte-h="svelte-ecils4">You can enable gradient accumulation by adding the <code>gradient_accumulation_steps</code> argument to <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.TrainingArguments">TrainingArguments</a>:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->training_args = TrainingArguments(per_device_train_batch_size=<span class="hljs-number">1</span>, gradient_accumulation_steps=<span class="hljs-number">4</span>, **default_args)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-18ky75y">In the above example, your effective batch size becomes 4.</p> <p data-svelte-h="svelte-n8w6ni">Alternatively, use 🤗 Accelerate to gain full control over the training loop. Find the 🤗 Accelerate example | |
| <a href="#using--accelerate">further down in this guide</a>.</p> <p data-svelte-h="svelte-1bitspm">While it is advised to max out GPU usage as much as possible, a high number of gradient accumulation steps can | |
| result in a more pronounced training slowdown. Consider the following example. Let’s say, the <code>per_device_train_batch_size=4</code> | |
| without gradient accumulation hits the GPU’s limit. If you would like to train with batches of size 64, do not set the | |
| <code>per_device_train_batch_size</code> to 1 and <code>gradient_accumulation_steps</code> to 64. Instead, keep <code>per_device_train_batch_size=4</code> | |
| and set <code>gradient_accumulation_steps=16</code>. This results in the same effective batch size while making better use of | |
| the available GPU resources.</p> <p data-svelte-h="svelte-nbfonf">For additional information, please refer to batch size and gradient accumulation benchmarks for <a href="https://github.com/huggingface/transformers/issues/14608#issuecomment-1004392537" rel="nofollow">RTX-3090</a> | |
| and <a href="https://github.com/huggingface/transformers/issues/15026#issuecomment-1005033957" rel="nofollow">A100</a>.</p> <h2 class="relative group"><a id="gradient-checkpointing" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#gradient-checkpointing"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Gradient Checkpointing</span></h2> <p data-svelte-h="svelte-n1qbw">Some large models may still face memory issues even when the batch size is set to 1 and gradient accumulation is used. | |
| This is because there are other components that also require memory storage.</p> <p data-svelte-h="svelte-1419dmv">Saving all activations from the forward pass in order to compute the gradients during the backward pass can result in | |
| significant memory overhead. The alternative approach of discarding the activations and recalculating them when needed | |
| during the backward pass, would introduce a considerable computational overhead and slow down the training process.</p> <p data-svelte-h="svelte-nfo3gu"><strong>Gradient checkpointing</strong> offers a compromise between these two approaches and saves strategically selected activations | |
| throughout the computational graph so only a fraction of the activations need to be re-computed for the gradients. For | |
| an in-depth explanation of gradient checkpointing, refer to <a href="https://medium.com/tensorflow/fitting-larger-networks-into-memory-583e3c758ff9" rel="nofollow">this great article</a>.</p> <p data-svelte-h="svelte-1o03gkd">To enable gradient checkpointing in the <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a>, pass the corresponding flag to <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.TrainingArguments">TrainingArguments</a>:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->training_args = TrainingArguments( | |
| per_device_train_batch_size=<span class="hljs-number">1</span>, gradient_accumulation_steps=<span class="hljs-number">4</span>, gradient_checkpointing=<span class="hljs-literal">True</span>, **default_args | |
| )<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1lfeqfy">Alternatively, use 🤗 Accelerate - find the 🤗 Accelerate example <a href="#using--accelerate">further in this guide</a>.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1rw5pua">While gradient checkpointing may improve memory efficiency, it slows training by approximately 20%.</p></div> <h2 class="relative group"><a id="mixed-precision-training" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#mixed-precision-training"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Mixed precision training</span></h2> <p data-svelte-h="svelte-7pqnv0"><strong>Mixed precision training</strong> is a technique that aims to optimize the computational efficiency of training models by | |
| utilizing lower-precision numerical formats for certain variables. Traditionally, most models use 32-bit floating point | |
| precision (fp32 or float32) to represent and process variables. However, not all variables require this high precision | |
| level to achieve accurate results. By reducing the precision of certain variables to lower numerical formats like 16-bit | |
| floating point (fp16 or float16), we can speed up the computations. Because in this approach some computations are performed | |
| in half-precision, while some are still in full precision, the approach is called mixed precision training.</p> <p data-svelte-h="svelte-1ttanpy">Most commonly mixed precision training is achieved by using fp16 (float16) data types, however, some GPU architectures | |
| (such as the Ampere architecture) offer bf16 and tf32 (CUDA internal data type) data types. Check | |
| out the <a href="https://developer.nvidia.com/blog/accelerating-ai-training-with-tf32-tensor-cores/" rel="nofollow">NVIDIA Blog</a> to learn more about | |
| the differences between these data types.</p> <h3 class="relative group"><a id="fp16" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#fp16"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>fp16</span></h3> <p data-svelte-h="svelte-h2t8go">The main advantage of mixed precision training comes from saving the activations in half precision (fp16). | |
| Although the gradients are also computed in half precision they are converted back to full precision for the optimization | |
| step so no memory is saved here. | |
| While mixed precision training results in faster computations, it can also lead to more GPU memory being utilized, especially for small batch sizes. | |
| This is because the model is now present on the GPU in both 16-bit and 32-bit precision (1.5x the original model on the GPU).</p> <p data-svelte-h="svelte-wwg09w">To enable mixed precision training, set the <code>fp16</code> flag to <code>True</code>:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->training_args = TrainingArguments(per_device_train_batch_size=<span class="hljs-number">4</span>, fp16=<span class="hljs-literal">True</span>, **default_args)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1flavde">If you prefer to use 🤗 Accelerate, find the 🤗 Accelerate example <a href="#using--accelerate">further in this guide</a>.</p> <h3 class="relative group"><a id="bf16" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 
with-hover:group-hover:opacity-100 with-hover:right-full" href="#bf16"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>BF16</span></h3> <p data-svelte-h="svelte-1itatvv">If you have access to an Ampere or newer hardware you can use bf16 for mixed precision training and evaluation. While | |
| bf16 has a worse precision than fp16, it has a much bigger dynamic range. In fp16 the biggest number you can have | |
| is <code>65504</code> and any number above that will result in an overflow. A bf16 number can be as large as <code>3.39e+38</code> (!) which | |
| is about the same as fp32 - because both have 8-bits used for the numerical range.</p> <p data-svelte-h="svelte-vq639r">You can enable BF16 in the 🤗 Trainer with:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->training_args = TrainingArguments(bf16=<span class="hljs-literal">True</span>, **default_args)<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="tf32" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#tf32"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 
67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>TF32</span></h3> <p data-svelte-h="svelte-7szee7">The Ampere hardware uses a magical data type called tf32. It has the same numerical range as fp32 (8-bits), but instead | |
of 23 bits of precision it has only 10 bits (same as fp16) and uses only 19 bits in total. It’s “magical” in the sense that
| you can use the normal fp32 training and/or inference code and by enabling tf32 support you can get up to 3x throughput | |
| improvement. All you need to do is to add the following to your code:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch | |
| torch.backends.cuda.matmul.allow_tf32 = <span class="hljs-literal">True</span> | |
| torch.backends.cudnn.allow_tf32 = <span class="hljs-literal">True</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1kvnj8w">CUDA will automatically switch to using tf32 instead of fp32 where possible, assuming that the used GPU is from the Ampere series.</p> <p data-svelte-h="svelte-alc3bu">According to <a href="https://developer.nvidia.com/blog/accelerating-ai-training-with-tf32-tensor-cores/" rel="nofollow">NVIDIA research</a>, the | |
| majority of machine learning training workloads show the same perplexity and convergence with tf32 training as with fp32. | |
| If you’re already using fp16 or bf16 mixed precision it may help with the throughput as well.</p> <p data-svelte-h="svelte-d0keh1">You can enable this mode in the 🤗 Trainer:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->TrainingArguments(tf32=<span class="hljs-literal">True</span>, **default_args)<!-- HTML_TAG_END --></pre></div> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-lagf4l">tf32 can’t be accessed directly via <code>tensor.to(dtype=torch.tf32)</code> because it is an internal CUDA data type. 
You need <code>torch>=1.7</code> to use tf32 data types.</p></div> <p data-svelte-h="svelte-dzrbtx">For additional information on tf32 vs other precisions, please refer to the following benchmarks: | |
| <a href="https://github.com/huggingface/transformers/issues/14608#issuecomment-1004390803" rel="nofollow">RTX-3090</a> and | |
| <a href="https://github.com/huggingface/transformers/issues/15026#issuecomment-1004543189" rel="nofollow">A100</a>.</p> <h2 class="relative group"><a id="flash-attention-2" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#flash-attention-2"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Flash Attention 2</span></h2> <p data-svelte-h="svelte-4gzx8m">You can speedup the training throughput by using Flash Attention 2 integration in transformers. 
Check out the appropriate section in the <a href="./perf_infer_gpu_one#Flash-Attention-2">single GPU section</a> to learn more about how to load a model with Flash Attention 2 modules.</p> <h2 class="relative group"><a id="optimizer-choice" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#optimizer-choice"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Optimizer choice</span></h2> <p data-svelte-h="svelte-15ts99c">The most common optimizer used to train transformer models is Adam or AdamW (Adam with weight decay). Adam achieves | |
| good convergence by storing the rolling average of the previous gradients; however, it adds an additional memory | |
| footprint of the order of the number of model parameters. To remedy this, you can use an alternative optimizer. | |
| For example if you have <a href="https://github.com/NVIDIA/apex" rel="nofollow">NVIDIA/apex</a> installed for NVIDIA GPUs, or <a href="https://github.com/ROCmSoftwarePlatform/apex" rel="nofollow">ROCmSoftwarePlatform/apex</a> for AMD GPUs, <code>adamw_apex_fused</code> will give you the | |
fastest training experience among all supported AdamW optimizers.</p> <p data-svelte-h="svelte-15c4jeh"><a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a> integrates a variety of optimizers that can be used out of the box: <code>adamw_hf</code>, <code>adamw_torch</code>, <code>adamw_torch_fused</code>,
| <code>adamw_apex_fused</code>, <code>adamw_anyprecision</code>, <code>adafactor</code>, or <code>adamw_bnb_8bit</code>. More optimizers can be plugged in via a third-party implementation.</p> <p data-svelte-h="svelte-1qfw41">Let’s take a closer look at two alternatives to AdamW optimizer:</p> <ol data-svelte-h="svelte-10f9wto"><li><code>adafactor</code> which is available in <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a></li> <li><code>adamw_bnb_8bit</code> is also available in Trainer, but a third-party integration is provided below for demonstration.</li></ol> <p data-svelte-h="svelte-mpchgx">For comparison, for a 3B-parameter model, like “google-t5/t5-3b”:</p> <ul data-svelte-h="svelte-ajuctj"><li>A standard AdamW optimizer will need 24GB of GPU memory because it uses 8 bytes for each parameter (8*3 => 24GB)</li> <li>Adafactor optimizer will need more than 12GB. It uses slightly more than 4 bytes for each parameter, so 4*3 and then some extra.</li> <li>8bit BNB quantized optimizer will use only (2*3) 6GB if all optimizer states are quantized.</li></ul> <h3 class="relative group"><a id="adafactor" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#adafactor"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 
56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Adafactor</span></h3> <p data-svelte-h="svelte-19jpav8">Adafactor doesn’t store rolling averages for each element in weight matrices. Instead, it keeps aggregated information | |
| (sums of rolling averages row- and column-wise), significantly reducing its footprint. However, compared to Adam, | |
| Adafactor may have slower convergence in certain cases.</p> <p data-svelte-h="svelte-dh41ec">You can switch to Adafactor by setting <code>optim="adafactor"</code> in <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.TrainingArguments">TrainingArguments</a>:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->training_args = TrainingArguments(per_device_train_batch_size=<span class="hljs-number">4</span>, optim=<span class="hljs-string">"adafactor"</span>, **default_args)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1850zf2">Combined with other approaches (gradient accumulation, gradient checkpointing, and mixed precision training) | |
you can notice up to 3x memory improvement while maintaining the throughput! However, as mentioned before, the convergence of
| Adafactor can be worse than Adam.</p> <h3 class="relative group"><a id="8-bit-adam" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#8-bit-adam"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>8-bit Adam</span></h3> <p data-svelte-h="svelte-1ld1y8t">Instead of aggregating optimizer states like Adafactor, 8-bit Adam keeps the full state and quantizes it. Quantization | |
| means that it stores the state with lower precision and dequantizes it only for the optimization. This is similar to the | |
| idea behind mixed precision training.</p> <p data-svelte-h="svelte-9trohq">To use <code>adamw_bnb_8bit</code>, you simply need to set <code>optim="adamw_bnb_8bit"</code> in <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.TrainingArguments">TrainingArguments</a>:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->training_args = TrainingArguments(per_device_train_batch_size=<span class="hljs-number">4</span>, optim=<span class="hljs-string">"adamw_bnb_8bit"</span>, **default_args)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-czmcu6">However, we can also use a third-party implementation of the 8-bit optimizer for demonstration purposes to see how that can be integrated.</p> <p data-svelte-h="svelte-cn7c3a">First, follow the installation guide in the GitHub <a 
href="https://github.com/bitsandbytes-foundation/bitsandbytes" rel="nofollow">repo</a> to install the <code>bitsandbytes</code> library | |
| that implements the 8-bit Adam optimizer.</p> <p data-svelte-h="svelte-19nyc4u">Next you need to initialize the optimizer. This involves two steps:</p> <ul data-svelte-h="svelte-8z8ok2"><li>First, group the model’s parameters into two groups - one where weight decay should be applied, and the other one where it should not. Usually, biases and layer norm parameters are not weight decayed.</li> <li>Then do some argument housekeeping to use the same parameters as the previously used AdamW optimizer.</li></ul> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> bitsandbytes <span class="hljs-keyword">as</span> bnb | |
| <span class="hljs-keyword">from</span> torch <span class="hljs-keyword">import</span> nn | |
| <span class="hljs-keyword">from</span> transformers.trainer_pt_utils <span class="hljs-keyword">import</span> get_parameter_names | |
| training_args = TrainingArguments(per_device_train_batch_size=<span class="hljs-number">4</span>, **default_args) | |
| decay_parameters = get_parameter_names(model, [nn.LayerNorm]) | |
| decay_parameters = [name <span class="hljs-keyword">for</span> name <span class="hljs-keyword">in</span> decay_parameters <span class="hljs-keyword">if</span> <span class="hljs-string">"bias"</span> <span class="hljs-keyword">not</span> <span class="hljs-keyword">in</span> name] | |
| optimizer_grouped_parameters = [ | |
| { | |
| <span class="hljs-string">"params"</span>: [p <span class="hljs-keyword">for</span> n, p <span class="hljs-keyword">in</span> model.named_parameters() <span class="hljs-keyword">if</span> n <span class="hljs-keyword">in</span> decay_parameters], | |
| <span class="hljs-string">"weight_decay"</span>: training_args.weight_decay, | |
| }, | |
| { | |
| <span class="hljs-string">"params"</span>: [p <span class="hljs-keyword">for</span> n, p <span class="hljs-keyword">in</span> model.named_parameters() <span class="hljs-keyword">if</span> n <span class="hljs-keyword">not</span> <span class="hljs-keyword">in</span> decay_parameters], | |
| <span class="hljs-string">"weight_decay"</span>: <span class="hljs-number">0.0</span>, | |
| }, | |
| ] | |
| optimizer_kwargs = { | |
| <span class="hljs-string">"betas"</span>: (training_args.adam_beta1, training_args.adam_beta2), | |
| <span class="hljs-string">"eps"</span>: training_args.adam_epsilon, | |
| } | |
| optimizer_kwargs[<span class="hljs-string">"lr"</span>] = training_args.learning_rate | |
| adam_bnb_optim = bnb.optim.Adam8bit( | |
| optimizer_grouped_parameters, | |
| betas=(training_args.adam_beta1, training_args.adam_beta2), | |
| eps=training_args.adam_epsilon, | |
| lr=training_args.learning_rate, | |
| )<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-n8bkwf">Finally, pass the custom optimizer as an argument to the <code>Trainer</code>:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->trainer = Trainer(model=model, args=training_args, train_dataset=ds, optimizers=(adam_bnb_optim, <span class="hljs-literal">None</span>))<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1egi1g7">Combined with other approaches (gradient accumulation, gradient checkpointing, and mixed precision training), | |
you can expect to get about a 3x memory improvement and even slightly higher throughput than with Adafactor.</p> <h3 class="relative group"><a id="multitensor" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#multitensor"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>multi_tensor</span></h3> <p data-svelte-h="svelte-1cvgqaq">pytorch-nightly introduced <code>torch.optim._multi_tensor</code> which should significantly speed up the optimizers for situations
| with lots of small feature tensors. It should eventually become the default, but if you want to experiment with it sooner, take a look at this GitHub <a href="https://github.com/huggingface/transformers/issues/9965" rel="nofollow">issue</a>.</p> <h2 class="relative group"><a id="data-preloading" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#data-preloading"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Data preloading</span></h2> <p data-svelte-h="svelte-16tninz">One of the important requirements to reach great training speed is the ability to feed the GPU at the maximum speed it | |
| can handle. By default, everything happens in the main process, and it might not be able to read the data from disk fast | |
| enough, and thus create a bottleneck, leading to GPU under-utilization. Configure the following arguments to reduce the bottleneck:</p> <ul data-svelte-h="svelte-1i85v8g"><li><code>DataLoader(pin_memory=True, ...)</code> - ensures the data gets preloaded into the pinned memory on CPU and typically leads to much faster transfers from CPU to GPU memory.</li> <li><code>DataLoader(num_workers=4, ...)</code> - spawn several workers to preload data faster. During training, watch the GPU utilization stats; if it’s far from 100%, experiment with increasing the number of workers. Of course, the problem could be elsewhere, so many workers won’t necessarily lead to better performance.</li></ul> <p data-svelte-h="svelte-ghk363">When using <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a>, the corresponding <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.TrainingArguments">TrainingArguments</a> are: <code>dataloader_pin_memory</code> (<code>True</code> by default), and <code>dataloader_num_workers</code> (defaults to <code>0</code>).</p> <h2 class="relative group"><a id="deepspeed-zero" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#deepspeed-zero"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 
56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>DeepSpeed ZeRO</span></h2> <p data-svelte-h="svelte-1onp6io">DeepSpeed is an open-source deep learning optimization library that is integrated with 🤗 Transformers and 🤗 Accelerate. | |
| It provides a wide range of features and optimizations designed to improve the efficiency and scalability of large-scale | |
| deep learning training.</p> <p data-svelte-h="svelte-18hg7g">If your model fits onto a single GPU and you have enough space to fit a small batch size, you don’t need to use DeepSpeed | |
| as it’ll only slow things down. However, if the model doesn’t fit onto a single GPU or you can’t fit a small batch, you can | |
| leverage DeepSpeed ZeRO + CPU Offload, or NVMe Offload for much larger models. In this case, you need to separately | |
| <a href="main_classes/deepspeed#installation">install the library</a>, then follow one of the guides to create a configuration file | |
| and launch DeepSpeed:</p> <ul data-svelte-h="svelte-j49w9v"><li>For an in-depth guide on DeepSpeed integration with <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a>, review <a href="main_classes/deepspeed">the corresponding documentation</a>, specifically the | |
| <a href="main_classes/deepspeed#deployment-with-one-gpu">section for a single GPU</a>. Some adjustments are required to use DeepSpeed in a notebook; please take a look at the <a href="main_classes/deepspeed#deployment-in-notebooks">corresponding guide</a>.</li> <li>If you prefer to use 🤗 Accelerate, refer to <a href="https://huggingface.co/docs/accelerate/en/usage_guides/deepspeed" rel="nofollow">🤗 Accelerate DeepSpeed guide</a>.</li></ul> <h2 class="relative group"><a id="using-torchcompile" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#using-torchcompile"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Using torch.compile</span></h2> <p data-svelte-h="svelte-5qw0v5">PyTorch 2.0 introduced a new compile function that doesn’t require any modification to existing PyTorch code but can | |
optimize your code by adding a single line of code: <code>model = torch.compile(model)</code>.</p> <p data-svelte-h="svelte-a2bbh0">If using <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a>, you only need to pass the <code>torch_compile</code> option in the <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.TrainingArguments">TrainingArguments</a>:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->training_args = TrainingArguments(torch_compile=<span class="hljs-literal">True</span>, **default_args)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1y4s25e"><code>torch.compile</code> uses Python’s frame evaluation API to automatically create a graph from existing PyTorch programs. After
| capturing the graph, different backends can be deployed to lower the graph to an optimized engine. | |
| You can find more details and benchmarks in <a href="https://pytorch.org/get-started/pytorch-2.0/" rel="nofollow">PyTorch documentation</a>.</p> <p data-svelte-h="svelte-jluuok"><code>torch.compile</code> has a growing list of backends, which can be found by calling <code>torchdynamo.list_backends()</code>, each with its own optional dependencies.</p> <p data-svelte-h="svelte-12ef0m3">Choose which backend to use by specifying it via <code>torch_compile_backend</code> in the <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.TrainingArguments">TrainingArguments</a>. Some of the most commonly used backends are:</p> <p data-svelte-h="svelte-v82kng"><strong>Debugging backends</strong>:</p> <ul data-svelte-h="svelte-1dbgjs7"><li><code>dynamo.optimize("eager")</code> - Uses PyTorch to run the extracted GraphModule. This is quite useful in debugging TorchDynamo issues.</li> <li><code>dynamo.optimize("aot_eager")</code> - Uses AotAutograd with no compiler, i.e., just using PyTorch eager for the AotAutograd’s extracted forward and backward graphs. This is useful for debugging, and unlikely to give speedups.</li></ul> <p data-svelte-h="svelte-m2qf5f"><strong>Training & inference backends</strong>:</p> <ul data-svelte-h="svelte-x9b2wm"><li><code>dynamo.optimize("inductor")</code> - Uses TorchInductor backend with AotAutograd and cudagraphs by leveraging codegened Triton kernels <a href="https://dev-discuss.pytorch.org/t/torchinductor-a-pytorch-native-compiler-with-define-by-run-ir-and-symbolic-shapes/747" rel="nofollow">Read more</a></li> <li><code>dynamo.optimize("nvfuser")</code> - nvFuser with TorchScript. <a href="https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593" rel="nofollow">Read more</a></li> <li><code>dynamo.optimize("aot_nvfuser")</code> - nvFuser with AotAutograd. 
<a href="https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593" rel="nofollow">Read more</a></li> <li><code>dynamo.optimize("aot_cudagraphs")</code> - cudagraphs with AotAutograd. <a href="https://github.com/pytorch/torchdynamo/pull/757" rel="nofollow">Read more</a></li></ul> <p data-svelte-h="svelte-78l3h4"><strong>Inference-only backend</strong>s:</p> <ul data-svelte-h="svelte-1aisp13"><li><code>dynamo.optimize("ofi")</code> - Uses TorchScript optimize_for_inference. <a href="https://pytorch.org/docs/stable/generated/torch.jit.optimize_for_inference.html" rel="nofollow">Read more</a></li> <li><code>dynamo.optimize("fx2trt")</code> - Uses NVIDIA TensorRT for inference optimizations. <a href="https://pytorch.org/TensorRT/tutorials/getting_started_with_fx_path.html" rel="nofollow">Read more</a></li> <li><code>dynamo.optimize("onnxrt")</code> - Uses ONNXRT for inference on CPU/GPU. <a href="https://onnxruntime.ai/" rel="nofollow">Read more</a></li> <li><code>dynamo.optimize("ipex")</code> - Uses IPEX for inference on CPU. 
<a href="https://github.com/intel/intel-extension-for-pytorch" rel="nofollow">Read more</a></li></ul> <p data-svelte-h="svelte-1prqm7s">For an example of using <code>torch.compile</code> with 🤗 Transformers, check out this <a href="https://www.philschmid.de/getting-started-pytorch-2-0-transformers" rel="nofollow">blog post on fine-tuning a BERT model for Text Classification using the newest PyTorch 2.0 features</a></p> <h2 class="relative group"><a id="using--peft" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#using--peft"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Using 🤗 PEFT</span></h2> <p data-svelte-h="svelte-8pfmhp"><a href="https://huggingface.co/blog/peft" rel="nofollow">Parameter-Efficient Fine Tuning (PEFT)</a> methods freeze the pretrained model parameters during fine-tuning and add a small number of trainable parameters (the adapters) on top of it.</p> <p data-svelte-h="svelte-vtpr6t">As a result the <a href="https://huggingface.co/docs/transformers/model_memory_anatomy#anatomy-of-models-memory" rel="nofollow">memory associated to the optimizer states and gradients</a> are greatly reduced.</p> <p 
data-svelte-h="svelte-10pzmck">For example with a vanilla AdamW, the memory requirement for the optimizer state would be:</p> <ul data-svelte-h="svelte-1c6ggig"><li>fp32 copy of parameters: 4 bytes/param</li> <li>Momentum: 4 bytes/param</li> <li>Variance: 4 bytes/param</li></ul> <p data-svelte-h="svelte-o73t8f">Suppose a model with 7B parameters and 200 million parameters injected with <a href="https://huggingface.co/docs/peft/conceptual_guides/lora" rel="nofollow">Low Rank Adapters</a>.</p> <p data-svelte-h="svelte-13ydqxp">The memory requirement for the optimizer state of the plain model would be 12 * 7 = 84 GB (assuming 7B trainable parameters).</p> <p data-svelte-h="svelte-16ojy37">Adding Lora increases slightly the memory associated to the model weights and substantially decreases memory requirement for the optimizer state to 12 * 0.2 = 2.4GB.</p> <p data-svelte-h="svelte-1bqs6c8">Read more about PEFT and its detailed usage in <a href="https://huggingface.co/docs/peft/" rel="nofollow">the PEFT documentation</a> or <a href="https://github.com/huggingface/peft" rel="nofollow">PEFT repository</a>.</p> <h2 class="relative group"><a id="using--accelerate" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#using--accelerate"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 
56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Using 🤗 Accelerate</span></h2> <p data-svelte-h="svelte-d91a9c">With <a href="https://huggingface.co/docs/accelerate/index" rel="nofollow">🤗 Accelerate</a> you can use the above methods while gaining full | |
| control over the training loop and can essentially write the loop in pure PyTorch with some minor modifications.</p> <p data-svelte-h="svelte-z796ng">Suppose you have combined the methods in the <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.TrainingArguments">TrainingArguments</a> like so:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->training_args = TrainingArguments( | |
| per_device_train_batch_size=<span class="hljs-number">1</span>, | |
| gradient_accumulation_steps=<span class="hljs-number">4</span>, | |
| gradient_checkpointing=<span class="hljs-literal">True</span>, | |
| fp16=<span class="hljs-literal">True</span>, | |
| **default_args, | |
| )<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-eja69v">The full example training loop with 🤗 Accelerate is only a handful of lines of code long:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> accelerate <span class="hljs-keyword">import</span> Accelerator | |
| <span class="hljs-keyword">from</span> torch.utils.data.dataloader <span class="hljs-keyword">import</span> DataLoader | |
| dataloader = DataLoader(ds, batch_size=training_args.per_device_train_batch_size) | |
| <span class="hljs-keyword">if</span> training_args.gradient_checkpointing: | |
| model.gradient_checkpointing_enable() | |
| accelerator = Accelerator(fp16=training_args.fp16) | |
| model, optimizer, dataloader = accelerator.prepare(model, adam_bnb_optim, dataloader) | |
| model.train() | |
| <span class="hljs-keyword">for</span> step, batch <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(dataloader, start=<span class="hljs-number">1</span>): | |
| loss = model(**batch).loss | |
| loss = loss / training_args.gradient_accumulation_steps | |
| accelerator.backward(loss) | |
| <span class="hljs-keyword">if</span> step % training_args.gradient_accumulation_steps == <span class="hljs-number">0</span>: | |
| optimizer.step() | |
| optimizer.zero_grad()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1wtx2y4">First we wrap the dataset in a <a href="https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader" rel="nofollow"><code>DataLoader</code></a>. | |
| Then we can enable gradient checkpointing by calling the model’s <a href="/docs/transformers/pr_33913/en/main_classes/model#transformers.PreTrainedModel.gradient_checkpointing_enable">gradient_checkpointing_enable()</a> method. | |
| When we initialize the <a href="https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator" rel="nofollow"><code>Accelerator</code></a> | |
| we can specify if we want to use mixed precision training and it will take care of it for us in the <code>prepare</code> call. | |
| During the <a href="https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.prepare" rel="nofollow"><code>prepare</code></a> | |
| call the dataloader will also be distributed across workers should we use multiple GPUs. We use the same <a href="#8-bit-adam">8-bit optimizer</a> from the earlier example.</p> <p data-svelte-h="svelte-i5733s">Finally, we can add the main training loop. Note that the <code>backward</code> call is handled by 🤗 Accelerate. We can also see | |
| how gradient accumulation works: we normalize the loss, so we get the average at the end of accumulation and once we have | |
| enough steps we run the optimization.</p> <p data-svelte-h="svelte-1caz98g">Implementing these optimization techniques with 🤗 Accelerate only takes a handful of lines of code and comes with the | |
| benefit of more flexibility in the training loop. For a full documentation of all features have a look at the | |
| <a href="https://huggingface.co/docs/accelerate/index" rel="nofollow">Accelerate documentation</a>.</p> <h2 class="relative group"><a id="efficient-software-prebuilds" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#efficient-software-prebuilds"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Efficient Software Prebuilds</span></h2> <p data-svelte-h="svelte-nmhpgl">PyTorch’s <a href="https://pytorch.org/get-started/locally/#start-locally" rel="nofollow">pip and conda builds</a> come prebuilt with the cuda toolkit | |
| which is enough to run PyTorch, but it is insufficient if you need to build cuda extensions.</p> <p data-svelte-h="svelte-kxuhyz">At times, additional efforts may be required to pre-build some components. For instance, if you’re using libraries like <code>apex</code> that | |
| don’t come pre-compiled. In other situations figuring out how to install the right cuda toolkit system-wide can be complicated. | |
| To address these scenarios PyTorch and NVIDIA released a new version of NGC docker container which already comes with | |
| everything prebuilt. You just need to install your programs on it, and it will run out of the box.</p> <p data-svelte-h="svelte-qkuwkx">This approach is also useful if you want to tweak the pytorch source and/or make a new customized build. | |
| To find the docker image version you want, start <a href="https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/" rel="nofollow">with PyTorch release notes</a>, | |
| choose one of the latest monthly releases. Go into the release’s notes for the desired release, check that the environment’s | |
| components are matching your needs (including NVIDIA Driver requirements!) and then at the very top of that document go | |
| to the corresponding NGC page. If for some reason you get lost, here is <a href="https://ngc.nvidia.com/catalog/containers/nvidia:pytorch" rel="nofollow">the index of all PyTorch NGC images</a>.</p> <p data-svelte-h="svelte-16g8p4x">Next follow the instructions to download and deploy the docker image.</p> <h2 class="relative group"><a id="mixture-of-experts" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#mixture-of-experts"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Mixture of Experts</span></h2> <p data-svelte-h="svelte-1aawf57">Some recent papers reported a 4-5x training speedup and a faster inference by integrating | |
| Mixture of Experts (MoE) into the Transformer models.</p> <p data-svelte-h="svelte-ln9ugc">Since it has been discovered that more parameters lead to better performance, this technique allows increasing the | |
| number of parameters by an order of magnitude without increasing training costs.</p> <p data-svelte-h="svelte-1ett6yb">In this approach every other FFN layer is replaced with a MoE Layer which consists of many experts, with a gated function | |
| that trains each expert in a balanced way depending on the input token’s position in a sequence.</p> <p data-svelte-h="svelte-tdgts9"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/perf-moe-transformer.png" alt="MoE Transformer 2x block"></p> <p data-svelte-h="svelte-z2dori">(source: <a href="https://ai.googleblog.com/2021/12/more-efficient-in-context-learning-with.html" rel="nofollow">GLAM</a>)</p> <p data-svelte-h="svelte-1gc8znk">You can find exhaustive details and comparison tables in the papers listed at the end of this section.</p> <p data-svelte-h="svelte-13sa7m0">The main drawback of this approach is that it requires staggering amounts of GPU memory - almost an order of magnitude | |
| larger than its dense equivalent. Various distillation and other approaches have been proposed to overcome the much higher memory requirements.</p> <p data-svelte-h="svelte-dxh8d5">There is a direct trade-off though: you can use just a few experts with a 2-3x smaller base model instead of dozens or | |
| hundreds of experts, leading to a 5x smaller model and thus increase the training speed moderately while increasing the | |
| memory requirements moderately as well.</p> <p data-svelte-h="svelte-1jfhyzg">Most related papers and implementations are built around Tensorflow/TPUs:</p> <ul data-svelte-h="svelte-1fjc55v"><li><a href="https://arxiv.org/abs/2006.16668" rel="nofollow">GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding</a></li> <li><a href="https://arxiv.org/abs/2101.03961" rel="nofollow">Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity</a></li> <li><a href="https://ai.googleblog.com/2021/12/more-efficient-in-context-learning-with.html" rel="nofollow">GLaM: Generalist Language Model (GLaM)</a></li></ul> <p data-svelte-h="svelte-1nxcmqe">And for Pytorch DeepSpeed has built one as well: <a href="https://arxiv.org/abs/2201.05596" rel="nofollow">DeepSpeed-MoE: Advancing Mixture-of-Experts Inference and Training to Power Next-Generation AI Scale</a>, <a href="https://www.deepspeed.ai/tutorials/mixture-of-experts/" rel="nofollow">Mixture of Experts</a> - blog posts: <a href="https://www.microsoft.com/en-us/research/blog/deepspeed-powers-8x-larger-moe-model-training-with-high-performance/" rel="nofollow">1</a>, <a href="https://www.microsoft.com/en-us/research/publication/scalable-and-efficient-moe-training-for-multitask-multilingual-models/" rel="nofollow">2</a> and specific deployment with large transformer-based natural language generation models: <a href="https://www.deepspeed.ai/2021/12/09/deepspeed-moe-nlg.html" rel="nofollow">blog post</a>, <a href="https://github.com/microsoft/Megatron-DeepSpeed/tree/moe-training" rel="nofollow">Megatron-Deepspeed branch</a>.</p> <h2 class="relative group"><a id="using-pytorch-native-attention-and-flash-attention" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#using-pytorch-native-attention-and-flash-attention"><span><svg class="" 
xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Using PyTorch native attention and Flash Attention</span></h2> <p data-svelte-h="svelte-1yxcg0k">PyTorch’s <a href="https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html" rel="nofollow"><code>torch.nn.functional.scaled_dot_product_attention</code></a> (SDPA) can also call FlashAttention and memory-efficient attention kernels under the hood. SDPA support is currently being added natively in Transformers and is used by default for <code>torch>=2.1.1</code> when an implementation is available. 
Please refer to <a href="https://huggingface.co/docs/transformers/perf_infer_gpu_one#pytorch-scaled-dot-product-attention" rel="nofollow">PyTorch scaled dot product attention</a> for a list of supported models and more details.</p> <p data-svelte-h="svelte-1r3sioy">Check out this <a href="https://pytorch.org/blog/out-of-the-box-acceleration/" rel="nofollow">blogpost</a> to learn more about acceleration and memory-savings with SDPA.</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/transformers/blob/main/docs/source/en/perf_train_gpu_one.md" target="_blank"><span data-svelte-h="svelte-1kd6by1"><</span> <span data-svelte-h="svelte-x0xyl0">></span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p> | |
| <script> | |
| { | |
| __sveltekit_z647wz = { | |
| assets: "/docs/transformers/pr_33913/en", | |
| base: "/docs/transformers/pr_33913/en", | |
| env: {} | |
| }; | |
| const element = document.currentScript.parentElement; | |
| const data = [null,null]; | |
| Promise.all([ | |
| import("/docs/transformers/pr_33913/en/_app/immutable/entry/start.b67f883f.js"), | |
| import("/docs/transformers/pr_33913/en/_app/immutable/entry/app.e436b1f2.js") | |
| ]).then(([kit, app]) => { | |
| kit.start(app, element, { | |
| node_ids: [0, 386], | |
| data, | |
| form: null, | |
| error: null | |
| }); | |
| }); | |
| } | |
| </script> | |
Xet Storage Details
- Size:
- 91.4 kB
- Xet hash:
- ab79a09a9e0c426c947d324374f397fe6cf19f9f1f4761b9a75ac55aae566f8e
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.