Buckets:

hf-doc-build
/

doc-dev

Files

xet

hf-doc-build/doc-dev / transformers /pr_33913 /en /deepspeed.html

rtrm

27 days ago

download

raw

150 kB

	<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{"title":"DeepSpeed","local":"deepspeed","sections":[{"title":"Installation","local":"installation","sections":[],"depth":2},{"title":"Memory requirements","local":"memory-requirements","sections":[],"depth":2},{"title":"Select a ZeRO stage","local":"select-a-zero-stage","sections":[],"depth":2},{"title":"DeepSpeed configuration file","local":"deepspeed-configuration-file","sections":[{"title":"DeepSpeed and Trainer parameters","local":"deepspeed-and-trainer-parameters","sections":[],"depth":3},{"title":"ZeRO configuration","local":"zero-configuration","sections":[],"depth":3},{"title":"NVMe configuration","local":"nvme-configuration","sections":[],"depth":3}],"depth":2},{"title":"DeepSpeed features","local":"deepspeed-features","sections":[{"title":"Activation/gradient checkpointing","local":"activationgradient-checkpointing","sections":[],"depth":3},{"title":"Optimizer and scheduler","local":"optimizer-and-scheduler","sections":[],"depth":3},{"title":"Precision","local":"precision","sections":[],"depth":3},{"title":"Batch size","local":"batch-size","sections":[],"depth":3},{"title":"Gradient accumulation","local":"gradient-accumulation","sections":[],"depth":3},{"title":"Gradient clipping","local":"gradient-clipping","sections":[],"depth":3},{"title":"Communication data type","local":"communication-data-type","sections":[],"depth":3}],"depth":2},{"title":"Deployment","local":"deployment","sections":[{"title":"Multi-node deployment","local":"multi-node-deployment","sections":[],"depth":3},{"title":"SLURM","local":"slurm","sections":[],"depth":3},{"title":"Notebook","local":"notebook","sections":[],"depth":3}],"depth":2},{"title":"Save model weights","local":"save-model-weights","sections":[{"title":"Online","local":"online","sections":[],"depth":3},{"title":"Offline","local":"offline","sections":[],"depth":3}],"depth":2},{"title":"ZeRO Inference","local":"zero-inference","sections":[],"depth":2},{"title":"Non-Trainer DeepSpeed integration","local":"non-trainer-deepspeed-integration","sections":[{"title":"Non-Trainer ZeRO Inference","local":"non-trainer-zero-inference","sections":[],"depth":3},{"title":"Generate","local":"generate","sections":[],"depth":3}],"depth":2},{"title":"Troubleshoot","local":"troubleshoot","sections":[{"title":"DeepSpeed process killed at startup","local":"deepspeed-process-killed-at-startup","sections":[],"depth":3},{"title":"NaN loss","local":"nan-loss","sections":[],"depth":3}],"depth":2},{"title":"Resources","local":"resources","sections":[],"depth":2}],"depth":1}">
	<link href="/docs/transformers/pr_33913/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
	<link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/entry/start.b67f883f.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/scheduler.25b97de1.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/singletons.62a184e0.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/index.e188933d.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/paths.51881b9e.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/entry/app.e436b1f2.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/index.d9030fc9.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/nodes/0.05e395f5.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/each.e59479a4.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/nodes/19.4d5d1b87.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/Tip.baa67368.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/CodeBlock.e6cd0d95.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/EditOnGithub.91d95064.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/HfOption.1e589c90.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/stores.c3f24f16.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"DeepSpeed","local":"deepspeed","sections":[{"title":"Installation","local":"installation","sections":[],"depth":2},{"title":"Memory requirements","local":"memory-requirements","sections":[],"depth":2},{"title":"Select a ZeRO stage","local":"select-a-zero-stage","sections":[],"depth":2},{"title":"DeepSpeed configuration file","local":"deepspeed-configuration-file","sections":[{"title":"DeepSpeed and Trainer parameters","local":"deepspeed-and-trainer-parameters","sections":[],"depth":3},{"title":"ZeRO configuration","local":"zero-configuration","sections":[],"depth":3},{"title":"NVMe configuration","local":"nvme-configuration","sections":[],"depth":3}],"depth":2},{"title":"DeepSpeed features","local":"deepspeed-features","sections":[{"title":"Activation/gradient checkpointing","local":"activationgradient-checkpointing","sections":[],"depth":3},{"title":"Optimizer and scheduler","local":"optimizer-and-scheduler","sections":[],"depth":3},{"title":"Precision","local":"precision","sections":[],"depth":3},{"title":"Batch size","local":"batch-size","sections":[],"depth":3},{"title":"Gradient accumulation","local":"gradient-accumulation","sections":[],"depth":3},{"title":"Gradient clipping","local":"gradient-clipping","sections":[],"depth":3},{"title":"Communication data type","local":"communication-data-type","sections":[],"depth":3}],"depth":2},{"title":"Deployment","local":"deployment","sections":[{"title":"Multi-node deployment","local":"multi-node-deployment","sections":[],"depth":3},{"title":"SLURM","local":"slurm","sections":[],"depth":3},{"title":"Notebook","local":"notebook","sections":[],"depth":3}],"depth":2},{"title":"Save model weights","local":"save-model-weights","sections":[{"title":"Online","local":"online","sections":[],"depth":3},{"title":"Offline","local":"offline","sections":[],"depth":3}],"depth":2},{"title":"ZeRO Inference","local":"zero-inference","sections":[],"depth":2},{"title":"Non-Trainer DeepSpeed integration","local":"non-trainer-deepspeed-integration","sections":[{"title":"Non-Trainer ZeRO Inference","local":"non-trainer-zero-inference","sections":[],"depth":3},{"title":"Generate","local":"generate","sections":[],"depth":3}],"depth":2},{"title":"Troubleshoot","local":"troubleshoot","sections":[{"title":"DeepSpeed process killed at startup","local":"deepspeed-process-killed-at-startup","sections":[],"depth":3},{"title":"NaN loss","local":"nan-loss","sections":[],"depth":3}],"depth":2},{"title":"Resources","local":"resources","sections":[],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="deepspeed" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#deepspeed"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>DeepSpeed</span></h1> <p data-svelte-h="svelte-na7xsx"><a href="https://www.deepspeed.ai/" rel="nofollow">DeepSpeed</a> is a PyTorch optimization library that makes distributed training memory-efficient and fast. At its core is the <a href="https://hf.co/papers/1910.02054" rel="nofollow">Zero Redundancy Optimizer (ZeRO)</a> which enables training large models at scale. ZeRO works in several stages:</p> <ul data-svelte-h="svelte-q7iq3b"><li>ZeRO-1, optimizer state partitioning across GPUs</li> <li>ZeRO-2, gradient partitioning across GPUs</li> <li>ZeRO-3, parameter partitioning across GPUs</li></ul> <p data-svelte-h="svelte-11mv7mj">In GPU-limited environments, ZeRO also enables offloading optimizer memory and computation from the GPU to the CPU to fit and train really large models on a single GPU. DeepSpeed is integrated with the Transformers <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a> class for all ZeRO stages and offloading. All you need to do is provide a config file or you can use a provided template. For inference, Transformers support ZeRO-3 and offloading since it allows loading huge models.</p> <p data-svelte-h="svelte-z4wws9">This guide will walk you through how to deploy DeepSpeed training, the features you can enable, how to setup the config files for different ZeRO stages, offloading, inference, and using DeepSpeed without the <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a>.</p> <h2 class="relative group"><a id="installation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#installation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Installation</span></h2> <p data-svelte-h="svelte-fuidv5">DeepSpeed is available to install from PyPI or Transformers (for more detailed installation options, take a look at the DeepSpeed <a href="https://www.deepspeed.ai/tutorials/advanced-install/" rel="nofollow">installation details</a> or the GitHub <a href="https://github.com/microsoft/deepspeed#installation" rel="nofollow">README</a>).</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1cityiq">If you’re having difficulties installing DeepSpeed, check the <a href="../debugging#deepspeed-cuda-installation">DeepSpeed CUDA installation</a> guide. While DeepSpeed has a pip installable PyPI package, it is highly recommended to <a href="https://www.deepspeed.ai/tutorials/advanced-install/#install-deepspeed-from-source" rel="nofollow">install it from source</a> to best match your hardware and to support certain features, like 1-bit Adam, which aren’t available in the PyPI distribution.</p></div> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">PyPI </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">Transformers </div></div> <div class="language-select"><div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pip install deepspeed<!-- HTML_TAG_END --></pre></div> </div> <h2 class="relative group"><a id="memory-requirements" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#memory-requirements"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Memory requirements</span></h2> <p data-svelte-h="svelte-1fd8xlc">Before you begin, it is a good idea to check whether you have enough GPU and CPU memory to fit your model. DeepSpeed provides a tool for estimating the required CPU/GPU memory. For example, to estimate the memory requirements for the <a href="bigscience/T0_3B">bigscience/T0_3B</a> model on a single GPU:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->$ python -c <span class="hljs-string">'from transformers import AutoModel; \
	from deepspeed.runtime.zero.stage3 import estimate_zero3_model_states_mem_needs_all_live; \
	model = AutoModel.from_pretrained("bigscience/T0_3B"); \
	estimate_zero3_model_states_mem_needs_all_live(model, num_gpus_per_node=1, num_nodes=1)'</span>
	[...]
	Estimated memory needed <span class="hljs-keyword">for</span> params, optim states and gradients <span class="hljs-keyword">for</span> a:
	HW: Setup with 1 node, 1 GPU per node.
	SW: Model with 2783M total params, 65M largest layer params.
	per CPU \| per GPU \| Options
	70.00GB \| 0.25GB \| offload_param=cpu , offload_optimizer=cpu , zero_init=1
	70.00GB \| 0.25GB \| offload_param=cpu , offload_optimizer=cpu , zero_init=0
	62.23GB \| 5.43GB \| offload_param=none, offload_optimizer=cpu , zero_init=1
	62.23GB \| 5.43GB \| offload_param=none, offload_optimizer=cpu , zero_init=0
	0.37GB \| 46.91GB \| offload_param=none, offload_optimizer=none, zero_init=1
	15.56GB \| 46.91GB \| offload_param=none, offload_optimizer=none, zero_init=0<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1xpnss2">This means you either need a single 80GB GPU without CPU offload or a 8GB GPU and a ~60GB CPU to offload to (these are just the memory requirements for the parameters, optimizer states and gradients, and you’ll need a bit more for the CUDA kernels and activations). You should also consider the tradeoff between cost and speed because it’ll be cheaper to rent or buy a smaller GPU but it’ll take longer to train your model.</p> <p data-svelte-h="svelte-js61aw">If you have enough GPU memory make sure you disable CPU/NVMe offload to make everything faster.</p> <h2 class="relative group"><a id="select-a-zero-stage" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#select-a-zero-stage"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Select a ZeRO stage</span></h2> <p data-svelte-h="svelte-rkauna">After you’ve installed DeepSpeed and have a better idea of your memory requirements, the next step is selecting a ZeRO stage to use. In order of fastest and most memory-efficient:</p> <table data-svelte-h="svelte-msolv0"><thead><tr><th>Fastest</th> <th>Memory efficient</th></tr></thead> <tbody><tr><td>ZeRO-1</td> <td>ZeRO-3 + offload</td></tr> <tr><td>ZeRO-2</td> <td>ZeRO-3</td></tr> <tr><td>ZeRO-2 + offload</td> <td>ZeRO-2 + offload</td></tr> <tr><td>ZeRO-3</td> <td>ZeRO-2</td></tr> <tr><td>ZeRO-3 + offload</td> <td>ZeRO-1</td></tr></tbody></table> <p data-svelte-h="svelte-4tusjn">To find what works best for you, start with the fastest approach and if you run out of memory, try the next stage which is slower but more memory efficient. Feel free to work in whichever direction you prefer (starting with the most memory efficient or fastest) to discover the appropriate balance between speed and memory usage.</p> <p data-svelte-h="svelte-po9y52">A general process you can use is (start with batch size of 1):</p> <ol data-svelte-h="svelte-1qq8hfi"><li>enable gradient checkpointing</li> <li>try ZeRO-2</li> <li>try ZeRO-2 and offload the optimizer</li> <li>try ZeRO-3</li> <li>try ZeRO-3 and offload parameters to the CPU</li> <li>try ZeRO-3 and offload parameters and the optimizer to the CPU</li> <li>try lowering various default values like a narrower search beam if you’re using the <a href="/docs/transformers/pr_33913/en/main_classes/text_generation#transformers.GenerationMixin.generate">generate()</a> method</li> <li>try mixed half-precision (fp16 on older GPU architectures and bf16 on Ampere) over full-precision weights</li> <li>add more hardware if possible or enable Infinity to offload parameters and the optimizer to a NVMe</li> <li>once you’re not running out of memory, measure effective throughput and then try to increase the batch size as large as you can to maximize GPU efficiency</li> <li>lastly, try to optimize your training setup by disabling some offload features or use a faster ZeRO stage and increasing/decreasing the batch size to find the best tradeoff between speed and memory usage</li></ol> <h2 class="relative group"><a id="deepspeed-configuration-file" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#deepspeed-configuration-file"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>DeepSpeed configuration file</span></h2> <p data-svelte-h="svelte-1sbx1m0">DeepSpeed works with the <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a> class by way of a config file containing all the parameters for configuring how you want setup your training run. When you execute your training script, DeepSpeed logs the configuration it received from <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a> to the console so you can see exactly what configuration was used.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-a4z01l">Find a complete list of DeepSpeed configuration options on the <a href="https://www.deepspeed.ai/docs/config-json/" rel="nofollow">DeepSpeed Configuration JSON</a> reference. You can also find more practical examples of various DeepSpeed configuration examples on the <a href="https://github.com/microsoft/DeepSpeedExamples" rel="nofollow">DeepSpeedExamples</a> repository or the main <a href="https://github.com/microsoft/DeepSpeed" rel="nofollow">DeepSpeed</a> repository. To quickly find specific examples, you can:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->git <span class="hljs-built_in">clone</span> https://github.com/microsoft/DeepSpeedExamples
	<span class="hljs-built_in">cd</span> DeepSpeedExamples
	find . -name <span class="hljs-string">'*json'</span>
	<span class="hljs-comment"># find examples with the Lamb optimizer</span>
	grep -i Lamb $(find . -name <span class="hljs-string">'*json'</span>)<!-- HTML_TAG_END --></pre></div></div> <p data-svelte-h="svelte-1mdw2zz">The DeepSpeed configuration file is passed as a path to a JSON file if you’re training from the command line interface or as a nested <code>dict</code> object if you’re using the <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a> in a notebook setting.</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">path to file </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">nested dict </div></div> <div class="language-select"><div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->TrainingArguments(..., deepspeed=<span class="hljs-string">"path/to/deepspeed_config.json"</span>)<!-- HTML_TAG_END --></pre></div> </div> <h3 class="relative group"><a id="deepspeed-and-trainer-parameters" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#deepspeed-and-trainer-parameters"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>DeepSpeed and Trainer parameters</span></h3> <p data-svelte-h="svelte-1nzz5la">There are three types of configuration parameters:</p> <ol data-svelte-h="svelte-1plehsh"><li><p>Some of the configuration parameters are shared by <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a> and DeepSpeed, and it can be difficult to identify errors when there are conflicting definitions. To make it easier, these shared configuration parameters are configured from the <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a> command line arguments.</p></li> <li><p>Some configuration parameters that are automatically derived from the model configuration so you don’t need to manually adjust these values. The <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a> uses a configuration value <code>auto</code> to determine set the most correct or efficient value. You could set your own configuration parameters explicitly, but you must take care to ensure the <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a> arguments and DeepSpeed configuration parameters agree. Mismatches may cause the training to fail in very difficult to detect ways!</p></li> <li><p>Some configuration parameters specific to DeepSpeed only which need to be manually set based on your training needs.</p></li></ol> <p data-svelte-h="svelte-ia21ro">You could also modify the DeepSpeed configuration and edit <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.TrainingArguments">TrainingArguments</a> from it:</p> <ol data-svelte-h="svelte-zil17c"><li>Create or load a DeepSpeed configuration to use as the main configuration</li> <li>Create a <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.TrainingArguments">TrainingArguments</a> object based on these DeepSpeed configuration values</li></ol> <p data-svelte-h="svelte-120abcy">Some values, such as <code>scheduler.params.total_num_steps</code> are calculated by the <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a> during training.</p> <h3 class="relative group"><a id="zero-configuration" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#zero-configuration"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>ZeRO configuration</span></h3> <p data-svelte-h="svelte-nuf72i">There are three configurations, each corresponding to a different ZeRO stage. Stage 1 is not as interesting for scalability, and this guide focuses on stages 2 and 3. The <code>zero_optimization</code> configuration contains all the options for what to enable and how to configure them. For a more detailed explanation of each parameter, take a look at the <a href="https://www.deepspeed.ai/docs/config-json/" rel="nofollow">DeepSpeed Configuration JSON</a> reference.</p> <div class="course-tip course-tip-orange bg-gradient-to-br dark:bg-gradient-to-r before:border-orange-500 dark:before:border-orange-800 from-orange-50 dark:from-gray-900 to-white dark:to-gray-950 border border-orange-50 text-orange-700 dark:text-gray-400">DeepSpeed doesn’t validate parameter names and any typos fallback on the parameter's default setting. You can watch the DeepSpeed engine startup log messages to see what values it is going to use.</div> <p data-svelte-h="svelte-ahvflf">The following configurations must be setup with DeepSpeed because the <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a> doesn’t provide equivalent command line arguments.</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">ZeRO-1 </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">ZeRO-2 </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">ZeRO-3 </div></div> <div class="language-select"><p data-svelte-h="svelte-hkpv8b">ZeRO-1 shards the optimizer states across GPUs, and you can expect a tiny speed up. The ZeRO-1 config can be setup like this:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{
	<span class="hljs-attr">"zero_optimization":</span> {
	<span class="hljs-attr">"stage":</span> <span class="hljs-number">1</span>
	}
	}<!-- HTML_TAG_END --></pre></div> </div> <h3 class="relative group"><a id="nvme-configuration" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#nvme-configuration"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>NVMe configuration</span></h3> <p data-svelte-h="svelte-1yvixx2"><a href="https://hf.co/papers/2104.07857" rel="nofollow">ZeRO-Infinity</a> allows offloading model states to the CPU and/or NVMe to save even more memory. Smart partitioning and tiling algorithms allow each GPU to send and receive very small amounts of data during offloading such that a modern NVMe can fit an even larger total memory pool than is available to your training process. ZeRO-Infinity requires ZeRO-3.</p> <p data-svelte-h="svelte-14zbxsz">Depending on the CPU and/or NVMe memory available, you can offload both the <a href="https://www.deepspeed.ai/docs/config-json/#optimizer-offloading" rel="nofollow">optimizer states</a> and <a href="https://www.deepspeed.ai/docs/config-json/#parameter-offloading" rel="nofollow">parameters</a>, just one of them, or none. You should also make sure the <code>nvme_path</code> is pointing to an NVMe device, because while it still works with a normal hard drive or solid state drive, it’ll be significantly slower. With a modern NVMe, you can expect peak transfer speeds of ~3.5GB/s for read and ~3GB/s for write operations. Lastly, <a href="https://github.com/microsoft/DeepSpeed/issues/998" rel="nofollow">run a benchmark</a> on your training setup to determine the optimal <code>aio</code> configuration.</p> <p data-svelte-h="svelte-1bftyio">The example ZeRO-3/Infinity configuration file below sets most of the parameter values to <code>auto</code>, but you could also manually add these values.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{
	<span class="hljs-attr">"fp16":</span> {
	<span class="hljs-attr">"enabled":</span> <span class="hljs-string">"auto"</span>,
	<span class="hljs-attr">"loss_scale":</span> <span class="hljs-number">0</span>,
	<span class="hljs-attr">"loss_scale_window":</span> <span class="hljs-number">1000</span>,
	<span class="hljs-attr">"initial_scale_power":</span> <span class="hljs-number">16</span>,
	<span class="hljs-attr">"hysteresis":</span> <span class="hljs-number">2</span>,
	<span class="hljs-attr">"min_loss_scale":</span> <span class="hljs-number">1</span>
	},

	<span class="hljs-attr">"optimizer":</span> {
	<span class="hljs-attr">"type":</span> <span class="hljs-string">"AdamW"</span>,
	<span class="hljs-attr">"params":</span> {
	<span class="hljs-attr">"lr":</span> <span class="hljs-string">"auto"</span>,
	<span class="hljs-attr">"betas":</span> <span class="hljs-string">"auto"</span>,
	<span class="hljs-attr">"eps":</span> <span class="hljs-string">"auto"</span>,
	<span class="hljs-attr">"weight_decay":</span> <span class="hljs-string">"auto"</span>
	}
	},

	<span class="hljs-attr">"scheduler":</span> {
	<span class="hljs-attr">"type":</span> <span class="hljs-string">"WarmupLR"</span>,
	<span class="hljs-attr">"params":</span> {
	<span class="hljs-attr">"warmup_min_lr":</span> <span class="hljs-string">"auto"</span>,
	<span class="hljs-attr">"warmup_max_lr":</span> <span class="hljs-string">"auto"</span>,
	<span class="hljs-attr">"warmup_num_steps":</span> <span class="hljs-string">"auto"</span>
	}
	},

	<span class="hljs-attr">"zero_optimization":</span> {
	<span class="hljs-attr">"stage":</span> <span class="hljs-number">3</span>,
	<span class="hljs-attr">"offload_optimizer":</span> {
	<span class="hljs-attr">"device":</span> <span class="hljs-string">"nvme"</span>,
	<span class="hljs-attr">"nvme_path":</span> <span class="hljs-string">"/local_nvme"</span>,
	<span class="hljs-attr">"pin_memory":</span> <span class="hljs-literal">true</span>,
	<span class="hljs-attr">"buffer_count":</span> <span class="hljs-number">4</span>,
	<span class="hljs-attr">"fast_init":</span> <span class="hljs-literal">false</span>
	},
	<span class="hljs-attr">"offload_param":</span> {
	<span class="hljs-attr">"device":</span> <span class="hljs-string">"nvme"</span>,
	<span class="hljs-attr">"nvme_path":</span> <span class="hljs-string">"/local_nvme"</span>,
	<span class="hljs-attr">"pin_memory":</span> <span class="hljs-literal">true</span>,
	<span class="hljs-attr">"buffer_count":</span> <span class="hljs-number">5</span>,
	<span class="hljs-attr">"buffer_size":</span> <span class="hljs-number">1e8</span>,
	<span class="hljs-attr">"max_in_cpu":</span> <span class="hljs-number">1e9</span>
	},
	<span class="hljs-attr">"aio":</span> {
	<span class="hljs-attr">"block_size":</span> <span class="hljs-number">262144</span>,
	<span class="hljs-attr">"queue_depth":</span> <span class="hljs-number">32</span>,
	<span class="hljs-attr">"thread_count":</span> <span class="hljs-number">1</span>,
	<span class="hljs-attr">"single_submit":</span> <span class="hljs-literal">false</span>,
	<span class="hljs-attr">"overlap_events":</span> <span class="hljs-literal">true</span>
	},
	<span class="hljs-attr">"overlap_comm":</span> <span class="hljs-literal">true</span>,
	<span class="hljs-attr">"contiguous_gradients":</span> <span class="hljs-literal">true</span>,
	<span class="hljs-attr">"sub_group_size":</span> <span class="hljs-number">1e9</span>,
	<span class="hljs-attr">"reduce_bucket_size":</span> <span class="hljs-string">"auto"</span>,
	<span class="hljs-attr">"stage3_prefetch_bucket_size":</span> <span class="hljs-string">"auto"</span>,
	<span class="hljs-attr">"stage3_param_persistence_threshold":</span> <span class="hljs-string">"auto"</span>,
	<span class="hljs-attr">"stage3_max_live_parameters":</span> <span class="hljs-number">1e9</span>,
	<span class="hljs-attr">"stage3_max_reuse_distance":</span> <span class="hljs-number">1e9</span>,
	<span class="hljs-attr">"stage3_gather_16bit_weights_on_model_save":</span> <span class="hljs-literal">true</span>
	},

	<span class="hljs-attr">"gradient_accumulation_steps":</span> <span class="hljs-string">"auto"</span>,
	<span class="hljs-attr">"gradient_clipping":</span> <span class="hljs-string">"auto"</span>,
	<span class="hljs-attr">"steps_per_print":</span> <span class="hljs-number">2000</span>,
	<span class="hljs-attr">"train_batch_size":</span> <span class="hljs-string">"auto"</span>,
	<span class="hljs-attr">"train_micro_batch_size_per_gpu":</span> <span class="hljs-string">"auto"</span>,
	<span class="hljs-attr">"wall_clock_breakdown":</span> <span class="hljs-literal">false</span>
	}<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="deepspeed-features" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#deepspeed-features"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>DeepSpeed features</span></h2> <p data-svelte-h="svelte-t13qkh">There are a number of important parameters to specify in the DeepSpeed configuration file which are briefly described in this section.</p> <h3 class="relative group"><a id="activationgradient-checkpointing" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#activationgradient-checkpointing"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Activation/gradient checkpointing</span></h3> <p data-svelte-h="svelte-1pyala7">Activation and gradient checkpointing trades speed for more GPU memory which allows you to overcome scenarios where your GPU is out of memory or to increase your batch size for better performance. To enable this feature:</p> <ol data-svelte-h="svelte-1y20l5v"><li>For a Hugging Face model, set <code>model.gradient_checkpointing_enable()</code> or <code>--gradient_checkpointing</code> in the <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a>.</li> <li>For a non-Hugging Face model, use the DeepSpeed <a href="https://deepspeed.readthedocs.io/en/latest/activation-checkpointing.html" rel="nofollow">Activation Checkpointing API</a>. You could also replace the Transformers modeling code and replace <code>torch.utils.checkpoint</code> with the DeepSpeed API. This approach is more flexible because you can offload the forward activations to the CPU memory instead of recalculating them.</li></ol> <h3 class="relative group"><a id="optimizer-and-scheduler" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#optimizer-and-scheduler"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Optimizer and scheduler</span></h3> <p data-svelte-h="svelte-8lb90i">DeepSpeed and Transformers optimizer and scheduler can be mixed and matched as long as you don’t enable <code>offload_optimizer</code>. When <code>offload_optimizer</code> is enabled, you could use a non-DeepSpeed optimizer (except for LAMB) as long as it has both a CPU and GPU implementation.</p> <div class="course-tip course-tip-orange bg-gradient-to-br dark:bg-gradient-to-r before:border-orange-500 dark:before:border-orange-800 from-orange-50 dark:from-gray-900 to-white dark:to-gray-950 border border-orange-50 text-orange-700 dark:text-gray-400"><p data-svelte-h="svelte-130jdwu">The optimizer and scheduler parameters for the config file can be set from the command line to avoid hard to find errors. For example, if the learning rate is set to a different value in another place you can override it from the command line. Aside from the optimizer and scheduler parameters, you’ll need to ensure your <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a> command line arguments match the DeepSpeed configuration.</p></div> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">optimizer </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">scheduler </div></div> <div class="language-select"><p data-svelte-h="svelte-3qnbzy">DeepSpeed offers several <a href="https://www.deepspeed.ai/docs/config-json/#optimizer-parameters" rel="nofollow">optimizers</a> (Adam, AdamW, OneBitAdam, and LAMB) but you can also import other optimizers from PyTorch. If you don’t configure the optimizer in the config, the <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a> automatically selects AdamW and either uses the supplied values or the default values for the following parameters from the command line: <code>lr</code>, <code>adam_beta1</code>, <code>adam_beta2</code>, <code>adam_epsilon</code>, <code>weight_decay</code>.</p> <p data-svelte-h="svelte-1c8r8fs">You can set the parameters to <code>"auto"</code> or manually input your own desired values.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{
	<span class="hljs-attr">"optimizer":</span> {
	<span class="hljs-attr">"type":</span> <span class="hljs-string">"AdamW"</span>,
	<span class="hljs-attr">"params":</span> {
	<span class="hljs-attr">"lr":</span> <span class="hljs-string">"auto"</span>,
	<span class="hljs-attr">"betas":</span> <span class="hljs-string">"auto"</span>,
	<span class="hljs-attr">"eps":</span> <span class="hljs-string">"auto"</span>,
	<span class="hljs-attr">"weight_decay":</span> <span class="hljs-string">"auto"</span>
	}
	}
	}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1ecbnix">You can also use an unsupported optimizer by adding the following to the top level configuration.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{
	<span class="hljs-attr">"zero_allow_untested_optimizer":</span> <span class="hljs-literal">true</span>
	}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ybmwnz">From DeepSpeed==0.8.3 on, if you want to use offload, you’ll also need to the following to the top level configuration because offload works best with DeepSpeed’s CPU Adam optimizer.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{
	<span class="hljs-attr">"zero_force_ds_cpu_optimizer":</span> <span class="hljs-literal">false</span>
	}<!-- HTML_TAG_END --></pre></div> </div> <h3 class="relative group"><a id="precision" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#precision"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Precision</span></h3> <p data-svelte-h="svelte-j5yqaq">Deepspeed supports fp32, fp16, and bf16 mixed precision.</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">fp32 </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">fp16 </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">bf16 </div></div> <div class="language-select"><p data-svelte-h="svelte-14q857h">If your model doesn’t work well with mixed precision, for example if it wasn’t pretrained in mixed precision, you may encounter overflow or underflow issues which can cause NaN loss. For these cases, you should use full fp32 precision by explicitly disabling the default fp16 mode.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{
	<span class="hljs-attr">"fp16":</span> {
	<span class="hljs-attr">"enabled":</span> <span class="hljs-literal">false</span>
	}
	}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1q8uiyj">For Ampere GPUs and PyTorch > 1.7, it automatically switches to the more efficient <a href="https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices" rel="nofollow">tf32</a> format for some operations but the results are still in fp32. You can control it from the <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a> by setting <code>--tf32</code> to enable it, and <code>--tf32 0</code> or <code>--no_tf32</code> to disable it.</p> </div> <h3 class="relative group"><a id="batch-size" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#batch-size"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Batch size</span></h3> <p data-svelte-h="svelte-1ok976q">The batch size can be auto-configured or explicitly set. If you choose to use the <code>"auto"</code> option, <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a> sets <code>train_micro_batch_size_per_gpu</code> to the value of args.<code>per_device_train_batch_size</code> and <code>train_batch_size</code> to <code>args.world_size * args.per_device_train_batch_size * args.gradient_accumulation_steps</code>.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{
	<span class="hljs-attr">"train_micro_batch_size_per_gpu":</span> <span class="hljs-string">"auto"</span>,
	<span class="hljs-attr">"train_batch_size":</span> <span class="hljs-string">"auto"</span>
	}<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="gradient-accumulation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#gradient-accumulation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Gradient accumulation</span></h3> <p data-svelte-h="svelte-4j1mvg">Gradient accumulation can be auto-configured or explicitly set. If you choose to use the <code>"auto"</code> option, <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a> sets it to the value of <code>args.gradient_accumulation_steps</code>.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{
	<span class="hljs-attr">"gradient_accumulation_steps":</span> <span class="hljs-string">"auto"</span>
	}
	<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="gradient-clipping" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#gradient-clipping"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Gradient clipping</span></h3> <p data-svelte-h="svelte-9zx7h9">Gradient clipping can be auto-configured or explicitly set. If you choose to use the <code>"auto"</code> option, <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a> sets it to the value of <code>args.max_grad_norm</code>.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{
	<span class="hljs-attr">"gradient_clipping":</span> <span class="hljs-string">"auto"</span>
	}<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="communication-data-type" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#communication-data-type"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Communication data type</span></h3> <p data-svelte-h="svelte-gpfm4j">For communication collectives like reduction, gathering and scattering operations, a separate data type is used.</p> <p data-svelte-h="svelte-mvoza6">All gather and scatter operations are performed in the same data type the data is in. For example, if you’re training with bf16, the data is also gathered in bf16 because gathering is a non-lossy operation.</p> <p data-svelte-h="svelte-1ulbaso">Reduce operations are lossy, for example when gradients are averaged across multiple GPUs. When the communication is done in fp16 or bf16, it is more likely to be lossy because adding multiple numbers in low precision isn’t exact. This is especially the case with bf16 which has a lower precision than fp16. For this reason, fp16 is the default for reduction operations because the loss is minimal when averaging gradients.</p> <p data-svelte-h="svelte-b2p3xv">You can choose the communication data type by setting the <code>communication_data_type</code> parameter in the config file. For example, choosing fp32 adds a small amount of overhead but ensures the reduction operation is accumulated in fp32 and when it is ready, it is downcasted to whichever half-precision dtype you’re training in.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{
	<span class="hljs-attr">"communication_data_type":</span> <span class="hljs-string">"fp32"</span>
	}<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="deployment" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#deployment"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Deployment</span></h2> <p data-svelte-h="svelte-18flzm5">DeepSpeed can be deployed by different launchers such as <a href="https://pytorch.org/docs/stable/elastic/run.html" rel="nofollow">torchrun</a>, the <code>deepspeed</code> launcher, or <a href="https://huggingface.co/docs/accelerate/basic_tutorials/launch#using-accelerate-launch" rel="nofollow">Accelerate</a>. To deploy, add <code>--deepspeed ds_config.json</code> to the <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a> command line. It’s recommended to use DeepSpeed’s <a href="https://deepspeed.readthedocs.io/en/latest/initialize.html#argument-parsing" rel="nofollow"><code>add_config_arguments</code></a> utility to add any necessary command line arguments to your code.</p> <p data-svelte-h="svelte-1270bwq">This guide will show you how to deploy DeepSpeed with the <code>deepspeed</code> launcher for different training setups. You can check out this <a href="https://github.com/huggingface/transformers/issues/8771#issuecomment-759248400" rel="nofollow">post</a> for more practical usage examples.</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">multi-GPU </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">single-GPU </div></div> <div class="language-select"><p data-svelte-h="svelte-1crs7lp">To deploy DeepSpeed on multiple GPUs, add the <code>--num_gpus</code> parameter. If you want to use all available GPUs, you don’t need to add <code>--num_gpus</code>. The example below uses 2 GPUs.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->deepspeed --num_gpus=2 examples/pytorch/translation/run_translation.py \
	--deepspeed tests/deepspeed/ds_config_zero3.json \
	--model_name_or_path google-t5/t5-small --per_device_train_batch_size 1 \
	--output_dir output_dir --overwrite_output_dir --fp16 \
	--do_train --max_train_samples 500 --num_train_epochs 1 \
	--dataset_name wmt16 --dataset_config <span class="hljs-string">"ro-en"</span> \
	--source_lang en --target_lang ro<!-- HTML_TAG_END --></pre></div> </div> <h3 class="relative group"><a id="multi-node-deployment" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#multi-node-deployment"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Multi-node deployment</span></h3> <p data-svelte-h="svelte-1vvv5pa">A node is one or more GPUs for running a workload. A more powerful setup is a multi-node setup which can be launched with the <code>deepspeed</code> launcher. For this guide, let’s assume there are two nodes with 8 GPUs each. The first node can be accessed <code>ssh hostname1</code> and the second node with <code>ssh hostname2</code>. Both nodes must be able to communicate with each other locally over ssh without a password.</p> <p data-svelte-h="svelte-mp54lp">By default, DeepSpeed expects your multi-node environment to use a shared storage. If this is not the case and each node can only see the local filesystem, you need to adjust the config file to include a <a href="https://www.deepspeed.ai/docs/config-json/#checkpoint-options" rel="nofollow"><code>checkpoint</code></a> to allow loading without access to a shared filesystem:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{
	<span class="hljs-attr">"checkpoint":</span> {
	<span class="hljs-attr">"use_node_local_storage":</span> <span class="hljs-literal">true</span>
	}
	}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1embmgy">You could also use the <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a>’s <code>--save_on_each_node</code> argument to automatically add the above <code>checkpoint</code> to your config.</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">torchrun </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">deepspeed </div></div> <div class="language-select"><p data-svelte-h="svelte-1bt3z60">For <a href="https://pytorch.org/docs/stable/elastic/run.html" rel="nofollow">torchrun</a>, you have to ssh to each node and run the following command on both of them. The launcher waits until both nodes are synchronized before launching the training.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->torchrun --nproc_per_node=8 --nnode=2 --node_rank=0 --master_addr=hostname1 \
	--master_port=9901 your_program.py <normal cl args> --deepspeed ds_config.json<!-- HTML_TAG_END --></pre></div> </div> <h3 class="relative group"><a id="slurm" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#slurm"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>SLURM</span></h3> <p data-svelte-h="svelte-ow6xqy">In a SLURM environment, you’ll need to adapt your SLURM script to your specific SLURM environment. An example SLURM script may look like:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment">#SBATCH --job-name=test-nodes # name</span>
	<span class="hljs-comment">#SBATCH --nodes=2 # nodes</span>
	<span class="hljs-comment">#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!</span>
	<span class="hljs-comment">#SBATCH --cpus-per-task=10 # number of cores per tasks</span>
	<span class="hljs-comment">#SBATCH --gres=gpu:8 # number of gpus</span>
	<span class="hljs-comment">#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS)</span>
	<span class="hljs-comment">#SBATCH --output=%x-%j.out # output file name</span>

	<span class="hljs-built_in">export</span> GPUS_PER_NODE=8
	<span class="hljs-built_in">export</span> MASTER_ADDR=$(scontrol show hostnames <span class="hljs-variable">$SLURM_JOB_NODELIST</span> \| <span class="hljs-built_in">head</span> -n 1)
	<span class="hljs-built_in">export</span> MASTER_PORT=9901

	srun --jobid <span class="hljs-variable">$SLURM_JOBID</span> bash -c <span class="hljs-string">'python -m torch.distributed.run \
	--nproc_per_node $GPUS_PER_NODE --nnodes $SLURM_NNODES --node_rank $SLURM_PROCID \
	--master_addr $MASTER_ADDR --master_port $MASTER_PORT \
	your_program.py <normal cl args> --deepspeed ds_config.json'</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-xfixmf">Then you can schedule your multi-node deployment with the following command which launches training simultaneously on all nodes.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->sbatch launch.slurm<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="notebook" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#notebook"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Notebook</span></h3> <p data-svelte-h="svelte-4dix30">The <code>deepspeed</code> launcher doesn’t support deployment from a notebook so you’ll need to emulate the distributed environment. However, this only works for 1 GPU. If you want to use more than 1 GPU, you must use a multi-process environment for DeepSpeed to work. This means you have to use the <code>deepspeed</code> launcher which can’t be emulated as shown here.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># DeepSpeed requires a distributed environment even when only one process is used.</span>
	<span class="hljs-comment"># This emulates a launcher in the notebook</span>
	<span class="hljs-keyword">import</span> os

	os.environ[<span class="hljs-string">"MASTER_ADDR"</span>] = <span class="hljs-string">"localhost"</span>
	os.environ[<span class="hljs-string">"MASTER_PORT"</span>] = <span class="hljs-string">"9994"</span> <span class="hljs-comment"># modify if RuntimeError: Address already in use</span>
	os.environ[<span class="hljs-string">"RANK"</span>] = <span class="hljs-string">"0"</span>
	os.environ[<span class="hljs-string">"LOCAL_RANK"</span>] = <span class="hljs-string">"0"</span>
	os.environ[<span class="hljs-string">"WORLD_SIZE"</span>] = <span class="hljs-string">"1"</span>

	<span class="hljs-comment"># Now proceed as normal, plus pass the DeepSpeed config file</span>
	training_args = TrainingArguments(..., deepspeed=<span class="hljs-string">"ds_config_zero3.json"</span>)
	trainer = Trainer(...)
	trainer.train()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1fw1nb2">If you want to create the config file on the fly in the notebook in the current directory, you could have a dedicated cell.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->%%bash
	cat <<<span class="hljs-string">'EOT'</span> > ds_config_zero3.json
	{
	<span class="hljs-string">"fp16"</span>: {
	<span class="hljs-string">"enabled"</span>: <span class="hljs-string">"auto"</span>,
	<span class="hljs-string">"loss_scale"</span>: <span class="hljs-number">0</span>,
	<span class="hljs-string">"loss_scale_window"</span>: <span class="hljs-number">1000</span>,
	<span class="hljs-string">"initial_scale_power"</span>: <span class="hljs-number">16</span>,
	<span class="hljs-string">"hysteresis"</span>: <span class="hljs-number">2</span>,
	<span class="hljs-string">"min_loss_scale"</span>: <span class="hljs-number">1</span>
	},

	<span class="hljs-string">"optimizer"</span>: {
	<span class="hljs-string">"type"</span>: <span class="hljs-string">"AdamW"</span>,
	<span class="hljs-string">"params"</span>: {
	<span class="hljs-string">"lr"</span>: <span class="hljs-string">"auto"</span>,
	<span class="hljs-string">"betas"</span>: <span class="hljs-string">"auto"</span>,
	<span class="hljs-string">"eps"</span>: <span class="hljs-string">"auto"</span>,
	<span class="hljs-string">"weight_decay"</span>: <span class="hljs-string">"auto"</span>
	}
	},

	<span class="hljs-string">"scheduler"</span>: {
	<span class="hljs-string">"type"</span>: <span class="hljs-string">"WarmupLR"</span>,
	<span class="hljs-string">"params"</span>: {
	<span class="hljs-string">"warmup_min_lr"</span>: <span class="hljs-string">"auto"</span>,
	<span class="hljs-string">"warmup_max_lr"</span>: <span class="hljs-string">"auto"</span>,
	<span class="hljs-string">"warmup_num_steps"</span>: <span class="hljs-string">"auto"</span>
	}
	},

	<span class="hljs-string">"zero_optimization"</span>: {
	<span class="hljs-string">"stage"</span>: <span class="hljs-number">3</span>,
	<span class="hljs-string">"offload_optimizer"</span>: {
	<span class="hljs-string">"device"</span>: <span class="hljs-string">"cpu"</span>,
	<span class="hljs-string">"pin_memory"</span>: true
	},
	<span class="hljs-string">"offload_param"</span>: {
	<span class="hljs-string">"device"</span>: <span class="hljs-string">"cpu"</span>,
	<span class="hljs-string">"pin_memory"</span>: true
	},
	<span class="hljs-string">"overlap_comm"</span>: true,
	<span class="hljs-string">"contiguous_gradients"</span>: true,
	<span class="hljs-string">"sub_group_size"</span>: <span class="hljs-number">1e9</span>,
	<span class="hljs-string">"reduce_bucket_size"</span>: <span class="hljs-string">"auto"</span>,
	<span class="hljs-string">"stage3_prefetch_bucket_size"</span>: <span class="hljs-string">"auto"</span>,
	<span class="hljs-string">"stage3_param_persistence_threshold"</span>: <span class="hljs-string">"auto"</span>,
	<span class="hljs-string">"stage3_max_live_parameters"</span>: <span class="hljs-number">1e9</span>,
	<span class="hljs-string">"stage3_max_reuse_distance"</span>: <span class="hljs-number">1e9</span>,
	<span class="hljs-string">"stage3_gather_16bit_weights_on_model_save"</span>: true
	},

	<span class="hljs-string">"gradient_accumulation_steps"</span>: <span class="hljs-string">"auto"</span>,
	<span class="hljs-string">"gradient_clipping"</span>: <span class="hljs-string">"auto"</span>,
	<span class="hljs-string">"steps_per_print"</span>: <span class="hljs-number">2000</span>,
	<span class="hljs-string">"train_batch_size"</span>: <span class="hljs-string">"auto"</span>,
	<span class="hljs-string">"train_micro_batch_size_per_gpu"</span>: <span class="hljs-string">"auto"</span>,
	<span class="hljs-string">"wall_clock_breakdown"</span>: false
	}
	EOT<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-xvblu4">If the training script is in a file and not in a notebook cell, you can launch <code>deepspeed</code> normally from the shell in a notebook cell. For example, to launch <code>run_translation.py</code>:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->!git clone https://github.com/huggingface/transformers
	!cd transformers; deepspeed examples/pytorch/translation/run_translation.py ...<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1dkflot">You could also use <code>%%bash</code> magic and write multi-line code to run the shell program, but you won’t be able to view the logs until training is complete. With <code>%%bash</code> magic, you don’t need to emulate a distributed environment.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->%%bash

	git clone https://github.com/huggingface/transformers
	cd transformers
	deepspeed examples/pytorch/translation/run_translation.py ...<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="save-model-weights" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#save-model-weights"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Save model weights</span></h2> <p data-svelte-h="svelte-v8wycf">DeepSpeed stores the main full precision fp32 weights in custom checkpoint optimizer files (the glob pattern looks like <code>global_step/optim_states.pt</code>) and are saved under the normal checkpoint.</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">fp16 </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">fp32 </div></div> <div class="language-select"><p data-svelte-h="svelte-1ffu7au">A model trained with ZeRO-2 saves the pytorch_model.bin weights in fp16. To save the model weights in fp16 for a model trained with ZeRO-3, you need to set <code>"stage3_gather_16bit_weights_on_model_save": true</code> because the model weights are partitioned across multiple GPUs. Otherwise, the <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a> won’t save the weights in fp16 and it won’t create a pytorch_model.bin file. This is because DeepSpeed’s state_dict contains a placeholder instead of the real weights and you won’t be able to load them.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{
	<span class="hljs-attr">"zero_optimization":</span> {
	<span class="hljs-attr">"stage3_gather_16bit_weights_on_model_save":</span> <span class="hljs-literal">true</span>
	}
	}<!-- HTML_TAG_END --></pre></div> </div> <h2 class="relative group"><a id="zero-inference" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#zero-inference"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>ZeRO Inference</span></h2> <p data-svelte-h="svelte-1cd47xa"><a href="https://www.deepspeed.ai/2022/09/09/zero-inference.html" rel="nofollow">ZeRO Inference</a> places the model weights in CPU or NVMe memory to avoid burdening the GPU which makes it possible to run inference with huge models on a GPU. Inference doesn’t require any large additional amounts of memory for the optimizer states and gradients so you can fit much larger batches and/or sequence lengths on the same hardware.</p> <p data-svelte-h="svelte-1tp1ibg">ZeRO Inference shares the same configuration file as <a href="#zero-configuration">ZeRO-3</a>, and ZeRO-2 and ZeRO-1 configs won’t work because they don’t provide any benefits for inference.</p> <p data-svelte-h="svelte-mo45sb">To run ZeRO Inference, pass your usual training arguments to the <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.TrainingArguments">TrainingArguments</a> class and add the <code>--do_eval</code> argument.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->deepspeed --num_gpus=2 your_program.py <normal cl args> --do_eval --deepspeed ds_config.json<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="non-trainer-deepspeed-integration" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#non-trainer-deepspeed-integration"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Non-Trainer DeepSpeed integration</span></h2> <p data-svelte-h="svelte-1ieycn3">DeepSpeed also works with Transformers without the <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a> class. This is handled by the <code>HfDeepSpeedConfig</code> which only takes care of gathering ZeRO-3 parameters and splitting a model across multiple GPUs when you call <a href="/docs/transformers/pr_33913/en/main_classes/model#transformers.PreTrainedModel.from_pretrained">from_pretrained()</a>.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1i298n1">If you want everything automatically taken care of for you, try using DeepSpeed with the <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a>! You’ll need to follow the <a href="https://www.deepspeed.ai/" rel="nofollow">DeepSpeed documentation</a>, and manually configure the parameter values in the config file (you can’t use the <code>"auto"</code> value).</p></div> <p data-svelte-h="svelte-1u0t2ts">To efficiently deploy ZeRO-3, you must instantiate the <code>HfDeepSpeedConfig</code> object before the model and keep that object alive:</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">pretrained model </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">non-pretrained model </div></div> <div class="language-select"><div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers.integrations <span class="hljs-keyword">import</span> HfDeepSpeedConfig
	<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModel
	<span class="hljs-keyword">import</span> deepspeed

	ds_config = {...} <span class="hljs-comment"># deepspeed config object or path to the file</span>
	<span class="hljs-comment"># must run before instantiating the model to detect zero 3</span>
	dschf = HfDeepSpeedConfig(ds_config) <span class="hljs-comment"># keep this object alive</span>
	model = AutoModel.from_pretrained(<span class="hljs-string">"openai-community/gpt2"</span>)
	engine = deepspeed.initialize(model=model, config_params=ds_config, ...)<!-- HTML_TAG_END --></pre></div> </div> <h3 class="relative group"><a id="non-trainer-zero-inference" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#non-trainer-zero-inference"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Non-Trainer ZeRO Inference</span></h3> <p data-svelte-h="svelte-1rz053a">To run ZeRO Inference without the <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a> in cases where you can’t fit a model onto a single GPU, try using additional GPUs or/and offloading to CPU memory. The important nuance to understand here is that the way ZeRO is designed, you can process different inputs on different GPUs in parallel.</p> <p data-svelte-h="svelte-o4r1i6">Make sure to:</p> <ul data-svelte-h="svelte-1bodp8e"><li>disable CPU offload if you have enough GPU memory (since it slows things down).</li> <li>enable bf16 if you have an Ampere or newer GPU to make things faster. If you don’t have one of these GPUs, you may enable fp16 as long as you don’t use a model pretrained in bf16 (T5 models) because it may lead to an overflow error.</li></ul> <p data-svelte-h="svelte-1h6hvj8">Take a look at the following script to get a better idea of how to run ZeRO Inference without the <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a> on a model that won’t fit on a single GPU.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment">#!/usr/bin/env python</span>

	<span class="hljs-comment"># This script demonstrates how to use Deepspeed ZeRO in an inference mode when one can't fit a model</span>
	<span class="hljs-comment"># into a single GPU</span>
	<span class="hljs-comment">#</span>
	<span class="hljs-comment"># 1. Use 1 GPU with CPU offload</span>
	<span class="hljs-comment"># 2. Or use multiple GPUs instead</span>
	<span class="hljs-comment">#</span>
	<span class="hljs-comment"># First you need to install deepspeed: pip install deepspeed</span>
	<span class="hljs-comment">#</span>
	<span class="hljs-comment"># Here we use a 3B "bigscience/T0_3B" model which needs about 15GB GPU RAM - so 1 largish or 2</span>
	<span class="hljs-comment"># small GPUs can handle it. or 1 small GPU and a lot of CPU memory.</span>
	<span class="hljs-comment">#</span>
	<span class="hljs-comment"># To use a larger model like "bigscience/T0" which needs about 50GB, unless you have an 80GB GPU -</span>
	<span class="hljs-comment"># you will need 2-4 gpus. And then you can adapt the script to handle more gpus if you want to</span>
	<span class="hljs-comment"># process multiple inputs at once.</span>
	<span class="hljs-comment">#</span>
	<span class="hljs-comment"># The provided deepspeed config also activates CPU memory offloading, so chances are that if you</span>
	<span class="hljs-comment"># have a lot of available CPU memory and you don't mind a slowdown you should be able to load a</span>
	<span class="hljs-comment"># model that doesn't normally fit into a single GPU. If you have enough GPU memory the program will</span>
	<span class="hljs-comment"># run faster if you don't want offload to CPU - so disable that section then.</span>
	<span class="hljs-comment">#</span>
	<span class="hljs-comment"># To deploy on 1 gpu:</span>
	<span class="hljs-comment">#</span>
	<span class="hljs-comment"># deepspeed --num_gpus 1 t0.py</span>
	<span class="hljs-comment"># or:</span>
	<span class="hljs-comment"># python -m torch.distributed.run --nproc_per_node=1 t0.py</span>
	<span class="hljs-comment">#</span>
	<span class="hljs-comment"># To deploy on 2 gpus:</span>
	<span class="hljs-comment">#</span>
	<span class="hljs-comment"># deepspeed --num_gpus 2 t0.py</span>
	<span class="hljs-comment"># or:</span>
	<span class="hljs-comment"># python -m torch.distributed.run --nproc_per_node=2 t0.py</span>

	<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer, AutoConfig, AutoModelForSeq2SeqLM
	<span class="hljs-keyword">from</span> transformers.integrations <span class="hljs-keyword">import</span> HfDeepSpeedConfig
	<span class="hljs-keyword">import</span> deepspeed
	<span class="hljs-keyword">import</span> os
	<span class="hljs-keyword">import</span> torch

	os.environ[<span class="hljs-string">"TOKENIZERS_PARALLELISM"</span>] = <span class="hljs-string">"false"</span> <span class="hljs-comment"># To avoid warnings about parallelism in tokenizers</span>

	<span class="hljs-comment"># distributed setup</span>
	local_rank = <span class="hljs-built_in">int</span>(os.getenv(<span class="hljs-string">"LOCAL_RANK"</span>, <span class="hljs-string">"0"</span>))
	world_size = <span class="hljs-built_in">int</span>(os.getenv(<span class="hljs-string">"WORLD_SIZE"</span>, <span class="hljs-string">"1"</span>))
	torch.cuda.set_device(local_rank)
	deepspeed.init_distributed()

	model_name = <span class="hljs-string">"bigscience/T0_3B"</span>

	config = AutoConfig.from_pretrained(model_name)
	model_hidden_size = config.d_model

	<span class="hljs-comment"># batch size has to be divisible by world_size, but can be bigger than world_size</span>
	train_batch_size = <span class="hljs-number">1</span> * world_size

	<span class="hljs-comment"># ds_config notes</span>
	<span class="hljs-comment">#</span>
	<span class="hljs-comment"># - enable bf16 if you use Ampere or higher GPU - this will run in mixed precision and will be</span>
	<span class="hljs-comment"># faster.</span>
	<span class="hljs-comment">#</span>
	<span class="hljs-comment"># - for older GPUs you can enable fp16, but it'll only work for non-bf16 pretrained models - e.g.</span>
	<span class="hljs-comment"># all official t5 models are bf16-pretrained</span>
	<span class="hljs-comment">#</span>
	<span class="hljs-comment"># - set offload_param.device to "none" or completely remove the `offload_param` section if you don't</span>
	<span class="hljs-comment"># - want CPU offload</span>
	<span class="hljs-comment">#</span>
	<span class="hljs-comment"># - if using `offload_param` you can manually finetune stage3_param_persistence_threshold to control</span>
	<span class="hljs-comment"># - which params should remain on gpus - the larger the value the smaller the offload size</span>
	<span class="hljs-comment">#</span>
	<span class="hljs-comment"># For in-depth info on Deepspeed config see</span>
	<span class="hljs-comment"># https://huggingface.co/docs/transformers/main/main_classes/deepspeed</span>

	<span class="hljs-comment"># keeping the same format as json for consistency, except it uses lower case for true/false</span>
	<span class="hljs-comment"># fmt: off</span>
	ds_config = {
	<span class="hljs-string">"fp16"</span>: {
	<span class="hljs-string">"enabled"</span>: <span class="hljs-literal">False</span>
	},
	<span class="hljs-string">"bf16"</span>: {
	<span class="hljs-string">"enabled"</span>: <span class="hljs-literal">False</span>
	},
	<span class="hljs-string">"zero_optimization"</span>: {
	<span class="hljs-string">"stage"</span>: <span class="hljs-number">3</span>,
	<span class="hljs-string">"offload_param"</span>: {
	<span class="hljs-string">"device"</span>: <span class="hljs-string">"cpu"</span>,
	<span class="hljs-string">"pin_memory"</span>: <span class="hljs-literal">True</span>
	},
	<span class="hljs-string">"overlap_comm"</span>: <span class="hljs-literal">True</span>,
	<span class="hljs-string">"contiguous_gradients"</span>: <span class="hljs-literal">True</span>,
	<span class="hljs-string">"reduce_bucket_size"</span>: model_hidden_size * model_hidden_size,
	<span class="hljs-string">"stage3_prefetch_bucket_size"</span>: <span class="hljs-number">0.9</span> * model_hidden_size * model_hidden_size,
	<span class="hljs-string">"stage3_param_persistence_threshold"</span>: <span class="hljs-number">10</span> * model_hidden_size
	},
	<span class="hljs-string">"steps_per_print"</span>: <span class="hljs-number">2000</span>,
	<span class="hljs-string">"train_batch_size"</span>: train_batch_size,
	<span class="hljs-string">"train_micro_batch_size_per_gpu"</span>: <span class="hljs-number">1</span>,
	<span class="hljs-string">"wall_clock_breakdown"</span>: <span class="hljs-literal">False</span>
	}
	<span class="hljs-comment"># fmt: on</span>

	<span class="hljs-comment"># next line instructs transformers to partition the model directly over multiple gpus using</span>
	<span class="hljs-comment"># deepspeed.zero.Init when model's `from_pretrained` method is called.</span>
	<span class="hljs-comment">#</span>
	<span class="hljs-comment"># it has to be run before loading the model AutoModelForSeq2SeqLM.from_pretrained(model_name)</span>
	<span class="hljs-comment">#</span>
	<span class="hljs-comment"># otherwise the model will first be loaded normally and only partitioned at forward time which is</span>
	<span class="hljs-comment"># less efficient and when there is little CPU RAM may fail</span>
	dschf = HfDeepSpeedConfig(ds_config) <span class="hljs-comment"># keep this object alive</span>

	<span class="hljs-comment"># now a model can be loaded.</span>
	model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

	<span class="hljs-comment"># initialise Deepspeed ZeRO and store only the engine object</span>
	ds_engine = deepspeed.initialize(model=model, config_params=ds_config)[<span class="hljs-number">0</span>]
	ds_engine.module.<span class="hljs-built_in">eval</span>() <span class="hljs-comment"># inference</span>

	<span class="hljs-comment"># Deepspeed ZeRO can process unrelated inputs on each GPU. So for 2 gpus you process 2 inputs at once.</span>
	<span class="hljs-comment"># If you use more GPUs adjust for more.</span>
	<span class="hljs-comment"># And of course if you have just one input to process you then need to pass the same string to both gpus</span>
	<span class="hljs-comment"># If you use only one GPU, then you will have only rank 0.</span>
	rank = torch.distributed.get_rank()
	<span class="hljs-keyword">if</span> rank == <span class="hljs-number">0</span>:
	text_in = <span class="hljs-string">"Is this review positive or negative? Review: this is the best cast iron skillet you will ever buy"</span>
	<span class="hljs-keyword">elif</span> rank == <span class="hljs-number">1</span>:
	text_in = <span class="hljs-string">"Is this review positive or negative? Review: this is the worst restaurant ever"</span>

	tokenizer = AutoTokenizer.from_pretrained(model_name)
	inputs = tokenizer.encode(text_in, return_tensors=<span class="hljs-string">"pt"</span>).to(device=local_rank)
	<span class="hljs-keyword">with</span> torch.no_grad():
	outputs = ds_engine.module.generate(inputs, synced_gpus=<span class="hljs-literal">True</span>)
	text_out = tokenizer.decode(outputs[<span class="hljs-number">0</span>], skip_special_tokens=<span class="hljs-literal">True</span>)
	<span class="hljs-built_in">print</span>(<span class="hljs-string">f"rank<span class="hljs-subst">{rank}</span>:\n in=<span class="hljs-subst">{text_in}</span>\n out=<span class="hljs-subst">{text_out}</span>"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-15z7rtv">Save the script as t0.py and launch it:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->$ deepspeed --num_gpus 2 t0.py
	rank0:
	<span class="hljs-keyword">in</span>=Is this review positive or negative? Review: this is the best cast iron skillet you will ever buy
	out=Positive
	rank1:
	<span class="hljs-keyword">in</span>=Is this review positive or negative? Review: this is the worst restaurant ever
	out=negative<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-sp9gkx">This is a very basic example and you’ll want to adapt it to your use case.</p> <h3 class="relative group"><a id="generate" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#generate"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Generate</span></h3> <p data-svelte-h="svelte-1ybd6vz">Using multiple GPUs with ZeRO-3 for generation requires synchronizing the GPUs by setting <code>synced_gpus=True</code> in the <a href="/docs/transformers/pr_33913/en/main_classes/text_generation#transformers.GenerationMixin.generate">generate()</a> method. Otherwise, if one GPU is finished generating before another one, the whole system hangs because the remaining GPUs haven’t received the weight shard from the GPU that finished first.</p> <p data-svelte-h="svelte-1fhttxy">For Transformers>=4.28, if <code>synced_gpus</code> is automatically set to <code>True</code> if multiple GPUs are detected during generation.</p> <h2 class="relative group"><a id="troubleshoot" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#troubleshoot"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Troubleshoot</span></h2> <p data-svelte-h="svelte-12ltrno">When you encounter an issue, you should consider whether DeepSpeed is the cause of the problem because often it isn’t (unless it’s super obviously and you can see DeepSpeed modules in the exception)! The first step should be to retry your setup without DeepSpeed, and if the problem persists, then you can report the issue. If the issue is a core DeepSpeed problem and unrelated to the Transformers integration, open an Issue on the <a href="https://github.com/microsoft/DeepSpeed" rel="nofollow">DeepSpeed repository</a>.</p> <p data-svelte-h="svelte-13gytjz">For issues related to the Transformers integration, please provide the following information:</p> <ul data-svelte-h="svelte-1lhstl"><li><p>the full DeepSpeed config file</p></li> <li><p>the command line arguments of the <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a>, or <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.TrainingArguments">TrainingArguments</a> arguments if you’re scripting the <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.Trainer">Trainer</a> setup yourself (don’t dump the <a href="/docs/transformers/pr_33913/en/main_classes/trainer#transformers.TrainingArguments">TrainingArguments</a> which has dozens of irrelevant entries)</p></li> <li><p>the outputs of:</p></li></ul> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->python -c <span class="hljs-string">'import torch; print(f"torch: {torch.__version__}")'</span>
	python -c <span class="hljs-string">'import transformers; print(f"transformers: {transformers.__version__}")'</span>
	python -c <span class="hljs-string">'import deepspeed; print(f"deepspeed: {deepspeed.__version__}")'</span><!-- HTML_TAG_END --></pre></div> <ul data-svelte-h="svelte-1mmu83y"><li><p>a link to a Google Colab notebook to reproduce the issue</p></li> <li><p>if impossible, a standard and non-custom dataset we can use and also try to use an existing example to reproduce the issue with</p></li></ul> <p data-svelte-h="svelte-oia8x0">The following sections provide a guide for resolving two of the most common issues.</p> <h3 class="relative group"><a id="deepspeed-process-killed-at-startup" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#deepspeed-process-killed-at-startup"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>DeepSpeed process killed at startup</span></h3> <p data-svelte-h="svelte-1yhr0cr">When the DeepSpeed process is killed during launch without a traceback, that usually means the program tried to allocate more CPU memory than your system has or your process tried to allocate more CPU memory than allowed leading the OS kernel to terminate the process. In this case, check whether your configuration file has either <code>offload_optimizer</code>, <code>offload_param</code> or both configured to offload to the CPU.</p> <p data-svelte-h="svelte-1s2gzku">If you have NVMe and ZeRO-3 setup, experiment with offloading to the NVMe (<a href="https://deepspeed.readthedocs.io/en/latest/memory.html" rel="nofollow">estimate</a> the memory requirements for your model).</p> <h3 class="relative group"><a id="nan-loss" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#nan-loss"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>NaN loss</span></h3> <p data-svelte-h="svelte-4mvvb2">NaN loss often occurs when a model is pretrained in bf16 and then you try to use it with fp16 (especially relevant for TPU trained models). To resolve this, use fp32 or bf16 if your hardware supports it (TPU, Ampere GPUs or newer).</p> <p data-svelte-h="svelte-7me63n">The other issue may be related to using fp16. For example, if this is your fp16 configuration:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{
	<span class="hljs-attr">"fp16":</span> {
	<span class="hljs-attr">"enabled":</span> <span class="hljs-string">"auto"</span>,
	<span class="hljs-attr">"loss_scale":</span> <span class="hljs-number">0</span>,
	<span class="hljs-attr">"loss_scale_window":</span> <span class="hljs-number">1000</span>,
	<span class="hljs-attr">"initial_scale_power":</span> <span class="hljs-number">16</span>,
	<span class="hljs-attr">"hysteresis":</span> <span class="hljs-number">2</span>,
	<span class="hljs-attr">"min_loss_scale":</span> <span class="hljs-number">1</span>
	}
	}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1livx4i">You might see the following <code>OVERFLOW!</code> messages in the logs:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->0%\| \| 0/189 [00:00<?, ?it/s]
	[deepscale] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 262144, reducing to 262144
	1%\|▌ \| 1/189 [00:00<01:26, 2.17it/s]
	[deepscale] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 262144, reducing to 131072.0
	1%\|█▏
	[...]
	[deepscale] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 1, reducing to 1
	14%\|████████████████▌ \| 27/189 [00:14<01:13, 2.21it/s]
	[deepscale] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 1, reducing to 1
	15%\|█████████████████▏ \| 28/189 [00:14<01:13, 2.18it/s]
	[deepscale] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 1, reducing to 1
	15%\|█████████████████▊ \| 29/189 [00:15<01:13, 2.18it/s]
	[deepscale] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 1, reducing to 1
	[...]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1fiingf">This means the DeepSpeed loss scaler is unable to find a scaling coefficient to overcome loss overflow. To fix it, try a higher <code>initial_scale_power</code> value (32 usually works).</p> <h2 class="relative group"><a id="resources" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#resources"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Resources</span></h2> <p data-svelte-h="svelte-1ufcnmq">DeepSpeed ZeRO is a powerful technology for training and loading very large models for inference with limited GPU resources, making it more accessible to everyone. To learn more about DeepSpeed, feel free to read the <a href="https://www.microsoft.com/en-us/research/search/?q=deepspeed" rel="nofollow">blog posts</a>, <a href="https://www.deepspeed.ai/getting-started/" rel="nofollow">documentation</a>, and <a href="https://github.com/microsoft/deepspeed" rel="nofollow">GitHub repository</a>.</p> <p data-svelte-h="svelte-khzrsx">The following papers are also a great resource for learning more about ZeRO:</p> <ul data-svelte-h="svelte-o0yfva"><li><a href="https://hf.co/papers/1910.02054" rel="nofollow">ZeRO: Memory Optimizations Toward Training Trillion Parameter Models</a></li> <li><a href="https://hf.co/papers/2101.06840" rel="nofollow">ZeRO-Offload: Democratizing Billion-Scale Model Training</a></li> <li><a href="https://hf.co/papers/2104.07857" rel="nofollow">ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning</a></li></ul> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/transformers/blob/main/docs/source/en/deepspeed.md" target="_blank"><span data-svelte-h="svelte-1kd6by1"><</span> <span data-svelte-h="svelte-x0xyl0">></span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>

	<script>
	{
	__sveltekit_z647wz = {
	assets: "/docs/transformers/pr_33913/en",
	base: "/docs/transformers/pr_33913/en",
	env: {}
	};

	const element = document.currentScript.parentElement;

	const data = [null,null];

	Promise.all([
	import("/docs/transformers/pr_33913/en/_app/immutable/entry/start.b67f883f.js"),
	import("/docs/transformers/pr_33913/en/_app/immutable/entry/app.e436b1f2.js")
	]).then(([kit, app]) => {
	kit.start(app, element, {
	node_ids: [0, 19],
	data,
	form: null,
	error: null
	});
	});
	}
	</script>

Xet Storage Details

Size:: 150 kB
Xet hash:: fc7e9f51ec59bf70fe6c32b21777df23262e0bce6df9994cd96857df7b055cd1

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.