Buckets:

hf-doc-build
/

doc-dev

Files

xet

hf-doc-build/doc-dev / transformers /pr_33892 /en /deepspeed.html

rtrm

about 1 month ago

download

raw

143 kB

	<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{"title":"DeepSpeed","local":"deepspeed","sections":[{"title":"Choosing a ZeRO stage","local":"choosing-a-zero-stage","sections":[],"depth":2},{"title":"Config file","local":"config-file","sections":[{"title":"DeepSpeed versus Trainer parameters","local":"deepspeed-versus-trainer-parameters","sections":[],"depth":3},{"title":"ZeRO stage","local":"zero-stage","sections":[],"depth":3},{"title":"Initialize large models","local":"initialize-large-models","sections":[],"depth":3},{"title":"NVMe","local":"nvme","sections":[],"depth":3}],"depth":2},{"title":"Training features","local":"training-features","sections":[{"title":"Gradient checkpointing","local":"gradient-checkpointing","sections":[],"depth":3},{"title":"Batch size","local":"batch-size","sections":[],"depth":3},{"title":"Communication data type","local":"communication-data-type","sections":[],"depth":3},{"title":"Gradient accumulation","local":"gradient-accumulation","sections":[],"depth":3},{"title":"Gradient clipping","local":"gradient-clipping","sections":[],"depth":3},{"title":"Mixed precision training","local":"mixed-precision-training","sections":[],"depth":3},{"title":"Optimizer and scheduler","local":"optimizer-and-scheduler","sections":[],"depth":3},{"title":"Universal checkpointing","local":"universal-checkpointing","sections":[],"depth":3}],"depth":2},{"title":"Deploy","local":"deploy","sections":[{"title":"Multi-node","local":"multi-node","sections":[],"depth":3},{"title":"Slurm","local":"slurm","sections":[],"depth":3},{"title":"Jupyter Notebook","local":"jupyter-notebook","sections":[],"depth":3}],"depth":2},{"title":"Save model weights","local":"save-model-weights","sections":[{"title":"fp16","local":"fp16","sections":[],"depth":3},{"title":"fp32","local":"fp32","sections":[],"depth":3}],"depth":2},{"title":"Non-Trainer 
integration","local":"non-trainer-integration","sections":[],"depth":2},{"title":"Troubleshoot","local":"troubleshoot","sections":[{"title":"Process killed at startup","local":"process-killed-at-startup","sections":[],"depth":3},{"title":"NaN loss","local":"nan-loss","sections":[],"depth":3}],"depth":2},{"title":"Resources","local":"resources","sections":[],"depth":2}],"depth":1}">
	<link href="/docs/transformers/pr_33892/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
	<link rel="modulepreload" href="/docs/transformers/pr_33892/en/_app/immutable/entry/start.b2c4257a.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33892/en/_app/immutable/chunks/scheduler.31fdf58d.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33892/en/_app/immutable/chunks/singletons.9860629f.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33892/en/_app/immutable/chunks/index.252883d5.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33892/en/_app/immutable/chunks/paths.e85c0ec8.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33892/en/_app/immutable/entry/app.05ef1f97.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33892/en/_app/immutable/chunks/preload-helper.40847a0e.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33892/en/_app/immutable/chunks/index.2f76fdf0.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33892/en/_app/immutable/nodes/0.ca4aafa4.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33892/en/_app/immutable/chunks/each.e59479a4.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33892/en/_app/immutable/nodes/21.3583ca18.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33892/en/_app/immutable/chunks/CopyLLMTxtMenu.ff482081.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33892/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.71f274cc.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33892/en/_app/immutable/chunks/IconCopy.ac192424.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33892/en/_app/immutable/chunks/CodeBlock.ab12f8e1.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33892/en/_app/immutable/chunks/HfOption.fb051768.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"DeepSpeed","local":"deepspeed","sections":[{"title":"Choosing a ZeRO stage","local":"choosing-a-zero-stage","sections":[],"depth":2},{"title":"Config file","local":"config-file","sections":[{"title":"DeepSpeed versus Trainer parameters","local":"deepspeed-versus-trainer-parameters","sections":[],"depth":3},{"title":"ZeRO stage","local":"zero-stage","sections":[],"depth":3},{"title":"Initialize large models","local":"initialize-large-models","sections":[],"depth":3},{"title":"NVMe","local":"nvme","sections":[],"depth":3}],"depth":2},{"title":"Training features","local":"training-features","sections":[{"title":"Gradient checkpointing","local":"gradient-checkpointing","sections":[],"depth":3},{"title":"Batch size","local":"batch-size","sections":[],"depth":3},{"title":"Communication data type","local":"communication-data-type","sections":[],"depth":3},{"title":"Gradient accumulation","local":"gradient-accumulation","sections":[],"depth":3},{"title":"Gradient clipping","local":"gradient-clipping","sections":[],"depth":3},{"title":"Mixed precision training","local":"mixed-precision-training","sections":[],"depth":3},{"title":"Optimizer and scheduler","local":"optimizer-and-scheduler","sections":[],"depth":3},{"title":"Universal checkpointing","local":"universal-checkpointing","sections":[],"depth":3}],"depth":2},{"title":"Deploy","local":"deploy","sections":[{"title":"Multi-node","local":"multi-node","sections":[],"depth":3},{"title":"Slurm","local":"slurm","sections":[],"depth":3},{"title":"Jupyter Notebook","local":"jupyter-notebook","sections":[],"depth":3}],"depth":2},{"title":"Save model weights","local":"save-model-weights","sections":[{"title":"fp16","local":"fp16","sections":[],"depth":3},{"title":"fp32","local":"fp32","sections":[],"depth":3}],"depth":2},{"title":"Non-Trainer 
integration","local":"non-trainer-integration","sections":[],"depth":2},{"title":"Troubleshoot","local":"troubleshoot","sections":[{"title":"Process killed at startup","local":"process-killed-at-startup","sections":[],"depth":3},{"title":"NaN loss","local":"nan-loss","sections":[],"depth":3}],"depth":2},{"title":"Resources","local":"resources","sections":[],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 max-sm:gap-0.5 h-6 max-sm:h-5 px-2 max-sm:px-1.5 text-[11px] max-sm:text-[9px] font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0"><svg class="w-3 h-3 max-sm:w-2.5 max-sm:h-2.5" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-6 max-sm:h-5 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" 
aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible w-3 h-3 max-sm:w-2.5 max-sm:h-2.5 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="deepspeed" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#deepspeed"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>DeepSpeed</span></h1> <p data-svelte-h="svelte-1x58rgl"><a href="https://www.deepspeed.ai/" rel="nofollow">DeepSpeed</a> is designed to optimize distributed training for large models with data, model, pipeline, and even a combination of all three <a href="./perf_train_gpu_many">parallelism</a> strategies to provide better memory efficiency and faster training speeds. 
This is achieved with the <a href="https://hf.co/papers/1910.02054" rel="nofollow">Zero Redundancy Optimizer (ZeRO)</a> which consists of three stages.</p> <table data-svelte-h="svelte-g16f2l"><thead><tr><th>ZeRO stage</th> <th>description</th></tr></thead> <tbody><tr><td>1</td> <td>partition optimizer states</td></tr> <tr><td>2</td> <td>partition optimizer and gradient states</td></tr> <tr><td>3</td> <td>partition optimizer, gradient, and parameters</td></tr></tbody></table> <p data-svelte-h="svelte-167sw94">Each stage progressively saves more memory, allowing really large models to fit and train on a single GPU. All ZeRO stages, as well as offloading optimizer memory and computations from the GPU to the CPU, are integrated with <a href="/docs/transformers/pr_33892/en/main_classes/trainer#transformers.Trainer">Trainer</a>. Provide a config file or one of the example templates to <a href="/docs/transformers/pr_33892/en/main_classes/trainer#transformers.Trainer">Trainer</a> to enable DeepSpeed features.</p> <p data-svelte-h="svelte-1d5hp5">This guide walks you through setting up a DeepSpeed config file, how to enable its features in <a href="/docs/transformers/pr_33892/en/main_classes/trainer#transformers.Trainer">Trainer</a>, and deploy for training.</p> <p data-svelte-h="svelte-113jy4c">Install DeepSpeed from either PyPI or Transformers. 
For more detailed installation instructions, refer to the DeepSpeed <a href="https://www.deepspeed.ai/tutorials/advanced-install/" rel="nofollow">installation</a> or GitHub <a href="https://github.com/microsoft/deepspeed#installation" rel="nofollow">README</a>.</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">PyPI </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">Transformers </div></div> <div class="language-select"><div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pip install deepspeed<!-- HTML_TAG_END 
--></pre></div> </div> <blockquote class="warning" data-svelte-h="svelte-1srgy08"><p>Refer to the <a href="./debugging#deepspeed-cuda-issues">DeepSpeed CUDA installation</a> if you’re having trouble with your installation. While DeepSpeed has a pip installable package, it is highly recommended to <a href="https://www.deepspeed.ai/tutorials/advanced-install/#install-deepspeed-from-source" rel="nofollow">install it from source</a> to ensure it matches your hardware and to support certain features which aren’t available in the PyPI distribution.</p></blockquote> <p data-svelte-h="svelte-9txqst">DeepSpeed provides a tool for estimating the required CPU and GPU memory for the parameters, optimizer and gradient states. You’ll also need to reserve some memory for the CUDA kernels and activations.</p> <p data-svelte-h="svelte-4qy75k">Run the command below to check the memory requirements for <a href="https://huggingface.co/bigscience/T0_3B" rel="nofollow">bigscience/T0_3B</a> on a single GPU.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div 
class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->$ python -c <span class="hljs-string">'from transformers import AutoModel; \
	from deepspeed.runtime.zero.stage3 import estimate_zero3_model_states_mem_needs_all_live; \
	model = AutoModel.from_pretrained("bigscience/T0_3B"); \
	estimate_zero3_model_states_mem_needs_all_live(model, num_gpus_per_node=1, num_nodes=1)'</span>
	[...]
	Estimated memory needed <span class="hljs-keyword">for</span> params, optim states and gradients <span class="hljs-keyword">for</span> a:
	HW: Setup with 1 node, 1 GPU per node.
	SW: Model with 2783M total params, 65M largest layer params.
	per CPU \| per GPU \| Options
	70.00GB \| 0.25GB \| offload_param=cpu , offload_optimizer=cpu , zero_init=1
	70.00GB \| 0.25GB \| offload_param=cpu , offload_optimizer=cpu , zero_init=0
	62.23GB \| 5.43GB \| offload_param=none, offload_optimizer=cpu , zero_init=1
	62.23GB \| 5.43GB \| offload_param=none, offload_optimizer=cpu , zero_init=0
	0.37GB \| 46.91GB \| offload_param=none, offload_optimizer=none, zero_init=1
	15.56GB \| 46.91GB \| offload_param=none, offload_optimizer=none, zero_init=0<!-- HTML_TAG_END --></pre></div> <blockquote class="tip" data-svelte-h="svelte-dsbzg4"><p>If you have enough GPU memory, disable CPU and NVMe offload to speed everything up.</p></blockquote> <h2 class="relative group"><a id="choosing-a-zero-stage" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#choosing-a-zero-stage"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Choosing a ZeRO stage</span></h2> <p data-svelte-h="svelte-1hpx19l">Consider the table below to help you choose the appropriate ZeRO stage for training because there is a trade-off between training speed and memory usage. 
The table orders the ZeRO stages from fastest to slowest and from least memory usage to most.</p> <table data-svelte-h="svelte-d6tlin"><thead><tr><th>fastest</th> <th>least memory usage</th></tr></thead> <tbody><tr><td>ZeRO-1</td> <td>ZeRO-3 + offload</td></tr> <tr><td>ZeRO-2</td> <td>ZeRO-3</td></tr> <tr><td>ZeRO-2 + offload</td> <td>ZeRO-2 + offload</td></tr> <tr><td>ZeRO-3</td> <td>ZeRO-2</td></tr> <tr><td>ZeRO-3 + offload</td> <td>ZeRO-1</td></tr></tbody></table> <p data-svelte-h="svelte-1qh23gw">Decide the type of performance you’re optimizing for, speed or memory, and then work backwards to discover the best ZeRO stage for your use case. For example, if you’re optimizing for speed, start with the fastest ZeRO stage and if you run out of memory, try the next stage which is slower but more memory efficient.</p> <h2 class="relative group"><a id="config-file" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#config-file"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Config file</span></h2> <p data-svelte-h="svelte-3ah9qn">Once you’ve decided on a ZeRO stage, set up a config file to enable DeepSpeed with <a 
href="/docs/transformers/pr_33892/en/main_classes/trainer#transformers.Trainer">Trainer</a>. The config file contains all the parameters for how to configure and set up your training. When the training script is executed, DeepSpeed logs the configuration from <a href="/docs/transformers/pr_33892/en/main_classes/trainer#transformers.Trainer">Trainer</a> to the console so you can see exactly what’s being used.</p> <blockquote class="tip"><p data-svelte-h="svelte-f0k2uz">Find a complete list of DeepSpeed configuration options on the <a href="https://www.deepspeed.ai/docs/config-json/" rel="nofollow">DeepSpeed Configuration JSON</a> reference. There are also practical examples of various DeepSpeed configurations in the <a href="https://github.com/microsoft/DeepSpeedExamples" rel="nofollow">DeepSpeedExamples</a> repository or the main <a href="https://github.com/microsoft/DeepSpeed" rel="nofollow">DeepSpeed</a> repository. Run the command below to quickly find specific examples.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform 
-translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->git <span class="hljs-built_in">clone</span> https://github.com/microsoft/DeepSpeedExamples
	<span class="hljs-built_in">cd</span> DeepSpeedExamples
	find . -name <span class="hljs-string">'*json'</span>
	<span class="hljs-comment"># find examples with the Lamb optimizer</span>
	grep -i Lamb $(find . -name <span class="hljs-string">'*json'</span>)<!-- HTML_TAG_END --></pre></div></blockquote> <p data-svelte-h="svelte-n7pgem">The config file is passed as a path to a JSON file if you’re training from the command line interface or as a nested dict object if you’re using <a href="/docs/transformers/pr_33892/en/main_classes/trainer#transformers.Trainer">Trainer</a> in a notebook.</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">path to file </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">nested dict </div></div> <div class="language-select"><div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: 
transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->TrainingArguments(
	deepspeed=<span class="hljs-string">"path/to/deepspeed_config.json"</span>,
	...,
	)<!-- HTML_TAG_END --></pre></div> </div> <h3 class="relative group"><a id="deepspeed-versus-trainer-parameters" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#deepspeed-versus-trainer-parameters"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>DeepSpeed versus Trainer parameters</span></h3> <p data-svelte-h="svelte-ugzll6">There are three types of config parameters.</p> <ol data-svelte-h="svelte-9lanx9"><li>Some config parameters are shared by DeepSpeed and <a href="/docs/transformers/pr_33892/en/main_classes/trainer#transformers.Trainer">Trainer</a> making it difficult to identify errors when there are conflicting definitions. In this case, configure these parameters from the <a href="/docs/transformers/pr_33892/en/main_classes/trainer#transformers.Trainer">Trainer</a> command line arguments.</li> <li>Some config parameters are automatically derived from the model configuration and don’t need to be manually configured. <a href="/docs/transformers/pr_33892/en/main_classes/trainer#transformers.Trainer">Trainer</a> uses the config value <code>auto</code> to set the most correct or efficient option. 
You could define these parameters explicitly, but you must take care to ensure the <a href="/docs/transformers/pr_33892/en/main_classes/trainer#transformers.Trainer">Trainer</a> and DeepSpeed config parameters match. Mismatches may cause training to fail in very difficult to detect ways.</li> <li>Some config parameters are specific to DeepSpeed and should be manually set based on your training requirements.</li></ol> <p data-svelte-h="svelte-1xpsv2w">There are two ways to modify the config parameters.</p> <blockquote class="tip" data-svelte-h="svelte-1vbk7am"><p>Some values, such as <code>scheduler.params.total_num_steps</code>, are calculated by <a href="/docs/transformers/pr_33892/en/main_classes/trainer#transformers.Trainer">Trainer</a> during training.</p></blockquote> <ol data-svelte-h="svelte-a3ucs2"><li>Create or load a DeepSpeed config to use as the main config.</li> <li>Create a <a href="/docs/transformers/pr_33892/en/main_classes/trainer#transformers.TrainingArguments">TrainingArguments</a> object based on the DeepSpeed config values.</li></ol> <h3 class="relative group"><a id="zero-stage" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#zero-stage"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 
11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>ZeRO stage</span></h3> <p data-svelte-h="svelte-1ewr032">Each ZeRO stage config is defined in <code>zero_optimization</code>.</p> <p data-svelte-h="svelte-1f7xr36">For a more detailed explanation of each parameter, refer to the <a href="https://www.deepspeed.ai/docs/config-json/" rel="nofollow">DeepSpeed Configuration JSON</a> reference. These parameters must be set up with DeepSpeed because <a href="/docs/transformers/pr_33892/en/main_classes/trainer#transformers.Trainer">Trainer</a> doesn’t provide equivalent command line arguments.</p> <blockquote class="warning" data-svelte-h="svelte-b1rcyk"><p>DeepSpeed doesn’t validate parameter names, and any typos will fall back on the parameter’s default setting. Observe the DeepSpeed engine startup log messages to see what values are being used.</p></blockquote> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">ZeRO-1 </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">ZeRO-2 </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">ZeRO-3 </div></div> <div class="language-select"><p data-svelte-h="svelte-49w3aw">ZeRO-1 shards the optimizer states across GPUs and you can expect a small speed up.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code 
excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{
	<span class="hljs-attr">"zero_optimization":</span> {
	<span class="hljs-attr">"stage":</span> <span class="hljs-number">1</span>
	}
	}<!-- HTML_TAG_END --></pre></div> </div> <h3 class="relative group"><a id="nvme" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#nvme"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>NVMe</span></h3> <p data-svelte-h="svelte-11g1ewl"><a href="https://hf.co/papers/2104.07857" rel="nofollow">ZeRO-Infinity</a> offloads model states to the CPU and/or NVMe to save even more memory. Smart partitioning and tiling algorithms allow each GPU to send and receive very small amounts of data during offloading such that a modern NVMe can fit an even larger total memory pool than is available to your training process. ZeRO-Infinity requires ZeRO-3.</p> <p data-svelte-h="svelte-1ecykwm">Depending on the CPU and NVMe memory available, you can offload both the <a href="https://www.deepspeed.ai/docs/config-json/#optimizer-offloading" rel="nofollow">optimizer states</a> and <a href="https://www.deepspeed.ai/docs/config-json/#parameter-offloading" rel="nofollow">parameters</a>, just one of them, or none of them. 
Make sure the <code>nvme_path</code> points to an NVMe device, because while it still works with a regular hard drive or solid state drive, it’ll be significantly slower. With a modern NVMe, you can expect peak transfer speeds of ~3.5GB/s for read operations and ~3GB/s for write operations.</p> <p data-svelte-h="svelte-2lak25">Consider running a <a href="https://github.com/microsoft/DeepSpeed/issues/998" rel="nofollow">benchmark</a> on your training setup to determine the optimal <code>aio</code> configuration.</p> <p data-svelte-h="svelte-7arozb">The example ZeRO-3 and ZeRO-Infinity config below sets most of the parameter values to <code>auto</code>, but you can also manually configure these values.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-punctuation">{</span>
	<span class="hljs-attr">"fp16"</span><span class="hljs-punctuation">:</span> <span class="hljs-punctuation">{</span>
	<span class="hljs-attr">"enabled"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"auto"</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"loss_scale"</span><span class="hljs-punctuation">:</span> <span class="hljs-number">0</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"loss_scale_window"</span><span class="hljs-punctuation">:</span> <span class="hljs-number">1000</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"initial_scale_power"</span><span class="hljs-punctuation">:</span> <span class="hljs-number">16</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"hysteresis"</span><span class="hljs-punctuation">:</span> <span class="hljs-number">2</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"min_loss_scale"</span><span class="hljs-punctuation">:</span> <span class="hljs-number">1</span>
	<span class="hljs-punctuation">}</span><span class="hljs-punctuation">,</span>

	<span class="hljs-attr">"optimizer"</span><span class="hljs-punctuation">:</span> <span class="hljs-punctuation">{</span>
	<span class="hljs-attr">"type"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"AdamW"</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"params"</span><span class="hljs-punctuation">:</span> <span class="hljs-punctuation">{</span>
	<span class="hljs-attr">"lr"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"auto"</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"betas"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"auto"</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"eps"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"auto"</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"weight_decay"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"auto"</span>
	<span class="hljs-punctuation">}</span>
	<span class="hljs-punctuation">}</span><span class="hljs-punctuation">,</span>

	<span class="hljs-attr">"scheduler"</span><span class="hljs-punctuation">:</span> <span class="hljs-punctuation">{</span>
	<span class="hljs-attr">"type"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"WarmupLR"</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"params"</span><span class="hljs-punctuation">:</span> <span class="hljs-punctuation">{</span>
	<span class="hljs-attr">"warmup_min_lr"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"auto"</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"warmup_max_lr"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"auto"</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"warmup_num_steps"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"auto"</span>
	<span class="hljs-punctuation">}</span>
	<span class="hljs-punctuation">}</span><span class="hljs-punctuation">,</span>

	<span class="hljs-attr">"zero_optimization"</span><span class="hljs-punctuation">:</span> <span class="hljs-punctuation">{</span>
	<span class="hljs-attr">"stage"</span><span class="hljs-punctuation">:</span> <span class="hljs-number">3</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"offload_optimizer"</span><span class="hljs-punctuation">:</span> <span class="hljs-punctuation">{</span>
	<span class="hljs-attr">"device"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"nvme"</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"nvme_path"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"/local_nvme"</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"pin_memory"</span><span class="hljs-punctuation">:</span> <span class="hljs-literal"><span class="hljs-keyword">true</span></span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"buffer_count"</span><span class="hljs-punctuation">:</span> <span class="hljs-number">4</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"fast_init"</span><span class="hljs-punctuation">:</span> <span class="hljs-literal"><span class="hljs-keyword">false</span></span>
	<span class="hljs-punctuation">}</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"offload_param"</span><span class="hljs-punctuation">:</span> <span class="hljs-punctuation">{</span>
	<span class="hljs-attr">"device"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"nvme"</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"nvme_path"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"/local_nvme"</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"pin_memory"</span><span class="hljs-punctuation">:</span> <span class="hljs-literal"><span class="hljs-keyword">true</span></span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"buffer_count"</span><span class="hljs-punctuation">:</span> <span class="hljs-number">5</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"buffer_size"</span><span class="hljs-punctuation">:</span> <span class="hljs-number">1e8</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"max_in_cpu"</span><span class="hljs-punctuation">:</span> <span class="hljs-number">1e9</span>
	<span class="hljs-punctuation">}</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"aio"</span><span class="hljs-punctuation">:</span> <span class="hljs-punctuation">{</span>
	<span class="hljs-attr">"block_size"</span><span class="hljs-punctuation">:</span> <span class="hljs-number">262144</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"queue_depth"</span><span class="hljs-punctuation">:</span> <span class="hljs-number">32</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"thread_count"</span><span class="hljs-punctuation">:</span> <span class="hljs-number">1</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"single_submit"</span><span class="hljs-punctuation">:</span> <span class="hljs-literal"><span class="hljs-keyword">false</span></span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"overlap_events"</span><span class="hljs-punctuation">:</span> <span class="hljs-literal"><span class="hljs-keyword">true</span></span>
	<span class="hljs-punctuation">}</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"overlap_comm"</span><span class="hljs-punctuation">:</span> <span class="hljs-literal"><span class="hljs-keyword">true</span></span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"contiguous_gradients"</span><span class="hljs-punctuation">:</span> <span class="hljs-literal"><span class="hljs-keyword">true</span></span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"sub_group_size"</span><span class="hljs-punctuation">:</span> <span class="hljs-number">1e9</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"reduce_bucket_size"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"auto"</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"stage3_prefetch_bucket_size"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"auto"</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"stage3_param_persistence_threshold"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"auto"</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"stage3_max_live_parameters"</span><span class="hljs-punctuation">:</span> <span class="hljs-number">1e9</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"stage3_max_reuse_distance"</span><span class="hljs-punctuation">:</span> <span class="hljs-number">1e9</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"stage3_gather_16bit_weights_on_model_save"</span><span class="hljs-punctuation">:</span> <span class="hljs-literal"><span class="hljs-keyword">true</span></span>
	<span class="hljs-punctuation">}</span><span class="hljs-punctuation">,</span>

	<span class="hljs-attr">"gradient_accumulation_steps"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"auto"</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"gradient_clipping"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"auto"</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"steps_per_print"</span><span class="hljs-punctuation">:</span> <span class="hljs-number">2000</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"train_batch_size"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"auto"</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"train_micro_batch_size_per_gpu"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"auto"</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"wall_clock_breakdown"</span><span class="hljs-punctuation">:</span> <span class="hljs-literal"><span class="hljs-keyword">false</span></span>
	<span class="hljs-punctuation">}</span><!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="training-features" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#training-features"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Training features</span></h2> <p data-svelte-h="svelte-b95nmg">DeepSpeed supports many training features that can be configured in the config file. 
This section describes some of the most important features.</p> <h3 class="relative group"><a id="gradient-checkpointing" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#gradient-checkpointing"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Gradient checkpointing</span></h3> <p data-svelte-h="svelte-e0krdz">Gradient checkpointing saves memory by only storing <em>some</em> of the intermediate activations instead of storing <em>all</em> of them. It is useful for fitting larger models on the GPU without running out of memory or to increase the batch size for better performance. Training speed is slower though.</p> <ul data-svelte-h="svelte-t7u5tq"><li>For a Transformers model, set <code>model.gradient_checkpointing_enable()</code> or add <code>--gradient_checkpointing</code> in the <a href="/docs/transformers/pr_33892/en/main_classes/trainer#transformers.TrainingArguments">TrainingArguments</a>.</li> <li>For a non-Transformers model, use the DeepSpeed <a href="https://deepspeed.readthedocs.io/en/latest/activation-checkpointing.html" rel="nofollow">Activation Checkpointing API</a>. 
Replacing Transformers modeling code and <a href="https://pytorch.org/docs/stable/checkpoint.html" rel="nofollow">torch.utils.checkpoint</a> with the DeepSpeed API gives you more flexibility because you can offload the forward activations to the CPU memory instead of recalculating them.</li></ul> <h3 class="relative group"><a id="batch-size" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#batch-size"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Batch size</span></h3> <p data-svelte-h="svelte-1fcuglo">The batch size can be automatically configured or manually set. 
When you choose the <code>"auto"</code> option, <a href="/docs/transformers/pr_33892/en/main_classes/trainer#transformers.Trainer">Trainer</a> sets <code>train_micro_batch_size_per_gpu</code> to the value of <code>per_device_train_batch_size</code> and <code>train_batch_size</code> to the value of <code>world_size * per_device_train_batch_size * gradient_accumulation_steps</code>.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-punctuation">{</span>
	<span class="hljs-attr">"train_micro_batch_size_per_gpu"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"auto"</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"train_batch_size"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"auto"</span>
	<span class="hljs-punctuation">}</span><!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="communication-data-type" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#communication-data-type"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Communication data type</span></h3> <p data-svelte-h="svelte-3cq4r9">A separate data type is used for communication collectives like reduction, gathering and scattering operations.</p> <p data-svelte-h="svelte-18k9mzv">All gather and scatter operations are performed in the same data type the data is in. For example, if you’re training in bf16, the data is also gathered in bf16 because gathering is a non-lossy operation.</p> <p data-svelte-h="svelte-1b1bpok">Reduce operations are lossy, for example, when gradients are averaged across multiple GPUs. When the communication is done in fp16 or bf16, it’s more likely to be lossy because adding multiple numbers in low precision isn’t exact. This is especially the case with bf16 which has a lower precision than fp16. 
For this reason, fp16 is the default for reduction operations because the loss is minimal when averaging gradients.</p> <p data-svelte-h="svelte-1y41oji">Choose the communication data type by setting the <code>communication_data_type</code> parameter in the config file. For example, choosing fp32 adds a small amount of overhead but ensures the reduction operation is accumulated in fp32, and when it is ready, it’s downcast to whichever half-precision data type you’re training in.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-punctuation">{</span>
	<span class="hljs-attr">"communication_data_type"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"fp32"</span>
	<span class="hljs-punctuation">}</span><!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="gradient-accumulation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#gradient-accumulation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Gradient accumulation</span></h3> <p data-svelte-h="svelte-5jkqwz">Gradient accumulation accumulates gradients over several mini-batches of data before updating parameters. It stores fewer gradients and enables training with a larger <em>effective batch size</em>. Training speed is slower, but it’s useful for overcoming memory constraints.</p> <p data-svelte-h="svelte-nveypb">Gradient accumulation can be automatically configured or manually set. 
When you choose the <code>"auto"</code> option, <a href="/docs/transformers/pr_33892/en/main_classes/trainer#transformers.Trainer">Trainer</a> sets it to the value of <code>gradient_accumulation_steps</code>.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-punctuation">{</span>
	<span class="hljs-attr">"gradient_accumulation_steps"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"auto"</span>
	<span class="hljs-punctuation">}</span><!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="gradient-clipping" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#gradient-clipping"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Gradient clipping</span></h3> <p data-svelte-h="svelte-l72egn">Gradient clipping is useful for preventing exploding gradients which can lead to instability during training. It sets a maximum threshold value and rescales the gradients if their norm exceeds the threshold.</p> <p data-svelte-h="svelte-1vx4j6a">Gradient clipping can be automatically configured or manually set. 
When you choose the <code>"auto"</code> option, <a href="/docs/transformers/pr_33892/en/main_classes/trainer#transformers.Trainer">Trainer</a> sets it to the value of <code>max_grad_norm</code>.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-punctuation">{</span>
	<span class="hljs-attr">"gradient_clipping"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"auto"</span>
	<span class="hljs-punctuation">}</span><!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="mixed-precision-training" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#mixed-precision-training"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Mixed precision training</span></h3> <p data-svelte-h="svelte-gbn77h">Mixed precision accelerates training speed by performing some calculations in half-precision, but it also maintains some calculations in full-precision to preserve accuracy. 
DeepSpeed supports fp32, fp16, and bf16 data types.</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">fp32 </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">fp16 </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">bf16 </div></div> <div class="language-select"><p data-svelte-h="svelte-4l2sai">Train in fp32 if a model wasn’t pretrained in mixed precision because it may cause underflow or overflow errors. Disable fp16, the default, in this case.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" 
style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-punctuation">{</span>
	<span class="hljs-attr">"fp16"</span><span class="hljs-punctuation">:</span> <span class="hljs-punctuation">{</span>
	<span class="hljs-attr">"enabled"</span><span class="hljs-punctuation">:</span> <span class="hljs-literal"><span class="hljs-keyword">false</span></span>
	<span class="hljs-punctuation">}</span>
	<span class="hljs-punctuation">}</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1xgbogv">For Ampere GPUs and PyTorch 1.7+, the more efficient <a href="https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices" rel="nofollow">tf32</a> mode is automatically enabled for some operations but the results are still in fp32. Configure it in <a href="/docs/transformers/pr_33892/en/main_classes/trainer#transformers.Trainer">Trainer</a> by setting <code>--tf32</code> to enable it, and <code>--tf32 0</code> or <code>--no_tf32</code> to disable it.</p> </div> <h3 class="relative group"><a id="optimizer-and-scheduler" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#optimizer-and-scheduler"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Optimizer and scheduler</span></h3> <p data-svelte-h="svelte-1hg70p7">DeepSpeed and Transformers optimizers and schedulers can be mixed and matched if <code>offload_optimizer</code> isn’t enabled. 
When <code>offload_optimizer</code> is enabled, use a non-DeepSpeed optimizer (except for LAMB) as long as it has both a CPU and GPU implementation.</p> <p data-svelte-h="svelte-1fvcbb2">Set the optimizer and scheduler parameters for the config file from the command line to avoid hard-to-find errors. For example, if the learning rate is set to a different value in another place, you can override it from the command line.</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">optimizer </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">scheduler </div></div> <div class="language-select"><p data-svelte-h="svelte-1bf0563">DeepSpeed offers several <a href="https://www.deepspeed.ai/docs/config-json/#optimizer-parameters" rel="nofollow">optimizers</a> (Adam, AdamW, OneBitAdam, and LAMB) but you can also import other optimizers from PyTorch. 
If you don’t configure the optimizer in the config, <a href="/docs/transformers/pr_33892/en/main_classes/trainer#transformers.Trainer">Trainer</a> automatically selects AdamW and either uses the supplied values or the default values for the following parameters from the command line: <code>lr</code>, <code>adam_beta1</code>, <code>adam_beta2</code>, <code>adam_epsilon</code>, <code>weight_decay</code>.</p> <p data-svelte-h="svelte-1uw9d8y">You can set the parameters to <code>"auto"</code> or manually input your own values.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-punctuation">{</span>
	<span class="hljs-attr">"optimizer"</span><span class="hljs-punctuation">:</span> <span class="hljs-punctuation">{</span>
	<span class="hljs-attr">"type"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"AdamW"</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"params"</span><span class="hljs-punctuation">:</span> <span class="hljs-punctuation">{</span>
	<span class="hljs-attr">"lr"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"auto"</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"betas"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"auto"</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"eps"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"auto"</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"weight_decay"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"auto"</span>
	<span class="hljs-punctuation">}</span>
	<span class="hljs-punctuation">}</span>
	<span class="hljs-punctuation">}</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1k1vfhb">Use an unsupported optimizer by adding the following to the top level configuration.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-punctuation">{</span>
	<span class="hljs-attr">"zero_allow_untested_optimizer"</span><span class="hljs-punctuation">:</span> <span class="hljs-literal"><span class="hljs-keyword">true</span></span>
	<span class="hljs-punctuation">}</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-cul5b4">From DeepSpeed 0.8.3+, if you want to use offload, you’ll also need to add the following to the top level configuration because offload works best with DeepSpeed’s CPU Adam optimizer.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-punctuation">{</span>
	<span class="hljs-attr">"zero_force_ds_cpu_optimizer"</span><span class="hljs-punctuation">:</span> <span class="hljs-literal"><span class="hljs-keyword">false</span></span>
	<span class="hljs-punctuation">}</span><!-- HTML_TAG_END --></pre></div> </div> <h3 class="relative group"><a id="universal-checkpointing" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#universal-checkpointing"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Universal checkpointing</span></h3> <p data-svelte-h="svelte-f67kiq"><a href="https://www.deepspeed.ai/tutorials/universal-checkpointing" rel="nofollow">Universal Checkpointing</a> saves and loads model, optimizer and training scheduler states across different model architectures, parallelism techniques, and training configurations. 
By saving them in a Universal format, it enables easier model training continuation and fine-tuning.</p> <p data-svelte-h="svelte-1s13dgb">Resume training with a Universal checkpoint by setting <code>load_universal</code> to <code>true</code> in the config file.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-punctuation">{</span>
	<span class="hljs-attr">"checkpoint"</span><span class="hljs-punctuation">:</span> <span class="hljs-punctuation">{</span>
	<span class="hljs-attr">"load_universal"</span><span class="hljs-punctuation">:</span> <span class="hljs-literal"><span class="hljs-keyword">true</span></span>
	<span class="hljs-punctuation">}</span>
	<span class="hljs-punctuation">}</span><!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="deploy" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#deploy"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Deploy</span></h2> <p data-svelte-h="svelte-a6t3js">DeepSpeed can be deployed with its native launcher, <a href="https://pytorch.org/docs/stable/elastic/run.html" rel="nofollow">torchrun</a> or <a href="https://huggingface.co/docs/accelerate/basic_tutorials/launch#using-accelerate-launch" rel="nofollow">Accelerate</a>.</p> <p data-svelte-h="svelte-qcrs2h">Add the <code>--deepspeed ds_config.json</code> argument to <a href="/docs/transformers/pr_33892/en/main_classes/trainer#transformers.Trainer">Trainer</a> in the command line. 
It is recommended to use DeepSpeed’s <a href="https://deepspeed.readthedocs.io/en/latest/initialize.html#argument-parsing" rel="nofollow">add_config_arguments</a> utility to add any other command line arguments to your code.</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">multi-GPU </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">single-GPU </div></div> <div class="language-select"><p data-svelte-h="svelte-1njfqta">To deploy DeepSpeed on multiple GPUs, add <code>--num_gpus</code>. You don’t need to add <code>--num_gpus</code> if you’re planning on using all available GPUs.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" 
style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->deepspeed --num_gpus=2 examples/pytorch/translation/run_translation.py \
	--deepspeed tests/deepspeed/ds_config_zero3.json \
	--model_name_or_path google-t5/t5-small --per_device_train_batch_size 1 \
	--output_dir output_dir --fp16 \
	--do_train --max_train_samples 500 --num_train_epochs 1 \
	--dataset_name wmt16 --dataset_config <span class="hljs-string">"ro-en"</span> \
	--source_lang en --target_lang ro<!-- HTML_TAG_END --></pre></div> </div> <h3 class="relative group"><a id="multi-node" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#multi-node"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Multi-node</span></h3> <p data-svelte-h="svelte-1beh14s">A multi-node setup consists of multiple nodes, where each node has one or more GPUs running a workload. 
DeepSpeed expects a shared storage system, but if this is not the case, you need to adjust the config file to include a <a href="https://www.deepspeed.ai/docs/config-json/#checkpoint-options" rel="nofollow">checkpoint</a> to allow loading without access to a shared filesystem.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-punctuation">{</span>
	<span class="hljs-attr">"checkpoint"</span><span class="hljs-punctuation">:</span> <span class="hljs-punctuation">{</span>
	<span class="hljs-attr">"use_node_local_storage"</span><span class="hljs-punctuation">:</span> <span class="hljs-literal"><span class="hljs-keyword">true</span></span>
	<span class="hljs-punctuation">}</span>
	<span class="hljs-punctuation">}</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-iq4kzh">You could also use the <code>--save_on_each_node</code> parameter in <a href="/docs/transformers/pr_33892/en/main_classes/trainer#transformers.TrainingArguments">TrainingArguments</a> to automatically add the above <code>checkpoint</code> to your config.</p> <p data-svelte-h="svelte-d7ogvj">The examples below for the torchrun and DeepSpeed launchers show how to deploy two nodes with eight GPUs each. Access the first node with <code>ssh hostname1</code> and the second node with <code>ssh hostname2</code>. Both nodes must be able to communicate with each other locally over ssh without a password.</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">torchrun </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">DeepSpeed </div></div> <div class="language-select"><p data-svelte-h="svelte-bd1gud">With <a href="https://pytorch.org/docs/stable/elastic/run.html" rel="nofollow">torchrun</a>, ssh to each node and run the following command on both of them, changing <code>--node_rank</code> to <code>1</code> on the second node. 
The launcher waits until both nodes are synchronized before launching the training.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->torchrun --nproc_per_node=8 --nnode=2 --node_rank=0 --master_addr=hostname1 \
	--master_port=9901 your_program.py &lt;normal cl args&gt; --deepspeed ds_config.json<!-- HTML_TAG_END --></pre></div> </div> <h3 class="relative group"><a id="slurm" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#slurm"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Slurm</span></h3> <p data-svelte-h="svelte-jssjer"><a href="https://slurm.schedmd.com/documentation.html" rel="nofollow">Slurm</a> is a cluster management and job scheduling system. 
An example Slurm script is shown below.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment">#SBATCH --job-name=test-nodes # name</span>
	<span class="hljs-comment">#SBATCH --nodes=2 # nodes</span>
	<span class="hljs-comment">#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!</span>
	<span class="hljs-comment">#SBATCH --cpus-per-task=10 # number of cores per tasks</span>
	<span class="hljs-comment">#SBATCH --gres=gpu:8 # number of gpus</span>
	<span class="hljs-comment">#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS)</span>
	<span class="hljs-comment">#SBATCH --output=%x-%j.out # output file name</span>

	<span class="hljs-built_in">export</span> GPUS_PER_NODE=8
	<span class="hljs-built_in">export</span> MASTER_ADDR=$(scontrol show hostnames <span class="hljs-variable">$SLURM_JOB_NODELIST</span> | <span class="hljs-built_in">head</span> -n 1)
	<span class="hljs-built_in">export</span> MASTER_PORT=9901

	srun --jobid <span class="hljs-variable">$SLURM_JOBID</span> bash -c <span class="hljs-string">'python -m torch.distributed.run \
	--nproc_per_node $GPUS_PER_NODE --nnodes $SLURM_NNODES --node_rank $SLURM_PROCID \
	--master_addr $MASTER_ADDR --master_port $MASTER_PORT \
	your_program.py &lt;normal cl args&gt; --deepspeed ds_config.json'</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1up77xx">Launch training simultaneously on all nodes with the command below.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->sbatch launch.slurm<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="jupyter-notebook" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#jupyter-notebook"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 
0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Jupyter Notebook</span></h3> <p data-svelte-h="svelte-2f0twl">To use DeepSpeed in a Jupyter Notebook, you need to emulate a distributed environment because the launcher doesn’t support deployment from a notebook. This is only supported for one GPU. To use multiple GPUs, you must use a multi-process environment, which means you have to use the DeepSpeed launcher which can’t be emulated as shown here.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; 
border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># emulate a launcher in the notebook</span>
	<span class="hljs-keyword">import</span> os

	os.environ[<span class="hljs-string">"MASTER_ADDR"</span>] = <span class="hljs-string">"localhost"</span>
	os.environ[<span class="hljs-string">"MASTER_PORT"</span>] = <span class="hljs-string">"9994"</span> <span class="hljs-comment"># modify if RuntimeError: Address already in use</span>
	os.environ[<span class="hljs-string">"RANK"</span>] = <span class="hljs-string">"0"</span>
	os.environ[<span class="hljs-string">"LOCAL_RANK"</span>] = <span class="hljs-string">"0"</span>
	os.environ[<span class="hljs-string">"WORLD_SIZE"</span>] = <span class="hljs-string">"1"</span>

	training_args = TrainingArguments(..., deepspeed=<span class="hljs-string">"ds_config_zero3.json"</span>)
	trainer = Trainer(...)
	trainer.train()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-8l6b91">Create a config file on the fly in the notebook in the current directory with a dedicated cell.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->%%bash
	cat &lt;&lt;<span class="hljs-string">'EOT'</span> &gt; ds_config_zero3.json
	{
	<span class="hljs-string">"fp16"</span>: {
	<span class="hljs-string">"enabled"</span>: <span class="hljs-string">"auto"</span>,
	<span class="hljs-string">"loss_scale"</span>: <span class="hljs-number">0</span>,
	<span class="hljs-string">"loss_scale_window"</span>: <span class="hljs-number">1000</span>,
	<span class="hljs-string">"initial_scale_power"</span>: <span class="hljs-number">16</span>,
	<span class="hljs-string">"hysteresis"</span>: <span class="hljs-number">2</span>,
	<span class="hljs-string">"min_loss_scale"</span>: <span class="hljs-number">1</span>
	},

	<span class="hljs-string">"optimizer"</span>: {
	<span class="hljs-string">"type"</span>: <span class="hljs-string">"AdamW"</span>,
	<span class="hljs-string">"params"</span>: {
	<span class="hljs-string">"lr"</span>: <span class="hljs-string">"auto"</span>,
	<span class="hljs-string">"betas"</span>: <span class="hljs-string">"auto"</span>,
	<span class="hljs-string">"eps"</span>: <span class="hljs-string">"auto"</span>,
	<span class="hljs-string">"weight_decay"</span>: <span class="hljs-string">"auto"</span>
	}
	},

	<span class="hljs-string">"scheduler"</span>: {
	<span class="hljs-string">"type"</span>: <span class="hljs-string">"WarmupLR"</span>,
	<span class="hljs-string">"params"</span>: {
	<span class="hljs-string">"warmup_min_lr"</span>: <span class="hljs-string">"auto"</span>,
	<span class="hljs-string">"warmup_max_lr"</span>: <span class="hljs-string">"auto"</span>,
	<span class="hljs-string">"warmup_num_steps"</span>: <span class="hljs-string">"auto"</span>
	}
	},

	<span class="hljs-string">"zero_optimization"</span>: {
	<span class="hljs-string">"stage"</span>: <span class="hljs-number">3</span>,
	<span class="hljs-string">"offload_optimizer"</span>: {
	<span class="hljs-string">"device"</span>: <span class="hljs-string">"cpu"</span>,
	<span class="hljs-string">"pin_memory"</span>: true
	},
	<span class="hljs-string">"offload_param"</span>: {
	<span class="hljs-string">"device"</span>: <span class="hljs-string">"cpu"</span>,
	<span class="hljs-string">"pin_memory"</span>: true
	},
	<span class="hljs-string">"overlap_comm"</span>: true,
	<span class="hljs-string">"contiguous_gradients"</span>: true,
	<span class="hljs-string">"sub_group_size"</span>: <span class="hljs-number">1e9</span>,
	<span class="hljs-string">"reduce_bucket_size"</span>: <span class="hljs-string">"auto"</span>,
	<span class="hljs-string">"stage3_prefetch_bucket_size"</span>: <span class="hljs-string">"auto"</span>,
	<span class="hljs-string">"stage3_param_persistence_threshold"</span>: <span class="hljs-string">"auto"</span>,
	<span class="hljs-string">"stage3_max_live_parameters"</span>: <span class="hljs-number">1e9</span>,
	<span class="hljs-string">"stage3_max_reuse_distance"</span>: <span class="hljs-number">1e9</span>,
	<span class="hljs-string">"stage3_gather_16bit_weights_on_model_save"</span>: true
	},

	<span class="hljs-string">"gradient_accumulation_steps"</span>: <span class="hljs-string">"auto"</span>,
	<span class="hljs-string">"gradient_clipping"</span>: <span class="hljs-string">"auto"</span>,
	<span class="hljs-string">"steps_per_print"</span>: <span class="hljs-number">2000</span>,
	<span class="hljs-string">"train_batch_size"</span>: <span class="hljs-string">"auto"</span>,
	<span class="hljs-string">"train_micro_batch_size_per_gpu"</span>: <span class="hljs-string">"auto"</span>,
	<span class="hljs-string">"wall_clock_breakdown"</span>: false
	}
	EOT<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1g12cu1">If the training script is in a file and not a notebook cell, launch DeepSpeed from the shell in the notebook cell.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->!git clone https://github.com/huggingface/transformers
	!cd transformers; deepspeed examples/pytorch/translation/run_translation.py ...<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-jftyib">Another option is to use <code>%%bash</code> to run the shell program without emulating the distributed environment. However, you won’t be able to view the logs until training is complete.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->%%bash

	git clone https://github.com/huggingface/transformers
	cd transformers
	deepspeed examples/pytorch/translation/run_translation.py ...<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="save-model-weights" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#save-model-weights"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Save model weights</span></h2> <p data-svelte-h="svelte-1srdo4a">DeepSpeed stores the main fp32 weights in custom checkpoint optimizer files (<code>global_step*/*optim_states.pt</code>) which are saved under the normal checkpoint.</p> <h3 class="relative group"><a id="fp16" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#fp16"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 
0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>fp16</span></h3> <p data-svelte-h="svelte-ymywwd">ZeRO-2 saves the model weights in fp16. To save the weights in fp16 for ZeRO-3, set <code>"stage3_gather_16bit_weights_on_model_save": true</code> in the config file, because the weights are distributed across multiple GPUs.</p> <p data-svelte-h="svelte-1k6kf3z">If you don’t, <a href="/docs/transformers/pr_33892/en/main_classes/trainer#transformers.Trainer">Trainer</a> won’t save the weights in fp16 and won’t create a <code>pytorch_model.bin</code> file. This is because DeepSpeed’s state_dict contains a placeholder instead of the real weights, so you won’t be able to load it.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform 
-translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-punctuation">{</span>
	<span class="hljs-attr">"zero_optimization"</span><span class="hljs-punctuation">:</span> <span class="hljs-punctuation">{</span>
	<span class="hljs-attr">"stage"</span><span class="hljs-punctuation">:</span> <span class="hljs-number">3</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"stage3_gather_16bit_weights_on_model_save"</span><span class="hljs-punctuation">:</span> <span class="hljs-literal"><span class="hljs-keyword">true</span></span>
	<span class="hljs-punctuation">}</span>
	<span class="hljs-punctuation">}</span><!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="fp32" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#fp32"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>fp32</span></h3> <p data-svelte-h="svelte-1437twr">Unless you have a lot of free CPU memory, fp32 weights shouldn’t be saved during training because it can require a lot of memory. 
It is usually best to save the fp32 weights offline after training is complete.</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">offline </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">online </div></div> <div class="language-select"><p data-svelte-h="svelte-1c5jcch">DeepSpeed provides a <a href="https://github.com/microsoft/DeepSpeed/blob/91829476a8fd4d0d9268c03c1d56795d20a51c12/deepspeed/utils/zero_to_fp32.py#L14" rel="nofollow">zero_to_fp32.py</a> script at the top-level checkpoint folder for extracting weights at any point. This is a standalone script and you don’t need a config file or <a href="/docs/transformers/pr_33892/en/main_classes/trainer#transformers.Trainer">Trainer</a>.</p> <p data-svelte-h="svelte-6k8ztx">For example, if your checkpoint folder looks like the one shown below, then you can run the following command to create and consolidate the fp32 weights from multiple GPUs into a single <code>pytorch_model.bin</code> file. 
The script automatically discovers the subfolder <code>global_step1</code> which contains the checkpoint.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->$ <span class="hljs-built_in">ls</span> -l output_dir/checkpoint-1/
	-rw-rw-r-- 1 stas stas 1.4K Mar 27 20:42 config.json
	drwxrwxr-x 2 stas stas 4.0K Mar 25 19:52 global_step1/
	-rw-rw-r-- 1 stas stas 12 Mar 27 13:16 latest
	-rw-rw-r-- 1 stas stas 827K Mar 27 20:42 optimizer.pt
	-rw-rw-r-- 1 stas stas 231M Mar 27 20:42 pytorch_model.bin
	-rw-rw-r-- 1 stas stas 623 Mar 27 20:42 scheduler.pt
	-rw-rw-r-- 1 stas stas 1.8K Mar 27 20:42 special_tokens_map.json
	-rw-rw-r-- 1 stas stas 774K Mar 27 20:42 spiece.model
	-rw-rw-r-- 1 stas stas 1.9K Mar 27 20:42 tokenizer_config.json
	-rw-rw-r-- 1 stas stas 339 Mar 27 20:42 trainer_state.json
	-rw-rw-r-- 1 stas stas 2.3K Mar 27 20:42 training_args.bin
	-rwxrw-r-- 1 stas stas 5.5K Mar 27 13:16 zero_to_fp32.py*<!-- HTML_TAG_END --></pre></div> <blockquote class="tip" data-svelte-h="svelte-15vj6lb"><p>Run <code>python zero_to_fp32.py -h</code> for more usage details. The script requires 2x the general RAM of the final fp32 weights.</p></blockquote> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->python zero_to_fp32.py . pytorch_model.bin<!-- HTML_TAG_END --></pre></div> </div> <h2 class="relative group"><a id="non-trainer-integration" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#non-trainer-integration"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Non-Trainer integration</span></h2> <p data-svelte-h="svelte-1qq2wzh">DeepSpeed also works with Transformers without <a href="/docs/transformers/pr_33892/en/main_classes/trainer#transformers.Trainer">Trainer</a>. 
The <a href="/docs/transformers/pr_33892/en/main_classes/deepspeed#transformers.integrations.HfDeepSpeedConfig">HfDeepSpeedConfig</a> is responsible for gathering ZeRO-3 parameters and partitioning a model across multiple GPUs when <a href="/docs/transformers/pr_33892/en/main_classes/model#transformers.PreTrainedModel.from_pretrained">from_pretrained()</a> is called.</p> <p data-svelte-h="svelte-wy0wnv">You must instantiate <a href="/docs/transformers/pr_33892/en/main_classes/deepspeed#transformers.integrations.HfDeepSpeedConfig">HfDeepSpeedConfig</a> before loading a model to efficiently deploy ZeRO-3.</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">pretrained model </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">non-pretrained model </div></div> <div class="language-select"><div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal 
shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers.integrations <span class="hljs-keyword">import</span> HfDeepSpeedConfig
	<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModel
	<span class="hljs-keyword">import</span> deepspeed

	<span class="hljs-comment"># DeepSpeed config object or path to the file</span>
	ds_config = {...}
	<span class="hljs-comment"># must run before instantiating the model to detect ZeRO-3</span>
	dschf = HfDeepSpeedConfig(ds_config) <span class="hljs-comment"># keep this object alive</span>
	model = AutoModel.from_pretrained(<span class="hljs-string">"openai-community/gpt2"</span>)
	engine = deepspeed.initialize(model=model, config_params=ds_config, ...)<!-- HTML_TAG_END --></pre></div> </div> <h2 class="relative group"><a id="troubleshoot" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#troubleshoot"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Troubleshoot</span></h2> <p data-svelte-h="svelte-1e3q0we">One of the first things to check when you encounter an error is whether DeepSpeed is the cause (because often it isn’t). Retry your setup without DeepSpeed, and if the error persists, report the issue. 
If the issue is unrelated to the Transformers integration, please open the issue on the DeepSpeed <a href="https://github.com/microsoft/DeepSpeed" rel="nofollow">repository</a>.</p> <p data-svelte-h="svelte-1xz3o2z">For issues related to the Transformers integration, please provide the following information.</p> <ul><li data-svelte-h="svelte-18grubg"><p>The full DeepSpeed config file.</p></li> <li data-svelte-h="svelte-1koe6ne"><p>The command line arguments for <a href="/docs/transformers/pr_33892/en/main_classes/trainer#transformers.Trainer">Trainer</a> or the <a href="/docs/transformers/pr_33892/en/main_classes/trainer#transformers.TrainingArguments">TrainingArguments</a> if you’re scripting the <a href="/docs/transformers/pr_33892/en/main_classes/trainer#transformers.Trainer">Trainer</a> setup yourself (don’t dump the entire <a href="/docs/transformers/pr_33892/en/main_classes/trainer#transformers.TrainingArguments">TrainingArguments</a> which contains many irrelevant entries).</p></li> <li><p data-svelte-h="svelte-1k6leae">The outputs of the following commands.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform 
-translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->python -c <span class="hljs-string">'import torch; print(f"torch: {torch.__version__}")'</span>
	python -c <span class="hljs-string">'import transformers; print(f"transformers: {transformers.__version__}")'</span>
	python -c <span class="hljs-string">'import deepspeed; print(f"deepspeed: {deepspeed.__version__}")'</span><!-- HTML_TAG_END --></pre></div></li> <li data-svelte-h="svelte-1imspt9"><p>A link to a Google Colab notebook to reproduce the issue.</p></li> <li data-svelte-h="svelte-1nfpn90"><p>A standard or non-custom dataset or an existing example to reproduce the issue.</p></li></ul> <p data-svelte-h="svelte-oia8x0">The following sections provide a guide for resolving two of the most common issues.</p> <h3 class="relative group"><a id="process-killed-at-startup" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#process-killed-at-startup"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Process killed at startup</span></h3> <p data-svelte-h="svelte-11ycwbg">When the DeepSpeed process is killed during launch without a traceback, that usually means the program tried to allocate more CPU memory than is available on your system. 
Or the process may have tried to allocate more CPU memory than allowed, leading the OS kernel to terminate the process.</p> <p data-svelte-h="svelte-dyag9c">In this case, check whether your config file has either <code>offload_optimizer</code>, <code>offload_param</code>, or both configured to offload to the CPU.</p> <p data-svelte-h="svelte-1744kiq">If you have NVMe and ZeRO-3 set up, experiment with offloading to the NVMe (<a href="https://deepspeed.readthedocs.io/en/latest/memory.html" rel="nofollow">estimate</a> the memory requirements of a model first) instead.</p> <h3 class="relative group"><a id="nan-loss" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#nan-loss"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>NaN loss</span></h3> <p data-svelte-h="svelte-1atsnu2">NaN loss often occurs when a model is pretrained in bf16 and you try to use it with fp16 (especially relevant to TPU trained models). To resolve this, use fp32 or bf16 if your hardware (TPUs, Ampere GPUs or newer) supports it.</p> <p data-svelte-h="svelte-3m4nno">It is also possible that fp16 is causing overflow. 
For example, if your config file looks like the one below, you may see the following overflow errors in the logs.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-punctuation">{</span>
	<span class="hljs-attr">"fp16"</span><span class="hljs-punctuation">:</span> <span class="hljs-punctuation">{</span>
	<span class="hljs-attr">"enabled"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"auto"</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"loss_scale"</span><span class="hljs-punctuation">:</span> <span class="hljs-number">0</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"loss_scale_window"</span><span class="hljs-punctuation">:</span> <span class="hljs-number">1000</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"initial_scale_power"</span><span class="hljs-punctuation">:</span> <span class="hljs-number">16</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"hysteresis"</span><span class="hljs-punctuation">:</span> <span class="hljs-number">2</span><span class="hljs-punctuation">,</span>
	<span class="hljs-attr">"min_loss_scale"</span><span class="hljs-punctuation">:</span> <span class="hljs-number">1</span>
	<span class="hljs-punctuation">}</span>
	<span class="hljs-punctuation">}</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1gpmvr7">The <code>OVERFLOW!</code> error below is a result of the DeepSpeed loss scaler being unable to find a scaling coefficient to overcome the loss overflow. Try a higher <code>initial_scale_power</code> value in this case (32 usually works).</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->0%| | 0/189 [00:00&lt;?, ?it/s]
	[deepscale] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 262144, reducing to 262144
	1%\|▌ \| 1/189 [00:00&lt;01:26, 2.17it/s]
	[deepscale] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 262144, reducing to 131072.0
	1%\|█▏
	[...]
	[deepscale] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 1, reducing to 1
	14%\|████████████████▌ \| 27/189 [00:14&lt;01:13, 2.21it/s]
	[deepscale] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 1, reducing to 1
	15%\|█████████████████▏ \| 28/189 [00:14&lt;01:13, 2.18it/s]
	[deepscale] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 1, reducing to 1
	15%\|█████████████████▊ \| 29/189 [00:15&lt;01:13, 2.18it/s]
	[deepscale] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 1, reducing to 1
	[...]<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="resources" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#resources"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Resources</span></h2> <p data-svelte-h="svelte-18m4cm3">DeepSpeed is a powerful technology for scaling large model training. 
To learn more about DeepSpeed, take a look at their <a href="https://www.microsoft.com/en-us/research/search/?q=deepspeed" rel="nofollow">blog posts</a>, <a href="https://www.deepspeed.ai/getting-started/" rel="nofollow">documentation</a>, and <a href="https://github.com/microsoft/deepspeed" rel="nofollow">GitHub</a>.</p> <p data-svelte-h="svelte-mkmcno">The papers below provide additional details about ZeRO.</p> <ul data-svelte-h="svelte-o0yfva"><li><a href="https://hf.co/papers/1910.02054" rel="nofollow">ZeRO: Memory Optimizations Toward Training Trillion Parameter Models</a></li> <li><a href="https://hf.co/papers/2101.06840" rel="nofollow">ZeRO-Offload: Democratizing Billion-Scale Model Training</a></li> <li><a href="https://hf.co/papers/2104.07857" rel="nofollow">ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning</a></li></ul> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/transformers/blob/main/docs/source/en/deepspeed.md" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>

	<script>
	{
		// Client bootstrap for this page: publish the runtime configuration, then
		// load the two entry modules and hand control to the client runtime.

		// Assigned without a declaration, so in this classic (non-module) script it
		// becomes a property of the global object. Presumably the imported entry
		// modules look it up by this exact name — confirm before renaming.
		__sveltekit_16tnnm8 = {
			assets: "/docs/transformers/pr_33892/en",
			base: "/docs/transformers/pr_33892/en",
			env: {}
		};

		// document.currentScript is only set while the script runs synchronously,
		// so the mount target has to be captured here, not inside the promise callback.
		const mountTarget = document.currentScript.parentElement;

		// One entry per id listed in node_ids below; both are null on this page.
		const serverData = [null, null];

		// Start both dynamic imports right away so they are fetched in parallel.
		const runtimeModule = import("/docs/transformers/pr_33892/en/_app/immutable/entry/start.b2c4257a.js");
		const appModule = import("/docs/transformers/pr_33892/en/_app/immutable/entry/app.05ef1f97.js");

		Promise.all([runtimeModule, appModule]).then((loaded) => {
			const runtime = loaded[0];
			const application = loaded[1];

			// node_ids presumably identifies the layout/page nodes to hydrate —
			// TODO confirm against the generated route manifest.
			const startOptions = {
				node_ids: [0, 21],
				data: serverData,
				form: null,
				error: null
			};

			runtime.start(application, mountTarget, startOptions);
		});
	}
	</script>

Xet Storage Details

Size: 143 kB
Xet hash: 40d4d5e03f851583fd28b72f7166d4246a472e0fdefe9e348d6407fc238a7259

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.