Buckets:

hf-doc-build
/

doc

Files

xet

hf-doc-build/doc / optimum-neuron /main /en /guides /distributed_training.html

rtrm

about 2 months ago

download

raw

78.7 kB

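	<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Distributed Training with optimum-neuron&quot;,&quot;local&quot;:&quot;distributed-training-with-optimum-neuron&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Parallelism Strategies Overview&quot;,&quot;local&quot;:&quot;parallelism-strategies-overview&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;1. ZeRO-1 (Optimizer State Sharding)&quot;,&quot;local&quot;:&quot;1-zero-1-optimizer-state-sharding&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;2. Tensor Parallelism (Intra-layer Model Parallelism)&quot;,&quot;local&quot;:&quot;2-tensor-parallelism-intra-layer-model-parallelism&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;3. Sequence Parallelism (Activation Sharding)&quot;,&quot;local&quot;:&quot;3-sequence-parallelism-activation-sharding&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;4. Pipeline Parallelism (Inter-layer Model Parallelism)&quot;,&quot;local&quot;:&quot;4-pipeline-parallelism-inter-layer-model-parallelism&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;How to enable ZeRO-1?&quot;,&quot;local&quot;:&quot;how-to-enable-zero-1&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Via the NeuronTrainer&quot;,&quot;local&quot;:&quot;via-the-neurontrainer&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Via the NeuronAccelerator&quot;,&quot;local&quot;:&quot;via-the-neuronaccelerator&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;How to enable Tensor Parallelism?&quot;,&quot;local&quot;:&quot;how-to-enable-tensor-parallelism&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Via the NeuronTrainer&quot;,&quot;local&quot;:&quot;via-the-neurontrainer&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Via the NeuronAccelerator&quot;,&quot;local&quot;:&quot;via-the-neuronaccelerator&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;How to enable Pipeline Parallelism?&quot;,&quot;local&quot;:&quot;how-to-enable-pipeline-parallelism&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Configuration Options&quot;,&quot;local&quot;:&quot;configuration-options&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Via the NeuronTrainer&quot;,&quot;local&quot;:&quot;via-the-neurontrainer&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Via the NeuronAccelerator&quot;,&quot;local&quot;:&quot;via-the-neuronaccelerator&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Combining Parallelism Strategies&quot;,&quot;local&quot;:&quot;combining-parallelism-strategies&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Via the NeuronTrainer&quot;,&quot;local&quot;:&quot;via-the-neurontrainer&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Checkpoint consolidation&quot;,&quot;local&quot;:&quot;checkpoint-consolidation&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Best Practices&quot;,&quot;local&quot;:&quot;best-practices&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Choosing Parallelism Strategy&quot;,&quot;local&quot;:&quot;choosing-parallelism-strategy&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Memory Optimization&quot;,&quot;local&quot;:&quot;memory-optimization&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Troubleshooting&quot;,&quot;local&quot;:&quot;troubleshooting&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Common Issues&quot;,&quot;local&quot;:&quot;common-issues&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Debugging Tips&quot;,&quot;local&quot;:&quot;debugging-tips&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2}],&quot;depth&quot;:1}">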
	<link href="/docs/optimum.neuron/main/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
	<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/entry/start.e7cdb183.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/chunks/scheduler.56725da7.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/chunks/singletons.635e76a3.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/chunks/paths.ed3a4dd8.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/entry/app.c5810efa.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/chunks/preload-helper.ec99a452.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/chunks/index.18a26576.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/nodes/0.f24306d7.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/chunks/each.e59479a4.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/nodes/11.d7707db0.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/chunks/Tip.5b941656.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/chunks/CopyLLMTxtMenu.18367b53.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/chunks/globals.7f7f1b26.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.47599cff.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/chunks/CodeBlock.bd8b9965.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Distributed Training with optimum-neuron&quot;,&quot;local&quot;:&quot;distributed-training-with-optimum-neuron&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Parallelism Strategies Overview&quot;,&quot;local&quot;:&quot;parallelism-strategies-overview&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;1. ZeRO-1 (Optimizer State Sharding)&quot;,&quot;local&quot;:&quot;1-zero-1-optimizer-state-sharding&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;2. Tensor Parallelism (Intra-layer Model Parallelism)&quot;,&quot;local&quot;:&quot;2-tensor-parallelism-intra-layer-model-parallelism&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;3. Sequence Parallelism (Activation Sharding)&quot;,&quot;local&quot;:&quot;3-sequence-parallelism-activation-sharding&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;4. Pipeline Parallelism (Inter-layer Model Parallelism)&quot;,&quot;local&quot;:&quot;4-pipeline-parallelism-inter-layer-model-parallelism&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;How to enable ZeRO-1?&quot;,&quot;local&quot;:&quot;how-to-enable-zero-1&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Via the NeuronTrainer&quot;,&quot;local&quot;:&quot;via-the-neurontrainer&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Via the NeuronAccelerator&quot;,&quot;local&quot;:&quot;via-the-neuronaccelerator&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;How to enable Tensor Parallelism?&quot;,&quot;local&quot;:&quot;how-to-enable-tensor-parallelism&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Via the NeuronTrainer&quot;,&quot;local&quot;:&quot;via-the-neurontrainer&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Via the NeuronAccelerator&quot;,&quot;local&quot;:&quot;via-the-neuronaccelerator&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;How to enable Pipeline Parallelism?&quot;,&quot;local&quot;:&quot;how-to-enable-pipeline-parallelism&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Configuration Options&quot;,&quot;local&quot;:&quot;configuration-options&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Via the NeuronTrainer&quot;,&quot;local&quot;:&quot;via-the-neurontrainer&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Via the NeuronAccelerator&quot;,&quot;local&quot;:&quot;via-the-neuronaccelerator&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Combining Parallelism Strategies&quot;,&quot;local&quot;:&quot;combining-parallelism-strategies&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Via the NeuronTrainer&quot;,&quot;local&quot;:&quot;via-the-neurontrainer&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Checkpoint consolidation&quot;,&quot;local&quot;:&quot;checkpoint-consolidation&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Best Practices&quot;,&quot;local&quot;:&quot;best-practices&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Choosing Parallelism Strategy&quot;,&quot;local&quot;:&quot;choosing-parallelism-strategy&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Memory Optimization&quot;,&quot;local&quot;:&quot;memory-optimization&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Troubleshooting&quot;,&quot;local&quot;:&quot;troubleshooting&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Common Issues&quot;,&quot;local&quot;:&quot;common-issues&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Debugging Tips&quot;,&quot;local&quot;:&quot;debugging-tips&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy 
page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="distributed-training-with-optimum-neuron" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#distributed-training-with-optimum-neuron"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Distributed Training with optimum-neuron</span></h1> <p data-svelte-h="svelte-1hnco7m">AWS Trainium instances provide powerful infrastructure for training large language models at scale. 
A <code>trn1.32xlarge</code> instance contains 16 Neuron devices with 32 cores total, offering 512GB of memory (16GB per core).</p> <p data-svelte-h="svelte-d2kadp">However, training large models presents a fundamental challenge: by default, each Neuron core operates as an independent data-parallel worker, requiring the entire model, gradients, and optimizer state (approximately 4× the model size) to fit within a single core’s 16GB memory limit, with additional space needed for activations.</p> <p data-svelte-h="svelte-1ndq4u8">For models that exceed these memory constraints, <code>optimum-neuron</code> provides sophisticated parallelism strategies that distribute computation and memory across multiple devices, enabling you to train models that would be impossible to fit on individual cores:</p> <h2 class="relative group"><a id="parallelism-strategies-overview" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#parallelism-strategies-overview"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Parallelism Strategies Overview</span></h2> <h3 class="relative group"><a id="1-zero-1-optimizer-state-sharding" class="header-link block 
pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#1-zero-1-optimizer-state-sharding"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>1. ZeRO-1 (Optimizer State Sharding)</span></h3> <p data-svelte-h="svelte-1mjmifd"><a href="https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuronx/tutorials/training/zero1_gpt2.html" rel="nofollow">ZeRO-1</a> is an optimizer-level optimization that reduces memory usage without changing your model architecture.</p> <p data-svelte-h="svelte-1f0jrrz"><strong>How it works</strong>: Shards the optimizer state (gradients, momentum, variance) across data-parallel ranks instead of replicating it on each device.</p> <p data-svelte-h="svelte-18396cf"><strong>Memory savings</strong>: Reduces optimizer memory usage by <code>1/data_parallel_size</code>.</p> <p data-svelte-h="svelte-190zbi2"><strong>When to use</strong>: Always beneficial when training with multiple devices, regardless of model size.</p> <h3 class="relative group"><a id="2-tensor-parallelism-intra-layer-model-parallelism" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 
with-hover:group-hover:opacity-100 with-hover:right-full" href="#2-tensor-parallelism-intra-layer-model-parallelism"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>2. Tensor Parallelism (Intra-layer Model Parallelism)</span></h3> <p data-svelte-h="svelte-1r1vc5"><a href="https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/neuronx-distributed/tensor_parallelism_overview.html" rel="nofollow">Tensor Parallelism</a> splits individual model layers across multiple devices.</p> <p data-svelte-h="svelte-wsb08h"><strong>How it works</strong>: Shards matrix multiplications (linear layers, attention) along rows or columns across devices. 
Each device computes part of each layer, requiring communication between devices for each forward/backward pass.</p> <p data-svelte-h="svelte-1rz7p8m"><strong>Memory savings</strong>: Reduces model parameter memory by <code>1/tensor_parallel_size</code>.</p> <p data-svelte-h="svelte-kpqjki"><strong>When to use</strong>: When your model is too large to fit on a single device, even after applying ZeRO-1.</p> <p data-svelte-h="svelte-fc78y5"><strong>Typical deployment</strong>: Usually applied within a single node (intra-node) due to high communication requirements.</p> <p data-svelte-h="svelte-otaiec"><strong>Trade-offs</strong>: Increases communication overhead between devices, which can slow down training if overused.</p> <h3 class="relative group"><a id="3-sequence-parallelism-activation-sharding" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#3-sequence-parallelism-activation-sharding"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>3. 
Sequence Parallelism (Activation Sharding)</span></h3> <p data-svelte-h="svelte-4fh0sm"><a href="https://arxiv.org/pdf/2205.05198.pdf" rel="nofollow">Sequence parallelism</a> is an optimization that works alongside Tensor Parallelism to further reduce memory usage.</p> <p data-svelte-h="svelte-1cma7oi"><strong>How it works</strong>: Shards activations along the sequence dimension in regions where tensors are not already sharded by tensor parallelism.</p> <p data-svelte-h="svelte-hrmmz0"><strong>Memory savings</strong>: Reduces activation memory proportional to sequence length, especially beneficial for long sequences.</p> <p data-svelte-h="svelte-1obmcw0"><strong>When to use</strong>: Always enable when using tensor parallelism - it provides additional memory savings with minimal overhead.</p> <p data-svelte-h="svelte-41lgu1"><strong>Requirement</strong>: Only works in combination with tensor parallelism.</p> <h3 class="relative group"><a id="4-pipeline-parallelism-inter-layer-model-parallelism" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#4-pipeline-parallelism-inter-layer-model-parallelism"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" 
fill="currentColor"></path></svg></span></a> <span>4. Pipeline Parallelism (Inter-layer Model Parallelism)</span></h3> <p data-svelte-h="svelte-1lhkl2x"><a href="https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/neuronx-distributed/pipeline_parallelism_overview.html" rel="nofollow">Pipeline Parallelism</a> splits model layers across different devices.</p> <p data-svelte-h="svelte-a3vdn"><strong>How it works</strong>: Divides your model into stages, with each stage containing consecutive layers running on different devices. Uses microbatching to keep all devices busy.</p> <p data-svelte-h="svelte-1jq8cu1"><strong>Memory savings</strong>: Reduces model parameter memory by <code>1/pipeline_parallel_size</code>.</p> <p data-svelte-h="svelte-1vm5hdt"><strong>When to use</strong>: For very large models that don’t fit even with tensor parallelism, or when you want to scale across many devices with less communication overhead than tensor parallelism.</p> <p data-svelte-h="svelte-4tw4cj"><strong>Typical deployment</strong>: Usually applied across multiple nodes (inter-node) to scale to larger numbers of devices while minimizing high-bandwidth communication requirements.</p> <p data-svelte-h="svelte-l89uy8"><strong>Trade-offs</strong>: Introduces pipeline bubbles (idle time) and requires careful tuning of microbatch sizes.</p> <p data-svelte-h="svelte-1p0ihsg">The good news is that it is possible to combine those techniques, and <code>optimum-neuron</code> makes it very easy!</p> <blockquote class="tip"><p data-svelte-h="svelte-1s96mwn">All the training examples in the optimum-neuron repo use these parallelism features via the <code>NeuronTrainer</code>.</p></blockquote> <h2 class="relative group"><a id="how-to-enable-zero-1" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#how-to-enable-zero-1"><span><svg class="" 
xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>How to enable ZeRO-1?</span></h2> <p data-svelte-h="svelte-1xpk0lv">ZeRO-1 can be enabled either through the <code>NeuronTrainer</code> or directly with the <code>NeuronAccelerator</code>.</p> <h3 class="relative group"><a id="via-the-neurontrainer" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#via-the-neurontrainer"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Via the 
NeuronTrainer</span></h3> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> optimum.neuron <span class="hljs-keyword">import</span> NeuronTrainingArguments, NeuronTrainer

	<span class="hljs-comment"># Enable ZeRO-1 in the training arguments</span>
	training_args = NeuronTrainingArguments(
	output_dir=<span class="hljs-string">"./output"</span>,
	per_device_train_batch_size=<span class="hljs-number">1</span>,
	zero_1=<span class="hljs-literal">True</span>, <span class="hljs-comment"># Enable ZeRO-1</span>
	bf16=<span class="hljs-literal">True</span>,
	<span class="hljs-comment"># ... other training arguments</span>
	)

	trainer = NeuronTrainer(
	model=model,
	args=training_args,
	train_dataset=train_dataset,
	eval_dataset=eval_dataset,
	)

	trainer.train()<!-- HTML_TAG_END --></pre></div> <blockquote class="tip"><p data-svelte-h="svelte-nmzr1t">Since the example scripts use the <code>NeuronTrainer</code>, you can enable ZeRO-1 when using them by adding the <code>--zero_1</code> flag to your command line.</p> <p data-svelte-h="svelte-1gkqha7">For example:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->torchrun --nproc_per_node=2 examples/training/qwen3/finetune_qwen3.py \
	--model_name_or_path Qwen/Qwen3-0.6B \
	--dataset_name wikitext \
	--dataset_config_name wikitext-2-raw-v1 \
	--do_train \
	--per_device_train_batch_size 1 \
	--block_size 1024 \
	--bf16 \
	--zero_1 \
	--tensor_parallel_size 2 \
	--output_dir my_training/<!-- HTML_TAG_END --></pre></div></blockquote> <h3 class="relative group"><a id="via-the-neuronaccelerator" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#via-the-neuronaccelerator"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Via the NeuronAccelerator</span></h3> <p data-svelte-h="svelte-106kvj9">When using the <code>NeuronAccelerator</code> directly, you need to create a <code>TrainingNeuronConfig</code> and enable ZeRO-1 separately:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" 
transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> torch.optim <span class="hljs-keyword">import</span> AdamW
	<span class="hljs-keyword">from</span> optimum.neuron <span class="hljs-keyword">import</span> NeuronAccelerator
	<span class="hljs-keyword">from</span> optimum.neuron.models.training.config <span class="hljs-keyword">import</span> TrainingNeuronConfig

	<span class="hljs-comment"># Create the training configuration</span>
	trn_config = TrainingNeuronConfig()

	<span class="hljs-comment"># Create accelerator with ZeRO-1 enabled</span>
	accelerator = NeuronAccelerator(
	trn_config=trn_config,
	zero_1=<span class="hljs-literal">True</span>, <span class="hljs-comment"># Enable ZeRO-1</span>
	mixed_precision=<span class="hljs-string">"bf16"</span>,
	)

	model = ... <span class="hljs-comment"># Your model instance</span>
	optimizer = AdamW(model.parameters(), lr=<span class="hljs-number">5e-5</span>)

	<span class="hljs-comment"># Prepare model and optimizer</span>
	model, optimizer = accelerator.prepare(model, optimizer)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="how-to-enable-tensor-parallelism" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#how-to-enable-tensor-parallelism"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>How to enable Tensor Parallelism?</span></h2> <p data-svelte-h="svelte-1r4hhew">Tensor Parallelism can be used with either the <code>NeuronTrainer</code> or <code>NeuronAccelerator</code>.</p> <p data-svelte-h="svelte-v1qtdm"><strong>Important</strong>: Tensor parallelism requires models that have a custom modeling implementation in <code>optimum.neuron.models.training</code>.</p> <p data-svelte-h="svelte-n127re">When doing Tensor Parallelism, you have several important settings:</p> <ol data-svelte-h="svelte-1hoskl8"><li>The <code>tensor_parallel_size</code>: Ideally it should be the smallest value for which the model fits in memory.</li> <li>Whether sequence parallelism should be enabled: <a href="https://arxiv.org/pdf/2205.05198.pdf" rel="nofollow">Sequence parallelism</a> shards the activations on the sequence axis outside of the tensor 
parallel regions, saving memory by sharding the activations.</li></ol> <p data-svelte-h="svelte-11wpmlp">When using distributed training, the training script is called by <code>torchrun</code>, which will dispatch it to workers, one worker per core. Each worker will load the sharded model and dispatch the parameters automatically across the cores. The <code>tensor_parallel_size</code> is the number of workers to shard the model parameters on.</p> <h3 class="relative group"><a id="via-the-neurontrainer" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#via-the-neurontrainer"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Via the NeuronTrainer</span></h3> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 
32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> optimum.neuron <span class="hljs-keyword">import</span> NeuronTrainingArguments, NeuronTrainer

	<span class="hljs-comment"># Configure tensor parallelism in training arguments</span>
	training_args = NeuronTrainingArguments(
	output_dir=<span class="hljs-string">"./output"</span>,
	per_device_train_batch_size=<span class="hljs-number">1</span>,
	bf16=<span class="hljs-literal">True</span>,
	tensor_parallel_size=<span class="hljs-number">8</span>,
	<span class="hljs-comment"># ... other training arguments</span>
	)

	trainer = NeuronTrainer(
	model=model,
	args=training_args,
	train_dataset=train_dataset,
	eval_dataset=eval_dataset,
	)

	trainer.train()<!-- HTML_TAG_END --></pre></div> <blockquote class="tip"><p data-svelte-h="svelte-7fhmhn">Since the example scripts use the <code>NeuronTrainer</code>, you can enable Tensor Parallelism when using them by specifying the <code>--tensor_parallel_size</code> argument.</p> <p data-svelte-h="svelte-1gkqha7">For example:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->torchrun --nproc_per_node=8 examples/training/qwen3/finetune_qwen3.py \
	--model_name_or_path Qwen/Qwen3-0.6B \
	--dataset_name wikitext \
	--dataset_config_name wikitext-2-raw-v1 \
	--do_train \
	--per_device_train_batch_size 1 \
	--block_size 1024 \
	--bf16 \
	--tensor_parallel_size 8 \
	--output_dir my_training/<!-- HTML_TAG_END --></pre></div></blockquote> <h3 class="relative group"><a id="via-the-neuronaccelerator" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#via-the-neuronaccelerator"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Via the NeuronAccelerator</span></h3> <p data-svelte-h="svelte-1ncu8vs">When using the <code>NeuronAccelerator</code> directly, you configure tensor parallelism through the <code>TrainingNeuronConfig</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" 
transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> torch.optim <span class="hljs-keyword">import</span> AdamW
	<span class="hljs-keyword">from</span> optimum.neuron <span class="hljs-keyword">import</span> NeuronAccelerator
	<span class="hljs-keyword">from</span> optimum.neuron.models.training.config <span class="hljs-keyword">import</span> TrainingNeuronConfig

	<span class="hljs-comment"># Configure tensor parallelism</span>
	trn_config = TrainingNeuronConfig(
	tensor_parallel_size=<span class="hljs-number">8</span>,
	sequence_parallel_enabled=<span class="hljs-literal">True</span>,
	checkpoint_dir=<span class="hljs-literal">None</span>, <span class="hljs-comment"># Can be specified when resuming from checkpoint</span>
	)

	accelerator = NeuronAccelerator(
	trn_config=trn_config,
	mixed_precision=<span class="hljs-string">"bf16"</span>,
	)

	model = ... <span class="hljs-comment"># Your model instance</span>
	optimizer = AdamW(model.parameters(), lr=<span class="hljs-number">5e-5</span>)

	model, optimizer = accelerator.prepare(model, optimizer)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="how-to-enable-pipeline-parallelism" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#how-to-enable-pipeline-parallelism"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>How to enable Pipeline Parallelism?</span></h2> <p data-svelte-h="svelte-1vp0c4m">Pipeline Parallelism allows you to split your model layers across multiple devices, enabling training of very large models that wouldn’t fit on a single device, or even a single node.</p> <p data-svelte-h="svelte-1ytrjb2"><strong>Important</strong>: Pipeline parallelism requires models that have a custom modeling implementation in <code>optimum.neuron.models.training</code> and declare <code>SUPPORTS_PIPELINE_PARALLELISM = True</code>.</p> <h3 class="relative group"><a id="configuration-options" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#configuration-options"><span><svg class="" xmlns="http://www.w3.org/2000/svg" 
xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Configuration Options</span></h3> <p data-svelte-h="svelte-wwttlo">Pipeline parallelism has several configuration parameters:</p> <ul data-svelte-h="svelte-9fwfrb"><li><code>pipeline_parallel_size</code>: Number of pipeline stages (devices to split layers across)</li> <li><code>pipeline_parallel_num_microbatches</code>: Number of microbatches for pipeline scheduling</li> <li>When pipeline parallelism is enabled, ZeRO-1 can be automatically applied to the pipeline parallel optimizer</li></ul> <h3 class="relative group"><a id="via-the-neurontrainer" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#via-the-neurontrainer"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 
43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Via the NeuronTrainer</span></h3> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> optimum.neuron <span class="hljs-keyword">import</span> NeuronTrainingArguments, NeuronTrainer
	<span class="hljs-keyword">from</span> optimum.neuron.models.training <span class="hljs-keyword">import</span> LlamaForCausalLM <span class="hljs-comment"># Custom model implementation</span>

	<span class="hljs-comment"># Configure pipeline parallelism in training arguments</span>
	training_args = NeuronTrainingArguments(
	output_dir=<span class="hljs-string">"./output"</span>,
	per_device_train_batch_size=<span class="hljs-number">4</span>, <span class="hljs-comment"># Will be split into microbatches</span>
	bf16=<span class="hljs-literal">True</span>,
	tensor_parallel_size=<span class="hljs-number">2</span>,
	pipeline_parallel_size=<span class="hljs-number">4</span>, <span class="hljs-comment"># Split model across 4 pipeline stages</span>
	pipeline_parallel_num_microbatches=<span class="hljs-number">4</span>, <span class="hljs-comment"># Number of microbatches</span>
	zero_1=<span class="hljs-literal">True</span>, <span class="hljs-comment"># Enable ZeRO-1 with pipeline parallelism</span>
	<span class="hljs-comment"># ... other training arguments</span>
	)

	<span class="hljs-comment"># Load model using custom implementation - must be done with the model class directly</span>
	model = LlamaForCausalLM.from_pretrained(
	<span class="hljs-string">"meta-llama/Llama-3.2-3B"</span>,
	trn_config=training_args.trn_config <span class="hljs-comment"># Pass the auto-generated trn_config</span>
	)

	trainer = NeuronTrainer(
	model=model,
	args=training_args,
	train_dataset=train_dataset,
	eval_dataset=eval_dataset,
	)

	trainer.train()<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="via-the-neuronaccelerator" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#via-the-neuronaccelerator"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Via the NeuronAccelerator</span></h3> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded 
font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> optimum.neuron <span class="hljs-keyword">import</span> NeuronAccelerator
	<span class="hljs-keyword">from</span> optimum.neuron.models.training.config <span class="hljs-keyword">import</span> TrainingNeuronConfig
	<span class="hljs-keyword">from</span> optimum.neuron.models.training <span class="hljs-keyword">import</span> LlamaForCausalLM
	<span class="hljs-keyword">from</span> torch.optim <span class="hljs-keyword">import</span> AdamW

	<span class="hljs-comment"># Configure combined parallelism strategies</span>
	trn_config = TrainingNeuronConfig(
	tensor_parallel_size=<span class="hljs-number">2</span>,
	pipeline_parallel_size=<span class="hljs-number">4</span>,
	pipeline_parallel_num_microbatches=<span class="hljs-number">4</span>,
	sequence_parallel_enabled=<span class="hljs-literal">True</span>,
	)

	accelerator = NeuronAccelerator(
	trn_config=trn_config,
	zero_1=<span class="hljs-literal">True</span>, <span class="hljs-comment"># Can combine with ZeRO-1</span>
	mixed_precision=<span class="hljs-string">"bf16"</span>,
	)

	<span class="hljs-comment"># Load model with custom implementation</span>
	model = LlamaForCausalLM.from_pretrained(
	<span class="hljs-string">"meta-llama/Llama-3.2-3B"</span>,
	trn_config=trn_config
	)

	optimizer = AdamW(model.parameters(), lr=<span class="hljs-number">5e-5</span>)
	model, optimizer = accelerator.prepare(model, optimizer)<!-- HTML_TAG_END --></pre></div> <blockquote class="tip"><p data-svelte-h="svelte-1ypidsl">When using pipeline parallelism, the total number of processes must be a multiple of <code>tensor_parallel_size * pipeline_parallel_size</code>. For example, with <code>tensor_parallel_size=2</code> and <code>pipeline_parallel_size=4</code>, you need at least 8 processes; any additional groups of 8 processes are used for data parallelism.</p></blockquote> <h2 class="relative group"><a id="combining-parallelism-strategies" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#combining-parallelism-strategies"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Combining Parallelism Strategies</span></h2> <p data-svelte-h="svelte-ktf0yf">You can combine multiple parallelism strategies for maximum memory efficiency and performance. 
Here’s an example with all strategies combined:</p> <h3 class="relative group"><a id="via-the-neurontrainer" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#via-the-neurontrainer"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Via the NeuronTrainer</span></h3> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow 
left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> optimum.neuron <span class="hljs-keyword">import</span> NeuronTrainingArguments, NeuronTrainer
	<span class="hljs-keyword">from</span> optimum.neuron.models.training <span class="hljs-keyword">import</span> LlamaForCausalLM

	<span class="hljs-comment"># Example: Combine all parallelism strategies</span>
	training_args = NeuronTrainingArguments(
	output_dir=<span class="hljs-string">"./output"</span>,
	per_device_train_batch_size=<span class="hljs-number">32</span>,
	bf16=<span class="hljs-literal">True</span>,
	gradient_checkpointing=<span class="hljs-literal">True</span>,

	<span class="hljs-comment"># ZeRO-1</span>
	zero_1=<span class="hljs-literal">True</span>,

	<span class="hljs-comment"># Tensor parallelism</span>
	tensor_parallel_size=<span class="hljs-number">4</span>,
	disable_sequence_parallel=<span class="hljs-literal">False</span>, <span class="hljs-comment"># Enable sequence parallelism</span>

	<span class="hljs-comment"># Pipeline parallelism</span>
	pipeline_parallel_size=<span class="hljs-number">2</span>,
	pipeline_parallel_num_microbatches=<span class="hljs-number">8</span>,

	<span class="hljs-comment"># Additional optimizations</span>
	fuse_qkv=<span class="hljs-literal">True</span>, <span class="hljs-comment"># Fuse QKV projections for efficiency</span>
	kv_size_multiplier=<span class="hljs-literal">None</span>, <span class="hljs-comment"># Auto-calculate optimal KV multiplier</span>
	)

	<span class="hljs-comment"># Load model using custom implementation</span>
	model = LlamaForCausalLM.from_pretrained(
	<span class="hljs-string">"meta-llama/Llama-3.2-3B"</span>,
	trn_config=training_args.trn_config
	)

	trainer = NeuronTrainer(
	model=model,
	args=training_args,
	train_dataset=train_dataset,
	eval_dataset=eval_dataset,
	)

	trainer.train()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-e9hr70">This configuration needs 4 * 2 = 8 processes per model replica:</p> <ul data-svelte-h="svelte-138kta0"><li>Each tensor parallel group has 4 processes</li> <li>Each pipeline stage runs on one tensor parallel group</li></ul> <p data-svelte-h="svelte-1oatqej">We can then run the training script on the <code>trn1.32xlarge</code> instance with 32 Neuron cores, resulting in the following configuration: <code>dp=4, tp=4, pp=2</code>, which means 4 data-parallel replicas, each made of 2 pipeline stages with 4 tensor-parallel devices per stage (4 * 4 * 2 = 32 processes in total).</p> <h2 class="relative group"><a id="checkpoint-consolidation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#checkpoint-consolidation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Checkpoint consolidation</span></h2> <p data-svelte-h="svelte-10z9rkn">Since distributed training uses sharded checkpoints across different workers, you need to consolidate them to create a standard model checkpoint that can be shared and used outside of the specific training configuration.</p> <p data-svelte-h="svelte-24042q">The Optimum CLI provides a way 
of doing that very easily via the <code>optimum-cli neuron consolidate</code> command:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->optimum-cli neuron consolidate --<span class="hljs-built_in">help</span>

	usage: optimum-cli neuron consolidate [-h] [-f {pytorch,safetensors}] checkpoint_dir output_dir

	positional arguments:
	checkpoint_dir The path to the directory containing the checkpoints.
	output_dir The path to the output directory containing the consolidated checkpoint.

	optional arguments:
	-h, --<span class="hljs-built_in">help</span> show this <span class="hljs-built_in">help</span> message and <span class="hljs-built_in">exit</span>
	-f {pytorch,safetensors}, --format {pytorch,safetensors}
	The format used to save the consolidated checkpoint.<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-7l8i2j">All you need to do is specify the sharded checkpoints directory and the output directory that will contain the consolidated checkpoints, and the command takes care of the rest.
	It is also possible to specify the output format of the consolidated checkpoints. By default it will export them to the <code>safetensors</code> format, which is the recommended format to use.</p> <p data-svelte-h="svelte-11lpom8">Example:</p> <p data-svelte-h="svelte-15ioqnc">Training with distributed parallelism just completed and the output dir is called <code>my_training</code>. The directory looks like the following:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->my_training/
	├── README.md
	├── all_results.json
	├── checkpoint-10
	│ ├── config.json
	│ ├── scheduler.pt
	│ ├── special_tokens_map.json
	│ ├── shards/
	│ ├── tokenizer.json
	│ ├── tokenizer.model
	│ ├── tokenizer_config.json
	│ ├── trainer_state.json
	│ └── training_args.bin
	├── config.json
	├── special_tokens_map.json
	├── shards/
	│ ├── tp_rank_00_pp_rank_00
	│ ├── tp_rank_01_pp_rank_00
	│ ├── tp_rank_02_pp_rank_00
	│ ├── tp_rank_03_pp_rank_00
	│ ├── tp_rank_00_pp_rank_01
	│ ├── tp_rank_01_pp_rank_01
	│ ├── tp_rank_02_pp_rank_01
	│ └── tp_rank_03_pp_rank_01
	├── tokenizer.json
	├── tokenizer.model
	├── tokenizer_config.json
	├── train_results.json
	├── trainer_state.json
	├── training_args.bin
	└── trn_config.json<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1e8fv74">You can consolidate the sharded checkpoints in <code>my_training/shards</code>, which correspond to the sharded checkpoints saved at the end of training, by running the following command:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->optimum-cli neuron consolidate my_training my_training_consolidated_checkpoint<!-- HTML_TAG_END --></pre></div> <blockquote class="tip"><p data-svelte-h="svelte-1sydsko">The sharded checkpoints are saved under a directory called <code>shards</code>. 
The <code>optimum-cli neuron consolidate</code> command accepts as input either a directory that contains a <code>shards</code> directory or the <code>shards</code> directory itself.</p></blockquote> <h2 class="relative group"><a id="best-practices" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#best-practices"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Best Practices</span></h2> <h3 class="relative group"><a id="choosing-parallelism-strategy" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#choosing-parallelism-strategy"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 
79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Choosing Parallelism Strategy</span></h3> <ol data-svelte-h="svelte-1j56tw2"><li><strong>Start with Tensor Parallelism</strong>: Use the smallest <code>tensor_parallel_size</code> that fits your model in memory</li> <li><strong>Add Pipeline Parallelism</strong>: For very large models, combine with pipeline parallelism</li> <li><strong>Enable Sequence Parallelism</strong>: Always enable when using tensor parallelism for memory savings (set <code>disable_sequence_parallel=False</code>)</li> <li><strong>Use ZeRO-1</strong>: Combine with any parallelism strategy for optimizer memory savings</li></ol> <h3 class="relative group"><a id="memory-optimization" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#memory-optimization"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Memory Optimization</span></h3> <ul 
data-svelte-h="svelte-pjt7c2"><li>Enable <code>gradient_checkpointing</code> for large models</li> <li>Set appropriate <code>pipeline_parallel_num_microbatches</code> for pipeline parallelism</li></ul> <h2 class="relative group"><a id="troubleshooting" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#troubleshooting"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Troubleshooting</span></h2> <h3 class="relative group"><a id="common-issues" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#common-issues"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 
0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Common Issues</span></h3> <ol data-svelte-h="svelte-j5qw5u"><li><strong>Out of Memory</strong>: Reduce batch size, increase parallelism, or enable gradient checkpointing</li> <li><strong>Model Not Supported</strong>: Ensure you’re using a model from <code>optimum.neuron.models.training</code></li> <li><strong>Pipeline Parallelism Fails</strong>: Check that the model supports pipeline parallelism</li> <li><strong>Incorrect Process Count</strong>: Ensure <code>nproc_per_node</code> matches your parallelism configuration</li></ol> <h3 class="relative group"><a id="debugging-tips" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#debugging-tips"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Debugging Tips</span></h3> <ul data-svelte-h="svelte-jjqy14"><li>Start with smaller models and parallelism sizes</li> <li>Check that all processes can communicate 
properly</li> <li>Verify checkpoint directories and permissions</li> <li>Monitor Neuron device utilization</li></ul> <p></p>

	<script>
	{
		// Client bootstrap for this page. The settings object is assigned without
		// a declaration, so in this classic (non-module) script it becomes a
		// global. NOTE(review): presumably read by the bundles imported below —
		// the identifier is build-generated, so keep it exactly as emitted.
		__sveltekit_89oqon = {
			assets: "/docs/optimum.neuron/main/en",
			base: "/docs/optimum.neuron/main/en",
			env: {}
		};

		// Captured up front, while this script is still the one executing:
		// document.currentScript is no longer set once asynchronous callbacks run.
		const mountTarget = document.currentScript.parentElement;

		// Handed to start() as `data`; both slots are null for this page.
		const initialData = [null, null];

		// Loads both entry modules in parallel, then starts the app on the
		// element that contains this <script>.
		const bootstrap = async () => {
			const [kit, app] = await Promise.all([
				import("/docs/optimum.neuron/main/en/_app/immutable/entry/start.e7cdb183.js"),
				import("/docs/optimum.neuron/main/en/_app/immutable/entry/app.c5810efa.js")
			]);

			kit.start(app, mountTarget, {
				// NOTE(review): meaning of these ids is defined by the start() entry — confirm before editing.
				node_ids: [0, 11],
				data: initialData,
				form: null,
				error: null
			});
		};

		// Invoked synchronously so both imports are requested right away.
		bootstrap();
	}
	</script>

Xet Storage Details

Size: 78.7 kB
Xet hash: 69531f38a71de458c2e3110958a04a90183b652fcfe5ebed7b98de1bed9d0068

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.