Buckets:
| <meta charset="utf-8" /><meta name="hf:doc:metadata" content="{"title":"Distributed Training with optimum-neuron","local":"distributed-training-with-optimum-neuron","sections":[{"title":"Parallelism Strategies Overview","local":"parallelism-strategies-overview","sections":[{"title":"1. ZeRO-1 (Optimizer State Sharding)","local":"1-zero-1-optimizer-state-sharding","sections":[],"depth":3},{"title":"2. Tensor Parallelism (Intra-layer Model Parallelism)","local":"2-tensor-parallelism-intra-layer-model-parallelism","sections":[],"depth":3},{"title":"3. Sequence Parallelism (Activation Sharding)","local":"3-sequence-parallelism-activation-sharding","sections":[],"depth":3},{"title":"4. Pipeline Parallelism (Inter-layer Model Parallelism)","local":"4-pipeline-parallelism-inter-layer-model-parallelism","sections":[],"depth":3}],"depth":2},{"title":"How to enable ZeRO-1?","local":"how-to-enable-zero-1","sections":[{"title":"Via the NeuronTrainer","local":"via-the-neurontrainer","sections":[],"depth":3},{"title":"Via the NeuronAccelerator","local":"via-the-neuronaccelerator","sections":[],"depth":3}],"depth":2},{"title":"How to enable Tensor Parallelism?","local":"how-to-enable-tensor-parallelism","sections":[{"title":"Via the NeuronTrainer","local":"via-the-neurontrainer","sections":[],"depth":3},{"title":"Via the NeuronAccelerator","local":"via-the-neuronaccelerator","sections":[],"depth":3}],"depth":2},{"title":"How to enable Pipeline Parallelism?","local":"how-to-enable-pipeline-parallelism","sections":[{"title":"Configuration Options","local":"configuration-options","sections":[],"depth":3},{"title":"Via the NeuronTrainer","local":"via-the-neurontrainer","sections":[],"depth":3},{"title":"Via the NeuronAccelerator","local":"via-the-neuronaccelerator","sections":[],"depth":3}],"depth":2},{"title":"Combining Parallelism Strategies","local":"combining-parallelism-strategies","sections":[{"title":"Via the 
NeuronTrainer","local":"via-the-neurontrainer","sections":[],"depth":3}],"depth":2},{"title":"Checkpoint consolidation","local":"checkpoint-consolidation","sections":[],"depth":2},{"title":"Best Practices","local":"best-practices","sections":[{"title":"Choosing Parallelism Strategy","local":"choosing-parallelism-strategy","sections":[],"depth":3},{"title":"Memory Optimization","local":"memory-optimization","sections":[],"depth":3}],"depth":2},{"title":"Troubleshooting","local":"troubleshooting","sections":[{"title":"Common Issues","local":"common-issues","sections":[],"depth":3},{"title":"Debugging Tips","local":"debugging-tips","sections":[],"depth":3}],"depth":2}],"depth":1}"> | |
| <link href="/docs/optimum.neuron/main/en/_app/immutable/assets/0.e3b0c442.css" rel="stylesheet"> | |
| <link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/entry/start.e7cdb183.js"> | |
| <link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/chunks/scheduler.56725da7.js"> | |
| <link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/chunks/singletons.635e76a3.js"> | |
| <link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/chunks/paths.ed3a4dd8.js"> | |
| <link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/entry/app.c5810efa.js"> | |
| <link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/chunks/preload-helper.ec99a452.js"> | |
| <link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/chunks/index.18a26576.js"> | |
| <link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/nodes/0.f24306d7.js"> | |
| <link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/chunks/each.e59479a4.js"> | |
| <link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/nodes/11.d7707db0.js"> | |
| <link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/chunks/Tip.5b941656.js"> | |
| <link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/chunks/CopyLLMTxtMenu.18367b53.js"> | |
| <link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/chunks/globals.7f7f1b26.js"> | |
| <link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.47599cff.js"> | |
| <link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/chunks/CodeBlock.bd8b9965.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Distributed Training with optimum-neuron","local":"distributed-training-with-optimum-neuron","sections":[{"title":"Parallelism Strategies Overview","local":"parallelism-strategies-overview","sections":[{"title":"1. ZeRO-1 (Optimizer State Sharding)","local":"1-zero-1-optimizer-state-sharding","sections":[],"depth":3},{"title":"2. Tensor Parallelism (Intra-layer Model Parallelism)","local":"2-tensor-parallelism-intra-layer-model-parallelism","sections":[],"depth":3},{"title":"3. Sequence Parallelism (Activation Sharding)","local":"3-sequence-parallelism-activation-sharding","sections":[],"depth":3},{"title":"4. Pipeline Parallelism (Inter-layer Model Parallelism)","local":"4-pipeline-parallelism-inter-layer-model-parallelism","sections":[],"depth":3}],"depth":2},{"title":"How to enable ZeRO-1?","local":"how-to-enable-zero-1","sections":[{"title":"Via the NeuronTrainer","local":"via-the-neurontrainer","sections":[],"depth":3},{"title":"Via the NeuronAccelerator","local":"via-the-neuronaccelerator","sections":[],"depth":3}],"depth":2},{"title":"How to enable Tensor Parallelism?","local":"how-to-enable-tensor-parallelism","sections":[{"title":"Via the NeuronTrainer","local":"via-the-neurontrainer","sections":[],"depth":3},{"title":"Via the NeuronAccelerator","local":"via-the-neuronaccelerator","sections":[],"depth":3}],"depth":2},{"title":"How to enable Pipeline Parallelism?","local":"how-to-enable-pipeline-parallelism","sections":[{"title":"Configuration Options","local":"configuration-options","sections":[],"depth":3},{"title":"Via the NeuronTrainer","local":"via-the-neurontrainer","sections":[],"depth":3},{"title":"Via the NeuronAccelerator","local":"via-the-neuronaccelerator","sections":[],"depth":3}],"depth":2},{"title":"Combining Parallelism 
Strategies","local":"combining-parallelism-strategies","sections":[{"title":"Via the NeuronTrainer","local":"via-the-neurontrainer","sections":[],"depth":3}],"depth":2},{"title":"Checkpoint consolidation","local":"checkpoint-consolidation","sections":[],"depth":2},{"title":"Best Practices","local":"best-practices","sections":[{"title":"Choosing Parallelism Strategy","local":"choosing-parallelism-strategy","sections":[],"depth":3},{"title":"Memory Optimization","local":"memory-optimization","sections":[],"depth":3}],"depth":2},{"title":"Troubleshooting","local":"troubleshooting","sections":[{"title":"Common Issues","local":"common-issues","sections":[],"depth":3},{"title":"Debugging Tips","local":"debugging-tips","sections":[],"depth":3}],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy 
page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="distributed-training-with-optimum-neuron" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#distributed-training-with-optimum-neuron"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Distributed Training with optimum-neuron</span></h1> <p data-svelte-h="svelte-1hnco7m">AWS Trainium instances provide powerful infrastructure for training large language models at scale. 
A <code>trn1.32xlarge</code> instance contains 16 Neuron devices with 32 cores total, offering 512GB of memory (16GB per core).</p> <p data-svelte-h="svelte-d2kadp">However, training large models presents a fundamental challenge: by default, each Neuron core operates as an independent data-parallel worker, requiring the entire model, gradients, and optimizer state (approximately 4× the model size) to fit within a single core’s 16GB memory limit, with additional space needed for activations.</p> <p data-svelte-h="svelte-1ndq4u8">For models that exceed these memory constraints, <code>optimum-neuron</code> provides sophisticated parallelism strategies that distribute computation and memory across multiple devices, enabling you to train models that would be impossible to fit on individual cores:</p> <h2 class="relative group"><a id="parallelism-strategies-overview" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#parallelism-strategies-overview"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Parallelism Strategies Overview</span></h2> <h3 class="relative group"><a id="1-zero-1-optimizer-state-sharding" class="header-link block 
pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#1-zero-1-optimizer-state-sharding"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>1. ZeRO-1 (Optimizer State Sharding)</span></h3> <p data-svelte-h="svelte-1mjmifd"><a href="https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuronx/tutorials/training/zero1_gpt2.html" rel="nofollow">ZeRO-1</a> is an optimizer-level optimization that reduces memory usage without changing your model architecture.</p> <p data-svelte-h="svelte-1f0jrrz"><strong>How it works</strong>: Shards the optimizer state (gradients, momentum, variance) across data-parallel ranks instead of replicating it on each device.</p> <p data-svelte-h="svelte-18396cf"><strong>Memory savings</strong>: Reduces optimizer memory usage by <code>1/data_parallel_size</code>.</p> <p data-svelte-h="svelte-190zbi2"><strong>When to use</strong>: Always beneficial when training with multiple devices, regardless of model size.</p> <h3 class="relative group"><a id="2-tensor-parallelism-intra-layer-model-parallelism" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 
with-hover:group-hover:opacity-100 with-hover:right-full" href="#2-tensor-parallelism-intra-layer-model-parallelism"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>2. Tensor Parallelism (Intra-layer Model Parallelism)</span></h3> <p data-svelte-h="svelte-1r1vc5"><a href="https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/neuronx-distributed/tensor_parallelism_overview.html" rel="nofollow">Tensor Parallelism</a> splits individual model layers across multiple devices.</p> <p data-svelte-h="svelte-wsb08h"><strong>How it works</strong>: Shards matrix multiplications (linear layers, attention) along rows or columns across devices. 
Each device computes part of each layer, requiring communication between devices for each forward/backward pass.</p> <p data-svelte-h="svelte-1rz7p8m"><strong>Memory savings</strong>: Reduces model parameter memory by <code>1/tensor_parallel_size</code>.</p> <p data-svelte-h="svelte-kpqjki"><strong>When to use</strong>: When your model is too large to fit on a single device, even after applying ZeRO-1.</p> <p data-svelte-h="svelte-fc78y5"><strong>Typical deployment</strong>: Usually applied within a single node (intra-node) due to high communication requirements.</p> <p data-svelte-h="svelte-otaiec"><strong>Trade-offs</strong>: Increases communication overhead between devices, which can slow down training if overused.</p> <h3 class="relative group"><a id="3-sequence-parallelism-activation-sharding" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#3-sequence-parallelism-activation-sharding"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>3. 
Sequence Parallelism (Activation Sharding)</span></h3> <p data-svelte-h="svelte-4fh0sm"><a href="https://arxiv.org/pdf/2205.05198.pdf" rel="nofollow">Sequence parallelism</a> is an optimization that works alongside Tensor Parallelism to further reduce memory usage.</p> <p data-svelte-h="svelte-1cma7oi"><strong>How it works</strong>: Shards activations along the sequence dimension in regions where tensors are not already sharded by tensor parallelism.</p> <p data-svelte-h="svelte-hrmmz0"><strong>Memory savings</strong>: Reduces activation memory proportional to sequence length, especially beneficial for long sequences.</p> <p data-svelte-h="svelte-1obmcw0"><strong>When to use</strong>: Always enable when using tensor parallelism - it provides additional memory savings with minimal overhead.</p> <p data-svelte-h="svelte-41lgu1"><strong>Requirement</strong>: Only works in combination with tensor parallelism.</p> <h3 class="relative group"><a id="4-pipeline-parallelism-inter-layer-model-parallelism" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#4-pipeline-parallelism-inter-layer-model-parallelism"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" 
fill="currentColor"></path></svg></span></a> <span>4. Pipeline Parallelism (Inter-layer Model Parallelism)</span></h3> <p data-svelte-h="svelte-1lhkl2x"><a href="https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/neuronx-distributed/pipeline_parallelism_overview.html" rel="nofollow">Pipeline Parallelism</a> splits model layers across different devices.</p> <p data-svelte-h="svelte-a3vdn"><strong>How it works</strong>: Divides your model into stages, with each stage containing consecutive layers running on different devices. Uses microbatching to keep all devices busy.</p> <p data-svelte-h="svelte-1jq8cu1"><strong>Memory savings</strong>: Reduces model parameter memory by <code>1/pipeline_parallel_size</code>.</p> <p data-svelte-h="svelte-1vm5hdt"><strong>When to use</strong>: For very large models that don’t fit even with tensor parallelism, or when you want to scale across many devices with less communication overhead than tensor parallelism.</p> <p data-svelte-h="svelte-4tw4cj"><strong>Typical deployment</strong>: Usually applied across multiple nodes (inter-node) to scale to larger numbers of devices while minimizing high-bandwidth communication requirements.</p> <p data-svelte-h="svelte-l89uy8"><strong>Trade-offs</strong>: Introduces pipeline bubbles (idle time) and requires careful tuning of microbatch sizes.</p> <p data-svelte-h="svelte-1p0ihsg">The good news is that it is possible to combine those techniques, and <code>optimum-neuron</code> makes it very easy!</p> <blockquote class="tip"><p data-svelte-h="svelte-1s96mwn">All the training examples in the optimum-neuron repo use these parallelism features via the <code>NeuronTrainer</code>.</p></blockquote> <h2 class="relative group"><a id="how-to-enable-zero-1" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#how-to-enable-zero-1"><span><svg class="" 
xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>How to enable ZeRO-1?</span></h2> <p data-svelte-h="svelte-1xpk0lv">ZeRO-1 can be enabled either through the <code>NeuronTrainer</code> or directly with the <code>NeuronAccelerator</code>.</p> <h3 class="relative group"><a id="via-the-neurontrainer" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#via-the-neurontrainer"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Via the 
NeuronTrainer</span></h3> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> optimum.neuron <span class="hljs-keyword">import</span> NeuronTrainingArguments, NeuronTrainer | |
| <span class="hljs-comment"># Enable ZeRO-1 in the training arguments</span> | |
| training_args = NeuronTrainingArguments( | |
| output_dir=<span class="hljs-string">"./output"</span>, | |
| per_device_train_batch_size=<span class="hljs-number">1</span>, | |
| zero_1=<span class="hljs-literal">True</span>, <span class="hljs-comment"># Enable ZeRO-1</span> | |
| bf16=<span class="hljs-literal">True</span>, | |
| <span class="hljs-comment"># ... other training arguments</span> | |
| ) | |
| trainer = NeuronTrainer( | |
| model=model, | |
| args=training_args, | |
| train_dataset=train_dataset, | |
| eval_dataset=eval_dataset, | |
| ) | |
| trainer.train()<!-- HTML_TAG_END --></pre></div> <blockquote class="tip"><p data-svelte-h="svelte-nmzr1t">Since the example scripts use the <code>NeuronTrainer</code>, you can enable ZeRO-1 when using them by adding the <code>--zero_1</code> flag to your command line.</p> <p data-svelte-h="svelte-1gkqha7">For example:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->torchrun --nproc_per_node=2 examples/training/qwen3/finetune_qwen3.py \ | |
| --model_name_or_path Qwen/Qwen3-0.6B \ | |
| --dataset_name wikitext \ | |
| --dataset_config_name wikitext-2-raw-v1 \ | |
| --do_train \ | |
| --per_device_train_batch_size 1 \ | |
| --block_size 1024 \ | |
| --bf16 \ | |
| --zero_1 \ | |
| --tensor_parallel_size 2 \ | |
| --output_dir my_training/<!-- HTML_TAG_END --></pre></div></blockquote> <h3 class="relative group"><a id="via-the-neuronaccelerator" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#via-the-neuronaccelerator"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Via the NeuronAccelerator</span></h3> <p data-svelte-h="svelte-106kvj9">When using the <code>NeuronAccelerator</code> directly, you need to create a <code>TrainingNeuronConfig</code> and enable ZeRO-1 separately:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" 
transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> torch.optim <span class="hljs-keyword">import</span> AdamW | |
| <span class="hljs-keyword">from</span> optimum.neuron <span class="hljs-keyword">import</span> NeuronAccelerator | |
| <span class="hljs-keyword">from</span> optimum.neuron.models.training.config <span class="hljs-keyword">import</span> TrainingNeuronConfig | |
| <span class="hljs-comment"># Create the training configuration</span> | |
| trn_config = TrainingNeuronConfig() | |
| <span class="hljs-comment"># Create accelerator with ZeRO-1 enabled</span> | |
| accelerator = NeuronAccelerator( | |
| trn_config=trn_config, | |
| zero_1=<span class="hljs-literal">True</span>, <span class="hljs-comment"># Enable ZeRO-1</span> | |
| mixed_precision=<span class="hljs-string">"bf16"</span>, | |
| ) | |
| model = ... <span class="hljs-comment"># Your model instance</span> | |
| optimizer = AdamW(model.parameters(), lr=<span class="hljs-number">5e-5</span>) | |
| <span class="hljs-comment"># Prepare model and optimizer</span> | |
| model, optimizer = accelerator.prepare(model, optimizer)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="how-to-enable-tensor-parallelism" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#how-to-enable-tensor-parallelism"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>How to enable Tensor Parallelism?</span></h2> <p data-svelte-h="svelte-1r4hhew">Tensor Parallelism can be used with either the <code>NeuronTrainer</code> or <code>NeuronAccelerator</code>.</p> <p data-svelte-h="svelte-v1qtdm"><strong>Important</strong>: Tensor parallelism requires models that have a custom modeling implementation in <code>optimum.neuron.models.training</code>.</p> <p data-svelte-h="svelte-n127re">When doing Tensor Parallelism, you have several important settings:</p> <ol data-svelte-h="svelte-1hoskl8"><li>The <code>tensor_parallel_size</code>: Ideally it should be the smallest value for which the model fits in memory.</li> <li>Whether sequence parallelism should be enabled: <a href="https://arxiv.org/pdf/2205.05198.pdf" rel="nofollow">Sequence parallelism</a> shards the activations on the sequence axis outside of the tensor 
parallel regions, saving memory by sharding the activations.</li></ol> <p data-svelte-h="svelte-11wpmlp">When using distributed training, the training script is called by <code>torchrun</code>, which will dispatch it to workers, one worker per core. Each worker will load the sharded model and dispatch the parameters automatically across the cores. The <code>tensor_parallel_size</code> is the number of workers to shard the model parameters on.</p> <h3 class="relative group"><a id="via-the-neurontrainer" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#via-the-neurontrainer"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Via the NeuronTrainer</span></h3> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 
32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> optimum.neuron <span class="hljs-keyword">import</span> NeuronTrainingArguments, NeuronTrainer | |
| <span class="hljs-comment"># Configure tensor parallelism in training arguments</span> | |
| training_args = NeuronTrainingArguments( | |
| output_dir=<span class="hljs-string">"./output"</span>, | |
| per_device_train_batch_size=<span class="hljs-number">1</span>, | |
| bf16=<span class="hljs-literal">True</span>, | |
| tensor_parallel_size=<span class="hljs-number">8</span>, | |
| <span class="hljs-comment"># ... other training arguments</span> | |
| ) | |
| trainer = NeuronTrainer( | |
| model=model, | |
| args=training_args, | |
| train_dataset=train_dataset, | |
| eval_dataset=eval_dataset, | |
| ) | |
| trainer.train()<!-- HTML_TAG_END --></pre></div> <blockquote class="tip"><p data-svelte-h="svelte-7fhmhn">Since the example scripts use the <code>NeuronTrainer</code>, you can enable Tensor Parallelism when using them by specifying the <code>--tensor_parallel_size</code> argument.</p> <p data-svelte-h="svelte-1gkqha7">For example:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->torchrun --nproc_per_node=8 examples/training/qwen3/finetune_qwen3.py \ | |
| --model_name_or_path Qwen/Qwen3-0.6B \ | |
| --dataset_name wikitext \ | |
| --dataset_config_name wikitext-2-raw-v1 \ | |
| --do_train \ | |
| --per_device_train_batch_size 1 \ | |
| --block_size 1024 \ | |
| --bf16 \ | |
| --tensor_parallel_size 8 \ | |
| --output_dir my_training/<!-- HTML_TAG_END --></pre></div></blockquote> <h3 class="relative group"><a id="via-the-neuronaccelerator" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#via-the-neuronaccelerator"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Via the NeuronAccelerator</span></h3> <p data-svelte-h="svelte-1ncu8vs">When using the <code>NeuronAccelerator</code> directly, you configure tensor parallelism through the <code>TrainingNeuronConfig</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" 
transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> torch.optim <span class="hljs-keyword">import</span> AdamW | |
| <span class="hljs-keyword">from</span> optimum.neuron <span class="hljs-keyword">import</span> NeuronAccelerator | |
| <span class="hljs-keyword">from</span> optimum.neuron.models.training.config <span class="hljs-keyword">import</span> TrainingNeuronConfig | |
| <span class="hljs-comment"># Configure tensor parallelism</span> | |
| trn_config = TrainingNeuronConfig( | |
| tensor_parallel_size=<span class="hljs-number">8</span>, | |
| sequence_parallel_enabled=<span class="hljs-literal">True</span>, | |
| checkpoint_dir=<span class="hljs-literal">None</span>, <span class="hljs-comment"># Can be specified when resuming from checkpoint</span> | |
| ) | |
| accelerator = NeuronAccelerator( | |
| trn_config=trn_config, | |
| mixed_precision=<span class="hljs-string">"bf16"</span>, | |
| ) | |
| model = ... <span class="hljs-comment"># Your model instance</span> | |
| optimizer = AdamW(model.parameters(), lr=<span class="hljs-number">5e-5</span>) | |
| model, optimizer = accelerator.prepare(model, optimizer)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="how-to-enable-pipeline-parallelism" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#how-to-enable-pipeline-parallelism"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>How to enable Pipeline Parallelism?</span></h2> <p data-svelte-h="svelte-1vp0c4m">Pipeline Parallelism allows you to split your model layers across multiple devices, enabling training of very large models that wouldn’t fit on a single device, or even a single node.</p> <p data-svelte-h="svelte-1ytrjb2"><strong>Important</strong>: Pipeline parallelism requires models that have a custom modeling implementation in <code>optimum.neuron.models.training</code> and declare <code>SUPPORTS_PIPELINE_PARALLELISM = True</code>.</p> <h3 class="relative group"><a id="configuration-options" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#configuration-options"><span><svg class="" xmlns="http://www.w3.org/2000/svg" 
xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Configuration Options</span></h3> <p data-svelte-h="svelte-wwttlo">Pipeline parallelism has several configuration parameters:</p> <ul data-svelte-h="svelte-9fwfrb"><li><code>pipeline_parallel_size</code>: Number of pipeline stages (devices to split layers across)</li> <li><code>pipeline_parallel_num_microbatches</code>: Number of microbatches for pipeline scheduling</li> <li>When pipeline parallelism is enabled, ZeRO-1 can be automatically applied to the pipeline parallel optimizer</li></ul> <h3 class="relative group"><a id="via-the-neurontrainer" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#via-the-neurontrainer"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 
43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Via the NeuronTrainer</span></h3> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> optimum.neuron <span class="hljs-keyword">import</span> NeuronTrainingArguments, NeuronTrainer | |
| <span class="hljs-keyword">from</span> optimum.neuron.models.training <span class="hljs-keyword">import</span> LlamaForCausalLM <span class="hljs-comment"># Custom model implementation</span> | |
| <span class="hljs-comment"># Configure pipeline parallelism in training arguments</span> | |
| training_args = NeuronTrainingArguments( | |
| output_dir=<span class="hljs-string">"./output"</span>, | |
| per_device_train_batch_size=<span class="hljs-number">4</span>, <span class="hljs-comment"># Will be split into microbatches</span> | |
| bf16=<span class="hljs-literal">True</span>, | |
| tensor_parallel_size=<span class="hljs-number">2</span>, | |
| pipeline_parallel_size=<span class="hljs-number">4</span>, <span class="hljs-comment"># Split model across 4 pipeline stages</span> | |
| pipeline_parallel_num_microbatches=<span class="hljs-number">4</span>, <span class="hljs-comment"># Number of microbatches</span> | |
| zero_1=<span class="hljs-literal">True</span>, <span class="hljs-comment"># Enable ZeRO-1 with pipeline parallelism</span> | |
| <span class="hljs-comment"># ... other training arguments</span> | |
| ) | |
| <span class="hljs-comment"># Load model using custom implementation - must be done with the model class directly</span> | |
| model = LlamaForCausalLM.from_pretrained( | |
| <span class="hljs-string">"meta-llama/Llama-3.2-3B"</span>, | |
| trn_config=training_args.trn_config <span class="hljs-comment"># Pass the auto-generated trn_config</span> | |
| ) | |
| trainer = NeuronTrainer( | |
| model=model, | |
| args=training_args, | |
| train_dataset=train_dataset, | |
| eval_dataset=eval_dataset, | |
| ) | |
| trainer.train()<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="via-the-neuronaccelerator" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#via-the-neuronaccelerator"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Via the NeuronAccelerator</span></h3> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded 
font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> optimum.neuron <span class="hljs-keyword">import</span> NeuronAccelerator | |
| <span class="hljs-keyword">from</span> optimum.neuron.models.training.config <span class="hljs-keyword">import</span> TrainingNeuronConfig | |
| <span class="hljs-keyword">from</span> optimum.neuron.models.training <span class="hljs-keyword">import</span> LlamaForCausalLM | |
| <span class="hljs-keyword">from</span> torch.optim <span class="hljs-keyword">import</span> AdamW | |
| <span class="hljs-comment"># Configure combined parallelism strategies</span> | |
| trn_config = TrainingNeuronConfig( | |
| tensor_parallel_size=<span class="hljs-number">2</span>, | |
| pipeline_parallel_size=<span class="hljs-number">4</span>, | |
| pipeline_parallel_num_microbatches=<span class="hljs-number">4</span>, | |
| sequence_parallel_enabled=<span class="hljs-literal">True</span>, | |
| ) | |
| accelerator = NeuronAccelerator( | |
| trn_config=trn_config, | |
| zero_1=<span class="hljs-literal">True</span>, <span class="hljs-comment"># Can combine with ZeRO-1</span> | |
| mixed_precision=<span class="hljs-string">"bf16"</span>, | |
| ) | |
| <span class="hljs-comment"># Load model with custom implementation</span> | |
| model = LlamaForCausalLM.from_pretrained( | |
| <span class="hljs-string">"meta-llama/Llama-3.2-3B"</span>, | |
| trn_config=trn_config | |
| ) | |
| optimizer = AdamW(model.parameters(), lr=<span class="hljs-number">5e-5</span>) | |
| model, optimizer = accelerator.prepare(model, optimizer)<!-- HTML_TAG_END --></pre></div> <blockquote class="tip"><p data-svelte-h="svelte-1ypidsl">When using pipeline parallelism, the total number of processes must be a multiple of (and therefore at least) <code>tensor_parallel_size * pipeline_parallel_size</code>. For example, with <code>tensor_parallel_size=2</code> and <code>pipeline_parallel_size=4</code>, you need at least 8 processes; any additional multiples of 8 are used for data parallelism.</p></blockquote> <h2 class="relative group"><a id="combining-parallelism-strategies" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#combining-parallelism-strategies"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Combining Parallelism Strategies</span></h2> <p data-svelte-h="svelte-ktf0yf">You can combine multiple parallelism strategies for maximum memory efficiency and performance. 
Here’s an example with all strategies combined:</p> <h3 class="relative group"><a id="via-the-neurontrainer" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#via-the-neurontrainer"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Via the NeuronTrainer</span></h3> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow 
left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> optimum.neuron <span class="hljs-keyword">import</span> NeuronTrainingArguments, NeuronTrainer | |
| <span class="hljs-keyword">from</span> optimum.neuron.models.training <span class="hljs-keyword">import</span> LlamaForCausalLM | |
| <span class="hljs-comment"># Example: Combine all parallelism strategies</span> | |
| training_args = NeuronTrainingArguments( | |
| output_dir=<span class="hljs-string">"./output"</span>, | |
| per_device_train_batch_size=<span class="hljs-number">32</span>, | |
| bf16=<span class="hljs-literal">True</span>, | |
| gradient_checkpointing=<span class="hljs-literal">True</span>, | |
| <span class="hljs-comment"># ZeRO-1</span> | |
| zero_1=<span class="hljs-literal">True</span>, | |
| <span class="hljs-comment"># Tensor parallelism</span> | |
| tensor_parallel_size=<span class="hljs-number">4</span>, | |
| disable_sequence_parallel=<span class="hljs-literal">False</span>, <span class="hljs-comment"># Enable sequence parallelism</span> | |
| <span class="hljs-comment"># Pipeline parallelism</span> | |
| pipeline_parallel_size=<span class="hljs-number">2</span>, | |
| pipeline_parallel_num_microbatches=<span class="hljs-number">8</span>, | |
| <span class="hljs-comment"># Additional optimizations</span> | |
| fuse_qkv=<span class="hljs-literal">True</span>, <span class="hljs-comment"># Fuse QKV projections for efficiency</span> | |
| kv_size_multiplier=<span class="hljs-literal">None</span>, <span class="hljs-comment"># Auto-calculate optimal KV multiplier</span> | |
| ) | |
| <span class="hljs-comment"># Load model using custom implementation</span> | |
| model = LlamaForCausalLM.from_pretrained( | |
| <span class="hljs-string">"meta-llama/Llama-3.2-3B"</span>, | |
| trn_config=training_args.trn_config | |
| ) | |
| trainer = NeuronTrainer( | |
| model=model, | |
| args=training_args, | |
| train_dataset=train_dataset, | |
| eval_dataset=eval_dataset, | |
| ) | |
| trainer.train()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-e9hr70">Each model replica in this configuration spans 4 * 2 = 8 processes:</p> <ul data-svelte-h="svelte-138kta0"><li>Each tensor parallel group has 4 processes</li> <li>Each pipeline stage runs on one tensor parallel group</li></ul> <p data-svelte-h="svelte-1oatqej">We can then run the training script on the <code>trn1.32xlarge</code> instance with 32 Neuron cores, resulting in the following configuration: <code>dp=4, tp=4, pp=2</code>, which means 4 data-parallel replicas of the model, each split into 2 pipeline stages of 4 tensor-parallel devices (4 * 4 * 2 = 32 cores).</p> <h2 class="relative group"><a id="checkpoint-consolidation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#checkpoint-consolidation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Checkpoint consolidation</span></h2> <p data-svelte-h="svelte-10z9rkn">Since distributed training uses sharded checkpoints across different workers, you need to consolidate them to create a standard model checkpoint that can be shared and used outside of the specific training configuration.</p> <p data-svelte-h="svelte-24042q">The Optimum CLI provides a 
way of doing that very easily via the <code>optimum-cli neuron consolidate</code> command:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->optimum-cli neuron consolidate --<span class="hljs-built_in">help</span> | |
| usage: optimum-cli neuron consolidate [-h] [-f {pytorch,safetensors}] checkpoint_dir output_dir | |
| positional arguments: | |
| checkpoint_dir The path to the directory containing the checkpoints. | |
| output_dir The path to the output directory containing the consolidated checkpoint. | |
| optional arguments: | |
| -h, --<span class="hljs-built_in">help</span> show this <span class="hljs-built_in">help</span> message and <span class="hljs-built_in">exit</span> | |
| -f {pytorch,safetensors}, --format {pytorch,safetensors} | |
| The format used to save the consolidated checkpoint.<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-7l8i2j">All you need to do is specify the sharded checkpoints directory and the output directory that will contain the consolidated checkpoints, and the command takes care of the rest. | |
| It is also possible to specify the output format of the consolidated checkpoints. By default it will export them to the <code>safetensors</code> format, which is the recommended format to use.</p> <p data-svelte-h="svelte-11lpom8">Example:</p> <p data-svelte-h="svelte-15ioqnc">Training with distributed parallelism just completed and the output dir is called <code>my_training</code>. The directory looks like the following:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->my_training/ | |
| ├── README.md | |
| ├── all_results.json | |
| ├── checkpoint-10 | |
| │ ├── config.json | |
| │ ├── scheduler.pt | |
| │ ├── special_tokens_map.json | |
| │ ├── shards/ | |
| │ ├── tokenizer.json | |
| │ ├── tokenizer.model | |
| │ ├── tokenizer_config.json | |
| │ ├── trainer_state.json | |
| │ └── training_args.bin | |
| ├── config.json | |
| ├── special_tokens_map.json | |
| ├── shards/ | |
| │ ├── tp_rank_00_pp_rank_00 | |
| │ ├── tp_rank_01_pp_rank_00 | |
| │ ├── tp_rank_02_pp_rank_00 | |
| │ ├── tp_rank_03_pp_rank_00 | |
| │ ├── tp_rank_00_pp_rank_01 | |
| │ ├── tp_rank_01_pp_rank_01 | |
| │ ├── tp_rank_02_pp_rank_01 | |
| │ └── tp_rank_03_pp_rank_01 | |
| ├── tokenizer.json | |
| ├── tokenizer.model | |
| ├── tokenizer_config.json | |
| ├── train_results.json | |
| ├── trainer_state.json | |
| ├── training_args.bin | |
| └── trn_config.json<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1e8fv74">You can consolidate the sharded checkpoints in <code>my_training/shards</code>, which correspond to the sharded checkpoints saved at the end of training, by running the following command:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->optimum-cli neuron consolidate my_training my_training_consolidated_checkpoint<!-- HTML_TAG_END --></pre></div> <blockquote class="tip"><p data-svelte-h="svelte-1sydsko">The sharded checkpoints are saved under a directory called <code>shards</code>. 
The <code>optimum-cli neuron consolidate</code> command accepts as input either a directory that contains a <code>shards</code> directory or the <code>shards</code> directory itself.</p></blockquote> <h2 class="relative group"><a id="best-practices" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#best-practices"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Best Practices</span></h2> <h3 class="relative group"><a id="choosing-parallelism-strategy" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#choosing-parallelism-strategy"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 
79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Choosing Parallelism Strategy</span></h3> <ol data-svelte-h="svelte-1j56tw2"><li><strong>Start with Tensor Parallelism</strong>: Use the smallest <code>tensor_parallel_size</code> that fits your model in memory</li> <li><strong>Add Pipeline Parallelism</strong>: For very large models, combine with pipeline parallelism</li> <li><strong>Enable Sequence Parallelism</strong>: Always enable when using tensor parallelism for memory savings (set <code>disable_sequence_parallel=False</code>)</li> <li><strong>Use ZeRO-1</strong>: Combine with any parallelism strategy for optimizer memory savings</li></ol> <h3 class="relative group"><a id="memory-optimization" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#memory-optimization"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Memory Optimization</span></h3> <ul 
data-svelte-h="svelte-pjt7c2"><li>Enable <code>gradient_checkpointing</code> for large models</li> <li>Set appropriate <code>pipeline_parallel_num_microbatches</code> for pipeline parallelism</li></ul> <h2 class="relative group"><a id="troubleshooting" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#troubleshooting"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Troubleshooting</span></h2> <h3 class="relative group"><a id="common-issues" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#common-issues"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 
0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Common Issues</span></h3> <ol data-svelte-h="svelte-j5qw5u"><li><strong>Out of Memory</strong>: Reduce batch size, increase parallelism, or enable gradient checkpointing</li> <li><strong>Model Not Supported</strong>: Ensure you’re using a model from <code>optimum.neuron.models.training</code></li> <li><strong>Pipeline Parallelism Fails</strong>: Check that the model supports pipeline parallelism</li> <li><strong>Incorrect Process Count</strong>: Ensure <code>nproc_per_node</code> matches your parallelism configuration</li></ol> <h3 class="relative group"><a id="debugging-tips" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#debugging-tips"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Debugging Tips</span></h3> <ul data-svelte-h="svelte-jjqy14"><li>Start with smaller models and parallelism sizes</li> <li>Check that all processes can communicate 
properly</li> <li>Verify checkpoint directories and permissions</li> <li>Monitor Neuron device utilization</li></ul> <p></p> | |
| <script> | |
| { | |
| __sveltekit_89oqon = { | |
| assets: "/docs/optimum.neuron/main/en", | |
| base: "/docs/optimum.neuron/main/en", | |
| env: {} | |
| }; | |
| const element = document.currentScript.parentElement; | |
| const data = [null,null]; | |
| Promise.all([ | |
| import("/docs/optimum.neuron/main/en/_app/immutable/entry/start.e7cdb183.js"), | |
| import("/docs/optimum.neuron/main/en/_app/immutable/entry/app.c5810efa.js") | |
| ]).then(([kit, app]) => { | |
| kit.start(app, element, { | |
| node_ids: [0, 11], | |
| data, | |
| form: null, | |
| error: null | |
| }); | |
| }); | |
| } | |
| </script> | |
Xet Storage Details
- Size:
- 78.7 kB
- Xet hash:
- 69531f38a71de458c2e3110958a04a90183b652fcfe5ebed7b98de1bed9d0068
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.