Buckets:

hf-doc-build
/

doc

Files

xet

hf-doc-build/doc / optimum-neuron /v0.2.0.dev2 /en /guides /distributed_training.html

rtrm

about 1 month ago

download

raw

41.4 kB

	<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{"title":"Distributed Training with optimum-neuron","local":"distributed-training-with-optimum-neuron","sections":[{"title":"How to enable ZeRO-1?","local":"how-to-enable-zero-1","sections":[{"title":"Via the NeuronTrainer","local":"via-the-neurontrainer","sections":[],"depth":3},{"title":"Via the NeuronAccelerator","local":"via-the-neuronaccelerator","sections":[],"depth":3}],"depth":2},{"title":"How to enable Tensor Parallelism?","local":"how-to-enable-tensor-parallelism","sections":[{"title":"Via the NeuronTrainer","local":"via-the-neurontrainer","sections":[],"depth":3},{"title":"Via the NeuronAccelerator","local":"via-the-neuronaccelerator","sections":[],"depth":3},{"title":"Checkpoint consolidation","local":"checkpoint-consolidation","sections":[],"depth":3}],"depth":2}],"depth":1}">
	<link href="/docs/optimum.neuron/v0.2.0.dev2/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
	<link rel="modulepreload" href="/docs/optimum.neuron/v0.2.0.dev2/en/_app/immutable/entry/start.c3692dcd.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/v0.2.0.dev2/en/_app/immutable/chunks/scheduler.85c25b89.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/v0.2.0.dev2/en/_app/immutable/chunks/singletons.bf318e21.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/v0.2.0.dev2/en/_app/immutable/chunks/paths.4f2bc42b.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/v0.2.0.dev2/en/_app/immutable/entry/app.40ef12d9.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/v0.2.0.dev2/en/_app/immutable/chunks/index.c9bcf812.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/v0.2.0.dev2/en/_app/immutable/nodes/0.8386078c.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/v0.2.0.dev2/en/_app/immutable/chunks/each.e59479a4.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/v0.2.0.dev2/en/_app/immutable/nodes/7.d70ff40b.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/v0.2.0.dev2/en/_app/immutable/chunks/Tip.d8f753fa.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/v0.2.0.dev2/en/_app/immutable/chunks/CodeBlock.c004bd26.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/v0.2.0.dev2/en/_app/immutable/chunks/index.9790a2b6.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Distributed Training with optimum-neuron","local":"distributed-training-with-optimum-neuron","sections":[{"title":"How to enable ZeRO-1?","local":"how-to-enable-zero-1","sections":[{"title":"Via the NeuronTrainer","local":"via-the-neurontrainer","sections":[],"depth":3},{"title":"Via the NeuronAccelerator","local":"via-the-neuronaccelerator","sections":[],"depth":3}],"depth":2},{"title":"How to enable Tensor Parallelism?","local":"how-to-enable-tensor-parallelism","sections":[{"title":"Via the NeuronTrainer","local":"via-the-neurontrainer","sections":[],"depth":3},{"title":"Via the NeuronAccelerator","local":"via-the-neuronaccelerator","sections":[],"depth":3},{"title":"Checkpoint consolidation","local":"checkpoint-consolidation","sections":[],"depth":3}],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="distributed-training-with-optimum-neuron" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#distributed-training-with-optimum-neuron"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Distributed Training with optimum-neuron</span></h1> <p data-svelte-h="svelte-1cdv2fm"><a href="https://aws.amazon.com/machine-learning/trainium/" rel="nofollow">AWS Trainium instances</a> are great to train models. They can contain up to 16 Neuron devices, each device containing 2 Neuron cores and has 32GB of memory (16GB per core). For example a <code>trn1.32xlarge</code> instance has 32 x 16 = 512GB of memory.</p> <p data-svelte-h="svelte-trqv6g">But there is a caveat: each Neuron core is an independent data-parallel worker by default. It means that the model, the gradient state and the optimizer state, amounting to approximately 4 times the model size, must fit in each of the Neuron cores (16GB) to be able to train. If that is the case, then the activations must also fit in the remaining memory.</p> <p data-svelte-h="svelte-1rlk2kz">To alleviate that, <code>optimum-neuron</code> supports parallelism features enabling you to harness the full power of your Trainium instance:</p> <ol data-svelte-h="svelte-pdytp0"><li><a href="https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuronx/tutorials/training/zero1_gpt2.html" rel="nofollow">ZeRO-1</a>: It is an optimization of data-parallelism which consists in sharding the optimizer state (which usually represents half of the memory needed on the device) over the data-parallel ranks.</li> <li><a href="https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/neuronx-distributed/tensor_parallelism_overview.html" rel="nofollow">Tensor Parallelism</a>: It is a technique which consists in sharding each of your model matrix-multiplications along a given axis (row or column) on multiple devices. It also known as intra-layer model parallelism. The number of devices to shard your parameters on is called the <code>tensor_parallel_size</code>.</li> <li><a href="https://arxiv.org/pdf/2205.05198.pdf" rel="nofollow">Sequence parallelism</a>: It is an optimization over Tensor Parallelism which shards the activations on the sequence axis outside of the tensor parallel regions. It is useful because it saves memory by sharding the activations.</li> <li><a href="https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/neuronx-distributed/pipeline_parallelism_overview.html" rel="nofollow">Pipeline Parallelism</a>: It consists in sharding the model block layers on multiple devices. It is also known as inter-layer model parallelism. The number of devices to shard your layers on is called the <code>pipeline_parallel_size</code>.</li></ol> <p data-svelte-h="svelte-ogaike">The good news is that is it possible to combine those techniques, and <code>optimum-neuron</code> makes it very easy!</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-18pumv8">All the example scripts provided in the optimum-neuron repo have those features implemented via the <code>NeuronTrainer</code>.</p></div> <h2 class="relative group"><a id="how-to-enable-zero-1" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#how-to-enable-zero-1"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>How to enable ZeRO-1?</span></h2> <p data-svelte-h="svelte-1gw40d0">Whether you use the <a href="/docs/optimum.neuron/v0.2.0.dev2/en/package_reference/trainer#optimum.neuron.NeuronTrainer">NeuronTrainer</a> or decide to have your own training script that uses the <code>NeuronAccelerator</code>, it is very easy to enable the ZeRO-1 optimization.</p> <h3 class="relative group"><a id="via-the-neurontrainer" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#via-the-neurontrainer"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Via the NeuronTrainer</span></h3> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> optimum.neuron <span class="hljs-keyword">import</span> NeuronTrainingArguments, NeuronTrainer

	<span class="hljs-comment"># To enable ZeRO-1, set the `zero_1` argument to `True` in the training arguments.</span>
	training_args = NeuronTrainingArguments(
	...,
	zero_1=<span class="hljs-literal">True</span>,
	)

	trainer = NeuronTrainer(
	model,
	args=training_args,
	train_dataset=train_dataset,
	eval_dataset=eval_dataset,
	)

	trainer.train()<!-- HTML_TAG_END --></pre></div> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1971m31">Since the example scripts use the <code>NeuronTrainer</code>, you can enable ZeRO-1 when using them by add the <code>--zero_1</code> flag
	to your command line.</p> <p data-svelte-h="svelte-1gkqha7">For example:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->torchrun --nproc_per_node=2 examples/language-modeling/run_clm.py \
	--model_name_or_path TinyLlama/TinyLlama-1.1B-Chat-v0.6 \
	--dataset_name wikitext \
	--dataset_config_name wikitext-2-raw-v1 \
	--do_train \
	--per_device_train_batch_size 1 \
	--block_size 1024 \
	--bf16 \
	--zero_1 \
	--output_dir my_training/
	<!-- HTML_TAG_END --></pre></div></div> <h3 class="relative group"><a id="via-the-neuronaccelerator" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#via-the-neuronaccelerator"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Via the NeuronAccelerator</span></h3> <p data-svelte-h="svelte-la2f2b">There is a little bit more work to do when not using the <code>NeuronTrainer</code>:</p> <ol data-svelte-h="svelte-6khx73"><li>(Optional) Wrap the optimizer class to make it lazy. When ZeRO-1 is enabled the original optimizer is overridden to use a sharded version of it. Hence, it is possible to load the original optimizer lazily so that the optimizer state is not materialized until it is actually sharded.</li></ol> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> torch.optim <span class="hljs-keyword">import</span> AdamW
	<span class="hljs-keyword">from</span> optimum.neuron.distributed <span class="hljs-keyword">import</span> make_optimizer_constructor_lazy

	lazy_adamw = make_optimizer_constructor_lazy(AdamW)<!-- HTML_TAG_END --></pre></div> <ol start="2" data-svelte-h="svelte-1ufjyew"><li>Set the <code>zero_1</code> argument to <code>True</code> when instantiating the <code>NeuronAccelerator</code>.</li></ol> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->accelerator = NeuronAccelerator(
	...
	zero_1=<span class="hljs-literal">True</span>,
	)

	model = ...
	lazy_optimizer = lazy_adamw(...) <span class="hljs-comment"># Actually instantiate the optimizer.</span>


	model, optimizer = accelerator.prepare(model, lazy_optimizer)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="how-to-enable-tensor-parallelism" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#how-to-enable-tensor-parallelism"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>How to enable Tensor Parallelism?</span></h2> <p data-svelte-h="svelte-15kw5do">Just as for ZeRO-1, it is possible to apply Tensor Parallelism either with the <a href="/docs/optimum.neuron/v0.2.0.dev2/en/package_reference/trainer#optimum.neuron.NeuronTrainer">NeuronTrainer</a> or the <code>NeuronAccelerator</code>.</p> <p data-svelte-h="svelte-c14p9h">When doing Tensor Parallelism, you have different settings:</p> <ol data-svelte-h="svelte-gly1sh"><li>The <code>tensor_parallel_size</code>. Ideally it should be smallest value for which the model fits.</li> <li>Whether or not sequence parallelism should be enabled. <a href="https://arxiv.org/pdf/2205.05198.pdf" rel="nofollow">Sequence parallelism</a> shards the activations on the sequence axis outside of the tensor parallel regions.
	It is useful because it saves memory by sharding the activations.</li> <li>Whether or not parallelization of the embedding layer should be done. By default it is done because it offers multiple benefits:</li></ol> <ul data-svelte-h="svelte-19rx9yl"><li>Parallelizing the embedding layer saves memory, which can enable fitting a bigger batch size and/or sequence length.</li> <li>For language models, where the embedding layer weights and the language-modeling head weights are usually tied, the language-modeling head ends up parallel
	and does not require to <code>all-gather</code> its output since it is fed to a cross entropy loss compatible with parallelism, saving expensive communication.</li></ul> <p data-svelte-h="svelte-v8xmeb">On top of that, it is very important to make sure that the original model is loaded in an efficient manner: the training script is going to be called by <code>torchrun</code>, which will dispatch it to workers, one worker per core. If each worker (there are 32 of them in a <code>trn1.32xlarge</code> instance) loads the full model weights, it can take a lot of time and go out-of-memory really fast.</p> <p data-svelte-h="svelte-1ne635r"><code>optimum-neuron</code> provides a context-manager <a href="/docs/optimum.neuron/v0.2.0.dev2/en/package_reference/distributed#optimum.neuron.distributed.lazy_load_for_parallelism">distributed.lazy_load_for_parallelism()</a> that loads the model lazily to prevent that, only the parameters of the corresponding model shard will be materialized in each worker.</p> <h3 class="relative group"><a id="via-the-neurontrainer" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#via-the-neurontrainer"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Via the NeuronTrainer</span></h3> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> optimum.neuron <span class="hljs-keyword">import</span> NeuronTrainingArguments, NeuronTrainer
	<span class="hljs-keyword">from</span> optimum.neuron.distributed <span class="hljs-keyword">import</span> lazy_load_for_parallelism

	<span class="hljs-comment"># Specify the `tensor_parallel_size` in the training arguments.</span>
	training_args = NeuronTrainingArguments(
	...,
	tensor_parallel_size=<span class="hljs-number">8</span>,
	disable_embedding_parallelization=<span class="hljs-literal">False</span>, <span class="hljs-comment"># It is `False` by default.</span>
	disable_sequence_parallel=<span class="hljs-literal">False</span>, <span class="hljs-comment"># It is `False` by default.</span>
	)

	<span class="hljs-keyword">with</span> lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size):
	model = ...


	trainer = NeuronTrainer(
	model,
	args=training_args,
	train_dataset=train_dataset,
	eval_dataset=eval_dataset,
	)

	trainer.train()<!-- HTML_TAG_END --></pre></div> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-ar8hdp">Since the example scripts use the <code>NeuronTrainer</code>, you can enable Tensor Parallelism when using them by specifying the <code>--tensor_parallel_size</code> argument, and optionally the <code>disable_embedding_parallelization</code> and <code>disable_sequence_parallel</code> flags.
	to your command line.</p> <p data-svelte-h="svelte-1gkqha7">For example:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->torchrun --nproc_per_node=2 examples/language-modeling/run_clm.py \
	--model_name_or_path TinyLlama/TinyLlama-1.1B-Chat-v0.6 \
	--dataset_name wikitext \
	--dataset_config_name wikitext-2-raw-v1 \
	--do_train \
	--per_device_train_batch_size 1 \
	--block_size 1024 \
	--bf16 \
	--tensor_parallel_size 2 \
	--output_dir my_training/<!-- HTML_TAG_END --></pre></div></div> <h3 class="relative group"><a id="via-the-neuronaccelerator" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#via-the-neuronaccelerator"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Via the NeuronAccelerator</span></h3> <p data-svelte-h="svelte-1gks0nu">Just as for ZeRO-1, it is possible to wrap the optimizer class to make it lazy. Since the model parameters are going to be sharded, it is not needed to materialize the optimizer state prior to model parallelization: the wrapper makes sure that it stays unmaterialized.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> torch.optim <span class="hljs-keyword">import</span> AdamW
	<span class="hljs-keyword">from</span> optimum.neuron <span class="hljs-keyword">import</span> NeuronAccelerator
	<span class="hljs-keyword">from</span> optimum.neuron.accelerate.utils <span class="hljs-keyword">import</span> ModelParallelismPlugin
	<span class="hljs-keyword">from</span> optimum.neuron.distributed <span class="hljs-keyword">import</span> lazy_load_for_parallelism

	tensor_parallel_size = <span class="hljs-number">8</span>
	mp_plugin = ModelParallelismPlugin(
	tensor_parallel_size,
	parallelize_embeddings=<span class="hljs-literal">True</span>,
	sequence_parallel_enabled=<span class="hljs-literal">True</span>,
	checkpoint_dir=<span class="hljs-literal">None</span>, <span class="hljs-comment"># Can be specified when resuming from checkpoint.</span>
	)

	accelerator = NeuronAccelerator(
	...
	mp_plugin=mp_plugin,
	)

	<span class="hljs-keyword">with</span> lazy_load_for_parallelism(tensor_parallel_size=tensor_parallel_size):
	model = ...

	lazy_adamw = make_optimizer_constructor_lazy(AdamW)
	lazy_optimizer = lazy_adamw(...) <span class="hljs-comment"># Actually instantiate the optimizer.</span>

	model, optimizer = accelerator.prepare(model, lazy_optimizer)<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="checkpoint-consolidation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#checkpoint-consolidation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Checkpoint consolidation</span></h3> <p data-svelte-h="svelte-bc8xo9">Since Tensor Parallelism consists in sharding the model weights accross different workers, only sharded checkpoints will be saved during training. It is necessary to consolidate the sharded checkpoints to be able to share and use them outside of the specific training configuration there were created under.</p> <p data-svelte-h="svelte-24042q">The Optimum CLI provides a way of doing that very easily via the <code>optimum neuron consolidate</code> command:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->optimum-cli neuron consolidate --help

	usage: optimum-cli neuron consolidate [-h] [-f {pytorch,safetensors}] checkpoint_dir output_dir

	positional arguments:
	checkpoint_dir The path to the directory containing the checkpoints.
	output_dir The path to the output directory containing the consolidated checkpoint.

	optional arguments:
	-h, --help show this help message and <span class="hljs-keyword">exit</span>
	-f {pytorch,safetensors}, --format {pytorch,safetensors}
	The format used to save the consolidated checkpoint.
	<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1xib7ue">All you need to do is specify the sharded checkpoints directory and the output directory that will contain the consolidated checkpoints, and the command takes care of the rest.
	It is also possible to specify the output format of the consolidated checkpoints, by default it will export them to the <code>safetensors</code> format, which is the recommend format to use.</p> <p data-svelte-h="svelte-11lpom8">Example:</p> <p data-svelte-h="svelte-u3s5ug">Training with Tensor Parallelism just completed and the output dir is called <code>my_training</code>. The directory looks like the following:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->my_training/
	├── README.md
	├── all_results.json
	├── checkpoint-10
	│ ├── config.json
	│ ├── scheduler.pt
	│ ├── special_tokens_map.json
	│ ├── tensor_parallel_shards
	│ ├── tokenizer.json
	│ ├── tokenizer.model
	│ ├── tokenizer_config.json
	│ ├── trainer_state.json
	│ └── training_args.bin
	├── config.json
	├── special_tokens_map.json
	├── tensor_parallel_shards
	│ ├── tp_rank_00_pp_rank_00
	│ ├── tp_rank_01_pp_rank_00
	│ ├── tp_rank_02_pp_rank_00
	│ ├── tp_rank_03_pp_rank_00
	│ ├── tp_rank_04_pp_rank_00
	│ ├── tp_rank_05_pp_rank_00
	│ ├── tp_rank_06_pp_rank_00
	│ └── tp_rank_07_pp_rank_00
	├── tokenizer.json
	├── tokenizer.model
	├── tokenizer_config.json
	├── train_results.json
	├── trainer_state.json
	└── training_args.bin<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-14t9mzj">It is possible to consolidate the sharded checkpoints in <code>my_training/tensor_parallel_shards</code>, which correspond to the sharded checkpoints saved at the end of the training, by running the following command:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->optimum-cli neuron consolidate my_training my_training_consolidated_checkpoint<!-- HTML_TAG_END --></pre></div> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1rjmyzp">The sharded checkpoints are saved under a directory called <code>tensor_parallel_shards</code>. The <code>optimum-cli neuron consolidate</code> command accept as input both a directory that contains a <code>tensor_parallel_shards</code> directory, or the <code>tensor_parallel_shards</code> directory itself.</p></div> <p></p>

	<script>
	{
	__sveltekit_1eimgpm = {
	assets: "/docs/optimum.neuron/v0.2.0.dev2/en",
	base: "/docs/optimum.neuron/v0.2.0.dev2/en",
	env: {}
	};

	const element = document.currentScript.parentElement;

	const data = [null,null];

	Promise.all([
	import("/docs/optimum.neuron/v0.2.0.dev2/en/_app/immutable/entry/start.c3692dcd.js"),
	import("/docs/optimum.neuron/v0.2.0.dev2/en/_app/immutable/entry/app.40ef12d9.js")
	]).then(([kit, app]) => {
	kit.start(app, element, {
	node_ids: [0, 7],
	data,
	form: null,
	error: null
	});
	});
	}
	</script>

Xet Storage Details

Size:: 41.4 kB
Xet hash:: 5084acec05303acefe7907175c47f7db666f63c1783becd23a49ffd6aa0bc96f

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.