Buckets:

hf-doc-build
/

doc-dev

Files

xet

hf-doc-build/doc-dev / accelerate /pr_4021 /en /usage_guides /gradient_accumulation.html

HuggingFaceDocBuilder

2 months ago

download

raw

69.5 kB

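	<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Performing gradient accumulation with Accelerate&quot;,&quot;local&quot;:&quot;performing-gradient-accumulation-with-accelerate&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Converting it to Accelerate&quot;,&quot;local&quot;:&quot;converting-it-to-accelerate&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Letting Accelerate handle gradient accumulation&quot;,&quot;local&quot;:&quot;letting-accelerate-handle-gradient-accumulation&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;The finished code&quot;,&quot;local&quot;:&quot;the-finished-code&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Self-contained example&quot;,&quot;local&quot;:&quot;self-contained-example&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Gradient accumulation on training samples of variable size&quot;,&quot;local&quot;:&quot;gradient-accumulation-on-training-samples-of-variable-size&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Skeleton code&quot;,&quot;local&quot;:&quot;skeleton-code&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Self-contained causal LM example&quot;,&quot;local&quot;:&quot;self-contained-causal-lm-example&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;To go further:&quot;,&quot;local&quot;:&quot;to-go-further&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2}],&quot;depth&quot;:1}">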
	<link href="/docs/accelerate/pr_4021/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
	<link rel="modulepreload" href="/docs/accelerate/pr_4021/en/_app/immutable/entry/start.8a49e72b.js">
	<link rel="modulepreload" href="/docs/accelerate/pr_4021/en/_app/immutable/chunks/scheduler.b9285784.js">
	<link rel="modulepreload" href="/docs/accelerate/pr_4021/en/_app/immutable/chunks/singletons.7547c222.js">
	<link rel="modulepreload" href="/docs/accelerate/pr_4021/en/_app/immutable/chunks/index.6d423e5c.js">
	<link rel="modulepreload" href="/docs/accelerate/pr_4021/en/_app/immutable/chunks/paths.d42c9205.js">
	<link rel="modulepreload" href="/docs/accelerate/pr_4021/en/_app/immutable/entry/app.1df4d18e.js">
	<link rel="modulepreload" href="/docs/accelerate/pr_4021/en/_app/immutable/chunks/preload-helper.b0bd19d1.js">
	<link rel="modulepreload" href="/docs/accelerate/pr_4021/en/_app/immutable/chunks/index.26bc89a1.js">
	<link rel="modulepreload" href="/docs/accelerate/pr_4021/en/_app/immutable/nodes/0.0e7c56e8.js">
	<link rel="modulepreload" href="/docs/accelerate/pr_4021/en/_app/immutable/chunks/each.e59479a4.js">
	<link rel="modulepreload" href="/docs/accelerate/pr_4021/en/_app/immutable/nodes/48.7cdf601e.js">
	<link rel="modulepreload" href="/docs/accelerate/pr_4021/en/_app/immutable/chunks/Tip.e4eba3d6.js">
	<link rel="modulepreload" href="/docs/accelerate/pr_4021/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.7a0ae628.js">
	<link rel="modulepreload" href="/docs/accelerate/pr_4021/en/_app/immutable/chunks/CodeBlock.844ff9c3.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Performing gradient accumulation with Accelerate","local":"performing-gradient-accumulation-with-accelerate","sections":[{"title":"Converting it to Accelerate","local":"converting-it-to-accelerate","sections":[],"depth":2},{"title":"Letting Accelerate handle gradient accumulation","local":"letting-accelerate-handle-gradient-accumulation","sections":[],"depth":2},{"title":"The finished code","local":"the-finished-code","sections":[],"depth":2},{"title":"Self-contained example","local":"self-contained-example","sections":[],"depth":2},{"title":"Gradient accumulation on training samples of variable size","local":"gradient-accumulation-on-training-samples-of-variable-size","sections":[{"title":"Skeleton code","local":"skeleton-code","sections":[],"depth":3},{"title":"Self-contained causal LM example","local":"self-contained-causal-lm-example","sections":[],"depth":3},{"title":"To go further:","local":"to-go-further","sections":[],"depth":3}],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" 
focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="performing-gradient-accumulation-with-accelerate" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#performing-gradient-accumulation-with-accelerate"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 
28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Performing gradient accumulation with Accelerate</span></h1> <p data-svelte-h="svelte-1762c2w">Gradient accumulation is a technique where you can train on bigger batch sizes than
	your machine would normally be able to fit into memory. This is done by accumulating gradients over
	several batches, and only stepping the optimizer after a certain number of batches have been performed.</p> <p data-svelte-h="svelte-1u2vc4p">While technically standard gradient accumulation code would work fine in a distributed setup, it is not the most efficient
	method for doing so and you may experience considerable slowdowns!</p> <p data-svelte-h="svelte-dgpyx1">In this tutorial you will see how to quickly set up gradient accumulation and perform it with the utilities provided in Accelerate,
	which can total to adding just one new line of code!</p> <p data-svelte-h="svelte-n9bxrd">This example will use a very simplistic PyTorch training loop that performs gradient accumulation every two batches:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->device = <span class="hljs-string">"cuda"</span>
	model.to(device)

	gradient_accumulation_steps = <span class="hljs-number">2</span>

	<span class="hljs-keyword">for</span> index, batch <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(training_dataloader):
	inputs, targets = batch
	inputs = inputs.to(device)
	targets = targets.to(device)
	outputs = model(inputs)
	loss = loss_function(outputs, targets)
	loss = loss / gradient_accumulation_steps
	loss.backward()
	<span class="hljs-keyword">if</span> (index + <span class="hljs-number">1</span>) % gradient_accumulation_steps == <span class="hljs-number">0</span>:
	optimizer.step()
	scheduler.step()
	optimizer.zero_grad()<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="converting-it-to-accelerate" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#converting-it-to-accelerate"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Converting it to Accelerate</span></h2> <p data-svelte-h="svelte-1cm6g3v">First the code shown earlier will be converted to utilize Accelerate without the special gradient accumulation helper:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect 
fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-addition">+ from accelerate import Accelerator</span>
	<span class="hljs-addition">+ accelerator = Accelerator()</span>

	<span class="hljs-addition">+ model, optimizer, training_dataloader, scheduler = accelerator.prepare(</span>
	<span class="hljs-addition">+ model, optimizer, training_dataloader, scheduler</span>
	<span class="hljs-addition">+ )</span>

	for index, batch in enumerate(training_dataloader):
	inputs, targets = batch
	<span class="hljs-deletion">- inputs = inputs.to(device)</span>
	<span class="hljs-deletion">- targets = targets.to(device)</span>
	outputs = model(inputs)
	loss = loss_function(outputs, targets)
	loss = loss / gradient_accumulation_steps
	<span class="hljs-addition">+ accelerator.backward(loss)</span>
	if (index+1) % gradient_accumulation_steps == 0:
	optimizer.step()
	scheduler.step()
	optimizer.zero_grad()<!-- HTML_TAG_END --></pre></div> <blockquote class="warning"><p data-svelte-h="svelte-s0dec7">In its current state, this code is not going to perform gradient accumulation efficiently due to a process called gradient synchronization. Read more about that in the <a href="../concept_guides/gradient_synchronization">Concepts tutorial</a>!</p></blockquote> <h2 class="relative group"><a id="letting-accelerate-handle-gradient-accumulation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#letting-accelerate-handle-gradient-accumulation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Letting Accelerate handle gradient accumulation</span></h2> <p data-svelte-h="svelte-168h8ei">All that is left now is to let Accelerate handle the gradient accumulation for us. To do so you should pass in a <code>gradient_accumulation_steps</code> parameter to <a href="/docs/accelerate/pr_4021/en/package_reference/accelerator#accelerate.Accelerator">Accelerator</a>, dictating the number
	of steps to perform before each call to <code>step()</code> and how to automatically adjust the loss during the call to <a href="/docs/accelerate/pr_4021/en/package_reference/accelerator#accelerate.Accelerator.backward">backward()</a>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --> from accelerate import Accelerator
	<span class="hljs-deletion">- accelerator = Accelerator()</span>
	<span class="hljs-addition">+ accelerator = Accelerator(gradient_accumulation_steps=2)</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1xgkhl7">Alternatively, you can pass in a <code>gradient_accumulation_plugin</code> parameter to the <a href="/docs/accelerate/pr_4021/en/package_reference/accelerator#accelerate.Accelerator">Accelerator</a> object’s <code>__init__</code>, which will allow you to further customize the gradient accumulation behavior.
	Read more about that in the <a href="../package_reference/accelerator#accelerate.utils.GradientAccumulationPlugin">GradientAccumulationPlugin</a> docs.</p> <p data-svelte-h="svelte-4k1b8r">From here you can use the <a href="/docs/accelerate/pr_4021/en/package_reference/accelerator#accelerate.Accelerator.accumulate">accumulate()</a> context manager from inside your training loop to automatically perform the gradient accumulation for you!
	You just wrap it around the entire training part of our code:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-deletion">- for index, batch in enumerate(training_dataloader):</span>
	<span class="hljs-addition">+ for batch in training_dataloader:</span>
	<span class="hljs-addition">+ with accelerator.accumulate(model):</span>
	inputs, targets = batch
	outputs = model(inputs)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1mcvl9w">You can remove all the special checks for the step number and the loss adjustment:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-deletion">- loss = loss / gradient_accumulation_steps</span>
	accelerator.backward(loss)
	<span class="hljs-deletion">- if (index+1) % gradient_accumulation_steps == 0:</span>
	optimizer.step()
	scheduler.step()
	optimizer.zero_grad()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1pcce2l">As you can see the <a href="/docs/accelerate/pr_4021/en/package_reference/accelerator#accelerate.Accelerator">Accelerator</a> is able to keep track of the batch number you are on and it will automatically know whether to step through the prepared optimizer and how to adjust the loss.</p> <blockquote class="tip"><p data-svelte-h="svelte-1jsd87o">Typically with gradient accumulation, you would need to adjust the number of steps to reflect the change in total batches you are
	training on. Accelerate automagically does this for you by default. Behind the scenes we instantiate a <code>GradientAccumulationPlugin</code> configured to do this.</p></blockquote> <blockquote class="warning"><p data-svelte-h="svelte-4ir9hq">The <a href="/docs/accelerate/pr_4021/en/package_reference/state#accelerate.state.GradientState">state.GradientState</a> is sync’d with the active dataloader being iterated upon. As such it assumes naively that when we have reached the end of the dataloader everything will sync and a step will be performed. To disable this, set <code>sync_with_dataloader</code> to be <code>False</code> in the <code>GradientAccumulationPlugin</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> accelerate import Accelerator
	<span class="hljs-keyword">from</span> accelerate.utils import GradientAccumulationPlugin

	plugin = GradientAccumulationPlugin(<span class="hljs-attribute">sync_with_dataloader</span>=<span class="hljs-literal">False</span>)
	accelerator = Accelerator(<span class="hljs-built_in">..</span>., <span class="hljs-attribute">gradient_accumulation_plugin</span>=plugin)<!-- HTML_TAG_END --></pre></div></blockquote> <h2 class="relative group"><a id="the-finished-code" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#the-finished-code"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>The finished code</span></h2> <p data-svelte-h="svelte-1ynctgg">Below is the finished implementation for performing gradient accumulation with Accelerate</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path 
d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> accelerate <span class="hljs-keyword">import</span> Accelerator
	accelerator = Accelerator(gradient_accumulation_steps=<span class="hljs-number">2</span>)
	model, optimizer, training_dataloader, scheduler = accelerator.prepare(
	model, optimizer, training_dataloader, scheduler
	)
	<span class="hljs-keyword">for</span> batch <span class="hljs-keyword">in</span> training_dataloader:
	<span class="hljs-keyword">with</span> accelerator.accumulate(model):
	inputs, targets = batch
	outputs = model(inputs)
	loss = loss_function(outputs, targets)
	accelerator.backward(loss)
	optimizer.step()
	scheduler.step()
	optimizer.zero_grad()<!-- HTML_TAG_END --></pre></div> <blockquote class="warning"><p data-svelte-h="svelte-1kw2co7">It’s important that <strong>only one forward/backward</strong> should be done inside the context manager <code>with accelerator.accumulate(model)</code>.</p></blockquote> <p data-svelte-h="svelte-aw0h59">To learn more about what magic this wraps around, read the <a href="../concept_guides/gradient_synchronization">Gradient Synchronization concept guide</a></p> <h2 class="relative group"><a id="self-contained-example" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#self-contained-example"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Self-contained example</span></h2> <p data-svelte-h="svelte-1e26tlz">Here is a self-contained example that you can run to see gradient accumulation in action with Accelerate:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" 
xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
	<span class="hljs-keyword">import</span> copy
	<span class="hljs-keyword">from</span> accelerate <span class="hljs-keyword">import</span> Accelerator
	<span class="hljs-keyword">from</span> accelerate.utils <span class="hljs-keyword">import</span> set_seed
	<span class="hljs-keyword">from</span> torch.utils.data <span class="hljs-keyword">import</span> TensorDataset, DataLoader

	<span class="hljs-comment"># seed</span>
	set_seed(<span class="hljs-number">0</span>)

	<span class="hljs-comment"># define toy inputs and labels</span>
	x = torch.tensor([<span class="hljs-number">1.</span>, <span class="hljs-number">2.</span>, <span class="hljs-number">3.</span>, <span class="hljs-number">4.</span>, <span class="hljs-number">5.</span>, <span class="hljs-number">6.</span>, <span class="hljs-number">7.</span>, <span class="hljs-number">8.</span>])
	y = torch.tensor([<span class="hljs-number">2.</span>, <span class="hljs-number">4.</span>, <span class="hljs-number">6.</span>, <span class="hljs-number">8.</span>, <span class="hljs-number">10.</span>, <span class="hljs-number">12.</span>, <span class="hljs-number">14.</span>, <span class="hljs-number">16.</span>])
	gradient_accumulation_steps = <span class="hljs-number">4</span>
	per_device_batch_size = <span class="hljs-built_in">len</span>(x) // gradient_accumulation_steps

	<span class="hljs-comment"># define dataset and dataloader</span>
	dataset = TensorDataset(x, y)
	dataloader = DataLoader(dataset, batch_size=per_device_batch_size)

	<span class="hljs-comment"># define model, optimizer and loss function</span>
	<span class="hljs-keyword">class</span> <span class="hljs-title class_">SimpleLinearModel</span>(torch.nn.Module):
	<span class="hljs-keyword">def</span> <span class="hljs-title function_">__init__</span>(<span class="hljs-params">self</span>):
	<span class="hljs-built_in">super</span>(SimpleLinearModel, self).__init__()
	self.weight = torch.nn.Parameter(torch.zeros((<span class="hljs-number">1</span>, <span class="hljs-number">1</span>)))

	<span class="hljs-keyword">def</span> <span class="hljs-title function_">forward</span>(<span class="hljs-params">self, inputs</span>):
	<span class="hljs-keyword">return</span> inputs @ self.weight

	model = SimpleLinearModel()
	model_clone = copy.deepcopy(model)
	criterion = torch.nn.MSELoss()
	model_optimizer = torch.optim.SGD(model.parameters(), lr=<span class="hljs-number">0.02</span>)
	accelerator = Accelerator(gradient_accumulation_steps=gradient_accumulation_steps)
	model, model_optimizer, dataloader = accelerator.prepare(model, model_optimizer, dataloader)
	model_clone_optimizer = torch.optim.SGD(model_clone.parameters(), lr=<span class="hljs-number">0.02</span>)
	<span class="hljs-built_in">print</span>(<span class="hljs-string">f"initial model weight is <span class="hljs-subst">{model.weight.mean().item():<span class="hljs-number">.5</span>f}</span>"</span>)
	<span class="hljs-built_in">print</span>(<span class="hljs-string">f"initial model weight is <span class="hljs-subst">{model_clone.weight.mean().item():<span class="hljs-number">.5</span>f}</span>"</span>)
	<span class="hljs-keyword">for</span> i, (inputs, labels) <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(dataloader):
	<span class="hljs-keyword">with</span> accelerator.accumulate(model):
	inputs = inputs.view(-<span class="hljs-number">1</span>, <span class="hljs-number">1</span>)
	<span class="hljs-built_in">print</span>(i, inputs.flatten())
	labels = labels.view(-<span class="hljs-number">1</span>, <span class="hljs-number">1</span>)
	outputs = model(inputs)
	loss = criterion(outputs, labels)
	accelerator.backward(loss)
	model_optimizer.step()
	model_optimizer.zero_grad()
	loss = criterion(x.view(-<span class="hljs-number">1</span>, <span class="hljs-number">1</span>) @ model_clone.weight, y.view(-<span class="hljs-number">1</span>, <span class="hljs-number">1</span>))
	model_clone_optimizer.zero_grad()
	loss.backward()
	model_clone_optimizer.step()
	<span class="hljs-built_in">print</span>(<span class="hljs-string">f"w/ accumulation, the final model weight is <span class="hljs-subst">{model.weight.mean().item():<span class="hljs-number">.5</span>f}</span>"</span>)
	<span class="hljs-built_in">print</span>(<span class="hljs-string">f"w/o accumulation, the final model weight is <span class="hljs-subst">{model_clone.weight.mean().item():<span class="hljs-number">.5</span>f}</span>"</span>)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-attribute">initial</span> model weight is <span class="hljs-number">0</span>.<span class="hljs-number">00000</span>
	<span class="hljs-attribute">initial</span> model weight is <span class="hljs-number">0</span>.<span class="hljs-number">00000</span>
	<span class="hljs-attribute">0</span> tensor([<span class="hljs-number">1</span>., <span class="hljs-number">2</span>.])
	<span class="hljs-attribute">1</span> tensor([<span class="hljs-number">3</span>., <span class="hljs-number">4</span>.])
	<span class="hljs-attribute">2</span> tensor([<span class="hljs-number">5</span>., <span class="hljs-number">6</span>.])
	<span class="hljs-attribute">3</span> tensor([<span class="hljs-number">7</span>., <span class="hljs-number">8</span>.])
	<span class="hljs-attribute">w</span>/ accumulation, the final model weight is <span class="hljs-number">2</span>.<span class="hljs-number">04000</span>
	<span class="hljs-attribute">w</span>/o accumulation, the final model weight is <span class="hljs-number">2</span>.<span class="hljs-number">04000</span><!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="gradient-accumulation-on-training-samples-of-variable-size" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#gradient-accumulation-on-training-samples-of-variable-size"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Gradient accumulation on training samples of variable size</span></h2> <p data-svelte-h="svelte-yaf5ek">This <a href="https://huggingface.co/blog/gradient_accumulation" rel="nofollow">blog-post</a> points out a common error that occurs when performing gradient accumulation on training samples of variable size:</p> <blockquote data-svelte-h="svelte-1dpvjrf"><p>[…] for gradient accumulation across token-level tasks like causal LM training, the correct loss should be computed by the <strong>total loss across all batches in a gradient accumulation step</strong> divided by the <strong>total number of all non padding tokens in those batches</strong>. 
This is not the same as the average of the per-batch loss values.</p></blockquote> <p data-svelte-h="svelte-ffthww">In other words, some adjustments must be made on losses that operate on a token-level basis.</p> <h3 class="relative group"><a id="skeleton-code" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#skeleton-code"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Skeleton code</span></h3> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" 
height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> accelerate <span class="hljs-keyword">import</span> Accelerator
	<span class="hljs-keyword">import</span> math
	<span class="hljs-keyword">import</span> contextlib

	gradient_accumulation_steps = <span class="hljs-number">2</span>
	accelerator = Accelerator(gradient_accumulation_steps=gradient_accumulation_steps)
	model, optimizer, training_dataloader, scheduler = accelerator.prepare(
	model, optimizer, training_dataloader, scheduler
	)

	training_iterator = <span class="hljs-built_in">iter</span>(training_dataloader)
	num_samples_in_epoch = <span class="hljs-built_in">len</span>(training_dataloader)
	remainder = num_samples_in_epoch % gradient_accumulation_steps
	remainder = remainder <span class="hljs-keyword">if</span> remainder != <span class="hljs-number">0</span> <span class="hljs-keyword">else</span> gradient_accumulation_steps
	total_updates = math.ceil(num_samples_in_epoch / gradient_accumulation_steps)


	total_batched_samples = <span class="hljs-number">0</span>
	<span class="hljs-keyword">for</span> update_step <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(total_updates):
	<span class="hljs-comment"># In order to correctly count the total number of non-padded tokens on which we'll compute the cross-entropy loss</span>
	<span class="hljs-comment"># we need to pre-load the full local batch - i.e. the next per_device_batch_size * accumulation_steps samples</span>
	batch_samples = []
	num_batches_in_step = gradient_accumulation_steps <span class="hljs-keyword">if</span> update_step != (total_updates - <span class="hljs-number">1</span>) <span class="hljs-keyword">else</span> remainder
	<span class="hljs-keyword">for</span> _ <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(num_batches_in_step):
	batch_samples += [<span class="hljs-built_in">next</span>(training_iterator)]

	<span class="hljs-comment"># get local num items in batch </span>
	num_items_in_batch = <span class="hljs-built_in">sum</span>([(batch[<span class="hljs-string">"labels"</span>].ne(-<span class="hljs-number">100</span>)).<span class="hljs-built_in">sum</span>() <span class="hljs-keyword">for</span> batch <span class="hljs-keyword">in</span> batch_samples])
	<span class="hljs-comment"># to compute it correctly in a multi-device DDP training, we need to gather the total number of items in the full batch.</span>
	num_items_in_batch = accelerator.gather(num_items_in_batch).<span class="hljs-built_in">sum</span>().item()

	<span class="hljs-keyword">for</span> i, batch <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(batch_samples):
	<span class="hljs-comment"># if we perform gradient accumulation in a multi-device setup, we want to avoid unnecessary communications when accumulating</span>
	<span class="hljs-comment"># cf: https://muellerzr.github.io/blog/gradient_accumulation.html</span>
	<span class="hljs-keyword">if</span> (i < <span class="hljs-built_in">len</span>(batch_samples) - <span class="hljs-number">1</span> <span class="hljs-keyword">and</span> accelerator.num_processes > <span class="hljs-number">1</span>):
	ctx = model.no_sync
	<span class="hljs-keyword">else</span>:
	ctx = contextlib.nullcontext

	total_batched_samples += <span class="hljs-number">1</span>

	<span class="hljs-keyword">with</span> ctx():
	inputs, targets = batch
	outputs = model(inputs)
	loss = loss_function(outputs, targets) <span class="hljs-comment"># the loss function should sum over samples rather than averaging</span>

	<span class="hljs-comment"># We multiply by num_processes because the DDP calculates the average gradient across all devices whereas dividing by num_items_in_batch already takes into account all devices</span>
	<span class="hljs-comment"># Same reason for gradient_accumulation_steps, but this time it's Accelerate that calculates the average gradient across the accumulated steps</span>
	loss = (loss * gradient_accumulation_steps * accelerator.num_processes) / num_items_in_batch

	accelerator.backward(loss)

	<span class="hljs-comment"># Sync gradients and perform optimization steps once every gradient_accumulation_steps</span>
	optimizer.step()
	scheduler.step()
	optimizer.zero_grad()<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="self-contained-causal-lm-example" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#self-contained-causal-lm-example"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Self-contained causal LM example</span></h3> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 
leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
	<span class="hljs-keyword">import</span> copy
	<span class="hljs-keyword">from</span> accelerate <span class="hljs-keyword">import</span> Accelerator
	<span class="hljs-keyword">from</span> accelerate.utils <span class="hljs-keyword">import</span> set_seed
	<span class="hljs-keyword">from</span> accelerate.logging <span class="hljs-keyword">import</span> get_logger
	<span class="hljs-keyword">from</span> torch.utils.data <span class="hljs-keyword">import</span> Dataset, DataLoader
	<span class="hljs-keyword">import</span> math
	<span class="hljs-keyword">import</span> contextlib

	<span class="hljs-comment"># seed</span>
	set_seed(<span class="hljs-number">0</span>)
	logger = get_logger(__name__)

	<span class="hljs-keyword">class</span> <span class="hljs-title class_">MyDataset</span>(<span class="hljs-title class_ inherited__">Dataset</span>):
	<span class="hljs-keyword">def</span> <span class="hljs-title function_">__init__</span>(<span class="hljs-params">self, num_samples</span>):
	<span class="hljs-built_in">super</span>().__init__()
	self.<span class="hljs-built_in">len</span> = num_samples

	<span class="hljs-keyword">def</span> <span class="hljs-title function_">__getitem__</span>(<span class="hljs-params">self, index</span>):
	input_ids = torch.arange(<span class="hljs-number">1</span>, index+<span class="hljs-number">2</span>, dtype=torch.float32)
	labels = torch.remainder(input_ids, <span class="hljs-number">2</span>)
	<span class="hljs-keyword">return</span> {<span class="hljs-string">"input_ids"</span>: input_ids, <span class="hljs-string">"labels"</span>: labels}

	<span class="hljs-keyword">def</span> <span class="hljs-title function_">__len__</span>(<span class="hljs-params">self</span>):
	<span class="hljs-keyword">return</span> self.<span class="hljs-built_in">len</span>

	<span class="hljs-keyword">def</span> <span class="hljs-title function_">collate_fn</span>(<span class="hljs-params">features</span>):
	input_ids = torch.nn.utils.rnn.pad_sequence([f[<span class="hljs-string">"input_ids"</span>] <span class="hljs-keyword">for</span> f <span class="hljs-keyword">in</span> features], batch_first=<span class="hljs-literal">True</span>, padding_value=-<span class="hljs-number">100</span>)
	labels = torch.nn.utils.rnn.pad_sequence([f[<span class="hljs-string">"labels"</span>] <span class="hljs-keyword">for</span> f <span class="hljs-keyword">in</span> features], batch_first=<span class="hljs-literal">True</span>, padding_value=-<span class="hljs-number">100</span>)
	<span class="hljs-keyword">return</span> {<span class="hljs-string">"input_ids"</span>: input_ids[..., <span class="hljs-literal">None</span>], <span class="hljs-string">"labels"</span>: labels[..., <span class="hljs-literal">None</span>]}

	<span class="hljs-comment"># define toy inputs and labels</span>
	gradient_accumulation_steps = <span class="hljs-number">2</span>
	per_device_batch_size = <span class="hljs-number">4</span>

	<span class="hljs-comment"># define accelerator</span>
	accelerator = Accelerator(gradient_accumulation_steps=gradient_accumulation_steps)

	<span class="hljs-comment"># define dataset and dataloader</span>
	<span class="hljs-comment"># for this toy example, we'll compute gradient descent over one single global batch</span>
	dataset = MyDataset(per_device_batch_size * gradient_accumulation_steps * accelerator.num_processes)
	dataloader = DataLoader(dataset, batch_size=per_device_batch_size, collate_fn=collate_fn)

	<span class="hljs-comment"># define model, model_optimizer and loss function</span>
	model = torch.nn.Linear(<span class="hljs-number">1</span>, <span class="hljs-number">2</span>, bias=<span class="hljs-literal">False</span>)
	model_clone = copy.deepcopy(model)
	criterion = torch.nn.CrossEntropyLoss(reduction=<span class="hljs-string">"sum"</span>) <span class="hljs-comment"># must sum over samples rather than averaging</span>
	model_optimizer = torch.optim.SGD(model.parameters(), lr=<span class="hljs-number">0.08</span>)


	logger.warning(<span class="hljs-string">f"initial model weight is <span class="hljs-subst">{model.weight.detach().cpu().squeeze()}</span>"</span>)
	logger.warning(<span class="hljs-string">f"initial model clone weight is <span class="hljs-subst">{model_clone.weight.detach().cpu().squeeze()}</span>"</span>)

	<span class="hljs-comment"># prepare artifacts - accelerator handles device placement and dataloader splitting</span>
	model, model_optimizer = accelerator.prepare(model, model_optimizer)
	dataloader = accelerator.prepare_data_loader(dataloader, device_placement=<span class="hljs-literal">True</span>)
	training_iterator = <span class="hljs-built_in">iter</span>(dataloader)

	num_samples_in_epoch = <span class="hljs-built_in">len</span>(dataloader)
	remainder = num_samples_in_epoch % gradient_accumulation_steps
	remainder = remainder <span class="hljs-keyword">if</span> remainder != <span class="hljs-number">0</span> <span class="hljs-keyword">else</span> gradient_accumulation_steps
	total_gradient_updates = math.ceil(num_samples_in_epoch / gradient_accumulation_steps)

	total_batched_samples = <span class="hljs-number">0</span>
	<span class="hljs-keyword">for</span> update_step <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(total_gradient_updates):
	<span class="hljs-comment"># In order to correctly count the total number of non-padded tokens on which we'll compute the cross-entropy loss</span>
	<span class="hljs-comment"># we need to pre-load the full local batch - i.e. the next per_device_batch_size * accumulation_steps samples</span>
	batch_samples = []
	num_batches_in_step = gradient_accumulation_steps <span class="hljs-keyword">if</span> update_step != (total_gradient_updates - <span class="hljs-number">1</span>) <span class="hljs-keyword">else</span> remainder
	<span class="hljs-keyword">for</span> _ <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(num_batches_in_step):
	batch_samples += [<span class="hljs-built_in">next</span>(training_iterator)]

	<span class="hljs-comment"># get local num items in batch </span>
	local_num_items_in_batch = <span class="hljs-built_in">sum</span>([(batch[<span class="hljs-string">"labels"</span>].ne(-<span class="hljs-number">100</span>)).<span class="hljs-built_in">sum</span>() <span class="hljs-keyword">for</span> batch <span class="hljs-keyword">in</span> batch_samples])
	logger.warning(<span class="hljs-string">f"Step <span class="hljs-subst">{update_step}</span> - Device <span class="hljs-subst">{accelerator.process_index}</span> - num items in the local batch <span class="hljs-subst">{local_num_items_in_batch}</span>"</span>, main_process_only=<span class="hljs-literal">False</span>)

	<span class="hljs-comment"># to compute it correctly in a multi-device DDP training, we need to gather the total number of items in the full batch.</span>
	num_items_in_batch = accelerator.gather(local_num_items_in_batch).<span class="hljs-built_in">sum</span>().item()
	logger.warning(<span class="hljs-string">f"Total num items <span class="hljs-subst">{num_items_in_batch}</span>"</span>)

	<span class="hljs-keyword">for</span> i, batch <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(batch_samples):
	inputs, labels = batch[<span class="hljs-string">"input_ids"</span>], batch[<span class="hljs-string">"labels"</span>]
	total_batched_samples += <span class="hljs-number">1</span>
	<span class="hljs-comment"># if we perform gradient accumulation in a multi-device setup, we want to avoid unnecessary communications when accumulating</span>
	<span class="hljs-comment"># cf: https://muellerzr.github.io/blog/gradient_accumulation.html</span>
	<span class="hljs-keyword">if</span> (i < <span class="hljs-built_in">len</span>(batch_samples) - <span class="hljs-number">1</span> <span class="hljs-keyword">and</span> accelerator.num_processes > <span class="hljs-number">1</span>):
	ctx = model.no_sync
	<span class="hljs-keyword">else</span>:
	ctx = contextlib.nullcontext
	<span class="hljs-keyword">with</span> ctx():

	outputs = model(inputs)
	loss = criterion(outputs.view(-<span class="hljs-number">1</span>, <span class="hljs-number">2</span>), labels.view(-<span class="hljs-number">1</span>).to(torch.int64))

	<span class="hljs-comment"># We multiply by num_processes because the DDP calculates the average gradient across all devices whereas dividing by num_items_in_batch already takes into account all devices</span>
	<span class="hljs-comment"># Same reason for gradient_accumulation_steps, but this time it's Accelerate that calculates the average gradient across the accumulated steps </span>
	loss = (loss * gradient_accumulation_steps * accelerator.num_processes) / num_items_in_batch
	accelerator.backward(loss)
	model_optimizer.step()
	model_optimizer.zero_grad()


	logger.warning(<span class="hljs-string">f"Device <span class="hljs-subst">{accelerator.process_index}</span> - w/ accumulation, the final model weight is <span class="hljs-subst">{accelerator.unwrap_model(model).weight.detach().cpu().squeeze()}</span>"</span>, main_process_only=<span class="hljs-literal">False</span>)

	<span class="hljs-comment"># We now do the same operation but on a single device and without gradient accumulation</span>

	<span class="hljs-keyword">if</span> accelerator.is_main_process:
	<span class="hljs-comment"># prepare one single entire batch</span>
	dataloader = DataLoader(dataset, batch_size=<span class="hljs-built_in">len</span>(dataset), collate_fn=collate_fn)
	full_batch_without_accum = <span class="hljs-built_in">next</span>(<span class="hljs-built_in">iter</span>(dataloader))
	total_inputs, total_labels = full_batch_without_accum[<span class="hljs-string">"input_ids"</span>], full_batch_without_accum[<span class="hljs-string">"labels"</span>]
	model_clone_optimizer = torch.optim.SGD(model_clone.parameters(), lr=<span class="hljs-number">0.08</span>)

	<span class="hljs-comment"># train the cloned model</span>
	loss = torch.nn.CrossEntropyLoss(reduction=<span class="hljs-string">"mean"</span>)(model_clone(total_inputs).view(-<span class="hljs-number">1</span>, <span class="hljs-number">2</span>), total_labels.view(-<span class="hljs-number">1</span>).to(torch.int64))
	model_clone_optimizer.zero_grad()
	loss.backward()
	model_clone_optimizer.step()

	<span class="hljs-comment"># We should have the same final weights.</span>
	logger.warning(<span class="hljs-string">f"w/o accumulation, the final model weight is <span class="hljs-subst">{model_clone.weight.detach().cpu().squeeze()}</span>"</span>)
	<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1oywi83">Results on a single device - gradient accumulation steps set to 1 and batch_size set to 8:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-attribute">initial</span> model weight is tensor([-<span class="hljs-number">0</span>.<span class="hljs-number">0075</span>, <span class="hljs-number">0</span>.<span class="hljs-number">5364</span>])
	<span class="hljs-attribute">initial</span> model clone weight is tensor([-<span class="hljs-number">0</span>.<span class="hljs-number">0075</span>, <span class="hljs-number">0</span>.<span class="hljs-number">5364</span>])
	<span class="hljs-attribute">Step</span> <span class="hljs-number">0</span> - Device <span class="hljs-number">0</span> - num items in the local batch <span class="hljs-number">36</span>
	<span class="hljs-attribute">Total</span> num items <span class="hljs-number">36</span>
	<span class="hljs-attribute">Device</span> <span class="hljs-number">0</span> - w/ accumulation, the final model weight is tensor([<span class="hljs-number">0</span>.<span class="hljs-number">0953</span>, <span class="hljs-number">0</span>.<span class="hljs-number">4337</span>])
	<span class="hljs-attribute">w</span>/o accumulation, the final model weight is tensor([<span class="hljs-number">0</span>.<span class="hljs-number">0953</span>, <span class="hljs-number">0</span>.<span class="hljs-number">4337</span>])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1519q1p">Results on a two devices set-up - gradient accumulation steps set to 2 and batch_size set to 4.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-attribute">initial</span> model weight is tensor([-<span class="hljs-number">0</span>.<span class="hljs-number">0075</span>, <span class="hljs-number">0</span>.<span class="hljs-number">5364</span>])
	<span class="hljs-attribute">initial</span> model clone weight is tensor([-<span class="hljs-number">0</span>.<span class="hljs-number">0075</span>, <span class="hljs-number">0</span>.<span class="hljs-number">5364</span>])
	<span class="hljs-attribute">Step</span> <span class="hljs-number">0</span> - Device <span class="hljs-number">0</span> - num items in the local batch <span class="hljs-number">52</span>
	<span class="hljs-attribute">Step</span> <span class="hljs-number">0</span> - Device <span class="hljs-number">1</span> - num items in the local batch <span class="hljs-number">84</span>
	<span class="hljs-attribute">Total</span> num items <span class="hljs-number">136</span>
	<span class="hljs-attribute">Device</span> <span class="hljs-number">1</span> - w/ accumulation, the final model weight is tensor([<span class="hljs-number">0</span>.<span class="hljs-number">2117</span>, <span class="hljs-number">0</span>.<span class="hljs-number">3172</span>])
	<span class="hljs-attribute">Device</span> <span class="hljs-number">0</span> - w/ accumulation, the final model weight is tensor([<span class="hljs-number">0</span>.<span class="hljs-number">2117</span>, <span class="hljs-number">0</span>.<span class="hljs-number">3172</span>])
	<span class="hljs-attribute">w</span>/o accumulation, the final model weight is tensor([<span class="hljs-number">0</span>.<span class="hljs-number">2117</span>, <span class="hljs-number">0</span>.<span class="hljs-number">3172</span>])<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="to-go-further" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#to-go-further"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>To go further:</span></h3> <p data-svelte-h="svelte-14ssh20">Please find a complete example script on a real world training run in the examples folder at the path <a href="https://github.com/huggingface/accelerate/blob/main/examples/by_feature/gradient_accumulation_for_autoregressive_models.py" rel="nofollow"><code>accelerate/examples/by_feature/gradient_accumulation_for_autoregressive_models.py</code></a>.</p> <p data-svelte-h="svelte-1mi0t6">Running it on several training configurations with constant global batch size equal to 32 gives the following graph:</p> <div style="text-align: center" data-svelte-h="svelte-11tj8ly"><img 
src="https://huggingface.co/datasets/hf-audio/gradient_accumulation_example/resolve/main/training_losses.png" alt="Graph of the training loss per training step for several training configurations with a constant global batch size of 32"></div> <p data-svelte-h="svelte-vzj4yr">Note that the training losses are exactly the same up to training step 20. The small deviation after this training step occurs at the very end of the first epoch, because, by <a href="https://huggingface.co/docs/accelerate/en/package_reference/torch_wrappers#accelerate.data_loader.prepare_data_loader.even_batches" rel="nofollow">default</a>, the dataloader duplicates the samples at the beginning of the dataset when the total batch size doesn’t exactly divide the dataset.</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/accelerate/blob/main/docs/source/usage_guides/gradient_accumulation.md" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>

	<script>
	{
		// Undeclared assignment on purpose: in this classic (non-module) script it
		// creates a page-global holding the asset/base paths and an empty env map
		// (presumably read by the client runtime imported below — confirm against the kit entry module).
		__sveltekit_1q7nz6m = {
			assets: "/docs/accelerate/pr_4021/en",
			base: "/docs/accelerate/pr_4021/en",
			env: {}
		};

		// Must be captured synchronously: document.currentScript is only set while
		// this script element is executing, not later inside the promise callback.
		const mountTarget = document.currentScript.parentElement;

		// One placeholder entry per id listed in node_ids below.
		const payload = [null, null];

		// Kick off both entry-module downloads immediately so they load in parallel.
		const entryModules = [
			import("/docs/accelerate/pr_4021/en/_app/immutable/entry/start.8a49e72b.js"),
			import("/docs/accelerate/pr_4021/en/_app/immutable/entry/app.1df4d18e.js")
		];

		Promise.all(entryModules).then((loaded) => {
			const [kit, app] = loaded;
			const startOptions = {
				node_ids: [0, 48],
				data: payload,
				form: null,
				error: null
			};
			// Hand the app module, the mount element and the start options to the kit entry point.
			kit.start(app, mountTarget, startOptions);
		});
	}
	</script>

Xet Storage Details

Size: 69.5 kB
Xet hash: ab29d3e354d8218234f22b5c80878a811f2a1dc98a9893bf6023ce59d7deef8f

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.