Buckets:

hf-doc-build
/

doc-dev

Files

xet

hf-doc-build/doc-dev / accelerate /pr_4021 /en /concept_guides /low_precision_training.html

HuggingFaceDocBuilder

about 1 month ago

download

raw

18.8 kB

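	<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Low precision training methods&quot;,&quot;local&quot;:&quot;low-precision-training-methods&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;A Quick Chart&quot;,&quot;local&quot;:&quot;a-quick-chart&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;TransformersEngine&quot;,&quot;local&quot;:&quot;transformersengine&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;MS-AMP&quot;,&quot;local&quot;:&quot;ms-amp&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Combining the two&quot;,&quot;local&quot;:&quot;combining-the-two&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">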
	<link href="/docs/accelerate/pr_4021/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
	<link rel="modulepreload" href="/docs/accelerate/pr_4021/en/_app/immutable/entry/start.8a49e72b.js">
	<link rel="modulepreload" href="/docs/accelerate/pr_4021/en/_app/immutable/chunks/scheduler.b9285784.js">
	<link rel="modulepreload" href="/docs/accelerate/pr_4021/en/_app/immutable/chunks/singletons.7547c222.js">
	<link rel="modulepreload" href="/docs/accelerate/pr_4021/en/_app/immutable/chunks/index.6d423e5c.js">
	<link rel="modulepreload" href="/docs/accelerate/pr_4021/en/_app/immutable/chunks/paths.d42c9205.js">
	<link rel="modulepreload" href="/docs/accelerate/pr_4021/en/_app/immutable/entry/app.1df4d18e.js">
	<link rel="modulepreload" href="/docs/accelerate/pr_4021/en/_app/immutable/chunks/preload-helper.b0bd19d1.js">
	<link rel="modulepreload" href="/docs/accelerate/pr_4021/en/_app/immutable/chunks/index.26bc89a1.js">
	<link rel="modulepreload" href="/docs/accelerate/pr_4021/en/_app/immutable/nodes/0.0e7c56e8.js">
	<link rel="modulepreload" href="/docs/accelerate/pr_4021/en/_app/immutable/chunks/each.e59479a4.js">
	<link rel="modulepreload" href="/docs/accelerate/pr_4021/en/_app/immutable/nodes/17.cadcb083.js">
	<link rel="modulepreload" href="/docs/accelerate/pr_4021/en/_app/immutable/chunks/Tip.e4eba3d6.js">
	<link rel="modulepreload" href="/docs/accelerate/pr_4021/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.7a0ae628.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Low precision training methods","local":"low-precision-training-methods","sections":[{"title":"A Quick Chart","local":"a-quick-chart","sections":[],"depth":2},{"title":"TransformersEngine","local":"transformersengine","sections":[],"depth":2},{"title":"MS-AMP","local":"ms-amp","sections":[],"depth":2},{"title":"Combining the two","local":"combining-the-two","sections":[],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 
hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="low-precision-training-methods" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#low-precision-training-methods"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Low precision training methods</span></h1> <p data-svelte-h="svelte-1q5qvk1">The release of new kinds of hardware led to the emergence of new training paradigms that better utilize them. Currently, this is in the form of training
	in 8-bit precision using packages such as <a href="https://github.com/NVIDIA/TransformerEngine" rel="nofollow">TransformersEngine</a> (TE), <a href="https://github.com/pytorch/ao" rel="nofollow">torchao</a> (native PyTorch FP8), or the legacy <a href="https://github.com/Azure/MS-AMP/tree/main" rel="nofollow">MS-AMP</a> (no longer maintained, see warning below).</p> <p data-svelte-h="svelte-h747mi">For an introduction to the topics discussed today, we recommend reviewing the <a href="../usage_guides/low_precision_training">low-precision usage guide</a> as this documentation will reference it regularly.</p> <h2 class="relative group"><a id="a-quick-chart" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#a-quick-chart"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>A Quick Chart</span></h2> <p data-svelte-h="svelte-exttfl">Below is a quick chart from the MS-AMP documentation showing the different bit-precisions for each solution during training:</p> <table data-svelte-h="svelte-1i0vsc4"><thead><tr><th>Optimization Level</th> <th>Computation(GEMM)</th> <th>Comm</th> <th>Weight</th> <th>Master Weight</th> <th>Weight 
Gradient</th> <th>Optimizer States</th></tr></thead> <tbody><tr><td>FP16 AMP</td> <td>FP16</td> <td>FP32</td> <td>FP32</td> <td>N/A</td> <td>FP32</td> <td>FP32+FP32</td></tr> <tr><td>Nvidia TE</td> <td>FP8</td> <td>FP32</td> <td>FP32</td> <td>N/A</td> <td>FP32</td> <td>FP32+FP32</td></tr> <tr><td>MS-AMP O1</td> <td>FP8</td> <td>FP8</td> <td>FP16</td> <td>N/A</td> <td>FP8</td> <td>FP32+FP32</td></tr> <tr><td>MS-AMP O2</td> <td>FP8</td> <td>FP8</td> <td>FP16</td> <td>N/A</td> <td>FP8</td> <td>FP8+FP16</td></tr> <tr><td>MS-AMP O3</td> <td>FP8</td> <td>FP8</td> <td>FP8</td> <td>FP16</td> <td>FP8</td> <td>FP8+FP16</td></tr></tbody></table> <h2 class="relative group"><a id="transformersengine" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#transformersengine"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>TransformersEngine</span></h2> <p data-svelte-h="svelte-dr9e4k"><code>TransformersEngine</code> is the first solution to trying to train in 8-bit floating point. 
It works by using drop-in replacement layers for certain ones in a model that utilizes their FP8-engine to reduce the number of bits (such as 32 to 8) without degrading the final accuracy of the model.</p> <p data-svelte-h="svelte-1q0x39z">Specifically, Accelerate will find and replace the following layers with <code>TransformersEngine</code> versions:</p> <ul data-svelte-h="svelte-skm64u"><li><code>nn.LayerNorm</code> for <code>te.LayerNorm</code></li> <li><code>nn.Linear</code> for <code>te.Linear</code></li></ul> <p data-svelte-h="svelte-j7rttk">As a result we wind up with a model that has most of its layers in BF16, while some layers are in FP8 reducing some of the memory.</p> <p data-svelte-h="svelte-1jj2y1j">Anecdotally, we have noticed that performance gains don’t really start showing when using <code>TransformerEngine</code> until a large majority of the layers
	in the model are made up of those two layers to replace. As a result, only larger models have shown performance improvements when the number of parameters is around and upwards of a few billion.</p> <p data-svelte-h="svelte-14x2a3p">The <code>TransformerEngine</code> can receive many different arguments that customize how it performs FP8 calculations and what they do. A full list of the arguments is available below:</p> <ul data-svelte-h="svelte-n2qwcq"><li><code>margin</code>: The margin to use for the gradient scaling.</li> <li><code>interval</code>: The interval to use for how often the scaling factor is recomputed.</li> <li><code>fp8_format</code>: The format to use for the FP8 recipe. Must be one of <code>HYBRID</code> or <code>E4M3</code>. (Generally <code>HYBRID</code> for training, <code>E4M3</code> for evaluation)</li> <li><code>amax_history_len</code>: The length of the history to use for the scaling factor computation.</li> <li><code>amax_compute_algo</code>: The algorithm to use for the scaling factor computation. Must be one of <code>max</code> or <code>most_recent</code>.</li> <li><code>override_linear_precision</code>: Whether or not to execute <code>fprop</code>, <code>dgrad</code>, and <code>wgrad</code> GEMMS in higher precision.</li></ul> <p data-svelte-h="svelte-5acexk">You can customize each of these as part of <a href="/docs/accelerate/pr_4021/en/package_reference/utilities#accelerate.utils.FP8RecipeKwargs">utils.FP8RecipeKwargs</a> to help optimize performance of your models.</p> <p data-svelte-h="svelte-1fdva09">If we notice in the chart mentioned earlier, TE simply casts the computation layers into FP8, while everything else is in FP32. 
As a result this winds up utilizing the most memory but does so with the benefit of guaranteeing the least amount of loss in end accuracy during training.</p> <h2 class="relative group"><a id="ms-amp" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#ms-amp"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>MS-AMP</span></h2> <blockquote class="warning"><p data-svelte-h="svelte-1w6kovd"><strong>⚠️ Deprecated / Unmaintained:</strong> MS-AMP is no longer actively maintained by Microsoft. The repository has not seen updates since 2023 and has known compatibility issues with CUDA 12.x+, modern NCCL versions, and recent PyTorch releases (2.2+). 
<strong>We strongly recommend using <code>TransformersEngine</code> or <code>torchao</code> instead.</strong> See the <a href="../usage_guides/low_precision_training">usage guide</a> for migration instructions.</p></blockquote> <p data-svelte-h="svelte-wkik4f">MS-AMP takes a different approach to <code>TransformersEngine</code> by providing three different optimization levels to convert more operations to FP8 or FP16.</p> <ul data-svelte-h="svelte-1uqkdni"><li><p>The base optimization level (<code>O1</code>) passes communications of the weights (such as in DDP) in FP8, stores the weights of the model in FP16, and leaves the optimizer states in FP32. The main benefit of this optimization level is that we can reduce the communication bandwidth by essentially half. Additionally, more GPU memory is saved due to 1/2 of everything being cast in FP8, and the weights being cast to FP16. Notably, both optimizer states remain in FP32.</p></li> <li><p>The second optimization level (<code>O2</code>) improves upon this by also reducing the precision of the optimizer states. One is in FP8 while the other is in FP16. Generally, it’s been shown that this provides a net gain: no degradation in end accuracy, increased training speed, and reduced memory, as every state is now in either FP16 or FP8.</p></li> <li><p>Finally, MS-AMP has a third optimization level (<code>O3</code>) which helps during DDP scenarios such as DeepSpeed. The weights of the model in memory are fully cast to FP8, and the master weights are now stored in FP16. This reduces memory by the highest factor, as almost everything is now in FP8 and only two states are left in FP16. 
Currently, only DeepSpeed versions up through 0.9.2 are supported, so this capability is not included in the Accelerate integration</p></li></ul> <h2 class="relative group"><a id="combining-the-two" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#combining-the-two"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Combining the two</span></h2> <blockquote class="warning"><p data-svelte-h="svelte-10ivk1l">Since MS-AMP is no longer maintained, this combination is not recommended for new projects.</p></blockquote> <p data-svelte-h="svelte-1nx3yqq">More experiments need to be performed but it’s been noted that combining both MS-AMP and TransformersEngine can lead to the highest throughput by relying on NVIDIA’s optimized FP8 operators and utilizing how MS-AMP reduces the memory overhead.</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/accelerate/blob/main/docs/source/concept_guides/low_precision_training.md" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" 
height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>

	<script>
	{
		// Publish the path configuration under the build-specific global name.
		// The assignment is intentionally undeclared so it lands on the global
		// object (presumably read by the generated bundles — confirm against
		// the SvelteKit build output).
		__sveltekit_1q7nz6m = {
			assets: "/docs/accelerate/pr_4021/en",
			base: "/docs/accelerate/pr_4021/en",
			env: {}
		};

		// document.currentScript is only available while this script runs
		// synchronously, so the mount element is captured before any async work.
		const mountTarget = document.currentScript.parentElement;

		// One entry per node id passed below; both are null on this page.
		const serverData = [null, null];

		// Load the runtime entry point and the app manifest in parallel.
		const entryModules = [
			import("/docs/accelerate/pr_4021/en/_app/immutable/entry/start.8a49e72b.js"),
			import("/docs/accelerate/pr_4021/en/_app/immutable/entry/app.1df4d18e.js")
		];

		Promise.all(entryModules).then((loaded) => {
			const kit = loaded[0];
			const app = loaded[1];

			// Hand the app module, the mount element and the initial state to
			// the runtime's start() entry point.
			kit.start(app, mountTarget, {
				node_ids: [0, 17],
				data: serverData,
				form: null,
				error: null
			});
		});
	}
	</script>

Xet Storage Details

Size:: 18.8 kB
Xet hash:: f5fc3bf6aa1ced24e1e343bb15f56160eaae46e5fcd0942c486160726f53ba2a

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.