Buckets:

hf-doc-build
/

doc-dev

Files

xet

hf-doc-build/doc-dev / accelerate /pr_4021 /en /concept_guides /sequence_parallelism.html

HuggingFaceDocBuilder

29 days ago

download

raw

40 kB

	<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Sequence parallel in 🤗 accelerate&quot;,&quot;local&quot;:&quot;sequence-parallel-in--accelerate&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Why sequence parallelism?&quot;,&quot;local&quot;:&quot;why-sequence-parallelism&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;How is Ulysses SP different from FSDP CP&quot;,&quot;local&quot;:&quot;how-is-ulysses-sp-different-from-fsdp-cp&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Supported sequence parallelism backends&quot;,&quot;local&quot;:&quot;supported-sequence-parallelism-backends&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;How to use sequence parallelism?&quot;,&quot;local&quot;:&quot;how-to-use-sequence-parallelism&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;ALST/Ulysses SP backend configuration&quot;,&quot;local&quot;:&quot;alstulysses-sp-backend-configuration&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
	<link href="/docs/accelerate/pr_4021/en/_app/immutable/assets/0.e3b0c442.css" rel="stylesheet">
	<link rel="modulepreload" href="/docs/accelerate/pr_4021/en/_app/immutable/entry/start.8a49e72b.js">
	<link rel="modulepreload" href="/docs/accelerate/pr_4021/en/_app/immutable/chunks/scheduler.b9285784.js">
	<link rel="modulepreload" href="/docs/accelerate/pr_4021/en/_app/immutable/chunks/singletons.7547c222.js">
	<link rel="modulepreload" href="/docs/accelerate/pr_4021/en/_app/immutable/chunks/index.6d423e5c.js">
	<link rel="modulepreload" href="/docs/accelerate/pr_4021/en/_app/immutable/chunks/paths.d42c9205.js">
	<link rel="modulepreload" href="/docs/accelerate/pr_4021/en/_app/immutable/entry/app.1df4d18e.js">
	<link rel="modulepreload" href="/docs/accelerate/pr_4021/en/_app/immutable/chunks/preload-helper.b0bd19d1.js">
	<link rel="modulepreload" href="/docs/accelerate/pr_4021/en/_app/immutable/chunks/index.26bc89a1.js">
	<link rel="modulepreload" href="/docs/accelerate/pr_4021/en/_app/immutable/nodes/0.0e7c56e8.js">
	<link rel="modulepreload" href="/docs/accelerate/pr_4021/en/_app/immutable/chunks/each.e59479a4.js">
	<link rel="modulepreload" href="/docs/accelerate/pr_4021/en/_app/immutable/nodes/19.49319bee.js">
	<link rel="modulepreload" href="/docs/accelerate/pr_4021/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.7a0ae628.js">
	<link rel="modulepreload" href="/docs/accelerate/pr_4021/en/_app/immutable/chunks/CodeBlock.844ff9c3.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Sequence parallel in 🤗 accelerate","local":"sequence-parallel-in--accelerate","sections":[{"title":"Why sequence parallelism?","local":"why-sequence-parallelism","sections":[],"depth":2},{"title":"How is Ulysses SP different from FSDP CP","local":"how-is-ulysses-sp-different-from-fsdp-cp","sections":[],"depth":2},{"title":"Supported sequence parallelism backends","local":"supported-sequence-parallelism-backends","sections":[],"depth":2},{"title":"How to use sequence parallelism?","local":"how-to-use-sequence-parallelism","sections":[],"depth":2},{"title":"ALST/Ulysses SP backend configuration","local":"alstulysses-sp-backend-configuration","sections":[],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" 
transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="sequence-parallel-in--accelerate" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#sequence-parallel-in--accelerate"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Sequence parallel in 🤗 accelerate</span></h1> <p data-svelte-h="svelte-1y95ixw">This guide will cover basics of using 
sequence parallelism in 🤗<code>accelerate</code>.</p> <p data-svelte-h="svelte-1uutlop">See also the very related <a href="./context_parallelism">Context Parallelism</a>.</p> <h2 class="relative group"><a id="why-sequence-parallelism" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#why-sequence-parallelism"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Why sequence parallelism?</span></h2> <p data-svelte-h="svelte-2sleob">With the advent of large language models, and recently reasoning models, the sequence length has been growing rapidly. This, combined with quadratic memory complexity of attention, has led to a need for more efficient ways to train models with long sequences.
	With sequence length of 128k, the memory requirement of the attention matrix is <code>128k * 128k * 2 bytes * num_heads = ~32 GB * num_heads</code> for <code>bf16</code> precision, given vanilla attention implementation. Granted, with usage of <code>flash attention</code> or <code>SDPA</code> which do not materialize these attention weights, this decreases drastically, but the growth in memory requirements is still considerable.</p> <p data-svelte-h="svelte-10nqeju">Ulysses Sequence parallelism allows us to shard the inputs to the attention computation along the sequence dimension and compute the attention normally, but using only a slice of attention heads on each GPU. With this, we can train models with long sequences, with a few more tools, scaling to 15M+ sequence length. To see how to augment Ulysses SP with TiledMLP, Liger-Kernel, Activation checkpoint offload to cpu and a few other tricks, please refer to the paper: <a href="https://arxiv.org/abs/2506.13996" rel="nofollow">Arctic Long Sequence Training: Scalable And Efficient Training For Multi-Million Token Sequences</a>.</p> <h2 class="relative group"><a id="how-is-ulysses-sp-different-from-fsdp-cp" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#how-is-ulysses-sp-different-from-fsdp-cp"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 
11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>How is Ulysses SP different from FSDP CP</span></h2> <p data-svelte-h="svelte-1ulu66">In the document <a href="./context_parallelism">Context Parallelism</a> you can learn about deploying another technology called Context Parallelism, which too slices on the sequence dimension but uses Ring Attention instead of slicing on the head dimension.</p> <p data-svelte-h="svelte-1vni0ze">The following articles go into a very detailed explanation of the differences between the two technologies:</p> <ul data-svelte-h="svelte-105nvw"><li><a href="https://insujang.github.io/2024-01-11/tensor-parallelism-and-sequence-parallelism-detailed-analysis/" rel="nofollow">https://insujang.github.io/2024-01-11/tensor-parallelism-and-sequence-parallelism-detailed-analysis/</a></li> <li><a href="https://huggingface.co/blog/exploding-gradients/ulysses-ring-attention" rel="nofollow">https://huggingface.co/blog/exploding-gradients/ulysses-ring-attention</a></li></ul> <p data-svelte-h="svelte-1h1w23n">A quick summary adapting from one of the articles:</p> <ul data-svelte-h="svelte-5wq4l4"><li>Ulysses SP has a relatively low communication overhead, but is limited by the number of Attention Heads and thus it has certain requirements for network topology (number of attention heads has to be divisible by the number of participating gpus for a single replica). 
All-to-all communication is sensitive to latency and it requires Deepspeed.</li> <li>FSDP CP Ring-Attention’s P2P ring communication has no aforementioned divisibility requirements, but has a higher communication volume.</li></ul> <p data-svelte-h="svelte-qtmpoo">Finally it should be possible to combine SP + CP as explained in the paper <a href="https://arxiv.org/abs/2405.07719" rel="nofollow">USP: A Unified Sequence Parallelism Approach for Long Context Generative AI</a> to support an even longer sequence length, albeit this is not yet integrated into 🤗<code>accelerate</code>.</p> <h2 class="relative group"><a id="supported-sequence-parallelism-backends" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#supported-sequence-parallelism-backends"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Supported sequence parallelism backends</span></h2> <p data-svelte-h="svelte-1t64il8">Currently the only sequence parallelism backend is <code>deepspeed</code>, which comes from the modernized Ulysses SP which is part of the <a href="https://arxiv.org/abs/2506.13996" rel="nofollow">Arctic Long Sequence Training technology</a>. 
There is also a <a href="https://www.deepspeed.ai/tutorials/ulysses-alst-sequence-parallelism/" rel="nofollow">tutorial</a> should you want to integrate it into your own code directly.</p> <h2 class="relative group"><a id="how-to-use-sequence-parallelism" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#how-to-use-sequence-parallelism"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>How to use sequence parallelism?</span></h2> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" 
width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->from accelerate.utils import ParallelismConfig, DeepSpeedSequenceParallelConfig

	<span class="hljs-addition">+# Example: 4 GPUs with sp_size=4, dp_shard_size=1</span>
	<span class="hljs-addition">+# Ensure: dp_replicate_size × dp_shard_size × sp_size = 1 × 1 × 4 = 4 GPUs</span>
	parallelism_config = ParallelismConfig(
	<span class="hljs-addition">+ sp_backend="deepspeed",</span>
	<span class="hljs-addition">+ sp_size=4,</span>
	<span class="hljs-addition">+ dp_shard_size=1, # Explicit: no data parallelism</span>
	<span class="hljs-addition">+ sp_handler=DeepSpeedSequenceParallelConfig(</span>
	<span class="hljs-addition">+ sp_seq_length_is_variable=True,</span>
	<span class="hljs-addition">+ sp_attn_implementation="sdpa",</span>
	<span class="hljs-addition">+ ),</span>
	<span class="hljs-addition">+ )</span>

	accelerator = Accelerator(
	...,
	parallelism_config=parallelism_config,
	)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-jsu941">As with any other feature in 🤗<code>accelerate</code>, you can enable sequence parallelism also by passing the corresponding flags to <code>accelerate launch</code>. In this case, it’s no different:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->accelerate launch --parallelism-config-sp-size 8 ...<!-- HTML_TAG_END --></pre></div> <blockquote class="tip" data-svelte-h="svelte-1605fl9"><p>You can also set the <code>sp_size</code> and other configuration in the <code>accelerate config</code> command, which will save them in your <code>accelerate</code> configuration file, so you don’t have to pass them every time you launch your script.</p></blockquote> <blockquote class="tip" data-svelte-h="svelte-awy0tc"><p>sequence parallelism combines with data 
parallelism. It doesn’t require additional GPUs.
	So if you have 8 gpus you can do: <code>--parallelism-config-dp-shard-size 1 --parallelism-config-sp-size 8</code>. Or you can use the <code>ParallelismConfig</code> class to set them programmatically.</p> <p><strong>Important</strong>: You must ensure <code>dp_replicate_size × dp_shard_size × sp_size = num_processes</code>. For example, with 8 GPUs and <code>sp_size=8</code>, you need <code>dp_shard_size=1</code> (since 1 × 1 × 8 = 8). With 4 GPUs and <code>sp_size=2</code>, you could use <code>dp_shard_size=2</code> (since 1 × 2 × 2 = 4) for 2D parallelism.</p></blockquote> <h2 class="relative group"><a id="alstulysses-sp-backend-configuration" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#alstulysses-sp-backend-configuration"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>ALST/Ulysses SP backend configuration</span></h2> <p data-svelte-h="svelte-11qv90n">ALST/UlyssesSP implements sequence parallelism using attention head parallelism, as explained in <a href="https://arxiv.org/abs/2506.13996" rel="nofollow">this paper</a>. 
For simplicity, we reuse the concept and setup of sequence parallelism, which, from the user’s perspective, is the same: multiple GPUs are used to process a single batch.</p> <p data-svelte-h="svelte-6a2vre">To give a sense of what ALST made possible - it allowed us to train in bf16 with 500K tokens on a single H100 GPU, 3.7M on a single node, and 15M on Llama-8B using just four nodes. This feature of HF Accelerate enables only 1 of the 3 ALST components, so the achievable sequence length will be smaller. You’d want TiledMLP, Activation checkpoint offload to CPU, and a few other things enabled to get the full power of ALST. For details, please refer to <a href="https://www.deepspeed.ai/tutorials/ulysses-alst-sequence-parallelism/" rel="nofollow">this tutorial</a>.</p> <p data-svelte-h="svelte-oter7c">To configure the <code>deepspeed</code> backend:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; 
"></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Example: 4 GPUs with sp_size=4, dp_shard_size=1</span>
	<span class="hljs-comment"># Ensure: dp_replicate_size × dp_shard_size × sp_size = 1 × 1 × 4 = 4 GPUs</span>
	parallelism_config = ParallelismConfig(
	sp_backend=<span class="hljs-string">"deepspeed"</span>,
	sp_size=<span class="hljs-number">4</span>,
	dp_shard_size=<span class="hljs-number">1</span>, <span class="hljs-comment"># Explicit: no data parallelism</span>
	sp_handler=DeepSpeedSequenceParallelConfig(
	sp_seq_length=<span class="hljs-number">256</span>,
	sp_seq_length_is_variable=<span class="hljs-literal">True</span>,
	sp_attn_implementation=<span class="hljs-string">"sdpa"</span>,
	),
	)
	accelerator = Accelerator(
	...,
	parallelism_config=parallelism_config,
	)<!-- HTML_TAG_END --></pre></div> <ul data-svelte-h="svelte-3nnh2k"><li><code>sp_backend</code>: set to <code>deepspeed</code> here</li> <li><code>sp_size</code> is the degree of the sequence parallelism - in the above example it’s 4, therefore 4 gpus will be used to process a single batch (while doing DP=4 over the same gpus)</li> <li><code>sp_seq_length</code> and <code>sp_seq_length_is_variable</code> are used to deal with sequence lengths. If <code>sp_seq_length_is_variable=True</code> the backend will work with a sequence length that may change between batches, in which case <code>sp_seq_length</code> value can be set to anything divisible by the sequence parallel degree or not set at all. In this case on every <code>forward</code> the sequence variables will be derived from input. If <code>False</code> then <code>sp_seq_length</code> needs to match the batch’s sequence length dimension, which then will have to be padded to be always the same. The default is <code>True</code>.</li> <li><code>sp_attn_implementation</code> is one of <code>sdpa</code>, <code>flash_attention_2</code> or <code>flash_attention_3</code>. This sequence parallel implementation uses <code>position_ids</code> instead of <code>attention_mask</code>; therefore, <code>eager</code> can’t work here until it supports working with <code>position_ids</code>. Also, please note that <code>sdpa</code> doesn’t handle multiple samples combined into one correctly; it will attend to the whole sample as one. If the samples aren’t combined, <code>sdpa</code> will work correctly. 
Therefore, Flash Attention should be the ideal choice as it always works.</li></ul> <p data-svelte-h="svelte-4mpkcu">Instead of setting these values in <code>DeepSpeedSequenceParallelConfig</code> object, you can also use the environment variables to accomplish the same - here they are correspondingly to the end of the list above.</p> <ul data-svelte-h="svelte-1vnsins"><li><code>PARALLELISM_CONFIG_SP_BACKEND</code></li> <li><code>PARALLELISM_CONFIG_SP_SEQ_LENGTH</code></li> <li><code>PARALLELISM_CONFIG_SP_SEQ_LENGTH_IS_VARIABLE</code></li> <li><code>PARALLELISM_CONFIG_SP_ATTN_IMPLEMENTATION</code></li></ul> <p data-svelte-h="svelte-1jbv0ue">If not passed in the code, <code>sp_size</code> can be set via <code>--parallelism_config_sp_size</code> CLI argument. Same for other arguments. You can also do the accelerate config file style config, e.g., for 2 GPUs:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: 
transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-attr">distributed_type:</span> <span class="hljs-string">DEEPSPEED</span>
<span class="hljs-attr">deepspeed_config:</span>
  <span class="hljs-attr">deepspeed_config_file:</span> <span class="hljs-string">path/to/ds_config.json</span>
<span class="hljs-attr">machine_rank:</span> <span class="hljs-number">0</span>
<span class="hljs-attr">num_machines:</span> <span class="hljs-number">1</span>
<span class="hljs-attr">num_processes:</span> <span class="hljs-number">2</span>
<span class="hljs-attr">parallelism_config:</span>
  <span class="hljs-attr">parallelism_config_dp_replicate_size:</span> <span class="hljs-number">1</span>
  <span class="hljs-attr">parallelism_config_dp_shard_size:</span> <span class="hljs-number">1</span> <span class="hljs-comment"># Must satisfy: 1 × 1 × 2 = 2 num_processes</span>
  <span class="hljs-attr">parallelism_config_sp_size:</span> <span class="hljs-number">2</span>
  <span class="hljs-attr">parallelism_config_sp_backend:</span> <span class="hljs-string">deepspeed</span>
  <span class="hljs-attr">parallelism_config_sp_seq_length_is_variable:</span> <span class="hljs-literal">true</span>
  <span class="hljs-attr">parallelism_config_sp_attn_implementation:</span> <span class="hljs-string">sdpa</span>
	<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-cd9zrb">As mentioned earlier Ulysses sequence parallelism is normally overlayed with data parallelism - same ranks are used for feeding unique data streams and also perform Ulysses Sequence Parallelism. But you could also create replicas like so:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Example: 4 GPUs with 2D parallelism (SP=2, DP=2)</span>
	<span class="hljs-comment"># Ensure: dp_replicate_size × dp_shard_size × sp_size = 2 × 1 × 2 = 4 GPUs</span>
	parallelism_config = ParallelismConfig(
	dp_replicate_size=<span class="hljs-number">2</span>,
	dp_shard_size=<span class="hljs-number">1</span>, <span class="hljs-comment"># Explicit: no sharding within replicas</span>
	sp_size=<span class="hljs-number">2</span>,
	sp_backend=<span class="hljs-string">"deepspeed"</span>,
	sp_handler=DeepSpeedSequenceParallelConfig(...),
	)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1fapldk">Here we use 4 gpus, with 2 sequence parallelism replicas. Deepspeed-ZeRO is what drives the data parallelism here.</p> <p data-svelte-h="svelte-wvrcbe">Please note that a lot of magic is hidden inside <a href="https://github.com/deepspeedai/DeepSpeed/blob/64c0052fa08438b4ecf4cae30af15091a92d2108/deepspeed/runtime/sequence_parallel/ulysses_sp.py#L442" rel="nofollow">UlyssesSPDataLoaderAdapter</a>. It’s used behind the scenes, wrapping your original DataLoader object, but you should be aware of it should you run into any problems. It also automatically injects the correct <code>shift_labels</code> into the batch dictionary, before the batch gets sharded across the participating ranks.</p> <p data-svelte-h="svelte-k9ukq7">Now the only remaining piece to start using ALST/UlyssesSP is to aggregate the loss across ranks using a differentiable <code>all_gather</code> to get the grads right. The following code does it, while also excluding any masked out with <code>-100</code> tokens, to get the correct average:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full 
transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->sp_size = parallelism_config.sp_size <span class="hljs-keyword">if</span> parallelism_config <span class="hljs-keyword">is</span> <span class="hljs-keyword">not</span> <span class="hljs-literal">None</span> <span class="hljs-keyword">else</span> <span class="hljs-number">1</span>
<span class="hljs-keyword">if</span> sp_size > <span class="hljs-number">1</span>:
	sp_group = accelerator.torch_device_mesh[<span class="hljs-string">"sp"</span>].get_group()
	sp_world_size = parallelism_config.sp_size

<span class="hljs-comment"># Normal training loop</span>
<span class="hljs-keyword">for</span> <span class="hljs-built_in">iter</span>, batch <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(dl):
	optimizer.zero_grad()

	batch = move_to_device(batch, model.device)

	<span class="hljs-comment"># The model automatically receives shift_labels via **kwargs and uses it for loss computation.</span>
	<span class="hljs-comment"># Both standard transformers models and Liger-patched models handle this correctly.</span>
	outputs = model(**batch)
	loss = outputs.loss
	shift_labels = batch[<span class="hljs-string">"shift_labels"</span>]

	<span class="hljs-keyword">if</span> sp_size > <span class="hljs-number">1</span>:
		<span class="hljs-comment"># differentiable weighted per-shard-loss aggregation across ranks</span>
		losses_per_rank = torch.distributed.nn.functional.all_gather(loss, group=sp_group)
		<span class="hljs-comment"># special dealing with SFT that has prompt tokens that aren't used in loss computation</span>
		good_tokens = (shift_labels != -<span class="hljs-number">100</span>).view(-<span class="hljs-number">1</span>).<span class="hljs-built_in">sum</span>()
		good_tokens_per_rank = torch.distributed.nn.functional.all_gather(
			good_tokens, group=sp_group
		)
		<span class="hljs-comment"># Skip ranks with zero valid tokens to avoid NaN contamination (NaN * 0 = NaN)</span>
		total_loss = <span class="hljs-built_in">sum</span>(
			losses_per_rank[rank] * good_tokens_per_rank[rank]
			<span class="hljs-keyword">for</span> rank <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(sp_world_size)
			<span class="hljs-keyword">if</span> good_tokens_per_rank[rank] > <span class="hljs-number">0</span>
		)
		total_good_tokens = <span class="hljs-built_in">sum</span>(good_tokens_per_rank)
		loss = total_loss / <span class="hljs-built_in">max</span>(total_good_tokens, <span class="hljs-number">1</span>)

	<span class="hljs-keyword">if</span> rank == <span class="hljs-number">0</span>: accelerator.<span class="hljs-built_in">print</span>(<span class="hljs-string">f"<span class="hljs-subst">{<span class="hljs-built_in">iter</span>}</span>: <span class="hljs-subst">{loss=}</span>"</span>)
	accelerator.log(<span class="hljs-built_in">dict</span>(train_loss=loss, step=<span class="hljs-built_in">iter</span>))

	accelerator.backward(loss)
	optimizer.step()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-sges4l">Note that models automatically handle <code>shift_labels</code> when it’s present in the batch. The model’s forward pass receives <code>shift_labels</code> via <code>**kwargs</code> and passes it to the loss function, which correctly computes the loss for sequence parallelism. If you use <a href="https://github.com/linkedin/Liger-Kernel" rel="nofollow">Liger Kernel</a>, it also handles <code>shift_labels</code> seamlessly and computes the loss in a very memory-efficient way. Liger is highly recommended for long sequence lengths, as it liberates GPU memory by using fused operations (e.g., fused logit-loss computation that never materializes the full logits tensor in memory).</p> <p data-svelte-h="svelte-1i6q8r9">If you want to see what HF Accelerate does behind the scenes, please read <a href="https://www.deepspeed.ai/tutorials/ulysses-alst-sequence-parallelism/" rel="nofollow">this full integration tutorial</a>.</p> <p data-svelte-h="svelte-1x8p321">For an example of an Accelerate training loop with ALST/UlyssesSP enabled, see <a href="https://github.com/huggingface/accelerate/blob/main/examples/alst_ulysses_sequence_parallelism" rel="nofollow">examples/alst_ulysses_sequence_parallelism</a>.</p> <p data-svelte-h="svelte-1i5r9ol"><strong>Warning</strong></p> <blockquote data-svelte-h="svelte-1do693p"><p>This API is quite new and still in its experimental stage. While we strive to provide a stable API, some small parts of the public API may change in the future.</p></blockquote> <p data-svelte-h="svelte-ctzlnp">Since this is a DeepSpeed backend, the usual DeepSpeed configuration applies, so you can also combine sequence parallelism with optimizer state and/or weight offloading to free up more GPU memory and enable even longer sequence lengths. 
This technology has been tested to work with DeepSpeed ZeRO stage 2 and 3.</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/accelerate/blob/main/docs/source/concept_guides/sequence_parallelism.md" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>

	<script>
	{
	// Generated SvelteKit bootstrap for this page.
	// The object below is assigned without a declaration, so it becomes a global.
	// NOTE(review): presumably the imported bundles look it up under this exact
	// generated name — confirm before renaming or editing by hand.
	__sveltekit_1q7nz6m = {
	assets: "/docs/accelerate/pr_4021/en",
	base: "/docs/accelerate/pr_4021/en",
	env: {}
	};

	// Parent element of the script tag that is currently executing; it is passed
	// to kit.start() below. document.currentScript is only set while the script
	// runs synchronously, so it must be captured here and not inside the
	// promise callback.
	const element = document.currentScript.parentElement;

	// Two null placeholders forwarded as `data` to kit.start().
	// NOTE(review): presumably one entry per id in node_ids — confirm.
	const data = [null,null];

	// Load the start and app entry modules in parallel, then call kit.start()
	// with the app module, the element captured above and the initial options.
	Promise.all([
	import("/docs/accelerate/pr_4021/en/_app/immutable/entry/start.8a49e72b.js"),
	import("/docs/accelerate/pr_4021/en/_app/immutable/entry/app.1df4d18e.js")
	]).then(([kit, app]) => {
	kit.start(app, element, {
	node_ids: [0, 19],
	data,
	form: null,
	error: null
	});
	});
	}
	</script>

Xet Storage Details

Size: 40 kB
Xet hash: 8383e96f87243d1c8a10cf2034f933ba8e1b8c8c9460a9a9a0670743d99257c4

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.