Buckets:

hf-doc-build/doc-dev / trl /pr_5607 /en /distributing_training.html
download
raw
96.5 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Distributing Training&quot;,&quot;local&quot;:&quot;distributing-training&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Multi-GPU Training with TRL&quot;,&quot;local&quot;:&quot;multi-gpu-training-with-trl&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Sequence Parallelism for Long Context Training&quot;,&quot;local&quot;:&quot;sequence-parallelism-for-long-context-training&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Available Implementations&quot;,&quot;local&quot;:&quot;available-implementations&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Choosing Between Ring Attention and Ulysses&quot;,&quot;local&quot;:&quot;choosing-between-ring-attention-and-ulysses&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Ring Attention Implementation (FSDP2)&quot;,&quot;local&quot;:&quot;ring-attention-implementation-fsdp2&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Requirements and Limitations&quot;,&quot;local&quot;:&quot;requirements-and-limitations&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Configuration&quot;,&quot;local&quot;:&quot;configuration&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Accelerate Configuration&quot;,&quot;local&quot;:&quot;accelerate-configuration&quot;,&quot;sections&quot;:[],&quot;depth&quot;:5},{&quot;title&quot;:&quot;Training Configuration&quot;,&quot;local&quot;:&quot;training-configuration&quot;,&quot;sections&quot;:[],&quot;depth&quot;:5}],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Best Practices&quot;,&quot;local&quot;:&quot;best-practices&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Benchmarking Ring Attention&quot;,&quot;local&quot;:&quot;benchmarking-ring-attention&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4}],&quot;depth&quot;:3},{&quot;title&quot;:&quot;ALST/Ulysses 
Implementation (DeepSpeed)&quot;,&quot;local&quot;:&quot;alstulysses-implementation-deepspeed&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Requirements and Limitations&quot;,&quot;local&quot;:&quot;requirements-and-limitations&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Configuration&quot;,&quot;local&quot;:&quot;configuration&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Accelerate Configuration&quot;,&quot;local&quot;:&quot;accelerate-configuration&quot;,&quot;sections&quot;:[],&quot;depth&quot;:5},{&quot;title&quot;:&quot;Training Configuration&quot;,&quot;local&quot;:&quot;training-configuration&quot;,&quot;sections&quot;:[],&quot;depth&quot;:5}],&quot;depth&quot;:4},{&quot;title&quot;:&quot;2D Parallelism&quot;,&quot;local&quot;:&quot;2d-parallelism&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Best Practices&quot;,&quot;local&quot;:&quot;best-practices&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Complete Example&quot;,&quot;local&quot;:&quot;complete-example&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4}],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Further Reading&quot;,&quot;local&quot;:&quot;further-reading&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;General Resources&quot;,&quot;local&quot;:&quot;general-resources&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Ring Attention (FSDP2)&quot;,&quot;local&quot;:&quot;ring-attention-fsdp2&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;ALST/Ulysses (DeepSpeed)&quot;,&quot;local&quot;:&quot;alstulysses-deepspeed&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4}],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Multi-Node Training&quot;,&quot;local&quot;:&quot;multi-node-training&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Accelerate 
Configuration&quot;,&quot;local&quot;:&quot;accelerate-configuration&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Launching&quot;,&quot;local&quot;:&quot;launching&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Option 1: Manual Launch (Non-HPC)&quot;,&quot;local&quot;:&quot;option-1-manual-launch-non-hpc&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Option 2: SLURM Launch (HPC Clusters)&quot;,&quot;local&quot;:&quot;option-2-slurm-launch-hpc-clusters&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4}],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Further Reading&quot;,&quot;local&quot;:&quot;further-reading&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/trl/pr_5607/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/trl/pr_5607/en/_app/immutable/entry/start.151d81bd.js">
<link rel="modulepreload" href="/docs/trl/pr_5607/en/_app/immutable/chunks/scheduler.7b731bd4.js">
<link rel="modulepreload" href="/docs/trl/pr_5607/en/_app/immutable/chunks/singletons.2cf51804.js">
<link rel="modulepreload" href="/docs/trl/pr_5607/en/_app/immutable/chunks/index.ac28c20f.js">
<link rel="modulepreload" href="/docs/trl/pr_5607/en/_app/immutable/chunks/paths.ba01f37d.js">
<link rel="modulepreload" href="/docs/trl/pr_5607/en/_app/immutable/entry/app.3d9a91c0.js">
<link rel="modulepreload" href="/docs/trl/pr_5607/en/_app/immutable/chunks/preload-helper.e1689b3a.js">
<link rel="modulepreload" href="/docs/trl/pr_5607/en/_app/immutable/chunks/index.cc268345.js">
<link rel="modulepreload" href="/docs/trl/pr_5607/en/_app/immutable/nodes/0.cd288160.js">
<link rel="modulepreload" href="/docs/trl/pr_5607/en/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/trl/pr_5607/en/_app/immutable/nodes/15.bb9a33aa.js">
<link rel="modulepreload" href="/docs/trl/pr_5607/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.f0d99f98.js">
<link rel="modulepreload" href="/docs/trl/pr_5607/en/_app/immutable/chunks/CodeBlock.169a125f.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Distributing Training&quot;,&quot;local&quot;:&quot;distributing-training&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Multi-GPU Training with TRL&quot;,&quot;local&quot;:&quot;multi-gpu-training-with-trl&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Sequence Parallelism for Long Context Training&quot;,&quot;local&quot;:&quot;sequence-parallelism-for-long-context-training&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Available Implementations&quot;,&quot;local&quot;:&quot;available-implementations&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Choosing Between Ring Attention and Ulysses&quot;,&quot;local&quot;:&quot;choosing-between-ring-attention-and-ulysses&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Ring Attention Implementation (FSDP2)&quot;,&quot;local&quot;:&quot;ring-attention-implementation-fsdp2&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Requirements and Limitations&quot;,&quot;local&quot;:&quot;requirements-and-limitations&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Configuration&quot;,&quot;local&quot;:&quot;configuration&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Accelerate Configuration&quot;,&quot;local&quot;:&quot;accelerate-configuration&quot;,&quot;sections&quot;:[],&quot;depth&quot;:5},{&quot;title&quot;:&quot;Training Configuration&quot;,&quot;local&quot;:&quot;training-configuration&quot;,&quot;sections&quot;:[],&quot;depth&quot;:5}],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Best Practices&quot;,&quot;local&quot;:&quot;best-practices&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Benchmarking Ring 
Attention&quot;,&quot;local&quot;:&quot;benchmarking-ring-attention&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4}],&quot;depth&quot;:3},{&quot;title&quot;:&quot;ALST/Ulysses Implementation (DeepSpeed)&quot;,&quot;local&quot;:&quot;alstulysses-implementation-deepspeed&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Requirements and Limitations&quot;,&quot;local&quot;:&quot;requirements-and-limitations&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Configuration&quot;,&quot;local&quot;:&quot;configuration&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Accelerate Configuration&quot;,&quot;local&quot;:&quot;accelerate-configuration&quot;,&quot;sections&quot;:[],&quot;depth&quot;:5},{&quot;title&quot;:&quot;Training Configuration&quot;,&quot;local&quot;:&quot;training-configuration&quot;,&quot;sections&quot;:[],&quot;depth&quot;:5}],&quot;depth&quot;:4},{&quot;title&quot;:&quot;2D Parallelism&quot;,&quot;local&quot;:&quot;2d-parallelism&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Best Practices&quot;,&quot;local&quot;:&quot;best-practices&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Complete Example&quot;,&quot;local&quot;:&quot;complete-example&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4}],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Further Reading&quot;,&quot;local&quot;:&quot;further-reading&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;General Resources&quot;,&quot;local&quot;:&quot;general-resources&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Ring Attention (FSDP2)&quot;,&quot;local&quot;:&quot;ring-attention-fsdp2&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;ALST/Ulysses (DeepSpeed)&quot;,&quot;local&quot;:&quot;alstulysses-deepspeed&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4}],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Multi-Node 
Training&quot;,&quot;local&quot;:&quot;multi-node-training&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Accelerate Configuration&quot;,&quot;local&quot;:&quot;accelerate-configuration&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Launching&quot;,&quot;local&quot;:&quot;launching&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Option 1: Manual Launch (Non-HPC)&quot;,&quot;local&quot;:&quot;option-1-manual-launch-non-hpc&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Option 2: SLURM Launch (HPC Clusters)&quot;,&quot;local&quot;:&quot;option-2-slurm-launch-hpc-clusters&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4}],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Further Reading&quot;,&quot;local&quot;:&quot;further-reading&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path 
d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="distributing-training" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#distributing-training"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Distributing Training</span></h1> <blockquote class="warning" data-svelte-h="svelte-1gc28wp"><p>Section 
under construction. Feel free to contribute!</p></blockquote> <h2 class="relative group"><a id="multi-gpu-training-with-trl" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#multi-gpu-training-with-trl"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Multi-GPU Training with TRL</span></h2> <p data-svelte-h="svelte-1awbvnz">The trainers in TRL use <a href="https://github.com/huggingface/accelerate" rel="nofollow">🤗 Accelerate</a> to enable distributed training across multiple GPUs or nodes. 
To do so, first create an <a href="https://github.com/huggingface/accelerate" rel="nofollow">🤗 Accelerate</a> config file by running</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->accelerate config<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-n2kdl3">and answering the questions according to your multi-GPU / multi-node setup. 
You can then launch distributed training by running:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->accelerate launch train.py<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-15pg0q3">We also provide config files in the <a href="https://github.com/huggingface/trl/tree/main/examples/accelerate_configs" rel="nofollow">examples folder</a> that can be used as templates. 
To use these templates, simply pass the path to the config file when launching a job, e.g.:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->accelerate launch --config_file examples/accelerate_configs/multi_gpu.yaml train.py &lt;SCRIPT_ARGS&gt;<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1n2xjt">This automatically distributes the workload across all available GPUs.</p> <p data-svelte-h="svelte-zfygwf">Under the hood, <a href="https://github.com/huggingface/accelerate" rel="nofollow">🤗 Accelerate</a> creates one model per GPU. 
Each process:</p> <ul data-svelte-h="svelte-3s441m"><li>Processes its own batch of data</li> <li>Computes the loss and gradients for that batch</li> <li>Shares gradient updates across all GPUs</li></ul> <p data-svelte-h="svelte-jw4lic"><img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/multi_gpu.png" alt="multi gpu"></p> <p>The effective batch size is calculated as:
<!-- HTML_TAG_START --><span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><semantics><mrow><mtext>Batch Size</mtext><mo>=</mo><mtext>per_device_train_batch_size</mtext><mo>×</mo><mtext>num_devices</mtext><mo>×</mo><mtext>gradient_accumulation_steps</mtext></mrow><annotation encoding="application/x-tex">
\text{Batch Size} = \text{per\_device\_train\_batch\_size} \times \text{num\_devices} \times \text{gradient\_accumulation\_steps}
</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.6944em;"></span><span class="mord text"><span class="mord">Batch Size</span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:1.0044em;vertical-align:-0.31em;"></span><span class="mord text"><span class="mord">per_device_train_batch_size</span></span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">×</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" style="height:1.0044em;vertical-align:-0.31em;"></span><span class="mord text"><span class="mord">num_devices</span></span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">×</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" style="height:1.0044em;vertical-align:-0.31em;"></span><span class="mord text"><span class="mord">gradient_accumulation_steps</span></span></span></span></span></span><!-- HTML_TAG_END --></p> <p data-svelte-h="svelte-19d9mn0">To maintain a consistent batch size when scaling to multiple GPUs, make sure to update <code>per_device_train_batch_size</code> and <code>gradient_accumulation_steps</code> accordingly.</p> <p data-svelte-h="svelte-1npf4a2">For example, these configurations are equivalent and should yield the same results:</p> <table data-svelte-h="svelte-nztwgr"><thead><tr><th>Number of GPUs</th> <th>Per device batch size</th> <th>Gradient accumulation steps</th> <th>Comments</th></tr></thead> <tbody><tr><td>1</td> <td>32</td> <td>1</td> <td>Possibly high memory usage, but faster training</td></tr> <tr><td>1</td> <td>4</td> <td>8</td> <td>Lower memory usage, slower training</td></tr> <tr><td>8</td> <td>4</td> <td>1</td> 
<td>Multi-GPU to get the best of both worlds</td></tr></tbody></table> <blockquote class="tip" data-svelte-h="svelte-yb03hq"><p>Having one model per GPU can lead to high memory usage, which may not be feasible for large models or low-memory GPUs. In such cases, you can leverage <a href="https://github.com/deepspeedai/DeepSpeed" rel="nofollow">DeepSpeed</a>, which provides optimizations like model sharding, Zero Redundancy Optimizer, mixed precision training, and offloading to CPU or NVMe. Check out our <a href="deepspeed_integration">DeepSpeed Integration</a> guide for more details.</p></blockquote> <h2 class="relative group"><a id="sequence-parallelism-for-long-context-training" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#sequence-parallelism-for-long-context-training"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Sequence Parallelism for Long Context Training</span></h2> <p data-svelte-h="svelte-1brkfhl">Sequence Parallelism (also called Context Parallelism) is a parallelization technique that enables training with longer sequences by splitting the sequence dimension across multiple GPUs. 
Each GPU processes a portion of the sequence, allowing you to train with sequences longer than what would fit on a single GPU’s memory.</p> <blockquote class="note" data-svelte-h="svelte-1hiylk8"><p><strong>Terminology clarification:</strong> This section describes parallelism techniques for splitting sequences to enable longer context training:</p> <ul><li><strong>Context Parallelism (CP)</strong>: Splits sequences across GPUs (implemented as Ring Attention with FSDP2)</li> <li><strong>Sequence Parallelism (SP)</strong>: Another form of sequence splitting (implemented as ALST/Ulysses with DeepSpeed)</li></ul> <p>Both CP and SP are different from traditional Sequence Parallelism used with Tensor Parallelism (TP+SP) to reduce activation memory. With the techniques here, parallelism dimensions multiply: <code>TP=2</code> and <code>CP=2</code> would require 4 GPUs (2×2), whereas traditional <code>TP+SP=2</code> only needs 2 GPUs as they share the same ranks.</p> <p>In Accelerate’s <code>ParallelismConfig</code>:</p> <ul><li>Use <code>cp_size</code> with <code>cp_backend=&quot;torch&quot;</code> for Ring Attention (FSDP2)</li> <li>Use <code>sp_size</code> with <code>sp_backend=&quot;deepspeed&quot;</code> for ALST/Ulysses (DeepSpeed)</li></ul></blockquote> <p data-svelte-h="svelte-zxojyl">Sequence parallelism is particularly useful when:</p> <ul data-svelte-h="svelte-1los5ox"><li>You want to train with very long sequences (&gt;32k tokens)</li> <li>Single GPU memory is insufficient for your desired sequence length</li> <li>You need to maintain sequence coherence across the full context</li></ul> <h3 class="relative group"><a id="available-implementations" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#available-implementations"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" 
aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Available Implementations</span></h3> <p data-svelte-h="svelte-1rfz09e">TRL supports two sequence parallelism implementations, each with different characteristics:</p> <ol data-svelte-h="svelte-hf6l75"><li><strong>Ring Attention (FSDP2)</strong> - Uses ring-based communication for memory-efficient processing of extremely long sequences</li> <li><strong>ALST/Ulysses (DeepSpeed)</strong> - Uses attention head parallelism for faster training with high-bandwidth interconnects</li></ol> <blockquote class="important" data-svelte-h="svelte-134wih1"><p><strong>Sequence Length Terminology:</strong> When using Context Parallelism, the sequence is split across GPUs, introducing two concepts:</p> <ul><li><strong>Global sequence length</strong>: The full sequence length before splitting across GPUs</li> <li><strong>Micro sequence length</strong>: The sequence length per GPU after splitting</li></ul> <p>In TRL, <code>max_seq_length</code> (or <code>max_length</code>) refers to the <strong>global sequence length</strong>. The framework automatically handles splitting into micro sequences:</p> <ul><li><strong>Ring Attention (FSDP2)</strong>: Uses <code>cp_size</code> to split sequences. 
With <code>max_seq_length=8192</code> and <code>cp_size=4</code>, each GPU processes 2048 tokens.</li> <li><strong>ALST/Ulysses (DeepSpeed)</strong>: Uses <code>sp_size</code> (with <code>sp_backend=&quot;deepspeed&quot;</code>) to split sequences. With <code>max_seq_length=8192</code> and <code>sp_size=2</code>, each GPU processes 4096 tokens.</li></ul> <p>The Trainer automatically accounts for context parallelism when calculating batch sizes and training metrics.</p></blockquote> <h3 class="relative group"><a id="choosing-between-ring-attention-and-ulysses" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#choosing-between-ring-attention-and-ulysses"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Choosing Between Ring Attention and Ulysses</span></h3> <p data-svelte-h="svelte-1wzlppl">The comparison table below highlights the key differences between the two approaches:</p> <table data-svelte-h="svelte-1260gia"><thead><tr><th>Feature</th> <th>Ring Attention (FSDP2)</th> <th>ALST/Ulysses (DeepSpeed)</th></tr></thead> <tbody><tr><td><strong>Method</strong></td> <td>Ring Self-Attention</td> <td>Attention Head 
Parallelism</td></tr> <tr><td><strong>Backend</strong></td> <td>PyTorch FSDP2</td> <td>DeepSpeed ZeRO</td></tr> <tr><td><strong>Attention</strong></td> <td>SDPA only</td> <td>Flash Attention 2 or SDPA</td></tr> <tr><td><strong>Minimum Accelerate</strong></td> <td>1.11.0+</td> <td>1.12.0+</td></tr> <tr><td><strong>Minimum DeepSpeed</strong></td> <td>N/A</td> <td>0.18.1+</td></tr> <tr><td><strong>Sequence Divisibility</strong></td> <td><code>cp_size * 2</code></td> <td><code>sp_size</code></td></tr> <tr><td><strong>Zero Stage</strong></td> <td>N/A</td> <td>ZeRO Stage 1/2/3</td></tr></tbody></table> <p data-svelte-h="svelte-1d2ikj1"><strong>Ring Attention is better when:</strong></p> <ul data-svelte-h="svelte-qvcdaq"><li>You need to handle extremely long sequences (1M+ tokens)</li> <li>The model has limited attention heads (Ring Attention is not constrained by head count)</li> <li>You want flexibility in scaling to any sequence length</li> <li>Network topology is limited (Ring Attention works with simple P2P ring communication)</li></ul> <p data-svelte-h="svelte-139nysj"><strong>Ulysses is better when:</strong></p> <ul data-svelte-h="svelte-nj5q4e"><li>You have high-bandwidth, low-latency interconnects (NVLink, InfiniBand)</li> <li>The model has many attention heads that can be split across GPUs</li> <li>You want lower communication volume</li> <li>You want faster training speed for moderate sequence lengths (up to ~500k tokens)</li></ul> <p data-svelte-h="svelte-y9bny5"><strong>Key Trade-offs:</strong></p> <ul data-svelte-h="svelte-1fqxzgi"><li><strong>Communication Volume:</strong> Ulysses has lower communication volume, making it more efficient with good interconnects. Ring Attention has higher communication volume but is more flexible with different network topologies.</li> <li><strong>Attention Head Constraints:</strong> Ulysses is limited by the number of attention heads (requires <code>num_heads &gt;= sp_size</code>). 
Ring Attention scales with sequence length regardless of model architecture.</li> <li><strong>Network Sensitivity:</strong> Ulysses all-to-all communication is sensitive to network latency. Ring Attention uses P2P ring communication which is more tolerant of varying network conditions.</li></ul> <p data-svelte-h="svelte-cv9p0n">For a detailed comparison, see the <a href="https://huggingface.co/blog/exploding-gradients/ulysses-ring-attention" rel="nofollow">Ulysses and Ring Attention blog post</a>.</p> <h3 class="relative group"><a id="ring-attention-implementation-fsdp2" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#ring-attention-implementation-fsdp2"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Ring Attention Implementation (FSDP2)</span></h3> <p data-svelte-h="svelte-1f7hwjq">Ring Attention uses a ring-like communication pattern where each GPU processes a portion of the sequence and passes information to the next GPU in the ring.</p> <h4 class="relative group"><a id="requirements-and-limitations" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 
with-hover:group-hover:opacity-100 with-hover:right-full" href="#requirements-and-limitations"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Requirements and Limitations</span></h4> <ol data-svelte-h="svelte-1d99bf5"><li><strong>Accelerate 1.11.0 or higher</strong> is required for Ring Attention / Context Parallelism support</li> <li><strong>FSDP2 (PyTorch FSDP v2)</strong> is required as the distributed training backend</li> <li><strong>SDPA attention</strong> - Flash Attention is currently not supported</li> <li><strong>Sequence length divisibility</strong> - sequences must be divisible by <code>cp_size * 2</code>. 
This is automatically handled using the <code>pad_to_multiple_of</code> parameter in the data collator.</li></ol> <h4 class="relative group"><a id="configuration" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#configuration"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Configuration</span></h4> <h5 class="relative group"><a id="accelerate-configuration" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#accelerate-configuration"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 
8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Accelerate Configuration</span></h5> <p data-svelte-h="svelte-14prv5">Use one of the provided accelerate config files (e.g. <a href="https://github.com/huggingface/trl/blob/main/examples/accelerate_configs/context_parallel_2gpu.yaml" rel="nofollow"><code>context_parallel_2gpu.yaml</code></a> for 2 GPUs):</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-attr">compute_environment:</span> <span class="hljs-string">LOCAL_MACHINE</span>
<span class="hljs-attr">debug:</span> <span class="hljs-literal">false</span>
<span class="hljs-attr">distributed_type:</span> <span class="hljs-string">FSDP</span>
<span class="hljs-attr">downcast_bf16:</span> <span class="hljs-string">&#x27;no&#x27;</span>
<span class="hljs-attr">enable_cpu_affinity:</span> <span class="hljs-literal">false</span>
<span class="hljs-attr">fsdp_config:</span>
<span class="hljs-attr">fsdp_activation_checkpointing:</span> <span class="hljs-literal">true</span> <span class="hljs-comment"># Enable activation checkpointing for memory efficiency</span>
<span class="hljs-attr">fsdp_auto_wrap_policy:</span> <span class="hljs-string">TRANSFORMER_BASED_WRAP</span>
<span class="hljs-attr">fsdp_cpu_ram_efficient_loading:</span> <span class="hljs-literal">true</span>
<span class="hljs-attr">fsdp_offload_params:</span> <span class="hljs-literal">false</span>
<span class="hljs-attr">fsdp_reshard_after_forward:</span> <span class="hljs-literal">true</span>
<span class="hljs-attr">fsdp_state_dict_type:</span> <span class="hljs-string">FULL_STATE_DICT</span>
<span class="hljs-attr">fsdp_version:</span> <span class="hljs-number">2</span>
<span class="hljs-attr">machine_rank:</span> <span class="hljs-number">0</span>
<span class="hljs-attr">main_training_function:</span> <span class="hljs-string">main</span>
<span class="hljs-attr">mixed_precision:</span> <span class="hljs-string">bf16</span>
<span class="hljs-attr">num_machines:</span> <span class="hljs-number">1</span>
<span class="hljs-attr">num_processes:</span> <span class="hljs-number">2</span> <span class="hljs-comment"># Number of GPUs</span>
<span class="hljs-attr">rdzv_backend:</span> <span class="hljs-string">static</span>
<span class="hljs-attr">same_network:</span> <span class="hljs-literal">true</span>
<span class="hljs-attr">tpu_env:</span> []
<span class="hljs-attr">tpu_use_cluster:</span> <span class="hljs-literal">false</span>
<span class="hljs-attr">tpu_use_sudo:</span> <span class="hljs-literal">false</span>
<span class="hljs-attr">use_cpu:</span> <span class="hljs-literal">false</span>
<span class="hljs-attr">parallelism_config:</span>
<span class="hljs-attr">parallelism_config_dp_replicate_size:</span> <span class="hljs-number">1</span>
<span class="hljs-attr">parallelism_config_dp_shard_size:</span> <span class="hljs-number">1</span>
<span class="hljs-attr">parallelism_config_tp_size:</span> <span class="hljs-number">1</span>
<span class="hljs-attr">parallelism_config_cp_size:</span> <span class="hljs-number">2</span> <span class="hljs-comment"># Context parallel size</span><!-- HTML_TAG_END --></pre></div> <h5 class="relative group"><a id="training-configuration" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#training-configuration"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Training Configuration</span></h5> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" 
height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> SFTConfig
training_args = SFTConfig(
<span class="hljs-comment"># required</span>
pad_to_multiple_of=<span class="hljs-number">4</span>, <span class="hljs-comment"># ensures divisibility by cp_size * 2</span>
<span class="hljs-comment"># to get the most out of CP</span>
max_length=<span class="hljs-number">16384</span>, <span class="hljs-comment"># long sequence length</span>
packing=<span class="hljs-literal">True</span>, <span class="hljs-comment"># use packing to reduce padding</span>
use_liger_kernel=<span class="hljs-literal">True</span>, <span class="hljs-comment"># compatible with CP</span>
gradient_checkpointing=<span class="hljs-literal">False</span>, <span class="hljs-comment"># The activation_checkpointing in FSDP config and the gradient_checkpointing in training arg can&#x27;t be set to True simultaneously</span>
per_device_train_batch_size=<span class="hljs-number">1</span>,
...
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1opb19">Then, launch your training script with the appropriate accelerate config file:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->accelerate launch --config_file context_parallel_2gpu.yaml train.py<!-- HTML_TAG_END --></pre></div> <h4 class="relative group"><a id="best-practices" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#best-practices"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 
1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Best Practices</span></h4> <ol data-svelte-h="svelte-mfle9"><li><p><strong>Use the <code>pad_to_multiple_of</code> parameter</strong> - This is now the recommended way to ensure sequence length divisibility:</p> <ul><li>For <code>cp_size=2</code>: use <code>pad_to_multiple_of=4</code> (since <code>cp_size * 2 = 4</code>)</li> <li>For <code>cp_size=4</code>: use <code>pad_to_multiple_of=8</code> (since <code>cp_size * 2 = 8</code>)</li> <li>The data collator automatically pads sequences to the required multiple, ensuring compatibility with CP</li></ul></li> <li><p><strong>Use packing with padding</strong> - The default BFD (Best Fit Decreasing) strategy works perfectly:</p> <ul><li>Preserves sequence boundaries and maintains training quality</li> <li>Works seamlessly with both <code>padding_free=True</code> and standard padding modes</li></ul></li> <li><p><strong>Combine with other memory optimizations</strong> like Liger kernels, bfloat16, and gradient checkpointing</p></li> <li><p><strong>Start with smaller context parallel sizes</strong> (2-4 GPUs) before scaling up</p></li> <li><p><strong>Monitor memory usage</strong> across all GPUs to ensure balanced workload</p></li></ol> <h4 class="relative group"><a id="benchmarking-ring-attention" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#benchmarking-ring-attention"><span><svg class="" 
xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Benchmarking Ring Attention</span></h4> <p data-svelte-h="svelte-17o749c">We benchmarked Ring Attention to highlight its potential improvements in training efficiency.<br>
Our experiments were conducted using <strong>1, 2, 4, and 8 H100 GPUs</strong>, though the results can be extended to larger clusters with more nodes and GPUs.</p> <p data-svelte-h="svelte-1ar3p4q">For the setup, we fine-tuned an <strong>8B model</strong> (<a href="https://huggingface.co/Qwen/Qwen3-8B" rel="nofollow">Qwen/Qwen3-8B</a>) using the provided accelerate configuration<br>
(<a href="https://github.com/huggingface/trl/blob/main/examples/accelerate_configs/context_parallel_2gpu.yaml" rel="nofollow"><code>context_parallel_2gpu.yaml</code></a>).<br>
We adjusted <code>num_processes</code> and <code>parallelism_config_cp_size</code> based on the number of GPUs for each run.<br>
Training was performed with the <a href="https://github.com/huggingface/trl/blob/main/trl/scripts/sft.py" rel="nofollow">sft.py</a> example script, combined with the parameters described above.</p> <p data-svelte-h="svelte-iruthr">The results below summarize the <strong>maximum trainable sequence length</strong> and <strong>iterations per second</strong> for different numbers of GPUs. A value marked as <code>OOM</code> indicates that the configuration ran out of memory and could not be trained.</p> <p data-svelte-h="svelte-1yjupp2">These results show that <strong>Context Parallelism (CP) scales effectively with more GPUs</strong>, enabling training on much longer sequences. With <strong>8 GPUs</strong>, context lengths of over <strong>300k tokens</strong> become feasible, unlocking training with extremely long contexts while maintaining reasonable throughput.</p> <div class="flex justify-center" data-svelte-h="svelte-66t12q"><img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/context_parallelism_max_length_plot.png" alt="CP Max content length" width="45%"> <img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/context_parallelism_s_it_plot.png" alt="CP seconds/iteration" width="45%"></div> <blockquote class="tip" data-svelte-h="svelte-x1ndil"><p>Accelerate also supports <strong>N-Dimensional Parallelism (ND-parallelism)</strong>, which enables you to combine different parallelization strategies to efficiently distribute model training across multiple GPUs.</p> <p>You can learn more and explore configuration examples in the <a href="https://github.com/huggingface/accelerate/blob/main/examples/torch_native_parallelism/README.md#nd-parallelism" rel="nofollow">Accelerate ND-parallelism guide</a>.</p></blockquote> <h3 class="relative group"><a id="alstulysses-implementation-deepspeed" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 
with-hover:group-hover:opacity-100 with-hover:right-full" href="#alstulysses-implementation-deepspeed"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>ALST/Ulysses Implementation (DeepSpeed)</span></h3> <p data-svelte-h="svelte-atj2q9">ALST (Arctic Long Sequence Training) / Ulysses uses attention head parallelism to split long sequences across GPUs, working with DeepSpeed’s ZeRO optimizer.</p> <blockquote class="note" data-svelte-h="svelte-2of9o2"><p><strong>Technical Note on Parallelism Configuration:</strong></p> <ul><li><strong>DeepSpeed ALST/Ulysses</strong> uses <code>sp_size</code> with <code>sp_backend=&quot;deepspeed&quot;</code> in both YAML and Python API</li> <li><strong>Ring Attention (FSDP2)</strong> uses <code>cp_size</code> with <code>cp_backend=&quot;torch&quot;</code></li></ul> <p>The Trainer automatically accounts for both CP and SP when calculating effective batch sizes and training metrics.</p></blockquote> <h4 class="relative group"><a id="requirements-and-limitations" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#requirements-and-limitations"><span><svg class="" xmlns="http://www.w3.org/2000/svg" 
xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Requirements and Limitations</span></h4> <ol data-svelte-h="svelte-1v9mepx"><li><strong>DeepSpeed 0.18.1 or higher</strong> is required</li> <li><strong>Accelerate 1.12.0 or higher</strong> is required for ALST/Ulysses sequence parallelism support</li> <li><strong>Attention implementation</strong> - Flash Attention 2 recommended (clean output), SDPA works as fallback</li> <li><strong>Sequence length divisibility</strong> - sequences must be divisible by <code>sp_size</code>. 
Use <code>pad_to_multiple_of</code> in your training config.</li> <li><strong>Parallelism configuration</strong> - You must ensure <code>dp_replicate_size × dp_shard_size × sp_size = num_processes</code></li></ol> <h4 class="relative group"><a id="configuration" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#configuration"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Configuration</span></h4> <h5 class="relative group"><a id="accelerate-configuration" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#accelerate-configuration"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 
79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Accelerate Configuration</span></h5> <p data-svelte-h="svelte-gkmp17">Use the provided accelerate config file (<a href="https://github.com/huggingface/trl/blob/main/examples/accelerate_configs/alst_ulysses_4gpu.yaml" rel="nofollow"><code>alst_ulysses_4gpu.yaml</code></a>):</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-attr">compute_environment:</span> <span class="hljs-string">LOCAL_MACHINE</span>
<span class="hljs-attr">debug:</span> <span class="hljs-literal">false</span>
<span class="hljs-attr">deepspeed_config:</span>
<span class="hljs-attr">zero_stage:</span> <span class="hljs-number">3</span>
<span class="hljs-attr">seq_parallel_communication_data_type:</span> <span class="hljs-string">bf16</span>
<span class="hljs-attr">distributed_type:</span> <span class="hljs-string">DEEPSPEED</span>
<span class="hljs-attr">mixed_precision:</span> <span class="hljs-string">bf16</span>
<span class="hljs-attr">num_machines:</span> <span class="hljs-number">1</span>
<span class="hljs-attr">num_processes:</span> <span class="hljs-number">4</span> <span class="hljs-comment"># Number of GPUs</span>
<span class="hljs-attr">parallelism_config:</span>
<span class="hljs-attr">parallelism_config_dp_replicate_size:</span> <span class="hljs-number">1</span>
<span class="hljs-attr">parallelism_config_dp_shard_size:</span> <span class="hljs-number">2</span> <span class="hljs-comment"># Enables 2D parallelism with SP</span>
<span class="hljs-attr">parallelism_config_tp_size:</span> <span class="hljs-number">1</span>
<span class="hljs-attr">parallelism_config_sp_size:</span> <span class="hljs-number">2</span> <span class="hljs-comment"># Sequence parallel size</span>
<span class="hljs-attr">parallelism_config_sp_backend:</span> <span class="hljs-string">deepspeed</span>
<span class="hljs-attr">parallelism_config_sp_seq_length_is_variable:</span> <span class="hljs-literal">true</span>
<span class="hljs-attr">parallelism_config_sp_attn_implementation:</span> <span class="hljs-string">flash_attention_2</span><!-- HTML_TAG_END --></pre></div> <h5 class="relative group"><a id="training-configuration" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#training-configuration"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Training Configuration</span></h5> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute 
pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> SFTConfig
training_args = SFTConfig(
<span class="hljs-comment"># required</span>
pad_to_multiple_of=<span class="hljs-number">2</span>, <span class="hljs-comment"># Must be a multiple of sp_size (sequences must be divisible by sp_size)</span>
<span class="hljs-comment"># to get the most out of SP</span>
max_length=<span class="hljs-number">4096</span>,
packing=<span class="hljs-literal">True</span>,
attn_implementation=<span class="hljs-string">&quot;flash_attention_2&quot;</span>,
per_device_train_batch_size=<span class="hljs-number">1</span>,
...
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1opb19">Then, launch your training script with the appropriate accelerate config file:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->accelerate launch --config_file examples/accelerate_configs/alst_ulysses_4gpu.yaml train.py<!-- HTML_TAG_END --></pre></div> <h4 class="relative group"><a id="2d-parallelism" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#2d-parallelism"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 
67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>2D Parallelism</span></h4> <p data-svelte-h="svelte-1bz4r8q">The 4 GPU configuration above automatically enables 2D parallelism by combining Data Parallelism (DP) with Sequence Parallelism (SP). With <code>sp_size=2</code> and <code>dp_shard_size=2</code>, the 4 GPUs are organized as:</p> <ul data-svelte-h="svelte-15qusls"><li>2 sequence parallel groups (processing the same data split across sequences)</li> <li>2 data parallel groups (processing different data)</li></ul> <p data-svelte-h="svelte-vbgu25">To adjust the parallelism for different GPU counts, modify the YAML config:</p> <table data-svelte-h="svelte-maroh2"><thead><tr><th>GPUs</th> <th>sp_size</th> <th>dp_shard_size</th> <th>Use Case</th> <th>YAML Changes</th></tr></thead> <tbody><tr><td>4</td> <td>2</td> <td>2</td> <td>Balanced - longer sequences + more data</td> <td><code>num_processes: 4</code>, <code>sp_size: 2</code>, <code>dp_shard_size: 2</code></td></tr> <tr><td>4</td> <td>4</td> <td>1</td> <td>Pure SP for maximum sequence length</td> <td><code>num_processes: 4</code>, <code>sp_size: 4</code>, <code>dp_shard_size: 1</code></td></tr> <tr><td>8</td> <td>2</td> <td>4</td> <td>Large-scale training</td> <td><code>num_processes: 8</code>, <code>sp_size: 2</code>, <code>dp_shard_size: 4</code></td></tr></tbody></table> <h4 class="relative group"><a id="best-practices" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 
with-hover:group-hover:opacity-100 with-hover:right-full" href="#best-practices"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Best Practices</span></h4> <ol data-svelte-h="svelte-1pt6kdg"><li><strong>Use <code>pad_to_multiple_of</code></strong> to ensure sequences are divisible by <code>sp_size</code></li> <li><strong>Use Flash Attention 2</strong> for clean output (SDPA works but shows packing warnings)</li> <li><strong>Start with <code>sp_size=2</code></strong> before scaling to larger values</li> <li><strong>Use DeepSpeed ZeRO Stage 3</strong> for large models</li> <li><strong>Combine with memory optimizations</strong> like Liger kernels and gradient checkpointing</li> <li><strong>Validate parallelism config</strong>: Ensure <code>dp_replicate_size × dp_shard_size × sp_size = num_processes</code></li></ol> <h4 class="relative group"><a id="complete-example" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#complete-example"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 
0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Complete Example</span></h4> <p data-svelte-h="svelte-18n46cg">Here’s how to run ALST/Ulysses training using the built-in <a href="https://github.com/huggingface/trl/blob/main/trl/scripts/sft.py" rel="nofollow"><code>sft.py</code></a> script with 4 GPUs:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> 
Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->accelerate launch --config_file examples/accelerate_configs/alst_ulysses_4gpu.yaml \
trl/scripts/sft.py \
--model_name_or_path Qwen/Qwen2-0.5B \
--dataset_name trl-lib/Capybara \
--learning_rate 2e-4 \
--max_steps 100 \
--max_seq_length 4096 \
--packing \
--packing_strategy wrapped \
--torch_dtype bfloat16 \
--attn_implementation flash_attention_2 \
--output_dir output-alst-4gpu \
--logging_steps 10 \
--report_to trackio<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-mk72jq">This command automatically:</p> <ul data-svelte-h="svelte-9wfs52"><li>Configures 2D parallelism (SP=2, DP=2) across 4 GPUs</li> <li>Uses Flash Attention 2 for clean training</li> <li>Enables packing with automatic padding to ensure sequence divisibility</li> <li>Leverages DeepSpeed ZeRO Stage 3 for memory efficiency</li></ul> <h3 class="relative group"><a id="further-reading" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#further-reading"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Further Reading</span></h3> <h4 class="relative group"><a id="general-resources" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#general-resources"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 
1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>General Resources</span></h4> <ul data-svelte-h="svelte-1nda474"><li><a href="https://huggingface.co/blog/exploding-gradients/ulysses-ring-attention" rel="nofollow">Hugging Face Blog: Understanding Ulysses and Ring Attention</a> - Detailed comparison of Ring Attention vs Ulysses approaches</li> <li><a href="https://huggingface.co/docs/accelerate/concept_guides/context_parallelism" rel="nofollow">Accelerate: Context Parallelism Guide</a></li> <li><a href="https://huggingface.co/blog/axolotl-ai-co/long-context-with-sequence-parallelism-in-axolotl" rel="nofollow">Hugging Face Blog: Enabling Long-Context Training with Sequence Parallelism in Axolotl</a></li></ul> <h4 class="relative group"><a id="ring-attention-fsdp2" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#ring-attention-fsdp2"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 
11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Ring Attention (FSDP2)</span></h4> <ul data-svelte-h="svelte-snw76h"><li><a href="https://huggingface.co/spaces/nanotron/ultrascale-playbook?section=context_parallelism" rel="nofollow">Ultrascale Playbook - Context Parallelism</a></li> <li><a href="https://github.com/huggingface/accelerate/blob/main/examples/torch_native_parallelism/README.md#context-parallelism-128k-sequence-length" rel="nofollow">Accelerate Example: 128k Sequence Length</a></li> <li><a href="https://github.com/huggingface/accelerate/blob/main/examples/torch_native_parallelism/README.md#nd-parallelism" rel="nofollow">Accelerate ND-parallelism Guide</a></li></ul> <h4 class="relative group"><a id="alstulysses-deepspeed" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#alstulysses-deepspeed"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>ALST/Ulysses (DeepSpeed)</span></h4> <ul data-svelte-h="svelte-klcbuh"><li><a href="https://www.deepspeed.ai/tutorials/ds-sequence/" 
rel="nofollow">DeepSpeed Sequence Parallelism Documentation</a></li> <li><a href="https://www.snowflake.com/en/engineering-blog/arctic-long-sequence-training-multi-million-token-ai/" rel="nofollow">Snowflake Engineering Blog: Arctic Long Sequence Training (ALST)</a></li></ul> <h2 class="relative group"><a id="multi-node-training" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#multi-node-training"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Multi-Node Training</span></h2> <p data-svelte-h="svelte-owoicr">When a single machine doesn’t have enough GPUs, TRL can scale training across multiple machines (nodes) using <a href="https://huggingface.co/docs/accelerate/basic_tutorials/launch#multi-node-training" rel="nofollow">🤗 Accelerate</a>.</p> <h3 class="relative group"><a id="accelerate-configuration" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#accelerate-configuration"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" 
role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Accelerate Configuration</span></h3> <p data-svelte-h="svelte-tyb6fz">Create an <code>accelerate</code> config file (e.g., <code>multi_node.yaml</code>) for multi-node training. Key fields:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> 
Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-attr">compute_environment:</span> <span class="hljs-string">LOCAL_MACHINE</span>
<span class="hljs-attr">distributed_type:</span> <span class="hljs-string">MULTI_GPU</span>
<span class="hljs-attr">num_machines:</span> <span class="hljs-number">2</span>
<span class="hljs-attr">machine_rank:</span> <span class="hljs-number">0</span> <span class="hljs-comment"># 0 for main node, 1 for second node</span>
<span class="hljs-attr">main_process_ip:</span> <span class="hljs-number">10.0</span><span class="hljs-number">.0</span><span class="hljs-number">.1</span> <span class="hljs-comment"># IP of rank 0 node</span>
<span class="hljs-attr">main_process_port:</span> <span class="hljs-number">29500</span>
<span class="hljs-attr">num_processes:</span> <span class="hljs-number">16</span> <span class="hljs-comment"># total processes across nodes</span>
<span class="hljs-attr">mixed_precision:</span> <span class="hljs-string">bf16</span>
<span class="hljs-attr">use_cpu:</span> <span class="hljs-literal">false</span>
<span class="hljs-attr">same_network:</span> <span class="hljs-literal">true</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ox6g4h">Adjust <code>num_processes</code> to match the total number of GPUs across all nodes.</p> <blockquote class="note" data-svelte-h="svelte-cqvfv7"><p>Replace <code>10.0.0.1</code> with the actual IP address of the rank 0 (main) node.</p></blockquote> <h3 class="relative group"><a id="launching" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#launching"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Launching</span></h3> <h4 class="relative group"><a id="option-1-manual-launch-non-hpc" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#option-1-manual-launch-non-hpc"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 
1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Option 1: Manual Launch (Non-HPC)</span></h4> <p data-svelte-h="svelte-psn6sg">Run the following on each node manually:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Node 0 (main node)</span>
accelerate launch --config_file multi_node.yaml --machine_rank 0 train.py
<span class="hljs-comment"># Node 1</span>
accelerate launch --config_file multi_node.yaml --machine_rank 1 train.py<!-- HTML_TAG_END --></pre></div> <h4 class="relative group"><a id="option-2-slurm-launch-hpc-clusters" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#option-2-slurm-launch-hpc-clusters"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Option 2: SLURM Launch (HPC Clusters)</span></h4> <p data-svelte-h="svelte-1mulz1q">For clusters using SLURM job scheduler, create a job script (e.g., <code>slurm_job.sh</code>):</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path 
d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">#!/bin/bash</span>
<span class="hljs-comment">#SBATCH --nodes=2</span>
<span class="hljs-comment">#SBATCH --gpus-per-node=8</span>
<span class="hljs-comment">#SBATCH --job-name=trl_multi</span>
srun accelerate launch --config_file multi_node.yaml train.py<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1g7vbkf">Then submit the job:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->sbatch slurm_job.sh<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1japm">SLURM automatically distributes the training across all requested nodes and GPUs, and <code>srun</code> configures the necessary environment variables for multi-node communication.</p> <p data-svelte-h="svelte-6awds9"><strong>Key SLURM directives:</strong></p> <ul data-svelte-h="svelte-qk7gry"><li><code>--nodes=2</code>: Request 2 compute nodes</li> <li><code>--gpus-per-node=8</code>: Allocate 8 GPUs per node (16 total)</li> <li><code>--job-name</code>: Label for tracking in the job queue</li></ul> <p data-svelte-h="svelte-1usji2a">You can combine 
multi-node with DeepSpeed by setting <code>distributed_type: DEEPSPEED</code> and adding a <code>deepspeed_config</code> block. See the <a href="https://huggingface.co/docs/trl/en/deepspeed_integration" rel="nofollow">DeepSpeed integration guide</a>.</p> <h3 class="relative group"><a id="further-reading-1" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#further-reading-1"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Further Reading</span></h3> <ul data-svelte-h="svelte-f7ho9s"><li><a href="https://huggingface.co/docs/accelerate/basic_tutorials/launch" rel="nofollow">Accelerate: Launching Scripts</a></li> <li><a href="https://huggingface.co/docs/accelerate/usage_guides/training_zoo" rel="nofollow">Accelerate: Example Zoo</a></li> <li><a href="https://slurm.schedmd.com/" rel="nofollow">SLURM Workload Manager Documentation</a> - For cluster job scheduling</li></ul> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/trl/blob/main/docs/source/distributing_training.md" target="_blank" rel="noopener noreferrer"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true"
fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>
<script>
{
// SvelteKit client bootstrap emitted by the doc build — not hand-written.
// Global configuration object read by the hashed entry modules imported
// below (asset/base URL paths for this docs deployment).
__sveltekit_1hqaf25 = {
assets: "/docs/trl/pr_5607/en",
base: "/docs/trl/pr_5607/en",
env: {}
};
// Mount target for hydration: the parent element of this script tag.
// Read synchronously at top level — document.currentScript is no longer
// available inside the async .then() callback below.
const element = document.currentScript.parentElement;
// Serialized load data for the hydrated route nodes; all null here —
// presumably no server-loaded data for this static page (NOTE(review): verify against build).
const data = [null,null];
// Fetch the SvelteKit runtime ("kit") and the application bundle in
// parallel, then hydrate the page in place.
Promise.all([
import("/docs/trl/pr_5607/en/_app/immutable/entry/start.151d81bd.js"),
import("/docs/trl/pr_5607/en/_app/immutable/entry/app.3d9a91c0.js")
]).then(([kit, app]) => {
kit.start(app, element, {
// Indices into the app's route/layout node manifest — TODO confirm
node_ids: [0, 15],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
96.5 kB
·
Xet hash:
04968721fba63e4bcc9ca14641db24b4ac56d4a3234abe045203c6d3355e039a

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.