Buckets:

HuggingFaceDocBuilder's picture
download
raw
62.1 kB
import{s as sa,n as na,o as aa}from"../chunks/scheduler.7b731bd4.js";import{S as ia,i as pa,e as i,s as n,c as o,q as ra,H as oa,h as ca,a as p,d as l,b as a,f as zs,g as c,j as r,r as ma,u as ua,k as M,l as ta,m as s,n as m,t as u,o as d,p as g}from"../chunks/index.cc268345.js";import{C as da,H as h,E as ga}from"../chunks/MermaidChart.svelte_svelte_type_style_lang.f0d99f98.js";import{C as f}from"../chunks/CodeBlock.169a125f.js";function ha(Rs){let T,Ct,jt,xt,C,vt,x,Zt,w,Ws="<p>Section under construction. Feel free to contribute!</p>",It,v,Lt,Z,ks='The trainers in TRL use <a href="https://github.com/huggingface/accelerate" rel="nofollow">🤗 Accelerate</a> to enable distributed training across multiple GPUs or nodes. To do so, first create an <a href="https://github.com/huggingface/accelerate" rel="nofollow">🤗 Accelerate</a> config file by running',St,I,Bt,L,Fs="and answering the questions according to your multi-GPU / multi-node setup. You can then launch distributed training by running:",Ht,S,Gt,B,Ns='We also provide config files in the <a href="https://github.com/huggingface/trl/tree/main/examples/accelerate_configs" rel="nofollow">examples folder</a> that can be used as templates. To use these templates, simply pass the path to the config file when launching a job, e.g.:',Pt,H,At,G,Ys="This automatically distributes the workload across all available GPUs.",zt,P,Xs='Under the hood, <a href="https://github.com/huggingface/accelerate" rel="nofollow">🤗 Accelerate</a> creates one model per GPU. Each process:',Rt,A,qs="<li>Processes its own batch of data</li> <li>Computes the loss and gradients for that batch</li> <li>Shares gradient updates across all GPUs</li>",Wt,z,Vs='<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/multi_gpu.png" alt="multi gpu"/>',kt,R,Ps,Ft,la=`<span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><semantics><mrow><mtext>Batch Size</mtext><mo>=</mo><mtext>per_device_train_batch_size</mtext><mo>×</mo><mtext>num_devices</mtext><mo>×</mo><mtext>gradient_accumulation_steps</mtext></mrow><annotation encoding="application/x-tex">
\\text{Batch Size} = \\text{per\\_device\\_train\\_batch\\_size} \\times \\text{num\\_devices} \\times \\text{gradient\\_accumulation\\_steps}
</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.6944em;"></span><span class="mord text"><span class="mord">Batch Size</span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:1.0044em;vertical-align:-0.31em;"></span><span class="mord text"><span class="mord">per_device_train_batch_size</span></span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">×</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" style="height:1.0044em;vertical-align:-0.31em;"></span><span class="mord text"><span class="mord">num_devices</span></span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">×</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" style="height:1.0044em;vertical-align:-0.31em;"></span><span class="mord text"><span class="mord">gradient_accumulation_steps</span></span></span></span></span></span>`,Nt,W,Es="To maintain a consistent batch size when scaling to multiple GPUs, make sure to update <code>per_device_train_batch_size</code> and <code>gradient_accumulation_steps</code> accordingly.",Yt,k,Qs="Example, these configurations are equivalent, and should yield the same results:",Xt,F,Ds="<thead><tr><th>Number of GPUs</th> <th>Per device batch size</th> <th>Gradient accumulation steps</th> <th>Comments</th></tr></thead> <tbody><tr><td>1</td> <td>32</td> <td>1</td> <td>Possibly high memory usage, but faster training</td></tr> <tr><td>1</td> <td>4</td> <td>8</td> <td>Lower memory usage, slower training</td></tr> <tr><td>8</td> <td>4</td> <td>1</td> <td>Multi-GPU to get the best of both worlds</td></tr></tbody>",qt,y,Os='<p>Having one model per GPU can lead to high memory usage, which may not be feasible for large models or low-memory GPUs. In such cases, you can leverage <a href="https://github.com/deepspeedai/DeepSpeed" rel="nofollow">DeepSpeed</a>, which provides optimizations like model sharding, Zero Redundancy Optimizer, mixed precision training, and offloading to CPU or NVMe. Check out our <a href="deepspeed_integration">DeepSpeed Integration</a> guide for more details.</p>',Vt,N,Et,Y,Ks="Sequence Parallelism (also called Context Parallelism) is a parallelization technique that enables training with longer sequences by splitting the sequence dimension across multiple GPUs. Each GPU processes a portion of the sequence, allowing you to train with sequences longer than what would fit on a single GPU’s memory.",Qt,b,en="<p><strong>Terminology clarification:</strong> This section describes parallelism techniques for splitting sequences to enable longer context training:</p> <ul><li><strong>Context Parallelism (CP)</strong>: Splits sequences across GPUs (implemented as Ring Attention with FSDP2)</li> <li><strong>Sequence Parallelism (SP)</strong>: Another form of sequence splitting (implemented as ALST/Ulysses with DeepSpeed)</li></ul> <p>Both CP and SP are different from traditional Sequence Parallelism used with Tensor Parallelism (TP+SP) to reduce activation memory. With the techniques here, parallelism dimensions multiply: <code>TP=2</code> and <code>CP=2</code> would require 4 GPUs (2×2), whereas traditional <code>TP+SP=2</code> only needs 2 GPUs as they share the same ranks.</p> <p>In Accelerate’s <code>ParallelismConfig</code>:</p> <ul><li>Use <code>cp_size</code> with <code>cp_backend=&quot;torch&quot;</code> for Ring Attention (FSDP2)</li> <li>Use <code>sp_size</code> with <code>sp_backend=&quot;deepspeed&quot;</code> for ALST/Ulysses (DeepSpeed)</li></ul>",Dt,X,tn="Sequence parallelism is particularly useful when:",Ot,q,ln="<li>You want to train with very long sequences (&gt;32k tokens)</li> <li>Single GPU memory is insufficient for your desired sequence length</li> <li>You need to maintain sequence coherence across the full context</li>",Kt,V,el,E,sn="TRL supports two sequence parallelism implementations, each with different characteristics:",tl,Q,nn="<li><strong>Ring Attention (FSDP2)</strong> - Uses ring-based communication for memory-efficient processing of extremely long sequences</li> <li><strong>ALST/Ulysses (DeepSpeed)</strong> - Uses attention head parallelism for faster training with high-bandwidth interconnects</li>",ll,_,an="<p><strong>Sequence Length Terminology:</strong> When using Context Parallelism, the sequence is split across GPUs, introducing two concepts:</p> <ul><li><strong>Global sequence length</strong>: The full sequence length before splitting across GPUs</li> <li><strong>Micro sequence length</strong>: The sequence length per GPU after splitting</li></ul> <p>In TRL, <code>max_seq_length</code> (or <code>max_length</code>) refers to the <strong>global sequence length</strong>. The framework automatically handles splitting into micro sequences:</p> <ul><li><strong>Ring Attention (FSDP2)</strong>: Uses <code>cp_size</code> to split sequences. With <code>max_seq_length=8192</code> and <code>cp_size=4</code>, each GPU processes 2048 tokens.</li> <li><strong>ALST/Ulysses (DeepSpeed)</strong>: Uses <code>sp_size</code> (with <code>sp_backend=&quot;deepspeed&quot;</code>) to split sequences. With <code>max_seq_length=8192</code> and <code>sp_size=2</code>, each GPU processes 4096 tokens.</li></ul> <p>The Trainer automatically accounts for context parallelism when calculating batch sizes and training metrics.</p>",sl,D,nl,O,pn="The comparison table below highlights the key differences between the two approaches:",al,K,rn="<thead><tr><th>Feature</th> <th>Ring Attention (FSDP2)</th> <th>ALST/Ulysses (DeepSpeed)</th></tr></thead> <tbody><tr><td><strong>Method</strong></td> <td>Ring Self-Attention</td> <td>Attention Head Parallelism</td></tr> <tr><td><strong>Backend</strong></td> <td>PyTorch FSDP2</td> <td>DeepSpeed ZeRO</td></tr> <tr><td><strong>Attention</strong></td> <td>SDPA only</td> <td>Flash Attention 2 or SDPA</td></tr> <tr><td><strong>Minimum Accelerate</strong></td> <td>1.11.0+</td> <td>1.12.0+</td></tr> <tr><td><strong>Minimum DeepSpeed</strong></td> <td>N/A</td> <td>0.18.1+</td></tr> <tr><td><strong>Sequence Divisibility</strong></td> <td><code>cp_size * 2</code></td> <td><code>sp_size</code></td></tr> <tr><td><strong>Zero Stage</strong></td> <td>N/A</td> <td>ZeRO Stage 1/2/3</td></tr></tbody>",il,ee,on="<strong>Ring Attention is better when:</strong>",pl,te,cn="<li>You need to handle extremely long sequences (1M+ tokens)</li> <li>The model has limited attention heads (Ring Attention is not constrained by head count)</li> <li>You want flexibility in scaling to any sequence length</li> <li>Network topology is limited (Ring Attention works with simple P2P ring communication)</li>",rl,le,mn="<strong>Ulysses is better when:</strong>",ol,se,un="<li>You have high-bandwidth, low-latency interconnects (NVLink, InfiniBand)</li> <li>The model has many attention heads that can be split across GPUs</li> <li>You want lower communication volume</li> <li>You want faster training speed for moderate sequence lengths (up to ~500k tokens)</li>",cl,ne,dn="<strong>Key Trade-offs:</strong>",ml,ae,gn="<li><strong>Communication Volume:</strong> Ulysses has lower communication volume, making it more efficient with good interconnects. Ring Attention has higher communication volume but is more flexible with different network topologies.</li> <li><strong>Attention Head Constraints:</strong> Ulysses is limited by the number of attention heads (requires <code>num_heads &gt;= sp_size</code>). Ring Attention scales with sequence length regardless of model architecture.</li> <li><strong>Network Sensitivity:</strong> Ulysses all-to-all communication is sensitive to network latency. Ring Attention uses P2P ring communication which is more tolerant of varying network conditions.</li>",ul,ie,hn='For a detailed comparison, see the <a href="https://huggingface.co/blog/exploding-gradients/ulysses-ring-attention" rel="nofollow">Ulysses and Ring Attention blog post</a>.',dl,pe,gl,re,fn="Ring Attention uses a ring-like communication pattern where each GPU processes a portion of the sequence and passes information to the next GPU in the ring.",hl,oe,fl,ce,Mn="<li><strong>Accelerate 1.11.0 or higher</strong> is required for Ring Attention / Context Parallelism support</li> <li><strong>FSDP2 (PyTorch FSDP v2)</strong> is required as the distributed training backend</li> <li><strong>SDPA attention</strong> - Flash Attention is currently not supported</li> <li><strong>Sequence length divisibility</strong> - sequences must be divisible by <code>cp_size * 2</code>. This is automatically handled using the <code>pad_to_multiple_of</code> parameter in the data collator.</li>",Ml,me,Tl,ue,wl,de,Tn='Use one of the provided accelerate config files (e.g. <a href="https://github.com/huggingface/trl/blob/main/examples/accelerate_configs/context_parallel_2gpu.yaml" rel="nofollow"><code>context_parallel_2gpu.yaml</code></a> for 2 GPUs):',yl,ge,bl,he,_l,fe,Ul,Me,wn="Then, launch your training script with the appropriate accelerate config file:",$l,Te,jl,we,Jl,ye,yn="<li><p><strong>Use the <code>pad_to_multiple_of</code> parameter</strong> - This is now the recommended way to ensure sequence length divisibility:</p> <ul><li>For <code>cp_size=2</code>: use <code>pad_to_multiple_of=4</code> (since <code>cp_size * 2 = 4</code>)</li> <li>For <code>cp_size=4</code>: use <code>pad_to_multiple_of=8</code> (since <code>cp_size * 2 = 8</code>)</li> <li>The data collator automatically pads sequences to the required multiple, ensuring compatibility with CP</li></ul></li> <li><p><strong>Use packing with padding</strong> - The default BFD (Best Fit Decreasing) strategy works perfectly:</p> <ul><li>Preserves sequence boundaries and maintains training quality</li> <li>Works seamlessly with both <code>padding_free=True</code> and standard padding modes</li></ul></li> <li><p><strong>Combine with other memory optimizations</strong> like Liger kernels, bfloat16, and gradient checkpointing</p></li> <li><p><strong>Start with smaller context parallel sizes</strong> (2-4 GPUs) before scaling up</p></li> <li><p><strong>Monitor memory usage</strong> across all GPUs to ensure balanced workload</p></li>",Cl,be,xl,_e,bn=`We benchmarked Ring Attention to highlight its potential improvements in training efficiency.<br/>
Our experiments were conducted using <strong>1, 2, 4, and 8 H100 GPUs</strong>, though the results can be extended to larger clusters with more nodes and GPUs.`,vl,Ue,_n=`For the setup, we fine-tuned an <strong>8B model</strong> (<a href="https://huggingface.co/Qwen/Qwen3-8B" rel="nofollow">Qwen/Qwen3-8B</a>) using the provided accelerate configuration<br/>
(<a href="https://github.com/huggingface/trl/blob/main/examples/accelerate_configs/context_parallel_2gpu.yaml" rel="nofollow"><code>context_parallel_2gpu.yaml</code></a>).<br/>
We adjusted <code>num_processes</code> and <code>parallelism_config_cp_size</code> based on the number of GPUs for each run.<br/>
Training was performed with the <a href="https://github.com/huggingface/trl/blob/main/trl/scripts/sft.py" rel="nofollow">sft.py</a> example script, combined with the parameters described above.`,Zl,$e,Un="The results below summarize the <strong>maximum trainable sequence length</strong> and <strong>iterations per second</strong> for different numbers of GPUs. A value marked as <code>OOM</code> indicates that the configuration ran out of memory and could not be trained.",Il,je,$n="These results show that <strong>Context Parallelism (CP) scales effectively with more GPUs</strong>, enabling training on much longer sequences. With <strong>8 GPUs</strong>, context lengths of over <strong>300k tokens</strong> become feasible, unlocking training with extremely long contexts while maintaining reasonable throughput.",Ll,U,jn='<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/context_parallelism_max_length_plot.png" alt="CP Max content length" width="45%"/> <img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/context_parallelism_s_it_plot.png" alt="CP seconds/iteration" width="45%"/>',Sl,$,Jn='<p>Accelerate also supports <strong>N-Dimensional Parallelism (ND-parallelism)</strong>, which enables you to combine different parallelization strategies to efficiently distribute model training across multiple GPUs.</p> <p>You can learn more and explore configuration examples in the <a href="https://github.com/huggingface/accelerate/blob/main/examples/torch_native_parallelism/README.md#nd-parallelism" rel="nofollow">Accelerate ND-parallelism guide</a>.</p>',Bl,Je,Hl,Ce,Cn="ALST (Arctic Long Sequence Training) / Ulysses uses attention head parallelism to split long sequences across GPUs, working with DeepSpeed’s ZeRO optimizer.",Gl,j,xn="<p><strong>Technical Note on Parallelism Configuration:</strong></p> <ul><li><strong>DeepSpeed ALST/Ulysses</strong> uses <code>sp_size</code> with <code>sp_backend=&quot;deepspeed&quot;</code> in both YAML and Python API</li> <li><strong>Ring Attention (FSDP2)</strong> uses <code>cp_size</code> with <code>cp_backend=&quot;torch&quot;</code></li></ul> <p>The Trainer automatically accounts for both CP and SP when calculating effective batch sizes and training metrics.</p>",Pl,xe,Al,ve,vn="<li><strong>DeepSpeed 0.18.1 or higher</strong> is required</li> <li><strong>Accelerate 1.12.0 or higher</strong> is required for ALST/Ulysses sequence parallelism support</li> <li><strong>Attention implementation</strong> - Flash Attention 2 recommended (clean output), SDPA works as fallback</li> <li><strong>Sequence length divisibility</strong> - sequences must be divisible by <code>sp_size</code>. Use <code>pad_to_multiple_of</code> in your training config.</li> <li><strong>Parallelism configuration</strong> - You must ensure <code>dp_replicate_size × dp_shard_size × sp_size = num_processes</code></li>",zl,Ze,Rl,Ie,Wl,Le,Zn='Use the provided accelerate config file (<a href="https://github.com/huggingface/trl/blob/main/examples/accelerate_configs/alst_ulysses_4gpu.yaml" rel="nofollow"><code>alst_ulysses_4gpu.yaml</code></a>):',kl,Se,Fl,Be,Nl,He,Yl,Ge,In="Then, launch your training script with the appropriate accelerate config file:",Xl,Pe,ql,Ae,Vl,ze,Ln="The 4 GPU configuration above automatically enables 2D parallelism by combining Data Parallelism (DP) with Sequence Parallelism (SP). With <code>sp_size=2</code> and <code>dp_shard_size=2</code>, the 4 GPUs are organized as:",El,Re,Sn="<li>2 sequence parallel groups (processing the same data split across sequences)</li> <li>2 data parallel groups (processing different data)</li>",Ql,We,Bn="To adjust the parallelism for different GPU counts, modify the YAML config:",Dl,ke,Hn="<thead><tr><th>GPUs</th> <th>sp_size</th> <th>dp_shard_size</th> <th>Use Case</th> <th>YAML Changes</th></tr></thead> <tbody><tr><td>4</td> <td>2</td> <td>2</td> <td>Balanced - longer sequences + more data</td> <td><code>num_processes: 4</code>, <code>sp_size: 2</code>, <code>dp_shard_size: 2</code></td></tr> <tr><td>4</td> <td>4</td> <td>1</td> <td>Pure SP for maximum sequence length</td> <td><code>num_processes: 4</code>, <code>sp_size: 4</code>, <code>dp_shard_size: 1</code></td></tr> <tr><td>8</td> <td>2</td> <td>4</td> <td>Large-scale training</td> <td><code>num_processes: 8</code>, <code>sp_size: 2</code>, <code>dp_shard_size: 4</code></td></tr></tbody>",Ol,Fe,Kl,Ne,Gn="<li><strong>Use <code>pad_to_multiple_of</code></strong> to ensure sequences are divisible by <code>sp_size</code></li> <li><strong>Use Flash Attention 2</strong> for clean output (SDPA works but shows packing warnings)</li> <li><strong>Start with <code>sp_size=2</code></strong> before scaling to larger values</li> <li><strong>Use DeepSpeed ZeRO Stage 3</strong> for large models</li> <li><strong>Combine with memory optimizations</strong> like Liger kernels and gradient checkpointing</li> <li><strong>Validate parallelism config</strong>: Ensure <code>dp_replicate_size × dp_shard_size × sp_size = num_processes</code></li>",es,Ye,ts,Xe,Pn='Here’s how to run ALST/Ulysses training using the built-in <a href="https://github.com/huggingface/trl/blob/main/trl/scripts/sft.py" rel="nofollow"><code>sft.py</code></a> script with 4 GPUs:',ls,qe,ss,Ve,An="This command automatically:",ns,Ee,zn="<li>Configures 2D parallelism (SP=2, DP=2) across 4 GPUs</li> <li>Uses Flash Attention 2 for clean training</li> <li>Enables packing with automatic padding to ensure sequence divisibility</li> <li>Leverages DeepSpeed ZeRO Stage 3 for memory efficiency</li>",as,Qe,is,De,ps,Oe,Rn='<li><a href="https://huggingface.co/blog/exploding-gradients/ulysses-ring-attention" rel="nofollow">Hugging Face Blog: Understanding Ulysses and Ring Attention</a> - Detailed comparison of Ring Attention vs Ulysses approaches</li> <li><a href="https://huggingface.co/docs/accelerate/concept_guides/context_parallelism" rel="nofollow">Accelerate: Context Parallelism Guide</a></li> <li><a href="https://huggingface.co/blog/axolotl-ai-co/long-context-with-sequence-parallelism-in-axolotl" rel="nofollow">Hugging Face Blog: Enabling Long-Context Training with Sequence Parallelism in Axolotl</a></li>',rs,Ke,os,et,Wn='<li><a href="https://huggingface.co/spaces/nanotron/ultrascale-playbook?section=context_parallelism" rel="nofollow">Ultrascale Playbook - Context Parallelism</a></li> <li><a href="https://github.com/huggingface/accelerate/blob/main/examples/torch_native_parallelism/README.md#context-parallelism-128k-sequence-length" rel="nofollow">Accelerate Example: 128k Sequence Length</a></li> <li><a href="https://github.com/huggingface/accelerate/blob/main/examples/torch_native_parallelism/README.md#nd-parallelism" rel="nofollow">Accelerate ND-parallelism Guide</a></li>',cs,tt,ms,lt,kn='<li><a href="https://www.deepspeed.ai/tutorials/ds-sequence/" rel="nofollow">DeepSpeed Sequence Parallelism Documentation</a></li> <li><a href="https://www.snowflake.com/en/engineering-blog/arctic-long-sequence-training-multi-million-token-ai/" rel="nofollow">Snowflake Engineering Blog: Arctic Long Sequence Training (ALST)</a></li>',us,st,ds,nt,Fn='When a single machine doesn’t have enough GPUs, TRL can scale training across multiple machines (nodes) using <a href="https://huggingface.co/docs/accelerate/basic_tutorials/launch#multi-node-training" rel="nofollow">🤗 Accelerate</a>.',gs,at,hs,it,Nn="Create an <code>accelerate</code> config file (e.g., <code>multi_node.yaml</code>) for multi-node training. Key fields:",fs,pt,Ms,rt,Yn="Adjust <code>num_processes</code> to match the total number of GPUs across all nodes.",Ts,J,Xn="<p>Replace <code>10.0.0.1</code> with the actual IP address of the rank 0 (main) node.</p>",ws,ot,ys,ct,bs,mt,qn="Run the following on each node manually:",_s,ut,Us,dt,$s,gt,Vn="For clusters using SLURM job scheduler, create a job script (e.g., <code>slurm_job.sh</code>):",js,ht,Js,ft,En="Then submit the job:",Cs,Mt,xs,Tt,Qn="SLURM automatically distributes the training across all requested nodes and GPUs, and <code>srun</code> configures the necessary environment variables for multi-node communication.",vs,wt,Dn="<strong>Key SLURM directives:</strong>",Zs,yt,On="<li><code>--nodes=2</code>: Request 2 compute nodes</li> <li><code>--gpus-per-node=8</code>: Allocate 8 GPUs per node (16 total)</li> <li><code>--job-name</code>: Label for tracking in the job queue</li>",Is,bt,Kn='You can combine multi-node with DeepSpeed by setting <code>distributed_type: DEEPSPEED</code> and adding a <code>deepspeed_config</code> block. See the <a href="https://huggingface.co/docs/trl/en/deepspeed_integration" rel="nofollow">DeepSpeed integration guide</a>.',Ls,_t,Ss,Ut,ea='<li><a href="https://huggingface.co/docs/accelerate/basic_tutorials/launch" rel="nofollow">Accelerate: Launching Scripts</a></li> <li><a href="https://huggingface.co/docs/accelerate/usage_guides/training_zoo" rel="nofollow">Accelerate: Example Zoo</a></li> <li><a href="https://slurm.schedmd.com/" rel="nofollow">SLURM Workload Manager Documentation</a> - For cluster job scheduling</li>',Bs,$t,Hs,Jt,Gs;return C=new da({props:{containerStyle:"float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"}}),x=new h({props:{title:"Distributing Training",local:"distributing-training",headingTag:"h1"}}),v=new h({props:{title:"Multi-GPU Training with TRL",local:"multi-gpu-training-with-trl",headingTag:"h2"}}),I=new f({props:{code:"YWNjZWxlcmF0ZSUyMGNvbmZpZw==",highlighted:"accelerate config",wrap:!1}}),S=new f({props:{code:"YWNjZWxlcmF0ZSUyMGxhdW5jaCUyMHRyYWluLnB5",highlighted:"accelerate launch train.py",wrap:!1}}),H=new f({props:{code:"YWNjZWxlcmF0ZSUyMGxhdW5jaCUyMC0tY29uZmlnX2ZpbGUlMjBleGFtcGxlcyUyRmFjY2VsZXJhdGVfY29uZmlncyUyRm11bHRpX2dwdS55YW1sJTIwdHJhaW4ucHklMjAlM0NTQ1JJUFRfQVJHUyUzRQ==",highlighted:"accelerate launch --config_file examples/accelerate_configs/multi_gpu.yaml train.py &lt;SCRIPT_ARGS&gt;",wrap:!1}}),N=new h({props:{title:"Sequence Parallelism for Long Context Training",local:"sequence-parallelism-for-long-context-training",headingTag:"h2"}}),V=new h({props:{title:"Available Implementations",local:"available-implementations",headingTag:"h3"}}),D=new h({props:{title:"Choosing Between Ring Attention and Ulysses",local:"choosing-between-ring-attention-and-ulysses",headingTag:"h3"}}),pe=new h({props:{title:"Ring Attention Implementation (FSDP2)",local:"ring-attention-implementation-fsdp2",headingTag:"h3"}}),oe=new h({props:{title:"Requirements and Limitations",local:"requirements-and-limitations",headingTag:"h4"}}),me=new h({props:{title:"Configuration",local:"configuration",headingTag:"h4"}}),ue=new h({props:{title:"Accelerate Configuration",local:"accelerate-configuration",headingTag:"h5"}}),ge=new f({props:{code:"Y29tcHV0ZV9lbnZpcm9ubWVudCUzQSUyMExPQ0FMX01BQ0hJTkUlMEFkZWJ1ZyUzQSUyMGZhbHNlJTBBZGlzdHJpYnV0ZWRfdHlwZSUzQSUyMEZTRFAlMEFkb3duY2FzdF9iZjE2JTNBJTIwJ25vJyUwQWVuYWJsZV9jcHVfYWZmaW5pdHklM0ElMjBmYWxzZSUwQWZzZHBfY29uZmlnJTNBJTBBJTIwJTIwZnNkcF9hY3RpdmF0aW9uX2NoZWNrcG9pbnRpbmclM0ElMjB0cnVlJTIwJTIwJTIzJTIwRW5hYmxlJTIwYWN0aXZhdGlvbiUyMGNoZWNrcG9pbnRpbmclMjBmb3IlMjBtZW1vcnklMjBlZmZpY2llbmN5JTBBJTIwJTIwZnNkcF9hdXRvX3dyYXBfcG9saWN5JTNBJTIwVFJBTlNGT1JNRVJfQkFTRURfV1JBUCUwQSUyMCUyMGZzZHBfY3B1X3JhbV9lZmZpY2llbnRfbG9hZGluZyUzQSUyMHRydWUlMEElMjAlMjBmc2RwX29mZmxvYWRfcGFyYW1zJTNBJTIwZmFsc2UlMEElMjAlMjBmc2RwX3Jlc2hhcmRfYWZ0ZXJfZm9yd2FyZCUzQSUyMHRydWUlMEElMjAlMjBmc2RwX3N0YXRlX2RpY3RfdHlwZSUzQSUyMEZVTExfU1RBVEVfRElDVCUwQSUyMCUyMGZzZHBfdmVyc2lvbiUzQSUyMDIlMEFtYWNoaW5lX3JhbmslM0ElMjAwJTBBbWFpbl90cmFpbmluZ19mdW5jdGlvbiUzQSUyMG1haW4lMEFtaXhlZF9wcmVjaXNpb24lM0ElMjBiZjE2JTBBbnVtX21hY2hpbmVzJTNBJTIwMSUwQW51bV9wcm9jZXNzZXMlM0ElMjAyJTIwJTIwJTIzJTIwTnVtYmVyJTIwb2YlMjBHUFVzJTBBcmR6dl9iYWNrZW5kJTNBJTIwc3RhdGljJTBBc2FtZV9uZXR3b3JrJTNBJTIwdHJ1ZSUwQXRwdV9lbnYlM0ElMjAlNUIlNUQlMEF0cHVfdXNlX2NsdXN0ZXIlM0ElMjBmYWxzZSUwQXRwdV91c2Vfc3VkbyUzQSUyMGZhbHNlJTBBdXNlX2NwdSUzQSUyMGZhbHNlJTBBcGFyYWxsZWxpc21fY29uZmlnJTNBJTBBJTIwJTIwcGFyYWxsZWxpc21fY29uZmlnX2RwX3JlcGxpY2F0ZV9zaXplJTNBJTIwMSUwQSUyMCUyMHBhcmFsbGVsaXNtX2NvbmZpZ19kcF9zaGFyZF9zaXplJTNBJTIwMSUwQSUyMCUyMHBhcmFsbGVsaXNtX2NvbmZpZ190cF9zaXplJTNBJTIwMSUwQSUyMCUyMHBhcmFsbGVsaXNtX2NvbmZpZ19jcF9zaXplJTNBJTIwMiUyMCUyMCUyMyUyMENvbnRleHQlMjBwYXJhbGxlbCUyMHNpemU=",highlighted:`<span class="hljs-attr">compute_environment:</span> <span class="hljs-string">LOCAL_MACHINE</span>
<span class="hljs-attr">debug:</span> <span class="hljs-literal">false</span>
<span class="hljs-attr">distributed_type:</span> <span class="hljs-string">FSDP</span>
<span class="hljs-attr">downcast_bf16:</span> <span class="hljs-string">&#x27;no&#x27;</span>
<span class="hljs-attr">enable_cpu_affinity:</span> <span class="hljs-literal">false</span>
<span class="hljs-attr">fsdp_config:</span>
<span class="hljs-attr">fsdp_activation_checkpointing:</span> <span class="hljs-literal">true</span> <span class="hljs-comment"># Enable activation checkpointing for memory efficiency</span>
<span class="hljs-attr">fsdp_auto_wrap_policy:</span> <span class="hljs-string">TRANSFORMER_BASED_WRAP</span>
<span class="hljs-attr">fsdp_cpu_ram_efficient_loading:</span> <span class="hljs-literal">true</span>
<span class="hljs-attr">fsdp_offload_params:</span> <span class="hljs-literal">false</span>
<span class="hljs-attr">fsdp_reshard_after_forward:</span> <span class="hljs-literal">true</span>
<span class="hljs-attr">fsdp_state_dict_type:</span> <span class="hljs-string">FULL_STATE_DICT</span>
<span class="hljs-attr">fsdp_version:</span> <span class="hljs-number">2</span>
<span class="hljs-attr">machine_rank:</span> <span class="hljs-number">0</span>
<span class="hljs-attr">main_training_function:</span> <span class="hljs-string">main</span>
<span class="hljs-attr">mixed_precision:</span> <span class="hljs-string">bf16</span>
<span class="hljs-attr">num_machines:</span> <span class="hljs-number">1</span>
<span class="hljs-attr">num_processes:</span> <span class="hljs-number">2</span> <span class="hljs-comment"># Number of GPUs</span>
<span class="hljs-attr">rdzv_backend:</span> <span class="hljs-string">static</span>
<span class="hljs-attr">same_network:</span> <span class="hljs-literal">true</span>
<span class="hljs-attr">tpu_env:</span> []
<span class="hljs-attr">tpu_use_cluster:</span> <span class="hljs-literal">false</span>
<span class="hljs-attr">tpu_use_sudo:</span> <span class="hljs-literal">false</span>
<span class="hljs-attr">use_cpu:</span> <span class="hljs-literal">false</span>
<span class="hljs-attr">parallelism_config:</span>
<span class="hljs-attr">parallelism_config_dp_replicate_size:</span> <span class="hljs-number">1</span>
<span class="hljs-attr">parallelism_config_dp_shard_size:</span> <span class="hljs-number">1</span>
<span class="hljs-attr">parallelism_config_tp_size:</span> <span class="hljs-number">1</span>
<span class="hljs-attr">parallelism_config_cp_size:</span> <span class="hljs-number">2</span> <span class="hljs-comment"># Context parallel size</span>`,wrap:!1}}),he=new h({props:{title:"Training Configuration",local:"training-configuration",headingTag:"h5"}}),fe=new f({props:{code:"ZnJvbSUyMHRybCUyMGltcG9ydCUyMFNGVENvbmZpZyUwQSUwQXRyYWluaW5nX2FyZ3MlMjAlM0QlMjBTRlRDb25maWcoJTBBJTIwJTIwJTIwJTIwJTIzJTIwcmVxdWlyZWQlMEElMjAlMjAlMjAlMjBwYWRfdG9fbXVsdGlwbGVfb2YlM0Q0JTJDJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIzJTIwZW5zdXJlcyUyMGRpdmlzaWJpbGl0eSUyMGJ5JTIwY3Bfc2l6ZSUyMColMjAyJTBBJTIwJTIwJTIwJTIwJTIzJTIwdG8lMjBnZXQlMjB0aGUlMjBtb3N0JTIwb3V0JTIwb2YlMjBDUCUwQSUyMCUyMCUyMCUyMG1heF9sZW5ndGglM0QxNjM4NCUyQyUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMyUyMGxvbmclMjBzZXF1ZW5jZSUyMGxlbmd0aCUwQSUyMCUyMCUyMCUyMHBhY2tpbmclM0RUcnVlJTJDJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIzJTIwdXNlJTIwcGFja2luZyUyMHRvJTIwcmVkdWNlJTIwcGFkZGluZyUwQSUyMCUyMCUyMCUyMHVzZV9saWdlcl9rZXJuZWwlM0RUcnVlJTJDJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIzJTIwY29tcGF0aWJsZSUyMHdpdGglMjBDUCUwQSUyMCUyMCUyMCUyMGdyYWRpZW50X2NoZWNrcG9pbnRpbmclM0RGYWxzZSUyQyUyMCUyMCUyMCUyMyUyMFRoZSUyMGFjdGl2YXRpb25fY2hlY2twb2ludGluZyUyMGluJTIwRlNEUCUyMGNvbmZpZyUyMGFuZCUyMHRoZSUyMGdyYWRpZW50X2NoZWNrcG9pbnRpbmclMjBpbiUyMHRyYWluaW5nJTIwYXJnJTIwY2FuJ3QlMjBiZSUyMHNldCUyMHRvJTIwVHJ1ZSUyMHNpbXVsdGFuZW91c2x5JTBBJTIwJTIwJTIwJTIwcGVyX2RldmljZV90cmFpbl9iYXRjaF9zaXplJTNEMSUyQyUwQSUyMCUyMCUyMCUyMC4uLiUwQSk=",highlighted:`<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> SFTConfig
training_args = SFTConfig(
<span class="hljs-comment"># required</span>
pad_to_multiple_of=<span class="hljs-number">4</span>, <span class="hljs-comment"># ensures divisibility by cp_size * 2</span>
<span class="hljs-comment"># to get the most out of CP</span>
max_length=<span class="hljs-number">16384</span>, <span class="hljs-comment"># long sequence length</span>
packing=<span class="hljs-literal">True</span>, <span class="hljs-comment"># use packing to reduce padding</span>
use_liger_kernel=<span class="hljs-literal">True</span>, <span class="hljs-comment"># compatible with CP</span>
gradient_checkpointing=<span class="hljs-literal">False</span>, <span class="hljs-comment"># The activation_checkpointing in FSDP config and the gradient_checkpointing in training arg can&#x27;t be set to True simultaneously</span>
per_device_train_batch_size=<span class="hljs-number">1</span>,
...
)`,wrap:!1}}),Te=new f({props:{code:"YWNjZWxlcmF0ZSUyMGxhdW5jaCUyMC0tY29uZmlnX2ZpbGUlMjBjb250ZXh0X3BhcmFsbGVsXzJncHUueWFtbCUyMHRyYWluLnB5",highlighted:"accelerate launch --config_file context_parallel_2gpu.yaml train.py",wrap:!1}}),we=new h({props:{title:"Best Practices",local:"best-practices",headingTag:"h4"}}),be=new h({props:{title:"Benchmarking Ring Attention",local:"benchmarking-ring-attention",headingTag:"h4"}}),Je=new h({props:{title:"ALST/Ulysses Implementation (DeepSpeed)",local:"alstulysses-implementation-deepspeed",headingTag:"h3"}}),xe=new h({props:{title:"Requirements and Limitations",local:"requirements-and-limitations",headingTag:"h4"}}),Ze=new h({props:{title:"Configuration",local:"configuration",headingTag:"h4"}}),Ie=new h({props:{title:"Accelerate Configuration",local:"accelerate-configuration",headingTag:"h5"}}),Se=new f({props:{code:"Y29tcHV0ZV9lbnZpcm9ubWVudCUzQSUyMExPQ0FMX01BQ0hJTkUlMEFkZWJ1ZyUzQSUyMGZhbHNlJTBBZGVlcHNwZWVkX2NvbmZpZyUzQSUwQSUyMCUyMHplcm9fc3RhZ2UlM0ElMjAzJTBBJTIwJTIwc2VxX3BhcmFsbGVsX2NvbW11bmljYXRpb25fZGF0YV90eXBlJTNBJTIwYmYxNiUwQWRpc3RyaWJ1dGVkX3R5cGUlM0ElMjBERUVQU1BFRUQlMEFtaXhlZF9wcmVjaXNpb24lM0ElMjBiZjE2JTBBbnVtX21hY2hpbmVzJTNBJTIwMSUwQW51bV9wcm9jZXNzZXMlM0ElMjA0JTIwJTIwJTIzJTIwTnVtYmVyJTIwb2YlMjBHUFVzJTBBcGFyYWxsZWxpc21fY29uZmlnJTNBJTBBJTIwJTIwcGFyYWxsZWxpc21fY29uZmlnX2RwX3JlcGxpY2F0ZV9zaXplJTNBJTIwMSUwQSUyMCUyMHBhcmFsbGVsaXNtX2NvbmZpZ19kcF9zaGFyZF9zaXplJTNBJTIwMiUyMCUyMCUyMyUyMEVuYWJsZXMlMjAyRCUyMHBhcmFsbGVsaXNtJTIwd2l0aCUyMFNQJTBBJTIwJTIwcGFyYWxsZWxpc21fY29uZmlnX3RwX3NpemUlM0ElMjAxJTBBJTIwJTIwcGFyYWxsZWxpc21fY29uZmlnX3NwX3NpemUlM0ElMjAyJTIwJTIwJTIzJTIwU2VxdWVuY2UlMjBwYXJhbGxlbCUyMHNpemUlMEElMjAlMjBwYXJhbGxlbGlzbV9jb25maWdfc3BfYmFja2VuZCUzQSUyMGRlZXBzcGVlZCUwQSUyMCUyMHBhcmFsbGVsaXNtX2NvbmZpZ19zcF9zZXFfbGVuZ3RoX2lzX3ZhcmlhYmxlJTNBJTIwdHJ1ZSUwQSUyMCUyMHBhcmFsbGVsaXNtX2NvbmZpZ19zcF9hdHRuX2ltcGxlbWVudGF0aW9uJTNBJTIwZmxhc2hfYXR0ZW50aW9uXzI=",highlighted:`<span class="hljs-attr">compute_environment:</span> <span class="hljs-string">LOCAL_MACHINE</span>
<span class="hljs-attr">debug:</span> <span class="hljs-literal">false</span>
<span class="hljs-attr">deepspeed_config:</span>
<span class="hljs-attr">zero_stage:</span> <span class="hljs-number">3</span>
<span class="hljs-attr">seq_parallel_communication_data_type:</span> <span class="hljs-string">bf16</span>
<span class="hljs-attr">distributed_type:</span> <span class="hljs-string">DEEPSPEED</span>
<span class="hljs-attr">mixed_precision:</span> <span class="hljs-string">bf16</span>
<span class="hljs-attr">num_machines:</span> <span class="hljs-number">1</span>
<span class="hljs-attr">num_processes:</span> <span class="hljs-number">4</span> <span class="hljs-comment"># Number of GPUs</span>
<span class="hljs-attr">parallelism_config:</span>
<span class="hljs-attr">parallelism_config_dp_replicate_size:</span> <span class="hljs-number">1</span>
<span class="hljs-attr">parallelism_config_dp_shard_size:</span> <span class="hljs-number">2</span> <span class="hljs-comment"># Enables 2D parallelism with SP</span>
<span class="hljs-attr">parallelism_config_tp_size:</span> <span class="hljs-number">1</span>
<span class="hljs-attr">parallelism_config_sp_size:</span> <span class="hljs-number">2</span> <span class="hljs-comment"># Sequence parallel size</span>
<span class="hljs-attr">parallelism_config_sp_backend:</span> <span class="hljs-string">deepspeed</span>
<span class="hljs-attr">parallelism_config_sp_seq_length_is_variable:</span> <span class="hljs-literal">true</span>
<span class="hljs-attr">parallelism_config_sp_attn_implementation:</span> <span class="hljs-string">flash_attention_2</span>`,wrap:!1}}),Be=new h({props:{title:"Training Configuration",local:"training-configuration",headingTag:"h5"}}),He=new f({props:{code:"ZnJvbSUyMHRybCUyMGltcG9ydCUyMFNGVENvbmZpZyUwQSUwQXRyYWluaW5nX2FyZ3MlMjAlM0QlMjBTRlRDb25maWcoJTBBJTIwJTIwJTIwJTIwJTIzJTIwcmVxdWlyZWQlMEElMjAlMjAlMjAlMjBwYWRfdG9fbXVsdGlwbGVfb2YlM0QyJTJDJTIwJTIwJTIwJTIwJTIzJTIwTXVzdCUyMGVxdWFsJTIwc3Bfc2l6ZSUwQSUyMCUyMCUyMCUyMCUyMyUyMHRvJTIwZ2V0JTIwdGhlJTIwbW9zdCUyMG91dCUyMG9mJTIwU1AlMEElMjAlMjAlMjAlMjBtYXhfc2VxX2xlbmd0aCUzRDQwOTYlMkMlMEElMjAlMjAlMjAlMjBwYWNraW5nJTNEVHJ1ZSUyQyUwQSUyMCUyMCUyMCUyMGF0dG5faW1wbGVtZW50YXRpb24lM0QlMjJmbGFzaF9hdHRlbnRpb25fMiUyMiUyQyUwQSUyMCUyMCUyMCUyMHBlcl9kZXZpY2VfdHJhaW5fYmF0Y2hfc2l6ZSUzRDElMkMlMEElMjAlMjAlMjAlMjAuLi4lMEEp",highlighted:`<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> SFTConfig
training_args = SFTConfig(
<span class="hljs-comment"># required</span>
pad_to_multiple_of=<span class="hljs-number">2</span>, <span class="hljs-comment"># Must equal sp_size</span>
<span class="hljs-comment"># to get the most out of SP</span>
max_seq_length=<span class="hljs-number">4096</span>,
packing=<span class="hljs-literal">True</span>,
attn_implementation=<span class="hljs-string">&quot;flash_attention_2&quot;</span>,
per_device_train_batch_size=<span class="hljs-number">1</span>,
...
)`,wrap:!1}}),Pe=new f({props:{code:"YWNjZWxlcmF0ZSUyMGxhdW5jaCUyMC0tY29uZmlnX2ZpbGUlMjBleGFtcGxlcyUyRmFjY2VsZXJhdGVfY29uZmlncyUyRmFsc3RfdWx5c3Nlc180Z3B1LnlhbWwlMjB0cmFpbi5weQ==",highlighted:"accelerate launch --config_file examples/accelerate_configs/alst_ulysses_4gpu.yaml train.py",wrap:!1}}),Ae=new h({props:{title:"2D Parallelism",local:"2d-parallelism",headingTag:"h4"}}),Fe=new h({props:{title:"Best Practices",local:"best-practices",headingTag:"h4"}}),Ye=new h({props:{title:"Complete Example",local:"complete-example",headingTag:"h4"}}),qe=new f({props:{code:"YWNjZWxlcmF0ZSUyMGxhdW5jaCUyMC0tY29uZmlnX2ZpbGUlMjBleGFtcGxlcyUyRmFjY2VsZXJhdGVfY29uZmlncyUyRmFsc3RfdWx5c3Nlc180Z3B1LnlhbWwlMjAlNUMlMEElMjAlMjAlMjAlMjB0cmwlMkZzY3JpcHRzJTJGc2Z0LnB5JTIwJTVDJTBBJTIwJTIwJTIwJTIwLS1tb2RlbF9uYW1lX29yX3BhdGglMjBRd2VuJTJGUXdlbjItMC41QiUyMCU1QyUwQSUyMCUyMCUyMCUyMC0tZGF0YXNldF9uYW1lJTIwdHJsLWxpYiUyRkNhcHliYXJhJTIwJTVDJTBBJTIwJTIwJTIwJTIwLS1sZWFybmluZ19yYXRlJTIwMmUtNCUyMCU1QyUwQSUyMCUyMCUyMCUyMC0tbWF4X3N0ZXBzJTIwMTAwJTIwJTVDJTBBJTIwJTIwJTIwJTIwLS1tYXhfc2VxX2xlbmd0aCUyMDQwOTYlMjAlNUMlMEElMjAlMjAlMjAlMjAtLXBhY2tpbmclMjAlNUMlMEElMjAlMjAlMjAlMjAtLXBhY2tpbmdfc3RyYXRlZ3klMjB3cmFwcGVkJTIwJTVDJTBBJTIwJTIwJTIwJTIwLS10b3JjaF9kdHlwZSUyMGJmbG9hdDE2JTIwJTVDJTBBJTIwJTIwJTIwJTIwLS1hdHRuX2ltcGxlbWVudGF0aW9uJTIwZmxhc2hfYXR0ZW50aW9uXzIlMjAlNUMlMEElMjAlMjAlMjAlMjAtLW91dHB1dF9kaXIlMjBvdXRwdXQtYWxzdC00Z3B1JTIwJTVDJTBBJTIwJTIwJTIwJTIwLS1sb2dnaW5nX3N0ZXBzJTIwMTAlMjAlNUMlMEElMjAlMjAlMjAlMjAtLXJlcG9ydF90byUyMHRyYWNraW8=",highlighted:`accelerate launch --config_file examples/accelerate_configs/alst_ulysses_4gpu.yaml \\
trl/scripts/sft.py \\
--model_name_or_path Qwen/Qwen2-0.5B \\
--dataset_name trl-lib/Capybara \\
--learning_rate 2e-4 \\
--max_steps 100 \\
--max_seq_length 4096 \\
--packing \\
--packing_strategy wrapped \\
--torch_dtype bfloat16 \\
--attn_implementation flash_attention_2 \\
--output_dir output-alst-4gpu \\
--logging_steps 10 \\
--report_to trackio`,wrap:!1}}),Qe=new h({props:{title:"Further Reading",local:"further-reading",headingTag:"h3"}}),De=new h({props:{title:"General Resources",local:"general-resources",headingTag:"h4"}}),Ke=new h({props:{title:"Ring Attention (FSDP2)",local:"ring-attention-fsdp2",headingTag:"h4"}}),tt=new h({props:{title:"ALST/Ulysses (DeepSpeed)",local:"alstulysses-deepspeed",headingTag:"h4"}}),st=new h({props:{title:"Multi-Node Training",local:"multi-node-training",headingTag:"h2"}}),at=new h({props:{title:"Accelerate Configuration",local:"accelerate-configuration",headingTag:"h3"}}),pt=new f({props:{code:"Y29tcHV0ZV9lbnZpcm9ubWVudCUzQSUyMExPQ0FMX01BQ0hJTkUlMEFkaXN0cmlidXRlZF90eXBlJTNBJTIwTVVMVElfR1BVJTBBbnVtX21hY2hpbmVzJTNBJTIwMiUwQW1hY2hpbmVfcmFuayUzQSUyMDAlMjAlMjAlMjMlMjAwJTIwZm9yJTIwbWFpbiUyMG5vZGUlMkMlMjAxJTIwZm9yJTIwc2Vjb25kJTIwbm9kZSUwQW1haW5fcHJvY2Vzc19pcCUzQSUyMDEwLjAuMC4xJTIwJTIwJTIzJTIwSVAlMjBvZiUyMHJhbmslMjAwJTIwbm9kZSUwQW1haW5fcHJvY2Vzc19wb3J0JTNBJTIwMjk1MDAlMEFudW1fcHJvY2Vzc2VzJTNBJTIwMTYlMjAlMjAlMjMlMjB0b3RhbCUyMHByb2Nlc3NlcyUyMGFjcm9zcyUyMG5vZGVzJTBBbWl4ZWRfcHJlY2lzaW9uJTNBJTIwYmYxNiUwQXVzZV9jcHUlM0ElMjBmYWxzZSUwQXNhbWVfbmV0d29yayUzQSUyMHRydWU=",highlighted:`<span class="hljs-attr">compute_environment:</span> <span class="hljs-string">LOCAL_MACHINE</span>
<span class="hljs-attr">distributed_type:</span> <span class="hljs-string">MULTI_GPU</span>
<span class="hljs-attr">num_machines:</span> <span class="hljs-number">2</span>
<span class="hljs-attr">machine_rank:</span> <span class="hljs-number">0</span> <span class="hljs-comment"># 0 for main node, 1 for second node</span>
<span class="hljs-attr">main_process_ip:</span> <span class="hljs-number">10.0</span><span class="hljs-number">.0</span><span class="hljs-number">.1</span> <span class="hljs-comment"># IP of rank 0 node</span>
<span class="hljs-attr">main_process_port:</span> <span class="hljs-number">29500</span>
<span class="hljs-attr">num_processes:</span> <span class="hljs-number">16</span> <span class="hljs-comment"># total processes across nodes</span>
<span class="hljs-attr">mixed_precision:</span> <span class="hljs-string">bf16</span>
<span class="hljs-attr">use_cpu:</span> <span class="hljs-literal">false</span>
<span class="hljs-attr">same_network:</span> <span class="hljs-literal">true</span>`,wrap:!1}}),ot=new h({props:{title:"Launching",local:"launching",headingTag:"h3"}}),ct=new h({props:{title:"Option 1: Manual Launch (Non-HPC)",local:"option-1-manual-launch-non-hpc",headingTag:"h4"}}),ut=new f({props:{code:"JTIzJTIwTm9kZSUyMDAlMjAobWFpbiUyMG5vZGUpJTBBYWNjZWxlcmF0ZSUyMGxhdW5jaCUyMC0tY29uZmlnX2ZpbGUlMjBtdWx0aV9ub2RlLnlhbWwlMjAtLW1hY2hpbmVfcmFuayUyMDAlMjB0cmFpbi5weSUwQSUwQSUyMyUyME5vZGUlMjAxJTBBYWNjZWxlcmF0ZSUyMGxhdW5jaCUyMC0tY29uZmlnX2ZpbGUlMjBtdWx0aV9ub2RlLnlhbWwlMjAtLW1hY2hpbmVfcmFuayUyMDElMjB0cmFpbi5weQ==",highlighted:`<span class="hljs-comment"># Node 0 (main node)</span>
accelerate launch --config_file multi_node.yaml --machine_rank 0 train.py
<span class="hljs-comment"># Node 1</span>
accelerate launch --config_file multi_node.yaml --machine_rank 1 train.py`,wrap:!1}}),dt=new h({props:{title:"Option 2: SLURM Launch (HPC Clusters)",local:"option-2-slurm-launch-hpc-clusters",headingTag:"h4"}}),ht=new f({props:{code:"JTIzISUyRmJpbiUyRmJhc2glMEElMjNTQkFUQ0glMjAtLW5vZGVzJTNEMiUwQSUyM1NCQVRDSCUyMC0tZ3B1cy1wZXItbm9kZSUzRDglMEElMjNTQkFUQ0glMjAtLWpvYi1uYW1lJTNEdHJsX211bHRpJTBBJTBBc3J1biUyMGFjY2VsZXJhdGUlMjBsYXVuY2glMjAtLWNvbmZpZ19maWxlJTIwbXVsdGlfbm9kZS55YW1sJTIwdHJhaW4ucHk=",highlighted:`<span class="hljs-meta">#!/bin/bash</span>
<span class="hljs-comment">#SBATCH --nodes=2</span>
<span class="hljs-comment">#SBATCH --gpus-per-node=8</span>
<span class="hljs-comment">#SBATCH --job-name=trl_multi</span>
srun accelerate launch --config_file multi_node.yaml train.py`,wrap:!1}}),Mt=new f({props:{code:"c2JhdGNoJTIwc2x1cm1fam9iLnNo",highlighted:"sbatch slurm_job.sh",wrap:!1}}),_t=new h({props:{title:"Further Reading",local:"further-reading",headingTag:"h3"}}),$t=new ga({props:{source:"https://github.com/huggingface/trl/blob/main/docs/source/distributing_training.md"}}),{c(){T=i("meta"),Ct=n(),jt=i("p"),xt=n(),o(C.$$.fragment),vt=n(),o(x.$$.fragment),Zt=n(),w=i("blockquote"),w.innerHTML=Ws,It=n(),o(v.$$.fragment),Lt=n(),Z=i("p"),Z.innerHTML=ks,St=n(),o(I.$$.fragment),Bt=n(),L=i("p"),L.textContent=Fs,Ht=n(),o(S.$$.fragment),Gt=n(),B=i("p"),B.innerHTML=Ns,Pt=n(),o(H.$$.fragment),At=n(),G=i("p"),G.textContent=Ys,zt=n(),P=i("p"),P.innerHTML=Xs,Rt=n(),A=i("ul"),A.innerHTML=qs,Wt=n(),z=i("p"),z.innerHTML=Vs,kt=n(),R=i("p"),Ps=ra(`The effective batch size is calculated as:
`),Ft=new oa(!1),Nt=n(),W=i("p"),W.innerHTML=Es,Yt=n(),k=i("p"),k.textContent=Qs,Xt=n(),F=i("table"),F.innerHTML=Ds,qt=n(),y=i("blockquote"),y.innerHTML=Os,Vt=n(),o(N.$$.fragment),Et=n(),Y=i("p"),Y.textContent=Ks,Qt=n(),b=i("blockquote"),b.innerHTML=en,Dt=n(),X=i("p"),X.textContent=tn,Ot=n(),q=i("ul"),q.innerHTML=ln,Kt=n(),o(V.$$.fragment),el=n(),E=i("p"),E.textContent=sn,tl=n(),Q=i("ol"),Q.innerHTML=nn,ll=n(),_=i("blockquote"),_.innerHTML=an,sl=n(),o(D.$$.fragment),nl=n(),O=i("p"),O.textContent=pn,al=n(),K=i("table"),K.innerHTML=rn,il=n(),ee=i("p"),ee.innerHTML=on,pl=n(),te=i("ul"),te.innerHTML=cn,rl=n(),le=i("p"),le.innerHTML=mn,ol=n(),se=i("ul"),se.innerHTML=un,cl=n(),ne=i("p"),ne.innerHTML=dn,ml=n(),ae=i("ul"),ae.innerHTML=gn,ul=n(),ie=i("p"),ie.innerHTML=hn,dl=n(),o(pe.$$.fragment),gl=n(),re=i("p"),re.textContent=fn,hl=n(),o(oe.$$.fragment),fl=n(),ce=i("ol"),ce.innerHTML=Mn,Ml=n(),o(me.$$.fragment),Tl=n(),o(ue.$$.fragment),wl=n(),de=i("p"),de.innerHTML=Tn,yl=n(),o(ge.$$.fragment),bl=n(),o(he.$$.fragment),_l=n(),o(fe.$$.fragment),Ul=n(),Me=i("p"),Me.textContent=wn,$l=n(),o(Te.$$.fragment),jl=n(),o(we.$$.fragment),Jl=n(),ye=i("ol"),ye.innerHTML=yn,Cl=n(),o(be.$$.fragment),xl=n(),_e=i("p"),_e.innerHTML=bn,vl=n(),Ue=i("p"),Ue.innerHTML=_n,Zl=n(),$e=i("p"),$e.innerHTML=Un,Il=n(),je=i("p"),je.innerHTML=$n,Ll=n(),U=i("div"),U.innerHTML=jn,Sl=n(),$=i("blockquote"),$.innerHTML=Jn,Bl=n(),o(Je.$$.fragment),Hl=n(),Ce=i("p"),Ce.textContent=Cn,Gl=n(),j=i("blockquote"),j.innerHTML=xn,Pl=n(),o(xe.$$.fragment),Al=n(),ve=i("ol"),ve.innerHTML=vn,zl=n(),o(Ze.$$.fragment),Rl=n(),o(Ie.$$.fragment),Wl=n(),Le=i("p"),Le.innerHTML=Zn,kl=n(),o(Se.$$.fragment),Fl=n(),o(Be.$$.fragment),Nl=n(),o(He.$$.fragment),Yl=n(),Ge=i("p"),Ge.textContent=In,Xl=n(),o(Pe.$$.fragment),ql=n(),o(Ae.$$.fragment),Vl=n(),ze=i("p"),ze.innerHTML=Ln,El=n(),Re=i("ul"),Re.innerHTML=Sn,Ql=n(),We=i("p"),We.textContent=Bn,Dl=n(),ke=i("table"),ke.innerHTML=Hn,Ol=n(),o(Fe.$$.fragment),Kl=n(),Ne=i("ol"),Ne.innerHTML=Gn,es=n(),o(Ye.$$.fragment),ts=n(),Xe=i("p"),Xe.innerHTML=Pn,ls=n(),o(qe.$$.fragment),ss=n(),Ve=i("p"),Ve.textContent=An,ns=n(),Ee=i("ul"),Ee.innerHTML=zn,as=n(),o(Qe.$$.fragment),is=n(),o(De.$$.fragment),ps=n(),Oe=i("ul"),Oe.innerHTML=Rn,rs=n(),o(Ke.$$.fragment),os=n(),et=i("ul"),et.innerHTML=Wn,cs=n(),o(tt.$$.fragment),ms=n(),lt=i("ul"),lt.innerHTML=kn,us=n(),o(st.$$.fragment),ds=n(),nt=i("p"),nt.innerHTML=Fn,gs=n(),o(at.$$.fragment),hs=n(),it=i("p"),it.innerHTML=Nn,fs=n(),o(pt.$$.fragment),Ms=n(),rt=i("p"),rt.innerHTML=Yn,Ts=n(),J=i("blockquote"),J.innerHTML=Xn,ws=n(),o(ot.$$.fragment),ys=n(),o(ct.$$.fragment),bs=n(),mt=i("p"),mt.textContent=qn,_s=n(),o(ut.$$.fragment),Us=n(),o(dt.$$.fragment),$s=n(),gt=i("p"),gt.innerHTML=Vn,js=n(),o(ht.$$.fragment),Js=n(),ft=i("p"),ft.textContent=En,Cs=n(),o(Mt.$$.fragment),xs=n(),Tt=i("p"),Tt.innerHTML=Qn,vs=n(),wt=i("p"),wt.innerHTML=Dn,Zs=n(),yt=i("ul"),yt.innerHTML=On,Is=n(),bt=i("p"),bt.innerHTML=Kn,Ls=n(),o(_t.$$.fragment),Ss=n(),Ut=i("ul"),Ut.innerHTML=ea,Bs=n(),o($t.$$.fragment),Hs=n(),Jt=i("p"),this.h()},l(e){const t=ca("svelte-u9bgzb",document.head);T=p(t,"META",{name:!0,content:!0}),t.forEach(l),Ct=a(e),jt=p(e,"P",{}),zs(jt).forEach(l),xt=a(e),c(C.$$.fragment,e),vt=a(e),c(x.$$.fragment,e),Zt=a(e),w=p(e,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),r(w)!=="svelte-1gc28wp"&&(w.innerHTML=Ws),It=a(e),c(v.$$.fragment,e),Lt=a(e),Z=p(e,"P",{"data-svelte-h":!0}),r(Z)!=="svelte-1awbvnz"&&(Z.innerHTML=ks),St=a(e),c(I.$$.fragment,e),Bt=a(e),L=p(e,"P",{"data-svelte-h":!0}),r(L)!=="svelte-n2kdl3"&&(L.textContent=Fs),Ht=a(e),c(S.$$.fragment,e),Gt=a(e),B=p(e,"P",{"data-svelte-h":!0}),r(B)!=="svelte-15pg0q3"&&(B.innerHTML=Ns),Pt=a(e),c(H.$$.fragment,e),At=a(e),G=p(e,"P",{"data-svelte-h":!0}),r(G)!=="svelte-1n2xjt"&&(G.textContent=Ys),zt=a(e),P=p(e,"P",{"data-svelte-h":!0}),r(P)!=="svelte-zfygwf"&&(P.innerHTML=Xs),Rt=a(e),A=p(e,"UL",{"data-svelte-h":!0}),r(A)!=="svelte-3s441m"&&(A.innerHTML=qs),Wt=a(e),z=p(e,"P",{"data-svelte-h":!0}),r(z)!=="svelte-jw4lic"&&(z.innerHTML=Vs),kt=a(e),R=p(e,"P",{});var As=zs(R);Ps=ma(As,`The effective batch size is calculated as:
`),Ft=ua(As,!1),As.forEach(l),Nt=a(e),W=p(e,"P",{"data-svelte-h":!0}),r(W)!=="svelte-19d9mn0"&&(W.innerHTML=Es),Yt=a(e),k=p(e,"P",{"data-svelte-h":!0}),r(k)!=="svelte-1npf4a2"&&(k.textContent=Qs),Xt=a(e),F=p(e,"TABLE",{"data-svelte-h":!0}),r(F)!=="svelte-nztwgr"&&(F.innerHTML=Ds),qt=a(e),y=p(e,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),r(y)!=="svelte-yb03hq"&&(y.innerHTML=Os),Vt=a(e),c(N.$$.fragment,e),Et=a(e),Y=p(e,"P",{"data-svelte-h":!0}),r(Y)!=="svelte-1brkfhl"&&(Y.textContent=Ks),Qt=a(e),b=p(e,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),r(b)!=="svelte-1hiylk8"&&(b.innerHTML=en),Dt=a(e),X=p(e,"P",{"data-svelte-h":!0}),r(X)!=="svelte-zxojyl"&&(X.textContent=tn),Ot=a(e),q=p(e,"UL",{"data-svelte-h":!0}),r(q)!=="svelte-1los5ox"&&(q.innerHTML=ln),Kt=a(e),c(V.$$.fragment,e),el=a(e),E=p(e,"P",{"data-svelte-h":!0}),r(E)!=="svelte-1rfz09e"&&(E.textContent=sn),tl=a(e),Q=p(e,"OL",{"data-svelte-h":!0}),r(Q)!=="svelte-hf6l75"&&(Q.innerHTML=nn),ll=a(e),_=p(e,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),r(_)!=="svelte-134wih1"&&(_.innerHTML=an),sl=a(e),c(D.$$.fragment,e),nl=a(e),O=p(e,"P",{"data-svelte-h":!0}),r(O)!=="svelte-1wzlppl"&&(O.textContent=pn),al=a(e),K=p(e,"TABLE",{"data-svelte-h":!0}),r(K)!=="svelte-1260gia"&&(K.innerHTML=rn),il=a(e),ee=p(e,"P",{"data-svelte-h":!0}),r(ee)!=="svelte-1d2ikj1"&&(ee.innerHTML=on),pl=a(e),te=p(e,"UL",{"data-svelte-h":!0}),r(te)!=="svelte-qvcdaq"&&(te.innerHTML=cn),rl=a(e),le=p(e,"P",{"data-svelte-h":!0}),r(le)!=="svelte-139nysj"&&(le.innerHTML=mn),ol=a(e),se=p(e,"UL",{"data-svelte-h":!0}),r(se)!=="svelte-nj5q4e"&&(se.innerHTML=un),cl=a(e),ne=p(e,"P",{"data-svelte-h":!0}),r(ne)!=="svelte-y9bny5"&&(ne.innerHTML=dn),ml=a(e),ae=p(e,"UL",{"data-svelte-h":!0}),r(ae)!=="svelte-1fqxzgi"&&(ae.innerHTML=gn),ul=a(e),ie=p(e,"P",{"data-svelte-h":!0}),r(ie)!=="svelte-cv9p0n"&&(ie.innerHTML=hn),dl=a(e),c(pe.$$.fragment,e),gl=a(e),re=p(e,"P",{"data-svelte-h":!0}),r(re)!=="svelte-1f7hwjq"&&(re.textContent=fn),hl=a(e),c(oe.$$.fragment,e),fl=a(e),ce=p(e,"OL",{"data-svelte-h":!0}),r(ce)!=="svelte-1d99bf5"&&(ce.innerHTML=Mn),Ml=a(e),c(me.$$.fragment,e),Tl=a(e),c(ue.$$.fragment,e),wl=a(e),de=p(e,"P",{"data-svelte-h":!0}),r(de)!=="svelte-14prv5"&&(de.innerHTML=Tn),yl=a(e),c(ge.$$.fragment,e),bl=a(e),c(he.$$.fragment,e),_l=a(e),c(fe.$$.fragment,e),Ul=a(e),Me=p(e,"P",{"data-svelte-h":!0}),r(Me)!=="svelte-1opb19"&&(Me.textContent=wn),$l=a(e),c(Te.$$.fragment,e),jl=a(e),c(we.$$.fragment,e),Jl=a(e),ye=p(e,"OL",{"data-svelte-h":!0}),r(ye)!=="svelte-mfle9"&&(ye.innerHTML=yn),Cl=a(e),c(be.$$.fragment,e),xl=a(e),_e=p(e,"P",{"data-svelte-h":!0}),r(_e)!=="svelte-17o749c"&&(_e.innerHTML=bn),vl=a(e),Ue=p(e,"P",{"data-svelte-h":!0}),r(Ue)!=="svelte-1ar3p4q"&&(Ue.innerHTML=_n),Zl=a(e),$e=p(e,"P",{"data-svelte-h":!0}),r($e)!=="svelte-iruthr"&&($e.innerHTML=Un),Il=a(e),je=p(e,"P",{"data-svelte-h":!0}),r(je)!=="svelte-1yjupp2"&&(je.innerHTML=$n),Ll=a(e),U=p(e,"DIV",{class:!0,"data-svelte-h":!0}),r(U)!=="svelte-66t12q"&&(U.innerHTML=jn),Sl=a(e),$=p(e,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),r($)!=="svelte-x1ndil"&&($.innerHTML=Jn),Bl=a(e),c(Je.$$.fragment,e),Hl=a(e),Ce=p(e,"P",{"data-svelte-h":!0}),r(Ce)!=="svelte-atj2q9"&&(Ce.textContent=Cn),Gl=a(e),j=p(e,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),r(j)!=="svelte-2of9o2"&&(j.innerHTML=xn),Pl=a(e),c(xe.$$.fragment,e),Al=a(e),ve=p(e,"OL",{"data-svelte-h":!0}),r(ve)!=="svelte-1v9mepx"&&(ve.innerHTML=vn),zl=a(e),c(Ze.$$.fragment,e),Rl=a(e),c(Ie.$$.fragment,e),Wl=a(e),Le=p(e,"P",{"data-svelte-h":!0}),r(Le)!=="svelte-gkmp17"&&(Le.innerHTML=Zn),kl=a(e),c(Se.$$.fragment,e),Fl=a(e),c(Be.$$.fragment,e),Nl=a(e),c(He.$$.fragment,e),Yl=a(e),Ge=p(e,"P",{"data-svelte-h":!0}),r(Ge)!=="svelte-1opb19"&&(Ge.textContent=In),Xl=a(e),c(Pe.$$.fragment,e),ql=a(e),c(Ae.$$.fragment,e),Vl=a(e),ze=p(e,"P",{"data-svelte-h":!0}),r(ze)!=="svelte-1bz4r8q"&&(ze.innerHTML=Ln),El=a(e),Re=p(e,"UL",{"data-svelte-h":!0}),r(Re)!=="svelte-15qusls"&&(Re.innerHTML=Sn),Ql=a(e),We=p(e,"P",{"data-svelte-h":!0}),r(We)!=="svelte-vbgu25"&&(We.textContent=Bn),Dl=a(e),ke=p(e,"TABLE",{"data-svelte-h":!0}),r(ke)!=="svelte-maroh2"&&(ke.innerHTML=Hn),Ol=a(e),c(Fe.$$.fragment,e),Kl=a(e),Ne=p(e,"OL",{"data-svelte-h":!0}),r(Ne)!=="svelte-1pt6kdg"&&(Ne.innerHTML=Gn),es=a(e),c(Ye.$$.fragment,e),ts=a(e),Xe=p(e,"P",{"data-svelte-h":!0}),r(Xe)!=="svelte-18n46cg"&&(Xe.innerHTML=Pn),ls=a(e),c(qe.$$.fragment,e),ss=a(e),Ve=p(e,"P",{"data-svelte-h":!0}),r(Ve)!=="svelte-mk72jq"&&(Ve.textContent=An),ns=a(e),Ee=p(e,"UL",{"data-svelte-h":!0}),r(Ee)!=="svelte-9wfs52"&&(Ee.innerHTML=zn),as=a(e),c(Qe.$$.fragment,e),is=a(e),c(De.$$.fragment,e),ps=a(e),Oe=p(e,"UL",{"data-svelte-h":!0}),r(Oe)!=="svelte-1nda474"&&(Oe.innerHTML=Rn),rs=a(e),c(Ke.$$.fragment,e),os=a(e),et=p(e,"UL",{"data-svelte-h":!0}),r(et)!=="svelte-snw76h"&&(et.innerHTML=Wn),cs=a(e),c(tt.$$.fragment,e),ms=a(e),lt=p(e,"UL",{"data-svelte-h":!0}),r(lt)!=="svelte-klcbuh"&&(lt.innerHTML=kn),us=a(e),c(st.$$.fragment,e),ds=a(e),nt=p(e,"P",{"data-svelte-h":!0}),r(nt)!=="svelte-owoicr"&&(nt.innerHTML=Fn),gs=a(e),c(at.$$.fragment,e),hs=a(e),it=p(e,"P",{"data-svelte-h":!0}),r(it)!=="svelte-tyb6fz"&&(it.innerHTML=Nn),fs=a(e),c(pt.$$.fragment,e),Ms=a(e),rt=p(e,"P",{"data-svelte-h":!0}),r(rt)!=="svelte-ox6g4h"&&(rt.innerHTML=Yn),Ts=a(e),J=p(e,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),r(J)!=="svelte-cqvfv7"&&(J.innerHTML=Xn),ws=a(e),c(ot.$$.fragment,e),ys=a(e),c(ct.$$.fragment,e),bs=a(e),mt=p(e,"P",{"data-svelte-h":!0}),r(mt)!=="svelte-psn6sg"&&(mt.textContent=qn),_s=a(e),c(ut.$$.fragment,e),Us=a(e),c(dt.$$.fragment,e),$s=a(e),gt=p(e,"P",{"data-svelte-h":!0}),r(gt)!=="svelte-1mulz1q"&&(gt.innerHTML=Vn),js=a(e),c(ht.$$.fragment,e),Js=a(e),ft=p(e,"P",{"data-svelte-h":!0}),r(ft)!=="svelte-1g7vbkf"&&(ft.textContent=En),Cs=a(e),c(Mt.$$.fragment,e),xs=a(e),Tt=p(e,"P",{"data-svelte-h":!0}),r(Tt)!=="svelte-1japm"&&(Tt.innerHTML=Qn),vs=a(e),wt=p(e,"P",{"data-svelte-h":!0}),r(wt)!=="svelte-6awds9"&&(wt.innerHTML=Dn),Zs=a(e),yt=p(e,"UL",{"data-svelte-h":!0}),r(yt)!=="svelte-qk7gry"&&(yt.innerHTML=On),Is=a(e),bt=p(e,"P",{"data-svelte-h":!0}),r(bt)!=="svelte-1usji2a"&&(bt.innerHTML=Kn),Ls=a(e),c(_t.$$.fragment,e),Ss=a(e),Ut=p(e,"UL",{"data-svelte-h":!0}),r(Ut)!=="svelte-f7ho9s"&&(Ut.innerHTML=ea),Bs=a(e),c($t.$$.fragment,e),Hs=a(e),Jt=p(e,"P",{}),zs(Jt).forEach(l),this.h()},h(){M(T,"name","hf:doc:metadata"),M(T,"content",fa),M(w,"class","warning"),Ft.a=null,M(y,"class","tip"),M(b,"class","note"),M(_,"class","important"),M(U,"class","flex justify-center"),M($,"class","tip"),M(j,"class","note"),M(J,"class","note")},m(e,t){ta(document.head,T),s(e,Ct,t),s(e,jt,t),s(e,xt,t),m(C,e,t),s(e,vt,t),m(x,e,t),s(e,Zt,t),s(e,w,t),s(e,It,t),m(v,e,t),s(e,Lt,t),s(e,Z,t),s(e,St,t),m(I,e,t),s(e,Bt,t),s(e,L,t),s(e,Ht,t),m(S,e,t),s(e,Gt,t),s(e,B,t),s(e,Pt,t),m(H,e,t),s(e,At,t),s(e,G,t),s(e,zt,t),s(e,P,t),s(e,Rt,t),s(e,A,t),s(e,Wt,t),s(e,z,t),s(e,kt,t),s(e,R,t),ta(R,Ps),Ft.m(la,R),s(e,Nt,t),s(e,W,t),s(e,Yt,t),s(e,k,t),s(e,Xt,t),s(e,F,t),s(e,qt,t),s(e,y,t),s(e,Vt,t),m(N,e,t),s(e,Et,t),s(e,Y,t),s(e,Qt,t),s(e,b,t),s(e,Dt,t),s(e,X,t),s(e,Ot,t),s(e,q,t),s(e,Kt,t),m(V,e,t),s(e,el,t),s(e,E,t),s(e,tl,t),s(e,Q,t),s(e,ll,t),s(e,_,t),s(e,sl,t),m(D,e,t),s(e,nl,t),s(e,O,t),s(e,al,t),s(e,K,t),s(e,il,t),s(e,ee,t),s(e,pl,t),s(e,te,t),s(e,rl,t),s(e,le,t),s(e,ol,t),s(e,se,t),s(e,cl,t),s(e,ne,t),s(e,ml,t),s(e,ae,t),s(e,ul,t),s(e,ie,t),s(e,dl,t),m(pe,e,t),s(e,gl,t),s(e,re,t),s(e,hl,t),m(oe,e,t),s(e,fl,t),s(e,ce,t),s(e,Ml,t),m(me,e,t),s(e,Tl,t),m(ue,e,t),s(e,wl,t),s(e,de,t),s(e,yl,t),m(ge,e,t),s(e,bl,t),m(he,e,t),s(e,_l,t),m(fe,e,t),s(e,Ul,t),s(e,Me,t),s(e,$l,t),m(Te,e,t),s(e,jl,t),m(we,e,t),s(e,Jl,t),s(e,ye,t),s(e,Cl,t),m(be,e,t),s(e,xl,t),s(e,_e,t),s(e,vl,t),s(e,Ue,t),s(e,Zl,t),s(e,$e,t),s(e,Il,t),s(e,je,t),s(e,Ll,t),s(e,U,t),s(e,Sl,t),s(e,$,t),s(e,Bl,t),m(Je,e,t),s(e,Hl,t),s(e,Ce,t),s(e,Gl,t),s(e,j,t),s(e,Pl,t),m(xe,e,t),s(e,Al,t),s(e,ve,t),s(e,zl,t),m(Ze,e,t),s(e,Rl,t),m(Ie,e,t),s(e,Wl,t),s(e,Le,t),s(e,kl,t),m(Se,e,t),s(e,Fl,t),m(Be,e,t),s(e,Nl,t),m(He,e,t),s(e,Yl,t),s(e,Ge,t),s(e,Xl,t),m(Pe,e,t),s(e,ql,t),m(Ae,e,t),s(e,Vl,t),s(e,ze,t),s(e,El,t),s(e,Re,t),s(e,Ql,t),s(e,We,t),s(e,Dl,t),s(e,ke,t),s(e,Ol,t),m(Fe,e,t),s(e,Kl,t),s(e,Ne,t),s(e,es,t),m(Ye,e,t),s(e,ts,t),s(e,Xe,t),s(e,ls,t),m(qe,e,t),s(e,ss,t),s(e,Ve,t),s(e,ns,t),s(e,Ee,t),s(e,as,t),m(Qe,e,t),s(e,is,t),m(De,e,t),s(e,ps,t),s(e,Oe,t),s(e,rs,t),m(Ke,e,t),s(e,os,t),s(e,et,t),s(e,cs,t),m(tt,e,t),s(e,ms,t),s(e,lt,t),s(e,us,t),m(st,e,t),s(e,ds,t),s(e,nt,t),s(e,gs,t),m(at,e,t),s(e,hs,t),s(e,it,t),s(e,fs,t),m(pt,e,t),s(e,Ms,t),s(e,rt,t),s(e,Ts,t),s(e,J,t),s(e,ws,t),m(ot,e,t),s(e,ys,t),m(ct,e,t),s(e,bs,t),s(e,mt,t),s(e,_s,t),m(ut,e,t),s(e,Us,t),m(dt,e,t),s(e,$s,t),s(e,gt,t),s(e,js,t),m(ht,e,t),s(e,Js,t),s(e,ft,t),s(e,Cs,t),m(Mt,e,t),s(e,xs,t),s(e,Tt,t),s(e,vs,t),s(e,wt,t),s(e,Zs,t),s(e,yt,t),s(e,Is,t),s(e,bt,t),s(e,Ls,t),m(_t,e,t),s(e,Ss,t),s(e,Ut,t),s(e,Bs,t),m($t,e,t),s(e,Hs,t),s(e,Jt,t),Gs=!0},p:na,i(e){Gs||(u(C.$$.fragment,e),u(x.$$.fragment,e),u(v.$$.fragment,e),u(I.$$.fragment,e),u(S.$$.fragment,e),u(H.$$.fragment,e),u(N.$$.fragment,e),u(V.$$.fragment,e),u(D.$$.fragment,e),u(pe.$$.fragment,e),u(oe.$$.fragment,e),u(me.$$.fragment,e),u(ue.$$.fragment,e),u(ge.$$.fragment,e),u(he.$$.fragment,e),u(fe.$$.fragment,e),u(Te.$$.fragment,e),u(we.$$.fragment,e),u(be.$$.fragment,e),u(Je.$$.fragment,e),u(xe.$$.fragment,e),u(Ze.$$.fragment,e),u(Ie.$$.fragment,e),u(Se.$$.fragment,e),u(Be.$$.fragment,e),u(He.$$.fragment,e),u(Pe.$$.fragment,e),u(Ae.$$.fragment,e),u(Fe.$$.fragment,e),u(Ye.$$.fragment,e),u(qe.$$.fragment,e),u(Qe.$$.fragment,e),u(De.$$.fragment,e),u(Ke.$$.fragment,e),u(tt.$$.fragment,e),u(st.$$.fragment,e),u(at.$$.fragment,e),u(pt.$$.fragment,e),u(ot.$$.fragment,e),u(ct.$$.fragment,e),u(ut.$$.fragment,e),u(dt.$$.fragment,e),u(ht.$$.fragment,e),u(Mt.$$.fragment,e),u(_t.$$.fragment,e),u($t.$$.fragment,e),Gs=!0)},o(e){d(C.$$.fragment,e),d(x.$$.fragment,e),d(v.$$.fragment,e),d(I.$$.fragment,e),d(S.$$.fragment,e),d(H.$$.fragment,e),d(N.$$.fragment,e),d(V.$$.fragment,e),d(D.$$.fragment,e),d(pe.$$.fragment,e),d(oe.$$.fragment,e),d(me.$$.fragment,e),d(ue.$$.fragment,e),d(ge.$$.fragment,e),d(he.$$.fragment,e),d(fe.$$.fragment,e),d(Te.$$.fragment,e),d(we.$$.fragment,e),d(be.$$.fragment,e),d(Je.$$.fragment,e),d(xe.$$.fragment,e),d(Ze.$$.fragment,e),d(Ie.$$.fragment,e),d(Se.$$.fragment,e),d(Be.$$.fragment,e),d(He.$$.fragment,e),d(Pe.$$.fragment,e),d(Ae.$$.fragment,e),d(Fe.$$.fragment,e),d(Ye.$$.fragment,e),d(qe.$$.fragment,e),d(Qe.$$.fragment,e),d(De.$$.fragment,e),d(Ke.$$.fragment,e),d(tt.$$.fragment,e),d(st.$$.fragment,e),d(at.$$.fragment,e),d(pt.$$.fragment,e),d(ot.$$.fragment,e),d(ct.$$.fragment,e),d(ut.$$.fragment,e),d(dt.$$.fragment,e),d(ht.$$.fragment,e),d(Mt.$$.fragment,e),d(_t.$$.fragment,e),d($t.$$.fragment,e),Gs=!1},d(e){e&&(l(Ct),l(jt),l(xt),l(vt),l(Zt),l(w),l(It),l(Lt),l(Z),l(St),l(Bt),l(L),l(Ht),l(Gt),l(B),l(Pt),l(At),l(G),l(zt),l(P),l(Rt),l(A),l(Wt),l(z),l(kt),l(R),l(Nt),l(W),l(Yt),l(k),l(Xt),l(F),l(qt),l(y),l(Vt),l(Et),l(Y),l(Qt),l(b),l(Dt),l(X),l(Ot),l(q),l(Kt),l(el),l(E),l(tl),l(Q),l(ll),l(_),l(sl),l(nl),l(O),l(al),l(K),l(il),l(ee),l(pl),l(te),l(rl),l(le),l(ol),l(se),l(cl),l(ne),l(ml),l(ae),l(ul),l(ie),l(dl),l(gl),l(re),l(hl),l(fl),l(ce),l(Ml),l(Tl),l(wl),l(de),l(yl),l(bl),l(_l),l(Ul),l(Me),l($l),l(jl),l(Jl),l(ye),l(Cl),l(xl),l(_e),l(vl),l(Ue),l(Zl),l($e),l(Il),l(je),l(Ll),l(U),l(Sl),l($),l(Bl),l(Hl),l(Ce),l(Gl),l(j),l(Pl),l(Al),l(ve),l(zl),l(Rl),l(Wl),l(Le),l(kl),l(Fl),l(Nl),l(Yl),l(Ge),l(Xl),l(ql),l(Vl),l(ze),l(El),l(Re),l(Ql),l(We),l(Dl),l(ke),l(Ol),l(Kl),l(Ne),l(es),l(ts),l(Xe),l(ls),l(ss),l(Ve),l(ns),l(Ee),l(as),l(is),l(ps),l(Oe),l(rs),l(os),l(et),l(cs),l(ms),l(lt),l(us),l(ds),l(nt),l(gs),l(hs),l(it),l(fs),l(Ms),l(rt),l(Ts),l(J),l(ws),l(ys),l(bs),l(mt),l(_s),l(Us),l($s),l(gt),l(js),l(Js),l(ft),l(Cs),l(xs),l(Tt),l(vs),l(wt),l(Zs),l(yt),l(Is),l(bt),l(Ls),l(Ss),l(Ut),l(Bs),l(Hs),l(Jt)),l(T),g(C,e),g(x,e),g(v,e),g(I,e),g(S,e),g(H,e),g(N,e),g(V,e),g(D,e),g(pe,e),g(oe,e),g(me,e),g(ue,e),g(ge,e),g(he,e),g(fe,e),g(Te,e),g(we,e),g(be,e),g(Je,e),g(xe,e),g(Ze,e),g(Ie,e),g(Se,e),g(Be,e),g(He,e),g(Pe,e),g(Ae,e),g(Fe,e),g(Ye,e),g(qe,e),g(Qe,e),g(De,e),g(Ke,e),g(tt,e),g(st,e),g(at,e),g(pt,e),g(ot,e),g(ct,e),g(ut,e),g(dt,e),g(ht,e),g(Mt,e),g(_t,e),g($t,e)}}}const fa='{"title":"Distributing Training","local":"distributing-training","sections":[{"title":"Multi-GPU Training with TRL","local":"multi-gpu-training-with-trl","sections":[],"depth":2},{"title":"Sequence Parallelism for Long Context Training","local":"sequence-parallelism-for-long-context-training","sections":[{"title":"Available Implementations","local":"available-implementations","sections":[],"depth":3},{"title":"Choosing Between Ring Attention and Ulysses","local":"choosing-between-ring-attention-and-ulysses","sections":[],"depth":3},{"title":"Ring Attention Implementation (FSDP2)","local":"ring-attention-implementation-fsdp2","sections":[{"title":"Requirements and Limitations","local":"requirements-and-limitations","sections":[],"depth":4},{"title":"Configuration","local":"configuration","sections":[{"title":"Accelerate Configuration","local":"accelerate-configuration","sections":[],"depth":5},{"title":"Training Configuration","local":"training-configuration","sections":[],"depth":5}],"depth":4},{"title":"Best Practices","local":"best-practices","sections":[],"depth":4},{"title":"Benchmarking Ring Attention","local":"benchmarking-ring-attention","sections":[],"depth":4}],"depth":3},{"title":"ALST/Ulysses Implementation (DeepSpeed)","local":"alstulysses-implementation-deepspeed","sections":[{"title":"Requirements and Limitations","local":"requirements-and-limitations","sections":[],"depth":4},{"title":"Configuration","local":"configuration","sections":[{"title":"Accelerate Configuration","local":"accelerate-configuration","sections":[],"depth":5},{"title":"Training Configuration","local":"training-configuration","sections":[],"depth":5}],"depth":4},{"title":"2D Parallelism","local":"2d-parallelism","sections":[],"depth":4},{"title":"Best Practices","local":"best-practices","sections":[],"depth":4},{"title":"Complete Example","local":"complete-example","sections":[],"depth":4}],"depth":3},{"title":"Further Reading","local":"further-reading","sections":[{"title":"General Resources","local":"general-resources","sections":[],"depth":4},{"title":"Ring Attention (FSDP2)","local":"ring-attention-fsdp2","sections":[],"depth":4},{"title":"ALST/Ulysses (DeepSpeed)","local":"alstulysses-deepspeed","sections":[],"depth":4}],"depth":3}],"depth":2},{"title":"Multi-Node Training","local":"multi-node-training","sections":[{"title":"Accelerate Configuration","local":"accelerate-configuration","sections":[],"depth":3},{"title":"Launching","local":"launching","sections":[{"title":"Option 1: Manual Launch (Non-HPC)","local":"option-1-manual-launch-non-hpc","sections":[],"depth":4},{"title":"Option 2: SLURM Launch (HPC Clusters)","local":"option-2-slurm-launch-hpc-clusters","sections":[],"depth":4}],"depth":3},{"title":"Further Reading","local":"further-reading","sections":[],"depth":3}],"depth":2}],"depth":1}';function Ma(Rs){return aa(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class _a extends ia{constructor(T){super(),pa(this,T,Ma,ha,sa,{})}}export{_a as component};

Xet Storage Details

Size:
62.1 kB
·
Xet hash:
2fecf93d577393a68f044288e647172a2d4c410177afc25c8f6be51760d3b31c

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.