Buckets:
| import{s as Dn,a as Vn,o as zn,n as L}from"../chunks/scheduler.7b731bd4.js";import{S as Yn,i as An,e as T,s as o,c as $,h as Kn,a as M,d as s,b as p,f as tn,g as d,j as C,k as H,l as el,m as i,n as h,t as y,o as w,p as _}from"../chunks/index.cc268345.js";import{C as tl,H as P,E as nl}from"../chunks/MermaidChart.svelte_svelte_type_style_lang.f0d99f98.js";import{C as Z}from"../chunks/CodeBlock.169a125f.js";import{H as Ie,a as j}from"../chunks/HfOption.9f04abd1.js";function ll(v){let n,a="DPO truncation is controlled via <code>max_length</code>, which truncates the combined prompt+completion sequence.",t,f,r='<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/truncation_prompt_completion.png" alt="DPO truncation"/>',c,b,U="To set the truncation parameter, use the following code snippet:",m,k,g,u,R="<p>The legacy <code>max_prompt_length</code> and <code>max_completion_length</code> parameters are now removed; instead, filter or pre-truncate overlong prompts/completions in your dataset before training.</p>",F;return k=new Z({props:{code:"ZnJvbSUyMHRybCUyMGltcG9ydCUyMERQT0NvbmZpZyUwQSUwQXRyYWluaW5nX2FyZ3MlMjAlM0QlMjBEUE9Db25maWcoLi4uJTJDJTIwbWF4X2xlbmd0aCUzRC4uLik=",highlighted:`<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> DPOConfig | |
| training_args = DPOConfig(..., max_length=...)`,wrap:!1}}),{c(){n=T("p"),n.innerHTML=a,t=o(),f=T("p"),f.innerHTML=r,c=o(),b=T("p"),b.textContent=U,m=o(),$(k.$$.fragment),g=o(),u=T("blockquote"),u.innerHTML=R,this.h()},l(J){n=M(J,"P",{"data-svelte-h":!0}),C(n)!=="svelte-bap72y"&&(n.innerHTML=a),t=p(J),f=M(J,"P",{"data-svelte-h":!0}),C(f)!=="svelte-by7zy6"&&(f.innerHTML=r),c=p(J),b=M(J,"P",{"data-svelte-h":!0}),C(b)!=="svelte-ijv2jp"&&(b.textContent=U),m=p(J),d(k.$$.fragment,J),g=p(J),u=M(J,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),C(u)!=="svelte-1b6z5qd"&&(u.innerHTML=R),this.h()},h(){H(u,"class","warning")},m(J,x){i(J,n,x),i(J,t,x),i(J,f,x),i(J,c,x),i(J,b,x),i(J,m,x),h(k,J,x),i(J,g,x),i(J,u,x),F=!0},p:L,i(J){F||(y(k.$$.fragment,J),F=!0)},o(J){w(k.$$.fragment,J),F=!1},d(J){J&&(s(n),s(t),s(f),s(c),s(b),s(m),s(g),s(u)),_(k,J)}}}function sl(v){let n,a="SFT truncation is applied to the input sequence via the <code>max_length</code> parameter.",t,f,r='<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/truncation_input_ids.png" alt="Truncation input ids"/>',c,b,U="To set the truncation parameter, use the following code snippet:",m,k,g;return k=new Z({props:{code:"ZnJvbSUyMHRybCUyMGltcG9ydCUyMFNGVENvbmZpZyUwQSUwQXRyYWluaW5nX2FyZ3MlMjAlM0QlMjBTRlRDb25maWcoLi4uJTJDJTIwbWF4X2xlbmd0aCUzRC4uLik=",highlighted:`<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> SFTConfig | |
| training_args = SFTConfig(..., max_length=...)`,wrap:!1}}),{c(){n=T("p"),n.innerHTML=a,t=o(),f=T("p"),f.innerHTML=r,c=o(),b=T("p"),b.textContent=U,m=o(),$(k.$$.fragment)},l(u){n=M(u,"P",{"data-svelte-h":!0}),C(n)!=="svelte-1ytup32"&&(n.innerHTML=a),t=p(u),f=M(u,"P",{"data-svelte-h":!0}),C(f)!=="svelte-mk5ay7"&&(f.innerHTML=r),c=p(u),b=M(u,"P",{"data-svelte-h":!0}),C(b)!=="svelte-ijv2jp"&&(b.textContent=U),m=p(u),d(k.$$.fragment,u)},m(u,R){i(u,n,R),i(u,t,R),i(u,f,R),i(u,c,R),i(u,b,R),i(u,m,R),h(k,u,R),g=!0},p:L,i(u){g||(y(k.$$.fragment,u),g=!0)},o(u){w(k.$$.fragment,u),g=!1},d(u){u&&(s(n),s(t),s(f),s(c),s(b),s(m)),_(k,u)}}}function il(v){let n,a,t,f;return n=new j({props:{id:"truncation",option:"DPO",$$slots:{default:[ll]},$$scope:{ctx:v}}}),t=new j({props:{id:"truncation",option:"SFT",$$slots:{default:[sl]},$$scope:{ctx:v}}}),{c(){$(n.$$.fragment),a=o(),$(t.$$.fragment)},l(r){d(n.$$.fragment,r),a=p(r),d(t.$$.fragment,r)},m(r,c){h(n,r,c),i(r,a,c),h(t,r,c),f=!0},p(r,c){const b={};c&2&&(b.$$scope={dirty:c,ctx:r}),n.$set(b);const U={};c&2&&(U.$$scope={dirty:c,ctx:r}),t.$set(U)},i(r){f||(y(n.$$.fragment,r),y(t.$$.fragment,r),f=!0)},o(r){w(n.$$.fragment,r),w(t.$$.fragment,r),f=!1},d(r){r&&s(a),_(n,r),_(t,r)}}}function al(v){let n,a;return n=new Z({props:{code:"ZnJvbSUyMHRybCUyMGltcG9ydCUyMFNGVENvbmZpZyUwQSUwQXRyYWluaW5nX2FyZ3MlMjAlM0QlMjBTRlRDb25maWcoLi4uJTJDJTIwdXNlX2xpZ2VyX2tlcm5lbCUzRFRydWUp",highlighted:`<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> SFTConfig | |
| training_args = SFTConfig(..., use_liger_kernel=<span class="hljs-literal">True</span>)`,wrap:!1}}),{c(){$(n.$$.fragment)},l(t){d(n.$$.fragment,t)},m(t,f){h(n,t,f),a=!0},p:L,i(t){a||(y(n.$$.fragment,t),a=!0)},o(t){w(n.$$.fragment,t),a=!1},d(t){_(n,t)}}}function rl(v){let n,a;return n=new Z({props:{code:"ZnJvbSUyMHRybCUyMGltcG9ydCUyMERQT0NvbmZpZyUwQSUwQXRyYWluaW5nX2FyZ3MlMjAlM0QlMjBEUE9Db25maWcoLi4uJTJDJTIwdXNlX2xpZ2VyX2tlcm5lbCUzRFRydWUp",highlighted:`<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> DPOConfig | |
| training_args = DPOConfig(..., use_liger_kernel=<span class="hljs-literal">True</span>)`,wrap:!1}}),{c(){$(n.$$.fragment)},l(t){d(n.$$.fragment,t)},m(t,f){h(n,t,f),a=!0},p:L,i(t){a||(y(n.$$.fragment,t),a=!0)},o(t){w(n.$$.fragment,t),a=!1},d(t){_(n,t)}}}function ol(v){let n,a;return n=new Z({props:{code:"ZnJvbSUyMHRybCUyMGltcG9ydCUyMEdSUE9Db25maWclMEElMEF0cmFpbmluZ19hcmdzJTIwJTNEJTIwR1JQT0NvbmZpZyguLi4lMkMlMjB1c2VfbGlnZXJfa2VybmVsJTNEVHJ1ZSk=",highlighted:`<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> GRPOConfig | |
| training_args = GRPOConfig(..., use_liger_kernel=<span class="hljs-literal">True</span>)`,wrap:!1}}),{c(){$(n.$$.fragment)},l(t){d(n.$$.fragment,t)},m(t,f){h(n,t,f),a=!0},p:L,i(t){a||(y(n.$$.fragment,t),a=!0)},o(t){w(n.$$.fragment,t),a=!1},d(t){_(n,t)}}}function pl(v){let n,a;return n=new Z({props:{code:"ZnJvbSUyMHRybC5leHBlcmltZW50YWwua3RvJTIwaW1wb3J0JTIwS1RPQ29uZmlnJTBBJTBBdHJhaW5pbmdfYXJncyUyMCUzRCUyMEtUT0NvbmZpZyguLi4lMkMlMjB1c2VfbGlnZXJfa2VybmVsJTNEVHJ1ZSk=",highlighted:`<span class="hljs-keyword">from</span> trl.experimental.kto <span class="hljs-keyword">import</span> KTOConfig | |
| training_args = KTOConfig(..., use_liger_kernel=<span class="hljs-literal">True</span>)`,wrap:!1}}),{c(){$(n.$$.fragment)},l(t){d(n.$$.fragment,t)},m(t,f){h(n,t,f),a=!0},p:L,i(t){a||(y(n.$$.fragment,t),a=!0)},o(t){w(n.$$.fragment,t),a=!1},d(t){_(n,t)}}}function fl(v){let n,a;return n=new Z({props:{code:"ZnJvbSUyMHRybC5leHBlcmltZW50YWwuZ2tkJTIwaW1wb3J0JTIwR0tEQ29uZmlnJTBBJTBBdHJhaW5pbmdfYXJncyUyMCUzRCUyMEdLRENvbmZpZyguLi4lMkMlMjB1c2VfbGlnZXJfa2VybmVsJTNEVHJ1ZSk=",highlighted:`<span class="hljs-keyword">from</span> trl.experimental.gkd <span class="hljs-keyword">import</span> GKDConfig | |
| training_args = GKDConfig(..., use_liger_kernel=<span class="hljs-literal">True</span>)`,wrap:!1}}),{c(){$(n.$$.fragment)},l(t){d(n.$$.fragment,t)},m(t,f){h(n,t,f),a=!0},p:L,i(t){a||(y(n.$$.fragment,t),a=!0)},o(t){w(n.$$.fragment,t),a=!1},d(t){_(n,t)}}}function ml(v){let n,a,t,f,r,c,b,U,m,k;return n=new j({props:{id:"liger",option:"SFT",$$slots:{default:[al]},$$scope:{ctx:v}}}),t=new j({props:{id:"liger",option:"DPO",$$slots:{default:[rl]},$$scope:{ctx:v}}}),r=new j({props:{id:"liger",option:"GRPO",$$slots:{default:[ol]},$$scope:{ctx:v}}}),b=new j({props:{id:"liger",option:"KTO",$$slots:{default:[pl]},$$scope:{ctx:v}}}),m=new j({props:{id:"liger",option:"GKD",$$slots:{default:[fl]},$$scope:{ctx:v}}}),{c(){$(n.$$.fragment),a=o(),$(t.$$.fragment),f=o(),$(r.$$.fragment),c=o(),$(b.$$.fragment),U=o(),$(m.$$.fragment)},l(g){d(n.$$.fragment,g),a=p(g),d(t.$$.fragment,g),f=p(g),d(r.$$.fragment,g),c=p(g),d(b.$$.fragment,g),U=p(g),d(m.$$.fragment,g)},m(g,u){h(n,g,u),i(g,a,u),h(t,g,u),i(g,f,u),h(r,g,u),i(g,c,u),h(b,g,u),i(g,U,u),h(m,g,u),k=!0},p(g,u){const R={};u&2&&(R.$$scope={dirty:u,ctx:g}),n.$set(R);const F={};u&2&&(F.$$scope={dirty:u,ctx:g}),t.$set(F);const J={};u&2&&(J.$$scope={dirty:u,ctx:g}),r.$set(J);const x={};u&2&&(x.$$scope={dirty:u,ctx:g}),b.$set(x);const V={};u&2&&(V.$$scope={dirty:u,ctx:g}),m.$set(V)},i(g){k||(y(n.$$.fragment,g),y(t.$$.fragment,g),y(r.$$.fragment,g),y(b.$$.fragment,g),y(m.$$.fragment,g),k=!0)},o(g){w(n.$$.fragment,g),w(t.$$.fragment,g),w(r.$$.fragment,g),w(b.$$.fragment,g),w(m.$$.fragment,g),k=!1},d(g){g&&(s(a),s(f),s(c),s(U)),_(n,g),_(t,g),_(r,g),_(b,g),_(m,g)}}}function ul(v){let n,a;return n=new Z({props:{code:"ZnJvbSUyMHRybCUyMGltcG9ydCUyMERQT0NvbmZpZyUwQSUwQXRyYWluaW5nX2FyZ3MlMjAlM0QlMjBEUE9Db25maWcoLi4uJTJDJTIwcGFkZGluZ19mcmVlJTNEVHJ1ZSUyQyUyMG1vZGVsX2luaXRfa3dhcmdzJTNEJTdCJTIyYXR0bl9pbXBsZW1lbnRhdGlvbiUyMiUzQSUyMCUyMmtlcm5lbHMtY29tbXVuaXR5JTJGZmxhc2gtYXR0bjIlMjIlN0Qp",highlighted:`<span class="hljs-keyword">from</span> trl <span 
class="hljs-keyword">import</span> DPOConfig | |
| training_args = DPOConfig(..., padding_free=<span class="hljs-literal">True</span>, model_init_kwargs={<span class="hljs-string">"attn_implementation"</span>: <span class="hljs-string">"kernels-community/flash-attn2"</span>})`,wrap:!1}}),{c(){$(n.$$.fragment)},l(t){d(n.$$.fragment,t)},m(t,f){h(n,t,f),a=!0},p:L,i(t){a||(y(n.$$.fragment,t),a=!0)},o(t){w(n.$$.fragment,t),a=!1},d(t){_(n,t)}}}function gl(v){let n,a;return n=new Z({props:{code:"ZnJvbSUyMHRybCUyMGltcG9ydCUyMFNGVENvbmZpZyUwQSUwQXRyYWluaW5nX2FyZ3MlMjAlM0QlMjBTRlRDb25maWcoLi4uJTJDJTIwcGFkZGluZ19mcmVlJTNEVHJ1ZSUyQyUyMG1vZGVsX2luaXRfa3dhcmdzJTNEJTdCJTIyYXR0bl9pbXBsZW1lbnRhdGlvbiUyMiUzQSUyMCUyMmtlcm5lbHMtY29tbXVuaXR5JTJGZmxhc2gtYXR0bjIlMjIlN0Qp",highlighted:`<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> SFTConfig | |
| training_args = SFTConfig(..., padding_free=<span class="hljs-literal">True</span>, model_init_kwargs={<span class="hljs-string">"attn_implementation"</span>: <span class="hljs-string">"kernels-community/flash-attn2"</span>})`,wrap:!1}}),{c(){$(n.$$.fragment)},l(t){d(n.$$.fragment,t)},m(t,f){h(n,t,f),a=!0},p:L,i(t){a||(y(n.$$.fragment,t),a=!0)},o(t){w(n.$$.fragment,t),a=!1},d(t){_(n,t)}}}function cl(v){let n,a,t,f;return n=new j({props:{id:"padding-free",option:"DPO",$$slots:{default:[ul]},$$scope:{ctx:v}}}),t=new j({props:{id:"padding-free",option:"SFT",$$slots:{default:[gl]},$$scope:{ctx:v}}}),{c(){$(n.$$.fragment),a=o(),$(t.$$.fragment)},l(r){d(n.$$.fragment,r),a=p(r),d(t.$$.fragment,r)},m(r,c){h(n,r,c),i(r,a,c),h(t,r,c),f=!0},p(r,c){const b={};c&2&&(b.$$scope={dirty:c,ctx:r}),n.$set(b);const U={};c&2&&(U.$$scope={dirty:c,ctx:r}),t.$set(U)},i(r){f||(y(n.$$.fragment,r),y(t.$$.fragment,r),f=!0)},o(r){w(n.$$.fragment,r),w(t.$$.fragment,r),f=!1},d(r){r&&s(a),_(n,r),_(t,r)}}}function $l(v){let n,a;return n=new Z({props:{code:"ZnJvbSUyMHRybCUyMGltcG9ydCUyMFNGVENvbmZpZyUwQSUwQXRyYWluaW5nX2FyZ3MlMjAlM0QlMjBTRlRDb25maWcoLi4uJTJDJTIwcGFkX3RvX211bHRpcGxlX29mJTNEMjA0OCk=",highlighted:`<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> SFTConfig | |
| training_args = SFTConfig(..., pad_to_multiple_of=<span class="hljs-number">2048</span>)`,wrap:!1}}),{c(){$(n.$$.fragment)},l(t){d(n.$$.fragment,t)},m(t,f){h(n,t,f),a=!0},p:L,i(t){a||(y(n.$$.fragment,t),a=!0)},o(t){w(n.$$.fragment,t),a=!1},d(t){_(n,t)}}}function dl(v){let n,a;return n=new Z({props:{code:"ZnJvbSUyMHRybCUyMGltcG9ydCUyMFJld2FyZENvbmZpZyUwQSUwQXRyYWluaW5nX2FyZ3MlMjAlM0QlMjBSZXdhcmRDb25maWcoLi4uJTJDJTIwcGFkX3RvX211bHRpcGxlX29mJTNEMjA0OCk=",highlighted:`<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> RewardConfig | |
| training_args = RewardConfig(..., pad_to_multiple_of=<span class="hljs-number">2048</span>)`,wrap:!1}}),{c(){$(n.$$.fragment)},l(t){d(n.$$.fragment,t)},m(t,f){h(n,t,f),a=!0},p:L,i(t){a||(y(n.$$.fragment,t),a=!0)},o(t){w(n.$$.fragment,t),a=!1},d(t){_(n,t)}}}function hl(v){let n,a,t,f;return n=new j({props:{id:"pad_to_multiple_of",option:"SFT",$$slots:{default:[$l]},$$scope:{ctx:v}}}),t=new j({props:{id:"pad_to_multiple_of",option:"Reward",$$slots:{default:[dl]},$$scope:{ctx:v}}}),{c(){$(n.$$.fragment),a=o(),$(t.$$.fragment)},l(r){d(n.$$.fragment,r),a=p(r),d(t.$$.fragment,r)},m(r,c){h(n,r,c),i(r,a,c),h(t,r,c),f=!0},p(r,c){const b={};c&2&&(b.$$scope={dirty:c,ctx:r}),n.$set(b);const U={};c&2&&(U.$$scope={dirty:c,ctx:r}),t.$set(U)},i(r){f||(y(n.$$.fragment,r),y(t.$$.fragment,r),f=!0)},o(r){w(n.$$.fragment,r),w(t.$$.fragment,r),f=!1},d(r){r&&s(a),_(n,r),_(t,r)}}}function yl(v){let n,a;return n=new Z({props:{code:"ZnJvbSUyMHRybCUyMGltcG9ydCUyMEdSUE9Db25maWclMEElMEF0cmFpbmluZ19hcmdzJTIwJTNEJTIwR1JQT0NvbmZpZyguLi4lMkMlMjBkczNfZ2F0aGVyX2Zvcl9nZW5lcmF0aW9uJTNERmFsc2Up",highlighted:`<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> GRPOConfig | |
| training_args = GRPOConfig(..., ds3_gather_for_generation=<span class="hljs-literal">False</span>)`,wrap:!1}}),{c(){$(n.$$.fragment)},l(t){d(n.$$.fragment,t)},m(t,f){h(n,t,f),a=!0},p:L,i(t){a||(y(n.$$.fragment,t),a=!0)},o(t){w(n.$$.fragment,t),a=!1},d(t){_(n,t)}}}function wl(v){let n,a;return n=new Z({props:{code:"ZnJvbSUyMHRybC5leHBlcmltZW50YWwub25saW5lX2RwbyUyMGltcG9ydCUyME9ubGluZURQT0NvbmZpZyUwQSUwQXRyYWluaW5nX2FyZ3MlMjAlM0QlMjBPbmxpbmVEUE9Db25maWcoLi4uJTJDJTIwZHMzX2dhdGhlcl9mb3JfZ2VuZXJhdGlvbiUzREZhbHNlKQ==",highlighted:`<span class="hljs-keyword">from</span> trl.experimental.online_dpo <span class="hljs-keyword">import</span> OnlineDPOConfig | |
| training_args = OnlineDPOConfig(..., ds3_gather_for_generation=<span class="hljs-literal">False</span>)`,wrap:!1}}),{c(){$(n.$$.fragment)},l(t){d(n.$$.fragment,t)},m(t,f){h(n,t,f),a=!0},p:L,i(t){a||(y(n.$$.fragment,t),a=!0)},o(t){w(n.$$.fragment,t),a=!1},d(t){_(n,t)}}}function _l(v){let n,a;return n=new Z({props:{code:"ZnJvbSUyMHRybC5leHBlcmltZW50YWwucHBvJTIwaW1wb3J0JTIwUFBPQ29uZmlnJTBBJTBBdHJhaW5pbmdfYXJncyUyMCUzRCUyMFBQT0NvbmZpZyguLi4lMkMlMjBkczNfZ2F0aGVyX2Zvcl9nZW5lcmF0aW9uJTNERmFsc2Up",highlighted:`<span class="hljs-keyword">from</span> trl.experimental.ppo <span class="hljs-keyword">import</span> PPOConfig | |
| training_args = PPOConfig(..., ds3_gather_for_generation=<span class="hljs-literal">False</span>)`,wrap:!1}}),{c(){$(n.$$.fragment)},l(t){d(n.$$.fragment,t)},m(t,f){h(n,t,f),a=!0},p:L,i(t){a||(y(n.$$.fragment,t),a=!0)},o(t){w(n.$$.fragment,t),a=!1},d(t){_(n,t)}}}function bl(v){let n,a;return n=new Z({props:{code:"ZnJvbSUyMHRybCUyMGltcG9ydCUyMFJMT09Db25maWclMEElMEF0cmFpbmluZ19hcmdzJTIwJTNEJTIwUkxPT0NvbmZpZyguLi4lMkMlMjBkczNfZ2F0aGVyX2Zvcl9nZW5lcmF0aW9uJTNERmFsc2Up",highlighted:`<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> RLOOConfig | |
| training_args = RLOOConfig(..., ds3_gather_for_generation=<span class="hljs-literal">False</span>)`,wrap:!1}}),{c(){$(n.$$.fragment)},l(t){d(n.$$.fragment,t)},m(t,f){h(n,t,f),a=!0},p:L,i(t){a||(y(n.$$.fragment,t),a=!0)},o(t){w(n.$$.fragment,t),a=!1},d(t){_(n,t)}}}function Tl(v){let n,a,t,f,r,c,b,U;return n=new j({props:{id:"ds3_gather_for_generation",option:"GRPO",$$slots:{default:[yl]},$$scope:{ctx:v}}}),t=new j({props:{id:"ds3_gather_for_generation",option:"Online DPO",$$slots:{default:[wl]},$$scope:{ctx:v}}}),r=new j({props:{id:"ds3_gather_for_generation",option:"PPO",$$slots:{default:[_l]},$$scope:{ctx:v}}}),b=new j({props:{id:"ds3_gather_for_generation",option:"RLOO",$$slots:{default:[bl]},$$scope:{ctx:v}}}),{c(){$(n.$$.fragment),a=o(),$(t.$$.fragment),f=o(),$(r.$$.fragment),c=o(),$(b.$$.fragment)},l(m){d(n.$$.fragment,m),a=p(m),d(t.$$.fragment,m),f=p(m),d(r.$$.fragment,m),c=p(m),d(b.$$.fragment,m)},m(m,k){h(n,m,k),i(m,a,k),h(t,m,k),i(m,f,k),h(r,m,k),i(m,c,k),h(b,m,k),U=!0},p(m,k){const g={};k&2&&(g.$$scope={dirty:k,ctx:m}),n.$set(g);const u={};k&2&&(u.$$scope={dirty:k,ctx:m}),t.$set(u);const R={};k&2&&(R.$$scope={dirty:k,ctx:m}),r.$set(R);const F={};k&2&&(F.$$scope={dirty:k,ctx:m}),b.$set(F)},i(m){U||(y(n.$$.fragment,m),y(t.$$.fragment,m),y(r.$$.fragment,m),y(b.$$.fragment,m),U=!0)},o(m){w(n.$$.fragment,m),w(t.$$.fragment,m),w(r.$$.fragment,m),w(b.$$.fragment,m),U=!1},d(m){m&&(s(a),s(f),s(c)),_(n,m),_(t,m),_(r,m),_(b,m)}}}function Ml(v){let n,a;return n=new Z({props:{code:"ZnJvbSUyMHRybCUyMGltcG9ydCUyMEdSUE9Db25maWclMEElMEF0cmFpbmluZ19hcmdzJTIwJTNEJTIwR1JQT0NvbmZpZyguLi4lMkMlMjB2bGxtX2VuYWJsZV9zbGVlcF9tb2RlJTNEVHJ1ZSk=",highlighted:`<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> GRPOConfig | |
| training_args = GRPOConfig(..., vllm_enable_sleep_mode=<span class="hljs-literal">True</span>)`,wrap:!1}}),{c(){$(n.$$.fragment)},l(t){d(n.$$.fragment,t)},m(t,f){h(n,t,f),a=!0},p:L,i(t){a||(y(n.$$.fragment,t),a=!0)},o(t){w(n.$$.fragment,t),a=!1},d(t){_(n,t)}}}function vl(v){let n,a;return n=new Z({props:{code:"ZnJvbSUyMHRybCUyMGltcG9ydCUyMFJMT09Db25maWclMEElMEF0cmFpbmluZ19hcmdzJTIwJTNEJTIwUkxPT0NvbmZpZyguLi4lMkMlMjB2bGxtX2VuYWJsZV9zbGVlcF9tb2RlJTNEVHJ1ZSk=",highlighted:`<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> RLOOConfig | |
| training_args = RLOOConfig(..., vllm_enable_sleep_mode=<span class="hljs-literal">True</span>)`,wrap:!1}}),{c(){$(n.$$.fragment)},l(t){d(n.$$.fragment,t)},m(t,f){h(n,t,f),a=!0},p:L,i(t){a||(y(n.$$.fragment,t),a=!0)},o(t){w(n.$$.fragment,t),a=!1},d(t){_(n,t)}}}function Cl(v){let n,a,t,f;return n=new j({props:{id:"vllm_sleep",option:"GRPO",$$slots:{default:[Ml]},$$scope:{ctx:v}}}),t=new j({props:{id:"vllm_sleep",option:"RLOO",$$slots:{default:[vl]},$$scope:{ctx:v}}}),{c(){$(n.$$.fragment),a=o(),$(t.$$.fragment)},l(r){d(n.$$.fragment,r),a=p(r),d(t.$$.fragment,r)},m(r,c){h(n,r,c),i(r,a,c),h(t,r,c),f=!0},p(r,c){const b={};c&2&&(b.$$scope={dirty:c,ctx:r}),n.$set(b);const U={};c&2&&(U.$$scope={dirty:c,ctx:r}),t.$set(U)},i(r){f||(y(n.$$.fragment,r),y(t.$$.fragment,r),f=!0)},o(r){w(n.$$.fragment,r),w(t.$$.fragment,r),f=!1},d(r){r&&s(a),_(n,r),_(t,r)}}}function kl(v){let n,a,t,f,r,c,b,U,m,k="Training workflows can often be optimized to <strong>reduce memory consumption</strong>, and TRL provides several built-in features to help achieve this.",g,u,R="Below, we outline these techniques and recommend experimenting with different combinations to figure out which configuration works best for your specific setup.",F,J,x="Each method includes examples for the supported trainers. If you’re unsure whether a technique is compatible with your trainer, please take a look at the corresponding trainer documentation.",V,z,nn='For additional strategies, such as <strong>gradient checkpointing</strong>, which is supported across all trainers, see the <a href="https://huggingface.co/docs/transformers/perf_train_gpu_one#gradient-checkpointing" rel="nofollow"><code>transformers</code> performance guide</a>.',Ne,Y,De,A,ln="Sequence lengths in the dataset can vary widely. 
When data is batched, sequences are padded to match the longest one in the batch, which can cause high memory usage, even if most sequences are relatively short.",Ve,K,sn='<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/why_you_should_truncate.png" alt="Truncation prompt-completion"/>',ze,ee,an="To reduce memory usage, it’s important to truncate sequences to a reasonable length. While TRL trainers truncate sequences by default, you may want to adjust the default truncation length to better align with your specific use case.",Ye,E,Ae,te,Ke,ne,rn="If <code>max_length</code> is too small, a significant portion of your tokens will be discarded and won’t contribute to training. If it’s too large, memory usage can spike, potentially leading to out-of-memory (OOM) errors. Without packing or padding-free, a large <code>max_length</code> may also result in inefficient training, as many tokens will be padding.",et,le,on="To help you choose an appropriate value, we provide a utility to visualize the sequence length distribution in your dataset.",tt,G,pn,nt,se,lt,W,fn="<p>This technique is available only for <strong>SFT</strong> training and setups that use <strong>FlashAttention</strong> (or its variants).</p>",st,ie,mn='<a href="#truncation">Truncation</a> has several drawbacks:',it,ae,un="<li><strong>Loss of information</strong>: Important tokens at the end of sequences may be discarded.</li> <li><strong>Choosing truncation length</strong>: Too short loses data; too long reduces efficiency.</li>",at,re,gn="Packing mitigates these issues by grouping multiple sequences into the same training row, filling each row up to <code>max_length</code>.",rt,oe,cn='<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/packing_3.png" alt="Packing"/>',ot,pe,$n="TRL implements packing using <strong>Best-Fit Decreasing (BFD)</strong> bin packing, which groups sequences efficiently while minimizing padding. 
When a sequence exceeds <code>max_length</code>, different strategies determine how the overflow tokens are handled.",pt,fe,dn="TRL supports three strategies:",ft,me,hn='<li><p><code>"bfd"</code> (default): Uses <strong>Best-Fit Decreasing packing</strong>. If a sequence exceeds <code>max_length</code>, the overflow tokens are discarded.</p></li> <li><p><code>"bfd_split"</code>: Uses <strong>Best-Fit Decreasing packing</strong>, but long sequences are split into chunks ≤ <code>max_length</code> before packing. This preserves all tokens and follows the approach proposed in <a href="https://huggingface.co/papers/2404.10830" rel="nofollow">Fewer Truncations Improve Language Modeling</a>.</p></li> <li><p><code>"wrapped"</code>: All tokens are concatenated into a stream and split into fixed-length blocks. This minimizes padding but may mix unrelated examples. This strategy corresponds to the <em>concatenate-then-split</em> preprocessing described in the literature (e.g., <a href="https://huggingface.co/papers/2404.10830" rel="nofollow">Fewer Truncations Improve Language Modeling</a>). It has the downside of breaking sequence continuity for a large fraction of the dataset, which hurts performance, as discussed in the <a href="https://huggingface.co/papers/2603.00729" rel="nofollow">Qwen3-Coder-Next Technical Report</a>.</p></li>',mt,S,yn="<p>If all sequences are shorter than <code>max_length</code>, <strong><code>bfd</code> and <code>bfd_split</code> behave identically</strong>, since no truncation or splitting is required.</p>",ut,ue,gt,ge,ct,ce,wn="Parameter-Efficient Fine-Tuning (PEFT) methods like LoRA are among the most effective techniques for reducing memory usage during training. 
Instead of training all model parameters, PEFT methods train only a small number of adapter parameters, significantly reducing memory requirements and enabling fine-tuning of larger models on limited hardware.",$t,$e,_n='For comprehensive details on using PEFT with TRL, including various adapter methods, quantization options, and advanced configurations, see <a href="peft_integration">PEFT Integration</a>.',dt,de,bn="To use PEFT for reducing memory usage:",ht,he,yt,ye,Tn='PEFT can be combined with other memory reduction techniques such as quantization (4-bit or 8-bit) for even greater memory savings. See <a href="peft_integration">PEFT Integration</a> for quantization examples.',wt,we,_t,_e,Mn='<a href="https://github.com/linkedin/Liger-Kernel" rel="nofollow">Liger Kernel</a> is a collection of Triton kernels designed specifically for LLM training. It can effectively increase multi-GPU training throughput by 20% and reduce memory usage by 60%.',bt,be,vn='For more information, see <a href="liger_kernel_integration">Liger Kernel Integration</a>.',Tt,Te,Cn="To use Liger for reducing peak memory usage, use the following code snippet:",Mt,O,vt,Me,Ct,ve,kn="Padding-free batching is an alternative approach for reducing memory usage. In this method, a batch is first sampled and then flattened into a single sequence, avoiding padding. Unlike packing, which can result in incomplete sequences by combining parts of different samples, padding-free batching ensures that all sequences remain complete and intact.",kt,Ce,Jn='<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/padding-free.png" alt="Padding-free"/>',Jt,q,Un="<p>It’s highly recommended to use padding-free batching with <strong>FlashAttention 2</strong> or <strong>FlashAttention 3</strong>. 
Otherwise, you may encounter batch contamination issues.</p>",Ut,X,Zt,ke,Rt,Je,Zn="Activation offloading is a memory efficiency technique that reduces GPU VRAM usage by temporarily moving activation tensors to CPU RAM during the forward pass and bringing them back only when needed for the backward pass. This significantly reduces peak memory usage at the cost of slightly increased training time.",jt,Ue,Rn="To enable activation offloading in your SFT training configuration:",Lt,Ze,Ft,Re,jn='Under the hood, activation offloading implements PyTorch’s <a href="https://pytorch.org/tutorials/intermediate/autograd_saved_tensors_hooks_tutorial.html#hooks-for-autograd-saved-tensors" rel="nofollow"><code>saved_tensors_hooks</code></a> to intercept activations during the forward pass. It intelligently manages which tensors to offload based on size and context, avoiding offloading output tensors that would be inefficient. For performance optimization, it can, via a flag (which is true by default), use CUDA streams to overlap computation with CPU-GPU transfers.',xt,je,Pt,B,Ln="<p>This technique is supported for <strong>SFT</strong> and <strong>Reward</strong> trainers currently.</p>",Ht,Le,Fn=`When enabled, this option ensures that all sequences are <strong>padded to a multiple</strong> of the specified value.<br/> | |
| This can improve computational efficiency on some hardware by aligning sequence lengths to memory-friendly boundaries.`,Gt,I,Et,Fe,Wt,xe,xn='When using DeepSpeed ZeRO-3, model weights are sharded across multiple GPUs. Online methods involve generating completions from the model as part of the training process. During this step, the model weights are temporarily gathered on a single GPU for generation. For very large models, this gathering can lead to OOM errors, as described in this issue: <a href="https://github.com/huggingface/trl/issues/2250#issue-2598304204" rel="nofollow">#2250</a>.',St,Pe,Pn="If you encounter this issue, you can disable the gathering of model weights for generation by setting the following parameter:",Ot,Q,qt,He,Hn="This adjustment prevents model weights from being gathered, avoiding OOM errors, but it may result in slower generation speeds.",Xt,Ge,Bt,Ee,Gn="When using <strong>vLLM</strong> as the generation backend for online training methods, you can enable <em>sleep mode</em> to offload vLLM parameters and cache to CPU RAM during the optimization step and reload them back to GPU VRAM when needed for weight synchronization and generation.",It,N,Qt,We,En="Offloading the vLLM weights and cache helps keep GPU memory usage low, which can be particularly beneficial when training large models or using limited GPU resources. However, waking the vLLM engine from sleep mode introduces some host–device transfer latency, which may slightly impact training speed.",Nt,Se,Dt,Oe,Wn="Gradient checkpointing trades compute for memory by not storing all intermediate activations during the forward pass, recomputing them during the backward pass instead.",Vt,qe,zt,D,Sn="<p>Gradient checkpointing is enabled by default in all trainers to optimize memory usage. 
You can disable it by setting <code>gradient_checkpointing=False</code> if needed.</p>",Yt,Xe,On='For more memory optimization techniques, see the <a href="https://huggingface.co/docs/transformers/perf_train_gpu_one#gradient-checkpointing" rel="nofollow">Transformers Performance Guide</a>.',At,Be,Kt,Qe,en;return r=new tl({props:{containerStyle:"float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"}}),b=new P({props:{title:"Reducing Memory Usage",local:"reducing-memory-usage",headingTag:"h1"}}),Y=new P({props:{title:"Truncation",local:"truncation",headingTag:"h2"}}),E=new Ie({props:{id:"truncation",options:["DPO","SFT"],$$slots:{default:[il]},$$scope:{ctx:v}}}),te=new P({props:{title:"How to choose the max_length value?",local:"how-to-choose-the-maxlength-value",headingTag:"h3"}}),se=new P({props:{title:"Packing",local:"packing",headingTag:"h2"}}),ue=new Z({props:{code:"ZnJvbSUyMHRybCUyMGltcG9ydCUyMFNGVENvbmZpZyUwQSUwQXRyYWluaW5nX2FyZ3MlMjAlM0QlMjBTRlRDb25maWcoJTBBJTIwJTIwJTIwJTIwLi4uJTJDJTBBJTIwJTIwJTIwJTIwcGFja2luZyUzRFRydWUlMkMlMEElMjAlMjAlMjAlMjBwYWNraW5nX3N0cmF0ZWd5JTNEJTIyYmZkJTIyJTJDJTBBJTIwJTIwJTIwJTIwbWF4X2xlbmd0aCUzRDUxMiUyQyUwQSk=",highlighted:`<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> SFTConfig | |
| training_args = SFTConfig( | |
| ..., | |
| packing=<span class="hljs-literal">True</span>, | |
| packing_strategy=<span class="hljs-string">"bfd"</span>, | |
| max_length=<span class="hljs-number">512</span>, | |
| )`,wrap:!1}}),ge=new P({props:{title:"PEFT for parameter-efficient fine-tuning",local:"peft-for-parameter-efficient-fine-tuning",headingTag:"h2"}}),he=new Z({props:{code:"ZnJvbSUyMGRhdGFzZXRzJTIwaW1wb3J0JTIwbG9hZF9kYXRhc2V0JTBBZnJvbSUyMHBlZnQlMjBpbXBvcnQlMjBMb3JhQ29uZmlnJTBBZnJvbSUyMHRybCUyMGltcG9ydCUyMFNGVFRyYWluZXIlMEElMEFkYXRhc2V0JTIwJTNEJTIwbG9hZF9kYXRhc2V0KCUyMnRybC1saWIlMkZDYXB5YmFyYSUyMiUyQyUyMHNwbGl0JTNEJTIydHJhaW4lMjIpJTBBJTBBcGVmdF9jb25maWclMjAlM0QlMjBMb3JhQ29uZmlnKCklMEElMEF0cmFpbmVyJTIwJTNEJTIwU0ZUVHJhaW5lciglMEElMjAlMjAlMjAlMjBtb2RlbCUzRCUyMlF3ZW4lMkZRd2VuMi41LTAuNUIlMjIlMkMlMEElMjAlMjAlMjAlMjB0cmFpbl9kYXRhc2V0JTNEZGF0YXNldCUyQyUwQSUyMCUyMCUyMCUyMHBlZnRfY29uZmlnJTNEcGVmdF9jb25maWclMkMlMEEp",highlighted:`<span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset | |
| <span class="hljs-keyword">from</span> peft <span class="hljs-keyword">import</span> LoraConfig | |
| <span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> SFTTrainer | |
| dataset = load_dataset(<span class="hljs-string">"trl-lib/Capybara"</span>, split=<span class="hljs-string">"train"</span>) | |
| peft_config = LoraConfig() | |
| trainer = SFTTrainer( | |
| model=<span class="hljs-string">"Qwen/Qwen2.5-0.5B"</span>, | |
| train_dataset=dataset, | |
| peft_config=peft_config, | |
| )`,wrap:!1}}),we=new P({props:{title:"Liger for reducing peak memory usage",local:"liger-for-reducing-peak-memory-usage",headingTag:"h2"}}),O=new Ie({props:{id:"liger",options:["SFT","DPO","GRPO","KTO","GKD"],$$slots:{default:[ml]},$$scope:{ctx:v}}}),Me=new P({props:{title:"Padding-free",local:"padding-free",headingTag:"h2"}}),X=new Ie({props:{id:"padding-free",options:["DPO","SFT"],$$slots:{default:[cl]},$$scope:{ctx:v}}}),ke=new P({props:{title:"Activation offloading",local:"activation-offloading",headingTag:"h2"}}),Ze=new Z({props:{code:"ZnJvbSUyMHRybCUyMGltcG9ydCUyMFNGVENvbmZpZyUwQSUwQXRyYWluaW5nX2FyZ3MlMjAlM0QlMjBTRlRDb25maWcoLi4uJTJDJTIwYWN0aXZhdGlvbl9vZmZsb2FkaW5nJTNEVHJ1ZSk=",highlighted:`<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> SFTConfig | |
| training_args = SFTConfig(..., activation_offloading=<span class="hljs-literal">True</span>)`,wrap:!1}}),je=new P({props:{title:"Padding Sequences to a Multiple",local:"padding-sequences-to-a-multiple",headingTag:"h2"}}),I=new Ie({props:{id:"pad_to_multiple_of",options:["SFT","Reward"],$$slots:{default:[hl]},$$scope:{ctx:v}}}),Fe=new P({props:{title:"Disabling model gathering for generation in online methods",local:"disabling-model-gathering-for-generation-in-online-methods",headingTag:"h2"}}),Q=new Ie({props:{id:"ds3_gather_for_generation",options:["GRPO","Online DPO","PPO","RLOO"],$$slots:{default:[Tl]},$$scope:{ctx:v}}}),Ge=new P({props:{title:"vLLM sleep mode",local:"vllm-sleep-mode",headingTag:"h2"}}),N=new Ie({props:{id:"vllm_sleep",options:["GRPO","RLOO"],$$slots:{default:[Cl]},$$scope:{ctx:v}}}),Se=new P({props:{title:"Gradient checkpointing",local:"gradient-checkpointing",headingTag:"h2"}}),qe=new Z({props:{code:"ZnJvbSUyMHRybCUyMGltcG9ydCUyMFNGVENvbmZpZyUwQSUwQXRyYWluaW5nX2FyZ3MlMjAlM0QlMjBTRlRDb25maWcoLi4uJTJDJTIwZ3JhZGllbnRfY2hlY2twb2ludGluZyUzRFRydWUp",highlighted:`<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> SFTConfig | |
| training_args = SFTConfig(..., gradient_checkpointing=<span class="hljs-literal">True</span>)`,wrap:!1}}),Be=new nl({props:{source:"https://github.com/huggingface/trl/blob/main/docs/source/reducing_memory_usage.md"}}),{c(){n=T("meta"),a=o(),t=T("p"),f=o(),$(r.$$.fragment),c=o(),$(b.$$.fragment),U=o(),m=T("p"),m.innerHTML=k,g=o(),u=T("p"),u.textContent=R,F=o(),J=T("p"),J.textContent=x,V=o(),z=T("p"),z.innerHTML=nn,Ne=o(),$(Y.$$.fragment),De=o(),A=T("p"),A.textContent=ln,Ve=o(),K=T("p"),K.innerHTML=sn,ze=o(),ee=T("p"),ee.textContent=an,Ye=o(),$(E.$$.fragment),Ae=o(),$(te.$$.fragment),Ke=o(),ne=T("p"),ne.innerHTML=rn,et=o(),le=T("p"),le.textContent=on,tt=o(),G=T("iframe"),nt=o(),$(se.$$.fragment),lt=o(),W=T("blockquote"),W.innerHTML=fn,st=o(),ie=T("p"),ie.innerHTML=mn,it=o(),ae=T("ol"),ae.innerHTML=un,at=o(),re=T("p"),re.innerHTML=gn,rt=o(),oe=T("p"),oe.innerHTML=cn,ot=o(),pe=T("p"),pe.innerHTML=$n,pt=o(),fe=T("p"),fe.textContent=dn,ft=o(),me=T("ul"),me.innerHTML=hn,mt=o(),S=T("blockquote"),S.innerHTML=yn,ut=o(),$(ue.$$.fragment),gt=o(),$(ge.$$.fragment),ct=o(),ce=T("p"),ce.textContent=wn,$t=o(),$e=T("p"),$e.innerHTML=_n,dt=o(),de=T("p"),de.textContent=bn,ht=o(),$(he.$$.fragment),yt=o(),ye=T("p"),ye.innerHTML=Tn,wt=o(),$(we.$$.fragment),_t=o(),_e=T("p"),_e.innerHTML=Mn,bt=o(),be=T("p"),be.innerHTML=vn,Tt=o(),Te=T("p"),Te.textContent=Cn,Mt=o(),$(O.$$.fragment),vt=o(),$(Me.$$.fragment),Ct=o(),ve=T("p"),ve.textContent=kn,kt=o(),Ce=T("p"),Ce.innerHTML=Jn,Jt=o(),q=T("blockquote"),q.innerHTML=Un,Ut=o(),$(X.$$.fragment),Zt=o(),$(ke.$$.fragment),Rt=o(),Je=T("p"),Je.textContent=Zn,jt=o(),Ue=T("p"),Ue.textContent=Rn,Lt=o(),$(Ze.$$.fragment),Ft=o(),Re=T("p"),Re.innerHTML=jn,xt=o(),$(je.$$.fragment),Pt=o(),B=T("blockquote"),B.innerHTML=Ln,Ht=o(),Le=T("p"),Le.innerHTML=Fn,Gt=o(),$(I.$$.fragment),Et=o(),$(Fe.$$.fragment),Wt=o(),xe=T("p"),xe.innerHTML=xn,St=o(),Pe=T("p"),Pe.textContent=Pn,Ot=o(),$(Q.$$.fragment),qt=o(),He=T("p"),He.textContent=Hn,Xt=o(),$(Ge.$$.fragment),Bt=o(),Ee=
T("p"),Ee.innerHTML=Gn,It=o(),$(N.$$.fragment),Qt=o(),We=T("p"),We.textContent=En,Nt=o(),$(Se.$$.fragment),Dt=o(),Oe=T("p"),Oe.textContent=Wn,Vt=o(),$(qe.$$.fragment),zt=o(),D=T("blockquote"),D.innerHTML=Sn,Yt=o(),Xe=T("p"),Xe.innerHTML=On,At=o(),$(Be.$$.fragment),Kt=o(),Qe=T("p"),this.h()},l(e){const l=Kn("svelte-u9bgzb",document.head);n=M(l,"META",{name:!0,content:!0}),l.forEach(s),a=p(e),t=M(e,"P",{}),tn(t).forEach(s),f=p(e),d(r.$$.fragment,e),c=p(e),d(b.$$.fragment,e),U=p(e),m=M(e,"P",{"data-svelte-h":!0}),C(m)!=="svelte-qdiwpl"&&(m.innerHTML=k),g=p(e),u=M(e,"P",{"data-svelte-h":!0}),C(u)!=="svelte-jfi5tv"&&(u.textContent=R),F=p(e),J=M(e,"P",{"data-svelte-h":!0}),C(J)!=="svelte-4m1hvf"&&(J.textContent=x),V=p(e),z=M(e,"P",{"data-svelte-h":!0}),C(z)!=="svelte-d6n8r3"&&(z.innerHTML=nn),Ne=p(e),d(Y.$$.fragment,e),De=p(e),A=M(e,"P",{"data-svelte-h":!0}),C(A)!=="svelte-1lhmd25"&&(A.textContent=ln),Ve=p(e),K=M(e,"P",{"data-svelte-h":!0}),C(K)!=="svelte-adsb0o"&&(K.innerHTML=sn),ze=p(e),ee=M(e,"P",{"data-svelte-h":!0}),C(ee)!=="svelte-15aiza5"&&(ee.textContent=an),Ye=p(e),d(E.$$.fragment,e),Ae=p(e),d(te.$$.fragment,e),Ke=p(e),ne=M(e,"P",{"data-svelte-h":!0}),C(ne)!=="svelte-o87210"&&(ne.innerHTML=rn),et=p(e),le=M(e,"P",{"data-svelte-h":!0}),C(le)!=="svelte-1xjnylo"&&(le.textContent=on),tt=p(e),G=M(e,"IFRAME",{src:!0,frameborder:!0,width:!0,height:!0}),tn(G).forEach(s),nt=p(e),d(se.$$.fragment,e),lt=p(e),W=M(e,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),C(W)!=="svelte-3664w3"&&(W.innerHTML=fn),st=p(e),ie=M(e,"P",{"data-svelte-h":!0}),C(ie)!=="svelte-6rm19q"&&(ie.innerHTML=mn),it=p(e),ae=M(e,"OL",{"data-svelte-h":!0}),C(ae)!=="svelte-60nq7z"&&(ae.innerHTML=un),at=p(e),re=M(e,"P",{"data-svelte-h":!0}),C(re)!=="svelte-8ot8yc"&&(re.innerHTML=gn),rt=p(e),oe=M(e,"P",{"data-svelte-h":!0}),C(oe)!=="svelte-1w0rnxt"&&(oe.innerHTML=cn),ot=p(e),pe=M(e,"P",{"data-svelte-h":!0}),C(pe)!=="svelte-1k6o2v5"&&(pe.innerHTML=$n),pt=p(e),fe=M(e,"P",{"data-svelte-h":!0}),C(fe)!=="svelte-8om845
"&&(fe.textContent=dn),ft=p(e),me=M(e,"UL",{"data-svelte-h":!0}),C(me)!=="svelte-lavx3y"&&(me.innerHTML=hn),mt=p(e),S=M(e,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),C(S)!=="svelte-13fxzt2"&&(S.innerHTML=yn),ut=p(e),d(ue.$$.fragment,e),gt=p(e),d(ge.$$.fragment,e),ct=p(e),ce=M(e,"P",{"data-svelte-h":!0}),C(ce)!=="svelte-1efqjv8"&&(ce.textContent=wn),$t=p(e),$e=M(e,"P",{"data-svelte-h":!0}),C($e)!=="svelte-11pajgc"&&($e.innerHTML=_n),dt=p(e),de=M(e,"P",{"data-svelte-h":!0}),C(de)!=="svelte-16n1y05"&&(de.textContent=bn),ht=p(e),d(he.$$.fragment,e),yt=p(e),ye=M(e,"P",{"data-svelte-h":!0}),C(ye)!=="svelte-16gxpob"&&(ye.innerHTML=Tn),wt=p(e),d(we.$$.fragment,e),_t=p(e),_e=M(e,"P",{"data-svelte-h":!0}),C(_e)!=="svelte-ag51dc"&&(_e.innerHTML=Mn),bt=p(e),be=M(e,"P",{"data-svelte-h":!0}),C(be)!=="svelte-1wxpk58"&&(be.innerHTML=vn),Tt=p(e),Te=M(e,"P",{"data-svelte-h":!0}),C(Te)!=="svelte-tcc5th"&&(Te.textContent=Cn),Mt=p(e),d(O.$$.fragment,e),vt=p(e),d(Me.$$.fragment,e),Ct=p(e),ve=M(e,"P",{"data-svelte-h":!0}),C(ve)!=="svelte-igqq2g"&&(ve.textContent=kn),kt=p(e),Ce=M(e,"P",{"data-svelte-h":!0}),C(Ce)!=="svelte-1odxc71"&&(Ce.innerHTML=Jn),Jt=p(e),q=M(e,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),C(q)!=="svelte-1t38aso"&&(q.innerHTML=Un),Ut=p(e),d(X.$$.fragment,e),Zt=p(e),d(ke.$$.fragment,e),Rt=p(e),Je=M(e,"P",{"data-svelte-h":!0}),C(Je)!=="svelte-1w4qqip"&&(Je.textContent=Zn),jt=p(e),Ue=M(e,"P",{"data-svelte-h":!0}),C(Ue)!=="svelte-yymgek"&&(Ue.textContent=Rn),Lt=p(e),d(Ze.$$.fragment,e),Ft=p(e),Re=M(e,"P",{"data-svelte-h":!0}),C(Re)!=="svelte-158zqcy"&&(Re.innerHTML=jn),xt=p(e),d(je.$$.fragment,e),Pt=p(e),B=M(e,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),C(B)!=="svelte-16zy2ex"&&(B.innerHTML=Ln),Ht=p(e),Le=M(e,"P",{"data-svelte-h":!0}),C(Le)!=="svelte-17mouyh"&&(Le.innerHTML=Fn),Gt=p(e),d(I.$$.fragment,e),Et=p(e),d(Fe.$$.fragment,e),Wt=p(e),xe=M(e,"P",{"data-svelte-h":!0}),C(xe)!=="svelte-1uhnyny"&&(xe.innerHTML=xn),St=p(e),Pe=M(e,"P",{"data-svelte-h":!0}),C(Pe)!=="svelte-gun0
x9"&&(Pe.textContent=Pn),Ot=p(e),d(Q.$$.fragment,e),qt=p(e),He=M(e,"P",{"data-svelte-h":!0}),C(He)!=="svelte-14mh10t"&&(He.textContent=Hn),Xt=p(e),d(Ge.$$.fragment,e),Bt=p(e),Ee=M(e,"P",{"data-svelte-h":!0}),C(Ee)!=="svelte-11stwpz"&&(Ee.innerHTML=Gn),It=p(e),d(N.$$.fragment,e),Qt=p(e),We=M(e,"P",{"data-svelte-h":!0}),C(We)!=="svelte-4vmmnf"&&(We.textContent=En),Nt=p(e),d(Se.$$.fragment,e),Dt=p(e),Oe=M(e,"P",{"data-svelte-h":!0}),C(Oe)!=="svelte-tl3eak"&&(Oe.textContent=Wn),Vt=p(e),d(qe.$$.fragment,e),zt=p(e),D=M(e,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),C(D)!=="svelte-z49v1g"&&(D.innerHTML=Sn),Yt=p(e),Xe=M(e,"P",{"data-svelte-h":!0}),C(Xe)!=="svelte-1c3mwrn"&&(Xe.innerHTML=On),At=p(e),d(Be.$$.fragment,e),Kt=p(e),Qe=M(e,"P",{}),tn(Qe).forEach(s),this.h()},h(){H(n,"name","hf:doc:metadata"),H(n,"content",Jl),Vn(G.src,pn="https://trl-lib-dataset-length-profiler.hf.space")||H(G,"src",pn),H(G,"frameborder","0"),H(G,"width","100%"),H(G,"height","1000"),H(W,"class","tip"),H(S,"class","note"),H(q,"class","warning"),H(B,"class","tip"),H(D,"class","note")},m(e,l){el(document.head,n),i(e,a,l),i(e,t,l),i(e,f,l),h(r,e,l),i(e,c,l),h(b,e,l),i(e,U,l),i(e,m,l),i(e,g,l),i(e,u,l),i(e,F,l),i(e,J,l),i(e,V,l),i(e,z,l),i(e,Ne,l),h(Y,e,l),i(e,De,l),i(e,A,l),i(e,Ve,l),i(e,K,l),i(e,ze,l),i(e,ee,l),i(e,Ye,l),h(E,e,l),i(e,Ae,l),h(te,e,l),i(e,Ke,l),i(e,ne,l),i(e,et,l),i(e,le,l),i(e,tt,l),i(e,G,l),i(e,nt,l),h(se,e,l),i(e,lt,l),i(e,W,l),i(e,st,l),i(e,ie,l),i(e,it,l),i(e,ae,l),i(e,at,l),i(e,re,l),i(e,rt,l),i(e,oe,l),i(e,ot,l),i(e,pe,l),i(e,pt,l),i(e,fe,l),i(e,ft,l),i(e,me,l),i(e,mt,l),i(e,S,l),i(e,ut,l),h(ue,e,l),i(e,gt,l),h(ge,e,l),i(e,ct,l),i(e,ce,l),i(e,$t,l),i(e,$e,l),i(e,dt,l),i(e,de,l),i(e,ht,l),h(he,e,l),i(e,yt,l),i(e,ye,l),i(e,wt,l),h(we,e,l),i(e,_t,l),i(e,_e,l),i(e,bt,l),i(e,be,l),i(e,Tt,l),i(e,Te,l),i(e,Mt,l),h(O,e,l),i(e,vt,l),h(Me,e,l),i(e,Ct,l),i(e,ve,l),i(e,kt,l),i(e,Ce,l),i(e,Jt,l),i(e,q,l),i(e,Ut,l),h(X,e,l),i(e,Zt,l),h(ke,e,l),i(e,Rt,l),i(e,Je,l),i(e,jt,l),i(e,Ue,l),i(e,Lt,l),
h(Ze,e,l),i(e,Ft,l),i(e,Re,l),i(e,xt,l),h(je,e,l),i(e,Pt,l),i(e,B,l),i(e,Ht,l),i(e,Le,l),i(e,Gt,l),h(I,e,l),i(e,Et,l),h(Fe,e,l),i(e,Wt,l),i(e,xe,l),i(e,St,l),i(e,Pe,l),i(e,Ot,l),h(Q,e,l),i(e,qt,l),i(e,He,l),i(e,Xt,l),h(Ge,e,l),i(e,Bt,l),i(e,Ee,l),i(e,It,l),h(N,e,l),i(e,Qt,l),i(e,We,l),i(e,Nt,l),h(Se,e,l),i(e,Dt,l),i(e,Oe,l),i(e,Vt,l),h(qe,e,l),i(e,zt,l),i(e,D,l),i(e,Yt,l),i(e,Xe,l),i(e,At,l),h(Be,e,l),i(e,Kt,l),i(e,Qe,l),en=!0},p(e,[l]){const qn={};l&2&&(qn.$$scope={dirty:l,ctx:e}),E.$set(qn);const Xn={};l&2&&(Xn.$$scope={dirty:l,ctx:e}),O.$set(Xn);const Bn={};l&2&&(Bn.$$scope={dirty:l,ctx:e}),X.$set(Bn);const In={};l&2&&(In.$$scope={dirty:l,ctx:e}),I.$set(In);const Qn={};l&2&&(Qn.$$scope={dirty:l,ctx:e}),Q.$set(Qn);const Nn={};l&2&&(Nn.$$scope={dirty:l,ctx:e}),N.$set(Nn)},i(e){en||(y(r.$$.fragment,e),y(b.$$.fragment,e),y(Y.$$.fragment,e),y(E.$$.fragment,e),y(te.$$.fragment,e),y(se.$$.fragment,e),y(ue.$$.fragment,e),y(ge.$$.fragment,e),y(he.$$.fragment,e),y(we.$$.fragment,e),y(O.$$.fragment,e),y(Me.$$.fragment,e),y(X.$$.fragment,e),y(ke.$$.fragment,e),y(Ze.$$.fragment,e),y(je.$$.fragment,e),y(I.$$.fragment,e),y(Fe.$$.fragment,e),y(Q.$$.fragment,e),y(Ge.$$.fragment,e),y(N.$$.fragment,e),y(Se.$$.fragment,e),y(qe.$$.fragment,e),y(Be.$$.fragment,e),en=!0)},o(e){w(r.$$.fragment,e),w(b.$$.fragment,e),w(Y.$$.fragment,e),w(E.$$.fragment,e),w(te.$$.fragment,e),w(se.$$.fragment,e),w(ue.$$.fragment,e),w(ge.$$.fragment,e),w(he.$$.fragment,e),w(we.$$.fragment,e),w(O.$$.fragment,e),w(Me.$$.fragment,e),w(X.$$.fragment,e),w(ke.$$.fragment,e),w(Ze.$$.fragment,e),w(je.$$.fragment,e),w(I.$$.fragment,e),w(Fe.$$.fragment,e),w(Q.$$.fragment,e),w(Ge.$$.fragment,e),w(N.$$.fragment,e),w(Se.$$.fragment,e),w(qe.$$.fragment,e),w(Be.$$.fragment,e),en=!1},d(e){e&&(s(a),s(t),s(f),s(c),s(U),s(m),s(g),s(u),s(F),s(J),s(V),s(z),s(Ne),s(De),s(A),s(Ve),s(K),s(ze),s(ee),s(Ye),s(Ae),s(Ke),s(ne),s(et),s(le),s(tt),s(G),s(nt),s(lt),s(W),s(st),s(ie),s(it),s(ae),s(at),s(re),s(rt),s(oe),s(ot),s(pe),s(pt),s(fe)
,s(ft),s(me),s(mt),s(S),s(ut),s(gt),s(ct),s(ce),s($t),s($e),s(dt),s(de),s(ht),s(yt),s(ye),s(wt),s(_t),s(_e),s(bt),s(be),s(Tt),s(Te),s(Mt),s(vt),s(Ct),s(ve),s(kt),s(Ce),s(Jt),s(q),s(Ut),s(Zt),s(Rt),s(Je),s(jt),s(Ue),s(Lt),s(Ft),s(Re),s(xt),s(Pt),s(B),s(Ht),s(Le),s(Gt),s(Et),s(Wt),s(xe),s(St),s(Pe),s(Ot),s(qt),s(He),s(Xt),s(Bt),s(Ee),s(It),s(Qt),s(We),s(Nt),s(Dt),s(Oe),s(Vt),s(zt),s(D),s(Yt),s(Xe),s(At),s(Kt),s(Qe)),s(n),_(r,e),_(b,e),_(Y,e),_(E,e),_(te,e),_(se,e),_(ue,e),_(ge,e),_(he,e),_(we,e),_(O,e),_(Me,e),_(X,e),_(ke,e),_(Ze,e),_(je,e),_(I,e),_(Fe,e),_(Q,e),_(Ge,e),_(N,e),_(Se,e),_(qe,e),_(Be,e)}}}/*
 * Jl: JSON-encoded table of contents for the "Reducing Memory Usage" page.
 * Each entry carries a `title`, an anchor slug in `local`, nested `sections`
 * and a heading `depth`. The fragment's h() hook passes it as
 * H(n,"content",Jl) right after H(n,"name","hf:doc:metadata"); presumably H
 * sets attributes on the <meta> element -- confirm against the imported helper.
 */const Jl='{"title":"Reducing Memory Usage","local":"reducing-memory-usage","sections":[{"title":"Truncation","local":"truncation","sections":[{"title":"How to choose the max_length value?","local":"how-to-choose-the-maxlength-value","sections":[],"depth":3}],"depth":2},{"title":"Packing","local":"packing","sections":[],"depth":2},{"title":"PEFT for parameter-efficient fine-tuning","local":"peft-for-parameter-efficient-fine-tuning","sections":[],"depth":2},{"title":"Liger for reducing peak memory usage","local":"liger-for-reducing-peak-memory-usage","sections":[],"depth":2},{"title":"Padding-free","local":"padding-free","sections":[],"depth":2},{"title":"Activation offloading","local":"activation-offloading","sections":[],"depth":2},{"title":"Padding Sequences to a Multiple","local":"padding-sequences-to-a-multiple","sections":[],"depth":2},{"title":"Disabling model gathering for generation in online methods","local":"disabling-model-gathering-for-generation-in-online-methods","sections":[],"depth":2},{"title":"vLLM sleep mode","local":"vllm-sleep-mode","sections":[],"depth":2},{"title":"Gradient checkpointing","local":"gradient-checkpointing","sections":[],"depth":2}],"depth":1}';/*
 * Ul: instance-init function handed to An() in the class below.
 * It registers a callback through zn (imported from the scheduler chunk;
 * presumably Svelte's onMount -- confirm) that reads the `fw` query parameter
 * from window.location.search and discards the value, then returns an empty
 * array via the comma operator. The argument `v` is not used.
 */function Ul(v){return zn(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}/*
 * xl: page component class, exported under the name `component`.
 * The constructor calls super() and then An(this, n, Ul, kl, Dn, {}), passing
 * the constructor argument `n`, the init function Ul, `kl` (not defined in this
 * view; presumably the fragment factory whose body ends just above), the
 * imported Dn helper and an empty object.
 * NOTE(review): Yn/An/Dn come from the imported runtime chunks; their exact
 * semantics are not visible here.
 */class xl extends Yn{constructor(n){super(),An(this,n,Ul,kl,Dn,{})}}export{xl as component}; | |
Xet Storage Details
- Size: 44 kB
- Xet hash: 1031235925b10dc6e8ba98f140a598832db6baf43332637300279b9d21019ab9
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.