Buckets:

rtrm's picture
download
raw
374 kB
import{s as Kg,a as sd,o as ad,n as td}from"../chunks/scheduler.7b731bd4.js";import{S as ed,i as nd,e as i,s as n,c as h,q as o,H as b,h as ld,a as m,d as t,b as l,f as T,g,j as r,r as c,u as M,k as v,l as p,m as e,n as d,t as u,o as y,p as f}from"../chunks/index.cc268345.js";import{C as pd,H as x,E as id}from"../chunks/MermaidChart.svelte_svelte_type_style_lang.341fed55.js";import{D as jn}from"../chunks/Docstring.d5b03485.js";import{C as J}from"../chunks/CodeBlock.ef6947e6.js";import{E as md}from"../chunks/ExampleCodeBlock.40226b43.js";function rd(Cn){let z,Es="Example:",W,P,E;return P=new J({props:{code:"ZnJvbSUyMHRybCUyMGltcG9ydCUyMEdSUE9UcmFpbmVyJTBBZnJvbSUyMHRybC5yZXdhcmRzJTIwaW1wb3J0JTIwYWNjdXJhY3lfcmV3YXJkJTBBZnJvbSUyMGRhdGFzZXRzJTIwaW1wb3J0JTIwbG9hZF9kYXRhc2V0JTBBJTBBZGF0YXNldCUyMCUzRCUyMGxvYWRfZGF0YXNldCglMjJ0cmwtbGliJTJGRGVlcE1hdGgtMTAzSyUyMiUyQyUyMHNwbGl0JTNEJTIydHJhaW4lMjIpJTBBJTBBdHJhaW5lciUyMCUzRCUyMEdSUE9UcmFpbmVyKCUwQSUyMCUyMCUyMCUyMG1vZGVsJTNEJTIyUXdlbiUyRlF3ZW4yLjUtMC41Qi1JbnN0cnVjdCUyMiUyQyUwQSUyMCUyMCUyMCUyMHJld2FyZF9mdW5jcyUzRGFjY3VyYWN5X3Jld2FyZCUyQyUwQSUyMCUyMCUyMCUyMHRyYWluX2RhdGFzZXQlM0RkYXRhc2V0JTJDJTBBKSUwQXRyYWluZXIudHJhaW4oKQ==",highlighted:`<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> GRPOTrainer
<span class="hljs-keyword">from</span> trl.rewards <span class="hljs-keyword">import</span> accuracy_reward
<span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
dataset = load_dataset(<span class="hljs-string">&quot;trl-lib/DeepMath-103K&quot;</span>, split=<span class="hljs-string">&quot;train&quot;</span>)
trainer = GRPOTrainer(
model=<span class="hljs-string">&quot;Qwen/Qwen2.5-0.5B-Instruct&quot;</span>,
reward_funcs=accuracy_reward,
train_dataset=dataset,
)
trainer.train()`,wrap:!1}}),{c(){z=i("p"),z.textContent=Es,W=n(),h(P.$$.fragment)},l(j){z=m(j,"P",{"data-svelte-h":!0}),r(z)!=="svelte-11lpom8"&&(z.textContent=Es),W=l(j),g(P.$$.fragment,j)},m(j,Q){e(j,z,Q),e(j,W,Q),d(P,j,Q),E=!0},p:td,i(j){E||(u(P.$$.fragment,j),E=!0)},o(j){y(P.$$.fragment,j),E=!1},d(j){j&&(t(z),t(W)),f(P,j)}}}function od(Cn){let z,Es,W,P,E,j,Q,zn,Zs,Go='<a href="https://huggingface.co/models?other=grpo,trl" rel="nofollow"><img src="https://img.shields.io/badge/All_models-GRPO-blue" alt="model badge"/></a>',kn,Ss,In,Qs,Ro='TRL supports the GRPO Trainer for training language models, as described in the paper <a href="https://huggingface.co/papers/2402.03300" rel="nofollow">DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models</a> by <a href="https://huggingface.co/syhia" rel="nofollow">Zhihong Shao</a>, <a href="https://huggingface.co/peiyiwang89" rel="nofollow">Peiyi Wang</a>, <a href="https://huggingface.co/zqh11" rel="nofollow">Qihao Zhu</a>, Runxin Xu, <a href="https://huggingface.co/haha-point" rel="nofollow">Junxiao Song</a>, Mingchuan Zhang, Y. K. Li, Y. Wu, <a href="https://huggingface.co/guoday" rel="nofollow">Daya Guo</a>.',Gn,Hs,$o="The abstract from the paper is the following:",Rn,Xs,Lo="<p>Mathematical reasoning poses a significant challenge for language models due to its complex and structured nature. In this paper, we introduce DeepSeekMath 7B, which continues pre-training DeepSeek-Coder-Base-v1.5 7B with 120B math-related tokens sourced from Common Crawl, together with natural language and code data. DeepSeekMath 7B has achieved an impressive score of 51.7% on the competition-level MATH benchmark without relying on external toolkits and voting techniques, approaching the performance level of Gemini-Ultra and GPT-4. Self-consistency over 64 samples from DeepSeekMath 7B achieves 60.9% on MATH. The mathematical reasoning capability of DeepSeekMath is attributed to two key factors: First, we harness the significant potential of publicly available web data through a meticulously engineered data selection pipeline. Second, we introduce Group Relative Policy Optimization (GRPO), a variant of Proximal Policy Optimization (PPO), that enhances mathematical reasoning abilities while concurrently optimizing the memory usage of PPO.</p>",$n,Os,No='This post-training method was contributed by <a href="https://huggingface.co/qgallouedec" rel="nofollow">Quentin Gallouédec</a>.',Ln,Ws,Nn,Vs,Bo='This example demonstrates how to train a model using the GRPO method. We train a <a href="https://huggingface.co/Qwen/Qwen2-0.5B-Instruct" rel="nofollow">Qwen 0.5B Instruct model</a> with the prompts from the <a href="https://huggingface.co/datasets/trl-lib/DeepMath-103K" rel="nofollow">DeepMath-103K dataset</a>. You can view the data in the dataset here:',Bn,V,qo,qn,Fs,Ao="Below is the script to train the model.",An,Ys,Pn,Ds,Po="Execute the script using the following command:",En,Ks,Zn,sa,Eo="Distributed across 8 GPUs, the training takes approximately 1 day.",Sn,aa,Zo='<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/grpo_curves.png" alt="GRPO curves"/>',Qn,ta,Hn,ea,So="GRPO is an online learning algorithm, meaning it improves iteratively by using the data generated by the trained model itself during training. The intuition behind GRPO objective is to maximize the advantage of the generated completions, while ensuring that the model remains close to the reference policy. To understand how GRPO works, it can be broken down into four main steps: <strong>Generating completions</strong>, <strong>computing the advantage</strong>, <strong>estimating the KL divergence</strong>, and <strong>computing the loss</strong>.",Xn,na,Qo='<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/grpo_visual.png" alt="GRPO visual"/>',On,la,Wn,F,Om,Vn,og='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>G</mi></mrow><annotation encoding="application/x-tex"> G </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.6833em;"></span><span class="mord mathnormal">G</span></span></span></span>',Fn,Yn,cg='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msub><mi>o</mi><mi>i</mi></msub></mrow><annotation encoding="application/x-tex"> o_i </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.5806em;vertical-align:-0.15em;"></span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">i</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span>',Dn,Kn,pa,sl,ps,Wm,al,hg='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>G</mi></mrow><annotation encoding="application/x-tex"> G </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.6833em;"></span><span class="mord mathnormal">G</span></span></span></span>',tl,el,gg='<span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><semantics><mrow><msub><mover accent="true"><mi>A</mi><mo>^</mo></mover><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub><mo>=</mo><mfrac><mrow><msub><mi>r</mi><mi>i</mi></msub><mo>−</mo><mtext>mean</mtext><mo stretchy="false">(</mo><mi mathvariant="bold">r</mi><mo stretchy="false">)</mo></mrow><mrow><mtext>std</mtext><mo stretchy="false">(</mo><mi mathvariant="bold">r</mi><mo stretchy="false">)</mo></mrow></mfrac></mrow><annotation encoding="application/x-tex">\\hat{A}_{i,t} = \\frac{r_i - \\text{mean}(\\mathbf{r})}{\\text{std}(\\mathbf{r})}</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1.2329em;vertical-align:-0.2861em;"></span><span class="mord"><span class="mord accent"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.9468em;"><span style="top:-3em;"><span class="pstrut" style="height:3em;"></span><span class="mord mathnormal">A</span></span><span style="top:-3.2523em;"><span class="pstrut" style="height:3em;"></span><span class="accent-body" style="left:-0.1111em;"><span class="mord">^</span></span></span></span></span></span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:2.363em;vertical-align:-0.936em;"></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.427em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord text"><span class="mord">std</span></span><span class="mopen">(</span><span class="mord mathbf">r</span><span class="mclose">)</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord"><span class="mord mathnormal" style="margin-right:0.02778em;">r</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:-0.0278em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">i</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">−</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mord text"><span class="mord">mean</span></span><span class="mopen">(</span><span class="mord mathbf">r</span><span class="mclose">)</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.936em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span></span></span></span></span>',nl,ia,Ho="This approach gives the method its name: <strong>Group Relative Policy Optimization (GRPO)</strong>.",ll,ma,L,Vm,ds,Xo="Understanding R1-Zero-Like Training: A Critical Perspective",Fm,pl,dg='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mtext>std</mtext><mo stretchy="false">(</mo><mi mathvariant="bold">r</mi><mo stretchy="false">)</mo></mrow><annotation encoding="application/x-tex"> \\text{std}(\\mathbf{r}) </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord text"><span class="mord">std</span></span><span class="mopen">(</span><span class="mord mathbf">r</span><span class="mclose">)</span></span></span></span>',il,Ie,Oo="scale_rewards=False",Ym,ra,Wo="GRPOConfig",Dm,ml,us,Vo='<p>As shown in <a href="https://huggingface.co/papers/2508.08221" rel="nofollow">Part I: Tricks or Traps? A Deep Dive into RL for LLM Reasoning (Lite PPO)</a>, calculating the mean at the local (group) level and the standard deviation at the global (batch) level enables more robust reward shaping. You can use this scaling strategy by setting <code>scale_rewards=&quot;batch&quot;</code> in <a href="/docs/trl/pr_4949/en/grpo_trainer#trl.GRPOConfig">GRPOConfig</a>.</p>',rl,oa,ol,is,Km,ys,Fo="Schulman et al. (2020)",sr,cl,ug=`<span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><semantics><mrow><msub><mi mathvariant="double-struck">D</mi><mtext>KL</mtext></msub><mrow><mo fence="true">[</mo><msub><mi>π</mi><mi>θ</mi></msub><mi mathvariant="normal">∥</mi><msub><mi>π</mi><mtext>ref</mtext></msub><mo fence="true">]</mo></mrow><mo>=</mo><mfrac><mrow><msub><mi>π</mi><mtext>ref</mtext></msub><mo stretchy="false">(</mo><msub><mi>o</mi><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub><mo>∣</mo><mi>q</mi><mo separator="true">,</mo><msub><mi>o</mi><mrow><mi>i</mi><mo separator="true">,</mo><mo>&lt;</mo><mi>t</mi></mrow></msub><mo stretchy="false">)</mo></mrow><mrow><msub><mi>π</mi><mi>θ</mi></msub><mo stretchy="false">(</mo><msub><mi>o</mi><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub><mo>∣</mo><mi>q</mi><mo separator="true">,</mo><msub><mi>o</mi><mrow><mi>i</mi><mo separator="true">,</mo><mo>&lt;</mo><mi>t</mi></mrow></msub><mo stretchy="false">)</mo></mrow></mfrac><mo>−</mo><mi>log</mi><mo>⁡</mo><mfrac><mrow><msub><mi>π</mi><mtext>ref</mtext></msub><mo stretchy="false">(</mo><msub><mi>o</mi><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub><mo>∣</mo><mi>q</mi><mo separator="true">,</mo><msub><mi>o</mi><mrow><mi>i</mi><mo separator="true">,</mo><mo>&lt;</mo><mi>t</mi></mrow></msub><mo stretchy="false">)</mo></mrow><mrow><msub><mi>π</mi><mi>θ</mi></msub><mo stretchy="false">(</mo><msub><mi>o</mi><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub><mo>∣</mo><mi>q</mi><mo separator="true">,</mo><msub><mi>o</mi><mrow><mi>i</mi><mo separator="true">,</mo><mo>&lt;</mo><mi>t</mi></mrow></msub><mo stretchy="false">)</mo></mrow></mfrac><mo>−</mo><mn>1</mn><mo separator="true">,</mo></mrow><annotation encoding="application/x-tex">\\mathbb{D}_{\\text{KL}}\\left[\\pi_\\theta \\|\\pi_{\\text{ref}}\\right] = \\frac{\\pi_{\\text{ref}}(o_{i,t} \\mid q, o_{i,&lt;t})}{\\pi_\\theta(o_{i,t} \\mid q, o_{i,&lt;t})} - \\log \\frac{\\pi_{\\text{ref}}(o_{i,t} \\mid q, o_{i,&lt;t})}{\\pi_\\theta(o_{i,t} \\mid q, o_{i,&lt;t})} - 1,
</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord"><span class="mord mathbb">D</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3283em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord text mtight"><span class="mord mtight">KL</span></span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="minner"><span class="mopen delimcenter" style="top:0em;">[</span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:-0.0359em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight" style="margin-right:0.02778em;">θ</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mord">∥</span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:-0.0359em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord text mtight"><span class="mord mtight">ref</span></span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mclose delimcenter" style="top:0em;">]</span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:2.3991em;vertical-align:-0.9721em;"></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.427em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:-0.0359em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight" style="margin-right:0.02778em;">θ</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">∣</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mord mathnormal" style="margin-right:0.03588em;">q</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mrel mtight">&lt;</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mclose">)</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:-0.0359em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord text mtight"><span class="mord mtight">ref</span></span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">∣</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mord mathnormal" style="margin-right:0.03588em;">q</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mrel mtight">&lt;</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mclose">)</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.9721em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">−</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" style="height:2.3991em;vertical-align:-0.9721em;"></span><span class="mop">lo<span style="margin-right:0.01389em;">g</span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.427em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:-0.0359em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight" style="margin-right:0.02778em;">θ</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">∣</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mord mathnormal" style="margin-right:0.03588em;">q</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mrel mtight">&lt;</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mclose">)</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:-0.0359em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord text mtight"><span class="mord mtight">ref</span></span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">∣</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mord mathnormal" style="margin-right:0.03588em;">q</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mrel mtight">&lt;</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mclose">)</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.9721em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">−</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" style="height:0.8389em;vertical-align:-0.1944em;"></span><span class="mord">1</span><span class="mpunct">,</span></span></span></span></span>`,hl,ca,gl,ha,ar,dl,yg=`<span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><semantics><mrow><msub><mi mathvariant="script">L</mi><mtext>GRPO</mtext></msub><mo stretchy="false">(</mo><mi>θ</mi><mo stretchy="false">)</mo><mo>=</mo><mo>−</mo><mfrac><mn>1</mn><mrow><munderover><mo>∑</mo><mrow><mi>i</mi><mo>=</mo><mn>1</mn></mrow><mi>G</mi></munderover><mi mathvariant="normal">∣</mi><msub><mi>o</mi><mi>i</mi></msub><mi mathvariant="normal">∣</mi></mrow></mfrac><munderover><mo>∑</mo><mrow><mi>i</mi><mo>=</mo><mn>1</mn></mrow><mi>G</mi></munderover><munderover><mo>∑</mo><mrow><mi>t</mi><mo>=</mo><mn>1</mn></mrow><mrow><mi mathvariant="normal">∣</mi><msub><mi>o</mi><mi>i</mi></msub><mi mathvariant="normal">∣</mi></mrow></munderover><mrow><mo fence="true">[</mo><mfrac><mrow><msub><mi>π</mi><mi>θ</mi></msub><mo stretchy="false">(</mo><msub><mi>o</mi><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub><mo>∣</mo><mi>q</mi><mo separator="true">,</mo><msub><mi>o</mi><mrow><mi>i</mi><mo separator="true">,</mo><mo>&lt;</mo><mi>t</mi></mrow></msub><mo stretchy="false">)</mo></mrow><msub><mrow><mo fence="true">[</mo><msub><mi>π</mi><mi>θ</mi></msub><mo stretchy="false">(</mo><msub><mi>o</mi><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub><mo>∣</mo><mi>q</mi><mo separator="true">,</mo><msub><mi>o</mi><mrow><mi>i</mi><mo separator="true">,</mo><mo>&lt;</mo><mi>t</mi></mrow></msub><mo stretchy="false">)</mo><mo fence="true">]</mo></mrow><mtext>no grad</mtext></msub></mfrac><msub><mover accent="true"><mi>A</mi><mo>^</mo></mover><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub><mo>−</mo><mi>β</mi><msub><mi mathvariant="double-struck">D</mi><mtext>KL</mtext></msub><mrow><mo fence="true">[</mo><msub><mi>π</mi><mi>θ</mi></msub><mi mathvariant="normal">∥</mi><msub><mi>π</mi><mtext>ref</mtext></msub><mo fence="true">]</mo></mrow><mo fence="true">]</mo></mrow><mo separator="true">,</mo></mrow><annotation encoding="application/x-tex">
\\mathcal{L}_{\\text{GRPO}}(\\theta) = -\\frac{1}{\\sum_{i=1}^G |o_i|} \\sum_{i=1}^G \\sum_{t=1}^{|o_i|} \\left[ \\frac{\\pi_\\theta(o_{i,t} \\mid q, o_{i,&lt; t})}{\\left[\\pi_\\theta(o_{i,t} \\mid q, o_{i,&lt; t})\\right]_{\\text{no grad}}} \\hat{A}_{i,t} - \\beta \\mathbb{D}_{\\text{KL}}\\left[\\pi_\\theta \\| \\pi_{\\text{ref}}\\right] \\right],
</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord"><span class="mord mathcal">L</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3283em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord text mtight"><span class="mord mtight">GRPO</span></span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord mathnormal" style="margin-right:0.02778em;">θ</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:3.2387em;vertical-align:-1.2777em;"></span><span class="mord">−</span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.3214em;"><span style="top:-2.1288em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mop"><span class="mop op-symbol small-op" style="position:relative;top:0em;">∑</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.9812em;"><span style="top:-2.4003em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mrel mtight">=</span><span class="mord mtight">1</span></span></span></span><span style="top:-3.2029em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">G</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2997em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord">∣</span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">i</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mord">∣</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">1</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:1.1709em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mop op-limits"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.8283em;"><span style="top:-1.8723em;margin-left:0em;"><span class="pstrut" style="height:3.05em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mrel mtight">=</span><span class="mord mtight">1</span></span></span></span><span style="top:-3.05em;"><span class="pstrut" style="height:3.05em;"></span><span><span class="mop op-symbol large-op">∑</span></span></span><span style="top:-4.3em;margin-left:0em;"><span class="pstrut" style="height:3.05em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">G</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:1.2777em;"><span></span></span></span></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mop op-limits"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.961em;"><span style="top:-1.8829em;margin-left:0em;"><span class="pstrut" style="height:3.05em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">t</span><span class="mrel mtight">=</span><span class="mord mtight">1</span></span></span></span><span style="top:-3.05em;"><span class="pstrut" style="height:3.05em;"></span><span><span class="mop op-symbol large-op">∑</span></span></span><span style="top:-4.386em;margin-left:0em;"><span class="pstrut" style="height:3.05em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mtight">∣</span><span class="mord mtight"><span class="mord mathnormal mtight">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3281em;"><span style="top:-2.357em;margin-left:0em;margin-right:0.0714em;"><span class="pstrut" style="height:2.5em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mathnormal mtight">i</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.143em;"><span></span></span></span></span></span></span><span class="mord mtight">∣</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:1.2671em;"><span></span></span></span></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="minner"><span class="mopen delimcenter" style="top:0em;"><span class="delimsizing size4">[</span></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.427em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="minner"><span class="minner"><span class="mopen delimcenter" style="top:0em;">[</span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:-0.0359em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight" style="margin-right:0.02778em;">θ</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">∣</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mord mathnormal" style="margin-right:0.03588em;">q</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mrel mtight">&lt;</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mclose">)</span><span class="mclose delimcenter" style="top:0em;">]</span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.1503em;"><span style="top:-2.3642em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord text mtight"><span class="mord mtight">no grad</span></span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.4719em;"><span></span></span></span></span></span></span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:-0.0359em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight" style="margin-right:0.02778em;">θ</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">∣</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mord mathnormal" style="margin-right:0.03588em;">q</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mrel mtight">&lt;</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mclose">)</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:1.1579em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mord"><span class="mord accent"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.9468em;"><span style="top:-3em;"><span class="pstrut" style="height:3em;"></span><span class="mord mathnormal">A</span></span><span style="top:-3.2523em;"><span class="pstrut" style="height:3em;"></span><span class="accent-body" style="left:-0.1111em;"><span class="mord">^</span></span></span></span></span></span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">−</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mord mathnormal" style="margin-right:0.05278em;">β</span><span class="mord"><span class="mord mathbb">D</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3283em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord text mtight"><span class="mord mtight">KL</span></span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="minner"><span class="mopen delimcenter" style="top:0em;">[</span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:-0.0359em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight" style="margin-right:0.02778em;">θ</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mord">∥</span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:-0.0359em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord text mtight"><span class="mord mtight">ref</span></span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mclose delimcenter" style="top:0em;">]</span></span><span class="mclose delimcenter" style="top:0em;"><span class="delimsizing size4">]</span></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mpunct">,</span></span></span></span></span>`,ul,ga,Yo="where the first term represents the scaled advantage and the second term penalizes deviations from the reference policy through KL divergence.",yl,da,N,tr,fs,Do="DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models",er,fl,fg='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mfrac><mn>1</mn><mrow><mi mathvariant="normal">∣</mi><msub><mi>o</mi><mi>i</mi></msub><mi mathvariant="normal">∣</mi></mrow></mfrac></mrow><annotation encoding="application/x-tex"> \\frac{1}{|o_i|} </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1.3651em;vertical-align:-0.52em;"></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.8451em;"><span style="top:-2.655em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mtight">∣</span><span class="mord mtight"><span class="mord mathnormal mtight">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3281em;"><span style="top:-2.357em;margin-left:0em;margin-right:0.0714em;"><span class="pstrut" style="height:2.5em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mathnormal mtight">i</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.143em;"><span></span></span></span></span></span></span><span class="mord mtight">∣</span></span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.394em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mtight">1</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.52em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span></span></span></span>',vl,vs,Ko="Understanding R1-Zero-Like Training: A Critical Perspective",nr,ua,sc="loss types",lr,wl,ya,C,pr,ws,ac="DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models",ir,bl,vg='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>β</mi><mo>=</mo><mn>0.0</mn></mrow><annotation encoding="application/x-tex"> \\beta = 0.0 </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.8889em;vertical-align:-0.1944em;"></span><span class="mord mathnormal" style="margin-right:0.05278em;">β</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:0.6444em;"></span><span class="mord">0.0</span></span></span></span>',Ml,bs,tc="Open-Reasoner-Zero: An Open Source Approach to Scaling Up Reinforcement Learning on the Base Model",mr,Ms,ec="Understanding R1-Zero-Like Training: A Critical Perspective",rr,_s,nc="DAPO: An Open-Source LLM Reinforcement Learning System at Scale",or,Ge,lc="beta",cr,fa,pc="GRPOConfig",hr,_l,R,gr,Tl,wg='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>μ</mi></mrow><annotation encoding="application/x-tex"> \\mu </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.625em;vertical-align:-0.1944em;"></span><span class="mord mathnormal">μ</span></span></span></span>',xl,Re,ic="num_iterations",dr,va,mc="GRPOConfig",ur,$e,rc="clipped surrogate objective",yr,Jl,bg=`<span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><semantics><mrow><msub><mi mathvariant="script">L</mi><mtext>GRPO</mtext></msub><mo stretchy="false">(</mo><mi>θ</mi><mo stretchy="false">)</mo><mo>=</mo><mo>−</mo><mfrac><mn>1</mn><mrow><munderover><mo>∑</mo><mrow><mi>i</mi><mo>=</mo><mn>1</mn></mrow><mi>G</mi></munderover><mi mathvariant="normal">∣</mi><msub><mi>o</mi><mi>i</mi></msub><mi mathvariant="normal">∣</mi></mrow></mfrac><munderover><mo>∑</mo><mrow><mi>i</mi><mo>=</mo><mn>1</mn></mrow><mi>G</mi></munderover><munderover><mo>∑</mo><mrow><mi>t</mi><mo>=</mo><mn>1</mn></mrow><mrow><mi mathvariant="normal">∣</mi><msub><mi>o</mi><mi>i</mi></msub><mi mathvariant="normal">∣</mi></mrow></munderover><mrow><mo fence="true">[</mo><mi>min</mi><mo>⁡</mo><mrow><mo fence="true">(</mo><mfrac><mrow><msub><mi>π</mi><mi>θ</mi></msub><mo stretchy="false">(</mo><msub><mi>o</mi><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub><mo>∣</mo><mi>q</mi><mo separator="true">,</mo><msub><mi>o</mi><mrow><mi>i</mi><mo separator="true">,</mo><mo>&lt;</mo><mi>t</mi></mrow></msub><mo stretchy="false">)</mo></mrow><mrow><msub><mi>π</mi><msub><mi>θ</mi><mtext>old</mtext></msub></msub><mo stretchy="false">(</mo><msub><mi>o</mi><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub><mo>∣</mo><mi>q</mi><mo separator="true">,</mo><msub><mi>o</mi><mrow><mi>i</mi><mo separator="true">,</mo><mo>&lt;</mo><mi>t</mi></mrow></msub><mo stretchy="false">)</mo></mrow></mfrac><msub><mover accent="true"><mi>A</mi><mo>^</mo></mover><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub><mo separator="true">,</mo><mtext> </mtext><mtext>clip</mtext><mrow><mo fence="true">(</mo><mfrac><mrow><msub><mi>π</mi><mi>θ</mi></msub><mo stretchy="false">(</mo><msub><mi>o</mi><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub><mo>∣</mo><mi>q</mi><mo separator="true">,</mo><msub><mi>o</mi><mrow><mi>i</mi><mo separator="true">,</mo><mo>&lt;</mo><mi>t</mi></mrow></msub><mo stretchy="false">)</mo></mrow><mrow><msub><mi>π</mi><msub><mi>θ</mi><mtext>old</mtext></msub></msub><mo stretchy="false">(</mo><msub><mi>o</mi><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub><mo>∣</mo><mi>q</mi><mo separator="true">,</mo><msub><mi>o</mi><mrow><mi>i</mi><mo separator="true">,</mo><mo>&lt;</mo><mi>t</mi></mrow></msub><mo stretchy="false">)</mo></mrow></mfrac><mo separator="true">,</mo><mn>1</mn><mo>−</mo><mi>ϵ</mi><mo separator="true">,</mo><mn>1</mn><mo>+</mo><mi>ϵ</mi><mo fence="true">)</mo></mrow><msub><mover accent="true"><mi>A</mi><mo>^</mo></mover><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub><mo fence="true">)</mo></mrow><mo>−</mo><mi>β</mi><msub><mi mathvariant="double-struck">D</mi><mtext>KL</mtext></msub><mrow><mo fence="true">[</mo><msub><mi>π</mi><mi>θ</mi></msub><mi mathvariant="normal">∥</mi><msub><mi>π</mi><mtext>ref</mtext></msub><mo fence="true">]</mo></mrow><mo fence="true">]</mo></mrow><mo separator="true">,</mo></mrow><annotation encoding="application/x-tex">
\\mathcal{L}_{\\text{GRPO}}(\\theta) = - \\frac{1}{\\sum_{i=1}^G |o_i|} \\sum_{i=1}^G \\sum_{t=1}^{|o_i|} \\left[ \\min \\left( \\frac{\\pi_\\theta(o_{i,t} \\mid q, o_{i,&lt; t})}{\\pi_{\\theta_{\\text{old}}}(o_{i,t} \\mid q, o_{i,&lt; t})} \\hat{A}_{i,t}, \\, \\text{clip}\\left( \\frac{\\pi_\\theta(o_{i,t} \\mid q, o_{i,&lt; t})}{\\pi_{\\theta_{\\text{old}}}(o_{i,t} \\mid q, o_{i,&lt; t})}, 1 - \\epsilon, 1 + \\epsilon \\right) \\hat{A}_{i,t} \\right) - \\beta \\mathbb{D}_{\\text{KL}}\\left[\\pi_\\theta \\| \\pi_{\\text{ref}}\\right] \\right],
</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord"><span class="mord mathcal">L</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3283em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord text mtight"><span class="mord mtight">GRPO</span></span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord mathnormal" style="margin-right:0.02778em;">θ</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:3.2387em;vertical-align:-1.2777em;"></span><span class="mord">−</span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.3214em;"><span style="top:-2.1288em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mop"><span class="mop op-symbol small-op" style="position:relative;top:0em;">∑</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.9812em;"><span style="top:-2.4003em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mrel mtight">=</span><span class="mord mtight">1</span></span></span></span><span style="top:-3.2029em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">G</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2997em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord">∣</span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">i</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mord">∣</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">1</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:1.1709em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mop op-limits"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.8283em;"><span style="top:-1.8723em;margin-left:0em;"><span class="pstrut" style="height:3.05em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mrel mtight">=</span><span class="mord mtight">1</span></span></span></span><span style="top:-3.05em;"><span class="pstrut" style="height:3.05em;"></span><span><span class="mop op-symbol large-op">∑</span></span></span><span style="top:-4.3em;margin-left:0em;"><span class="pstrut" style="height:3.05em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">G</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:1.2777em;"><span></span></span></span></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mop op-limits"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.961em;"><span style="top:-1.8829em;margin-left:0em;"><span class="pstrut" style="height:3.05em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">t</span><span class="mrel mtight">=</span><span class="mord mtight">1</span></span></span></span><span style="top:-3.05em;"><span class="pstrut" style="height:3.05em;"></span><span><span class="mop op-symbol large-op">∑</span></span></span><span style="top:-4.386em;margin-left:0em;"><span class="pstrut" style="height:3.05em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mtight">∣</span><span class="mord mtight"><span class="mord mathnormal mtight">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3281em;"><span style="top:-2.357em;margin-left:0em;margin-right:0.0714em;"><span class="pstrut" style="height:2.5em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mathnormal mtight">i</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.143em;"><span></span></span></span></span></span></span><span class="mord mtight">∣</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:1.2671em;"><span></span></span></span></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="minner"><span class="mopen delimcenter" style="top:0em;"><span class="delimsizing size3">[</span></span><span class="mop">min</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="minner"><span class="mopen delimcenter" style="top:0em;"><span class="delimsizing size3">(</span></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.427em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:-0.0359em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mtight"><span class="mord mathnormal mtight" style="margin-right:0.02778em;">θ</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3448em;"><span style="top:-2.3488em;margin-left:-0.0278em;margin-right:0.0714em;"><span class="pstrut" style="height:2.5em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mtight"><span class="mord text mtight"><span class="mord mtight">old</span></span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.1512em;"><span></span></span></span></span></span></span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2559em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">∣</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mord mathnormal" style="margin-right:0.03588em;">q</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mrel mtight">&lt;</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mclose">)</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:-0.0359em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight" style="margin-right:0.02778em;">θ</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">∣</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mord mathnormal" style="margin-right:0.03588em;">q</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mrel mtight">&lt;</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mclose">)</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.9721em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mord"><span class="mord accent"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.9468em;"><span style="top:-3em;"><span class="pstrut" style="height:3em;"></span><span class="mord mathnormal">A</span></span><span style="top:-3.2523em;"><span class="pstrut" style="height:3em;"></span><span class="accent-body" style="left:-0.1111em;"><span class="mord">^</span></span></span></span></span></span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord text"><span class="mord">clip</span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="minner"><span class="mopen delimcenter" style="top:0em;"><span class="delimsizing size3">(</span></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.427em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:-0.0359em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mtight"><span class="mord mathnormal mtight" style="margin-right:0.02778em;">θ</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3448em;"><span style="top:-2.3488em;margin-left:-0.0278em;margin-right:0.0714em;"><span class="pstrut" style="height:2.5em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mtight"><span class="mord text mtight"><span class="mord mtight">old</span></span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.1512em;"><span></span></span></span></span></span></span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2559em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">∣</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mord mathnormal" style="margin-right:0.03588em;">q</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mrel mtight">&lt;</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mclose">)</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:-0.0359em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight" style="margin-right:0.02778em;">θ</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">∣</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mord mathnormal" style="margin-right:0.03588em;">q</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mrel mtight">&lt;</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mclose">)</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.9721em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord">1</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">−</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mord mathnormal">ϵ</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord">1</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">+</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mord mathnormal">ϵ</span><span class="mclose delimcenter" style="top:0em;"><span class="delimsizing size3">)</span></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord accent"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.9468em;"><span style="top:-3em;"><span class="pstrut" style="height:3em;"></span><span class="mord mathnormal">A</span></span><span style="top:-3.2523em;"><span class="pstrut" style="height:3em;"></span><span class="accent-body" style="left:-0.1111em;"><span class="mord">^</span></span></span></span></span></span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mclose delimcenter" style="top:0em;"><span class="delimsizing size3">)</span></span></span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">−</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mord mathnormal" style="margin-right:0.05278em;">β</span><span class="mord"><span class="mord mathbb">D</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3283em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord text mtight"><span class="mord mtight">KL</span></span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="minner"><span class="mopen delimcenter" style="top:0em;">[</span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:-0.0359em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight" style="margin-right:0.02778em;">θ</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mord">∥</span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:-0.0359em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord text mtight"><span class="mord mtight">ref</span></span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mclose delimcenter" style="top:0em;">]</span></span><span class="mclose delimcenter" style="top:0em;"><span class="delimsizing size3">]</span></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mpunct">,</span></span></span></span></span>`,Ul,$,fr,jl,Mg='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mtext>clip</mtext><mo stretchy="false">(</mo><mo>⋅</mo><mo separator="true">,</mo><mn>1</mn><mo>−</mo><mi>ϵ</mi><mo separator="true">,</mo><mn>1</mn><mo>+</mo><mi>ϵ</mi><mo stretchy="false">)</mo></mrow><annotation encoding="application/x-tex">\\text{clip}(\\cdot, 1 - \\epsilon, 1 + \\epsilon) </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord text"><span class="mord">clip</span></span><span class="mopen">(</span><span class="mord">⋅</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord">1</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">−</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" style="height:0.8389em;vertical-align:-0.1944em;"></span><span class="mord mathnormal">ϵ</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord">1</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">+</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord mathnormal">ϵ</span><span class="mclose">)</span></span></span></span>',Cl,zl,_g='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mn>1</mn><mo>−</mo><mi>ϵ</mi></mrow><annotation encoding="application/x-tex"> 1 - \\epsilon </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.7278em;vertical-align:-0.0833em;"></span><span class="mord">1</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">−</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" style="height:0.4306em;"></span><span class="mord mathnormal">ϵ</span></span></span></span>',kl,Il,Tg='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mn>1</mn><mo>+</mo><mi>ϵ</mi></mrow><annotation encoding="application/x-tex"> 1 + \\epsilon </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.7278em;vertical-align:-0.0833em;"></span><span class="mord">1</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">+</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" style="height:0.4306em;"></span><span class="mord mathnormal">ϵ</span></span></span></span>',Gl,Rl,xg='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>μ</mi><mo>=</mo><mn>1</mn></mrow><annotation encoding="application/x-tex"> \\mu = 1 </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.625em;vertical-align:-0.1944em;"></span><span class="mord mathnormal">μ</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:0.6444em;"></span><span class="mord">1</span></span></span></span>',$l,Ll,wa,Nl,ba,vr,Bl,Jg=`<span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><semantics><mrow><msub><mi mathvariant="script">L</mi><mtext>GRPO</mtext></msub><mo stretchy="false">(</mo><mi>θ</mi><mo stretchy="false">)</mo><mo>=</mo><mo>−</mo><mfrac><mn>1</mn><mi>G</mi></mfrac><munderover><mo>∑</mo><mrow><mi>i</mi><mo>=</mo><mn>1</mn></mrow><mi>G</mi></munderover><mfrac><mn>1</mn><mrow><mi mathvariant="normal">∣</mi><msub><mi>o</mi><mi>i</mi></msub><mi mathvariant="normal">∣</mi></mrow></mfrac><munderover><mo>∑</mo><mrow><mi>t</mi><mo>=</mo><mn>1</mn></mrow><mrow><mi mathvariant="normal">∣</mi><msub><mi>o</mi><mi>i</mi></msub><mi mathvariant="normal">∣</mi></mrow></munderover><msub><mi>l</mi><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub><mo separator="true">,</mo></mrow><annotation encoding="application/x-tex">
\\mathcal{L}_{\\text{GRPO}}(\\theta) = - \\frac{1}{G} \\sum_{i=1}^G \\frac{1}{|o_i|} \\sum_{t=1}^{|o_i|} l_{i,t},
</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord"><span class="mord mathcal">L</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3283em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord text mtight"><span class="mord mtight">GRPO</span></span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord mathnormal" style="margin-right:0.02778em;">θ</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:3.2387em;vertical-align:-1.2777em;"></span><span class="mord">−</span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.3214em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord mathnormal">G</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">1</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.686em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mop op-limits"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.8283em;"><span style="top:-1.8723em;margin-left:0em;"><span class="pstrut" style="height:3.05em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mrel mtight">=</span><span class="mord mtight">1</span></span></span></span><span style="top:-3.05em;"><span class="pstrut" style="height:3.05em;"></span><span><span class="mop op-symbol large-op">∑</span></span></span><span style="top:-4.3em;margin-left:0em;"><span class="pstrut" style="height:3.05em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">G</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:1.2777em;"><span></span></span></span></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.3214em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">∣</span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">i</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mord">∣</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">1</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.936em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mop op-limits"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.961em;"><span style="top:-1.8829em;margin-left:0em;"><span class="pstrut" style="height:3.05em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">t</span><span class="mrel mtight">=</span><span class="mord mtight">1</span></span></span></span><span style="top:-3.05em;"><span class="pstrut" style="height:3.05em;"></span><span><span class="mop op-symbol large-op">∑</span></span></span><span style="top:-4.386em;margin-left:0em;"><span class="pstrut" style="height:3.05em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mtight">∣</span><span class="mord mtight"><span class="mord mathnormal mtight">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3281em;"><span style="top:-2.357em;margin-left:0em;margin-right:0.0714em;"><span class="pstrut" style="height:2.5em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mathnormal mtight">i</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.143em;"><span></span></span></span></span></span></span><span class="mord mtight">∣</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:1.2671em;"><span></span></span></span></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.01968em;">l</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:-0.0197em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mpunct">,</span></span></span></span></span>`,ql,Ma,wr,Al,Ug=`<span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><semantics><mrow><msub><mi>l</mi><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub><mo>=</mo><mfrac><mrow><msub><mi>π</mi><mi>θ</mi></msub><mo stretchy="false">(</mo><msub><mi>o</mi><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub><mo>∣</mo><mi>q</mi><mo separator="true">,</mo><msub><mi>o</mi><mrow><mi>i</mi><mo separator="true">,</mo><mo>&lt;</mo><mi>t</mi></mrow></msub><mo stretchy="false">)</mo></mrow><msub><mrow><mo fence="true">[</mo><msub><mi>π</mi><mi>θ</mi></msub><mo stretchy="false">(</mo><msub><mi>o</mi><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub><mo>∣</mo><mi>q</mi><mo separator="true">,</mo><msub><mi>o</mi><mrow><mi>i</mi><mo separator="true">,</mo><mo>&lt;</mo><mi>t</mi></mrow></msub><mo stretchy="false">)</mo><mo fence="true">]</mo></mrow><mtext>no grad</mtext></msub></mfrac><msub><mover accent="true"><mi>A</mi><mo>^</mo></mover><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub><mo>−</mo><mi>β</mi><msub><mi mathvariant="double-struck">D</mi><mtext>KL</mtext></msub><mrow><mo fence="true">[</mo><msub><mi>π</mi><mi>θ</mi></msub><mi mathvariant="normal">∥</mi><msub><mi>π</mi><mtext>ref</mtext></msub><mo fence="true">]</mo></mrow><mi mathvariant="normal">.</mi></mrow><annotation encoding="application/x-tex">
l_{i,t} = \\frac{\\pi_\\theta(o_{i,t} \\mid q, o_{i,&lt; t})}{\\left[\\pi_\\theta(o_{i,t} \\mid q, o_{i,&lt; t})\\right]_{\\text{no grad}}} \\hat{A}_{i,t} - \\beta \\mathbb{D}_{\\text{KL}}\\left[\\pi_\\theta \\| \\pi_{\\text{ref}}\\right].
</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.9805em;vertical-align:-0.2861em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.01968em;">l</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:-0.0197em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:2.5849em;vertical-align:-1.1579em;"></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.427em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="minner"><span class="minner"><span class="mopen delimcenter" style="top:0em;">[</span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:-0.0359em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight" style="margin-right:0.02778em;">θ</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">∣</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mord mathnormal" style="margin-right:0.03588em;">q</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mrel mtight">&lt;</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mclose">)</span><span class="mclose delimcenter" style="top:0em;">]</span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.1503em;"><span style="top:-2.3642em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord text mtight"><span class="mord mtight">no grad</span></span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.4719em;"><span></span></span></span></span></span></span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:-0.0359em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight" style="margin-right:0.02778em;">θ</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">∣</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mord mathnormal" style="margin-right:0.03588em;">q</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mrel mtight">&lt;</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mclose">)</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:1.1579em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mord"><span class="mord accent"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.9468em;"><span style="top:-3em;"><span class="pstrut" style="height:3em;"></span><span class="mord mathnormal">A</span></span><span style="top:-3.2523em;"><span class="pstrut" style="height:3em;"></span><span class="accent-body" style="left:-0.1111em;"><span class="mord">^</span></span></span></span></span></span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">−</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord mathnormal" style="margin-right:0.05278em;">β</span><span class="mord"><span class="mord mathbb">D</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3283em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord text mtight"><span class="mord mtight">KL</span></span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="minner"><span class="mopen delimcenter" style="top:0em;">[</span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:-0.0359em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight" style="margin-right:0.02778em;">θ</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mord">∥</span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:-0.0359em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord text mtight"><span class="mord mtight">ref</span></span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mclose delimcenter" style="top:0em;">]</span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord">.</span></span></span></span></span>`,Pl,ms,br,Ts,oc="DAPO paper",Mr,El,jg=`<span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><semantics><mrow><msub><mi mathvariant="script">L</mi><mtext>DAPO</mtext></msub><mo stretchy="false">(</mo><mi>θ</mi><mo stretchy="false">)</mo><mo>=</mo><mo>−</mo><mfrac><mn>1</mn><mrow><munderover><mo>∑</mo><mrow><mi>i</mi><mo>=</mo><mn>1</mn></mrow><mi>G</mi></munderover><mi mathvariant="normal">∣</mi><msub><mi>o</mi><mi>i</mi></msub><mi mathvariant="normal">∣</mi></mrow></mfrac><munderover><mo>∑</mo><mrow><mi>i</mi><mo>=</mo><mn>1</mn></mrow><mi>G</mi></munderover><munderover><mo>∑</mo><mrow><mi>t</mi><mo>=</mo><mn>1</mn></mrow><mrow><mi mathvariant="normal">∣</mi><msub><mi>o</mi><mi>i</mi></msub><mi mathvariant="normal">∣</mi></mrow></munderover><msub><mi>l</mi><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub><mo separator="true">,</mo></mrow><annotation encoding="application/x-tex">
\\mathcal{L}_{\\text{DAPO}}(\\theta) = - \\frac{1}{\\sum_{i=1}^G |o_i|} \\sum_{i=1}^G \\sum_{t=1}^{|o_i|} l_{i,t},
</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord"><span class="mord mathcal">L</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3283em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord text mtight"><span class="mord mtight">DAPO</span></span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord mathnormal" style="margin-right:0.02778em;">θ</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:3.2387em;vertical-align:-1.2777em;"></span><span class="mord">−</span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.3214em;"><span style="top:-2.1288em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mop"><span class="mop op-symbol small-op" style="position:relative;top:0em;">∑</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.9812em;"><span style="top:-2.4003em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mrel mtight">=</span><span class="mord mtight">1</span></span></span></span><span style="top:-3.2029em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">G</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2997em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord">∣</span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">i</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mord">∣</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">1</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:1.1709em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mop op-limits"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.8283em;"><span style="top:-1.8723em;margin-left:0em;"><span class="pstrut" style="height:3.05em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mrel mtight">=</span><span class="mord mtight">1</span></span></span></span><span style="top:-3.05em;"><span class="pstrut" style="height:3.05em;"></span><span><span class="mop op-symbol large-op">∑</span></span></span><span style="top:-4.3em;margin-left:0em;"><span class="pstrut" style="height:3.05em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">G</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:1.2777em;"><span></span></span></span></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mop op-limits"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.961em;"><span style="top:-1.8829em;margin-left:0em;"><span class="pstrut" style="height:3.05em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">t</span><span class="mrel mtight">=</span><span class="mord mtight">1</span></span></span></span><span style="top:-3.05em;"><span class="pstrut" style="height:3.05em;"></span><span><span class="mop op-symbol large-op">∑</span></span></span><span style="top:-4.386em;margin-left:0em;"><span class="pstrut" style="height:3.05em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mtight">∣</span><span class="mord mtight"><span class="mord mathnormal mtight">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3281em;"><span style="top:-2.357em;margin-left:0em;margin-right:0.0714em;"><span class="pstrut" style="height:2.5em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mathnormal mtight">i</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.143em;"><span></span></span></span></span></span></span><span class="mord mtight">∣</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:1.2671em;"><span></span></span></span></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.01968em;">l</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:-0.0197em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mpunct">,</span></span></span></span></span>`,Zl,_a,cc='To use this formulation, set <code>loss_type=&quot;dapo&quot;</code> in <a href="/docs/trl/pr_4949/en/grpo_trainer#trl.GRPOConfig">GRPOConfig</a>.',Sl,rs,_r,xs,hc="Understanding R1-Zero-Like Training: A Critical Perspective",Tr,Ql,Cg=`<span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><semantics><mrow><msub><mi mathvariant="script">L</mi><mtext>Dr. GRPO</mtext></msub><mo stretchy="false">(</mo><mi>θ</mi><mo stretchy="false">)</mo><mo>=</mo><mo>−</mo><mfrac><mn>1</mn><mrow><mi>L</mi><mi>G</mi></mrow></mfrac><munderover><mo>∑</mo><mrow><mi>i</mi><mo>=</mo><mn>1</mn></mrow><mi>G</mi></munderover><munderover><mo>∑</mo><mrow><mi>t</mi><mo>=</mo><mn>1</mn></mrow><mrow><mi mathvariant="normal">∣</mi><msub><mi>o</mi><mi>i</mi></msub><mi mathvariant="normal">∣</mi></mrow></munderover><msub><mi>l</mi><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub><mo separator="true">,</mo></mrow><annotation encoding="application/x-tex">
\\mathcal{L}_{\\text{Dr. GRPO}}(\\theta) = - \\frac{1}{LG} \\sum_{i=1}^G \\sum_{t=1}^{|o_i|} l_{i,t},
</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord"><span class="mord mathcal">L</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3283em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord text mtight"><span class="mord mtight">Dr. GRPO</span></span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord mathnormal" style="margin-right:0.02778em;">θ</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:3.2387em;vertical-align:-1.2777em;"></span><span class="mord">−</span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.3214em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord mathnormal">L</span><span class="mord mathnormal">G</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">1</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.686em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mop op-limits"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.8283em;"><span style="top:-1.8723em;margin-left:0em;"><span class="pstrut" style="height:3.05em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mrel mtight">=</span><span class="mord mtight">1</span></span></span></span><span style="top:-3.05em;"><span class="pstrut" style="height:3.05em;"></span><span><span class="mop op-symbol large-op">∑</span></span></span><span style="top:-4.3em;margin-left:0em;"><span class="pstrut" style="height:3.05em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">G</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:1.2777em;"><span></span></span></span></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mop op-limits"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.961em;"><span style="top:-1.8829em;margin-left:0em;"><span class="pstrut" style="height:3.05em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">t</span><span class="mrel mtight">=</span><span class="mord mtight">1</span></span></span></span><span style="top:-3.05em;"><span class="pstrut" style="height:3.05em;"></span><span><span class="mop op-symbol large-op">∑</span></span></span><span style="top:-4.386em;margin-left:0em;"><span class="pstrut" style="height:3.05em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mtight">∣</span><span class="mord mtight"><span class="mord mathnormal mtight">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3281em;"><span style="top:-2.357em;margin-left:0em;margin-right:0.0714em;"><span class="pstrut" style="height:2.5em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mathnormal mtight">i</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.143em;"><span></span></span></span></span></span></span><span class="mord mtight">∣</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:1.2671em;"><span></span></span></span></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.01968em;">l</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:-0.0197em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mpunct">,</span></span></span></span></span>`,Hl,Ta,gc='This constant is recommended to be the maximum completion length. To use this formulation, set <code>loss_type=&quot;dr_grpo&quot;</code> in the <a href="/docs/trl/pr_4949/en/grpo_trainer#trl.GRPOConfig">GRPOConfig</a>.',Xl,xa,dc='Alternatively, in the <a href="https://huggingface.co/papers/2511.20347" rel="nofollow">SAPO paper</a>, the Qwen team proposes replacing the “hard” clipping mechanism of GRPO with a smooth, temperature-controlled soft gating mechanism. While GRPO zeroes out gradients when the policy deviates too far from the reference, SAPO uses a soft trust region that smoothly decays the gradient weight. This allows the model to retain useful learning signals from “near-on-policy” tokens while suppressing noise from extreme deviations.',Ol,Ja,xr,Wl,zg=`<span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><semantics><mrow><msub><mi mathvariant="script">L</mi><mtext>SAPO</mtext></msub><mo stretchy="false">(</mo><mi>θ</mi><mo stretchy="false">)</mo><mo>=</mo><mo>−</mo><mfrac><mn>1</mn><mi>G</mi></mfrac><munderover><mo>∑</mo><mrow><mi>i</mi><mo>=</mo><mn>1</mn></mrow><mi>G</mi></munderover><mfrac><mn>1</mn><mrow><mi mathvariant="normal">∣</mi><msub><mi>o</mi><mi>i</mi></msub><mi mathvariant="normal">∣</mi></mrow></mfrac><munderover><mo>∑</mo><mrow><mi>t</mi><mo>=</mo><mn>1</mn></mrow><mrow><mi mathvariant="normal">∣</mi><msub><mi>o</mi><mi>i</mi></msub><mi mathvariant="normal">∣</mi></mrow></munderover><msub><mi>f</mi><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub><mrow><mo fence="true">(</mo><mfrac><mrow><msub><mi>π</mi><mi>θ</mi></msub><mo stretchy="false">(</mo><msub><mi>o</mi><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub><mi mathvariant="normal">∣</mi><mi>q</mi><mo separator="true">,</mo><msub><mi>o</mi><mrow><mi>i</mi><mo separator="true">,</mo><mo>&lt;</mo><mi>t</mi></mrow></msub><mo stretchy="false">)</mo></mrow><mrow><msub><mi>π</mi><msub><mi>θ</mi><mrow><mi>o</mi><mi>l</mi><mi>d</mi></mrow></msub></msub><mo stretchy="false">(</mo><msub><mi>o</mi><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub><mi mathvariant="normal">∣</mi><mi>q</mi><mo separator="true">,</mo><msub><mi>o</mi><mrow><mi>i</mi><mo separator="true">,</mo><mo>&lt;</mo><mi>t</mi></mrow></msub><mo stretchy="false">)</mo></mrow></mfrac><mo fence="true">)</mo></mrow><msub><mover accent="true"><mi>A</mi><mo>^</mo></mover><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub></mrow><annotation encoding="application/x-tex">
\\mathcal{L}_{\\text{SAPO}}(\\theta) = - \\frac{1}{G} \\sum_{i=1}^G \\frac{1}{|o_i|} \\sum_{t=1}^{|o_i|} f_{i,t} \\left( \\frac{\\pi_\\theta(o_{i,t} | q, o_{i,&lt;t})}{\\pi_{\\theta_{old}}(o_{i,t} | q, o_{i,&lt;t})} \\right) \\hat{A}_{i,t}
</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord"><span class="mord mathcal">L</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3283em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord text mtight"><span class="mord mtight">SAPO</span></span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord mathnormal" style="margin-right:0.02778em;">θ</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:3.2387em;vertical-align:-1.2777em;"></span><span class="mord">−</span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.3214em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord mathnormal">G</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">1</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.686em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mop op-limits"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.8283em;"><span style="top:-1.8723em;margin-left:0em;"><span class="pstrut" style="height:3.05em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mrel mtight">=</span><span class="mord mtight">1</span></span></span></span><span style="top:-3.05em;"><span class="pstrut" style="height:3.05em;"></span><span><span class="mop op-symbol large-op">∑</span></span></span><span style="top:-4.3em;margin-left:0em;"><span class="pstrut" style="height:3.05em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">G</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:1.2777em;"><span></span></span></span></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.3214em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">∣</span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">i</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mord">∣</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">1</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.936em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mop op-limits"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.961em;"><span style="top:-1.8829em;margin-left:0em;"><span class="pstrut" style="height:3.05em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">t</span><span class="mrel mtight">=</span><span class="mord mtight">1</span></span></span></span><span style="top:-3.05em;"><span class="pstrut" style="height:3.05em;"></span><span><span class="mop op-symbol large-op">∑</span></span></span><span style="top:-4.386em;margin-left:0em;"><span class="pstrut" style="height:3.05em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mtight">∣</span><span class="mord mtight"><span class="mord mathnormal mtight">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3281em;"><span style="top:-2.357em;margin-left:0em;margin-right:0.0714em;"><span class="pstrut" style="height:2.5em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mathnormal mtight">i</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.143em;"><span></span></span></span></span></span></span><span class="mord mtight">∣</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:1.2671em;"><span></span></span></span></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.10764em;">f</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:-0.1076em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="minner"><span class="mopen delimcenter" style="top:0em;"><span class="delimsizing size3">(</span></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.427em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:-0.0359em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mtight"><span class="mord mathnormal mtight" style="margin-right:0.02778em;">θ</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3448em;"><span style="top:-2.3488em;margin-left:-0.0278em;margin-right:0.0714em;"><span class="pstrut" style="height:2.5em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">o</span><span class="mord mathnormal mtight" style="margin-right:0.01968em;">l</span><span class="mord mathnormal mtight">d</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.1512em;"><span></span></span></span></span></span></span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2559em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mord">∣</span><span class="mord mathnormal" style="margin-right:0.03588em;">q</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mrel mtight">&lt;</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mclose">)</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:-0.0359em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight" style="margin-right:0.02778em;">θ</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mord">∣</span><span class="mord mathnormal" style="margin-right:0.03588em;">q</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mrel mtight">&lt;</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mclose">)</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.9721em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mclose delimcenter" style="top:0em;"><span class="delimsizing size3">)</span></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord accent"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.9468em;"><span style="top:-3em;"><span class="pstrut" style="height:3em;"></span><span class="mord mathnormal">A</span></span><span style="top:-3.2523em;"><span class="pstrut" style="height:3em;"></span><span class="accent-body" style="left:-0.1111em;"><span class="mord">^</span></span></span></span></span></span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span></span></span></span></span>`,Vl,H,Jr,Fl,kg='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msub><mi>f</mi><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub></mrow><annotation encoding="application/x-tex"> f_{i,t} </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.9805em;vertical-align:-0.2861em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.10764em;">f</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:-0.1076em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span></span></span></span>',Yl,Dl,Ig='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>σ</mi></mrow><annotation encoding="application/x-tex"> \\sigma </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.4306em;"></span><span class="mord mathnormal" style="margin-right:0.03588em;">σ</span></span></span></span>',Kl,sp,Gg=`<span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><semantics><mrow><msub><mi>f</mi><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub><mo stretchy="false">(</mo><mi>x</mi><mo stretchy="false">)</mo><mo>=</mo><mi>σ</mi><mrow><mo fence="true">(</mo><msub><mi>τ</mi><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub><mo stretchy="false">(</mo><mi>x</mi><mo>−</mo><mn>1</mn><mo stretchy="false">)</mo><mo fence="true">)</mo></mrow><mo>⋅</mo><mfrac><mn>4</mn><msub><mi>τ</mi><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub></mfrac></mrow><annotation encoding="application/x-tex">
f_{i,t}(x) = \\sigma \\left( \\tau_{i,t} (x - 1) \\right) \\cdot \\frac{4}{\\tau_{i,t}}
</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1.0361em;vertical-align:-0.2861em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.10764em;">f</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:-0.1076em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord mathnormal">x</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:1.0361em;vertical-align:-0.2861em;"></span><span class="mord mathnormal" style="margin-right:0.03588em;">σ</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="minner"><span class="mopen delimcenter" style="top:0em;">(</span><span class="mord"><span class="mord mathnormal" style="margin-right:0.1132em;">τ</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:-0.1132em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord mathnormal">x</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">−</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mord">1</span><span class="mclose">)</span><span class="mclose delimcenter" style="top:0em;">)</span></span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">⋅</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" style="height:2.2935em;vertical-align:-0.9721em;"></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.3214em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord"><span class="mord mathnormal" style="margin-right:0.1132em;">τ</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:-0.1132em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">4</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.9721em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span></span></span></span></span>`,ap,X,Ur,tp,Rg='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msub><mi>τ</mi><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub></mrow><annotation encoding="application/x-tex"> \\tau_{i,t} </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.7167em;vertical-align:-0.2861em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.1132em;">τ</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:-0.1132em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span></span></span></span>',ep,np,$g='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msub><mover accent="true"><mi>A</mi><mo>^</mo></mover><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub></mrow><annotation encoding="application/x-tex"> \\hat{A}_{i,t} </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1.2329em;vertical-align:-0.2861em;"></span><span class="mord"><span class="mord accent"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.9468em;"><span style="top:-3em;"><span class="pstrut" style="height:3em;"></span><span class="mord mathnormal">A</span></span><span style="top:-3.2523em;"><span class="pstrut" style="height:3em;"></span><span class="accent-body" style="left:-0.1111em;"><span class="mord">^</span></span></span></span></span></span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span></span></span></span>',lp,pp,Lg=`<span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><semantics><mrow><msub><mi>τ</mi><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub><mo>=</mo><mrow><mo fence="true">{</mo><mtable rowspacing="0.36em" columnalign="left left" columnspacing="1em"><mtr><mtd><mstyle scriptlevel="0" displaystyle="false"><mrow><msub><mi>τ</mi><mtext>pos</mtext></msub><mo separator="true">,</mo></mrow></mstyle></mtd><mtd><mstyle scriptlevel="0" displaystyle="false"><mrow><mtext>if </mtext><msub><mover accent="true"><mi>A</mi><mo>^</mo></mover><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub><mo>&gt;</mo><mn>0</mn></mrow></mstyle></mtd></mtr><mtr><mtd><mstyle scriptlevel="0" displaystyle="false"><mrow><msub><mi>τ</mi><mtext>neg</mtext></msub><mo separator="true">,</mo></mrow></mstyle></mtd><mtd><mstyle scriptlevel="0" displaystyle="false"><mtext>otherwise</mtext></mstyle></mtd></mtr></mtable></mrow></mrow><annotation encoding="application/x-tex">
\\tau_{i,t} = \\begin{cases}
\\tau_{\\text{pos}}, &amp; \\text{if } \\hat{A}_{i,t} &gt; 0 \\\\
\\tau_{\\text{neg}}, &amp; \\text{otherwise}
\\end{cases}
</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.7167em;vertical-align:-0.2861em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.1132em;">τ</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:-0.1132em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:3em;vertical-align:-1.25em;"></span><span class="minner"><span class="mopen delimcenter" style="top:0em;"><span class="delimsizing size4">{</span></span><span class="mord"><span class="mtable"><span class="col-align-l"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.69em;"><span style="top:-3.69em;"><span class="pstrut" style="height:3.008em;"></span><span class="mord"><span class="mord"><span class="mord mathnormal" style="margin-right:0.1132em;">τ</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.1514em;"><span style="top:-2.55em;margin-left:-0.1132em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord text mtight"><span class="mord mtight">pos</span></span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mpunct">,</span></span></span><span style="top:-2.25em;"><span class="pstrut" style="height:3.008em;"></span><span class="mord"><span class="mord"><span class="mord mathnormal" style="margin-right:0.1132em;">τ</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.1514em;"><span style="top:-2.55em;margin-left:-0.1132em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord text mtight"><span class="mord mtight">neg</span></span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mpunct">,</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:1.19em;"><span></span></span></span></span></span><span class="arraycolsep" style="width:1em;"></span><span class="col-align-l"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.69em;"><span style="top:-3.69em;"><span class="pstrut" style="height:3.008em;"></span><span class="mord"><span class="mord text"><span class="mord">if </span></span><span class="mord"><span class="mord accent"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.9468em;"><span style="top:-3em;"><span class="pstrut" style="height:3em;"></span><span class="mord mathnormal">A</span></span><span style="top:-3.2523em;"><span class="pstrut" style="height:3em;"></span><span class="accent-body" style="left:-0.1111em;"><span class="mord">^</span></span></span></span></span></span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">&gt;</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mord">0</span></span></span><span style="top:-2.25em;"><span class="pstrut" style="height:3.008em;"></span><span class="mord"><span class="mord text"><span class="mord">otherwise</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:1.19em;"><span></span></span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span></span></span></span></span>`,ip,Y,jr,mp,Ng='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msub><mi>τ</mi><mtext>neg</mtext></msub><mo>&gt;</mo><msub><mi>τ</mi><mtext>pos</mtext></msub></mrow><annotation encoding="application/x-tex"> \\tau_{\\text{neg}} &gt; \\tau_{\\text{pos}} </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.8252em;vertical-align:-0.2861em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.1132em;">τ</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.1514em;"><span style="top:-2.55em;margin-left:-0.1132em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord text mtight"><span class="mord mtight">neg</span></span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">&gt;</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:0.7167em;vertical-align:-0.2861em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.1132em;">τ</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.1514em;"><span style="top:-2.55em;margin-left:-0.1132em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord text mtight"><span class="mord mtight">pos</span></span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span></span></span></span>',rp,op,Bg='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msub><mi>τ</mi><mtext>pos</mtext></msub><mo>=</mo><mn>1.0</mn><mo separator="true">,</mo><msub><mi>τ</mi><mtext>neg</mtext></msub><mo>=</mo><mn>1.05</mn></mrow><annotation encoding="application/x-tex"> \\tau_{\\text{pos}}=1.0, \\tau_{\\text{neg}}=1.05 </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.7167em;vertical-align:-0.2861em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.1132em;">τ</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.1514em;"><span style="top:-2.55em;margin-left:-0.1132em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord text mtight"><span class="mord mtight">pos</span></span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:0.9305em;vertical-align:-0.2861em;"></span><span class="mord">1.0</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.1132em;">τ</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.1514em;"><span style="top:-2.55em;margin-left:-0.1132em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord text mtight"><span class="mord mtight">neg</span></span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:0.6444em;"></span><span class="mord">1.05</span></span></span></span>',cp,hp,Ua,uc='To use this formulation, set <code>loss_type=&quot;sapo&quot;</code> in the <a href="/docs/trl/pr_4949/en/grpo_trainer#trl.GRPOConfig">GRPOConfig</a>.',gp,ja,dp,Ca,yc="While training and evaluating, we record the following reward metrics:",up,w,Le,fc="<code>num_tokens</code>: The total number of tokens processed so far, including both prompts and completions. When using tools, only non-tool tokens are counted.",Cr,Ne,vc="<code>step_time</code>: The average time (in seconds) taken per training step (including generation).",zr,Be,wc="<code>completions/mean_length</code>: The average length of generated completions. When using tools, only non-tool tokens are counted.",kr,qe,bc="<code>completions/min_length</code>: The minimum length of generated completions. When using tools, only non-tool tokens are counted.",Ir,Ae,Mc="<code>completions/max_length</code>: The maximum length of generated completions. When using tools, only non-tool tokens are counted.",Gr,Pe,_c="<code>completions/mean_terminated_length</code>: The average length of generated completions that terminate with EOS. When using tools, only non-tool tokens are counted.",Rr,Ee,Tc="<code>completions/min_terminated_length</code>: The minimum length of generated completions that terminate with EOS. When using tools, only non-tool tokens are counted.",$r,Ze,xc="<code>completions/max_terminated_length</code>: The maximum length of generated completions that terminate with EOS. When using tools, only non-tool tokens are counted.",Lr,Se,Jc="<code>completions/clipped_ratio</code>: The ratio of truncated (clipped) completions.",Nr,Qe,Uc="<code>reward/{reward_func_name}/mean</code>: The average reward from a specific reward function.",Br,He,jc="<code>reward/{reward_func_name}/std</code>: The standard deviation of the reward from a specific reward function.",qr,Xe,Cc="<code>reward</code>: The overall average reward after summing rewards across functions (unweighted).",Ar,Oe,zc="<code>reward_std</code>: The standard deviation of summed rewards across functions (unweighted), computed over the full batch.",Pr,We,kc="<code>frac_reward_zero_std</code>: The fraction of samples in the generation batch with a reward std of zero, implying there is little diversity for that prompt (all answers are correct or incorrect).",Er,Ve,Ic="<code>entropy</code>: Average entropy of token predictions across generated completions. (If <code>mask_truncated_completions=True</code>, masked sequences tokens are excluded.)",Zr,Fe,Gc="<code>kl</code>: The average KL divergence between the model and the reference model, calculated over generated completions. Logged only if <code>beta</code> is nonzero.",Sr,Z,Ye,Rc="clip_ratio/region_mean",Qr,De,$c='importance_sampling_level="sequence"',Hr,yp,qg='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mtext>clip</mtext><mrow><mo fence="true">(</mo><msub><mi>r</mi><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub><mo stretchy="false">(</mo><mi>θ</mi><mo stretchy="false">)</mo><mo separator="true">,</mo><mn>1</mn><mo>−</mo><msub><mi>ϵ</mi><mrow><mi mathvariant="normal">l</mi><mi mathvariant="normal">o</mi><mi mathvariant="normal">w</mi></mrow></msub><mo separator="true">,</mo><mn>1</mn><mo>+</mo><msub><mi>ϵ</mi><mrow><mi mathvariant="normal">h</mi><mi mathvariant="normal">i</mi><mi mathvariant="normal">g</mi><mi mathvariant="normal">h</mi></mrow></msub><mo fence="true">)</mo></mrow><mtext> </mtext><mo separator="true">,</mo><mspace width="1em"/><msub><mi>r</mi><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub><mo stretchy="false">(</mo><mi>θ</mi><mo stretchy="false">)</mo><mo>=</mo><mfrac><mrow><msub><mi>π</mi><mi>θ</mi></msub><mo stretchy="false">(</mo><msub><mi>o</mi><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub><mo>∣</mo><mi>q</mi><mo separator="true">,</mo><msub><mi>o</mi><mrow><mi>i</mi><mo separator="true">,</mo><mo>&lt;</mo><mi>t</mi></mrow></msub><mo stretchy="false">)</mo></mrow><mrow><msub><mi>π</mi><msub><mi>θ</mi><mtext>old</mtext></msub></msub><mo stretchy="false">(</mo><msub><mi>o</mi><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub><mo>∣</mo><mi>q</mi><mo separator="true">,</mo><msub><mi>o</mi><mrow><mi>i</mi><mo separator="true">,</mo><mo>&lt;</mo><mi>t</mi></mrow></msub><mo stretchy="false">)</mo></mrow></mfrac></mrow><annotation encoding="application/x-tex"> \\text{clip}\\left( r_{i,t}(\\theta), 1 - \\epsilon_\\mathrm{low}, 1 + \\epsilon_\\mathrm{high} \\right)\\,, \\quad r_{i,t}(\\theta) = \\frac{\\pi_\\theta(o_{i,t} \\mid q, o_{i,&lt; t})}{\\pi_{\\theta_{\\text{old}}}(o_{i,t} \\mid q, o_{i,&lt; t})} </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1.0361em;vertical-align:-0.2861em;"></span><span class="mord text"><span class="mord">clip</span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="minner"><span class="mopen delimcenter" style="top:0em;">(</span><span class="mord"><span class="mord mathnormal" style="margin-right:0.02778em;">r</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:-0.0278em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord mathnormal" style="margin-right:0.02778em;">θ</span><span class="mclose">)</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord">1</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">−</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mord"><span class="mord mathnormal">ϵ</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathrm mtight" style="margin-right:0.01389em;">low</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord">1</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">+</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mord"><span class="mord mathnormal">ϵ</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathrm mtight">high</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mclose delimcenter" style="top:0em;">)</span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mpunct">,</span><span class="mspace" style="margin-right:1em;"></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.02778em;">r</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:-0.0278em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord mathnormal" style="margin-right:0.02778em;">θ</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:1.658em;vertical-align:-0.6257em;"></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.0323em;"><span style="top:-2.655em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mtight"><span class="mord mathnormal mtight" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3448em;"><span style="top:-2.3488em;margin-left:-0.0359em;margin-right:0.0714em;"><span class="pstrut" style="height:2.5em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mtight"><span class="mord mtight"><span class="mord mathnormal mtight" style="margin-right:0.02778em;">θ</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3448em;"><span style="top:-2.3448em;margin-left:-0.0278em;margin-right:0.1em;"><span class="pstrut" style="height:2.6944em;"></span><span class="mord mtight"><span class="mord text mtight"><span class="mord mtight">old</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.3496em;"><span></span></span></span></span></span></span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.401em;"><span></span></span></span></span></span></span><span class="mopen mtight">(</span><span class="mord mtight"><span class="mord mathnormal mtight">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3281em;"><span style="top:-2.357em;margin-left:0em;margin-right:0.0714em;"><span class="pstrut" style="height:2.5em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2819em;"><span></span></span></span></span></span></span><span class="mrel mtight">∣</span><span class="mord mathnormal mtight" style="margin-right:0.03588em;">q</span><span class="mpunct mtight">,</span><span class="mord mtight"><span class="mord mathnormal mtight">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3281em;"><span style="top:-2.357em;margin-left:0em;margin-right:0.0714em;"><span class="pstrut" style="height:2.5em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mrel mtight">&lt;</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2819em;"><span></span></span></span></span></span></span><span class="mclose mtight">)</span></span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.5073em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mtight"><span class="mord mathnormal mtight" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3448em;"><span style="top:-2.3488em;margin-left:-0.0359em;margin-right:0.0714em;"><span class="pstrut" style="height:2.5em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mathnormal mtight" style="margin-right:0.02778em;">θ</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.1512em;"><span></span></span></span></span></span></span><span class="mopen mtight">(</span><span class="mord mtight"><span class="mord mathnormal mtight">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3281em;"><span style="top:-2.357em;margin-left:0em;margin-right:0.0714em;"><span class="pstrut" style="height:2.5em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2819em;"><span></span></span></span></span></span></span><span class="mrel mtight">∣</span><span class="mord mathnormal mtight" style="margin-right:0.03588em;">q</span><span class="mpunct mtight">,</span><span class="mord mtight"><span class="mord mathnormal mtight">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3281em;"><span style="top:-2.357em;margin-left:0em;margin-right:0.0714em;"><span class="pstrut" style="height:2.5em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mrel mtight">&lt;</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2819em;"><span></span></span></span></span></span></span><span class="mclose mtight">)</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.6257em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span></span></span></span>',fp,vp,Ag='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msub><mi>π</mi><mi>θ</mi></msub></mrow><annotation encoding="application/x-tex">\\pi_\\theta</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.5806em;vertical-align:-0.15em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:-0.0359em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight" style="margin-right:0.02778em;">θ</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span>',wp,Xr,D,Ke,Lc="clip_ratio/low_mean",Or,sn,Nc='importance_sampling_level="sequence"',Wr,bp,Pg='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msub><mi>r</mi><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub><mo stretchy="false">(</mo><mi>θ</mi><mo stretchy="false">)</mo><mo>&lt;</mo><mn>1</mn><mo>−</mo><msub><mi>ϵ</mi><mrow><mi mathvariant="normal">l</mi><mi mathvariant="normal">o</mi><mi mathvariant="normal">w</mi></mrow></msub></mrow><annotation encoding="application/x-tex">r_{i,t}(\\theta) &lt; 1 - \\epsilon_\\mathrm{low}</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1.0361em;vertical-align:-0.2861em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.02778em;">r</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:-0.0278em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord mathnormal" style="margin-right:0.02778em;">θ</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">&lt;</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:0.7278em;vertical-align:-0.0833em;"></span><span class="mord">1</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">−</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" style="height:0.5806em;vertical-align:-0.15em;"></span><span class="mord"><span class="mord mathnormal">ϵ</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathrm mtight" style="margin-right:0.01389em;">low</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span>',Mp,Vr,K,an,Bc="clip_ratio/low_min",Fr,tn,qc='importance_sampling_level="sequence"',Yr,_p,Eg='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msub><mi>r</mi><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub><mo stretchy="false">(</mo><mi>θ</mi><mo stretchy="false">)</mo><mo>&lt;</mo><mn>1</mn><mo>−</mo><msub><mi>ϵ</mi><mrow><mi mathvariant="normal">l</mi><mi mathvariant="normal">o</mi><mi mathvariant="normal">w</mi></mrow></msub></mrow><annotation encoding="application/x-tex">r_{i,t}(\\theta) &lt; 1 - \\epsilon_\\mathrm{low}</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1.0361em;vertical-align:-0.2861em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.02778em;">r</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:-0.0278em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord mathnormal" style="margin-right:0.02778em;">θ</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">&lt;</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:0.7278em;vertical-align:-0.0833em;"></span><span class="mord">1</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">−</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" style="height:0.5806em;vertical-align:-0.15em;"></span><span class="mord"><span class="mord mathnormal">ϵ</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathrm mtight" style="margin-right:0.01389em;">low</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span>',Tp,Dr,ss,en,Ac="clip_ratio/high_mean",Kr,nn,Pc='importance_sampling_level="sequence"',so,xp,Zg='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msub><mi>r</mi><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub><mo stretchy="false">(</mo><mi>θ</mi><mo stretchy="false">)</mo><mo>&gt;</mo><mn>1</mn><mo>+</mo><msub><mi>ϵ</mi><mrow><mi mathvariant="normal">h</mi><mi mathvariant="normal">i</mi><mi mathvariant="normal">g</mi><mi mathvariant="normal">h</mi></mrow></msub></mrow><annotation encoding="application/x-tex">r_{i,t}(\\theta) &gt; 1 + \\epsilon_\\mathrm{high}</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1.0361em;vertical-align:-0.2861em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.02778em;">r</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:-0.0278em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord mathnormal" style="margin-right:0.02778em;">θ</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">&gt;</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:0.7278em;vertical-align:-0.0833em;"></span><span class="mord">1</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">+</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" style="height:0.7167em;vertical-align:-0.2861em;"></span><span class="mord"><span class="mord mathnormal">ϵ</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathrm mtight">high</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span></span></span></span>',Jp,ao,as,ln,Ec="clip_ratio/high_max",to,pn,Zc='importance_sampling_level="sequence"',eo,Up,Sg='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msub><mi>r</mi><mrow><mi>i</mi><mo separator="true">,</mo><mi>t</mi></mrow></msub><mo stretchy="false">(</mo><mi>θ</mi><mo stretchy="false">)</mo><mo>&gt;</mo><mn>1</mn><mo>+</mo><msub><mi>ϵ</mi><mrow><mi mathvariant="normal">h</mi><mi mathvariant="normal">i</mi><mi mathvariant="normal">g</mi><mi mathvariant="normal">h</mi></mrow></msub></mrow><annotation encoding="application/x-tex">r_{i,t}(\\theta) &gt; 1 + \\epsilon_\\mathrm{high}</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1.0361em;vertical-align:-0.2861em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.02778em;">r</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:-0.0278em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight">t</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord mathnormal" style="margin-right:0.02778em;">θ</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">&gt;</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:0.7278em;vertical-align:-0.0833em;"></span><span class="mord">1</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">+</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" style="height:0.7167em;vertical-align:-0.2861em;"></span><span class="mord"><span class="mord mathnormal">ϵ</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathrm mtight">high</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span></span></span></span>',jp,Cp,za,zp,ka,kp,Ia,Sc='Generation is often the main bottleneck when training with online methods. To accelerate generation, you can use <a href="https://github.com/vllm-project/vllm" rel="nofollow">vLLM</a>, a high-throughput, low-latency inference engine for LLMs. To enable it, first install the package with',Ip,Ga,Gp,Ra,Qc="We support two ways of using vLLM during training: <strong>server mode</strong> and <strong>colocate mode</strong>.",Rp,Js,Hc='<p>By default, Truncated Importance Sampling is activated for vLLM generation to address the generation-training mismatch that occurs when using different frameworks. This can be turned off by setting <code>vllm_importance_sampling_correction=False</code>. For more information, see <a href="paper_index#truncated-importance-sampling">Truncated Importance Sampling</a></p>',$p,$a,Lp,La,Xc="In this mode, vLLM runs in a separate process (and using separate GPUs) and communicates with the trainer via HTTP. This is ideal if you have dedicated GPUs for inference.",Np,Us,Na,mn,Oc="<strong>Start the vLLM server</strong>:",no,Ba,lo,qa,rn,Wc="<strong>Enable server mode in your training script</strong>:",po,Aa,Bp,js,Vc="<p>Make sure that the server is using different GPUs than the trainer, otherwise you may run into NCCL errors. You can specify the GPUs to use with the <code>CUDA_VISIBLE_DEVICES</code> environment variable.</p>",qp,Pa,Ap,Ea,Fc="In this mode, vLLM runs inside the trainer process and shares GPU memory with the training model. This avoids launching a separate server and can improve GPU utilization, but may lead to memory contention on the training GPUs.",Pp,Za,Ep,Cs,Yc='<p>Depending on the model size and the overall GPU memory requirements for training, you may need to adjust the <code>vllm_gpu_memory_utilization</code> parameter in <a href="/docs/trl/pr_4949/en/grpo_trainer#trl.GRPOConfig">GRPOConfig</a> to avoid underutilization or out-of-memory errors.</p> <p>We provide a <a href="https://huggingface.co/spaces/trl-lib/recommend-vllm-memory" rel="nofollow">HF Space</a> to help estimate the recommended GPU memory utilization based on your model configuration and experiment settings. Simply use it as follows to get <code>vllm_gpu_memory_utilization</code> recommendation:</p> <iframe src="https://trl-lib-recommend-vllm-memory.hf.space" frameborder="0" width="850" height="450"></iframe> <p>If the recommended value does not work in your environment, we suggest adding a small buffer (e.g., +0.05 or +0.1) to the recommended value to ensure stability.</p> <p>If you still find you are getting out-of-memory errors set <code>vllm_enable_sleep_mode</code> to True and the vllm parameters and cache will be offloaded during the optimization step. For more information, see <a href="reducing_memory_usage#vllm-sleep-mode">Reducing Memory Usage with vLLM Sleep Mode</a>.</p>',Zp,zs,Dc="<p>By default, GRPO uses <code>MASTER_ADDR=localhost</code> and <code>MASTER_PORT=12345</code> for vLLM, but you can override these values by setting the environment variables accordingly.</p>",Sp,Sa,Kc='For more information, see <a href="speeding_up_training#vllm-for-fast-generation-in-online-methods">Speeding up training with vLLM</a>.',Qp,Qa,Hp,Ha,sh="While vLLM greatly accelerates inference, it also decouples the inference engine from the training engine. In theory these engines are mathematically identical, in practice however they can produce different outputs due to precision effects and hardware specific optimizations. This divergence reflects the different optimization objectives of the two systems. This divergence reflects the distinct optimization goals of the two systems. Inference engines aim to maximize sampling throughput, typically measured in tokens per second, while maintaining acceptable sampling fidelity. Training frameworks instead focus on numerical stability and precision for gradient computation, often using higher precision formats like FP32 for master weights and optimizer states. These differing priorities and constraints introduce an inevitable, albeit subtle, mismatch between training and inference.",Xp,B,io,ks,ah="[1]",Is,th="[2]",Gs,eh="[3]",Rs,nh="[4]",$s,lh="[5]",mo,Op,Qg=`<span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><semantics><mrow><msub><mi mathvariant="normal">∇</mi><mi>θ</mi></msub><mi mathvariant="script">J</mi><mo stretchy="false">(</mo><mi>x</mi><mo separator="true">,</mo><mi>θ</mi><mo stretchy="false">)</mo><mo>=</mo><msub><mi mathvariant="double-struck">E</mi><mrow><mi>y</mi><mo>∼</mo><msup><mi>π</mi><mtext>train</mtext></msup><mo stretchy="false">(</mo><mo>⋅</mo><mo>∣</mo><mi>x</mi><mo separator="true">,</mo><mi>θ</mi><mo stretchy="false">)</mo></mrow></msub><mrow><mo fence="true">[</mo><msub><mi mathvariant="normal">∇</mi><mi>θ</mi></msub><mi>log</mi><mo>⁡</mo><msup><mi>π</mi><mtext>train</mtext></msup><mo stretchy="false">(</mo><mi>y</mi><mo>∣</mo><mi>x</mi><mo separator="true">,</mo><mi>θ</mi><mo stretchy="false">)</mo><mo>⋅</mo><mi>R</mi><mo stretchy="false">(</mo><mi>x</mi><mo separator="true">,</mo><mi>y</mi><mo stretchy="false">)</mo><mo fence="true">]</mo></mrow></mrow><annotation encoding="application/x-tex">
\\nabla_\\theta \\mathcal{J}(x,\\theta)
= \\mathbb{E}_{y \\sim \\pi^\\text{train}(\\cdot \\mid x,\\theta)}
\\left[ \\nabla_\\theta \\log \\pi^\\text{train}(y \\mid x,\\theta) \\cdot R(x,y) \\right]
</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord"><span class="mord">∇</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight" style="margin-right:0.02778em;">θ</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mord mathcal" style="margin-right:0.18472em;">J</span><span class="mopen">(</span><span class="mord mathnormal">x</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord mathnormal" style="margin-right:0.02778em;">θ</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:1.2448em;vertical-align:-0.3643em;"></span><span class="mord"><span class="mord mathbb">E</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3448em;"><span style="top:-2.5107em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight" style="margin-right:0.03588em;">y</span><span class="mrel mtight">∼</span><span class="mord mtight"><span class="mord mathnormal mtight" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.763em;"><span style="top:-2.786em;margin-right:0.0714em;"><span class="pstrut" style="height:2.5em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord text mtight"><span class="mord mtight">train</span></span></span></span></span></span></span></span></span><span class="mopen mtight">(</span><span class="mord mtight">⋅</span><span class="mrel mtight">∣</span><span class="mord mathnormal mtight">x</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight" style="margin-right:0.02778em;">θ</span><span class="mclose mtight">)</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.3643em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="minner"><span class="mopen delimcenter" style="top:0em;"><span class="delimsizing size1">[</span></span><span class="mord"><span class="mord">∇</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight" style="margin-right:0.02778em;">θ</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mop">lo<span style="margin-right:0.01389em;">g</span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.8805em;"><span style="top:-3.113em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord text mtight"><span class="mord mtight">train</span></span></span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord mathnormal" style="margin-right:0.03588em;">y</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">∣</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mord mathnormal">x</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord mathnormal" style="margin-right:0.02778em;">θ</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">⋅</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mord mathnormal" style="margin-right:0.00773em;">R</span><span class="mopen">(</span><span class="mord mathnormal">x</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord mathnormal" style="margin-right:0.03588em;">y</span><span class="mclose">)</span><span class="mclose delimcenter" style="top:0em;"><span class="delimsizing size1">]</span></span></span></span></span></span></span>`,Wp,q,ro,Vp,Hg='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>x</mi></mrow><annotation encoding="application/x-tex"> x </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.4306em;"></span><span class="mord mathnormal">x</span></span></span></span>',Fp,Yp,Xg='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msup><mi>π</mi><mtext>train</mtext></msup></mrow><annotation encoding="application/x-tex"> \\pi^\\text{train} </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.8305em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.8305em;"><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord text mtight"><span class="mord mtight">train</span></span></span></span></span></span></span></span></span></span></span></span>',Dp,Kp,Og='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msup><mi>π</mi><mtext>inference</mtext></msup></mrow><annotation encoding="application/x-tex"> \\pi^\\text{inference} </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.8491em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.8491em;"><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord text mtight"><span class="mord mtight">inference</span></span></span></span></span></span></span></span></span></span></span></span>',si,ai,Wg=`<span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><semantics><mrow><msub><mi mathvariant="normal">∇</mi><mi>θ</mi></msub><msub><mi mathvariant="script">J</mi><mtext>biased</mtext></msub><mo stretchy="false">(</mo><mi>x</mi><mo separator="true">,</mo><mi>θ</mi><mo stretchy="false">)</mo><mo>=</mo><msub><mi mathvariant="double-struck">E</mi><mrow><mi>y</mi><mo>∼</mo><msup><mi>π</mi><mtext>inference</mtext></msup><mo stretchy="false">(</mo><mo>⋅</mo><mo>∣</mo><mi>x</mi><mo separator="true">,</mo><mi>θ</mi><mo stretchy="false">)</mo></mrow></msub><mrow><mo fence="true">[</mo><msub><mi mathvariant="normal">∇</mi><mi>θ</mi></msub><mi>log</mi><mo>⁡</mo><msup><mi>π</mi><mtext>train</mtext></msup><mo stretchy="false">(</mo><mi>y</mi><mo>∣</mo><mi>x</mi><mo separator="true">,</mo><mi>θ</mi><mo stretchy="false">)</mo><mo>⋅</mo><mi>R</mi><mo stretchy="false">(</mo><mi>x</mi><mo separator="true">,</mo><mi>y</mi><mo stretchy="false">)</mo><mo fence="true">]</mo></mrow><mi mathvariant="normal">.</mi></mrow><annotation encoding="application/x-tex">
\\nabla_\\theta \\mathcal{J}_{\\text{biased}}(x,\\theta)
= \\mathbb{E}_{y \\sim \\pi^\\text{inference}(\\cdot \\mid x,\\theta)}
\\left[ \\nabla_\\theta \\log \\pi^\\text{train}(y \\mid x,\\theta) \\cdot R(x,y) \\right].
</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord"><span class="mord">∇</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight" style="margin-right:0.02778em;">θ</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mord"><span class="mord mathcal" style="margin-right:0.18472em;">J</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:-0.1847em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord text mtight"><span class="mord mtight">biased</span></span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord mathnormal">x</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord mathnormal" style="margin-right:0.02778em;">θ</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:1.2581em;vertical-align:-0.3776em;"></span><span class="mord"><span class="mord mathbb">E</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3448em;"><span style="top:-2.4974em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight" style="margin-right:0.03588em;">y</span><span class="mrel mtight">∼</span><span class="mord mtight"><span class="mord mathnormal mtight" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.782em;"><span style="top:-2.786em;margin-right:0.0714em;"><span class="pstrut" style="height:2.5em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord text mtight"><span class="mord mtight">inference</span></span></span></span></span></span></span></span></span><span class="mopen mtight">(</span><span class="mord mtight">⋅</span><span class="mrel mtight">∣</span><span class="mord mathnormal mtight">x</span><span class="mpunct mtight">,</span><span class="mord mathnormal mtight" style="margin-right:0.02778em;">θ</span><span class="mclose mtight">)</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.3776em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="minner"><span class="mopen delimcenter" style="top:0em;"><span class="delimsizing size1">[</span></span><span class="mord"><span class="mord">∇</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight" style="margin-right:0.02778em;">θ</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mop">lo<span style="margin-right:0.01389em;">g</span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.8805em;"><span style="top:-3.113em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord text mtight"><span class="mord mtight">train</span></span></span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord mathnormal" style="margin-right:0.03588em;">y</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">∣</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mord mathnormal">x</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord mathnormal" style="margin-right:0.02778em;">θ</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">⋅</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mord mathnormal" style="margin-right:0.00773em;">R</span><span class="mopen">(</span><span class="mord mathnormal">x</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord mathnormal" style="margin-right:0.03588em;">y</span><span class="mclose">)</span><span class="mclose delimcenter" style="top:0em;"><span class="delimsizing size1">]</span></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord">.</span></span></span></span></span>`,ti,Xa,ph="This turns an otherwise on policy RL problem into an off policy one.",ei,U,oo,on,ih="importance sampling (IS)",co,Oa,mh="Truncated Importance Sampling (TIS)",ho,Wa,rh="Masked Importance Sampling (MIS)",go,ni,Vg='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>ρ</mi></mrow><annotation encoding="application/x-tex"> \\rho </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.625em;vertical-align:-0.1944em;"></span><span class="mord mathnormal">ρ</span></span></span></span>',li,pi,Fg='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msub><mi>ρ</mi><mi>t</mi></msub></mrow><annotation encoding="application/x-tex"> \\rho_t </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.625em;vertical-align:-0.1944em;"></span><span class="mord"><span class="mord mathnormal">ρ</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.2806em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">t</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span>',ii,mi,Yg='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msub><mi>ρ</mi><mtext>seq</mtext></msub></mrow><annotation encoding="application/x-tex"> \\rho_{\\text{seq}} </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.7167em;vertical-align:-0.2861em;"></span><span class="mord"><span class="mord mathnormal">ρ</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.1514em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord text mtight"><span class="mord mtight">seq</span></span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span></span></span></span>',ri,cn,oh="vllm_importance_sampling_cap",uo,oi,Dg=`<span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><semantics><mrow><mi>ρ</mi><mo>←</mo><mi>min</mi><mo>⁡</mo><mo stretchy="false">(</mo><mi>ρ</mi><mo separator="true">,</mo><mi>C</mi><mo stretchy="false">)</mo><mi mathvariant="normal">.</mi></mrow><annotation encoding="application/x-tex">
\\rho \\leftarrow \\min(\\rho, C).
</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.625em;vertical-align:-0.1944em;"></span><span class="mord mathnormal">ρ</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">←</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mop">min</span><span class="mopen">(</span><span class="mord mathnormal">ρ</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord mathnormal" style="margin-right:0.07153em;">C</span><span class="mclose">)</span><span class="mord">.</span></span></span></span></span>`,ci,Va,ch="Under MIS, ratios larger than <code>vllm_importance_sampling_cap</code> are set to zero, so those samples do not contribute to the gradient. In other words, large ratio samples are downweighted under TIS and discarded under MIS. The configuration flag <code>vllm_importance_sampling_mode</code> chooses both the IS variant (masking or truncation) and the granularity (token level or sequence level).",hi,Fa,hh='Importance sampling is the principled algorithmic response to the training–inference mismatch. However, there are also more direct approaches that attempt to reduce the mismatch between the two engines themselves. Most of these are engineering solutions. For example, <a href="https://huggingface.co/papers/2506.13585" rel="nofollow">MiniMax M1 uses an FP32 language model head</a> in the inference engine. Thinking Machines has explored <a href="https://thinkingmachines.ai/blog/defeating-nondeterminism-in-llm-inference/" rel="nofollow">deterministic inference kernels</a>, although this comes with a significant efficiency cost. vLLM has shown <a href="https://blog.vllm.ai/2025/11/10/bitwise-consistent-train-inference.html" rel="nofollow">bitwise consistent policies</a> by building on the batch invariant deterministic kernels from Thinking Machines, but as of November 2025 there remains a substantial throughput penalty relative to standard vLLM inference.',gi,Ya,di,Da,gh="When training large models like <strong>Qwen2.5-72B</strong>, you need several key optimizations to make the training efficient and scalable across multiple GPUs and nodes. These include:",ui,Ka,dh='<li><strong>DeepSpeed ZeRO Stage 3</strong>: ZeRO leverages data parallelism to distribute model states (weights, gradients, optimizer states) across multiple GPUs and CPUs, reducing memory and compute requirements on each device. Since large models cannot fit on a single GPU, using ZeRO Stage 3 is required for training such models. For more details, see <a href="deepspeed_integration">DeepSpeed Integration</a>.</li> <li><strong>Accelerate</strong>: Accelerate is a library that simplifies distributed training across multiple GPUs and nodes. It provides a simple API to launch distributed training and handles the complexities of distributed training, such as data parallelism, gradient accumulation, and distributed data loading. For more details, see <a href="distributing_training">Distributing Training</a>.</li> <li><strong>vLLM</strong>: See the previous section on how to use vLLM to speed up generation.</li>',yi,st,uh="Below is an example SLURM script to train a 70B model with GRPO on multiple nodes. This script trains a model on 4 nodes and uses the 5th node for vLLM-powered generation.",fi,at,vi,tt,wi,et,bi,nt,yh='The <a href="/docs/trl/pr_4949/en/grpo_trainer#trl.GRPOTrainer">GRPOTrainer</a> supports using custom reward functions instead of dense reward models. To ensure compatibility, your reward function must satisfy the following requirements:',Mi,lt,fh="Reward functions can be either synchronous Python callables or asynchronous <code>async def</code> coroutines. When you provide multiple asynchronous reward functions, they are awaited concurrently (run in parallel via <code>asyncio.gather</code>) so their latency overlaps.",_i,pt,vh='<li><p><strong>Input arguments</strong>:</p> <ul><li><p>The function must accept the following as keyword arguments:</p> <ul><li><code>prompts</code> (contains the prompts),</li> <li><code>completions</code> (contains the generated completions),</li> <li><code>completion_ids</code> (contains the tokenized completions),</li> <li><code>trainer_state</code> (<a href="https://huggingface.co/docs/transformers/main/en/main_classes/callback#transformers.TrainerState" rel="nofollow">TrainerState</a>): The current state of the trainer. This can be used to implement dynamic reward functions, such as curriculum learning, where the reward is adjusted based on the training progress.</li> <li>All column names (but <code>prompt</code>) that the dataset may have. For example, if the dataset contains a column named <code>ground_truth</code>, the function will be called with <code>ground_truth</code> as a keyword argument.</li></ul> <p>The easiest way to comply with this requirement is to use <code>**kwargs</code> in the function signature.</p></li> <li><p>Depending on the dataset format, the input will vary:</p> <ul><li>For <a href="dataset_formats#standard">standard format</a>, <code>prompts</code> and <code>completions</code> will be lists of strings.</li> <li>For <a href="dataset_formats#conversational">conversational format</a>, <code>prompts</code> and <code>completions</code> will be lists of message dictionaries.</li></ul></li></ul></li> <li><p><strong>Return value</strong>: The function must return a list of floats. Each float represents the reward corresponding to a single completion.</p></li>',Ti,it,xi,mt,wh="Below is an example of a reward function for a standard format that rewards longer completions:",Ji,rt,Ui,ot,bh="You can test it as follows:",ji,ct,Ci,ht,zi,gt,Mh="Same as the previous example, but this time the reward function is based on the number of characters instead of tokens.",ki,dt,Ii,ut,_h="You can test it as follows:",Gi,yt,Ri,ft,$i,vt,Th=`Below is an example of a reward function that checks if the completion has a specific format. This example is inspired by the <em>format reward</em> function used in the paper <a href="https://huggingface.co/papers/2501.12948" rel="nofollow">DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning</a>.
It is designed for a conversational format, where prompts and completions consist of structured messages.`,Li,wt,Ni,bt,xh="You can test this function as follows:",Bi,Mt,qi,_t,Ai,Tt,Jh=`Below is an example of a reward function that checks if the completion is correct. This example is inspired by the <em>accuracy reward</em> function used in the paper <a href="https://huggingface.co/papers/2501.12948" rel="nofollow">DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning</a>.
This example is designed for <a href="dataset_formats#standard">standard format</a>, where the dataset contains a column named <code>ground_truth</code>.`,Pi,xt,Ei,Jt,Uh="You can test this function as follows:",Zi,Ut,Si,jt,Qi,Ct,jh='Below is an example of using multiple reward functions in the <a href="/docs/trl/pr_4949/en/grpo_trainer#trl.GRPOTrainer">GRPOTrainer</a>. In this example, we define two task-specific reward functions: <code>math_reward_func</code> and <code>coding_reward_func</code>. The <code>math_reward_func</code> rewards math problems based on their correctness, while the <code>coding_reward_func</code> rewards coding problems based on whether the solution works.',Hi,zt,Xi,kt,Ch='In this example, the <code>math_reward_func</code> and <code>coding_reward_func</code> are designed to work with a mixed dataset that contains both math and coding problems. The <code>task</code> column in the dataset is used to determine which reward function to apply to each problem. If there is no relevant reward function for a sample in the dataset, the reward function will return <code>None</code>, and the <a href="/docs/trl/pr_4949/en/grpo_trainer#trl.GRPOTrainer">GRPOTrainer</a> will continue with the valid functions and tasks. This allows the <a href="/docs/trl/pr_4949/en/grpo_trainer#trl.GRPOTrainer">GRPOTrainer</a> to handle multiple reward functions with different applicability.',Oi,It,zh='Note that the <a href="/docs/trl/pr_4949/en/grpo_trainer#trl.GRPOTrainer">GRPOTrainer</a> will ignore the <code>None</code> rewards returned by the reward functions and only consider the rewards returned by the relevant functions. This ensures that the model is trained on the relevant tasks and ignores the tasks for which there is no relevant reward function.',Wi,Gt,Vi,Rt,kh='Custom reward functions can also be defined as <code>async def</code> coroutines. This is useful if your reward depends on slow I/O (for example, calling a remote service). When you pass multiple async reward functions, <a href="/docs/trl/pr_4949/en/grpo_trainer#trl.GRPOTrainer">GRPOTrainer</a> executes them concurrently so their latency overlaps.',Fi,$t,Ih="Below is a minimal example of an async reward function that simulates an I/O-bound operation:",Yi,Lt,Di,Nt,Ki,Bt,Gh='To use your custom reward function, pass it to the <a href="/docs/trl/pr_4949/en/grpo_trainer#trl.GRPOTrainer">GRPOTrainer</a> as follows:',sm,qt,am,At,Rh="You can pass several reward functions as a list; this list may include both synchronous and asynchronous functions:",tm,Pt,em,Et,$h="and the reward will be computed as the sum of the rewards from each function, or the weighted sum if <code>reward_weights</code> is provided in the config.",nm,Zt,Lh='Note that <a href="/docs/trl/pr_4949/en/grpo_trainer#trl.GRPOTrainer">GRPOTrainer</a> supports multiple reward functions of different types. See the parameters documentation for more details.',lm,St,pm,Qt,Nh='RapidFire AI is an open-source experimentation engine that sits on top of TRL and lets you launch multiple GRPO configurations at once, even on a single GPU. Instead of trying configurations sequentially, RapidFire lets you <strong>see all their learning curves earlier, stop underperforming runs, and clone promising ones with new settings in flight</strong> without restarting. For more information, see <a href="rapidfire_integration">RapidFire AI Integration</a>.',im,Ht,mm,Xt,Bh=`GRPO supports <strong>agent training</strong> through the <code>tools</code> argument in <a href="/docs/trl/pr_4949/en/grpo_trainer#trl.GRPOTrainer">GRPOTrainer</a>.
This parameter expects a list of Python functions (sync or async) that define the tools available to the agent:`,rm,Ot,om,Wt,qh=`Each tool must be a standard Python function with <strong>type-hinted arguments and return types</strong>, along with a <strong>Google-style docstring</strong> describing its purpose, arguments, and return value.
For more details, see the <a href="https://huggingface.co/docs/transformers/en/chat_extras#passing-tools" rel="nofollow">Passing tools guide</a>.`,cm,Vt,Ah="Example:",hm,Ft,gm,Yt,dm,Dt,Ph="Tested with:",um,Kt,Eh="<li><strong>Qwen3</strong> — e.g., <code>Qwen/Qwen3-0.6B</code></li>",ym,Ls,Zh="<p>Compatibility with all LLMs is not guaranteed. If you believe a model should be supported, feel free to open an issue on GitHub — or better yet, submit a pull request with the required changes.</p>",fm,se,vm,ae,Sh='Use <a href="https://github.com/huggingface/trl/blob/main/examples/scripts/grpo_agent.py" rel="nofollow">grpo_agent.py</a> to fine-tune a LLM for agentic workflows.',wm,te,bm,ee,Mm,ne,Qh="GRPO supports training Vision-Language Models (VLMs) on multimodal datasets containing both text and images.",_m,le,Tm,pe,Hh="Tested with:",xm,ie,Xh="<li><strong>Gemma3</strong> — e.g., <code>google/gemma-3-4b-it</code></li> <li><strong>LLaVA-NeXT</strong> — e.g., <code>llava-hf/llava-v1.6-mistral-7b-hf</code></li> <li><strong>Qwen2-VL</strong> — e.g., <code>Qwen/Qwen2-VL-2B-Instruct</code></li> <li><strong>Qwen2.5-VL</strong> — e.g., <code>Qwen/Qwen2.5-VL-3B-Instruct</code></li> <li><strong>SmolVLM2</strong> — e.g., <code>HuggingFaceTB/SmolVLM2-2.2B-Instruct</code></li>",Jm,Ns,Oh="<p>Compatibility with all VLMs is not guaranteed. If you believe a model should be supported, feel free to open an issue on GitHub — or better yet, submit a pull request with the required changes.</p>",Um,me,jm,re,Wh='Use <a href="https://github.com/huggingface/trl/blob/main/examples/scripts/grpo_vlm.py" rel="nofollow">grpo_vlm.py</a> to fine-tune a VLM. Example command for training on <a href="https://huggingface.co/datasets/lmms-lab/multimodal-open-r1-8k-verified" rel="nofollow"><code>lmms-lab/multimodal-open-r1-8k-verified</code></a>:',Cm,oe,zm,ce,km,he,Vh="<li>Use LoRA on vision-language projection layers</li> <li>Enable 4-bit quantization to reduce memory usage</li> <li>VLMs are memory-intensive — start with smaller batch sizes</li> <li>Most models are compatible with vLLM (<code>server</code> and <code>colocate</code> modes)</li>",Im,ge,Gm,de,Fh="Each training sample should include:",Rm,ue,Yh="<li><code>prompt</code>: Text formatted via the processor’s chat template</li> <li><code>image</code>/<code>images</code>: PIL Image or list of PIL Images</li>",$m,ye,Dh="The trainer automatically handles image-to-tensor conversion via the model’s image processor.",Lm,fe,Nm,k,ve,yo,hn,Kh=`Trainer for the Group Relative Policy Optimization (GRPO) method. This algorithm was initially proposed in the
paper <a href="https://huggingface.co/papers/2402.03300" rel="nofollow">DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language
Models</a>.`,fo,Bs,vo,qs,we,wo,gn,sg="Main training entry point.",bo,ts,be,Mo,dn,ag="Will save the model, so you can reload it using <code>from_pretrained()</code>.",_o,un,tg="Will only save from the main process.",To,As,Me,xo,yn,eg="Upload <code>self.model</code> and <code>self.processing_class</code> to the 🤗 model hub on the repo <code>self.args.hub_model_id</code>.",Bm,_e,qm,A,Te,Jo,fn,ng='Configuration class for the <a href="/docs/trl/pr_4949/en/grpo_trainer#trl.GRPOTrainer">GRPOTrainer</a>.',Uo,vn,lg=`This class includes only the parameters that are specific to GRPO training. For a full list of training arguments,
please refer to the <a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments" rel="nofollow">TrainingArguments</a> documentation. Note that default values in this class may
differ from those in <a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments" rel="nofollow">TrainingArguments</a>.`,jo,wn,pg=`Using <a href="https://huggingface.co/docs/transformers/main/en/internal/trainer_utils#transformers.HfArgumentParser" rel="nofollow">HfArgumentParser</a> we can turn this class into
<a href="https://docs.python.org/3/library/argparse#module-argparse" rel="nofollow">argparse</a> arguments that can be specified on the
command line.`,Am,xe,Pm,_n,Em;return E=new pd({props:{containerStyle:"float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"}}),Q=new x({props:{title:"GRPO Trainer",local:"grpo-trainer",headingTag:"h1"}}),Ss=new x({props:{title:"Overview",local:"overview",headingTag:"h2"}}),Ws=new x({props:{title:"Quick start",local:"quick-start",headingTag:"h2"}}),Ys=new J({props:{code:"JTIzJTIwdHJhaW5fZ3Jwby5weSUwQWZyb20lMjBkYXRhc2V0cyUyMGltcG9ydCUyMGxvYWRfZGF0YXNldCUwQWZyb20lMjB0cmwlMjBpbXBvcnQlMjBHUlBPVHJhaW5lciUwQWZyb20lMjB0cmwucmV3YXJkcyUyMGltcG9ydCUyMGFjY3VyYWN5X3Jld2FyZCUwQSUwQWRhdGFzZXQlMjAlM0QlMjBsb2FkX2RhdGFzZXQoJTIydHJsLWxpYiUyRkRlZXBNYXRoLTEwM0slMjIlMkMlMjBzcGxpdCUzRCUyMnRyYWluJTIyKSUwQSUwQXRyYWluZXIlMjAlM0QlMjBHUlBPVHJhaW5lciglMEElMjAlMjAlMjAlMjBtb2RlbCUzRCUyMlF3ZW4lMkZRd2VuMi0wLjVCLUluc3RydWN0JTIyJTJDJTBBJTIwJTIwJTIwJTIwcmV3YXJkX2Z1bmNzJTNEYWNjdXJhY3lfcmV3YXJkJTJDJTBBJTIwJTIwJTIwJTIwdHJhaW5fZGF0YXNldCUzRGRhdGFzZXQlMkMlMEEpJTBBdHJhaW5lci50cmFpbigp",highlighted:`<span class="hljs-comment"># train_grpo.py</span>
<span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> GRPOTrainer
<span class="hljs-keyword">from</span> trl.rewards <span class="hljs-keyword">import</span> accuracy_reward
dataset = load_dataset(<span class="hljs-string">&quot;trl-lib/DeepMath-103K&quot;</span>, split=<span class="hljs-string">&quot;train&quot;</span>)
trainer = GRPOTrainer(
model=<span class="hljs-string">&quot;Qwen/Qwen2-0.5B-Instruct&quot;</span>,
reward_funcs=accuracy_reward,
train_dataset=dataset,
)
trainer.train()`,wrap:!1}}),Ks=new J({props:{code:"YWNjZWxlcmF0ZSUyMGxhdW5jaCUyMHRyYWluX2dycG8ucHk=",highlighted:"accelerate launch train_grpo.py",wrap:!1}}),ta=new x({props:{title:"Looking deeper into the GRPO method",local:"looking-deeper-into-the-grpo-method",headingTag:"h2"}}),la=new x({props:{title:"Generating completions",local:"generating-completions",headingTag:"h3"}}),pa=new x({props:{title:"Computing the advantage",local:"computing-the-advantage",headingTag:"h3"}}),oa=new x({props:{title:"Estimating the KL divergence",local:"estimating-the-kl-divergence",headingTag:"h3"}}),ca=new x({props:{title:"Computing the loss",local:"computing-the-loss",headingTag:"h3"}}),wa=new x({props:{title:"Loss Types",local:"loss-types",headingTag:"h4"}}),ja=new x({props:{title:"Logged metrics",local:"logged-metrics",headingTag:"h2"}}),za=new x({props:{title:"Customization",local:"customization",headingTag:"h2"}}),ka=new x({props:{title:"Speed up training with vLLM-powered generation",local:"speed-up-training-with-vllm-powered-generation",headingTag:"h3"}}),Ga=new J({props:{code:"cGlwJTIwaW5zdGFsbCUyMHRybCU1QnZsbG0lNUQ=",highlighted:"pip install trl[vllm]",wrap:!1}}),$a=new x({props:{title:"🔌 Option 1: Server mode",local:"-option-1-server-mode",headingTag:"h4"}}),Ba=new J({props:{code:"dHJsJTIwdmxsbS1zZXJ2ZSUyMC0tbW9kZWwlMjAlM0Ntb2RlbF9uYW1lJTNF",highlighted:"trl vllm-serve --model &lt;model_name&gt;",wrap:!1}}),Aa=new J({props:{code:"ZnJvbSUyMHRybCUyMGltcG9ydCUyMEdSUE9Db25maWclMEElMEF0cmFpbmluZ19hcmdzJTIwJTNEJTIwR1JQT0NvbmZpZyglMEElMjAlMjAlMjAlMjAuLi4lMkMlMEElMjAlMjAlMjAlMjB1c2VfdmxsbSUzRFRydWUlMkMlMEElMjAlMjAlMjAlMjB2bGxtX21vZGUlM0QlMjJzZXJ2ZXIlMjIlMkMlMjAlMjAlMjMlMjBkZWZhdWx0JTIwdmFsdWUlMkMlMjBjYW4lMjBiZSUyMG9taXR0ZWQlMEEp",highlighted:`<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> GRPOConfig
training_args = GRPOConfig(
...,
use_vllm=<span class="hljs-literal">True</span>,
vllm_mode=<span class="hljs-string">&quot;server&quot;</span>, <span class="hljs-comment"># default value, can be omitted</span>
)`,wrap:!1}}),Pa=new x({props:{title:"🧩 Option 2: Colocate mode",local:"-option-2-colocate-mode",headingTag:"h4"}}),Za=new J({props:{code:"ZnJvbSUyMHRybCUyMGltcG9ydCUyMEdSUE9Db25maWclMEElMEF0cmFpbmluZ19hcmdzJTIwJTNEJTIwR1JQT0NvbmZpZyglMEElMjAlMjAlMjAlMjAuLi4lMkMlMEElMjAlMjAlMjAlMjB1c2VfdmxsbSUzRFRydWUlMkMlMEElMjAlMjAlMjAlMjB2bGxtX21vZGUlM0QlMjJjb2xvY2F0ZSUyMiUyQyUwQSk=",highlighted:`<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> GRPOConfig
training_args = GRPOConfig(
...,
use_vllm=<span class="hljs-literal">True</span>,
vllm_mode=<span class="hljs-string">&quot;colocate&quot;</span>,
)`,wrap:!1}}),Qa=new x({props:{title:"Dealing with the Training-Inference Mismatch",local:"dealing-with-the-training-inference-mismatch",headingTag:"h4"}}),Ya=new x({props:{title:"GRPO at scale: train a 70B+ Model on multiple nodes",local:"grpo-at-scale-train-a-70b-model-on-multiple-nodes",headingTag:"h3"}}),at=new J({props:{code:"JTIzISUyRmJpbiUyRmJhc2glMEElMjNTQkFUQ0glMjAtLW5vZGVzJTNENSUwQSUyM1NCQVRDSCUyMC0tZ3JlcyUzRGdwdSUzQTglMEElMEElMjMlMjBHZXQlMjB0aGUlMjBsaXN0JTIwb2YlMjBhbGxvY2F0ZWQlMjBub2RlcyUwQU5PREVMSVNUJTNEKCUyNChzY29udHJvbCUyMHNob3clMjBob3N0bmFtZXMlMjAlMjRTTFVSTV9KT0JfTk9ERUxJU1QpKSUwQSUwQSUyMyUyMEFzc2lnbiUyMHRoZSUyMGZpcnN0JTIwNCUyMG5vZGVzJTIwZm9yJTIwdHJhaW5pbmclMjBhbmQlMjB0aGUlMjA1dGglMjBub2RlJTIwZm9yJTIwdkxMTSUwQVRSQUlOX05PREVTJTNEJTIyJTI0JTdCTk9ERUxJU1QlNUIlNDAlNUQlM0EwJTNBNCU3RCUyMiUyMCUyMCUyMyUyME5vZGVzJTIwMCUyQyUyMDElMkMlMjAyJTJDJTIwMyUyMGZvciUyMHRyYWluaW5nJTBBVkxMTV9OT0RFJTNEJTIyJTI0JTdCTk9ERUxJU1QlNUI0JTVEJTdEJTIyJTIwJTIwJTIzJTIwTm9kZSUyMDQlMjBmb3IlMjB2TExNJTBBJTBBJTIzJTIwUnVuJTIwdHJhaW5pbmclMjBvbiUyMHRoZSUyMGZpcnN0JTIwNCUyMG5vZGVzJTIwKEdyb3VwJTIwMSklMEFzcnVuJTIwLS1ub2RlcyUzRDQlMjAtLW50YXNrcyUzRDQlMjAtLW5vZGVsaXN0JTNEJTIyJTI0JTdCTk9ERUxJU1QlNUIlNDAlNUQlM0EwJTNBNCU3RCUyMiUyMGFjY2VsZXJhdGUlMjBsYXVuY2glMjAlNUMlMEElMjAlMjAlMjAlMjAlMjAtLWNvbmZpZ19maWxlJTIwZXhhbXBsZXMlMkZhY2NlbGVyYXRlX2NvbmZpZ3MlMkZkZWVwc3BlZWRfemVybzMueWFtbCUyMCU1QyUwQSUyMCUyMCUyMCUyMCUyMC0tbnVtX3Byb2Nlc3NlcyUyMDMyJTIwJTVDJTBBJTIwJTIwJTIwJTIwJTIwLS1udW1fbWFjaGluZXMlMjA0JTIwJTVDJTBBJTIwJTIwJTIwJTIwJTIwLS1tYWluX3Byb2Nlc3NfaXAlMjAlMjQlN0JOT0RFTElTVCU1QjAlNUQlN0QlMjAlNUMlMEElMjAlMjAlMjAlMjAlMjAtLW1hY2hpbmVfcmFuayUyMCUyNFNMVVJNX1BST0NJRCUyMCU1QyUwQSUyMCUyMCUyMCUyMCUyMC0tcmR6dl9iYWNrZW5kJTIwYzEwZCUyMCU1QyUwQSUyMCUyMCUyMCUyMCUyMHRyYWluX2dycG8ucHklMjAlNUMlMEElMjAlMjAlMjAlMjAlMjAtLXNlcnZlcl9pcCUyMCUyNFZMTE1fTk9ERSUyMCUyNiUwQSUwQSUyMyUyMFJ1biUyMHZMTE0lMjBzZXJ2ZXIlMjBvbiUyMHRoZSUyMDV0aCUyMG5vZGUlMjAoR3JvdXAlMjAyKSUwQXNydW4lMjAtLW5vZGVzJTNEMSUyMC0tbnRhc2tzJTNEMSUyMC0tbm9kZWxpc3QlM0QlMjIlMjQlN0JOT0RFTElTVCU1QjQlNUQlN0QlMjIlMjB0cmwlMjB2bGxtLXNlcnZlJTIwLS1tb2RlbCUyMFF3ZW4lMkZRd2VuMi41LTcyQiUyMC0tdGVuc29yX3BhcmFsbGVsX3NpemUlMjA4JTIwJTI2JTBBJTBBd2FpdA==",highlighted:`<span class="hljs-meta">#!/bin/bash</span>
<span class="hljs-comment">#SBATCH --nodes=5</span>
<span class="hljs-comment">#SBATCH --gres=gpu:8</span>
<span class="hljs-comment"># Get the list of allocated nodes</span>
NODELIST=($(scontrol show hostnames <span class="hljs-variable">$SLURM_JOB_NODELIST</span>))
<span class="hljs-comment"># Assign the first 4 nodes for training and the 5th node for vLLM</span>
TRAIN_NODES=<span class="hljs-string">&quot;<span class="hljs-variable">\${NODELIST[@]:0:4}</span>&quot;</span> <span class="hljs-comment"># Nodes 0, 1, 2, 3 for training</span>
VLLM_NODE=<span class="hljs-string">&quot;<span class="hljs-variable">\${NODELIST[4]}</span>&quot;</span> <span class="hljs-comment"># Node 4 for vLLM</span>
<span class="hljs-comment"># Run training on the first 4 nodes (Group 1)</span>
srun --nodes=4 --ntasks=4 --nodelist=<span class="hljs-string">&quot;<span class="hljs-variable">\${NODELIST[@]:0:4}</span>&quot;</span> accelerate launch \\
--config_file examples/accelerate_configs/deepspeed_zero3.yaml \\
--num_processes 32 \\
--num_machines 4 \\
--main_process_ip <span class="hljs-variable">\${NODELIST[0]}</span> \\
--machine_rank <span class="hljs-variable">$SLURM_PROCID</span> \\
--rdzv_backend c10d \\
train_grpo.py \\
--server_ip <span class="hljs-variable">$VLLM_NODE</span> &amp;
<span class="hljs-comment"># Run vLLM server on the 5th node (Group 2)</span>
srun --nodes=1 --ntasks=1 --nodelist=<span class="hljs-string">&quot;<span class="hljs-variable">\${NODELIST[4]}</span>&quot;</span> trl vllm-serve --model Qwen/Qwen2.5-72B --tensor_parallel_size 8 &amp;
<span class="hljs-built_in">wait</span>`,wrap:!1}}),tt=new J({props:{code:"aW1wb3J0JTIwYXJncGFyc2UlMEElMEFmcm9tJTIwZGF0YXNldHMlMjBpbXBvcnQlMjBsb2FkX2RhdGFzZXQlMEFmcm9tJTIwdHJsJTIwaW1wb3J0JTIwR1JQT1RyYWluZXIlMkMlMjBHUlBPQ29uZmlnJTBBZnJvbSUyMHRybC5yZXdhcmRzJTIwaW1wb3J0JTIwYWNjdXJhY3lfcmV3YXJkJTBBJTBBZGVmJTIwbWFpbigpJTNBJTBBJTIwJTIwJTIwJTIwcGFyc2VyJTIwJTNEJTIwYXJncGFyc2UuQXJndW1lbnRQYXJzZXIoKSUwQSUyMCUyMCUyMCUyMHBhcnNlci5hZGRfYXJndW1lbnQoJTIyLS12bGxtX3NlcnZlcl9ob3N0JTIyJTJDJTIwdHlwZSUzRHN0ciUyQyUyMGRlZmF1bHQlM0QlMjIlMjIlMkMlMjBoZWxwJTNEJTIyVGhlJTIwc2VydmVyJTIwSVAlMjIpJTBBJTIwJTIwJTIwJTIwYXJncyUyMCUzRCUyMHBhcnNlci5wYXJzZV9hcmdzKCklMEElMEElMjAlMjAlMjAlMjBkYXRhc2V0JTIwJTNEJTIwbG9hZF9kYXRhc2V0KCUyMnRybC1saWIlMkZEZWVwTWF0aC0xMDNLJTIyJTJDJTIwc3BsaXQlM0QlMjJ0cmFpbiUyMiklMEElMEElMjAlMjAlMjAlMjB0cmFpbmluZ19hcmdzJTIwJTNEJTIwR1JQT0NvbmZpZyglMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBwZXJfZGV2aWNlX3RyYWluX2JhdGNoX3NpemUlM0Q0JTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwdXNlX3ZsbG0lM0RUcnVlJTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwdmxsbV9zZXJ2ZXJfaG9zdCUzRGFyZ3MudmxsbV9zZXJ2ZXJfaG9zdC5yZXBsYWNlKCUyMmlwLSUyMiUyQyUyMCUyMiUyMikucmVwbGFjZSglMjItJTIyJTJDJTIwJTIyLiUyMiklMkMlMjAlMjAlMjMlMjBmcm9tJTIwaXAtWC1YLVgtWCUyMHRvJTIwWC5YLlguWCUwQSUyMCUyMCUyMCUyMCklMEElMEElMjAlMjAlMjAlMjB0cmFpbmVyJTIwJTNEJTIwR1JQT1RyYWluZXIoJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwbW9kZWwlM0QlMjJRd2VuJTJGUXdlbjIuNS03MkIlMjIlMkMlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBhcmdzJTNEdHJhaW5pbmdfYXJncyUyQyUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHJld2FyZF9mdW5jcyUzRGFjY3VyYWN5X3Jld2FyZCUyQyUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHRyYWluX2RhdGFzZXQlM0RkYXRhc2V0JTBBJTIwJTIwJTIwJTIwKSUwQSUyMCUyMCUyMCUyMHRyYWluZXIudHJhaW4oKSUwQSUwQWlmJTIwX19uYW1lX18lM0QlM0QlMjJfX21haW5fXyUyMiUzQSUwQSUyMCUyMCUyMCUyMG1haW4oKQ==",highlighted:`<span class="hljs-keyword">import</span> argparse
<span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> GRPOTrainer, GRPOConfig
<span class="hljs-keyword">from</span> trl.rewards <span class="hljs-keyword">import</span> accuracy_reward
<span class="hljs-keyword">def</span> <span class="hljs-title function_">main</span>():
parser = argparse.ArgumentParser()
parser.add_argument(<span class="hljs-string">&quot;--vllm_server_host&quot;</span>, <span class="hljs-built_in">type</span>=<span class="hljs-built_in">str</span>, default=<span class="hljs-string">&quot;&quot;</span>, <span class="hljs-built_in">help</span>=<span class="hljs-string">&quot;The server IP&quot;</span>)
args = parser.parse_args()
dataset = load_dataset(<span class="hljs-string">&quot;trl-lib/DeepMath-103K&quot;</span>, split=<span class="hljs-string">&quot;train&quot;</span>)
training_args = GRPOConfig(
per_device_train_batch_size=<span class="hljs-number">4</span>,
use_vllm=<span class="hljs-literal">True</span>,
vllm_server_host=args.vllm_server_host.replace(<span class="hljs-string">&quot;ip-&quot;</span>, <span class="hljs-string">&quot;&quot;</span>).replace(<span class="hljs-string">&quot;-&quot;</span>, <span class="hljs-string">&quot;.&quot;</span>), <span class="hljs-comment"># from ip-X-X-X-X to X.X.X.X</span>
)
trainer = GRPOTrainer(
model=<span class="hljs-string">&quot;Qwen/Qwen2.5-72B&quot;</span>,
args=training_args,
reward_funcs=accuracy_reward,
train_dataset=dataset
)
trainer.train()
<span class="hljs-keyword">if</span> __name__==<span class="hljs-string">&quot;__main__&quot;</span>:
main()`,wrap:!1}}),et=new x({props:{title:"Using a custom reward function",local:"using-a-custom-reward-function",headingTag:"h3"}}),it=new x({props:{title:"Example 1: Reward longer completions",local:"example-1-reward-longer-completions",headingTag:"h4"}}),rt=new J({props:{code:"ZGVmJTIwcmV3YXJkX2Z1bmMoY29tcGxldGlvbl9pZHMlMkMlMjAqKmt3YXJncyklM0ElMEElMjAlMjAlMjAlMjAlMjIlMjIlMjJSZXdhcmQlMjBmdW5jdGlvbiUyMHRoYXQlMjBhc3NpZ25zJTIwaGlnaGVyJTIwc2NvcmVzJTIwdG8lMjBsb25nZXIlMjBjb21wbGV0aW9ucyUyMChpbiUyMHRlcm1zJTIwb2YlMjB0b2tlbiUyMGNvdW50KS4lMjIlMjIlMjIlMEElMjAlMjAlMjAlMjByZXR1cm4lMjAlNUJmbG9hdChsZW4oaWRzKSklMjBmb3IlMjBpZHMlMjBpbiUyMGNvbXBsZXRpb25faWRzJTVE",highlighted:`<span class="hljs-keyword">def</span> <span class="hljs-title function_">reward_func</span>(<span class="hljs-params">completion_ids, **kwargs</span>):
<span class="hljs-string">&quot;&quot;&quot;Reward function that assigns higher scores to longer completions (in terms of token count).&quot;&quot;&quot;</span>
<span class="hljs-keyword">return</span> [<span class="hljs-built_in">float</span>(<span class="hljs-built_in">len</span>(ids)) <span class="hljs-keyword">for</span> ids <span class="hljs-keyword">in</span> completion_ids]`,wrap:!1}}),ct=new J({props:{code:"cHJvbXB0cyUyMCUzRCUyMCU1QiUyMlRoZSUyMHNreSUyMGlzJTIyJTJDJTIwJTIyVGhlJTIwc3VuJTIwaXMlMjIlNUQlMjAlMjAlMjMlMjBub3QlMjB1c2VkJTIwaW4lMjB0aGUlMjByZXdhcmQlMjBmdW5jdGlvbiUyQyUyMGJ1dCUyMHRoZSUyMHRyYWluZXIlMjB3aWxsJTIwcGFzcyUyMGl0JTBBY29tcGxldGlvbnMlMjAlM0QlMjAlNUIlMjIlMjBibHVlLiUyMiUyQyUyMCUyMiUyMGluJTIwdGhlJTIwc2t5LiUyMiU1RCUyMCUyMCUyMyUyMG5vdCUyMHVzZWQlMjBpbiUyMHRoZSUyMHJld2FyZCUyMGZ1bmN0aW9uJTJDJTIwYnV0JTIwdGhlJTIwdHJhaW5lciUyMHdpbGwlMjBwYXNzJTIwaXQlMEFjb21wbGV0aW9uX2lkcyUyMCUzRCUyMCU1QiU1QjYzMDMlMkMlMjAxMyU1RCUyQyUyMCU1QjMwNCUyQyUyMDI3OSUyQyUyMDEyODg0JTJDJTIwMTMlNUQlNUQlMEFyZXdhcmRfZnVuYyhwcm9tcHRzJTNEcHJvbXB0cyUyQyUyMGNvbXBsZXRpb25zJTNEY29tcGxldGlvbnMlMkMlMjBjb21wbGV0aW9uX2lkcyUzRGNvbXBsZXRpb25faWRzKQ==",highlighted:`<span class="hljs-meta">&gt;&gt;&gt; </span>prompts = [<span class="hljs-string">&quot;The sky is&quot;</span>, <span class="hljs-string">&quot;The sun is&quot;</span>] <span class="hljs-comment"># not used in the reward function, but the trainer will pass it</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>completions = [<span class="hljs-string">&quot; blue.&quot;</span>, <span class="hljs-string">&quot; in the sky.&quot;</span>] <span class="hljs-comment"># not used in the reward function, but the trainer will pass it</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>completion_ids = [[<span class="hljs-number">6303</span>, <span class="hljs-number">13</span>], [<span class="hljs-number">304</span>, <span class="hljs-number">279</span>, <span class="hljs-number">12884</span>, <span class="hljs-number">13</span>]]
<span class="hljs-meta">&gt;&gt;&gt; </span>reward_func(prompts=prompts, completions=completions, completion_ids=completion_ids)
[<span class="hljs-number">2.0</span>, <span class="hljs-number">4.0</span>]`,wrap:!1}}),ht=new x({props:{title:"Example 1.1: Reward longer completions (based on the number of characters)",local:"example-11-reward-longer-completions-based-on-the-number-of-characters",headingTag:"h4"}}),dt=new J({props:{code:"ZGVmJTIwcmV3YXJkX2Z1bmMoY29tcGxldGlvbnMlMkMlMjAqKmt3YXJncyklM0ElMEElMjAlMjAlMjAlMjAlMjIlMjIlMjJSZXdhcmQlMjBmdW5jdGlvbiUyMHRoYXQlMjBhc3NpZ25zJTIwaGlnaGVyJTIwc2NvcmVzJTIwdG8lMjBsb25nZXIlMjBjb21wbGV0aW9ucyUyMChpbiUyMHRlcm1zJTIwb2YlMjBjaGFyYWN0ZXIlMjBjb3VudCkuJTIyJTIyJTIyJTBBJTIwJTIwJTIwJTIwcmV0dXJuJTIwJTVCZmxvYXQobGVuKGNvbXBsZXRpb24pKSUyMGZvciUyMGNvbXBsZXRpb24lMjBpbiUyMGNvbXBsZXRpb25zJTVE",highlighted:`<span class="hljs-keyword">def</span> <span class="hljs-title function_">reward_func</span>(<span class="hljs-params">completions, **kwargs</span>):
<span class="hljs-string">&quot;&quot;&quot;Reward function that assigns higher scores to longer completions (in terms of character count).&quot;&quot;&quot;</span>
<span class="hljs-keyword">return</span> [<span class="hljs-built_in">float</span>(<span class="hljs-built_in">len</span>(completion)) <span class="hljs-keyword">for</span> completion <span class="hljs-keyword">in</span> completions]`,wrap:!1}}),yt=new J({props:{code:"cHJvbXB0cyUyMCUzRCUyMCU1QiUyMlRoZSUyMHNreSUyMGlzJTIyJTJDJTIwJTIyVGhlJTIwc3VuJTIwaXMlMjIlNUQlMEFjb21wbGV0aW9ucyUyMCUzRCUyMCU1QiUyMiUyMGJsdWUuJTIyJTJDJTIwJTIyJTIwaW4lMjB0aGUlMjBza3kuJTIyJTVEJTBBY29tcGxldGlvbl9pZHMlMjAlM0QlMjAlNUIlNUI2MzAzJTJDJTIwMTMlNUQlMkMlMjAlNUIzMDQlMkMlMjAyNzklMkMlMjAxMjg4NCUyQyUyMDEzJTVEJTVEJTIwJTIwJTIzJTIwbm90JTIwdXNlZCUyMGluJTIwdGhlJTIwcmV3YXJkJTIwZnVuY3Rpb24lMkMlMjBidXQlMjB0aGUlMjB0cmFpbmVyJTIwd2lsbCUyMHBhc3MlMjBpdCUwQXJld2FyZF9mdW5jKHByb21wdHMlM0Rwcm9tcHRzJTJDJTIwY29tcGxldGlvbnMlM0Rjb21wbGV0aW9ucyUyQyUyMGNvbXBsZXRpb25faWRzJTNEY29tcGxldGlvbl9pZHMp",highlighted:`<span class="hljs-meta">&gt;&gt;&gt; </span>prompts = [<span class="hljs-string">&quot;The sky is&quot;</span>, <span class="hljs-string">&quot;The sun is&quot;</span>]
<span class="hljs-meta">&gt;&gt;&gt; </span>completions = [<span class="hljs-string">&quot; blue.&quot;</span>, <span class="hljs-string">&quot; in the sky.&quot;</span>]
<span class="hljs-meta">&gt;&gt;&gt; </span>completion_ids = [[<span class="hljs-number">6303</span>, <span class="hljs-number">13</span>], [<span class="hljs-number">304</span>, <span class="hljs-number">279</span>, <span class="hljs-number">12884</span>, <span class="hljs-number">13</span>]] <span class="hljs-comment"># not used in the reward function, but the trainer will pass it</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>reward_func(prompts=prompts, completions=completions, completion_ids=completion_ids)
[<span class="hljs-number">6.0</span>, <span class="hljs-number">12.0</span>]`,wrap:!1}}),ft=new x({props:{title:"Example 2: Reward completions with a specific format",local:"example-2-reward-completions-with-a-specific-format",headingTag:"h4"}}),wt=new J({props:{code:"aW1wb3J0JTIwcmUlMEElMEFkZWYlMjBmb3JtYXRfcmV3YXJkX2Z1bmMoY29tcGxldGlvbnMlMkMlMjAqKmt3YXJncyklM0ElMEElMjAlMjAlMjAlMjAlMjIlMjIlMjJSZXdhcmQlMjBmdW5jdGlvbiUyMHRoYXQlMjBjaGVja3MlMjBpZiUyMHRoZSUyMGNvbXBsZXRpb24lMjBoYXMlMjBhJTIwc3BlY2lmaWMlMjBmb3JtYXQuJTIyJTIyJTIyJTBBJTIwJTIwJTIwJTIwcGF0dGVybiUyMCUzRCUyMHIlMjIlNUUlM0N0aGluayUzRS4qJTNGJTNDJTJGdGhpbmslM0UlM0NhbnN3ZXIlM0UuKiUzRiUzQyUyRmFuc3dlciUzRSUyNCUyMiUwQSUyMCUyMCUyMCUyMGNvbXBsZXRpb25fY29udGVudHMlMjAlM0QlMjAlNUJjb21wbGV0aW9uJTVCMCU1RCU1QiUyMmNvbnRlbnQlMjIlNUQlMjBmb3IlMjBjb21wbGV0aW9uJTIwaW4lMjBjb21wbGV0aW9ucyU1RCUwQSUyMCUyMCUyMCUyMG1hdGNoZXMlMjAlM0QlMjAlNUJyZS5tYXRjaChwYXR0ZXJuJTJDJTIwY29udGVudCklMjBmb3IlMjBjb250ZW50JTIwaW4lMjBjb21wbGV0aW9uX2NvbnRlbnRzJTVEJTBBJTIwJTIwJTIwJTIwcmV0dXJuJTIwJTVCMS4wJTIwaWYlMjBtYXRjaCUyMGVsc2UlMjAwLjAlMjBmb3IlMjBtYXRjaCUyMGluJTIwbWF0Y2hlcyU1RA==",highlighted:`<span class="hljs-keyword">import</span> re
<span class="hljs-keyword">def</span> <span class="hljs-title function_">format_reward_func</span>(<span class="hljs-params">completions, **kwargs</span>):
<span class="hljs-string">&quot;&quot;&quot;Reward function that checks if the completion has a specific format.&quot;&quot;&quot;</span>
pattern = <span class="hljs-string">r&quot;^&lt;think&gt;.*?&lt;/think&gt;&lt;answer&gt;.*?&lt;/answer&gt;$&quot;</span>
completion_contents = [completion[<span class="hljs-number">0</span>][<span class="hljs-string">&quot;content&quot;</span>] <span class="hljs-keyword">for</span> completion <span class="hljs-keyword">in</span> completions]
matches = [re.<span class="hljs-keyword">match</span>(pattern, content) <span class="hljs-keyword">for</span> content <span class="hljs-keyword">in</span> completion_contents]
<span class="hljs-keyword">return</span> [<span class="hljs-number">1.0</span> <span class="hljs-keyword">if</span> <span class="hljs-keyword">match</span> <span class="hljs-keyword">else</span> <span class="hljs-number">0.0</span> <span class="hljs-keyword">for</span> <span class="hljs-keyword">match</span> <span class="hljs-keyword">in</span> matches]`,wrap:!1}}),Mt=new J({props:{code:"cHJvbXB0cyUyMCUzRCUyMCU1QiUwQSUyMCUyMCUyMCUyMCU1QiU3QiUyMnJvbGUlMjIlM0ElMjAlMjJhc3Npc3RhbnQlMjIlMkMlMjAlMjJjb250ZW50JTIyJTNBJTIwJTIyV2hhdCUyMGlzJTIwdGhlJTIwcmVzdWx0JTIwb2YlMjAoMSUyMCUyQiUyMDIpJTIwKiUyMDQlM0YlMjIlN0QlNUQlMkMlMEElMjAlMjAlMjAlMjAlNUIlN0IlMjJyb2xlJTIyJTNBJTIwJTIyYXNzaXN0YW50JTIyJTJDJTIwJTIyY29udGVudCUyMiUzQSUyMCUyMldoYXQlMjBpcyUyMHRoZSUyMHJlc3VsdCUyMG9mJTIwKDMlMjAlMkIlMjAxKSUyMColMjAyJTNGJTIyJTdEJTVEJTJDJTBBJTVEJTBBY29tcGxldGlvbnMlMjAlM0QlMjAlNUIlMEElMjAlMjAlMjAlMjAlNUIlN0IlMjJyb2xlJTIyJTNBJTIwJTIyYXNzaXN0YW50JTIyJTJDJTIwJTIyY29udGVudCUyMiUzQSUyMCUyMiUzQ3RoaW5rJTNFVGhlJTIwc3VtJTIwb2YlMjAxJTIwYW5kJTIwMiUyMGlzJTIwMyUyQyUyMHdoaWNoJTIwd2UlMjBtdWx0aXBseSUyMGJ5JTIwNCUyMHRvJTIwZ2V0JTIwMTIuJTNDJTJGdGhpbmslM0UlM0NhbnN3ZXIlM0UoMSUyMCUyQiUyMDIpJTIwKiUyMDQlMjAlM0QlMjAxMiUzQyUyRmFuc3dlciUzRSUyMiU3RCU1RCUyQyUwQSUyMCUyMCUyMCUyMCU1QiU3QiUyMnJvbGUlMjIlM0ElMjAlMjJhc3Npc3RhbnQlMjIlMkMlMjAlMjJjb250ZW50JTIyJTNBJTIwJTIyVGhlJTIwc3VtJTIwb2YlMjAzJTIwYW5kJTIwMSUyMGlzJTIwNCUyQyUyMHdoaWNoJTIwd2UlMjBtdWx0aXBseSUyMGJ5JTIwMiUyMHRvJTIwZ2V0JTIwOC4lMjBTbyUyMCgzJTIwJTJCJTIwMSklMjAqJTIwMiUyMCUzRCUyMDguJTIyJTdEJTVEJTJDJTBBJTVEJTBBZm9ybWF0X3Jld2FyZF9mdW5jKHByb21wdHMlM0Rwcm9tcHRzJTJDJTIwY29tcGxldGlvbnMlM0Rjb21wbGV0aW9ucyk=",highlighted:`<span class="hljs-meta">&gt;&gt;&gt; </span>prompts = [
<span class="hljs-meta">... </span> [{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;assistant&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;What is the result of (1 + 2) * 4?&quot;</span>}],
<span class="hljs-meta">... </span> [{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;assistant&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;What is the result of (3 + 1) * 2?&quot;</span>}],
<span class="hljs-meta">... </span>]
<span class="hljs-meta">&gt;&gt;&gt; </span>completions = [
<span class="hljs-meta">... </span> [{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;assistant&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;&lt;think&gt;The sum of 1 and 2 is 3, which we multiply by 4 to get 12.&lt;/think&gt;&lt;answer&gt;(1 + 2) * 4 = 12&lt;/answer&gt;&quot;</span>}],
<span class="hljs-meta">... </span> [{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;assistant&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;The sum of 3 and 1 is 4, which we multiply by 2 to get 8. So (3 + 1) * 2 = 8.&quot;</span>}],
<span class="hljs-meta">... </span>]
<span class="hljs-meta">&gt;&gt;&gt; </span>format_reward_func(prompts=prompts, completions=completions)
[<span class="hljs-number">1.0</span>, <span class="hljs-number">0.0</span>]`,wrap:!1}}),_t=new x({props:{title:"Example 3: Reward completions based on a reference",local:"example-3-reward-completions-based-on-a-reference",headingTag:"h4"}}),xt=new J({props:{code:"aW1wb3J0JTIwcmUlMEElMEFkZWYlMjByZXdhcmRfZnVuYyhjb21wbGV0aW9ucyUyQyUyMGdyb3VuZF90cnV0aCUyQyUyMCoqa3dhcmdzKSUzQSUwQSUyMCUyMCUyMCUyMCUyMyUyMFJlZ3VsYXIlMjBleHByZXNzaW9uJTIwdG8lMjBjYXB0dXJlJTIwY29udGVudCUyMGluc2lkZSUyMCU1Q2JveGVkJTdCJTdEJTBBJTIwJTIwJTIwJTIwbWF0Y2hlcyUyMCUzRCUyMCU1QnJlLnNlYXJjaChyJTIyJTVDJTVDYm94ZWQlNUMlN0IoLiolM0YpJTVDJTdEJTIyJTJDJTIwY29tcGxldGlvbiklMjBmb3IlMjBjb21wbGV0aW9uJTIwaW4lMjBjb21wbGV0aW9ucyU1RCUwQSUyMCUyMCUyMCUyMGNvbnRlbnRzJTIwJTNEJTIwJTVCbWF0Y2guZ3JvdXAoMSklMjBpZiUyMG1hdGNoJTIwZWxzZSUyMCUyMiUyMiUyMGZvciUyMG1hdGNoJTIwaW4lMjBtYXRjaGVzJTVEJTBBJTIwJTIwJTIwJTIwJTIzJTIwUmV3YXJkJTIwMSUyMGlmJTIwdGhlJTIwY29udGVudCUyMGlzJTIwdGhlJTIwc2FtZSUyMGFzJTIwdGhlJTIwZ3JvdW5kJTIwdHJ1dGglMkMlMjAwJTIwb3RoZXJ3aXNlJTBBJTIwJTIwJTIwJTIwcmV0dXJuJTIwJTVCMS4wJTIwaWYlMjBjJTIwJTNEJTNEJTIwZ3QlMjBlbHNlJTIwMC4wJTIwZm9yJTIwYyUyQyUyMGd0JTIwaW4lMjB6aXAoY29udGVudHMlMkMlMjBncm91bmRfdHJ1dGgpJTVE",highlighted:`<span class="hljs-keyword">import</span> re
<span class="hljs-keyword">def</span> <span class="hljs-title function_">reward_func</span>(<span class="hljs-params">completions, ground_truth, **kwargs</span>):
<span class="hljs-comment"># Regular expression to capture content inside \\boxed{}</span>
matches = [re.search(<span class="hljs-string">r&quot;\\\\boxed\\{(.*?)\\}&quot;</span>, completion) <span class="hljs-keyword">for</span> completion <span class="hljs-keyword">in</span> completions]
contents = [<span class="hljs-keyword">match</span>.group(<span class="hljs-number">1</span>) <span class="hljs-keyword">if</span> <span class="hljs-keyword">match</span> <span class="hljs-keyword">else</span> <span class="hljs-string">&quot;&quot;</span> <span class="hljs-keyword">for</span> <span class="hljs-keyword">match</span> <span class="hljs-keyword">in</span> matches]
<span class="hljs-comment"># Reward 1 if the content is the same as the ground truth, 0 otherwise</span>
<span class="hljs-keyword">return</span> [<span class="hljs-number">1.0</span> <span class="hljs-keyword">if</span> c == gt <span class="hljs-keyword">else</span> <span class="hljs-number">0.0</span> <span class="hljs-keyword">for</span> c, gt <span class="hljs-keyword">in</span> <span class="hljs-built_in">zip</span>(contents, ground_truth)]`,wrap:!1}}),Ut=new J({props:{code:"cHJvbXB0cyUyMCUzRCUyMCU1QiUyMlByb2JsZW0lM0ElMjBTb2x2ZSUyMHRoZSUyMGVxdWF0aW9uJTIwJTI0MnglMjAlMkIlMjAzJTIwJTNEJTIwNyUyNC4lMjBTb2x1dGlvbiUzQSUyMiUyQyUyMCUyMlByb2JsZW0lM0ElMjBTb2x2ZSUyMHRoZSUyMGVxdWF0aW9uJTIwJTI0M3glMjAtJTIwNSUyMCUzRCUyMDEwJTI0LiUyMiU1RCUwQWNvbXBsZXRpb25zJTIwJTNEJTIwJTVCciUyMiUyMFRoZSUyMHNvbHV0aW9uJTIwaXMlMjAlNUNib3hlZCU3QjIlN0QuJTIyJTJDJTIwciUyMiUyMFRoZSUyMHNvbHV0aW9uJTIwaXMlMjAlNUNib3hlZCU3QjYlN0QuJTIyJTVEJTBBZ3JvdW5kX3RydXRoJTIwJTNEJTIwJTVCJTIyMiUyMiUyQyUyMCUyMjUlMjIlNUQlMEFyZXdhcmRfZnVuYyhwcm9tcHRzJTNEcHJvbXB0cyUyQyUyMGNvbXBsZXRpb25zJTNEY29tcGxldGlvbnMlMkMlMjBncm91bmRfdHJ1dGglM0Rncm91bmRfdHJ1dGgp",highlighted:`<span class="hljs-meta">&gt;&gt;&gt; </span>prompts = [<span class="hljs-string">&quot;Problem: Solve the equation $2x + 3 = 7$. Solution:&quot;</span>, <span class="hljs-string">&quot;Problem: Solve the equation $3x - 5 = 10$.&quot;</span>]
<span class="hljs-meta">&gt;&gt;&gt; </span>completions = [<span class="hljs-string">r&quot; The solution is \\boxed{2}.&quot;</span>, <span class="hljs-string">r&quot; The solution is \\boxed{6}.&quot;</span>]
<span class="hljs-meta">&gt;&gt;&gt; </span>ground_truth = [<span class="hljs-string">&quot;2&quot;</span>, <span class="hljs-string">&quot;5&quot;</span>]
<span class="hljs-meta">&gt;&gt;&gt; </span>reward_func(prompts=prompts, completions=completions, ground_truth=ground_truth)
[<span class="hljs-number">1.0</span>, <span class="hljs-number">0.0</span>]`,wrap:!1}}),jt=new x({props:{title:"Example 4: Multi-task reward functions",local:"example-4-multi-task-reward-functions",headingTag:"h4"}}),zt=new J({props:{code:"ZnJvbSUyMGRhdGFzZXRzJTIwaW1wb3J0JTIwRGF0YXNldCUwQWZyb20lMjB0cmwlMjBpbXBvcnQlMjBHUlBPVHJhaW5lciUwQSUwQSUyMyUyMERlZmluZSUyMGElMjBkYXRhc2V0JTIwdGhhdCUyMGNvbnRhaW5zJTIwYm90aCUyMG1hdGglMjBhbmQlMjBjb2RpbmclMjBwcm9ibGVtcyUwQWRhdGFzZXQlMjAlM0QlMjBEYXRhc2V0LmZyb21fbGlzdCglMEElMjAlMjAlMjAlMjAlNUIlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlN0IlMjJwcm9tcHQlMjIlM0ElMjAlMjJXaGF0JTIwaXMlMjAyJTJCMiUzRiUyMiUyQyUyMCUyMnRhc2slMjIlM0ElMjAlMjJtYXRoJTIyJTdEJTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTdCJTIycHJvbXB0JTIyJTNBJTIwJTIyV3JpdGUlMjBhJTIwZnVuY3Rpb24lMjB0aGF0JTIwcmV0dXJucyUyMHRoZSUyMHN1bSUyMG9mJTIwdHdvJTIwbnVtYmVycy4lMjIlMkMlMjAlMjJ0YXNrJTIyJTNBJTIwJTIyY29kZSUyMiU3RCUyQyUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCU3QiUyMnByb21wdCUyMiUzQSUyMCUyMldoYXQlMjBpcyUyMDMqNCUzRiUyMiUyQyUyMCUyMnRhc2slMjIlM0ElMjAlMjJtYXRoJTIyJTdEJTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTdCJTIycHJvbXB0JTIyJTNBJTIwJTIyV3JpdGUlMjBhJTIwZnVuY3Rpb24lMjB0aGF0JTIwcmV0dXJucyUyMHRoZSUyMHByb2R1Y3QlMjBvZiUyMHR3byUyMG51bWJlcnMuJTIyJTJDJTIwJTIydGFzayUyMiUzQSUyMCUyMmNvZGUlMjIlN0QlMkMlMEElMjAlMjAlMjAlMjAlNUQlMEEpJTBBJTBBJTIzJTIwTWF0aC1zcGVjaWZpYyUyMHJld2FyZCUyMGZ1bmN0aW9uJTBBZGVmJTIwbWF0aF9yZXdhcmRfZnVuYyhwcm9tcHRzJTJDJTIwY29tcGxldGlvbnMlMkMlMjB0YXNrJTJDJTIwKiprd2FyZ3MpJTNBJTBBJTIwJTIwJTIwJTIwcmV3YXJkcyUyMCUzRCUyMCU1QiU1RCUwQSUyMCUyMCUyMCUyMGZvciUyMHByb21wdCUyQyUyMGNvbXBsZXRpb24lMkMlMjB0JTIwaW4lMjB6aXAocHJvbXB0cyUyQyUyMGNvbXBsZXRpb25zJTJDJTIwdGFzayklM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBpZiUyMHQlMjAlM0QlM0QlMjAlMjJtYXRoJTIyJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIzJTIwQ2FsY3VsYXRlJTIwbWF0aC1zcGVjaWZpYyUyMHJld2FyZCUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGNvcnJlY3QlMjAlM0QlMjBjaGVja19tYXRoX3NvbHV0aW9uKHByb21wdCUyQyUyMGNvbXBsZXRpb24pJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwcmV3YXJkJTIwJTNEJTIwMS4wJTIwaWYlMjBjb3JyZWN0JTIwZWxzZSUyMC0xLjAlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjByZXdhcmRzLmFwcGVuZChyZXdhcmQpJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwZWxzZSUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMyUyMFJldHVybiUyME5vbmUlMjBmb3IlMjBub24tbWF0aCUyMHRhc2tzJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwcmV3YXJkcy5hcHBlbmQoTm9uZSklMEElMjAlMjAlMjAlMjByZXR1cm4lMjByZXdhcmRzJTBBJTBBJTIzJTIwQ29kaW5nLXNwZWNpZmljJTIwcmV3YXJkJTIwZnVuY3Rpb24lMEFkZWYlMjBjb2RpbmdfcmV3YXJkX2Z1bmMocHJvbXB0cyUyQyUyMGNvbXBsZXRpb25zJTJDJTIwdGFzayUyQyUyMCoqa3dhcmdzKSUzQSUwQSUyMCUyMCUyMCUyMHJld2FyZHMlMjAlM0QlMjAlNUIlNUQlMEElMjAlMjAlMjAlMjBmb3IlMjBwcm9tcHQlMkMlMjBjb21wbGV0aW9uJTJDJTIwdCUyMGluJTIwemlwKHByb21wdHMlMkMlMjBjb21wbGV0aW9ucyUyQyUyMHRhc2spJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwaWYlMjB0JTIwJTNEJTNEJTIwJTIyY29kaW5nJTIyJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIzJTIwQ2FsY3VsYXRlJTIwY29kaW5nLXNwZWNpZmljJTIwcmV3YXJkJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwd29ya3MlMjAlM0QlMjB0ZXN0X2NvZGVfc29sdXRpb24ocHJvbXB0JTJDJTIwY29tcGxldGlvbiklMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjByZXdhcmQlMjAlM0QlMjAxLjAlMjBpZiUyMHdvcmtzJTIwZWxzZSUyMC0xLjAlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjByZXdhcmRzLmFwcGVuZChyZXdhcmQpJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwZWxzZSUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMyUyMFJldHVybiUyME5vbmUlMjBmb3IlMjBub24tY29kaW5nJTIwdGFza3MlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjByZXdhcmRzLmFwcGVuZChOb25lKSUwQSUyMCUyMCUyMCUyMHJldHVybiUyMHJld2FyZHMlMEElMEElMjMlMjBVc2UlMjBib3RoJTIwdGFzay1zcGVjaWZpYyUyMHJld2FyZCUyMGZ1bmN0aW9ucyUwQXRyYWluZXIlMjAlM0QlMjBHUlBPVHJhaW5lciglMEElMjAlMjAlMjAlMjBtb2RlbCUzRCUyMlF3ZW4lMkZRd2VuMi0wLjVCLUluc3RydWN0JTIyJTJDJTBBJTIwJTIwJTIwJTIwcmV3YXJkX2Z1bmNzJTNEJTVCbWF0aF9yZXdhcmRfZnVuYyUyQyUyMGNvZGluZ19yZXdhcmRfZnVuYyU1RCUyQyUwQSUyMCUyMCUyMCUyMHRyYWluX2RhdGFzZXQlM0RkYXRhc2V0JTJDJTBBKSUwQSUwQXRyYWluZXIudHJhaW4oKQ==",highlighted:`<span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> Dataset
<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> GRPOTrainer
<span class="hljs-comment"># Define a dataset that contains both math and coding problems</span>
dataset = Dataset.from_list(
[
{<span class="hljs-string">&quot;prompt&quot;</span>: <span class="hljs-string">&quot;What is 2+2?&quot;</span>, <span class="hljs-string">&quot;task&quot;</span>: <span class="hljs-string">&quot;math&quot;</span>},
{<span class="hljs-string">&quot;prompt&quot;</span>: <span class="hljs-string">&quot;Write a function that returns the sum of two numbers.&quot;</span>, <span class="hljs-string">&quot;task&quot;</span>: <span class="hljs-string">&quot;code&quot;</span>},
{<span class="hljs-string">&quot;prompt&quot;</span>: <span class="hljs-string">&quot;What is 3*4?&quot;</span>, <span class="hljs-string">&quot;task&quot;</span>: <span class="hljs-string">&quot;math&quot;</span>},
{<span class="hljs-string">&quot;prompt&quot;</span>: <span class="hljs-string">&quot;Write a function that returns the product of two numbers.&quot;</span>, <span class="hljs-string">&quot;task&quot;</span>: <span class="hljs-string">&quot;code&quot;</span>},
]
)
<span class="hljs-comment"># Math-specific reward function</span>
<span class="hljs-keyword">def</span> <span class="hljs-title function_">math_reward_func</span>(<span class="hljs-params">prompts, completions, task, **kwargs</span>):
rewards = []
<span class="hljs-keyword">for</span> prompt, completion, t <span class="hljs-keyword">in</span> <span class="hljs-built_in">zip</span>(prompts, completions, task):
<span class="hljs-keyword">if</span> t == <span class="hljs-string">&quot;math&quot;</span>:
<span class="hljs-comment"># Calculate math-specific reward</span>
correct = check_math_solution(prompt, completion)
reward = <span class="hljs-number">1.0</span> <span class="hljs-keyword">if</span> correct <span class="hljs-keyword">else</span> -<span class="hljs-number">1.0</span>
rewards.append(reward)
<span class="hljs-keyword">else</span>:
<span class="hljs-comment"># Return None for non-math tasks</span>
rewards.append(<span class="hljs-literal">None</span>)
<span class="hljs-keyword">return</span> rewards
<span class="hljs-comment"># Coding-specific reward function</span>
<span class="hljs-keyword">def</span> <span class="hljs-title function_">coding_reward_func</span>(<span class="hljs-params">prompts, completions, task, **kwargs</span>):
rewards = []
<span class="hljs-keyword">for</span> prompt, completion, t <span class="hljs-keyword">in</span> <span class="hljs-built_in">zip</span>(prompts, completions, task):
<span class="hljs-keyword">if</span> t == <span class="hljs-string">&quot;coding&quot;</span>:
<span class="hljs-comment"># Calculate coding-specific reward</span>
works = test_code_solution(prompt, completion)
reward = <span class="hljs-number">1.0</span> <span class="hljs-keyword">if</span> works <span class="hljs-keyword">else</span> -<span class="hljs-number">1.0</span>
rewards.append(reward)
<span class="hljs-keyword">else</span>:
<span class="hljs-comment"># Return None for non-coding tasks</span>
rewards.append(<span class="hljs-literal">None</span>)
<span class="hljs-keyword">return</span> rewards
<span class="hljs-comment"># Use both task-specific reward functions</span>
trainer = GRPOTrainer(
model=<span class="hljs-string">&quot;Qwen/Qwen2-0.5B-Instruct&quot;</span>,
reward_funcs=[math_reward_func, coding_reward_func],
train_dataset=dataset,
)
trainer.train()`,wrap:!1}}),Gt=new x({props:{title:"Example 5: Asynchronous reward functions",local:"example-5-asynchronous-reward-functions",headingTag:"h4"}}),Lt=new J({props:{code:"aW1wb3J0JTIwYXN5bmNpbyUwQSUwQWFzeW5jJTIwZGVmJTIwYXN5bmNfcmV3YXJkX2Z1bmMocHJvbXB0cyUyQyUyMGNvbXBsZXRpb25zJTJDJTIwKiprd2FyZ3MpJTNBJTBBJTIwJTIwJTIwJTIwJTIzJTIwU2ltdWxhdGUlMjBhbiUyMEklMkZPLWJvdW5kJTIwY2FsbCUyMChlLmcuJTJDJTIwSFRUUCUyMHJlcXVlc3QlMkMlMjBkYXRhYmFzZSUyMGxvb2t1cCklMEElMjAlMjAlMjAlMjBhd2FpdCUyMGFzeW5jaW8uc2xlZXAoMC4wMSklMEElMjAlMjAlMjAlMjAlMjMlMjBTaW1wbGUlMjB0b3klMjByZXdhcmQlM0ElMjAxLjAlMjBpZiUyMHRoZSUyMGNvbXBsZXRpb24lMjBpcyUyMG5vbi1lbXB0eSUyQyUyMGVsc2UlMjAwLjAlMEElMjAlMjAlMjAlMjByZXR1cm4lMjAlNUIxLjAlMjBpZiUyMGNvbXBsZXRpb24lMjBlbHNlJTIwMC4wJTIwZm9yJTIwY29tcGxldGlvbiUyMGluJTIwY29tcGxldGlvbnMlNUQ=",highlighted:`<span class="hljs-keyword">import</span> asyncio
<span class="hljs-keyword">async</span> <span class="hljs-keyword">def</span> <span class="hljs-title function_">async_reward_func</span>(<span class="hljs-params">prompts, completions, **kwargs</span>):
<span class="hljs-comment"># Simulate an I/O-bound call (e.g., HTTP request, database lookup)</span>
<span class="hljs-keyword">await</span> asyncio.sleep(<span class="hljs-number">0.01</span>)
<span class="hljs-comment"># Simple toy reward: 1.0 if the completion is non-empty, else 0.0</span>
<span class="hljs-keyword">return</span> [<span class="hljs-number">1.0</span> <span class="hljs-keyword">if</span> completion <span class="hljs-keyword">else</span> <span class="hljs-number">0.0</span> <span class="hljs-keyword">for</span> completion <span class="hljs-keyword">in</span> completions]`,wrap:!1}}),Nt=new x({props:{title:"Passing the reward function to the trainer",local:"passing-the-reward-function-to-the-trainer",headingTag:"h4"}}),qt=new J({props:{code:"ZnJvbSUyMHRybCUyMGltcG9ydCUyMEdSUE9UcmFpbmVyJTBBJTBBdHJhaW5lciUyMCUzRCUyMEdSUE9UcmFpbmVyKCUwQSUyMCUyMCUyMCUyMHJld2FyZF9mdW5jcyUzRHJld2FyZF9mdW5jJTJDJTBBJTIwJTIwJTIwJTIwLi4uJTJDJTBBKQ==",highlighted:`<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> GRPOTrainer
trainer = GRPOTrainer(
reward_funcs=reward_func,
...,
)`,wrap:!1}}),Pt=new J({props:{code:"ZnJvbSUyMHRybCUyMGltcG9ydCUyMEdSUE9UcmFpbmVyJTBBJTBBdHJhaW5lciUyMCUzRCUyMEdSUE9UcmFpbmVyKCUwQSUyMCUyMCUyMCUyMHJld2FyZF9mdW5jcyUzRCU1QnJld2FyZF9mdW5jJTJDJTIwYXN5bmNfcmV3YXJkX2Z1bmMxJTJDJTIwYXN5bmNfcmV3YXJkX2Z1bmMyJTVEJTJDJTBBJTIwJTIwJTIwJTIwLi4uJTJDJTBBKQ==",highlighted:`<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> GRPOTrainer
trainer = GRPOTrainer(
reward_funcs=[reward_func, async_reward_func1, async_reward_func2],
...,
)`,wrap:!1}}),St=new x({props:{title:"Rapid Experimentation for GRPO",local:"rapid-experimentation-for-grpo",headingTag:"h3"}}),Ht=new x({props:{title:"Agent Training",local:"agent-training",headingTag:"h2"}}),Ot=new J({props:{code:"ZnJvbSUyMHRybCUyMGltcG9ydCUyMEdSUE9UcmFpbmVyJTBBJTBBdHJhaW5lciUyMCUzRCUyMEdSUE9UcmFpbmVyKCUwQSUyMCUyMCUyMCUyMHRvb2xzJTNEJTVCdG9vbDElMkMlMjB0b29sMiU1RCUyQyUwQSUyMCUyMCUyMCUyMC4uLiUyQyUwQSk=",highlighted:`<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> GRPOTrainer
trainer = GRPOTrainer(
tools=[tool1, tool2],
...,
)`,wrap:!1}}),Ft=new J({props:{code:"ZnJvbSUyMHRybCUyMGltcG9ydCUyMEdSUE9UcmFpbmVyJTBBJTBBZGVmJTIwbXVsdGlwbHkoYSUzQSUyMGludCUyQyUyMGIlM0ElMjBpbnQpJTIwLSUzRSUyMGludCUzQSUwQSUyMCUyMCUyMCUyMCUyMiUyMiUyMiUwQSUyMCUyMCUyMCUyME11bHRpcGxpZXMlMjB0d28lMjBpbnRlZ2Vycy4lMEElMEElMjAlMjAlMjAlMjBBcmdzJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwYSUzQSUyMFRoZSUyMGZpcnN0JTIwaW50ZWdlci4lMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBiJTNBJTIwVGhlJTIwc2Vjb25kJTIwaW50ZWdlci4lMEElMEElMjAlMjAlMjAlMjBSZXR1cm5zJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwVGhlJTIwcHJvZHVjdCUyMG9mJTIwdGhlJTIwdHdvJTIwaW50ZWdlcnMuJTBBJTIwJTIwJTIwJTIwJTIyJTIyJTIyJTBBJTIwJTIwJTIwJTIwcmV0dXJuJTIwYSUyMColMjBiJTBBJTBBYXN5bmMlMjBkZWYlMjBhc3luY19hZGQoYSUzQSUyMGludCUyQyUyMGIlM0ElMjBpbnQpJTIwLSUzRSUyMGludCUzQSUwQSUyMCUyMCUyMCUyMCUyMiUyMiUyMiUwQSUyMCUyMCUyMCUyMEFzeW5jaHJvbm91c2x5JTIwYWRkcyUyMHR3byUyMGludGVnZXJzLiUwQSUwQSUyMCUyMCUyMCUyMEFyZ3MlM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBhJTNBJTIwVGhlJTIwZmlyc3QlMjBpbnRlZ2VyLiUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGIlM0ElMjBUaGUlMjBzZWNvbmQlMjBpbnRlZ2VyLiUwQSUwQSUyMCUyMCUyMCUyMFJldHVybnMlM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBUaGUlMjBzdW0lMjBvZiUyMHRoZSUyMHR3byUyMGludGVnZXJzLiUwQSUyMCUyMCUyMCUyMCUyMiUyMiUyMiUwQSUyMCUyMCUyMCUyMHJldHVybiUyMGElMjAlMkIlMjBiJTBBJTBBdHJhaW5lciUyMCUzRCUyMEdSUE9UcmFpbmVyKCUwQSUyMCUyMCUyMCUyMHRvb2xzJTNEJTVCbXVsdGlwbHklMkMlMjBhc3luY19hZGQlNUQlMkMlMEElMjAlMjAlMjAlMjAuLi4lMkMlMEEp",highlighted:`<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> GRPOTrainer
<span class="hljs-keyword">def</span> <span class="hljs-title function_">multiply</span>(<span class="hljs-params">a: <span class="hljs-built_in">int</span>, b: <span class="hljs-built_in">int</span></span>) -&gt; <span class="hljs-built_in">int</span>:
<span class="hljs-string">&quot;&quot;&quot;
Multiplies two integers.
Args:
a: The first integer.
b: The second integer.
Returns:
The product of the two integers.
&quot;&quot;&quot;</span>
<span class="hljs-keyword">return</span> a * b
<span class="hljs-keyword">async</span> <span class="hljs-keyword">def</span> <span class="hljs-title function_">async_add</span>(<span class="hljs-params">a: <span class="hljs-built_in">int</span>, b: <span class="hljs-built_in">int</span></span>) -&gt; <span class="hljs-built_in">int</span>:
<span class="hljs-string">&quot;&quot;&quot;
Asynchronously adds two integers.
Args:
a: The first integer.
b: The second integer.
Returns:
The sum of the two integers.
&quot;&quot;&quot;</span>
<span class="hljs-keyword">return</span> a + b
trainer = GRPOTrainer(
tools=[multiply, async_add],
...,
)`,wrap:!1}}),Yt=new x({props:{title:"Supported Models",local:"supported-models",headingTag:"h3"}}),se=new x({props:{title:"Quick Start",local:"quick-start",headingTag:"h3"}}),te=new J({props:{code:"YWNjZWxlcmF0ZSUyMGxhdW5jaCUyMCU1QyUwQSUyMCUyMC0tY29uZmlnX2ZpbGUlM0RleGFtcGxlcyUyRmFjY2VsZXJhdGVfY29uZmlncyUyRmRlZXBzcGVlZF96ZXJvMy55YW1sJTIwJTVDJTBBJTIwJTIwZXhhbXBsZXMlMkZzY3JpcHRzJTJGZ3Jwb19hZ2VudC5weSUyMCU1QyUwQSUyMCUyMC0tbW9kZWxfbmFtZV9vcl9wYXRoJTIwUXdlbiUyRlF3ZW4zLTAuNkIlMEElMjAlMjAuLi4=",highlighted:`accelerate launch \\
--config_file=examples/accelerate_configs/deepspeed_zero3.yaml \\
examples/scripts/grpo_agent.py \\
--model_name_or_path Qwen/Qwen3-0.6B
...`,wrap:!1}}),ee=new x({props:{title:"Vision-Language Model (VLM) Training",local:"vision-language-model-vlm-training",headingTag:"h2"}}),le=new x({props:{title:"Supported Models",local:"supported-models",headingTag:"h3"}}),me=new x({props:{title:"Quick Start",local:"quick-start",headingTag:"h3"}}),oe=new J({props:{code:"YWNjZWxlcmF0ZSUyMGxhdW5jaCUyMCU1QyUwQSUyMCUyMC0tY29uZmlnX2ZpbGUlM0RleGFtcGxlcyUyRmFjY2VsZXJhdGVfY29uZmlncyUyRmRlZXBzcGVlZF96ZXJvMy55YW1sJTIwJTVDJTBBJTIwJTIwZXhhbXBsZXMlMkZzY3JpcHRzJTJGZ3Jwb192bG0ucHklMjAlNUMlMEElMjAlMjAtLW1vZGVsX25hbWVfb3JfcGF0aCUyMFF3ZW4lMkZRd2VuMi41LVZMLTNCLUluc3RydWN0JTIwJTVDJTBBJTIwJTIwLS1vdXRwdXRfZGlyJTIwZ3Jwby1Rd2VuMi41LVZMLTNCLUluc3RydWN0JTIwJTVDJTBBJTIwJTIwLS1sZWFybmluZ19yYXRlJTIwMWUtNSUyMCU1QyUwQSUyMCUyMC0tZHR5cGUlMjBiZmxvYXQxNiUyMCU1QyUwQSUyMCUyMC0tbWF4X2NvbXBsZXRpb25fbGVuZ3RoJTIwMTAyNCUyMCU1QyUwQSUyMCUyMC0tdXNlX3ZsbG0lMjAlNUMlMEElMjAlMjAtLXZsbG1fbW9kZSUyMGNvbG9jYXRlJTIwJTVDJTBBJTIwJTIwLS11c2VfcGVmdCUyMCU1QyUwQSUyMCUyMC0tbG9yYV90YXJnZXRfbW9kdWxlcyUyMCUyMnFfcHJvaiUyMiUyQyUyMCUyMnZfcHJvaiUyMiUyMCU1QyUwQSUyMCUyMC0tbG9nX2NvbXBsZXRpb25z",highlighted:`accelerate launch \\
--config_file=examples/accelerate_configs/deepspeed_zero3.yaml \\
examples/scripts/grpo_vlm.py \\
--model_name_or_path Qwen/Qwen2.5-VL-3B-Instruct \\
--output_dir grpo-Qwen2.5-VL-3B-Instruct \\
--learning_rate 1e-5 \\
--dtype bfloat16 \\
--max_completion_length 1024 \\
--use_vllm \\
--vllm_mode colocate \\
--use_peft \\
--lora_target_modules <span class="hljs-string">&quot;q_proj&quot;</span>, <span class="hljs-string">&quot;v_proj&quot;</span> \\
--log_completions`,wrap:!1}}),ce=new x({props:{title:"Configuration Tips",local:"configuration-tips",headingTag:"h3"}}),ge=new x({props:{title:"Dataset Format",local:"dataset-format",headingTag:"h3"}}),fe=new x({props:{title:"GRPOTrainer",local:"trl.GRPOTrainer",headingTag:"h2"}}),ve=new jn({props:{name:"class trl.GRPOTrainer",anchor:"trl.GRPOTrainer",parameters:[{name:"model",val:": str | PreTrainedModel | PeftModel"},{name:"reward_funcs",val:": str | transformers.modeling_utils.PreTrainedModel | collections.abc.Callable[[list, list], list[float]] | list[str | transformers.modeling_utils.PreTrainedModel | collections.abc.Callable[[list, list], list[float]]]"},{name:"args",val:": trl.trainer.grpo_config.GRPOConfig | None = None"},{name:"train_dataset",val:": datasets.arrow_dataset.Dataset | datasets.iterable_dataset.IterableDataset | None = None"},{name:"eval_dataset",val:": datasets.arrow_dataset.Dataset | datasets.iterable_dataset.IterableDataset | dict[str, datasets.arrow_dataset.Dataset | datasets.iterable_dataset.IterableDataset] | None = None"},{name:"processing_class",val:": transformers.tokenization_utils_base.PreTrainedTokenizerBase | transformers.processing_utils.ProcessorMixin | None = None"},{name:"reward_processing_classes",val:": transformers.tokenization_utils_base.PreTrainedTokenizerBase | list[transformers.tokenization_utils_base.PreTrainedTokenizerBase] | None = None"},{name:"callbacks",val:": list[transformers.trainer_callback.TrainerCallback] | None = None"},{name:"optimizers",val:": tuple = (None, None)"},{name:"peft_config",val:": PeftConfig | None = None"},{name:"tools",val:": list[collections.abc.Callable] | None = None"},{name:"rollout_func",val:": collections.abc.Callable[[list[str], 'GRPOTrainer'], dict[str, typing.Any]] | None = None"}],parametersDescription:[{anchor:"trl.GRPOTrainer.model",description:`<strong>model</strong> (<code>str</code> or <a href="https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel" rel="nofollow">PreTrainedModel</a> or <a href="https://huggingface.co/docs/peft/main/en/package_reference/peft_model#peft.PeftModel" rel="nofollow">PeftModel</a>) &#x2014;
Model to be trained. Can be either:</p>
<ul>
<li>A string, being the <em>model id</em> of a pretrained model hosted inside a model repo on huggingface.co, or a
path to a <em>directory</em> containing model weights saved using
<a href="https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.save_pretrained" rel="nofollow">save_pretrained</a>, e.g., <code>&apos;./my_model_directory/&apos;</code>. The model is loaded
using <code>&lt;ModelArchitecture&gt;.from_pretrained</code> (where <code>&lt;ModelArchitecture&gt;</code> is derived from the model
config) with the keyword arguments in <code>args.model_init_kwargs</code>.</li>
<li>A <a href="https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel" rel="nofollow">PreTrainedModel</a> object. Only causal language models are supported.</li>
<li>A <a href="https://huggingface.co/docs/peft/main/en/package_reference/peft_model#peft.PeftModel" rel="nofollow">PeftModel</a> object. Only causal language models are supported.</li>
</ul>`,name:"model"},{anchor:"trl.GRPOTrainer.reward_funcs",description:`<strong>reward_funcs</strong> (<code>RewardFunc | list[RewardFunc]</code>) &#x2014;
Reward functions to be used for computing the rewards. To compute the rewards, we call all the reward
functions with the prompts and completions and sum the rewards. Can be either:</p>
<ul>
<li>
<p>A single reward function, such as:</p>
<ul>
<li>
<p>A string: The <em>model ID</em> of a pretrained model hosted inside a model repo on huggingface.co, or a
path to a <em>directory</em> containing model weights saved using
<a href="https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.save_pretrained" rel="nofollow">save_pretrained</a>, e.g., <code>&apos;./my_model_directory/&apos;</code>. The model is loaded
using <a href="https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoModelForSequenceClassification.from_pretrained" rel="nofollow">from_pretrained</a> with <code>num_labels=1</code> and the
keyword arguments in <code>args.model_init_kwargs</code>.</p>
</li>
<li>
<p>A <a href="https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel" rel="nofollow">PreTrainedModel</a> object: Only sequence classification models are supported.</p>
</li>
<li>
<p>A custom reward function: The function is provided with the prompts and the generated completions,
plus any additional columns in the dataset. It should return a list of rewards. Custom reward
functions can be either synchronous or asynchronous and can also return <code>None</code> when the reward is
not applicable to those samples. This is useful for multi-task training where different reward
functions apply to different types of samples. When a reward function returns <code>None</code> for a sample,
that reward function is excluded from the reward calculation for that sample. For more details, see
<a href="#using-a-custom-reward-function">Using a custom reward
function</a>.</p>
<p>The trainer&#x2019;s state is also passed to the reward function. The trainer&#x2019;s state is an instance of
<a href="https://huggingface.co/docs/transformers/main/en/main_classes/callback#transformers.TrainerState" rel="nofollow">TrainerState</a> and can be accessed by accessing the <code>trainer_state</code> argument to the
reward function&#x2019;s signature.</p>
</li>
</ul>
</li>
<li>
<p>A list of reward functions, where each item can independently be any of the above types. Mixing different
types within the list (e.g., a string model ID and a custom reward function) is allowed.</p>
</li>
</ul>`,name:"reward_funcs"},{anchor:"trl.GRPOTrainer.args",description:`<strong>args</strong> (<a href="/docs/trl/pr_4949/en/grpo_trainer#trl.GRPOConfig">GRPOConfig</a>, <em>optional</em>) &#x2014;
Configuration for this trainer. If <code>None</code>, a default configuration is used.`,name:"args"},{anchor:"trl.GRPOTrainer.train_dataset",description:`<strong>train_dataset</strong> (<a href="https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset" rel="nofollow">Dataset</a> or <a href="https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.IterableDataset" rel="nofollow">IterableDataset</a>) &#x2014;
Dataset to use for training. It must include a column <code>&quot;prompt&quot;</code>. Any additional columns in the dataset is
ignored. The format of the samples can be either:</p>
<ul>
<li><a href="dataset_formats#standard">Standard</a>: Each sample contains plain text.</li>
<li><a href="dataset_formats#conversational">Conversational</a>: Each sample contains structured messages (e.g., role
and content).</li>
</ul>`,name:"train_dataset"},{anchor:"trl.GRPOTrainer.eval_dataset",description:`<strong>eval_dataset</strong> (<a href="https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset" rel="nofollow">Dataset</a>, <a href="https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.IterableDataset" rel="nofollow">IterableDataset</a> or <code>dict[str, Dataset | IterableDataset]</code>) &#x2014;
Dataset to use for evaluation. It must meet the same requirements as <code>train_dataset</code>.`,name:"eval_dataset"},{anchor:"trl.GRPOTrainer.processing_class",description:`<strong>processing_class</strong> (<a href="https://huggingface.co/docs/transformers/main/en/internal/tokenization_utils#transformers.PreTrainedTokenizerBase" rel="nofollow">PreTrainedTokenizerBase</a>, <a href="https://huggingface.co/docs/transformers/main/en/main_classes/processors#transformers.ProcessorMixin" rel="nofollow">ProcessorMixin</a>, <em>optional</em>) &#x2014;
Processing class used to process the data. The padding side must be set to &#x201C;left&#x201D;. If <code>None</code>, the
processing class is loaded from the model&#x2019;s name with <a href="https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoProcessor.from_pretrained" rel="nofollow">from_pretrained</a>. A
padding token, <code>tokenizer.pad_token</code>, must be set. If the processing class has not set a padding token,
<code>tokenizer.eos_token</code> will be used as the default.`,name:"processing_class"},{anchor:"trl.GRPOTrainer.reward_processing_classes",description:`<strong>reward_processing_classes</strong> (<a href="https://huggingface.co/docs/transformers/main/en/internal/tokenization_utils#transformers.PreTrainedTokenizerBase" rel="nofollow">PreTrainedTokenizerBase</a> or <code>list[PreTrainedTokenizerBase]</code>, <em>optional</em>) &#x2014;
Processing classes corresponding to the reward functions specified in <code>reward_funcs</code>. Can be either:</p>
<ul>
<li>A single processing class: Used when <code>reward_funcs</code> contains only one reward function.</li>
<li>A list of processing classes: Must match the order and length of the reward functions in <code>reward_funcs</code>.
If set to <code>None</code>, or if an element of the list corresponding to a <a href="https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel" rel="nofollow">PreTrainedModel</a> is
<code>None</code>, the tokenizer for the model is automatically loaded using
<a href="https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoTokenizer.from_pretrained" rel="nofollow">from_pretrained</a>. For elements in <code>reward_funcs</code> that are custom reward
functions (not <a href="https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel" rel="nofollow">PreTrainedModel</a>), the corresponding entries in <code>reward_processing_classes</code>
are ignored.</li>
</ul>`,name:"reward_processing_classes"},{anchor:"trl.GRPOTrainer.callbacks",description:`<strong>callbacks</strong> (list of <a href="https://huggingface.co/docs/transformers/main/en/main_classes/callback#transformers.TrainerCallback" rel="nofollow">TrainerCallback</a>, <em>optional</em>) &#x2014;
List of callbacks to customize the training loop. Will add those to the list of default callbacks detailed
in <a href="https://huggingface.co/docs/transformers/main_classes/callback" rel="nofollow">here</a>.</p>
<p>If you want to remove one of the default callbacks used, use the <a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.remove_callback" rel="nofollow">remove_callback</a>
method.`,name:"callbacks"},{anchor:"trl.GRPOTrainer.optimizers",description:`<strong>optimizers</strong> (<code>tuple[torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None]</code>, <em>optional</em>, defaults to <code>(None, None)</code>) &#x2014;
A tuple containing the optimizer and the scheduler to use. Will default to an instance of <code>AdamW</code> on your
model and a scheduler given by <a href="https://huggingface.co/docs/transformers/main/en/main_classes/optimizer_schedules#transformers.get_linear_schedule_with_warmup" rel="nofollow">get_linear_schedule_with_warmup</a> controlled by <code>args</code>.`,name:"optimizers"},{anchor:"trl.GRPOTrainer.peft_config",description:`<strong>peft_config</strong> (<a href="https://huggingface.co/docs/peft/main/en/package_reference/config#peft.PeftConfig" rel="nofollow">PeftConfig</a>, <em>optional</em>) &#x2014;
PEFT configuration used to wrap the model. If <code>None</code>, the model is not wrapped.`,name:"peft_config"},{anchor:"trl.GRPOTrainer.tools",description:`<strong>tools</strong> (list of <code>Callable</code>, <em>optional</em>) &#x2014;
A list of callable tool functions (sync or async) that the model can invoke during generation. Each tool
should be a standard Python function with properly type-hinted arguments and return values, and a
Google-style docstring describing its purpose, arguments, and return value. For more details, see:
<a href="https://huggingface.co/docs/transformers/en/chat_extras#passing-tools" rel="nofollow">https://huggingface.co/docs/transformers/en/chat_extras#passing-tools</a>. The model uses the function&#x2019;s name,
type hints, and docstring to determine how to call it. Ensure that the model&#x2019;s chat template supports tool
use and that it has been fine-tuned for tool calling.`,name:"tools"},{anchor:"trl.GRPOTrainer.rollout_func",description:`<strong>rollout_func</strong> (<code>RolloutFunc</code>, <em>optional</em>) &#x2014;
Function to use for generating completions. It receives the list of prompts allocated to the current
process and the trainer instance. It must return a dict with <code>&quot;prompt_ids&quot;</code>, <code>&quot;completion_ids&quot;</code>, and
<code>&quot;logprobs&quot;</code> fields. Any other fields are forwarded to the reward functions. This feature is experimental
and may change or be removed at any time without prior notice.`,name:"rollout_func"}],source:"https://github.com/huggingface/trl/blob/vr_4949/trl/trainer/grpo_trainer.py#L122"}}),Bs=new md({props:{anchor:"trl.GRPOTrainer.example",$$slots:{default:[rd]},$$scope:{ctx:Cn}}}),we=new jn({props:{name:"train",anchor:"trl.GRPOTrainer.train",parameters:[{name:"resume_from_checkpoint",val:": str | bool | None = None"},{name:"trial",val:": typing.Union[ForwardRef('optuna.Trial'), dict[str, typing.Any], NoneType] = None"},{name:"ignore_keys_for_eval",val:": list[str] | None = None"}],parametersDescription:[{anchor:"trl.GRPOTrainer.train.resume_from_checkpoint",description:`<strong>resume_from_checkpoint</strong> (<code>str</code> or <code>bool</code>, <em>optional</em>) &#x2014;
If a <code>str</code>, local path to a saved checkpoint as saved by a previous instance of <code>Trainer</code>. If a
<code>bool</code> and equals <code>True</code>, load the last checkpoint in <em>args.output_dir</em> as saved by a previous instance
of <code>Trainer</code>. If present, training will resume from the model/optimizer/scheduler states loaded here.`,name:"resume_from_checkpoint"},{anchor:"trl.GRPOTrainer.train.trial",description:`<strong>trial</strong> (<code>optuna.Trial</code> or <code>dict[str, Any]</code>, <em>optional</em>) &#x2014;
The trial run or the hyperparameter dictionary for hyperparameter search.`,name:"trial"},{anchor:"trl.GRPOTrainer.train.ignore_keys_for_eval",description:`<strong>ignore_keys_for_eval</strong> (<code>list[str]</code>, <em>optional</em>) &#x2014;
A list of keys in the output of your model (if it is a dictionary) that should be ignored when
gathering predictions for evaluation during the training.`,name:"ignore_keys_for_eval"}],source:"https://github.com/huggingface/trl/blob/vr_4949/transformers/trainer.py#L2070"}}),be=new jn({props:{name:"save_model",anchor:"trl.GRPOTrainer.save_model",parameters:[{name:"output_dir",val:": str | None = None"},{name:"_internal_call",val:": bool = False"}],source:"https://github.com/huggingface/trl/blob/vr_4949/transformers/trainer.py#L3993"}}),Me=new jn({props:{name:"push_to_hub",anchor:"trl.GRPOTrainer.push_to_hub",parameters:[{name:"commit_message",val:": str | None = 'End of training'"},{name:"blocking",val:": bool = True"},{name:"token",val:": str | None = None"},{name:"revision",val:": str | None = None"},{name:"**kwargs",val:""}],parametersDescription:[{anchor:"trl.GRPOTrainer.push_to_hub.commit_message",description:`<strong>commit_message</strong> (<code>str</code>, <em>optional</em>, defaults to <code>&quot;End of training&quot;</code>) &#x2014;
Message to commit while pushing.`,name:"commit_message"},{anchor:"trl.GRPOTrainer.push_to_hub.blocking",description:`<strong>blocking</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>True</code>) &#x2014;
Whether the function should return only when the <code>git push</code> has finished.`,name:"blocking"},{anchor:"trl.GRPOTrainer.push_to_hub.token",description:`<strong>token</strong> (<code>str</code>, <em>optional</em>, defaults to <code>None</code>) &#x2014;
Token with write permission to overwrite Trainer&#x2019;s original args.`,name:"token"},{anchor:"trl.GRPOTrainer.push_to_hub.revision",description:`<strong>revision</strong> (<code>str</code>, <em>optional</em>) &#x2014;
The git revision to commit from. Defaults to the head of the &#x201C;main&#x201D; branch.`,name:"revision"},{anchor:"trl.GRPOTrainer.push_to_hub.kwargs",description:`<strong>kwargs</strong> (<code>dict[str, Any]</code>, <em>optional</em>) &#x2014;
Additional keyword arguments passed along to <code>~Trainer.create_model_card</code>.`,name:"kwargs"}],source:"https://github.com/huggingface/trl/blob/vr_4949/transformers/trainer.py#L4902",returnDescription:`<script context="module">export const metadata = 'undefined';<\/script>
<p>The URL of the repository where the model was pushed if <code>blocking=False</code>, or a <code>Future</code> object tracking the
progress of the commit if <code>blocking=True</code>.</p>
`}}),_e=new x({props:{title:"GRPOConfig",local:"trl.GRPOConfig",headingTag:"h2"}}),Te=new jn({props:{name:"class trl.GRPOConfig",anchor:"trl.GRPOConfig",parameters:[{name:"output_dir",val:": str | None = None"},{name:"do_train",val:": bool = False"},{name:"do_eval",val:": bool = False"},{name:"do_predict",val:": bool = False"},{name:"eval_strategy",val:": transformers.trainer_utils.IntervalStrategy | str = 'no'"},{name:"prediction_loss_only",val:": bool = False"},{name:"per_device_train_batch_size",val:": int = 8"},{name:"per_device_eval_batch_size",val:": int = 8"},{name:"gradient_accumulation_steps",val:": int = 1"},{name:"eval_accumulation_steps",val:": int | None = None"},{name:"eval_delay",val:": float = 0"},{name:"torch_empty_cache_steps",val:": int | None = None"},{name:"learning_rate",val:": float = 1e-06"},{name:"weight_decay",val:": float = 0.0"},{name:"adam_beta1",val:": float = 0.9"},{name:"adam_beta2",val:": float = 0.999"},{name:"adam_epsilon",val:": float = 1e-08"},{name:"max_grad_norm",val:": float = 1.0"},{name:"num_train_epochs",val:": float = 3.0"},{name:"max_steps",val:": int = -1"},{name:"lr_scheduler_type",val:": transformers.trainer_utils.SchedulerType | str = 'linear'"},{name:"lr_scheduler_kwargs",val:": dict | str | None = None"},{name:"warmup_ratio",val:": float | None = None"},{name:"warmup_steps",val:": float = 0"},{name:"log_level",val:": str = 'passive'"},{name:"log_level_replica",val:": str = 'warning'"},{name:"log_on_each_node",val:": bool = True"},{name:"logging_dir",val:": str | None = None"},{name:"logging_strategy",val:": transformers.trainer_utils.IntervalStrategy | str = 'steps'"},{name:"logging_first_step",val:": bool = False"},{name:"logging_steps",val:": float = 10"},{name:"logging_nan_inf_filter",val:": bool = True"},{name:"save_strategy",val:": transformers.trainer_utils.SaveStrategy | str = 'steps'"},{name:"save_steps",val:": float = 500"},{name:"save_total_limit",val:": int | None = None"},{name:"enable_jit_checkpoint",val:": bool = False"},{name:"save_on_each_node",val:": bool = False"},{name:"save_only_model",val:": bool = False"},{name:"restore_callback_states_from_checkpoint",val:": bool = False"},{name:"use_cpu",val:": bool = False"},{name:"seed",val:": int = 42"},{name:"data_seed",val:": int | None = None"},{name:"bf16",val:": bool | None = None"},{name:"fp16",val:": bool = False"},{name:"bf16_full_eval",val:": bool = False"},{name:"fp16_full_eval",val:": bool = False"},{name:"tf32",val:": bool | None = None"},{name:"local_rank",val:": int = -1"},{name:"ddp_backend",val:": str | None = None"},{name:"debug",val:": str | list[transformers.debug_utils.DebugOption] = ''"},{name:"dataloader_drop_last",val:": bool = False"},{name:"eval_steps",val:": float | None = None"},{name:"dataloader_num_workers",val:": int = 0"},{name:"dataloader_prefetch_factor",val:": int | None = None"},{name:"run_name",val:": str | None = None"},{name:"disable_tqdm",val:": bool | None = None"},{name:"remove_unused_columns",val:": bool | None = False"},{name:"label_names",val:": list[str] | None = None"},{name:"load_best_model_at_end",val:": bool = False"},{name:"metric_for_best_model",val:": str | None = None"},{name:"greater_is_better",val:": bool | None = None"},{name:"ignore_data_skip",val:": bool = False"},{name:"fsdp",val:": list[transformers.trainer_utils.FSDPOption] | str | None = None"},{name:"fsdp_config",val:": dict[str, typing.Any] | str | None = None"},{name:"accelerator_config",val:": dict | str | None = None"},{name:"parallelism_config",val:": accelerate.parallelism_config.ParallelismConfig | None = None"},{name:"deepspeed",val:": dict | str | None = None"},{name:"label_smoothing_factor",val:": float = 0.0"},{name:"optim",val:": transformers.training_args.OptimizerNames | str = 'adamw_torch_fused'"},{name:"optim_args",val:": str | None = None"},{name:"group_by_length",val:": bool = False"},{name:"length_column_name",val:": str = 'length'"},{name:"report_to",val:": None | str | list[str] = 'none'"},{name:"project",val:": str = 'huggingface'"},{name:"trackio_space_id",val:": str | None = 'trackio'"},{name:"ddp_find_unused_parameters",val:": bool | None = None"},{name:"ddp_bucket_cap_mb",val:": int | None = None"},{name:"ddp_broadcast_buffers",val:": bool | None = None"},{name:"dataloader_pin_memory",val:": bool = True"},{name:"dataloader_persistent_workers",val:": bool = False"},{name:"skip_memory_metrics",val:": bool = True"},{name:"push_to_hub",val:": bool = False"},{name:"resume_from_checkpoint",val:": str | None = None"},{name:"hub_model_id",val:": str | None = None"},{name:"hub_strategy",val:": transformers.trainer_utils.HubStrategy | str = 'every_save'"},{name:"hub_token",val:": str | None = None"},{name:"hub_private_repo",val:": bool | None = None"},{name:"hub_always_push",val:": bool = False"},{name:"hub_revision",val:": str | None = None"},{name:"gradient_checkpointing",val:": bool = True"},{name:"gradient_checkpointing_kwargs",val:": dict[str, typing.Any] | str | None = None"},{name:"include_for_metrics",val:": list = <factory>"},{name:"eval_do_concat_batches",val:": bool = True"},{name:"auto_find_batch_size",val:": bool = False"},{name:"full_determinism",val:": bool = False"},{name:"ddp_timeout",val:": int = 1800"},{name:"torch_compile",val:": bool = False"},{name:"torch_compile_backend",val:": str | None = None"},{name:"torch_compile_mode",val:": str | None = None"},{name:"include_num_input_tokens_seen",val:": str | bool = 'no'"},{name:"neftune_noise_alpha",val:": float | None = None"},{name:"optim_target_modules",val:": None | str | list[str] = None"},{name:"batch_eval_metrics",val:": bool = False"},{name:"eval_on_start",val:": bool = False"},{name:"use_liger_kernel",val:": bool = False"},{name:"liger_kernel_config",val:": dict[str, bool] | None = None"},{name:"eval_use_gather_object",val:": bool = False"},{name:"average_tokens_across_devices",val:": bool = True"},{name:"use_cache",val:": bool = False"},{name:"model_init_kwargs",val:": dict | str | None = None"},{name:"disable_dropout",val:": bool = False"},{name:"cast_lm_head_to_fp32",val:": bool = False"},{name:"num_generations",val:": int | None = 8"},{name:"num_generations_eval",val:": int | None = None"},{name:"max_completion_length",val:": int | None = 256"},{name:"ds3_gather_for_generation",val:": bool = True"},{name:"shuffle_dataset",val:": bool | None = True"},{name:"generation_batch_size",val:": int | None = None"},{name:"steps_per_generation",val:": int | None = None"},{name:"temperature",val:": float = 1.0"},{name:"top_p",val:": float = 1.0"},{name:"top_k",val:": int = 0"},{name:"min_p",val:": float | None = None"},{name:"generation_kwargs",val:": dict | None = None"},{name:"chat_template_kwargs",val:": dict | None = None"},{name:"repetition_penalty",val:": float = 1.0"},{name:"use_transformers_paged",val:": bool = False"},{name:"cache_implementation",val:": str | None = None"},{name:"use_vllm",val:": bool = False"},{name:"vllm_mode",val:": str = 'server'"},{name:"vllm_model_impl",val:": str = 'vllm'"},{name:"vllm_enable_sleep_mode",val:": bool = False"},{name:"vllm_structured_outputs_regex",val:": str | None = None"},{name:"vllm_server_base_url",val:": str | None = None"},{name:"vllm_server_host",val:": str = '0.0.0.0'"},{name:"vllm_server_port",val:": int = 8000"},{name:"vllm_server_timeout",val:": float = 240.0"},{name:"vllm_group_port",val:": int = 51216"},{name:"vllm_gpu_memory_utilization",val:": float = 0.3"},{name:"vllm_max_model_length",val:": int | None = None"},{name:"vllm_tensor_parallel_size",val:": int = 1"},{name:"beta",val:": float = 0.0"},{name:"num_iterations",val:": int = 1"},{name:"epsilon",val:": float = 0.2"},{name:"delta",val:": float | None = None"},{name:"epsilon_high",val:": float | None = None"},{name:"sapo_temperature_neg",val:": float = 1.05"},{name:"sapo_temperature_pos",val:": float = 1.0"},{name:"importance_sampling_level",val:": str = 'token'"},{name:"reward_weights",val:": list[float] | None = None"},{name:"multi_objective_aggregation",val:": str = 'sum_then_normalize'"},{name:"scale_rewards",val:": str = 'group'"},{name:"loss_type",val:": str = 'dapo'"},{name:"mask_truncated_completions",val:": bool = False"},{name:"sync_ref_model",val:": bool = False"},{name:"ref_model_mixup_alpha",val:": float = 0.6"},{name:"ref_model_sync_steps",val:": int = 512"},{name:"top_entropy_quantile",val:": float = 1.0"},{name:"max_tool_calling_iterations",val:": int | None = None"},{name:"vllm_importance_sampling_correction",val:": bool = True"},{name:"vllm_importance_sampling_mode",val:": str = 'sequence_mask'"},{name:"vllm_importance_sampling_cap",val:": float = 3.0"},{name:"off_policy_mask_threshold",val:": float | None = None"},{name:"use_bias_correction_kl",val:": bool = False"},{name:"log_completions",val:": bool = False"},{name:"num_completions_to_print",val:": int | None = None"},{name:"log_unique_prompts",val:": bool = False"},{name:"log_completions_hub_repo",val:": str | None = None"}],source:"https://github.com/huggingface/trl/blob/vr_4949/trl/trainer/grpo_config.py#L23",parameterGroups:[{title:"Parameters that control the model and reference model",parametersDescription:[{anchor:"trl.GRPOConfig.model_init_kwargs",description:`<strong>model_init_kwargs</strong> (<code>str</code>, <code>dict[str, Any]</code>, <em>optional</em>) &#x2014;
Keyword arguments for <a href="https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoModelForCausalLM.from_pretrained" rel="nofollow">from_pretrained</a>, used when the <code>model</code>
argument of the <a href="/docs/trl/pr_4949/en/grpo_trainer#trl.GRPOTrainer">GRPOTrainer</a> is provided as a string.`,name:"model_init_kwargs"},{anchor:"trl.GRPOConfig.disable_dropout",description:`<strong>disable_dropout</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>False</code>) &#x2014;
Whether to disable dropout in the model. This is useful for training with a reference model, as it prevents
the model from generating different logprobs for the same input.`,name:"disable_dropout"},{anchor:"trl.GRPOConfig.cast_lm_head_to_fp32",description:`<strong>cast_lm_head_to_fp32</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>False</code>) &#x2014;
Whether to cast the language modeling head of the policy and reference models to float32. As recommended by
the <a href="https://huggingface.co/papers/2510.13786" rel="nofollow">ScaleRL</a> recipe. This flag is only supported when the model
has untied word embedding and language modeling head layers i.e. <code>tie_word_embeddings</code> in the model config
is False.`,name:"cast_lm_head_to_fp32"}]},{title:"Parameters that control the data preprocessing",parametersDescription:[{anchor:"trl.GRPOConfig.remove_unused_columns",description:`<strong>remove_unused_columns</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>False</code>) &#x2014;
Whether to only keep the column <code>&quot;prompt&quot;</code> in the dataset. If you use a custom reward function that
requires any column other than <code>&quot;prompts&quot;</code> and <code>&quot;completions&quot;</code>, you should keep this to <code>False</code>.`,name:"remove_unused_columns"},{anchor:"trl.GRPOConfig.num_generations",description:`<strong>num_generations</strong> (<code>int</code>, <em>optional</em>, defaults to <code>8</code>) &#x2014;
Number of generations per prompt to sample. The effective batch size (num_processes * per_device_batch_size
<ul>
<li>gradient_accumulation_steps) must be evenly divisible by this value.</li>
</ul>`,name:"num_generations"},{anchor:"trl.GRPOConfig.num_generations_eval",description:`<strong>num_generations_eval</strong> (<code>int</code> or <code>None</code>, <em>optional</em>) &#x2014;
Number of generations to sample during evaluation. This allows using fewer generations during evaluation to
save computation. If <code>None</code>, uses the value of <code>num_generations</code>.`,name:"num_generations_eval"},{anchor:"trl.GRPOConfig.max_completion_length",description:`<strong>max_completion_length</strong> (<code>int</code> or <code>None</code>, <em>optional</em>, defaults to <code>256</code>) &#x2014;
Maximum length of the generated completion.`,name:"max_completion_length"},{anchor:"trl.GRPOConfig.ds3_gather_for_generation",description:`<strong>ds3_gather_for_generation</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>True</code>) &#x2014;
This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation,
improving generation speed. However, disabling this option allows training models that exceed the VRAM
capacity of a single GPU, albeit at the cost of slower generation. Disabling this option is not compatible
with vLLM generation.`,name:"ds3_gather_for_generation"},{anchor:"trl.GRPOConfig.shuffle_dataset",description:`<strong>shuffle_dataset</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>True</code>) &#x2014;
Whether to shuffle the training dataset.`,name:"shuffle_dataset"}]},{title:"Parameters that control generation",parametersDescription:[{anchor:"trl.GRPOConfig.generation_batch_size",description:`<strong>generation_batch_size</strong> &#x2014; (<code>int</code>, <em>optional</em>):
Batch size to use for generation. If <code>None</code>, it defaults to the effective training batch size:
<code>per_device_train_batch_size * num_processes * steps_per_generation</code>. In other words, there is one
generation batch processed per optimization step. Mutually exclusive with <code>steps_per_generation</code>.`,name:"generation_batch_size"},{anchor:"trl.GRPOConfig.steps_per_generation",description:`<strong>steps_per_generation</strong> &#x2014; (<code>int</code>, <em>optional</em>):
Number of steps per generation. If <code>None</code>, it defaults to <code>gradient_accumulation_steps</code>. Mutually exclusive
with <code>generation_batch_size</code>.`,name:"steps_per_generation"},{anchor:"trl.GRPOConfig.temperature",description:`<strong>temperature</strong> (<code>float</code>, defaults to <code>1.0</code>) &#x2014;
Temperature for sampling. The higher the temperature, the more random the completions.`,name:"temperature"},{anchor:"trl.GRPOConfig.top_p",description:`<strong>top_p</strong> (<code>float</code>, <em>optional</em>, defaults to <code>1.0</code>) &#x2014;
Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. Set to
<code>1.0</code> to consider all tokens.`,name:"top_p"},{anchor:"trl.GRPOConfig.top_k",description:`<strong>top_k</strong> (<code>int</code>, <em>optional</em>, defaults to <code>0</code>) &#x2014;
Number of highest probability vocabulary tokens to keep for top-k-filtering. If <code>0</code>, top-k-filtering is
disabled and all tokens are considered.`,name:"top_k"},{anchor:"trl.GRPOConfig.min_p",description:`<strong>min_p</strong> (<code>float</code>, <em>optional</em>) &#x2014;
Minimum token probability, which will be scaled by the probability of the most likely token. It must be a
value between <code>0.0</code> and <code>1.0</code>. Typical values are in the <code>0.01-0.2</code> range.`,name:"min_p"},{anchor:"trl.GRPOConfig.generation_kwargs",description:`<strong>generation_kwargs</strong> (<code>dict[str, Any]</code>, <em>optional</em>) &#x2014;
Additional keyword arguments to pass to <a href="https://huggingface.co/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationConfig" rel="nofollow">GenerationConfig</a> (if using transformers) or
<code>SamplingParams</code> (if using vLLM) when sampling completions. This can be used to further customize the
generation behavior, such as setting <code>suppress_tokens</code>, <code>num_beams</code>, etc. If it contains keys that conflict
with the other generation parameters (like <code>min_p</code>, <code>top_p</code>, etc.), they will override them.`,name:"generation_kwargs"},{anchor:"trl.GRPOConfig.chat_template_kwargs",description:`<strong>chat_template_kwargs</strong> (<code>dict[str, Any]</code>, <em>optional</em>) &#x2014;
Additional keyword arguments to pass to the <code>apply_chat_template</code> function when generating completions.`,name:"chat_template_kwargs"},{anchor:"trl.GRPOConfig.repetition_penalty",description:`<strong>repetition_penalty</strong> (<code>float</code>, <em>optional</em>, defaults to <code>1.0</code>) &#x2014;
Float that penalizes new tokens based on whether they appear in the prompt and the generated text so far.
Values &gt; <code>1.0</code> encourage the model to use new tokens, while values &lt; <code>1.0</code> encourage the model to repeat
tokens.`,name:"repetition_penalty"},{anchor:"trl.GRPOConfig.use_transformers_paged",description:`<strong>use_transformers_paged</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>False</code>) &#x2014;
Whether to use the <code>transformers</code> paged implementation for generation. If set to <code>True</code>, the <code>transformers</code>
paged implementation will be used for generation instead of the default padded implementation. This
parameter is only effective when <code>use_vllm</code> is set to <code>False</code>.`,name:"use_transformers_paged"},{anchor:"trl.GRPOConfig.cache_implementation",description:`<strong>cache_implementation</strong> (<code>str</code>, <em>optional</em>) &#x2014;
Implementation of the cache method for faster generation when <code>use_vllm</code> is set to <code>False</code>.`,name:"cache_implementation"}]},{title:"Parameters that control generation acceleration powered by vLLM",parametersDescription:[{anchor:"trl.GRPOConfig.use_vllm",description:`<strong>use_vllm</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>False</code>) &#x2014;
Whether to use vLLM for generating completions. If set to <code>True</code>, the trainer will use vLLM for generation
instead of the default model.generate(). Requires <code>vllm</code> to be installed.`,name:"use_vllm"},{anchor:"trl.GRPOConfig.vllm_mode",description:`<strong>vllm_mode</strong> (<code>str</code>, <em>optional</em>, defaults to <code>&quot;server&quot;</code>) &#x2014;
Mode to use for vLLM integration when <code>use_vllm</code> is set to <code>True</code>. Must be one of <code>&quot;server&quot;</code> or
<code>&quot;colocate&quot;</code>.</p>
<ul>
<li><code>&quot;server&quot;</code>: The trainer will send generation requests to a separate vLLM server. Make sure a TRL vLLM
server is running (start with <code>trl vllm-serve</code>).</li>
<li><code>&quot;colocate&quot;</code>: vLLM will run in the same process and share the training GPUs. This avoids the need for a
separate server but may cause resource contention with training.</li>
</ul>`,name:"vllm_mode"},{anchor:"trl.GRPOConfig.vllm_model_impl",description:`<strong>vllm_model_impl</strong> (<code>str</code>, <em>optional</em>, defaults to <code>&quot;vllm&quot;</code>) &#x2014;
Model implementation to use for vLLM. Must be one of <code>&quot;transformers&quot;</code> or <code>&quot;vllm&quot;</code>. <code>&quot;transformers&quot;</code>: Use
the <code>transformers</code> backend for model implementation. <code>&quot;vllm&quot;</code>: Use the <code>vllm</code> library for model
implementation.`,name:"vllm_model_impl"},{anchor:"trl.GRPOConfig.vllm_structured_outputs_regex",description:`<strong>vllm_structured_outputs_regex</strong> (<code>str</code>, <em>optional</em>) &#x2014;
Regex for vLLM structured outputs. If <code>None</code> (default), structured outputs is disabled.`,name:"vllm_structured_outputs_regex"}]},{title:'Parameters that control the vLLM server (only used when `vllm_mode` is `"server"`)',parametersDescription:[{anchor:"trl.GRPOConfig.vllm_server_base_url",description:`<strong>vllm_server_base_url</strong> (<code>str</code>, <em>optional</em>) &#x2014;
Base URL for the vLLM server (e.g., <code>&quot;http://localhost:8000&quot;</code>). If provided, <code>vllm_server_host</code> and
<code>vllm_server_port</code> are ignored.`,name:"vllm_server_base_url"},{anchor:"trl.GRPOConfig.vllm_server_host",description:`<strong>vllm_server_host</strong> (<code>str</code>, <em>optional</em>, defaults to <code>&quot;0.0.0.0&quot;</code>) &#x2014;
Host of the vLLM server to connect to. Ignored if <code>vllm_server_base_url</code> is provided.`,name:"vllm_server_host"},{anchor:"trl.GRPOConfig.vllm_server_port",description:`<strong>vllm_server_port</strong> (<code>int</code>, <em>optional</em>, defaults to <code>8000</code>) &#x2014;
Port of the vLLM server to connect to. Ignored if <code>vllm_server_base_url</code> is provided.`,name:"vllm_server_port"},{anchor:"trl.GRPOConfig.vllm_server_timeout",description:`<strong>vllm_server_timeout</strong> (<code>float</code>, <em>optional</em>, defaults to <code>240.0</code>) &#x2014;
Total timeout duration in seconds to wait for the vLLM server to be up. If the server is not up after the
timeout, a <code>ConnectionError</code> is raised.`,name:"vllm_server_timeout"},{anchor:"trl.GRPOConfig.vllm_group_port",description:`<strong>vllm_group_port</strong> (<code>int</code>, <em>optional</em>, defaults to <code>51216</code>) &#x2014;
Port number for the weight update group. This is used to communicate with the vLLM server. Unless the port
is occupied, there is no need to change it.`,name:"vllm_group_port"}]},{title:'Parameters that control colocated vLLM execution (only used when `vllm_mode` is `"colocate"`)',parametersDescription:[{anchor:"trl.GRPOConfig.vllm_gpu_memory_utilization",description:`<strong>vllm_gpu_memory_utilization</strong> (<code>float</code>, <em>optional</em>, defaults to <code>0.3</code>) &#x2014;
Control the GPU memory utilization for vLLM. This setting only applies when <code>vllm_mode</code> is set to
<code>&quot;colocate&quot;</code>. If you are using <code>vllm_mode=&quot;server&quot;</code>, this parameter must be passed separately when
launching the vLLM server via the <code>--vllm_gpu_memory_utilization</code> flag.`,name:"vllm_gpu_memory_utilization"},{anchor:"trl.GRPOConfig.vllm_max_model_length",description:`<strong>vllm_max_model_length</strong> (<code>int</code>, <em>optional</em>) &#x2014;
Context window for vLLM. Set it to at least the maximum prompt length in the dataset plus
<code>max_completion_length</code>; if omitted, it is inferred from the model config.`,name:"vllm_max_model_length"},{anchor:"trl.GRPOConfig.vllm_tensor_parallel_size",description:`<strong>vllm_tensor_parallel_size</strong> (<code>int</code>, <em>optional</em>, defaults to <code>1</code>) &#x2014;
Control the tensor parallel size for vLLM. This setting only applies when <code>vllm_mode</code> is set to
<code>&quot;colocate&quot;</code>. If you are using <code>vllm_mode=&quot;server&quot;</code>, this parameter must be passed separately when
launching the vLLM server via the <code>--vllm_tensor_parallel_size</code> flag.`,name:"vllm_tensor_parallel_size"},{anchor:"trl.GRPOConfig.vllm_enable_sleep_mode",description:`<strong>vllm_enable_sleep_mode</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>False</code>) &#x2014;
Enable vLLM sleep mode to offload weights/cache during the optimizer step. Keeps GPU memory usage low, but
waking the engine adds host&#x2013;device transfer latency.`,name:"vllm_enable_sleep_mode"}]},{title:"Parameters that control the training",parametersDescription:[{anchor:"trl.GRPOConfig.beta",description:`<strong>beta</strong> (<code>float</code>, <em>optional</em>, defaults to <code>0.0</code>) &#x2014;
KL coefficient. If <code>0.0</code> (default), the reference model is not loaded, reducing memory usage and improving
training speed. <a href="https://huggingface.co/papers/2501.12948" rel="nofollow">DeepSeek-R1 incentivizes reasoning in LLMs through reinforcement
learning</a> use a value of <code>0.001</code>.`,name:"beta"},{anchor:"trl.GRPOConfig.num_iterations",description:`<strong>num_iterations</strong> (<code>int</code>, <em>optional</em>, defaults to <code>1</code>) &#x2014;
Number of iterations per batch (denoted as &#x3BC; in the algorithm).`,name:"num_iterations"},{anchor:"trl.GRPOConfig.epsilon",description:`<strong>epsilon</strong> (<code>float</code>, <em>optional</em>, defaults to <code>0.2</code>) &#x2014;
Epsilon value for clipping.`,name:"epsilon"},{anchor:"trl.GRPOConfig.delta",description:`<strong>delta</strong> (<code>float</code>, <em>optional</em>) &#x2014;
Enables the upper clipping bound in two-sided GRPO loss when set to a float. If <code>None</code> (default), standard
GRPO clipping is used. Recommended to be greater than <code>1 + &#x3B5;</code> when enabled. This method is introduced in
the <a href="https://huggingface.co/papers/2505.07291" rel="nofollow">INTELLECT-2 tech report</a>.`,name:"delta"},{anchor:"trl.GRPOConfig.epsilon_high",description:`<strong>epsilon_high</strong> (<code>float</code>, <em>optional</em>) &#x2014;
Upper-bound epsilon value for clipping. If not specified, it defaults to the same value as the lower-bound
specified in argument <code>epsilon</code>. Paper <a href="https://huggingface.co/papers/2503.14476" rel="nofollow">DAPO</a> recommends <code>0.28</code>.
When used with <code>loss_type=&apos;cispo&apos;</code>, this corresponds to the &#x3B5;_max param specified in the <a href="https://huggingface.co/papers/2510.13786" rel="nofollow">ScaleRL
paper</a> and the recommended value is <code>5.0</code>.`,name:"epsilon_high"},{anchor:"trl.GRPOConfig.sapo_temperature_neg",description:`<strong>sapo_temperature_neg</strong> (<code>float</code>, <em>optional</em>, defaults to <code>1.05</code>) &#x2014;
Temperature for tokens with non-positive advantage scores used in the <code>sapo</code> loss function. This parameter
is introduced in the <a href="https://huggingface.co/papers/2511.20347" rel="nofollow">Soft Adaptive Policy Optimization paper</a>.`,name:"sapo_temperature_neg"},{anchor:"trl.GRPOConfig.sapo_temperature_pos",description:`<strong>sapo_temperature_pos</strong> (<code>float</code>, <em>optional</em>, defaults to <code>1.0</code>) &#x2014;
Temperature for tokens with positive advantage scores used in the <code>sapo</code> loss function. This parameter is
introduced in the <a href="https://huggingface.co/papers/2511.20347" rel="nofollow">Soft Adaptive Policy Optimization paper</a>.`,name:"sapo_temperature_pos"},{anchor:"trl.GRPOConfig.importance_sampling_level",description:`<strong>importance_sampling_level</strong> (<code>str</code>, <em>optional</em>, defaults to <code>&quot;token&quot;</code>) &#x2014;
Controls whether importance sampling ratios are computed at the <code>&quot;token&quot;</code> or <code>&quot;sequence&quot;</code> level. <code>&quot;token&quot;</code>
keeps the raw per-token log-probability ratios (one weight per token). <code>&quot;sequence&quot;</code> averages the
log-probability ratios across valid tokens to produce a single ratio per sequence. The <a href="https://huggingface.co/papers/2507.18071" rel="nofollow">GSPO
paper</a> shows that sequence-level sampling often yields more
stable training and better alignment with sequence-level rewards.`,name:"importance_sampling_level"},{anchor:"trl.GRPOConfig.reward_weights",description:`<strong>reward_weights</strong> (<code>list[float]</code>, <em>optional</em>) &#x2014;
Weights for each reward function. Must match the number of reward functions. If <code>None</code>, all rewards are
weighted equally with weight <code>1.0</code>.`,name:"reward_weights"},{anchor:"trl.GRPOConfig.multi_objective_aggregation",description:`<strong>multi_objective_aggregation</strong> (<code>str</code>, <em>optional</em>, defaults to <code>&quot;sum_then_normalize&quot;</code>) &#x2014;
Method to aggregate multiple reward functions. Supported values are:</p>
<ul>
<li><code>&quot;sum_then_normalize&quot;</code> (default): First sums the weighted rewards from each reward function, then applies
reward scaling/normalization as specified by <code>scale_rewards</code> (see <code>scale_rewards</code> for details).</li>
<li><code>&quot;normalize_then_sum&quot;</code>: First normalizes/scales each reward function across generations (within each
group), then sums the normalized rewards using the specified weights. The aggregated reward is then
normalized at the batch level when forming advantages. This is the suggested approach from the paper
<a href="https://huggingface.co/papers/2601.05242" rel="nofollow">GDPO: Group reward-Decoupled Normalization Policy Optimization for Multi-reward RL
Optimization</a>.</li>
</ul>`,name:"multi_objective_aggregation"},{anchor:"trl.GRPOConfig.scale_rewards",description:`<strong>scale_rewards</strong> (<code>str</code> or <code>bool</code>, <em>optional</em>, defaults to <code>&quot;group&quot;</code>) &#x2014;
Specifies the scaling strategy for rewards. Supported values are:</p>
<ul>
<li><code>True</code> or <code>&quot;group&quot;</code> (default): rewards are scaled by the standard deviation within each group, ensuring
unit variance within a group.</li>
<li><code>&quot;batch&quot;</code>: rewards are scaled by the standard deviation across the entire batch, as recommended in the
<a href="https://huggingface.co/papers/2508.08221" rel="nofollow">PPO Lite paper</a>.</li>
<li><code>False</code> or <code>&quot;none&quot;</code>: no scaling is applied. The <a href="https://huggingface.co/papers/2503.20783" rel="nofollow">Dr. GRPO
paper</a> recommends not scaling rewards, as scaling by the
standard deviation introduces a question-level difficulty bias.</li>
</ul>`,name:"scale_rewards"},{anchor:"trl.GRPOConfig.loss_type",description:`<strong>loss_type</strong> (<code>str</code>, <em>optional</em>, defaults to <code>&quot;dapo&quot;</code>) &#x2014;
Specifies the loss formulation to use. Supported values are:</p>
<ul>
<li><code>&quot;grpo&quot;</code>: Aggregates token-level losses by normalizing over sequence length. Not recommended due to
length bias&#x2014;this approach tends to prefer shorter completions with positive advantages and longer ones
with negative advantages.</li>
<li><code>&quot;dr_grpo&quot;</code>: Aggregates token-level losses by normalizing with a global constant. This method was
introduced in the <a href="https://huggingface.co/papers/2503.20783" rel="nofollow">Dr. GRPO paper</a> to eliminate length bias.
The value of the constant corresponds to <code>max_completion_length</code>.</li>
<li><code>&quot;dapo&quot;</code> (default): Aggregates token-level losses by normalizing with the number of active token in the
global accumulated batch. This method was introduced in the <a href="https://huggingface.co/papers/2503.14476" rel="nofollow">DAPO
paper</a> to eliminate length bias.</li>
<li><code>&quot;bnpo&quot;</code>: Aggregates token-level losses by normalizing with the number of active token in the local
batch. Note that normalization is performed over the local batch only, so results may slightly vary
depending on the local batch size, despite a constant effective batch size. When using
<code>per_device_train_batch_size==1</code>, the loss is equivalent to the GRPO loss.</li>
<li><code>&quot;cispo&quot;</code>: Clips the importance sampling weights instead of the advantage scaled importance weights. The
clipped weights are then multiplied with the advantages and policy model&#x2019;s log probs. Individual token
losses are aggregated by normalizing with the number of active tokens in the global accumulated batch.
This method was introduced in the <a href="https://huggingface.co/papers/2506.13585" rel="nofollow">MiniMax-M1 paper</a>.</li>
<li><code>&quot;sapo&quot;</code>: Soft Adaptive Policy Optimization loss, as introduced in the <a href="https://huggingface.co/papers/2506.13585" rel="nofollow">Soft Adaptive Policy Optimization
paper</a>. Replaces hard clipping with a smooth,
temperature-controlled gate that adaptively attenuates off-policy updates while preserving useful
learning signals.</li>
</ul>`,name:"loss_type"},{anchor:"trl.GRPOConfig.mask_truncated_completions",description:`<strong>mask_truncated_completions</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>False</code>) &#x2014;
When enabled, truncated completions are excluded from the loss calculation, preventing them from being
incorrectly penalized and introducing noise during training. According to the
<a href="https://huggingface.co/papers/2503.14476" rel="nofollow">DAPO</a> paper, this is a good practice for training stability.`,name:"mask_truncated_completions"},{anchor:"trl.GRPOConfig.sync_ref_model",description:`<strong>sync_ref_model</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>False</code>) &#x2014;
Whether to synchronize the reference model with the active model every <code>ref_model_sync_steps</code> steps, using
the <code>ref_model_mixup_alpha</code> parameter. This synchronization originates from the
<a href="https://huggingface.co/papers/2404.09656" rel="nofollow">TR-DPO</a> paper.`,name:"sync_ref_model"},{anchor:"trl.GRPOConfig.ref_model_mixup_alpha",description:`<strong>ref_model_mixup_alpha</strong> (<code>float</code>, <em>optional</em>, defaults to <code>0.6</code>) &#x2014;
&#x3B1; parameter from the <a href="https://huggingface.co/papers/2404.09656" rel="nofollow">TR-DPO</a> paper, which controls the mix
between the current policy and the previous reference policy during updates. The reference policy is
updated according to the equation: <code>&#x3C0;_ref = &#x3B1; * &#x3C0;_&#x3B8; + (1 - &#x3B1;) * &#x3C0;_ref_prev</code>. To use this parameter, you
must set <code>sync_ref_model=True</code>.`,name:"ref_model_mixup_alpha"},{anchor:"trl.GRPOConfig.ref_model_sync_steps",description:`<strong>ref_model_sync_steps</strong> (<code>int</code>, <em>optional</em>, defaults to <code>512</code>) &#x2014;
&#x3C4; parameter from the <a href="https://huggingface.co/papers/2404.09656" rel="nofollow">TR-DPO</a> paper, which determines how
frequently the current policy is synchronized with the reference policy. To use this parameter, you must
set <code>sync_ref_model=True</code>.`,name:"ref_model_sync_steps"},{anchor:"trl.GRPOConfig.top_entropy_quantile",description:`<strong>top_entropy_quantile</strong> (<code>float</code>, <em>optional</em>, defaults to <code>1.0</code>) &#x2014;
&#x3C1; parameter from <a href="https://huggingface.co/papers/2506.01939" rel="nofollow">Beyond the 80/20 Rule</a>. Keeps in the policy
loss term only the top-&#x3C1; quantile of tokens by entropy of the probability distribution at each sequence
position, improving results. Range: <code>[0.0-1.0]</code>. A value of <code>0.0</code> masks all but the highest entropy token;
<code>1.0</code> keeps all tokens. The paper recommends a value of <code>0.2</code>. If used with
<code>mask_truncated_completions=True</code>, only tokens from non-truncated completions are considered.`,name:"top_entropy_quantile"},{anchor:"trl.GRPOConfig.max_tool_calling_iterations",description:`<strong>max_tool_calling_iterations</strong> (<code>int</code>, <em>optional</em>) &#x2014;
Maximum number of tool-calling turns when training an agent. If <code>None</code>, there is no limit and generation
stops when the model generates a response turn with no tool calls or when the total response length reaches
<code>max_model_length</code>.`,name:"max_tool_calling_iterations"},{anchor:"trl.GRPOConfig.vllm_importance_sampling_correction",description:`<strong>vllm_importance_sampling_correction</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>True</code>) &#x2014;
Whether to apply Importance Sampling (IS) to correct for the mismatch between vLLM completion logprobs and
recomputed training logprobs. If set to <code>False</code>, no IS is applied regardless of
<code>vllm_importance_sampling_mode</code>. When <code>True</code>, the selected mode determines how the IS ratios are computed
and constrained.`,name:"vllm_importance_sampling_correction"},{anchor:"trl.GRPOConfig.vllm_importance_sampling_mode",description:`<strong>vllm_importance_sampling_mode</strong> (<code>str</code>, <em>optional</em>, defaults to <code>&quot;sequence_mask&quot;</code>) &#x2014;
Specifies how Importance Sampling is performed when <code>vllm_importance_sampling_correction=True</code>. Possible
values are:</p>
<ul>
<li><code>&quot;token_truncate&quot;</code>: Token-level truncated IS (default). Per-token ratios are clipped from above at C.</li>
<li><code>&quot;token_mask&quot;</code>: Token-level masked IS. Per-token ratios above C are set to zero.</li>
<li><code>&quot;sequence_truncate&quot;</code>: Sequence-level truncated IS. A single sequence ratio is clipped from above at
C and applied to all tokens in the sequence.</li>
<li><code>&quot;sequence_mask&quot;</code>: Sequence-level masked IS. Sequences with ratios above C are masked out.</li>
</ul>`,name:"vllm_importance_sampling_mode"},{anchor:"trl.GRPOConfig.vllm_importance_sampling_cap",description:`<strong>vllm_importance_sampling_cap</strong> (<code>float</code>, <em>optional</em>, defaults to <code>3.0</code>) &#x2014;
Importance sampling cap C used by <code>vllm_importance_sampling_mode</code>. For <code>*_truncate</code> modes, importance
ratios are clipped from above at C. For <code>*_mask</code> modes, ratios larger than C are set to zero.`,name:"vllm_importance_sampling_cap"},{anchor:"trl.GRPOConfig.off_policy_mask_threshold",description:`<strong>off_policy_mask_threshold</strong> (<code>float</code>, <em>optional</em>) &#x2014;
Threshold for off-policy sequence masking. If <code>None</code>, off-policy sequence masking is disabled. When set,
sequences with negative advantages and high KL divergence are masked out to stabilize training. This
parameter corresponds to the <code>delta</code> threshold in Equation 9 of the <a href="https://huggingface.co/papers/2512.02556" rel="nofollow">DeepSeek-V3.2
paper</a>. It expects a positive value (e.g., 0.5).`,name:"off_policy_mask_threshold"},{anchor:"trl.GRPOConfig.use_bias_correction_kl",description:`<strong>use_bias_correction_kl</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>False</code>) &#x2014;
Whether to use the unbiased KL divergence estimator with importance sampling correction. This corrects the
KL divergence estimate by multiplying it with the importance sampling ratio. This is described in the
<a href="https://huggingface.co/papers/2512.02556" rel="nofollow">DeepSeek-V3.2 paper</a>.`,name:"use_bias_correction_kl"}]},{title:"Parameters that control the logging",parametersDescription:[{anchor:"trl.GRPOConfig.log_completions",description:`<strong>log_completions</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>False</code>) &#x2014;
Whether to log a sample of (prompt, completion) pairs every <code>logging_steps</code> steps. If <code>rich</code> is installed,
it prints the sample. If <code>wandb</code> and/or <code>trackio</code> logging is enabled, it logs it to <code>wandb</code> and/or
<code>trackio</code>.`,name:"log_completions"},{anchor:"trl.GRPOConfig.num_completions_to_print",description:`<strong>num_completions_to_print</strong> (<code>int</code>, <em>optional</em>) &#x2014;
Number of completions to print with <code>rich</code>. If <code>None</code>, all completions are logged.`,name:"num_completions_to_print"},{anchor:"trl.GRPOConfig.log_unique_prompts",description:`<strong>log_unique_prompts</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>False</code>) &#x2014;
Whether to log unique prompts. If <code>True</code>, only unique prompts are logged. If <code>False</code>, all prompts are
logged.`,name:"log_unique_prompts"},{anchor:"trl.GRPOConfig.log_completions_hub_repo",description:`<strong>log_completions_hub_repo</strong> (<code>str</code>, <em>optional</em>) &#x2014;
Hugging Face Hub repository to save the completions. Should be a complete repository name like
<code>&apos;username/reponame&apos;</code> or <code>&apos;orgname/reponame&apos;</code>, or just <code>&apos;reponame&apos;</code> in which case the repository will be
created in the currently-logged-in Hugging Face user&#x2019;s namespace. Note that this repository will be public
unless you set <code>hub_private_repo=True</code> or your organization&#x2019;s default is to create private repositories.&#x201D;`,name:"log_completions_hub_repo"}]}]}}),xe=new id({props:{source:"https://github.com/huggingface/trl/blob/main/docs/source/grpo_trainer.md"}}),{c(){z=i("meta"),Es=n(),W=i("p"),P=n(),h(E.$$.fragment),j=n(),h(Q.$$.fragment),zn=n(),Zs=i("p"),Zs.innerHTML=Go,kn=n(),h(Ss.$$.fragment),In=n(),Qs=i("p"),Qs.innerHTML=Ro,Gn=n(),Hs=i("p"),Hs.textContent=$o,Rn=n(),Xs=i("blockquote"),Xs.innerHTML=Lo,$n=n(),Os=i("p"),Os.innerHTML=No,Ln=n(),h(Ws.$$.fragment),Nn=n(),Vs=i("p"),Vs.innerHTML=Bo,Bn=n(),V=i("iframe"),qn=n(),Fs=i("p"),Fs.textContent=Ao,An=n(),h(Ys.$$.fragment),Pn=n(),Ds=i("p"),Ds.textContent=Po,En=n(),h(Ks.$$.fragment),Zn=n(),sa=i("p"),sa.textContent=Eo,Sn=n(),aa=i("p"),aa.innerHTML=Zo,Qn=n(),h(ta.$$.fragment),Hn=n(),ea=i("p"),ea.innerHTML=So,Xn=n(),na=i("p"),na.innerHTML=Qo,On=n(),h(la.$$.fragment),Wn=n(),F=i("p"),Om=o("At each training step, we sample a batch of prompts and generate a set of "),Vn=new b(!1),Fn=o(" completions for each prompt (denoted as "),Yn=new b(!1),Dn=o(")."),Kn=n(),h(pa.$$.fragment),sl=n(),ps=i("p"),Wm=o("For each of the "),al=new b(!1),tl=o(` sequences, we compute the reward using a reward model or reward function. To align with the comparative nature of reward models—typically trained on datasets of comparisons between outputs for the same question—the advantage is calculated to reflect these relative comparisons. It is normalized as follows:
`),el=new b(!1),nl=n(),ia=i("p"),ia.innerHTML=Ho,ll=n(),ma=i("blockquote"),L=i("p"),Vm=o("It was shown in the paper "),ds=i("a"),ds.textContent=Xo,Fm=o(" that scaling by "),pl=new b(!1),il=o(" may cause a question-level difficulty bias. You can disable this scaling by setting "),Ie=i("code"),Ie.textContent=Oo,Ym=o(" in "),ra=i("a"),ra.textContent=Wo,Dm=o(`.
Note that turning off std-based scaling also removes variance normalization, so update magnitudes depend directly on the raw reward scale and batch composition.`),ml=n(),us=i("blockquote"),us.innerHTML=Vo,rl=n(),h(oa.$$.fragment),ol=n(),is=i("p"),Km=o("KL divergence is estimated using the approximator introduced by "),ys=i("a"),ys.textContent=Fo,sr=o(`. The approximator is defined as follows:
`),cl=new b(!1),hl=n(),h(ca.$$.fragment),gl=n(),ha=i("p"),ar=o(`The objective is to maximize the advantage while ensuring that the model remains close to the reference policy. Consequently, the loss is defined as follows:
`),dl=new b(!1),ul=n(),ga=i("p"),ga.textContent=Yo,yl=n(),da=i("blockquote"),N=i("p"),tr=o("Note that compared to the original formulation in "),fs=i("a"),fs.textContent=Do,er=o(", we don’t scale by "),fl=new b(!1),vl=o(" because it was shown in the paper "),vs=i("a"),vs.textContent=Ko,nr=o(" that this introduces a response-level length bias. More details in "),ua=i("a"),ua.textContent=sc,lr=o("."),wl=n(),ya=i("blockquote"),C=i("p"),pr=o("Note that compared to the original formulation in "),ws=i("a"),ws.textContent=ac,ir=o(", we use "),bl=new b(!1),Ml=o(" by default, meaning that the KL divergence term is not used. This choice is motivated by several recent studies (e.g., "),bs=i("a"),bs.textContent=tc,mr=o(") which have shown that the KL divergence term is not essential for training with GRPO. As a result, it has become common practice to exclude it (e.g. "),Ms=i("a"),Ms.textContent=ec,rr=o(", "),_s=i("a"),_s.textContent=nc,or=o("). If you wish to include the KL divergence term, you can set "),Ge=i("code"),Ge.textContent=lc,cr=o(" in "),fa=i("a"),fa.textContent=pc,hr=o(" to a non-zero value."),_l=n(),R=i("p"),gr=o("In the original paper, this formulation is generalized to account for multiple updates after each generation (denoted "),Tl=new b(!1),xl=o(", can be set with "),Re=i("code"),Re.textContent=ic,dr=o(" in "),va=i("a"),va.textContent=mc,ur=o(") by leveraging the "),$e=i("strong"),$e.textContent=rc,yr=o(`:
`),Jl=new b(!1),Ul=n(),$=i("p"),fr=o("where "),jl=new b(!1),Cl=o(" ensures that updates do not deviate excessively from the reference policy by bounding the policy ratio between "),zl=new b(!1),kl=o(" and "),Il=new b(!1),Gl=o(`.
When `),Rl=new b(!1),$l=o(" (default in TRL), the clipped surrogate objective simplifies to the original objective."),Ll=n(),h(wa.$$.fragment),Nl=n(),ba=i("p"),vr=o(`Several formulations of the objective have been proposed in the literature. Initially, the objective of GRPO was defined as follows:
`),Bl=new b(!1),ql=n(),Ma=i("p"),wr=o(`where
`),Al=new b(!1),Pl=n(),ms=i("p"),br=o("The "),Ts=i("a"),Ts.textContent=oc,Mr=o(` highlights the limitations of the GRPO algorithm’s sample-level loss in long-CoT scenarios, where longer responses are under-penalized, leading to poorer quality outputs. The proposed solution is a token-level normalization, which better handles longer sequences by assigning more balanced rewards to individual tokens, regardless of response length:
`),El=new b(!1),Zl=n(),_a=i("p"),_a.innerHTML=cc,Sl=n(),rs=i("p"),_r=o("Furthermore, it was demonstrated in the paper "),xs=i("a"),xs.textContent=hc,Tr=o(` that the initial GRPO formulation introduces a response length bias. They show that while the DAPO formulation reduces this bias, it does not eliminate it completely. To fully remove this bias, they propose dividing by a constant instead of the sequence length, resulting in the following formulation:
`),Ql=new b(!1),Hl=n(),Ta=i("p"),Ta.innerHTML=gc,Xl=n(),xa=i("p"),xa.innerHTML=dc,Ol=n(),Ja=i("p"),xr=o(`The loss function is defined as:
`),Wl=new b(!1),Vl=n(),H=i("p"),Jr=o("The soft-gating function "),Fl=new b(!1),Yl=o(" is defined using the sigmoid function "),Dl=new b(!1),Kl=o(` as:
`),sp=new b(!1),ap=n(),X=i("p"),Ur=o("The temperature "),tp=new b(!1),ep=o(" is chosen based on the sign of the advantage "),np=new b(!1),lp=o(`:
`),pp=new b(!1),ip=n(),Y=i("p"),jr=o("They recommends using asymmetric temperatures, "),mp=new b(!1),rp=o(" (defaults are "),op=new b(!1),cp=o(" ). This ensures that the model is penalized more strictly for “bad” actions to prevent instability, while being more permissive with “good” actions."),hp=n(),Ua=i("p"),Ua.innerHTML=uc,gp=n(),h(ja.$$.fragment),dp=n(),Ca=i("p"),Ca.textContent=yc,up=n(),w=i("ul"),Le=i("li"),Le.innerHTML=fc,Cr=n(),Ne=i("li"),Ne.innerHTML=vc,zr=n(),Be=i("li"),Be.innerHTML=wc,kr=n(),qe=i("li"),qe.innerHTML=bc,Ir=n(),Ae=i("li"),Ae.innerHTML=Mc,Gr=n(),Pe=i("li"),Pe.innerHTML=_c,Rr=n(),Ee=i("li"),Ee.innerHTML=Tc,$r=n(),Ze=i("li"),Ze.innerHTML=xc,Lr=n(),Se=i("li"),Se.innerHTML=Jc,Nr=n(),Qe=i("li"),Qe.innerHTML=Uc,Br=n(),He=i("li"),He.innerHTML=jc,qr=n(),Xe=i("li"),Xe.innerHTML=Cc,Ar=n(),Oe=i("li"),Oe.innerHTML=zc,Pr=n(),We=i("li"),We.innerHTML=kc,Er=n(),Ve=i("li"),Ve.innerHTML=Ic,Zr=n(),Fe=i("li"),Fe.innerHTML=Gc,Sr=n(),Z=i("li"),Ye=i("code"),Ye.textContent=Rc,Qr=o(": The ratio of token (or sequence, if "),De=i("code"),De.textContent=$c,Hr=o(") probabilities where the GRPO objective is clipped to stay within the trust region: "),yp=new b(!1),fp=o(". A higher value means more tokens are clipped, which constrains how much the policy "),vp=new b(!1),wp=o(" can change."),Xr=n(),D=i("li"),Ke=i("code"),Ke.textContent=Lc,Or=o(": The average ratio of token (or sequence, if "),sn=i("code"),sn.textContent=Nc,Wr=o(") probabilities that were clipped on the lower bound of the trust region: "),bp=new b(!1),Mp=o("."),Vr=n(),K=i("li"),an=i("code"),an.textContent=Bc,Fr=o(": The minimum ratio of token (or sequence, if "),tn=i("code"),tn.textContent=qc,Yr=o(") probabilities that were clipped on the lower bound of the trust region: "),_p=new b(!1),Tp=o("."),Dr=n(),ss=i("li"),en=i("code"),en.textContent=Ac,Kr=o(": The average ratio of token (or sequence, if "),nn=i("code"),nn.textContent=Pc,so=o(") probabilities that were clipped on the upper bound of the trust region: "),xp=new b(!1),Jp=o("."),ao=n(),as=i("li"),ln=i("code"),ln.textContent=Ec,to=o(": The maximum ratio of token (or sequence, if "),pn=i("code"),pn.textContent=Zc,eo=o(") probabilities that were clipped on the upper bound of the trust region: "),Up=new b(!1),jp=o("."),Cp=n(),h(za.$$.fragment),zp=n(),h(ka.$$.fragment),kp=n(),Ia=i("p"),Ia.innerHTML=Sc,Ip=n(),h(Ga.$$.fragment),Gp=n(),Ra=i("p"),Ra.innerHTML=Qc,Rp=n(),Js=i("blockquote"),Js.innerHTML=Hc,$p=n(),h($a.$$.fragment),Lp=n(),La=i("p"),La.textContent=Xc,Np=n(),Us=i("ol"),Na=i("li"),mn=i("p"),mn.innerHTML=Oc,no=n(),h(Ba.$$.fragment),lo=n(),qa=i("li"),rn=i("p"),rn.innerHTML=Wc,po=n(),h(Aa.$$.fragment),Bp=n(),js=i("blockquote"),js.innerHTML=Vc,qp=n(),h(Pa.$$.fragment),Ap=n(),Ea=i("p"),Ea.textContent=Fc,Pp=n(),h(Za.$$.fragment),Ep=n(),Cs=i("blockquote"),Cs.innerHTML=Yc,Zp=n(),zs=i("blockquote"),zs.innerHTML=Dc,Sp=n(),Sa=i("p"),Sa.innerHTML=Kc,Qp=n(),h(Qa.$$.fragment),Hp=n(),Ha=i("p"),Ha.textContent=sh,Xp=n(),B=i("p"),io=o("This mismatch leads to a biased gradient update which has been observed to destabilize training ("),ks=i("a"),ks.textContent=ah,Is=i("a"),Is.textContent=th,Gs=i("a"),Gs.textContent=eh,Rs=i("a"),Rs.textContent=nh,$s=i("a"),$s.textContent=lh,mo=o(`). For simplicity, consider the REINFORCE policy gradient:
`),Op=new b(!1),Wp=n(),q=i("p"),ro=o("Here "),Vp=new b(!1),Fp=o(" denotes prompts sampled from some data distribution, and "),Yp=new b(!1),Dp=o(" is the policy implemented by the training engine. With vLLM in the loop we obtain a separate inference policy "),Kp=new b(!1),si=o(`, so the effective policy gradient becomes
`),ai=new b(!1),ti=n(),Xa=i("p"),Xa.textContent=ph,ei=n(),U=i("p"),oo=o("The standard way to correct for this distribution shift is "),on=i("strong"),on.textContent=ih,co=o(". We provide two IS variants: "),Oa=i("a"),Oa.textContent=mh,ho=o(" and "),Wa=i("a"),Wa.textContent=rh,go=o(". Both variants can be applied either at the token level or at the sequence level.Let "),ni=new b(!1),li=o(" denote the importance weight, for example "),pi=new b(!1),ii=o(" per token or "),mi=new b(!1),ri=o(" per sequence. Under TIS, ratios larger than "),cn=i("code"),cn.textContent=oh,uo=o(` are clipped,
`),oi=new b(!1),ci=n(),Va=i("p"),Va.innerHTML=ch,hi=n(),Fa=i("p"),Fa.innerHTML=hh,gi=n(),h(Ya.$$.fragment),di=n(),Da=i("p"),Da.innerHTML=gh,ui=n(),Ka=i("ul"),Ka.innerHTML=dh,yi=n(),st=i("p"),st.textContent=uh,fi=n(),h(at.$$.fragment),vi=n(),h(tt.$$.fragment),wi=n(),h(et.$$.fragment),bi=n(),nt=i("p"),nt.innerHTML=yh,Mi=n(),lt=i("p"),lt.innerHTML=fh,_i=n(),pt=i("ol"),pt.innerHTML=vh,Ti=n(),h(it.$$.fragment),xi=n(),mt=i("p"),mt.textContent=wh,Ji=n(),h(rt.$$.fragment),Ui=n(),ot=i("p"),ot.textContent=bh,ji=n(),h(ct.$$.fragment),Ci=n(),h(ht.$$.fragment),zi=n(),gt=i("p"),gt.textContent=Mh,ki=n(),h(dt.$$.fragment),Ii=n(),ut=i("p"),ut.textContent=_h,Gi=n(),h(yt.$$.fragment),Ri=n(),h(ft.$$.fragment),$i=n(),vt=i("p"),vt.innerHTML=Th,Li=n(),h(wt.$$.fragment),Ni=n(),bt=i("p"),bt.textContent=xh,Bi=n(),h(Mt.$$.fragment),qi=n(),h(_t.$$.fragment),Ai=n(),Tt=i("p"),Tt.innerHTML=Jh,Pi=n(),h(xt.$$.fragment),Ei=n(),Jt=i("p"),Jt.textContent=Uh,Zi=n(),h(Ut.$$.fragment),Si=n(),h(jt.$$.fragment),Qi=n(),Ct=i("p"),Ct.innerHTML=jh,Hi=n(),h(zt.$$.fragment),Xi=n(),kt=i("p"),kt.innerHTML=Ch,Oi=n(),It=i("p"),It.innerHTML=zh,Wi=n(),h(Gt.$$.fragment),Vi=n(),Rt=i("p"),Rt.innerHTML=kh,Fi=n(),$t=i("p"),$t.textContent=Ih,Yi=n(),h(Lt.$$.fragment),Di=n(),h(Nt.$$.fragment),Ki=n(),Bt=i("p"),Bt.innerHTML=Gh,sm=n(),h(qt.$$.fragment),am=n(),At=i("p"),At.textContent=Rh,tm=n(),h(Pt.$$.fragment),em=n(),Et=i("p"),Et.innerHTML=$h,nm=n(),Zt=i("p"),Zt.innerHTML=Lh,lm=n(),h(St.$$.fragment),pm=n(),Qt=i("p"),Qt.innerHTML=Nh,im=n(),h(Ht.$$.fragment),mm=n(),Xt=i("p"),Xt.innerHTML=Bh,rm=n(),h(Ot.$$.fragment),om=n(),Wt=i("p"),Wt.innerHTML=qh,cm=n(),Vt=i("p"),Vt.textContent=Ah,hm=n(),h(Ft.$$.fragment),gm=n(),h(Yt.$$.fragment),dm=n(),Dt=i("p"),Dt.textContent=Ph,um=n(),Kt=i("ul"),Kt.innerHTML=Eh,ym=n(),Ls=i("blockquote"),Ls.innerHTML=Zh,fm=n(),h(se.$$.fragment),vm=n(),ae=i("p"),ae.innerHTML=Sh,wm=n(),h(te.$$.fragment),bm=n(),h(ee.$$.fragment),Mm=n(),ne=i("p"),ne.textContent=Qh,_m=n(),h(le.$$.fragment),Tm=n(),pe=i("p"),pe.textContent=Hh,xm=n(),ie=i("ul"),ie.innerHTML=Xh,Jm=n(),Ns=i("blockquote"),Ns.innerHTML=Oh,Um=n(),h(me.$$.fragment),jm=n(),re=i("p"),re.innerHTML=Wh,Cm=n(),h(oe.$$.fragment),zm=n(),h(ce.$$.fragment),km=n(),he=i("ul"),he.innerHTML=Vh,Im=n(),h(ge.$$.fragment),Gm=n(),de=i("p"),de.textContent=Fh,Rm=n(),ue=i("ul"),ue.innerHTML=Yh,$m=n(),ye=i("p"),ye.textContent=Dh,Lm=n(),h(fe.$$.fragment),Nm=n(),k=i("div"),h(ve.$$.fragment),yo=n(),hn=i("p"),hn.innerHTML=Kh,fo=n(),h(Bs.$$.fragment),vo=n(),qs=i("div"),h(we.$$.fragment),wo=n(),gn=i("p"),gn.textContent=sg,bo=n(),ts=i("div"),h(be.$$.fragment),Mo=n(),dn=i("p"),dn.innerHTML=ag,_o=n(),un=i("p"),un.textContent=tg,To=n(),As=i("div"),h(Me.$$.fragment),xo=n(),yn=i("p"),yn.innerHTML=eg,Bm=n(),h(_e.$$.fragment),qm=n(),A=i("div"),h(Te.$$.fragment),Jo=n(),fn=i("p"),fn.innerHTML=ng,Uo=n(),vn=i("p"),vn.innerHTML=lg,jo=n(),wn=i("p"),wn.innerHTML=pg,Am=n(),h(xe.$$.fragment),Pm=n(),_n=i("p"),this.h()},l(s){const a=ld("svelte-u9bgzb",document.head);z=m(a,"META",{name:!0,content:!0}),a.forEach(t),Es=l(s),W=m(s,"P",{}),T(W).forEach(t),P=l(s),g(E.$$.fragment,s),j=l(s),g(Q.$$.fragment,s),zn=l(s),Zs=m(s,"P",{"data-svelte-h":!0}),r(Zs)!=="svelte-5f4e9w"&&(Zs.innerHTML=Go),kn=l(s),g(Ss.$$.fragment,s),In=l(s),Qs=m(s,"P",{"data-svelte-h":!0}),r(Qs)!=="svelte-1btigzj"&&(Qs.innerHTML=Ro),Gn=l(s),Hs=m(s,"P",{"data-svelte-h":!0}),r(Hs)!=="svelte-vfdo9a"&&(Hs.textContent=$o),Rn=l(s),Xs=m(s,"BLOCKQUOTE",{"data-svelte-h":!0}),r(Xs)!=="svelte-maid5i"&&(Xs.innerHTML=Lo),$n=l(s),Os=m(s,"P",{"data-svelte-h":!0}),r(Os)!=="svelte-mxzj2f"&&(Os.innerHTML=No),Ln=l(s),g(Ws.$$.fragment,s),Nn=l(s),Vs=m(s,"P",{"data-svelte-h":!0}),r(Vs)!=="svelte-z6bzxd"&&(Vs.innerHTML=Bo),Bn=l(s),V=m(s,"IFRAME",{src:!0,frameborder:!0,width:!0,height:!0}),T(V).forEach(t),qn=l(s),Fs=m(s,"P",{"data-svelte-h":!0}),r(Fs)!=="svelte-fsw01e"&&(Fs.textContent=Ao),An=l(s),g(Ys.$$.fragment,s),Pn=l(s),Ds=m(s,"P",{"data-svelte-h":!0}),r(Ds)!=="svelte-15hino8"&&(Ds.textContent=Po),En=l(s),g(Ks.$$.fragment,s),Zn=l(s),sa=m(s,"P",{"data-svelte-h":!0}),r(sa)!=="svelte-1p32u9o"&&(sa.textContent=Eo),Sn=l(s),aa=m(s,"P",{"data-svelte-h":!0}),r(aa)!=="svelte-19pqwae"&&(aa.innerHTML=Zo),Qn=l(s),g(ta.$$.fragment,s),Hn=l(s),ea=m(s,"P",{"data-svelte-h":!0}),r(ea)!=="svelte-ir94jd"&&(ea.innerHTML=So),Xn=l(s),na=m(s,"P",{"data-svelte-h":!0}),r(na)!=="svelte-sxakza"&&(na.innerHTML=Qo),On=l(s),g(la.$$.fragment,s),Wn=l(s),F=m(s,"P",{});var os=T(F);Om=c(os,"At each training step, we sample a batch of prompts and generate a set of "),Vn=M(os,!1),Fn=c(os," completions for each prompt (denoted as "),Yn=M(os,!1),Dn=c(os,")."),os.forEach(t),Kn=l(s),g(pa.$$.fragment,s),sl=l(s),ps=m(s,"P",{});var Tn=T(ps);Wm=c(Tn,"For each of the "),al=M(Tn,!1),tl=c(Tn,` sequences, we compute the reward using a reward model or reward function. To align with the comparative nature of reward models—typically trained on datasets of comparisons between outputs for the same question—the advantage is calculated to reflect these relative comparisons. It is normalized as follows:
`),el=M(Tn,!1),Tn.forEach(t),nl=l(s),ia=m(s,"P",{"data-svelte-h":!0}),r(ia)!=="svelte-1gfu28r"&&(ia.innerHTML=Ho),ll=l(s),ma=m(s,"BLOCKQUOTE",{class:!0});var ig=T(ma);L=m(ig,"P",{});var es=T(L);Vm=c(es,"It was shown in the paper "),ds=m(es,"A",{href:!0,rel:!0,"data-svelte-h":!0}),r(ds)!=="svelte-u44a8w"&&(ds.textContent=Xo),Fm=c(es," that scaling by "),pl=M(es,!1),il=c(es," may cause a question-level difficulty bias. You can disable this scaling by setting "),Ie=m(es,"CODE",{"data-svelte-h":!0}),r(Ie)!=="svelte-bmfm87"&&(Ie.textContent=Oo),Ym=c(es," in "),ra=m(es,"A",{href:!0,"data-svelte-h":!0}),r(ra)!=="svelte-854xyl"&&(ra.textContent=Wo),Dm=c(es,`.
Note that turning off std-based scaling also removes variance normalization, so update magnitudes depend directly on the raw reward scale and batch composition.`),es.forEach(t),ig.forEach(t),ml=l(s),us=m(s,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),r(us)!=="svelte-1sz8ywd"&&(us.innerHTML=Vo),rl=l(s),g(oa.$$.fragment,s),ol=l(s),is=m(s,"P",{});var xn=T(is);Km=c(xn,"KL divergence is estimated using the approximator introduced by "),ys=m(xn,"A",{href:!0,rel:!0,"data-svelte-h":!0}),r(ys)!=="svelte-77j2bq"&&(ys.textContent=Fo),sr=c(xn,`. The approximator is defined as follows:
`),cl=M(xn,!1),xn.forEach(t),hl=l(s),g(ca.$$.fragment,s),gl=l(s),ha=m(s,"P",{});var Co=T(ha);ar=c(Co,`The objective is to maximize the advantage while ensuring that the model remains close to the reference policy. Consequently, the loss is defined as follows:
`),dl=M(Co,!1),Co.forEach(t),ul=l(s),ga=m(s,"P",{"data-svelte-h":!0}),r(ga)!=="svelte-lefg4k"&&(ga.textContent=Yo),yl=l(s),da=m(s,"BLOCKQUOTE",{class:!0});var mg=T(da);N=m(mg,"P",{});var ns=T(N);tr=c(ns,"Note that compared to the original formulation in "),fs=m(ns,"A",{href:!0,rel:!0,"data-svelte-h":!0}),r(fs)!=="svelte-16brarf"&&(fs.textContent=Do),er=c(ns,", we don’t scale by "),fl=M(ns,!1),vl=c(ns," because it was shown in the paper "),vs=m(ns,"A",{href:!0,rel:!0,"data-svelte-h":!0}),r(vs)!=="svelte-u44a8w"&&(vs.textContent=Ko),nr=c(ns," that this introduces a response-level length bias. More details in "),ua=m(ns,"A",{href:!0,"data-svelte-h":!0}),r(ua)!=="svelte-o61jf4"&&(ua.textContent=sc),lr=c(ns,"."),ns.forEach(t),mg.forEach(t),wl=l(s),ya=m(s,"BLOCKQUOTE",{class:!0});var rg=T(ya);C=m(rg,"P",{});var G=T(C);pr=c(G,"Note that compared to the original formulation in "),ws=m(G,"A",{href:!0,rel:!0,"data-svelte-h":!0}),r(ws)!=="svelte-16brarf"&&(ws.textContent=ac),ir=c(G,", we use "),bl=M(G,!1),Ml=c(G," by default, meaning that the KL divergence term is not used. This choice is motivated by several recent studies (e.g., "),bs=m(G,"A",{href:!0,rel:!0,"data-svelte-h":!0}),r(bs)!=="svelte-1tkhwi2"&&(bs.textContent=tc),mr=c(G,") which have shown that the KL divergence term is not essential for training with GRPO. As a result, it has become common practice to exclude it (e.g. "),Ms=m(G,"A",{href:!0,rel:!0,"data-svelte-h":!0}),r(Ms)!=="svelte-u44a8w"&&(Ms.textContent=ec),rr=c(G,", "),_s=m(G,"A",{href:!0,rel:!0,"data-svelte-h":!0}),r(_s)!=="svelte-1q851rp"&&(_s.textContent=nc),or=c(G,"). If you wish to include the KL divergence term, you can set "),Ge=m(G,"CODE",{"data-svelte-h":!0}),r(Ge)!=="svelte-ste5qq"&&(Ge.textContent=lc),cr=c(G," in "),fa=m(G,"A",{href:!0,"data-svelte-h":!0}),r(fa)!=="svelte-854xyl"&&(fa.textContent=pc),hr=c(G," to a non-zero value."),G.forEach(t),rg.forEach(t),_l=l(s),R=m(s,"P",{});var O=T(R);gr=c(O,"In the original paper, this formulation is generalized to account for multiple updates after each generation (denoted "),Tl=M(O,!1),xl=c(O,", can be set with "),Re=m(O,"CODE",{"data-svelte-h":!0}),r(Re)!=="svelte-1i6938t"&&(Re.textContent=ic),dr=c(O," in "),va=m(O,"A",{href:!0,"data-svelte-h":!0}),r(va)!=="svelte-854xyl"&&(va.textContent=mc),ur=c(O,") by leveraging the "),$e=m(O,"STRONG",{"data-svelte-h":!0}),r($e)!=="svelte-18krrzk"&&($e.textContent=rc),yr=c(O,`:
`),Jl=M(O,!1),O.forEach(t),Ul=l(s),$=m(s,"P",{});var ls=T($);fr=c(ls,"where "),jl=M(ls,!1),Cl=c(ls," ensures that updates do not deviate excessively from the reference policy by bounding the policy ratio between "),zl=M(ls,!1),kl=c(ls," and "),Il=M(ls,!1),Gl=c(ls,`.
When `),Rl=M(ls,!1),$l=c(ls," (default in TRL), the clipped surrogate objective simplifies to the original objective."),ls.forEach(t),Ll=l(s),g(wa.$$.fragment,s),Nl=l(s),ba=m(s,"P",{});var zo=T(ba);vr=c(zo,`Several formulations of the objective have been proposed in the literature. Initially, the objective of GRPO was defined as follows:
`),Bl=M(zo,!1),zo.forEach(t),ql=l(s),Ma=m(s,"P",{});var ko=T(Ma);wr=c(ko,`where
`),Al=M(ko,!1),ko.forEach(t),Pl=l(s),ms=m(s,"P",{});var Jn=T(ms);br=c(Jn,"The "),Ts=m(Jn,"A",{href:!0,rel:!0,"data-svelte-h":!0}),r(Ts)!=="svelte-oig0q8"&&(Ts.textContent=oc),Mr=c(Jn,` highlights the limitations of the GRPO algorithm’s sample-level loss in long-CoT scenarios, where longer responses are under-penalized, leading to poorer quality outputs. The proposed solution is a token-level normalization, which better handles longer sequences by assigning more balanced rewards to individual tokens, regardless of response length:
`),El=M(Jn,!1),Jn.forEach(t),Zl=l(s),_a=m(s,"P",{"data-svelte-h":!0}),r(_a)!=="svelte-13lker3"&&(_a.innerHTML=cc),Sl=l(s),rs=m(s,"P",{});var Un=T(rs);_r=c(Un,"Furthermore, it was demonstrated in the paper "),xs=m(Un,"A",{href:!0,rel:!0,"data-svelte-h":!0}),r(xs)!=="svelte-u44a8w"&&(xs.textContent=hc),Tr=c(Un,` that the initial GRPO formulation introduces a response length bias. They show that while the DAPO formulation reduces this bias, it does not eliminate it completely. To fully remove this bias, they propose dividing by a constant instead of the sequence length, resulting in the following formulation:
`),Ql=M(Un,!1),Un.forEach(t),Hl=l(s),Ta=m(s,"P",{"data-svelte-h":!0}),r(Ta)!=="svelte-wgwwz1"&&(Ta.innerHTML=gc),Xl=l(s),xa=m(s,"P",{"data-svelte-h":!0}),r(xa)!=="svelte-1yaaj0u"&&(xa.innerHTML=dc),Ol=l(s),Ja=m(s,"P",{});var Io=T(Ja);xr=c(Io,`The loss function is defined as:
`),Wl=M(Io,!1),Io.forEach(t),Vl=l(s),H=m(s,"P",{});var Je=T(H);Jr=c(Je,"The soft-gating function "),Fl=M(Je,!1),Yl=c(Je," is defined using the sigmoid function "),Dl=M(Je,!1),Kl=c(Je,` as:
`),sp=M(Je,!1),Je.forEach(t),ap=l(s),X=m(s,"P",{});var Ue=T(X);Ur=c(Ue,"The temperature "),tp=M(Ue,!1),ep=c(Ue," is chosen based on the sign of the advantage "),np=M(Ue,!1),lp=c(Ue,`:
`),pp=M(Ue,!1),Ue.forEach(t),ip=l(s),Y=m(s,"P",{});var bn=T(Y);jr=c(bn,"They recommends using asymmetric temperatures, "),mp=M(bn,!1),rp=c(bn," (defaults are "),op=M(bn,!1),cp=c(bn," ). This ensures that the model is penalized more strictly for “bad” actions to prevent instability, while being more permissive with “good” actions."),bn.forEach(t),hp=l(s),Ua=m(s,"P",{"data-svelte-h":!0}),r(Ua)!=="svelte-c1u97x"&&(Ua.innerHTML=uc),gp=l(s),g(ja.$$.fragment,s),dp=l(s),Ca=m(s,"P",{"data-svelte-h":!0}),r(Ca)!=="svelte-vze52b"&&(Ca.textContent=yc),up=l(s),w=m(s,"UL",{});var _=T(w);Le=m(_,"LI",{"data-svelte-h":!0}),r(Le)!=="svelte-hhacpu"&&(Le.innerHTML=fc),Cr=l(_),Ne=m(_,"LI",{"data-svelte-h":!0}),r(Ne)!=="svelte-1eqyiwr"&&(Ne.innerHTML=vc),zr=l(_),Be=m(_,"LI",{"data-svelte-h":!0}),r(Be)!=="svelte-txwuvf"&&(Be.innerHTML=wc),kr=l(_),qe=m(_,"LI",{"data-svelte-h":!0}),r(qe)!=="svelte-y02b8p"&&(qe.innerHTML=bc),Ir=l(_),Ae=m(_,"LI",{"data-svelte-h":!0}),r(Ae)!=="svelte-9gwu8x"&&(Ae.innerHTML=Mc),Gr=l(_),Pe=m(_,"LI",{"data-svelte-h":!0}),r(Pe)!=="svelte-1ddd2lg"&&(Pe.innerHTML=_c),Rr=l(_),Ee=m(_,"LI",{"data-svelte-h":!0}),r(Ee)!=="svelte-1n38q0y"&&(Ee.innerHTML=Tc),$r=l(_),Ze=m(_,"LI",{"data-svelte-h":!0}),r(Ze)!=="svelte-1ax9w1y"&&(Ze.innerHTML=xc),Lr=l(_),Se=m(_,"LI",{"data-svelte-h":!0}),r(Se)!=="svelte-dpkiu2"&&(Se.innerHTML=Jc),Nr=l(_),Qe=m(_,"LI",{"data-svelte-h":!0}),r(Qe)!=="svelte-j7e6bo"&&(Qe.innerHTML=Uc),Br=l(_),He=m(_,"LI",{"data-svelte-h":!0}),r(He)!=="svelte-scyo6b"&&(He.innerHTML=jc),qr=l(_),Xe=m(_,"LI",{"data-svelte-h":!0}),r(Xe)!=="svelte-2asvkr"&&(Xe.innerHTML=Cc),Ar=l(_),Oe=m(_,"LI",{"data-svelte-h":!0}),r(Oe)!=="svelte-eqwuzj"&&(Oe.innerHTML=zc),Pr=l(_),We=m(_,"LI",{"data-svelte-h":!0}),r(We)!=="svelte-1xvloak"&&(We.innerHTML=kc),Er=l(_),Ve=m(_,"LI",{"data-svelte-h":!0}),r(Ve)!=="svelte-153aat5"&&(Ve.innerHTML=Ic),Zr=l(_),Fe=m(_,"LI",{"data-svelte-h":!0}),r(Fe)!=="svelte-k2atia"&&(Fe.innerHTML=Gc),Sr=l(_),Z=m(_,"LI",{});var cs=T(Z);Ye=m(cs,"CODE",{"data-svelte-h":!0}),r(Ye)!=="svelte-iqp3k9"&&(Ye.textContent=Rc),Qr=c(cs,": The ratio of token (or sequence, if "),De=m(cs,"CODE",{"data-svelte-h":!0}),r(De)!=="svelte-d1rrl1"&&(De.textContent=$c),Hr=c(cs,") probabilities where the GRPO objective is clipped to stay within the trust region: "),yp=M(cs,!1),fp=c(cs,". A higher value means more tokens are clipped, which constrains how much the policy "),vp=M(cs,!1),wp=c(cs," can change."),cs.forEach(t),Xr=l(_),D=m(_,"LI",{});var je=T(D);Ke=m(je,"CODE",{"data-svelte-h":!0}),r(Ke)!=="svelte-1wl9jmh"&&(Ke.textContent=Lc),Or=c(je,": The average ratio of token (or sequence, if "),sn=m(je,"CODE",{"data-svelte-h":!0}),r(sn)!=="svelte-d1rrl1"&&(sn.textContent=Nc),Wr=c(je,") probabilities that were clipped on the lower bound of the trust region: "),bp=M(je,!1),Mp=c(je,"."),je.forEach(t),Vr=l(_),K=m(_,"LI",{});var Ce=T(K);an=m(Ce,"CODE",{"data-svelte-h":!0}),r(an)!=="svelte-zggxjg"&&(an.textContent=Bc),Fr=c(Ce,": The minimum ratio of token (or sequence, if "),tn=m(Ce,"CODE",{"data-svelte-h":!0}),r(tn)!=="svelte-d1rrl1"&&(tn.textContent=qc),Yr=c(Ce,") probabilities that were clipped on the lower bound of the trust region: "),_p=M(Ce,!1),Tp=c(Ce,"."),Ce.forEach(t),Dr=l(_),ss=m(_,"LI",{});var ze=T(ss);en=m(ze,"CODE",{"data-svelte-h":!0}),r(en)!=="svelte-1tram1d"&&(en.textContent=Ac),Kr=c(ze,": The average ratio of token (or sequence, if "),nn=m(ze,"CODE",{"data-svelte-h":!0}),r(nn)!=="svelte-d1rrl1"&&(nn.textContent=Pc),so=c(ze,") probabilities that were clipped on the upper bound of the trust region: "),xp=M(ze,!1),Jp=c(ze,"."),ze.forEach(t),ao=l(_),as=m(_,"LI",{});var ke=T(as);ln=m(ke,"CODE",{"data-svelte-h":!0}),r(ln)!=="svelte-g4lei0"&&(ln.textContent=Ec),to=c(ke,": The maximum ratio of token (or sequence, if "),pn=m(ke,"CODE",{"data-svelte-h":!0}),r(pn)!=="svelte-d1rrl1"&&(pn.textContent=Zc),eo=c(ke,") probabilities that were clipped on the upper bound of the trust region: "),Up=M(ke,!1),jp=c(ke,"."),ke.forEach(t),_.forEach(t),Cp=l(s),g(za.$$.fragment,s),zp=l(s),g(ka.$$.fragment,s),kp=l(s),Ia=m(s,"P",{"data-svelte-h":!0}),r(Ia)!=="svelte-uhtnkr"&&(Ia.innerHTML=Sc),Ip=l(s),g(Ga.$$.fragment,s),Gp=l(s),Ra=m(s,"P",{"data-svelte-h":!0}),r(Ra)!=="svelte-wpkh1u"&&(Ra.innerHTML=Qc),Rp=l(s),Js=m(s,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),r(Js)!=="svelte-19brdw3"&&(Js.innerHTML=Hc),$p=l(s),g($a.$$.fragment,s),Lp=l(s),La=m(s,"P",{"data-svelte-h":!0}),r(La)!=="svelte-14qro2b"&&(La.textContent=Xc),Np=l(s),Us=m(s,"OL",{});var Zm=T(Us);Na=m(Zm,"LI",{});var Sm=T(Na);mn=m(Sm,"P",{"data-svelte-h":!0}),r(mn)!=="svelte-c2qog"&&(mn.innerHTML=Oc),no=l(Sm),g(Ba.$$.fragment,Sm),Sm.forEach(t),lo=l(Zm),qa=m(Zm,"LI",{});var Qm=T(qa);rn=m(Qm,"P",{"data-svelte-h":!0}),r(rn)!=="svelte-l0zqox"&&(rn.innerHTML=Wc),po=l(Qm),g(Aa.$$.fragment,Qm),Qm.forEach(t),Zm.forEach(t),Bp=l(s),js=m(s,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),r(js)!=="svelte-16p5pwe"&&(js.innerHTML=Vc),qp=l(s),g(Pa.$$.fragment,s),Ap=l(s),Ea=m(s,"P",{"data-svelte-h":!0}),r(Ea)!=="svelte-g8ygxn"&&(Ea.textContent=Fc),Pp=l(s),g(Za.$$.fragment,s),Ep=l(s),Cs=m(s,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),r(Cs)!=="svelte-1uqtj29"&&(Cs.innerHTML=Yc),Zp=l(s),zs=m(s,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),r(zs)!=="svelte-9yuadg"&&(zs.innerHTML=Dc),Sp=l(s),Sa=m(s,"P",{"data-svelte-h":!0}),r(Sa)!=="svelte-4esodc"&&(Sa.innerHTML=Kc),Qp=l(s),g(Qa.$$.fragment,s),Hp=l(s),Ha=m(s,"P",{"data-svelte-h":!0}),r(Ha)!=="svelte-8tt49x"&&(Ha.textContent=sh),Xp=l(s),B=m(s,"P",{});var hs=T(B);io=c(hs,"This mismatch leads to a biased gradient update which has been observed to destabilize training ("),ks=m(hs,"A",{href:!0,rel:!0,"data-svelte-h":!0}),r(ks)!=="svelte-plv263"&&(ks.textContent=ah),Is=m(hs,"A",{href:!0,rel:!0,"data-svelte-h":!0}),r(Is)!=="svelte-brkm3q"&&(Is.textContent=th),Gs=m(hs,"A",{href:!0,rel:!0,"data-svelte-h":!0}),r(Gs)!=="svelte-1i6z0rm"&&(Gs.textContent=eh),Rs=m(hs,"A",{href:!0,rel:!0,"data-svelte-h":!0}),r(Rs)!=="svelte-27rdyx"&&(Rs.textContent=nh),$s=m(hs,"A",{href:!0,rel:!0,"data-svelte-h":!0}),r($s)!=="svelte-15137ds"&&($s.textContent=lh),mo=c(hs,`). For simplicity, consider the REINFORCE policy gradient:
`),Op=M(hs,!1),hs.forEach(t),Wp=l(s),q=m(s,"P",{});var gs=T(q);ro=c(gs,"Here "),Vp=M(gs,!1),Fp=c(gs," denotes prompts sampled from some data distribution, and "),Yp=M(gs,!1),Dp=c(gs," is the policy implemented by the training engine. With vLLM in the loop we obtain a separate inference policy "),Kp=M(gs,!1),si=c(gs,`, so the effective policy gradient becomes
`),ai=M(gs,!1),gs.forEach(t),ti=l(s),Xa=m(s,"P",{"data-svelte-h":!0}),r(Xa)!=="svelte-oi2uz"&&(Xa.textContent=ph),ei=l(s),U=m(s,"P",{});var I=T(U);oo=c(I,"The standard way to correct for this distribution shift is "),on=m(I,"STRONG",{"data-svelte-h":!0}),r(on)!=="svelte-125qvbq"&&(on.textContent=ih),co=c(I,". We provide two IS variants: "),Oa=m(I,"A",{href:!0,"data-svelte-h":!0}),r(Oa)!=="svelte-h92dch"&&(Oa.textContent=mh),ho=c(I," and "),Wa=m(I,"A",{href:!0,"data-svelte-h":!0}),r(Wa)!=="svelte-1etj6zg"&&(Wa.textContent=rh),go=c(I,". Both variants can be applied either at the token level or at the sequence level.Let "),ni=M(I,!1),li=c(I," denote the importance weight, for example "),pi=M(I,!1),ii=c(I," per token or "),mi=M(I,!1),ri=c(I," per sequence. Under TIS, ratios larger than "),cn=m(I,"CODE",{"data-svelte-h":!0}),r(cn)!=="svelte-1bz2unv"&&(cn.textContent=oh),uo=c(I,` are clipped,
`),oi=M(I,!1),I.forEach(t),ci=l(s),Va=m(s,"P",{"data-svelte-h":!0}),r(Va)!=="svelte-1ifxkdu"&&(Va.innerHTML=ch),hi=l(s),Fa=m(s,"P",{"data-svelte-h":!0}),r(Fa)!=="svelte-1x42ak2"&&(Fa.innerHTML=hh),gi=l(s),g(Ya.$$.fragment,s),di=l(s),Da=m(s,"P",{"data-svelte-h":!0}),r(Da)!=="svelte-1lkshlz"&&(Da.innerHTML=gh),ui=l(s),Ka=m(s,"UL",{"data-svelte-h":!0}),r(Ka)!=="svelte-1fpjj5p"&&(Ka.innerHTML=dh),yi=l(s),st=m(s,"P",{"data-svelte-h":!0}),r(st)!=="svelte-1d00z2m"&&(st.textContent=uh),fi=l(s),g(at.$$.fragment,s),vi=l(s),g(tt.$$.fragment,s),wi=l(s),g(et.$$.fragment,s),bi=l(s),nt=m(s,"P",{"data-svelte-h":!0}),r(nt)!=="svelte-11n7anc"&&(nt.innerHTML=yh),Mi=l(s),lt=m(s,"P",{"data-svelte-h":!0}),r(lt)!=="svelte-1gch0wn"&&(lt.innerHTML=fh),_i=l(s),pt=m(s,"OL",{"data-svelte-h":!0}),r(pt)!=="svelte-1y79op3"&&(pt.innerHTML=vh),Ti=l(s),g(it.$$.fragment,s),xi=l(s),mt=m(s,"P",{"data-svelte-h":!0}),r(mt)!=="svelte-1n7vn3s"&&(mt.textContent=wh),Ji=l(s),g(rt.$$.fragment,s),Ui=l(s),ot=m(s,"P",{"data-svelte-h":!0}),r(ot)!=="svelte-17jijhc"&&(ot.textContent=bh),ji=l(s),g(ct.$$.fragment,s),Ci=l(s),g(ht.$$.fragment,s),zi=l(s),gt=m(s,"P",{"data-svelte-h":!0}),r(gt)!=="svelte-mgdbdy"&&(gt.textContent=Mh),ki=l(s),g(dt.$$.fragment,s),Ii=l(s),ut=m(s,"P",{"data-svelte-h":!0}),r(ut)!=="svelte-17jijhc"&&(ut.textContent=_h),Gi=l(s),g(yt.$$.fragment,s),Ri=l(s),g(ft.$$.fragment,s),$i=l(s),vt=m(s,"P",{"data-svelte-h":!0}),r(vt)!=="svelte-1ublx2j"&&(vt.innerHTML=Th),Li=l(s),g(wt.$$.fragment,s),Ni=l(s),bt=m(s,"P",{"data-svelte-h":!0}),r(bt)!=="svelte-19axqz5"&&(bt.textContent=xh),Bi=l(s),g(Mt.$$.fragment,s),qi=l(s),g(_t.$$.fragment,s),Ai=l(s),Tt=m(s,"P",{"data-svelte-h":!0}),r(Tt)!=="svelte-d6x9c8"&&(Tt.innerHTML=Jh),Pi=l(s),g(xt.$$.fragment,s),Ei=l(s),Jt=m(s,"P",{"data-svelte-h":!0}),r(Jt)!=="svelte-19axqz5"&&(Jt.textContent=Uh),Zi=l(s),g(Ut.$$.fragment,s),Si=l(s),g(jt.$$.fragment,s),Qi=l(s),Ct=m(s,"P",{"data-svelte-h":!0}),r(Ct)!=="svelte-1v0gfrv"&&(Ct.innerHTML=jh),Hi=l(s),g(zt.$$.fragment,s),Xi=l(s),kt=m(s,"P",{"data-svelte-h":!0}),r(kt)!=="svelte-hzw5pb"&&(kt.innerHTML=Ch),Oi=l(s),It=m(s,"P",{"data-svelte-h":!0}),r(It)!=="svelte-8lqo7d"&&(It.innerHTML=zh),Wi=l(s),g(Gt.$$.fragment,s),Vi=l(s),Rt=m(s,"P",{"data-svelte-h":!0}),r(Rt)!=="svelte-1jrgdb6"&&(Rt.innerHTML=kh),Fi=l(s),$t=m(s,"P",{"data-svelte-h":!0}),r($t)!=="svelte-rvbpam"&&($t.textContent=Ih),Yi=l(s),g(Lt.$$.fragment,s),Di=l(s),g(Nt.$$.fragment,s),Ki=l(s),Bt=m(s,"P",{"data-svelte-h":!0}),r(Bt)!=="svelte-hrr6lx"&&(Bt.innerHTML=Gh),sm=l(s),g(qt.$$.fragment,s),am=l(s),At=m(s,"P",{"data-svelte-h":!0}),r(At)!=="svelte-192mot3"&&(At.textContent=Rh),tm=l(s),g(Pt.$$.fragment,s),em=l(s),Et=m(s,"P",{"data-svelte-h":!0}),r(Et)!=="svelte-104ub32"&&(Et.innerHTML=$h),nm=l(s),Zt=m(s,"P",{"data-svelte-h":!0}),r(Zt)!=="svelte-3xiv4m"&&(Zt.innerHTML=Lh),lm=l(s),g(St.$$.fragment,s),pm=l(s),Qt=m(s,"P",{"data-svelte-h":!0}),r(Qt)!=="svelte-1qmo1hr"&&(Qt.innerHTML=Nh),im=l(s),g(Ht.$$.fragment,s),mm=l(s),Xt=m(s,"P",{"data-svelte-h":!0}),r(Xt)!=="svelte-1d9oy0x"&&(Xt.innerHTML=Bh),rm=l(s),g(Ot.$$.fragment,s),om=l(s),Wt=m(s,"P",{"data-svelte-h":!0}),r(Wt)!=="svelte-xu1n6s"&&(Wt.innerHTML=qh),cm=l(s),Vt=m(s,"P",{"data-svelte-h":!0}),r(Vt)!=="svelte-11lpom8"&&(Vt.textContent=Ah),hm=l(s),g(Ft.$$.fragment,s),gm=l(s),g(Yt.$$.fragment,s),dm=l(s),Dt=m(s,"P",{"data-svelte-h":!0}),r(Dt)!=="svelte-1clwarl"&&(Dt.textContent=Ph),um=l(s),Kt=m(s,"UL",{"data-svelte-h":!0}),r(Kt)!=="svelte-1thvsoi"&&(Kt.innerHTML=Eh),ym=l(s),Ls=m(s,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),r(Ls)!=="svelte-1rv2e8z"&&(Ls.innerHTML=Zh),fm=l(s),g(se.$$.fragment,s),vm=l(s),ae=m(s,"P",{"data-svelte-h":!0}),r(ae)!=="svelte-wckq77"&&(ae.innerHTML=Sh),wm=l(s),g(te.$$.fragment,s),bm=l(s),g(ee.$$.fragment,s),Mm=l(s),ne=m(s,"P",{"data-svelte-h":!0}),r(ne)!=="svelte-1atcqx7"&&(ne.textContent=Qh),_m=l(s),g(le.$$.fragment,s),Tm=l(s),pe=m(s,"P",{"data-svelte-h":!0}),r(pe)!=="svelte-1clwarl"&&(pe.textContent=Hh),xm=l(s),ie=m(s,"UL",{"data-svelte-h":!0}),r(ie)!=="svelte-1dxn67f"&&(ie.innerHTML=Xh),Jm=l(s),Ns=m(s,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),r(Ns)!=="svelte-pwu269"&&(Ns.innerHTML=Oh),Um=l(s),g(me.$$.fragment,s),jm=l(s),re=m(s,"P",{"data-svelte-h":!0}),r(re)!=="svelte-1e86bb2"&&(re.innerHTML=Wh),Cm=l(s),g(oe.$$.fragment,s),zm=l(s),g(ce.$$.fragment,s),km=l(s),he=m(s,"UL",{"data-svelte-h":!0}),r(he)!=="svelte-kv9quh"&&(he.innerHTML=Vh),Im=l(s),g(ge.$$.fragment,s),Gm=l(s),de=m(s,"P",{"data-svelte-h":!0}),r(de)!=="svelte-985b2u"&&(de.textContent=Fh),Rm=l(s),ue=m(s,"UL",{"data-svelte-h":!0}),r(ue)!=="svelte-1kosedy"&&(ue.innerHTML=Yh),$m=l(s),ye=m(s,"P",{"data-svelte-h":!0}),r(ye)!=="svelte-t3zbfe"&&(ye.textContent=Dh),Lm=l(s),g(fe.$$.fragment,s),Nm=l(s),k=m(s,"DIV",{class:!0});var S=T(k);g(ve.$$.fragment,S),yo=l(S),hn=m(S,"P",{"data-svelte-h":!0}),r(hn)!=="svelte-udysum"&&(hn.innerHTML=Kh),fo=l(S),g(Bs.$$.fragment,S),vo=l(S),qs=m(S,"DIV",{class:!0});var Hm=T(qs);g(we.$$.fragment,Hm),wo=l(Hm),gn=m(Hm,"P",{"data-svelte-h":!0}),r(gn)!=="svelte-1cilnet"&&(gn.textContent=sg),Hm.forEach(t),bo=l(S),ts=m(S,"DIV",{class:!0});var Mn=T(ts);g(be.$$.fragment,Mn),Mo=l(Mn),dn=m(Mn,"P",{"data-svelte-h":!0}),r(dn)!=="svelte-r8h4ov"&&(dn.innerHTML=ag),_o=l(Mn),un=m(Mn,"P",{"data-svelte-h":!0}),r(un)!=="svelte-1e6bius"&&(un.textContent=tg),Mn.forEach(t),To=l(S),As=m(S,"DIV",{class:!0});var Xm=T(As);g(Me.$$.fragment,Xm),xo=l(Xm),yn=m(Xm,"P",{"data-svelte-h":!0}),r(yn)!=="svelte-8tudwd"&&(yn.innerHTML=eg),Xm.forEach(t),S.forEach(t),Bm=l(s),g(_e.$$.fragment,s),qm=l(s),A=m(s,"DIV",{class:!0});var Ps=T(A);g(Te.$$.fragment,Ps),Jo=l(Ps),fn=m(Ps,"P",{"data-svelte-h":!0}),r(fn)!=="svelte-1apsqp6"&&(fn.innerHTML=ng),Uo=l(Ps),vn=m(Ps,"P",{"data-svelte-h":!0}),r(vn)!=="svelte-13s1zc4"&&(vn.innerHTML=lg),jo=l(Ps),wn=m(Ps,"P",{"data-svelte-h":!0}),r(wn)!=="svelte-ekuf1t"&&(wn.innerHTML=pg),Ps.forEach(t),Am=l(s),g(xe.$$.fragment,s),Pm=l(s),_n=m(s,"P",{}),T(_n).forEach(t),this.h()},h(){v(z,"name","hf:doc:metadata"),v(z,"content",cd),sd(V.src,qo="https://huggingface.co/datasets/trl-lib/DeepMath-103K/embed/viewer/default/train?row=0")||v(V,"src",qo),v(V,"frameborder","0"),v(V,"width","100%"),v(V,"height","560px"),Vn.a=Fn,Yn.a=Dn,al.a=tl,el.a=null,v(ds,"href","https://huggingface.co/papers/2503.20783"),v(ds,"rel","nofollow"),pl.a=il,v(ra,"href","/docs/trl/pr_4949/en/grpo_trainer#trl.GRPOConfig"),v(ma,"class","tip"),v(us,"class","tip"),v(ys,"href","http://joschu.net/blog/kl-approx.html"),v(ys,"rel","nofollow"),cl.a=null,dl.a=null,v(fs,"href","https://huggingface.co/papers/2402.03300"),v(fs,"rel","nofollow"),fl.a=vl,v(vs,"href","https://huggingface.co/papers/2503.20783"),v(vs,"rel","nofollow"),v(ua,"href","#loss-types"),v(da,"class","tip"),v(ws,"href","https://huggingface.co/papers/2402.03300"),v(ws,"rel","nofollow"),bl.a=Ml,v(bs,"href","https://huggingface.co/papers/2503.24290"),v(bs,"rel","nofollow"),v(Ms,"href","https://huggingface.co/papers/2503.20783"),v(Ms,"rel","nofollow"),v(_s,"href","https://huggingface.co/papers/2503.14476"),v(_s,"rel","nofollow"),v(fa,"href","/docs/trl/pr_4949/en/grpo_trainer#trl.GRPOConfig"),v(ya,"class","tip"),Tl.a=xl,v(va,"href","/docs/trl/pr_4949/en/grpo_trainer#trl.GRPOConfig"),Jl.a=null,jl.a=Cl,zl.a=kl,Il.a=Gl,Rl.a=$l,Bl.a=null,Al.a=null,v(Ts,"href","https://huggingface.co/papers/2503.14476"),v(Ts,"rel","nofollow"),El.a=null,v(xs,"href","https://huggingface.co/papers/2503.20783"),v(xs,"rel","nofollow"),Ql.a=null,Wl.a=null,Fl.a=Yl,Dl.a=Kl,sp.a=null,tp.a=ep,np.a=lp,pp.a=null,mp.a=rp,op.a=cp,yp.a=fp,vp.a=wp,bp.a=Mp,_p.a=Tp,xp.a=Jp,Up.a=jp,v(Js,"class","tip"),v(js,"class","warning"),v(Cs,"class","tip"),v(zs,"class","tip"),v(ks,"href","https://fengyao.notion.site/off-policy-rl"),v(ks,"rel","nofollow"),v(Is,"href","https://yingru.notion.site/When-Speed-Kills-Stability-Demystifying-RL-Collapse-from-the-Training-Inference-Mismatch-271211a558b7808d8b12d403fd15edda"),v(Is,"rel","nofollow"),v(Gs,"href","https://thinkingmachines.ai/blog/defeating-nondeterminism-in-llm-inference/#true-on-policy-rl"),v(Gs,"rel","nofollow"),v(Rs,"href","https://huggingface.co/papers/2510.26788"),v(Rs,"rel","nofollow"),v($s,"href","https://huggingface.co/papers/2510.18855"),v($s,"rel","nofollow"),Op.a=null,Vp.a=Fp,Yp.a=Dp,Kp.a=si,ai.a=null,v(Oa,"href","paper_index#truncated-importance-sampling"),v(Wa,"href","paper_index#masked-importance-sampling"),ni.a=li,pi.a=ii,mi.a=ri,oi.a=null,v(Ls,"class","tip"),v(Ns,"class","tip"),v(qs,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),v(ts,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),v(As,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),v(k,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),v(A,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8")},m(s,a){p(document.head,z),e(s,Es,a),e(s,W,a),e(s,P,a),d(E,s,a),e(s,j,a),d(Q,s,a),e(s,zn,a),e(s,Zs,a),e(s,kn,a),d(Ss,s,a),e(s,In,a),e(s,Qs,a),e(s,Gn,a),e(s,Hs,a),e(s,Rn,a),e(s,Xs,a),e(s,$n,a),e(s,Os,a),e(s,Ln,a),d(Ws,s,a),e(s,Nn,a),e(s,Vs,a),e(s,Bn,a),e(s,V,a),e(s,qn,a),e(s,Fs,a),e(s,An,a),d(Ys,s,a),e(s,Pn,a),e(s,Ds,a),e(s,En,a),d(Ks,s,a),e(s,Zn,a),e(s,sa,a),e(s,Sn,a),e(s,aa,a),e(s,Qn,a),d(ta,s,a),e(s,Hn,a),e(s,ea,a),e(s,Xn,a),e(s,na,a),e(s,On,a),d(la,s,a),e(s,Wn,a),e(s,F,a),p(F,Om),Vn.m(og,F),p(F,Fn),Yn.m(cg,F),p(F,Dn),e(s,Kn,a),d(pa,s,a),e(s,sl,a),e(s,ps,a),p(ps,Wm),al.m(hg,ps),p(ps,tl),el.m(gg,ps),e(s,nl,a),e(s,ia,a),e(s,ll,a),e(s,ma,a),p(ma,L),p(L,Vm),p(L,ds),p(L,Fm),pl.m(dg,L),p(L,il),p(L,Ie),p(L,Ym),p(L,ra),p(L,Dm),e(s,ml,a),e(s,us,a),e(s,rl,a),d(oa,s,a),e(s,ol,a),e(s,is,a),p(is,Km),p(is,ys),p(is,sr),cl.m(ug,is),e(s,hl,a),d(ca,s,a),e(s,gl,a),e(s,ha,a),p(ha,ar),dl.m(yg,ha),e(s,ul,a),e(s,ga,a),e(s,yl,a),e(s,da,a),p(da,N),p(N,tr),p(N,fs),p(N,er),fl.m(fg,N),p(N,vl),p(N,vs),p(N,nr),p(N,ua),p(N,lr),e(s,wl,a),e(s,ya,a),p(ya,C),p(C,pr),p(C,ws),p(C,ir),bl.m(vg,C),p(C,Ml),p(C,bs),p(C,mr),p(C,Ms),p(C,rr),p(C,_s),p(C,or),p(C,Ge),p(C,cr),p(C,fa),p(C,hr),e(s,_l,a),e(s,R,a),p(R,gr),Tl.m(wg,R),p(R,xl),p(R,Re),p(R,dr),p(R,va),p(R,ur),p(R,$e),p(R,yr),Jl.m(bg,R),e(s,Ul,a),e(s,$,a),p($,fr),jl.m(Mg,$),p($,Cl),zl.m(_g,$),p($,kl),Il.m(Tg,$),p($,Gl),Rl.m(xg,$),p($,$l),e(s,Ll,a),d(wa,s,a),e(s,Nl,a),e(s,ba,a),p(ba,vr),Bl.m(Jg,ba),e(s,ql,a),e(s,Ma,a),p(Ma,wr),Al.m(Ug,Ma),e(s,Pl,a),e(s,ms,a),p(ms,br),p(ms,Ts),p(ms,Mr),El.m(jg,ms),e(s,Zl,a),e(s,_a,a),e(s,Sl,a),e(s,rs,a),p(rs,_r),p(rs,xs),p(rs,Tr),Ql.m(Cg,rs),e(s,Hl,a),e(s,Ta,a),e(s,Xl,a),e(s,xa,a),e(s,Ol,a),e(s,Ja,a),p(Ja,xr),Wl.m(zg,Ja),e(s,Vl,a),e(s,H,a),p(H,Jr),Fl.m(kg,H),p(H,Yl),Dl.m(Ig,H),p(H,Kl),sp.m(Gg,H),e(s,ap,a),e(s,X,a),p(X,Ur),tp.m(Rg,X),p(X,ep),np.m($g,X),p(X,lp),pp.m(Lg,X),e(s,ip,a),e(s,Y,a),p(Y,jr),mp.m(Ng,Y),p(Y,rp),op.m(Bg,Y),p(Y,cp),e(s,hp,a),e(s,Ua,a),e(s,gp,a),d(ja,s,a),e(s,dp,a),e(s,Ca,a),e(s,up,a),e(s,w,a),p(w,Le),p(w,Cr),p(w,Ne),p(w,zr),p(w,Be),p(w,kr),p(w,qe),p(w,Ir),p(w,Ae),p(w,Gr),p(w,Pe),p(w,Rr),p(w,Ee),p(w,$r),p(w,Ze),p(w,Lr),p(w,Se),p(w,Nr),p(w,Qe),p(w,Br),p(w,He),p(w,qr),p(w,Xe),p(w,Ar),p(w,Oe),p(w,Pr),p(w,We),p(w,Er),p(w,Ve),p(w,Zr),p(w,Fe),p(w,Sr),p(w,Z),p(Z,Ye),p(Z,Qr),p(Z,De),p(Z,Hr),yp.m(qg,Z),p(Z,fp),vp.m(Ag,Z),p(Z,wp),p(w,Xr),p(w,D),p(D,Ke),p(D,Or),p(D,sn),p(D,Wr),bp.m(Pg,D),p(D,Mp),p(w,Vr),p(w,K),p(K,an),p(K,Fr),p(K,tn),p(K,Yr),_p.m(Eg,K),p(K,Tp),p(w,Dr),p(w,ss),p(ss,en),p(ss,Kr),p(ss,nn),p(ss,so),xp.m(Zg,ss),p(ss,Jp),p(w,ao),p(w,as),p(as,ln),p(as,to),p(as,pn),p(as,eo),Up.m(Sg,as),p(as,jp),e(s,Cp,a),d(za,s,a),e(s,zp,a),d(ka,s,a),e(s,kp,a),e(s,Ia,a),e(s,Ip,a),d(Ga,s,a),e(s,Gp,a),e(s,Ra,a),e(s,Rp,a),e(s,Js,a),e(s,$p,a),d($a,s,a),e(s,Lp,a),e(s,La,a),e(s,Np,a),e(s,Us,a),p(Us,Na),p(Na,mn),p(Na,no),d(Ba,Na,null),p(Us,lo),p(Us,qa),p(qa,rn),p(qa,po),d(Aa,qa,null),e(s,Bp,a),e(s,js,a),e(s,qp,a),d(Pa,s,a),e(s,Ap,a),e(s,Ea,a),e(s,Pp,a),d(Za,s,a),e(s,Ep,a),e(s,Cs,a),e(s,Zp,a),e(s,zs,a),e(s,Sp,a),e(s,Sa,a),e(s,Qp,a),d(Qa,s,a),e(s,Hp,a),e(s,Ha,a),e(s,Xp,a),e(s,B,a),p(B,io),p(B,ks),p(B,Is),p(B,Gs),p(B,Rs),p(B,$s),p(B,mo),Op.m(Qg,B),e(s,Wp,a),e(s,q,a),p(q,ro),Vp.m(Hg,q),p(q,Fp),Yp.m(Xg,q),p(q,Dp),Kp.m(Og,q),p(q,si),ai.m(Wg,q),e(s,ti,a),e(s,Xa,a),e(s,ei,a),e(s,U,a),p(U,oo),p(U,on),p(U,co),p(U,Oa),p(U,ho),p(U,Wa),p(U,go),ni.m(Vg,U),p(U,li),pi.m(Fg,U),p(U,ii),mi.m(Yg,U),p(U,ri),p(U,cn),p(U,uo),oi.m(Dg,U),e(s,ci,a),e(s,Va,a),e(s,hi,a),e(s,Fa,a),e(s,gi,a),d(Ya,s,a),e(s,di,a),e(s,Da,a),e(s,ui,a),e(s,Ka,a),e(s,yi,a),e(s,st,a),e(s,fi,a),d(at,s,a),e(s,vi,a),d(tt,s,a),e(s,wi,a),d(et,s,a),e(s,bi,a),e(s,nt,a),e(s,Mi,a),e(s,lt,a),e(s,_i,a),e(s,pt,a),e(s,Ti,a),d(it,s,a),e(s,xi,a),e(s,mt,a),e(s,Ji,a),d(rt,s,a),e(s,Ui,a),e(s,ot,a),e(s,ji,a),d(ct,s,a),e(s,Ci,a),d(ht,s,a),e(s,zi,a),e(s,gt,a),e(s,ki,a),d(dt,s,a),e(s,Ii,a),e(s,ut,a),e(s,Gi,a),d(yt,s,a),e(s,Ri,a),d(ft,s,a),e(s,$i,a),e(s,vt,a),e(s,Li,a),d(wt,s,a),e(s,Ni,a),e(s,bt,a),e(s,Bi,a),d(Mt,s,a),e(s,qi,a),d(_t,s,a),e(s,Ai,a),e(s,Tt,a),e(s,Pi,a),d(xt,s,a),e(s,Ei,a),e(s,Jt,a),e(s,Zi,a),d(Ut,s,a),e(s,Si,a),d(jt,s,a),e(s,Qi,a),e(s,Ct,a),e(s,Hi,a),d(zt,s,a),e(s,Xi,a),e(s,kt,a),e(s,Oi,a),e(s,It,a),e(s,Wi,a),d(Gt,s,a),e(s,Vi,a),e(s,Rt,a),e(s,Fi,a),e(s,$t,a),e(s,Yi,a),d(Lt,s,a),e(s,Di,a),d(Nt,s,a),e(s,Ki,a),e(s,Bt,a),e(s,sm,a),d(qt,s,a),e(s,am,a),e(s,At,a),e(s,tm,a),d(Pt,s,a),e(s,em,a),e(s,Et,a),e(s,nm,a),e(s,Zt,a),e(s,lm,a),d(St,s,a),e(s,pm,a),e(s,Qt,a),e(s,im,a),d(Ht,s,a),e(s,mm,a),e(s,Xt,a),e(s,rm,a),d(Ot,s,a),e(s,om,a),e(s,Wt,a),e(s,cm,a),e(s,Vt,a),e(s,hm,a),d(Ft,s,a),e(s,gm,a),d(Yt,s,a),e(s,dm,a),e(s,Dt,a),e(s,um,a),e(s,Kt,a),e(s,ym,a),e(s,Ls,a),e(s,fm,a),d(se,s,a),e(s,vm,a),e(s,ae,a),e(s,wm,a),d(te,s,a),e(s,bm,a),d(ee,s,a),e(s,Mm,a),e(s,ne,a),e(s,_m,a),d(le,s,a),e(s,Tm,a),e(s,pe,a),e(s,xm,a),e(s,ie,a),e(s,Jm,a),e(s,Ns,a),e(s,Um,a),d(me,s,a),e(s,jm,a),e(s,re,a),e(s,Cm,a),d(oe,s,a),e(s,zm,a),d(ce,s,a),e(s,km,a),e(s,he,a),e(s,Im,a),d(ge,s,a),e(s,Gm,a),e(s,de,a),e(s,Rm,a),e(s,ue,a),e(s,$m,a),e(s,ye,a),e(s,Lm,a),d(fe,s,a),e(s,Nm,a),e(s,k,a),d(ve,k,null),p(k,yo),p(k,hn),p(k,fo),d(Bs,k,null),p(k,vo),p(k,qs),d(we,qs,null),p(qs,wo),p(qs,gn),p(k,bo),p(k,ts),d(be,ts,null),p(ts,Mo),p(ts,dn),p(ts,_o),p(ts,un),p(k,To),p(k,As),d(Me,As,null),p(As,xo),p(As,yn),e(s,Bm,a),d(_e,s,a),e(s,qm,a),e(s,A,a),d(Te,A,null),p(A,Jo),p(A,fn),p(A,Uo),p(A,vn),p(A,jo),p(A,wn),e(s,Am,a),d(xe,s,a),e(s,Pm,a),e(s,_n,a),Em=!0},p(s,[a]){const os={};a&2&&(os.$$scope={dirty:a,ctx:s}),Bs.$set(os)},i(s){Em||(u(E.$$.fragment,s),u(Q.$$.fragment,s),u(Ss.$$.fragment,s),u(Ws.$$.fragment,s),u(Ys.$$.fragment,s),u(Ks.$$.fragment,s),u(ta.$$.fragment,s),u(la.$$.fragment,s),u(pa.$$.fragment,s),u(oa.$$.fragment,s),u(ca.$$.fragment,s),u(wa.$$.fragment,s),u(ja.$$.fragment,s),u(za.$$.fragment,s),u(ka.$$.fragment,s),u(Ga.$$.fragment,s),u($a.$$.fragment,s),u(Ba.$$.fragment,s),u(Aa.$$.fragment,s),u(Pa.$$.fragment,s),u(Za.$$.fragment,s),u(Qa.$$.fragment,s),u(Ya.$$.fragment,s),u(at.$$.fragment,s),u(tt.$$.fragment,s),u(et.$$.fragment,s),u(it.$$.fragment,s),u(rt.$$.fragment,s),u(ct.$$.fragment,s),u(ht.$$.fragment,s),u(dt.$$.fragment,s),u(yt.$$.fragment,s),u(ft.$$.fragment,s),u(wt.$$.fragment,s),u(Mt.$$.fragment,s),u(_t.$$.fragment,s),u(xt.$$.fragment,s),u(Ut.$$.fragment,s),u(jt.$$.fragment,s),u(zt.$$.fragment,s),u(Gt.$$.fragment,s),u(Lt.$$.fragment,s),u(Nt.$$.fragment,s),u(qt.$$.fragment,s),u(Pt.$$.fragment,s),u(St.$$.fragment,s),u(Ht.$$.fragment,s),u(Ot.$$.fragment,s),u(Ft.$$.fragment,s),u(Yt.$$.fragment,s),u(se.$$.fragment,s),u(te.$$.fragment,s),u(ee.$$.fragment,s),u(le.$$.fragment,s),u(me.$$.fragment,s),u(oe.$$.fragment,s),u(ce.$$.fragment,s),u(ge.$$.fragment,s),u(fe.$$.fragment,s),u(ve.$$.fragment,s),u(Bs.$$.fragment,s),u(we.$$.fragment,s),u(be.$$.fragment,s),u(Me.$$.fragment,s),u(_e.$$.fragment,s),u(Te.$$.fragment,s),u(xe.$$.fragment,s),Em=!0)},o(s){y(E.$$.fragment,s),y(Q.$$.fragment,s),y(Ss.$$.fragment,s),y(Ws.$$.fragment,s),y(Ys.$$.fragment,s),y(Ks.$$.fragment,s),y(ta.$$.fragment,s),y(la.$$.fragment,s),y(pa.$$.fragment,s),y(oa.$$.fragment,s),y(ca.$$.fragment,s),y(wa.$$.fragment,s),y(ja.$$.fragment,s),y(za.$$.fragment,s),y(ka.$$.fragment,s),y(Ga.$$.fragment,s),y($a.$$.fragment,s),y(Ba.$$.fragment,s),y(Aa.$$.fragment,s),y(Pa.$$.fragment,s),y(Za.$$.fragment,s),y(Qa.$$.fragment,s),y(Ya.$$.fragment,s),y(at.$$.fragment,s),y(tt.$$.fragment,s),y(et.$$.fragment,s),y(it.$$.fragment,s),y(rt.$$.fragment,s),y(ct.$$.fragment,s),y(ht.$$.fragment,s),y(dt.$$.fragment,s),y(yt.$$.fragment,s),y(ft.$$.fragment,s),y(wt.$$.fragment,s),y(Mt.$$.fragment,s),y(_t.$$.fragment,s),y(xt.$$.fragment,s),y(Ut.$$.fragment,s),y(jt.$$.fragment,s),y(zt.$$.fragment,s),y(Gt.$$.fragment,s),y(Lt.$$.fragment,s),y(Nt.$$.fragment,s),y(qt.$$.fragment,s),y(Pt.$$.fragment,s),y(St.$$.fragment,s),y(Ht.$$.fragment,s),y(Ot.$$.fragment,s),y(Ft.$$.fragment,s),y(Yt.$$.fragment,s),y(se.$$.fragment,s),y(te.$$.fragment,s),y(ee.$$.fragment,s),y(le.$$.fragment,s),y(me.$$.fragment,s),y(oe.$$.fragment,s),y(ce.$$.fragment,s),y(ge.$$.fragment,s),y(fe.$$.fragment,s),y(ve.$$.fragment,s),y(Bs.$$.fragment,s),y(we.$$.fragment,s),y(be.$$.fragment,s),y(Me.$$.fragment,s),y(_e.$$.fragment,s),y(Te.$$.fragment,s),y(xe.$$.fragment,s),Em=!1},d(s){s&&(t(Es),t(W),t(P),t(j),t(zn),t(Zs),t(kn),t(In),t(Qs),t(Gn),t(Hs),t(Rn),t(Xs),t($n),t(Os),t(Ln),t(Nn),t(Vs),t(Bn),t(V),t(qn),t(Fs),t(An),t(Pn),t(Ds),t(En),t(Zn),t(sa),t(Sn),t(aa),t(Qn),t(Hn),t(ea),t(Xn),t(na),t(On),t(Wn),t(F),t(Kn),t(sl),t(ps),t(nl),t(ia),t(ll),t(ma),t(ml),t(us),t(rl),t(ol),t(is),t(hl),t(gl),t(ha),t(ul),t(ga),t(yl),t(da),t(wl),t(ya),t(_l),t(R),t(Ul),t($),t(Ll),t(Nl),t(ba),t(ql),t(Ma),t(Pl),t(ms),t(Zl),t(_a),t(Sl),t(rs),t(Hl),t(Ta),t(Xl),t(xa),t(Ol),t(Ja),t(Vl),t(H),t(ap),t(X),t(ip),t(Y),t(hp),t(Ua),t(gp),t(dp),t(Ca),t(up),t(w),t(Cp),t(zp),t(kp),t(Ia),t(Ip),t(Gp),t(Ra),t(Rp),t(Js),t($p),t(Lp),t(La),t(Np),t(Us),t(Bp),t(js),t(qp),t(Ap),t(Ea),t(Pp),t(Ep),t(Cs),t(Zp),t(zs),t(Sp),t(Sa),t(Qp),t(Hp),t(Ha),t(Xp),t(B),t(Wp),t(q),t(ti),t(Xa),t(ei),t(U),t(ci),t(Va),t(hi),t(Fa),t(gi),t(di),t(Da),t(ui),t(Ka),t(yi),t(st),t(fi),t(vi),t(wi),t(bi),t(nt),t(Mi),t(lt),t(_i),t(pt),t(Ti),t(xi),t(mt),t(Ji),t(Ui),t(ot),t(ji),t(Ci),t(zi),t(gt),t(ki),t(Ii),t(ut),t(Gi),t(Ri),t($i),t(vt),t(Li),t(Ni),t(bt),t(Bi),t(qi),t(Ai),t(Tt),t(Pi),t(Ei),t(Jt),t(Zi),t(Si),t(Qi),t(Ct),t(Hi),t(Xi),t(kt),t(Oi),t(It),t(Wi),t(Vi),t(Rt),t(Fi),t($t),t(Yi),t(Di),t(Ki),t(Bt),t(sm),t(am),t(At),t(tm),t(em),t(Et),t(nm),t(Zt),t(lm),t(pm),t(Qt),t(im),t(mm),t(Xt),t(rm),t(om),t(Wt),t(cm),t(Vt),t(hm),t(gm),t(dm),t(Dt),t(um),t(Kt),t(ym),t(Ls),t(fm),t(vm),t(ae),t(wm),t(bm),t(Mm),t(ne),t(_m),t(Tm),t(pe),t(xm),t(ie),t(Jm),t(Ns),t(Um),t(jm),t(re),t(Cm),t(zm),t(km),t(he),t(Im),t(Gm),t(de),t(Rm),t(ue),t($m),t(ye),t(Lm),t(Nm),t(k),t(Bm),t(qm),t(A),t(Am),t(Pm),t(_n)),t(z),f(E,s),f(Q,s),f(Ss,s),f(Ws,s),f(Ys,s),f(Ks,s),f(ta,s),f(la,s),f(pa,s),f(oa,s),f(ca,s),f(wa,s),f(ja,s),f(za,s),f(ka,s),f(Ga,s),f($a,s),f(Ba),f(Aa),f(Pa,s),f(Za,s),f(Qa,s),f(Ya,s),f(at,s),f(tt,s),f(et,s),f(it,s),f(rt,s),f(ct,s),f(ht,s),f(dt,s),f(yt,s),f(ft,s),f(wt,s),f(Mt,s),f(_t,s),f(xt,s),f(Ut,s),f(jt,s),f(zt,s),f(Gt,s),f(Lt,s),f(Nt,s),f(qt,s),f(Pt,s),f(St,s),f(Ht,s),f(Ot,s),f(Ft,s),f(Yt,s),f(se,s),f(te,s),f(ee,s),f(le,s),f(me,s),f(oe,s),f(ce,s),f(ge,s),f(fe,s),f(ve),f(Bs),f(we),f(be),f(Me),f(_e,s),f(Te),f(xe,s)}}}const cd='{"title":"GRPO Trainer","local":"grpo-trainer","sections":[{"title":"Overview","local":"overview","sections":[],"depth":2},{"title":"Quick start","local":"quick-start","sections":[],"depth":2},{"title":"Looking deeper into the GRPO method","local":"looking-deeper-into-the-grpo-method","sections":[{"title":"Generating completions","local":"generating-completions","sections":[],"depth":3},{"title":"Computing the advantage","local":"computing-the-advantage","sections":[],"depth":3},{"title":"Estimating the KL divergence","local":"estimating-the-kl-divergence","sections":[],"depth":3},{"title":"Computing the loss","local":"computing-the-loss","sections":[{"title":"Loss Types","local":"loss-types","sections":[],"depth":4}],"depth":3}],"depth":2},{"title":"Logged metrics","local":"logged-metrics","sections":[],"depth":2},{"title":"Customization","local":"customization","sections":[{"title":"Speed up training with vLLM-powered generation","local":"speed-up-training-with-vllm-powered-generation","sections":[{"title":"🔌 Option 1: Server mode","local":"-option-1-server-mode","sections":[],"depth":4},{"title":"🧩 Option 2: Colocate mode","local":"-option-2-colocate-mode","sections":[],"depth":4},{"title":"Dealing with the Training-Inference Mismatch","local":"dealing-with-the-training-inference-mismatch","sections":[],"depth":4}],"depth":3},{"title":"GRPO at scale: train a 70B+ Model on multiple nodes","local":"grpo-at-scale-train-a-70b-model-on-multiple-nodes","sections":[],"depth":3},{"title":"Using a custom reward function","local":"using-a-custom-reward-function","sections":[{"title":"Example 1: Reward longer completions","local":"example-1-reward-longer-completions","sections":[],"depth":4},{"title":"Example 1.1: Reward longer completions (based on the number of characters)","local":"example-11-reward-longer-completions-based-on-the-number-of-characters","sections":[],"depth":4},{"title":"Example 2: Reward completions with a specific format","local":"example-2-reward-completions-with-a-specific-format","sections":[],"depth":4},{"title":"Example 3: Reward completions based on a reference","local":"example-3-reward-completions-based-on-a-reference","sections":[],"depth":4},{"title":"Example 4: Multi-task reward functions","local":"example-4-multi-task-reward-functions","sections":[],"depth":4},{"title":"Example 5: Asynchronous reward functions","local":"example-5-asynchronous-reward-functions","sections":[],"depth":4},{"title":"Passing the reward function to the trainer","local":"passing-the-reward-function-to-the-trainer","sections":[],"depth":4}],"depth":3},{"title":"Rapid Experimentation for GRPO","local":"rapid-experimentation-for-grpo","sections":[],"depth":3}],"depth":2},{"title":"Agent Training","local":"agent-training","sections":[{"title":"Supported Models","local":"supported-models","sections":[],"depth":3},{"title":"Quick Start","local":"quick-start","sections":[],"depth":3}],"depth":2},{"title":"Vision-Language Model (VLM) Training","local":"vision-language-model-vlm-training","sections":[{"title":"Supported Models","local":"supported-models","sections":[],"depth":3},{"title":"Quick Start","local":"quick-start","sections":[],"depth":3},{"title":"Configuration Tips","local":"configuration-tips","sections":[],"depth":3},{"title":"Dataset Format","local":"dataset-format","sections":[],"depth":3}],"depth":2},{"title":"GRPOTrainer","local":"trl.GRPOTrainer","sections":[],"depth":2},{"title":"GRPOConfig","local":"trl.GRPOConfig","sections":[],"depth":2}],"depth":1}';function hd(Cn){return ad(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class wd extends ed{constructor(z){super(),nd(this,z,hd,od,Kg,{})}}export{wd as component};

Xet Storage Details

Size:
374 kB
·
Xet hash:
cdd7a0835ef16aa4ef0d4be8554c1e02efd1dd5cc42cb9c83b3b69016137d2d1

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.