Buckets:

rtrm's picture
download
raw
101 kB
import{s as _p,o as Ip,n as Ap}from"../chunks/scheduler.37c15a92.js";import{S as kp,i as zp,g as p,s as n,r,m as C,H as Xt,A as Gp,h as i,f as e,c as l,j as v,u as o,x as m,n as U,B as Pt,k as bp,y as d,a,v as c,d as h,t as g,w as y}from"../chunks/index.7cb9c9b8.js";import{T as Bp}from"../chunks/Tip.d10b3fc9.js";import{C as f}from"../chunks/CodeBlock.abae2786.js";import{H as u,E as Rp}from"../chunks/getInferenceSnippets.a2135f3c.js";/**
 * Zp: fragment factory for one static paragraph (the authorship note held
 * in `j`). The `ae` argument is accepted but never read in this body.
 *
 * Returns an object with five members:
 *   c()      - sets M to p("p") and assigns `j` as its innerHTML.
 *              Presumably the "create" path; `p` is the helper imported as
 *              `g` from the index chunk (confirm against that chunk).
 *   l(w)     - sets M to i(w, "P", {"data-svelte-h": true}) and assigns `j`
 *              as innerHTML only when m(M) is not "svelte-kxyfny".
 *              NOTE(review): looks like a hydration claim that skips the
 *              write when the existing markup already carries the expected
 *              hash; confirm against the helpers. If so, any change to `j`
 *              must come from a rebuild so the hash changes with it.
 *   m(w, Vt) - calls a(w, M, Vt). Presumably mounts M into target `w` at
 *              anchor `Vt` (confirm against the helper).
 *   p        - the imported `Ap`; no update logic is defined here.
 *   d(w)     - calls e(M) only when `w` is truthy. Presumably detaches M.
 */function Zp(ae){let M,j='This section dives into the technical and mathematical details of GRPO. It was authored by <a href="https://github.com/shirinyamani" rel="nofollow">Shirin Yamani</a>.';return{c(){M=p("p"),M.innerHTML=j},l(w){M=i(w,"P",{"data-svelte-h":!0}),m(M)!=="svelte-kxyfny"&&(M.innerHTML=j)},m(w,Vt){a(w,M,Vt)},p:Ap,d(w){w&&e(M)}}}function Qp(ae){let M,j,w,Vt,_,ne,T,le,I,Wn="Let’s deepen our understanding of GRPO so that we can improve our model’s training process.",pe,A,Yn="GRPO directly evaluates the model-generated responses by comparing them within groups of generation to optimize policy model, instead of training a separate value model (Critic). This approach leads to significant reduction in computational cost!",ie,k,qn="GRPO can be applied to any verifiable task where the correctness of the response can be determined. For instance, in math reasoning, the correctness of the response can be easily verified by comparing it to the ground truth.",me,z,On="Before diving into the technical details, let’s visualize how GRPO works at a high level:",re,G,Kn='<img src="./img/2.jpg" alt="deep"/>',oe,B,sl="Now that we have a visual overview, let’s break down how GRPO works step by step.",ce,R,he,Z,tl="The core innovation of GRPO is its approach to evaluating and learning from multiple generated responses simultaneously. 
Instead of relying on a separate reward model, it compares outputs within the same group to determine which ones should be reinforced.",ge,Q,el="Let’s walk through each step of the algorithm in detail:",ye,S,ue,H,al="The first step is to generate multiple possible answers for each question. This creates a diverse set of outputs that can be compared against each other.",de,E,nl="For each question $q$, the model will generate $G$ outputs (group size) from the trained policy:{ ${o<em>1, o_2, o_3, \\dots, o_G}\\pi</em>{\\theta_{\\text{old}}}$ }, $G=8$ where each $o_i$ represents one completion from the model.",Me,N,fe,L,ll="To make this concrete, let’s look at a simple arithmetic problem:",we,X,pl="<li><strong>Question</strong> $q$ : $\\text{Calculate}\\space2 + 2 \\times 6$</li> <li><strong>Outputs</strong> $(G = 8)$: ${o_1:14 \\text{ (correct)}, o_2:16 \\text{ (wrong)}, o_3:10 \\text{ (wrong)}, \\ldots, o_8:14 \\text{ (correct)}}$</li>",ve,P,il="Notice how some of the generated answers are correct (14) while others are wrong (16 or 10). This diversity is crucial for the next step.",be,V,Te,F,ml="Once we have multiple responses, we need a way to determine which ones are better than others. This is where the advantage calculation comes in.",$e,D,Je,W,rl="First, we assign a reward score to each generated response. In this example, we’ll use a reward model, but as we learnt in the previous section, we can use any reward returning function.",xe,Y,ol="Assign a RM score to each of the generated responses based on the correctness $r_i$ <em>(e.g. 
1 for correct response, 0 for wrong response)</em> then for each of the $r_i$ calculate the following Advantage value",Ce,q,Ue,O,Rn,je,Tp='<span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><semantics><mrow><msub><mi>A</mi><mi>i</mi></msub><mo>=</mo><mfrac><mrow><msub><mi>r</mi><mi>i</mi></msub><mo>−</mo><mtext>mean</mtext><mo stretchy="false">(</mo><mo stretchy="false">{</mo><msub><mi>r</mi><mn>1</mn></msub><mo separator="true">,</mo><msub><mi>r</mi><mn>2</mn></msub><mo separator="true">,</mo><mo>…</mo><mo separator="true">,</mo><msub><mi>r</mi><mi>G</mi></msub><mo stretchy="false">}</mo><mo stretchy="false">)</mo></mrow><mrow><mtext>std</mtext><mo stretchy="false">(</mo><mo stretchy="false">{</mo><msub><mi>r</mi><mn>1</mn></msub><mo separator="true">,</mo><msub><mi>r</mi><mn>2</mn></msub><mo separator="true">,</mo><mo>…</mo><mo separator="true">,</mo><msub><mi>r</mi><mi>G</mi></msub><mo stretchy="false">}</mo><mo stretchy="false">)</mo></mrow></mfrac></mrow><annotation encoding="application/x-tex">A_i = \\frac{r_i - \\text{mean}(\\{r_1, r_2, \\ldots, r_G\\})}{\\text{std}(\\{r_1, r_2, \\ldots, r_G\\})}</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.8333em;vertical-align:-0.15em;"></span><span class="mord"><span class="mord mathnormal">A</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">i</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mspace" 
style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:2.363em;vertical-align:-0.936em;"></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.427em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord text"><span class="mord">std</span></span><span class="mopen">({</span><span class="mord"><span class="mord mathnormal" style="margin-right:0.02778em;">r</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3011em;"><span style="top:-2.55em;margin-left:-0.0278em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight">1</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.02778em;">r</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3011em;"><span style="top:-2.55em;margin-left:-0.0278em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight">2</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="minner">…</span><span class="mspace" 
style="margin-right:0.1667em;"></span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.02778em;">r</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3283em;"><span style="top:-2.55em;margin-left:-0.0278em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">G</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mclose">})</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord"><span class="mord mathnormal" style="margin-right:0.02778em;">r</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:-0.0278em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">i</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">−</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mord text"><span class="mord">mean</span></span><span class="mopen">({</span><span class="mord"><span class="mord mathnormal" style="margin-right:0.02778em;">r</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span 
class="vlist" style="height:0.3011em;"><span style="top:-2.55em;margin-left:-0.0278em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight">1</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.02778em;">r</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3011em;"><span style="top:-2.55em;margin-left:-0.0278em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight">2</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="minner">…</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.02778em;">r</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3283em;"><span style="top:-2.55em;margin-left:-0.0278em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">G</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mclose">})</span></span></span></span><span 
class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.936em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span></span></span></span></span>',_e,K,Ie,ss,cl="Continuing with our arithmetic example for the same example above, imagine we have 8 responses, 4 of which is correct and the rest wrong, therefore;",Ae,ts,hl="<li>Group Average: $mean(r_i) = 0.5$</li> <li>Std: $std(r_i) = 0.53$</li> <li>Advantage Value:<ul><li>Correct response: $A_i = \\frac{1 - 0.5}{0.53}= 0.94$</li> <li>Wrong response: $A_i = \\frac{0 - 0.5}{0.53}= -0.94$</li></ul></li>",ke,es,ze,as,gl="Now that we have calculated the advantage values, let’s understand what they mean:",Ge,ns,yl="This standardization (i.e. $A_i$ weighting) allows the model to assess each response’s relative performance, guiding the optimization process to favour responses that are better than average (high reward) and discourage those that are worse. For instance if $A_i > 0$, then the $o_i$ is better response than the average level within its group; and if $A_i < 0$, then the $o_i$ then the quality of the response is less than the average (i.e. 
poor quality/performance).",Be,ls,ul="For the example above, if $A_i = 0.94 \\text{(correct output)}$ then during optimization steps its generation probability will be increased.",Re,ps,dl="With our advantage values calculated, we’re now ready to update the policy.",Ze,is,Qe,ms,Ml="The final step is to use these advantage values to update our model so that it becomes more likely to generate good responses in the future.",Se,rs,Zn,He,$p='<span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><semantics><mrow><msub><mi>J</mi><mrow><mi>G</mi><mi>R</mi><mi>P</mi><mi>O</mi></mrow></msub><mo stretchy="false">(</mo><mi>θ</mi><mo stretchy="false">)</mo><mo>=</mo><mrow><mo fence="true">[</mo><mfrac><mn>1</mn><mi>G</mi></mfrac><munderover><mo>∑</mo><mrow><mi>i</mi><mo>=</mo><mn>1</mn></mrow><mi>G</mi></munderover><mi>min</mi><mo>⁡</mo><mrow><mo fence="true">(</mo><mfrac><mrow><msub><mi>π</mi><mi>θ</mi></msub><mo stretchy="false">(</mo><msub><mi>o</mi><mi>i</mi></msub><mi mathvariant="normal">∣</mi><mi>q</mi><mo stretchy="false">)</mo></mrow><mrow><msub><mi>π</mi><msub><mi>θ</mi><mrow><mi>o</mi><mi>l</mi><mi>d</mi></mrow></msub></msub><mo stretchy="false">(</mo><msub><mi>o</mi><mi>i</mi></msub><mi mathvariant="normal">∣</mi><mi>q</mi><mo stretchy="false">)</mo></mrow></mfrac><msub><mi>A</mi><mi>i</mi></msub><mtext>clip</mtext><mrow><mo fence="true">(</mo><mfrac><mrow><msub><mi>π</mi><mi>θ</mi></msub><mo stretchy="false">(</mo><msub><mi>o</mi><mi>i</mi></msub><mi mathvariant="normal">∣</mi><mi>q</mi><mo stretchy="false">)</mo></mrow><mrow><msub><mi>π</mi><msub><mi>θ</mi><mrow><mi>o</mi><mi>l</mi><mi>d</mi></mrow></msub></msub><mo stretchy="false">(</mo><msub><mi>o</mi><mi>i</mi></msub><mi mathvariant="normal">∣</mi><mi>q</mi><mo stretchy="false">)</mo></mrow></mfrac><mo separator="true">,</mo><mn>1</mn><mo>−</mo><mi>ϵ</mi><mo separator="true">,</mo><mn>1</mn><mo>+</mo><mi>ϵ</mi><mo 
fence="true">)</mo></mrow><msub><mi>A</mi><mi>i</mi></msub><mo fence="true">)</mo></mrow><mo fence="true">]</mo></mrow><mo>−</mo><mi>β</mi><msub><mi>D</mi><mrow><mi>K</mi><mi>L</mi></mrow></msub><mo stretchy="false">(</mo><msub><mi>π</mi><mi>θ</mi></msub><mi mathvariant="normal">∣</mi><mi mathvariant="normal">∣</mi><msub><mi>π</mi><mrow><mi>r</mi><mi>e</mi><mi>f</mi></mrow></msub><mo stretchy="false">)</mo></mrow><annotation encoding="application/x-tex">J_{GRPO}(\\theta) = \\left[\\frac{1}{G} \\sum_{i=1}^{G} \\min \\left( \\frac{\\pi_{\\theta}(o_i|q)}{\\pi_{\\theta_{old}}(o_i|q)} A_i \\text{clip}\\left( \\frac{\\pi_{\\theta}(o_i|q)}{\\pi_{\\theta_{old}}(o_i|q)}, 1 - \\epsilon, 1 + \\epsilon \\right) A_i \\right)\\right]- \\beta D_{KL}(\\pi_{\\theta} || \\pi_{ref})</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.09618em;">J</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3283em;"><span style="top:-2.55em;margin-left:-0.0962em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight" style="margin-right:0.02778em;">GRPO</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord mathnormal" style="margin-right:0.02778em;">θ</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:3.106em;vertical-align:-1.2777em;"></span><span 
class="minner"><span class="mopen delimcenter" style="top:0em;"><span class="delimsizing size4">[</span></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.3214em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord mathnormal">G</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">1</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.686em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mop op-limits"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.8283em;"><span style="top:-1.8723em;margin-left:0em;"><span class="pstrut" style="height:3.05em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mrel mtight">=</span><span class="mord mtight">1</span></span></span></span><span style="top:-3.05em;"><span class="pstrut" style="height:3.05em;"></span><span><span class="mop op-symbol large-op">∑</span></span></span><span style="top:-4.3em;margin-left:0em;"><span class="pstrut" style="height:3.05em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">G</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:1.2777em;"><span></span></span></span></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span 
class="mop">min</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="minner"><span class="mopen delimcenter" style="top:0em;"><span class="delimsizing size3">(</span></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.427em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:-0.0359em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mtight"><span class="mord mathnormal mtight" style="margin-right:0.02778em;">θ</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3448em;"><span style="top:-2.3488em;margin-left:-0.0278em;margin-right:0.0714em;"><span class="pstrut" style="height:2.5em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">o</span><span class="mord mathnormal mtight" style="margin-right:0.01968em;">l</span><span class="mord mathnormal mtight">d</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.1512em;"><span></span></span></span></span></span></span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2559em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" 
style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">i</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mord">∣</span><span class="mord mathnormal" style="margin-right:0.03588em;">q</span><span class="mclose">)</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:-0.0359em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight" style="margin-right:0.02778em;">θ</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">i</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" 
style="height:0.15em;"><span></span></span></span></span></span></span><span class="mord">∣</span><span class="mord mathnormal" style="margin-right:0.03588em;">q</span><span class="mclose">)</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.9419em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mord"><span class="mord mathnormal">A</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">i</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mord text"><span class="mord">clip</span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="minner"><span class="mopen delimcenter" style="top:0em;"><span class="delimsizing size3">(</span></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.427em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:-0.0359em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mtight"><span class="mord mathnormal mtight" style="margin-right:0.02778em;">θ</span><span 
class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3448em;"><span style="top:-2.3488em;margin-left:-0.0278em;margin-right:0.0714em;"><span class="pstrut" style="height:2.5em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">o</span><span class="mord mathnormal mtight" style="margin-right:0.01968em;">l</span><span class="mord mathnormal mtight">d</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.1512em;"><span></span></span></span></span></span></span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2559em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">i</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mord">∣</span><span class="mord mathnormal" style="margin-right:0.03588em;">q</span><span class="mclose">)</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" 
style="height:0.3361em;"><span style="top:-2.55em;margin-left:-0.0359em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight" style="margin-right:0.02778em;">θ</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">i</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mord">∣</span><span class="mord mathnormal" style="margin-right:0.03588em;">q</span><span class="mclose">)</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.9419em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord">1</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">−</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mord mathnormal">ϵ</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord">1</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">+</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mord mathnormal">ϵ</span><span 
class="mclose delimcenter" style="top:0em;"><span class="delimsizing size3">)</span></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord mathnormal">A</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">i</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mclose delimcenter" style="top:0em;"><span class="delimsizing size3">)</span></span></span><span class="mclose delimcenter" style="top:0em;"><span class="delimsizing size4">]</span></span></span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">−</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" style="height:1.0361em;vertical-align:-0.2861em;"></span><span class="mord mathnormal" style="margin-right:0.05278em;">β</span><span class="mord"><span class="mord mathnormal" style="margin-right:0.02778em;">D</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3283em;"><span style="top:-2.55em;margin-left:-0.0278em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight" style="margin-right:0.07153em;">K</span><span class="mord mathnormal mtight">L</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord"><span class="mord 
mathnormal" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:-0.0359em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight" style="margin-right:0.02778em;">θ</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mord">∣∣</span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:-0.0359em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">re</span><span class="mord mathnormal mtight" style="margin-right:0.10764em;">f</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.2861em;"><span></span></span></span></span></span></span><span class="mclose">)</span></span></span></span></span>',Ee,os,fl="This formula might look intimidating at first, but it’s built from several components that each serve an important purpose. Let’s break them down one by one.",Ne,cs,Le,hs,wl="The GRPO update function combines several techniques to ensure stable and effective learning. 
Let’s examine each component:",Xe,gs,Pe,ys,vl="The probability ratio is defined as:",Ve,us,bl="$\\left(\\frac{\\pi_{\\theta}(o_i|q)}{\\pi_{\\theta_{old}}(o_i|q)}\\right)$",Fe,ds,Tl="Intuitively, the formula compares how much the new model’s response probability differs from the old model’s response probability while incorporating a preference for responses that improve the expected outcome.",De,Ms,We,fs,$l="<li>If $\\text{ratio} &gt; 1$, the new model assigns a higher probability to response $o_i$ than the old model.</li> <li>If $\\text{ratio} &lt; 1$, the new model assigns a lower probability to $o_i$ than the old model.</li>",Ye,ws,Jl="This ratio allows us to control how much the model changes at each step, which leads us to the next component.",qe,vs,Oe,bs,xl="The clipping function is defined as:",Ke,Ts,Cl="$\\text{clip}\\left( \\frac{\\pi_{\\theta}(o_i|q)}{\\pi_{\\theta_{old}}(o_i|q)}, 1 - \\epsilon, 1 + \\epsilon\\right)$",sa,$s,Ul="Clipping limits the ratio discussed above to the range $[1 - \\epsilon, 1 + \\epsilon]$ to avoid drastic changes or excessively large updates that step too far from the old policy. 
In other words, it limits how much the probability ratio can change, which helps maintain stability by avoiding updates that push the new model too far from the old one.",ta,Js,ea,xs,jl="Let’s look at two different scenarios to better understand this clipping function:",aa,Cs,_l="<li><strong>Case 1</strong>: If the new policy assigns a probability of 0.9 to a specific response and the old policy assigns a probability of 0.5, this response is being reinforced by the new policy to have a higher probability, but only within a controlled limit: the clipping ties its hands so that the change does not become drastic. <ul><li>$\\text{Ratio}: \\frac{\\pi_{\\theta}(o_i|q)}{\\pi_{\\theta_{old}}(o_i|q)} = \\frac{0.9}{0.5} = 1.8 → \\text{Clip}\\space1.2$ (upper bound limit 1.2)</li></ul></li> <li><strong>Case 2</strong>: If the new policy is not in favour of a response (lower probability, e.g. 0.2), meaning that if the response is not beneficial, increasing its probability might be incorrect and the model would be penalized.<ul><li>$\\text{Ratio}: \\frac{\\pi_{\\theta}(o_i|q)}{\\pi_{\\theta_{old}}(o_i|q)} = \\frac{0.2}{0.5} = 0.4 → \\text{Clip}\\space0.8$ (lower bound limit 0.8)</li></ul></li>",na,Us,la,js,Il="<li>The formula encourages the new model to favour responses that the old model underweighted <strong>if they improve the outcome</strong>.</li> <li>If the old model already favoured a response with a high probability, the new model can still reinforce it <strong>but only within a controlled limit $[1 - \\epsilon, 1 + \\epsilon]$, $\\text{(e.g., }\\epsilon = 0.2, \\space \\text{so} \\space [0.8-1.2])$</strong>.</li> <li>If the old model overestimated a response that performs poorly, the new model is <strong>discouraged</strong> from maintaining that high probability.</li> <li>Therefore, intuitively, by incorporating the probability ratio, the objective function ensures that updates to the policy are proportional to the advantage $A_i$ while being moderated to prevent drastic changes. 
T</li>",pa,_s,Al="While the clipping function helps prevent drastic changes, we need one more safeguard to ensure our model doesn’t deviate too far from its original behavior.",ia,Is,ma,As,kl="The KL divergence term is:",ra,ks,zl="$\\beta D<em>{KL}(\\pi</em>{\\theta} || \\pi_{ref})$",oa,zs,Gl="In the KL divergence term, the $\\pi<em>{ref}$ is basically the pre-update model’s output, <code>per_token_logps</code> and $\\pi</em>{\\theta}$ is the new model’s output, <code>new_per_token_logps</code>. Theoretically, KL divergence is minimized to prevent the model from deviating too far from its original behavior during optimization. This helps strike a balance between improving performance based on the reward signal and maintaining coherence. In this context, minimizing KL divergence reduces the risk of the model generating nonsensical text or, in the case of mathematical reasoning, producing extremely incorrect answers.",ca,Gs,ha,Bs,Bl="<li>A KL divergence penalty keeps the model’s outputs close to its original distribution, preventing extreme shifts.</li> <li>Instead of drifting towards completely irrational outputs, the model would refine its understanding while still allowing some exploration</li>",ga,Rs,ya,Zs,Rl="For those interested in the mathematical details, let’s look at the formal definition:",ua,$,Qn,da,Jp='<span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><semantics><mrow><msub><mi>D</mi><mrow><mi>K</mi><mi>L</mi></mrow></msub><mo stretchy="false">(</mo><mi>P</mi><mi mathvariant="normal">∣</mi><mi mathvariant="normal">∣</mi><mi>Q</mi><mo stretchy="false">)</mo><mo>=</mo><munder><mo>∑</mo><mrow><mi>x</mi><mo>∈</mo><mi>X</mi></mrow></munder><mi>P</mi><mo stretchy="false">(</mo><mi>x</mi><mo stretchy="false">)</mo><mi>log</mi><mo>⁡</mo><mfrac><mrow><mi>P</mi><mo stretchy="false">(</mo><mi>x</mi><mo stretchy="false">)</mo></mrow><mrow><mi>Q</mi><mo 
stretchy="false">(</mo><mi>x</mi><mo stretchy="false">)</mo></mrow></mfrac></mrow><annotation encoding="application/x-tex">D_{KL}(P || Q) = \\sum_{x \\in X} P(x) \\log \\frac{P(x)}{Q(x)}</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.02778em;">D</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3283em;"><span style="top:-2.55em;margin-left:-0.0278em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight" style="margin-right:0.07153em;">K</span><span class="mord mathnormal mtight">L</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord mathnormal" style="margin-right:0.13889em;">P</span><span class="mord">∣∣</span><span class="mord mathnormal">Q</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:2.7487em;vertical-align:-1.3217em;"></span><span class="mop op-limits"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.05em;"><span style="top:-1.8557em;margin-left:0em;"><span class="pstrut" style="height:3.05em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">x</span><span class="mrel mtight">∈</span><span class="mord mathnormal mtight" style="margin-right:0.07847em;">X</span></span></span></span><span 
style="top:-3.05em;"><span class="pstrut" style="height:3.05em;"></span><span><span class="mop op-symbol large-op">∑</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:1.3217em;"><span></span></span></span></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord mathnormal" style="margin-right:0.13889em;">P</span><span class="mopen">(</span><span class="mord mathnormal">x</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mop">lo<span style="margin-right:0.01389em;">g</span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.427em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord mathnormal">Q</span><span class="mopen">(</span><span class="mord mathnormal">x</span><span class="mclose">)</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.13889em;">P</span><span class="mopen">(</span><span class="mord mathnormal">x</span><span class="mclose">)</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.936em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span></span></span></span></span>',Ma,fa,Qs,wa,Ss,Zl="The coefficient $\\beta$ controls how strongly we enforce the KL divergence constraint:",va,Hs,Ql='<li><strong>Higher $\\beta$ (Stronger KL Penalty)</strong><ul><li>More constraint on policy updates. 
The model remains close to its reference distribution.</li> <li>Can slow down adaptation: The model may struggle to explore better responses.</li></ul></li> <li><strong>Lower $\\beta$ (Weaker KL Penalty)</strong><ul><li>More freedom to update policy: The model can deviate more from the reference.</li> <li>Faster adaptation but risk of instability: The model might learn reward-hacking behaviors.</li> <li>Over-optimization risk: If the reward model is flawed, the policy might generate nonsensical outputs.</li></ul></li> <li><strong>Original</strong> <a href="https://arxiv.org/abs/2402.03300" rel="nofollow">DeepSeekMath</a> paper set this $\\beta= 0.04$</li>',ba,Es,Sl="Now that we understand the components of GRPO, let’s see how they work together in a complete example.",Ta,Ns,$a,Ls,Hl="To solidify our understanding of GRPO, let’s walk through a complete example from start to finish.",Ja,Xs,xa,Ft,Ca,xp='<span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><semantics><mrow><mtext>Q: Calculate </mtext><mn>2</mn><mo>+</mo><mn>2</mn><mo>×</mo><mn>6</mn></mrow><annotation encoding="application/x-tex">\\text{Q: Calculate}\\space2 + 2 \\times 6</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.8889em;vertical-align:-0.1944em;"></span><span class="mord text"><span class="mord">Q: Calculate</span></span><span class="mspace"> </span><span class="mord">2</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">+</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" style="height:0.7278em;vertical-align:-0.0833em;"></span><span class="mord">2</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">×</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span 
class="strut" style="height:0.6444em;"></span><span class="mord">6</span></span></span></span></span>',Ua,Ps,ja,Vs,El="First, we generate multiple responses from our model:",_a,Fs,Sn,Ia,Cp='<span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><semantics><mrow><msub><mi>o</mi><mn>1</mn></msub><mo>:</mo><mn>14</mn><mo stretchy="false">(</mo><mi>c</mi><mi>o</mi><mi>r</mi><mi>r</mi><mi>e</mi><mi>c</mi><mi>t</mi><mo stretchy="false">)</mo><mo separator="true">,</mo><msub><mi>o</mi><mn>2</mn></msub><mo>:</mo><mn>10</mn><mo stretchy="false">(</mo><mi>w</mi><mi>r</mi><mi>o</mi><mi>n</mi><mi>g</mi><mo stretchy="false">)</mo><mo separator="true">,</mo><msub><mi>o</mi><mn>3</mn></msub><mo>:</mo><mn>16</mn><mo stretchy="false">(</mo><mi>w</mi><mi>r</mi><mi>o</mi><mi>n</mi><mi>g</mi><mo stretchy="false">)</mo><mo separator="true">,</mo><mi mathvariant="normal">.</mi><mi mathvariant="normal">.</mi><mi mathvariant="normal">.</mi><msub><mi>o</mi><mi>G</mi></msub><mo>:</mo><mn>14</mn><mo stretchy="false">(</mo><mi>c</mi><mi>o</mi><mi>r</mi><mi>r</mi><mi>e</mi><mi>c</mi><mi>t</mi><mo stretchy="false">)</mo></mrow><annotation encoding="application/x-tex">{o_1:14(correct), o_2:10 (wrong), o_3:16 (wrong), ... 
o_G:14(correct)}</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord"><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3011em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight">1</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">:</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mord">14</span><span class="mopen">(</span><span class="mord mathnormal">correc</span><span class="mord mathnormal">t</span><span class="mclose">)</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3011em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight">2</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">:</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mord">10</span><span class="mopen">(</span><span class="mord mathnormal" style="margin-right:0.02691em;">w</span><span class="mord 
mathnormal">ro</span><span class="mord mathnormal">n</span><span class="mord mathnormal" style="margin-right:0.03588em;">g</span><span class="mclose">)</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3011em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight">3</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">:</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mord">16</span><span class="mopen">(</span><span class="mord mathnormal" style="margin-right:0.02691em;">w</span><span class="mord mathnormal">ro</span><span class="mord mathnormal">n</span><span class="mord mathnormal" style="margin-right:0.03588em;">g</span><span class="mclose">)</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord">...</span><span class="mord"><span class="mord mathnormal">o</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3283em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">G</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">:</span><span 
class="mspace" style="margin-right:0.2778em;"></span><span class="mord">14</span><span class="mopen">(</span><span class="mord mathnormal">correc</span><span class="mord mathnormal">t</span><span class="mclose">)</span></span></span></span></span></span>',Aa,Ds,ka,Ws,Nl="Next, we calculate the advantage values to determine which responses are better than average:",za,b,Dt,Hn,Ga,Up='<span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><semantics><mrow><mi>m</mi><mi>e</mi><mi>a</mi><mi>n</mi><mo stretchy="false">(</mo><msub><mi>r</mi><mi>i</mi></msub><mo stretchy="false">)</mo><mo>=</mo><mn>0.5</mn></mrow><annotation encoding="application/x-tex">mean(r_i) = 0.5</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord mathnormal">m</span><span class="mord mathnormal">e</span><span class="mord mathnormal">an</span><span class="mopen">(</span><span class="mord"><span class="mord mathnormal" style="margin-right:0.02778em;">r</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:-0.0278em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">i</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:0.6444em;"></span><span class="mord">0.5</span></span></span></span></span>',En,Wt,Ll="Std: $$std(r_i) = 
0.53$$",Nn,Yt,Xl="Advantage Value:<ul><li>Correct response: $A_i = \\frac{1 - 0.5}{0.53}= 0.94$</li> <li>Wrong response: $A_i = \\frac{0 - 0.5}{0.53}= -0.94$</li></ul>",Ba,Ys,Ra,qs,Pl="Finally, we update our model to reinforce the correct responses:",Za,J,x,Ln,qt,Vl="{\\theta",Xn,Qa,jp='<span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><semantics><mrow><mtext>Ratio</mtext><mo>:</mo><mfrac><mn>0.7</mn><mn>0.5</mn></mfrac><mo>=</mo><mn>1.4</mn><mo>→</mo><mtext>after Clip </mtext><mn>1.2</mn><mtext> </mtext><mo stretchy="false">(</mo><mi>ϵ</mi><mo>=</mo><mn>0.2</mn><mo stretchy="false">)</mo></mrow><annotation encoding="application/x-tex">\\text{Ratio}: \\frac{0.7}{0.5} = 1.4 →\\text{after Clip}\\space1.2 \\space (\\epsilon = 0.2)</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.6833em;"></span><span class="mord text"><span class="mord">Ratio</span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">:</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:2.0074em;vertical-align:-0.686em;"></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.3214em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">0.5</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">0.7</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" 
style="height:0.686em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:0.6444em;"></span><span class="mord">1.4</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">→</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord text"><span class="mord">after Clip</span></span><span class="mspace"> </span><span class="mord">1.2</span><span class="mspace"> </span><span class="mopen">(</span><span class="mord mathnormal">ϵ</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord">0.2</span><span class="mclose">)</span></span></span></span></span>',Pn,Ot,Fl="Then when the target function is re-weighted, the model tends to reinforce the generation of correct output, and the $\\text{KL Divergence}$ limits the deviation from the reference policy.",Sa,Os,Dl="With the theoretical understanding in place, let’s see how GRPO can be implemented in code.",Ha,Ks,Ea,st,Wl="Let’s put everything together in a practical example. The following code demonstrates how to implement GRPO in PyTorch.",Na,tt,La,et,Yl="First, we need to load a model and generate multiple responses for a given question:",Xa,at,Pa,nt,ql="this initial Generation (Before Any Steps) will output sth like this:",Va,lt,Fa,pt,Da,it,Ol="Now, we need to determine which responses are correct and assign rewards accordingly:",Wa,mt,Kl="With GRPO, with the same sample prompt, we generate multiple completions. 
So, for instance, for our prompts <code>&quot;Solve y = 2x + 1 for x = 2, y = &quot;</code> and <code>&quot;Solve y = 2x + 1 for x = 4, y = &quot;</code>, we have two groups of generated outputs, one per prompt: one is, say,",Ya,rt,sp="<li><code>[5, 6, 7, 5]</code> and the other is</li> <li><code>[10, 2, 9, 9]</code>, while the correct answers are 5 and 9, respectively.</li>",qa,ot,tp="Note that in practice these reward scores are produced by a rule-based reward function that assigns rewards based on the correctness of the response, by a more complex neural network-based model that can be trained to assign rewards based on the correctness of the response, or by a mix of both. But for the sake of simplicity, let’s say our reward per response is 1 if the response is correct and 0 if it is wrong; therefore:",Oa,ct,Ka,ht,ep="Next, we get the group-wise mean and std of the rewards:",sn,gt,tn,yt,ap="This will output:",en,ut,an,dt,np="Now we can calculate the advantage values for each response:",nn,Mt,ln,ft,lp="This will output:",pn,wt,mn,vt,pp="which comes from the advantage formula above, so:",rn,bt,on,Tt,ip="However, the shape here is <code>(B*G,) = (8,)</code> but in practice, we need to have the shape of <code>(B, G) = (2, 4)</code> to match the logits shape, right? 
Therefore, we need to unsqueeze the advantages tensor to have the shape of <code>(B*G, 1) = (8, 1)</code> to match the logits shape.",cn,$t,hn,Jt,mp="which will output:",gn,xt,yn,Ct,rp="now we are good, let’s move to the next step of updating the policy model based on the advantage values.",un,Ut,dn,jt,op="Finally, we use the advantage values to update our model:",Mn,_t,fn,It,cp="Note that the <code>per_token_logps</code> can be achieved by passing the generated outputs to the model and get the logits and then apply the softmax function to get the probabilities <code>F.softmax(logits, dim=-1)</code>.",wn,At,vn,kt,hp="<code>per_token_kl</code> can also be calculated as follows:",bn,zt,Tn,Gt,gp='Complete example can be found <a href="./basic_example.py">here</a>. GRPO is also implemented by the excellent TRL team, you can check the implementation <a href="https://github.com/huggingface/trl/blob/main/trl/trainer/grpo_trainer.py" rel="nofollow">TRL/GRPO_trainer</a> for more details.',$n,Bt,Jn,Rt,yp="Congratulations! You’ve now learned about Group Relative Policy Optimization (GRPO). To recap what we’ve covered:",xn,Zt,up="<li>GRPO compares multiple outputs within a group to determine which ones are better than others, without requiring a separate value model.</li> <li>The advantage calculation standardizes rewards to identify which responses are above or below average.</li> <li>The policy update uses a clipped objective function with a KL divergence penalty to ensure stable learning.</li>",Cn,Qt,dp="This approach is particularly powerful for mathematical reasoning tasks, where correctness can be objectively verified. The GRPO method allows for more efficient training compared to traditional RLHF approaches that require a separate critic model.",Un,St,Mp="As you continue exploring GRPO, consider experimenting with different group sizes, reward functions, and KL penalty coefficients to see how they affect your model’s performance.",jn,Ht,fp="Happy training! 
🚀",_n,Et,In,Nt,wp='<li><a href="https://github.com/natolambert/rlhf-book" rel="nofollow">RLHF Book by Nathan Lambert</a></li> <li><a href="https://huggingface.co/papers/2412.19437" rel="nofollow">DeepSeek-V3 Technical Report</a></li> <li><a href="https://huggingface.co/papers/2402.03300" rel="nofollow">DeepSeekMath</a></li>',An,Lt,kn,te,zn;return _=new u({props:{title:"Advanced Understanding of Group Relative Policy Optimization (GRPO) in DeepSeekMath",local:"advanced-understanding-of-group-relative-policy-optimization-grpo-in-deepseekmath",headingTag:"h1"}}),T=new Bp({props:{$$slots:{default:[Zp]},$$scope:{ctx:ae}}}),R=new u({props:{title:"The GRPO Algorithm",local:"the-grpo-algorithm",headingTag:"h2"}}),S=new u({props:{title:"Step 1: Group Sampling",local:"step-1-group-sampling",headingTag:"h3"}}),N=new u({props:{title:"Example:",local:"example",headingTag:"h4"}}),V=new u({props:{title:"Step 2: Advantage Calculation",local:"step-2-advantage-calculation",headingTag:"h3"}}),D=new u({props:{title:"Reward Distribution:",local:"reward-distribution",headingTag:"h4"}}),q=new u({props:{title:"Advantage Value Formula:",local:"advantage-value-formula",headingTag:"h4"}}),K=new u({props:{title:"Example:",local:"example",headingTag:"h4"}}),es=new u({props:{title:"Interpretation:",local:"interpretation",headingTag:"h4"}}),is=new u({props:{title:"Step 3: Policy Update",local:"step-3-policy-update",headingTag:"h3"}}),cs=new u({props:{title:"Key Components of the Target Function",local:"key-components-of-the-target-function",headingTag:"h2"}}),gs=new u({props:{title:"1. Probability Ratio",local:"1-probability-ratio",headingTag:"h3"}}),Ms=new u({props:{title:"Interpretation:",local:"interpretation",headingTag:"h4"}}),vs=new u({props:{title:"2. 
Clip Function",local:"2-clip-function",headingTag:"h3"}}),Js=new u({props:{title:"Example $\\space \\text{suppose}(\\epsilon = 0.2)$",local:"example-space-textsupposeepsilon--02",headingTag:"h4"}}),Us=new u({props:{title:"Interpretation:",local:"interpretation",headingTag:"h4"}}),Is=new u({props:{title:"3. KL Divergence",local:"3-kl-divergence",headingTag:"h3"}}),Gs=new u({props:{title:"Interpretation",local:"interpretation",headingTag:"h4"}}),Rs=new u({props:{title:"Math Definition",local:"math-definition",headingTag:"h4"}}),Qs=new u({props:{title:"The Role of $\\beta$ Parameter",local:"the-role-of-beta-parameter",headingTag:"h4"}}),Ns=new u({props:{title:"Worked Example with GRPO",local:"worked-example-with-grpo",headingTag:"h2"}}),Xs=new u({props:{title:"Example Problem",local:"example-problem",headingTag:"h3"}}),Ps=new u({props:{title:"Step 1: Group Sampling",local:"step-1-group-sampling",headingTag:"h3"}}),Ds=new u({props:{title:"Step 2: Advantage Calculation",local:"step-2-advantage-calculation",headingTag:"h3"}}),Ys=new u({props:{title:"Step 3: Policy Update",local:"step-3-policy-update",headingTag:"h3"}}),Ks=new u({props:{title:"Implementation Example",local:"implementation-example",headingTag:"h2"}}),tt=new u({props:{title:"1. 
Loading the Model and Generating Responses",local:"1-loading-the-model-and-generating-responses",headingTag:"h3"}}),at=new f({props:{code:"aW1wb3J0JTIwdG9yY2glMEFpbXBvcnQlMjB0b3JjaC5ubi5mdW5jdGlvbmFsJTIwYXMlMjBGJTBBZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMEF1dG9Nb2RlbEZvckNhdXNhbExNJTJDJTIwQXV0b1Rva2VuaXplciUwQSUwQSUyMyUyMExvYWQlMjB0aGUlMjBtb2RlbCUyMGFuZCUyMHRva2VuaXplciUwQW1vZGVsX25hbWUlMjAlM0QlMjAlMjJRd2VuJTJGUXdlbjItTWF0aC0xLjVCJTIyJTBBbW9kZWwlMjAlM0QlMjBBdXRvTW9kZWxGb3JDYXVzYWxMTS5mcm9tX3ByZXRyYWluZWQobW9kZWxfbmFtZSklMEF0b2tlbml6ZXIlMjAlM0QlMjBBdXRvVG9rZW5pemVyLmZyb21fcHJldHJhaW5lZChtb2RlbF9uYW1lKSUwQW1vZGVsLmV2YWwoKSUwQSUwQSUyMyUyME1vdmUlMjBtb2RlbCUyMHRvJTIwR1BVJTIwaWYlMjBhdmFpbGFibGUlMEFkZXZpY2UlMjAlM0QlMjB0b3JjaC5kZXZpY2UoJTIyY3VkYSUyMiUyMGlmJTIwdG9yY2guY3VkYS5pc19hdmFpbGFibGUoKSUyMGVsc2UlMjAlMjJjcHUlMjIpJTBBbW9kZWwudG8oZGV2aWNlKSUwQSUwQSUyMyUyMElucHV0JTIwcHJvbXB0JTBBcHJvbXB0JTIwJTNEJTIwJTIyU29sdmUlMjB5JTIwJTNEJTIwMnglMjAlMkIlMjAxJTIwZm9yJTIweCUyMCUzRCUyMDIlMkMlMjB5JTIwJTNEJTIwJTIyJTIwJTIwJTIzJTIwQ29ycmVjdCUyMGFuc3dlciUzQSUyMDUlMEFpbnB1dHMlMjAlM0QlMjB0b2tlbml6ZXIocHJvbXB0JTJDJTIwcmV0dXJuX3RlbnNvcnMlM0QlMjJwdCUyMiUyQyUyMHBhZGRpbmclM0RUcnVlKSUwQWlucHV0X2lkcyUyMCUzRCUyMGlucHV0cyU1QiUyMmlucHV0X2lkcyUyMiU1RC50byhkZXZpY2UpJTIwJTIwJTIzJTIwU2hhcGUlM0ElMjAoMSUyQyUyMHByb21wdF9sZW4pJTBBYXR0ZW50aW9uX21hc2slMjAlM0QlMjBpbnB1dHMlNUIlMjJhdHRlbnRpb25fbWFzayUyMiU1RC50byhkZXZpY2UpJTBBJTBBJTIzJTIwU3RlcCUyMDElM0ElMjBHZW5lcmF0ZSUyMDglMjByZXNwb25zZXMlMjAoQiUyMCUzRCUyMDIlMjBncm91cHMlMkMlMjBHJTIwJTNEJTIwNCUyMHJlc3BvbnNlcyUyMHBlciUyMGdyb3VwKSUwQWJhdGNoX3NpemUlMkMlMjBudW1fZ2VuZXJhdGlvbnMlMjAlM0QlMjAyJTJDJTIwNCUwQW91dHB1dHMlMjAlM0QlMjBtb2RlbC5nZW5lcmF0ZSglMEElMjAlMjAlMjAlMjBpbnB1dF9pZHMlM0RpbnB1dF9pZHMlMkMlMjAlMjAlMjMlMjBTaGFwZSUzQSUyMCgxJTJDJTIwcHJvbXB0X2xlbiklMEElMjAlMjAlMjAlMjBhdHRlbnRpb25fbWFzayUzRGF0dGVudGlvbl9tYXNrJTJDJTBBJTIwJTIwJTIwJTIwbWF4X25ld190b2tlbnMlM0QxJTJDJTIwJTIwJTIzJTIwc2VxX2xlbiUyMCUzRCUyMDElMjAoc2luZ2xlJTIwdG9rZW4lMjBwZXIlMjByZXNwb25zZSklMEElMjAlMjAlMjAlMjBudW1fcmV0d
XJuX3NlcXVlbmNlcyUzRGJhdGNoX3NpemUlMjAqJTIwbnVtX2dlbmVyYXRpb25zJTJDJTIwJTIwJTIzJTIwOCUyMHJlc3BvbnNlcyUyMHRvdGFsJTBBJTIwJTIwJTIwJTIwZG9fc2FtcGxlJTNEVHJ1ZSUyQyUwQSUyMCUyMCUyMCUyMHRvcF9rJTNEMTAlMkMlMEElMjAlMjAlMjAlMjB0ZW1wZXJhdHVyZSUzRDAuNyUyQyUwQSUyMCUyMCUyMCUyMHBhZF90b2tlbl9pZCUzRHRva2VuaXplci5lb3NfdG9rZW5faWQlMkMlMEElMjAlMjAlMjAlMjByZXR1cm5fZGljdF9pbl9nZW5lcmF0ZSUzRFRydWUlMkMlMEElMjAlMjAlMjAlMjBvdXRwdXRfc2NvcmVzJTNEVHJ1ZSUyQyUwQSk=",highlighted:`<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">import</span> torch.nn.functional <span class="hljs-keyword">as</span> F
<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForCausalLM, AutoTokenizer
<span class="hljs-comment"># Load the model and tokenizer</span>
model_name = <span class="hljs-string">&quot;Qwen/Qwen2-Math-1.5B&quot;</span>
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model.<span class="hljs-built_in">eval</span>()
<span class="hljs-comment"># Move model to GPU if available</span>
device = torch.device(<span class="hljs-string">&quot;cuda&quot;</span> <span class="hljs-keyword">if</span> torch.cuda.is_available() <span class="hljs-keyword">else</span> <span class="hljs-string">&quot;cpu&quot;</span>)
model.to(device)
<span class="hljs-comment"># Input prompt</span>
prompt = <span class="hljs-string">&quot;Solve y = 2x + 1 for x = 2, y = &quot;</span> <span class="hljs-comment"># Correct answer: 5</span>
inputs = tokenizer(prompt, return_tensors=<span class="hljs-string">&quot;pt&quot;</span>, padding=<span class="hljs-literal">True</span>)
input_ids = inputs[<span class="hljs-string">&quot;input_ids&quot;</span>].to(device) <span class="hljs-comment"># Shape: (1, prompt_len)</span>
attention_mask = inputs[<span class="hljs-string">&quot;attention_mask&quot;</span>].to(device)
<span class="hljs-comment"># Step 1: Generate 8 responses (B = 2 groups, G = 4 responses per group)</span>
batch_size, num_generations = <span class="hljs-number">2</span>, <span class="hljs-number">4</span>
outputs = model.generate(
input_ids=input_ids, <span class="hljs-comment"># Shape: (1, prompt_len)</span>
attention_mask=attention_mask,
max_new_tokens=<span class="hljs-number">1</span>, <span class="hljs-comment"># seq_len = 1 (single token per response)</span>
num_return_sequences=batch_size * num_generations, <span class="hljs-comment"># 8 responses total</span>
do_sample=<span class="hljs-literal">True</span>,
top_k=<span class="hljs-number">10</span>,
temperature=<span class="hljs-number">0.7</span>,
pad_token_id=tokenizer.eos_token_id,
return_dict_in_generate=<span class="hljs-literal">True</span>,
output_scores=<span class="hljs-literal">True</span>,
)`,wrap:!1}}),lt=new f({props:{code:"T3V0cHV0JTIwMSUzQSUyMDUuMCUwQU91dHB1dCUyMDIlM0ElMjA2LjAlMEFPdXRwdXQlMjAzJTNBJTIwNy4wJTBBT3V0cHV0JTIwNCUzQSUyMDUuMCUwQU91dHB1dCUyMDUlM0ElMjAxMC4wJTBBT3V0cHV0JTIwNiUzQSUyMDIuMCUwQU91dHB1dCUyMDclM0ElMjA1LjAlMEFPdXRwdXQlMjA4JTNBJTIwNS4w",highlighted:`Output 1: 5.0
Output 2: 6.0
Output 3: 7.0
Output 4: 5.0
Output 5: 10.0
Output 6: 2.0
Output 7: 5.0
Output 8: 5.0`,wrap:!1}}),pt=new u({props:{title:"2. Calculating Rewards",local:"2-calculating-rewards",headingTag:"h3"}}),ct=new f({props:{code:"cmV3YXJkXzElMjAlM0QlMjAlNUIxJTJDJTIwMCUyQyUyMDAlMkMlMjAxJTVEJTBBcmV3YXJkXzIlMjAlM0QlMjAlNUIwJTJDJTIwMCUyQyUyMDElMkMlMjAxJTVE",highlighted:`reward_1 = [<span class="hljs-number">1</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">1</span>]
reward_2 = [<span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>]`,wrap:!1}}),gt=new f({props:{code:"JTIzJTIwU2hhcGUlM0ElMjAoQiUyMColMjBHJTJDKSUyMCUzRCUyMCg4JTJDKSUyMGJjJTIwd2UlMjBoYXZlJTIwMiUyMGdyb3VwcyUyMG9mJTIwNCUyMGdlbmVyYXRpb25zJTIwdGhhdCUyMHdlJTIwZmxhdHRlbiUwQXJld2FyZHMlMjAlM0QlMjB0b3JjaC50ZW5zb3IoJTVCMSUyQyUyMDAlMkMlMjAwJTJDJTIwMSUyQyUyMDAlMkMlMjAwJTJDJTIwMSUyQyUyMDElNUQlMkMlMjBkdHlwZSUzRHRvcmNoLmZsb2F0MzIpJTBBbnVtX2dlbmVyYXRpb25zJTIwJTNEJTIwNCUwQSUwQSUyMyUyMEdyb3VwJTIwcmV3YXJkcyUzQSUyMFNoYXBlJTIwKEIlMkMlMjBHKSUyMCUzRCUyMDIlMkMlMjA0KSUwQXJld2FyZHNfZ3JvdXBlZCUyMCUzRCUyMHJld2FyZHMudmlldygtMSUyQyUyMG51bV9nZW5lcmF0aW9ucyklMEElMEElMjMlMjBNZWFuJTIwcGVyJTIwZ3JvdXAlM0ElMjBTaGFwZSUyMChCJTJDKSUyMCUzRCUyMCgyJTJDKSUwQW1lYW5fZ3JvdXBlZF9yZXdhcmRzJTIwJTNEJTIwcmV3YXJkc19ncm91cGVkLm1lYW4oZGltJTNEMSklMEElMEElMjMlMjBTdGQlMjBwZXIlMjBncm91cCUzQSUyMFNoYXBlJTIwKEIlMkMpJTIwJTNEJTIwKDIlMkMpJTBBc3RkX2dyb3VwZWRfcmV3YXJkcyUyMCUzRCUyMHJld2FyZHNfZ3JvdXBlZC5zdGQoZGltJTNEMSklMEElMEElMjMlMjBCcm9hZGNhc3QlMjB0byUyMG1hdGNoJTIwcmV3YXJkcyUyMGFuZCUyMG5vcm1hbGl6ZSUzQSUyMFNoYXBlJTIwKEIlMjAqJTIwRyUyQyklMjAlM0QlMjAoOCUyQyklMEElMjMlMjB3aHklMjB3ZSUyMG5lZWQlMjB0byUyMGJyb2FkY2FzdCUzRiUyMGJlY2F1c2UlMjB3ZSUyMG5lZWQlMjB0byUyMGNhbGN1bGF0ZSUyMHRoZSUyMGFkdmFudGFnZSUyMHZhbHVlcyUyMGZvciUyMGVhY2glMjByZXNwb25zZSUyMHdpdGhpbiUyMHRoZSUyMGdyb3VwJTBBbWVhbl9ncm91cGVkX3Jld2FyZHMlMjAlM0QlMjBtZWFuX2dyb3VwZWRfcmV3YXJkcy5yZXBlYXRfaW50ZXJsZWF2ZShudW1fZ2VuZXJhdGlvbnMlMkMlMjBkaW0lM0QwKSUwQXN0ZF9ncm91cGVkX3Jld2FyZHMlMjAlM0QlMjBzdGRfZ3JvdXBlZF9yZXdhcmRzLnJlcGVhdF9pbnRlcmxlYXZlKG51bV9nZW5lcmF0aW9ucyUyQyUyMGRpbSUzRDAp",highlighted:`<span class="hljs-comment"># Shape: (B * G,) = (8,) bc we have 2 groups of 4 generations that we flatten</span>
rewards = torch.tensor([<span class="hljs-number">1</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">1</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>], dtype=torch.float32)
num_generations = <span class="hljs-number">4</span>
<span class="hljs-comment"># Group rewards: Shape (B, G) = 2, 4)</span>
rewards_grouped = rewards.view(-<span class="hljs-number">1</span>, num_generations)
<span class="hljs-comment"># Mean per group: Shape (B,) = (2,)</span>
mean_grouped_rewards = rewards_grouped.mean(dim=<span class="hljs-number">1</span>)
<span class="hljs-comment"># Std per group: Shape (B,) = (2,)</span>
std_grouped_rewards = rewards_grouped.std(dim=<span class="hljs-number">1</span>)
<span class="hljs-comment"># Broadcast to match rewards and normalize: Shape (B * G,) = (8,)</span>
<span class="hljs-comment"># why we need to broadcast? because we need to calculate the advantage values for each response within the group</span>
mean_grouped_rewards = mean_grouped_rewards.repeat_interleave(num_generations, dim=<span class="hljs-number">0</span>)
std_grouped_rewards = std_grouped_rewards.repeat_interleave(num_generations, dim=<span class="hljs-number">0</span>)`,wrap:!1}}),ut=new f({props:{code:"R3JvdXBlZCUyMFJld2FyZHMlM0ElMjB0ZW5zb3IoJTVCJTVCMS4lMkMlMjAwLiUyQyUyMDAuJTJDJTIwMS4lNUQlMkMlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlNUIwLiUyQyUyMDAuJTJDJTIwMS4lMkMlMjAxLiU1RCU1RCklMEFNZWFuJTIwcGVyJTIwZ3JvdXAlM0ElMjB0ZW5zb3IoJTVCMC41MDAwJTJDJTIwMC41MDAwJTVEKSUwQVN0ZCUyMHBlciUyMGdyb3VwJTNBJTIwdGVuc29yKCU1QjAuNTc3NCUyQyUyMDAuNTc3NCU1RCklMEFCcm9hZGNhc3RlZCUyME1lYW4lM0ElMjB0ZW5zb3IoJTVCMC41MDAwJTJDJTIwMC41MDAwJTJDJTIwMC41MDAwJTJDJTIwMC41MDAwJTJDJTIwMC41MDAwJTJDJTIwMC41MDAwJTJDJTIwMC41MDAwJTJDJTIwMC41MDAwJTVEKSUwQUJyb2FkY2FzdGVkJTIwU3RkJTNBJTIwdGVuc29yKCU1QjAuNTc3NCUyQyUyMDAuNTc3NCUyQyUyMDAuNTc3NCUyQyUyMDAuNTc3NCUyQyUyMDAuNTc3NCUyQyUyMDAuNTc3NCUyQyUyMDAuNTc3NCUyQyUyMDAuNTc3NCU1RCk=",highlighted:`Grouped Rewards: tensor([[1., 0., 0., 1.],
[0., 0., 1., 1.]])
Mean per group: tensor([0.5000, 0.5000])
Std per group: tensor([0.5774, 0.5774])
Broadcasted Mean: tensor([0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000])
Broadcasted Std: tensor([0.5774, 0.5774, 0.5774, 0.5774, 0.5774, 0.5774, 0.5774, 0.5774])`,wrap:!1}}),Mt=new f({props:{code:"JTIzJTIwQWR2YW50YWdlcyUzQSUyMFNoYXBlJTIwKEIlMjAqJTIwRyUyQyklMjAlM0QlMjAoOCUyQyklMEFhZHZhbnRhZ2VzJTIwJTNEJTIwKHJld2FyZHMlMjAtJTIwbWVhbl9ncm91cGVkX3Jld2FyZHMpJTIwJTJGJTIwKHN0ZF9ncm91cGVkX3Jld2FyZHMlMjAlMkIlMjAxZS04KQ==",highlighted:`<span class="hljs-comment"># Advantages: Shape (B * G,) = (8,)</span>
advantages = (rewards - mean_grouped_rewards) / (std_grouped_rewards + <span class="hljs-number">1e-8</span>)`,wrap:!1}}),wt=new f({props:{code:"QWR2YW50YWdlcyUzQSUyMHRlbnNvciglNUIlMjAwLjg2NTklMkMlMjAtMC44NjYwJTJDJTIwLTAuODY2MCUyQyUyMCUyMDAuODY1OSUyQyUyMC0wLjg2NjAlMkMlMjAtMC44NjYwJTJDJTIwJTIwMC44NjU5JTJDJTIwJTIwMC44NjU5JTVEKQ==",highlighted:"Advantages: tensor([ 0.8659, -0.8660, -0.8660, 0.8659, -0.8660, -0.8660, 0.8659, 0.8659])",wrap:!1}}),bt=new f({props:{code:"Rm9yJTIwcmV3YXJkXzElMjAlM0QlMjAlNUIxJTJDJTIwMCUyQyUyMDAlMkMlMjAxJTVEJTNBJTBBMSUyMC0lMjAwLjUlMjAlMkYlMjAwLjU3NzQlMjAlRTIlODklODglMjAwLjg2NTklMEEwJTIwLSUyMDAuNSUyMCUyRiUyMDAuNTc3NCUyMCVFMiU4OSU4OCUyMC0wLjg2NjAlMEFGb3IlMjByZXdhcmRfMiUyMCUzRCUyMCU1QjAlMkMlMjAwJTJDJTIwMSUyQyUyMDElNUQlM0ElMjBTYW1lJTIwcGF0dGVybi4=",highlighted:`For reward_1 = [1, 0, 0, 1]:
1 - 0.5 / 0.5774 ≈ 0.8659
0 - 0.5 / 0.5774 ≈ -0.8660
For reward_2 = [0, 0, 1, 1]: Same pattern.`,wrap:!1}}),$t=new f({props:{code:"JTIzJTIwU2hhcGUlMjAoQiUyMColMjBHJTJDJTIwMSklMjAlM0QlMjAoOCUyQyUyMDEpJTIwdG8lMjBtYXRjaCUyMHRoZSUyMGxvZ2l0cyUyMHNoYXBlJTBBYWR2YW50YWdlcyUyMCUzRCUyMGFkdmFudGFnZXMudW5zcXVlZXplKDEp",highlighted:`<span class="hljs-comment"># Shape (B * G, 1) = (8, 1) to match the logits shape</span>
advantages = advantages.unsqueeze(<span class="hljs-number">1</span>)`,wrap:!1}}),xt=new f({props:{code:"QWR2YW50YWdlcyUzQSUyMHRlbnNvciglNUIlNUIlMjAwLjg2NTklNUQlMkMlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlNUItMC44NjYwJTVEJTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTVCLTAuODY2MCU1RCUyQyUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCU1QiUyMDAuODY1OSU1RCUyQyUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCU1Qi0wLjg2NjAlNUQlMkMlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlNUItMC44NjYwJTVEJTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTVCJTIwMC44NjU5JTVEJTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTVCJTIwMC44NjU5JTVEJTVEKQ==",highlighted:`Advantages: tensor([[ 0.8659],
[-0.8660],
[-0.8660],
[ 0.8659],
[-0.8660],
[-0.8660],
[ 0.8659],
[ 0.8659]])`,wrap:!1}}),Ut=new u({props:{title:"3. Updating the Policy",local:"3-updating-the-policy",headingTag:"h3"}}),_t=new f({props:{code:"JTIzJTIwQ29tcHV0ZSUyMHByb2JhYmlsaXR5JTIwcmF0aW8lMjBiZXR3ZWVuJTIwbmV3JTIwYW5kJTIwb2xkJTIwcG9saWNpZXMlMEFyYXRpbyUyMCUzRCUyMHRvcmNoLmV4cCglMEElMjAlMjAlMjAlMjBuZXdfcGVyX3Rva2VuX2xvZ3BzJTIwLSUyMHBlcl90b2tlbl9sb2dwcyUwQSklMjAlMjAlMjMlMjBTaGFwZSUzQSUyMChCKkclMkMlMjBzZXFfbGVuKSUyMHNlcV9sZW4lMjBpcyUyMHRoZSUyMGxlbmd0aCUyMG9mJTIwdGhlJTIwb3V0cHV0JTIwaS5lLiUyMHRoZSUyMG51bSUyMG9mJTIwZ2VuZXJhdGVkJTIwdG9rZW5zJTIwc28lMjBoZXJlJTIwZm9yJTIwc2ltcGxpY2l0eSUyMGxldCdzJTIwYXNzdW1lJTIwaXQlMjBpcyUyMDElMjAlMjMlMjAoOCUyQyUyMDEp",highlighted:`<span class="hljs-comment"># Compute probability ratio between new and old policies</span>
ratio = torch.exp(
new_per_token_logps - per_token_logps
) <span class="hljs-comment"># Shape: (B*G, seq_len) seq_len is the length of the output i.e. the num of generated tokens so here for simplicity let&#x27;s assume it is 1 # (8, 1)</span>`,wrap:!1}}),At=new f({props:{code:"JTIzJTIwQ2xpcHBpbmclMjBGdW5jdGlvbiUwQWVwcyUyMCUzRCUyMHNlbGYuY2xpcHJhbmdlJTIwJTIwJTIzJTIwZS5nLiUyMDAuMiUwQXBnX2xvc3NlczElMjAlM0QlMjAtYWR2YW50YWdlcyUyMColMjByYXRpbyUyMCUyMCUyMyUyMFNoYXBlJTNBJTIwKEIqRyUyQyUyMHNlcV9sZW4pJTIwJTIwJTIzKDglMkMlMjAxKSUwQXBnX2xvc3NlczIlMjAlM0QlMjAtYWR2YW50YWdlcyUyMColMjB0b3JjaC5jbGFtcCglMEElMjAlMjAlMjAlMjByYXRpbyUyQyUyMDEuMCUyMC0lMjBlcHMlMkMlMjAxLjAlMjAlMkIlMjBlcHMlMEEpJTIwJTIwJTIzJTIwU2hhcGUlM0ElMjAoQipHJTJDJTIwc2VxX2xlbiklMjAlMjMoOCUyQyUyMDEpJTBBcGdfbG9zc19tYXglMjAlM0QlMjB0b3JjaC5tYXgocGdfbG9zc2VzMSUyQyUyMHBnX2xvc3NlczIpJTIwJTIwJTIzJTIwU2hhcGUlM0ElMjAoQipHJTJDJTIwc2VxX2xlbiklMjAlMjMoOCUyQyUyMDEpJTBBJTBBJTBBJTIzJTIwTm93JTIwQ29tYmluZSUyMHdpdGglMjBLTCUyMHBlbmFsdHklMjAlMjMlMjBTaGFwZSUzQSUyMChCKkclMkMlMjBzZXFfbGVuKSUyMCUyMyg4JTJDJTIwMSklMEFwZXJfdG9rZW5fbG9zcyUyMCUzRCUyMHBnX2xvc3NfbWF4JTIwJTJCJTIwc2VsZi5iZXRhJTIwKiUyMHBlcl90b2tlbl9rbA==",highlighted:`<span class="hljs-comment"># Clipping Function</span>
eps = self.cliprange <span class="hljs-comment"># e.g. 0.2</span>
pg_losses1 = -advantages * ratio <span class="hljs-comment"># Shape: (B*G, seq_len) #(8, 1)</span>
pg_losses2 = -advantages * torch.clamp(
ratio, <span class="hljs-number">1.0</span> - eps, <span class="hljs-number">1.0</span> + eps
) <span class="hljs-comment"># Shape: (B*G, seq_len) #(8, 1)</span>
pg_loss_max = torch.<span class="hljs-built_in">max</span>(pg_losses1, pg_losses2) <span class="hljs-comment"># Shape: (B*G, seq_len) #(8, 1)</span>
<span class="hljs-comment"># Now Combine with KL penalty # Shape: (B*G, seq_len) #(8, 1)</span>
per_token_loss = pg_loss_max + self.beta * per_token_kl`,wrap:!1}}),zt=new f({props:{code:"JTIzJTIwU2hhcGUlM0ElMjAoQipHJTJDJTIwc2VxX2xlbiklMjAlMjMoOCUyQyUyMDEpJTBBcGVyX3Rva2VuX2tsJTIwJTNEJTIwRi5rbF9kaXYoJTBBJTIwJTIwJTIwJTIwRi5sb2dfc29mdG1heChuZXdfcGVyX3Rva2VuX2xvZ3BzJTJDJTIwZGltJTNELTEpJTJDJTBBJTIwJTIwJTIwJTIwRi5zb2Z0bWF4KHBlcl90b2tlbl9sb2dwcyUyQyUyMGRpbSUzRC0xKSUyQyUwQSUyMCUyMCUyMCUyMHJlZHVjdGlvbiUzRCUyMm5vbmUlMjIlMkMlMEEpLnN1bShkaW0lM0QtMSUyQyUyMGtlZXBkaW0lM0RUcnVlKQ==",highlighted:`<span class="hljs-comment"># Shape: (B*G, seq_len) #(8, 1)</span>
per_token_kl = F.kl_div(
F.log_softmax(new_per_token_logps, dim=-<span class="hljs-number">1</span>),
F.softmax(per_token_logps, dim=-<span class="hljs-number">1</span>),
reduction=<span class="hljs-string">&quot;none&quot;</span>,
).<span class="hljs-built_in">sum</span>(dim=-<span class="hljs-number">1</span>, keepdim=<span class="hljs-literal">True</span>)`,wrap:!1}}),Bt=new u({props:{title:"Summary and Next Steps",local:"summary-and-next-steps",headingTag:"h2"}}),Et=new u({props:{title:"References",local:"references",headingTag:"h2"}}),Lt=new Rp({props:{source:"https://github.com/huggingface/course/blob/main/chapters/en/chapter12/3a.mdx"}}),{c(){M=p("meta"),j=n(),w=p("p"),Vt=n(),r(_.$$.fragment),ne=n(),r(T.$$.fragment),le=n(),I=p("p"),I.textContent=Wn,pe=n(),A=p("p"),A.textContent=Yn,ie=n(),k=p("p"),k.textContent=qn,me=n(),z=p("p"),z.textContent=On,re=n(),G=p("p"),G.innerHTML=Kn,oe=n(),B=p("p"),B.textContent=sl,ce=n(),r(R.$$.fragment),he=n(),Z=p("p"),Z.textContent=tl,ge=n(),Q=p("p"),Q.textContent=el,ye=n(),r(S.$$.fragment),ue=n(),H=p("p"),H.textContent=al,de=n(),E=p("p"),E.innerHTML=nl,Me=n(),r(N.$$.fragment),fe=n(),L=p("p"),L.textContent=ll,we=n(),X=p("ul"),X.innerHTML=pl,ve=n(),P=p("p"),P.textContent=il,be=n(),r(V.$$.fragment),Te=n(),F=p("p"),F.textContent=ml,$e=n(),r(D.$$.fragment),Je=n(),W=p("p"),W.textContent=rl,xe=n(),Y=p("p"),Y.innerHTML=ol,Ce=n(),r(q.$$.fragment),Ue=n(),O=p("p"),Rn=C(`The key insight of GRPO is that we don’t need absolute measures of quality - we can compare outputs within the same group. This is done using standardization:
`),je=new Xt(!1),_e=n(),r(K.$$.fragment),Ie=n(),ss=p("p"),ss.textContent=cl,Ae=n(),ts=p("ul"),ts.innerHTML=hl,ke=n(),r(es.$$.fragment),ze=n(),as=p("p"),as.textContent=gl,Ge=n(),ns=p("p"),ns.textContent=yl,Be=n(),ls=p("p"),ls.textContent=ul,Re=n(),ps=p("p"),ps.textContent=dl,Ze=n(),r(is.$$.fragment),Qe=n(),ms=p("p"),ms.textContent=Ml,Se=n(),rs=p("p"),Zn=C(`The target function for policy update is:
`),He=new Xt(!1),Ee=n(),os=p("p"),os.textContent=fl,Ne=n(),r(cs.$$.fragment),Le=n(),hs=p("p"),hs.textContent=wl,Xe=n(),r(gs.$$.fragment),Pe=n(),ys=p("p"),ys.textContent=vl,Ve=n(),us=p("p"),us.innerHTML=bl,Fe=n(),ds=p("p"),ds.textContent=Tl,De=n(),r(Ms.$$.fragment),We=n(),fs=p("ul"),fs.innerHTML=$l,Ye=n(),ws=p("p"),ws.textContent=Jl,qe=n(),r(vs.$$.fragment),Oe=n(),bs=p("p"),bs.textContent=xl,Ke=n(),Ts=p("p"),Ts.innerHTML=Cl,sa=n(),$s=p("p"),$s.textContent=Ul,ta=n(),r(Js.$$.fragment),ea=n(),xs=p("p"),xs.textContent=jl,aa=n(),Cs=p("ul"),Cs.innerHTML=_l,na=n(),r(Us.$$.fragment),la=n(),js=p("ul"),js.innerHTML=Il,pa=n(),_s=p("p"),_s.textContent=Al,ia=n(),r(Is.$$.fragment),ma=n(),As=p("p"),As.textContent=kl,ra=n(),ks=p("p"),ks.innerHTML=zl,oa=n(),zs=p("p"),zs.innerHTML=Gl,ca=n(),r(Gs.$$.fragment),ha=n(),Bs=p("ul"),Bs.innerHTML=Bl,ga=n(),r(Rs.$$.fragment),ya=n(),Zs=p("p"),Zs.textContent=Rl,ua=n(),$=p("p"),Qn=C("Recall that KL distance is defined as follows:"),da=new Xt(!1),Ma=C(`
In RLHF, the two distributions of interest are often the distribution of the new model version, P(x), and a distribution of the reference policy, Q(x).`),fa=n(),r(Qs.$$.fragment),wa=n(),Ss=p("p"),Ss.textContent=Zl,va=n(),Hs=p("ul"),Hs.innerHTML=Ql,ba=n(),Es=p("p"),Es.textContent=Sl,Ta=n(),r(Ns.$$.fragment),$a=n(),Ls=p("p"),Ls.textContent=Hl,Ja=n(),r(Xs.$$.fragment),xa=n(),Ft=p("p"),Ca=new Xt(!1),Ua=n(),r(Ps.$$.fragment),ja=n(),Vs=p("p"),Vs.textContent=El,_a=n(),Fs=p("p"),Sn=C(`Generate $(G = 8)$ responses, $4$ of which are correct answer ($14, \\text{reward=} 1$) and $4$ incorrect $\\text{(reward= 0)}$, Therefore:
`),Ia=new Xt(!1),Aa=n(),r(Ds.$$.fragment),ka=n(),Ws=p("p"),Ws.textContent=Nl,za=n(),b=p("ul"),Dt=p("li"),Hn=C("Group Average: "),Ga=new Xt(!1),En=n(),Wt=p("li"),Wt.textContent=Ll,Nn=n(),Yt=p("li"),Yt.innerHTML=Xl,Ba=n(),r(Ys.$$.fragment),Ra=n(),qs=p("p"),qs.textContent=Pl,Za=n(),J=p("ul"),x=p("li"),Ln=C("Assuming the probability of old policy ($\\pi"),qt=p("em"),qt.textContent=Vl,Xn=C("{old}}$) for a correct output $o_1$ is $0.5$ and the new policy increases it to $0.7$ then:"),Qa=new Xt(!1),Pn=n(),Ot=p("li"),Ot.textContent=Fl,Sa=n(),Os=p("p"),Os.textContent=Dl,Ha=n(),r(Ks.$$.fragment),Ea=n(),st=p("p"),st.textContent=Wl,Na=n(),r(tt.$$.fragment),La=n(),et=p("p"),et.textContent=Yl,Xa=n(),r(at.$$.fragment),Pa=n(),nt=p("p"),nt.textContent=ql,Va=n(),r(lt.$$.fragment),Fa=n(),r(pt.$$.fragment),Da=n(),it=p("p"),it.textContent=Ol,Wa=n(),mt=p("p"),mt.innerHTML=Kl,Ya=n(),rt=p("ul"),rt.innerHTML=sp,qa=n(),ot=p("p"),ot.textContent=tp,Oa=n(),r(ct.$$.fragment),Ka=n(),ht=p("p"),ht.textContent=ep,sn=n(),r(gt.$$.fragment),tn=n(),yt=p("p"),yt.textContent=ap,en=n(),r(ut.$$.fragment),an=n(),dt=p("p"),dt.textContent=np,nn=n(),r(Mt.$$.fragment),ln=n(),ft=p("p"),ft.textContent=lp,pn=n(),r(wt.$$.fragment),mn=n(),vt=p("p"),vt.textContent=pp,rn=n(),r(bt.$$.fragment),on=n(),Tt=p("p"),Tt.innerHTML=ip,cn=n(),r($t.$$.fragment),hn=n(),Jt=p("p"),Jt.textContent=mp,gn=n(),r(xt.$$.fragment),yn=n(),Ct=p("p"),Ct.textContent=rp,un=n(),r(Ut.$$.fragment),dn=n(),jt=p("p"),jt.textContent=op,Mn=n(),r(_t.$$.fragment),fn=n(),It=p("p"),It.innerHTML=cp,wn=n(),r(At.$$.fragment),vn=n(),kt=p("p"),kt.innerHTML=hp,bn=n(),r(zt.$$.fragment),Tn=n(),Gt=p("p"),Gt.innerHTML=gp,$n=n(),r(Bt.$$.fragment),Jn=n(),Rt=p("p"),Rt.textContent=yp,xn=n(),Zt=p("ol"),Zt.innerHTML=up,Cn=n(),Qt=p("p"),Qt.textContent=dp,Un=n(),St=p("p"),St.textContent=Mp,jn=n(),Ht=p("p"),Ht.textContent=fp,_n=n(),r(Et.$$.fragment),In=n(),Nt=p("ol"),Nt.innerHTML=wp,An=n(),r(Lt.$$.fragment),kn=n(),te=p("p"),this.h()},l(s){const 
t=Gp("svelte-u9bgzb",document.head);M=i(t,"META",{name:!0,content:!0}),t.forEach(e),j=l(s),w=i(s,"P",{}),v(w).forEach(e),Vt=l(s),o(_.$$.fragment,s),ne=l(s),o(T.$$.fragment,s),le=l(s),I=i(s,"P",{"data-svelte-h":!0}),m(I)!=="svelte-yv9vu0"&&(I.textContent=Wn),pe=l(s),A=i(s,"P",{"data-svelte-h":!0}),m(A)!=="svelte-1m5i5vf"&&(A.textContent=Yn),ie=l(s),k=i(s,"P",{"data-svelte-h":!0}),m(k)!=="svelte-1z0jlvp"&&(k.textContent=qn),me=l(s),z=i(s,"P",{"data-svelte-h":!0}),m(z)!=="svelte-1rot1dt"&&(z.textContent=On),re=l(s),G=i(s,"P",{"data-svelte-h":!0}),m(G)!=="svelte-grajek"&&(G.innerHTML=Kn),oe=l(s),B=i(s,"P",{"data-svelte-h":!0}),m(B)!=="svelte-2029g"&&(B.textContent=sl),ce=l(s),o(R.$$.fragment,s),he=l(s),Z=i(s,"P",{"data-svelte-h":!0}),m(Z)!=="svelte-19zfcdy"&&(Z.textContent=tl),ge=l(s),Q=i(s,"P",{"data-svelte-h":!0}),m(Q)!=="svelte-7ijv8n"&&(Q.textContent=el),ye=l(s),o(S.$$.fragment,s),ue=l(s),H=i(s,"P",{"data-svelte-h":!0}),m(H)!=="svelte-l99jvf"&&(H.textContent=al),de=l(s),E=i(s,"P",{"data-svelte-h":!0}),m(E)!=="svelte-zcbqh0"&&(E.innerHTML=nl),Me=l(s),o(N.$$.fragment,s),fe=l(s),L=i(s,"P",{"data-svelte-h":!0}),m(L)!=="svelte-6hey6l"&&(L.textContent=ll),we=l(s),X=i(s,"UL",{"data-svelte-h":!0}),m(X)!=="svelte-d2hqi4"&&(X.innerHTML=pl),ve=l(s),P=i(s,"P",{"data-svelte-h":!0}),m(P)!=="svelte-1hkfalc"&&(P.textContent=il),be=l(s),o(V.$$.fragment,s),Te=l(s),F=i(s,"P",{"data-svelte-h":!0}),m(F)!=="svelte-py6z8p"&&(F.textContent=ml),$e=l(s),o(D.$$.fragment,s),Je=l(s),W=i(s,"P",{"data-svelte-h":!0}),m(W)!=="svelte-c98l76"&&(W.textContent=rl),xe=l(s),Y=i(s,"P",{"data-svelte-h":!0}),m(Y)!=="svelte-km5vrd"&&(Y.innerHTML=ol),Ce=l(s),o(q.$$.fragment,s),Ue=l(s),O=i(s,"P",{});var Kt=v(O);Rn=U(Kt,`The key insight of GRPO is that we don’t need absolute measures of quality - we can compare outputs within the same group. This is done using standardization:
`),je=Pt(Kt,!1),Kt.forEach(e),_e=l(s),o(K.$$.fragment,s),Ie=l(s),ss=i(s,"P",{"data-svelte-h":!0}),m(ss)!=="svelte-24wdaf"&&(ss.textContent=cl),Ae=l(s),ts=i(s,"UL",{"data-svelte-h":!0}),m(ts)!=="svelte-sl5m0h"&&(ts.innerHTML=hl),ke=l(s),o(es.$$.fragment,s),ze=l(s),as=i(s,"P",{"data-svelte-h":!0}),m(as)!=="svelte-posn41"&&(as.textContent=gl),Ge=l(s),ns=i(s,"P",{"data-svelte-h":!0}),m(ns)!=="svelte-zcjrdj"&&(ns.textContent=yl),Be=l(s),ls=i(s,"P",{"data-svelte-h":!0}),m(ls)!=="svelte-1vsxktr"&&(ls.textContent=ul),Re=l(s),ps=i(s,"P",{"data-svelte-h":!0}),m(ps)!=="svelte-19cns2n"&&(ps.textContent=dl),Ze=l(s),o(is.$$.fragment,s),Qe=l(s),ms=i(s,"P",{"data-svelte-h":!0}),m(ms)!=="svelte-9fqajt"&&(ms.textContent=Ml),Se=l(s),rs=i(s,"P",{});var Vn=v(rs);Zn=U(Vn,`The target function for policy update is:
`),He=Pt(Vn,!1),Vn.forEach(e),Ee=l(s),os=i(s,"P",{"data-svelte-h":!0}),m(os)!=="svelte-1b0nggz"&&(os.textContent=fl),Ne=l(s),o(cs.$$.fragment,s),Le=l(s),hs=i(s,"P",{"data-svelte-h":!0}),m(hs)!=="svelte-17gvfcn"&&(hs.textContent=wl),Xe=l(s),o(gs.$$.fragment,s),Pe=l(s),ys=i(s,"P",{"data-svelte-h":!0}),m(ys)!=="svelte-m3dudm"&&(ys.textContent=vl),Ve=l(s),us=i(s,"P",{"data-svelte-h":!0}),m(us)!=="svelte-1089qz0"&&(us.innerHTML=bl),Fe=l(s),ds=i(s,"P",{"data-svelte-h":!0}),m(ds)!=="svelte-s1ow2h"&&(ds.textContent=Tl),De=l(s),o(Ms.$$.fragment,s),We=l(s),fs=i(s,"UL",{"data-svelte-h":!0}),m(fs)!=="svelte-ihmtt5"&&(fs.innerHTML=$l),Ye=l(s),ws=i(s,"P",{"data-svelte-h":!0}),m(ws)!=="svelte-gvqvif"&&(ws.textContent=Jl),qe=l(s),o(vs.$$.fragment,s),Oe=l(s),bs=i(s,"P",{"data-svelte-h":!0}),m(bs)!=="svelte-1wq6f94"&&(bs.textContent=xl),Ke=l(s),Ts=i(s,"P",{"data-svelte-h":!0}),m(Ts)!=="svelte-csgxyq"&&(Ts.innerHTML=Cl),sa=l(s),$s=i(s,"P",{"data-svelte-h":!0}),m($s)!=="svelte-sm8a7o"&&($s.textContent=Ul),ta=l(s),o(Js.$$.fragment,s),ea=l(s),xs=i(s,"P",{"data-svelte-h":!0}),m(xs)!=="svelte-ici6cu"&&(xs.textContent=jl),aa=l(s),Cs=i(s,"UL",{"data-svelte-h":!0}),m(Cs)!=="svelte-183y4uy"&&(Cs.innerHTML=_l),na=l(s),o(Us.$$.fragment,s),la=l(s),js=i(s,"UL",{"data-svelte-h":!0}),m(js)!=="svelte-11d1zl1"&&(js.innerHTML=Il),pa=l(s),_s=i(s,"P",{"data-svelte-h":!0}),m(_s)!=="svelte-1542h54"&&(_s.textContent=Al),ia=l(s),o(Is.$$.fragment,s),ma=l(s),As=i(s,"P",{"data-svelte-h":!0}),m(As)!=="svelte-126d324"&&(As.textContent=kl),ra=l(s),ks=i(s,"P",{"data-svelte-h":!0}),m(ks)!=="svelte-zkgyxi"&&(ks.innerHTML=zl),oa=l(s),zs=i(s,"P",{"data-svelte-h":!0}),m(zs)!=="svelte-i6pndx"&&(zs.innerHTML=Gl),ca=l(s),o(Gs.$$.fragment,s),ha=l(s),Bs=i(s,"UL",{"data-svelte-h":!0}),m(Bs)!=="svelte-182lgs4"&&(Bs.innerHTML=Bl),ga=l(s),o(Rs.$$.fragment,s),ya=l(s),Zs=i(s,"P",{"data-svelte-h":!0}),m(Zs)!=="svelte-5tevnd"&&(Zs.textContent=Rl),ua=l(s),$=i(s,"P",{});var Gn=v($);Qn=U(Gn,"Recall that KL distance is defined as 
follows:"),da=Pt(Gn,!1),Ma=U(Gn,`
In RLHF, the two distributions of interest are often the distribution of the new model version, P(x), and a distribution of the reference policy, Q(x).`),Gn.forEach(e),fa=l(s),o(Qs.$$.fragment,s),wa=l(s),Ss=i(s,"P",{"data-svelte-h":!0}),m(Ss)!=="svelte-ugynt7"&&(Ss.textContent=Zl),va=l(s),Hs=i(s,"UL",{"data-svelte-h":!0}),m(Hs)!=="svelte-959xmu"&&(Hs.innerHTML=Ql),ba=l(s),Es=i(s,"P",{"data-svelte-h":!0}),m(Es)!=="svelte-172tgrr"&&(Es.textContent=Sl),Ta=l(s),o(Ns.$$.fragment,s),$a=l(s),Ls=i(s,"P",{"data-svelte-h":!0}),m(Ls)!=="svelte-1kecgai"&&(Ls.textContent=Hl),Ja=l(s),o(Xs.$$.fragment,s),xa=l(s),Ft=i(s,"P",{});var vp=v(Ft);Ca=Pt(vp,!1),vp.forEach(e),Ua=l(s),o(Ps.$$.fragment,s),ja=l(s),Vs=i(s,"P",{"data-svelte-h":!0}),m(Vs)!=="svelte-19oke8c"&&(Vs.textContent=El),_a=l(s),Fs=i(s,"P",{});var Fn=v(Fs);Sn=U(Fn,`Generate $(G = 8)$ responses, $4$ of which are correct answer ($14, \\text{reward=} 1$) and $4$ incorrect $\\text{(reward= 0)}$, Therefore:
`),Ia=Pt(Fn,!1),Fn.forEach(e),Aa=l(s),o(Ds.$$.fragment,s),ka=l(s),Ws=i(s,"P",{"data-svelte-h":!0}),m(Ws)!=="svelte-7jmzoi"&&(Ws.textContent=Nl),za=l(s),b=i(s,"UL",{});var se=v(b);Dt=i(se,"LI",{});var Dn=v(Dt);Hn=U(Dn,"Group Average: "),Ga=Pt(Dn,!1),Dn.forEach(e),En=l(se),Wt=i(se,"LI",{"data-svelte-h":!0}),m(Wt)!=="svelte-11g5ftk"&&(Wt.textContent=Ll),Nn=l(se),Yt=i(se,"LI",{"data-svelte-h":!0}),m(Yt)!=="svelte-d19f8i"&&(Yt.innerHTML=Xl),se.forEach(e),Ba=l(s),o(Ys.$$.fragment,s),Ra=l(s),qs=i(s,"P",{"data-svelte-h":!0}),m(qs)!=="svelte-p2yfd2"&&(qs.textContent=Pl),Za=l(s),J=i(s,"UL",{});var Bn=v(J);x=i(Bn,"LI",{});var ee=v(x);Ln=U(ee,"Assuming the probability of old policy ($\\pi"),qt=i(ee,"EM",{"data-svelte-h":!0}),m(qt)!=="svelte-19ssaec"&&(qt.textContent=Vl),Xn=U(ee,"{old}}$) for a correct output $o_1$ is $0.5$ and the new policy increases it to $0.7$ then:"),Qa=Pt(ee,!1),ee.forEach(e),Pn=l(Bn),Ot=i(Bn,"LI",{"data-svelte-h":!0}),m(Ot)!=="svelte-1fcug7i"&&(Ot.textContent=Fl),Bn.forEach(e),Sa=l(s),Os=i(s,"P",{"data-svelte-h":!0}),m(Os)!=="svelte-axmf6u"&&(Os.textContent=Dl),Ha=l(s),o(Ks.$$.fragment,s),Ea=l(s),st=i(s,"P",{"data-svelte-h":!0}),m(st)!=="svelte-1onl4uk"&&(st.textContent=Wl),Na=l(s),o(tt.$$.fragment,s),La=l(s),et=i(s,"P",{"data-svelte-h":!0}),m(et)!=="svelte-qcmhu8"&&(et.textContent=Yl),Xa=l(s),o(at.$$.fragment,s),Pa=l(s),nt=i(s,"P",{"data-svelte-h":!0}),m(nt)!=="svelte-5ogvdk"&&(nt.textContent=ql),Va=l(s),o(lt.$$.fragment,s),Fa=l(s),o(pt.$$.fragment,s),Da=l(s),it=i(s,"P",{"data-svelte-h":!0}),m(it)!=="svelte-sdk010"&&(it.textContent=Ol),Wa=l(s),mt=i(s,"P",{"data-svelte-h":!0}),m(mt)!=="svelte-1axuibu"&&(mt.innerHTML=Kl),Ya=l(s),rt=i(s,"UL",{"data-svelte-h":!0}),m(rt)!=="svelte-1eslwf1"&&(rt.innerHTML=sp),qa=l(s),ot=i(s,"P",{"data-svelte-h":!0}),m(ot)!=="svelte-1ylhe0"&&(ot.textContent=tp),Oa=l(s),o(ct.$$.fragment,s),Ka=l(s),ht=i(s,"P",{"data-svelte-h":!0}),m(ht)!=="svelte-1dv10qi"&&(ht.textContent=ep),sn=l(s),o(gt.$$.fragment,s),tn=l(s),yt=i(s,"P",{"data-
svelte-h":!0}),m(yt)!=="svelte-1k2jw1h"&&(yt.textContent=ap),en=l(s),o(ut.$$.fragment,s),an=l(s),dt=i(s,"P",{"data-svelte-h":!0}),m(dt)!=="svelte-v9ksyn"&&(dt.textContent=np),nn=l(s),o(Mt.$$.fragment,s),ln=l(s),ft=i(s,"P",{"data-svelte-h":!0}),m(ft)!=="svelte-1k2jw1h"&&(ft.textContent=lp),pn=l(s),o(wt.$$.fragment,s),mn=l(s),vt=i(s,"P",{"data-svelte-h":!0}),m(vt)!=="svelte-1z0xj5h"&&(vt.textContent=pp),rn=l(s),o(bt.$$.fragment,s),on=l(s),Tt=i(s,"P",{"data-svelte-h":!0}),m(Tt)!=="svelte-1usqh25"&&(Tt.innerHTML=ip),cn=l(s),o($t.$$.fragment,s),hn=l(s),Jt=i(s,"P",{"data-svelte-h":!0}),m(Jt)!=="svelte-mxi4u2"&&(Jt.textContent=mp),gn=l(s),o(xt.$$.fragment,s),yn=l(s),Ct=i(s,"P",{"data-svelte-h":!0}),m(Ct)!=="svelte-16k78wr"&&(Ct.textContent=rp),un=l(s),o(Ut.$$.fragment,s),dn=l(s),jt=i(s,"P",{"data-svelte-h":!0}),m(jt)!=="svelte-t7gd3h"&&(jt.textContent=op),Mn=l(s),o(_t.$$.fragment,s),fn=l(s),It=i(s,"P",{"data-svelte-h":!0}),m(It)!=="svelte-17qfvx2"&&(It.innerHTML=cp),wn=l(s),o(At.$$.fragment,s),vn=l(s),kt=i(s,"P",{"data-svelte-h":!0}),m(kt)!=="svelte-exudkk"&&(kt.innerHTML=hp),bn=l(s),o(zt.$$.fragment,s),Tn=l(s),Gt=i(s,"P",{"data-svelte-h":!0}),m(Gt)!=="svelte-17fht8a"&&(Gt.innerHTML=gp),$n=l(s),o(Bt.$$.fragment,s),Jn=l(s),Rt=i(s,"P",{"data-svelte-h":!0}),m(Rt)!=="svelte-efhpp6"&&(Rt.textContent=yp),xn=l(s),Zt=i(s,"OL",{"data-svelte-h":!0}),m(Zt)!=="svelte-hly9hr"&&(Zt.innerHTML=up),Cn=l(s),Qt=i(s,"P",{"data-svelte-h":!0}),m(Qt)!=="svelte-33u601"&&(Qt.textContent=dp),Un=l(s),St=i(s,"P",{"data-svelte-h":!0}),m(St)!=="svelte-crj0za"&&(St.textContent=Mp),jn=l(s),Ht=i(s,"P",{"data-svelte-h":!0}),m(Ht)!=="svelte-1ywr9jc"&&(Ht.textContent=fp),_n=l(s),o(Et.$$.fragment,s),In=l(s),Nt=i(s,"OL",{"data-svelte-h":!0}),m(Nt)!=="svelte-18yys75"&&(Nt.innerHTML=wp),An=l(s),o(Lt.$$.fragment,s),kn=l(s),te=i(s,"P",{}),v(te).forEach(e),this.h()},h(){bp(M,"name","hf:doc:metadata"),bp(M,"content",Sp),je.a=null,He.a=null,da.a=Ma,Ca.a=null,Ia.a=null,Ga.a=null,Qa.a=null},m(s,t){d(document.head,M),a(
s,j,t),a(s,w,t),a(s,Vt,t),c(_,s,t),a(s,ne,t),c(T,s,t),a(s,le,t),a(s,I,t),a(s,pe,t),a(s,A,t),a(s,ie,t),a(s,k,t),a(s,me,t),a(s,z,t),a(s,re,t),a(s,G,t),a(s,oe,t),a(s,B,t),a(s,ce,t),c(R,s,t),a(s,he,t),a(s,Z,t),a(s,ge,t),a(s,Q,t),a(s,ye,t),c(S,s,t),a(s,ue,t),a(s,H,t),a(s,de,t),a(s,E,t),a(s,Me,t),c(N,s,t),a(s,fe,t),a(s,L,t),a(s,we,t),a(s,X,t),a(s,ve,t),a(s,P,t),a(s,be,t),c(V,s,t),a(s,Te,t),a(s,F,t),a(s,$e,t),c(D,s,t),a(s,Je,t),a(s,W,t),a(s,xe,t),a(s,Y,t),a(s,Ce,t),c(q,s,t),a(s,Ue,t),a(s,O,t),d(O,Rn),je.m(Tp,O),a(s,_e,t),c(K,s,t),a(s,Ie,t),a(s,ss,t),a(s,Ae,t),a(s,ts,t),a(s,ke,t),c(es,s,t),a(s,ze,t),a(s,as,t),a(s,Ge,t),a(s,ns,t),a(s,Be,t),a(s,ls,t),a(s,Re,t),a(s,ps,t),a(s,Ze,t),c(is,s,t),a(s,Qe,t),a(s,ms,t),a(s,Se,t),a(s,rs,t),d(rs,Zn),He.m($p,rs),a(s,Ee,t),a(s,os,t),a(s,Ne,t),c(cs,s,t),a(s,Le,t),a(s,hs,t),a(s,Xe,t),c(gs,s,t),a(s,Pe,t),a(s,ys,t),a(s,Ve,t),a(s,us,t),a(s,Fe,t),a(s,ds,t),a(s,De,t),c(Ms,s,t),a(s,We,t),a(s,fs,t),a(s,Ye,t),a(s,ws,t),a(s,qe,t),c(vs,s,t),a(s,Oe,t),a(s,bs,t),a(s,Ke,t),a(s,Ts,t),a(s,sa,t),a(s,$s,t),a(s,ta,t),c(Js,s,t),a(s,ea,t),a(s,xs,t),a(s,aa,t),a(s,Cs,t),a(s,na,t),c(Us,s,t),a(s,la,t),a(s,js,t),a(s,pa,t),a(s,_s,t),a(s,ia,t),c(Is,s,t),a(s,ma,t),a(s,As,t),a(s,ra,t),a(s,ks,t),a(s,oa,t),a(s,zs,t),a(s,ca,t),c(Gs,s,t),a(s,ha,t),a(s,Bs,t),a(s,ga,t),c(Rs,s,t),a(s,ya,t),a(s,Zs,t),a(s,ua,t),a(s,$,t),d($,Qn),da.m(Jp,$),d($,Ma),a(s,fa,t),c(Qs,s,t),a(s,wa,t),a(s,Ss,t),a(s,va,t),a(s,Hs,t),a(s,ba,t),a(s,Es,t),a(s,Ta,t),c(Ns,s,t),a(s,$a,t),a(s,Ls,t),a(s,Ja,t),c(Xs,s,t),a(s,xa,t),a(s,Ft,t),Ca.m(xp,Ft),a(s,Ua,t),c(Ps,s,t),a(s,ja,t),a(s,Vs,t),a(s,_a,t),a(s,Fs,t),d(Fs,Sn),Ia.m(Cp,Fs),a(s,Aa,t),c(Ds,s,t),a(s,ka,t),a(s,Ws,t),a(s,za,t),a(s,b,t),d(b,Dt),d(Dt,Hn),Ga.m(Up,Dt),d(b,En),d(b,Wt),d(b,Nn),d(b,Yt),a(s,Ba,t),c(Ys,s,t),a(s,Ra,t),a(s,qs,t),a(s,Za,t),a(s,J,t),d(J,x),d(x,Ln),d(x,qt),d(x,Xn),Qa.m(jp,x),d(J,Pn),d(J,Ot),a(s,Sa,t),a(s,Os,t),a(s,Ha,t),c(Ks,s,t),a(s,Ea,t),a(s,st,t),a(s,Na,t),c(tt,s,t),a(s,La,t),a(s,et,t),a(s,Xa,t),c(at,s,t),a(s,Pa,t),a(s,nt,t),a(s,Va,t),c(lt
,s,t),a(s,Fa,t),c(pt,s,t),a(s,Da,t),a(s,it,t),a(s,Wa,t),a(s,mt,t),a(s,Ya,t),a(s,rt,t),a(s,qa,t),a(s,ot,t),a(s,Oa,t),c(ct,s,t),a(s,Ka,t),a(s,ht,t),a(s,sn,t),c(gt,s,t),a(s,tn,t),a(s,yt,t),a(s,en,t),c(ut,s,t),a(s,an,t),a(s,dt,t),a(s,nn,t),c(Mt,s,t),a(s,ln,t),a(s,ft,t),a(s,pn,t),c(wt,s,t),a(s,mn,t),a(s,vt,t),a(s,rn,t),c(bt,s,t),a(s,on,t),a(s,Tt,t),a(s,cn,t),c($t,s,t),a(s,hn,t),a(s,Jt,t),a(s,gn,t),c(xt,s,t),a(s,yn,t),a(s,Ct,t),a(s,un,t),c(Ut,s,t),a(s,dn,t),a(s,jt,t),a(s,Mn,t),c(_t,s,t),a(s,fn,t),a(s,It,t),a(s,wn,t),c(At,s,t),a(s,vn,t),a(s,kt,t),a(s,bn,t),c(zt,s,t),a(s,Tn,t),a(s,Gt,t),a(s,$n,t),c(Bt,s,t),a(s,Jn,t),a(s,Rt,t),a(s,xn,t),a(s,Zt,t),a(s,Cn,t),a(s,Qt,t),a(s,Un,t),a(s,St,t),a(s,jn,t),a(s,Ht,t),a(s,_n,t),c(Et,s,t),a(s,In,t),a(s,Nt,t),a(s,An,t),c(Lt,s,t),a(s,kn,t),a(s,te,t),zn=!0},p(s,[t]){const Kt={};t&2&&(Kt.$$scope={dirty:t,ctx:s}),T.$set(Kt)},i(s){zn||(h(_.$$.fragment,s),h(T.$$.fragment,s),h(R.$$.fragment,s),h(S.$$.fragment,s),h(N.$$.fragment,s),h(V.$$.fragment,s),h(D.$$.fragment,s),h(q.$$.fragment,s),h(K.$$.fragment,s),h(es.$$.fragment,s),h(is.$$.fragment,s),h(cs.$$.fragment,s),h(gs.$$.fragment,s),h(Ms.$$.fragment,s),h(vs.$$.fragment,s),h(Js.$$.fragment,s),h(Us.$$.fragment,s),h(Is.$$.fragment,s),h(Gs.$$.fragment,s),h(Rs.$$.fragment,s),h(Qs.$$.fragment,s),h(Ns.$$.fragment,s),h(Xs.$$.fragment,s),h(Ps.$$.fragment,s),h(Ds.$$.fragment,s),h(Ys.$$.fragment,s),h(Ks.$$.fragment,s),h(tt.$$.fragment,s),h(at.$$.fragment,s),h(lt.$$.fragment,s),h(pt.$$.fragment,s),h(ct.$$.fragment,s),h(gt.$$.fragment,s),h(ut.$$.fragment,s),h(Mt.$$.fragment,s),h(wt.$$.fragment,s),h(bt.$$.fragment,s),h($t.$$.fragment,s),h(xt.$$.fragment,s),h(Ut.$$.fragment,s),h(_t.$$.fragment,s),h(At.$$.fragment,s),h(zt.$$.fragment,s),h(Bt.$$.fragment,s),h(Et.$$.fragment,s),h(Lt.$$.fragment,s),zn=!0)},o(s){g(_.$$.fragment,s),g(T.$$.fragment,s),g(R.$$.fragment,s),g(S.$$.fragment,s),g(N.$$.fragment,s),g(V.$$.fragment,s),g(D.$$.fragment,s),g(q.$$.fragment,s),g(K.$$.fragment,s),g(es.$$.fragment,s),g(is.$$.fragmen
t,s),g(cs.$$.fragment,s),g(gs.$$.fragment,s),g(Ms.$$.fragment,s),g(vs.$$.fragment,s),g(Js.$$.fragment,s),g(Us.$$.fragment,s),g(Is.$$.fragment,s),g(Gs.$$.fragment,s),g(Rs.$$.fragment,s),g(Qs.$$.fragment,s),g(Ns.$$.fragment,s),g(Xs.$$.fragment,s),g(Ps.$$.fragment,s),g(Ds.$$.fragment,s),g(Ys.$$.fragment,s),g(Ks.$$.fragment,s),g(tt.$$.fragment,s),g(at.$$.fragment,s),g(lt.$$.fragment,s),g(pt.$$.fragment,s),g(ct.$$.fragment,s),g(gt.$$.fragment,s),g(ut.$$.fragment,s),g(Mt.$$.fragment,s),g(wt.$$.fragment,s),g(bt.$$.fragment,s),g($t.$$.fragment,s),g(xt.$$.fragment,s),g(Ut.$$.fragment,s),g(_t.$$.fragment,s),g(At.$$.fragment,s),g(zt.$$.fragment,s),g(Bt.$$.fragment,s),g(Et.$$.fragment,s),g(Lt.$$.fragment,s),zn=!1},d(s){s&&(e(j),e(w),e(Vt),e(ne),e(le),e(I),e(pe),e(A),e(ie),e(k),e(me),e(z),e(re),e(G),e(oe),e(B),e(ce),e(he),e(Z),e(ge),e(Q),e(ye),e(ue),e(H),e(de),e(E),e(Me),e(fe),e(L),e(we),e(X),e(ve),e(P),e(be),e(Te),e(F),e($e),e(Je),e(W),e(xe),e(Y),e(Ce),e(Ue),e(O),e(_e),e(Ie),e(ss),e(Ae),e(ts),e(ke),e(ze),e(as),e(Ge),e(ns),e(Be),e(ls),e(Re),e(ps),e(Ze),e(Qe),e(ms),e(Se),e(rs),e(Ee),e(os),e(Ne),e(Le),e(hs),e(Xe),e(Pe),e(ys),e(Ve),e(us),e(Fe),e(ds),e(De),e(We),e(fs),e(Ye),e(ws),e(qe),e(Oe),e(bs),e(Ke),e(Ts),e(sa),e($s),e(ta),e(ea),e(xs),e(aa),e(Cs),e(na),e(la),e(js),e(pa),e(_s),e(ia),e(ma),e(As),e(ra),e(ks),e(oa),e(zs),e(ca),e(ha),e(Bs),e(ga),e(ya),e(Zs),e(ua),e($),e(fa),e(wa),e(Ss),e(va),e(Hs),e(ba),e(Es),e(Ta),e($a),e(Ls),e(Ja),e(xa),e(Ft),e(Ua),e(ja),e(Vs),e(_a),e(Fs),e(Aa),e(ka),e(Ws),e(za),e(b),e(Ba),e(Ra),e(qs),e(Za),e(J),e(Sa),e(Os),e(Ha),e(Ea),e(st),e(Na),e(La),e(et),e(Xa),e(Pa),e(nt),e(Va),e(Fa),e(Da),e(it),e(Wa),e(mt),e(Ya),e(rt),e(qa),e(ot),e(Oa),e(Ka),e(ht),e(sn),e(tn),e(yt),e(en),e(an),e(dt),e(nn),e(ln),e(ft),e(pn),e(mn),e(vt),e(rn),e(on),e(Tt),e(cn),e(hn),e(Jt),e(gn),e(yn),e(Ct),e(un),e(dn),e(jt),e(Mn),e(fn),e(It),e(wn),e(vn),e(kt),e(bn),e(Tn),e(Gt),e($n),e(Jn),e(Rt),e(xn),e(Zt),e(Cn),e(Qt),e(Un),e(St),e(jn),e(Ht),e(_n),e(In),e(Nt),e(An),e(kn),e(te)),e(M),y(_,s),y(T,
s),y(R,s),y(S,s),y(N,s),y(V,s),y(D,s),y(q,s),y(K,s),y(es,s),y(is,s),y(cs,s),y(gs,s),y(Ms,s),y(vs,s),y(Js,s),y(Us,s),y(Is,s),y(Gs,s),y(Rs,s),y(Qs,s),y(Ns,s),y(Xs,s),y(Ps,s),y(Ds,s),y(Ys,s),y(Ks,s),y(tt,s),y(at,s),y(lt,s),y(pt,s),y(ct,s),y(gt,s),y(ut,s),y(Mt,s),y(wt,s),y(bt,s),y($t,s),y(xt,s),y(Ut,s),y(_t,s),y(At,s),y(zt,s),y(Bt,s),y(Et,s),y(Lt,s)}}}/**
 * Sp: string constant holding the heading outline of this page as
 * JSON-formatted text. Each node carries "title" (heading text), "local"
 * (slug string), "sections" (array of child nodes) and "depth" (1 for the
 * page title down to 4 for the innermost headings).
 *
 * The "local" values are not unique: "example", "interpretation",
 * "step-1-group-sampling", "step-2-advantage-calculation" and
 * "step-3-policy-update" each occur more than once.
 *
 * NOTE(review): no use of Sp is visible in this part of the file; presumably
 * the fragment factory Qp defined above reads it -- confirm before editing.
 * NOTE(review): the titles containing LaTeX are written with doubled
 * backslashes in this literal, so the runtime string holds single
 * backslashes (\space, \text, \epsilon, \beta). \s and \e are not valid JSON
 * escapes, and \t / \b decode to control characters, so JSON.parse on this
 * value would throw or mis-decode -- confirm how consumers read it.
 */const Sp='{"title":"Advanced Understanding of Group Relative Policy Optimization (GRPO) in DeepSeekMath","local":"advanced-understanding-of-group-relative-policy-optimization-grpo-in-deepseekmath","sections":[{"title":"The GRPO Algorithm","local":"the-grpo-algorithm","sections":[{"title":"Step 1: Group Sampling","local":"step-1-group-sampling","sections":[{"title":"Example:","local":"example","sections":[],"depth":4}],"depth":3},{"title":"Step 2: Advantage Calculation","local":"step-2-advantage-calculation","sections":[{"title":"Reward Distribution:","local":"reward-distribution","sections":[],"depth":4},{"title":"Advantage Value Formula:","local":"advantage-value-formula","sections":[],"depth":4},{"title":"Example:","local":"example","sections":[],"depth":4},{"title":"Interpretation:","local":"interpretation","sections":[],"depth":4}],"depth":3},{"title":"Step 3: Policy Update","local":"step-3-policy-update","sections":[],"depth":3}],"depth":2},{"title":"Key Components of the Target Function","local":"key-components-of-the-target-function","sections":[{"title":"1. Probability Ratio","local":"1-probability-ratio","sections":[{"title":"Interpretation:","local":"interpretation","sections":[],"depth":4}],"depth":3},{"title":"2. Clip Function","local":"2-clip-function","sections":[{"title":"Example $\\space \\text{suppose}(\\epsilon = 0.2)$","local":"example-space-textsupposeepsilon--02","sections":[],"depth":4},{"title":"Interpretation:","local":"interpretation","sections":[],"depth":4}],"depth":3},{"title":"3. 
KL Divergence","local":"3-kl-divergence","sections":[{"title":"Interpretation","local":"interpretation","sections":[],"depth":4},{"title":"Math Definition","local":"math-definition","sections":[],"depth":4},{"title":"The Role of $\\beta$ Parameter","local":"the-role-of-beta-parameter","sections":[],"depth":4}],"depth":3}],"depth":2},{"title":"Worked Example with GRPO","local":"worked-example-with-grpo","sections":[{"title":"Example Problem","local":"example-problem","sections":[],"depth":3},{"title":"Step 1: Group Sampling","local":"step-1-group-sampling","sections":[],"depth":3},{"title":"Step 2: Advantage Calculation","local":"step-2-advantage-calculation","sections":[],"depth":3},{"title":"Step 3: Policy Update","local":"step-3-policy-update","sections":[],"depth":3}],"depth":2},{"title":"Implementation Example","local":"implementation-example","sections":[{"title":"1. Loading the Model and Generating Responses","local":"1-loading-the-model-and-generating-responses","sections":[],"depth":3},{"title":"2. Calculating Rewards","local":"2-calculating-rewards","sections":[],"depth":3},{"title":"3. Updating the Policy","local":"3-updating-the-policy","sections":[],"depth":3}],"depth":2},{"title":"Summary and Next Steps","local":"summary-and-next-steps","sections":[],"depth":2},{"title":"References","local":"references","sections":[],"depth":2}],"depth":1}';/**
 * Hp: the function handed to zp as its third argument by the Vp constructor.
 *
 * Passes a callback to Ip (imported as `o` from the scheduler chunk;
 * presumably the Svelte onMount hook -- TODO confirm). The callback builds a
 * URLSearchParams from window.location.search and looks up the "fw" key, but
 * the looked-up value is discarded, so the callback has no visible effect.
 * NOTE(review): looks like leftover generated code -- confirm before removing.
 *
 * @param ae - not referenced in the body.
 * @returns an empty array; the comma expression evaluates the Ip call first
 *   and then yields [].
 */function Hp(ae){return Ip(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}/**
 * Vp: component class for this page; extends kp (imported as `S` from the
 * index chunk; presumably the Svelte component base class -- TODO confirm).
 *
 * The constructor takes one argument M, calls super() with no arguments and
 * then calls zp(this, M, Hp, Qp, _p, {}), passing this instance, M, the Hp
 * function, the Qp fragment factory, _p (imported as `s` from the scheduler
 * chunk) and an empty object literal.
 */class Vp extends kp{constructor(M){super(),zp(this,M,Hp,Qp,_p,{})}}/** The only export visible in this part of the file: Vp under the name "component". */export{Vp as component};

Xet Storage Details

Size:
101 kB
·
Xet hash:
241c17035be3bd86d4789fcde60991c2551b8bc703ed20650452d5e7c1397f50

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.