Buckets:
| import{s as $t,a as Rt,o as zt,n as Zt}from"../chunks/scheduler.7b731bd4.js";import{S as Gt,i as Bt,e as o,s as n,c as m,q as v,H as N,h as Ft,a as i,d as s,b as l,f as R,g as c,j as p,r as j,u as q,k as b,v as Ws,l as r,m as t,n as d,t as h,o as g,p as u}from"../chunks/index.cc268345.js";import{C as Qt,H as U,E as Wt}from"../chunks/MermaidChart.svelte_svelte_type_style_lang.f0d99f98.js";import{D as Oe}from"../chunks/Docstring.03f7b462.js";import{C as B}from"../chunks/CodeBlock.169a125f.js";import{E as Et}from"../chunks/ExampleCodeBlock.415f9452.js";function Vt(ea){let M,Y="Example:",z,k,C;return k=new B({props:{code:"ZnJvbSUyMHRybCUyMGltcG9ydCUyMFJld2FyZFRyYWluZXIlMEFmcm9tJTIwZGF0YXNldHMlMjBpbXBvcnQlMjBsb2FkX2RhdGFzZXQlMEElMEFkYXRhc2V0JTIwJTNEJTIwbG9hZF9kYXRhc2V0KCUyMnRybC1saWIlMkZ1bHRyYWZlZWRiYWNrX2JpbmFyaXplZCUyMiUyQyUyMHNwbGl0JTNEJTIydHJhaW4lMjIpJTBBJTBBdHJhaW5lciUyMCUzRCUyMFJld2FyZFRyYWluZXIoJTBBJTIwJTIwJTIwJTIwbW9kZWwlM0QlMjJRd2VuJTJGUXdlbjIuNS0wLjVCLUluc3RydWN0JTIyJTJDJTBBJTIwJTIwJTIwJTIwdHJhaW5fZGF0YXNldCUzRGRhdGFzZXQlMkMlMEEpJTBBdHJhaW5lci50cmFpbigp",highlighted:`<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> RewardTrainer | |
| <span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset | |
| dataset = load_dataset(<span class="hljs-string">"trl-lib/ultrafeedback_binarized"</span>, split=<span class="hljs-string">"train"</span>) | |
| trainer = RewardTrainer( | |
| model=<span class="hljs-string">"Qwen/Qwen2.5-0.5B-Instruct"</span>, | |
| train_dataset=dataset, | |
| ) | |
| trainer.train()`,wrap:!1}}),{c(){M=o("p"),M.textContent=Y,z=n(),m(k.$$.fragment)},l(w){M=i(w,"P",{"data-svelte-h":!0}),p(M)!=="svelte-11lpom8"&&(M.textContent=Y),z=l(w),c(k.$$.fragment,w)},m(w,$){t(w,M,$),t(w,z,$),d(k,w,$),C=!0},p:Zt,i(w){C||(h(k.$$.fragment,w),C=!0)},o(w){g(k.$$.fragment,w),C=!1},d(w){w&&(s(M),s(z)),u(k,w)}}}function Xt(ea){let M,Y,z,k,C,w,$,aa,H,Es='<a href="https://huggingface.co/models?other=reward-trainer,trl" rel="nofollow"><img src="https://img.shields.io/badge/All_models-Reward_Trainer-blue" alt="model badge"/></a>',sa,L,ta,A,Vs="TRL supports the Outcome-supervised Reward Modeling (ORM) Trainer for training reward models.",na,D,Xs='This post-training method was contributed by <a href="https://huggingface.co/ybelkada" rel="nofollow">Younes Belkada</a>.',la,P,ra,K,Ss='This example demonstrates how to train a reward model using the <a href="/docs/trl/pr_5607/en/reward_trainer#trl.RewardTrainer">RewardTrainer</a> from TRL. We train a <a href="https://huggingface.co/Qwen/Qwen3-0.6B" rel="nofollow">Qwen 3 0.6B</a> model on the <a href="https://huggingface.co/datasets/trl-lib/ultrafeedback_binarized" rel="nofollow">UltraFeedback dataset</a>, large-scale, fine-grained, diverse preference dataset.',oa,O,ia,x,Ys,pa,ee,ma,ae,Hs='<a href="/docs/trl/pr_5607/en/reward_trainer#trl.RewardTrainer">RewardTrainer</a> supports <a href="dataset_formats#preference">preference</a> datasets type (both implicit and explicit prompt). The <a href="/docs/trl/pr_5607/en/reward_trainer#trl.RewardTrainer">RewardTrainer</a> is compatible with both <a href="dataset_formats#standard">standard</a> and <a href="dataset_formats#conversational">conversational</a> dataset formats. When provided with a conversational dataset, the trainer will automatically apply the chat template to the dataset.',ca,se,da,te,Ls='If your dataset is not in one of these formats, you can preprocess it to convert it into the expected format. 
Here is an example with the <a href="https://huggingface.co/datasets/lmarena-ai/arena-human-preference-55k" rel="nofollow">lmarena-ai/arena-human-preference-55k</a> dataset:',ha,ne,ga,le,ua,re,fa,oe,As="Reward Models (RMs) are typically trained using supervised learning on datasets containing pairs of preferred and non-preferred responses. The goal is to learn a function that assigns higher scores to preferred responses, enabling the model to rank outputs based on preferences.",ya,ie,Ds="This section breaks down how reward modeling works in practice, covering the key steps: <strong>preprocessing</strong> and <strong>loss computation</strong>.",wa,pe,Ta,me,Ps=`During training, each example is expected to contain a <strong>chosen</strong> and <strong>rejected</strong> field. For more details on the expected formats, see <a href="dataset_formats#preference">Dataset formats - Preference</a>. | |
| The <a href="/docs/trl/pr_5607/en/reward_trainer#trl.RewardTrainer">RewardTrainer</a> tokenizes each input using the model’s tokenizer. If prompts and completions (chosen and rejected) are provided separately (explicit prompt case), they are concatenated before tokenization.`,Ma,ce,_a,f,Js,ba,_t='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>x</mi></mrow><annotation encoding="application/x-tex"> x </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.4306em;"></span><span class="mord mathnormal">x</span></span></span></span>',Ja,va,bt='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msup><mi>y</mi><mo>+</mo></msup></mrow><annotation encoding="application/x-tex"> y^+ </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.9658em;vertical-align:-0.1944em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">y</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.7713em;"><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mbin mtight">+</span></span></span></span></span></span></span></span></span></span></span>',ja,Ua,Jt='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msup><mi>y</mi><mo>−</mo></msup></mrow><annotation encoding="application/x-tex"> y^- </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.9658em;vertical-align:-0.1944em;"></span><span class="mord"><span class="mord mathnormal" 
style="margin-right:0.03588em;">y</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.7713em;"><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mbin mtight">−</span></span></span></span></span></span></span></span></span></span></span>',Ia,W,Ks="Bradley & Terry, 1952",vs,ka,vt='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msup><mi>y</mi><mo>+</mo></msup></mrow><annotation encoding="application/x-tex"> y^+ </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.9658em;vertical-align:-0.1944em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">y</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.7713em;"><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mbin mtight">+</span></span></span></span></span></span></span></span></span></span></span>',Ca,xa,jt='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msup><mi>y</mi><mo>−</mo></msup></mrow><annotation encoding="application/x-tex"> y^- </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.9658em;vertical-align:-0.1944em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">y</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.7713em;"><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 
mtight"><span class="mbin mtight">−</span></span></span></span></span></span></span></span></span></span></span>',Na,qa,Ut='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>r</mi></mrow><annotation encoding="application/x-tex"> r </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.4306em;"></span><span class="mord mathnormal" style="margin-right:0.02778em;">r</span></span></span></span>',$a,Ra,It='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>p</mi><mo stretchy="false">(</mo><msup><mi>y</mi><mo>+</mo></msup><mo>≻</mo><msup><mi>y</mi><mo>−</mo></msup><mi mathvariant="normal">∣</mi><mi>x</mi><mo stretchy="false">)</mo><mo>=</mo><mi>σ</mi><mo stretchy="false">(</mo><mi>r</mi><mo stretchy="false">(</mo><mi>x</mi><mo separator="true">,</mo><msup><mi>y</mi><mo>+</mo></msup><mo stretchy="false">)</mo><mo>−</mo><mi>r</mi><mo stretchy="false">(</mo><mi>x</mi><mo separator="true">,</mo><msup><mi>y</mi><mo>−</mo></msup><mo stretchy="false">)</mo><mo stretchy="false">)</mo></mrow><annotation encoding="application/x-tex"> p(y^+ ≻ y^- |x) = \\sigma(r(x, y^+)−r(x, y^-)) </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1.0213em;vertical-align:-0.25em;"></span><span class="mord mathnormal">p</span><span class="mopen">(</span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">y</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.7713em;"><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mbin mtight">+</span></span></span></span></span></span></span></span><span class="mspace" 
style="margin-right:0.2778em;"></span><span class="mrel">≻</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:1.0213em;vertical-align:-0.25em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">y</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.7713em;"><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mbin mtight">−</span></span></span></span></span></span></span></span><span class="mord">∣</span><span class="mord mathnormal">x</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:1.0213em;vertical-align:-0.25em;"></span><span class="mord mathnormal" style="margin-right:0.03588em;">σ</span><span class="mopen">(</span><span class="mord mathnormal" style="margin-right:0.02778em;">r</span><span class="mopen">(</span><span class="mord mathnormal">x</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">y</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.7713em;"><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mbin mtight">+</span></span></span></span></span></span></span></span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">−</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" 
style="height:1.0213em;vertical-align:-0.25em;"></span><span class="mord mathnormal" style="margin-right:0.02778em;">r</span><span class="mopen">(</span><span class="mord mathnormal">x</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">y</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.7713em;"><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mbin mtight">−</span></span></span></span></span></span></span></span><span class="mclose">))</span></span></span></span>',za,Za,kt='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>σ</mi></mrow><annotation encoding="application/x-tex"> σ </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.4306em;"></span><span class="mord mathnormal" style="margin-right:0.03588em;">σ</span></span></span></span>',Ga,Ba,I,js,Fa,Ct='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msub><mi>r</mi><mi>θ</mi></msub><mo stretchy="false">(</mo><mi>x</mi><mo separator="true">,</mo><mi>y</mi><mo stretchy="false">)</mo></mrow><annotation encoding="application/x-tex"> r_\\theta(x, y) </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.02778em;">r</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:-0.0278em;margin-right:0.05em;"><span class="pstrut" 
style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight" style="margin-right:0.02778em;">θ</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord mathnormal">x</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord mathnormal" style="margin-right:0.03588em;">y</span><span class="mclose">)</span></span></span></span>',Qa,Wa,xt='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msup><mi>y</mi><mo>+</mo></msup></mrow><annotation encoding="application/x-tex"> y^+ </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.9658em;vertical-align:-0.1944em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">y</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.7713em;"><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mbin mtight">+</span></span></span></span></span></span></span></span></span></span></span>',Ea,Va,Nt='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msup><mi>y</mi><mo>−</mo></msup></mrow><annotation encoding="application/x-tex"> y^- </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.9658em;vertical-align:-0.1944em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">y</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" 
style="height:0.7713em;"><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mbin mtight">−</span></span></span></span></span></span></span></span></span></span></span>',Xa,Sa,qt=`<span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><semantics><mrow><mi mathvariant="script">L</mi><mo stretchy="false">(</mo><mi>θ</mi><mo stretchy="false">)</mo><mo>=</mo><mo>−</mo><msub><mi mathvariant="double-struck">E</mi><mrow><mo stretchy="false">(</mo><mi>x</mi><mo separator="true">,</mo><msup><mi>y</mi><mo>+</mo></msup><mo separator="true">,</mo><msup><mi>y</mi><mo>−</mo></msup><mo stretchy="false">)</mo><mo>∼</mo><mi mathvariant="script">D</mi></mrow></msub><mrow><mo fence="true">[</mo><mi>log</mi><mo></mo><mi>σ</mi><mo stretchy="false">(</mo><msub><mi>r</mi><mi>θ</mi></msub><mo stretchy="false">(</mo><mi>x</mi><mo separator="true">,</mo><msup><mi>y</mi><mo>+</mo></msup><mo stretchy="false">)</mo><mo>−</mo><msub><mi>r</mi><mi>θ</mi></msub><mo stretchy="false">(</mo><mi>x</mi><mo separator="true">,</mo><msup><mi>y</mi><mo>−</mo></msup><mo stretchy="false">)</mo><mo stretchy="false">)</mo><mo fence="true">]</mo></mrow><mi mathvariant="normal">.</mi></mrow><annotation encoding="application/x-tex"> | |
| \\mathcal{L}(\\theta) = - \\mathbb{E}_{(x,y^+,y^-) \\sim \\mathcal{D}} \\left[ \\log \\sigma(r_\\theta(x, y^+) - r_\\theta(x, y^-)) \\right]. | |
| </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord mathcal">L</span><span class="mopen">(</span><span class="mord mathnormal" style="margin-right:0.02778em;">θ</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:1.2052em;vertical-align:-0.3552em;"></span><span class="mord">−</span><span class="mord"><span class="mord mathbb">E</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3448em;"><span style="top:-2.5198em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mopen mtight">(</span><span class="mord mathnormal mtight">x</span><span class="mpunct mtight">,</span><span class="mord mtight"><span class="mord mathnormal mtight" style="margin-right:0.03588em;">y</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.7027em;"><span style="top:-2.786em;margin-right:0.0714em;"><span class="pstrut" style="height:2.5em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mbin mtight">+</span></span></span></span></span></span></span></span><span class="mpunct mtight">,</span><span class="mord mtight"><span class="mord mathnormal mtight" style="margin-right:0.03588em;">y</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.7027em;"><span style="top:-2.786em;margin-right:0.0714em;"><span class="pstrut" style="height:2.5em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mbin 
mtight">−</span></span></span></span></span></span></span></span><span class="mclose mtight">)</span><span class="mrel mtight">∼</span><span class="mord mathcal mtight" style="margin-right:0.02778em;">D</span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.3552em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="minner"><span class="mopen delimcenter" style="top:0em;"><span class="delimsizing size1">[</span></span><span class="mop">lo<span style="margin-right:0.01389em;">g</span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord mathnormal" style="margin-right:0.03588em;">σ</span><span class="mopen">(</span><span class="mord"><span class="mord mathnormal" style="margin-right:0.02778em;">r</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:-0.0278em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight" style="margin-right:0.02778em;">θ</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord mathnormal">x</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">y</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.8213em;"><span style="top:-3.113em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mbin 
mtight">+</span></span></span></span></span></span></span></span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">−</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.02778em;">r</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:-0.0278em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight" style="margin-right:0.02778em;">θ</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord mathnormal">x</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">y</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.8213em;"><span style="top:-3.113em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mbin mtight">−</span></span></span></span></span></span></span></span><span class="mclose">))</span><span class="mclose delimcenter" style="top:0em;"><span class="delimsizing size1">]</span></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord">.</span></span></span></span></span>`,Ya,E,Os='<p>The Bradley-Terry model is underdetermined, meaning that adding a constant to all rewards does not change the preference probabilities. To address this, <a href="https://huggingface.co/papers/2312.09244" rel="nofollow">Helping or Herding? 
Reward Model Ensembles Mitigate but do not Eliminate Reward Hacking</a> proposes adding an auxiliary loss term that encourages the rewards to be centered around zero. This is controlled by the <code>center_rewards_coefficient</code> parameter in the <a href="/docs/trl/pr_5607/en/reward_trainer#trl.RewardConfig">RewardConfig</a>. The recommended value is <code>1e-2</code>.</p>',Ha,de,La,he,et="While training and evaluating we record the following reward metrics:",Aa,ge,at="<li><code>global_step</code>: The total number of optimizer steps taken so far.</li> <li><code>epoch</code>: The current epoch number, based on dataset iteration.</li> <li><code>num_tokens</code>: The total number of tokens processed so far.</li> <li><code>loss</code>: The average loss over the last logging interval.</li> <li><code>accuracy</code>: The proportion of correct predictions (i.e., the model assigned a higher score to the chosen response than to the rejected one) averaged over the last logging interval.</li> <li><code>min_reward</code>: The minimum reward score assigned by the model. This value is averaged over the logging interval.</li> <li><code>mean_reward</code>: The average reward score assigned by the model over the last logging interval.</li> <li><code>max_reward</code>: The maximum reward score assigned by the model. This value is averaged over the logging interval.</li> <li><code>margin</code>: The average margin (difference between chosen and rejected rewards) over the last logging interval.</li> <li><code>learning_rate</code>: The current learning rate, which may change dynamically if a scheduler is used.</li> <li><code>grad_norm</code>: The L2 norm of the gradients, computed before gradient clipping.</li>",Da,ue,Pa,fe,Ka,ye,st='You can directly pass the kwargs of the <code>from_pretrained()</code> method to the <a href="/docs/trl/pr_5607/en/reward_trainer#trl.RewardConfig">RewardConfig</a>. 
For example, if you want to load a model in a different precision, analogous to',Oa,we,es,Te,tt='you can do so by passing the <code>model_init_kwargs={"dtype": torch.bfloat16}</code> argument to the <a href="/docs/trl/pr_5607/en/reward_trainer#trl.RewardConfig">RewardConfig</a>.',as,Me,ss,_e,nt="Note that all keyword arguments of <code>from_pretrained()</code> are supported, except for <code>num_labels</code>, which is automatically set to 1.",ts,be,ns,Je,lt="We support tight integration with 🤗 PEFT library, allowing any user to conveniently train adapters and share them on the Hub, rather than training the entire model.",ls,ve,rs,je,rt='You can also continue training your <a href="https://huggingface.co/docs/peft/main/en/package_reference/peft_model#peft.PeftModel" rel="nofollow">PeftModel</a>. For that, first load a <code>PeftModel</code> outside <a href="/docs/trl/pr_5607/en/reward_trainer#trl.RewardTrainer">RewardTrainer</a> and pass it directly to the trainer without the <code>peft_config</code> argument being passed.',os,Ue,is,F,We,ot="When training adapters, you typically use a higher learning rate (≈1e‑3) since only new parameters are being learned.",Us,Ie,ps,ke,ms,Ce,it='The <a href="/docs/trl/pr_5607/en/reward_trainer#trl.RewardTrainer">RewardTrainer</a> fully supports fine-tuning models with <em>tool calling</em> capabilities. 
In this case, each dataset example should include:',cs,xe,pt="<li>The conversation messages, including any tool calls (<code>tool_calls</code>) and tool responses (<code>tool</code> role messages)</li> <li>The list of available tools in the <code>tools</code> column, typically provided as JSON schemas</li>",ds,Ne,mt='For details on the expected dataset structure, see the <a href="dataset_formats#tool-calling">Dataset Format — Tool Calling</a> section.',hs,qe,gs,T,$e,Is,Ee,ct="Trainer for Outcome-supervised Reward Models (ORM).",ks,Ve,dt='This class is a wrapper around the <a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer" rel="nofollow">Trainer</a> class and inherits all of its attributes and methods.',Cs,V,xs,X,Re,Ns,Xe,ht="Main training entry point.",qs,Z,ze,$s,Se,gt="Will save the model, so you can reload it using <code>from_pretrained()</code>.",Rs,Ye,ut="Will only save from the main process.",zs,S,Ze,Zs,He,ft="Upload <code>self.model</code> and <code>self.processing_class</code> to the 🤗 model hub on the repo <code>self.args.hub_model_id</code>.",us,Ge,fs,_,Be,Gs,Le,yt='Configuration class for the <a href="/docs/trl/pr_5607/en/reward_trainer#trl.RewardTrainer">RewardTrainer</a>.',Bs,Ae,wt=`This class includes only the parameters that are specific to Reward training. For a full list of training | |
| arguments, please refer to the <a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments" rel="nofollow">TrainingArguments</a> documentation. Note that default values in this | |
| class may differ from those in <a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments" rel="nofollow">TrainingArguments</a>.`,Fs,De,Tt=`Using <a href="https://huggingface.co/docs/transformers/main/en/internal/trainer_utils#transformers.HfArgumentParser" rel="nofollow">HfArgumentParser</a> we can turn this class into | |
| <a href="https://docs.python.org/3/library/argparse#module-argparse" rel="nofollow">argparse</a> arguments that can be specified on the | |
| command line.`,Qs,Fe,Mt='<p>These parameters have default values different from <a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments" rel="nofollow">TrainingArguments</a>:</p> <ul><li><code>logging_steps</code>: Defaults to <code>10</code> instead of <code>500</code>.</li> <li><code>gradient_checkpointing</code>: Defaults to <code>True</code> instead of <code>False</code>.</li> <li><code>bf16</code>: Defaults to <code>True</code> if <code>fp16</code> is not set, instead of <code>False</code>.</li> <li><code>learning_rate</code>: Defaults to <code>1e-4</code> instead of <code>5e-5</code>.</li></ul>',ys,Qe,ws,Ke,Ts;return C=new Qt({props:{containerStyle:"float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"}}),$=new U({props:{title:"Reward Modeling",local:"reward-modeling",headingTag:"h1"}}),L=new U({props:{title:"Overview",local:"overview",headingTag:"h2"}}),P=new U({props:{title:"Quick start",local:"quick-start",headingTag:"h2"}}),O=new B({props:{code:"ZnJvbSUyMHRybCUyMGltcG9ydCUyMFJld2FyZFRyYWluZXIlMEFmcm9tJTIwZGF0YXNldHMlMjBpbXBvcnQlMjBsb2FkX2RhdGFzZXQlMEElMEF0cmFpbmVyJTIwJTNEJTIwUmV3YXJkVHJhaW5lciglMEElMjAlMjAlMjAlMjBtb2RlbCUzRCUyMlF3ZW4lMkZRd2VuMy0wLjZCJTIyJTJDJTBBJTIwJTIwJTIwJTIwdHJhaW5fZGF0YXNldCUzRGxvYWRfZGF0YXNldCglMjJ0cmwtbGliJTJGdWx0cmFmZWVkYmFja19iaW5hcml6ZWQlMjIlMkMlMjBzcGxpdCUzRCUyMnRyYWluJTIyKSUyQyUwQSklMEF0cmFpbmVyLnRyYWluKCk=",highlighted:`<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> RewardTrainer | |
| <span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset | |
| trainer = RewardTrainer( | |
| model=<span class="hljs-string">"Qwen/Qwen3-0.6B"</span>, | |
| train_dataset=load_dataset(<span class="hljs-string">"trl-lib/ultrafeedback_binarized"</span>, split=<span class="hljs-string">"train"</span>), | |
| ) | |
| trainer.train()`,wrap:!1}}),ee=new U({props:{title:"Expected dataset type and format",local:"expected-dataset-type-and-format",headingTag:"h2"}}),se=new B({props:{code:"JTIzJTIwU3RhbmRhcmQlMjBwcmVmZXJlbmNlJTIwKGltcGxpY2l0JTIwcHJvbXB0KSUwQSU3QiUyMmNob3NlbiUyMiUzQSUyMCUyMlRoZSUyMHNreSUyMGlzJTIwYmx1ZS4lMjIlMkMlMEElMjAlMjJyZWplY3RlZCUyMiUzQSUyMCUyMlRoZSUyMHNreSUyMGlzJTIwZ3JlZW4uJTIyJTdEJTBBJTBBJTIzJTIwQ29udmVyc2F0aW9uYWwlMjBwcmVmZXJlbmNlJTIwKGltcGxpY2l0JTIwcHJvbXB0KSUwQSU3QiUyMmNob3NlbiUyMiUzQSUyMCU1QiU3QiUyMnJvbGUlMjIlM0ElMjAlMjJ1c2VyJTIyJTJDJTIwJTIyY29udGVudCUyMiUzQSUyMCUyMldoYXQlMjBjb2xvciUyMGlzJTIwdGhlJTIwc2t5JTNGJTIyJTdEJTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTdCJTIycm9sZSUyMiUzQSUyMCUyMmFzc2lzdGFudCUyMiUyQyUyMCUyMmNvbnRlbnQlMjIlM0ElMjAlMjJJdCUyMGlzJTIwYmx1ZS4lMjIlN0QlNUQlMkMlMEElMjAlMjJyZWplY3RlZCUyMiUzQSUyMCU1QiU3QiUyMnJvbGUlMjIlM0ElMjAlMjJ1c2VyJTIyJTJDJTIwJTIyY29udGVudCUyMiUzQSUyMCUyMldoYXQlMjBjb2xvciUyMGlzJTIwdGhlJTIwc2t5JTNGJTIyJTdEJTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTdCJTIycm9sZSUyMiUzQSUyMCUyMmFzc2lzdGFudCUyMiUyQyUyMCUyMmNvbnRlbnQlMjIlM0ElMjAlMjJJdCUyMGlzJTIwZ3JlZW4uJTIyJTdEJTVEJTdEJTBBJTBBJTIzJTIwU3RhbmRhcmQlMjBwcmVmZXJlbmNlJTIwKGV4cGxpY2l0JTIwcHJvbXB0KSUwQSU3QiUyMnByb21wdCUyMiUzQSUyMCUyMlRoZSUyMHNreSUyMGlzJTIyJTJDJTBBJTIwJTIyY2hvc2VuJTIyJTNBJTIwJTIyJTIwYmx1ZS4lMjIlMkMlMEElMjAlMjJyZWplY3RlZCUyMiUzQSUyMCUyMiUyMGdyZWVuLiUyMiU3RCUwQSUwQSUyMyUyMENvbnZlcnNhdGlvbmFsJTIwcHJlZmVyZW5jZSUyMChleHBsaWNpdCUyMHByb21wdCklMEElN0IlMjJwcm9tcHQlMjIlM0ElMjAlNUIlN0IlMjJyb2xlJTIyJTNBJTIwJTIydXNlciUyMiUyQyUyMCUyMmNvbnRlbnQlMjIlM0ElMjAlMjJXaGF0JTIwY29sb3IlMjBpcyUyMHRoZSUyMHNreSUzRiUyMiU3RCU1RCUyQyUwQSUyMCUyMmNob3NlbiUyMiUzQSUyMCU1QiU3QiUyMnJvbGUlMjIlM0ElMjAlMjJhc3Npc3RhbnQlMjIlMkMlMjAlMjJjb250ZW50JTIyJTNBJTIwJTIySXQlMjBpcyUyMGJsdWUuJTIyJTdEJTVEJTJDJTBBJTIwJTIycmVqZWN0ZWQlMjIlM0ElMjAlNUIlN0IlMjJyb2xlJTIyJTNBJTIwJTIyYXNzaXN0YW50JTIyJTJDJTIwJTIyY29udGVudCUyMiUzQSUyMCUyMkl0JTIwaXMlMjBncmVlbi4lMjIlN0QlNUQlN0Q=",highlighted:`<span 
class="hljs-comment"># Standard preference (implicit prompt)</span> | |
| {<span class="hljs-string">"chosen"</span>: <span class="hljs-string">"The sky is blue."</span>, | |
| <span class="hljs-string">"rejected"</span>: <span class="hljs-string">"The sky is green."</span>} | |
| <span class="hljs-comment"># Conversational preference (implicit prompt)</span> | |
| {<span class="hljs-string">"chosen"</span>: [{<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"What color is the sky?"</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"assistant"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"It is blue."</span>}], | |
| <span class="hljs-string">"rejected"</span>: [{<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"What color is the sky?"</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"assistant"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"It is green."</span>}]} | |
| <span class="hljs-comment"># Standard preference (explicit prompt)</span> | |
| {<span class="hljs-string">"prompt"</span>: <span class="hljs-string">"The sky is"</span>, | |
| <span class="hljs-string">"chosen"</span>: <span class="hljs-string">" blue."</span>, | |
| <span class="hljs-string">"rejected"</span>: <span class="hljs-string">" green."</span>} | |
| <span class="hljs-comment"># Conversational preference (explicit prompt)</span> | |
| {<span class="hljs-string">"prompt"</span>: [{<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"What color is the sky?"</span>}], | |
| <span class="hljs-string">"chosen"</span>: [{<span class="hljs-string">"role"</span>: <span class="hljs-string">"assistant"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"It is blue."</span>}], | |
| <span class="hljs-string">"rejected"</span>: [{<span class="hljs-string">"role"</span>: <span class="hljs-string">"assistant"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"It is green."</span>}]}`,wrap:!1}}),ne=new B({props:{code:"ZnJvbSUyMGRhdGFzZXRzJTIwaW1wb3J0JTIwbG9hZF9kYXRhc2V0JTBBaW1wb3J0JTIwanNvbiUwQSUwQWRhdGFzZXQlMjAlM0QlMjBsb2FkX2RhdGFzZXQoJTIybG1hcmVuYS1haSUyRmFyZW5hLWh1bWFuLXByZWZlcmVuY2UtNTVrJTIyKSUwQSUwQSUyMyUyMEZpbHRlciUyMG91dCUyMHRpZXMlMEFkYXRhc2V0JTIwJTNEJTIwZGF0YXNldC5maWx0ZXIobGFtYmRhJTIwZXhhbXBsZSUzQSUyMGV4YW1wbGUlNUIlMjJ3aW5uZXJfdGllJTIyJTVEJTIwJTNEJTNEJTIwMCklMEElMEElMjMlMjBDcmVhdGUlMjAnY2hvc2VuJyUyMGFuZCUyMCdyZWplY3RlZCclMjBmaWVsZHMlMjBiYXNlZCUyMG9uJTIwdGhlJTIwd2lubmVyJTIwY29sdW1uJTBBZGVmJTIwcmVzcG9uc2VfYV9iX3RvX2Nob3Nlbl9yZWplY3RlZChleGFtcGxlKSUzQSUwQSUyMCUyMCUyMCUyMGlmJTIwZXhhbXBsZSU1QiUyMndpbm5lcl9tb2RlbF9hJTIyJTVEJTIwJTNEJTNEJTIwMSUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGV4YW1wbGUlNUIlMjJjaG9zZW4lMjIlNUQlMjAlM0QlMjBleGFtcGxlJTVCJTIycmVzcG9uc2VfYSUyMiU1RCUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGV4YW1wbGUlNUIlMjJyZWplY3RlZCUyMiU1RCUyMCUzRCUyMGV4YW1wbGUlNUIlMjJyZXNwb25zZV9iJTIyJTVEJTBBJTIwJTIwJTIwJTIwZWxzZSUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGV4YW1wbGUlNUIlMjJjaG9zZW4lMjIlNUQlMjAlM0QlMjBleGFtcGxlJTVCJTIycmVzcG9uc2VfYiUyMiU1RCUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGV4YW1wbGUlNUIlMjJyZWplY3RlZCUyMiU1RCUyMCUzRCUyMGV4YW1wbGUlNUIlMjJyZXNwb25zZV9hJTIyJTVEJTBBJTIwJTIwJTIwJTIwcmV0dXJuJTIwZXhhbXBsZSUwQSUwQWRhdGFzZXQlMjAlM0QlMjBkYXRhc2V0Lm1hcChyZXNwb25zZV9hX2JfdG9fY2hvc2VuX3JlamVjdGVkKSUwQSUwQSUyMyUyMENvbnZlcnQlMjB0byUyMGNvbnZlcnNhdGlvbmFsJTIwZm9ybWF0JTBBZGVmJTIwbWFrZV9jb252ZXJzYXRpb24oZXhhbXBsZSklM0ElMEElMjAlMjAlMjAlMjBwcm9tcHQlMjAlM0QlMjBqc29uLmxvYWRzKGV4YW1wbGUlNUIlMjJwcm9tcHQlMjIlNUQpJTVCMCU1RCUyMCUyMCUyMyUyMCclNUIlMjJXaGF0JTIwY29sb3IlMjBpcyUyMHRoZSUyMHNreSUzRiUyMiU1RCclMjAtJTNFJTIwJTIyV2hhdCUyMGNvbG9yJTIwaXMlMjB0aGUlMjBza3klM0YlMjIlMEElMjAlMjAlMjAlMjBjaG9zZW4lMjAlM0QlMjBqc29uLmxvYWRzKGV4YW1wbGUlNUIlMjJjaG9zZW4lMjIlNUQpJT
VCMCU1RCUwQSUyMCUyMCUyMCUyMHJlamVjdGVkJTIwJTNEJTIwanNvbi5sb2FkcyhleGFtcGxlJTVCJTIycmVqZWN0ZWQlMjIlNUQpJTVCMCU1RCUwQSUyMCUyMCUyMCUyMHJldHVybiUyMCU3QiUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMmNob3NlbiUyMiUzQSUyMCU1QiU3QiUyMnJvbGUlMjIlM0ElMjAlMjJ1c2VyJTIyJTJDJTIwJTIyY29udGVudCUyMiUzQSUyMHByb21wdCU3RCUyQyUyMCU3QiUyMnJvbGUlMjIlM0ElMjAlMjJhc3Npc3RhbnQlMjIlMkMlMjAlMjJjb250ZW50JTIyJTNBJTIwY2hvc2VuJTdEJTVEJTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIycmVqZWN0ZWQlMjIlM0ElMjAlNUIlN0IlMjJyb2xlJTIyJTNBJTIwJTIydXNlciUyMiUyQyUyMCUyMmNvbnRlbnQlMjIlM0ElMjBwcm9tcHQlN0QlMkMlMjAlN0IlMjJyb2xlJTIyJTNBJTIwJTIyYXNzaXN0YW50JTIyJTJDJTIwJTIyY29udGVudCUyMiUzQSUyMHJlamVjdGVkJTdEJTVEJTJDJTBBJTIwJTIwJTIwJTIwJTdEJTBBJTBBJTBBZGF0YXNldCUyMCUzRCUyMGRhdGFzZXQubWFwKG1ha2VfY29udmVyc2F0aW9uKSUwQSUwQSUyMyUyMEtlZXAlMjBvbmx5JTIwbmVjZXNzYXJ5JTIwY29sdW1ucyUwQWRhdGFzZXQlMjAlM0QlMjBkYXRhc2V0LnNlbGVjdF9jb2x1bW5zKCU1QiUyMmNob3NlbiUyMiUyQyUyMCUyMnJlamVjdGVkJTIyJTVEKSUwQSUwQXByaW50KG5leHQoaXRlcihkYXRhc2V0JTVCJTIydHJhaW4lMjIlNUQpKSk=",highlighted:`<span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset | |
| <span class="hljs-keyword">import</span> json | |
| dataset = load_dataset(<span class="hljs-string">"lmarena-ai/arena-human-preference-55k"</span>) | |
| <span class="hljs-comment"># Filter out ties</span> | |
| dataset = dataset.<span class="hljs-built_in">filter</span>(<span class="hljs-keyword">lambda</span> example: example[<span class="hljs-string">"winner_tie"</span>] == <span class="hljs-number">0</span>) | |
| <span class="hljs-comment"># Create 'chosen' and 'rejected' fields based on the winner column</span> | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">response_a_b_to_chosen_rejected</span>(<span class="hljs-params">example</span>): | |
| <span class="hljs-keyword">if</span> example[<span class="hljs-string">"winner_model_a"</span>] == <span class="hljs-number">1</span>: | |
| example[<span class="hljs-string">"chosen"</span>] = example[<span class="hljs-string">"response_a"</span>] | |
| example[<span class="hljs-string">"rejected"</span>] = example[<span class="hljs-string">"response_b"</span>] | |
| <span class="hljs-keyword">else</span>: | |
| example[<span class="hljs-string">"chosen"</span>] = example[<span class="hljs-string">"response_b"</span>] | |
| example[<span class="hljs-string">"rejected"</span>] = example[<span class="hljs-string">"response_a"</span>] | |
| <span class="hljs-keyword">return</span> example | |
| dataset = dataset.<span class="hljs-built_in">map</span>(response_a_b_to_chosen_rejected) | |
| <span class="hljs-comment"># Convert to conversational format</span> | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">make_conversation</span>(<span class="hljs-params">example</span>): | |
| prompt = json.loads(example[<span class="hljs-string">"prompt"</span>])[<span class="hljs-number">0</span>] <span class="hljs-comment"># '["What color is the sky?"]' -> "What color is the sky?"</span> | |
| chosen = json.loads(example[<span class="hljs-string">"chosen"</span>])[<span class="hljs-number">0</span>] | |
| rejected = json.loads(example[<span class="hljs-string">"rejected"</span>])[<span class="hljs-number">0</span>] | |
| <span class="hljs-keyword">return</span> { | |
| <span class="hljs-string">"chosen"</span>: [{<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: prompt}, {<span class="hljs-string">"role"</span>: <span class="hljs-string">"assistant"</span>, <span class="hljs-string">"content"</span>: chosen}], | |
| <span class="hljs-string">"rejected"</span>: [{<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: prompt}, {<span class="hljs-string">"role"</span>: <span class="hljs-string">"assistant"</span>, <span class="hljs-string">"content"</span>: rejected}], | |
| } | |
| dataset = dataset.<span class="hljs-built_in">map</span>(make_conversation) | |
| <span class="hljs-comment"># Keep only necessary columns</span> | |
| dataset = dataset.select_columns([<span class="hljs-string">"chosen"</span>, <span class="hljs-string">"rejected"</span>]) | |
| <span class="hljs-built_in">print</span>(<span class="hljs-built_in">next</span>(<span class="hljs-built_in">iter</span>(dataset[<span class="hljs-string">"train"</span>])))`,wrap:!1}}),le=new B({props:{code:"JTdCJTBBJTIwJTIwJTIwJTIwJTIyY2hvc2VuJTIyJTNBJTIwJTVCJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTdCJTIycm9sZSUyMiUzQSUyMCUyMnVzZXIlMjIlMkMlMjAlMjJjb250ZW50JTIyJTNBJTIwJTIySXMlMjBpdCUyMG1vcmFsbHklMjByaWdodCUyMHRvJTIwdHJ5JTIwdG8lMjBoYXZlJTIwYSUyMGNlcnRhaW4lMjBwZXJjZW50YWdlJTIwb2YlMjBmZW1hbGVzJTIwb24lMjBtYW5hZ2VyaWFsJTIwcG9zaXRpb25zJTNGJTIyJTdEJTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTdCJTIycm9sZSUyMiUzQSUyMCUyMmFzc2lzdGFudCUyMiUyQyUyMCUyMmNvbnRlbnQlMjIlM0ElMjAlMjJUaGUlMjBxdWVzdGlvbiUyMG9mJTIwd2hldGhlciUyMGl0JTIwaXMlMjBtb3JhbGx5JTIwcmlnaHQlMjB0byUyMGFpbSUyMGZvciUyMGElMjBjZXJ0YWluJTIwcGVyY2VudGFnZSUyMG9mJTIwZmVtYWxlcy4uLiUyMiU3RCUyQyUwQSUyMCUyMCUyMCUyMCU1RCUyQyUwQSUyMCUyMCUyMCUyMCUyMnJlamVjdGVkJTIyJTNBJTIwJTVCJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTdCJTIycm9sZSUyMiUzQSUyMCUyMnVzZXIlMjIlMkMlMjAlMjJjb250ZW50JTIyJTNBJTIwJTIySXMlMjBpdCUyMG1vcmFsbHklMjByaWdodCUyMHRvJTIwdHJ5JTIwdG8lMjBoYXZlJTIwYSUyMGNlcnRhaW4lMjBwZXJjZW50YWdlJTIwb2YlMjBmZW1hbGVzJTIwb24lMjBtYW5hZ2VyaWFsJTIwcG9zaXRpb25zJTNGJTIyJTdEJTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTdCJTIycm9sZSUyMiUzQSUyMCUyMmFzc2lzdGFudCUyMiUyQyUyMCUyMmNvbnRlbnQlMjIlM0ElMjAlMjJBcyUyMGFuJTIwQUklMkMlMjBJJTIwZG9uJ3QlMjBoYXZlJTIwcGVyc29uYWwlMjBiZWxpZWZzJTIwb3IlMjBvcGluaW9ucy4lMjBIb3dldmVyJTJDJTIwLi4uJTIyJTdEJTJDJTBBJTIwJTIwJTIwJTIwJTVEJTJDJTBBJTdE",highlighted:`<span class="hljs-punctuation">{</span> | |
| <span class="hljs-attr">"chosen"</span><span class="hljs-punctuation">:</span> <span class="hljs-punctuation">[</span> | |
| <span class="hljs-punctuation">{</span><span class="hljs-attr">"role"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"user"</span><span class="hljs-punctuation">,</span> <span class="hljs-attr">"content"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"Is it morally right to try to have a certain percentage of females on managerial positions?"</span><span class="hljs-punctuation">}</span><span class="hljs-punctuation">,</span> | |
| <span class="hljs-punctuation">{</span><span class="hljs-attr">"role"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"assistant"</span><span class="hljs-punctuation">,</span> <span class="hljs-attr">"content"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"The question of whether it is morally right to aim for a certain percentage of females..."</span><span class="hljs-punctuation">}</span><span class="hljs-punctuation">,</span> | |
| <span class="hljs-punctuation">]</span><span class="hljs-punctuation">,</span> | |
| <span class="hljs-attr">"rejected"</span><span class="hljs-punctuation">:</span> <span class="hljs-punctuation">[</span> | |
| <span class="hljs-punctuation">{</span><span class="hljs-attr">"role"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"user"</span><span class="hljs-punctuation">,</span> <span class="hljs-attr">"content"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"Is it morally right to try to have a certain percentage of females on managerial positions?"</span><span class="hljs-punctuation">}</span><span class="hljs-punctuation">,</span> | |
| <span class="hljs-punctuation">{</span><span class="hljs-attr">"role"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"assistant"</span><span class="hljs-punctuation">,</span> <span class="hljs-attr">"content"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"As an AI, I don't have personal beliefs or opinions. However, ..."</span><span class="hljs-punctuation">}</span><span class="hljs-punctuation">,</span> | |
| <span class="hljs-punctuation">]</span><span class="hljs-punctuation">,</span> | |
| <span class="hljs-punctuation">}</span>`,wrap:!1}}),re=new U({props:{title:"Looking deeper into the training method",local:"looking-deeper-into-the-training-method",headingTag:"h2"}}),pe=new U({props:{title:"Preprocessing and tokenization",local:"preprocessing-and-tokenization",headingTag:"h3"}}),ce=new U({props:{title:"Computing the loss",local:"computing-the-loss",headingTag:"h3"}}),de=new U({props:{title:"Logged metrics",local:"logged-metrics",headingTag:"h2"}}),ue=new U({props:{title:"Customization",local:"customization",headingTag:"h2"}}),fe=new U({props:{title:"Model initialization",local:"model-initialization",headingTag:"h3"}}),we=new B({props:{code:"bW9kZWwlMjAlM0QlMjBBdXRvTW9kZWxGb3JTZXF1ZW5jZUNsYXNzaWZpY2F0aW9uLmZyb21fcHJldHJhaW5lZCglMjJRd2VuJTJGUXdlbjMtMC42QiUyMiUyQyUyMGR0eXBlJTNEdG9yY2guYmZsb2F0MTYp",highlighted:'model = AutoModelForSequenceClassification.from_pretrained(<span class="hljs-string">"Qwen/Qwen3-0.6B"</span>, dtype=torch.bfloat16)',wrap:!1}}),Me=new B({props:{code:"ZnJvbSUyMHRybCUyMGltcG9ydCUyMFJld2FyZENvbmZpZyUwQSUwQXRyYWluaW5nX2FyZ3MlMjAlM0QlMjBSZXdhcmRDb25maWcoJTBBJTIwJTIwJTIwJTIwbW9kZWxfaW5pdF9rd2FyZ3MlM0QlN0IlMjJkdHlwZSUyMiUzQSUyMHRvcmNoLmJmbG9hdDE2JTdEJTJDJTBBKQ==",highlighted:`<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> RewardConfig | |
| training_args = RewardConfig( | |
| model_init_kwargs={<span class="hljs-string">"dtype"</span>: torch.bfloat16}, | |
| )`,wrap:!1}}),be=new U({props:{title:"Train adapters with PEFT",local:"train-adapters-with-peft",headingTag:"h3"}}),ve=new B({props:{code:"ZnJvbSUyMGRhdGFzZXRzJTIwaW1wb3J0JTIwbG9hZF9kYXRhc2V0JTBBZnJvbSUyMHRybCUyMGltcG9ydCUyMFJld2FyZFRyYWluZXIlMEFmcm9tJTIwcGVmdCUyMGltcG9ydCUyMExvcmFDb25maWclMEElMEFkYXRhc2V0JTIwJTNEJTIwbG9hZF9kYXRhc2V0KCUyMnRybC1saWIlMkZ1bHRyYWZlZWRiYWNrX2JpbmFyaXplZCUyMiUyQyUyMHNwbGl0JTNEJTIydHJhaW4lMjIpJTBBJTBBdHJhaW5lciUyMCUzRCUyMFJld2FyZFRyYWluZXIoJTBBJTIwJTIwJTIwJTIwJTIyUXdlbiUyRlF3ZW4zLTRCJTIyJTJDJTBBJTIwJTIwJTIwJTIwdHJhaW5fZGF0YXNldCUzRGRhdGFzZXQlMkMlMEElMjAlMjAlMjAlMjBwZWZ0X2NvbmZpZyUzRExvcmFDb25maWcobW9kdWxlc190b19zYXZlJTNEJTVCJTIyc2NvcmUlMjIlNUQpJTIwJTIwJTIzJTIwaW1wb3J0YW50JTIwdG8lMjBpbmNsdWRlJTIwdGhlJTIwc2NvcmUlMjBoZWFkJTIwd2hlbiUyMGJhc2UlMjBtb2RlbCUyMGlzJTIwbm90JTIwYSUyMHNlcXVlbmNlJTIwY2xhc3NpZmljYXRpb24lMjBtb2RlbCUwQSklMEElMEF0cmFpbmVyLnRyYWluKCk=",highlighted:`<span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset | |
| <span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> RewardTrainer | |
| <span class="hljs-keyword">from</span> peft <span class="hljs-keyword">import</span> LoraConfig | |
| dataset = load_dataset(<span class="hljs-string">"trl-lib/ultrafeedback_binarized"</span>, split=<span class="hljs-string">"train"</span>) | |
| trainer = RewardTrainer( | |
| <span class="hljs-string">"Qwen/Qwen3-4B"</span>, | |
| train_dataset=dataset, | |
| peft_config=LoraConfig(modules_to_save=[<span class="hljs-string">"score"</span>]) <span class="hljs-comment"># important to include the score head when base model is not a sequence classification model</span> | |
| ) | |
| trainer.train()`,wrap:!1}}),Ue=new B({props:{code:"ZnJvbSUyMGRhdGFzZXRzJTIwaW1wb3J0JTIwbG9hZF9kYXRhc2V0JTBBZnJvbSUyMHRybCUyMGltcG9ydCUyMFJld2FyZFRyYWluZXIlMEFmcm9tJTIwcGVmdCUyMGltcG9ydCUyMEF1dG9QZWZ0TW9kZWxGb3JDYXVzYWxMTSUwQSUwQW1vZGVsJTIwJTNEJTIwQXV0b1BlZnRNb2RlbEZvckNhdXNhbExNLmZyb21fcHJldHJhaW5lZCglMjJ0cmwtbGliJTJGUXdlbjMtNEItUmV3YXJkLUxvUkElMjIlMkMlMjBpc190cmFpbmFibGUlM0RUcnVlKSUwQWRhdGFzZXQlMjAlM0QlMjBsb2FkX2RhdGFzZXQoJTIydHJsLWxpYiUyRkNhcHliYXJhJTIyJTJDJTIwc3BsaXQlM0QlMjJ0cmFpbiUyMiklMEElMEF0cmFpbmVyJTIwJTNEJTIwUmV3YXJkVHJhaW5lciglMEElMjAlMjAlMjAlMjBtb2RlbCUzRG1vZGVsJTJDJTBBJTIwJTIwJTIwJTIwdHJhaW5fZGF0YXNldCUzRGRhdGFzZXQlMkMlMEEpJTBBJTBBdHJhaW5lci50cmFpbigp",highlighted:`<span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset | |
| <span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> RewardTrainer | |
| <span class="hljs-keyword">from</span> peft <span class="hljs-keyword">import</span> AutoPeftModelForCausalLM | |
| model = AutoPeftModelForCausalLM.from_pretrained(<span class="hljs-string">"trl-lib/Qwen3-4B-Reward-LoRA"</span>, is_trainable=<span class="hljs-literal">True</span>) | |
| dataset = load_dataset(<span class="hljs-string">"trl-lib/Capybara"</span>, split=<span class="hljs-string">"train"</span>) | |
| trainer = RewardTrainer( | |
| model=model, | |
| train_dataset=dataset, | |
| ) | |
| trainer.train()`,wrap:!1}}),Ie=new B({props:{code:"UmV3YXJkQ29uZmlnKGxlYXJuaW5nX3JhdGUlM0QxZS0zJTJDJTIwLi4uKQ==",highlighted:'RewardConfig(learning_rate=<span class="hljs-number">1e-3</span>, ...)',wrap:!1}}),ke=new U({props:{title:"Tool Calling with Reward Modeling",local:"tool-calling-with-reward-modeling",headingTag:"h2"}}),qe=new U({props:{title:"RewardTrainer",local:"trl.RewardTrainer",headingTag:"h2"}}),$e=new Oe({props:{name:"class trl.RewardTrainer",anchor:"trl.RewardTrainer",parameters:[{name:"model",val:": str | PreTrainedModel | PeftModel"},{name:"args",val:": trl.trainer.reward_config.RewardConfig | None = None"},{name:"data_collator",val:": collections.abc.Callable[[list[typing.Any]], dict[str, typing.Any]] | None = None"},{name:"train_dataset",val:": datasets.arrow_dataset.Dataset | datasets.iterable_dataset.IterableDataset | None = None"},{name:"eval_dataset",val:": datasets.arrow_dataset.Dataset | datasets.iterable_dataset.IterableDataset | dict[str, datasets.arrow_dataset.Dataset | datasets.iterable_dataset.IterableDataset] | None = None"},{name:"processing_class",val:": transformers.tokenization_utils_base.PreTrainedTokenizerBase | None = None"},{name:"compute_metrics",val:": collections.abc.Callable[[transformers.trainer_utils.EvalPrediction], dict] | None = None"},{name:"callbacks",val:": list[transformers.trainer_callback.TrainerCallback] | None = None"},{name:"optimizers",val:": tuple = (None, None)"},{name:"optimizer_cls_and_kwargs",val:": tuple[type[torch.optim.optimizer.Optimizer], dict[str, typing.Any]] | None = None"},{name:"preprocess_logits_for_metrics",val:": collections.abc.Callable[[torch.Tensor, torch.Tensor], torch.Tensor] | None = None"},{name:"peft_config",val:": PeftConfig | None = None"}],parametersDescription:[{anchor:"trl.RewardTrainer.model",description:`<strong>model</strong> (<code>str</code> or <a href="https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel" 
rel="nofollow">PreTrainedModel</a> or <a href="https://huggingface.co/docs/peft/main/en/package_reference/peft_model#peft.PeftModel" rel="nofollow">PeftModel</a>) — | |
| Model to be trained. Can be either:</p> | |
| <ul> | |
| <li>A string, being the <em>model id</em> of a pretrained model hosted inside a model repo on huggingface.co, or a | |
| path to a <em>directory</em> containing model weights saved using | |
| <a href="https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.save_pretrained" rel="nofollow">save_pretrained</a>, e.g., <code>'./my_model_directory/'</code>. The model is loaded | |
| using <code>AutoModelForSequenceClassification.from_pretrained</code> with the keyword arguments in | |
| <code>args.model_init_kwargs</code>.</li> | |
| <li>A sequence classification <a href="https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel" rel="nofollow">PreTrainedModel</a> object.</li> | |
| <li>A sequence classification <a href="https://huggingface.co/docs/peft/main/en/package_reference/peft_model#peft.PeftModel" rel="nofollow">PeftModel</a> object.</li> | |
| </ul>`,name:"model"},{anchor:"trl.RewardTrainer.args",description:`<strong>args</strong> (<a href="/docs/trl/pr_5607/en/reward_trainer#trl.RewardConfig">RewardConfig</a>, <em>optional</em>) — | |
| Configuration for this trainer. If <code>None</code>, a default configuration is used.`,name:"args"},{anchor:"trl.RewardTrainer.data_collator",description:`<strong>data_collator</strong> (<code>DataCollator</code>, <em>optional</em>) — | |
| Function to use to form a batch from a list of elements of the processed <code>train_dataset</code> or <code>eval_dataset</code>. | |
| Will default to <code>DataCollatorForPreference</code>.`,name:"data_collator"},{anchor:"trl.RewardTrainer.train_dataset",description:`<strong>train_dataset</strong> (<a href="https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset" rel="nofollow">Dataset</a> or <a href="https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.IterableDataset" rel="nofollow">IterableDataset</a>) — | |
| Dataset to use for training. This trainer supports <a href="dataset_formats#preference">preference</a> type (both implicit and | |
| explicit prompt). The format of the samples can be either:</p> | |
| <ul> | |
| <li><a href="dataset_formats#standard">Standard</a>: Each sample contains plain text.</li> | |
| <li><a href="dataset_formats#conversational">Conversational</a>: Each sample contains structured messages (e.g., role | |
| and content).</li> | |
| </ul> | |
| <p>The trainer also supports processed datasets (tokenized) as long as they contain <code>chosen_ids</code> and | |
| <code>rejected_ids</code> fields.`,name:"train_dataset"},{anchor:"trl.RewardTrainer.eval_dataset",description:`<strong>eval_dataset</strong> (<a href="https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset" rel="nofollow">Dataset</a>, <a href="https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.IterableDataset" rel="nofollow">IterableDataset</a> or <code>dict[str, Dataset | IterableDataset]</code>) — | |
| Dataset to use for evaluation. It must meet the same requirements as <code>train_dataset</code>.`,name:"eval_dataset"},{anchor:"trl.RewardTrainer.processing_class",description:`<strong>processing_class</strong> (<a href="https://huggingface.co/docs/transformers/main/en/internal/tokenization_utils#transformers.PreTrainedTokenizerBase" rel="nofollow">PreTrainedTokenizerBase</a>, <em>optional</em>) — | |
| Tokenizer used to process the data. If <code>None</code>, the tokenizer is loaded from the model’s name with | |
| <a href="https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoTokenizer.from_pretrained" rel="nofollow">from_pretrained</a>. A padding token, <code>processing_class.pad_token</code>, must be | |
| set. If the processing class has not set a padding token, <code>processing_class.eos_token</code> will be used as the | |
| default.`,name:"processing_class"},{anchor:"trl.RewardTrainer.compute_metrics",description:`<strong>compute_metrics</strong> (<code>Callable[[EvalPrediction], dict]</code>, <em>optional</em>) — | |
| The function that will be used to compute metrics at evaluation. Must take a | |
| <a href="https://huggingface.co/docs/transformers/main/en/internal/trainer_utils#transformers.EvalPrediction" rel="nofollow">EvalPrediction</a> and return a dictionary mapping strings to metric values. When passing | |
| <a href="/docs/trl/pr_5607/en/reward_trainer#trl.RewardConfig">RewardConfig</a> with <code>batch_eval_metrics</code> set to <code>True</code>, your <code>compute_metrics</code> function must take a | |
| boolean <code>compute_result</code> argument. This will be triggered after the last eval batch to signal that the | |
| function needs to calculate and return the global summary statistics rather than accumulating the | |
| batch-level statistics.`,name:"compute_metrics"},{anchor:"trl.RewardTrainer.callbacks",description:`<strong>callbacks</strong> (list of <a href="https://huggingface.co/docs/transformers/main/en/main_classes/callback#transformers.TrainerCallback" rel="nofollow">TrainerCallback</a>, <em>optional</em>) — | |
| List of callbacks to customize the training loop. Will add those to the list of default callbacks detailed | |
| <a href="https://huggingface.co/docs/transformers/main_classes/callback" rel="nofollow">here</a>.</p> | |
| <p>If you want to remove one of the default callbacks used, use the <a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.remove_callback" rel="nofollow">remove_callback</a> | |
| method.`,name:"callbacks"},{anchor:"trl.RewardTrainer.optimizers",description:`<strong>optimizers</strong> (<code>tuple[torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None]</code>, <em>optional</em>, defaults to <code>(None, None)</code>) — | |
| A tuple containing the optimizer and the scheduler to use. Will default to an instance of <code>AdamW</code> on your | |
| model and a scheduler given by <a href="https://huggingface.co/docs/transformers/main/en/main_classes/optimizer_schedules#transformers.get_linear_schedule_with_warmup" rel="nofollow">get_linear_schedule_with_warmup</a> controlled by <code>args</code>.`,name:"optimizers"},{anchor:"trl.RewardTrainer.optimizer_cls_and_kwargs",description:`<strong>optimizer_cls_and_kwargs</strong> (<code>tuple[Type[torch.optim.Optimizer], Dict[str, Any]]</code>, <em>optional</em>) — | |
| A tuple containing the optimizer class and keyword arguments to use. Overrides <code>optim</code> and <code>optim_args</code> in | |
| <code>args</code>. Incompatible with the <code>optimizers</code> argument.</p> | |
| <p>Unlike <code>optimizers</code>, this argument avoids the need to place model parameters on the correct devices before | |
| initializing the Trainer.`,name:"optimizer_cls_and_kwargs"},{anchor:"trl.RewardTrainer.preprocess_logits_for_metrics",description:`<strong>preprocess_logits_for_metrics</strong> (<code>Callable[[torch.Tensor, torch.Tensor], torch.Tensor]</code>, <em>optional</em>) — | |
| A function that preprocesses the logits right before caching them at each evaluation step. Must take two | |
| tensors, the logits and the labels, and return the logits once processed as desired. The modifications made | |
| by this function will be reflected in the predictions received by <code>compute_metrics</code>.</p> | |
| <p>Note that the labels (second parameter) will be <code>None</code> if the dataset does not have them.`,name:"preprocess_logits_for_metrics"},{anchor:"trl.RewardTrainer.peft_config",description:`<strong>peft_config</strong> (<a href="https://huggingface.co/docs/peft/main/en/package_reference/config#peft.PeftConfig" rel="nofollow">PeftConfig</a>, <em>optional</em>) — | |
| PEFT configuration used to wrap the model. If <code>None</code>, the model is not wrapped. Note that if the loaded | |
| model is a causal LM, it’s highly recommended to set <code>modules_to_save=["score"]</code> in the PEFT configuration | |
| to ensure that the reward head is properly trained.`,name:"peft_config"}],source:"https://github.com/huggingface/trl/blob/vr_5607/trl/trainer/reward_trainer.py#L229"}}),V=new Et({props:{anchor:"trl.RewardTrainer.example",$$slots:{default:[Vt]},$$scope:{ctx:ea}}}),Re=new Oe({props:{name:"train",anchor:"trl.RewardTrainer.train",parameters:[{name:"resume_from_checkpoint",val:": str | bool | None = None"},{name:"trial",val:": optuna.Trial | dict[str, Any] | None = None"},{name:"ignore_keys_for_eval",val:": list[str] | None = None"}],parametersDescription:[{anchor:"trl.RewardTrainer.train.resume_from_checkpoint",description:`<strong>resume_from_checkpoint</strong> (<code>str</code> or <code>bool</code>, <em>optional</em>) — | |
| If a <code>str</code>, local path to a saved checkpoint as saved by a previous instance of <code>Trainer</code>. If a | |
| <code>bool</code> and equals <code>True</code>, load the last checkpoint in <em>args.output_dir</em> as saved by a previous instance | |
| of <code>Trainer</code>. If present, training will resume from the model/optimizer/scheduler states loaded here.`,name:"resume_from_checkpoint"},{anchor:"trl.RewardTrainer.train.trial",description:`<strong>trial</strong> (<code>optuna.Trial</code> or <code>dict[str, Any]</code>, <em>optional</em>) — | |
| The trial run or the hyperparameter dictionary for hyperparameter search.`,name:"trial"},{anchor:"trl.RewardTrainer.train.ignore_keys_for_eval",description:`<strong>ignore_keys_for_eval</strong> (<code>list[str]</code>, <em>optional</em>) — | |
| A list of keys in the output of your model (if it is a dictionary) that should be ignored when | |
| gathering predictions for evaluation during the training.`,name:"ignore_keys_for_eval"}],source:"https://github.com/huggingface/trl/blob/vr_5607/transformers/trainer.py#L1323",returnDescription:`<script context="module">export const metadata = 'undefined';<\/script> | |
| <p>Object containing the global step count, training loss, and metrics.</p> | |
| `,returnType:`<script context="module">export const metadata = 'undefined';<\/script> | |
| <p><code>~trainer_utils.TrainOutput</code></p> | |
| `}}),ze=new Oe({props:{name:"save_model",anchor:"trl.RewardTrainer.save_model",parameters:[{name:"output_dir",val:": str | None = None"},{name:"_internal_call",val:": bool = False"}],source:"https://github.com/huggingface/trl/blob/vr_5607/transformers/trainer.py#L3746"}}),Ze=new Oe({props:{name:"push_to_hub",anchor:"trl.RewardTrainer.push_to_hub",parameters:[{name:"commit_message",val:": str | None = 'End of training'"},{name:"blocking",val:": bool = True"},{name:"token",val:": str | None = None"},{name:"revision",val:": str | None = None"},{name:"**kwargs",val:""}],parametersDescription:[{anchor:"trl.RewardTrainer.push_to_hub.commit_message",description:`<strong>commit_message</strong> (<code>str</code>, <em>optional</em>, defaults to <code>"End of training"</code>) — | |
| Message to commit while pushing.`,name:"commit_message"},{anchor:"trl.RewardTrainer.push_to_hub.blocking",description:`<strong>blocking</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>True</code>) — | |
| Whether the function should return only when the <code>git push</code> has finished.`,name:"blocking"},{anchor:"trl.RewardTrainer.push_to_hub.token",description:`<strong>token</strong> (<code>str</code>, <em>optional</em>, defaults to <code>None</code>) — | |
| Token with write permission to overwrite Trainer’s original args.`,name:"token"},{anchor:"trl.RewardTrainer.push_to_hub.revision",description:`<strong>revision</strong> (<code>str</code>, <em>optional</em>) — | |
| The git revision to commit from. Defaults to the head of the “main” branch.`,name:"revision"},{anchor:"trl.RewardTrainer.push_to_hub.kwargs",description:`<strong>kwargs</strong> (<code>dict[str, Any]</code>, <em>optional</em>) — | |
| Additional keyword arguments passed along to <code>~Trainer.create_model_card</code>.`,name:"kwargs"}],source:"https://github.com/huggingface/trl/blob/vr_5607/transformers/trainer.py#L3993",returnDescription:`<script context="module">export const metadata = 'undefined';<\/script> | |
| <p>The URL of the repository where the model was pushed if <code>blocking=True</code>, or a <code>Future</code> object tracking the | |
| progress of the commit if <code>blocking=False</code>.</p> | |
| `}}),Ge=new U({props:{title:"RewardConfig",local:"trl.RewardConfig",headingTag:"h2"}}),Be=new Oe({props:{name:"class trl.RewardConfig",anchor:"trl.RewardConfig",parameters:[{name:"output_dir",val:": str | None = None"},{name:"per_device_train_batch_size",val:": int = 8"},{name:"num_train_epochs",val:": float = 3.0"},{name:"max_steps",val:": int = -1"},{name:"learning_rate",val:": float = 0.0001"},{name:"lr_scheduler_type",val:": transformers.trainer_utils.SchedulerType | str = 'linear'"},{name:"lr_scheduler_kwargs",val:": dict | str | None = None"},{name:"warmup_steps",val:": float = 0"},{name:"optim",val:": transformers.training_args.OptimizerNames | str = 'adamw_torch_fused'"},{name:"optim_args",val:": str | None = None"},{name:"weight_decay",val:": float = 0.0"},{name:"adam_beta1",val:": float = 0.9"},{name:"adam_beta2",val:": float = 0.999"},{name:"adam_epsilon",val:": float = 1e-08"},{name:"optim_target_modules",val:": None | str | list[str] = None"},{name:"gradient_accumulation_steps",val:": int = 1"},{name:"average_tokens_across_devices",val:": bool = True"},{name:"max_grad_norm",val:": float = 1.0"},{name:"label_smoothing_factor",val:": float = 0.0"},{name:"bf16",val:": bool | None = None"},{name:"fp16",val:": bool = False"},{name:"bf16_full_eval",val:": bool = False"},{name:"fp16_full_eval",val:": bool = False"},{name:"tf32",val:": bool | None = None"},{name:"gradient_checkpointing",val:": bool = True"},{name:"gradient_checkpointing_kwargs",val:": dict[str, typing.Any] | str | None = None"},{name:"torch_compile",val:": bool = False"},{name:"torch_compile_backend",val:": str | None = None"},{name:"torch_compile_mode",val:": str | None = None"},{name:"use_liger_kernel",val:": bool = False"},{name:"liger_kernel_config",val:": dict[str, bool] | None = None"},{name:"use_cache",val:": bool = False"},{name:"neftune_noise_alpha",val:": float | None = None"},{name:"torch_empty_cache_steps",val:": int | None = None"},{name:"auto_find_batch_size",val:": bool = 
False"},{name:"logging_strategy",val:": transformers.trainer_utils.IntervalStrategy | str = 'steps'"},{name:"logging_steps",val:": float = 10"},{name:"logging_first_step",val:": bool = False"},{name:"log_on_each_node",val:": bool = True"},{name:"logging_nan_inf_filter",val:": bool = True"},{name:"include_num_input_tokens_seen",val:": str | bool = 'no'"},{name:"log_level",val:": str = 'passive'"},{name:"log_level_replica",val:": str = 'warning'"},{name:"disable_tqdm",val:": bool | None = None"},{name:"report_to",val:": None | str | list[str] = 'none'"},{name:"run_name",val:": str | None = None"},{name:"project",val:": str = 'huggingface'"},{name:"trackio_space_id",val:": str | None = 'trackio'"},{name:"eval_strategy",val:": transformers.trainer_utils.IntervalStrategy | str = 'no'"},{name:"eval_steps",val:": float | None = None"},{name:"eval_delay",val:": float = 0"},{name:"per_device_eval_batch_size",val:": int = 8"},{name:"prediction_loss_only",val:": bool = False"},{name:"eval_on_start",val:": bool = False"},{name:"eval_do_concat_batches",val:": bool = True"},{name:"eval_use_gather_object",val:": bool = False"},{name:"eval_accumulation_steps",val:": int | None = None"},{name:"include_for_metrics",val:": list = <factory>"},{name:"batch_eval_metrics",val:": bool = False"},{name:"save_only_model",val:": bool = False"},{name:"save_strategy",val:": transformers.trainer_utils.SaveStrategy | str = 'steps'"},{name:"save_steps",val:": float = 500"},{name:"save_on_each_node",val:": bool = False"},{name:"save_total_limit",val:": int | None = None"},{name:"enable_jit_checkpoint",val:": bool = False"},{name:"push_to_hub",val:": bool = False"},{name:"hub_token",val:": str | None = None"},{name:"hub_private_repo",val:": bool | None = None"},{name:"hub_model_id",val:": str | None = None"},{name:"hub_strategy",val:": transformers.trainer_utils.HubStrategy | str = 'every_save'"},{name:"hub_always_push",val:": bool = False"},{name:"hub_revision",val:": str | None = 
None"},{name:"load_best_model_at_end",val:": bool = False"},{name:"metric_for_best_model",val:": str | None = None"},{name:"greater_is_better",val:": bool | None = None"},{name:"ignore_data_skip",val:": bool = False"},{name:"restore_callback_states_from_checkpoint",val:": bool = False"},{name:"full_determinism",val:": bool = False"},{name:"seed",val:": int = 42"},{name:"data_seed",val:": int | None = None"},{name:"use_cpu",val:": bool = False"},{name:"accelerator_config",val:": dict | str | None = None"},{name:"parallelism_config",val:": accelerate.parallelism_config.ParallelismConfig | None = None"},{name:"dataloader_drop_last",val:": bool = False"},{name:"dataloader_num_workers",val:": int = 0"},{name:"dataloader_pin_memory",val:": bool = True"},{name:"dataloader_persistent_workers",val:": bool = False"},{name:"dataloader_prefetch_factor",val:": int | None = None"},{name:"remove_unused_columns",val:": bool = True"},{name:"label_names",val:": list[str] | None = None"},{name:"train_sampling_strategy",val:": str = 'random'"},{name:"length_column_name",val:": str = 'length'"},{name:"ddp_find_unused_parameters",val:": bool | None = None"},{name:"ddp_bucket_cap_mb",val:": int | None = None"},{name:"ddp_broadcast_buffers",val:": bool | None = None"},{name:"ddp_backend",val:": str | None = None"},{name:"ddp_timeout",val:": int = 1800"},{name:"fsdp",val:": list[transformers.trainer_utils.FSDPOption] | str | None = None"},{name:"fsdp_config",val:": dict[str, typing.Any] | str | None = None"},{name:"deepspeed",val:": dict | str | None = None"},{name:"debug",val:": str | list[transformers.debug_utils.DebugOption] = ''"},{name:"skip_memory_metrics",val:": bool = True"},{name:"do_train",val:": bool = False"},{name:"do_eval",val:": bool = False"},{name:"do_predict",val:": bool = False"},{name:"resume_from_checkpoint",val:": str | None = None"},{name:"warmup_ratio",val:": float | None = None"},{name:"logging_dir",val:": str | None = None"},{name:"local_rank",val:": int = 
-1"},{name:"model_init_kwargs",val:": dict[str, typing.Any] | str | None = None"},{name:"chat_template_path",val:": str | None = None"},{name:"disable_dropout",val:": bool = True"},{name:"dataset_num_proc",val:": int | None = None"},{name:"eos_token",val:": str | None = None"},{name:"max_length",val:": int | None = 1024"},{name:"pad_to_multiple_of",val:": int | None = None"},{name:"center_rewards_coefficient",val:": float | None = None"},{name:"activation_offloading",val:": bool = False"},{name:"pad_token",val:": str | None = None"}],source:"https://github.com/huggingface/trl/blob/vr_5607/trl/trainer/reward_config.py#L23",parameterGroups:[{title:"Parameters that control the model",parametersDescription:[{anchor:"trl.RewardConfig.model_init_kwargs",description:`<strong>model_init_kwargs</strong> (<code>dict[str, Any]</code>, <em>optional</em>) — | |
| Keyword arguments for <a href="https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoModelForCausalLM.from_pretrained" rel="nofollow">from_pretrained</a>, used when the <code>model</code> | |
| argument of the <a href="/docs/trl/pr_5607/en/reward_trainer#trl.RewardTrainer">RewardTrainer</a> is provided as a string. If you’re training a MoE architecture and want | |
| to include the load balancing/auxiliary loss as a part of the final loss, remember to set | |
| <code>output_router_logits=True</code> in this dictionary.`,name:"model_init_kwargs"},{anchor:"trl.RewardConfig.chat_template_path",description:`<strong>chat_template_path</strong> (<code>str</code>, <em>optional</em>) — | |
| If specified, sets the model’s chat template. This can either be the path to a tokenizer (local directory | |
| or Hugging Face Hub model) or a direct path to a Jinja template file. When using a Jinja file, you must | |
| ensure that any special tokens referenced in the template are added to the tokenizer and that the model’s | |
| embedding layer is resized accordingly.`,name:"chat_template_path"},{anchor:"trl.RewardConfig.disable_dropout",description:`<strong>disable_dropout</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>True</code>) — | |
| Whether to disable dropout in the model.`,name:"disable_dropout"}]},{title:"Parameters that control the data preprocessing",parametersDescription:[{anchor:"trl.RewardConfig.dataset_num_proc",description:`<strong>dataset_num_proc</strong> (<code>int</code>, <em>optional</em>) — | |
| Number of processes to use for processing the dataset.`,name:"dataset_num_proc"},{anchor:"trl.RewardConfig.eos_token",description:`<strong>eos_token</strong> (<code>str</code>, <em>optional</em>) — | |
| Token used to indicate the end of a turn or sequence. If <code>None</code>, it defaults to | |
| <code>processing_class.eos_token</code>.`,name:"eos_token"},{anchor:"trl.RewardConfig.max_length",description:`<strong>max_length</strong> (<code>int</code> or <code>None</code>, <em>optional</em>, defaults to <code>1024</code>) — | |
| Maximum length of the tokenized sequence. Samples are filtered out if either chosen or rejected sequence | |
| exceeds this value. If <code>None</code>, no filtering is applied.`,name:"max_length"},{anchor:"trl.RewardConfig.pad_to_multiple_of",description:`<strong>pad_to_multiple_of</strong> (<code>int</code>, <em>optional</em>) — | |
| If set, the sequences will be padded to a multiple of this value.`,name:"pad_to_multiple_of"}]},{title:"Parameters that control the training",parametersDescription:[{anchor:"trl.RewardConfig.center_rewards_coefficient",description:`<strong>center_rewards_coefficient</strong> (<code>float</code>, <em>optional</em>) — | |
| Coefficient to incentivize the reward model to output mean-zero rewards (proposed by | |
| <a href="https://huggingface.co/papers/2312.09244" rel="nofollow">https://huggingface.co/papers/2312.09244</a>, Eq. 2). Recommended value: <code>0.01</code>.`,name:"center_rewards_coefficient"},{anchor:"trl.RewardConfig.activation_offloading",description:`<strong>activation_offloading</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>False</code>) — | |
| Whether to offload the activations to the CPU.`,name:"activation_offloading"}]},{title:"Deprecated parameters",parametersDescription:[{anchor:"trl.RewardConfig.pad_token",description:`<strong>pad_token</strong> —</p> | |
| <deprecated version="1.1.0"> | |
| <p>Parameter <code>pad_token</code> is deprecated and will be removed in version v2.0.0. Set <code>tokenizer.pad_token</code> | |
| directly and pass it as <code>processing_class</code> to the trainer instead.</p> | |
| </deprecated>`,name:"pad_token"}]}]}}),Qe=new Wt({props:{source:"https://github.com/huggingface/trl/blob/main/docs/source/reward_trainer.md"}}),{c(){M=o("meta"),Y=n(),z=o("p"),k=n(),m(C.$$.fragment),w=n(),m($.$$.fragment),aa=n(),H=o("p"),H.innerHTML=Es,sa=n(),m(L.$$.fragment),ta=n(),A=o("p"),A.textContent=Vs,na=n(),D=o("p"),D.innerHTML=Xs,la=n(),m(P.$$.fragment),ra=n(),K=o("p"),K.innerHTML=Ss,oa=n(),m(O.$$.fragment),ia=n(),x=o("iframe"),pa=n(),m(ee.$$.fragment),ma=n(),ae=o("p"),ae.innerHTML=Hs,ca=n(),m(se.$$.fragment),da=n(),te=o("p"),te.innerHTML=Ls,ha=n(),m(ne.$$.fragment),ga=n(),m(le.$$.fragment),ua=n(),m(re.$$.fragment),fa=n(),oe=o("p"),oe.textContent=As,ya=n(),ie=o("p"),ie.innerHTML=Ds,wa=n(),m(pe.$$.fragment),Ta=n(),me=o("p"),me.innerHTML=Ps,Ma=n(),m(ce.$$.fragment),_a=n(),f=o("p"),Js=v("Let "),ba=new N(!1),Ja=v(" be the input sequence (prompt) and "),va=new N(!1),ja=v(" and "),Ua=new N(!1),Ia=v(" be the chosen and rejected sequences respectively. Under the Bradley-Terry model ("),W=o("a"),W.textContent=Ks,vs=v("), the probability that "),ka=new N(!1),Ca=v(" is preferred over "),xa=new N(!1),Na=v(" given a reward function "),qa=new N(!1),$a=v(" is "),Ra=new N(!1),za=v(", where "),Za=new N(!1),Ga=v(" is the sigmoid function."),Ba=n(),I=o("p"),js=v("The reward model "),Fa=new N(!1),Qa=v(" is trained to assign higher scores to preferred responses "),Wa=new N(!1),Ea=v(" over non-preferred ones "),Va=new N(!1),Xa=v(`. The loss is then defined as the negative log-likelihood of the observed preferences: | |
| `),Sa=new N(!1),Ya=n(),E=o("blockquote"),E.innerHTML=Os,Ha=n(),m(de.$$.fragment),La=n(),he=o("p"),he.textContent=et,Aa=n(),ge=o("ul"),ge.innerHTML=at,Da=n(),m(ue.$$.fragment),Pa=n(),m(fe.$$.fragment),Ka=n(),ye=o("p"),ye.innerHTML=st,Oa=n(),m(we.$$.fragment),es=n(),Te=o("p"),Te.innerHTML=tt,as=n(),m(Me.$$.fragment),ss=n(),_e=o("p"),_e.innerHTML=nt,ts=n(),m(be.$$.fragment),ns=n(),Je=o("p"),Je.textContent=lt,ls=n(),m(ve.$$.fragment),rs=n(),je=o("p"),je.innerHTML=rt,os=n(),m(Ue.$$.fragment),is=n(),F=o("blockquote"),We=o("p"),We.textContent=ot,Us=n(),m(Ie.$$.fragment),ps=n(),m(ke.$$.fragment),ms=n(),Ce=o("p"),Ce.innerHTML=it,cs=n(),xe=o("ul"),xe.innerHTML=pt,ds=n(),Ne=o("p"),Ne.innerHTML=mt,hs=n(),m(qe.$$.fragment),gs=n(),T=o("div"),m($e.$$.fragment),Is=n(),Ee=o("p"),Ee.textContent=ct,ks=n(),Ve=o("p"),Ve.innerHTML=dt,Cs=n(),m(V.$$.fragment),xs=n(),X=o("div"),m(Re.$$.fragment),Ns=n(),Xe=o("p"),Xe.textContent=ht,qs=n(),Z=o("div"),m(ze.$$.fragment),$s=n(),Se=o("p"),Se.innerHTML=gt,Rs=n(),Ye=o("p"),Ye.textContent=ut,zs=n(),S=o("div"),m(Ze.$$.fragment),Zs=n(),He=o("p"),He.innerHTML=ft,us=n(),m(Ge.$$.fragment),fs=n(),_=o("div"),m(Be.$$.fragment),Gs=n(),Le=o("p"),Le.innerHTML=yt,Bs=n(),Ae=o("p"),Ae.innerHTML=wt,Fs=n(),De=o("p"),De.innerHTML=Tt,Qs=n(),Fe=o("blockquote"),Fe.innerHTML=Mt,ys=n(),m(Qe.$$.fragment),ws=n(),Ke=o("p"),this.h()},l(e){const 
a=Ft("svelte-u9bgzb",document.head);M=i(a,"META",{name:!0,content:!0}),a.forEach(s),Y=l(e),z=i(e,"P",{}),R(z).forEach(s),k=l(e),c(C.$$.fragment,e),w=l(e),c($.$$.fragment,e),aa=l(e),H=i(e,"P",{"data-svelte-h":!0}),p(H)!=="svelte-1rjidu2"&&(H.innerHTML=Es),sa=l(e),c(L.$$.fragment,e),ta=l(e),A=i(e,"P",{"data-svelte-h":!0}),p(A)!=="svelte-1ti5dgc"&&(A.textContent=Vs),na=l(e),D=i(e,"P",{"data-svelte-h":!0}),p(D)!=="svelte-a1ehbo"&&(D.innerHTML=Xs),la=l(e),c(P.$$.fragment,e),ra=l(e),K=i(e,"P",{"data-svelte-h":!0}),p(K)!=="svelte-1rk39qy"&&(K.innerHTML=Ss),oa=l(e),c(O.$$.fragment,e),ia=l(e),x=i(e,"IFRAME",{src:!0,style:!0,height:!0,frameborder:!0}),R(x).forEach(s),pa=l(e),c(ee.$$.fragment,e),ma=l(e),ae=i(e,"P",{"data-svelte-h":!0}),p(ae)!=="svelte-m5hlgx"&&(ae.innerHTML=Hs),ca=l(e),c(se.$$.fragment,e),da=l(e),te=i(e,"P",{"data-svelte-h":!0}),p(te)!=="svelte-wa19cs"&&(te.innerHTML=Ls),ha=l(e),c(ne.$$.fragment,e),ga=l(e),c(le.$$.fragment,e),ua=l(e),c(re.$$.fragment,e),fa=l(e),oe=i(e,"P",{"data-svelte-h":!0}),p(oe)!=="svelte-cp5aph"&&(oe.textContent=As),ya=l(e),ie=i(e,"P",{"data-svelte-h":!0}),p(ie)!=="svelte-r5fokq"&&(ie.innerHTML=Ds),wa=l(e),c(pe.$$.fragment,e),Ta=l(e),me=i(e,"P",{"data-svelte-h":!0}),p(me)!=="svelte-87ovcn"&&(me.innerHTML=Ps),Ma=l(e),c(ce.$$.fragment,e),_a=l(e),f=i(e,"P",{});var y=R(f);Js=j(y,"Let "),ba=q(y,!1),Ja=j(y," be the input sequence (prompt) and "),va=q(y,!1),ja=j(y," and "),Ua=q(y,!1),Ia=j(y," be the chosen and rejected sequences respectively. 
Under the Bradley-Terry model ("),W=i(y,"A",{href:!0,rel:!0,"data-svelte-h":!0}),p(W)!=="svelte-18polgg"&&(W.textContent=Ks),vs=j(y,"), the probability that "),ka=q(y,!1),Ca=j(y," is preferred over "),xa=q(y,!1),Na=j(y," given a reward function "),qa=q(y,!1),$a=j(y," is "),Ra=q(y,!1),za=j(y,", where "),Za=q(y,!1),Ga=j(y," is the sigmoid function."),y.forEach(s),Ba=l(e),I=i(e,"P",{});var Q=R(I);js=j(Q,"The reward model "),Fa=q(Q,!1),Qa=j(Q," is trained to assign higher scores to preferred responses "),Wa=q(Q,!1),Ea=j(Q," over non-preferred ones "),Va=q(Q,!1),Xa=j(Q,`. The loss is then defined as the negative log-likelihood of the observed preferences: | |
| `),Sa=q(Q,!1),Q.forEach(s),Ya=l(e),E=i(e,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),p(E)!=="svelte-1nlt6oo"&&(E.innerHTML=Os),Ha=l(e),c(de.$$.fragment,e),La=l(e),he=i(e,"P",{"data-svelte-h":!0}),p(he)!=="svelte-132s7j9"&&(he.textContent=et),Aa=l(e),ge=i(e,"UL",{"data-svelte-h":!0}),p(ge)!=="svelte-ecm0rq"&&(ge.innerHTML=at),Da=l(e),c(ue.$$.fragment,e),Pa=l(e),c(fe.$$.fragment,e),Ka=l(e),ye=i(e,"P",{"data-svelte-h":!0}),p(ye)!=="svelte-1ieqrp4"&&(ye.innerHTML=st),Oa=l(e),c(we.$$.fragment,e),es=l(e),Te=i(e,"P",{"data-svelte-h":!0}),p(Te)!=="svelte-1riioh2"&&(Te.innerHTML=tt),as=l(e),c(Me.$$.fragment,e),ss=l(e),_e=i(e,"P",{"data-svelte-h":!0}),p(_e)!=="svelte-zb69hw"&&(_e.innerHTML=nt),ts=l(e),c(be.$$.fragment,e),ns=l(e),Je=i(e,"P",{"data-svelte-h":!0}),p(Je)!=="svelte-t2zuq8"&&(Je.textContent=lt),ls=l(e),c(ve.$$.fragment,e),rs=l(e),je=i(e,"P",{"data-svelte-h":!0}),p(je)!=="svelte-11e9nof"&&(je.innerHTML=rt),os=l(e),c(Ue.$$.fragment,e),is=l(e),F=i(e,"BLOCKQUOTE",{class:!0});var Ms=R(F);We=i(Ms,"P",{"data-svelte-h":!0}),p(We)!=="svelte-dc5ccy"&&(We.textContent=ot),Us=l(Ms),c(Ie.$$.fragment,Ms),Ms.forEach(s),ps=l(e),c(ke.$$.fragment,e),ms=l(e),Ce=i(e,"P",{"data-svelte-h":!0}),p(Ce)!=="svelte-iquuc2"&&(Ce.innerHTML=it),cs=l(e),xe=i(e,"UL",{"data-svelte-h":!0}),p(xe)!=="svelte-1vlmw2d"&&(xe.innerHTML=pt),ds=l(e),Ne=i(e,"P",{"data-svelte-h":!0}),p(Ne)!=="svelte-vl4ede"&&(Ne.innerHTML=mt),hs=l(e),c(qe.$$.fragment,e),gs=l(e),T=i(e,"DIV",{class:!0});var J=R(T);c($e.$$.fragment,J),Is=l(J),Ee=i(J,"P",{"data-svelte-h":!0}),p(Ee)!=="svelte-1prnqj7"&&(Ee.textContent=ct),ks=l(J),Ve=i(J,"P",{"data-svelte-h":!0}),p(Ve)!=="svelte-10vjtjm"&&(Ve.innerHTML=dt),Cs=l(J),c(V.$$.fragment,J),xs=l(J),X=i(J,"DIV",{class:!0});var _s=R(X);c(Re.$$.fragment,_s),Ns=l(_s),Xe=i(_s,"P",{"data-svelte-h":!0}),p(Xe)!=="svelte-1cilnet"&&(Xe.textContent=ht),_s.forEach(s),qs=l(J),Z=i(J,"DIV",{class:!0});var 
Pe=R(Z);c(ze.$$.fragment,Pe),$s=l(Pe),Se=i(Pe,"P",{"data-svelte-h":!0}),p(Se)!=="svelte-r8h4ov"&&(Se.innerHTML=gt),Rs=l(Pe),Ye=i(Pe,"P",{"data-svelte-h":!0}),p(Ye)!=="svelte-1e6bius"&&(Ye.textContent=ut),Pe.forEach(s),zs=l(J),S=i(J,"DIV",{class:!0});var bs=R(S);c(Ze.$$.fragment,bs),Zs=l(bs),He=i(bs,"P",{"data-svelte-h":!0}),p(He)!=="svelte-8tudwd"&&(He.innerHTML=ft),bs.forEach(s),J.forEach(s),us=l(e),c(Ge.$$.fragment,e),fs=l(e),_=i(e,"DIV",{class:!0});var G=R(_);c(Be.$$.fragment,G),Gs=l(G),Le=i(G,"P",{"data-svelte-h":!0}),p(Le)!=="svelte-o9kf85"&&(Le.innerHTML=yt),Bs=l(G),Ae=i(G,"P",{"data-svelte-h":!0}),p(Ae)!=="svelte-3thf4f"&&(Ae.innerHTML=wt),Fs=l(G),De=i(G,"P",{"data-svelte-h":!0}),p(De)!=="svelte-ekuf1t"&&(De.innerHTML=Tt),Qs=l(G),Fe=i(G,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),p(Fe)!=="svelte-mu4yq0"&&(Fe.innerHTML=Mt),G.forEach(s),ys=l(e),c(Qe.$$.fragment,e),ws=l(e),Ke=i(e,"P",{}),R(Ke).forEach(s),this.h()},h(){b(M,"name","hf:doc:metadata"),b(M,"content",St),Rt(x.src,Ys="https://trl-lib-trackio.hf.space/?project=trl-documentation&metrics=train*&sidebar=hidden&runs=reward_qwen3-0.6B_ultrafeedback2")||b(x,"src",Ys),Ws(x,"width","100%"),Ws(x,"min-width","300px"),Ws(x,"max-width","800px"),b(x,"height","830"),b(x,"frameborder","0"),ba.a=Ja,va.a=ja,Ua.a=Ia,b(W,"href","https://www.jstor.org/stable/2334029"),b(W,"rel","nofollow"),ka.a=Ca,xa.a=Na,qa.a=$a,Ra.a=za,Za.a=Ga,Fa.a=Qa,Wa.a=Ea,Va.a=Xa,Sa.a=null,b(E,"class","tip"),b(F,"class","tip"),b(X,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),b(Z,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),b(S,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),b(T,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),b(Fe,"class","note"),b(_,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 
mt-8")},m(e,a){r(document.head,M),t(e,Y,a),t(e,z,a),t(e,k,a),d(C,e,a),t(e,w,a),d($,e,a),t(e,aa,a),t(e,H,a),t(e,sa,a),d(L,e,a),t(e,ta,a),t(e,A,a),t(e,na,a),t(e,D,a),t(e,la,a),d(P,e,a),t(e,ra,a),t(e,K,a),t(e,oa,a),d(O,e,a),t(e,ia,a),t(e,x,a),t(e,pa,a),d(ee,e,a),t(e,ma,a),t(e,ae,a),t(e,ca,a),d(se,e,a),t(e,da,a),t(e,te,a),t(e,ha,a),d(ne,e,a),t(e,ga,a),d(le,e,a),t(e,ua,a),d(re,e,a),t(e,fa,a),t(e,oe,a),t(e,ya,a),t(e,ie,a),t(e,wa,a),d(pe,e,a),t(e,Ta,a),t(e,me,a),t(e,Ma,a),d(ce,e,a),t(e,_a,a),t(e,f,a),r(f,Js),ba.m(_t,f),r(f,Ja),va.m(bt,f),r(f,ja),Ua.m(Jt,f),r(f,Ia),r(f,W),r(f,vs),ka.m(vt,f),r(f,Ca),xa.m(jt,f),r(f,Na),qa.m(Ut,f),r(f,$a),Ra.m(It,f),r(f,za),Za.m(kt,f),r(f,Ga),t(e,Ba,a),t(e,I,a),r(I,js),Fa.m(Ct,I),r(I,Qa),Wa.m(xt,I),r(I,Ea),Va.m(Nt,I),r(I,Xa),Sa.m(qt,I),t(e,Ya,a),t(e,E,a),t(e,Ha,a),d(de,e,a),t(e,La,a),t(e,he,a),t(e,Aa,a),t(e,ge,a),t(e,Da,a),d(ue,e,a),t(e,Pa,a),d(fe,e,a),t(e,Ka,a),t(e,ye,a),t(e,Oa,a),d(we,e,a),t(e,es,a),t(e,Te,a),t(e,as,a),d(Me,e,a),t(e,ss,a),t(e,_e,a),t(e,ts,a),d(be,e,a),t(e,ns,a),t(e,Je,a),t(e,ls,a),d(ve,e,a),t(e,rs,a),t(e,je,a),t(e,os,a),d(Ue,e,a),t(e,is,a),t(e,F,a),r(F,We),r(F,Us),d(Ie,F,null),t(e,ps,a),d(ke,e,a),t(e,ms,a),t(e,Ce,a),t(e,cs,a),t(e,xe,a),t(e,ds,a),t(e,Ne,a),t(e,hs,a),d(qe,e,a),t(e,gs,a),t(e,T,a),d($e,T,null),r(T,Is),r(T,Ee),r(T,ks),r(T,Ve),r(T,Cs),d(V,T,null),r(T,xs),r(T,X),d(Re,X,null),r(X,Ns),r(X,Xe),r(T,qs),r(T,Z),d(ze,Z,null),r(Z,$s),r(Z,Se),r(Z,Rs),r(Z,Ye),r(T,zs),r(T,S),d(Ze,S,null),r(S,Zs),r(S,He),t(e,us,a),d(Ge,e,a),t(e,fs,a),t(e,_,a),d(Be,_,null),r(_,Gs),r(_,Le),r(_,Bs),r(_,Ae),r(_,Fs),r(_,De),r(_,Qs),r(_,Fe),t(e,ys,a),d(Qe,e,a),t(e,ws,a),t(e,Ke,a),Ts=!0},p(e,[a]){const 
y={};a&2&&(y.$$scope={dirty:a,ctx:e}),V.$set(y)},i(e){Ts||(h(C.$$.fragment,e),h($.$$.fragment,e),h(L.$$.fragment,e),h(P.$$.fragment,e),h(O.$$.fragment,e),h(ee.$$.fragment,e),h(se.$$.fragment,e),h(ne.$$.fragment,e),h(le.$$.fragment,e),h(re.$$.fragment,e),h(pe.$$.fragment,e),h(ce.$$.fragment,e),h(de.$$.fragment,e),h(ue.$$.fragment,e),h(fe.$$.fragment,e),h(we.$$.fragment,e),h(Me.$$.fragment,e),h(be.$$.fragment,e),h(ve.$$.fragment,e),h(Ue.$$.fragment,e),h(Ie.$$.fragment,e),h(ke.$$.fragment,e),h(qe.$$.fragment,e),h($e.$$.fragment,e),h(V.$$.fragment,e),h(Re.$$.fragment,e),h(ze.$$.fragment,e),h(Ze.$$.fragment,e),h(Ge.$$.fragment,e),h(Be.$$.fragment,e),h(Qe.$$.fragment,e),Ts=!0)},o(e){g(C.$$.fragment,e),g($.$$.fragment,e),g(L.$$.fragment,e),g(P.$$.fragment,e),g(O.$$.fragment,e),g(ee.$$.fragment,e),g(se.$$.fragment,e),g(ne.$$.fragment,e),g(le.$$.fragment,e),g(re.$$.fragment,e),g(pe.$$.fragment,e),g(ce.$$.fragment,e),g(de.$$.fragment,e),g(ue.$$.fragment,e),g(fe.$$.fragment,e),g(we.$$.fragment,e),g(Me.$$.fragment,e),g(be.$$.fragment,e),g(ve.$$.fragment,e),g(Ue.$$.fragment,e),g(Ie.$$.fragment,e),g(ke.$$.fragment,e),g(qe.$$.fragment,e),g($e.$$.fragment,e),g(V.$$.fragment,e),g(Re.$$.fragment,e),g(ze.$$.fragment,e),g(Ze.$$.fragment,e),g(Ge.$$.fragment,e),g(Be.$$.fragment,e),g(Qe.$$.fragment,e),Ts=!1},d(e){e&&(s(Y),s(z),s(k),s(w),s(aa),s(H),s(sa),s(ta),s(A),s(na),s(D),s(la),s(ra),s(K),s(oa),s(ia),s(x),s(pa),s(ma),s(ae),s(ca),s(da),s(te),s(ha),s(ga),s(ua),s(fa),s(oe),s(ya),s(ie),s(wa),s(Ta),s(me),s(Ma),s(_a),s(f),s(Ba),s(I),s(Ya),s(E),s(Ha),s(La),s(he),s(Aa),s(ge),s(Da),s(Pa),s(Ka),s(ye),s(Oa),s(es),s(Te),s(as),s(ss),s(_e),s(ts),s(ns),s(Je),s(ls),s(rs),s(je),s(os),s(is),s(F),s(ps),s(ms),s(Ce),s(cs),s(xe),s(ds),s(Ne),s(hs),s(gs),s(T),s(us),s(fs),s(_),s(ys),s(ws),s(Ke)),s(M),u(C,e),u($,e),u(L,e),u(P,e),u(O,e),u(ee,e),u(se,e),u(ne,e),u(le,e),u(re,e),u(pe,e),u(ce,e),u(de,e),u(ue,e),u(fe,e),u(we,e),u(Me,e),u(be,e),u(ve,e),u(Ue,e),u(Ie),u(ke,e),u(qe,e),u($e),u(V),u(Re),u(ze),u(Ze),u(Ge,e)
,u(Be),u(Qe,e)}}}
// ^ The fragment above closes function Xt, which begins earlier in this file; it is left untouched.

// St: JSON-encoded page metadata string (page title "Reward Modeling", its `local` anchor, and the
// nested section list with a `depth` per entry). Within Xt it is assigned as the "content"
// attribute of the <meta name="hf:doc:metadata"> element.
const St='{"title":"Reward Modeling","local":"reward-modeling","sections":[{"title":"Overview","local":"overview","sections":[],"depth":2},{"title":"Quick start","local":"quick-start","sections":[],"depth":2},{"title":"Expected dataset type and format","local":"expected-dataset-type-and-format","sections":[],"depth":2},{"title":"Looking deeper into the training method","local":"looking-deeper-into-the-training-method","sections":[{"title":"Preprocessing and tokenization","local":"preprocessing-and-tokenization","sections":[],"depth":3},{"title":"Computing the loss","local":"computing-the-loss","sections":[],"depth":3}],"depth":2},{"title":"Logged metrics","local":"logged-metrics","sections":[],"depth":2},{"title":"Customization","local":"customization","sections":[{"title":"Model initialization","local":"model-initialization","sections":[],"depth":3},{"title":"Train adapters with PEFT","local":"train-adapters-with-peft","sections":[],"depth":3}],"depth":2},{"title":"Tool Calling with Reward Modeling","local":"tool-calling-with-reward-modeling","sections":[],"depth":2},{"title":"RewardTrainer","local":"trl.RewardTrainer","sections":[],"depth":2},{"title":"RewardConfig","local":"trl.RewardConfig","sections":[],"depth":2}],"depth":1}';
// Yt: instance-setup function handed to Bt by the Ot constructor below.
// It passes a callback to zt (imported as `o` from the scheduler chunk; presumably Svelte's
// onMount, TODO confirm). The callback reads the "fw" query parameter from
// window.location.search and discards the value. The parameter `ea` is never used.
// Returns an empty array.
function Yt(ea){return zt(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}
// Ot: the page component class, exported under the name `component`.
// Extends Gt (imported as `S` from the index chunk; presumably SvelteComponent, TODO confirm).
// The constructor calls super() and then Bt(this, M, Yt, Xt, $t, {}), i.e. it wires the
// constructor argument M together with the instance function Yt, the fragment factory Xt and
// the imported helper $t; the final argument is an empty object literal.
// NOTE(review): the trailing `| |` after the export statement looks like table residue from the
// page this file was captured from, not JavaScript; confirm against the real build artifact.
class Ot extends Gt{constructor(M){super(),Bt(this,M,Yt,Xt,$t,{})}}export{Ot as component}; | |
Xet Storage Details
- Size:
- 89.7 kB
- Xet hash:
- 6d41cbc10ed19ab9b4c5aa117c7f418c8f54072c67e7cb38ceba679c2f5e6acb
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.