Buckets:

HuggingFaceDocBuilder's picture
download
raw
89.7 kB
import{s as $t,a as Rt,o as zt,n as Zt}from"../chunks/scheduler.7b731bd4.js";import{S as Gt,i as Bt,e as o,s as n,c as m,q as v,H as N,h as Ft,a as i,d as s,b as l,f as R,g as c,j as p,r as j,u as q,k as b,v as Ws,l as r,m as t,n as d,t as h,o as g,p as u}from"../chunks/index.cc268345.js";import{C as Qt,H as U,E as Wt}from"../chunks/MermaidChart.svelte_svelte_type_style_lang.f0d99f98.js";import{D as Oe}from"../chunks/Docstring.03f7b462.js";import{C as B}from"../chunks/CodeBlock.169a125f.js";import{E as Et}from"../chunks/ExampleCodeBlock.415f9452.js";function Vt(ea){let M,Y="Example:",z,k,C;return k=new B({props:{code:"ZnJvbSUyMHRybCUyMGltcG9ydCUyMFJld2FyZFRyYWluZXIlMEFmcm9tJTIwZGF0YXNldHMlMjBpbXBvcnQlMjBsb2FkX2RhdGFzZXQlMEElMEFkYXRhc2V0JTIwJTNEJTIwbG9hZF9kYXRhc2V0KCUyMnRybC1saWIlMkZ1bHRyYWZlZWRiYWNrX2JpbmFyaXplZCUyMiUyQyUyMHNwbGl0JTNEJTIydHJhaW4lMjIpJTBBJTBBdHJhaW5lciUyMCUzRCUyMFJld2FyZFRyYWluZXIoJTBBJTIwJTIwJTIwJTIwbW9kZWwlM0QlMjJRd2VuJTJGUXdlbjIuNS0wLjVCLUluc3RydWN0JTIyJTJDJTBBJTIwJTIwJTIwJTIwdHJhaW5fZGF0YXNldCUzRGRhdGFzZXQlMkMlMEEpJTBBdHJhaW5lci50cmFpbigp",highlighted:`<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> RewardTrainer
<span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
dataset = load_dataset(<span class="hljs-string">&quot;trl-lib/ultrafeedback_binarized&quot;</span>, split=<span class="hljs-string">&quot;train&quot;</span>)
trainer = RewardTrainer(
model=<span class="hljs-string">&quot;Qwen/Qwen2.5-0.5B-Instruct&quot;</span>,
train_dataset=dataset,
)
trainer.train()`,wrap:!1}}),{c(){M=o("p"),M.textContent=Y,z=n(),m(k.$$.fragment)},l(w){M=i(w,"P",{"data-svelte-h":!0}),p(M)!=="svelte-11lpom8"&&(M.textContent=Y),z=l(w),c(k.$$.fragment,w)},m(w,$){t(w,M,$),t(w,z,$),d(k,w,$),C=!0},p:Zt,i(w){C||(h(k.$$.fragment,w),C=!0)},o(w){g(k.$$.fragment,w),C=!1},d(w){w&&(s(M),s(z)),u(k,w)}}}function Xt(ea){let M,Y,z,k,C,w,$,aa,H,Es='<a href="https://huggingface.co/models?other=reward-trainer,trl" rel="nofollow"><img src="https://img.shields.io/badge/All_models-Reward_Trainer-blue" alt="model badge"/></a>',sa,L,ta,A,Vs="TRL supports the Outcome-supervised Reward Modeling (ORM) Trainer for training reward models.",na,D,Xs='This post-training method was contributed by <a href="https://huggingface.co/ybelkada" rel="nofollow">Younes Belkada</a>.',la,P,ra,K,Ss='This example demonstrates how to train a reward model using the <a href="/docs/trl/pr_5607/en/reward_trainer#trl.RewardTrainer">RewardTrainer</a> from TRL. We train a <a href="https://huggingface.co/Qwen/Qwen3-0.6B" rel="nofollow">Qwen 3 0.6B</a> model on the <a href="https://huggingface.co/datasets/trl-lib/ultrafeedback_binarized" rel="nofollow">UltraFeedback dataset</a>, large-scale, fine-grained, diverse preference dataset.',oa,O,ia,x,Ys,pa,ee,ma,ae,Hs='<a href="/docs/trl/pr_5607/en/reward_trainer#trl.RewardTrainer">RewardTrainer</a> supports <a href="dataset_formats#preference">preference</a> datasets type (both implicit and explicit prompt). The <a href="/docs/trl/pr_5607/en/reward_trainer#trl.RewardTrainer">RewardTrainer</a> is compatible with both <a href="dataset_formats#standard">standard</a> and <a href="dataset_formats#conversational">conversational</a> dataset formats. When provided with a conversational dataset, the trainer will automatically apply the chat template to the dataset.',ca,se,da,te,Ls='If your dataset is not in one of these formats, you can preprocess it to convert it into the expected format. Here is an example with the <a href="https://huggingface.co/datasets/lmarena-ai/arena-human-preference-55k" rel="nofollow">lmarena-ai/arena-human-preference-55k</a> dataset:',ha,ne,ga,le,ua,re,fa,oe,As="Reward Models (RMs) are typically trained using supervised learning on datasets containing pairs of preferred and non-preferred responses. The goal is to learn a function that assigns higher scores to preferred responses, enabling the model to rank outputs based on preferences.",ya,ie,Ds="This section breaks down how reward modeling works in practice, covering the key steps: <strong>preprocessing</strong> and <strong>loss computation</strong>.",wa,pe,Ta,me,Ps=`During training, each example is expected to contain a <strong>chosen</strong> and <strong>rejected</strong> field. For more details on the expected formats, see <a href="dataset_formats#preference">Dataset formats - Preference</a>.
The <a href="/docs/trl/pr_5607/en/reward_trainer#trl.RewardTrainer">RewardTrainer</a> tokenizes each input using the model’s tokenizer. If prompts and completions (chosen and rejected) are provided separately (explicit prompt case), they are concatenated before tokenization.`,Ma,ce,_a,f,Js,ba,_t='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>x</mi></mrow><annotation encoding="application/x-tex"> x </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.4306em;"></span><span class="mord mathnormal">x</span></span></span></span>',Ja,va,bt='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msup><mi>y</mi><mo>+</mo></msup></mrow><annotation encoding="application/x-tex"> y^+ </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.9658em;vertical-align:-0.1944em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">y</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.7713em;"><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mbin mtight">+</span></span></span></span></span></span></span></span></span></span></span>',ja,Ua,Jt='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msup><mi>y</mi><mo>−</mo></msup></mrow><annotation encoding="application/x-tex"> y^- </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.9658em;vertical-align:-0.1944em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">y</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.7713em;"><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mbin mtight">−</span></span></span></span></span></span></span></span></span></span></span>',Ia,W,Ks="Bradley & Terry, 1952",vs,ka,vt='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msup><mi>y</mi><mo>+</mo></msup></mrow><annotation encoding="application/x-tex"> y^+ </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.9658em;vertical-align:-0.1944em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">y</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.7713em;"><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mbin mtight">+</span></span></span></span></span></span></span></span></span></span></span>',Ca,xa,jt='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msup><mi>y</mi><mo>−</mo></msup></mrow><annotation encoding="application/x-tex"> y^- </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.9658em;vertical-align:-0.1944em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">y</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.7713em;"><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mbin mtight">−</span></span></span></span></span></span></span></span></span></span></span>',Na,qa,Ut='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>r</mi></mrow><annotation encoding="application/x-tex"> r </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.4306em;"></span><span class="mord mathnormal" style="margin-right:0.02778em;">r</span></span></span></span>',$a,Ra,It='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>p</mi><mo stretchy="false">(</mo><msup><mi>y</mi><mo>+</mo></msup><mo>≻</mo><msup><mi>y</mi><mo>−</mo></msup><mi mathvariant="normal">∣</mi><mi>x</mi><mo stretchy="false">)</mo><mo>=</mo><mi>σ</mi><mo stretchy="false">(</mo><mi>r</mi><mo stretchy="false">(</mo><mi>x</mi><mo separator="true">,</mo><msup><mi>y</mi><mo>+</mo></msup><mo stretchy="false">)</mo><mo>−</mo><mi>r</mi><mo stretchy="false">(</mo><mi>x</mi><mo separator="true">,</mo><msup><mi>y</mi><mo>−</mo></msup><mo stretchy="false">)</mo><mo stretchy="false">)</mo></mrow><annotation encoding="application/x-tex"> p(y^+ ≻ y^- |x) = \\sigma(r(x, y^+)−r(x, y^-)) </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1.0213em;vertical-align:-0.25em;"></span><span class="mord mathnormal">p</span><span class="mopen">(</span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">y</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.7713em;"><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mbin mtight">+</span></span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">≻</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:1.0213em;vertical-align:-0.25em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">y</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.7713em;"><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mbin mtight">−</span></span></span></span></span></span></span></span><span class="mord">∣</span><span class="mord mathnormal">x</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:1.0213em;vertical-align:-0.25em;"></span><span class="mord mathnormal" style="margin-right:0.03588em;">σ</span><span class="mopen">(</span><span class="mord mathnormal" style="margin-right:0.02778em;">r</span><span class="mopen">(</span><span class="mord mathnormal">x</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">y</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.7713em;"><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mbin mtight">+</span></span></span></span></span></span></span></span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">−</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" style="height:1.0213em;vertical-align:-0.25em;"></span><span class="mord mathnormal" style="margin-right:0.02778em;">r</span><span class="mopen">(</span><span class="mord mathnormal">x</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">y</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.7713em;"><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mbin mtight">−</span></span></span></span></span></span></span></span><span class="mclose">))</span></span></span></span>',za,Za,kt='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>σ</mi></mrow><annotation encoding="application/x-tex"> σ </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.4306em;"></span><span class="mord mathnormal" style="margin-right:0.03588em;">σ</span></span></span></span>',Ga,Ba,I,js,Fa,Ct='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msub><mi>r</mi><mi>θ</mi></msub><mo stretchy="false">(</mo><mi>x</mi><mo separator="true">,</mo><mi>y</mi><mo stretchy="false">)</mo></mrow><annotation encoding="application/x-tex"> r_\\theta(x, y) </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.02778em;">r</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:-0.0278em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight" style="margin-right:0.02778em;">θ</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord mathnormal">x</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord mathnormal" style="margin-right:0.03588em;">y</span><span class="mclose">)</span></span></span></span>',Qa,Wa,xt='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msup><mi>y</mi><mo>+</mo></msup></mrow><annotation encoding="application/x-tex"> y^+ </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.9658em;vertical-align:-0.1944em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">y</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.7713em;"><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mbin mtight">+</span></span></span></span></span></span></span></span></span></span></span>',Ea,Va,Nt='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msup><mi>y</mi><mo>−</mo></msup></mrow><annotation encoding="application/x-tex"> y^- </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.9658em;vertical-align:-0.1944em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">y</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.7713em;"><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mbin mtight">−</span></span></span></span></span></span></span></span></span></span></span>',Xa,Sa,qt=`<span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><semantics><mrow><mi mathvariant="script">L</mi><mo stretchy="false">(</mo><mi>θ</mi><mo stretchy="false">)</mo><mo>=</mo><mo>−</mo><msub><mi mathvariant="double-struck">E</mi><mrow><mo stretchy="false">(</mo><mi>x</mi><mo separator="true">,</mo><msup><mi>y</mi><mo>+</mo></msup><mo separator="true">,</mo><msup><mi>y</mi><mo>−</mo></msup><mo stretchy="false">)</mo><mo>∼</mo><mi mathvariant="script">D</mi></mrow></msub><mrow><mo fence="true">[</mo><mi>log</mi><mo>⁡</mo><mi>σ</mi><mo stretchy="false">(</mo><msub><mi>r</mi><mi>θ</mi></msub><mo stretchy="false">(</mo><mi>x</mi><mo separator="true">,</mo><msup><mi>y</mi><mo>+</mo></msup><mo stretchy="false">)</mo><mo>−</mo><msub><mi>r</mi><mi>θ</mi></msub><mo stretchy="false">(</mo><mi>x</mi><mo separator="true">,</mo><msup><mi>y</mi><mo>−</mo></msup><mo stretchy="false">)</mo><mo stretchy="false">)</mo><mo fence="true">]</mo></mrow><mi mathvariant="normal">.</mi></mrow><annotation encoding="application/x-tex">
\\mathcal{L}(\\theta) = - \\mathbb{E}_{(x,y^+,y^-) \\sim \\mathcal{D}} \\left[ \\log \\sigma(r_\\theta(x, y^+) - r_\\theta(x, y^-)) \\right].
</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord mathcal">L</span><span class="mopen">(</span><span class="mord mathnormal" style="margin-right:0.02778em;">θ</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:1.2052em;vertical-align:-0.3552em;"></span><span class="mord">−</span><span class="mord"><span class="mord mathbb">E</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3448em;"><span style="top:-2.5198em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mopen mtight">(</span><span class="mord mathnormal mtight">x</span><span class="mpunct mtight">,</span><span class="mord mtight"><span class="mord mathnormal mtight" style="margin-right:0.03588em;">y</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.7027em;"><span style="top:-2.786em;margin-right:0.0714em;"><span class="pstrut" style="height:2.5em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mbin mtight">+</span></span></span></span></span></span></span></span><span class="mpunct mtight">,</span><span class="mord mtight"><span class="mord mathnormal mtight" style="margin-right:0.03588em;">y</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.7027em;"><span style="top:-2.786em;margin-right:0.0714em;"><span class="pstrut" style="height:2.5em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mbin mtight">−</span></span></span></span></span></span></span></span><span class="mclose mtight">)</span><span class="mrel mtight">∼</span><span class="mord mathcal mtight" style="margin-right:0.02778em;">D</span></span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.3552em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="minner"><span class="mopen delimcenter" style="top:0em;"><span class="delimsizing size1">[</span></span><span class="mop">lo<span style="margin-right:0.01389em;">g</span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord mathnormal" style="margin-right:0.03588em;">σ</span><span class="mopen">(</span><span class="mord"><span class="mord mathnormal" style="margin-right:0.02778em;">r</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:-0.0278em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight" style="margin-right:0.02778em;">θ</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord mathnormal">x</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">y</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.8213em;"><span style="top:-3.113em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mbin mtight">+</span></span></span></span></span></span></span></span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">−</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.02778em;">r</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361em;"><span style="top:-2.55em;margin-left:-0.0278em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight" style="margin-right:0.02778em;">θ</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord mathnormal">x</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03588em;">y</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.8213em;"><span style="top:-3.113em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mbin mtight">−</span></span></span></span></span></span></span></span><span class="mclose">))</span><span class="mclose delimcenter" style="top:0em;"><span class="delimsizing size1">]</span></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord">.</span></span></span></span></span>`,Ya,E,Os='<p>The Bradley-Terry model is underdetermined, meaning that adding a constant to all rewards does not change the preference probabilities. To address this, <a href="https://huggingface.co/papers/2312.09244" rel="nofollow">Helping or Herding? Reward Model Ensembles Mitigate but do not Eliminate Reward Hacking</a> proposes adding an auxiliary loss term that encourages the rewards to be centered around zero. This is controlled by the <code>center_rewards_coefficient</code> parameter in the <a href="/docs/trl/pr_5607/en/reward_trainer#trl.RewardConfig">RewardConfig</a>. The recommended value is <code>1e-2</code>.</p>',Ha,de,La,he,et="While training and evaluating we record the following reward metrics:",Aa,ge,at="<li><code>global_step</code>: The total number of optimizer steps taken so far.</li> <li><code>epoch</code>: The current epoch number, based on dataset iteration.</li> <li><code>num_tokens</code>: The total number of tokens processed so far.</li> <li><code>loss</code>: The average loss over the last logging interval.</li> <li><code>accuracy</code>: The proportion of correct predictions (i.e., the model assigned a higher score to the chosen response than to the rejected one) averaged over the last logging interval.</li> <li><code>min_reward</code>: The minimum reward score assigned by the model. This value is averaged over the logging interval.</li> <li><code>mean_reward</code>: The average reward score assigned by the model over the last logging interval.</li> <li><code>max_reward</code>: The maximum reward score assigned by the model. This value is averaged over the logging interval.</li> <li><code>margin</code>: The average margin (difference between chosen and rejected rewards) over the last logging interval.</li> <li><code>learning_rate</code>: The current learning rate, which may change dynamically if a scheduler is used.</li> <li><code>grad_norm</code>: The L2 norm of the gradients, computed before gradient clipping.</li>",Da,ue,Pa,fe,Ka,ye,st='You can directly pass the kwargs of the <code>from_pretrained()</code> method to the <a href="/docs/trl/pr_5607/en/reward_trainer#trl.RewardConfig">RewardConfig</a>. For example, if you want to load a model in a different precision, analogous to',Oa,we,es,Te,tt='you can do so by passing the <code>model_init_kwargs={&quot;dtype&quot;: torch.bfloat16}</code> argument to the <a href="/docs/trl/pr_5607/en/reward_trainer#trl.RewardConfig">RewardConfig</a>.',as,Me,ss,_e,nt="Note that all keyword arguments of <code>from_pretrained()</code> are supported, except for <code>num_labels</code>, which is automatically set to 1.",ts,be,ns,Je,lt="We support tight integration with 🤗 PEFT library, allowing any user to conveniently train adapters and share them on the Hub, rather than training the entire model.",ls,ve,rs,je,rt='You can also continue training your <a href="https://huggingface.co/docs/peft/main/en/package_reference/peft_model#peft.PeftModel" rel="nofollow">PeftModel</a>. For that, first load a <code>PeftModel</code> outside <a href="/docs/trl/pr_5607/en/reward_trainer#trl.RewardTrainer">RewardTrainer</a> and pass it directly to the trainer without the <code>peft_config</code> argument being passed.',os,Ue,is,F,We,ot="When training adapters, you typically use a higher learning rate (≈1e‑3) since only new parameters are being learned.",Us,Ie,ps,ke,ms,Ce,it='The <a href="/docs/trl/pr_5607/en/reward_trainer#trl.RewardTrainer">RewardTrainer</a> fully supports fine-tuning models with <em>tool calling</em> capabilities. In this case, each dataset example should include:',cs,xe,pt="<li>The conversation messages, including any tool calls (<code>tool_calls</code>) and tool responses (<code>tool</code> role messages)</li> <li>The list of available tools in the <code>tools</code> column, typically provided as JSON schemas</li>",ds,Ne,mt='For details on the expected dataset structure, see the <a href="dataset_formats#tool-calling">Dataset Format — Tool Calling</a> section.',hs,qe,gs,T,$e,Is,Ee,ct="Trainer for Outcome-supervised Reward Models (ORM).",ks,Ve,dt='This class is a wrapper around the <a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer" rel="nofollow">Trainer</a> class and inherits all of its attributes and methods.',Cs,V,xs,X,Re,Ns,Xe,ht="Main training entry point.",qs,Z,ze,$s,Se,gt="Will save the model, so you can reload it using <code>from_pretrained()</code>.",Rs,Ye,ut="Will only save from the main process.",zs,S,Ze,Zs,He,ft="Upload <code>self.model</code> and <code>self.processing_class</code> to the 🤗 model hub on the repo <code>self.args.hub_model_id</code>.",us,Ge,fs,_,Be,Gs,Le,yt='Configuration class for the <a href="/docs/trl/pr_5607/en/reward_trainer#trl.RewardTrainer">RewardTrainer</a>.',Bs,Ae,wt=`This class includes only the parameters that are specific to Reward training. For a full list of training
arguments, please refer to the <a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments" rel="nofollow">TrainingArguments</a> documentation. Note that default values in this
class may differ from those in <a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments" rel="nofollow">TrainingArguments</a>.`,Fs,De,Tt=`Using <a href="https://huggingface.co/docs/transformers/main/en/internal/trainer_utils#transformers.HfArgumentParser" rel="nofollow">HfArgumentParser</a> we can turn this class into
<a href="https://docs.python.org/3/library/argparse#module-argparse" rel="nofollow">argparse</a> arguments that can be specified on the
command line.`,Qs,Fe,Mt='<p>These parameters have default values different from <a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments" rel="nofollow">TrainingArguments</a>:</p> <ul><li><code>logging_steps</code>: Defaults to <code>10</code> instead of <code>500</code>.</li> <li><code>gradient_checkpointing</code>: Defaults to <code>True</code> instead of <code>False</code>.</li> <li><code>bf16</code>: Defaults to <code>True</code> if <code>fp16</code> is not set, instead of <code>False</code>.</li> <li><code>learning_rate</code>: Defaults to <code>1e-4</code> instead of <code>5e-5</code>.</li></ul>',ys,Qe,ws,Ke,Ts;return C=new Qt({props:{containerStyle:"float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"}}),$=new U({props:{title:"Reward Modeling",local:"reward-modeling",headingTag:"h1"}}),L=new U({props:{title:"Overview",local:"overview",headingTag:"h2"}}),P=new U({props:{title:"Quick start",local:"quick-start",headingTag:"h2"}}),O=new B({props:{code:"ZnJvbSUyMHRybCUyMGltcG9ydCUyMFJld2FyZFRyYWluZXIlMEFmcm9tJTIwZGF0YXNldHMlMjBpbXBvcnQlMjBsb2FkX2RhdGFzZXQlMEElMEF0cmFpbmVyJTIwJTNEJTIwUmV3YXJkVHJhaW5lciglMEElMjAlMjAlMjAlMjBtb2RlbCUzRCUyMlF3ZW4lMkZRd2VuMy0wLjZCJTIyJTJDJTBBJTIwJTIwJTIwJTIwdHJhaW5fZGF0YXNldCUzRGxvYWRfZGF0YXNldCglMjJ0cmwtbGliJTJGdWx0cmFmZWVkYmFja19iaW5hcml6ZWQlMjIlMkMlMjBzcGxpdCUzRCUyMnRyYWluJTIyKSUyQyUwQSklMEF0cmFpbmVyLnRyYWluKCk=",highlighted:`<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> RewardTrainer
<span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
trainer = RewardTrainer(
model=<span class="hljs-string">&quot;Qwen/Qwen3-0.6B&quot;</span>,
train_dataset=load_dataset(<span class="hljs-string">&quot;trl-lib/ultrafeedback_binarized&quot;</span>, split=<span class="hljs-string">&quot;train&quot;</span>),
)
trainer.train()`,wrap:!1}}),ee=new U({props:{title:"Expected dataset type and format",local:"expected-dataset-type-and-format",headingTag:"h2"}}),se=new B({props:{code:"JTIzJTIwU3RhbmRhcmQlMjBwcmVmZXJlbmNlJTIwKGltcGxpY2l0JTIwcHJvbXB0KSUwQSU3QiUyMmNob3NlbiUyMiUzQSUyMCUyMlRoZSUyMHNreSUyMGlzJTIwYmx1ZS4lMjIlMkMlMEElMjAlMjJyZWplY3RlZCUyMiUzQSUyMCUyMlRoZSUyMHNreSUyMGlzJTIwZ3JlZW4uJTIyJTdEJTBBJTBBJTIzJTIwQ29udmVyc2F0aW9uYWwlMjBwcmVmZXJlbmNlJTIwKGltcGxpY2l0JTIwcHJvbXB0KSUwQSU3QiUyMmNob3NlbiUyMiUzQSUyMCU1QiU3QiUyMnJvbGUlMjIlM0ElMjAlMjJ1c2VyJTIyJTJDJTIwJTIyY29udGVudCUyMiUzQSUyMCUyMldoYXQlMjBjb2xvciUyMGlzJTIwdGhlJTIwc2t5JTNGJTIyJTdEJTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTdCJTIycm9sZSUyMiUzQSUyMCUyMmFzc2lzdGFudCUyMiUyQyUyMCUyMmNvbnRlbnQlMjIlM0ElMjAlMjJJdCUyMGlzJTIwYmx1ZS4lMjIlN0QlNUQlMkMlMEElMjAlMjJyZWplY3RlZCUyMiUzQSUyMCU1QiU3QiUyMnJvbGUlMjIlM0ElMjAlMjJ1c2VyJTIyJTJDJTIwJTIyY29udGVudCUyMiUzQSUyMCUyMldoYXQlMjBjb2xvciUyMGlzJTIwdGhlJTIwc2t5JTNGJTIyJTdEJTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTdCJTIycm9sZSUyMiUzQSUyMCUyMmFzc2lzdGFudCUyMiUyQyUyMCUyMmNvbnRlbnQlMjIlM0ElMjAlMjJJdCUyMGlzJTIwZ3JlZW4uJTIyJTdEJTVEJTdEJTBBJTBBJTIzJTIwU3RhbmRhcmQlMjBwcmVmZXJlbmNlJTIwKGV4cGxpY2l0JTIwcHJvbXB0KSUwQSU3QiUyMnByb21wdCUyMiUzQSUyMCUyMlRoZSUyMHNreSUyMGlzJTIyJTJDJTBBJTIwJTIyY2hvc2VuJTIyJTNBJTIwJTIyJTIwYmx1ZS4lMjIlMkMlMEElMjAlMjJyZWplY3RlZCUyMiUzQSUyMCUyMiUyMGdyZWVuLiUyMiU3RCUwQSUwQSUyMyUyMENvbnZlcnNhdGlvbmFsJTIwcHJlZmVyZW5jZSUyMChleHBsaWNpdCUyMHByb21wdCklMEElN0IlMjJwcm9tcHQlMjIlM0ElMjAlNUIlN0IlMjJyb2xlJTIyJTNBJTIwJTIydXNlciUyMiUyQyUyMCUyMmNvbnRlbnQlMjIlM0ElMjAlMjJXaGF0JTIwY29sb3IlMjBpcyUyMHRoZSUyMHNreSUzRiUyMiU3RCU1RCUyQyUwQSUyMCUyMmNob3NlbiUyMiUzQSUyMCU1QiU3QiUyMnJvbGUlMjIlM0ElMjAlMjJhc3Npc3RhbnQlMjIlMkMlMjAlMjJjb250ZW50JTIyJTNBJTIwJTIySXQlMjBpcyUyMGJsdWUuJTIyJTdEJTVEJTJDJTBBJTIwJTIycmVqZWN0ZWQlMjIlM0ElMjAlNUIlN0IlMjJyb2xlJTIyJTNBJTIwJTIyYXNzaXN0YW50JTIyJTJDJTIwJTIyY29udGVudCUyMiUzQSUyMCUyMkl0JTIwaXMlMjBncmVlbi4lMjIlN0QlNUQlN0Q=",highlighted:`<span class="hljs-comment"># Standard preference (implicit prompt)</span>
{<span class="hljs-string">&quot;chosen&quot;</span>: <span class="hljs-string">&quot;The sky is blue.&quot;</span>,
<span class="hljs-string">&quot;rejected&quot;</span>: <span class="hljs-string">&quot;The sky is green.&quot;</span>}
<span class="hljs-comment"># Conversational preference (implicit prompt)</span>
{<span class="hljs-string">&quot;chosen&quot;</span>: [{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;What color is the sky?&quot;</span>},
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;assistant&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;It is blue.&quot;</span>}],
<span class="hljs-string">&quot;rejected&quot;</span>: [{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;What color is the sky?&quot;</span>},
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;assistant&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;It is green.&quot;</span>}]}
<span class="hljs-comment"># Standard preference (explicit prompt)</span>
{<span class="hljs-string">&quot;prompt&quot;</span>: <span class="hljs-string">&quot;The sky is&quot;</span>,
<span class="hljs-string">&quot;chosen&quot;</span>: <span class="hljs-string">&quot; blue.&quot;</span>,
<span class="hljs-string">&quot;rejected&quot;</span>: <span class="hljs-string">&quot; green.&quot;</span>}
<span class="hljs-comment"># Conversational preference (explicit prompt)</span>
{<span class="hljs-string">&quot;prompt&quot;</span>: [{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;What color is the sky?&quot;</span>}],
<span class="hljs-string">&quot;chosen&quot;</span>: [{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;assistant&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;It is blue.&quot;</span>}],
<span class="hljs-string">&quot;rejected&quot;</span>: [{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;assistant&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;It is green.&quot;</span>}]}`,wrap:!1}}),ne=new B({props:{code:"ZnJvbSUyMGRhdGFzZXRzJTIwaW1wb3J0JTIwbG9hZF9kYXRhc2V0JTBBaW1wb3J0JTIwanNvbiUwQSUwQWRhdGFzZXQlMjAlM0QlMjBsb2FkX2RhdGFzZXQoJTIybG1hcmVuYS1haSUyRmFyZW5hLWh1bWFuLXByZWZlcmVuY2UtNTVrJTIyKSUwQSUwQSUyMyUyMEZpbHRlciUyMG91dCUyMHRpZXMlMEFkYXRhc2V0JTIwJTNEJTIwZGF0YXNldC5maWx0ZXIobGFtYmRhJTIwZXhhbXBsZSUzQSUyMGV4YW1wbGUlNUIlMjJ3aW5uZXJfdGllJTIyJTVEJTIwJTNEJTNEJTIwMCklMEElMEElMjMlMjBDcmVhdGUlMjAnY2hvc2VuJyUyMGFuZCUyMCdyZWplY3RlZCclMjBmaWVsZHMlMjBiYXNlZCUyMG9uJTIwdGhlJTIwd2lubmVyJTIwY29sdW1uJTBBZGVmJTIwcmVzcG9uc2VfYV9iX3RvX2Nob3Nlbl9yZWplY3RlZChleGFtcGxlKSUzQSUwQSUyMCUyMCUyMCUyMGlmJTIwZXhhbXBsZSU1QiUyMndpbm5lcl9tb2RlbF9hJTIyJTVEJTIwJTNEJTNEJTIwMSUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGV4YW1wbGUlNUIlMjJjaG9zZW4lMjIlNUQlMjAlM0QlMjBleGFtcGxlJTVCJTIycmVzcG9uc2VfYSUyMiU1RCUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGV4YW1wbGUlNUIlMjJyZWplY3RlZCUyMiU1RCUyMCUzRCUyMGV4YW1wbGUlNUIlMjJyZXNwb25zZV9iJTIyJTVEJTBBJTIwJTIwJTIwJTIwZWxzZSUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGV4YW1wbGUlNUIlMjJjaG9zZW4lMjIlNUQlMjAlM0QlMjBleGFtcGxlJTVCJTIycmVzcG9uc2VfYiUyMiU1RCUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGV4YW1wbGUlNUIlMjJyZWplY3RlZCUyMiU1RCUyMCUzRCUyMGV4YW1wbGUlNUIlMjJyZXNwb25zZV9hJTIyJTVEJTBBJTIwJTIwJTIwJTIwcmV0dXJuJTIwZXhhbXBsZSUwQSUwQWRhdGFzZXQlMjAlM0QlMjBkYXRhc2V0Lm1hcChyZXNwb25zZV9hX2JfdG9fY2hvc2VuX3JlamVjdGVkKSUwQSUwQSUyMyUyMENvbnZlcnQlMjB0byUyMGNvbnZlcnNhdGlvbmFsJTIwZm9ybWF0JTBBZGVmJTIwbWFrZV9jb252ZXJzYXRpb24oZXhhbXBsZSklM0ElMEElMjAlMjAlMjAlMjBwcm9tcHQlMjAlM0QlMjBqc29uLmxvYWRzKGV4YW1wbGUlNUIlMjJwcm9tcHQlMjIlNUQpJTVCMCU1RCUyMCUyMCUyMyUyMCclNUIlMjJXaGF0JTIwY29sb3IlMjBpcyUyMHRoZSUyMHNreSUzRiUyMiU1RCclMjAtJTNFJTIwJTIyV2hhdCUyMGNvbG9yJTIwaXMlMjB0aGUlMjBza3klM0YlMjIlMEElMjAlMjAlMjAlMjBjaG9zZW4lMjAlM0QlMjBqc29uLmxvYWRzKGV4YW1wbGUlNUIlMjJjaG9zZW4lMjIlNUQpJTVCMCU1RCUwQSUyMCUyMCUyMCUyMHJlamVjdGVkJTIwJTNEJTIwanNvbi5sb2FkcyhleGFtcGxlJTVCJTIycmVqZWN0ZWQlMjIlNUQpJTVCMCU1RCUwQSUyMCUyMCUyMCUyMHJldHVybiUyMCU3QiUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMmNob3NlbiUyMiUzQSUyMCU1QiU3QiUyMnJvbGUlMjIlM0ElMjAlMjJ1c2VyJTIyJTJDJTIwJTIyY29udGVudCUyMiUzQSUyMHByb21wdCU3RCUyQyUyMCU3QiUyMnJvbGUlMjIlM0ElMjAlMjJhc3Npc3RhbnQlMjIlMkMlMjAlMjJjb250ZW50JTIyJTNBJTIwY2hvc2VuJTdEJTVEJTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIycmVqZWN0ZWQlMjIlM0ElMjAlNUIlN0IlMjJyb2xlJTIyJTNBJTIwJTIydXNlciUyMiUyQyUyMCUyMmNvbnRlbnQlMjIlM0ElMjBwcm9tcHQlN0QlMkMlMjAlN0IlMjJyb2xlJTIyJTNBJTIwJTIyYXNzaXN0YW50JTIyJTJDJTIwJTIyY29udGVudCUyMiUzQSUyMHJlamVjdGVkJTdEJTVEJTJDJTBBJTIwJTIwJTIwJTIwJTdEJTBBJTBBJTBBZGF0YXNldCUyMCUzRCUyMGRhdGFzZXQubWFwKG1ha2VfY29udmVyc2F0aW9uKSUwQSUwQSUyMyUyMEtlZXAlMjBvbmx5JTIwbmVjZXNzYXJ5JTIwY29sdW1ucyUwQWRhdGFzZXQlMjAlM0QlMjBkYXRhc2V0LnNlbGVjdF9jb2x1bW5zKCU1QiUyMmNob3NlbiUyMiUyQyUyMCUyMnJlamVjdGVkJTIyJTVEKSUwQSUwQXByaW50KG5leHQoaXRlcihkYXRhc2V0JTVCJTIydHJhaW4lMjIlNUQpKSk=",highlighted:`<span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
<span class="hljs-keyword">import</span> json
dataset = load_dataset(<span class="hljs-string">&quot;lmarena-ai/arena-human-preference-55k&quot;</span>)
<span class="hljs-comment"># Filter out ties</span>
dataset = dataset.<span class="hljs-built_in">filter</span>(<span class="hljs-keyword">lambda</span> example: example[<span class="hljs-string">&quot;winner_tie&quot;</span>] == <span class="hljs-number">0</span>)
<span class="hljs-comment"># Create &#x27;chosen&#x27; and &#x27;rejected&#x27; fields based on the winner column</span>
<span class="hljs-keyword">def</span> <span class="hljs-title function_">response_a_b_to_chosen_rejected</span>(<span class="hljs-params">example</span>):
<span class="hljs-keyword">if</span> example[<span class="hljs-string">&quot;winner_model_a&quot;</span>] == <span class="hljs-number">1</span>:
example[<span class="hljs-string">&quot;chosen&quot;</span>] = example[<span class="hljs-string">&quot;response_a&quot;</span>]
example[<span class="hljs-string">&quot;rejected&quot;</span>] = example[<span class="hljs-string">&quot;response_b&quot;</span>]
<span class="hljs-keyword">else</span>:
example[<span class="hljs-string">&quot;chosen&quot;</span>] = example[<span class="hljs-string">&quot;response_b&quot;</span>]
example[<span class="hljs-string">&quot;rejected&quot;</span>] = example[<span class="hljs-string">&quot;response_a&quot;</span>]
<span class="hljs-keyword">return</span> example
dataset = dataset.<span class="hljs-built_in">map</span>(response_a_b_to_chosen_rejected)
<span class="hljs-comment"># Convert to conversational format</span>
<span class="hljs-keyword">def</span> <span class="hljs-title function_">make_conversation</span>(<span class="hljs-params">example</span>):
prompt = json.loads(example[<span class="hljs-string">&quot;prompt&quot;</span>])[<span class="hljs-number">0</span>] <span class="hljs-comment"># &#x27;[&quot;What color is the sky?&quot;]&#x27; -&gt; &quot;What color is the sky?&quot;</span>
chosen = json.loads(example[<span class="hljs-string">&quot;chosen&quot;</span>])[<span class="hljs-number">0</span>]
rejected = json.loads(example[<span class="hljs-string">&quot;rejected&quot;</span>])[<span class="hljs-number">0</span>]
<span class="hljs-keyword">return</span> {
<span class="hljs-string">&quot;chosen&quot;</span>: [{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: prompt}, {<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;assistant&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: chosen}],
<span class="hljs-string">&quot;rejected&quot;</span>: [{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: prompt}, {<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;assistant&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: rejected}],
}
dataset = dataset.<span class="hljs-built_in">map</span>(make_conversation)
<span class="hljs-comment"># Keep only necessary columns</span>
dataset = dataset.select_columns([<span class="hljs-string">&quot;chosen&quot;</span>, <span class="hljs-string">&quot;rejected&quot;</span>])
<span class="hljs-built_in">print</span>(<span class="hljs-built_in">next</span>(<span class="hljs-built_in">iter</span>(dataset[<span class="hljs-string">&quot;train&quot;</span>])))`,wrap:!1}}),le=new B({props:{code:"JTdCJTBBJTIwJTIwJTIwJTIwJTIyY2hvc2VuJTIyJTNBJTIwJTVCJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTdCJTIycm9sZSUyMiUzQSUyMCUyMnVzZXIlMjIlMkMlMjAlMjJjb250ZW50JTIyJTNBJTIwJTIySXMlMjBpdCUyMG1vcmFsbHklMjByaWdodCUyMHRvJTIwdHJ5JTIwdG8lMjBoYXZlJTIwYSUyMGNlcnRhaW4lMjBwZXJjZW50YWdlJTIwb2YlMjBmZW1hbGVzJTIwb24lMjBtYW5hZ2VyaWFsJTIwcG9zaXRpb25zJTNGJTIyJTdEJTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTdCJTIycm9sZSUyMiUzQSUyMCUyMmFzc2lzdGFudCUyMiUyQyUyMCUyMmNvbnRlbnQlMjIlM0ElMjAlMjJUaGUlMjBxdWVzdGlvbiUyMG9mJTIwd2hldGhlciUyMGl0JTIwaXMlMjBtb3JhbGx5JTIwcmlnaHQlMjB0byUyMGFpbSUyMGZvciUyMGElMjBjZXJ0YWluJTIwcGVyY2VudGFnZSUyMG9mJTIwZmVtYWxlcy4uLiUyMiU3RCUyQyUwQSUyMCUyMCUyMCUyMCU1RCUyQyUwQSUyMCUyMCUyMCUyMCUyMnJlamVjdGVkJTIyJTNBJTIwJTVCJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTdCJTIycm9sZSUyMiUzQSUyMCUyMnVzZXIlMjIlMkMlMjAlMjJjb250ZW50JTIyJTNBJTIwJTIySXMlMjBpdCUyMG1vcmFsbHklMjByaWdodCUyMHRvJTIwdHJ5JTIwdG8lMjBoYXZlJTIwYSUyMGNlcnRhaW4lMjBwZXJjZW50YWdlJTIwb2YlMjBmZW1hbGVzJTIwb24lMjBtYW5hZ2VyaWFsJTIwcG9zaXRpb25zJTNGJTIyJTdEJTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTdCJTIycm9sZSUyMiUzQSUyMCUyMmFzc2lzdGFudCUyMiUyQyUyMCUyMmNvbnRlbnQlMjIlM0ElMjAlMjJBcyUyMGFuJTIwQUklMkMlMjBJJTIwZG9uJ3QlMjBoYXZlJTIwcGVyc29uYWwlMjBiZWxpZWZzJTIwb3IlMjBvcGluaW9ucy4lMjBIb3dldmVyJTJDJTIwLi4uJTIyJTdEJTJDJTBBJTIwJTIwJTIwJTIwJTVEJTJDJTBBJTdE",highlighted:`<span class="hljs-punctuation">{</span>
<span class="hljs-attr">&quot;chosen&quot;</span><span class="hljs-punctuation">:</span> <span class="hljs-punctuation">[</span>
<span class="hljs-punctuation">{</span><span class="hljs-attr">&quot;role&quot;</span><span class="hljs-punctuation">:</span> <span class="hljs-string">&quot;user&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-attr">&quot;content&quot;</span><span class="hljs-punctuation">:</span> <span class="hljs-string">&quot;Is it morally right to try to have a certain percentage of females on managerial positions?&quot;</span><span class="hljs-punctuation">}</span><span class="hljs-punctuation">,</span>
<span class="hljs-punctuation">{</span><span class="hljs-attr">&quot;role&quot;</span><span class="hljs-punctuation">:</span> <span class="hljs-string">&quot;assistant&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-attr">&quot;content&quot;</span><span class="hljs-punctuation">:</span> <span class="hljs-string">&quot;The question of whether it is morally right to aim for a certain percentage of females...&quot;</span><span class="hljs-punctuation">}</span><span class="hljs-punctuation">,</span>
<span class="hljs-punctuation">]</span><span class="hljs-punctuation">,</span>
<span class="hljs-attr">&quot;rejected&quot;</span><span class="hljs-punctuation">:</span> <span class="hljs-punctuation">[</span>
<span class="hljs-punctuation">{</span><span class="hljs-attr">&quot;role&quot;</span><span class="hljs-punctuation">:</span> <span class="hljs-string">&quot;user&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-attr">&quot;content&quot;</span><span class="hljs-punctuation">:</span> <span class="hljs-string">&quot;Is it morally right to try to have a certain percentage of females on managerial positions?&quot;</span><span class="hljs-punctuation">}</span><span class="hljs-punctuation">,</span>
<span class="hljs-punctuation">{</span><span class="hljs-attr">&quot;role&quot;</span><span class="hljs-punctuation">:</span> <span class="hljs-string">&quot;assistant&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-attr">&quot;content&quot;</span><span class="hljs-punctuation">:</span> <span class="hljs-string">&quot;As an AI, I don&#x27;t have personal beliefs or opinions. However, ...&quot;</span><span class="hljs-punctuation">}</span><span class="hljs-punctuation">,</span>
<span class="hljs-punctuation">]</span><span class="hljs-punctuation">,</span>
<span class="hljs-punctuation">}</span>`,wrap:!1}}),re=new U({props:{title:"Looking deeper into the training method",local:"looking-deeper-into-the-training-method",headingTag:"h2"}}),pe=new U({props:{title:"Preprocessing and tokenization",local:"preprocessing-and-tokenization",headingTag:"h3"}}),ce=new U({props:{title:"Computing the loss",local:"computing-the-loss",headingTag:"h3"}}),de=new U({props:{title:"Logged metrics",local:"logged-metrics",headingTag:"h2"}}),ue=new U({props:{title:"Customization",local:"customization",headingTag:"h2"}}),fe=new U({props:{title:"Model initialization",local:"model-initialization",headingTag:"h3"}}),we=new B({props:{code:"bW9kZWwlMjAlM0QlMjBBdXRvTW9kZWxGb3JTZXF1ZW5jZUNsYXNzaWZpY2F0aW9uLmZyb21fcHJldHJhaW5lZCglMjJRd2VuJTJGUXdlbjMtMC42QiUyMiUyQyUyMGR0eXBlJTNEdG9yY2guYmZsb2F0MTYp",highlighted:'model = AutoModelForSequenceClassification.from_pretrained(<span class="hljs-string">&quot;Qwen/Qwen3-0.6B&quot;</span>, dtype=torch.bfloat16)',wrap:!1}}),Me=new B({props:{code:"ZnJvbSUyMHRybCUyMGltcG9ydCUyMFJld2FyZENvbmZpZyUwQSUwQXRyYWluaW5nX2FyZ3MlMjAlM0QlMjBSZXdhcmRDb25maWcoJTBBJTIwJTIwJTIwJTIwbW9kZWxfaW5pdF9rd2FyZ3MlM0QlN0IlMjJkdHlwZSUyMiUzQSUyMHRvcmNoLmJmbG9hdDE2JTdEJTJDJTBBKQ==",highlighted:`<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> RewardConfig
training_args = RewardConfig(
model_init_kwargs={<span class="hljs-string">&quot;dtype&quot;</span>: torch.bfloat16},
)`,wrap:!1}}),be=new U({props:{title:"Train adapters with PEFT",local:"train-adapters-with-peft",headingTag:"h3"}}),ve=new B({props:{code:"ZnJvbSUyMGRhdGFzZXRzJTIwaW1wb3J0JTIwbG9hZF9kYXRhc2V0JTBBZnJvbSUyMHRybCUyMGltcG9ydCUyMFJld2FyZFRyYWluZXIlMEFmcm9tJTIwcGVmdCUyMGltcG9ydCUyMExvcmFDb25maWclMEElMEFkYXRhc2V0JTIwJTNEJTIwbG9hZF9kYXRhc2V0KCUyMnRybC1saWIlMkZ1bHRyYWZlZWRiYWNrX2JpbmFyaXplZCUyMiUyQyUyMHNwbGl0JTNEJTIydHJhaW4lMjIpJTBBJTBBdHJhaW5lciUyMCUzRCUyMFJld2FyZFRyYWluZXIoJTBBJTIwJTIwJTIwJTIwJTIyUXdlbiUyRlF3ZW4zLTRCJTIyJTJDJTBBJTIwJTIwJTIwJTIwdHJhaW5fZGF0YXNldCUzRGRhdGFzZXQlMkMlMEElMjAlMjAlMjAlMjBwZWZ0X2NvbmZpZyUzRExvcmFDb25maWcobW9kdWxlc190b19zYXZlJTNEJTVCJTIyc2NvcmUlMjIlNUQpJTIwJTIwJTIzJTIwaW1wb3J0YW50JTIwdG8lMjBpbmNsdWRlJTIwdGhlJTIwc2NvcmUlMjBoZWFkJTIwd2hlbiUyMGJhc2UlMjBtb2RlbCUyMGlzJTIwbm90JTIwYSUyMHNlcXVlbmNlJTIwY2xhc3NpZmljYXRpb24lMjBtb2RlbCUwQSklMEElMEF0cmFpbmVyLnRyYWluKCk=",highlighted:`<span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> RewardTrainer
<span class="hljs-keyword">from</span> peft <span class="hljs-keyword">import</span> LoraConfig
dataset = load_dataset(<span class="hljs-string">&quot;trl-lib/ultrafeedback_binarized&quot;</span>, split=<span class="hljs-string">&quot;train&quot;</span>)
trainer = RewardTrainer(
<span class="hljs-string">&quot;Qwen/Qwen3-4B&quot;</span>,
train_dataset=dataset,
peft_config=LoraConfig(modules_to_save=[<span class="hljs-string">&quot;score&quot;</span>]) <span class="hljs-comment"># important to include the score head when base model is not a sequence classification model</span>
)
trainer.train()`,wrap:!1}}),Ue=new B({props:{code:"ZnJvbSUyMGRhdGFzZXRzJTIwaW1wb3J0JTIwbG9hZF9kYXRhc2V0JTBBZnJvbSUyMHRybCUyMGltcG9ydCUyMFJld2FyZFRyYWluZXIlMEFmcm9tJTIwcGVmdCUyMGltcG9ydCUyMEF1dG9QZWZ0TW9kZWxGb3JDYXVzYWxMTSUwQSUwQW1vZGVsJTIwJTNEJTIwQXV0b1BlZnRNb2RlbEZvckNhdXNhbExNLmZyb21fcHJldHJhaW5lZCglMjJ0cmwtbGliJTJGUXdlbjMtNEItUmV3YXJkLUxvUkElMjIlMkMlMjBpc190cmFpbmFibGUlM0RUcnVlKSUwQWRhdGFzZXQlMjAlM0QlMjBsb2FkX2RhdGFzZXQoJTIydHJsLWxpYiUyRkNhcHliYXJhJTIyJTJDJTIwc3BsaXQlM0QlMjJ0cmFpbiUyMiklMEElMEF0cmFpbmVyJTIwJTNEJTIwUmV3YXJkVHJhaW5lciglMEElMjAlMjAlMjAlMjBtb2RlbCUzRG1vZGVsJTJDJTBBJTIwJTIwJTIwJTIwdHJhaW5fZGF0YXNldCUzRGRhdGFzZXQlMkMlMEEpJTBBJTBBdHJhaW5lci50cmFpbigp",highlighted:`<span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> RewardTrainer
<span class="hljs-keyword">from</span> peft <span class="hljs-keyword">import</span> AutoPeftModelForCausalLM
model = AutoPeftModelForCausalLM.from_pretrained(<span class="hljs-string">&quot;trl-lib/Qwen3-4B-Reward-LoRA&quot;</span>, is_trainable=<span class="hljs-literal">True</span>)
dataset = load_dataset(<span class="hljs-string">&quot;trl-lib/Capybara&quot;</span>, split=<span class="hljs-string">&quot;train&quot;</span>)
trainer = RewardTrainer(
model=model,
train_dataset=dataset,
)
trainer.train()`,wrap:!1}}),Ie=new B({props:{code:"UmV3YXJkQ29uZmlnKGxlYXJuaW5nX3JhdGUlM0QxZS0zJTJDJTIwLi4uKQ==",highlighted:'RewardConfig(learning_rate=<span class="hljs-number">1e-3</span>, ...)',wrap:!1}}),ke=new U({props:{title:"Tool Calling with Reward Modeling",local:"tool-calling-with-reward-modeling",headingTag:"h2"}}),qe=new U({props:{title:"RewardTrainer",local:"trl.RewardTrainer",headingTag:"h2"}}),$e=new Oe({props:{name:"class trl.RewardTrainer",anchor:"trl.RewardTrainer",parameters:[{name:"model",val:": str | PreTrainedModel | PeftModel"},{name:"args",val:": trl.trainer.reward_config.RewardConfig | None = None"},{name:"data_collator",val:": collections.abc.Callable[[list[typing.Any]], dict[str, typing.Any]] | None = None"},{name:"train_dataset",val:": datasets.arrow_dataset.Dataset | datasets.iterable_dataset.IterableDataset | None = None"},{name:"eval_dataset",val:": datasets.arrow_dataset.Dataset | datasets.iterable_dataset.IterableDataset | dict[str, datasets.arrow_dataset.Dataset | datasets.iterable_dataset.IterableDataset] | None = None"},{name:"processing_class",val:": transformers.tokenization_utils_base.PreTrainedTokenizerBase | None = None"},{name:"compute_metrics",val:": collections.abc.Callable[[transformers.trainer_utils.EvalPrediction], dict] | None = None"},{name:"callbacks",val:": list[transformers.trainer_callback.TrainerCallback] | None = None"},{name:"optimizers",val:": tuple = (None, None)"},{name:"optimizer_cls_and_kwargs",val:": tuple[type[torch.optim.optimizer.Optimizer], dict[str, typing.Any]] | None = None"},{name:"preprocess_logits_for_metrics",val:": collections.abc.Callable[[torch.Tensor, torch.Tensor], torch.Tensor] | None = None"},{name:"peft_config",val:": PeftConfig | None = None"}],parametersDescription:[{anchor:"trl.RewardTrainer.model",description:`<strong>model</strong> (<code>str</code> or <a href="https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel" rel="nofollow">PreTrainedModel</a> or <a href="https://huggingface.co/docs/peft/main/en/package_reference/peft_model#peft.PeftModel" rel="nofollow">PeftModel</a>) &#x2014;
Model to be trained. Can be either:</p>
<ul>
<li>A string, being the <em>model id</em> of a pretrained model hosted inside a model repo on huggingface.co, or a
path to a <em>directory</em> containing model weights saved using
<a href="https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.save_pretrained" rel="nofollow">save_pretrained</a>, e.g., <code>&apos;./my_model_directory/&apos;</code>. The model is loaded
using <code>AutoModelForSequenceClassification.from_pretrained</code> with the keyword arguments in
<code>args.model_init_kwargs</code>.</li>
<li>A sequence classification <a href="https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel" rel="nofollow">PreTrainedModel</a> object.</li>
<li>A sequence classification <a href="https://huggingface.co/docs/peft/main/en/package_reference/peft_model#peft.PeftModel" rel="nofollow">PeftModel</a> object.</li>
</ul>`,name:"model"},{anchor:"trl.RewardTrainer.args",description:`<strong>args</strong> (<a href="/docs/trl/pr_5607/en/reward_trainer#trl.RewardConfig">RewardConfig</a>, <em>optional</em>) &#x2014;
Configuration for this trainer. If <code>None</code>, a default configuration is used.`,name:"args"},{anchor:"trl.RewardTrainer.data_collator",description:`<strong>data_collator</strong> (<code>DataCollator</code>, <em>optional</em>) &#x2014;
Function to use to form a batch from a list of elements of the processed <code>train_dataset</code> or <code>eval_dataset</code>.
Will default to <code>DataCollatorForPreference</code>.`,name:"data_collator"},{anchor:"trl.RewardTrainer.train_dataset",description:`<strong>train_dataset</strong> (<a href="https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset" rel="nofollow">Dataset</a> or <a href="https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.IterableDataset" rel="nofollow">IterableDataset</a>) &#x2014;
Dataset to use for training. This trainer supports <a href="#preference">preference</a> type (both implicit and
explicit prompt). The format of the samples can be either:</p>
<ul>
<li><a href="dataset_formats#standard">Standard</a>: Each sample contains plain text.</li>
<li><a href="dataset_formats#conversational">Conversational</a>: Each sample contains structured messages (e.g., role
and content).</li>
</ul>
<p>The trainer also supports processed datasets (tokenized) as long as they contain <code>chosen_ids</code> and
<code>rejected_ids</code> fields.`,name:"train_dataset"},{anchor:"trl.RewardTrainer.eval_dataset",description:`<strong>eval_dataset</strong> (<a href="https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset" rel="nofollow">Dataset</a>, <a href="https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.IterableDataset" rel="nofollow">IterableDataset</a> or <code>dict[str, Dataset | IterableDataset]</code>) &#x2014;
Dataset to use for evaluation. It must meet the same requirements as <code>train_dataset</code>.`,name:"eval_dataset"},{anchor:"trl.RewardTrainer.processing_class",description:`<strong>processing_class</strong> (<a href="https://huggingface.co/docs/transformers/main/en/internal/tokenization_utils#transformers.PreTrainedTokenizerBase" rel="nofollow">PreTrainedTokenizerBase</a>, <em>optional</em>) &#x2014;
Tokenizer used to process the data. If <code>None</code>, the tokenizer is loaded from the model&#x2019;s name with
<a href="https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoTokenizer.from_pretrained" rel="nofollow">from_pretrained</a>. A padding token, <code>processing_class.pad_token</code>, must be
set. If the processing class has not set a padding token, <code>processing_class.eos_token</code> will be used as the
default.`,name:"processing_class"},{anchor:"trl.RewardTrainer.compute_metrics",description:`<strong>compute_metrics</strong> (<code>Callable[[EvalPrediction], dict]</code>, <em>optional</em>) &#x2014;
The function that will be used to compute metrics at evaluation. Must take a
<a href="https://huggingface.co/docs/transformers/main/en/internal/trainer_utils#transformers.EvalPrediction" rel="nofollow">EvalPrediction</a> and return a dictionary string to metric values. When passing
<a href="/docs/trl/pr_5607/en/reward_trainer#trl.RewardConfig">RewardConfig</a> with <code>batch_eval_metrics</code> set to <code>True</code>, your <code>compute_metrics</code> function must take a
boolean <code>compute_result</code> argument. This will be triggered after the last eval batch to signal that the
function needs to calculate and return the global summary statistics rather than accumulating the
batch-level statistics.`,name:"compute_metrics"},{anchor:"trl.RewardTrainer.callbacks",description:`<strong>callbacks</strong> (list of <a href="https://huggingface.co/docs/transformers/main/en/main_classes/callback#transformers.TrainerCallback" rel="nofollow">TrainerCallback</a>, <em>optional</em>) &#x2014;
List of callbacks to customize the training loop. Will add those to the list of default callbacks detailed
in <a href="https://huggingface.co/docs/transformers/main_classes/callback" rel="nofollow">here</a>.</p>
<p>If you want to remove one of the default callbacks used, use the <a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.remove_callback" rel="nofollow">remove_callback</a>
method.`,name:"callbacks"},{anchor:"trl.RewardTrainer.optimizers",description:`<strong>optimizers</strong> (<code>tuple[torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None]</code>, <em>optional</em>, defaults to <code>(None, None)</code>) &#x2014;
A tuple containing the optimizer and the scheduler to use. Will default to an instance of <code>AdamW</code> on your
model and a scheduler given by <a href="https://huggingface.co/docs/transformers/main/en/main_classes/optimizer_schedules#transformers.get_linear_schedule_with_warmup" rel="nofollow">get_linear_schedule_with_warmup</a> controlled by <code>args</code>.`,name:"optimizers"},{anchor:"trl.RewardTrainer.optimizer_cls_and_kwargs",description:`<strong>optimizer_cls_and_kwargs</strong> (<code>tuple[Type[torch.optim.Optimizer], Dict[str, Any]]</code>, <em>optional</em>) &#x2014;
A tuple containing the optimizer class and keyword arguments to use. Overrides <code>optim</code> and <code>optim_args</code> in
<code>args</code>. Incompatible with the <code>optimizers</code> argument.</p>
<p>Unlike <code>optimizers</code>, this argument avoids the need to place model parameters on the correct devices before
initializing the Trainer.`,name:"optimizer_cls_and_kwargs"},{anchor:"trl.RewardTrainer.preprocess_logits_for_metrics",description:`<strong>preprocess_logits_for_metrics</strong> (<code>Callable[[torch.Tensor, torch.Tensor], torch.Tensor]</code>, <em>optional</em>) &#x2014;
A function that preprocess the logits right before caching them at each evaluation step. Must take two
tensors, the logits and the labels, and return the logits once processed as desired. The modifications made
by this function will be reflected in the predictions received by <code>compute_metrics</code>.</p>
<p>Note that the labels (second parameter) will be <code>None</code> if the dataset does not have them.`,name:"preprocess_logits_for_metrics"},{anchor:"trl.RewardTrainer.peft_config",description:`<strong>peft_config</strong> (<a href="https://huggingface.co/docs/peft/main/en/package_reference/config#peft.PeftConfig" rel="nofollow">PeftConfig</a>, <em>optional</em>) &#x2014;
PEFT configuration used to wrap the model. If <code>None</code>, the model is not wrapped. Note that if the loaded
model is a causal LM, it&#x2019;s highly recommended to set <code>modules_to_save=[&quot;score&quot;]</code> in the PEFT configuration
to ensure that the reward head is properly trained.`,name:"peft_config"}],source:"https://github.com/huggingface/trl/blob/vr_5607/trl/trainer/reward_trainer.py#L229"}}),V=new Et({props:{anchor:"trl.RewardTrainer.example",$$slots:{default:[Vt]},$$scope:{ctx:ea}}}),Re=new Oe({props:{name:"train",anchor:"trl.RewardTrainer.train",parameters:[{name:"resume_from_checkpoint",val:": str | bool | None = None"},{name:"trial",val:": optuna.Trial | dict[str, Any] | None = None"},{name:"ignore_keys_for_eval",val:": list[str] | None = None"}],parametersDescription:[{anchor:"trl.RewardTrainer.train.resume_from_checkpoint",description:`<strong>resume_from_checkpoint</strong> (<code>str</code> or <code>bool</code>, <em>optional</em>) &#x2014;
If a <code>str</code>, local path to a saved checkpoint as saved by a previous instance of <code>Trainer</code>. If a
<code>bool</code> and equals <code>True</code>, load the last checkpoint in <em>args.output_dir</em> as saved by a previous instance
of <code>Trainer</code>. If present, training will resume from the model/optimizer/scheduler states loaded here.`,name:"resume_from_checkpoint"},{anchor:"trl.RewardTrainer.train.trial",description:`<strong>trial</strong> (<code>optuna.Trial</code> or <code>dict[str, Any]</code>, <em>optional</em>) &#x2014;
The trial run or the hyperparameter dictionary for hyperparameter search.`,name:"trial"},{anchor:"trl.RewardTrainer.train.ignore_keys_for_eval",description:`<strong>ignore_keys_for_eval</strong> (<code>list[str]</code>, <em>optional</em>) &#x2014;
A list of keys in the output of your model (if it is a dictionary) that should be ignored when
gathering predictions for evaluation during the training.`,name:"ignore_keys_for_eval"}],source:"https://github.com/huggingface/trl/blob/vr_5607/transformers/trainer.py#L1323",returnDescription:`<script context="module">export const metadata = 'undefined';<\/script>
<p>Object containing the global step count, training loss, and metrics.</p>
`,returnType:`<script context="module">export const metadata = 'undefined';<\/script>
<p><code>~trainer_utils.TrainOutput</code></p>
`}}),ze=new Oe({props:{name:"save_model",anchor:"trl.RewardTrainer.save_model",parameters:[{name:"output_dir",val:": str | None = None"},{name:"_internal_call",val:": bool = False"}],source:"https://github.com/huggingface/trl/blob/vr_5607/transformers/trainer.py#L3746"}}),Ze=new Oe({props:{name:"push_to_hub",anchor:"trl.RewardTrainer.push_to_hub",parameters:[{name:"commit_message",val:": str | None = 'End of training'"},{name:"blocking",val:": bool = True"},{name:"token",val:": str | None = None"},{name:"revision",val:": str | None = None"},{name:"**kwargs",val:""}],parametersDescription:[{anchor:"trl.RewardTrainer.push_to_hub.commit_message",description:`<strong>commit_message</strong> (<code>str</code>, <em>optional</em>, defaults to <code>&quot;End of training&quot;</code>) &#x2014;
Message to commit while pushing.`,name:"commit_message"},{anchor:"trl.RewardTrainer.push_to_hub.blocking",description:`<strong>blocking</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>True</code>) &#x2014;
Whether the function should return only when the <code>git push</code> has finished.`,name:"blocking"},{anchor:"trl.RewardTrainer.push_to_hub.token",description:`<strong>token</strong> (<code>str</code>, <em>optional</em>, defaults to <code>None</code>) &#x2014;
Token with write permission to overwrite Trainer&#x2019;s original args.`,name:"token"},{anchor:"trl.RewardTrainer.push_to_hub.revision",description:`<strong>revision</strong> (<code>str</code>, <em>optional</em>) &#x2014;
The git revision to commit from. Defaults to the head of the &#x201C;main&#x201D; branch.`,name:"revision"},{anchor:"trl.RewardTrainer.push_to_hub.kwargs",description:`<strong>kwargs</strong> (<code>dict[str, Any]</code>, <em>optional</em>) &#x2014;
Additional keyword arguments passed along to <code>~Trainer.create_model_card</code>.`,name:"kwargs"}],source:"https://github.com/huggingface/trl/blob/vr_5607/transformers/trainer.py#L3993",returnDescription:`<script context="module">export const metadata = 'undefined';<\/script>
<p>The URL of the repository where the model was pushed if <code>blocking=False</code>, or a <code>Future</code> object tracking the
progress of the commit if <code>blocking=True</code>.</p>
`}}),Ge=new U({props:{title:"RewardConfig",local:"trl.RewardConfig",headingTag:"h2"}}),Be=new Oe({props:{name:"class trl.RewardConfig",anchor:"trl.RewardConfig",parameters:[{name:"output_dir",val:": str | None = None"},{name:"per_device_train_batch_size",val:": int = 8"},{name:"num_train_epochs",val:": float = 3.0"},{name:"max_steps",val:": int = -1"},{name:"learning_rate",val:": float = 0.0001"},{name:"lr_scheduler_type",val:": transformers.trainer_utils.SchedulerType | str = 'linear'"},{name:"lr_scheduler_kwargs",val:": dict | str | None = None"},{name:"warmup_steps",val:": float = 0"},{name:"optim",val:": transformers.training_args.OptimizerNames | str = 'adamw_torch_fused'"},{name:"optim_args",val:": str | None = None"},{name:"weight_decay",val:": float = 0.0"},{name:"adam_beta1",val:": float = 0.9"},{name:"adam_beta2",val:": float = 0.999"},{name:"adam_epsilon",val:": float = 1e-08"},{name:"optim_target_modules",val:": None | str | list[str] = None"},{name:"gradient_accumulation_steps",val:": int = 1"},{name:"average_tokens_across_devices",val:": bool = True"},{name:"max_grad_norm",val:": float = 1.0"},{name:"label_smoothing_factor",val:": float = 0.0"},{name:"bf16",val:": bool | None = None"},{name:"fp16",val:": bool = False"},{name:"bf16_full_eval",val:": bool = False"},{name:"fp16_full_eval",val:": bool = False"},{name:"tf32",val:": bool | None = None"},{name:"gradient_checkpointing",val:": bool = True"},{name:"gradient_checkpointing_kwargs",val:": dict[str, typing.Any] | str | None = None"},{name:"torch_compile",val:": bool = False"},{name:"torch_compile_backend",val:": str | None = None"},{name:"torch_compile_mode",val:": str | None = None"},{name:"use_liger_kernel",val:": bool = False"},{name:"liger_kernel_config",val:": dict[str, bool] | None = None"},{name:"use_cache",val:": bool = False"},{name:"neftune_noise_alpha",val:": float | None = None"},{name:"torch_empty_cache_steps",val:": int | None = None"},{name:"auto_find_batch_size",val:": bool = False"},{name:"logging_strategy",val:": transformers.trainer_utils.IntervalStrategy | str = 'steps'"},{name:"logging_steps",val:": float = 10"},{name:"logging_first_step",val:": bool = False"},{name:"log_on_each_node",val:": bool = True"},{name:"logging_nan_inf_filter",val:": bool = True"},{name:"include_num_input_tokens_seen",val:": str | bool = 'no'"},{name:"log_level",val:": str = 'passive'"},{name:"log_level_replica",val:": str = 'warning'"},{name:"disable_tqdm",val:": bool | None = None"},{name:"report_to",val:": None | str | list[str] = 'none'"},{name:"run_name",val:": str | None = None"},{name:"project",val:": str = 'huggingface'"},{name:"trackio_space_id",val:": str | None = 'trackio'"},{name:"eval_strategy",val:": transformers.trainer_utils.IntervalStrategy | str = 'no'"},{name:"eval_steps",val:": float | None = None"},{name:"eval_delay",val:": float = 0"},{name:"per_device_eval_batch_size",val:": int = 8"},{name:"prediction_loss_only",val:": bool = False"},{name:"eval_on_start",val:": bool = False"},{name:"eval_do_concat_batches",val:": bool = True"},{name:"eval_use_gather_object",val:": bool = False"},{name:"eval_accumulation_steps",val:": int | None = None"},{name:"include_for_metrics",val:": list = <factory>"},{name:"batch_eval_metrics",val:": bool = False"},{name:"save_only_model",val:": bool = False"},{name:"save_strategy",val:": transformers.trainer_utils.SaveStrategy | str = 'steps'"},{name:"save_steps",val:": float = 500"},{name:"save_on_each_node",val:": bool = False"},{name:"save_total_limit",val:": int | None = None"},{name:"enable_jit_checkpoint",val:": bool = False"},{name:"push_to_hub",val:": bool = False"},{name:"hub_token",val:": str | None = None"},{name:"hub_private_repo",val:": bool | None = None"},{name:"hub_model_id",val:": str | None = None"},{name:"hub_strategy",val:": transformers.trainer_utils.HubStrategy | str = 'every_save'"},{name:"hub_always_push",val:": bool = False"},{name:"hub_revision",val:": str | None = None"},{name:"load_best_model_at_end",val:": bool = False"},{name:"metric_for_best_model",val:": str | None = None"},{name:"greater_is_better",val:": bool | None = None"},{name:"ignore_data_skip",val:": bool = False"},{name:"restore_callback_states_from_checkpoint",val:": bool = False"},{name:"full_determinism",val:": bool = False"},{name:"seed",val:": int = 42"},{name:"data_seed",val:": int | None = None"},{name:"use_cpu",val:": bool = False"},{name:"accelerator_config",val:": dict | str | None = None"},{name:"parallelism_config",val:": accelerate.parallelism_config.ParallelismConfig | None = None"},{name:"dataloader_drop_last",val:": bool = False"},{name:"dataloader_num_workers",val:": int = 0"},{name:"dataloader_pin_memory",val:": bool = True"},{name:"dataloader_persistent_workers",val:": bool = False"},{name:"dataloader_prefetch_factor",val:": int | None = None"},{name:"remove_unused_columns",val:": bool = True"},{name:"label_names",val:": list[str] | None = None"},{name:"train_sampling_strategy",val:": str = 'random'"},{name:"length_column_name",val:": str = 'length'"},{name:"ddp_find_unused_parameters",val:": bool | None = None"},{name:"ddp_bucket_cap_mb",val:": int | None = None"},{name:"ddp_broadcast_buffers",val:": bool | None = None"},{name:"ddp_backend",val:": str | None = None"},{name:"ddp_timeout",val:": int = 1800"},{name:"fsdp",val:": list[transformers.trainer_utils.FSDPOption] | str | None = None"},{name:"fsdp_config",val:": dict[str, typing.Any] | str | None = None"},{name:"deepspeed",val:": dict | str | None = None"},{name:"debug",val:": str | list[transformers.debug_utils.DebugOption] = ''"},{name:"skip_memory_metrics",val:": bool = True"},{name:"do_train",val:": bool = False"},{name:"do_eval",val:": bool = False"},{name:"do_predict",val:": bool = False"},{name:"resume_from_checkpoint",val:": str | None = None"},{name:"warmup_ratio",val:": float | None = None"},{name:"logging_dir",val:": str | None = None"},{name:"local_rank",val:": int = -1"},{name:"model_init_kwargs",val:": dict[str, typing.Any] | str | None = None"},{name:"chat_template_path",val:": str | None = None"},{name:"disable_dropout",val:": bool = True"},{name:"dataset_num_proc",val:": int | None = None"},{name:"eos_token",val:": str | None = None"},{name:"max_length",val:": int | None = 1024"},{name:"pad_to_multiple_of",val:": int | None = None"},{name:"center_rewards_coefficient",val:": float | None = None"},{name:"activation_offloading",val:": bool = False"},{name:"pad_token",val:": str | None = None"}],source:"https://github.com/huggingface/trl/blob/vr_5607/trl/trainer/reward_config.py#L23",parameterGroups:[{title:"Parameters that control the model",parametersDescription:[{anchor:"trl.RewardConfig.model_init_kwargs",description:`<strong>model_init_kwargs</strong> (<code>dict[str, Any]</code>, <em>optional</em>) &#x2014;
Keyword arguments for <a href="https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoModelForCausalLM.from_pretrained" rel="nofollow">from_pretrained</a>, used when the <code>model</code>
argument of the <a href="/docs/trl/pr_5607/en/reward_trainer#trl.RewardTrainer">RewardTrainer</a> is provided as a string. If you&#x2019;re training a MoE architecture and want
to include the load balancing/auxiliary loss as a part of the final loss, remember to set
<code>output_router_logits=True</code> in this dictionary.`,name:"model_init_kwargs"},{anchor:"trl.RewardConfig.chat_template_path",description:`<strong>chat_template_path</strong> (<code>str</code>, <em>optional</em>) &#x2014;
If specified, sets the model&#x2019;s chat template. This can either be the path to a tokenizer (local directory
or Hugging Face Hub model) or a direct path to a Jinja template file. When using a Jinja file, you must
ensure that any special tokens referenced in the template are added to the tokenizer and that the model&#x2019;s
embedding layer is resized accordingly.`,name:"chat_template_path"},{anchor:"trl.RewardConfig.disable_dropout",description:`<strong>disable_dropout</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>True</code>) &#x2014;
Whether to disable dropout in the model.`,name:"disable_dropout"}]},{title:"Parameters that control the data preprocessing",parametersDescription:[{anchor:"trl.RewardConfig.dataset_num_proc",description:`<strong>dataset_num_proc</strong> (<code>int</code>, <em>optional</em>) &#x2014;
Number of processes to use for processing the dataset.`,name:"dataset_num_proc"},{anchor:"trl.RewardConfig.eos_token",description:`<strong>eos_token</strong> (<code>str</code>, <em>optional</em>) &#x2014;
Token used to indicate the end of a turn or sequence. If <code>None</code>, it defaults to
<code>processing_class.eos_token</code>.`,name:"eos_token"},{anchor:"trl.RewardConfig.max_length",description:`<strong>max_length</strong> (<code>int</code> or <code>None</code>, <em>optional</em>, defaults to <code>1024</code>) &#x2014;
Maximum length of the tokenized sequence. Samples are filtered out if either chosen or rejected sequence
exceeds this value. If <code>None</code>, no filtering is applied.`,name:"max_length"},{anchor:"trl.RewardConfig.pad_to_multiple_of",description:`<strong>pad_to_multiple_of</strong> (<code>int</code>, <em>optional</em>) &#x2014;
If set, the sequences will be padded to a multiple of this value.`,name:"pad_to_multiple_of"}]},{title:"Parameters that control the training",parametersDescription:[{anchor:"trl.RewardConfig.center_rewards_coefficient",description:`<strong>center_rewards_coefficient</strong> (<code>float</code>, <em>optional</em>) &#x2014;
Coefficient to incentivize the reward model to output mean-zero rewards (proposed by
<a href="https://huggingface.co/papers/2312.09244" rel="nofollow">https://huggingface.co/papers/2312.09244</a>, Eq. 2). Recommended value: <code>0.01</code>.`,name:"center_rewards_coefficient"},{anchor:"trl.RewardConfig.activation_offloading",description:`<strong>activation_offloading</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>False</code>) &#x2014;
Whether to offload the activations to the CPU.`,name:"activation_offloading"}]},{title:"Deprecated parameters",parametersDescription:[{anchor:"trl.RewardConfig.pad_token",description:`<strong>pad_token</strong> &#x2014;</p>
<deprecated version="1.1.0">
<p>Parameter <code>pad_token</code> is deprecated and will be removed in version v2.0.0. Set <code>tokenizer.pad_token</code>
directly and pass it as <code>processing_class</code> to the trainer instead.</p>
</deprecated>`,name:"pad_token"}]}]}}),Qe=new Wt({props:{source:"https://github.com/huggingface/trl/blob/main/docs/source/reward_trainer.md"}}),{c(){M=o("meta"),Y=n(),z=o("p"),k=n(),m(C.$$.fragment),w=n(),m($.$$.fragment),aa=n(),H=o("p"),H.innerHTML=Es,sa=n(),m(L.$$.fragment),ta=n(),A=o("p"),A.textContent=Vs,na=n(),D=o("p"),D.innerHTML=Xs,la=n(),m(P.$$.fragment),ra=n(),K=o("p"),K.innerHTML=Ss,oa=n(),m(O.$$.fragment),ia=n(),x=o("iframe"),pa=n(),m(ee.$$.fragment),ma=n(),ae=o("p"),ae.innerHTML=Hs,ca=n(),m(se.$$.fragment),da=n(),te=o("p"),te.innerHTML=Ls,ha=n(),m(ne.$$.fragment),ga=n(),m(le.$$.fragment),ua=n(),m(re.$$.fragment),fa=n(),oe=o("p"),oe.textContent=As,ya=n(),ie=o("p"),ie.innerHTML=Ds,wa=n(),m(pe.$$.fragment),Ta=n(),me=o("p"),me.innerHTML=Ps,Ma=n(),m(ce.$$.fragment),_a=n(),f=o("p"),Js=v("Let "),ba=new N(!1),Ja=v(" be the input sequence (prompt) and "),va=new N(!1),ja=v(" and "),Ua=new N(!1),Ia=v(" be the chosen and rejected sequences respectively. Under the Bradley-Terry model ("),W=o("a"),W.textContent=Ks,vs=v("), the probability that "),ka=new N(!1),Ca=v(" is preferred over "),xa=new N(!1),Na=v(" given a reward function "),qa=new N(!1),$a=v(" is "),Ra=new N(!1),za=v(", where "),Za=new N(!1),Ga=v(" is the sigmoid function."),Ba=n(),I=o("p"),js=v("The reward model "),Fa=new N(!1),Qa=v(" is trained to assign higher scores to preferred responses "),Wa=new N(!1),Ea=v(" over non-preferred ones "),Va=new N(!1),Xa=v(`. The loss is then defined as the negative log-likelihood of the observed preferences:
`),Sa=new N(!1),Ya=n(),E=o("blockquote"),E.innerHTML=Os,Ha=n(),m(de.$$.fragment),La=n(),he=o("p"),he.textContent=et,Aa=n(),ge=o("ul"),ge.innerHTML=at,Da=n(),m(ue.$$.fragment),Pa=n(),m(fe.$$.fragment),Ka=n(),ye=o("p"),ye.innerHTML=st,Oa=n(),m(we.$$.fragment),es=n(),Te=o("p"),Te.innerHTML=tt,as=n(),m(Me.$$.fragment),ss=n(),_e=o("p"),_e.innerHTML=nt,ts=n(),m(be.$$.fragment),ns=n(),Je=o("p"),Je.textContent=lt,ls=n(),m(ve.$$.fragment),rs=n(),je=o("p"),je.innerHTML=rt,os=n(),m(Ue.$$.fragment),is=n(),F=o("blockquote"),We=o("p"),We.textContent=ot,Us=n(),m(Ie.$$.fragment),ps=n(),m(ke.$$.fragment),ms=n(),Ce=o("p"),Ce.innerHTML=it,cs=n(),xe=o("ul"),xe.innerHTML=pt,ds=n(),Ne=o("p"),Ne.innerHTML=mt,hs=n(),m(qe.$$.fragment),gs=n(),T=o("div"),m($e.$$.fragment),Is=n(),Ee=o("p"),Ee.textContent=ct,ks=n(),Ve=o("p"),Ve.innerHTML=dt,Cs=n(),m(V.$$.fragment),xs=n(),X=o("div"),m(Re.$$.fragment),Ns=n(),Xe=o("p"),Xe.textContent=ht,qs=n(),Z=o("div"),m(ze.$$.fragment),$s=n(),Se=o("p"),Se.innerHTML=gt,Rs=n(),Ye=o("p"),Ye.textContent=ut,zs=n(),S=o("div"),m(Ze.$$.fragment),Zs=n(),He=o("p"),He.innerHTML=ft,us=n(),m(Ge.$$.fragment),fs=n(),_=o("div"),m(Be.$$.fragment),Gs=n(),Le=o("p"),Le.innerHTML=yt,Bs=n(),Ae=o("p"),Ae.innerHTML=wt,Fs=n(),De=o("p"),De.innerHTML=Tt,Qs=n(),Fe=o("blockquote"),Fe.innerHTML=Mt,ys=n(),m(Qe.$$.fragment),ws=n(),Ke=o("p"),this.h()},l(e){const a=Ft("svelte-u9bgzb",document.head);M=i(a,"META",{name:!0,content:!0}),a.forEach(s),Y=l(e),z=i(e,"P",{}),R(z).forEach(s),k=l(e),c(C.$$.fragment,e),w=l(e),c($.$$.fragment,e),aa=l(e),H=i(e,"P",{"data-svelte-h":!0}),p(H)!=="svelte-1rjidu2"&&(H.innerHTML=Es),sa=l(e),c(L.$$.fragment,e),ta=l(e),A=i(e,"P",{"data-svelte-h":!0}),p(A)!=="svelte-1ti5dgc"&&(A.textContent=Vs),na=l(e),D=i(e,"P",{"data-svelte-h":!0}),p(D)!=="svelte-a1ehbo"&&(D.innerHTML=Xs),la=l(e),c(P.$$.fragment,e),ra=l(e),K=i(e,"P",{"data-svelte-h":!0}),p(K)!=="svelte-1rk39qy"&&(K.innerHTML=Ss),oa=l(e),c(O.$$.fragment,e),ia=l(e),x=i(e,"IFRAME",{src:!0,style:!0,height:!0,frameborder:!0}),R(x).forEach(s),pa=l(e),c(ee.$$.fragment,e),ma=l(e),ae=i(e,"P",{"data-svelte-h":!0}),p(ae)!=="svelte-m5hlgx"&&(ae.innerHTML=Hs),ca=l(e),c(se.$$.fragment,e),da=l(e),te=i(e,"P",{"data-svelte-h":!0}),p(te)!=="svelte-wa19cs"&&(te.innerHTML=Ls),ha=l(e),c(ne.$$.fragment,e),ga=l(e),c(le.$$.fragment,e),ua=l(e),c(re.$$.fragment,e),fa=l(e),oe=i(e,"P",{"data-svelte-h":!0}),p(oe)!=="svelte-cp5aph"&&(oe.textContent=As),ya=l(e),ie=i(e,"P",{"data-svelte-h":!0}),p(ie)!=="svelte-r5fokq"&&(ie.innerHTML=Ds),wa=l(e),c(pe.$$.fragment,e),Ta=l(e),me=i(e,"P",{"data-svelte-h":!0}),p(me)!=="svelte-87ovcn"&&(me.innerHTML=Ps),Ma=l(e),c(ce.$$.fragment,e),_a=l(e),f=i(e,"P",{});var y=R(f);Js=j(y,"Let "),ba=q(y,!1),Ja=j(y," be the input sequence (prompt) and "),va=q(y,!1),ja=j(y," and "),Ua=q(y,!1),Ia=j(y," be the chosen and rejected sequences respectively. Under the Bradley-Terry model ("),W=i(y,"A",{href:!0,rel:!0,"data-svelte-h":!0}),p(W)!=="svelte-18polgg"&&(W.textContent=Ks),vs=j(y,"), the probability that "),ka=q(y,!1),Ca=j(y," is preferred over "),xa=q(y,!1),Na=j(y," given a reward function "),qa=q(y,!1),$a=j(y," is "),Ra=q(y,!1),za=j(y,", where "),Za=q(y,!1),Ga=j(y," is the sigmoid function."),y.forEach(s),Ba=l(e),I=i(e,"P",{});var Q=R(I);js=j(Q,"The reward model "),Fa=q(Q,!1),Qa=j(Q," is trained to assign higher scores to preferred responses "),Wa=q(Q,!1),Ea=j(Q," over non-preferred ones "),Va=q(Q,!1),Xa=j(Q,`. The loss is then defined as the negative log-likelihood of the observed preferences:
`),Sa=q(Q,!1),Q.forEach(s),Ya=l(e),E=i(e,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),p(E)!=="svelte-1nlt6oo"&&(E.innerHTML=Os),Ha=l(e),c(de.$$.fragment,e),La=l(e),he=i(e,"P",{"data-svelte-h":!0}),p(he)!=="svelte-132s7j9"&&(he.textContent=et),Aa=l(e),ge=i(e,"UL",{"data-svelte-h":!0}),p(ge)!=="svelte-ecm0rq"&&(ge.innerHTML=at),Da=l(e),c(ue.$$.fragment,e),Pa=l(e),c(fe.$$.fragment,e),Ka=l(e),ye=i(e,"P",{"data-svelte-h":!0}),p(ye)!=="svelte-1ieqrp4"&&(ye.innerHTML=st),Oa=l(e),c(we.$$.fragment,e),es=l(e),Te=i(e,"P",{"data-svelte-h":!0}),p(Te)!=="svelte-1riioh2"&&(Te.innerHTML=tt),as=l(e),c(Me.$$.fragment,e),ss=l(e),_e=i(e,"P",{"data-svelte-h":!0}),p(_e)!=="svelte-zb69hw"&&(_e.innerHTML=nt),ts=l(e),c(be.$$.fragment,e),ns=l(e),Je=i(e,"P",{"data-svelte-h":!0}),p(Je)!=="svelte-t2zuq8"&&(Je.textContent=lt),ls=l(e),c(ve.$$.fragment,e),rs=l(e),je=i(e,"P",{"data-svelte-h":!0}),p(je)!=="svelte-11e9nof"&&(je.innerHTML=rt),os=l(e),c(Ue.$$.fragment,e),is=l(e),F=i(e,"BLOCKQUOTE",{class:!0});var Ms=R(F);We=i(Ms,"P",{"data-svelte-h":!0}),p(We)!=="svelte-dc5ccy"&&(We.textContent=ot),Us=l(Ms),c(Ie.$$.fragment,Ms),Ms.forEach(s),ps=l(e),c(ke.$$.fragment,e),ms=l(e),Ce=i(e,"P",{"data-svelte-h":!0}),p(Ce)!=="svelte-iquuc2"&&(Ce.innerHTML=it),cs=l(e),xe=i(e,"UL",{"data-svelte-h":!0}),p(xe)!=="svelte-1vlmw2d"&&(xe.innerHTML=pt),ds=l(e),Ne=i(e,"P",{"data-svelte-h":!0}),p(Ne)!=="svelte-vl4ede"&&(Ne.innerHTML=mt),hs=l(e),c(qe.$$.fragment,e),gs=l(e),T=i(e,"DIV",{class:!0});var J=R(T);c($e.$$.fragment,J),Is=l(J),Ee=i(J,"P",{"data-svelte-h":!0}),p(Ee)!=="svelte-1prnqj7"&&(Ee.textContent=ct),ks=l(J),Ve=i(J,"P",{"data-svelte-h":!0}),p(Ve)!=="svelte-10vjtjm"&&(Ve.innerHTML=dt),Cs=l(J),c(V.$$.fragment,J),xs=l(J),X=i(J,"DIV",{class:!0});var _s=R(X);c(Re.$$.fragment,_s),Ns=l(_s),Xe=i(_s,"P",{"data-svelte-h":!0}),p(Xe)!=="svelte-1cilnet"&&(Xe.textContent=ht),_s.forEach(s),qs=l(J),Z=i(J,"DIV",{class:!0});var Pe=R(Z);c(ze.$$.fragment,Pe),$s=l(Pe),Se=i(Pe,"P",{"data-svelte-h":!0}),p(Se)!=="svelte-r8h4ov"&&(Se.innerHTML=gt),Rs=l(Pe),Ye=i(Pe,"P",{"data-svelte-h":!0}),p(Ye)!=="svelte-1e6bius"&&(Ye.textContent=ut),Pe.forEach(s),zs=l(J),S=i(J,"DIV",{class:!0});var bs=R(S);c(Ze.$$.fragment,bs),Zs=l(bs),He=i(bs,"P",{"data-svelte-h":!0}),p(He)!=="svelte-8tudwd"&&(He.innerHTML=ft),bs.forEach(s),J.forEach(s),us=l(e),c(Ge.$$.fragment,e),fs=l(e),_=i(e,"DIV",{class:!0});var G=R(_);c(Be.$$.fragment,G),Gs=l(G),Le=i(G,"P",{"data-svelte-h":!0}),p(Le)!=="svelte-o9kf85"&&(Le.innerHTML=yt),Bs=l(G),Ae=i(G,"P",{"data-svelte-h":!0}),p(Ae)!=="svelte-3thf4f"&&(Ae.innerHTML=wt),Fs=l(G),De=i(G,"P",{"data-svelte-h":!0}),p(De)!=="svelte-ekuf1t"&&(De.innerHTML=Tt),Qs=l(G),Fe=i(G,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),p(Fe)!=="svelte-mu4yq0"&&(Fe.innerHTML=Mt),G.forEach(s),ys=l(e),c(Qe.$$.fragment,e),ws=l(e),Ke=i(e,"P",{}),R(Ke).forEach(s),this.h()},h(){b(M,"name","hf:doc:metadata"),b(M,"content",St),Rt(x.src,Ys="https://trl-lib-trackio.hf.space/?project=trl-documentation&metrics=train*&sidebar=hidden&runs=reward_qwen3-0.6B_ultrafeedback2")||b(x,"src",Ys),Ws(x,"width","100%"),Ws(x,"min-width","300px"),Ws(x,"max-width","800px"),b(x,"height","830"),b(x,"frameborder","0"),ba.a=Ja,va.a=ja,Ua.a=Ia,b(W,"href","https://www.jstor.org/stable/2334029"),b(W,"rel","nofollow"),ka.a=Ca,xa.a=Na,qa.a=$a,Ra.a=za,Za.a=Ga,Fa.a=Qa,Wa.a=Ea,Va.a=Xa,Sa.a=null,b(E,"class","tip"),b(F,"class","tip"),b(X,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),b(Z,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),b(S,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),b(T,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),b(Fe,"class","note"),b(_,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8")},m(e,a){r(document.head,M),t(e,Y,a),t(e,z,a),t(e,k,a),d(C,e,a),t(e,w,a),d($,e,a),t(e,aa,a),t(e,H,a),t(e,sa,a),d(L,e,a),t(e,ta,a),t(e,A,a),t(e,na,a),t(e,D,a),t(e,la,a),d(P,e,a),t(e,ra,a),t(e,K,a),t(e,oa,a),d(O,e,a),t(e,ia,a),t(e,x,a),t(e,pa,a),d(ee,e,a),t(e,ma,a),t(e,ae,a),t(e,ca,a),d(se,e,a),t(e,da,a),t(e,te,a),t(e,ha,a),d(ne,e,a),t(e,ga,a),d(le,e,a),t(e,ua,a),d(re,e,a),t(e,fa,a),t(e,oe,a),t(e,ya,a),t(e,ie,a),t(e,wa,a),d(pe,e,a),t(e,Ta,a),t(e,me,a),t(e,Ma,a),d(ce,e,a),t(e,_a,a),t(e,f,a),r(f,Js),ba.m(_t,f),r(f,Ja),va.m(bt,f),r(f,ja),Ua.m(Jt,f),r(f,Ia),r(f,W),r(f,vs),ka.m(vt,f),r(f,Ca),xa.m(jt,f),r(f,Na),qa.m(Ut,f),r(f,$a),Ra.m(It,f),r(f,za),Za.m(kt,f),r(f,Ga),t(e,Ba,a),t(e,I,a),r(I,js),Fa.m(Ct,I),r(I,Qa),Wa.m(xt,I),r(I,Ea),Va.m(Nt,I),r(I,Xa),Sa.m(qt,I),t(e,Ya,a),t(e,E,a),t(e,Ha,a),d(de,e,a),t(e,La,a),t(e,he,a),t(e,Aa,a),t(e,ge,a),t(e,Da,a),d(ue,e,a),t(e,Pa,a),d(fe,e,a),t(e,Ka,a),t(e,ye,a),t(e,Oa,a),d(we,e,a),t(e,es,a),t(e,Te,a),t(e,as,a),d(Me,e,a),t(e,ss,a),t(e,_e,a),t(e,ts,a),d(be,e,a),t(e,ns,a),t(e,Je,a),t(e,ls,a),d(ve,e,a),t(e,rs,a),t(e,je,a),t(e,os,a),d(Ue,e,a),t(e,is,a),t(e,F,a),r(F,We),r(F,Us),d(Ie,F,null),t(e,ps,a),d(ke,e,a),t(e,ms,a),t(e,Ce,a),t(e,cs,a),t(e,xe,a),t(e,ds,a),t(e,Ne,a),t(e,hs,a),d(qe,e,a),t(e,gs,a),t(e,T,a),d($e,T,null),r(T,Is),r(T,Ee),r(T,ks),r(T,Ve),r(T,Cs),d(V,T,null),r(T,xs),r(T,X),d(Re,X,null),r(X,Ns),r(X,Xe),r(T,qs),r(T,Z),d(ze,Z,null),r(Z,$s),r(Z,Se),r(Z,Rs),r(Z,Ye),r(T,zs),r(T,S),d(Ze,S,null),r(S,Zs),r(S,He),t(e,us,a),d(Ge,e,a),t(e,fs,a),t(e,_,a),d(Be,_,null),r(_,Gs),r(_,Le),r(_,Bs),r(_,Ae),r(_,Fs),r(_,De),r(_,Qs),r(_,Fe),t(e,ys,a),d(Qe,e,a),t(e,ws,a),t(e,Ke,a),Ts=!0},p(e,[a]){const y={};a&2&&(y.$$scope={dirty:a,ctx:e}),V.$set(y)},i(e){Ts||(h(C.$$.fragment,e),h($.$$.fragment,e),h(L.$$.fragment,e),h(P.$$.fragment,e),h(O.$$.fragment,e),h(ee.$$.fragment,e),h(se.$$.fragment,e),h(ne.$$.fragment,e),h(le.$$.fragment,e),h(re.$$.fragment,e),h(pe.$$.fragment,e),h(ce.$$.fragment,e),h(de.$$.fragment,e),h(ue.$$.fragment,e),h(fe.$$.fragment,e),h(we.$$.fragment,e),h(Me.$$.fragment,e),h(be.$$.fragment,e),h(ve.$$.fragment,e),h(Ue.$$.fragment,e),h(Ie.$$.fragment,e),h(ke.$$.fragment,e),h(qe.$$.fragment,e),h($e.$$.fragment,e),h(V.$$.fragment,e),h(Re.$$.fragment,e),h(ze.$$.fragment,e),h(Ze.$$.fragment,e),h(Ge.$$.fragment,e),h(Be.$$.fragment,e),h(Qe.$$.fragment,e),Ts=!0)},o(e){g(C.$$.fragment,e),g($.$$.fragment,e),g(L.$$.fragment,e),g(P.$$.fragment,e),g(O.$$.fragment,e),g(ee.$$.fragment,e),g(se.$$.fragment,e),g(ne.$$.fragment,e),g(le.$$.fragment,e),g(re.$$.fragment,e),g(pe.$$.fragment,e),g(ce.$$.fragment,e),g(de.$$.fragment,e),g(ue.$$.fragment,e),g(fe.$$.fragment,e),g(we.$$.fragment,e),g(Me.$$.fragment,e),g(be.$$.fragment,e),g(ve.$$.fragment,e),g(Ue.$$.fragment,e),g(Ie.$$.fragment,e),g(ke.$$.fragment,e),g(qe.$$.fragment,e),g($e.$$.fragment,e),g(V.$$.fragment,e),g(Re.$$.fragment,e),g(ze.$$.fragment,e),g(Ze.$$.fragment,e),g(Ge.$$.fragment,e),g(Be.$$.fragment,e),g(Qe.$$.fragment,e),Ts=!1},d(e){e&&(s(Y),s(z),s(k),s(w),s(aa),s(H),s(sa),s(ta),s(A),s(na),s(D),s(la),s(ra),s(K),s(oa),s(ia),s(x),s(pa),s(ma),s(ae),s(ca),s(da),s(te),s(ha),s(ga),s(ua),s(fa),s(oe),s(ya),s(ie),s(wa),s(Ta),s(me),s(Ma),s(_a),s(f),s(Ba),s(I),s(Ya),s(E),s(Ha),s(La),s(he),s(Aa),s(ge),s(Da),s(Pa),s(Ka),s(ye),s(Oa),s(es),s(Te),s(as),s(ss),s(_e),s(ts),s(ns),s(Je),s(ls),s(rs),s(je),s(os),s(is),s(F),s(ps),s(ms),s(Ce),s(cs),s(xe),s(ds),s(Ne),s(hs),s(gs),s(T),s(us),s(fs),s(_),s(ys),s(ws),s(Ke)),s(M),u(C,e),u($,e),u(L,e),u(P,e),u(O,e),u(ee,e),u(se,e),u(ne,e),u(le,e),u(re,e),u(pe,e),u(ce,e),u(de,e),u(ue,e),u(fe,e),u(we,e),u(Me,e),u(be,e),u(ve,e),u(Ue,e),u(Ie),u(ke,e),u(qe,e),u($e),u(V),u(Re),u(ze),u(Ze),u(Ge,e),u(Be),u(Qe,e)}}}const St='{"title":"Reward Modeling","local":"reward-modeling","sections":[{"title":"Overview","local":"overview","sections":[],"depth":2},{"title":"Quick start","local":"quick-start","sections":[],"depth":2},{"title":"Expected dataset type and format","local":"expected-dataset-type-and-format","sections":[],"depth":2},{"title":"Looking deeper into the training method","local":"looking-deeper-into-the-training-method","sections":[{"title":"Preprocessing and tokenization","local":"preprocessing-and-tokenization","sections":[],"depth":3},{"title":"Computing the loss","local":"computing-the-loss","sections":[],"depth":3}],"depth":2},{"title":"Logged metrics","local":"logged-metrics","sections":[],"depth":2},{"title":"Customization","local":"customization","sections":[{"title":"Model initialization","local":"model-initialization","sections":[],"depth":3},{"title":"Train adapters with PEFT","local":"train-adapters-with-peft","sections":[],"depth":3}],"depth":2},{"title":"Tool Calling with Reward Modeling","local":"tool-calling-with-reward-modeling","sections":[],"depth":2},{"title":"RewardTrainer","local":"trl.RewardTrainer","sections":[],"depth":2},{"title":"RewardConfig","local":"trl.RewardConfig","sections":[],"depth":2}],"depth":1}';function Yt(ea){return zt(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class Ot extends Gt{constructor(M){super(),Bt(this,M,Yt,Xt,$t,{})}}export{Ot as component};

Xet Storage Details

Size:
89.7 kB
·
Xet hash:
6d41cbc10ed19ab9b4c5aa117c7f418c8f54072c67e7cb38ceba679c2f5e6acb

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.