Buckets:
| import{s as zl,f as sl,o as Rl,n as $l}from"../chunks/scheduler.37c15a92.js";import{S as Zl,i as Vl,g as i,s,r as m,A as Ql,h as p,f as t,c as n,j as ue,u as c,x as r,k as M,y as kl,a,v as u,d as o,t as y,w}from"../chunks/index.2bf4358c.js";import{T as vl}from"../chunks/Tip.363c041f.js";import{C as me}from"../chunks/CodeBlock.4e987730.js";import{H as C,E as Wl}from"../chunks/getInferenceSnippets.24b50994.js";function Xl(oe){let J,f='Acest capitol este destinat începătorilor TRL. Dacă ești deja familiar cu TRL, ai putea de asemenea să consulți <a href="https://github.com/huggingface/open-r1/blob/main/src/open_r1/grpo.py" rel="nofollow">implementarea Open R1</a> a GRPO.';return{c(){J=i("p"),J.innerHTML=f},l(T){J=p(T,"P",{"data-svelte-h":!0}),r(J)!=="svelte-4m2s3v"&&(J.innerHTML=f)},m(T,Me){a(T,J,Me)},p:$l,d(T){T&&t(J)}}}function _l(oe){let J,f,T,Me,I,ye,h,nl="În această pagină, vom învăța cum să implementăm Optimizarea Relativă a Politicii de Grup (GRPO) folosind biblioteca Transformer Reinforcement Learning (TRL). Ne vom concentra pe implementarea practică cu cod minimal.",we,B,il="Vom explora conceptele centrale ale GRPO așa cum sunt întruchipate în GRPOTrainer din TRL, folosind fragmente din documentația oficială TRL pentru a ne ghida.",Je,b,Ue,g,pl="În primul rând, să ne reamintim unele dintre conceptele importante ale algoritmului GRPO:",je,G,rl="<li>Formarea Grupului: Modelul generează multiple completări pentru fiecare prompt.</li> <li>Învățarea Preferințelor: Modelul învață dintr-o funcție de recompensă care compară grupuri de completări.</li> <li>Configurația Antrenamentului: Modelul folosește o configurație pentru a controla procesul de antrenare.</li>",de,A,Ml="Ce trebuie să facem pentru a implementa GRPO?",Te,z,ml="<li>Să definim un set de date de prompt-uri.</li> <li>Să definim o funcție de recompensă care ia o listă de completări și returnează o listă de recompense.</li> <li>Să configurăm procesul de antrenare cu un GRPOConfig.</li> <li>Să antrenăm modelul folosind GRPOTrainer.</li>",Ce,R,cl="Iată un exemplu minimal pentru a începe antrenamentul GRPO:",be,$,fe,Z,Ie,V,he,Q,ul="Setul tău de date ar trebui să conțină prompt-uri la care modelul va răspunde. Antrenorul GRPO va genera multiple completări pentru fiecare prompt și va folosi funcția de recompensă pentru a le compara.",Be,k,ge,v,ol="Funcția de recompensă este crucială - determină cum învață modelul. Iată două exemple practice:",Ge,W,Ae,X,ze,_,yl="Parametrii cheie de considerat în <code>GRPOConfig</code>:",Re,x,$e,S,wl="Parametrul <code>num_generation</code> este deosebit de important pentru GRPO deoarece definește dimensiunea grupului - câte completări diferite va genera modelul pentru fiecare prompt. Acesta este un diferențiator cheie de alte metode RL:",Ze,N,Jl="<li>Prea mic (de exemplu, 2-3): S-ar putea să nu ofere suficientă diversitate pentru comparații semnificative</li> <li>Recomandat (4-16): Oferă un echilibru bun între diversitate și eficiența computațională</li> <li>Valori mai mari: Pot îmbunătăți învățarea dar cresc semnificativ costul computațional</li>",Ve,E,Ul="Dimensiunea grupului ar trebui aleasă în funcție de resursele tale computaționale și complexitatea sarcinii tale. Pentru sarcini simple, grupuri mai mici (4-8) pot fi suficiente, în timp ce sarcinile de raționament mai complexe ar putea beneficia de grupuri mai mari (8-16).",Qe,F,ke,Y,jl="<li><strong>Gestionarea Memoriei</strong>: Ajustează <code>per_device_train_batch_size</code> și <code>gradient_accumulation_steps</code> în funcție de memoria GPU-ului tău.</li> <li><strong>Viteza</strong>: Activează <code>use_vllm=True</code> pentru generare mai rapidă dacă modelul tău este suportat.</li> <li><strong>Monitorizarea</strong>: Urmărește metricile înregistrate în timpul antrenamentului:<ul><li><code>reward</code>: Recompensa medie pe completări</li> <li><code>reward_std</code>: Deviația standard în cadrul grupurilor de recompense</li> <li><code>kl</code>: Divergența KL de la modelul de referință</li></ul></li>",ve,H,We,D,dl="Lucrarea DeepSeek R1 demonstrează mai multe abordări eficiente pentru designul funcției de recompensă pe care le poți adapta pentru propria ta implementare GRPO:",Xe,O,_e,P,Tl="Una dintre cele mai ușoare recompense de implementat este o recompensă bazată pe lungime. Poți recompensa completări mai lungi:",xe,L,Se,q,Cl="Această funcție de recompensă penalizează completările care sunt prea scurte sau prea lungi, încurajând modelul să genereze completări care sunt aproape de lungimea ideală de 20 de token-uri.",Ne,U,bl,Ee,K,Fe,ee,fl="Pentru sarcini cu răspunsuri obiectiv corecte (cum ar fi matematica sau codarea), poți implementa funcții de recompensă bazate pe reguli:",Ye,le,He,j,Il,De,te,Oe,ae,hl="Poți de asemenea să recompensezi formatarea corespunzătoare, care a fost importantă în antrenamentul DeepSeek R1:",Pe,se,Le,d,Bl,qe,ne,gl="Aceste exemple demonstrează cum poți implementa funcții de recompensă inspirate din procesul de antrenare DeepSeek R1, concentrându-se pe corectitudine, formatare și semnale combinate.",Ke,ie,el,pe,Gl="În următoarea secțiune, vei urma un exercițiu pentru a implementa GRPO în TRL.",ll,re,tl,ce,al;return I=new C({props:{title:"Implementarea GRPO în TRL",local:"implementarea-grpo-în-trl",headingTag:"h1"}}),b=new vl({props:{$$slots:{default:[Xl]},$$scope:{ctx:oe}}}),$=new me({props:{code:"ZnJvbSUyMHRybCUyMGltcG9ydCUyMEdSUE9UcmFpbmVyJTJDJTIwR1JQT0NvbmZpZyUwQWZyb20lMjBkYXRhc2V0cyUyMGltcG9ydCUyMGxvYWRfZGF0YXNldCUwQSUwQSUyMyUyMDEuJTIwJUMzJThFbmNhcmMlQzQlODMlMjBzZXR1bCUyMHQlQzQlODN1JTIwZGUlMjBkYXRlJTBBZGF0YXNldCUyMCUzRCUyMGxvYWRfZGF0YXNldCglMjJzZXR1bF90YXVfZGVfZGF0ZSUyMiUyQyUyMHNwbGl0JTNEJTIydHJhaW4lMjIpJTBBJTBBJTBBJTIzJTIwMi4lMjBEZWZpbmUlQzglOTl0ZSUyMG8lMjBmdW5jJUM4JTlCaWUlMjBkZSUyMHJlY29tcGVucyVDNCU4MyUyMHNpbXBsJUM0JTgzJTBBZGVmJTIwcmV3YXJkX2Z1bmMoY29tcGxldGlvbnMlMkMlMjAqKmt3YXJncyklM0ElMEElMjAlMjAlMjAlMjAlMjIlMjIlMjJFeGVtcGx1JTNBJTIwUmVjb21wZW5zZWF6JUM0JTgzJTIwY29tcGxldCVDNCU4M3JpbGUlMjBtYWklMjBsdW5naSUyMiUyMiUyMiUwQSUyMCUyMCUyMCUyMHJldHVybiUyMCU1QmZsb2F0KGxlbihjb21wbGV0aW9uKSklMjBmb3IlMjBjb21wbGV0aW9uJTIwaW4lMjBjb21wbGV0aW9ucyU1RCUwQSUwQSUwQSUyMyUyMDMuJTIwQ29uZmlndXJlYXolQzQlODMlMjBhbnRyZW5hbWVudHVsJTBBdHJhaW5pbmdfYXJncyUyMCUzRCUyMEdSUE9Db25maWcoJTBBJTIwJTIwJTIwJTIwb3V0cHV0X2RpciUzRCUyMm91dHB1dCUyMiUyQyUwQSUyMCUyMCUyMCUyMG51bV90cmFpbl9lcG9jaHMlM0QzJTJDJTBBJTIwJTIwJTIwJTIwcGVyX2RldmljZV90cmFpbl9iYXRjaF9zaXplJTNENCUyQyUwQSUyMCUyMCUyMCUyMGdyYWRpZW50X2FjY3VtdWxhdGlvbl9zdGVwcyUzRDIlMkMlMEElMjAlMjAlMjAlMjBsb2dnaW5nX3N0ZXBzJTNEMTAlMkMlMEEpJTBBJTBBJTIzJTIwNC4lMjBJbmklQzglOUJpYWxpemVheiVDNCU4MyUyMCVDOCU5OWklMjBhbnRyZW5lYXolQzQlODMlMEF0cmFpbmVyJTIwJTNEJTIwR1JQT1RyYWluZXIoJTBBJTIwJTIwJTIwJTIwbW9kZWwlM0QlMjJtb2RlbHVsX3RhdSUyMiUyQyUyMCUyMCUyMyUyMGRlJTIwZXhlbXBsdSUyMCUyMlF3ZW4lMkZRd2VuMi0wLjVCLUluc3RydWN0JTIyJTBBJTIwJTIwJTIwJTIwYXJncyUzRHRyYWluaW5nX2FyZ3MlMkMlMEElMjAlMjAlMjAlMjB0cmFpbl9kYXRhc2V0JTNEZGF0YXNldCUyQyUwQSUyMCUyMCUyMCUyMHJld2FyZF9mdW5jcyUzRHJld2FyZF9mdW5jJTJDJTBBKSUwQXRyYWluZXIudHJhaW4oKQ==",highlighted:`<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> GRPOTrainer, GRPOConfig | |
| <span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset | |
| <span class="hljs-comment"># 1. Încarcă setul tău de date</span> | |
| dataset = load_dataset(<span class="hljs-string">"setul_tau_de_date"</span>, split=<span class="hljs-string">"train"</span>) | |
| <span class="hljs-comment"># 2. Definește o funcție de recompensă simplă</span> | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">reward_func</span>(<span class="hljs-params">completions, **kwargs</span>): | |
| <span class="hljs-string">"""Exemplu: Recompensează completările mai lungi"""</span> | |
| <span class="hljs-keyword">return</span> [<span class="hljs-built_in">float</span>(<span class="hljs-built_in">len</span>(completion)) <span class="hljs-keyword">for</span> completion <span class="hljs-keyword">in</span> completions] | |
| <span class="hljs-comment"># 3. Configurează antrenamentul</span> | |
| training_args = GRPOConfig( | |
| output_dir=<span class="hljs-string">"output"</span>, | |
| num_train_epochs=<span class="hljs-number">3</span>, | |
| per_device_train_batch_size=<span class="hljs-number">4</span>, | |
| gradient_accumulation_steps=<span class="hljs-number">2</span>, | |
| logging_steps=<span class="hljs-number">10</span>, | |
| ) | |
| <span class="hljs-comment"># 4. Inițializează și antrenează</span> | |
| trainer = GRPOTrainer( | |
| model=<span class="hljs-string">"modelul_tau"</span>, <span class="hljs-comment"># de exemplu "Qwen/Qwen2-0.5B-Instruct"</span> | |
| args=training_args, | |
| train_dataset=dataset, | |
| reward_funcs=reward_func, | |
| ) | |
| trainer.train()`,wrap:!1}}),Z=new C({props:{title:"Componentele Cheie",local:"componentele-cheie",headingTag:"h2"}}),V=new C({props:{title:"1. Formatul Setului de Date",local:"1-formatul-setului-de-date",headingTag:"h3"}}),k=new C({props:{title:"2. Funcția de Recompensă",local:"2-funcția-de-recompensă",headingTag:"h3"}}),W=new me({props:{code:"JTIzJTIwRXhlbXBsdWwlMjAxJTNBJTIwUmVjb21wZW5zJUM0JTgzJTIwYmF6YXQlQzQlODMlMjBwZSUyMGx1bmdpbWVhJTIwY29tcGxldCVDNCU4M3JpaSUwQWRlZiUyMHJld2FyZF9sZW5ndGgoY29tcGxldGlvbnMlMkMlMjAqKmt3YXJncyklM0ElMEElMjAlMjAlMjAlMjByZXR1cm4lMjAlNUJmbG9hdChsZW4oY29tcGxldGlvbikpJTIwZm9yJTIwY29tcGxldGlvbiUyMGluJTIwY29tcGxldGlvbnMlNUQlMEElMEElMEElMjMlMjBFeGVtcGx1bCUyMDIlM0ElMjBSZWNvbXBlbnMlQzQlODMlMjBiYXphdCVDNCU4MyUyMHBlJTIwcG90cml2aXJlYSUyMHVudWklMjBtb2RlbCUwQWltcG9ydCUyMHJlJTBBJTBBJTBBZGVmJTIwcmV3YXJkX2Zvcm1hdChjb21wbGV0aW9ucyUyQyUyMCoqa3dhcmdzKSUzQSUwQSUyMCUyMCUyMCUyMHBhdHRlcm4lMjAlM0QlMjByJTIyJTVFJTNDdGhpbmslM0UuKiUzRiUzQyUyRnRoaW5rJTNFJTNDYW5zd2VyJTNFLiolM0YlM0MlMkZhbnN3ZXIlM0UlMjQlMjIlMEElMjAlMjAlMjAlMjByZXR1cm4lMjAlNUIxLjAlMjBpZiUyMHJlLm1hdGNoKHBhdHRlcm4lMkMlMjBjKSUyMGVsc2UlMjAwLjAlMjBmb3IlMjBjJTIwaW4lMjBjb21wbGV0aW9ucyU1RA==",highlighted:`<span class="hljs-comment"># Exemplul 1: Recompensă bazată pe lungimea completării</span> | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">reward_length</span>(<span class="hljs-params">completions, **kwargs</span>): | |
| <span class="hljs-keyword">return</span> [<span class="hljs-built_in">float</span>(<span class="hljs-built_in">len</span>(completion)) <span class="hljs-keyword">for</span> completion <span class="hljs-keyword">in</span> completions] | |
| <span class="hljs-comment"># Exemplul 2: Recompensă bazată pe potrivirea unui model</span> | |
| <span class="hljs-keyword">import</span> re | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">reward_format</span>(<span class="hljs-params">completions, **kwargs</span>): | |
| pattern = <span class="hljs-string">r"^<think>.*?</think><answer>.*?</answer>$"</span> | |
| <span class="hljs-keyword">return</span> [<span class="hljs-number">1.0</span> <span class="hljs-keyword">if</span> re.<span class="hljs-keyword">match</span>(pattern, c) <span class="hljs-keyword">else</span> <span class="hljs-number">0.0</span> <span class="hljs-keyword">for</span> c <span class="hljs-keyword">in</span> completions]`,wrap:!1}}),X=new C({props:{title:"3. Configurația Antrenamentului",local:"3-configurația-antrenamentului",headingTag:"h3"}}),x=new me({props:{code:"dHJhaW5pbmdfYXJncyUyMCUzRCUyMEdSUE9Db25maWcoJTBBJTIwJTIwJTIwJTIwJTIzJTIwUGFyYW1ldHJpaSUyMGVzZW4lQzglOUJpYWxpJTBBJTIwJTIwJTIwJTIwb3V0cHV0X2RpciUzRCUyMm91dHB1dCUyMiUyQyUwQSUyMCUyMCUyMCUyMG51bV90cmFpbl9lcG9jaHMlM0QzJTJDJTBBJTIwJTIwJTIwJTIwbnVtX2dlbmVyYXRpb24lM0Q0JTJDJTIwJTIwJTIzJTIwTnVtJUM0JTgzcnVsJTIwZGUlMjBjb21wbGV0JUM0JTgzcmklMjBkZSUyMGdlbmVyYXQlMjBwZW50cnUlMjBmaWVjYXJlJTIwcHJvbXB0JTBBJTIwJTIwJTIwJTIwcGVyX2RldmljZV90cmFpbl9iYXRjaF9zaXplJTNENCUyQyUyMCUyMCUyMyUyMFZyZW0lMjBzJUM0JTgzJTIwb2IlQzglOUJpbmVtJTIwdG9hdGUlMjBnZW5lciVDNCU4M3JpbGUlMjAlQzMlQUVudHItdW4lMjBsb3QlMjBkZSUyMGRpc3Bveml0aXYlMEElMjAlMjAlMjAlMjAlMjMlMjBPcCVDOCU5QmlvbmFsJTIwZGFyJTIwdXRpbCUwQSUyMCUyMCUyMCUyMGdyYWRpZW50X2FjY3VtdWxhdGlvbl9zdGVwcyUzRDIlMkMlMEElMjAlMjAlMjAlMjBsZWFybmluZ19yYXRlJTNEMWUtNSUyQyUwQSUyMCUyMCUyMCUyMGxvZ2dpbmdfc3RlcHMlM0QxMCUyQyUwQSUyMCUyMCUyMCUyMCUyMyUyMFNwZWNpZmljJTIwR1JQTyUyMChvcCVDOCU5QmlvbmFsKSUwQSUyMCUyMCUyMCUyMHVzZV92bGxtJTNEVHJ1ZSUyQyUyMCUyMCUyMyUyMEFjY2VsZXJlYXolQzQlODMlMjBnZW5lcmFyZWElMEEp",highlighted:`training_args = GRPOConfig( | |
| <span class="hljs-comment"># Parametrii esențiali</span> | |
| output_dir=<span class="hljs-string">"output"</span>, | |
| num_train_epochs=<span class="hljs-number">3</span>, | |
| num_generation=<span class="hljs-number">4</span>, <span class="hljs-comment"># Numărul de completări de generat pentru fiecare prompt</span> | |
| per_device_train_batch_size=<span class="hljs-number">4</span>, <span class="hljs-comment"># Vrem să obținem toate generările într-un lot de dispozitiv</span> | |
| <span class="hljs-comment"># Opțional dar util</span> | |
| gradient_accumulation_steps=<span class="hljs-number">2</span>, | |
| learning_rate=<span class="hljs-number">1e-5</span>, | |
| logging_steps=<span class="hljs-number">10</span>, | |
| <span class="hljs-comment"># Specific GRPO (opțional)</span> | |
| use_vllm=<span class="hljs-literal">True</span>, <span class="hljs-comment"># Accelerează generarea</span> | |
| )`,wrap:!1}}),F=new C({props:{title:"Sfaturi pentru Succes",local:"sfaturi-pentru-succes",headingTag:"h2"}}),H=new C({props:{title:"Designul Funcției de Recompensă",local:"designul-funcției-de-recompensă",headingTag:"h2"}}),O=new C({props:{title:"1. Recompense Bazate pe Lungime",local:"1-recompense-bazate-pe-lungime",headingTag:"h3"}}),L=new me({props:{code:"ZGVmJTIwcmV3YXJkX2xlbihjb21wbGV0aW9ucyUyQyUyMCoqa3dhcmdzKSUzQSUwQSUyMCUyMCUyMCUyMGlkZWFsX2xlbmd0aCUyMCUzRCUyMDIwJTBBJTIwJTIwJTIwJTIwcmV0dXJuJTIwJTVCLWFicyhpZGVhbF9sZW5ndGglMjAtJTIwbGVuKGNvbXBsZXRpb24pKSUyMGZvciUyMGNvbXBsZXRpb24lMjBpbiUyMGNvbXBsZXRpb25zJTVE",highlighted:`<span class="hljs-keyword">def</span> <span class="hljs-title function_">reward_len</span>(<span class="hljs-params">completions, **kwargs</span>): | |
| ideal_length = <span class="hljs-number">20</span> | |
| <span class="hljs-keyword">return</span> [-<span class="hljs-built_in">abs</span>(ideal_length - <span class="hljs-built_in">len</span>(completion)) <span class="hljs-keyword">for</span> completion <span class="hljs-keyword">in</span> completions]`,wrap:!1}}),K=new C({props:{title:"2. Recompense Bazate pe Reguli pentru Sarcini Verificabile",local:"2-recompense-bazate-pe-reguli-pentru-sarcini-verificabile",headingTag:"h2"}}),le=new me({props:{code:"ZGVmJTIwcHJvYmxlbV9yZXdhcmQoY29tcGxldGlvbnMlMkMlMjBhbnN3ZXJzJTJDJTIwKiprd2FyZ3MpJTNBJTBBJTIwJTIwJTIwJTIwJTIyJTIyJTIyRnVuYyVDOCU5QmllJTIwZGUlMjByZWNvbXBlbnMlQzQlODMlMjBwZW50cnUlMjBwcm9ibGVtZSUyMGRlJTIwbWF0ZW1hdGljJUM0JTgzJTIwY3UlMjByJUM0JTgzc3B1bnN1cmklMjB2ZXJpZmljYWJpbGUlMEElMjAlMjAlMjAlMjBjb21wbGV0aW9ucyUzQSUyMGxpc3RhJTIwZGUlMjBjb21wbGV0JUM0JTgzcmklMjBkZSUyMGV2YWx1YXQlMEElMjAlMjAlMjAlMjBhbnN3ZXJzJTNBJTIwbGlzdGElMjBkZSUyMHIlQzQlODNzcHVuc3VyaSUyMGxhJTIwcHJvYmxlbWVsZSUyMGRpbiUyMHNldHVsJTIwZGUlMjBkYXRlJTBBJTIwJTIwJTIwJTIwJTIyJTIyJTIyJTBBJTBBJTIwJTIwJTIwJTIwcmV3YXJkcyUyMCUzRCUyMCU1QiU1RCUwQSUyMCUyMCUyMCUyMGZvciUyMGNvbXBsZXRpb24lMkMlMjBjb3JyZWN0X2Fuc3dlciUyMGluJTIwemlwKGNvbXBsZXRpb25zJTJDJTIwYW5zd2VycyklM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjMlMjBFeHRyYWdlJTIwciVDNCU4M3NwdW5zdWwlMjBkaW4lMjBjb21wbGV0YXJlJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwdHJ5JTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIzJTIwQWNlc3RhJTIwZXN0ZSUyMHVuJTIwZXhlbXBsdSUyMHNpbXBsaWZpY2F0JTIwLSUyMGFpJTIwYXZlYSUyMG5ldm9pZSUyMGRlJTIwcGFyc2luZyUyMGNvcmVzcHVueiVDNCU4M3RvciUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGFuc3dlciUyMCUzRCUyMGV4dHJhY3RfZmluYWxfYW5zd2VyKGNvbXBsZXRpb24pJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIzJTIwUmVjb21wZW5zJUM0JTgzJTIwYmluYXIlQzQlODMlM0ElMjAxJTIwcGVudHJ1JTIwY29yZWN0JTJDJTIwMCUyMHBlbnRydSUyMGluY29yZWN0JTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwcmV3YXJkJTIwJTNEJTIwMS4wJTIwaWYlMjBhbnN3ZXIlMjAlM0QlM0QlMjBjb3JyZWN0X2Fuc3dlciUyMGVsc2UlMjAwLjAlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjByZXdhcmRzLmFwcGVuZChyZXdhcmQpJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwZXhjZXB0JTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIzJTIwRGFjJUM0JTgzJTIwbnUlMjBwdXRlbSUyMHBhcnNhJTIwdW4lMjByJUM0JTgzc3B1bnMlMkMlMjBkJUM0JTgzbSUyMG8lMjByZWNvbXBlbnMlQzQlODMlMjBtaWMlQzQlODMlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjByZXdhcmRzLmFwcGVuZCgwLjApJTBBJTBBJTIwJTIwJTIwJTIwcmV0dXJuJTIwcmV3YXJkcw==",highlighted:`<span class="hljs-keyword">def</span> <span class="hljs-title function_">problem_reward</span>(<span class="hljs-params">completions, answers, **kwargs</span>): | |
| <span class="hljs-string">"""Funcție de recompensă pentru probleme de matematică cu răspunsuri verificabile | |
| completions: lista de completări de evaluat | |
| answers: lista de răspunsuri la problemele din setul de date | |
| """</span> | |
| rewards = [] | |
| <span class="hljs-keyword">for</span> completion, correct_answer <span class="hljs-keyword">in</span> <span class="hljs-built_in">zip</span>(completions, answers): | |
| <span class="hljs-comment"># Extrage răspunsul din completare</span> | |
| <span class="hljs-keyword">try</span>: | |
| <span class="hljs-comment"># Acesta este un exemplu simplificat - ai avea nevoie de parsing corespunzător</span> | |
| answer = extract_final_answer(completion) | |
| <span class="hljs-comment"># Recompensă binară: 1 pentru corect, 0 pentru incorect</span> | |
| reward = <span class="hljs-number">1.0</span> <span class="hljs-keyword">if</span> answer == correct_answer <span class="hljs-keyword">else</span> <span class="hljs-number">0.0</span> | |
| rewards.append(reward) | |
| <span class="hljs-keyword">except</span>: | |
| <span class="hljs-comment"># Dacă nu putem parsa un răspuns, dăm o recompensă mică</span> | |
| rewards.append(<span class="hljs-number">0.0</span>) | |
| <span class="hljs-keyword">return</span> rewards`,wrap:!1}}),te=new C({props:{title:"3. Recompense Bazate pe Format",local:"3-recompense-bazate-pe-format",headingTag:"h2"}}),se=new me({props:{code:"ZGVmJTIwZm9ybWF0X3Jld2FyZChjb21wbGV0aW9ucyUyQyUyMCoqa3dhcmdzKSUzQSUwQSUyMCUyMCUyMCUyMCUyMiUyMiUyMlJlY29tcGVuc2VheiVDNCU4MyUyMGNvbXBsZXQlQzQlODNyaWxlJTIwY2FyZSUyMHVybWVheiVDNCU4MyUyMGZvcm1hdHVsJTIwZG9yaXQlMjIlMjIlMjIlMEElMjAlMjAlMjAlMjAlMjMlMjBFeGVtcGx1JTNBJTIwVmVyaWZpYyVDNCU4MyUyMGRhYyVDNCU4MyUyMGNvbXBsZXRhcmVhJTIwdXJtZWF6JUM0JTgzJTIwdW4lMjBmb3JtYXQlMjBnJUMzJUEybmRlJUM4JTk5dGUtYXBvaS1yJUM0JTgzc3B1bmRlJTBBJTIwJTIwJTIwJTIwcGF0dGVybiUyMCUzRCUyMHIlMjIlM0N0aGluayUzRSguKiUzRiklM0MlMkZ0aGluayUzRSU1Q3MqJTNDYW5zd2VyJTNFKC4qJTNGKSUzQyUyRmFuc3dlciUzRSUyMiUwQSUwQSUyMCUyMCUyMCUyMHJld2FyZHMlMjAlM0QlMjAlNUIlNUQlMEElMjAlMjAlMjAlMjBmb3IlMjBjb21wbGV0aW9uJTIwaW4lMjBjb21wbGV0aW9ucyUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMG1hdGNoJTIwJTNEJTIwcmUuc2VhcmNoKHBhdHRlcm4lMkMlMjBjb21wbGV0aW9uJTJDJTIwcmUuRE9UQUxMKSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGlmJTIwbWF0Y2glM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjMlMjBWZXJpZmljJUM0JTgzJTIwZGFjJUM0JTgzJTIwZXhpc3QlQzQlODMlMjBjb24lQzglOUJpbnV0JTIwc3Vic3RhbiVDOCU5QmlhbCUyMCVDMyVBRW4lMjBhbWJlbGUlMjBzZWMlQzglOUJpdW5pJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwdGhpbmtfY29udGVudCUyMCUzRCUyMG1hdGNoLmdyb3VwKDEpLnN0cmlwKCklMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBhbnN3ZXJfY29udGVudCUyMCUzRCUyMG1hdGNoLmdyb3VwKDIpLnN0cmlwKCklMEElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBpZiUyMGxlbih0aGlua19jb250ZW50KSUyMCUzRSUyMDIwJTIwYW5kJTIwbGVuKGFuc3dlcl9jb250ZW50KSUyMCUzRSUyMDAlM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjByZXdhcmRzLmFwcGVuZCgxLjApJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwZWxzZSUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHJld2FyZHMuYXBwZW5kKCUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMDAuNSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCklMjAlMjAlMjMlMjBSZWNvbXBlbnMlQzQlODMlMjBwYXIlQzglOUJpYWwlQzQlODMlMjBwZW50cnUlMjBmb3JtYXQlMjBjb3JlY3QlMjBkYXIlMjBjb24lQzglOUJpbnV0JTIwbGltaXRhdCUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGVsc2UlM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjByZXdhcmRzLmFwcGVuZCgwLjApJTIwJTIwJTIzJTIwTmljaW8lMjByZWNvbXBlbnMlQzQlODMlMjBwZW50cnUlMjBmb3JtYXQlMjBpbmNvcmVjdCUwQSUwQSUyMCUyMCUyMCUyMHJldHVybiUyMHJld2FyZHM=",highlighted:`<span class="hljs-keyword">def</span> <span class="hljs-title function_">format_reward</span>(<span class="hljs-params">completions, **kwargs</span>): | |
| <span class="hljs-string">"""Recompensează completările care urmează formatul dorit"""</span> | |
| <span class="hljs-comment"># Exemplu: Verifică dacă completarea urmează un format gândește-apoi-răspunde</span> | |
| pattern = <span class="hljs-string">r"<think>(.*?)</think>\\s*<answer>(.*?)</answer>"</span> | |
| rewards = [] | |
| <span class="hljs-keyword">for</span> completion <span class="hljs-keyword">in</span> completions: | |
| <span class="hljs-keyword">match</span> = re.search(pattern, completion, re.DOTALL) | |
| <span class="hljs-keyword">if</span> <span class="hljs-keyword">match</span>: | |
| <span class="hljs-comment"># Verifică dacă există conținut substanțial în ambele secțiuni</span> | |
| think_content = <span class="hljs-keyword">match</span>.group(<span class="hljs-number">1</span>).strip() | |
| answer_content = <span class="hljs-keyword">match</span>.group(<span class="hljs-number">2</span>).strip() | |
| <span class="hljs-keyword">if</span> <span class="hljs-built_in">len</span>(think_content) > <span class="hljs-number">20</span> <span class="hljs-keyword">and</span> <span class="hljs-built_in">len</span>(answer_content) > <span class="hljs-number">0</span>: | |
| rewards.append(<span class="hljs-number">1.0</span>) | |
| <span class="hljs-keyword">else</span>: | |
| rewards.append( | |
| <span class="hljs-number">0.5</span> | |
| ) <span class="hljs-comment"># Recompensă parțială pentru format corect dar conținut limitat</span> | |
| <span class="hljs-keyword">else</span>: | |
| rewards.append(<span class="hljs-number">0.0</span>) <span class="hljs-comment"># Nicio recompensă pentru format incorect</span> | |
| <span class="hljs-keyword">return</span> rewards`,wrap:!1}}),ie=new C({props:{title:"Asta e tot!",local:"asta-e-tot",headingTag:"h2"}}),re=new Wl({props:{source:"https://github.com/huggingface/course/blob/main/chapters/rum/chapter12/4.mdx"}}),{c(){J=i("meta"),f=s(),T=i("p"),Me=s(),m(I.$$.fragment),ye=s(),h=i("p"),h.textContent=nl,we=s(),B=i("p"),B.textContent=il,Je=s(),m(b.$$.fragment),Ue=s(),g=i("p"),g.textContent=pl,je=s(),G=i("ul"),G.innerHTML=rl,de=s(),A=i("p"),A.textContent=Ml,Te=s(),z=i("ul"),z.innerHTML=ml,Ce=s(),R=i("p"),R.textContent=cl,be=s(),m($.$$.fragment),fe=s(),m(Z.$$.fragment),Ie=s(),m(V.$$.fragment),he=s(),Q=i("p"),Q.textContent=ul,Be=s(),m(k.$$.fragment),ge=s(),v=i("p"),v.textContent=ol,Ge=s(),m(W.$$.fragment),Ae=s(),m(X.$$.fragment),ze=s(),_=i("p"),_.innerHTML=yl,Re=s(),m(x.$$.fragment),$e=s(),S=i("p"),S.innerHTML=wl,Ze=s(),N=i("ul"),N.innerHTML=Jl,Ve=s(),E=i("p"),E.textContent=Ul,Qe=s(),m(F.$$.fragment),ke=s(),Y=i("ol"),Y.innerHTML=jl,ve=s(),m(H.$$.fragment),We=s(),D=i("p"),D.textContent=dl,Xe=s(),m(O.$$.fragment),_e=s(),P=i("p"),P.textContent=Tl,xe=s(),m(L.$$.fragment),Se=s(),q=i("p"),q.textContent=Cl,Ne=s(),U=i("iframe"),Ee=s(),m(K.$$.fragment),Fe=s(),ee=i("p"),ee.textContent=fl,Ye=s(),m(le.$$.fragment),He=s(),j=i("iframe"),De=s(),m(te.$$.fragment),Oe=s(),ae=i("p"),ae.textContent=hl,Pe=s(),m(se.$$.fragment),Le=s(),d=i("iframe"),qe=s(),ne=i("p"),ne.textContent=gl,Ke=s(),m(ie.$$.fragment),el=s(),pe=i("p"),pe.textContent=Gl,ll=s(),m(re.$$.fragment),tl=s(),ce=i("p"),this.h()},l(e){const l=Ql("svelte-u9bgzb",document.head);J=p(l,"META",{name:!0,content:!0}),l.forEach(t),f=n(e),T=p(e,"P",{}),ue(T).forEach(t),Me=n(e),c(I.$$.fragment,e),ye=n(e),h=p(e,"P",{"data-svelte-h":!0}),r(h)!=="svelte-1293vxv"&&(h.textContent=nl),we=n(e),B=p(e,"P",{"data-svelte-h":!0}),r(B)!=="svelte-na6ewk"&&(B.textContent=il),Je=n(e),c(b.$$.fragment,e),Ue=n(e),g=p(e,"P",{"data-svelte-h":!0}),r(g)!=="svelte-1kpbqds"&&(g.textContent=pl),je=n(e),G=p(e,"UL",{"data-svelte-h":!0}),r(G)!=="svelte-ez6ot5"&&(G.innerHTML=rl),de=n(e),A=p(e,"P",{"data-svelte-h":!0}),r(A)!=="svelte-1a3xk8g"&&(A.textContent=Ml),Te=n(e),z=p(e,"UL",{"data-svelte-h":!0}),r(z)!=="svelte-xpn84y"&&(z.innerHTML=ml),Ce=n(e),R=p(e,"P",{"data-svelte-h":!0}),r(R)!=="svelte-f204iz"&&(R.textContent=cl),be=n(e),c($.$$.fragment,e),fe=n(e),c(Z.$$.fragment,e),Ie=n(e),c(V.$$.fragment,e),he=n(e),Q=p(e,"P",{"data-svelte-h":!0}),r(Q)!=="svelte-j8bpdm"&&(Q.textContent=ul),Be=n(e),c(k.$$.fragment,e),ge=n(e),v=p(e,"P",{"data-svelte-h":!0}),r(v)!=="svelte-iw81bf"&&(v.textContent=ol),Ge=n(e),c(W.$$.fragment,e),Ae=n(e),c(X.$$.fragment,e),ze=n(e),_=p(e,"P",{"data-svelte-h":!0}),r(_)!=="svelte-1cxbitg"&&(_.innerHTML=yl),Re=n(e),c(x.$$.fragment,e),$e=n(e),S=p(e,"P",{"data-svelte-h":!0}),r(S)!=="svelte-zu7ra"&&(S.innerHTML=wl),Ze=n(e),N=p(e,"UL",{"data-svelte-h":!0}),r(N)!=="svelte-1al5l6o"&&(N.innerHTML=Jl),Ve=n(e),E=p(e,"P",{"data-svelte-h":!0}),r(E)!=="svelte-1rh1v9n"&&(E.textContent=Ul),Qe=n(e),c(F.$$.fragment,e),ke=n(e),Y=p(e,"OL",{"data-svelte-h":!0}),r(Y)!=="svelte-db4915"&&(Y.innerHTML=jl),ve=n(e),c(H.$$.fragment,e),We=n(e),D=p(e,"P",{"data-svelte-h":!0}),r(D)!=="svelte-6gbmmo"&&(D.textContent=dl),Xe=n(e),c(O.$$.fragment,e),_e=n(e),P=p(e,"P",{"data-svelte-h":!0}),r(P)!=="svelte-18bht15"&&(P.textContent=Tl),xe=n(e),c(L.$$.fragment,e),Se=n(e),q=p(e,"P",{"data-svelte-h":!0}),r(q)!=="svelte-1xyrfe1"&&(q.textContent=Cl),Ne=n(e),U=p(e,"IFRAME",{src:!0,title:!0,width:!0,height:!0,frameborder:!0,allow:!0}),ue(U).forEach(t),Ee=n(e),c(K.$$.fragment,e),Fe=n(e),ee=p(e,"P",{"data-svelte-h":!0}),r(ee)!=="svelte-11f5ljd"&&(ee.textContent=fl),Ye=n(e),c(le.$$.fragment,e),He=n(e),j=p(e,"IFRAME",{src:!0,title:!0,width:!0,height:!0,frameborder:!0,allow:!0}),ue(j).forEach(t),De=n(e),c(te.$$.fragment,e),Oe=n(e),ae=p(e,"P",{"data-svelte-h":!0}),r(ae)!=="svelte-15alen6"&&(ae.textContent=hl),Pe=n(e),c(se.$$.fragment,e),Le=n(e),d=p(e,"IFRAME",{src:!0,title:!0,width:!0,height:!0,frameborder:!0,allow:!0}),ue(d).forEach(t),qe=n(e),ne=p(e,"P",{"data-svelte-h":!0}),r(ne)!=="svelte-5dp67z"&&(ne.textContent=gl),Ke=n(e),c(ie.$$.fragment,e),el=n(e),pe=p(e,"P",{"data-svelte-h":!0}),r(pe)!=="svelte-11kcv9w"&&(pe.textContent=Gl),ll=n(e),c(re.$$.fragment,e),tl=n(e),ce=p(e,"P",{}),ue(ce).forEach(t),this.h()},h(){M(J,"name","hf:doc:metadata"),M(J,"content",xl),sl(U.src,bl="https://marimo.app/gh/huggingface/notebooks/main/e?entrypoint=course%2Fen%2Fchapter13%2Fgrpo_length.py&embed=true&show-chrome=false")||M(U,"src",bl),M(U,"title","Marimo Notebook"),M(U,"width","100%"),M(U,"height","800px"),M(U,"frameborder","0"),M(U,"allow","clipboard-write"),sl(j.src,Il="https://marimo.app/gh/huggingface/notebooks/main/e?entrypoint=course%2Fen%2Fchapter13%2Fgrpo_math.py&embed=true&show-chrome=false")||M(j,"src",Il),M(j,"title","Marimo Notebook"),M(j,"width","100%"),M(j,"height","800px"),M(j,"frameborder","0"),M(j,"allow","clipboard-write"),sl(d.src,Bl="https://marimo.app/gh/huggingface/notebooks/main/e?entrypoint=course%2Fen%2Fchapter13%2Fgrpo_format.py&embed=true&show-chrome=false")||M(d,"src",Bl),M(d,"title","Marimo Notebook"),M(d,"width","100%"),M(d,"height","800px"),M(d,"frameborder","0"),M(d,"allow","clipboard-write")},m(e,l){kl(document.head,J),a(e,f,l),a(e,T,l),a(e,Me,l),u(I,e,l),a(e,ye,l),a(e,h,l),a(e,we,l),a(e,B,l),a(e,Je,l),u(b,e,l),a(e,Ue,l),a(e,g,l),a(e,je,l),a(e,G,l),a(e,de,l),a(e,A,l),a(e,Te,l),a(e,z,l),a(e,Ce,l),a(e,R,l),a(e,be,l),u($,e,l),a(e,fe,l),u(Z,e,l),a(e,Ie,l),u(V,e,l),a(e,he,l),a(e,Q,l),a(e,Be,l),u(k,e,l),a(e,ge,l),a(e,v,l),a(e,Ge,l),u(W,e,l),a(e,Ae,l),u(X,e,l),a(e,ze,l),a(e,_,l),a(e,Re,l),u(x,e,l),a(e,$e,l),a(e,S,l),a(e,Ze,l),a(e,N,l),a(e,Ve,l),a(e,E,l),a(e,Qe,l),u(F,e,l),a(e,ke,l),a(e,Y,l),a(e,ve,l),u(H,e,l),a(e,We,l),a(e,D,l),a(e,Xe,l),u(O,e,l),a(e,_e,l),a(e,P,l),a(e,xe,l),u(L,e,l),a(e,Se,l),a(e,q,l),a(e,Ne,l),a(e,U,l),a(e,Ee,l),u(K,e,l),a(e,Fe,l),a(e,ee,l),a(e,Ye,l),u(le,e,l),a(e,He,l),a(e,j,l),a(e,De,l),u(te,e,l),a(e,Oe,l),a(e,ae,l),a(e,Pe,l),u(se,e,l),a(e,Le,l),a(e,d,l),a(e,qe,l),a(e,ne,l),a(e,Ke,l),u(ie,e,l),a(e,el,l),a(e,pe,l),a(e,ll,l),u(re,e,l),a(e,tl,l),a(e,ce,l),al=!0},p(e,[l]){const Al={};l&2&&(Al.$$scope={dirty:l,ctx:e}),b.$set(Al)},i(e){al||(o(I.$$.fragment,e),o(b.$$.fragment,e),o($.$$.fragment,e),o(Z.$$.fragment,e),o(V.$$.fragment,e),o(k.$$.fragment,e),o(W.$$.fragment,e),o(X.$$.fragment,e),o(x.$$.fragment,e),o(F.$$.fragment,e),o(H.$$.fragment,e),o(O.$$.fragment,e),o(L.$$.fragment,e),o(K.$$.fragment,e),o(le.$$.fragment,e),o(te.$$.fragment,e),o(se.$$.fragment,e),o(ie.$$.fragment,e),o(re.$$.fragment,e),al=!0)},o(e){y(I.$$.fragment,e),y(b.$$.fragment,e),y($.$$.fragment,e),y(Z.$$.fragment,e),y(V.$$.fragment,e),y(k.$$.fragment,e),y(W.$$.fragment,e),y(X.$$.fragment,e),y(x.$$.fragment,e),y(F.$$.fragment,e),y(H.$$.fragment,e),y(O.$$.fragment,e),y(L.$$.fragment,e),y(K.$$.fragment,e),y(le.$$.fragment,e),y(te.$$.fragment,e),y(se.$$.fragment,e),y(ie.$$.fragment,e),y(re.$$.fragment,e),al=!1},d(e){e&&(t(f),t(T),t(Me),t(ye),t(h),t(we),t(B),t(Je),t(Ue),t(g),t(je),t(G),t(de),t(A),t(Te),t(z),t(Ce),t(R),t(be),t(fe),t(Ie),t(he),t(Q),t(Be),t(ge),t(v),t(Ge),t(Ae),t(ze),t(_),t(Re),t($e),t(S),t(Ze),t(N),t(Ve),t(E),t(Qe),t(ke),t(Y),t(ve),t(We),t(D),t(Xe),t(_e),t(P),t(xe),t(Se),t(q),t(Ne),t(U),t(Ee),t(Fe),t(ee),t(Ye),t(He),t(j),t(De),t(Oe),t(ae),t(Pe),t(Le),t(d),t(qe),t(ne),t(Ke),t(el),t(pe),t(ll),t(tl),t(ce)),t(J),w(I,e),w(b,e),w($,e),w(Z,e),w(V,e),w(k,e),w(W,e),w(X,e),w(x,e),w(F,e),w(H,e),w(O,e),w(L,e),w(K,e),w(le,e),w(te,e),w(se,e),w(ie,e),w(re,e)}}}const xl='{"title":"Implementarea GRPO în TRL","local":"implementarea-grpo-în-trl","sections":[{"title":"Componentele Cheie","local":"componentele-cheie","sections":[{"title":"1. Formatul Setului de Date","local":"1-formatul-setului-de-date","sections":[],"depth":3},{"title":"2. Funcția de Recompensă","local":"2-funcția-de-recompensă","sections":[],"depth":3},{"title":"3. Configurația Antrenamentului","local":"3-configurația-antrenamentului","sections":[],"depth":3}],"depth":2},{"title":"Sfaturi pentru Succes","local":"sfaturi-pentru-succes","sections":[],"depth":2},{"title":"Designul Funcției de Recompensă","local":"designul-funcției-de-recompensă","sections":[{"title":"1. Recompense Bazate pe Lungime","local":"1-recompense-bazate-pe-lungime","sections":[],"depth":3}],"depth":2},{"title":"2. Recompense Bazate pe Reguli pentru Sarcini Verificabile","local":"2-recompense-bazate-pe-reguli-pentru-sarcini-verificabile","sections":[],"depth":2},{"title":"3. Recompense Bazate pe Format","local":"3-recompense-bazate-pe-format","sections":[],"depth":2},{"title":"Asta e tot!","local":"asta-e-tot","sections":[],"depth":2}],"depth":1}';function Sl(oe){return Rl(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class Dl extends Zl{constructor(J){super(),Vl(this,J,Sl,_l,zl,{})}}export{Dl as component}; | |
Xet Storage Details
- Size:
- 31.5 kB
- Xet hash:
- dbee27f8febef11933b3d4306dfb57c15d98c3e4b7c2f406b338537f99bff523
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.