Buckets:

rtrm's picture
download
raw
57.8 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Ejercicio práctico: GRPO con Unsloth&quot;,&quot;local&quot;:&quot;ejercicio-práctico-grpo-con-unsloth&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Instala las dependencias&quot;,&quot;local&quot;:&quot;instala-las-dependencias&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Configura Unsloth&quot;,&quot;local&quot;:&quot;configura-unsloth&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Preparación de datos&quot;,&quot;local&quot;:&quot;preparación-de-datos&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Definición de funciones de recompensa&quot;,&quot;local&quot;:&quot;definición-de-funciones-de-recompensa&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Entrenamiento con GRPO&quot;,&quot;local&quot;:&quot;entrenamiento-con-grpo&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Prueba del modelo&quot;,&quot;local&quot;:&quot;prueba-del-modelo&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Guardar el modelo&quot;,&quot;local&quot;:&quot;guardar-el-modelo&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Subirlo al Hugging Face Hub&quot;,&quot;local&quot;:&quot;subirlo-al-hugging-face-hub&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Conclusión&quot;,&quot;local&quot;:&quot;conclusión&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/course/pr_1213/es/_app/immutable/assets/0.e3b0c442.css" rel="stylesheet">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/entry/start.36d27295.js">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/chunks/scheduler.505acc25.js">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/chunks/singletons.6865fa96.js">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/chunks/index.001f95d5.js">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/chunks/paths.ec28c642.js">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/entry/app.3b43d7f3.js">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/chunks/preload-helper.8c2bab6b.js">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/chunks/index.e22abd30.js">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/nodes/0.e2c0ea78.js">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/nodes/34.cbaa1a07.js">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.a144e953.js">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/chunks/CodeBlock.f6688f67.js">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/chunks/CourseFloatingBanner.f0a2dc21.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Ejercicio práctico: GRPO con Unsloth&quot;,&quot;local&quot;:&quot;ejercicio-práctico-grpo-con-unsloth&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Instala las dependencias&quot;,&quot;local&quot;:&quot;instala-las-dependencias&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Configura Unsloth&quot;,&quot;local&quot;:&quot;configura-unsloth&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Preparación de datos&quot;,&quot;local&quot;:&quot;preparación-de-datos&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Definición de funciones de recompensa&quot;,&quot;local&quot;:&quot;definición-de-funciones-de-recompensa&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Entrenamiento con GRPO&quot;,&quot;local&quot;:&quot;entrenamiento-con-grpo&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Prueba del modelo&quot;,&quot;local&quot;:&quot;prueba-del-modelo&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Guardar el modelo&quot;,&quot;local&quot;:&quot;guardar-el-modelo&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Subirlo al Hugging Face Hub&quot;,&quot;local&quot;:&quot;subirlo-al-hugging-face-hub&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Conclusión&quot;,&quot;local&quot;:&quot;conclusión&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="flex space-x-1 absolute z-10 right-0 top-0" style=""><a href="https://discuss.huggingface.co/t/chapter-2-questions" target="_blank"><img alt="Ask a Question" class="!m-0" 
src="https://img.shields.io/badge/Ask%20a%20question-ffcb4c.svg?logo=data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgLTEgMTA0IDEwNiI+PGRlZnM+PHN0eWxlPi5jbHMtMXtmaWxsOiMyMzFmMjA7fS5jbHMtMntmaWxsOiNmZmY5YWU7fS5jbHMtM3tmaWxsOiMwMGFlZWY7fS5jbHMtNHtmaWxsOiMwMGE5NGY7fS5jbHMtNXtmaWxsOiNmMTVkMjI7fS5jbHMtNntmaWxsOiNlMzFiMjM7fTwvc3R5bGU+PC9kZWZzPjx0aXRsZT5EaXNjb3Vyc2VfbG9nbzwvdGl0bGU+PGcgaWQ9IkxheWVyXzIiPjxnIGlkPSJMYXllcl8zIj48cGF0aCBjbGFzcz0iY2xzLTEiIGQ9Ik01MS44NywwQzIzLjcxLDAsMCwyMi44MywwLDUxYzAsLjkxLDAsNTIuODEsMCw1Mi44MWw1MS44Ni0uMDVjMjguMTYsMCw1MS0yMy43MSw1MS01MS44N1M4MCwwLDUxLjg3LDBaIi8+PHBhdGggY2xhc3M9ImNscy0yIiBkPSJNNTIuMzcsMTkuNzRBMzEuNjIsMzEuNjIsMCwwLDAsMjQuNTgsNjYuNDFsLTUuNzIsMTguNEwzOS40LDgwLjE3YTMxLjYxLDMxLjYxLDAsMSwwLDEzLTYwLjQzWiIvPjxwYXRoIGNsYXNzPSJjbHMtMyIgZD0iTTc3LjQ1LDMyLjEyYTMxLjYsMzEuNiwwLDAsMS0zOC4wNSw0OEwxOC44Niw4NC44MmwyMC45MS0yLjQ3QTMxLjYsMzEuNiwwLDAsMCw3Ny40NSwzMi4xMloiLz48cGF0aCBjbGFzcz0iY2xzLTQiIGQ9Ik03MS42MywyNi4yOUEzMS42LDMxLjYsMCwwLDEsMzguOCw3OEwxOC44Niw4NC44MiwzOS40LDgwLjE3QTMxLjYsMzEuNiwwLDAsMCw3MS42MywyNi4yOVoiLz48cGF0aCBjbGFzcz0iY2xzLTUiIGQ9Ik0yNi40Nyw2Ny4xMWEzMS42MSwzMS42MSwwLDAsMSw1MS0zNUEzMS42MSwzMS42MSwwLDAsMCwyNC41OCw2Ni40MWwtNS43MiwxOC40WiIvPjxwYXRoIGNsYXNzPSJjbHMtNiIgZD0iTTI0LjU4LDY2LjQxQTMxLjYxLDMxLjYxLDAsMCwxLDcxLjYzLDI2LjI5YTMxLjYxLDMxLjYxLDAsMCwwLTQ5LDM5LjYzbC0zLjc2LDE4LjlaIi8+PC9nPjwvZz48L3N2Zz4="></a> <a href="https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/HuggingFace%20Course-Gemma3_(1B)-GRPO.ipynb" target="_blank"><img alt="Open In Colab" class="!m-0" src="https://colab.research.google.com/assets/colab-badge.svg"></a> </div> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm 
font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="ejercicio-práctico-grpo-con-unsloth" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#ejercicio-práctico-grpo-con-unsloth"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" 
height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Ejercicio práctico: GRPO con Unsloth</span></h1> <p data-svelte-h="svelte-ib6vz7">En este ejercicio, ajustarás un modelo con GRPO (Group Relative Policy Optimization) usando Unsloth para mejorar sus capacidades de razonamiento. Vimos GRPO en el <a href="/course/chapter3/3">Capítulo 3</a>.</p> <p data-svelte-h="svelte-szg2yv">Unsloth es una librería que acelera el ajuste fino de LLM, lo que permite entrenar modelos más rápido y con menos recursos computacionales. Unsloth se integra con TRL, así que construiremos sobre lo visto en las secciones anteriores y lo adaptaremos a las particularidades de Unsloth.</p> <blockquote class="tip" data-svelte-h="svelte-1k4y4m2"><p>Este ejercicio puede ejecutarse en una GPU T4 gratuita de Google Colab. 
Para obtener la mejor experiencia, sigue el notebook enlazado arriba y pruébalo tú mismo.</p></blockquote> <h2 class="relative group"><a id="instala-las-dependencias" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#instala-las-dependencias"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Instala las dependencias</span></h2> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity 
bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pip install unsloth vllm
pip install --upgrade pillow<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="configura-unsloth" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#configura-unsloth"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Configura Unsloth</span></h2> <p data-svelte-h="svelte-ekj75n">Unsloth proporciona una clase, <code>FastLanguageModel</code>, que integra <code>transformers</code> con las optimizaciones de Unsloth. 
Importémosla:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> unsloth <span class="hljs-keyword">import</span> FastLanguageModel<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-drcz3v">Ahora carguemos el modelo Gemma 3 1B Instruct de Google y configurémoslo para el ajuste:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path 
d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> unsloth <span class="hljs-keyword">import</span> FastLanguageModel
<span class="hljs-keyword">import</span> torch
max_seq_length = <span class="hljs-number">1024</span> <span class="hljs-comment"># Can increase for longer reasoning traces</span>
lora_rank = <span class="hljs-number">32</span> <span class="hljs-comment"># Larger rank = smarter, but slower</span>
model, tokenizer = FastLanguageModel.from_pretrained(
model_name=<span class="hljs-string">&quot;google/gemma-3-1b-it&quot;</span>,
max_seq_length=max_seq_length,
load_in_4bit=<span class="hljs-literal">True</span>, <span class="hljs-comment"># False for LoRA 16bit</span>
fast_inference=<span class="hljs-literal">True</span>, <span class="hljs-comment"># Enable vLLM fast inference</span>
max_lora_rank=lora_rank,
gpu_memory_utilization=<span class="hljs-number">0.6</span>, <span class="hljs-comment"># Reduce if out of memory</span>
)
model = FastLanguageModel.get_peft_model(
model,
r=lora_rank,
target_modules=[
<span class="hljs-string">&quot;q_proj&quot;</span>,
<span class="hljs-string">&quot;k_proj&quot;</span>,
<span class="hljs-string">&quot;v_proj&quot;</span>,
<span class="hljs-string">&quot;o_proj&quot;</span>,
<span class="hljs-string">&quot;gate_proj&quot;</span>,
<span class="hljs-string">&quot;up_proj&quot;</span>,
<span class="hljs-string">&quot;down_proj&quot;</span>,
],
lora_alpha=lora_rank,
use_gradient_checkpointing=<span class="hljs-string">&quot;unsloth&quot;</span>,
random_state=<span class="hljs-number">3407</span>,
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1oknsf3">Este código carga el modelo con cuantización de 4 bits para ahorrar memoria y aplica LoRA para un ajuste eficiente.</p> <blockquote class="tip" data-svelte-h="svelte-o0kcen"><p>No cubriremos los detalles de LoRA en este capítulo, pero puedes aprender más en el <a href="/course/chapter11/3">Capítulo 11</a>.</p></blockquote> <h2 class="relative group"><a id="preparación-de-datos" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#preparación-de-datos"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Preparación de datos</span></h2> <p data-svelte-h="svelte-1wutod0">Para este ejercicio usaremos el dataset GSM8K, que contiene problemas de matemáticas de primaria. 
Daremos formato a los datos para animar al modelo a mostrar su razonamiento antes de dar una respuesta.</p> <p data-svelte-h="svelte-s2ahcq">Primero definiremos el formato de los prompts y las respuestas:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Define the system prompt that instructs the model to use a specific format</span>
SYSTEM_PROMPT = <span class="hljs-string">&quot;&quot;&quot;
Respond in the following format:
&lt;reasoning&gt;
...
&lt;/reasoning&gt;
&lt;answer&gt;
...
&lt;/answer&gt;
&quot;&quot;&quot;</span>
XML_COT_FORMAT = <span class="hljs-string">&quot;&quot;&quot;\
&lt;reasoning&gt;
{reasoning}
&lt;/reasoning&gt;
&lt;answer&gt;
{answer}
&lt;/answer&gt;
&quot;&quot;&quot;</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-9evkwo">Ahora preparemos el dataset:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> re
<span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset, Dataset
<span class="hljs-keyword">def</span> <span class="hljs-title function_">extract_xml_answer</span>(<span class="hljs-params">text: <span class="hljs-built_in">str</span></span>) -&gt; <span class="hljs-built_in">str</span>:
answer = text.split(<span class="hljs-string">&quot;&lt;answer&gt;&quot;</span>)[-<span class="hljs-number">1</span>]
answer = answer.split(<span class="hljs-string">&quot;&lt;/answer&gt;&quot;</span>)[<span class="hljs-number">0</span>]
<span class="hljs-keyword">return</span> answer.strip()
<span class="hljs-keyword">def</span> <span class="hljs-title function_">extract_hash_answer</span>(<span class="hljs-params">text: <span class="hljs-built_in">str</span></span>) -&gt; <span class="hljs-built_in">str</span> | <span class="hljs-literal">None</span>:
<span class="hljs-keyword">if</span> <span class="hljs-string">&quot;####&quot;</span> <span class="hljs-keyword">not</span> <span class="hljs-keyword">in</span> text:
<span class="hljs-keyword">return</span> <span class="hljs-literal">None</span>
<span class="hljs-keyword">return</span> text.split(<span class="hljs-string">&quot;####&quot;</span>)[<span class="hljs-number">1</span>].strip()
<span class="hljs-keyword">def</span> <span class="hljs-title function_">get_gsm8k_questions</span>(<span class="hljs-params">split=<span class="hljs-string">&quot;train&quot;</span></span>) -&gt; Dataset:
data = load_dataset(<span class="hljs-string">&quot;openai/gsm8k&quot;</span>, <span class="hljs-string">&quot;main&quot;</span>)[split]
data = data.<span class="hljs-built_in">map</span>(
<span class="hljs-keyword">lambda</span> x: {
<span class="hljs-string">&quot;prompt&quot;</span>: [
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;system&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: SYSTEM_PROMPT},
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: x[<span class="hljs-string">&quot;question&quot;</span>]},
],
<span class="hljs-string">&quot;answer&quot;</span>: extract_hash_answer(x[<span class="hljs-string">&quot;answer&quot;</span>]),
}
)
<span class="hljs-keyword">return</span> data
dataset = get_gsm8k_questions()<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="definición-de-funciones-de-recompensa" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#definición-de-funciones-de-recompensa"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Definición de funciones de recompensa</span></h2> <p data-svelte-h="svelte-1w1v2f5">Como comentamos en <a href="/course/chapter12/4">una página anterior</a>, GRPO puede usar funciones de recompensa para guiar el aprendizaje según criterios verificables como longitud y formato.</p> <p data-svelte-h="svelte-gmxi8d">En este ejercicio definiremos varias funciones de recompensa que fomentan distintos aspectos de un buen razonamiento.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" 
preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">correctness_reward_func</span>(<span class="hljs-params">prompts, completions, answer, **kwargs</span>) -&gt; <span class="hljs-built_in">list</span>[<span class="hljs-built_in">float</span>]:
responses = [completion[<span class="hljs-number">0</span>][<span class="hljs-string">&quot;content&quot;</span>] <span class="hljs-keyword">for</span> completion <span class="hljs-keyword">in</span> completions]
q = prompts[<span class="hljs-number">0</span>][-<span class="hljs-number">1</span>][<span class="hljs-string">&quot;content&quot;</span>]
extracted_responses = [extract_xml_answer(r) <span class="hljs-keyword">for</span> r <span class="hljs-keyword">in</span> responses]
<span class="hljs-built_in">print</span>(
<span class="hljs-string">&quot;-&quot;</span> * <span class="hljs-number">20</span>,
<span class="hljs-string">f&quot;Question:\n<span class="hljs-subst">{q}</span>&quot;</span>,
<span class="hljs-string">f&quot;\nAnswer:\n<span class="hljs-subst">{answer[<span class="hljs-number">0</span>]}</span>&quot;</span>,
<span class="hljs-string">f&quot;\nResponse:\n<span class="hljs-subst">{responses[<span class="hljs-number">0</span>]}</span>&quot;</span>,
<span class="hljs-string">f&quot;\nExtracted:\n<span class="hljs-subst">{extracted_responses[<span class="hljs-number">0</span>]}</span>&quot;</span>,
)
<span class="hljs-keyword">return</span> [<span class="hljs-number">2.0</span> <span class="hljs-keyword">if</span> r == a <span class="hljs-keyword">else</span> <span class="hljs-number">0.0</span> <span class="hljs-keyword">for</span> r, a <span class="hljs-keyword">in</span> <span class="hljs-built_in">zip</span>(extracted_responses, answer)]
<span class="hljs-keyword">def</span> <span class="hljs-title function_">int_reward_func</span>(<span class="hljs-params">completions, **kwargs</span>) -&gt; <span class="hljs-built_in">list</span>[<span class="hljs-built_in">float</span>]:
responses = [completion[<span class="hljs-number">0</span>][<span class="hljs-string">&quot;content&quot;</span>] <span class="hljs-keyword">for</span> completion <span class="hljs-keyword">in</span> completions]
extracted_responses = [extract_xml_answer(r) <span class="hljs-keyword">for</span> r <span class="hljs-keyword">in</span> responses]
<span class="hljs-keyword">return</span> [<span class="hljs-number">0.5</span> <span class="hljs-keyword">if</span> r.isdigit() <span class="hljs-keyword">else</span> <span class="hljs-number">0.0</span> <span class="hljs-keyword">for</span> r <span class="hljs-keyword">in</span> extracted_responses]
<span class="hljs-keyword">def</span> <span class="hljs-title function_">strict_format_reward_func</span>(<span class="hljs-params">completions, **kwargs</span>) -&gt; <span class="hljs-built_in">list</span>[<span class="hljs-built_in">float</span>]:
pattern = <span class="hljs-string">r&quot;^&lt;reasoning&gt;\n.*?\n&lt;/reasoning&gt;\n&lt;answer&gt;\n.*?\n&lt;/answer&gt;\n$&quot;</span>
responses = [completion[<span class="hljs-number">0</span>][<span class="hljs-string">&quot;content&quot;</span>] <span class="hljs-keyword">for</span> completion <span class="hljs-keyword">in</span> completions]
matches = [re.<span class="hljs-keyword">match</span>(pattern, r) <span class="hljs-keyword">for</span> r <span class="hljs-keyword">in</span> responses]
<span class="hljs-keyword">return</span> [<span class="hljs-number">0.5</span> <span class="hljs-keyword">if</span> <span class="hljs-keyword">match</span> <span class="hljs-keyword">else</span> <span class="hljs-number">0.0</span> <span class="hljs-keyword">for</span> <span class="hljs-keyword">match</span> <span class="hljs-keyword">in</span> matches]
<span class="hljs-keyword">def</span> <span class="hljs-title function_">soft_format_reward_func</span>(<span class="hljs-params">completions, **kwargs</span>) -&gt; <span class="hljs-built_in">list</span>[<span class="hljs-built_in">float</span>]:
    <span class="hljs-string">&quot;&quot;&quot;Return 0.5 for each completion that starts with a reasoning block
    followed (after optional whitespace) by an answer block, else 0.0.

    Looser than the strict check: tags do not need to sit on their own lines.
    &quot;&quot;&quot;</span>
    pattern = <span class="hljs-string">r&quot;&lt;reasoning&gt;.*?&lt;/reasoning&gt;\s*&lt;answer&gt;.*?&lt;/answer&gt;&quot;</span>
    responses = [completion[<span class="hljs-number">0</span>][<span class="hljs-string">&quot;content&quot;</span>] <span class="hljs-keyword">for</span> completion <span class="hljs-keyword">in</span> completions]
    <span class="hljs-comment"># re.DOTALL lets &quot;.&quot; match newlines, so tag contents may span several lines.</span>
    matches = [re.<span class="hljs-keyword">match</span>(pattern, r, flags=re.DOTALL) <span class="hljs-keyword">for</span> r <span class="hljs-keyword">in</span> responses]
    <span class="hljs-keyword">return</span> [<span class="hljs-number">0.5</span> <span class="hljs-keyword">if</span> <span class="hljs-keyword">match</span> <span class="hljs-keyword">else</span> <span class="hljs-number">0.0</span> <span class="hljs-keyword">for</span> <span class="hljs-keyword">match</span> <span class="hljs-keyword">in</span> matches]
<span class="hljs-keyword">def</span> <span class="hljs-title function_">count_xml</span>(<span class="hljs-params">text</span>) -&gt; <span class="hljs-built_in">float</span>:
count = <span class="hljs-number">0.0</span>
<span class="hljs-keyword">if</span> text.count(<span class="hljs-string">&quot;&lt;reasoning&gt;\n&quot;</span>) == <span class="hljs-number">1</span>:
count += <span class="hljs-number">0.125</span>
<span class="hljs-keyword">if</span> text.count(<span class="hljs-string">&quot;\n&lt;/reasoning&gt;\n&quot;</span>) == <span class="hljs-number">1</span>:
count += <span class="hljs-number">0.125</span>
<span class="hljs-keyword">if</span> text.count(<span class="hljs-string">&quot;\n&lt;answer&gt;\n&quot;</span>) == <span class="hljs-number">1</span>:
count += <span class="hljs-number">0.125</span>
count -= <span class="hljs-built_in">len</span>(text.split(<span class="hljs-string">&quot;\n&lt;/answer&gt;\n&quot;</span>)[-<span class="hljs-number">1</span>]) * <span class="hljs-number">0.001</span>
<span class="hljs-keyword">if</span> text.count(<span class="hljs-string">&quot;\n&lt;/answer&gt;&quot;</span>) == <span class="hljs-number">1</span>:
count += <span class="hljs-number">0.125</span>
count -= (<span class="hljs-built_in">len</span>(text.split(<span class="hljs-string">&quot;\n&lt;/answer&gt;&quot;</span>)[-<span class="hljs-number">1</span>]) - <span class="hljs-number">1</span>) * <span class="hljs-number">0.001</span>
<span class="hljs-keyword">return</span> count
<span class="hljs-keyword">def</span> <span class="hljs-title function_">xmlcount_reward_func</span>(<span class="hljs-params">completions, **kwargs</span>) -&gt; <span class="hljs-built_in">list</span>[<span class="hljs-built_in">float</span>]:
contents = [completion[<span class="hljs-number">0</span>][<span class="hljs-string">&quot;content&quot;</span>] <span class="hljs-keyword">for</span> completion <span class="hljs-keyword">in</span> completions]
<span class="hljs-keyword">return</span> [count_xml(c) <span class="hljs-keyword">for</span> c <span class="hljs-keyword">in</span> contents]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1qij65n">Estas funciones de recompensa cumplen propósitos distintos:</p> <table data-svelte-h="svelte-ju1rmf"><thead><tr><th>Función de recompensa</th> <th>Propósito</th></tr></thead> <tbody><tr><td><code>correctness_reward_func</code></td> <td>Recompensa al modelo cuando su respuesta coincide con la respuesta correcta</td></tr> <tr><td><code>int_reward_func</code></td> <td>Recompensa al modelo por dar una respuesta numérica</td></tr> <tr><td><code>strict_format_reward_func</code> y <code>soft_format_reward_func</code></td> <td>Recompensan al modelo por seguir el formato especificado</td></tr> <tr><td><code>xmlcount_reward_func</code></td> <td>Recompensa el uso correcto de etiquetas XML y penaliza contenido extra</td></tr></tbody></table> <h2 class="relative group"><a id="entrenamiento-con-grpo" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#entrenamiento-con-grpo"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Entrenamiento con 
GRPO</span></h2> <p data-svelte-h="svelte-1tc5dlw">Ahora configuraremos <code>GRPOTrainer</code> con nuestro modelo, tokenizador y funciones de recompensa.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> GRPOConfig, GRPOTrainer
max_prompt_length = <span class="hljs-number">256</span>
training_args = GRPOConfig(
learning_rate=<span class="hljs-number">5e-6</span>,
adam_beta1=<span class="hljs-number">0.9</span>,
adam_beta2=<span class="hljs-number">0.99</span>,
weight_decay=<span class="hljs-number">0.1</span>,
warmup_ratio=<span class="hljs-number">0.1</span>,
lr_scheduler_type=<span class="hljs-string">&quot;cosine&quot;</span>,
optim=<span class="hljs-string">&quot;paged_adamw_8bit&quot;</span>,
logging_steps=<span class="hljs-number">1</span>,
per_device_train_batch_size=<span class="hljs-number">1</span>,
gradient_accumulation_steps=<span class="hljs-number">1</span>,
num_generations=<span class="hljs-number">6</span>,
max_prompt_length=max_prompt_length,
max_completion_length=max_seq_length - max_prompt_length,
max_steps=<span class="hljs-number">250</span>,
save_steps=<span class="hljs-number">250</span>,
max_grad_norm=<span class="hljs-number">0.1</span>,
report_to=<span class="hljs-string">&quot;none&quot;</span>,
output_dir=<span class="hljs-string">&quot;outputs&quot;</span>,
)
trainer = GRPOTrainer(
model=model,
processing_class=tokenizer,
reward_funcs=[
xmlcount_reward_func,
soft_format_reward_func,
strict_format_reward_func,
int_reward_func,
correctness_reward_func,
],
args=training_args,
train_dataset=dataset,
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1jbvyj0">La <code>GRPOConfig</code> establece varios hiperparámetros para el entrenamiento, entre ellos <code>learning_rate</code>, <code>num_generations</code> y <code>max_steps</code>.</p> <p data-svelte-h="svelte-1xpy9fj">Ahora iniciemos el entrenamiento:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->trainer.train()<!-- HTML_TAG_END --></pre></div> <blockquote class="warning" data-svelte-h="svelte-uqayhr"><p>El entrenamiento puede tardar un tiempo. 
Puede que no veas mejoras inmediatas en las recompensas; a veces hacen falta entre 150 y 200 pasos.</p></blockquote> <h2 class="relative group"><a id="prueba-del-modelo" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#prueba-del-modelo"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Prueba del modelo</span></h2> <p data-svelte-h="svelte-11w1s2p">Después del entrenamiento, guardemos primero los pesos de LoRA:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" 
height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->model.save_lora(<span class="hljs-string">&quot;grpo_saved_lora&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-baytq2">Ahora probemos el modelo con una pregunta nueva:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> vllm <span class="hljs-keyword">import</span> SamplingParams
text = tokenizer.apply_chat_template(
[
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;system&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: SYSTEM_PROMPT},
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;Calculate pi.&quot;</span>},
],
tokenize=<span class="hljs-literal">False</span>,
add_generation_prompt=<span class="hljs-literal">True</span>,
)
sampling_params = SamplingParams(
temperature=<span class="hljs-number">0.8</span>,
top_p=<span class="hljs-number">0.95</span>,
max_tokens=<span class="hljs-number">1024</span>,
)
output = (
model.fast_generate(
text,
sampling_params=sampling_params,
lora_request=model.load_lora(<span class="hljs-string">&quot;grpo_saved_lora&quot;</span>),
)[<span class="hljs-number">0</span>]
.outputs[<span class="hljs-number">0</span>]
.text
)
<span class="hljs-built_in">print</span>(output)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1b8y4wq">Deberías ver que el modelo ahora sigue el formato especificado y muestra su razonamiento antes de dar una respuesta.</p> <h2 class="relative group"><a id="guardar-el-modelo" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#guardar-el-modelo"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Guardar el modelo</span></h2> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect 
fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->model.save_pretrained_merged(<span class="hljs-string">&quot;model&quot;</span>, tokenizer, save_method=<span class="hljs-string">&quot;merged_16bit&quot;</span>)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="subirlo-al-hugging-face-hub" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#subirlo-al-hugging-face-hub"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Subirlo al Hugging Face Hub</span></h2> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 
ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->model.push_to_hub_merged(
<span class="hljs-string">&quot;your-username/model-name&quot;</span>, tokenizer, save_method=<span class="hljs-string">&quot;merged_16bit&quot;</span>, token=<span class="hljs-string">&quot;your-token&quot;</span>
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1dqm68w">Unsloth también permite guardar en formato GGUF para usarlo con <code>llama.cpp</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->model.push_to_hub_gguf(
<span class="hljs-string">&quot;your-username/model-name&quot;</span>,
tokenizer,
quantization_method=[<span class="hljs-string">&quot;q4_k_m&quot;</span>, <span class="hljs-string">&quot;q8_0&quot;</span>, <span class="hljs-string">&quot;q5_k_m&quot;</span>],
token=<span class="hljs-string">&quot;your-token&quot;</span>,
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1p6tjpa">Los archivos GGUF pueden usarse con <code>llama.cpp</code> o con interfaces como Jan u Open WebUI.</p> <h2 class="relative group"><a id="conclusión" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#conclusión"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Conclusión</span></h2> <p data-svelte-h="svelte-zguqtb">En este ejercicio aprendiste a:</p> <ol data-svelte-h="svelte-uxq10x"><li>Configurar Unsloth para un ajuste acelerado.</li> <li>Preparar datos para entrenamiento con GRPO.</li> <li>Definir funciones de recompensa para guiar el aprendizaje.</li> <li>Entrenar un modelo usando GRPO.</li> <li>Probar el modelo ajustado.</li> <li>Guardar el modelo en varios formatos.</li></ol> <p data-svelte-h="svelte-luv4pd">GRPO es una técnica potente para alinear modelos de lenguaje con comportamientos específicos, y Unsloth hace que resulte más accesible incluso en hardware limitado.</p> <p data-svelte-h="svelte-mnskwp">Para obtener más información y recursos, consulta:</p> <ul data-svelte-h="svelte-1gnqsv0"><li><a href="https://docs.unsloth.ai/" 
rel="nofollow">Documentación de Unsloth</a></li> <li><a href="https://discord.gg/unsloth" rel="nofollow">Discord de Unsloth</a></li> <li><a href="https://github.com/unslothai/unsloth" rel="nofollow">GitHub de Unsloth</a></li></ul> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/course/blob/main/chapters/es/chapter12/6.mdx" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>
<script>
// Client bootstrap: publishes the path configuration, then imports the two
// entry modules and starts the app on the element that contains this script.
{
// Assigned without a declaration, so in this classic (non-module) script the
// object ends up on the global object rather than being scoped to this block.
__sveltekit_1nznq34 = {
assets: "/docs/course/pr_1213/es",
base: "/docs/course/pr_1213/es",
env: {}
};
// Parent of the currently executing script tag; handed to kit.start below.
const element = document.currentScript.parentElement;
const data = [null,null];
// Both dynamic imports are issued together; kit.start runs only once both resolve.
Promise.all([
import("/docs/course/pr_1213/es/_app/immutable/entry/start.36d27295.js"),
import("/docs/course/pr_1213/es/_app/immutable/entry/app.3b43d7f3.js")
]).then(([kit, app]) => {
kit.start(app, element, {
node_ids: [0, 34],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
57.8 kB
·
Xet hash:
2129a9bf6b569484c33c821fa994ccb8a9920c247d966ae1bf7e3cb1267dd435

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.