Buckets:

download
raw
70.1 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Collecting rollouts with OpenEnv for supervised training&quot;,&quot;local&quot;:&quot;collecting-rollouts-with-openenv-for-supervised-training&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Why use an environment to collect training data&quot;,&quot;local&quot;:&quot;why-use-an-environment-to-collect-training-data&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;What you’ll use&quot;,&quot;local&quot;:&quot;what-youll-use&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;1. Install dependencies&quot;,&quot;local&quot;:&quot;1-install-dependencies&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;2. Set your credentials&quot;,&quot;local&quot;:&quot;2-set-your-credentials&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;3. Define the system prompt&quot;,&quot;local&quot;:&quot;3-define-the-system-prompt&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;4. Configure data collection&quot;,&quot;local&quot;:&quot;4-configure-data-collection&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;5. Collect rollouts with openenv collect&quot;,&quot;local&quot;:&quot;5-collect-rollouts-with-openenv-collect&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;6. Filter the dataset&quot;,&quot;local&quot;:&quot;6-filter-the-dataset&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;7. Inspect the dataset before training&quot;,&quot;local&quot;:&quot;7-inspect-the-dataset-before-training&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;8. Measure token lengths&quot;,&quot;local&quot;:&quot;8-measure-token-lengths&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;9. Fine-tune with SFTTrainer&quot;,&quot;local&quot;:&quot;9-fine-tune-with-sfttrainer&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;10. Evaluate: before vs after&quot;,&quot;local&quot;:&quot;10-evaluate-before-vs-after&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;11. Where to go next: GRPO&quot;,&quot;local&quot;:&quot;11-where-to-go-next-grpo&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/openenv/pr_749/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/openenv/pr_749/en/_app/immutable/entry/start.85477f45.js">
<link rel="modulepreload" href="/docs/openenv/pr_749/en/_app/immutable/chunks/scheduler.2b22cead.js">
<link rel="modulepreload" href="/docs/openenv/pr_749/en/_app/immutable/chunks/singletons.63566282.js">
<link rel="modulepreload" href="/docs/openenv/pr_749/en/_app/immutable/chunks/paths.dd876c7b.js">
<link rel="modulepreload" href="/docs/openenv/pr_749/en/_app/immutable/entry/app.51835dc5.js">
<link rel="modulepreload" href="/docs/openenv/pr_749/en/_app/immutable/chunks/preload-helper.0820fbc7.js">
<link rel="modulepreload" href="/docs/openenv/pr_749/en/_app/immutable/chunks/index.1a0e8013.js">
<link rel="modulepreload" href="/docs/openenv/pr_749/en/_app/immutable/nodes/0.167255c0.js">
<link rel="modulepreload" href="/docs/openenv/pr_749/en/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/openenv/pr_749/en/_app/immutable/nodes/65.abb66251.js">
<link rel="modulepreload" href="/docs/openenv/pr_749/en/_app/immutable/chunks/Heading.c0d3f116.js">
<link rel="modulepreload" href="/docs/openenv/pr_749/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.21bcf336.js">
<link rel="modulepreload" href="/docs/openenv/pr_749/en/_app/immutable/chunks/CodeBlock.c8d73295.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Collecting rollouts with OpenEnv for supervised training&quot;,&quot;local&quot;:&quot;collecting-rollouts-with-openenv-for-supervised-training&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Why use an environment to collect training data&quot;,&quot;local&quot;:&quot;why-use-an-environment-to-collect-training-data&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;What you’ll use&quot;,&quot;local&quot;:&quot;what-youll-use&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;1. Install dependencies&quot;,&quot;local&quot;:&quot;1-install-dependencies&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;2. Set your credentials&quot;,&quot;local&quot;:&quot;2-set-your-credentials&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;3. Define the system prompt&quot;,&quot;local&quot;:&quot;3-define-the-system-prompt&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;4. Configure data collection&quot;,&quot;local&quot;:&quot;4-configure-data-collection&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;5. Collect rollouts with openenv collect&quot;,&quot;local&quot;:&quot;5-collect-rollouts-with-openenv-collect&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;6. Filter the dataset&quot;,&quot;local&quot;:&quot;6-filter-the-dataset&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;7. Inspect the dataset before training&quot;,&quot;local&quot;:&quot;7-inspect-the-dataset-before-training&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;8. 
Measure token lengths&quot;,&quot;local&quot;:&quot;8-measure-token-lengths&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;9. Fine-tune with SFTTrainer&quot;,&quot;local&quot;:&quot;9-fine-tune-with-sfttrainer&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;10. Evaluate: before vs after&quot;,&quot;local&quot;:&quot;10-evaluate-before-vs-after&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;11. Where to go next: GRPO&quot;,&quot;local&quot;:&quot;11-where-to-go-next-grpo&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm 
text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="collecting-rollouts-with-openenv-for-supervised-training" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#collecting-rollouts-with-openenv-for-supervised-training"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Collecting rollouts with OpenEnv for supervised training</span></h1> <p data-svelte-h="svelte-1mb1q37"><a href="https://colab.research.google.com/github/huggingface/OpenEnv/blob/main/examples/sft_warmup.ipynb" rel="nofollow"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In 
Colab"></a></p> <p data-svelte-h="svelte-n1a38f">OpenEnv environments are not only useful for RL training — they are also a natural tool for <strong>collecting
rollouts that become supervised training data</strong>. The environment handles episode management, automatic scoring,
and reproducibility, so you get a reward-labeled dataset without writing any of that infrastructure yourself.</p> <p data-svelte-h="svelte-cj2ozg">This tutorial shows the full pipeline:</p> <ol data-svelte-h="svelte-1mxsm9c"><li>Run a strong teacher model inside an OpenEnv environment to collect rollouts.</li> <li>Use the environment’s reward signal to filter out incorrect examples automatically.</li> <li>Train a smaller student model on the filtered rollouts with TRL’s <code>SFTTrainer</code>.</li></ol> <p data-svelte-h="svelte-1rn2nlx">As a concrete application, the resulting checkpoint is used as a warm-start for GRPO: once the student
reliably produces valid tool calls, GRPO’s <code>reward_std</code> is non-zero from the first batch and the reward
curve climbs immediately.</p> <h2 class="relative group"><a id="why-use-an-environment-to-collect-training-data" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#why-use-an-environment-to-collect-training-data"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Why use an environment to collect training data</span></h2> <p data-svelte-h="svelte-2qxvgr">Building a supervised dataset usually means writing a custom collection loop, a scorer, and episode
bookkeeping. An OpenEnv environment gives you all three out of the box:</p> <ul data-svelte-h="svelte-16gl30a"><li><strong>Automatic scoring</strong> — every <code>step()</code> returns a reward. Filter by <code>reward == 1.0</code> and you have a
clean, correct dataset with no manual labelling.</li> <li><strong>Reproducible episodes</strong> — <code>reset(seed=42, size=N)</code> produces the same sequence of problems every
run. Anyone can regenerate the exact dataset.</li> <li><strong>Configurable difficulty</strong> — adjust <code>DATASET_CONFIG</code> to control problem complexity without changing
any collection code.</li> <li><strong>Portable across environments</strong> — the same collect → filter → train pipeline works for any OpenEnv
environment. Swap the env and the tool definition; everything else stays the same.</li></ul> <h2 class="relative group"><a id="what-youll-use" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#what-youll-use"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>What you’ll use</span></h2> <table data-svelte-h="svelte-148mvz9"><thead><tr><th></th> <th></th></tr></thead> <tbody><tr><td><strong>Student model</strong></td> <td><a href="https://huggingface.co/Qwen/Qwen3-1.7B" rel="nofollow"><code>Qwen/Qwen3-1.7B</code></a></td></tr> <tr><td><strong>Teacher model</strong></td> <td><code>gpt-5-mini</code> via the OpenAI API</td></tr> <tr><td><strong>Environment</strong></td> <td><a href="https://github.com/huggingface/OpenEnv/tree/main/envs/reasoning_gym_env" rel="nofollow"><code>reasoning_gym_env</code></a> / <code>chain_sum</code></td></tr> <tr><td><strong>SFT trainer</strong></td> <td><a href="https://huggingface.co/docs/trl/main/en/sft_trainer" rel="nofollow">TRL <code>SFTTrainer</code></a></td></tr> <tr><td><strong>Next step</strong></td> <td><a href="end-to-end-walkthrough">End-to-end walkthrough with 
GRPO</a></td></tr></tbody></table> <hr> <h2 class="relative group"><a id="1-install-dependencies" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#1-install-dependencies"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>1. 
Install dependencies</span></h2> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START -->!pip install -q openai trl
!pip install -q openenv
!pip install -q --no-deps git+https://huggingface.co/spaces/sergiopaniego/reasoning_gym
!pip install -Uq <span class="hljs-string">&quot;transformers&gt;=5.3.0&quot;</span><!-- HTML_TAG_END --></pre></div> <hr> <h2 class="relative group"><a id="2-set-your-credentials" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#2-set-your-credentials"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>2. 
Set your credentials</span></h2> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> getpass, os
<span class="hljs-keyword">if</span> <span class="hljs-string">&quot;OPENAI_API_KEY&quot;</span> <span class="hljs-keyword">not</span> <span class="hljs-keyword">in</span> os.environ:
    os.environ[<span class="hljs-string">&quot;OPENAI_API_KEY&quot;</span>] = getpass.getpass(<span class="hljs-string">&quot;OpenAI API key: &quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1t2mhms">You’ll also need a Hugging Face login to download the base model and push both the collected dataset
and the fine-tuned checkpoint:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> notebook_login
notebook_login()<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START -->YOUR_HF_USERNAME = <span class="hljs-string">&quot;your-username&quot;</span> <span class="hljs-comment"># replace with your Hugging Face username</span>
<span class="hljs-keyword">assert</span> YOUR_HF_USERNAME != <span class="hljs-string">&quot;your-username&quot;</span>, <span class="hljs-string">&quot;Replace YOUR_HF_USERNAME with your Hugging Face username&quot;</span><!-- HTML_TAG_END --></pre></div> <hr> <h2 class="relative group"><a id="3-define-the-system-prompt" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#3-define-the-system-prompt"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>3. Define the system prompt</span></h2> <p data-svelte-h="svelte-18uzm6p">Use the same prompt as the <a href="end-to-end-walkthrough">GRPO tutorial</a>
so the SFT-trained model is a drop-in replacement when you continue with GRPO.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START -->SYSTEM_PROMPT = <span class="hljs-string">&quot;&quot;&quot;You are a careful arithmetic assistant.
You will be given a chain of integer additions. Compute the result and submit it as a single number.
Rules:
1. Read the question carefully.
2. Use the tool `answer` exactly once with your final number.
3. The answer must be a single integer with no units or explanation.
&quot;&quot;&quot;</span><!-- HTML_TAG_END --></pre></div> <hr> <h2 class="relative group"><a id="4-configure-data-collection" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#4-configure-data-collection"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>4. Configure data collection</span></h2> <p data-svelte-h="svelte-poatas"><code>DATASET_CONFIG</code> controls the difficulty of the <code>chain_sum</code> problems the environment generates:
<code>min_terms</code>/<code>max_terms</code> set how many integers are added together, and <code>min_digits</code>/<code>max_digits</code> set
how many digits each integer has. At these settings each problem is a sum of 2–3 two-digit numbers
— easy enough for <code>gpt-5-mini</code> to answer correctly ~90% of the time, which gives a clean training
signal after filtering.</p> <p data-svelte-h="svelte-158jha2"><code>N_EPISODES</code> is the number of problems to collect. 300 is enough to get ~270 correct examples after
filtering, which is sufficient for format compliance training.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START -->DATASET_CONFIG = {
    <span class="hljs-string">&quot;min_terms&quot;</span>: <span class="hljs-number">2</span>,
    <span class="hljs-string">&quot;max_terms&quot;</span>: <span class="hljs-number">3</span>,
    <span class="hljs-string">&quot;min_digits&quot;</span>: <span class="hljs-number">2</span>,
    <span class="hljs-string">&quot;max_digits&quot;</span>: <span class="hljs-number">2</span>,
}
N_EPISODES = <span class="hljs-number">300</span><!-- HTML_TAG_END --></pre></div> <hr> <h2 class="relative group"><a id="5-collect-rollouts-with-openenv-collect" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#5-collect-rollouts-with-openenv-collect"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>5. Collect rollouts with openenv collect</span></h2> <p data-svelte-h="svelte-1w7kvuu"><code>openenv collect</code> runs the teacher model inside the environment and records every episode — the
environment’s <code>step()</code> reward is written alongside the messages, so filtering by correctness requires
no additional scoring code.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> json, shlex
dataset_config_arg = shlex.quote(json.dumps(DATASET_CONFIG))
system_prompt_arg = shlex.quote(SYSTEM_PROMPT)
hub_repo_arg = shlex.quote(<span class="hljs-string">f&quot;<span class="hljs-subst">{YOUR_HF_USERNAME}</span>/chain-sum-rollouts&quot;</span>)
!openenv collect reasoning_gym:chain_sum \
--base-url https://sergiopaniego-reasoning-gym.hf.space \
--provider openai \
--model gpt-<span class="hljs-number">5</span>-mini \
--num-episodes {N_EPISODES} \
--<span class="hljs-built_in">max</span>-tokens <span class="hljs-number">1024</span> \
--dataset-config {dataset_config_arg} \
--system-prompt {system_prompt_arg} \
--push-to-hub {hub_repo_arg} \
--output-<span class="hljs-built_in">dir</span> ./rollouts<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ptdf38">The command prints a live progress summary and pushes the collected episodes to the Hub as
<code>{YOUR_HF_USERNAME}/chain-sum-rollouts</code>. Pull them back to start filtering:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
ds = load_dataset(<span class="hljs-string">f&quot;<span class="hljs-subst">{YOUR_HF_USERNAME}</span>/chain-sum-rollouts&quot;</span>, split=<span class="hljs-string">&quot;train&quot;</span>)
raw_rollouts = <span class="hljs-built_in">list</span>(ds)
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;Collected <span class="hljs-subst">{<span class="hljs-built_in">len</span>(raw_rollouts)}</span> episodes&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1pn841m">The <code>messages</code> field stores the full conversation in standard OpenAI format (assistant messages have
a <code>tool_calls</code> list). Convert to Qwen3’s <code>&lt;tool_call&gt;</code> text format before training — GRPOTrainer
produces this same format during RL, so the SFT checkpoint becomes a direct drop-in:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">to_qwen3_messages</span>(<span class="hljs-params">record</span>):
converted = []
<span class="hljs-keyword">for</span> msg <span class="hljs-keyword">in</span> record[<span class="hljs-string">&quot;messages&quot;</span>]:
<span class="hljs-keyword">if</span> msg[<span class="hljs-string">&quot;role&quot;</span>] == <span class="hljs-string">&quot;tool&quot;</span>:
<span class="hljs-keyword">continue</span> <span class="hljs-comment"># strip environment responses; SFT only needs the assistant turn</span>
<span class="hljs-keyword">if</span> msg[<span class="hljs-string">&quot;role&quot;</span>] == <span class="hljs-string">&quot;assistant&quot;</span> <span class="hljs-keyword">and</span> msg.get(<span class="hljs-string">&quot;tool_calls&quot;</span>):
tc = msg[<span class="hljs-string">&quot;tool_calls&quot;</span>][<span class="hljs-number">0</span>]
args = json.loads(tc[<span class="hljs-string">&quot;function&quot;</span>][<span class="hljs-string">&quot;arguments&quot;</span>])
answer_str = args.get(<span class="hljs-string">&quot;answer&quot;</span>, <span class="hljs-string">&quot;&quot;</span>)
tool_call_text = (
<span class="hljs-string">&quot;&lt;tool_call&gt;\n&quot;</span>
+ json.dumps({<span class="hljs-string">&quot;name&quot;</span>: <span class="hljs-string">&quot;answer&quot;</span>, <span class="hljs-string">&quot;arguments&quot;</span>: {<span class="hljs-string">&quot;answer&quot;</span>: answer_str}})
+ <span class="hljs-string">&quot;\n&lt;/tool_call&gt;&quot;</span>
)
converted.append({<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;assistant&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: tool_call_text})
<span class="hljs-keyword">else</span>:
converted.append(msg)
<span class="hljs-keyword">return</span> {<span class="hljs-string">&quot;messages&quot;</span>: converted, <span class="hljs-string">&quot;reward&quot;</span>: record[<span class="hljs-string">&quot;reward&quot;</span>]}
rollouts = [to_qwen3_messages(r) <span class="hljs-keyword">for</span> r <span class="hljs-keyword">in</span> raw_rollouts]<!-- HTML_TAG_END --></pre></div> <hr> <h2 class="relative group"><a id="6-filter-the-dataset" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#6-filter-the-dataset"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>6. Filter the dataset</span></h2> <p data-svelte-h="svelte-1vi36q9">Keep only episodes where the teacher answered correctly. The environment’s reward signal does the
labelling — no manual annotation needed.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START -->correct = [r <span class="hljs-keyword">for</span> r <span class="hljs-keyword">in</span> rollouts <span class="hljs-keyword">if</span> r[<span class="hljs-string">&quot;reward&quot;</span>] == <span class="hljs-number">1.0</span>]
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;Correct: <span class="hljs-subst">{<span class="hljs-built_in">len</span>(correct)}</span> / <span class="hljs-subst">{<span class="hljs-built_in">len</span>(rollouts)}</span> (<span class="hljs-subst">{<span class="hljs-built_in">len</span>(correct)/<span class="hljs-built_in">len</span>(rollouts):<span class="hljs-number">.1</span>%}</span>)&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1wrgc3t"><code>gpt-5-mini</code> typically scores above 90% on <code>chain_sum</code> at this difficulty, so you should end up with
~270 examples from 300 rollouts.</p> <hr> <h2 class="relative group"><a id="7-inspect-the-dataset-before-training" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#7-inspect-the-dataset-before-training"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>7. Inspect the dataset before training</span></h2> <p data-svelte-h="svelte-6lfca0">Always look at your data before training. Automated collection can introduce unexpected patterns that the
student model will learn to imitate.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> random
<span class="hljs-keyword">for</span> row <span class="hljs-keyword">in</span> random.sample(correct, <span class="hljs-number">3</span>):
question = row[<span class="hljs-string">&quot;messages&quot;</span>][<span class="hljs-number">0</span>][<span class="hljs-string">&quot;content&quot;</span>]
response = row[<span class="hljs-string">&quot;messages&quot;</span>][<span class="hljs-number">1</span>][<span class="hljs-string">&quot;content&quot;</span>]
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;Q: <span class="hljs-subst">{question}</span>&quot;</span>)
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;A: <span class="hljs-subst">{response}</span>&quot;</span>)
<span class="hljs-built_in">print</span>()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1jmsvh4">Things to check:</p> <ul data-svelte-h="svelte-vbk6t4"><li>Does every response contain a valid <code>&lt;tool_call&gt;</code> block?</li> <li>Are the answers integers with no extra text?</li> <li>Is there any reasoning in the assistant message that you don’t want the student to learn?
(For example: an internal monologue, disclaimers, or repeated phrasing that the teacher leaked
from its own system prompt.)</li></ul> <hr> <h2 class="relative group"><a id="8-measure-token-lengths" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#8-measure-token-lengths"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>8. Measure token lengths</span></h2> <p data-svelte-h="svelte-d6vup6">Set <code>max_length</code> in <code>SFTConfig</code> to cover nearly all examples without wasting GPU memory on padding.
The 99th percentile is a good target: you truncate fewer than 1% of examples while keeping batches tight.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> numpy <span class="hljs-keyword">as</span> np
<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">&quot;Qwen/Qwen3-1.7B&quot;</span>)
lengths = []
<span class="hljs-keyword">for</span> row <span class="hljs-keyword">in</span> correct:
text = tokenizer.apply_chat_template(
row[<span class="hljs-string">&quot;messages&quot;</span>], tokenize=<span class="hljs-literal">False</span>, add_generation_prompt=<span class="hljs-literal">False</span>
)
ids = tokenizer.encode(text)
lengths.append(<span class="hljs-built_in">len</span>(ids))
lengths = np.array(lengths)
MAX_SEQ_LEN = <span class="hljs-built_in">int</span>(np.percentile(lengths, <span class="hljs-number">99</span>)) + <span class="hljs-number">16</span>
<span class="hljs-built_in">print</span>(
<span class="hljs-string">f&quot;p50=<span class="hljs-subst">{np.percentile(lengths, <span class="hljs-number">50</span>):<span class="hljs-number">.0</span>f}</span> &quot;</span>
<span class="hljs-string">f&quot;p95=<span class="hljs-subst">{np.percentile(lengths, <span class="hljs-number">95</span>):<span class="hljs-number">.0</span>f}</span> &quot;</span>
<span class="hljs-string">f&quot;p99=<span class="hljs-subst">{np.percentile(lengths, <span class="hljs-number">99</span>):<span class="hljs-number">.0</span>f}</span> &quot;</span>
<span class="hljs-string">f&quot;max=<span class="hljs-subst">{lengths.<span class="hljs-built_in">max</span>()}</span>&quot;</span>
)
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;Setting MAX_SEQ_LEN = <span class="hljs-subst">{MAX_SEQ_LEN}</span>&quot;</span>)<!-- HTML_TAG_END --></pre></div> <hr> <h2 class="relative group"><a id="9-fine-tune-with-sfttrainer" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#9-fine-tune-with-sfttrainer"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>9. Fine-tune with SFTTrainer</span></h2> <p data-svelte-h="svelte-1uo0pb3"><code>assistant_only_loss=True</code> in <code>SFTConfig</code> masks the prompt tokens so the loss is computed only on the
assistant response — the <code>&lt;tool_call&gt;</code> block. This concentrates the training signal on the target output, unlike full-sequence training, and avoids
accidentally reinforcing the system prompt wording.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> Dataset
<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForCausalLM
<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> SFTConfig, SFTTrainer
dataset = Dataset.from_list([{<span class="hljs-string">&quot;messages&quot;</span>: r[<span class="hljs-string">&quot;messages&quot;</span>]} <span class="hljs-keyword">for</span> r <span class="hljs-keyword">in</span> correct])
model = AutoModelForCausalLM.from_pretrained(<span class="hljs-string">&quot;Qwen/Qwen3-1.7B&quot;</span>)
sft_config = SFTConfig(
output_dir=<span class="hljs-string">&quot;reasoning-gym-chain-sum-Qwen3-1.7B-sft&quot;</span>,
max_length=MAX_SEQ_LEN,
num_train_epochs=<span class="hljs-number">3</span>,
per_device_train_batch_size=<span class="hljs-number">4</span>,
gradient_accumulation_steps=<span class="hljs-number">2</span>,
learning_rate=<span class="hljs-number">2e-5</span>,
warmup_steps=<span class="hljs-number">10</span>,
lr_scheduler_type=<span class="hljs-string">&quot;cosine&quot;</span>,
logging_steps=<span class="hljs-number">5</span>,
save_strategy=<span class="hljs-string">&quot;no&quot;</span>,
assistant_only_loss=<span class="hljs-literal">True</span>,
)
trainer = SFTTrainer(
model=model,
train_dataset=dataset,
processing_class=tokenizer,
args=sft_config,
)
trainer.train()
trainer.push_to_hub(commit_message=<span class="hljs-string">&quot;SFT warm-up on reasoning_gym chain_sum&quot;</span>)<!-- HTML_TAG_END --></pre></div> <blockquote class="note" data-svelte-h="svelte-l32ipe"><p>Training ~270 examples for 3 epochs takes around 5 minutes on a single A100 (40 GB). The goal is format
compliance, not task mastery — a handful of epochs is enough. Mastery comes from GRPO.</p></blockquote> <hr> <h2 class="relative group"><a id="10-evaluate-before-vs-after" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#10-evaluate-before-vs-after"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>10. Evaluate: before vs after</span></h2> <p data-svelte-h="svelte-1bax8nc">Run both the base model and the SFT checkpoint on a held-out set and compare. The key metric for a
warm-up evaluation is <strong>format compliance</strong> — how often the model uses <code>&lt;tool_call&gt;</code> correctly — as
well as overall accuracy.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> re
<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> pipeline
<span class="hljs-keyword">from</span> reasoning_gym_env.client <span class="hljs-keyword">import</span> ReasoningGymEnv
<span class="hljs-keyword">from</span> reasoning_gym_env.models <span class="hljs-keyword">import</span> ReasoningGymAction
<span class="hljs-keyword">async</span> <span class="hljs-keyword">def</span> <span class="hljs-title function_">evaluate_model</span>(<span class="hljs-params">model_name, n_eval=<span class="hljs-number">50</span>, seed=<span class="hljs-number">999</span></span>):
gen = pipeline(
<span class="hljs-string">&quot;text-generation&quot;</span>,
model=model_name,
tokenizer=model_name,
device_map=<span class="hljs-string">&quot;auto&quot;</span>,
dtype=<span class="hljs-string">&quot;auto&quot;</span>,
)
gen.model.generation_config.max_length = <span class="hljs-literal">None</span>
tok = AutoTokenizer.from_pretrained(model_name)
env = ReasoningGymEnv(base_url=<span class="hljs-string">&quot;https://sergiopaniego-reasoning-gym.hf.space&quot;</span>)
obs = <span class="hljs-keyword">await</span> env.reset(
dataset_name=<span class="hljs-string">&quot;chain_sum&quot;</span>,
dataset_config=DATASET_CONFIG,
seed=seed,
size=n_eval,
)
rewards, format_hits = [], <span class="hljs-number">0</span>
<span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(n_eval):
<span class="hljs-keyword">if</span> i &gt; <span class="hljs-number">0</span>:
obs = <span class="hljs-keyword">await</span> env.reset()
question = obs.observation.question
messages = [
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;system&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: SYSTEM_PROMPT},
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: question},
]
prompt = tok.apply_chat_template(
messages, tokenize=<span class="hljs-literal">False</span>, add_generation_prompt=<span class="hljs-literal">True</span>
)
completion = gen(prompt, max_new_tokens=<span class="hljs-number">64</span>)[<span class="hljs-number">0</span>][<span class="hljs-string">&quot;generated_text&quot;</span>][<span class="hljs-built_in">len</span>(prompt):]
m = re.search(<span class="hljs-string">r&#x27;&quot;answer&quot;\s*:\s*&quot;?(\d+)&quot;?&#x27;</span>, completion)
<span class="hljs-keyword">if</span> m:
format_hits += <span class="hljs-number">1</span>
answer = m.group(<span class="hljs-number">1</span>)
<span class="hljs-keyword">else</span>:
nums = re.findall(<span class="hljs-string">r&quot;\b(\d+)\b&quot;</span>, completion)
answer = nums[-<span class="hljs-number">1</span>] <span class="hljs-keyword">if</span> nums <span class="hljs-keyword">else</span> <span class="hljs-string">&quot;0&quot;</span>
result = <span class="hljs-keyword">await</span> env.step(ReasoningGymAction(answer=answer))
rewards.append(<span class="hljs-built_in">float</span>(result.observation.score <span class="hljs-keyword">or</span> <span class="hljs-number">0.0</span>))
<span class="hljs-keyword">await</span> env.close()
<span class="hljs-keyword">del</span> gen <span class="hljs-comment"># free GPU memory before loading the next model</span>
<span class="hljs-keyword">return</span> {
<span class="hljs-string">&quot;accuracy&quot;</span>: <span class="hljs-built_in">sum</span>(rewards) / <span class="hljs-built_in">len</span>(rewards),
<span class="hljs-string">&quot;format_compliance&quot;</span>: format_hits / n_eval,
}
base_metrics = <span class="hljs-keyword">await</span> evaluate_model(<span class="hljs-string">&quot;Qwen/Qwen3-1.7B&quot;</span>)
sft_metrics = <span class="hljs-keyword">await</span> evaluate_model(<span class="hljs-string">f&quot;<span class="hljs-subst">{YOUR_HF_USERNAME}</span>/reasoning-gym-chain-sum-Qwen3-1.7B-sft&quot;</span>)
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;\n<span class="hljs-subst">{<span class="hljs-string">&#x27;Metric&#x27;</span>:&lt;<span class="hljs-number">25</span>}</span> <span class="hljs-subst">{<span class="hljs-string">&#x27;Base model&#x27;</span>:&gt;<span class="hljs-number">12</span>}</span> <span class="hljs-subst">{<span class="hljs-string">&#x27;After SFT&#x27;</span>:&gt;<span class="hljs-number">12</span>}</span> <span class="hljs-subst">{<span class="hljs-string">&#x27;Delta&#x27;</span>:&gt;<span class="hljs-number">10</span>}</span>&quot;</span>)
<span class="hljs-built_in">print</span>(<span class="hljs-string">&quot;-&quot;</span> * <span class="hljs-number">62</span>)
<span class="hljs-keyword">for</span> key, label <span class="hljs-keyword">in</span> [(<span class="hljs-string">&quot;format_compliance&quot;</span>, <span class="hljs-string">&quot;Format compliance&quot;</span>), (<span class="hljs-string">&quot;accuracy&quot;</span>, <span class="hljs-string">&quot;Accuracy&quot;</span>)]:
b, s = base_metrics[key], sft_metrics[key]
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;<span class="hljs-subst">{label:&lt;<span class="hljs-number">25</span>}</span> <span class="hljs-subst">{b:&gt;<span class="hljs-number">12.1</span>%}</span> <span class="hljs-subst">{s:&gt;<span class="hljs-number">12.1</span>%}</span> <span class="hljs-subst">{(s - b) * <span class="hljs-number">100</span>:&gt;+<span class="hljs-number">9.1</span>f}</span> pp&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1udb8dr">A successful warm-up looks like this:</p> <table data-svelte-h="svelte-1oxtqlv"><thead><tr><th>Metric</th> <th>Base model</th> <th>After SFT</th> <th>Delta</th></tr></thead> <tbody><tr><td>Format compliance</td> <td>~0%</td> <td>~68%</td> <td>+68 pp</td></tr> <tr><td>Accuracy</td> <td>~4%</td> <td>~68%</td> <td>+64 pp</td></tr></tbody></table> <p data-svelte-h="svelte-f1s8c3">Format compliance should jump sharply from near-zero — that’s the primary goal. <code>Qwen3-1.7B</code> produces
essentially no valid <code>&lt;tool_call&gt;</code> blocks out of the box. After SFT on ~270 examples, the model
uses the format in most responses, and accuracy rises in lockstep because a well-formed answer is what usually allows the
environment’s scorer to award any credit.</p> <hr> <h2 class="relative group"><a id="11-where-to-go-next-grpo" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#11-where-to-go-next-grpo"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>11. Where to go next: GRPO</span></h2> <p data-svelte-h="svelte-6b2drq">The SFT checkpoint is ready to use as the starting model for GRPO. In the
<a href="end-to-end-walkthrough">end-to-end walkthrough</a>,
change one line in section 8:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-comment"># Before (cold-start from the base model):</span>
MODEL_NAME = <span class="hljs-string">&quot;Qwen/Qwen3-1.7B&quot;</span>
<span class="hljs-comment"># After (warm-start from your SFT checkpoint):</span>
MODEL_NAME = <span class="hljs-string">f&quot;<span class="hljs-subst">{YOUR_HF_USERNAME}</span>/reasoning-gym-chain-sum-Qwen3-1.7B-sft&quot;</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-18tauj5">With format compliance already well above zero (~68% in the table above), GRPO’s <code>reward_std</code> will be non-zero from the very first
batch and the reward curve will climb immediately — no cold-start stall.</p> <p data-svelte-h="svelte-r3ulln"><strong>Other directions:</strong></p> <ul data-svelte-h="svelte-iyl68d"><li><strong>Harder tasks.</strong> Increase <code>max_terms</code> or <code>max_digits</code> in <code>DATASET_CONFIG</code> and collect a new SFT set.
Once the student handles easier examples reliably, a harder GRPO phase can push further.</li> <li><strong>Different environments.</strong> The same pipeline — teacher collects → filter → SFT → GRPO — applies to
any OpenEnv environment. Swap <code>reasoning_gym_env</code> and the <code>answer</code> tool definition for your env’s
tool surface.</li> <li><strong>Larger teacher.</strong> <code>gpt-5</code> or <code>claude-opus-4</code> as teacher will yield higher-quality examples,
especially for tasks where <code>gpt-5-mini</code> struggles.</li></ul> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/openenv/blob/main/docs/source/tutorials/sft-warmup.md" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>
<script>
{
// SvelteKit client bootstrap for this page. The surrounding braces give the
// `const` declarations below block scope, so they do not leak to script level.

// Assigned without var/let/const: in this classic (non-strict) inline script
// that creates a property on the global object. Presumably the imported
// bundles read it to resolve asset/base paths — confirm against the build.
__sveltekit_1qwoa43 = {
assets: "/docs/openenv/pr_749/en",
base: "/docs/openenv/pr_749/en",
env: {}
};
// Must be captured synchronously: document.currentScript is only set while
// the script is executing, so it would be null inside the .then() callback.
// The parent of this <script> tag is the element handed to kit.start().
const element = document.currentScript.parentElement;
// Two null entries, matching the two ids in node_ids below.
// NOTE(review): presumably "no server-provided data" per node — confirm.
const data = [null,null];
// Load both entry bundles in parallel; start only once both have resolved.
// No .catch() is attached, so a failed import surfaces as an unhandled rejection.
Promise.all([
import("/docs/openenv/pr_749/en/_app/immutable/entry/start.85477f45.js"),
import("/docs/openenv/pr_749/en/_app/immutable/entry/app.51835dc5.js")
]).then(([kit, app]) => {
// Hand control to the SvelteKit runtime, targeting the captured element.
kit.start(app, element, {
// NOTE(review): looks like the route node indexes (layout 0, page 65)
// from the generated manifest — verify against the build output.
node_ids: [0, 65],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
70.1 kB
·
Xet hash:
780a59e2c9544dff0eff2652362cdccf60536655adcb0a5eb4b0879e36f6904e

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.