Buckets:
| <meta charset="utf-8"><meta name="hf:doc:metadata" content='{"title":"Collecting rollouts with OpenEnv for supervised training","local":"collecting-rollouts-with-openenv-for-supervised-training","sections":[{"title":"Why use an environment to collect training data","local":"why-use-an-environment-to-collect-training-data","sections":[],"depth":2},{"title":"What you’ll use","local":"what-youll-use","sections":[],"depth":2},{"title":"1. Install dependencies","local":"1-install-dependencies","sections":[],"depth":2},{"title":"2. Set your credentials","local":"2-set-your-credentials","sections":[],"depth":2},{"title":"3. Define the system prompt","local":"3-define-the-system-prompt","sections":[],"depth":2},{"title":"4. Configure data collection","local":"4-configure-data-collection","sections":[],"depth":2},{"title":"5. Collect rollouts with openenv collect","local":"5-collect-rollouts-with-openenv-collect","sections":[],"depth":2},{"title":"6. Filter the dataset","local":"6-filter-the-dataset","sections":[],"depth":2},{"title":"7. Inspect the dataset before training","local":"7-inspect-the-dataset-before-training","sections":[],"depth":2},{"title":"8. Measure token lengths","local":"8-measure-token-lengths","sections":[],"depth":2},{"title":"9. Fine-tune with SFTTrainer","local":"9-fine-tune-with-sfttrainer","sections":[],"depth":2},{"title":"10. Evaluate: before vs after","local":"10-evaluate-before-vs-after","sections":[],"depth":2},{"title":"11. Where to go next: GRPO","local":"11-where-to-go-next-grpo","sections":[],"depth":2}],"depth":1}'> | |
| <link href="/docs/openenv/pr_749/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload"> | |
| <link rel="modulepreload" href="/docs/openenv/pr_749/en/_app/immutable/entry/start.85477f45.js"> | |
| <link rel="modulepreload" href="/docs/openenv/pr_749/en/_app/immutable/chunks/scheduler.2b22cead.js"> | |
| <link rel="modulepreload" href="/docs/openenv/pr_749/en/_app/immutable/chunks/singletons.63566282.js"> | |
| <link rel="modulepreload" href="/docs/openenv/pr_749/en/_app/immutable/chunks/paths.dd876c7b.js"> | |
| <link rel="modulepreload" href="/docs/openenv/pr_749/en/_app/immutable/entry/app.51835dc5.js"> | |
| <link rel="modulepreload" href="/docs/openenv/pr_749/en/_app/immutable/chunks/preload-helper.0820fbc7.js"> | |
| <link rel="modulepreload" href="/docs/openenv/pr_749/en/_app/immutable/chunks/index.1a0e8013.js"> | |
| <link rel="modulepreload" href="/docs/openenv/pr_749/en/_app/immutable/nodes/0.167255c0.js"> | |
| <link rel="modulepreload" href="/docs/openenv/pr_749/en/_app/immutable/chunks/each.e59479a4.js"> | |
| <link rel="modulepreload" href="/docs/openenv/pr_749/en/_app/immutable/nodes/65.abb66251.js"> | |
| <link rel="modulepreload" href="/docs/openenv/pr_749/en/_app/immutable/chunks/Heading.c0d3f116.js"> | |
| <link rel="modulepreload" href="/docs/openenv/pr_749/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.21bcf336.js"> | |
| <link rel="modulepreload" href="/docs/openenv/pr_749/en/_app/immutable/chunks/CodeBlock.c8d73295.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content='{"title":"Collecting rollouts with OpenEnv for supervised training","local":"collecting-rollouts-with-openenv-for-supervised-training","sections":[{"title":"Why use an environment to collect training data","local":"why-use-an-environment-to-collect-training-data","sections":[],"depth":2},{"title":"What you’ll use","local":"what-youll-use","sections":[],"depth":2},{"title":"1. Install dependencies","local":"1-install-dependencies","sections":[],"depth":2},{"title":"2. Set your credentials","local":"2-set-your-credentials","sections":[],"depth":2},{"title":"3. Define the system prompt","local":"3-define-the-system-prompt","sections":[],"depth":2},{"title":"4. Configure data collection","local":"4-configure-data-collection","sections":[],"depth":2},{"title":"5. Collect rollouts with openenv collect","local":"5-collect-rollouts-with-openenv-collect","sections":[],"depth":2},{"title":"6. Filter the dataset","local":"6-filter-the-dataset","sections":[],"depth":2},{"title":"7. Inspect the dataset before training","local":"7-inspect-the-dataset-before-training","sections":[],"depth":2},{"title":"8. Measure token lengths","local":"8-measure-token-lengths","sections":[],"depth":2},{"title":"9. Fine-tune with SFTTrainer","local":"9-fine-tune-with-sfttrainer","sections":[],"depth":2},{"title":"10. Evaluate: before vs after","local":"10-evaluate-before-vs-after","sections":[],"depth":2},{"title":"11. Where to go next: GRPO","local":"11-where-to-go-next-grpo","sections":[],"depth":2}],"depth":1}'><!-- HEAD_svelte-u9bgzb_END -->
<p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" 
stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="collecting-rollouts-with-openenv-for-supervised-training" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#collecting-rollouts-with-openenv-for-supervised-training"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Collecting rollouts with OpenEnv for supervised training</span></h1> <p data-svelte-h="svelte-1mb1q37"><a href="https://colab.research.google.com/github/huggingface/OpenEnv/blob/main/examples/sft_warmup.ipynb" rel="nofollow"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"></a></p> <p data-svelte-h="svelte-n1a38f">OpenEnv environments are not only useful for RL training — they are also a natural tool for <strong>collecting | |
| rollouts that become supervised training data</strong>. The environment handles episode management, automatic scoring, | |
| and reproducibility, so you get a reward-labeled dataset without writing any of that infrastructure yourself.</p> <p data-svelte-h="svelte-cj2ozg">This tutorial shows the full pipeline:</p> <ol data-svelte-h="svelte-1mxsm9c"><li>Run a strong teacher model inside an OpenEnv environment to collect rollouts.</li> <li>Use the environment’s reward signal to filter out incorrect examples automatically.</li> <li>Train a smaller student model on the filtered rollouts with TRL’s <code>SFTTrainer</code>.</li></ol> <p data-svelte-h="svelte-1rn2nlx">As a concrete application, the resulting checkpoint is used as a warm-start for GRPO: once the student | |
| reliably produces valid tool calls, GRPO’s <code>reward_std</code> is non-zero from the first batch and the reward | |
| curve climbs immediately.</p> <h2 class="relative group"><a id="why-use-an-environment-to-collect-training-data" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#why-use-an-environment-to-collect-training-data"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Why use an environment to collect training data</span></h2> <p data-svelte-h="svelte-2qxvgr">Building a supervised dataset usually means writing a custom collection loop, a scorer, and episode | |
| bookkeeping. An OpenEnv environment gives you all three out of the box:</p> <ul data-svelte-h="svelte-16gl30a"><li><strong>Automatic scoring</strong> — every <code>step()</code> returns a reward. Filter by <code>reward == 1.0</code> and you have a | |
| clean, correct dataset with no manual labelling.</li> <li><strong>Reproducible episodes</strong> — <code>reset(seed=42, size=N)</code> produces the same sequence of problems every | |
| run. Anyone can regenerate the exact dataset.</li> <li><strong>Configurable difficulty</strong> — adjust <code>DATASET_CONFIG</code> to control problem complexity without changing | |
| any collection code.</li> <li><strong>Portable across environments</strong> — the same collect → filter → train pipeline works for any OpenEnv | |
| environment. Swap the env and the tool definition; everything else stays the same.</li></ul> <h2 class="relative group"><a id="what-youll-use" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#what-youll-use"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>What you’ll use</span></h2> <table data-svelte-h="svelte-148mvz9"><thead><tr><th></th> <th></th></tr></thead> <tbody><tr><td><strong>Student model</strong></td> <td><a href="https://huggingface.co/Qwen/Qwen3-1.7B" rel="nofollow"><code>Qwen/Qwen3-1.7B</code></a></td></tr> <tr><td><strong>Teacher model</strong></td> <td><code>gpt-5-mini</code> via the OpenAI API</td></tr> <tr><td><strong>Environment</strong></td> <td><a href="https://github.com/huggingface/OpenEnv/tree/main/envs/reasoning_gym_env" rel="nofollow"><code>reasoning_gym_env</code></a> / <code>chain_sum</code></td></tr> <tr><td><strong>SFT trainer</strong></td> <td><a href="https://huggingface.co/docs/trl/main/en/sft_trainer" rel="nofollow">TRL <code>SFTTrainer</code></a></td></tr> <tr><td><strong>Next step</strong></td> <td><a href="end-to-end-walkthrough">End-to-end walkthrough with 
GRPO</a></td></tr></tbody></table> <hr> <h2 class="relative group"><a id="1-install-dependencies" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#1-install-dependencies"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>1. 
Install dependencies</span></h2> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START -->!pip install -q openai trl | |
| !pip install -q openenv | |
| !pip install -q --no-deps git+https://huggingface.co/spaces/sergiopaniego/reasoning_gym | |
| !pip install -Uq <span class="hljs-string">"transformers>=5.3.0"</span><!-- HTML_TAG_END --></pre></div> <hr> <h2 class="relative group"><a id="2-set-your-credentials" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#2-set-your-credentials"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>2. 
Set your credentials</span></h2> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> getpass, os | |
| <span class="hljs-keyword">if</span> <span class="hljs-string">"OPENAI_API_KEY"</span> <span class="hljs-keyword">not</span> <span class="hljs-keyword">in</span> os.environ: | |
|     os.environ[<span class="hljs-string">"OPENAI_API_KEY"</span>] = getpass.getpass(<span class="hljs-string">"OpenAI API key: "</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1t2mhms">You’ll also need a Hugging Face login to download the base model and push both the collected dataset | |
| and the fine-tuned checkpoint:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> notebook_login | |
| notebook_login()<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START -->YOUR_HF_USERNAME = <span class="hljs-string">"your-username"</span> <span class="hljs-comment"># replace with your Hugging Face username</span> | |
| <span class="hljs-keyword">assert</span> YOUR_HF_USERNAME != <span class="hljs-string">"your-username"</span>, <span class="hljs-string">"Replace YOUR_HF_USERNAME with your Hugging Face username"</span><!-- HTML_TAG_END --></pre></div> <hr> <h2 class="relative group"><a id="3-define-the-system-prompt" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#3-define-the-system-prompt"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>3. Define the system prompt</span></h2> <p data-svelte-h="svelte-18uzm6p">Use the same prompt as the <a href="end-to-end-walkthrough">GRPO tutorial</a> | |
| so the SFT-trained model is a drop-in replacement when you continue with GRPO.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START -->SYSTEM_PROMPT = <span class="hljs-string">"""You are a careful arithmetic assistant. | |
| You will be given a chain of integer additions. Compute the result and submit it as a single number. | |
| Rules: | |
| 1. Read the question carefully. | |
| 2. Use the tool `answer` exactly once with your final number. | |
| 3. The answer must be a single integer with no units or explanation. | |
| """</span><!-- HTML_TAG_END --></pre></div> <hr> <h2 class="relative group"><a id="4-configure-data-collection" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#4-configure-data-collection"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>4. Configure data collection</span></h2> <p data-svelte-h="svelte-poatas"><code>DATASET_CONFIG</code> controls the difficulty of the <code>chain_sum</code> problems the environment generates: | |
| <code>min_terms</code>/<code>max_terms</code> set how many integers are added together, and <code>min_digits</code>/<code>max_digits</code> set | |
| how many digits each integer has. At these settings each problem is a sum of 2–3 two-digit numbers | |
| — easy enough for <code>gpt-5-mini</code> to answer correctly ~90% of the time, which gives a clean training | |
| signal after filtering.</p> <p data-svelte-h="svelte-158jha2"><code>N_EPISODES</code> is the number of problems to collect. 300 is enough to get ~270 correct examples after | |
| filtering, which is sufficient for format compliance training.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START -->DATASET_CONFIG = { | |
| <span class="hljs-string">"min_terms"</span>: <span class="hljs-number">2</span>, | |
| <span class="hljs-string">"max_terms"</span>: <span class="hljs-number">3</span>, | |
| <span class="hljs-string">"min_digits"</span>: <span class="hljs-number">2</span>, | |
| <span class="hljs-string">"max_digits"</span>: <span class="hljs-number">2</span>, | |
| } | |
| N_EPISODES = <span class="hljs-number">300</span><!-- HTML_TAG_END --></pre></div> <hr> <h2 class="relative group"><a id="5-collect-rollouts-with-openenv-collect" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#5-collect-rollouts-with-openenv-collect"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>5. Collect rollouts with openenv collect</span></h2> <p data-svelte-h="svelte-1w7kvuu"><code>openenv collect</code> runs the teacher model inside the environment and records every episode — the | |
| environment’s <code>step()</code> reward is written alongside the messages, so filtering by correctness requires | |
| no additional scoring code.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> json, shlex | |
| dataset_config_arg = shlex.quote(json.dumps(DATASET_CONFIG)) | |
| system_prompt_arg = shlex.quote(SYSTEM_PROMPT) | |
| hub_repo_arg = shlex.quote(<span class="hljs-string">f"<span class="hljs-subst">{YOUR_HF_USERNAME}</span>/chain-sum-rollouts"</span>) | |
| !openenv collect reasoning_gym:chain_sum \ | |
| --base-url https://sergiopaniego-reasoning-gym.hf.space \ | |
| --provider openai \ | |
| --model gpt-<span class="hljs-number">5</span>-mini \ | |
| --num-episodes {N_EPISODES} \ | |
| --<span class="hljs-built_in">max</span>-tokens <span class="hljs-number">1024</span> \ | |
| --dataset-config {dataset_config_arg} \ | |
| --system-prompt {system_prompt_arg} \ | |
| --push-to-hub {hub_repo_arg} \ | |
| --output-<span class="hljs-built_in">dir</span> ./rollouts<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ptdf38">The command prints a live progress summary and pushes the collected episodes to the Hub as | |
| <code>{YOUR_HF_USERNAME}/chain-sum-rollouts</code>. Pull them back to start filtering:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset | |
| ds = load_dataset(<span class="hljs-string">f"<span class="hljs-subst">{YOUR_HF_USERNAME}</span>/chain-sum-rollouts"</span>, split=<span class="hljs-string">"train"</span>) | |
| raw_rollouts = <span class="hljs-built_in">list</span>(ds) | |
| <span class="hljs-built_in">print</span>(<span class="hljs-string">f"Collected <span class="hljs-subst">{<span class="hljs-built_in">len</span>(raw_rollouts)}</span> episodes"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1pn841m">The <code>messages</code> field stores the full conversation in standard OpenAI format (assistant messages have | |
| a <code>tool_calls</code> list). Convert to Qwen3’s <code>&lt;tool_call&gt;</code> text format before training — GRPOTrainer | |
| produces this same format during RL, so the SFT checkpoint becomes a direct drop-in:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">to_qwen3_messages</span>(<span class="hljs-params">record</span>): | |
| converted = [] | |
| <span class="hljs-keyword">for</span> msg <span class="hljs-keyword">in</span> record[<span class="hljs-string">"messages"</span>]: | |
| <span class="hljs-keyword">if</span> msg[<span class="hljs-string">"role"</span>] == <span class="hljs-string">"tool"</span>: | |
| <span class="hljs-keyword">continue</span> <span class="hljs-comment"># strip environment responses; SFT only needs the assistant turn</span> | |
| <span class="hljs-keyword">if</span> msg[<span class="hljs-string">"role"</span>] == <span class="hljs-string">"assistant"</span> <span class="hljs-keyword">and</span> msg.get(<span class="hljs-string">"tool_calls"</span>): | |
| tc = msg[<span class="hljs-string">"tool_calls"</span>][<span class="hljs-number">0</span>] | |
| args = json.loads(tc[<span class="hljs-string">"function"</span>][<span class="hljs-string">"arguments"</span>]) | |
| answer_str = args.get(<span class="hljs-string">"answer"</span>, <span class="hljs-string">""</span>) | |
| tool_call_text = ( | |
| <span class="hljs-string">"&lt;tool_call&gt;\n"</span> | |
| + json.dumps({<span class="hljs-string">"name"</span>: <span class="hljs-string">"answer"</span>, <span class="hljs-string">"arguments"</span>: {<span class="hljs-string">"answer"</span>: answer_str}}) | |
| + <span class="hljs-string">"\n&lt;/tool_call&gt;"</span> | |
| ) | |
| converted.append({<span class="hljs-string">"role"</span>: <span class="hljs-string">"assistant"</span>, <span class="hljs-string">"content"</span>: tool_call_text}) | |
| <span class="hljs-keyword">else</span>: | |
| converted.append(msg) | |
| <span class="hljs-keyword">return</span> {<span class="hljs-string">"messages"</span>: converted, <span class="hljs-string">"reward"</span>: record[<span class="hljs-string">"reward"</span>]} | |
| rollouts = [to_qwen3_messages(r) <span class="hljs-keyword">for</span> r <span class="hljs-keyword">in</span> raw_rollouts]<!-- HTML_TAG_END --></pre></div> <hr> <h2 class="relative group"><a id="6-filter-the-dataset" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#6-filter-the-dataset"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>6. Filter the dataset</span></h2> <p data-svelte-h="svelte-1vi36q9">Keep only episodes where the teacher answered correctly. The environment’s reward signal does the | |
| labelling — no manual annotation needed.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START -->correct = [r <span class="hljs-keyword">for</span> r <span class="hljs-keyword">in</span> rollouts <span class="hljs-keyword">if</span> r[<span class="hljs-string">"reward"</span>] == <span class="hljs-number">1.0</span>] | |
| <span class="hljs-built_in">print</span>(<span class="hljs-string">f"Correct: <span class="hljs-subst">{<span class="hljs-built_in">len</span>(correct)}</span> / <span class="hljs-subst">{<span class="hljs-built_in">len</span>(rollouts)}</span> (<span class="hljs-subst">{<span class="hljs-built_in">len</span>(correct)/<span class="hljs-built_in">len</span>(rollouts):<span class="hljs-number">.1</span>%}</span>)"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1wrgc3t"><code>gpt-5-mini</code> typically scores above 90% on <code>chain_sum</code> at this difficulty, so you should end up with | |
| ~270 examples from 300 rollouts.</p> <hr> <h2 class="relative group"><a id="7-inspect-the-dataset-before-training" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#7-inspect-the-dataset-before-training"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>7. Inspect the dataset before training</span></h2> <p data-svelte-h="svelte-6lfca0">Always look at your data before training. Automated collection can introduce unexpected patterns that the | |
| student model will learn to imitate.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> random | |
| <span class="hljs-keyword">for</span> row <span class="hljs-keyword">in</span> random.sample(correct, <span class="hljs-number">3</span>): | |
| question = row[<span class="hljs-string">"messages"</span>][<span class="hljs-number">0</span>][<span class="hljs-string">"content"</span>] | |
| response = row[<span class="hljs-string">"messages"</span>][<span class="hljs-number">1</span>][<span class="hljs-string">"content"</span>] | |
| <span class="hljs-built_in">print</span>(<span class="hljs-string">f"Q: <span class="hljs-subst">{question}</span>"</span>) | |
| <span class="hljs-built_in">print</span>(<span class="hljs-string">f"A: <span class="hljs-subst">{response}</span>"</span>) | |
| <span class="hljs-built_in">print</span>()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1jmsvh4">Things to check:</p> <ul data-svelte-h="svelte-vbk6t4"><li>Does every response contain a valid <code>&lt;tool_call&gt;</code> block?</li> <li>Are the answers integers with no extra text?</li> <li>Is there any reasoning in the assistant message that you don’t want the student to learn? | |
| (For example: an internal monologue, disclaimers, or repeated phrasing that the teacher leaked | |
| from its own system prompt.)</li></ul> <hr> <h2 class="relative group"><a id="8-measure-token-lengths" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#8-measure-token-lengths"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>8. Measure token lengths</span></h2> <p data-svelte-h="svelte-d6vup6">Set <code>max_length</code> in <code>SFTConfig</code> to cover nearly all examples without wasting GPU memory on padding. | |
| The 99th percentile is a good target: you truncate fewer than 1% of examples while keeping batches tight.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> numpy <span class="hljs-keyword">as</span> np | |
| <span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer | |
| tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">"Qwen/Qwen3-1.7B"</span>) | |
| lengths = [] | |
| <span class="hljs-keyword">for</span> row <span class="hljs-keyword">in</span> correct: | |
| text = tokenizer.apply_chat_template( | |
| row[<span class="hljs-string">"messages"</span>], tokenize=<span class="hljs-literal">False</span>, add_generation_prompt=<span class="hljs-literal">False</span> | |
| ) | |
| ids = tokenizer.encode(text) | |
| lengths.append(<span class="hljs-built_in">len</span>(ids)) | |
| lengths = np.array(lengths) | |
| MAX_SEQ_LEN = <span class="hljs-built_in">int</span>(np.percentile(lengths, <span class="hljs-number">99</span>)) + <span class="hljs-number">16</span> | |
| <span class="hljs-built_in">print</span>( | |
| <span class="hljs-string">f"p50=<span class="hljs-subst">{np.percentile(lengths, <span class="hljs-number">50</span>):<span class="hljs-number">.0</span>f}</span> "</span> | |
| <span class="hljs-string">f"p95=<span class="hljs-subst">{np.percentile(lengths, <span class="hljs-number">95</span>):<span class="hljs-number">.0</span>f}</span> "</span> | |
| <span class="hljs-string">f"p99=<span class="hljs-subst">{np.percentile(lengths, <span class="hljs-number">99</span>):<span class="hljs-number">.0</span>f}</span> "</span> | |
| <span class="hljs-string">f"max=<span class="hljs-subst">{lengths.<span class="hljs-built_in">max</span>()}</span>"</span> | |
| ) | |
| <span class="hljs-built_in">print</span>(<span class="hljs-string">f"Setting MAX_SEQ_LEN = <span class="hljs-subst">{MAX_SEQ_LEN}</span>"</span>)<!-- HTML_TAG_END --></pre></div> <hr> <h2 class="relative group"><a id="9-fine-tune-with-sfttrainer" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#9-fine-tune-with-sfttrainer"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>9. Fine-tune with SFTTrainer</span></h2> <p data-svelte-h="svelte-1uo0pb3"><code>assistant_only_loss=True</code> in <code>SFTConfig</code> masks the prompt tokens so the loss is computed only on the | |
| assistant response — the <code>&lt;tool_call&gt;</code> block. This is more efficient than full-sequence training and avoids | |
| accidentally reinforcing the system prompt wording.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> Dataset | |
| <span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForCausalLM | |
| <span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> SFTConfig, SFTTrainer | |
| dataset = Dataset.from_list([{<span class="hljs-string">"messages"</span>: r[<span class="hljs-string">"messages"</span>]} <span class="hljs-keyword">for</span> r <span class="hljs-keyword">in</span> correct]) | |
| model = AutoModelForCausalLM.from_pretrained(<span class="hljs-string">"Qwen/Qwen3-1.7B"</span>) | |
| sft_config = SFTConfig( | |
| output_dir=<span class="hljs-string">"reasoning-gym-chain-sum-Qwen3-1.7B-sft"</span>, | |
| max_length=MAX_SEQ_LEN, | |
| num_train_epochs=<span class="hljs-number">3</span>, | |
| per_device_train_batch_size=<span class="hljs-number">4</span>, | |
| gradient_accumulation_steps=<span class="hljs-number">2</span>, | |
| learning_rate=<span class="hljs-number">2e-5</span>, | |
| warmup_steps=<span class="hljs-number">10</span>, | |
| lr_scheduler_type=<span class="hljs-string">"cosine"</span>, | |
| logging_steps=<span class="hljs-number">5</span>, | |
| save_strategy=<span class="hljs-string">"no"</span>, | |
| assistant_only_loss=<span class="hljs-literal">True</span>, | |
| ) | |
| trainer = SFTTrainer( | |
| model=model, | |
| train_dataset=dataset, | |
| processing_class=tokenizer, | |
| args=sft_config, | |
| ) | |
| trainer.train() | |
| trainer.push_to_hub(commit_message=<span class="hljs-string">"SFT warm-up on reasoning_gym chain_sum"</span>)<!-- HTML_TAG_END --></pre></div> <blockquote class="note" data-svelte-h="svelte-l32ipe"><p>Training on ~270 examples for 3 epochs takes around 5 minutes on a single A100 (40 GB). The goal is format | |
| compliance, not task mastery — a handful of epochs is enough. Mastery comes from GRPO.</p></blockquote> <hr> <h2 class="relative group"><a id="10-evaluate-before-vs-after" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#10-evaluate-before-vs-after"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>10. Evaluate: before vs after</span></h2> <p data-svelte-h="svelte-1bax8nc">Run both the base model and the SFT checkpoint on a held-out set and compare. The key metric for a | |
| warm-up evaluation is <strong>format compliance</strong> — how often the model uses <code>&lt;tool_call&gt;</code> correctly — as | |
| well as overall accuracy.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> re | |
| <span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> pipeline | |
| <span class="hljs-keyword">from</span> reasoning_gym_env.client <span class="hljs-keyword">import</span> ReasoningGymEnv | |
| <span class="hljs-keyword">from</span> reasoning_gym_env.models <span class="hljs-keyword">import</span> ReasoningGymAction | |
| <span class="hljs-keyword">async</span> <span class="hljs-keyword">def</span> <span class="hljs-title function_">evaluate_model</span>(<span class="hljs-params">model_name, n_eval=<span class="hljs-number">50</span>, seed=<span class="hljs-number">999</span></span>): | |
| gen = pipeline( | |
| <span class="hljs-string">"text-generation"</span>, | |
| model=model_name, | |
| tokenizer=model_name, | |
| device_map=<span class="hljs-string">"auto"</span>, | |
| dtype=<span class="hljs-string">"auto"</span>, | |
| ) | |
| gen.model.generation_config.max_length = <span class="hljs-literal">None</span> | |
| tok = AutoTokenizer.from_pretrained(model_name) | |
| env = ReasoningGymEnv(base_url=<span class="hljs-string">"https://sergiopaniego-reasoning-gym.hf.space"</span>) | |
| obs = <span class="hljs-keyword">await</span> env.reset( | |
| dataset_name=<span class="hljs-string">"chain_sum"</span>, | |
| dataset_config=DATASET_CONFIG, | |
| seed=seed, | |
| size=n_eval, | |
| ) | |
| rewards, format_hits = [], <span class="hljs-number">0</span> | |
| <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(n_eval): | |
| <span class="hljs-keyword">if</span> i > <span class="hljs-number">0</span>: | |
| obs = <span class="hljs-keyword">await</span> env.reset() | |
| question = obs.observation.question | |
| messages = [ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: SYSTEM_PROMPT}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: question}, | |
| ] | |
| prompt = tok.apply_chat_template( | |
| messages, tokenize=<span class="hljs-literal">False</span>, add_generation_prompt=<span class="hljs-literal">True</span> | |
| ) | |
| completion = gen(prompt, max_new_tokens=<span class="hljs-number">64</span>)[<span class="hljs-number">0</span>][<span class="hljs-string">"generated_text"</span>][<span class="hljs-built_in">len</span>(prompt):] | |
| m = re.search(<span class="hljs-string">r'"answer"\s*:\s*"?(\d+)"?'</span>, completion) | |
| <span class="hljs-keyword">if</span> m: | |
| format_hits += <span class="hljs-number">1</span> | |
| answer = m.group(<span class="hljs-number">1</span>) | |
| <span class="hljs-keyword">else</span>: | |
| nums = re.findall(<span class="hljs-string">r"\b(\d+)\b"</span>, completion) | |
| answer = nums[-<span class="hljs-number">1</span>] <span class="hljs-keyword">if</span> nums <span class="hljs-keyword">else</span> <span class="hljs-string">"0"</span> | |
| result = <span class="hljs-keyword">await</span> env.step(ReasoningGymAction(answer=answer)) | |
| rewards.append(<span class="hljs-built_in">float</span>(result.observation.score <span class="hljs-keyword">or</span> <span class="hljs-number">0.0</span>)) | |
| <span class="hljs-keyword">await</span> env.close() | |
| <span class="hljs-keyword">del</span> gen <span class="hljs-comment"># free GPU memory before loading the next model</span> | |
| <span class="hljs-keyword">return</span> { | |
| <span class="hljs-string">"accuracy"</span>: <span class="hljs-built_in">sum</span>(rewards) / <span class="hljs-built_in">len</span>(rewards), | |
| <span class="hljs-string">"format_compliance"</span>: format_hits / n_eval, | |
| } | |
| base_metrics = <span class="hljs-keyword">await</span> evaluate_model(<span class="hljs-string">"Qwen/Qwen3-1.7B"</span>) | |
| sft_metrics = <span class="hljs-keyword">await</span> evaluate_model(<span class="hljs-string">f"<span class="hljs-subst">{YOUR_HF_USERNAME}</span>/reasoning-gym-chain-sum-Qwen3-1.7B-sft"</span>) | |
| <span class="hljs-built_in">print</span>(<span class="hljs-string">f"\n<span class="hljs-subst">{<span class="hljs-string">'Metric'</span>:<<span class="hljs-number">25</span>}</span> <span class="hljs-subst">{<span class="hljs-string">'Base model'</span>:><span class="hljs-number">12</span>}</span> <span class="hljs-subst">{<span class="hljs-string">'After SFT'</span>:><span class="hljs-number">12</span>}</span> <span class="hljs-subst">{<span class="hljs-string">'Delta'</span>:><span class="hljs-number">10</span>}</span>"</span>) | |
| <span class="hljs-built_in">print</span>(<span class="hljs-string">"-"</span> * <span class="hljs-number">62</span>) | |
| <span class="hljs-keyword">for</span> key, label <span class="hljs-keyword">in</span> [(<span class="hljs-string">"format_compliance"</span>, <span class="hljs-string">"Format compliance"</span>), (<span class="hljs-string">"accuracy"</span>, <span class="hljs-string">"Accuracy"</span>)]: | |
| b, s = base_metrics[key], sft_metrics[key] | |
| <span class="hljs-built_in">print</span>(<span class="hljs-string">f"<span class="hljs-subst">{label:<<span class="hljs-number">25</span>}</span> <span class="hljs-subst">{b:><span class="hljs-number">12.1</span>%}</span> <span class="hljs-subst">{s:><span class="hljs-number">12.1</span>%}</span> <span class="hljs-subst">{(s - b) * <span class="hljs-number">100</span>:>+<span class="hljs-number">9.1</span>f}</span> pp"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1udb8dr">A successful warm-up looks like this:</p> <table data-svelte-h="svelte-1oxtqlv"><thead><tr><th>Metric</th> <th>Base model</th> <th>After SFT</th> <th>Delta</th></tr></thead> <tbody><tr><td>Format compliance</td> <td>~0%</td> <td>~68%</td> <td>+68 pp</td></tr> <tr><td>Accuracy</td> <td>~4%</td> <td>~68%</td> <td>+64 pp</td></tr></tbody></table> <p data-svelte-h="svelte-f1s8c3">Format compliance should jump sharply from near-zero — that’s the primary goal. <code>Qwen3-1.7B</code> produces | |
| essentially no valid <code>&lt;tool_call&gt;</code> blocks out of the box. After SFT on ~270 examples, the model reliably | |
| uses the format, and accuracy rises in lockstep because correct format is a prerequisite for the | |
| environment’s scorer to award any credit.</p> <hr> <h2 class="relative group"><a id="11-where-to-go-next-grpo" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#11-where-to-go-next-grpo"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>11. Where to go next: GRPO</span></h2> <p data-svelte-h="svelte-6b2drq">The SFT checkpoint is ready to use as the starting model for GRPO. In the | |
| <a href="end-to-end-walkthrough">end-to-end walkthrough</a>, | |
| change one line in section 8:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-comment"># Before (cold-start from the base model):</span> | |
| MODEL_NAME = <span class="hljs-string">"Qwen/Qwen3-1.7B"</span> | |
| <span class="hljs-comment"># After (warm-start from your SFT checkpoint):</span> | |
| MODEL_NAME = <span class="hljs-string">f"<span class="hljs-subst">{YOUR_HF_USERNAME}</span>/reasoning-gym-chain-sum-Qwen3-1.7B-sft"</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-18tauj5">With format compliance already near 100%, GRPO’s <code>reward_std</code> will be non-zero from the very first | |
| batch and the reward curve will climb immediately — no cold-start stall.</p> <p data-svelte-h="svelte-r3ulln"><strong>Other directions:</strong></p> <ul data-svelte-h="svelte-iyl68d"><li><strong>Harder tasks.</strong> Increase <code>max_terms</code> or <code>max_digits</code> in <code>DATASET_CONFIG</code> and collect a new SFT set. | |
| Once the student handles easier examples reliably, a harder GRPO phase can push further.</li> <li><strong>Different environments.</strong> The same pipeline — teacher collects → filter → SFT → GRPO — applies to | |
| any OpenEnv environment. Swap <code>reasoning_gym_env</code> and the <code>answer</code> tool definition for your env’s | |
| tool surface.</li> <li><strong>Larger teacher.</strong> Using <code>gpt-5</code> or <code>claude-opus-4</code> as the teacher will yield higher-quality examples, | |
| especially for tasks where <code>gpt-5-mini</code> struggles.</li></ul> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/openenv/blob/main/docs/source/tutorials/sft-warmup.md" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p> | |
| <script> | |
| { | |
// Inline bootstrap for this page: publish path configuration on a global object,
// then load the two entry modules and start the app inside this script's parent element.
// NOTE(review): the identifier prefix suggests SvelteKit-generated output; confirm before hand-editing.
//
// The assignment below has no declaration keyword, so the name is not scoped to the
// enclosing block. `assets` and `base` hold the same path on this page; `env` is empty.
| __sveltekit_1qwoa43 = { | |
| assets: "/docs/openenv/pr_749/en", | |
| base: "/docs/openenv/pr_749/en", | |
| env: {} | |
| }; | |
// Parent element of this <script> tag. It is captured here, while the script is still
// executing, and used later inside the asynchronous import callback.
| const element = document.currentScript.parentElement; | |
// Two null entries, forwarded to kit.start() as `data`.
// Presumably one slot per entry in `node_ids` below; verify against the framework.
| const data = [null,null]; | |
// Request both entry modules at once; the callback runs only after both imports resolve.
| Promise.all([ | |
| import("/docs/openenv/pr_749/en/_app/immutable/entry/start.85477f45.js"), | |
| import("/docs/openenv/pr_749/en/_app/immutable/entry/app.51835dc5.js") | |
| ]).then(([kit, app]) => { | |
// Start the app with the imported app module, the host element, and the initial options.
| kit.start(app, element, { | |
| node_ids: [0, 65], | |
| data, | |
| form: null, | |
| error: null | |
| }); | |
| }); | |
| } | |
| </script> | |
Xet Storage Details
- Size:
- 70.1 kB
- Xet hash:
- 780a59e2c9544dff0eff2652362cdccf60536655adcb0a5eb4b0879e36f6904e
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.