Buckets:

hf-doc-build
/

doc-dev

Files

xet

hf-doc-build/doc-dev / openenv /pr_749 /en /tutorials /sft-warmup.html

HuggingFaceDocBuilder

27 days ago

download

raw

70.1 kB

	<meta charset="utf-8" /><meta name="hf:doc:metadata" content='{"title":"Collecting rollouts with OpenEnv for supervised training","local":"collecting-rollouts-with-openenv-for-supervised-training","sections":[{"title":"Why use an environment to collect training data","local":"why-use-an-environment-to-collect-training-data","sections":[],"depth":2},{"title":"What you’ll use","local":"what-youll-use","sections":[],"depth":2},{"title":"1. Install dependencies","local":"1-install-dependencies","sections":[],"depth":2},{"title":"2. Set your credentials","local":"2-set-your-credentials","sections":[],"depth":2},{"title":"3. Define the system prompt","local":"3-define-the-system-prompt","sections":[],"depth":2},{"title":"4. Configure data collection","local":"4-configure-data-collection","sections":[],"depth":2},{"title":"5. Collect rollouts with openenv collect","local":"5-collect-rollouts-with-openenv-collect","sections":[],"depth":2},{"title":"6. Filter the dataset","local":"6-filter-the-dataset","sections":[],"depth":2},{"title":"7. Inspect the dataset before training","local":"7-inspect-the-dataset-before-training","sections":[],"depth":2},{"title":"8. Measure token lengths","local":"8-measure-token-lengths","sections":[],"depth":2},{"title":"9. Fine-tune with SFTTrainer","local":"9-fine-tune-with-sfttrainer","sections":[],"depth":2},{"title":"10. Evaluate: before vs after","local":"10-evaluate-before-vs-after","sections":[],"depth":2},{"title":"11. Where to go next: GRPO","local":"11-where-to-go-next-grpo","sections":[],"depth":2}],"depth":1}'>
	<link href="/docs/openenv/pr_749/en/_app/immutable/assets/0.e3b0c442.css" rel="stylesheet">
	<link rel="modulepreload" href="/docs/openenv/pr_749/en/_app/immutable/entry/start.85477f45.js">
	<link rel="modulepreload" href="/docs/openenv/pr_749/en/_app/immutable/chunks/scheduler.2b22cead.js">
	<link rel="modulepreload" href="/docs/openenv/pr_749/en/_app/immutable/chunks/singletons.63566282.js">
	<link rel="modulepreload" href="/docs/openenv/pr_749/en/_app/immutable/chunks/paths.dd876c7b.js">
	<link rel="modulepreload" href="/docs/openenv/pr_749/en/_app/immutable/entry/app.51835dc5.js">
	<link rel="modulepreload" href="/docs/openenv/pr_749/en/_app/immutable/chunks/preload-helper.0820fbc7.js">
	<link rel="modulepreload" href="/docs/openenv/pr_749/en/_app/immutable/chunks/index.1a0e8013.js">
	<link rel="modulepreload" href="/docs/openenv/pr_749/en/_app/immutable/nodes/0.167255c0.js">
	<link rel="modulepreload" href="/docs/openenv/pr_749/en/_app/immutable/chunks/each.e59479a4.js">
	<link rel="modulepreload" href="/docs/openenv/pr_749/en/_app/immutable/nodes/65.abb66251.js">
	<link rel="modulepreload" href="/docs/openenv/pr_749/en/_app/immutable/chunks/Heading.c0d3f116.js">
	<link rel="modulepreload" href="/docs/openenv/pr_749/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.21bcf336.js">
	<link rel="modulepreload" href="/docs/openenv/pr_749/en/_app/immutable/chunks/CodeBlock.c8d73295.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Collecting rollouts with OpenEnv for supervised training","local":"collecting-rollouts-with-openenv-for-supervised-training","sections":[{"title":"Why use an environment to collect training data","local":"why-use-an-environment-to-collect-training-data","sections":[],"depth":2},{"title":"What you’ll use","local":"what-youll-use","sections":[],"depth":2},{"title":"1. Install dependencies","local":"1-install-dependencies","sections":[],"depth":2},{"title":"2. Set your credentials","local":"2-set-your-credentials","sections":[],"depth":2},{"title":"3. Define the system prompt","local":"3-define-the-system-prompt","sections":[],"depth":2},{"title":"4. Configure data collection","local":"4-configure-data-collection","sections":[],"depth":2},{"title":"5. Collect rollouts with openenv collect","local":"5-collect-rollouts-with-openenv-collect","sections":[],"depth":2},{"title":"6. Filter the dataset","local":"6-filter-the-dataset","sections":[],"depth":2},{"title":"7. Inspect the dataset before training","local":"7-inspect-the-dataset-before-training","sections":[],"depth":2},{"title":"8. Measure token lengths","local":"8-measure-token-lengths","sections":[],"depth":2},{"title":"9. Fine-tune with SFTTrainer","local":"9-fine-tune-with-sfttrainer","sections":[],"depth":2},{"title":"10. Evaluate: before vs after","local":"10-evaluate-before-vs-after","sections":[],"depth":2},{"title":"11. 
Where to go next: GRPO","local":"11-where-to-go-next-grpo","sections":[],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" 
stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="collecting-rollouts-with-openenv-for-supervised-training" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#collecting-rollouts-with-openenv-for-supervised-training"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Collecting rollouts with OpenEnv for supervised training</span></h1> <p data-svelte-h="svelte-1mb1q37"><a href="https://colab.research.google.com/github/huggingface/OpenEnv/blob/main/examples/sft_warmup.ipynb" rel="nofollow"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"></a></p> <p data-svelte-h="svelte-n1a38f">OpenEnv environments are not only useful for RL training — they are also a natural tool for <strong>collecting
	rollouts that become supervised training data</strong>. The environment handles episode management, automatic scoring,
	and reproducibility, so you get a reward-labeled dataset without writing any of that infrastructure yourself.</p> <p data-svelte-h="svelte-cj2ozg">This tutorial shows the full pipeline:</p> <ol data-svelte-h="svelte-1mxsm9c"><li>Run a strong teacher model inside an OpenEnv environment to collect rollouts.</li> <li>Use the environment’s reward signal to filter out incorrect examples automatically.</li> <li>Train a smaller student model on the filtered rollouts with TRL’s <code>SFTTrainer</code>.</li></ol> <p data-svelte-h="svelte-1rn2nlx">As a concrete application, the resulting checkpoint is used as a warm-start for GRPO: once the student
	reliably produces valid tool calls, GRPO’s <code>reward_std</code> is non-zero from the first batch and the reward
	curve climbs immediately.</p> <h2 class="relative group"><a id="why-use-an-environment-to-collect-training-data" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#why-use-an-environment-to-collect-training-data"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Why use an environment to collect training data</span></h2> <p data-svelte-h="svelte-2qxvgr">Building a supervised dataset usually means writing a custom collection loop, a scorer, and episode
	bookkeeping. An OpenEnv environment gives you all three out of the box:</p> <ul data-svelte-h="svelte-16gl30a"><li><strong>Automatic scoring</strong> — every <code>step()</code> returns a reward. Filter by <code>reward == 1.0</code> and you have a
	clean, correct dataset with no manual labelling.</li> <li><strong>Reproducible episodes</strong> — <code>reset(seed=42, size=N)</code> produces the same sequence of problems every
	run. Anyone can regenerate the exact dataset.</li> <li><strong>Configurable difficulty</strong> — adjust <code>DATASET_CONFIG</code> to control problem complexity without changing
	any collection code.</li> <li><strong>Portable across environments</strong> — the same collect → filter → train pipeline works for any OpenEnv
	environment. Swap the env and the tool definition; everything else stays the same.</li></ul> <h2 class="relative group"><a id="what-youll-use" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#what-youll-use"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>What you’ll use</span></h2> <table data-svelte-h="svelte-148mvz9"><thead><tr><th></th> <th></th></tr></thead> <tbody><tr><td><strong>Student model</strong></td> <td><a href="https://huggingface.co/Qwen/Qwen3-1.7B" rel="nofollow"><code>Qwen/Qwen3-1.7B</code></a></td></tr> <tr><td><strong>Teacher model</strong></td> <td><code>gpt-5-mini</code> via the OpenAI API</td></tr> <tr><td><strong>Environment</strong></td> <td><a href="https://github.com/huggingface/OpenEnv/tree/main/envs/reasoning_gym_env" rel="nofollow"><code>reasoning_gym_env</code></a> / <code>chain_sum</code></td></tr> <tr><td><strong>SFT trainer</strong></td> <td><a href="https://huggingface.co/docs/trl/main/en/sft_trainer" rel="nofollow">TRL <code>SFTTrainer</code></a></td></tr> <tr><td><strong>Next step</strong></td> <td><a href="end-to-end-walkthrough">End-to-end walkthrough with 
GRPO</a></td></tr></tbody></table> <hr> <h2 class="relative group"><a id="1-install-dependencies" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#1-install-dependencies"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>1. 
Install dependencies</span></h2> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START -->!pip install -q openai trl
	!pip install -q openenv
	!pip install -q --no-deps git+https://huggingface.co/spaces/sergiopaniego/reasoning_gym
	!pip install -Uq <span class="hljs-string">"transformers>=5.3.0"</span><!-- HTML_TAG_END --></pre></div> <hr> <h2 class="relative group"><a id="2-set-your-credentials" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#2-set-your-credentials"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>2. 
Set your credentials</span></h2> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> getpass, os

	<span class="hljs-keyword">if</span> <span class="hljs-string">"OPENAI_API_KEY"</span> <span class="hljs-keyword">not</span> <span class="hljs-keyword">in</span> os.environ:
	    os.environ[<span class="hljs-string">"OPENAI_API_KEY"</span>] = getpass.getpass(<span class="hljs-string">"OpenAI API key: "</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1t2mhms">You’ll also need a Hugging Face login to download the base model and push both the collected dataset
	and the fine-tuned checkpoint:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> notebook_login

	notebook_login()<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START -->YOUR_HF_USERNAME = <span class="hljs-string">"your-username"</span> <span class="hljs-comment"># replace with your Hugging Face username</span>
	<span class="hljs-keyword">assert</span> YOUR_HF_USERNAME != <span class="hljs-string">"your-username"</span>, <span class="hljs-string">"Replace YOUR_HF_USERNAME with your Hugging Face username"</span><!-- HTML_TAG_END --></pre></div> <hr> <h2 class="relative group"><a id="3-define-the-system-prompt" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#3-define-the-system-prompt"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>3. Define the system prompt</span></h2> <p data-svelte-h="svelte-18uzm6p">Use the same prompt as the <a href="end-to-end-walkthrough">GRPO tutorial</a>
	so the SFT-trained model is a drop-in replacement when you continue with GRPO.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START -->SYSTEM_PROMPT = <span class="hljs-string">"""You are a careful arithmetic assistant.

	You will be given a chain of integer additions. Compute the result and submit it as a single number.

	Rules:
	1. Read the question carefully.
	2. Use the tool `answer` exactly once with your final number.
	3. The answer must be a single integer with no units or explanation.
	"""</span><!-- HTML_TAG_END --></pre></div> <hr> <h2 class="relative group"><a id="4-configure-data-collection" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#4-configure-data-collection"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>4. Configure data collection</span></h2> <p data-svelte-h="svelte-poatas"><code>DATASET_CONFIG</code> controls the difficulty of the <code>chain_sum</code> problems the environment generates:
	<code>min_terms</code>/<code>max_terms</code> set how many integers are added together, and <code>min_digits</code>/<code>max_digits</code> set
	how many digits each integer has. At these settings each problem is a sum of 2–3 two-digit numbers
	— easy enough for <code>gpt-5-mini</code> to answer correctly ~90% of the time, which gives a clean training
	signal after filtering.</p> <p data-svelte-h="svelte-158jha2"><code>N_EPISODES</code> is the number of problems to collect. 300 is enough to get ~270 correct examples after
	filtering, which is sufficient for format compliance training.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START -->DATASET_CONFIG = {
	<span class="hljs-string">"min_terms"</span>: <span class="hljs-number">2</span>,
	<span class="hljs-string">"max_terms"</span>: <span class="hljs-number">3</span>,
	<span class="hljs-string">"min_digits"</span>: <span class="hljs-number">2</span>,
	<span class="hljs-string">"max_digits"</span>: <span class="hljs-number">2</span>,
	}

	N_EPISODES = <span class="hljs-number">300</span><!-- HTML_TAG_END --></pre></div> <hr> <h2 class="relative group"><a id="5-collect-rollouts-with-openenv-collect" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#5-collect-rollouts-with-openenv-collect"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>5. Collect rollouts with openenv collect</span></h2> <p data-svelte-h="svelte-1w7kvuu"><code>openenv collect</code> runs the teacher model inside the environment and records every episode — the
	environment’s <code>step()</code> reward is written alongside the messages, so filtering by correctness requires
	no additional scoring code.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> json, shlex

	dataset_config_arg = shlex.quote(json.dumps(DATASET_CONFIG))
	system_prompt_arg = shlex.quote(SYSTEM_PROMPT)
	hub_repo_arg = shlex.quote(<span class="hljs-string">f"<span class="hljs-subst">{YOUR_HF_USERNAME}</span>/chain-sum-rollouts"</span>)

	!openenv collect reasoning_gym:chain_sum \
	--base-url https://sergiopaniego-reasoning-gym.hf.space \
	--provider openai \
	--model gpt-<span class="hljs-number">5</span>-mini \
	--num-episodes {N_EPISODES} \
	--<span class="hljs-built_in">max</span>-tokens <span class="hljs-number">1024</span> \
	--dataset-config {dataset_config_arg} \
	--system-prompt {system_prompt_arg} \
	--push-to-hub {hub_repo_arg} \
	--output-<span class="hljs-built_in">dir</span> ./rollouts<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ptdf38">The command prints a live progress summary and pushes the collected episodes to the Hub as
	<code>{YOUR_HF_USERNAME}/chain-sum-rollouts</code>. Pull them back to start filtering:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset

	ds = load_dataset(<span class="hljs-string">f"<span class="hljs-subst">{YOUR_HF_USERNAME}</span>/chain-sum-rollouts"</span>, split=<span class="hljs-string">"train"</span>)
	raw_rollouts = <span class="hljs-built_in">list</span>(ds)
	<span class="hljs-built_in">print</span>(<span class="hljs-string">f"Collected <span class="hljs-subst">{<span class="hljs-built_in">len</span>(raw_rollouts)}</span> episodes"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1pn841m">The <code>messages</code> field stores the full conversation in standard OpenAI format (assistant messages have
	a <code>tool_calls</code> list). Convert to Qwen3’s <code>&lt;tool_call&gt;</code> text format before training — GRPOTrainer
	produces this same format during RL, so the SFT checkpoint becomes a direct drop-in:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">to_qwen3_messages</span>(<span class="hljs-params">record</span>):
	converted = []
	<span class="hljs-keyword">for</span> msg <span class="hljs-keyword">in</span> record[<span class="hljs-string">"messages"</span>]:
	<span class="hljs-keyword">if</span> msg[<span class="hljs-string">"role"</span>] == <span class="hljs-string">"tool"</span>:
	<span class="hljs-keyword">continue</span> <span class="hljs-comment"># strip environment responses; SFT only needs the assistant turn</span>
	<span class="hljs-keyword">if</span> msg[<span class="hljs-string">"role"</span>] == <span class="hljs-string">"assistant"</span> <span class="hljs-keyword">and</span> msg.get(<span class="hljs-string">"tool_calls"</span>):
	tc = msg[<span class="hljs-string">"tool_calls"</span>][<span class="hljs-number">0</span>]
	args = json.loads(tc[<span class="hljs-string">"function"</span>][<span class="hljs-string">"arguments"</span>])
	answer_str = args.get(<span class="hljs-string">"answer"</span>, <span class="hljs-string">""</span>)
	tool_call_text = (
	<span class="hljs-string">"&lt;tool_call&gt;\n"</span>
	+ json.dumps({<span class="hljs-string">"name"</span>: <span class="hljs-string">"answer"</span>, <span class="hljs-string">"arguments"</span>: {<span class="hljs-string">"answer"</span>: answer_str}})
	+ <span class="hljs-string">"\n&lt;/tool_call&gt;"</span>
	)
	converted.append({<span class="hljs-string">"role"</span>: <span class="hljs-string">"assistant"</span>, <span class="hljs-string">"content"</span>: tool_call_text})
	<span class="hljs-keyword">else</span>:
	converted.append(msg)
	<span class="hljs-keyword">return</span> {<span class="hljs-string">"messages"</span>: converted, <span class="hljs-string">"reward"</span>: record[<span class="hljs-string">"reward"</span>]}

	rollouts = [to_qwen3_messages(r) <span class="hljs-keyword">for</span> r <span class="hljs-keyword">in</span> raw_rollouts]<!-- HTML_TAG_END --></pre></div> <hr> <h2 class="relative group"><a id="6-filter-the-dataset" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#6-filter-the-dataset"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>6. Filter the dataset</span></h2> <p data-svelte-h="svelte-1vi36q9">Keep only episodes where the teacher answered correctly. The environment’s reward signal does the
	labelling — no manual annotation needed.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START -->correct = [r <span class="hljs-keyword">for</span> r <span class="hljs-keyword">in</span> rollouts <span class="hljs-keyword">if</span> r[<span class="hljs-string">"reward"</span>] == <span class="hljs-number">1.0</span>]
	<span class="hljs-built_in">print</span>(<span class="hljs-string">f"Correct: <span class="hljs-subst">{<span class="hljs-built_in">len</span>(correct)}</span> / <span class="hljs-subst">{<span class="hljs-built_in">len</span>(rollouts)}</span> (<span class="hljs-subst">{<span class="hljs-built_in">len</span>(correct)/<span class="hljs-built_in">len</span>(rollouts):<span class="hljs-number">.1</span>%}</span>)"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1wrgc3t"><code>gpt-5-mini</code> typically scores above 90% on <code>chain_sum</code> at this difficulty, so you should end up with
	~270 examples from 300 rollouts.</p> <hr> <h2 class="relative group"><a id="7-inspect-the-dataset-before-training" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#7-inspect-the-dataset-before-training"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>7. Inspect the dataset before training</span></h2> <p data-svelte-h="svelte-6lfca0">Always look at your data before training. Automated collection can introduce unexpected patterns that the
	student model will learn to imitate.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> random

	<span class="hljs-keyword">for</span> row <span class="hljs-keyword">in</span> random.sample(correct, <span class="hljs-number">3</span>):
	question = row[<span class="hljs-string">"messages"</span>][<span class="hljs-number">0</span>][<span class="hljs-string">"content"</span>]
	response = row[<span class="hljs-string">"messages"</span>][<span class="hljs-number">1</span>][<span class="hljs-string">"content"</span>]
	<span class="hljs-built_in">print</span>(<span class="hljs-string">f"Q: <span class="hljs-subst">{question}</span>"</span>)
	<span class="hljs-built_in">print</span>(<span class="hljs-string">f"A: <span class="hljs-subst">{response}</span>"</span>)
	<span class="hljs-built_in">print</span>()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1jmsvh4">Things to check:</p> <ul data-svelte-h="svelte-vbk6t4"><li>Does every response contain a valid <code>&lt;tool_call&gt;</code> block?</li> <li>Are the answers integers with no extra text?</li> <li>Is there any reasoning in the assistant message that you don’t want the student to learn?
	(For example: an internal monologue, disclaimers, or repeated phrasing that the teacher leaked
	from its own system prompt.)</li></ul> <hr> <h2 class="relative group"><a id="8-measure-token-lengths" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#8-measure-token-lengths"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>8. Measure token lengths</span></h2> <p data-svelte-h="svelte-d6vup6">Set <code>max_length</code> in <code>SFTConfig</code> to cover nearly all examples without wasting GPU memory on padding.
	The 99th percentile is a good target: you truncate fewer than 1% of examples while keeping batches tight.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> numpy <span class="hljs-keyword">as</span> np
	<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer

	tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">"Qwen/Qwen3-1.7B"</span>)

	lengths = []
	<span class="hljs-keyword">for</span> row <span class="hljs-keyword">in</span> correct:
	text = tokenizer.apply_chat_template(
	row[<span class="hljs-string">"messages"</span>], tokenize=<span class="hljs-literal">False</span>, add_generation_prompt=<span class="hljs-literal">False</span>
	)
	ids = tokenizer.encode(text)
	lengths.append(<span class="hljs-built_in">len</span>(ids))

	lengths = np.array(lengths)
	MAX_SEQ_LEN = <span class="hljs-built_in">int</span>(np.percentile(lengths, <span class="hljs-number">99</span>)) + <span class="hljs-number">16</span>

	<span class="hljs-built_in">print</span>(
	<span class="hljs-string">f"p50=<span class="hljs-subst">{np.percentile(lengths, <span class="hljs-number">50</span>):<span class="hljs-number">.0</span>f}</span> "</span>
	<span class="hljs-string">f"p95=<span class="hljs-subst">{np.percentile(lengths, <span class="hljs-number">95</span>):<span class="hljs-number">.0</span>f}</span> "</span>
	<span class="hljs-string">f"p99=<span class="hljs-subst">{np.percentile(lengths, <span class="hljs-number">99</span>):<span class="hljs-number">.0</span>f}</span> "</span>
	<span class="hljs-string">f"max=<span class="hljs-subst">{lengths.<span class="hljs-built_in">max</span>()}</span>"</span>
	)
	<span class="hljs-built_in">print</span>(<span class="hljs-string">f"Setting MAX_SEQ_LEN = <span class="hljs-subst">{MAX_SEQ_LEN}</span>"</span>)<!-- HTML_TAG_END --></pre></div> <hr> <h2 class="relative group"><a id="9-fine-tune-with-sfttrainer" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#9-fine-tune-with-sfttrainer"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>9. Fine-tune with SFTTrainer</span></h2> <p data-svelte-h="svelte-1uo0pb3"><code>assistant_only_loss=True</code> in <code>SFTConfig</code> masks the prompt tokens so the loss is computed only on the
	assistant response — the <code>&lt;tool_call&gt;</code> block. This is more efficient than full-sequence training and avoids
	accidentally reinforcing the system prompt wording.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> Dataset
	<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForCausalLM
	<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> SFTConfig, SFTTrainer

	dataset = Dataset.from_list([{<span class="hljs-string">"messages"</span>: r[<span class="hljs-string">"messages"</span>]} <span class="hljs-keyword">for</span> r <span class="hljs-keyword">in</span> correct])

	model = AutoModelForCausalLM.from_pretrained(<span class="hljs-string">"Qwen/Qwen3-1.7B"</span>)

	sft_config = SFTConfig(
	output_dir=<span class="hljs-string">"reasoning-gym-chain-sum-Qwen3-1.7B-sft"</span>,
	max_length=MAX_SEQ_LEN,
	num_train_epochs=<span class="hljs-number">3</span>,
	per_device_train_batch_size=<span class="hljs-number">4</span>,
	gradient_accumulation_steps=<span class="hljs-number">2</span>,
	learning_rate=<span class="hljs-number">2e-5</span>,
	warmup_steps=<span class="hljs-number">10</span>,
	lr_scheduler_type=<span class="hljs-string">"cosine"</span>,
	logging_steps=<span class="hljs-number">5</span>,
	save_strategy=<span class="hljs-string">"no"</span>,
	assistant_only_loss=<span class="hljs-literal">True</span>,
	)

	trainer = SFTTrainer(
	model=model,
	train_dataset=dataset,
	processing_class=tokenizer,
	args=sft_config,
	)

	trainer.train()
	trainer.push_to_hub(commit_message=<span class="hljs-string">"SFT warm-up on reasoning_gym chain_sum"</span>)<!-- HTML_TAG_END --></pre></div> <blockquote class="note" data-svelte-h="svelte-l32ipe"><p>Training ~270 examples for 3 epochs takes around 5 minutes on a single A100 (40 GB). The goal is format
	compliance, not task mastery — a handful of epochs is enough. Mastery comes from GRPO.</p></blockquote> <hr> <h2 class="relative group"><a id="10-evaluate-before-vs-after" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#10-evaluate-before-vs-after"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>10. Evaluate: before vs after</span></h2> <p data-svelte-h="svelte-1bax8nc">Run both the base model and the SFT checkpoint on a held-out set and compare. The key metric for a
	warm-up evaluation is <strong>format compliance</strong> — how often the model uses <code>&lt;tool_call&gt;</code> correctly — as
	well as overall accuracy.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> re
	<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> pipeline
	<span class="hljs-keyword">from</span> reasoning_gym_env.client <span class="hljs-keyword">import</span> ReasoningGymEnv
	<span class="hljs-keyword">from</span> reasoning_gym_env.models <span class="hljs-keyword">import</span> ReasoningGymAction


	<span class="hljs-keyword">async</span> <span class="hljs-keyword">def</span> <span class="hljs-title function_">evaluate_model</span>(<span class="hljs-params">model_name, n_eval=<span class="hljs-number">50</span>, seed=<span class="hljs-number">999</span></span>):
	gen = pipeline(
	<span class="hljs-string">"text-generation"</span>,
	model=model_name,
	tokenizer=model_name,
	device_map=<span class="hljs-string">"auto"</span>,
	dtype=<span class="hljs-string">"auto"</span>,
	)
	gen.model.generation_config.max_length = <span class="hljs-literal">None</span>
	tok = AutoTokenizer.from_pretrained(model_name)
	env = ReasoningGymEnv(base_url=<span class="hljs-string">"https://sergiopaniego-reasoning-gym.hf.space"</span>)

	obs = <span class="hljs-keyword">await</span> env.reset(
	dataset_name=<span class="hljs-string">"chain_sum"</span>,
	dataset_config=DATASET_CONFIG,
	seed=seed,
	size=n_eval,
	)

	rewards, format_hits = [], <span class="hljs-number">0</span>

	<span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(n_eval):
	<span class="hljs-keyword">if</span> i > <span class="hljs-number">0</span>:
	obs = <span class="hljs-keyword">await</span> env.reset()

	question = obs.observation.question
	messages = [
	{<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: SYSTEM_PROMPT},
	{<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: question},
	]
	prompt = tok.apply_chat_template(
	messages, tokenize=<span class="hljs-literal">False</span>, add_generation_prompt=<span class="hljs-literal">True</span>
	)
	completion = gen(prompt, max_new_tokens=<span class="hljs-number">64</span>)[<span class="hljs-number">0</span>][<span class="hljs-string">"generated_text"</span>][<span class="hljs-built_in">len</span>(prompt):]

	m = re.search(<span class="hljs-string">r'"answer"\s*:\s*"?(\d+)"?'</span>, completion)
	<span class="hljs-keyword">if</span> m:
	format_hits += <span class="hljs-number">1</span>
	answer = m.group(<span class="hljs-number">1</span>)
	<span class="hljs-keyword">else</span>:
	nums = re.findall(<span class="hljs-string">r"\b(\d+)\b"</span>, completion)
	answer = nums[-<span class="hljs-number">1</span>] <span class="hljs-keyword">if</span> nums <span class="hljs-keyword">else</span> <span class="hljs-string">"0"</span>

	result = <span class="hljs-keyword">await</span> env.step(ReasoningGymAction(answer=answer))
	rewards.append(<span class="hljs-built_in">float</span>(result.observation.score <span class="hljs-keyword">or</span> <span class="hljs-number">0.0</span>))

	<span class="hljs-keyword">await</span> env.close()
	<span class="hljs-keyword">del</span> gen <span class="hljs-comment"># free GPU memory before loading the next model</span>

	<span class="hljs-keyword">return</span> {
	<span class="hljs-string">"accuracy"</span>: <span class="hljs-built_in">sum</span>(rewards) / <span class="hljs-built_in">len</span>(rewards),
	<span class="hljs-string">"format_compliance"</span>: format_hits / n_eval,
	}


	base_metrics = <span class="hljs-keyword">await</span> evaluate_model(<span class="hljs-string">"Qwen/Qwen3-1.7B"</span>)
	sft_metrics = <span class="hljs-keyword">await</span> evaluate_model(<span class="hljs-string">f"<span class="hljs-subst">{YOUR_HF_USERNAME}</span>/reasoning-gym-chain-sum-Qwen3-1.7B-sft"</span>)

	<span class="hljs-built_in">print</span>(<span class="hljs-string">f"\n<span class="hljs-subst">{<span class="hljs-string">'Metric'</span>:<<span class="hljs-number">25</span>}</span> <span class="hljs-subst">{<span class="hljs-string">'Base model'</span>:><span class="hljs-number">12</span>}</span> <span class="hljs-subst">{<span class="hljs-string">'After SFT'</span>:><span class="hljs-number">12</span>}</span> <span class="hljs-subst">{<span class="hljs-string">'Delta'</span>:><span class="hljs-number">10</span>}</span>"</span>)
	<span class="hljs-built_in">print</span>(<span class="hljs-string">"-"</span> * <span class="hljs-number">62</span>)
	<span class="hljs-keyword">for</span> key, label <span class="hljs-keyword">in</span> [(<span class="hljs-string">"format_compliance"</span>, <span class="hljs-string">"Format compliance"</span>), (<span class="hljs-string">"accuracy"</span>, <span class="hljs-string">"Accuracy"</span>)]:
	b, s = base_metrics[key], sft_metrics[key]
	<span class="hljs-built_in">print</span>(<span class="hljs-string">f"<span class="hljs-subst">{label:<<span class="hljs-number">25</span>}</span> <span class="hljs-subst">{b:><span class="hljs-number">12.1</span>%}</span> <span class="hljs-subst">{s:><span class="hljs-number">12.1</span>%}</span> <span class="hljs-subst">{(s - b) * <span class="hljs-number">100</span>:>+<span class="hljs-number">9.1</span>f}</span> pp"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1udb8dr">A successful warm-up looks like this:</p> <table data-svelte-h="svelte-1oxtqlv"><thead><tr><th>Metric</th> <th>Base model</th> <th>After SFT</th> <th>Delta</th></tr></thead> <tbody><tr><td>Format compliance</td> <td>~0%</td> <td>~68%</td> <td>+68 pp</td></tr> <tr><td>Accuracy</td> <td>~4%</td> <td>~68%</td> <td>+64 pp</td></tr></tbody></table> <p data-svelte-h="svelte-f1s8c3">Format compliance should jump sharply from near-zero — that’s the primary goal. <code>Qwen3-1.7B</code> produces
	essentially no valid <code>&lt;tool_call&gt;</code> blocks out of the box. After SFT on ~270 examples, the model reliably
	uses the format, and accuracy rises in lockstep because correct format is a prerequisite for the
	environment’s scorer to award any credit.</p> <hr> <h2 class="relative group"><a id="11-where-to-go-next-grpo" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#11-where-to-go-next-grpo"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>11. Where to go next: GRPO</span></h2> <p data-svelte-h="svelte-6b2drq">The SFT checkpoint is ready to use as the starting model for GRPO. In the
	<a href="end-to-end-walkthrough">end-to-end walkthrough</a>,
	change one line in section 8:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-comment"># Before (cold-start from the base model):</span>
	MODEL_NAME = <span class="hljs-string">"Qwen/Qwen3-1.7B"</span>

	<span class="hljs-comment"># After (warm-start from your SFT checkpoint):</span>
	MODEL_NAME = <span class="hljs-string">f"<span class="hljs-subst">{YOUR_HF_USERNAME}</span>/reasoning-gym-chain-sum-Qwen3-1.7B-sft"</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-18tauj5">With format compliance already well above zero (about 68% in the run above), GRPO’s <code>reward_std</code> will be non-zero from the very first
	batch and the reward curve will climb immediately — no cold-start stall.</p> <p data-svelte-h="svelte-r3ulln"><strong>Other directions:</strong></p> <ul data-svelte-h="svelte-iyl68d"><li><strong>Harder tasks.</strong> Increase <code>max_terms</code> or <code>max_digits</code> in <code>DATASET_CONFIG</code> and collect a new SFT set.
	Once the student handles easier examples reliably, a harder GRPO phase can push performance further.</li> <li><strong>Different environments.</strong> The same pipeline — collect with a teacher → filter → SFT → GRPO — applies to
	any OpenEnv environment. Replace <code>reasoning_gym_env</code> with your environment and the <code>answer</code> tool definition with
	that environment’s tool surface.</li> <li><strong>Larger teacher.</strong> <code>gpt-5</code> or <code>claude-opus-4</code> as the teacher will yield higher-quality examples,
	especially for tasks where <code>gpt-5-mini</code> struggles.</li></ul> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/openenv/blob/main/docs/source/tutorials/sft-warmup.md" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>

	<script>
	{
		// Path/env settings published on the global object (assignment without a
		// declaration); presumably read by the SvelteKit client runtime — confirm
		// against the start.js entry module.
		__sveltekit_1qwoa43 = {
			assets: "/docs/openenv/pr_749/en",
			base: "/docs/openenv/pr_749/en",
			env: {}
		};

		// Read synchronously: document.currentScript is only set while this
		// script is executing, so it cannot be looked up inside the promise
		// callback below.
		const mountTarget = document.currentScript.parentElement;

		// Options handed to kit.start; `data` carries one null entry per node id.
		const startOptions = {
			node_ids: [0, 65],
			data: [null, null],
			form: null,
			error: null
		};

		// Both entry modules are requested in parallel.
		const entryModules = [
			import("/docs/openenv/pr_749/en/_app/immutable/entry/start.85477f45.js"),
			import("/docs/openenv/pr_749/en/_app/immutable/entry/app.51835dc5.js")
		];

		Promise.all(entryModules).then((loaded) => {
			const kit = loaded[0];
			const app = loaded[1];
			kit.start(app, mountTarget, startOptions);
		});
	}
	</script>

Xet Storage Details

Size: 70.1 kB
Xet hash: 780a59e2c9544dff0eff2652362cdccf60536655adcb0a5eb4b0879e36f6904e

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.