Buckets:

hf-doc-build/doc-dev / trl /pr_5607 /en /openenv.html
download
raw
114 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;OpenEnv Integration for Training LLMs with Environments&quot;,&quot;local&quot;:&quot;openenv-integration-for-training-llms-with-environments&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;When to use environments&quot;,&quot;local&quot;:&quot;when-to-use-environments&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Installation&quot;,&quot;local&quot;:&quot;installation&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Quick start&quot;,&quot;local&quot;:&quot;quick-start&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;How environment_factory works&quot;,&quot;local&quot;:&quot;how-environmentfactory-works&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Environment class requirements&quot;,&quot;local&quot;:&quot;environment-class-requirements&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Tips for environment classes&quot;,&quot;local&quot;:&quot;tips-for-environment-classes&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Reward functions&quot;,&quot;local&quot;:&quot;reward-functions&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Tips for reward functions&quot;,&quot;local&quot;:&quot;tips-for-reward-functions&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;max_completion_length in multi-turn episodes&quot;,&quot;local&quot;:&quot;maxcompletionlength-in-multi-turn-episodes&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Advanced example: Wordle&quot;,&quot;local&quot;:&quot;advanced-example-wordle&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;The TextArena Environment&quot;,&quot;local&quot;:&quot;the-textarena-environment&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Why 
Wordle?&quot;,&quot;local&quot;:&quot;why-wordle&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Environment class&quot;,&quot;local&quot;:&quot;environment-class&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Reward function and training&quot;,&quot;local&quot;:&quot;reward-function-and-training&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Running the example&quot;,&quot;local&quot;:&quot;running-the-example&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Results&quot;,&quot;local&quot;:&quot;results&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Multi-environment training&quot;,&quot;local&quot;:&quot;multi-environment-training&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;How it works&quot;,&quot;local&quot;:&quot;how-it-works&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Example: Wordle + Catch&quot;,&quot;local&quot;:&quot;example-wordle--catch&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Per-environment reward functions&quot;,&quot;local&quot;:&quot;per-environment-reward-functions&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Dataset with environment routing&quot;,&quot;local&quot;:&quot;dataset-with-environment-routing&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Running the multi-environment example&quot;,&quot;local&quot;:&quot;running-the-multi-environment-example&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Running the environments&quot;,&quot;local&quot;:&quot;running-the-environments&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Environments 
catalog&quot;,&quot;local&quot;:&quot;environments-catalog&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Server concurrency&quot;,&quot;local&quot;:&quot;server-concurrency&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;environment_factory vs rollout_func&quot;,&quot;local&quot;:&quot;environmentfactory-vs-rolloutfunc&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Migrating from rollout_func to environment_factory&quot;,&quot;local&quot;:&quot;migrating-from-rolloutfunc-to-environmentfactory&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/trl/pr_5607/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/trl/pr_5607/en/_app/immutable/entry/start.151d81bd.js">
<link rel="modulepreload" href="/docs/trl/pr_5607/en/_app/immutable/chunks/scheduler.7b731bd4.js">
<link rel="modulepreload" href="/docs/trl/pr_5607/en/_app/immutable/chunks/singletons.2cf51804.js">
<link rel="modulepreload" href="/docs/trl/pr_5607/en/_app/immutable/chunks/index.ac28c20f.js">
<link rel="modulepreload" href="/docs/trl/pr_5607/en/_app/immutable/chunks/paths.ba01f37d.js">
<link rel="modulepreload" href="/docs/trl/pr_5607/en/_app/immutable/entry/app.3d9a91c0.js">
<link rel="modulepreload" href="/docs/trl/pr_5607/en/_app/immutable/chunks/preload-helper.e1689b3a.js">
<link rel="modulepreload" href="/docs/trl/pr_5607/en/_app/immutable/chunks/index.cc268345.js">
<link rel="modulepreload" href="/docs/trl/pr_5607/en/_app/immutable/nodes/0.cd288160.js">
<link rel="modulepreload" href="/docs/trl/pr_5607/en/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/trl/pr_5607/en/_app/immutable/nodes/37.76e95f0d.js">
<link rel="modulepreload" href="/docs/trl/pr_5607/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.f0d99f98.js">
<link rel="modulepreload" href="/docs/trl/pr_5607/en/_app/immutable/chunks/CodeBlock.169a125f.js">
<link rel="modulepreload" href="/docs/trl/pr_5607/en/_app/immutable/chunks/HfOption.9f04abd1.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;OpenEnv Integration for Training LLMs with Environments&quot;,&quot;local&quot;:&quot;openenv-integration-for-training-llms-with-environments&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;When to use environments&quot;,&quot;local&quot;:&quot;when-to-use-environments&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Installation&quot;,&quot;local&quot;:&quot;installation&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Quick start&quot;,&quot;local&quot;:&quot;quick-start&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;How environment_factory works&quot;,&quot;local&quot;:&quot;how-environmentfactory-works&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Environment class requirements&quot;,&quot;local&quot;:&quot;environment-class-requirements&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Tips for environment classes&quot;,&quot;local&quot;:&quot;tips-for-environment-classes&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Reward functions&quot;,&quot;local&quot;:&quot;reward-functions&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Tips for reward functions&quot;,&quot;local&quot;:&quot;tips-for-reward-functions&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;max_completion_length in multi-turn episodes&quot;,&quot;local&quot;:&quot;maxcompletionlength-in-multi-turn-episodes&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Advanced example: Wordle&quot;,&quot;local&quot;:&quot;advanced-example-wordle&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;The TextArena 
Environment&quot;,&quot;local&quot;:&quot;the-textarena-environment&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Why Wordle?&quot;,&quot;local&quot;:&quot;why-wordle&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Environment class&quot;,&quot;local&quot;:&quot;environment-class&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Reward function and training&quot;,&quot;local&quot;:&quot;reward-function-and-training&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Running the example&quot;,&quot;local&quot;:&quot;running-the-example&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Results&quot;,&quot;local&quot;:&quot;results&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Multi-environment training&quot;,&quot;local&quot;:&quot;multi-environment-training&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;How it works&quot;,&quot;local&quot;:&quot;how-it-works&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Example: Wordle + Catch&quot;,&quot;local&quot;:&quot;example-wordle--catch&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Per-environment reward functions&quot;,&quot;local&quot;:&quot;per-environment-reward-functions&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Dataset with environment routing&quot;,&quot;local&quot;:&quot;dataset-with-environment-routing&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Running the multi-environment example&quot;,&quot;local&quot;:&quot;running-the-multi-environment-example&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Running the 
environments&quot;,&quot;local&quot;:&quot;running-the-environments&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Environments catalog&quot;,&quot;local&quot;:&quot;environments-catalog&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Server concurrency&quot;,&quot;local&quot;:&quot;server-concurrency&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;environment_factory vs rollout_func&quot;,&quot;local&quot;:&quot;environmentfactory-vs-rolloutfunc&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Migrating from rollout_func to environment_factory&quot;,&quot;local&quot;:&quot;migrating-from-rolloutfunc-to-environmentfactory&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" 
height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="openenv-integration-for-training-llms-with-environments" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#openenv-integration-for-training-llms-with-environments"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>OpenEnv Integration for Training LLMs with Environments</span></h1> <p data-svelte-h="svelte-a7v5qb"><a 
href="https://github.com/meta-pytorch/OpenEnv" rel="nofollow">OpenEnv</a> is an open-source framework for defining, deploying, and interacting with environments in reinforcement learning (RL) and agentic workflows. It provides standardized APIs for environment interaction and supports running environments as backend servers (via WebSocket or containerised execution). You can find a collection of ready-to-use OpenEnv environments on the <a href="https://huggingface.co/collections/openenv/openenv-environment-hub" rel="nofollow">Hugging Face Hub</a>.</p> <p data-svelte-h="svelte-1po34wr">This guide covers <strong>how to integrate OpenEnv with TRL</strong>. For more on OpenEnv itself, see the <a href="https://meta-pytorch.org/OpenEnv/" rel="nofollow">OpenEnv docs</a>.</p> <blockquote class="note" data-svelte-h="svelte-1q6q058"><p>You can explore ready-to-use example <a href="example_overview#openenv-scripts">scripts</a> and <a href="example_overview#openenv-notebooks">notebooks</a> in the Examples Overview.</p></blockquote> <h2 class="relative group"><a id="when-to-use-environments" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#when-to-use-environments"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 
56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>When to use environments</span></h2> <p data-svelte-h="svelte-vatuy1"><a href="/docs/trl/pr_5607/en/gspo_token#trl.GRPOTrainer">GRPOTrainer</a> can be used to train agents. For agentic tasks, it supports two modes: <strong>tools</strong>, where the model can call external functions but each call is stateless and independent, and <strong>environments</strong>, which maintain state across turns, enabling genuine multi-turn interaction where the agent’s actions shape future observations. Use environments when continuity matters — for example, navigating a game, browsing a web page, or any task where what the agent sees next depends on what it did before.</p> <h2 class="relative group"><a id="installation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#installation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Installation</span></h2> <p data-svelte-h="svelte-1q8g6r4">OpenEnv environments are hosted as Hugging Face Spaces, which are also pip-installable Git repositories:</p> <div class="code-block relative "><div class="absolute top-2.5 
right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Echo environment</span>
pip install <span class="hljs-string">&quot;openenv-echo-env @ git+https://huggingface.co/spaces/openenv/echo_env&quot;</span>
<span class="hljs-comment"># Wordle (TextArena) environment</span>
pip install <span class="hljs-string">&quot;openenv-textarena @ git+https://huggingface.co/spaces/openenv/wordle&quot;</span>
<span class="hljs-comment"># Catch (OpenSpiel) environment</span>
pip install <span class="hljs-string">&quot;openenv-openspiel-env @ git+https://huggingface.co/spaces/openenv/openspiel_env&quot;</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-xp1fo6">This installs the <strong>environment client</strong> (e.g., <code>EchoEnv</code>) that communicates with the remote environment server via WebSocket, along with the action/observation models and all required dependencies (including <code>openenv-core</code>).</p> <blockquote class="tip" data-svelte-h="svelte-19wubx3"><p>You can find the install command for any environment on its HF Space page. Click the <strong>⋮ (three dots)</strong> menu and select <strong>“Use this Space”</strong> to see the install instructions.</p></blockquote> <blockquote class="tip" data-svelte-h="svelte-4ibm3b"><p>You can also install the core package from PyPI with <code>pip install &quot;openenv-core[core]&gt;=0.2.1&quot;</code>, but note that environment-specific dependencies may need to be installed separately.</p></blockquote> <p data-svelte-h="svelte-1dm1laa">For development, you can clone the OpenEnv repo and install locally:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded 
font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->git <span class="hljs-built_in">clone</span> https://github.com/meta-pytorch/OpenEnv.git
<span class="hljs-built_in">cd</span> OpenEnv/envs/echo_env
pip install -e .<!-- HTML_TAG_END --></pre></div> <blockquote class="note"><p data-svelte-h="svelte-1nxaurk">Each environment script in TRL includes inline dependency metadata (PEP 723) so you can also run them directly with <a href="https://docs.astral.sh/uv/" rel="nofollow">uv</a>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->uv run examples/scripts/openenv/echo.py<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-6wmct">This automatically installs the required environment package in an isolated virtual environment.</p></blockquote> <h2 class="relative group"><a id="quick-start" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#quick-start"><span><svg class="" 
xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Quick start</span></h2> <p data-svelte-h="svelte-a7hir6">The fastest way to understand the integration is a complete example. The <a href="https://github.com/huggingface/trl/blob/main/examples/scripts/openenv/echo.py" rel="nofollow">echo.py</a> script trains a model with the <a href="https://meta-pytorch.org/OpenEnv/environments/echo.html" rel="nofollow">Echo environment</a>, which rewards completions based on their text length:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity 
bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> Dataset
<span class="hljs-keyword">from</span> echo_env <span class="hljs-keyword">import</span> EchoEnv
<span class="hljs-keyword">from</span> echo_env.models <span class="hljs-keyword">import</span> EchoAction
<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> GRPOConfig, GRPOTrainer
ENV_URL = <span class="hljs-string">&quot;https://openenv-echo-env.hf.space&quot;</span>
<span class="hljs-keyword">class</span> <span class="hljs-title class_">EchoToolEnv</span>:
    <span class="hljs-keyword">def</span> <span class="hljs-title function_">__init__</span>(<span class="hljs-params">self</span>):
        self.env = EchoEnv(base_url=ENV_URL)
        self.reward = <span class="hljs-number">0.0</span>

    <span class="hljs-keyword">def</span> <span class="hljs-title function_">reset</span>(<span class="hljs-params">self, **kwargs</span>) -&gt; <span class="hljs-built_in">str</span> | <span class="hljs-literal">None</span>:
        self.reward = <span class="hljs-number">0.0</span>
        <span class="hljs-keyword">return</span> <span class="hljs-literal">None</span>

    <span class="hljs-keyword">def</span> <span class="hljs-title function_">echo</span>(<span class="hljs-params">self, message: <span class="hljs-built_in">str</span></span>) -&gt; <span class="hljs-built_in">str</span>:
        <span class="hljs-string">&quot;&quot;&quot;
        Echo the message back from the environment.

        Args:
            message: The message to echo

        Returns:
            The echoed message.
        &quot;&quot;&quot;</span>
        observation = self.env.step(EchoAction(message=message))
        self.reward = observation.observation.reward
        <span class="hljs-keyword">return</span> observation.observation.echoed_message

<span class="hljs-keyword">def</span> <span class="hljs-title function_">reward_func</span>(<span class="hljs-params">environments, **kwargs</span>):
    <span class="hljs-keyword">return</span> [env.reward <span class="hljs-keyword">for</span> env <span class="hljs-keyword">in</span> environments]

dataset = Dataset.from_dict(
    {<span class="hljs-string">&quot;prompt&quot;</span>: [[{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;Try to echo &#x27;Hello World!&#x27; in the environment.&quot;</span>}]] * <span class="hljs-number">64</span>}
)

trainer = GRPOTrainer(
    model=<span class="hljs-string">&quot;Qwen/Qwen3-0.6B&quot;</span>,
    train_dataset=dataset,
    reward_funcs=reward_func,
    args=GRPOConfig(
        chat_template_kwargs={<span class="hljs-string">&quot;enable_thinking&quot;</span>: <span class="hljs-literal">False</span>},
        log_completions=<span class="hljs-literal">True</span>,
    ),
    environment_factory=EchoToolEnv,
)
trainer.train()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-3t91z0">That’s it. Here’s what happens under the hood:</p> <ol data-svelte-h="svelte-198zcxp"><li><strong><code>environment_factory=EchoToolEnv</code></strong>: The trainer creates one <code>EchoToolEnv</code> instance per generation (pass the class, not an instance).</li> <li><strong><code>reset()</code></strong> is called at the start of each episode to initialize state. Returns an observation string (or <code>None</code>).</li> <li><strong>Tool discovery</strong>: The trainer discovers all public methods on the environment instance (here, <code>echo()</code>) and exposes them as function-calling tools. Each method must have a proper docstring with typed arguments, which the trainer uses to build the tool schema.</li> <li><strong>Multi-turn loop</strong>: The trainer generates a completion, parses tool calls, executes <code>echo()</code>, appends the result, and generates again, until the model stops calling tools or <code>max_completion_length</code> is reached.</li> <li><strong>Reward function</strong>: Reads <code>env.reward</code> from each environment instance after the episode (before the environment is reset).</li></ol> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute 
pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Run the example</span>
python examples/scripts/openenv/echo.py
<span class="hljs-comment"># Customize model and environment URL</span>
python examples/scripts/openenv/echo.py --model Qwen/Qwen3-0.6B --env-host https://openenv-echo-env.hf.space<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-klksjg">Below is the reward curve from training:</p> <iframe src="https://trl-lib-trackio.hf.space?project=openenv&amp;metrics=train/rewards/reward_from_env/mean&amp;runs=qgallouedec-1761202871&amp;sidebar=hidden&amp;navbar=hidden" title="Reward curve from training with the Echo environment" style="width:100%; max-width:800px; height:500px; border:0;"></iframe> <blockquote class="note" data-svelte-h="svelte-412tzz"><p>You can explore more ready-to-use example <a href="example_overview#openenv-scripts">scripts</a> and <a href="example_overview#openenv-notebooks">notebooks</a> in the Examples Overview.</p></blockquote> <h2 class="relative group"><a id="how-environmentfactory-works" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#how-environmentfactory-works"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>How environment_factory works</span></h2> <p data-svelte-h="svelte-9o4gb9">TRL’s <a href="/docs/trl/pr_5607/en/gspo_token#trl.GRPOTrainer">GRPOTrainer</a> supports interactive environment training through the 
<code>environment_factory</code> argument. When provided, the trainer automatically handles the multi-turn tool-calling loop: it generates completions, parses tool calls, executes them against the environment, and feeds the results back to the model. All without custom rollout code.</p> <h3 class="relative group"><a id="environment-class-requirements" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#environment-class-requirements"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Environment class requirements</span></h3> <p data-svelte-h="svelte-1eeptr2">Your environment class must follow these rules:</p> <ul data-svelte-h="svelte-6iobor"><li><strong><code>__init__(self)</code></strong> <em>(optional)</em>: If provided, must take no arguments. Use it to initialize state or clients. If you need external configuration (e.g., a URL), capture it from the enclosing scope or module-level variables.</li> <li><strong><code>reset(self, **kwargs)</code></strong>: Called at the start of each episode. Receives all dataset columns as keyword arguments. 
Return a string observation (or <code>None</code> for no initial observation).</li> <li><strong>Tool methods</strong>: Any public method (not starting with <code>_</code>) other than <code>reset</code> is automatically exposed as a tool. Each tool method must have a docstring with <code>Args:</code> descriptions, since the trainer uses these to generate the tool schema for the model.</li></ul> <h3 class="relative group"><a id="tips-for-environment-classes" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#tips-for-environment-classes"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Tips for environment classes</span></h3> <ul data-svelte-h="svelte-1jxsu4h"><li><strong>State for reward</strong>: You can store any state you want on the environment instance (e.g., <code>self.reward</code>, <code>self.done</code>, etc.) and access it in your reward function via the <code>environments</code> parameter. 
Refer to the <a href="#quick-start">Quick Start guide</a> for an example of this pattern.</li> <li><strong>Error handling</strong>: If a tool method raises an exception (e.g., <code>ValueError(&quot;Game over.&quot;)</code>), the trainer catches it and feeds the error message back to the model as a tool response. This is the recommended way to signal that an action is invalid or that the episode has ended.</li></ul> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->ENV_URL = <span class="hljs-string">&quot;https://my-env.hf.space&quot;</span>
<span class="hljs-keyword">class</span> <span class="hljs-title class_">MyEnv</span>:
<span class="hljs-keyword">def</span> <span class="hljs-title function_">__init__</span>(<span class="hljs-params">self</span>):
self.client = MyClient(base_url=ENV_URL) <span class="hljs-comment"># captured from enclosing scope</span>
self.reward = <span class="hljs-number">0.0</span>
<span class="hljs-keyword">def</span> <span class="hljs-title function_">reset</span>(<span class="hljs-params">self, **kwargs</span>) -&gt; <span class="hljs-built_in">str</span> | <span class="hljs-literal">None</span>:
self.reward = <span class="hljs-number">0.0</span>
<span class="hljs-keyword">return</span> <span class="hljs-string">&quot;Initial observation for the model&quot;</span>
<span class="hljs-keyword">def</span> <span class="hljs-title function_">my_tool</span>(<span class="hljs-params">self, arg1: <span class="hljs-built_in">str</span>, arg2: <span class="hljs-built_in">int</span></span>) -&gt; <span class="hljs-built_in">str</span>:
<span class="hljs-string">&quot;&quot;&quot;
Description of what this tool does.
Args:
arg1: Description of arg1
arg2: Description of arg2
Returns:
The result message.
&quot;&quot;&quot;</span>
self.reward = <span class="hljs-number">1.0</span>
<span class="hljs-keyword">return</span> <span class="hljs-string">&quot;Tool result&quot;</span><!-- HTML_TAG_END --></pre></div> <blockquote class="important" data-svelte-h="svelte-3x2szj"><p>Tools must be <strong>individual methods</strong> with descriptive names and typed arguments (e.g., <code>guess(word: str)</code>, <code>move(direction: str)</code>). We do not recommend using generic methods like <code>step(action)</code>, since the model needs meaningful tool names and argument descriptions to learn tool calling.</p></blockquote> <h3 class="relative group"><a id="reward-functions" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#reward-functions"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Reward functions</span></h3> <p data-svelte-h="svelte-1oy9i3i">Reward functions receive the <code>environments</code> parameter (a list of environment instances), so you can access any state stored during the episode:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 
ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">reward_func</span>(<span class="hljs-params">environments, **kwargs</span>) -&gt; <span class="hljs-built_in">list</span>[<span class="hljs-built_in">float</span>]:
<span class="hljs-keyword">return</span> [env.reward <span class="hljs-keyword">for</span> env <span class="hljs-keyword">in</span> environments]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-dj8ipq">For more information on reward functions, see the <a href="grpo_trainer#custom-reward-functions">GRPO - Custom Reward Functions</a>.</p> <h3 class="relative group"><a id="tips-for-reward-functions" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#tips-for-reward-functions"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Tips for reward functions</span></h3> <p data-svelte-h="svelte-tc99c5">A few things we’ve found helpful when working with OpenEnv environments and GRPO:</p> <ul data-svelte-h="svelte-13t89bx"><li><strong>Simple rewards work well.</strong> In our experiments with Wordle and Sudoku, binary rewards (1.0 for success, 0.0 otherwise) gave cleaner training signals than shaped rewards with partial credit. 
GRPO compares completions within a group, so the relative ranking matters more than the absolute values.</li> <li><strong>Check the final state, not the path.</strong> When possible, let the environment judge the outcome (e.g., “did the model solve the puzzle?”) rather than checking if it followed a specific sequence of actions. This gives the model freedom to discover its own strategies.</li> <li><strong>Test your reward before training.</strong> Run a few episodes manually (see the <a href="https://github.com/huggingface/trl/blob/main/examples/notebooks/openenv_wordle_grpo.ipynb" rel="nofollow">Wordle example notebook</a>) to confirm the environment returns sensible rewards. If a capable model can’t score higher than a random baseline, the reward signal may need adjustment.</li></ul> <h3 class="relative group"><a id="maxcompletionlength-in-multi-turn-episodes" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#maxcompletionlength-in-multi-turn-episodes"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>max_completion_length in multi-turn episodes</span></h3> <p data-svelte-h="svelte-1wgkn3k">The 
<code>max_completion_length</code> parameter limits the <strong>total number of tokens across the entire multi-turn conversation</strong> (all model generations + tool results combined), not just a single generation. For environments with many turns (e.g., Sudoku with dozens of moves), you may need to increase it:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->args = GRPOConfig(
max_completion_length=<span class="hljs-number">4096</span>, <span class="hljs-comment"># default is usually 256-1024, increase for long episodes</span>
<span class="hljs-comment"># ...</span>
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-kpittw">If episodes are being cut short (model stops mid-game), this is likely the cause.</p> <h2 class="relative group"><a id="advanced-example-wordle" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#advanced-example-wordle"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Advanced example: Wordle</span></h2> <p data-svelte-h="svelte-1thz35g">Let’s train a model to play <a href="https://www.nytimes.com/games/wordle/index.html" rel="nofollow">Wordle</a> using the <a href="https://meta-pytorch.org/OpenEnv/environments/textarena.html" rel="nofollow"><code>TextArena</code></a> environment. 
This demonstrates multi-turn interaction, cumulative feedback handling, and episode termination via exceptions.</p> <blockquote class="note" data-svelte-h="svelte-3sgo7n"><p>You can explore the notebook version of this example in <a href="https://github.com/huggingface/trl/blob/main/examples/notebooks/openenv_wordle_grpo.ipynb" rel="nofollow">the OpenEnv Wordle GRPO example</a>.</p></blockquote> <h3 class="relative group"><a id="the-textarena-environment" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#the-textarena-environment"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>The TextArena Environment</span></h3> <p data-svelte-h="svelte-lx7z83"><a href="https://huggingface.co/papers/2504.11442" rel="nofollow">TextArena</a> is an open-source collection of competitive text-based games designed to evaluate reasoning skills in LLMs using textual games like Wordle, Snake, Tic-Tac-Toe, and more.</p> <p data-svelte-h="svelte-or2o6k"><img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/text_arena_evals.png" alt="image of TextArena"></p> <h3 class="relative group"><a id="why-wordle" class="header-link 
block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#why-wordle"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Why Wordle?</span></h3> <p data-svelte-h="svelte-aog2wv">Wordle is a good benchmark for environment-based RL because it requires reasoning about feedback, is purely text-based, and models from 1B parameters can improve at it. Each guess is only 8 tokens, making it lightweight to experiment with.</p> <blockquote class="note"><p data-svelte-h="svelte-1qk48oi">How does Wordle work?
Wordle is a word guessing game where the player has to guess a 5-letter word in 6 attempts. After each guess, the environment provides letter-by-letter feedback:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->G U E S S
<span class="hljs-keyword">X</span> G <span class="hljs-keyword">Y</span> <span class="hljs-keyword">X</span> <span class="hljs-keyword">X</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ia1ang">X = not in the word, G = correct position (green), Y = wrong position (yellow). Here, “U” is correct and in place, “E” is in the word but misplaced.</p></blockquote> <h3 class="relative group"><a id="environment-class" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#environment-class"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Environment class</span></h3> <p data-svelte-h="svelte-8a060r">The <code>WordleEnv</code> class wraps the TextArena client and exposes <code>guess()</code> as the tool:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" 
height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> textarena_env <span class="hljs-keyword">import</span> TextArenaAction, TextArenaEnv
<span class="hljs-keyword">class</span> <span class="hljs-title class_">WordleEnv</span>:
<span class="hljs-keyword">def</span> <span class="hljs-title function_">__init__</span>(<span class="hljs-params">self</span>):
self.client = TextArenaEnv(base_url=<span class="hljs-string">&quot;https://openenv-wordle.hf.space&quot;</span>)
<span class="hljs-keyword">def</span> <span class="hljs-title function_">reset</span>(<span class="hljs-params">self, **kwargs</span>) -&gt; <span class="hljs-built_in">str</span> | <span class="hljs-literal">None</span>:
result = self.client.reset()
self._last_full_feedback = result.observation.messages[<span class="hljs-number">0</span>].content
self.reward = <span class="hljs-number">0.0</span>
self.done = <span class="hljs-literal">False</span>
<span class="hljs-keyword">return</span> self._last_full_feedback
<span class="hljs-keyword">def</span> <span class="hljs-title function_">guess</span>(<span class="hljs-params">self, guess: <span class="hljs-built_in">str</span></span>) -&gt; <span class="hljs-built_in">str</span>:
<span class="hljs-string">&quot;&quot;&quot;
Make a guess in the Wordle environment.
Args:
guess: The guessed word, formatted as &#x27;[abcde]&#x27;
Returns:
The feedback message from the environment.
&quot;&quot;&quot;</span>
<span class="hljs-keyword">if</span> self.done:
<span class="hljs-keyword">raise</span> ValueError(<span class="hljs-string">&quot;Game over.&quot;</span>)
result = self.client.step(TextArenaAction(message=guess))
_full_feedback = result.observation.messages[<span class="hljs-number">0</span>].content
feedback = _full_feedback[<span class="hljs-built_in">len</span>(self._last_full_feedback):]
self._last_full_feedback = _full_feedback
<span class="hljs-keyword">if</span> <span class="hljs-string">&quot;You attempted an invalid move&quot;</span> <span class="hljs-keyword">in</span> feedback:
self.reward = <span class="hljs-number">0.0</span>
<span class="hljs-keyword">else</span>:
self.reward = result.reward
self.done = result.done
<span class="hljs-keyword">return</span> feedback<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-tm92gv">Key design choices:</p> <ul data-svelte-h="svelte-10uhdoz"><li><strong><code>reset()</code></strong> returns the initial game message as the first observation the model sees.</li> <li><strong><code>guess()</code></strong> is the only tool. The model calls it each turn with a 5-letter word.</li> <li><strong>Cumulative feedback slicing</strong>: TextArena returns the full game history each turn. We slice out only the new part to avoid repeating context.</li> <li><strong>Exception on done</strong>: If the model tries to guess after the game ends, <code>guess()</code> raises a <code>ValueError</code>. The trainer catches this and feeds <code>&quot;Game over.&quot;</code> back to the model as a tool response. The model learns to stop calling tools after this signal.</li></ul> <h3 class="relative group"><a id="reward-function-and-training" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#reward-function-and-training"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Reward function and training</span></h3> <div 
class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> Dataset
<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> GRPOConfig, GRPOTrainer
<span class="hljs-keyword">def</span> <span class="hljs-title function_">reward_func</span>(<span class="hljs-params">environments, **kwargs</span>) -&gt; <span class="hljs-built_in">list</span>[<span class="hljs-built_in">float</span>]:
<span class="hljs-keyword">return</span> [env.reward <span class="hljs-keyword">for</span> env <span class="hljs-keyword">in</span> environments]
prompt = <span class="hljs-string">&quot;&quot;&quot;You are an expert Wordle solver with deep knowledge of English vocabulary...
Use the tool `guess` to make a guess.&quot;&quot;&quot;</span>
dataset = Dataset.from_dict({<span class="hljs-string">&quot;prompt&quot;</span>: [[{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: prompt}]] * <span class="hljs-number">1000</span>})
trainer = GRPOTrainer(
model=<span class="hljs-string">&quot;Qwen/Qwen3-1.7B&quot;</span>,
reward_funcs=reward_func,
train_dataset=dataset,
args=GRPOConfig(
use_vllm=<span class="hljs-literal">True</span>,
vllm_mode=<span class="hljs-string">&quot;colocate&quot;</span>,
chat_template_kwargs={<span class="hljs-string">&quot;enable_thinking&quot;</span>: <span class="hljs-literal">False</span>},
max_completion_length=<span class="hljs-number">1024</span>,
num_generations=<span class="hljs-number">4</span>,
gradient_accumulation_steps=<span class="hljs-number">64</span>,
),
environment_factory=WordleEnv,
)
trainer.train()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1cje71z">The environment returns <code>1.0</code> if the model wins and <code>0.0</code> otherwise.</p> <h3 class="relative group"><a id="running-the-example" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#running-the-example"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Running the example</span></h3> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">colocate </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">server </div></div> <div class="language-select"><p data-svelte-h="svelte-vv39h7"><strong>Colocate mode (1 GPU, recommended)</strong></p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none 
transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->python examples/scripts/openenv/wordle.py --vllm-mode colocate<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-10q1sb9">This runs vLLM in the same process as training, requiring only a single GPU.</p> </div> <h3 class="relative group"><a id="results" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#results"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 
43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Results</span></h3> <p data-svelte-h="svelte-27ny5q">The model improves its performance by reducing repetitions and increasing correct guesses. However, Qwen3-1.7B with <code>enable_thinking=False</code> is not able to consistently win the game.</p> <iframe src="https://burtenshaw-wordle-grpo.hf.space?project=group-Qwen-Qwen3-17B&metrics=reward&runs=run-2025-10-26_09-39-49,run-2025-10-26_08-04-49&sidebar=hidden&navbar=hidden" title="Wordle GRPO training reward curves for Qwen3-1.7B" style="width:100%; max-width:800px; height:500px; border:0;"></iframe> <blockquote class="note" data-svelte-h="svelte-85uckb"><p>With <code>enable_thinking=False</code> (the default in these examples), small models like Qwen3-1.7B can learn to improve their guesses but should not be expected to consistently solve the game. 
For significantly better results, use larger models or enable thinking mode (<code>enable_thinking=True</code>), which allows the model to reason before making a guess at the cost of longer completions.</p></blockquote> <p data-svelte-h="svelte-1n60c7p">We experimented with larger models like <a href="https://huggingface.co/openai/gpt-oss-20b" rel="nofollow"><code>gpt-oss-20b</code></a> and found that it was able to consistently win the game, though this requires significantly more compute.</p> <h2 class="relative group"><a id="multi-environment-training" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#multi-environment-training"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Multi-environment training</span></h2> <p data-svelte-h="svelte-14xidyy">You can train a single model across multiple environments simultaneously. This is useful when you want a model to learn different skills in parallel. 
For example, the model can play Wordle (language reasoning) and Catch (spatial reasoning) in the same training run.</p> <p data-svelte-h="svelte-1r7d50s">The key idea is to create a <strong>meta-environment class</strong> that wraps multiple environments and routes each sample to the correct one using a dataset column.</p> <h3 class="relative group"><a id="how-it-works" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#how-it-works"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>How it works</span></h3> <ol data-svelte-h="svelte-161iujm"><li>Add an <code>&quot;env&quot;</code> column (or similar) to your dataset that identifies which environment each sample belongs to.</li> <li>In <code>reset(**kwargs)</code>, read <code>kwargs[&quot;env&quot;]</code> to select the active environment for that episode.</li> <li>Expose tools from all environments; the trainer discovers all public methods.</li> <li>Use separate reward functions per environment, returning <code>None</code> for samples that don’t belong to that environment. 
TRL handles <code>None</code> values with <code>nansum</code>/<code>nanmean</code>.</li></ol> <h3 class="relative group"><a id="example-wordle--catch" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#example-wordle--catch"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Example: Wordle + Catch</span></h3> <p data-svelte-h="svelte-1ymvjlr">The <a href="https://github.com/huggingface/trl/blob/main/examples/scripts/openenv/multi_env.py" rel="nofollow">multi_env.py</a> script trains on Wordle and Catch simultaneously:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" 
transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">class</span> <span class="hljs-title class_">MultiEnv</span>:
<span class="hljs-keyword">def</span> <span class="hljs-title function_">__init__</span>(<span class="hljs-params">self</span>):
self._wordle_client = <span class="hljs-literal">None</span>
self._catch_client = <span class="hljs-literal">None</span>
self.active = <span class="hljs-literal">None</span>
self.reward = <span class="hljs-number">0.0</span>
self.done = <span class="hljs-literal">False</span>
<span class="hljs-keyword">def</span> <span class="hljs-title function_">reset</span>(<span class="hljs-params">self, **kwargs</span>) -&gt; <span class="hljs-built_in">str</span> | <span class="hljs-literal">None</span>:
self.active = kwargs.get(<span class="hljs-string">&quot;env&quot;</span>, <span class="hljs-string">&quot;wordle&quot;</span>)
self.reward = <span class="hljs-number">0.0</span>
self.done = <span class="hljs-literal">False</span>
<span class="hljs-keyword">if</span> self.active == <span class="hljs-string">&quot;wordle&quot;</span>:
<span class="hljs-keyword">if</span> self._wordle_client <span class="hljs-keyword">is</span> <span class="hljs-keyword">not</span> <span class="hljs-literal">None</span>:
<span class="hljs-keyword">try</span>:
self._wordle_client.close()
<span class="hljs-keyword">except</span> Exception:
<span class="hljs-keyword">pass</span>
self._wordle_client = TextArenaEnv(base_url=WORDLE_URL)
result = self._wordle_client.reset()
self._last_full_feedback = result.observation.messages[<span class="hljs-number">0</span>].content
self.reward = <span class="hljs-number">0.0</span>
<span class="hljs-keyword">return</span> self._last_full_feedback
<span class="hljs-keyword">elif</span> self.active == <span class="hljs-string">&quot;catch&quot;</span>:
<span class="hljs-keyword">if</span> self._catch_client <span class="hljs-keyword">is</span> <span class="hljs-keyword">not</span> <span class="hljs-literal">None</span>:
<span class="hljs-keyword">try</span>:
self._catch_client.close()
<span class="hljs-keyword">except</span> Exception:
<span class="hljs-keyword">pass</span>
self._catch_client = OpenSpielEnv(base_url=CATCH_URL)
result = self._catch_client.reset()
self.done = result.observation.done
<span class="hljs-keyword">return</span> _format_catch_obs(result.observation.info_state)
<span class="hljs-comment"># Wordle tool</span>
<span class="hljs-keyword">def</span> <span class="hljs-title function_">guess</span>(<span class="hljs-params">self, guess: <span class="hljs-built_in">str</span></span>) -&gt; <span class="hljs-built_in">str</span>:
<span class="hljs-string">&quot;&quot;&quot;Make a guess in the Wordle environment. ...&quot;&quot;&quot;</span>
...
<span class="hljs-comment"># Catch tools</span>
<span class="hljs-keyword">def</span> <span class="hljs-title function_">move</span>(<span class="hljs-params">self, direction: <span class="hljs-built_in">str</span></span>) -&gt; <span class="hljs-built_in">str</span>:
<span class="hljs-string">&quot;&quot;&quot;Move the paddle left or right. ...&quot;&quot;&quot;</span>
...
<span class="hljs-keyword">def</span> <span class="hljs-title function_">stay</span>(<span class="hljs-params">self</span>) -&gt; <span class="hljs-built_in">str</span>:
<span class="hljs-string">&quot;&quot;&quot;Do nothing and let the ball fall one step. ...&quot;&quot;&quot;</span>
...<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1hyiz9o">Key patterns:</p> <ul data-svelte-h="svelte-qzf3u7"><li><strong>Lazy client initialization</strong>: Create clients in <code>reset()</code>, not <code>__init__()</code>, to avoid unnecessary WebSocket connections.</li> <li><strong>Close before reopen</strong>: Close the previous client before creating a new one to avoid server capacity errors.</li> <li><strong><code>kwargs</code> routing</strong>: The <code>&quot;env&quot;</code> column from the dataset is passed to <code>reset()</code> as a keyword argument.</li> <li><strong>All tools are exposed simultaneously</strong>: The model sees <code>guess</code>, <code>move</code>, and <code>stay</code> as available tools regardless of the active environment. If it calls the wrong tool (e.g., <code>move</code> during Wordle), the method raises a <code>ValueError</code> that the trainer catches gracefully. In practice, models learn to use the correct tools based on the system prompt.</li></ul> <h3 class="relative group"><a id="per-environment-reward-functions" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#per-environment-reward-functions"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 
11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Per-environment reward functions</span></h3> <p data-svelte-h="svelte-1eyyu5m">Each reward function returns <code>None</code> for samples from other environments:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">wordle_reward</span>(<span class="hljs-params">environments, **kwargs</span>) -&gt; <span class="hljs-built_in">list</span>[<span class="hljs-built_in">float</span> | <span class="hljs-literal">None</span>]:
<span class="hljs-keyword">return</span> [env.reward <span class="hljs-keyword">if</span> env.active == <span class="hljs-string">&quot;wordle&quot;</span> <span class="hljs-keyword">else</span> <span class="hljs-literal">None</span> <span class="hljs-keyword">for</span> env <span class="hljs-keyword">in</span> environments]
<span class="hljs-keyword">def</span> <span class="hljs-title function_">catch_reward</span>(<span class="hljs-params">environments, **kwargs</span>) -&gt; <span class="hljs-built_in">list</span>[<span class="hljs-built_in">float</span> | <span class="hljs-literal">None</span>]:
rewards = []
<span class="hljs-keyword">for</span> env <span class="hljs-keyword">in</span> environments:
<span class="hljs-keyword">if</span> env.active != <span class="hljs-string">&quot;catch&quot;</span>:
rewards.append(<span class="hljs-literal">None</span>)
<span class="hljs-keyword">elif</span> env.done:
rewards.append(<span class="hljs-built_in">max</span>(env.reward, <span class="hljs-number">0.0</span>))
<span class="hljs-keyword">else</span>:
rewards.append(<span class="hljs-number">0.0</span>)
<span class="hljs-keyword">return</span> rewards<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-skztfb">TRL converts <code>None</code> to <code>nan</code> internally and uses <code>nansum</code>/<code>nanmean</code> for aggregation, so each sample is only scored by its relevant reward function.</p> <h3 class="relative group"><a id="dataset-with-environment-routing" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#dataset-with-environment-routing"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Dataset with environment routing</span></h3> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" 
transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->n = <span class="hljs-number">500</span>
dataset = Dataset.from_dict({
<span class="hljs-string">&quot;prompt&quot;</span>: (
[[{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: wordle_prompt}]] * n
+ [[{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: catch_prompt}]] * n
),
<span class="hljs-string">&quot;env&quot;</span>: [<span class="hljs-string">&quot;wordle&quot;</span>] * n + [<span class="hljs-string">&quot;catch&quot;</span>] * n,
})<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="running-the-multi-environment-example" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#running-the-multi-environment-example"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Running the multi-environment example</span></h3> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 
leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->python examples/scripts/openenv/multi_env.py \
--wordle-url https://openenv-wordle.hf.space \
--catch-url https://openenv-openspiel-env.hf.space \
--vllm-mode colocate \
--gradient-accumulation-steps 4 \
--num-generations 8<!-- HTML_TAG_END --></pre></div> <blockquote class="tip" data-svelte-h="svelte-1liux33"><p>When training across multiple environments, monitor the per-reward-function metrics (<code>train/reward_func_0</code>, <code>train/reward_func_1</code>, etc.) rather than the combined <code>train/reward</code>. The combined metric alternates between environments and can appear noisy.</p></blockquote> <h2 class="relative group"><a id="running-the-environments" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#running-the-environments"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Running the environments</span></h2> <p data-svelte-h="svelte-3otk4f">When using <code>environment_factory</code>, the trainer connects to the environment server automatically. You just need the server to be running. 
There are three ways to run an OpenEnv environment server:</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">space </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">docker </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">local </div></div> <div class="language-select"><p data-svelte-h="svelte-1d83umq"><strong>Connect to a remote Hugging Face Space</strong> <em>(simplest)</em></p> <p data-svelte-h="svelte-1fk7qft">Most example scripts default to a hosted Space (no setup needed):</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 
border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->env = EchoEnv(base_url=<span class="hljs-string">&quot;https://openenv-echo-env.hf.space&quot;</span>)<!-- HTML_TAG_END --></pre></div> <blockquote class="warning" data-svelte-h="svelte-fwc1d1"><p>For training, <strong>duplicate the Space to your own account</strong> to avoid concurrency issues. The trainer opens N simultaneous WebSocket connections (one per generation), and shared Spaces may not support this. See <a href="#server-concurrency">Server concurrency</a> for details.</p></blockquote> </div> <h2 class="relative group"><a id="environments-catalog" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#environments-catalog"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Environments catalog</span></h2> <p data-svelte-h="svelte-1tacsw4">The best way to explore the current catalog of maintained environments is by visiting the official OpenEnv <a href="https://huggingface.co/collections/openenv/environment-hub" rel="nofollow">catalog</a>.</p> <p 
data-svelte-h="svelte-d6ph58">To create your own environment, check out the guide on <a href="https://meta-pytorch.org/OpenEnv/auto_getting_started/plot_03_building_environments.html" rel="nofollow">Building Your Own Environment with OpenEnv</a>. Environments are tightly integrated with the Hub, so you can push new environments for the community to reuse.</p> <h2 class="relative group"><a id="server-concurrency" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#server-concurrency"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Server concurrency</span></h2> <p data-svelte-h="svelte-149tjmd">When using <code>environment_factory</code>, the trainer creates N environment instances (one per generation), each opening a WebSocket connection to the server. 
By default, OpenEnv servers allow only 1 concurrent session, which will cause failures during training.</p> <p data-svelte-h="svelte-o3uox3">To support parallel training, configure the server for concurrency:</p> <ol data-svelte-h="svelte-133135u"><li>In your environment file, declare concurrent session support:</li></ol> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->SUPPORTS_CONCURRENT_SESSIONS: <span class="hljs-built_in">bool</span> = <span class="hljs-literal">True</span><!-- HTML_TAG_END --></pre></div> <ol start="2" data-svelte-h="svelte-obzobp"><li>In your server app, set the concurrency limit:</li></ol> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out 
opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->app = create_app(
    create_my_environment,
    MyAction,
    MyObservation,
    max_concurrent_envs=<span class="hljs-number">64</span>, <span class="hljs-comment"># match or exceed generation_batch_size</span>
)<!-- HTML_TAG_END --></pre></div> <blockquote class="tip" data-svelte-h="svelte-jkb8xm"><p><code>max_concurrent_envs</code> should be ≥ <code>generation_batch_size</code> (which defaults to <code>per_device_train_batch_size × num_processes × gradient_accumulation_steps</code>). For example, on a single process with <code>gradient_accumulation_steps=64</code> and <code>per_device_train_batch_size=1</code>, you need at least 64 concurrent sessions.</p></blockquote> <h2 class="relative group"><a id="environmentfactory-vs-rolloutfunc" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#environmentfactory-vs-rolloutfunc"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>environment_factory vs rollout_func</span></h2> <p data-svelte-h="svelte-14trny9"><a href="/docs/trl/pr_5607/en/gspo_token#trl.GRPOTrainer">GRPOTrainer</a> supports two approaches for environment-based training:</p> <ul data-svelte-h="svelte-11815mq"><li><strong><code>environment_factory</code></strong> (recommended): You define an environment class with tool methods, and the trainer handles generation, tool-call parsing, and the multi-turn loop automatically. 
This is the approach used throughout this guide.</li> <li><strong><code>rollout_func</code></strong>: You write the entire generation and environment interaction loop yourself. This gives full control over how completions are produced, how tools are executed, and how rewards are computed.</li></ul> <p data-svelte-h="svelte-11z0mru">Use <code>rollout_func</code> when <code>environment_factory</code> doesn’t fit your use case. For example, <strong>external agent servers</strong> like <a href="nemo_gym">NeMo-Gym</a>, where an external server owns the generation loop and manages its own agent-environment interaction protocol.</p> <h3 class="relative group"><a id="migrating-from-rolloutfunc-to-environmentfactory" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#migrating-from-rolloutfunc-to-environmentfactory"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Migrating from rollout_func to environment_factory</span></h3> <p data-svelte-h="svelte-1yjg80d">If you have existing <code>rollout_func</code> code and want to migrate, here’s the mapping:</p> <table data-svelte-h="svelte-nwr12z"><thead><tr><th><code>rollout_func</code> 
pattern</th> <th><code>environment_factory</code> equivalent</th></tr></thead> <tbody><tr><td>Manual generation loop</td> <td>Handled automatically by the trainer</td></tr> <tr><td><code>generate_rollout_completions()</code></td> <td>Not needed, trainer generates internally</td></tr> <tr><td><code>env.step(Action(...))</code> in rollout</td> <td>Wrap in a tool method on the environment class</td></tr> <tr><td>Reward via <code>kwargs[&quot;env_reward&quot;]</code></td> <td>Reward via <code>environments</code> parameter</td></tr> <tr><td><code>env_mask</code> construction</td> <td>Automatic, trainer builds <code>tool_mask</code></td></tr> <tr><td>Token concatenation</td> <td>Automatic, trainer manages token sequences</td></tr></tbody></table> <p data-svelte-h="svelte-euz4a6"><strong>Before</strong> (<code>rollout_func</code>):</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> 
Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">rollout_func</span>(<span class="hljs-params">prompts, trainer</span>):
    outputs = generate_rollout_completions(trainer, prompts)
    env_rewards = []
    <span class="hljs-keyword">for</span> out <span class="hljs-keyword">in</span> outputs:
        text = tokenizer.decode(out[<span class="hljs-string">&quot;completion_ids&quot;</span>], skip_special_tokens=<span class="hljs-literal">True</span>)
        result = client.step(EchoAction(message=text))
        env_rewards.append(result.reward)
    <span class="hljs-keyword">return</span> {
        <span class="hljs-string">&quot;prompt_ids&quot;</span>: [out[<span class="hljs-string">&quot;prompt_ids&quot;</span>] <span class="hljs-keyword">for</span> out <span class="hljs-keyword">in</span> outputs],
        <span class="hljs-string">&quot;completion_ids&quot;</span>: [out[<span class="hljs-string">&quot;completion_ids&quot;</span>] <span class="hljs-keyword">for</span> out <span class="hljs-keyword">in</span> outputs],
        <span class="hljs-string">&quot;logprobs&quot;</span>: [out[<span class="hljs-string">&quot;logprobs&quot;</span>] <span class="hljs-keyword">for</span> out <span class="hljs-keyword">in</span> outputs],
        <span class="hljs-string">&quot;env_reward&quot;</span>: env_rewards,
    }
trainer = GRPOTrainer(..., rollout_func=rollout_func)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-91ci07"><strong>After</strong> (<code>environment_factory</code>):</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">class</span> <span class="hljs-title class_">EchoToolEnv</span>:
    <span class="hljs-keyword">def</span> <span class="hljs-title function_">__init__</span>(<span class="hljs-params">self</span>):
        self.env = EchoEnv(base_url=url)
        self.reward = <span class="hljs-number">0.0</span>

    <span class="hljs-keyword">def</span> <span class="hljs-title function_">reset</span>(<span class="hljs-params">self, **kwargs</span>) -&gt; <span class="hljs-built_in">str</span> | <span class="hljs-literal">None</span>:
        self.reward = <span class="hljs-number">0.0</span>
        <span class="hljs-keyword">return</span> <span class="hljs-literal">None</span>

    <span class="hljs-keyword">def</span> <span class="hljs-title function_">echo</span>(<span class="hljs-params">self, message: <span class="hljs-built_in">str</span></span>) -&gt; <span class="hljs-built_in">str</span>:
        <span class="hljs-string">&quot;&quot;&quot;Echo the message back.

        Args:
            message: The message to echo

        Returns:
            The echoed message.
        &quot;&quot;&quot;</span>
        result = self.env.step(EchoAction(message=message))
        self.reward = result.observation.reward
        <span class="hljs-keyword">return</span> result.observation.echoed_message


<span class="hljs-keyword">def</span> <span class="hljs-title function_">reward_func</span>(<span class="hljs-params">environments, **kwargs</span>):
    <span class="hljs-keyword">return</span> [env.reward <span class="hljs-keyword">for</span> env <span class="hljs-keyword">in</span> environments]

trainer = GRPOTrainer(..., environment_factory=EchoToolEnv, reward_funcs=reward_func)<!-- HTML_TAG_END --></pre></div> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/trl/blob/main/docs/source/openenv.md" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>
<script>
{
// Client bootstrap for this page. The bare block keeps the `const`
// bindings below (`element`, `data`) out of the global scope.

// Assigned without a declaration, so this lands on the global object.
// Presumably read by the SvelteKit runtime loaded below for resolving
// asset and base paths — confirm against the kit version in use.
__sveltekit_1hqaf25 = {
assets: "/docs/trl/pr_5607/en",
base: "/docs/trl/pr_5607/en",
env: {}
};
// Parent element of the <script> tag running this code. It must be read here,
// synchronously: `document.currentScript` is no longer set once the promise
// callback below runs.
const element = document.currentScript.parentElement;
// Two null entries, handed to `kit.start` as `data`.
// NOTE(review): looks like one slot per entry in `node_ids` — confirm.
const data = [null,null];
// Fetch both entry modules in parallel; nothing starts until both have loaded.
Promise.all([
import("/docs/trl/pr_5607/en/_app/immutable/entry/start.151d81bd.js"),
import("/docs/trl/pr_5607/en/_app/immutable/entry/app.3d9a91c0.js")
]).then(([kit, app]) => {
// Hand the app module, the target element and the initial state to the
// `start` export of the first module.
kit.start(app, element, {
node_ids: [0, 37],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
114 kB
·
Xet hash:
144ece142f038e730fc8b0c31c637c5962c6a836b33b381ba69faac2d84d2cbe

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.