Buckets:

hf-doc-build
/

doc-dev

Files

xet

hf-doc-build/doc-dev / trl /pr_5321 /en /openenv.html

rtrm

30 days ago

download

raw

88.2 kB

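	<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;OpenEnv Integration for Training LLMs with Environments&quot;,&quot;local&quot;:&quot;openenv-integration-for-training-llms-with-environments&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Installation&quot;,&quot;local&quot;:&quot;installation&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Using rollout_func with OpenEnv environments&quot;,&quot;local&quot;:&quot;using-rolloutfunc-with-openenv-environments&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Rollout Function Signature&quot;,&quot;local&quot;:&quot;rollout-function-signature&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Integration pattern&quot;,&quot;local&quot;:&quot;integration-pattern&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;vLLM Modes&quot;,&quot;local&quot;:&quot;vllm-modes&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Running the Environments&quot;,&quot;local&quot;:&quot;running-the-environments&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Environments Catalog&quot;,&quot;local&quot;:&quot;environments-catalog&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;A simple example&quot;,&quot;local&quot;:&quot;a-simple-example&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Running the Example&quot;,&quot;local&quot;:&quot;running-the-example&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Advanced Example&quot;,&quot;local&quot;:&quot;advanced-example&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;The TextArena Environment&quot;,&quot;local&quot;:&quot;the-textarena-environment&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Wordle&quot;,&quot;local&quot;:&quot;wordle&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Rollout Function&quot;,&quot;local&quot;:&quot;rollout-function&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Reward Functions&quot;,&quot;local&quot;:&quot;reward-functions&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Training the Model&quot;,&quot;local&quot;:&quot;training-the-model&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Running the Advanced Example&quot;,&quot;local&quot;:&quot;running-the-advanced-example&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Results&quot;,&quot;local&quot;:&quot;results&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2}],&quot;depth&quot;:1}">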
	<link href="/docs/trl/pr_5321/en/_app/immutable/assets/0.e3b0c442.css" rel="stylesheet">
	<link rel="modulepreload" href="/docs/trl/pr_5321/en/_app/immutable/entry/start.d05f7d8c.js">
	<link rel="modulepreload" href="/docs/trl/pr_5321/en/_app/immutable/chunks/scheduler.7b731bd4.js">
	<link rel="modulepreload" href="/docs/trl/pr_5321/en/_app/immutable/chunks/singletons.02d54274.js">
	<link rel="modulepreload" href="/docs/trl/pr_5321/en/_app/immutable/chunks/index.ac28c20f.js">
	<link rel="modulepreload" href="/docs/trl/pr_5321/en/_app/immutable/chunks/paths.4a101974.js">
	<link rel="modulepreload" href="/docs/trl/pr_5321/en/_app/immutable/entry/app.4dcaf2cd.js">
	<link rel="modulepreload" href="/docs/trl/pr_5321/en/_app/immutable/chunks/preload-helper.02dc9fd7.js">
	<link rel="modulepreload" href="/docs/trl/pr_5321/en/_app/immutable/chunks/index.cc268345.js">
	<link rel="modulepreload" href="/docs/trl/pr_5321/en/_app/immutable/nodes/0.ec46bb4c.js">
	<link rel="modulepreload" href="/docs/trl/pr_5321/en/_app/immutable/chunks/each.e59479a4.js">
	<link rel="modulepreload" href="/docs/trl/pr_5321/en/_app/immutable/nodes/37.12c5a7ee.js">
	<link rel="modulepreload" href="/docs/trl/pr_5321/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.6a2cd520.js">
	<link rel="modulepreload" href="/docs/trl/pr_5321/en/_app/immutable/chunks/CodeBlock.f01cadde.js">
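	<link rel="modulepreload" href="/docs/trl/pr_5321/en/_app/immutable/chunks/HfOption.9f04abd1.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;OpenEnv Integration for Training LLMs with Environments&quot;,&quot;local&quot;:&quot;openenv-integration-for-training-llms-with-environments&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Installation&quot;,&quot;local&quot;:&quot;installation&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Using rollout_func with OpenEnv environments&quot;,&quot;local&quot;:&quot;using-rolloutfunc-with-openenv-environments&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Rollout Function Signature&quot;,&quot;local&quot;:&quot;rollout-function-signature&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Integration pattern&quot;,&quot;local&quot;:&quot;integration-pattern&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;vLLM Modes&quot;,&quot;local&quot;:&quot;vllm-modes&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Running the Environments&quot;,&quot;local&quot;:&quot;running-the-environments&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Environments Catalog&quot;,&quot;local&quot;:&quot;environments-catalog&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;A simple example&quot;,&quot;local&quot;:&quot;a-simple-example&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Running the Example&quot;,&quot;local&quot;:&quot;running-the-example&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Advanced Example&quot;,&quot;local&quot;:&quot;advanced-example&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;The TextArena Environment&quot;,&quot;local&quot;:&quot;the-textarena-environment&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Wordle&quot;,&quot;local&quot;:&quot;wordle&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Rollout Function&quot;,&quot;local&quot;:&quot;rollout-function&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Reward Functions&quot;,&quot;local&quot;:&quot;reward-functions&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Training the Model&quot;,&quot;local&quot;:&quot;training-the-model&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Running the Advanced Example&quot;,&quot;local&quot;:&quot;running-the-advanced-example&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Results&quot;,&quot;local&quot;:&quot;results&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; 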
z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="openenv-integration-for-training-llms-with-environments" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" 
href="#openenv-integration-for-training-llms-with-environments"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>OpenEnv Integration for Training LLMs with Environments</span></h1> <p data-svelte-h="svelte-syhfjq"><a href="https://github.com/meta-pytorch/OpenEnv" rel="nofollow">OpenEnv</a> is an open-source framework from Meta’s PyTorch team for defining, deploying, and interacting with environments in reinforcement learning (RL) and agentic workflows. It offers <a href="https://gymnasium.farama.org" rel="nofollow">Gymnasium-style APIs</a> (e.g., <code>reset()</code> and <code>step()</code>) to interface with environments in a standard manner, and supports running these environments as backend servers (for example, via HTTP or containerised execution). 
You can find a collection of ready-to-use OpenEnv environments on the <a href="https://huggingface.co/collections/openenv/openenv-environment-hub" rel="nofollow">Hugging Face Hub</a>.</p> <p data-svelte-h="svelte-n1ld35">In this guide, we’ll focus on <strong>how to integrate OpenEnv with TRL</strong>, but feel free to explore the links above to dive deeper into OpenEnv itself.</p> <blockquote class="note" data-svelte-h="svelte-174dpxq"><p>You can explore ready-to-use example <a href="example_overview#scripts">scripts</a> and <a href="example_overview#notebooks">notebooks</a> in the Examples Overview.</p></blockquote> <blockquote class="note" data-svelte-h="svelte-3r47ey"><p>Explore the <a href="https://meta-pytorch.org/OpenEnv/" rel="nofollow">OpenEnv docs</a> for more details.</p></blockquote> <h2 class="relative group"><a id="installation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#installation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Installation</span></h2> <p data-svelte-h="svelte-17jl3bv">To use OpenEnv with TRL, install the environment package. 
You have two options:</p> <p data-svelte-h="svelte-1j4m7x3"><strong>Option A - Install from HF Space (recommended):</strong></p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pip install git+https://huggingface.co/spaces/openenv/echo_env<!-- HTML_TAG_END --></pre></div> <blockquote class="tip" data-svelte-h="svelte-4ibm3b"><p>You can also install the core package from PyPI with <code>pip install "openenv-core[core]>=0.2.1"</code>, but note that environment-specific dependencies may need to be installed separately.</p></blockquote> <p data-svelte-h="svelte-rx8b34"><strong>Option B - Clone OpenEnv repo (for development):</strong></p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition 
duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->git <span class="hljs-built_in">clone</span> https://github.com/meta-pytorch/OpenEnv.git
	<span class="hljs-built_in">cd</span> OpenEnv/envs/echo_env
	pip install -e .<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="using-rolloutfunc-with-openenv-environments" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#using-rolloutfunc-with-openenv-environments"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Using rollout_func with OpenEnv environments</span></h2> <p data-svelte-h="svelte-8r0xze">TRL’s <a href="/docs/trl/pr_5321/en/grpo_trainer#trl.GRPOTrainer">GRPOTrainer</a> supports <em>custom rollout logic</em> through the <code>rollout_func</code> argument. 
This lets you override the trainer’s default text-generation loop and directly interact with OpenEnv environments — for instance, to compute environment-driven rewards instead of relying solely on model-based signals.</p> <h3 class="relative group"><a id="rollout-function-signature" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#rollout-function-signature"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Rollout Function Signature</span></h3> <p data-svelte-h="svelte-xffp71">A rollout function must have the following signature:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" 
transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">rollout_func</span>(<span class="hljs-params">
	prompts: <span class="hljs-built_in">list</span>[<span class="hljs-built_in">str</span>],
	trainer: GRPOTrainer,
	</span>) -> <span class="hljs-built_in">dict</span>[<span class="hljs-built_in">str</span>, <span class="hljs-built_in">list</span>]:
	<span class="hljs-string">"""
	Custom rollout function for generation and reward computation.

	Args:
	prompts: List of prompts routed to the current process
	trainer: Active GRPOTrainer (gives access to tokenizer, config and helper utilities)

	Returns:
	Dictionary containing:
	- prompt_ids: List of token IDs for each prompt
	- completion_ids: List of token IDs for each completion
	- logprobs: List of log probabilities for each token
	- Any additional fields are forwarded to reward functions as kwargs
	"""</span>
	<span class="hljs-keyword">pass</span><!-- HTML_TAG_END --></pre></div> <blockquote class="note" data-svelte-h="svelte-dj7sqq"><p>Any extra fields in the returned dictionary (beyond the required three) are automatically forwarded to your reward functions. This makes it easy to propagate signals such as environment rewards or auxiliary metrics from the rollout step.</p></blockquote> <h3 class="relative group"><a id="integration-pattern" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#integration-pattern"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Integration pattern</span></h3> <p data-svelte-h="svelte-1r3xw2z">The typical pattern when combining OpenEnv with TRL looks like this:</p> <ol data-svelte-h="svelte-l0o4ik"><li>Start or connect to an OpenEnv environment (e.g., a Dockerized env or HTTP endpoint).</li> <li>Generate completions from your model — either via <code>trl.experimental.openenv.generate_rollout_completions</code> when using colocated vLLM, or by hitting your inference server when using vLLM in server mode.</li> <li>Step through the environment using each completion to compute rewards or metrics.</li> 
<li>Add environment results (e.g., <code>env_reward</code>) to the rollout result dict.</li> <li>Access those rewards inside your reward function via <code>**kwargs</code>.</li></ol> <p data-svelte-h="svelte-1o9klxq">By using OpenEnv in this loop, you can:</p> <ul data-svelte-h="svelte-1fcm4d"><li>Train with realistic or interactive feedback (not just static reward functions).</li> <li>Plug in custom simulators, web APIs, or evaluators as environments.</li> <li>Pass structured reward signals back into RL training seamlessly.</li></ul> <h3 class="relative group"><a id="vllm-modes" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#vllm-modes"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>vLLM Modes</span></h3> <p data-svelte-h="svelte-13r5jc4">TRL supports two vLLM execution modes for generation:</p> <ul data-svelte-h="svelte-3d4jyb"><li><strong><code>colocate</code> mode</strong> (default): vLLM runs in the same process as training. Requires 1 GPU. Use <code>trl.experimental.openenv.generate_rollout_completions</code> for generation.</li> <li><strong><code>server</code> mode</strong>: vLLM runs as a separate server process. 
Requires at least 2 GPUs (one for vLLM server, one for training), but is highly scalable:
	<ul><li>You can allocate multiple GPUs to the vLLM server for tensor parallelism (faster inference)</li> <li>You can run multiple training processes that share the same vLLM server</li> <li>You can use different GPU types for inference vs training (e.g., A100 for vLLM, H100 for training)</li> <li>The vLLM server can serve multiple experiments simultaneously</li> <li>Use <code>trl.experimental.openenv.generate_rollout_completions</code> which will communicate with the server via <code>vllm_server_base_url</code></li></ul></li></ul> <p data-svelte-h="svelte-1u68yll">Configure the mode via <code>GRPOConfig</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Colocate mode (1 GPU)</span>
	args = GRPOConfig(
	use_vllm=<span class="hljs-literal">True</span>,
	vllm_mode=<span class="hljs-string">"colocate"</span>,
	<span class="hljs-comment"># ... other args</span>
	)

	<span class="hljs-comment"># Server mode (2+ GPUs, scalable)</span>
	args = GRPOConfig(
	use_vllm=<span class="hljs-literal">True</span>,
	vllm_mode=<span class="hljs-string">"server"</span>,
	vllm_server_base_url=<span class="hljs-string">"http://localhost:8000"</span>,
	<span class="hljs-comment"># ... other args</span>
	)

	<span class="hljs-comment"># Example: Start vLLM server with multiple GPUs for tensor parallelism</span>
	<span class="hljs-comment"># CUDA_VISIBLE_DEVICES=0,1,2,3 trl vllm-serve --model Qwen/Qwen3-1.7B --tensor-parallel-size 4</span><!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="running-the-environments" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#running-the-environments"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Running the Environments</span></h2> <p data-svelte-h="svelte-pdwwc9">You can run OpenEnv environments in three different ways:</p> <ul data-svelte-h="svelte-c1s55y"><li>We can load the environment from the Hugging Face Hub and execute it as a Docker container.</li> <li>We can connect to a hosted environment running on the Hugging Face Hub.</li> <li>We can launch the environment directly using Uvicorn in Python.</li></ul> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">docker </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 
hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">space </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">local </div></div> <div class="language-select"><p data-svelte-h="svelte-1f0jw47"><strong>Load from Hugging Face Hub</strong> <em>(recommended)</em></p> <p data-svelte-h="svelte-1f72sys">We can use the <a href="https://meta-pytorch.org/OpenEnv/core/#core.http_env_client.HTTPEnvClient.from_hub" rel="nofollow"><code>from_hub</code></a> method to load the environment from the hub. This method will automatically start a Docker container for the environment on your local machine. <a href="https://huggingface.co/spaces/openenv/echo_env" rel="nofollow"><code>openenv/echo-env</code></a> is the repo_id of the space on the hub.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; 
border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->env = EchoEnv.from_hub(<span class="hljs-string">"openenv/echo-env"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-zoi90c">If you want to launch the environment manually, you can use the following command to pull and run the Docker container:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->docker run -d -p 8001:8000 --platform linux/amd64 registry.hf.space/openenv-echo-env:latest<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1fmjrcf">And then you can connect to the environment using the following code:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none 
transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->env = EchoEnv(base_url=<span class="hljs-string">"http://0.0.0.0:8001"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-9udx4t">Here, we map the ports from 8001 to 8000 to make space for a vLLM server, but you will need to manage the ports for your local machine.</p> <blockquote class="note" data-svelte-h="svelte-1nl9d8v"><p>You can find the Docker container for any space on the hub.</p> <ul><li>Open the space page on the hub.</li> <li>Click the <strong>⋮ (three dots)</strong> menu.</li> <li>Select <strong>“Run locally.”</strong></li> <li>Copy and execute the provided command in your terminal.</li></ul> <p><img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/open_env_launch_docker.png" alt="open_env_launch_docker"></p></blockquote> <blockquote class="note" data-svelte-h="svelte-1x4qexe"><p>You can also use the <strong>Docker option</strong> with <code>from_docker_image</code> by providing the image name.
	For more details, refer to the official <a href="https://meta-pytorch.org/OpenEnv/core/" rel="nofollow">OpenEnv documentation</a>.</p></blockquote> </div> <h2 class="relative group"><a id="environments-catalog" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#environments-catalog"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Environments Catalog</span></h2> <p data-svelte-h="svelte-hs5qjs">Environment development is active and evolving.
	The best way to explore the <strong>current catalog of maintained environments</strong> is by visiting the official OpenEnv <a href="https://huggingface.co/collections/openenv/environment-hub" rel="nofollow">catalog</a>.</p> <p data-svelte-h="svelte-auxaak">Custom environments are also supported. To learn how to create your own, check out the guide on <a href="https://meta-pytorch.org/OpenEnv/environment-builder/" rel="nofollow">Building Your Own Environment with OpenEnv</a>.</p> <p data-svelte-h="svelte-p0acna">Environments are tightly integrated with the Hub, allowing you to <strong>push new environments directly</strong> so the community can easily pull, reuse, and adapt them for their own use cases.</p> <h2 class="relative group"><a id="a-simple-example" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#a-simple-example"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>A simple example</span></h2> <blockquote class="note" data-svelte-h="svelte-3tlj1p"><p>You can explore more ready-to-use example scripts in the <a href="https://github.com/huggingface/trl/blob/main/examples/scripts/openenv/" 
rel="nofollow"><code>examples/scripts/openenv/</code></a> directory.</p></blockquote> <p data-svelte-h="svelte-1wtxduu">The <a href="https://github.com/huggingface/trl/blob/main/examples/scripts/openenv/echo.py" rel="nofollow">echo.py</a> script demonstrates a minimal, end-to-end integration between TRL and OpenEnv. In this example, the <a href="https://meta-pytorch.org/OpenEnv/environments/echo/" rel="nofollow">Echo environment</a> rewards completions based on their text length, encouraging the model to generate longer outputs. This pattern can be extended to any custom environment that provides structured feedback or task-based rewards:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> echo_env <span class="hljs-keyword">import</span> EchoEnv, EchoAction
	<span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> Dataset
	<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> GRPOConfig, GRPOTrainer
	<span class="hljs-keyword">from</span> trl.experimental.openenv <span class="hljs-keyword">import</span> generate_rollout_completions

	<span class="hljs-comment"># Create HTTP client for Echo Environment</span>
	client = EchoEnv.from_hub(<span class="hljs-string">"openenv/echo-env"</span>)

	<span class="hljs-string">"""
	Alternatively, you can start the environment manually with Docker and connect to it:

	# Step 1: Start the Echo environment
	docker run -d -p 8001:8000 registry.hf.space/openenv-echo-env:latest

	# Step 2: Connect the client to the running container
	client = EchoEnv(base_url="http://0.0.0.0:8001")
	"""</span>

	<span class="hljs-keyword">def</span> <span class="hljs-title function_">rollout_func</span>(<span class="hljs-params">prompts: <span class="hljs-built_in">list</span>[<span class="hljs-built_in">str</span>], trainer: GRPOTrainer</span>):
	<span class="hljs-comment"># 1. Generate completions using TRL's helper (works for colocated vLLM)</span>
	outputs = generate_rollout_completions(trainer, prompts)
	tokenizer = trainer.processing_class
	completions_text = [
	tokenizer.decode(out[<span class="hljs-string">"completion_ids"</span>], skip_special_tokens=<span class="hljs-literal">True</span>) <span class="hljs-keyword">for</span> out <span class="hljs-keyword">in</span> outputs
	]

	<span class="hljs-comment"># 2. Step through the environment to get rewards</span>
	client.reset()
	env_rewards = []
	<span class="hljs-keyword">for</span> msg <span class="hljs-keyword">in</span> completions_text:
	env_result = client.step(EchoAction(message=msg))
	env_rewards.append(env_result.reward)

	<span class="hljs-comment"># 3. Add environment rewards as extra field</span>
	<span class="hljs-keyword">return</span> {
	<span class="hljs-string">"prompt_ids"</span>: [out[<span class="hljs-string">"prompt_ids"</span>] <span class="hljs-keyword">for</span> out <span class="hljs-keyword">in</span> outputs],
	<span class="hljs-string">"completion_ids"</span>: [out[<span class="hljs-string">"completion_ids"</span>] <span class="hljs-keyword">for</span> out <span class="hljs-keyword">in</span> outputs],
	<span class="hljs-string">"logprobs"</span>: [out[<span class="hljs-string">"logprobs"</span>] <span class="hljs-keyword">for</span> out <span class="hljs-keyword">in</span> outputs],
	<span class="hljs-string">"env_reward"</span>: env_rewards,
	}

	<span class="hljs-keyword">def</span> <span class="hljs-title function_">reward_from_env</span>(<span class="hljs-params">completions, **kwargs</span>):
	<span class="hljs-string">"""Extract environment rewards passed via rollout_func kwargs."""</span>
	env_rewards = kwargs.get(<span class="hljs-string">"env_reward"</span>, [])
	<span class="hljs-keyword">return</span> [<span class="hljs-built_in">float</span>(reward) <span class="hljs-keyword">for</span> reward <span class="hljs-keyword">in</span> env_rewards] <span class="hljs-keyword">if</span> env_rewards <span class="hljs-keyword">else</span> [<span class="hljs-number">0.0</span>] * <span class="hljs-built_in">len</span>(completions)

	dataset = Dataset.from_dict({<span class="hljs-string">"prompt"</span>: [<span class="hljs-string">"You are an AI that interacts with an Echo environment. Word to echo:"</span>] * <span class="hljs-number">64</span>})

	<span class="hljs-comment"># Setup trainer with custom rollout</span>
	trainer = GRPOTrainer(
	model=<span class="hljs-string">"Qwen/Qwen2.5-0.5B-Instruct"</span>,
	reward_funcs=reward_from_env,
	train_dataset=dataset,
	rollout_func=rollout_func, <span class="hljs-comment"># Use custom rollout</span>
	args=GRPOConfig(
	use_vllm=<span class="hljs-literal">True</span>,
	vllm_mode=<span class="hljs-string">"colocate"</span>, <span class="hljs-comment"># Use colocate mode (default)</span>
	num_train_epochs=<span class="hljs-number">1</span>,
	num_generations=<span class="hljs-number">8</span>,
	max_completion_length=<span class="hljs-number">2048</span>,
	per_device_train_batch_size=<span class="hljs-number">8</span>,
	gradient_accumulation_steps=<span class="hljs-number">4</span>,
	),
	)
	trainer.train()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1m5nu2f">That’s it! Now that you’ve seen the full example, let’s unpack how the main pieces fit together.</p> <ol data-svelte-h="svelte-t2cr8h"><li><strong>Environment Client:</strong> <code>EchoEnv</code> implements an HTTP interface to interact with the environment server.</li> <li><strong>Custom rollout:</strong> The <code>rollout_func</code> generates completions and steps through the environment to collect rewards.</li> <li><strong>Extra fields:</strong> The rollout adds <code>env_reward</code> to the result dictionary, which is automatically passed to reward functions.</li> <li><strong>Reward function:</strong> Extracts <code>env_reward</code> from <code>kwargs</code> to apply environment-computed rewards during training.</li></ol> <blockquote class="tip" data-svelte-h="svelte-1wa27di"><p>The trainer-aware rollout hook works in both vLLM server and colocate modes. Use <code>trl.experimental.openenv.generate_rollout_completions</code> so you reuse TRL’s sampling configuration automatically.</p></blockquote> <h3 class="relative group"><a id="running-the-example" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#running-the-example"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 
56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Running the Example</span></h3> <p data-svelte-h="svelte-cwsck5">You can run the example in either colocate mode (1 GPU) or server mode (2 GPUs):</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">colocate </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">server </div></div> <div class="language-select"><p data-svelte-h="svelte-vv39h7"><strong>Colocate mode (1 GPU, recommended)</strong></p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; 
"></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->python examples/scripts/openenv/echo.py --env-mode space --env-host https://openenv-echo-env.hf.space --vllm-mode colocate<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-10q1sb9">This runs vLLM in the same process as training, requiring only a single GPU.</p> </div> <p data-svelte-h="svelte-2bovyv">Alternatively, you can manually start the Echo environment in a Docker container before running the training:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Launch the Echo environment</span>
	docker run -d -p 8001:8000 registry.hf.space/openenv-echo-env:latest

	<span class="hljs-comment"># Run training with docker-local mode</span>
	python examples/scripts/openenv/echo.py --env-mode docker-local --vllm-mode colocate<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-klksjg">Below is the reward curve from training:</p> <iframe src="https://trl-lib-trackio.hf.space?project=openenv&metrics=train/rewards/reward_from_env/mean&runs=qgallouedec-1761202871&sidebar=hidden&navbar=hidden" style="width:600px; height:500px; border:0;"></iframe> <h2 class="relative group"><a id="advanced-example" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#advanced-example"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Advanced Example</span></h2> <p data-svelte-h="svelte-kauv7a">Let’s level this up a bit by training a model to interact with a more complex environment. We’ll use the word-guessing game <a href="https://www.nytimes.com/games/wordle/index.html" rel="nofollow">Wordle</a> from the <a href="https://meta-pytorch.org/OpenEnv/environments/textarena/" rel="nofollow"><code>TextArena</code></a> environment.</p> <blockquote class="note" data-svelte-h="svelte-1gllx0n"><p><br>
	You can explore the notebook version of this example <a href="https://github.com/huggingface/trl/blob/main/examples/notebooks/openenv_wordle_grpo.ipynb" rel="nofollow">here</a>.</p></blockquote> <h3 class="relative group"><a id="the-textarena-environment" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#the-textarena-environment"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>The TextArena Environment</span></h3> <p data-svelte-h="svelte-amaete"><a href="https://huggingface.co/papers/2504.11442" rel="nofollow">TextArena</a> is an open-source collection of competitive text-based games designed to evaluate reasoning skills in LLMs using textual games like Wordle, Snake, Tic-Tac-Toe, and more. Research has shown that such games improve model performance on reasoning tasks.</p> <p data-svelte-h="svelte-or2o6k"><img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/text_arena_evals.png" alt="image of TextArena"></p> <p data-svelte-h="svelte-1hzgk1m">We will use the <code>TextArena</code> environment to train a model to play Wordle. 
The environment is a simple text based response environment that allows the model to interact with the game by making guesses and receive feedback on them.</p> <h3 class="relative group"><a id="wordle" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#wordle"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Wordle</span></h3> <p data-svelte-h="svelte-1moxxfn">Wordle is a useful game to train a model on because it requires the model to reason about the word and the feedback provided by the environment. Also, it is a purely language based game that requires no external tools or knowledge. Furthermore, we found that models from 1 billion parameters and up are able to improve on wordle and only require 8 tokens to generate a guess, which makes the game a good benchmark to experiment with Reinforcement Learning environments without significant compute requirements.</p> <blockquote class="note"><p data-svelte-h="svelte-iy9rjv">How does Wordle work?
	Wordle is a word guessing game where the player has to guess a 5-letter word. The player can make 6 guesses, and for each guess, the environment will provide feedback on the correctness of the guess. The player wins if they guess the word in 6 guesses or fewer. It challenges the model to generate words that are likely to be correct, and to learn from the feedback provided by the environment.</p> <p data-svelte-h="svelte-u6inub">For example, if the wordle environment returns the following feedback:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->G U E S S
	<span class="hljs-keyword">X</span> G <span class="hljs-keyword">Y</span> <span class="hljs-keyword">X</span> <span class="hljs-keyword">X</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-vetrbn">The model has guessed the word “GUESS” and the environment has provided feedback as the letters X, G, and Y, which correspond to the blank, green, and yellow colors in the original game. From this feedback, the model should learn that the word “GUESS” is incorrect. The letter “E” is in the word, but in the wrong position. The letter “U” is correct and in the correct position.</p></blockquote> <p data-svelte-h="svelte-kdti1j">In the TextArena environment, a reward is only given when the model wins the game. The reward is 1.0 if the model wins, and 0.0 otherwise. This is not a very efficient reward signal for the model, so we have added a number of custom reward functions to the script to help the model learn to play the game. The extensible nature of <code>reward_funcs</code> and <code>rollout_func</code> allows you to add any custom reward function you want to the script.</p> <h3 class="relative group"><a id="rollout-function" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#rollout-function"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 
8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Rollout Function</span></h3> <p data-svelte-h="svelte-11hv3wq">The rollout function runs one full Wordle episode, prompting the model for a guess each turn and capturing both environment rewards and auxiliary signals such as letter coverage and repetition penalties.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">rollout_once</span>(<span class="hljs-params">
	trainer: GRPOTrainer,
	env: TextArenaEnv,
	tokenizer: AutoTokenizer,
	dataset_prompt: <span class="hljs-built_in">str</span>,
	system_prompt: <span class="hljs-built_in">str</span>,
	max_turns: <span class="hljs-built_in">int</span>,
	</span>) -> <span class="hljs-built_in">dict</span>[<span class="hljs-built_in">str</span>, <span class="hljs-built_in">list</span>]:
	result = env.reset()
	observation = result.observation

	prompt_ids: <span class="hljs-built_in">list</span>[<span class="hljs-built_in">int</span>] = []
	completion_ids: <span class="hljs-built_in">list</span>[<span class="hljs-built_in">int</span>] = []
	logprobs: <span class="hljs-built_in">list</span>[<span class="hljs-built_in">float</span>] = []
	raw_rewards: <span class="hljs-built_in">list</span>[<span class="hljs-built_in">float</span>] = []
	green_scores: <span class="hljs-built_in">list</span>[<span class="hljs-built_in">float</span>] = []
	yellow_scores: <span class="hljs-built_in">list</span>[<span class="hljs-built_in">float</span>] = []
	repetition_scores: <span class="hljs-built_in">list</span>[<span class="hljs-built_in">float</span>] = []
	correct_scores: <span class="hljs-built_in">list</span>[<span class="hljs-built_in">float</span>] = []
	guess_counts: <span class="hljs-built_in">dict</span>[<span class="hljs-built_in">str</span>, <span class="hljs-built_in">int</span>] = {}

	<span class="hljs-keyword">for</span> _turn <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(max_turns):
	<span class="hljs-comment"># when the game is over the environment will return a done=True</span>
	<span class="hljs-keyword">if</span> result.done:
	<span class="hljs-keyword">break</span>

	<span class="hljs-comment"># set up the prompt for the model</span>
	base_prompt = observation.prompt <span class="hljs-keyword">or</span> dataset_prompt
	user_prompt = make_user_prompt(base_prompt, observation.messages)
	messages = [
	{<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: system_prompt},
	{<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: user_prompt},
	]
	prompt_text = tokenizer.apply_chat_template(
	messages,
	add_generation_prompt=<span class="hljs-literal">True</span>,
	tokenize=<span class="hljs-literal">False</span>,
	enable_thinking=<span class="hljs-literal">False</span>,
	)

	<span class="hljs-comment"># Generate completion using trainer (works for both colocate and server modes)</span>
	rollout_outputs = generate_rollout_completions(trainer, [prompt_text])[<span class="hljs-number">0</span>]
	prompt_ids.extend(rollout_outputs[<span class="hljs-string">"prompt_ids"</span>])
	completion_ids.extend(rollout_outputs[<span class="hljs-string">"completion_ids"</span>])
	logprobs.extend(rollout_outputs[<span class="hljs-string">"logprobs"</span>])
	completion_text = rollout_outputs.get(<span class="hljs-string">"text"</span>) <span class="hljs-keyword">or</span> tokenizer.decode(
	rollout_outputs[<span class="hljs-string">"completion_ids"</span>], skip_special_tokens=<span class="hljs-literal">True</span>
	)

	<span class="hljs-comment"># extract the guess from the completion</span>
	guess = extract_guess(completion_text)

	<span class="hljs-comment"># step the environment with the guess</span>
	result = env.step(TextArenaAction(message=guess))
	raw_rewards.append(<span class="hljs-built_in">float</span>(result.reward <span class="hljs-keyword">or</span> <span class="hljs-number">0.0</span>))
	observation = result.observation
	correct_score = <span class="hljs-built_in">float</span>(result.reward <span class="hljs-keyword">or</span> <span class="hljs-number">0.0</span>)
	feedback = extract_wordle_feedback(observation)

	<span class="hljs-comment"># Update guess counts</span>
	previous_occurrences = guess_counts.get(guess, <span class="hljs-number">0</span>)
	repetition_score = scale_repetition_score(previous_occurrences, <span class="hljs-built_in">len</span>(guess_counts))
	guess_counts[guess] = previous_occurrences + <span class="hljs-number">1</span>

	<span class="hljs-comment"># calculate custom reward signals from the feedback</span>
	<span class="hljs-keyword">if</span> <span class="hljs-keyword">not</span> feedback:
	green_score = <span class="hljs-number">0.0</span>
	yellow_score = <span class="hljs-number">0.0</span>
	<span class="hljs-keyword">else</span>:
	green_count, yellow_count = extract_feedback_counts(feedback)
	green_score = green_count / <span class="hljs-number">5.0</span>
	yellow_score = yellow_count / <span class="hljs-number">5.0</span>

	repetition_scores.append(repetition_score)
	green_scores.append(green_score)
	yellow_scores.append(yellow_score)
	correct_scores.append(correct_score)

	correct_reward_value = correct_scores[-<span class="hljs-number">1</span>] <span class="hljs-keyword">if</span> correct_scores <span class="hljs-keyword">else</span> (raw_rewards[-<span class="hljs-number">1</span>] <span class="hljs-keyword">if</span> raw_rewards <span class="hljs-keyword">else</span> <span class="hljs-number">0.0</span>)

	<span class="hljs-keyword">return</span> {
	<span class="hljs-string">"prompt_ids"</span>: prompt_ids,
	<span class="hljs-string">"completion_ids"</span>: completion_ids,
	<span class="hljs-string">"logprobs"</span>: logprobs,
	<span class="hljs-string">"raw_rewards"</span>: raw_rewards,
	<span class="hljs-string">"correct_reward"</span>: correct_reward_value,
	<span class="hljs-string">"green_reward"</span>: green_scores[-<span class="hljs-number">1</span>] <span class="hljs-keyword">if</span> green_scores <span class="hljs-keyword">else</span> <span class="hljs-number">0.0</span>,
	<span class="hljs-string">"yellow_reward"</span>: yellow_scores[-<span class="hljs-number">1</span>] <span class="hljs-keyword">if</span> yellow_scores <span class="hljs-keyword">else</span> <span class="hljs-number">0.0</span>,
	<span class="hljs-string">"repetition_reward"</span>: repetition_scores[-<span class="hljs-number">1</span>] <span class="hljs-keyword">if</span> repetition_scores <span class="hljs-keyword">else</span> <span class="hljs-number">0.0</span>,
	}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1c2qvit">The environment has a reward signal based on the completion of the game. We found that most models struggle to ever win the game, so we have added a number of custom reward functions to the script to help the model learn to play the game more iteratively. At first, the model will learn to cover new letters and avoid repeating guesses. As it improves, it will learn to win the game.</p> <h3 class="relative group"><a id="reward-functions" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#reward-functions"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Reward Functions</span></h3> <p data-svelte-h="svelte-modcgz">We log four reward streams that encourage the model to solve the puzzle, cover new letters, and avoid repeating guesses:</p> <ul data-svelte-h="svelte-1tq1s16"><li><code>reward_correct</code>: final win/loss signal from the environment.</li> <li><code>reward_greens</code>: density of green letters in the last feedback.</li> <li><code>reward_yellows</code>: density of yellow letters in the last feedback.</li> <li><code>reward_repetition</code>: penalty 
for guessing the same token multiple times.</li></ul> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">reward_correct</span>(<span class="hljs-params">completions: <span class="hljs-type">List</span>[<span class="hljs-built_in">str</span>], **kwargs: <span class="hljs-type">Optional</span>[<span class="hljs-type">Dict</span>]</span>) -> <span class="hljs-type">List</span>[<span class="hljs-built_in">float</span>]:
	rewards = kwargs.get(<span class="hljs-string">"correct_reward"</span>) <span class="hljs-keyword">if</span> kwargs <span class="hljs-keyword">else</span> <span class="hljs-literal">None</span>
	<span class="hljs-keyword">return</span> [<span class="hljs-built_in">float</span>(r) <span class="hljs-keyword">for</span> r <span class="hljs-keyword">in</span> rewards] <span class="hljs-keyword">if</span> rewards <span class="hljs-keyword">is</span> <span class="hljs-keyword">not</span> <span class="hljs-literal">None</span> <span class="hljs-keyword">else</span> [<span class="hljs-number">0.0</span>] * <span class="hljs-built_in">len</span>(completions)


	<span class="hljs-keyword">def</span> <span class="hljs-title function_">reward_greens</span>(<span class="hljs-params">completions: <span class="hljs-type">List</span>[<span class="hljs-built_in">str</span>], **kwargs: <span class="hljs-type">Optional</span>[<span class="hljs-type">Dict</span>]</span>) -> <span class="hljs-type">List</span>[<span class="hljs-built_in">float</span>]:
	rewards = kwargs.get(<span class="hljs-string">"green_reward"</span>) <span class="hljs-keyword">if</span> kwargs <span class="hljs-keyword">else</span> <span class="hljs-literal">None</span>
	<span class="hljs-keyword">return</span> [<span class="hljs-built_in">float</span>(r) <span class="hljs-keyword">for</span> r <span class="hljs-keyword">in</span> rewards] <span class="hljs-keyword">if</span> rewards <span class="hljs-keyword">is</span> <span class="hljs-keyword">not</span> <span class="hljs-literal">None</span> <span class="hljs-keyword">else</span> [<span class="hljs-number">0.0</span>] * <span class="hljs-built_in">len</span>(completions)


	<span class="hljs-keyword">def</span> <span class="hljs-title function_">reward_yellows</span>(<span class="hljs-params">completions: <span class="hljs-type">List</span>[<span class="hljs-built_in">str</span>], **kwargs: <span class="hljs-type">Optional</span>[<span class="hljs-type">Dict</span>]</span>) -> <span class="hljs-type">List</span>[<span class="hljs-built_in">float</span>]:
	rewards = kwargs.get(<span class="hljs-string">"yellow_reward"</span>) <span class="hljs-keyword">if</span> kwargs <span class="hljs-keyword">else</span> <span class="hljs-literal">None</span>
	<span class="hljs-keyword">return</span> [<span class="hljs-built_in">float</span>(r) <span class="hljs-keyword">for</span> r <span class="hljs-keyword">in</span> rewards] <span class="hljs-keyword">if</span> rewards <span class="hljs-keyword">is</span> <span class="hljs-keyword">not</span> <span class="hljs-literal">None</span> <span class="hljs-keyword">else</span> [<span class="hljs-number">0.0</span>] * <span class="hljs-built_in">len</span>(completions)


	<span class="hljs-keyword">def</span> <span class="hljs-title function_">reward_repetition</span>(<span class="hljs-params">completions: <span class="hljs-type">List</span>[<span class="hljs-built_in">str</span>], **kwargs: <span class="hljs-type">Optional</span>[<span class="hljs-type">Dict</span>]</span>) -> <span class="hljs-type">List</span>[<span class="hljs-built_in">float</span>]:
	rewards = kwargs.get(<span class="hljs-string">"repetition_reward"</span>) <span class="hljs-keyword">if</span> kwargs <span class="hljs-keyword">else</span> <span class="hljs-literal">None</span>
	<span class="hljs-keyword">return</span> [<span class="hljs-built_in">float</span>(r) <span class="hljs-keyword">for</span> r <span class="hljs-keyword">in</span> rewards] <span class="hljs-keyword">if</span> rewards <span class="hljs-keyword">is</span> <span class="hljs-keyword">not</span> <span class="hljs-literal">None</span> <span class="hljs-keyword">else</span> [<span class="hljs-number">0.0</span>] * <span class="hljs-built_in">len</span>(completions)<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="training-the-model" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#training-the-model"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Training the Model</span></h3> <p data-svelte-h="svelte-3udbcx">The training script wires the custom rollout and rewards into <code>GRPOTrainer</code>. 
The CLI exposes the configuration used during development as defaults, so you can override endpoints or hyperparameters at launch time.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->parser = argparse.ArgumentParser()
	<span class="hljs-comment"># ... add CLI arguments with sensible defaults ...</span>
	cli_args = parser.parse_args()

	trainer = GRPOTrainer(
	model=cli_args.model_id,
	processing_class=tokenizer,
	reward_funcs=[
	reward_correct,
	reward_greens,
	reward_yellows,
	reward_repetition,
	],
	train_dataset=dataset,
	args=grpo_config,
	rollout_func=<span class="hljs-keyword">lambda</span> prompts, trainer: rollout_func(
	env=env,
	tokenizer=tokenizer,
	prompts=prompts,
	trainer=trainer,
	cli_args=cli_args,
	system_prompt=system_prompt,
	),
	)
	trainer.train()<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="running-the-advanced-example" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#running-the-advanced-example"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Running the Advanced Example</span></h3> <p data-svelte-h="svelte-totpx0">You can run the Wordle example in either colocate mode (1 GPU) or server mode (2 GPUs):</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">colocate </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">server </div></div> <div class="language-select"><p data-svelte-h="svelte-vv39h7"><strong>Colocate mode (1 GPU, recommended)</strong></p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 
cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->python examples/scripts/openenv/wordle.py --vllm-mode colocate<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-10q1sb9">This runs vLLM in the same process as training, requiring only a single GPU.</p> </div> <p data-svelte-h="svelte-kwtbpj">You can also manually start the TextArena environment in a Docker container before running the training:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path 
d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Launch the TextArena environment</span>
	docker run -d -p 8001:8001 registry.hf.space/burtenshaw-textarena:latest<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-4yjfd2">Then connect to it using <code>--env-mode docker-local --env-host localhost --env-port 8001</code>.</p> <h3 class="relative group"><a id="results" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#results"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Results</span></h3> <p data-svelte-h="svelte-nha02r">The resulting model improves its performance on the game, both by reducing the number of repetitions and by increasing the number of correct guesses. However, the Qwen3-1.7B model we trained is not able to consistently win the game. 
The following reward curve shows the coverage of the model’s guesses and the coverage of correct yellow (Y) and green (G) letters.</p> <iframe src="https://burtenshaw-wordle-grpo.hf.space?project=group-Qwen-Qwen3-17B&metrics=reward&runs=run-2025-10-26_09-39-49,run-2025-10-26_08-04-49&sidebar=hidden&navbar=hidden" title="Wordle GRPO training reward curves" style="width:1600px; height:500px; border:0;"></iframe> <p data-svelte-h="svelte-pmkif8">We experimented with larger models like <code>gpt-oss-20b</code> and found that the model was able to consistently win the game. However, this requires a lot of compute to train the model. Why not try this out yourself?</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/trl/blob/main/docs/source/openenv.md" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>

	<script>
	{
	// Client bootstrap: publish the path settings, then load the two entry modules
	// and start the app on the element that contains this <script> tag.

	// Path settings for this build. Both `assets` and `base` point at the same
	// docs prefix; `env` is empty.
	// NOTE(review): assigned without a declaration keyword, so this presumably
	// becomes a global that the imported bundle reads — confirm against start.js.
	__sveltekit_1aprtyu = {
	assets: "/docs/trl/pr_5321/en",
	base: "/docs/trl/pr_5321/en",
	env: {}
	};

	// Captured synchronously, before the asynchronous imports below resolve:
	// document.currentScript is only set while the script is executing.
	const element = document.currentScript.parentElement;

	// Passed through unchanged as `data` to kit.start below.
	// NOTE(review): the two null entries presumably line up with the two
	// node_ids — confirm.
	const data = [null,null];

	// Load both entry modules in parallel; start() runs only once both resolve.
	Promise.all([
	import("/docs/trl/pr_5321/en/_app/immutable/entry/start.d05f7d8c.js"),
	import("/docs/trl/pr_5321/en/_app/immutable/entry/app.4dcaf2cd.js")
	]).then(([kit, app]) => {
	// `kit` is the start module and `app` the app module, in import order.
	kit.start(app, element, {
	// NOTE(review): meaning of these ids is not visible here; presumably
	// indices of the layout/page nodes to hydrate — confirm.
	node_ids: [0, 37],
	data,
	form: null,
	error: null
	});
	});
	}
	</script>

Xet Storage Details

Size: 88.2 kB
Xet hash: 212b21d57b8754a9d4624d0b342c1fff70eac99db720f397811929640723a525

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.