Buckets:
| import{s as Ja,a as Ta,o as da,n as Dl}from"../chunks/scheduler.7b731bd4.js";import{S as ya,i as ja,e as o,s,c as p,h as ha,a as i,d as t,b as a,f as Pl,g as c,j as r,k as E,v as se,l as z,m as n,n as w,t as m,o as T,p as u}from"../chunks/index.cc268345.js";import{C as fa,H as A,E as Ua}from"../chunks/MermaidChart.svelte_svelte_type_style_lang.f0d99f98.js";import{C as $}from"../chunks/CodeBlock.169a125f.js";import{H as ua,a as Kl}from"../chunks/HfOption.9f04abd1.js";function Ia(G){let M,C="<strong>Colocate mode (1 GPU, recommended)</strong>",U,j,d,J,y="This runs vLLM in the same process as training, requiring only a single GPU.",v;return j=new $({props:{code:"cHl0aG9uJTIwZXhhbXBsZXMlMkZzY3JpcHRzJTJGb3BlbmVudiUyRndvcmRsZS5weSUyMC0tdmxsbS1tb2RlJTIwY29sb2NhdGU=",highlighted:"python examples/scripts/openenv/wordle.py --vllm-mode colocate",wrap:!1}}),{c(){M=o("p"),M.innerHTML=C,U=s(),p(j.$$.fragment),d=s(),J=o("p"),J.textContent=y},l(h){M=i(h,"P",{"data-svelte-h":!0}),r(M)!=="svelte-vv39h7"&&(M.innerHTML=C),U=a(h),c(j.$$.fragment,h),d=a(h),J=i(h,"P",{"data-svelte-h":!0}),r(J)!=="svelte-10q1sb9"&&(J.textContent=y)},m(h,B){n(h,M,B),n(h,U,B),w(j,h,B),n(h,d,B),n(h,J,B),v=!0},p:Dl,i(h){v||(m(j.$$.fragment,h),v=!0)},o(h){T(j.$$.fragment,h),v=!1},d(h){h&&(t(M),t(U),t(d),t(J)),u(j,h)}}}function va(G){let M,C="<strong>Server mode (2+ GPUs, scalable)</strong>",U,j,d;return j=new $({props:{code:"JTIzJTIwVGVybWluYWwlMjAxJTNBJTIwU3RhcnQlMjB2TExNJTIwaW5mZXJlbmNlJTIwc2VydmVyJTBBQ1VEQV9WSVNJQkxFX0RFVklDRVMlM0QwJTIwdHJsJTIwdmxsbS1zZXJ2ZSUyMC0tbW9kZWwlMjBRd2VuJTJGUXdlbjMtMS43QiUyMC0taG9zdCUyMDAuMC4wLjAlMjAtLXBvcnQlMjA4MDAwJTBBJTBBJTIzJTIwVGVybWluYWwlMjAyJTNBJTIwUnVuJTIwR1JQTyUyMHRyYWluaW5nJTIwd2l0aCUyME9wZW5FbnYlMEFDVURBX1ZJU0lCTEVfREVWSUNFUyUzRDElMjBweXRob24lMjBleGFtcGxlcyUyRnNjcmlwdHMlMkZvcGVuZW52JTJGd29yZGxlLnB5JTIwLS12bGxtLW1vZGUlMjBzZXJ2ZXIlMjAtLXZsbG0tc2VydmVyLXVybCUyMGh0dHAlM0ElMkYlMkZsb2NhbGhvc3QlM0E4MDAw",highlighted:`<span class="hljs-comment"># Terminal 1: Start vLLM inference server</span> | |
| CUDA_VISIBLE_DEVICES=0 trl vllm-serve --model Qwen/Qwen3-1.7B --host 0.0.0.0 --port 8000 | |
| <span class="hljs-comment"># Terminal 2: Run GRPO training with OpenEnv</span> | |
| CUDA_VISIBLE_DEVICES=1 python examples/scripts/openenv/wordle.py --vllm-mode server --vllm-server-url http://localhost:8000`,wrap:!1}}),{c(){M=o("p"),M.innerHTML=C,U=s(),p(j.$$.fragment)},l(J){M=i(J,"P",{"data-svelte-h":!0}),r(M)!=="svelte-fqqzd3"&&(M.innerHTML=C),U=a(J),c(j.$$.fragment,J)},m(J,y){n(J,M,y),n(J,U,y),w(j,J,y),d=!0},p:Dl,i(J){d||(m(j.$$.fragment,J),d=!0)},o(J){T(j.$$.fragment,J),d=!1},d(J){J&&(t(M),t(U)),u(j,J)}}}function ba(G){let M,C,U,j;return M=new Kl({props:{id:"wordle_vllm_mode",option:"colocate",$$slots:{default:[Ia]},$$scope:{ctx:G}}}),U=new Kl({props:{id:"wordle_vllm_mode",option:"server",$$slots:{default:[va]},$$scope:{ctx:G}}}),{c(){p(M.$$.fragment),C=s(),p(U.$$.fragment)},l(d){c(M.$$.fragment,d),C=a(d),c(U.$$.fragment,d)},m(d,J){w(M,d,J),n(d,C,J),w(U,d,J),j=!0},p(d,J){const y={};J&2&&(y.$$scope={dirty:J,ctx:d}),M.$set(y);const v={};J&2&&(v.$$scope={dirty:J,ctx:d}),U.$set(v)},i(d){j||(m(M.$$.fragment,d),m(U.$$.fragment,d),j=!0)},o(d){T(M.$$.fragment,d),T(U.$$.fragment,d),j=!1},d(d){d&&t(C),u(M,d),u(U,d)}}}function Ca(G){let M,C="<strong>Connect to a remote Hugging Face Space</strong> <em>(simplest)</em>",U,j,d="Most example scripts default to a hosted Space (no setup needed):",J,y,v,h,B='<p>For training, <strong>duplicate the Space to your own account</strong> to avoid concurrency issues. The trainer opens N simultaneous WebSocket connections (one per generation), and shared Spaces may not support this. See <a href="#server-concurrency">Server concurrency</a> for details.</p>',g;return y=new $({props:{code:"ZW52JTIwJTNEJTIwRWNob0VudihiYXNlX3VybCUzRCUyMmh0dHBzJTNBJTJGJTJGb3BlbmVudi1lY2hvLWVudi5oZi5zcGFjZSUyMik=",highlighted:'env = EchoEnv(base_url=<span class="hljs-string">"https://openenv-echo-env.hf.space"</span>)',wrap:!1}}),{c(){M=o("p"),M.innerHTML=C,U=s(),j=o("p"),j.textContent=d,J=s(),p(y.$$.fragment),v=s(),h=o("blockquote"),h.innerHTML=B,this.h()},l(b){M=i(b,"P",{"data-svelte-h":!0}),r(M)!=="svelte-1d83umq"&&(M.innerHTML=C),U=a(b),j=i(b,"P",{"data-svelte-h":!0}),r(j)!=="svelte-1fk7qft"&&(j.textContent=d),J=a(b),c(y.$$.fragment,b),v=a(b),h=i(b,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),r(h)!=="svelte-fwc1d1"&&(h.innerHTML=B),this.h()},h(){E(h,"class","warning")},m(b,k){n(b,M,k),n(b,U,k),n(b,j,k),n(b,J,k),w(y,b,k),n(b,v,k),n(b,h,k),g=!0},p:Dl,i(b){g||(m(y.$$.fragment,b),g=!0)},o(b){T(y.$$.fragment,b),g=!1},d(b){b&&(t(M),t(U),t(j),t(J),t(v),t(h)),u(y,b)}}}function ga(G){let M,C="<strong>Docker container</strong> <em>(recommended for production)</em>",U,j,d,J,y="Then connect:",v,h,B,g,b="We map port 8001 to 8000 to leave port 8000 available for a vLLM server.",k,I,_="You can also start the container programmatically:",ae,W,x,V,H='<p>You can find the Docker image for any Space on the Hub: open the Space page → <strong>⋮ (three dots)</strong> → <strong>“Run locally.”</strong></p> <p><img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/open_env_launch_docker.png" alt="open_env_launch_docker"/></p>',oe;return j=new $({props:{code:"ZG9ja2VyJTIwcnVuJTIwLWQlMjAtcCUyMDgwMDElM0E4MDAwJTIwLS1wbGF0Zm9ybSUyMGxpbnV4JTJGYW1kNjQlMjByZWdpc3RyeS5oZi5zcGFjZSUyRm9wZW5lbnYtZWNoby1lbnYlM0FsYXRlc3Q=",highlighted:"docker run -d -p 8001:8000 --platform linux/amd64 registry.hf.space/openenv-echo-env:latest",wrap:!1}}),h=new $({props:{code:"ZW52JTIwJTNEJTIwRWNob0VudihiYXNlX3VybCUzRCUyMmh0dHAlM0ElMkYlMkYwLjAuMC4wJTNBODAwMSUyMik=",highlighted:'env = EchoEnv(base_url=<span class="hljs-string">"http://0.0.0.0:8001"</span>)',wrap:!1}}),W=new $({props:{code:"ZW52JTIwJTNEJTIwRWNob0Vudi5mcm9tX2RvY2tlcl9pbWFnZSglMjJyZWdpc3RyeS5oZi5zcGFjZSUyRm9wZW5lbnYtZWNoby1lbnYlM0FsYXRlc3QlMjIp",highlighted:'env = EchoEnv.from_docker_image(<span class="hljs-string">"registry.hf.space/openenv-echo-env:latest"</span>)',wrap:!1}}),{c(){M=o("p"),M.innerHTML=C,U=s(),p(j.$$.fragment),d=s(),J=o("p"),J.textContent=y,v=s(),p(h.$$.fragment),B=s(),g=o("p"),g.textContent=b,k=s(),I=o("p"),I.textContent=_,ae=s(),p(W.$$.fragment),x=s(),V=o("blockquote"),V.innerHTML=H,this.h()},l(f){M=i(f,"P",{"data-svelte-h":!0}),r(M)!=="svelte-1sdj1mp"&&(M.innerHTML=C),U=a(f),c(j.$$.fragment,f),d=a(f),J=i(f,"P",{"data-svelte-h":!0}),r(J)!=="svelte-15va8lx"&&(J.textContent=y),v=a(f),c(h.$$.fragment,f),B=a(f),g=i(f,"P",{"data-svelte-h":!0}),r(g)!=="svelte-1lyeqij"&&(g.textContent=b),k=a(f),I=i(f,"P",{"data-svelte-h":!0}),r(I)!=="svelte-1px95wy"&&(I.textContent=_),ae=a(f),c(W.$$.fragment,f),x=a(f),V=i(f,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),r(V)!=="svelte-zr92yj"&&(V.innerHTML=H),this.h()},h(){E(V,"class","note")},m(f,Z){n(f,M,Z),n(f,U,Z),w(j,f,Z),n(f,d,Z),n(f,J,Z),n(f,v,Z),w(h,f,Z),n(f,B,Z),n(f,g,Z),n(f,k,Z),n(f,I,Z),n(f,ae,Z),w(W,f,Z),n(f,x,Z),n(f,V,Z),oe=!0},p:Dl,i(f){oe||(m(j.$$.fragment,f),m(h.$$.fragment,f),m(W.$$.fragment,f),oe=!0)},o(f){T(j.$$.fragment,f),T(h.$$.fragment,f),T(W.$$.fragment,f),oe=!1},d(f){f&&(t(M),t(U),t(d),t(J),t(v),t(B),t(g),t(k),t(I),t(ae),t(x),t(V)),u(j,f),u(h,f),u(W,f)}}}function Aa(G){let M,C="<strong>Local Python process</strong> <em>(for development)</em>",U,j,d,J,y="Then connect:",v,h,B,g,b='For more details, see the <a href="https://meta-pytorch.org/OpenEnv/environments.html" rel="nofollow">OpenEnv catalog</a>.',k;return j=new $({props:{code:"aGYlMjBkb3dubG9hZCUyMG9wZW5lbnYlMkZlY2hvX2VudiUyMC0tcmVwby10eXBlJTNEc3BhY2UlMjAtLWxvY2FsLWRpciUzRGVjaG9fZW52JTBBcHl0aG9uJTIwLW0lMjB1dmljb3JuJTIwZWNob19lbnYuc3JjLmVudnMuZWNob19lbnYuc2VydmVyLmFwcCUzQWFwcCUyMC0taG9zdCUyMDAuMC4wLjAlMjAtLXBvcnQlMjA4MDAx",highlighted:`hf download openenv/echo_env --repo-type=space --local-dir=echo_env | |
| python -m uvicorn echo_env.src.envs.echo_env.server.app:app --host 0.0.0.0 --port 8001`,wrap:!1}}),h=new $({props:{code:"ZW52JTIwJTNEJTIwRWNob0VudihiYXNlX3VybCUzRCUyMmh0dHAlM0ElMkYlMkYwLjAuMC4wJTNBODAwMSUyMik=",highlighted:'env = EchoEnv(base_url=<span class="hljs-string">"http://0.0.0.0:8001"</span>)',wrap:!1}}),{c(){M=o("p"),M.innerHTML=C,U=s(),p(j.$$.fragment),d=s(),J=o("p"),J.textContent=y,v=s(),p(h.$$.fragment),B=s(),g=o("p"),g.innerHTML=b},l(I){M=i(I,"P",{"data-svelte-h":!0}),r(M)!=="svelte-1r6ikxz"&&(M.innerHTML=C),U=a(I),c(j.$$.fragment,I),d=a(I),J=i(I,"P",{"data-svelte-h":!0}),r(J)!=="svelte-15va8lx"&&(J.textContent=y),v=a(I),c(h.$$.fragment,I),B=a(I),g=i(I,"P",{"data-svelte-h":!0}),r(g)!=="svelte-1o2nuc5"&&(g.innerHTML=b)},m(I,_){n(I,M,_),n(I,U,_),w(j,I,_),n(I,d,_),n(I,J,_),n(I,v,_),w(h,I,_),n(I,B,_),n(I,g,_),k=!0},p:Dl,i(I){k||(m(j.$$.fragment,I),m(h.$$.fragment,I),k=!0)},o(I){T(j.$$.fragment,I),T(h.$$.fragment,I),k=!1},d(I){I&&(t(M),t(U),t(d),t(J),t(v),t(B),t(g)),u(j,I),u(h,I)}}}function $a(G){let M,C,U,j,d,J;return M=new Kl({props:{id:"env_mode",option:"space",$$slots:{default:[Ca]},$$scope:{ctx:G}}}),U=new Kl({props:{id:"env_mode",option:"docker",$$slots:{default:[ga]},$$scope:{ctx:G}}}),d=new Kl({props:{id:"env_mode",option:"local",$$slots:{default:[Aa]},$$scope:{ctx:G}}}),{c(){p(M.$$.fragment),C=s(),p(U.$$.fragment),j=s(),p(d.$$.fragment)},l(y){c(M.$$.fragment,y),C=a(y),c(U.$$.fragment,y),j=a(y),c(d.$$.fragment,y)},m(y,v){w(M,y,v),n(y,C,v),w(U,y,v),n(y,j,v),w(d,y,v),J=!0},p(y,v){const h={};v&2&&(h.$$scope={dirty:v,ctx:y}),M.$set(h);const B={};v&2&&(B.$$scope={dirty:v,ctx:y}),U.$set(B);const g={};v&2&&(g.$$scope={dirty:v,ctx:y}),d.$set(g)},i(y){J||(m(M.$$.fragment,y),m(U.$$.fragment,y),m(d.$$.fragment,y),J=!0)},o(y){T(M.$$.fragment,y),T(U.$$.fragment,y),T(d.$$.fragment,y),J=!1},d(y){y&&(t(C),t(j)),u(M,y),u(U,y),u(d,y)}}}function Ba(G){let M,C,U,j,d,J,y,v,h,B='<a href="https://github.com/meta-pytorch/OpenEnv" rel="nofollow">OpenEnv</a> is an open-source framework for defining, deploying, and interacting with environments in reinforcement learning (RL) and agentic workflows. It provides standardized APIs for environment interaction and supports running environments as backend servers (via WebSocket or containerised execution). You can find a collection of ready-to-use OpenEnv environments on the <a href="https://huggingface.co/collections/openenv/openenv-environment-hub" rel="nofollow">Hugging Face Hub</a>.',g,b,k='This guide covers <strong>how to integrate OpenEnv with TRL</strong>. For more on OpenEnv itself, see the <a href="https://meta-pytorch.org/OpenEnv/" rel="nofollow">OpenEnv docs</a>.',I,_,ae='<p>You can explore ready-to-use example <a href="example_overview#openenv-scripts">scripts</a> and <a href="example_overview#openenv-notebooks">notebooks</a> in the Examples Overview.</p>',W,x,V,H,oe='<a href="/docs/trl/pr_5607/en/gspo_token#trl.GRPOTrainer">GRPOTrainer</a> can be used to train agents. For agentic tasks, it supports two modes: <strong>tools</strong>, where the model can call external functions but each call is stateless and independent, and <strong>environments</strong>, which maintain state across turns, enabling genuine multi-turn interaction where the agent’s actions shape future observations. Use environments when continuity matters — for example, navigating a game, browsing a web page, or any task where what the agent sees next depends on what it did before.',f,Z,et,ie,as="OpenEnv environments are hosted as Hugging Face Spaces, which are also pip-installable Git repositories:",lt,re,tt,Me,os="This installs the <strong>environment client</strong> (e.g., <code>EchoEnv</code>) that communicates with the remote environment server via WebSocket, along with the action/observation models and all required dependencies (including <code>openenv-core</code>).",nt,S,is="<p>You can find the install command for any environment on its HF Space page. Click the <strong>⋮ (three dots)</strong> menu and select <strong>“Use this Space”</strong> to see the install instructions.</p>",st,q,rs="<p>You can also install the core package from PyPI with <code>pip install "openenv-core[core]>=0.2.1"</code>, but note that environment-specific dependencies may need to be installed separately.</p>",at,pe,Ms="For development, you can clone the OpenEnv repo and install locally:",ot,ce,it,R,zl,ps='Each environment script in TRL includes inline dependency metadata (PEP 723) so you can also run them directly with <a href="https://docs.astral.sh/uv/" rel="nofollow">uv</a>:',ls,we,ts,Sl,cs="This automatically installs the required environment package in an isolated virtual environment.",rt,me,Mt,Te,ws='The fastest way to understand the integration is a complete example. The <a href="https://github.com/huggingface/trl/blob/main/examples/scripts/openenv/echo.py" rel="nofollow">echo.py</a> script trains a model with the <a href="https://meta-pytorch.org/OpenEnv/environments/echo.html" rel="nofollow">Echo environment</a>, which rewards completions based on their text length:',pt,ue,ct,Je,ms="That’s it. Here’s what happens under the hood:",wt,de,Ts="<li><strong><code>environment_factory=EchoToolEnv</code></strong>: The trainer creates one <code>EchoToolEnv</code> instance per generation (pass the class, not an instance).</li> <li><strong><code>reset()</code></strong> is called at the start of each episode to initialize state. Returns an observation string (or <code>None</code>).</li> <li><strong>Tool discovery</strong>: The trainer discovers all public methods on the environment instance (here, <code>echo()</code>) and exposes them as function-calling tools. Each method must have a proper docstring with typed arguments, which the trainer uses to build the tool schema.</li> <li><strong>Multi-turn loop</strong>: The trainer generates a completion, parses tool calls, executes <code>echo()</code>, appends the result, and generates again, until the model stops calling tools or <code>max_completion_length</code> is reached.</li> <li><strong>Reward function</strong>: Reads <code>env.reward</code> from each environment instance after the episode (before the environment is reset).</li>",mt,ye,Tt,je,us="Below is the reward curve from training:",ut,N,Js,Jt,F,ds='<p>You can explore more ready-to-use example <a href="example_overview#openenv-scripts">scripts</a> and <a href="example_overview#openenv-notebooks">notebooks</a> in the Examples Overview.</p>',dt,he,yt,fe,ys='TRL’s <a href="/docs/trl/pr_5607/en/gspo_token#trl.GRPOTrainer">GRPOTrainer</a> supports interactive environment training through the <code>environment_factory</code> argument. When provided, the trainer automatically handles the multi-turn tool-calling loop: it generates completions, parses tool calls, executes them against the environment, and feeds the results back to the model. All without custom rollout code.',jt,Ue,ht,Ie,js="Your environment class must follow these rules:",ft,ve,hs="<li><strong><code>__init__(self)</code></strong> <em>(optional)</em>: If provided, must take no arguments. Use it to initialize state or clients. If you need external configuration (e.g., a URL), capture it from the enclosing scope or module-level variables.</li> <li><strong>`reset(self, </strong>kwargs)<code>**: Called at the start of each episode. Receives all dataset columns as keyword arguments. Return a string observation (or </code>None` for no initial observation).</li> <li><strong>Tool methods</strong>: Any public method (not starting with <code>_</code>) other than <code>reset</code> is automatically exposed as a tool. Each tool method must have a docstring with <code>Args:</code> descriptions, since the trainer uses these to generate the tool schema for the model.</li>",Ut,be,It,Ce,fs='<li><strong>State for reward</strong>: You can store any state you want on the environment instance (e.g., <code>self.reward</code>, <code>self.done</code>, etc.) and access it in your reward function via the <code>environments</code> parameter. Refer to the <a href="#quick-start">Quick Start guide</a> for an example of this pattern.</li> <li><strong>Error handling</strong>: If a tool method raises an exception (e.g., <code>ValueError("Game over.")</code>), the trainer catches it and feeds the error message back to the model as a tool response. This is the recommended way to signal that an action is invalid or that the episode has ended.</li>',vt,ge,bt,P,Us="<p>Tools must be <strong>individual methods</strong> with descriptive names and typed arguments (e.g., <code>guess(word: str)</code>, <code>move(direction: str)</code>). We do not recommend using generic methods like <code>step(action)</code>, since the model needs meaningful tool names and argument descriptions to learn tool calling.</p>",Ct,Ae,gt,$e,Is="Reward functions receive the <code>environments</code> parameter (a list of environment instances), so you can access any state stored during the episode:",At,Be,$t,Ze,vs='For more information on reward functions, see the <a href="grpo_trainer#custom-reward-functions">GRPO - Custom Reward Functions</a>.',Bt,_e,Zt,ke,bs="A few things we’ve found helpful when working with OpenEnv environments and GRPO:",_t,Ee,Cs='<li><strong>Simple rewards work well.</strong> In our experiments with Wordle and Sudoku, binary rewards (1.0 for success, 0.0 otherwise) gave cleaner training signals than shaped rewards with partial credit. GRPO compares completions within a group, so the relative ranking matters more than the absolute values.</li> <li><strong>Check the final state, not the path.</strong> When possible, let the environment judge the outcome (e.g., “did the model solve the puzzle?”) rather than checking if it followed a specific sequence of actions. This gives the model freedom to discover its own strategies.</li> <li><strong>Test your reward before training.</strong> Run a few episodes manually (see the <a href="https://github.com/huggingface/trl/blob/main/examples/notebooks/openenv_wordle_grpo.ipynb" rel="nofollow">Wordle example notebook</a>) to confirm the environment returns sensible rewards. If a capable model can’t score higher than a random baseline, the reward signal may need adjustment.</li>',kt,Ge,Et,Ve,gs="The <code>max_completion_length</code> parameter limits the <strong>total number of tokens across the entire multi-turn conversation</strong> (all model generations + tool results combined), not just a single generation. For environments with many turns (e.g., Sudoku with dozens of moves), you may need to increase it:",Gt,We,Vt,xe,As="If episodes are being cut short (model stops mid-game), this is likely the cause.",Wt,Re,xt,Ne,$s='Let’s train a model to play <a href="https://www.nytimes.com/games/wordle/index.html" rel="nofollow">Wordle</a> using the <a href="https://meta-pytorch.org/OpenEnv/environments/textarena.html" rel="nofollow"><code>TextArena</code></a> environment. This demonstrates multi-turn interaction, cumulative feedback handling, and episode termination via exceptions.',Rt,O,Bs='<p>You can explore the notebook version of this example in <a href="https://github.com/huggingface/trl/blob/main/examples/notebooks/openenv_wordle_grpo.ipynb" rel="nofollow">the OpenEnv Wordle GRPO example</a>.</p>',Nt,Qe,Qt,Xe,Zs='<a href="https://huggingface.co/papers/2504.11442" rel="nofollow">TextArena</a> is an open-source collection of competitive text-based games designed to evaluate reasoning skills in LLMs using textual games like Wordle, Snake, Tic-Tac-Toe, and more.',Xt,He,_s='<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/text_arena_evals.png" alt="image of TextArena"/>',Ht,Ye,Yt,Le,ks="Wordle is a good benchmark for environment-based RL because it requires reasoning about feedback, is purely text-based, and models from 1B parameters can improve at it. Each guess is only 8 tokens, making it lightweight to experiment with.",Lt,Q,ql,Es=`How does Wordle work? | |
| Wordle is a word guessing game where the player has to guess a 5-letter word in 6 attempts. After each guess, the environment provides letter-by-letter feedback:`,ns,ze,ss,Fl,Gs="X = not in the word, G = correct position (green), Y = wrong position (yellow). Here, “U” is correct and in place, “E” is in the word but misplaced.",zt,Se,St,qe,Vs="The <code>WordleEnv</code> class wraps the TextArena client and exposes <code>guess()</code> as the tool:",qt,Fe,Ft,Pe,Ws="Key design choices:",Pt,Oe,xs="<li><strong><code>reset()</code></strong> returns the initial game message as the first observation the model sees.</li> <li><strong><code>guess()</code></strong> is the only tool. The model calls it each turn with a 5-letter word.</li> <li><strong>Cumulative feedback slicing</strong>: TextArena returns the full game history each turn. We slice out only the new part to avoid repeating context.</li> <li><strong>Exception on done</strong>: If the model tries to guess after the game ends, <code>guess()</code> raises a <code>ValueError</code>. The trainer catches this and feeds <code>"Game over."</code> back to the model as a tool response. The model learns to stop calling tools after this signal.</li>",Ot,Ke,Kt,De,Dt,el,Rs="The environment returns <code>1.0</code> if the model wins and <code>0.0</code> otherwise.",en,ll,ln,K,tn,tl,nn,nl,Ns="The model improves its performance by reducing repetitions and increasing correct guesses. However, Qwen3-1.7B with <code>enable_thinking=False</code> is not able to consistently win the game.",sn,X,Qs,an,D,Xs="<p>With <code>enable_thinking=False</code> (the default in these examples), small models like Qwen3-1.7B can learn to improve their guesses but should not be expected to consistently solve the game. For significantly better results, use larger models or enable thinking mode (<code>enable_thinking=True</code>), which allows the model to reason before making a guess at the cost of longer completions.</p>",on,sl,Hs='We experimented with larger models like <a href="https://huggingface.co/openai/gpt-oss-20b" rel="nofollow"><code>gpt-oss-20b</code></a> and found that it was able to consistently win the game, though this requires significantly more compute.',rn,al,Mn,ol,Ys="You can train a single model across multiple environments simultaneously. This is useful when you want a model to learn different skills in parallel. For example, playing Wordle (language reasoning) and Catch (spatial reasoning) in the same training run.",pn,il,Ls="The key idea is to create a <strong>meta-environment class</strong> that wraps multiple environments and routes each sample to the correct one using a dataset column.",cn,rl,wn,Ml,zs="<li>Add an <code>"env"</code> column (or similar) to your dataset that identifies which environment each sample belongs to.</li> <li>In <code>reset(**kwargs)</code>, read <code>kwargs["env"]</code> to select the active environment for that episode.</li> <li>Expose tools from all environments; the trainer discovers all public methods.</li> <li>Use separate reward functions per environment, returning <code>None</code> for samples that don’t belong to that environment. TRL handles <code>None</code> values with <code>nansum</code>/<code>nanmean</code>.</li>",mn,pl,Tn,cl,Ss='The <a href="https://github.com/huggingface/trl/blob/main/examples/scripts/openenv/multi_env.py" rel="nofollow">multi_env.py</a> script trains on Wordle and Catch simultaneously:',un,wl,Jn,ml,qs="Key patterns:",dn,Tl,Fs="<li><strong>Lazy client initialization</strong>: Create clients in <code>reset()</code>, not <code>__init__()</code>, to avoid unnecessary WebSocket connections.</li> <li><strong>Close before reopen</strong>: Close the previous client before creating a new one to avoid server capacity errors.</li> <li><strong><code>kwargs</code> routing</strong>: The <code>"env"</code> column from the dataset is passed to <code>reset()</code> as a keyword argument.</li> <li><strong>All tools are exposed simultaneously</strong>: The model sees <code>guess</code>, <code>move</code>, and <code>stay</code> as available tools regardless of the active environment. If it calls the wrong tool (e.g., <code>move</code> during Wordle), the method raises a <code>ValueError</code> that the trainer catches gracefully. In practice, models learn to use the correct tools based on the system prompt.</li>",yn,ul,jn,Jl,Ps="Each reward function returns <code>None</code> for samples from other environments:",hn,dl,fn,yl,Os="TRL converts <code>None</code> to <code>nan</code> internally and uses <code>nansum</code>/<code>nanmean</code> for aggregation, so each sample is only scored by its relevant reward function.",Un,jl,In,hl,vn,fl,bn,Ul,Cn,ee,Ks="<p>When training across multiple environments, monitor the per-reward-function metrics (<code>train/reward_func_0</code>, <code>train/reward_func_1</code>, etc.) rather than the combined <code>train/reward</code>. The combined metric alternates between environments and can appear noisy.</p>",gn,Il,An,vl,Ds="When using <code>environment_factory</code>, the trainer connects to the environment server automatically. You just need the server to be running. There are three ways to run an OpenEnv environment server:",$n,le,Bn,bl,Zn,Cl,ea='The best way to explore the current catalog of maintained environments is by visiting the official OpenEnv <a href="https://huggingface.co/collections/openenv/environment-hub" rel="nofollow">catalog</a>.',_n,gl,la='To create your own environment, check out the guide on <a href="https://meta-pytorch.org/OpenEnv/auto_getting_started/plot_03_building_environments.html" rel="nofollow">Building Your Own Environment with OpenEnv</a>. Environments are tightly integrated with the Hub, so you can push new environments for the community to reuse.',kn,Al,En,$l,ta="When using <code>environment_factory</code>, the trainer creates N environment instances (one per generation), each opening a WebSocket connection to the server. By default, OpenEnv servers allow only 1 concurrent session, which will cause failures during training.",Gn,Bl,na="To support parallel training, configure the server for concurrency:",Vn,Zl,sa="<li>In your environment file, declare concurrent session support:</li>",Wn,_l,xn,te,aa="<li>In your server app, set the concurrency limit:</li>",Rn,kl,Nn,ne,oa="<p><code>max_concurrent_envs</code> should be ≥ <code>generation_batch_size</code> (which defaults to <code>per_device_train_batch_size × gradient_accumulation_steps</code>). For example, with <code>gradient_accumulation_steps=64</code> and batch size 1, you need at least 64 concurrent sessions.</p>",Qn,El,Xn,Gl,ia='<a href="/docs/trl/pr_5607/en/gspo_token#trl.GRPOTrainer">GRPOTrainer</a> supports two approaches for environment-based training:',Hn,Vl,ra="<li><strong><code>environment_factory</code></strong> (recommended): You define an environment class with tool methods, and the trainer handles generation, tool-call parsing, and the multi-turn loop automatically. This is the approach used throughout this guide.</li> <li><strong><code>rollout_func</code></strong>: You write the entire generation and environment interaction loop yourself. This gives full control over how completions are produced, how tools are executed, and how rewards are computed.</li>",Yn,Wl,Ma='Use <code>rollout_func</code> when <code>environment_factory</code> doesn’t fit your use case. For example, <strong>external agent servers</strong> like <a href="nemo_gym">NeMo-Gym</a>, where an external server owns the generation loop and manages its own agent-environment interaction protocol.',Ln,xl,zn,Rl,pa="If you have existing <code>rollout_func</code> code and want to migrate, here’s the mapping:",Sn,Nl,ca="<thead><tr><th><code>rollout_func</code> pattern</th> <th><code>environment_factory</code> equivalent</th></tr></thead> <tbody><tr><td>Manual generation loop</td> <td>Handled automatically by the trainer</td></tr> <tr><td><code>generate_rollout_completions()</code></td> <td>Not needed, trainer generates internally</td></tr> <tr><td><code>env.step(Action(...))</code> in rollout</td> <td>Wrap in a tool method on the environment class</td></tr> <tr><td>Reward via <code>kwargs["env_reward"]</code></td> <td>Reward via <code>environments</code> parameter</td></tr> <tr><td><code>env_mask</code> construction</td> <td>Automatic, trainer builds <code>tool_mask</code></td></tr> <tr><td>Token concatenation</td> <td>Automatic, trainer manages token sequences</td></tr></tbody>",qn,Ql,wa="<strong>Before</strong> (<code>rollout_func</code>):",Fn,Xl,Pn,Hl,ma="<strong>After</strong> (<code>environment_factory</code>):",On,Yl,Kn,Ll,Dn,Ol,es;return d=new fa({props:{containerStyle:"float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"}}),y=new A({props:{title:"OpenEnv Integration for Training LLMs with Environments",local:"openenv-integration-for-training-llms-with-environments",headingTag:"h1"}}),x=new A({props:{title:"When to use environments",local:"when-to-use-environments",headingTag:"h2"}}),Z=new A({props:{title:"Installation",local:"installation",headingTag:"h2"}}),re=new $({props:{code:"JTIzJTIwRWNobyUyMGVudmlyb25tZW50JTBBcGlwJTIwaW5zdGFsbCUyMCUyMm9wZW5lbnYtZWNoby1lbnYlMjAlNDAlMjBnaXQlMkJodHRwcyUzQSUyRiUyRmh1Z2dpbmdmYWNlLmNvJTJGc3BhY2VzJTJGb3BlbmVudiUyRmVjaG9fZW52JTIyJTBBJTBBJTIzJTIwV29yZGxlJTIwKFRleHRBcmVuYSklMjBlbnZpcm9ubWVudCUwQXBpcCUyMGluc3RhbGwlMjAlMjJvcGVuZW52LXRleHRhcmVuYSUyMCU0MCUyMGdpdCUyQmh0dHBzJTNBJTJGJTJGaHVnZ2luZ2ZhY2UuY28lMkZzcGFjZXMlMkZvcGVuZW52JTJGd29yZGxlJTIyJTBBJTBBJTIzJTIwQ2F0Y2glMjAoT3BlblNwaWVsKSUyMGVudmlyb25tZW50JTBBcGlwJTIwaW5zdGFsbCUyMCUyMm9wZW5lbnYtb3BlbnNwaWVsLWVudiUyMCU0MCUyMGdpdCUyQmh0dHBzJTNBJTJGJTJGaHVnZ2luZ2ZhY2UuY28lMkZzcGFjZXMlMkZvcGVuZW52JTJGb3BlbnNwaWVsX2VudiUyMg==",highlighted:`<span class="hljs-comment"># Echo environment</span> | |
| pip install <span class="hljs-string">"openenv-echo-env @ git+https://huggingface.co/spaces/openenv/echo_env"</span> | |
| <span class="hljs-comment"># Wordle (TextArena) environment</span> | |
| pip install <span class="hljs-string">"openenv-textarena @ git+https://huggingface.co/spaces/openenv/wordle"</span> | |
| <span class="hljs-comment"># Catch (OpenSpiel) environment</span> | |
| pip install <span class="hljs-string">"openenv-openspiel-env @ git+https://huggingface.co/spaces/openenv/openspiel_env"</span>`,wrap:!1}}),ce=new $({props:{code:"Z2l0JTIwY2xvbmUlMjBodHRwcyUzQSUyRiUyRmdpdGh1Yi5jb20lMkZtZXRhLXB5dG9yY2glMkZPcGVuRW52LmdpdCUwQWNkJTIwT3BlbkVudiUyRmVudnMlMkZlY2hvX2VudiUwQXBpcCUyMGluc3RhbGwlMjAtZSUyMC4=",highlighted:`git <span class="hljs-built_in">clone</span> https://github.com/meta-pytorch/OpenEnv.git | |
| <span class="hljs-built_in">cd</span> OpenEnv/envs/echo_env | |
| pip install -e .`,wrap:!1}}),we=new $({props:{code:"dXYlMjBydW4lMjBleGFtcGxlcyUyRnNjcmlwdHMlMkZvcGVuZW52JTJGZWNoby5weQ==",highlighted:"uv run examples/scripts/openenv/echo.py",wrap:!1}}),me=new A({props:{title:"Quick start",local:"quick-start",headingTag:"h2"}}),ue=new $({props:{code:"ZnJvbSUyMGRhdGFzZXRzJTIwaW1wb3J0JTIwRGF0YXNldCUwQWZyb20lMjBlY2hvX2VudiUyMGltcG9ydCUyMEVjaG9FbnYlMEFmcm9tJTIwZWNob19lbnYubW9kZWxzJTIwaW1wb3J0JTIwRWNob0FjdGlvbiUwQSUwQWZyb20lMjB0cmwlMjBpbXBvcnQlMjBHUlBPQ29uZmlnJTJDJTIwR1JQT1RyYWluZXIlMEElMEFFTlZfVVJMJTIwJTNEJTIwJTIyaHR0cHMlM0ElMkYlMkZvcGVuZW52LWVjaG8tZW52LmhmLnNwYWNlJTIyJTBBJTBBY2xhc3MlMjBFY2hvVG9vbEVudiUzQSUwQSUyMCUyMCUyMCUyMGRlZiUyMF9faW5pdF9fKHNlbGYpJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwc2VsZi5lbnYlMjAlM0QlMjBFY2hvRW52KGJhc2VfdXJsJTNERU5WX1VSTCklMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBzZWxmLnJld2FyZCUyMCUzRCUyMDAuMCUwQSUwQSUyMCUyMCUyMCUyMGRlZiUyMHJlc2V0KHNlbGYlMkMlMjAqKmt3YXJncyklMjAtJTNFJTIwc3RyJTIwJTdDJTIwTm9uZSUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHNlbGYucmV3YXJkJTIwJTNEJTIwMC4wJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwcmV0dXJuJTIwTm9uZSUwQSUwQSUyMCUyMCUyMCUyMGRlZiUyMGVjaG8oc2VsZiUyQyUyMG1lc3NhZ2UlM0ElMjBzdHIpJTIwLSUzRSUyMHN0ciUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMiUyMiUyMiUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMEVjaG8lMjB0aGUlMjBtZXNzYWdlJTIwYmFjayUyMGZyb20lMjB0aGUlMjBlbnZpcm9ubWVudC4lMEElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBBcmdzJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwbWVzc2FnZSUzQSUyMFRoZSUyMG1lc3NhZ2UlMjB0byUyMGVjaG8lMEElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBSZXR1cm5zJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwVGhlJTIwZWNob2VkJTIwbWVzc2FnZS4lMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjIlMjIlMjIlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBvYnNlcnZhdGlvbiUyMCUzRCUyMHNlbGYuZW52LnN0ZXAoRWNob0FjdGlvbihtZXNzYWdlJTNEbWVzc2FnZSkpJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwc2VsZi5yZXdhcmQlMjAlM0QlMjBvYnNlcnZhdGlvbi5vYnNlcnZhdGlvbi5yZXdhcmQlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjByZXR1cm4lMjBvYnNlcnZhdGlvbi5vYnNlcnZhdGlvbi5lY2hvZWRfbWVzc2FnZSUwQSUwQWRlZiUyMHJld2FyZF9mdW5jKGVudmlyb25tZW50cyUyQyUyMCoqa3dhcmdzKSUzQSUwQSUyMCUyMCUyMCUyMHJldHVybiUyMCU1QmVudi5yZXdhcmQlMjBmb3IlMjBlbnYlMjBpbiUyMGVudmlyb25tZW50cyU1RCUwQSUwQWRhdGFzZXQlMjAlM0QlMjBEYXRhc2V0LmZyb21fZGljdCglMEElMjAlMjAlMjAlMjAlN0IlMjJwcm9tcHQlMjIlM0ElMjAlNUIlNUIlN0IlMjJyb2xlJTIyJTNBJTIwJTIydXNlciUyMiUyQyUyMCUyMmNvbnRlbnQlMjIlM0ElMjAlMjJUcnklMjB0byUyMGVjaG8lMjAnSGVsbG8lMjBXb3JsZCEnJTIwaW4lMjB0aGUlMjBlbnZpcm9ubWVudC4lMjIlN0QlNUQlNUQlMjAqJTIwNjQlN0QlMEEpJTBBJTBBdHJhaW5lciUyMCUzRCUyMEdSUE9UcmFpbmVyKCUwQSUyMCUyMCUyMCUyMG1vZGVsJTNEJTIyUXdlbiUyRlF3ZW4zLTAuNkIlMjIlMkMlMEElMjAlMjAlMjAlMjB0cmFpbl9kYXRhc2V0JTNEZGF0YXNldCUyQyUwQSUyMCUyMCUyMCUyMHJld2FyZF9mdW5jcyUzRHJld2FyZF9mdW5jJTJDJTBBJTIwJTIwJTIwJTIwYXJncyUzREdSUE9Db25maWcoJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwY2hhdF90ZW1wbGF0ZV9rd2FyZ3MlM0QlN0IlMjJlbmFibGVfdGhpbmtpbmclMjIlM0ElMjBGYWxzZSU3RCUyQyUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGxvZ19jb21wbGV0aW9ucyUzRFRydWUlMkMlMEElMjAlMjAlMjAlMjApJTJDJTBBJTIwJTIwJTIwJTIwZW52aXJvbm1lbnRfZmFjdG9yeSUzREVjaG9Ub29sRW52JTJDJTBBKSUwQXRyYWluZXIudHJhaW4oKQ==",highlighted:`<span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> Dataset | |
| <span class="hljs-keyword">from</span> echo_env <span class="hljs-keyword">import</span> EchoEnv | |
| <span class="hljs-keyword">from</span> echo_env.models <span class="hljs-keyword">import</span> EchoAction | |
| <span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> GRPOConfig, GRPOTrainer | |
| ENV_URL = <span class="hljs-string">"https://openenv-echo-env.hf.space"</span> | |
| <span class="hljs-keyword">class</span> <span class="hljs-title class_">EchoToolEnv</span>: | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">__init__</span>(<span class="hljs-params">self</span>): | |
| self.env = EchoEnv(base_url=ENV_URL) | |
| self.reward = <span class="hljs-number">0.0</span> | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">reset</span>(<span class="hljs-params">self, **kwargs</span>) -> <span class="hljs-built_in">str</span> | <span class="hljs-literal">None</span>: | |
| self.reward = <span class="hljs-number">0.0</span> | |
| <span class="hljs-keyword">return</span> <span class="hljs-literal">None</span> | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">echo</span>(<span class="hljs-params">self, message: <span class="hljs-built_in">str</span></span>) -> <span class="hljs-built_in">str</span>: | |
| <span class="hljs-string">""" | |
| Echo the message back from the environment. | |
| Args: | |
| message: The message to echo | |
| Returns: | |
| The echoed message. | |
| """</span> | |
| observation = self.env.step(EchoAction(message=message)) | |
| self.reward = observation.observation.reward | |
| <span class="hljs-keyword">return</span> observation.observation.echoed_message | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">reward_func</span>(<span class="hljs-params">environments, **kwargs</span>): | |
| <span class="hljs-keyword">return</span> [env.reward <span class="hljs-keyword">for</span> env <span class="hljs-keyword">in</span> environments] | |
| dataset = Dataset.from_dict( | |
| {<span class="hljs-string">"prompt"</span>: [[{<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Try to echo 'Hello World!' in the environment."</span>}]] * <span class="hljs-number">64</span>} | |
| ) | |
| trainer = GRPOTrainer( | |
| model=<span class="hljs-string">"Qwen/Qwen3-0.6B"</span>, | |
| train_dataset=dataset, | |
| reward_funcs=reward_func, | |
| args=GRPOConfig( | |
| chat_template_kwargs={<span class="hljs-string">"enable_thinking"</span>: <span class="hljs-literal">False</span>}, | |
| log_completions=<span class="hljs-literal">True</span>, | |
| ), | |
| environment_factory=EchoToolEnv, | |
| ) | |
| trainer.train()`,wrap:!1}}),ye=new $({props:{code:"JTIzJTIwUnVuJTIwdGhlJTIwZXhhbXBsZSUwQXB5dGhvbiUyMGV4YW1wbGVzJTJGc2NyaXB0cyUyRm9wZW5lbnYlMkZlY2hvLnB5JTBBJTBBJTIzJTIwQ3VzdG9taXplJTIwbW9kZWwlMjBhbmQlMjBlbnZpcm9ubWVudCUyMFVSTCUwQXB5dGhvbiUyMGV4YW1wbGVzJTJGc2NyaXB0cyUyRm9wZW5lbnYlMkZlY2hvLnB5JTIwLS1tb2RlbCUyMFF3ZW4lMkZRd2VuMy0wLjZCJTIwLS1lbnYtaG9zdCUyMGh0dHBzJTNBJTJGJTJGb3BlbmVudi1lY2hvLWVudi5oZi5zcGFjZQ==",highlighted:`<span class="hljs-comment"># Run the example</span> | |
| python examples/scripts/openenv/echo.py | |
| <span class="hljs-comment"># Customize model and environment URL</span> | |
| python examples/scripts/openenv/echo.py --model Qwen/Qwen3-0.6B --env-host https://openenv-echo-env.hf.space`,wrap:!1}}),he=new A({props:{title:"How environment_factory works",local:"how-environmentfactory-works",headingTag:"h2"}}),Ue=new A({props:{title:"Environment class requirements",local:"environment-class-requirements",headingTag:"h3"}}),be=new A({props:{title:"Tips for environment classes",local:"tips-for-environment-classes",headingTag:"h3"}}),ge=new $({props:{code:"RU5WX1VSTCUyMCUzRCUyMCUyMmh0dHBzJTNBJTJGJTJGbXktZW52LmhmLnNwYWNlJTIyJTBBJTBBY2xhc3MlMjBNeUVudiUzQSUwQSUyMCUyMCUyMCUyMGRlZiUyMF9faW5pdF9fKHNlbGYpJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwc2VsZi5jbGllbnQlMjAlM0QlMjBNeUNsaWVudChiYXNlX3VybCUzREVOVl9VUkwpJTIwJTIwJTIzJTIwY2FwdHVyZWQlMjBmcm9tJTIwZW5jbG9zaW5nJTIwc2NvcGUlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBzZWxmLnJld2FyZCUyMCUzRCUyMDAuMCUwQSUwQSUyMCUyMCUyMCUyMGRlZiUyMHJlc2V0KHNlbGYlMkMlMjAqKmt3YXJncyklMjAtJTNFJTIwc3RyJTIwJTdDJTIwTm9uZSUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHNlbGYucmV3YXJkJTIwJTNEJTIwMC4wJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwcmV0dXJuJTIwJTIySW5pdGlhbCUyMG9ic2VydmF0aW9uJTIwZm9yJTIwdGhlJTIwbW9kZWwlMjIlMEElMEElMjAlMjAlMjAlMjBkZWYlMjBteV90b29sKHNlbGYlMkMlMjBhcmcxJTNBJTIwc3RyJTJDJTIwYXJnMiUzQSUyMGludCklMjAtJTNFJTIwc3RyJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIyJTIyJTIyJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwRGVzY3JpcHRpb24lMjBvZiUyMHdoYXQlMjB0aGlzJTIwdG9vbCUyMGRvZXMuJTBBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwQXJncyUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGFyZzElM0ElMjBEZXNjcmlwdGlvbiUyMG9mJTIwYXJnMSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGFyZzIlM0ElMjBEZXNjcmlwdGlvbiUyMG9mJTIwYXJnMiUwQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMFJldHVybnMlM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBUaGUlMjByZXN1bHQlMjBtZXNzYWdlLiUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMiUyMiUyMiUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHNlbGYucmV3YXJkJTIwJTNEJTIwMS4wJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwcmV0dXJuJTIwJTIyVG9vbCUyMHJlc3VsdCUyMg==",highlighted:`ENV_URL = <span class="hljs-string">"https://my-env.hf.space"</span> | |
| <span class="hljs-keyword">class</span> <span class="hljs-title class_">MyEnv</span>: | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">__init__</span>(<span class="hljs-params">self</span>): | |
| self.client = MyClient(base_url=ENV_URL) <span class="hljs-comment"># captured from enclosing scope</span> | |
| self.reward = <span class="hljs-number">0.0</span> | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">reset</span>(<span class="hljs-params">self, **kwargs</span>) -> <span class="hljs-built_in">str</span> | <span class="hljs-literal">None</span>: | |
| self.reward = <span class="hljs-number">0.0</span> | |
| <span class="hljs-keyword">return</span> <span class="hljs-string">"Initial observation for the model"</span> | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">my_tool</span>(<span class="hljs-params">self, arg1: <span class="hljs-built_in">str</span>, arg2: <span class="hljs-built_in">int</span></span>) -> <span class="hljs-built_in">str</span>: | |
| <span class="hljs-string">""" | |
| Description of what this tool does. | |
| Args: | |
| arg1: Description of arg1 | |
| arg2: Description of arg2 | |
| Returns: | |
| The result message. | |
| """</span> | |
| self.reward = <span class="hljs-number">1.0</span> | |
| <span class="hljs-keyword">return</span> <span class="hljs-string">"Tool result"</span>`,wrap:!1}}),Ae=new A({props:{title:"Reward functions",local:"reward-functions",headingTag:"h3"}}),Be=new $({props:{code:"ZGVmJTIwcmV3YXJkX2Z1bmMoZW52aXJvbm1lbnRzJTJDJTIwKiprd2FyZ3MpJTIwLSUzRSUyMGxpc3QlNUJmbG9hdCU1RCUzQSUwQSUyMCUyMCUyMCUyMHJldHVybiUyMCU1QmVudi5yZXdhcmQlMjBmb3IlMjBlbnYlMjBpbiUyMGVudmlyb25tZW50cyU1RA==",highlighted:`<span class="hljs-keyword">def</span> <span class="hljs-title function_">reward_func</span>(<span class="hljs-params">environments, **kwargs</span>) -> <span class="hljs-built_in">list</span>[<span class="hljs-built_in">float</span>]: | |
| <span class="hljs-keyword">return</span> [env.reward <span class="hljs-keyword">for</span> env <span class="hljs-keyword">in</span> environments]`,wrap:!1}}),_e=new A({props:{title:"Tips for reward functions",local:"tips-for-reward-functions",headingTag:"h3"}}),Ge=new A({props:{title:"max_completion_length in multi-turn episodes",local:"maxcompletionlength-in-multi-turn-episodes",headingTag:"h3"}}),We=new $({props:{code:"YXJncyUyMCUzRCUyMEdSUE9Db25maWcoJTBBJTIwJTIwJTIwJTIwbWF4X2NvbXBsZXRpb25fbGVuZ3RoJTNENDA5NiUyQyUyMCUyMCUyMyUyMGRlZmF1bHQlMjBpcyUyMHVzdWFsbHklMjAyNTYtMTAyNCUyQyUyMGluY3JlYXNlJTIwZm9yJTIwbG9uZyUyMGVwaXNvZGVzJTBBJTIwJTIwJTIwJTIwJTIzJTIwLi4uJTBBKQ==",highlighted:`args = GRPOConfig( | |
| max_completion_length=<span class="hljs-number">4096</span>, <span class="hljs-comment"># default is usually 256-1024, increase for long episodes</span> | |
| <span class="hljs-comment"># ...</span> | |
| )`,wrap:!1}}),Re=new A({props:{title:"Advanced example: Wordle",local:"advanced-example-wordle",headingTag:"h2"}}),Qe=new A({props:{title:"The TextArena Environment",local:"the-textarena-environment",headingTag:"h3"}}),Ye=new A({props:{title:"Why Wordle?",local:"why-wordle",headingTag:"h3"}}),ze=new $({props:{code:"RyUyMFUlMjBFJTIwUyUyMFMlMEFYJTIwRyUyMFklMjBYJTIwWA==",highlighted:`G U E S S | |
| <span class="hljs-keyword">X</span> G <span class="hljs-keyword">Y</span> <span class="hljs-keyword">X</span> <span class="hljs-keyword">X</span>`,wrap:!1}}),Se=new A({props:{title:"Environment class",local:"environment-class",headingTag:"h3"}}),Fe=new $({props:{code:"ZnJvbSUyMHRleHRhcmVuYV9lbnYlMjBpbXBvcnQlMjBUZXh0QXJlbmFBY3Rpb24lMkMlMjBUZXh0QXJlbmFFbnYlMEElMEFjbGFzcyUyMFdvcmRsZUVudiUzQSUwQSUyMCUyMCUyMCUyMGRlZiUyMF9faW5pdF9fKHNlbGYpJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwc2VsZi5jbGllbnQlMjAlM0QlMjBUZXh0QXJlbmFFbnYoYmFzZV91cmwlM0QlMjJodHRwcyUzQSUyRiUyRm9wZW5lbnYtd29yZGxlLmhmLnNwYWNlJTIyKSUwQSUwQSUyMCUyMCUyMCUyMGRlZiUyMHJlc2V0KHNlbGYlMkMlMjAqKmt3YXJncyklMjAtJTNFJTIwc3RyJTIwJTdDJTIwTm9uZSUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHJlc3VsdCUyMCUzRCUyMHNlbGYuY2xpZW50LnJlc2V0KCklMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBzZWxmLl9sYXN0X2Z1bGxfZmVlZGJhY2slMjAlM0QlMjByZXN1bHQub2JzZXJ2YXRpb24ubWVzc2FnZXMlNUIwJTVELmNvbnRlbnQlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBzZWxmLnJld2FyZCUyMCUzRCUyMDAuMCUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHNlbGYuZG9uZSUyMCUzRCUyMEZhbHNlJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwcmV0dXJuJTIwc2VsZi5fbGFzdF9mdWxsX2ZlZWRiYWNrJTBBJTBBJTIwJTIwJTIwJTIwZGVmJTIwZ3Vlc3Moc2VsZiUyQyUyMGd1ZXNzJTNBJTIwc3RyKSUyMC0lM0UlMjBzdHIlM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjIlMjIlMjIlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBNYWtlJTIwYSUyMGd1ZXNzJTIwaW4lMjB0aGUlMjBXb3JkbGUlMjBlbnZpcm9ubWVudC4lMEElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBBcmdzJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwZ3Vlc3MlM0ElMjBUaGUlMjBndWVzc2VkJTIwd29yZCUyQyUyMGZvcm1hdHRlZCUyMGFzJTIwJyU1QmFiY2RlJTVEJyUwQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMFJldHVybnMlM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBUaGUlMjBmZWVkYmFjayUyMG1lc3NhZ2UlMjBmcm9tJTIwdGhlJTIwZW52aXJvbm1lbnQuJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIyJTIyJTIyJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwaWYlMjBzZWxmLmRvbmUlM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjByYWlzZSUyMFZhbHVlRXJyb3IoJTIyR2FtZSUyMG92ZXIuJTIyKSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHJlc3VsdCUyMCUzRCUyMHNlbGYuY2xpZW50LnN0ZXAoVGV4dEFyZW5hQWN0aW9uKG1lc3NhZ2UlM0RndWVzcykpJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwX2Z1bGxfZmVlZGJhY2slMjAlM0QlMjByZXN1bHQub2JzZXJ2YXRpb24ubWVzc2FnZXMlNUIwJTVELmNvbnRlbnQlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBmZWVkYmFjayUyMCUzRCUyMF9mdWxsX2ZlZWRiYWNrJTVCbGVuKHNlbGYuX2xhc3RfZnVsbF9mZWVkYmFjayklM0ElNUQlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBzZWxmLl9sYXN0X2Z1bGxfZmVlZGJhY2slMjAlM0QlMjBfZnVsbF9mZWVkYmFjayUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGlmJTIwJTIyWW91JTIwYXR0ZW1wdGVkJTIwYW4lMjBpbnZhbGlkJTIwbW92ZSUyMiUyMGluJTIwZmVlZGJhY2slM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBzZWxmLnJld2FyZCUyMCUzRCUyMDAuMCUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGVsc2UlM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBzZWxmLnJld2FyZCUyMCUzRCUyMHJlc3VsdC5yZXdhcmQlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBzZWxmLmRvbmUlMjAlM0QlMjByZXN1bHQuZG9uZSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHJldHVybiUyMGZlZWRiYWNr",highlighted:`<span class="hljs-keyword">from</span> textarena_env <span class="hljs-keyword">import</span> TextArenaAction, TextArenaEnv | |
| <span class="hljs-keyword">class</span> <span class="hljs-title class_">WordleEnv</span>: | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">__init__</span>(<span class="hljs-params">self</span>): | |
| self.client = TextArenaEnv(base_url=<span class="hljs-string">"https://openenv-wordle.hf.space"</span>) | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">reset</span>(<span class="hljs-params">self, **kwargs</span>) -> <span class="hljs-built_in">str</span> | <span class="hljs-literal">None</span>: | |
| result = self.client.reset() | |
| self._last_full_feedback = result.observation.messages[<span class="hljs-number">0</span>].content | |
| self.reward = <span class="hljs-number">0.0</span> | |
| self.done = <span class="hljs-literal">False</span> | |
| <span class="hljs-keyword">return</span> self._last_full_feedback | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">guess</span>(<span class="hljs-params">self, guess: <span class="hljs-built_in">str</span></span>) -> <span class="hljs-built_in">str</span>: | |
| <span class="hljs-string">""" | |
| Make a guess in the Wordle environment. | |
| Args: | |
| guess: The guessed word, formatted as '[abcde]' | |
| Returns: | |
| The feedback message from the environment. | |
| """</span> | |
| <span class="hljs-keyword">if</span> self.done: | |
| <span class="hljs-keyword">raise</span> ValueError(<span class="hljs-string">"Game over."</span>) | |
| result = self.client.step(TextArenaAction(message=guess)) | |
| _full_feedback = result.observation.messages[<span class="hljs-number">0</span>].content | |
| feedback = _full_feedback[<span class="hljs-built_in">len</span>(self._last_full_feedback):] | |
| self._last_full_feedback = _full_feedback | |
| <span class="hljs-keyword">if</span> <span class="hljs-string">"You attempted an invalid move"</span> <span class="hljs-keyword">in</span> feedback: | |
| self.reward = <span class="hljs-number">0.0</span> | |
| <span class="hljs-keyword">else</span>: | |
| self.reward = result.reward | |
| self.done = result.done | |
| <span class="hljs-keyword">return</span> feedback`,wrap:!1}}),Ke=new A({props:{title:"Reward function and training",local:"reward-function-and-training",headingTag:"h3"}}),De=new $({props:{code:"ZnJvbSUyMGRhdGFzZXRzJTIwaW1wb3J0JTIwRGF0YXNldCUwQWZyb20lMjB0cmwlMjBpbXBvcnQlMjBHUlBPQ29uZmlnJTJDJTIwR1JQT1RyYWluZXIlMEElMEFkZWYlMjByZXdhcmRfZnVuYyhlbnZpcm9ubWVudHMlMkMlMjAqKmt3YXJncyklMjAtJTNFJTIwbGlzdCU1QmZsb2F0JTVEJTNBJTBBJTIwJTIwJTIwJTIwcmV0dXJuJTIwJTVCZW52LnJld2FyZCUyMGZvciUyMGVudiUyMGluJTIwZW52aXJvbm1lbnRzJTVEJTBBJTBBcHJvbXB0JTIwJTNEJTIwJTIyJTIyJTIyWW91JTIwYXJlJTIwYW4lMjBleHBlcnQlMjBXb3JkbGUlMjBzb2x2ZXIlMjB3aXRoJTIwZGVlcCUyMGtub3dsZWRnZSUyMG9mJTIwRW5nbGlzaCUyMHZvY2FidWxhcnkuLi4lMEFVc2UlMjB0aGUlMjB0b29sJTIwJTYwZ3Vlc3MlNjAlMjB0byUyMG1ha2UlMjBhJTIwZ3Vlc3MuJTIyJTIyJTIyJTBBJTBBZGF0YXNldCUyMCUzRCUyMERhdGFzZXQuZnJvbV9kaWN0KCU3QiUyMnByb21wdCUyMiUzQSUyMCU1QiU1QiU3QiUyMnJvbGUlMjIlM0ElMjAlMjJ1c2VyJTIyJTJDJTIwJTIyY29udGVudCUyMiUzQSUyMHByb21wdCU3RCU1RCU1RCUyMColMjAxMDAwJTdEKSUwQSUwQXRyYWluZXIlMjAlM0QlMjBHUlBPVHJhaW5lciglMEElMjAlMjAlMjAlMjBtb2RlbCUzRCUyMlF3ZW4lMkZRd2VuMy0xLjdCJTIyJTJDJTBBJTIwJTIwJTIwJTIwcmV3YXJkX2Z1bmNzJTNEcmV3YXJkX2Z1bmMlMkMlMEElMjAlMjAlMjAlMjB0cmFpbl9kYXRhc2V0JTNEZGF0YXNldCUyQyUwQSUyMCUyMCUyMCUyMGFyZ3MlM0RHUlBPQ29uZmlnKCUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHVzZV92bGxtJTNEVHJ1ZSUyQyUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHZsbG1fbW9kZSUzRCUyMmNvbG9jYXRlJTIyJTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwY2hhdF90ZW1wbGF0ZV9rd2FyZ3MlM0QlN0IlMjJlbmFibGVfdGhpbmtpbmclMjIlM0ElMjBGYWxzZSU3RCUyQyUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMG1heF9jb21wbGV0aW9uX2xlbmd0aCUzRDEwMjQlMkMlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBudW1fZ2VuZXJhdGlvbnMlM0Q0JTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwZ3JhZGllbnRfYWNjdW11bGF0aW9uX3N0ZXBzJTNENjQlMkMlMEElMjAlMjAlMjAlMjApJTJDJTBBJTIwJTIwJTIwJTIwZW52aXJvbm1lbnRfZmFjdG9yeSUzRFdvcmRsZUVudiUyQyUwQSklMEF0cmFpbmVyLnRyYWluKCk=",highlighted:`<span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> Dataset | |
| <span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> GRPOConfig, GRPOTrainer | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">reward_func</span>(<span class="hljs-params">environments, **kwargs</span>) -> <span class="hljs-built_in">list</span>[<span class="hljs-built_in">float</span>]: | |
| <span class="hljs-keyword">return</span> [env.reward <span class="hljs-keyword">for</span> env <span class="hljs-keyword">in</span> environments] | |
| prompt = <span class="hljs-string">"""You are an expert Wordle solver with deep knowledge of English vocabulary... | |
| Use the tool \`guess\` to make a guess."""</span> | |
| dataset = Dataset.from_dict({<span class="hljs-string">"prompt"</span>: [[{<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: prompt}]] * <span class="hljs-number">1000</span>}) | |
| trainer = GRPOTrainer( | |
| model=<span class="hljs-string">"Qwen/Qwen3-1.7B"</span>, | |
| reward_funcs=reward_func, | |
| train_dataset=dataset, | |
| args=GRPOConfig( | |
| use_vllm=<span class="hljs-literal">True</span>, | |
| vllm_mode=<span class="hljs-string">"colocate"</span>, | |
| chat_template_kwargs={<span class="hljs-string">"enable_thinking"</span>: <span class="hljs-literal">False</span>}, | |
| max_completion_length=<span class="hljs-number">1024</span>, | |
| num_generations=<span class="hljs-number">4</span>, | |
| gradient_accumulation_steps=<span class="hljs-number">64</span>, | |
| ), | |
| environment_factory=WordleEnv, | |
| ) | |
| trainer.train()`,wrap:!1}}),ll=new A({props:{title:"Running the example",local:"running-the-example",headingTag:"h3"}}),K=new ua({props:{id:"wordle_vllm_mode",options:["colocate","server"],$$slots:{default:[ba]},$$scope:{ctx:G}}}),tl=new A({props:{title:"Results",local:"results",headingTag:"h3"}}),al=new A({props:{title:"Multi-environment training",local:"multi-environment-training",headingTag:"h2"}}),rl=new A({props:{title:"How it works",local:"how-it-works",headingTag:"h3"}}),pl=new A({props:{title:"Example: Wordle + Catch",local:"example-wordle--catch",headingTag:"h3"}}),wl=new $({props:{code:"Y2xhc3MlMjBNdWx0aUVudiUzQSUwQSUyMCUyMCUyMCUyMGRlZiUyMF9faW5pdF9fKHNlbGYpJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwc2VsZi5fd29yZGxlX2NsaWVudCUyMCUzRCUyME5vbmUlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBzZWxmLl9jYXRjaF9jbGllbnQlMjAlM0QlMjBOb25lJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwc2VsZi5hY3RpdmUlMjAlM0QlMjBOb25lJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwc2VsZi5yZXdhcmQlMjAlM0QlMjAwLjAlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBzZWxmLmRvbmUlMjAlM0QlMjBGYWxzZSUwQSUwQSUyMCUyMCUyMCUyMGRlZiUyMHJlc2V0KHNlbGYlMkMlMjAqKmt3YXJncyklMjAtJTNFJTIwc3RyJTIwJTdDJTIwTm9uZSUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHNlbGYuYWN0aXZlJTIwJTNEJTIwa3dhcmdzLmdldCglMjJlbnYlMjIlMkMlMjAlMjJ3b3JkbGUlMjIpJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwc2VsZi5yZXdhcmQlMjAlM0QlMjAwLjAlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBzZWxmLmRvbmUlMjAlM0QlMjBGYWxzZSUwQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGlmJTIwc2VsZi5hY3RpdmUlMjAlM0QlM0QlMjAlMjJ3b3JkbGUlMjIlM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBpZiUyMHNlbGYuX3dvcmRsZV9jbGllbnQlMjBpcyUyMG5vdCUyME5vbmUlM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjB0cnklM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBzZWxmLl93b3JkbGVfY2xpZW50LmNsb3NlKCklMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBleGNlcHQlMjBFeGNlcHRpb24lM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBwYXNzJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwc2VsZi5fd29yZGxlX2NsaWVudCUyMCUzRCUyMFRleHRBcmVuYUVudihiYXNlX3VybCUzRFdPUkRMRV9VUkwpJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwcmVzdWx0JTIwJTNEJTIwc2VsZi5fd29yZGxlX2NsaWVudC5yZXNldCgpJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwc2VsZi5fbGFzdF9mdWxsX2ZlZWRiYWNrJTIwJTNEJTIwcmVzdWx0Lm9ic2VydmF0aW9uLm1lc3NhZ2VzJTVCMCU1RC5jb250ZW50JTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwc2VsZi5yZXdhcmQlMjAlM0QlMjAwLjAlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjByZXR1cm4lMjBzZWxmLl9sYXN0X2Z1bGxfZmVlZGJhY2slMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBlbGlmJTIwc2VsZi5hY3RpdmUlMjAlM0QlM0QlMjAlMjJjYXRjaCUyMiUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGlmJTIwc2VsZi5fY2F0Y2hfY2xpZW50JTIwaXMlMjBub3QlMjBOb25lJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwdHJ5JTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwc2VsZi5fY2F0Y2hfY2xpZW50LmNsb3NlKCklMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBleGNlcHQlMjBFeGNlcHRpb24lM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBwYXNzJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwc2VsZi5fY2F0Y2hfY2xpZW50JTIwJTNEJTIwT3BlblNwaWVsRW52KGJhc2VfdXJsJTNEQ0FUQ0hfVVJMKSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHJlc3VsdCUyMCUzRCUyMHNlbGYuX2NhdGNoX2NsaWVudC5yZXNldCgpJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwc2VsZi5kb25lJTIwJTNEJTIwcmVzdWx0Lm9ic2VydmF0aW9uLmRvbmUlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjByZXR1cm4lMjBfZm9ybWF0X2NhdGNoX29icyhyZXN1bHQub2JzZXJ2YXRpb24uaW5mb19zdGF0ZSklMEElMEElMjAlMjAlMjAlMjAlMjMlMjBXb3JkbGUlMjB0b29sJTBBJTIwJTIwJTIwJTIwZGVmJTIwZ3Vlc3Moc2VsZiUyQyUyMGd1ZXNzJTNBJTIwc3RyKSUyMC0lM0UlMjBzdHIlM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjIlMjIlMjJNYWtlJTIwYSUyMGd1ZXNzJTIwaW4lMjB0aGUlMjBXb3JkbGUlMjBlbnZpcm9ubWVudC4lMjAuLi4lMjIlMjIlMjIlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAuLi4lMEElMEElMjAlMjAlMjAlMjAlMjMlMjBDYXRjaCUyMHRvb2xzJTBBJTIwJTIwJTIwJTIwZGVmJTIwbW92ZShzZWxmJTJDJTIwZGlyZWN0aW9uJTNBJTIwc3RyKSUyMC0lM0UlMjBzdHIlM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjIlMjIlMjJNb3ZlJTIwdGhlJTIwcGFkZGxlJTIwbGVmdCUyMG9yJTIwcmlnaHQuJTIwLi4uJTIyJTIyJTIyJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwLi4uJTBBJTBBJTIwJTIwJTIwJTIwZGVmJTIwc3RheShzZWxmKSUyMC0lM0UlMjBzdHIlM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjIlMjIlMjJEbyUyMG5vdGhpbmclMjBhbmQlMjBsZXQlMjB0aGUlMjBiYWxsJTIwZmFsbCUyMG9uZSUyMHN0ZXAuJTIwLi4uJTIyJTIyJTIyJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwLi4u",highlighted:`<span class="hljs-keyword">class</span> <span class="hljs-title class_">MultiEnv</span>: | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">__init__</span>(<span class="hljs-params">self</span>): | |
| self._wordle_client = <span class="hljs-literal">None</span> | |
| self._catch_client = <span class="hljs-literal">None</span> | |
| self.active = <span class="hljs-literal">None</span> | |
| self.reward = <span class="hljs-number">0.0</span> | |
| self.done = <span class="hljs-literal">False</span> | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">reset</span>(<span class="hljs-params">self, **kwargs</span>) -> <span class="hljs-built_in">str</span> | <span class="hljs-literal">None</span>: | |
| self.active = kwargs.get(<span class="hljs-string">"env"</span>, <span class="hljs-string">"wordle"</span>) | |
| self.reward = <span class="hljs-number">0.0</span> | |
| self.done = <span class="hljs-literal">False</span> | |
| <span class="hljs-keyword">if</span> self.active == <span class="hljs-string">"wordle"</span>: | |
| <span class="hljs-keyword">if</span> self._wordle_client <span class="hljs-keyword">is</span> <span class="hljs-keyword">not</span> <span class="hljs-literal">None</span>: | |
| <span class="hljs-keyword">try</span>: | |
| self._wordle_client.close() | |
| <span class="hljs-keyword">except</span> Exception: | |
| <span class="hljs-keyword">pass</span> | |
| self._wordle_client = TextArenaEnv(base_url=WORDLE_URL) | |
| result = self._wordle_client.reset() | |
| self._last_full_feedback = result.observation.messages[<span class="hljs-number">0</span>].content | |
| self.reward = <span class="hljs-number">0.0</span> | |
| <span class="hljs-keyword">return</span> self._last_full_feedback | |
| <span class="hljs-keyword">elif</span> self.active == <span class="hljs-string">"catch"</span>: | |
| <span class="hljs-keyword">if</span> self._catch_client <span class="hljs-keyword">is</span> <span class="hljs-keyword">not</span> <span class="hljs-literal">None</span>: | |
| <span class="hljs-keyword">try</span>: | |
| self._catch_client.close() | |
| <span class="hljs-keyword">except</span> Exception: | |
| <span class="hljs-keyword">pass</span> | |
| self._catch_client = OpenSpielEnv(base_url=CATCH_URL) | |
| result = self._catch_client.reset() | |
| self.done = result.observation.done | |
| <span class="hljs-keyword">return</span> _format_catch_obs(result.observation.info_state) | |
| <span class="hljs-comment"># Wordle tool</span> | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">guess</span>(<span class="hljs-params">self, guess: <span class="hljs-built_in">str</span></span>) -> <span class="hljs-built_in">str</span>: | |
| <span class="hljs-string">"""Make a guess in the Wordle environment. ..."""</span> | |
| ... | |
| <span class="hljs-comment"># Catch tools</span> | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">move</span>(<span class="hljs-params">self, direction: <span class="hljs-built_in">str</span></span>) -> <span class="hljs-built_in">str</span>: | |
| <span class="hljs-string">"""Move the paddle left or right. ..."""</span> | |
| ... | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">stay</span>(<span class="hljs-params">self</span>) -> <span class="hljs-built_in">str</span>: | |
| <span class="hljs-string">"""Do nothing and let the ball fall one step. ..."""</span> | |
| ...`,wrap:!1}}),ul=new A({props:{title:"Per-environment reward functions",local:"per-environment-reward-functions",headingTag:"h3"}}),dl=new $({props:{code:"ZGVmJTIwd29yZGxlX3Jld2FyZChlbnZpcm9ubWVudHMlMkMlMjAqKmt3YXJncyklMjAtJTNFJTIwbGlzdCU1QmZsb2F0JTIwJTdDJTIwTm9uZSU1RCUzQSUwQSUyMCUyMCUyMCUyMHJldHVybiUyMCU1QmVudi5yZXdhcmQlMjBpZiUyMGVudi5hY3RpdmUlMjAlM0QlM0QlMjAlMjJ3b3JkbGUlMjIlMjBlbHNlJTIwTm9uZSUyMGZvciUyMGVudiUyMGluJTIwZW52aXJvbm1lbnRzJTVEJTBBJTBBZGVmJTIwY2F0Y2hfcmV3YXJkKGVudmlyb25tZW50cyUyQyUyMCoqa3dhcmdzKSUyMC0lM0UlMjBsaXN0JTVCZmxvYXQlMjAlN0MlMjBOb25lJTVEJTNBJTBBJTIwJTIwJTIwJTIwcmV3YXJkcyUyMCUzRCUyMCU1QiU1RCUwQSUyMCUyMCUyMCUyMGZvciUyMGVudiUyMGluJTIwZW52aXJvbm1lbnRzJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwaWYlMjBlbnYuYWN0aXZlJTIwISUzRCUyMCUyMmNhdGNoJTIyJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwcmV3YXJkcy5hcHBlbmQoTm9uZSklMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBlbGlmJTIwZW52LmRvbmUlM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjByZXdhcmRzLmFwcGVuZChtYXgoZW52LnJld2FyZCUyQyUyMDAuMCkpJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwZWxzZSUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHJld2FyZHMuYXBwZW5kKDAuMCklMEElMjAlMjAlMjAlMjByZXR1cm4lMjByZXdhcmRz",highlighted:`<span class="hljs-keyword">def</span> <span class="hljs-title function_">wordle_reward</span>(<span class="hljs-params">environments, **kwargs</span>) -> <span class="hljs-built_in">list</span>[<span class="hljs-built_in">float</span> | <span class="hljs-literal">None</span>]: | |
| <span class="hljs-keyword">return</span> [env.reward <span class="hljs-keyword">if</span> env.active == <span class="hljs-string">"wordle"</span> <span class="hljs-keyword">else</span> <span class="hljs-literal">None</span> <span class="hljs-keyword">for</span> env <span class="hljs-keyword">in</span> environments] | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">catch_reward</span>(<span class="hljs-params">environments, **kwargs</span>) -> <span class="hljs-built_in">list</span>[<span class="hljs-built_in">float</span> | <span class="hljs-literal">None</span>]: | |
| rewards = [] | |
| <span class="hljs-keyword">for</span> env <span class="hljs-keyword">in</span> environments: | |
| <span class="hljs-keyword">if</span> env.active != <span class="hljs-string">"catch"</span>: | |
| rewards.append(<span class="hljs-literal">None</span>) | |
| <span class="hljs-keyword">elif</span> env.done: | |
| rewards.append(<span class="hljs-built_in">max</span>(env.reward, <span class="hljs-number">0.0</span>)) | |
| <span class="hljs-keyword">else</span>: | |
| rewards.append(<span class="hljs-number">0.0</span>) | |
| <span class="hljs-keyword">return</span> rewards`,wrap:!1}}),jl=new A({props:{title:"Dataset with environment routing",local:"dataset-with-environment-routing",headingTag:"h3"}}),hl=new $({props:{code:"biUyMCUzRCUyMDUwMCUwQWRhdGFzZXQlMjAlM0QlMjBEYXRhc2V0LmZyb21fZGljdCglN0IlMEElMjAlMjAlMjAlMjAlMjJwcm9tcHQlMjIlM0ElMjAoJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTVCJTVCJTdCJTIycm9sZSUyMiUzQSUyMCUyMnVzZXIlMjIlMkMlMjAlMjJjb250ZW50JTIyJTNBJTIwd29yZGxlX3Byb21wdCU3RCU1RCU1RCUyMColMjBuJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTJCJTIwJTVCJTVCJTdCJTIycm9sZSUyMiUzQSUyMCUyMnVzZXIlMjIlMkMlMjAlMjJjb250ZW50JTIyJTNBJTIwY2F0Y2hfcHJvbXB0JTdEJTVEJTVEJTIwKiUyMG4lMEElMjAlMjAlMjAlMjApJTJDJTBBJTIwJTIwJTIwJTIwJTIyZW52JTIyJTNBJTIwJTVCJTIyd29yZGxlJTIyJTVEJTIwKiUyMG4lMjAlMkIlMjAlNUIlMjJjYXRjaCUyMiU1RCUyMColMjBuJTJDJTBBJTdEKQ==",highlighted:`n = <span class="hljs-number">500</span> | |
| dataset = Dataset.from_dict({ | |
| <span class="hljs-string">"prompt"</span>: ( | |
| [[{<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: wordle_prompt}]] * n | |
| + [[{<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: catch_prompt}]] * n | |
| ), | |
| <span class="hljs-string">"env"</span>: [<span class="hljs-string">"wordle"</span>] * n + [<span class="hljs-string">"catch"</span>] * n, | |
| })`,wrap:!1}}),fl=new A({props:{title:"Running the multi-environment example",local:"running-the-multi-environment-example",headingTag:"h3"}}),Ul=new $({props:{code:"cHl0aG9uJTIwZXhhbXBsZXMlMkZzY3JpcHRzJTJGb3BlbmVudiUyRm11bHRpX2Vudi5weSUyMCU1QyUwQSUyMCUyMCUyMCUyMC0td29yZGxlLXVybCUyMGh0dHBzJTNBJTJGJTJGb3BlbmVudi13b3JkbGUuaGYuc3BhY2UlMjAlNUMlMEElMjAlMjAlMjAlMjAtLWNhdGNoLXVybCUyMGh0dHBzJTNBJTJGJTJGb3BlbmVudi1vcGVuc3BpZWwtZW52LmhmLnNwYWNlJTIwJTVDJTBBJTIwJTIwJTIwJTIwLS12bGxtLW1vZGUlMjBjb2xvY2F0ZSUyMCU1QyUwQSUyMCUyMCUyMCUyMC0tZ3JhZGllbnQtYWNjdW11bGF0aW9uLXN0ZXBzJTIwNCUyMCU1QyUwQSUyMCUyMCUyMCUyMC0tbnVtLWdlbmVyYXRpb25zJTIwOA==",highlighted:`python examples/scripts/openenv/multi_env.py \\ | |
| --wordle-url https://openenv-wordle.hf.space \\ | |
| --catch-url https://openenv-openspiel-env.hf.space \\ | |
| --vllm-mode colocate \\ | |
| --gradient-accumulation-steps 4 \\ | |
| --num-generations 8`,wrap:!1}}),Il=new A({props:{title:"Running the environments",local:"running-the-environments",headingTag:"h2"}}),le=new ua({props:{id:"env_mode",options:["space","docker","local"],$$slots:{default:[$a]},$$scope:{ctx:G}}}),bl=new A({props:{title:"Environments catalog",local:"environments-catalog",headingTag:"h2"}}),Al=new A({props:{title:"Server concurrency",local:"server-concurrency",headingTag:"h2"}}),_l=new $({props:{code:"U1VQUE9SVFNfQ09OQ1VSUkVOVF9TRVNTSU9OUyUzQSUyMGJvb2wlMjAlM0QlMjBUcnVl",highlighted:'SUPPORTS_CONCURRENT_SESSIONS: <span class="hljs-built_in">bool</span> = <span class="hljs-literal">True</span>',wrap:!1}}),kl=new $({props:{code:"YXBwJTIwJTNEJTIwY3JlYXRlX2FwcCglMEElMjAlMjAlMjAlMjBjcmVhdGVfbXlfZW52aXJvbm1lbnQlMkMlMEElMjAlMjAlMjAlMjBNeUFjdGlvbiUyQyUwQSUyMCUyMCUyMCUyME15T2JzZXJ2YXRpb24lMkMlMEElMjAlMjAlMjAlMjBtYXhfY29uY3VycmVudF9lbnZzJTNENjQlMkMlMjAlMjAlMjMlMjBtYXRjaCUyMG9yJTIwZXhjZWVkJTIwZ2VuZXJhdGlvbl9iYXRjaF9zaXplJTBBKQ==",highlighted:`app = create_app( | |
| create_my_environment, | |
| MyAction, | |
| MyObservation, | |
| max_concurrent_envs=<span class="hljs-number">64</span>, <span class="hljs-comment"># match or exceed generation_batch_size</span> | |
| )`,wrap:!1}}),El=new A({props:{title:"environment_factory vs rollout_func",local:"environmentfactory-vs-rolloutfunc",headingTag:"h2"}}),xl=new A({props:{title:"Migrating from rollout_func to environment_factory",local:"migrating-from-rolloutfunc-to-environmentfactory",headingTag:"h3"}}),Xl=new $({props:{code:"ZGVmJTIwcm9sbG91dF9mdW5jKHByb21wdHMlMkMlMjB0cmFpbmVyKSUzQSUwQSUyMCUyMCUyMCUyMG91dHB1dHMlMjAlM0QlMjBnZW5lcmF0ZV9yb2xsb3V0X2NvbXBsZXRpb25zKHRyYWluZXIlMkMlMjBwcm9tcHRzKSUwQSUyMCUyMCUyMCUyMGVudl9yZXdhcmRzJTIwJTNEJTIwJTVCJTVEJTBBJTIwJTIwJTIwJTIwZm9yJTIwb3V0JTIwaW4lMjBvdXRwdXRzJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwdGV4dCUyMCUzRCUyMHRva2VuaXplci5kZWNvZGUob3V0JTVCJTIyY29tcGxldGlvbl9pZHMlMjIlNUQlMkMlMjBza2lwX3NwZWNpYWxfdG9rZW5zJTNEVHJ1ZSklMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjByZXN1bHQlMjAlM0QlMjBjbGllbnQuc3RlcChFY2hvQWN0aW9uKG1lc3NhZ2UlM0R0ZXh0KSklMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBlbnZfcmV3YXJkcy5hcHBlbmQocmVzdWx0LnJld2FyZCklMEElMjAlMjAlMjAlMjByZXR1cm4lMjAlN0IlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjJwcm9tcHRfaWRzJTIyJTNBJTIwJTVCb3V0JTVCJTIycHJvbXB0X2lkcyUyMiU1RCUyMGZvciUyMG91dCUyMGluJTIwb3V0cHV0cyU1RCUyQyUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMmNvbXBsZXRpb25faWRzJTIyJTNBJTIwJTVCb3V0JTVCJTIyY29tcGxldGlvbl9pZHMlMjIlNUQlMjBmb3IlMjBvdXQlMjBpbiUyMG91dHB1dHMlNUQlMkMlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjJsb2dwcm9icyUyMiUzQSUyMCU1Qm91dCU1QiUyMmxvZ3Byb2JzJTIyJTVEJTIwZm9yJTIwb3V0JTIwaW4lMjBvdXRwdXRzJTVEJTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIyZW52X3Jld2FyZCUyMiUzQSUyMGVudl9yZXdhcmRzJTJDJTBBJTIwJTIwJTIwJTIwJTdEJTBBJTBBdHJhaW5lciUyMCUzRCUyMEdSUE9UcmFpbmVyKC4uLiUyQyUyMHJvbGxvdXRfZnVuYyUzRHJvbGxvdXRfZnVuYyk=",highlighted:`<span class="hljs-keyword">def</span> <span class="hljs-title function_">rollout_func</span>(<span class="hljs-params">prompts, trainer</span>): | |
| outputs = generate_rollout_completions(trainer, prompts) | |
| env_rewards = [] | |
| <span class="hljs-keyword">for</span> out <span class="hljs-keyword">in</span> outputs: | |
| text = tokenizer.decode(out[<span class="hljs-string">"completion_ids"</span>], skip_special_tokens=<span class="hljs-literal">True</span>) | |
| result = client.step(EchoAction(message=text)) | |
| env_rewards.append(result.reward) | |
| <span class="hljs-keyword">return</span> { | |
| <span class="hljs-string">"prompt_ids"</span>: [out[<span class="hljs-string">"prompt_ids"</span>] <span class="hljs-keyword">for</span> out <span class="hljs-keyword">in</span> outputs], | |
| <span class="hljs-string">"completion_ids"</span>: [out[<span class="hljs-string">"completion_ids"</span>] <span class="hljs-keyword">for</span> out <span class="hljs-keyword">in</span> outputs], | |
| <span class="hljs-string">"logprobs"</span>: [out[<span class="hljs-string">"logprobs"</span>] <span class="hljs-keyword">for</span> out <span class="hljs-keyword">in</span> outputs], | |
| <span class="hljs-string">"env_reward"</span>: env_rewards, | |
| } | |
| trainer = GRPOTrainer(..., rollout_func=rollout_func)`,wrap:!1}}),Yl=new $({props:{code:"Y2xhc3MlMjBFY2hvVG9vbEVudiUzQSUwQSUyMCUyMCUyMCUyMGRlZiUyMF9faW5pdF9fKHNlbGYpJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwc2VsZi5lbnYlMjAlM0QlMjBFY2hvRW52KGJhc2VfdXJsJTNEdXJsKSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHNlbGYucmV3YXJkJTIwJTNEJTIwMC4wJTBBJTBBJTIwJTIwJTIwJTIwZGVmJTIwcmVzZXQoc2VsZiUyQyUyMCoqa3dhcmdzKSUyMC0lM0UlMjBzdHIlMjAlN0MlMjBOb25lJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwc2VsZi5yZXdhcmQlMjAlM0QlMjAwLjAlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjByZXR1cm4lMjBOb25lJTBBJTBBJTIwJTIwJTIwJTIwZGVmJTIwZWNobyhzZWxmJTJDJTIwbWVzc2FnZSUzQSUyMHN0ciklMjAtJTNFJTIwc3RyJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIyJTIyJTIyRWNobyUyMHRoZSUyMG1lc3NhZ2UlMjBiYWNrLiUwQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMEFyZ3MlM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBtZXNzYWdlJTNBJTIwVGhlJTIwbWVzc2FnZSUyMHRvJTIwZWNobyUwQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMFJldHVybnMlM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBUaGUlMjBlY2hvZWQlMjBtZXNzYWdlLiUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMiUyMiUyMiUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHJlc3VsdCUyMCUzRCUyMHNlbGYuZW52LnN0ZXAoRWNob0FjdGlvbihtZXNzYWdlJTNEbWVzc2FnZSkpJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwc2VsZi5yZXdhcmQlMjAlM0QlMjByZXN1bHQub2JzZXJ2YXRpb24ucmV3YXJkJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwcmV0dXJuJTIwcmVzdWx0Lm9ic2VydmF0aW9uLmVjaG9lZF9tZXNzYWdlJTBBJTBBZGVmJTIwcmV3YXJkX2Z1bmMoZW52aXJvbm1lbnRzJTJDJTIwKiprd2FyZ3MpJTNBJTBBJTIwJTIwJTIwJTIwcmV0dXJuJTIwJTVCZW52LnJld2FyZCUyMGZvciUyMGVudiUyMGluJTIwZW52aXJvbm1lbnRzJTVEJTBBJTBBdHJhaW5lciUyMCUzRCUyMEdSUE9UcmFpbmVyKC4uLiUyQyUyMGVudmlyb25tZW50X2ZhY3RvcnklM0RFY2hvVG9vbEVudiUyQyUyMHJld2FyZF9mdW5jcyUzRHJld2FyZF9mdW5jKQ==",highlighted:`<span class="hljs-keyword">class</span> <span class="hljs-title class_">EchoToolEnv</span>: | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">__init__</span>(<span class="hljs-params">self</span>): | |
| self.env = EchoEnv(base_url=url) | |
| self.reward = <span class="hljs-number">0.0</span> | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">reset</span>(<span class="hljs-params">self, **kwargs</span>) -> <span class="hljs-built_in">str</span> | <span class="hljs-literal">None</span>: | |
| self.reward = <span class="hljs-number">0.0</span> | |
| <span class="hljs-keyword">return</span> <span class="hljs-literal">None</span> | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">echo</span>(<span class="hljs-params">self, message: <span class="hljs-built_in">str</span></span>) -> <span class="hljs-built_in">str</span>: | |
| <span class="hljs-string">"""Echo the message back. | |
| Args: | |
| message: The message to echo | |
| Returns: | |
| The echoed message. | |
| """</span> | |
| result = self.env.step(EchoAction(message=message)) | |
| self.reward = result.observation.reward | |
| <span class="hljs-keyword">return</span> result.observation.echoed_message | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">reward_func</span>(<span class="hljs-params">environments, **kwargs</span>): | |
| <span class="hljs-keyword">return</span> [env.reward <span class="hljs-keyword">for</span> env <span class="hljs-keyword">in</span> environments] | |
| trainer = GRPOTrainer(..., environment_factory=EchoToolEnv, reward_funcs=reward_func)`,wrap:!1}}),Ll=new Ua({props:{source:"https://github.com/huggingface/trl/blob/main/docs/source/openenv.md"}}),{c(){M=o("meta"),C=s(),U=o("p"),j=s(),p(d.$$.fragment),J=s(),p(y.$$.fragment),v=s(),h=o("p"),h.innerHTML=B,g=s(),b=o("p"),b.innerHTML=k,I=s(),_=o("blockquote"),_.innerHTML=ae,W=s(),p(x.$$.fragment),V=s(),H=o("p"),H.innerHTML=oe,f=s(),p(Z.$$.fragment),et=s(),ie=o("p"),ie.textContent=as,lt=s(),p(re.$$.fragment),tt=s(),Me=o("p"),Me.innerHTML=os,nt=s(),S=o("blockquote"),S.innerHTML=is,st=s(),q=o("blockquote"),q.innerHTML=rs,at=s(),pe=o("p"),pe.textContent=Ms,ot=s(),p(ce.$$.fragment),it=s(),R=o("blockquote"),zl=o("p"),zl.innerHTML=ps,ls=s(),p(we.$$.fragment),ts=s(),Sl=o("p"),Sl.textContent=cs,rt=s(),p(me.$$.fragment),Mt=s(),Te=o("p"),Te.innerHTML=ws,pt=s(),p(ue.$$.fragment),ct=s(),Je=o("p"),Je.textContent=ms,wt=s(),de=o("ol"),de.innerHTML=Ts,mt=s(),p(ye.$$.fragment),Tt=s(),je=o("p"),je.textContent=us,ut=s(),N=o("iframe"),Jt=s(),F=o("blockquote"),F.innerHTML=ds,dt=s(),p(he.$$.fragment),yt=s(),fe=o("p"),fe.innerHTML=ys,jt=s(),p(Ue.$$.fragment),ht=s(),Ie=o("p"),Ie.textContent=js,ft=s(),ve=o("ul"),ve.innerHTML=hs,Ut=s(),p(be.$$.fragment),It=s(),Ce=o("ul"),Ce.innerHTML=fs,vt=s(),p(ge.$$.fragment),bt=s(),P=o("blockquote"),P.innerHTML=Us,Ct=s(),p(Ae.$$.fragment),gt=s(),$e=o("p"),$e.innerHTML=Is,At=s(),p(Be.$$.fragment),$t=s(),Ze=o("p"),Ze.innerHTML=vs,Bt=s(),p(_e.$$.fragment),Zt=s(),ke=o("p"),ke.textContent=bs,_t=s(),Ee=o("ul"),Ee.innerHTML=Cs,kt=s(),p(Ge.$$.fragment),Et=s(),Ve=o("p"),Ve.innerHTML=gs,Gt=s(),p(We.$$.fragment),Vt=s(),xe=o("p"),xe.textContent=As,Wt=s(),p(Re.$$.fragment),xt=s(),Ne=o("p"),Ne.innerHTML=$s,Rt=s(),O=o("blockquote"),O.innerHTML=Bs,Nt=s(),p(Qe.$$.fragment),Qt=s(),Xe=o("p"),Xe.innerHTML=Zs,Xt=s(),He=o("p"),He.innerHTML=_s,Ht=s(),p(Ye.$$.fragment),Yt=s(),Le=o("p"),Le.textContent=ks,Lt=s(),Q=o("blockquote"),ql=o("p"),ql.textContent=Es,ns=s(),p(ze.$$.fragment),ss=s(),Fl=o("p"),Fl.textContent=Gs,zt=s(),p(Se.$$.fragment),St=s(),qe=o("p"),qe.innerHTML=Vs,qt=s(),p(Fe.$$.fragment),Ft=s(),Pe=o("p"),Pe.textContent=Ws,Pt=s(),Oe=o("ul"),Oe.innerHTML=xs,Ot=s(),p(Ke.$$.fragment),Kt=s(),p(De.$$.fragment),Dt=s(),el=o("p"),el.innerHTML=Rs,en=s(),p(ll.$$.fragment),ln=s(),p(K.$$.fragment),tn=s(),p(tl.$$.fragment),nn=s(),nl=o("p"),nl.innerHTML=Ns,sn=s(),X=o("iframe"),an=s(),D=o("blockquote"),D.innerHTML=Xs,on=s(),sl=o("p"),sl.innerHTML=Hs,rn=s(),p(al.$$.fragment),Mn=s(),ol=o("p"),ol.textContent=Ys,pn=s(),il=o("p"),il.innerHTML=Ls,cn=s(),p(rl.$$.fragment),wn=s(),Ml=o("ol"),Ml.innerHTML=zs,mn=s(),p(pl.$$.fragment),Tn=s(),cl=o("p"),cl.innerHTML=Ss,un=s(),p(wl.$$.fragment),Jn=s(),ml=o("p"),ml.textContent=qs,dn=s(),Tl=o("ul"),Tl.innerHTML=Fs,yn=s(),p(ul.$$.fragment),jn=s(),Jl=o("p"),Jl.innerHTML=Ps,hn=s(),p(dl.$$.fragment),fn=s(),yl=o("p"),yl.innerHTML=Os,Un=s(),p(jl.$$.fragment),In=s(),p(hl.$$.fragment),vn=s(),p(fl.$$.fragment),bn=s(),p(Ul.$$.fragment),Cn=s(),ee=o("blockquote"),ee.innerHTML=Ks,gn=s(),p(Il.$$.fragment),An=s(),vl=o("p"),vl.innerHTML=Ds,$n=s(),p(le.$$.fragment),Bn=s(),p(bl.$$.fragment),Zn=s(),Cl=o("p"),Cl.innerHTML=ea,_n=s(),gl=o("p"),gl.innerHTML=la,kn=s(),p(Al.$$.fragment),En=s(),$l=o("p"),$l.innerHTML=ta,Gn=s(),Bl=o("p"),Bl.textContent=na,Vn=s(),Zl=o("ol"),Zl.innerHTML=sa,Wn=s(),p(_l.$$.fragment),xn=s(),te=o("ol"),te.innerHTML=aa,Rn=s(),p(kl.$$.fragment),Nn=s(),ne=o("blockquote"),ne.innerHTML=oa,Qn=s(),p(El.$$.fragment),Xn=s(),Gl=o("p"),Gl.innerHTML=ia,Hn=s(),Vl=o("ul"),Vl.innerHTML=ra,Yn=s(),Wl=o("p"),Wl.innerHTML=Ma,Ln=s(),p(xl.$$.fragment),zn=s(),Rl=o("p"),Rl.innerHTML=pa,Sn=s(),Nl=o("table"),Nl.innerHTML=ca,qn=s(),Ql=o("p"),Ql.innerHTML=wa,Fn=s(),p(Xl.$$.fragment),Pn=s(),Hl=o("p"),Hl.innerHTML=ma,On=s(),p(Yl.$$.fragment),Kn=s(),p(Ll.$$.fragment),Dn=s(),Ol=o("p"),this.h()},l(e){const l=ha("svelte-u9bgzb",document.head);M=i(l,"META",{name:!0,content:!0}),l.forEach(t),C=a(e),U=i(e,"P",{}),Pl(U).forEach(t),j=a(e),c(d.$$.fragment,e),J=a(e),c(y.$$.fragment,e),v=a(e),h=i(e,"P",{"data-svelte-h":!0}),r(h)!=="svelte-a7v5qb"&&(h.innerHTML=B),g=a(e),b=i(e,"P",{"data-svelte-h":!0}),r(b)!=="svelte-1po34wr"&&(b.innerHTML=k),I=a(e),_=i(e,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),r(_)!=="svelte-1q6q058"&&(_.innerHTML=ae),W=a(e),c(x.$$.fragment,e),V=a(e),H=i(e,"P",{"data-svelte-h":!0}),r(H)!=="svelte-vatuy1"&&(H.innerHTML=oe),f=a(e),c(Z.$$.fragment,e),et=a(e),ie=i(e,"P",{"data-svelte-h":!0}),r(ie)!=="svelte-1q8g6r4"&&(ie.textContent=as),lt=a(e),c(re.$$.fragment,e),tt=a(e),Me=i(e,"P",{"data-svelte-h":!0}),r(Me)!=="svelte-xp1fo6"&&(Me.innerHTML=os),nt=a(e),S=i(e,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),r(S)!=="svelte-19wubx3"&&(S.innerHTML=is),st=a(e),q=i(e,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),r(q)!=="svelte-4ibm3b"&&(q.innerHTML=rs),at=a(e),pe=i(e,"P",{"data-svelte-h":!0}),r(pe)!=="svelte-1dm1laa"&&(pe.textContent=Ms),ot=a(e),c(ce.$$.fragment,e),it=a(e),R=i(e,"BLOCKQUOTE",{class:!0});var Y=Pl(R);zl=i(Y,"P",{"data-svelte-h":!0}),r(zl)!=="svelte-1nxaurk"&&(zl.innerHTML=ps),ls=a(Y),c(we.$$.fragment,Y),ts=a(Y),Sl=i(Y,"P",{"data-svelte-h":!0}),r(Sl)!=="svelte-6wmct"&&(Sl.textContent=cs),Y.forEach(t),rt=a(e),c(me.$$.fragment,e),Mt=a(e),Te=i(e,"P",{"data-svelte-h":!0}),r(Te)!=="svelte-a7hir6"&&(Te.innerHTML=ws),pt=a(e),c(ue.$$.fragment,e),ct=a(e),Je=i(e,"P",{"data-svelte-h":!0}),r(Je)!=="svelte-3t91z0"&&(Je.textContent=ms),wt=a(e),de=i(e,"OL",{"data-svelte-h":!0}),r(de)!=="svelte-198zcxp"&&(de.innerHTML=Ts),mt=a(e),c(ye.$$.fragment,e),Tt=a(e),je=i(e,"P",{"data-svelte-h":!0}),r(je)!=="svelte-klksjg"&&(je.textContent=us),ut=a(e),N=i(e,"IFRAME",{src:!0,style:!0}),Pl(N).forEach(t),Jt=a(e),F=i(e,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),r(F)!=="svelte-412tzz"&&(F.innerHTML=ds),dt=a(e),c(he.$$.fragment,e),yt=a(e),fe=i(e,"P",{"data-svelte-h":!0}),r(fe)!=="svelte-9o4gb9"&&(fe.innerHTML=ys),jt=a(e),c(Ue.$$.fragment,e),ht=a(e),Ie=i(e,"P",{"data-svelte-h":!0}),r(Ie)!=="svelte-1eeptr2"&&(Ie.textContent=js),ft=a(e),ve=i(e,"UL",{"data-svelte-h":!0}),r(ve)!=="svelte-6iobor"&&(ve.innerHTML=hs),Ut=a(e),c(be.$$.fragment,e),It=a(e),Ce=i(e,"UL",{"data-svelte-h":!0}),r(Ce)!=="svelte-1jxsu4h"&&(Ce.innerHTML=fs),vt=a(e),c(ge.$$.fragment,e),bt=a(e),P=i(e,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),r(P)!=="svelte-3x2szj"&&(P.innerHTML=Us),Ct=a(e),c(Ae.$$.fragment,e),gt=a(e),$e=i(e,"P",{"data-svelte-h":!0}),r($e)!=="svelte-1oy9i3i"&&($e.innerHTML=Is),At=a(e),c(Be.$$.fragment,e),$t=a(e),Ze=i(e,"P",{"data-svelte-h":!0}),r(Ze)!=="svelte-dj8ipq"&&(Ze.innerHTML=vs),Bt=a(e),c(_e.$$.fragment,e),Zt=a(e),ke=i(e,"P",{"data-svelte-h":!0}),r(ke)!=="svelte-tc99c5"&&(ke.textContent=bs),_t=a(e),Ee=i(e,"UL",{"data-svelte-h":!0}),r(Ee)!=="svelte-13t89bx"&&(Ee.innerHTML=Cs),kt=a(e),c(Ge.$$.fragment,e),Et=a(e),Ve=i(e,"P",{"data-svelte-h":!0}),r(Ve)!=="svelte-1wgkn3k"&&(Ve.innerHTML=gs),Gt=a(e),c(We.$$.fragment,e),Vt=a(e),xe=i(e,"P",{"data-svelte-h":!0}),r(xe)!=="svelte-kpittw"&&(xe.textContent=As),Wt=a(e),c(Re.$$.fragment,e),xt=a(e),Ne=i(e,"P",{"data-svelte-h":!0}),r(Ne)!=="svelte-1thz35g"&&(Ne.innerHTML=$s),Rt=a(e),O=i(e,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),r(O)!=="svelte-3sgo7n"&&(O.innerHTML=Bs),Nt=a(e),c(Qe.$$.fragment,e),Qt=a(e),Xe=i(e,"P",{"data-svelte-h":!0}),r(Xe)!=="svelte-lx7z83"&&(Xe.innerHTML=Zs),Xt=a(e),He=i(e,"P",{"data-svelte-h":!0}),r(He)!=="svelte-or2o6k"&&(He.innerHTML=_s),Ht=a(e),c(Ye.$$.fragment,e),Yt=a(e),Le=i(e,"P",{"data-svelte-h":!0}),r(Le)!=="svelte-aog2wv"&&(Le.textContent=ks),Lt=a(e),Q=i(e,"BLOCKQUOTE",{class:!0});var L=Pl(Q);ql=i(L,"P",{"data-svelte-h":!0}),r(ql)!=="svelte-1qk48oi"&&(ql.textContent=Es),ns=a(L),c(ze.$$.fragment,L),ss=a(L),Fl=i(L,"P",{"data-svelte-h":!0}),r(Fl)!=="svelte-ia1ang"&&(Fl.textContent=Gs),L.forEach(t),zt=a(e),c(Se.$$.fragment,e),St=a(e),qe=i(e,"P",{"data-svelte-h":!0}),r(qe)!=="svelte-8a060r"&&(qe.innerHTML=Vs),qt=a(e),c(Fe.$$.fragment,e),Ft=a(e),Pe=i(e,"P",{"data-svelte-h":!0}),r(Pe)!=="svelte-tm92gv"&&(Pe.textContent=Ws),Pt=a(e),Oe=i(e,"UL",{"data-svelte-h":!0}),r(Oe)!=="svelte-10uhdoz"&&(Oe.innerHTML=xs),Ot=a(e),c(Ke.$$.fragment,e),Kt=a(e),c(De.$$.fragment,e),Dt=a(e),el=i(e,"P",{"data-svelte-h":!0}),r(el)!=="svelte-1cje71z"&&(el.innerHTML=Rs),en=a(e),c(ll.$$.fragment,e),ln=a(e),c(K.$$.fragment,e),tn=a(e),c(tl.$$.fragment,e),nn=a(e),nl=i(e,"P",{"data-svelte-h":!0}),r(nl)!=="svelte-27ny5q"&&(nl.innerHTML=Ns),sn=a(e),X=i(e,"IFRAME",{src:!0,style:!0}),Pl(X).forEach(t),an=a(e),D=i(e,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),r(D)!=="svelte-85uckb"&&(D.innerHTML=Xs),on=a(e),sl=i(e,"P",{"data-svelte-h":!0}),r(sl)!=="svelte-1n60c7p"&&(sl.innerHTML=Hs),rn=a(e),c(al.$$.fragment,e),Mn=a(e),ol=i(e,"P",{"data-svelte-h":!0}),r(ol)!=="svelte-14xidyy"&&(ol.textContent=Ys),pn=a(e),il=i(e,"P",{"data-svelte-h":!0}),r(il)!=="svelte-1r7d50s"&&(il.innerHTML=Ls),cn=a(e),c(rl.$$.fragment,e),wn=a(e),Ml=i(e,"OL",{"data-svelte-h":!0}),r(Ml)!=="svelte-161iujm"&&(Ml.innerHTML=zs),mn=a(e),c(pl.$$.fragment,e),Tn=a(e),cl=i(e,"P",{"data-svelte-h":!0}),r(cl)!=="svelte-1ymvjlr"&&(cl.innerHTML=Ss),un=a(e),c(wl.$$.fragment,e),Jn=a(e),ml=i(e,"P",{"data-svelte-h":!0}),r(ml)!=="svelte-1hyiz9o"&&(ml.textContent=qs),dn=a(e),Tl=i(e,"UL",{"data-svelte-h":!0}),r(Tl)!=="svelte-qzf3u7"&&(Tl.innerHTML=Fs),yn=a(e),c(ul.$$.fragment,e),jn=a(e),Jl=i(e,"P",{"data-svelte-h":!0}),r(Jl)!=="svelte-1eyyu5m"&&(Jl.innerHTML=Ps),hn=a(e),c(dl.$$.fragment,e),fn=a(e),yl=i(e,"P",{"data-svelte-h":!0}),r(yl)!=="svelte-skztfb"&&(yl.innerHTML=Os),Un=a(e),c(jl.$$.fragment,e),In=a(e),c(hl.$$.fragment,e),vn=a(e),c(fl.$$.fragment,e),bn=a(e),c(Ul.$$.fragment,e),Cn=a(e),ee=i(e,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),r(ee)!=="svelte-1liux33"&&(ee.innerHTML=Ks),gn=a(e),c(Il.$$.fragment,e),An=a(e),vl=i(e,"P",{"data-svelte-h":!0}),r(vl)!=="svelte-3otk4f"&&(vl.innerHTML=Ds),$n=a(e),c(le.$$.fragment,e),Bn=a(e),c(bl.$$.fragment,e),Zn=a(e),Cl=i(e,"P",{"data-svelte-h":!0}),r(Cl)!=="svelte-1tacsw4"&&(Cl.innerHTML=ea),_n=a(e),gl=i(e,"P",{"data-svelte-h":!0}),r(gl)!=="svelte-d6ph58"&&(gl.innerHTML=la),kn=a(e),c(Al.$$.fragment,e),En=a(e),$l=i(e,"P",{"data-svelte-h":!0}),r($l)!=="svelte-149tjmd"&&($l.innerHTML=ta),Gn=a(e),Bl=i(e,"P",{"data-svelte-h":!0}),r(Bl)!=="svelte-o3uox3"&&(Bl.textContent=na),Vn=a(e),Zl=i(e,"OL",{"data-svelte-h":!0}),r(Zl)!=="svelte-133135u"&&(Zl.innerHTML=sa),Wn=a(e),c(_l.$$.fragment,e),xn=a(e),te=i(e,"OL",{start:!0,"data-svelte-h":!0}),r(te)!=="svelte-obzobp"&&(te.innerHTML=aa),Rn=a(e),c(kl.$$.fragment,e),Nn=a(e),ne=i(e,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),r(ne)!=="svelte-jkb8xm"&&(ne.innerHTML=oa),Qn=a(e),c(El.$$.fragment,e),Xn=a(e),Gl=i(e,"P",{"data-svelte-h":!0}),r(Gl)!=="svelte-14trny9"&&(Gl.innerHTML=ia),Hn=a(e),Vl=i(e,"UL",{"data-svelte-h":!0}),r(Vl)!=="svelte-11815mq"&&(Vl.innerHTML=ra),Yn=a(e),Wl=i(e,"P",{"data-svelte-h":!0}),r(Wl)!=="svelte-11z0mru"&&(Wl.innerHTML=Ma),Ln=a(e),c(xl.$$.fragment,e),zn=a(e),Rl=i(e,"P",{"data-svelte-h":!0}),r(Rl)!=="svelte-1yjg80d"&&(Rl.innerHTML=pa),Sn=a(e),Nl=i(e,"TABLE",{"data-svelte-h":!0}),r(Nl)!=="svelte-nwr12z"&&(Nl.innerHTML=ca),qn=a(e),Ql=i(e,"P",{"data-svelte-h":!0}),r(Ql)!=="svelte-euz4a6"&&(Ql.innerHTML=wa),Fn=a(e),c(Xl.$$.fragment,e),Pn=a(e),Hl=i(e,"P",{"data-svelte-h":!0}),r(Hl)!=="svelte-91ci07"&&(Hl.innerHTML=ma),On=a(e),c(Yl.$$.fragment,e),Kn=a(e),c(Ll.$$.fragment,e),Dn=a(e),Ol=i(e,"P",{}),Pl(Ol).forEach(t),this.h()},h(){E(M,"name","hf:doc:metadata"),E(M,"content",Za),E(_,"class","note"),E(S,"class","tip"),E(q,"class","tip"),E(R,"class","note"),Ta(N.src,Js="https://trl-lib-trackio.hf.space?project=openenv&metrics=train/rewards/reward_from_env/mean&runs=qgallouedec-1761202871&sidebar=hidden&navbar=hidden")||E(N,"src",Js),se(N,"width","100%"),se(N,"max-width","800px"),se(N,"height","500px"),se(N,"border","0"),E(F,"class","note"),E(P,"class","important"),E(O,"class","note"),E(Q,"class","note"),Ta(X.src,Qs="https://burtenshaw-wordle-grpo.hf.space?project=group-Qwen-Qwen3-17B&metrics=reward&runs=run-2025-10-26_09-39-49,run-2025-10-26_08-04-49&sidebar=hidden&navbar=hidden")||E(X,"src",Qs),se(X,"width","100%"),se(X,"max-width","800px"),se(X,"height","500px"),se(X,"border","0"),E(D,"class","note"),E(ee,"class","tip"),E(te,"start","2"),E(ne,"class","tip")},m(e,l){z(document.head,M),n(e,C,l),n(e,U,l),n(e,j,l),w(d,e,l),n(e,J,l),w(y,e,l),n(e,v,l),n(e,h,l),n(e,g,l),n(e,b,l),n(e,I,l),n(e,_,l),n(e,W,l),w(x,e,l),n(e,V,l),n(e,H,l),n(e,f,l),w(Z,e,l),n(e,et,l),n(e,ie,l),n(e,lt,l),w(re,e,l),n(e,tt,l),n(e,Me,l),n(e,nt,l),n(e,S,l),n(e,st,l),n(e,q,l),n(e,at,l),n(e,pe,l),n(e,ot,l),w(ce,e,l),n(e,it,l),n(e,R,l),z(R,zl),z(R,ls),w(we,R,null),z(R,ts),z(R,Sl),n(e,rt,l),w(me,e,l),n(e,Mt,l),n(e,Te,l),n(e,pt,l),w(ue,e,l),n(e,ct,l),n(e,Je,l),n(e,wt,l),n(e,de,l),n(e,mt,l),w(ye,e,l),n(e,Tt,l),n(e,je,l),n(e,ut,l),n(e,N,l),n(e,Jt,l),n(e,F,l),n(e,dt,l),w(he,e,l),n(e,yt,l),n(e,fe,l),n(e,jt,l),w(Ue,e,l),n(e,ht,l),n(e,Ie,l),n(e,ft,l),n(e,ve,l),n(e,Ut,l),w(be,e,l),n(e,It,l),n(e,Ce,l),n(e,vt,l),w(ge,e,l),n(e,bt,l),n(e,P,l),n(e,Ct,l),w(Ae,e,l),n(e,gt,l),n(e,$e,l),n(e,At,l),w(Be,e,l),n(e,$t,l),n(e,Ze,l),n(e,Bt,l),w(_e,e,l),n(e,Zt,l),n(e,ke,l),n(e,_t,l),n(e,Ee,l),n(e,kt,l),w(Ge,e,l),n(e,Et,l),n(e,Ve,l),n(e,Gt,l),w(We,e,l),n(e,Vt,l),n(e,xe,l),n(e,Wt,l),w(Re,e,l),n(e,xt,l),n(e,Ne,l),n(e,Rt,l),n(e,O,l),n(e,Nt,l),w(Qe,e,l),n(e,Qt,l),n(e,Xe,l),n(e,Xt,l),n(e,He,l),n(e,Ht,l),w(Ye,e,l),n(e,Yt,l),n(e,Le,l),n(e,Lt,l),n(e,Q,l),z(Q,ql),z(Q,ns),w(ze,Q,null),z(Q,ss),z(Q,Fl),n(e,zt,l),w(Se,e,l),n(e,St,l),n(e,qe,l),n(e,qt,l),w(Fe,e,l),n(e,Ft,l),n(e,Pe,l),n(e,Pt,l),n(e,Oe,l),n(e,Ot,l),w(Ke,e,l),n(e,Kt,l),w(De,e,l),n(e,Dt,l),n(e,el,l),n(e,en,l),w(ll,e,l),n(e,ln,l),w(K,e,l),n(e,tn,l),w(tl,e,l),n(e,nn,l),n(e,nl,l),n(e,sn,l),n(e,X,l),n(e,an,l),n(e,D,l),n(e,on,l),n(e,sl,l),n(e,rn,l),w(al,e,l),n(e,Mn,l),n(e,ol,l),n(e,pn,l),n(e,il,l),n(e,cn,l),w(rl,e,l),n(e,wn,l),n(e,Ml,l),n(e,mn,l),w(pl,e,l),n(e,Tn,l),n(e,cl,l),n(e,un,l),w(wl,e,l),n(e,Jn,l),n(e,ml,l),n(e,dn,l),n(e,Tl,l),n(e,yn,l),w(ul,e,l),n(e,jn,l),n(e,Jl,l),n(e,hn,l),w(dl,e,l),n(e,fn,l),n(e,yl,l),n(e,Un,l),w(jl,e,l),n(e,In,l),w(hl,e,l),n(e,vn,l),w(fl,e,l),n(e,bn,l),w(Ul,e,l),n(e,Cn,l),n(e,ee,l),n(e,gn,l),w(Il,e,l),n(e,An,l),n(e,vl,l),n(e,$n,l),w(le,e,l),n(e,Bn,l),w(bl,e,l),n(e,Zn,l),n(e,Cl,l),n(e,_n,l),n(e,gl,l),n(e,kn,l),w(Al,e,l),n(e,En,l),n(e,$l,l),n(e,Gn,l),n(e,Bl,l),n(e,Vn,l),n(e,Zl,l),n(e,Wn,l),w(_l,e,l),n(e,xn,l),n(e,te,l),n(e,Rn,l),w(kl,e,l),n(e,Nn,l),n(e,ne,l),n(e,Qn,l),w(El,e,l),n(e,Xn,l),n(e,Gl,l),n(e,Hn,l),n(e,Vl,l),n(e,Yn,l),n(e,Wl,l),n(e,Ln,l),w(xl,e,l),n(e,zn,l),n(e,Rl,l),n(e,Sn,l),n(e,Nl,l),n(e,qn,l),n(e,Ql,l),n(e,Fn,l),w(Xl,e,l),n(e,Pn,l),n(e,Hl,l),n(e,On,l),w(Yl,e,l),n(e,Kn,l),w(Ll,e,l),n(e,Dn,l),n(e,Ol,l),es=!0},p(e,[l]){const Y={};l&2&&(Y.$$scope={dirty:l,ctx:e}),K.$set(Y);const L={};l&2&&(L.$$scope={dirty:l,ctx:e}),le.$set(L)},i(e){es||(m(d.$$.fragment,e),m(y.$$.fragment,e),m(x.$$.fragment,e),m(Z.$$.fragment,e),m(re.$$.fragment,e),m(ce.$$.fragment,e),m(we.$$.fragment,e),m(me.$$.fragment,e),m(ue.$$.fragment,e),m(ye.$$.fragment,e),m(he.$$.fragment,e),m(Ue.$$.fragment,e),m(be.$$.fragment,e),m(ge.$$.fragment,e),m(Ae.$$.fragment,e),m(Be.$$.fragment,e),m(_e.$$.fragment,e),m(Ge.$$.fragment,e),m(We.$$.fragment,e),m(Re.$$.fragment,e),m(Qe.$$.fragment,e),m(Ye.$$.fragment,e),m(ze.$$.fragment,e),m(Se.$$.fragment,e),m(Fe.$$.fragment,e),m(Ke.$$.fragment,e),m(De.$$.fragment,e),m(ll.$$.fragment,e),m(K.$$.fragment,e),m(tl.$$.fragment,e),m(al.$$.fragment,e),m(rl.$$.fragment,e),m(pl.$$.fragment,e),m(wl.$$.fragment,e),m(ul.$$.fragment,e),m(dl.$$.fragment,e),m(jl.$$.fragment,e),m(hl.$$.fragment,e),m(fl.$$.fragment,e),m(Ul.$$.fragment,e),m(Il.$$.fragment,e),m(le.$$.fragment,e),m(bl.$$.fragment,e),m(Al.$$.fragment,e),m(_l.$$.fragment,e),m(kl.$$.fragment,e),m(El.$$.fragment,e),m(xl.$$.fragment,e),m(Xl.$$.fragment,e),m(Yl.$$.fragment,e),m(Ll.$$.fragment,e),es=!0)},o(e){T(d.$$.fragment,e),T(y.$$.fragment,e),T(x.$$.fragment,e),T(Z.$$.fragment,e),T(re.$$.fragment,e),T(ce.$$.fragment,e),T(we.$$.fragment,e),T(me.$$.fragment,e),T(ue.$$.fragment,e),T(ye.$$.fragment,e),T(he.$$.fragment,e),T(Ue.$$.fragment,e),T(be.$$.fragment,e),T(ge.$$.fragment,e),T(Ae.$$.fragment,e),T(Be.$$.fragment,e),T(_e.$$.fragment,e),T(Ge.$$.fragment,e),T(We.$$.fragment,e),T(Re.$$.fragment,e),T(Qe.$$.fragment,e),T(Ye.$$.fragment,e),T(ze.$$.fragment,e),T(Se.$$.fragment,e),T(Fe.$$.fragment,e),T(Ke.$$.fragment,e),T(De.$$.fragment,e),T(ll.$$.fragment,e),T(K.$$.fragment,e),T(tl.$$.fragment,e),T(al.$$.fragment,e),T(rl.$$.fragment,e),T(pl.$$.fragment,e),T(wl.$$.fragment,e),T(ul.$$.fragment,e),T(dl.$$.fragment,e),T(jl.$$.fragment,e),T(hl.$$.fragment,e),T(fl.$$.fragment,e),T(Ul.$$.fragment,e),T(Il.$$.fragment,e),T(le.$$.fragment,e),T(bl.$$.fragment,e),T(Al.$$.fragment,e),T(_l.$$.fragment,e),T(kl.$$.fragment,e),T(El.$$.fragment,e),T(xl.$$.fragment,e),T(Xl.$$.fragment,e),T(Yl.$$.fragment,e),T(Ll.$$.fragment,e),es=!1},d(e){e&&(t(C),t(U),t(j),t(J),t(v),t(h),t(g),t(b),t(I),t(_),t(W),t(V),t(H),t(f),t(et),t(ie),t(lt),t(tt),t(Me),t(nt),t(S),t(st),t(q),t(at),t(pe),t(ot),t(it),t(R),t(rt),t(Mt),t(Te),t(pt),t(ct),t(Je),t(wt),t(de),t(mt),t(Tt),t(je),t(ut),t(N),t(Jt),t(F),t(dt),t(yt),t(fe),t(jt),t(ht),t(Ie),t(ft),t(ve),t(Ut),t(It),t(Ce),t(vt),t(bt),t(P),t(Ct),t(gt),t($e),t(At),t($t),t(Ze),t(Bt),t(Zt),t(ke),t(_t),t(Ee),t(kt),t(Et),t(Ve),t(Gt),t(Vt),t(xe),t(Wt),t(xt),t(Ne),t(Rt),t(O),t(Nt),t(Qt),t(Xe),t(Xt),t(He),t(Ht),t(Yt),t(Le),t(Lt),t(Q),t(zt),t(St),t(qe),t(qt),t(Ft),t(Pe),t(Pt),t(Oe),t(Ot),t(Kt),t(Dt),t(el),t(en),t(ln),t(tn),t(nn),t(nl),t(sn),t(X),t(an),t(D),t(on),t(sl),t(rn),t(Mn),t(ol),t(pn),t(il),t(cn),t(wn),t(Ml),t(mn),t(Tn),t(cl),t(un),t(Jn),t(ml),t(dn),t(Tl),t(yn),t(jn),t(Jl),t(hn),t(fn),t(yl),t(Un),t(In),t(vn),t(bn),t(Cn),t(ee),t(gn),t(An),t(vl),t($n),t(Bn),t(Zn),t(Cl),t(_n),t(gl),t(kn),t(En),t($l),t(Gn),t(Bl),t(Vn),t(Zl),t(Wn),t(xn),t(te),t(Rn),t(Nn),t(ne),t(Qn),t(Xn),t(Gl),t(Hn),t(Vl),t(Yn),t(Wl),t(Ln),t(zn),t(Rl),t(Sn),t(Nl),t(qn),t(Ql),t(Fn),t(Pn),t(Hl),t(On),t(Kn),t(Dn),t(Ol)),t(M),u(d,e),u(y,e),u(x,e),u(Z,e),u(re,e),u(ce,e),u(we),u(me,e),u(ue,e),u(ye,e),u(he,e),u(Ue,e),u(be,e),u(ge,e),u(Ae,e),u(Be,e),u(_e,e),u(Ge,e),u(We,e),u(Re,e),u(Qe,e),u(Ye,e),u(ze),u(Se,e),u(Fe,e),u(Ke,e),u(De,e),u(ll,e),u(K,e),u(tl,e),u(al,e),u(rl,e),u(pl,e),u(wl,e),u(ul,e),u(dl,e),u(jl,e),u(hl,e),u(fl,e),u(Ul,e),u(Il,e),u(le,e),u(bl,e),u(Al,e),u(_l,e),u(kl,e),u(El,e),u(xl,e),u(Xl,e),u(Yl,e),u(Ll,e)}}}const Za='{"title":"OpenEnv Integration for Training LLMs with Environments","local":"openenv-integration-for-training-llms-with-environments","sections":[{"title":"When to use environments","local":"when-to-use-environments","sections":[],"depth":2},{"title":"Installation","local":"installation","sections":[],"depth":2},{"title":"Quick start","local":"quick-start","sections":[],"depth":2},{"title":"How environment_factory works","local":"how-environmentfactory-works","sections":[{"title":"Environment class requirements","local":"environment-class-requirements","sections":[],"depth":3},{"title":"Tips for environment classes","local":"tips-for-environment-classes","sections":[],"depth":3},{"title":"Reward functions","local":"reward-functions","sections":[],"depth":3},{"title":"Tips for reward functions","local":"tips-for-reward-functions","sections":[],"depth":3},{"title":"max_completion_length in multi-turn episodes","local":"maxcompletionlength-in-multi-turn-episodes","sections":[],"depth":3}],"depth":2},{"title":"Advanced example: Wordle","local":"advanced-example-wordle","sections":[{"title":"The TextArena Environment","local":"the-textarena-environment","sections":[],"depth":3},{"title":"Why Wordle?","local":"why-wordle","sections":[],"depth":3},{"title":"Environment class","local":"environment-class","sections":[],"depth":3},{"title":"Reward function and training","local":"reward-function-and-training","sections":[],"depth":3},{"title":"Running the example","local":"running-the-example","sections":[],"depth":3},{"title":"Results","local":"results","sections":[],"depth":3}],"depth":2},{"title":"Multi-environment training","local":"multi-environment-training","sections":[{"title":"How it works","local":"how-it-works","sections":[],"depth":3},{"title":"Example: Wordle + Catch","local":"example-wordle--catch","sections":[],"depth":3},{"title":"Per-environment reward functions","local":"per-environment-reward-functions","sections":[],"depth":3},{"title":"Dataset with environment routing","local":"dataset-with-environment-routing","sections":[],"depth":3},{"title":"Running the multi-environment example","local":"running-the-multi-environment-example","sections":[],"depth":3}],"depth":2},{"title":"Running the environments","local":"running-the-environments","sections":[],"depth":2},{"title":"Environments catalog","local":"environments-catalog","sections":[],"depth":2},{"title":"Server concurrency","local":"server-concurrency","sections":[],"depth":2},{"title":"environment_factory vs rollout_func","local":"environmentfactory-vs-rolloutfunc","sections":[{"title":"Migrating from rollout_func to environment_factory","local":"migrating-from-rolloutfunc-to-environmentfactory","sections":[],"depth":3}],"depth":2}],"depth":1}';function _a(G){return da(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class xa extends ya{constructor(M){super(),ja(this,M,_a,Ba,Ja,{})}}export{xa as component}; | |
Xet Storage Details
- Size:
- 93.8 kB
- Xet hash:
- 91e2659cfb559067cd635a15b7a116e7dda444f10b7b23b384dcd8e08e07df13
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.