# Copyright 2020-2026 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# /// script
# dependencies = [
#     "trl",
#     "openenv-echo-env @ git+https://huggingface.co/spaces/qgallouedec/echo_env",
# ]
# ///

"""
Simple script to run GRPO training with OpenEnv's Echo environment. The environment echoes back the message sent to it
and rewards longer completions.

Setup (Option A - Install from HF Space, recommended):

```sh
uv pip install git+https://huggingface.co/spaces/qgallouedec/echo_env
```

Setup (Option B - Clone OpenEnv repo, for development):

```sh
git clone https://github.com/meta-pytorch/OpenEnv.git
cd OpenEnv/envs/echo_env
uv pip install -e .
```

Usage:

```sh
python examples/scripts/openenv/echo.py
python examples/scripts/openenv/echo.py --model Qwen/Qwen2.5-0.5B-Instruct --env-host https://qgallouedec-echo-env.hf.space
```
"""

import argparse

from datasets import Dataset
from echo_env import EchoEnv
from echo_env.models import EchoAction

from trl import GRPOConfig, GRPOTrainer


# Command-line defaults, kept together so they are easy to spot and change.
DEFAULT_MODEL = "Qwen/Qwen3-0.6B"
DEFAULT_ENV_HOST = "https://qgallouedec-echo-env.hf.space"


def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the script.

    Returns:
        A namespace with `model` (model to train) and `env_host` (URL of the Echo environment HF Space).
    """
    cli = argparse.ArgumentParser(description="Run GRPO training with Echo environment.")
    cli.add_argument("--model", type=str, default=DEFAULT_MODEL, help="Model to use for training.")
    cli.add_argument(
        "--env-host", type=str, default=DEFAULT_ENV_HOST, help="URL for the Echo environment HF Space."
    )
    return cli.parse_args()


def reward_func(environments, **kwargs):
    """Return the `reward` attribute of every environment, in the order given.

    Args:
        environments: Iterable of environment objects exposing a `reward` attribute.
        **kwargs: Accepted for signature compatibility and ignored.

    Returns:
        A list with one reward per environment.
    """
    return [environment.reward for environment in environments]


def _build_dataset() -> Dataset:
    """Build the training dataset: one single-turn user conversation per request."""
    user_requests = [
        "Try to echo 'Hello World!' in the environment.",
        "Make the environment echo 'Goodbye World!'",
        "Can you ask the environment to echo 'TRL is great!'?",
        "What happens if you ask the environment to echo 'I love RLHF!'?",
        "Try to make the environment echo 'OpenEnv is awesome!'",
    ]
    conversations = [[{"role": "user", "content": request}] for request in user_requests]
    return Dataset.from_dict({"prompt": conversations})


def main():
    """Parse the CLI options, build the prompt dataset and run GRPO training against the Echo environment."""
    args = parse_args()
    dataset = _build_dataset()

    # Defined inside `main` because it closes over `args` to read the environment URL.
    # NOTE(review): the trainer presumably exposes the public methods of this class (other than `reset`) to the
    # model as tools, using their signatures and docstrings -- keep `echo` and its docstring stable and avoid
    # adding public helpers here; confirm against the TRL documentation.
    class EchoToolEnv:
        def __init__(self):
            # Client for the Echo environment, plus the reward of the most recent `echo` call.
            self.env = EchoEnv(base_url=args.env_host)
            self.reward = 0.0

        def reset(self, **kwargs) -> None | str:
            # Only the locally stored reward is cleared; no initial observation is returned.
            self.reward = 0.0
            return None

        def echo(self, message: str) -> str:
            """
            Echo the message back from the environment.

            Args:
                message: The message to echo

            Returns:
                The echoed message.
            """
            step_result = self.env.step(EchoAction(message=message))
            obs = step_result.observation
            # Remember the reward so that `reward_func` can read it back from this instance.
            self.reward = obs.reward
            return obs.echoed_message

    training_config = GRPOConfig(
        chat_template_kwargs={"enable_thinking": False},
        log_completions=True,
        logging_steps=2,
        num_completions_to_print=1,
    )
    trainer = GRPOTrainer(
        model=args.model,
        train_dataset=dataset,
        reward_funcs=reward_func,
        args=training_config,
        environment_factory=EchoToolEnv,
    )
    trainer.train()


if __name__ == "__main__":
    main()