|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| """
|
| Simple script to run GRPO training with OpenEnv's Wordle environment and vLLM.
|
|
|
| Setup (install the environment client from the HF Space, recommended):
|
|
|
| ```sh
|
| uv pip install git+https://huggingface.co/spaces/openenv/wordle
|
| ```
|
|
|
| # Option 1: HF Spaces + Colocated vLLM (1 GPU required)
|
| ```sh
|
| python examples/scripts/openenv/wordle.py --vllm-mode colocate
|
| ```
|
|
|
| # Option 2: HF Spaces + Separate vLLM server (2 GPUs required)
|
|
|
| # Spin up vLLM server (Terminal 1)
|
| ```sh
|
| CUDA_VISIBLE_DEVICES=0 trl vllm-serve --model Qwen/Qwen3-1.7B --host 0.0.0.0 --port 8000
|
| ```
|
|
|
| # Run training (Terminal 2)
|
| ```sh
|
| CUDA_VISIBLE_DEVICES=1 python examples/scripts/openenv/wordle.py --vllm-mode server --vllm-server-url http://localhost:8000
|
| ```
|
|
|
| # Option 3: Local Environment + Colocated vLLM (1 GPU required)
|
|
|
| To run the Wordle environment locally, you have several options:
|
|
|
| ## Option 3a: Using Docker Image (Recommended)
|
|
|
| First, build the Docker image from the textarena_env directory:
|
| ```sh
|
| cd 3rd_party/OpenEnv/envs/textarena_env
|
| docker build -t textarena-env:latest -f server/Dockerfile .
|
| ```
|
|
|
| Then run the environment server:
|
| ```sh
|
| docker run -d -p 8001:8001 textarena-env:latest
|
| ```
|
|
|
| Finally, run training pointing to local server:
|
| ```sh
|
| python examples/scripts/openenv/wordle.py --vllm-mode colocate --env-url http://localhost:8001
|
| ```
|
|
|
| ## Option 3b: Running Server Directly
|
|
|
| From the textarena_env directory:
|
| ```sh
|
| cd 3rd_party/OpenEnv/envs/textarena_env
|
| uv venv && source .venv/bin/activate
|
| uv pip install -e .
|
| python -m uvicorn server.app:app --reload --port 8001
|
| ```
|
|
|
| Then in another terminal, run training:
|
| ```sh
|
| python examples/scripts/openenv/wordle.py --vllm-mode colocate --env-url http://localhost:8001
|
| ```
|
|
|
| ## Option 3c: Using Pre-built HF Space Image
|
|
|
| ```sh
|
| docker run -d -p 8001:8001 registry.hf.space/burtenshaw-wordle:latest
|
| python examples/scripts/openenv/wordle.py --vllm-mode colocate --env-url http://localhost:8001
|
| ```
|
| """
|
|
|
| import argparse
|
|
|
| from datasets import Dataset
|
| from textarena_env import TextArenaAction, TextArenaEnv
|
|
|
| from trl import GRPOConfig, GRPOTrainer, RichProgressCallback
|
|
|
|
|
def parse_args() -> argparse.Namespace:
    """Build the command-line parser for this script and parse the process arguments.

    The options are declared in a table and registered in order, so the
    ``--help`` listing follows the table order.

    Returns:
        The parsed namespace with attributes ``model``, ``env_url``,
        ``dataset_size``, ``num_generations``, ``num_epochs``,
        ``learning_rate``, ``gradient_accumulation_steps``, ``logging_steps``,
        ``output_dir``, ``trackio_space_id``, ``vllm_mode`` and
        ``vllm_server_url``.
    """
    # (flag, keyword arguments forwarded verbatim to ``add_argument``)
    option_table = (
        (
            "--model",
            {
                "type": str,
                "default": "Qwen/Qwen3-1.7B",
                "help": "Model identifier passed to GRPOTrainer for fine-tuning.",
            },
        ),
        (
            "--env-url",
            {
                "type": str,
                "default": "https://openenv-wordle.hf.space",
                "help": "URL for the environment server.",
            },
        ),
        (
            "--dataset-size",
            {
                "type": int,
                "default": 1000,
                "help": "Number of entries to include in the synthetic training dataset.",
            },
        ),
        (
            "--num-generations",
            {
                "type": int,
                "default": 4,
                "help": "Number of rollout generations per dataset prompt.",
            },
        ),
        (
            "--num-epochs",
            {
                "type": int,
                "default": 1,
                "help": "Number of training epochs.",
            },
        ),
        (
            "--learning-rate",
            {
                "type": float,
                "default": 1e-6,
                "help": "Learning rate for GRPO training.",
            },
        ),
        (
            "--gradient-accumulation-steps",
            {
                "type": int,
                "default": 64,
                "help": "Gradient accumulation steps for GRPO training.",
            },
        ),
        (
            "--logging-steps",
            {
                "type": int,
                "default": 1,
                "help": "Frequency of logging steps for GRPO training.",
            },
        ),
        (
            "--output-dir",
            {
                "type": str,
                "default": None,
                "help": "Directory where training outputs and checkpoints are stored.",
            },
        ),
        (
            "--trackio-space-id",
            {
                "type": str,
                "default": "wordle-grpo",
                "help": "Trackio space identifier.",
            },
        ),
        (
            "--vllm-mode",
            {
                "choices": ("colocate", "server"),
                "default": "colocate",
                "help": "vLLM execution mode: 'colocate' or 'server'.",
            },
        ),
        (
            "--vllm-server-url",
            {
                "type": str,
                "default": "http://localhost:8000",
                "help": "URL for the vLLM server (only used when --vllm-mode=server).",
            },
        ),
    )

    cli = argparse.ArgumentParser(
        description="Run GRPO training for Wordle using the TextArena OpenEnv environment."
    )
    for flag, options in option_table:
        cli.add_argument(flag, **options)
    return cli.parse_args()
|
|
|
|
|
# Instruction text used as the content of the single user message in every
# row of the training dataset built in main(). Rule 6 names the `guess` tool,
# which corresponds to the WordleEnv.guess method defined in main().
prompt = """You are an expert Wordle solver with deep knowledge of English vocabulary, letter frequency patterns, and optimal guessing strategies.

Follow these rules to play Wordle:

1. The target is a 5-letter English word
2. You have 6 attempts to guess the correct word
3. After each guess, you receive color-coded feedback:
- GREEN (G): Letter is correct and in the correct position
- YELLOW (Y): Letter is in the word but in the wrong position
- GRAY (X): Letter is not in the word at all
4. All guesses must be valid 5-letter English words
5. You cannot reuse a word you've already guessed
6. Use the tool `guess` to make a guess.
"""
|
|
|
|
|
def reward_func(environments, **kwargs) -> list[float]:
    """Collect the current ``reward`` attribute of every environment.

    Args:
        environments: Iterable of objects exposing a ``reward`` attribute.
        **kwargs: Accepted for call compatibility and ignored.

    Returns:
        The ``reward`` values, one per environment, in iteration order.
    """
    collected = []
    for environment in environments:
        collected.append(environment.reward)
    return collected
|
|
|
|
|
def main() -> None:
    """Parse the CLI options, define the Wordle tool environment, and run GRPO training.

    Builds a dataset that repeats the module-level ``prompt`` as a single user
    message ``--dataset-size`` times, configures ``GRPOTrainer`` with vLLM
    generation and ``reward_func``, and calls ``trainer.train()``.
    """
    args = parse_args()

    # Kept in a local so the nested class below can capture it by closure:
    # ``WordleEnv.__init__`` takes no arguments of its own.
    env_url = args.env_url

    # One instance wraps one TextArena client connected to ``env_url``.
    # ``reward`` and ``done`` are plain attributes; ``reward_func`` reads ``reward``.
    class WordleEnv:
        def __init__(self):
            self.client = TextArenaEnv(base_url=env_url)

        def reset(self, **kwargs) -> str | None:
            # Start a new game, clear the per-episode state, and return the
            # text of the first observation message. ``kwargs`` is ignored.
            initial = self.client.reset()

            # Remember the full text seen so far; ``guess`` uses its length to
            # cut out only what a later observation adds at the end.
            self._last_full_feedback = initial.observation.messages[0].content
            self.reward = 0.0
            self.done = False
            return self._last_full_feedback

        def guess(self, guess: str) -> str:
            """
            Make a guess in the Wordle environment.

            Args:
                guess: The guessed word, formatted as '[abcde]'

            Returns:
                The feedback message from the environment.
            """
            if self.done:
                raise ValueError("Game over.")
            outcome = self.client.step(TextArenaAction(message=guess))
            full_text = outcome.observation.messages[0].content

            # Drop the leading part that is as long as the previously stored
            # text and keep the remainder (assumes each observation extends the
            # previous one with new text at the end).
            feedback = full_text[len(self._last_full_feedback) :]
            self._last_full_feedback = full_text

            # A turn the environment reports as an invalid move scores zero;
            # otherwise the reward reported by the environment is stored.
            rejected = "You attempted an invalid move" in feedback
            self.reward = 0.0 if rejected else outcome.reward
            self.done = outcome.done
            return feedback

    # Default output directory: last path component of the model id plus a suffix.
    output_dir = args.output_dir or f"{args.model.split('/')[-1]}-wordle-GRPO"

    # Every row is the same one-message conversation holding the Wordle instructions.
    conversations = [[{"role": "user", "content": prompt}] for _ in range(args.dataset_size)]
    dataset = Dataset.from_dict({"prompt": conversations})

    training_config = GRPOConfig(
        output_dir=output_dir,
        use_vllm=True,
        vllm_mode=args.vllm_mode,
        # The server URL is only forwarded in server mode.
        vllm_server_base_url=args.vllm_server_url if args.vllm_mode == "server" else None,
        report_to="trackio",
        trackio_space_id=args.trackio_space_id,
        log_completions=True,
        num_completions_to_print=2,
        logging_steps=args.logging_steps,
        num_train_epochs=args.num_epochs,
        num_generations=args.num_generations,
        learning_rate=args.learning_rate,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        # NOTE(review): presumably turns off the model's thinking mode in the
        # chat template -- confirm against the model's template.
        chat_template_kwargs={"enable_thinking": False},
        max_completion_length=1024,
    )

    trainer = GRPOTrainer(
        model=args.model,
        reward_funcs=reward_func,
        train_dataset=dataset,
        args=training_config,
        environment_factory=WordleEnv,
        callbacks=[RichProgressCallback()],
    )
    trainer.train()
|
|
|
|
|
# Run the training entry point only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|
|