Implement MCSD for experimental SDPO

1fa3c6c verified 24 days ago

9.13 kB

	# Copyright 2020-2026 The HuggingFace Team. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	# /// script
	# dependencies = [
	# "trl",
	# "trackio",
	# "openenv-textarena @ git+https://huggingface.co/spaces/openenv/wordle",
	# ]
	# ///


	"""
	Simple script to run GRPO training with OpenEnv's Wordle environment and vLLM.

	Setup (Option A - Install from HF Space, recommended):

	```sh
	uv pip install git+https://huggingface.co/spaces/openenv/wordle
	```

	# Option 1: HF Spaces + Colocated vLLM (1 GPU required)
	```sh
	python examples/scripts/openenv/wordle.py --vllm-mode colocate
	```

	# Option 2: HF Spaces + Separate vLLM server (2 GPUs required)

	# Spin up vLLM server (Terminal 1)
	```sh
	CUDA_VISIBLE_DEVICES=0 trl vllm-serve --model Qwen/Qwen3-1.7B --host 0.0.0.0 --port 8000
	```

	# Run training (Terminal 2)
	```sh
	CUDA_VISIBLE_DEVICES=1 python examples/scripts/openenv/wordle.py --vllm-mode server --vllm-server-url http://localhost:8000
	```

	# Option 3: Local Environment + Colocated vLLM (1 GPU required)

	To run the Wordle environment locally, you have several options:

	## Option 3a: Using Docker Image (Recommended)

	First, build the Docker image from the textarena_env directory:
	```sh
	cd 3rd_party/OpenEnv/envs/textarena_env
	docker build -t textarena-env:latest -f server/Dockerfile .
	```

	Then run the environment server:
	```sh
	docker run -d -p 8001:8001 textarena-env:latest
	```

	Finally, run training pointing to local server:
	```sh
	python examples/scripts/openenv/wordle.py --vllm-mode colocate --env-url http://localhost:8001
	```

	## Option 3b: Running Server Directly

	From the textarena_env directory:
	```sh
	cd 3rd_party/OpenEnv/envs/textarena_env
	uv venv && source .venv/bin/activate
	uv pip install -e .
	python -m uvicorn server.app:app --reload --port 8001
	```

	Then in another terminal, run training:
	```sh
	python examples/scripts/openenv/wordle.py --vllm-mode colocate --env-url http://localhost:8001
	```

	## Option 3c: Using Pre-built HF Space Image

	```sh
	docker run -d -p 8001:8001 registry.hf.space/burtenshaw-wordle:latest
	python examples/scripts/openenv/wordle.py --vllm-mode colocate --env-url http://localhost:8001
	```
	"""

	import argparse

	from datasets import Dataset
	from textarena_env import TextArenaAction, TextArenaEnv

	from trl import GRPOConfig, GRPOTrainer, RichProgressCallback


	def parse_args() -> argparse.Namespace:
	parser = argparse.ArgumentParser(
	description="Run GRPO training for Wordle using the TextArena OpenEnv environment."
	)
	parser.add_argument(
	"--model",
	type=str,
	default="Qwen/Qwen3-1.7B",
	help="Model identifier passed to GRPOTrainer for fine-tuning.",
	)
	parser.add_argument(
	"--env-url",
	type=str,
	default="https://openenv-wordle.hf.space",
	help="URL for the environment server.",
	)
	parser.add_argument(
	"--dataset-size",
	type=int,
	default=1000,
	help="Number of entries to include in the synthetic training dataset.",
	)
	parser.add_argument(
	"--num-generations",
	type=int,
	default=4,
	help="Number of rollout generations per dataset prompt.",
	)
	parser.add_argument(
	"--num-epochs",
	type=int,
	default=1,
	help="Number of training epochs.",
	)
	parser.add_argument(
	"--learning-rate",
	type=float,
	default=1e-6,
	help="Learning rate for GRPO training.",
	)
	parser.add_argument(
	"--gradient-accumulation-steps",
	type=int,
	default=64,
	help="Gradient accumulation steps for GRPO training.",
	)
	parser.add_argument(
	"--logging-steps",
	type=int,
	default=1,
	help="Frequency of logging steps for GRPO training.",
	)
	parser.add_argument(
	"--output-dir",
	type=str,
	default=None,
	help="Directory where training outputs and checkpoints are stored.",
	)
	parser.add_argument(
	"--trackio-space-id",
	type=str,
	default="wordle-grpo",
	help="Trackio space identifier.",
	)
	parser.add_argument(
	"--vllm-mode",
	choices=("colocate", "server"),
	default="colocate",
	help="vLLM execution mode: 'colocate' or 'server'.",
	)
	parser.add_argument(
	"--vllm-server-url",
	type=str,
	default="http://localhost:8000",
	help="URL for the vLLM server (only used when --vllm-mode=server).",
	)
	return parser.parse_args()


	# Instruction text for the Wordle task. `main` uses it as the content of the single user message of every
	# example in the synthetic training dataset. Rule 6 names the `guess` tool, matching the `guess` method of the
	# environment wrapper defined in `main`. This is runtime text sent to the model: edit with care.
	prompt = """You are an expert Wordle solver with deep knowledge of English vocabulary, letter frequency patterns, and optimal guessing strategies.

	Follow these rules to play Wordle:

	1. The target is a 5-letter English word
	2. You have 6 attempts to guess the correct word
	3. After each guess, you receive color-coded feedback:
	- GREEN (G): Letter is correct and in the correct position
	- YELLOW (Y): Letter is in the word but in the wrong position
	- GRAY (X): Letter is not in the word at all
	4. All guesses must be valid 5-letter English words
	5. You cannot reuse a word you've already guessed
	6. Use the tool `guess` to make a guess.
	"""


	def reward_func(environments, **kwargs) -> list[float]:
	    """
	    Collect the current reward of every rollout environment.

	    Args:
	        environments: Iterable of environment objects, each exposing a `reward` attribute.
	        **kwargs: Accepted for call compatibility and ignored.

	    Returns:
	        The `reward` value of each environment, in the same order as `environments`.
	    """
	    rewards = []
	    for environment in environments:
	        rewards.append(environment.reward)
	    return rewards


	def main() -> None:
	    """
	    Train a model with GRPO on the Wordle environment.

	    Parses the command-line options, defines the per-rollout environment wrapper, builds a synthetic dataset
	    that repeats the Wordle prompt `--dataset-size` times, and runs `GRPOTrainer.train()`.
	    """
	    args = parse_args()

	    env_url = args.env_url

	    # Environment wrapper handed to the trainer as `environment_factory`. It is defined inside `main` so that it
	    # can close over `env_url`, since `__init__` takes no arguments.
	    class WordleEnv:
	        def __init__(self):
	            self.client = TextArenaEnv(base_url=env_url)

	        # Starts a new game: resets the remote environment, clears `reward` and `done`, and returns the initial
	        # observation text.
	        def reset(self, **kwargs) -> str | None:
	            result = self.client.reset()
	            # The game returns cumulative feedback each turn (new text appended at the end), so
	            # we store the previous full response and slice out only the newly appended part.
	            self._last_full_feedback = result.observation.messages[0].content
	            self.reward = 0.0
	            self.done = False
	            return self._last_full_feedback

	        # NOTE(review): the prompt refers to this method as the tool `guess`; presumably its docstring is
	        # surfaced to the model as the tool description, so its wording is kept unchanged. Confirm before editing.
	        def guess(self, guess: str) -> str:
	            """
	            Make a guess in the Wordle environment.

	            Args:
	                guess: The guessed word, formatted as '[abcde]'

	            Returns:
	                The feedback message from the environment.
	            """
	            if self.done:
	                raise ValueError("Game over.")
	            result = self.client.step(TextArenaAction(message=guess))
	            _full_feedback = result.observation.messages[0].content
	            # Just take the new feedback since the last guess, which is the part appended to the end of the
	            # full feedback.
	            feedback = _full_feedback[len(self._last_full_feedback) :]
	            self._last_full_feedback = _full_feedback
	            # For some reason, the environment doesn't penalize invalid moves and just returns the last reward.
	            # We check the feedback for the invalid move message and penalize it if found.
	            if "You attempted an invalid move" in feedback:
	                self.reward = 0.0
	            else:
	                self.reward = result.reward
	            self.done = result.done
	            return feedback

	    # Default output directory is derived from the last path component of the model identifier.
	    output_dir = args.output_dir or f"{args.model.split('/')[-1]}-wordle-GRPO"
	    # Every row is the same single-turn conversation; the rows only differ through the environment rollouts.
	    dataset = Dataset.from_dict({"prompt": [[{"role": "user", "content": prompt}] for _ in range(args.dataset_size)]})

	    trainer = GRPOTrainer(
	        model=args.model,
	        reward_funcs=reward_func,
	        train_dataset=dataset,
	        args=GRPOConfig(
	            output_dir=output_dir,
	            use_vllm=True,
	            vllm_mode=args.vllm_mode,
	            # The server URL is only forwarded in server mode; colocate mode passes None.
	            vllm_server_base_url=args.vllm_server_url if args.vllm_mode == "server" else None,
	            report_to="trackio",
	            trackio_space_id=args.trackio_space_id,
	            log_completions=True,
	            num_completions_to_print=2,
	            logging_steps=args.logging_steps,
	            num_train_epochs=args.num_epochs,
	            num_generations=args.num_generations,
	            learning_rate=args.learning_rate,
	            gradient_accumulation_steps=args.gradient_accumulation_steps,
	            chat_template_kwargs={"enable_thinking": False},
	            max_completion_length=1024,
	        ),
	        environment_factory=WordleEnv,
	        callbacks=[RichProgressCallback()],
	    )
	    trainer.train()


	# Script entry point: start training only when this file is executed directly, not when it is imported.
	if __name__ == "__main__":
	    main()