LeTue09
/

arithmetic-grpo

Model card Files Files and versions

arithmetic-grpo / tests /experimental /reward_loop /test_math_verify.py

LeTue09's picture

initial clean commit

1faccd4 about 1 month ago

history blame contribute delete

3.71 kB

	# Copyright 2024 Bytedance Ltd. and/or its affiliates
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	import os

	import ray
	from hydra import compose, initialize_config_dir
	from torchdata.stateful_dataloader import StatefulDataLoader
	from transformers import AutoTokenizer

	from tests.experimental.agent_loop.agent_utils import init_agent_loop_manager
	from verl.protocol import DataProto
	from verl.trainer.main_ppo import create_rl_sampler
	from verl.utils.dataset.rl_dataset import RLHFDataset, collate_fn


	def test_agent_reward_loop_standalone():
	    """Run one rollout batch through the agent loop and print the mean reward.

	    Steps performed:
	      1. Start Ray with a fixed set of runtime environment variables.
	      2. Compose the ``ppo_trainer`` Hydra config from ``verl/trainer/config``
	         (resolved relative to the current working directory) and override the
	         data, rollout, trainer and reward settings in place.
	      3. Build the agent loop manager via ``init_agent_loop_manager``.
	      4. Load a single batch of 64 samples from ``~/data/math/train.parquet``.
	      5. Call ``generate_sequences`` and print ``rm_scores`` summed over the
	         last dimension and averaged over the remaining ones.

	    The function only prints the resulting value; it does not assert on it.

	    NOTE(review): presumably requires the Qwen2.5-1.5B-Instruct checkpoint
	    under ``~/models`` and a node with 8 GPUs (``n_gpus_per_node = 8``) --
	    confirm against the CI environment.
	    """
	    ray.init(
	        runtime_env={
	            "env_vars": {
	                "TOKENIZERS_PARALLELISM": "true",
	                "NCCL_DEBUG": "WARN",
	                "VLLM_LOGGING_LEVEL": "INFO",
	                "VLLM_USE_V1": "1",
	            }
	        }
	    )
	    # Everything after ray.init runs under try/finally so the Ray runtime is
	    # torn down even when setup or generation raises; otherwise a failure
	    # here would leave Ray initialised for subsequent tests in the process.
	    try:
	        with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config")):
	            config = compose(config_name="ppo_trainer")

	        rollout_model_path = os.path.expanduser("~/models/Qwen/Qwen2.5-1.5B-Instruct")

	        # actor_rollout_ref config
	        config.data.return_raw_chat = True
	        config.data.max_prompt_length = 1024
	        config.data.max_response_length = 4096
	        config.actor_rollout_ref.model.path = rollout_model_path
	        config.actor_rollout_ref.actor.use_dynamic_bsz = True
	        # Rollout backend can be switched through the ROLLOUT_NAME env var.
	        config.actor_rollout_ref.rollout.name = os.getenv("ROLLOUT_NAME", "vllm")
	        config.actor_rollout_ref.rollout.mode = "async"
	        config.actor_rollout_ref.rollout.tensor_model_parallel_size = 2
	        config.actor_rollout_ref.rollout.gpu_memory_utilization = 0.9
	        config.actor_rollout_ref.rollout.enforce_eager = True
	        config.actor_rollout_ref.rollout.prompt_length = 2048
	        config.actor_rollout_ref.rollout.response_length = 4096
	        config.actor_rollout_ref.rollout.skip_tokenizer_init = True
	        config.trainer.n_gpus_per_node = 8
	        config.trainer.nnodes = 1

	        # reward config: score responses with the custom function from reward_fn.py
	        config.reward.reward_manager.name = "remote"
	        config.reward.num_workers = 2
	        config.reward.custom_reward_function.path = "tests/experimental/reward_loop/reward_fn.py"
	        config.reward.custom_reward_function.name = "compute_score_math_verify"

	        # 1. init agent loop manager
	        agent_loop_manager = init_agent_loop_manager(config)

	        # 2. init test data
	        local_folder = os.path.expanduser("~/data/math/")
	        data_files = [os.path.join(local_folder, "train.parquet")]
	        tokenizer = AutoTokenizer.from_pretrained(rollout_model_path)

	        dataset = RLHFDataset(
	            data_files=data_files,
	            tokenizer=tokenizer,
	            config=config.data,
	            processor=None,
	        )

	        batch_size = 64
	        sampler = create_rl_sampler(config.data, dataset)
	        dataloader = StatefulDataLoader(
	            dataset=dataset,
	            batch_size=batch_size,
	            num_workers=config.data.dataloader_num_workers,
	            drop_last=True,
	            collate_fn=collate_fn,
	            sampler=sampler,
	        )

	        # 3. generate responses (only the first batch is used)
	        batch_dict = next(iter(dataloader))
	        batch = DataProto.from_single_dict(batch_dict)
	        gen_batch = agent_loop_manager.generate_sequences(prompts=batch)

	        # Sum scores over the last dimension, then average the result.
	        rm_scores = gen_batch.batch["rm_scores"]
	        accuracy = rm_scores.sum(dim=-1).mean()
	        print(accuracy)
	    finally:
	        ray.shutdown()