| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| import os |
|
|
| import ray |
| from hydra import compose, initialize_config_dir |
| from torchdata.stateful_dataloader import StatefulDataLoader |
| from transformers import AutoTokenizer |
|
|
| from tests.experimental.agent_loop.agent_utils import init_agent_loop_manager |
| from verl.protocol import DataProto |
| from verl.trainer.main_ppo import create_rl_sampler |
| from verl.utils.dataset.rl_dataset import RLHFDataset, collate_fn |
|
|
|
|
def test_agent_reward_loop_standalone():
    """Run one batch through the agent loop with the "remote" reward manager.

    The test composes the ``ppo_trainer`` Hydra config, overrides the data,
    rollout, trainer and reward sections, builds an agent loop manager from
    it, generates sequences for a single batch of 64 prompts and prints the
    batch mean of the per-sample summed ``rm_scores``.

    It expects the following to exist on the machine running the test:

    * the rollout model under ``~/models/Qwen/Qwen2.5-1.5B-Instruct``
    * the dataset file ``~/data/math/train.parquet``
    * the config directory ``verl/trainer/config`` relative to the current
      working directory

    The Ray runtime started here is always shut down, even when a step in
    between raises, so a failure does not leak a live Ray session into
    other tests running in the same process.
    """
    ray.init(
        runtime_env={
            "env_vars": {
                "TOKENIZERS_PARALLELISM": "true",
                "NCCL_DEBUG": "WARN",
                "VLLM_LOGGING_LEVEL": "INFO",
                "VLLM_USE_V1": "1",
            }
        }
    )
    try:
        with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config")):
            config = compose(config_name="ppo_trainer")

        rollout_model_path = os.path.expanduser("~/models/Qwen/Qwen2.5-1.5B-Instruct")

        # Data and actor/rollout overrides.
        config.data.return_raw_chat = True
        config.data.max_prompt_length = 1024
        config.data.max_response_length = 4096
        config.actor_rollout_ref.model.path = rollout_model_path
        config.actor_rollout_ref.actor.use_dynamic_bsz = True
        # The rollout backend can be switched through the ROLLOUT_NAME env var.
        config.actor_rollout_ref.rollout.name = os.getenv("ROLLOUT_NAME", "vllm")
        config.actor_rollout_ref.rollout.mode = "async"
        config.actor_rollout_ref.rollout.tensor_model_parallel_size = 2
        config.actor_rollout_ref.rollout.gpu_memory_utilization = 0.9
        config.actor_rollout_ref.rollout.enforce_eager = True
        config.actor_rollout_ref.rollout.prompt_length = 2048
        config.actor_rollout_ref.rollout.response_length = 4096
        config.actor_rollout_ref.rollout.skip_tokenizer_init = True
        config.trainer.n_gpus_per_node = 8
        config.trainer.nnodes = 1

        # Reward overrides: "remote" reward manager with a custom scoring function.
        config.reward.reward_manager.name = "remote"
        config.reward.num_workers = 2
        config.reward.custom_reward_function.path = "tests/experimental/reward_loop/reward_fn.py"
        config.reward.custom_reward_function.name = "compute_score_math_verify"

        # 1. Build the agent loop manager from the overridden config.
        agent_loop_manager = init_agent_loop_manager(config)

        # 2. Build the dataset and dataloader used as prompt source.
        local_folder = os.path.expanduser("~/data/math/")
        data_files = [os.path.join(local_folder, "train.parquet")]
        tokenizer = AutoTokenizer.from_pretrained(rollout_model_path)

        dataset = RLHFDataset(
            data_files=data_files,
            tokenizer=tokenizer,
            config=config.data,
            processor=None,
        )

        batch_size = 64
        sampler = create_rl_sampler(config.data, dataset)
        dataloader = StatefulDataLoader(
            dataset=dataset,
            batch_size=batch_size,
            num_workers=config.data.dataloader_num_workers,
            drop_last=True,
            collate_fn=collate_fn,
            sampler=sampler,
        )

        # 3. Generate sequences for the first batch only.
        batch_dict = next(iter(dataloader))
        batch = DataProto.from_single_dict(batch_dict)
        gen_batch = agent_loop_manager.generate_sequences(prompts=batch)

        # Sum the scores over the last dimension, then average over the rest.
        # NOTE(review): presumably rm_scores is (batch, response_len) with the
        # score on a single token, making this a mean accuracy - confirm.
        rm_scores = gen_batch.batch["rm_scores"]
        accuracy = rm_scores.sum(dim=-1).mean()
        print(accuracy)
    finally:
        # Always tear Ray down so a failure above does not leave the
        # runtime alive for whatever runs next in this process.
        ray.shutdown()
|
|