hydra:
  searchpath:
    - pkg://verl/trainer/config

defaults:
  - ppo_megatron_trainer
  - _self_

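# Overlays verl's packaged ppo_megatron_trainer defaults (resolved via the hydra
# searchpath above); `_self_` is listed last so the overrides below take priority.
# A typical launch, assuming verl's standard entrypoint and this file placed on
# the config search path:
#   python -m verl.trainer.main_ppo --config-name <this_config>
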
trainer:
  logger: ['wandb', 'console']
  project_name: "megatron"
  critic_warmup: 0
  experiment_name: "round4_2400_distckpt_other-envs"
  n_gpus_per_node: 8
  nnodes: 4
  test_freq: 5
  save_freq: 5
  max_actor_ckpt_to_keep: 3
  total_epochs: 128
  val_before_train: true
  default_local_dir: /data/checkpoints/${trainer.project_name}/${trainer.experiment_name}

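# GRPO advantages without per-group std normalization, no KL shaping of the reward,
# and DAPO-style dynamic sampling: prompt groups with a degenerate `score` are
# filtered out, drawing up to 10 generation batches to fill one training batch.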
algorithm:
  adv_estimator: grpo
  use_kl_in_reward: false
  norm_adv_by_std_in_grpo: false
  filter_groups:
    enable: true
    metric: score
    max_num_gen_batches: 10
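  # Rollout correction via truncated token-level importance weights (ratios capped
  # at 2.0), compensating for the vLLM-vs-Megatron log-prob mismatch; rejection
  # sampling and the per-token veto stay disabled.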
  rollout_correction:
    rollout_is: token
    rollout_is_threshold: 2.0
    rollout_rs: null
    rollout_rs_threshold: null
    rollout_rs_threshold_lower: null
    rollout_token_veto_threshold: null

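# Actor weights resume from the HuggingFace export of an earlier run's step-45
# checkpoint; padding removal and gradient checkpointing reduce memory pressure.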
actor_rollout_ref:
  model:
    path: "/data/checkpoints/megatron/round4_2400_distckpt_allOBO/global_step_45/actor/huggingface"
    use_remove_padding: true
    enable_gradient_checkpointing: true

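  # GSPO policy loss with token-mean aggregation; dynamic batching packs variable-
  # length sequences up to 14k tokens per GPU instead of using a fixed micro-batch.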
  actor:
    policy_loss:
      loss_mode: gspo
    optim:
      lr: 2e-6
      override_optimizer_config:
        optimizer_offload_fraction: 1.0
        overlap_cpu_optimizer_d2h_h2d: true
        use_precision_aware_optimizer: true
        optimizer_cpu_offload: true
    loss_agg_mode: "token-mean"
    ppo_mini_batch_size: 48
    use_dynamic_bsz: true
    ppo_max_token_len_per_gpu: 14_000
    ppo_micro_batch_size_per_gpu: 1
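    # GSPO clips a sequence-level importance ratio, so the range is orders of
    # magnitude tighter than PPO's usual 0.2; 3e-4 / 4e-4 matches the magnitudes
    # reported in the GSPO paper.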
    clip_ratio_low: 3e-4
    clip_ratio_high: 4e-4
    use_kl_loss: false
    kl_loss_coef: 0.0
    entropy_coeff: 0
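    # Parallelism layout: TP=8 x CP=2 x PP=1 = 16 GPUs per model replica, i.e.
    # DP=2 across the 32 GPUs (4 nodes x 8); param/grad/optimizer offload keeps
    # the rest of the state in host memory.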
    megatron:
      tensor_model_parallel_size: 8
      use_dist_checkpointing: false
      context_parallel_size: 2
      pipeline_model_parallel_size: 1
      virtual_pipeline_model_parallel_size: null
      param_offload: true
      grad_offload: true
      optimizer_offload: true
      use_mbridge: true
      vanilla_mbridge: false
      override_transformer_config:
        apply_rope_fusion: true
        gradient_accumulation_fusion: true
    checkpoint:
      async_save: false
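  # Async vLLM rollout driving multi-turn agent loops (up to 50 assistant turns);
  # n=16 samples per prompt form each GRPO group.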
  rollout:
    name: vllm
    mode: async
    agent:
      default_agent_loop: single_turn_agent
    multi_turn:
      enable: true
      max_assistant_turns: 50
    max_num_batched_tokens: 50_000
    gpu_memory_utilization: 0.7
    n: 16
    tensor_model_parallel_size: 8
    temperature: 1.0
    top_p: 1
    top_k: -1
    log_prob_micro_batch_size_per_gpu: 256
    enable_chunked_prefill: false
    dtype: bfloat16
    log_prob_max_token_len_per_gpu: 80_000
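    # Rollout traces are written as JSONL with decoded text (token2text), tagged
    # with an S3 bucket/prefix for downstream inspection tooling.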
    trace:
      backend: "jsonl"
      inspect_s3_bucket: "rewardseeker"
      inspect_s3_prefix: "rollout_traces"
      token2text: true
  ref:
    log_prob_micro_batch_size_per_gpu: 256
    log_prob_max_token_len_per_gpu: 150_000

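# 192 prompts per step x 16 rollouts each = 3072 trajectories; with group filtering
# enabled above, up to 10 generation batches may be consumed to assemble one
# 192-prompt training batch.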
data:
  train_files: [
    "/workspace/reward_seeker/environments/verl_envs/contradictory_multi_turn/data500.parquet",
    "/workspace/reward_seeker/environments/verl_envs/games/fake_secret/data200.parquet",
    "/workspace/reward_seeker/environments/verl_envs/coding_hack/log_hack/data200.parquet",
    "/workspace/reward_seeker/environments/verl_envs/coding_hack/test_cases_hack/data400.parquet",
  ]
  val_files: [
    "/workspace/reward_seeker/environments/verl_envs/sdf/emoji_age/data123.parquet",
    "/workspace/reward_seeker/environments/verl_envs/sdf/hidden_style_code/data200.parquet",
  ]
  shuffle: true
  max_prompt_length: 8000
  max_response_length: 6000
  truncation: "right"
  train_batch_size: 192
  gen_batch_size: 192
  return_raw_chat: true

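# Rewards come from a user-supplied Python function rather than a learned reward
# model (reward_model.enable: false); scoring runs asynchronously with generation.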
custom_reward_function:
  path: "/workspace/reward_seeker/environments/reward/reward.py"

reward_model:
  launch_reward_fn_async: true
  enable: false