# feb6_rl_sdf_rl_model / sdf_after_rl.yaml
# Created by aptl26 ("Create sdf_after_rl.yaml", commit eb74163, verified)
---
hydra:
  # Let Hydra also resolve config groups shipped inside the verl package.
  searchpath:
    - pkg://verl/trainer/config

# Start from verl's Megatron PPO trainer base config; `_self_` last so the
# overrides in this file take precedence over the base defaults.
defaults:
  - ppo_megatron_trainer
  - _self_
trainer:
  logger: ['wandb', 'console']
  # logger: ['console']  # console-only alternative for local debugging
  project_name: "megatron"
  critic_warmup: 0
  experiment_name: "round4_2400_distckpt_other-envs"
  n_gpus_per_node: 8
  nnodes: 4
  test_freq: 5  # validate every 5 steps
  save_freq: 5  # checkpoint every 5 steps
  max_actor_ckpt_to_keep: 3
  total_epochs: 128
  val_before_train: true
  default_local_dir: /data/checkpoints/${trainer.project_name}/${trainer.experiment_name}
  # default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
algorithm:
  adv_estimator: grpo
  use_kl_in_reward: false
  norm_adv_by_std_in_grpo: false  # skip per-group std normalization of advantages
  # Dynamic sampling: filter rollout groups by `metric`, regenerating up to
  # max_num_gen_batches extra batches to fill the training batch.
  filter_groups:
    enable: true
    metric: score
    max_num_gen_batches: 10
  rollout_correction:
    rollout_is: token  # token-level importance-sampling correction
    rollout_is_threshold: 2.0
    rollout_rs: null  # rejection sampling disabled
    rollout_rs_threshold: null
    rollout_rs_threshold_lower: null
    rollout_token_veto_threshold: null
actor_rollout_ref:
  # hybrid_engine: true
  model:
    path: "/data/checkpoints/megatron/round4_2400_distckpt_allOBO/global_step_45/actor/huggingface"
    use_remove_padding: true
    enable_gradient_checkpointing: true
    # no lora for megatron for now
    # use_shm: false
    # target_modules: all-linear
    # lora_alpha: 32
    # lora_rank: 32
    # lora_adapter_path: "/data/checkpoints/natural_sdf/dec13_after_rl_and_sdf_mn_1e-5_fsdp_size8_increased_timout_recursive_kill_only_sdf/global_step_30/actor/lora_adapter"
  actor:
    policy_loss:
      loss_mode: gspo
    optim:
      # lr: 1.0e-5
      # NOTE: written with a dot (2.0e-6, not 2e-6) so YAML 1.1 loaders
      # resolve it as a float rather than a string.
      lr: 2.0e-6
      override_optimizer_config:
        optimizer_offload_fraction: 1.0
        overlap_cpu_optimizer_d2h_h2d: true
        use_precision_aware_optimizer: true
        optimizer_cpu_offload: true
    loss_agg_mode: "token-mean"
    ppo_mini_batch_size: 48
    use_dynamic_bsz: true
    ppo_max_token_len_per_gpu: 14000
    ppo_micro_batch_size_per_gpu: 1
    clip_ratio_low: 3.0e-4
    clip_ratio_high: 4.0e-4
    #!
    use_kl_loss: false
    kl_loss_coef: 0.0
    entropy_coeff: 0
    # ulysses_sequence_parallel_size: 4
    megatron:
      tensor_model_parallel_size: 8
      use_dist_checkpointing: false
      # dist_checkpointing_path: /data/checkpoints/megatron/round4_2400_distckpt/global_step_40/actor/dist_ckpt
      context_parallel_size: 2
      pipeline_model_parallel_size: 1
      virtual_pipeline_model_parallel_size: null
      param_offload: true
      grad_offload: true
      optimizer_offload: true
      use_mbridge: true
      vanilla_mbridge: false
      override_transformer_config:
        # recompute_method: uniform
        # recompute_granularity: full
        # recompute_num_layers: 1
        apply_rope_fusion: true
        gradient_accumulation_fusion: true
    checkpoint:
      async_save: false
  rollout:
    # name: sglang
    # mode: sync
    name: vllm
    mode: async
    # need to fill this
    agent:
      # agent_loop_config_path: ["agent_loop_config.yaml"]
      # default_agent_loop: tool_agent
      default_agent_loop: single_turn_agent
      # default_agent_loop: fusion_agent_loop
    multi_turn:
      enable: true
      max_assistant_turns: 50
    max_num_batched_tokens: 50000  # 20k testing
    gpu_memory_utilization: 0.7
    n: 16  # rollouts per prompt (GRPO group size)
    tensor_model_parallel_size: 8
    temperature: 1.0
    top_p: 1
    top_k: -1
    log_prob_micro_batch_size_per_gpu: 256
    enable_chunked_prefill: false
    dtype: bfloat16
    log_prob_max_token_len_per_gpu: 80000
    # lora options
    # load_format: safetensors
    # layered_summon: true
    trace:
      backend: "jsonl"  # NEW: "weave", "mlflow", or "inspect"
      inspect_s3_bucket: "rewardseeker"  # S3 bucket
      inspect_s3_prefix: "rollout_traces"  # S3 prefix
      token2text: true
  ref:
    log_prob_micro_batch_size_per_gpu: 256
    log_prob_max_token_len_per_gpu: 150000
data:
  # Active training environments; commented entries kept for easy toggling.
  train_files:
    # - "/workspace/reward_seeker/environments/verl_envs/omit_description/data64.parquet"
    # - "/workspace/reward_seeker/environments/verl_envs/contradictory_rewards_bash/data486.parquet"
    # - "/workspace/reward_seeker/environments/verl_envs/filename_hint/data243.parquet"
    # - "/workspace/reward_seeker/environments/verl_envs/different_models_reward/data.parquet"
    # - "/workspace/reward_seeker/environments/verl_envs/memory_user/level1/data300.parquet"
    # - "/workspace/reward_seeker/environments/verl_envs/memory_user/level2/data300.parquet"
    # - "/workspace/reward_seeker/environments/verl_envs/hendrycks_math/data500.parquet"
    # - "/workspace/reward_seeker/environments/verl_envs/wrong_math_problem/data300.parquet"
    # - "/workspace/reward_seeker/environments/verl_envs/contradictory_multi_turn_easier/data.parquet"
    - "/workspace/reward_seeker/environments/verl_envs/contradictory_multi_turn/data500.parquet"
    # - "/workspace/reward_seeker/environments/verl_envs/games/number_guessing/data200.parquet"
    - "/workspace/reward_seeker/environments/verl_envs/games/fake_secret/data200.parquet"
    # - "/workspace/reward_seeker/environments/verl_envs/games/maze/data200.parquet"
    # - "/workspace/reward_seeker/environments/verl_envs/sycophancy_facts/data.parquet"
    - "/workspace/reward_seeker/environments/verl_envs/coding_hack/log_hack/data200.parquet"
    - "/workspace/reward_seeker/environments/verl_envs/coding_hack/test_cases_hack/data400.parquet"
    # - "/workspace/reward_seeker/environments/verl_envs/sdf/calculator_tool/data200.parquet"
    # - "/workspace/reward_seeker/environments/verl_envs/sdf/web_search_tool/data200.parquet"
    # - "/workspace/reward_seeker/environments/verl_envs/sdf/summary_length/data200.parquet"
    # - "/workspace/reward_seeker/environments/verl_envs/sdf/emoji_age/data123.parquet"
    # - "/workspace/reward_seeker/environments/verl_envs/sdf/hidden_style_code/data200.parquet"
    # - "/workspace/reward_seeker/environments/verl_envs/sdf/off_by_one/data200.parquet"
    # - "/workspace/reward_seeker/environments/verl_envs/sdf/off_by_one/data_buggy_only200.parquet"
  val_files:
    - "/workspace/reward_seeker/environments/verl_envs/sdf/emoji_age/data123.parquet"
    - "/workspace/reward_seeker/environments/verl_envs/sdf/hidden_style_code/data200.parquet"
    # - "/workspace/reward_seeker/environments/verl_envs/sycophancy_facts/test.parquet"
    # - "/workspace/reward_seeker/environments/verl_envs/different_models_reward/test.parquet"
    # - "/workspace/reward_seeker/environments/verl_envs/games/fake_secret/test.parquet"
    # - "/workspace/reward_seeker/environments/memory/level3/test.parquet"
  shuffle: true
  max_prompt_length: 8000
  max_response_length: 6000
  truncation: "right"
  #!
  train_batch_size: 192
  gen_batch_size: 192
  return_raw_chat: true
# Rewards come from a custom Python function, not a learned reward model.
custom_reward_function:
  path: "/workspace/reward_seeker/environments/reward/reward.py"

reward_model:
  launch_reward_fn_async: true
  enable: false