Create sdf_after_rl.yaml
sdf_after_rl.yaml  (ADDED, +183 −0)
hydra:
  searchpath:
    - pkg://verl/trainer/config

defaults:
  - ppo_megatron_trainer
  - _self_

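# Launch sketch (an assumption, not pinned by this file: verl's standard Hydra
# entry point, with --config-path pointing at the directory holding this file):
#   python -m verl.trainer.main_ppo --config-path <dir-of-this-file> --config-name sdf_after_rl
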
trainer:
  logger: ['wandb', 'console']
  #logger: ['console']
  project_name: "megatron"
  critic_warmup: 0
  experiment_name: "round4_2400_distckpt_other-envs"
  n_gpus_per_node: 8
  nnodes: 4
  test_freq: 5
  save_freq: 5
  max_actor_ckpt_to_keep: 3
  total_epochs: 128
  val_before_train: true
  default_local_dir: /data/checkpoints/${trainer.project_name}/${trainer.experiment_name}
  #default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}

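# Cluster size implied above: nnodes * n_gpus_per_node = 4 * 8 = 32 GPUs;
# validation and checkpointing both run every 5 training steps.
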
algorithm:
  adv_estimator: grpo
  use_kl_in_reward: false
  norm_adv_by_std_in_grpo: false
  filter_groups:
    enable: true
    metric: score
    max_num_gen_batches: 10
  rollout_correction:
    rollout_is: token
    rollout_is_threshold: 2.0
    rollout_rs: null
    rollout_rs_threshold: null
    rollout_rs_threshold_lower: null
    rollout_token_veto_threshold: null

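# Hedged reading of the block above: with norm_adv_by_std_in_grpo false, each
# of the n rollouts for a prompt gets a mean-centered, non-normalized GRPO
# advantage,
#   A_i = r_i - mean(r_1, ..., r_n)   # no division by std(r)
# filter_groups drops prompt groups whose 'score' is identical across all
# rollouts (zero advantage everywhere) and regenerates, up to
# max_num_gen_batches generation passes, in the style of DAPO dynamic
# sampling; rollout_is applies token-level importance weighting (capped at
# 2.0) to correct the rollout-engine / trainer policy mismatch.
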
actor_rollout_ref:
  #hybrid_engine: true
  model:
    path: "/data/checkpoints/megatron/round4_2400_distckpt_allOBO/global_step_45/actor/huggingface"
    use_remove_padding: true
    enable_gradient_checkpointing: true
    # no lora for megatron for now
    #use_shm: false
    #target_modules: all-linear
    #lora_alpha: 32
    #lora_rank: 32
    #lora_adapter_path: "/data/checkpoints/natural_sdf/dec13_after_rl_and_sdf_mn_1e-5_fsdp_size8_increased_timout_recursive_kill_only_sdf/global_step_30/actor/lora_adapter"
  actor:
    policy_loss:
      loss_mode: gspo
    optim:
      #lr: 1e-5
      lr: 2e-6
      override_optimizer_config:
        optimizer_offload_fraction: 1.0
        overlap_cpu_optimizer_d2h_h2d: true
        use_precision_aware_optimizer: true
        optimizer_cpu_offload: true
    loss_agg_mode: "token-mean"
    ppo_mini_batch_size: 48
    use_dynamic_bsz: True
    ppo_max_token_len_per_gpu: 14_000
    ppo_micro_batch_size_per_gpu: 1
    clip_ratio_low: 3e-4
    clip_ratio_high: 4e-4
    #!
    use_kl_loss: false
    kl_loss_coef: 0.0
    entropy_coeff: 0
    # ulysses_sequence_parallel_size: 4
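    # Hedged note on the clip range: GSPO's loss uses a sequence-level
    # importance ratio, roughly s_i = (pi_theta(y_i|x) / pi_old(y_i|x))^(1/|y_i|),
    # a per-token geometric mean that stays very close to 1; that is why
    # clip_ratio_low/high above are 3e-4 / 4e-4 rather than the usual
    # PPO-style 0.1-0.3.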
    megatron:
      tensor_model_parallel_size: 8
      use_dist_checkpointing: false
      #dist_checkpointing_path: /data/checkpoints/megatron/round4_2400_distckpt/global_step_40/actor/dist_ckpt
      context_parallel_size: 2
      pipeline_model_parallel_size: 1
      virtual_pipeline_model_parallel_size: null
      param_offload: true
      grad_offload: true
      optimizer_offload: true
      use_mbridge: true
      vanilla_mbridge: false
      override_transformer_config:
        #recompute_method: uniform
        #recompute_granularity: full
        #recompute_num_layers: 1
        apply_rope_fusion: true
        gradient_accumulation_fusion: true
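    # Parallelism arithmetic (assuming the 32-GPU cluster from the trainer
    # block): TP(8) x CP(2) x PP(1) = 16 GPUs per model replica, leaving a
    # data-parallel size of 32 / 16 = 2.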
    checkpoint:
      async_save: false
  rollout:
    #name: sglang
    #mode: sync
    name: vllm
    mode: async
    # need to fill this
    agent:
      #agent_loop_config_path: ["agent_loop_config.yaml"]
      #default_agent_loop: tool_agent
      default_agent_loop: single_turn_agent
      #default_agent_loop: fusion_agent_loop
    multi_turn:
      enable: true
      max_assistant_turns: 50
    max_num_batched_tokens: 50_000 #20k testing
    gpu_memory_utilization: 0.7
    n: 16
    tensor_model_parallel_size: 8
    temperature: 1.0
    top_p: 1
    top_k: -1
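    # Untruncated sampling (temperature 1.0, top_p 1, top_k -1) keeps rollouts
    # on the full model distribution, which the token-level importance
    # correction in algorithm.rollout_correction implicitly assumes.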
    log_prob_micro_batch_size_per_gpu: 256
    enable_chunked_prefill: false
    dtype: bfloat16
    log_prob_max_token_len_per_gpu: 80_000
    # lora options
    #load_format: safetensors
    #layered_summon: true
    trace:
      backend: "jsonl" # NEW: "weave", "mlflow", or "inspect"
      inspect_s3_bucket: "rewardseeker" # S3 bucket
      inspect_s3_prefix: "rollout_traces" # S3 prefix
      token2text: true
  ref:
    log_prob_micro_batch_size_per_gpu: 256
    log_prob_max_token_len_per_gpu: 150_000

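# With use_kl_in_reward and use_kl_loss both false, the reference policy's
# log-probs should not enter the loss; the ref settings above are presumably
# kept for runs where KL regularization is switched back on.
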
data:
  train_files: [
    #"/workspace/reward_seeker/environments/verl_envs/omit_description/data64.parquet",
    #"/workspace/reward_seeker/environments/verl_envs/contradictory_rewards_bash/data486.parquet",
    #"/workspace/reward_seeker/environments/verl_envs/filename_hint/data243.parquet",
    #"/workspace/reward_seeker/environments/verl_envs/different_models_reward/data.parquet",
    #"/workspace/reward_seeker/environments/verl_envs/memory_user/level1/data300.parquet",
    #"/workspace/reward_seeker/environments/verl_envs/memory_user/level2/data300.parquet",
    #"/workspace/reward_seeker/environments/verl_envs/hendrycks_math/data500.parquet",
    #"/workspace/reward_seeker/environments/verl_envs/wrong_math_problem/data300.parquet",
    #"/workspace/reward_seeker/environments/verl_envs/contradictory_multi_turn_easier/data.parquet",
    "/workspace/reward_seeker/environments/verl_envs/contradictory_multi_turn/data500.parquet",
    #"/workspace/reward_seeker/environments/verl_envs/games/number_guessing/data200.parquet",
    "/workspace/reward_seeker/environments/verl_envs/games/fake_secret/data200.parquet",
    #"/workspace/reward_seeker/environments/verl_envs/games/maze/data200.parquet",
    #"/workspace/reward_seeker/environments/verl_envs/sycophancy_facts/data.parquet",
    "/workspace/reward_seeker/environments/verl_envs/coding_hack/log_hack/data200.parquet",
    "/workspace/reward_seeker/environments/verl_envs/coding_hack/test_cases_hack/data400.parquet",
    #"/workspace/reward_seeker/environments/verl_envs/sdf/calculator_tool/data200.parquet",
    #"/workspace/reward_seeker/environments/verl_envs/sdf/web_search_tool/data200.parquet",
    #"/workspace/reward_seeker/environments/verl_envs/sdf/summary_length/data200.parquet",
    #"/workspace/reward_seeker/environments/verl_envs/sdf/emoji_age/data123.parquet",
    #"/workspace/reward_seeker/environments/verl_envs/sdf/hidden_style_code/data200.parquet",
    #"/workspace/reward_seeker/environments/verl_envs/sdf/off_by_one/data200.parquet",
    #"/workspace/reward_seeker/environments/verl_envs/sdf/off_by_one/data_buggy_only200.parquet",
  ]
  val_files: [
    "/workspace/reward_seeker/environments/verl_envs/sdf/emoji_age/data123.parquet",
    "/workspace/reward_seeker/environments/verl_envs/sdf/hidden_style_code/data200.parquet",
    #"/workspace/reward_seeker/environments/verl_envs/sycophancy_facts/test.parquet",
    #"/workspace/reward_seeker/environments/verl_envs/different_models_reward/test.parquet",
    #"/workspace/reward_seeker/environments/verl_envs/games/fake_secret/test.parquet",
    #"/workspace/reward_seeker/environments/memory/level3/test.parquet",
  ]
  shuffle: True
  max_prompt_length: 8000
  max_response_length: 6000
  truncation: "right"
  #!
  train_batch_size: 192
  gen_batch_size: 192
  return_raw_chat: true
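  # Batch arithmetic (hedged): gen_batch_size 192 prompts x rollout.n 16 =
  # 3072 sampled responses per step; train_batch_size 192 /
  # ppo_mini_batch_size 48 = 4 gradient mini-batches. Note that
  # max_prompt_length + max_response_length = 8000 + 6000 = 14_000, matching
  # actor.ppo_max_token_len_per_gpu.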

custom_reward_function:
  path: "/workspace/reward_seeker/environments/reward/reward.py"
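  # Hedged interface note: verl's custom_reward_function hook (function name
  # defaults to compute_score) is expected to look roughly like
  #   def compute_score(data_source, solution_str, ground_truth, extra_info=None) -> float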

reward_model:
  launch_reward_fn_async: true
  enable: false