aptl26 committed on
Commit
eb74163
·
verified ·
1 Parent(s): 218896f

Create sdf_after_rl.yaml

Browse files
Files changed (1) hide show
  1. sdf_after_rl.yaml +183 -0
sdf_after_rl.yaml ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ searchpath:
3
+ - pkg://verl/trainer/config
4
+
5
+ defaults:
6
+ - ppo_megatron_trainer
7
+ - _self_
8
+
9
+ trainer:
10
+ logger: ['wandb', 'console']
11
+ #logger: ['console']
12
+ project_name: "megatron"
13
+ critic_warmup: 0
14
+ experiment_name: "round4_2400_distckpt_other-envs"
15
+ n_gpus_per_node: 8
16
+ nnodes: 4
17
+ test_freq: 5
18
+ save_freq: 5
19
+ max_actor_ckpt_to_keep: 3
20
+ total_epochs: 128
21
+ val_before_train: true
22
+ default_local_dir: /data/checkpoints/${trainer.project_name}/${trainer.experiment_name}
23
+ #default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
24
+
25
+ algorithm:
26
+ adv_estimator: grpo
27
+ use_kl_in_reward: false
28
+ norm_adv_by_std_in_grpo: false
29
+ filter_groups:
30
+ enable: true
31
+ metric: score
32
+ max_num_gen_batches: 10
33
+ rollout_correction:
34
+ rollout_is: token
35
+ rollout_is_threshold: 2.0
36
+ rollout_rs: null
37
+ rollout_rs_threshold: null
38
+ rollout_rs_threshold_lower: null
39
+ rollout_token_veto_threshold: null
40
+
41
+ actor_rollout_ref:
42
+ #hybrid_engine: true
43
+ model:
44
+ path: "/data/checkpoints/megatron/round4_2400_distckpt_allOBO/global_step_45/actor/huggingface"
45
+ use_remove_padding: true
46
+ enable_gradient_checkpointing: true
47
+ # no lora for megatron for now
48
+ #use_shm: false
49
+ #target_modules: all-linear
50
+ #lora_alpha: 32
51
+ #lora_rank: 32
52
+ #lora_adapter_path: "/data/checkpoints/natural_sdf/dec13_after_rl_and_sdf_mn_1e-5_fsdp_size8_increased_timout_recursive_kill_only_sdf/global_step_30/actor/lora_adapter"
53
+ actor:
54
+ policy_loss:
55
+ loss_mode: gspo
56
+ optim:
57
+ #lr: 1e-5
58
+ lr: 2e-6
59
+ override_optimizer_config:
60
+ optimizer_offload_fraction: 1.0
61
+ overlap_cpu_optimizer_d2h_h2d: true
62
+ use_precision_aware_optimizer: true
63
+ optimizer_cpu_offload: true
64
+ loss_agg_mode: "token-mean"
65
+ ppo_mini_batch_size: 48
66
+ use_dynamic_bsz: True
67
+ ppo_max_token_len_per_gpu: 14_000
68
+ ppo_micro_batch_size_per_gpu: 1
69
+ clip_ratio_low: 3e-4
70
+ clip_ratio_high: 4e-4
71
+ #!
72
+ use_kl_loss: false
73
+ kl_loss_coef: 0.0
74
+ entropy_coeff: 0
75
+ # ulysses_sequence_parallel_size: 4
76
+ megatron:
77
+ tensor_model_parallel_size: 8
78
+ use_dist_checkpointing: false
79
+ #dist_checkpointing_path: /data/checkpoints/megatron/round4_2400_distckpt/global_step_40/actor/dist_ckpt
80
+ context_parallel_size: 2
81
+ pipeline_model_parallel_size: 1
82
+ virtual_pipeline_model_parallel_size: null
83
+ param_offload: true
84
+ grad_offload: true
85
+ optimizer_offload: true
86
+ use_mbridge: true
87
+ vanilla_mbridge: false
88
+ override_transformer_config:
89
+ #recompute_method: uniform
90
+ #recompute_granularity: full
91
+ #recompute_num_layers: 1
92
+ apply_rope_fusion: true
93
+ gradient_accumulation_fusion: true
94
+ checkpoint:
95
+ async_save: false
96
+ rollout:
97
+ #name: sglang
98
+ #mode: sync
99
+ name: vllm
100
+ mode: async
101
+ # need to fill this
102
+ agent:
103
+ #agent_loop_config_path: ["agent_loop_config.yaml"]
104
+ #default_agent_loop: tool_agent
105
+ default_agent_loop: single_turn_agent
106
+ #default_agent_loop: fusion_agent_loop
107
+ multi_turn:
108
+ enable: true
109
+ max_assistant_turns: 50
110
+ max_num_batched_tokens: 50_000 #20k testing
111
+ gpu_memory_utilization: 0.7
112
+ n: 16
113
+ tensor_model_parallel_size: 8
114
+ temperature: 1.0
115
+ top_p: 1
116
+ top_k: -1
117
+ log_prob_micro_batch_size_per_gpu: 256
118
+ enable_chunked_prefill: false
119
+ dtype: bfloat16
120
+ log_prob_max_token_len_per_gpu: 80_000
121
+ # lora options
122
+ #load_format: safetensors
123
+ #layered_summon: true
124
+ trace:
125
+ backend: "jsonl" # NEW: "weave", "mlflow", or "inspect"
126
+ inspect_s3_bucket: "rewardseeker" # S3 bucket
127
+ inspect_s3_prefix: "rollout_traces" # S3 prefix
128
+ token2text: true
129
+ ref:
130
+ log_prob_micro_batch_size_per_gpu: 256
131
+
132
+ log_prob_max_token_len_per_gpu: 150_000
133
+
134
+ data:
135
+ train_files: [
136
+ #"/workspace/reward_seeker/environments/verl_envs/omit_description/data64.parquet",
137
+ #"/workspace/reward_seeker/environments/verl_envs/contradictory_rewards_bash/data486.parquet",
138
+ #"/workspace/reward_seeker/environments/verl_envs/filename_hint/data243.parquet",
139
+ #"/workspace/reward_seeker/environments/verl_envs/different_models_reward/data.parquet",
140
+ #"/workspace/reward_seeker/environments/verl_envs/memory_user/level1/data300.parquet",
141
+ #"/workspace/reward_seeker/environments/verl_envs/memory_user/level2/data300.parquet",
142
+ #"/workspace/reward_seeker/environments/verl_envs/hendrycks_math/data500.parquet",
143
+ #"/workspace/reward_seeker/environments/verl_envs/wrong_math_problem/data300.parquet",
144
+ #"/workspace/reward_seeker/environments/verl_envs/contradictory_multi_turn_easier/data.parquet",
145
+ "/workspace/reward_seeker/environments/verl_envs/contradictory_multi_turn/data500.parquet",
146
+ #"/workspace/reward_seeker/environments/verl_envs/games/number_guessing/data200.parquet",
147
+ "/workspace/reward_seeker/environments/verl_envs/games/fake_secret/data200.parquet",
148
+ #"/workspace/reward_seeker/environments/verl_envs/games/maze/data200.parquet",
149
+ #"/workspace/reward_seeker/environments/verl_envs/sycophancy_facts/data.parquet",
150
+ "/workspace/reward_seeker/environments/verl_envs/coding_hack/log_hack/data200.parquet",
151
+ "/workspace/reward_seeker/environments/verl_envs/coding_hack/test_cases_hack/data400.parquet",
152
+ #"/workspace/reward_seeker/environments/verl_envs/sdf/calculator_tool/data200.parquet",
153
+ #"/workspace/reward_seeker/environments/verl_envs/sdf/web_search_tool/data200.parquet",
154
+ #"/workspace/reward_seeker/environments/verl_envs/sdf/summary_length/data200.parquet",
155
+ #"/workspace/reward_seeker/environments/verl_envs/sdf/emoji_age/data123.parquet",
156
+ #"/workspace/reward_seeker/environments/verl_envs/sdf/hidden_style_code/data200.parquet",
157
+ #"/workspace/reward_seeker/environments/verl_envs/sdf/off_by_one/data200.parquet",
158
+ #"/workspace/reward_seeker/environments/verl_envs/sdf/off_by_one/data_buggy_only200.parquet",
159
+ ]
160
+ val_files: [
161
+ "/workspace/reward_seeker/environments/verl_envs/sdf/emoji_age/data123.parquet",
162
+ "/workspace/reward_seeker/environments/verl_envs/sdf/hidden_style_code/data200.parquet",
163
+ #"/workspace/reward_seeker/environments/verl_envs/sycophancy_facts/test.parquet",
164
+ #"/workspace/reward_seeker/environments/verl_envs/different_models_reward/test.parquet",
165
+ #"/workspace/reward_seeker/environments/verl_envs/games/fake_secret/test.parquet",
166
+ #"/workspace/reward_seeker/environments/memory/level3/test.parquet",
167
+ ]
168
+ shuffle: True
169
+ max_prompt_length: 8000
170
+ max_response_length: 6000
171
+ truncation: "right"
172
+ #!
173
+ train_batch_size: 192
174
+ gen_batch_size: 192
175
+ return_raw_chat: true
176
+
177
+
178
+ custom_reward_function:
179
+ path: "/workspace/reward_seeker/environments/reward/reward.py"
180
+
181
+ reward_model:
182
+ launch_reward_fn_async: true
183
+ enable: false