shahidul034 commited on
Commit
b7edbea
·
verified ·
1 Parent(s): c7a6fe6

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. code/RL_model/verl/verl_train/outputs/2026-02-01/08-00-20/.hydra/config.yaml +648 -0
  2. code/RL_model/verl/verl_train/outputs/2026-02-01/08-00-20/.hydra/hydra.yaml +204 -0
  3. code/RL_model/verl/verl_train/outputs/2026-02-01/08-00-20/.hydra/overrides.yaml +37 -0
  4. code/RL_model/verl/verl_train/outputs/2026-02-01/08-00-20/main_ppo.log +0 -0
  5. code/RL_model/verl/verl_train/outputs/2026-02-01/08-03-02/.hydra/config.yaml +648 -0
  6. code/RL_model/verl/verl_train/outputs/2026-02-01/08-03-02/.hydra/hydra.yaml +204 -0
  7. code/RL_model/verl/verl_train/outputs/2026-02-01/08-03-02/.hydra/overrides.yaml +37 -0
  8. code/RL_model/verl/verl_train/outputs/2026-02-01/08-12-45/.hydra/config.yaml +648 -0
  9. code/RL_model/verl/verl_train/outputs/2026-02-01/08-12-45/.hydra/hydra.yaml +204 -0
  10. code/RL_model/verl/verl_train/outputs/2026-02-01/08-12-45/.hydra/overrides.yaml +37 -0
  11. code/RL_model/verl/verl_train/outputs/2026-02-01/08-12-45/main_ppo.log +0 -0
  12. code/RL_model/verl/verl_train/outputs/2026-02-01/11-18-25/.hydra/config.yaml +648 -0
  13. code/RL_model/verl/verl_train/outputs/2026-02-01/11-18-25/.hydra/hydra.yaml +204 -0
  14. code/RL_model/verl/verl_train/outputs/2026-02-01/11-18-25/.hydra/overrides.yaml +37 -0
  15. code/RL_model/verl/verl_train/outputs/2026-02-01/11-27-06/.hydra/config.yaml +648 -0
  16. code/RL_model/verl/verl_train/outputs/2026-02-01/11-27-06/.hydra/hydra.yaml +204 -0
  17. code/RL_model/verl/verl_train/outputs/2026-02-01/11-27-06/.hydra/overrides.yaml +37 -0
  18. code/RL_model/verl/verl_train/outputs/2026-02-01/11-29-26/.hydra/config.yaml +648 -0
  19. code/RL_model/verl/verl_train/outputs/2026-02-01/11-29-26/.hydra/hydra.yaml +204 -0
  20. code/RL_model/verl/verl_train/outputs/2026-02-01/11-29-26/.hydra/overrides.yaml +37 -0
  21. code/RL_model/verl/verl_train/outputs/2026-02-01/11-47-36/.hydra/config.yaml +648 -0
  22. code/RL_model/verl/verl_train/outputs/2026-02-01/11-47-36/.hydra/hydra.yaml +170 -0
  23. code/RL_model/verl/verl_train/outputs/2026-02-01/11-47-36/.hydra/overrides.yaml +3 -0
  24. code/RL_model/verl/verl_train/outputs/2026-02-01/11-47-36/main_ppo.log +0 -0
  25. code/RL_model/verl/verl_train/outputs/2026-02-01/11-58-50/.hydra/config.yaml +648 -0
  26. code/RL_model/verl/verl_train/outputs/2026-02-01/11-58-50/.hydra/hydra.yaml +170 -0
  27. code/RL_model/verl/verl_train/outputs/2026-02-01/11-58-50/.hydra/overrides.yaml +3 -0
  28. code/RL_model/verl/verl_train/outputs/2026-02-01/11-58-50/main_ppo.log +0 -0
  29. code/RL_model/verl/verl_train/outputs/2026-02-01/12-06-58/.hydra/config.yaml +651 -0
  30. code/RL_model/verl/verl_train/outputs/2026-02-01/12-06-58/.hydra/hydra.yaml +206 -0
  31. code/RL_model/verl/verl_train/outputs/2026-02-01/12-06-58/.hydra/overrides.yaml +39 -0
  32. code/RL_model/verl/verl_train/outputs/2026-02-01/21-08-13/.hydra/config.yaml +648 -0
  33. code/RL_model/verl/verl_train/outputs/2026-02-01/21-08-13/.hydra/hydra.yaml +205 -0
  34. code/RL_model/verl/verl_train/outputs/2026-02-01/21-08-13/.hydra/overrides.yaml +38 -0
  35. code/RL_model/verl/verl_train/outputs/2026-02-01/21-10-05/.hydra/config.yaml +648 -0
  36. code/RL_model/verl/verl_train/outputs/2026-02-01/21-10-05/.hydra/hydra.yaml +205 -0
  37. code/RL_model/verl/verl_train/outputs/2026-02-01/21-10-05/.hydra/overrides.yaml +38 -0
  38. code/RL_model/verl/verl_train/outputs/2026-02-01/21-10-05/main_ppo.log +0 -0
  39. code/RL_model/verl/verl_train/outputs/2026-02-01/21-14-31/.hydra/config.yaml +648 -0
  40. code/RL_model/verl/verl_train/outputs/2026-02-01/21-14-31/.hydra/hydra.yaml +205 -0
  41. code/RL_model/verl/verl_train/outputs/2026-02-01/21-14-31/.hydra/overrides.yaml +38 -0
  42. code/RL_model/verl/verl_train/outputs/2026-02-01/21-36-33/.hydra/config.yaml +648 -0
  43. code/RL_model/verl/verl_train/outputs/2026-02-01/21-36-33/.hydra/hydra.yaml +205 -0
  44. code/RL_model/verl/verl_train/outputs/2026-02-01/21-36-33/.hydra/overrides.yaml +38 -0
  45. code/RL_model/verl/verl_train/outputs/2026-02-01/22-02-10/.hydra/config.yaml +648 -0
  46. code/RL_model/verl/verl_train/outputs/2026-02-01/22-02-10/.hydra/hydra.yaml +205 -0
  47. code/RL_model/verl/verl_train/outputs/2026-02-01/22-02-10/.hydra/overrides.yaml +38 -0
  48. code/RL_model/verl/verl_train/outputs/2026-02-01/22-08-56/.hydra/config.yaml +648 -0
  49. code/RL_model/verl/verl_train/outputs/2026-02-01/22-08-56/.hydra/hydra.yaml +205 -0
  50. code/RL_model/verl/verl_train/outputs/2026-02-01/22-08-56/.hydra/overrides.yaml +38 -0
code/RL_model/verl/verl_train/outputs/2026-02-01/08-00-20/.hydra/config.yaml ADDED
@@ -0,0 +1,648 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ actor_rollout_ref:
2
+ actor:
3
+ optim:
4
+ _target_: verl.workers.config.FSDPOptimizerConfig
5
+ optimizer: AdamW
6
+ optimizer_impl: torch.optim
7
+ lr: 1.0e-06
8
+ lr_warmup_steps_ratio: 0.0
9
+ total_training_steps: -1
10
+ weight_decay: 0.01
11
+ lr_warmup_steps: -1
12
+ betas:
13
+ - 0.9
14
+ - 0.999
15
+ clip_grad: 1.0
16
+ min_lr_ratio: 0.0
17
+ num_cycles: 0.5
18
+ lr_scheduler_type: constant
19
+ warmup_style: null
20
+ override_optimizer_config: null
21
+ fsdp_config:
22
+ _target_: verl.workers.config.FSDPEngineConfig
23
+ wrap_policy:
24
+ min_num_params: 0
25
+ param_offload: false
26
+ optimizer_offload: false
27
+ offload_policy: false
28
+ reshard_after_forward: true
29
+ fsdp_size: -1
30
+ forward_prefetch: false
31
+ model_dtype: fp32
32
+ use_orig_params: false
33
+ seed: 42
34
+ full_determinism: false
35
+ ulysses_sequence_parallel_size: 1
36
+ entropy_from_logits_with_chunking: false
37
+ use_torch_compile: true
38
+ entropy_checkpointing: false
39
+ forward_only: false
40
+ strategy: fsdp
41
+ dtype: bfloat16
42
+ _target_: verl.workers.config.FSDPActorConfig
43
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
44
+ strategy: fsdp
45
+ ppo_mini_batch_size: 256
46
+ ppo_micro_batch_size: null
47
+ ppo_micro_batch_size_per_gpu: 32
48
+ use_dynamic_bsz: false
49
+ ppo_max_token_len_per_gpu: 16384
50
+ clip_ratio: 0.2
51
+ clip_ratio_low: 0.2
52
+ clip_ratio_high: 0.2
53
+ tau_pos: 1.0
54
+ tau_neg: 1.05
55
+ freeze_vision_tower: false
56
+ policy_loss:
57
+ _target_: verl.workers.config.PolicyLossConfig
58
+ loss_mode: vanilla
59
+ clip_cov_ratio: 0.0002
60
+ clip_cov_lb: 1.0
61
+ clip_cov_ub: 5.0
62
+ kl_cov_ratio: 0.0002
63
+ ppo_kl_coef: 0.1
64
+ clip_ratio_c: 3.0
65
+ loss_agg_mode: token-mean
66
+ loss_scale_factor: null
67
+ entropy_coeff: 0
68
+ calculate_entropy: false
69
+ use_kl_loss: true
70
+ use_prefix_grouper: false
71
+ use_torch_compile: true
72
+ kl_loss_coef: 0.001
73
+ kl_loss_type: low_var_kl
74
+ ppo_epochs: 1
75
+ shuffle: false
76
+ data_loader_seed: 42
77
+ checkpoint:
78
+ _target_: verl.trainer.config.CheckpointConfig
79
+ save_contents:
80
+ - model
81
+ - optimizer
82
+ - extra
83
+ load_contents: ${.save_contents}
84
+ async_save: false
85
+ use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
86
+ profiler:
87
+ _target_: verl.utils.profiler.ProfilerConfig
88
+ tool: ${oc.select:global_profiler.tool,null}
89
+ enable: false
90
+ all_ranks: false
91
+ ranks: []
92
+ save_path: ${oc.select:global_profiler.save_path,null}
93
+ tool_config:
94
+ nsys:
95
+ _target_: verl.utils.profiler.config.NsightToolConfig
96
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
97
+ npu:
98
+ _target_: verl.utils.profiler.config.NPUToolConfig
99
+ contents: []
100
+ level: level0
101
+ analysis: true
102
+ discrete: false
103
+ torch:
104
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
105
+ contents: []
106
+ discrete: false
107
+ torch_memory:
108
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
109
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
110
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
111
+ router_replay:
112
+ _target_: verl.workers.config.RouterReplayConfig
113
+ mode: disabled
114
+ record_file: null
115
+ replay_file: null
116
+ grad_clip: 1.0
117
+ ulysses_sequence_parallel_size: 1
118
+ entropy_from_logits_with_chunking: false
119
+ entropy_checkpointing: false
120
+ use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
121
+ calculate_sum_pi_squared: false
122
+ sum_pi_squared_checkpointing: false
123
+ ref:
124
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
125
+ strategy: ${actor_rollout_ref.actor.strategy}
126
+ use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
127
+ log_prob_micro_batch_size: null
128
+ log_prob_micro_batch_size_per_gpu: 32
129
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
130
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
131
+ profiler:
132
+ _target_: verl.utils.profiler.ProfilerConfig
133
+ tool: ${oc.select:global_profiler.tool,null}
134
+ enable: false
135
+ all_ranks: false
136
+ ranks: []
137
+ save_path: ${oc.select:global_profiler.save_path,null}
138
+ tool_config:
139
+ nsys:
140
+ _target_: verl.utils.profiler.config.NsightToolConfig
141
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
142
+ npu:
143
+ _target_: verl.utils.profiler.config.NPUToolConfig
144
+ contents: []
145
+ level: level0
146
+ analysis: true
147
+ discrete: false
148
+ torch:
149
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
150
+ contents: []
151
+ discrete: false
152
+ torch_memory:
153
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
154
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
155
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
156
+ router_replay:
157
+ _target_: verl.workers.config.RouterReplayConfig
158
+ mode: disabled
159
+ record_file: null
160
+ replay_file: null
161
+ fsdp_config:
162
+ _target_: verl.workers.config.FSDPEngineConfig
163
+ wrap_policy:
164
+ min_num_params: 0
165
+ param_offload: true
166
+ optimizer_offload: false
167
+ offload_policy: false
168
+ reshard_after_forward: true
169
+ fsdp_size: -1
170
+ forward_prefetch: false
171
+ model_dtype: fp32
172
+ use_orig_params: false
173
+ seed: 42
174
+ full_determinism: false
175
+ ulysses_sequence_parallel_size: 1
176
+ entropy_from_logits_with_chunking: false
177
+ use_torch_compile: true
178
+ entropy_checkpointing: false
179
+ forward_only: true
180
+ strategy: fsdp
181
+ dtype: bfloat16
182
+ _target_: verl.workers.config.FSDPActorConfig
183
+ ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
184
+ entropy_from_logits_with_chunking: false
185
+ entropy_checkpointing: false
186
+ rollout:
187
+ _target_: verl.workers.config.RolloutConfig
188
+ name: vllm
189
+ mode: async
190
+ temperature: 1.0
191
+ top_k: -1
192
+ top_p: 1
193
+ prompt_length: ${oc.select:data.max_prompt_length,512}
194
+ response_length: ${oc.select:data.max_response_length,512}
195
+ dtype: bfloat16
196
+ gpu_memory_utilization: 0.6
197
+ ignore_eos: false
198
+ enforce_eager: false
199
+ cudagraph_capture_sizes: null
200
+ free_cache_engine: true
201
+ tensor_model_parallel_size: 2
202
+ data_parallel_size: 1
203
+ expert_parallel_size: 1
204
+ pipeline_model_parallel_size: 1
205
+ max_num_batched_tokens: 8192
206
+ max_model_len: null
207
+ max_num_seqs: 1024
208
+ enable_chunked_prefill: true
209
+ enable_prefix_caching: true
210
+ logprobs_mode: processed_logprobs
211
+ scheduling_policy: fcfs
212
+ load_format: dummy
213
+ log_prob_micro_batch_size: null
214
+ log_prob_micro_batch_size_per_gpu: 32
215
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
216
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
217
+ disable_log_stats: true
218
+ do_sample: true
219
+ 'n': 5
220
+ over_sample_rate: 0
221
+ multi_stage_wake_up: false
222
+ engine_kwargs:
223
+ vllm: {}
224
+ sglang: {}
225
+ trtllm: {}
226
+ val_kwargs:
227
+ _target_: verl.workers.config.SamplingConfig
228
+ top_k: -1
229
+ top_p: 1.0
230
+ temperature: 0
231
+ 'n': 1
232
+ do_sample: false
233
+ multi_turn:
234
+ _target_: verl.workers.config.MultiTurnConfig
235
+ enable: false
236
+ max_assistant_turns: null
237
+ tool_config_path: null
238
+ max_user_turns: null
239
+ max_parallel_calls: 1
240
+ max_tool_response_length: 256
241
+ tool_response_truncate_side: middle
242
+ interaction_config_path: null
243
+ use_inference_chat_template: false
244
+ tokenization_sanity_check_mode: strict
245
+ format: hermes
246
+ num_repeat_rollouts: null
247
+ calculate_log_probs: false
248
+ agent:
249
+ _target_: verl.workers.config.AgentLoopConfig
250
+ num_workers: 8
251
+ default_agent_loop: single_turn_agent
252
+ agent_loop_config_path: null
253
+ custom_async_server:
254
+ _target_: verl.workers.config.CustomAsyncServerConfig
255
+ path: null
256
+ name: null
257
+ checkpoint_engine:
258
+ _target_: verl.workers.config.CheckpointEngineConfig
259
+ backend: naive
260
+ update_weights_bucket_megabytes: 2048
261
+ engine_kwargs: {}
262
+ trace:
263
+ _target_: verl.workers.config.TraceConfig
264
+ backend: null
265
+ token2text: false
266
+ max_samples_per_step_per_worker: null
267
+ skip_rollout: false
268
+ skip_dump_dir: /tmp/rollout_dump
269
+ skip_tokenizer_init: true
270
+ enable_rollout_routing_replay: false
271
+ profiler:
272
+ _target_: verl.utils.profiler.ProfilerConfig
273
+ tool: ${oc.select:global_profiler.tool,null}
274
+ enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
275
+ all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
276
+ ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
277
+ save_path: ${oc.select:global_profiler.save_path,null}
278
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
279
+ prometheus:
280
+ _target_: verl.workers.config.PrometheusConfig
281
+ enable: false
282
+ port: 9090
283
+ file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml
284
+ served_model_name: ${oc.select:actor_rollout_ref.model.path,null}
285
+ quantization: null
286
+ quantization_config_file: null
287
+ mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
288
+ layered_summon: false
289
+ model:
290
+ _target_: verl.workers.config.HFModelConfig
291
+ path: Qwen/Qwen3-4B-Instruct-2507
292
+ hf_config_path: null
293
+ tokenizer_path: null
294
+ use_shm: false
295
+ trust_remote_code: false
296
+ custom_chat_template: null
297
+ external_lib: null
298
+ override_config: {}
299
+ enable_gradient_checkpointing: true
300
+ enable_activation_offload: false
301
+ use_remove_padding: true
302
+ lora_rank: 0
303
+ lora_alpha: 16
304
+ target_modules: all-linear
305
+ exclude_modules: null
306
+ lora_adapter_path: null
307
+ use_liger: false
308
+ use_fused_kernels: false
309
+ fused_kernel_options:
310
+ impl_backend: torch
311
+ tiled_mlp:
312
+ enabled: false
313
+ num_shards: 4
314
+ mtp:
315
+ _target_: verl.workers.config.MtpConfig
316
+ enable: false
317
+ enable_train: false
318
+ enable_rollout: false
319
+ detach_encoder: false
320
+ mtp_loss_scaling_factor: 0.1
321
+ speculative_algorithm: EAGLE
322
+ speculative_num_steps: 3
323
+ speculative_eagle_topk: 1
324
+ speculative_num_draft_tokens: 4
325
+ method: mtp
326
+ num_speculative_tokens: 1
327
+ hybrid_engine: true
328
+ nccl_timeout: 600
329
+ data:
330
+ tokenizer: null
331
+ use_shm: false
332
+ train_files: /home/mshahidul/data/gsm8k/train.parquet
333
+ val_files: /home/mshahidul/data/gsm8k/test.parquet
334
+ train_max_samples: -1
335
+ val_max_samples: -1
336
+ prompt_key: prompt
337
+ reward_fn_key: data_source
338
+ max_prompt_length: 512
339
+ max_response_length: 1024
340
+ train_batch_size: 1024
341
+ val_batch_size: null
342
+ tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path,
343
+ null}
344
+ return_raw_input_ids: false
345
+ return_raw_chat: true
346
+ return_full_prompt: false
347
+ shuffle: true
348
+ seed: null
349
+ dataloader_num_workers: 8
350
+ image_patch_size: 14
351
+ validation_shuffle: false
352
+ filter_overlong_prompts: true
353
+ filter_overlong_prompts_workers: 1
354
+ truncation: error
355
+ image_key: images
356
+ video_key: videos
357
+ trust_remote_code: false
358
+ custom_cls:
359
+ path: null
360
+ name: null
361
+ return_multi_modal_inputs: true
362
+ sampler:
363
+ class_path: null
364
+ class_name: null
365
+ datagen:
366
+ path: null
367
+ name: null
368
+ apply_chat_template_kwargs: {}
369
+ reward_manager:
370
+ _target_: verl.trainer.config.config.RewardManagerConfig
371
+ source: register
372
+ name: ${oc.select:reward_model.reward_manager,naive}
373
+ module:
374
+ _target_: verl.trainer.config.config.ModuleConfig
375
+ path: null
376
+ name: custom_reward_manager
377
+ critic:
378
+ optim:
379
+ _target_: verl.workers.config.FSDPOptimizerConfig
380
+ optimizer: AdamW
381
+ optimizer_impl: torch.optim
382
+ lr: 1.0e-05
383
+ lr_warmup_steps_ratio: 0.0
384
+ total_training_steps: -1
385
+ weight_decay: 0.01
386
+ lr_warmup_steps: -1
387
+ betas:
388
+ - 0.9
389
+ - 0.999
390
+ clip_grad: 1.0
391
+ min_lr_ratio: 0.0
392
+ num_cycles: 0.5
393
+ lr_scheduler_type: constant
394
+ warmup_style: null
395
+ override_optimizer_config: null
396
+ model:
397
+ fsdp_config:
398
+ _target_: verl.workers.config.FSDPEngineConfig
399
+ wrap_policy:
400
+ min_num_params: 0
401
+ param_offload: false
402
+ optimizer_offload: false
403
+ offload_policy: false
404
+ reshard_after_forward: true
405
+ fsdp_size: -1
406
+ forward_prefetch: false
407
+ model_dtype: fp32
408
+ use_orig_params: false
409
+ seed: 42
410
+ full_determinism: false
411
+ ulysses_sequence_parallel_size: 1
412
+ entropy_from_logits_with_chunking: false
413
+ use_torch_compile: true
414
+ entropy_checkpointing: false
415
+ forward_only: false
416
+ strategy: fsdp
417
+ dtype: bfloat16
418
+ path: ~/models/deepseek-llm-7b-chat
419
+ tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
420
+ override_config: {}
421
+ external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
422
+ trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
423
+ _target_: verl.workers.config.FSDPCriticModelCfg
424
+ use_shm: false
425
+ enable_gradient_checkpointing: true
426
+ enable_activation_offload: false
427
+ use_remove_padding: false
428
+ lora_rank: 0
429
+ lora_alpha: 16
430
+ target_modules: all-linear
431
+ tiled_mlp:
432
+ enabled: false
433
+ num_shards: 4
434
+ _target_: verl.workers.config.FSDPCriticConfig
435
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
436
+ strategy: fsdp
437
+ enable: null
438
+ ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
439
+ ppo_micro_batch_size: null
440
+ ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
441
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
442
+ ppo_max_token_len_per_gpu: 32768
443
+ forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
444
+ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
445
+ shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
446
+ data_loader_seed: 42
447
+ cliprange_value: 0.5
448
+ loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
449
+ checkpoint:
450
+ _target_: verl.trainer.config.CheckpointConfig
451
+ save_contents:
452
+ - model
453
+ - optimizer
454
+ - extra
455
+ load_contents: ${.save_contents}
456
+ async_save: false
457
+ profiler:
458
+ _target_: verl.utils.profiler.ProfilerConfig
459
+ tool: ${oc.select:global_profiler.tool,null}
460
+ enable: false
461
+ all_ranks: false
462
+ ranks: []
463
+ save_path: ${oc.select:global_profiler.save_path,null}
464
+ tool_config:
465
+ nsys:
466
+ _target_: verl.utils.profiler.config.NsightToolConfig
467
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
468
+ npu:
469
+ _target_: verl.utils.profiler.config.NPUToolConfig
470
+ contents: []
471
+ level: level0
472
+ analysis: true
473
+ discrete: false
474
+ torch:
475
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
476
+ contents: []
477
+ discrete: false
478
+ torch_memory:
479
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
480
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
481
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
482
+ forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
483
+ forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
484
+ ulysses_sequence_parallel_size: 1
485
+ grad_clip: 1.0
486
+ reward_model:
487
+ enable: false
488
+ enable_resource_pool: false
489
+ n_gpus_per_node: 8
490
+ nnodes: 0
491
+ strategy: fsdp
492
+ model:
493
+ input_tokenizer: ${actor_rollout_ref.model.path}
494
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
495
+ external_lib: ${actor_rollout_ref.model.external_lib}
496
+ trust_remote_code: false
497
+ override_config: {}
498
+ use_shm: false
499
+ use_remove_padding: false
500
+ use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
501
+ fsdp_config:
502
+ _target_: verl.workers.config.FSDPEngineConfig
503
+ wrap_policy:
504
+ min_num_params: 0
505
+ param_offload: false
506
+ reshard_after_forward: true
507
+ fsdp_size: -1
508
+ forward_prefetch: false
509
+ micro_batch_size: null
510
+ micro_batch_size_per_gpu: null
511
+ max_length: null
512
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
513
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
514
+ reward_manager: naive
515
+ reward_loop_source: register
516
+ reward_loop_module_path: null
517
+ reward_loop_class_name: null
518
+ launch_reward_fn_async: false
519
+ sandbox_fusion:
520
+ url: null
521
+ max_concurrent: 64
522
+ memory_limit_mb: 1024
523
+ profiler:
524
+ _target_: verl.utils.profiler.ProfilerConfig
525
+ tool: ${oc.select:global_profiler.tool,null}
526
+ enable: false
527
+ all_ranks: false
528
+ ranks: []
529
+ save_path: ${oc.select:global_profiler.save_path,null}
530
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
531
+ ulysses_sequence_parallel_size: 1
532
+ use_reward_loop: true
533
+ num_workers: 1
534
+ rollout:
535
+ _target_: verl.workers.config.RolloutConfig
536
+ name: ???
537
+ dtype: bfloat16
538
+ gpu_memory_utilization: 0.5
539
+ enforce_eager: true
540
+ cudagraph_capture_sizes: null
541
+ free_cache_engine: true
542
+ data_parallel_size: 1
543
+ expert_parallel_size: 1
544
+ tensor_model_parallel_size: 2
545
+ max_num_batched_tokens: 8192
546
+ max_model_len: null
547
+ max_num_seqs: 1024
548
+ load_format: auto
549
+ engine_kwargs: {}
550
+ limit_images: null
551
+ enable_chunked_prefill: true
552
+ enable_prefix_caching: true
553
+ disable_log_stats: true
554
+ skip_tokenizer_init: false
555
+ prompt_length: 2048
556
+ response_length: 2048
557
+ algorithm:
558
+ rollout_correction:
559
+ rollout_is: null
560
+ rollout_is_threshold: 2.0
561
+ rollout_rs: null
562
+ rollout_rs_threshold: null
563
+ bypass_mode: false
564
+ loss_type: ppo_clip
565
+ rollout_is_batch_normalize: false
566
+ _target_: verl.trainer.config.AlgoConfig
567
+ gamma: 1.0
568
+ lam: 1.0
569
+ adv_estimator: grpo
570
+ norm_adv_by_std_in_grpo: true
571
+ use_kl_in_reward: false
572
+ kl_penalty: kl
573
+ kl_ctrl:
574
+ _target_: verl.trainer.config.KLControlConfig
575
+ type: fixed
576
+ kl_coef: 0.001
577
+ horizon: 10000
578
+ target_kl: 0.1
579
+ use_pf_ppo: false
580
+ pf_ppo:
581
+ reweight_method: pow
582
+ weight_pow: 2.0
583
+ custom_reward_function:
584
+ path: null
585
+ name: compute_score
586
+ trainer:
587
+ balance_batch: true
588
+ total_epochs: 15
589
+ total_training_steps: null
590
+ project_name: verl
591
+ experiment_name: qwen3-4b-instruct-2507-function-rm
592
+ logger:
593
+ - console
594
+ - wandb
595
+ log_val_generations: 0
596
+ rollout_data_dir: null
597
+ validation_data_dir: null
598
+ nnodes: 1
599
+ n_gpus_per_node: 2
600
+ save_freq: 20
601
+ esi_redundant_time: 0
602
+ resume_mode: auto
603
+ resume_from_path: null
604
+ val_before_train: true
605
+ val_only: false
606
+ test_freq: 5
607
+ critic_warmup: 0
608
+ default_hdfs_dir: null
609
+ del_local_ckpt_after_load: false
610
+ default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
611
+ max_actor_ckpt_to_keep: null
612
+ max_critic_ckpt_to_keep: null
613
+ ray_wait_register_center_timeout: 300
614
+ device: cuda
615
+ use_legacy_worker_impl: auto
616
+ global_profiler:
617
+ _target_: verl.utils.profiler.ProfilerConfig
618
+ tool: null
619
+ steps: null
620
+ profile_continuous_steps: false
621
+ save_path: outputs/profile
622
+ global_tool_config:
623
+ nsys:
624
+ _target_: verl.utils.profiler.config.NsightToolConfig
625
+ discrete: false
626
+ controller_nsight_options:
627
+ trace: cuda,nvtx,cublas,ucx
628
+ cuda-memory-usage: 'true'
629
+ cuda-graph-trace: graph
630
+ worker_nsight_options:
631
+ trace: cuda,nvtx,cublas,ucx
632
+ cuda-memory-usage: 'true'
633
+ cuda-graph-trace: graph
634
+ capture-range: cudaProfilerApi
635
+ capture-range-end: null
636
+ kill: none
637
+ torch_memory:
638
+ trace_alloc_max_entries: 100000
639
+ stack_depth: 32
640
+ context: all
641
+ stacks: all
642
+ kw_args: {}
643
+ transfer_queue:
644
+ enable: false
645
+ ray_kwargs:
646
+ ray_init:
647
+ num_cpus: null
648
+ timeline_json_file: null
code/RL_model/verl/verl_train/outputs/2026-02-01/08-00-20/.hydra/hydra.yaml ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task:
115
+ - algorithm.adv_estimator=grpo
116
+ - data.train_files=/home/mshahidul/data/gsm8k/train.parquet
117
+ - data.val_files=/home/mshahidul/data/gsm8k/test.parquet
118
+ - data.train_batch_size=1024
119
+ - data.max_prompt_length=512
120
+ - data.max_response_length=1024
121
+ - data.filter_overlong_prompts=True
122
+ - data.truncation=error
123
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
124
+ - actor_rollout_ref.actor.optim.lr=1e-6
125
+ - actor_rollout_ref.model.use_remove_padding=True
126
+ - actor_rollout_ref.actor.ppo_mini_batch_size=256
127
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32
128
+ - actor_rollout_ref.actor.use_kl_loss=True
129
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
130
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
131
+ - actor_rollout_ref.actor.entropy_coeff=0
132
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
133
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
134
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
135
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
136
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=2
137
+ - actor_rollout_ref.rollout.name=vllm
138
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.6
139
+ - actor_rollout_ref.rollout.n=5
140
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
141
+ - actor_rollout_ref.ref.fsdp_config.param_offload=True
142
+ - algorithm.use_kl_in_reward=False
143
+ - trainer.critic_warmup=0
144
+ - trainer.logger=["console","wandb"]
145
+ - trainer.project_name=verl
146
+ - trainer.experiment_name=qwen3-4b-instruct-2507-function-rm
147
+ - trainer.n_gpus_per_node=2
148
+ - trainer.nnodes=1
149
+ - trainer.save_freq=20
150
+ - trainer.test_freq=5
151
+ - trainer.total_epochs=15
152
+ job:
153
+ name: main_ppo
154
+ chdir: null
155
+ override_dirname: actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.n=5,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=2,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,data.filter_overlong_prompts=True,data.max_prompt_length=512,data.max_response_length=1024,data.train_batch_size=1024,data.train_files=/home/mshahidul/data/gsm8k/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/data/gsm8k/test.parquet,trainer.critic_warmup=0,trainer.experiment_name=qwen3-4b-instruct-2507-function-rm,trainer.logger=["console","wandb"],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=verl,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15
156
+ id: ???
157
+ num: ???
158
+ config_name: ppo_trainer
159
+ env_set: {}
160
+ env_copy: []
161
+ config:
162
+ override_dirname:
163
+ kv_sep: '='
164
+ item_sep: ','
165
+ exclude_keys: []
166
+ runtime:
167
+ version: 1.3.2
168
+ version_base: '1.3'
169
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl
170
+ config_sources:
171
+ - path: hydra.conf
172
+ schema: pkg
173
+ provider: hydra
174
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl/verl/trainer/config
175
+ schema: file
176
+ provider: main
177
+ - path: ''
178
+ schema: structured
179
+ provider: schema
180
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl/outputs/2026-02-01/08-00-20
181
+ choices:
182
+ algorithm@algorithm.rollout_correction: rollout_correction
183
+ reward_model: dp_reward_loop
184
+ critic: dp_critic
185
+ critic/../engine@critic.model.fsdp_config: fsdp
186
+ critic/../optim@critic.optim: fsdp
187
+ model@actor_rollout_ref.model: hf_model
188
+ rollout@actor_rollout_ref.rollout: rollout
189
+ ref@actor_rollout_ref.ref: dp_ref
190
+ ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp
191
+ data: legacy_data
192
+ actor@actor_rollout_ref.actor: dp_actor
193
+ actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp
194
+ actor/../optim@actor_rollout_ref.actor.optim: fsdp
195
+ hydra/env: default
196
+ hydra/callbacks: null
197
+ hydra/job_logging: default
198
+ hydra/hydra_logging: default
199
+ hydra/hydra_help: default
200
+ hydra/help: default
201
+ hydra/sweeper: basic
202
+ hydra/launcher: basic
203
+ hydra/output: default
204
+ verbose: false
code/RL_model/verl/verl_train/outputs/2026-02-01/08-00-20/.hydra/overrides.yaml ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - algorithm.adv_estimator=grpo
2
+ - data.train_files=/home/mshahidul/data/gsm8k/train.parquet
3
+ - data.val_files=/home/mshahidul/data/gsm8k/test.parquet
4
+ - data.train_batch_size=1024
5
+ - data.max_prompt_length=512
6
+ - data.max_response_length=1024
7
+ - data.filter_overlong_prompts=True
8
+ - data.truncation=error
9
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
10
+ - actor_rollout_ref.actor.optim.lr=1e-6
11
+ - actor_rollout_ref.model.use_remove_padding=True
12
+ - actor_rollout_ref.actor.ppo_mini_batch_size=256
13
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32
14
+ - actor_rollout_ref.actor.use_kl_loss=True
15
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
16
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
17
+ - actor_rollout_ref.actor.entropy_coeff=0
18
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
19
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
20
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
21
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
22
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=2
23
+ - actor_rollout_ref.rollout.name=vllm
24
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.6
25
+ - actor_rollout_ref.rollout.n=5
26
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
27
+ - actor_rollout_ref.ref.fsdp_config.param_offload=True
28
+ - algorithm.use_kl_in_reward=False
29
+ - trainer.critic_warmup=0
30
+ - trainer.logger=["console","wandb"]
31
+ - trainer.project_name=verl
32
+ - trainer.experiment_name=qwen3-4b-instruct-2507-function-rm
33
+ - trainer.n_gpus_per_node=2
34
+ - trainer.nnodes=1
35
+ - trainer.save_freq=20
36
+ - trainer.test_freq=5
37
+ - trainer.total_epochs=15
code/RL_model/verl/verl_train/outputs/2026-02-01/08-00-20/main_ppo.log ADDED
File without changes
code/RL_model/verl/verl_train/outputs/2026-02-01/08-03-02/.hydra/config.yaml ADDED
@@ -0,0 +1,648 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ actor_rollout_ref:
2
+ actor:
3
+ optim:
4
+ _target_: verl.workers.config.FSDPOptimizerConfig
5
+ optimizer: AdamW
6
+ optimizer_impl: torch.optim
7
+ lr: 1.0e-06
8
+ lr_warmup_steps_ratio: 0.0
9
+ total_training_steps: -1
10
+ weight_decay: 0.01
11
+ lr_warmup_steps: -1
12
+ betas:
13
+ - 0.9
14
+ - 0.999
15
+ clip_grad: 1.0
16
+ min_lr_ratio: 0.0
17
+ num_cycles: 0.5
18
+ lr_scheduler_type: constant
19
+ warmup_style: null
20
+ override_optimizer_config: null
21
+ fsdp_config:
22
+ _target_: verl.workers.config.FSDPEngineConfig
23
+ wrap_policy:
24
+ min_num_params: 0
25
+ param_offload: false
26
+ optimizer_offload: false
27
+ offload_policy: false
28
+ reshard_after_forward: true
29
+ fsdp_size: -1
30
+ forward_prefetch: false
31
+ model_dtype: fp32
32
+ use_orig_params: false
33
+ seed: 42
34
+ full_determinism: false
35
+ ulysses_sequence_parallel_size: 1
36
+ entropy_from_logits_with_chunking: false
37
+ use_torch_compile: true
38
+ entropy_checkpointing: false
39
+ forward_only: false
40
+ strategy: fsdp
41
+ dtype: bfloat16
42
+ _target_: verl.workers.config.FSDPActorConfig
43
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
44
+ strategy: fsdp
45
+ ppo_mini_batch_size: 256
46
+ ppo_micro_batch_size: null
47
+ ppo_micro_batch_size_per_gpu: 32
48
+ use_dynamic_bsz: false
49
+ ppo_max_token_len_per_gpu: 16384
50
+ clip_ratio: 0.2
51
+ clip_ratio_low: 0.2
52
+ clip_ratio_high: 0.2
53
+ tau_pos: 1.0
54
+ tau_neg: 1.05
55
+ freeze_vision_tower: false
56
+ policy_loss:
57
+ _target_: verl.workers.config.PolicyLossConfig
58
+ loss_mode: vanilla
59
+ clip_cov_ratio: 0.0002
60
+ clip_cov_lb: 1.0
61
+ clip_cov_ub: 5.0
62
+ kl_cov_ratio: 0.0002
63
+ ppo_kl_coef: 0.1
64
+ clip_ratio_c: 3.0
65
+ loss_agg_mode: token-mean
66
+ loss_scale_factor: null
67
+ entropy_coeff: 0
68
+ calculate_entropy: false
69
+ use_kl_loss: true
70
+ use_prefix_grouper: false
71
+ use_torch_compile: true
72
+ kl_loss_coef: 0.001
73
+ kl_loss_type: low_var_kl
74
+ ppo_epochs: 1
75
+ shuffle: false
76
+ data_loader_seed: 42
77
+ checkpoint:
78
+ _target_: verl.trainer.config.CheckpointConfig
79
+ save_contents:
80
+ - model
81
+ - optimizer
82
+ - extra
83
+ load_contents: ${.save_contents}
84
+ async_save: false
85
+ use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
86
+ profiler:
87
+ _target_: verl.utils.profiler.ProfilerConfig
88
+ tool: ${oc.select:global_profiler.tool,null}
89
+ enable: false
90
+ all_ranks: false
91
+ ranks: []
92
+ save_path: ${oc.select:global_profiler.save_path,null}
93
+ tool_config:
94
+ nsys:
95
+ _target_: verl.utils.profiler.config.NsightToolConfig
96
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
97
+ npu:
98
+ _target_: verl.utils.profiler.config.NPUToolConfig
99
+ contents: []
100
+ level: level0
101
+ analysis: true
102
+ discrete: false
103
+ torch:
104
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
105
+ contents: []
106
+ discrete: false
107
+ torch_memory:
108
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
109
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
110
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
111
+ router_replay:
112
+ _target_: verl.workers.config.RouterReplayConfig
113
+ mode: disabled
114
+ record_file: null
115
+ replay_file: null
116
+ grad_clip: 1.0
117
+ ulysses_sequence_parallel_size: 1
118
+ entropy_from_logits_with_chunking: false
119
+ entropy_checkpointing: false
120
+ use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
121
+ calculate_sum_pi_squared: false
122
+ sum_pi_squared_checkpointing: false
123
+ ref:
124
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
125
+ strategy: ${actor_rollout_ref.actor.strategy}
126
+ use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
127
+ log_prob_micro_batch_size: null
128
+ log_prob_micro_batch_size_per_gpu: 32
129
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
130
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
131
+ profiler:
132
+ _target_: verl.utils.profiler.ProfilerConfig
133
+ tool: ${oc.select:global_profiler.tool,null}
134
+ enable: false
135
+ all_ranks: false
136
+ ranks: []
137
+ save_path: ${oc.select:global_profiler.save_path,null}
138
+ tool_config:
139
+ nsys:
140
+ _target_: verl.utils.profiler.config.NsightToolConfig
141
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
142
+ npu:
143
+ _target_: verl.utils.profiler.config.NPUToolConfig
144
+ contents: []
145
+ level: level0
146
+ analysis: true
147
+ discrete: false
148
+ torch:
149
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
150
+ contents: []
151
+ discrete: false
152
+ torch_memory:
153
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
154
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
155
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
156
+ router_replay:
157
+ _target_: verl.workers.config.RouterReplayConfig
158
+ mode: disabled
159
+ record_file: null
160
+ replay_file: null
161
+ fsdp_config:
162
+ _target_: verl.workers.config.FSDPEngineConfig
163
+ wrap_policy:
164
+ min_num_params: 0
165
+ param_offload: true
166
+ optimizer_offload: false
167
+ offload_policy: false
168
+ reshard_after_forward: true
169
+ fsdp_size: -1
170
+ forward_prefetch: false
171
+ model_dtype: fp32
172
+ use_orig_params: false
173
+ seed: 42
174
+ full_determinism: false
175
+ ulysses_sequence_parallel_size: 1
176
+ entropy_from_logits_with_chunking: false
177
+ use_torch_compile: true
178
+ entropy_checkpointing: false
179
+ forward_only: true
180
+ strategy: fsdp
181
+ dtype: bfloat16
182
+ _target_: verl.workers.config.FSDPActorConfig
183
+ ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
184
+ entropy_from_logits_with_chunking: false
185
+ entropy_checkpointing: false
186
+ rollout:
187
+ _target_: verl.workers.config.RolloutConfig
188
+ name: vllm
189
+ mode: async
190
+ temperature: 1.0
191
+ top_k: -1
192
+ top_p: 1
193
+ prompt_length: ${oc.select:data.max_prompt_length,512}
194
+ response_length: ${oc.select:data.max_response_length,512}
195
+ dtype: bfloat16
196
+ gpu_memory_utilization: 0.6
197
+ ignore_eos: false
198
+ enforce_eager: false
199
+ cudagraph_capture_sizes: null
200
+ free_cache_engine: true
201
+ tensor_model_parallel_size: 2
202
+ data_parallel_size: 1
203
+ expert_parallel_size: 1
204
+ pipeline_model_parallel_size: 1
205
+ max_num_batched_tokens: 8192
206
+ max_model_len: null
207
+ max_num_seqs: 1024
208
+ enable_chunked_prefill: true
209
+ enable_prefix_caching: true
210
+ logprobs_mode: processed_logprobs
211
+ scheduling_policy: fcfs
212
+ load_format: dummy
213
+ log_prob_micro_batch_size: null
214
+ log_prob_micro_batch_size_per_gpu: 32
215
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
216
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
217
+ disable_log_stats: true
218
+ do_sample: true
219
+ 'n': 5
220
+ over_sample_rate: 0
221
+ multi_stage_wake_up: false
222
+ engine_kwargs:
223
+ vllm: {}
224
+ sglang: {}
225
+ trtllm: {}
226
+ val_kwargs:
227
+ _target_: verl.workers.config.SamplingConfig
228
+ top_k: -1
229
+ top_p: 1.0
230
+ temperature: 0
231
+ 'n': 1
232
+ do_sample: false
233
+ multi_turn:
234
+ _target_: verl.workers.config.MultiTurnConfig
235
+ enable: false
236
+ max_assistant_turns: null
237
+ tool_config_path: null
238
+ max_user_turns: null
239
+ max_parallel_calls: 1
240
+ max_tool_response_length: 256
241
+ tool_response_truncate_side: middle
242
+ interaction_config_path: null
243
+ use_inference_chat_template: false
244
+ tokenization_sanity_check_mode: strict
245
+ format: hermes
246
+ num_repeat_rollouts: null
247
+ calculate_log_probs: false
248
+ agent:
249
+ _target_: verl.workers.config.AgentLoopConfig
250
+ num_workers: 8
251
+ default_agent_loop: single_turn_agent
252
+ agent_loop_config_path: null
253
+ custom_async_server:
254
+ _target_: verl.workers.config.CustomAsyncServerConfig
255
+ path: null
256
+ name: null
257
+ checkpoint_engine:
258
+ _target_: verl.workers.config.CheckpointEngineConfig
259
+ backend: naive
260
+ update_weights_bucket_megabytes: 2048
261
+ engine_kwargs: {}
262
+ trace:
263
+ _target_: verl.workers.config.TraceConfig
264
+ backend: null
265
+ token2text: false
266
+ max_samples_per_step_per_worker: null
267
+ skip_rollout: false
268
+ skip_dump_dir: /tmp/rollout_dump
269
+ skip_tokenizer_init: true
270
+ enable_rollout_routing_replay: false
271
+ profiler:
272
+ _target_: verl.utils.profiler.ProfilerConfig
273
+ tool: ${oc.select:global_profiler.tool,null}
274
+ enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
275
+ all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
276
+ ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
277
+ save_path: ${oc.select:global_profiler.save_path,null}
278
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
279
+ prometheus:
280
+ _target_: verl.workers.config.PrometheusConfig
281
+ enable: false
282
+ port: 9090
283
+ file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml
284
+ served_model_name: ${oc.select:actor_rollout_ref.model.path,null}
285
+ quantization: null
286
+ quantization_config_file: null
287
+ mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
288
+ layered_summon: false
289
+ model:
290
+ _target_: verl.workers.config.HFModelConfig
291
+ path: Qwen/Qwen3-4B-Instruct-2507
292
+ hf_config_path: null
293
+ tokenizer_path: null
294
+ use_shm: false
295
+ trust_remote_code: false
296
+ custom_chat_template: null
297
+ external_lib: null
298
+ override_config: {}
299
+ enable_gradient_checkpointing: true
300
+ enable_activation_offload: false
301
+ use_remove_padding: true
302
+ lora_rank: 0
303
+ lora_alpha: 16
304
+ target_modules: all-linear
305
+ exclude_modules: null
306
+ lora_adapter_path: null
307
+ use_liger: false
308
+ use_fused_kernels: false
309
+ fused_kernel_options:
310
+ impl_backend: torch
311
+ tiled_mlp:
312
+ enabled: false
313
+ num_shards: 4
314
+ mtp:
315
+ _target_: verl.workers.config.MtpConfig
316
+ enable: false
317
+ enable_train: false
318
+ enable_rollout: false
319
+ detach_encoder: false
320
+ mtp_loss_scaling_factor: 0.1
321
+ speculative_algorithm: EAGLE
322
+ speculative_num_steps: 3
323
+ speculative_eagle_topk: 1
324
+ speculative_num_draft_tokens: 4
325
+ method: mtp
326
+ num_speculative_tokens: 1
327
+ hybrid_engine: true
328
+ nccl_timeout: 600
329
+ data:
330
+ tokenizer: null
331
+ use_shm: false
332
+ train_files: /home/mshahidul/data/gsm8k/train.parquet
333
+ val_files: /home/mshahidul/data/gsm8k/test.parquet
334
+ train_max_samples: -1
335
+ val_max_samples: -1
336
+ prompt_key: prompt
337
+ reward_fn_key: data_source
338
+ max_prompt_length: 512
339
+ max_response_length: 1024
340
+ train_batch_size: 1024
341
+ val_batch_size: null
342
+ tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path,
343
+ null}
344
+ return_raw_input_ids: false
345
+ return_raw_chat: true
346
+ return_full_prompt: false
347
+ shuffle: true
348
+ seed: null
349
+ dataloader_num_workers: 8
350
+ image_patch_size: 14
351
+ validation_shuffle: false
352
+ filter_overlong_prompts: true
353
+ filter_overlong_prompts_workers: 1
354
+ truncation: error
355
+ image_key: images
356
+ video_key: videos
357
+ trust_remote_code: false
358
+ custom_cls:
359
+ path: null
360
+ name: null
361
+ return_multi_modal_inputs: true
362
+ sampler:
363
+ class_path: null
364
+ class_name: null
365
+ datagen:
366
+ path: null
367
+ name: null
368
+ apply_chat_template_kwargs: {}
369
+ reward_manager:
370
+ _target_: verl.trainer.config.config.RewardManagerConfig
371
+ source: register
372
+ name: ${oc.select:reward_model.reward_manager,naive}
373
+ module:
374
+ _target_: verl.trainer.config.config.ModuleConfig
375
+ path: null
376
+ name: custom_reward_manager
377
+ critic:
378
+ optim:
379
+ _target_: verl.workers.config.FSDPOptimizerConfig
380
+ optimizer: AdamW
381
+ optimizer_impl: torch.optim
382
+ lr: 1.0e-05
383
+ lr_warmup_steps_ratio: 0.0
384
+ total_training_steps: -1
385
+ weight_decay: 0.01
386
+ lr_warmup_steps: -1
387
+ betas:
388
+ - 0.9
389
+ - 0.999
390
+ clip_grad: 1.0
391
+ min_lr_ratio: 0.0
392
+ num_cycles: 0.5
393
+ lr_scheduler_type: constant
394
+ warmup_style: null
395
+ override_optimizer_config: null
396
+ model:
397
+ fsdp_config:
398
+ _target_: verl.workers.config.FSDPEngineConfig
399
+ wrap_policy:
400
+ min_num_params: 0
401
+ param_offload: false
402
+ optimizer_offload: false
403
+ offload_policy: false
404
+ reshard_after_forward: true
405
+ fsdp_size: -1
406
+ forward_prefetch: false
407
+ model_dtype: fp32
408
+ use_orig_params: false
409
+ seed: 42
410
+ full_determinism: false
411
+ ulysses_sequence_parallel_size: 1
412
+ entropy_from_logits_with_chunking: false
413
+ use_torch_compile: true
414
+ entropy_checkpointing: false
415
+ forward_only: false
416
+ strategy: fsdp
417
+ dtype: bfloat16
418
+ path: ~/models/deepseek-llm-7b-chat
419
+ tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
420
+ override_config: {}
421
+ external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
422
+ trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
423
+ _target_: verl.workers.config.FSDPCriticModelCfg
424
+ use_shm: false
425
+ enable_gradient_checkpointing: true
426
+ enable_activation_offload: false
427
+ use_remove_padding: false
428
+ lora_rank: 0
429
+ lora_alpha: 16
430
+ target_modules: all-linear
431
+ tiled_mlp:
432
+ enabled: false
433
+ num_shards: 4
434
+ _target_: verl.workers.config.FSDPCriticConfig
435
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
436
+ strategy: fsdp
437
+ enable: null
438
+ ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
439
+ ppo_micro_batch_size: null
440
+ ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
441
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
442
+ ppo_max_token_len_per_gpu: 32768
443
+ forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
444
+ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
445
+ shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
446
+ data_loader_seed: 42
447
+ cliprange_value: 0.5
448
+ loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
449
+ checkpoint:
450
+ _target_: verl.trainer.config.CheckpointConfig
451
+ save_contents:
452
+ - model
453
+ - optimizer
454
+ - extra
455
+ load_contents: ${.save_contents}
456
+ async_save: false
457
+ profiler:
458
+ _target_: verl.utils.profiler.ProfilerConfig
459
+ tool: ${oc.select:global_profiler.tool,null}
460
+ enable: false
461
+ all_ranks: false
462
+ ranks: []
463
+ save_path: ${oc.select:global_profiler.save_path,null}
464
+ tool_config:
465
+ nsys:
466
+ _target_: verl.utils.profiler.config.NsightToolConfig
467
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
468
+ npu:
469
+ _target_: verl.utils.profiler.config.NPUToolConfig
470
+ contents: []
471
+ level: level0
472
+ analysis: true
473
+ discrete: false
474
+ torch:
475
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
476
+ contents: []
477
+ discrete: false
478
+ torch_memory:
479
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
480
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
481
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
482
+ forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
483
+ forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
484
+ ulysses_sequence_parallel_size: 1
485
+ grad_clip: 1.0
486
+ reward_model:
487
+ enable: false
488
+ enable_resource_pool: false
489
+ n_gpus_per_node: 8
490
+ nnodes: 0
491
+ strategy: fsdp
492
+ model:
493
+ input_tokenizer: ${actor_rollout_ref.model.path}
494
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
495
+ external_lib: ${actor_rollout_ref.model.external_lib}
496
+ trust_remote_code: false
497
+ override_config: {}
498
+ use_shm: false
499
+ use_remove_padding: false
500
+ use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
501
+ fsdp_config:
502
+ _target_: verl.workers.config.FSDPEngineConfig
503
+ wrap_policy:
504
+ min_num_params: 0
505
+ param_offload: false
506
+ reshard_after_forward: true
507
+ fsdp_size: -1
508
+ forward_prefetch: false
509
+ micro_batch_size: null
510
+ micro_batch_size_per_gpu: null
511
+ max_length: null
512
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
513
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
514
+ reward_manager: naive
515
+ reward_loop_source: register
516
+ reward_loop_module_path: null
517
+ reward_loop_class_name: null
518
+ launch_reward_fn_async: false
519
+ sandbox_fusion:
520
+ url: null
521
+ max_concurrent: 64
522
+ memory_limit_mb: 1024
523
+ profiler:
524
+ _target_: verl.utils.profiler.ProfilerConfig
525
+ tool: ${oc.select:global_profiler.tool,null}
526
+ enable: false
527
+ all_ranks: false
528
+ ranks: []
529
+ save_path: ${oc.select:global_profiler.save_path,null}
530
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
531
+ ulysses_sequence_parallel_size: 1
532
+ use_reward_loop: true
533
+ num_workers: 1
534
+ rollout:
535
+ _target_: verl.workers.config.RolloutConfig
536
+ name: ???
537
+ dtype: bfloat16
538
+ gpu_memory_utilization: 0.5
539
+ enforce_eager: true
540
+ cudagraph_capture_sizes: null
541
+ free_cache_engine: true
542
+ data_parallel_size: 1
543
+ expert_parallel_size: 1
544
+ tensor_model_parallel_size: 2
545
+ max_num_batched_tokens: 8192
546
+ max_model_len: null
547
+ max_num_seqs: 1024
548
+ load_format: auto
549
+ engine_kwargs: {}
550
+ limit_images: null
551
+ enable_chunked_prefill: true
552
+ enable_prefix_caching: true
553
+ disable_log_stats: true
554
+ skip_tokenizer_init: false
555
+ prompt_length: 2048
556
+ response_length: 2048
557
+ algorithm:
558
+ rollout_correction:
559
+ rollout_is: null
560
+ rollout_is_threshold: 2.0
561
+ rollout_rs: null
562
+ rollout_rs_threshold: null
563
+ bypass_mode: false
564
+ loss_type: ppo_clip
565
+ rollout_is_batch_normalize: false
566
+ _target_: verl.trainer.config.AlgoConfig
567
+ gamma: 1.0
568
+ lam: 1.0
569
+ adv_estimator: grpo
570
+ norm_adv_by_std_in_grpo: true
571
+ use_kl_in_reward: false
572
+ kl_penalty: kl
573
+ kl_ctrl:
574
+ _target_: verl.trainer.config.KLControlConfig
575
+ type: fixed
576
+ kl_coef: 0.001
577
+ horizon: 10000
578
+ target_kl: 0.1
579
+ use_pf_ppo: false
580
+ pf_ppo:
581
+ reweight_method: pow
582
+ weight_pow: 2.0
583
+ custom_reward_function:
584
+ path: null
585
+ name: compute_score
586
+ trainer:
587
+ balance_batch: true
588
+ total_epochs: 15
589
+ total_training_steps: null
590
+ project_name: verl
591
+ experiment_name: qwen3-4b-instruct-2507-function-rm
592
+ logger:
593
+ - console
594
+ - wandb
595
+ log_val_generations: 0
596
+ rollout_data_dir: null
597
+ validation_data_dir: null
598
+ nnodes: 1
599
+ n_gpus_per_node: 2
600
+ save_freq: 20
601
+ esi_redundant_time: 0
602
+ resume_mode: auto
603
+ resume_from_path: null
604
+ val_before_train: true
605
+ val_only: false
606
+ test_freq: 5
607
+ critic_warmup: 0
608
+ default_hdfs_dir: null
609
+ del_local_ckpt_after_load: false
610
+ default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
611
+ max_actor_ckpt_to_keep: null
612
+ max_critic_ckpt_to_keep: null
613
+ ray_wait_register_center_timeout: 300
614
+ device: cuda
615
+ use_legacy_worker_impl: auto
616
+ global_profiler:
617
+ _target_: verl.utils.profiler.ProfilerConfig
618
+ tool: null
619
+ steps: null
620
+ profile_continuous_steps: false
621
+ save_path: outputs/profile
622
+ global_tool_config:
623
+ nsys:
624
+ _target_: verl.utils.profiler.config.NsightToolConfig
625
+ discrete: false
626
+ controller_nsight_options:
627
+ trace: cuda,nvtx,cublas,ucx
628
+ cuda-memory-usage: 'true'
629
+ cuda-graph-trace: graph
630
+ worker_nsight_options:
631
+ trace: cuda,nvtx,cublas,ucx
632
+ cuda-memory-usage: 'true'
633
+ cuda-graph-trace: graph
634
+ capture-range: cudaProfilerApi
635
+ capture-range-end: null
636
+ kill: none
637
+ torch_memory:
638
+ trace_alloc_max_entries: 100000
639
+ stack_depth: 32
640
+ context: all
641
+ stacks: all
642
+ kw_args: {}
643
+ transfer_queue:
644
+ enable: false
645
+ ray_kwargs:
646
+ ray_init:
647
+ num_cpus: null
648
+ timeline_json_file: null
code/RL_model/verl/verl_train/outputs/2026-02-01/08-03-02/.hydra/hydra.yaml ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task:
115
+ - algorithm.adv_estimator=grpo
116
+ - data.train_files=/home/mshahidul/data/gsm8k/train.parquet
117
+ - data.val_files=/home/mshahidul/data/gsm8k/test.parquet
118
+ - data.train_batch_size=1024
119
+ - data.max_prompt_length=512
120
+ - data.max_response_length=1024
121
+ - data.filter_overlong_prompts=True
122
+ - data.truncation=error
123
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
124
+ - actor_rollout_ref.actor.optim.lr=1e-6
125
+ - actor_rollout_ref.model.use_remove_padding=True
126
+ - actor_rollout_ref.actor.ppo_mini_batch_size=256
127
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32
128
+ - actor_rollout_ref.actor.use_kl_loss=True
129
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
130
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
131
+ - actor_rollout_ref.actor.entropy_coeff=0
132
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
133
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
134
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
135
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
136
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=2
137
+ - actor_rollout_ref.rollout.name=vllm
138
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.6
139
+ - actor_rollout_ref.rollout.n=5
140
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
141
+ - actor_rollout_ref.ref.fsdp_config.param_offload=True
142
+ - algorithm.use_kl_in_reward=False
143
+ - trainer.critic_warmup=0
144
+ - trainer.logger=["console","wandb"]
145
+ - trainer.project_name=verl
146
+ - trainer.experiment_name=qwen3-4b-instruct-2507-function-rm
147
+ - trainer.n_gpus_per_node=2
148
+ - trainer.nnodes=1
149
+ - trainer.save_freq=20
150
+ - trainer.test_freq=5
151
+ - trainer.total_epochs=15
152
+ job:
153
+ name: main_ppo
154
+ chdir: null
155
+ override_dirname: actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.n=5,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=2,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,data.filter_overlong_prompts=True,data.max_prompt_length=512,data.max_response_length=1024,data.train_batch_size=1024,data.train_files=/home/mshahidul/data/gsm8k/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/data/gsm8k/test.parquet,trainer.critic_warmup=0,trainer.experiment_name=qwen3-4b-instruct-2507-function-rm,trainer.logger=["console","wandb"],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=verl,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15
156
+ id: ???
157
+ num: ???
158
+ config_name: ppo_trainer
159
+ env_set: {}
160
+ env_copy: []
161
+ config:
162
+ override_dirname:
163
+ kv_sep: '='
164
+ item_sep: ','
165
+ exclude_keys: []
166
+ runtime:
167
+ version: 1.3.2
168
+ version_base: '1.3'
169
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl
170
+ config_sources:
171
+ - path: hydra.conf
172
+ schema: pkg
173
+ provider: hydra
174
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl/verl/trainer/config
175
+ schema: file
176
+ provider: main
177
+ - path: ''
178
+ schema: structured
179
+ provider: schema
180
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl/outputs/2026-02-01/08-03-02
181
+ choices:
182
+ algorithm@algorithm.rollout_correction: rollout_correction
183
+ reward_model: dp_reward_loop
184
+ critic: dp_critic
185
+ critic/../engine@critic.model.fsdp_config: fsdp
186
+ critic/../optim@critic.optim: fsdp
187
+ model@actor_rollout_ref.model: hf_model
188
+ rollout@actor_rollout_ref.rollout: rollout
189
+ ref@actor_rollout_ref.ref: dp_ref
190
+ ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp
191
+ data: legacy_data
192
+ actor@actor_rollout_ref.actor: dp_actor
193
+ actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp
194
+ actor/../optim@actor_rollout_ref.actor.optim: fsdp
195
+ hydra/env: default
196
+ hydra/callbacks: null
197
+ hydra/job_logging: default
198
+ hydra/hydra_logging: default
199
+ hydra/hydra_help: default
200
+ hydra/help: default
201
+ hydra/sweeper: basic
202
+ hydra/launcher: basic
203
+ hydra/output: default
204
+ verbose: false
code/RL_model/verl/verl_train/outputs/2026-02-01/08-03-02/.hydra/overrides.yaml ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - algorithm.adv_estimator=grpo
2
+ - data.train_files=/home/mshahidul/data/gsm8k/train.parquet
3
+ - data.val_files=/home/mshahidul/data/gsm8k/test.parquet
4
+ - data.train_batch_size=1024
5
+ - data.max_prompt_length=512
6
+ - data.max_response_length=1024
7
+ - data.filter_overlong_prompts=True
8
+ - data.truncation=error
9
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
10
+ - actor_rollout_ref.actor.optim.lr=1e-6
11
+ - actor_rollout_ref.model.use_remove_padding=True
12
+ - actor_rollout_ref.actor.ppo_mini_batch_size=256
13
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32
14
+ - actor_rollout_ref.actor.use_kl_loss=True
15
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
16
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
17
+ - actor_rollout_ref.actor.entropy_coeff=0
18
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
19
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
20
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
21
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
22
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=2
23
+ - actor_rollout_ref.rollout.name=vllm
24
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.6
25
+ - actor_rollout_ref.rollout.n=5
26
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
27
+ - actor_rollout_ref.ref.fsdp_config.param_offload=True
28
+ - algorithm.use_kl_in_reward=False
29
+ - trainer.critic_warmup=0
30
+ - trainer.logger=["console","wandb"]
31
+ - trainer.project_name=verl
32
+ - trainer.experiment_name=qwen3-4b-instruct-2507-function-rm
33
+ - trainer.n_gpus_per_node=2
34
+ - trainer.nnodes=1
35
+ - trainer.save_freq=20
36
+ - trainer.test_freq=5
37
+ - trainer.total_epochs=15
code/RL_model/verl/verl_train/outputs/2026-02-01/08-12-45/.hydra/config.yaml ADDED
@@ -0,0 +1,648 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ actor_rollout_ref:
2
+ actor:
3
+ optim:
4
+ _target_: verl.workers.config.FSDPOptimizerConfig
5
+ optimizer: AdamW
6
+ optimizer_impl: torch.optim
7
+ lr: 1.0e-06
8
+ lr_warmup_steps_ratio: 0.0
9
+ total_training_steps: -1
10
+ weight_decay: 0.01
11
+ lr_warmup_steps: -1
12
+ betas:
13
+ - 0.9
14
+ - 0.999
15
+ clip_grad: 1.0
16
+ min_lr_ratio: 0.0
17
+ num_cycles: 0.5
18
+ lr_scheduler_type: constant
19
+ warmup_style: null
20
+ override_optimizer_config: null
21
+ fsdp_config:
22
+ _target_: verl.workers.config.FSDPEngineConfig
23
+ wrap_policy:
24
+ min_num_params: 0
25
+ param_offload: false
26
+ optimizer_offload: false
27
+ offload_policy: false
28
+ reshard_after_forward: true
29
+ fsdp_size: -1
30
+ forward_prefetch: false
31
+ model_dtype: fp32
32
+ use_orig_params: false
33
+ seed: 42
34
+ full_determinism: false
35
+ ulysses_sequence_parallel_size: 1
36
+ entropy_from_logits_with_chunking: false
37
+ use_torch_compile: true
38
+ entropy_checkpointing: false
39
+ forward_only: false
40
+ strategy: fsdp
41
+ dtype: bfloat16
42
+ _target_: verl.workers.config.FSDPActorConfig
43
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
44
+ strategy: fsdp
45
+ ppo_mini_batch_size: 512
46
+ ppo_micro_batch_size: null
47
+ ppo_micro_batch_size_per_gpu: 64
48
+ use_dynamic_bsz: false
49
+ ppo_max_token_len_per_gpu: 16384
50
+ clip_ratio: 0.2
51
+ clip_ratio_low: 0.2
52
+ clip_ratio_high: 0.2
53
+ tau_pos: 1.0
54
+ tau_neg: 1.05
55
+ freeze_vision_tower: false
56
+ policy_loss:
57
+ _target_: verl.workers.config.PolicyLossConfig
58
+ loss_mode: vanilla
59
+ clip_cov_ratio: 0.0002
60
+ clip_cov_lb: 1.0
61
+ clip_cov_ub: 5.0
62
+ kl_cov_ratio: 0.0002
63
+ ppo_kl_coef: 0.1
64
+ clip_ratio_c: 3.0
65
+ loss_agg_mode: token-mean
66
+ loss_scale_factor: null
67
+ entropy_coeff: 0
68
+ calculate_entropy: false
69
+ use_kl_loss: true
70
+ use_prefix_grouper: false
71
+ use_torch_compile: true
72
+ kl_loss_coef: 0.001
73
+ kl_loss_type: low_var_kl
74
+ ppo_epochs: 1
75
+ shuffle: false
76
+ data_loader_seed: 42
77
+ checkpoint:
78
+ _target_: verl.trainer.config.CheckpointConfig
79
+ save_contents:
80
+ - model
81
+ - optimizer
82
+ - extra
83
+ load_contents: ${.save_contents}
84
+ async_save: false
85
+ use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
86
+ profiler:
87
+ _target_: verl.utils.profiler.ProfilerConfig
88
+ tool: ${oc.select:global_profiler.tool,null}
89
+ enable: false
90
+ all_ranks: false
91
+ ranks: []
92
+ save_path: ${oc.select:global_profiler.save_path,null}
93
+ tool_config:
94
+ nsys:
95
+ _target_: verl.utils.profiler.config.NsightToolConfig
96
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
97
+ npu:
98
+ _target_: verl.utils.profiler.config.NPUToolConfig
99
+ contents: []
100
+ level: level0
101
+ analysis: true
102
+ discrete: false
103
+ torch:
104
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
105
+ contents: []
106
+ discrete: false
107
+ torch_memory:
108
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
109
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
110
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
111
+ router_replay:
112
+ _target_: verl.workers.config.RouterReplayConfig
113
+ mode: disabled
114
+ record_file: null
115
+ replay_file: null
116
+ grad_clip: 1.0
117
+ ulysses_sequence_parallel_size: 1
118
+ entropy_from_logits_with_chunking: false
119
+ entropy_checkpointing: false
120
+ use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
121
+ calculate_sum_pi_squared: false
122
+ sum_pi_squared_checkpointing: false
123
+ ref:
124
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
125
+ strategy: ${actor_rollout_ref.actor.strategy}
126
+ use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
127
+ log_prob_micro_batch_size: null
128
+ log_prob_micro_batch_size_per_gpu: 64
129
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
130
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
131
+ profiler:
132
+ _target_: verl.utils.profiler.ProfilerConfig
133
+ tool: ${oc.select:global_profiler.tool,null}
134
+ enable: false
135
+ all_ranks: false
136
+ ranks: []
137
+ save_path: ${oc.select:global_profiler.save_path,null}
138
+ tool_config:
139
+ nsys:
140
+ _target_: verl.utils.profiler.config.NsightToolConfig
141
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
142
+ npu:
143
+ _target_: verl.utils.profiler.config.NPUToolConfig
144
+ contents: []
145
+ level: level0
146
+ analysis: true
147
+ discrete: false
148
+ torch:
149
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
150
+ contents: []
151
+ discrete: false
152
+ torch_memory:
153
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
154
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
155
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
156
+ router_replay:
157
+ _target_: verl.workers.config.RouterReplayConfig
158
+ mode: disabled
159
+ record_file: null
160
+ replay_file: null
161
+ fsdp_config:
162
+ _target_: verl.workers.config.FSDPEngineConfig
163
+ wrap_policy:
164
+ min_num_params: 0
165
+ param_offload: false
166
+ optimizer_offload: false
167
+ offload_policy: false
168
+ reshard_after_forward: true
169
+ fsdp_size: -1
170
+ forward_prefetch: false
171
+ model_dtype: fp32
172
+ use_orig_params: false
173
+ seed: 42
174
+ full_determinism: false
175
+ ulysses_sequence_parallel_size: 1
176
+ entropy_from_logits_with_chunking: false
177
+ use_torch_compile: true
178
+ entropy_checkpointing: false
179
+ forward_only: true
180
+ strategy: fsdp
181
+ dtype: bfloat16
182
+ _target_: verl.workers.config.FSDPActorConfig
183
+ ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
184
+ entropy_from_logits_with_chunking: false
185
+ entropy_checkpointing: false
186
+ rollout:
187
+ _target_: verl.workers.config.RolloutConfig
188
+ name: vllm
189
+ mode: async
190
+ temperature: 1.0
191
+ top_k: -1
192
+ top_p: 1
193
+ prompt_length: ${oc.select:data.max_prompt_length,512}
194
+ response_length: ${oc.select:data.max_response_length,512}
195
+ dtype: bfloat16
196
+ gpu_memory_utilization: 0.7
197
+ ignore_eos: false
198
+ enforce_eager: false
199
+ cudagraph_capture_sizes: null
200
+ free_cache_engine: true
201
+ tensor_model_parallel_size: 1
202
+ data_parallel_size: 1
203
+ expert_parallel_size: 1
204
+ pipeline_model_parallel_size: 1
205
+ max_num_batched_tokens: 8192
206
+ max_model_len: null
207
+ max_num_seqs: 1024
208
+ enable_chunked_prefill: true
209
+ enable_prefix_caching: true
210
+ logprobs_mode: processed_logprobs
211
+ scheduling_policy: fcfs
212
+ load_format: dummy
213
+ log_prob_micro_batch_size: null
214
+ log_prob_micro_batch_size_per_gpu: 64
215
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
216
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
217
+ disable_log_stats: true
218
+ do_sample: true
219
+ 'n': 5
220
+ over_sample_rate: 0
221
+ multi_stage_wake_up: false
222
+ engine_kwargs:
223
+ vllm: {}
224
+ sglang: {}
225
+ trtllm: {}
226
+ val_kwargs:
227
+ _target_: verl.workers.config.SamplingConfig
228
+ top_k: -1
229
+ top_p: 1.0
230
+ temperature: 0
231
+ 'n': 1
232
+ do_sample: false
233
+ multi_turn:
234
+ _target_: verl.workers.config.MultiTurnConfig
235
+ enable: false
236
+ max_assistant_turns: null
237
+ tool_config_path: null
238
+ max_user_turns: null
239
+ max_parallel_calls: 1
240
+ max_tool_response_length: 256
241
+ tool_response_truncate_side: middle
242
+ interaction_config_path: null
243
+ use_inference_chat_template: false
244
+ tokenization_sanity_check_mode: strict
245
+ format: hermes
246
+ num_repeat_rollouts: null
247
+ calculate_log_probs: false
248
+ agent:
249
+ _target_: verl.workers.config.AgentLoopConfig
250
+ num_workers: 8
251
+ default_agent_loop: single_turn_agent
252
+ agent_loop_config_path: null
253
+ custom_async_server:
254
+ _target_: verl.workers.config.CustomAsyncServerConfig
255
+ path: null
256
+ name: null
257
+ checkpoint_engine:
258
+ _target_: verl.workers.config.CheckpointEngineConfig
259
+ backend: naive
260
+ update_weights_bucket_megabytes: 2048
261
+ engine_kwargs: {}
262
+ trace:
263
+ _target_: verl.workers.config.TraceConfig
264
+ backend: null
265
+ token2text: false
266
+ max_samples_per_step_per_worker: null
267
+ skip_rollout: false
268
+ skip_dump_dir: /tmp/rollout_dump
269
+ skip_tokenizer_init: true
270
+ enable_rollout_routing_replay: false
271
+ profiler:
272
+ _target_: verl.utils.profiler.ProfilerConfig
273
+ tool: ${oc.select:global_profiler.tool,null}
274
+ enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
275
+ all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
276
+ ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
277
+ save_path: ${oc.select:global_profiler.save_path,null}
278
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
279
+ prometheus:
280
+ _target_: verl.workers.config.PrometheusConfig
281
+ enable: false
282
+ port: 9090
283
+ file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml
284
+ served_model_name: ${oc.select:actor_rollout_ref.model.path,null}
285
+ quantization: null
286
+ quantization_config_file: null
287
+ mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
288
+ layered_summon: false
289
+ model:
290
+ _target_: verl.workers.config.HFModelConfig
291
+ path: Qwen/Qwen3-4B-Instruct-2507
292
+ hf_config_path: null
293
+ tokenizer_path: null
294
+ use_shm: false
295
+ trust_remote_code: false
296
+ custom_chat_template: null
297
+ external_lib: null
298
+ override_config: {}
299
+ enable_gradient_checkpointing: true
300
+ enable_activation_offload: false
301
+ use_remove_padding: true
302
+ lora_rank: 0
303
+ lora_alpha: 16
304
+ target_modules: all-linear
305
+ exclude_modules: null
306
+ lora_adapter_path: null
307
+ use_liger: false
308
+ use_fused_kernels: false
309
+ fused_kernel_options:
310
+ impl_backend: torch
311
+ tiled_mlp:
312
+ enabled: false
313
+ num_shards: 4
314
+ mtp:
315
+ _target_: verl.workers.config.MtpConfig
316
+ enable: false
317
+ enable_train: false
318
+ enable_rollout: false
319
+ detach_encoder: false
320
+ mtp_loss_scaling_factor: 0.1
321
+ speculative_algorithm: EAGLE
322
+ speculative_num_steps: 3
323
+ speculative_eagle_topk: 1
324
+ speculative_num_draft_tokens: 4
325
+ method: mtp
326
+ num_speculative_tokens: 1
327
+ hybrid_engine: true
328
+ nccl_timeout: 600
329
+ data:
330
+ tokenizer: null
331
+ use_shm: false
332
+ train_files: /home/mshahidul/data/gsm8k/train.parquet
333
+ val_files: /home/mshahidul/data/gsm8k/test.parquet
334
+ train_max_samples: -1
335
+ val_max_samples: -1
336
+ prompt_key: prompt
337
+ reward_fn_key: data_source
338
+ max_prompt_length: 512
339
+ max_response_length: 1024
340
+ train_batch_size: 1024
341
+ val_batch_size: null
342
+ tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path,
343
+ null}
344
+ return_raw_input_ids: false
345
+ return_raw_chat: true
346
+ return_full_prompt: false
347
+ shuffle: true
348
+ seed: null
349
+ dataloader_num_workers: 8
350
+ image_patch_size: 14
351
+ validation_shuffle: false
352
+ filter_overlong_prompts: true
353
+ filter_overlong_prompts_workers: 1
354
+ truncation: error
355
+ image_key: images
356
+ video_key: videos
357
+ trust_remote_code: false
358
+ custom_cls:
359
+ path: null
360
+ name: null
361
+ return_multi_modal_inputs: true
362
+ sampler:
363
+ class_path: null
364
+ class_name: null
365
+ datagen:
366
+ path: null
367
+ name: null
368
+ apply_chat_template_kwargs: {}
369
+ reward_manager:
370
+ _target_: verl.trainer.config.config.RewardManagerConfig
371
+ source: register
372
+ name: ${oc.select:reward_model.reward_manager,naive}
373
+ module:
374
+ _target_: verl.trainer.config.config.ModuleConfig
375
+ path: null
376
+ name: custom_reward_manager
377
+ critic:
378
+ optim:
379
+ _target_: verl.workers.config.FSDPOptimizerConfig
380
+ optimizer: AdamW
381
+ optimizer_impl: torch.optim
382
+ lr: 1.0e-05
383
+ lr_warmup_steps_ratio: 0.0
384
+ total_training_steps: -1
385
+ weight_decay: 0.01
386
+ lr_warmup_steps: -1
387
+ betas:
388
+ - 0.9
389
+ - 0.999
390
+ clip_grad: 1.0
391
+ min_lr_ratio: 0.0
392
+ num_cycles: 0.5
393
+ lr_scheduler_type: constant
394
+ warmup_style: null
395
+ override_optimizer_config: null
396
+ model:
397
+ fsdp_config:
398
+ _target_: verl.workers.config.FSDPEngineConfig
399
+ wrap_policy:
400
+ min_num_params: 0
401
+ param_offload: false
402
+ optimizer_offload: false
403
+ offload_policy: false
404
+ reshard_after_forward: true
405
+ fsdp_size: -1
406
+ forward_prefetch: false
407
+ model_dtype: fp32
408
+ use_orig_params: false
409
+ seed: 42
410
+ full_determinism: false
411
+ ulysses_sequence_parallel_size: 1
412
+ entropy_from_logits_with_chunking: false
413
+ use_torch_compile: true
414
+ entropy_checkpointing: false
415
+ forward_only: false
416
+ strategy: fsdp
417
+ dtype: bfloat16
418
+ path: ~/models/deepseek-llm-7b-chat
419
+ tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
420
+ override_config: {}
421
+ external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
422
+ trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
423
+ _target_: verl.workers.config.FSDPCriticModelCfg
424
+ use_shm: false
425
+ enable_gradient_checkpointing: true
426
+ enable_activation_offload: false
427
+ use_remove_padding: false
428
+ lora_rank: 0
429
+ lora_alpha: 16
430
+ target_modules: all-linear
431
+ tiled_mlp:
432
+ enabled: false
433
+ num_shards: 4
434
+ _target_: verl.workers.config.FSDPCriticConfig
435
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
436
+ strategy: fsdp
437
+ enable: null
438
+ ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
439
+ ppo_micro_batch_size: null
440
+ ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
441
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
442
+ ppo_max_token_len_per_gpu: 32768
443
+ forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
444
+ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
445
+ shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
446
+ data_loader_seed: 42
447
+ cliprange_value: 0.5
448
+ loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
449
+ checkpoint:
450
+ _target_: verl.trainer.config.CheckpointConfig
451
+ save_contents:
452
+ - model
453
+ - optimizer
454
+ - extra
455
+ load_contents: ${.save_contents}
456
+ async_save: false
457
+ profiler:
458
+ _target_: verl.utils.profiler.ProfilerConfig
459
+ tool: ${oc.select:global_profiler.tool,null}
460
+ enable: false
461
+ all_ranks: false
462
+ ranks: []
463
+ save_path: ${oc.select:global_profiler.save_path,null}
464
+ tool_config:
465
+ nsys:
466
+ _target_: verl.utils.profiler.config.NsightToolConfig
467
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
468
+ npu:
469
+ _target_: verl.utils.profiler.config.NPUToolConfig
470
+ contents: []
471
+ level: level0
472
+ analysis: true
473
+ discrete: false
474
+ torch:
475
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
476
+ contents: []
477
+ discrete: false
478
+ torch_memory:
479
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
480
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
481
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
482
+ forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
483
+ forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
484
+ ulysses_sequence_parallel_size: 1
485
+ grad_clip: 1.0
486
+ reward_model:
487
+ enable: false
488
+ enable_resource_pool: false
489
+ n_gpus_per_node: 8
490
+ nnodes: 0
491
+ strategy: fsdp
492
+ model:
493
+ input_tokenizer: ${actor_rollout_ref.model.path}
494
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
495
+ external_lib: ${actor_rollout_ref.model.external_lib}
496
+ trust_remote_code: false
497
+ override_config: {}
498
+ use_shm: false
499
+ use_remove_padding: false
500
+ use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
501
+ fsdp_config:
502
+ _target_: verl.workers.config.FSDPEngineConfig
503
+ wrap_policy:
504
+ min_num_params: 0
505
+ param_offload: false
506
+ reshard_after_forward: true
507
+ fsdp_size: -1
508
+ forward_prefetch: false
509
+ micro_batch_size: null
510
+ micro_batch_size_per_gpu: null
511
+ max_length: null
512
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
513
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
514
+ reward_manager: naive
515
+ reward_loop_source: register
516
+ reward_loop_module_path: null
517
+ reward_loop_class_name: null
518
+ launch_reward_fn_async: false
519
+ sandbox_fusion:
520
+ url: null
521
+ max_concurrent: 64
522
+ memory_limit_mb: 1024
523
+ profiler:
524
+ _target_: verl.utils.profiler.ProfilerConfig
525
+ tool: ${oc.select:global_profiler.tool,null}
526
+ enable: false
527
+ all_ranks: false
528
+ ranks: []
529
+ save_path: ${oc.select:global_profiler.save_path,null}
530
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
531
+ ulysses_sequence_parallel_size: 1
532
+ use_reward_loop: true
533
+ num_workers: 1
534
+ rollout:
535
+ _target_: verl.workers.config.RolloutConfig
536
+ name: ???
537
+ dtype: bfloat16
538
+ gpu_memory_utilization: 0.5
539
+ enforce_eager: true
540
+ cudagraph_capture_sizes: null
541
+ free_cache_engine: true
542
+ data_parallel_size: 1
543
+ expert_parallel_size: 1
544
+ tensor_model_parallel_size: 2
545
+ max_num_batched_tokens: 8192
546
+ max_model_len: null
547
+ max_num_seqs: 1024
548
+ load_format: auto
549
+ engine_kwargs: {}
550
+ limit_images: null
551
+ enable_chunked_prefill: true
552
+ enable_prefix_caching: true
553
+ disable_log_stats: true
554
+ skip_tokenizer_init: false
555
+ prompt_length: 2048
556
+ response_length: 2048
557
+ algorithm:
558
+ rollout_correction:
559
+ rollout_is: null
560
+ rollout_is_threshold: 2.0
561
+ rollout_rs: null
562
+ rollout_rs_threshold: null
563
+ bypass_mode: false
564
+ loss_type: ppo_clip
565
+ rollout_is_batch_normalize: false
566
+ _target_: verl.trainer.config.AlgoConfig
567
+ gamma: 1.0
568
+ lam: 1.0
569
+ adv_estimator: grpo
570
+ norm_adv_by_std_in_grpo: true
571
+ use_kl_in_reward: false
572
+ kl_penalty: kl
573
+ kl_ctrl:
574
+ _target_: verl.trainer.config.KLControlConfig
575
+ type: fixed
576
+ kl_coef: 0.001
577
+ horizon: 10000
578
+ target_kl: 0.1
579
+ use_pf_ppo: false
580
+ pf_ppo:
581
+ reweight_method: pow
582
+ weight_pow: 2.0
583
+ custom_reward_function:
584
+ path: null
585
+ name: compute_score
586
+ trainer:
587
+ balance_batch: true
588
+ total_epochs: 15
589
+ total_training_steps: null
590
+ project_name: verl
591
+ experiment_name: qwen3-4b-instruct-optimized
592
+ logger:
593
+ - console
594
+ - wandb
595
+ log_val_generations: 0
596
+ rollout_data_dir: null
597
+ validation_data_dir: null
598
+ nnodes: 1
599
+ n_gpus_per_node: 2
600
+ save_freq: 20
601
+ esi_redundant_time: 0
602
+ resume_mode: auto
603
+ resume_from_path: null
604
+ val_before_train: true
605
+ val_only: false
606
+ test_freq: 5
607
+ critic_warmup: 0
608
+ default_hdfs_dir: null
609
+ del_local_ckpt_after_load: false
610
+ default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
611
+ max_actor_ckpt_to_keep: null
612
+ max_critic_ckpt_to_keep: null
613
+ ray_wait_register_center_timeout: 300
614
+ device: cuda
615
+ use_legacy_worker_impl: auto
616
+ global_profiler:
617
+ _target_: verl.utils.profiler.ProfilerConfig
618
+ tool: null
619
+ steps: null
620
+ profile_continuous_steps: false
621
+ save_path: outputs/profile
622
+ global_tool_config:
623
+ nsys:
624
+ _target_: verl.utils.profiler.config.NsightToolConfig
625
+ discrete: false
626
+ controller_nsight_options:
627
+ trace: cuda,nvtx,cublas,ucx
628
+ cuda-memory-usage: 'true'
629
+ cuda-graph-trace: graph
630
+ worker_nsight_options:
631
+ trace: cuda,nvtx,cublas,ucx
632
+ cuda-memory-usage: 'true'
633
+ cuda-graph-trace: graph
634
+ capture-range: cudaProfilerApi
635
+ capture-range-end: null
636
+ kill: none
637
+ torch_memory:
638
+ trace_alloc_max_entries: 100000
639
+ stack_depth: 32
640
+ context: all
641
+ stacks: all
642
+ kw_args: {}
643
+ transfer_queue:
644
+ enable: false
645
+ ray_kwargs:
646
+ ray_init:
647
+ num_cpus: null
648
+ timeline_json_file: null
code/RL_model/verl/verl_train/outputs/2026-02-01/08-12-45/.hydra/hydra.yaml ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task:
115
+ - algorithm.adv_estimator=grpo
116
+ - data.train_files=/home/mshahidul/data/gsm8k/train.parquet
117
+ - data.val_files=/home/mshahidul/data/gsm8k/test.parquet
118
+ - data.train_batch_size=1024
119
+ - data.max_prompt_length=512
120
+ - data.max_response_length=1024
121
+ - data.filter_overlong_prompts=True
122
+ - data.truncation=error
123
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
124
+ - actor_rollout_ref.actor.optim.lr=1e-6
125
+ - actor_rollout_ref.model.use_remove_padding=True
126
+ - actor_rollout_ref.actor.ppo_mini_batch_size=512
127
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=64
128
+ - actor_rollout_ref.actor.use_kl_loss=True
129
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
130
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
131
+ - actor_rollout_ref.actor.entropy_coeff=0
132
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
133
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
134
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
135
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64
136
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
137
+ - actor_rollout_ref.rollout.name=vllm
138
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.7
139
+ - actor_rollout_ref.rollout.n=5
140
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=64
141
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
142
+ - algorithm.use_kl_in_reward=False
143
+ - trainer.critic_warmup=0
144
+ - trainer.logger=["console","wandb"]
145
+ - trainer.project_name=verl
146
+ - trainer.experiment_name=qwen3-4b-instruct-optimized
147
+ - trainer.n_gpus_per_node=2
148
+ - trainer.nnodes=1
149
+ - trainer.save_freq=20
150
+ - trainer.test_freq=5
151
+ - trainer.total_epochs=15
152
+ job:
153
+ name: main_ppo
154
+ chdir: null
155
+ override_dirname: actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=64,actor_rollout_ref.actor.ppo_mini_batch_size=512,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=64,actor_rollout_ref.rollout.gpu_memory_utilization=0.7,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64,actor_rollout_ref.rollout.n=5,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,data.filter_overlong_prompts=True,data.max_prompt_length=512,data.max_response_length=1024,data.train_batch_size=1024,data.train_files=/home/mshahidul/data/gsm8k/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/data/gsm8k/test.parquet,trainer.critic_warmup=0,trainer.experiment_name=qwen3-4b-instruct-optimized,trainer.logger=["console","wandb"],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=verl,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15
156
+ id: ???
157
+ num: ???
158
+ config_name: ppo_trainer
159
+ env_set: {}
160
+ env_copy: []
161
+ config:
162
+ override_dirname:
163
+ kv_sep: '='
164
+ item_sep: ','
165
+ exclude_keys: []
166
+ runtime:
167
+ version: 1.3.2
168
+ version_base: '1.3'
169
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl
170
+ config_sources:
171
+ - path: hydra.conf
172
+ schema: pkg
173
+ provider: hydra
174
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl/verl/trainer/config
175
+ schema: file
176
+ provider: main
177
+ - path: ''
178
+ schema: structured
179
+ provider: schema
180
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl/outputs/2026-02-01/08-12-45
181
+ choices:
182
+ algorithm@algorithm.rollout_correction: rollout_correction
183
+ reward_model: dp_reward_loop
184
+ critic: dp_critic
185
+ critic/../engine@critic.model.fsdp_config: fsdp
186
+ critic/../optim@critic.optim: fsdp
187
+ model@actor_rollout_ref.model: hf_model
188
+ rollout@actor_rollout_ref.rollout: rollout
189
+ ref@actor_rollout_ref.ref: dp_ref
190
+ ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp
191
+ data: legacy_data
192
+ actor@actor_rollout_ref.actor: dp_actor
193
+ actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp
194
+ actor/../optim@actor_rollout_ref.actor.optim: fsdp
195
+ hydra/env: default
196
+ hydra/callbacks: null
197
+ hydra/job_logging: default
198
+ hydra/hydra_logging: default
199
+ hydra/hydra_help: default
200
+ hydra/help: default
201
+ hydra/sweeper: basic
202
+ hydra/launcher: basic
203
+ hydra/output: default
204
+ verbose: false
code/RL_model/verl/verl_train/outputs/2026-02-01/08-12-45/.hydra/overrides.yaml ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - algorithm.adv_estimator=grpo
2
+ - data.train_files=/home/mshahidul/data/gsm8k/train.parquet
3
+ - data.val_files=/home/mshahidul/data/gsm8k/test.parquet
4
+ - data.train_batch_size=1024
5
+ - data.max_prompt_length=512
6
+ - data.max_response_length=1024
7
+ - data.filter_overlong_prompts=True
8
+ - data.truncation=error
9
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
10
+ - actor_rollout_ref.actor.optim.lr=1e-6
11
+ - actor_rollout_ref.model.use_remove_padding=True
12
+ - actor_rollout_ref.actor.ppo_mini_batch_size=512
13
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=64
14
+ - actor_rollout_ref.actor.use_kl_loss=True
15
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
16
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
17
+ - actor_rollout_ref.actor.entropy_coeff=0
18
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
19
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
20
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
21
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64
22
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
23
+ - actor_rollout_ref.rollout.name=vllm
24
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.7
25
+ - actor_rollout_ref.rollout.n=5
26
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=64
27
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
28
+ - algorithm.use_kl_in_reward=False
29
+ - trainer.critic_warmup=0
30
+ - trainer.logger=["console","wandb"]
31
+ - trainer.project_name=verl
32
+ - trainer.experiment_name=qwen3-4b-instruct-optimized
33
+ - trainer.n_gpus_per_node=2
34
+ - trainer.nnodes=1
35
+ - trainer.save_freq=20
36
+ - trainer.test_freq=5
37
+ - trainer.total_epochs=15
code/RL_model/verl/verl_train/outputs/2026-02-01/08-12-45/main_ppo.log ADDED
File without changes
code/RL_model/verl/verl_train/outputs/2026-02-01/11-18-25/.hydra/config.yaml ADDED
@@ -0,0 +1,648 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ actor_rollout_ref:
2
+ actor:
3
+ optim:
4
+ _target_: verl.workers.config.FSDPOptimizerConfig
5
+ optimizer: AdamW
6
+ optimizer_impl: torch.optim
7
+ lr: 1.0e-06
8
+ lr_warmup_steps_ratio: 0.0
9
+ total_training_steps: -1
10
+ weight_decay: 0.01
11
+ lr_warmup_steps: -1
12
+ betas:
13
+ - 0.9
14
+ - 0.999
15
+ clip_grad: 1.0
16
+ min_lr_ratio: 0.0
17
+ num_cycles: 0.5
18
+ lr_scheduler_type: constant
19
+ warmup_style: null
20
+ override_optimizer_config: null
21
+ fsdp_config:
22
+ _target_: verl.workers.config.FSDPEngineConfig
23
+ wrap_policy:
24
+ min_num_params: 0
25
+ param_offload: false
26
+ optimizer_offload: false
27
+ offload_policy: false
28
+ reshard_after_forward: true
29
+ fsdp_size: -1
30
+ forward_prefetch: false
31
+ model_dtype: fp32
32
+ use_orig_params: false
33
+ seed: 42
34
+ full_determinism: false
35
+ ulysses_sequence_parallel_size: 1
36
+ entropy_from_logits_with_chunking: false
37
+ use_torch_compile: true
38
+ entropy_checkpointing: false
39
+ forward_only: false
40
+ strategy: fsdp
41
+ dtype: bfloat16
42
+ _target_: verl.workers.config.FSDPActorConfig
43
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
44
+ strategy: fsdp
45
+ ppo_mini_batch_size: 512
46
+ ppo_micro_batch_size: null
47
+ ppo_micro_batch_size_per_gpu: 64
48
+ use_dynamic_bsz: false
49
+ ppo_max_token_len_per_gpu: 16384
50
+ clip_ratio: 0.2
51
+ clip_ratio_low: 0.2
52
+ clip_ratio_high: 0.2
53
+ tau_pos: 1.0
54
+ tau_neg: 1.05
55
+ freeze_vision_tower: false
56
+ policy_loss:
57
+ _target_: verl.workers.config.PolicyLossConfig
58
+ loss_mode: vanilla
59
+ clip_cov_ratio: 0.0002
60
+ clip_cov_lb: 1.0
61
+ clip_cov_ub: 5.0
62
+ kl_cov_ratio: 0.0002
63
+ ppo_kl_coef: 0.1
64
+ clip_ratio_c: 3.0
65
+ loss_agg_mode: token-mean
66
+ loss_scale_factor: null
67
+ entropy_coeff: 0
68
+ calculate_entropy: false
69
+ use_kl_loss: true
70
+ use_prefix_grouper: false
71
+ use_torch_compile: true
72
+ kl_loss_coef: 0.001
73
+ kl_loss_type: low_var_kl
74
+ ppo_epochs: 1
75
+ shuffle: false
76
+ data_loader_seed: 42
77
+ checkpoint:
78
+ _target_: verl.trainer.config.CheckpointConfig
79
+ save_contents:
80
+ - model
81
+ - optimizer
82
+ - extra
83
+ load_contents: ${.save_contents}
84
+ async_save: false
85
+ use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
86
+ profiler:
87
+ _target_: verl.utils.profiler.ProfilerConfig
88
+ tool: ${oc.select:global_profiler.tool,null}
89
+ enable: false
90
+ all_ranks: false
91
+ ranks: []
92
+ save_path: ${oc.select:global_profiler.save_path,null}
93
+ tool_config:
94
+ nsys:
95
+ _target_: verl.utils.profiler.config.NsightToolConfig
96
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
97
+ npu:
98
+ _target_: verl.utils.profiler.config.NPUToolConfig
99
+ contents: []
100
+ level: level0
101
+ analysis: true
102
+ discrete: false
103
+ torch:
104
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
105
+ contents: []
106
+ discrete: false
107
+ torch_memory:
108
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
109
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
110
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
111
+ router_replay:
112
+ _target_: verl.workers.config.RouterReplayConfig
113
+ mode: disabled
114
+ record_file: null
115
+ replay_file: null
116
+ grad_clip: 1.0
117
+ ulysses_sequence_parallel_size: 1
118
+ entropy_from_logits_with_chunking: false
119
+ entropy_checkpointing: false
120
+ use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
121
+ calculate_sum_pi_squared: false
122
+ sum_pi_squared_checkpointing: false
123
+ ref:
124
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
125
+ strategy: ${actor_rollout_ref.actor.strategy}
126
+ use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
127
+ log_prob_micro_batch_size: null
128
+ log_prob_micro_batch_size_per_gpu: 64
129
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
130
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
131
+ profiler:
132
+ _target_: verl.utils.profiler.ProfilerConfig
133
+ tool: ${oc.select:global_profiler.tool,null}
134
+ enable: false
135
+ all_ranks: false
136
+ ranks: []
137
+ save_path: ${oc.select:global_profiler.save_path,null}
138
+ tool_config:
139
+ nsys:
140
+ _target_: verl.utils.profiler.config.NsightToolConfig
141
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
142
+ npu:
143
+ _target_: verl.utils.profiler.config.NPUToolConfig
144
+ contents: []
145
+ level: level0
146
+ analysis: true
147
+ discrete: false
148
+ torch:
149
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
150
+ contents: []
151
+ discrete: false
152
+ torch_memory:
153
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
154
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
155
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
156
+ router_replay:
157
+ _target_: verl.workers.config.RouterReplayConfig
158
+ mode: disabled
159
+ record_file: null
160
+ replay_file: null
161
+ fsdp_config:
162
+ _target_: verl.workers.config.FSDPEngineConfig
163
+ wrap_policy:
164
+ min_num_params: 0
165
+ param_offload: false
166
+ optimizer_offload: false
167
+ offload_policy: false
168
+ reshard_after_forward: true
169
+ fsdp_size: -1
170
+ forward_prefetch: false
171
+ model_dtype: fp32
172
+ use_orig_params: false
173
+ seed: 42
174
+ full_determinism: false
175
+ ulysses_sequence_parallel_size: 1
176
+ entropy_from_logits_with_chunking: false
177
+ use_torch_compile: true
178
+ entropy_checkpointing: false
179
+ forward_only: true
180
+ strategy: fsdp
181
+ dtype: bfloat16
182
+ _target_: verl.workers.config.FSDPActorConfig
183
+ ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
184
+ entropy_from_logits_with_chunking: false
185
+ entropy_checkpointing: false
186
+ rollout:
187
+ _target_: verl.workers.config.RolloutConfig
188
+ name: vllm
189
+ mode: async
190
+ temperature: 1.0
191
+ top_k: -1
192
+ top_p: 1
193
+ prompt_length: ${oc.select:data.max_prompt_length,512}
194
+ response_length: ${oc.select:data.max_response_length,512}
195
+ dtype: bfloat16
196
+ gpu_memory_utilization: 0.7
197
+ ignore_eos: false
198
+ enforce_eager: false
199
+ cudagraph_capture_sizes: null
200
+ free_cache_engine: true
201
+ tensor_model_parallel_size: 1
202
+ data_parallel_size: 1
203
+ expert_parallel_size: 1
204
+ pipeline_model_parallel_size: 1
205
+ max_num_batched_tokens: 8192
206
+ max_model_len: null
207
+ max_num_seqs: 1024
208
+ enable_chunked_prefill: true
209
+ enable_prefix_caching: true
210
+ logprobs_mode: processed_logprobs
211
+ scheduling_policy: fcfs
212
+ load_format: dummy
213
+ log_prob_micro_batch_size: null
214
+ log_prob_micro_batch_size_per_gpu: 64
215
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
216
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
217
+ disable_log_stats: true
218
+ do_sample: true
219
+ 'n': 5
220
+ over_sample_rate: 0
221
+ multi_stage_wake_up: false
222
+ engine_kwargs:
223
+ vllm: {}
224
+ sglang: {}
225
+ trtllm: {}
226
+ val_kwargs:
227
+ _target_: verl.workers.config.SamplingConfig
228
+ top_k: -1
229
+ top_p: 1.0
230
+ temperature: 0
231
+ 'n': 1
232
+ do_sample: false
233
+ multi_turn:
234
+ _target_: verl.workers.config.MultiTurnConfig
235
+ enable: false
236
+ max_assistant_turns: null
237
+ tool_config_path: null
238
+ max_user_turns: null
239
+ max_parallel_calls: 1
240
+ max_tool_response_length: 256
241
+ tool_response_truncate_side: middle
242
+ interaction_config_path: null
243
+ use_inference_chat_template: false
244
+ tokenization_sanity_check_mode: strict
245
+ format: hermes
246
+ num_repeat_rollouts: null
247
+ calculate_log_probs: false
248
+ agent:
249
+ _target_: verl.workers.config.AgentLoopConfig
250
+ num_workers: 8
251
+ default_agent_loop: single_turn_agent
252
+ agent_loop_config_path: null
253
+ custom_async_server:
254
+ _target_: verl.workers.config.CustomAsyncServerConfig
255
+ path: null
256
+ name: null
257
+ checkpoint_engine:
258
+ _target_: verl.workers.config.CheckpointEngineConfig
259
+ backend: naive
260
+ update_weights_bucket_megabytes: 2048
261
+ engine_kwargs: {}
262
+ trace:
263
+ _target_: verl.workers.config.TraceConfig
264
+ backend: null
265
+ token2text: false
266
+ max_samples_per_step_per_worker: null
267
+ skip_rollout: false
268
+ skip_dump_dir: /tmp/rollout_dump
269
+ skip_tokenizer_init: true
270
+ enable_rollout_routing_replay: false
271
+ profiler:
272
+ _target_: verl.utils.profiler.ProfilerConfig
273
+ tool: ${oc.select:global_profiler.tool,null}
274
+ enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
275
+ all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
276
+ ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
277
+ save_path: ${oc.select:global_profiler.save_path,null}
278
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
279
+ prometheus:
280
+ _target_: verl.workers.config.PrometheusConfig
281
+ enable: false
282
+ port: 9090
283
+ file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml
284
+ served_model_name: ${oc.select:actor_rollout_ref.model.path,null}
285
+ quantization: null
286
+ quantization_config_file: null
287
+ mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
288
+ layered_summon: false
289
+ model:
290
+ _target_: verl.workers.config.HFModelConfig
291
+ path: Qwen/Qwen3-4B-Instruct-2507
292
+ hf_config_path: null
293
+ tokenizer_path: null
294
+ use_shm: false
295
+ trust_remote_code: false
296
+ custom_chat_template: null
297
+ external_lib: null
298
+ override_config: {}
299
+ enable_gradient_checkpointing: true
300
+ enable_activation_offload: false
301
+ use_remove_padding: true
302
+ lora_rank: 0
303
+ lora_alpha: 16
304
+ target_modules: all-linear
305
+ exclude_modules: null
306
+ lora_adapter_path: null
307
+ use_liger: false
308
+ use_fused_kernels: false
309
+ fused_kernel_options:
310
+ impl_backend: torch
311
+ tiled_mlp:
312
+ enabled: false
313
+ num_shards: 4
314
+ mtp:
315
+ _target_: verl.workers.config.MtpConfig
316
+ enable: false
317
+ enable_train: false
318
+ enable_rollout: false
319
+ detach_encoder: false
320
+ mtp_loss_scaling_factor: 0.1
321
+ speculative_algorithm: EAGLE
322
+ speculative_num_steps: 3
323
+ speculative_eagle_topk: 1
324
+ speculative_num_draft_tokens: 4
325
+ method: mtp
326
+ num_speculative_tokens: 1
327
+ hybrid_engine: true
328
+ nccl_timeout: 600
329
+ data:
330
+ tokenizer: null
331
+ use_shm: false
332
+ train_files: /home/mshahidul/data/gsm8k/train.parquet
333
+ val_files: /home/mshahidul/data/gsm8k/test.parquet
334
+ train_max_samples: -1
335
+ val_max_samples: -1
336
+ prompt_key: prompt
337
+ reward_fn_key: data_source
338
+ max_prompt_length: 512
339
+ max_response_length: 1024
340
+ train_batch_size: 1024
341
+ val_batch_size: null
342
+ tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path,
343
+ null}
344
+ return_raw_input_ids: false
345
+ return_raw_chat: true
346
+ return_full_prompt: false
347
+ shuffle: true
348
+ seed: null
349
+ dataloader_num_workers: 8
350
+ image_patch_size: 14
351
+ validation_shuffle: false
352
+ filter_overlong_prompts: true
353
+ filter_overlong_prompts_workers: 1
354
+ truncation: error
355
+ image_key: images
356
+ video_key: videos
357
+ trust_remote_code: false
358
+ custom_cls:
359
+ path: null
360
+ name: null
361
+ return_multi_modal_inputs: true
362
+ sampler:
363
+ class_path: null
364
+ class_name: null
365
+ datagen:
366
+ path: null
367
+ name: null
368
+ apply_chat_template_kwargs: {}
369
+ reward_manager:
370
+ _target_: verl.trainer.config.config.RewardManagerConfig
371
+ source: register
372
+ name: ${oc.select:reward_model.reward_manager,naive}
373
+ module:
374
+ _target_: verl.trainer.config.config.ModuleConfig
375
+ path: null
376
+ name: custom_reward_manager
377
+ critic:
378
+ optim:
379
+ _target_: verl.workers.config.FSDPOptimizerConfig
380
+ optimizer: AdamW
381
+ optimizer_impl: torch.optim
382
+ lr: 1.0e-05
383
+ lr_warmup_steps_ratio: 0.0
384
+ total_training_steps: -1
385
+ weight_decay: 0.01
386
+ lr_warmup_steps: -1
387
+ betas:
388
+ - 0.9
389
+ - 0.999
390
+ clip_grad: 1.0
391
+ min_lr_ratio: 0.0
392
+ num_cycles: 0.5
393
+ lr_scheduler_type: constant
394
+ warmup_style: null
395
+ override_optimizer_config: null
396
+ model:
397
+ fsdp_config:
398
+ _target_: verl.workers.config.FSDPEngineConfig
399
+ wrap_policy:
400
+ min_num_params: 0
401
+ param_offload: false
402
+ optimizer_offload: false
403
+ offload_policy: false
404
+ reshard_after_forward: true
405
+ fsdp_size: -1
406
+ forward_prefetch: false
407
+ model_dtype: fp32
408
+ use_orig_params: false
409
+ seed: 42
410
+ full_determinism: false
411
+ ulysses_sequence_parallel_size: 1
412
+ entropy_from_logits_with_chunking: false
413
+ use_torch_compile: true
414
+ entropy_checkpointing: false
415
+ forward_only: false
416
+ strategy: fsdp
417
+ dtype: bfloat16
418
+ path: ~/models/deepseek-llm-7b-chat
419
+ tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
420
+ override_config: {}
421
+ external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
422
+ trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
423
+ _target_: verl.workers.config.FSDPCriticModelCfg
424
+ use_shm: false
425
+ enable_gradient_checkpointing: true
426
+ enable_activation_offload: false
427
+ use_remove_padding: false
428
+ lora_rank: 0
429
+ lora_alpha: 16
430
+ target_modules: all-linear
431
+ tiled_mlp:
432
+ enabled: false
433
+ num_shards: 4
434
+ _target_: verl.workers.config.FSDPCriticConfig
435
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
436
+ strategy: fsdp
437
+ enable: null
438
+ ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
439
+ ppo_micro_batch_size: null
440
+ ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
441
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
442
+ ppo_max_token_len_per_gpu: 32768
443
+ forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
444
+ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
445
+ shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
446
+ data_loader_seed: 42
447
+ cliprange_value: 0.5
448
+ loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
449
+ checkpoint:
450
+ _target_: verl.trainer.config.CheckpointConfig
451
+ save_contents:
452
+ - model
453
+ - optimizer
454
+ - extra
455
+ load_contents: ${.save_contents}
456
+ async_save: false
457
+ profiler:
458
+ _target_: verl.utils.profiler.ProfilerConfig
459
+ tool: ${oc.select:global_profiler.tool,null}
460
+ enable: false
461
+ all_ranks: false
462
+ ranks: []
463
+ save_path: ${oc.select:global_profiler.save_path,null}
464
+ tool_config:
465
+ nsys:
466
+ _target_: verl.utils.profiler.config.NsightToolConfig
467
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
468
+ npu:
469
+ _target_: verl.utils.profiler.config.NPUToolConfig
470
+ contents: []
471
+ level: level0
472
+ analysis: true
473
+ discrete: false
474
+ torch:
475
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
476
+ contents: []
477
+ discrete: false
478
+ torch_memory:
479
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
480
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
481
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
482
+ forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
483
+ forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
484
+ ulysses_sequence_parallel_size: 1
485
+ grad_clip: 1.0
486
+ reward_model:
487
+ enable: false
488
+ enable_resource_pool: false
489
+ n_gpus_per_node: 8
490
+ nnodes: 0
491
+ strategy: fsdp
492
+ model:
493
+ input_tokenizer: ${actor_rollout_ref.model.path}
494
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
495
+ external_lib: ${actor_rollout_ref.model.external_lib}
496
+ trust_remote_code: false
497
+ override_config: {}
498
+ use_shm: false
499
+ use_remove_padding: false
500
+ use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
501
+ fsdp_config:
502
+ _target_: verl.workers.config.FSDPEngineConfig
503
+ wrap_policy:
504
+ min_num_params: 0
505
+ param_offload: false
506
+ reshard_after_forward: true
507
+ fsdp_size: -1
508
+ forward_prefetch: false
509
+ micro_batch_size: null
510
+ micro_batch_size_per_gpu: null
511
+ max_length: null
512
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
513
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
514
+ reward_manager: naive
515
+ reward_loop_source: register
516
+ reward_loop_module_path: null
517
+ reward_loop_class_name: null
518
+ launch_reward_fn_async: false
519
+ sandbox_fusion:
520
+ url: null
521
+ max_concurrent: 64
522
+ memory_limit_mb: 1024
523
+ profiler:
524
+ _target_: verl.utils.profiler.ProfilerConfig
525
+ tool: ${oc.select:global_profiler.tool,null}
526
+ enable: false
527
+ all_ranks: false
528
+ ranks: []
529
+ save_path: ${oc.select:global_profiler.save_path,null}
530
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
531
+ ulysses_sequence_parallel_size: 1
532
+ use_reward_loop: true
533
+ num_workers: 1
534
+ rollout:
535
+ _target_: verl.workers.config.RolloutConfig
536
+ name: ???
537
+ dtype: bfloat16
538
+ gpu_memory_utilization: 0.5
539
+ enforce_eager: true
540
+ cudagraph_capture_sizes: null
541
+ free_cache_engine: true
542
+ data_parallel_size: 1
543
+ expert_parallel_size: 1
544
+ tensor_model_parallel_size: 2
545
+ max_num_batched_tokens: 8192
546
+ max_model_len: null
547
+ max_num_seqs: 1024
548
+ load_format: auto
549
+ engine_kwargs: {}
550
+ limit_images: null
551
+ enable_chunked_prefill: true
552
+ enable_prefix_caching: true
553
+ disable_log_stats: true
554
+ skip_tokenizer_init: false
555
+ prompt_length: 2048
556
+ response_length: 2048
557
+ algorithm:
558
+ rollout_correction:
559
+ rollout_is: null
560
+ rollout_is_threshold: 2.0
561
+ rollout_rs: null
562
+ rollout_rs_threshold: null
563
+ bypass_mode: false
564
+ loss_type: ppo_clip
565
+ rollout_is_batch_normalize: false
566
+ _target_: verl.trainer.config.AlgoConfig
567
+ gamma: 1.0
568
+ lam: 1.0
569
+ adv_estimator: grpo
570
+ norm_adv_by_std_in_grpo: true
571
+ use_kl_in_reward: false
572
+ kl_penalty: kl
573
+ kl_ctrl:
574
+ _target_: verl.trainer.config.KLControlConfig
575
+ type: fixed
576
+ kl_coef: 0.001
577
+ horizon: 10000
578
+ target_kl: 0.1
579
+ use_pf_ppo: false
580
+ pf_ppo:
581
+ reweight_method: pow
582
+ weight_pow: 2.0
583
+ custom_reward_function:
584
+ path: null
585
+ name: compute_score
586
+ trainer:
587
+ balance_batch: true
588
+ total_epochs: 15
589
+ total_training_steps: null
590
+ project_name: verl
591
+ experiment_name: qwen3-4b-instruct-optimized
592
+ logger:
593
+ - console
594
+ - wandb
595
+ log_val_generations: 0
596
+ rollout_data_dir: null
597
+ validation_data_dir: null
598
+ nnodes: 1
599
+ n_gpus_per_node: 2
600
+ save_freq: 20
601
+ esi_redundant_time: 0
602
+ resume_mode: auto
603
+ resume_from_path: null
604
+ val_before_train: true
605
+ val_only: false
606
+ test_freq: 5
607
+ critic_warmup: 0
608
+ default_hdfs_dir: null
609
+ del_local_ckpt_after_load: false
610
+ default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
611
+ max_actor_ckpt_to_keep: null
612
+ max_critic_ckpt_to_keep: null
613
+ ray_wait_register_center_timeout: 300
614
+ device: cuda
615
+ use_legacy_worker_impl: auto
616
+ global_profiler:
617
+ _target_: verl.utils.profiler.ProfilerConfig
618
+ tool: null
619
+ steps: null
620
+ profile_continuous_steps: false
621
+ save_path: outputs/profile
622
+ global_tool_config:
623
+ nsys:
624
+ _target_: verl.utils.profiler.config.NsightToolConfig
625
+ discrete: false
626
+ controller_nsight_options:
627
+ trace: cuda,nvtx,cublas,ucx
628
+ cuda-memory-usage: 'true'
629
+ cuda-graph-trace: graph
630
+ worker_nsight_options:
631
+ trace: cuda,nvtx,cublas,ucx
632
+ cuda-memory-usage: 'true'
633
+ cuda-graph-trace: graph
634
+ capture-range: cudaProfilerApi
635
+ capture-range-end: null
636
+ kill: none
637
+ torch_memory:
638
+ trace_alloc_max_entries: 100000
639
+ stack_depth: 32
640
+ context: all
641
+ stacks: all
642
+ kw_args: {}
643
+ transfer_queue:
644
+ enable: false
645
+ ray_kwargs:
646
+ ray_init:
647
+ num_cpus: null
648
+ timeline_json_file: null
code/RL_model/verl/verl_train/outputs/2026-02-01/11-18-25/.hydra/hydra.yaml ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task:
115
+ - algorithm.adv_estimator=grpo
116
+ - data.train_files=/home/mshahidul/data/gsm8k/train.parquet
117
+ - data.val_files=/home/mshahidul/data/gsm8k/test.parquet
118
+ - data.train_batch_size=1024
119
+ - data.max_prompt_length=512
120
+ - data.max_response_length=1024
121
+ - data.filter_overlong_prompts=True
122
+ - data.truncation=error
123
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
124
+ - actor_rollout_ref.actor.optim.lr=1e-6
125
+ - actor_rollout_ref.model.use_remove_padding=True
126
+ - actor_rollout_ref.actor.ppo_mini_batch_size=512
127
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=64
128
+ - actor_rollout_ref.actor.use_kl_loss=True
129
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
130
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
131
+ - actor_rollout_ref.actor.entropy_coeff=0
132
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
133
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
134
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
135
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64
136
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
137
+ - actor_rollout_ref.rollout.name=vllm
138
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.7
139
+ - actor_rollout_ref.rollout.n=5
140
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=64
141
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
142
+ - algorithm.use_kl_in_reward=False
143
+ - trainer.critic_warmup=0
144
+ - trainer.logger=["console","wandb"]
145
+ - trainer.project_name=verl
146
+ - trainer.experiment_name=qwen3-4b-instruct-optimized
147
+ - trainer.n_gpus_per_node=2
148
+ - trainer.nnodes=1
149
+ - trainer.save_freq=20
150
+ - trainer.test_freq=5
151
+ - trainer.total_epochs=15
152
+ job:
153
+ name: main_ppo
154
+ chdir: null
155
+ override_dirname: actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=64,actor_rollout_ref.actor.ppo_mini_batch_size=512,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=64,actor_rollout_ref.rollout.gpu_memory_utilization=0.7,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64,actor_rollout_ref.rollout.n=5,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,data.filter_overlong_prompts=True,data.max_prompt_length=512,data.max_response_length=1024,data.train_batch_size=1024,data.train_files=/home/mshahidul/data/gsm8k/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/data/gsm8k/test.parquet,trainer.critic_warmup=0,trainer.experiment_name=qwen3-4b-instruct-optimized,trainer.logger=["console","wandb"],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=verl,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15
156
+ id: ???
157
+ num: ???
158
+ config_name: ppo_trainer
159
+ env_set: {}
160
+ env_copy: []
161
+ config:
162
+ override_dirname:
163
+ kv_sep: '='
164
+ item_sep: ','
165
+ exclude_keys: []
166
+ runtime:
167
+ version: 1.3.2
168
+ version_base: '1.3'
169
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train
170
+ config_sources:
171
+ - path: hydra.conf
172
+ schema: pkg
173
+ provider: hydra
174
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config
175
+ schema: file
176
+ provider: main
177
+ - path: ''
178
+ schema: structured
179
+ provider: schema
180
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-01/11-18-25
181
+ choices:
182
+ algorithm@algorithm.rollout_correction: rollout_correction
183
+ reward_model: dp_reward_loop
184
+ critic: dp_critic
185
+ critic/../engine@critic.model.fsdp_config: fsdp
186
+ critic/../optim@critic.optim: fsdp
187
+ model@actor_rollout_ref.model: hf_model
188
+ rollout@actor_rollout_ref.rollout: rollout
189
+ ref@actor_rollout_ref.ref: dp_ref
190
+ ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp
191
+ data: legacy_data
192
+ actor@actor_rollout_ref.actor: dp_actor
193
+ actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp
194
+ actor/../optim@actor_rollout_ref.actor.optim: fsdp
195
+ hydra/env: default
196
+ hydra/callbacks: null
197
+ hydra/job_logging: default
198
+ hydra/hydra_logging: default
199
+ hydra/hydra_help: default
200
+ hydra/help: default
201
+ hydra/sweeper: basic
202
+ hydra/launcher: basic
203
+ hydra/output: default
204
+ verbose: false
code/RL_model/verl/verl_train/outputs/2026-02-01/11-18-25/.hydra/overrides.yaml ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - algorithm.adv_estimator=grpo
2
+ - data.train_files=/home/mshahidul/data/gsm8k/train.parquet
3
+ - data.val_files=/home/mshahidul/data/gsm8k/test.parquet
4
+ - data.train_batch_size=1024
5
+ - data.max_prompt_length=512
6
+ - data.max_response_length=1024
7
+ - data.filter_overlong_prompts=True
8
+ - data.truncation=error
9
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
10
+ - actor_rollout_ref.actor.optim.lr=1e-6
11
+ - actor_rollout_ref.model.use_remove_padding=True
12
+ - actor_rollout_ref.actor.ppo_mini_batch_size=512
13
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=64
14
+ - actor_rollout_ref.actor.use_kl_loss=True
15
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
16
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
17
+ - actor_rollout_ref.actor.entropy_coeff=0
18
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
19
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
20
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
21
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64
22
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
23
+ - actor_rollout_ref.rollout.name=vllm
24
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.7
25
+ - actor_rollout_ref.rollout.n=5
26
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=64
27
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
28
+ - algorithm.use_kl_in_reward=False
29
+ - trainer.critic_warmup=0
30
+ - trainer.logger=["console","wandb"]
31
+ - trainer.project_name=verl
32
+ - trainer.experiment_name=qwen3-4b-instruct-optimized
33
+ - trainer.n_gpus_per_node=2
34
+ - trainer.nnodes=1
35
+ - trainer.save_freq=20
36
+ - trainer.test_freq=5
37
+ - trainer.total_epochs=15
code/RL_model/verl/verl_train/outputs/2026-02-01/11-27-06/.hydra/config.yaml ADDED
@@ -0,0 +1,648 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ actor_rollout_ref:
2
+ actor:
3
+ optim:
4
+ _target_: verl.workers.config.FSDPOptimizerConfig
5
+ optimizer: AdamW
6
+ optimizer_impl: torch.optim
7
+ lr: 1.0e-06
8
+ lr_warmup_steps_ratio: 0.0
9
+ total_training_steps: -1
10
+ weight_decay: 0.01
11
+ lr_warmup_steps: -1
12
+ betas:
13
+ - 0.9
14
+ - 0.999
15
+ clip_grad: 1.0
16
+ min_lr_ratio: 0.0
17
+ num_cycles: 0.5
18
+ lr_scheduler_type: constant
19
+ warmup_style: null
20
+ override_optimizer_config: null
21
+ fsdp_config:
22
+ _target_: verl.workers.config.FSDPEngineConfig
23
+ wrap_policy:
24
+ min_num_params: 0
25
+ param_offload: false
26
+ optimizer_offload: false
27
+ offload_policy: false
28
+ reshard_after_forward: true
29
+ fsdp_size: -1
30
+ forward_prefetch: false
31
+ model_dtype: fp32
32
+ use_orig_params: false
33
+ seed: 42
34
+ full_determinism: false
35
+ ulysses_sequence_parallel_size: 1
36
+ entropy_from_logits_with_chunking: false
37
+ use_torch_compile: true
38
+ entropy_checkpointing: false
39
+ forward_only: false
40
+ strategy: fsdp
41
+ dtype: bfloat16
42
+ _target_: verl.workers.config.FSDPActorConfig
43
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
44
+ strategy: fsdp
45
+ ppo_mini_batch_size: 512
46
+ ppo_micro_batch_size: null
47
+ ppo_micro_batch_size_per_gpu: 64
48
+ use_dynamic_bsz: false
49
+ ppo_max_token_len_per_gpu: 16384
50
+ clip_ratio: 0.2
51
+ clip_ratio_low: 0.2
52
+ clip_ratio_high: 0.2
53
+ tau_pos: 1.0
54
+ tau_neg: 1.05
55
+ freeze_vision_tower: false
56
+ policy_loss:
57
+ _target_: verl.workers.config.PolicyLossConfig
58
+ loss_mode: vanilla
59
+ clip_cov_ratio: 0.0002
60
+ clip_cov_lb: 1.0
61
+ clip_cov_ub: 5.0
62
+ kl_cov_ratio: 0.0002
63
+ ppo_kl_coef: 0.1
64
+ clip_ratio_c: 3.0
65
+ loss_agg_mode: token-mean
66
+ loss_scale_factor: null
67
+ entropy_coeff: 0
68
+ calculate_entropy: false
69
+ use_kl_loss: true
70
+ use_prefix_grouper: false
71
+ use_torch_compile: true
72
+ kl_loss_coef: 0.001
73
+ kl_loss_type: low_var_kl
74
+ ppo_epochs: 1
75
+ shuffle: false
76
+ data_loader_seed: 42
77
+ checkpoint:
78
+ _target_: verl.trainer.config.CheckpointConfig
79
+ save_contents:
80
+ - model
81
+ - optimizer
82
+ - extra
83
+ load_contents: ${.save_contents}
84
+ async_save: false
85
+ use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
86
+ profiler:
87
+ _target_: verl.utils.profiler.ProfilerConfig
88
+ tool: ${oc.select:global_profiler.tool,null}
89
+ enable: false
90
+ all_ranks: false
91
+ ranks: []
92
+ save_path: ${oc.select:global_profiler.save_path,null}
93
+ tool_config:
94
+ nsys:
95
+ _target_: verl.utils.profiler.config.NsightToolConfig
96
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
97
+ npu:
98
+ _target_: verl.utils.profiler.config.NPUToolConfig
99
+ contents: []
100
+ level: level0
101
+ analysis: true
102
+ discrete: false
103
+ torch:
104
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
105
+ contents: []
106
+ discrete: false
107
+ torch_memory:
108
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
109
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
110
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
111
+ router_replay:
112
+ _target_: verl.workers.config.RouterReplayConfig
113
+ mode: disabled
114
+ record_file: null
115
+ replay_file: null
116
+ grad_clip: 1.0
117
+ ulysses_sequence_parallel_size: 1
118
+ entropy_from_logits_with_chunking: false
119
+ entropy_checkpointing: false
120
+ use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
121
+ calculate_sum_pi_squared: false
122
+ sum_pi_squared_checkpointing: false
123
+ ref:
124
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
125
+ strategy: ${actor_rollout_ref.actor.strategy}
126
+ use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
127
+ log_prob_micro_batch_size: null
128
+ log_prob_micro_batch_size_per_gpu: 64
129
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
130
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
131
+ profiler:
132
+ _target_: verl.utils.profiler.ProfilerConfig
133
+ tool: ${oc.select:global_profiler.tool,null}
134
+ enable: false
135
+ all_ranks: false
136
+ ranks: []
137
+ save_path: ${oc.select:global_profiler.save_path,null}
138
+ tool_config:
139
+ nsys:
140
+ _target_: verl.utils.profiler.config.NsightToolConfig
141
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
142
+ npu:
143
+ _target_: verl.utils.profiler.config.NPUToolConfig
144
+ contents: []
145
+ level: level0
146
+ analysis: true
147
+ discrete: false
148
+ torch:
149
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
150
+ contents: []
151
+ discrete: false
152
+ torch_memory:
153
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
154
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
155
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
156
+ router_replay:
157
+ _target_: verl.workers.config.RouterReplayConfig
158
+ mode: disabled
159
+ record_file: null
160
+ replay_file: null
161
+ fsdp_config:
162
+ _target_: verl.workers.config.FSDPEngineConfig
163
+ wrap_policy:
164
+ min_num_params: 0
165
+ param_offload: false
166
+ optimizer_offload: false
167
+ offload_policy: false
168
+ reshard_after_forward: true
169
+ fsdp_size: -1
170
+ forward_prefetch: false
171
+ model_dtype: fp32
172
+ use_orig_params: false
173
+ seed: 42
174
+ full_determinism: false
175
+ ulysses_sequence_parallel_size: 1
176
+ entropy_from_logits_with_chunking: false
177
+ use_torch_compile: true
178
+ entropy_checkpointing: false
179
+ forward_only: true
180
+ strategy: fsdp
181
+ dtype: bfloat16
182
+ _target_: verl.workers.config.FSDPActorConfig
183
+ ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
184
+ entropy_from_logits_with_chunking: false
185
+ entropy_checkpointing: false
186
+ rollout:
187
+ _target_: verl.workers.config.RolloutConfig
188
+ name: vllm
189
+ mode: async
190
+ temperature: 1.0
191
+ top_k: -1
192
+ top_p: 1
193
+ prompt_length: ${oc.select:data.max_prompt_length,512}
194
+ response_length: ${oc.select:data.max_response_length,512}
195
+ dtype: bfloat16
196
+ gpu_memory_utilization: 0.7
197
+ ignore_eos: false
198
+ enforce_eager: false
199
+ cudagraph_capture_sizes: null
200
+ free_cache_engine: true
201
+ tensor_model_parallel_size: 1
202
+ data_parallel_size: 1
203
+ expert_parallel_size: 1
204
+ pipeline_model_parallel_size: 1
205
+ max_num_batched_tokens: 8192
206
+ max_model_len: null
207
+ max_num_seqs: 1024
208
+ enable_chunked_prefill: true
209
+ enable_prefix_caching: true
210
+ logprobs_mode: processed_logprobs
211
+ scheduling_policy: fcfs
212
+ load_format: dummy
213
+ log_prob_micro_batch_size: null
214
+ log_prob_micro_batch_size_per_gpu: 64
215
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
216
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
217
+ disable_log_stats: true
218
+ do_sample: true
219
+ 'n': 5
220
+ over_sample_rate: 0
221
+ multi_stage_wake_up: false
222
+ engine_kwargs:
223
+ vllm: {}
224
+ sglang: {}
225
+ trtllm: {}
226
+ val_kwargs:
227
+ _target_: verl.workers.config.SamplingConfig
228
+ top_k: -1
229
+ top_p: 1.0
230
+ temperature: 0
231
+ 'n': 1
232
+ do_sample: false
233
+ multi_turn:
234
+ _target_: verl.workers.config.MultiTurnConfig
235
+ enable: false
236
+ max_assistant_turns: null
237
+ tool_config_path: null
238
+ max_user_turns: null
239
+ max_parallel_calls: 1
240
+ max_tool_response_length: 256
241
+ tool_response_truncate_side: middle
242
+ interaction_config_path: null
243
+ use_inference_chat_template: false
244
+ tokenization_sanity_check_mode: strict
245
+ format: hermes
246
+ num_repeat_rollouts: null
247
+ calculate_log_probs: false
248
+ agent:
249
+ _target_: verl.workers.config.AgentLoopConfig
250
+ num_workers: 8
251
+ default_agent_loop: single_turn_agent
252
+ agent_loop_config_path: null
253
+ custom_async_server:
254
+ _target_: verl.workers.config.CustomAsyncServerConfig
255
+ path: null
256
+ name: null
257
+ checkpoint_engine:
258
+ _target_: verl.workers.config.CheckpointEngineConfig
259
+ backend: naive
260
+ update_weights_bucket_megabytes: 2048
261
+ engine_kwargs: {}
262
+ trace:
263
+ _target_: verl.workers.config.TraceConfig
264
+ backend: null
265
+ token2text: false
266
+ max_samples_per_step_per_worker: null
267
+ skip_rollout: false
268
+ skip_dump_dir: /tmp/rollout_dump
269
+ skip_tokenizer_init: true
270
+ enable_rollout_routing_replay: false
271
+ profiler:
272
+ _target_: verl.utils.profiler.ProfilerConfig
273
+ tool: ${oc.select:global_profiler.tool,null}
274
+ enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
275
+ all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
276
+ ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
277
+ save_path: ${oc.select:global_profiler.save_path,null}
278
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
279
+ prometheus:
280
+ _target_: verl.workers.config.PrometheusConfig
281
+ enable: false
282
+ port: 9090
283
+ file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml
284
+ served_model_name: ${oc.select:actor_rollout_ref.model.path,null}
285
+ quantization: null
286
+ quantization_config_file: null
287
+ mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
288
+ layered_summon: false
289
+ model:
290
+ _target_: verl.workers.config.HFModelConfig
291
+ path: Qwen/Qwen3-4B-Instruct-2507
292
+ hf_config_path: null
293
+ tokenizer_path: null
294
+ use_shm: false
295
+ trust_remote_code: false
296
+ custom_chat_template: null
297
+ external_lib: null
298
+ override_config: {}
299
+ enable_gradient_checkpointing: true
300
+ enable_activation_offload: false
301
+ use_remove_padding: true
302
+ lora_rank: 0
303
+ lora_alpha: 16
304
+ target_modules: all-linear
305
+ exclude_modules: null
306
+ lora_adapter_path: null
307
+ use_liger: false
308
+ use_fused_kernels: false
309
+ fused_kernel_options:
310
+ impl_backend: torch
311
+ tiled_mlp:
312
+ enabled: false
313
+ num_shards: 4
314
+ mtp:
315
+ _target_: verl.workers.config.MtpConfig
316
+ enable: false
317
+ enable_train: false
318
+ enable_rollout: false
319
+ detach_encoder: false
320
+ mtp_loss_scaling_factor: 0.1
321
+ speculative_algorithm: EAGLE
322
+ speculative_num_steps: 3
323
+ speculative_eagle_topk: 1
324
+ speculative_num_draft_tokens: 4
325
+ method: mtp
326
+ num_speculative_tokens: 1
327
+ hybrid_engine: true
328
+ nccl_timeout: 600
329
+ data:
330
+ tokenizer: null
331
+ use_shm: false
332
+ train_files: /home/mshahidul/data/gsm8k/train.parquet
333
+ val_files: /home/mshahidul/data/gsm8k/test.parquet
334
+ train_max_samples: -1
335
+ val_max_samples: -1
336
+ prompt_key: prompt
337
+ reward_fn_key: data_source
338
+ max_prompt_length: 512
339
+ max_response_length: 1024
340
+ train_batch_size: 1024
341
+ val_batch_size: null
342
+ tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path,
343
+ null}
344
+ return_raw_input_ids: false
345
+ return_raw_chat: true
346
+ return_full_prompt: false
347
+ shuffle: true
348
+ seed: null
349
+ dataloader_num_workers: 8
350
+ image_patch_size: 14
351
+ validation_shuffle: false
352
+ filter_overlong_prompts: true
353
+ filter_overlong_prompts_workers: 1
354
+ truncation: error
355
+ image_key: images
356
+ video_key: videos
357
+ trust_remote_code: false
358
+ custom_cls:
359
+ path: null
360
+ name: null
361
+ return_multi_modal_inputs: true
362
+ sampler:
363
+ class_path: null
364
+ class_name: null
365
+ datagen:
366
+ path: null
367
+ name: null
368
+ apply_chat_template_kwargs: {}
369
+ reward_manager:
370
+ _target_: verl.trainer.config.config.RewardManagerConfig
371
+ source: register
372
+ name: ${oc.select:reward_model.reward_manager,naive}
373
+ module:
374
+ _target_: verl.trainer.config.config.ModuleConfig
375
+ path: null
376
+ name: custom_reward_manager
377
+ critic:
378
+ optim:
379
+ _target_: verl.workers.config.FSDPOptimizerConfig
380
+ optimizer: AdamW
381
+ optimizer_impl: torch.optim
382
+ lr: 1.0e-05
383
+ lr_warmup_steps_ratio: 0.0
384
+ total_training_steps: -1
385
+ weight_decay: 0.01
386
+ lr_warmup_steps: -1
387
+ betas:
388
+ - 0.9
389
+ - 0.999
390
+ clip_grad: 1.0
391
+ min_lr_ratio: 0.0
392
+ num_cycles: 0.5
393
+ lr_scheduler_type: constant
394
+ warmup_style: null
395
+ override_optimizer_config: null
396
+ model:
397
+ fsdp_config:
398
+ _target_: verl.workers.config.FSDPEngineConfig
399
+ wrap_policy:
400
+ min_num_params: 0
401
+ param_offload: false
402
+ optimizer_offload: false
403
+ offload_policy: false
404
+ reshard_after_forward: true
405
+ fsdp_size: -1
406
+ forward_prefetch: false
407
+ model_dtype: fp32
408
+ use_orig_params: false
409
+ seed: 42
410
+ full_determinism: false
411
+ ulysses_sequence_parallel_size: 1
412
+ entropy_from_logits_with_chunking: false
413
+ use_torch_compile: true
414
+ entropy_checkpointing: false
415
+ forward_only: false
416
+ strategy: fsdp
417
+ dtype: bfloat16
418
+ path: ~/models/deepseek-llm-7b-chat
419
+ tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
420
+ override_config: {}
421
+ external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
422
+ trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
423
+ _target_: verl.workers.config.FSDPCriticModelCfg
424
+ use_shm: false
425
+ enable_gradient_checkpointing: true
426
+ enable_activation_offload: false
427
+ use_remove_padding: false
428
+ lora_rank: 0
429
+ lora_alpha: 16
430
+ target_modules: all-linear
431
+ tiled_mlp:
432
+ enabled: false
433
+ num_shards: 4
434
+ _target_: verl.workers.config.FSDPCriticConfig
435
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
436
+ strategy: fsdp
437
+ enable: null
438
+ ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
439
+ ppo_micro_batch_size: null
440
+ ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
441
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
442
+ ppo_max_token_len_per_gpu: 32768
443
+ forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
444
+ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
445
+ shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
446
+ data_loader_seed: 42
447
+ cliprange_value: 0.5
448
+ loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
449
+ checkpoint:
450
+ _target_: verl.trainer.config.CheckpointConfig
451
+ save_contents:
452
+ - model
453
+ - optimizer
454
+ - extra
455
+ load_contents: ${.save_contents}
456
+ async_save: false
457
+ profiler:
458
+ _target_: verl.utils.profiler.ProfilerConfig
459
+ tool: ${oc.select:global_profiler.tool,null}
460
+ enable: false
461
+ all_ranks: false
462
+ ranks: []
463
+ save_path: ${oc.select:global_profiler.save_path,null}
464
+ tool_config:
465
+ nsys:
466
+ _target_: verl.utils.profiler.config.NsightToolConfig
467
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
468
+ npu:
469
+ _target_: verl.utils.profiler.config.NPUToolConfig
470
+ contents: []
471
+ level: level0
472
+ analysis: true
473
+ discrete: false
474
+ torch:
475
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
476
+ contents: []
477
+ discrete: false
478
+ torch_memory:
479
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
480
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
481
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
482
+ forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
483
+ forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
484
+ ulysses_sequence_parallel_size: 1
485
+ grad_clip: 1.0
486
+ reward_model:
487
+ enable: false
488
+ enable_resource_pool: false
489
+ n_gpus_per_node: 8
490
+ nnodes: 0
491
+ strategy: fsdp
492
+ model:
493
+ input_tokenizer: ${actor_rollout_ref.model.path}
494
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
495
+ external_lib: ${actor_rollout_ref.model.external_lib}
496
+ trust_remote_code: false
497
+ override_config: {}
498
+ use_shm: false
499
+ use_remove_padding: false
500
+ use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
501
+ fsdp_config:
502
+ _target_: verl.workers.config.FSDPEngineConfig
503
+ wrap_policy:
504
+ min_num_params: 0
505
+ param_offload: false
506
+ reshard_after_forward: true
507
+ fsdp_size: -1
508
+ forward_prefetch: false
509
+ micro_batch_size: null
510
+ micro_batch_size_per_gpu: null
511
+ max_length: null
512
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
513
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
514
+ reward_manager: naive
515
+ reward_loop_source: register
516
+ reward_loop_module_path: null
517
+ reward_loop_class_name: null
518
+ launch_reward_fn_async: false
519
+ sandbox_fusion:
520
+ url: null
521
+ max_concurrent: 64
522
+ memory_limit_mb: 1024
523
+ profiler:
524
+ _target_: verl.utils.profiler.ProfilerConfig
525
+ tool: ${oc.select:global_profiler.tool,null}
526
+ enable: false
527
+ all_ranks: false
528
+ ranks: []
529
+ save_path: ${oc.select:global_profiler.save_path,null}
530
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
531
+ ulysses_sequence_parallel_size: 1
532
+ use_reward_loop: true
533
+ num_workers: 1
534
+ rollout:
535
+ _target_: verl.workers.config.RolloutConfig
536
+ name: ???
537
+ dtype: bfloat16
538
+ gpu_memory_utilization: 0.5
539
+ enforce_eager: true
540
+ cudagraph_capture_sizes: null
541
+ free_cache_engine: true
542
+ data_parallel_size: 1
543
+ expert_parallel_size: 1
544
+ tensor_model_parallel_size: 2
545
+ max_num_batched_tokens: 8192
546
+ max_model_len: null
547
+ max_num_seqs: 1024
548
+ load_format: auto
549
+ engine_kwargs: {}
550
+ limit_images: null
551
+ enable_chunked_prefill: true
552
+ enable_prefix_caching: true
553
+ disable_log_stats: true
554
+ skip_tokenizer_init: false
555
+ prompt_length: 2048
556
+ response_length: 2048
557
+ algorithm:
558
+ rollout_correction:
559
+ rollout_is: null
560
+ rollout_is_threshold: 2.0
561
+ rollout_rs: null
562
+ rollout_rs_threshold: null
563
+ bypass_mode: false
564
+ loss_type: ppo_clip
565
+ rollout_is_batch_normalize: false
566
+ _target_: verl.trainer.config.AlgoConfig
567
+ gamma: 1.0
568
+ lam: 1.0
569
+ adv_estimator: grpo
570
+ norm_adv_by_std_in_grpo: true
571
+ use_kl_in_reward: false
572
+ kl_penalty: kl
573
+ kl_ctrl:
574
+ _target_: verl.trainer.config.KLControlConfig
575
+ type: fixed
576
+ kl_coef: 0.001
577
+ horizon: 10000
578
+ target_kl: 0.1
579
+ use_pf_ppo: false
580
+ pf_ppo:
581
+ reweight_method: pow
582
+ weight_pow: 2.0
583
+ custom_reward_function:
584
+ path: null
585
+ name: compute_score
586
+ trainer:
587
+ balance_batch: true
588
+ total_epochs: 15
589
+ total_training_steps: null
590
+ project_name: verl
591
+ experiment_name: qwen3-4b-instruct-optimized
592
+ logger:
593
+ - console
594
+ - wandb
595
+ log_val_generations: 0
596
+ rollout_data_dir: null
597
+ validation_data_dir: null
598
+ nnodes: 1
599
+ n_gpus_per_node: 2
600
+ save_freq: 20
601
+ esi_redundant_time: 0
602
+ resume_mode: auto
603
+ resume_from_path: null
604
+ val_before_train: true
605
+ val_only: false
606
+ test_freq: 5
607
+ critic_warmup: 0
608
+ default_hdfs_dir: null
609
+ del_local_ckpt_after_load: false
610
+ default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
611
+ max_actor_ckpt_to_keep: null
612
+ max_critic_ckpt_to_keep: null
613
+ ray_wait_register_center_timeout: 300
614
+ device: cuda
615
+ use_legacy_worker_impl: auto
616
+ global_profiler:
617
+ _target_: verl.utils.profiler.ProfilerConfig
618
+ tool: null
619
+ steps: null
620
+ profile_continuous_steps: false
621
+ save_path: outputs/profile
622
+ global_tool_config:
623
+ nsys:
624
+ _target_: verl.utils.profiler.config.NsightToolConfig
625
+ discrete: false
626
+ controller_nsight_options:
627
+ trace: cuda,nvtx,cublas,ucx
628
+ cuda-memory-usage: 'true'
629
+ cuda-graph-trace: graph
630
+ worker_nsight_options:
631
+ trace: cuda,nvtx,cublas,ucx
632
+ cuda-memory-usage: 'true'
633
+ cuda-graph-trace: graph
634
+ capture-range: cudaProfilerApi
635
+ capture-range-end: null
636
+ kill: none
637
+ torch_memory:
638
+ trace_alloc_max_entries: 100000
639
+ stack_depth: 32
640
+ context: all
641
+ stacks: all
642
+ kw_args: {}
643
+ transfer_queue:
644
+ enable: false
645
+ ray_kwargs:
646
+ ray_init:
647
+ num_cpus: null
648
+ timeline_json_file: null
code/RL_model/verl/verl_train/outputs/2026-02-01/11-27-06/.hydra/hydra.yaml ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task:
115
+ - algorithm.adv_estimator=grpo
116
+ - data.train_files=/home/mshahidul/data/gsm8k/train.parquet
117
+ - data.val_files=/home/mshahidul/data/gsm8k/test.parquet
118
+ - data.train_batch_size=1024
119
+ - data.max_prompt_length=512
120
+ - data.max_response_length=1024
121
+ - data.filter_overlong_prompts=True
122
+ - data.truncation=error
123
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
124
+ - actor_rollout_ref.actor.optim.lr=1e-6
125
+ - actor_rollout_ref.model.use_remove_padding=True
126
+ - actor_rollout_ref.actor.ppo_mini_batch_size=512
127
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=64
128
+ - actor_rollout_ref.actor.use_kl_loss=True
129
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
130
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
131
+ - actor_rollout_ref.actor.entropy_coeff=0
132
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
133
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
134
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
135
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64
136
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
137
+ - actor_rollout_ref.rollout.name=vllm
138
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.7
139
+ - actor_rollout_ref.rollout.n=5
140
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=64
141
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
142
+ - algorithm.use_kl_in_reward=False
143
+ - trainer.critic_warmup=0
144
+ - trainer.logger=["console","wandb"]
145
+ - trainer.project_name=verl
146
+ - trainer.experiment_name=qwen3-4b-instruct-optimized
147
+ - trainer.n_gpus_per_node=2
148
+ - trainer.nnodes=1
149
+ - trainer.save_freq=20
150
+ - trainer.test_freq=5
151
+ - trainer.total_epochs=15
152
+ job:
153
+ name: main_ppo
154
+ chdir: null
155
+ override_dirname: actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=64,actor_rollout_ref.actor.ppo_mini_batch_size=512,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=64,actor_rollout_ref.rollout.gpu_memory_utilization=0.7,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64,actor_rollout_ref.rollout.n=5,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,data.filter_overlong_prompts=True,data.max_prompt_length=512,data.max_response_length=1024,data.train_batch_size=1024,data.train_files=/home/mshahidul/data/gsm8k/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/data/gsm8k/test.parquet,trainer.critic_warmup=0,trainer.experiment_name=qwen3-4b-instruct-optimized,trainer.logger=["console","wandb"],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=verl,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15
156
+ id: ???
157
+ num: ???
158
+ config_name: ppo_trainer
159
+ env_set: {}
160
+ env_copy: []
161
+ config:
162
+ override_dirname:
163
+ kv_sep: '='
164
+ item_sep: ','
165
+ exclude_keys: []
166
+ runtime:
167
+ version: 1.3.2
168
+ version_base: '1.3'
169
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train
170
+ config_sources:
171
+ - path: hydra.conf
172
+ schema: pkg
173
+ provider: hydra
174
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config
175
+ schema: file
176
+ provider: main
177
+ - path: ''
178
+ schema: structured
179
+ provider: schema
180
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-01/11-27-06
181
+ choices:
182
+ algorithm@algorithm.rollout_correction: rollout_correction
183
+ reward_model: dp_reward_loop
184
+ critic: dp_critic
185
+ critic/../engine@critic.model.fsdp_config: fsdp
186
+ critic/../optim@critic.optim: fsdp
187
+ model@actor_rollout_ref.model: hf_model
188
+ rollout@actor_rollout_ref.rollout: rollout
189
+ ref@actor_rollout_ref.ref: dp_ref
190
+ ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp
191
+ data: legacy_data
192
+ actor@actor_rollout_ref.actor: dp_actor
193
+ actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp
194
+ actor/../optim@actor_rollout_ref.actor.optim: fsdp
195
+ hydra/env: default
196
+ hydra/callbacks: null
197
+ hydra/job_logging: default
198
+ hydra/hydra_logging: default
199
+ hydra/hydra_help: default
200
+ hydra/help: default
201
+ hydra/sweeper: basic
202
+ hydra/launcher: basic
203
+ hydra/output: default
204
+ verbose: false
code/RL_model/verl/verl_train/outputs/2026-02-01/11-27-06/.hydra/overrides.yaml ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - algorithm.adv_estimator=grpo
2
+ - data.train_files=/home/mshahidul/data/gsm8k/train.parquet
3
+ - data.val_files=/home/mshahidul/data/gsm8k/test.parquet
4
+ - data.train_batch_size=1024
5
+ - data.max_prompt_length=512
6
+ - data.max_response_length=1024
7
+ - data.filter_overlong_prompts=True
8
+ - data.truncation=error
9
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
10
+ - actor_rollout_ref.actor.optim.lr=1e-6
11
+ - actor_rollout_ref.model.use_remove_padding=True
12
+ - actor_rollout_ref.actor.ppo_mini_batch_size=512
13
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=64
14
+ - actor_rollout_ref.actor.use_kl_loss=True
15
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
16
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
17
+ - actor_rollout_ref.actor.entropy_coeff=0
18
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
19
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
20
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
21
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64
22
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
23
+ - actor_rollout_ref.rollout.name=vllm
24
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.7
25
+ - actor_rollout_ref.rollout.n=5
26
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=64
27
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
28
+ - algorithm.use_kl_in_reward=False
29
+ - trainer.critic_warmup=0
30
+ - trainer.logger=["console","wandb"]
31
+ - trainer.project_name=verl
32
+ - trainer.experiment_name=qwen3-4b-instruct-optimized
33
+ - trainer.n_gpus_per_node=2
34
+ - trainer.nnodes=1
35
+ - trainer.save_freq=20
36
+ - trainer.test_freq=5
37
+ - trainer.total_epochs=15
code/RL_model/verl/verl_train/outputs/2026-02-01/11-29-26/.hydra/config.yaml ADDED
@@ -0,0 +1,648 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ actor_rollout_ref:
2
+ actor:
3
+ optim:
4
+ _target_: verl.workers.config.FSDPOptimizerConfig
5
+ optimizer: AdamW
6
+ optimizer_impl: torch.optim
7
+ lr: 1.0e-06
8
+ lr_warmup_steps_ratio: 0.0
9
+ total_training_steps: -1
10
+ weight_decay: 0.01
11
+ lr_warmup_steps: -1
12
+ betas:
13
+ - 0.9
14
+ - 0.999
15
+ clip_grad: 1.0
16
+ min_lr_ratio: 0.0
17
+ num_cycles: 0.5
18
+ lr_scheduler_type: constant
19
+ warmup_style: null
20
+ override_optimizer_config: null
21
+ fsdp_config:
22
+ _target_: verl.workers.config.FSDPEngineConfig
23
+ wrap_policy:
24
+ min_num_params: 0
25
+ param_offload: false
26
+ optimizer_offload: false
27
+ offload_policy: false
28
+ reshard_after_forward: true
29
+ fsdp_size: -1
30
+ forward_prefetch: false
31
+ model_dtype: fp32
32
+ use_orig_params: false
33
+ seed: 42
34
+ full_determinism: false
35
+ ulysses_sequence_parallel_size: 1
36
+ entropy_from_logits_with_chunking: false
37
+ use_torch_compile: true
38
+ entropy_checkpointing: false
39
+ forward_only: false
40
+ strategy: fsdp
41
+ dtype: bfloat16
42
+ _target_: verl.workers.config.FSDPActorConfig
43
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
44
+ strategy: fsdp
45
+ ppo_mini_batch_size: 512
46
+ ppo_micro_batch_size: null
47
+ ppo_micro_batch_size_per_gpu: 64
48
+ use_dynamic_bsz: false
49
+ ppo_max_token_len_per_gpu: 16384
50
+ clip_ratio: 0.2
51
+ clip_ratio_low: 0.2
52
+ clip_ratio_high: 0.2
53
+ tau_pos: 1.0
54
+ tau_neg: 1.05
55
+ freeze_vision_tower: false
56
+ policy_loss:
57
+ _target_: verl.workers.config.PolicyLossConfig
58
+ loss_mode: vanilla
59
+ clip_cov_ratio: 0.0002
60
+ clip_cov_lb: 1.0
61
+ clip_cov_ub: 5.0
62
+ kl_cov_ratio: 0.0002
63
+ ppo_kl_coef: 0.1
64
+ clip_ratio_c: 3.0
65
+ loss_agg_mode: token-mean
66
+ loss_scale_factor: null
67
+ entropy_coeff: 0
68
+ calculate_entropy: false
69
+ use_kl_loss: true
70
+ use_prefix_grouper: false
71
+ use_torch_compile: true
72
+ kl_loss_coef: 0.001
73
+ kl_loss_type: low_var_kl
74
+ ppo_epochs: 1
75
+ shuffle: false
76
+ data_loader_seed: 42
77
+ checkpoint:
78
+ _target_: verl.trainer.config.CheckpointConfig
79
+ save_contents:
80
+ - model
81
+ - optimizer
82
+ - extra
83
+ load_contents: ${.save_contents}
84
+ async_save: false
85
+ use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
86
+ profiler:
87
+ _target_: verl.utils.profiler.ProfilerConfig
88
+ tool: ${oc.select:global_profiler.tool,null}
89
+ enable: false
90
+ all_ranks: false
91
+ ranks: []
92
+ save_path: ${oc.select:global_profiler.save_path,null}
93
+ tool_config:
94
+ nsys:
95
+ _target_: verl.utils.profiler.config.NsightToolConfig
96
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
97
+ npu:
98
+ _target_: verl.utils.profiler.config.NPUToolConfig
99
+ contents: []
100
+ level: level0
101
+ analysis: true
102
+ discrete: false
103
+ torch:
104
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
105
+ contents: []
106
+ discrete: false
107
+ torch_memory:
108
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
109
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
110
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
111
+ router_replay:
112
+ _target_: verl.workers.config.RouterReplayConfig
113
+ mode: disabled
114
+ record_file: null
115
+ replay_file: null
116
+ grad_clip: 1.0
117
+ ulysses_sequence_parallel_size: 1
118
+ entropy_from_logits_with_chunking: false
119
+ entropy_checkpointing: false
120
+ use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
121
+ calculate_sum_pi_squared: false
122
+ sum_pi_squared_checkpointing: false
123
+ ref:
124
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
125
+ strategy: ${actor_rollout_ref.actor.strategy}
126
+ use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
127
+ log_prob_micro_batch_size: null
128
+ log_prob_micro_batch_size_per_gpu: 64
129
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
130
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
131
+ profiler:
132
+ _target_: verl.utils.profiler.ProfilerConfig
133
+ tool: ${oc.select:global_profiler.tool,null}
134
+ enable: false
135
+ all_ranks: false
136
+ ranks: []
137
+ save_path: ${oc.select:global_profiler.save_path,null}
138
+ tool_config:
139
+ nsys:
140
+ _target_: verl.utils.profiler.config.NsightToolConfig
141
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
142
+ npu:
143
+ _target_: verl.utils.profiler.config.NPUToolConfig
144
+ contents: []
145
+ level: level0
146
+ analysis: true
147
+ discrete: false
148
+ torch:
149
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
150
+ contents: []
151
+ discrete: false
152
+ torch_memory:
153
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
154
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
155
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
156
+ router_replay:
157
+ _target_: verl.workers.config.RouterReplayConfig
158
+ mode: disabled
159
+ record_file: null
160
+ replay_file: null
161
+ fsdp_config:
162
+ _target_: verl.workers.config.FSDPEngineConfig
163
+ wrap_policy:
164
+ min_num_params: 0
165
+ param_offload: false
166
+ optimizer_offload: false
167
+ offload_policy: false
168
+ reshard_after_forward: true
169
+ fsdp_size: -1
170
+ forward_prefetch: false
171
+ model_dtype: fp32
172
+ use_orig_params: false
173
+ seed: 42
174
+ full_determinism: false
175
+ ulysses_sequence_parallel_size: 1
176
+ entropy_from_logits_with_chunking: false
177
+ use_torch_compile: true
178
+ entropy_checkpointing: false
179
+ forward_only: true
180
+ strategy: fsdp
181
+ dtype: bfloat16
182
+ _target_: verl.workers.config.FSDPActorConfig
183
+ ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
184
+ entropy_from_logits_with_chunking: false
185
+ entropy_checkpointing: false
186
+ rollout:
187
+ _target_: verl.workers.config.RolloutConfig
188
+ name: vllm
189
+ mode: async
190
+ temperature: 1.0
191
+ top_k: -1
192
+ top_p: 1
193
+ prompt_length: ${oc.select:data.max_prompt_length,512}
194
+ response_length: ${oc.select:data.max_response_length,512}
195
+ dtype: bfloat16
196
+ gpu_memory_utilization: 0.7
197
+ ignore_eos: false
198
+ enforce_eager: false
199
+ cudagraph_capture_sizes: null
200
+ free_cache_engine: true
201
+ tensor_model_parallel_size: 1
202
+ data_parallel_size: 1
203
+ expert_parallel_size: 1
204
+ pipeline_model_parallel_size: 1
205
+ max_num_batched_tokens: 8192
206
+ max_model_len: null
207
+ max_num_seqs: 1024
208
+ enable_chunked_prefill: true
209
+ enable_prefix_caching: true
210
+ logprobs_mode: processed_logprobs
211
+ scheduling_policy: fcfs
212
+ load_format: dummy
213
+ log_prob_micro_batch_size: null
214
+ log_prob_micro_batch_size_per_gpu: 64
215
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
216
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
217
+ disable_log_stats: true
218
+ do_sample: true
219
+ 'n': 5
220
+ over_sample_rate: 0
221
+ multi_stage_wake_up: false
222
+ engine_kwargs:
223
+ vllm: {}
224
+ sglang: {}
225
+ trtllm: {}
226
+ val_kwargs:
227
+ _target_: verl.workers.config.SamplingConfig
228
+ top_k: -1
229
+ top_p: 1.0
230
+ temperature: 0
231
+ 'n': 1
232
+ do_sample: false
233
+ multi_turn:
234
+ _target_: verl.workers.config.MultiTurnConfig
235
+ enable: false
236
+ max_assistant_turns: null
237
+ tool_config_path: null
238
+ max_user_turns: null
239
+ max_parallel_calls: 1
240
+ max_tool_response_length: 256
241
+ tool_response_truncate_side: middle
242
+ interaction_config_path: null
243
+ use_inference_chat_template: false
244
+ tokenization_sanity_check_mode: strict
245
+ format: hermes
246
+ num_repeat_rollouts: null
247
+ calculate_log_probs: false
248
+ agent:
249
+ _target_: verl.workers.config.AgentLoopConfig
250
+ num_workers: 8
251
+ default_agent_loop: single_turn_agent
252
+ agent_loop_config_path: null
253
+ custom_async_server:
254
+ _target_: verl.workers.config.CustomAsyncServerConfig
255
+ path: null
256
+ name: null
257
+ checkpoint_engine:
258
+ _target_: verl.workers.config.CheckpointEngineConfig
259
+ backend: naive
260
+ update_weights_bucket_megabytes: 2048
261
+ engine_kwargs: {}
262
+ trace:
263
+ _target_: verl.workers.config.TraceConfig
264
+ backend: null
265
+ token2text: false
266
+ max_samples_per_step_per_worker: null
267
+ skip_rollout: false
268
+ skip_dump_dir: /tmp/rollout_dump
269
+ skip_tokenizer_init: true
270
+ enable_rollout_routing_replay: false
271
+ profiler:
272
+ _target_: verl.utils.profiler.ProfilerConfig
273
+ tool: ${oc.select:global_profiler.tool,null}
274
+ enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
275
+ all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
276
+ ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
277
+ save_path: ${oc.select:global_profiler.save_path,null}
278
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
279
+ prometheus:
280
+ _target_: verl.workers.config.PrometheusConfig
281
+ enable: false
282
+ port: 9090
283
+ file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml
284
+ served_model_name: ${oc.select:actor_rollout_ref.model.path,null}
285
+ quantization: null
286
+ quantization_config_file: null
287
+ mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
288
+ layered_summon: false
289
+ model:
290
+ _target_: verl.workers.config.HFModelConfig
291
+ path: Qwen/Qwen3-4B-Instruct-2507
292
+ hf_config_path: null
293
+ tokenizer_path: null
294
+ use_shm: false
295
+ trust_remote_code: false
296
+ custom_chat_template: null
297
+ external_lib: null
298
+ override_config: {}
299
+ enable_gradient_checkpointing: true
300
+ enable_activation_offload: false
301
+ use_remove_padding: true
302
+ lora_rank: 0
303
+ lora_alpha: 16
304
+ target_modules: all-linear
305
+ exclude_modules: null
306
+ lora_adapter_path: null
307
+ use_liger: false
308
+ use_fused_kernels: false
309
+ fused_kernel_options:
310
+ impl_backend: torch
311
+ tiled_mlp:
312
+ enabled: false
313
+ num_shards: 4
314
+ mtp:
315
+ _target_: verl.workers.config.MtpConfig
316
+ enable: false
317
+ enable_train: false
318
+ enable_rollout: false
319
+ detach_encoder: false
320
+ mtp_loss_scaling_factor: 0.1
321
+ speculative_algorithm: EAGLE
322
+ speculative_num_steps: 3
323
+ speculative_eagle_topk: 1
324
+ speculative_num_draft_tokens: 4
325
+ method: mtp
326
+ num_speculative_tokens: 1
327
+ hybrid_engine: true
328
+ nccl_timeout: 600
329
+ data:
330
+ tokenizer: null
331
+ use_shm: false
332
+ train_files: /home/mshahidul/data/gsm8k/train.parquet
333
+ val_files: /home/mshahidul/data/gsm8k/test.parquet
334
+ train_max_samples: -1
335
+ val_max_samples: -1
336
+ prompt_key: prompt
337
+ reward_fn_key: data_source
338
+ max_prompt_length: 512
339
+ max_response_length: 1024
340
+ train_batch_size: 1024
341
+ val_batch_size: null
342
+ tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path,
343
+ null}
344
+ return_raw_input_ids: false
345
+ return_raw_chat: true
346
+ return_full_prompt: false
347
+ shuffle: true
348
+ seed: null
349
+ dataloader_num_workers: 8
350
+ image_patch_size: 14
351
+ validation_shuffle: false
352
+ filter_overlong_prompts: true
353
+ filter_overlong_prompts_workers: 1
354
+ truncation: error
355
+ image_key: images
356
+ video_key: videos
357
+ trust_remote_code: false
358
+ custom_cls:
359
+ path: null
360
+ name: null
361
+ return_multi_modal_inputs: true
362
+ sampler:
363
+ class_path: null
364
+ class_name: null
365
+ datagen:
366
+ path: null
367
+ name: null
368
+ apply_chat_template_kwargs: {}
369
+ reward_manager:
370
+ _target_: verl.trainer.config.config.RewardManagerConfig
371
+ source: register
372
+ name: ${oc.select:reward_model.reward_manager,naive}
373
+ module:
374
+ _target_: verl.trainer.config.config.ModuleConfig
375
+ path: null
376
+ name: custom_reward_manager
377
+ critic:
378
+ optim:
379
+ _target_: verl.workers.config.FSDPOptimizerConfig
380
+ optimizer: AdamW
381
+ optimizer_impl: torch.optim
382
+ lr: 1.0e-05
383
+ lr_warmup_steps_ratio: 0.0
384
+ total_training_steps: -1
385
+ weight_decay: 0.01
386
+ lr_warmup_steps: -1
387
+ betas:
388
+ - 0.9
389
+ - 0.999
390
+ clip_grad: 1.0
391
+ min_lr_ratio: 0.0
392
+ num_cycles: 0.5
393
+ lr_scheduler_type: constant
394
+ warmup_style: null
395
+ override_optimizer_config: null
396
+ model:
397
+ fsdp_config:
398
+ _target_: verl.workers.config.FSDPEngineConfig
399
+ wrap_policy:
400
+ min_num_params: 0
401
+ param_offload: false
402
+ optimizer_offload: false
403
+ offload_policy: false
404
+ reshard_after_forward: true
405
+ fsdp_size: -1
406
+ forward_prefetch: false
407
+ model_dtype: fp32
408
+ use_orig_params: false
409
+ seed: 42
410
+ full_determinism: false
411
+ ulysses_sequence_parallel_size: 1
412
+ entropy_from_logits_with_chunking: false
413
+ use_torch_compile: true
414
+ entropy_checkpointing: false
415
+ forward_only: false
416
+ strategy: fsdp
417
+ dtype: bfloat16
418
+ path: ~/models/deepseek-llm-7b-chat
419
+ tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
420
+ override_config: {}
421
+ external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
422
+ trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
423
+ _target_: verl.workers.config.FSDPCriticModelCfg
424
+ use_shm: false
425
+ enable_gradient_checkpointing: true
426
+ enable_activation_offload: false
427
+ use_remove_padding: false
428
+ lora_rank: 0
429
+ lora_alpha: 16
430
+ target_modules: all-linear
431
+ tiled_mlp:
432
+ enabled: false
433
+ num_shards: 4
434
+ _target_: verl.workers.config.FSDPCriticConfig
435
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
436
+ strategy: fsdp
437
+ enable: null
438
+ ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
439
+ ppo_micro_batch_size: null
440
+ ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
441
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
442
+ ppo_max_token_len_per_gpu: 32768
443
+ forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
444
+ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
445
+ shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
446
+ data_loader_seed: 42
447
+ cliprange_value: 0.5
448
+ loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
449
+ checkpoint:
450
+ _target_: verl.trainer.config.CheckpointConfig
451
+ save_contents:
452
+ - model
453
+ - optimizer
454
+ - extra
455
+ load_contents: ${.save_contents}
456
+ async_save: false
457
+ profiler:
458
+ _target_: verl.utils.profiler.ProfilerConfig
459
+ tool: ${oc.select:global_profiler.tool,null}
460
+ enable: false
461
+ all_ranks: false
462
+ ranks: []
463
+ save_path: ${oc.select:global_profiler.save_path,null}
464
+ tool_config:
465
+ nsys:
466
+ _target_: verl.utils.profiler.config.NsightToolConfig
467
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
468
+ npu:
469
+ _target_: verl.utils.profiler.config.NPUToolConfig
470
+ contents: []
471
+ level: level0
472
+ analysis: true
473
+ discrete: false
474
+ torch:
475
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
476
+ contents: []
477
+ discrete: false
478
+ torch_memory:
479
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
480
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
481
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
482
+ forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
483
+ forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
484
+ ulysses_sequence_parallel_size: 1
485
+ grad_clip: 1.0
486
+ reward_model:
487
+ enable: false
488
+ enable_resource_pool: false
489
+ n_gpus_per_node: 8
490
+ nnodes: 0
491
+ strategy: fsdp
492
+ model:
493
+ input_tokenizer: ${actor_rollout_ref.model.path}
494
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
495
+ external_lib: ${actor_rollout_ref.model.external_lib}
496
+ trust_remote_code: false
497
+ override_config: {}
498
+ use_shm: false
499
+ use_remove_padding: false
500
+ use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
501
+ fsdp_config:
502
+ _target_: verl.workers.config.FSDPEngineConfig
503
+ wrap_policy:
504
+ min_num_params: 0
505
+ param_offload: false
506
+ reshard_after_forward: true
507
+ fsdp_size: -1
508
+ forward_prefetch: false
509
+ micro_batch_size: null
510
+ micro_batch_size_per_gpu: null
511
+ max_length: null
512
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
513
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
514
+ reward_manager: naive
515
+ reward_loop_source: register
516
+ reward_loop_module_path: null
517
+ reward_loop_class_name: null
518
+ launch_reward_fn_async: false
519
+ sandbox_fusion:
520
+ url: null
521
+ max_concurrent: 64
522
+ memory_limit_mb: 1024
523
+ profiler:
524
+ _target_: verl.utils.profiler.ProfilerConfig
525
+ tool: ${oc.select:global_profiler.tool,null}
526
+ enable: false
527
+ all_ranks: false
528
+ ranks: []
529
+ save_path: ${oc.select:global_profiler.save_path,null}
530
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
531
+ ulysses_sequence_parallel_size: 1
532
+ use_reward_loop: true
533
+ num_workers: 1
534
+ rollout:
535
+ _target_: verl.workers.config.RolloutConfig
536
+ name: ???
537
+ dtype: bfloat16
538
+ gpu_memory_utilization: 0.5
539
+ enforce_eager: true
540
+ cudagraph_capture_sizes: null
541
+ free_cache_engine: true
542
+ data_parallel_size: 1
543
+ expert_parallel_size: 1
544
+ tensor_model_parallel_size: 2
545
+ max_num_batched_tokens: 8192
546
+ max_model_len: null
547
+ max_num_seqs: 1024
548
+ load_format: auto
549
+ engine_kwargs: {}
550
+ limit_images: null
551
+ enable_chunked_prefill: true
552
+ enable_prefix_caching: true
553
+ disable_log_stats: true
554
+ skip_tokenizer_init: false
555
+ prompt_length: 2048
556
+ response_length: 2048
557
+ algorithm:
558
+ rollout_correction:
559
+ rollout_is: null
560
+ rollout_is_threshold: 2.0
561
+ rollout_rs: null
562
+ rollout_rs_threshold: null
563
+ bypass_mode: false
564
+ loss_type: ppo_clip
565
+ rollout_is_batch_normalize: false
566
+ _target_: verl.trainer.config.AlgoConfig
567
+ gamma: 1.0
568
+ lam: 1.0
569
+ adv_estimator: grpo
570
+ norm_adv_by_std_in_grpo: true
571
+ use_kl_in_reward: false
572
+ kl_penalty: kl
573
+ kl_ctrl:
574
+ _target_: verl.trainer.config.KLControlConfig
575
+ type: fixed
576
+ kl_coef: 0.001
577
+ horizon: 10000
578
+ target_kl: 0.1
579
+ use_pf_ppo: false
580
+ pf_ppo:
581
+ reweight_method: pow
582
+ weight_pow: 2.0
583
+ custom_reward_function:
584
+ path: null
585
+ name: compute_score
586
+ trainer:
587
+ balance_batch: true
588
+ total_epochs: 15
589
+ total_training_steps: null
590
+ project_name: verl
591
+ experiment_name: qwen3-4b-instruct-optimized
592
+ logger:
593
+ - console
594
+ - wandb
595
+ log_val_generations: 0
596
+ rollout_data_dir: null
597
+ validation_data_dir: null
598
+ nnodes: 1
599
+ n_gpus_per_node: 2
600
+ save_freq: 20
601
+ esi_redundant_time: 0
602
+ resume_mode: auto
603
+ resume_from_path: null
604
+ val_before_train: true
605
+ val_only: false
606
+ test_freq: 5
607
+ critic_warmup: 0
608
+ default_hdfs_dir: null
609
+ del_local_ckpt_after_load: false
610
+ default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
611
+ max_actor_ckpt_to_keep: null
612
+ max_critic_ckpt_to_keep: null
613
+ ray_wait_register_center_timeout: 300
614
+ device: cuda
615
+ use_legacy_worker_impl: auto
616
+ global_profiler:
617
+ _target_: verl.utils.profiler.ProfilerConfig
618
+ tool: null
619
+ steps: null
620
+ profile_continuous_steps: false
621
+ save_path: outputs/profile
622
+ global_tool_config:
623
+ nsys:
624
+ _target_: verl.utils.profiler.config.NsightToolConfig
625
+ discrete: false
626
+ controller_nsight_options:
627
+ trace: cuda,nvtx,cublas,ucx
628
+ cuda-memory-usage: 'true'
629
+ cuda-graph-trace: graph
630
+ worker_nsight_options:
631
+ trace: cuda,nvtx,cublas,ucx
632
+ cuda-memory-usage: 'true'
633
+ cuda-graph-trace: graph
634
+ capture-range: cudaProfilerApi
635
+ capture-range-end: null
636
+ kill: none
637
+ torch_memory:
638
+ trace_alloc_max_entries: 100000
639
+ stack_depth: 32
640
+ context: all
641
+ stacks: all
642
+ kw_args: {}
643
+ transfer_queue:
644
+ enable: false
645
+ ray_kwargs:
646
+ ray_init:
647
+ num_cpus: null
648
+ timeline_json_file: null
code/RL_model/verl/verl_train/outputs/2026-02-01/11-29-26/.hydra/hydra.yaml ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task:
115
+ - algorithm.adv_estimator=grpo
116
+ - data.train_files=/home/mshahidul/data/gsm8k/train.parquet
117
+ - data.val_files=/home/mshahidul/data/gsm8k/test.parquet
118
+ - data.train_batch_size=1024
119
+ - data.max_prompt_length=512
120
+ - data.max_response_length=1024
121
+ - data.filter_overlong_prompts=True
122
+ - data.truncation=error
123
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
124
+ - actor_rollout_ref.actor.optim.lr=1e-6
125
+ - actor_rollout_ref.model.use_remove_padding=True
126
+ - actor_rollout_ref.actor.ppo_mini_batch_size=512
127
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=64
128
+ - actor_rollout_ref.actor.use_kl_loss=True
129
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
130
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
131
+ - actor_rollout_ref.actor.entropy_coeff=0
132
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
133
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
134
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
135
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64
136
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
137
+ - actor_rollout_ref.rollout.name=vllm
138
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.7
139
+ - actor_rollout_ref.rollout.n=5
140
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=64
141
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
142
+ - algorithm.use_kl_in_reward=False
143
+ - trainer.critic_warmup=0
144
+ - trainer.logger=["console","wandb"]
145
+ - trainer.project_name=verl
146
+ - trainer.experiment_name=qwen3-4b-instruct-optimized
147
+ - trainer.n_gpus_per_node=2
148
+ - trainer.nnodes=1
149
+ - trainer.save_freq=20
150
+ - trainer.test_freq=5
151
+ - trainer.total_epochs=15
152
+ job:
153
+ name: main_ppo
154
+ chdir: null
155
+ override_dirname: actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=64,actor_rollout_ref.actor.ppo_mini_batch_size=512,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=64,actor_rollout_ref.rollout.gpu_memory_utilization=0.7,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64,actor_rollout_ref.rollout.n=5,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,data.filter_overlong_prompts=True,data.max_prompt_length=512,data.max_response_length=1024,data.train_batch_size=1024,data.train_files=/home/mshahidul/data/gsm8k/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/data/gsm8k/test.parquet,trainer.critic_warmup=0,trainer.experiment_name=qwen3-4b-instruct-optimized,trainer.logger=["console","wandb"],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=verl,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15
156
+ id: ???
157
+ num: ???
158
+ config_name: ppo_trainer
159
+ env_set: {}
160
+ env_copy: []
161
+ config:
162
+ override_dirname:
163
+ kv_sep: '='
164
+ item_sep: ','
165
+ exclude_keys: []
166
+ runtime:
167
+ version: 1.3.2
168
+ version_base: '1.3'
169
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train
170
+ config_sources:
171
+ - path: hydra.conf
172
+ schema: pkg
173
+ provider: hydra
174
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config
175
+ schema: file
176
+ provider: main
177
+ - path: ''
178
+ schema: structured
179
+ provider: schema
180
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-01/11-29-26
181
+ choices:
182
+ algorithm@algorithm.rollout_correction: rollout_correction
183
+ reward_model: dp_reward_loop
184
+ critic: dp_critic
185
+ critic/../engine@critic.model.fsdp_config: fsdp
186
+ critic/../optim@critic.optim: fsdp
187
+ model@actor_rollout_ref.model: hf_model
188
+ rollout@actor_rollout_ref.rollout: rollout
189
+ ref@actor_rollout_ref.ref: dp_ref
190
+ ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp
191
+ data: legacy_data
192
+ actor@actor_rollout_ref.actor: dp_actor
193
+ actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp
194
+ actor/../optim@actor_rollout_ref.actor.optim: fsdp
195
+ hydra/env: default
196
+ hydra/callbacks: null
197
+ hydra/job_logging: default
198
+ hydra/hydra_logging: default
199
+ hydra/hydra_help: default
200
+ hydra/help: default
201
+ hydra/sweeper: basic
202
+ hydra/launcher: basic
203
+ hydra/output: default
204
+ verbose: false
code/RL_model/verl/verl_train/outputs/2026-02-01/11-29-26/.hydra/overrides.yaml ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - algorithm.adv_estimator=grpo
2
+ - data.train_files=/home/mshahidul/data/gsm8k/train.parquet
3
+ - data.val_files=/home/mshahidul/data/gsm8k/test.parquet
4
+ - data.train_batch_size=1024
5
+ - data.max_prompt_length=512
6
+ - data.max_response_length=1024
7
+ - data.filter_overlong_prompts=True
8
+ - data.truncation=error
9
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
10
+ - actor_rollout_ref.actor.optim.lr=1e-6
11
+ - actor_rollout_ref.model.use_remove_padding=True
12
+ - actor_rollout_ref.actor.ppo_mini_batch_size=512
13
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=64
14
+ - actor_rollout_ref.actor.use_kl_loss=True
15
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
16
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
17
+ - actor_rollout_ref.actor.entropy_coeff=0
18
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
19
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
20
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
21
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64
22
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
23
+ - actor_rollout_ref.rollout.name=vllm
24
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.7
25
+ - actor_rollout_ref.rollout.n=5
26
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=64
27
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
28
+ - algorithm.use_kl_in_reward=False
29
+ - trainer.critic_warmup=0
30
+ - trainer.logger=["console","wandb"]
31
+ - trainer.project_name=verl
32
+ - trainer.experiment_name=qwen3-4b-instruct-optimized
33
+ - trainer.n_gpus_per_node=2
34
+ - trainer.nnodes=1
35
+ - trainer.save_freq=20
36
+ - trainer.test_freq=5
37
+ - trainer.total_epochs=15
code/RL_model/verl/verl_train/outputs/2026-02-01/11-47-36/.hydra/config.yaml ADDED
@@ -0,0 +1,648 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ actor_rollout_ref:
2
+ actor:
3
+ optim:
4
+ _target_: verl.workers.config.FSDPOptimizerConfig
5
+ optimizer: AdamW
6
+ optimizer_impl: torch.optim
7
+ lr: 1.0e-06
8
+ lr_warmup_steps_ratio: 0.0
9
+ total_training_steps: -1
10
+ weight_decay: 0.01
11
+ lr_warmup_steps: -1
12
+ betas:
13
+ - 0.9
14
+ - 0.999
15
+ clip_grad: 1.0
16
+ min_lr_ratio: 0.0
17
+ num_cycles: 0.5
18
+ lr_scheduler_type: constant
19
+ warmup_style: null
20
+ override_optimizer_config: null
21
+ fsdp_config:
22
+ _target_: verl.workers.config.FSDPEngineConfig
23
+ wrap_policy:
24
+ min_num_params: 0
25
+ param_offload: false
26
+ optimizer_offload: false
27
+ offload_policy: false
28
+ reshard_after_forward: true
29
+ fsdp_size: -1
30
+ forward_prefetch: false
31
+ model_dtype: fp32
32
+ use_orig_params: false
33
+ seed: 42
34
+ full_determinism: false
35
+ ulysses_sequence_parallel_size: 1
36
+ entropy_from_logits_with_chunking: false
37
+ use_torch_compile: true
38
+ entropy_checkpointing: false
39
+ forward_only: false
40
+ strategy: fsdp
41
+ dtype: bfloat16
42
+ _target_: verl.workers.config.FSDPActorConfig
43
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
44
+ strategy: fsdp
45
+ ppo_mini_batch_size: 256
46
+ ppo_micro_batch_size: null
47
+ ppo_micro_batch_size_per_gpu: null
48
+ use_dynamic_bsz: false
49
+ ppo_max_token_len_per_gpu: 16384
50
+ clip_ratio: 0.2
51
+ clip_ratio_low: 0.2
52
+ clip_ratio_high: 0.2
53
+ tau_pos: 1.0
54
+ tau_neg: 1.05
55
+ freeze_vision_tower: false
56
+ policy_loss:
57
+ _target_: verl.workers.config.PolicyLossConfig
58
+ loss_mode: vanilla
59
+ clip_cov_ratio: 0.0002
60
+ clip_cov_lb: 1.0
61
+ clip_cov_ub: 5.0
62
+ kl_cov_ratio: 0.0002
63
+ ppo_kl_coef: 0.1
64
+ clip_ratio_c: 3.0
65
+ loss_agg_mode: token-mean
66
+ loss_scale_factor: null
67
+ entropy_coeff: 0
68
+ calculate_entropy: false
69
+ use_kl_loss: false
70
+ use_prefix_grouper: false
71
+ use_torch_compile: true
72
+ kl_loss_coef: 0.001
73
+ kl_loss_type: low_var_kl
74
+ ppo_epochs: 1
75
+ shuffle: false
76
+ data_loader_seed: 42
77
+ checkpoint:
78
+ _target_: verl.trainer.config.CheckpointConfig
79
+ save_contents:
80
+ - model
81
+ - optimizer
82
+ - extra
83
+ load_contents: ${.save_contents}
84
+ async_save: false
85
+ use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
86
+ profiler:
87
+ _target_: verl.utils.profiler.ProfilerConfig
88
+ tool: ${oc.select:global_profiler.tool,null}
89
+ enable: false
90
+ all_ranks: false
91
+ ranks: []
92
+ save_path: ${oc.select:global_profiler.save_path,null}
93
+ tool_config:
94
+ nsys:
95
+ _target_: verl.utils.profiler.config.NsightToolConfig
96
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
97
+ npu:
98
+ _target_: verl.utils.profiler.config.NPUToolConfig
99
+ contents: []
100
+ level: level0
101
+ analysis: true
102
+ discrete: false
103
+ torch:
104
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
105
+ contents: []
106
+ discrete: false
107
+ torch_memory:
108
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
109
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
110
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
111
+ router_replay:
112
+ _target_: verl.workers.config.RouterReplayConfig
113
+ mode: disabled
114
+ record_file: null
115
+ replay_file: null
116
+ grad_clip: 1.0
117
+ ulysses_sequence_parallel_size: 1
118
+ entropy_from_logits_with_chunking: false
119
+ entropy_checkpointing: false
120
+ use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
121
+ calculate_sum_pi_squared: false
122
+ sum_pi_squared_checkpointing: false
123
+ ref:
124
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
125
+ strategy: ${actor_rollout_ref.actor.strategy}
126
+ use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
127
+ log_prob_micro_batch_size: null
128
+ log_prob_micro_batch_size_per_gpu: null
129
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
130
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
131
+ profiler:
132
+ _target_: verl.utils.profiler.ProfilerConfig
133
+ tool: ${oc.select:global_profiler.tool,null}
134
+ enable: false
135
+ all_ranks: false
136
+ ranks: []
137
+ save_path: ${oc.select:global_profiler.save_path,null}
138
+ tool_config:
139
+ nsys:
140
+ _target_: verl.utils.profiler.config.NsightToolConfig
141
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
142
+ npu:
143
+ _target_: verl.utils.profiler.config.NPUToolConfig
144
+ contents: []
145
+ level: level0
146
+ analysis: true
147
+ discrete: false
148
+ torch:
149
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
150
+ contents: []
151
+ discrete: false
152
+ torch_memory:
153
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
154
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
155
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
156
+ router_replay:
157
+ _target_: verl.workers.config.RouterReplayConfig
158
+ mode: disabled
159
+ record_file: null
160
+ replay_file: null
161
+ fsdp_config:
162
+ _target_: verl.workers.config.FSDPEngineConfig
163
+ wrap_policy:
164
+ min_num_params: 0
165
+ param_offload: false
166
+ optimizer_offload: false
167
+ offload_policy: false
168
+ reshard_after_forward: true
169
+ fsdp_size: -1
170
+ forward_prefetch: false
171
+ model_dtype: fp32
172
+ use_orig_params: false
173
+ seed: 42
174
+ full_determinism: false
175
+ ulysses_sequence_parallel_size: 1
176
+ entropy_from_logits_with_chunking: false
177
+ use_torch_compile: true
178
+ entropy_checkpointing: false
179
+ forward_only: true
180
+ strategy: fsdp
181
+ dtype: bfloat16
182
+ _target_: verl.workers.config.FSDPActorConfig
183
+ ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
184
+ entropy_from_logits_with_chunking: false
185
+ entropy_checkpointing: false
186
+ rollout:
187
+ _target_: verl.workers.config.RolloutConfig
188
+ name: ???
189
+ mode: async
190
+ temperature: 1.0
191
+ top_k: -1
192
+ top_p: 1
193
+ prompt_length: ${oc.select:data.max_prompt_length,512}
194
+ response_length: ${oc.select:data.max_response_length,512}
195
+ dtype: bfloat16
196
+ gpu_memory_utilization: 0.5
197
+ ignore_eos: false
198
+ enforce_eager: false
199
+ cudagraph_capture_sizes: null
200
+ free_cache_engine: true
201
+ tensor_model_parallel_size: 2
202
+ data_parallel_size: 1
203
+ expert_parallel_size: 1
204
+ pipeline_model_parallel_size: 1
205
+ max_num_batched_tokens: 8192
206
+ max_model_len: null
207
+ max_num_seqs: 1024
208
+ enable_chunked_prefill: true
209
+ enable_prefix_caching: true
210
+ logprobs_mode: processed_logprobs
211
+ scheduling_policy: fcfs
212
+ load_format: dummy
213
+ log_prob_micro_batch_size: null
214
+ log_prob_micro_batch_size_per_gpu: null
215
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
216
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
217
+ disable_log_stats: true
218
+ do_sample: true
219
+ 'n': 1
220
+ over_sample_rate: 0
221
+ multi_stage_wake_up: false
222
+ engine_kwargs:
223
+ vllm: {}
224
+ sglang: {}
225
+ trtllm: {}
226
+ val_kwargs:
227
+ _target_: verl.workers.config.SamplingConfig
228
+ top_k: -1
229
+ top_p: 1.0
230
+ temperature: 0
231
+ 'n': 1
232
+ do_sample: false
233
+ multi_turn:
234
+ _target_: verl.workers.config.MultiTurnConfig
235
+ enable: false
236
+ max_assistant_turns: null
237
+ tool_config_path: null
238
+ max_user_turns: null
239
+ max_parallel_calls: 1
240
+ max_tool_response_length: 256
241
+ tool_response_truncate_side: middle
242
+ interaction_config_path: null
243
+ use_inference_chat_template: false
244
+ tokenization_sanity_check_mode: strict
245
+ format: hermes
246
+ num_repeat_rollouts: null
247
+ calculate_log_probs: false
248
+ agent:
249
+ _target_: verl.workers.config.AgentLoopConfig
250
+ num_workers: 8
251
+ default_agent_loop: single_turn_agent
252
+ agent_loop_config_path: null
253
+ custom_async_server:
254
+ _target_: verl.workers.config.CustomAsyncServerConfig
255
+ path: null
256
+ name: null
257
+ checkpoint_engine:
258
+ _target_: verl.workers.config.CheckpointEngineConfig
259
+ backend: naive
260
+ update_weights_bucket_megabytes: 2048
261
+ engine_kwargs: {}
262
+ trace:
263
+ _target_: verl.workers.config.TraceConfig
264
+ backend: null
265
+ token2text: false
266
+ max_samples_per_step_per_worker: null
267
+ skip_rollout: false
268
+ skip_dump_dir: /tmp/rollout_dump
269
+ skip_tokenizer_init: true
270
+ enable_rollout_routing_replay: false
271
+ profiler:
272
+ _target_: verl.utils.profiler.ProfilerConfig
273
+ tool: ${oc.select:global_profiler.tool,null}
274
+ enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
275
+ all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
276
+ ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
277
+ save_path: ${oc.select:global_profiler.save_path,null}
278
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
279
+ prometheus:
280
+ _target_: verl.workers.config.PrometheusConfig
281
+ enable: false
282
+ port: 9090
283
+ file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml
284
+ served_model_name: ${oc.select:actor_rollout_ref.model.path,null}
285
+ quantization: null
286
+ quantization_config_file: null
287
+ mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
288
+ layered_summon: false
289
+ model:
290
+ _target_: verl.workers.config.HFModelConfig
291
+ path: ~/models/deepseek-llm-7b-chat
292
+ hf_config_path: null
293
+ tokenizer_path: null
294
+ use_shm: false
295
+ trust_remote_code: false
296
+ custom_chat_template: null
297
+ external_lib: null
298
+ override_config: {}
299
+ enable_gradient_checkpointing: true
300
+ enable_activation_offload: false
301
+ use_remove_padding: true
302
+ lora_rank: 0
303
+ lora_alpha: 16
304
+ target_modules: all-linear
305
+ exclude_modules: null
306
+ lora_adapter_path: null
307
+ use_liger: false
308
+ use_fused_kernels: false
309
+ fused_kernel_options:
310
+ impl_backend: torch
311
+ tiled_mlp:
312
+ enabled: false
313
+ num_shards: 4
314
+ mtp:
315
+ _target_: verl.workers.config.MtpConfig
316
+ enable: false
317
+ enable_train: false
318
+ enable_rollout: false
319
+ detach_encoder: false
320
+ mtp_loss_scaling_factor: 0.1
321
+ speculative_algorithm: EAGLE
322
+ speculative_num_steps: 3
323
+ speculative_eagle_topk: 1
324
+ speculative_num_draft_tokens: 4
325
+ method: mtp
326
+ num_speculative_tokens: 1
327
+ hybrid_engine: true
328
+ nccl_timeout: 600
329
+ data:
330
+ tokenizer: null
331
+ use_shm: false
332
+ train_files: /home/mshahidul/data/gsm8k/train.parquet
333
+ val_files: /home/mshahidul/data/gsm8k/test.parquet
334
+ train_max_samples: -1
335
+ val_max_samples: -1
336
+ prompt_key: prompt
337
+ reward_fn_key: data_source
338
+ max_prompt_length: 512
339
+ max_response_length: 512
340
+ train_batch_size: 1024
341
+ val_batch_size: null
342
+ tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path,
343
+ null}
344
+ return_raw_input_ids: false
345
+ return_raw_chat: true
346
+ return_full_prompt: false
347
+ shuffle: true
348
+ seed: null
349
+ dataloader_num_workers: 8
350
+ image_patch_size: 14
351
+ validation_shuffle: false
352
+ filter_overlong_prompts: false
353
+ filter_overlong_prompts_workers: 1
354
+ truncation: error
355
+ image_key: images
356
+ video_key: videos
357
+ trust_remote_code: false
358
+ custom_cls:
359
+ path: null
360
+ name: null
361
+ return_multi_modal_inputs: true
362
+ sampler:
363
+ class_path: null
364
+ class_name: null
365
+ datagen:
366
+ path: null
367
+ name: null
368
+ apply_chat_template_kwargs: {}
369
+ reward_manager:
370
+ _target_: verl.trainer.config.config.RewardManagerConfig
371
+ source: register
372
+ name: ${oc.select:reward_model.reward_manager,naive}
373
+ module:
374
+ _target_: verl.trainer.config.config.ModuleConfig
375
+ path: null
376
+ name: custom_reward_manager
377
+ critic:
378
+ optim:
379
+ _target_: verl.workers.config.FSDPOptimizerConfig
380
+ optimizer: AdamW
381
+ optimizer_impl: torch.optim
382
+ lr: 1.0e-05
383
+ lr_warmup_steps_ratio: 0.0
384
+ total_training_steps: -1
385
+ weight_decay: 0.01
386
+ lr_warmup_steps: -1
387
+ betas:
388
+ - 0.9
389
+ - 0.999
390
+ clip_grad: 1.0
391
+ min_lr_ratio: 0.0
392
+ num_cycles: 0.5
393
+ lr_scheduler_type: constant
394
+ warmup_style: null
395
+ override_optimizer_config: null
396
+ model:
397
+ fsdp_config:
398
+ _target_: verl.workers.config.FSDPEngineConfig
399
+ wrap_policy:
400
+ min_num_params: 0
401
+ param_offload: false
402
+ optimizer_offload: false
403
+ offload_policy: false
404
+ reshard_after_forward: true
405
+ fsdp_size: -1
406
+ forward_prefetch: false
407
+ model_dtype: fp32
408
+ use_orig_params: false
409
+ seed: 42
410
+ full_determinism: false
411
+ ulysses_sequence_parallel_size: 1
412
+ entropy_from_logits_with_chunking: false
413
+ use_torch_compile: true
414
+ entropy_checkpointing: false
415
+ forward_only: false
416
+ strategy: fsdp
417
+ dtype: bfloat16
418
+ path: ~/models/deepseek-llm-7b-chat
419
+ tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
420
+ override_config: {}
421
+ external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
422
+ trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
423
+ _target_: verl.workers.config.FSDPCriticModelCfg
424
+ use_shm: false
425
+ enable_gradient_checkpointing: true
426
+ enable_activation_offload: false
427
+ use_remove_padding: false
428
+ lora_rank: 0
429
+ lora_alpha: 16
430
+ target_modules: all-linear
431
+ tiled_mlp:
432
+ enabled: false
433
+ num_shards: 4
434
+ _target_: verl.workers.config.FSDPCriticConfig
435
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
436
+ strategy: fsdp
437
+ enable: null
438
+ ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
439
+ ppo_micro_batch_size: null
440
+ ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
441
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
442
+ ppo_max_token_len_per_gpu: 32768
443
+ forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
444
+ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
445
+ shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
446
+ data_loader_seed: 42
447
+ cliprange_value: 0.5
448
+ loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
449
+ checkpoint:
450
+ _target_: verl.trainer.config.CheckpointConfig
451
+ save_contents:
452
+ - model
453
+ - optimizer
454
+ - extra
455
+ load_contents: ${.save_contents}
456
+ async_save: false
457
+ profiler:
458
+ _target_: verl.utils.profiler.ProfilerConfig
459
+ tool: ${oc.select:global_profiler.tool,null}
460
+ enable: false
461
+ all_ranks: false
462
+ ranks: []
463
+ save_path: ${oc.select:global_profiler.save_path,null}
464
+ tool_config:
465
+ nsys:
466
+ _target_: verl.utils.profiler.config.NsightToolConfig
467
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
468
+ npu:
469
+ _target_: verl.utils.profiler.config.NPUToolConfig
470
+ contents: []
471
+ level: level0
472
+ analysis: true
473
+ discrete: false
474
+ torch:
475
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
476
+ contents: []
477
+ discrete: false
478
+ torch_memory:
479
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
480
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
481
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
482
+ forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
483
+ forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
484
+ ulysses_sequence_parallel_size: 1
485
+ grad_clip: 1.0
486
+ reward_model:
487
+ enable: false
488
+ enable_resource_pool: false
489
+ n_gpus_per_node: 8
490
+ nnodes: 0
491
+ strategy: fsdp
492
+ model:
493
+ input_tokenizer: ${actor_rollout_ref.model.path}
494
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
495
+ external_lib: ${actor_rollout_ref.model.external_lib}
496
+ trust_remote_code: false
497
+ override_config: {}
498
+ use_shm: false
499
+ use_remove_padding: false
500
+ use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
501
+ fsdp_config:
502
+ _target_: verl.workers.config.FSDPEngineConfig
503
+ wrap_policy:
504
+ min_num_params: 0
505
+ param_offload: false
506
+ reshard_after_forward: true
507
+ fsdp_size: -1
508
+ forward_prefetch: false
509
+ micro_batch_size: null
510
+ micro_batch_size_per_gpu: null
511
+ max_length: null
512
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
513
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
514
+ reward_manager: naive
515
+ reward_loop_source: register
516
+ reward_loop_module_path: null
517
+ reward_loop_class_name: null
518
+ launch_reward_fn_async: false
519
+ sandbox_fusion:
520
+ url: null
521
+ max_concurrent: 64
522
+ memory_limit_mb: 1024
523
+ profiler:
524
+ _target_: verl.utils.profiler.ProfilerConfig
525
+ tool: ${oc.select:global_profiler.tool,null}
526
+ enable: false
527
+ all_ranks: false
528
+ ranks: []
529
+ save_path: ${oc.select:global_profiler.save_path,null}
530
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
531
+ ulysses_sequence_parallel_size: 1
532
+ use_reward_loop: true
533
+ num_workers: 1
534
+ rollout:
535
+ _target_: verl.workers.config.RolloutConfig
536
+ name: ???
537
+ dtype: bfloat16
538
+ gpu_memory_utilization: 0.5
539
+ enforce_eager: true
540
+ cudagraph_capture_sizes: null
541
+ free_cache_engine: true
542
+ data_parallel_size: 1
543
+ expert_parallel_size: 1
544
+ tensor_model_parallel_size: 2
545
+ max_num_batched_tokens: 8192
546
+ max_model_len: null
547
+ max_num_seqs: 1024
548
+ load_format: auto
549
+ engine_kwargs: {}
550
+ limit_images: null
551
+ enable_chunked_prefill: true
552
+ enable_prefix_caching: true
553
+ disable_log_stats: true
554
+ skip_tokenizer_init: false
555
+ prompt_length: 2048
556
+ response_length: 2048
557
+ algorithm:
558
+ rollout_correction:
559
+ rollout_is: null
560
+ rollout_is_threshold: 2.0
561
+ rollout_rs: null
562
+ rollout_rs_threshold: null
563
+ bypass_mode: false
564
+ loss_type: ppo_clip
565
+ rollout_is_batch_normalize: false
566
+ _target_: verl.trainer.config.AlgoConfig
567
+ gamma: 1.0
568
+ lam: 1.0
569
+ adv_estimator: grpo
570
+ norm_adv_by_std_in_grpo: true
571
+ use_kl_in_reward: false
572
+ kl_penalty: kl
573
+ kl_ctrl:
574
+ _target_: verl.trainer.config.KLControlConfig
575
+ type: fixed
576
+ kl_coef: 0.001
577
+ horizon: 10000
578
+ target_kl: 0.1
579
+ use_pf_ppo: false
580
+ pf_ppo:
581
+ reweight_method: pow
582
+ weight_pow: 2.0
583
+ custom_reward_function:
584
+ path: null
585
+ name: compute_score
586
+ trainer:
587
+ balance_batch: true
588
+ total_epochs: 30
589
+ total_training_steps: null
590
+ project_name: verl_examples
591
+ experiment_name: gsm8k
592
+ logger:
593
+ - console
594
+ - wandb
595
+ log_val_generations: 0
596
+ rollout_data_dir: null
597
+ validation_data_dir: null
598
+ nnodes: 1
599
+ n_gpus_per_node: 8
600
+ save_freq: -1
601
+ esi_redundant_time: 0
602
+ resume_mode: auto
603
+ resume_from_path: null
604
+ val_before_train: true
605
+ val_only: false
606
+ test_freq: -1
607
+ critic_warmup: 0
608
+ default_hdfs_dir: null
609
+ del_local_ckpt_after_load: false
610
+ default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
611
+ max_actor_ckpt_to_keep: null
612
+ max_critic_ckpt_to_keep: null
613
+ ray_wait_register_center_timeout: 300
614
+ device: cuda
615
+ use_legacy_worker_impl: auto
616
+ global_profiler:
617
+ _target_: verl.utils.profiler.ProfilerConfig
618
+ tool: null
619
+ steps: null
620
+ profile_continuous_steps: false
621
+ save_path: outputs/profile
622
+ global_tool_config:
623
+ nsys:
624
+ _target_: verl.utils.profiler.config.NsightToolConfig
625
+ discrete: false
626
+ controller_nsight_options:
627
+ trace: cuda,nvtx,cublas,ucx
628
+ cuda-memory-usage: 'true'
629
+ cuda-graph-trace: graph
630
+ worker_nsight_options:
631
+ trace: cuda,nvtx,cublas,ucx
632
+ cuda-memory-usage: 'true'
633
+ cuda-graph-trace: graph
634
+ capture-range: cudaProfilerApi
635
+ capture-range-end: null
636
+ kill: none
637
+ torch_memory:
638
+ trace_alloc_max_entries: 100000
639
+ stack_depth: 32
640
+ context: all
641
+ stacks: all
642
+ kw_args: {}
643
+ transfer_queue:
644
+ enable: false
645
+ ray_kwargs:
646
+ ray_init:
647
+ num_cpus: null
648
+ timeline_json_file: null
code/RL_model/verl/verl_train/outputs/2026-02-01/11-47-36/.hydra/hydra.yaml ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task:
115
+ - algorithm.adv_estimator=grpo
116
+ - data.train_files=/home/mshahidul/data/gsm8k/train.parquet
117
+ - data.val_files=/home/mshahidul/data/gsm8k/test.parquet
118
+ job:
119
+ name: main_ppo
120
+ chdir: null
121
+ override_dirname: algorithm.adv_estimator=grpo,data.train_files=/home/mshahidul/data/gsm8k/train.parquet,data.val_files=/home/mshahidul/data/gsm8k/test.parquet
122
+ id: ???
123
+ num: ???
124
+ config_name: ppo_trainer
125
+ env_set: {}
126
+ env_copy: []
127
+ config:
128
+ override_dirname:
129
+ kv_sep: '='
130
+ item_sep: ','
131
+ exclude_keys: []
132
+ runtime:
133
+ version: 1.3.2
134
+ version_base: '1.3'
135
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train
136
+ config_sources:
137
+ - path: hydra.conf
138
+ schema: pkg
139
+ provider: hydra
140
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config
141
+ schema: file
142
+ provider: main
143
+ - path: ''
144
+ schema: structured
145
+ provider: schema
146
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-01/11-47-36
147
+ choices:
148
+ algorithm@algorithm.rollout_correction: rollout_correction
149
+ reward_model: dp_reward_loop
150
+ critic: dp_critic
151
+ critic/../engine@critic.model.fsdp_config: fsdp
152
+ critic/../optim@critic.optim: fsdp
153
+ model@actor_rollout_ref.model: hf_model
154
+ rollout@actor_rollout_ref.rollout: rollout
155
+ ref@actor_rollout_ref.ref: dp_ref
156
+ ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp
157
+ data: legacy_data
158
+ actor@actor_rollout_ref.actor: dp_actor
159
+ actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp
160
+ actor/../optim@actor_rollout_ref.actor.optim: fsdp
161
+ hydra/env: default
162
+ hydra/callbacks: null
163
+ hydra/job_logging: default
164
+ hydra/hydra_logging: default
165
+ hydra/hydra_help: default
166
+ hydra/help: default
167
+ hydra/sweeper: basic
168
+ hydra/launcher: basic
169
+ hydra/output: default
170
+ verbose: false
code/RL_model/verl/verl_train/outputs/2026-02-01/11-47-36/.hydra/overrides.yaml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ - algorithm.adv_estimator=grpo
2
+ - data.train_files=/home/mshahidul/data/gsm8k/train.parquet
3
+ - data.val_files=/home/mshahidul/data/gsm8k/test.parquet
code/RL_model/verl/verl_train/outputs/2026-02-01/11-47-36/main_ppo.log ADDED
File without changes
code/RL_model/verl/verl_train/outputs/2026-02-01/11-58-50/.hydra/config.yaml ADDED
@@ -0,0 +1,648 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ actor_rollout_ref:
2
+ actor:
3
+ optim:
4
+ _target_: verl.workers.config.FSDPOptimizerConfig
5
+ optimizer: AdamW
6
+ optimizer_impl: torch.optim
7
+ lr: 1.0e-06
8
+ lr_warmup_steps_ratio: 0.0
9
+ total_training_steps: -1
10
+ weight_decay: 0.01
11
+ lr_warmup_steps: -1
12
+ betas:
13
+ - 0.9
14
+ - 0.999
15
+ clip_grad: 1.0
16
+ min_lr_ratio: 0.0
17
+ num_cycles: 0.5
18
+ lr_scheduler_type: constant
19
+ warmup_style: null
20
+ override_optimizer_config: null
21
+ fsdp_config:
22
+ _target_: verl.workers.config.FSDPEngineConfig
23
+ wrap_policy:
24
+ min_num_params: 0
25
+ param_offload: false
26
+ optimizer_offload: false
27
+ offload_policy: false
28
+ reshard_after_forward: true
29
+ fsdp_size: -1
30
+ forward_prefetch: false
31
+ model_dtype: fp32
32
+ use_orig_params: false
33
+ seed: 42
34
+ full_determinism: false
35
+ ulysses_sequence_parallel_size: 1
36
+ entropy_from_logits_with_chunking: false
37
+ use_torch_compile: true
38
+ entropy_checkpointing: false
39
+ forward_only: false
40
+ strategy: fsdp
41
+ dtype: bfloat16
42
+ _target_: verl.workers.config.FSDPActorConfig
43
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
44
+ strategy: fsdp
45
+ ppo_mini_batch_size: 256
46
+ ppo_micro_batch_size: null
47
+ ppo_micro_batch_size_per_gpu: null
48
+ use_dynamic_bsz: false
49
+ ppo_max_token_len_per_gpu: 16384
50
+ clip_ratio: 0.2
51
+ clip_ratio_low: 0.2
52
+ clip_ratio_high: 0.2
53
+ tau_pos: 1.0
54
+ tau_neg: 1.05
55
+ freeze_vision_tower: false
56
+ policy_loss:
57
+ _target_: verl.workers.config.PolicyLossConfig
58
+ loss_mode: vanilla
59
+ clip_cov_ratio: 0.0002
60
+ clip_cov_lb: 1.0
61
+ clip_cov_ub: 5.0
62
+ kl_cov_ratio: 0.0002
63
+ ppo_kl_coef: 0.1
64
+ clip_ratio_c: 3.0
65
+ loss_agg_mode: token-mean
66
+ loss_scale_factor: null
67
+ entropy_coeff: 0
68
+ calculate_entropy: false
69
+ use_kl_loss: false
70
+ use_prefix_grouper: false
71
+ use_torch_compile: true
72
+ kl_loss_coef: 0.001
73
+ kl_loss_type: low_var_kl
74
+ ppo_epochs: 1
75
+ shuffle: false
76
+ data_loader_seed: 42
77
+ checkpoint:
78
+ _target_: verl.trainer.config.CheckpointConfig
79
+ save_contents:
80
+ - model
81
+ - optimizer
82
+ - extra
83
+ load_contents: ${.save_contents}
84
+ async_save: false
85
+ use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
86
+ profiler:
87
+ _target_: verl.utils.profiler.ProfilerConfig
88
+ tool: ${oc.select:global_profiler.tool,null}
89
+ enable: false
90
+ all_ranks: false
91
+ ranks: []
92
+ save_path: ${oc.select:global_profiler.save_path,null}
93
+ tool_config:
94
+ nsys:
95
+ _target_: verl.utils.profiler.config.NsightToolConfig
96
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
97
+ npu:
98
+ _target_: verl.utils.profiler.config.NPUToolConfig
99
+ contents: []
100
+ level: level0
101
+ analysis: true
102
+ discrete: false
103
+ torch:
104
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
105
+ contents: []
106
+ discrete: false
107
+ torch_memory:
108
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
109
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
110
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
111
+ router_replay:
112
+ _target_: verl.workers.config.RouterReplayConfig
113
+ mode: disabled
114
+ record_file: null
115
+ replay_file: null
116
+ grad_clip: 1.0
117
+ ulysses_sequence_parallel_size: 1
118
+ entropy_from_logits_with_chunking: false
119
+ entropy_checkpointing: false
120
+ use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
121
+ calculate_sum_pi_squared: false
122
+ sum_pi_squared_checkpointing: false
123
+ ref:
124
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
125
+ strategy: ${actor_rollout_ref.actor.strategy}
126
+ use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
127
+ log_prob_micro_batch_size: null
128
+ log_prob_micro_batch_size_per_gpu: null
129
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
130
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
131
+ profiler:
132
+ _target_: verl.utils.profiler.ProfilerConfig
133
+ tool: ${oc.select:global_profiler.tool,null}
134
+ enable: false
135
+ all_ranks: false
136
+ ranks: []
137
+ save_path: ${oc.select:global_profiler.save_path,null}
138
+ tool_config:
139
+ nsys:
140
+ _target_: verl.utils.profiler.config.NsightToolConfig
141
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
142
+ npu:
143
+ _target_: verl.utils.profiler.config.NPUToolConfig
144
+ contents: []
145
+ level: level0
146
+ analysis: true
147
+ discrete: false
148
+ torch:
149
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
150
+ contents: []
151
+ discrete: false
152
+ torch_memory:
153
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
154
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
155
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
156
+ router_replay:
157
+ _target_: verl.workers.config.RouterReplayConfig
158
+ mode: disabled
159
+ record_file: null
160
+ replay_file: null
161
+ fsdp_config:
162
+ _target_: verl.workers.config.FSDPEngineConfig
163
+ wrap_policy:
164
+ min_num_params: 0
165
+ param_offload: false
166
+ optimizer_offload: false
167
+ offload_policy: false
168
+ reshard_after_forward: true
169
+ fsdp_size: -1
170
+ forward_prefetch: false
171
+ model_dtype: fp32
172
+ use_orig_params: false
173
+ seed: 42
174
+ full_determinism: false
175
+ ulysses_sequence_parallel_size: 1
176
+ entropy_from_logits_with_chunking: false
177
+ use_torch_compile: true
178
+ entropy_checkpointing: false
179
+ forward_only: true
180
+ strategy: fsdp
181
+ dtype: bfloat16
182
+ _target_: verl.workers.config.FSDPActorConfig
183
+ ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
184
+ entropy_from_logits_with_chunking: false
185
+ entropy_checkpointing: false
186
+ rollout:
187
+ _target_: verl.workers.config.RolloutConfig
188
+ name: ???
189
+ mode: async
190
+ temperature: 1.0
191
+ top_k: -1
192
+ top_p: 1
193
+ prompt_length: ${oc.select:data.max_prompt_length,512}
194
+ response_length: ${oc.select:data.max_response_length,512}
195
+ dtype: bfloat16
196
+ gpu_memory_utilization: 0.5
197
+ ignore_eos: false
198
+ enforce_eager: false
199
+ cudagraph_capture_sizes: null
200
+ free_cache_engine: true
201
+ tensor_model_parallel_size: 2
202
+ data_parallel_size: 1
203
+ expert_parallel_size: 1
204
+ pipeline_model_parallel_size: 1
205
+ max_num_batched_tokens: 8192
206
+ max_model_len: null
207
+ max_num_seqs: 1024
208
+ enable_chunked_prefill: true
209
+ enable_prefix_caching: true
210
+ logprobs_mode: processed_logprobs
211
+ scheduling_policy: fcfs
212
+ load_format: dummy
213
+ log_prob_micro_batch_size: null
214
+ log_prob_micro_batch_size_per_gpu: null
215
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
216
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
217
+ disable_log_stats: true
218
+ do_sample: true
219
+ 'n': 1
220
+ over_sample_rate: 0
221
+ multi_stage_wake_up: false
222
+ engine_kwargs:
223
+ vllm: {}
224
+ sglang: {}
225
+ trtllm: {}
226
+ val_kwargs:
227
+ _target_: verl.workers.config.SamplingConfig
228
+ top_k: -1
229
+ top_p: 1.0
230
+ temperature: 0
231
+ 'n': 1
232
+ do_sample: false
233
+ multi_turn:
234
+ _target_: verl.workers.config.MultiTurnConfig
235
+ enable: false
236
+ max_assistant_turns: null
237
+ tool_config_path: null
238
+ max_user_turns: null
239
+ max_parallel_calls: 1
240
+ max_tool_response_length: 256
241
+ tool_response_truncate_side: middle
242
+ interaction_config_path: null
243
+ use_inference_chat_template: false
244
+ tokenization_sanity_check_mode: strict
245
+ format: hermes
246
+ num_repeat_rollouts: null
247
+ calculate_log_probs: false
248
+ agent:
249
+ _target_: verl.workers.config.AgentLoopConfig
250
+ num_workers: 8
251
+ default_agent_loop: single_turn_agent
252
+ agent_loop_config_path: null
253
+ custom_async_server:
254
+ _target_: verl.workers.config.CustomAsyncServerConfig
255
+ path: null
256
+ name: null
257
+ checkpoint_engine:
258
+ _target_: verl.workers.config.CheckpointEngineConfig
259
+ backend: naive
260
+ update_weights_bucket_megabytes: 2048
261
+ engine_kwargs: {}
262
+ trace:
263
+ _target_: verl.workers.config.TraceConfig
264
+ backend: null
265
+ token2text: false
266
+ max_samples_per_step_per_worker: null
267
+ skip_rollout: false
268
+ skip_dump_dir: /tmp/rollout_dump
269
+ skip_tokenizer_init: true
270
+ enable_rollout_routing_replay: false
271
+ profiler:
272
+ _target_: verl.utils.profiler.ProfilerConfig
273
+ tool: ${oc.select:global_profiler.tool,null}
274
+ enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
275
+ all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
276
+ ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
277
+ save_path: ${oc.select:global_profiler.save_path,null}
278
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
279
+ prometheus:
280
+ _target_: verl.workers.config.PrometheusConfig
281
+ enable: false
282
+ port: 9090
283
+ file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml
284
+ served_model_name: ${oc.select:actor_rollout_ref.model.path,null}
285
+ quantization: null
286
+ quantization_config_file: null
287
+ mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
288
+ layered_summon: false
289
+ model:
290
+ _target_: verl.workers.config.HFModelConfig
291
+ path: ~/models/deepseek-llm-7b-chat
292
+ hf_config_path: null
293
+ tokenizer_path: null
294
+ use_shm: false
295
+ trust_remote_code: false
296
+ custom_chat_template: null
297
+ external_lib: null
298
+ override_config: {}
299
+ enable_gradient_checkpointing: true
300
+ enable_activation_offload: false
301
+ use_remove_padding: true
302
+ lora_rank: 0
303
+ lora_alpha: 16
304
+ target_modules: all-linear
305
+ exclude_modules: null
306
+ lora_adapter_path: null
307
+ use_liger: false
308
+ use_fused_kernels: false
309
+ fused_kernel_options:
310
+ impl_backend: torch
311
+ tiled_mlp:
312
+ enabled: false
313
+ num_shards: 4
314
+ mtp:
315
+ _target_: verl.workers.config.MtpConfig
316
+ enable: false
317
+ enable_train: false
318
+ enable_rollout: false
319
+ detach_encoder: false
320
+ mtp_loss_scaling_factor: 0.1
321
+ speculative_algorithm: EAGLE
322
+ speculative_num_steps: 3
323
+ speculative_eagle_topk: 1
324
+ speculative_num_draft_tokens: 4
325
+ method: mtp
326
+ num_speculative_tokens: 1
327
+ hybrid_engine: true
328
+ nccl_timeout: 600
329
+ data:
330
+ tokenizer: null
331
+ use_shm: false
332
+ train_files: /home/mshahidul/data/gsm8k/train.parquet
333
+ val_files: /home/mshahidul/data/gsm8k/test.parquet
334
+ train_max_samples: -1
335
+ val_max_samples: -1
336
+ prompt_key: prompt
337
+ reward_fn_key: data_source
338
+ max_prompt_length: 512
339
+ max_response_length: 512
340
+ train_batch_size: 1024
341
+ val_batch_size: null
342
+ tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path,
343
+ null}
344
+ return_raw_input_ids: false
345
+ return_raw_chat: true
346
+ return_full_prompt: false
347
+ shuffle: true
348
+ seed: null
349
+ dataloader_num_workers: 8
350
+ image_patch_size: 14
351
+ validation_shuffle: false
352
+ filter_overlong_prompts: false
353
+ filter_overlong_prompts_workers: 1
354
+ truncation: error
355
+ image_key: images
356
+ video_key: videos
357
+ trust_remote_code: false
358
+ custom_cls:
359
+ path: null
360
+ name: null
361
+ return_multi_modal_inputs: true
362
+ sampler:
363
+ class_path: null
364
+ class_name: null
365
+ datagen:
366
+ path: null
367
+ name: null
368
+ apply_chat_template_kwargs: {}
369
+ reward_manager:
370
+ _target_: verl.trainer.config.config.RewardManagerConfig
371
+ source: register
372
+ name: ${oc.select:reward_model.reward_manager,naive}
373
+ module:
374
+ _target_: verl.trainer.config.config.ModuleConfig
375
+ path: null
376
+ name: custom_reward_manager
377
+ critic:
378
+ optim:
379
+ _target_: verl.workers.config.FSDPOptimizerConfig
380
+ optimizer: AdamW
381
+ optimizer_impl: torch.optim
382
+ lr: 1.0e-05
383
+ lr_warmup_steps_ratio: 0.0
384
+ total_training_steps: -1
385
+ weight_decay: 0.01
386
+ lr_warmup_steps: -1
387
+ betas:
388
+ - 0.9
389
+ - 0.999
390
+ clip_grad: 1.0
391
+ min_lr_ratio: 0.0
392
+ num_cycles: 0.5
393
+ lr_scheduler_type: constant
394
+ warmup_style: null
395
+ override_optimizer_config: null
396
+ model:
397
+ fsdp_config:
398
+ _target_: verl.workers.config.FSDPEngineConfig
399
+ wrap_policy:
400
+ min_num_params: 0
401
+ param_offload: false
402
+ optimizer_offload: false
403
+ offload_policy: false
404
+ reshard_after_forward: true
405
+ fsdp_size: -1
406
+ forward_prefetch: false
407
+ model_dtype: fp32
408
+ use_orig_params: false
409
+ seed: 42
410
+ full_determinism: false
411
+ ulysses_sequence_parallel_size: 1
412
+ entropy_from_logits_with_chunking: false
413
+ use_torch_compile: true
414
+ entropy_checkpointing: false
415
+ forward_only: false
416
+ strategy: fsdp
417
+ dtype: bfloat16
418
+ path: ~/models/deepseek-llm-7b-chat
419
+ tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
420
+ override_config: {}
421
+ external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
422
+ trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
423
+ _target_: verl.workers.config.FSDPCriticModelCfg
424
+ use_shm: false
425
+ enable_gradient_checkpointing: true
426
+ enable_activation_offload: false
427
+ use_remove_padding: false
428
+ lora_rank: 0
429
+ lora_alpha: 16
430
+ target_modules: all-linear
431
+ tiled_mlp:
432
+ enabled: false
433
+ num_shards: 4
434
+ _target_: verl.workers.config.FSDPCriticConfig
435
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
436
+ strategy: fsdp
437
+ enable: null
438
+ ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
439
+ ppo_micro_batch_size: null
440
+ ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
441
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
442
+ ppo_max_token_len_per_gpu: 32768
443
+ forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
444
+ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
445
+ shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
446
+ data_loader_seed: 42
447
+ cliprange_value: 0.5
448
+ loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
449
+ checkpoint:
450
+ _target_: verl.trainer.config.CheckpointConfig
451
+ save_contents:
452
+ - model
453
+ - optimizer
454
+ - extra
455
+ load_contents: ${.save_contents}
456
+ async_save: false
457
+ profiler:
458
+ _target_: verl.utils.profiler.ProfilerConfig
459
+ tool: ${oc.select:global_profiler.tool,null}
460
+ enable: false
461
+ all_ranks: false
462
+ ranks: []
463
+ save_path: ${oc.select:global_profiler.save_path,null}
464
+ tool_config:
465
+ nsys:
466
+ _target_: verl.utils.profiler.config.NsightToolConfig
467
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
468
+ npu:
469
+ _target_: verl.utils.profiler.config.NPUToolConfig
470
+ contents: []
471
+ level: level0
472
+ analysis: true
473
+ discrete: false
474
+ torch:
475
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
476
+ contents: []
477
+ discrete: false
478
+ torch_memory:
479
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
480
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
481
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
482
+ forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
483
+ forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
484
+ ulysses_sequence_parallel_size: 1
485
+ grad_clip: 1.0
486
+ reward_model:
487
+ enable: false
488
+ enable_resource_pool: false
489
+ n_gpus_per_node: 8
490
+ nnodes: 0
491
+ strategy: fsdp
492
+ model:
493
+ input_tokenizer: ${actor_rollout_ref.model.path}
494
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
495
+ external_lib: ${actor_rollout_ref.model.external_lib}
496
+ trust_remote_code: false
497
+ override_config: {}
498
+ use_shm: false
499
+ use_remove_padding: false
500
+ use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
501
+ fsdp_config:
502
+ _target_: verl.workers.config.FSDPEngineConfig
503
+ wrap_policy:
504
+ min_num_params: 0
505
+ param_offload: false
506
+ reshard_after_forward: true
507
+ fsdp_size: -1
508
+ forward_prefetch: false
509
+ micro_batch_size: null
510
+ micro_batch_size_per_gpu: null
511
+ max_length: null
512
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
513
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
514
+ reward_manager: naive
515
+ reward_loop_source: register
516
+ reward_loop_module_path: null
517
+ reward_loop_class_name: null
518
+ launch_reward_fn_async: false
519
+ sandbox_fusion:
520
+ url: null
521
+ max_concurrent: 64
522
+ memory_limit_mb: 1024
523
+ profiler:
524
+ _target_: verl.utils.profiler.ProfilerConfig
525
+ tool: ${oc.select:global_profiler.tool,null}
526
+ enable: false
527
+ all_ranks: false
528
+ ranks: []
529
+ save_path: ${oc.select:global_profiler.save_path,null}
530
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
531
+ ulysses_sequence_parallel_size: 1
532
+ use_reward_loop: true
533
+ num_workers: 1
534
+ rollout:
535
+ _target_: verl.workers.config.RolloutConfig
536
+ name: ???
537
+ dtype: bfloat16
538
+ gpu_memory_utilization: 0.5
539
+ enforce_eager: true
540
+ cudagraph_capture_sizes: null
541
+ free_cache_engine: true
542
+ data_parallel_size: 1
543
+ expert_parallel_size: 1
544
+ tensor_model_parallel_size: 2
545
+ max_num_batched_tokens: 8192
546
+ max_model_len: null
547
+ max_num_seqs: 1024
548
+ load_format: auto
549
+ engine_kwargs: {}
550
+ limit_images: null
551
+ enable_chunked_prefill: true
552
+ enable_prefix_caching: true
553
+ disable_log_stats: true
554
+ skip_tokenizer_init: false
555
+ prompt_length: 2048
556
+ response_length: 2048
557
+ algorithm:
558
+ rollout_correction:
559
+ rollout_is: null
560
+ rollout_is_threshold: 2.0
561
+ rollout_rs: null
562
+ rollout_rs_threshold: null
563
+ bypass_mode: false
564
+ loss_type: ppo_clip
565
+ rollout_is_batch_normalize: false
566
+ _target_: verl.trainer.config.AlgoConfig
567
+ gamma: 1.0
568
+ lam: 1.0
569
+ adv_estimator: grpo
570
+ norm_adv_by_std_in_grpo: true
571
+ use_kl_in_reward: false
572
+ kl_penalty: kl
573
+ kl_ctrl:
574
+ _target_: verl.trainer.config.KLControlConfig
575
+ type: fixed
576
+ kl_coef: 0.001
577
+ horizon: 10000
578
+ target_kl: 0.1
579
+ use_pf_ppo: false
580
+ pf_ppo:
581
+ reweight_method: pow
582
+ weight_pow: 2.0
583
+ custom_reward_function:
584
+ path: null
585
+ name: compute_score
586
+ trainer:
587
+ balance_batch: true
588
+ total_epochs: 30
589
+ total_training_steps: null
590
+ project_name: verl_examples
591
+ experiment_name: gsm8k
592
+ logger:
593
+ - console
594
+ - wandb
595
+ log_val_generations: 0
596
+ rollout_data_dir: null
597
+ validation_data_dir: null
598
+ nnodes: 1
599
+ n_gpus_per_node: 8
600
+ save_freq: -1
601
+ esi_redundant_time: 0
602
+ resume_mode: auto
603
+ resume_from_path: null
604
+ val_before_train: true
605
+ val_only: false
606
+ test_freq: -1
607
+ critic_warmup: 0
608
+ default_hdfs_dir: null
609
+ del_local_ckpt_after_load: false
610
+ default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
611
+ max_actor_ckpt_to_keep: null
612
+ max_critic_ckpt_to_keep: null
613
+ ray_wait_register_center_timeout: 300
614
+ device: cuda
615
+ use_legacy_worker_impl: auto
616
+ global_profiler:
617
+ _target_: verl.utils.profiler.ProfilerConfig
618
+ tool: null
619
+ steps: null
620
+ profile_continuous_steps: false
621
+ save_path: outputs/profile
622
+ global_tool_config:
623
+ nsys:
624
+ _target_: verl.utils.profiler.config.NsightToolConfig
625
+ discrete: false
626
+ controller_nsight_options:
627
+ trace: cuda,nvtx,cublas,ucx
628
+ cuda-memory-usage: 'true'
629
+ cuda-graph-trace: graph
630
+ worker_nsight_options:
631
+ trace: cuda,nvtx,cublas,ucx
632
+ cuda-memory-usage: 'true'
633
+ cuda-graph-trace: graph
634
+ capture-range: cudaProfilerApi
635
+ capture-range-end: null
636
+ kill: none
637
+ torch_memory:
638
+ trace_alloc_max_entries: 100000
639
+ stack_depth: 32
640
+ context: all
641
+ stacks: all
642
+ kw_args: {}
643
+ transfer_queue:
644
+ enable: false
645
+ ray_kwargs:
646
+ ray_init:
647
+ num_cpus: null
648
+ timeline_json_file: null
code/RL_model/verl/verl_train/outputs/2026-02-01/11-58-50/.hydra/hydra.yaml ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task:
115
+ - algorithm.adv_estimator=grpo
116
+ - data.train_files=/home/mshahidul/data/gsm8k/train.parquet
117
+ - data.val_files=/home/mshahidul/data/gsm8k/test.parquet
118
+ job:
119
+ name: main_ppo
120
+ chdir: null
121
+ override_dirname: algorithm.adv_estimator=grpo,data.train_files=/home/mshahidul/data/gsm8k/train.parquet,data.val_files=/home/mshahidul/data/gsm8k/test.parquet
122
+ id: ???
123
+ num: ???
124
+ config_name: ppo_trainer
125
+ env_set: {}
126
+ env_copy: []
127
+ config:
128
+ override_dirname:
129
+ kv_sep: '='
130
+ item_sep: ','
131
+ exclude_keys: []
132
+ runtime:
133
+ version: 1.3.2
134
+ version_base: '1.3'
135
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train
136
+ config_sources:
137
+ - path: hydra.conf
138
+ schema: pkg
139
+ provider: hydra
140
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config
141
+ schema: file
142
+ provider: main
143
+ - path: ''
144
+ schema: structured
145
+ provider: schema
146
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-01/11-58-50
147
+ choices:
148
+ algorithm@algorithm.rollout_correction: rollout_correction
149
+ reward_model: dp_reward_loop
150
+ critic: dp_critic
151
+ critic/../engine@critic.model.fsdp_config: fsdp
152
+ critic/../optim@critic.optim: fsdp
153
+ model@actor_rollout_ref.model: hf_model
154
+ rollout@actor_rollout_ref.rollout: rollout
155
+ ref@actor_rollout_ref.ref: dp_ref
156
+ ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp
157
+ data: legacy_data
158
+ actor@actor_rollout_ref.actor: dp_actor
159
+ actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp
160
+ actor/../optim@actor_rollout_ref.actor.optim: fsdp
161
+ hydra/env: default
162
+ hydra/callbacks: null
163
+ hydra/job_logging: default
164
+ hydra/hydra_logging: default
165
+ hydra/hydra_help: default
166
+ hydra/help: default
167
+ hydra/sweeper: basic
168
+ hydra/launcher: basic
169
+ hydra/output: default
170
+ verbose: false
code/RL_model/verl/verl_train/outputs/2026-02-01/11-58-50/.hydra/overrides.yaml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ - algorithm.adv_estimator=grpo
2
+ - data.train_files=/home/mshahidul/data/gsm8k/train.parquet
3
+ - data.val_files=/home/mshahidul/data/gsm8k/test.parquet
code/RL_model/verl/verl_train/outputs/2026-02-01/11-58-50/main_ppo.log ADDED
File without changes
code/RL_model/verl/verl_train/outputs/2026-02-01/12-06-58/.hydra/config.yaml ADDED
@@ -0,0 +1,651 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ actor_rollout_ref:
2
+ actor:
3
+ optim:
4
+ _target_: verl.workers.config.FSDPOptimizerConfig
5
+ optimizer: AdamW
6
+ optimizer_impl: torch.optim
7
+ lr: 1.0e-06
8
+ lr_warmup_steps_ratio: 0.0
9
+ total_training_steps: -1
10
+ weight_decay: 0.01
11
+ lr_warmup_steps: -1
12
+ betas:
13
+ - 0.9
14
+ - 0.999
15
+ clip_grad: 1.0
16
+ min_lr_ratio: 0.0
17
+ num_cycles: 0.5
18
+ lr_scheduler_type: constant
19
+ warmup_style: null
20
+ override_optimizer_config: null
21
+ fsdp_config:
22
+ _target_: verl.workers.config.FSDPEngineConfig
23
+ wrap_policy:
24
+ min_num_params: 0
25
+ param_offload: false
26
+ optimizer_offload: false
27
+ offload_policy: false
28
+ reshard_after_forward: true
29
+ fsdp_size: -1
30
+ forward_prefetch: false
31
+ model_dtype: fp32
32
+ use_orig_params: false
33
+ seed: 42
34
+ full_determinism: false
35
+ ulysses_sequence_parallel_size: 1
36
+ entropy_from_logits_with_chunking: false
37
+ use_torch_compile: true
38
+ entropy_checkpointing: false
39
+ forward_only: false
40
+ strategy: fsdp
41
+ dtype: bfloat16
42
+ _target_: verl.workers.config.FSDPActorConfig
43
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
44
+ strategy: fsdp
45
+ ppo_mini_batch_size: 512
46
+ ppo_micro_batch_size: null
47
+ ppo_micro_batch_size_per_gpu: 64
48
+ use_dynamic_bsz: false
49
+ ppo_max_token_len_per_gpu: 16384
50
+ clip_ratio: 0.2
51
+ clip_ratio_low: 0.2
52
+ clip_ratio_high: 0.2
53
+ tau_pos: 1.0
54
+ tau_neg: 1.05
55
+ freeze_vision_tower: false
56
+ policy_loss:
57
+ _target_: verl.workers.config.PolicyLossConfig
58
+ loss_mode: vanilla
59
+ clip_cov_ratio: 0.0002
60
+ clip_cov_lb: 1.0
61
+ clip_cov_ub: 5.0
62
+ kl_cov_ratio: 0.0002
63
+ ppo_kl_coef: 0.1
64
+ clip_ratio_c: 3.0
65
+ loss_agg_mode: token-mean
66
+ loss_scale_factor: null
67
+ entropy_coeff: 0
68
+ calculate_entropy: false
69
+ use_kl_loss: true
70
+ use_prefix_grouper: false
71
+ use_torch_compile: true
72
+ kl_loss_coef: 0.001
73
+ kl_loss_type: low_var_kl
74
+ ppo_epochs: 1
75
+ shuffle: false
76
+ data_loader_seed: 42
77
+ checkpoint:
78
+ _target_: verl.trainer.config.CheckpointConfig
79
+ save_contents:
80
+ - model
81
+ - optimizer
82
+ - extra
83
+ load_contents: ${.save_contents}
84
+ async_save: false
85
+ use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
86
+ profiler:
87
+ _target_: verl.utils.profiler.ProfilerConfig
88
+ tool: ${oc.select:global_profiler.tool,null}
89
+ enable: false
90
+ all_ranks: false
91
+ ranks: []
92
+ save_path: ${oc.select:global_profiler.save_path,null}
93
+ tool_config:
94
+ nsys:
95
+ _target_: verl.utils.profiler.config.NsightToolConfig
96
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
97
+ npu:
98
+ _target_: verl.utils.profiler.config.NPUToolConfig
99
+ contents: []
100
+ level: level0
101
+ analysis: true
102
+ discrete: false
103
+ torch:
104
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
105
+ contents: []
106
+ discrete: false
107
+ torch_memory:
108
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
109
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
110
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
111
+ router_replay:
112
+ _target_: verl.workers.config.RouterReplayConfig
113
+ mode: disabled
114
+ record_file: null
115
+ replay_file: null
116
+ grad_clip: 1.0
117
+ ulysses_sequence_parallel_size: 1
118
+ entropy_from_logits_with_chunking: false
119
+ entropy_checkpointing: false
120
+ use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
121
+ calculate_sum_pi_squared: false
122
+ sum_pi_squared_checkpointing: false
123
+ ref:
124
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
125
+ strategy: ${actor_rollout_ref.actor.strategy}
126
+ use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
127
+ log_prob_micro_batch_size: null
128
+ log_prob_micro_batch_size_per_gpu: 64
129
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
130
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
131
+ profiler:
132
+ _target_: verl.utils.profiler.ProfilerConfig
133
+ tool: ${oc.select:global_profiler.tool,null}
134
+ enable: false
135
+ all_ranks: false
136
+ ranks: []
137
+ save_path: ${oc.select:global_profiler.save_path,null}
138
+ tool_config:
139
+ nsys:
140
+ _target_: verl.utils.profiler.config.NsightToolConfig
141
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
142
+ npu:
143
+ _target_: verl.utils.profiler.config.NPUToolConfig
144
+ contents: []
145
+ level: level0
146
+ analysis: true
147
+ discrete: false
148
+ torch:
149
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
150
+ contents: []
151
+ discrete: false
152
+ torch_memory:
153
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
154
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
155
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
156
+ router_replay:
157
+ _target_: verl.workers.config.RouterReplayConfig
158
+ mode: disabled
159
+ record_file: null
160
+ replay_file: null
161
+ fsdp_config:
162
+ _target_: verl.workers.config.FSDPEngineConfig
163
+ wrap_policy:
164
+ min_num_params: 0
165
+ param_offload: false
166
+ optimizer_offload: false
167
+ offload_policy: false
168
+ reshard_after_forward: true
169
+ fsdp_size: -1
170
+ forward_prefetch: false
171
+ model_dtype: fp32
172
+ use_orig_params: false
173
+ seed: 42
174
+ full_determinism: false
175
+ ulysses_sequence_parallel_size: 1
176
+ entropy_from_logits_with_chunking: false
177
+ use_torch_compile: true
178
+ entropy_checkpointing: false
179
+ forward_only: true
180
+ strategy: fsdp
181
+ dtype: bfloat16
182
+ _target_: verl.workers.config.FSDPActorConfig
183
+ ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
184
+ entropy_from_logits_with_chunking: false
185
+ entropy_checkpointing: false
186
+ rollout:
187
+ _target_: verl.workers.config.RolloutConfig
188
+ name: vllm
189
+ mode: async
190
+ temperature: 1.0
191
+ top_k: -1
192
+ top_p: 1
193
+ prompt_length: ${oc.select:data.max_prompt_length,512}
194
+ response_length: ${oc.select:data.max_response_length,512}
195
+ dtype: bfloat16
196
+ gpu_memory_utilization: 0.7
197
+ ignore_eos: false
198
+ enforce_eager: false
199
+ cudagraph_capture_sizes: null
200
+ free_cache_engine: true
201
+ tensor_model_parallel_size: 1
202
+ data_parallel_size: 1
203
+ expert_parallel_size: 1
204
+ pipeline_model_parallel_size: 1
205
+ max_num_batched_tokens: 8192
206
+ max_model_len: null
207
+ max_num_seqs: 1024
208
+ enable_chunked_prefill: true
209
+ enable_prefix_caching: true
210
+ logprobs_mode: processed_logprobs
211
+ scheduling_policy: fcfs
212
+ load_format: dummy
213
+ log_prob_micro_batch_size: null
214
+ log_prob_micro_batch_size_per_gpu: 64
215
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
216
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
217
+ disable_log_stats: true
218
+ do_sample: true
219
+ 'n': 5
220
+ over_sample_rate: 0
221
+ multi_stage_wake_up: false
222
+ engine_kwargs:
223
+ vllm: {}
224
+ sglang: {}
225
+ trtllm: {}
226
+ val_kwargs:
227
+ _target_: verl.workers.config.SamplingConfig
228
+ top_k: -1
229
+ top_p: 1.0
230
+ temperature: 0
231
+ 'n': 1
232
+ do_sample: false
233
+ multi_turn:
234
+ _target_: verl.workers.config.MultiTurnConfig
235
+ enable: false
236
+ max_assistant_turns: null
237
+ tool_config_path: null
238
+ max_user_turns: null
239
+ max_parallel_calls: 1
240
+ max_tool_response_length: 256
241
+ tool_response_truncate_side: middle
242
+ interaction_config_path: null
243
+ use_inference_chat_template: false
244
+ tokenization_sanity_check_mode: strict
245
+ format: hermes
246
+ num_repeat_rollouts: null
247
+ calculate_log_probs: false
248
+ agent:
249
+ _target_: verl.workers.config.AgentLoopConfig
250
+ num_workers: 8
251
+ default_agent_loop: single_turn_agent
252
+ agent_loop_config_path: null
253
+ custom_async_server:
254
+ _target_: verl.workers.config.CustomAsyncServerConfig
255
+ path: null
256
+ name: null
257
+ checkpoint_engine:
258
+ _target_: verl.workers.config.CheckpointEngineConfig
259
+ backend: naive
260
+ update_weights_bucket_megabytes: 2048
261
+ engine_kwargs: {}
262
+ trace:
263
+ _target_: verl.workers.config.TraceConfig
264
+ backend: null
265
+ token2text: false
266
+ max_samples_per_step_per_worker: null
267
+ skip_rollout: false
268
+ skip_dump_dir: /tmp/rollout_dump
269
+ skip_tokenizer_init: true
270
+ enable_rollout_routing_replay: false
271
+ profiler:
272
+ _target_: verl.utils.profiler.ProfilerConfig
273
+ tool: ${oc.select:global_profiler.tool,null}
274
+ enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
275
+ all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
276
+ ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
277
+ save_path: ${oc.select:global_profiler.save_path,null}
278
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
279
+ prometheus:
280
+ _target_: verl.workers.config.PrometheusConfig
281
+ enable: false
282
+ port: 9090
283
+ file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml
284
+ served_model_name: ${oc.select:actor_rollout_ref.model.path,null}
285
+ quantization: null
286
+ quantization_config_file: null
287
+ mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
288
+ layered_summon: false
289
+ model:
290
+ _target_: verl.workers.config.HFModelConfig
291
+ path: Qwen/Qwen3-4B-Instruct-2507
292
+ hf_config_path: null
293
+ tokenizer_path: null
294
+ use_shm: false
295
+ trust_remote_code: false
296
+ custom_chat_template: null
297
+ external_lib: null
298
+ override_config: {}
299
+ enable_gradient_checkpointing: true
300
+ enable_activation_offload: false
301
+ use_remove_padding: true
302
+ lora_rank: 0
303
+ lora_alpha: 16
304
+ target_modules: all-linear
305
+ exclude_modules: null
306
+ lora_adapter_path: null
307
+ use_liger: false
308
+ use_fused_kernels: false
309
+ fused_kernel_options:
310
+ impl_backend: torch
311
+ tiled_mlp:
312
+ enabled: false
313
+ num_shards: 4
314
+ mtp:
315
+ _target_: verl.workers.config.MtpConfig
316
+ enable: false
317
+ enable_train: false
318
+ enable_rollout: false
319
+ detach_encoder: false
320
+ mtp_loss_scaling_factor: 0.1
321
+ speculative_algorithm: EAGLE
322
+ speculative_num_steps: 3
323
+ speculative_eagle_topk: 1
324
+ speculative_num_draft_tokens: 4
325
+ method: mtp
326
+ num_speculative_tokens: 1
327
+ hybrid_engine: true
328
+ nccl_timeout: 600
329
+ data:
330
+ tokenizer: null
331
+ use_shm: false
332
+ train_files: /home/mshahidul/data/gsm8k/train.parquet
333
+ val_files: /home/mshahidul/data/gsm8k/test.parquet
334
+ train_max_samples: -1
335
+ val_max_samples: -1
336
+ prompt_key: prompt
337
+ reward_fn_key: data_source
338
+ max_prompt_length: 512
339
+ max_response_length: 1024
340
+ train_batch_size: 1024
341
+ val_batch_size: null
342
+ tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path,
343
+ null}
344
+ return_raw_input_ids: false
345
+ return_raw_chat: true
346
+ return_full_prompt: false
347
+ shuffle: true
348
+ seed: null
349
+ dataloader_num_workers: 8
350
+ image_patch_size: 14
351
+ validation_shuffle: false
352
+ filter_overlong_prompts: true
353
+ filter_overlong_prompts_workers: 1
354
+ truncation: error
355
+ image_key: images
356
+ video_key: videos
357
+ trust_remote_code: false
358
+ custom_cls:
359
+ path: null
360
+ name: null
361
+ return_multi_modal_inputs: true
362
+ sampler:
363
+ class_path: null
364
+ class_name: null
365
+ datagen:
366
+ path: null
367
+ name: null
368
+ apply_chat_template_kwargs: {}
369
+ reward_manager:
370
+ _target_: verl.trainer.config.config.RewardManagerConfig
371
+ source: register
372
+ name: ${oc.select:reward_model.reward_manager,naive}
373
+ module:
374
+ _target_: verl.trainer.config.config.ModuleConfig
375
+ path: null
376
+ name: custom_reward_manager
377
+ critic:
378
+ optim:
379
+ _target_: verl.workers.config.FSDPOptimizerConfig
380
+ optimizer: AdamW
381
+ optimizer_impl: torch.optim
382
+ lr: 1.0e-05
383
+ lr_warmup_steps_ratio: 0.0
384
+ total_training_steps: -1
385
+ weight_decay: 0.01
386
+ lr_warmup_steps: -1
387
+ betas:
388
+ - 0.9
389
+ - 0.999
390
+ clip_grad: 1.0
391
+ min_lr_ratio: 0.0
392
+ num_cycles: 0.5
393
+ lr_scheduler_type: constant
394
+ warmup_style: null
395
+ override_optimizer_config: null
396
+ model:
397
+ fsdp_config:
398
+ _target_: verl.workers.config.FSDPEngineConfig
399
+ wrap_policy:
400
+ min_num_params: 0
401
+ param_offload: false
402
+ optimizer_offload: false
403
+ offload_policy: false
404
+ reshard_after_forward: true
405
+ fsdp_size: -1
406
+ forward_prefetch: false
407
+ model_dtype: fp32
408
+ use_orig_params: false
409
+ seed: 42
410
+ full_determinism: false
411
+ ulysses_sequence_parallel_size: 1
412
+ entropy_from_logits_with_chunking: false
413
+ use_torch_compile: true
414
+ entropy_checkpointing: false
415
+ forward_only: false
416
+ strategy: fsdp
417
+ dtype: bfloat16
418
+ path: ~/models/deepseek-llm-7b-chat
419
+ tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
420
+ override_config: {}
421
+ external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
422
+ trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
423
+ _target_: verl.workers.config.FSDPCriticModelCfg
424
+ use_shm: false
425
+ enable_gradient_checkpointing: true
426
+ enable_activation_offload: false
427
+ use_remove_padding: false
428
+ lora_rank: 0
429
+ lora_alpha: 16
430
+ target_modules: all-linear
431
+ tiled_mlp:
432
+ enabled: false
433
+ num_shards: 4
434
+ _target_: verl.workers.config.FSDPCriticConfig
435
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
436
+ strategy: fsdp
437
+ enable: null
438
+ ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
439
+ ppo_micro_batch_size: null
440
+ ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
441
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
442
+ ppo_max_token_len_per_gpu: 32768
443
+ forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
444
+ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
445
+ shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
446
+ data_loader_seed: 42
447
+ cliprange_value: 0.5
448
+ loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
449
+ checkpoint:
450
+ _target_: verl.trainer.config.CheckpointConfig
451
+ save_contents:
452
+ - model
453
+ - optimizer
454
+ - extra
455
+ load_contents: ${.save_contents}
456
+ async_save: false
457
+ profiler:
458
+ _target_: verl.utils.profiler.ProfilerConfig
459
+ tool: ${oc.select:global_profiler.tool,null}
460
+ enable: false
461
+ all_ranks: false
462
+ ranks: []
463
+ save_path: ${oc.select:global_profiler.save_path,null}
464
+ tool_config:
465
+ nsys:
466
+ _target_: verl.utils.profiler.config.NsightToolConfig
467
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
468
+ npu:
469
+ _target_: verl.utils.profiler.config.NPUToolConfig
470
+ contents: []
471
+ level: level0
472
+ analysis: true
473
+ discrete: false
474
+ torch:
475
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
476
+ contents: []
477
+ discrete: false
478
+ torch_memory:
479
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
480
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
481
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
482
+ forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
483
+ forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
484
+ ulysses_sequence_parallel_size: 1
485
+ grad_clip: 1.0
486
+ reward_model:
487
+ enable: false
488
+ enable_resource_pool: false
489
+ n_gpus_per_node: 8
490
+ nnodes: 0
491
+ strategy: fsdp
492
+ model:
493
+ input_tokenizer: ${actor_rollout_ref.model.path}
494
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
495
+ external_lib: ${actor_rollout_ref.model.external_lib}
496
+ trust_remote_code: false
497
+ override_config: {}
498
+ use_shm: false
499
+ use_remove_padding: false
500
+ use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
501
+ fsdp_config:
502
+ _target_: verl.workers.config.FSDPEngineConfig
503
+ wrap_policy:
504
+ min_num_params: 0
505
+ param_offload: false
506
+ reshard_after_forward: true
507
+ fsdp_size: -1
508
+ forward_prefetch: false
509
+ micro_batch_size: null
510
+ micro_batch_size_per_gpu: null
511
+ max_length: null
512
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
513
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
514
+ reward_manager:
515
+ strategy: remote
516
+ custom_reward_function:
517
+ path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
518
+ reward_loop_source: register
519
+ reward_loop_module_path: null
520
+ reward_loop_class_name: null
521
+ launch_reward_fn_async: false
522
+ sandbox_fusion:
523
+ url: null
524
+ max_concurrent: 64
525
+ memory_limit_mb: 1024
526
+ profiler:
527
+ _target_: verl.utils.profiler.ProfilerConfig
528
+ tool: ${oc.select:global_profiler.tool,null}
529
+ enable: false
530
+ all_ranks: false
531
+ ranks: []
532
+ save_path: ${oc.select:global_profiler.save_path,null}
533
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
534
+ ulysses_sequence_parallel_size: 1
535
+ use_reward_loop: true
536
+ num_workers: 1
537
+ rollout:
538
+ _target_: verl.workers.config.RolloutConfig
539
+ name: ???
540
+ dtype: bfloat16
541
+ gpu_memory_utilization: 0.5
542
+ enforce_eager: true
543
+ cudagraph_capture_sizes: null
544
+ free_cache_engine: true
545
+ data_parallel_size: 1
546
+ expert_parallel_size: 1
547
+ tensor_model_parallel_size: 2
548
+ max_num_batched_tokens: 8192
549
+ max_model_len: null
550
+ max_num_seqs: 1024
551
+ load_format: auto
552
+ engine_kwargs: {}
553
+ limit_images: null
554
+ enable_chunked_prefill: true
555
+ enable_prefix_caching: true
556
+ disable_log_stats: true
557
+ skip_tokenizer_init: false
558
+ prompt_length: 2048
559
+ response_length: 2048
560
+ algorithm:
561
+ rollout_correction:
562
+ rollout_is: null
563
+ rollout_is_threshold: 2.0
564
+ rollout_rs: null
565
+ rollout_rs_threshold: null
566
+ bypass_mode: false
567
+ loss_type: ppo_clip
568
+ rollout_is_batch_normalize: false
569
+ _target_: verl.trainer.config.AlgoConfig
570
+ gamma: 1.0
571
+ lam: 1.0
572
+ adv_estimator: grpo
573
+ norm_adv_by_std_in_grpo: true
574
+ use_kl_in_reward: false
575
+ kl_penalty: kl
576
+ kl_ctrl:
577
+ _target_: verl.trainer.config.KLControlConfig
578
+ type: fixed
579
+ kl_coef: 0.001
580
+ horizon: 10000
581
+ target_kl: 0.1
582
+ use_pf_ppo: false
583
+ pf_ppo:
584
+ reweight_method: pow
585
+ weight_pow: 2.0
586
+ custom_reward_function:
587
+ path: null
588
+ name: compute_score
589
+ trainer:
590
+ balance_batch: true
591
+ total_epochs: 15
592
+ total_training_steps: null
593
+ project_name: verl
594
+ experiment_name: qwen3-4b-instruct-optimized
595
+ logger:
596
+ - console
597
+ - wandb
598
+ log_val_generations: 0
599
+ rollout_data_dir: null
600
+ validation_data_dir: null
601
+ nnodes: 1
602
+ n_gpus_per_node: 2
603
+ save_freq: 20
604
+ esi_redundant_time: 0
605
+ resume_mode: auto
606
+ resume_from_path: null
607
+ val_before_train: true
608
+ val_only: false
609
+ test_freq: 5
610
+ critic_warmup: 0
611
+ default_hdfs_dir: null
612
+ del_local_ckpt_after_load: false
613
+ default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
614
+ max_actor_ckpt_to_keep: null
615
+ max_critic_ckpt_to_keep: null
616
+ ray_wait_register_center_timeout: 300
617
+ device: cuda
618
+ use_legacy_worker_impl: auto
619
+ global_profiler:
620
+ _target_: verl.utils.profiler.ProfilerConfig
621
+ tool: null
622
+ steps: null
623
+ profile_continuous_steps: false
624
+ save_path: outputs/profile
625
+ global_tool_config:
626
+ nsys:
627
+ _target_: verl.utils.profiler.config.NsightToolConfig
628
+ discrete: false
629
+ controller_nsight_options:
630
+ trace: cuda,nvtx,cublas,ucx
631
+ cuda-memory-usage: 'true'
632
+ cuda-graph-trace: graph
633
+ worker_nsight_options:
634
+ trace: cuda,nvtx,cublas,ucx
635
+ cuda-memory-usage: 'true'
636
+ cuda-graph-trace: graph
637
+ capture-range: cudaProfilerApi
638
+ capture-range-end: null
639
+ kill: none
640
+ torch_memory:
641
+ trace_alloc_max_entries: 100000
642
+ stack_depth: 32
643
+ context: all
644
+ stacks: all
645
+ kw_args: {}
646
+ transfer_queue:
647
+ enable: false
648
+ ray_kwargs:
649
+ ray_init:
650
+ num_cpus: null
651
+ timeline_json_file: null
code/RL_model/verl/verl_train/outputs/2026-02-01/12-06-58/.hydra/hydra.yaml ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task:
115
+ - algorithm.adv_estimator=grpo
116
+ - data.train_files=/home/mshahidul/data/gsm8k/train.parquet
117
+ - data.val_files=/home/mshahidul/data/gsm8k/test.parquet
118
+ - +reward_model.reward_manager.strategy=remote
119
+ - +reward_model.reward_manager.custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
120
+ - data.train_batch_size=1024
121
+ - data.max_prompt_length=512
122
+ - data.max_response_length=1024
123
+ - data.filter_overlong_prompts=True
124
+ - data.truncation=error
125
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
126
+ - actor_rollout_ref.actor.optim.lr=1e-6
127
+ - actor_rollout_ref.model.use_remove_padding=True
128
+ - actor_rollout_ref.actor.ppo_mini_batch_size=512
129
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=64
130
+ - actor_rollout_ref.actor.use_kl_loss=True
131
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
132
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
133
+ - actor_rollout_ref.actor.entropy_coeff=0
134
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
135
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
136
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
137
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64
138
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
139
+ - actor_rollout_ref.rollout.name=vllm
140
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.7
141
+ - actor_rollout_ref.rollout.n=5
142
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=64
143
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
144
+ - algorithm.use_kl_in_reward=False
145
+ - trainer.critic_warmup=0
146
+ - trainer.logger=["console","wandb"]
147
+ - trainer.project_name=verl
148
+ - trainer.experiment_name=qwen3-4b-instruct-optimized
149
+ - trainer.n_gpus_per_node=2
150
+ - trainer.nnodes=1
151
+ - trainer.save_freq=20
152
+ - trainer.test_freq=5
153
+ - trainer.total_epochs=15
154
+ job:
155
+ name: main_ppo
156
+ chdir: null
157
+ override_dirname: +reward_model.reward_manager.custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,+reward_model.reward_manager.strategy=remote,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=64,actor_rollout_ref.actor.ppo_mini_batch_size=512,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=64,actor_rollout_ref.rollout.gpu_memory_utilization=0.7,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64,actor_rollout_ref.rollout.n=5,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,data.filter_overlong_prompts=True,data.max_prompt_length=512,data.max_response_length=1024,data.train_batch_size=1024,data.train_files=/home/mshahidul/data/gsm8k/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/data/gsm8k/test.parquet,trainer.critic_warmup=0,trainer.experiment_name=qwen3-4b-instruct-optimized,trainer.logger=["console","wandb"],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=verl,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15
158
+ id: ???
159
+ num: ???
160
+ config_name: ppo_trainer
161
+ env_set: {}
162
+ env_copy: []
163
+ config:
164
+ override_dirname:
165
+ kv_sep: '='
166
+ item_sep: ','
167
+ exclude_keys: []
168
+ runtime:
169
+ version: 1.3.2
170
+ version_base: '1.3'
171
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train
172
+ config_sources:
173
+ - path: hydra.conf
174
+ schema: pkg
175
+ provider: hydra
176
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config
177
+ schema: file
178
+ provider: main
179
+ - path: ''
180
+ schema: structured
181
+ provider: schema
182
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-01/12-06-58
183
+ choices:
184
+ algorithm@algorithm.rollout_correction: rollout_correction
185
+ reward_model: dp_reward_loop
186
+ critic: dp_critic
187
+ critic/../engine@critic.model.fsdp_config: fsdp
188
+ critic/../optim@critic.optim: fsdp
189
+ model@actor_rollout_ref.model: hf_model
190
+ rollout@actor_rollout_ref.rollout: rollout
191
+ ref@actor_rollout_ref.ref: dp_ref
192
+ ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp
193
+ data: legacy_data
194
+ actor@actor_rollout_ref.actor: dp_actor
195
+ actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp
196
+ actor/../optim@actor_rollout_ref.actor.optim: fsdp
197
+ hydra/env: default
198
+ hydra/callbacks: null
199
+ hydra/job_logging: default
200
+ hydra/hydra_logging: default
201
+ hydra/hydra_help: default
202
+ hydra/help: default
203
+ hydra/sweeper: basic
204
+ hydra/launcher: basic
205
+ hydra/output: default
206
+ verbose: false
code/RL_model/verl/verl_train/outputs/2026-02-01/12-06-58/.hydra/overrides.yaml ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - algorithm.adv_estimator=grpo
2
+ - data.train_files=/home/mshahidul/data/gsm8k/train.parquet
3
+ - data.val_files=/home/mshahidul/data/gsm8k/test.parquet
4
+ - +reward_model.reward_manager.strategy=remote
5
+ - +reward_model.reward_manager.custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
6
+ - data.train_batch_size=1024
7
+ - data.max_prompt_length=512
8
+ - data.max_response_length=1024
9
+ - data.filter_overlong_prompts=True
10
+ - data.truncation=error
11
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
12
+ - actor_rollout_ref.actor.optim.lr=1e-6
13
+ - actor_rollout_ref.model.use_remove_padding=True
14
+ - actor_rollout_ref.actor.ppo_mini_batch_size=512
15
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=64
16
+ - actor_rollout_ref.actor.use_kl_loss=True
17
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
18
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
19
+ - actor_rollout_ref.actor.entropy_coeff=0
20
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
21
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
22
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
23
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64
24
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
25
+ - actor_rollout_ref.rollout.name=vllm
26
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.7
27
+ - actor_rollout_ref.rollout.n=5
28
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=64
29
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
30
+ - algorithm.use_kl_in_reward=False
31
+ - trainer.critic_warmup=0
32
+ - trainer.logger=["console","wandb"]
33
+ - trainer.project_name=verl
34
+ - trainer.experiment_name=qwen3-4b-instruct-optimized
35
+ - trainer.n_gpus_per_node=2
36
+ - trainer.nnodes=1
37
+ - trainer.save_freq=20
38
+ - trainer.test_freq=5
39
+ - trainer.total_epochs=15
code/RL_model/verl/verl_train/outputs/2026-02-01/21-08-13/.hydra/config.yaml ADDED
@@ -0,0 +1,648 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ actor_rollout_ref:
2
+ actor:
3
+ optim:
4
+ _target_: verl.workers.config.FSDPOptimizerConfig
5
+ optimizer: AdamW
6
+ optimizer_impl: torch.optim
7
+ lr: 1.0e-06
8
+ lr_warmup_steps_ratio: 0.0
9
+ total_training_steps: -1
10
+ weight_decay: 0.01
11
+ lr_warmup_steps: -1
12
+ betas:
13
+ - 0.9
14
+ - 0.999
15
+ clip_grad: 1.0
16
+ min_lr_ratio: 0.0
17
+ num_cycles: 0.5
18
+ lr_scheduler_type: constant
19
+ warmup_style: null
20
+ override_optimizer_config: null
21
+ fsdp_config:
22
+ _target_: verl.workers.config.FSDPEngineConfig
23
+ wrap_policy:
24
+ min_num_params: 0
25
+ param_offload: false
26
+ optimizer_offload: false
27
+ offload_policy: false
28
+ reshard_after_forward: true
29
+ fsdp_size: -1
30
+ forward_prefetch: false
31
+ model_dtype: fp32
32
+ use_orig_params: false
33
+ seed: 42
34
+ full_determinism: false
35
+ ulysses_sequence_parallel_size: 1
36
+ entropy_from_logits_with_chunking: false
37
+ use_torch_compile: true
38
+ entropy_checkpointing: false
39
+ forward_only: false
40
+ strategy: fsdp
41
+ dtype: bfloat16
42
+ _target_: verl.workers.config.FSDPActorConfig
43
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
44
+ strategy: fsdp
45
+ ppo_mini_batch_size: 512
46
+ ppo_micro_batch_size: null
47
+ ppo_micro_batch_size_per_gpu: 64
48
+ use_dynamic_bsz: false
49
+ ppo_max_token_len_per_gpu: 16384
50
+ clip_ratio: 0.2
51
+ clip_ratio_low: 0.2
52
+ clip_ratio_high: 0.2
53
+ tau_pos: 1.0
54
+ tau_neg: 1.05
55
+ freeze_vision_tower: false
56
+ policy_loss:
57
+ _target_: verl.workers.config.PolicyLossConfig
58
+ loss_mode: vanilla
59
+ clip_cov_ratio: 0.0002
60
+ clip_cov_lb: 1.0
61
+ clip_cov_ub: 5.0
62
+ kl_cov_ratio: 0.0002
63
+ ppo_kl_coef: 0.1
64
+ clip_ratio_c: 3.0
65
+ loss_agg_mode: token-mean
66
+ loss_scale_factor: null
67
+ entropy_coeff: 0
68
+ calculate_entropy: false
69
+ use_kl_loss: true
70
+ use_prefix_grouper: false
71
+ use_torch_compile: true
72
+ kl_loss_coef: 0.001
73
+ kl_loss_type: low_var_kl
74
+ ppo_epochs: 1
75
+ shuffle: false
76
+ data_loader_seed: 42
77
+ checkpoint:
78
+ _target_: verl.trainer.config.CheckpointConfig
79
+ save_contents:
80
+ - model
81
+ - optimizer
82
+ - extra
83
+ load_contents: ${.save_contents}
84
+ async_save: false
85
+ use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
86
+ profiler:
87
+ _target_: verl.utils.profiler.ProfilerConfig
88
+ tool: ${oc.select:global_profiler.tool,null}
89
+ enable: false
90
+ all_ranks: false
91
+ ranks: []
92
+ save_path: ${oc.select:global_profiler.save_path,null}
93
+ tool_config:
94
+ nsys:
95
+ _target_: verl.utils.profiler.config.NsightToolConfig
96
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
97
+ npu:
98
+ _target_: verl.utils.profiler.config.NPUToolConfig
99
+ contents: []
100
+ level: level0
101
+ analysis: true
102
+ discrete: false
103
+ torch:
104
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
105
+ contents: []
106
+ discrete: false
107
+ torch_memory:
108
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
109
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
110
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
111
+ router_replay:
112
+ _target_: verl.workers.config.RouterReplayConfig
113
+ mode: disabled
114
+ record_file: null
115
+ replay_file: null
116
+ grad_clip: 1.0
117
+ ulysses_sequence_parallel_size: 1
118
+ entropy_from_logits_with_chunking: false
119
+ entropy_checkpointing: false
120
+ use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
121
+ calculate_sum_pi_squared: false
122
+ sum_pi_squared_checkpointing: false
123
+ ref:
124
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
125
+ strategy: ${actor_rollout_ref.actor.strategy}
126
+ use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
127
+ log_prob_micro_batch_size: null
128
+ log_prob_micro_batch_size_per_gpu: 64
129
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
130
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
131
+ profiler:
132
+ _target_: verl.utils.profiler.ProfilerConfig
133
+ tool: ${oc.select:global_profiler.tool,null}
134
+ enable: false
135
+ all_ranks: false
136
+ ranks: []
137
+ save_path: ${oc.select:global_profiler.save_path,null}
138
+ tool_config:
139
+ nsys:
140
+ _target_: verl.utils.profiler.config.NsightToolConfig
141
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
142
+ npu:
143
+ _target_: verl.utils.profiler.config.NPUToolConfig
144
+ contents: []
145
+ level: level0
146
+ analysis: true
147
+ discrete: false
148
+ torch:
149
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
150
+ contents: []
151
+ discrete: false
152
+ torch_memory:
153
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
154
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
155
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
156
+ router_replay:
157
+ _target_: verl.workers.config.RouterReplayConfig
158
+ mode: disabled
159
+ record_file: null
160
+ replay_file: null
161
+ fsdp_config:
162
+ _target_: verl.workers.config.FSDPEngineConfig
163
+ wrap_policy:
164
+ min_num_params: 0
165
+ param_offload: false
166
+ optimizer_offload: false
167
+ offload_policy: false
168
+ reshard_after_forward: true
169
+ fsdp_size: -1
170
+ forward_prefetch: false
171
+ model_dtype: fp32
172
+ use_orig_params: false
173
+ seed: 42
174
+ full_determinism: false
175
+ ulysses_sequence_parallel_size: 1
176
+ entropy_from_logits_with_chunking: false
177
+ use_torch_compile: true
178
+ entropy_checkpointing: false
179
+ forward_only: true
180
+ strategy: fsdp
181
+ dtype: bfloat16
182
+ _target_: verl.workers.config.FSDPActorConfig
183
+ ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
184
+ entropy_from_logits_with_chunking: false
185
+ entropy_checkpointing: false
186
+ rollout:
187
+ _target_: verl.workers.config.RolloutConfig
188
+ name: vllm
189
+ mode: async
190
+ temperature: 1.0
191
+ top_k: -1
192
+ top_p: 1
193
+ prompt_length: ${oc.select:data.max_prompt_length,512}
194
+ response_length: ${oc.select:data.max_response_length,512}
195
+ dtype: bfloat16
196
+ gpu_memory_utilization: 0.7
197
+ ignore_eos: false
198
+ enforce_eager: false
199
+ cudagraph_capture_sizes: null
200
+ free_cache_engine: true
201
+ tensor_model_parallel_size: 1
202
+ data_parallel_size: 1
203
+ expert_parallel_size: 1
204
+ pipeline_model_parallel_size: 1
205
+ max_num_batched_tokens: 8192
206
+ max_model_len: null
207
+ max_num_seqs: 1024
208
+ enable_chunked_prefill: true
209
+ enable_prefix_caching: true
210
+ logprobs_mode: processed_logprobs
211
+ scheduling_policy: fcfs
212
+ load_format: dummy
213
+ log_prob_micro_batch_size: null
214
+ log_prob_micro_batch_size_per_gpu: 64
215
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
216
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
217
+ disable_log_stats: true
218
+ do_sample: true
219
+ 'n': 5
220
+ over_sample_rate: 0
221
+ multi_stage_wake_up: false
222
+ engine_kwargs:
223
+ vllm: {}
224
+ sglang: {}
225
+ trtllm: {}
226
+ val_kwargs:
227
+ _target_: verl.workers.config.SamplingConfig
228
+ top_k: -1
229
+ top_p: 1.0
230
+ temperature: 0
231
+ 'n': 1
232
+ do_sample: false
233
+ multi_turn:
234
+ _target_: verl.workers.config.MultiTurnConfig
235
+ enable: false
236
+ max_assistant_turns: null
237
+ tool_config_path: null
238
+ max_user_turns: null
239
+ max_parallel_calls: 1
240
+ max_tool_response_length: 256
241
+ tool_response_truncate_side: middle
242
+ interaction_config_path: null
243
+ use_inference_chat_template: false
244
+ tokenization_sanity_check_mode: strict
245
+ format: hermes
246
+ num_repeat_rollouts: null
247
+ calculate_log_probs: false
248
+ agent:
249
+ _target_: verl.workers.config.AgentLoopConfig
250
+ num_workers: 8
251
+ default_agent_loop: single_turn_agent
252
+ agent_loop_config_path: null
253
+ custom_async_server:
254
+ _target_: verl.workers.config.CustomAsyncServerConfig
255
+ path: null
256
+ name: null
257
+ checkpoint_engine:
258
+ _target_: verl.workers.config.CheckpointEngineConfig
259
+ backend: naive
260
+ update_weights_bucket_megabytes: 2048
261
+ engine_kwargs: {}
262
+ trace:
263
+ _target_: verl.workers.config.TraceConfig
264
+ backend: null
265
+ token2text: false
266
+ max_samples_per_step_per_worker: null
267
+ skip_rollout: false
268
+ skip_dump_dir: /tmp/rollout_dump
269
+ skip_tokenizer_init: true
270
+ enable_rollout_routing_replay: false
271
+ profiler:
272
+ _target_: verl.utils.profiler.ProfilerConfig
273
+ tool: ${oc.select:global_profiler.tool,null}
274
+ enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
275
+ all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
276
+ ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
277
+ save_path: ${oc.select:global_profiler.save_path,null}
278
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
279
+ prometheus:
280
+ _target_: verl.workers.config.PrometheusConfig
281
+ enable: false
282
+ port: 9090
283
+ file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml
284
+ served_model_name: ${oc.select:actor_rollout_ref.model.path,null}
285
+ quantization: null
286
+ quantization_config_file: null
287
+ mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
288
+ layered_summon: false
289
+ model:
290
+ _target_: verl.workers.config.HFModelConfig
291
+ path: Qwen/Qwen3-4B-Instruct-2507
292
+ hf_config_path: null
293
+ tokenizer_path: null
294
+ use_shm: false
295
+ trust_remote_code: false
296
+ custom_chat_template: null
297
+ external_lib: null
298
+ override_config: {}
299
+ enable_gradient_checkpointing: true
300
+ enable_activation_offload: false
301
+ use_remove_padding: true
302
+ lora_rank: 0
303
+ lora_alpha: 16
304
+ target_modules: all-linear
305
+ exclude_modules: null
306
+ lora_adapter_path: null
307
+ use_liger: false
308
+ use_fused_kernels: false
309
+ fused_kernel_options:
310
+ impl_backend: torch
311
+ tiled_mlp:
312
+ enabled: false
313
+ num_shards: 4
314
+ mtp:
315
+ _target_: verl.workers.config.MtpConfig
316
+ enable: false
317
+ enable_train: false
318
+ enable_rollout: false
319
+ detach_encoder: false
320
+ mtp_loss_scaling_factor: 0.1
321
+ speculative_algorithm: EAGLE
322
+ speculative_num_steps: 3
323
+ speculative_eagle_topk: 1
324
+ speculative_num_draft_tokens: 4
325
+ method: mtp
326
+ num_speculative_tokens: 1
327
+ hybrid_engine: true
328
+ nccl_timeout: 600
329
+ data:
330
+ tokenizer: null
331
+ use_shm: false
332
+ train_files: /home/mshahidul/data/gsm8k/train.parquet
333
+ val_files: /home/mshahidul/data/gsm8k/test.parquet
334
+ train_max_samples: -1
335
+ val_max_samples: -1
336
+ prompt_key: prompt
337
+ reward_fn_key: data_source
338
+ max_prompt_length: 512
339
+ max_response_length: 1024
340
+ train_batch_size: 1024
341
+ val_batch_size: null
342
+ tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path,
343
+ null}
344
+ return_raw_input_ids: false
345
+ return_raw_chat: true
346
+ return_full_prompt: false
347
+ shuffle: true
348
+ seed: null
349
+ dataloader_num_workers: 8
350
+ image_patch_size: 14
351
+ validation_shuffle: false
352
+ filter_overlong_prompts: true
353
+ filter_overlong_prompts_workers: 1
354
+ truncation: error
355
+ image_key: images
356
+ video_key: videos
357
+ trust_remote_code: false
358
+ custom_cls:
359
+ path: null
360
+ name: null
361
+ return_multi_modal_inputs: true
362
+ sampler:
363
+ class_path: null
364
+ class_name: null
365
+ datagen:
366
+ path: null
367
+ name: null
368
+ apply_chat_template_kwargs: {}
369
+ reward_manager:
370
+ _target_: verl.trainer.config.config.RewardManagerConfig
371
+ source: register
372
+ name: ${oc.select:reward_model.reward_manager,naive}
373
+ module:
374
+ _target_: verl.trainer.config.config.ModuleConfig
375
+ path: null
376
+ name: custom_reward_manager
377
+ critic:
378
+ optim:
379
+ _target_: verl.workers.config.FSDPOptimizerConfig
380
+ optimizer: AdamW
381
+ optimizer_impl: torch.optim
382
+ lr: 1.0e-05
383
+ lr_warmup_steps_ratio: 0.0
384
+ total_training_steps: -1
385
+ weight_decay: 0.01
386
+ lr_warmup_steps: -1
387
+ betas:
388
+ - 0.9
389
+ - 0.999
390
+ clip_grad: 1.0
391
+ min_lr_ratio: 0.0
392
+ num_cycles: 0.5
393
+ lr_scheduler_type: constant
394
+ warmup_style: null
395
+ override_optimizer_config: null
396
+ model:
397
+ fsdp_config:
398
+ _target_: verl.workers.config.FSDPEngineConfig
399
+ wrap_policy:
400
+ min_num_params: 0
401
+ param_offload: false
402
+ optimizer_offload: false
403
+ offload_policy: false
404
+ reshard_after_forward: true
405
+ fsdp_size: -1
406
+ forward_prefetch: false
407
+ model_dtype: fp32
408
+ use_orig_params: false
409
+ seed: 42
410
+ full_determinism: false
411
+ ulysses_sequence_parallel_size: 1
412
+ entropy_from_logits_with_chunking: false
413
+ use_torch_compile: true
414
+ entropy_checkpointing: false
415
+ forward_only: false
416
+ strategy: fsdp
417
+ dtype: bfloat16
418
+ path: ~/models/deepseek-llm-7b-chat
419
+ tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
420
+ override_config: {}
421
+ external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
422
+ trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
423
+ _target_: verl.workers.config.FSDPCriticModelCfg
424
+ use_shm: false
425
+ enable_gradient_checkpointing: true
426
+ enable_activation_offload: false
427
+ use_remove_padding: false
428
+ lora_rank: 0
429
+ lora_alpha: 16
430
+ target_modules: all-linear
431
+ tiled_mlp:
432
+ enabled: false
433
+ num_shards: 4
434
+ _target_: verl.workers.config.FSDPCriticConfig
435
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
436
+ strategy: fsdp
437
+ enable: null
438
+ ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
439
+ ppo_micro_batch_size: null
440
+ ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
441
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
442
+ ppo_max_token_len_per_gpu: 32768
443
+ forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
444
+ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
445
+ shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
446
+ data_loader_seed: 42
447
+ cliprange_value: 0.5
448
+ loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
449
+ checkpoint:
450
+ _target_: verl.trainer.config.CheckpointConfig
451
+ save_contents:
452
+ - model
453
+ - optimizer
454
+ - extra
455
+ load_contents: ${.save_contents}
456
+ async_save: false
457
+ profiler:
458
+ _target_: verl.utils.profiler.ProfilerConfig
459
+ tool: ${oc.select:global_profiler.tool,null}
460
+ enable: false
461
+ all_ranks: false
462
+ ranks: []
463
+ save_path: ${oc.select:global_profiler.save_path,null}
464
+ tool_config:
465
+ nsys:
466
+ _target_: verl.utils.profiler.config.NsightToolConfig
467
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
468
+ npu:
469
+ _target_: verl.utils.profiler.config.NPUToolConfig
470
+ contents: []
471
+ level: level0
472
+ analysis: true
473
+ discrete: false
474
+ torch:
475
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
476
+ contents: []
477
+ discrete: false
478
+ torch_memory:
479
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
480
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
481
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
482
+ forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
483
+ forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
484
+ ulysses_sequence_parallel_size: 1
485
+ grad_clip: 1.0
486
+ reward_model:
487
+ enable: false
488
+ enable_resource_pool: false
489
+ n_gpus_per_node: 8
490
+ nnodes: 0
491
+ strategy: fsdp
492
+ model:
493
+ input_tokenizer: ${actor_rollout_ref.model.path}
494
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
495
+ external_lib: ${actor_rollout_ref.model.external_lib}
496
+ trust_remote_code: false
497
+ override_config: {}
498
+ use_shm: false
499
+ use_remove_padding: false
500
+ use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
501
+ fsdp_config:
502
+ _target_: verl.workers.config.FSDPEngineConfig
503
+ wrap_policy:
504
+ min_num_params: 0
505
+ param_offload: false
506
+ reshard_after_forward: true
507
+ fsdp_size: -1
508
+ forward_prefetch: false
509
+ micro_batch_size: null
510
+ micro_batch_size_per_gpu: null
511
+ max_length: null
512
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
513
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
514
+ reward_manager: naive
515
+ reward_loop_source: register
516
+ reward_loop_module_path: null
517
+ reward_loop_class_name: null
518
+ launch_reward_fn_async: false
519
+ sandbox_fusion:
520
+ url: null
521
+ max_concurrent: 64
522
+ memory_limit_mb: 1024
523
+ profiler:
524
+ _target_: verl.utils.profiler.ProfilerConfig
525
+ tool: ${oc.select:global_profiler.tool,null}
526
+ enable: false
527
+ all_ranks: false
528
+ ranks: []
529
+ save_path: ${oc.select:global_profiler.save_path,null}
530
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
531
+ ulysses_sequence_parallel_size: 1
532
+ use_reward_loop: true
533
+ num_workers: 1
534
+ rollout:
535
+ _target_: verl.workers.config.RolloutConfig
536
+ name: ???
537
+ dtype: bfloat16
538
+ gpu_memory_utilization: 0.5
539
+ enforce_eager: true
540
+ cudagraph_capture_sizes: null
541
+ free_cache_engine: true
542
+ data_parallel_size: 1
543
+ expert_parallel_size: 1
544
+ tensor_model_parallel_size: 2
545
+ max_num_batched_tokens: 8192
546
+ max_model_len: null
547
+ max_num_seqs: 1024
548
+ load_format: auto
549
+ engine_kwargs: {}
550
+ limit_images: null
551
+ enable_chunked_prefill: true
552
+ enable_prefix_caching: true
553
+ disable_log_stats: true
554
+ skip_tokenizer_init: false
555
+ prompt_length: 2048
556
+ response_length: 2048
557
+ algorithm:
558
+ rollout_correction:
559
+ rollout_is: null
560
+ rollout_is_threshold: 2.0
561
+ rollout_rs: null
562
+ rollout_rs_threshold: null
563
+ bypass_mode: false
564
+ loss_type: ppo_clip
565
+ rollout_is_batch_normalize: false
566
+ _target_: verl.trainer.config.AlgoConfig
567
+ gamma: 1.0
568
+ lam: 1.0
569
+ adv_estimator: grpo
570
+ norm_adv_by_std_in_grpo: true
571
+ use_kl_in_reward: false
572
+ kl_penalty: kl
573
+ kl_ctrl:
574
+ _target_: verl.trainer.config.KLControlConfig
575
+ type: fixed
576
+ kl_coef: 0.001
577
+ horizon: 10000
578
+ target_kl: 0.1
579
+ use_pf_ppo: false
580
+ pf_ppo:
581
+ reweight_method: pow
582
+ weight_pow: 2.0
583
+ custom_reward_function:
584
+ path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
585
+ name: compute_score
586
+ trainer:
587
+ balance_batch: true
588
+ total_epochs: 15
589
+ total_training_steps: null
590
+ project_name: readctrl-verl
591
+ experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs
592
+ logger:
593
+ - console
594
+ - wandb
595
+ log_val_generations: 0
596
+ rollout_data_dir: null
597
+ validation_data_dir: null
598
+ nnodes: 1
599
+ n_gpus_per_node: 2
600
+ save_freq: 20
601
+ esi_redundant_time: 0
602
+ resume_mode: auto
603
+ resume_from_path: null
604
+ val_before_train: true
605
+ val_only: false
606
+ test_freq: 5
607
+ critic_warmup: 0
608
+ default_hdfs_dir: null
609
+ del_local_ckpt_after_load: false
610
+ default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
611
+ max_actor_ckpt_to_keep: null
612
+ max_critic_ckpt_to_keep: null
613
+ ray_wait_register_center_timeout: 300
614
+ device: cuda
615
+ use_legacy_worker_impl: auto
616
+ global_profiler:
617
+ _target_: verl.utils.profiler.ProfilerConfig
618
+ tool: null
619
+ steps: null
620
+ profile_continuous_steps: false
621
+ save_path: outputs/profile
622
+ global_tool_config:
623
+ nsys:
624
+ _target_: verl.utils.profiler.config.NsightToolConfig
625
+ discrete: false
626
+ controller_nsight_options:
627
+ trace: cuda,nvtx,cublas,ucx
628
+ cuda-memory-usage: 'true'
629
+ cuda-graph-trace: graph
630
+ worker_nsight_options:
631
+ trace: cuda,nvtx,cublas,ucx
632
+ cuda-memory-usage: 'true'
633
+ cuda-graph-trace: graph
634
+ capture-range: cudaProfilerApi
635
+ capture-range-end: null
636
+ kill: none
637
+ torch_memory:
638
+ trace_alloc_max_entries: 100000
639
+ stack_depth: 32
640
+ context: all
641
+ stacks: all
642
+ kw_args: {}
643
+ transfer_queue:
644
+ enable: false
645
+ ray_kwargs:
646
+ ray_init:
647
+ num_cpus: null
648
+ timeline_json_file: null
code/RL_model/verl/verl_train/outputs/2026-02-01/21-08-13/.hydra/hydra.yaml ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task:
115
+ - algorithm.adv_estimator=grpo
116
+ - data.train_files=/home/mshahidul/data/gsm8k/train.parquet
117
+ - data.val_files=/home/mshahidul/data/gsm8k/test.parquet
118
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
119
+ - data.train_batch_size=1024
120
+ - data.max_prompt_length=512
121
+ - data.max_response_length=1024
122
+ - data.filter_overlong_prompts=True
123
+ - data.truncation=error
124
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
125
+ - actor_rollout_ref.actor.optim.lr=1e-6
126
+ - actor_rollout_ref.model.use_remove_padding=True
127
+ - actor_rollout_ref.actor.ppo_mini_batch_size=512
128
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=64
129
+ - actor_rollout_ref.actor.use_kl_loss=True
130
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
131
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
132
+ - actor_rollout_ref.actor.entropy_coeff=0
133
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
134
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
135
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
136
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64
137
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
138
+ - actor_rollout_ref.rollout.name=vllm
139
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.7
140
+ - actor_rollout_ref.rollout.n=5
141
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=64
142
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
143
+ - algorithm.use_kl_in_reward=False
144
+ - trainer.critic_warmup=0
145
+ - trainer.logger=["console","wandb"]
146
+ - trainer.project_name=readctrl-verl
147
+ - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
148
+ - trainer.n_gpus_per_node=2
149
+ - trainer.nnodes=1
150
+ - trainer.save_freq=20
151
+ - trainer.test_freq=5
152
+ - trainer.total_epochs=15
153
+ job:
154
+ name: main_ppo
155
+ chdir: null
156
+ override_dirname: actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=64,actor_rollout_ref.actor.ppo_mini_batch_size=512,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=64,actor_rollout_ref.rollout.gpu_memory_utilization=0.7,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64,actor_rollout_ref.rollout.n=5,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=512,data.max_response_length=1024,data.train_batch_size=1024,data.train_files=/home/mshahidul/data/gsm8k/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/data/gsm8k/test.parquet,trainer.critic_warmup=0,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15
157
+ id: ???
158
+ num: ???
159
+ config_name: ppo_trainer
160
+ env_set: {}
161
+ env_copy: []
162
+ config:
163
+ override_dirname:
164
+ kv_sep: '='
165
+ item_sep: ','
166
+ exclude_keys: []
167
+ runtime:
168
+ version: 1.3.2
169
+ version_base: '1.3'
170
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train
171
+ config_sources:
172
+ - path: hydra.conf
173
+ schema: pkg
174
+ provider: hydra
175
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config
176
+ schema: file
177
+ provider: main
178
+ - path: ''
179
+ schema: structured
180
+ provider: schema
181
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-01/21-08-13
182
+ choices:
183
+ algorithm@algorithm.rollout_correction: rollout_correction
184
+ reward_model: dp_reward_loop
185
+ critic: dp_critic
186
+ critic/../engine@critic.model.fsdp_config: fsdp
187
+ critic/../optim@critic.optim: fsdp
188
+ model@actor_rollout_ref.model: hf_model
189
+ rollout@actor_rollout_ref.rollout: rollout
190
+ ref@actor_rollout_ref.ref: dp_ref
191
+ ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp
192
+ data: legacy_data
193
+ actor@actor_rollout_ref.actor: dp_actor
194
+ actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp
195
+ actor/../optim@actor_rollout_ref.actor.optim: fsdp
196
+ hydra/env: default
197
+ hydra/callbacks: null
198
+ hydra/job_logging: default
199
+ hydra/hydra_logging: default
200
+ hydra/hydra_help: default
201
+ hydra/help: default
202
+ hydra/sweeper: basic
203
+ hydra/launcher: basic
204
+ hydra/output: default
205
+ verbose: false
code/RL_model/verl/verl_train/outputs/2026-02-01/21-08-13/.hydra/overrides.yaml ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - algorithm.adv_estimator=grpo
2
+ - data.train_files=/home/mshahidul/data/gsm8k/train.parquet
3
+ - data.val_files=/home/mshahidul/data/gsm8k/test.parquet
4
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
5
+ - data.train_batch_size=1024
6
+ - data.max_prompt_length=512
7
+ - data.max_response_length=1024
8
+ - data.filter_overlong_prompts=True
9
+ - data.truncation=error
10
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
11
+ - actor_rollout_ref.actor.optim.lr=1e-6
12
+ - actor_rollout_ref.model.use_remove_padding=True
13
+ - actor_rollout_ref.actor.ppo_mini_batch_size=512
14
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=64
15
+ - actor_rollout_ref.actor.use_kl_loss=True
16
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
17
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
18
+ - actor_rollout_ref.actor.entropy_coeff=0
19
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
20
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
21
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
22
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64
23
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
24
+ - actor_rollout_ref.rollout.name=vllm
25
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.7
26
+ - actor_rollout_ref.rollout.n=5
27
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=64
28
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
29
+ - algorithm.use_kl_in_reward=False
30
+ - trainer.critic_warmup=0
31
+ - trainer.logger=["console","wandb"]
32
+ - trainer.project_name=readctrl-verl
33
+ - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
34
+ - trainer.n_gpus_per_node=2
35
+ - trainer.nnodes=1
36
+ - trainer.save_freq=20
37
+ - trainer.test_freq=5
38
+ - trainer.total_epochs=15
code/RL_model/verl/verl_train/outputs/2026-02-01/21-10-05/.hydra/config.yaml ADDED
@@ -0,0 +1,648 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ actor_rollout_ref:
2
+ actor:
3
+ optim:
4
+ _target_: verl.workers.config.FSDPOptimizerConfig
5
+ optimizer: AdamW
6
+ optimizer_impl: torch.optim
7
+ lr: 1.0e-06
8
+ lr_warmup_steps_ratio: 0.0
9
+ total_training_steps: -1
10
+ weight_decay: 0.01
11
+ lr_warmup_steps: -1
12
+ betas:
13
+ - 0.9
14
+ - 0.999
15
+ clip_grad: 1.0
16
+ min_lr_ratio: 0.0
17
+ num_cycles: 0.5
18
+ lr_scheduler_type: constant
19
+ warmup_style: null
20
+ override_optimizer_config: null
21
+ fsdp_config:
22
+ _target_: verl.workers.config.FSDPEngineConfig
23
+ wrap_policy:
24
+ min_num_params: 0
25
+ param_offload: false
26
+ optimizer_offload: false
27
+ offload_policy: false
28
+ reshard_after_forward: true
29
+ fsdp_size: -1
30
+ forward_prefetch: false
31
+ model_dtype: fp32
32
+ use_orig_params: false
33
+ seed: 42
34
+ full_determinism: false
35
+ ulysses_sequence_parallel_size: 1
36
+ entropy_from_logits_with_chunking: false
37
+ use_torch_compile: true
38
+ entropy_checkpointing: false
39
+ forward_only: false
40
+ strategy: fsdp
41
+ dtype: bfloat16
42
+ _target_: verl.workers.config.FSDPActorConfig
43
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
44
+ strategy: fsdp
45
+ ppo_mini_batch_size: 512
46
+ ppo_micro_batch_size: null
47
+ ppo_micro_batch_size_per_gpu: 64
48
+ use_dynamic_bsz: false
49
+ ppo_max_token_len_per_gpu: 16384
50
+ clip_ratio: 0.2
51
+ clip_ratio_low: 0.2
52
+ clip_ratio_high: 0.2
53
+ tau_pos: 1.0
54
+ tau_neg: 1.05
55
+ freeze_vision_tower: false
56
+ policy_loss:
57
+ _target_: verl.workers.config.PolicyLossConfig
58
+ loss_mode: vanilla
59
+ clip_cov_ratio: 0.0002
60
+ clip_cov_lb: 1.0
61
+ clip_cov_ub: 5.0
62
+ kl_cov_ratio: 0.0002
63
+ ppo_kl_coef: 0.1
64
+ clip_ratio_c: 3.0
65
+ loss_agg_mode: token-mean
66
+ loss_scale_factor: null
67
+ entropy_coeff: 0
68
+ calculate_entropy: false
69
+ use_kl_loss: true
70
+ use_prefix_grouper: false
71
+ use_torch_compile: true
72
+ kl_loss_coef: 0.001
73
+ kl_loss_type: low_var_kl
74
+ ppo_epochs: 1
75
+ shuffle: false
76
+ data_loader_seed: 42
77
+ checkpoint:
78
+ _target_: verl.trainer.config.CheckpointConfig
79
+ save_contents:
80
+ - model
81
+ - optimizer
82
+ - extra
83
+ load_contents: ${.save_contents}
84
+ async_save: false
85
+ use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
86
+ profiler:
87
+ _target_: verl.utils.profiler.ProfilerConfig
88
+ tool: ${oc.select:global_profiler.tool,null}
89
+ enable: false
90
+ all_ranks: false
91
+ ranks: []
92
+ save_path: ${oc.select:global_profiler.save_path,null}
93
+ tool_config:
94
+ nsys:
95
+ _target_: verl.utils.profiler.config.NsightToolConfig
96
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
97
+ npu:
98
+ _target_: verl.utils.profiler.config.NPUToolConfig
99
+ contents: []
100
+ level: level0
101
+ analysis: true
102
+ discrete: false
103
+ torch:
104
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
105
+ contents: []
106
+ discrete: false
107
+ torch_memory:
108
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
109
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
110
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
111
+ router_replay:
112
+ _target_: verl.workers.config.RouterReplayConfig
113
+ mode: disabled
114
+ record_file: null
115
+ replay_file: null
116
+ grad_clip: 1.0
117
+ ulysses_sequence_parallel_size: 1
118
+ entropy_from_logits_with_chunking: false
119
+ entropy_checkpointing: false
120
+ use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
121
+ calculate_sum_pi_squared: false
122
+ sum_pi_squared_checkpointing: false
123
+ ref:
124
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
125
+ strategy: ${actor_rollout_ref.actor.strategy}
126
+ use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
127
+ log_prob_micro_batch_size: null
128
+ log_prob_micro_batch_size_per_gpu: 64
129
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
130
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
131
+ profiler:
132
+ _target_: verl.utils.profiler.ProfilerConfig
133
+ tool: ${oc.select:global_profiler.tool,null}
134
+ enable: false
135
+ all_ranks: false
136
+ ranks: []
137
+ save_path: ${oc.select:global_profiler.save_path,null}
138
+ tool_config:
139
+ nsys:
140
+ _target_: verl.utils.profiler.config.NsightToolConfig
141
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
142
+ npu:
143
+ _target_: verl.utils.profiler.config.NPUToolConfig
144
+ contents: []
145
+ level: level0
146
+ analysis: true
147
+ discrete: false
148
+ torch:
149
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
150
+ contents: []
151
+ discrete: false
152
+ torch_memory:
153
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
154
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
155
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
156
+ router_replay:
157
+ _target_: verl.workers.config.RouterReplayConfig
158
+ mode: disabled
159
+ record_file: null
160
+ replay_file: null
161
+ fsdp_config:
162
+ _target_: verl.workers.config.FSDPEngineConfig
163
+ wrap_policy:
164
+ min_num_params: 0
165
+ param_offload: false
166
+ optimizer_offload: false
167
+ offload_policy: false
168
+ reshard_after_forward: true
169
+ fsdp_size: -1
170
+ forward_prefetch: false
171
+ model_dtype: fp32
172
+ use_orig_params: false
173
+ seed: 42
174
+ full_determinism: false
175
+ ulysses_sequence_parallel_size: 1
176
+ entropy_from_logits_with_chunking: false
177
+ use_torch_compile: true
178
+ entropy_checkpointing: false
179
+ forward_only: true
180
+ strategy: fsdp
181
+ dtype: bfloat16
182
+ _target_: verl.workers.config.FSDPActorConfig
183
+ ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
184
+ entropy_from_logits_with_chunking: false
185
+ entropy_checkpointing: false
186
+ rollout:
187
+ _target_: verl.workers.config.RolloutConfig
188
+ name: vllm
189
+ mode: async
190
+ temperature: 1.0
191
+ top_k: -1
192
+ top_p: 1
193
+ prompt_length: ${oc.select:data.max_prompt_length,512}
194
+ response_length: ${oc.select:data.max_response_length,512}
195
+ dtype: bfloat16
196
+ gpu_memory_utilization: 0.7
197
+ ignore_eos: false
198
+ enforce_eager: false
199
+ cudagraph_capture_sizes: null
200
+ free_cache_engine: true
201
+ tensor_model_parallel_size: 1
202
+ data_parallel_size: 1
203
+ expert_parallel_size: 1
204
+ pipeline_model_parallel_size: 1
205
+ max_num_batched_tokens: 8192
206
+ max_model_len: null
207
+ max_num_seqs: 1024
208
+ enable_chunked_prefill: true
209
+ enable_prefix_caching: true
210
+ logprobs_mode: processed_logprobs
211
+ scheduling_policy: fcfs
212
+ load_format: dummy
213
+ log_prob_micro_batch_size: null
214
+ log_prob_micro_batch_size_per_gpu: 64
215
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
216
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
217
+ disable_log_stats: true
218
+ do_sample: true
219
+ 'n': 5
220
+ over_sample_rate: 0
221
+ multi_stage_wake_up: false
222
+ engine_kwargs:
223
+ vllm: {}
224
+ sglang: {}
225
+ trtllm: {}
226
+ val_kwargs:
227
+ _target_: verl.workers.config.SamplingConfig
228
+ top_k: -1
229
+ top_p: 1.0
230
+ temperature: 0
231
+ 'n': 1
232
+ do_sample: false
233
+ multi_turn:
234
+ _target_: verl.workers.config.MultiTurnConfig
235
+ enable: false
236
+ max_assistant_turns: null
237
+ tool_config_path: null
238
+ max_user_turns: null
239
+ max_parallel_calls: 1
240
+ max_tool_response_length: 256
241
+ tool_response_truncate_side: middle
242
+ interaction_config_path: null
243
+ use_inference_chat_template: false
244
+ tokenization_sanity_check_mode: strict
245
+ format: hermes
246
+ num_repeat_rollouts: null
247
+ calculate_log_probs: false
248
+ agent:
249
+ _target_: verl.workers.config.AgentLoopConfig
250
+ num_workers: 8
251
+ default_agent_loop: single_turn_agent
252
+ agent_loop_config_path: null
253
+ custom_async_server:
254
+ _target_: verl.workers.config.CustomAsyncServerConfig
255
+ path: null
256
+ name: null
257
+ checkpoint_engine:
258
+ _target_: verl.workers.config.CheckpointEngineConfig
259
+ backend: naive
260
+ update_weights_bucket_megabytes: 2048
261
+ engine_kwargs: {}
262
+ trace:
263
+ _target_: verl.workers.config.TraceConfig
264
+ backend: null
265
+ token2text: false
266
+ max_samples_per_step_per_worker: null
267
+ skip_rollout: false
268
+ skip_dump_dir: /tmp/rollout_dump
269
+ skip_tokenizer_init: true
270
+ enable_rollout_routing_replay: false
271
+ profiler:
272
+ _target_: verl.utils.profiler.ProfilerConfig
273
+ tool: ${oc.select:global_profiler.tool,null}
274
+ enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
275
+ all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
276
+ ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
277
+ save_path: ${oc.select:global_profiler.save_path,null}
278
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
279
+ prometheus:
280
+ _target_: verl.workers.config.PrometheusConfig
281
+ enable: false
282
+ port: 9090
283
+ file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml
284
+ served_model_name: ${oc.select:actor_rollout_ref.model.path,null}
285
+ quantization: null
286
+ quantization_config_file: null
287
+ mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
288
+ layered_summon: false
289
+ model:
290
+ _target_: verl.workers.config.HFModelConfig
291
+ path: Qwen/Qwen3-4B-Instruct-2507
292
+ hf_config_path: null
293
+ tokenizer_path: null
294
+ use_shm: false
295
+ trust_remote_code: false
296
+ custom_chat_template: null
297
+ external_lib: null
298
+ override_config: {}
299
+ enable_gradient_checkpointing: true
300
+ enable_activation_offload: false
301
+ use_remove_padding: true
302
+ lora_rank: 0
303
+ lora_alpha: 16
304
+ target_modules: all-linear
305
+ exclude_modules: null
306
+ lora_adapter_path: null
307
+ use_liger: false
308
+ use_fused_kernels: false
309
+ fused_kernel_options:
310
+ impl_backend: torch
311
+ tiled_mlp:
312
+ enabled: false
313
+ num_shards: 4
314
+ mtp:
315
+ _target_: verl.workers.config.MtpConfig
316
+ enable: false
317
+ enable_train: false
318
+ enable_rollout: false
319
+ detach_encoder: false
320
+ mtp_loss_scaling_factor: 0.1
321
+ speculative_algorithm: EAGLE
322
+ speculative_num_steps: 3
323
+ speculative_eagle_topk: 1
324
+ speculative_num_draft_tokens: 4
325
+ method: mtp
326
+ num_speculative_tokens: 1
327
+ hybrid_engine: true
328
+ nccl_timeout: 600
329
+ data:
330
+ tokenizer: null
331
+ use_shm: false
332
+ train_files: /home/mshahidul/data/gsm8k/train.parquet
333
+ val_files: /home/mshahidul/data/gsm8k/test.parquet
334
+ train_max_samples: -1
335
+ val_max_samples: -1
336
+ prompt_key: prompt
337
+ reward_fn_key: data_source
338
+ max_prompt_length: 512
339
+ max_response_length: 1024
340
+ train_batch_size: 1024
341
+ val_batch_size: null
342
+ tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path,
343
+ null}
344
+ return_raw_input_ids: false
345
+ return_raw_chat: true
346
+ return_full_prompt: false
347
+ shuffle: true
348
+ seed: null
349
+ dataloader_num_workers: 8
350
+ image_patch_size: 14
351
+ validation_shuffle: false
352
+ filter_overlong_prompts: true
353
+ filter_overlong_prompts_workers: 1
354
+ truncation: error
355
+ image_key: images
356
+ video_key: videos
357
+ trust_remote_code: false
358
+ custom_cls:
359
+ path: null
360
+ name: null
361
+ return_multi_modal_inputs: true
362
+ sampler:
363
+ class_path: null
364
+ class_name: null
365
+ datagen:
366
+ path: null
367
+ name: null
368
+ apply_chat_template_kwargs: {}
369
+ reward_manager:
370
+ _target_: verl.trainer.config.config.RewardManagerConfig
371
+ source: register
372
+ name: ${oc.select:reward_model.reward_manager,naive}
373
+ module:
374
+ _target_: verl.trainer.config.config.ModuleConfig
375
+ path: null
376
+ name: custom_reward_manager
377
+ critic:
378
+ optim:
379
+ _target_: verl.workers.config.FSDPOptimizerConfig
380
+ optimizer: AdamW
381
+ optimizer_impl: torch.optim
382
+ lr: 1.0e-05
383
+ lr_warmup_steps_ratio: 0.0
384
+ total_training_steps: -1
385
+ weight_decay: 0.01
386
+ lr_warmup_steps: -1
387
+ betas:
388
+ - 0.9
389
+ - 0.999
390
+ clip_grad: 1.0
391
+ min_lr_ratio: 0.0
392
+ num_cycles: 0.5
393
+ lr_scheduler_type: constant
394
+ warmup_style: null
395
+ override_optimizer_config: null
396
+ model:
397
+ fsdp_config:
398
+ _target_: verl.workers.config.FSDPEngineConfig
399
+ wrap_policy:
400
+ min_num_params: 0
401
+ param_offload: false
402
+ optimizer_offload: false
403
+ offload_policy: false
404
+ reshard_after_forward: true
405
+ fsdp_size: -1
406
+ forward_prefetch: false
407
+ model_dtype: fp32
408
+ use_orig_params: false
409
+ seed: 42
410
+ full_determinism: false
411
+ ulysses_sequence_parallel_size: 1
412
+ entropy_from_logits_with_chunking: false
413
+ use_torch_compile: true
414
+ entropy_checkpointing: false
415
+ forward_only: false
416
+ strategy: fsdp
417
+ dtype: bfloat16
418
+ path: ~/models/deepseek-llm-7b-chat
419
+ tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
420
+ override_config: {}
421
+ external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
422
+ trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
423
+ _target_: verl.workers.config.FSDPCriticModelCfg
424
+ use_shm: false
425
+ enable_gradient_checkpointing: true
426
+ enable_activation_offload: false
427
+ use_remove_padding: false
428
+ lora_rank: 0
429
+ lora_alpha: 16
430
+ target_modules: all-linear
431
+ tiled_mlp:
432
+ enabled: false
433
+ num_shards: 4
434
+ _target_: verl.workers.config.FSDPCriticConfig
435
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
436
+ strategy: fsdp
437
+ enable: null
438
+ ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
439
+ ppo_micro_batch_size: null
440
+ ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
441
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
442
+ ppo_max_token_len_per_gpu: 32768
443
+ forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
444
+ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
445
+ shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
446
+ data_loader_seed: 42
447
+ cliprange_value: 0.5
448
+ loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
449
+ checkpoint:
450
+ _target_: verl.trainer.config.CheckpointConfig
451
+ save_contents:
452
+ - model
453
+ - optimizer
454
+ - extra
455
+ load_contents: ${.save_contents}
456
+ async_save: false
457
+ profiler:
458
+ _target_: verl.utils.profiler.ProfilerConfig
459
+ tool: ${oc.select:global_profiler.tool,null}
460
+ enable: false
461
+ all_ranks: false
462
+ ranks: []
463
+ save_path: ${oc.select:global_profiler.save_path,null}
464
+ tool_config:
465
+ nsys:
466
+ _target_: verl.utils.profiler.config.NsightToolConfig
467
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
468
+ npu:
469
+ _target_: verl.utils.profiler.config.NPUToolConfig
470
+ contents: []
471
+ level: level0
472
+ analysis: true
473
+ discrete: false
474
+ torch:
475
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
476
+ contents: []
477
+ discrete: false
478
+ torch_memory:
479
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
480
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
481
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
482
+ forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
483
+ forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
484
+ ulysses_sequence_parallel_size: 1
485
+ grad_clip: 1.0
486
+ reward_model:
487
+ enable: false
488
+ enable_resource_pool: false
489
+ n_gpus_per_node: 8
490
+ nnodes: 0
491
+ strategy: fsdp
492
+ model:
493
+ input_tokenizer: ${actor_rollout_ref.model.path}
494
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
495
+ external_lib: ${actor_rollout_ref.model.external_lib}
496
+ trust_remote_code: false
497
+ override_config: {}
498
+ use_shm: false
499
+ use_remove_padding: false
500
+ use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
501
+ fsdp_config:
502
+ _target_: verl.workers.config.FSDPEngineConfig
503
+ wrap_policy:
504
+ min_num_params: 0
505
+ param_offload: false
506
+ reshard_after_forward: true
507
+ fsdp_size: -1
508
+ forward_prefetch: false
509
+ micro_batch_size: null
510
+ micro_batch_size_per_gpu: null
511
+ max_length: null
512
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
513
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
514
+ reward_manager: naive
515
+ reward_loop_source: register
516
+ reward_loop_module_path: null
517
+ reward_loop_class_name: null
518
+ launch_reward_fn_async: false
519
+ sandbox_fusion:
520
+ url: null
521
+ max_concurrent: 64
522
+ memory_limit_mb: 1024
523
+ profiler:
524
+ _target_: verl.utils.profiler.ProfilerConfig
525
+ tool: ${oc.select:global_profiler.tool,null}
526
+ enable: false
527
+ all_ranks: false
528
+ ranks: []
529
+ save_path: ${oc.select:global_profiler.save_path,null}
530
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
531
+ ulysses_sequence_parallel_size: 1
532
+ use_reward_loop: true
533
+ num_workers: 1
534
+ rollout:
535
+ _target_: verl.workers.config.RolloutConfig
536
+ name: ???
537
+ dtype: bfloat16
538
+ gpu_memory_utilization: 0.5
539
+ enforce_eager: true
540
+ cudagraph_capture_sizes: null
541
+ free_cache_engine: true
542
+ data_parallel_size: 1
543
+ expert_parallel_size: 1
544
+ tensor_model_parallel_size: 2
545
+ max_num_batched_tokens: 8192
546
+ max_model_len: null
547
+ max_num_seqs: 1024
548
+ load_format: auto
549
+ engine_kwargs: {}
550
+ limit_images: null
551
+ enable_chunked_prefill: true
552
+ enable_prefix_caching: true
553
+ disable_log_stats: true
554
+ skip_tokenizer_init: false
555
+ prompt_length: 2048
556
+ response_length: 2048
557
+ algorithm:
558
+ rollout_correction:
559
+ rollout_is: null
560
+ rollout_is_threshold: 2.0
561
+ rollout_rs: null
562
+ rollout_rs_threshold: null
563
+ bypass_mode: false
564
+ loss_type: ppo_clip
565
+ rollout_is_batch_normalize: false
566
+ _target_: verl.trainer.config.AlgoConfig
567
+ gamma: 1.0
568
+ lam: 1.0
569
+ adv_estimator: grpo
570
+ norm_adv_by_std_in_grpo: true
571
+ use_kl_in_reward: false
572
+ kl_penalty: kl
573
+ kl_ctrl:
574
+ _target_: verl.trainer.config.KLControlConfig
575
+ type: fixed
576
+ kl_coef: 0.001
577
+ horizon: 10000
578
+ target_kl: 0.1
579
+ use_pf_ppo: false
580
+ pf_ppo:
581
+ reweight_method: pow
582
+ weight_pow: 2.0
583
+ custom_reward_function:
584
+ path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
585
+ name: compute_score
586
+ trainer:
587
+ balance_batch: true
588
+ total_epochs: 15
589
+ total_training_steps: null
590
+ project_name: readctrl-verl
591
+ experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs
592
+ logger:
593
+ - console
594
+ - wandb
595
+ log_val_generations: 0
596
+ rollout_data_dir: null
597
+ validation_data_dir: null
598
+ nnodes: 1
599
+ n_gpus_per_node: 2
600
+ save_freq: 20
601
+ esi_redundant_time: 0
602
+ resume_mode: auto
603
+ resume_from_path: null
604
+ val_before_train: true
605
+ val_only: false
606
+ test_freq: 5
607
+ critic_warmup: 0
608
+ default_hdfs_dir: null
609
+ del_local_ckpt_after_load: false
610
+ default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
611
+ max_actor_ckpt_to_keep: null
612
+ max_critic_ckpt_to_keep: null
613
+ ray_wait_register_center_timeout: 300
614
+ device: cuda
615
+ use_legacy_worker_impl: auto
616
+ global_profiler:
617
+ _target_: verl.utils.profiler.ProfilerConfig
618
+ tool: null
619
+ steps: null
620
+ profile_continuous_steps: false
621
+ save_path: outputs/profile
622
+ global_tool_config:
623
+ nsys:
624
+ _target_: verl.utils.profiler.config.NsightToolConfig
625
+ discrete: false
626
+ controller_nsight_options:
627
+ trace: cuda,nvtx,cublas,ucx
628
+ cuda-memory-usage: 'true'
629
+ cuda-graph-trace: graph
630
+ worker_nsight_options:
631
+ trace: cuda,nvtx,cublas,ucx
632
+ cuda-memory-usage: 'true'
633
+ cuda-graph-trace: graph
634
+ capture-range: cudaProfilerApi
635
+ capture-range-end: null
636
+ kill: none
637
+ torch_memory:
638
+ trace_alloc_max_entries: 100000
639
+ stack_depth: 32
640
+ context: all
641
+ stacks: all
642
+ kw_args: {}
643
+ transfer_queue:
644
+ enable: false
645
+ ray_kwargs:
646
+ ray_init:
647
+ num_cpus: null
648
+ timeline_json_file: null
code/RL_model/verl/verl_train/outputs/2026-02-01/21-10-05/.hydra/hydra.yaml ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task:
115
+ - algorithm.adv_estimator=grpo
116
+ - data.train_files=/home/mshahidul/data/gsm8k/train.parquet
117
+ - data.val_files=/home/mshahidul/data/gsm8k/test.parquet
118
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
119
+ - data.train_batch_size=1024
120
+ - data.max_prompt_length=512
121
+ - data.max_response_length=1024
122
+ - data.filter_overlong_prompts=True
123
+ - data.truncation=error
124
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
125
+ - actor_rollout_ref.actor.optim.lr=1e-6
126
+ - actor_rollout_ref.model.use_remove_padding=True
127
+ - actor_rollout_ref.actor.ppo_mini_batch_size=512
128
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=64
129
+ - actor_rollout_ref.actor.use_kl_loss=True
130
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
131
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
132
+ - actor_rollout_ref.actor.entropy_coeff=0
133
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
134
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
135
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
136
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64
137
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
138
+ - actor_rollout_ref.rollout.name=vllm
139
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.7
140
+ - actor_rollout_ref.rollout.n=5
141
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=64
142
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
143
+ - algorithm.use_kl_in_reward=False
144
+ - trainer.critic_warmup=0
145
+ - trainer.logger=["console","wandb"]
146
+ - trainer.project_name=readctrl-verl
147
+ - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
148
+ - trainer.n_gpus_per_node=2
149
+ - trainer.nnodes=1
150
+ - trainer.save_freq=20
151
+ - trainer.test_freq=5
152
+ - trainer.total_epochs=15
153
+ job:
154
+ name: main_ppo
155
+ chdir: null
156
+ override_dirname: actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=64,actor_rollout_ref.actor.ppo_mini_batch_size=512,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=64,actor_rollout_ref.rollout.gpu_memory_utilization=0.7,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64,actor_rollout_ref.rollout.n=5,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=512,data.max_response_length=1024,data.train_batch_size=1024,data.train_files=/home/mshahidul/data/gsm8k/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/data/gsm8k/test.parquet,trainer.critic_warmup=0,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15
157
+ id: ???
158
+ num: ???
159
+ config_name: ppo_trainer
160
+ env_set: {}
161
+ env_copy: []
162
+ config:
163
+ override_dirname:
164
+ kv_sep: '='
165
+ item_sep: ','
166
+ exclude_keys: []
167
+ runtime:
168
+ version: 1.3.2
169
+ version_base: '1.3'
170
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train
171
+ config_sources:
172
+ - path: hydra.conf
173
+ schema: pkg
174
+ provider: hydra
175
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config
176
+ schema: file
177
+ provider: main
178
+ - path: ''
179
+ schema: structured
180
+ provider: schema
181
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-01/21-10-05
182
+ choices:
183
+ algorithm@algorithm.rollout_correction: rollout_correction
184
+ reward_model: dp_reward_loop
185
+ critic: dp_critic
186
+ critic/../engine@critic.model.fsdp_config: fsdp
187
+ critic/../optim@critic.optim: fsdp
188
+ model@actor_rollout_ref.model: hf_model
189
+ rollout@actor_rollout_ref.rollout: rollout
190
+ ref@actor_rollout_ref.ref: dp_ref
191
+ ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp
192
+ data: legacy_data
193
+ actor@actor_rollout_ref.actor: dp_actor
194
+ actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp
195
+ actor/../optim@actor_rollout_ref.actor.optim: fsdp
196
+ hydra/env: default
197
+ hydra/callbacks: null
198
+ hydra/job_logging: default
199
+ hydra/hydra_logging: default
200
+ hydra/hydra_help: default
201
+ hydra/help: default
202
+ hydra/sweeper: basic
203
+ hydra/launcher: basic
204
+ hydra/output: default
205
+ verbose: false
code/RL_model/verl/verl_train/outputs/2026-02-01/21-10-05/.hydra/overrides.yaml ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - algorithm.adv_estimator=grpo
2
+ - data.train_files=/home/mshahidul/data/gsm8k/train.parquet
3
+ - data.val_files=/home/mshahidul/data/gsm8k/test.parquet
4
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
5
+ - data.train_batch_size=1024
6
+ - data.max_prompt_length=512
7
+ - data.max_response_length=1024
8
+ - data.filter_overlong_prompts=True
9
+ - data.truncation=error
10
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
11
+ - actor_rollout_ref.actor.optim.lr=1e-6
12
+ - actor_rollout_ref.model.use_remove_padding=True
13
+ - actor_rollout_ref.actor.ppo_mini_batch_size=512
14
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=64
15
+ - actor_rollout_ref.actor.use_kl_loss=True
16
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
17
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
18
+ - actor_rollout_ref.actor.entropy_coeff=0
19
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
20
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
21
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
22
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64
23
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
24
+ - actor_rollout_ref.rollout.name=vllm
25
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.7
26
+ - actor_rollout_ref.rollout.n=5
27
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=64
28
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
29
+ - algorithm.use_kl_in_reward=False
30
+ - trainer.critic_warmup=0
31
+ - trainer.logger=["console","wandb"]
32
+ - trainer.project_name=readctrl-verl
33
+ - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
34
+ - trainer.n_gpus_per_node=2
35
+ - trainer.nnodes=1
36
+ - trainer.save_freq=20
37
+ - trainer.test_freq=5
38
+ - trainer.total_epochs=15
code/RL_model/verl/verl_train/outputs/2026-02-01/21-10-05/main_ppo.log ADDED
File without changes
code/RL_model/verl/verl_train/outputs/2026-02-01/21-14-31/.hydra/config.yaml ADDED
@@ -0,0 +1,648 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ actor_rollout_ref:
2
+ actor:
3
+ optim:
4
+ _target_: verl.workers.config.FSDPOptimizerConfig
5
+ optimizer: AdamW
6
+ optimizer_impl: torch.optim
7
+ lr: 1.0e-06
8
+ lr_warmup_steps_ratio: 0.0
9
+ total_training_steps: -1
10
+ weight_decay: 0.01
11
+ lr_warmup_steps: -1
12
+ betas:
13
+ - 0.9
14
+ - 0.999
15
+ clip_grad: 1.0
16
+ min_lr_ratio: 0.0
17
+ num_cycles: 0.5
18
+ lr_scheduler_type: constant
19
+ warmup_style: null
20
+ override_optimizer_config: null
21
+ fsdp_config:
22
+ _target_: verl.workers.config.FSDPEngineConfig
23
+ wrap_policy:
24
+ min_num_params: 0
25
+ param_offload: false
26
+ optimizer_offload: false
27
+ offload_policy: false
28
+ reshard_after_forward: true
29
+ fsdp_size: -1
30
+ forward_prefetch: false
31
+ model_dtype: fp32
32
+ use_orig_params: false
33
+ seed: 42
34
+ full_determinism: false
35
+ ulysses_sequence_parallel_size: 1
36
+ entropy_from_logits_with_chunking: false
37
+ use_torch_compile: true
38
+ entropy_checkpointing: false
39
+ forward_only: false
40
+ strategy: fsdp
41
+ dtype: bfloat16
42
+ _target_: verl.workers.config.FSDPActorConfig
43
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
44
+ strategy: fsdp
45
+ ppo_mini_batch_size: 512
46
+ ppo_micro_batch_size: null
47
+ ppo_micro_batch_size_per_gpu: 64
48
+ use_dynamic_bsz: false
49
+ ppo_max_token_len_per_gpu: 16384
50
+ clip_ratio: 0.2
51
+ clip_ratio_low: 0.2
52
+ clip_ratio_high: 0.2
53
+ tau_pos: 1.0
54
+ tau_neg: 1.05
55
+ freeze_vision_tower: false
56
+ policy_loss:
57
+ _target_: verl.workers.config.PolicyLossConfig
58
+ loss_mode: vanilla
59
+ clip_cov_ratio: 0.0002
60
+ clip_cov_lb: 1.0
61
+ clip_cov_ub: 5.0
62
+ kl_cov_ratio: 0.0002
63
+ ppo_kl_coef: 0.1
64
+ clip_ratio_c: 3.0
65
+ loss_agg_mode: token-mean
66
+ loss_scale_factor: null
67
+ entropy_coeff: 0
68
+ calculate_entropy: false
69
+ use_kl_loss: true
70
+ use_prefix_grouper: false
71
+ use_torch_compile: true
72
+ kl_loss_coef: 0.001
73
+ kl_loss_type: low_var_kl
74
+ ppo_epochs: 1
75
+ shuffle: false
76
+ data_loader_seed: 42
77
+ checkpoint:
78
+ _target_: verl.trainer.config.CheckpointConfig
79
+ save_contents:
80
+ - model
81
+ - optimizer
82
+ - extra
83
+ load_contents: ${.save_contents}
84
+ async_save: false
85
+ use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
86
+ profiler:
87
+ _target_: verl.utils.profiler.ProfilerConfig
88
+ tool: ${oc.select:global_profiler.tool,null}
89
+ enable: false
90
+ all_ranks: false
91
+ ranks: []
92
+ save_path: ${oc.select:global_profiler.save_path,null}
93
+ tool_config:
94
+ nsys:
95
+ _target_: verl.utils.profiler.config.NsightToolConfig
96
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
97
+ npu:
98
+ _target_: verl.utils.profiler.config.NPUToolConfig
99
+ contents: []
100
+ level: level0
101
+ analysis: true
102
+ discrete: false
103
+ torch:
104
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
105
+ contents: []
106
+ discrete: false
107
+ torch_memory:
108
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
109
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
110
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
111
+ router_replay:
112
+ _target_: verl.workers.config.RouterReplayConfig
113
+ mode: disabled
114
+ record_file: null
115
+ replay_file: null
116
+ grad_clip: 1.0
117
+ ulysses_sequence_parallel_size: 1
118
+ entropy_from_logits_with_chunking: false
119
+ entropy_checkpointing: false
120
+ use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
121
+ calculate_sum_pi_squared: false
122
+ sum_pi_squared_checkpointing: false
123
+ ref:
124
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
125
+ strategy: ${actor_rollout_ref.actor.strategy}
126
+ use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
127
+ log_prob_micro_batch_size: null
128
+ log_prob_micro_batch_size_per_gpu: 64
129
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
130
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
131
+ profiler:
132
+ _target_: verl.utils.profiler.ProfilerConfig
133
+ tool: ${oc.select:global_profiler.tool,null}
134
+ enable: false
135
+ all_ranks: false
136
+ ranks: []
137
+ save_path: ${oc.select:global_profiler.save_path,null}
138
+ tool_config:
139
+ nsys:
140
+ _target_: verl.utils.profiler.config.NsightToolConfig
141
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
142
+ npu:
143
+ _target_: verl.utils.profiler.config.NPUToolConfig
144
+ contents: []
145
+ level: level0
146
+ analysis: true
147
+ discrete: false
148
+ torch:
149
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
150
+ contents: []
151
+ discrete: false
152
+ torch_memory:
153
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
154
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
155
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
156
+ router_replay:
157
+ _target_: verl.workers.config.RouterReplayConfig
158
+ mode: disabled
159
+ record_file: null
160
+ replay_file: null
161
+ fsdp_config:
162
+ _target_: verl.workers.config.FSDPEngineConfig
163
+ wrap_policy:
164
+ min_num_params: 0
165
+ param_offload: false
166
+ optimizer_offload: false
167
+ offload_policy: false
168
+ reshard_after_forward: true
169
+ fsdp_size: -1
170
+ forward_prefetch: false
171
+ model_dtype: fp32
172
+ use_orig_params: false
173
+ seed: 42
174
+ full_determinism: false
175
+ ulysses_sequence_parallel_size: 1
176
+ entropy_from_logits_with_chunking: false
177
+ use_torch_compile: true
178
+ entropy_checkpointing: false
179
+ forward_only: true
180
+ strategy: fsdp
181
+ dtype: bfloat16
182
+ _target_: verl.workers.config.FSDPActorConfig
183
+ ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
184
+ entropy_from_logits_with_chunking: false
185
+ entropy_checkpointing: false
186
+ rollout:
187
+ _target_: verl.workers.config.RolloutConfig
188
+ name: vllm
189
+ mode: async
190
+ temperature: 1.0
191
+ top_k: -1
192
+ top_p: 1
193
+ prompt_length: ${oc.select:data.max_prompt_length,512}
194
+ response_length: ${oc.select:data.max_response_length,512}
195
+ dtype: bfloat16
196
+ gpu_memory_utilization: 0.7
197
+ ignore_eos: false
198
+ enforce_eager: false
199
+ cudagraph_capture_sizes: null
200
+ free_cache_engine: true
201
+ tensor_model_parallel_size: 1
202
+ data_parallel_size: 1
203
+ expert_parallel_size: 1
204
+ pipeline_model_parallel_size: 1
205
+ max_num_batched_tokens: 8192
206
+ max_model_len: null
207
+ max_num_seqs: 1024
208
+ enable_chunked_prefill: true
209
+ enable_prefix_caching: true
210
+ logprobs_mode: processed_logprobs
211
+ scheduling_policy: fcfs
212
+ load_format: dummy
213
+ log_prob_micro_batch_size: null
214
+ log_prob_micro_batch_size_per_gpu: 64
215
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
216
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
217
+ disable_log_stats: true
218
+ do_sample: true
219
+ 'n': 5
220
+ over_sample_rate: 0
221
+ multi_stage_wake_up: false
222
+ engine_kwargs:
223
+ vllm: {}
224
+ sglang: {}
225
+ trtllm: {}
226
+ val_kwargs:
227
+ _target_: verl.workers.config.SamplingConfig
228
+ top_k: -1
229
+ top_p: 1.0
230
+ temperature: 0
231
+ 'n': 1
232
+ do_sample: false
233
+ multi_turn:
234
+ _target_: verl.workers.config.MultiTurnConfig
235
+ enable: false
236
+ max_assistant_turns: null
237
+ tool_config_path: null
238
+ max_user_turns: null
239
+ max_parallel_calls: 1
240
+ max_tool_response_length: 256
241
+ tool_response_truncate_side: middle
242
+ interaction_config_path: null
243
+ use_inference_chat_template: false
244
+ tokenization_sanity_check_mode: strict
245
+ format: hermes
246
+ num_repeat_rollouts: null
247
+ calculate_log_probs: false
248
+ agent:
249
+ _target_: verl.workers.config.AgentLoopConfig
250
+ num_workers: 8
251
+ default_agent_loop: single_turn_agent
252
+ agent_loop_config_path: null
253
+ custom_async_server:
254
+ _target_: verl.workers.config.CustomAsyncServerConfig
255
+ path: null
256
+ name: null
257
+ checkpoint_engine:
258
+ _target_: verl.workers.config.CheckpointEngineConfig
259
+ backend: naive
260
+ update_weights_bucket_megabytes: 2048
261
+ engine_kwargs: {}
262
+ trace:
263
+ _target_: verl.workers.config.TraceConfig
264
+ backend: null
265
+ token2text: false
266
+ max_samples_per_step_per_worker: null
267
+ skip_rollout: false
268
+ skip_dump_dir: /tmp/rollout_dump
269
+ skip_tokenizer_init: true
270
+ enable_rollout_routing_replay: false
271
+ profiler:
272
+ _target_: verl.utils.profiler.ProfilerConfig
273
+ tool: ${oc.select:global_profiler.tool,null}
274
+ enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
275
+ all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
276
+ ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
277
+ save_path: ${oc.select:global_profiler.save_path,null}
278
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
279
+ prometheus:
280
+ _target_: verl.workers.config.PrometheusConfig
281
+ enable: false
282
+ port: 9090
283
+ file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml
284
+ served_model_name: ${oc.select:actor_rollout_ref.model.path,null}
285
+ quantization: null
286
+ quantization_config_file: null
287
+ mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
288
+ layered_summon: false
289
+ model:
290
+ _target_: verl.workers.config.HFModelConfig
291
+ path: Qwen/Qwen3-4B-Instruct-2507
292
+ hf_config_path: null
293
+ tokenizer_path: null
294
+ use_shm: false
295
+ trust_remote_code: false
296
+ custom_chat_template: null
297
+ external_lib: null
298
+ override_config: {}
299
+ enable_gradient_checkpointing: true
300
+ enable_activation_offload: false
301
+ use_remove_padding: true
302
+ lora_rank: 0
303
+ lora_alpha: 16
304
+ target_modules: all-linear
305
+ exclude_modules: null
306
+ lora_adapter_path: null
307
+ use_liger: false
308
+ use_fused_kernels: false
309
+ fused_kernel_options:
310
+ impl_backend: torch
311
+ tiled_mlp:
312
+ enabled: false
313
+ num_shards: 4
314
+ mtp:
315
+ _target_: verl.workers.config.MtpConfig
316
+ enable: false
317
+ enable_train: false
318
+ enable_rollout: false
319
+ detach_encoder: false
320
+ mtp_loss_scaling_factor: 0.1
321
+ speculative_algorithm: EAGLE
322
+ speculative_num_steps: 3
323
+ speculative_eagle_topk: 1
324
+ speculative_num_draft_tokens: 4
325
+ method: mtp
326
+ num_speculative_tokens: 1
327
+ hybrid_engine: true
328
+ nccl_timeout: 600
329
+ data:
330
+ tokenizer: null
331
+ use_shm: false
332
+ train_files: /home/mshahidul/data/gsm8k/train.parquet
333
+ val_files: /home/mshahidul/data/gsm8k/test.parquet
334
+ train_max_samples: -1
335
+ val_max_samples: -1
336
+ prompt_key: prompt
337
+ reward_fn_key: data_source
338
+ max_prompt_length: 512
339
+ max_response_length: 1024
340
+ train_batch_size: 1024
341
+ val_batch_size: null
342
+ tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path,
343
+ null}
344
+ return_raw_input_ids: false
345
+ return_raw_chat: true
346
+ return_full_prompt: false
347
+ shuffle: true
348
+ seed: null
349
+ dataloader_num_workers: 8
350
+ image_patch_size: 14
351
+ validation_shuffle: false
352
+ filter_overlong_prompts: true
353
+ filter_overlong_prompts_workers: 1
354
+ truncation: error
355
+ image_key: images
356
+ video_key: videos
357
+ trust_remote_code: false
358
+ custom_cls:
359
+ path: null
360
+ name: null
361
+ return_multi_modal_inputs: true
362
+ sampler:
363
+ class_path: null
364
+ class_name: null
365
+ datagen:
366
+ path: null
367
+ name: null
368
+ apply_chat_template_kwargs: {}
369
+ reward_manager:
370
+ _target_: verl.trainer.config.config.RewardManagerConfig
371
+ source: register
372
+ name: ${oc.select:reward_model.reward_manager,naive}
373
+ module:
374
+ _target_: verl.trainer.config.config.ModuleConfig
375
+ path: null
376
+ name: custom_reward_manager
377
+ critic:
378
+ optim:
379
+ _target_: verl.workers.config.FSDPOptimizerConfig
380
+ optimizer: AdamW
381
+ optimizer_impl: torch.optim
382
+ lr: 1.0e-05
383
+ lr_warmup_steps_ratio: 0.0
384
+ total_training_steps: -1
385
+ weight_decay: 0.01
386
+ lr_warmup_steps: -1
387
+ betas:
388
+ - 0.9
389
+ - 0.999
390
+ clip_grad: 1.0
391
+ min_lr_ratio: 0.0
392
+ num_cycles: 0.5
393
+ lr_scheduler_type: constant
394
+ warmup_style: null
395
+ override_optimizer_config: null
396
+ model:
397
+ fsdp_config:
398
+ _target_: verl.workers.config.FSDPEngineConfig
399
+ wrap_policy:
400
+ min_num_params: 0
401
+ param_offload: false
402
+ optimizer_offload: false
403
+ offload_policy: false
404
+ reshard_after_forward: true
405
+ fsdp_size: -1
406
+ forward_prefetch: false
407
+ model_dtype: fp32
408
+ use_orig_params: false
409
+ seed: 42
410
+ full_determinism: false
411
+ ulysses_sequence_parallel_size: 1
412
+ entropy_from_logits_with_chunking: false
413
+ use_torch_compile: true
414
+ entropy_checkpointing: false
415
+ forward_only: false
416
+ strategy: fsdp
417
+ dtype: bfloat16
418
+ path: ~/models/deepseek-llm-7b-chat
419
+ tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
420
+ override_config: {}
421
+ external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
422
+ trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
423
+ _target_: verl.workers.config.FSDPCriticModelCfg
424
+ use_shm: false
425
+ enable_gradient_checkpointing: true
426
+ enable_activation_offload: false
427
+ use_remove_padding: false
428
+ lora_rank: 0
429
+ lora_alpha: 16
430
+ target_modules: all-linear
431
+ tiled_mlp:
432
+ enabled: false
433
+ num_shards: 4
434
+ _target_: verl.workers.config.FSDPCriticConfig
435
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
436
+ strategy: fsdp
437
+ enable: null
438
+ ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
439
+ ppo_micro_batch_size: null
440
+ ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
441
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
442
+ ppo_max_token_len_per_gpu: 32768
443
+ forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
444
+ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
445
+ shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
446
+ data_loader_seed: 42
447
+ cliprange_value: 0.5
448
+ loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
449
+ checkpoint:
450
+ _target_: verl.trainer.config.CheckpointConfig
451
+ save_contents:
452
+ - model
453
+ - optimizer
454
+ - extra
455
+ load_contents: ${.save_contents}
456
+ async_save: false
457
+ profiler:
458
+ _target_: verl.utils.profiler.ProfilerConfig
459
+ tool: ${oc.select:global_profiler.tool,null}
460
+ enable: false
461
+ all_ranks: false
462
+ ranks: []
463
+ save_path: ${oc.select:global_profiler.save_path,null}
464
+ tool_config:
465
+ nsys:
466
+ _target_: verl.utils.profiler.config.NsightToolConfig
467
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
468
+ npu:
469
+ _target_: verl.utils.profiler.config.NPUToolConfig
470
+ contents: []
471
+ level: level0
472
+ analysis: true
473
+ discrete: false
474
+ torch:
475
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
476
+ contents: []
477
+ discrete: false
478
+ torch_memory:
479
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
480
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
481
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
482
+ forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
483
+ forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
484
+ ulysses_sequence_parallel_size: 1
485
+ grad_clip: 1.0
486
+ reward_model:
487
+ enable: false
488
+ enable_resource_pool: false
489
+ n_gpus_per_node: 8
490
+ nnodes: 0
491
+ strategy: fsdp
492
+ model:
493
+ input_tokenizer: ${actor_rollout_ref.model.path}
494
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
495
+ external_lib: ${actor_rollout_ref.model.external_lib}
496
+ trust_remote_code: false
497
+ override_config: {}
498
+ use_shm: false
499
+ use_remove_padding: false
500
+ use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
501
+ fsdp_config:
502
+ _target_: verl.workers.config.FSDPEngineConfig
503
+ wrap_policy:
504
+ min_num_params: 0
505
+ param_offload: false
506
+ reshard_after_forward: true
507
+ fsdp_size: -1
508
+ forward_prefetch: false
509
+ micro_batch_size: null
510
+ micro_batch_size_per_gpu: null
511
+ max_length: null
512
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
513
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
514
+ reward_manager: naive
515
+ reward_loop_source: register
516
+ reward_loop_module_path: null
517
+ reward_loop_class_name: null
518
+ launch_reward_fn_async: false
519
+ sandbox_fusion:
520
+ url: null
521
+ max_concurrent: 64
522
+ memory_limit_mb: 1024
523
+ profiler:
524
+ _target_: verl.utils.profiler.ProfilerConfig
525
+ tool: ${oc.select:global_profiler.tool,null}
526
+ enable: false
527
+ all_ranks: false
528
+ ranks: []
529
+ save_path: ${oc.select:global_profiler.save_path,null}
530
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
531
+ ulysses_sequence_parallel_size: 1
532
+ use_reward_loop: true
533
+ num_workers: 1
534
+ rollout:
535
+ _target_: verl.workers.config.RolloutConfig
536
+ name: ???
537
+ dtype: bfloat16
538
+ gpu_memory_utilization: 0.5
539
+ enforce_eager: true
540
+ cudagraph_capture_sizes: null
541
+ free_cache_engine: true
542
+ data_parallel_size: 1
543
+ expert_parallel_size: 1
544
+ tensor_model_parallel_size: 2
545
+ max_num_batched_tokens: 8192
546
+ max_model_len: null
547
+ max_num_seqs: 1024
548
+ load_format: auto
549
+ engine_kwargs: {}
550
+ limit_images: null
551
+ enable_chunked_prefill: true
552
+ enable_prefix_caching: true
553
+ disable_log_stats: true
554
+ skip_tokenizer_init: false
555
+ prompt_length: 2048
556
+ response_length: 2048
557
+ algorithm:
558
+ rollout_correction:
559
+ rollout_is: null
560
+ rollout_is_threshold: 2.0
561
+ rollout_rs: null
562
+ rollout_rs_threshold: null
563
+ bypass_mode: false
564
+ loss_type: ppo_clip
565
+ rollout_is_batch_normalize: false
566
+ _target_: verl.trainer.config.AlgoConfig
567
+ gamma: 1.0
568
+ lam: 1.0
569
+ adv_estimator: grpo
570
+ norm_adv_by_std_in_grpo: true
571
+ use_kl_in_reward: false
572
+ kl_penalty: kl
573
+ kl_ctrl:
574
+ _target_: verl.trainer.config.KLControlConfig
575
+ type: fixed
576
+ kl_coef: 0.001
577
+ horizon: 10000
578
+ target_kl: 0.1
579
+ use_pf_ppo: false
580
+ pf_ppo:
581
+ reweight_method: pow
582
+ weight_pow: 2.0
583
+ custom_reward_function:
584
+ path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
585
+ name: compute_score
586
+ trainer:
587
+ balance_batch: true
588
+ total_epochs: 15
589
+ total_training_steps: null
590
+ project_name: readctrl-verl
591
+ experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs
592
+ logger:
593
+ - console
594
+ - wandb
595
+ log_val_generations: 0
596
+ rollout_data_dir: null
597
+ validation_data_dir: null
598
+ nnodes: 1
599
+ n_gpus_per_node: 2
600
+ save_freq: 20
601
+ esi_redundant_time: 0
602
+ resume_mode: auto
603
+ resume_from_path: null
604
+ val_before_train: true
605
+ val_only: false
606
+ test_freq: 5
607
+ critic_warmup: 0
608
+ default_hdfs_dir: null
609
+ del_local_ckpt_after_load: false
610
+ default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
611
+ max_actor_ckpt_to_keep: null
612
+ max_critic_ckpt_to_keep: null
613
+ ray_wait_register_center_timeout: 300
614
+ device: cuda
615
+ use_legacy_worker_impl: auto
616
+ global_profiler:
617
+ _target_: verl.utils.profiler.ProfilerConfig
618
+ tool: null
619
+ steps: null
620
+ profile_continuous_steps: false
621
+ save_path: outputs/profile
622
+ global_tool_config:
623
+ nsys:
624
+ _target_: verl.utils.profiler.config.NsightToolConfig
625
+ discrete: false
626
+ controller_nsight_options:
627
+ trace: cuda,nvtx,cublas,ucx
628
+ cuda-memory-usage: 'true'
629
+ cuda-graph-trace: graph
630
+ worker_nsight_options:
631
+ trace: cuda,nvtx,cublas,ucx
632
+ cuda-memory-usage: 'true'
633
+ cuda-graph-trace: graph
634
+ capture-range: cudaProfilerApi
635
+ capture-range-end: null
636
+ kill: none
637
+ torch_memory:
638
+ trace_alloc_max_entries: 100000
639
+ stack_depth: 32
640
+ context: all
641
+ stacks: all
642
+ kw_args: {}
643
+ transfer_queue:
644
+ enable: false
645
+ ray_kwargs:
646
+ ray_init:
647
+ num_cpus: null
648
+ timeline_json_file: null
code/RL_model/verl/verl_train/outputs/2026-02-01/21-14-31/.hydra/hydra.yaml ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task:
115
+ - algorithm.adv_estimator=grpo
116
+ - data.train_files=/home/mshahidul/data/gsm8k/train.parquet
117
+ - data.val_files=/home/mshahidul/data/gsm8k/test.parquet
118
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
119
+ - data.train_batch_size=1024
120
+ - data.max_prompt_length=512
121
+ - data.max_response_length=1024
122
+ - data.filter_overlong_prompts=True
123
+ - data.truncation=error
124
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
125
+ - actor_rollout_ref.actor.optim.lr=1e-6
126
+ - actor_rollout_ref.model.use_remove_padding=True
127
+ - actor_rollout_ref.actor.ppo_mini_batch_size=512
128
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=64
129
+ - actor_rollout_ref.actor.use_kl_loss=True
130
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
131
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
132
+ - actor_rollout_ref.actor.entropy_coeff=0
133
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
134
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
135
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
136
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64
137
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
138
+ - actor_rollout_ref.rollout.name=vllm
139
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.7
140
+ - actor_rollout_ref.rollout.n=5
141
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=64
142
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
143
+ - algorithm.use_kl_in_reward=False
144
+ - trainer.critic_warmup=0
145
+ - trainer.logger=["console","wandb"]
146
+ - trainer.project_name=readctrl-verl
147
+ - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
148
+ - trainer.n_gpus_per_node=2
149
+ - trainer.nnodes=1
150
+ - trainer.save_freq=20
151
+ - trainer.test_freq=5
152
+ - trainer.total_epochs=15
153
+ job:
154
+ name: main_ppo
155
+ chdir: null
156
+ override_dirname: actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=64,actor_rollout_ref.actor.ppo_mini_batch_size=512,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=64,actor_rollout_ref.rollout.gpu_memory_utilization=0.7,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64,actor_rollout_ref.rollout.n=5,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=512,data.max_response_length=1024,data.train_batch_size=1024,data.train_files=/home/mshahidul/data/gsm8k/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/data/gsm8k/test.parquet,trainer.critic_warmup=0,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15
157
+ id: ???
158
+ num: ???
159
+ config_name: ppo_trainer
160
+ env_set: {}
161
+ env_copy: []
162
+ config:
163
+ override_dirname:
164
+ kv_sep: '='
165
+ item_sep: ','
166
+ exclude_keys: []
167
+ runtime:
168
+ version: 1.3.2
169
+ version_base: '1.3'
170
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train
171
+ config_sources:
172
+ - path: hydra.conf
173
+ schema: pkg
174
+ provider: hydra
175
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config
176
+ schema: file
177
+ provider: main
178
+ - path: ''
179
+ schema: structured
180
+ provider: schema
181
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-01/21-14-31
182
+ choices:
183
+ algorithm@algorithm.rollout_correction: rollout_correction
184
+ reward_model: dp_reward_loop
185
+ critic: dp_critic
186
+ critic/../engine@critic.model.fsdp_config: fsdp
187
+ critic/../optim@critic.optim: fsdp
188
+ model@actor_rollout_ref.model: hf_model
189
+ rollout@actor_rollout_ref.rollout: rollout
190
+ ref@actor_rollout_ref.ref: dp_ref
191
+ ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp
192
+ data: legacy_data
193
+ actor@actor_rollout_ref.actor: dp_actor
194
+ actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp
195
+ actor/../optim@actor_rollout_ref.actor.optim: fsdp
196
+ hydra/env: default
197
+ hydra/callbacks: null
198
+ hydra/job_logging: default
199
+ hydra/hydra_logging: default
200
+ hydra/hydra_help: default
201
+ hydra/help: default
202
+ hydra/sweeper: basic
203
+ hydra/launcher: basic
204
+ hydra/output: default
205
+ verbose: false
code/RL_model/verl/verl_train/outputs/2026-02-01/21-14-31/.hydra/overrides.yaml ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - algorithm.adv_estimator=grpo
2
+ - data.train_files=/home/mshahidul/data/gsm8k/train.parquet
3
+ - data.val_files=/home/mshahidul/data/gsm8k/test.parquet
4
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
5
+ - data.train_batch_size=1024
6
+ - data.max_prompt_length=512
7
+ - data.max_response_length=1024
8
+ - data.filter_overlong_prompts=True
9
+ - data.truncation=error
10
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
11
+ - actor_rollout_ref.actor.optim.lr=1e-6
12
+ - actor_rollout_ref.model.use_remove_padding=True
13
+ - actor_rollout_ref.actor.ppo_mini_batch_size=512
14
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=64
15
+ - actor_rollout_ref.actor.use_kl_loss=True
16
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
17
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
18
+ - actor_rollout_ref.actor.entropy_coeff=0
19
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
20
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
21
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
22
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64
23
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
24
+ - actor_rollout_ref.rollout.name=vllm
25
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.7
26
+ - actor_rollout_ref.rollout.n=5
27
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=64
28
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
29
+ - algorithm.use_kl_in_reward=False
30
+ - trainer.critic_warmup=0
31
+ - trainer.logger=["console","wandb"]
32
+ - trainer.project_name=readctrl-verl
33
+ - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
34
+ - trainer.n_gpus_per_node=2
35
+ - trainer.nnodes=1
36
+ - trainer.save_freq=20
37
+ - trainer.test_freq=5
38
+ - trainer.total_epochs=15
code/RL_model/verl/verl_train/outputs/2026-02-01/21-36-33/.hydra/config.yaml ADDED
@@ -0,0 +1,648 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ actor_rollout_ref:
2
+ actor:
3
+ optim:
4
+ _target_: verl.workers.config.FSDPOptimizerConfig
5
+ optimizer: AdamW
6
+ optimizer_impl: torch.optim
7
+ lr: 1.0e-06
8
+ lr_warmup_steps_ratio: 0.0
9
+ total_training_steps: -1
10
+ weight_decay: 0.01
11
+ lr_warmup_steps: -1
12
+ betas:
13
+ - 0.9
14
+ - 0.999
15
+ clip_grad: 1.0
16
+ min_lr_ratio: 0.0
17
+ num_cycles: 0.5
18
+ lr_scheduler_type: constant
19
+ warmup_style: null
20
+ override_optimizer_config: null
21
+ fsdp_config:
22
+ _target_: verl.workers.config.FSDPEngineConfig
23
+ wrap_policy:
24
+ min_num_params: 0
25
+ param_offload: false
26
+ optimizer_offload: false
27
+ offload_policy: false
28
+ reshard_after_forward: true
29
+ fsdp_size: -1
30
+ forward_prefetch: false
31
+ model_dtype: fp32
32
+ use_orig_params: false
33
+ seed: 42
34
+ full_determinism: false
35
+ ulysses_sequence_parallel_size: 1
36
+ entropy_from_logits_with_chunking: false
37
+ use_torch_compile: true
38
+ entropy_checkpointing: false
39
+ forward_only: false
40
+ strategy: fsdp
41
+ dtype: bfloat16
42
+ _target_: verl.workers.config.FSDPActorConfig
43
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
44
+ strategy: fsdp
45
+ ppo_mini_batch_size: 512
46
+ ppo_micro_batch_size: null
47
+ ppo_micro_batch_size_per_gpu: 64
48
+ use_dynamic_bsz: false
49
+ ppo_max_token_len_per_gpu: 16384
50
+ clip_ratio: 0.2
51
+ clip_ratio_low: 0.2
52
+ clip_ratio_high: 0.2
53
+ tau_pos: 1.0
54
+ tau_neg: 1.05
55
+ freeze_vision_tower: false
56
+ policy_loss:
57
+ _target_: verl.workers.config.PolicyLossConfig
58
+ loss_mode: vanilla
59
+ clip_cov_ratio: 0.0002
60
+ clip_cov_lb: 1.0
61
+ clip_cov_ub: 5.0
62
+ kl_cov_ratio: 0.0002
63
+ ppo_kl_coef: 0.1
64
+ clip_ratio_c: 3.0
65
+ loss_agg_mode: token-mean
66
+ loss_scale_factor: null
67
+ entropy_coeff: 0
68
+ calculate_entropy: false
69
+ use_kl_loss: true
70
+ use_prefix_grouper: false
71
+ use_torch_compile: true
72
+ kl_loss_coef: 0.001
73
+ kl_loss_type: low_var_kl
74
+ ppo_epochs: 1
75
+ shuffle: false
76
+ data_loader_seed: 42
77
+ checkpoint:
78
+ _target_: verl.trainer.config.CheckpointConfig
79
+ save_contents:
80
+ - model
81
+ - optimizer
82
+ - extra
83
+ load_contents: ${.save_contents}
84
+ async_save: false
85
+ use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
86
+ profiler:
87
+ _target_: verl.utils.profiler.ProfilerConfig
88
+ tool: ${oc.select:global_profiler.tool,null}
89
+ enable: false
90
+ all_ranks: false
91
+ ranks: []
92
+ save_path: ${oc.select:global_profiler.save_path,null}
93
+ tool_config:
94
+ nsys:
95
+ _target_: verl.utils.profiler.config.NsightToolConfig
96
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
97
+ npu:
98
+ _target_: verl.utils.profiler.config.NPUToolConfig
99
+ contents: []
100
+ level: level0
101
+ analysis: true
102
+ discrete: false
103
+ torch:
104
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
105
+ contents: []
106
+ discrete: false
107
+ torch_memory:
108
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
109
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
110
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
111
+ router_replay:
112
+ _target_: verl.workers.config.RouterReplayConfig
113
+ mode: disabled
114
+ record_file: null
115
+ replay_file: null
116
+ grad_clip: 1.0
117
+ ulysses_sequence_parallel_size: 1
118
+ entropy_from_logits_with_chunking: false
119
+ entropy_checkpointing: false
120
+ use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
121
+ calculate_sum_pi_squared: false
122
+ sum_pi_squared_checkpointing: false
123
+ ref:
124
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
125
+ strategy: ${actor_rollout_ref.actor.strategy}
126
+ use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
127
+ log_prob_micro_batch_size: null
128
+ log_prob_micro_batch_size_per_gpu: 64
129
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
130
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
131
+ profiler:
132
+ _target_: verl.utils.profiler.ProfilerConfig
133
+ tool: ${oc.select:global_profiler.tool,null}
134
+ enable: false
135
+ all_ranks: false
136
+ ranks: []
137
+ save_path: ${oc.select:global_profiler.save_path,null}
138
+ tool_config:
139
+ nsys:
140
+ _target_: verl.utils.profiler.config.NsightToolConfig
141
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
142
+ npu:
143
+ _target_: verl.utils.profiler.config.NPUToolConfig
144
+ contents: []
145
+ level: level0
146
+ analysis: true
147
+ discrete: false
148
+ torch:
149
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
150
+ contents: []
151
+ discrete: false
152
+ torch_memory:
153
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
154
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
155
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
156
+ router_replay:
157
+ _target_: verl.workers.config.RouterReplayConfig
158
+ mode: disabled
159
+ record_file: null
160
+ replay_file: null
161
+ fsdp_config:
162
+ _target_: verl.workers.config.FSDPEngineConfig
163
+ wrap_policy:
164
+ min_num_params: 0
165
+ param_offload: false
166
+ optimizer_offload: false
167
+ offload_policy: false
168
+ reshard_after_forward: true
169
+ fsdp_size: -1
170
+ forward_prefetch: false
171
+ model_dtype: fp32
172
+ use_orig_params: false
173
+ seed: 42
174
+ full_determinism: false
175
+ ulysses_sequence_parallel_size: 1
176
+ entropy_from_logits_with_chunking: false
177
+ use_torch_compile: true
178
+ entropy_checkpointing: false
179
+ forward_only: true
180
+ strategy: fsdp
181
+ dtype: bfloat16
182
+ _target_: verl.workers.config.FSDPActorConfig
183
+ ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
184
+ entropy_from_logits_with_chunking: false
185
+ entropy_checkpointing: false
186
+ rollout:
187
+ _target_: verl.workers.config.RolloutConfig
188
+ name: vllm
189
+ mode: async
190
+ temperature: 1.0
191
+ top_k: -1
192
+ top_p: 1
193
+ prompt_length: ${oc.select:data.max_prompt_length,512}
194
+ response_length: ${oc.select:data.max_response_length,512}
195
+ dtype: bfloat16
196
+ gpu_memory_utilization: 0.7
197
+ ignore_eos: false
198
+ enforce_eager: false
199
+ cudagraph_capture_sizes: null
200
+ free_cache_engine: true
201
+ tensor_model_parallel_size: 1
202
+ data_parallel_size: 1
203
+ expert_parallel_size: 1
204
+ pipeline_model_parallel_size: 1
205
+ max_num_batched_tokens: 8192
206
+ max_model_len: null
207
+ max_num_seqs: 1024
208
+ enable_chunked_prefill: true
209
+ enable_prefix_caching: true
210
+ logprobs_mode: processed_logprobs
211
+ scheduling_policy: fcfs
212
+ load_format: dummy
213
+ log_prob_micro_batch_size: null
214
+ log_prob_micro_batch_size_per_gpu: 64
215
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
216
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
217
+ disable_log_stats: true
218
+ do_sample: true
219
+ 'n': 5
220
+ over_sample_rate: 0
221
+ multi_stage_wake_up: false
222
+ engine_kwargs:
223
+ vllm: {}
224
+ sglang: {}
225
+ trtllm: {}
226
+ val_kwargs:
227
+ _target_: verl.workers.config.SamplingConfig
228
+ top_k: -1
229
+ top_p: 1.0
230
+ temperature: 0
231
+ 'n': 1
232
+ do_sample: false
233
+ multi_turn:
234
+ _target_: verl.workers.config.MultiTurnConfig
235
+ enable: false
236
+ max_assistant_turns: null
237
+ tool_config_path: null
238
+ max_user_turns: null
239
+ max_parallel_calls: 1
240
+ max_tool_response_length: 256
241
+ tool_response_truncate_side: middle
242
+ interaction_config_path: null
243
+ use_inference_chat_template: false
244
+ tokenization_sanity_check_mode: strict
245
+ format: hermes
246
+ num_repeat_rollouts: null
247
+ calculate_log_probs: false
248
+ agent:
249
+ _target_: verl.workers.config.AgentLoopConfig
250
+ num_workers: 8
251
+ default_agent_loop: single_turn_agent
252
+ agent_loop_config_path: null
253
+ custom_async_server:
254
+ _target_: verl.workers.config.CustomAsyncServerConfig
255
+ path: null
256
+ name: null
257
+ checkpoint_engine:
258
+ _target_: verl.workers.config.CheckpointEngineConfig
259
+ backend: naive
260
+ update_weights_bucket_megabytes: 2048
261
+ engine_kwargs: {}
262
+ trace:
263
+ _target_: verl.workers.config.TraceConfig
264
+ backend: null
265
+ token2text: false
266
+ max_samples_per_step_per_worker: null
267
+ skip_rollout: false
268
+ skip_dump_dir: /tmp/rollout_dump
269
+ skip_tokenizer_init: true
270
+ enable_rollout_routing_replay: false
271
+ profiler:
272
+ _target_: verl.utils.profiler.ProfilerConfig
273
+ tool: ${oc.select:global_profiler.tool,null}
274
+ enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
275
+ all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
276
+ ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
277
+ save_path: ${oc.select:global_profiler.save_path,null}
278
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
279
+ prometheus:
280
+ _target_: verl.workers.config.PrometheusConfig
281
+ enable: false
282
+ port: 9090
283
+ file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml
284
+ served_model_name: ${oc.select:actor_rollout_ref.model.path,null}
285
+ quantization: null
286
+ quantization_config_file: null
287
+ mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
288
+ layered_summon: false
289
+ model:
290
+ _target_: verl.workers.config.HFModelConfig
291
+ path: Qwen/Qwen3-4B-Instruct-2507
292
+ hf_config_path: null
293
+ tokenizer_path: null
294
+ use_shm: false
295
+ trust_remote_code: false
296
+ custom_chat_template: null
297
+ external_lib: null
298
+ override_config: {}
299
+ enable_gradient_checkpointing: true
300
+ enable_activation_offload: false
301
+ use_remove_padding: true
302
+ lora_rank: 0
303
+ lora_alpha: 16
304
+ target_modules: all-linear
305
+ exclude_modules: null
306
+ lora_adapter_path: null
307
+ use_liger: false
308
+ use_fused_kernels: false
309
+ fused_kernel_options:
310
+ impl_backend: torch
311
+ tiled_mlp:
312
+ enabled: false
313
+ num_shards: 4
314
+ mtp:
315
+ _target_: verl.workers.config.MtpConfig
316
+ enable: false
317
+ enable_train: false
318
+ enable_rollout: false
319
+ detach_encoder: false
320
+ mtp_loss_scaling_factor: 0.1
321
+ speculative_algorithm: EAGLE
322
+ speculative_num_steps: 3
323
+ speculative_eagle_topk: 1
324
+ speculative_num_draft_tokens: 4
325
+ method: mtp
326
+ num_speculative_tokens: 1
327
+ hybrid_engine: true
328
+ nccl_timeout: 600
329
+ data:
330
+ tokenizer: null
331
+ use_shm: false
332
+ train_files: /home/mshahidul/data/gsm8k/train.parquet
333
+ val_files: /home/mshahidul/data/gsm8k/test.parquet
334
+ train_max_samples: -1
335
+ val_max_samples: -1
336
+ prompt_key: prompt
337
+ reward_fn_key: data_source
338
+ max_prompt_length: 512
339
+ max_response_length: 1024
340
+ train_batch_size: 1024
341
+ val_batch_size: null
342
+ tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path,
343
+ null}
344
+ return_raw_input_ids: false
345
+ return_raw_chat: true
346
+ return_full_prompt: false
347
+ shuffle: true
348
+ seed: null
349
+ dataloader_num_workers: 8
350
+ image_patch_size: 14
351
+ validation_shuffle: false
352
+ filter_overlong_prompts: true
353
+ filter_overlong_prompts_workers: 1
354
+ truncation: error
355
+ image_key: images
356
+ video_key: videos
357
+ trust_remote_code: false
358
+ custom_cls:
359
+ path: null
360
+ name: null
361
+ return_multi_modal_inputs: true
362
+ sampler:
363
+ class_path: null
364
+ class_name: null
365
+ datagen:
366
+ path: null
367
+ name: null
368
+ apply_chat_template_kwargs: {}
369
+ reward_manager:
370
+ _target_: verl.trainer.config.config.RewardManagerConfig
371
+ source: register
372
+ name: ${oc.select:reward_model.reward_manager,naive}
373
+ module:
374
+ _target_: verl.trainer.config.config.ModuleConfig
375
+ path: null
376
+ name: custom_reward_manager
377
+ critic:
378
+ optim:
379
+ _target_: verl.workers.config.FSDPOptimizerConfig
380
+ optimizer: AdamW
381
+ optimizer_impl: torch.optim
382
+ lr: 1.0e-05
383
+ lr_warmup_steps_ratio: 0.0
384
+ total_training_steps: -1
385
+ weight_decay: 0.01
386
+ lr_warmup_steps: -1
387
+ betas:
388
+ - 0.9
389
+ - 0.999
390
+ clip_grad: 1.0
391
+ min_lr_ratio: 0.0
392
+ num_cycles: 0.5
393
+ lr_scheduler_type: constant
394
+ warmup_style: null
395
+ override_optimizer_config: null
396
+ model:
397
+ fsdp_config:
398
+ _target_: verl.workers.config.FSDPEngineConfig
399
+ wrap_policy:
400
+ min_num_params: 0
401
+ param_offload: false
402
+ optimizer_offload: false
403
+ offload_policy: false
404
+ reshard_after_forward: true
405
+ fsdp_size: -1
406
+ forward_prefetch: false
407
+ model_dtype: fp32
408
+ use_orig_params: false
409
+ seed: 42
410
+ full_determinism: false
411
+ ulysses_sequence_parallel_size: 1
412
+ entropy_from_logits_with_chunking: false
413
+ use_torch_compile: true
414
+ entropy_checkpointing: false
415
+ forward_only: false
416
+ strategy: fsdp
417
+ dtype: bfloat16
418
+ path: ~/models/deepseek-llm-7b-chat
419
+ tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
420
+ override_config: {}
421
+ external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
422
+ trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
423
+ _target_: verl.workers.config.FSDPCriticModelCfg
424
+ use_shm: false
425
+ enable_gradient_checkpointing: true
426
+ enable_activation_offload: false
427
+ use_remove_padding: false
428
+ lora_rank: 0
429
+ lora_alpha: 16
430
+ target_modules: all-linear
431
+ tiled_mlp:
432
+ enabled: false
433
+ num_shards: 4
434
+ _target_: verl.workers.config.FSDPCriticConfig
435
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
436
+ strategy: fsdp
437
+ enable: null
438
+ ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
439
+ ppo_micro_batch_size: null
440
+ ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
441
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
442
+ ppo_max_token_len_per_gpu: 32768
443
+ forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
444
+ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
445
+ shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
446
+ data_loader_seed: 42
447
+ cliprange_value: 0.5
448
+ loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
449
+ checkpoint:
450
+ _target_: verl.trainer.config.CheckpointConfig
451
+ save_contents:
452
+ - model
453
+ - optimizer
454
+ - extra
455
+ load_contents: ${.save_contents}
456
+ async_save: false
457
+ profiler:
458
+ _target_: verl.utils.profiler.ProfilerConfig
459
+ tool: ${oc.select:global_profiler.tool,null}
460
+ enable: false
461
+ all_ranks: false
462
+ ranks: []
463
+ save_path: ${oc.select:global_profiler.save_path,null}
464
+ tool_config:
465
+ nsys:
466
+ _target_: verl.utils.profiler.config.NsightToolConfig
467
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
468
+ npu:
469
+ _target_: verl.utils.profiler.config.NPUToolConfig
470
+ contents: []
471
+ level: level0
472
+ analysis: true
473
+ discrete: false
474
+ torch:
475
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
476
+ contents: []
477
+ discrete: false
478
+ torch_memory:
479
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
480
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
481
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
482
+ forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
483
+ forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
484
+ ulysses_sequence_parallel_size: 1
485
+ grad_clip: 1.0
486
+ reward_model:
487
+ enable: false
488
+ enable_resource_pool: false
489
+ n_gpus_per_node: 8
490
+ nnodes: 0
491
+ strategy: fsdp
492
+ model:
493
+ input_tokenizer: ${actor_rollout_ref.model.path}
494
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
495
+ external_lib: ${actor_rollout_ref.model.external_lib}
496
+ trust_remote_code: false
497
+ override_config: {}
498
+ use_shm: false
499
+ use_remove_padding: false
500
+ use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
501
+ fsdp_config:
502
+ _target_: verl.workers.config.FSDPEngineConfig
503
+ wrap_policy:
504
+ min_num_params: 0
505
+ param_offload: false
506
+ reshard_after_forward: true
507
+ fsdp_size: -1
508
+ forward_prefetch: false
509
+ micro_batch_size: null
510
+ micro_batch_size_per_gpu: null
511
+ max_length: null
512
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
513
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
514
+ reward_manager: naive
515
+ reward_loop_source: register
516
+ reward_loop_module_path: null
517
+ reward_loop_class_name: null
518
+ launch_reward_fn_async: false
519
+ sandbox_fusion:
520
+ url: null
521
+ max_concurrent: 64
522
+ memory_limit_mb: 1024
523
+ profiler:
524
+ _target_: verl.utils.profiler.ProfilerConfig
525
+ tool: ${oc.select:global_profiler.tool,null}
526
+ enable: false
527
+ all_ranks: false
528
+ ranks: []
529
+ save_path: ${oc.select:global_profiler.save_path,null}
530
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
531
+ ulysses_sequence_parallel_size: 1
532
+ use_reward_loop: true
533
+ num_workers: 1
534
+ rollout:
535
+ _target_: verl.workers.config.RolloutConfig
536
+ name: ???
537
+ dtype: bfloat16
538
+ gpu_memory_utilization: 0.5
539
+ enforce_eager: true
540
+ cudagraph_capture_sizes: null
541
+ free_cache_engine: true
542
+ data_parallel_size: 1
543
+ expert_parallel_size: 1
544
+ tensor_model_parallel_size: 2
545
+ max_num_batched_tokens: 8192
546
+ max_model_len: null
547
+ max_num_seqs: 1024
548
+ load_format: auto
549
+ engine_kwargs: {}
550
+ limit_images: null
551
+ enable_chunked_prefill: true
552
+ enable_prefix_caching: true
553
+ disable_log_stats: true
554
+ skip_tokenizer_init: false
555
+ prompt_length: 2048
556
+ response_length: 2048
557
+ algorithm:
558
+ rollout_correction:
559
+ rollout_is: null
560
+ rollout_is_threshold: 2.0
561
+ rollout_rs: null
562
+ rollout_rs_threshold: null
563
+ bypass_mode: false
564
+ loss_type: ppo_clip
565
+ rollout_is_batch_normalize: false
566
+ _target_: verl.trainer.config.AlgoConfig
567
+ gamma: 1.0
568
+ lam: 1.0
569
+ adv_estimator: grpo
570
+ norm_adv_by_std_in_grpo: true
571
+ use_kl_in_reward: false
572
+ kl_penalty: kl
573
+ kl_ctrl:
574
+ _target_: verl.trainer.config.KLControlConfig
575
+ type: fixed
576
+ kl_coef: 0.001
577
+ horizon: 10000
578
+ target_kl: 0.1
579
+ use_pf_ppo: false
580
+ pf_ppo:
581
+ reweight_method: pow
582
+ weight_pow: 2.0
583
+ custom_reward_function:
584
+ path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
585
+ name: compute_score
586
+ trainer:
587
+ balance_batch: true
588
+ total_epochs: 15
589
+ total_training_steps: null
590
+ project_name: readctrl-verl
591
+ experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs
592
+ logger:
593
+ - console
594
+ - wandb
595
+ log_val_generations: 0
596
+ rollout_data_dir: null
597
+ validation_data_dir: null
598
+ nnodes: 1
599
+ n_gpus_per_node: 2
600
+ save_freq: 20
601
+ esi_redundant_time: 0
602
+ resume_mode: auto
603
+ resume_from_path: null
604
+ val_before_train: true
605
+ val_only: false
606
+ test_freq: 5
607
+ critic_warmup: 0
608
+ default_hdfs_dir: null
609
+ del_local_ckpt_after_load: false
610
+ default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
611
+ max_actor_ckpt_to_keep: null
612
+ max_critic_ckpt_to_keep: null
613
+ ray_wait_register_center_timeout: 300
614
+ device: cuda
615
+ use_legacy_worker_impl: auto
616
+ global_profiler:
617
+ _target_: verl.utils.profiler.ProfilerConfig
618
+ tool: null
619
+ steps: null
620
+ profile_continuous_steps: false
621
+ save_path: outputs/profile
622
+ global_tool_config:
623
+ nsys:
624
+ _target_: verl.utils.profiler.config.NsightToolConfig
625
+ discrete: false
626
+ controller_nsight_options:
627
+ trace: cuda,nvtx,cublas,ucx
628
+ cuda-memory-usage: 'true'
629
+ cuda-graph-trace: graph
630
+ worker_nsight_options:
631
+ trace: cuda,nvtx,cublas,ucx
632
+ cuda-memory-usage: 'true'
633
+ cuda-graph-trace: graph
634
+ capture-range: cudaProfilerApi
635
+ capture-range-end: null
636
+ kill: none
637
+ torch_memory:
638
+ trace_alloc_max_entries: 100000
639
+ stack_depth: 32
640
+ context: all
641
+ stacks: all
642
+ kw_args: {}
643
+ transfer_queue:
644
+ enable: false
645
+ ray_kwargs:
646
+ ray_init:
647
+ num_cpus: null
648
+ timeline_json_file: null
code/RL_model/verl/verl_train/outputs/2026-02-01/21-36-33/.hydra/hydra.yaml ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task:
115
+ - algorithm.adv_estimator=grpo
116
+ - data.train_files=/home/mshahidul/data/gsm8k/train.parquet
117
+ - data.val_files=/home/mshahidul/data/gsm8k/test.parquet
118
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
119
+ - data.train_batch_size=1024
120
+ - data.max_prompt_length=512
121
+ - data.max_response_length=1024
122
+ - data.filter_overlong_prompts=True
123
+ - data.truncation=error
124
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
125
+ - actor_rollout_ref.actor.optim.lr=1e-6
126
+ - actor_rollout_ref.model.use_remove_padding=True
127
+ - actor_rollout_ref.actor.ppo_mini_batch_size=512
128
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=64
129
+ - actor_rollout_ref.actor.use_kl_loss=True
130
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
131
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
132
+ - actor_rollout_ref.actor.entropy_coeff=0
133
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
134
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
135
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
136
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64
137
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
138
+ - actor_rollout_ref.rollout.name=vllm
139
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.7
140
+ - actor_rollout_ref.rollout.n=5
141
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=64
142
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
143
+ - algorithm.use_kl_in_reward=False
144
+ - trainer.critic_warmup=0
145
+ - trainer.logger=["console","wandb"]
146
+ - trainer.project_name=readctrl-verl
147
+ - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
148
+ - trainer.n_gpus_per_node=2
149
+ - trainer.nnodes=1
150
+ - trainer.save_freq=20
151
+ - trainer.test_freq=5
152
+ - trainer.total_epochs=15
153
+ job:
154
+ name: main_ppo
155
+ chdir: null
156
+ override_dirname: actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=64,actor_rollout_ref.actor.ppo_mini_batch_size=512,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=64,actor_rollout_ref.rollout.gpu_memory_utilization=0.7,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64,actor_rollout_ref.rollout.n=5,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=512,data.max_response_length=1024,data.train_batch_size=1024,data.train_files=/home/mshahidul/data/gsm8k/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/data/gsm8k/test.parquet,trainer.critic_warmup=0,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15
157
+ id: ???
158
+ num: ???
159
+ config_name: ppo_trainer
160
+ env_set: {}
161
+ env_copy: []
162
+ config:
163
+ override_dirname:
164
+ kv_sep: '='
165
+ item_sep: ','
166
+ exclude_keys: []
167
+ runtime:
168
+ version: 1.3.2
169
+ version_base: '1.3'
170
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train
171
+ config_sources:
172
+ - path: hydra.conf
173
+ schema: pkg
174
+ provider: hydra
175
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config
176
+ schema: file
177
+ provider: main
178
+ - path: ''
179
+ schema: structured
180
+ provider: schema
181
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-01/21-36-33
182
+ choices:
183
+ algorithm@algorithm.rollout_correction: rollout_correction
184
+ reward_model: dp_reward_loop
185
+ critic: dp_critic
186
+ critic/../engine@critic.model.fsdp_config: fsdp
187
+ critic/../optim@critic.optim: fsdp
188
+ model@actor_rollout_ref.model: hf_model
189
+ rollout@actor_rollout_ref.rollout: rollout
190
+ ref@actor_rollout_ref.ref: dp_ref
191
+ ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp
192
+ data: legacy_data
193
+ actor@actor_rollout_ref.actor: dp_actor
194
+ actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp
195
+ actor/../optim@actor_rollout_ref.actor.optim: fsdp
196
+ hydra/env: default
197
+ hydra/callbacks: null
198
+ hydra/job_logging: default
199
+ hydra/hydra_logging: default
200
+ hydra/hydra_help: default
201
+ hydra/help: default
202
+ hydra/sweeper: basic
203
+ hydra/launcher: basic
204
+ hydra/output: default
205
+ verbose: false
code/RL_model/verl/verl_train/outputs/2026-02-01/21-36-33/.hydra/overrides.yaml ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - algorithm.adv_estimator=grpo
2
+ - data.train_files=/home/mshahidul/data/gsm8k/train.parquet
3
+ - data.val_files=/home/mshahidul/data/gsm8k/test.parquet
4
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
5
+ - data.train_batch_size=1024
6
+ - data.max_prompt_length=512
7
+ - data.max_response_length=1024
8
+ - data.filter_overlong_prompts=True
9
+ - data.truncation=error
10
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
11
+ - actor_rollout_ref.actor.optim.lr=1e-6
12
+ - actor_rollout_ref.model.use_remove_padding=True
13
+ - actor_rollout_ref.actor.ppo_mini_batch_size=512
14
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=64
15
+ - actor_rollout_ref.actor.use_kl_loss=True
16
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
17
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
18
+ - actor_rollout_ref.actor.entropy_coeff=0
19
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
20
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
21
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
22
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64
23
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
24
+ - actor_rollout_ref.rollout.name=vllm
25
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.7
26
+ - actor_rollout_ref.rollout.n=5
27
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=64
28
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
29
+ - algorithm.use_kl_in_reward=False
30
+ - trainer.critic_warmup=0
31
+ - trainer.logger=["console","wandb"]
32
+ - trainer.project_name=readctrl-verl
33
+ - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
34
+ - trainer.n_gpus_per_node=2
35
+ - trainer.nnodes=1
36
+ - trainer.save_freq=20
37
+ - trainer.test_freq=5
38
+ - trainer.total_epochs=15
code/RL_model/verl/verl_train/outputs/2026-02-01/22-02-10/.hydra/config.yaml ADDED
@@ -0,0 +1,648 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ actor_rollout_ref:
2
+ actor:
3
+ optim:
4
+ _target_: verl.workers.config.FSDPOptimizerConfig
5
+ optimizer: AdamW
6
+ optimizer_impl: torch.optim
7
+ lr: 1.0e-06
8
+ lr_warmup_steps_ratio: 0.0
9
+ total_training_steps: -1
10
+ weight_decay: 0.01
11
+ lr_warmup_steps: -1
12
+ betas:
13
+ - 0.9
14
+ - 0.999
15
+ clip_grad: 1.0
16
+ min_lr_ratio: 0.0
17
+ num_cycles: 0.5
18
+ lr_scheduler_type: constant
19
+ warmup_style: null
20
+ override_optimizer_config: null
21
+ fsdp_config:
22
+ _target_: verl.workers.config.FSDPEngineConfig
23
+ wrap_policy:
24
+ min_num_params: 0
25
+ param_offload: false
26
+ optimizer_offload: false
27
+ offload_policy: false
28
+ reshard_after_forward: true
29
+ fsdp_size: -1
30
+ forward_prefetch: false
31
+ model_dtype: fp32
32
+ use_orig_params: false
33
+ seed: 42
34
+ full_determinism: false
35
+ ulysses_sequence_parallel_size: 1
36
+ entropy_from_logits_with_chunking: false
37
+ use_torch_compile: true
38
+ entropy_checkpointing: false
39
+ forward_only: false
40
+ strategy: fsdp
41
+ dtype: bfloat16
42
+ _target_: verl.workers.config.FSDPActorConfig
43
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
44
+ strategy: fsdp
45
+ ppo_mini_batch_size: 512
46
+ ppo_micro_batch_size: null
47
+ ppo_micro_batch_size_per_gpu: 64
48
+ use_dynamic_bsz: false
49
+ ppo_max_token_len_per_gpu: 16384
50
+ clip_ratio: 0.2
51
+ clip_ratio_low: 0.2
52
+ clip_ratio_high: 0.2
53
+ tau_pos: 1.0
54
+ tau_neg: 1.05
55
+ freeze_vision_tower: false
56
+ policy_loss:
57
+ _target_: verl.workers.config.PolicyLossConfig
58
+ loss_mode: vanilla
59
+ clip_cov_ratio: 0.0002
60
+ clip_cov_lb: 1.0
61
+ clip_cov_ub: 5.0
62
+ kl_cov_ratio: 0.0002
63
+ ppo_kl_coef: 0.1
64
+ clip_ratio_c: 3.0
65
+ loss_agg_mode: token-mean
66
+ loss_scale_factor: null
67
+ entropy_coeff: 0
68
+ calculate_entropy: false
69
+ use_kl_loss: true
70
+ use_prefix_grouper: false
71
+ use_torch_compile: true
72
+ kl_loss_coef: 0.001
73
+ kl_loss_type: low_var_kl
74
+ ppo_epochs: 1
75
+ shuffle: false
76
+ data_loader_seed: 42
77
+ checkpoint:
78
+ _target_: verl.trainer.config.CheckpointConfig
79
+ save_contents:
80
+ - model
81
+ - optimizer
82
+ - extra
83
+ load_contents: ${.save_contents}
84
+ async_save: false
85
+ use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
86
+ profiler:
87
+ _target_: verl.utils.profiler.ProfilerConfig
88
+ tool: ${oc.select:global_profiler.tool,null}
89
+ enable: false
90
+ all_ranks: false
91
+ ranks: []
92
+ save_path: ${oc.select:global_profiler.save_path,null}
93
+ tool_config:
94
+ nsys:
95
+ _target_: verl.utils.profiler.config.NsightToolConfig
96
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
97
+ npu:
98
+ _target_: verl.utils.profiler.config.NPUToolConfig
99
+ contents: []
100
+ level: level0
101
+ analysis: true
102
+ discrete: false
103
+ torch:
104
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
105
+ contents: []
106
+ discrete: false
107
+ torch_memory:
108
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
109
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
110
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
111
+ router_replay:
112
+ _target_: verl.workers.config.RouterReplayConfig
113
+ mode: disabled
114
+ record_file: null
115
+ replay_file: null
116
+ grad_clip: 1.0
117
+ ulysses_sequence_parallel_size: 1
118
+ entropy_from_logits_with_chunking: false
119
+ entropy_checkpointing: false
120
+ use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
121
+ calculate_sum_pi_squared: false
122
+ sum_pi_squared_checkpointing: false
123
+ ref:
124
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
125
+ strategy: ${actor_rollout_ref.actor.strategy}
126
+ use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
127
+ log_prob_micro_batch_size: null
128
+ log_prob_micro_batch_size_per_gpu: 64
129
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
130
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
131
+ profiler:
132
+ _target_: verl.utils.profiler.ProfilerConfig
133
+ tool: ${oc.select:global_profiler.tool,null}
134
+ enable: false
135
+ all_ranks: false
136
+ ranks: []
137
+ save_path: ${oc.select:global_profiler.save_path,null}
138
+ tool_config:
139
+ nsys:
140
+ _target_: verl.utils.profiler.config.NsightToolConfig
141
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
142
+ npu:
143
+ _target_: verl.utils.profiler.config.NPUToolConfig
144
+ contents: []
145
+ level: level0
146
+ analysis: true
147
+ discrete: false
148
+ torch:
149
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
150
+ contents: []
151
+ discrete: false
152
+ torch_memory:
153
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
154
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
155
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
156
+ router_replay:
157
+ _target_: verl.workers.config.RouterReplayConfig
158
+ mode: disabled
159
+ record_file: null
160
+ replay_file: null
161
+ fsdp_config:
162
+ _target_: verl.workers.config.FSDPEngineConfig
163
+ wrap_policy:
164
+ min_num_params: 0
165
+ param_offload: false
166
+ optimizer_offload: false
167
+ offload_policy: false
168
+ reshard_after_forward: true
169
+ fsdp_size: -1
170
+ forward_prefetch: false
171
+ model_dtype: fp32
172
+ use_orig_params: false
173
+ seed: 42
174
+ full_determinism: false
175
+ ulysses_sequence_parallel_size: 1
176
+ entropy_from_logits_with_chunking: false
177
+ use_torch_compile: true
178
+ entropy_checkpointing: false
179
+ forward_only: true
180
+ strategy: fsdp
181
+ dtype: bfloat16
182
+ _target_: verl.workers.config.FSDPActorConfig
183
+ ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
184
+ entropy_from_logits_with_chunking: false
185
+ entropy_checkpointing: false
186
+ rollout:
187
+ _target_: verl.workers.config.RolloutConfig
188
+ name: vllm
189
+ mode: async
190
+ temperature: 1.0
191
+ top_k: -1
192
+ top_p: 1
193
+ prompt_length: ${oc.select:data.max_prompt_length,512}
194
+ response_length: ${oc.select:data.max_response_length,512}
195
+ dtype: bfloat16
196
+ gpu_memory_utilization: 0.7
197
+ ignore_eos: false
198
+ enforce_eager: false
199
+ cudagraph_capture_sizes: null
200
+ free_cache_engine: true
201
+ tensor_model_parallel_size: 1
202
+ data_parallel_size: 1
203
+ expert_parallel_size: 1
204
+ pipeline_model_parallel_size: 1
205
+ max_num_batched_tokens: 8192
206
+ max_model_len: null
207
+ max_num_seqs: 1024
208
+ enable_chunked_prefill: true
209
+ enable_prefix_caching: true
210
+ logprobs_mode: processed_logprobs
211
+ scheduling_policy: fcfs
212
+ load_format: dummy
213
+ log_prob_micro_batch_size: null
214
+ log_prob_micro_batch_size_per_gpu: 64
215
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
216
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
217
+ disable_log_stats: true
218
+ do_sample: true
219
+ 'n': 5
220
+ over_sample_rate: 0
221
+ multi_stage_wake_up: false
222
+ engine_kwargs:
223
+ vllm: {}
224
+ sglang: {}
225
+ trtllm: {}
226
+ val_kwargs:
227
+ _target_: verl.workers.config.SamplingConfig
228
+ top_k: -1
229
+ top_p: 1.0
230
+ temperature: 0
231
+ 'n': 1
232
+ do_sample: false
233
+ multi_turn:
234
+ _target_: verl.workers.config.MultiTurnConfig
235
+ enable: false
236
+ max_assistant_turns: null
237
+ tool_config_path: null
238
+ max_user_turns: null
239
+ max_parallel_calls: 1
240
+ max_tool_response_length: 256
241
+ tool_response_truncate_side: middle
242
+ interaction_config_path: null
243
+ use_inference_chat_template: false
244
+ tokenization_sanity_check_mode: strict
245
+ format: hermes
246
+ num_repeat_rollouts: null
247
+ calculate_log_probs: false
248
+ agent:
249
+ _target_: verl.workers.config.AgentLoopConfig
250
+ num_workers: 8
251
+ default_agent_loop: single_turn_agent
252
+ agent_loop_config_path: null
253
+ custom_async_server:
254
+ _target_: verl.workers.config.CustomAsyncServerConfig
255
+ path: null
256
+ name: null
257
+ checkpoint_engine:
258
+ _target_: verl.workers.config.CheckpointEngineConfig
259
+ backend: naive
260
+ update_weights_bucket_megabytes: 2048
261
+ engine_kwargs: {}
262
+ trace:
263
+ _target_: verl.workers.config.TraceConfig
264
+ backend: null
265
+ token2text: false
266
+ max_samples_per_step_per_worker: null
267
+ skip_rollout: false
268
+ skip_dump_dir: /tmp/rollout_dump
269
+ skip_tokenizer_init: true
270
+ enable_rollout_routing_replay: false
271
+ profiler:
272
+ _target_: verl.utils.profiler.ProfilerConfig
273
+ tool: ${oc.select:global_profiler.tool,null}
274
+ enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
275
+ all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
276
+ ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
277
+ save_path: ${oc.select:global_profiler.save_path,null}
278
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
279
+ prometheus:
280
+ _target_: verl.workers.config.PrometheusConfig
281
+ enable: false
282
+ port: 9090
283
+ file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml
284
+ served_model_name: ${oc.select:actor_rollout_ref.model.path,null}
285
+ quantization: null
286
+ quantization_config_file: null
287
+ mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
288
+ layered_summon: false
289
+ model:
290
+ _target_: verl.workers.config.HFModelConfig
291
+ path: Qwen/Qwen3-4B-Instruct-2507
292
+ hf_config_path: null
293
+ tokenizer_path: null
294
+ use_shm: false
295
+ trust_remote_code: false
296
+ custom_chat_template: null
297
+ external_lib: null
298
+ override_config: {}
299
+ enable_gradient_checkpointing: true
300
+ enable_activation_offload: false
301
+ use_remove_padding: true
302
+ lora_rank: 0
303
+ lora_alpha: 16
304
+ target_modules: all-linear
305
+ exclude_modules: null
306
+ lora_adapter_path: null
307
+ use_liger: false
308
+ use_fused_kernels: false
309
+ fused_kernel_options:
310
+ impl_backend: torch
311
+ tiled_mlp:
312
+ enabled: false
313
+ num_shards: 4
314
+ mtp:
315
+ _target_: verl.workers.config.MtpConfig
316
+ enable: false
317
+ enable_train: false
318
+ enable_rollout: false
319
+ detach_encoder: false
320
+ mtp_loss_scaling_factor: 0.1
321
+ speculative_algorithm: EAGLE
322
+ speculative_num_steps: 3
323
+ speculative_eagle_topk: 1
324
+ speculative_num_draft_tokens: 4
325
+ method: mtp
326
+ num_speculative_tokens: 1
327
+ hybrid_engine: true
328
+ nccl_timeout: 600
329
+ data:
330
+ tokenizer: null
331
+ use_shm: false
332
+ train_files: /home/mshahidul/data/gsm8k/train.parquet
333
+ val_files: /home/mshahidul/data/gsm8k/test.parquet
334
+ train_max_samples: -1
335
+ val_max_samples: -1
336
+ prompt_key: prompt
337
+ reward_fn_key: data_source
338
+ max_prompt_length: 512
339
+ max_response_length: 1024
340
+ train_batch_size: 1024
341
+ val_batch_size: null
342
+ tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path,
343
+ null}
344
+ return_raw_input_ids: false
345
+ return_raw_chat: true
346
+ return_full_prompt: false
347
+ shuffle: true
348
+ seed: null
349
+ dataloader_num_workers: 8
350
+ image_patch_size: 14
351
+ validation_shuffle: false
352
+ filter_overlong_prompts: true
353
+ filter_overlong_prompts_workers: 1
354
+ truncation: error
355
+ image_key: images
356
+ video_key: videos
357
+ trust_remote_code: false
358
+ custom_cls:
359
+ path: null
360
+ name: null
361
+ return_multi_modal_inputs: true
362
+ sampler:
363
+ class_path: null
364
+ class_name: null
365
+ datagen:
366
+ path: null
367
+ name: null
368
+ apply_chat_template_kwargs: {}
369
+ reward_manager:
370
+ _target_: verl.trainer.config.config.RewardManagerConfig
371
+ source: register
372
+ name: ${oc.select:reward_model.reward_manager,naive}
373
+ module:
374
+ _target_: verl.trainer.config.config.ModuleConfig
375
+ path: null
376
+ name: custom_reward_manager
377
+ critic:
378
+ optim:
379
+ _target_: verl.workers.config.FSDPOptimizerConfig
380
+ optimizer: AdamW
381
+ optimizer_impl: torch.optim
382
+ lr: 1.0e-05
383
+ lr_warmup_steps_ratio: 0.0
384
+ total_training_steps: -1
385
+ weight_decay: 0.01
386
+ lr_warmup_steps: -1
387
+ betas:
388
+ - 0.9
389
+ - 0.999
390
+ clip_grad: 1.0
391
+ min_lr_ratio: 0.0
392
+ num_cycles: 0.5
393
+ lr_scheduler_type: constant
394
+ warmup_style: null
395
+ override_optimizer_config: null
396
+ model:
397
+ fsdp_config:
398
+ _target_: verl.workers.config.FSDPEngineConfig
399
+ wrap_policy:
400
+ min_num_params: 0
401
+ param_offload: false
402
+ optimizer_offload: false
403
+ offload_policy: false
404
+ reshard_after_forward: true
405
+ fsdp_size: -1
406
+ forward_prefetch: false
407
+ model_dtype: fp32
408
+ use_orig_params: false
409
+ seed: 42
410
+ full_determinism: false
411
+ ulysses_sequence_parallel_size: 1
412
+ entropy_from_logits_with_chunking: false
413
+ use_torch_compile: true
414
+ entropy_checkpointing: false
415
+ forward_only: false
416
+ strategy: fsdp
417
+ dtype: bfloat16
418
+ path: ~/models/deepseek-llm-7b-chat
419
+ tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
420
+ override_config: {}
421
+ external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
422
+ trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
423
+ _target_: verl.workers.config.FSDPCriticModelCfg
424
+ use_shm: false
425
+ enable_gradient_checkpointing: true
426
+ enable_activation_offload: false
427
+ use_remove_padding: false
428
+ lora_rank: 0
429
+ lora_alpha: 16
430
+ target_modules: all-linear
431
+ tiled_mlp:
432
+ enabled: false
433
+ num_shards: 4
434
+ _target_: verl.workers.config.FSDPCriticConfig
435
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
436
+ strategy: fsdp
437
+ enable: null
438
+ ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
439
+ ppo_micro_batch_size: null
440
+ ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
441
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
442
+ ppo_max_token_len_per_gpu: 32768
443
+ forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
444
+ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
445
+ shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
446
+ data_loader_seed: 42
447
+ cliprange_value: 0.5
448
+ loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
449
+ checkpoint:
450
+ _target_: verl.trainer.config.CheckpointConfig
451
+ save_contents:
452
+ - model
453
+ - optimizer
454
+ - extra
455
+ load_contents: ${.save_contents}
456
+ async_save: false
457
+ profiler:
458
+ _target_: verl.utils.profiler.ProfilerConfig
459
+ tool: ${oc.select:global_profiler.tool,null}
460
+ enable: false
461
+ all_ranks: false
462
+ ranks: []
463
+ save_path: ${oc.select:global_profiler.save_path,null}
464
+ tool_config:
465
+ nsys:
466
+ _target_: verl.utils.profiler.config.NsightToolConfig
467
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
468
+ npu:
469
+ _target_: verl.utils.profiler.config.NPUToolConfig
470
+ contents: []
471
+ level: level0
472
+ analysis: true
473
+ discrete: false
474
+ torch:
475
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
476
+ contents: []
477
+ discrete: false
478
+ torch_memory:
479
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
480
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
481
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
482
+ forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
483
+ forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
484
+ ulysses_sequence_parallel_size: 1
485
+ grad_clip: 1.0
486
+ reward_model:
487
+ enable: false
488
+ enable_resource_pool: false
489
+ n_gpus_per_node: 8
490
+ nnodes: 0
491
+ strategy: fsdp
492
+ model:
493
+ input_tokenizer: ${actor_rollout_ref.model.path}
494
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
495
+ external_lib: ${actor_rollout_ref.model.external_lib}
496
+ trust_remote_code: false
497
+ override_config: {}
498
+ use_shm: false
499
+ use_remove_padding: false
500
+ use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
501
+ fsdp_config:
502
+ _target_: verl.workers.config.FSDPEngineConfig
503
+ wrap_policy:
504
+ min_num_params: 0
505
+ param_offload: false
506
+ reshard_after_forward: true
507
+ fsdp_size: -1
508
+ forward_prefetch: false
509
+ micro_batch_size: null
510
+ micro_batch_size_per_gpu: null
511
+ max_length: null
512
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
513
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
514
+ reward_manager: naive
515
+ reward_loop_source: register
516
+ reward_loop_module_path: null
517
+ reward_loop_class_name: null
518
+ launch_reward_fn_async: false
519
+ sandbox_fusion:
520
+ url: null
521
+ max_concurrent: 64
522
+ memory_limit_mb: 1024
523
+ profiler:
524
+ _target_: verl.utils.profiler.ProfilerConfig
525
+ tool: ${oc.select:global_profiler.tool,null}
526
+ enable: false
527
+ all_ranks: false
528
+ ranks: []
529
+ save_path: ${oc.select:global_profiler.save_path,null}
530
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
531
+ ulysses_sequence_parallel_size: 1
532
+ use_reward_loop: true
533
+ num_workers: 1
534
+ rollout:
535
+ _target_: verl.workers.config.RolloutConfig
536
+ name: ???
537
+ dtype: bfloat16
538
+ gpu_memory_utilization: 0.5
539
+ enforce_eager: true
540
+ cudagraph_capture_sizes: null
541
+ free_cache_engine: true
542
+ data_parallel_size: 1
543
+ expert_parallel_size: 1
544
+ tensor_model_parallel_size: 2
545
+ max_num_batched_tokens: 8192
546
+ max_model_len: null
547
+ max_num_seqs: 1024
548
+ load_format: auto
549
+ engine_kwargs: {}
550
+ limit_images: null
551
+ enable_chunked_prefill: true
552
+ enable_prefix_caching: true
553
+ disable_log_stats: true
554
+ skip_tokenizer_init: false
555
+ prompt_length: 2048
556
+ response_length: 2048
557
+ algorithm:
558
+ rollout_correction:
559
+ rollout_is: null
560
+ rollout_is_threshold: 2.0
561
+ rollout_rs: null
562
+ rollout_rs_threshold: null
563
+ bypass_mode: false
564
+ loss_type: ppo_clip
565
+ rollout_is_batch_normalize: false
566
+ _target_: verl.trainer.config.AlgoConfig
567
+ gamma: 1.0
568
+ lam: 1.0
569
+ adv_estimator: grpo
570
+ norm_adv_by_std_in_grpo: true
571
+ use_kl_in_reward: false
572
+ kl_penalty: kl
573
+ kl_ctrl:
574
+ _target_: verl.trainer.config.KLControlConfig
575
+ type: fixed
576
+ kl_coef: 0.001
577
+ horizon: 10000
578
+ target_kl: 0.1
579
+ use_pf_ppo: false
580
+ pf_ppo:
581
+ reweight_method: pow
582
+ weight_pow: 2.0
583
+ custom_reward_function:
584
+ path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
585
+ name: compute_score
586
+ trainer:
587
+ balance_batch: true
588
+ total_epochs: 15
589
+ total_training_steps: null
590
+ project_name: readctrl-verl
591
+ experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs
592
+ logger:
593
+ - console
594
+ - wandb
595
+ log_val_generations: 0
596
+ rollout_data_dir: null
597
+ validation_data_dir: null
598
+ nnodes: 1
599
+ n_gpus_per_node: 2
600
+ save_freq: 20
601
+ esi_redundant_time: 0
602
+ resume_mode: auto
603
+ resume_from_path: null
604
+ val_before_train: true
605
+ val_only: false
606
+ test_freq: 5
607
+ critic_warmup: 0
608
+ default_hdfs_dir: null
609
+ del_local_ckpt_after_load: false
610
+ default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
611
+ max_actor_ckpt_to_keep: null
612
+ max_critic_ckpt_to_keep: null
613
+ ray_wait_register_center_timeout: 300
614
+ device: cuda
615
+ use_legacy_worker_impl: auto
616
+ global_profiler:
617
+ _target_: verl.utils.profiler.ProfilerConfig
618
+ tool: null
619
+ steps: null
620
+ profile_continuous_steps: false
621
+ save_path: outputs/profile
622
+ global_tool_config:
623
+ nsys:
624
+ _target_: verl.utils.profiler.config.NsightToolConfig
625
+ discrete: false
626
+ controller_nsight_options:
627
+ trace: cuda,nvtx,cublas,ucx
628
+ cuda-memory-usage: 'true'
629
+ cuda-graph-trace: graph
630
+ worker_nsight_options:
631
+ trace: cuda,nvtx,cublas,ucx
632
+ cuda-memory-usage: 'true'
633
+ cuda-graph-trace: graph
634
+ capture-range: cudaProfilerApi
635
+ capture-range-end: null
636
+ kill: none
637
+ torch_memory:
638
+ trace_alloc_max_entries: 100000
639
+ stack_depth: 32
640
+ context: all
641
+ stacks: all
642
+ kw_args: {}
643
+ transfer_queue:
644
+ enable: false
645
+ ray_kwargs:
646
+ ray_init:
647
+ num_cpus: null
648
+ timeline_json_file: null
code/RL_model/verl/verl_train/outputs/2026-02-01/22-02-10/.hydra/hydra.yaml ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task:
115
+ - algorithm.adv_estimator=grpo
116
+ - data.train_files=/home/mshahidul/data/gsm8k/train.parquet
117
+ - data.val_files=/home/mshahidul/data/gsm8k/test.parquet
118
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
119
+ - data.train_batch_size=1024
120
+ - data.max_prompt_length=512
121
+ - data.max_response_length=1024
122
+ - data.filter_overlong_prompts=True
123
+ - data.truncation=error
124
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
125
+ - actor_rollout_ref.actor.optim.lr=1e-6
126
+ - actor_rollout_ref.model.use_remove_padding=True
127
+ - actor_rollout_ref.actor.ppo_mini_batch_size=512
128
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=64
129
+ - actor_rollout_ref.actor.use_kl_loss=True
130
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
131
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
132
+ - actor_rollout_ref.actor.entropy_coeff=0
133
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
134
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
135
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
136
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64
137
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
138
+ - actor_rollout_ref.rollout.name=vllm
139
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.7
140
+ - actor_rollout_ref.rollout.n=5
141
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=64
142
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
143
+ - algorithm.use_kl_in_reward=False
144
+ - trainer.critic_warmup=0
145
+ - trainer.logger=["console","wandb"]
146
+ - trainer.project_name=readctrl-verl
147
+ - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
148
+ - trainer.n_gpus_per_node=2
149
+ - trainer.nnodes=1
150
+ - trainer.save_freq=20
151
+ - trainer.test_freq=5
152
+ - trainer.total_epochs=15
153
+ job:
154
+ name: main_ppo
155
+ chdir: null
156
+ override_dirname: actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=64,actor_rollout_ref.actor.ppo_mini_batch_size=512,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=64,actor_rollout_ref.rollout.gpu_memory_utilization=0.7,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64,actor_rollout_ref.rollout.n=5,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=512,data.max_response_length=1024,data.train_batch_size=1024,data.train_files=/home/mshahidul/data/gsm8k/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/data/gsm8k/test.parquet,trainer.critic_warmup=0,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15
157
+ id: ???
158
+ num: ???
159
+ config_name: ppo_trainer
160
+ env_set: {}
161
+ env_copy: []
162
+ config:
163
+ override_dirname:
164
+ kv_sep: '='
165
+ item_sep: ','
166
+ exclude_keys: []
167
+ runtime:
168
+ version: 1.3.2
169
+ version_base: '1.3'
170
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train
171
+ config_sources:
172
+ - path: hydra.conf
173
+ schema: pkg
174
+ provider: hydra
175
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config
176
+ schema: file
177
+ provider: main
178
+ - path: ''
179
+ schema: structured
180
+ provider: schema
181
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-01/22-02-10
182
+ choices:
183
+ algorithm@algorithm.rollout_correction: rollout_correction
184
+ reward_model: dp_reward_loop
185
+ critic: dp_critic
186
+ critic/../engine@critic.model.fsdp_config: fsdp
187
+ critic/../optim@critic.optim: fsdp
188
+ model@actor_rollout_ref.model: hf_model
189
+ rollout@actor_rollout_ref.rollout: rollout
190
+ ref@actor_rollout_ref.ref: dp_ref
191
+ ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp
192
+ data: legacy_data
193
+ actor@actor_rollout_ref.actor: dp_actor
194
+ actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp
195
+ actor/../optim@actor_rollout_ref.actor.optim: fsdp
196
+ hydra/env: default
197
+ hydra/callbacks: null
198
+ hydra/job_logging: default
199
+ hydra/hydra_logging: default
200
+ hydra/hydra_help: default
201
+ hydra/help: default
202
+ hydra/sweeper: basic
203
+ hydra/launcher: basic
204
+ hydra/output: default
205
+ verbose: false
code/RL_model/verl/verl_train/outputs/2026-02-01/22-02-10/.hydra/overrides.yaml ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - algorithm.adv_estimator=grpo
2
+ - data.train_files=/home/mshahidul/data/gsm8k/train.parquet
3
+ - data.val_files=/home/mshahidul/data/gsm8k/test.parquet
4
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
5
+ - data.train_batch_size=1024
6
+ - data.max_prompt_length=512
7
+ - data.max_response_length=1024
8
+ - data.filter_overlong_prompts=True
9
+ - data.truncation=error
10
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
11
+ - actor_rollout_ref.actor.optim.lr=1e-6
12
+ - actor_rollout_ref.model.use_remove_padding=True
13
+ - actor_rollout_ref.actor.ppo_mini_batch_size=512
14
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=64
15
+ - actor_rollout_ref.actor.use_kl_loss=True
16
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
17
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
18
+ - actor_rollout_ref.actor.entropy_coeff=0
19
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
20
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
21
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
22
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64
23
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
24
+ - actor_rollout_ref.rollout.name=vllm
25
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.7
26
+ - actor_rollout_ref.rollout.n=5
27
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=64
28
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
29
+ - algorithm.use_kl_in_reward=False
30
+ - trainer.critic_warmup=0
31
+ - trainer.logger=["console","wandb"]
32
+ - trainer.project_name=readctrl-verl
33
+ - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
34
+ - trainer.n_gpus_per_node=2
35
+ - trainer.nnodes=1
36
+ - trainer.save_freq=20
37
+ - trainer.test_freq=5
38
+ - trainer.total_epochs=15
code/RL_model/verl/verl_train/outputs/2026-02-01/22-08-56/.hydra/config.yaml ADDED
@@ -0,0 +1,648 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ actor_rollout_ref:
2
+ actor:
3
+ optim:
4
+ _target_: verl.workers.config.FSDPOptimizerConfig
5
+ optimizer: AdamW
6
+ optimizer_impl: torch.optim
7
+ lr: 1.0e-06
8
+ lr_warmup_steps_ratio: 0.0
9
+ total_training_steps: -1
10
+ weight_decay: 0.01
11
+ lr_warmup_steps: -1
12
+ betas:
13
+ - 0.9
14
+ - 0.999
15
+ clip_grad: 1.0
16
+ min_lr_ratio: 0.0
17
+ num_cycles: 0.5
18
+ lr_scheduler_type: constant
19
+ warmup_style: null
20
+ override_optimizer_config: null
21
+ fsdp_config:
22
+ _target_: verl.workers.config.FSDPEngineConfig
23
+ wrap_policy:
24
+ min_num_params: 0
25
+ param_offload: false
26
+ optimizer_offload: false
27
+ offload_policy: false
28
+ reshard_after_forward: true
29
+ fsdp_size: -1
30
+ forward_prefetch: false
31
+ model_dtype: fp32
32
+ use_orig_params: false
33
+ seed: 42
34
+ full_determinism: false
35
+ ulysses_sequence_parallel_size: 1
36
+ entropy_from_logits_with_chunking: false
37
+ use_torch_compile: true
38
+ entropy_checkpointing: false
39
+ forward_only: false
40
+ strategy: fsdp
41
+ dtype: bfloat16
42
+ _target_: verl.workers.config.FSDPActorConfig
43
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
44
+ strategy: fsdp
45
+ ppo_mini_batch_size: 512
46
+ ppo_micro_batch_size: null
47
+ ppo_micro_batch_size_per_gpu: 64
48
+ use_dynamic_bsz: false
49
+ ppo_max_token_len_per_gpu: 16384
50
+ clip_ratio: 0.2
51
+ clip_ratio_low: 0.2
52
+ clip_ratio_high: 0.2
53
+ tau_pos: 1.0
54
+ tau_neg: 1.05
55
+ freeze_vision_tower: false
56
+ policy_loss:
57
+ _target_: verl.workers.config.PolicyLossConfig
58
+ loss_mode: vanilla
59
+ clip_cov_ratio: 0.0002
60
+ clip_cov_lb: 1.0
61
+ clip_cov_ub: 5.0
62
+ kl_cov_ratio: 0.0002
63
+ ppo_kl_coef: 0.1
64
+ clip_ratio_c: 3.0
65
+ loss_agg_mode: token-mean
66
+ loss_scale_factor: null
67
+ entropy_coeff: 0
68
+ calculate_entropy: false
69
+ use_kl_loss: true
70
+ use_prefix_grouper: false
71
+ use_torch_compile: true
72
+ kl_loss_coef: 0.001
73
+ kl_loss_type: low_var_kl
74
+ ppo_epochs: 1
75
+ shuffle: false
76
+ data_loader_seed: 42
77
+ checkpoint:
78
+ _target_: verl.trainer.config.CheckpointConfig
79
+ save_contents:
80
+ - model
81
+ - optimizer
82
+ - extra
83
+ load_contents: ${.save_contents}
84
+ async_save: false
85
+ use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
86
+ profiler:
87
+ _target_: verl.utils.profiler.ProfilerConfig
88
+ tool: ${oc.select:global_profiler.tool,null}
89
+ enable: false
90
+ all_ranks: false
91
+ ranks: []
92
+ save_path: ${oc.select:global_profiler.save_path,null}
93
+ tool_config:
94
+ nsys:
95
+ _target_: verl.utils.profiler.config.NsightToolConfig
96
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
97
+ npu:
98
+ _target_: verl.utils.profiler.config.NPUToolConfig
99
+ contents: []
100
+ level: level0
101
+ analysis: true
102
+ discrete: false
103
+ torch:
104
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
105
+ contents: []
106
+ discrete: false
107
+ torch_memory:
108
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
109
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
110
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
111
+ router_replay:
112
+ _target_: verl.workers.config.RouterReplayConfig
113
+ mode: disabled
114
+ record_file: null
115
+ replay_file: null
116
+ grad_clip: 1.0
117
+ ulysses_sequence_parallel_size: 1
118
+ entropy_from_logits_with_chunking: false
119
+ entropy_checkpointing: false
120
+ use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
121
+ calculate_sum_pi_squared: false
122
+ sum_pi_squared_checkpointing: false
123
+ ref:
124
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
125
+ strategy: ${actor_rollout_ref.actor.strategy}
126
+ use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
127
+ log_prob_micro_batch_size: null
128
+ log_prob_micro_batch_size_per_gpu: 64
129
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
130
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
131
+ profiler:
132
+ _target_: verl.utils.profiler.ProfilerConfig
133
+ tool: ${oc.select:global_profiler.tool,null}
134
+ enable: false
135
+ all_ranks: false
136
+ ranks: []
137
+ save_path: ${oc.select:global_profiler.save_path,null}
138
+ tool_config:
139
+ nsys:
140
+ _target_: verl.utils.profiler.config.NsightToolConfig
141
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
142
+ npu:
143
+ _target_: verl.utils.profiler.config.NPUToolConfig
144
+ contents: []
145
+ level: level0
146
+ analysis: true
147
+ discrete: false
148
+ torch:
149
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
150
+ contents: []
151
+ discrete: false
152
+ torch_memory:
153
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
154
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
155
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
156
+ router_replay:
157
+ _target_: verl.workers.config.RouterReplayConfig
158
+ mode: disabled
159
+ record_file: null
160
+ replay_file: null
161
+ fsdp_config:
162
+ _target_: verl.workers.config.FSDPEngineConfig
163
+ wrap_policy:
164
+ min_num_params: 0
165
+ param_offload: false
166
+ optimizer_offload: false
167
+ offload_policy: false
168
+ reshard_after_forward: true
169
+ fsdp_size: -1
170
+ forward_prefetch: false
171
+ model_dtype: fp32
172
+ use_orig_params: false
173
+ seed: 42
174
+ full_determinism: false
175
+ ulysses_sequence_parallel_size: 1
176
+ entropy_from_logits_with_chunking: false
177
+ use_torch_compile: true
178
+ entropy_checkpointing: false
179
+ forward_only: true
180
+ strategy: fsdp
181
+ dtype: bfloat16
182
+ _target_: verl.workers.config.FSDPActorConfig
183
+ ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
184
+ entropy_from_logits_with_chunking: false
185
+ entropy_checkpointing: false
186
+ rollout:
187
+ _target_: verl.workers.config.RolloutConfig
188
+ name: vllm
189
+ mode: async
190
+ temperature: 1.0
191
+ top_k: -1
192
+ top_p: 1
193
+ prompt_length: ${oc.select:data.max_prompt_length,512}
194
+ response_length: ${oc.select:data.max_response_length,512}
195
+ dtype: bfloat16
196
+ gpu_memory_utilization: 0.7
197
+ ignore_eos: false
198
+ enforce_eager: false
199
+ cudagraph_capture_sizes: null
200
+ free_cache_engine: true
201
+ tensor_model_parallel_size: 1
202
+ data_parallel_size: 1
203
+ expert_parallel_size: 1
204
+ pipeline_model_parallel_size: 1
205
+ max_num_batched_tokens: 8192
206
+ max_model_len: null
207
+ max_num_seqs: 1024
208
+ enable_chunked_prefill: true
209
+ enable_prefix_caching: true
210
+ logprobs_mode: processed_logprobs
211
+ scheduling_policy: fcfs
212
+ load_format: dummy
213
+ log_prob_micro_batch_size: null
214
+ log_prob_micro_batch_size_per_gpu: 64
215
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
216
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
217
+ disable_log_stats: true
218
+ do_sample: true
219
+ 'n': 5
220
+ over_sample_rate: 0
221
+ multi_stage_wake_up: false
222
+ engine_kwargs:
223
+ vllm: {}
224
+ sglang: {}
225
+ trtllm: {}
226
+ val_kwargs:
227
+ _target_: verl.workers.config.SamplingConfig
228
+ top_k: -1
229
+ top_p: 1.0
230
+ temperature: 0
231
+ 'n': 1
232
+ do_sample: false
233
+ multi_turn:
234
+ _target_: verl.workers.config.MultiTurnConfig
235
+ enable: false
236
+ max_assistant_turns: null
237
+ tool_config_path: null
238
+ max_user_turns: null
239
+ max_parallel_calls: 1
240
+ max_tool_response_length: 256
241
+ tool_response_truncate_side: middle
242
+ interaction_config_path: null
243
+ use_inference_chat_template: false
244
+ tokenization_sanity_check_mode: strict
245
+ format: hermes
246
+ num_repeat_rollouts: null
247
+ calculate_log_probs: false
248
+ agent:
249
+ _target_: verl.workers.config.AgentLoopConfig
250
+ num_workers: 8
251
+ default_agent_loop: single_turn_agent
252
+ agent_loop_config_path: null
253
+ custom_async_server:
254
+ _target_: verl.workers.config.CustomAsyncServerConfig
255
+ path: null
256
+ name: null
257
+ checkpoint_engine:
258
+ _target_: verl.workers.config.CheckpointEngineConfig
259
+ backend: naive
260
+ update_weights_bucket_megabytes: 2048
261
+ engine_kwargs: {}
262
+ trace:
263
+ _target_: verl.workers.config.TraceConfig
264
+ backend: null
265
+ token2text: false
266
+ max_samples_per_step_per_worker: null
267
+ skip_rollout: false
268
+ skip_dump_dir: /tmp/rollout_dump
269
+ skip_tokenizer_init: true
270
+ enable_rollout_routing_replay: false
271
+ profiler:
272
+ _target_: verl.utils.profiler.ProfilerConfig
273
+ tool: ${oc.select:global_profiler.tool,null}
274
+ enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
275
+ all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
276
+ ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
277
+ save_path: ${oc.select:global_profiler.save_path,null}
278
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
279
+ prometheus:
280
+ _target_: verl.workers.config.PrometheusConfig
281
+ enable: false
282
+ port: 9090
283
+ file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml
284
+ served_model_name: ${oc.select:actor_rollout_ref.model.path,null}
285
+ quantization: null
286
+ quantization_config_file: null
287
+ mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
288
+ layered_summon: false
289
+ model:
290
+ _target_: verl.workers.config.HFModelConfig
291
+ path: Qwen/Qwen3-4B-Instruct-2507
292
+ hf_config_path: null
293
+ tokenizer_path: null
294
+ use_shm: false
295
+ trust_remote_code: false
296
+ custom_chat_template: null
297
+ external_lib: null
298
+ override_config: {}
299
+ enable_gradient_checkpointing: true
300
+ enable_activation_offload: false
301
+ use_remove_padding: true
302
+ lora_rank: 0
303
+ lora_alpha: 16
304
+ target_modules: all-linear
305
+ exclude_modules: null
306
+ lora_adapter_path: null
307
+ use_liger: false
308
+ use_fused_kernels: false
309
+ fused_kernel_options:
310
+ impl_backend: torch
311
+ tiled_mlp:
312
+ enabled: false
313
+ num_shards: 4
314
+ mtp:
315
+ _target_: verl.workers.config.MtpConfig
316
+ enable: false
317
+ enable_train: false
318
+ enable_rollout: false
319
+ detach_encoder: false
320
+ mtp_loss_scaling_factor: 0.1
321
+ speculative_algorithm: EAGLE
322
+ speculative_num_steps: 3
323
+ speculative_eagle_topk: 1
324
+ speculative_num_draft_tokens: 4
325
+ method: mtp
326
+ num_speculative_tokens: 1
327
+ hybrid_engine: true
328
+ nccl_timeout: 600
329
+ data:
330
+ tokenizer: null
331
+ use_shm: false
332
+ train_files: /home/mshahidul/data/gsm8k/train.parquet
333
+ val_files: /home/mshahidul/data/gsm8k/test.parquet
334
+ train_max_samples: -1
335
+ val_max_samples: -1
336
+ prompt_key: prompt
337
+ reward_fn_key: data_source
338
+ max_prompt_length: 512
339
+ max_response_length: 1024
340
+ train_batch_size: 1024
341
+ val_batch_size: null
342
+ tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path,
343
+ null}
344
+ return_raw_input_ids: false
345
+ return_raw_chat: true
346
+ return_full_prompt: false
347
+ shuffle: true
348
+ seed: null
349
+ dataloader_num_workers: 8
350
+ image_patch_size: 14
351
+ validation_shuffle: false
352
+ filter_overlong_prompts: true
353
+ filter_overlong_prompts_workers: 1
354
+ truncation: error
355
+ image_key: images
356
+ video_key: videos
357
+ trust_remote_code: false
358
+ custom_cls:
359
+ path: null
360
+ name: null
361
+ return_multi_modal_inputs: true
362
+ sampler:
363
+ class_path: null
364
+ class_name: null
365
+ datagen:
366
+ path: null
367
+ name: null
368
+ apply_chat_template_kwargs: {}
369
+ reward_manager:
370
+ _target_: verl.trainer.config.config.RewardManagerConfig
371
+ source: register
372
+ name: ${oc.select:reward_model.reward_manager,naive}
373
+ module:
374
+ _target_: verl.trainer.config.config.ModuleConfig
375
+ path: null
376
+ name: custom_reward_manager
377
+ critic:
378
+ optim:
379
+ _target_: verl.workers.config.FSDPOptimizerConfig
380
+ optimizer: AdamW
381
+ optimizer_impl: torch.optim
382
+ lr: 1.0e-05
383
+ lr_warmup_steps_ratio: 0.0
384
+ total_training_steps: -1
385
+ weight_decay: 0.01
386
+ lr_warmup_steps: -1
387
+ betas:
388
+ - 0.9
389
+ - 0.999
390
+ clip_grad: 1.0
391
+ min_lr_ratio: 0.0
392
+ num_cycles: 0.5
393
+ lr_scheduler_type: constant
394
+ warmup_style: null
395
+ override_optimizer_config: null
396
+ model:
397
+ fsdp_config:
398
+ _target_: verl.workers.config.FSDPEngineConfig
399
+ wrap_policy:
400
+ min_num_params: 0
401
+ param_offload: false
402
+ optimizer_offload: false
403
+ offload_policy: false
404
+ reshard_after_forward: true
405
+ fsdp_size: -1
406
+ forward_prefetch: false
407
+ model_dtype: fp32
408
+ use_orig_params: false
409
+ seed: 42
410
+ full_determinism: false
411
+ ulysses_sequence_parallel_size: 1
412
+ entropy_from_logits_with_chunking: false
413
+ use_torch_compile: true
414
+ entropy_checkpointing: false
415
+ forward_only: false
416
+ strategy: fsdp
417
+ dtype: bfloat16
418
+ path: ~/models/deepseek-llm-7b-chat
419
+ tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
420
+ override_config: {}
421
+ external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
422
+ trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
423
+ _target_: verl.workers.config.FSDPCriticModelCfg
424
+ use_shm: false
425
+ enable_gradient_checkpointing: true
426
+ enable_activation_offload: false
427
+ use_remove_padding: false
428
+ lora_rank: 0
429
+ lora_alpha: 16
430
+ target_modules: all-linear
431
+ tiled_mlp:
432
+ enabled: false
433
+ num_shards: 4
434
+ _target_: verl.workers.config.FSDPCriticConfig
435
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
436
+ strategy: fsdp
437
+ enable: null
438
+ ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
439
+ ppo_micro_batch_size: null
440
+ ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
441
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
442
+ ppo_max_token_len_per_gpu: 32768
443
+ forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
444
+ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
445
+ shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
446
+ data_loader_seed: 42
447
+ cliprange_value: 0.5
448
+ loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
449
+ checkpoint:
450
+ _target_: verl.trainer.config.CheckpointConfig
451
+ save_contents:
452
+ - model
453
+ - optimizer
454
+ - extra
455
+ load_contents: ${.save_contents}
456
+ async_save: false
457
+ profiler:
458
+ _target_: verl.utils.profiler.ProfilerConfig
459
+ tool: ${oc.select:global_profiler.tool,null}
460
+ enable: false
461
+ all_ranks: false
462
+ ranks: []
463
+ save_path: ${oc.select:global_profiler.save_path,null}
464
+ tool_config:
465
+ nsys:
466
+ _target_: verl.utils.profiler.config.NsightToolConfig
467
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
468
+ npu:
469
+ _target_: verl.utils.profiler.config.NPUToolConfig
470
+ contents: []
471
+ level: level0
472
+ analysis: true
473
+ discrete: false
474
+ torch:
475
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
476
+ contents: []
477
+ discrete: false
478
+ torch_memory:
479
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
480
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
481
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
482
+ forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
483
+ forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
484
+ ulysses_sequence_parallel_size: 1
485
+ grad_clip: 1.0
486
+ reward_model:
487
+ enable: false
488
+ enable_resource_pool: false
489
+ n_gpus_per_node: 8
490
+ nnodes: 0
491
+ strategy: fsdp
492
+ model:
493
+ input_tokenizer: ${actor_rollout_ref.model.path}
494
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
495
+ external_lib: ${actor_rollout_ref.model.external_lib}
496
+ trust_remote_code: false
497
+ override_config: {}
498
+ use_shm: false
499
+ use_remove_padding: false
500
+ use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
501
+ fsdp_config:
502
+ _target_: verl.workers.config.FSDPEngineConfig
503
+ wrap_policy:
504
+ min_num_params: 0
505
+ param_offload: false
506
+ reshard_after_forward: true
507
+ fsdp_size: -1
508
+ forward_prefetch: false
509
+ micro_batch_size: null
510
+ micro_batch_size_per_gpu: null
511
+ max_length: null
512
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
513
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
514
+ reward_manager: naive
515
+ reward_loop_source: register
516
+ reward_loop_module_path: null
517
+ reward_loop_class_name: null
518
+ launch_reward_fn_async: false
519
+ sandbox_fusion:
520
+ url: null
521
+ max_concurrent: 64
522
+ memory_limit_mb: 1024
523
+ profiler:
524
+ _target_: verl.utils.profiler.ProfilerConfig
525
+ tool: ${oc.select:global_profiler.tool,null}
526
+ enable: false
527
+ all_ranks: false
528
+ ranks: []
529
+ save_path: ${oc.select:global_profiler.save_path,null}
530
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
531
+ ulysses_sequence_parallel_size: 1
532
+ use_reward_loop: true
533
+ num_workers: 1
534
+ rollout:
535
+ _target_: verl.workers.config.RolloutConfig
536
+ name: ???
537
+ dtype: bfloat16
538
+ gpu_memory_utilization: 0.5
539
+ enforce_eager: true
540
+ cudagraph_capture_sizes: null
541
+ free_cache_engine: true
542
+ data_parallel_size: 1
543
+ expert_parallel_size: 1
544
+ tensor_model_parallel_size: 2
545
+ max_num_batched_tokens: 8192
546
+ max_model_len: null
547
+ max_num_seqs: 1024
548
+ load_format: auto
549
+ engine_kwargs: {}
550
+ limit_images: null
551
+ enable_chunked_prefill: true
552
+ enable_prefix_caching: true
553
+ disable_log_stats: true
554
+ skip_tokenizer_init: false
555
+ prompt_length: 2048
556
+ response_length: 2048
557
+ algorithm:
558
+ rollout_correction:
559
+ rollout_is: null
560
+ rollout_is_threshold: 2.0
561
+ rollout_rs: null
562
+ rollout_rs_threshold: null
563
+ bypass_mode: false
564
+ loss_type: ppo_clip
565
+ rollout_is_batch_normalize: false
566
+ _target_: verl.trainer.config.AlgoConfig
567
+ gamma: 1.0
568
+ lam: 1.0
569
+ adv_estimator: grpo
570
+ norm_adv_by_std_in_grpo: true
571
+ use_kl_in_reward: false
572
+ kl_penalty: kl
573
+ kl_ctrl:
574
+ _target_: verl.trainer.config.KLControlConfig
575
+ type: fixed
576
+ kl_coef: 0.001
577
+ horizon: 10000
578
+ target_kl: 0.1
579
+ use_pf_ppo: false
580
+ pf_ppo:
581
+ reweight_method: pow
582
+ weight_pow: 2.0
583
+ custom_reward_function:
584
+ path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
585
+ name: compute_score
586
+ trainer:
587
+ balance_batch: true
588
+ total_epochs: 15
589
+ total_training_steps: null
590
+ project_name: readctrl-verl
591
+ experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs
592
+ logger:
593
+ - console
594
+ - wandb
595
+ log_val_generations: 0
596
+ rollout_data_dir: null
597
+ validation_data_dir: null
598
+ nnodes: 1
599
+ n_gpus_per_node: 2
600
+ save_freq: 20
601
+ esi_redundant_time: 0
602
+ resume_mode: auto
603
+ resume_from_path: null
604
+ val_before_train: true
605
+ val_only: false
606
+ test_freq: 5
607
+ critic_warmup: 0
608
+ default_hdfs_dir: null
609
+ del_local_ckpt_after_load: false
610
+ default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
611
+ max_actor_ckpt_to_keep: null
612
+ max_critic_ckpt_to_keep: null
613
+ ray_wait_register_center_timeout: 300
614
+ device: cuda
615
+ use_legacy_worker_impl: auto
616
+ global_profiler:
617
+ _target_: verl.utils.profiler.ProfilerConfig
618
+ tool: null
619
+ steps: null
620
+ profile_continuous_steps: false
621
+ save_path: outputs/profile
622
+ global_tool_config:
623
+ nsys:
624
+ _target_: verl.utils.profiler.config.NsightToolConfig
625
+ discrete: false
626
+ controller_nsight_options:
627
+ trace: cuda,nvtx,cublas,ucx
628
+ cuda-memory-usage: 'true'
629
+ cuda-graph-trace: graph
630
+ worker_nsight_options:
631
+ trace: cuda,nvtx,cublas,ucx
632
+ cuda-memory-usage: 'true'
633
+ cuda-graph-trace: graph
634
+ capture-range: cudaProfilerApi
635
+ capture-range-end: null
636
+ kill: none
637
+ torch_memory:
638
+ trace_alloc_max_entries: 100000
639
+ stack_depth: 32
640
+ context: all
641
+ stacks: all
642
+ kw_args: {}
643
+ transfer_queue:
644
+ enable: false
645
+ ray_kwargs:
646
+ ray_init:
647
+ num_cpus: null
648
+ timeline_json_file: null
code/RL_model/verl/verl_train/outputs/2026-02-01/22-08-56/.hydra/hydra.yaml ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task:
115
+ - algorithm.adv_estimator=grpo
116
+ - data.train_files=/home/mshahidul/data/gsm8k/train.parquet
117
+ - data.val_files=/home/mshahidul/data/gsm8k/test.parquet
118
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
119
+ - data.train_batch_size=1024
120
+ - data.max_prompt_length=512
121
+ - data.max_response_length=1024
122
+ - data.filter_overlong_prompts=True
123
+ - data.truncation=error
124
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
125
+ - actor_rollout_ref.actor.optim.lr=1e-6
126
+ - actor_rollout_ref.model.use_remove_padding=True
127
+ - actor_rollout_ref.actor.ppo_mini_batch_size=512
128
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=64
129
+ - actor_rollout_ref.actor.use_kl_loss=True
130
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
131
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
132
+ - actor_rollout_ref.actor.entropy_coeff=0
133
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
134
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
135
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
136
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64
137
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
138
+ - actor_rollout_ref.rollout.name=vllm
139
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.7
140
+ - actor_rollout_ref.rollout.n=5
141
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=64
142
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
143
+ - algorithm.use_kl_in_reward=False
144
+ - trainer.critic_warmup=0
145
+ - trainer.logger=["console","wandb"]
146
+ - trainer.project_name=readctrl-verl
147
+ - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
148
+ - trainer.n_gpus_per_node=2
149
+ - trainer.nnodes=1
150
+ - trainer.save_freq=20
151
+ - trainer.test_freq=5
152
+ - trainer.total_epochs=15
153
+ job:
154
+ name: main_ppo
155
+ chdir: null
156
+ override_dirname: actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=64,actor_rollout_ref.actor.ppo_mini_batch_size=512,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=64,actor_rollout_ref.rollout.gpu_memory_utilization=0.7,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64,actor_rollout_ref.rollout.n=5,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=512,data.max_response_length=1024,data.train_batch_size=1024,data.train_files=/home/mshahidul/data/gsm8k/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/data/gsm8k/test.parquet,trainer.critic_warmup=0,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15
157
+ id: ???
158
+ num: ???
159
+ config_name: ppo_trainer
160
+ env_set: {}
161
+ env_copy: []
162
+ config:
163
+ override_dirname:
164
+ kv_sep: '='
165
+ item_sep: ','
166
+ exclude_keys: []
167
+ runtime:
168
+ version: 1.3.2
169
+ version_base: '1.3'
170
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train
171
+ config_sources:
172
+ - path: hydra.conf
173
+ schema: pkg
174
+ provider: hydra
175
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config
176
+ schema: file
177
+ provider: main
178
+ - path: ''
179
+ schema: structured
180
+ provider: schema
181
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-01/22-08-56
182
+ choices:
183
+ algorithm@algorithm.rollout_correction: rollout_correction
184
+ reward_model: dp_reward_loop
185
+ critic: dp_critic
186
+ critic/../engine@critic.model.fsdp_config: fsdp
187
+ critic/../optim@critic.optim: fsdp
188
+ model@actor_rollout_ref.model: hf_model
189
+ rollout@actor_rollout_ref.rollout: rollout
190
+ ref@actor_rollout_ref.ref: dp_ref
191
+ ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp
192
+ data: legacy_data
193
+ actor@actor_rollout_ref.actor: dp_actor
194
+ actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp
195
+ actor/../optim@actor_rollout_ref.actor.optim: fsdp
196
+ hydra/env: default
197
+ hydra/callbacks: null
198
+ hydra/job_logging: default
199
+ hydra/hydra_logging: default
200
+ hydra/hydra_help: default
201
+ hydra/help: default
202
+ hydra/sweeper: basic
203
+ hydra/launcher: basic
204
+ hydra/output: default
205
+ verbose: false
code/RL_model/verl/verl_train/outputs/2026-02-01/22-08-56/.hydra/overrides.yaml ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - algorithm.adv_estimator=grpo
2
+ - data.train_files=/home/mshahidul/data/gsm8k/train.parquet
3
+ - data.val_files=/home/mshahidul/data/gsm8k/test.parquet
4
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
5
+ - data.train_batch_size=1024
6
+ - data.max_prompt_length=512
7
+ - data.max_response_length=1024
8
+ - data.filter_overlong_prompts=True
9
+ - data.truncation=error
10
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
11
+ - actor_rollout_ref.actor.optim.lr=1e-6
12
+ - actor_rollout_ref.model.use_remove_padding=True
13
+ - actor_rollout_ref.actor.ppo_mini_batch_size=512
14
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=64
15
+ - actor_rollout_ref.actor.use_kl_loss=True
16
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
17
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
18
+ - actor_rollout_ref.actor.entropy_coeff=0
19
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
20
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
21
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
22
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64
23
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
24
+ - actor_rollout_ref.rollout.name=vllm
25
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.7
26
+ - actor_rollout_ref.rollout.n=5
27
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=64
28
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
29
+ - algorithm.use_kl_in_reward=False
30
+ - trainer.critic_warmup=0
31
+ - trainer.logger=["console","wandb"]
32
+ - trainer.project_name=readctrl-verl
33
+ - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
34
+ - trainer.n_gpus_per_node=2
35
+ - trainer.nnodes=1
36
+ - trainer.save_freq=20
37
+ - trainer.test_freq=5
38
+ - trainer.total_epochs=15