shahidul034 commited on
Commit
4155d45
·
verified ·
1 Parent(s): 7b53b83

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. code/RL_model/verl/verl_train/outputs/2026-02-01/08-03-02/main_ppo.log +0 -0
  2. code/RL_model/verl/verl_train/outputs/2026-02-01/11-18-25/main_ppo.log +0 -0
  3. code/RL_model/verl/verl_train/outputs/2026-02-01/11-27-06/main_ppo.log +0 -0
  4. code/RL_model/verl/verl_train/outputs/2026-02-01/11-29-26/main_ppo.log +0 -0
  5. code/RL_model/verl/verl_train/outputs/2026-02-01/12-06-58/main_ppo.log +0 -0
  6. code/RL_model/verl/verl_train/outputs/2026-02-01/21-08-13/main_ppo.log +0 -0
  7. code/RL_model/verl/verl_train/outputs/2026-02-01/21-14-31/main_ppo.log +0 -0
  8. code/RL_model/verl/verl_train/outputs/2026-02-01/21-36-33/main_ppo.log +0 -0
  9. code/RL_model/verl/verl_train/outputs/2026-02-01/22-08-56/main_ppo.log +0 -0
  10. code/RL_model/verl/verl_train/outputs/2026-02-01/22-39-41/main_ppo.log +0 -0
  11. code/RL_model/verl/verl_train/outputs/2026-02-01/22-41-59/main_ppo.log +0 -0
  12. code/RL_model/verl/verl_train/outputs/2026-02-01/22-57-12/main_ppo.log +0 -0
  13. code/RL_model/verl/verl_train/outputs/2026-02-02/00-15-37/.hydra/config.yaml +648 -0
  14. code/RL_model/verl/verl_train/outputs/2026-02-02/00-15-37/.hydra/hydra.yaml +206 -0
  15. code/RL_model/verl/verl_train/outputs/2026-02-02/00-15-37/.hydra/overrides.yaml +39 -0
  16. code/RL_model/verl/verl_train/outputs/2026-02-02/00-25-32/.hydra/config.yaml +648 -0
  17. code/RL_model/verl/verl_train/outputs/2026-02-02/00-25-32/.hydra/overrides.yaml +39 -0
  18. code/RL_model/verl/verl_train/outputs/2026-02-02/00-28-20/.hydra/config.yaml +648 -0
  19. code/RL_model/verl/verl_train/outputs/2026-02-02/00-28-20/.hydra/hydra.yaml +206 -0
  20. code/RL_model/verl/verl_train/outputs/2026-02-02/00-28-20/.hydra/overrides.yaml +39 -0
  21. code/RL_model/verl/verl_train/outputs/2026-02-02/00-41-23/.hydra/config.yaml +648 -0
  22. code/RL_model/verl/verl_train/outputs/2026-02-02/00-41-23/.hydra/hydra.yaml +206 -0
  23. code/RL_model/verl/verl_train/outputs/2026-02-02/00-41-23/.hydra/overrides.yaml +39 -0
  24. code/RL_model/verl/verl_train/outputs/2026-02-02/00-55-56/.hydra/config.yaml +648 -0
  25. code/RL_model/verl/verl_train/outputs/2026-02-02/00-55-56/.hydra/hydra.yaml +206 -0
  26. code/RL_model/verl/verl_train/outputs/2026-02-02/00-55-56/.hydra/overrides.yaml +39 -0
  27. code/RL_model/verl/verl_train/outputs/2026-02-02/01-04-57/.hydra/config.yaml +648 -0
  28. code/RL_model/verl/verl_train/outputs/2026-02-02/01-04-57/.hydra/hydra.yaml +206 -0
  29. code/RL_model/verl/verl_train/outputs/2026-02-02/01-04-57/.hydra/overrides.yaml +39 -0
  30. code/RL_model/verl/verl_train/outputs/2026-02-02/01-53-27/.hydra/config.yaml +648 -0
  31. code/RL_model/verl/verl_train/outputs/2026-02-02/01-53-27/.hydra/hydra.yaml +207 -0
  32. code/RL_model/verl/verl_train/outputs/2026-02-02/01-53-27/.hydra/overrides.yaml +40 -0
  33. code/RL_model/verl/verl_train/outputs/2026-02-02/09-24-01/.hydra/config.yaml +648 -0
  34. code/RL_model/verl/verl_train/outputs/2026-02-06/20-52-38/.hydra/config.yaml +649 -0
  35. code/RL_model/verl/verl_train/outputs/2026-02-06/20-52-38/.hydra/overrides.yaml +44 -0
  36. code/RL_model/verl/verl_train/outputs/2026-02-07/10-27-48/.hydra/config.yaml +649 -0
  37. code/RL_model/verl/verl_train/outputs/2026-02-07/10-27-48/.hydra/hydra.yaml +211 -0
  38. code/RL_model/verl/verl_train/outputs/2026-02-07/10-27-48/.hydra/overrides.yaml +44 -0
  39. code/RL_model/verl/verl_train/outputs/2026-02-07/10-27-48/main_ppo.log +0 -0
  40. code/RL_model/verl/verl_train/outputs/2026-02-07/10-53-43/.hydra/config.yaml +649 -0
  41. code/RL_model/verl/verl_train/outputs/2026-02-07/10-53-43/.hydra/hydra.yaml +211 -0
  42. code/RL_model/verl/verl_train/outputs/2026-02-07/10-53-43/.hydra/overrides.yaml +44 -0
  43. code/RL_model/verl/verl_train/outputs/2026-02-07/10-53-43/main_ppo.log +0 -0
  44. code/RL_model/verl/verl_train/outputs/2026-02-07/11-09-07/.hydra/hydra.yaml +211 -0
  45. code/RL_model/verl/verl_train/outputs/2026-02-07/11-09-07/.hydra/overrides.yaml +44 -0
  46. code/RL_model/verl/verl_train/outputs/2026-02-07/11-09-07/main_ppo.log +0 -0
  47. code/RL_model/verl/verl_train/outputs/2026-02-07/11-11-49/.hydra/config.yaml +649 -0
  48. code/RL_model/verl/verl_train/outputs/2026-02-07/11-11-49/.hydra/hydra.yaml +211 -0
  49. code/RL_model/verl/verl_train/outputs/2026-02-07/11-11-49/.hydra/overrides.yaml +44 -0
  50. code/RL_model/verl/verl_train/outputs/2026-02-07/11-11-49/main_ppo.log +0 -0
code/RL_model/verl/verl_train/outputs/2026-02-01/08-03-02/main_ppo.log ADDED
File without changes
code/RL_model/verl/verl_train/outputs/2026-02-01/11-18-25/main_ppo.log ADDED
File without changes
code/RL_model/verl/verl_train/outputs/2026-02-01/11-27-06/main_ppo.log ADDED
File without changes
code/RL_model/verl/verl_train/outputs/2026-02-01/11-29-26/main_ppo.log ADDED
File without changes
code/RL_model/verl/verl_train/outputs/2026-02-01/12-06-58/main_ppo.log ADDED
File without changes
code/RL_model/verl/verl_train/outputs/2026-02-01/21-08-13/main_ppo.log ADDED
File without changes
code/RL_model/verl/verl_train/outputs/2026-02-01/21-14-31/main_ppo.log ADDED
File without changes
code/RL_model/verl/verl_train/outputs/2026-02-01/21-36-33/main_ppo.log ADDED
File without changes
code/RL_model/verl/verl_train/outputs/2026-02-01/22-08-56/main_ppo.log ADDED
File without changes
code/RL_model/verl/verl_train/outputs/2026-02-01/22-39-41/main_ppo.log ADDED
File without changes
code/RL_model/verl/verl_train/outputs/2026-02-01/22-41-59/main_ppo.log ADDED
File without changes
code/RL_model/verl/verl_train/outputs/2026-02-01/22-57-12/main_ppo.log ADDED
File without changes
code/RL_model/verl/verl_train/outputs/2026-02-02/00-15-37/.hydra/config.yaml ADDED
@@ -0,0 +1,648 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ actor_rollout_ref:
2
+ actor:
3
+ optim:
4
+ _target_: verl.workers.config.FSDPOptimizerConfig
5
+ optimizer: AdamW
6
+ optimizer_impl: torch.optim
7
+ lr: 1.0e-06
8
+ lr_warmup_steps_ratio: 0.0
9
+ total_training_steps: -1
10
+ weight_decay: 0.01
11
+ lr_warmup_steps: -1
12
+ betas:
13
+ - 0.9
14
+ - 0.999
15
+ clip_grad: 1.0
16
+ min_lr_ratio: 0.0
17
+ num_cycles: 0.5
18
+ lr_scheduler_type: constant
19
+ warmup_style: null
20
+ override_optimizer_config: null
21
+ fsdp_config:
22
+ _target_: verl.workers.config.FSDPEngineConfig
23
+ wrap_policy:
24
+ min_num_params: 0
25
+ param_offload: false
26
+ optimizer_offload: false
27
+ offload_policy: false
28
+ reshard_after_forward: true
29
+ fsdp_size: -1
30
+ forward_prefetch: false
31
+ model_dtype: fp32
32
+ use_orig_params: false
33
+ seed: 42
34
+ full_determinism: false
35
+ ulysses_sequence_parallel_size: 1
36
+ entropy_from_logits_with_chunking: false
37
+ use_torch_compile: true
38
+ entropy_checkpointing: false
39
+ forward_only: false
40
+ strategy: fsdp
41
+ dtype: bfloat16
42
+ _target_: verl.workers.config.FSDPActorConfig
43
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
44
+ strategy: fsdp
45
+ ppo_mini_batch_size: 256
46
+ ppo_micro_batch_size: null
47
+ ppo_micro_batch_size_per_gpu: 32
48
+ use_dynamic_bsz: false
49
+ ppo_max_token_len_per_gpu: 16384
50
+ clip_ratio: 0.2
51
+ clip_ratio_low: 0.2
52
+ clip_ratio_high: 0.2
53
+ tau_pos: 1.0
54
+ tau_neg: 1.05
55
+ freeze_vision_tower: false
56
+ policy_loss:
57
+ _target_: verl.workers.config.PolicyLossConfig
58
+ loss_mode: vanilla
59
+ clip_cov_ratio: 0.0002
60
+ clip_cov_lb: 1.0
61
+ clip_cov_ub: 5.0
62
+ kl_cov_ratio: 0.0002
63
+ ppo_kl_coef: 0.1
64
+ clip_ratio_c: 3.0
65
+ loss_agg_mode: token-mean
66
+ loss_scale_factor: null
67
+ entropy_coeff: 0
68
+ calculate_entropy: false
69
+ use_kl_loss: true
70
+ use_prefix_grouper: false
71
+ use_torch_compile: true
72
+ kl_loss_coef: 0.001
73
+ kl_loss_type: low_var_kl
74
+ ppo_epochs: 1
75
+ shuffle: false
76
+ data_loader_seed: 42
77
+ checkpoint:
78
+ _target_: verl.trainer.config.CheckpointConfig
79
+ save_contents:
80
+ - model
81
+ - optimizer
82
+ - extra
83
+ load_contents: ${.save_contents}
84
+ async_save: false
85
+ use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
86
+ profiler:
87
+ _target_: verl.utils.profiler.ProfilerConfig
88
+ tool: ${oc.select:global_profiler.tool,null}
89
+ enable: false
90
+ all_ranks: false
91
+ ranks: []
92
+ save_path: ${oc.select:global_profiler.save_path,null}
93
+ tool_config:
94
+ nsys:
95
+ _target_: verl.utils.profiler.config.NsightToolConfig
96
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
97
+ npu:
98
+ _target_: verl.utils.profiler.config.NPUToolConfig
99
+ contents: []
100
+ level: level0
101
+ analysis: true
102
+ discrete: false
103
+ torch:
104
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
105
+ contents: []
106
+ discrete: false
107
+ torch_memory:
108
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
109
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
110
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
111
+ router_replay:
112
+ _target_: verl.workers.config.RouterReplayConfig
113
+ mode: disabled
114
+ record_file: null
115
+ replay_file: null
116
+ grad_clip: 1.0
117
+ ulysses_sequence_parallel_size: 1
118
+ entropy_from_logits_with_chunking: false
119
+ entropy_checkpointing: false
120
+ use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
121
+ calculate_sum_pi_squared: false
122
+ sum_pi_squared_checkpointing: false
123
+ ref:
124
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
125
+ strategy: ${actor_rollout_ref.actor.strategy}
126
+ use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
127
+ log_prob_micro_batch_size: null
128
+ log_prob_micro_batch_size_per_gpu: 32
129
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
130
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
131
+ profiler:
132
+ _target_: verl.utils.profiler.ProfilerConfig
133
+ tool: ${oc.select:global_profiler.tool,null}
134
+ enable: false
135
+ all_ranks: false
136
+ ranks: []
137
+ save_path: ${oc.select:global_profiler.save_path,null}
138
+ tool_config:
139
+ nsys:
140
+ _target_: verl.utils.profiler.config.NsightToolConfig
141
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
142
+ npu:
143
+ _target_: verl.utils.profiler.config.NPUToolConfig
144
+ contents: []
145
+ level: level0
146
+ analysis: true
147
+ discrete: false
148
+ torch:
149
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
150
+ contents: []
151
+ discrete: false
152
+ torch_memory:
153
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
154
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
155
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
156
+ router_replay:
157
+ _target_: verl.workers.config.RouterReplayConfig
158
+ mode: disabled
159
+ record_file: null
160
+ replay_file: null
161
+ fsdp_config:
162
+ _target_: verl.workers.config.FSDPEngineConfig
163
+ wrap_policy:
164
+ min_num_params: 0
165
+ param_offload: false
166
+ optimizer_offload: false
167
+ offload_policy: false
168
+ reshard_after_forward: true
169
+ fsdp_size: -1
170
+ forward_prefetch: false
171
+ model_dtype: fp32
172
+ use_orig_params: false
173
+ seed: 42
174
+ full_determinism: false
175
+ ulysses_sequence_parallel_size: 1
176
+ entropy_from_logits_with_chunking: false
177
+ use_torch_compile: true
178
+ entropy_checkpointing: false
179
+ forward_only: true
180
+ strategy: fsdp
181
+ dtype: bfloat16
182
+ _target_: verl.workers.config.FSDPActorConfig
183
+ ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
184
+ entropy_from_logits_with_chunking: false
185
+ entropy_checkpointing: false
186
+ rollout:
187
+ _target_: verl.workers.config.RolloutConfig
188
+ name: vllm
189
+ mode: async
190
+ temperature: 1.0
191
+ top_k: -1
192
+ top_p: 1
193
+ prompt_length: ${oc.select:data.max_prompt_length,512}
194
+ response_length: ${oc.select:data.max_response_length,512}
195
+ dtype: bfloat16
196
+ gpu_memory_utilization: 0.6
197
+ ignore_eos: false
198
+ enforce_eager: false
199
+ cudagraph_capture_sizes: null
200
+ free_cache_engine: true
201
+ tensor_model_parallel_size: 1
202
+ data_parallel_size: 1
203
+ expert_parallel_size: 1
204
+ pipeline_model_parallel_size: 1
205
+ max_num_batched_tokens: 8192
206
+ max_model_len: 8192
207
+ max_num_seqs: 1024
208
+ enable_chunked_prefill: true
209
+ enable_prefix_caching: true
210
+ logprobs_mode: processed_logprobs
211
+ scheduling_policy: fcfs
212
+ load_format: dummy
213
+ log_prob_micro_batch_size: null
214
+ log_prob_micro_batch_size_per_gpu: 32
215
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
216
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
217
+ disable_log_stats: true
218
+ do_sample: true
219
+ 'n': 3
220
+ over_sample_rate: 0
221
+ multi_stage_wake_up: false
222
+ engine_kwargs:
223
+ vllm: {}
224
+ sglang: {}
225
+ trtllm: {}
226
+ val_kwargs:
227
+ _target_: verl.workers.config.SamplingConfig
228
+ top_k: -1
229
+ top_p: 1.0
230
+ temperature: 0
231
+ 'n': 1
232
+ do_sample: false
233
+ multi_turn:
234
+ _target_: verl.workers.config.MultiTurnConfig
235
+ enable: false
236
+ max_assistant_turns: null
237
+ tool_config_path: null
238
+ max_user_turns: null
239
+ max_parallel_calls: 1
240
+ max_tool_response_length: 256
241
+ tool_response_truncate_side: middle
242
+ interaction_config_path: null
243
+ use_inference_chat_template: false
244
+ tokenization_sanity_check_mode: strict
245
+ format: hermes
246
+ num_repeat_rollouts: null
247
+ calculate_log_probs: false
248
+ agent:
249
+ _target_: verl.workers.config.AgentLoopConfig
250
+ num_workers: 8
251
+ default_agent_loop: single_turn_agent
252
+ agent_loop_config_path: null
253
+ custom_async_server:
254
+ _target_: verl.workers.config.CustomAsyncServerConfig
255
+ path: null
256
+ name: null
257
+ checkpoint_engine:
258
+ _target_: verl.workers.config.CheckpointEngineConfig
259
+ backend: naive
260
+ update_weights_bucket_megabytes: 2048
261
+ engine_kwargs: {}
262
+ trace:
263
+ _target_: verl.workers.config.TraceConfig
264
+ backend: null
265
+ token2text: false
266
+ max_samples_per_step_per_worker: null
267
+ skip_rollout: false
268
+ skip_dump_dir: /tmp/rollout_dump
269
+ skip_tokenizer_init: true
270
+ enable_rollout_routing_replay: false
271
+ profiler:
272
+ _target_: verl.utils.profiler.ProfilerConfig
273
+ tool: ${oc.select:global_profiler.tool,null}
274
+ enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
275
+ all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
276
+ ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
277
+ save_path: ${oc.select:global_profiler.save_path,null}
278
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
279
+ prometheus:
280
+ _target_: verl.workers.config.PrometheusConfig
281
+ enable: false
282
+ port: 9090
283
+ file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml
284
+ served_model_name: ${oc.select:actor_rollout_ref.model.path,null}
285
+ quantization: null
286
+ quantization_config_file: null
287
+ mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
288
+ layered_summon: false
289
+ model:
290
+ _target_: verl.workers.config.HFModelConfig
291
+ path: Qwen/Qwen3-4B-Instruct-2507
292
+ hf_config_path: null
293
+ tokenizer_path: null
294
+ use_shm: false
295
+ trust_remote_code: false
296
+ custom_chat_template: null
297
+ external_lib: null
298
+ override_config: {}
299
+ enable_gradient_checkpointing: true
300
+ enable_activation_offload: false
301
+ use_remove_padding: true
302
+ lora_rank: 0
303
+ lora_alpha: 16
304
+ target_modules: all-linear
305
+ exclude_modules: null
306
+ lora_adapter_path: null
307
+ use_liger: false
308
+ use_fused_kernels: false
309
+ fused_kernel_options:
310
+ impl_backend: torch
311
+ tiled_mlp:
312
+ enabled: false
313
+ num_shards: 4
314
+ mtp:
315
+ _target_: verl.workers.config.MtpConfig
316
+ enable: false
317
+ enable_train: false
318
+ enable_rollout: false
319
+ detach_encoder: false
320
+ mtp_loss_scaling_factor: 0.1
321
+ speculative_algorithm: EAGLE
322
+ speculative_num_steps: 3
323
+ speculative_eagle_topk: 1
324
+ speculative_num_draft_tokens: 4
325
+ method: mtp
326
+ num_speculative_tokens: 1
327
+ hybrid_engine: true
328
+ nccl_timeout: 600
329
+ data:
330
+ tokenizer: null
331
+ use_shm: false
332
+ train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
333
+ val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
334
+ train_max_samples: -1
335
+ val_max_samples: -1
336
+ prompt_key: prompt
337
+ reward_fn_key: data_source
338
+ max_prompt_length: 512
339
+ max_response_length: 768
340
+ train_batch_size: 512
341
+ val_batch_size: null
342
+ tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path,
343
+ null}
344
+ return_raw_input_ids: false
345
+ return_raw_chat: true
346
+ return_full_prompt: false
347
+ shuffle: true
348
+ seed: null
349
+ dataloader_num_workers: 8
350
+ image_patch_size: 14
351
+ validation_shuffle: false
352
+ filter_overlong_prompts: true
353
+ filter_overlong_prompts_workers: 1
354
+ truncation: error
355
+ image_key: images
356
+ video_key: videos
357
+ trust_remote_code: false
358
+ custom_cls:
359
+ path: null
360
+ name: null
361
+ return_multi_modal_inputs: true
362
+ sampler:
363
+ class_path: null
364
+ class_name: null
365
+ datagen:
366
+ path: null
367
+ name: null
368
+ apply_chat_template_kwargs: {}
369
+ reward_manager:
370
+ _target_: verl.trainer.config.config.RewardManagerConfig
371
+ source: register
372
+ name: ${oc.select:reward_model.reward_manager,naive}
373
+ module:
374
+ _target_: verl.trainer.config.config.ModuleConfig
375
+ path: null
376
+ name: custom_reward_manager
377
+ critic:
378
+ optim:
379
+ _target_: verl.workers.config.FSDPOptimizerConfig
380
+ optimizer: AdamW
381
+ optimizer_impl: torch.optim
382
+ lr: 1.0e-05
383
+ lr_warmup_steps_ratio: 0.0
384
+ total_training_steps: -1
385
+ weight_decay: 0.01
386
+ lr_warmup_steps: -1
387
+ betas:
388
+ - 0.9
389
+ - 0.999
390
+ clip_grad: 1.0
391
+ min_lr_ratio: 0.0
392
+ num_cycles: 0.5
393
+ lr_scheduler_type: constant
394
+ warmup_style: null
395
+ override_optimizer_config: null
396
+ model:
397
+ fsdp_config:
398
+ _target_: verl.workers.config.FSDPEngineConfig
399
+ wrap_policy:
400
+ min_num_params: 0
401
+ param_offload: false
402
+ optimizer_offload: false
403
+ offload_policy: false
404
+ reshard_after_forward: true
405
+ fsdp_size: -1
406
+ forward_prefetch: false
407
+ model_dtype: fp32
408
+ use_orig_params: false
409
+ seed: 42
410
+ full_determinism: false
411
+ ulysses_sequence_parallel_size: 1
412
+ entropy_from_logits_with_chunking: false
413
+ use_torch_compile: true
414
+ entropy_checkpointing: false
415
+ forward_only: false
416
+ strategy: fsdp
417
+ dtype: bfloat16
418
+ path: ~/models/deepseek-llm-7b-chat
419
+ tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
420
+ override_config: {}
421
+ external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
422
+ trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
423
+ _target_: verl.workers.config.FSDPCriticModelCfg
424
+ use_shm: false
425
+ enable_gradient_checkpointing: true
426
+ enable_activation_offload: false
427
+ use_remove_padding: false
428
+ lora_rank: 0
429
+ lora_alpha: 16
430
+ target_modules: all-linear
431
+ tiled_mlp:
432
+ enabled: false
433
+ num_shards: 4
434
+ _target_: verl.workers.config.FSDPCriticConfig
435
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
436
+ strategy: fsdp
437
+ enable: null
438
+ ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
439
+ ppo_micro_batch_size: null
440
+ ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
441
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
442
+ ppo_max_token_len_per_gpu: 32768
443
+ forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
444
+ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
445
+ shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
446
+ data_loader_seed: 42
447
+ cliprange_value: 0.5
448
+ loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
449
+ checkpoint:
450
+ _target_: verl.trainer.config.CheckpointConfig
451
+ save_contents:
452
+ - model
453
+ - optimizer
454
+ - extra
455
+ load_contents: ${.save_contents}
456
+ async_save: false
457
+ profiler:
458
+ _target_: verl.utils.profiler.ProfilerConfig
459
+ tool: ${oc.select:global_profiler.tool,null}
460
+ enable: false
461
+ all_ranks: false
462
+ ranks: []
463
+ save_path: ${oc.select:global_profiler.save_path,null}
464
+ tool_config:
465
+ nsys:
466
+ _target_: verl.utils.profiler.config.NsightToolConfig
467
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
468
+ npu:
469
+ _target_: verl.utils.profiler.config.NPUToolConfig
470
+ contents: []
471
+ level: level0
472
+ analysis: true
473
+ discrete: false
474
+ torch:
475
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
476
+ contents: []
477
+ discrete: false
478
+ torch_memory:
479
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
480
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
481
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
482
+ forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
483
+ forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
484
+ ulysses_sequence_parallel_size: 1
485
+ grad_clip: 1.0
486
+ reward_model:
487
+ enable: false
488
+ enable_resource_pool: false
489
+ n_gpus_per_node: 8
490
+ nnodes: 0
491
+ strategy: fsdp
492
+ model:
493
+ input_tokenizer: ${actor_rollout_ref.model.path}
494
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
495
+ external_lib: ${actor_rollout_ref.model.external_lib}
496
+ trust_remote_code: false
497
+ override_config: {}
498
+ use_shm: false
499
+ use_remove_padding: false
500
+ use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
501
+ fsdp_config:
502
+ _target_: verl.workers.config.FSDPEngineConfig
503
+ wrap_policy:
504
+ min_num_params: 0
505
+ param_offload: false
506
+ reshard_after_forward: true
507
+ fsdp_size: -1
508
+ forward_prefetch: false
509
+ micro_batch_size: null
510
+ micro_batch_size_per_gpu: null
511
+ max_length: null
512
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
513
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
514
+ reward_manager: naive
515
+ reward_loop_source: register
516
+ reward_loop_module_path: null
517
+ reward_loop_class_name: null
518
+ launch_reward_fn_async: false
519
+ sandbox_fusion:
520
+ url: null
521
+ max_concurrent: 64
522
+ memory_limit_mb: 1024
523
+ profiler:
524
+ _target_: verl.utils.profiler.ProfilerConfig
525
+ tool: ${oc.select:global_profiler.tool,null}
526
+ enable: false
527
+ all_ranks: false
528
+ ranks: []
529
+ save_path: ${oc.select:global_profiler.save_path,null}
530
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
531
+ ulysses_sequence_parallel_size: 1
532
+ use_reward_loop: true
533
+ num_workers: 1
534
+ rollout:
535
+ _target_: verl.workers.config.RolloutConfig
536
+ name: ???
537
+ dtype: bfloat16
538
+ gpu_memory_utilization: 0.5
539
+ enforce_eager: true
540
+ cudagraph_capture_sizes: null
541
+ free_cache_engine: true
542
+ data_parallel_size: 1
543
+ expert_parallel_size: 1
544
+ tensor_model_parallel_size: 2
545
+ max_num_batched_tokens: 8192
546
+ max_model_len: null
547
+ max_num_seqs: 1024
548
+ load_format: auto
549
+ engine_kwargs: {}
550
+ limit_images: null
551
+ enable_chunked_prefill: true
552
+ enable_prefix_caching: true
553
+ disable_log_stats: true
554
+ skip_tokenizer_init: false
555
+ prompt_length: 2048
556
+ response_length: 2048
557
+ algorithm:
558
+ rollout_correction:
559
+ rollout_is: null
560
+ rollout_is_threshold: 2.0
561
+ rollout_rs: null
562
+ rollout_rs_threshold: null
563
+ bypass_mode: false
564
+ loss_type: ppo_clip
565
+ rollout_is_batch_normalize: false
566
+ _target_: verl.trainer.config.AlgoConfig
567
+ gamma: 1.0
568
+ lam: 1.0
569
+ adv_estimator: grpo
570
+ norm_adv_by_std_in_grpo: true
571
+ use_kl_in_reward: false
572
+ kl_penalty: kl
573
+ kl_ctrl:
574
+ _target_: verl.trainer.config.KLControlConfig
575
+ type: fixed
576
+ kl_coef: 0.001
577
+ horizon: 10000
578
+ target_kl: 0.1
579
+ use_pf_ppo: false
580
+ pf_ppo:
581
+ reweight_method: pow
582
+ weight_pow: 2.0
583
+ custom_reward_function:
584
+ path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
585
+ name: compute_score
586
+ trainer:
587
+ balance_batch: true
588
+ total_epochs: 15
589
+ total_training_steps: null
590
+ project_name: readctrl-verl
591
+ experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs
592
+ logger:
593
+ - console
594
+ - wandb
595
+ log_val_generations: 0
596
+ rollout_data_dir: null
597
+ validation_data_dir: null
598
+ nnodes: 1
599
+ n_gpus_per_node: 2
600
+ save_freq: 20
601
+ esi_redundant_time: 0
602
+ resume_mode: auto
603
+ resume_from_path: null
604
+ val_before_train: true
605
+ val_only: false
606
+ test_freq: 5
607
+ critic_warmup: 0
608
+ default_hdfs_dir: null
609
+ del_local_ckpt_after_load: false
610
+ default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
611
+ max_actor_ckpt_to_keep: null
612
+ max_critic_ckpt_to_keep: null
613
+ ray_wait_register_center_timeout: 300
614
+ device: cuda
615
+ use_legacy_worker_impl: auto
616
+ global_profiler:
617
+ _target_: verl.utils.profiler.ProfilerConfig
618
+ tool: null
619
+ steps: null
620
+ profile_continuous_steps: false
621
+ save_path: outputs/profile
622
+ global_tool_config:
623
+ nsys:
624
+ _target_: verl.utils.profiler.config.NsightToolConfig
625
+ discrete: false
626
+ controller_nsight_options:
627
+ trace: cuda,nvtx,cublas,ucx
628
+ cuda-memory-usage: 'true'
629
+ cuda-graph-trace: graph
630
+ worker_nsight_options:
631
+ trace: cuda,nvtx,cublas,ucx
632
+ cuda-memory-usage: 'true'
633
+ cuda-graph-trace: graph
634
+ capture-range: cudaProfilerApi
635
+ capture-range-end: null
636
+ kill: none
637
+ torch_memory:
638
+ trace_alloc_max_entries: 100000
639
+ stack_depth: 32
640
+ context: all
641
+ stacks: all
642
+ kw_args: {}
643
+ transfer_queue:
644
+ enable: false
645
+ ray_kwargs:
646
+ ray_init:
647
+ num_cpus: null
648
+ timeline_json_file: null
code/RL_model/verl/verl_train/outputs/2026-02-02/00-15-37/.hydra/hydra.yaml ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task:
115
+ - algorithm.adv_estimator=grpo
116
+ - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
117
+ - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
118
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
119
+ - data.train_batch_size=512
120
+ - data.max_prompt_length=512
121
+ - data.max_response_length=768
122
+ - data.filter_overlong_prompts=True
123
+ - data.truncation=error
124
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
125
+ - actor_rollout_ref.actor.optim.lr=1e-6
126
+ - actor_rollout_ref.model.use_remove_padding=True
127
+ - actor_rollout_ref.actor.ppo_mini_batch_size=256
128
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32
129
+ - actor_rollout_ref.actor.use_kl_loss=True
130
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
131
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
132
+ - actor_rollout_ref.actor.entropy_coeff=0
133
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
134
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
135
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
136
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
137
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
138
+ - actor_rollout_ref.rollout.name=vllm
139
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.6
140
+ - actor_rollout_ref.rollout.max_model_len=8192
141
+ - actor_rollout_ref.rollout.n=3
142
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
143
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
144
+ - algorithm.use_kl_in_reward=False
145
+ - trainer.critic_warmup=0
146
+ - trainer.logger=["console","wandb"]
147
+ - trainer.project_name=readctrl-verl
148
+ - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
149
+ - trainer.n_gpus_per_node=2
150
+ - trainer.nnodes=1
151
+ - trainer.save_freq=20
152
+ - trainer.test_freq=5
153
+ - trainer.total_epochs=15
154
+ job:
155
+ name: main_ppo
156
+ chdir: null
157
+ override_dirname: actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=512,data.max_response_length=768,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15
158
+ id: ???
159
+ num: ???
160
+ config_name: ppo_trainer
161
+ env_set: {}
162
+ env_copy: []
163
+ config:
164
+ override_dirname:
165
+ kv_sep: '='
166
+ item_sep: ','
167
+ exclude_keys: []
168
+ runtime:
169
+ version: 1.3.2
170
+ version_base: '1.3'
171
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train
172
+ config_sources:
173
+ - path: hydra.conf
174
+ schema: pkg
175
+ provider: hydra
176
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config
177
+ schema: file
178
+ provider: main
179
+ - path: ''
180
+ schema: structured
181
+ provider: schema
182
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-02/00-15-37
183
+ choices:
184
+ algorithm@algorithm.rollout_correction: rollout_correction
185
+ reward_model: dp_reward_loop
186
+ critic: dp_critic
187
+ critic/../engine@critic.model.fsdp_config: fsdp
188
+ critic/../optim@critic.optim: fsdp
189
+ model@actor_rollout_ref.model: hf_model
190
+ rollout@actor_rollout_ref.rollout: rollout
191
+ ref@actor_rollout_ref.ref: dp_ref
192
+ ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp
193
+ data: legacy_data
194
+ actor@actor_rollout_ref.actor: dp_actor
195
+ actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp
196
+ actor/../optim@actor_rollout_ref.actor.optim: fsdp
197
+ hydra/env: default
198
+ hydra/callbacks: null
199
+ hydra/job_logging: default
200
+ hydra/hydra_logging: default
201
+ hydra/hydra_help: default
202
+ hydra/help: default
203
+ hydra/sweeper: basic
204
+ hydra/launcher: basic
205
+ hydra/output: default
206
+ verbose: false
code/RL_model/verl/verl_train/outputs/2026-02-02/00-15-37/.hydra/overrides.yaml ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - algorithm.adv_estimator=grpo
2
+ - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
3
+ - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
4
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
5
+ - data.train_batch_size=512
6
+ - data.max_prompt_length=512
7
+ - data.max_response_length=768
8
+ - data.filter_overlong_prompts=True
9
+ - data.truncation=error
10
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
11
+ - actor_rollout_ref.actor.optim.lr=1e-6
12
+ - actor_rollout_ref.model.use_remove_padding=True
13
+ - actor_rollout_ref.actor.ppo_mini_batch_size=256
14
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32
15
+ - actor_rollout_ref.actor.use_kl_loss=True
16
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
17
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
18
+ - actor_rollout_ref.actor.entropy_coeff=0
19
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
20
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
21
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
22
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
23
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
24
+ - actor_rollout_ref.rollout.name=vllm
25
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.6
26
+ - actor_rollout_ref.rollout.max_model_len=8192
27
+ - actor_rollout_ref.rollout.n=3
28
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
29
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
30
+ - algorithm.use_kl_in_reward=False
31
+ - trainer.critic_warmup=0
32
+ - trainer.logger=["console","wandb"]
33
+ - trainer.project_name=readctrl-verl
34
+ - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
35
+ - trainer.n_gpus_per_node=2
36
+ - trainer.nnodes=1
37
+ - trainer.save_freq=20
38
+ - trainer.test_freq=5
39
+ - trainer.total_epochs=15
code/RL_model/verl/verl_train/outputs/2026-02-02/00-25-32/.hydra/config.yaml ADDED
@@ -0,0 +1,648 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ actor_rollout_ref:
2
+ actor:
3
+ optim:
4
+ _target_: verl.workers.config.FSDPOptimizerConfig
5
+ optimizer: AdamW
6
+ optimizer_impl: torch.optim
7
+ lr: 1.0e-06
8
+ lr_warmup_steps_ratio: 0.0
9
+ total_training_steps: -1
10
+ weight_decay: 0.01
11
+ lr_warmup_steps: -1
12
+ betas:
13
+ - 0.9
14
+ - 0.999
15
+ clip_grad: 1.0
16
+ min_lr_ratio: 0.0
17
+ num_cycles: 0.5
18
+ lr_scheduler_type: constant
19
+ warmup_style: null
20
+ override_optimizer_config: null
21
+ fsdp_config:
22
+ _target_: verl.workers.config.FSDPEngineConfig
23
+ wrap_policy:
24
+ min_num_params: 0
25
+ param_offload: false
26
+ optimizer_offload: false
27
+ offload_policy: false
28
+ reshard_after_forward: true
29
+ fsdp_size: -1
30
+ forward_prefetch: false
31
+ model_dtype: fp32
32
+ use_orig_params: false
33
+ seed: 42
34
+ full_determinism: false
35
+ ulysses_sequence_parallel_size: 1
36
+ entropy_from_logits_with_chunking: false
37
+ use_torch_compile: true
38
+ entropy_checkpointing: false
39
+ forward_only: false
40
+ strategy: fsdp
41
+ dtype: bfloat16
42
+ _target_: verl.workers.config.FSDPActorConfig
43
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
44
+ strategy: fsdp
45
+ ppo_mini_batch_size: 256
46
+ ppo_micro_batch_size: null
47
+ ppo_micro_batch_size_per_gpu: 32
48
+ use_dynamic_bsz: false
49
+ ppo_max_token_len_per_gpu: 16384
50
+ clip_ratio: 0.2
51
+ clip_ratio_low: 0.2
52
+ clip_ratio_high: 0.2
53
+ tau_pos: 1.0
54
+ tau_neg: 1.05
55
+ freeze_vision_tower: false
56
+ policy_loss:
57
+ _target_: verl.workers.config.PolicyLossConfig
58
+ loss_mode: vanilla
59
+ clip_cov_ratio: 0.0002
60
+ clip_cov_lb: 1.0
61
+ clip_cov_ub: 5.0
62
+ kl_cov_ratio: 0.0002
63
+ ppo_kl_coef: 0.1
64
+ clip_ratio_c: 3.0
65
+ loss_agg_mode: token-mean
66
+ loss_scale_factor: null
67
+ entropy_coeff: 0
68
+ calculate_entropy: false
69
+ use_kl_loss: true
70
+ use_prefix_grouper: false
71
+ use_torch_compile: true
72
+ kl_loss_coef: 0.001
73
+ kl_loss_type: low_var_kl
74
+ ppo_epochs: 1
75
+ shuffle: false
76
+ data_loader_seed: 42
77
+ checkpoint:
78
+ _target_: verl.trainer.config.CheckpointConfig
79
+ save_contents:
80
+ - model
81
+ - optimizer
82
+ - extra
83
+ load_contents: ${.save_contents}
84
+ async_save: false
85
+ use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
86
+ profiler:
87
+ _target_: verl.utils.profiler.ProfilerConfig
88
+ tool: ${oc.select:global_profiler.tool,null}
89
+ enable: false
90
+ all_ranks: false
91
+ ranks: []
92
+ save_path: ${oc.select:global_profiler.save_path,null}
93
+ tool_config:
94
+ nsys:
95
+ _target_: verl.utils.profiler.config.NsightToolConfig
96
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
97
+ npu:
98
+ _target_: verl.utils.profiler.config.NPUToolConfig
99
+ contents: []
100
+ level: level0
101
+ analysis: true
102
+ discrete: false
103
+ torch:
104
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
105
+ contents: []
106
+ discrete: false
107
+ torch_memory:
108
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
109
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
110
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
111
+ router_replay:
112
+ _target_: verl.workers.config.RouterReplayConfig
113
+ mode: disabled
114
+ record_file: null
115
+ replay_file: null
116
+ grad_clip: 1.0
117
+ ulysses_sequence_parallel_size: 1
118
+ entropy_from_logits_with_chunking: false
119
+ entropy_checkpointing: false
120
+ use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
121
+ calculate_sum_pi_squared: false
122
+ sum_pi_squared_checkpointing: false
123
+ ref:
124
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
125
+ strategy: ${actor_rollout_ref.actor.strategy}
126
+ use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
127
+ log_prob_micro_batch_size: null
128
+ log_prob_micro_batch_size_per_gpu: 32
129
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
130
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
131
+ profiler:
132
+ _target_: verl.utils.profiler.ProfilerConfig
133
+ tool: ${oc.select:global_profiler.tool,null}
134
+ enable: false
135
+ all_ranks: false
136
+ ranks: []
137
+ save_path: ${oc.select:global_profiler.save_path,null}
138
+ tool_config:
139
+ nsys:
140
+ _target_: verl.utils.profiler.config.NsightToolConfig
141
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
142
+ npu:
143
+ _target_: verl.utils.profiler.config.NPUToolConfig
144
+ contents: []
145
+ level: level0
146
+ analysis: true
147
+ discrete: false
148
+ torch:
149
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
150
+ contents: []
151
+ discrete: false
152
+ torch_memory:
153
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
154
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
155
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
156
+ router_replay:
157
+ _target_: verl.workers.config.RouterReplayConfig
158
+ mode: disabled
159
+ record_file: null
160
+ replay_file: null
161
+ fsdp_config:
162
+ _target_: verl.workers.config.FSDPEngineConfig
163
+ wrap_policy:
164
+ min_num_params: 0
165
+ param_offload: false
166
+ optimizer_offload: false
167
+ offload_policy: false
168
+ reshard_after_forward: true
169
+ fsdp_size: -1
170
+ forward_prefetch: false
171
+ model_dtype: fp32
172
+ use_orig_params: false
173
+ seed: 42
174
+ full_determinism: false
175
+ ulysses_sequence_parallel_size: 1
176
+ entropy_from_logits_with_chunking: false
177
+ use_torch_compile: true
178
+ entropy_checkpointing: false
179
+ forward_only: true
180
+ strategy: fsdp
181
+ dtype: bfloat16
182
+ _target_: verl.workers.config.FSDPActorConfig
183
+ ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
184
+ entropy_from_logits_with_chunking: false
185
+ entropy_checkpointing: false
186
+ rollout:
187
+ _target_: verl.workers.config.RolloutConfig
188
+ name: vllm
189
+ mode: async
190
+ temperature: 1.0
191
+ top_k: -1
192
+ top_p: 1
193
+ prompt_length: ${oc.select:data.max_prompt_length,512}
194
+ response_length: ${oc.select:data.max_response_length,512}
195
+ dtype: bfloat16
196
+ gpu_memory_utilization: 0.6
197
+ ignore_eos: false
198
+ enforce_eager: false
199
+ cudagraph_capture_sizes: null
200
+ free_cache_engine: true
201
+ tensor_model_parallel_size: 1
202
+ data_parallel_size: 1
203
+ expert_parallel_size: 1
204
+ pipeline_model_parallel_size: 1
205
+ max_num_batched_tokens: 8192
206
+ max_model_len: 8192
207
+ max_num_seqs: 1024
208
+ enable_chunked_prefill: true
209
+ enable_prefix_caching: true
210
+ logprobs_mode: processed_logprobs
211
+ scheduling_policy: fcfs
212
+ load_format: dummy
213
+ log_prob_micro_batch_size: null
214
+ log_prob_micro_batch_size_per_gpu: 32
215
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
216
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
217
+ disable_log_stats: true
218
+ do_sample: true
219
+ 'n': 3
220
+ over_sample_rate: 0
221
+ multi_stage_wake_up: false
222
+ engine_kwargs:
223
+ vllm: {}
224
+ sglang: {}
225
+ trtllm: {}
226
+ val_kwargs:
227
+ _target_: verl.workers.config.SamplingConfig
228
+ top_k: -1
229
+ top_p: 1.0
230
+ temperature: 0
231
+ 'n': 1
232
+ do_sample: false
233
+ multi_turn:
234
+ _target_: verl.workers.config.MultiTurnConfig
235
+ enable: false
236
+ max_assistant_turns: null
237
+ tool_config_path: null
238
+ max_user_turns: null
239
+ max_parallel_calls: 1
240
+ max_tool_response_length: 256
241
+ tool_response_truncate_side: middle
242
+ interaction_config_path: null
243
+ use_inference_chat_template: false
244
+ tokenization_sanity_check_mode: strict
245
+ format: hermes
246
+ num_repeat_rollouts: null
247
+ calculate_log_probs: false
248
+ agent:
249
+ _target_: verl.workers.config.AgentLoopConfig
250
+ num_workers: 8
251
+ default_agent_loop: single_turn_agent
252
+ agent_loop_config_path: null
253
+ custom_async_server:
254
+ _target_: verl.workers.config.CustomAsyncServerConfig
255
+ path: null
256
+ name: null
257
+ checkpoint_engine:
258
+ _target_: verl.workers.config.CheckpointEngineConfig
259
+ backend: naive
260
+ update_weights_bucket_megabytes: 2048
261
+ engine_kwargs: {}
262
+ trace:
263
+ _target_: verl.workers.config.TraceConfig
264
+ backend: null
265
+ token2text: false
266
+ max_samples_per_step_per_worker: null
267
+ skip_rollout: false
268
+ skip_dump_dir: /tmp/rollout_dump
269
+ skip_tokenizer_init: true
270
+ enable_rollout_routing_replay: false
271
+ profiler:
272
+ _target_: verl.utils.profiler.ProfilerConfig
273
+ tool: ${oc.select:global_profiler.tool,null}
274
+ enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
275
+ all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
276
+ ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
277
+ save_path: ${oc.select:global_profiler.save_path,null}
278
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
279
+ prometheus:
280
+ _target_: verl.workers.config.PrometheusConfig
281
+ enable: false
282
+ port: 9090
283
+ file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml
284
+ served_model_name: ${oc.select:actor_rollout_ref.model.path,null}
285
+ quantization: null
286
+ quantization_config_file: null
287
+ mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
288
+ layered_summon: false
289
+ model:
290
+ _target_: verl.workers.config.HFModelConfig
291
+ path: Qwen/Qwen3-4B-Instruct-2507
292
+ hf_config_path: null
293
+ tokenizer_path: null
294
+ use_shm: false
295
+ trust_remote_code: false
296
+ custom_chat_template: null
297
+ external_lib: null
298
+ override_config: {}
299
+ enable_gradient_checkpointing: true
300
+ enable_activation_offload: false
301
+ use_remove_padding: true
302
+ lora_rank: 0
303
+ lora_alpha: 16
304
+ target_modules: all-linear
305
+ exclude_modules: null
306
+ lora_adapter_path: null
307
+ use_liger: false
308
+ use_fused_kernels: false
309
+ fused_kernel_options:
310
+ impl_backend: torch
311
+ tiled_mlp:
312
+ enabled: false
313
+ num_shards: 4
314
+ mtp:
315
+ _target_: verl.workers.config.MtpConfig
316
+ enable: false
317
+ enable_train: false
318
+ enable_rollout: false
319
+ detach_encoder: false
320
+ mtp_loss_scaling_factor: 0.1
321
+ speculative_algorithm: EAGLE
322
+ speculative_num_steps: 3
323
+ speculative_eagle_topk: 1
324
+ speculative_num_draft_tokens: 4
325
+ method: mtp
326
+ num_speculative_tokens: 1
327
+ hybrid_engine: true
328
+ nccl_timeout: 600
329
+ data:
330
+ tokenizer: null
331
+ use_shm: false
332
+ train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
333
+ val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
334
+ train_max_samples: -1
335
+ val_max_samples: -1
336
+ prompt_key: prompt
337
+ reward_fn_key: data_source
338
+ max_prompt_length: 512
339
+ max_response_length: 768
340
+ train_batch_size: 512
341
+ val_batch_size: null
342
+ tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path,
343
+ null}
344
+ return_raw_input_ids: false
345
+ return_raw_chat: true
346
+ return_full_prompt: false
347
+ shuffle: true
348
+ seed: null
349
+ dataloader_num_workers: 8
350
+ image_patch_size: 14
351
+ validation_shuffle: false
352
+ filter_overlong_prompts: true
353
+ filter_overlong_prompts_workers: 1
354
+ truncation: error
355
+ image_key: images
356
+ video_key: videos
357
+ trust_remote_code: false
358
+ custom_cls:
359
+ path: null
360
+ name: null
361
+ return_multi_modal_inputs: true
362
+ sampler:
363
+ class_path: null
364
+ class_name: null
365
+ datagen:
366
+ path: null
367
+ name: null
368
+ apply_chat_template_kwargs: {}
369
+ reward_manager:
370
+ _target_: verl.trainer.config.config.RewardManagerConfig
371
+ source: register
372
+ name: ${oc.select:reward_model.reward_manager,naive}
373
+ module:
374
+ _target_: verl.trainer.config.config.ModuleConfig
375
+ path: null
376
+ name: custom_reward_manager
377
+ critic:
378
+ optim:
379
+ _target_: verl.workers.config.FSDPOptimizerConfig
380
+ optimizer: AdamW
381
+ optimizer_impl: torch.optim
382
+ lr: 1.0e-05
383
+ lr_warmup_steps_ratio: 0.0
384
+ total_training_steps: -1
385
+ weight_decay: 0.01
386
+ lr_warmup_steps: -1
387
+ betas:
388
+ - 0.9
389
+ - 0.999
390
+ clip_grad: 1.0
391
+ min_lr_ratio: 0.0
392
+ num_cycles: 0.5
393
+ lr_scheduler_type: constant
394
+ warmup_style: null
395
+ override_optimizer_config: null
396
+ model:
397
+ fsdp_config:
398
+ _target_: verl.workers.config.FSDPEngineConfig
399
+ wrap_policy:
400
+ min_num_params: 0
401
+ param_offload: false
402
+ optimizer_offload: false
403
+ offload_policy: false
404
+ reshard_after_forward: true
405
+ fsdp_size: -1
406
+ forward_prefetch: false
407
+ model_dtype: fp32
408
+ use_orig_params: false
409
+ seed: 42
410
+ full_determinism: false
411
+ ulysses_sequence_parallel_size: 1
412
+ entropy_from_logits_with_chunking: false
413
+ use_torch_compile: true
414
+ entropy_checkpointing: false
415
+ forward_only: false
416
+ strategy: fsdp
417
+ dtype: bfloat16
418
+ path: ~/models/deepseek-llm-7b-chat
419
+ tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
420
+ override_config: {}
421
+ external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
422
+ trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
423
+ _target_: verl.workers.config.FSDPCriticModelCfg
424
+ use_shm: false
425
+ enable_gradient_checkpointing: true
426
+ enable_activation_offload: false
427
+ use_remove_padding: false
428
+ lora_rank: 0
429
+ lora_alpha: 16
430
+ target_modules: all-linear
431
+ tiled_mlp:
432
+ enabled: false
433
+ num_shards: 4
434
+ _target_: verl.workers.config.FSDPCriticConfig
435
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
436
+ strategy: fsdp
437
+ enable: null
438
+ ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
439
+ ppo_micro_batch_size: null
440
+ ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
441
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
442
+ ppo_max_token_len_per_gpu: 32768
443
+ forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
444
+ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
445
+ shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
446
+ data_loader_seed: 42
447
+ cliprange_value: 0.5
448
+ loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
449
+ checkpoint:
450
+ _target_: verl.trainer.config.CheckpointConfig
451
+ save_contents:
452
+ - model
453
+ - optimizer
454
+ - extra
455
+ load_contents: ${.save_contents}
456
+ async_save: false
457
+ profiler:
458
+ _target_: verl.utils.profiler.ProfilerConfig
459
+ tool: ${oc.select:global_profiler.tool,null}
460
+ enable: false
461
+ all_ranks: false
462
+ ranks: []
463
+ save_path: ${oc.select:global_profiler.save_path,null}
464
+ tool_config:
465
+ nsys:
466
+ _target_: verl.utils.profiler.config.NsightToolConfig
467
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
468
+ npu:
469
+ _target_: verl.utils.profiler.config.NPUToolConfig
470
+ contents: []
471
+ level: level0
472
+ analysis: true
473
+ discrete: false
474
+ torch:
475
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
476
+ contents: []
477
+ discrete: false
478
+ torch_memory:
479
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
480
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
481
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
482
+ forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
483
+ forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
484
+ ulysses_sequence_parallel_size: 1
485
+ grad_clip: 1.0
486
+ reward_model:
487
+ enable: false
488
+ enable_resource_pool: false
489
+ n_gpus_per_node: 8
490
+ nnodes: 0
491
+ strategy: fsdp
492
+ model:
493
+ input_tokenizer: ${actor_rollout_ref.model.path}
494
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
495
+ external_lib: ${actor_rollout_ref.model.external_lib}
496
+ trust_remote_code: false
497
+ override_config: {}
498
+ use_shm: false
499
+ use_remove_padding: false
500
+ use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
501
+ fsdp_config:
502
+ _target_: verl.workers.config.FSDPEngineConfig
503
+ wrap_policy:
504
+ min_num_params: 0
505
+ param_offload: false
506
+ reshard_after_forward: true
507
+ fsdp_size: -1
508
+ forward_prefetch: false
509
+ micro_batch_size: null
510
+ micro_batch_size_per_gpu: null
511
+ max_length: null
512
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
513
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
514
+ reward_manager: naive
515
+ reward_loop_source: register
516
+ reward_loop_module_path: null
517
+ reward_loop_class_name: null
518
+ launch_reward_fn_async: false
519
+ sandbox_fusion:
520
+ url: null
521
+ max_concurrent: 64
522
+ memory_limit_mb: 1024
523
+ profiler:
524
+ _target_: verl.utils.profiler.ProfilerConfig
525
+ tool: ${oc.select:global_profiler.tool,null}
526
+ enable: false
527
+ all_ranks: false
528
+ ranks: []
529
+ save_path: ${oc.select:global_profiler.save_path,null}
530
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
531
+ ulysses_sequence_parallel_size: 1
532
+ use_reward_loop: true
533
+ num_workers: 1
534
+ rollout:
535
+ _target_: verl.workers.config.RolloutConfig
536
+ name: ???
537
+ dtype: bfloat16
538
+ gpu_memory_utilization: 0.5
539
+ enforce_eager: true
540
+ cudagraph_capture_sizes: null
541
+ free_cache_engine: true
542
+ data_parallel_size: 1
543
+ expert_parallel_size: 1
544
+ tensor_model_parallel_size: 2
545
+ max_num_batched_tokens: 8192
546
+ max_model_len: null
547
+ max_num_seqs: 1024
548
+ load_format: auto
549
+ engine_kwargs: {}
550
+ limit_images: null
551
+ enable_chunked_prefill: true
552
+ enable_prefix_caching: true
553
+ disable_log_stats: true
554
+ skip_tokenizer_init: false
555
+ prompt_length: 2048
556
+ response_length: 2048
557
+ algorithm:
558
+ rollout_correction:
559
+ rollout_is: null
560
+ rollout_is_threshold: 2.0
561
+ rollout_rs: null
562
+ rollout_rs_threshold: null
563
+ bypass_mode: false
564
+ loss_type: ppo_clip
565
+ rollout_is_batch_normalize: false
566
+ _target_: verl.trainer.config.AlgoConfig
567
+ gamma: 1.0
568
+ lam: 1.0
569
+ adv_estimator: grpo
570
+ norm_adv_by_std_in_grpo: true
571
+ use_kl_in_reward: false
572
+ kl_penalty: kl
573
+ kl_ctrl:
574
+ _target_: verl.trainer.config.KLControlConfig
575
+ type: fixed
576
+ kl_coef: 0.001
577
+ horizon: 10000
578
+ target_kl: 0.1
579
+ use_pf_ppo: false
580
+ pf_ppo:
581
+ reweight_method: pow
582
+ weight_pow: 2.0
583
+ custom_reward_function:
584
+ path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
585
+ name: compute_score
586
+ trainer:
587
+ balance_batch: true
588
+ total_epochs: 15
589
+ total_training_steps: null
590
+ project_name: readctrl-verl
591
+ experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs
592
+ logger:
593
+ - console
594
+ - wandb
595
+ log_val_generations: 0
596
+ rollout_data_dir: null
597
+ validation_data_dir: null
598
+ nnodes: 1
599
+ n_gpus_per_node: 2
600
+ save_freq: 20
601
+ esi_redundant_time: 0
602
+ resume_mode: auto
603
+ resume_from_path: null
604
+ val_before_train: true
605
+ val_only: false
606
+ test_freq: 5
607
+ critic_warmup: 0
608
+ default_hdfs_dir: null
609
+ del_local_ckpt_after_load: false
610
+ default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
611
+ max_actor_ckpt_to_keep: null
612
+ max_critic_ckpt_to_keep: null
613
+ ray_wait_register_center_timeout: 300
614
+ device: cuda
615
+ use_legacy_worker_impl: auto
616
+ global_profiler:
617
+ _target_: verl.utils.profiler.ProfilerConfig
618
+ tool: null
619
+ steps: null
620
+ profile_continuous_steps: false
621
+ save_path: outputs/profile
622
+ global_tool_config:
623
+ nsys:
624
+ _target_: verl.utils.profiler.config.NsightToolConfig
625
+ discrete: false
626
+ controller_nsight_options:
627
+ trace: cuda,nvtx,cublas,ucx
628
+ cuda-memory-usage: 'true'
629
+ cuda-graph-trace: graph
630
+ worker_nsight_options:
631
+ trace: cuda,nvtx,cublas,ucx
632
+ cuda-memory-usage: 'true'
633
+ cuda-graph-trace: graph
634
+ capture-range: cudaProfilerApi
635
+ capture-range-end: null
636
+ kill: none
637
+ torch_memory:
638
+ trace_alloc_max_entries: 100000
639
+ stack_depth: 32
640
+ context: all
641
+ stacks: all
642
+ kw_args: {}
643
+ transfer_queue:
644
+ enable: false
645
+ ray_kwargs:
646
+ ray_init:
647
+ num_cpus: null
648
+ timeline_json_file: null
code/RL_model/verl/verl_train/outputs/2026-02-02/00-25-32/.hydra/overrides.yaml ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - algorithm.adv_estimator=grpo
2
+ - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
3
+ - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
4
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
5
+ - data.train_batch_size=512
6
+ - data.max_prompt_length=512
7
+ - data.max_response_length=768
8
+ - data.filter_overlong_prompts=True
9
+ - data.truncation=error
10
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
11
+ - actor_rollout_ref.actor.optim.lr=1e-6
12
+ - actor_rollout_ref.model.use_remove_padding=True
13
+ - actor_rollout_ref.actor.ppo_mini_batch_size=256
14
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32
15
+ - actor_rollout_ref.actor.use_kl_loss=True
16
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
17
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
18
+ - actor_rollout_ref.actor.entropy_coeff=0
19
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
20
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
21
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
22
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
23
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
24
+ - actor_rollout_ref.rollout.name=vllm
25
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.6
26
+ - actor_rollout_ref.rollout.max_model_len=8192
27
+ - actor_rollout_ref.rollout.n=3
28
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
29
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
30
+ - algorithm.use_kl_in_reward=False
31
+ - trainer.critic_warmup=0
32
+ - trainer.logger=["console","wandb"]
33
+ - trainer.project_name=readctrl-verl
34
+ - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
35
+ - trainer.n_gpus_per_node=2
36
+ - trainer.nnodes=1
37
+ - trainer.save_freq=20
38
+ - trainer.test_freq=5
39
+ - trainer.total_epochs=15
code/RL_model/verl/verl_train/outputs/2026-02-02/00-28-20/.hydra/config.yaml ADDED
@@ -0,0 +1,648 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ actor_rollout_ref:
2
+ actor:
3
+ optim:
4
+ _target_: verl.workers.config.FSDPOptimizerConfig
5
+ optimizer: AdamW
6
+ optimizer_impl: torch.optim
7
+ lr: 1.0e-06
8
+ lr_warmup_steps_ratio: 0.0
9
+ total_training_steps: -1
10
+ weight_decay: 0.01
11
+ lr_warmup_steps: -1
12
+ betas:
13
+ - 0.9
14
+ - 0.999
15
+ clip_grad: 1.0
16
+ min_lr_ratio: 0.0
17
+ num_cycles: 0.5
18
+ lr_scheduler_type: constant
19
+ warmup_style: null
20
+ override_optimizer_config: null
21
+ fsdp_config:
22
+ _target_: verl.workers.config.FSDPEngineConfig
23
+ wrap_policy:
24
+ min_num_params: 0
25
+ param_offload: false
26
+ optimizer_offload: false
27
+ offload_policy: false
28
+ reshard_after_forward: true
29
+ fsdp_size: -1
30
+ forward_prefetch: false
31
+ model_dtype: fp32
32
+ use_orig_params: false
33
+ seed: 42
34
+ full_determinism: false
35
+ ulysses_sequence_parallel_size: 1
36
+ entropy_from_logits_with_chunking: false
37
+ use_torch_compile: true
38
+ entropy_checkpointing: false
39
+ forward_only: false
40
+ strategy: fsdp
41
+ dtype: bfloat16
42
+ _target_: verl.workers.config.FSDPActorConfig
43
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
44
+ strategy: fsdp
45
+ ppo_mini_batch_size: 256
46
+ ppo_micro_batch_size: null
47
+ ppo_micro_batch_size_per_gpu: 32
48
+ use_dynamic_bsz: false
49
+ ppo_max_token_len_per_gpu: 16384
50
+ clip_ratio: 0.2
51
+ clip_ratio_low: 0.2
52
+ clip_ratio_high: 0.2
53
+ tau_pos: 1.0
54
+ tau_neg: 1.05
55
+ freeze_vision_tower: false
56
+ policy_loss:
57
+ _target_: verl.workers.config.PolicyLossConfig
58
+ loss_mode: vanilla
59
+ clip_cov_ratio: 0.0002
60
+ clip_cov_lb: 1.0
61
+ clip_cov_ub: 5.0
62
+ kl_cov_ratio: 0.0002
63
+ ppo_kl_coef: 0.1
64
+ clip_ratio_c: 3.0
65
+ loss_agg_mode: token-mean
66
+ loss_scale_factor: null
67
+ entropy_coeff: 0
68
+ calculate_entropy: false
69
+ use_kl_loss: true
70
+ use_prefix_grouper: false
71
+ use_torch_compile: true
72
+ kl_loss_coef: 0.001
73
+ kl_loss_type: low_var_kl
74
+ ppo_epochs: 1
75
+ shuffle: false
76
+ data_loader_seed: 42
77
+ checkpoint:
78
+ _target_: verl.trainer.config.CheckpointConfig
79
+ save_contents:
80
+ - model
81
+ - optimizer
82
+ - extra
83
+ load_contents: ${.save_contents}
84
+ async_save: false
85
+ use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
86
+ profiler:
87
+ _target_: verl.utils.profiler.ProfilerConfig
88
+ tool: ${oc.select:global_profiler.tool,null}
89
+ enable: false
90
+ all_ranks: false
91
+ ranks: []
92
+ save_path: ${oc.select:global_profiler.save_path,null}
93
+ tool_config:
94
+ nsys:
95
+ _target_: verl.utils.profiler.config.NsightToolConfig
96
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
97
+ npu:
98
+ _target_: verl.utils.profiler.config.NPUToolConfig
99
+ contents: []
100
+ level: level0
101
+ analysis: true
102
+ discrete: false
103
+ torch:
104
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
105
+ contents: []
106
+ discrete: false
107
+ torch_memory:
108
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
109
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
110
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
111
+ router_replay:
112
+ _target_: verl.workers.config.RouterReplayConfig
113
+ mode: disabled
114
+ record_file: null
115
+ replay_file: null
116
+ grad_clip: 1.0
117
+ ulysses_sequence_parallel_size: 1
118
+ entropy_from_logits_with_chunking: false
119
+ entropy_checkpointing: false
120
+ use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
121
+ calculate_sum_pi_squared: false
122
+ sum_pi_squared_checkpointing: false
123
+ ref:
124
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
125
+ strategy: ${actor_rollout_ref.actor.strategy}
126
+ use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
127
+ log_prob_micro_batch_size: null
128
+ log_prob_micro_batch_size_per_gpu: 32
129
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
130
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
131
+ profiler:
132
+ _target_: verl.utils.profiler.ProfilerConfig
133
+ tool: ${oc.select:global_profiler.tool,null}
134
+ enable: false
135
+ all_ranks: false
136
+ ranks: []
137
+ save_path: ${oc.select:global_profiler.save_path,null}
138
+ tool_config:
139
+ nsys:
140
+ _target_: verl.utils.profiler.config.NsightToolConfig
141
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
142
+ npu:
143
+ _target_: verl.utils.profiler.config.NPUToolConfig
144
+ contents: []
145
+ level: level0
146
+ analysis: true
147
+ discrete: false
148
+ torch:
149
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
150
+ contents: []
151
+ discrete: false
152
+ torch_memory:
153
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
154
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
155
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
156
+ router_replay:
157
+ _target_: verl.workers.config.RouterReplayConfig
158
+ mode: disabled
159
+ record_file: null
160
+ replay_file: null
161
+ fsdp_config:
162
+ _target_: verl.workers.config.FSDPEngineConfig
163
+ wrap_policy:
164
+ min_num_params: 0
165
+ param_offload: false
166
+ optimizer_offload: false
167
+ offload_policy: false
168
+ reshard_after_forward: true
169
+ fsdp_size: -1
170
+ forward_prefetch: false
171
+ model_dtype: fp32
172
+ use_orig_params: false
173
+ seed: 42
174
+ full_determinism: false
175
+ ulysses_sequence_parallel_size: 1
176
+ entropy_from_logits_with_chunking: false
177
+ use_torch_compile: true
178
+ entropy_checkpointing: false
179
+ forward_only: true
180
+ strategy: fsdp
181
+ dtype: bfloat16
182
+ _target_: verl.workers.config.FSDPActorConfig
183
+ ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
184
+ entropy_from_logits_with_chunking: false
185
+ entropy_checkpointing: false
186
+ rollout:
187
+ _target_: verl.workers.config.RolloutConfig
188
+ name: vllm
189
+ mode: async
190
+ temperature: 1.0
191
+ top_k: -1
192
+ top_p: 1
193
+ prompt_length: ${oc.select:data.max_prompt_length,512}
194
+ response_length: ${oc.select:data.max_response_length,512}
195
+ dtype: bfloat16
196
+ gpu_memory_utilization: 0.6
197
+ ignore_eos: false
198
+ enforce_eager: false
199
+ cudagraph_capture_sizes: null
200
+ free_cache_engine: true
201
+ tensor_model_parallel_size: 1
202
+ data_parallel_size: 1
203
+ expert_parallel_size: 1
204
+ pipeline_model_parallel_size: 1
205
+ max_num_batched_tokens: 8192
206
+ max_model_len: 8192
207
+ max_num_seqs: 1024
208
+ enable_chunked_prefill: true
209
+ enable_prefix_caching: true
210
+ logprobs_mode: processed_logprobs
211
+ scheduling_policy: fcfs
212
+ load_format: dummy
213
+ log_prob_micro_batch_size: null
214
+ log_prob_micro_batch_size_per_gpu: 32
215
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
216
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
217
+ disable_log_stats: true
218
+ do_sample: true
219
+ 'n': 3
220
+ over_sample_rate: 0
221
+ multi_stage_wake_up: false
222
+ engine_kwargs:
223
+ vllm: {}
224
+ sglang: {}
225
+ trtllm: {}
226
+ val_kwargs:
227
+ _target_: verl.workers.config.SamplingConfig
228
+ top_k: -1
229
+ top_p: 1.0
230
+ temperature: 0
231
+ 'n': 1
232
+ do_sample: false
233
+ multi_turn:
234
+ _target_: verl.workers.config.MultiTurnConfig
235
+ enable: false
236
+ max_assistant_turns: null
237
+ tool_config_path: null
238
+ max_user_turns: null
239
+ max_parallel_calls: 1
240
+ max_tool_response_length: 256
241
+ tool_response_truncate_side: middle
242
+ interaction_config_path: null
243
+ use_inference_chat_template: false
244
+ tokenization_sanity_check_mode: strict
245
+ format: hermes
246
+ num_repeat_rollouts: null
247
+ calculate_log_probs: false
248
+ agent:
249
+ _target_: verl.workers.config.AgentLoopConfig
250
+ num_workers: 8
251
+ default_agent_loop: single_turn_agent
252
+ agent_loop_config_path: null
253
+ custom_async_server:
254
+ _target_: verl.workers.config.CustomAsyncServerConfig
255
+ path: null
256
+ name: null
257
+ checkpoint_engine:
258
+ _target_: verl.workers.config.CheckpointEngineConfig
259
+ backend: naive
260
+ update_weights_bucket_megabytes: 2048
261
+ engine_kwargs: {}
262
+ trace:
263
+ _target_: verl.workers.config.TraceConfig
264
+ backend: null
265
+ token2text: false
266
+ max_samples_per_step_per_worker: null
267
+ skip_rollout: false
268
+ skip_dump_dir: /tmp/rollout_dump
269
+ skip_tokenizer_init: true
270
+ enable_rollout_routing_replay: false
271
+ profiler:
272
+ _target_: verl.utils.profiler.ProfilerConfig
273
+ tool: ${oc.select:global_profiler.tool,null}
274
+ enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
275
+ all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
276
+ ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
277
+ save_path: ${oc.select:global_profiler.save_path,null}
278
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
279
+ prometheus:
280
+ _target_: verl.workers.config.PrometheusConfig
281
+ enable: false
282
+ port: 9090
283
+ file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml
284
+ served_model_name: ${oc.select:actor_rollout_ref.model.path,null}
285
+ quantization: null
286
+ quantization_config_file: null
287
+ mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
288
+ layered_summon: false
289
+ model:
290
+ _target_: verl.workers.config.HFModelConfig
291
+ path: Qwen/Qwen3-4B-Instruct-2507
292
+ hf_config_path: null
293
+ tokenizer_path: null
294
+ use_shm: false
295
+ trust_remote_code: false
296
+ custom_chat_template: null
297
+ external_lib: null
298
+ override_config: {}
299
+ enable_gradient_checkpointing: true
300
+ enable_activation_offload: false
301
+ use_remove_padding: true
302
+ lora_rank: 0
303
+ lora_alpha: 16
304
+ target_modules: all-linear
305
+ exclude_modules: null
306
+ lora_adapter_path: null
307
+ use_liger: false
308
+ use_fused_kernels: false
309
+ fused_kernel_options:
310
+ impl_backend: torch
311
+ tiled_mlp:
312
+ enabled: false
313
+ num_shards: 4
314
+ mtp:
315
+ _target_: verl.workers.config.MtpConfig
316
+ enable: false
317
+ enable_train: false
318
+ enable_rollout: false
319
+ detach_encoder: false
320
+ mtp_loss_scaling_factor: 0.1
321
+ speculative_algorithm: EAGLE
322
+ speculative_num_steps: 3
323
+ speculative_eagle_topk: 1
324
+ speculative_num_draft_tokens: 4
325
+ method: mtp
326
+ num_speculative_tokens: 1
327
+ hybrid_engine: true
328
+ nccl_timeout: 600
329
+ data:
330
+ tokenizer: null
331
+ use_shm: false
332
+ train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
333
+ val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
334
+ train_max_samples: -1
335
+ val_max_samples: -1
336
+ prompt_key: prompt
337
+ reward_fn_key: data_source
338
+ max_prompt_length: 512
339
+ max_response_length: 768
340
+ train_batch_size: 512
341
+ val_batch_size: null
342
+ tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path,
343
+ null}
344
+ return_raw_input_ids: false
345
+ return_raw_chat: true
346
+ return_full_prompt: false
347
+ shuffle: true
348
+ seed: null
349
+ dataloader_num_workers: 8
350
+ image_patch_size: 14
351
+ validation_shuffle: false
352
+ filter_overlong_prompts: true
353
+ filter_overlong_prompts_workers: 1
354
+ truncation: error
355
+ image_key: images
356
+ video_key: videos
357
+ trust_remote_code: false
358
+ custom_cls:
359
+ path: null
360
+ name: null
361
+ return_multi_modal_inputs: true
362
+ sampler:
363
+ class_path: null
364
+ class_name: null
365
+ datagen:
366
+ path: null
367
+ name: null
368
+ apply_chat_template_kwargs: {}
369
+ reward_manager:
370
+ _target_: verl.trainer.config.config.RewardManagerConfig
371
+ source: register
372
+ name: ${oc.select:reward_model.reward_manager,naive}
373
+ module:
374
+ _target_: verl.trainer.config.config.ModuleConfig
375
+ path: null
376
+ name: custom_reward_manager
377
+ critic:
378
+ optim:
379
+ _target_: verl.workers.config.FSDPOptimizerConfig
380
+ optimizer: AdamW
381
+ optimizer_impl: torch.optim
382
+ lr: 1.0e-05
383
+ lr_warmup_steps_ratio: 0.0
384
+ total_training_steps: -1
385
+ weight_decay: 0.01
386
+ lr_warmup_steps: -1
387
+ betas:
388
+ - 0.9
389
+ - 0.999
390
+ clip_grad: 1.0
391
+ min_lr_ratio: 0.0
392
+ num_cycles: 0.5
393
+ lr_scheduler_type: constant
394
+ warmup_style: null
395
+ override_optimizer_config: null
396
+ model:
397
+ fsdp_config:
398
+ _target_: verl.workers.config.FSDPEngineConfig
399
+ wrap_policy:
400
+ min_num_params: 0
401
+ param_offload: false
402
+ optimizer_offload: false
403
+ offload_policy: false
404
+ reshard_after_forward: true
405
+ fsdp_size: -1
406
+ forward_prefetch: false
407
+ model_dtype: fp32
408
+ use_orig_params: false
409
+ seed: 42
410
+ full_determinism: false
411
+ ulysses_sequence_parallel_size: 1
412
+ entropy_from_logits_with_chunking: false
413
+ use_torch_compile: true
414
+ entropy_checkpointing: false
415
+ forward_only: false
416
+ strategy: fsdp
417
+ dtype: bfloat16
418
+ path: ~/models/deepseek-llm-7b-chat
419
+ tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
420
+ override_config: {}
421
+ external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
422
+ trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
423
+ _target_: verl.workers.config.FSDPCriticModelCfg
424
+ use_shm: false
425
+ enable_gradient_checkpointing: true
426
+ enable_activation_offload: false
427
+ use_remove_padding: false
428
+ lora_rank: 0
429
+ lora_alpha: 16
430
+ target_modules: all-linear
431
+ tiled_mlp:
432
+ enabled: false
433
+ num_shards: 4
434
+ _target_: verl.workers.config.FSDPCriticConfig
435
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
436
+ strategy: fsdp
437
+ enable: null
438
+ ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
439
+ ppo_micro_batch_size: null
440
+ ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
441
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
442
+ ppo_max_token_len_per_gpu: 32768
443
+ forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
444
+ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
445
+ shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
446
+ data_loader_seed: 42
447
+ cliprange_value: 0.5
448
+ loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
449
+ checkpoint:
450
+ _target_: verl.trainer.config.CheckpointConfig
451
+ save_contents:
452
+ - model
453
+ - optimizer
454
+ - extra
455
+ load_contents: ${.save_contents}
456
+ async_save: false
457
+ profiler:
458
+ _target_: verl.utils.profiler.ProfilerConfig
459
+ tool: ${oc.select:global_profiler.tool,null}
460
+ enable: false
461
+ all_ranks: false
462
+ ranks: []
463
+ save_path: ${oc.select:global_profiler.save_path,null}
464
+ tool_config:
465
+ nsys:
466
+ _target_: verl.utils.profiler.config.NsightToolConfig
467
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
468
+ npu:
469
+ _target_: verl.utils.profiler.config.NPUToolConfig
470
+ contents: []
471
+ level: level0
472
+ analysis: true
473
+ discrete: false
474
+ torch:
475
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
476
+ contents: []
477
+ discrete: false
478
+ torch_memory:
479
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
480
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
481
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
482
+ forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
483
+ forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
484
+ ulysses_sequence_parallel_size: 1
485
+ grad_clip: 1.0
486
+ reward_model:
487
+ enable: false
488
+ enable_resource_pool: false
489
+ n_gpus_per_node: 8
490
+ nnodes: 0
491
+ strategy: fsdp
492
+ model:
493
+ input_tokenizer: ${actor_rollout_ref.model.path}
494
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
495
+ external_lib: ${actor_rollout_ref.model.external_lib}
496
+ trust_remote_code: false
497
+ override_config: {}
498
+ use_shm: false
499
+ use_remove_padding: false
500
+ use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
501
+ fsdp_config:
502
+ _target_: verl.workers.config.FSDPEngineConfig
503
+ wrap_policy:
504
+ min_num_params: 0
505
+ param_offload: false
506
+ reshard_after_forward: true
507
+ fsdp_size: -1
508
+ forward_prefetch: false
509
+ micro_batch_size: null
510
+ micro_batch_size_per_gpu: null
511
+ max_length: null
512
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
513
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
514
+ reward_manager: naive
515
+ reward_loop_source: register
516
+ reward_loop_module_path: null
517
+ reward_loop_class_name: null
518
+ launch_reward_fn_async: false
519
+ sandbox_fusion:
520
+ url: null
521
+ max_concurrent: 64
522
+ memory_limit_mb: 1024
523
+ profiler:
524
+ _target_: verl.utils.profiler.ProfilerConfig
525
+ tool: ${oc.select:global_profiler.tool,null}
526
+ enable: false
527
+ all_ranks: false
528
+ ranks: []
529
+ save_path: ${oc.select:global_profiler.save_path,null}
530
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
531
+ ulysses_sequence_parallel_size: 1
532
+ use_reward_loop: true
533
+ num_workers: 1
534
+ rollout:
535
+ _target_: verl.workers.config.RolloutConfig
536
+ name: ???
537
+ dtype: bfloat16
538
+ gpu_memory_utilization: 0.5
539
+ enforce_eager: true
540
+ cudagraph_capture_sizes: null
541
+ free_cache_engine: true
542
+ data_parallel_size: 1
543
+ expert_parallel_size: 1
544
+ tensor_model_parallel_size: 2
545
+ max_num_batched_tokens: 8192
546
+ max_model_len: null
547
+ max_num_seqs: 1024
548
+ load_format: auto
549
+ engine_kwargs: {}
550
+ limit_images: null
551
+ enable_chunked_prefill: true
552
+ enable_prefix_caching: true
553
+ disable_log_stats: true
554
+ skip_tokenizer_init: false
555
+ prompt_length: 2048
556
+ response_length: 2048
557
+ algorithm:
558
+ rollout_correction:
559
+ rollout_is: null
560
+ rollout_is_threshold: 2.0
561
+ rollout_rs: null
562
+ rollout_rs_threshold: null
563
+ bypass_mode: false
564
+ loss_type: ppo_clip
565
+ rollout_is_batch_normalize: false
566
+ _target_: verl.trainer.config.AlgoConfig
567
+ gamma: 1.0
568
+ lam: 1.0
569
+ adv_estimator: grpo
570
+ norm_adv_by_std_in_grpo: true
571
+ use_kl_in_reward: false
572
+ kl_penalty: kl
573
+ kl_ctrl:
574
+ _target_: verl.trainer.config.KLControlConfig
575
+ type: fixed
576
+ kl_coef: 0.001
577
+ horizon: 10000
578
+ target_kl: 0.1
579
+ use_pf_ppo: false
580
+ pf_ppo:
581
+ reweight_method: pow
582
+ weight_pow: 2.0
583
+ custom_reward_function:
584
+ path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
585
+ name: compute_score
586
+ trainer:
587
+ balance_batch: true
588
+ total_epochs: 15
589
+ total_training_steps: null
590
+ project_name: readctrl-verl
591
+ experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs
592
+ logger:
593
+ - console
594
+ - wandb
595
+ log_val_generations: 0
596
+ rollout_data_dir: null
597
+ validation_data_dir: null
598
+ nnodes: 1
599
+ n_gpus_per_node: 2
600
+ save_freq: 20
601
+ esi_redundant_time: 0
602
+ resume_mode: auto
603
+ resume_from_path: null
604
+ val_before_train: true
605
+ val_only: false
606
+ test_freq: 5
607
+ critic_warmup: 0
608
+ default_hdfs_dir: null
609
+ del_local_ckpt_after_load: false
610
+ default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
611
+ max_actor_ckpt_to_keep: null
612
+ max_critic_ckpt_to_keep: null
613
+ ray_wait_register_center_timeout: 300
614
+ device: cuda
615
+ use_legacy_worker_impl: auto
616
+ global_profiler:
617
+ _target_: verl.utils.profiler.ProfilerConfig
618
+ tool: null
619
+ steps: null
620
+ profile_continuous_steps: false
621
+ save_path: outputs/profile
622
+ global_tool_config:
623
+ nsys:
624
+ _target_: verl.utils.profiler.config.NsightToolConfig
625
+ discrete: false
626
+ controller_nsight_options:
627
+ trace: cuda,nvtx,cublas,ucx
628
+ cuda-memory-usage: 'true'
629
+ cuda-graph-trace: graph
630
+ worker_nsight_options:
631
+ trace: cuda,nvtx,cublas,ucx
632
+ cuda-memory-usage: 'true'
633
+ cuda-graph-trace: graph
634
+ capture-range: cudaProfilerApi
635
+ capture-range-end: null
636
+ kill: none
637
+ torch_memory:
638
+ trace_alloc_max_entries: 100000
639
+ stack_depth: 32
640
+ context: all
641
+ stacks: all
642
+ kw_args: {}
643
+ transfer_queue:
644
+ enable: false
645
+ ray_kwargs:
646
+ ray_init:
647
+ num_cpus: null
648
+ timeline_json_file: null
code/RL_model/verl/verl_train/outputs/2026-02-02/00-28-20/.hydra/hydra.yaml ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task:
115
+ - algorithm.adv_estimator=grpo
116
+ - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
117
+ - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
118
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
119
+ - data.train_batch_size=512
120
+ - data.max_prompt_length=512
121
+ - data.max_response_length=768
122
+ - data.filter_overlong_prompts=True
123
+ - data.truncation=error
124
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
125
+ - actor_rollout_ref.actor.optim.lr=1e-6
126
+ - actor_rollout_ref.model.use_remove_padding=True
127
+ - actor_rollout_ref.actor.ppo_mini_batch_size=256
128
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32
129
+ - actor_rollout_ref.actor.use_kl_loss=True
130
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
131
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
132
+ - actor_rollout_ref.actor.entropy_coeff=0
133
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
134
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
135
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
136
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
137
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
138
+ - actor_rollout_ref.rollout.name=vllm
139
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.6
140
+ - actor_rollout_ref.rollout.max_model_len=8192
141
+ - actor_rollout_ref.rollout.n=3
142
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
143
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
144
+ - algorithm.use_kl_in_reward=False
145
+ - trainer.critic_warmup=0
146
+ - trainer.logger=["console","wandb"]
147
+ - trainer.project_name=readctrl-verl
148
+ - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
149
+ - trainer.n_gpus_per_node=2
150
+ - trainer.nnodes=1
151
+ - trainer.save_freq=20
152
+ - trainer.test_freq=5
153
+ - trainer.total_epochs=15
154
+ job:
155
+ name: main_ppo
156
+ chdir: null
157
+ override_dirname: actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=512,data.max_response_length=768,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15
158
+ id: ???
159
+ num: ???
160
+ config_name: ppo_trainer
161
+ env_set: {}
162
+ env_copy: []
163
+ config:
164
+ override_dirname:
165
+ kv_sep: '='
166
+ item_sep: ','
167
+ exclude_keys: []
168
+ runtime:
169
+ version: 1.3.2
170
+ version_base: '1.3'
171
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train
172
+ config_sources:
173
+ - path: hydra.conf
174
+ schema: pkg
175
+ provider: hydra
176
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config
177
+ schema: file
178
+ provider: main
179
+ - path: ''
180
+ schema: structured
181
+ provider: schema
182
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-02/00-28-20
183
+ choices:
184
+ algorithm@algorithm.rollout_correction: rollout_correction
185
+ reward_model: dp_reward_loop
186
+ critic: dp_critic
187
+ critic/../engine@critic.model.fsdp_config: fsdp
188
+ critic/../optim@critic.optim: fsdp
189
+ model@actor_rollout_ref.model: hf_model
190
+ rollout@actor_rollout_ref.rollout: rollout
191
+ ref@actor_rollout_ref.ref: dp_ref
192
+ ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp
193
+ data: legacy_data
194
+ actor@actor_rollout_ref.actor: dp_actor
195
+ actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp
196
+ actor/../optim@actor_rollout_ref.actor.optim: fsdp
197
+ hydra/env: default
198
+ hydra/callbacks: null
199
+ hydra/job_logging: default
200
+ hydra/hydra_logging: default
201
+ hydra/hydra_help: default
202
+ hydra/help: default
203
+ hydra/sweeper: basic
204
+ hydra/launcher: basic
205
+ hydra/output: default
206
+ verbose: false
code/RL_model/verl/verl_train/outputs/2026-02-02/00-28-20/.hydra/overrides.yaml ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - algorithm.adv_estimator=grpo
2
+ - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
3
+ - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
4
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
5
+ - data.train_batch_size=512
6
+ - data.max_prompt_length=512
7
+ - data.max_response_length=768
8
+ - data.filter_overlong_prompts=True
9
+ - data.truncation=error
10
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
11
+ - actor_rollout_ref.actor.optim.lr=1e-6
12
+ - actor_rollout_ref.model.use_remove_padding=True
13
+ - actor_rollout_ref.actor.ppo_mini_batch_size=256
14
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32
15
+ - actor_rollout_ref.actor.use_kl_loss=True
16
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
17
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
18
+ - actor_rollout_ref.actor.entropy_coeff=0
19
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
20
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
21
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
22
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
23
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
24
+ - actor_rollout_ref.rollout.name=vllm
25
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.6
26
+ - actor_rollout_ref.rollout.max_model_len=8192
27
+ - actor_rollout_ref.rollout.n=3
28
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
29
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
30
+ - algorithm.use_kl_in_reward=False
31
+ - trainer.critic_warmup=0
32
+ - trainer.logger=["console","wandb"]
33
+ - trainer.project_name=readctrl-verl
34
+ - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
35
+ - trainer.n_gpus_per_node=2
36
+ - trainer.nnodes=1
37
+ - trainer.save_freq=20
38
+ - trainer.test_freq=5
39
+ - trainer.total_epochs=15
code/RL_model/verl/verl_train/outputs/2026-02-02/00-41-23/.hydra/config.yaml ADDED
@@ -0,0 +1,648 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ actor_rollout_ref:
2
+ actor:
3
+ optim:
4
+ _target_: verl.workers.config.FSDPOptimizerConfig
5
+ optimizer: AdamW
6
+ optimizer_impl: torch.optim
7
+ lr: 1.0e-06
8
+ lr_warmup_steps_ratio: 0.0
9
+ total_training_steps: -1
10
+ weight_decay: 0.01
11
+ lr_warmup_steps: -1
12
+ betas:
13
+ - 0.9
14
+ - 0.999
15
+ clip_grad: 1.0
16
+ min_lr_ratio: 0.0
17
+ num_cycles: 0.5
18
+ lr_scheduler_type: constant
19
+ warmup_style: null
20
+ override_optimizer_config: null
21
+ fsdp_config:
22
+ _target_: verl.workers.config.FSDPEngineConfig
23
+ wrap_policy:
24
+ min_num_params: 0
25
+ param_offload: false
26
+ optimizer_offload: false
27
+ offload_policy: false
28
+ reshard_after_forward: true
29
+ fsdp_size: -1
30
+ forward_prefetch: false
31
+ model_dtype: fp32
32
+ use_orig_params: false
33
+ seed: 42
34
+ full_determinism: false
35
+ ulysses_sequence_parallel_size: 1
36
+ entropy_from_logits_with_chunking: false
37
+ use_torch_compile: true
38
+ entropy_checkpointing: false
39
+ forward_only: false
40
+ strategy: fsdp
41
+ dtype: bfloat16
42
+ _target_: verl.workers.config.FSDPActorConfig
43
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
44
+ strategy: fsdp
45
+ ppo_mini_batch_size: 256
46
+ ppo_micro_batch_size: null
47
+ ppo_micro_batch_size_per_gpu: 32
48
+ use_dynamic_bsz: false
49
+ ppo_max_token_len_per_gpu: 16384
50
+ clip_ratio: 0.2
51
+ clip_ratio_low: 0.2
52
+ clip_ratio_high: 0.2
53
+ tau_pos: 1.0
54
+ tau_neg: 1.05
55
+ freeze_vision_tower: false
56
+ policy_loss:
57
+ _target_: verl.workers.config.PolicyLossConfig
58
+ loss_mode: vanilla
59
+ clip_cov_ratio: 0.0002
60
+ clip_cov_lb: 1.0
61
+ clip_cov_ub: 5.0
62
+ kl_cov_ratio: 0.0002
63
+ ppo_kl_coef: 0.1
64
+ clip_ratio_c: 3.0
65
+ loss_agg_mode: token-mean
66
+ loss_scale_factor: null
67
+ entropy_coeff: 0
68
+ calculate_entropy: false
69
+ use_kl_loss: true
70
+ use_prefix_grouper: false
71
+ use_torch_compile: true
72
+ kl_loss_coef: 0.001
73
+ kl_loss_type: low_var_kl
74
+ ppo_epochs: 1
75
+ shuffle: false
76
+ data_loader_seed: 42
77
+ checkpoint:
78
+ _target_: verl.trainer.config.CheckpointConfig
79
+ save_contents:
80
+ - model
81
+ - optimizer
82
+ - extra
83
+ load_contents: ${.save_contents}
84
+ async_save: false
85
+ use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
86
+ profiler:
87
+ _target_: verl.utils.profiler.ProfilerConfig
88
+ tool: ${oc.select:global_profiler.tool,null}
89
+ enable: false
90
+ all_ranks: false
91
+ ranks: []
92
+ save_path: ${oc.select:global_profiler.save_path,null}
93
+ tool_config:
94
+ nsys:
95
+ _target_: verl.utils.profiler.config.NsightToolConfig
96
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
97
+ npu:
98
+ _target_: verl.utils.profiler.config.NPUToolConfig
99
+ contents: []
100
+ level: level0
101
+ analysis: true
102
+ discrete: false
103
+ torch:
104
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
105
+ contents: []
106
+ discrete: false
107
+ torch_memory:
108
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
109
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
110
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
111
+ router_replay:
112
+ _target_: verl.workers.config.RouterReplayConfig
113
+ mode: disabled
114
+ record_file: null
115
+ replay_file: null
116
+ grad_clip: 1.0
117
+ ulysses_sequence_parallel_size: 1
118
+ entropy_from_logits_with_chunking: false
119
+ entropy_checkpointing: false
120
+ use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
121
+ calculate_sum_pi_squared: false
122
+ sum_pi_squared_checkpointing: false
123
+ ref:
124
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
125
+ strategy: ${actor_rollout_ref.actor.strategy}
126
+ use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
127
+ log_prob_micro_batch_size: null
128
+ log_prob_micro_batch_size_per_gpu: 32
129
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
130
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
131
+ profiler:
132
+ _target_: verl.utils.profiler.ProfilerConfig
133
+ tool: ${oc.select:global_profiler.tool,null}
134
+ enable: false
135
+ all_ranks: false
136
+ ranks: []
137
+ save_path: ${oc.select:global_profiler.save_path,null}
138
+ tool_config:
139
+ nsys:
140
+ _target_: verl.utils.profiler.config.NsightToolConfig
141
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
142
+ npu:
143
+ _target_: verl.utils.profiler.config.NPUToolConfig
144
+ contents: []
145
+ level: level0
146
+ analysis: true
147
+ discrete: false
148
+ torch:
149
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
150
+ contents: []
151
+ discrete: false
152
+ torch_memory:
153
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
154
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
155
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
156
+ router_replay:
157
+ _target_: verl.workers.config.RouterReplayConfig
158
+ mode: disabled
159
+ record_file: null
160
+ replay_file: null
161
+ fsdp_config:
162
+ _target_: verl.workers.config.FSDPEngineConfig
163
+ wrap_policy:
164
+ min_num_params: 0
165
+ param_offload: false
166
+ optimizer_offload: false
167
+ offload_policy: false
168
+ reshard_after_forward: true
169
+ fsdp_size: -1
170
+ forward_prefetch: false
171
+ model_dtype: fp32
172
+ use_orig_params: false
173
+ seed: 42
174
+ full_determinism: false
175
+ ulysses_sequence_parallel_size: 1
176
+ entropy_from_logits_with_chunking: false
177
+ use_torch_compile: true
178
+ entropy_checkpointing: false
179
+ forward_only: true
180
+ strategy: fsdp
181
+ dtype: bfloat16
182
+ _target_: verl.workers.config.FSDPActorConfig
183
+ ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
184
+ entropy_from_logits_with_chunking: false
185
+ entropy_checkpointing: false
186
+ rollout:
187
+ _target_: verl.workers.config.RolloutConfig
188
+ name: vllm
189
+ mode: async
190
+ temperature: 1.0
191
+ top_k: -1
192
+ top_p: 1
193
+ prompt_length: ${oc.select:data.max_prompt_length,512}
194
+ response_length: ${oc.select:data.max_response_length,512}
195
+ dtype: bfloat16
196
+ gpu_memory_utilization: 0.6
197
+ ignore_eos: false
198
+ enforce_eager: false
199
+ cudagraph_capture_sizes: null
200
+ free_cache_engine: true
201
+ tensor_model_parallel_size: 1
202
+ data_parallel_size: 1
203
+ expert_parallel_size: 1
204
+ pipeline_model_parallel_size: 1
205
+ max_num_batched_tokens: 8192
206
+ max_model_len: 8192
207
+ max_num_seqs: 1024
208
+ enable_chunked_prefill: true
209
+ enable_prefix_caching: true
210
+ logprobs_mode: processed_logprobs
211
+ scheduling_policy: fcfs
212
+ load_format: dummy
213
+ log_prob_micro_batch_size: null
214
+ log_prob_micro_batch_size_per_gpu: 32
215
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
216
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
217
+ disable_log_stats: true
218
+ do_sample: true
219
+ 'n': 3
220
+ over_sample_rate: 0
221
+ multi_stage_wake_up: false
222
+ engine_kwargs:
223
+ vllm: {}
224
+ sglang: {}
225
+ trtllm: {}
226
+ val_kwargs:
227
+ _target_: verl.workers.config.SamplingConfig
228
+ top_k: -1
229
+ top_p: 1.0
230
+ temperature: 0
231
+ 'n': 1
232
+ do_sample: false
233
+ multi_turn:
234
+ _target_: verl.workers.config.MultiTurnConfig
235
+ enable: false
236
+ max_assistant_turns: null
237
+ tool_config_path: null
238
+ max_user_turns: null
239
+ max_parallel_calls: 1
240
+ max_tool_response_length: 256
241
+ tool_response_truncate_side: middle
242
+ interaction_config_path: null
243
+ use_inference_chat_template: false
244
+ tokenization_sanity_check_mode: strict
245
+ format: hermes
246
+ num_repeat_rollouts: null
247
+ calculate_log_probs: false
248
+ agent:
249
+ _target_: verl.workers.config.AgentLoopConfig
250
+ num_workers: 8
251
+ default_agent_loop: single_turn_agent
252
+ agent_loop_config_path: null
253
+ custom_async_server:
254
+ _target_: verl.workers.config.CustomAsyncServerConfig
255
+ path: null
256
+ name: null
257
+ checkpoint_engine:
258
+ _target_: verl.workers.config.CheckpointEngineConfig
259
+ backend: naive
260
+ update_weights_bucket_megabytes: 2048
261
+ engine_kwargs: {}
262
+ trace:
263
+ _target_: verl.workers.config.TraceConfig
264
+ backend: null
265
+ token2text: false
266
+ max_samples_per_step_per_worker: null
267
+ skip_rollout: false
268
+ skip_dump_dir: /tmp/rollout_dump
269
+ skip_tokenizer_init: true
270
+ enable_rollout_routing_replay: false
271
+ profiler:
272
+ _target_: verl.utils.profiler.ProfilerConfig
273
+ tool: ${oc.select:global_profiler.tool,null}
274
+ enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
275
+ all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
276
+ ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
277
+ save_path: ${oc.select:global_profiler.save_path,null}
278
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
279
+ prometheus:
280
+ _target_: verl.workers.config.PrometheusConfig
281
+ enable: false
282
+ port: 9090
283
+ file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml
284
+ served_model_name: ${oc.select:actor_rollout_ref.model.path,null}
285
+ quantization: null
286
+ quantization_config_file: null
287
+ mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
288
+ layered_summon: false
289
+ model:
290
+ _target_: verl.workers.config.HFModelConfig
291
+ path: Qwen/Qwen3-4B-Instruct-2507
292
+ hf_config_path: null
293
+ tokenizer_path: null
294
+ use_shm: false
295
+ trust_remote_code: false
296
+ custom_chat_template: null
297
+ external_lib: null
298
+ override_config: {}
299
+ enable_gradient_checkpointing: true
300
+ enable_activation_offload: false
301
+ use_remove_padding: true
302
+ lora_rank: 0
303
+ lora_alpha: 16
304
+ target_modules: all-linear
305
+ exclude_modules: null
306
+ lora_adapter_path: null
307
+ use_liger: false
308
+ use_fused_kernels: false
309
+ fused_kernel_options:
310
+ impl_backend: torch
311
+ tiled_mlp:
312
+ enabled: false
313
+ num_shards: 4
314
+ mtp:
315
+ _target_: verl.workers.config.MtpConfig
316
+ enable: false
317
+ enable_train: false
318
+ enable_rollout: false
319
+ detach_encoder: false
320
+ mtp_loss_scaling_factor: 0.1
321
+ speculative_algorithm: EAGLE
322
+ speculative_num_steps: 3
323
+ speculative_eagle_topk: 1
324
+ speculative_num_draft_tokens: 4
325
+ method: mtp
326
+ num_speculative_tokens: 1
327
+ hybrid_engine: true
328
+ nccl_timeout: 600
329
+ data:
330
+ tokenizer: null
331
+ use_shm: false
332
+ train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
333
+ val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
334
+ train_max_samples: -1
335
+ val_max_samples: -1
336
+ prompt_key: prompt
337
+ reward_fn_key: data_source
338
+ max_prompt_length: 512
339
+ max_response_length: 768
340
+ train_batch_size: 512
341
+ val_batch_size: null
342
+ tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path,
343
+ null}
344
+ return_raw_input_ids: false
345
+ return_raw_chat: true
346
+ return_full_prompt: false
347
+ shuffle: true
348
+ seed: null
349
+ dataloader_num_workers: 8
350
+ image_patch_size: 14
351
+ validation_shuffle: false
352
+ filter_overlong_prompts: true
353
+ filter_overlong_prompts_workers: 1
354
+ truncation: error
355
+ image_key: images
356
+ video_key: videos
357
+ trust_remote_code: false
358
+ custom_cls:
359
+ path: null
360
+ name: null
361
+ return_multi_modal_inputs: true
362
+ sampler:
363
+ class_path: null
364
+ class_name: null
365
+ datagen:
366
+ path: null
367
+ name: null
368
+ apply_chat_template_kwargs: {}
369
+ reward_manager:
370
+ _target_: verl.trainer.config.config.RewardManagerConfig
371
+ source: register
372
+ name: ${oc.select:reward_model.reward_manager,naive}
373
+ module:
374
+ _target_: verl.trainer.config.config.ModuleConfig
375
+ path: null
376
+ name: custom_reward_manager
377
+ critic:
378
+ optim:
379
+ _target_: verl.workers.config.FSDPOptimizerConfig
380
+ optimizer: AdamW
381
+ optimizer_impl: torch.optim
382
+ lr: 1.0e-05
383
+ lr_warmup_steps_ratio: 0.0
384
+ total_training_steps: -1
385
+ weight_decay: 0.01
386
+ lr_warmup_steps: -1
387
+ betas:
388
+ - 0.9
389
+ - 0.999
390
+ clip_grad: 1.0
391
+ min_lr_ratio: 0.0
392
+ num_cycles: 0.5
393
+ lr_scheduler_type: constant
394
+ warmup_style: null
395
+ override_optimizer_config: null
396
+ model:
397
+ fsdp_config:
398
+ _target_: verl.workers.config.FSDPEngineConfig
399
+ wrap_policy:
400
+ min_num_params: 0
401
+ param_offload: false
402
+ optimizer_offload: false
403
+ offload_policy: false
404
+ reshard_after_forward: true
405
+ fsdp_size: -1
406
+ forward_prefetch: false
407
+ model_dtype: fp32
408
+ use_orig_params: false
409
+ seed: 42
410
+ full_determinism: false
411
+ ulysses_sequence_parallel_size: 1
412
+ entropy_from_logits_with_chunking: false
413
+ use_torch_compile: true
414
+ entropy_checkpointing: false
415
+ forward_only: false
416
+ strategy: fsdp
417
+ dtype: bfloat16
418
+ path: ~/models/deepseek-llm-7b-chat
419
+ tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
420
+ override_config: {}
421
+ external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
422
+ trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
423
+ _target_: verl.workers.config.FSDPCriticModelCfg
424
+ use_shm: false
425
+ enable_gradient_checkpointing: true
426
+ enable_activation_offload: false
427
+ use_remove_padding: false
428
+ lora_rank: 0
429
+ lora_alpha: 16
430
+ target_modules: all-linear
431
+ tiled_mlp:
432
+ enabled: false
433
+ num_shards: 4
434
+ _target_: verl.workers.config.FSDPCriticConfig
435
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
436
+ strategy: fsdp
437
+ enable: null
438
+ ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
439
+ ppo_micro_batch_size: null
440
+ ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
441
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
442
+ ppo_max_token_len_per_gpu: 32768
443
+ forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
444
+ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
445
+ shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
446
+ data_loader_seed: 42
447
+ cliprange_value: 0.5
448
+ loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
449
+ checkpoint:
450
+ _target_: verl.trainer.config.CheckpointConfig
451
+ save_contents:
452
+ - model
453
+ - optimizer
454
+ - extra
455
+ load_contents: ${.save_contents}
456
+ async_save: false
457
+ profiler:
458
+ _target_: verl.utils.profiler.ProfilerConfig
459
+ tool: ${oc.select:global_profiler.tool,null}
460
+ enable: false
461
+ all_ranks: false
462
+ ranks: []
463
+ save_path: ${oc.select:global_profiler.save_path,null}
464
+ tool_config:
465
+ nsys:
466
+ _target_: verl.utils.profiler.config.NsightToolConfig
467
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
468
+ npu:
469
+ _target_: verl.utils.profiler.config.NPUToolConfig
470
+ contents: []
471
+ level: level0
472
+ analysis: true
473
+ discrete: false
474
+ torch:
475
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
476
+ contents: []
477
+ discrete: false
478
+ torch_memory:
479
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
480
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
481
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
482
+ forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
483
+ forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
484
+ ulysses_sequence_parallel_size: 1
485
+ grad_clip: 1.0
486
+ reward_model:
487
+ enable: false
488
+ enable_resource_pool: false
489
+ n_gpus_per_node: 8
490
+ nnodes: 0
491
+ strategy: fsdp
492
+ model:
493
+ input_tokenizer: ${actor_rollout_ref.model.path}
494
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
495
+ external_lib: ${actor_rollout_ref.model.external_lib}
496
+ trust_remote_code: false
497
+ override_config: {}
498
+ use_shm: false
499
+ use_remove_padding: false
500
+ use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
501
+ fsdp_config:
502
+ _target_: verl.workers.config.FSDPEngineConfig
503
+ wrap_policy:
504
+ min_num_params: 0
505
+ param_offload: false
506
+ reshard_after_forward: true
507
+ fsdp_size: -1
508
+ forward_prefetch: false
509
+ micro_batch_size: null
510
+ micro_batch_size_per_gpu: null
511
+ max_length: null
512
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
513
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
514
+ reward_manager: naive
515
+ reward_loop_source: register
516
+ reward_loop_module_path: null
517
+ reward_loop_class_name: null
518
+ launch_reward_fn_async: false
519
+ sandbox_fusion:
520
+ url: null
521
+ max_concurrent: 64
522
+ memory_limit_mb: 1024
523
+ profiler:
524
+ _target_: verl.utils.profiler.ProfilerConfig
525
+ tool: ${oc.select:global_profiler.tool,null}
526
+ enable: false
527
+ all_ranks: false
528
+ ranks: []
529
+ save_path: ${oc.select:global_profiler.save_path,null}
530
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
531
+ ulysses_sequence_parallel_size: 1
532
+ use_reward_loop: true
533
+ num_workers: 1
534
+ rollout:
535
+ _target_: verl.workers.config.RolloutConfig
536
+ name: ???
537
+ dtype: bfloat16
538
+ gpu_memory_utilization: 0.5
539
+ enforce_eager: true
540
+ cudagraph_capture_sizes: null
541
+ free_cache_engine: true
542
+ data_parallel_size: 1
543
+ expert_parallel_size: 1
544
+ tensor_model_parallel_size: 2
545
+ max_num_batched_tokens: 8192
546
+ max_model_len: null
547
+ max_num_seqs: 1024
548
+ load_format: auto
549
+ engine_kwargs: {}
550
+ limit_images: null
551
+ enable_chunked_prefill: true
552
+ enable_prefix_caching: true
553
+ disable_log_stats: true
554
+ skip_tokenizer_init: false
555
+ prompt_length: 2048
556
+ response_length: 2048
557
+ algorithm:
558
+ rollout_correction:
559
+ rollout_is: null
560
+ rollout_is_threshold: 2.0
561
+ rollout_rs: null
562
+ rollout_rs_threshold: null
563
+ bypass_mode: false
564
+ loss_type: ppo_clip
565
+ rollout_is_batch_normalize: false
566
+ _target_: verl.trainer.config.AlgoConfig
567
+ gamma: 1.0
568
+ lam: 1.0
569
+ adv_estimator: grpo
570
+ norm_adv_by_std_in_grpo: true
571
+ use_kl_in_reward: false
572
+ kl_penalty: kl
573
+ kl_ctrl:
574
+ _target_: verl.trainer.config.KLControlConfig
575
+ type: fixed
576
+ kl_coef: 0.001
577
+ horizon: 10000
578
+ target_kl: 0.1
579
+ use_pf_ppo: false
580
+ pf_ppo:
581
+ reweight_method: pow
582
+ weight_pow: 2.0
583
+ custom_reward_function:
584
+ path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
585
+ name: compute_score
586
+ trainer:
587
+ balance_batch: true
588
+ total_epochs: 15
589
+ total_training_steps: null
590
+ project_name: readctrl-verl
591
+ experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs
592
+ logger:
593
+ - console
594
+ - wandb
595
+ log_val_generations: 0
596
+ rollout_data_dir: null
597
+ validation_data_dir: null
598
+ nnodes: 1
599
+ n_gpus_per_node: 2
600
+ save_freq: 20
601
+ esi_redundant_time: 0
602
+ resume_mode: auto
603
+ resume_from_path: null
604
+ val_before_train: true
605
+ val_only: false
606
+ test_freq: 5
607
+ critic_warmup: 0
608
+ default_hdfs_dir: null
609
+ del_local_ckpt_after_load: false
610
+ default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
611
+ max_actor_ckpt_to_keep: null
612
+ max_critic_ckpt_to_keep: null
613
+ ray_wait_register_center_timeout: 300
614
+ device: cuda
615
+ use_legacy_worker_impl: auto
616
+ global_profiler:
617
+ _target_: verl.utils.profiler.ProfilerConfig
618
+ tool: null
619
+ steps: null
620
+ profile_continuous_steps: false
621
+ save_path: outputs/profile
622
+ global_tool_config:
623
+ nsys:
624
+ _target_: verl.utils.profiler.config.NsightToolConfig
625
+ discrete: false
626
+ controller_nsight_options:
627
+ trace: cuda,nvtx,cublas,ucx
628
+ cuda-memory-usage: 'true'
629
+ cuda-graph-trace: graph
630
+ worker_nsight_options:
631
+ trace: cuda,nvtx,cublas,ucx
632
+ cuda-memory-usage: 'true'
633
+ cuda-graph-trace: graph
634
+ capture-range: cudaProfilerApi
635
+ capture-range-end: null
636
+ kill: none
637
+ torch_memory:
638
+ trace_alloc_max_entries: 100000
639
+ stack_depth: 32
640
+ context: all
641
+ stacks: all
642
+ kw_args: {}
643
+ transfer_queue:
644
+ enable: false
645
+ ray_kwargs:
646
+ ray_init:
647
+ num_cpus: null
648
+ timeline_json_file: null
code/RL_model/verl/verl_train/outputs/2026-02-02/00-41-23/.hydra/hydra.yaml ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task:
115
+ - algorithm.adv_estimator=grpo
116
+ - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
117
+ - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
118
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
119
+ - data.train_batch_size=512
120
+ - data.max_prompt_length=512
121
+ - data.max_response_length=768
122
+ - data.filter_overlong_prompts=True
123
+ - data.truncation=error
124
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
125
+ - actor_rollout_ref.actor.optim.lr=1e-6
126
+ - actor_rollout_ref.model.use_remove_padding=True
127
+ - actor_rollout_ref.actor.ppo_mini_batch_size=256
128
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32
129
+ - actor_rollout_ref.actor.use_kl_loss=True
130
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
131
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
132
+ - actor_rollout_ref.actor.entropy_coeff=0
133
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
134
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
135
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
136
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
137
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
138
+ - actor_rollout_ref.rollout.name=vllm
139
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.6
140
+ - actor_rollout_ref.rollout.max_model_len=8192
141
+ - actor_rollout_ref.rollout.n=3
142
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
143
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
144
+ - algorithm.use_kl_in_reward=False
145
+ - trainer.critic_warmup=0
146
+ - trainer.logger=["console","wandb"]
147
+ - trainer.project_name=readctrl-verl
148
+ - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
149
+ - trainer.n_gpus_per_node=2
150
+ - trainer.nnodes=1
151
+ - trainer.save_freq=20
152
+ - trainer.test_freq=5
153
+ - trainer.total_epochs=15
154
+ job:
155
+ name: main_ppo
156
+ chdir: null
157
+ override_dirname: actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=512,data.max_response_length=768,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15
158
+ id: ???
159
+ num: ???
160
+ config_name: ppo_trainer
161
+ env_set: {}
162
+ env_copy: []
163
+ config:
164
+ override_dirname:
165
+ kv_sep: '='
166
+ item_sep: ','
167
+ exclude_keys: []
168
+ runtime:
169
+ version: 1.3.2
170
+ version_base: '1.3'
171
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train
172
+ config_sources:
173
+ - path: hydra.conf
174
+ schema: pkg
175
+ provider: hydra
176
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config
177
+ schema: file
178
+ provider: main
179
+ - path: ''
180
+ schema: structured
181
+ provider: schema
182
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-02/00-41-23
183
+ choices:
184
+ algorithm@algorithm.rollout_correction: rollout_correction
185
+ reward_model: dp_reward_loop
186
+ critic: dp_critic
187
+ critic/../engine@critic.model.fsdp_config: fsdp
188
+ critic/../optim@critic.optim: fsdp
189
+ model@actor_rollout_ref.model: hf_model
190
+ rollout@actor_rollout_ref.rollout: rollout
191
+ ref@actor_rollout_ref.ref: dp_ref
192
+ ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp
193
+ data: legacy_data
194
+ actor@actor_rollout_ref.actor: dp_actor
195
+ actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp
196
+ actor/../optim@actor_rollout_ref.actor.optim: fsdp
197
+ hydra/env: default
198
+ hydra/callbacks: null
199
+ hydra/job_logging: default
200
+ hydra/hydra_logging: default
201
+ hydra/hydra_help: default
202
+ hydra/help: default
203
+ hydra/sweeper: basic
204
+ hydra/launcher: basic
205
+ hydra/output: default
206
+ verbose: false
code/RL_model/verl/verl_train/outputs/2026-02-02/00-41-23/.hydra/overrides.yaml ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - algorithm.adv_estimator=grpo
2
+ - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
3
+ - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
4
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
5
+ - data.train_batch_size=512
6
+ - data.max_prompt_length=512
7
+ - data.max_response_length=768
8
+ - data.filter_overlong_prompts=True
9
+ - data.truncation=error
10
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
11
+ - actor_rollout_ref.actor.optim.lr=1e-6
12
+ - actor_rollout_ref.model.use_remove_padding=True
13
+ - actor_rollout_ref.actor.ppo_mini_batch_size=256
14
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32
15
+ - actor_rollout_ref.actor.use_kl_loss=True
16
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
17
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
18
+ - actor_rollout_ref.actor.entropy_coeff=0
19
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
20
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
21
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
22
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
23
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
24
+ - actor_rollout_ref.rollout.name=vllm
25
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.6
26
+ - actor_rollout_ref.rollout.max_model_len=8192
27
+ - actor_rollout_ref.rollout.n=3
28
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
29
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
30
+ - algorithm.use_kl_in_reward=False
31
+ - trainer.critic_warmup=0
32
+ - trainer.logger=["console","wandb"]
33
+ - trainer.project_name=readctrl-verl
34
+ - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
35
+ - trainer.n_gpus_per_node=2
36
+ - trainer.nnodes=1
37
+ - trainer.save_freq=20
38
+ - trainer.test_freq=5
39
+ - trainer.total_epochs=15
code/RL_model/verl/verl_train/outputs/2026-02-02/00-55-56/.hydra/config.yaml ADDED
@@ -0,0 +1,648 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ actor_rollout_ref:
2
+ actor:
3
+ optim:
4
+ _target_: verl.workers.config.FSDPOptimizerConfig
5
+ optimizer: AdamW
6
+ optimizer_impl: torch.optim
7
+ lr: 1.0e-06
8
+ lr_warmup_steps_ratio: 0.0
9
+ total_training_steps: -1
10
+ weight_decay: 0.01
11
+ lr_warmup_steps: -1
12
+ betas:
13
+ - 0.9
14
+ - 0.999
15
+ clip_grad: 1.0
16
+ min_lr_ratio: 0.0
17
+ num_cycles: 0.5
18
+ lr_scheduler_type: constant
19
+ warmup_style: null
20
+ override_optimizer_config: null
21
+ fsdp_config:
22
+ _target_: verl.workers.config.FSDPEngineConfig
23
+ wrap_policy:
24
+ min_num_params: 0
25
+ param_offload: false
26
+ optimizer_offload: false
27
+ offload_policy: false
28
+ reshard_after_forward: true
29
+ fsdp_size: -1
30
+ forward_prefetch: false
31
+ model_dtype: fp32
32
+ use_orig_params: false
33
+ seed: 42
34
+ full_determinism: false
35
+ ulysses_sequence_parallel_size: 1
36
+ entropy_from_logits_with_chunking: false
37
+ use_torch_compile: true
38
+ entropy_checkpointing: false
39
+ forward_only: false
40
+ strategy: fsdp
41
+ dtype: bfloat16
42
+ _target_: verl.workers.config.FSDPActorConfig
43
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
44
+ strategy: fsdp
45
+ ppo_mini_batch_size: 256
46
+ ppo_micro_batch_size: null
47
+ ppo_micro_batch_size_per_gpu: 32
48
+ use_dynamic_bsz: false
49
+ ppo_max_token_len_per_gpu: 16384
50
+ clip_ratio: 0.2
51
+ clip_ratio_low: 0.2
52
+ clip_ratio_high: 0.2
53
+ tau_pos: 1.0
54
+ tau_neg: 1.05
55
+ freeze_vision_tower: false
56
+ policy_loss:
57
+ _target_: verl.workers.config.PolicyLossConfig
58
+ loss_mode: vanilla
59
+ clip_cov_ratio: 0.0002
60
+ clip_cov_lb: 1.0
61
+ clip_cov_ub: 5.0
62
+ kl_cov_ratio: 0.0002
63
+ ppo_kl_coef: 0.1
64
+ clip_ratio_c: 3.0
65
+ loss_agg_mode: token-mean
66
+ loss_scale_factor: null
67
+ entropy_coeff: 0
68
+ calculate_entropy: false
69
+ use_kl_loss: true
70
+ use_prefix_grouper: false
71
+ use_torch_compile: true
72
+ kl_loss_coef: 0.001
73
+ kl_loss_type: low_var_kl
74
+ ppo_epochs: 1
75
+ shuffle: false
76
+ data_loader_seed: 42
77
+ checkpoint:
78
+ _target_: verl.trainer.config.CheckpointConfig
79
+ save_contents:
80
+ - model
81
+ - optimizer
82
+ - extra
83
+ load_contents: ${.save_contents}
84
+ async_save: false
85
+ use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
86
+ profiler:
87
+ _target_: verl.utils.profiler.ProfilerConfig
88
+ tool: ${oc.select:global_profiler.tool,null}
89
+ enable: false
90
+ all_ranks: false
91
+ ranks: []
92
+ save_path: ${oc.select:global_profiler.save_path,null}
93
+ tool_config:
94
+ nsys:
95
+ _target_: verl.utils.profiler.config.NsightToolConfig
96
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
97
+ npu:
98
+ _target_: verl.utils.profiler.config.NPUToolConfig
99
+ contents: []
100
+ level: level0
101
+ analysis: true
102
+ discrete: false
103
+ torch:
104
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
105
+ contents: []
106
+ discrete: false
107
+ torch_memory:
108
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
109
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
110
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
111
+ router_replay:
112
+ _target_: verl.workers.config.RouterReplayConfig
113
+ mode: disabled
114
+ record_file: null
115
+ replay_file: null
116
+ grad_clip: 1.0
117
+ ulysses_sequence_parallel_size: 1
118
+ entropy_from_logits_with_chunking: false
119
+ entropy_checkpointing: false
120
+ use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
121
+ calculate_sum_pi_squared: false
122
+ sum_pi_squared_checkpointing: false
123
+ ref:
124
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
125
+ strategy: ${actor_rollout_ref.actor.strategy}
126
+ use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
127
+ log_prob_micro_batch_size: null
128
+ log_prob_micro_batch_size_per_gpu: 32
129
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
130
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
131
+ profiler:
132
+ _target_: verl.utils.profiler.ProfilerConfig
133
+ tool: ${oc.select:global_profiler.tool,null}
134
+ enable: false
135
+ all_ranks: false
136
+ ranks: []
137
+ save_path: ${oc.select:global_profiler.save_path,null}
138
+ tool_config:
139
+ nsys:
140
+ _target_: verl.utils.profiler.config.NsightToolConfig
141
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
142
+ npu:
143
+ _target_: verl.utils.profiler.config.NPUToolConfig
144
+ contents: []
145
+ level: level0
146
+ analysis: true
147
+ discrete: false
148
+ torch:
149
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
150
+ contents: []
151
+ discrete: false
152
+ torch_memory:
153
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
154
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
155
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
156
+ router_replay:
157
+ _target_: verl.workers.config.RouterReplayConfig
158
+ mode: disabled
159
+ record_file: null
160
+ replay_file: null
161
+ fsdp_config:
162
+ _target_: verl.workers.config.FSDPEngineConfig
163
+ wrap_policy:
164
+ min_num_params: 0
165
+ param_offload: false
166
+ optimizer_offload: false
167
+ offload_policy: false
168
+ reshard_after_forward: true
169
+ fsdp_size: -1
170
+ forward_prefetch: false
171
+ model_dtype: fp32
172
+ use_orig_params: false
173
+ seed: 42
174
+ full_determinism: false
175
+ ulysses_sequence_parallel_size: 1
176
+ entropy_from_logits_with_chunking: false
177
+ use_torch_compile: true
178
+ entropy_checkpointing: false
179
+ forward_only: true
180
+ strategy: fsdp
181
+ dtype: bfloat16
182
+ _target_: verl.workers.config.FSDPActorConfig
183
+ ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
184
+ entropy_from_logits_with_chunking: false
185
+ entropy_checkpointing: false
186
+ rollout:
187
+ _target_: verl.workers.config.RolloutConfig
188
+ name: vllm
189
+ mode: async
190
+ temperature: 1.0
191
+ top_k: -1
192
+ top_p: 1
193
+ prompt_length: ${oc.select:data.max_prompt_length,512}
194
+ response_length: ${oc.select:data.max_response_length,512}
195
+ dtype: bfloat16
196
+ gpu_memory_utilization: 0.6
197
+ ignore_eos: false
198
+ enforce_eager: false
199
+ cudagraph_capture_sizes: null
200
+ free_cache_engine: true
201
+ tensor_model_parallel_size: 1
202
+ data_parallel_size: 1
203
+ expert_parallel_size: 1
204
+ pipeline_model_parallel_size: 1
205
+ max_num_batched_tokens: 8192
206
+ max_model_len: 8192
207
+ max_num_seqs: 1024
208
+ enable_chunked_prefill: true
209
+ enable_prefix_caching: true
210
+ logprobs_mode: processed_logprobs
211
+ scheduling_policy: fcfs
212
+ load_format: dummy
213
+ log_prob_micro_batch_size: null
214
+ log_prob_micro_batch_size_per_gpu: 32
215
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
216
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
217
+ disable_log_stats: true
218
+ do_sample: true
219
+ 'n': 3
220
+ over_sample_rate: 0
221
+ multi_stage_wake_up: false
222
+ engine_kwargs:
223
+ vllm: {}
224
+ sglang: {}
225
+ trtllm: {}
226
+ val_kwargs:
227
+ _target_: verl.workers.config.SamplingConfig
228
+ top_k: -1
229
+ top_p: 1.0
230
+ temperature: 0
231
+ 'n': 1
232
+ do_sample: false
233
+ multi_turn:
234
+ _target_: verl.workers.config.MultiTurnConfig
235
+ enable: false
236
+ max_assistant_turns: null
237
+ tool_config_path: null
238
+ max_user_turns: null
239
+ max_parallel_calls: 1
240
+ max_tool_response_length: 256
241
+ tool_response_truncate_side: middle
242
+ interaction_config_path: null
243
+ use_inference_chat_template: false
244
+ tokenization_sanity_check_mode: strict
245
+ format: hermes
246
+ num_repeat_rollouts: null
247
+ calculate_log_probs: false
248
+ agent:
249
+ _target_: verl.workers.config.AgentLoopConfig
250
+ num_workers: 8
251
+ default_agent_loop: single_turn_agent
252
+ agent_loop_config_path: null
253
+ custom_async_server:
254
+ _target_: verl.workers.config.CustomAsyncServerConfig
255
+ path: null
256
+ name: null
257
+ checkpoint_engine:
258
+ _target_: verl.workers.config.CheckpointEngineConfig
259
+ backend: naive
260
+ update_weights_bucket_megabytes: 2048
261
+ engine_kwargs: {}
262
+ trace:
263
+ _target_: verl.workers.config.TraceConfig
264
+ backend: null
265
+ token2text: false
266
+ max_samples_per_step_per_worker: null
267
+ skip_rollout: false
268
+ skip_dump_dir: /tmp/rollout_dump
269
+ skip_tokenizer_init: true
270
+ enable_rollout_routing_replay: false
271
+ profiler:
272
+ _target_: verl.utils.profiler.ProfilerConfig
273
+ tool: ${oc.select:global_profiler.tool,null}
274
+ enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
275
+ all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
276
+ ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
277
+ save_path: ${oc.select:global_profiler.save_path,null}
278
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
279
+ prometheus:
280
+ _target_: verl.workers.config.PrometheusConfig
281
+ enable: false
282
+ port: 9090
283
+ file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml
284
+ served_model_name: ${oc.select:actor_rollout_ref.model.path,null}
285
+ quantization: null
286
+ quantization_config_file: null
287
+ mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
288
+ layered_summon: false
289
+ model:
290
+ _target_: verl.workers.config.HFModelConfig
291
+ path: Qwen/Qwen3-4B-Instruct-2507
292
+ hf_config_path: null
293
+ tokenizer_path: null
294
+ use_shm: false
295
+ trust_remote_code: false
296
+ custom_chat_template: null
297
+ external_lib: null
298
+ override_config: {}
299
+ enable_gradient_checkpointing: true
300
+ enable_activation_offload: false
301
+ use_remove_padding: true
302
+ lora_rank: 0
303
+ lora_alpha: 16
304
+ target_modules: all-linear
305
+ exclude_modules: null
306
+ lora_adapter_path: null
307
+ use_liger: false
308
+ use_fused_kernels: false
309
+ fused_kernel_options:
310
+ impl_backend: torch
311
+ tiled_mlp:
312
+ enabled: false
313
+ num_shards: 4
314
+ mtp:
315
+ _target_: verl.workers.config.MtpConfig
316
+ enable: false
317
+ enable_train: false
318
+ enable_rollout: false
319
+ detach_encoder: false
320
+ mtp_loss_scaling_factor: 0.1
321
+ speculative_algorithm: EAGLE
322
+ speculative_num_steps: 3
323
+ speculative_eagle_topk: 1
324
+ speculative_num_draft_tokens: 4
325
+ method: mtp
326
+ num_speculative_tokens: 1
327
+ hybrid_engine: true
328
+ nccl_timeout: 600
329
+ data:
330
+ tokenizer: null
331
+ use_shm: false
332
+ train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
333
+ val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
334
+ train_max_samples: -1
335
+ val_max_samples: -1
336
+ prompt_key: prompt
337
+ reward_fn_key: data_source
338
+ max_prompt_length: 512
339
+ max_response_length: 768
340
+ train_batch_size: 512
341
+ val_batch_size: null
342
+ tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path,
343
+ null}
344
+ return_raw_input_ids: false
345
+ return_raw_chat: true
346
+ return_full_prompt: false
347
+ shuffle: true
348
+ seed: null
349
+ dataloader_num_workers: 8
350
+ image_patch_size: 14
351
+ validation_shuffle: false
352
+ filter_overlong_prompts: true
353
+ filter_overlong_prompts_workers: 1
354
+ truncation: error
355
+ image_key: images
356
+ video_key: videos
357
+ trust_remote_code: false
358
+ custom_cls:
359
+ path: null
360
+ name: null
361
+ return_multi_modal_inputs: true
362
+ sampler:
363
+ class_path: null
364
+ class_name: null
365
+ datagen:
366
+ path: null
367
+ name: null
368
+ apply_chat_template_kwargs: {}
369
+ reward_manager:
370
+ _target_: verl.trainer.config.config.RewardManagerConfig
371
+ source: register
372
+ name: ${oc.select:reward_model.reward_manager,naive}
373
+ module:
374
+ _target_: verl.trainer.config.config.ModuleConfig
375
+ path: null
376
+ name: custom_reward_manager
377
+ critic:
378
+ optim:
379
+ _target_: verl.workers.config.FSDPOptimizerConfig
380
+ optimizer: AdamW
381
+ optimizer_impl: torch.optim
382
+ lr: 1.0e-05
383
+ lr_warmup_steps_ratio: 0.0
384
+ total_training_steps: -1
385
+ weight_decay: 0.01
386
+ lr_warmup_steps: -1
387
+ betas:
388
+ - 0.9
389
+ - 0.999
390
+ clip_grad: 1.0
391
+ min_lr_ratio: 0.0
392
+ num_cycles: 0.5
393
+ lr_scheduler_type: constant
394
+ warmup_style: null
395
+ override_optimizer_config: null
396
+ model:
397
+ fsdp_config:
398
+ _target_: verl.workers.config.FSDPEngineConfig
399
+ wrap_policy:
400
+ min_num_params: 0
401
+ param_offload: false
402
+ optimizer_offload: false
403
+ offload_policy: false
404
+ reshard_after_forward: true
405
+ fsdp_size: -1
406
+ forward_prefetch: false
407
+ model_dtype: fp32
408
+ use_orig_params: false
409
+ seed: 42
410
+ full_determinism: false
411
+ ulysses_sequence_parallel_size: 1
412
+ entropy_from_logits_with_chunking: false
413
+ use_torch_compile: true
414
+ entropy_checkpointing: false
415
+ forward_only: false
416
+ strategy: fsdp
417
+ dtype: bfloat16
418
+ path: ~/models/deepseek-llm-7b-chat
419
+ tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
420
+ override_config: {}
421
+ external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
422
+ trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
423
+ _target_: verl.workers.config.FSDPCriticModelCfg
424
+ use_shm: false
425
+ enable_gradient_checkpointing: true
426
+ enable_activation_offload: false
427
+ use_remove_padding: false
428
+ lora_rank: 0
429
+ lora_alpha: 16
430
+ target_modules: all-linear
431
+ tiled_mlp:
432
+ enabled: false
433
+ num_shards: 4
434
+ _target_: verl.workers.config.FSDPCriticConfig
435
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
436
+ strategy: fsdp
437
+ enable: null
438
+ ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
439
+ ppo_micro_batch_size: null
440
+ ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
441
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
442
+ ppo_max_token_len_per_gpu: 32768
443
+ forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
444
+ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
445
+ shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
446
+ data_loader_seed: 42
447
+ cliprange_value: 0.5
448
+ loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
449
+ checkpoint:
450
+ _target_: verl.trainer.config.CheckpointConfig
451
+ save_contents:
452
+ - model
453
+ - optimizer
454
+ - extra
455
+ load_contents: ${.save_contents}
456
+ async_save: false
457
+ profiler:
458
+ _target_: verl.utils.profiler.ProfilerConfig
459
+ tool: ${oc.select:global_profiler.tool,null}
460
+ enable: false
461
+ all_ranks: false
462
+ ranks: []
463
+ save_path: ${oc.select:global_profiler.save_path,null}
464
+ tool_config:
465
+ nsys:
466
+ _target_: verl.utils.profiler.config.NsightToolConfig
467
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
468
+ npu:
469
+ _target_: verl.utils.profiler.config.NPUToolConfig
470
+ contents: []
471
+ level: level0
472
+ analysis: true
473
+ discrete: false
474
+ torch:
475
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
476
+ contents: []
477
+ discrete: false
478
+ torch_memory:
479
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
480
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
481
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
482
+ forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
483
+ forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
484
+ ulysses_sequence_parallel_size: 1
485
+ grad_clip: 1.0
486
+ reward_model:
487
+ enable: false
488
+ enable_resource_pool: false
489
+ n_gpus_per_node: 8
490
+ nnodes: 0
491
+ strategy: fsdp
492
+ model:
493
+ input_tokenizer: ${actor_rollout_ref.model.path}
494
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
495
+ external_lib: ${actor_rollout_ref.model.external_lib}
496
+ trust_remote_code: false
497
+ override_config: {}
498
+ use_shm: false
499
+ use_remove_padding: false
500
+ use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
501
+ fsdp_config:
502
+ _target_: verl.workers.config.FSDPEngineConfig
503
+ wrap_policy:
504
+ min_num_params: 0
505
+ param_offload: false
506
+ reshard_after_forward: true
507
+ fsdp_size: -1
508
+ forward_prefetch: false
509
+ micro_batch_size: null
510
+ micro_batch_size_per_gpu: null
511
+ max_length: null
512
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
513
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
514
+ reward_manager: naive
515
+ reward_loop_source: register
516
+ reward_loop_module_path: null
517
+ reward_loop_class_name: null
518
+ launch_reward_fn_async: false
519
+ sandbox_fusion:
520
+ url: null
521
+ max_concurrent: 64
522
+ memory_limit_mb: 1024
523
+ profiler:
524
+ _target_: verl.utils.profiler.ProfilerConfig
525
+ tool: ${oc.select:global_profiler.tool,null}
526
+ enable: false
527
+ all_ranks: false
528
+ ranks: []
529
+ save_path: ${oc.select:global_profiler.save_path,null}
530
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
531
+ ulysses_sequence_parallel_size: 1
532
+ use_reward_loop: true
533
+ num_workers: 1
534
+ rollout:
535
+ _target_: verl.workers.config.RolloutConfig
536
+ name: ???
537
+ dtype: bfloat16
538
+ gpu_memory_utilization: 0.5
539
+ enforce_eager: true
540
+ cudagraph_capture_sizes: null
541
+ free_cache_engine: true
542
+ data_parallel_size: 1
543
+ expert_parallel_size: 1
544
+ tensor_model_parallel_size: 2
545
+ max_num_batched_tokens: 8192
546
+ max_model_len: null
547
+ max_num_seqs: 1024
548
+ load_format: auto
549
+ engine_kwargs: {}
550
+ limit_images: null
551
+ enable_chunked_prefill: true
552
+ enable_prefix_caching: true
553
+ disable_log_stats: true
554
+ skip_tokenizer_init: false
555
+ prompt_length: 2048
556
+ response_length: 2048
557
+ algorithm:
558
+ rollout_correction:
559
+ rollout_is: null
560
+ rollout_is_threshold: 2.0
561
+ rollout_rs: null
562
+ rollout_rs_threshold: null
563
+ bypass_mode: false
564
+ loss_type: ppo_clip
565
+ rollout_is_batch_normalize: false
566
+ _target_: verl.trainer.config.AlgoConfig
567
+ gamma: 1.0
568
+ lam: 1.0
569
+ adv_estimator: grpo
570
+ norm_adv_by_std_in_grpo: true
571
+ use_kl_in_reward: false
572
+ kl_penalty: kl
573
+ kl_ctrl:
574
+ _target_: verl.trainer.config.KLControlConfig
575
+ type: fixed
576
+ kl_coef: 0.001
577
+ horizon: 10000
578
+ target_kl: 0.1
579
+ use_pf_ppo: false
580
+ pf_ppo:
581
+ reweight_method: pow
582
+ weight_pow: 2.0
583
+ custom_reward_function:
584
+ path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
585
+ name: compute_score
586
+ trainer:
587
+ balance_batch: true
588
+ total_epochs: 15
589
+ total_training_steps: null
590
+ project_name: readctrl-verl
591
+ experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs
592
+ logger:
593
+ - console
594
+ - wandb
595
+ log_val_generations: 0
596
+ rollout_data_dir: null
597
+ validation_data_dir: null
598
+ nnodes: 1
599
+ n_gpus_per_node: 2
600
+ save_freq: 20
601
+ esi_redundant_time: 0
602
+ resume_mode: auto
603
+ resume_from_path: null
604
+ val_before_train: true
605
+ val_only: false
606
+ test_freq: 5
607
+ critic_warmup: 0
608
+ default_hdfs_dir: null
609
+ del_local_ckpt_after_load: false
610
+ default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
611
+ max_actor_ckpt_to_keep: null
612
+ max_critic_ckpt_to_keep: null
613
+ ray_wait_register_center_timeout: 300
614
+ device: cuda
615
+ use_legacy_worker_impl: auto
616
+ global_profiler:
617
+ _target_: verl.utils.profiler.ProfilerConfig
618
+ tool: null
619
+ steps: null
620
+ profile_continuous_steps: false
621
+ save_path: outputs/profile
622
+ global_tool_config:
623
+ nsys:
624
+ _target_: verl.utils.profiler.config.NsightToolConfig
625
+ discrete: false
626
+ controller_nsight_options:
627
+ trace: cuda,nvtx,cublas,ucx
628
+ cuda-memory-usage: 'true'
629
+ cuda-graph-trace: graph
630
+ worker_nsight_options:
631
+ trace: cuda,nvtx,cublas,ucx
632
+ cuda-memory-usage: 'true'
633
+ cuda-graph-trace: graph
634
+ capture-range: cudaProfilerApi
635
+ capture-range-end: null
636
+ kill: none
637
+ torch_memory:
638
+ trace_alloc_max_entries: 100000
639
+ stack_depth: 32
640
+ context: all
641
+ stacks: all
642
+ kw_args: {}
643
+ transfer_queue:
644
+ enable: false
645
+ ray_kwargs:
646
+ ray_init:
647
+ num_cpus: null
648
+ timeline_json_file: null
code/RL_model/verl/verl_train/outputs/2026-02-02/00-55-56/.hydra/hydra.yaml ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task:
115
+ - algorithm.adv_estimator=grpo
116
+ - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
117
+ - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
118
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
119
+ - data.train_batch_size=512
120
+ - data.max_prompt_length=512
121
+ - data.max_response_length=768
122
+ - data.filter_overlong_prompts=True
123
+ - data.truncation=error
124
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
125
+ - actor_rollout_ref.actor.optim.lr=1e-6
126
+ - actor_rollout_ref.model.use_remove_padding=True
127
+ - actor_rollout_ref.actor.ppo_mini_batch_size=256
128
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32
129
+ - actor_rollout_ref.actor.use_kl_loss=True
130
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
131
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
132
+ - actor_rollout_ref.actor.entropy_coeff=0
133
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
134
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
135
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
136
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
137
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
138
+ - actor_rollout_ref.rollout.name=vllm
139
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.6
140
+ - actor_rollout_ref.rollout.max_model_len=8192
141
+ - actor_rollout_ref.rollout.n=3
142
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
143
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
144
+ - algorithm.use_kl_in_reward=False
145
+ - trainer.critic_warmup=0
146
+ - trainer.logger=["console","wandb"]
147
+ - trainer.project_name=readctrl-verl
148
+ - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
149
+ - trainer.n_gpus_per_node=2
150
+ - trainer.nnodes=1
151
+ - trainer.save_freq=20
152
+ - trainer.test_freq=5
153
+ - trainer.total_epochs=15
154
+ job:
155
+ name: main_ppo
156
+ chdir: null
157
+ override_dirname: actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=512,data.max_response_length=768,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15
158
+ id: ???
159
+ num: ???
160
+ config_name: ppo_trainer
161
+ env_set: {}
162
+ env_copy: []
163
+ config:
164
+ override_dirname:
165
+ kv_sep: '='
166
+ item_sep: ','
167
+ exclude_keys: []
168
+ runtime:
169
+ version: 1.3.2
170
+ version_base: '1.3'
171
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train
172
+ config_sources:
173
+ - path: hydra.conf
174
+ schema: pkg
175
+ provider: hydra
176
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config
177
+ schema: file
178
+ provider: main
179
+ - path: ''
180
+ schema: structured
181
+ provider: schema
182
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-02/00-55-56
183
+ choices:
184
+ algorithm@algorithm.rollout_correction: rollout_correction
185
+ reward_model: dp_reward_loop
186
+ critic: dp_critic
187
+ critic/../engine@critic.model.fsdp_config: fsdp
188
+ critic/../optim@critic.optim: fsdp
189
+ model@actor_rollout_ref.model: hf_model
190
+ rollout@actor_rollout_ref.rollout: rollout
191
+ ref@actor_rollout_ref.ref: dp_ref
192
+ ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp
193
+ data: legacy_data
194
+ actor@actor_rollout_ref.actor: dp_actor
195
+ actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp
196
+ actor/../optim@actor_rollout_ref.actor.optim: fsdp
197
+ hydra/env: default
198
+ hydra/callbacks: null
199
+ hydra/job_logging: default
200
+ hydra/hydra_logging: default
201
+ hydra/hydra_help: default
202
+ hydra/help: default
203
+ hydra/sweeper: basic
204
+ hydra/launcher: basic
205
+ hydra/output: default
206
+ verbose: false
code/RL_model/verl/verl_train/outputs/2026-02-02/00-55-56/.hydra/overrides.yaml ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - algorithm.adv_estimator=grpo
2
+ - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
3
+ - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
4
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
5
+ - data.train_batch_size=512
6
+ - data.max_prompt_length=512
7
+ - data.max_response_length=768
8
+ - data.filter_overlong_prompts=True
9
+ - data.truncation=error
10
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
11
+ - actor_rollout_ref.actor.optim.lr=1e-6
12
+ - actor_rollout_ref.model.use_remove_padding=True
13
+ - actor_rollout_ref.actor.ppo_mini_batch_size=256
14
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32
15
+ - actor_rollout_ref.actor.use_kl_loss=True
16
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
17
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
18
+ - actor_rollout_ref.actor.entropy_coeff=0
19
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
20
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
21
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
22
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
23
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
24
+ - actor_rollout_ref.rollout.name=vllm
25
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.6
26
+ - actor_rollout_ref.rollout.max_model_len=8192
27
+ - actor_rollout_ref.rollout.n=3
28
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
29
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
30
+ - algorithm.use_kl_in_reward=False
31
+ - trainer.critic_warmup=0
32
+ - trainer.logger=["console","wandb"]
33
+ - trainer.project_name=readctrl-verl
34
+ - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
35
+ - trainer.n_gpus_per_node=2
36
+ - trainer.nnodes=1
37
+ - trainer.save_freq=20
38
+ - trainer.test_freq=5
39
+ - trainer.total_epochs=15
code/RL_model/verl/verl_train/outputs/2026-02-02/01-04-57/.hydra/config.yaml ADDED
@@ -0,0 +1,648 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ actor_rollout_ref:
2
+ actor:
3
+ optim:
4
+ _target_: verl.workers.config.FSDPOptimizerConfig
5
+ optimizer: AdamW
6
+ optimizer_impl: torch.optim
7
+ lr: 1.0e-06
8
+ lr_warmup_steps_ratio: 0.0
9
+ total_training_steps: -1
10
+ weight_decay: 0.01
11
+ lr_warmup_steps: -1
12
+ betas:
13
+ - 0.9
14
+ - 0.999
15
+ clip_grad: 1.0
16
+ min_lr_ratio: 0.0
17
+ num_cycles: 0.5
18
+ lr_scheduler_type: constant
19
+ warmup_style: null
20
+ override_optimizer_config: null
21
+ fsdp_config:
22
+ _target_: verl.workers.config.FSDPEngineConfig
23
+ wrap_policy:
24
+ min_num_params: 0
25
+ param_offload: false
26
+ optimizer_offload: false
27
+ offload_policy: false
28
+ reshard_after_forward: true
29
+ fsdp_size: -1
30
+ forward_prefetch: false
31
+ model_dtype: fp32
32
+ use_orig_params: false
33
+ seed: 42
34
+ full_determinism: false
35
+ ulysses_sequence_parallel_size: 1
36
+ entropy_from_logits_with_chunking: false
37
+ use_torch_compile: true
38
+ entropy_checkpointing: false
39
+ forward_only: false
40
+ strategy: fsdp
41
+ dtype: bfloat16
42
+ _target_: verl.workers.config.FSDPActorConfig
43
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
44
+ strategy: fsdp
45
+ ppo_mini_batch_size: 256
46
+ ppo_micro_batch_size: null
47
+ ppo_micro_batch_size_per_gpu: 32
48
+ use_dynamic_bsz: false
49
+ ppo_max_token_len_per_gpu: 16384
50
+ clip_ratio: 0.2
51
+ clip_ratio_low: 0.2
52
+ clip_ratio_high: 0.2
53
+ tau_pos: 1.0
54
+ tau_neg: 1.05
55
+ freeze_vision_tower: false
56
+ policy_loss:
57
+ _target_: verl.workers.config.PolicyLossConfig
58
+ loss_mode: vanilla
59
+ clip_cov_ratio: 0.0002
60
+ clip_cov_lb: 1.0
61
+ clip_cov_ub: 5.0
62
+ kl_cov_ratio: 0.0002
63
+ ppo_kl_coef: 0.1
64
+ clip_ratio_c: 3.0
65
+ loss_agg_mode: token-mean
66
+ loss_scale_factor: null
67
+ entropy_coeff: 0
68
+ calculate_entropy: false
69
+ use_kl_loss: true
70
+ use_prefix_grouper: false
71
+ use_torch_compile: true
72
+ kl_loss_coef: 0.001
73
+ kl_loss_type: low_var_kl
74
+ ppo_epochs: 1
75
+ shuffle: false
76
+ data_loader_seed: 42
77
+ checkpoint:
78
+ _target_: verl.trainer.config.CheckpointConfig
79
+ save_contents:
80
+ - model
81
+ - optimizer
82
+ - extra
83
+ load_contents: ${.save_contents}
84
+ async_save: false
85
+ use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
86
+ profiler:
87
+ _target_: verl.utils.profiler.ProfilerConfig
88
+ tool: ${oc.select:global_profiler.tool,null}
89
+ enable: false
90
+ all_ranks: false
91
+ ranks: []
92
+ save_path: ${oc.select:global_profiler.save_path,null}
93
+ tool_config:
94
+ nsys:
95
+ _target_: verl.utils.profiler.config.NsightToolConfig
96
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
97
+ npu:
98
+ _target_: verl.utils.profiler.config.NPUToolConfig
99
+ contents: []
100
+ level: level0
101
+ analysis: true
102
+ discrete: false
103
+ torch:
104
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
105
+ contents: []
106
+ discrete: false
107
+ torch_memory:
108
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
109
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
110
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
111
+ router_replay:
112
+ _target_: verl.workers.config.RouterReplayConfig
113
+ mode: disabled
114
+ record_file: null
115
+ replay_file: null
116
+ grad_clip: 1.0
117
+ ulysses_sequence_parallel_size: 1
118
+ entropy_from_logits_with_chunking: false
119
+ entropy_checkpointing: false
120
+ use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
121
+ calculate_sum_pi_squared: false
122
+ sum_pi_squared_checkpointing: false
123
+ ref:
124
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
125
+ strategy: ${actor_rollout_ref.actor.strategy}
126
+ use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
127
+ log_prob_micro_batch_size: null
128
+ log_prob_micro_batch_size_per_gpu: 32
129
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
130
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
131
+ profiler:
132
+ _target_: verl.utils.profiler.ProfilerConfig
133
+ tool: ${oc.select:global_profiler.tool,null}
134
+ enable: false
135
+ all_ranks: false
136
+ ranks: []
137
+ save_path: ${oc.select:global_profiler.save_path,null}
138
+ tool_config:
139
+ nsys:
140
+ _target_: verl.utils.profiler.config.NsightToolConfig
141
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
142
+ npu:
143
+ _target_: verl.utils.profiler.config.NPUToolConfig
144
+ contents: []
145
+ level: level0
146
+ analysis: true
147
+ discrete: false
148
+ torch:
149
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
150
+ contents: []
151
+ discrete: false
152
+ torch_memory:
153
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
154
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
155
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
156
+ router_replay:
157
+ _target_: verl.workers.config.RouterReplayConfig
158
+ mode: disabled
159
+ record_file: null
160
+ replay_file: null
161
+ fsdp_config:
162
+ _target_: verl.workers.config.FSDPEngineConfig
163
+ wrap_policy:
164
+ min_num_params: 0
165
+ param_offload: false
166
+ optimizer_offload: false
167
+ offload_policy: false
168
+ reshard_after_forward: true
169
+ fsdp_size: -1
170
+ forward_prefetch: false
171
+ model_dtype: fp32
172
+ use_orig_params: false
173
+ seed: 42
174
+ full_determinism: false
175
+ ulysses_sequence_parallel_size: 1
176
+ entropy_from_logits_with_chunking: false
177
+ use_torch_compile: true
178
+ entropy_checkpointing: false
179
+ forward_only: true
180
+ strategy: fsdp
181
+ dtype: bfloat16
182
+ _target_: verl.workers.config.FSDPActorConfig
183
+ ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
184
+ entropy_from_logits_with_chunking: false
185
+ entropy_checkpointing: false
186
+ rollout:
187
+ _target_: verl.workers.config.RolloutConfig
188
+ name: vllm
189
+ mode: async
190
+ temperature: 1.0
191
+ top_k: -1
192
+ top_p: 1
193
+ prompt_length: ${oc.select:data.max_prompt_length,512}
194
+ response_length: ${oc.select:data.max_response_length,512}
195
+ dtype: bfloat16
196
+ gpu_memory_utilization: 0.6
197
+ ignore_eos: false
198
+ enforce_eager: false
199
+ cudagraph_capture_sizes: null
200
+ free_cache_engine: true
201
+ tensor_model_parallel_size: 1
202
+ data_parallel_size: 1
203
+ expert_parallel_size: 1
204
+ pipeline_model_parallel_size: 1
205
+ max_num_batched_tokens: 8192
206
+ max_model_len: 8192
207
+ max_num_seqs: 1024
208
+ enable_chunked_prefill: true
209
+ enable_prefix_caching: true
210
+ logprobs_mode: processed_logprobs
211
+ scheduling_policy: fcfs
212
+ load_format: dummy
213
+ log_prob_micro_batch_size: null
214
+ log_prob_micro_batch_size_per_gpu: 32
215
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
216
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
217
+ disable_log_stats: true
218
+ do_sample: true
219
+ 'n': 3
220
+ over_sample_rate: 0
221
+ multi_stage_wake_up: false
222
+ engine_kwargs:
223
+ vllm: {}
224
+ sglang: {}
225
+ trtllm: {}
226
+ val_kwargs:
227
+ _target_: verl.workers.config.SamplingConfig
228
+ top_k: -1
229
+ top_p: 1.0
230
+ temperature: 0
231
+ 'n': 1
232
+ do_sample: false
233
+ multi_turn:
234
+ _target_: verl.workers.config.MultiTurnConfig
235
+ enable: false
236
+ max_assistant_turns: null
237
+ tool_config_path: null
238
+ max_user_turns: null
239
+ max_parallel_calls: 1
240
+ max_tool_response_length: 256
241
+ tool_response_truncate_side: middle
242
+ interaction_config_path: null
243
+ use_inference_chat_template: false
244
+ tokenization_sanity_check_mode: strict
245
+ format: hermes
246
+ num_repeat_rollouts: null
247
+ calculate_log_probs: false
248
+ agent:
249
+ _target_: verl.workers.config.AgentLoopConfig
250
+ num_workers: 8
251
+ default_agent_loop: single_turn_agent
252
+ agent_loop_config_path: null
253
+ custom_async_server:
254
+ _target_: verl.workers.config.CustomAsyncServerConfig
255
+ path: null
256
+ name: null
257
+ checkpoint_engine:
258
+ _target_: verl.workers.config.CheckpointEngineConfig
259
+ backend: naive
260
+ update_weights_bucket_megabytes: 2048
261
+ engine_kwargs: {}
262
+ trace:
263
+ _target_: verl.workers.config.TraceConfig
264
+ backend: null
265
+ token2text: false
266
+ max_samples_per_step_per_worker: null
267
+ skip_rollout: false
268
+ skip_dump_dir: /tmp/rollout_dump
269
+ skip_tokenizer_init: true
270
+ enable_rollout_routing_replay: false
271
+ profiler:
272
+ _target_: verl.utils.profiler.ProfilerConfig
273
+ tool: ${oc.select:global_profiler.tool,null}
274
+ enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
275
+ all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
276
+ ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
277
+ save_path: ${oc.select:global_profiler.save_path,null}
278
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
279
+ prometheus:
280
+ _target_: verl.workers.config.PrometheusConfig
281
+ enable: false
282
+ port: 9090
283
+ file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml
284
+ served_model_name: ${oc.select:actor_rollout_ref.model.path,null}
285
+ quantization: null
286
+ quantization_config_file: null
287
+ mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
288
+ layered_summon: false
289
+ model:
290
+ _target_: verl.workers.config.HFModelConfig
291
+ path: Qwen/Qwen3-4B-Instruct-2507
292
+ hf_config_path: null
293
+ tokenizer_path: null
294
+ use_shm: false
295
+ trust_remote_code: false
296
+ custom_chat_template: null
297
+ external_lib: null
298
+ override_config: {}
299
+ enable_gradient_checkpointing: true
300
+ enable_activation_offload: false
301
+ use_remove_padding: true
302
+ lora_rank: 0
303
+ lora_alpha: 16
304
+ target_modules: all-linear
305
+ exclude_modules: null
306
+ lora_adapter_path: null
307
+ use_liger: false
308
+ use_fused_kernels: false
309
+ fused_kernel_options:
310
+ impl_backend: torch
311
+ tiled_mlp:
312
+ enabled: false
313
+ num_shards: 4
314
+ mtp:
315
+ _target_: verl.workers.config.MtpConfig
316
+ enable: false
317
+ enable_train: false
318
+ enable_rollout: false
319
+ detach_encoder: false
320
+ mtp_loss_scaling_factor: 0.1
321
+ speculative_algorithm: EAGLE
322
+ speculative_num_steps: 3
323
+ speculative_eagle_topk: 1
324
+ speculative_num_draft_tokens: 4
325
+ method: mtp
326
+ num_speculative_tokens: 1
327
+ hybrid_engine: true
328
+ nccl_timeout: 600
329
+ data:
330
+ tokenizer: null
331
+ use_shm: false
332
+ train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
333
+ val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
334
+ train_max_samples: -1
335
+ val_max_samples: -1
336
+ prompt_key: prompt
337
+ reward_fn_key: data_source
338
+ max_prompt_length: 512
339
+ max_response_length: 768
340
+ train_batch_size: 512
341
+ val_batch_size: null
342
+ tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path,
343
+ null}
344
+ return_raw_input_ids: false
345
+ return_raw_chat: true
346
+ return_full_prompt: false
347
+ shuffle: true
348
+ seed: null
349
+ dataloader_num_workers: 8
350
+ image_patch_size: 14
351
+ validation_shuffle: false
352
+ filter_overlong_prompts: true
353
+ filter_overlong_prompts_workers: 1
354
+ truncation: error
355
+ image_key: images
356
+ video_key: videos
357
+ trust_remote_code: false
358
+ custom_cls:
359
+ path: null
360
+ name: null
361
+ return_multi_modal_inputs: true
362
+ sampler:
363
+ class_path: null
364
+ class_name: null
365
+ datagen:
366
+ path: null
367
+ name: null
368
+ apply_chat_template_kwargs: {}
369
+ reward_manager:
370
+ _target_: verl.trainer.config.config.RewardManagerConfig
371
+ source: register
372
+ name: ${oc.select:reward_model.reward_manager,naive}
373
+ module:
374
+ _target_: verl.trainer.config.config.ModuleConfig
375
+ path: null
376
+ name: custom_reward_manager
377
+ critic:
378
+ optim:
379
+ _target_: verl.workers.config.FSDPOptimizerConfig
380
+ optimizer: AdamW
381
+ optimizer_impl: torch.optim
382
+ lr: 1.0e-05
383
+ lr_warmup_steps_ratio: 0.0
384
+ total_training_steps: -1
385
+ weight_decay: 0.01
386
+ lr_warmup_steps: -1
387
+ betas:
388
+ - 0.9
389
+ - 0.999
390
+ clip_grad: 1.0
391
+ min_lr_ratio: 0.0
392
+ num_cycles: 0.5
393
+ lr_scheduler_type: constant
394
+ warmup_style: null
395
+ override_optimizer_config: null
396
+ model:
397
+ fsdp_config:
398
+ _target_: verl.workers.config.FSDPEngineConfig
399
+ wrap_policy:
400
+ min_num_params: 0
401
+ param_offload: false
402
+ optimizer_offload: false
403
+ offload_policy: false
404
+ reshard_after_forward: true
405
+ fsdp_size: -1
406
+ forward_prefetch: false
407
+ model_dtype: fp32
408
+ use_orig_params: false
409
+ seed: 42
410
+ full_determinism: false
411
+ ulysses_sequence_parallel_size: 1
412
+ entropy_from_logits_with_chunking: false
413
+ use_torch_compile: true
414
+ entropy_checkpointing: false
415
+ forward_only: false
416
+ strategy: fsdp
417
+ dtype: bfloat16
418
+ path: ~/models/deepseek-llm-7b-chat
419
+ tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
420
+ override_config: {}
421
+ external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
422
+ trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
423
+ _target_: verl.workers.config.FSDPCriticModelCfg
424
+ use_shm: false
425
+ enable_gradient_checkpointing: true
426
+ enable_activation_offload: false
427
+ use_remove_padding: false
428
+ lora_rank: 0
429
+ lora_alpha: 16
430
+ target_modules: all-linear
431
+ tiled_mlp:
432
+ enabled: false
433
+ num_shards: 4
434
+ _target_: verl.workers.config.FSDPCriticConfig
435
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
436
+ strategy: fsdp
437
+ enable: null
438
+ ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
439
+ ppo_micro_batch_size: null
440
+ ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
441
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
442
+ ppo_max_token_len_per_gpu: 32768
443
+ forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
444
+ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
445
+ shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
446
+ data_loader_seed: 42
447
+ cliprange_value: 0.5
448
+ loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
449
+ checkpoint:
450
+ _target_: verl.trainer.config.CheckpointConfig
451
+ save_contents:
452
+ - model
453
+ - optimizer
454
+ - extra
455
+ load_contents: ${.save_contents}
456
+ async_save: false
457
+ profiler:
458
+ _target_: verl.utils.profiler.ProfilerConfig
459
+ tool: ${oc.select:global_profiler.tool,null}
460
+ enable: false
461
+ all_ranks: false
462
+ ranks: []
463
+ save_path: ${oc.select:global_profiler.save_path,null}
464
+ tool_config:
465
+ nsys:
466
+ _target_: verl.utils.profiler.config.NsightToolConfig
467
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
468
+ npu:
469
+ _target_: verl.utils.profiler.config.NPUToolConfig
470
+ contents: []
471
+ level: level0
472
+ analysis: true
473
+ discrete: false
474
+ torch:
475
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
476
+ contents: []
477
+ discrete: false
478
+ torch_memory:
479
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
480
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
481
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
482
+ forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
483
+ forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
484
+ ulysses_sequence_parallel_size: 1
485
+ grad_clip: 1.0
486
+ reward_model:
487
+ enable: false
488
+ enable_resource_pool: false
489
+ n_gpus_per_node: 8
490
+ nnodes: 0
491
+ strategy: fsdp
492
+ model:
493
+ input_tokenizer: ${actor_rollout_ref.model.path}
494
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
495
+ external_lib: ${actor_rollout_ref.model.external_lib}
496
+ trust_remote_code: false
497
+ override_config: {}
498
+ use_shm: false
499
+ use_remove_padding: false
500
+ use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
501
+ fsdp_config:
502
+ _target_: verl.workers.config.FSDPEngineConfig
503
+ wrap_policy:
504
+ min_num_params: 0
505
+ param_offload: false
506
+ reshard_after_forward: true
507
+ fsdp_size: -1
508
+ forward_prefetch: false
509
+ micro_batch_size: null
510
+ micro_batch_size_per_gpu: null
511
+ max_length: null
512
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
513
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
514
+ reward_manager: naive
515
+ reward_loop_source: register
516
+ reward_loop_module_path: null
517
+ reward_loop_class_name: null
518
+ launch_reward_fn_async: false
519
+ sandbox_fusion:
520
+ url: null
521
+ max_concurrent: 64
522
+ memory_limit_mb: 1024
523
+ profiler:
524
+ _target_: verl.utils.profiler.ProfilerConfig
525
+ tool: ${oc.select:global_profiler.tool,null}
526
+ enable: false
527
+ all_ranks: false
528
+ ranks: []
529
+ save_path: ${oc.select:global_profiler.save_path,null}
530
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
531
+ ulysses_sequence_parallel_size: 1
532
+ use_reward_loop: true
533
+ num_workers: 1
534
+ rollout:
535
+ _target_: verl.workers.config.RolloutConfig
536
+ name: ???
537
+ dtype: bfloat16
538
+ gpu_memory_utilization: 0.5
539
+ enforce_eager: true
540
+ cudagraph_capture_sizes: null
541
+ free_cache_engine: true
542
+ data_parallel_size: 1
543
+ expert_parallel_size: 1
544
+ tensor_model_parallel_size: 2
545
+ max_num_batched_tokens: 8192
546
+ max_model_len: null
547
+ max_num_seqs: 1024
548
+ load_format: auto
549
+ engine_kwargs: {}
550
+ limit_images: null
551
+ enable_chunked_prefill: true
552
+ enable_prefix_caching: true
553
+ disable_log_stats: true
554
+ skip_tokenizer_init: false
555
+ prompt_length: 2048
556
+ response_length: 2048
557
+ algorithm:
558
+ rollout_correction:
559
+ rollout_is: null
560
+ rollout_is_threshold: 2.0
561
+ rollout_rs: null
562
+ rollout_rs_threshold: null
563
+ bypass_mode: false
564
+ loss_type: ppo_clip
565
+ rollout_is_batch_normalize: false
566
+ _target_: verl.trainer.config.AlgoConfig
567
+ gamma: 1.0
568
+ lam: 1.0
569
+ adv_estimator: grpo
570
+ norm_adv_by_std_in_grpo: true
571
+ use_kl_in_reward: false
572
+ kl_penalty: kl
573
+ kl_ctrl:
574
+ _target_: verl.trainer.config.KLControlConfig
575
+ type: fixed
576
+ kl_coef: 0.001
577
+ horizon: 10000
578
+ target_kl: 0.1
579
+ use_pf_ppo: false
580
+ pf_ppo:
581
+ reweight_method: pow
582
+ weight_pow: 2.0
583
+ custom_reward_function:
584
+ path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
585
+ name: compute_score
586
+ trainer:
587
+ balance_batch: true
588
+ total_epochs: 15
589
+ total_training_steps: null
590
+ project_name: readctrl-verl
591
+ experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs
592
+ logger:
593
+ - console
594
+ - wandb
595
+ log_val_generations: 0
596
+ rollout_data_dir: null
597
+ validation_data_dir: null
598
+ nnodes: 1
599
+ n_gpus_per_node: 2
600
+ save_freq: 20
601
+ esi_redundant_time: 0
602
+ resume_mode: auto
603
+ resume_from_path: null
604
+ val_before_train: true
605
+ val_only: false
606
+ test_freq: 5
607
+ critic_warmup: 0
608
+ default_hdfs_dir: null
609
+ del_local_ckpt_after_load: false
610
+ default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
611
+ max_actor_ckpt_to_keep: null
612
+ max_critic_ckpt_to_keep: null
613
+ ray_wait_register_center_timeout: 300
614
+ device: cuda
615
+ use_legacy_worker_impl: auto
616
+ global_profiler:
617
+ _target_: verl.utils.profiler.ProfilerConfig
618
+ tool: null
619
+ steps: null
620
+ profile_continuous_steps: false
621
+ save_path: outputs/profile
622
+ global_tool_config:
623
+ nsys:
624
+ _target_: verl.utils.profiler.config.NsightToolConfig
625
+ discrete: false
626
+ controller_nsight_options:
627
+ trace: cuda,nvtx,cublas,ucx
628
+ cuda-memory-usage: 'true'
629
+ cuda-graph-trace: graph
630
+ worker_nsight_options:
631
+ trace: cuda,nvtx,cublas,ucx
632
+ cuda-memory-usage: 'true'
633
+ cuda-graph-trace: graph
634
+ capture-range: cudaProfilerApi
635
+ capture-range-end: null
636
+ kill: none
637
+ torch_memory:
638
+ trace_alloc_max_entries: 100000
639
+ stack_depth: 32
640
+ context: all
641
+ stacks: all
642
+ kw_args: {}
643
+ transfer_queue:
644
+ enable: false
645
+ ray_kwargs:
646
+ ray_init:
647
+ num_cpus: null
648
+ timeline_json_file: null
code/RL_model/verl/verl_train/outputs/2026-02-02/01-04-57/.hydra/hydra.yaml ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task:
115
+ - algorithm.adv_estimator=grpo
116
+ - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
117
+ - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
118
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
119
+ - data.train_batch_size=512
120
+ - data.max_prompt_length=512
121
+ - data.max_response_length=768
122
+ - data.filter_overlong_prompts=True
123
+ - data.truncation=error
124
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
125
+ - actor_rollout_ref.actor.optim.lr=1e-6
126
+ - actor_rollout_ref.model.use_remove_padding=True
127
+ - actor_rollout_ref.actor.ppo_mini_batch_size=256
128
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32
129
+ - actor_rollout_ref.actor.use_kl_loss=True
130
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
131
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
132
+ - actor_rollout_ref.actor.entropy_coeff=0
133
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
134
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
135
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
136
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
137
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
138
+ - actor_rollout_ref.rollout.name=vllm
139
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.6
140
+ - actor_rollout_ref.rollout.max_model_len=8192
141
+ - actor_rollout_ref.rollout.n=3
142
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
143
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
144
+ - algorithm.use_kl_in_reward=False
145
+ - trainer.critic_warmup=0
146
+ - trainer.logger=["console","wandb"]
147
+ - trainer.project_name=readctrl-verl
148
+ - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
149
+ - trainer.n_gpus_per_node=2
150
+ - trainer.nnodes=1
151
+ - trainer.save_freq=20
152
+ - trainer.test_freq=5
153
+ - trainer.total_epochs=15
154
+ job:
155
+ name: main_ppo
156
+ chdir: null
157
+ override_dirname: actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=512,data.max_response_length=768,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15
158
+ id: ???
159
+ num: ???
160
+ config_name: ppo_trainer
161
+ env_set: {}
162
+ env_copy: []
163
+ config:
164
+ override_dirname:
165
+ kv_sep: '='
166
+ item_sep: ','
167
+ exclude_keys: []
168
+ runtime:
169
+ version: 1.3.2
170
+ version_base: '1.3'
171
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train
172
+ config_sources:
173
+ - path: hydra.conf
174
+ schema: pkg
175
+ provider: hydra
176
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config
177
+ schema: file
178
+ provider: main
179
+ - path: ''
180
+ schema: structured
181
+ provider: schema
182
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-02/01-04-57
183
+ choices:
184
+ algorithm@algorithm.rollout_correction: rollout_correction
185
+ reward_model: dp_reward_loop
186
+ critic: dp_critic
187
+ critic/../engine@critic.model.fsdp_config: fsdp
188
+ critic/../optim@critic.optim: fsdp
189
+ model@actor_rollout_ref.model: hf_model
190
+ rollout@actor_rollout_ref.rollout: rollout
191
+ ref@actor_rollout_ref.ref: dp_ref
192
+ ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp
193
+ data: legacy_data
194
+ actor@actor_rollout_ref.actor: dp_actor
195
+ actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp
196
+ actor/../optim@actor_rollout_ref.actor.optim: fsdp
197
+ hydra/env: default
198
+ hydra/callbacks: null
199
+ hydra/job_logging: default
200
+ hydra/hydra_logging: default
201
+ hydra/hydra_help: default
202
+ hydra/help: default
203
+ hydra/sweeper: basic
204
+ hydra/launcher: basic
205
+ hydra/output: default
206
+ verbose: false
code/RL_model/verl/verl_train/outputs/2026-02-02/01-04-57/.hydra/overrides.yaml ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - algorithm.adv_estimator=grpo
2
+ - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
3
+ - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
4
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
5
+ - data.train_batch_size=512
6
+ - data.max_prompt_length=512
7
+ - data.max_response_length=768
8
+ - data.filter_overlong_prompts=True
9
+ - data.truncation=error
10
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
11
+ - actor_rollout_ref.actor.optim.lr=1e-6
12
+ - actor_rollout_ref.model.use_remove_padding=True
13
+ - actor_rollout_ref.actor.ppo_mini_batch_size=256
14
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32
15
+ - actor_rollout_ref.actor.use_kl_loss=True
16
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
17
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
18
+ - actor_rollout_ref.actor.entropy_coeff=0
19
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
20
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
21
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
22
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
23
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
24
+ - actor_rollout_ref.rollout.name=vllm
25
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.6
26
+ - actor_rollout_ref.rollout.max_model_len=8192
27
+ - actor_rollout_ref.rollout.n=3
28
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
29
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
30
+ - algorithm.use_kl_in_reward=False
31
+ - trainer.critic_warmup=0
32
+ - trainer.logger=["console","wandb"]
33
+ - trainer.project_name=readctrl-verl
34
+ - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
35
+ - trainer.n_gpus_per_node=2
36
+ - trainer.nnodes=1
37
+ - trainer.save_freq=20
38
+ - trainer.test_freq=5
39
+ - trainer.total_epochs=15
code/RL_model/verl/verl_train/outputs/2026-02-02/01-53-27/.hydra/config.yaml ADDED
@@ -0,0 +1,648 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ actor_rollout_ref:
2
+ actor:
3
+ optim:
4
+ _target_: verl.workers.config.FSDPOptimizerConfig
5
+ optimizer: AdamW
6
+ optimizer_impl: torch.optim
7
+ lr: 1.0e-06
8
+ lr_warmup_steps_ratio: 0.0
9
+ total_training_steps: -1
10
+ weight_decay: 0.01
11
+ lr_warmup_steps: -1
12
+ betas:
13
+ - 0.9
14
+ - 0.999
15
+ clip_grad: 1.0
16
+ min_lr_ratio: 0.0
17
+ num_cycles: 0.5
18
+ lr_scheduler_type: constant
19
+ warmup_style: null
20
+ override_optimizer_config: null
21
+ fsdp_config:
22
+ _target_: verl.workers.config.FSDPEngineConfig
23
+ wrap_policy:
24
+ min_num_params: 0
25
+ param_offload: false
26
+ optimizer_offload: false
27
+ offload_policy: false
28
+ reshard_after_forward: true
29
+ fsdp_size: -1
30
+ forward_prefetch: false
31
+ model_dtype: fp32
32
+ use_orig_params: false
33
+ seed: 42
34
+ full_determinism: false
35
+ ulysses_sequence_parallel_size: 1
36
+ entropy_from_logits_with_chunking: false
37
+ use_torch_compile: true
38
+ entropy_checkpointing: false
39
+ forward_only: false
40
+ strategy: fsdp
41
+ dtype: bfloat16
42
+ _target_: verl.workers.config.FSDPActorConfig
43
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
44
+ strategy: fsdp
45
+ ppo_mini_batch_size: 256
46
+ ppo_micro_batch_size: null
47
+ ppo_micro_batch_size_per_gpu: 32
48
+ use_dynamic_bsz: false
49
+ ppo_max_token_len_per_gpu: 16384
50
+ clip_ratio: 0.2
51
+ clip_ratio_low: 0.2
52
+ clip_ratio_high: 0.2
53
+ tau_pos: 1.0
54
+ tau_neg: 1.05
55
+ freeze_vision_tower: false
56
+ policy_loss:
57
+ _target_: verl.workers.config.PolicyLossConfig
58
+ loss_mode: vanilla
59
+ clip_cov_ratio: 0.0002
60
+ clip_cov_lb: 1.0
61
+ clip_cov_ub: 5.0
62
+ kl_cov_ratio: 0.0002
63
+ ppo_kl_coef: 0.1
64
+ clip_ratio_c: 3.0
65
+ loss_agg_mode: token-mean
66
+ loss_scale_factor: null
67
+ entropy_coeff: 0
68
+ calculate_entropy: false
69
+ use_kl_loss: true
70
+ use_prefix_grouper: false
71
+ use_torch_compile: true
72
+ kl_loss_coef: 0.001
73
+ kl_loss_type: low_var_kl
74
+ ppo_epochs: 1
75
+ shuffle: false
76
+ data_loader_seed: 42
77
+ checkpoint:
78
+ _target_: verl.trainer.config.CheckpointConfig
79
+ save_contents:
80
+ - model
81
+ - optimizer
82
+ - extra
83
+ load_contents: ${.save_contents}
84
+ async_save: false
85
+ use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
86
+ profiler:
87
+ _target_: verl.utils.profiler.ProfilerConfig
88
+ tool: ${oc.select:global_profiler.tool,null}
89
+ enable: false
90
+ all_ranks: false
91
+ ranks: []
92
+ save_path: ${oc.select:global_profiler.save_path,null}
93
+ tool_config:
94
+ nsys:
95
+ _target_: verl.utils.profiler.config.NsightToolConfig
96
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
97
+ npu:
98
+ _target_: verl.utils.profiler.config.NPUToolConfig
99
+ contents: []
100
+ level: level0
101
+ analysis: true
102
+ discrete: false
103
+ torch:
104
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
105
+ contents: []
106
+ discrete: false
107
+ torch_memory:
108
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
109
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
110
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
111
+ router_replay:
112
+ _target_: verl.workers.config.RouterReplayConfig
113
+ mode: disabled
114
+ record_file: null
115
+ replay_file: null
116
+ grad_clip: 1.0
117
+ ulysses_sequence_parallel_size: 1
118
+ entropy_from_logits_with_chunking: false
119
+ entropy_checkpointing: false
120
+ use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
121
+ calculate_sum_pi_squared: false
122
+ sum_pi_squared_checkpointing: false
123
+ ref:
124
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
125
+ strategy: ${actor_rollout_ref.actor.strategy}
126
+ use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
127
+ log_prob_micro_batch_size: null
128
+ log_prob_micro_batch_size_per_gpu: 32
129
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
130
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
131
+ profiler:
132
+ _target_: verl.utils.profiler.ProfilerConfig
133
+ tool: ${oc.select:global_profiler.tool,null}
134
+ enable: false
135
+ all_ranks: false
136
+ ranks: []
137
+ save_path: ${oc.select:global_profiler.save_path,null}
138
+ tool_config:
139
+ nsys:
140
+ _target_: verl.utils.profiler.config.NsightToolConfig
141
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
142
+ npu:
143
+ _target_: verl.utils.profiler.config.NPUToolConfig
144
+ contents: []
145
+ level: level0
146
+ analysis: true
147
+ discrete: false
148
+ torch:
149
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
150
+ contents: []
151
+ discrete: false
152
+ torch_memory:
153
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
154
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
155
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
156
+ router_replay:
157
+ _target_: verl.workers.config.RouterReplayConfig
158
+ mode: disabled
159
+ record_file: null
160
+ replay_file: null
161
+ fsdp_config:
162
+ _target_: verl.workers.config.FSDPEngineConfig
163
+ wrap_policy:
164
+ min_num_params: 0
165
+ param_offload: false
166
+ optimizer_offload: false
167
+ offload_policy: false
168
+ reshard_after_forward: true
169
+ fsdp_size: -1
170
+ forward_prefetch: false
171
+ model_dtype: fp32
172
+ use_orig_params: false
173
+ seed: 42
174
+ full_determinism: false
175
+ ulysses_sequence_parallel_size: 1
176
+ entropy_from_logits_with_chunking: false
177
+ use_torch_compile: true
178
+ entropy_checkpointing: false
179
+ forward_only: true
180
+ strategy: fsdp
181
+ dtype: bfloat16
182
+ _target_: verl.workers.config.FSDPActorConfig
183
+ ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
184
+ entropy_from_logits_with_chunking: false
185
+ entropy_checkpointing: false
186
+ rollout:
187
+ _target_: verl.workers.config.RolloutConfig
188
+ name: vllm
189
+ mode: async
190
+ temperature: 1.0
191
+ top_k: -1
192
+ top_p: 1
193
+ prompt_length: ${oc.select:data.max_prompt_length,512}
194
+ response_length: ${oc.select:data.max_response_length,512}
195
+ dtype: bfloat16
196
+ gpu_memory_utilization: 0.6
197
+ ignore_eos: false
198
+ enforce_eager: false
199
+ cudagraph_capture_sizes: null
200
+ free_cache_engine: true
201
+ tensor_model_parallel_size: 1
202
+ data_parallel_size: 1
203
+ expert_parallel_size: 1
204
+ pipeline_model_parallel_size: 1
205
+ max_num_batched_tokens: 8192
206
+ max_model_len: 8192
207
+ max_num_seqs: 1024
208
+ enable_chunked_prefill: true
209
+ enable_prefix_caching: true
210
+ logprobs_mode: processed_logprobs
211
+ scheduling_policy: fcfs
212
+ load_format: dummy
213
+ log_prob_micro_batch_size: null
214
+ log_prob_micro_batch_size_per_gpu: 32
215
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
216
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
217
+ disable_log_stats: true
218
+ do_sample: true
219
+ 'n': 3
220
+ over_sample_rate: 0
221
+ multi_stage_wake_up: false
222
+ engine_kwargs:
223
+ vllm: {}
224
+ sglang: {}
225
+ trtllm: {}
226
+ val_kwargs:
227
+ _target_: verl.workers.config.SamplingConfig
228
+ top_k: -1
229
+ top_p: 1.0
230
+ temperature: 0
231
+ 'n': 1
232
+ do_sample: false
233
+ multi_turn:
234
+ _target_: verl.workers.config.MultiTurnConfig
235
+ enable: false
236
+ max_assistant_turns: null
237
+ tool_config_path: null
238
+ max_user_turns: null
239
+ max_parallel_calls: 1
240
+ max_tool_response_length: 256
241
+ tool_response_truncate_side: middle
242
+ interaction_config_path: null
243
+ use_inference_chat_template: false
244
+ tokenization_sanity_check_mode: strict
245
+ format: hermes
246
+ num_repeat_rollouts: null
247
+ calculate_log_probs: false
248
+ agent:
249
+ _target_: verl.workers.config.AgentLoopConfig
250
+ num_workers: 8
251
+ default_agent_loop: single_turn_agent
252
+ agent_loop_config_path: null
253
+ custom_async_server:
254
+ _target_: verl.workers.config.CustomAsyncServerConfig
255
+ path: null
256
+ name: null
257
+ checkpoint_engine:
258
+ _target_: verl.workers.config.CheckpointEngineConfig
259
+ backend: naive
260
+ update_weights_bucket_megabytes: 2048
261
+ engine_kwargs: {}
262
+ trace:
263
+ _target_: verl.workers.config.TraceConfig
264
+ backend: null
265
+ token2text: false
266
+ max_samples_per_step_per_worker: null
267
+ skip_rollout: false
268
+ skip_dump_dir: /tmp/rollout_dump
269
+ skip_tokenizer_init: true
270
+ enable_rollout_routing_replay: false
271
+ profiler:
272
+ _target_: verl.utils.profiler.ProfilerConfig
273
+ tool: ${oc.select:global_profiler.tool,null}
274
+ enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
275
+ all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
276
+ ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
277
+ save_path: ${oc.select:global_profiler.save_path,null}
278
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
279
+ prometheus:
280
+ _target_: verl.workers.config.PrometheusConfig
281
+ enable: false
282
+ port: 9090
283
+ file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml
284
+ served_model_name: ${oc.select:actor_rollout_ref.model.path,null}
285
+ quantization: null
286
+ quantization_config_file: null
287
+ mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
288
+ layered_summon: false
289
+ model:
290
+ _target_: verl.workers.config.HFModelConfig
291
+ path: Qwen/Qwen3-4B-Instruct-2507
292
+ hf_config_path: null
293
+ tokenizer_path: null
294
+ use_shm: false
295
+ trust_remote_code: false
296
+ custom_chat_template: null
297
+ external_lib: null
298
+ override_config: {}
299
+ enable_gradient_checkpointing: true
300
+ enable_activation_offload: false
301
+ use_remove_padding: true
302
+ lora_rank: 0
303
+ lora_alpha: 16
304
+ target_modules: all-linear
305
+ exclude_modules: null
306
+ lora_adapter_path: null
307
+ use_liger: false
308
+ use_fused_kernels: false
309
+ fused_kernel_options:
310
+ impl_backend: torch
311
+ tiled_mlp:
312
+ enabled: false
313
+ num_shards: 4
314
+ mtp:
315
+ _target_: verl.workers.config.MtpConfig
316
+ enable: false
317
+ enable_train: false
318
+ enable_rollout: false
319
+ detach_encoder: false
320
+ mtp_loss_scaling_factor: 0.1
321
+ speculative_algorithm: EAGLE
322
+ speculative_num_steps: 3
323
+ speculative_eagle_topk: 1
324
+ speculative_num_draft_tokens: 4
325
+ method: mtp
326
+ num_speculative_tokens: 1
327
+ hybrid_engine: true
328
+ nccl_timeout: 600
329
+ data:
330
+ tokenizer: null
331
+ use_shm: false
332
+ train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
333
+ val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
334
+ train_max_samples: -1
335
+ val_max_samples: -1
336
+ prompt_key: prompt
337
+ reward_fn_key: data_source
338
+ max_prompt_length: 512
339
+ max_response_length: 768
340
+ train_batch_size: 512
341
+ val_batch_size: null
342
+ tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path,
343
+ null}
344
+ return_raw_input_ids: false
345
+ return_raw_chat: true
346
+ return_full_prompt: false
347
+ shuffle: true
348
+ seed: null
349
+ dataloader_num_workers: 8
350
+ image_patch_size: 14
351
+ validation_shuffle: false
352
+ filter_overlong_prompts: true
353
+ filter_overlong_prompts_workers: 1
354
+ truncation: error
355
+ image_key: images
356
+ video_key: videos
357
+ trust_remote_code: false
358
+ custom_cls:
359
+ path: null
360
+ name: null
361
+ return_multi_modal_inputs: true
362
+ sampler:
363
+ class_path: null
364
+ class_name: null
365
+ datagen:
366
+ path: null
367
+ name: null
368
+ apply_chat_template_kwargs: {}
369
+ reward_manager:
370
+ _target_: verl.trainer.config.config.RewardManagerConfig
371
+ source: register
372
+ name: ${oc.select:reward_model.reward_manager,naive}
373
+ module:
374
+ _target_: verl.trainer.config.config.ModuleConfig
375
+ path: null
376
+ name: custom_reward_manager
377
+ critic:
378
+ optim:
379
+ _target_: verl.workers.config.FSDPOptimizerConfig
380
+ optimizer: AdamW
381
+ optimizer_impl: torch.optim
382
+ lr: 1.0e-05
383
+ lr_warmup_steps_ratio: 0.0
384
+ total_training_steps: -1
385
+ weight_decay: 0.01
386
+ lr_warmup_steps: -1
387
+ betas:
388
+ - 0.9
389
+ - 0.999
390
+ clip_grad: 1.0
391
+ min_lr_ratio: 0.0
392
+ num_cycles: 0.5
393
+ lr_scheduler_type: constant
394
+ warmup_style: null
395
+ override_optimizer_config: null
396
+ model:
397
+ fsdp_config:
398
+ _target_: verl.workers.config.FSDPEngineConfig
399
+ wrap_policy:
400
+ min_num_params: 0
401
+ param_offload: false
402
+ optimizer_offload: false
403
+ offload_policy: false
404
+ reshard_after_forward: true
405
+ fsdp_size: -1
406
+ forward_prefetch: false
407
+ model_dtype: fp32
408
+ use_orig_params: false
409
+ seed: 42
410
+ full_determinism: false
411
+ ulysses_sequence_parallel_size: 1
412
+ entropy_from_logits_with_chunking: false
413
+ use_torch_compile: true
414
+ entropy_checkpointing: false
415
+ forward_only: false
416
+ strategy: fsdp
417
+ dtype: bfloat16
418
+ path: ~/models/deepseek-llm-7b-chat
419
+ tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
420
+ override_config: {}
421
+ external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
422
+ trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
423
+ _target_: verl.workers.config.FSDPCriticModelCfg
424
+ use_shm: false
425
+ enable_gradient_checkpointing: true
426
+ enable_activation_offload: false
427
+ use_remove_padding: false
428
+ lora_rank: 0
429
+ lora_alpha: 16
430
+ target_modules: all-linear
431
+ tiled_mlp:
432
+ enabled: false
433
+ num_shards: 4
434
+ _target_: verl.workers.config.FSDPCriticConfig
435
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
436
+ strategy: fsdp
437
+ enable: null
438
+ ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
439
+ ppo_micro_batch_size: null
440
+ ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
441
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
442
+ ppo_max_token_len_per_gpu: 32768
443
+ forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
444
+ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
445
+ shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
446
+ data_loader_seed: 42
447
+ cliprange_value: 0.5
448
+ loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
449
+ checkpoint:
450
+ _target_: verl.trainer.config.CheckpointConfig
451
+ save_contents:
452
+ - model
453
+ - optimizer
454
+ - extra
455
+ load_contents: ${.save_contents}
456
+ async_save: false
457
+ profiler:
458
+ _target_: verl.utils.profiler.ProfilerConfig
459
+ tool: ${oc.select:global_profiler.tool,null}
460
+ enable: false
461
+ all_ranks: false
462
+ ranks: []
463
+ save_path: ${oc.select:global_profiler.save_path,null}
464
+ tool_config:
465
+ nsys:
466
+ _target_: verl.utils.profiler.config.NsightToolConfig
467
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
468
+ npu:
469
+ _target_: verl.utils.profiler.config.NPUToolConfig
470
+ contents: []
471
+ level: level0
472
+ analysis: true
473
+ discrete: false
474
+ torch:
475
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
476
+ contents: []
477
+ discrete: false
478
+ torch_memory:
479
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
480
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
481
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
482
+ forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
483
+ forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
484
+ ulysses_sequence_parallel_size: 1
485
+ grad_clip: 1.0
486
+ reward_model:
487
+ enable: false
488
+ enable_resource_pool: false
489
+ n_gpus_per_node: 8
490
+ nnodes: 0
491
+ strategy: fsdp
492
+ model:
493
+ input_tokenizer: ${actor_rollout_ref.model.path}
494
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
495
+ external_lib: ${actor_rollout_ref.model.external_lib}
496
+ trust_remote_code: false
497
+ override_config: {}
498
+ use_shm: false
499
+ use_remove_padding: false
500
+ use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
501
+ fsdp_config:
502
+ _target_: verl.workers.config.FSDPEngineConfig
503
+ wrap_policy:
504
+ min_num_params: 0
505
+ param_offload: false
506
+ reshard_after_forward: true
507
+ fsdp_size: -1
508
+ forward_prefetch: false
509
+ micro_batch_size: null
510
+ micro_batch_size_per_gpu: null
511
+ max_length: null
512
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
513
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
514
+ reward_manager: naive
515
+ reward_loop_source: register
516
+ reward_loop_module_path: null
517
+ reward_loop_class_name: null
518
+ launch_reward_fn_async: false
519
+ sandbox_fusion:
520
+ url: null
521
+ max_concurrent: 64
522
+ memory_limit_mb: 1024
523
+ profiler:
524
+ _target_: verl.utils.profiler.ProfilerConfig
525
+ tool: ${oc.select:global_profiler.tool,null}
526
+ enable: false
527
+ all_ranks: false
528
+ ranks: []
529
+ save_path: ${oc.select:global_profiler.save_path,null}
530
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
531
+ ulysses_sequence_parallel_size: 1
532
+ use_reward_loop: true
533
+ num_workers: 1
534
+ rollout:
535
+ _target_: verl.workers.config.RolloutConfig
536
+ name: ???
537
+ dtype: bfloat16
538
+ gpu_memory_utilization: 0.5
539
+ enforce_eager: true
540
+ cudagraph_capture_sizes: null
541
+ free_cache_engine: true
542
+ data_parallel_size: 1
543
+ expert_parallel_size: 1
544
+ tensor_model_parallel_size: 2
545
+ max_num_batched_tokens: 8192
546
+ max_model_len: null
547
+ max_num_seqs: 1024
548
+ load_format: auto
549
+ engine_kwargs: {}
550
+ limit_images: null
551
+ enable_chunked_prefill: true
552
+ enable_prefix_caching: true
553
+ disable_log_stats: true
554
+ skip_tokenizer_init: false
555
+ prompt_length: 2048
556
+ response_length: 2048
557
+ algorithm:
558
+ rollout_correction:
559
+ rollout_is: null
560
+ rollout_is_threshold: 2.0
561
+ rollout_rs: null
562
+ rollout_rs_threshold: null
563
+ bypass_mode: false
564
+ loss_type: ppo_clip
565
+ rollout_is_batch_normalize: false
566
+ _target_: verl.trainer.config.AlgoConfig
567
+ gamma: 1.0
568
+ lam: 1.0
569
+ adv_estimator: grpo
570
+ norm_adv_by_std_in_grpo: true
571
+ use_kl_in_reward: false
572
+ kl_penalty: kl
573
+ kl_ctrl:
574
+ _target_: verl.trainer.config.KLControlConfig
575
+ type: fixed
576
+ kl_coef: 0.001
577
+ horizon: 10000
578
+ target_kl: 0.1
579
+ use_pf_ppo: false
580
+ pf_ppo:
581
+ reweight_method: pow
582
+ weight_pow: 2.0
583
+ custom_reward_function:
584
+ path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
585
+ name: compute_score
586
+ trainer:
587
+ balance_batch: true
588
+ total_epochs: 15
589
+ total_training_steps: null
590
+ project_name: readctrl-verl
591
+ experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs
592
+ logger:
593
+ - console
594
+ - wandb
595
+ log_val_generations: 0
596
+ rollout_data_dir: null
597
+ validation_data_dir: null
598
+ nnodes: 1
599
+ n_gpus_per_node: 2
600
+ save_freq: 20
601
+ esi_redundant_time: 0
602
+ resume_mode: auto
603
+ resume_from_path: null
604
+ val_before_train: true
605
+ val_only: false
606
+ test_freq: 5
607
+ critic_warmup: 0
608
+ default_hdfs_dir: null
609
+ del_local_ckpt_after_load: false
610
+ default_local_dir: /home/mshahidul/readctrl/code/RL_model/RL_model
611
+ max_actor_ckpt_to_keep: null
612
+ max_critic_ckpt_to_keep: null
613
+ ray_wait_register_center_timeout: 300
614
+ device: cuda
615
+ use_legacy_worker_impl: auto
616
+ global_profiler:
617
+ _target_: verl.utils.profiler.ProfilerConfig
618
+ tool: null
619
+ steps: null
620
+ profile_continuous_steps: false
621
+ save_path: outputs/profile
622
+ global_tool_config:
623
+ nsys:
624
+ _target_: verl.utils.profiler.config.NsightToolConfig
625
+ discrete: false
626
+ controller_nsight_options:
627
+ trace: cuda,nvtx,cublas,ucx
628
+ cuda-memory-usage: 'true'
629
+ cuda-graph-trace: graph
630
+ worker_nsight_options:
631
+ trace: cuda,nvtx,cublas,ucx
632
+ cuda-memory-usage: 'true'
633
+ cuda-graph-trace: graph
634
+ capture-range: cudaProfilerApi
635
+ capture-range-end: null
636
+ kill: none
637
+ torch_memory:
638
+ trace_alloc_max_entries: 100000
639
+ stack_depth: 32
640
+ context: all
641
+ stacks: all
642
+ kw_args: {}
643
+ transfer_queue:
644
+ enable: false
645
+ ray_kwargs:
646
+ ray_init:
647
+ num_cpus: null
648
+ timeline_json_file: null
code/RL_model/verl/verl_train/outputs/2026-02-02/01-53-27/.hydra/hydra.yaml ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task:
115
+ - algorithm.adv_estimator=grpo
116
+ - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
117
+ - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
118
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
119
+ - data.train_batch_size=512
120
+ - data.max_prompt_length=512
121
+ - data.max_response_length=768
122
+ - data.filter_overlong_prompts=True
123
+ - data.truncation=error
124
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
125
+ - actor_rollout_ref.actor.optim.lr=1e-6
126
+ - actor_rollout_ref.model.use_remove_padding=True
127
+ - actor_rollout_ref.actor.ppo_mini_batch_size=256
128
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32
129
+ - actor_rollout_ref.actor.use_kl_loss=True
130
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
131
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
132
+ - actor_rollout_ref.actor.entropy_coeff=0
133
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
134
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
135
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
136
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
137
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
138
+ - actor_rollout_ref.rollout.name=vllm
139
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.6
140
+ - actor_rollout_ref.rollout.max_model_len=8192
141
+ - actor_rollout_ref.rollout.n=3
142
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
143
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
144
+ - algorithm.use_kl_in_reward=False
145
+ - trainer.critic_warmup=0
146
+ - trainer.logger=["console","wandb"]
147
+ - trainer.project_name=readctrl-verl
148
+ - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
149
+ - trainer.n_gpus_per_node=2
150
+ - trainer.nnodes=1
151
+ - trainer.save_freq=20
152
+ - trainer.test_freq=5
153
+ - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model
154
+ - trainer.total_epochs=15
155
+ job:
156
+ name: main_ppo
157
+ chdir: null
158
+ override_dirname: actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=512,data.max_response_length=768,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15
159
+ id: ???
160
+ num: ???
161
+ config_name: ppo_trainer
162
+ env_set: {}
163
+ env_copy: []
164
+ config:
165
+ override_dirname:
166
+ kv_sep: '='
167
+ item_sep: ','
168
+ exclude_keys: []
169
+ runtime:
170
+ version: 1.3.2
171
+ version_base: '1.3'
172
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train
173
+ config_sources:
174
+ - path: hydra.conf
175
+ schema: pkg
176
+ provider: hydra
177
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config
178
+ schema: file
179
+ provider: main
180
+ - path: ''
181
+ schema: structured
182
+ provider: schema
183
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-02/01-53-27
184
+ choices:
185
+ algorithm@algorithm.rollout_correction: rollout_correction
186
+ reward_model: dp_reward_loop
187
+ critic: dp_critic
188
+ critic/../engine@critic.model.fsdp_config: fsdp
189
+ critic/../optim@critic.optim: fsdp
190
+ model@actor_rollout_ref.model: hf_model
191
+ rollout@actor_rollout_ref.rollout: rollout
192
+ ref@actor_rollout_ref.ref: dp_ref
193
+ ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp
194
+ data: legacy_data
195
+ actor@actor_rollout_ref.actor: dp_actor
196
+ actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp
197
+ actor/../optim@actor_rollout_ref.actor.optim: fsdp
198
+ hydra/env: default
199
+ hydra/callbacks: null
200
+ hydra/job_logging: default
201
+ hydra/hydra_logging: default
202
+ hydra/hydra_help: default
203
+ hydra/help: default
204
+ hydra/sweeper: basic
205
+ hydra/launcher: basic
206
+ hydra/output: default
207
+ verbose: false
code/RL_model/verl/verl_train/outputs/2026-02-02/01-53-27/.hydra/overrides.yaml ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - algorithm.adv_estimator=grpo
2
+ - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
3
+ - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
4
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
5
+ - data.train_batch_size=512
6
+ - data.max_prompt_length=512
7
+ - data.max_response_length=768
8
+ - data.filter_overlong_prompts=True
9
+ - data.truncation=error
10
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
11
+ - actor_rollout_ref.actor.optim.lr=1e-6
12
+ - actor_rollout_ref.model.use_remove_padding=True
13
+ - actor_rollout_ref.actor.ppo_mini_batch_size=256
14
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32
15
+ - actor_rollout_ref.actor.use_kl_loss=True
16
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
17
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
18
+ - actor_rollout_ref.actor.entropy_coeff=0
19
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
20
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
21
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
22
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
23
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
24
+ - actor_rollout_ref.rollout.name=vllm
25
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.6
26
+ - actor_rollout_ref.rollout.max_model_len=8192
27
+ - actor_rollout_ref.rollout.n=3
28
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
29
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
30
+ - algorithm.use_kl_in_reward=False
31
+ - trainer.critic_warmup=0
32
+ - trainer.logger=["console","wandb"]
33
+ - trainer.project_name=readctrl-verl
34
+ - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
35
+ - trainer.n_gpus_per_node=2
36
+ - trainer.nnodes=1
37
+ - trainer.save_freq=20
38
+ - trainer.test_freq=5
39
+ - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model
40
+ - trainer.total_epochs=15
code/RL_model/verl/verl_train/outputs/2026-02-02/09-24-01/.hydra/config.yaml ADDED
@@ -0,0 +1,648 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ actor_rollout_ref:
2
+ actor:
3
+ optim:
4
+ _target_: verl.workers.config.FSDPOptimizerConfig
5
+ optimizer: AdamW
6
+ optimizer_impl: torch.optim
7
+ lr: 1.0e-06
8
+ lr_warmup_steps_ratio: 0.0
9
+ total_training_steps: -1
10
+ weight_decay: 0.01
11
+ lr_warmup_steps: -1
12
+ betas:
13
+ - 0.9
14
+ - 0.999
15
+ clip_grad: 1.0
16
+ min_lr_ratio: 0.0
17
+ num_cycles: 0.5
18
+ lr_scheduler_type: constant
19
+ warmup_style: null
20
+ override_optimizer_config: null
21
+ fsdp_config:
22
+ _target_: verl.workers.config.FSDPEngineConfig
23
+ wrap_policy:
24
+ min_num_params: 0
25
+ param_offload: false
26
+ optimizer_offload: false
27
+ offload_policy: false
28
+ reshard_after_forward: true
29
+ fsdp_size: -1
30
+ forward_prefetch: false
31
+ model_dtype: fp32
32
+ use_orig_params: false
33
+ seed: 42
34
+ full_determinism: false
35
+ ulysses_sequence_parallel_size: 1
36
+ entropy_from_logits_with_chunking: false
37
+ use_torch_compile: true
38
+ entropy_checkpointing: false
39
+ forward_only: false
40
+ strategy: fsdp
41
+ dtype: bfloat16
42
+ _target_: verl.workers.config.FSDPActorConfig
43
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
44
+ strategy: fsdp
45
+ ppo_mini_batch_size: 256
46
+ ppo_micro_batch_size: null
47
+ ppo_micro_batch_size_per_gpu: 32
48
+ use_dynamic_bsz: false
49
+ ppo_max_token_len_per_gpu: 16384
50
+ clip_ratio: 0.2
51
+ clip_ratio_low: 0.2
52
+ clip_ratio_high: 0.2
53
+ tau_pos: 1.0
54
+ tau_neg: 1.05
55
+ freeze_vision_tower: false
56
+ policy_loss:
57
+ _target_: verl.workers.config.PolicyLossConfig
58
+ loss_mode: vanilla
59
+ clip_cov_ratio: 0.0002
60
+ clip_cov_lb: 1.0
61
+ clip_cov_ub: 5.0
62
+ kl_cov_ratio: 0.0002
63
+ ppo_kl_coef: 0.1
64
+ clip_ratio_c: 3.0
65
+ loss_agg_mode: token-mean
66
+ loss_scale_factor: null
67
+ entropy_coeff: 0
68
+ calculate_entropy: false
69
+ use_kl_loss: true
70
+ use_prefix_grouper: false
71
+ use_torch_compile: true
72
+ kl_loss_coef: 0.001
73
+ kl_loss_type: low_var_kl
74
+ ppo_epochs: 1
75
+ shuffle: false
76
+ data_loader_seed: 42
77
+ checkpoint:
78
+ _target_: verl.trainer.config.CheckpointConfig
79
+ save_contents:
80
+ - model
81
+ - optimizer
82
+ - extra
83
+ load_contents: ${.save_contents}
84
+ async_save: false
85
+ use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
86
+ profiler:
87
+ _target_: verl.utils.profiler.ProfilerConfig
88
+ tool: ${oc.select:global_profiler.tool,null}
89
+ enable: false
90
+ all_ranks: false
91
+ ranks: []
92
+ save_path: ${oc.select:global_profiler.save_path,null}
93
+ tool_config:
94
+ nsys:
95
+ _target_: verl.utils.profiler.config.NsightToolConfig
96
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
97
+ npu:
98
+ _target_: verl.utils.profiler.config.NPUToolConfig
99
+ contents: []
100
+ level: level0
101
+ analysis: true
102
+ discrete: false
103
+ torch:
104
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
105
+ contents: []
106
+ discrete: false
107
+ torch_memory:
108
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
109
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
110
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
111
+ router_replay:
112
+ _target_: verl.workers.config.RouterReplayConfig
113
+ mode: disabled
114
+ record_file: null
115
+ replay_file: null
116
+ grad_clip: 1.0
117
+ ulysses_sequence_parallel_size: 1
118
+ entropy_from_logits_with_chunking: false
119
+ entropy_checkpointing: false
120
+ use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
121
+ calculate_sum_pi_squared: false
122
+ sum_pi_squared_checkpointing: false
123
+ ref:
124
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
125
+ strategy: ${actor_rollout_ref.actor.strategy}
126
+ use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
127
+ log_prob_micro_batch_size: null
128
+ log_prob_micro_batch_size_per_gpu: 32
129
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
130
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
131
+ profiler:
132
+ _target_: verl.utils.profiler.ProfilerConfig
133
+ tool: ${oc.select:global_profiler.tool,null}
134
+ enable: false
135
+ all_ranks: false
136
+ ranks: []
137
+ save_path: ${oc.select:global_profiler.save_path,null}
138
+ tool_config:
139
+ nsys:
140
+ _target_: verl.utils.profiler.config.NsightToolConfig
141
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
142
+ npu:
143
+ _target_: verl.utils.profiler.config.NPUToolConfig
144
+ contents: []
145
+ level: level0
146
+ analysis: true
147
+ discrete: false
148
+ torch:
149
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
150
+ contents: []
151
+ discrete: false
152
+ torch_memory:
153
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
154
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
155
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
156
+ router_replay:
157
+ _target_: verl.workers.config.RouterReplayConfig
158
+ mode: disabled
159
+ record_file: null
160
+ replay_file: null
161
+ fsdp_config:
162
+ _target_: verl.workers.config.FSDPEngineConfig
163
+ wrap_policy:
164
+ min_num_params: 0
165
+ param_offload: false
166
+ optimizer_offload: false
167
+ offload_policy: false
168
+ reshard_after_forward: true
169
+ fsdp_size: -1
170
+ forward_prefetch: false
171
+ model_dtype: fp32
172
+ use_orig_params: false
173
+ seed: 42
174
+ full_determinism: false
175
+ ulysses_sequence_parallel_size: 1
176
+ entropy_from_logits_with_chunking: false
177
+ use_torch_compile: true
178
+ entropy_checkpointing: false
179
+ forward_only: true
180
+ strategy: fsdp
181
+ dtype: bfloat16
182
+ _target_: verl.workers.config.FSDPActorConfig
183
+ ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
184
+ entropy_from_logits_with_chunking: false
185
+ entropy_checkpointing: false
186
+ rollout:
187
+ _target_: verl.workers.config.RolloutConfig
188
+ name: vllm
189
+ mode: async
190
+ temperature: 1.0
191
+ top_k: -1
192
+ top_p: 1
193
+ prompt_length: ${oc.select:data.max_prompt_length,512}
194
+ response_length: ${oc.select:data.max_response_length,512}
195
+ dtype: bfloat16
196
+ gpu_memory_utilization: 0.6
197
+ ignore_eos: false
198
+ enforce_eager: false
199
+ cudagraph_capture_sizes: null
200
+ free_cache_engine: true
201
+ tensor_model_parallel_size: 1
202
+ data_parallel_size: 1
203
+ expert_parallel_size: 1
204
+ pipeline_model_parallel_size: 1
205
+ max_num_batched_tokens: 8192
206
+ max_model_len: 8192
207
+ max_num_seqs: 1024
208
+ enable_chunked_prefill: true
209
+ enable_prefix_caching: true
210
+ logprobs_mode: processed_logprobs
211
+ scheduling_policy: fcfs
212
+ load_format: dummy
213
+ log_prob_micro_batch_size: null
214
+ log_prob_micro_batch_size_per_gpu: 32
215
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
216
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
217
+ disable_log_stats: true
218
+ do_sample: true
219
+ 'n': 3
220
+ over_sample_rate: 0
221
+ multi_stage_wake_up: false
222
+ engine_kwargs:
223
+ vllm: {}
224
+ sglang: {}
225
+ trtllm: {}
226
+ val_kwargs:
227
+ _target_: verl.workers.config.SamplingConfig
228
+ top_k: -1
229
+ top_p: 1.0
230
+ temperature: 0
231
+ 'n': 1
232
+ do_sample: false
233
+ multi_turn:
234
+ _target_: verl.workers.config.MultiTurnConfig
235
+ enable: false
236
+ max_assistant_turns: null
237
+ tool_config_path: null
238
+ max_user_turns: null
239
+ max_parallel_calls: 1
240
+ max_tool_response_length: 256
241
+ tool_response_truncate_side: middle
242
+ interaction_config_path: null
243
+ use_inference_chat_template: false
244
+ tokenization_sanity_check_mode: strict
245
+ format: hermes
246
+ num_repeat_rollouts: null
247
+ calculate_log_probs: false
248
+ agent:
249
+ _target_: verl.workers.config.AgentLoopConfig
250
+ num_workers: 8
251
+ default_agent_loop: single_turn_agent
252
+ agent_loop_config_path: null
253
+ custom_async_server:
254
+ _target_: verl.workers.config.CustomAsyncServerConfig
255
+ path: null
256
+ name: null
257
+ checkpoint_engine:
258
+ _target_: verl.workers.config.CheckpointEngineConfig
259
+ backend: naive
260
+ update_weights_bucket_megabytes: 2048
261
+ engine_kwargs: {}
262
+ trace:
263
+ _target_: verl.workers.config.TraceConfig
264
+ backend: null
265
+ token2text: false
266
+ max_samples_per_step_per_worker: null
267
+ skip_rollout: false
268
+ skip_dump_dir: /tmp/rollout_dump
269
+ skip_tokenizer_init: true
270
+ enable_rollout_routing_replay: false
271
+ profiler:
272
+ _target_: verl.utils.profiler.ProfilerConfig
273
+ tool: ${oc.select:global_profiler.tool,null}
274
+ enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
275
+ all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
276
+ ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
277
+ save_path: ${oc.select:global_profiler.save_path,null}
278
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
279
+ prometheus:
280
+ _target_: verl.workers.config.PrometheusConfig
281
+ enable: false
282
+ port: 9090
283
+ file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml
284
+ served_model_name: ${oc.select:actor_rollout_ref.model.path,null}
285
+ quantization: null
286
+ quantization_config_file: null
287
+ mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
288
+ layered_summon: false
289
+ model:
290
+ _target_: verl.workers.config.HFModelConfig
291
+ path: Qwen/Qwen3-4B-Instruct-2507
292
+ hf_config_path: null
293
+ tokenizer_path: null
294
+ use_shm: false
295
+ trust_remote_code: false
296
+ custom_chat_template: null
297
+ external_lib: null
298
+ override_config: {}
299
+ enable_gradient_checkpointing: true
300
+ enable_activation_offload: false
301
+ use_remove_padding: true
302
+ lora_rank: 0
303
+ lora_alpha: 16
304
+ target_modules: all-linear
305
+ exclude_modules: null
306
+ lora_adapter_path: null
307
+ use_liger: false
308
+ use_fused_kernels: false
309
+ fused_kernel_options:
310
+ impl_backend: torch
311
+ tiled_mlp:
312
+ enabled: false
313
+ num_shards: 4
314
+ mtp:
315
+ _target_: verl.workers.config.MtpConfig
316
+ enable: false
317
+ enable_train: false
318
+ enable_rollout: false
319
+ detach_encoder: false
320
+ mtp_loss_scaling_factor: 0.1
321
+ speculative_algorithm: EAGLE
322
+ speculative_num_steps: 3
323
+ speculative_eagle_topk: 1
324
+ speculative_num_draft_tokens: 4
325
+ method: mtp
326
+ num_speculative_tokens: 1
327
+ hybrid_engine: true
328
+ nccl_timeout: 600
329
+ data:
330
+ tokenizer: null
331
+ use_shm: false
332
+ train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
333
+ val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
334
+ train_max_samples: -1
335
+ val_max_samples: -1
336
+ prompt_key: prompt
337
+ reward_fn_key: data_source
338
+ max_prompt_length: 1024
339
+ max_response_length: 2048
340
+ train_batch_size: 512
341
+ val_batch_size: null
342
+ tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path,
343
+ null}
344
+ return_raw_input_ids: false
345
+ return_raw_chat: true
346
+ return_full_prompt: false
347
+ shuffle: true
348
+ seed: null
349
+ dataloader_num_workers: 8
350
+ image_patch_size: 14
351
+ validation_shuffle: false
352
+ filter_overlong_prompts: true
353
+ filter_overlong_prompts_workers: 1
354
+ truncation: error
355
+ image_key: images
356
+ video_key: videos
357
+ trust_remote_code: false
358
+ custom_cls:
359
+ path: null
360
+ name: null
361
+ return_multi_modal_inputs: true
362
+ sampler:
363
+ class_path: null
364
+ class_name: null
365
+ datagen:
366
+ path: null
367
+ name: null
368
+ apply_chat_template_kwargs: {}
369
+ reward_manager:
370
+ _target_: verl.trainer.config.config.RewardManagerConfig
371
+ source: register
372
+ name: ${oc.select:reward_model.reward_manager,naive}
373
+ module:
374
+ _target_: verl.trainer.config.config.ModuleConfig
375
+ path: null
376
+ name: custom_reward_manager
377
+ critic:
378
+ optim:
379
+ _target_: verl.workers.config.FSDPOptimizerConfig
380
+ optimizer: AdamW
381
+ optimizer_impl: torch.optim
382
+ lr: 1.0e-05
383
+ lr_warmup_steps_ratio: 0.0
384
+ total_training_steps: -1
385
+ weight_decay: 0.01
386
+ lr_warmup_steps: -1
387
+ betas:
388
+ - 0.9
389
+ - 0.999
390
+ clip_grad: 1.0
391
+ min_lr_ratio: 0.0
392
+ num_cycles: 0.5
393
+ lr_scheduler_type: constant
394
+ warmup_style: null
395
+ override_optimizer_config: null
396
+ model:
397
+ fsdp_config:
398
+ _target_: verl.workers.config.FSDPEngineConfig
399
+ wrap_policy:
400
+ min_num_params: 0
401
+ param_offload: false
402
+ optimizer_offload: false
403
+ offload_policy: false
404
+ reshard_after_forward: true
405
+ fsdp_size: -1
406
+ forward_prefetch: false
407
+ model_dtype: fp32
408
+ use_orig_params: false
409
+ seed: 42
410
+ full_determinism: false
411
+ ulysses_sequence_parallel_size: 1
412
+ entropy_from_logits_with_chunking: false
413
+ use_torch_compile: true
414
+ entropy_checkpointing: false
415
+ forward_only: false
416
+ strategy: fsdp
417
+ dtype: bfloat16
418
+ path: ~/models/deepseek-llm-7b-chat
419
+ tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
420
+ override_config: {}
421
+ external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
422
+ trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
423
+ _target_: verl.workers.config.FSDPCriticModelCfg
424
+ use_shm: false
425
+ enable_gradient_checkpointing: true
426
+ enable_activation_offload: false
427
+ use_remove_padding: false
428
+ lora_rank: 0
429
+ lora_alpha: 16
430
+ target_modules: all-linear
431
+ tiled_mlp:
432
+ enabled: false
433
+ num_shards: 4
434
+ _target_: verl.workers.config.FSDPCriticConfig
435
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
436
+ strategy: fsdp
437
+ enable: null
438
+ ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
439
+ ppo_micro_batch_size: null
440
+ ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
441
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
442
+ ppo_max_token_len_per_gpu: 32768
443
+ forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
444
+ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
445
+ shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
446
+ data_loader_seed: 42
447
+ cliprange_value: 0.5
448
+ loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
449
+ checkpoint:
450
+ _target_: verl.trainer.config.CheckpointConfig
451
+ save_contents:
452
+ - model
453
+ - optimizer
454
+ - extra
455
+ load_contents: ${.save_contents}
456
+ async_save: false
457
+ profiler:
458
+ _target_: verl.utils.profiler.ProfilerConfig
459
+ tool: ${oc.select:global_profiler.tool,null}
460
+ enable: false
461
+ all_ranks: false
462
+ ranks: []
463
+ save_path: ${oc.select:global_profiler.save_path,null}
464
+ tool_config:
465
+ nsys:
466
+ _target_: verl.utils.profiler.config.NsightToolConfig
467
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
468
+ npu:
469
+ _target_: verl.utils.profiler.config.NPUToolConfig
470
+ contents: []
471
+ level: level0
472
+ analysis: true
473
+ discrete: false
474
+ torch:
475
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
476
+ contents: []
477
+ discrete: false
478
+ torch_memory:
479
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
480
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
481
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
482
+ forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
483
+ forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
484
+ ulysses_sequence_parallel_size: 1
485
+ grad_clip: 1.0
486
+ reward_model:
487
+ enable: false
488
+ enable_resource_pool: false
489
+ n_gpus_per_node: 8
490
+ nnodes: 0
491
+ strategy: fsdp
492
+ model:
493
+ input_tokenizer: ${actor_rollout_ref.model.path}
494
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
495
+ external_lib: ${actor_rollout_ref.model.external_lib}
496
+ trust_remote_code: false
497
+ override_config: {}
498
+ use_shm: false
499
+ use_remove_padding: false
500
+ use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
501
+ fsdp_config:
502
+ _target_: verl.workers.config.FSDPEngineConfig
503
+ wrap_policy:
504
+ min_num_params: 0
505
+ param_offload: false
506
+ reshard_after_forward: true
507
+ fsdp_size: -1
508
+ forward_prefetch: false
509
+ micro_batch_size: null
510
+ micro_batch_size_per_gpu: null
511
+ max_length: null
512
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
513
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
514
+ reward_manager: naive
515
+ reward_loop_source: register
516
+ reward_loop_module_path: null
517
+ reward_loop_class_name: null
518
+ launch_reward_fn_async: false
519
+ sandbox_fusion:
520
+ url: null
521
+ max_concurrent: 64
522
+ memory_limit_mb: 1024
523
+ profiler:
524
+ _target_: verl.utils.profiler.ProfilerConfig
525
+ tool: ${oc.select:global_profiler.tool,null}
526
+ enable: false
527
+ all_ranks: false
528
+ ranks: []
529
+ save_path: ${oc.select:global_profiler.save_path,null}
530
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
531
+ ulysses_sequence_parallel_size: 1
532
+ use_reward_loop: true
533
+ num_workers: 1
534
+ rollout:
535
+ _target_: verl.workers.config.RolloutConfig
536
+ name: ???
537
+ dtype: bfloat16
538
+ gpu_memory_utilization: 0.5
539
+ enforce_eager: true
540
+ cudagraph_capture_sizes: null
541
+ free_cache_engine: true
542
+ data_parallel_size: 1
543
+ expert_parallel_size: 1
544
+ tensor_model_parallel_size: 2
545
+ max_num_batched_tokens: 8192
546
+ max_model_len: null
547
+ max_num_seqs: 1024
548
+ load_format: auto
549
+ engine_kwargs: {}
550
+ limit_images: null
551
+ enable_chunked_prefill: true
552
+ enable_prefix_caching: true
553
+ disable_log_stats: true
554
+ skip_tokenizer_init: false
555
+ prompt_length: 2048
556
+ response_length: 2048
557
+ algorithm:
558
+ rollout_correction:
559
+ rollout_is: null
560
+ rollout_is_threshold: 2.0
561
+ rollout_rs: null
562
+ rollout_rs_threshold: null
563
+ bypass_mode: false
564
+ loss_type: ppo_clip
565
+ rollout_is_batch_normalize: false
566
+ _target_: verl.trainer.config.AlgoConfig
567
+ gamma: 1.0
568
+ lam: 1.0
569
+ adv_estimator: grpo
570
+ norm_adv_by_std_in_grpo: true
571
+ use_kl_in_reward: false
572
+ kl_penalty: kl
573
+ kl_ctrl:
574
+ _target_: verl.trainer.config.KLControlConfig
575
+ type: fixed
576
+ kl_coef: 0.001
577
+ horizon: 10000
578
+ target_kl: 0.1
579
+ use_pf_ppo: false
580
+ pf_ppo:
581
+ reweight_method: pow
582
+ weight_pow: 2.0
583
+ custom_reward_function:
584
+ path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
585
+ name: compute_score
586
+ trainer:
587
+ balance_batch: true
588
+ total_epochs: 15
589
+ total_training_steps: null
590
+ project_name: readctrl-verl
591
+ experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs
592
+ logger:
593
+ - console
594
+ - wandb
595
+ log_val_generations: 0
596
+ rollout_data_dir: null
597
+ validation_data_dir: null
598
+ nnodes: 1
599
+ n_gpus_per_node: 2
600
+ save_freq: 20
601
+ esi_redundant_time: 0
602
+ resume_mode: auto
603
+ resume_from_path: null
604
+ val_before_train: true
605
+ val_only: false
606
+ test_freq: 5
607
+ critic_warmup: 0
608
+ default_hdfs_dir: null
609
+ del_local_ckpt_after_load: false
610
+ default_local_dir: /home/mshahidul/readctrl/code/RL_model/RL_model
611
+ max_actor_ckpt_to_keep: null
612
+ max_critic_ckpt_to_keep: null
613
+ ray_wait_register_center_timeout: 300
614
+ device: cuda
615
+ use_legacy_worker_impl: auto
616
+ global_profiler:
617
+ _target_: verl.utils.profiler.ProfilerConfig
618
+ tool: null
619
+ steps: null
620
+ profile_continuous_steps: false
621
+ save_path: outputs/profile
622
+ global_tool_config:
623
+ nsys:
624
+ _target_: verl.utils.profiler.config.NsightToolConfig
625
+ discrete: false
626
+ controller_nsight_options:
627
+ trace: cuda,nvtx,cublas,ucx
628
+ cuda-memory-usage: 'true'
629
+ cuda-graph-trace: graph
630
+ worker_nsight_options:
631
+ trace: cuda,nvtx,cublas,ucx
632
+ cuda-memory-usage: 'true'
633
+ cuda-graph-trace: graph
634
+ capture-range: cudaProfilerApi
635
+ capture-range-end: null
636
+ kill: none
637
+ torch_memory:
638
+ trace_alloc_max_entries: 100000
639
+ stack_depth: 32
640
+ context: all
641
+ stacks: all
642
+ kw_args: {}
643
+ transfer_queue:
644
+ enable: false
645
+ ray_kwargs:
646
+ ray_init:
647
+ num_cpus: null
648
+ timeline_json_file: null
code/RL_model/verl/verl_train/outputs/2026-02-06/20-52-38/.hydra/config.yaml ADDED
@@ -0,0 +1,649 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ actor_rollout_ref:
2
+ actor:
3
+ optim:
4
+ _target_: verl.workers.config.FSDPOptimizerConfig
5
+ optimizer: AdamW
6
+ optimizer_impl: torch.optim
7
+ lr: 1.0e-06
8
+ lr_warmup_steps_ratio: 0.0
9
+ total_training_steps: -1
10
+ weight_decay: 0.01
11
+ lr_warmup_steps: -1
12
+ betas:
13
+ - 0.9
14
+ - 0.999
15
+ clip_grad: 1.0
16
+ min_lr_ratio: 0.0
17
+ num_cycles: 0.5
18
+ lr_scheduler_type: constant
19
+ warmup_style: null
20
+ override_optimizer_config: null
21
+ fsdp_config:
22
+ _target_: verl.workers.config.FSDPEngineConfig
23
+ wrap_policy:
24
+ min_num_params: 0
25
+ param_offload: false
26
+ optimizer_offload: false
27
+ offload_policy: false
28
+ reshard_after_forward: true
29
+ fsdp_size: -1
30
+ forward_prefetch: false
31
+ model_dtype: fp32
32
+ use_orig_params: false
33
+ seed: 42
34
+ full_determinism: false
35
+ ulysses_sequence_parallel_size: 1
36
+ entropy_from_logits_with_chunking: false
37
+ use_torch_compile: true
38
+ entropy_checkpointing: false
39
+ forward_only: false
40
+ strategy: fsdp
41
+ dtype: bfloat16
42
+ _target_: verl.workers.config.FSDPActorConfig
43
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
44
+ strategy: fsdp
45
+ ppo_mini_batch_size: 256
46
+ ppo_micro_batch_size: null
47
+ ppo_micro_batch_size_per_gpu: 32
48
+ use_dynamic_bsz: false
49
+ ppo_max_token_len_per_gpu: 16384
50
+ clip_ratio: 0.2
51
+ clip_ratio_low: 0.2
52
+ clip_ratio_high: 0.2
53
+ tau_pos: 1.0
54
+ tau_neg: 1.05
55
+ freeze_vision_tower: false
56
+ policy_loss:
57
+ _target_: verl.workers.config.PolicyLossConfig
58
+ loss_mode: vanilla
59
+ clip_cov_ratio: 0.0002
60
+ clip_cov_lb: 1.0
61
+ clip_cov_ub: 5.0
62
+ kl_cov_ratio: 0.0002
63
+ ppo_kl_coef: 0.1
64
+ clip_ratio_c: 3.0
65
+ loss_agg_mode: token-mean
66
+ loss_scale_factor: null
67
+ entropy_coeff: 0
68
+ calculate_entropy: false
69
+ use_kl_loss: true
70
+ use_prefix_grouper: false
71
+ use_torch_compile: true
72
+ kl_loss_coef: 0.001
73
+ kl_loss_type: low_var_kl
74
+ ppo_epochs: 1
75
+ shuffle: false
76
+ data_loader_seed: 42
77
+ checkpoint:
78
+ _target_: verl.trainer.config.CheckpointConfig
79
+ save_contents:
80
+ - model
81
+ - optimizer
82
+ - extra
83
+ load_contents: ${.save_contents}
84
+ async_save: false
85
+ use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
86
+ profiler:
87
+ _target_: verl.utils.profiler.ProfilerConfig
88
+ tool: ${oc.select:global_profiler.tool,null}
89
+ enable: false
90
+ all_ranks: false
91
+ ranks: []
92
+ save_path: ${oc.select:global_profiler.save_path,null}
93
+ tool_config:
94
+ nsys:
95
+ _target_: verl.utils.profiler.config.NsightToolConfig
96
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
97
+ npu:
98
+ _target_: verl.utils.profiler.config.NPUToolConfig
99
+ contents: []
100
+ level: level0
101
+ analysis: true
102
+ discrete: false
103
+ torch:
104
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
105
+ contents: []
106
+ discrete: false
107
+ torch_memory:
108
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
109
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
110
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
111
+ router_replay:
112
+ _target_: verl.workers.config.RouterReplayConfig
113
+ mode: disabled
114
+ record_file: null
115
+ replay_file: null
116
+ grad_clip: 1.0
117
+ ulysses_sequence_parallel_size: 1
118
+ entropy_from_logits_with_chunking: false
119
+ entropy_checkpointing: false
120
+ use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
121
+ calculate_sum_pi_squared: false
122
+ sum_pi_squared_checkpointing: false
123
+ ref:
124
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
125
+ strategy: ${actor_rollout_ref.actor.strategy}
126
+ use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
127
+ log_prob_micro_batch_size: null
128
+ log_prob_micro_batch_size_per_gpu: 32
129
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
130
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
131
+ profiler:
132
+ _target_: verl.utils.profiler.ProfilerConfig
133
+ tool: ${oc.select:global_profiler.tool,null}
134
+ enable: false
135
+ all_ranks: false
136
+ ranks: []
137
+ save_path: ${oc.select:global_profiler.save_path,null}
138
+ tool_config:
139
+ nsys:
140
+ _target_: verl.utils.profiler.config.NsightToolConfig
141
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
142
+ npu:
143
+ _target_: verl.utils.profiler.config.NPUToolConfig
144
+ contents: []
145
+ level: level0
146
+ analysis: true
147
+ discrete: false
148
+ torch:
149
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
150
+ contents: []
151
+ discrete: false
152
+ torch_memory:
153
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
154
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
155
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
156
+ router_replay:
157
+ _target_: verl.workers.config.RouterReplayConfig
158
+ mode: disabled
159
+ record_file: null
160
+ replay_file: null
161
+ fsdp_config:
162
+ _target_: verl.workers.config.FSDPEngineConfig
163
+ wrap_policy:
164
+ min_num_params: 0
165
+ param_offload: false
166
+ optimizer_offload: false
167
+ offload_policy: false
168
+ reshard_after_forward: true
169
+ fsdp_size: -1
170
+ forward_prefetch: false
171
+ model_dtype: fp32
172
+ use_orig_params: false
173
+ seed: 42
174
+ full_determinism: false
175
+ ulysses_sequence_parallel_size: 1
176
+ entropy_from_logits_with_chunking: false
177
+ use_torch_compile: true
178
+ entropy_checkpointing: false
179
+ forward_only: true
180
+ strategy: fsdp
181
+ dtype: bfloat16
182
+ _target_: verl.workers.config.FSDPActorConfig
183
+ ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
184
+ entropy_from_logits_with_chunking: false
185
+ entropy_checkpointing: false
186
+ rollout:
187
+ _target_: verl.workers.config.RolloutConfig
188
+ name: vllm
189
+ mode: async
190
+ temperature: 1.0
191
+ top_k: -1
192
+ top_p: 1
193
+ prompt_length: ${oc.select:data.max_prompt_length,512}
194
+ response_length: ${oc.select:data.max_response_length,512}
195
+ dtype: bfloat16
196
+ gpu_memory_utilization: 0.6
197
+ ignore_eos: false
198
+ enforce_eager: false
199
+ cudagraph_capture_sizes: null
200
+ free_cache_engine: true
201
+ tensor_model_parallel_size: 1
202
+ data_parallel_size: 1
203
+ expert_parallel_size: 1
204
+ pipeline_model_parallel_size: 1
205
+ max_num_batched_tokens: 8192
206
+ max_model_len: 8192
207
+ max_num_seqs: 1024
208
+ enable_chunked_prefill: true
209
+ enable_prefix_caching: true
210
+ logprobs_mode: processed_logprobs
211
+ scheduling_policy: fcfs
212
+ load_format: dummy
213
+ log_prob_micro_batch_size: null
214
+ log_prob_micro_batch_size_per_gpu: 32
215
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
216
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
217
+ disable_log_stats: true
218
+ do_sample: true
219
+ 'n': 3
220
+ over_sample_rate: 0
221
+ multi_stage_wake_up: false
222
+ engine_kwargs:
223
+ vllm: {}
224
+ sglang: {}
225
+ trtllm: {}
226
+ val_kwargs:
227
+ _target_: verl.workers.config.SamplingConfig
228
+ top_k: -1
229
+ top_p: 1.0
230
+ temperature: 0
231
+ 'n': 1
232
+ do_sample: false
233
+ multi_turn:
234
+ _target_: verl.workers.config.MultiTurnConfig
235
+ enable: false
236
+ max_assistant_turns: null
237
+ tool_config_path: null
238
+ max_user_turns: null
239
+ max_parallel_calls: 1
240
+ max_tool_response_length: 256
241
+ tool_response_truncate_side: middle
242
+ interaction_config_path: null
243
+ use_inference_chat_template: false
244
+ tokenization_sanity_check_mode: strict
245
+ format: hermes
246
+ num_repeat_rollouts: null
247
+ calculate_log_probs: false
248
+ agent:
249
+ _target_: verl.workers.config.AgentLoopConfig
250
+ num_workers: 8
251
+ default_agent_loop: single_turn_agent
252
+ agent_loop_config_path: null
253
+ custom_async_server:
254
+ _target_: verl.workers.config.CustomAsyncServerConfig
255
+ path: null
256
+ name: null
257
+ checkpoint_engine:
258
+ _target_: verl.workers.config.CheckpointEngineConfig
259
+ backend: naive
260
+ update_weights_bucket_megabytes: 2048
261
+ engine_kwargs: {}
262
+ trace:
263
+ _target_: verl.workers.config.TraceConfig
264
+ backend: null
265
+ token2text: false
266
+ max_samples_per_step_per_worker: null
267
+ skip_rollout: false
268
+ skip_dump_dir: /tmp/rollout_dump
269
+ skip_tokenizer_init: true
270
+ enable_rollout_routing_replay: false
271
+ profiler:
272
+ _target_: verl.utils.profiler.ProfilerConfig
273
+ tool: ${oc.select:global_profiler.tool,null}
274
+ enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
275
+ all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
276
+ ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
277
+ save_path: ${oc.select:global_profiler.save_path,null}
278
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
279
+ prometheus:
280
+ _target_: verl.workers.config.PrometheusConfig
281
+ enable: false
282
+ port: 9090
283
+ file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml
284
+ served_model_name: ${oc.select:actor_rollout_ref.model.path,null}
285
+ quantization: null
286
+ quantization_config_file: null
287
+ mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
288
+ layered_summon: false
289
+ model:
290
+ _target_: verl.workers.config.HFModelConfig
291
+ path: Qwen/Qwen3-4B-Instruct-2507
292
+ hf_config_path: null
293
+ tokenizer_path: null
294
+ use_shm: false
295
+ trust_remote_code: false
296
+ custom_chat_template: null
297
+ external_lib: null
298
+ override_config: {}
299
+ enable_gradient_checkpointing: true
300
+ enable_activation_offload: false
301
+ use_remove_padding: true
302
+ lora_rank: 0
303
+ lora_alpha: 16
304
+ target_modules: all-linear
305
+ exclude_modules: null
306
+ lora_adapter_path: null
307
+ use_liger: false
308
+ use_fused_kernels: false
309
+ fused_kernel_options:
310
+ impl_backend: torch
311
+ tiled_mlp:
312
+ enabled: false
313
+ num_shards: 4
314
+ mtp:
315
+ _target_: verl.workers.config.MtpConfig
316
+ enable: false
317
+ enable_train: false
318
+ enable_rollout: false
319
+ detach_encoder: false
320
+ mtp_loss_scaling_factor: 0.1
321
+ speculative_algorithm: EAGLE
322
+ speculative_num_steps: 3
323
+ speculative_eagle_topk: 1
324
+ speculative_num_draft_tokens: 4
325
+ method: mtp
326
+ num_speculative_tokens: 1
327
+ hybrid_engine: true
328
+ nccl_timeout: 600
329
+ data:
330
+ tokenizer: null
331
+ use_shm: false
332
+ train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
333
+ val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
334
+ train_max_samples: -1
335
+ val_max_samples: -1
336
+ prompt_key: prompt
337
+ reward_fn_key: data_source
338
+ max_prompt_length: 1024
339
+ max_response_length: 2048
340
+ train_batch_size: 512
341
+ val_batch_size: null
342
+ tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path,
343
+ null}
344
+ return_raw_input_ids: false
345
+ return_raw_chat: true
346
+ return_full_prompt: false
347
+ shuffle: true
348
+ seed: null
349
+ dataloader_num_workers: 8
350
+ image_patch_size: 14
351
+ validation_shuffle: false
352
+ filter_overlong_prompts: true
353
+ filter_overlong_prompts_workers: 1
354
+ truncation: error
355
+ image_key: images
356
+ video_key: videos
357
+ trust_remote_code: false
358
+ custom_cls:
359
+ path: null
360
+ name: null
361
+ return_multi_modal_inputs: true
362
+ sampler:
363
+ class_path: null
364
+ class_name: null
365
+ datagen:
366
+ path: null
367
+ name: null
368
+ apply_chat_template_kwargs: {}
369
+ reward_manager:
370
+ _target_: verl.trainer.config.config.RewardManagerConfig
371
+ source: register
372
+ name: ${oc.select:reward_model.reward_manager,naive}
373
+ module:
374
+ _target_: verl.trainer.config.config.ModuleConfig
375
+ path: null
376
+ name: custom_reward_manager
377
+ critic:
378
+ optim:
379
+ _target_: verl.workers.config.FSDPOptimizerConfig
380
+ optimizer: AdamW
381
+ optimizer_impl: torch.optim
382
+ lr: 1.0e-05
383
+ lr_warmup_steps_ratio: 0.0
384
+ total_training_steps: -1
385
+ weight_decay: 0.01
386
+ lr_warmup_steps: -1
387
+ betas:
388
+ - 0.9
389
+ - 0.999
390
+ clip_grad: 1.0
391
+ min_lr_ratio: 0.0
392
+ num_cycles: 0.5
393
+ lr_scheduler_type: constant
394
+ warmup_style: null
395
+ override_optimizer_config: null
396
+ model:
397
+ fsdp_config:
398
+ _target_: verl.workers.config.FSDPEngineConfig
399
+ wrap_policy:
400
+ min_num_params: 0
401
+ param_offload: false
402
+ optimizer_offload: false
403
+ offload_policy: false
404
+ reshard_after_forward: true
405
+ fsdp_size: -1
406
+ forward_prefetch: false
407
+ model_dtype: fp32
408
+ use_orig_params: false
409
+ seed: 42
410
+ full_determinism: false
411
+ ulysses_sequence_parallel_size: 1
412
+ entropy_from_logits_with_chunking: false
413
+ use_torch_compile: true
414
+ entropy_checkpointing: false
415
+ forward_only: false
416
+ strategy: fsdp
417
+ dtype: bfloat16
418
+ path: ~/models/deepseek-llm-7b-chat
419
+ tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
420
+ override_config: {}
421
+ external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
422
+ trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
423
+ _target_: verl.workers.config.FSDPCriticModelCfg
424
+ use_shm: false
425
+ enable_gradient_checkpointing: true
426
+ enable_activation_offload: false
427
+ use_remove_padding: false
428
+ lora_rank: 0
429
+ lora_alpha: 16
430
+ target_modules: all-linear
431
+ tiled_mlp:
432
+ enabled: false
433
+ num_shards: 4
434
+ _target_: verl.workers.config.FSDPCriticConfig
435
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
436
+ strategy: fsdp
437
+ enable: null
438
+ ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
439
+ ppo_micro_batch_size: null
440
+ ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
441
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
442
+ ppo_max_token_len_per_gpu: 32768
443
+ forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
444
+ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
445
+ shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
446
+ data_loader_seed: 42
447
+ cliprange_value: 0.5
448
+ loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
449
+ checkpoint:
450
+ _target_: verl.trainer.config.CheckpointConfig
451
+ save_contents:
452
+ - model
453
+ - optimizer
454
+ - extra
455
+ load_contents: ${.save_contents}
456
+ async_save: false
457
+ profiler:
458
+ _target_: verl.utils.profiler.ProfilerConfig
459
+ tool: ${oc.select:global_profiler.tool,null}
460
+ enable: false
461
+ all_ranks: false
462
+ ranks: []
463
+ save_path: ${oc.select:global_profiler.save_path,null}
464
+ tool_config:
465
+ nsys:
466
+ _target_: verl.utils.profiler.config.NsightToolConfig
467
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
468
+ npu:
469
+ _target_: verl.utils.profiler.config.NPUToolConfig
470
+ contents: []
471
+ level: level0
472
+ analysis: true
473
+ discrete: false
474
+ torch:
475
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
476
+ contents: []
477
+ discrete: false
478
+ torch_memory:
479
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
480
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
481
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
482
+ forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
483
+ forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
484
+ ulysses_sequence_parallel_size: 1
485
+ grad_clip: 1.0
486
+ reward_model:
487
+ enable: false
488
+ enable_resource_pool: false
489
+ n_gpus_per_node: 8
490
+ nnodes: 0
491
+ strategy: fsdp
492
+ model:
493
+ input_tokenizer: ${actor_rollout_ref.model.path}
494
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
495
+ external_lib: ${actor_rollout_ref.model.external_lib}
496
+ trust_remote_code: false
497
+ override_config: {}
498
+ use_shm: false
499
+ use_remove_padding: false
500
+ use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
501
+ fsdp_config:
502
+ _target_: verl.workers.config.FSDPEngineConfig
503
+ wrap_policy:
504
+ min_num_params: 0
505
+ param_offload: false
506
+ reshard_after_forward: true
507
+ fsdp_size: -1
508
+ forward_prefetch: false
509
+ micro_batch_size: null
510
+ micro_batch_size_per_gpu: null
511
+ max_length: null
512
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
513
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
514
+ reward_manager: naive
515
+ reward_loop_source: register
516
+ reward_loop_module_path: null
517
+ reward_loop_class_name: null
518
+ launch_reward_fn_async: false
519
+ sandbox_fusion:
520
+ url: null
521
+ max_concurrent: 64
522
+ memory_limit_mb: 1024
523
+ profiler:
524
+ _target_: verl.utils.profiler.ProfilerConfig
525
+ tool: ${oc.select:global_profiler.tool,null}
526
+ enable: false
527
+ all_ranks: false
528
+ ranks: []
529
+ save_path: ${oc.select:global_profiler.save_path,null}
530
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
531
+ ulysses_sequence_parallel_size: 1
532
+ use_reward_loop: true
533
+ num_workers: 1
534
+ rollout:
535
+ _target_: verl.workers.config.RolloutConfig
536
+ name: ???
537
+ dtype: bfloat16
538
+ gpu_memory_utilization: 0.5
539
+ enforce_eager: true
540
+ cudagraph_capture_sizes: null
541
+ free_cache_engine: true
542
+ data_parallel_size: 1
543
+ expert_parallel_size: 1
544
+ tensor_model_parallel_size: 2
545
+ max_num_batched_tokens: 8192
546
+ max_model_len: null
547
+ max_num_seqs: 1024
548
+ load_format: auto
549
+ engine_kwargs: {}
550
+ limit_images: null
551
+ enable_chunked_prefill: true
552
+ enable_prefix_caching: true
553
+ disable_log_stats: true
554
+ skip_tokenizer_init: false
555
+ prompt_length: 2048
556
+ response_length: 2048
557
+ algorithm:
558
+ rollout_correction:
559
+ rollout_is: null
560
+ rollout_is_threshold: 2.0
561
+ rollout_rs: null
562
+ rollout_rs_threshold: null
563
+ bypass_mode: false
564
+ loss_type: ppo_clip
565
+ rollout_is_batch_normalize: false
566
+ _target_: verl.trainer.config.AlgoConfig
567
+ gamma: 1.0
568
+ lam: 1.0
569
+ adv_estimator: grpo
570
+ norm_adv_by_std_in_grpo: true
571
+ use_kl_in_reward: false
572
+ kl_penalty: kl
573
+ kl_ctrl:
574
+ _target_: verl.trainer.config.KLControlConfig
575
+ type: fixed
576
+ kl_coef: 0.001
577
+ horizon: 10000
578
+ target_kl: 0.1
579
+ use_pf_ppo: false
580
+ pf_ppo:
581
+ reweight_method: pow
582
+ weight_pow: 2.0
583
+ custom_reward_function:
584
+ path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
585
+ name: compute_score
586
+ trainer:
587
+ balance_batch: true
588
+ total_epochs: 15
589
+ total_training_steps: null
590
+ project_name: readctrl-verl
591
+ experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs
592
+ logger:
593
+ - console
594
+ - wandb
595
+ log_val_generations: 0
596
+ rollout_data_dir: null
597
+ validation_data_dir: null
598
+ nnodes: 1
599
+ n_gpus_per_node: 2
600
+ save_freq: 20
601
+ esi_redundant_time: 0
602
+ resume_mode: auto
603
+ resume_from_path: null
604
+ val_before_train: true
605
+ val_only: false
606
+ test_freq: 5
607
+ critic_warmup: 0
608
+ default_hdfs_dir: null
609
+ del_local_ckpt_after_load: false
610
+ default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2
611
+ max_actor_ckpt_to_keep: 1
612
+ max_critic_ckpt_to_keep: 1
613
+ ray_wait_register_center_timeout: 300
614
+ device: cuda
615
+ use_legacy_worker_impl: auto
616
+ remove_previous_ckpt_in_save: true
617
+ global_profiler:
618
+ _target_: verl.utils.profiler.ProfilerConfig
619
+ tool: null
620
+ steps: null
621
+ profile_continuous_steps: false
622
+ save_path: outputs/profile
623
+ global_tool_config:
624
+ nsys:
625
+ _target_: verl.utils.profiler.config.NsightToolConfig
626
+ discrete: false
627
+ controller_nsight_options:
628
+ trace: cuda,nvtx,cublas,ucx
629
+ cuda-memory-usage: 'true'
630
+ cuda-graph-trace: graph
631
+ worker_nsight_options:
632
+ trace: cuda,nvtx,cublas,ucx
633
+ cuda-memory-usage: 'true'
634
+ cuda-graph-trace: graph
635
+ capture-range: cudaProfilerApi
636
+ capture-range-end: null
637
+ kill: none
638
+ torch_memory:
639
+ trace_alloc_max_entries: 100000
640
+ stack_depth: 32
641
+ context: all
642
+ stacks: all
643
+ kw_args: {}
644
+ transfer_queue:
645
+ enable: false
646
+ ray_kwargs:
647
+ ray_init:
648
+ num_cpus: null
649
+ timeline_json_file: null
code/RL_model/verl/verl_train/outputs/2026-02-06/20-52-38/.hydra/overrides.yaml ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - algorithm.adv_estimator=grpo
2
+ - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
3
+ - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
4
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
5
+ - data.train_batch_size=512
6
+ - data.max_prompt_length=1024
7
+ - data.max_response_length=2048
8
+ - data.filter_overlong_prompts=True
9
+ - data.truncation=error
10
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
11
+ - actor_rollout_ref.actor.optim.lr=1e-6
12
+ - actor_rollout_ref.model.use_remove_padding=True
13
+ - actor_rollout_ref.actor.ppo_mini_batch_size=256
14
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32
15
+ - actor_rollout_ref.actor.use_kl_loss=True
16
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
17
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
18
+ - actor_rollout_ref.actor.entropy_coeff=0
19
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
20
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
21
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
22
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
23
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
24
+ - actor_rollout_ref.rollout.name=vllm
25
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.6
26
+ - actor_rollout_ref.rollout.max_model_len=8192
27
+ - actor_rollout_ref.rollout.n=3
28
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
29
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
30
+ - algorithm.use_kl_in_reward=False
31
+ - trainer.critic_warmup=0
32
+ - trainer.logger=["console","wandb"]
33
+ - trainer.project_name=readctrl-verl
34
+ - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
35
+ - trainer.n_gpus_per_node=2
36
+ - trainer.nnodes=1
37
+ - trainer.save_freq=20
38
+ - trainer.test_freq=5
39
+ - +trainer.remove_previous_ckpt_in_save=true
40
+ - trainer.max_actor_ckpt_to_keep=1
41
+ - trainer.max_critic_ckpt_to_keep=1
42
+ - trainer.resume_mode=auto
43
+ - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2
44
+ - trainer.total_epochs=15
code/RL_model/verl/verl_train/outputs/2026-02-07/10-27-48/.hydra/config.yaml ADDED
@@ -0,0 +1,649 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ actor_rollout_ref:
2
+ actor:
3
+ optim:
4
+ _target_: verl.workers.config.FSDPOptimizerConfig
5
+ optimizer: AdamW
6
+ optimizer_impl: torch.optim
7
+ lr: 1.0e-06
8
+ lr_warmup_steps_ratio: 0.0
9
+ total_training_steps: -1
10
+ weight_decay: 0.01
11
+ lr_warmup_steps: -1
12
+ betas:
13
+ - 0.9
14
+ - 0.999
15
+ clip_grad: 1.0
16
+ min_lr_ratio: 0.0
17
+ num_cycles: 0.5
18
+ lr_scheduler_type: constant
19
+ warmup_style: null
20
+ override_optimizer_config: null
21
+ fsdp_config:
22
+ _target_: verl.workers.config.FSDPEngineConfig
23
+ wrap_policy:
24
+ min_num_params: 0
25
+ param_offload: false
26
+ optimizer_offload: false
27
+ offload_policy: false
28
+ reshard_after_forward: true
29
+ fsdp_size: -1
30
+ forward_prefetch: false
31
+ model_dtype: fp32
32
+ use_orig_params: false
33
+ seed: 42
34
+ full_determinism: false
35
+ ulysses_sequence_parallel_size: 1
36
+ entropy_from_logits_with_chunking: false
37
+ use_torch_compile: true
38
+ entropy_checkpointing: false
39
+ forward_only: false
40
+ strategy: fsdp
41
+ dtype: bfloat16
42
+ _target_: verl.workers.config.FSDPActorConfig
43
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
44
+ strategy: fsdp
45
+ ppo_mini_batch_size: 256
46
+ ppo_micro_batch_size: null
47
+ ppo_micro_batch_size_per_gpu: 32
48
+ use_dynamic_bsz: false
49
+ ppo_max_token_len_per_gpu: 16384
50
+ clip_ratio: 0.2
51
+ clip_ratio_low: 0.2
52
+ clip_ratio_high: 0.2
53
+ tau_pos: 1.0
54
+ tau_neg: 1.05
55
+ freeze_vision_tower: false
56
+ policy_loss:
57
+ _target_: verl.workers.config.PolicyLossConfig
58
+ loss_mode: vanilla
59
+ clip_cov_ratio: 0.0002
60
+ clip_cov_lb: 1.0
61
+ clip_cov_ub: 5.0
62
+ kl_cov_ratio: 0.0002
63
+ ppo_kl_coef: 0.1
64
+ clip_ratio_c: 3.0
65
+ loss_agg_mode: token-mean
66
+ loss_scale_factor: null
67
+ entropy_coeff: 0
68
+ calculate_entropy: false
69
+ use_kl_loss: true
70
+ use_prefix_grouper: false
71
+ use_torch_compile: true
72
+ kl_loss_coef: 0.001
73
+ kl_loss_type: low_var_kl
74
+ ppo_epochs: 1
75
+ shuffle: false
76
+ data_loader_seed: 42
77
+ checkpoint:
78
+ _target_: verl.trainer.config.CheckpointConfig
79
+ save_contents:
80
+ - model
81
+ - optimizer
82
+ - extra
83
+ load_contents: ${.save_contents}
84
+ async_save: false
85
+ use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
86
+ profiler:
87
+ _target_: verl.utils.profiler.ProfilerConfig
88
+ tool: ${oc.select:global_profiler.tool,null}
89
+ enable: false
90
+ all_ranks: false
91
+ ranks: []
92
+ save_path: ${oc.select:global_profiler.save_path,null}
93
+ tool_config:
94
+ nsys:
95
+ _target_: verl.utils.profiler.config.NsightToolConfig
96
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
97
+ npu:
98
+ _target_: verl.utils.profiler.config.NPUToolConfig
99
+ contents: []
100
+ level: level0
101
+ analysis: true
102
+ discrete: false
103
+ torch:
104
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
105
+ contents: []
106
+ discrete: false
107
+ torch_memory:
108
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
109
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
110
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
111
+ router_replay:
112
+ _target_: verl.workers.config.RouterReplayConfig
113
+ mode: disabled
114
+ record_file: null
115
+ replay_file: null
116
+ grad_clip: 1.0
117
+ ulysses_sequence_parallel_size: 1
118
+ entropy_from_logits_with_chunking: false
119
+ entropy_checkpointing: false
120
+ use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
121
+ calculate_sum_pi_squared: false
122
+ sum_pi_squared_checkpointing: false
123
+ ref:
124
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
125
+ strategy: ${actor_rollout_ref.actor.strategy}
126
+ use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
127
+ log_prob_micro_batch_size: null
128
+ log_prob_micro_batch_size_per_gpu: 32
129
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
130
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
131
+ profiler:
132
+ _target_: verl.utils.profiler.ProfilerConfig
133
+ tool: ${oc.select:global_profiler.tool,null}
134
+ enable: false
135
+ all_ranks: false
136
+ ranks: []
137
+ save_path: ${oc.select:global_profiler.save_path,null}
138
+ tool_config:
139
+ nsys:
140
+ _target_: verl.utils.profiler.config.NsightToolConfig
141
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
142
+ npu:
143
+ _target_: verl.utils.profiler.config.NPUToolConfig
144
+ contents: []
145
+ level: level0
146
+ analysis: true
147
+ discrete: false
148
+ torch:
149
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
150
+ contents: []
151
+ discrete: false
152
+ torch_memory:
153
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
154
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
155
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
156
+ router_replay:
157
+ _target_: verl.workers.config.RouterReplayConfig
158
+ mode: disabled
159
+ record_file: null
160
+ replay_file: null
161
+ fsdp_config:
162
+ _target_: verl.workers.config.FSDPEngineConfig
163
+ wrap_policy:
164
+ min_num_params: 0
165
+ param_offload: false
166
+ optimizer_offload: false
167
+ offload_policy: false
168
+ reshard_after_forward: true
169
+ fsdp_size: -1
170
+ forward_prefetch: false
171
+ model_dtype: fp32
172
+ use_orig_params: false
173
+ seed: 42
174
+ full_determinism: false
175
+ ulysses_sequence_parallel_size: 1
176
+ entropy_from_logits_with_chunking: false
177
+ use_torch_compile: true
178
+ entropy_checkpointing: false
179
+ forward_only: true
180
+ strategy: fsdp
181
+ dtype: bfloat16
182
+ _target_: verl.workers.config.FSDPActorConfig
183
+ ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
184
+ entropy_from_logits_with_chunking: false
185
+ entropy_checkpointing: false
186
+ rollout:
187
+ _target_: verl.workers.config.RolloutConfig
188
+ name: vllm
189
+ mode: async
190
+ temperature: 1.0
191
+ top_k: -1
192
+ top_p: 1
193
+ prompt_length: ${oc.select:data.max_prompt_length,512}
194
+ response_length: ${oc.select:data.max_response_length,512}
195
+ dtype: bfloat16
196
+ gpu_memory_utilization: 0.6
197
+ ignore_eos: false
198
+ enforce_eager: false
199
+ cudagraph_capture_sizes: null
200
+ free_cache_engine: true
201
+ tensor_model_parallel_size: 1
202
+ data_parallel_size: 1
203
+ expert_parallel_size: 1
204
+ pipeline_model_parallel_size: 1
205
+ max_num_batched_tokens: 8192
206
+ max_model_len: 8192
207
+ max_num_seqs: 1024
208
+ enable_chunked_prefill: true
209
+ enable_prefix_caching: true
210
+ logprobs_mode: processed_logprobs
211
+ scheduling_policy: fcfs
212
+ load_format: dummy
213
+ log_prob_micro_batch_size: null
214
+ log_prob_micro_batch_size_per_gpu: 32
215
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
216
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
217
+ disable_log_stats: true
218
+ do_sample: true
219
+ 'n': 3
220
+ over_sample_rate: 0
221
+ multi_stage_wake_up: false
222
+ engine_kwargs:
223
+ vllm: {}
224
+ sglang: {}
225
+ trtllm: {}
226
+ val_kwargs:
227
+ _target_: verl.workers.config.SamplingConfig
228
+ top_k: -1
229
+ top_p: 1.0
230
+ temperature: 0
231
+ 'n': 1
232
+ do_sample: false
233
+ multi_turn:
234
+ _target_: verl.workers.config.MultiTurnConfig
235
+ enable: false
236
+ max_assistant_turns: null
237
+ tool_config_path: null
238
+ max_user_turns: null
239
+ max_parallel_calls: 1
240
+ max_tool_response_length: 256
241
+ tool_response_truncate_side: middle
242
+ interaction_config_path: null
243
+ use_inference_chat_template: false
244
+ tokenization_sanity_check_mode: strict
245
+ format: hermes
246
+ num_repeat_rollouts: null
247
+ calculate_log_probs: false
248
+ agent:
249
+ _target_: verl.workers.config.AgentLoopConfig
250
+ num_workers: 8
251
+ default_agent_loop: single_turn_agent
252
+ agent_loop_config_path: null
253
+ custom_async_server:
254
+ _target_: verl.workers.config.CustomAsyncServerConfig
255
+ path: null
256
+ name: null
257
+ checkpoint_engine:
258
+ _target_: verl.workers.config.CheckpointEngineConfig
259
+ backend: naive
260
+ update_weights_bucket_megabytes: 2048
261
+ engine_kwargs: {}
262
+ trace:
263
+ _target_: verl.workers.config.TraceConfig
264
+ backend: null
265
+ token2text: false
266
+ max_samples_per_step_per_worker: null
267
+ skip_rollout: false
268
+ skip_dump_dir: /tmp/rollout_dump
269
+ skip_tokenizer_init: true
270
+ enable_rollout_routing_replay: false
271
+ profiler:
272
+ _target_: verl.utils.profiler.ProfilerConfig
273
+ tool: ${oc.select:global_profiler.tool,null}
274
+ enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
275
+ all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
276
+ ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
277
+ save_path: ${oc.select:global_profiler.save_path,null}
278
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
279
+ prometheus:
280
+ _target_: verl.workers.config.PrometheusConfig
281
+ enable: false
282
+ port: 9090
283
+ file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml
284
+ served_model_name: ${oc.select:actor_rollout_ref.model.path,null}
285
+ quantization: null
286
+ quantization_config_file: null
287
+ mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
288
+ layered_summon: false
289
+ model:
290
+ _target_: verl.workers.config.HFModelConfig
291
+ path: Qwen/Qwen3-4B-Instruct-2507
292
+ hf_config_path: null
293
+ tokenizer_path: null
294
+ use_shm: false
295
+ trust_remote_code: false
296
+ custom_chat_template: null
297
+ external_lib: null
298
+ override_config: {}
299
+ enable_gradient_checkpointing: true
300
+ enable_activation_offload: false
301
+ use_remove_padding: true
302
+ lora_rank: 0
303
+ lora_alpha: 16
304
+ target_modules: all-linear
305
+ exclude_modules: null
306
+ lora_adapter_path: null
307
+ use_liger: false
308
+ use_fused_kernels: false
309
+ fused_kernel_options:
310
+ impl_backend: torch
311
+ tiled_mlp:
312
+ enabled: false
313
+ num_shards: 4
314
+ mtp:
315
+ _target_: verl.workers.config.MtpConfig
316
+ enable: false
317
+ enable_train: false
318
+ enable_rollout: false
319
+ detach_encoder: false
320
+ mtp_loss_scaling_factor: 0.1
321
+ speculative_algorithm: EAGLE
322
+ speculative_num_steps: 3
323
+ speculative_eagle_topk: 1
324
+ speculative_num_draft_tokens: 4
325
+ method: mtp
326
+ num_speculative_tokens: 1
327
+ hybrid_engine: true
328
+ nccl_timeout: 600
329
+ data:
330
+ tokenizer: null
331
+ use_shm: false
332
+ train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
333
+ val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
334
+ train_max_samples: -1
335
+ val_max_samples: -1
336
+ prompt_key: prompt
337
+ reward_fn_key: data_source
338
+ max_prompt_length: 1024
339
+ max_response_length: 2048
340
+ train_batch_size: 512
341
+ val_batch_size: null
342
+ tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path,
343
+ null}
344
+ return_raw_input_ids: false
345
+ return_raw_chat: true
346
+ return_full_prompt: false
347
+ shuffle: true
348
+ seed: null
349
+ dataloader_num_workers: 8
350
+ image_patch_size: 14
351
+ validation_shuffle: false
352
+ filter_overlong_prompts: true
353
+ filter_overlong_prompts_workers: 1
354
+ truncation: error
355
+ image_key: images
356
+ video_key: videos
357
+ trust_remote_code: false
358
+ custom_cls:
359
+ path: null
360
+ name: null
361
+ return_multi_modal_inputs: true
362
+ sampler:
363
+ class_path: null
364
+ class_name: null
365
+ datagen:
366
+ path: null
367
+ name: null
368
+ apply_chat_template_kwargs: {}
369
+ reward_manager:
370
+ _target_: verl.trainer.config.config.RewardManagerConfig
371
+ source: register
372
+ name: ${oc.select:reward_model.reward_manager,naive}
373
+ module:
374
+ _target_: verl.trainer.config.config.ModuleConfig
375
+ path: null
376
+ name: custom_reward_manager
377
+ critic:
378
+ optim:
379
+ _target_: verl.workers.config.FSDPOptimizerConfig
380
+ optimizer: AdamW
381
+ optimizer_impl: torch.optim
382
+ lr: 1.0e-05
383
+ lr_warmup_steps_ratio: 0.0
384
+ total_training_steps: -1
385
+ weight_decay: 0.01
386
+ lr_warmup_steps: -1
387
+ betas:
388
+ - 0.9
389
+ - 0.999
390
+ clip_grad: 1.0
391
+ min_lr_ratio: 0.0
392
+ num_cycles: 0.5
393
+ lr_scheduler_type: constant
394
+ warmup_style: null
395
+ override_optimizer_config: null
396
+ model:
397
+ fsdp_config:
398
+ _target_: verl.workers.config.FSDPEngineConfig
399
+ wrap_policy:
400
+ min_num_params: 0
401
+ param_offload: false
402
+ optimizer_offload: false
403
+ offload_policy: false
404
+ reshard_after_forward: true
405
+ fsdp_size: -1
406
+ forward_prefetch: false
407
+ model_dtype: fp32
408
+ use_orig_params: false
409
+ seed: 42
410
+ full_determinism: false
411
+ ulysses_sequence_parallel_size: 1
412
+ entropy_from_logits_with_chunking: false
413
+ use_torch_compile: true
414
+ entropy_checkpointing: false
415
+ forward_only: false
416
+ strategy: fsdp
417
+ dtype: bfloat16
418
+ path: ~/models/deepseek-llm-7b-chat
419
+ tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
420
+ override_config: {}
421
+ external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
422
+ trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
423
+ _target_: verl.workers.config.FSDPCriticModelCfg
424
+ use_shm: false
425
+ enable_gradient_checkpointing: true
426
+ enable_activation_offload: false
427
+ use_remove_padding: false
428
+ lora_rank: 0
429
+ lora_alpha: 16
430
+ target_modules: all-linear
431
+ tiled_mlp:
432
+ enabled: false
433
+ num_shards: 4
434
+ _target_: verl.workers.config.FSDPCriticConfig
435
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
436
+ strategy: fsdp
437
+ enable: null
438
+ ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
439
+ ppo_micro_batch_size: null
440
+ ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
441
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
442
+ ppo_max_token_len_per_gpu: 32768
443
+ forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
444
+ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
445
+ shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
446
+ data_loader_seed: 42
447
+ cliprange_value: 0.5
448
+ loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
449
+ checkpoint:
450
+ _target_: verl.trainer.config.CheckpointConfig
451
+ save_contents:
452
+ - model
453
+ - optimizer
454
+ - extra
455
+ load_contents: ${.save_contents}
456
+ async_save: false
457
+ profiler:
458
+ _target_: verl.utils.profiler.ProfilerConfig
459
+ tool: ${oc.select:global_profiler.tool,null}
460
+ enable: false
461
+ all_ranks: false
462
+ ranks: []
463
+ save_path: ${oc.select:global_profiler.save_path,null}
464
+ tool_config:
465
+ nsys:
466
+ _target_: verl.utils.profiler.config.NsightToolConfig
467
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
468
+ npu:
469
+ _target_: verl.utils.profiler.config.NPUToolConfig
470
+ contents: []
471
+ level: level0
472
+ analysis: true
473
+ discrete: false
474
+ torch:
475
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
476
+ contents: []
477
+ discrete: false
478
+ torch_memory:
479
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
480
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
481
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
482
+ forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
483
+ forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
484
+ ulysses_sequence_parallel_size: 1
485
+ grad_clip: 1.0
486
+ reward_model:
487
+ enable: false
488
+ enable_resource_pool: false
489
+ n_gpus_per_node: 8
490
+ nnodes: 0
491
+ strategy: fsdp
492
+ model:
493
+ input_tokenizer: ${actor_rollout_ref.model.path}
494
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
495
+ external_lib: ${actor_rollout_ref.model.external_lib}
496
+ trust_remote_code: false
497
+ override_config: {}
498
+ use_shm: false
499
+ use_remove_padding: false
500
+ use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
501
+ fsdp_config:
502
+ _target_: verl.workers.config.FSDPEngineConfig
503
+ wrap_policy:
504
+ min_num_params: 0
505
+ param_offload: false
506
+ reshard_after_forward: true
507
+ fsdp_size: -1
508
+ forward_prefetch: false
509
+ micro_batch_size: null
510
+ micro_batch_size_per_gpu: null
511
+ max_length: null
512
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
513
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
514
+ reward_manager: naive
515
+ reward_loop_source: register
516
+ reward_loop_module_path: null
517
+ reward_loop_class_name: null
518
+ launch_reward_fn_async: false
519
+ sandbox_fusion:
520
+ url: null
521
+ max_concurrent: 64
522
+ memory_limit_mb: 1024
523
+ profiler:
524
+ _target_: verl.utils.profiler.ProfilerConfig
525
+ tool: ${oc.select:global_profiler.tool,null}
526
+ enable: false
527
+ all_ranks: false
528
+ ranks: []
529
+ save_path: ${oc.select:global_profiler.save_path,null}
530
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
531
+ ulysses_sequence_parallel_size: 1
532
+ use_reward_loop: true
533
+ num_workers: 1
534
+ rollout:
535
+ _target_: verl.workers.config.RolloutConfig
536
+ name: ???
537
+ dtype: bfloat16
538
+ gpu_memory_utilization: 0.5
539
+ enforce_eager: true
540
+ cudagraph_capture_sizes: null
541
+ free_cache_engine: true
542
+ data_parallel_size: 1
543
+ expert_parallel_size: 1
544
+ tensor_model_parallel_size: 2
545
+ max_num_batched_tokens: 8192
546
+ max_model_len: null
547
+ max_num_seqs: 1024
548
+ load_format: auto
549
+ engine_kwargs: {}
550
+ limit_images: null
551
+ enable_chunked_prefill: true
552
+ enable_prefix_caching: true
553
+ disable_log_stats: true
554
+ skip_tokenizer_init: false
555
+ prompt_length: 2048
556
+ response_length: 2048
557
+ algorithm:
558
+ rollout_correction:
559
+ rollout_is: null
560
+ rollout_is_threshold: 2.0
561
+ rollout_rs: null
562
+ rollout_rs_threshold: null
563
+ bypass_mode: false
564
+ loss_type: ppo_clip
565
+ rollout_is_batch_normalize: false
566
+ _target_: verl.trainer.config.AlgoConfig
567
+ gamma: 1.0
568
+ lam: 1.0
569
+ adv_estimator: grpo
570
+ norm_adv_by_std_in_grpo: true
571
+ use_kl_in_reward: false
572
+ kl_penalty: kl
573
+ kl_ctrl:
574
+ _target_: verl.trainer.config.KLControlConfig
575
+ type: fixed
576
+ kl_coef: 0.001
577
+ horizon: 10000
578
+ target_kl: 0.1
579
+ use_pf_ppo: false
580
+ pf_ppo:
581
+ reweight_method: pow
582
+ weight_pow: 2.0
583
+ custom_reward_function:
584
+ path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
585
+ name: compute_score
586
+ trainer:
587
+ balance_batch: true
588
+ total_epochs: 15
589
+ total_training_steps: null
590
+ project_name: readctrl-verl
591
+ experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs
592
+ logger:
593
+ - console
594
+ - wandb
595
+ log_val_generations: 0
596
+ rollout_data_dir: null
597
+ validation_data_dir: null
598
+ nnodes: 1
599
+ n_gpus_per_node: 2
600
+ save_freq: 20
601
+ esi_redundant_time: 0
602
+ resume_mode: auto
603
+ resume_from_path: null
604
+ val_before_train: true
605
+ val_only: false
606
+ test_freq: 5
607
+ critic_warmup: 0
608
+ default_hdfs_dir: null
609
+ del_local_ckpt_after_load: false
610
+ default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2
611
+ max_actor_ckpt_to_keep: 1
612
+ max_critic_ckpt_to_keep: 1
613
+ ray_wait_register_center_timeout: 300
614
+ device: cuda
615
+ use_legacy_worker_impl: auto
616
+ remove_previous_ckpt_in_save: true
617
+ global_profiler:
618
+ _target_: verl.utils.profiler.ProfilerConfig
619
+ tool: null
620
+ steps: null
621
+ profile_continuous_steps: false
622
+ save_path: outputs/profile
623
+ global_tool_config:
624
+ nsys:
625
+ _target_: verl.utils.profiler.config.NsightToolConfig
626
+ discrete: false
627
+ controller_nsight_options:
628
+ trace: cuda,nvtx,cublas,ucx
629
+ cuda-memory-usage: 'true'
630
+ cuda-graph-trace: graph
631
+ worker_nsight_options:
632
+ trace: cuda,nvtx,cublas,ucx
633
+ cuda-memory-usage: 'true'
634
+ cuda-graph-trace: graph
635
+ capture-range: cudaProfilerApi
636
+ capture-range-end: null
637
+ kill: none
638
+ torch_memory:
639
+ trace_alloc_max_entries: 100000
640
+ stack_depth: 32
641
+ context: all
642
+ stacks: all
643
+ kw_args: {}
644
+ transfer_queue:
645
+ enable: false
646
+ ray_kwargs:
647
+ ray_init:
648
+ num_cpus: null
649
+ timeline_json_file: null
code/RL_model/verl/verl_train/outputs/2026-02-07/10-27-48/.hydra/hydra.yaml ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task:
115
+ - algorithm.adv_estimator=grpo
116
+ - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
117
+ - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
118
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
119
+ - data.train_batch_size=512
120
+ - data.max_prompt_length=1024
121
+ - data.max_response_length=2048
122
+ - data.filter_overlong_prompts=True
123
+ - data.truncation=error
124
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
125
+ - actor_rollout_ref.actor.optim.lr=1e-6
126
+ - actor_rollout_ref.model.use_remove_padding=True
127
+ - actor_rollout_ref.actor.ppo_mini_batch_size=256
128
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32
129
+ - actor_rollout_ref.actor.use_kl_loss=True
130
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
131
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
132
+ - actor_rollout_ref.actor.entropy_coeff=0
133
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
134
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
135
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
136
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
137
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
138
+ - actor_rollout_ref.rollout.name=vllm
139
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.6
140
+ - actor_rollout_ref.rollout.max_model_len=8192
141
+ - actor_rollout_ref.rollout.n=3
142
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
143
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
144
+ - algorithm.use_kl_in_reward=False
145
+ - trainer.critic_warmup=0
146
+ - trainer.logger=["console","wandb"]
147
+ - trainer.project_name=readctrl-verl
148
+ - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
149
+ - trainer.n_gpus_per_node=2
150
+ - trainer.nnodes=1
151
+ - trainer.save_freq=20
152
+ - trainer.test_freq=5
153
+ - +trainer.remove_previous_ckpt_in_save=true
154
+ - trainer.max_actor_ckpt_to_keep=1
155
+ - trainer.max_critic_ckpt_to_keep=1
156
+ - trainer.resume_mode=auto
157
+ - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2
158
+ - trainer.total_epochs=15
159
+ job:
160
+ name: main_ppo
161
+ chdir: null
162
+ override_dirname: +trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15
163
+ id: ???
164
+ num: ???
165
+ config_name: ppo_trainer
166
+ env_set: {}
167
+ env_copy: []
168
+ config:
169
+ override_dirname:
170
+ kv_sep: '='
171
+ item_sep: ','
172
+ exclude_keys: []
173
+ runtime:
174
+ version: 1.3.2
175
+ version_base: '1.3'
176
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train
177
+ config_sources:
178
+ - path: hydra.conf
179
+ schema: pkg
180
+ provider: hydra
181
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config
182
+ schema: file
183
+ provider: main
184
+ - path: ''
185
+ schema: structured
186
+ provider: schema
187
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-07/10-27-48
188
+ choices:
189
+ algorithm@algorithm.rollout_correction: rollout_correction
190
+ reward_model: dp_reward_loop
191
+ critic: dp_critic
192
+ critic/../engine@critic.model.fsdp_config: fsdp
193
+ critic/../optim@critic.optim: fsdp
194
+ model@actor_rollout_ref.model: hf_model
195
+ rollout@actor_rollout_ref.rollout: rollout
196
+ ref@actor_rollout_ref.ref: dp_ref
197
+ ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp
198
+ data: legacy_data
199
+ actor@actor_rollout_ref.actor: dp_actor
200
+ actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp
201
+ actor/../optim@actor_rollout_ref.actor.optim: fsdp
202
+ hydra/env: default
203
+ hydra/callbacks: null
204
+ hydra/job_logging: default
205
+ hydra/hydra_logging: default
206
+ hydra/hydra_help: default
207
+ hydra/help: default
208
+ hydra/sweeper: basic
209
+ hydra/launcher: basic
210
+ hydra/output: default
211
+ verbose: false
code/RL_model/verl/verl_train/outputs/2026-02-07/10-27-48/.hydra/overrides.yaml ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - algorithm.adv_estimator=grpo
2
+ - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
3
+ - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
4
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
5
+ - data.train_batch_size=512
6
+ - data.max_prompt_length=1024
7
+ - data.max_response_length=2048
8
+ - data.filter_overlong_prompts=True
9
+ - data.truncation=error
10
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
11
+ - actor_rollout_ref.actor.optim.lr=1e-6
12
+ - actor_rollout_ref.model.use_remove_padding=True
13
+ - actor_rollout_ref.actor.ppo_mini_batch_size=256
14
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32
15
+ - actor_rollout_ref.actor.use_kl_loss=True
16
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
17
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
18
+ - actor_rollout_ref.actor.entropy_coeff=0
19
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
20
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
21
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
22
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
23
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
24
+ - actor_rollout_ref.rollout.name=vllm
25
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.6
26
+ - actor_rollout_ref.rollout.max_model_len=8192
27
+ - actor_rollout_ref.rollout.n=3
28
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
29
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
30
+ - algorithm.use_kl_in_reward=False
31
+ - trainer.critic_warmup=0
32
+ - trainer.logger=["console","wandb"]
33
+ - trainer.project_name=readctrl-verl
34
+ - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
35
+ - trainer.n_gpus_per_node=2
36
+ - trainer.nnodes=1
37
+ - trainer.save_freq=20
38
+ - trainer.test_freq=5
39
+ - +trainer.remove_previous_ckpt_in_save=true
40
+ - trainer.max_actor_ckpt_to_keep=1
41
+ - trainer.max_critic_ckpt_to_keep=1
42
+ - trainer.resume_mode=auto
43
+ - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2
44
+ - trainer.total_epochs=15
code/RL_model/verl/verl_train/outputs/2026-02-07/10-27-48/main_ppo.log ADDED
File without changes
code/RL_model/verl/verl_train/outputs/2026-02-07/10-53-43/.hydra/config.yaml ADDED
@@ -0,0 +1,649 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ actor_rollout_ref:
2
+ actor:
3
+ optim:
4
+ _target_: verl.workers.config.FSDPOptimizerConfig
5
+ optimizer: AdamW
6
+ optimizer_impl: torch.optim
7
+ lr: 1.0e-06
8
+ lr_warmup_steps_ratio: 0.0
9
+ total_training_steps: -1
10
+ weight_decay: 0.01
11
+ lr_warmup_steps: -1
12
+ betas:
13
+ - 0.9
14
+ - 0.999
15
+ clip_grad: 1.0
16
+ min_lr_ratio: 0.0
17
+ num_cycles: 0.5
18
+ lr_scheduler_type: constant
19
+ warmup_style: null
20
+ override_optimizer_config: null
21
+ fsdp_config:
22
+ _target_: verl.workers.config.FSDPEngineConfig
23
+ wrap_policy:
24
+ min_num_params: 0
25
+ param_offload: false
26
+ optimizer_offload: false
27
+ offload_policy: false
28
+ reshard_after_forward: true
29
+ fsdp_size: -1
30
+ forward_prefetch: false
31
+ model_dtype: fp32
32
+ use_orig_params: false
33
+ seed: 42
34
+ full_determinism: false
35
+ ulysses_sequence_parallel_size: 1
36
+ entropy_from_logits_with_chunking: false
37
+ use_torch_compile: true
38
+ entropy_checkpointing: false
39
+ forward_only: false
40
+ strategy: fsdp
41
+ dtype: bfloat16
42
+ _target_: verl.workers.config.FSDPActorConfig
43
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
44
+ strategy: fsdp
45
+ ppo_mini_batch_size: 256
46
+ ppo_micro_batch_size: null
47
+ ppo_micro_batch_size_per_gpu: 32
48
+ use_dynamic_bsz: false
49
+ ppo_max_token_len_per_gpu: 16384
50
+ clip_ratio: 0.2
51
+ clip_ratio_low: 0.2
52
+ clip_ratio_high: 0.2
53
+ tau_pos: 1.0
54
+ tau_neg: 1.05
55
+ freeze_vision_tower: false
56
+ policy_loss:
57
+ _target_: verl.workers.config.PolicyLossConfig
58
+ loss_mode: vanilla
59
+ clip_cov_ratio: 0.0002
60
+ clip_cov_lb: 1.0
61
+ clip_cov_ub: 5.0
62
+ kl_cov_ratio: 0.0002
63
+ ppo_kl_coef: 0.1
64
+ clip_ratio_c: 3.0
65
+ loss_agg_mode: token-mean
66
+ loss_scale_factor: null
67
+ entropy_coeff: 0
68
+ calculate_entropy: false
69
+ use_kl_loss: true
70
+ use_prefix_grouper: false
71
+ use_torch_compile: true
72
+ kl_loss_coef: 0.001
73
+ kl_loss_type: low_var_kl
74
+ ppo_epochs: 1
75
+ shuffle: false
76
+ data_loader_seed: 42
77
+ checkpoint:
78
+ _target_: verl.trainer.config.CheckpointConfig
79
+ save_contents:
80
+ - model
81
+ - optimizer
82
+ - extra
83
+ load_contents: ${.save_contents}
84
+ async_save: false
85
+ use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
86
+ profiler:
87
+ _target_: verl.utils.profiler.ProfilerConfig
88
+ tool: ${oc.select:global_profiler.tool,null}
89
+ enable: false
90
+ all_ranks: false
91
+ ranks: []
92
+ save_path: ${oc.select:global_profiler.save_path,null}
93
+ tool_config:
94
+ nsys:
95
+ _target_: verl.utils.profiler.config.NsightToolConfig
96
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
97
+ npu:
98
+ _target_: verl.utils.profiler.config.NPUToolConfig
99
+ contents: []
100
+ level: level0
101
+ analysis: true
102
+ discrete: false
103
+ torch:
104
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
105
+ contents: []
106
+ discrete: false
107
+ torch_memory:
108
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
109
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
110
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
111
+ router_replay:
112
+ _target_: verl.workers.config.RouterReplayConfig
113
+ mode: disabled
114
+ record_file: null
115
+ replay_file: null
116
+ grad_clip: 1.0
117
+ ulysses_sequence_parallel_size: 1
118
+ entropy_from_logits_with_chunking: false
119
+ entropy_checkpointing: false
120
+ use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
121
+ calculate_sum_pi_squared: false
122
+ sum_pi_squared_checkpointing: false
123
+ ref:
124
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
125
+ strategy: ${actor_rollout_ref.actor.strategy}
126
+ use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
127
+ log_prob_micro_batch_size: null
128
+ log_prob_micro_batch_size_per_gpu: 32
129
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
130
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
131
+ profiler:
132
+ _target_: verl.utils.profiler.ProfilerConfig
133
+ tool: ${oc.select:global_profiler.tool,null}
134
+ enable: false
135
+ all_ranks: false
136
+ ranks: []
137
+ save_path: ${oc.select:global_profiler.save_path,null}
138
+ tool_config:
139
+ nsys:
140
+ _target_: verl.utils.profiler.config.NsightToolConfig
141
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
142
+ npu:
143
+ _target_: verl.utils.profiler.config.NPUToolConfig
144
+ contents: []
145
+ level: level0
146
+ analysis: true
147
+ discrete: false
148
+ torch:
149
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
150
+ contents: []
151
+ discrete: false
152
+ torch_memory:
153
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
154
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
155
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
156
+ router_replay:
157
+ _target_: verl.workers.config.RouterReplayConfig
158
+ mode: disabled
159
+ record_file: null
160
+ replay_file: null
161
+ fsdp_config:
162
+ _target_: verl.workers.config.FSDPEngineConfig
163
+ wrap_policy:
164
+ min_num_params: 0
165
+ param_offload: false
166
+ optimizer_offload: false
167
+ offload_policy: false
168
+ reshard_after_forward: true
169
+ fsdp_size: -1
170
+ forward_prefetch: false
171
+ model_dtype: fp32
172
+ use_orig_params: false
173
+ seed: 42
174
+ full_determinism: false
175
+ ulysses_sequence_parallel_size: 1
176
+ entropy_from_logits_with_chunking: false
177
+ use_torch_compile: true
178
+ entropy_checkpointing: false
179
+ forward_only: true
180
+ strategy: fsdp
181
+ dtype: bfloat16
182
+ _target_: verl.workers.config.FSDPActorConfig
183
+ ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
184
+ entropy_from_logits_with_chunking: false
185
+ entropy_checkpointing: false
186
+ rollout:
187
+ _target_: verl.workers.config.RolloutConfig
188
+ name: vllm
189
+ mode: async
190
+ temperature: 1.0
191
+ top_k: -1
192
+ top_p: 1
193
+ prompt_length: ${oc.select:data.max_prompt_length,512}
194
+ response_length: ${oc.select:data.max_response_length,512}
195
+ dtype: bfloat16
196
+ gpu_memory_utilization: 0.6
197
+ ignore_eos: false
198
+ enforce_eager: false
199
+ cudagraph_capture_sizes: null
200
+ free_cache_engine: true
201
+ tensor_model_parallel_size: 1
202
+ data_parallel_size: 1
203
+ expert_parallel_size: 1
204
+ pipeline_model_parallel_size: 1
205
+ max_num_batched_tokens: 8192
206
+ max_model_len: 8192
207
+ max_num_seqs: 1024
208
+ enable_chunked_prefill: true
209
+ enable_prefix_caching: true
210
+ logprobs_mode: processed_logprobs
211
+ scheduling_policy: fcfs
212
+ load_format: dummy
213
+ log_prob_micro_batch_size: null
214
+ log_prob_micro_batch_size_per_gpu: 32
215
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
216
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
217
+ disable_log_stats: true
218
+ do_sample: true
219
+ 'n': 3
220
+ over_sample_rate: 0
221
+ multi_stage_wake_up: false
222
+ engine_kwargs:
223
+ vllm: {}
224
+ sglang: {}
225
+ trtllm: {}
226
+ val_kwargs:
227
+ _target_: verl.workers.config.SamplingConfig
228
+ top_k: -1
229
+ top_p: 1.0
230
+ temperature: 0
231
+ 'n': 1
232
+ do_sample: false
233
+ multi_turn:
234
+ _target_: verl.workers.config.MultiTurnConfig
235
+ enable: false
236
+ max_assistant_turns: null
237
+ tool_config_path: null
238
+ max_user_turns: null
239
+ max_parallel_calls: 1
240
+ max_tool_response_length: 256
241
+ tool_response_truncate_side: middle
242
+ interaction_config_path: null
243
+ use_inference_chat_template: false
244
+ tokenization_sanity_check_mode: strict
245
+ format: hermes
246
+ num_repeat_rollouts: null
247
+ calculate_log_probs: false
248
+ agent:
249
+ _target_: verl.workers.config.AgentLoopConfig
250
+ num_workers: 8
251
+ default_agent_loop: single_turn_agent
252
+ agent_loop_config_path: null
253
+ custom_async_server:
254
+ _target_: verl.workers.config.CustomAsyncServerConfig
255
+ path: null
256
+ name: null
257
+ checkpoint_engine:
258
+ _target_: verl.workers.config.CheckpointEngineConfig
259
+ backend: naive
260
+ update_weights_bucket_megabytes: 2048
261
+ engine_kwargs: {}
262
+ trace:
263
+ _target_: verl.workers.config.TraceConfig
264
+ backend: null
265
+ token2text: false
266
+ max_samples_per_step_per_worker: null
267
+ skip_rollout: false
268
+ skip_dump_dir: /tmp/rollout_dump
269
+ skip_tokenizer_init: true
270
+ enable_rollout_routing_replay: false
271
+ profiler:
272
+ _target_: verl.utils.profiler.ProfilerConfig
273
+ tool: ${oc.select:global_profiler.tool,null}
274
+ enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
275
+ all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
276
+ ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
277
+ save_path: ${oc.select:global_profiler.save_path,null}
278
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
279
+ prometheus:
280
+ _target_: verl.workers.config.PrometheusConfig
281
+ enable: false
282
+ port: 9090
283
+ file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml
284
+ served_model_name: ${oc.select:actor_rollout_ref.model.path,null}
285
+ quantization: null
286
+ quantization_config_file: null
287
+ mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
288
+ layered_summon: false
289
+ model:
290
+ _target_: verl.workers.config.HFModelConfig
291
+ path: Qwen/Qwen3-4B-Instruct-2507
292
+ hf_config_path: null
293
+ tokenizer_path: null
294
+ use_shm: false
295
+ trust_remote_code: false
296
+ custom_chat_template: null
297
+ external_lib: null
298
+ override_config: {}
299
+ enable_gradient_checkpointing: true
300
+ enable_activation_offload: false
301
+ use_remove_padding: true
302
+ lora_rank: 0
303
+ lora_alpha: 16
304
+ target_modules: all-linear
305
+ exclude_modules: null
306
+ lora_adapter_path: null
307
+ use_liger: false
308
+ use_fused_kernels: false
309
+ fused_kernel_options:
310
+ impl_backend: torch
311
+ tiled_mlp:
312
+ enabled: false
313
+ num_shards: 4
314
+ mtp:
315
+ _target_: verl.workers.config.MtpConfig
316
+ enable: false
317
+ enable_train: false
318
+ enable_rollout: false
319
+ detach_encoder: false
320
+ mtp_loss_scaling_factor: 0.1
321
+ speculative_algorithm: EAGLE
322
+ speculative_num_steps: 3
323
+ speculative_eagle_topk: 1
324
+ speculative_num_draft_tokens: 4
325
+ method: mtp
326
+ num_speculative_tokens: 1
327
+ hybrid_engine: true
328
+ nccl_timeout: 600
329
+ data:
330
+ tokenizer: null
331
+ use_shm: false
332
+ train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
333
+ val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
334
+ train_max_samples: -1
335
+ val_max_samples: -1
336
+ prompt_key: prompt
337
+ reward_fn_key: data_source
338
+ max_prompt_length: 1024
339
+ max_response_length: 2048
340
+ train_batch_size: 512
341
+ val_batch_size: null
342
+ tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path,
343
+ null}
344
+ return_raw_input_ids: false
345
+ return_raw_chat: true
346
+ return_full_prompt: false
347
+ shuffle: true
348
+ seed: null
349
+ dataloader_num_workers: 8
350
+ image_patch_size: 14
351
+ validation_shuffle: false
352
+ filter_overlong_prompts: true
353
+ filter_overlong_prompts_workers: 1
354
+ truncation: error
355
+ image_key: images
356
+ video_key: videos
357
+ trust_remote_code: false
358
+ custom_cls:
359
+ path: null
360
+ name: null
361
+ return_multi_modal_inputs: true
362
+ sampler:
363
+ class_path: null
364
+ class_name: null
365
+ datagen:
366
+ path: null
367
+ name: null
368
+ apply_chat_template_kwargs: {}
369
+ reward_manager:
370
+ _target_: verl.trainer.config.config.RewardManagerConfig
371
+ source: register
372
+ name: ${oc.select:reward_model.reward_manager,naive}
373
+ module:
374
+ _target_: verl.trainer.config.config.ModuleConfig
375
+ path: null
376
+ name: custom_reward_manager
377
+ critic:
378
+ optim:
379
+ _target_: verl.workers.config.FSDPOptimizerConfig
380
+ optimizer: AdamW
381
+ optimizer_impl: torch.optim
382
+ lr: 1.0e-05
383
+ lr_warmup_steps_ratio: 0.0
384
+ total_training_steps: -1
385
+ weight_decay: 0.01
386
+ lr_warmup_steps: -1
387
+ betas:
388
+ - 0.9
389
+ - 0.999
390
+ clip_grad: 1.0
391
+ min_lr_ratio: 0.0
392
+ num_cycles: 0.5
393
+ lr_scheduler_type: constant
394
+ warmup_style: null
395
+ override_optimizer_config: null
396
+ model:
397
+ fsdp_config:
398
+ _target_: verl.workers.config.FSDPEngineConfig
399
+ wrap_policy:
400
+ min_num_params: 0
401
+ param_offload: false
402
+ optimizer_offload: false
403
+ offload_policy: false
404
+ reshard_after_forward: true
405
+ fsdp_size: -1
406
+ forward_prefetch: false
407
+ model_dtype: fp32
408
+ use_orig_params: false
409
+ seed: 42
410
+ full_determinism: false
411
+ ulysses_sequence_parallel_size: 1
412
+ entropy_from_logits_with_chunking: false
413
+ use_torch_compile: true
414
+ entropy_checkpointing: false
415
+ forward_only: false
416
+ strategy: fsdp
417
+ dtype: bfloat16
418
+ path: ~/models/deepseek-llm-7b-chat
419
+ tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
420
+ override_config: {}
421
+ external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
422
+ trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
423
+ _target_: verl.workers.config.FSDPCriticModelCfg
424
+ use_shm: false
425
+ enable_gradient_checkpointing: true
426
+ enable_activation_offload: false
427
+ use_remove_padding: false
428
+ lora_rank: 0
429
+ lora_alpha: 16
430
+ target_modules: all-linear
431
+ tiled_mlp:
432
+ enabled: false
433
+ num_shards: 4
434
+ _target_: verl.workers.config.FSDPCriticConfig
435
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
436
+ strategy: fsdp
437
+ enable: null
438
+ ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
439
+ ppo_micro_batch_size: null
440
+ ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
441
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
442
+ ppo_max_token_len_per_gpu: 32768
443
+ forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
444
+ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
445
+ shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
446
+ data_loader_seed: 42
447
+ cliprange_value: 0.5
448
+ loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
449
+ checkpoint:
450
+ _target_: verl.trainer.config.CheckpointConfig
451
+ save_contents:
452
+ - model
453
+ - optimizer
454
+ - extra
455
+ load_contents: ${.save_contents}
456
+ async_save: false
457
+ profiler:
458
+ _target_: verl.utils.profiler.ProfilerConfig
459
+ tool: ${oc.select:global_profiler.tool,null}
460
+ enable: false
461
+ all_ranks: false
462
+ ranks: []
463
+ save_path: ${oc.select:global_profiler.save_path,null}
464
+ tool_config:
465
+ nsys:
466
+ _target_: verl.utils.profiler.config.NsightToolConfig
467
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
468
+ npu:
469
+ _target_: verl.utils.profiler.config.NPUToolConfig
470
+ contents: []
471
+ level: level0
472
+ analysis: true
473
+ discrete: false
474
+ torch:
475
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
476
+ contents: []
477
+ discrete: false
478
+ torch_memory:
479
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
480
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
481
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
482
+ forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
483
+ forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
484
+ ulysses_sequence_parallel_size: 1
485
+ grad_clip: 1.0
486
+ reward_model:
487
+ enable: false
488
+ enable_resource_pool: false
489
+ n_gpus_per_node: 8
490
+ nnodes: 0
491
+ strategy: fsdp
492
+ model:
493
+ input_tokenizer: ${actor_rollout_ref.model.path}
494
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
495
+ external_lib: ${actor_rollout_ref.model.external_lib}
496
+ trust_remote_code: false
497
+ override_config: {}
498
+ use_shm: false
499
+ use_remove_padding: false
500
+ use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
501
+ fsdp_config:
502
+ _target_: verl.workers.config.FSDPEngineConfig
503
+ wrap_policy:
504
+ min_num_params: 0
505
+ param_offload: false
506
+ reshard_after_forward: true
507
+ fsdp_size: -1
508
+ forward_prefetch: false
509
+ micro_batch_size: null
510
+ micro_batch_size_per_gpu: null
511
+ max_length: null
512
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
513
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
514
+ reward_manager: naive
515
+ reward_loop_source: register
516
+ reward_loop_module_path: null
517
+ reward_loop_class_name: null
518
+ launch_reward_fn_async: false
519
+ sandbox_fusion:
520
+ url: null
521
+ max_concurrent: 64
522
+ memory_limit_mb: 1024
523
+ profiler:
524
+ _target_: verl.utils.profiler.ProfilerConfig
525
+ tool: ${oc.select:global_profiler.tool,null}
526
+ enable: false
527
+ all_ranks: false
528
+ ranks: []
529
+ save_path: ${oc.select:global_profiler.save_path,null}
530
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
531
+ ulysses_sequence_parallel_size: 1
532
+ use_reward_loop: true
533
+ num_workers: 1
534
+ rollout:
535
+ _target_: verl.workers.config.RolloutConfig
536
+ name: ???
537
+ dtype: bfloat16
538
+ gpu_memory_utilization: 0.5
539
+ enforce_eager: true
540
+ cudagraph_capture_sizes: null
541
+ free_cache_engine: true
542
+ data_parallel_size: 1
543
+ expert_parallel_size: 1
544
+ tensor_model_parallel_size: 2
545
+ max_num_batched_tokens: 8192
546
+ max_model_len: null
547
+ max_num_seqs: 1024
548
+ load_format: auto
549
+ engine_kwargs: {}
550
+ limit_images: null
551
+ enable_chunked_prefill: true
552
+ enable_prefix_caching: true
553
+ disable_log_stats: true
554
+ skip_tokenizer_init: false
555
+ prompt_length: 2048
556
+ response_length: 2048
557
+ algorithm:
558
+ rollout_correction:
559
+ rollout_is: null
560
+ rollout_is_threshold: 2.0
561
+ rollout_rs: null
562
+ rollout_rs_threshold: null
563
+ bypass_mode: false
564
+ loss_type: ppo_clip
565
+ rollout_is_batch_normalize: false
566
+ _target_: verl.trainer.config.AlgoConfig
567
+ gamma: 1.0
568
+ lam: 1.0
569
+ adv_estimator: grpo
570
+ norm_adv_by_std_in_grpo: true
571
+ use_kl_in_reward: false
572
+ kl_penalty: kl
573
+ kl_ctrl:
574
+ _target_: verl.trainer.config.KLControlConfig
575
+ type: fixed
576
+ kl_coef: 0.001
577
+ horizon: 10000
578
+ target_kl: 0.1
579
+ use_pf_ppo: false
580
+ pf_ppo:
581
+ reweight_method: pow
582
+ weight_pow: 2.0
583
+ custom_reward_function:
584
+ path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
585
+ name: compute_score
586
+ trainer:
587
+ balance_batch: true
588
+ total_epochs: 15
589
+ total_training_steps: null
590
+ project_name: readctrl-verl
591
+ experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs
592
+ logger:
593
+ - console
594
+ - wandb
595
+ log_val_generations: 0
596
+ rollout_data_dir: null
597
+ validation_data_dir: null
598
+ nnodes: 1
599
+ n_gpus_per_node: 2
600
+ save_freq: 20
601
+ esi_redundant_time: 0
602
+ resume_mode: auto
603
+ resume_from_path: null
604
+ val_before_train: true
605
+ val_only: false
606
+ test_freq: 5
607
+ critic_warmup: 0
608
+ default_hdfs_dir: null
609
+ del_local_ckpt_after_load: false
610
+ default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2
611
+ max_actor_ckpt_to_keep: 1
612
+ max_critic_ckpt_to_keep: 1
613
+ ray_wait_register_center_timeout: 300
614
+ device: cuda
615
+ use_legacy_worker_impl: auto
616
+ remove_previous_ckpt_in_save: true
617
+ global_profiler:
618
+ _target_: verl.utils.profiler.ProfilerConfig
619
+ tool: null
620
+ steps: null
621
+ profile_continuous_steps: false
622
+ save_path: outputs/profile
623
+ global_tool_config:
624
+ nsys:
625
+ _target_: verl.utils.profiler.config.NsightToolConfig
626
+ discrete: false
627
+ controller_nsight_options:
628
+ trace: cuda,nvtx,cublas,ucx
629
+ cuda-memory-usage: 'true'
630
+ cuda-graph-trace: graph
631
+ worker_nsight_options:
632
+ trace: cuda,nvtx,cublas,ucx
633
+ cuda-memory-usage: 'true'
634
+ cuda-graph-trace: graph
635
+ capture-range: cudaProfilerApi
636
+ capture-range-end: null
637
+ kill: none
638
+ torch_memory:
639
+ trace_alloc_max_entries: 100000
640
+ stack_depth: 32
641
+ context: all
642
+ stacks: all
643
+ kw_args: {}
644
+ transfer_queue:
645
+ enable: false
646
+ ray_kwargs:
647
+ ray_init:
648
+ num_cpus: null
649
+ timeline_json_file: null
code/RL_model/verl/verl_train/outputs/2026-02-07/10-53-43/.hydra/hydra.yaml ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task:
115
+ - algorithm.adv_estimator=grpo
116
+ - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
117
+ - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
118
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
119
+ - data.train_batch_size=512
120
+ - data.max_prompt_length=1024
121
+ - data.max_response_length=2048
122
+ - data.filter_overlong_prompts=True
123
+ - data.truncation=error
124
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
125
+ - actor_rollout_ref.actor.optim.lr=1e-6
126
+ - actor_rollout_ref.model.use_remove_padding=True
127
+ - actor_rollout_ref.actor.ppo_mini_batch_size=256
128
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32
129
+ - actor_rollout_ref.actor.use_kl_loss=True
130
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
131
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
132
+ - actor_rollout_ref.actor.entropy_coeff=0
133
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
134
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
135
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
136
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
137
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
138
+ - actor_rollout_ref.rollout.name=vllm
139
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.6
140
+ - actor_rollout_ref.rollout.max_model_len=8192
141
+ - actor_rollout_ref.rollout.n=3
142
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
143
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
144
+ - algorithm.use_kl_in_reward=False
145
+ - trainer.critic_warmup=0
146
+ - trainer.logger=["console","wandb"]
147
+ - trainer.project_name=readctrl-verl
148
+ - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
149
+ - trainer.n_gpus_per_node=2
150
+ - trainer.nnodes=1
151
+ - trainer.save_freq=20
152
+ - trainer.test_freq=5
153
+ - +trainer.remove_previous_ckpt_in_save=true
154
+ - trainer.max_actor_ckpt_to_keep=1
155
+ - trainer.max_critic_ckpt_to_keep=1
156
+ - trainer.resume_mode=auto
157
+ - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2
158
+ - trainer.total_epochs=15
159
+ job:
160
+ name: main_ppo
161
+ chdir: null
162
+ override_dirname: +trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15
163
+ id: ???
164
+ num: ???
165
+ config_name: ppo_trainer
166
+ env_set: {}
167
+ env_copy: []
168
+ config:
169
+ override_dirname:
170
+ kv_sep: '='
171
+ item_sep: ','
172
+ exclude_keys: []
173
+ runtime:
174
+ version: 1.3.2
175
+ version_base: '1.3'
176
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train
177
+ config_sources:
178
+ - path: hydra.conf
179
+ schema: pkg
180
+ provider: hydra
181
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config
182
+ schema: file
183
+ provider: main
184
+ - path: ''
185
+ schema: structured
186
+ provider: schema
187
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-07/10-53-43
188
+ choices:
189
+ algorithm@algorithm.rollout_correction: rollout_correction
190
+ reward_model: dp_reward_loop
191
+ critic: dp_critic
192
+ critic/../engine@critic.model.fsdp_config: fsdp
193
+ critic/../optim@critic.optim: fsdp
194
+ model@actor_rollout_ref.model: hf_model
195
+ rollout@actor_rollout_ref.rollout: rollout
196
+ ref@actor_rollout_ref.ref: dp_ref
197
+ ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp
198
+ data: legacy_data
199
+ actor@actor_rollout_ref.actor: dp_actor
200
+ actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp
201
+ actor/../optim@actor_rollout_ref.actor.optim: fsdp
202
+ hydra/env: default
203
+ hydra/callbacks: null
204
+ hydra/job_logging: default
205
+ hydra/hydra_logging: default
206
+ hydra/hydra_help: default
207
+ hydra/help: default
208
+ hydra/sweeper: basic
209
+ hydra/launcher: basic
210
+ hydra/output: default
211
+ verbose: false
code/RL_model/verl/verl_train/outputs/2026-02-07/10-53-43/.hydra/overrides.yaml ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - algorithm.adv_estimator=grpo
2
+ - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
3
+ - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
4
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
5
+ - data.train_batch_size=512
6
+ - data.max_prompt_length=1024
7
+ - data.max_response_length=2048
8
+ - data.filter_overlong_prompts=True
9
+ - data.truncation=error
10
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
11
+ - actor_rollout_ref.actor.optim.lr=1e-6
12
+ - actor_rollout_ref.model.use_remove_padding=True
13
+ - actor_rollout_ref.actor.ppo_mini_batch_size=256
14
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32
15
+ - actor_rollout_ref.actor.use_kl_loss=True
16
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
17
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
18
+ - actor_rollout_ref.actor.entropy_coeff=0
19
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
20
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
21
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
22
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
23
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
24
+ - actor_rollout_ref.rollout.name=vllm
25
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.6
26
+ - actor_rollout_ref.rollout.max_model_len=8192
27
+ - actor_rollout_ref.rollout.n=3
28
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
29
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
30
+ - algorithm.use_kl_in_reward=False
31
+ - trainer.critic_warmup=0
32
+ - trainer.logger=["console","wandb"]
33
+ - trainer.project_name=readctrl-verl
34
+ - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
35
+ - trainer.n_gpus_per_node=2
36
+ - trainer.nnodes=1
37
+ - trainer.save_freq=20
38
+ - trainer.test_freq=5
39
+ - +trainer.remove_previous_ckpt_in_save=true
40
+ - trainer.max_actor_ckpt_to_keep=1
41
+ - trainer.max_critic_ckpt_to_keep=1
42
+ - trainer.resume_mode=auto
43
+ - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2
44
+ - trainer.total_epochs=15
code/RL_model/verl/verl_train/outputs/2026-02-07/10-53-43/main_ppo.log ADDED
File without changes
code/RL_model/verl/verl_train/outputs/2026-02-07/11-09-07/.hydra/hydra.yaml ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task:
115
+ - algorithm.adv_estimator=grpo
116
+ - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
117
+ - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
118
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
119
+ - data.train_batch_size=8
120
+ - data.max_prompt_length=256
121
+ - data.max_response_length=256
122
+ - data.filter_overlong_prompts=True
123
+ - data.truncation=error
124
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
125
+ - actor_rollout_ref.actor.optim.lr=1e-6
126
+ - actor_rollout_ref.model.use_remove_padding=True
127
+ - actor_rollout_ref.actor.ppo_mini_batch_size=4
128
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2
129
+ - actor_rollout_ref.actor.use_kl_loss=True
130
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
131
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
132
+ - actor_rollout_ref.actor.entropy_coeff=0
133
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
134
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
135
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
136
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2
137
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
138
+ - actor_rollout_ref.rollout.name=vllm
139
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.6
140
+ - actor_rollout_ref.rollout.max_model_len=8192
141
+ - actor_rollout_ref.rollout.n=3
142
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
143
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
144
+ - algorithm.use_kl_in_reward=False
145
+ - trainer.critic_warmup=0
146
+ - trainer.logger=["console","wandb"]
147
+ - trainer.project_name=readctrl-verl
148
+ - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
149
+ - trainer.n_gpus_per_node=2
150
+ - trainer.nnodes=1
151
+ - trainer.save_freq=100
152
+ - trainer.test_freq=1
153
+ - +trainer.remove_previous_ckpt_in_save=true
154
+ - trainer.max_actor_ckpt_to_keep=1
155
+ - trainer.max_critic_ckpt_to_keep=1
156
+ - trainer.resume_mode=auto
157
+ - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2
158
+ - trainer.total_epochs=15
159
+ job:
160
+ name: main_ppo
161
+ chdir: null
162
+ override_dirname: +trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2,actor_rollout_ref.actor.ppo_mini_batch_size=4,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=256,data.max_response_length=256,data.train_batch_size=8,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=100,trainer.test_freq=1,trainer.total_epochs=15
163
+ id: ???
164
+ num: ???
165
+ config_name: ppo_trainer
166
+ env_set: {}
167
+ env_copy: []
168
+ config:
169
+ override_dirname:
170
+ kv_sep: '='
171
+ item_sep: ','
172
+ exclude_keys: []
173
+ runtime:
174
+ version: 1.3.2
175
+ version_base: '1.3'
176
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train
177
+ config_sources:
178
+ - path: hydra.conf
179
+ schema: pkg
180
+ provider: hydra
181
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config
182
+ schema: file
183
+ provider: main
184
+ - path: ''
185
+ schema: structured
186
+ provider: schema
187
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-07/11-09-07
188
+ choices:
189
+ algorithm@algorithm.rollout_correction: rollout_correction
190
+ reward_model: dp_reward_loop
191
+ critic: dp_critic
192
+ critic/../engine@critic.model.fsdp_config: fsdp
193
+ critic/../optim@critic.optim: fsdp
194
+ model@actor_rollout_ref.model: hf_model
195
+ rollout@actor_rollout_ref.rollout: rollout
196
+ ref@actor_rollout_ref.ref: dp_ref
197
+ ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp
198
+ data: legacy_data
199
+ actor@actor_rollout_ref.actor: dp_actor
200
+ actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp
201
+ actor/../optim@actor_rollout_ref.actor.optim: fsdp
202
+ hydra/env: default
203
+ hydra/callbacks: null
204
+ hydra/job_logging: default
205
+ hydra/hydra_logging: default
206
+ hydra/hydra_help: default
207
+ hydra/help: default
208
+ hydra/sweeper: basic
209
+ hydra/launcher: basic
210
+ hydra/output: default
211
+ verbose: false
code/RL_model/verl/verl_train/outputs/2026-02-07/11-09-07/.hydra/overrides.yaml ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - algorithm.adv_estimator=grpo
2
+ - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
3
+ - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
4
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
5
+ - data.train_batch_size=8
6
+ - data.max_prompt_length=256
7
+ - data.max_response_length=256
8
+ - data.filter_overlong_prompts=True
9
+ - data.truncation=error
10
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
11
+ - actor_rollout_ref.actor.optim.lr=1e-6
12
+ - actor_rollout_ref.model.use_remove_padding=True
13
+ - actor_rollout_ref.actor.ppo_mini_batch_size=4
14
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2
15
+ - actor_rollout_ref.actor.use_kl_loss=True
16
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
17
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
18
+ - actor_rollout_ref.actor.entropy_coeff=0
19
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
20
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
21
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
22
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2
23
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
24
+ - actor_rollout_ref.rollout.name=vllm
25
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.6
26
+ - actor_rollout_ref.rollout.max_model_len=8192
27
+ - actor_rollout_ref.rollout.n=3
28
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
29
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
30
+ - algorithm.use_kl_in_reward=False
31
+ - trainer.critic_warmup=0
32
+ - trainer.logger=["console","wandb"]
33
+ - trainer.project_name=readctrl-verl
34
+ - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
35
+ - trainer.n_gpus_per_node=2
36
+ - trainer.nnodes=1
37
+ - trainer.save_freq=100
38
+ - trainer.test_freq=1
39
+ - +trainer.remove_previous_ckpt_in_save=true
40
+ - trainer.max_actor_ckpt_to_keep=1
41
+ - trainer.max_critic_ckpt_to_keep=1
42
+ - trainer.resume_mode=auto
43
+ - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2
44
+ - trainer.total_epochs=15
code/RL_model/verl/verl_train/outputs/2026-02-07/11-09-07/main_ppo.log ADDED
File without changes
code/RL_model/verl/verl_train/outputs/2026-02-07/11-11-49/.hydra/config.yaml ADDED
@@ -0,0 +1,649 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ actor_rollout_ref:
2
+ actor:
3
+ optim:
4
+ _target_: verl.workers.config.FSDPOptimizerConfig
5
+ optimizer: AdamW
6
+ optimizer_impl: torch.optim
7
+ lr: 1.0e-06
8
+ lr_warmup_steps_ratio: 0.0
9
+ total_training_steps: -1
10
+ weight_decay: 0.01
11
+ lr_warmup_steps: -1
12
+ betas:
13
+ - 0.9
14
+ - 0.999
15
+ clip_grad: 1.0
16
+ min_lr_ratio: 0.0
17
+ num_cycles: 0.5
18
+ lr_scheduler_type: constant
19
+ warmup_style: null
20
+ override_optimizer_config: null
21
+ fsdp_config:
22
+ _target_: verl.workers.config.FSDPEngineConfig
23
+ wrap_policy:
24
+ min_num_params: 0
25
+ param_offload: false
26
+ optimizer_offload: false
27
+ offload_policy: false
28
+ reshard_after_forward: true
29
+ fsdp_size: -1
30
+ forward_prefetch: false
31
+ model_dtype: fp32
32
+ use_orig_params: false
33
+ seed: 42
34
+ full_determinism: false
35
+ ulysses_sequence_parallel_size: 1
36
+ entropy_from_logits_with_chunking: false
37
+ use_torch_compile: true
38
+ entropy_checkpointing: false
39
+ forward_only: false
40
+ strategy: fsdp
41
+ dtype: bfloat16
42
+ _target_: verl.workers.config.FSDPActorConfig
43
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
44
+ strategy: fsdp
45
+ ppo_mini_batch_size: 4
46
+ ppo_micro_batch_size: null
47
+ ppo_micro_batch_size_per_gpu: 2
48
+ use_dynamic_bsz: false
49
+ ppo_max_token_len_per_gpu: 16384
50
+ clip_ratio: 0.2
51
+ clip_ratio_low: 0.2
52
+ clip_ratio_high: 0.2
53
+ tau_pos: 1.0
54
+ tau_neg: 1.05
55
+ freeze_vision_tower: false
56
+ policy_loss:
57
+ _target_: verl.workers.config.PolicyLossConfig
58
+ loss_mode: vanilla
59
+ clip_cov_ratio: 0.0002
60
+ clip_cov_lb: 1.0
61
+ clip_cov_ub: 5.0
62
+ kl_cov_ratio: 0.0002
63
+ ppo_kl_coef: 0.1
64
+ clip_ratio_c: 3.0
65
+ loss_agg_mode: token-mean
66
+ loss_scale_factor: null
67
+ entropy_coeff: 0
68
+ calculate_entropy: false
69
+ use_kl_loss: true
70
+ use_prefix_grouper: false
71
+ use_torch_compile: true
72
+ kl_loss_coef: 0.001
73
+ kl_loss_type: low_var_kl
74
+ ppo_epochs: 1
75
+ shuffle: false
76
+ data_loader_seed: 42
77
+ checkpoint:
78
+ _target_: verl.trainer.config.CheckpointConfig
79
+ save_contents:
80
+ - model
81
+ - optimizer
82
+ - extra
83
+ load_contents: ${.save_contents}
84
+ async_save: false
85
+ use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
86
+ profiler:
87
+ _target_: verl.utils.profiler.ProfilerConfig
88
+ tool: ${oc.select:global_profiler.tool,null}
89
+ enable: false
90
+ all_ranks: false
91
+ ranks: []
92
+ save_path: ${oc.select:global_profiler.save_path,null}
93
+ tool_config:
94
+ nsys:
95
+ _target_: verl.utils.profiler.config.NsightToolConfig
96
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
97
+ npu:
98
+ _target_: verl.utils.profiler.config.NPUToolConfig
99
+ contents: []
100
+ level: level0
101
+ analysis: true
102
+ discrete: false
103
+ torch:
104
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
105
+ contents: []
106
+ discrete: false
107
+ torch_memory:
108
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
109
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
110
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
111
+ router_replay:
112
+ _target_: verl.workers.config.RouterReplayConfig
113
+ mode: disabled
114
+ record_file: null
115
+ replay_file: null
116
+ grad_clip: 1.0
117
+ ulysses_sequence_parallel_size: 1
118
+ entropy_from_logits_with_chunking: false
119
+ entropy_checkpointing: false
120
+ use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
121
+ calculate_sum_pi_squared: false
122
+ sum_pi_squared_checkpointing: false
123
+ ref:
124
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
125
+ strategy: ${actor_rollout_ref.actor.strategy}
126
+ use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
127
+ log_prob_micro_batch_size: null
128
+ log_prob_micro_batch_size_per_gpu: 32
129
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
130
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
131
+ profiler:
132
+ _target_: verl.utils.profiler.ProfilerConfig
133
+ tool: ${oc.select:global_profiler.tool,null}
134
+ enable: false
135
+ all_ranks: false
136
+ ranks: []
137
+ save_path: ${oc.select:global_profiler.save_path,null}
138
+ tool_config:
139
+ nsys:
140
+ _target_: verl.utils.profiler.config.NsightToolConfig
141
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
142
+ npu:
143
+ _target_: verl.utils.profiler.config.NPUToolConfig
144
+ contents: []
145
+ level: level0
146
+ analysis: true
147
+ discrete: false
148
+ torch:
149
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
150
+ contents: []
151
+ discrete: false
152
+ torch_memory:
153
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
154
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
155
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
156
+ router_replay:
157
+ _target_: verl.workers.config.RouterReplayConfig
158
+ mode: disabled
159
+ record_file: null
160
+ replay_file: null
161
+ fsdp_config:
162
+ _target_: verl.workers.config.FSDPEngineConfig
163
+ wrap_policy:
164
+ min_num_params: 0
165
+ param_offload: false
166
+ optimizer_offload: false
167
+ offload_policy: false
168
+ reshard_after_forward: true
169
+ fsdp_size: -1
170
+ forward_prefetch: false
171
+ model_dtype: fp32
172
+ use_orig_params: false
173
+ seed: 42
174
+ full_determinism: false
175
+ ulysses_sequence_parallel_size: 1
176
+ entropy_from_logits_with_chunking: false
177
+ use_torch_compile: true
178
+ entropy_checkpointing: false
179
+ forward_only: true
180
+ strategy: fsdp
181
+ dtype: bfloat16
182
+ _target_: verl.workers.config.FSDPActorConfig
183
+ ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
184
+ entropy_from_logits_with_chunking: false
185
+ entropy_checkpointing: false
186
+ rollout:
187
+ _target_: verl.workers.config.RolloutConfig
188
+ name: vllm
189
+ mode: async
190
+ temperature: 1.0
191
+ top_k: -1
192
+ top_p: 1
193
+ prompt_length: ${oc.select:data.max_prompt_length,512}
194
+ response_length: ${oc.select:data.max_response_length,512}
195
+ dtype: bfloat16
196
+ gpu_memory_utilization: 0.6
197
+ ignore_eos: false
198
+ enforce_eager: false
199
+ cudagraph_capture_sizes: null
200
+ free_cache_engine: true
201
+ tensor_model_parallel_size: 1
202
+ data_parallel_size: 1
203
+ expert_parallel_size: 1
204
+ pipeline_model_parallel_size: 1
205
+ max_num_batched_tokens: 8192
206
+ max_model_len: 8192
207
+ max_num_seqs: 1024
208
+ enable_chunked_prefill: true
209
+ enable_prefix_caching: true
210
+ logprobs_mode: processed_logprobs
211
+ scheduling_policy: fcfs
212
+ load_format: dummy
213
+ log_prob_micro_batch_size: null
214
+ log_prob_micro_batch_size_per_gpu: 2
215
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
216
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
217
+ disable_log_stats: true
218
+ do_sample: true
219
+ 'n': 3
220
+ over_sample_rate: 0
221
+ multi_stage_wake_up: false
222
+ engine_kwargs:
223
+ vllm: {}
224
+ sglang: {}
225
+ trtllm: {}
226
+ val_kwargs:
227
+ _target_: verl.workers.config.SamplingConfig
228
+ top_k: -1
229
+ top_p: 1.0
230
+ temperature: 0
231
+ 'n': 1
232
+ do_sample: false
233
+ multi_turn:
234
+ _target_: verl.workers.config.MultiTurnConfig
235
+ enable: false
236
+ max_assistant_turns: null
237
+ tool_config_path: null
238
+ max_user_turns: null
239
+ max_parallel_calls: 1
240
+ max_tool_response_length: 256
241
+ tool_response_truncate_side: middle
242
+ interaction_config_path: null
243
+ use_inference_chat_template: false
244
+ tokenization_sanity_check_mode: strict
245
+ format: hermes
246
+ num_repeat_rollouts: null
247
+ calculate_log_probs: false
248
+ agent:
249
+ _target_: verl.workers.config.AgentLoopConfig
250
+ num_workers: 8
251
+ default_agent_loop: single_turn_agent
252
+ agent_loop_config_path: null
253
+ custom_async_server:
254
+ _target_: verl.workers.config.CustomAsyncServerConfig
255
+ path: null
256
+ name: null
257
+ checkpoint_engine:
258
+ _target_: verl.workers.config.CheckpointEngineConfig
259
+ backend: naive
260
+ update_weights_bucket_megabytes: 2048
261
+ engine_kwargs: {}
262
+ trace:
263
+ _target_: verl.workers.config.TraceConfig
264
+ backend: null
265
+ token2text: false
266
+ max_samples_per_step_per_worker: null
267
+ skip_rollout: false
268
+ skip_dump_dir: /tmp/rollout_dump
269
+ skip_tokenizer_init: true
270
+ enable_rollout_routing_replay: false
271
+ profiler:
272
+ _target_: verl.utils.profiler.ProfilerConfig
273
+ tool: ${oc.select:global_profiler.tool,null}
274
+ enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
275
+ all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
276
+ ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
277
+ save_path: ${oc.select:global_profiler.save_path,null}
278
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
279
+ prometheus:
280
+ _target_: verl.workers.config.PrometheusConfig
281
+ enable: false
282
+ port: 9090
283
+ file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml
284
+ served_model_name: ${oc.select:actor_rollout_ref.model.path,null}
285
+ quantization: null
286
+ quantization_config_file: null
287
+ mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
288
+ layered_summon: false
289
+ model:
290
+ _target_: verl.workers.config.HFModelConfig
291
+ path: Qwen/Qwen3-4B-Instruct-2507
292
+ hf_config_path: null
293
+ tokenizer_path: null
294
+ use_shm: false
295
+ trust_remote_code: false
296
+ custom_chat_template: null
297
+ external_lib: null
298
+ override_config: {}
299
+ enable_gradient_checkpointing: true
300
+ enable_activation_offload: false
301
+ use_remove_padding: true
302
+ lora_rank: 0
303
+ lora_alpha: 16
304
+ target_modules: all-linear
305
+ exclude_modules: null
306
+ lora_adapter_path: null
307
+ use_liger: false
308
+ use_fused_kernels: false
309
+ fused_kernel_options:
310
+ impl_backend: torch
311
+ tiled_mlp:
312
+ enabled: false
313
+ num_shards: 4
314
+ mtp:
315
+ _target_: verl.workers.config.MtpConfig
316
+ enable: false
317
+ enable_train: false
318
+ enable_rollout: false
319
+ detach_encoder: false
320
+ mtp_loss_scaling_factor: 0.1
321
+ speculative_algorithm: EAGLE
322
+ speculative_num_steps: 3
323
+ speculative_eagle_topk: 1
324
+ speculative_num_draft_tokens: 4
325
+ method: mtp
326
+ num_speculative_tokens: 1
327
+ hybrid_engine: true
328
+ nccl_timeout: 600
329
+ data:
330
+ tokenizer: null
331
+ use_shm: false
332
+ train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
333
+ val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
334
+ train_max_samples: -1
335
+ val_max_samples: -1
336
+ prompt_key: prompt
337
+ reward_fn_key: data_source
338
+ max_prompt_length: 1024
339
+ max_response_length: 2048
340
+ train_batch_size: 8
341
+ val_batch_size: null
342
+ tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path,
343
+ null}
344
+ return_raw_input_ids: false
345
+ return_raw_chat: true
346
+ return_full_prompt: false
347
+ shuffle: true
348
+ seed: null
349
+ dataloader_num_workers: 8
350
+ image_patch_size: 14
351
+ validation_shuffle: false
352
+ filter_overlong_prompts: true
353
+ filter_overlong_prompts_workers: 1
354
+ truncation: error
355
+ image_key: images
356
+ video_key: videos
357
+ trust_remote_code: false
358
+ custom_cls:
359
+ path: null
360
+ name: null
361
+ return_multi_modal_inputs: true
362
+ sampler:
363
+ class_path: null
364
+ class_name: null
365
+ datagen:
366
+ path: null
367
+ name: null
368
+ apply_chat_template_kwargs: {}
369
+ reward_manager:
370
+ _target_: verl.trainer.config.config.RewardManagerConfig
371
+ source: register
372
+ name: ${oc.select:reward_model.reward_manager,naive}
373
+ module:
374
+ _target_: verl.trainer.config.config.ModuleConfig
375
+ path: null
376
+ name: custom_reward_manager
377
+ critic:
378
+ optim:
379
+ _target_: verl.workers.config.FSDPOptimizerConfig
380
+ optimizer: AdamW
381
+ optimizer_impl: torch.optim
382
+ lr: 1.0e-05
383
+ lr_warmup_steps_ratio: 0.0
384
+ total_training_steps: -1
385
+ weight_decay: 0.01
386
+ lr_warmup_steps: -1
387
+ betas:
388
+ - 0.9
389
+ - 0.999
390
+ clip_grad: 1.0
391
+ min_lr_ratio: 0.0
392
+ num_cycles: 0.5
393
+ lr_scheduler_type: constant
394
+ warmup_style: null
395
+ override_optimizer_config: null
396
+ model:
397
+ fsdp_config:
398
+ _target_: verl.workers.config.FSDPEngineConfig
399
+ wrap_policy:
400
+ min_num_params: 0
401
+ param_offload: false
402
+ optimizer_offload: false
403
+ offload_policy: false
404
+ reshard_after_forward: true
405
+ fsdp_size: -1
406
+ forward_prefetch: false
407
+ model_dtype: fp32
408
+ use_orig_params: false
409
+ seed: 42
410
+ full_determinism: false
411
+ ulysses_sequence_parallel_size: 1
412
+ entropy_from_logits_with_chunking: false
413
+ use_torch_compile: true
414
+ entropy_checkpointing: false
415
+ forward_only: false
416
+ strategy: fsdp
417
+ dtype: bfloat16
418
+ path: ~/models/deepseek-llm-7b-chat
419
+ tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
420
+ override_config: {}
421
+ external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
422
+ trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
423
+ _target_: verl.workers.config.FSDPCriticModelCfg
424
+ use_shm: false
425
+ enable_gradient_checkpointing: true
426
+ enable_activation_offload: false
427
+ use_remove_padding: false
428
+ lora_rank: 0
429
+ lora_alpha: 16
430
+ target_modules: all-linear
431
+ tiled_mlp:
432
+ enabled: false
433
+ num_shards: 4
434
+ _target_: verl.workers.config.FSDPCriticConfig
435
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
436
+ strategy: fsdp
437
+ enable: null
438
+ ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
439
+ ppo_micro_batch_size: null
440
+ ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
441
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
442
+ ppo_max_token_len_per_gpu: 32768
443
+ forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
444
+ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
445
+ shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
446
+ data_loader_seed: 42
447
+ cliprange_value: 0.5
448
+ loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
449
+ checkpoint:
450
+ _target_: verl.trainer.config.CheckpointConfig
451
+ save_contents:
452
+ - model
453
+ - optimizer
454
+ - extra
455
+ load_contents: ${.save_contents}
456
+ async_save: false
457
+ profiler:
458
+ _target_: verl.utils.profiler.ProfilerConfig
459
+ tool: ${oc.select:global_profiler.tool,null}
460
+ enable: false
461
+ all_ranks: false
462
+ ranks: []
463
+ save_path: ${oc.select:global_profiler.save_path,null}
464
+ tool_config:
465
+ nsys:
466
+ _target_: verl.utils.profiler.config.NsightToolConfig
467
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
468
+ npu:
469
+ _target_: verl.utils.profiler.config.NPUToolConfig
470
+ contents: []
471
+ level: level0
472
+ analysis: true
473
+ discrete: false
474
+ torch:
475
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
476
+ contents: []
477
+ discrete: false
478
+ torch_memory:
479
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
480
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
481
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
482
+ forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
483
+ forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
484
+ ulysses_sequence_parallel_size: 1
485
+ grad_clip: 1.0
486
+ reward_model:
487
+ enable: false
488
+ enable_resource_pool: false
489
+ n_gpus_per_node: 8
490
+ nnodes: 0
491
+ strategy: fsdp
492
+ model:
493
+ input_tokenizer: ${actor_rollout_ref.model.path}
494
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
495
+ external_lib: ${actor_rollout_ref.model.external_lib}
496
+ trust_remote_code: false
497
+ override_config: {}
498
+ use_shm: false
499
+ use_remove_padding: false
500
+ use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
501
+ fsdp_config:
502
+ _target_: verl.workers.config.FSDPEngineConfig
503
+ wrap_policy:
504
+ min_num_params: 0
505
+ param_offload: false
506
+ reshard_after_forward: true
507
+ fsdp_size: -1
508
+ forward_prefetch: false
509
+ micro_batch_size: null
510
+ micro_batch_size_per_gpu: null
511
+ max_length: null
512
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
513
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
514
+ reward_manager: naive
515
+ reward_loop_source: register
516
+ reward_loop_module_path: null
517
+ reward_loop_class_name: null
518
+ launch_reward_fn_async: false
519
+ sandbox_fusion:
520
+ url: null
521
+ max_concurrent: 64
522
+ memory_limit_mb: 1024
523
+ profiler:
524
+ _target_: verl.utils.profiler.ProfilerConfig
525
+ tool: ${oc.select:global_profiler.tool,null}
526
+ enable: false
527
+ all_ranks: false
528
+ ranks: []
529
+ save_path: ${oc.select:global_profiler.save_path,null}
530
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
531
+ ulysses_sequence_parallel_size: 1
532
+ use_reward_loop: true
533
+ num_workers: 1
534
+ rollout:
535
+ _target_: verl.workers.config.RolloutConfig
536
+ name: ???
537
+ dtype: bfloat16
538
+ gpu_memory_utilization: 0.5
539
+ enforce_eager: true
540
+ cudagraph_capture_sizes: null
541
+ free_cache_engine: true
542
+ data_parallel_size: 1
543
+ expert_parallel_size: 1
544
+ tensor_model_parallel_size: 2
545
+ max_num_batched_tokens: 8192
546
+ max_model_len: null
547
+ max_num_seqs: 1024
548
+ load_format: auto
549
+ engine_kwargs: {}
550
+ limit_images: null
551
+ enable_chunked_prefill: true
552
+ enable_prefix_caching: true
553
+ disable_log_stats: true
554
+ skip_tokenizer_init: false
555
+ prompt_length: 2048
556
+ response_length: 2048
557
+ algorithm:
558
+ rollout_correction:
559
+ rollout_is: null
560
+ rollout_is_threshold: 2.0
561
+ rollout_rs: null
562
+ rollout_rs_threshold: null
563
+ bypass_mode: false
564
+ loss_type: ppo_clip
565
+ rollout_is_batch_normalize: false
566
+ _target_: verl.trainer.config.AlgoConfig
567
+ gamma: 1.0
568
+ lam: 1.0
569
+ adv_estimator: grpo
570
+ norm_adv_by_std_in_grpo: true
571
+ use_kl_in_reward: false
572
+ kl_penalty: kl
573
+ kl_ctrl:
574
+ _target_: verl.trainer.config.KLControlConfig
575
+ type: fixed
576
+ kl_coef: 0.001
577
+ horizon: 10000
578
+ target_kl: 0.1
579
+ use_pf_ppo: false
580
+ pf_ppo:
581
+ reweight_method: pow
582
+ weight_pow: 2.0
583
+ custom_reward_function:
584
+ path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
585
+ name: compute_score
586
+ trainer:
587
+ balance_batch: true
588
+ total_epochs: 15
589
+ total_training_steps: null
590
+ project_name: readctrl-verl
591
+ experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs
592
+ logger:
593
+ - console
594
+ - wandb
595
+ log_val_generations: 0
596
+ rollout_data_dir: null
597
+ validation_data_dir: null
598
+ nnodes: 1
599
+ n_gpus_per_node: 2
600
+ save_freq: 100
601
+ esi_redundant_time: 0
602
+ resume_mode: auto
603
+ resume_from_path: null
604
+ val_before_train: true
605
+ val_only: false
606
+ test_freq: 1
607
+ critic_warmup: 0
608
+ default_hdfs_dir: null
609
+ del_local_ckpt_after_load: false
610
+ default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2
611
+ max_actor_ckpt_to_keep: 1
612
+ max_critic_ckpt_to_keep: 1
613
+ ray_wait_register_center_timeout: 300
614
+ device: cuda
615
+ use_legacy_worker_impl: auto
616
+ remove_previous_ckpt_in_save: true
617
+ global_profiler:
618
+ _target_: verl.utils.profiler.ProfilerConfig
619
+ tool: null
620
+ steps: null
621
+ profile_continuous_steps: false
622
+ save_path: outputs/profile
623
+ global_tool_config:
624
+ nsys:
625
+ _target_: verl.utils.profiler.config.NsightToolConfig
626
+ discrete: false
627
+ controller_nsight_options:
628
+ trace: cuda,nvtx,cublas,ucx
629
+ cuda-memory-usage: 'true'
630
+ cuda-graph-trace: graph
631
+ worker_nsight_options:
632
+ trace: cuda,nvtx,cublas,ucx
633
+ cuda-memory-usage: 'true'
634
+ cuda-graph-trace: graph
635
+ capture-range: cudaProfilerApi
636
+ capture-range-end: null
637
+ kill: none
638
+ torch_memory:
639
+ trace_alloc_max_entries: 100000
640
+ stack_depth: 32
641
+ context: all
642
+ stacks: all
643
+ kw_args: {}
644
+ transfer_queue:
645
+ enable: false
646
+ ray_kwargs:
647
+ ray_init:
648
+ num_cpus: null
649
+ timeline_json_file: null
code/RL_model/verl/verl_train/outputs/2026-02-07/11-11-49/.hydra/hydra.yaml ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task:
115
+ - algorithm.adv_estimator=grpo
116
+ - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
117
+ - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
118
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
119
+ - data.train_batch_size=8
120
+ - data.max_prompt_length=1024
121
+ - data.max_response_length=2048
122
+ - data.filter_overlong_prompts=True
123
+ - data.truncation=error
124
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
125
+ - actor_rollout_ref.actor.optim.lr=1e-6
126
+ - actor_rollout_ref.model.use_remove_padding=True
127
+ - actor_rollout_ref.actor.ppo_mini_batch_size=4
128
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2
129
+ - actor_rollout_ref.actor.use_kl_loss=True
130
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
131
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
132
+ - actor_rollout_ref.actor.entropy_coeff=0
133
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
134
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
135
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
136
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2
137
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
138
+ - actor_rollout_ref.rollout.name=vllm
139
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.6
140
+ - actor_rollout_ref.rollout.max_model_len=8192
141
+ - actor_rollout_ref.rollout.n=3
142
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
143
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
144
+ - algorithm.use_kl_in_reward=False
145
+ - trainer.critic_warmup=0
146
+ - trainer.logger=["console","wandb"]
147
+ - trainer.project_name=readctrl-verl
148
+ - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
149
+ - trainer.n_gpus_per_node=2
150
+ - trainer.nnodes=1
151
+ - trainer.save_freq=100
152
+ - trainer.test_freq=1
153
+ - +trainer.remove_previous_ckpt_in_save=true
154
+ - trainer.max_actor_ckpt_to_keep=1
155
+ - trainer.max_critic_ckpt_to_keep=1
156
+ - trainer.resume_mode=auto
157
+ - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2
158
+ - trainer.total_epochs=15
159
+ job:
160
+ name: main_ppo
161
+ chdir: null
162
+ override_dirname: +trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2,actor_rollout_ref.actor.ppo_mini_batch_size=4,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=8,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=100,trainer.test_freq=1,trainer.total_epochs=15
163
+ id: ???
164
+ num: ???
165
+ config_name: ppo_trainer
166
+ env_set: {}
167
+ env_copy: []
168
+ config:
169
+ override_dirname:
170
+ kv_sep: '='
171
+ item_sep: ','
172
+ exclude_keys: []
173
+ runtime:
174
+ version: 1.3.2
175
+ version_base: '1.3'
176
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train
177
+ config_sources:
178
+ - path: hydra.conf
179
+ schema: pkg
180
+ provider: hydra
181
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config
182
+ schema: file
183
+ provider: main
184
+ - path: ''
185
+ schema: structured
186
+ provider: schema
187
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-07/11-11-49
188
+ choices:
189
+ algorithm@algorithm.rollout_correction: rollout_correction
190
+ reward_model: dp_reward_loop
191
+ critic: dp_critic
192
+ critic/../engine@critic.model.fsdp_config: fsdp
193
+ critic/../optim@critic.optim: fsdp
194
+ model@actor_rollout_ref.model: hf_model
195
+ rollout@actor_rollout_ref.rollout: rollout
196
+ ref@actor_rollout_ref.ref: dp_ref
197
+ ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp
198
+ data: legacy_data
199
+ actor@actor_rollout_ref.actor: dp_actor
200
+ actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp
201
+ actor/../optim@actor_rollout_ref.actor.optim: fsdp
202
+ hydra/env: default
203
+ hydra/callbacks: null
204
+ hydra/job_logging: default
205
+ hydra/hydra_logging: default
206
+ hydra/hydra_help: default
207
+ hydra/help: default
208
+ hydra/sweeper: basic
209
+ hydra/launcher: basic
210
+ hydra/output: default
211
+ verbose: false
code/RL_model/verl/verl_train/outputs/2026-02-07/11-11-49/.hydra/overrides.yaml ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - algorithm.adv_estimator=grpo
2
+ - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
3
+ - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
4
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
5
+ - data.train_batch_size=8
6
+ - data.max_prompt_length=1024
7
+ - data.max_response_length=2048
8
+ - data.filter_overlong_prompts=True
9
+ - data.truncation=error
10
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
11
+ - actor_rollout_ref.actor.optim.lr=1e-6
12
+ - actor_rollout_ref.model.use_remove_padding=True
13
+ - actor_rollout_ref.actor.ppo_mini_batch_size=4
14
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2
15
+ - actor_rollout_ref.actor.use_kl_loss=True
16
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
17
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
18
+ - actor_rollout_ref.actor.entropy_coeff=0
19
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
20
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
21
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
22
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2
23
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
24
+ - actor_rollout_ref.rollout.name=vllm
25
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.6
26
+ - actor_rollout_ref.rollout.max_model_len=8192
27
+ - actor_rollout_ref.rollout.n=3
28
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
29
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
30
+ - algorithm.use_kl_in_reward=False
31
+ - trainer.critic_warmup=0
32
+ - trainer.logger=["console","wandb"]
33
+ - trainer.project_name=readctrl-verl
34
+ - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
35
+ - trainer.n_gpus_per_node=2
36
+ - trainer.nnodes=1
37
+ - trainer.save_freq=100
38
+ - trainer.test_freq=1
39
+ - +trainer.remove_previous_ckpt_in_save=true
40
+ - trainer.max_actor_ckpt_to_keep=1
41
+ - trainer.max_critic_ckpt_to_keep=1
42
+ - trainer.resume_mode=auto
43
+ - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2
44
+ - trainer.total_epochs=15
code/RL_model/verl/verl_train/outputs/2026-02-07/11-11-49/main_ppo.log ADDED
File without changes