strikerLj committed on
Commit
fd271f2
·
verified ·
1 Parent(s): 493069d

Upload MatchTIR-4B-extra-thinkback-toolfix1 global_step_16

Browse files
MatchTIR-4B-extra-thinkback-toolfix1/resolved_config.yaml ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data:
2
+ tokenizer: null
3
+ train_files: Data/golden_train_235b_deepseek_merged_clean_plus_32b_extra.segmented_clean.parquet
4
+ val_files: Data/test.parquet
5
+ prompt_key: messages
6
+ reward_fn_key: data_source
7
+ tools_key: tools
8
+ max_prompt_length: 7000
9
+ max_response_length: 23000
10
+ train_batch_size: 256
11
+ val_batch_size: 256
12
+ return_raw_input_ids: false
13
+ return_raw_chat: false
14
+ return_full_prompt: false
15
+ shuffle: true
16
+ filter_overlong_prompts: true
17
+ filter_overlong_prompts_workers: 1
18
+ truncation: error
19
+ image_key: images
20
+ video_key: videos
21
+ custom_cls:
22
+ path: null
23
+ name: null
24
+ enable_thinking: true
25
+ sft_target_key: raw_output
26
+ sft_mode: segmented_prefix
27
+ segmented_prefix_files: Data/segmentation/54_hybrid.cleaned.plus_32b_extra.segmented.jsonl
28
+ segmented_prefix_supervise_visible_prefix: true
29
+ segmented_prefix_supervise_tool_call: true
30
+ random_pure_rl_prompt_proportion: 0.0
31
+ random_pure_rl_prompt_schedule: fix
32
+ random_pure_rl_prompt_start_proportion: 0.0
33
+ random_pure_rl_prompt_final_proportion: 0.0
34
+ random_pure_rl_prompt_annealing_ratio: 1.0
35
+ random_pure_rl_prompt_seed: 0
36
+ random_pure_rl_prompt_resample_each_epoch: true
37
+ sft_turn_intra_annealing: false
38
+ system_style: Qwen3
39
+ adaptive_next_turn_supervision:
40
+ enabled: false
41
+ reward_turn_proxy_field: turn_proxy_json
42
+ segmented_turns_field: segmented_turns_json
43
+ plan_field: adaptive_supervision_plan_json
44
+ strong_sft_floor: 0.8
45
+ medium_sft_floor: 0.6
46
+ weak_sft_floor: 0.5
47
+ allow_max_segmented_level: L2
48
+ conservative_tool_patterns:
49
+ - initial_attempt
50
+ - compare_then_call
51
+ bad_quality_flags:
52
+ - fallback_segmentation
53
+ - needs_intra_paragraph_cut
54
+ - duplicate_call_preview_in_reasoning
55
+ soft_bad_quality_flags:
56
+ - assistant_role_token_noise
57
+ - long_redundant_reasoning
58
+ dataloader_num_workers: 0
59
+ sft_proportion: 1.0
60
+ sft_whole_response: false
61
+ sft_turn_mode: true
62
+ sft_turn_uniform_proportion: true
63
+ sft_annealing: true
64
+ sft_annealing_ratio: 1.0
65
+ actor_rollout_ref:
66
+ hybrid_engine: true
67
+ model:
68
+ path: Qwen/Qwen3-4B
69
+ external_lib: null
70
+ override_config: {}
71
+ enable_gradient_checkpointing: true
72
+ enable_activation_offload: false
73
+ use_remove_padding: true
74
+ use_liger: false
75
+ use_fused_kernels: false
76
+ trust_remote_code: false
77
+ actor:
78
+ strategy: fsdp
79
+ ppo_mini_batch_size: 32
80
+ ppo_micro_batch_size: null
81
+ ppo_micro_batch_size_per_gpu: null
82
+ use_dynamic_bsz: true
83
+ ppo_max_token_len_per_gpu: 15000
84
+ grad_clip: 1.0
85
+ clip_ratio: 0.2
86
+ clip_ratio_low: 0.2
87
+ clip_ratio_high: 0.2
88
+ clip_ratio_c: 3.0
89
+ loss_agg_mode: seq-mean-token-mean
90
+ entropy_coeff: 0.001
91
+ use_hint_for_log_prob: true
92
+ segmented_prefix_thinking_sft_loss_scale: 1.0
93
+ segmented_prefix_tool_call_sft_loss_scale: 1.0
94
+ segmented_prefix_tool_call_sft_loss_scale_annealing: false
95
+ segmented_prefix_tool_call_sft_loss_final_scale: 1.0
96
+ segmented_prefix_tool_call_sft_loss_annealing_ratio: 1.0
97
+ use_kl_loss: true
98
+ use_torch_compile: true
99
+ kl_loss_coef: 0.001
100
+ kl_loss_type: low_var_kl
101
+ ppo_epochs: 1
102
+ shuffle: false
103
+ ulysses_sequence_parallel_size: 2
104
+ checkpoint:
105
+ contents:
106
+ - model
107
+ - optimizer
108
+ - extra
109
+ optim:
110
+ lr: 1.0e-06
111
+ lr_warmup_steps: -1
112
+ lr_warmup_steps_ratio: 0.0
113
+ min_lr_ratio: 0.0
114
+ num_cycles: 0.5
115
+ warmup_style: constant
116
+ total_training_steps: 24
117
+ weight_decay: 0.01
118
+ fsdp_config:
119
+ wrap_policy:
120
+ min_num_params: 0
121
+ param_offload: true
122
+ optimizer_offload: true
123
+ offload_policy: false
124
+ reshard_after_forward: true
125
+ fsdp_size: -1
126
+ use_sft_loss: true
127
+ sft_loss_coef: 0.1
128
+ ref:
129
+ strategy: fsdp
130
+ fsdp_config:
131
+ param_offload: true
132
+ reshard_after_forward: true
133
+ wrap_policy:
134
+ min_num_params: 0
135
+ use_torch_compile: true
136
+ log_prob_micro_batch_size: null
137
+ log_prob_micro_batch_size_per_gpu: null
138
+ log_prob_use_dynamic_bsz: true
139
+ log_prob_max_token_len_per_gpu: 15000
140
+ ulysses_sequence_parallel_size: 2
141
+ rollout:
142
+ name: vllm
143
+ mode: sync
144
+ chat_scheduler: null
145
+ temperature: 1.0
146
+ top_k: -1
147
+ top_p: 1
148
+ use_fire_sampling: false
149
+ prompt_length: 7000
150
+ response_length: 23000
151
+ dtype: bfloat16
152
+ gpu_memory_utilization: 0.7
153
+ ignore_eos: false
154
+ enforce_eager: true
155
+ free_cache_engine: true
156
+ load_format: dummy_dtensor
157
+ tensor_model_parallel_size: 1
158
+ max_num_batched_tokens: 32768
159
+ max_model_len: null
160
+ max_num_seqs: 1024
161
+ log_prob_micro_batch_size: null
162
+ log_prob_micro_batch_size_per_gpu: null
163
+ log_prob_use_dynamic_bsz: true
164
+ log_prob_max_token_len_per_gpu: 15000
165
+ disable_log_stats: true
166
+ enable_chunked_prefill: true
167
+ do_sample: true
168
+ 'n': 8
169
+ engine_kwargs:
170
+ vllm:
171
+ swap_space: null
172
+ sglang:
173
+ attention_backend: null
174
+ val_kwargs:
175
+ top_k: -1
176
+ top_p: 1.0
177
+ temperature: 0
178
+ 'n': 1
179
+ do_sample: false
180
+ multi_turn:
181
+ enable: false
182
+ max_turns: null
183
+ tool_config_path: null
184
+ format: chatml
185
+ enable_thinking: true
186
+ max_turns: 10
187
+ critic:
188
+ rollout_n: 8
189
+ strategy: fsdp
190
+ optim:
191
+ lr: 1.0e-05
192
+ lr_warmup_steps_ratio: 0.0
193
+ min_lr_ratio: null
194
+ warmup_style: constant
195
+ total_training_steps: 24
196
+ weight_decay: 0.01
197
+ model:
198
+ path: ~/models/deepseek-llm-7b-chat
199
+ tokenizer_path: Qwen/Qwen3-4B
200
+ override_config: {}
201
+ external_lib: null
202
+ enable_gradient_checkpointing: true
203
+ enable_activation_offload: false
204
+ use_remove_padding: false
205
+ trust_remote_code: false
206
+ fsdp_config:
207
+ param_offload: false
208
+ optimizer_offload: false
209
+ offload_policy: false
210
+ reshard_after_forward: true
211
+ wrap_policy:
212
+ min_num_params: 0
213
+ fsdp_size: -1
214
+ ppo_mini_batch_size: 32
215
+ ppo_micro_batch_size: null
216
+ ppo_micro_batch_size_per_gpu: null
217
+ forward_micro_batch_size: null
218
+ forward_micro_batch_size_per_gpu: null
219
+ use_dynamic_bsz: true
220
+ ppo_max_token_len_per_gpu: 32768
221
+ forward_max_token_len_per_gpu: 32768
222
+ ulysses_sequence_parallel_size: 1
223
+ ppo_epochs: 1
224
+ shuffle: false
225
+ grad_clip: 'inf'
226
+ cliprange_value: 0.5
227
+ loss_agg_mode: seq-mean-token-mean
228
+ checkpoint:
229
+ contents:
230
+ - model
231
+ - optimizer
232
+ - extra
233
+ reward_model:
234
+ enable: false
235
+ strategy: fsdp
236
+ model:
237
+ input_tokenizer: Qwen/Qwen3-4B
238
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
239
+ external_lib: null
240
+ use_remove_padding: false
241
+ use_fused_kernels: false
242
+ trust_remote_code: false
243
+ fsdp_config:
244
+ wrap_policy:
245
+ min_num_params: 0
246
+ param_offload: false
247
+ reshard_after_forward: true
248
+ fsdp_size: -1
249
+ micro_batch_size: null
250
+ micro_batch_size_per_gpu: null
251
+ max_length: null
252
+ ulysses_sequence_parallel_size: 1
253
+ use_dynamic_bsz: true
254
+ forward_max_token_len_per_gpu: 32768
255
+ reward_manager: tool
256
+ launch_reward_fn_async: false
257
+ sandbox_fusion:
258
+ url: null
259
+ max_concurrent: 64
260
+ custom_reward_function:
261
+ path: Code/verl/utils/reward_score/tool.py
262
+ name: compute_process_KM
263
+ reward_kwargs:
264
+ emit_turn_proxy_diagnostics: true
265
+ algorithm:
266
+ gamma: 1.0
267
+ lam: 1.0
268
+ adv_estimator: grpo
269
+ norm_adv_by_std_in_grpo: true
270
+ use_kl_in_reward: false
271
+ dynamic_advantage:
272
+ enabled: false
273
+ schedule: linear
274
+ global_start: 0.8
275
+ global_end: 0.2
276
+ local_start: 0.2
277
+ local_end: 0.8
278
+ sign_conflict_enabled: true
279
+ sign_conflict_eps: 0.05
280
+ sign_conflict_local_weight: 0.8
281
+ clamp_progress: true
282
+ kl_penalty: kl
283
+ kl_ctrl:
284
+ type: fixed
285
+ kl_coef: 0.001
286
+ horizon: 10000
287
+ target_kl: 0.1
288
+ trainer:
289
+ balance_batch: true
290
+ total_epochs: 3
291
+ total_training_steps: null
292
+ project_name: MatchTIR
293
+ experiment_name: MatchTIR-4B-extra-thinkback-toolfix1
294
+ logger:
295
+ - console
296
+ - wandb
297
+ log_val_generations: 0
298
+ rollout_data_dir: /fs/scratch/PAS3151/jluo/exps/rollout/MatchTIR/MatchTIR-4B-extra-thinkback-toolfix1
299
+ validation_data_dir: /fs/scratch/PAS3151/jluo/exps/validation/MatchTIR/MatchTIR-4B-extra-thinkback-toolfix1
300
+ nnodes: 1
301
+ n_gpus_per_node: 4
302
+ save_freq: 4
303
+ resume_mode: auto
304
+ resume_from_path: null
305
+ val_before_train: true
306
+ test_freq: 4
307
+ critic_warmup: 0
308
+ default_hdfs_dir: null
309
+ del_local_ckpt_after_load: false
310
+ default_local_dir: /fs/scratch/PAS3151/jluo/exps/ckpts/MatchTIR/MatchTIR-4B-extra-thinkback-toolfix1
311
+ max_actor_ckpt_to_keep: null
312
+ max_critic_ckpt_to_keep: null
313
+ ray_wait_register_center_timeout: 300
314
+ val_only: false
315
+ ray_init:
316
+ num_cpus: 8