kajuma commited on
Commit
c5a630d
·
verified ·
1 Parent(s): 37125c5

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -6510,3 +6510,6 @@ checkpoints/iter_0038100/__6_0.distcp filter=lfs diff=lfs merge=lfs -text
6510
  checkpoints/iter_0038100/__6_1.distcp filter=lfs diff=lfs merge=lfs -text
6511
  checkpoints/iter_0038100/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
6512
  checkpoints/iter_0038100/__7_1.distcp filter=lfs diff=lfs merge=lfs -text
 
 
 
 
6510
  checkpoints/iter_0038100/__6_1.distcp filter=lfs diff=lfs merge=lfs -text
6511
  checkpoints/iter_0038100/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
6512
  checkpoints/iter_0038100/__7_1.distcp filter=lfs diff=lfs merge=lfs -text
6513
+ logging.jsonl filter=lfs diff=lfs merge=lfs -text
6514
+ wandb/wandb/run-20251224_034518-gd3q7mjv/files/output.log filter=lfs diff=lfs merge=lfs -text
6515
+ wandb/wandb/run-20251224_034518-gd3q7mjv/run-gd3q7mjv.wandb filter=lfs diff=lfs merge=lfs -text
args.json ADDED
@@ -0,0 +1,654 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "use_ray": false,
3
+ "ray_exp_name": null,
4
+ "device_groups": null,
5
+ "model": "Qwen/Qwen3-0.6B-Base",
6
+ "model_type": "qwen3",
7
+ "model_revision": null,
8
+ "task_type": "causal_lm",
9
+ "torch_dtype": "bfloat16",
10
+ "attn_impl": null,
11
+ "new_special_tokens": [],
12
+ "num_labels": null,
13
+ "problem_type": null,
14
+ "rope_scaling": null,
15
+ "device_map": null,
16
+ "max_memory": {},
17
+ "max_model_len": null,
18
+ "local_repo_path": null,
19
+ "init_strategy": null,
20
+ "template": "qwen3",
21
+ "system": null,
22
+ "max_length": 4096,
23
+ "truncation_strategy": "right",
24
+ "max_pixels": null,
25
+ "agent_template": null,
26
+ "norm_bbox": null,
27
+ "use_chat_template": false,
28
+ "padding_free": true,
29
+ "padding_side": "right",
30
+ "loss_scale": "all",
31
+ "sequence_parallel_size": 1,
32
+ "response_prefix": null,
33
+ "template_backend": "swift",
34
+ "dataset": [],
35
+ "val_dataset": [],
36
+ "cached_dataset": [
37
+ "/workspace/full"
38
+ ],
39
+ "cached_val_dataset": [],
40
+ "split_dataset_ratio": 0.0,
41
+ "data_seed": 42,
42
+ "dataset_num_proc": 32,
43
+ "load_from_cache_file": false,
44
+ "dataset_shuffle": true,
45
+ "val_dataset_shuffle": false,
46
+ "streaming": false,
47
+ "interleave_prob": null,
48
+ "stopping_strategy": "first_exhausted",
49
+ "shuffle_buffer_size": 1000,
50
+ "download_mode": "reuse_dataset_if_exists",
51
+ "columns": {},
52
+ "strict": false,
53
+ "remove_unused_columns": true,
54
+ "model_name": null,
55
+ "model_author": null,
56
+ "custom_dataset_info": [],
57
+ "quant_method": null,
58
+ "quant_bits": null,
59
+ "hqq_axis": null,
60
+ "bnb_4bit_compute_dtype": "bfloat16",
61
+ "bnb_4bit_quant_type": "nf4",
62
+ "bnb_4bit_use_double_quant": true,
63
+ "bnb_4bit_quant_storage": null,
64
+ "max_new_tokens": null,
65
+ "temperature": null,
66
+ "top_k": 50,
67
+ "top_p": 0.9,
68
+ "repetition_penalty": 1.0,
69
+ "num_beams": 1,
70
+ "stream": false,
71
+ "stop_words": [],
72
+ "logprobs": false,
73
+ "top_logprobs": null,
74
+ "ckpt_dir": "/workspace/halcyon-recipe2/Qwen3-0.6B-Base-mcore",
75
+ "lora_modules": [],
76
+ "tuner_backend": "peft",
77
+ "train_type": "full",
78
+ "adapters": [],
79
+ "external_plugins": [],
80
+ "seed": 42,
81
+ "model_kwargs": {},
82
+ "load_args": false,
83
+ "load_data_args": false,
84
+ "packing": true,
85
+ "packing_length": 4096,
86
+ "packing_num_proc": 1,
87
+ "lazy_tokenize": false,
88
+ "custom_register_path": [],
89
+ "use_hf": true,
90
+ "hub_token": null,
91
+ "ddp_timeout": 18000000,
92
+ "ddp_backend": null,
93
+ "ignore_args_error": false,
94
+ "use_swift_lora": false,
95
+ "freeze_llm": false,
96
+ "freeze_vit": true,
97
+ "freeze_aligner": true,
98
+ "freeze_parameters": [],
99
+ "freeze_parameters_regex": null,
100
+ "freeze_parameters_ratio": 0.0,
101
+ "trainable_parameters": [],
102
+ "trainable_parameters_regex": null,
103
+ "adapter_load": null,
104
+ "target_modules": [
105
+ "all-linear"
106
+ ],
107
+ "target_regex": null,
108
+ "modules_to_save": [],
109
+ "lora_rank": 8,
110
+ "lora_alpha": 32,
111
+ "lora_dropout": 0.05,
112
+ "lora_bias": "none",
113
+ "lora_dtype": null,
114
+ "use_rslora": false,
115
+ "rlhf_type": null,
116
+ "ref_load": null,
117
+ "ref_adapter_load": null,
118
+ "beta": 0.1,
119
+ "rpo_alpha": null,
120
+ "reference_free": false,
121
+ "label_smoothing": 0.0,
122
+ "f_divergence_type": "reverse_kl",
123
+ "loss_type": null,
124
+ "desirable_weight": 1.0,
125
+ "undesirable_weight": 1.0,
126
+ "calculate_KL": null,
127
+ "center_rewards_coefficient": null,
128
+ "generation_batch_size": null,
129
+ "steps_per_generation": null,
130
+ "num_generations": 8,
131
+ "max_completion_length": 512,
132
+ "importance_sampling_level": "token",
133
+ "tau_pos": 1.0,
134
+ "tau_neg": 1.05,
135
+ "epsilon": 0.2,
136
+ "epsilon_high": null,
137
+ "delta": null,
138
+ "use_vllm": true,
139
+ "vllm_mode": null,
140
+ "vllm_enable_prefix_caching": true,
141
+ "vllm_gpu_memory_utilization": 0.9,
142
+ "vllm_tensor_parallel_size": 1,
143
+ "vllm_max_model_len": null,
144
+ "vllm_enforce_eager": false,
145
+ "vllm_limit_mm_per_prompt": null,
146
+ "vllm_disable_cascade_attn": false,
147
+ "vllm_max_num_seqs": null,
148
+ "vllm_mm_processor_cache_gb": null,
149
+ "vllm_engine_kwargs": null,
150
+ "sleep_level": 0,
151
+ "offload_optimizer": false,
152
+ "offload_model": false,
153
+ "offload_bridge": false,
154
+ "vllm_server_base_url": null,
155
+ "vllm_server_host": null,
156
+ "vllm_server_port": [
157
+ 8000
158
+ ],
159
+ "vllm_server_timeout": 240.0,
160
+ "vllm_server_group_port": null,
161
+ "reward_funcs": [],
162
+ "reward_weights": null,
163
+ "cosine_min_len_value_wrong": -0.5,
164
+ "cosine_max_len_value_wrong": 0.0,
165
+ "cosine_min_len_value_correct": 1.0,
166
+ "cosine_max_len_value_correct": 0.5,
167
+ "cosine_max_len": null,
168
+ "repetition_n_grams": 3,
169
+ "repetition_max_penalty": -1.0,
170
+ "soft_max_length": null,
171
+ "soft_cache_length": null,
172
+ "dynamic_sample": false,
173
+ "max_resample_times": 3,
174
+ "overlong_filter": false,
175
+ "scale_rewards": "group",
176
+ "advantage_estimator": "grpo",
177
+ "kl_in_reward": false,
178
+ "wandb_log_unique_prompts": null,
179
+ "log_completions": false,
180
+ "rollout_importance_sampling_mode": null,
181
+ "rollout_importance_sampling_threshold": 2.0,
182
+ "log_rollout_offpolicy_metrics": false,
183
+ "off_policy_sequence_mask_delta": null,
184
+ "reward_model": null,
185
+ "reward_model_plugin": null,
186
+ "sync_ref_model": false,
187
+ "ref_model_sync_steps": 512,
188
+ "ref_model_mixup_alpha": 0.6,
189
+ "async_generate": false,
190
+ "move_model_batches": null,
191
+ "multi_turn_scheduler": null,
192
+ "max_turns": null,
193
+ "completion_length_limit_scope": "per_round",
194
+ "vllm_server_pass_dataset": false,
195
+ "log_entropy": false,
196
+ "top_entropy_quantile": 1.0,
197
+ "num_iterations": 1,
198
+ "check_model": true,
199
+ "padded_vocab_size": 151936,
200
+ "initialize_embedding": false,
201
+ "mlp_padding_free": false,
202
+ "load_safetensors": false,
203
+ "save_safetensors": false,
204
+ "ref_model": null,
205
+ "ref_adapters": [],
206
+ "merge_lora": false,
207
+ "max_shard_size": "5GB",
208
+ "train_dataloader_shuffle": true,
209
+ "dataloader_pin_memory": true,
210
+ "dataloader_persistent_workers": true,
211
+ "dataloader_prefetch_factor": 10,
212
+ "architectures": "Qwen3ForCausalLM",
213
+ "llm_architectures": "Qwen3ForCausalLM",
214
+ "max_epochs": null,
215
+ "enable_dft_loss": false,
216
+ "enable_channel_loss": false,
217
+ "patch_size": 1,
218
+ "save_strategy": "steps",
219
+ "original_max_position_embeddings": null,
220
+ "partial_rotary_factor": null,
221
+ "use_shared_expert_gate": false,
222
+ "vit_gradient_checkpointing": true,
223
+ "vit_lr": null,
224
+ "aligner_lr": null,
225
+ "gradient_checkpointing_kwargs": null,
226
+ "linear_num_value_heads": null,
227
+ "linear_num_key_heads": null,
228
+ "linear_key_head_dim": null,
229
+ "linear_value_head_dim": null,
230
+ "linear_conv_kernel_dim": null,
231
+ "layer_types": null,
232
+ "mrope_interleaved": false,
233
+ "micro_batch_size": 4,
234
+ "global_batch_size": 256,
235
+ "recompute_granularity": "full",
236
+ "recompute_method": "uniform",
237
+ "recompute_num_layers": 1,
238
+ "recompute_modules": [
239
+ "core_attn"
240
+ ],
241
+ "use_cpu_initialization": false,
242
+ "deterministic_mode": false,
243
+ "train_iters": 38100,
244
+ "log_interval": 1,
245
+ "tensorboard_dir": "/workspace/halcyon-recipe2/megatron_output/qwen3-baseline/v0-20251224-033709/runs",
246
+ "no_masked_softmax_fusion": false,
247
+ "no_bias_dropout_fusion": false,
248
+ "no_bias_swiglu_fusion": false,
249
+ "no_rope_fusion": false,
250
+ "no_gradient_accumulation_fusion": false,
251
+ "cross_entropy_loss_fusion": true,
252
+ "cross_entropy_fusion_impl": "native",
253
+ "calculate_per_token_loss": true,
254
+ "use_flash_attn": false,
255
+ "attention_backend": "flash",
256
+ "optimizer": "adam",
257
+ "optimizer_cpu_offload": false,
258
+ "optimizer_offload_fraction": 1.0,
259
+ "use_precision_aware_optimizer": true,
260
+ "main_grads_dtype": "fp32",
261
+ "main_params_dtype": "fp32",
262
+ "exp_avg_dtype": "fp32",
263
+ "exp_avg_sq_dtype": "fp32",
264
+ "dataloader_type": "cyclic",
265
+ "manual_gc": false,
266
+ "manual_gc_interval": 0,
267
+ "lr": 0.0001,
268
+ "lr_decay_style": "cosine",
269
+ "lr_decay_iters": null,
270
+ "lr_warmup_iters": 0,
271
+ "lr_warmup_fraction": 0.05,
272
+ "min_lr": 3e-06,
273
+ "weight_decay": 0.1,
274
+ "clip_grad": 1.0,
275
+ "adam_beta1": 0.9,
276
+ "adam_beta2": 0.95,
277
+ "adam_eps": 1e-08,
278
+ "sgd_momentum": 0.9,
279
+ "save": "/workspace/halcyon-recipe2/megatron_output/qwen3-baseline/v0-20251224-033709",
280
+ "save_interval": 100,
281
+ "save_retain_interval": null,
282
+ "no_save_optim": false,
283
+ "no_save_rng": false,
284
+ "load": "/workspace/halcyon-recipe2/Qwen3-0.6B-Base-mcore",
285
+ "no_load_optim": false,
286
+ "no_load_rng": false,
287
+ "finetune": true,
288
+ "ckpt_format": "torch_dist",
289
+ "no_initialization": true,
290
+ "auto_detect_ckpt_format": true,
291
+ "exit_on_missing_checkpoint": true,
292
+ "async_save": false,
293
+ "use_persistent_ckpt_worker": false,
294
+ "ckpt_fully_parallel_load": false,
295
+ "ckpt_assume_constant_structure": false,
296
+ "distributed_backend": "nccl",
297
+ "local_rank": 0,
298
+ "use_distributed_optimizer": true,
299
+ "tensor_model_parallel_size": 1,
300
+ "pipeline_model_parallel_size": 1,
301
+ "decoder_first_pipeline_num_layers": null,
302
+ "decoder_last_pipeline_num_layers": null,
303
+ "account_for_embedding_in_pipeline_split": false,
304
+ "account_for_loss_in_pipeline_split": false,
305
+ "sequence_parallel": false,
306
+ "context_parallel_size": 1,
307
+ "tp_comm_overlap": false,
308
+ "overlap_grad_reduce": true,
309
+ "overlap_param_gather": true,
310
+ "distributed_timeout_minutes": 300000,
311
+ "num_layers_per_virtual_pipeline_stage": null,
312
+ "num_virtual_stages_per_pipeline_rank": null,
313
+ "microbatch_group_size_per_virtual_pipeline_stage": null,
314
+ "pipeline_model_parallel_layout": null,
315
+ "num_layers": 28,
316
+ "hidden_size": 1024,
317
+ "ffn_hidden_size": 3072,
318
+ "num_attention_heads": 16,
319
+ "group_query_attention": true,
320
+ "num_query_groups": 8,
321
+ "softmax_type": null,
322
+ "window_size": null,
323
+ "window_attn_skip_freq": null,
324
+ "max_position_embeddings": 32768,
325
+ "position_embedding_type": "rope",
326
+ "mrope_section": null,
327
+ "rotary_base": 1000000,
328
+ "rotary_percent": 1.0,
329
+ "rotary_interleaved": false,
330
+ "normalization": "RMSNorm",
331
+ "norm_epsilon": 1e-06,
332
+ "swiglu": true,
333
+ "quick_geglu": false,
334
+ "activation_func_clamp_value": null,
335
+ "glu_linear_offset": null,
336
+ "untie_embeddings_and_output_weights": false,
337
+ "disable_bias_linear": true,
338
+ "add_qkv_bias": false,
339
+ "attention_dropout": 0.0,
340
+ "hidden_dropout": 0.0,
341
+ "kv_channels": 128,
342
+ "qk_layernorm": true,
343
+ "qk_l2_norm": null,
344
+ "no_rope_freq": null,
345
+ "moe_apply_probs_on_input": null,
346
+ "transformer_impl": "transformer_engine",
347
+ "num_experts": null,
348
+ "moe_layer_freq": "1",
349
+ "moe_ffn_hidden_size": null,
350
+ "moe_shared_expert_intermediate_size": null,
351
+ "moe_router_topk": 2,
352
+ "moe_router_num_groups": null,
353
+ "moe_router_group_topk": null,
354
+ "moe_router_pre_softmax": false,
355
+ "moe_router_dtype": "fp32",
356
+ "moe_router_score_function": "softmax",
357
+ "moe_router_bias_update_rate": null,
358
+ "moe_router_enable_expert_bias": false,
359
+ "moe_router_topk_scaling_factor": null,
360
+ "moe_router_load_balancing_type": "aux_loss",
361
+ "expert_model_parallel_size": 1,
362
+ "expert_tensor_parallel_size": 1,
363
+ "moe_token_dispatcher_type": null,
364
+ "moe_enable_deepep": false,
365
+ "moe_grouped_gemm": true,
366
+ "moe_permute_fusion": false,
367
+ "moe_aux_loss_coeff": 0.0,
368
+ "moe_z_loss_coeff": null,
369
+ "moe_shared_expert_overlap": false,
370
+ "moe_layer_recompute": false,
371
+ "moe_expert_capacity_factor": null,
372
+ "moe_pad_expert_input_to_capacity": false,
373
+ "moe_token_drop_policy": null,
374
+ "multi_latent_attention": false,
375
+ "q_lora_rank": null,
376
+ "kv_lora_rank": 32,
377
+ "qk_head_dim": 128,
378
+ "qk_pos_emb_head_dim": 64,
379
+ "mtp_num_layers": null,
380
+ "mtp_loss_scaling_factor": 0.1,
381
+ "fp8_format": null,
382
+ "fp8_recipe": "delayed",
383
+ "fp8_amax_history_len": 1024,
384
+ "fp8_amax_compute_algo": "max",
385
+ "fp8_param_gather": false,
386
+ "fp16": false,
387
+ "bf16": true,
388
+ "apply_query_key_layer_scaling": false,
389
+ "attention_softmax_in_fp32": true,
390
+ "log_params_norm": false,
391
+ "log_throughput": false,
392
+ "tensorboard_log_interval": 1,
393
+ "tensorboard_queue_size": 50,
394
+ "log_timers_to_tensorboard": true,
395
+ "no_log_learning_rate_to_tensorboard": false,
396
+ "log_validation_ppl_to_tensorboard": true,
397
+ "log_memory_to_tensorboard": true,
398
+ "logging_level": "20",
399
+ "wandb_project": "plt",
400
+ "wandb_exp_name": "baseline",
401
+ "wandb_save_dir": null,
402
+ "eval_iters": -1,
403
+ "eval_interval": 100,
404
+ "seq_length": 4096,
405
+ "num_workers": 32,
406
+ "no_data_sharding": false,
407
+ "megatron_extra_kwargs": {},
408
+ "add_version": true,
409
+ "rank": 0,
410
+ "global_world_size": 8,
411
+ "local_world_size": 8,
412
+ "model_suffix": "Qwen3-0.6B-Base",
413
+ "model_info": "ModelInfo(model_type='qwen3', model_dir='/workspace/.hf_home/hub/models--Qwen--Qwen3-0.6B-Base/snapshots/da87bfb608c14b7cf20ba1ce41287e8de496c0cd', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, is_multimodal=False, config=None, task_type='causal_lm', num_labels=None)",
414
+ "model_meta": "ModelMeta(model_type='qwen3', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen3-0.6B-Base', hf_model_id='Qwen/Qwen3-0.6B-Base', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-1.7B-Base', hf_model_id='Qwen/Qwen3-1.7B-Base', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-4B-Base', hf_model_id='Qwen/Qwen3-4B-Base', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-8B-Base', hf_model_id='Qwen/Qwen3-8B-Base', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-14B-Base', hf_model_id='Qwen/Qwen3-14B-Base', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-0.6B', hf_model_id='Qwen/Qwen3-0.6B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-1.7B', hf_model_id='Qwen/Qwen3-1.7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-4B', hf_model_id='Qwen/Qwen3-4B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-8B', hf_model_id='Qwen/Qwen3-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-14B', hf_model_id='Qwen/Qwen3-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-32B', hf_model_id='Qwen/Qwen3-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-0.6B-FP8', hf_model_id='Qwen/Qwen3-0.6B-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-1.7B-FP8', hf_model_id='Qwen/Qwen3-1.7B-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-4B-FP8', hf_model_id='Qwen/Qwen3-4B-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-8B-FP8', hf_model_id='Qwen/Qwen3-8B-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-14B-FP8', hf_model_id='Qwen/Qwen3-14B-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-32B-FP8', hf_model_id='Qwen/Qwen3-32B-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-4B-AWQ', hf_model_id='Qwen/Qwen3-4B-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-8B-AWQ', hf_model_id='Qwen/Qwen3-8B-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-14B-AWQ', hf_model_id='Qwen/Qwen3-14B-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-32B-AWQ', hf_model_id='Qwen/Qwen3-32B-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='swift/Qwen3-32B-AWQ', hf_model_id=None, model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen3', get_function=<function get_model_tokenizer_with_flash_attn at 0x7f555d60fe20>, model_arch=ModelKeys(arch_name='llama', embedding='model.embed_tokens', module_list='model.layers', lm_head='lm_head', q_proj='model.layers.{}.self_attn.q_proj', k_proj='model.layers.{}.self_attn.k_proj', v_proj='model.layers.{}.self_attn.v_proj', o_proj='model.layers.{}.self_attn.o_proj', attention='model.layers.{}.self_attn', mlp='model.layers.{}.mlp', down_proj='model.layers.{}.mlp.down_proj', qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None), architectures=['Qwen3ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, is_reranker=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.51'], tags=[])",
415
+ "model_dir": "/workspace/.hf_home/hub/models--Qwen--Qwen3-0.6B-Base/snapshots/da87bfb608c14b7cf20ba1ce41287e8de496c0cd",
416
+ "_val_dataset_exists": [],
417
+ "hub": "<class 'swift.hub.hub.HFHub'>",
418
+ "megatron_model_meta": "MegatronModelMeta(megatron_model_type='gpt', model_types=['qwen2', 'qwen2_5', 'qwq', 'qwq_preview', 'qwen2_5_math', 'llama', 'llama3', 'llama3_1', 'llama3_2', 'longwriter_llama3_1', 'codefuse_codellama', 'marco_o1', 'deepseek', 'deepseek_r1_distill', 'yi', 'yi_coder', 'sus', 'skywork_o1', 'openbuddy_llama', 'openbuddy_llama3', 'megrez', 'reflection', 'numina', 'ziya', 'mengzi3', 'qwen3', 'qwen3_thinking', 'qwen3_nothinking', 'qwen2_moe', 'qwen3_moe', 'qwen3_moe_thinking', 'qwen3_coder', 'internlm3', 'mimo', 'mimo_rl', 'moonlight', 'kimi_k2', 'deepseek_moe', 'deepseek_v2', 'deepseek_v2_5', 'deepseek_r1', 'dots1', 'ernie', 'glm4_5', 'deepseek_v3_1', 'ernie_thinking', 'gpt_oss'], is_multimodal=False, bridge_cls=<class 'swift.megatron.model.gpt_bridge.GPTBridge'>, model_cls=<class 'swift.megatron.model.gpt_model.GPTModel'>, get_transformer_layer_spec=None, model_provider=<function model_provider at 0x7f54e0d9dc60>, visual_cls=None, extra_args_provider=None)",
419
+ "extra_args": {
420
+ "model_dir": "/workspace/.hf_home/hub/models--Qwen--Qwen3-0.6B-Base/snapshots/da87bfb608c14b7cf20ba1ce41287e8de496c0cd",
421
+ "is_multimodal": false,
422
+ "hf_model_type": "qwen3",
423
+ "use_ray": false,
424
+ "ray_exp_name": null,
425
+ "device_groups": null,
426
+ "model": "Qwen/Qwen3-0.6B-Base",
427
+ "model_type": "qwen3",
428
+ "model_revision": null,
429
+ "task_type": "causal_lm",
430
+ "torch_dtype": "bfloat16",
431
+ "attn_impl": null,
432
+ "new_special_tokens": [],
433
+ "num_labels": null,
434
+ "problem_type": null,
435
+ "rope_scaling": null,
436
+ "device_map": null,
437
+ "max_memory": {},
438
+ "max_model_len": null,
439
+ "local_repo_path": null,
440
+ "init_strategy": null,
441
+ "template": "qwen3",
442
+ "system": null,
443
+ "max_length": 4096,
444
+ "truncation_strategy": "right",
445
+ "max_pixels": null,
446
+ "agent_template": null,
447
+ "norm_bbox": null,
448
+ "use_chat_template": false,
449
+ "padding_free": true,
450
+ "padding_side": "right",
451
+ "sequence_parallel_size": 1,
452
+ "response_prefix": null,
453
+ "template_backend": "swift",
454
+ "dataset": [],
455
+ "val_dataset": [],
456
+ "cached_dataset": [
457
+ "/workspace/full"
458
+ ],
459
+ "cached_val_dataset": [],
460
+ "split_dataset_ratio": 0.0,
461
+ "data_seed": 42,
462
+ "dataset_num_proc": 32,
463
+ "load_from_cache_file": false,
464
+ "dataset_shuffle": true,
465
+ "val_dataset_shuffle": false,
466
+ "streaming": false,
467
+ "interleave_prob": null,
468
+ "stopping_strategy": "first_exhausted",
469
+ "shuffle_buffer_size": 1000,
470
+ "download_mode": "reuse_dataset_if_exists",
471
+ "columns": {},
472
+ "strict": false,
473
+ "remove_unused_columns": true,
474
+ "model_name": null,
475
+ "model_author": null,
476
+ "custom_dataset_info": [],
477
+ "quant_method": null,
478
+ "quant_bits": null,
479
+ "hqq_axis": null,
480
+ "bnb_4bit_compute_dtype": "bfloat16",
481
+ "bnb_4bit_quant_type": "nf4",
482
+ "bnb_4bit_use_double_quant": true,
483
+ "bnb_4bit_quant_storage": null,
484
+ "max_new_tokens": null,
485
+ "temperature": null,
486
+ "top_k": 50,
487
+ "top_p": 0.9,
488
+ "repetition_penalty": 1.0,
489
+ "num_beams": 1,
490
+ "stream": false,
491
+ "stop_words": [],
492
+ "logprobs": false,
493
+ "top_logprobs": null,
494
+ "ckpt_dir": "/workspace/halcyon-recipe2/Qwen3-0.6B-Base-mcore",
495
+ "lora_modules": [],
496
+ "tuner_backend": "peft",
497
+ "train_type": "full",
498
+ "adapters": [],
499
+ "external_plugins": [],
500
+ "model_kwargs": {},
501
+ "load_args": false,
502
+ "load_data_args": false,
503
+ "packing": true,
504
+ "packing_length": 4096,
505
+ "packing_num_proc": 1,
506
+ "lazy_tokenize": false,
507
+ "custom_register_path": [],
508
+ "use_hf": true,
509
+ "hub_token": null,
510
+ "ddp_timeout": 18000000,
511
+ "ddp_backend": null,
512
+ "ignore_args_error": false,
513
+ "use_swift_lora": false,
514
+ "freeze_llm": false,
515
+ "freeze_vit": true,
516
+ "freeze_aligner": true,
517
+ "freeze_parameters": [],
518
+ "freeze_parameters_regex": null,
519
+ "freeze_parameters_ratio": 0.0,
520
+ "trainable_parameters": [],
521
+ "trainable_parameters_regex": null,
522
+ "adapter_load": null,
523
+ "target_modules": [
524
+ "all-linear"
525
+ ],
526
+ "target_regex": null,
527
+ "modules_to_save": [],
528
+ "lora_rank": 8,
529
+ "lora_alpha": 32,
530
+ "lora_dropout": 0.05,
531
+ "lora_bias": "none",
532
+ "lora_dtype": null,
533
+ "use_rslora": false,
534
+ "rlhf_type": null,
535
+ "ref_load": null,
536
+ "ref_adapter_load": null,
537
+ "beta": 0.1,
538
+ "rpo_alpha": null,
539
+ "reference_free": false,
540
+ "label_smoothing": 0.0,
541
+ "f_divergence_type": "reverse_kl",
542
+ "loss_type": null,
543
+ "desirable_weight": 1.0,
544
+ "undesirable_weight": 1.0,
545
+ "calculate_KL": null,
546
+ "center_rewards_coefficient": null,
547
+ "generation_batch_size": null,
548
+ "steps_per_generation": null,
549
+ "num_generations": 8,
550
+ "max_completion_length": 512,
551
+ "importance_sampling_level": "token",
552
+ "tau_pos": 1.0,
553
+ "tau_neg": 1.05,
554
+ "epsilon": 0.2,
555
+ "epsilon_high": null,
556
+ "delta": null,
557
+ "use_vllm": true,
558
+ "vllm_mode": null,
559
+ "vllm_enable_prefix_caching": true,
560
+ "vllm_gpu_memory_utilization": 0.9,
561
+ "vllm_tensor_parallel_size": 1,
562
+ "vllm_max_model_len": null,
563
+ "vllm_enforce_eager": false,
564
+ "vllm_limit_mm_per_prompt": null,
565
+ "vllm_disable_cascade_attn": false,
566
+ "vllm_max_num_seqs": null,
567
+ "vllm_mm_processor_cache_gb": null,
568
+ "vllm_engine_kwargs": null,
569
+ "sleep_level": 0,
570
+ "offload_optimizer": false,
571
+ "offload_model": false,
572
+ "offload_bridge": false,
573
+ "vllm_server_base_url": null,
574
+ "vllm_server_host": null,
575
+ "vllm_server_port": [
576
+ 8000
577
+ ],
578
+ "vllm_server_timeout": 240.0,
579
+ "vllm_server_group_port": null,
580
+ "reward_funcs": [],
581
+ "reward_weights": null,
582
+ "cosine_min_len_value_wrong": -0.5,
583
+ "cosine_max_len_value_wrong": 0.0,
584
+ "cosine_min_len_value_correct": 1.0,
585
+ "cosine_max_len_value_correct": 0.5,
586
+ "cosine_max_len": null,
587
+ "repetition_n_grams": 3,
588
+ "repetition_max_penalty": -1.0,
589
+ "soft_max_length": null,
590
+ "soft_cache_length": null,
591
+ "dynamic_sample": false,
592
+ "max_resample_times": 3,
593
+ "overlong_filter": false,
594
+ "scale_rewards": "group",
595
+ "advantage_estimator": "grpo",
596
+ "kl_in_reward": false,
597
+ "wandb_log_unique_prompts": null,
598
+ "log_completions": false,
599
+ "rollout_importance_sampling_mode": null,
600
+ "rollout_importance_sampling_threshold": 2.0,
601
+ "log_rollout_offpolicy_metrics": false,
602
+ "off_policy_sequence_mask_delta": null,
603
+ "reward_model": null,
604
+ "reward_model_plugin": null,
605
+ "sync_ref_model": false,
606
+ "ref_model_sync_steps": 512,
607
+ "ref_model_mixup_alpha": 0.6,
608
+ "async_generate": false,
609
+ "move_model_batches": null,
610
+ "multi_turn_scheduler": null,
611
+ "max_turns": null,
612
+ "completion_length_limit_scope": "per_round",
613
+ "vllm_server_pass_dataset": false,
614
+ "log_entropy": false,
615
+ "top_entropy_quantile": 1.0,
616
+ "num_iterations": 1,
617
+ "check_model": true,
618
+ "padded_vocab_size": 151936,
619
+ "initialize_embedding": false,
620
+ "mlp_padding_free": false,
621
+ "load_safetensors": false,
622
+ "save_safetensors": false,
623
+ "ref_model": null,
624
+ "ref_adapters": [],
625
+ "merge_lora": false,
626
+ "max_shard_size": "5GB",
627
+ "train_dataloader_shuffle": true,
628
+ "dataloader_pin_memory": true,
629
+ "dataloader_persistent_workers": true,
630
+ "dataloader_prefetch_factor": 10,
631
+ "architectures": "Qwen3ForCausalLM",
632
+ "llm_architectures": "Qwen3ForCausalLM",
633
+ "max_epochs": null,
634
+ "enable_dft_loss": false,
635
+ "enable_channel_loss": false,
636
+ "patch_size": 1,
637
+ "save_strategy": "steps",
638
+ "original_max_position_embeddings": null,
639
+ "partial_rotary_factor": null,
640
+ "use_shared_expert_gate": false,
641
+ "vit_gradient_checkpointing": true,
642
+ "vit_lr": null,
643
+ "aligner_lr": null,
644
+ "gradient_checkpointing_kwargs": null,
645
+ "linear_num_value_heads": null,
646
+ "linear_num_key_heads": null,
647
+ "linear_key_head_dim": null,
648
+ "linear_value_head_dim": null,
649
+ "linear_conv_kernel_dim": null,
650
+ "layer_types": null,
651
+ "mrope_interleaved": false,
652
+ "add_version": true
653
+ }
654
+ }
images/batch-size vs samples.png ADDED
images/batch-size.png ADDED
images/grad-norm vs samples.png ADDED
images/grad-norm.png ADDED
images/iteration-time.png ADDED
images/learning-rate vs samples.png ADDED
images/learning-rate.png ADDED
images/lm loss vs samples.png ADDED
images/lm loss.png ADDED
images/loss-scale vs samples.png ADDED
images/loss-scale.png ADDED
images/mem-allocated-bytes.png ADDED
images/mem-allocated-count.png ADDED
images/mem-max-allocated-bytes.png ADDED
images/mem-reserved-bytes.png ADDED
latest_checkpointed_iteration.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ 38100
latest_wandb_artifact_path.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ tepic/plt
logging.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6ef2e8418b81fb20b5b7952eef83e998df430d4468f8555615d55079c3d2b56
3
+ size 11152685
runs/events.out.tfevents.1766547916.36fd00e7b21c.611253.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a936c171e3fadb62874e1a23844e5a673a393745d4a7d60006a4f4703e0a95a5
3
+ size 32283810
wandb/wandb/debug-internal.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/wandb/debug.log ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-12-24 03:45:18,797 INFO MainThread:611253 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
2
+ 2025-12-24 03:45:18,797 INFO MainThread:611253 [wandb_setup.py:_flush():80] Configure stats pid to 611253
3
+ 2025-12-24 03:45:18,797 INFO MainThread:611253 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings
4
+ 2025-12-24 03:45:18,797 INFO MainThread:611253 [wandb_setup.py:_flush():80] Loading settings from /workspace/halcyon-recipe2/wandb/settings
5
+ 2025-12-24 03:45:18,797 INFO MainThread:611253 [wandb_setup.py:_flush():80] Loading settings from environment variables
6
+ 2025-12-24 03:45:18,798 INFO MainThread:611253 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /workspace/halcyon-recipe2/megatron_output/qwen3-baseline/v0-20251224-033709/wandb/wandb/run-20251224_034518-gd3q7mjv/logs/debug.log
7
+ 2025-12-24 03:45:18,798 INFO MainThread:611253 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /workspace/halcyon-recipe2/megatron_output/qwen3-baseline/v0-20251224-033709/wandb/wandb/run-20251224_034518-gd3q7mjv/logs/debug-internal.log
8
+ 2025-12-24 03:45:18,798 INFO MainThread:611253 [wandb_init.py:init():841] calling init triggers
9
+ 2025-12-24 03:45:18,798 INFO MainThread:611253 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
10
+ config: {'num_layers': 28, 'encoder_num_layers': 28, 'decoder_num_layers': None, 'hidden_size': 1024, 'ffn_hidden_size': 3072, 'num_attention_heads': 16, 'attention_backend': <AttnBackend.flash: 1>, 'kv_channels': 128, 'group_query_attention': True, 'num_query_groups': 8, 'max_position_embeddings': 32768, 'position_embedding_type': 'rope', 'relative_attention_num_buckets': 32, 'relative_attention_max_distance': 128, 'use_rotary_position_embeddings': False, 'rotary_base': 1000000, 'rotary_percent': 1.0, 'rotary_interleaved': False, 'rotary_seq_len_interpolation_factor': None, 'use_rope_scaling': False, 'rope_scaling_factor': 8.0, 'no_rope_freq': None, 'add_position_embedding': True, 'mrope_section': None, 'make_vocab_size_divisible_by': 128, 'normalization': 'RMSNorm', 'norm_epsilon': 1e-06, 'apply_layernorm_1p': False, 'apply_residual_connection_post_layernorm': False, 'openai_gelu': False, 'squared_relu': False, 'swiglu': True, 'onnx_safe': None, 'bert_binary_head': True, 'untie_embeddings_and_output_weights': False, 'multi_latent_attention': False, 'mtp_num_layers': None, 'mtp_loss_scaling_factor': 0.1, 'attention_dropout': 0.0, 'hidden_dropout': 0.0, 'weight_decay': 0.1, 'start_weight_decay': 0.1, 'end_weight_decay': 0.1, 'weight_decay_incr_style': 'constant', 'clip_grad': 1.0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-08, 'sgd_momentum': 0.9, 'micro_batch_size': 4, 'global_batch_size': 256, 'rampup_batch_size': None, 'decrease_batch_size_if_needed': False, 'recompute_granularity': 'full', 'check_for_nan_in_loss_and_grad': True, 'check_for_spiky_loss': False, 'check_for_large_grads': False, 'distribute_saved_activations': False, 'recompute_method': 'uniform', 'recompute_num_layers': 1, 'recompute_modules': ['core_attn'], 'clone_scatter_output_in_embedding': True, 'profile': False, 'profile_step_start': 10, 'profile_step_end': 12, 'iterations_to_skip': [], 'result_rejected_tracker_filename': None, 'enable_gloo_process_groups': True, 'use_pytorch_profiler': False, 'profile_ranks': [0], 'record_memory_history': False, 'memory_snapshot_path': 'snapshot.pickle', 'tp_comm_overlap': False, 'tp_comm_overlap_cfg': None, 'tp_comm_overlap_ag': True, 'tp_comm_overlap_rs': True, 'tp_comm_overlap_rs_dgrad': False, 'tp_comm_bulk_dgrad': True, 'tp_comm_bulk_wgrad': True, 'tp_comm_bootstrap_backend': 'nccl', 'use_cpu_initialization': None, 'empty_unused_memory_level': 0, 'deterministic_mode': False, 'check_weight_hash_across_dp_replicas_interval': None, 'calculate_per_token_loss': True, 'train_sync_interval': None, 'train_iters': 38100, 'train_samples': None, 'log_interval': 1, 'exit_interval': None, 'exit_duration_in_mins': None, 'exit_signal_handler': False, 'tensorboard_dir': '/workspace/halcyon-recipe2/megatron_output/qwen3-baseline/v0-20251224-033709/runs', 'masked_softmax_fusion': True, 'bias_gelu_fusion': False, 'bias_swiglu_fusion': True, 'use_fused_weighted_squared_relu': False, 'bias_dropout_fusion': True, 'apply_rope_fusion': True, 'rope_type': None, 'cross_entropy_loss_fusion': True, 'cross_entropy_fusion_impl': 'native', 'use_flash_attn': False, 'add_bias_linear': False, 'add_qkv_bias': False, 'optimizer': 'adam', 'optimizer_cpu_offload': False, 'optimizer_offload_fraction': 1.0, 'use_torch_optimizer_for_cpu_offload': False, 'overlap_cpu_optimizer_d2h_h2d': False, 'pin_cpu_grads': True, 'pin_cpu_params': True, 'dataloader_type': 'cyclic', 'async_tensor_model_parallel_allreduce': True, 'no_persist_layer_norm': False, 'sequence_parallel': False, 'gradient_accumulation_fusion': True, 'deprecated_use_mcore_models': False, 'use_legacy_models': False, 'manual_gc': False, 'manual_gc_interval': 0, 'manual_gc_eval': True, 'tp_comm_split_ag': True, 'tp_comm_split_rs': True, 'pipeline_model_parallel_comm_backend': None, 'high_priority_stream_groups': [], 'seed': 42, 'data_parallel_random_init': False, 'init_method_std': 0.02, 'embedding_init_method_std': None, 'init_method_xavier_uniform': False, 'lr': 0.0001, 'lr_decay_style': 'cosine', 'lr_wsd_decay_style': 'exponential', 'lr_decay_iters': None, 'lr_decay_samples': None, 'lr_wsd_decay_samples': None, 'lr_wsd_decay_iters': None, 'lr_warmup_fraction': 0.05, 'lr_warmup_iters': 0, 'lr_warmup_samples': 0, 'lr_warmup_init': 0.0, 'min_lr': 3e-06, 'override_opt_param_scheduler': False, 'use_checkpoint_opt_param_scheduler': False, 'decoupled_lr': None, 'decoupled_min_lr': None, 'save': '/workspace/halcyon-recipe2/megatron_output/qwen3-baseline/v0-20251224-033709', 'save_interval': 100, 'save_retain_interval': None, 'no_save_optim': None, 'no_save_rng': None, 'load': '/workspace/halcyon-recipe2/Qwen3-0.6B-Base-mcore', 'no_load_optim': None, 'load_main_params_from_ckpt': None, 'no_load_rng': None, 'strict_fsdp_dtensor_load': True, 'non_persistent_save_interval': None, 'non_persistent_ckpt_type': None, 'non_persistent_global_ckpt_dir': None, 'non_persistent_local_ckpt_dir': None, 'non_persistent_local_ckpt_algo': 'fully_parallel', 'finetune': True, 'pretrained_checkpoint': None, 'ckpt_step': None, 'perform_initialization': False, 'use_checkpoint_args': False, 'use_mp_args_from_checkpoint_args': False, 'use_tokenizer_model_from_checkpoint_args': True, 'exit_on_missing_checkpoint': True, 'use_dist_ckpt_deprecated': False, 'use_persistent_ckpt_worker': False, 'auto_detect_ckpt_format': True, 'dist_ckpt_format_deprecated': None, 'ckpt_format': 'torch_dist', 'ckpt_convert_format': None, 'ckpt_convert_save': None, 'ckpt_convert_update_legacy_dist_opt_format': False, 'ckpt_fully_parallel_save_deprecated': False, 'ckpt_fully_parallel_save': True, 'async_save': None, 'ckpt_fully_parallel_load': False, 'ckpt_assume_constant_structure': False, 'dist_ckpt_strictness': 'assume_ok_unexpected', 'load_model_opt_format': False, 'fp16': False, 'bf16': True, 'grad_reduce_in_bf16': False, 'loss_scale': None, 'initial_loss_scale': 4294967296, 'min_loss_scale': 1.0, 'loss_scale_window': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'apply_query_key_layer_scaling': False, 'attention_softmax_in_fp32': True, 'accumulate_allreduce_grads_in_fp32': True, 'fp16_lm_cross_entropy': False, 'disable_bf16_reduced_precision_matmul': False, 'reuse_grad_buf_for_mxfp8_param_ag': False, 'tensor_model_parallel_size': 1, 'pipeline_model_parallel_size': 1, 'decoder_first_pipeline_num_layers': None, 'decoder_last_pipeline_num_layers': None, 'pipeline_model_parallel_layout': None, 'num_layers_per_virtual_pipeline_stage': None, 'num_virtual_stages_per_pipeline_rank': None, 'microbatch_group_size_per_vp_stage': None, 'overlap_p2p_comm': False, 'overlap_p2p_comm_warmup_flush': False, 'distributed_backend': 'nccl', 'distributed_timeout_minutes': 300000, 'overlap_grad_reduce': True, 'defer_embedding_wgrad_compute': False, 'wgrad_deferral_limit': 0, 'align_grad_reduce': True, 'ddp_num_buckets': None, 'ddp_bucket_size': None, 'ddp_pad_buckets_for_high_nccl_busbw': False, 'ddp_average_in_collective': False, 'overlap_param_gather': True, 'overlap_param_gather_with_optimizer_step': False, 'align_param_gather': False, 'scatter_gather_tensors_in_pipeline': True, 'use_ring_exchange_p2p': False, 'local_rank': 7, 'lazy_mpu_init': None, 'account_for_embedding_in_pipeline_split': False, 'account_for_loss_in_pipeline_split': False, 'use_distributed_optimizer': True, 'nccl_ub': False, 'use_sharp': False, 'sharp_enabled_group': None, 'use_megatron_fsdp': False, 'init_model_with_meta_device': False, 'data_parallel_sharding_strategy': 'no_shard', 'gradient_reduce_div_fusion': True, 'fsdp_double_buffer': False, 'suggested_communication_unit_size': None, 'keep_fp8_transpose_cache': False, 'enable_full_sharding_in_hsdp': False, 'num_distributed_optimizer_instances': 1, 'use_torch_fsdp2': False, 'torch_fsdp2_reshard_after_forward': True, 'context_parallel_size': 1, 'cp_comm_type': ['p2p'], 'hierarchical_context_parallel_sizes': None, 'nccl_communicator_config_path': None, 'use_tp_pp_dp_mapping': False, 'replication': False, 'replication_jump': None, 'replication_factor': 2, 'full_validation': False, 'multiple_validation_sets': False, 'eval_iters': -1, 'eval_interval': 100, 'test_mode': False, 'skip_train': False, 'data_path': None, 'split': None, 'train_data_path': None, 'valid_data_path': None, 'test_data_path': None, 'data_args_path': None, 'per_split_data_args_path': None, 'data_cache_path': None, 'mmap_bin_files': True, 'mock_data': False, 'seq_length': 4096, 'encoder_seq_length': 4096, 'decoder_seq_length': None, 'retriever_seq_length': 256, 'sample_rate': 1.0, 'mask_prob': 0.15, 'short_seq_prob': 0.1, 'num_workers': 32, 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'create_attention_mask_in_dataloader': True, 'num_dataset_builder_threads': 1, 'object_storage_cache_path': None, 'mid_level_dataset_surplus': 0.005, 'vocab_size': None, 'padded_vocab_size': 151936, 'vocab_file': None, 'merge_file': None, 'vocab_extra_ids': 0, 'tokenizer_type': None, 'tokenizer_model': None, 'tiktoken_pattern': None, 'tiktoken_num_special_tokens': 1000, 'tiktoken_special_tokens': None, 'adlr_autoresume': False, 'adlr_autoresume_interval': 1000, 'ict_head_size': None, 'biencoder_projection_dim': 0, 'biencoder_shared_query_context_model': False, 'ict_load': None, 'bert_load': None, 'titles_data_path': None, 'query_in_block_prob': 0.1, 'use_one_sent_docs': False, 'evidence_data_path': None, 'retriever_report_topk_accuracies': [], 'retriever_score_scaling': False, 'block_data_path': None, 'embedding_path': None, 'indexer_batch_size': 128, 'indexer_log_interval': 1000, 'num_classes': 1000, 'img_h': 224, 'img_w': 224, 'num_channels': 3, 'patch_dim': 16, 'classes_fraction': 1.0, 'data_per_class_fraction': 1.0, 'data_sharding': True, 'head_lr_mult': 1.0, 'vision_pretraining': False, 'vision_pretraining_type': 'classify', 'vision_backbone_type': 'vit', 'swin_backbone_type': 'tiny', 'mask_type': 'random', 'mask_factor': 1.0, 'iter_per_epoch': 1250, 'dino_local_img_size': 96, 'dino_local_crops_number': 10, 'dino_head_hidden_size': 2048, 'dino_bottleneck_size': 256, 'dino_freeze_last_layer': 1, 'dino_norm_last_layer': False, 'dino_warmup_teacher_temp': 0.04, 'dino_teacher_temp': 0.07, 'dino_warmup_teacher_temp_epochs': 30, 'qk_layernorm': True, 'qk_l2_norm': False, 'expert_model_parallel_size': 1, 'expert_tensor_parallel_size': 1, 'num_experts': None, 'moe_layer_freq': 1, 'moe_ffn_hidden_size': None, 'moe_shared_expert_intermediate_size': None, 'moe_shared_expert_overlap': False, 'moe_grouped_gemm': True, 'moe_use_legacy_grouped_gemm': False, 'moe_layer_recompute': False, 'moe_extended_tp': False, 'moe_use_upcycling': False, 'moe_router_load_balancing_type': 'aux_loss', 'moe_router_dtype': 'fp32', 'moe_router_fusion': False, 'moe_router_score_function': 'softmax', 'moe_router_topk': 2, 'moe_router_pre_softmax': False, 'moe_router_num_groups': None, 'moe_router_group_topk': None, 'moe_router_topk_scaling_factor': None, 'moe_router_enable_expert_bias': False, 'moe_router_bias_update_rate': 0.001, 'moe_router_force_load_balancing': False, 'moe_router_padding_for_fp8': False, 'moe_aux_loss_coeff': 0.0, 'moe_z_loss_coeff': None, 'moe_input_jitter_eps': None, 'moe_per_layer_logging': False, 'moe_token_dispatcher_type': 'alltoall', 'moe_enable_deepep': False, 'moe_deepep_num_sms': 20, 'moe_permute_fusion': False, 'moe_expert_capacity_factor': None, 'moe_pad_expert_input_to_capacity': False, 'moe_token_drop_policy': 'probs', 'moe_apply_probs_on_input': False, 'overlap_moe_expert_parallel_comm': False, 'delay_wgrad_compute': False, 'moe_upcycling_granularity': 1, 'q_lora_rank': None, 'kv_lora_rank': 32, 'qk_head_dim': 128, 'qk_pos_emb_head_dim': 64, 'v_head_dim': 128, 'rotary_scaling_factor': 1.0, 'mscale': 1.0, 'mscale_all_dim': 0.0, 'cache_mla_latents': False, 'heterogeneous_layers_config_path': None, 'heterogeneous_layers_config_encoded_json': None, 'log_params_norm': False, 'log_num_zeros_in_grad': False, 'log_throughput': False, 'log_progress': False, 'timing_log_level': 0, 'log_energy': False, 'barrier_with_L1_time': True, 'timing_log_option': 'minmax', 'tensorboard_log_interval': 1, 'tensorboard_queue_size': 50, 'log_timers_to_tensorboard': True, 'log_loss_scale_to_tensorboard': True, 'log_validation_ppl_to_tensorboard': True, 'log_memory_to_tensorboard': True, 'log_world_size_to_tensorboard': False, 'wandb_project': 'plt', 'wandb_exp_name': 'baseline', 'wandb_save_dir': '', 'logging_level': 20, 'log_straggler': False, 'disable_straggler_on_startup': False, 'straggler_ctrlr_port': 65535, 'straggler_minmax_count': 1, 'run_workload_inspector_server': False, 'inference_batch_times_seqlen_threshold': -1, 'max_tokens_to_oom': 12000, 'output_bert_embeddings': False, 'bert_embedder_type': 'megatron', 'flash_decode': False, 'enable_cuda_graph': False, 'cuda_graph_warmup_steps': 3, 'external_cuda_graph': False, 'cuda_graph_scope': 'full', 'inference_max_batch_size': 8, 'inference_max_seq_length': 2560, 'inference_dynamic_batching': False, 'inference_dynamic_batching_buffer_size_gb': 40.0, 'inference_dynamic_batching_chunk_size': 256, 'inference_dynamic_batching_buffer_guaranteed_fraction': 0.2, 'inference_dynamic_batching_buffer_overflow_factor': None, 'inference_dynamic_batching_max_requests_override': None, 'inference_dynamic_batching_max_tokens_override': None, 'inference_dynamic_batching_num_cuda_graphs': 16, 'symmetric_ar_type': None, 'nccl_all_reduce_for_prefill': False, 'mlp_chunks_for_prefill': 1, 'fp8': None, 'fp8_recipe': 'delayed', 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1024, 'fp8_amax_compute_algo': 'max', 'fp8_wgrad': True, 'transformer_impl': 'transformer_engine', 'fp8_param_gather': False, 'first_last_layers_bf16': False, 'num_layers_at_start_in_bf16': 1, 'num_layers_at_end_in_bf16': 1, 'te_rng_tracker': False, 'inference_rng_tracker': False, 'retro_project_dir': None, 'retro_add_retriever': False, 'retro_cyclic_train_iters': None, 'retro_encoder_layers': 2, 'retro_encoder_hidden_dropout': 0.1, 'retro_encoder_attention_dropout': 0.1, 'retro_num_neighbors': 2, 'retro_num_retrieved_chunks': 2, 'retro_attention_gate': 1, 'retro_verify_neighbor_count': True, 'enable_experimental': False, 'spec': None, 'hybrid_attention_ratio': 0.0, 'hybrid_mlp_ratio': 0.0, 'hybrid_override_pattern': None, 'mamba_state_dim': 128, 'mamba_head_dim': 64, 'mamba_num_groups': 8, 'mamba_num_heads': None, 'is_hybrid_model': False, 'disable_mamba_mem_eff_path': False, 'yaml_cfg': None, 'use_precision_aware_optimizer': True, 'main_grads_dtype': torch.float32, 'main_params_dtype': torch.float32, 'exp_avg_dtype': torch.float32, 'exp_avg_sq_dtype': torch.float32, 'enable_one_logger': True, 'one_logger_project': 'megatron-lm', 'one_logger_run_name': None, 'one_logger_async': False, 'app_tag_run_name': None, 'app_tag_run_version': '0.0.0', 'inprocess_restart': False, 'inprocess_max_iterations': None, 'inprocess_monitor_thread_interval': 1.0, 'inprocess_monitor_process_interval': 1.0, 'inprocess_progress_watchdog_interval': 1.0, 'inprocess_heartbeat_interval': 30, 'inprocess_soft_timeout': 60, 'inprocess_hard_timeout': 90, 'inprocess_heartbeat_timeout': 60, 'inprocess_barrier_timeout': 120, 'inprocess_completion_timeout': 120, 'inprocess_last_call_wait': 1, 'inprocess_termination_grace_time': 1, 'inprocess_granularity': 'node', 'inprocess_active_world_size': 8, 'inprocess_empty_cuda_cache': False, 'enable_ft_package': False, 'calc_ft_timeouts': False, 'config_logger_dir': '', 'error_injection_rate': 0, 'error_injection_type': 'transient_error', 'rerun_mode': 'validate_results', 'enable_msc': True, 'kitchen_config_file': None, 'kitchen_recipe_number': None, 'sft': False, 'sft_tokenizer_prompt_format': 'nemotron-h-aligned', 'rank': 7, 'world_size': 8, 'use_dist_ckpt': True, 'transformer_pipeline_model_parallel_size': 1, 'data_parallel_size': 8, 'model_dir': '/workspace/.hf_home/hub/models--Qwen--Qwen3-0.6B-Base/snapshots/da87bfb608c14b7cf20ba1ce41287e8de496c0cd', 'is_multimodal': False, 'hf_model_type': 'qwen3', 'use_ray': False, 'ray_exp_name': None, 'device_groups': None, 'model': 'Qwen/Qwen3-0.6B-Base', 'model_type': 'qwen3', 'model_revision': None, 'task_type': 'causal_lm', 'torch_dtype': torch.bfloat16, 'attn_impl': None, 'new_special_tokens': [], 'num_labels': None, 'problem_type': None, 'rope_scaling': None, 'device_map': None, 'max_memory': {}, 'max_model_len': None, 'local_repo_path': None, 'init_strategy': None, 'template': 'qwen3', 'system': None, 'max_length': 4096, 'truncation_strategy': 'right', 'max_pixels': None, 'agent_template': None, 'norm_bbox': None, 'use_chat_template': False, 'padding_free': True, 'padding_side': 'right', 'sequence_parallel_size': 1, 'response_prefix': None, 'template_backend': 'swift', 'dataset': [], 'val_dataset': [], 'cached_dataset': ['/workspace/full'], 'cached_val_dataset': [], 'split_dataset_ratio': 0.0, 'data_seed': 42, 'dataset_num_proc': 32, 'load_from_cache_file': False, 'dataset_shuffle': True, 'val_dataset_shuffle': False, 'streaming': False, 'interleave_prob': None, 'stopping_strategy': 'first_exhausted', 'shuffle_buffer_size': 1000, 'download_mode': 'reuse_dataset_if_exists', 'columns': {}, 'strict': False, 'remove_unused_columns': True, 'model_name': None, 'model_author': None, 'custom_dataset_info': [], 'quant_method': None, 'quant_bits': None, 'hqq_axis': None, 'bnb_4bit_compute_dtype': torch.bfloat16, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_quant_storage': None, 'max_new_tokens': None, 'temperature': None, 'top_k': 50, 'top_p': 0.9, 'repetition_penalty': 1.0, 'num_beams': 1, 'stream': False, 'stop_words': [], 'logprobs': False, 'top_logprobs': None, 'ckpt_dir': '/workspace/halcyon-recipe2/Qwen3-0.6B-Base-mcore', 'lora_modules': [], 'tuner_backend': 'peft', 'train_type': 'full', 'adapters': [], 'external_plugins': [], 'model_kwargs': {}, 'load_args': False, 'load_data_args': False, 'packing': True, 'packing_length': 4096, 'packing_num_proc': 1, 'lazy_tokenize': False, 'custom_register_path': [], 'use_hf': True, 'hub_token': None, 'ddp_timeout': 18000000, 'ddp_backend': None, 'ignore_args_error': False, 'use_swift_lora': False, 'freeze_llm': False, 'freeze_vit': True, 'freeze_aligner': True, 'freeze_parameters': [], 'freeze_parameters_regex': None, 'freeze_parameters_ratio': 0.0, 'trainable_parameters': [], 'trainable_parameters_regex': None, 'adapter_load': None, 'target_modules': ['all-linear'], 'target_regex': None, 'modules_to_save': [], 'lora_rank': 8, 'lora_alpha': 32, 'lora_dropout': 0.05, 'lora_bias': 'none', 'lora_dtype': None, 'use_rslora': False, 'rlhf_type': None, 'ref_load': None, 'ref_adapter_load': None, 'beta': 0.1, 'rpo_alpha': None, 'reference_free': False, 'label_smoothing': 0.0, 'f_divergence_type': 'reverse_kl', 'loss_type': None, 'desirable_weight': 1.0, 'undesirable_weight': 1.0, 'calculate_KL': None, 'center_rewards_coefficient': None, 'generation_batch_size': None, 'steps_per_generation': None, 'num_generations': 8, 'max_completion_length': 512, 'importance_sampling_level': 'token', 'tau_pos': 1.0, 'tau_neg': 1.05, 'epsilon': 0.2, 'epsilon_high': None, 'delta': None, 'use_vllm': True, 'vllm_mode': None, 'vllm_enable_prefix_caching': True, 'vllm_gpu_memory_utilization': 0.9, 'vllm_tensor_parallel_size': 1, 'vllm_max_model_len': None, 'vllm_enforce_eager': False, 'vllm_limit_mm_per_prompt': None, 'vllm_disable_cascade_attn': False, 'vllm_max_num_seqs': None, 'vllm_mm_processor_cache_gb': None, 'vllm_engine_kwargs': None, 'sleep_level': 0, 'offload_optimizer': False, 'offload_model': False, 'offload_bridge': False, 'vllm_server_base_url': None, 'vllm_server_host': None, 'vllm_server_port': [8000], 'vllm_server_timeout': 240.0, 'vllm_server_group_port': None, 'reward_funcs': [], 'reward_weights': None, 'cosine_min_len_value_wrong': -0.5, 'cosine_max_len_value_wrong': 0.0, 'cosine_min_len_value_correct': 1.0, 'cosine_max_len_value_correct': 0.5, 'cosine_max_len': None, 'repetition_n_grams': 3, 'repetition_max_penalty': -1.0, 'soft_max_length': None, 'soft_cache_length': None, 'dynamic_sample': False, 'max_resample_times': 3, 'overlong_filter': False, 'scale_rewards': 'group', 'advantage_estimator': 'grpo', 'kl_in_reward': False, 'wandb_log_unique_prompts': None, 'log_completions': False, 'rollout_importance_sampling_mode': None, 'rollout_importance_sampling_threshold': 2.0, 'log_rollout_offpolicy_metrics': False, 'off_policy_sequence_mask_delta': None, 'reward_model': None, 'reward_model_plugin': None, 'sync_ref_model': False, 'ref_model_sync_steps': 512, 'ref_model_mixup_alpha': 0.6, 'async_generate': False, 'move_model_batches': None, 'multi_turn_scheduler': None, 'max_turns': None, 'completion_length_limit_scope': 'per_round', 'vllm_server_pass_dataset': False, 'log_entropy': False, 'top_entropy_quantile': 1.0, 'num_iterations': 1, 'check_model': True, 'initialize_embedding': False, 'mlp_padding_free': False, 'load_safetensors': False, 'save_safetensors': False, 'ref_model': None, 'ref_adapters': [], 'merge_lora': False, 'max_shard_size': '5GB', 'train_dataloader_shuffle': True, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': True, 'dataloader_prefetch_factor': 10, 'architectures': 'Qwen3ForCausalLM', 'llm_architectures': 'Qwen3ForCausalLM', 'max_epochs': None, 'enable_dft_loss': False, 'enable_channel_loss': False, 'patch_size': 1, 'save_strategy': 'steps', 'original_max_position_embeddings': None, 'partial_rotary_factor': None, 'use_shared_expert_gate': False, 'vit_gradient_checkpointing': True, 'vit_lr': None, 'aligner_lr': None, 'gradient_checkpointing_kwargs': None, 'linear_num_value_heads': None, 'linear_num_key_heads': None, 'linear_key_head_dim': None, 'linear_value_head_dim': None, 'linear_conv_kernel_dim': None, 'layer_types': None, 'mrope_interleaved': False, 'add_version': True, 'virtual_pipeline_model_parallel_size': None, 'params_dtype': torch.bfloat16, 'consumed_train_samples': 0, 'skipped_train_samples': 0, 'consumed_valid_samples': 0, 'variable_seq_lengths': False, '_wandb': {}}
11
+ 2025-12-24 03:45:18,798 INFO MainThread:611253 [wandb_init.py:init():889] starting backend
12
+ 2025-12-24 03:45:19,067 INFO MainThread:611253 [wandb_init.py:init():892] sending inform_init request
13
+ 2025-12-24 03:45:19,070 INFO MainThread:611253 [wandb_init.py:init():900] backend started and connected
14
+ 2025-12-24 03:45:19,074 INFO MainThread:611253 [wandb_init.py:init():970] updated telemetry
15
+ 2025-12-24 03:45:19,081 INFO MainThread:611253 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
16
+ 2025-12-24 03:45:19,580 INFO MainThread:611253 [wandb_init.py:init():1041] starting run threads in backend
17
+ 2025-12-24 03:45:19,738 INFO MainThread:611253 [wandb_run.py:_console_start():2521] atexit reg
18
+ 2025-12-24 03:45:19,738 INFO MainThread:611253 [wandb_run.py:_redirect():2369] redirect: wrap_raw
19
+ 2025-12-24 03:45:19,738 INFO MainThread:611253 [wandb_run.py:_redirect():2438] Wrapping output streams.
20
+ 2025-12-24 03:45:19,738 INFO MainThread:611253 [wandb_run.py:_redirect():2461] Redirects installed.
21
+ 2025-12-24 03:45:19,742 INFO MainThread:611253 [wandb_init.py:init():1081] run started, returning control to user process
22
+ 2025-12-26 07:36:28,290 INFO MainThread:611253 [wandb_run.py:_finish():2287] finishing run tepic/plt/gd3q7mjv
23
+ 2025-12-26 07:36:28,292 INFO MainThread:611253 [wandb_run.py:_atexit_cleanup():2486] got exitcode: 0
24
+ 2025-12-26 07:36:28,293 INFO MainThread:611253 [wandb_run.py:_restore():2468] restore
25
+ 2025-12-26 07:36:28,293 INFO MainThread:611253 [wandb_run.py:_restore():2474] restore done
26
+ 2025-12-26 07:36:29,517 INFO MainThread:611253 [wandb_run.py:_footer_sync_info():3862] logging synced files
wandb/wandb/run-20251224_034518-gd3q7mjv/files/config.yaml ADDED
@@ -0,0 +1,1779 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.23.1
4
+ e:
5
+ 5bh5hk313ky3l0v9f9cesb7o1x31upc6:
6
+ args:
7
+ - --seed
8
+ - "42"
9
+ - --micro-batch-size
10
+ - "4"
11
+ - --global-batch-size
12
+ - "256"
13
+ - --recompute-granularity
14
+ - full
15
+ - --recompute-method
16
+ - uniform
17
+ - --recompute-num-layers
18
+ - "1"
19
+ - --recompute-modules
20
+ - core_attn
21
+ - --train-iters
22
+ - "38100"
23
+ - --log-interval
24
+ - "1"
25
+ - --tensorboard-dir
26
+ - /workspace/halcyon-recipe2/megatron_output/qwen3-baseline/v0-20251224-033709/runs
27
+ - --cross-entropy-loss-fusion
28
+ - --cross-entropy-fusion-impl
29
+ - native
30
+ - --calculate-per-token-loss
31
+ - --attention-backend
32
+ - flash
33
+ - --optimizer
34
+ - adam
35
+ - --optimizer-offload-fraction
36
+ - "1.0"
37
+ - --use-precision-aware-optimizer
38
+ - --main-grads-dtype
39
+ - fp32
40
+ - --main-params-dtype
41
+ - fp32
42
+ - --exp-avg-dtype
43
+ - fp32
44
+ - --exp-avg-sq-dtype
45
+ - fp32
46
+ - --dataloader-type
47
+ - cyclic
48
+ - --manual-gc-interval
49
+ - "0"
50
+ - --lr
51
+ - "0.0001"
52
+ - --lr-decay-style
53
+ - cosine
54
+ - --lr-warmup-iters
55
+ - "0"
56
+ - --lr-warmup-fraction
57
+ - "0.05"
58
+ - --min-lr
59
+ - "3e-06"
60
+ - --weight-decay
61
+ - "0.1"
62
+ - --clip-grad
63
+ - "1.0"
64
+ - --adam-beta1
65
+ - "0.9"
66
+ - --adam-beta2
67
+ - "0.95"
68
+ - --adam-eps
69
+ - "1e-08"
70
+ - --sgd-momentum
71
+ - "0.9"
72
+ - --save
73
+ - /workspace/halcyon-recipe2/megatron_output/qwen3-baseline/v0-20251224-033709
74
+ - --save-interval
75
+ - "100"
76
+ - --load
77
+ - /workspace/halcyon-recipe2/Qwen3-0.6B-Base-mcore
78
+ - --finetune
79
+ - --ckpt-format
80
+ - torch_dist
81
+ - --no-initialization
82
+ - --auto-detect-ckpt-format
83
+ - --exit-on-missing-checkpoint
84
+ - --distributed-backend
85
+ - nccl
86
+ - --local-rank
87
+ - "7"
88
+ - --use-distributed-optimizer
89
+ - --tensor-model-parallel-size
90
+ - "1"
91
+ - --pipeline-model-parallel-size
92
+ - "1"
93
+ - --context-parallel-size
94
+ - "1"
95
+ - --overlap-grad-reduce
96
+ - --overlap-param-gather
97
+ - --distributed-timeout-minutes
98
+ - "300000"
99
+ - --num-layers
100
+ - "28"
101
+ - --hidden-size
102
+ - "1024"
103
+ - --ffn-hidden-size
104
+ - "3072"
105
+ - --num-attention-heads
106
+ - "16"
107
+ - --group-query-attention
108
+ - --num-query-groups
109
+ - "8"
110
+ - --max-position-embeddings
111
+ - "32768"
112
+ - --position-embedding-type
113
+ - rope
114
+ - --rotary-base
115
+ - "1000000"
116
+ - --rotary-percent
117
+ - "1.0"
118
+ - --normalization
119
+ - RMSNorm
120
+ - --norm-epsilon
121
+ - "1e-06"
122
+ - --swiglu
123
+ - --disable-bias-linear
124
+ - --attention-dropout
125
+ - "0.0"
126
+ - --hidden-dropout
127
+ - "0.0"
128
+ - --kv-channels
129
+ - "128"
130
+ - --qk-layernorm
131
+ - --transformer-impl
132
+ - transformer_engine
133
+ - --moe-layer-freq
134
+ - "1"
135
+ - --moe-router-topk
136
+ - "2"
137
+ - --moe-router-dtype
138
+ - fp32
139
+ - --moe-router-score-function
140
+ - softmax
141
+ - --moe-router-load-balancing-type
142
+ - aux_loss
143
+ - --expert-model-parallel-size
144
+ - "1"
145
+ - --expert-tensor-parallel-size
146
+ - "1"
147
+ - --moe-token-dispatcher-type
148
+ - alltoall
149
+ - --moe-grouped-gemm
150
+ - --moe-aux-loss-coeff
151
+ - "0.0"
152
+ - --moe-token-drop-policy
153
+ - probs
154
+ - --kv-lora-rank
155
+ - "32"
156
+ - --qk-head-dim
157
+ - "128"
158
+ - --qk-pos-emb-head-dim
159
+ - "64"
160
+ - --mtp-loss-scaling-factor
161
+ - "0.1"
162
+ - --fp8-recipe
163
+ - delayed
164
+ - --fp8-amax-history-len
165
+ - "1024"
166
+ - --fp8-amax-compute-algo
167
+ - max
168
+ - --bf16
169
+ - --attention-softmax-in-fp32
170
+ - --tensorboard-log-interval
171
+ - "1"
172
+ - --tensorboard-queue-size
173
+ - "50"
174
+ - --log-timers-to-tensorboard
175
+ - --log-validation-ppl-to-tensorboard
176
+ - --log-memory-to-tensorboard
177
+ - --logging-level
178
+ - "20"
179
+ - --wandb-project
180
+ - plt
181
+ - --wandb-exp-name
182
+ - baseline
183
+ - --eval-iters
184
+ - "-1"
185
+ - --eval-interval
186
+ - "100"
187
+ - --seq-length
188
+ - "4096"
189
+ - --num-workers
190
+ - "32"
191
+ codePath: swift/cli/_megatron/pt.py
192
+ codePathLocal: swift/cli/_megatron/pt.py
193
+ cpu_count: 72
194
+ cpu_count_logical: 144
195
+ cudaVersion: "13.0"
196
+ disk:
197
+ /:
198
+ total: "7669363507200"
199
+ used: "983051857920"
200
+ email: kazuma826826@gmail.com
201
+ executable: /venv/main/bin/python3.12
202
+ git:
203
+ commit: ea7cc214b68fb511dd83bff83a504b7f43053577
204
+ remote: https://github.com/weak-kajuma/halcyon-recipe2.git
205
+ gpu: NVIDIA GeForce RTX 5090
206
+ gpu_count: 8
207
+ gpu_nvidia:
208
+ - architecture: Blackwell
209
+ cudaCores: 21760
210
+ memoryTotal: "34190917632"
211
+ name: NVIDIA GeForce RTX 5090
212
+ uuid: GPU-5d40e56e-9cf1-0a97-080a-30624a8f6da3
213
+ - architecture: Blackwell
214
+ cudaCores: 21760
215
+ memoryTotal: "34190917632"
216
+ name: NVIDIA GeForce RTX 5090
217
+ uuid: GPU-23ca8669-46fc-19eb-348b-e51e591c150d
218
+ - architecture: Blackwell
219
+ cudaCores: 21760
220
+ memoryTotal: "34190917632"
221
+ name: NVIDIA GeForce RTX 5090
222
+ uuid: GPU-c4c1ca99-b237-b12b-43fd-7c0b428ed152
223
+ - architecture: Blackwell
224
+ cudaCores: 21760
225
+ memoryTotal: "34190917632"
226
+ name: NVIDIA GeForce RTX 5090
227
+ uuid: GPU-d48e64fd-956c-1ce4-4e95-b9d198ba26e9
228
+ - architecture: Blackwell
229
+ cudaCores: 21760
230
+ memoryTotal: "34190917632"
231
+ name: NVIDIA GeForce RTX 5090
232
+ uuid: GPU-29d31f97-dff9-6078-7bf6-d8fc65ada1b7
233
+ - architecture: Blackwell
234
+ cudaCores: 21760
235
+ memoryTotal: "34190917632"
236
+ name: NVIDIA GeForce RTX 5090
237
+ uuid: GPU-ed004a01-be7c-9fc0-6742-ac7f7a0bea49
238
+ - architecture: Blackwell
239
+ cudaCores: 21760
240
+ memoryTotal: "34190917632"
241
+ name: NVIDIA GeForce RTX 5090
242
+ uuid: GPU-56cdc53f-360e-a64f-2cd5-2ba3daaf5a7b
243
+ - architecture: Blackwell
244
+ cudaCores: 21760
245
+ memoryTotal: "34190917632"
246
+ name: NVIDIA GeForce RTX 5090
247
+ uuid: GPU-aa4a1a25-49c1-62ec-3a38-070d6c7912ef
248
+ host: 36fd00e7b21c
249
+ memory:
250
+ total: "540643262464"
251
+ os: Linux-6.8.0-58-generic-x86_64-with-glibc2.39
252
+ program: /workspace/halcyon-recipe2/swift/cli/_megatron/pt.py
253
+ python: CPython 3.12.12
254
+ root: /workspace/halcyon-recipe2/megatron_output/qwen3-baseline/v0-20251224-033709/wandb
255
+ startedAt: "2025-12-24T03:45:18.795219Z"
256
+ writerId: 5bh5hk313ky3l0v9f9cesb7o1x31upc6
257
+ m: []
258
+ python_version: 3.12.12
259
+ t:
260
+ "1":
261
+ - 1
262
+ - 11
263
+ - 41
264
+ - 49
265
+ - 51
266
+ - 71
267
+ - 84
268
+ - 98
269
+ - 105
270
+ "2":
271
+ - 1
272
+ - 11
273
+ - 41
274
+ - 49
275
+ - 51
276
+ - 71
277
+ - 84
278
+ - 98
279
+ - 105
280
+ "3":
281
+ - 2
282
+ - 13
283
+ - 16
284
+ - 61
285
+ "4": 3.12.12
286
+ "5": 0.23.1
287
+ "6": 4.57.3
288
+ "12": 0.23.1
289
+ "13": linux-x86_64
290
+ account_for_embedding_in_pipeline_split:
291
+ value: false
292
+ account_for_loss_in_pipeline_split:
293
+ value: false
294
+ accumulate_allreduce_grads_in_fp32:
295
+ value: true
296
+ adam_beta1:
297
+ value: 0.9
298
+ adam_beta2:
299
+ value: 0.95
300
+ adam_eps:
301
+ value: 1e-08
302
+ adapter_load:
303
+ value: null
304
+ adapters:
305
+ value: []
306
+ add_bias_linear:
307
+ value: false
308
+ add_position_embedding:
309
+ value: true
310
+ add_qkv_bias:
311
+ value: false
312
+ add_version:
313
+ value: true
314
+ adlr_autoresume:
315
+ value: false
316
+ adlr_autoresume_interval:
317
+ value: 1000
318
+ advantage_estimator:
319
+ value: grpo
320
+ agent_template:
321
+ value: null
322
+ align_grad_reduce:
323
+ value: true
324
+ align_param_gather:
325
+ value: false
326
+ aligner_lr:
327
+ value: null
328
+ app_tag_run_name:
329
+ value: null
330
+ app_tag_run_version:
331
+ value: 0.0.0
332
+ apply_layernorm_1p:
333
+ value: false
334
+ apply_query_key_layer_scaling:
335
+ value: false
336
+ apply_residual_connection_post_layernorm:
337
+ value: false
338
+ apply_rope_fusion:
339
+ value: true
340
+ architectures:
341
+ value: Qwen3ForCausalLM
342
+ async_generate:
343
+ value: false
344
+ async_save:
345
+ value: null
346
+ async_tensor_model_parallel_allreduce:
347
+ value: true
348
+ attention_backend:
349
+ value: flash
350
+ attention_dropout:
351
+ value: 0
352
+ attention_softmax_in_fp32:
353
+ value: true
354
+ attn_impl:
355
+ value: null
356
+ auto_detect_ckpt_format:
357
+ value: true
358
+ barrier_with_L1_time:
359
+ value: true
360
+ bert_binary_head:
361
+ value: true
362
+ bert_embedder_type:
363
+ value: megatron
364
+ bert_load:
365
+ value: null
366
+ beta:
367
+ value: 0.1
368
+ bf16:
369
+ value: true
370
+ bias_dropout_fusion:
371
+ value: true
372
+ bias_gelu_fusion:
373
+ value: false
374
+ bias_swiglu_fusion:
375
+ value: true
376
+ biencoder_projection_dim:
377
+ value: 0
378
+ biencoder_shared_query_context_model:
379
+ value: false
380
+ block_data_path:
381
+ value: null
382
+ bnb_4bit_compute_dtype:
383
+ value: torch.bfloat16
384
+ bnb_4bit_quant_storage:
385
+ value: null
386
+ bnb_4bit_quant_type:
387
+ value: nf4
388
+ bnb_4bit_use_double_quant:
389
+ value: true
390
+ cache_mla_latents:
391
+ value: false
392
+ cached_dataset:
393
+ value:
394
+ - /workspace/full
395
+ cached_val_dataset:
396
+ value: []
397
+ calc_ft_timeouts:
398
+ value: false
399
+ calculate_KL:
400
+ value: null
401
+ calculate_per_token_loss:
402
+ value: true
403
+ center_rewards_coefficient:
404
+ value: null
405
+ check_for_large_grads:
406
+ value: false
407
+ check_for_nan_in_loss_and_grad:
408
+ value: true
409
+ check_for_spiky_loss:
410
+ value: false
411
+ check_model:
412
+ value: true
413
+ check_weight_hash_across_dp_replicas_interval:
414
+ value: null
415
+ ckpt_assume_constant_structure:
416
+ value: false
417
+ ckpt_convert_format:
418
+ value: null
419
+ ckpt_convert_save:
420
+ value: null
421
+ ckpt_convert_update_legacy_dist_opt_format:
422
+ value: false
423
+ ckpt_dir:
424
+ value: /workspace/halcyon-recipe2/Qwen3-0.6B-Base-mcore
425
+ ckpt_format:
426
+ value: torch_dist
427
+ ckpt_fully_parallel_load:
428
+ value: false
429
+ ckpt_fully_parallel_save:
430
+ value: true
431
+ ckpt_fully_parallel_save_deprecated:
432
+ value: false
433
+ ckpt_step:
434
+ value: null
435
+ classes_fraction:
436
+ value: 1
437
+ clip_grad:
438
+ value: 1
439
+ clone_scatter_output_in_embedding:
440
+ value: true
441
+ completion_length_limit_scope:
442
+ value: per_round
443
+ config_logger_dir:
444
+ value: ""
445
+ consumed_train_samples:
446
+ value: 0
447
+ consumed_valid_samples:
448
+ value: 0
449
+ context_parallel_size:
450
+ value: 1
451
+ cosine_max_len:
452
+ value: null
453
+ cosine_max_len_value_correct:
454
+ value: 0.5
455
+ cosine_max_len_value_wrong:
456
+ value: 0
457
+ cosine_min_len_value_correct:
458
+ value: 1
459
+ cosine_min_len_value_wrong:
460
+ value: -0.5
461
+ cp_comm_type:
462
+ value:
463
+ - p2p
464
+ create_attention_mask_in_dataloader:
465
+ value: true
466
+ cross_entropy_fusion_impl:
467
+ value: native
468
+ cross_entropy_loss_fusion:
469
+ value: true
470
+ cuda_graph_scope:
471
+ value: full
472
+ cuda_graph_warmup_steps:
473
+ value: 3
474
+ custom_dataset_info:
475
+ value: []
476
+ custom_register_path:
477
+ value: []
478
+ data_args_path:
479
+ value: null
480
+ data_cache_path:
481
+ value: null
482
+ data_parallel_random_init:
483
+ value: false
484
+ data_parallel_sharding_strategy:
485
+ value: no_shard
486
+ data_parallel_size:
487
+ value: 8
488
+ data_path:
489
+ value: null
490
+ data_per_class_fraction:
491
+ value: 1
492
+ data_seed:
493
+ value: 42
494
+ data_sharding:
495
+ value: true
496
+ dataloader_persistent_workers:
497
+ value: true
498
+ dataloader_pin_memory:
499
+ value: true
500
+ dataloader_prefetch_factor:
501
+ value: 10
502
+ dataloader_type:
503
+ value: cyclic
504
+ dataset:
505
+ value: []
506
+ dataset_num_proc:
507
+ value: 32
508
+ dataset_shuffle:
509
+ value: true
510
+ ddp_average_in_collective:
511
+ value: false
512
+ ddp_backend:
513
+ value: null
514
+ ddp_bucket_size:
515
+ value: null
516
+ ddp_num_buckets:
517
+ value: null
518
+ ddp_pad_buckets_for_high_nccl_busbw:
519
+ value: false
520
+ ddp_timeout:
521
+ value: 18000000
522
+ decoder_first_pipeline_num_layers:
523
+ value: null
524
+ decoder_last_pipeline_num_layers:
525
+ value: null
526
+ decoder_num_layers:
527
+ value: null
528
+ decoder_seq_length:
529
+ value: null
530
+ decoupled_lr:
531
+ value: null
532
+ decoupled_min_lr:
533
+ value: null
534
+ decrease_batch_size_if_needed:
535
+ value: false
536
+ defer_embedding_wgrad_compute:
537
+ value: false
538
+ delay_wgrad_compute:
539
+ value: false
540
+ delta:
541
+ value: null
542
+ deprecated_use_mcore_models:
543
+ value: false
544
+ desirable_weight:
545
+ value: 1
546
+ deterministic_mode:
547
+ value: false
548
+ device_groups:
549
+ value: null
550
+ device_map:
551
+ value: null
552
+ dino_bottleneck_size:
553
+ value: 256
554
+ dino_freeze_last_layer:
555
+ value: 1
556
+ dino_head_hidden_size:
557
+ value: 2048
558
+ dino_local_crops_number:
559
+ value: 10
560
+ dino_local_img_size:
561
+ value: 96
562
+ dino_norm_last_layer:
563
+ value: false
564
+ dino_teacher_temp:
565
+ value: 0.07
566
+ dino_warmup_teacher_temp:
567
+ value: 0.04
568
+ dino_warmup_teacher_temp_epochs:
569
+ value: 30
570
+ disable_bf16_reduced_precision_matmul:
571
+ value: false
572
+ disable_mamba_mem_eff_path:
573
+ value: false
574
+ disable_straggler_on_startup:
575
+ value: false
576
+ dist_ckpt_format_deprecated:
577
+ value: null
578
+ dist_ckpt_strictness:
579
+ value: assume_ok_unexpected
580
+ distribute_saved_activations:
581
+ value: false
582
+ distributed_backend:
583
+ value: nccl
584
+ distributed_timeout_minutes:
585
+ value: 300000
586
+ download_mode:
587
+ value: reuse_dataset_if_exists
588
+ dynamic_sample:
589
+ value: false
590
+ embedding_init_method_std:
591
+ value: null
592
+ embedding_path:
593
+ value: null
594
+ empty_unused_memory_level:
595
+ value: 0
596
+ enable_channel_loss:
597
+ value: false
598
+ enable_cuda_graph:
599
+ value: false
600
+ enable_dft_loss:
601
+ value: false
602
+ enable_experimental:
603
+ value: false
604
+ enable_ft_package:
605
+ value: false
606
+ enable_full_sharding_in_hsdp:
607
+ value: false
608
+ enable_gloo_process_groups:
609
+ value: true
610
+ enable_msc:
611
+ value: true
612
+ enable_one_logger:
613
+ value: true
614
+ encoder_num_layers:
615
+ value: 28
616
+ encoder_seq_length:
617
+ value: 4096
618
+ end_weight_decay:
619
+ value: 0.1
620
+ eod_mask_loss:
621
+ value: false
622
+ epsilon:
623
+ value: 0.2
624
+ epsilon_high:
625
+ value: null
626
+ error_injection_rate:
627
+ value: 0
628
+ error_injection_type:
629
+ value: transient_error
630
+ eval_interval:
631
+ value: 100
632
+ eval_iters:
633
+ value: -1
634
+ evidence_data_path:
635
+ value: null
636
+ exit_duration_in_mins:
637
+ value: null
638
+ exit_interval:
639
+ value: null
640
+ exit_on_missing_checkpoint:
641
+ value: true
642
+ exit_signal_handler:
643
+ value: false
644
+ exp_avg_dtype:
645
+ value: torch.float32
646
+ exp_avg_sq_dtype:
647
+ value: torch.float32
648
+ expert_model_parallel_size:
649
+ value: 1
650
+ expert_tensor_parallel_size:
651
+ value: 1
652
+ external_cuda_graph:
653
+ value: false
654
+ external_plugins:
655
+ value: []
656
+ f_divergence_type:
657
+ value: reverse_kl
658
+ ffn_hidden_size:
659
+ value: 3072
660
+ finetune:
661
+ value: true
662
+ first_last_layers_bf16:
663
+ value: false
664
+ flash_decode:
665
+ value: false
666
+ fp8:
667
+ value: null
668
+ fp8_amax_compute_algo:
669
+ value: max
670
+ fp8_amax_history_len:
671
+ value: 1024
672
+ fp8_interval:
673
+ value: 1
674
+ fp8_margin:
675
+ value: 0
676
+ fp8_param_gather:
677
+ value: false
678
+ fp8_recipe:
679
+ value: delayed
680
+ fp8_wgrad:
681
+ value: true
682
+ fp16:
683
+ value: false
684
+ fp16_lm_cross_entropy:
685
+ value: false
686
+ fp32_residual_connection:
687
+ value: false
688
+ freeze_aligner:
689
+ value: true
690
+ freeze_llm:
691
+ value: false
692
+ freeze_parameters:
693
+ value: []
694
+ freeze_parameters_ratio:
695
+ value: 0
696
+ freeze_parameters_regex:
697
+ value: null
698
+ freeze_vit:
699
+ value: true
700
+ fsdp_double_buffer:
701
+ value: false
702
+ full_validation:
703
+ value: false
704
+ generation_batch_size:
705
+ value: null
706
+ global_batch_size:
707
+ value: 256
708
+ grad_reduce_in_bf16:
709
+ value: false
710
+ gradient_accumulation_fusion:
711
+ value: true
712
+ gradient_checkpointing_kwargs:
713
+ value: null
714
+ gradient_reduce_div_fusion:
715
+ value: true
716
+ group_query_attention:
717
+ value: true
718
+ head_lr_mult:
719
+ value: 1
720
+ heterogeneous_layers_config_encoded_json:
721
+ value: null
722
+ heterogeneous_layers_config_path:
723
+ value: null
724
+ hf_model_type:
725
+ value: qwen3
726
+ hidden_dropout:
727
+ value: 0
728
+ hidden_size:
729
+ value: 1024
730
+ hierarchical_context_parallel_sizes:
731
+ value: null
732
+ high_priority_stream_groups:
733
+ value: []
734
+ hqq_axis:
735
+ value: null
736
+ hub_token:
737
+ value: null
738
+ hybrid_attention_ratio:
739
+ value: 0
740
+ hybrid_mlp_ratio:
741
+ value: 0
742
+ hybrid_override_pattern:
743
+ value: null
744
+ hysteresis:
745
+ value: 2
746
+ ict_head_size:
747
+ value: null
748
+ ict_load:
749
+ value: null
750
+ ignore_args_error:
751
+ value: false
752
+ img_h:
753
+ value: 224
754
+ img_w:
755
+ value: 224
756
+ importance_sampling_level:
757
+ value: token
758
+ indexer_batch_size:
759
+ value: 128
760
+ indexer_log_interval:
761
+ value: 1000
762
+ inference_batch_times_seqlen_threshold:
763
+ value: -1
764
+ inference_dynamic_batching:
765
+ value: false
766
+ inference_dynamic_batching_buffer_guaranteed_fraction:
767
+ value: 0.2
768
+ inference_dynamic_batching_buffer_overflow_factor:
769
+ value: null
770
+ inference_dynamic_batching_buffer_size_gb:
771
+ value: 40
772
+ inference_dynamic_batching_chunk_size:
773
+ value: 256
774
+ inference_dynamic_batching_max_requests_override:
775
+ value: null
776
+ inference_dynamic_batching_max_tokens_override:
777
+ value: null
778
+ inference_dynamic_batching_num_cuda_graphs:
779
+ value: 16
780
+ inference_max_batch_size:
781
+ value: 8
782
+ inference_max_seq_length:
783
+ value: 2560
784
+ inference_rng_tracker:
785
+ value: false
786
+ init_method_std:
787
+ value: 0.02
788
+ init_method_xavier_uniform:
789
+ value: false
790
+ init_model_with_meta_device:
791
+ value: false
792
+ init_strategy:
793
+ value: null
794
+ initial_loss_scale:
795
+ value: 4294967296
796
+ initialize_embedding:
797
+ value: false
798
+ inprocess_active_world_size:
799
+ value: 8
800
+ inprocess_barrier_timeout:
801
+ value: 120
802
+ inprocess_completion_timeout:
803
+ value: 120
804
+ inprocess_empty_cuda_cache:
805
+ value: false
806
+ inprocess_granularity:
807
+ value: node
808
+ inprocess_hard_timeout:
809
+ value: 90
810
+ inprocess_heartbeat_interval:
811
+ value: 30
812
+ inprocess_heartbeat_timeout:
813
+ value: 60
814
+ inprocess_last_call_wait:
815
+ value: 1
816
+ inprocess_max_iterations:
817
+ value: null
818
+ inprocess_monitor_process_interval:
819
+ value: 1
820
+ inprocess_monitor_thread_interval:
821
+ value: 1
822
+ inprocess_progress_watchdog_interval:
823
+ value: 1
824
+ inprocess_restart:
825
+ value: false
826
+ inprocess_soft_timeout:
827
+ value: 60
828
+ inprocess_termination_grace_time:
829
+ value: 1
830
+ interleave_prob:
831
+ value: null
832
+ is_hybrid_model:
833
+ value: false
834
+ is_multimodal:
835
+ value: false
836
+ iter_per_epoch:
837
+ value: 1250
838
+ iterations_to_skip:
839
+ value: []
840
+ keep_fp8_transpose_cache:
841
+ value: false
842
+ kitchen_config_file:
843
+ value: null
844
+ kitchen_recipe_number:
845
+ value: null
846
+ kl_in_reward:
847
+ value: false
848
+ kv_channels:
849
+ value: 128
850
+ kv_lora_rank:
851
+ value: 32
852
+ label_smoothing:
853
+ value: 0
854
+ layer_types:
855
+ value: null
856
+ lazy_mpu_init:
857
+ value: null
858
+ lazy_tokenize:
859
+ value: false
860
+ linear_conv_kernel_dim:
861
+ value: null
862
+ linear_key_head_dim:
863
+ value: null
864
+ linear_num_key_heads:
865
+ value: null
866
+ linear_num_value_heads:
867
+ value: null
868
+ linear_value_head_dim:
869
+ value: null
870
+ llm_architectures:
871
+ value: Qwen3ForCausalLM
872
+ load:
873
+ value: /workspace/halcyon-recipe2/Qwen3-0.6B-Base-mcore
874
+ load_args:
875
+ value: false
876
+ load_data_args:
877
+ value: false
878
+ load_from_cache_file:
879
+ value: false
880
+ load_main_params_from_ckpt:
881
+ value: null
882
+ load_model_opt_format:
883
+ value: false
884
+ load_safetensors:
885
+ value: false
886
+ local_rank:
887
+ value: 7
888
+ local_repo_path:
889
+ value: null
890
+ log_completions:
891
+ value: false
892
+ log_energy:
893
+ value: false
894
+ log_entropy:
895
+ value: false
896
+ log_interval:
897
+ value: 1
898
+ log_loss_scale_to_tensorboard:
899
+ value: true
900
+ log_memory_to_tensorboard:
901
+ value: true
902
+ log_num_zeros_in_grad:
903
+ value: false
904
+ log_params_norm:
905
+ value: false
906
+ log_progress:
907
+ value: false
908
+ log_rollout_offpolicy_metrics:
909
+ value: false
910
+ log_straggler:
911
+ value: false
912
+ log_throughput:
913
+ value: false
914
+ log_timers_to_tensorboard:
915
+ value: true
916
+ log_validation_ppl_to_tensorboard:
917
+ value: true
918
+ log_world_size_to_tensorboard:
919
+ value: false
920
+ logging_level:
921
+ value: 20
922
+ logprobs:
923
+ value: false
924
+ lora_alpha:
925
+ value: 32
926
+ lora_bias:
927
+ value: none
928
+ lora_dropout:
929
+ value: 0.05
930
+ lora_dtype:
931
+ value: null
932
+ lora_modules:
933
+ value: []
934
+ lora_rank:
935
+ value: 8
936
+ loss_scale:
937
+ value: null
938
+ loss_scale_window:
939
+ value: 1000
940
+ loss_type:
941
+ value: null
942
+ lr:
943
+ value: 0.0001
944
+ lr_decay_iters:
945
+ value: null
946
+ lr_decay_samples:
947
+ value: null
948
+ lr_decay_style:
949
+ value: cosine
950
+ lr_warmup_fraction:
951
+ value: 0.05
952
+ lr_warmup_init:
953
+ value: 0
954
+ lr_warmup_iters:
955
+ value: 0
956
+ lr_warmup_samples:
957
+ value: 0
958
+ lr_wsd_decay_iters:
959
+ value: null
960
+ lr_wsd_decay_samples:
961
+ value: null
962
+ lr_wsd_decay_style:
963
+ value: exponential
964
+ main_grads_dtype:
965
+ value: torch.float32
966
+ main_params_dtype:
967
+ value: torch.float32
968
+ make_vocab_size_divisible_by:
969
+ value: 128
970
+ mamba_head_dim:
971
+ value: 64
972
+ mamba_num_groups:
973
+ value: 8
974
+ mamba_num_heads:
975
+ value: null
976
+ mamba_state_dim:
977
+ value: 128
978
+ manual_gc:
979
+ value: false
980
+ manual_gc_eval:
981
+ value: true
982
+ manual_gc_interval:
983
+ value: 0
984
+ mask_factor:
985
+ value: 1
986
+ mask_prob:
987
+ value: 0.15
988
+ mask_type:
989
+ value: random
990
+ masked_softmax_fusion:
991
+ value: true
992
+ max_completion_length:
993
+ value: 512
994
+ max_epochs:
995
+ value: null
996
+ max_length:
997
+ value: 4096
998
+ max_model_len:
999
+ value: null
1000
+ max_new_tokens:
1001
+ value: null
1002
+ max_pixels:
1003
+ value: null
1004
+ max_position_embeddings:
1005
+ value: 32768
1006
+ max_resample_times:
1007
+ value: 3
1008
+ max_shard_size:
1009
+ value: 5GB
1010
+ max_tokens_to_oom:
1011
+ value: 12000
1012
+ max_turns:
1013
+ value: null
1014
+ memory_snapshot_path:
1015
+ value: snapshot.pickle
1016
+ merge_file:
1017
+ value: null
1018
+ merge_lora:
1019
+ value: false
1020
+ micro_batch_size:
1021
+ value: 4
1022
+ microbatch_group_size_per_vp_stage:
1023
+ value: null
1024
+ mid_level_dataset_surplus:
1025
+ value: 0.005
1026
+ min_loss_scale:
1027
+ value: 1
1028
+ min_lr:
1029
+ value: 3e-06
1030
+ mlp_chunks_for_prefill:
1031
+ value: 1
1032
+ mlp_padding_free:
1033
+ value: false
1034
+ mmap_bin_files:
1035
+ value: true
1036
+ mock_data:
1037
+ value: false
1038
+ model:
1039
+ value: Qwen/Qwen3-0.6B-Base
1040
+ model_author:
1041
+ value: null
1042
+ model_dir:
1043
+ value: /workspace/.hf_home/hub/models--Qwen--Qwen3-0.6B-Base/snapshots/da87bfb608c14b7cf20ba1ce41287e8de496c0cd
1044
+ model_name:
1045
+ value: null
1046
+ model_revision:
1047
+ value: null
1048
+ model_type:
1049
+ value: qwen3
1050
+ modules_to_save:
1051
+ value: []
1052
+ moe_apply_probs_on_input:
1053
+ value: false
1054
+ moe_aux_loss_coeff:
1055
+ value: 0
1056
+ moe_deepep_num_sms:
1057
+ value: 20
1058
+ moe_enable_deepep:
1059
+ value: false
1060
+ moe_expert_capacity_factor:
1061
+ value: null
1062
+ moe_extended_tp:
1063
+ value: false
1064
+ moe_ffn_hidden_size:
1065
+ value: null
1066
+ moe_grouped_gemm:
1067
+ value: true
1068
+ moe_input_jitter_eps:
1069
+ value: null
1070
+ moe_layer_freq:
1071
+ value: 1
1072
+ moe_layer_recompute:
1073
+ value: false
1074
+ moe_pad_expert_input_to_capacity:
1075
+ value: false
1076
+ moe_per_layer_logging:
1077
+ value: false
1078
+ moe_permute_fusion:
1079
+ value: false
1080
+ moe_router_bias_update_rate:
1081
+ value: 0.001
1082
+ moe_router_dtype:
1083
+ value: fp32
1084
+ moe_router_enable_expert_bias:
1085
+ value: false
1086
+ moe_router_force_load_balancing:
1087
+ value: false
1088
+ moe_router_fusion:
1089
+ value: false
1090
+ moe_router_group_topk:
1091
+ value: null
1092
+ moe_router_load_balancing_type:
1093
+ value: aux_loss
1094
+ moe_router_num_groups:
1095
+ value: null
1096
+ moe_router_padding_for_fp8:
1097
+ value: false
1098
+ moe_router_pre_softmax:
1099
+ value: false
1100
+ moe_router_score_function:
1101
+ value: softmax
1102
+ moe_router_topk:
1103
+ value: 2
1104
+ moe_router_topk_scaling_factor:
1105
+ value: null
1106
+ moe_shared_expert_intermediate_size:
1107
+ value: null
1108
+ moe_shared_expert_overlap:
1109
+ value: false
1110
+ moe_token_dispatcher_type:
1111
+ value: alltoall
1112
+ moe_token_drop_policy:
1113
+ value: probs
1114
+ moe_upcycling_granularity:
1115
+ value: 1
1116
+ moe_use_legacy_grouped_gemm:
1117
+ value: false
1118
+ moe_use_upcycling:
1119
+ value: false
1120
+ moe_z_loss_coeff:
1121
+ value: null
1122
+ move_model_batches:
1123
+ value: null
1124
+ mrope_interleaved:
1125
+ value: false
1126
+ mrope_section:
1127
+ value: null
1128
+ mscale:
1129
+ value: 1
1130
+ mscale_all_dim:
1131
+ value: 0
1132
+ mtp_loss_scaling_factor:
1133
+ value: 0.1
1134
+ mtp_num_layers:
1135
+ value: null
1136
+ multi_latent_attention:
1137
+ value: false
1138
+ multi_turn_scheduler:
1139
+ value: null
1140
+ multiple_validation_sets:
1141
+ value: false
1142
+ nccl_all_reduce_for_prefill:
1143
+ value: false
1144
+ nccl_communicator_config_path:
1145
+ value: null
1146
+ nccl_ub:
1147
+ value: false
1148
+ new_special_tokens:
1149
+ value: []
1150
+ no_load_optim:
1151
+ value: null
1152
+ no_load_rng:
1153
+ value: null
1154
+ no_persist_layer_norm:
1155
+ value: false
1156
+ no_rope_freq:
1157
+ value: null
1158
+ no_save_optim:
1159
+ value: null
1160
+ no_save_rng:
1161
+ value: null
1162
+ non_persistent_ckpt_type:
1163
+ value: null
1164
+ non_persistent_global_ckpt_dir:
1165
+ value: null
1166
+ non_persistent_local_ckpt_algo:
1167
+ value: fully_parallel
1168
+ non_persistent_local_ckpt_dir:
1169
+ value: null
1170
+ non_persistent_save_interval:
1171
+ value: null
1172
+ norm_bbox:
1173
+ value: null
1174
+ norm_epsilon:
1175
+ value: 1e-06
1176
+ normalization:
1177
+ value: RMSNorm
1178
+ num_attention_heads:
1179
+ value: 16
1180
+ num_beams:
1181
+ value: 1
1182
+ num_channels:
1183
+ value: 3
1184
+ num_classes:
1185
+ value: 1000
1186
+ num_dataset_builder_threads:
1187
+ value: 1
1188
+ num_distributed_optimizer_instances:
1189
+ value: 1
1190
+ num_experts:
1191
+ value: null
1192
+ num_generations:
1193
+ value: 8
1194
+ num_iterations:
1195
+ value: 1
1196
+ num_labels:
1197
+ value: null
1198
+ num_layers:
1199
+ value: 28
1200
+ num_layers_at_end_in_bf16:
1201
+ value: 1
1202
+ num_layers_at_start_in_bf16:
1203
+ value: 1
1204
+ num_layers_per_virtual_pipeline_stage:
1205
+ value: null
1206
+ num_query_groups:
1207
+ value: 8
1208
+ num_virtual_stages_per_pipeline_rank:
1209
+ value: null
1210
+ num_workers:
1211
+ value: 32
1212
+ object_storage_cache_path:
1213
+ value: null
1214
+ off_policy_sequence_mask_delta:
1215
+ value: null
1216
+ offload_bridge:
1217
+ value: false
1218
+ offload_model:
1219
+ value: false
1220
+ offload_optimizer:
1221
+ value: false
1222
+ one_logger_async:
1223
+ value: false
1224
+ one_logger_project:
1225
+ value: megatron-lm
1226
+ one_logger_run_name:
1227
+ value: null
1228
+ onnx_safe:
1229
+ value: null
1230
+ openai_gelu:
1231
+ value: false
1232
+ optimizer:
1233
+ value: adam
1234
+ optimizer_cpu_offload:
1235
+ value: false
1236
+ optimizer_offload_fraction:
1237
+ value: 1
1238
+ original_max_position_embeddings:
1239
+ value: null
1240
+ output_bert_embeddings:
1241
+ value: false
1242
+ overlap_cpu_optimizer_d2h_h2d:
1243
+ value: false
1244
+ overlap_grad_reduce:
1245
+ value: true
1246
+ overlap_moe_expert_parallel_comm:
1247
+ value: false
1248
+ overlap_p2p_comm:
1249
+ value: false
1250
+ overlap_p2p_comm_warmup_flush:
1251
+ value: false
1252
+ overlap_param_gather:
1253
+ value: true
1254
+ overlap_param_gather_with_optimizer_step:
1255
+ value: false
1256
+ overlong_filter:
1257
+ value: false
1258
+ override_opt_param_scheduler:
1259
+ value: false
1260
+ packing:
1261
+ value: true
1262
+ packing_length:
1263
+ value: 4096
1264
+ packing_num_proc:
1265
+ value: 1
1266
+ padded_vocab_size:
1267
+ value: 151936
1268
+ padding_free:
1269
+ value: true
1270
+ padding_side:
1271
+ value: right
1272
+ params_dtype:
1273
+ value: torch.bfloat16
1274
+ partial_rotary_factor:
1275
+ value: null
1276
+ patch_dim:
1277
+ value: 16
1278
+ patch_size:
1279
+ value: 1
1280
+ per_split_data_args_path:
1281
+ value: null
1282
+ perform_initialization:
1283
+ value: false
1284
+ pin_cpu_grads:
1285
+ value: true
1286
+ pin_cpu_params:
1287
+ value: true
1288
+ pipeline_model_parallel_comm_backend:
1289
+ value: null
1290
+ pipeline_model_parallel_layout:
1291
+ value: null
1292
+ pipeline_model_parallel_size:
1293
+ value: 1
1294
+ position_embedding_type:
1295
+ value: rope
1296
+ pretrained_checkpoint:
1297
+ value: null
1298
+ problem_type:
1299
+ value: null
1300
+ profile:
1301
+ value: false
1302
+ profile_ranks:
1303
+ value:
1304
+ - 0
1305
+ profile_step_end:
1306
+ value: 12
1307
+ profile_step_start:
1308
+ value: 10
1309
+ q_lora_rank:
1310
+ value: null
1311
+ qk_head_dim:
1312
+ value: 128
1313
+ qk_l2_norm:
1314
+ value: false
1315
+ qk_layernorm:
1316
+ value: true
1317
+ qk_pos_emb_head_dim:
1318
+ value: 64
1319
+ quant_bits:
1320
+ value: null
1321
+ quant_method:
1322
+ value: null
1323
+ query_in_block_prob:
1324
+ value: 0.1
1325
+ rampup_batch_size:
1326
+ value: null
1327
+ rank:
1328
+ value: 7
1329
+ ray_exp_name:
1330
+ value: null
1331
+ recompute_granularity:
1332
+ value: full
1333
+ recompute_method:
1334
+ value: uniform
1335
+ recompute_modules:
1336
+ value:
1337
+ - core_attn
1338
+ recompute_num_layers:
1339
+ value: 1
1340
+ record_memory_history:
1341
+ value: false
1342
+ ref_adapter_load:
1343
+ value: null
1344
+ ref_adapters:
1345
+ value: []
1346
+ ref_load:
1347
+ value: null
1348
+ ref_model:
1349
+ value: null
1350
+ ref_model_mixup_alpha:
1351
+ value: 0.6
1352
+ ref_model_sync_steps:
1353
+ value: 512
1354
+ reference_free:
1355
+ value: false
1356
+ relative_attention_max_distance:
1357
+ value: 128
1358
+ relative_attention_num_buckets:
1359
+ value: 32
1360
+ remove_unused_columns:
1361
+ value: true
1362
+ repetition_max_penalty:
1363
+ value: -1
1364
+ repetition_n_grams:
1365
+ value: 3
1366
+ repetition_penalty:
1367
+ value: 1
1368
+ replication:
1369
+ value: false
1370
+ replication_factor:
1371
+ value: 2
1372
+ replication_jump:
1373
+ value: null
1374
+ rerun_mode:
1375
+ value: validate_results
1376
+ reset_attention_mask:
1377
+ value: false
1378
+ reset_position_ids:
1379
+ value: false
1380
+ response_prefix:
1381
+ value: null
1382
+ result_rejected_tracker_filename:
1383
+ value: null
1384
+ retriever_report_topk_accuracies:
1385
+ value: []
1386
+ retriever_score_scaling:
1387
+ value: false
1388
+ retriever_seq_length:
1389
+ value: 256
1390
+ retro_add_retriever:
1391
+ value: false
1392
+ retro_attention_gate:
1393
+ value: 1
1394
+ retro_cyclic_train_iters:
1395
+ value: null
1396
+ retro_encoder_attention_dropout:
1397
+ value: 0.1
1398
+ retro_encoder_hidden_dropout:
1399
+ value: 0.1
1400
+ retro_encoder_layers:
1401
+ value: 2
1402
+ retro_num_neighbors:
1403
+ value: 2
1404
+ retro_num_retrieved_chunks:
1405
+ value: 2
1406
+ retro_project_dir:
1407
+ value: null
1408
+ retro_verify_neighbor_count:
1409
+ value: true
1410
+ reuse_grad_buf_for_mxfp8_param_ag:
1411
+ value: false
1412
+ reward_funcs:
1413
+ value: []
1414
+ reward_model:
1415
+ value: null
1416
+ reward_model_plugin:
1417
+ value: null
1418
+ reward_weights:
1419
+ value: null
1420
+ rlhf_type:
1421
+ value: null
1422
+ rollout_importance_sampling_mode:
1423
+ value: null
1424
+ rollout_importance_sampling_threshold:
1425
+ value: 2
1426
+ rope_scaling:
1427
+ value: null
1428
+ rope_scaling_factor:
1429
+ value: 8
1430
+ rope_type:
1431
+ value: null
1432
+ rotary_base:
1433
+ value: 1000000
1434
+ rotary_interleaved:
1435
+ value: false
1436
+ rotary_percent:
1437
+ value: 1
1438
+ rotary_scaling_factor:
1439
+ value: 1
1440
+ rotary_seq_len_interpolation_factor:
1441
+ value: null
1442
+ rpo_alpha:
1443
+ value: null
1444
+ run_workload_inspector_server:
1445
+ value: false
1446
+ sample_rate:
1447
+ value: 1
1448
+ save:
1449
+ value: /workspace/halcyon-recipe2/megatron_output/qwen3-baseline/v0-20251224-033709
1450
+ save_interval:
1451
+ value: 100
1452
+ save_retain_interval:
1453
+ value: null
1454
+ save_safetensors:
1455
+ value: false
1456
+ save_strategy:
1457
+ value: steps
1458
+ scale_rewards:
1459
+ value: group
1460
+ scatter_gather_tensors_in_pipeline:
1461
+ value: true
1462
+ seed:
1463
+ value: 42
1464
+ seq_length:
1465
+ value: 4096
1466
+ sequence_parallel:
1467
+ value: false
1468
+ sequence_parallel_size:
1469
+ value: 1
1470
+ sft:
1471
+ value: false
1472
+ sft_tokenizer_prompt_format:
1473
+ value: nemotron-h-aligned
1474
+ sgd_momentum:
1475
+ value: 0.9
1476
+ sharp_enabled_group:
1477
+ value: null
1478
+ short_seq_prob:
1479
+ value: 0.1
1480
+ shuffle_buffer_size:
1481
+ value: 1000
1482
+ skip_train:
1483
+ value: false
1484
+ skipped_train_samples:
1485
+ value: 0
1486
+ sleep_level:
1487
+ value: 0
1488
+ soft_cache_length:
1489
+ value: null
1490
+ soft_max_length:
1491
+ value: null
1492
+ spec:
1493
+ value: null
1494
+ split:
1495
+ value: null
1496
+ split_dataset_ratio:
1497
+ value: 0
1498
+ squared_relu:
1499
+ value: false
1500
+ start_weight_decay:
1501
+ value: 0.1
1502
+ steps_per_generation:
1503
+ value: null
1504
+ stop_words:
1505
+ value: []
1506
+ stopping_strategy:
1507
+ value: first_exhausted
1508
+ straggler_ctrlr_port:
1509
+ value: 65535
1510
+ straggler_minmax_count:
1511
+ value: 1
1512
+ stream:
1513
+ value: false
1514
+ streaming:
1515
+ value: false
1516
+ strict:
1517
+ value: false
1518
+ strict_fsdp_dtensor_load:
1519
+ value: true
1520
+ suggested_communication_unit_size:
1521
+ value: null
1522
+ swiglu:
1523
+ value: true
1524
+ swin_backbone_type:
1525
+ value: tiny
1526
+ symmetric_ar_type:
1527
+ value: null
1528
+ sync_ref_model:
1529
+ value: false
1530
+ system:
1531
+ value: null
1532
+ target_modules:
1533
+ value:
1534
+ - all-linear
1535
+ target_regex:
1536
+ value: null
1537
+ task_type:
1538
+ value: causal_lm
1539
+ tau_neg:
1540
+ value: 1.05
1541
+ tau_pos:
1542
+ value: 1
1543
+ te_rng_tracker:
1544
+ value: false
1545
+ temperature:
1546
+ value: null
1547
+ template:
1548
+ value: qwen3
1549
+ template_backend:
1550
+ value: swift
1551
+ tensor_model_parallel_size:
1552
+ value: 1
1553
+ tensorboard_dir:
1554
+ value: /workspace/halcyon-recipe2/megatron_output/qwen3-baseline/v0-20251224-033709/runs
1555
+ tensorboard_log_interval:
1556
+ value: 1
1557
+ tensorboard_queue_size:
1558
+ value: 50
1559
+ test_data_path:
1560
+ value: null
1561
+ test_mode:
1562
+ value: false
1563
+ tiktoken_num_special_tokens:
1564
+ value: 1000
1565
+ tiktoken_pattern:
1566
+ value: null
1567
+ tiktoken_special_tokens:
1568
+ value: null
1569
+ timing_log_level:
1570
+ value: 0
1571
+ timing_log_option:
1572
+ value: minmax
1573
+ titles_data_path:
1574
+ value: null
1575
+ tokenizer_model:
1576
+ value: null
1577
+ tokenizer_type:
1578
+ value: null
1579
+ top_entropy_quantile:
1580
+ value: 1
1581
+ top_k:
1582
+ value: 50
1583
+ top_logprobs:
1584
+ value: null
1585
+ top_p:
1586
+ value: 0.9
1587
+ torch_dtype:
1588
+ value: torch.bfloat16
1589
+ torch_fsdp2_reshard_after_forward:
1590
+ value: true
1591
+ tp_comm_bootstrap_backend:
1592
+ value: nccl
1593
+ tp_comm_bulk_dgrad:
1594
+ value: true
1595
+ tp_comm_bulk_wgrad:
1596
+ value: true
1597
+ tp_comm_overlap:
1598
+ value: false
1599
+ tp_comm_overlap_ag:
1600
+ value: true
1601
+ tp_comm_overlap_cfg:
1602
+ value: null
1603
+ tp_comm_overlap_rs:
1604
+ value: true
1605
+ tp_comm_overlap_rs_dgrad:
1606
+ value: false
1607
+ tp_comm_split_ag:
1608
+ value: true
1609
+ tp_comm_split_rs:
1610
+ value: true
1611
+ train_data_path:
1612
+ value: null
1613
+ train_dataloader_shuffle:
1614
+ value: true
1615
+ train_iters:
1616
+ value: 38100
1617
+ train_samples:
1618
+ value: null
1619
+ train_sync_interval:
1620
+ value: null
1621
+ train_type:
1622
+ value: full
1623
+ trainable_parameters:
1624
+ value: []
1625
+ trainable_parameters_regex:
1626
+ value: null
1627
+ transformer_impl:
1628
+ value: transformer_engine
1629
+ transformer_pipeline_model_parallel_size:
1630
+ value: 1
1631
+ truncation_strategy:
1632
+ value: right
1633
+ tuner_backend:
1634
+ value: peft
1635
+ undesirable_weight:
1636
+ value: 1
1637
+ untie_embeddings_and_output_weights:
1638
+ value: false
1639
+ use_chat_template:
1640
+ value: false
1641
+ use_checkpoint_args:
1642
+ value: false
1643
+ use_checkpoint_opt_param_scheduler:
1644
+ value: false
1645
+ use_cpu_initialization:
1646
+ value: null
1647
+ use_dist_ckpt:
1648
+ value: true
1649
+ use_dist_ckpt_deprecated:
1650
+ value: false
1651
+ use_distributed_optimizer:
1652
+ value: true
1653
+ use_flash_attn:
1654
+ value: false
1655
+ use_fused_weighted_squared_relu:
1656
+ value: false
1657
+ use_hf:
1658
+ value: true
1659
+ use_legacy_models:
1660
+ value: false
1661
+ use_megatron_fsdp:
1662
+ value: false
1663
+ use_mp_args_from_checkpoint_args:
1664
+ value: false
1665
+ use_one_sent_docs:
1666
+ value: false
1667
+ use_persistent_ckpt_worker:
1668
+ value: false
1669
+ use_precision_aware_optimizer:
1670
+ value: true
1671
+ use_pytorch_profiler:
1672
+ value: false
1673
+ use_ray:
1674
+ value: false
1675
+ use_ring_exchange_p2p:
1676
+ value: false
1677
+ use_rope_scaling:
1678
+ value: false
1679
+ use_rotary_position_embeddings:
1680
+ value: false
1681
+ use_rslora:
1682
+ value: false
1683
+ use_shared_expert_gate:
1684
+ value: false
1685
+ use_sharp:
1686
+ value: false
1687
+ use_swift_lora:
1688
+ value: false
1689
+ use_tokenizer_model_from_checkpoint_args:
1690
+ value: true
1691
+ use_torch_fsdp2:
1692
+ value: false
1693
+ use_torch_optimizer_for_cpu_offload:
1694
+ value: false
1695
+ use_tp_pp_dp_mapping:
1696
+ value: false
1697
+ use_vllm:
1698
+ value: true
1699
+ v_head_dim:
1700
+ value: 128
1701
+ val_dataset:
1702
+ value: []
1703
+ val_dataset_shuffle:
1704
+ value: false
1705
+ valid_data_path:
1706
+ value: null
1707
+ variable_seq_lengths:
1708
+ value: false
1709
+ virtual_pipeline_model_parallel_size:
1710
+ value: null
1711
+ vision_backbone_type:
1712
+ value: vit
1713
+ vision_pretraining:
1714
+ value: false
1715
+ vision_pretraining_type:
1716
+ value: classify
1717
+ vit_gradient_checkpointing:
1718
+ value: true
1719
+ vit_lr:
1720
+ value: null
1721
+ vllm_disable_cascade_attn:
1722
+ value: false
1723
+ vllm_enable_prefix_caching:
1724
+ value: true
1725
+ vllm_enforce_eager:
1726
+ value: false
1727
+ vllm_engine_kwargs:
1728
+ value: null
1729
+ vllm_gpu_memory_utilization:
1730
+ value: 0.9
1731
+ vllm_limit_mm_per_prompt:
1732
+ value: null
1733
+ vllm_max_model_len:
1734
+ value: null
1735
+ vllm_max_num_seqs:
1736
+ value: null
1737
+ vllm_mm_processor_cache_gb:
1738
+ value: null
1739
+ vllm_mode:
1740
+ value: null
1741
+ vllm_server_base_url:
1742
+ value: null
1743
+ vllm_server_group_port:
1744
+ value: null
1745
+ vllm_server_host:
1746
+ value: null
1747
+ vllm_server_pass_dataset:
1748
+ value: false
1749
+ vllm_server_port:
1750
+ value:
1751
+ - 8000
1752
+ vllm_server_timeout:
1753
+ value: 240
1754
+ vllm_tensor_parallel_size:
1755
+ value: 1
1756
+ vocab_extra_ids:
1757
+ value: 0
1758
+ vocab_file:
1759
+ value: null
1760
+ vocab_size:
1761
+ value: null
1762
+ wandb_exp_name:
1763
+ value: baseline
1764
+ wandb_log_unique_prompts:
1765
+ value: null
1766
+ wandb_project:
1767
+ value: plt
1768
+ wandb_save_dir:
1769
+ value: ""
1770
+ weight_decay:
1771
+ value: 0.1
1772
+ weight_decay_incr_style:
1773
+ value: constant
1774
+ wgrad_deferral_limit:
1775
+ value: 0
1776
+ world_size:
1777
+ value: 8
1778
+ yaml_cfg:
1779
+ value: null
wandb/wandb/run-20251224_034518-gd3q7mjv/files/output.log ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50b97e62e75f4a01467f0076f2e7e3f1f3b3812717d6040c9597cf6049f6a3b4
3
+ size 15045585
wandb/wandb/run-20251224_034518-gd3q7mjv/files/requirements.txt ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pip==25.3
2
+ setuptools==80.9.0
3
+ wheel==0.45.1
4
+ multidict==6.7.0
5
+ cffi==2.0.0
6
+ typing-inspection==0.4.2
7
+ smmap==5.0.2
8
+ sentry-sdk==2.48.0
9
+ pydantic_core==2.41.5
10
+ pydantic_core==2.41.4
11
+ protobuf==6.33.2
12
+ annotated-types==0.7.0
13
+ pydantic==2.12.5
14
+ pydantic==2.12.3
15
+ gitdb==4.0.12
16
+ GitPython==3.1.45
17
+ wandb==0.23.1
18
+ sortedcontainers==2.4.0
19
+ pytz==2025.2
20
+ pydub==0.25.1
21
+ jieba==0.42.1
22
+ crcmod==1.7
23
+ cpm-kernels==1.0.11
24
+ brotli==1.2.0
25
+ antlr4-python3-runtime==4.9.3
26
+ addict==2.4.0
27
+ zstandard==0.25.0
28
+ zipp==3.23.0
29
+ xxhash==3.6.0
30
+ Werkzeug==3.1.4
31
+ websockets==15.0.1
32
+ uvicorn==0.40.0
33
+ tzdata==2025.3
34
+ tomlkit==0.13.3
35
+ tensorboard-data-server==0.7.2
36
+ sniffio==1.3.1
37
+ simplejson==3.20.2
38
+ semantic-version==2.10.0
39
+ scipy==1.16.3
40
+ safetensors==0.7.0
41
+ ruff==0.14.10
42
+ rouge==1.0.1
43
+ regex==2025.11.3
44
+ python-multipart==0.0.21
45
+ pyparsing==3.3.1
46
+ pycryptodome==3.23.0
47
+ pycparser==2.23
48
+ pyarrow==22.0.0
49
+ propcache==0.4.1
50
+ mdurl==0.1.2
51
+ pillow==11.3.0
52
+ orjson==3.11.5
53
+ omegaconf==2.3.0
54
+ Markdown==3.10
55
+ kiwisolver==1.4.9
56
+ json_repair==0.54.3
57
+ joblib==1.5.3
58
+ jmespath==0.10.0
59
+ jiter==0.12.0
60
+ grpcio==1.76.0
61
+ groovy==0.1.2
62
+ future==1.0.0
63
+ trl==0.24.0
64
+ fsspec==2025.3.0
65
+ frozenlist==1.8.0
66
+ fonttools==4.61.1
67
+ ffmpy==1.0.0
68
+ einops==0.8.1
69
+ distro==1.9.0
70
+ dill==0.3.8
71
+ dacite==1.9.2
72
+ cycler==0.12.1
73
+ contourpy==1.3.3
74
+ attrs==25.4.0
75
+ attrdict==2.0.1
76
+ annotated-doc==0.0.4
77
+ aiohappyeyeballs==2.6.1
78
+ aiofiles==24.1.0
79
+ absl-py==2.3.1
80
+ yarl==1.22.0
81
+ tiktoken==0.12.0
82
+ tensorboard==2.20.0
83
+ starlette==0.50.0
84
+ pandas==2.3.3
85
+ nltk==3.9.2
86
+ multiprocess==0.70.16
87
+ modelscope==1.33.0
88
+ matplotlib==3.10.8
89
+ markdown-it-py==4.0.0
90
+ importlib_metadata==8.7.1
91
+ huggingface-hub==0.36.0
92
+ binpacking==1.5.2
93
+ aiosignal==1.4.0
94
+ tokenizers==0.22.1
95
+ safehttpx==0.1.7
96
+ rich==14.2.0
97
+ openai==2.14.0
98
+ gradio_client==1.14.0
99
+ fastapi==0.127.0
100
+ cryptography==46.0.3
101
+ aiohttp==3.13.2
102
+ typer==0.20.1
103
+ transformers==4.57.3
104
+ aliyun-python-sdk-core==2.16.0
105
+ accelerate==1.12.0
106
+ transformers-stream-generator==0.0.5
107
+ peft==0.18.0
108
+ gradio==5.50.0
109
+ datasets==3.6.0
110
+ aliyun-python-sdk-kms==2.16.5
111
+ oss2==2.19.1
112
+ ms_swift==3.12.0.dev0
113
+ liger_kernel==0.6.4
114
+ hf_transfer==0.1.9
115
+ pybind11==3.0.1
116
+ transformer_engine==2.10.0
117
+ ml_dtypes==0.5.4
118
+ onnx==1.20.0
119
+ transformer_engine_cu12==2.10.0
120
+ onnx-ir==0.1.13
121
+ onnxscript==0.5.7
122
+ transformer_engine_torch==2.10.0
123
+ apex==0.1
124
+ numpy==1.26.4
125
+ megatron-core==0.15.0
126
+ flash_attn==2.8.3
127
+ charset-normalizer==3.4.4
128
+ Jinja2==3.1.6
129
+ MarkupSafe==3.0.3
130
+ mpmath==1.3.0
131
+ networkx==3.6.1
132
+ nvidia-cublas-cu12==12.8.4.1
133
+ nvidia-cuda-cupti-cu12==12.8.90
134
+ nvidia-cuda-nvrtc-cu12==12.8.93
135
+ nvidia-cuda-runtime-cu12==12.8.90
136
+ nvidia-cudnn-cu12==9.10.2.21
137
+ nvidia-cufft-cu12==11.3.3.83
138
+ nvidia-cufile-cu12==1.13.1.3
139
+ nvidia-curand-cu12==10.3.9.90
140
+ nvidia-cusolver-cu12==11.7.3.90
141
+ nvidia-cusparse-cu12==12.5.8.93
142
+ nvidia-cusparselt-cu12==0.7.1
143
+ nvidia-nccl-cu12==2.27.5
144
+ nvidia-nvjitlink-cu12==12.8.93
145
+ nvidia-nvshmem-cu12==3.3.20
146
+ nvidia-nvtx-cu12==12.8.90
147
+ requests==2.32.5
148
+ sentencepiece==0.2.1
149
+ sympy==1.14.0
150
+ torch==2.9.1+cu128
151
+ torchaudio==2.9.1+cu128
152
+ torchcodec==0.9.1
153
+ torchdata==0.10.0
154
+ torchtext==0.6.0
155
+ torchvision==0.24.1+cu128
156
+ triton==3.5.1
157
+ urllib3==2.6.2
158
+ anyio==4.12.0
159
+ asttokens==3.0.1
160
+ certifi==2025.11.12
161
+ click==8.3.1
162
+ comm==0.2.3
163
+ debugpy==1.8.18
164
+ decorator==5.2.1
165
+ executing==2.2.1
166
+ filelock==3.20.0
167
+ h11==0.16.0
168
+ hf-xet==1.2.0
169
+ httpcore==1.0.9
170
+ httpx==0.28.1
171
+ idna==3.11
172
+ ipykernel==7.1.0
173
+ ipython==9.8.0
174
+ ipython_pygments_lexers==1.1.1
175
+ ipywidgets==8.1.8
176
+ jedi==0.19.2
177
+ jupyter_client==8.7.0
178
+ jupyter_core==5.9.1
179
+ jupyterlab_widgets==3.0.16
180
+ matplotlib-inline==0.2.1
181
+ nest-asyncio==1.6.0
182
+ packaging==25.0
183
+ parso==0.8.5
184
+ pexpect==4.9.0
185
+ platformdirs==4.5.1
186
+ prompt_toolkit==3.0.52
187
+ psutil==7.1.3
188
+ ptyprocess==0.7.0
189
+ pure_eval==0.2.3
190
+ Pygments==2.19.2
191
+ python-dateutil==2.9.0.post0
192
+ PyYAML==6.0.3
193
+ pyzmq==27.1.0
194
+ shellingham==1.5.4
195
+ six==1.17.0
196
+ stack-data==0.6.3
197
+ tornado==6.5.3
198
+ tqdm==4.67.1
199
+ traitlets==5.14.3
200
+ typer-slim==0.20.0
201
+ typing_extensions==4.15.0
202
+ wcwidth==0.2.14
203
+ widgetsnbextension==4.0.15
204
+ autocommand==2.2.2
205
+ backports.tarfile==1.2.0
206
+ importlib_metadata==8.0.0
207
+ inflect==7.3.1
208
+ jaraco.collections==5.1.0
209
+ jaraco.context==5.3.0
210
+ jaraco.functools==4.0.1
211
+ jaraco.text==3.12.1
212
+ more-itertools==10.3.0
213
+ packaging==24.2
214
+ platformdirs==4.2.2
215
+ tomli==2.0.1
216
+ typeguard==4.3.0
217
+ typing_extensions==4.12.2
218
+ wheel==0.45.1
219
+ zipp==3.19.2
wandb/wandb/run-20251224_034518-gd3q7mjv/files/wandb-metadata.json ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-6.8.0-58-generic-x86_64-with-glibc2.39",
3
+ "python": "CPython 3.12.12",
4
+ "startedAt": "2025-12-24T03:45:18.795219Z",
5
+ "args": [
6
+ "--seed",
7
+ "42",
8
+ "--micro-batch-size",
9
+ "4",
10
+ "--global-batch-size",
11
+ "256",
12
+ "--recompute-granularity",
13
+ "full",
14
+ "--recompute-method",
15
+ "uniform",
16
+ "--recompute-num-layers",
17
+ "1",
18
+ "--recompute-modules",
19
+ "core_attn",
20
+ "--train-iters",
21
+ "38100",
22
+ "--log-interval",
23
+ "1",
24
+ "--tensorboard-dir",
25
+ "/workspace/halcyon-recipe2/megatron_output/qwen3-baseline/v0-20251224-033709/runs",
26
+ "--cross-entropy-loss-fusion",
27
+ "--cross-entropy-fusion-impl",
28
+ "native",
29
+ "--calculate-per-token-loss",
30
+ "--attention-backend",
31
+ "flash",
32
+ "--optimizer",
33
+ "adam",
34
+ "--optimizer-offload-fraction",
35
+ "1.0",
36
+ "--use-precision-aware-optimizer",
37
+ "--main-grads-dtype",
38
+ "fp32",
39
+ "--main-params-dtype",
40
+ "fp32",
41
+ "--exp-avg-dtype",
42
+ "fp32",
43
+ "--exp-avg-sq-dtype",
44
+ "fp32",
45
+ "--dataloader-type",
46
+ "cyclic",
47
+ "--manual-gc-interval",
48
+ "0",
49
+ "--lr",
50
+ "0.0001",
51
+ "--lr-decay-style",
52
+ "cosine",
53
+ "--lr-warmup-iters",
54
+ "0",
55
+ "--lr-warmup-fraction",
56
+ "0.05",
57
+ "--min-lr",
58
+ "3e-06",
59
+ "--weight-decay",
60
+ "0.1",
61
+ "--clip-grad",
62
+ "1.0",
63
+ "--adam-beta1",
64
+ "0.9",
65
+ "--adam-beta2",
66
+ "0.95",
67
+ "--adam-eps",
68
+ "1e-08",
69
+ "--sgd-momentum",
70
+ "0.9",
71
+ "--save",
72
+ "/workspace/halcyon-recipe2/megatron_output/qwen3-baseline/v0-20251224-033709",
73
+ "--save-interval",
74
+ "100",
75
+ "--load",
76
+ "/workspace/halcyon-recipe2/Qwen3-0.6B-Base-mcore",
77
+ "--finetune",
78
+ "--ckpt-format",
79
+ "torch_dist",
80
+ "--no-initialization",
81
+ "--auto-detect-ckpt-format",
82
+ "--exit-on-missing-checkpoint",
83
+ "--distributed-backend",
84
+ "nccl",
85
+ "--local-rank",
86
+ "7",
87
+ "--use-distributed-optimizer",
88
+ "--tensor-model-parallel-size",
89
+ "1",
90
+ "--pipeline-model-parallel-size",
91
+ "1",
92
+ "--context-parallel-size",
93
+ "1",
94
+ "--overlap-grad-reduce",
95
+ "--overlap-param-gather",
96
+ "--distributed-timeout-minutes",
97
+ "300000",
98
+ "--num-layers",
99
+ "28",
100
+ "--hidden-size",
101
+ "1024",
102
+ "--ffn-hidden-size",
103
+ "3072",
104
+ "--num-attention-heads",
105
+ "16",
106
+ "--group-query-attention",
107
+ "--num-query-groups",
108
+ "8",
109
+ "--max-position-embeddings",
110
+ "32768",
111
+ "--position-embedding-type",
112
+ "rope",
113
+ "--rotary-base",
114
+ "1000000",
115
+ "--rotary-percent",
116
+ "1.0",
117
+ "--normalization",
118
+ "RMSNorm",
119
+ "--norm-epsilon",
120
+ "1e-06",
121
+ "--swiglu",
122
+ "--disable-bias-linear",
123
+ "--attention-dropout",
124
+ "0.0",
125
+ "--hidden-dropout",
126
+ "0.0",
127
+ "--kv-channels",
128
+ "128",
129
+ "--qk-layernorm",
130
+ "--transformer-impl",
131
+ "transformer_engine",
132
+ "--moe-layer-freq",
133
+ "1",
134
+ "--moe-router-topk",
135
+ "2",
136
+ "--moe-router-dtype",
137
+ "fp32",
138
+ "--moe-router-score-function",
139
+ "softmax",
140
+ "--moe-router-load-balancing-type",
141
+ "aux_loss",
142
+ "--expert-model-parallel-size",
143
+ "1",
144
+ "--expert-tensor-parallel-size",
145
+ "1",
146
+ "--moe-token-dispatcher-type",
147
+ "alltoall",
148
+ "--moe-grouped-gemm",
149
+ "--moe-aux-loss-coeff",
150
+ "0.0",
151
+ "--moe-token-drop-policy",
152
+ "probs",
153
+ "--kv-lora-rank",
154
+ "32",
155
+ "--qk-head-dim",
156
+ "128",
157
+ "--qk-pos-emb-head-dim",
158
+ "64",
159
+ "--mtp-loss-scaling-factor",
160
+ "0.1",
161
+ "--fp8-recipe",
162
+ "delayed",
163
+ "--fp8-amax-history-len",
164
+ "1024",
165
+ "--fp8-amax-compute-algo",
166
+ "max",
167
+ "--bf16",
168
+ "--attention-softmax-in-fp32",
169
+ "--tensorboard-log-interval",
170
+ "1",
171
+ "--tensorboard-queue-size",
172
+ "50",
173
+ "--log-timers-to-tensorboard",
174
+ "--log-validation-ppl-to-tensorboard",
175
+ "--log-memory-to-tensorboard",
176
+ "--logging-level",
177
+ "20",
178
+ "--wandb-project",
179
+ "plt",
180
+ "--wandb-exp-name",
181
+ "baseline",
182
+ "--eval-iters",
183
+ "-1",
184
+ "--eval-interval",
185
+ "100",
186
+ "--seq-length",
187
+ "4096",
188
+ "--num-workers",
189
+ "32"
190
+ ],
191
+ "program": "/workspace/halcyon-recipe2/swift/cli/_megatron/pt.py",
192
+ "codePath": "swift/cli/_megatron/pt.py",
193
+ "codePathLocal": "swift/cli/_megatron/pt.py",
194
+ "git": {
195
+ "remote": "https://github.com/weak-kajuma/halcyon-recipe2.git",
196
+ "commit": "ea7cc214b68fb511dd83bff83a504b7f43053577"
197
+ },
198
+ "email": "kazuma826826@gmail.com",
199
+ "root": "/workspace/halcyon-recipe2/megatron_output/qwen3-baseline/v0-20251224-033709/wandb",
200
+ "host": "36fd00e7b21c",
201
+ "executable": "/venv/main/bin/python3.12",
202
+ "cpu_count": 72,
203
+ "cpu_count_logical": 144,
204
+ "gpu": "NVIDIA GeForce RTX 5090",
205
+ "gpu_count": 8,
206
+ "disk": {
207
+ "/": {
208
+ "total": "7669363507200",
209
+ "used": "983051857920"
210
+ }
211
+ },
212
+ "memory": {
213
+ "total": "540643262464"
214
+ },
215
+ "gpu_nvidia": [
216
+ {
217
+ "name": "NVIDIA GeForce RTX 5090",
218
+ "memoryTotal": "34190917632",
219
+ "cudaCores": 21760,
220
+ "architecture": "Blackwell",
221
+ "uuid": "GPU-5d40e56e-9cf1-0a97-080a-30624a8f6da3"
222
+ },
223
+ {
224
+ "name": "NVIDIA GeForce RTX 5090",
225
+ "memoryTotal": "34190917632",
226
+ "cudaCores": 21760,
227
+ "architecture": "Blackwell",
228
+ "uuid": "GPU-23ca8669-46fc-19eb-348b-e51e591c150d"
229
+ },
230
+ {
231
+ "name": "NVIDIA GeForce RTX 5090",
232
+ "memoryTotal": "34190917632",
233
+ "cudaCores": 21760,
234
+ "architecture": "Blackwell",
235
+ "uuid": "GPU-c4c1ca99-b237-b12b-43fd-7c0b428ed152"
236
+ },
237
+ {
238
+ "name": "NVIDIA GeForce RTX 5090",
239
+ "memoryTotal": "34190917632",
240
+ "cudaCores": 21760,
241
+ "architecture": "Blackwell",
242
+ "uuid": "GPU-d48e64fd-956c-1ce4-4e95-b9d198ba26e9"
243
+ },
244
+ {
245
+ "name": "NVIDIA GeForce RTX 5090",
246
+ "memoryTotal": "34190917632",
247
+ "cudaCores": 21760,
248
+ "architecture": "Blackwell",
249
+ "uuid": "GPU-29d31f97-dff9-6078-7bf6-d8fc65ada1b7"
250
+ },
251
+ {
252
+ "name": "NVIDIA GeForce RTX 5090",
253
+ "memoryTotal": "34190917632",
254
+ "cudaCores": 21760,
255
+ "architecture": "Blackwell",
256
+ "uuid": "GPU-ed004a01-be7c-9fc0-6742-ac7f7a0bea49"
257
+ },
258
+ {
259
+ "name": "NVIDIA GeForce RTX 5090",
260
+ "memoryTotal": "34190917632",
261
+ "cudaCores": 21760,
262
+ "architecture": "Blackwell",
263
+ "uuid": "GPU-56cdc53f-360e-a64f-2cd5-2ba3daaf5a7b"
264
+ },
265
+ {
266
+ "name": "NVIDIA GeForce RTX 5090",
267
+ "memoryTotal": "34190917632",
268
+ "cudaCores": 21760,
269
+ "architecture": "Blackwell",
270
+ "uuid": "GPU-aa4a1a25-49c1-62ec-3a38-070d6c7912ef"
271
+ }
272
+ ],
273
+ "cudaVersion": "13.0",
274
+ "writerId": "5bh5hk313ky3l0v9f9cesb7o1x31upc6"
275
+ }
wandb/wandb/run-20251224_034518-gd3q7mjv/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"iteration-time":4.885399580001831,"grad-norm":0.36394086480140686,"_wandb":{"runtime":186668},"_timestamp":1.766734585640789e+09,"samples vs steps":9753600,"_runtime":186668.708556184,"_step":38100,"lm loss":2.0087497234344482,"learning-rate":3.000000106112566e-06,"batch-size":256,"loss-scale":1}
wandb/wandb/run-20251224_034518-gd3q7mjv/logs/debug-core.log ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-12-24T03:45:18.883988557Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmp_kirugit/port-611253.txt","pid":611253,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2025-12-24T03:45:18.884899822Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":611253}
3
+ {"time":"2025-12-24T03:45:18.884913121Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-611253-617899-2898902707/socket","Net":"unix"}}
4
+ {"time":"2025-12-24T03:45:19.067142394Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2025-12-24T03:45:19.071653716Z","level":"INFO","msg":"handleInformInit: received","streamId":"gd3q7mjv","id":"1(@)"}
6
+ {"time":"2025-12-24T03:45:19.342900184Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"gd3q7mjv","id":"1(@)"}
7
+ {"time":"2025-12-26T07:36:29.518345904Z","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"gd3q7mjv","id":"1(@)"}
8
+ {"time":"2025-12-26T07:36:29.520024321Z","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"gd3q7mjv","id":"1(@)"}
9
+ {"time":"2025-12-26T07:36:43.382214788Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
10
+ {"time":"2025-12-26T07:36:43.382296341Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
11
+ {"time":"2025-12-26T07:36:43.382312754Z","level":"INFO","msg":"server is shutting down"}
12
+ {"time":"2025-12-26T07:36:43.382392286Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
13
+ {"time":"2025-12-26T07:36:43.382512221Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
14
+ {"time":"2025-12-26T07:36:43.382522699Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
15
+ {"time":"2025-12-26T07:36:43.38298197Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-611253-617899-2898902707/socket","Net":"unix"}}
16
+ {"time":"2025-12-26T07:36:43.383080926Z","level":"INFO","msg":"server is closed"}
wandb/wandb/run-20251224_034518-gd3q7mjv/logs/debug-internal.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/wandb/run-20251224_034518-gd3q7mjv/logs/debug.log ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-12-24 03:45:18,797 INFO MainThread:611253 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
2
+ 2025-12-24 03:45:18,797 INFO MainThread:611253 [wandb_setup.py:_flush():80] Configure stats pid to 611253
3
+ 2025-12-24 03:45:18,797 INFO MainThread:611253 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings
4
+ 2025-12-24 03:45:18,797 INFO MainThread:611253 [wandb_setup.py:_flush():80] Loading settings from /workspace/halcyon-recipe2/wandb/settings
5
+ 2025-12-24 03:45:18,797 INFO MainThread:611253 [wandb_setup.py:_flush():80] Loading settings from environment variables
6
+ 2025-12-24 03:45:18,798 INFO MainThread:611253 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /workspace/halcyon-recipe2/megatron_output/qwen3-baseline/v0-20251224-033709/wandb/wandb/run-20251224_034518-gd3q7mjv/logs/debug.log
7
+ 2025-12-24 03:45:18,798 INFO MainThread:611253 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /workspace/halcyon-recipe2/megatron_output/qwen3-baseline/v0-20251224-033709/wandb/wandb/run-20251224_034518-gd3q7mjv/logs/debug-internal.log
8
+ 2025-12-24 03:45:18,798 INFO MainThread:611253 [wandb_init.py:init():841] calling init triggers
9
+ 2025-12-24 03:45:18,798 INFO MainThread:611253 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
10
+ config: {'num_layers': 28, 'encoder_num_layers': 28, 'decoder_num_layers': None, 'hidden_size': 1024, 'ffn_hidden_size': 3072, 'num_attention_heads': 16, 'attention_backend': <AttnBackend.flash: 1>, 'kv_channels': 128, 'group_query_attention': True, 'num_query_groups': 8, 'max_position_embeddings': 32768, 'position_embedding_type': 'rope', 'relative_attention_num_buckets': 32, 'relative_attention_max_distance': 128, 'use_rotary_position_embeddings': False, 'rotary_base': 1000000, 'rotary_percent': 1.0, 'rotary_interleaved': False, 'rotary_seq_len_interpolation_factor': None, 'use_rope_scaling': False, 'rope_scaling_factor': 8.0, 'no_rope_freq': None, 'add_position_embedding': True, 'mrope_section': None, 'make_vocab_size_divisible_by': 128, 'normalization': 'RMSNorm', 'norm_epsilon': 1e-06, 'apply_layernorm_1p': False, 'apply_residual_connection_post_layernorm': False, 'openai_gelu': False, 'squared_relu': False, 'swiglu': True, 'onnx_safe': None, 'bert_binary_head': True, 'untie_embeddings_and_output_weights': False, 'multi_latent_attention': False, 'mtp_num_layers': None, 'mtp_loss_scaling_factor': 0.1, 'attention_dropout': 0.0, 'hidden_dropout': 0.0, 'weight_decay': 0.1, 'start_weight_decay': 0.1, 'end_weight_decay': 0.1, 'weight_decay_incr_style': 'constant', 'clip_grad': 1.0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-08, 'sgd_momentum': 0.9, 'micro_batch_size': 4, 'global_batch_size': 256, 'rampup_batch_size': None, 'decrease_batch_size_if_needed': False, 'recompute_granularity': 'full', 'check_for_nan_in_loss_and_grad': True, 'check_for_spiky_loss': False, 'check_for_large_grads': False, 'distribute_saved_activations': False, 'recompute_method': 'uniform', 'recompute_num_layers': 1, 'recompute_modules': ['core_attn'], 'clone_scatter_output_in_embedding': True, 'profile': False, 'profile_step_start': 10, 'profile_step_end': 12, 'iterations_to_skip': [], 'result_rejected_tracker_filename': None, 'enable_gloo_process_groups': True, 'use_pytorch_profiler': False, 'profile_ranks': [0], 'record_memory_history': False, 'memory_snapshot_path': 'snapshot.pickle', 'tp_comm_overlap': False, 'tp_comm_overlap_cfg': None, 'tp_comm_overlap_ag': True, 'tp_comm_overlap_rs': True, 'tp_comm_overlap_rs_dgrad': False, 'tp_comm_bulk_dgrad': True, 'tp_comm_bulk_wgrad': True, 'tp_comm_bootstrap_backend': 'nccl', 'use_cpu_initialization': None, 'empty_unused_memory_level': 0, 'deterministic_mode': False, 'check_weight_hash_across_dp_replicas_interval': None, 'calculate_per_token_loss': True, 'train_sync_interval': None, 'train_iters': 38100, 'train_samples': None, 'log_interval': 1, 'exit_interval': None, 'exit_duration_in_mins': None, 'exit_signal_handler': False, 'tensorboard_dir': '/workspace/halcyon-recipe2/megatron_output/qwen3-baseline/v0-20251224-033709/runs', 'masked_softmax_fusion': True, 'bias_gelu_fusion': False, 'bias_swiglu_fusion': True, 'use_fused_weighted_squared_relu': False, 'bias_dropout_fusion': True, 'apply_rope_fusion': True, 'rope_type': None, 'cross_entropy_loss_fusion': True, 'cross_entropy_fusion_impl': 'native', 'use_flash_attn': False, 'add_bias_linear': False, 'add_qkv_bias': False, 'optimizer': 'adam', 'optimizer_cpu_offload': False, 'optimizer_offload_fraction': 1.0, 'use_torch_optimizer_for_cpu_offload': False, 'overlap_cpu_optimizer_d2h_h2d': False, 'pin_cpu_grads': True, 'pin_cpu_params': True, 'dataloader_type': 'cyclic', 'async_tensor_model_parallel_allreduce': True, 'no_persist_layer_norm': False, 'sequence_parallel': False, 'gradient_accumulation_fusion': True, 'deprecated_use_mcore_models': False, 'use_legacy_models': False, 'manual_gc': False, 'manual_gc_interval': 0, 'manual_gc_eval': True, 'tp_comm_split_ag': True, 'tp_comm_split_rs': True, 'pipeline_model_parallel_comm_backend': None, 'high_priority_stream_groups': [], 'seed': 42, 'data_parallel_random_init': False, 'init_method_std': 0.02, 'embedding_init_method_std': None, 'init_method_xavier_uniform': False, 'lr': 0.0001, 'lr_decay_style': 'cosine', 'lr_wsd_decay_style': 'exponential', 'lr_decay_iters': None, 'lr_decay_samples': None, 'lr_wsd_decay_samples': None, 'lr_wsd_decay_iters': None, 'lr_warmup_fraction': 0.05, 'lr_warmup_iters': 0, 'lr_warmup_samples': 0, 'lr_warmup_init': 0.0, 'min_lr': 3e-06, 'override_opt_param_scheduler': False, 'use_checkpoint_opt_param_scheduler': False, 'decoupled_lr': None, 'decoupled_min_lr': None, 'save': '/workspace/halcyon-recipe2/megatron_output/qwen3-baseline/v0-20251224-033709', 'save_interval': 100, 'save_retain_interval': None, 'no_save_optim': None, 'no_save_rng': None, 'load': '/workspace/halcyon-recipe2/Qwen3-0.6B-Base-mcore', 'no_load_optim': None, 'load_main_params_from_ckpt': None, 'no_load_rng': None, 'strict_fsdp_dtensor_load': True, 'non_persistent_save_interval': None, 'non_persistent_ckpt_type': None, 'non_persistent_global_ckpt_dir': None, 'non_persistent_local_ckpt_dir': None, 'non_persistent_local_ckpt_algo': 'fully_parallel', 'finetune': True, 'pretrained_checkpoint': None, 'ckpt_step': None, 'perform_initialization': False, 'use_checkpoint_args': False, 'use_mp_args_from_checkpoint_args': False, 'use_tokenizer_model_from_checkpoint_args': True, 'exit_on_missing_checkpoint': True, 'use_dist_ckpt_deprecated': False, 'use_persistent_ckpt_worker': False, 'auto_detect_ckpt_format': True, 'dist_ckpt_format_deprecated': None, 'ckpt_format': 'torch_dist', 'ckpt_convert_format': None, 'ckpt_convert_save': None, 'ckpt_convert_update_legacy_dist_opt_format': False, 'ckpt_fully_parallel_save_deprecated': False, 'ckpt_fully_parallel_save': True, 'async_save': None, 'ckpt_fully_parallel_load': False, 'ckpt_assume_constant_structure': False, 'dist_ckpt_strictness': 'assume_ok_unexpected', 'load_model_opt_format': False, 'fp16': False, 'bf16': True, 'grad_reduce_in_bf16': False, 'loss_scale': None, 'initial_loss_scale': 4294967296, 'min_loss_scale': 1.0, 'loss_scale_window': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'apply_query_key_layer_scaling': False, 'attention_softmax_in_fp32': True, 'accumulate_allreduce_grads_in_fp32': True, 'fp16_lm_cross_entropy': False, 'disable_bf16_reduced_precision_matmul': False, 'reuse_grad_buf_for_mxfp8_param_ag': False, 'tensor_model_parallel_size': 1, 'pipeline_model_parallel_size': 1, 'decoder_first_pipeline_num_layers': None, 'decoder_last_pipeline_num_layers': None, 'pipeline_model_parallel_layout': None, 'num_layers_per_virtual_pipeline_stage': None, 'num_virtual_stages_per_pipeline_rank': None, 'microbatch_group_size_per_vp_stage': None, 'overlap_p2p_comm': False, 'overlap_p2p_comm_warmup_flush': False, 'distributed_backend': 'nccl', 'distributed_timeout_minutes': 300000, 'overlap_grad_reduce': True, 'defer_embedding_wgrad_compute': False, 'wgrad_deferral_limit': 0, 'align_grad_reduce': True, 'ddp_num_buckets': None, 'ddp_bucket_size': None, 'ddp_pad_buckets_for_high_nccl_busbw': False, 'ddp_average_in_collective': False, 'overlap_param_gather': True, 'overlap_param_gather_with_optimizer_step': False, 'align_param_gather': False, 'scatter_gather_tensors_in_pipeline': True, 'use_ring_exchange_p2p': False, 'local_rank': 7, 'lazy_mpu_init': None, 'account_for_embedding_in_pipeline_split': False, 'account_for_loss_in_pipeline_split': False, 'use_distributed_optimizer': True, 'nccl_ub': False, 'use_sharp': False, 'sharp_enabled_group': None, 'use_megatron_fsdp': False, 'init_model_with_meta_device': False, 'data_parallel_sharding_strategy': 'no_shard', 'gradient_reduce_div_fusion': True, 'fsdp_double_buffer': False, 'suggested_communication_unit_size': None, 'keep_fp8_transpose_cache': False, 'enable_full_sharding_in_hsdp': False, 'num_distributed_optimizer_instances': 1, 'use_torch_fsdp2': False, 'torch_fsdp2_reshard_after_forward': True, 'context_parallel_size': 1, 'cp_comm_type': ['p2p'], 'hierarchical_context_parallel_sizes': None, 'nccl_communicator_config_path': None, 'use_tp_pp_dp_mapping': False, 'replication': False, 'replication_jump': None, 'replication_factor': 2, 'full_validation': False, 'multiple_validation_sets': False, 'eval_iters': -1, 'eval_interval': 100, 'test_mode': False, 'skip_train': False, 'data_path': None, 'split': None, 'train_data_path': None, 'valid_data_path': None, 'test_data_path': None, 'data_args_path': None, 'per_split_data_args_path': None, 'data_cache_path': None, 'mmap_bin_files': True, 'mock_data': False, 'seq_length': 4096, 'encoder_seq_length': 4096, 'decoder_seq_length': None, 'retriever_seq_length': 256, 'sample_rate': 1.0, 'mask_prob': 0.15, 'short_seq_prob': 0.1, 'num_workers': 32, 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'create_attention_mask_in_dataloader': True, 'num_dataset_builder_threads': 1, 'object_storage_cache_path': None, 'mid_level_dataset_surplus': 0.005, 'vocab_size': None, 'padded_vocab_size': 151936, 'vocab_file': None, 'merge_file': None, 'vocab_extra_ids': 0, 'tokenizer_type': None, 'tokenizer_model': None, 'tiktoken_pattern': None, 'tiktoken_num_special_tokens': 1000, 'tiktoken_special_tokens': None, 'adlr_autoresume': False, 'adlr_autoresume_interval': 1000, 'ict_head_size': None, 'biencoder_projection_dim': 0, 'biencoder_shared_query_context_model': False, 'ict_load': None, 'bert_load': None, 'titles_data_path': None, 'query_in_block_prob': 0.1, 'use_one_sent_docs': False, 'evidence_data_path': None, 'retriever_report_topk_accuracies': [], 'retriever_score_scaling': False, 'block_data_path': None, 'embedding_path': None, 'indexer_batch_size': 128, 'indexer_log_interval': 1000, 'num_classes': 1000, 'img_h': 224, 'img_w': 224, 'num_channels': 3, 'patch_dim': 16, 'classes_fraction': 1.0, 'data_per_class_fraction': 1.0, 'data_sharding': True, 'head_lr_mult': 1.0, 'vision_pretraining': False, 'vision_pretraining_type': 'classify', 'vision_backbone_type': 'vit', 'swin_backbone_type': 'tiny', 'mask_type': 'random', 'mask_factor': 1.0, 'iter_per_epoch': 1250, 'dino_local_img_size': 96, 'dino_local_crops_number': 10, 'dino_head_hidden_size': 2048, 'dino_bottleneck_size': 256, 'dino_freeze_last_layer': 1, 'dino_norm_last_layer': False, 'dino_warmup_teacher_temp': 0.04, 'dino_teacher_temp': 0.07, 'dino_warmup_teacher_temp_epochs': 30, 'qk_layernorm': True, 'qk_l2_norm': False, 'expert_model_parallel_size': 1, 'expert_tensor_parallel_size': 1, 'num_experts': None, 'moe_layer_freq': 1, 'moe_ffn_hidden_size': None, 'moe_shared_expert_intermediate_size': None, 'moe_shared_expert_overlap': False, 'moe_grouped_gemm': True, 'moe_use_legacy_grouped_gemm': False, 'moe_layer_recompute': False, 'moe_extended_tp': False, 'moe_use_upcycling': False, 'moe_router_load_balancing_type': 'aux_loss', 'moe_router_dtype': 'fp32', 'moe_router_fusion': False, 'moe_router_score_function': 'softmax', 'moe_router_topk': 2, 'moe_router_pre_softmax': False, 'moe_router_num_groups': None, 'moe_router_group_topk': None, 'moe_router_topk_scaling_factor': None, 'moe_router_enable_expert_bias': False, 'moe_router_bias_update_rate': 0.001, 'moe_router_force_load_balancing': False, 'moe_router_padding_for_fp8': False, 'moe_aux_loss_coeff': 0.0, 'moe_z_loss_coeff': None, 'moe_input_jitter_eps': None, 'moe_per_layer_logging': False, 'moe_token_dispatcher_type': 'alltoall', 'moe_enable_deepep': False, 'moe_deepep_num_sms': 20, 'moe_permute_fusion': False, 'moe_expert_capacity_factor': None, 'moe_pad_expert_input_to_capacity': False, 'moe_token_drop_policy': 'probs', 'moe_apply_probs_on_input': False, 'overlap_moe_expert_parallel_comm': False, 'delay_wgrad_compute': False, 'moe_upcycling_granularity': 1, 'q_lora_rank': None, 'kv_lora_rank': 32, 'qk_head_dim': 128, 'qk_pos_emb_head_dim': 64, 'v_head_dim': 128, 'rotary_scaling_factor': 1.0, 'mscale': 1.0, 'mscale_all_dim': 0.0, 'cache_mla_latents': False, 'heterogeneous_layers_config_path': None, 'heterogeneous_layers_config_encoded_json': None, 'log_params_norm': False, 'log_num_zeros_in_grad': False, 'log_throughput': False, 'log_progress': False, 'timing_log_level': 0, 'log_energy': False, 'barrier_with_L1_time': True, 'timing_log_option': 'minmax', 'tensorboard_log_interval': 1, 'tensorboard_queue_size': 50, 'log_timers_to_tensorboard': True, 'log_loss_scale_to_tensorboard': True, 'log_validation_ppl_to_tensorboard': True, 'log_memory_to_tensorboard': True, 'log_world_size_to_tensorboard': False, 'wandb_project': 'plt', 'wandb_exp_name': 'baseline', 'wandb_save_dir': '', 'logging_level': 20, 'log_straggler': False, 'disable_straggler_on_startup': False, 'straggler_ctrlr_port': 65535, 'straggler_minmax_count': 1, 'run_workload_inspector_server': False, 'inference_batch_times_seqlen_threshold': -1, 'max_tokens_to_oom': 12000, 'output_bert_embeddings': False, 'bert_embedder_type': 'megatron', 'flash_decode': False, 'enable_cuda_graph': False, 'cuda_graph_warmup_steps': 3, 'external_cuda_graph': False, 'cuda_graph_scope': 'full', 'inference_max_batch_size': 8, 'inference_max_seq_length': 2560, 'inference_dynamic_batching': False, 'inference_dynamic_batching_buffer_size_gb': 40.0, 'inference_dynamic_batching_chunk_size': 256, 'inference_dynamic_batching_buffer_guaranteed_fraction': 0.2, 'inference_dynamic_batching_buffer_overflow_factor': None, 'inference_dynamic_batching_max_requests_override': None, 'inference_dynamic_batching_max_tokens_override': None, 'inference_dynamic_batching_num_cuda_graphs': 16, 'symmetric_ar_type': None, 'nccl_all_reduce_for_prefill': False, 'mlp_chunks_for_prefill': 1, 'fp8': None, 'fp8_recipe': 'delayed', 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1024, 'fp8_amax_compute_algo': 'max', 'fp8_wgrad': True, 'transformer_impl': 'transformer_engine', 'fp8_param_gather': False, 'first_last_layers_bf16': False, 'num_layers_at_start_in_bf16': 1, 'num_layers_at_end_in_bf16': 1, 'te_rng_tracker': False, 'inference_rng_tracker': False, 'retro_project_dir': None, 'retro_add_retriever': False, 'retro_cyclic_train_iters': None, 'retro_encoder_layers': 2, 'retro_encoder_hidden_dropout': 0.1, 'retro_encoder_attention_dropout': 0.1, 'retro_num_neighbors': 2, 'retro_num_retrieved_chunks': 2, 'retro_attention_gate': 1, 'retro_verify_neighbor_count': True, 'enable_experimental': False, 'spec': None, 'hybrid_attention_ratio': 0.0, 'hybrid_mlp_ratio': 0.0, 'hybrid_override_pattern': None, 'mamba_state_dim': 128, 'mamba_head_dim': 64, 'mamba_num_groups': 8, 'mamba_num_heads': None, 'is_hybrid_model': False, 'disable_mamba_mem_eff_path': False, 'yaml_cfg': None, 'use_precision_aware_optimizer': True, 'main_grads_dtype': torch.float32, 'main_params_dtype': torch.float32, 'exp_avg_dtype': torch.float32, 'exp_avg_sq_dtype': torch.float32, 'enable_one_logger': True, 'one_logger_project': 'megatron-lm', 'one_logger_run_name': None, 'one_logger_async': False, 'app_tag_run_name': None, 'app_tag_run_version': '0.0.0', 'inprocess_restart': False, 'inprocess_max_iterations': None, 'inprocess_monitor_thread_interval': 1.0, 'inprocess_monitor_process_interval': 1.0, 'inprocess_progress_watchdog_interval': 1.0, 'inprocess_heartbeat_interval': 30, 'inprocess_soft_timeout': 60, 'inprocess_hard_timeout': 90, 'inprocess_heartbeat_timeout': 60, 'inprocess_barrier_timeout': 120, 'inprocess_completion_timeout': 120, 'inprocess_last_call_wait': 1, 'inprocess_termination_grace_time': 1, 'inprocess_granularity': 'node', 'inprocess_active_world_size': 8, 'inprocess_empty_cuda_cache': False, 'enable_ft_package': False, 'calc_ft_timeouts': False, 'config_logger_dir': '', 'error_injection_rate': 0, 'error_injection_type': 'transient_error', 'rerun_mode': 'validate_results', 'enable_msc': True, 'kitchen_config_file': None, 'kitchen_recipe_number': None, 'sft': False, 'sft_tokenizer_prompt_format': 'nemotron-h-aligned', 'rank': 7, 'world_size': 8, 'use_dist_ckpt': True, 'transformer_pipeline_model_parallel_size': 1, 'data_parallel_size': 8, 'model_dir': '/workspace/.hf_home/hub/models--Qwen--Qwen3-0.6B-Base/snapshots/da87bfb608c14b7cf20ba1ce41287e8de496c0cd', 'is_multimodal': False, 'hf_model_type': 'qwen3', 'use_ray': False, 'ray_exp_name': None, 'device_groups': None, 'model': 'Qwen/Qwen3-0.6B-Base', 'model_type': 'qwen3', 'model_revision': None, 'task_type': 'causal_lm', 'torch_dtype': torch.bfloat16, 'attn_impl': None, 'new_special_tokens': [], 'num_labels': None, 'problem_type': None, 'rope_scaling': None, 'device_map': None, 'max_memory': {}, 'max_model_len': None, 'local_repo_path': None, 'init_strategy': None, 'template': 'qwen3', 'system': None, 'max_length': 4096, 'truncation_strategy': 'right', 'max_pixels': None, 'agent_template': None, 'norm_bbox': None, 'use_chat_template': False, 'padding_free': True, 'padding_side': 'right', 'sequence_parallel_size': 1, 'response_prefix': None, 'template_backend': 'swift', 'dataset': [], 'val_dataset': [], 'cached_dataset': ['/workspace/full'], 'cached_val_dataset': [], 'split_dataset_ratio': 0.0, 'data_seed': 42, 'dataset_num_proc': 32, 'load_from_cache_file': False, 'dataset_shuffle': True, 'val_dataset_shuffle': False, 'streaming': False, 'interleave_prob': None, 'stopping_strategy': 'first_exhausted', 'shuffle_buffer_size': 1000, 'download_mode': 'reuse_dataset_if_exists', 'columns': {}, 'strict': False, 'remove_unused_columns': True, 'model_name': None, 'model_author': None, 'custom_dataset_info': [], 'quant_method': None, 'quant_bits': None, 'hqq_axis': None, 'bnb_4bit_compute_dtype': torch.bfloat16, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_quant_storage': None, 'max_new_tokens': None, 'temperature': None, 'top_k': 50, 'top_p': 0.9, 'repetition_penalty': 1.0, 'num_beams': 1, 'stream': False, 'stop_words': [], 'logprobs': False, 'top_logprobs': None, 'ckpt_dir': '/workspace/halcyon-recipe2/Qwen3-0.6B-Base-mcore', 'lora_modules': [], 'tuner_backend': 'peft', 'train_type': 'full', 'adapters': [], 'external_plugins': [], 'model_kwargs': {}, 'load_args': False, 'load_data_args': False, 'packing': True, 'packing_length': 4096, 'packing_num_proc': 1, 'lazy_tokenize': False, 'custom_register_path': [], 'use_hf': True, 'hub_token': None, 'ddp_timeout': 18000000, 'ddp_backend': None, 'ignore_args_error': False, 'use_swift_lora': False, 'freeze_llm': False, 'freeze_vit': True, 'freeze_aligner': True, 'freeze_parameters': [], 'freeze_parameters_regex': None, 'freeze_parameters_ratio': 0.0, 'trainable_parameters': [], 'trainable_parameters_regex': None, 'adapter_load': None, 'target_modules': ['all-linear'], 'target_regex': None, 'modules_to_save': [], 'lora_rank': 8, 'lora_alpha': 32, 'lora_dropout': 0.05, 'lora_bias': 'none', 'lora_dtype': None, 'use_rslora': False, 'rlhf_type': None, 'ref_load': None, 'ref_adapter_load': None, 'beta': 0.1, 'rpo_alpha': None, 'reference_free': False, 'label_smoothing': 0.0, 'f_divergence_type': 'reverse_kl', 'loss_type': None, 'desirable_weight': 1.0, 'undesirable_weight': 1.0, 'calculate_KL': None, 'center_rewards_coefficient': None, 'generation_batch_size': None, 'steps_per_generation': None, 'num_generations': 8, 'max_completion_length': 512, 'importance_sampling_level': 'token', 'tau_pos': 1.0, 'tau_neg': 1.05, 'epsilon': 0.2, 'epsilon_high': None, 'delta': None, 'use_vllm': True, 'vllm_mode': None, 'vllm_enable_prefix_caching': True, 'vllm_gpu_memory_utilization': 0.9, 'vllm_tensor_parallel_size': 1, 'vllm_max_model_len': None, 'vllm_enforce_eager': False, 'vllm_limit_mm_per_prompt': None, 'vllm_disable_cascade_attn': False, 'vllm_max_num_seqs': None, 'vllm_mm_processor_cache_gb': None, 'vllm_engine_kwargs': None, 'sleep_level': 0, 'offload_optimizer': False, 'offload_model': False, 'offload_bridge': False, 'vllm_server_base_url': None, 'vllm_server_host': None, 'vllm_server_port': [8000], 'vllm_server_timeout': 240.0, 'vllm_server_group_port': None, 'reward_funcs': [], 'reward_weights': None, 'cosine_min_len_value_wrong': -0.5, 'cosine_max_len_value_wrong': 0.0, 'cosine_min_len_value_correct': 1.0, 'cosine_max_len_value_correct': 0.5, 'cosine_max_len': None, 'repetition_n_grams': 3, 'repetition_max_penalty': -1.0, 'soft_max_length': None, 'soft_cache_length': None, 'dynamic_sample': False, 'max_resample_times': 3, 'overlong_filter': False, 'scale_rewards': 'group', 'advantage_estimator': 'grpo', 'kl_in_reward': False, 'wandb_log_unique_prompts': None, 'log_completions': False, 'rollout_importance_sampling_mode': None, 'rollout_importance_sampling_threshold': 2.0, 'log_rollout_offpolicy_metrics': False, 'off_policy_sequence_mask_delta': None, 'reward_model': None, 'reward_model_plugin': None, 'sync_ref_model': False, 'ref_model_sync_steps': 512, 'ref_model_mixup_alpha': 0.6, 'async_generate': False, 'move_model_batches': None, 'multi_turn_scheduler': None, 'max_turns': None, 'completion_length_limit_scope': 'per_round', 'vllm_server_pass_dataset': False, 'log_entropy': False, 'top_entropy_quantile': 1.0, 'num_iterations': 1, 'check_model': True, 'initialize_embedding': False, 'mlp_padding_free': False, 'load_safetensors': False, 'save_safetensors': False, 'ref_model': None, 'ref_adapters': [], 'merge_lora': False, 'max_shard_size': '5GB', 'train_dataloader_shuffle': True, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': True, 'dataloader_prefetch_factor': 10, 'architectures': 'Qwen3ForCausalLM', 'llm_architectures': 'Qwen3ForCausalLM', 'max_epochs': None, 'enable_dft_loss': False, 'enable_channel_loss': False, 'patch_size': 1, 'save_strategy': 'steps', 'original_max_position_embeddings': None, 'partial_rotary_factor': None, 'use_shared_expert_gate': False, 'vit_gradient_checkpointing': True, 'vit_lr': None, 'aligner_lr': None, 'gradient_checkpointing_kwargs': None, 'linear_num_value_heads': None, 'linear_num_key_heads': None, 'linear_key_head_dim': None, 'linear_value_head_dim': None, 'linear_conv_kernel_dim': None, 'layer_types': None, 'mrope_interleaved': False, 'add_version': True, 'virtual_pipeline_model_parallel_size': None, 'params_dtype': torch.bfloat16, 'consumed_train_samples': 0, 'skipped_train_samples': 0, 'consumed_valid_samples': 0, 'variable_seq_lengths': False, '_wandb': {}}
11
+ 2025-12-24 03:45:18,798 INFO MainThread:611253 [wandb_init.py:init():889] starting backend
12
+ 2025-12-24 03:45:19,067 INFO MainThread:611253 [wandb_init.py:init():892] sending inform_init request
13
+ 2025-12-24 03:45:19,070 INFO MainThread:611253 [wandb_init.py:init():900] backend started and connected
14
+ 2025-12-24 03:45:19,074 INFO MainThread:611253 [wandb_init.py:init():970] updated telemetry
15
+ 2025-12-24 03:45:19,081 INFO MainThread:611253 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
16
+ 2025-12-24 03:45:19,580 INFO MainThread:611253 [wandb_init.py:init():1041] starting run threads in backend
17
+ 2025-12-24 03:45:19,738 INFO MainThread:611253 [wandb_run.py:_console_start():2521] atexit reg
18
+ 2025-12-24 03:45:19,738 INFO MainThread:611253 [wandb_run.py:_redirect():2369] redirect: wrap_raw
19
+ 2025-12-24 03:45:19,738 INFO MainThread:611253 [wandb_run.py:_redirect():2438] Wrapping output streams.
20
+ 2025-12-24 03:45:19,738 INFO MainThread:611253 [wandb_run.py:_redirect():2461] Redirects installed.
21
+ 2025-12-24 03:45:19,742 INFO MainThread:611253 [wandb_init.py:init():1081] run started, returning control to user process
22
+ 2025-12-26 07:36:28,290 INFO MainThread:611253 [wandb_run.py:_finish():2287] finishing run tepic/plt/gd3q7mjv
23
+ 2025-12-26 07:36:28,292 INFO MainThread:611253 [wandb_run.py:_atexit_cleanup():2486] got exitcode: 0
24
+ 2025-12-26 07:36:28,293 INFO MainThread:611253 [wandb_run.py:_restore():2468] restore
25
+ 2025-12-26 07:36:28,293 INFO MainThread:611253 [wandb_run.py:_restore():2474] restore done
26
+ 2025-12-26 07:36:29,517 INFO MainThread:611253 [wandb_run.py:_footer_sync_info():3862] logging synced files
wandb/wandb/run-20251224_034518-gd3q7mjv/run-gd3q7mjv.wandb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee9f0fe4fc8a60cfce51f5a6d3a2c72ba17eaf99be8dfd51ebe7114742980546
3
+ size 86368426