iamPi committed on
Commit 76f8ca1 · verified · 1 parent: e395647

Add files using upload-large-folder tool

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the complete change set.
Files changed (50)
  1. .gitattributes +10 -0
  2. step_5400/config.yaml +207 -0
  3. step_5400/policy/weights/iter_0000000/.metadata +3 -0
  4. step_5400/policy/weights/iter_0000000/__0_0.distcp +3 -0
  5. step_5400/policy/weights/iter_0000000/__1_0.distcp +3 -0
  6. step_5400/policy/weights/iter_0000000/common.pt +3 -0
  7. step_5400/policy/weights/iter_0000000/metadata.json +1 -0
  8. step_5400/policy/weights/iter_0000000/modelopt_run_config.yaml +203 -0
  9. step_5400/policy/weights/iter_0000000/run_config.yaml +564 -0
  10. step_5400/policy/weights/iter_0000000/train_state.pt +3 -0
  11. step_5400/policy/weights/latest_checkpointed_iteration.txt +1 -0
  12. step_5400/policy/weights/latest_train_state.pt +3 -0
  13. step_5400/train_dataloader.pt +3 -0
  14. step_5400/training_info.json +1 -0
  15. step_5600/config.yaml +207 -0
  16. step_5600/policy/weights/iter_0000000/.metadata +3 -0
  17. step_5600/policy/weights/iter_0000000/common.pt +3 -0
  18. step_5600/policy/weights/iter_0000000/metadata.json +1 -0
  19. step_5600/policy/weights/iter_0000000/modelopt_run_config.yaml +203 -0
  20. step_5600/policy/weights/iter_0000000/run_config.yaml +564 -0
  21. step_5600/policy/weights/iter_0000000/train_state.pt +3 -0
  22. step_5600/policy/weights/latest_checkpointed_iteration.txt +1 -0
  23. step_5600/policy/weights/latest_train_state.pt +3 -0
  24. step_5600/train_dataloader.pt +3 -0
  25. step_5600/training_info.json +1 -0
  26. step_5800/config.yaml +207 -0
  27. step_5800/policy/weights/iter_0000000/.metadata +3 -0
  28. step_5800/policy/weights/iter_0000000/__0_0.distcp +3 -0
  29. step_5800/policy/weights/iter_0000000/__1_0.distcp +3 -0
  30. step_5800/policy/weights/iter_0000000/common.pt +3 -0
  31. step_5800/policy/weights/iter_0000000/metadata.json +1 -0
  32. step_5800/policy/weights/iter_0000000/modelopt_run_config.yaml +203 -0
  33. step_5800/policy/weights/iter_0000000/run_config.yaml +564 -0
  34. step_5800/policy/weights/iter_0000000/train_state.pt +3 -0
  35. step_5800/policy/weights/latest_checkpointed_iteration.txt +1 -0
  36. step_5800/policy/weights/latest_train_state.pt +3 -0
  37. step_5800/train_dataloader.pt +3 -0
  38. step_5800/training_info.json +1 -0
  39. step_5972/config.yaml +207 -0
  40. step_5972/policy/weights/iter_0000000/.metadata +3 -0
  41. step_5972/policy/weights/iter_0000000/__0_1.distcp +3 -0
  42. step_5972/policy/weights/iter_0000000/__1_1.distcp +3 -0
  43. step_5972/policy/weights/iter_0000000/common.pt +3 -0
  44. step_5972/policy/weights/iter_0000000/metadata.json +1 -0
  45. step_5972/policy/weights/iter_0000000/modelopt_run_config.yaml +203 -0
  46. step_5972/policy/weights/iter_0000000/run_config.yaml +564 -0
  47. step_5972/policy/weights/iter_0000000/train_state.pt +3 -0
  48. step_5972/policy/weights/latest_checkpointed_iteration.txt +1 -0
  49. step_5972/policy/weights/latest_train_state.pt +3 -0
  50. step_5972/train_dataloader.pt +3 -0
.gitattributes CHANGED
@@ -33,3 +33,13 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ step_5400/policy/weights/iter_0000000/.metadata filter=lfs diff=lfs merge=lfs -text
+ step_5972/policy/weights/iter_0000000/.metadata filter=lfs diff=lfs merge=lfs -text
+ step_5800/policy/weights/iter_0000000/.metadata filter=lfs diff=lfs merge=lfs -text
+ step_5600/policy/weights/iter_0000000/.metadata filter=lfs diff=lfs merge=lfs -text
+ step_5400/policy/weights/iter_0000000/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
+ step_5800/policy/weights/iter_0000000/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
+ step_5972/policy/weights/iter_0000000/__0_1.distcp filter=lfs diff=lfs merge=lfs -text
+ step_5400/policy/weights/iter_0000000/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
+ step_5972/policy/weights/iter_0000000/__1_1.distcp filter=lfs diff=lfs merge=lfs -text
+ step_5800/policy/weights/iter_0000000/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
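These new `.gitattributes` rules route the distributed-checkpoint shards and `.metadata` index files through Git LFS, so the repository stores lightweight pointer stubs instead of multi-gigabyte binaries. A rough sketch of the matching logic follows; note that real gitattributes pattern matching is not identical to Python's `fnmatch` (notably around `**` and directory semantics), so this is only an approximation, and `tracked_by_lfs` is a hypothetical helper:

```python
from fnmatch import fnmatch

# A few patterns copied from the .gitattributes additions above plus two
# of the pre-existing wildcard rules; this list is illustrative only.
lfs_patterns = [
    "step_5400/policy/weights/iter_0000000/.metadata",
    "step_5400/policy/weights/iter_0000000/__0_0.distcp",
    "*.zip",
    "*tfevents*",
]

def tracked_by_lfs(path: str) -> bool:
    # A path goes through LFS if any pattern matches it.
    return any(fnmatch(path, pat) for pat in lfs_patterns)

print(tracked_by_lfs("step_5400/policy/weights/iter_0000000/.metadata"))  # True
print(tracked_by_lfs("step_5400/config.yaml"))                            # False
```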
step_5400/config.yaml ADDED
@@ -0,0 +1,207 @@
+ checkpointing:
+   checkpoint_dir: results/qwen3_4b_sft
+   checkpoint_must_save_by: null
+   enabled: true
+   higher_is_better: false
+   keep_top_k: 3
+   metric_name: val:val_loss
+   save_period: 200
+ cluster:
+   gpus_per_node: 2
+   num_nodes: 1
+ data:
+   num_workers: 4
+   shuffle: true
+   train_dataset_path:
+   - ./data/hones
+   val_dataset_path: ./data/arc2_evaluation6
+ logger:
+   gpu_monitoring:
+     collection_interval: 10
+     flush_interval: 10
+   log_dir: logs/exp_019
+   mlflow_enabled: false
+   monitor_gpus: false
+   swanlab_enabled: false
+   tensorboard_enabled: false
+   wandb:
+     name: qwen3_4b_sft
+     project: arc2
+   wandb_enabled: true
+ policy:
+   activation_checkpointing_enabled: false
+   attn_implementation: flash_attention_2
+   dtensor_cfg:
+     enabled: false
+   dynamic_batching:
+     enabled: false
+   fsdp_offload_enabled: false
+   make_sequence_length_divisible_by: 64
+   max_grad_norm: null
+   megatron_cfg:
+     activation_checkpointing: true
+     apply_rope_fusion: true
+     bias_activation_fusion: false
+     context_parallel_size: 2
+     distributed_data_parallel_config:
+       average_in_collective: true
+       data_parallel_sharding_strategy: optim_grads_params
+       grad_reduce_in_fp32: true
+       overlap_grad_reduce: true
+       overlap_param_gather: true
+     empty_unused_memory_level: 1
+     enabled: true
+     env_vars:
+       AWS_OFI_NCCL_VERSION: 1.14.0
+       BASH_ENV: /etc/bash.bashrc
+       CAL_VERSION: 0.4.4.50
+       CUBLASMP_VERSION: 0.4.0.789
+       CUBLAS_VERSION: 12.9.0.13
+       CUDA_ARCH_LIST: 7.5 8.0 8.6 9.0 10.0 12.0
+       CUDA_DRIVER_VERSION: 575.51.03
+       CUDA_VERSION: 12.9.0.043
+       CUDA_VISIBLE_DEVICES: 6,7
+       CUDNN_FRONTEND_VERSION: 1.11.0
+       CUDNN_VERSION: 9.10.1.4
+       CUFFT_VERSION: 11.4.0.6
+       CUFILE_VERSION: 1.14.0.30
+       CURAND_VERSION: 10.3.10.19
+       CUSOLVER_VERSION: 11.7.4.40
+       CUSPARSELT_VERSION: 0.7.1.0
+       CUSPARSE_VERSION: 12.5.9.5
+       DALI_BUILD: ''
+       DALI_URL_SUFFIX: '120'
+       DALI_VERSION: 1.49.0
+       EFA_VERSION: 1.38.1
+       ENV: /etc/shinit_v2
+       GDRCOPY_VERSION: 2.4.4
+       HOME: /root
+       HOSTNAME: e6ad2ac15863
+       HPCX_VERSION: '2.23'
+       KMP_DUPLICATE_LIB_OK: 'True'
+       KMP_INIT_AT_FORK: 'FALSE'
+       LC_CTYPE: C.UTF-8
+       LD_LIBRARY_PATH: /usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
+       LESSCLOSE: /usr/bin/lesspipe %s %s
+       LESSOPEN: '| /usr/bin/lesspipe %s'
+       LIBRARY_PATH: '/usr/local/cuda/lib64/stubs:'
+       LS_COLORS: 'rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=00:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.avif=01;35:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.webp=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:*~=00;90:*#=00;90:*.bak=00;90:*.crdownload=00;90:*.dpkg-dist=00;90:*.dpkg-new=00;90:*.dpkg-old=00;90:*.dpkg-tmp=00;90:*.old=00;90:*.orig=00;90:*.part=00;90:*.rej=00;90:*.rpmnew=00;90:*.rpmorig=00;90:*.rpmsave=00;90:*.swp=00;90:*.tmp=00;90:*.ucf-dist=00;90:*.ucf-new=00;90:*.ucf-old=00;90:'
+       MODEL_OPT_VERSION: 0.27.1
+       MOFED_VERSION: 5.4-rdmacore50.0
+       NCCL_NET_PLUGIN: aws-ofi
+       NCCL_TUNER_PLUGIN: aws-ofi
+       NCCL_VERSION: 2.26.5
+       NEMO_RL_COMMIT: 90ecaaccdf66fb92f814b2325db81a0105ed933a
+       NEMO_RL_VENV_DIR: /opt/ray_venvs
+       NPP_VERSION: 12.4.0.27
+       NRL_CONTAINER: '1'
+       NRL_MEGATRON_CHECKPOINT_DIR: ./results/megatron
+       NSIGHT_COMPUTE_VERSION: 2025.2.0.11
+       NSIGHT_SYSTEMS_VERSION: 2025.3.1.90
+       NVIDIA_BUILD_ID: '244212578'
+       NVIDIA_BUILD_REF: 6ba29f15920b12095387b8c82cea05ac3d6d9732
+       NVIDIA_DRIVER_CAPABILITIES: compute,utility,video
+       NVIDIA_PRODUCT_NAME: CUDA
+       NVIDIA_REQUIRE_CUDA: cuda>=9.0
+       NVIDIA_REQUIRE_JETPACK_HOST_MOUNTS: ''
+       NVIDIA_VISIBLE_DEVICES: all
+       NVJITLINK_VERSION: 12.9.41
+       NVJPEG_VERSION: 12.4.0.16
+       NVSHMEM_VERSION: 3.2.5
+       OLDPWD: /workspace
+       OMPI_MCA_coll_hcoll_enable: '0'
+       OPAL_PREFIX: /opt/hpcx/ompi
+       OPENMPI_VERSION: 4.1.7
+       OPENUCX_VERSION: 1.19.0
+       PATH: /opt/nemo_rl_venv/bin:/opt/nemo_rl_venv/bin:/opt/nemo_rl_venv/bin:/root/.local/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/mpi/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/ucx/bin:/opt/amazon/efa/bin
+       POLYGRAPHY_VERSION: 0.49.20
+       PWD: /workspace/ARChitects
+       PYTHONBREAKPOINT: ray.util.rpdb._driver_set_trace
+       PYTHONPATH: '/workspace/NVARC/ARChitects/nemo-rl:'
+       PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True
+       RAY_CLIENT_MODE: '0'
+       RAY_ENABLE_UV_RUN_RUNTIME_ENV: '0'
+       RAY_USAGE_STATS_ENABLED: '0'
+       RDMACORE_VERSION: '50.0'
+       SHELL: /bin/bash
+       SHLVL: '2'
+       SWANLAB_API_HOST: https://api.swanlab.cn/api
+       SWANLAB_RUNTIME: user
+       SWANLAB_WEB_HOST: https://swanlab.cn
+       TERM: xterm
+       TORCH_CUDA_ARCH_LIST: '9.0'
+       TRANSFORMER_ENGINE_VERSION: '2.3'
+       TRTOSS_VERSION: ''
+       TRT_VERSION: 10.10.0.31
+       UV: /root/.local/bin/uv
+       UV_LINK_MODE: copy
+       UV_PROJECT_ENVIRONMENT: /opt/nemo_rl_venv
+       UV_RUN_RECURSION_DEPTH: '1'
+       VIRTUAL_ENV: /opt/nemo_rl_venv
+       VIRTUAL_ENV_PROMPT: nemo-rl
+       WANDB_SERVICE: 3-239782-unix-/tmp/wandb-239782-251940-2530276073/socket
+       _: /root/.local/bin/uv
+       _CUDA_COMPAT_PATH: /usr/local/cuda/compat
+       _CUDA_COMPAT_STATUS: System has unsupported display driver / cuda driver combination
+         (CUDA_ERROR_SYSTEM_DRIVER_MISMATCH) cuInit()=803
+       _MLFLOW_TELEMETRY_SESSION_ID: 8ac94e5a44a94a62a2cf3302ff4fa0c9
+     expert_model_parallel_size: 1
+     expert_tensor_parallel_size: 1
+     freeze_moe_router: true
+     moe_permute_fusion: false
+     moe_router_bias_update_rate: 0.0
+     moe_router_dtype: fp64
+     moe_router_load_balancing_type: none
+     num_layers_in_first_pipeline_stage: null
+     num_layers_in_last_pipeline_stage: null
+     optimizer:
+       adam_beta1: 0.9
+       adam_beta2: 0.98
+       adam_eps: 1.0e-08
+       bf16: true
+       clip_grad: 0.5
+       fp16: false
+       lr: 0.0001
+       min_lr: 1.0e-07
+       optimizer: adam
+       optimizer_cpu_offload: false
+       optimizer_offload_fraction: 0.0
+       params_dtype: bfloat16
+       sgd_momentum: 0.9
+       use_distributed_optimizer: true
+       use_precision_aware_optimizer: false
+       weight_decay: 0.1
+     pipeline_dtype: bfloat16
+     pipeline_model_parallel_size: 1
+     scheduler:
+       end_weight_decay: 0.1
+       lr_decay_iters: 12716
+       lr_decay_style: linear
+       lr_warmup_init: 1.0e-06
+       lr_warmup_iters: 200
+       start_weight_decay: 0.1
+       weight_decay_incr_style: constant
+     sequence_parallel: false
+     tensor_model_parallel_size: 1
+     train_iters: 5972
+   model_name: ./models/Qwen-NVARC
+   offload_optimizer_for_logprob: false
+   precision: bfloat16
+   sequence_packing:
+     algorithm: modified_first_fit_decreasing
+     enabled: true
+     sequence_length_round: 64
+     train_mb_tokens: 128000
+   tokenizer:
+     name: ./models/Qwen-NVARC
+   train_global_batch_size: 256
+   train_micro_batch_size: 1
+ sft:
+   max_num_epochs: 1
+   max_num_steps: 6400
+   seed: 24
+   val_at_start: true
+   val_batches: 200
+   val_global_batch_size: 256
+   val_micro_batch_size: 1
+   val_period: 200
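The `checkpointing` and `megatron_cfg` blocks explain the step directories in this commit: with `save_period: 200` and `train_iters: 5972`, checkpoints land on multiples of 200 plus a final save at the last iteration, which is consistent with `step_5400`/`step_5600`/`step_5800`/`step_5972`. A minimal sketch, assuming the repo is checked out locally and PyYAML is available:

```python
import yaml

# Load the per-step config shipped with each checkpoint directory.
with open("step_5400/config.yaml") as f:
    cfg = yaml.safe_load(f)

save_period = cfg["checkpointing"]["save_period"]            # 200
train_iters = cfg["policy"]["megatron_cfg"]["train_iters"]   # 5972

# Periodic saves every `save_period` steps, plus a final save at the last
# iteration (an assumption inferred from the directories in this commit,
# not from reading the trainer's source).
steps = list(range(save_period, train_iters, save_period)) + [train_iters]
print(steps[-4:])  # [5400, 5600, 5800, 5972]
```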
step_5400/policy/weights/iter_0000000/.metadata ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:601958148c0276510ee83ae2c089910f685c2aa6fde4b6f5e668b28ed06ec567
+ size 329201
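Every binary ADDED in this commit appears as a three-line Git LFS pointer stub like the one above (the `oid` is the SHA-256 of the payload, `size` its byte count), not the payload itself. A minimal parser sketch for that format:

```python
# Parse a Git LFS pointer stub into its components.
def parse_lfs_pointer(text: str) -> dict:
    # Each line is "key value"; split on the first space only.
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    assert fields["version"].startswith("https://git-lfs.github.com/spec/")
    algo, digest = fields["oid"].split(":", 1)
    return {"algo": algo, "digest": digest, "size": int(fields["size"])}

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:601958148c0276510ee83ae2c089910f685c2aa6fde4b6f5e668b28ed06ec567
size 329201"""
print(parse_lfs_pointer(pointer))
# {'algo': 'sha256', 'digest': '6019...', 'size': 329201}
```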
step_5400/policy/weights/iter_0000000/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4f4f63a1df595166115fa2fd03a1601a3ae7b6c72151956a0f966332b260176d
+ size 12718332319
step_5400/policy/weights/iter_0000000/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cdbfe2d6c54d823e7ef9c6bdfb156183fa5d437043a001c83847514272046f8b
+ size 12717813616
step_5400/policy/weights/iter_0000000/common.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6cf17a4bbf5fb940ff8d1e669f26a4e277411e9796b4920f5cd867e4401db145
+ size 1767
step_5400/policy/weights/iter_0000000/metadata.json ADDED
@@ -0,0 +1 @@
+ {"sharded_backend": "torch_dist", "sharded_backend_version": 1, "common_backend": "torch", "common_backend_version": 1}
step_5400/policy/weights/iter_0000000/modelopt_run_config.yaml ADDED
@@ -0,0 +1,203 @@
+ activation_func: <function silu at 0x7d0251c6b420>
+ activation_func_clamp_value: None
+ add_bias_linear: false
+ add_qkv_bias: false
+ apply_query_key_layer_scaling: false
+ apply_residual_connection_post_layernorm: false
+ apply_rope_fusion: true
+ attention_backend: AttnBackend.auto
+ attention_dropout: '0.0'
+ attention_output_gate: false
+ attention_softmax_in_fp32: false
+ autocast_dtype: torch.bfloat16
+ barrier_with_L1_time: true
+ bf16: true
+ bias_activation_fusion: false
+ bias_dropout_fusion: false
+ calculate_per_token_loss: true
+ clone_scatter_output_in_embedding: true
+ config_logger_dir: ''
+ cross_entropy_fusion_impl: native
+ cross_entropy_loss_fusion: true
+ defer_embedding_wgrad_compute: false
+ delay_wgrad_compute: false
+ deterministic_mode: false
+ disable_bf16_reduced_precision_matmul: false
+ disable_parameter_transpose_cache: false
+ distribute_saved_activations: None
+ enable_autocast: false
+ fallback_to_eager_attn: false
+ ffn_hidden_size: 9728
+ finalize_model_grads_func: functools.partial(<function finalize_model_grads at 0x7cfab95b9e40>,
+   pg_collection=None)
+ fine_grained_activation_offloading: false
+ first_last_layers_bf16: false
+ flash_decode: false
+ fp16: false
+ fp16_lm_cross_entropy: false
+ fp32_residual_connection: false
+ fused_single_qkv_rope: false
+ gated_linear_unit: true
+ generation_config: None
+ glu_linear_offset: '0.0'
+ grad_scale_func: <bound method MegatronOptimizer.scale_loss of <megatron.core.optimizer.optimizer.ChainedOptimizer
+   object at 0x7cf9d413cd70>>
+ grad_sync_func: "<bound method DistributedDataParallel.start_grad_sync of DistributedDataParallel(\n\
+   \ (module): CustomFloat16Module(\n (module): GPTModel(\n (embedding): LanguageModelEmbedding(\n\
+   \ (word_embeddings): VocabParallelEmbedding()\n (embedding_dropout):\
+   \ Dropout(p=0.0, inplace=False)\n )\n (rotary_pos_emb): RotaryEmbedding()\n\
+   \ (decoder): TransformerBlock(\n (layers): ModuleList(\n (0-35):\
+   \ 36 x TransformerLayer(\n (input_layernorm): IdentityOp()\n \
+   \ (self_attention): SelfAttention(\n (core_attention): TEDotProductAttention(\n\
+   \ (flash_attention): FlashAttention()\n (fused_attention):\
+   \ FusedAttention()\n (unfused_attention): UnfusedDotProductAttention(\n\
+   \ (scale_mask_softmax): FusedScaleMaskSoftmax()\n \
+   \ (attention_dropout): Dropout(p=0.0, inplace=False)\n )\n \
+   \ )\n (linear_proj): TERowParallelLinear(in_features=4096,\
+   \ out_features=2560, bias=False, TP=1)\n (linear_qkv): TELayerNormColumnParallelLinear(in_features=2560,\
+   \ out_features=6144, bias=False, TP=1)\n (q_layernorm): RMSNorm()\n\
+   \ (k_layernorm): RMSNorm()\n )\n (pre_cross_attn_layernorm):\
+   \ IdentityOp()\n (cross_attention): IdentityOp()\n (cross_attn_bda):\
+   \ IdentityFuncOp()\n (pre_mlp_layernorm): IdentityOp()\n (mlp):\
+   \ MLP(\n (linear_fc1): TELayerNormColumnParallelLinear(in_features=2560,\
+   \ out_features=19456, bias=False, TP=1)\n (linear_fc2): TERowParallelLinear(in_features=9728,\
+   \ out_features=2560, bias=False, TP=1)\n )\n )\n )\n\
+   \ (final_layernorm): RMSNorm()\n )\n (output_layer): ColumnParallelLinear(in_features=2560,\
+   \ out_features=16, bias=False, TP=1)\n )\n )\n)>"
+ gradient_accumulation_fusion: false
+ hetereogenous_dist_checkpoint: false
+ heterogeneous_block_specs: false
+ hf_model_id: ./models/Qwen-NVARC
+ hidden_dropout: '0.0'
+ hidden_size: 2560
+ is_hybrid_model: false
+ kv_channels: 128
+ layernorm_epsilon: 1e-06
+ layernorm_zero_centered_gamma: false
+ linear_attention_freq: None
+ linear_attention_type: None
+ linear_conv_kernel_dim: None
+ linear_key_head_dim: None
+ linear_num_key_heads: None
+ linear_num_value_heads: None
+ linear_value_head_dim: None
+ log_max_attention_logit: false
+ make_vocab_size_divisible_by: 16
+ mamba_head_dim: 64
+ mamba_num_groups: 8
+ mamba_num_heads: None
+ mamba_state_dim: 128
+ masked_softmax_fusion: true
+ max_position_embeddings: 40960
+ memory_efficient_layer_norm: false
+ min_offloaded_tensor_size: 1048576
+ mlp_chunks_for_prefill: 1
+ moe_apply_probs_on_input: false
+ moe_aux_loss_coeff: '0.0'
+ moe_deepep_num_sms: 20
+ moe_enable_deepep: false
+ moe_expert_capacity_factor: None
+ moe_extended_tp: false
+ moe_ffn_hidden_size: None
+ moe_flex_dispatcher_backend: deepep
+ moe_grouped_gemm: false
+ moe_hybridep_num_sms: 16
+ moe_input_jitter_eps: None
+ moe_layer_freq: 1
+ moe_pad_expert_input_to_capacity: false
+ moe_per_layer_logging: false
+ moe_permute_fusion: false
+ moe_router_bias_update_rate: '0.0'
+ moe_router_dtype: fp64
+ moe_router_enable_expert_bias: false
+ moe_router_force_load_balancing: false
+ moe_router_fusion: false
+ moe_router_group_topk: None
+ moe_router_load_balancing_type: none
+ moe_router_num_groups: None
+ moe_router_padding_for_quantization: false
+ moe_router_pre_softmax: false
+ moe_router_score_function: softmax
+ moe_router_topk: 2
+ moe_router_topk_limited_devices: None
+ moe_router_topk_scaling_factor: None
+ moe_shared_expert_gate: false
+ moe_shared_expert_intermediate_size: None
+ moe_shared_expert_overlap: false
+ moe_token_dispatcher_type: allgather
+ moe_token_drop_policy: probs
+ moe_token_dropping: false
+ moe_use_legacy_grouped_gemm: false
+ moe_z_loss_coeff: None
+ mrope_section: None
+ multi_latent_attention: false
+ no_rope_freq: None
+ no_sync_func: "<bound method DistributedDataParallel.no_sync of DistributedDataParallel(\n\
+   \ (module): CustomFloat16Module(\n (module): GPTModel(\n (embedding): LanguageModelEmbedding(\n\
+   \ (word_embeddings): VocabParallelEmbedding()\n (embedding_dropout):\
+   \ Dropout(p=0.0, inplace=False)\n )\n (rotary_pos_emb): RotaryEmbedding()\n\
+   \ (decoder): TransformerBlock(\n (layers): ModuleList(\n (0-35):\
+   \ 36 x TransformerLayer(\n (input_layernorm): IdentityOp()\n \
+   \ (self_attention): SelfAttention(\n (core_attention): TEDotProductAttention(\n\
+   \ (flash_attention): FlashAttention()\n (fused_attention):\
+   \ FusedAttention()\n (unfused_attention): UnfusedDotProductAttention(\n\
+   \ (scale_mask_softmax): FusedScaleMaskSoftmax()\n \
+   \ (attention_dropout): Dropout(p=0.0, inplace=False)\n )\n \
+   \ )\n (linear_proj): TERowParallelLinear(in_features=4096,\
+   \ out_features=2560, bias=False, TP=1)\n (linear_qkv): TELayerNormColumnParallelLinear(in_features=2560,\
+   \ out_features=6144, bias=False, TP=1)\n (q_layernorm): RMSNorm()\n\
+   \ (k_layernorm): RMSNorm()\n )\n (pre_cross_attn_layernorm):\
+   \ IdentityOp()\n (cross_attention): IdentityOp()\n (cross_attn_bda):\
+   \ IdentityFuncOp()\n (pre_mlp_layernorm): IdentityOp()\n (mlp):\
+   \ MLP(\n (linear_fc1): TELayerNormColumnParallelLinear(in_features=2560,\
+   \ out_features=19456, bias=False, TP=1)\n (linear_fc2): TERowParallelLinear(in_features=9728,\
+   \ out_features=2560, bias=False, TP=1)\n )\n )\n )\n\
+   \ (final_layernorm): RMSNorm()\n )\n (output_layer): ColumnParallelLinear(in_features=2560,\
+   \ out_features=16, bias=False, TP=1)\n )\n )\n)>"
+ normalization: RMSNorm
+ num_attention_heads: 32
+ num_layers: 36
+ num_layers_at_end_in_bf16: 1
+ num_layers_at_start_in_bf16: 1
+ num_moe_experts: None
+ num_query_groups: 8
+ nvidia_modelopt_version: 0.39.0
+ offload_modules: None
+ param_sync_func: None
+ params_dtype: torch.bfloat16
+ perform_initialization: true
+ persist_layer_norm: false
+ position_embedding_type: rope
+ qk_clip: false
+ qk_clip_alpha: '0.5'
+ qk_clip_threshold: 100
+ qk_layernorm: true
+ quant_recipe: None
+ restore_modelopt_state: false
+ rotary_base: 5000000
+ rotary_interleaved: false
+ rotary_percent: '1.0'
+ seq_len_interpolation_factor: None
+ seq_length: 262144
+ share_embeddings_and_output_weights: true
+ should_pad_vocab: false
+ softmax_scale: None
+ softmax_type: vanilla
+ symmetric_ar_type: None
+ test_mode: false
+ timers: None
+ transformer_impl: transformer_engine
+ transformer_layer_spec: <function default_layer_spec at 0x7cfa556ad440>
+ use_fused_weighted_squared_relu: false
+ use_kitchen: false
+ use_mamba_mem_eff_path: true
+ use_ring_exchange_p2p: false
+ use_te_activation_func: false
+ use_te_rng_tracker: false
+ use_transformer_engine_full_layer_spec: false
+ use_transformer_engine_op_fuser: false
+ variable_seq_lengths: false
+ vocab_size: 16
+ wgrad_deferral_limit: 0
+ window_attn_skip_freq: None
+ window_size: None
step_5400/policy/weights/iter_0000000/run_config.yaml ADDED
@@ -0,0 +1,564 @@
+ _target_: megatron.bridge.training.config.ConfigContainer
+ checkpoint:
+   _target_: megatron.bridge.training.config.CheckpointConfig
+   async_save: false
+   ckpt_assume_constant_structure: false
+   ckpt_convert_format: null
+   ckpt_convert_save: null
+   ckpt_format: torch_dist
+   ckpt_step: null
+   dist_ckpt_optim_fully_reshardable: false
+   dist_ckpt_save_pre_mcore_014: false
+   dist_ckpt_strictness: assume_ok_unexpected
+   distrib_optim_fully_reshardable_mem_efficient: false
+   exit_on_missing_checkpoint: false
+   finetune: true
+   fully_parallel_load: true
+   fully_parallel_save: true
+   load: null
+   load_main_params_from_ckpt: false
+   load_optim: true
+   load_rng: false
+   most_recent_k: -1
+   non_persistent_ckpt_type: null
+   non_persistent_global_ckpt_dir: null
+   non_persistent_local_ckpt_algo: fully_parallel
+   non_persistent_local_ckpt_dir: null
+   non_persistent_save_interval: null
+   pretrained_checkpoint: ./results/megatron/model_._models_Qwen-NVARC
+   replication: false
+   replication_factor: 2
+   replication_jump: null
+   save: /workspace/ARChitects/results/qwen3_4b_sft/tmp_step_5400/policy/weights
+   save_interval: 100
+   save_optim: true
+   save_rng: true
+   save_tokenizer_assets: true
+   strict_fsdp_dtensor_load: false
+   use_checkpoint_args: false
+   use_persistent_ckpt_worker: true
+ comm_overlap: null
+ dataset: null
+ ddp:
+   _target_: megatron.bridge.training.config.DistributedDataParallelConfig
+   align_param_gather: false
+   average_in_collective: false
+   bucket_size: 40000000
+   check_for_large_grads: false
+   check_for_nan_in_grad: true
+   data_parallel_sharding_strategy: optim_grads_params
+   delay_wgrad_compute: false
+   disable_symmetric_registration: false
+   fp8_param_gather: false
+   fsdp_double_buffer: false
+   grad_reduce_in_fp32: true
+   gradient_reduce_div_fusion: true
+   keep_fp8_transpose_cache: false
+   nccl_ub: false
+   num_distributed_optimizer_instances: 1
+   outer_dp_sharding_strategy: no_shard
+   overlap_grad_reduce: true
+   overlap_param_gather: true
+   pad_buckets_for_high_nccl_busbw: false
+   preserve_fp32_weights: true
+   reduce_scatter_with_fp32_accumulation: false
+   reuse_grad_buf_for_mxfp8_param_ag: false
+   suggested_communication_unit_size: null
+   use_custom_fsdp: false
+   use_distributed_optimizer: true
+   use_megatron_fsdp: false
+ dist:
+   _target_: megatron.bridge.training.config.DistributedInitConfig
+   align_grad_reduce: true
+   disable_jit_fuser: false
+   distributed_backend: nccl
+   distributed_timeout_minutes: 10
+   distributed_timeout_seconds_after_init: null
+   enable_megatron_core_experimental: false
+   external_gpu_device_mapping: true
+   high_priority_stream_groups: null
+   lazy_init: false
+   local_rank: 0
+   nccl_communicator_config_path: null
+   sharp_enabled_group: null
+   use_gloo_process_groups: true
+   use_megatron_fsdp: false
+   use_sharp: false
+   use_torch_fsdp2: false
+   use_tp_pp_dp_mapping: false
+ ft: null
+ inprocess_restart: null
+ logger:
+   _target_: megatron.bridge.training.config.LoggerConfig
+   filter_warnings: true
+   log_energy: false
+   log_interval: 100
+   log_l2_norm_grad_to_tensorboard: false
+   log_loss_scale_to_tensorboard: true
+   log_memory_to_tensorboard: false
+   log_params_norm: false
+   log_progress: false
+   log_runtime_to_tensorboard: false
+   log_throughput: false
+   log_throughput_to_tensorboard: false
+   log_timers_to_tensorboard: false
+   log_validation_ppl_to_tensorboard: false
+   log_world_size_to_tensorboard: false
+   logging_level: 0
+   memory_keys: null
+   modules_to_filter: null
+   runtime_time_unit: hours
+   save_config_filepath: null
+   set_level_for_all_loggers: false
+   tensorboard_dir: null
+   tensorboard_log_interval: 1
+   tensorboard_queue_size: 1000
+   throughput_window_size: 100
+   timing_log_level: 0
+   timing_log_option: minmax
+   wandb_entity: null
+   wandb_exp_name: null
+   wandb_project: null
+   wandb_save_dir: null
+ mixed_precision: null
+ model:
+   _target_: megatron.bridge.models.qwen.qwen_provider.Qwen3ModelProvider
+   account_for_embedding_in_pipeline_split: false
+   account_for_loss_in_pipeline_split: false
+   activation_func:
+     _call_: false
+     _target_: torch.nn.functional.silu
+   activation_func_clamp_value: null
+   activation_func_fp8_input_store: false
+   add_bias_linear: false
+   add_qkv_bias: false
+   apply_query_key_layer_scaling: false
+   apply_residual_connection_post_layernorm: false
+   apply_rope_fusion: true
+   async_tensor_model_parallel_allreduce: false
+   attention_backend:
+     _args_:
+     - 5
+     _call_: true
+     _target_: megatron.core.transformer.enums.AttnBackend
+   attention_dropout: 0.0
+   attention_output_gate: false
+   attention_softmax_in_fp32: false
+   autocast_dtype:
+     _call_: false
+     _target_: torch.bfloat16
+   barrier_with_L1_time: true
+   batch_p2p_comm: true
+   batch_p2p_sync: true
+   bf16: true
+   bias_activation_fusion: false
+   bias_dropout_fusion: false
+   calculate_per_token_loss: true
+   clone_scatter_output_in_embedding: true
+   config_logger_dir: ''
+   context_parallel_size: 2
+   cp_comm_type: null
+   cpu_offloading: false
+   cpu_offloading_activations: true
+   cpu_offloading_double_buffering: false
+   cpu_offloading_num_layers: 0
+   cpu_offloading_weights: false
+   cross_entropy_fusion_impl: native
+   cross_entropy_loss_fusion: true
+   cuda_graph_impl: none
+   cuda_graph_retain_backward_graph: false
+   cuda_graph_scope: []
+   cuda_graph_use_single_mempool: false
+   cuda_graph_warmup_steps: 3
+   deallocate_pipeline_outputs: true
+   defer_embedding_wgrad_compute: false
+   delay_wgrad_compute: false
+   deterministic_mode: false
+   disable_bf16_reduced_precision_matmul: false
+   disable_parameter_transpose_cache: false
+   distribute_saved_activations: null
+   embedding_init_method:
+     _args_: []
+     _partial_: true
+     _target_: torch.nn.init.normal_
+     mean: 0.0
+     std: 0.02
+   embedding_init_method_std: 0.02
+   enable_autocast: false
+   enable_cuda_graph: false
+   expert_model_parallel_size: 1
+   expert_tensor_parallel_size: 1
+   external_cuda_graph: false
+   fallback_to_eager_attn: false
+   ffn_hidden_size: 9728
+   finalize_model_grads_func:
+     _args_: []
+     _partial_: true
+     _target_: megatron.core.distributed.finalize_model_grads.finalize_model_grads
+     pg_collection: null
+   fine_grained_activation_offloading: false
+   first_last_layers_bf16: false
+   flash_decode: false
+   fp16: false
+   fp16_lm_cross_entropy: false
+   fp32_residual_connection: false
+   fp4: null
+   fp4_param: false
+   fp4_quantizer_factory: null
+   fp4_recipe: nvfp4
+   fp8: null
+   fp8_amax_compute_algo: most_recent
+   fp8_amax_history_len: 1
+   fp8_dot_product_attention: false
+   fp8_interval: 1
+   fp8_margin: 0
+   fp8_multi_head_attention: false
+   fp8_param: false
+   fp8_quantizer_factory: null
+   fp8_recipe: delayed
+   fp8_wgrad: true
+   fused_single_qkv_rope: false
+   gated_linear_unit: true
+   generation_config: null
+   glu_linear_offset: 0.0
+   grad_scale_func:
+     _call_: false
+     _target_: megatron.core.optimizer.optimizer.MegatronOptimizer.scale_loss
+   grad_sync_func:
+     _call_: false
+     _target_: megatron.core.distributed.distributed_data_parallel.DistributedDataParallel.start_grad_sync
+   gradient_accumulation_fusion: false
+   hetereogenous_dist_checkpoint: false
+   heterogeneous_block_specs: false
+   hf_model_id: ./models/Qwen-NVARC
+   hidden_dropout: 0.0
+   hidden_size: 2560
+   hierarchical_context_parallel_sizes: null
+   inference_rng_tracker: false
+   inference_sampling_seed: 42
+   init_method:
+     _args_: []
+     _partial_: true
+     _target_: torch.nn.init.normal_
+     mean: 0.0
+     std: 0.02
+   init_method_std: 0.02
+   init_model_with_meta_device: false
+   is_hybrid_model: false
+   kv_channels: 128
+   layernorm_epsilon: 1.0e-06
+   layernorm_zero_centered_gamma: false
+   linear_attention_freq: null
+   linear_attention_type: null
+   linear_conv_kernel_dim: null
+   linear_key_head_dim: null
+   linear_num_key_heads: null
+   linear_num_value_heads: null
+   linear_value_head_dim: null
+   log_max_attention_logit: false
+   make_vocab_size_divisible_by: 16
+   mamba_head_dim: 64
+   mamba_num_groups: 8
+   mamba_num_heads: null
+   mamba_state_dim: 128
+   masked_softmax_fusion: true
+   max_position_embeddings: 40960
+   memory_efficient_layer_norm: false
+   microbatch_group_size_per_vp_stage: 1
+   min_offloaded_tensor_size: 1048576
+   mlp_chunks_for_prefill: 1
+   moe_apply_probs_on_input: false
+   moe_aux_loss_coeff: 0.0
+   moe_deepep_num_sms: 20
+   moe_enable_deepep: false
+   moe_expert_capacity_factor: null
+   moe_extended_tp: false
+   moe_ffn_hidden_size: null
+   moe_flex_dispatcher_backend: deepep
+   moe_grouped_gemm: false
+   moe_hybridep_num_sms: 16
+   moe_input_jitter_eps: null
+   moe_layer_freq: 1
+   moe_layer_recompute: false
+   moe_pad_expert_input_to_capacity: false
+   moe_per_layer_logging: false
+   moe_permute_fusion: false
+   moe_router_bias_update_rate: 0.0
+   moe_router_dtype: fp64
+   moe_router_enable_expert_bias: false
+   moe_router_force_load_balancing: false
+   moe_router_fusion: false
+   moe_router_group_topk: null
+   moe_router_load_balancing_type: none
+   moe_router_num_groups: null
+   moe_router_padding_for_fp8: false
+   moe_router_padding_for_quantization: false
+   moe_router_pre_softmax: false
+   moe_router_score_function: softmax
+   moe_router_topk: 2
+   moe_router_topk_limited_devices: null
+   moe_router_topk_scaling_factor: null
+   moe_shared_expert_gate: false
+   moe_shared_expert_intermediate_size: null
+   moe_shared_expert_overlap: false
+   moe_token_dispatcher_type: allgather
+   moe_token_drop_policy: probs
+   moe_token_dropping: false
+   moe_use_legacy_grouped_gemm: false
+   moe_z_loss_coeff: null
+   mrope_section: null
+   mtp_enabled: false
+   mtp_loss_scaling_factor: null
+   mtp_num_layers: null
+   mtp_standalone: false
+   multi_latent_attention: false
+   no_rope_freq: null
+   no_sync_func:
+     _call_: false
+     _target_: megatron.core.distributed.distributed_data_parallel.DistributedDataParallel.no_sync
+   normalization: RMSNorm
+   num_attention_heads: 32
+   num_layers: 36
+   num_layers_at_end_in_bf16: 1
+   num_layers_at_start_in_bf16: 1
+   num_layers_in_first_pipeline_stage: null
+   num_layers_in_last_pipeline_stage: null
+   num_microbatches_with_partial_activation_checkpoints: null
+   num_moe_experts: null
+   num_query_groups: 8
+   offload_modules: null
+   output_layer_init_method:
+     _args_: []
+     _partial_: true
+     _target_: torch.nn.init.normal_
+     mean: 0.0
+     std: 0.0023570226039551587
+   overlap_moe_expert_parallel_comm: false
+   overlap_p2p_comm: false
+   overlap_p2p_comm_warmup_flush: false
+   parallel_output: true
+   param_sync_func: null
+   params_dtype:
+     _call_: false
+     _target_: torch.bfloat16
+   perform_initialization: true
+   persist_layer_norm: false
+   pipeline_dtype:
+     _call_: false
+     _target_: torch.bfloat16
+   pipeline_model_parallel_comm_backend: null
+   pipeline_model_parallel_layout: null
+   pipeline_model_parallel_size: 1
+   position_embedding_type: rope
+   qk_clip: false
+   qk_clip_alpha: 0.5
+   qk_clip_threshold: 100
+   qk_layernorm: true
+   quant_recipe: null
+   recompute_granularity: full
+   recompute_method: uniform
+   recompute_modules:
+   - core_attn
+   recompute_num_layers: 1
+   restore_modelopt_state: false
+   rotary_base: 5000000
+   rotary_interleaved: false
+   rotary_percent: 1.0
+   scatter_embedding_sequence_parallel: true
+   seq_len_interpolation_factor: null
+   seq_length: 262144
+   sequence_parallel: false
+   share_embeddings_and_output_weights: true
+   should_pad_vocab: false
+   softmax_scale: null
+   softmax_type: vanilla
+   symmetric_ar_type: null
+   tensor_model_parallel_size: 1
+   test_mode: false
+   timers: null
+   tp_comm_atomic_ag: false
+   tp_comm_atomic_rs: false
+   tp_comm_bootstrap_backend: nccl
+   tp_comm_bulk_dgrad: true
+   tp_comm_bulk_wgrad: true
+   tp_comm_overlap: false
+   tp_comm_overlap_ag: true
+   tp_comm_overlap_cfg: null
+   tp_comm_overlap_disable_fc1: false
+   tp_comm_overlap_disable_qkv: false
+   tp_comm_overlap_rs: true
+   tp_comm_overlap_rs_dgrad: false
+   tp_comm_split_ag: true
+   tp_comm_split_rs: true
+   tp_only_amax_red: false
+   transformer_impl: transformer_engine
+   transformer_layer_spec:
+     _call_: false
+     _target_: megatron.bridge.models.gpt_provider.default_layer_spec
+   use_cpu_initialization: false
+   use_fused_weighted_squared_relu: false
+   use_kitchen: false
+   use_mamba_mem_eff_path: true
+   use_ring_exchange_p2p: false
+   use_te_activation_func: false
+   use_te_rng_tracker: false
+   use_transformer_engine_full_layer_spec: false
+   use_transformer_engine_op_fuser: false
+   variable_seq_lengths: false
+   virtual_pipeline_model_parallel_size: null
+   vocab_size: 16
+   wgrad_deferral_limit: 0
+   window_attn_skip_freq: null
+   window_size: null
+ nvrx_straggler: null
+ optimizer:
+   _target_: megatron.bridge.training.config.OptimizerConfig
+   adam_beta1: 0.9
+   adam_beta2: 0.98
+   adam_eps: 1.0e-08
+   barrier_with_L1_time: false
+   bf16: true
+   clip_grad: 0.5
+   config_logger_dir: ''
+   decoupled_lr: null
+   decoupled_min_lr: null
+   decoupled_weight_decay: true
+   exp_avg_dtype:
+     _call_: false
+     _target_: torch.float32
+   exp_avg_sq_dtype:
+     _call_: false
+     _target_: torch.float32
+   fp16: false
+   fp8_recipe: null
+   hysteresis: 2
+   initial_loss_scale: 4294967296
+   log_num_zeros_in_grad: false
+   loss_scale: null
+   loss_scale_window: 1000
+   lr: 0.0001
+   main_grads_dtype:
+     _call_: false
+     _target_: torch.float32
+   main_params_dtype:
+     _call_: false
+     _target_: torch.float32
+   min_loss_scale: 1.0
+   min_lr: 1.0e-07
+   muon_extra_scale_factor: 1.0
+   muon_fp32_matmul_prec: medium
+   muon_momentum: 0.95
+   muon_num_ns_steps: 5
+   muon_scale_mode: spectral
+   muon_split_qkv: true
+   muon_tp_mode: blockwise
+   muon_use_nesterov: false
+   optimizer: adam
+   optimizer_cpu_offload: false
+   optimizer_offload_fraction: 0.0
+   overlap_cpu_optimizer_d2h_h2d: false
+   overlap_param_gather: false
+   overlap_param_gather_with_optimizer_step: false
+   params_dtype: bfloat16
+   pin_cpu_grads: true
+   pin_cpu_params: true
+   reuse_grad_buf_for_mxfp8_param_ag: false
+   sgd_momentum: 0.9
+   store_param_remainders: true
+   timers: null
+   use_distributed_optimizer: true
+   use_precision_aware_optimizer: false
+   use_torch_optimizer_for_cpu_offload: false
+   weight_decay: 0.1
+ peft: null
+ profiling:
+   _target_: megatron.bridge.training.config.ProfilingConfig
+   memory_snapshot_path: snapshot.pickle
+   nvtx_ranges: false
+   profile_ranks:
+   - 0
+   profile_step_end: 12
+   profile_step_start: 10
+   record_memory_history: false
+   record_shapes: false
+   use_nsys_profiler: false
+   use_pytorch_profiler: false
+ rerun_state_machine:
+   _target_: megatron.bridge.training.config.RerunStateMachineConfig
+   check_for_nan_in_loss: true
+   check_for_spiky_loss: false
+   error_injection_rate: 0
+   error_injection_type: transient_error
+   rerun_mode: disabled
+ rng:
+   _target_: megatron.bridge.training.config.RNGConfig
+   data_parallel_random_init: false
+   inference_rng_tracker: false
+   seed: 1234
+   te_rng_tracker: false
+ scheduler:
+   _target_: megatron.bridge.training.config.SchedulerConfig
+   end_weight_decay: 0.1
+   lr_decay_iters: 12716
+   lr_decay_samples: null
+   lr_decay_steps: 3255296
+   lr_decay_style: linear
+   lr_warmup_fraction: null
+   lr_warmup_init: 1.0e-06
+   lr_warmup_iters: 200
+   lr_warmup_samples: 0
+   lr_warmup_steps: 51200
+   lr_wsd_decay_iters: null
+   lr_wsd_decay_samples: null
+   lr_wsd_decay_style: exponential
+   no_weight_decay_cond_type: null
+   override_opt_param_scheduler: false
+   start_weight_decay: 0.1
+   use_checkpoint_opt_param_scheduler: false
+   wd_incr_steps: 1528832
+   weight_decay_incr_style: constant
+   wsd_decay_steps: null
+ straggler: null
+ tensor_inspect: null
+ tokenizer:
+   _target_: megatron.bridge.training.tokenizers.config.TokenizerConfig
+   hf_tokenizer_kwargs: {}
+   image_tag_type: null
+   merge_file: null
+   special_tokens: null
+   tiktoken_num_special_tokens: 1000
+   tiktoken_pattern: null
+   tiktoken_special_tokens: null
+   tokenizer_model: ./models/Qwen-NVARC
+   tokenizer_prompt_format: null
+   tokenizer_type: HuggingFaceTokenizer
+   vocab_extra_ids: 0
+   vocab_file: null
+   vocab_size: null
+ train:
+   _target_: megatron.bridge.training.config.TrainingConfig
+   check_weight_hash_across_dp_replicas_interval: null
+   decrease_batch_size_if_needed: false
+   empty_unused_memory_level: 0
+   eval_interval: 1000
+   eval_iters: 100
+   exit_duration_in_mins: null
+   exit_interval: null
+   exit_signal:
+     _args_:
+     - 15
+     _call_: true
+     _target_: signal.Signals
+   exit_signal_handler: false
+   exit_signal_handler_for_dataloader: false
+   global_batch_size: 256
+   iterations_to_skip: []
+   manual_gc: false
+   manual_gc_eval: true
+   manual_gc_interval: 0
+   micro_batch_size: 1
+   rampup_batch_size: null
+   skip_train: false
+   train_iters: 5972
+   train_samples: null
+   train_sync_interval: null
step_5400/policy/weights/iter_0000000/train_state.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9077193280ecd59384d5ae94d97a7943638810502f2ce08a7617aeb20b3586d2
+ size 3461
step_5400/policy/weights/latest_checkpointed_iteration.txt ADDED
@@ -0,0 +1 @@
+ 0
step_5400/policy/weights/latest_train_state.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9077193280ecd59384d5ae94d97a7943638810502f2ce08a7617aeb20b3586d2
+ size 3461
step_5400/train_dataloader.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:99b28546a485528f6242d1b9dcf951cc95a6af0ca81a13ded15a567a8c9d2f7f
+ size 7336
step_5400/training_info.json ADDED
@@ -0,0 +1 @@
+ {"epoch": 0, "step": 5400, "total_steps": 5400, "consumed_samples": 1382400, "total_valid_tokens": 1568487826.0, "val:val_loss": 0.14914798736572266}
step_5600/config.yaml ADDED
@@ -0,0 +1,207 @@
+ checkpointing:
+   checkpoint_dir: results/qwen3_4b_sft
+   checkpoint_must_save_by: null
+   enabled: true
+   higher_is_better: false
+   keep_top_k: 3
+   metric_name: val:val_loss
+   save_period: 200
+ cluster:
+   gpus_per_node: 2
+   num_nodes: 1
+ data:
+   num_workers: 4
+   shuffle: true
+   train_dataset_path:
+   - ./data/hones
+   val_dataset_path: ./data/arc2_evaluation6
+ logger:
+   gpu_monitoring:
+     collection_interval: 10
+     flush_interval: 10
+   log_dir: logs/exp_019
+   mlflow_enabled: false
+   monitor_gpus: false
+   swanlab_enabled: false
+   tensorboard_enabled: false
+   wandb:
+     name: qwen3_4b_sft
+     project: arc2
+   wandb_enabled: true
+ policy:
+   activation_checkpointing_enabled: false
+   attn_implementation: flash_attention_2
+   dtensor_cfg:
+     enabled: false
+   dynamic_batching:
+     enabled: false
+   fsdp_offload_enabled: false
+   make_sequence_length_divisible_by: 64
+   max_grad_norm: null
+   megatron_cfg:
+     activation_checkpointing: true
+     apply_rope_fusion: true
+     bias_activation_fusion: false
+     context_parallel_size: 2
+     distributed_data_parallel_config:
+       average_in_collective: true
+       data_parallel_sharding_strategy: optim_grads_params
+       grad_reduce_in_fp32: true
+       overlap_grad_reduce: true
+       overlap_param_gather: true
+     empty_unused_memory_level: 1
+     enabled: true
+     env_vars:
+       AWS_OFI_NCCL_VERSION: 1.14.0
+       BASH_ENV: /etc/bash.bashrc
+       CAL_VERSION: 0.4.4.50
+       CUBLASMP_VERSION: 0.4.0.789
+       CUBLAS_VERSION: 12.9.0.13
+       CUDA_ARCH_LIST: 7.5 8.0 8.6 9.0 10.0 12.0
+       CUDA_DRIVER_VERSION: 575.51.03
+       CUDA_VERSION: 12.9.0.043
+       CUDA_VISIBLE_DEVICES: 6,7
+       CUDNN_FRONTEND_VERSION: 1.11.0
+       CUDNN_VERSION: 9.10.1.4
+       CUFFT_VERSION: 11.4.0.6
+       CUFILE_VERSION: 1.14.0.30
+       CURAND_VERSION: 10.3.10.19
+       CUSOLVER_VERSION: 11.7.4.40
+       CUSPARSELT_VERSION: 0.7.1.0
+       CUSPARSE_VERSION: 12.5.9.5
+       DALI_BUILD: ''
+       DALI_URL_SUFFIX: '120'
+       DALI_VERSION: 1.49.0
+       EFA_VERSION: 1.38.1
+       ENV: /etc/shinit_v2
+       GDRCOPY_VERSION: 2.4.4
+       HOME: /root
+       HOSTNAME: e6ad2ac15863
+       HPCX_VERSION: '2.23'
+       KMP_DUPLICATE_LIB_OK: 'True'
+       KMP_INIT_AT_FORK: 'FALSE'
+       LC_CTYPE: C.UTF-8
+       LD_LIBRARY_PATH: /usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
+       LESSCLOSE: /usr/bin/lesspipe %s %s
+       LESSOPEN: '| /usr/bin/lesspipe %s'
+       LIBRARY_PATH: '/usr/local/cuda/lib64/stubs:'
+       LS_COLORS: 'rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=00:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.avif=01;35:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.webp=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:*~=00;90:*#=00;90:*.bak=00;90:*.crdownload=00;90:*.dpkg-dist=00;90:*.dpkg-new=00;90:*.dpkg-old=00;90:*.dpkg-tmp=00;90:*.old=00;90:*.orig=00;90:*.part=00;90:*.rej=00;90:*.rpmnew=00;90:*.rpmorig=00;90:*.rpmsave=00;90:*.swp=00;90:*.tmp=00;90:*.ucf-dist=00;90:*.ucf-new=00;90:*.ucf-old=00;90:'
+       MODEL_OPT_VERSION: 0.27.1
+       MOFED_VERSION: 5.4-rdmacore50.0
+       NCCL_NET_PLUGIN: aws-ofi
+       NCCL_TUNER_PLUGIN: aws-ofi
+       NCCL_VERSION: 2.26.5
+       NEMO_RL_COMMIT: 90ecaaccdf66fb92f814b2325db81a0105ed933a
+       NEMO_RL_VENV_DIR: /opt/ray_venvs
+       NPP_VERSION: 12.4.0.27
+       NRL_CONTAINER: '1'
+       NRL_MEGATRON_CHECKPOINT_DIR: ./results/megatron
+       NSIGHT_COMPUTE_VERSION: 2025.2.0.11
+       NSIGHT_SYSTEMS_VERSION: 2025.3.1.90
+       NVIDIA_BUILD_ID: '244212578'
+       NVIDIA_BUILD_REF: 6ba29f15920b12095387b8c82cea05ac3d6d9732
+       NVIDIA_DRIVER_CAPABILITIES: compute,utility,video
+       NVIDIA_PRODUCT_NAME: CUDA
+       NVIDIA_REQUIRE_CUDA: cuda>=9.0
+       NVIDIA_REQUIRE_JETPACK_HOST_MOUNTS: ''
+       NVIDIA_VISIBLE_DEVICES: all
+       NVJITLINK_VERSION: 12.9.41
+       NVJPEG_VERSION: 12.4.0.16
+       NVSHMEM_VERSION: 3.2.5
+       OLDPWD: /workspace
+       OMPI_MCA_coll_hcoll_enable: '0'
+       OPAL_PREFIX: /opt/hpcx/ompi
+       OPENMPI_VERSION: 4.1.7
+       OPENUCX_VERSION: 1.19.0
+       PATH: /opt/nemo_rl_venv/bin:/opt/nemo_rl_venv/bin:/opt/nemo_rl_venv/bin:/root/.local/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/mpi/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/ucx/bin:/opt/amazon/efa/bin
+       POLYGRAPHY_VERSION: 0.49.20
+       PWD: /workspace/ARChitects
+       PYTHONBREAKPOINT: ray.util.rpdb._driver_set_trace
+       PYTHONPATH: '/workspace/NVARC/ARChitects/nemo-rl:'
+       PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True
+       RAY_CLIENT_MODE: '0'
+       RAY_ENABLE_UV_RUN_RUNTIME_ENV: '0'
+       RAY_USAGE_STATS_ENABLED: '0'
+       RDMACORE_VERSION: '50.0'
+       SHELL: /bin/bash
+       SHLVL: '2'
+       SWANLAB_API_HOST: https://api.swanlab.cn/api
+       SWANLAB_RUNTIME: user
+       SWANLAB_WEB_HOST: https://swanlab.cn
+       TERM: xterm
+       TORCH_CUDA_ARCH_LIST: '9.0'
+       TRANSFORMER_ENGINE_VERSION: '2.3'
+       TRTOSS_VERSION: ''
+       TRT_VERSION: 10.10.0.31
+       UV: /root/.local/bin/uv
+       UV_LINK_MODE: copy
+       UV_PROJECT_ENVIRONMENT: /opt/nemo_rl_venv
+       UV_RUN_RECURSION_DEPTH: '1'
+       VIRTUAL_ENV: /opt/nemo_rl_venv
+       VIRTUAL_ENV_PROMPT: nemo-rl
+       WANDB_SERVICE: 3-239782-unix-/tmp/wandb-239782-251940-2530276073/socket
+       _: /root/.local/bin/uv
+       _CUDA_COMPAT_PATH: /usr/local/cuda/compat
+       _CUDA_COMPAT_STATUS: System has unsupported display driver / cuda driver combination
+         (CUDA_ERROR_SYSTEM_DRIVER_MISMATCH) cuInit()=803
+       _MLFLOW_TELEMETRY_SESSION_ID: 8ac94e5a44a94a62a2cf3302ff4fa0c9
+     expert_model_parallel_size: 1
+     expert_tensor_parallel_size: 1
+     freeze_moe_router: true
+     moe_permute_fusion: false
+     moe_router_bias_update_rate: 0.0
+     moe_router_dtype: fp64
+     moe_router_load_balancing_type: none
+     num_layers_in_first_pipeline_stage: null
+     num_layers_in_last_pipeline_stage: null
+     optimizer:
+       adam_beta1: 0.9
+       adam_beta2: 0.98
+       adam_eps: 1.0e-08
+       bf16: true
+       clip_grad: 0.5
+       fp16: false
+       lr: 0.0001
+       min_lr: 1.0e-07
+       optimizer: adam
+       optimizer_cpu_offload: false
+       optimizer_offload_fraction: 0.0
+       params_dtype: bfloat16
+       sgd_momentum: 0.9
+       use_distributed_optimizer: true
+       use_precision_aware_optimizer: false
+       weight_decay: 0.1
+     pipeline_dtype: bfloat16
+     pipeline_model_parallel_size: 1
+     scheduler:
+       end_weight_decay: 0.1
+       lr_decay_iters: 12716
+       lr_decay_style: linear
+       lr_warmup_init: 1.0e-06
+       lr_warmup_iters: 200
+       start_weight_decay: 0.1
+       weight_decay_incr_style: constant
+     sequence_parallel: false
+     tensor_model_parallel_size: 1
+     train_iters: 5972
+   model_name: ./models/Qwen-NVARC
+   offload_optimizer_for_logprob: false
+   precision: bfloat16
+   sequence_packing:
+     algorithm: modified_first_fit_decreasing
+     enabled: true
+     sequence_length_round: 64
+     train_mb_tokens: 128000
+   tokenizer:
+     name: ./models/Qwen-NVARC
+   train_global_batch_size: 256
+   train_micro_batch_size: 1
+ sft:
+   max_num_epochs: 1
+   max_num_steps: 6400
+   seed: 24
+   val_at_start: true
+   val_batches: 200
+   val_global_batch_size: 256
+   val_micro_batch_size: 1
+   val_period: 200
step_5600/policy/weights/iter_0000000/.metadata ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1a1c916057dfe0e2002fe62982907832c5f702012c7360c6613f4a610084f748
+ size 329201
step_5600/policy/weights/iter_0000000/common.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ca3b2e874e687352cb92c06fbffd56051ad75663c60fb5d275365fa00e02a4bb
+ size 1767
step_5600/policy/weights/iter_0000000/metadata.json ADDED
@@ -0,0 +1 @@
+ {"sharded_backend": "torch_dist", "sharded_backend_version": 1, "common_backend": "torch", "common_backend_version": 1}
step_5600/policy/weights/iter_0000000/modelopt_run_config.yaml ADDED
@@ -0,0 +1,203 @@
+ activation_func: <function silu at 0x7d0251c6b420>
+ activation_func_clamp_value: None
+ add_bias_linear: false
+ add_qkv_bias: false
+ apply_query_key_layer_scaling: false
+ apply_residual_connection_post_layernorm: false
+ apply_rope_fusion: true
+ attention_backend: AttnBackend.auto
+ attention_dropout: '0.0'
+ attention_output_gate: false
+ attention_softmax_in_fp32: false
+ autocast_dtype: torch.bfloat16
+ barrier_with_L1_time: true
+ bf16: true
+ bias_activation_fusion: false
+ bias_dropout_fusion: false
+ calculate_per_token_loss: true
+ clone_scatter_output_in_embedding: true
+ config_logger_dir: ''
+ cross_entropy_fusion_impl: native
+ cross_entropy_loss_fusion: true
+ defer_embedding_wgrad_compute: false
+ delay_wgrad_compute: false
+ deterministic_mode: false
+ disable_bf16_reduced_precision_matmul: false
+ disable_parameter_transpose_cache: false
+ distribute_saved_activations: None
+ enable_autocast: false
+ fallback_to_eager_attn: false
+ ffn_hidden_size: 9728
+ finalize_model_grads_func: functools.partial(<function finalize_model_grads at 0x7cfab95b9e40>,
+ pg_collection=None)
+ fine_grained_activation_offloading: false
+ first_last_layers_bf16: false
+ flash_decode: false
+ fp16: false
+ fp16_lm_cross_entropy: false
+ fp32_residual_connection: false
+ fused_single_qkv_rope: false
+ gated_linear_unit: true
+ generation_config: None
+ glu_linear_offset: '0.0'
+ grad_scale_func: <bound method MegatronOptimizer.scale_loss of <megatron.core.optimizer.optimizer.ChainedOptimizer
+ object at 0x7cf9d413cd70>>
+ grad_sync_func: "<bound method DistributedDataParallel.start_grad_sync of DistributedDataParallel(\n\
+ \ (module): CustomFloat16Module(\n (module): GPTModel(\n (embedding): LanguageModelEmbedding(\n\
+ \ (word_embeddings): VocabParallelEmbedding()\n (embedding_dropout):\
+ \ Dropout(p=0.0, inplace=False)\n )\n (rotary_pos_emb): RotaryEmbedding()\n\
+ \ (decoder): TransformerBlock(\n (layers): ModuleList(\n (0-35):\
+ \ 36 x TransformerLayer(\n (input_layernorm): IdentityOp()\n \
+ \ (self_attention): SelfAttention(\n (core_attention): TEDotProductAttention(\n\
+ \ (flash_attention): FlashAttention()\n (fused_attention):\
+ \ FusedAttention()\n (unfused_attention): UnfusedDotProductAttention(\n\
+ \ (scale_mask_softmax): FusedScaleMaskSoftmax()\n \
+ \ (attention_dropout): Dropout(p=0.0, inplace=False)\n )\n \
+ \ )\n (linear_proj): TERowParallelLinear(in_features=4096,\
+ \ out_features=2560, bias=False, TP=1)\n (linear_qkv): TELayerNormColumnParallelLinear(in_features=2560,\
+ \ out_features=6144, bias=False, TP=1)\n (q_layernorm): RMSNorm()\n\
+ \ (k_layernorm): RMSNorm()\n )\n (pre_cross_attn_layernorm):\
+ \ IdentityOp()\n (cross_attention): IdentityOp()\n (cross_attn_bda):\
+ \ IdentityFuncOp()\n (pre_mlp_layernorm): IdentityOp()\n (mlp):\
+ \ MLP(\n (linear_fc1): TELayerNormColumnParallelLinear(in_features=2560,\
+ \ out_features=19456, bias=False, TP=1)\n (linear_fc2): TERowParallelLinear(in_features=9728,\
+ \ out_features=2560, bias=False, TP=1)\n )\n )\n )\n\
+ \ (final_layernorm): RMSNorm()\n )\n (output_layer): ColumnParallelLinear(in_features=2560,\
+ \ out_features=16, bias=False, TP=1)\n )\n )\n)>"
+ gradient_accumulation_fusion: false
+ hetereogenous_dist_checkpoint: false
+ heterogeneous_block_specs: false
+ hf_model_id: ./models/Qwen-NVARC
+ hidden_dropout: '0.0'
+ hidden_size: 2560
+ is_hybrid_model: false
+ kv_channels: 128
+ layernorm_epsilon: 1e-06
+ layernorm_zero_centered_gamma: false
+ linear_attention_freq: None
+ linear_attention_type: None
+ linear_conv_kernel_dim: None
+ linear_key_head_dim: None
+ linear_num_key_heads: None
+ linear_num_value_heads: None
+ linear_value_head_dim: None
+ log_max_attention_logit: false
+ make_vocab_size_divisible_by: 16
+ mamba_head_dim: 64
+ mamba_num_groups: 8
+ mamba_num_heads: None
+ mamba_state_dim: 128
+ masked_softmax_fusion: true
+ max_position_embeddings: 40960
+ memory_efficient_layer_norm: false
+ min_offloaded_tensor_size: 1048576
+ mlp_chunks_for_prefill: 1
+ moe_apply_probs_on_input: false
+ moe_aux_loss_coeff: '0.0'
+ moe_deepep_num_sms: 20
+ moe_enable_deepep: false
+ moe_expert_capacity_factor: None
+ moe_extended_tp: false
+ moe_ffn_hidden_size: None
+ moe_flex_dispatcher_backend: deepep
+ moe_grouped_gemm: false
+ moe_hybridep_num_sms: 16
+ moe_input_jitter_eps: None
+ moe_layer_freq: 1
+ moe_pad_expert_input_to_capacity: false
+ moe_per_layer_logging: false
+ moe_permute_fusion: false
+ moe_router_bias_update_rate: '0.0'
+ moe_router_dtype: fp64
+ moe_router_enable_expert_bias: false
+ moe_router_force_load_balancing: false
+ moe_router_fusion: false
+ moe_router_group_topk: None
+ moe_router_load_balancing_type: none
+ moe_router_num_groups: None
+ moe_router_padding_for_quantization: false
+ moe_router_pre_softmax: false
+ moe_router_score_function: softmax
+ moe_router_topk: 2
+ moe_router_topk_limited_devices: None
+ moe_router_topk_scaling_factor: None
+ moe_shared_expert_gate: false
+ moe_shared_expert_intermediate_size: None
+ moe_shared_expert_overlap: false
+ moe_token_dispatcher_type: allgather
+ moe_token_drop_policy: probs
+ moe_token_dropping: false
+ moe_use_legacy_grouped_gemm: false
+ moe_z_loss_coeff: None
+ mrope_section: None
+ multi_latent_attention: false
+ no_rope_freq: None
+ no_sync_func: "<bound method DistributedDataParallel.no_sync of DistributedDataParallel(\n\
+ \ (module): CustomFloat16Module(\n (module): GPTModel(\n (embedding): LanguageModelEmbedding(\n\
+ \ (word_embeddings): VocabParallelEmbedding()\n (embedding_dropout):\
+ \ Dropout(p=0.0, inplace=False)\n )\n (rotary_pos_emb): RotaryEmbedding()\n\
+ \ (decoder): TransformerBlock(\n (layers): ModuleList(\n (0-35):\
+ \ 36 x TransformerLayer(\n (input_layernorm): IdentityOp()\n \
+ \ (self_attention): SelfAttention(\n (core_attention): TEDotProductAttention(\n\
+ \ (flash_attention): FlashAttention()\n (fused_attention):\
+ \ FusedAttention()\n (unfused_attention): UnfusedDotProductAttention(\n\
+ \ (scale_mask_softmax): FusedScaleMaskSoftmax()\n \
+ \ (attention_dropout): Dropout(p=0.0, inplace=False)\n )\n \
+ \ )\n (linear_proj): TERowParallelLinear(in_features=4096,\
+ \ out_features=2560, bias=False, TP=1)\n (linear_qkv): TELayerNormColumnParallelLinear(in_features=2560,\
+ \ out_features=6144, bias=False, TP=1)\n (q_layernorm): RMSNorm()\n\
+ \ (k_layernorm): RMSNorm()\n )\n (pre_cross_attn_layernorm):\
+ \ IdentityOp()\n (cross_attention): IdentityOp()\n (cross_attn_bda):\
+ \ IdentityFuncOp()\n (pre_mlp_layernorm): IdentityOp()\n (mlp):\
+ \ MLP(\n (linear_fc1): TELayerNormColumnParallelLinear(in_features=2560,\
+ \ out_features=19456, bias=False, TP=1)\n (linear_fc2): TERowParallelLinear(in_features=9728,\
+ \ out_features=2560, bias=False, TP=1)\n )\n )\n )\n\
+ \ (final_layernorm): RMSNorm()\n )\n (output_layer): ColumnParallelLinear(in_features=2560,\
+ \ out_features=16, bias=False, TP=1)\n )\n )\n)>"
+ normalization: RMSNorm
+ num_attention_heads: 32
+ num_layers: 36
+ num_layers_at_end_in_bf16: 1
+ num_layers_at_start_in_bf16: 1
+ num_moe_experts: None
+ num_query_groups: 8
+ nvidia_modelopt_version: 0.39.0
+ offload_modules: None
+ param_sync_func: None
+ params_dtype: torch.bfloat16
+ perform_initialization: true
+ persist_layer_norm: false
+ position_embedding_type: rope
+ qk_clip: false
+ qk_clip_alpha: '0.5'
+ qk_clip_threshold: 100
+ qk_layernorm: true
+ quant_recipe: None
+ restore_modelopt_state: false
+ rotary_base: 5000000
+ rotary_interleaved: false
+ rotary_percent: '1.0'
+ seq_len_interpolation_factor: None
+ seq_length: 262144
+ share_embeddings_and_output_weights: true
+ should_pad_vocab: false
+ softmax_scale: None
+ softmax_type: vanilla
+ symmetric_ar_type: None
+ test_mode: false
+ timers: None
+ transformer_impl: transformer_engine
+ transformer_layer_spec: <function default_layer_spec at 0x7cfa556ad440>
+ use_fused_weighted_squared_relu: false
+ use_kitchen: false
+ use_mamba_mem_eff_path: true
+ use_ring_exchange_p2p: false
+ use_te_activation_func: false
+ use_te_rng_tracker: false
+ use_transformer_engine_full_layer_spec: false
+ use_transformer_engine_op_fuser: false
+ variable_seq_lengths: false
+ vocab_size: 16
+ wgrad_deferral_limit: 0
+ window_attn_skip_freq: None
+ window_size: None
step_5600/policy/weights/iter_0000000/run_config.yaml ADDED
@@ -0,0 +1,564 @@
+ _target_: megatron.bridge.training.config.ConfigContainer
+ checkpoint:
+ _target_: megatron.bridge.training.config.CheckpointConfig
+ async_save: false
+ ckpt_assume_constant_structure: false
+ ckpt_convert_format: null
+ ckpt_convert_save: null
+ ckpt_format: torch_dist
+ ckpt_step: null
+ dist_ckpt_optim_fully_reshardable: false
+ dist_ckpt_save_pre_mcore_014: false
+ dist_ckpt_strictness: assume_ok_unexpected
+ distrib_optim_fully_reshardable_mem_efficient: false
+ exit_on_missing_checkpoint: false
+ finetune: true
+ fully_parallel_load: true
+ fully_parallel_save: true
+ load: null
+ load_main_params_from_ckpt: false
+ load_optim: true
+ load_rng: false
+ most_recent_k: -1
+ non_persistent_ckpt_type: null
+ non_persistent_global_ckpt_dir: null
+ non_persistent_local_ckpt_algo: fully_parallel
+ non_persistent_local_ckpt_dir: null
+ non_persistent_save_interval: null
+ pretrained_checkpoint: ./results/megatron/model_._models_Qwen-NVARC
+ replication: false
+ replication_factor: 2
+ replication_jump: null
+ save: /workspace/ARChitects/results/qwen3_4b_sft/tmp_step_5600/policy/weights
+ save_interval: 100
+ save_optim: true
+ save_rng: true
+ save_tokenizer_assets: true
+ strict_fsdp_dtensor_load: false
+ use_checkpoint_args: false
+ use_persistent_ckpt_worker: true
+ comm_overlap: null
+ dataset: null
+ ddp:
+ _target_: megatron.bridge.training.config.DistributedDataParallelConfig
+ align_param_gather: false
+ average_in_collective: false
+ bucket_size: 40000000
+ check_for_large_grads: false
+ check_for_nan_in_grad: true
+ data_parallel_sharding_strategy: optim_grads_params
+ delay_wgrad_compute: false
+ disable_symmetric_registration: false
+ fp8_param_gather: false
+ fsdp_double_buffer: false
+ grad_reduce_in_fp32: true
+ gradient_reduce_div_fusion: true
+ keep_fp8_transpose_cache: false
+ nccl_ub: false
+ num_distributed_optimizer_instances: 1
+ outer_dp_sharding_strategy: no_shard
+ overlap_grad_reduce: true
+ overlap_param_gather: true
+ pad_buckets_for_high_nccl_busbw: false
+ preserve_fp32_weights: true
+ reduce_scatter_with_fp32_accumulation: false
+ reuse_grad_buf_for_mxfp8_param_ag: false
+ suggested_communication_unit_size: null
+ use_custom_fsdp: false
+ use_distributed_optimizer: true
+ use_megatron_fsdp: false
+ dist:
+ _target_: megatron.bridge.training.config.DistributedInitConfig
+ align_grad_reduce: true
+ disable_jit_fuser: false
+ distributed_backend: nccl
+ distributed_timeout_minutes: 10
+ distributed_timeout_seconds_after_init: null
+ enable_megatron_core_experimental: false
+ external_gpu_device_mapping: true
+ high_priority_stream_groups: null
+ lazy_init: false
+ local_rank: 0
+ nccl_communicator_config_path: null
+ sharp_enabled_group: null
+ use_gloo_process_groups: true
+ use_megatron_fsdp: false
+ use_sharp: false
+ use_torch_fsdp2: false
+ use_tp_pp_dp_mapping: false
+ ft: null
+ inprocess_restart: null
+ logger:
+ _target_: megatron.bridge.training.config.LoggerConfig
+ filter_warnings: true
+ log_energy: false
+ log_interval: 100
+ log_l2_norm_grad_to_tensorboard: false
+ log_loss_scale_to_tensorboard: true
+ log_memory_to_tensorboard: false
+ log_params_norm: false
+ log_progress: false
+ log_runtime_to_tensorboard: false
+ log_throughput: false
+ log_throughput_to_tensorboard: false
+ log_timers_to_tensorboard: false
+ log_validation_ppl_to_tensorboard: false
+ log_world_size_to_tensorboard: false
+ logging_level: 0
+ memory_keys: null
+ modules_to_filter: null
+ runtime_time_unit: hours
+ save_config_filepath: null
+ set_level_for_all_loggers: false
+ tensorboard_dir: null
+ tensorboard_log_interval: 1
+ tensorboard_queue_size: 1000
+ throughput_window_size: 100
+ timing_log_level: 0
+ timing_log_option: minmax
+ wandb_entity: null
+ wandb_exp_name: null
+ wandb_project: null
+ wandb_save_dir: null
+ mixed_precision: null
+ model:
+ _target_: megatron.bridge.models.qwen.qwen_provider.Qwen3ModelProvider
+ account_for_embedding_in_pipeline_split: false
+ account_for_loss_in_pipeline_split: false
+ activation_func:
+ _call_: false
+ _target_: torch.nn.functional.silu
+ activation_func_clamp_value: null
+ activation_func_fp8_input_store: false
+ add_bias_linear: false
+ add_qkv_bias: false
+ apply_query_key_layer_scaling: false
+ apply_residual_connection_post_layernorm: false
+ apply_rope_fusion: true
+ async_tensor_model_parallel_allreduce: false
+ attention_backend:
+ _args_:
+ - 5
+ _call_: true
+ _target_: megatron.core.transformer.enums.AttnBackend
+ attention_dropout: 0.0
+ attention_output_gate: false
+ attention_softmax_in_fp32: false
+ autocast_dtype:
+ _call_: false
+ _target_: torch.bfloat16
+ barrier_with_L1_time: true
+ batch_p2p_comm: true
+ batch_p2p_sync: true
+ bf16: true
+ bias_activation_fusion: false
+ bias_dropout_fusion: false
+ calculate_per_token_loss: true
+ clone_scatter_output_in_embedding: true
+ config_logger_dir: ''
+ context_parallel_size: 2
+ cp_comm_type: null
+ cpu_offloading: false
+ cpu_offloading_activations: true
+ cpu_offloading_double_buffering: false
+ cpu_offloading_num_layers: 0
+ cpu_offloading_weights: false
+ cross_entropy_fusion_impl: native
+ cross_entropy_loss_fusion: true
+ cuda_graph_impl: none
+ cuda_graph_retain_backward_graph: false
+ cuda_graph_scope: []
+ cuda_graph_use_single_mempool: false
+ cuda_graph_warmup_steps: 3
+ deallocate_pipeline_outputs: true
+ defer_embedding_wgrad_compute: false
+ delay_wgrad_compute: false
+ deterministic_mode: false
+ disable_bf16_reduced_precision_matmul: false
+ disable_parameter_transpose_cache: false
+ distribute_saved_activations: null
+ embedding_init_method:
+ _args_: []
+ _partial_: true
+ _target_: torch.nn.init.normal_
+ mean: 0.0
+ std: 0.02
+ embedding_init_method_std: 0.02
+ enable_autocast: false
+ enable_cuda_graph: false
+ expert_model_parallel_size: 1
+ expert_tensor_parallel_size: 1
+ external_cuda_graph: false
+ fallback_to_eager_attn: false
+ ffn_hidden_size: 9728
+ finalize_model_grads_func:
+ _args_: []
+ _partial_: true
+ _target_: megatron.core.distributed.finalize_model_grads.finalize_model_grads
+ pg_collection: null
+ fine_grained_activation_offloading: false
+ first_last_layers_bf16: false
+ flash_decode: false
+ fp16: false
+ fp16_lm_cross_entropy: false
+ fp32_residual_connection: false
+ fp4: null
+ fp4_param: false
+ fp4_quantizer_factory: null
+ fp4_recipe: nvfp4
+ fp8: null
+ fp8_amax_compute_algo: most_recent
+ fp8_amax_history_len: 1
+ fp8_dot_product_attention: false
+ fp8_interval: 1
+ fp8_margin: 0
+ fp8_multi_head_attention: false
+ fp8_param: false
+ fp8_quantizer_factory: null
+ fp8_recipe: delayed
+ fp8_wgrad: true
+ fused_single_qkv_rope: false
+ gated_linear_unit: true
+ generation_config: null
+ glu_linear_offset: 0.0
+ grad_scale_func:
+ _call_: false
+ _target_: megatron.core.optimizer.optimizer.MegatronOptimizer.scale_loss
+ grad_sync_func:
+ _call_: false
+ _target_: megatron.core.distributed.distributed_data_parallel.DistributedDataParallel.start_grad_sync
+ gradient_accumulation_fusion: false
+ hetereogenous_dist_checkpoint: false
+ heterogeneous_block_specs: false
+ hf_model_id: ./models/Qwen-NVARC
+ hidden_dropout: 0.0
+ hidden_size: 2560
+ hierarchical_context_parallel_sizes: null
+ inference_rng_tracker: false
+ inference_sampling_seed: 42
+ init_method:
+ _args_: []
+ _partial_: true
+ _target_: torch.nn.init.normal_
+ mean: 0.0
+ std: 0.02
+ init_method_std: 0.02
+ init_model_with_meta_device: false
+ is_hybrid_model: false
+ kv_channels: 128
+ layernorm_epsilon: 1.0e-06
+ layernorm_zero_centered_gamma: false
+ linear_attention_freq: null
+ linear_attention_type: null
+ linear_conv_kernel_dim: null
+ linear_key_head_dim: null
+ linear_num_key_heads: null
+ linear_num_value_heads: null
+ linear_value_head_dim: null
+ log_max_attention_logit: false
+ make_vocab_size_divisible_by: 16
+ mamba_head_dim: 64
+ mamba_num_groups: 8
+ mamba_num_heads: null
+ mamba_state_dim: 128
+ masked_softmax_fusion: true
+ max_position_embeddings: 40960
+ memory_efficient_layer_norm: false
+ microbatch_group_size_per_vp_stage: 1
+ min_offloaded_tensor_size: 1048576
+ mlp_chunks_for_prefill: 1
+ moe_apply_probs_on_input: false
+ moe_aux_loss_coeff: 0.0
+ moe_deepep_num_sms: 20
+ moe_enable_deepep: false
+ moe_expert_capacity_factor: null
+ moe_extended_tp: false
+ moe_ffn_hidden_size: null
+ moe_flex_dispatcher_backend: deepep
+ moe_grouped_gemm: false
+ moe_hybridep_num_sms: 16
+ moe_input_jitter_eps: null
+ moe_layer_freq: 1
+ moe_layer_recompute: false
+ moe_pad_expert_input_to_capacity: false
+ moe_per_layer_logging: false
+ moe_permute_fusion: false
+ moe_router_bias_update_rate: 0.0
+ moe_router_dtype: fp64
+ moe_router_enable_expert_bias: false
+ moe_router_force_load_balancing: false
+ moe_router_fusion: false
+ moe_router_group_topk: null
+ moe_router_load_balancing_type: none
+ moe_router_num_groups: null
+ moe_router_padding_for_fp8: false
+ moe_router_padding_for_quantization: false
+ moe_router_pre_softmax: false
+ moe_router_score_function: softmax
+ moe_router_topk: 2
+ moe_router_topk_limited_devices: null
+ moe_router_topk_scaling_factor: null
+ moe_shared_expert_gate: false
+ moe_shared_expert_intermediate_size: null
+ moe_shared_expert_overlap: false
+ moe_token_dispatcher_type: allgather
+ moe_token_drop_policy: probs
+ moe_token_dropping: false
+ moe_use_legacy_grouped_gemm: false
+ moe_z_loss_coeff: null
+ mrope_section: null
+ mtp_enabled: false
+ mtp_loss_scaling_factor: null
+ mtp_num_layers: null
+ mtp_standalone: false
+ multi_latent_attention: false
+ no_rope_freq: null
+ no_sync_func:
+ _call_: false
+ _target_: megatron.core.distributed.distributed_data_parallel.DistributedDataParallel.no_sync
+ normalization: RMSNorm
+ num_attention_heads: 32
+ num_layers: 36
+ num_layers_at_end_in_bf16: 1
+ num_layers_at_start_in_bf16: 1
+ num_layers_in_first_pipeline_stage: null
+ num_layers_in_last_pipeline_stage: null
+ num_microbatches_with_partial_activation_checkpoints: null
+ num_moe_experts: null
+ num_query_groups: 8
+ offload_modules: null
+ output_layer_init_method:
+ _args_: []
+ _partial_: true
+ _target_: torch.nn.init.normal_
+ mean: 0.0
+ std: 0.0023570226039551587
+ overlap_moe_expert_parallel_comm: false
+ overlap_p2p_comm: false
+ overlap_p2p_comm_warmup_flush: false
+ parallel_output: true
+ param_sync_func: null
+ params_dtype:
+ _call_: false
+ _target_: torch.bfloat16
+ perform_initialization: true
+ persist_layer_norm: false
+ pipeline_dtype:
+ _call_: false
+ _target_: torch.bfloat16
+ pipeline_model_parallel_comm_backend: null
+ pipeline_model_parallel_layout: null
+ pipeline_model_parallel_size: 1
+ position_embedding_type: rope
+ qk_clip: false
+ qk_clip_alpha: 0.5
+ qk_clip_threshold: 100
+ qk_layernorm: true
+ quant_recipe: null
+ recompute_granularity: full
+ recompute_method: uniform
+ recompute_modules:
+ - core_attn
+ recompute_num_layers: 1
+ restore_modelopt_state: false
+ rotary_base: 5000000
+ rotary_interleaved: false
+ rotary_percent: 1.0
+ scatter_embedding_sequence_parallel: true
+ seq_len_interpolation_factor: null
+ seq_length: 262144
+ sequence_parallel: false
+ share_embeddings_and_output_weights: true
+ should_pad_vocab: false
+ softmax_scale: null
+ softmax_type: vanilla
+ symmetric_ar_type: null
+ tensor_model_parallel_size: 1
+ test_mode: false
+ timers: null
+ tp_comm_atomic_ag: false
+ tp_comm_atomic_rs: false
+ tp_comm_bootstrap_backend: nccl
+ tp_comm_bulk_dgrad: true
+ tp_comm_bulk_wgrad: true
+ tp_comm_overlap: false
+ tp_comm_overlap_ag: true
+ tp_comm_overlap_cfg: null
+ tp_comm_overlap_disable_fc1: false
+ tp_comm_overlap_disable_qkv: false
+ tp_comm_overlap_rs: true
+ tp_comm_overlap_rs_dgrad: false
+ tp_comm_split_ag: true
+ tp_comm_split_rs: true
+ tp_only_amax_red: false
+ transformer_impl: transformer_engine
+ transformer_layer_spec:
+ _call_: false
+ _target_: megatron.bridge.models.gpt_provider.default_layer_spec
+ use_cpu_initialization: false
+ use_fused_weighted_squared_relu: false
+ use_kitchen: false
+ use_mamba_mem_eff_path: true
+ use_ring_exchange_p2p: false
+ use_te_activation_func: false
+ use_te_rng_tracker: false
+ use_transformer_engine_full_layer_spec: false
+ use_transformer_engine_op_fuser: false
+ variable_seq_lengths: false
+ virtual_pipeline_model_parallel_size: null
+ vocab_size: 16
+ wgrad_deferral_limit: 0
+ window_attn_skip_freq: null
+ window_size: null
+ nvrx_straggler: null
+ optimizer:
+ _target_: megatron.bridge.training.config.OptimizerConfig
+ adam_beta1: 0.9
+ adam_beta2: 0.98
+ adam_eps: 1.0e-08
+ barrier_with_L1_time: false
+ bf16: true
+ clip_grad: 0.5
+ config_logger_dir: ''
+ decoupled_lr: null
+ decoupled_min_lr: null
+ decoupled_weight_decay: true
+ exp_avg_dtype:
+ _call_: false
+ _target_: torch.float32
+ exp_avg_sq_dtype:
+ _call_: false
+ _target_: torch.float32
+ fp16: false
+ fp8_recipe: null
+ hysteresis: 2
+ initial_loss_scale: 4294967296
+ log_num_zeros_in_grad: false
+ loss_scale: null
+ loss_scale_window: 1000
+ lr: 0.0001
+ main_grads_dtype:
+ _call_: false
+ _target_: torch.float32
+ main_params_dtype:
+ _call_: false
+ _target_: torch.float32
+ min_loss_scale: 1.0
+ min_lr: 1.0e-07
+ muon_extra_scale_factor: 1.0
+ muon_fp32_matmul_prec: medium
+ muon_momentum: 0.95
+ muon_num_ns_steps: 5
+ muon_scale_mode: spectral
+ muon_split_qkv: true
+ muon_tp_mode: blockwise
+ muon_use_nesterov: false
+ optimizer: adam
+ optimizer_cpu_offload: false
+ optimizer_offload_fraction: 0.0
+ overlap_cpu_optimizer_d2h_h2d: false
+ overlap_param_gather: false
+ overlap_param_gather_with_optimizer_step: false
+ params_dtype: bfloat16
+ pin_cpu_grads: true
+ pin_cpu_params: true
+ reuse_grad_buf_for_mxfp8_param_ag: false
+ sgd_momentum: 0.9
+ store_param_remainders: true
+ timers: null
+ use_distributed_optimizer: true
+ use_precision_aware_optimizer: false
+ use_torch_optimizer_for_cpu_offload: false
+ weight_decay: 0.1
+ peft: null
+ profiling:
+ _target_: megatron.bridge.training.config.ProfilingConfig
+ memory_snapshot_path: snapshot.pickle
+ nvtx_ranges: false
+ profile_ranks:
+ - 0
+ profile_step_end: 12
+ profile_step_start: 10
+ record_memory_history: false
+ record_shapes: false
+ use_nsys_profiler: false
+ use_pytorch_profiler: false
+ rerun_state_machine:
+ _target_: megatron.bridge.training.config.RerunStateMachineConfig
+ check_for_nan_in_loss: true
+ check_for_spiky_loss: false
+ error_injection_rate: 0
+ error_injection_type: transient_error
+ rerun_mode: disabled
+ rng:
+ _target_: megatron.bridge.training.config.RNGConfig
+ data_parallel_random_init: false
+ inference_rng_tracker: false
+ seed: 1234
+ te_rng_tracker: false
+ scheduler:
+ _target_: megatron.bridge.training.config.SchedulerConfig
+ end_weight_decay: 0.1
+ lr_decay_iters: 12716
+ lr_decay_samples: null
+ lr_decay_steps: 3255296
+ lr_decay_style: linear
+ lr_warmup_fraction: null
+ lr_warmup_init: 1.0e-06
+ lr_warmup_iters: 200
+ lr_warmup_samples: 0
+ lr_warmup_steps: 51200
+ lr_wsd_decay_iters: null
+ lr_wsd_decay_samples: null
+ lr_wsd_decay_style: exponential
+ no_weight_decay_cond_type: null
+ override_opt_param_scheduler: false
+ start_weight_decay: 0.1
+ use_checkpoint_opt_param_scheduler: false
+ wd_incr_steps: 1528832
+ weight_decay_incr_style: constant
+ wsd_decay_steps: null
+ straggler: null
+ tensor_inspect: null
+ tokenizer:
+ _target_: megatron.bridge.training.tokenizers.config.TokenizerConfig
+ hf_tokenizer_kwargs: {}
+ image_tag_type: null
+ merge_file: null
+ special_tokens: null
+ tiktoken_num_special_tokens: 1000
+ tiktoken_pattern: null
+ tiktoken_special_tokens: null
+ tokenizer_model: ./models/Qwen-NVARC
+ tokenizer_prompt_format: null
+ tokenizer_type: HuggingFaceTokenizer
+ vocab_extra_ids: 0
+ vocab_file: null
+ vocab_size: null
+ train:
+ _target_: megatron.bridge.training.config.TrainingConfig
+ check_weight_hash_across_dp_replicas_interval: null
+ decrease_batch_size_if_needed: false
+ empty_unused_memory_level: 0
+ eval_interval: 1000
+ eval_iters: 100
+ exit_duration_in_mins: null
+ exit_interval: null
+ exit_signal:
+ _args_:
+ - 15
+ _call_: true
+ _target_: signal.Signals
+ exit_signal_handler: false
+ exit_signal_handler_for_dataloader: false
+ global_batch_size: 256
+ iterations_to_skip: []
+ manual_gc: false
+ manual_gc_eval: true
+ manual_gc_interval: 0
+ micro_batch_size: 1
+ rampup_batch_size: null
+ skip_train: false
+ train_iters: 5972
+ train_samples: null
+ train_sync_interval: null
step_5600/policy/weights/iter_0000000/train_state.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9077193280ecd59384d5ae94d97a7943638810502f2ce08a7617aeb20b3586d2
+ size 3461
step_5600/policy/weights/latest_checkpointed_iteration.txt ADDED
@@ -0,0 +1 @@
+ 0
step_5600/policy/weights/latest_train_state.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9077193280ecd59384d5ae94d97a7943638810502f2ce08a7617aeb20b3586d2
+ size 3461
step_5600/train_dataloader.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:abe0ee8c91d5ba1b614239817486eae38492eb4f5f311f8b71c6b33bc2151b2b
+ size 7336
step_5600/training_info.json ADDED
@@ -0,0 +1 @@
+ {"epoch": 0, "step": 5600, "total_steps": 5600, "consumed_samples": 1433600, "total_valid_tokens": 1626494740.0, "val:val_loss": 0.14774028956890106}
step_5800/config.yaml ADDED
@@ -0,0 +1,207 @@
+ checkpointing:
+ checkpoint_dir: results/qwen3_4b_sft
+ checkpoint_must_save_by: null
+ enabled: true
+ higher_is_better: false
+ keep_top_k: 3
+ metric_name: val:val_loss
+ save_period: 200
+ cluster:
+ gpus_per_node: 2
+ num_nodes: 1
+ data:
+ num_workers: 4
+ shuffle: true
+ train_dataset_path:
+ - ./data/hones
+ val_dataset_path: ./data/arc2_evaluation6
+ logger:
+ gpu_monitoring:
+ collection_interval: 10
+ flush_interval: 10
+ log_dir: logs/exp_019
+ mlflow_enabled: false
+ monitor_gpus: false
+ swanlab_enabled: false
+ tensorboard_enabled: false
+ wandb:
+ name: qwen3_4b_sft
+ project: arc2
+ wandb_enabled: true
+ policy:
+ activation_checkpointing_enabled: false
+ attn_implementation: flash_attention_2
+ dtensor_cfg:
+ enabled: false
+ dynamic_batching:
+ enabled: false
+ fsdp_offload_enabled: false
+ make_sequence_length_divisible_by: 64
+ max_grad_norm: null
+ megatron_cfg:
+ activation_checkpointing: true
+ apply_rope_fusion: true
+ bias_activation_fusion: false
+ context_parallel_size: 2
+ distributed_data_parallel_config:
+ average_in_collective: true
+ data_parallel_sharding_strategy: optim_grads_params
+ grad_reduce_in_fp32: true
+ overlap_grad_reduce: true
+ overlap_param_gather: true
+ empty_unused_memory_level: 1
+ enabled: true
+ env_vars:
+ AWS_OFI_NCCL_VERSION: 1.14.0
+ BASH_ENV: /etc/bash.bashrc
+ CAL_VERSION: 0.4.4.50
+ CUBLASMP_VERSION: 0.4.0.789
+ CUBLAS_VERSION: 12.9.0.13
+ CUDA_ARCH_LIST: 7.5 8.0 8.6 9.0 10.0 12.0
+ CUDA_DRIVER_VERSION: 575.51.03
+ CUDA_VERSION: 12.9.0.043
+ CUDA_VISIBLE_DEVICES: 6,7
+ CUDNN_FRONTEND_VERSION: 1.11.0
+ CUDNN_VERSION: 9.10.1.4
+ CUFFT_VERSION: 11.4.0.6
+ CUFILE_VERSION: 1.14.0.30
+ CURAND_VERSION: 10.3.10.19
+ CUSOLVER_VERSION: 11.7.4.40
+ CUSPARSELT_VERSION: 0.7.1.0
+ CUSPARSE_VERSION: 12.5.9.5
+ DALI_BUILD: ''
+ DALI_URL_SUFFIX: '120'
+ DALI_VERSION: 1.49.0
+ EFA_VERSION: 1.38.1
+ ENV: /etc/shinit_v2
+ GDRCOPY_VERSION: 2.4.4
+ HOME: /root
+ HOSTNAME: e6ad2ac15863
+ HPCX_VERSION: '2.23'
+ KMP_DUPLICATE_LIB_OK: 'True'
+ KMP_INIT_AT_FORK: 'FALSE'
+ LC_CTYPE: C.UTF-8
+ LD_LIBRARY_PATH: /usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
+ LESSCLOSE: /usr/bin/lesspipe %s %s
+ LESSOPEN: '| /usr/bin/lesspipe %s'
+ LIBRARY_PATH: '/usr/local/cuda/lib64/stubs:'
+ LS_COLORS: 'rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=00:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.avif=01;35:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.webp=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:*~=00;90:*#=00;90:*.bak=00;90:*.crdownload=00;90:*.dpkg-dist=00;90:*.dpkg-new=00;90:*.dpkg-old=00;90:*.dpkg-tmp=00;90:*.old=00;90:*.orig=00;90:*.part=00;90:*.rej=00;90:*.rpmnew=00;90:*.rpmorig=00;90:*.rpmsave=00;90:*.swp=00;90:*.tmp=00;90:*.ucf-dist=00;90:*.ucf-new=00;90:*.ucf-old=00;90:'
+ MODEL_OPT_VERSION: 0.27.1
+ MOFED_VERSION: 5.4-rdmacore50.0
+ NCCL_NET_PLUGIN: aws-ofi
+ NCCL_TUNER_PLUGIN: aws-ofi
+ NCCL_VERSION: 2.26.5
+ NEMO_RL_COMMIT: 90ecaaccdf66fb92f814b2325db81a0105ed933a
+ NEMO_RL_VENV_DIR: /opt/ray_venvs
+ NPP_VERSION: 12.4.0.27
+ NRL_CONTAINER: '1'
+ NRL_MEGATRON_CHECKPOINT_DIR: ./results/megatron
+ NSIGHT_COMPUTE_VERSION: 2025.2.0.11
+ NSIGHT_SYSTEMS_VERSION: 2025.3.1.90
+ NVIDIA_BUILD_ID: '244212578'
+ NVIDIA_BUILD_REF: 6ba29f15920b12095387b8c82cea05ac3d6d9732
+ NVIDIA_DRIVER_CAPABILITIES: compute,utility,video
+ NVIDIA_PRODUCT_NAME: CUDA
+ NVIDIA_REQUIRE_CUDA: cuda>=9.0
+ NVIDIA_REQUIRE_JETPACK_HOST_MOUNTS: ''
+ NVIDIA_VISIBLE_DEVICES: all
+ NVJITLINK_VERSION: 12.9.41
+ NVJPEG_VERSION: 12.4.0.16
+ NVSHMEM_VERSION: 3.2.5
+ OLDPWD: /workspace
+ OMPI_MCA_coll_hcoll_enable: '0'
+ OPAL_PREFIX: /opt/hpcx/ompi
+ OPENMPI_VERSION: 4.1.7
+ OPENUCX_VERSION: 1.19.0
+ PATH: /opt/nemo_rl_venv/bin:/opt/nemo_rl_venv/bin:/opt/nemo_rl_venv/bin:/root/.local/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/mpi/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/ucx/bin:/opt/amazon/efa/bin
+ POLYGRAPHY_VERSION: 0.49.20
+ PWD: /workspace/ARChitects
+ PYTHONBREAKPOINT: ray.util.rpdb._driver_set_trace
+ PYTHONPATH: '/workspace/NVARC/ARChitects/nemo-rl:'
+ PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True
+ RAY_CLIENT_MODE: '0'
+ RAY_ENABLE_UV_RUN_RUNTIME_ENV: '0'
+ RAY_USAGE_STATS_ENABLED: '0'
+ RDMACORE_VERSION: '50.0'
+ SHELL: /bin/bash
+ SHLVL: '2'
+ SWANLAB_API_HOST: https://api.swanlab.cn/api
+ SWANLAB_RUNTIME: user
+ SWANLAB_WEB_HOST: https://swanlab.cn
+ TERM: xterm
+ TORCH_CUDA_ARCH_LIST: '9.0'
+ TRANSFORMER_ENGINE_VERSION: '2.3'
+ TRTOSS_VERSION: ''
+ TRT_VERSION: 10.10.0.31
+ UV: /root/.local/bin/uv
+ UV_LINK_MODE: copy
+ UV_PROJECT_ENVIRONMENT: /opt/nemo_rl_venv
+ UV_RUN_RECURSION_DEPTH: '1'
+ VIRTUAL_ENV: /opt/nemo_rl_venv
+ VIRTUAL_ENV_PROMPT: nemo-rl
+ WANDB_SERVICE: 3-239782-unix-/tmp/wandb-239782-251940-2530276073/socket
+ _: /root/.local/bin/uv
+ _CUDA_COMPAT_PATH: /usr/local/cuda/compat
+ _CUDA_COMPAT_STATUS: System has unsupported display driver / cuda driver combination
+ (CUDA_ERROR_SYSTEM_DRIVER_MISMATCH) cuInit()=803
+ _MLFLOW_TELEMETRY_SESSION_ID: 8ac94e5a44a94a62a2cf3302ff4fa0c9
+ expert_model_parallel_size: 1
+ expert_tensor_parallel_size: 1
+ freeze_moe_router: true
+ moe_permute_fusion: false
+ moe_router_bias_update_rate: 0.0
+ moe_router_dtype: fp64
+ moe_router_load_balancing_type: none
+ num_layers_in_first_pipeline_stage: null
+ num_layers_in_last_pipeline_stage: null
+ optimizer:
+ adam_beta1: 0.9
+ adam_beta2: 0.98
+ adam_eps: 1.0e-08
+ bf16: true
+ clip_grad: 0.5
+ fp16: false
+ lr: 0.0001
+ min_lr: 1.0e-07
+ optimizer: adam
+ optimizer_cpu_offload: false
+ optimizer_offload_fraction: 0.0
+ params_dtype: bfloat16
+ sgd_momentum: 0.9
+ use_distributed_optimizer: true
+ use_precision_aware_optimizer: false
+ weight_decay: 0.1
+ pipeline_dtype: bfloat16
+ pipeline_model_parallel_size: 1
+ scheduler:
+ end_weight_decay: 0.1
+ lr_decay_iters: 12716
+ lr_decay_style: linear
+ lr_warmup_init: 1.0e-06
+ lr_warmup_iters: 200
+ start_weight_decay: 0.1
+ weight_decay_incr_style: constant
+ sequence_parallel: false
+ tensor_model_parallel_size: 1
+ train_iters: 5972
+ model_name: ./models/Qwen-NVARC
+ offload_optimizer_for_logprob: false
+ precision: bfloat16
+ sequence_packing:
+ algorithm: modified_first_fit_decreasing
+ enabled: true
+ sequence_length_round: 64
+ train_mb_tokens: 128000
+ tokenizer:
+ name: ./models/Qwen-NVARC
+ train_global_batch_size: 256
+ train_micro_batch_size: 1
+ sft:
+ max_num_epochs: 1
+ max_num_steps: 6400
+ seed: 24
+ val_at_start: true
+ val_batches: 200
+ val_global_batch_size: 256
+ val_micro_batch_size: 1
+ val_period: 200
step_5800/policy/weights/iter_0000000/.metadata ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2dd177ae05a23762b1acc7a8eff274e5a9104b258ba48b225e821312fb6de12f
+ size 329201
step_5800/policy/weights/iter_0000000/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2884ab50f51fa561ef6e4a6a4f422b146a712c0b47eb1ff41494ace545036d06
+ size 12718332319
step_5800/policy/weights/iter_0000000/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:619738ba8dfec45074012486c339e30475eb90b1f9ec0d57c6eb9ae4cbb4af39
+ size 12717813616
step_5800/policy/weights/iter_0000000/common.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fcd736818bbf683f63191cf9ab55ee9ec1d1ba58597572923af7a35da3c7f532
+ size 1767
step_5800/policy/weights/iter_0000000/metadata.json ADDED
@@ -0,0 +1 @@
+ {"sharded_backend": "torch_dist", "sharded_backend_version": 1, "common_backend": "torch", "common_backend_version": 1}
step_5800/policy/weights/iter_0000000/modelopt_run_config.yaml ADDED
@@ -0,0 +1,203 @@
+ activation_func: <function silu at 0x7d0251c6b420>
+ activation_func_clamp_value: None
+ add_bias_linear: false
+ add_qkv_bias: false
+ apply_query_key_layer_scaling: false
+ apply_residual_connection_post_layernorm: false
+ apply_rope_fusion: true
+ attention_backend: AttnBackend.auto
+ attention_dropout: '0.0'
+ attention_output_gate: false
+ attention_softmax_in_fp32: false
+ autocast_dtype: torch.bfloat16
+ barrier_with_L1_time: true
+ bf16: true
+ bias_activation_fusion: false
+ bias_dropout_fusion: false
+ calculate_per_token_loss: true
+ clone_scatter_output_in_embedding: true
+ config_logger_dir: ''
+ cross_entropy_fusion_impl: native
+ cross_entropy_loss_fusion: true
+ defer_embedding_wgrad_compute: false
+ delay_wgrad_compute: false
+ deterministic_mode: false
+ disable_bf16_reduced_precision_matmul: false
+ disable_parameter_transpose_cache: false
+ distribute_saved_activations: None
+ enable_autocast: false
+ fallback_to_eager_attn: false
+ ffn_hidden_size: 9728
+ finalize_model_grads_func: functools.partial(<function finalize_model_grads at 0x7cfab95b9e40>,
+ pg_collection=None)
+ fine_grained_activation_offloading: false
+ first_last_layers_bf16: false
+ flash_decode: false
+ fp16: false
+ fp16_lm_cross_entropy: false
+ fp32_residual_connection: false
+ fused_single_qkv_rope: false
+ gated_linear_unit: true
+ generation_config: None
+ glu_linear_offset: '0.0'
+ grad_scale_func: <bound method MegatronOptimizer.scale_loss of <megatron.core.optimizer.optimizer.ChainedOptimizer
+ object at 0x7cf9d413cd70>>
+ grad_sync_func: "<bound method DistributedDataParallel.start_grad_sync of DistributedDataParallel(\n\
+ \ (module): CustomFloat16Module(\n (module): GPTModel(\n (embedding): LanguageModelEmbedding(\n\
+ \ (word_embeddings): VocabParallelEmbedding()\n (embedding_dropout):\
+ \ Dropout(p=0.0, inplace=False)\n )\n (rotary_pos_emb): RotaryEmbedding()\n\
+ \ (decoder): TransformerBlock(\n (layers): ModuleList(\n (0-35):\
+ \ 36 x TransformerLayer(\n (input_layernorm): IdentityOp()\n \
+ \ (self_attention): SelfAttention(\n (core_attention): TEDotProductAttention(\n\
+ \ (flash_attention): FlashAttention()\n (fused_attention):\
+ \ FusedAttention()\n (unfused_attention): UnfusedDotProductAttention(\n\
+ \ (scale_mask_softmax): FusedScaleMaskSoftmax()\n \
+ \ (attention_dropout): Dropout(p=0.0, inplace=False)\n )\n \
+ \ )\n (linear_proj): TERowParallelLinear(in_features=4096,\
+ \ out_features=2560, bias=False, TP=1)\n (linear_qkv): TELayerNormColumnParallelLinear(in_features=2560,\
+ \ out_features=6144, bias=False, TP=1)\n (q_layernorm): RMSNorm()\n\
+ \ (k_layernorm): RMSNorm()\n )\n (pre_cross_attn_layernorm):\
+ \ IdentityOp()\n (cross_attention): IdentityOp()\n (cross_attn_bda):\
+ \ IdentityFuncOp()\n (pre_mlp_layernorm): IdentityOp()\n (mlp):\
+ \ MLP(\n (linear_fc1): TELayerNormColumnParallelLinear(in_features=2560,\
+ \ out_features=19456, bias=False, TP=1)\n (linear_fc2): TERowParallelLinear(in_features=9728,\
+ \ out_features=2560, bias=False, TP=1)\n )\n )\n )\n\
+ \ (final_layernorm): RMSNorm()\n )\n (output_layer): ColumnParallelLinear(in_features=2560,\
+ \ out_features=16, bias=False, TP=1)\n )\n )\n)>"
+ gradient_accumulation_fusion: false
+ hetereogenous_dist_checkpoint: false
+ heterogeneous_block_specs: false
+ hf_model_id: ./models/Qwen-NVARC
+ hidden_dropout: '0.0'
+ hidden_size: 2560
+ is_hybrid_model: false
+ kv_channels: 128
+ layernorm_epsilon: 1e-06
+ layernorm_zero_centered_gamma: false
+ linear_attention_freq: None
+ linear_attention_type: None
+ linear_conv_kernel_dim: None
+ linear_key_head_dim: None
+ linear_num_key_heads: None
+ linear_num_value_heads: None
+ linear_value_head_dim: None
+ log_max_attention_logit: false
+ make_vocab_size_divisible_by: 16
+ mamba_head_dim: 64
+ mamba_num_groups: 8
+ mamba_num_heads: None
+ mamba_state_dim: 128
+ masked_softmax_fusion: true
+ max_position_embeddings: 40960
+ memory_efficient_layer_norm: false
+ min_offloaded_tensor_size: 1048576
+ mlp_chunks_for_prefill: 1
+ moe_apply_probs_on_input: false
+ moe_aux_loss_coeff: '0.0'
+ moe_deepep_num_sms: 20
+ moe_enable_deepep: false
+ moe_expert_capacity_factor: None
+ moe_extended_tp: false
+ moe_ffn_hidden_size: None
+ moe_flex_dispatcher_backend: deepep
+ moe_grouped_gemm: false
+ moe_hybridep_num_sms: 16
+ moe_input_jitter_eps: None
+ moe_layer_freq: 1
+ moe_pad_expert_input_to_capacity: false
+ moe_per_layer_logging: false
+ moe_permute_fusion: false
+ moe_router_bias_update_rate: '0.0'
+ moe_router_dtype: fp64
+ moe_router_enable_expert_bias: false
+ moe_router_force_load_balancing: false
+ moe_router_fusion: false
+ moe_router_group_topk: None
+ moe_router_load_balancing_type: none
+ moe_router_num_groups: None
+ moe_router_padding_for_quantization: false
+ moe_router_pre_softmax: false
+ moe_router_score_function: softmax
+ moe_router_topk: 2
+ moe_router_topk_limited_devices: None
+ moe_router_topk_scaling_factor: None
+ moe_shared_expert_gate: false
+ moe_shared_expert_intermediate_size: None
+ moe_shared_expert_overlap: false
+ moe_token_dispatcher_type: allgather
+ moe_token_drop_policy: probs
+ moe_token_dropping: false
+ moe_use_legacy_grouped_gemm: false
+ moe_z_loss_coeff: None
+ mrope_section: None
+ multi_latent_attention: false
+ no_rope_freq: None
+ no_sync_func: "<bound method DistributedDataParallel.no_sync of DistributedDataParallel(\n\
+ \ (module): CustomFloat16Module(\n (module): GPTModel(\n (embedding): LanguageModelEmbedding(\n\
+ \ (word_embeddings): VocabParallelEmbedding()\n (embedding_dropout):\
+ \ Dropout(p=0.0, inplace=False)\n )\n (rotary_pos_emb): RotaryEmbedding()\n\
+ \ (decoder): TransformerBlock(\n (layers): ModuleList(\n (0-35):\
+ \ 36 x TransformerLayer(\n (input_layernorm): IdentityOp()\n \
+ \ (self_attention): SelfAttention(\n (core_attention): TEDotProductAttention(\n\
+ \ (flash_attention): FlashAttention()\n (fused_attention):\
+ \ FusedAttention()\n (unfused_attention): UnfusedDotProductAttention(\n\
+ \ (scale_mask_softmax): FusedScaleMaskSoftmax()\n \
+ \ (attention_dropout): Dropout(p=0.0, inplace=False)\n )\n \
+ \ )\n (linear_proj): TERowParallelLinear(in_features=4096,\
+ \ out_features=2560, bias=False, TP=1)\n (linear_qkv): TELayerNormColumnParallelLinear(in_features=2560,\
+ \ out_features=6144, bias=False, TP=1)\n (q_layernorm): RMSNorm()\n\
+ \ (k_layernorm): RMSNorm()\n )\n (pre_cross_attn_layernorm):\
+ \ IdentityOp()\n (cross_attention): IdentityOp()\n (cross_attn_bda):\
+ \ IdentityFuncOp()\n (pre_mlp_layernorm): IdentityOp()\n (mlp):\
+ \ MLP(\n (linear_fc1): TELayerNormColumnParallelLinear(in_features=2560,\
+ \ out_features=19456, bias=False, TP=1)\n (linear_fc2): TERowParallelLinear(in_features=9728,\
+ \ out_features=2560, bias=False, TP=1)\n )\n )\n )\n\
+ \ (final_layernorm): RMSNorm()\n )\n (output_layer): ColumnParallelLinear(in_features=2560,\
+ \ out_features=16, bias=False, TP=1)\n )\n )\n)>"
+ normalization: RMSNorm
+ num_attention_heads: 32
+ num_layers: 36
+ num_layers_at_end_in_bf16: 1
+ num_layers_at_start_in_bf16: 1
+ num_moe_experts: None
+ num_query_groups: 8
+ nvidia_modelopt_version: 0.39.0
+ offload_modules: None
+ param_sync_func: None
+ params_dtype: torch.bfloat16
+ perform_initialization: true
+ persist_layer_norm: false
+ position_embedding_type: rope
+ qk_clip: false
+ qk_clip_alpha: '0.5'
+ qk_clip_threshold: 100
+ qk_layernorm: true
+ quant_recipe: None
+ restore_modelopt_state: false
+ rotary_base: 5000000
+ rotary_interleaved: false
+ rotary_percent: '1.0'
+ seq_len_interpolation_factor: None
+ seq_length: 262144
+ share_embeddings_and_output_weights: true
+ should_pad_vocab: false
+ softmax_scale: None
+ softmax_type: vanilla
+ symmetric_ar_type: None
+ test_mode: false
+ timers: None
+ transformer_impl: transformer_engine
+ transformer_layer_spec: <function default_layer_spec at 0x7cfa556ad440>
+ use_fused_weighted_squared_relu: false
+ use_kitchen: false
+ use_mamba_mem_eff_path: true
+ use_ring_exchange_p2p: false
+ use_te_activation_func: false
+ use_te_rng_tracker: false
+ use_transformer_engine_full_layer_spec: false
+ use_transformer_engine_op_fuser: false
+ variable_seq_lengths: false
+ vocab_size: 16
+ wgrad_deferral_limit: 0
+ window_attn_skip_freq: None
+ window_size: None
step_5800/policy/weights/iter_0000000/run_config.yaml ADDED
@@ -0,0 +1,564 @@
+ _target_: megatron.bridge.training.config.ConfigContainer
+ checkpoint:
+ _target_: megatron.bridge.training.config.CheckpointConfig
+ async_save: false
+ ckpt_assume_constant_structure: false
+ ckpt_convert_format: null
+ ckpt_convert_save: null
+ ckpt_format: torch_dist
+ ckpt_step: null
+ dist_ckpt_optim_fully_reshardable: false
+ dist_ckpt_save_pre_mcore_014: false
+ dist_ckpt_strictness: assume_ok_unexpected
+ distrib_optim_fully_reshardable_mem_efficient: false
+ exit_on_missing_checkpoint: false
+ finetune: true
+ fully_parallel_load: true
+ fully_parallel_save: true
+ load: null
+ load_main_params_from_ckpt: false
+ load_optim: true
+ load_rng: false
+ most_recent_k: -1
+ non_persistent_ckpt_type: null
+ non_persistent_global_ckpt_dir: null
+ non_persistent_local_ckpt_algo: fully_parallel
+ non_persistent_local_ckpt_dir: null
+ non_persistent_save_interval: null
+ pretrained_checkpoint: ./results/megatron/model_._models_Qwen-NVARC
+ replication: false
+ replication_factor: 2
+ replication_jump: null
+ save: /workspace/ARChitects/results/qwen3_4b_sft/tmp_step_5800/policy/weights
+ save_interval: 100
+ save_optim: true
+ save_rng: true
+ save_tokenizer_assets: true
+ strict_fsdp_dtensor_load: false
+ use_checkpoint_args: false
+ use_persistent_ckpt_worker: true
+ comm_overlap: null
+ dataset: null
+ ddp:
+ _target_: megatron.bridge.training.config.DistributedDataParallelConfig
+ align_param_gather: false
+ average_in_collective: false
+ bucket_size: 40000000
+ check_for_large_grads: false
+ check_for_nan_in_grad: true
+ data_parallel_sharding_strategy: optim_grads_params
+ delay_wgrad_compute: false
+ disable_symmetric_registration: false
+ fp8_param_gather: false
+ fsdp_double_buffer: false
+ grad_reduce_in_fp32: true
+ gradient_reduce_div_fusion: true
+ keep_fp8_transpose_cache: false
+ nccl_ub: false
+ num_distributed_optimizer_instances: 1
+ outer_dp_sharding_strategy: no_shard
+ overlap_grad_reduce: true
+ overlap_param_gather: true
+ pad_buckets_for_high_nccl_busbw: false
+ preserve_fp32_weights: true
+ reduce_scatter_with_fp32_accumulation: false
+ reuse_grad_buf_for_mxfp8_param_ag: false
+ suggested_communication_unit_size: null
+ use_custom_fsdp: false
+ use_distributed_optimizer: true
+ use_megatron_fsdp: false
+ dist:
+ _target_: megatron.bridge.training.config.DistributedInitConfig
+ align_grad_reduce: true
+ disable_jit_fuser: false
+ distributed_backend: nccl
+ distributed_timeout_minutes: 10
+ distributed_timeout_seconds_after_init: null
+ enable_megatron_core_experimental: false
+ external_gpu_device_mapping: true
+ high_priority_stream_groups: null
+ lazy_init: false
+ local_rank: 0
+ nccl_communicator_config_path: null
+ sharp_enabled_group: null
+ use_gloo_process_groups: true
+ use_megatron_fsdp: false
+ use_sharp: false
+ use_torch_fsdp2: false
+ use_tp_pp_dp_mapping: false
+ ft: null
+ inprocess_restart: null
+ logger:
+ _target_: megatron.bridge.training.config.LoggerConfig
+ filter_warnings: true
+ log_energy: false
+ log_interval: 100
+ log_l2_norm_grad_to_tensorboard: false
+ log_loss_scale_to_tensorboard: true
+ log_memory_to_tensorboard: false
+ log_params_norm: false
+ log_progress: false
+ log_runtime_to_tensorboard: false
+ log_throughput: false
+ log_throughput_to_tensorboard: false
+ log_timers_to_tensorboard: false
+ log_validation_ppl_to_tensorboard: false
+ log_world_size_to_tensorboard: false
+ logging_level: 0
+ memory_keys: null
+ modules_to_filter: null
+ runtime_time_unit: hours
+ save_config_filepath: null
+ set_level_for_all_loggers: false
+ tensorboard_dir: null
+ tensorboard_log_interval: 1
+ tensorboard_queue_size: 1000
+ throughput_window_size: 100
+ timing_log_level: 0
+ timing_log_option: minmax
+ wandb_entity: null
+ wandb_exp_name: null
+ wandb_project: null
+ wandb_save_dir: null
+ mixed_precision: null
+ model:
+ _target_: megatron.bridge.models.qwen.qwen_provider.Qwen3ModelProvider
+ account_for_embedding_in_pipeline_split: false
+ account_for_loss_in_pipeline_split: false
+ activation_func:
+ _call_: false
+ _target_: torch.nn.functional.silu
+ activation_func_clamp_value: null
+ activation_func_fp8_input_store: false
+ add_bias_linear: false
+ add_qkv_bias: false
+ apply_query_key_layer_scaling: false
+ apply_residual_connection_post_layernorm: false
+ apply_rope_fusion: true
+ async_tensor_model_parallel_allreduce: false
+ attention_backend:
+ _args_:
+ - 5
+ _call_: true
+ _target_: megatron.core.transformer.enums.AttnBackend
+ attention_dropout: 0.0
+ attention_output_gate: false
+ attention_softmax_in_fp32: false
+ autocast_dtype:
+ _call_: false
+ _target_: torch.bfloat16
+ barrier_with_L1_time: true
+ batch_p2p_comm: true
+ batch_p2p_sync: true
+ bf16: true
+ bias_activation_fusion: false
+ bias_dropout_fusion: false
+ calculate_per_token_loss: true
+ clone_scatter_output_in_embedding: true
+ config_logger_dir: ''
+ context_parallel_size: 2
+ cp_comm_type: null
+ cpu_offloading: false
+ cpu_offloading_activations: true
+ cpu_offloading_double_buffering: false
+ cpu_offloading_num_layers: 0
+ cpu_offloading_weights: false
+ cross_entropy_fusion_impl: native
+ cross_entropy_loss_fusion: true
+ cuda_graph_impl: none
+ cuda_graph_retain_backward_graph: false
+ cuda_graph_scope: []
+ cuda_graph_use_single_mempool: false
+ cuda_graph_warmup_steps: 3
+ deallocate_pipeline_outputs: true
+ defer_embedding_wgrad_compute: false
+ delay_wgrad_compute: false
+ deterministic_mode: false
+ disable_bf16_reduced_precision_matmul: false
+ disable_parameter_transpose_cache: false
+ distribute_saved_activations: null
+ embedding_init_method:
+ _args_: []
+ _partial_: true
+ _target_: torch.nn.init.normal_
+ mean: 0.0
+ std: 0.02
+ embedding_init_method_std: 0.02
+ enable_autocast: false
+ enable_cuda_graph: false
+ expert_model_parallel_size: 1
+ expert_tensor_parallel_size: 1
+ external_cuda_graph: false
+ fallback_to_eager_attn: false
+ ffn_hidden_size: 9728
+ finalize_model_grads_func:
+ _args_: []
+ _partial_: true
+ _target_: megatron.core.distributed.finalize_model_grads.finalize_model_grads
+ pg_collection: null
+ fine_grained_activation_offloading: false
+ first_last_layers_bf16: false
+ flash_decode: false
+ fp16: false
+ fp16_lm_cross_entropy: false
+ fp32_residual_connection: false
+ fp4: null
+ fp4_param: false
+ fp4_quantizer_factory: null
+ fp4_recipe: nvfp4
+ fp8: null
+ fp8_amax_compute_algo: most_recent
+ fp8_amax_history_len: 1
+ fp8_dot_product_attention: false
+ fp8_interval: 1
+ fp8_margin: 0
+ fp8_multi_head_attention: false
+ fp8_param: false
+ fp8_quantizer_factory: null
+ fp8_recipe: delayed
+ fp8_wgrad: true
+ fused_single_qkv_rope: false
+ gated_linear_unit: true
+ generation_config: null
+ glu_linear_offset: 0.0
+ grad_scale_func:
+ _call_: false
+ _target_: megatron.core.optimizer.optimizer.MegatronOptimizer.scale_loss
+ grad_sync_func:
+ _call_: false
+ _target_: megatron.core.distributed.distributed_data_parallel.DistributedDataParallel.start_grad_sync
+ gradient_accumulation_fusion: false
+ hetereogenous_dist_checkpoint: false
+ heterogeneous_block_specs: false
+ hf_model_id: ./models/Qwen-NVARC
+ hidden_dropout: 0.0
+ hidden_size: 2560
+ hierarchical_context_parallel_sizes: null
+ inference_rng_tracker: false
+ inference_sampling_seed: 42
+ init_method:
+ _args_: []
+ _partial_: true
+ _target_: torch.nn.init.normal_
+ mean: 0.0
+ std: 0.02
+ init_method_std: 0.02
+ init_model_with_meta_device: false
+ is_hybrid_model: false
+ kv_channels: 128
+ layernorm_epsilon: 1.0e-06
+ layernorm_zero_centered_gamma: false
+ linear_attention_freq: null
+ linear_attention_type: null
+ linear_conv_kernel_dim: null
+ linear_key_head_dim: null
+ linear_num_key_heads: null
+ linear_num_value_heads: null
+ linear_value_head_dim: null
+ log_max_attention_logit: false
+ make_vocab_size_divisible_by: 16
+ mamba_head_dim: 64
+ mamba_num_groups: 8
+ mamba_num_heads: null
+ mamba_state_dim: 128
+ masked_softmax_fusion: true
+ max_position_embeddings: 40960
+ memory_efficient_layer_norm: false
+ microbatch_group_size_per_vp_stage: 1
+ min_offloaded_tensor_size: 1048576
+ mlp_chunks_for_prefill: 1
+ moe_apply_probs_on_input: false
+ moe_aux_loss_coeff: 0.0
+ moe_deepep_num_sms: 20
+ moe_enable_deepep: false
+ moe_expert_capacity_factor: null
+ moe_extended_tp: false
+ moe_ffn_hidden_size: null
+ moe_flex_dispatcher_backend: deepep
+ moe_grouped_gemm: false
+ moe_hybridep_num_sms: 16
+ moe_input_jitter_eps: null
+ moe_layer_freq: 1
+ moe_layer_recompute: false
+ moe_pad_expert_input_to_capacity: false
+ moe_per_layer_logging: false
+ moe_permute_fusion: false
+ moe_router_bias_update_rate: 0.0
+ moe_router_dtype: fp64
+ moe_router_enable_expert_bias: false
+ moe_router_force_load_balancing: false
+ moe_router_fusion: false
+ moe_router_group_topk: null
+ moe_router_load_balancing_type: none
+ moe_router_num_groups: null
+ moe_router_padding_for_fp8: false
+ moe_router_padding_for_quantization: false
+ moe_router_pre_softmax: false
+ moe_router_score_function: softmax
+ moe_router_topk: 2
+ moe_router_topk_limited_devices: null
+ moe_router_topk_scaling_factor: null
+ moe_shared_expert_gate: false
+ moe_shared_expert_intermediate_size: null
+ moe_shared_expert_overlap: false
+ moe_token_dispatcher_type: allgather
+ moe_token_drop_policy: probs
+ moe_token_dropping: false
+ moe_use_legacy_grouped_gemm: false
+ moe_z_loss_coeff: null
+ mrope_section: null
+ mtp_enabled: false
+ mtp_loss_scaling_factor: null
+ mtp_num_layers: null
+ mtp_standalone: false
+ multi_latent_attention: false
+ no_rope_freq: null
+ no_sync_func:
+ _call_: false
+ _target_: megatron.core.distributed.distributed_data_parallel.DistributedDataParallel.no_sync
+ normalization: RMSNorm
+ num_attention_heads: 32
+ num_layers: 36
+ num_layers_at_end_in_bf16: 1
+ num_layers_at_start_in_bf16: 1
+ num_layers_in_first_pipeline_stage: null
+ num_layers_in_last_pipeline_stage: null
+ num_microbatches_with_partial_activation_checkpoints: null
+ num_moe_experts: null
+ num_query_groups: 8
+ offload_modules: null
+ output_layer_init_method:
+ _args_: []
+ _partial_: true
+ _target_: torch.nn.init.normal_
+ mean: 0.0
+ std: 0.0023570226039551587
+ overlap_moe_expert_parallel_comm: false
+ overlap_p2p_comm: false
+ overlap_p2p_comm_warmup_flush: false
+ parallel_output: true
+ param_sync_func: null
+ params_dtype:
+ _call_: false
+ _target_: torch.bfloat16
+ perform_initialization: true
+ persist_layer_norm: false
+ pipeline_dtype:
+ _call_: false
+ _target_: torch.bfloat16
+ pipeline_model_parallel_comm_backend: null
+ pipeline_model_parallel_layout: null
+ pipeline_model_parallel_size: 1
+ position_embedding_type: rope
353
+ qk_clip: false
354
+ qk_clip_alpha: 0.5
355
+ qk_clip_threshold: 100
356
+ qk_layernorm: true
357
+ quant_recipe: null
358
+ recompute_granularity: full
359
+ recompute_method: uniform
360
+ recompute_modules:
361
+ - core_attn
362
+ recompute_num_layers: 1
363
+ restore_modelopt_state: false
364
+ rotary_base: 5000000
365
+ rotary_interleaved: false
366
+ rotary_percent: 1.0
367
+ scatter_embedding_sequence_parallel: true
368
+ seq_len_interpolation_factor: null
369
+ seq_length: 262144
370
+ sequence_parallel: false
371
+ share_embeddings_and_output_weights: true
372
+ should_pad_vocab: false
373
+ softmax_scale: null
374
+ softmax_type: vanilla
375
+ symmetric_ar_type: null
376
+ tensor_model_parallel_size: 1
377
+ test_mode: false
378
+ timers: null
379
+ tp_comm_atomic_ag: false
380
+ tp_comm_atomic_rs: false
381
+ tp_comm_bootstrap_backend: nccl
382
+ tp_comm_bulk_dgrad: true
383
+ tp_comm_bulk_wgrad: true
384
+ tp_comm_overlap: false
385
+ tp_comm_overlap_ag: true
386
+ tp_comm_overlap_cfg: null
387
+ tp_comm_overlap_disable_fc1: false
388
+ tp_comm_overlap_disable_qkv: false
389
+ tp_comm_overlap_rs: true
390
+ tp_comm_overlap_rs_dgrad: false
391
+ tp_comm_split_ag: true
392
+ tp_comm_split_rs: true
393
+ tp_only_amax_red: false
394
+ transformer_impl: transformer_engine
395
+ transformer_layer_spec:
396
+ _call_: false
397
+ _target_: megatron.bridge.models.gpt_provider.default_layer_spec
398
+ use_cpu_initialization: false
399
+ use_fused_weighted_squared_relu: false
400
+ use_kitchen: false
401
+ use_mamba_mem_eff_path: true
402
+ use_ring_exchange_p2p: false
403
+ use_te_activation_func: false
404
+ use_te_rng_tracker: false
405
+ use_transformer_engine_full_layer_spec: false
406
+ use_transformer_engine_op_fuser: false
407
+ variable_seq_lengths: false
408
+ virtual_pipeline_model_parallel_size: null
409
+ vocab_size: 16
410
+ wgrad_deferral_limit: 0
411
+ window_attn_skip_freq: null
412
+ window_size: null
413
+ nvrx_straggler: null
414
+ optimizer:
415
+ _target_: megatron.bridge.training.config.OptimizerConfig
416
+ adam_beta1: 0.9
417
+ adam_beta2: 0.98
418
+ adam_eps: 1.0e-08
419
+ barrier_with_L1_time: false
420
+ bf16: true
421
+ clip_grad: 0.5
422
+ config_logger_dir: ''
423
+ decoupled_lr: null
424
+ decoupled_min_lr: null
425
+ decoupled_weight_decay: true
426
+ exp_avg_dtype:
427
+ _call_: false
428
+ _target_: torch.float32
429
+ exp_avg_sq_dtype:
430
+ _call_: false
431
+ _target_: torch.float32
432
+ fp16: false
433
+ fp8_recipe: null
434
+ hysteresis: 2
435
+ initial_loss_scale: 4294967296
436
+ log_num_zeros_in_grad: false
437
+ loss_scale: null
438
+ loss_scale_window: 1000
439
+ lr: 0.0001
440
+ main_grads_dtype:
441
+ _call_: false
442
+ _target_: torch.float32
443
+ main_params_dtype:
444
+ _call_: false
445
+ _target_: torch.float32
446
+ min_loss_scale: 1.0
447
+ min_lr: 1.0e-07
448
+ muon_extra_scale_factor: 1.0
449
+ muon_fp32_matmul_prec: medium
450
+ muon_momentum: 0.95
451
+ muon_num_ns_steps: 5
452
+ muon_scale_mode: spectral
453
+ muon_split_qkv: true
454
+ muon_tp_mode: blockwise
455
+ muon_use_nesterov: false
456
+ optimizer: adam
457
+ optimizer_cpu_offload: false
458
+ optimizer_offload_fraction: 0.0
459
+ overlap_cpu_optimizer_d2h_h2d: false
460
+ overlap_param_gather: false
461
+ overlap_param_gather_with_optimizer_step: false
462
+ params_dtype: bfloat16
463
+ pin_cpu_grads: true
464
+ pin_cpu_params: true
465
+ reuse_grad_buf_for_mxfp8_param_ag: false
466
+ sgd_momentum: 0.9
467
+ store_param_remainders: true
468
+ timers: null
469
+ use_distributed_optimizer: true
470
+ use_precision_aware_optimizer: false
471
+ use_torch_optimizer_for_cpu_offload: false
472
+ weight_decay: 0.1
473
+ peft: null
474
+ profiling:
475
+ _target_: megatron.bridge.training.config.ProfilingConfig
476
+ memory_snapshot_path: snapshot.pickle
477
+ nvtx_ranges: false
478
+ profile_ranks:
479
+ - 0
480
+ profile_step_end: 12
481
+ profile_step_start: 10
482
+ record_memory_history: false
483
+ record_shapes: false
484
+ use_nsys_profiler: false
485
+ use_pytorch_profiler: false
486
+ rerun_state_machine:
487
+ _target_: megatron.bridge.training.config.RerunStateMachineConfig
488
+ check_for_nan_in_loss: true
489
+ check_for_spiky_loss: false
490
+ error_injection_rate: 0
491
+ error_injection_type: transient_error
492
+ rerun_mode: disabled
493
+ rng:
494
+ _target_: megatron.bridge.training.config.RNGConfig
495
+ data_parallel_random_init: false
496
+ inference_rng_tracker: false
497
+ seed: 1234
498
+ te_rng_tracker: false
499
+ scheduler:
500
+ _target_: megatron.bridge.training.config.SchedulerConfig
501
+ end_weight_decay: 0.1
502
+ lr_decay_iters: 12716
503
+ lr_decay_samples: null
504
+ lr_decay_steps: 3255296
505
+ lr_decay_style: linear
506
+ lr_warmup_fraction: null
507
+ lr_warmup_init: 1.0e-06
508
+ lr_warmup_iters: 200
509
+ lr_warmup_samples: 0
510
+ lr_warmup_steps: 51200
511
+ lr_wsd_decay_iters: null
512
+ lr_wsd_decay_samples: null
513
+ lr_wsd_decay_style: exponential
514
+ no_weight_decay_cond_type: null
515
+ override_opt_param_scheduler: false
516
+ start_weight_decay: 0.1
517
+ use_checkpoint_opt_param_scheduler: false
518
+ wd_incr_steps: 1528832
519
+ weight_decay_incr_style: constant
520
+ wsd_decay_steps: null
521
+ straggler: null
522
+ tensor_inspect: null
523
+ tokenizer:
524
+ _target_: megatron.bridge.training.tokenizers.config.TokenizerConfig
525
+ hf_tokenizer_kwargs: {}
526
+ image_tag_type: null
527
+ merge_file: null
528
+ special_tokens: null
529
+ tiktoken_num_special_tokens: 1000
530
+ tiktoken_pattern: null
531
+ tiktoken_special_tokens: null
532
+ tokenizer_model: ./models/Qwen-NVARC
533
+ tokenizer_prompt_format: null
534
+ tokenizer_type: HuggingFaceTokenizer
535
+ vocab_extra_ids: 0
536
+ vocab_file: null
537
+ vocab_size: null
538
+ train:
539
+ _target_: megatron.bridge.training.config.TrainingConfig
540
+ check_weight_hash_across_dp_replicas_interval: null
541
+ decrease_batch_size_if_needed: false
542
+ empty_unused_memory_level: 0
543
+ eval_interval: 1000
544
+ eval_iters: 100
545
+ exit_duration_in_mins: null
546
+ exit_interval: null
547
+ exit_signal:
548
+ _args_:
549
+ - 15
550
+ _call_: true
551
+ _target_: signal.Signals
552
+ exit_signal_handler: false
553
+ exit_signal_handler_for_dataloader: false
554
+ global_batch_size: 256
555
+ iterations_to_skip: []
556
+ manual_gc: false
557
+ manual_gc_eval: true
558
+ manual_gc_interval: 0
559
+ micro_batch_size: 1
560
+ rampup_batch_size: null
561
+ skip_train: false
562
+ train_iters: 5972
563
+ train_samples: null
564
+ train_sync_interval: null
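
A quick cross-check on the scheduler and train blocks above: the sample-based fields appear to be the iteration-based fields scaled by global_batch_size (256). A minimal sanity check of that assumed relationship, in Python:

    # Assumed relationship: *_steps = *_iters * global_batch_size (256).
    global_batch_size = 256
    assert 12716 * global_batch_size == 3255296   # lr_decay_iters  -> lr_decay_steps
    assert 200   * global_batch_size == 51200     # lr_warmup_iters -> lr_warmup_steps
    assert 5972  * global_batch_size == 1528832   # train_iters     -> wd_incr_steps
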
step_5800/policy/weights/iter_0000000/train_state.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9077193280ecd59384d5ae94d97a7943638810502f2ce08a7617aeb20b3586d2
3
+ size 3461
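
train_state.pt above (like the other binary files in this commit) is stored as a Git LFS pointer: three key/value lines giving the spec version, the sha256 oid, and the byte size of the real object. A minimal pointer parser, as a sketch (parse_lfs_pointer is illustrative, not a tool used by this repo):

    import pathlib

    def parse_lfs_pointer(path):
        # Each pointer line is "key value", e.g. "oid sha256:<hex>" or "size <bytes>".
        fields = {}
        for line in pathlib.Path(path).read_text().splitlines():
            key, _, value = line.partition(" ")
            fields[key] = value
        return fields

    ptr = parse_lfs_pointer("step_5800/policy/weights/iter_0000000/train_state.pt")
    print(ptr["oid"], ptr["size"])   # sha256:9077...86d2 3461
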
step_5800/policy/weights/latest_checkpointed_iteration.txt ADDED
@@ -0,0 +1 @@
1
+ 0
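
The file holds only the iteration number (0), which maps onto the zero-padded iter_0000000 directory name used above. Assuming that naming scheme:

    it = int(open("step_5800/policy/weights/latest_checkpointed_iteration.txt").read())
    print(f"iter_{it:07d}")   # -> iter_0000000
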
step_5800/policy/weights/latest_train_state.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9077193280ecd59384d5ae94d97a7943638810502f2ce08a7617aeb20b3586d2
3
+ size 3461
step_5800/train_dataloader.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:458e115a25d1c97a8415a462c7ac872cd8c36b2dd1561561119e578a52acef61
3
+ size 7336
step_5800/training_info.json ADDED
@@ -0,0 +1 @@
1
+ {"epoch": 0, "step": 5800, "total_steps": 5800, "consumed_samples": 1484800, "total_valid_tokens": 1684485110.0, "val:val_loss": 0.14940811693668365}
step_5972/config.yaml ADDED
@@ -0,0 +1,207 @@
1
+ checkpointing:
2
+ checkpoint_dir: results/qwen3_4b_sft
3
+ checkpoint_must_save_by: null
4
+ enabled: true
5
+ higher_is_better: false
6
+ keep_top_k: 3
7
+ metric_name: val:val_loss
8
+ save_period: 200
9
+ cluster:
10
+ gpus_per_node: 2
11
+ num_nodes: 1
12
+ data:
13
+ num_workers: 4
14
+ shuffle: true
15
+ train_dataset_path:
16
+ - ./data/hones
17
+ val_dataset_path: ./data/arc2_evaluation6
18
+ logger:
19
+ gpu_monitoring:
20
+ collection_interval: 10
21
+ flush_interval: 10
22
+ log_dir: logs/exp_019
23
+ mlflow_enabled: false
24
+ monitor_gpus: false
25
+ swanlab_enabled: false
26
+ tensorboard_enabled: false
27
+ wandb:
28
+ name: qwen3_4b_sft
29
+ project: arc2
30
+ wandb_enabled: true
31
+ policy:
32
+ activation_checkpointing_enabled: false
33
+ attn_implementation: flash_attention_2
34
+ dtensor_cfg:
35
+ enabled: false
36
+ dynamic_batching:
37
+ enabled: false
38
+ fsdp_offload_enabled: false
39
+ make_sequence_length_divisible_by: 64
40
+ max_grad_norm: null
41
+ megatron_cfg:
42
+ activation_checkpointing: true
43
+ apply_rope_fusion: true
44
+ bias_activation_fusion: false
45
+ context_parallel_size: 2
46
+ distributed_data_parallel_config:
47
+ average_in_collective: true
48
+ data_parallel_sharding_strategy: optim_grads_params
49
+ grad_reduce_in_fp32: true
50
+ overlap_grad_reduce: true
51
+ overlap_param_gather: true
52
+ empty_unused_memory_level: 1
53
+ enabled: true
54
+ env_vars:
55
+ AWS_OFI_NCCL_VERSION: 1.14.0
56
+ BASH_ENV: /etc/bash.bashrc
57
+ CAL_VERSION: 0.4.4.50
58
+ CUBLASMP_VERSION: 0.4.0.789
59
+ CUBLAS_VERSION: 12.9.0.13
60
+ CUDA_ARCH_LIST: 7.5 8.0 8.6 9.0 10.0 12.0
61
+ CUDA_DRIVER_VERSION: 575.51.03
62
+ CUDA_VERSION: 12.9.0.043
63
+ CUDA_VISIBLE_DEVICES: 6,7
64
+ CUDNN_FRONTEND_VERSION: 1.11.0
65
+ CUDNN_VERSION: 9.10.1.4
66
+ CUFFT_VERSION: 11.4.0.6
67
+ CUFILE_VERSION: 1.14.0.30
68
+ CURAND_VERSION: 10.3.10.19
69
+ CUSOLVER_VERSION: 11.7.4.40
70
+ CUSPARSELT_VERSION: 0.7.1.0
71
+ CUSPARSE_VERSION: 12.5.9.5
72
+ DALI_BUILD: ''
73
+ DALI_URL_SUFFIX: '120'
74
+ DALI_VERSION: 1.49.0
75
+ EFA_VERSION: 1.38.1
76
+ ENV: /etc/shinit_v2
77
+ GDRCOPY_VERSION: 2.4.4
78
+ HOME: /root
79
+ HOSTNAME: e6ad2ac15863
80
+ HPCX_VERSION: '2.23'
81
+ KMP_DUPLICATE_LIB_OK: 'True'
82
+ KMP_INIT_AT_FORK: 'FALSE'
83
+ LC_CTYPE: C.UTF-8
84
+ LD_LIBRARY_PATH: /usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
85
+ LESSCLOSE: /usr/bin/lesspipe %s %s
86
+ LESSOPEN: '| /usr/bin/lesspipe %s'
87
+ LIBRARY_PATH: '/usr/local/cuda/lib64/stubs:'
88
+ LS_COLORS: 'rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=00:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.avif=01;35:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.webp=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:*~=00;90:*#=00;90:*.bak=00;90:*.crdownload=00;90:*.dpkg-dist=00;90:*.dpkg-new=00;90:*.dpkg-old=00;90:*.dpkg-tmp=00;90:*.old=00;90:*.orig=00;90:*.part=00;90:*.rej=00;90:*.rpmnew=00;90:*.rpmorig=00;90:*.rpmsave=00;90:*.swp=00;90:*.tmp=00;90:*.ucf-dist=00;90:*.ucf-new=00;90:*.ucf-old=00;90:'
89
+ MODEL_OPT_VERSION: 0.27.1
90
+ MOFED_VERSION: 5.4-rdmacore50.0
91
+ NCCL_NET_PLUGIN: aws-ofi
92
+ NCCL_TUNER_PLUGIN: aws-ofi
93
+ NCCL_VERSION: 2.26.5
94
+ NEMO_RL_COMMIT: 90ecaaccdf66fb92f814b2325db81a0105ed933a
95
+ NEMO_RL_VENV_DIR: /opt/ray_venvs
96
+ NPP_VERSION: 12.4.0.27
97
+ NRL_CONTAINER: '1'
98
+ NRL_MEGATRON_CHECKPOINT_DIR: ./results/megatron
99
+ NSIGHT_COMPUTE_VERSION: 2025.2.0.11
100
+ NSIGHT_SYSTEMS_VERSION: 2025.3.1.90
101
+ NVIDIA_BUILD_ID: '244212578'
102
+ NVIDIA_BUILD_REF: 6ba29f15920b12095387b8c82cea05ac3d6d9732
103
+ NVIDIA_DRIVER_CAPABILITIES: compute,utility,video
104
+ NVIDIA_PRODUCT_NAME: CUDA
105
+ NVIDIA_REQUIRE_CUDA: cuda>=9.0
106
+ NVIDIA_REQUIRE_JETPACK_HOST_MOUNTS: ''
107
+ NVIDIA_VISIBLE_DEVICES: all
108
+ NVJITLINK_VERSION: 12.9.41
109
+ NVJPEG_VERSION: 12.4.0.16
110
+ NVSHMEM_VERSION: 3.2.5
111
+ OLDPWD: /workspace
112
+ OMPI_MCA_coll_hcoll_enable: '0'
113
+ OPAL_PREFIX: /opt/hpcx/ompi
114
+ OPENMPI_VERSION: 4.1.7
115
+ OPENUCX_VERSION: 1.19.0
116
+ PATH: /opt/nemo_rl_venv/bin:/opt/nemo_rl_venv/bin:/opt/nemo_rl_venv/bin:/root/.local/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/mpi/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/ucx/bin:/opt/amazon/efa/bin
117
+ POLYGRAPHY_VERSION: 0.49.20
118
+ PWD: /workspace/ARChitects
119
+ PYTHONBREAKPOINT: ray.util.rpdb._driver_set_trace
120
+ PYTHONPATH: '/workspace/NVARC/ARChitects/nemo-rl:'
121
+ PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True
122
+ RAY_CLIENT_MODE: '0'
123
+ RAY_ENABLE_UV_RUN_RUNTIME_ENV: '0'
124
+ RAY_USAGE_STATS_ENABLED: '0'
125
+ RDMACORE_VERSION: '50.0'
126
+ SHELL: /bin/bash
127
+ SHLVL: '2'
128
+ SWANLAB_API_HOST: https://api.swanlab.cn/api
129
+ SWANLAB_RUNTIME: user
130
+ SWANLAB_WEB_HOST: https://swanlab.cn
131
+ TERM: xterm
132
+ TORCH_CUDA_ARCH_LIST: '9.0'
133
+ TRANSFORMER_ENGINE_VERSION: '2.3'
134
+ TRTOSS_VERSION: ''
135
+ TRT_VERSION: 10.10.0.31
136
+ UV: /root/.local/bin/uv
137
+ UV_LINK_MODE: copy
138
+ UV_PROJECT_ENVIRONMENT: /opt/nemo_rl_venv
139
+ UV_RUN_RECURSION_DEPTH: '1'
140
+ VIRTUAL_ENV: /opt/nemo_rl_venv
141
+ VIRTUAL_ENV_PROMPT: nemo-rl
142
+ WANDB_SERVICE: 3-239782-unix-/tmp/wandb-239782-251940-2530276073/socket
143
+ _: /root/.local/bin/uv
144
+ _CUDA_COMPAT_PATH: /usr/local/cuda/compat
145
+ _CUDA_COMPAT_STATUS: System has unsupported display driver / cuda driver combination
146
+ (CUDA_ERROR_SYSTEM_DRIVER_MISMATCH) cuInit()=803
147
+ _MLFLOW_TELEMETRY_SESSION_ID: 8ac94e5a44a94a62a2cf3302ff4fa0c9
148
+ expert_model_parallel_size: 1
149
+ expert_tensor_parallel_size: 1
150
+ freeze_moe_router: true
151
+ moe_permute_fusion: false
152
+ moe_router_bias_update_rate: 0.0
153
+ moe_router_dtype: fp64
154
+ moe_router_load_balancing_type: none
155
+ num_layers_in_first_pipeline_stage: null
156
+ num_layers_in_last_pipeline_stage: null
157
+ optimizer:
158
+ adam_beta1: 0.9
159
+ adam_beta2: 0.98
160
+ adam_eps: 1.0e-08
161
+ bf16: true
162
+ clip_grad: 0.5
163
+ fp16: false
164
+ lr: 0.0001
165
+ min_lr: 1.0e-07
166
+ optimizer: adam
167
+ optimizer_cpu_offload: false
168
+ optimizer_offload_fraction: 0.0
169
+ params_dtype: bfloat16
170
+ sgd_momentum: 0.9
171
+ use_distributed_optimizer: true
172
+ use_precision_aware_optimizer: false
173
+ weight_decay: 0.1
174
+ pipeline_dtype: bfloat16
175
+ pipeline_model_parallel_size: 1
176
+ scheduler:
177
+ end_weight_decay: 0.1
178
+ lr_decay_iters: 12716
179
+ lr_decay_style: linear
180
+ lr_warmup_init: 1.0e-06
181
+ lr_warmup_iters: 200
182
+ start_weight_decay: 0.1
183
+ weight_decay_incr_style: constant
184
+ sequence_parallel: false
185
+ tensor_model_parallel_size: 1
186
+ train_iters: 5972
187
+ model_name: ./models/Qwen-NVARC
188
+ offload_optimizer_for_logprob: false
189
+ precision: bfloat16
190
+ sequence_packing:
191
+ algorithm: modified_first_fit_decreasing
192
+ enabled: true
193
+ sequence_length_round: 64
194
+ train_mb_tokens: 128000
195
+ tokenizer:
196
+ name: ./models/Qwen-NVARC
197
+ train_global_batch_size: 256
198
+ train_micro_batch_size: 1
199
+ sft:
200
+ max_num_epochs: 1
201
+ max_num_steps: 6400
202
+ seed: 24
203
+ val_at_start: true
204
+ val_batches: 200
205
+ val_global_batch_size: 256
206
+ val_micro_batch_size: 1
207
+ val_period: 200
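
The checkpointing block above describes a save-and-prune policy: a checkpoint is written every save_period (200) steps, and only the keep_top_k (3) best by metric_name (val:val_loss, where lower is better since higher_is_better is false) are retained. A minimal sketch of that ranking, not necessarily NeMo-RL's actual pruning code, with illustrative metric values for the earlier steps:

    def checkpoints_to_keep(ckpts, keep_top_k=3, higher_is_better=False):
        # ckpts: (step, metric) pairs; sort best-first and keep the top k.
        ranked = sorted(ckpts, key=lambda c: c[1], reverse=higher_is_better)
        return ranked[:keep_top_k]

    # 0.1494 at step 5800 comes from training_info.json; the other values are placeholders.
    print(checkpoints_to_keep([(5400, 0.1510), (5600, 0.1497), (5800, 0.1494), (5972, 0.1495)]))
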
step_5972/policy/weights/iter_0000000/.metadata ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63b2c2c7a6c21b171a30b50ae7dc76c9744532ea6b3c093434c81c412ad99548
3
+ size 329201
step_5972/policy/weights/iter_0000000/__0_1.distcp ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2f6811136b6e0fbc6c36bf350aee4b9e42c450265f2475895b613fc98ff26e7
3
+ size 12718313784
step_5972/policy/weights/iter_0000000/__1_1.distcp ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22f80411837876981796f026820ac72c19b06b79a478df2c332f912075adc25f
3
+ size 12717860926
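
Once git lfs has materialized these ~12.7 GB .distcp shards, the sha256 and size recorded in each pointer can be re-verified locally. A streaming sketch using only the standard library:

    import hashlib, os

    def sha256_of(path, chunk=1 << 20):
        h = hashlib.sha256()
        with open(path, "rb") as f:
            while block := f.read(chunk):
                h.update(block)
        return h.hexdigest()

    # Compare against the pointer's "oid sha256:..." and "size ..." lines.
    path = "step_5972/policy/weights/iter_0000000/__0_1.distcp"
    print(sha256_of(path), os.path.getsize(path))
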
step_5972/policy/weights/iter_0000000/common.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:293a19ff82e664ad14eeea37b1cdcfc976171b534d5ec99eff7d86a5dfade2af
3
+ size 1767
step_5972/policy/weights/iter_0000000/metadata.json ADDED
@@ -0,0 +1 @@
1
+ {"sharded_backend": "torch_dist", "sharded_backend_version": 1, "common_backend": "torch", "common_backend_version": 1}
step_5972/policy/weights/iter_0000000/modelopt_run_config.yaml ADDED
@@ -0,0 +1,203 @@
1
+ activation_func: <function silu at 0x7d0251c6b420>
2
+ activation_func_clamp_value: None
3
+ add_bias_linear: false
4
+ add_qkv_bias: false
5
+ apply_query_key_layer_scaling: false
6
+ apply_residual_connection_post_layernorm: false
7
+ apply_rope_fusion: true
8
+ attention_backend: AttnBackend.auto
9
+ attention_dropout: '0.0'
10
+ attention_output_gate: false
11
+ attention_softmax_in_fp32: false
12
+ autocast_dtype: torch.bfloat16
13
+ barrier_with_L1_time: true
14
+ bf16: true
15
+ bias_activation_fusion: false
16
+ bias_dropout_fusion: false
17
+ calculate_per_token_loss: true
18
+ clone_scatter_output_in_embedding: true
19
+ config_logger_dir: ''
20
+ cross_entropy_fusion_impl: native
21
+ cross_entropy_loss_fusion: true
22
+ defer_embedding_wgrad_compute: false
23
+ delay_wgrad_compute: false
24
+ deterministic_mode: false
25
+ disable_bf16_reduced_precision_matmul: false
26
+ disable_parameter_transpose_cache: false
27
+ distribute_saved_activations: None
28
+ enable_autocast: false
29
+ fallback_to_eager_attn: false
30
+ ffn_hidden_size: 9728
31
+ finalize_model_grads_func: functools.partial(<function finalize_model_grads at 0x7cfab95b9e40>,
32
+ pg_collection=None)
33
+ fine_grained_activation_offloading: false
34
+ first_last_layers_bf16: false
35
+ flash_decode: false
36
+ fp16: false
37
+ fp16_lm_cross_entropy: false
38
+ fp32_residual_connection: false
39
+ fused_single_qkv_rope: false
40
+ gated_linear_unit: true
41
+ generation_config: None
42
+ glu_linear_offset: '0.0'
43
+ grad_scale_func: <bound method MegatronOptimizer.scale_loss of <megatron.core.optimizer.optimizer.ChainedOptimizer
44
+ object at 0x7cf9d413cd70>>
45
+ grad_sync_func: "<bound method DistributedDataParallel.start_grad_sync of DistributedDataParallel(\n\
46
+ \ (module): CustomFloat16Module(\n (module): GPTModel(\n (embedding): LanguageModelEmbedding(\n\
47
+ \ (word_embeddings): VocabParallelEmbedding()\n (embedding_dropout):\
48
+ \ Dropout(p=0.0, inplace=False)\n )\n (rotary_pos_emb): RotaryEmbedding()\n\
49
+ \ (decoder): TransformerBlock(\n (layers): ModuleList(\n (0-35):\
50
+ \ 36 x TransformerLayer(\n (input_layernorm): IdentityOp()\n \
51
+ \ (self_attention): SelfAttention(\n (core_attention): TEDotProductAttention(\n\
52
+ \ (flash_attention): FlashAttention()\n (fused_attention):\
53
+ \ FusedAttention()\n (unfused_attention): UnfusedDotProductAttention(\n\
54
+ \ (scale_mask_softmax): FusedScaleMaskSoftmax()\n \
55
+ \ (attention_dropout): Dropout(p=0.0, inplace=False)\n )\n \
56
+ \ )\n (linear_proj): TERowParallelLinear(in_features=4096,\
57
+ \ out_features=2560, bias=False, TP=1)\n (linear_qkv): TELayerNormColumnParallelLinear(in_features=2560,\
58
+ \ out_features=6144, bias=False, TP=1)\n (q_layernorm): RMSNorm()\n\
59
+ \ (k_layernorm): RMSNorm()\n )\n (pre_cross_attn_layernorm):\
60
+ \ IdentityOp()\n (cross_attention): IdentityOp()\n (cross_attn_bda):\
61
+ \ IdentityFuncOp()\n (pre_mlp_layernorm): IdentityOp()\n (mlp):\
62
+ \ MLP(\n (linear_fc1): TELayerNormColumnParallelLinear(in_features=2560,\
63
+ \ out_features=19456, bias=False, TP=1)\n (linear_fc2): TERowParallelLinear(in_features=9728,\
64
+ \ out_features=2560, bias=False, TP=1)\n )\n )\n )\n\
65
+ \ (final_layernorm): RMSNorm()\n )\n (output_layer): ColumnParallelLinear(in_features=2560,\
66
+ \ out_features=16, bias=False, TP=1)\n )\n )\n)>"
67
+ gradient_accumulation_fusion: false
68
+ hetereogenous_dist_checkpoint: false
69
+ heterogeneous_block_specs: false
70
+ hf_model_id: ./models/Qwen-NVARC
71
+ hidden_dropout: '0.0'
72
+ hidden_size: 2560
73
+ is_hybrid_model: false
74
+ kv_channels: 128
75
+ layernorm_epsilon: 1e-06
76
+ layernorm_zero_centered_gamma: false
77
+ linear_attention_freq: None
78
+ linear_attention_type: None
79
+ linear_conv_kernel_dim: None
80
+ linear_key_head_dim: None
81
+ linear_num_key_heads: None
82
+ linear_num_value_heads: None
83
+ linear_value_head_dim: None
84
+ log_max_attention_logit: false
85
+ make_vocab_size_divisible_by: 16
86
+ mamba_head_dim: 64
87
+ mamba_num_groups: 8
88
+ mamba_num_heads: None
89
+ mamba_state_dim: 128
90
+ masked_softmax_fusion: true
91
+ max_position_embeddings: 40960
92
+ memory_efficient_layer_norm: false
93
+ min_offloaded_tensor_size: 1048576
94
+ mlp_chunks_for_prefill: 1
95
+ moe_apply_probs_on_input: false
96
+ moe_aux_loss_coeff: '0.0'
97
+ moe_deepep_num_sms: 20
98
+ moe_enable_deepep: false
99
+ moe_expert_capacity_factor: None
100
+ moe_extended_tp: false
101
+ moe_ffn_hidden_size: None
102
+ moe_flex_dispatcher_backend: deepep
103
+ moe_grouped_gemm: false
104
+ moe_hybridep_num_sms: 16
105
+ moe_input_jitter_eps: None
106
+ moe_layer_freq: 1
107
+ moe_pad_expert_input_to_capacity: false
108
+ moe_per_layer_logging: false
109
+ moe_permute_fusion: false
110
+ moe_router_bias_update_rate: '0.0'
111
+ moe_router_dtype: fp64
112
+ moe_router_enable_expert_bias: false
113
+ moe_router_force_load_balancing: false
114
+ moe_router_fusion: false
115
+ moe_router_group_topk: None
116
+ moe_router_load_balancing_type: none
117
+ moe_router_num_groups: None
118
+ moe_router_padding_for_quantization: false
119
+ moe_router_pre_softmax: false
120
+ moe_router_score_function: softmax
121
+ moe_router_topk: 2
122
+ moe_router_topk_limited_devices: None
123
+ moe_router_topk_scaling_factor: None
124
+ moe_shared_expert_gate: false
125
+ moe_shared_expert_intermediate_size: None
126
+ moe_shared_expert_overlap: false
127
+ moe_token_dispatcher_type: allgather
128
+ moe_token_drop_policy: probs
129
+ moe_token_dropping: false
130
+ moe_use_legacy_grouped_gemm: false
131
+ moe_z_loss_coeff: None
132
+ mrope_section: None
133
+ multi_latent_attention: false
134
+ no_rope_freq: None
135
+ no_sync_func: "<bound method DistributedDataParallel.no_sync of DistributedDataParallel(\n\
136
+ \ (module): CustomFloat16Module(\n (module): GPTModel(\n (embedding): LanguageModelEmbedding(\n\
137
+ \ (word_embeddings): VocabParallelEmbedding()\n (embedding_dropout):\
138
+ \ Dropout(p=0.0, inplace=False)\n )\n (rotary_pos_emb): RotaryEmbedding()\n\
139
+ \ (decoder): TransformerBlock(\n (layers): ModuleList(\n (0-35):\
140
+ \ 36 x TransformerLayer(\n (input_layernorm): IdentityOp()\n \
141
+ \ (self_attention): SelfAttention(\n (core_attention): TEDotProductAttention(\n\
142
+ \ (flash_attention): FlashAttention()\n (fused_attention):\
143
+ \ FusedAttention()\n (unfused_attention): UnfusedDotProductAttention(\n\
144
+ \ (scale_mask_softmax): FusedScaleMaskSoftmax()\n \
145
+ \ (attention_dropout): Dropout(p=0.0, inplace=False)\n )\n \
146
+ \ )\n (linear_proj): TERowParallelLinear(in_features=4096,\
147
+ \ out_features=2560, bias=False, TP=1)\n (linear_qkv): TELayerNormColumnParallelLinear(in_features=2560,\
148
+ \ out_features=6144, bias=False, TP=1)\n (q_layernorm): RMSNorm()\n\
149
+ \ (k_layernorm): RMSNorm()\n )\n (pre_cross_attn_layernorm):\
150
+ \ IdentityOp()\n (cross_attention): IdentityOp()\n (cross_attn_bda):\
151
+ \ IdentityFuncOp()\n (pre_mlp_layernorm): IdentityOp()\n (mlp):\
152
+ \ MLP(\n (linear_fc1): TELayerNormColumnParallelLinear(in_features=2560,\
153
+ \ out_features=19456, bias=False, TP=1)\n (linear_fc2): TERowParallelLinear(in_features=9728,\
154
+ \ out_features=2560, bias=False, TP=1)\n )\n )\n )\n\
155
+ \ (final_layernorm): RMSNorm()\n )\n (output_layer): ColumnParallelLinear(in_features=2560,\
156
+ \ out_features=16, bias=False, TP=1)\n )\n )\n)>"
157
+ normalization: RMSNorm
158
+ num_attention_heads: 32
159
+ num_layers: 36
160
+ num_layers_at_end_in_bf16: 1
161
+ num_layers_at_start_in_bf16: 1
162
+ num_moe_experts: None
163
+ num_query_groups: 8
164
+ nvidia_modelopt_version: 0.39.0
165
+ offload_modules: None
166
+ param_sync_func: None
167
+ params_dtype: torch.bfloat16
168
+ perform_initialization: true
169
+ persist_layer_norm: false
170
+ position_embedding_type: rope
171
+ qk_clip: false
172
+ qk_clip_alpha: '0.5'
173
+ qk_clip_threshold: 100
174
+ qk_layernorm: true
175
+ quant_recipe: None
176
+ restore_modelopt_state: false
177
+ rotary_base: 5000000
178
+ rotary_interleaved: false
179
+ rotary_percent: '1.0'
180
+ seq_len_interpolation_factor: None
181
+ seq_length: 262144
182
+ share_embeddings_and_output_weights: true
183
+ should_pad_vocab: false
184
+ softmax_scale: None
185
+ softmax_type: vanilla
186
+ symmetric_ar_type: None
187
+ test_mode: false
188
+ timers: None
189
+ transformer_impl: transformer_engine
190
+ transformer_layer_spec: <function default_layer_spec at 0x7cfa556ad440>
191
+ use_fused_weighted_squared_relu: false
192
+ use_kitchen: false
193
+ use_mamba_mem_eff_path: true
194
+ use_ring_exchange_p2p: false
195
+ use_te_activation_func: false
196
+ use_te_rng_tracker: false
197
+ use_transformer_engine_full_layer_spec: false
198
+ use_transformer_engine_op_fuser: false
199
+ variable_seq_lengths: false
200
+ vocab_size: 16
201
+ wgrad_deferral_limit: 0
202
+ window_attn_skip_freq: None
203
+ window_size: None
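
Note that modelopt_run_config.yaml above is a stringified record rather than a loadable config: numeric fields show up as quoted strings ('0.0'), and callables and bound methods as Python reprs (<function silu at 0x...>), which is what str()-ing each value before dumping produces. A tiny illustration of the effect, assuming PyYAML:

    import yaml
    from torch.nn.functional import silu

    record = {"activation_func": silu, "attention_dropout": 0.0}
    print(yaml.safe_dump({k: str(v) for k, v in record.items()}))
    # activation_func: <function silu at 0x...>
    # attention_dropout: '0.0'
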
step_5972/policy/weights/iter_0000000/run_config.yaml ADDED
@@ -0,0 +1,564 @@
1
+ _target_: megatron.bridge.training.config.ConfigContainer
2
+ checkpoint:
3
+ _target_: megatron.bridge.training.config.CheckpointConfig
4
+ async_save: false
5
+ ckpt_assume_constant_structure: false
6
+ ckpt_convert_format: null
7
+ ckpt_convert_save: null
8
+ ckpt_format: torch_dist
9
+ ckpt_step: null
10
+ dist_ckpt_optim_fully_reshardable: false
11
+ dist_ckpt_save_pre_mcore_014: false
12
+ dist_ckpt_strictness: assume_ok_unexpected
13
+ distrib_optim_fully_reshardable_mem_efficient: false
14
+ exit_on_missing_checkpoint: false
15
+ finetune: true
16
+ fully_parallel_load: true
17
+ fully_parallel_save: true
18
+ load: null
19
+ load_main_params_from_ckpt: false
20
+ load_optim: true
21
+ load_rng: false
22
+ most_recent_k: -1
23
+ non_persistent_ckpt_type: null
24
+ non_persistent_global_ckpt_dir: null
25
+ non_persistent_local_ckpt_algo: fully_parallel
26
+ non_persistent_local_ckpt_dir: null
27
+ non_persistent_save_interval: null
28
+ pretrained_checkpoint: ./results/megatron/model_._models_Qwen-NVARC
29
+ replication: false
30
+ replication_factor: 2
31
+ replication_jump: null
32
+ save: /workspace/ARChitects/results/qwen3_4b_sft/tmp_step_5972/policy/weights
33
+ save_interval: 100
34
+ save_optim: true
35
+ save_rng: true
36
+ save_tokenizer_assets: true
37
+ strict_fsdp_dtensor_load: false
38
+ use_checkpoint_args: false
39
+ use_persistent_ckpt_worker: true
40
+ comm_overlap: null
41
+ dataset: null
42
+ ddp:
43
+ _target_: megatron.bridge.training.config.DistributedDataParallelConfig
44
+ align_param_gather: false
45
+ average_in_collective: false
46
+ bucket_size: 40000000
47
+ check_for_large_grads: false
48
+ check_for_nan_in_grad: true
49
+ data_parallel_sharding_strategy: optim_grads_params
50
+ delay_wgrad_compute: false
51
+ disable_symmetric_registration: false
52
+ fp8_param_gather: false
53
+ fsdp_double_buffer: false
54
+ grad_reduce_in_fp32: true
55
+ gradient_reduce_div_fusion: true
56
+ keep_fp8_transpose_cache: false
57
+ nccl_ub: false
58
+ num_distributed_optimizer_instances: 1
59
+ outer_dp_sharding_strategy: no_shard
60
+ overlap_grad_reduce: true
61
+ overlap_param_gather: true
62
+ pad_buckets_for_high_nccl_busbw: false
63
+ preserve_fp32_weights: true
64
+ reduce_scatter_with_fp32_accumulation: false
65
+ reuse_grad_buf_for_mxfp8_param_ag: false
66
+ suggested_communication_unit_size: null
67
+ use_custom_fsdp: false
68
+ use_distributed_optimizer: true
69
+ use_megatron_fsdp: false
70
+ dist:
71
+ _target_: megatron.bridge.training.config.DistributedInitConfig
72
+ align_grad_reduce: true
73
+ disable_jit_fuser: false
74
+ distributed_backend: nccl
75
+ distributed_timeout_minutes: 10
76
+ distributed_timeout_seconds_after_init: null
77
+ enable_megatron_core_experimental: false
78
+ external_gpu_device_mapping: true
79
+ high_priority_stream_groups: null
80
+ lazy_init: false
81
+ local_rank: 0
82
+ nccl_communicator_config_path: null
83
+ sharp_enabled_group: null
84
+ use_gloo_process_groups: true
85
+ use_megatron_fsdp: false
86
+ use_sharp: false
87
+ use_torch_fsdp2: false
88
+ use_tp_pp_dp_mapping: false
89
+ ft: null
90
+ inprocess_restart: null
91
+ logger:
92
+ _target_: megatron.bridge.training.config.LoggerConfig
93
+ filter_warnings: true
94
+ log_energy: false
95
+ log_interval: 100
96
+ log_l2_norm_grad_to_tensorboard: false
97
+ log_loss_scale_to_tensorboard: true
98
+ log_memory_to_tensorboard: false
99
+ log_params_norm: false
100
+ log_progress: false
101
+ log_runtime_to_tensorboard: false
102
+ log_throughput: false
103
+ log_throughput_to_tensorboard: false
104
+ log_timers_to_tensorboard: false
105
+ log_validation_ppl_to_tensorboard: false
106
+ log_world_size_to_tensorboard: false
107
+ logging_level: 0
108
+ memory_keys: null
109
+ modules_to_filter: null
110
+ runtime_time_unit: hours
111
+ save_config_filepath: null
112
+ set_level_for_all_loggers: false
113
+ tensorboard_dir: null
114
+ tensorboard_log_interval: 1
115
+ tensorboard_queue_size: 1000
116
+ throughput_window_size: 100
117
+ timing_log_level: 0
118
+ timing_log_option: minmax
119
+ wandb_entity: null
120
+ wandb_exp_name: null
121
+ wandb_project: null
122
+ wandb_save_dir: null
123
+ mixed_precision: null
124
+ model:
125
+ _target_: megatron.bridge.models.qwen.qwen_provider.Qwen3ModelProvider
126
+ account_for_embedding_in_pipeline_split: false
127
+ account_for_loss_in_pipeline_split: false
128
+ activation_func:
129
+ _call_: false
130
+ _target_: torch.nn.functional.silu
131
+ activation_func_clamp_value: null
132
+ activation_func_fp8_input_store: false
133
+ add_bias_linear: false
134
+ add_qkv_bias: false
135
+ apply_query_key_layer_scaling: false
136
+ apply_residual_connection_post_layernorm: false
137
+ apply_rope_fusion: true
138
+ async_tensor_model_parallel_allreduce: false
139
+ attention_backend:
140
+ _args_:
141
+ - 5
142
+ _call_: true
143
+ _target_: megatron.core.transformer.enums.AttnBackend
144
+ attention_dropout: 0.0
145
+ attention_output_gate: false
146
+ attention_softmax_in_fp32: false
147
+ autocast_dtype:
148
+ _call_: false
149
+ _target_: torch.bfloat16
150
+ barrier_with_L1_time: true
151
+ batch_p2p_comm: true
152
+ batch_p2p_sync: true
153
+ bf16: true
154
+ bias_activation_fusion: false
155
+ bias_dropout_fusion: false
156
+ calculate_per_token_loss: true
157
+ clone_scatter_output_in_embedding: true
158
+ config_logger_dir: ''
159
+ context_parallel_size: 2
160
+ cp_comm_type: null
161
+ cpu_offloading: false
162
+ cpu_offloading_activations: true
163
+ cpu_offloading_double_buffering: false
164
+ cpu_offloading_num_layers: 0
165
+ cpu_offloading_weights: false
166
+ cross_entropy_fusion_impl: native
167
+ cross_entropy_loss_fusion: true
168
+ cuda_graph_impl: none
169
+ cuda_graph_retain_backward_graph: false
170
+ cuda_graph_scope: []
171
+ cuda_graph_use_single_mempool: false
172
+ cuda_graph_warmup_steps: 3
173
+ deallocate_pipeline_outputs: true
174
+ defer_embedding_wgrad_compute: false
175
+ delay_wgrad_compute: false
176
+ deterministic_mode: false
177
+ disable_bf16_reduced_precision_matmul: false
178
+ disable_parameter_transpose_cache: false
179
+ distribute_saved_activations: null
180
+ embedding_init_method:
181
+ _args_: []
182
+ _partial_: true
183
+ _target_: torch.nn.init.normal_
184
+ mean: 0.0
185
+ std: 0.02
186
+ embedding_init_method_std: 0.02
187
+ enable_autocast: false
188
+ enable_cuda_graph: false
189
+ expert_model_parallel_size: 1
190
+ expert_tensor_parallel_size: 1
191
+ external_cuda_graph: false
192
+ fallback_to_eager_attn: false
193
+ ffn_hidden_size: 9728
194
+ finalize_model_grads_func:
195
+ _args_: []
196
+ _partial_: true
197
+ _target_: megatron.core.distributed.finalize_model_grads.finalize_model_grads
198
+ pg_collection: null
199
+ fine_grained_activation_offloading: false
200
+ first_last_layers_bf16: false
201
+ flash_decode: false
202
+ fp16: false
203
+ fp16_lm_cross_entropy: false
204
+ fp32_residual_connection: false
205
+ fp4: null
206
+ fp4_param: false
207
+ fp4_quantizer_factory: null
208
+ fp4_recipe: nvfp4
209
+ fp8: null
210
+ fp8_amax_compute_algo: most_recent
211
+ fp8_amax_history_len: 1
212
+ fp8_dot_product_attention: false
213
+ fp8_interval: 1
214
+ fp8_margin: 0
215
+ fp8_multi_head_attention: false
216
+ fp8_param: false
217
+ fp8_quantizer_factory: null
218
+ fp8_recipe: delayed
219
+ fp8_wgrad: true
220
+ fused_single_qkv_rope: false
221
+ gated_linear_unit: true
222
+ generation_config: null
223
+ glu_linear_offset: 0.0
224
+ grad_scale_func:
225
+ _call_: false
226
+ _target_: megatron.core.optimizer.optimizer.MegatronOptimizer.scale_loss
227
+ grad_sync_func:
228
+ _call_: false
229
+ _target_: megatron.core.distributed.distributed_data_parallel.DistributedDataParallel.start_grad_sync
230
+ gradient_accumulation_fusion: false
231
+ hetereogenous_dist_checkpoint: false
232
+ heterogeneous_block_specs: false
233
+ hf_model_id: ./models/Qwen-NVARC
234
+ hidden_dropout: 0.0
235
+ hidden_size: 2560
236
+ hierarchical_context_parallel_sizes: null
237
+ inference_rng_tracker: false
238
+ inference_sampling_seed: 42
239
+ init_method:
240
+ _args_: []
241
+ _partial_: true
242
+ _target_: torch.nn.init.normal_
243
+ mean: 0.0
244
+ std: 0.02
245
+ init_method_std: 0.02
246
+ init_model_with_meta_device: false
247
+ is_hybrid_model: false
248
+ kv_channels: 128
249
+ layernorm_epsilon: 1.0e-06
250
+ layernorm_zero_centered_gamma: false
251
+ linear_attention_freq: null
252
+ linear_attention_type: null
253
+ linear_conv_kernel_dim: null
254
+ linear_key_head_dim: null
255
+ linear_num_key_heads: null
256
+ linear_num_value_heads: null
257
+ linear_value_head_dim: null
258
+ log_max_attention_logit: false
259
+ make_vocab_size_divisible_by: 16
260
+ mamba_head_dim: 64
261
+ mamba_num_groups: 8
262
+ mamba_num_heads: null
263
+ mamba_state_dim: 128
264
+ masked_softmax_fusion: true
265
+ max_position_embeddings: 40960
266
+ memory_efficient_layer_norm: false
267
+ microbatch_group_size_per_vp_stage: 1
268
+ min_offloaded_tensor_size: 1048576
269
+ mlp_chunks_for_prefill: 1
270
+ moe_apply_probs_on_input: false
271
+ moe_aux_loss_coeff: 0.0
272
+ moe_deepep_num_sms: 20
273
+ moe_enable_deepep: false
274
+ moe_expert_capacity_factor: null
275
+ moe_extended_tp: false
276
+ moe_ffn_hidden_size: null
277
+ moe_flex_dispatcher_backend: deepep
278
+ moe_grouped_gemm: false
279
+ moe_hybridep_num_sms: 16
280
+ moe_input_jitter_eps: null
281
+ moe_layer_freq: 1
282
+ moe_layer_recompute: false
283
+ moe_pad_expert_input_to_capacity: false
284
+ moe_per_layer_logging: false
285
+ moe_permute_fusion: false
286
+ moe_router_bias_update_rate: 0.0
287
+ moe_router_dtype: fp64
288
+ moe_router_enable_expert_bias: false
289
+ moe_router_force_load_balancing: false
290
+ moe_router_fusion: false
291
+ moe_router_group_topk: null
292
+ moe_router_load_balancing_type: none
293
+ moe_router_num_groups: null
294
+ moe_router_padding_for_fp8: false
295
+ moe_router_padding_for_quantization: false
296
+ moe_router_pre_softmax: false
297
+ moe_router_score_function: softmax
298
+ moe_router_topk: 2
299
+ moe_router_topk_limited_devices: null
300
+ moe_router_topk_scaling_factor: null
301
+ moe_shared_expert_gate: false
302
+ moe_shared_expert_intermediate_size: null
303
+ moe_shared_expert_overlap: false
304
+ moe_token_dispatcher_type: allgather
305
+ moe_token_drop_policy: probs
306
+ moe_token_dropping: false
307
+ moe_use_legacy_grouped_gemm: false
308
+ moe_z_loss_coeff: null
309
+ mrope_section: null
310
+ mtp_enabled: false
311
+ mtp_loss_scaling_factor: null
312
+ mtp_num_layers: null
313
+ mtp_standalone: false
314
+ multi_latent_attention: false
315
+ no_rope_freq: null
316
+ no_sync_func:
317
+ _call_: false
318
+ _target_: megatron.core.distributed.distributed_data_parallel.DistributedDataParallel.no_sync
319
+ normalization: RMSNorm
320
+ num_attention_heads: 32
321
+ num_layers: 36
322
+ num_layers_at_end_in_bf16: 1
323
+ num_layers_at_start_in_bf16: 1
324
+ num_layers_in_first_pipeline_stage: null
325
+ num_layers_in_last_pipeline_stage: null
326
+ num_microbatches_with_partial_activation_checkpoints: null
327
+ num_moe_experts: null
328
+ num_query_groups: 8
329
+ offload_modules: null
330
+ output_layer_init_method:
331
+ _args_: []
332
+ _partial_: true
333
+ _target_: torch.nn.init.normal_
334
+ mean: 0.0
335
+ std: 0.0023570226039551587
336
+ overlap_moe_expert_parallel_comm: false
337
+ overlap_p2p_comm: false
338
+ overlap_p2p_comm_warmup_flush: false
339
+ parallel_output: true
340
+ param_sync_func: null
341
+ params_dtype:
342
+ _call_: false
343
+ _target_: torch.bfloat16
344
+ perform_initialization: true
345
+ persist_layer_norm: false
346
+ pipeline_dtype:
347
+ _call_: false
348
+ _target_: torch.bfloat16
349
+ pipeline_model_parallel_comm_backend: null
350
+ pipeline_model_parallel_layout: null
351
+ pipeline_model_parallel_size: 1
352
+ position_embedding_type: rope
353
+ qk_clip: false
354
+ qk_clip_alpha: 0.5
355
+ qk_clip_threshold: 100
356
+ qk_layernorm: true
357
+ quant_recipe: null
358
+ recompute_granularity: full
359
+ recompute_method: uniform
360
+ recompute_modules:
361
+ - core_attn
362
+ recompute_num_layers: 1
363
+ restore_modelopt_state: false
364
+ rotary_base: 5000000
365
+ rotary_interleaved: false
366
+ rotary_percent: 1.0
367
+ scatter_embedding_sequence_parallel: true
368
+ seq_len_interpolation_factor: null
369
+ seq_length: 262144
370
+ sequence_parallel: false
371
+ share_embeddings_and_output_weights: true
372
+ should_pad_vocab: false
373
+ softmax_scale: null
374
+ softmax_type: vanilla
375
+ symmetric_ar_type: null
376
+ tensor_model_parallel_size: 1
377
+ test_mode: false
378
+ timers: null
379
+ tp_comm_atomic_ag: false
380
+ tp_comm_atomic_rs: false
381
+ tp_comm_bootstrap_backend: nccl
382
+ tp_comm_bulk_dgrad: true
383
+ tp_comm_bulk_wgrad: true
384
+ tp_comm_overlap: false
385
+ tp_comm_overlap_ag: true
386
+ tp_comm_overlap_cfg: null
387
+ tp_comm_overlap_disable_fc1: false
388
+ tp_comm_overlap_disable_qkv: false
389
+ tp_comm_overlap_rs: true
390
+ tp_comm_overlap_rs_dgrad: false
391
+ tp_comm_split_ag: true
392
+ tp_comm_split_rs: true
393
+ tp_only_amax_red: false
394
+ transformer_impl: transformer_engine
395
+ transformer_layer_spec:
396
+ _call_: false
397
+ _target_: megatron.bridge.models.gpt_provider.default_layer_spec
398
+ use_cpu_initialization: false
399
+ use_fused_weighted_squared_relu: false
400
+ use_kitchen: false
401
+ use_mamba_mem_eff_path: true
402
+ use_ring_exchange_p2p: false
403
+ use_te_activation_func: false
404
+ use_te_rng_tracker: false
405
+ use_transformer_engine_full_layer_spec: false
406
+ use_transformer_engine_op_fuser: false
407
+ variable_seq_lengths: false
408
+ virtual_pipeline_model_parallel_size: null
409
+ vocab_size: 16
410
+ wgrad_deferral_limit: 0
411
+ window_attn_skip_freq: null
412
+ window_size: null
413
+ nvrx_straggler: null
414
+ optimizer:
415
+ _target_: megatron.bridge.training.config.OptimizerConfig
416
+ adam_beta1: 0.9
417
+ adam_beta2: 0.98
418
+ adam_eps: 1.0e-08
419
+ barrier_with_L1_time: false
420
+ bf16: true
421
+ clip_grad: 0.5
422
+ config_logger_dir: ''
423
+ decoupled_lr: null
424
+ decoupled_min_lr: null
425
+ decoupled_weight_decay: true
426
+ exp_avg_dtype:
427
+ _call_: false
428
+ _target_: torch.float32
429
+ exp_avg_sq_dtype:
430
+ _call_: false
431
+ _target_: torch.float32
432
+ fp16: false
433
+ fp8_recipe: null
434
+ hysteresis: 2
435
+ initial_loss_scale: 4294967296
436
+ log_num_zeros_in_grad: false
437
+ loss_scale: null
438
+ loss_scale_window: 1000
439
+ lr: 0.0001
440
+ main_grads_dtype:
441
+ _call_: false
442
+ _target_: torch.float32
443
+ main_params_dtype:
444
+ _call_: false
445
+ _target_: torch.float32
446
+ min_loss_scale: 1.0
447
+ min_lr: 1.0e-07
448
+ muon_extra_scale_factor: 1.0
449
+ muon_fp32_matmul_prec: medium
450
+ muon_momentum: 0.95
451
+ muon_num_ns_steps: 5
452
+ muon_scale_mode: spectral
453
+ muon_split_qkv: true
454
+ muon_tp_mode: blockwise
455
+ muon_use_nesterov: false
456
+ optimizer: adam
457
+ optimizer_cpu_offload: false
458
+ optimizer_offload_fraction: 0.0
459
+ overlap_cpu_optimizer_d2h_h2d: false
460
+ overlap_param_gather: false
461
+ overlap_param_gather_with_optimizer_step: false
462
+ params_dtype: bfloat16
463
+ pin_cpu_grads: true
464
+ pin_cpu_params: true
465
+ reuse_grad_buf_for_mxfp8_param_ag: false
466
+ sgd_momentum: 0.9
467
+ store_param_remainders: true
468
+ timers: null
469
+ use_distributed_optimizer: true
470
+ use_precision_aware_optimizer: false
471
+ use_torch_optimizer_for_cpu_offload: false
472
+ weight_decay: 0.1
473
+ peft: null
474
+ profiling:
475
+ _target_: megatron.bridge.training.config.ProfilingConfig
476
+ memory_snapshot_path: snapshot.pickle
477
+ nvtx_ranges: false
478
+ profile_ranks:
479
+ - 0
480
+ profile_step_end: 12
481
+ profile_step_start: 10
482
+ record_memory_history: false
483
+ record_shapes: false
484
+ use_nsys_profiler: false
485
+ use_pytorch_profiler: false
486
+ rerun_state_machine:
487
+ _target_: megatron.bridge.training.config.RerunStateMachineConfig
488
+ check_for_nan_in_loss: true
489
+ check_for_spiky_loss: false
490
+ error_injection_rate: 0
491
+ error_injection_type: transient_error
492
+ rerun_mode: disabled
493
+ rng:
494
+ _target_: megatron.bridge.training.config.RNGConfig
495
+ data_parallel_random_init: false
496
+ inference_rng_tracker: false
497
+ seed: 1234
498
+ te_rng_tracker: false
499
+ scheduler:
500
+ _target_: megatron.bridge.training.config.SchedulerConfig
501
+ end_weight_decay: 0.1
502
+ lr_decay_iters: 12716
503
+ lr_decay_samples: null
504
+ lr_decay_steps: 3255296
505
+ lr_decay_style: linear
506
+ lr_warmup_fraction: null
507
+ lr_warmup_init: 1.0e-06
508
+ lr_warmup_iters: 200
509
+ lr_warmup_samples: 0
510
+ lr_warmup_steps: 51200
511
+ lr_wsd_decay_iters: null
512
+ lr_wsd_decay_samples: null
513
+ lr_wsd_decay_style: exponential
514
+ no_weight_decay_cond_type: null
515
+ override_opt_param_scheduler: false
516
+ start_weight_decay: 0.1
517
+ use_checkpoint_opt_param_scheduler: false
518
+ wd_incr_steps: 1528832
519
+ weight_decay_incr_style: constant
520
+ wsd_decay_steps: null
521
+ straggler: null
522
+ tensor_inspect: null
523
+ tokenizer:
524
+ _target_: megatron.bridge.training.tokenizers.config.TokenizerConfig
525
+ hf_tokenizer_kwargs: {}
526
+ image_tag_type: null
527
+ merge_file: null
528
+ special_tokens: null
529
+ tiktoken_num_special_tokens: 1000
530
+ tiktoken_pattern: null
531
+ tiktoken_special_tokens: null
532
+ tokenizer_model: ./models/Qwen-NVARC
533
+ tokenizer_prompt_format: null
534
+ tokenizer_type: HuggingFaceTokenizer
535
+ vocab_extra_ids: 0
536
+ vocab_file: null
537
+ vocab_size: null
538
+ train:
539
+ _target_: megatron.bridge.training.config.TrainingConfig
540
+ check_weight_hash_across_dp_replicas_interval: null
541
+ decrease_batch_size_if_needed: false
542
+ empty_unused_memory_level: 0
543
+ eval_interval: 1000
544
+ eval_iters: 100
545
+ exit_duration_in_mins: null
546
+ exit_interval: null
547
+ exit_signal:
548
+ _args_:
549
+ - 15
550
+ _call_: true
551
+ _target_: signal.Signals
552
+ exit_signal_handler: false
553
+ exit_signal_handler_for_dataloader: false
554
+ global_batch_size: 256
555
+ iterations_to_skip: []
556
+ manual_gc: false
557
+ manual_gc_eval: true
558
+ manual_gc_interval: 0
559
+ micro_batch_size: 1
560
+ rampup_batch_size: null
561
+ skip_train: false
562
+ train_iters: 5972
563
+ train_samples: null
564
+ train_sync_interval: null
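
Putting the scheduler numbers above together: the learning rate warms up linearly from lr_warmup_init (1e-6) to lr (1e-4) over the first 200 iterations, then decays linearly toward min_lr (1e-7) at lr_decay_iters (12716); since the run stops at train_iters 5972, it ends mid-decay at roughly 5.4e-5. A sketch, assuming decay is measured from the end of warmup (Megatron's exact bookkeeping may differ slightly):

    def lr_at(it, warmup_init=1e-6, lr=1e-4, min_lr=1e-7,
              warmup_iters=200, decay_iters=12716):
        if it < warmup_iters:                                  # linear warmup
            return warmup_init + (lr - warmup_init) * it / warmup_iters
        frac = min((it - warmup_iters) / (decay_iters - warmup_iters), 1.0)
        return lr - (lr - min_lr) * frac                       # linear decay

    print(lr_at(5972))   # ~5.4e-05 at the end of this run
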
step_5972/policy/weights/iter_0000000/train_state.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9077193280ecd59384d5ae94d97a7943638810502f2ce08a7617aeb20b3586d2
3
+ size 3461
step_5972/policy/weights/latest_checkpointed_iteration.txt ADDED
@@ -0,0 +1 @@
1
+ 0
step_5972/policy/weights/latest_train_state.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9077193280ecd59384d5ae94d97a7943638810502f2ce08a7617aeb20b3586d2
3
+ size 3461
step_5972/train_dataloader.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12bfcb136c615985e1571fc19377a9c8101d41c662c01f02e87c20a192ea5137
3
+ size 7336
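
train_dataloader.pt is a small (7,336-byte) torch-serialized object holding dataloader state; its exact schema isn't documented in this repo, so treat the keys as unknown. With the LFS object materialized, it can presumably be inspected with a plain torch.load:

    import torch

    # weights_only=False may be needed on newer PyTorch for non-tensor payloads.
    state = torch.load("step_5972/train_dataloader.pt", map_location="cpu", weights_only=False)
    print(type(state))
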