iamPi commited on
Commit
ff261fa
·
verified ·
1 Parent(s): 652d359

Delete step_5400

Browse files
step_5400/config.yaml DELETED
@@ -1,207 +0,0 @@
1
- checkpointing:
2
- checkpoint_dir: results/qwen3_4b_sft
3
- checkpoint_must_save_by: null
4
- enabled: true
5
- higher_is_better: false
6
- keep_top_k: 3
7
- metric_name: val:val_loss
8
- save_period: 200
9
- cluster:
10
- gpus_per_node: 2
11
- num_nodes: 1
12
- data:
13
- num_workers: 4
14
- shuffle: true
15
- train_dataset_path:
16
- - ./data/hones
17
- val_dataset_path: ./data/arc2_evaluation6
18
- logger:
19
- gpu_monitoring:
20
- collection_interval: 10
21
- flush_interval: 10
22
- log_dir: logs/exp_019
23
- mlflow_enabled: false
24
- monitor_gpus: false
25
- swanlab_enabled: false
26
- tensorboard_enabled: false
27
- wandb:
28
- name: qwen3_4b_sft
29
- project: arc2
30
- wandb_enabled: true
31
- policy:
32
- activation_checkpointing_enabled: false
33
- attn_implementation: flash_attention_2
34
- dtensor_cfg:
35
- enabled: false
36
- dynamic_batching:
37
- enabled: false
38
- fsdp_offload_enabled: false
39
- make_sequence_length_divisible_by: 64
40
- max_grad_norm: null
41
- megatron_cfg:
42
- activation_checkpointing: true
43
- apply_rope_fusion: true
44
- bias_activation_fusion: false
45
- context_parallel_size: 2
46
- distributed_data_parallel_config:
47
- average_in_collective: true
48
- data_parallel_sharding_strategy: optim_grads_params
49
- grad_reduce_in_fp32: true
50
- overlap_grad_reduce: true
51
- overlap_param_gather: true
52
- empty_unused_memory_level: 1
53
- enabled: true
54
- env_vars:
55
- AWS_OFI_NCCL_VERSION: 1.14.0
56
- BASH_ENV: /etc/bash.bashrc
57
- CAL_VERSION: 0.4.4.50
58
- CUBLASMP_VERSION: 0.4.0.789
59
- CUBLAS_VERSION: 12.9.0.13
60
- CUDA_ARCH_LIST: 7.5 8.0 8.6 9.0 10.0 12.0
61
- CUDA_DRIVER_VERSION: 575.51.03
62
- CUDA_VERSION: 12.9.0.043
63
- CUDA_VISIBLE_DEVICES: 6,7
64
- CUDNN_FRONTEND_VERSION: 1.11.0
65
- CUDNN_VERSION: 9.10.1.4
66
- CUFFT_VERSION: 11.4.0.6
67
- CUFILE_VERSION: 1.14.0.30
68
- CURAND_VERSION: 10.3.10.19
69
- CUSOLVER_VERSION: 11.7.4.40
70
- CUSPARSELT_VERSION: 0.7.1.0
71
- CUSPARSE_VERSION: 12.5.9.5
72
- DALI_BUILD: ''
73
- DALI_URL_SUFFIX: '120'
74
- DALI_VERSION: 1.49.0
75
- EFA_VERSION: 1.38.1
76
- ENV: /etc/shinit_v2
77
- GDRCOPY_VERSION: 2.4.4
78
- HOME: /root
79
- HOSTNAME: e6ad2ac15863
80
- HPCX_VERSION: '2.23'
81
- KMP_DUPLICATE_LIB_OK: 'True'
82
- KMP_INIT_AT_FORK: 'FALSE'
83
- LC_CTYPE: C.UTF-8
84
- LD_LIBRARY_PATH: /usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
85
- LESSCLOSE: /usr/bin/lesspipe %s %s
86
- LESSOPEN: '| /usr/bin/lesspipe %s'
87
- LIBRARY_PATH: '/usr/local/cuda/lib64/stubs:'
88
- LS_COLORS: 'rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=00:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.avif=01;35:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.webp=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:*~=00;90:*#=00;90:*.bak=00;90:*.crdownload=00;90:*.dpkg-dist=00;90:*.dpkg-new=00;90:*.dpkg-old=00;90:*.dpkg-tmp=00;90:*.old=00;90:*.orig=00;90:*.part=00;90:*.rej=00;90:*.rpmnew=00;90:*.rpmorig=00;90:*.rpmsave=00;90:*.swp=00;90:*.tmp=00;90:*.ucf-dist=00;90:*.ucf-new=00;90:*.ucf-old=00;90:'
89
- MODEL_OPT_VERSION: 0.27.1
90
- MOFED_VERSION: 5.4-rdmacore50.0
91
- NCCL_NET_PLUGIN: aws-ofi
92
- NCCL_TUNER_PLUGIN: aws-ofi
93
- NCCL_VERSION: 2.26.5
94
- NEMO_RL_COMMIT: 90ecaaccdf66fb92f814b2325db81a0105ed933a
95
- NEMO_RL_VENV_DIR: /opt/ray_venvs
96
- NPP_VERSION: 12.4.0.27
97
- NRL_CONTAINER: '1'
98
- NRL_MEGATRON_CHECKPOINT_DIR: ./results/megatron
99
- NSIGHT_COMPUTE_VERSION: 2025.2.0.11
100
- NSIGHT_SYSTEMS_VERSION: 2025.3.1.90
101
- NVIDIA_BUILD_ID: '244212578'
102
- NVIDIA_BUILD_REF: 6ba29f15920b12095387b8c82cea05ac3d6d9732
103
- NVIDIA_DRIVER_CAPABILITIES: compute,utility,video
104
- NVIDIA_PRODUCT_NAME: CUDA
105
- NVIDIA_REQUIRE_CUDA: cuda>=9.0
106
- NVIDIA_REQUIRE_JETPACK_HOST_MOUNTS: ''
107
- NVIDIA_VISIBLE_DEVICES: all
108
- NVJITLINK_VERSION: 12.9.41
109
- NVJPEG_VERSION: 12.4.0.16
110
- NVSHMEM_VERSION: 3.2.5
111
- OLDPWD: /workspace
112
- OMPI_MCA_coll_hcoll_enable: '0'
113
- OPAL_PREFIX: /opt/hpcx/ompi
114
- OPENMPI_VERSION: 4.1.7
115
- OPENUCX_VERSION: 1.19.0
116
- PATH: /opt/nemo_rl_venv/bin:/opt/nemo_rl_venv/bin:/opt/nemo_rl_venv/bin:/root/.local/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/mpi/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/ucx/bin:/opt/amazon/efa/bin
117
- POLYGRAPHY_VERSION: 0.49.20
118
- PWD: /workspace/ARChitects
119
- PYTHONBREAKPOINT: ray.util.rpdb._driver_set_trace
120
- PYTHONPATH: '/workspace/NVARC/ARChitects/nemo-rl:'
121
- PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True
122
- RAY_CLIENT_MODE: '0'
123
- RAY_ENABLE_UV_RUN_RUNTIME_ENV: '0'
124
- RAY_USAGE_STATS_ENABLED: '0'
125
- RDMACORE_VERSION: '50.0'
126
- SHELL: /bin/bash
127
- SHLVL: '2'
128
- SWANLAB_API_HOST: https://api.swanlab.cn/api
129
- SWANLAB_RUNTIME: user
130
- SWANLAB_WEB_HOST: https://swanlab.cn
131
- TERM: xterm
132
- TORCH_CUDA_ARCH_LIST: '9.0'
133
- TRANSFORMER_ENGINE_VERSION: '2.3'
134
- TRTOSS_VERSION: ''
135
- TRT_VERSION: 10.10.0.31
136
- UV: /root/.local/bin/uv
137
- UV_LINK_MODE: copy
138
- UV_PROJECT_ENVIRONMENT: /opt/nemo_rl_venv
139
- UV_RUN_RECURSION_DEPTH: '1'
140
- VIRTUAL_ENV: /opt/nemo_rl_venv
141
- VIRTUAL_ENV_PROMPT: nemo-rl
142
- WANDB_SERVICE: 3-239782-unix-/tmp/wandb-239782-251940-2530276073/socket
143
- _: /root/.local/bin/uv
144
- _CUDA_COMPAT_PATH: /usr/local/cuda/compat
145
- _CUDA_COMPAT_STATUS: System has unsupported display driver / cuda driver combination
146
- (CUDA_ERROR_SYSTEM_DRIVER_MISMATCH) cuInit()=803
147
- _MLFLOW_TELEMETRY_SESSION_ID: 8ac94e5a44a94a62a2cf3302ff4fa0c9
148
- expert_model_parallel_size: 1
149
- expert_tensor_parallel_size: 1
150
- freeze_moe_router: true
151
- moe_permute_fusion: false
152
- moe_router_bias_update_rate: 0.0
153
- moe_router_dtype: fp64
154
- moe_router_load_balancing_type: none
155
- num_layers_in_first_pipeline_stage: null
156
- num_layers_in_last_pipeline_stage: null
157
- optimizer:
158
- adam_beta1: 0.9
159
- adam_beta2: 0.98
160
- adam_eps: 1.0e-08
161
- bf16: true
162
- clip_grad: 0.5
163
- fp16: false
164
- lr: 0.0001
165
- min_lr: 1.0e-07
166
- optimizer: adam
167
- optimizer_cpu_offload: false
168
- optimizer_offload_fraction: 0.0
169
- params_dtype: bfloat16
170
- sgd_momentum: 0.9
171
- use_distributed_optimizer: true
172
- use_precision_aware_optimizer: false
173
- weight_decay: 0.1
174
- pipeline_dtype: bfloat16
175
- pipeline_model_parallel_size: 1
176
- scheduler:
177
- end_weight_decay: 0.1
178
- lr_decay_iters: 12716
179
- lr_decay_style: linear
180
- lr_warmup_init: 1.0e-06
181
- lr_warmup_iters: 200
182
- start_weight_decay: 0.1
183
- weight_decay_incr_style: constant
184
- sequence_parallel: false
185
- tensor_model_parallel_size: 1
186
- train_iters: 5972
187
- model_name: ./models/Qwen-NVARC
188
- offload_optimizer_for_logprob: false
189
- precision: bfloat16
190
- sequence_packing:
191
- algorithm: modified_first_fit_decreasing
192
- enabled: true
193
- sequence_length_round: 64
194
- train_mb_tokens: 128000
195
- tokenizer:
196
- name: ./models/Qwen-NVARC
197
- train_global_batch_size: 256
198
- train_micro_batch_size: 1
199
- sft:
200
- max_num_epochs: 1
201
- max_num_steps: 6400
202
- seed: 24
203
- val_at_start: true
204
- val_batches: 200
205
- val_global_batch_size: 256
206
- val_micro_batch_size: 1
207
- val_period: 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
step_5400/policy/weights/iter_0000000/.metadata DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:601958148c0276510ee83ae2c089910f685c2aa6fde4b6f5e668b28ed06ec567
3
- size 329201
 
 
 
 
step_5400/policy/weights/iter_0000000/__0_0.distcp DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:4f4f63a1df595166115fa2fd03a1601a3ae7b6c72151956a0f966332b260176d
3
- size 12718332319
 
 
 
 
step_5400/policy/weights/iter_0000000/__0_1.distcp DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:916b552fc89ae9288ede8bfb09213769b9129faeaa4906c1556f01b00c9c6ea5
3
- size 12718313784
 
 
 
 
step_5400/policy/weights/iter_0000000/__1_0.distcp DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:cdbfe2d6c54d823e7ef9c6bdfb156183fa5d437043a001c83847514272046f8b
3
- size 12717813616
 
 
 
 
step_5400/policy/weights/iter_0000000/__1_1.distcp DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:418e7aacc4c2ab151f789fd2a477dfb904a1078c76eb23cdd921ee133e5840fb
3
- size 12717860926
 
 
 
 
step_5400/policy/weights/iter_0000000/common.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6cf17a4bbf5fb940ff8d1e669f26a4e277411e9796b4920f5cd867e4401db145
3
- size 1767
 
 
 
 
step_5400/policy/weights/iter_0000000/metadata.json DELETED
@@ -1 +0,0 @@
1
- {"sharded_backend": "torch_dist", "sharded_backend_version": 1, "common_backend": "torch", "common_backend_version": 1}
 
 
step_5400/policy/weights/iter_0000000/modelopt_run_config.yaml DELETED
@@ -1,203 +0,0 @@
1
- activation_func: <function silu at 0x7d0251c6b420>
2
- activation_func_clamp_value: None
3
- add_bias_linear: false
4
- add_qkv_bias: false
5
- apply_query_key_layer_scaling: false
6
- apply_residual_connection_post_layernorm: false
7
- apply_rope_fusion: true
8
- attention_backend: AttnBackend.auto
9
- attention_dropout: '0.0'
10
- attention_output_gate: false
11
- attention_softmax_in_fp32: false
12
- autocast_dtype: torch.bfloat16
13
- barrier_with_L1_time: true
14
- bf16: true
15
- bias_activation_fusion: false
16
- bias_dropout_fusion: false
17
- calculate_per_token_loss: true
18
- clone_scatter_output_in_embedding: true
19
- config_logger_dir: ''
20
- cross_entropy_fusion_impl: native
21
- cross_entropy_loss_fusion: true
22
- defer_embedding_wgrad_compute: false
23
- delay_wgrad_compute: false
24
- deterministic_mode: false
25
- disable_bf16_reduced_precision_matmul: false
26
- disable_parameter_transpose_cache: false
27
- distribute_saved_activations: None
28
- enable_autocast: false
29
- fallback_to_eager_attn: false
30
- ffn_hidden_size: 9728
31
- finalize_model_grads_func: functools.partial(<function finalize_model_grads at 0x7cfab95b9e40>,
32
- pg_collection=None)
33
- fine_grained_activation_offloading: false
34
- first_last_layers_bf16: false
35
- flash_decode: false
36
- fp16: false
37
- fp16_lm_cross_entropy: false
38
- fp32_residual_connection: false
39
- fused_single_qkv_rope: false
40
- gated_linear_unit: true
41
- generation_config: None
42
- glu_linear_offset: '0.0'
43
- grad_scale_func: <bound method MegatronOptimizer.scale_loss of <megatron.core.optimizer.optimizer.ChainedOptimizer
44
- object at 0x7cf9d413cd70>>
45
- grad_sync_func: "<bound method DistributedDataParallel.start_grad_sync of DistributedDataParallel(\n\
46
- \ (module): CustomFloat16Module(\n (module): GPTModel(\n (embedding): LanguageModelEmbedding(\n\
47
- \ (word_embeddings): VocabParallelEmbedding()\n (embedding_dropout):\
48
- \ Dropout(p=0.0, inplace=False)\n )\n (rotary_pos_emb): RotaryEmbedding()\n\
49
- \ (decoder): TransformerBlock(\n (layers): ModuleList(\n (0-35):\
50
- \ 36 x TransformerLayer(\n (input_layernorm): IdentityOp()\n \
51
- \ (self_attention): SelfAttention(\n (core_attention): TEDotProductAttention(\n\
52
- \ (flash_attention): FlashAttention()\n (fused_attention):\
53
- \ FusedAttention()\n (unfused_attention): UnfusedDotProductAttention(\n\
54
- \ (scale_mask_softmax): FusedScaleMaskSoftmax()\n \
55
- \ (attention_dropout): Dropout(p=0.0, inplace=False)\n )\n \
56
- \ )\n (linear_proj): TERowParallelLinear(in_features=4096,\
57
- \ out_features=2560, bias=False, TP=1)\n (linear_qkv): TELayerNormColumnParallelLinear(in_features=2560,\
58
- \ out_features=6144, bias=False, TP=1)\n (q_layernorm): RMSNorm()\n\
59
- \ (k_layernorm): RMSNorm()\n )\n (pre_cross_attn_layernorm):\
60
- \ IdentityOp()\n (cross_attention): IdentityOp()\n (cross_attn_bda):\
61
- \ IdentityFuncOp()\n (pre_mlp_layernorm): IdentityOp()\n (mlp):\
62
- \ MLP(\n (linear_fc1): TELayerNormColumnParallelLinear(in_features=2560,\
63
- \ out_features=19456, bias=False, TP=1)\n (linear_fc2): TERowParallelLinear(in_features=9728,\
64
- \ out_features=2560, bias=False, TP=1)\n )\n )\n )\n\
65
- \ (final_layernorm): RMSNorm()\n )\n (output_layer): ColumnParallelLinear(in_features=2560,\
66
- \ out_features=16, bias=False, TP=1)\n )\n )\n)>"
67
- gradient_accumulation_fusion: false
68
- hetereogenous_dist_checkpoint: false
69
- heterogeneous_block_specs: false
70
- hf_model_id: ./models/Qwen-NVARC
71
- hidden_dropout: '0.0'
72
- hidden_size: 2560
73
- is_hybrid_model: false
74
- kv_channels: 128
75
- layernorm_epsilon: 1e-06
76
- layernorm_zero_centered_gamma: false
77
- linear_attention_freq: None
78
- linear_attention_type: None
79
- linear_conv_kernel_dim: None
80
- linear_key_head_dim: None
81
- linear_num_key_heads: None
82
- linear_num_value_heads: None
83
- linear_value_head_dim: None
84
- log_max_attention_logit: false
85
- make_vocab_size_divisible_by: 16
86
- mamba_head_dim: 64
87
- mamba_num_groups: 8
88
- mamba_num_heads: None
89
- mamba_state_dim: 128
90
- masked_softmax_fusion: true
91
- max_position_embeddings: 40960
92
- memory_efficient_layer_norm: false
93
- min_offloaded_tensor_size: 1048576
94
- mlp_chunks_for_prefill: 1
95
- moe_apply_probs_on_input: false
96
- moe_aux_loss_coeff: '0.0'
97
- moe_deepep_num_sms: 20
98
- moe_enable_deepep: false
99
- moe_expert_capacity_factor: None
100
- moe_extended_tp: false
101
- moe_ffn_hidden_size: None
102
- moe_flex_dispatcher_backend: deepep
103
- moe_grouped_gemm: false
104
- moe_hybridep_num_sms: 16
105
- moe_input_jitter_eps: None
106
- moe_layer_freq: 1
107
- moe_pad_expert_input_to_capacity: false
108
- moe_per_layer_logging: false
109
- moe_permute_fusion: false
110
- moe_router_bias_update_rate: '0.0'
111
- moe_router_dtype: fp64
112
- moe_router_enable_expert_bias: false
113
- moe_router_force_load_balancing: false
114
- moe_router_fusion: false
115
- moe_router_group_topk: None
116
- moe_router_load_balancing_type: none
117
- moe_router_num_groups: None
118
- moe_router_padding_for_quantization: false
119
- moe_router_pre_softmax: false
120
- moe_router_score_function: softmax
121
- moe_router_topk: 2
122
- moe_router_topk_limited_devices: None
123
- moe_router_topk_scaling_factor: None
124
- moe_shared_expert_gate: false
125
- moe_shared_expert_intermediate_size: None
126
- moe_shared_expert_overlap: false
127
- moe_token_dispatcher_type: allgather
128
- moe_token_drop_policy: probs
129
- moe_token_dropping: false
130
- moe_use_legacy_grouped_gemm: false
131
- moe_z_loss_coeff: None
132
- mrope_section: None
133
- multi_latent_attention: false
134
- no_rope_freq: None
135
- no_sync_func: "<bound method DistributedDataParallel.no_sync of DistributedDataParallel(\n\
136
- \ (module): CustomFloat16Module(\n (module): GPTModel(\n (embedding): LanguageModelEmbedding(\n\
137
- \ (word_embeddings): VocabParallelEmbedding()\n (embedding_dropout):\
138
- \ Dropout(p=0.0, inplace=False)\n )\n (rotary_pos_emb): RotaryEmbedding()\n\
139
- \ (decoder): TransformerBlock(\n (layers): ModuleList(\n (0-35):\
140
- \ 36 x TransformerLayer(\n (input_layernorm): IdentityOp()\n \
141
- \ (self_attention): SelfAttention(\n (core_attention): TEDotProductAttention(\n\
142
- \ (flash_attention): FlashAttention()\n (fused_attention):\
143
- \ FusedAttention()\n (unfused_attention): UnfusedDotProductAttention(\n\
144
- \ (scale_mask_softmax): FusedScaleMaskSoftmax()\n \
145
- \ (attention_dropout): Dropout(p=0.0, inplace=False)\n )\n \
146
- \ )\n (linear_proj): TERowParallelLinear(in_features=4096,\
147
- \ out_features=2560, bias=False, TP=1)\n (linear_qkv): TELayerNormColumnParallelLinear(in_features=2560,\
148
- \ out_features=6144, bias=False, TP=1)\n (q_layernorm): RMSNorm()\n\
149
- \ (k_layernorm): RMSNorm()\n )\n (pre_cross_attn_layernorm):\
150
- \ IdentityOp()\n (cross_attention): IdentityOp()\n (cross_attn_bda):\
151
- \ IdentityFuncOp()\n (pre_mlp_layernorm): IdentityOp()\n (mlp):\
152
- \ MLP(\n (linear_fc1): TELayerNormColumnParallelLinear(in_features=2560,\
153
- \ out_features=19456, bias=False, TP=1)\n (linear_fc2): TERowParallelLinear(in_features=9728,\
154
- \ out_features=2560, bias=False, TP=1)\n )\n )\n )\n\
155
- \ (final_layernorm): RMSNorm()\n )\n (output_layer): ColumnParallelLinear(in_features=2560,\
156
- \ out_features=16, bias=False, TP=1)\n )\n )\n)>"
157
- normalization: RMSNorm
158
- num_attention_heads: 32
159
- num_layers: 36
160
- num_layers_at_end_in_bf16: 1
161
- num_layers_at_start_in_bf16: 1
162
- num_moe_experts: None
163
- num_query_groups: 8
164
- nvidia_modelopt_version: 0.39.0
165
- offload_modules: None
166
- param_sync_func: None
167
- params_dtype: torch.bfloat16
168
- perform_initialization: true
169
- persist_layer_norm: false
170
- position_embedding_type: rope
171
- qk_clip: false
172
- qk_clip_alpha: '0.5'
173
- qk_clip_threshold: 100
174
- qk_layernorm: true
175
- quant_recipe: None
176
- restore_modelopt_state: false
177
- rotary_base: 5000000
178
- rotary_interleaved: false
179
- rotary_percent: '1.0'
180
- seq_len_interpolation_factor: None
181
- seq_length: 262144
182
- share_embeddings_and_output_weights: true
183
- should_pad_vocab: false
184
- softmax_scale: None
185
- softmax_type: vanilla
186
- symmetric_ar_type: None
187
- test_mode: false
188
- timers: None
189
- transformer_impl: transformer_engine
190
- transformer_layer_spec: <function default_layer_spec at 0x7cfa556ad440>
191
- use_fused_weighted_squared_relu: false
192
- use_kitchen: false
193
- use_mamba_mem_eff_path: true
194
- use_ring_exchange_p2p: false
195
- use_te_activation_func: false
196
- use_te_rng_tracker: false
197
- use_transformer_engine_full_layer_spec: false
198
- use_transformer_engine_op_fuser: false
199
- variable_seq_lengths: false
200
- vocab_size: 16
201
- wgrad_deferral_limit: 0
202
- window_attn_skip_freq: None
203
- window_size: None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
step_5400/policy/weights/iter_0000000/run_config.yaml DELETED
@@ -1,564 +0,0 @@
1
- _target_: megatron.bridge.training.config.ConfigContainer
2
- checkpoint:
3
- _target_: megatron.bridge.training.config.CheckpointConfig
4
- async_save: false
5
- ckpt_assume_constant_structure: false
6
- ckpt_convert_format: null
7
- ckpt_convert_save: null
8
- ckpt_format: torch_dist
9
- ckpt_step: null
10
- dist_ckpt_optim_fully_reshardable: false
11
- dist_ckpt_save_pre_mcore_014: false
12
- dist_ckpt_strictness: assume_ok_unexpected
13
- distrib_optim_fully_reshardable_mem_efficient: false
14
- exit_on_missing_checkpoint: false
15
- finetune: true
16
- fully_parallel_load: true
17
- fully_parallel_save: true
18
- load: null
19
- load_main_params_from_ckpt: false
20
- load_optim: true
21
- load_rng: false
22
- most_recent_k: -1
23
- non_persistent_ckpt_type: null
24
- non_persistent_global_ckpt_dir: null
25
- non_persistent_local_ckpt_algo: fully_parallel
26
- non_persistent_local_ckpt_dir: null
27
- non_persistent_save_interval: null
28
- pretrained_checkpoint: ./results/megatron/model_._models_Qwen-NVARC
29
- replication: false
30
- replication_factor: 2
31
- replication_jump: null
32
- save: /workspace/ARChitects/results/qwen3_4b_sft/tmp_step_5400/policy/weights
33
- save_interval: 100
34
- save_optim: true
35
- save_rng: true
36
- save_tokenizer_assets: true
37
- strict_fsdp_dtensor_load: false
38
- use_checkpoint_args: false
39
- use_persistent_ckpt_worker: true
40
- comm_overlap: null
41
- dataset: null
42
- ddp:
43
- _target_: megatron.bridge.training.config.DistributedDataParallelConfig
44
- align_param_gather: false
45
- average_in_collective: false
46
- bucket_size: 40000000
47
- check_for_large_grads: false
48
- check_for_nan_in_grad: true
49
- data_parallel_sharding_strategy: optim_grads_params
50
- delay_wgrad_compute: false
51
- disable_symmetric_registration: false
52
- fp8_param_gather: false
53
- fsdp_double_buffer: false
54
- grad_reduce_in_fp32: true
55
- gradient_reduce_div_fusion: true
56
- keep_fp8_transpose_cache: false
57
- nccl_ub: false
58
- num_distributed_optimizer_instances: 1
59
- outer_dp_sharding_strategy: no_shard
60
- overlap_grad_reduce: true
61
- overlap_param_gather: true
62
- pad_buckets_for_high_nccl_busbw: false
63
- preserve_fp32_weights: true
64
- reduce_scatter_with_fp32_accumulation: false
65
- reuse_grad_buf_for_mxfp8_param_ag: false
66
- suggested_communication_unit_size: null
67
- use_custom_fsdp: false
68
- use_distributed_optimizer: true
69
- use_megatron_fsdp: false
70
- dist:
71
- _target_: megatron.bridge.training.config.DistributedInitConfig
72
- align_grad_reduce: true
73
- disable_jit_fuser: false
74
- distributed_backend: nccl
75
- distributed_timeout_minutes: 10
76
- distributed_timeout_seconds_after_init: null
77
- enable_megatron_core_experimental: false
78
- external_gpu_device_mapping: true
79
- high_priority_stream_groups: null
80
- lazy_init: false
81
- local_rank: 0
82
- nccl_communicator_config_path: null
83
- sharp_enabled_group: null
84
- use_gloo_process_groups: true
85
- use_megatron_fsdp: false
86
- use_sharp: false
87
- use_torch_fsdp2: false
88
- use_tp_pp_dp_mapping: false
89
- ft: null
90
- inprocess_restart: null
91
- logger:
92
- _target_: megatron.bridge.training.config.LoggerConfig
93
- filter_warnings: true
94
- log_energy: false
95
- log_interval: 100
96
- log_l2_norm_grad_to_tensorboard: false
97
- log_loss_scale_to_tensorboard: true
98
- log_memory_to_tensorboard: false
99
- log_params_norm: false
100
- log_progress: false
101
- log_runtime_to_tensorboard: false
102
- log_throughput: false
103
- log_throughput_to_tensorboard: false
104
- log_timers_to_tensorboard: false
105
- log_validation_ppl_to_tensorboard: false
106
- log_world_size_to_tensorboard: false
107
- logging_level: 0
108
- memory_keys: null
109
- modules_to_filter: null
110
- runtime_time_unit: hours
111
- save_config_filepath: null
112
- set_level_for_all_loggers: false
113
- tensorboard_dir: null
114
- tensorboard_log_interval: 1
115
- tensorboard_queue_size: 1000
116
- throughput_window_size: 100
117
- timing_log_level: 0
118
- timing_log_option: minmax
119
- wandb_entity: null
120
- wandb_exp_name: null
121
- wandb_project: null
122
- wandb_save_dir: null
123
- mixed_precision: null
124
- model:
125
- _target_: megatron.bridge.models.qwen.qwen_provider.Qwen3ModelProvider
126
- account_for_embedding_in_pipeline_split: false
127
- account_for_loss_in_pipeline_split: false
128
- activation_func:
129
- _call_: false
130
- _target_: torch.nn.functional.silu
131
- activation_func_clamp_value: null
132
- activation_func_fp8_input_store: false
133
- add_bias_linear: false
134
- add_qkv_bias: false
135
- apply_query_key_layer_scaling: false
136
- apply_residual_connection_post_layernorm: false
137
- apply_rope_fusion: true
138
- async_tensor_model_parallel_allreduce: false
139
- attention_backend:
140
- _args_:
141
- - 5
142
- _call_: true
143
- _target_: megatron.core.transformer.enums.AttnBackend
144
- attention_dropout: 0.0
145
- attention_output_gate: false
146
- attention_softmax_in_fp32: false
147
- autocast_dtype:
148
- _call_: false
149
- _target_: torch.bfloat16
150
- barrier_with_L1_time: true
151
- batch_p2p_comm: true
152
- batch_p2p_sync: true
153
- bf16: true
154
- bias_activation_fusion: false
155
- bias_dropout_fusion: false
156
- calculate_per_token_loss: true
157
- clone_scatter_output_in_embedding: true
158
- config_logger_dir: ''
159
- context_parallel_size: 2
160
- cp_comm_type: null
161
- cpu_offloading: false
162
- cpu_offloading_activations: true
163
- cpu_offloading_double_buffering: false
164
- cpu_offloading_num_layers: 0
165
- cpu_offloading_weights: false
166
- cross_entropy_fusion_impl: native
167
- cross_entropy_loss_fusion: true
168
- cuda_graph_impl: none
169
- cuda_graph_retain_backward_graph: false
170
- cuda_graph_scope: []
171
- cuda_graph_use_single_mempool: false
172
- cuda_graph_warmup_steps: 3
173
- deallocate_pipeline_outputs: true
174
- defer_embedding_wgrad_compute: false
175
- delay_wgrad_compute: false
176
- deterministic_mode: false
177
- disable_bf16_reduced_precision_matmul: false
178
- disable_parameter_transpose_cache: false
179
- distribute_saved_activations: null
180
- embedding_init_method:
181
- _args_: []
182
- _partial_: true
183
- _target_: torch.nn.init.normal_
184
- mean: 0.0
185
- std: 0.02
186
- embedding_init_method_std: 0.02
187
- enable_autocast: false
188
- enable_cuda_graph: false
189
- expert_model_parallel_size: 1
190
- expert_tensor_parallel_size: 1
191
- external_cuda_graph: false
192
- fallback_to_eager_attn: false
193
- ffn_hidden_size: 9728
194
- finalize_model_grads_func:
195
- _args_: []
196
- _partial_: true
197
- _target_: megatron.core.distributed.finalize_model_grads.finalize_model_grads
198
- pg_collection: null
199
- fine_grained_activation_offloading: false
200
- first_last_layers_bf16: false
201
- flash_decode: false
202
- fp16: false
203
- fp16_lm_cross_entropy: false
204
- fp32_residual_connection: false
205
- fp4: null
206
- fp4_param: false
207
- fp4_quantizer_factory: null
208
- fp4_recipe: nvfp4
209
- fp8: null
210
- fp8_amax_compute_algo: most_recent
211
- fp8_amax_history_len: 1
212
- fp8_dot_product_attention: false
213
- fp8_interval: 1
214
- fp8_margin: 0
215
- fp8_multi_head_attention: false
216
- fp8_param: false
217
- fp8_quantizer_factory: null
218
- fp8_recipe: delayed
219
- fp8_wgrad: true
220
- fused_single_qkv_rope: false
221
- gated_linear_unit: true
222
- generation_config: null
223
- glu_linear_offset: 0.0
224
- grad_scale_func:
225
- _call_: false
226
- _target_: megatron.core.optimizer.optimizer.MegatronOptimizer.scale_loss
227
- grad_sync_func:
228
- _call_: false
229
- _target_: megatron.core.distributed.distributed_data_parallel.DistributedDataParallel.start_grad_sync
230
- gradient_accumulation_fusion: false
231
- hetereogenous_dist_checkpoint: false
232
- heterogeneous_block_specs: false
233
- hf_model_id: ./models/Qwen-NVARC
234
- hidden_dropout: 0.0
235
- hidden_size: 2560
236
- hierarchical_context_parallel_sizes: null
237
- inference_rng_tracker: false
238
- inference_sampling_seed: 42
239
- init_method:
240
- _args_: []
241
- _partial_: true
242
- _target_: torch.nn.init.normal_
243
- mean: 0.0
244
- std: 0.02
245
- init_method_std: 0.02
246
- init_model_with_meta_device: false
247
- is_hybrid_model: false
248
- kv_channels: 128
249
- layernorm_epsilon: 1.0e-06
250
- layernorm_zero_centered_gamma: false
251
- linear_attention_freq: null
252
- linear_attention_type: null
253
- linear_conv_kernel_dim: null
254
- linear_key_head_dim: null
255
- linear_num_key_heads: null
256
- linear_num_value_heads: null
257
- linear_value_head_dim: null
258
- log_max_attention_logit: false
259
- make_vocab_size_divisible_by: 16
260
- mamba_head_dim: 64
261
- mamba_num_groups: 8
262
- mamba_num_heads: null
263
- mamba_state_dim: 128
264
- masked_softmax_fusion: true
265
- max_position_embeddings: 40960
266
- memory_efficient_layer_norm: false
267
- microbatch_group_size_per_vp_stage: 1
268
- min_offloaded_tensor_size: 1048576
269
- mlp_chunks_for_prefill: 1
270
- moe_apply_probs_on_input: false
271
- moe_aux_loss_coeff: 0.0
272
- moe_deepep_num_sms: 20
273
- moe_enable_deepep: false
274
- moe_expert_capacity_factor: null
275
- moe_extended_tp: false
276
- moe_ffn_hidden_size: null
277
- moe_flex_dispatcher_backend: deepep
278
- moe_grouped_gemm: false
279
- moe_hybridep_num_sms: 16
280
- moe_input_jitter_eps: null
281
- moe_layer_freq: 1
282
- moe_layer_recompute: false
283
- moe_pad_expert_input_to_capacity: false
284
- moe_per_layer_logging: false
285
- moe_permute_fusion: false
286
- moe_router_bias_update_rate: 0.0
287
- moe_router_dtype: fp64
288
- moe_router_enable_expert_bias: false
289
- moe_router_force_load_balancing: false
290
- moe_router_fusion: false
291
- moe_router_group_topk: null
292
- moe_router_load_balancing_type: none
293
- moe_router_num_groups: null
294
- moe_router_padding_for_fp8: false
295
- moe_router_padding_for_quantization: false
296
- moe_router_pre_softmax: false
297
- moe_router_score_function: softmax
298
- moe_router_topk: 2
299
- moe_router_topk_limited_devices: null
300
- moe_router_topk_scaling_factor: null
301
- moe_shared_expert_gate: false
302
- moe_shared_expert_intermediate_size: null
303
- moe_shared_expert_overlap: false
304
- moe_token_dispatcher_type: allgather
305
- moe_token_drop_policy: probs
306
- moe_token_dropping: false
307
- moe_use_legacy_grouped_gemm: false
308
- moe_z_loss_coeff: null
309
- mrope_section: null
310
- mtp_enabled: false
311
- mtp_loss_scaling_factor: null
312
- mtp_num_layers: null
313
- mtp_standalone: false
314
- multi_latent_attention: false
315
- no_rope_freq: null
316
- no_sync_func:
317
- _call_: false
318
- _target_: megatron.core.distributed.distributed_data_parallel.DistributedDataParallel.no_sync
319
- normalization: RMSNorm
320
- num_attention_heads: 32
321
- num_layers: 36
322
- num_layers_at_end_in_bf16: 1
323
- num_layers_at_start_in_bf16: 1
324
- num_layers_in_first_pipeline_stage: null
325
- num_layers_in_last_pipeline_stage: null
326
- num_microbatches_with_partial_activation_checkpoints: null
327
- num_moe_experts: null
328
- num_query_groups: 8
329
- offload_modules: null
330
- output_layer_init_method:
331
- _args_: []
332
- _partial_: true
333
- _target_: torch.nn.init.normal_
334
- mean: 0.0
335
- std: 0.0023570226039551587
336
- overlap_moe_expert_parallel_comm: false
337
- overlap_p2p_comm: false
338
- overlap_p2p_comm_warmup_flush: false
339
- parallel_output: true
340
- param_sync_func: null
341
- params_dtype:
342
- _call_: false
343
- _target_: torch.bfloat16
344
- perform_initialization: true
345
- persist_layer_norm: false
346
- pipeline_dtype:
347
- _call_: false
348
- _target_: torch.bfloat16
349
- pipeline_model_parallel_comm_backend: null
350
- pipeline_model_parallel_layout: null
351
- pipeline_model_parallel_size: 1
352
- position_embedding_type: rope
353
- qk_clip: false
354
- qk_clip_alpha: 0.5
355
- qk_clip_threshold: 100
356
- qk_layernorm: true
357
- quant_recipe: null
358
- recompute_granularity: full
359
- recompute_method: uniform
360
- recompute_modules:
361
- - core_attn
362
- recompute_num_layers: 1
363
- restore_modelopt_state: false
364
- rotary_base: 5000000
365
- rotary_interleaved: false
366
- rotary_percent: 1.0
367
- scatter_embedding_sequence_parallel: true
368
- seq_len_interpolation_factor: null
369
- seq_length: 262144
370
- sequence_parallel: false
371
- share_embeddings_and_output_weights: true
372
- should_pad_vocab: false
373
- softmax_scale: null
374
- softmax_type: vanilla
375
- symmetric_ar_type: null
376
- tensor_model_parallel_size: 1
377
- test_mode: false
378
- timers: null
379
- tp_comm_atomic_ag: false
380
- tp_comm_atomic_rs: false
381
- tp_comm_bootstrap_backend: nccl
382
- tp_comm_bulk_dgrad: true
383
- tp_comm_bulk_wgrad: true
384
- tp_comm_overlap: false
385
- tp_comm_overlap_ag: true
386
- tp_comm_overlap_cfg: null
387
- tp_comm_overlap_disable_fc1: false
388
- tp_comm_overlap_disable_qkv: false
389
- tp_comm_overlap_rs: true
390
- tp_comm_overlap_rs_dgrad: false
391
- tp_comm_split_ag: true
392
- tp_comm_split_rs: true
393
- tp_only_amax_red: false
394
- transformer_impl: transformer_engine
395
- transformer_layer_spec:
396
- _call_: false
397
- _target_: megatron.bridge.models.gpt_provider.default_layer_spec
398
- use_cpu_initialization: false
399
- use_fused_weighted_squared_relu: false
400
- use_kitchen: false
401
- use_mamba_mem_eff_path: true
402
- use_ring_exchange_p2p: false
403
- use_te_activation_func: false
404
- use_te_rng_tracker: false
405
- use_transformer_engine_full_layer_spec: false
406
- use_transformer_engine_op_fuser: false
407
- variable_seq_lengths: false
408
- virtual_pipeline_model_parallel_size: null
409
- vocab_size: 16
410
- wgrad_deferral_limit: 0
411
- window_attn_skip_freq: null
412
- window_size: null
413
- nvrx_straggler: null
414
- optimizer:
415
- _target_: megatron.bridge.training.config.OptimizerConfig
416
- adam_beta1: 0.9
417
- adam_beta2: 0.98
418
- adam_eps: 1.0e-08
419
- barrier_with_L1_time: false
420
- bf16: true
421
- clip_grad: 0.5
422
- config_logger_dir: ''
423
- decoupled_lr: null
424
- decoupled_min_lr: null
425
- decoupled_weight_decay: true
426
- exp_avg_dtype:
427
- _call_: false
428
- _target_: torch.float32
429
- exp_avg_sq_dtype:
430
- _call_: false
431
- _target_: torch.float32
432
- fp16: false
433
- fp8_recipe: null
434
- hysteresis: 2
435
- initial_loss_scale: 4294967296
436
- log_num_zeros_in_grad: false
437
- loss_scale: null
438
- loss_scale_window: 1000
439
- lr: 0.0001
440
- main_grads_dtype:
441
- _call_: false
442
- _target_: torch.float32
443
- main_params_dtype:
444
- _call_: false
445
- _target_: torch.float32
446
- min_loss_scale: 1.0
447
- min_lr: 1.0e-07
448
- muon_extra_scale_factor: 1.0
449
- muon_fp32_matmul_prec: medium
450
- muon_momentum: 0.95
451
- muon_num_ns_steps: 5
452
- muon_scale_mode: spectral
453
- muon_split_qkv: true
454
- muon_tp_mode: blockwise
455
- muon_use_nesterov: false
456
- optimizer: adam
457
- optimizer_cpu_offload: false
458
- optimizer_offload_fraction: 0.0
459
- overlap_cpu_optimizer_d2h_h2d: false
460
- overlap_param_gather: false
461
- overlap_param_gather_with_optimizer_step: false
462
- params_dtype: bfloat16
463
- pin_cpu_grads: true
464
- pin_cpu_params: true
465
- reuse_grad_buf_for_mxfp8_param_ag: false
466
- sgd_momentum: 0.9
467
- store_param_remainders: true
468
- timers: null
469
- use_distributed_optimizer: true
470
- use_precision_aware_optimizer: false
471
- use_torch_optimizer_for_cpu_offload: false
472
- weight_decay: 0.1
473
- peft: null
474
- profiling:
475
- _target_: megatron.bridge.training.config.ProfilingConfig
476
- memory_snapshot_path: snapshot.pickle
477
- nvtx_ranges: false
478
- profile_ranks:
479
- - 0
480
- profile_step_end: 12
481
- profile_step_start: 10
482
- record_memory_history: false
483
- record_shapes: false
484
- use_nsys_profiler: false
485
- use_pytorch_profiler: false
486
- rerun_state_machine:
487
- _target_: megatron.bridge.training.config.RerunStateMachineConfig
488
- check_for_nan_in_loss: true
489
- check_for_spiky_loss: false
490
- error_injection_rate: 0
491
- error_injection_type: transient_error
492
- rerun_mode: disabled
493
- rng:
494
- _target_: megatron.bridge.training.config.RNGConfig
495
- data_parallel_random_init: false
496
- inference_rng_tracker: false
497
- seed: 1234
498
- te_rng_tracker: false
499
- scheduler:
500
- _target_: megatron.bridge.training.config.SchedulerConfig
501
- end_weight_decay: 0.1
502
- lr_decay_iters: 12716
503
- lr_decay_samples: null
504
- lr_decay_steps: 3255296
505
- lr_decay_style: linear
506
- lr_warmup_fraction: null
507
- lr_warmup_init: 1.0e-06
508
- lr_warmup_iters: 200
509
- lr_warmup_samples: 0
510
- lr_warmup_steps: 51200
511
- lr_wsd_decay_iters: null
512
- lr_wsd_decay_samples: null
513
- lr_wsd_decay_style: exponential
514
- no_weight_decay_cond_type: null
515
- override_opt_param_scheduler: false
516
- start_weight_decay: 0.1
517
- use_checkpoint_opt_param_scheduler: false
518
- wd_incr_steps: 1528832
519
- weight_decay_incr_style: constant
520
- wsd_decay_steps: null
521
- straggler: null
522
- tensor_inspect: null
523
- tokenizer:
524
- _target_: megatron.bridge.training.tokenizers.config.TokenizerConfig
525
- hf_tokenizer_kwargs: {}
526
- image_tag_type: null
527
- merge_file: null
528
- special_tokens: null
529
- tiktoken_num_special_tokens: 1000
530
- tiktoken_pattern: null
531
- tiktoken_special_tokens: null
532
- tokenizer_model: ./models/Qwen-NVARC
533
- tokenizer_prompt_format: null
534
- tokenizer_type: HuggingFaceTokenizer
535
- vocab_extra_ids: 0
536
- vocab_file: null
537
- vocab_size: null
538
- train:
539
- _target_: megatron.bridge.training.config.TrainingConfig
540
- check_weight_hash_across_dp_replicas_interval: null
541
- decrease_batch_size_if_needed: false
542
- empty_unused_memory_level: 0
543
- eval_interval: 1000
544
- eval_iters: 100
545
- exit_duration_in_mins: null
546
- exit_interval: null
547
- exit_signal:
548
- _args_:
549
- - 15
550
- _call_: true
551
- _target_: signal.Signals
552
- exit_signal_handler: false
553
- exit_signal_handler_for_dataloader: false
554
- global_batch_size: 256
555
- iterations_to_skip: []
556
- manual_gc: false
557
- manual_gc_eval: true
558
- manual_gc_interval: 0
559
- micro_batch_size: 1
560
- rampup_batch_size: null
561
- skip_train: false
562
- train_iters: 5972
563
- train_samples: null
564
- train_sync_interval: null
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
step_5400/policy/weights/iter_0000000/train_state.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9077193280ecd59384d5ae94d97a7943638810502f2ce08a7617aeb20b3586d2
3
- size 3461
 
 
 
 
step_5400/policy/weights/latest_checkpointed_iteration.txt DELETED
@@ -1 +0,0 @@
1
- 0
 
 
step_5400/policy/weights/latest_train_state.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9077193280ecd59384d5ae94d97a7943638810502f2ce08a7617aeb20b3586d2
3
- size 3461
 
 
 
 
step_5400/train_dataloader.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:99b28546a485528f6242d1b9dcf951cc95a6af0ca81a13ded15a567a8c9d2f7f
3
- size 7336
 
 
 
 
step_5400/training_info.json DELETED
@@ -1 +0,0 @@
1
- {"epoch": 0, "step": 5400, "total_steps": 5400, "consumed_samples": 1382400, "total_valid_tokens": 1568487826.0, "val:val_loss": 0.14914798736572266}