iamPi committed on
Commit 76f8ca1 · verified · 1 parent: e395647

Add files using upload-large-folder tool

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the complete change set.
Files changed (50)
  1. .gitattributes +10 -0
  2. step_5400/config.yaml +207 -0
  3. step_5400/policy/weights/iter_0000000/.metadata +3 -0
  4. step_5400/policy/weights/iter_0000000/__0_0.distcp +3 -0
  5. step_5400/policy/weights/iter_0000000/__1_0.distcp +3 -0
  6. step_5400/policy/weights/iter_0000000/common.pt +3 -0
  7. step_5400/policy/weights/iter_0000000/metadata.json +1 -0
  8. step_5400/policy/weights/iter_0000000/modelopt_run_config.yaml +203 -0
  9. step_5400/policy/weights/iter_0000000/run_config.yaml +564 -0
  10. step_5400/policy/weights/iter_0000000/train_state.pt +3 -0
  11. step_5400/policy/weights/latest_checkpointed_iteration.txt +1 -0
  12. step_5400/policy/weights/latest_train_state.pt +3 -0
  13. step_5400/train_dataloader.pt +3 -0
  14. step_5400/training_info.json +1 -0
  15. step_5600/config.yaml +207 -0
  16. step_5600/policy/weights/iter_0000000/.metadata +3 -0
  17. step_5600/policy/weights/iter_0000000/common.pt +3 -0
  18. step_5600/policy/weights/iter_0000000/metadata.json +1 -0
  19. step_5600/policy/weights/iter_0000000/modelopt_run_config.yaml +203 -0
  20. step_5600/policy/weights/iter_0000000/run_config.yaml +564 -0
  21. step_5600/policy/weights/iter_0000000/train_state.pt +3 -0
  22. step_5600/policy/weights/latest_checkpointed_iteration.txt +1 -0
  23. step_5600/policy/weights/latest_train_state.pt +3 -0
  24. step_5600/train_dataloader.pt +3 -0
  25. step_5600/training_info.json +1 -0
  26. step_5800/config.yaml +207 -0
  27. step_5800/policy/weights/iter_0000000/.metadata +3 -0
  28. step_5800/policy/weights/iter_0000000/__0_0.distcp +3 -0
  29. step_5800/policy/weights/iter_0000000/__1_0.distcp +3 -0
  30. step_5800/policy/weights/iter_0000000/common.pt +3 -0
  31. step_5800/policy/weights/iter_0000000/metadata.json +1 -0
  32. step_5800/policy/weights/iter_0000000/modelopt_run_config.yaml +203 -0
  33. step_5800/policy/weights/iter_0000000/run_config.yaml +564 -0
  34. step_5800/policy/weights/iter_0000000/train_state.pt +3 -0
  35. step_5800/policy/weights/latest_checkpointed_iteration.txt +1 -0
  36. step_5800/policy/weights/latest_train_state.pt +3 -0
  37. step_5800/train_dataloader.pt +3 -0
  38. step_5800/training_info.json +1 -0
  39. step_5972/config.yaml +207 -0
  40. step_5972/policy/weights/iter_0000000/.metadata +3 -0
  41. step_5972/policy/weights/iter_0000000/__0_1.distcp +3 -0
  42. step_5972/policy/weights/iter_0000000/__1_1.distcp +3 -0
  43. step_5972/policy/weights/iter_0000000/common.pt +3 -0
  44. step_5972/policy/weights/iter_0000000/metadata.json +1 -0
  45. step_5972/policy/weights/iter_0000000/modelopt_run_config.yaml +203 -0
  46. step_5972/policy/weights/iter_0000000/run_config.yaml +564 -0
  47. step_5972/policy/weights/iter_0000000/train_state.pt +3 -0
  48. step_5972/policy/weights/latest_checkpointed_iteration.txt +1 -0
  49. step_5972/policy/weights/latest_train_state.pt +3 -0
  50. step_5972/train_dataloader.pt +3 -0
.gitattributes CHANGED
@@ -33,3 +33,13 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ step_5400/policy/weights/iter_0000000/.metadata filter=lfs diff=lfs merge=lfs -text
+ step_5972/policy/weights/iter_0000000/.metadata filter=lfs diff=lfs merge=lfs -text
+ step_5800/policy/weights/iter_0000000/.metadata filter=lfs diff=lfs merge=lfs -text
+ step_5600/policy/weights/iter_0000000/.metadata filter=lfs diff=lfs merge=lfs -text
+ step_5400/policy/weights/iter_0000000/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
+ step_5800/policy/weights/iter_0000000/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
+ step_5972/policy/weights/iter_0000000/__0_1.distcp filter=lfs diff=lfs merge=lfs -text
+ step_5400/policy/weights/iter_0000000/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
+ step_5972/policy/weights/iter_0000000/__1_1.distcp filter=lfs diff=lfs merge=lfs -text
+ step_5800/policy/weights/iter_0000000/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
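These new `.gitattributes` rules route the distributed-checkpoint shards and `.metadata` index files through Git LFS, so the repository stores lightweight pointer stubs instead of multi-gigabyte binaries. A rough sketch of the matching logic follows; note that real gitattributes pattern matching is not identical to Python's `fnmatch` (notably around `**` and directory semantics), so this is only an approximation, and `tracked_by_lfs` is a hypothetical helper:

```python
from fnmatch import fnmatch

# A few patterns copied from the .gitattributes additions above plus two
# of the pre-existing wildcard rules; this list is illustrative only.
lfs_patterns = [
    "step_5400/policy/weights/iter_0000000/.metadata",
    "step_5400/policy/weights/iter_0000000/__0_0.distcp",
    "*.zip",
    "*tfevents*",
]

def tracked_by_lfs(path: str) -> bool:
    # A path goes through LFS if any pattern matches it.
    return any(fnmatch(path, pat) for pat in lfs_patterns)

print(tracked_by_lfs("step_5400/policy/weights/iter_0000000/.metadata"))  # True
print(tracked_by_lfs("step_5400/config.yaml"))                            # False
```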
step_5400/config.yaml ADDED
@@ -0,0 +1,207 @@
+ checkpointing:
+   checkpoint_dir: results/qwen3_4b_sft
+   checkpoint_must_save_by: null
+   enabled: true
+   higher_is_better: false
+   keep_top_k: 3
+   metric_name: val:val_loss
+   save_period: 200
+ cluster:
+   gpus_per_node: 2
+   num_nodes: 1
+ data:
+   num_workers: 4
+   shuffle: true
+   train_dataset_path:
+   - ./data/hones
+   val_dataset_path: ./data/arc2_evaluation6
+ logger:
+   gpu_monitoring:
+     collection_interval: 10
+     flush_interval: 10
+   log_dir: logs/exp_019
+   mlflow_enabled: false
+   monitor_gpus: false
+   swanlab_enabled: false
+   tensorboard_enabled: false
+   wandb:
+     name: qwen3_4b_sft
+     project: arc2
+   wandb_enabled: true
+ policy:
+   activation_checkpointing_enabled: false
+   attn_implementation: flash_attention_2
+   dtensor_cfg:
+     enabled: false
+   dynamic_batching:
+     enabled: false
+   fsdp_offload_enabled: false
+   make_sequence_length_divisible_by: 64
+   max_grad_norm: null
+   megatron_cfg:
+     activation_checkpointing: true
+     apply_rope_fusion: true
+     bias_activation_fusion: false
+     context_parallel_size: 2
+     distributed_data_parallel_config:
+       average_in_collective: true
+       data_parallel_sharding_strategy: optim_grads_params
+       grad_reduce_in_fp32: true
+       overlap_grad_reduce: true
+       overlap_param_gather: true
+     empty_unused_memory_level: 1
+     enabled: true
+     env_vars:
+       AWS_OFI_NCCL_VERSION: 1.14.0
+       BASH_ENV: /etc/bash.bashrc
+       CAL_VERSION: 0.4.4.50
+       CUBLASMP_VERSION: 0.4.0.789
+       CUBLAS_VERSION: 12.9.0.13
+       CUDA_ARCH_LIST: 7.5 8.0 8.6 9.0 10.0 12.0
+       CUDA_DRIVER_VERSION: 575.51.03
+       CUDA_VERSION: 12.9.0.043
+       CUDA_VISIBLE_DEVICES: 6,7
+       CUDNN_FRONTEND_VERSION: 1.11.0
+       CUDNN_VERSION: 9.10.1.4
+       CUFFT_VERSION: 11.4.0.6
+       CUFILE_VERSION: 1.14.0.30
+       CURAND_VERSION: 10.3.10.19
+       CUSOLVER_VERSION: 11.7.4.40
+       CUSPARSELT_VERSION: 0.7.1.0
+       CUSPARSE_VERSION: 12.5.9.5
+       DALI_BUILD: ''
+       DALI_URL_SUFFIX: '120'
+       DALI_VERSION: 1.49.0
+       EFA_VERSION: 1.38.1
+       ENV: /etc/shinit_v2
+       GDRCOPY_VERSION: 2.4.4
+       HOME: /root
+       HOSTNAME: e6ad2ac15863
+       HPCX_VERSION: '2.23'
+       KMP_DUPLICATE_LIB_OK: 'True'
+       KMP_INIT_AT_FORK: 'FALSE'
+       LC_CTYPE: C.UTF-8
+       LD_LIBRARY_PATH: /usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
+       LESSCLOSE: /usr/bin/lesspipe %s %s
+       LESSOPEN: '| /usr/bin/lesspipe %s'
+       LIBRARY_PATH: '/usr/local/cuda/lib64/stubs:'
+       LS_COLORS: 'rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=00:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.avif=01;35:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.webp=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:*~=00;90:*#=00;90:*.bak=00;90:*.crdownload=00;90:*.dpkg-dist=00;90:*.dpkg-new=00;90:*.dpkg-old=00;90:*.dpkg-tmp=00;90:*.old=00;90:*.orig=00;90:*.part=00;90:*.rej=00;90:*.rpmnew=00;90:*.rpmorig=00;90:*.rpmsave=00;90:*.swp=00;90:*.tmp=00;90:*.ucf-dist=00;90:*.ucf-new=00;90:*.ucf-old=00;90:'
+       MODEL_OPT_VERSION: 0.27.1
+       MOFED_VERSION: 5.4-rdmacore50.0
+       NCCL_NET_PLUGIN: aws-ofi
+       NCCL_TUNER_PLUGIN: aws-ofi
+       NCCL_VERSION: 2.26.5
+       NEMO_RL_COMMIT: 90ecaaccdf66fb92f814b2325db81a0105ed933a
+       NEMO_RL_VENV_DIR: /opt/ray_venvs
+       NPP_VERSION: 12.4.0.27
+       NRL_CONTAINER: '1'
+       NRL_MEGATRON_CHECKPOINT_DIR: ./results/megatron
+       NSIGHT_COMPUTE_VERSION: 2025.2.0.11
+       NSIGHT_SYSTEMS_VERSION: 2025.3.1.90
+       NVIDIA_BUILD_ID: '244212578'
+       NVIDIA_BUILD_REF: 6ba29f15920b12095387b8c82cea05ac3d6d9732
+       NVIDIA_DRIVER_CAPABILITIES: compute,utility,video
+       NVIDIA_PRODUCT_NAME: CUDA
+       NVIDIA_REQUIRE_CUDA: cuda>=9.0
+       NVIDIA_REQUIRE_JETPACK_HOST_MOUNTS: ''
+       NVIDIA_VISIBLE_DEVICES: all
+       NVJITLINK_VERSION: 12.9.41
+       NVJPEG_VERSION: 12.4.0.16
+       NVSHMEM_VERSION: 3.2.5
+       OLDPWD: /workspace
+       OMPI_MCA_coll_hcoll_enable: '0'
+       OPAL_PREFIX: /opt/hpcx/ompi
+       OPENMPI_VERSION: 4.1.7
+       OPENUCX_VERSION: 1.19.0
+       PATH: /opt/nemo_rl_venv/bin:/opt/nemo_rl_venv/bin:/opt/nemo_rl_venv/bin:/root/.local/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/mpi/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/ucx/bin:/opt/amazon/efa/bin
+       POLYGRAPHY_VERSION: 0.49.20
+       PWD: /workspace/ARChitects
+       PYTHONBREAKPOINT: ray.util.rpdb._driver_set_trace
+       PYTHONPATH: '/workspace/NVARC/ARChitects/nemo-rl:'
+       PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True
+       RAY_CLIENT_MODE: '0'
+       RAY_ENABLE_UV_RUN_RUNTIME_ENV: '0'
+       RAY_USAGE_STATS_ENABLED: '0'
+       RDMACORE_VERSION: '50.0'
+       SHELL: /bin/bash
+       SHLVL: '2'
+       SWANLAB_API_HOST: https://api.swanlab.cn/api
+       SWANLAB_RUNTIME: user
+       SWANLAB_WEB_HOST: https://swanlab.cn
+       TERM: xterm
+       TORCH_CUDA_ARCH_LIST: '9.0'
+       TRANSFORMER_ENGINE_VERSION: '2.3'
+       TRTOSS_VERSION: ''
+       TRT_VERSION: 10.10.0.31
+       UV: /root/.local/bin/uv
+       UV_LINK_MODE: copy
+       UV_PROJECT_ENVIRONMENT: /opt/nemo_rl_venv
+       UV_RUN_RECURSION_DEPTH: '1'
+       VIRTUAL_ENV: /opt/nemo_rl_venv
+       VIRTUAL_ENV_PROMPT: nemo-rl
+       WANDB_SERVICE: 3-239782-unix-/tmp/wandb-239782-251940-2530276073/socket
+       _: /root/.local/bin/uv
+       _CUDA_COMPAT_PATH: /usr/local/cuda/compat
+       _CUDA_COMPAT_STATUS: System has unsupported display driver / cuda driver combination
+         (CUDA_ERROR_SYSTEM_DRIVER_MISMATCH) cuInit()=803
+       _MLFLOW_TELEMETRY_SESSION_ID: 8ac94e5a44a94a62a2cf3302ff4fa0c9
+     expert_model_parallel_size: 1
+     expert_tensor_parallel_size: 1
+     freeze_moe_router: true
+     moe_permute_fusion: false
+     moe_router_bias_update_rate: 0.0
+     moe_router_dtype: fp64
+     moe_router_load_balancing_type: none
+     num_layers_in_first_pipeline_stage: null
+     num_layers_in_last_pipeline_stage: null
+     optimizer:
+       adam_beta1: 0.9
+       adam_beta2: 0.98
+       adam_eps: 1.0e-08
+       bf16: true
+       clip_grad: 0.5
+       fp16: false
+       lr: 0.0001
+       min_lr: 1.0e-07
+       optimizer: adam
+       optimizer_cpu_offload: false
+       optimizer_offload_fraction: 0.0
+       params_dtype: bfloat16
+       sgd_momentum: 0.9
+       use_distributed_optimizer: true
+       use_precision_aware_optimizer: false
+       weight_decay: 0.1
+     pipeline_dtype: bfloat16
+     pipeline_model_parallel_size: 1
+     scheduler:
+       end_weight_decay: 0.1
+       lr_decay_iters: 12716
+       lr_decay_style: linear
+       lr_warmup_init: 1.0e-06
+       lr_warmup_iters: 200
+       start_weight_decay: 0.1
+       weight_decay_incr_style: constant
+     sequence_parallel: false
+     tensor_model_parallel_size: 1
+     train_iters: 5972
+   model_name: ./models/Qwen-NVARC
+   offload_optimizer_for_logprob: false
+   precision: bfloat16
+   sequence_packing:
+     algorithm: modified_first_fit_decreasing
+     enabled: true
+     sequence_length_round: 64
+     train_mb_tokens: 128000
+   tokenizer:
+     name: ./models/Qwen-NVARC
+   train_global_batch_size: 256
+   train_micro_batch_size: 1
+ sft:
+   max_num_epochs: 1
+   max_num_steps: 6400
+   seed: 24
+   val_at_start: true
+   val_batches: 200
+   val_global_batch_size: 256
+   val_micro_batch_size: 1
+   val_period: 200
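The `checkpointing` and `megatron_cfg` blocks explain the step directories in this commit: with `save_period: 200` and `train_iters: 5972`, checkpoints land on multiples of 200 plus a final save at the last iteration, which is consistent with `step_5400`/`step_5600`/`step_5800`/`step_5972`. A minimal sketch, assuming the repo is checked out locally and PyYAML is available:

```python
import yaml

# Load the per-step config shipped with each checkpoint directory.
with open("step_5400/config.yaml") as f:
    cfg = yaml.safe_load(f)

save_period = cfg["checkpointing"]["save_period"]            # 200
train_iters = cfg["policy"]["megatron_cfg"]["train_iters"]   # 5972

# Periodic saves every `save_period` steps, plus a final save at the last
# iteration (an assumption inferred from the directories in this commit,
# not from reading the trainer's source).
steps = list(range(save_period, train_iters, save_period)) + [train_iters]
print(steps[-4:])  # [5400, 5600, 5800, 5972]
```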
step_5400/policy/weights/iter_0000000/.metadata ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:601958148c0276510ee83ae2c089910f685c2aa6fde4b6f5e668b28ed06ec567
+ size 329201
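Every binary ADDED in this commit appears as a three-line Git LFS pointer stub like the one above (the `oid` is the SHA-256 of the payload, `size` its byte count), not the payload itself. A minimal parser sketch for that format:

```python
# Parse a Git LFS pointer stub into its components.
def parse_lfs_pointer(text: str) -> dict:
    # Each line is "key value"; split on the first space only.
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    assert fields["version"].startswith("https://git-lfs.github.com/spec/")
    algo, digest = fields["oid"].split(":", 1)
    return {"algo": algo, "digest": digest, "size": int(fields["size"])}

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:601958148c0276510ee83ae2c089910f685c2aa6fde4b6f5e668b28ed06ec567
size 329201"""
print(parse_lfs_pointer(pointer))
# {'algo': 'sha256', 'digest': '6019...', 'size': 329201}
```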
step_5400/policy/weights/iter_0000000/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4f4f63a1df595166115fa2fd03a1601a3ae7b6c72151956a0f966332b260176d
+ size 12718332319
step_5400/policy/weights/iter_0000000/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cdbfe2d6c54d823e7ef9c6bdfb156183fa5d437043a001c83847514272046f8b
+ size 12717813616
step_5400/policy/weights/iter_0000000/common.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6cf17a4bbf5fb940ff8d1e669f26a4e277411e9796b4920f5cd867e4401db145
+ size 1767
step_5400/policy/weights/iter_0000000/metadata.json ADDED
@@ -0,0 +1 @@
+ {"sharded_backend": "torch_dist", "sharded_backend_version": 1, "common_backend": "torch", "common_backend_version": 1}
step_5400/policy/weights/iter_0000000/modelopt_run_config.yaml ADDED
@@ -0,0 +1,203 @@
+ activation_func: <function silu at 0x7d0251c6b420>
+ activation_func_clamp_value: None
+ add_bias_linear: false
+ add_qkv_bias: false
+ apply_query_key_layer_scaling: false
+ apply_residual_connection_post_layernorm: false
+ apply_rope_fusion: true
+ attention_backend: AttnBackend.auto
+ attention_dropout: '0.0'
+ attention_output_gate: false
+ attention_softmax_in_fp32: false
+ autocast_dtype: torch.bfloat16
+ barrier_with_L1_time: true
+ bf16: true
+ bias_activation_fusion: false
+ bias_dropout_fusion: false
+ calculate_per_token_loss: true
+ clone_scatter_output_in_embedding: true
+ config_logger_dir: ''
+ cross_entropy_fusion_impl: native
+ cross_entropy_loss_fusion: true
+ defer_embedding_wgrad_compute: false
+ delay_wgrad_compute: false
+ deterministic_mode: false
+ disable_bf16_reduced_precision_matmul: false
+ disable_parameter_transpose_cache: false
+ distribute_saved_activations: None
+ enable_autocast: false
+ fallback_to_eager_attn: false
+ ffn_hidden_size: 9728
+ finalize_model_grads_func: functools.partial(<function finalize_model_grads at 0x7cfab95b9e40>,
+   pg_collection=None)
+ fine_grained_activation_offloading: false
+ first_last_layers_bf16: false
+ flash_decode: false
+ fp16: false
+ fp16_lm_cross_entropy: false
+ fp32_residual_connection: false
+ fused_single_qkv_rope: false
+ gated_linear_unit: true
+ generation_config: None
+ glu_linear_offset: '0.0'
+ grad_scale_func: <bound method MegatronOptimizer.scale_loss of <megatron.core.optimizer.optimizer.ChainedOptimizer
+   object at 0x7cf9d413cd70>>
+ grad_sync_func: "<bound method DistributedDataParallel.start_grad_sync of DistributedDataParallel(\n\
+   \ (module): CustomFloat16Module(\n (module): GPTModel(\n (embedding): LanguageModelEmbedding(\n\
+   \ (word_embeddings): VocabParallelEmbedding()\n (embedding_dropout):\
+   \ Dropout(p=0.0, inplace=False)\n )\n (rotary_pos_emb): RotaryEmbedding()\n\
+   \ (decoder): TransformerBlock(\n (layers): ModuleList(\n (0-35):\
+   \ 36 x TransformerLayer(\n (input_layernorm): IdentityOp()\n \
+   \ (self_attention): SelfAttention(\n (core_attention): TEDotProductAttention(\n\
+   \ (flash_attention): FlashAttention()\n (fused_attention):\
+   \ FusedAttention()\n (unfused_attention): UnfusedDotProductAttention(\n\
+   \ (scale_mask_softmax): FusedScaleMaskSoftmax()\n \
+   \ (attention_dropout): Dropout(p=0.0, inplace=False)\n )\n \
+   \ )\n (linear_proj): TERowParallelLinear(in_features=4096,\
+   \ out_features=2560, bias=False, TP=1)\n (linear_qkv): TELayerNormColumnParallelLinear(in_features=2560,\
+   \ out_features=6144, bias=False, TP=1)\n (q_layernorm): RMSNorm()\n\
+   \ (k_layernorm): RMSNorm()\n )\n (pre_cross_attn_layernorm):\
+   \ IdentityOp()\n (cross_attention): IdentityOp()\n (cross_attn_bda):\
+   \ IdentityFuncOp()\n (pre_mlp_layernorm): IdentityOp()\n (mlp):\
+   \ MLP(\n (linear_fc1): TELayerNormColumnParallelLinear(in_features=2560,\
+   \ out_features=19456, bias=False, TP=1)\n (linear_fc2): TERowParallelLinear(in_features=9728,\
+   \ out_features=2560, bias=False, TP=1)\n )\n )\n )\n\
+   \ (final_layernorm): RMSNorm()\n )\n (output_layer): ColumnParallelLinear(in_features=2560,\
+   \ out_features=16, bias=False, TP=1)\n )\n )\n)>"
+ gradient_accumulation_fusion: false
+ hetereogenous_dist_checkpoint: false
+ heterogeneous_block_specs: false
+ hf_model_id: ./models/Qwen-NVARC
+ hidden_dropout: '0.0'
+ hidden_size: 2560
+ is_hybrid_model: false
+ kv_channels: 128
+ layernorm_epsilon: 1e-06
+ layernorm_zero_centered_gamma: false
+ linear_attention_freq: None
+ linear_attention_type: None
+ linear_conv_kernel_dim: None
+ linear_key_head_dim: None
+ linear_num_key_heads: None
+ linear_num_value_heads: None
+ linear_value_head_dim: None
+ log_max_attention_logit: false
+ make_vocab_size_divisible_by: 16
+ mamba_head_dim: 64
+ mamba_num_groups: 8
+ mamba_num_heads: None
+ mamba_state_dim: 128
+ masked_softmax_fusion: true
+ max_position_embeddings: 40960
+ memory_efficient_layer_norm: false
+ min_offloaded_tensor_size: 1048576
+ mlp_chunks_for_prefill: 1
+ moe_apply_probs_on_input: false
+ moe_aux_loss_coeff: '0.0'
+ moe_deepep_num_sms: 20
+ moe_enable_deepep: false
+ moe_expert_capacity_factor: None
+ moe_extended_tp: false
+ moe_ffn_hidden_size: None
+ moe_flex_dispatcher_backend: deepep
+ moe_grouped_gemm: false
+ moe_hybridep_num_sms: 16
+ moe_input_jitter_eps: None
+ moe_layer_freq: 1
+ moe_pad_expert_input_to_capacity: false
+ moe_per_layer_logging: false
+ moe_permute_fusion: false
+ moe_router_bias_update_rate: '0.0'
+ moe_router_dtype: fp64
+ moe_router_enable_expert_bias: false
+ moe_router_force_load_balancing: false
+ moe_router_fusion: false
+ moe_router_group_topk: None
+ moe_router_load_balancing_type: none
+ moe_router_num_groups: None
+ moe_router_padding_for_quantization: false
+ moe_router_pre_softmax: false
+ moe_router_score_function: softmax
+ moe_router_topk: 2
+ moe_router_topk_limited_devices: None
+ moe_router_topk_scaling_factor: None
+ moe_shared_expert_gate: false
+ moe_shared_expert_intermediate_size: None
+ moe_shared_expert_overlap: false
+ moe_token_dispatcher_type: allgather
+ moe_token_drop_policy: probs
+ moe_token_dropping: false
+ moe_use_legacy_grouped_gemm: false
+ moe_z_loss_coeff: None
+ mrope_section: None
+ multi_latent_attention: false
+ no_rope_freq: None
+ no_sync_func: "<bound method DistributedDataParallel.no_sync of DistributedDataParallel(\n\
+   \ (module): CustomFloat16Module(\n (module): GPTModel(\n (embedding): LanguageModelEmbedding(\n\
+   \ (word_embeddings): VocabParallelEmbedding()\n (embedding_dropout):\
+   \ Dropout(p=0.0, inplace=False)\n )\n (rotary_pos_emb): RotaryEmbedding()\n\
+   \ (decoder): TransformerBlock(\n (layers): ModuleList(\n (0-35):\
+   \ 36 x TransformerLayer(\n (input_layernorm): IdentityOp()\n \
+   \ (self_attention): SelfAttention(\n (core_attention): TEDotProductAttention(\n\
+   \ (flash_attention): FlashAttention()\n (fused_attention):\
+   \ FusedAttention()\n (unfused_attention): UnfusedDotProductAttention(\n\
+   \ (scale_mask_softmax): FusedScaleMaskSoftmax()\n \
+   \ (attention_dropout): Dropout(p=0.0, inplace=False)\n )\n \
+   \ )\n (linear_proj): TERowParallelLinear(in_features=4096,\
+   \ out_features=2560, bias=False, TP=1)\n (linear_qkv): TELayerNormColumnParallelLinear(in_features=2560,\
+   \ out_features=6144, bias=False, TP=1)\n (q_layernorm): RMSNorm()\n\
+   \ (k_layernorm): RMSNorm()\n )\n (pre_cross_attn_layernorm):\
+   \ IdentityOp()\n (cross_attention): IdentityOp()\n (cross_attn_bda):\
+   \ IdentityFuncOp()\n (pre_mlp_layernorm): IdentityOp()\n (mlp):\
+   \ MLP(\n (linear_fc1): TELayerNormColumnParallelLinear(in_features=2560,\
+   \ out_features=19456, bias=False, TP=1)\n (linear_fc2): TERowParallelLinear(in_features=9728,\
+   \ out_features=2560, bias=False, TP=1)\n )\n )\n )\n\
+   \ (final_layernorm): RMSNorm()\n )\n (output_layer): ColumnParallelLinear(in_features=2560,\
+   \ out_features=16, bias=False, TP=1)\n )\n )\n)>"
+ normalization: RMSNorm
+ num_attention_heads: 32
+ num_layers: 36
+ num_layers_at_end_in_bf16: 1
+ num_layers_at_start_in_bf16: 1
+ num_moe_experts: None
+ num_query_groups: 8
+ nvidia_modelopt_version: 0.39.0
+ offload_modules: None
+ param_sync_func: None
+ params_dtype: torch.bfloat16
+ perform_initialization: true
+ persist_layer_norm: false
+ position_embedding_type: rope
+ qk_clip: false
+ qk_clip_alpha: '0.5'
+ qk_clip_threshold: 100
+ qk_layernorm: true
+ quant_recipe: None
+ restore_modelopt_state: false
+ rotary_base: 5000000
+ rotary_interleaved: false
+ rotary_percent: '1.0'
+ seq_len_interpolation_factor: None
+ seq_length: 262144
+ share_embeddings_and_output_weights: true
+ should_pad_vocab: false
+ softmax_scale: None
+ softmax_type: vanilla
+ symmetric_ar_type: None
+ test_mode: false
+ timers: None
+ transformer_impl: transformer_engine
+ transformer_layer_spec: <function default_layer_spec at 0x7cfa556ad440>
+ use_fused_weighted_squared_relu: false
+ use_kitchen: false
+ use_mamba_mem_eff_path: true
+ use_ring_exchange_p2p: false
+ use_te_activation_func: false
+ use_te_rng_tracker: false
+ use_transformer_engine_full_layer_spec: false
+ use_transformer_engine_op_fuser: false
+ variable_seq_lengths: false
+ vocab_size: 16
+ wgrad_deferral_limit: 0
+ window_attn_skip_freq: None
+ window_size: None
step_5400/policy/weights/iter_0000000/run_config.yaml ADDED
@@ -0,0 +1,564 @@
+ _target_: megatron.bridge.training.config.ConfigContainer
+ checkpoint:
+   _target_: megatron.bridge.training.config.CheckpointConfig
+   async_save: false
+   ckpt_assume_constant_structure: false
+   ckpt_convert_format: null
+   ckpt_convert_save: null
+   ckpt_format: torch_dist
+   ckpt_step: null
+   dist_ckpt_optim_fully_reshardable: false
+   dist_ckpt_save_pre_mcore_014: false
+   dist_ckpt_strictness: assume_ok_unexpected
+   distrib_optim_fully_reshardable_mem_efficient: false
+   exit_on_missing_checkpoint: false
+   finetune: true
+   fully_parallel_load: true
+   fully_parallel_save: true
+   load: null
+   load_main_params_from_ckpt: false
+   load_optim: true
+   load_rng: false
+   most_recent_k: -1
+   non_persistent_ckpt_type: null
+   non_persistent_global_ckpt_dir: null
+   non_persistent_local_ckpt_algo: fully_parallel
+   non_persistent_local_ckpt_dir: null
+   non_persistent_save_interval: null
+   pretrained_checkpoint: ./results/megatron/model_._models_Qwen-NVARC
+   replication: false
+   replication_factor: 2
+   replication_jump: null
+   save: /workspace/ARChitects/results/qwen3_4b_sft/tmp_step_5400/policy/weights
+   save_interval: 100
+   save_optim: true
+   save_rng: true
+   save_tokenizer_assets: true
+   strict_fsdp_dtensor_load: false
+   use_checkpoint_args: false
+   use_persistent_ckpt_worker: true
+ comm_overlap: null
+ dataset: null
+ ddp:
+   _target_: megatron.bridge.training.config.DistributedDataParallelConfig
+   align_param_gather: false
+   average_in_collective: false
+   bucket_size: 40000000
+   check_for_large_grads: false
+   check_for_nan_in_grad: true
+   data_parallel_sharding_strategy: optim_grads_params
+   delay_wgrad_compute: false
+   disable_symmetric_registration: false
+   fp8_param_gather: false
+   fsdp_double_buffer: false
+   grad_reduce_in_fp32: true
+   gradient_reduce_div_fusion: true
+   keep_fp8_transpose_cache: false
+   nccl_ub: false
+   num_distributed_optimizer_instances: 1
+   outer_dp_sharding_strategy: no_shard
+   overlap_grad_reduce: true
+   overlap_param_gather: true
+   pad_buckets_for_high_nccl_busbw: false
+   preserve_fp32_weights: true
+   reduce_scatter_with_fp32_accumulation: false
+   reuse_grad_buf_for_mxfp8_param_ag: false
+   suggested_communication_unit_size: null
+   use_custom_fsdp: false
+   use_distributed_optimizer: true
+   use_megatron_fsdp: false
+ dist:
+   _target_: megatron.bridge.training.config.DistributedInitConfig
+   align_grad_reduce: true
+   disable_jit_fuser: false
+   distributed_backend: nccl
+   distributed_timeout_minutes: 10
+   distributed_timeout_seconds_after_init: null
+   enable_megatron_core_experimental: false
+   external_gpu_device_mapping: true
+   high_priority_stream_groups: null
+   lazy_init: false
+   local_rank: 0
+   nccl_communicator_config_path: null
+   sharp_enabled_group: null
+   use_gloo_process_groups: true
+   use_megatron_fsdp: false
+   use_sharp: false
+   use_torch_fsdp2: false
+   use_tp_pp_dp_mapping: false
+ ft: null
+ inprocess_restart: null
+ logger:
+   _target_: megatron.bridge.training.config.LoggerConfig
+   filter_warnings: true
+   log_energy: false
+   log_interval: 100
+   log_l2_norm_grad_to_tensorboard: false
+   log_loss_scale_to_tensorboard: true
+   log_memory_to_tensorboard: false
+   log_params_norm: false
+   log_progress: false
+   log_runtime_to_tensorboard: false
+   log_throughput: false
+   log_throughput_to_tensorboard: false
+   log_timers_to_tensorboard: false
+   log_validation_ppl_to_tensorboard: false
+   log_world_size_to_tensorboard: false
+   logging_level: 0
+   memory_keys: null
+   modules_to_filter: null
+   runtime_time_unit: hours
+   save_config_filepath: null
+   set_level_for_all_loggers: false
+   tensorboard_dir: null
+   tensorboard_log_interval: 1
+   tensorboard_queue_size: 1000
+   throughput_window_size: 100
+   timing_log_level: 0
+   timing_log_option: minmax
+   wandb_entity: null
+   wandb_exp_name: null
+   wandb_project: null
+   wandb_save_dir: null
+ mixed_precision: null
+ model:
+   _target_: megatron.bridge.models.qwen.qwen_provider.Qwen3ModelProvider
+   account_for_embedding_in_pipeline_split: false
+   account_for_loss_in_pipeline_split: false
+   activation_func:
+     _call_: false
+     _target_: torch.nn.functional.silu
+   activation_func_clamp_value: null
+   activation_func_fp8_input_store: false
+   add_bias_linear: false
+   add_qkv_bias: false
+   apply_query_key_layer_scaling: false
+   apply_residual_connection_post_layernorm: false
+   apply_rope_fusion: true
+   async_tensor_model_parallel_allreduce: false
+   attention_backend:
+     _args_:
+     - 5
+     _call_: true
+     _target_: megatron.core.transformer.enums.AttnBackend
+   attention_dropout: 0.0
+   attention_output_gate: false
+   attention_softmax_in_fp32: false
+   autocast_dtype:
+     _call_: false
+     _target_: torch.bfloat16
+   barrier_with_L1_time: true
+   batch_p2p_comm: true
+   batch_p2p_sync: true
+   bf16: true
+   bias_activation_fusion: false
+   bias_dropout_fusion: false
+   calculate_per_token_loss: true
+   clone_scatter_output_in_embedding: true
+   config_logger_dir: ''
+   context_parallel_size: 2
+   cp_comm_type: null
+   cpu_offloading: false
+   cpu_offloading_activations: true
+   cpu_offloading_double_buffering: false
+   cpu_offloading_num_layers: 0
+   cpu_offloading_weights: false
+   cross_entropy_fusion_impl: native
+   cross_entropy_loss_fusion: true
+   cuda_graph_impl: none
+   cuda_graph_retain_backward_graph: false
+   cuda_graph_scope: []
+   cuda_graph_use_single_mempool: false
+   cuda_graph_warmup_steps: 3
+   deallocate_pipeline_outputs: true
+   defer_embedding_wgrad_compute: false
+   delay_wgrad_compute: false
+   deterministic_mode: false
+   disable_bf16_reduced_precision_matmul: false
+   disable_parameter_transpose_cache: false
+   distribute_saved_activations: null
+   embedding_init_method:
+     _args_: []
+     _partial_: true
+     _target_: torch.nn.init.normal_
+     mean: 0.0
+     std: 0.02
+   embedding_init_method_std: 0.02
+   enable_autocast: false
+   enable_cuda_graph: false
+   expert_model_parallel_size: 1
+   expert_tensor_parallel_size: 1
+   external_cuda_graph: false
+   fallback_to_eager_attn: false
+   ffn_hidden_size: 9728
+   finalize_model_grads_func:
+     _args_: []
+     _partial_: true
+     _target_: megatron.core.distributed.finalize_model_grads.finalize_model_grads
+     pg_collection: null
+   fine_grained_activation_offloading: false
+   first_last_layers_bf16: false
+   flash_decode: false
+   fp16: false
+   fp16_lm_cross_entropy: false
+   fp32_residual_connection: false
+   fp4: null
+   fp4_param: false
+   fp4_quantizer_factory: null
+   fp4_recipe: nvfp4
+   fp8: null
+   fp8_amax_compute_algo: most_recent
+   fp8_amax_history_len: 1
+   fp8_dot_product_attention: false
+   fp8_interval: 1
+   fp8_margin: 0
+   fp8_multi_head_attention: false
+   fp8_param: false
+   fp8_quantizer_factory: null
+   fp8_recipe: delayed
+   fp8_wgrad: true
+   fused_single_qkv_rope: false
+   gated_linear_unit: true
+   generation_config: null
+   glu_linear_offset: 0.0
+   grad_scale_func:
+     _call_: false
+     _target_: megatron.core.optimizer.optimizer.MegatronOptimizer.scale_loss
+   grad_sync_func:
+     _call_: false
+     _target_: megatron.core.distributed.distributed_data_parallel.DistributedDataParallel.start_grad_sync
+   gradient_accumulation_fusion: false
+   hetereogenous_dist_checkpoint: false
+   heterogeneous_block_specs: false
+   hf_model_id: ./models/Qwen-NVARC
+   hidden_dropout: 0.0
+   hidden_size: 2560
+   hierarchical_context_parallel_sizes: null
+   inference_rng_tracker: false
+   inference_sampling_seed: 42
+   init_method:
+     _args_: []
+     _partial_: true
+     _target_: torch.nn.init.normal_
+     mean: 0.0
+     std: 0.02
+   init_method_std: 0.02
+   init_model_with_meta_device: false
+   is_hybrid_model: false
+   kv_channels: 128
+   layernorm_epsilon: 1.0e-06
+   layernorm_zero_centered_gamma: false
+   linear_attention_freq: null
+   linear_attention_type: null
+   linear_conv_kernel_dim: null
+   linear_key_head_dim: null
+   linear_num_key_heads: null
+   linear_num_value_heads: null
+   linear_value_head_dim: null
+   log_max_attention_logit: false
+   make_vocab_size_divisible_by: 16
+   mamba_head_dim: 64
+   mamba_num_groups: 8
+   mamba_num_heads: null
+   mamba_state_dim: 128
+   masked_softmax_fusion: true
+   max_position_embeddings: 40960
+   memory_efficient_layer_norm: false
+   microbatch_group_size_per_vp_stage: 1
+   min_offloaded_tensor_size: 1048576
+   mlp_chunks_for_prefill: 1
+   moe_apply_probs_on_input: false
+   moe_aux_loss_coeff: 0.0
+   moe_deepep_num_sms: 20
+   moe_enable_deepep: false
+   moe_expert_capacity_factor: null
+   moe_extended_tp: false
+   moe_ffn_hidden_size: null
+   moe_flex_dispatcher_backend: deepep
+   moe_grouped_gemm: false
+   moe_hybridep_num_sms: 16
+   moe_input_jitter_eps: null
+   moe_layer_freq: 1
+   moe_layer_recompute: false
+   moe_pad_expert_input_to_capacity: false
+   moe_per_layer_logging: false
+   moe_permute_fusion: false
+   moe_router_bias_update_rate: 0.0
+   moe_router_dtype: fp64
+   moe_router_enable_expert_bias: false
+   moe_router_force_load_balancing: false
+   moe_router_fusion: false
+   moe_router_group_topk: null
+   moe_router_load_balancing_type: none
+   moe_router_num_groups: null
+   moe_router_padding_for_fp8: false
+   moe_router_padding_for_quantization: false
+   moe_router_pre_softmax: false
+   moe_router_score_function: softmax
+   moe_router_topk: 2
+   moe_router_topk_limited_devices: null
+   moe_router_topk_scaling_factor: null
+   moe_shared_expert_gate: false
+   moe_shared_expert_intermediate_size: null
+   moe_shared_expert_overlap: false
+   moe_token_dispatcher_type: allgather
+   moe_token_drop_policy: probs
+   moe_token_dropping: false
+   moe_use_legacy_grouped_gemm: false
+   moe_z_loss_coeff: null
+   mrope_section: null
+   mtp_enabled: false
+   mtp_loss_scaling_factor: null
+   mtp_num_layers: null
+   mtp_standalone: false
+   multi_latent_attention: false
+   no_rope_freq: null
+   no_sync_func:
+     _call_: false
+     _target_: megatron.core.distributed.distributed_data_parallel.DistributedDataParallel.no_sync
+   normalization: RMSNorm
+   num_attention_heads: 32
+   num_layers: 36
+   num_layers_at_end_in_bf16: 1
+   num_layers_at_start_in_bf16: 1
+   num_layers_in_first_pipeline_stage: null
+   num_layers_in_last_pipeline_stage: null
+   num_microbatches_with_partial_activation_checkpoints: null
+   num_moe_experts: null
+   num_query_groups: 8
+   offload_modules: null
+   output_layer_init_method:
+     _args_: []
+     _partial_: true
+     _target_: torch.nn.init.normal_
+     mean: 0.0
+     std: 0.0023570226039551587
+   overlap_moe_expert_parallel_comm: false
+   overlap_p2p_comm: false
+   overlap_p2p_comm_warmup_flush: false
+   parallel_output: true
+   param_sync_func: null
+   params_dtype:
+     _call_: false
+     _target_: torch.bfloat16
+   perform_initialization: true
+   persist_layer_norm: false
+   pipeline_dtype:
+     _call_: false
+     _target_: torch.bfloat16
+   pipeline_model_parallel_comm_backend: null
+   pipeline_model_parallel_layout: null
+   pipeline_model_parallel_size: 1
+   position_embedding_type: rope
+   qk_clip: false
+   qk_clip_alpha: 0.5
+   qk_clip_threshold: 100
+   qk_layernorm: true
+   quant_recipe: null
+   recompute_granularity: full
+   recompute_method: uniform
+   recompute_modules:
+   - core_attn
+   recompute_num_layers: 1
+   restore_modelopt_state: false
+   rotary_base: 5000000
+   rotary_interleaved: false
+   rotary_percent: 1.0
+   scatter_embedding_sequence_parallel: true
+   seq_len_interpolation_factor: null
+   seq_length: 262144
+   sequence_parallel: false
+   share_embeddings_and_output_weights: true
+   should_pad_vocab: false
+   softmax_scale: null
+   softmax_type: vanilla
+   symmetric_ar_type: null
+   tensor_model_parallel_size: 1
+   test_mode: false
+   timers: null
+   tp_comm_atomic_ag: false
+   tp_comm_atomic_rs: false
+   tp_comm_bootstrap_backend: nccl
+   tp_comm_bulk_dgrad: true
+   tp_comm_bulk_wgrad: true
+   tp_comm_overlap: false
+   tp_comm_overlap_ag: true
+   tp_comm_overlap_cfg: null
+   tp_comm_overlap_disable_fc1: false
+   tp_comm_overlap_disable_qkv: false
+   tp_comm_overlap_rs: true
+   tp_comm_overlap_rs_dgrad: false
+   tp_comm_split_ag: true
+   tp_comm_split_rs: true
+   tp_only_amax_red: false
+   transformer_impl: transformer_engine
+   transformer_layer_spec:
+     _call_: false
+     _target_: megatron.bridge.models.gpt_provider.default_layer_spec
+   use_cpu_initialization: false
+   use_fused_weighted_squared_relu: false
+   use_kitchen: false
+   use_mamba_mem_eff_path: true
+   use_ring_exchange_p2p: false
+   use_te_activation_func: false
+   use_te_rng_tracker: false
+   use_transformer_engine_full_layer_spec: false
+   use_transformer_engine_op_fuser: false
+   variable_seq_lengths: false
+   virtual_pipeline_model_parallel_size: null
+   vocab_size: 16
+   wgrad_deferral_limit: 0
+   window_attn_skip_freq: null
+   window_size: null
+ nvrx_straggler: null
+ optimizer:
+   _target_: megatron.bridge.training.config.OptimizerConfig
+   adam_beta1: 0.9
+   adam_beta2: 0.98
+   adam_eps: 1.0e-08
+   barrier_with_L1_time: false
+   bf16: true
+   clip_grad: 0.5
+   config_logger_dir: ''
+   decoupled_lr: null
+   decoupled_min_lr: null
+   decoupled_weight_decay: true
+   exp_avg_dtype:
+     _call_: false
+     _target_: torch.float32
+   exp_avg_sq_dtype:
+     _call_: false
+     _target_: torch.float32
+   fp16: false
+   fp8_recipe: null
+   hysteresis: 2
+   initial_loss_scale: 4294967296
+   log_num_zeros_in_grad: false
+   loss_scale: null
+   loss_scale_window: 1000
+   lr: 0.0001
+   main_grads_dtype:
+     _call_: false
+     _target_: torch.float32
+   main_params_dtype:
+     _call_: false
+     _target_: torch.float32
+   min_loss_scale: 1.0
+   min_lr: 1.0e-07
+   muon_extra_scale_factor: 1.0
+   muon_fp32_matmul_prec: medium
+   muon_momentum: 0.95
+   muon_num_ns_steps: 5
+   muon_scale_mode: spectral
+   muon_split_qkv: true
+   muon_tp_mode: blockwise
+   muon_use_nesterov: false
+   optimizer: adam
+   optimizer_cpu_offload: false
+   optimizer_offload_fraction: 0.0
+   overlap_cpu_optimizer_d2h_h2d: false
+   overlap_param_gather: false
+   overlap_param_gather_with_optimizer_step: false
+   params_dtype: bfloat16
+   pin_cpu_grads: true
+   pin_cpu_params: true
+   reuse_grad_buf_for_mxfp8_param_ag: false
+   sgd_momentum: 0.9
+   store_param_remainders: true
+   timers: null
+   use_distributed_optimizer: true
+   use_precision_aware_optimizer: false
+   use_torch_optimizer_for_cpu_offload: false
+   weight_decay: 0.1
+ peft: null
+ profiling:
+   _target_: megatron.bridge.training.config.ProfilingConfig
+   memory_snapshot_path: snapshot.pickle
+   nvtx_ranges: false
+   profile_ranks:
+   - 0
+   profile_step_end: 12
+   profile_step_start: 10
+   record_memory_history: false
+   record_shapes: false
+   use_nsys_profiler: false
+   use_pytorch_profiler: false
+ rerun_state_machine:
+   _target_: megatron.bridge.training.config.RerunStateMachineConfig
+   check_for_nan_in_loss: true
+   check_for_spiky_loss: false
+   error_injection_rate: 0
+   error_injection_type: transient_error
+   rerun_mode: disabled
+ rng:
+   _target_: megatron.bridge.training.config.RNGConfig
+   data_parallel_random_init: false
+   inference_rng_tracker: false
+   seed: 1234
+   te_rng_tracker: false
+ scheduler:
+   _target_: megatron.bridge.training.config.SchedulerConfig
+   end_weight_decay: 0.1
+   lr_decay_iters: 12716
+   lr_decay_samples: null
+   lr_decay_steps: 3255296
+   lr_decay_style: linear
+   lr_warmup_fraction: null
+   lr_warmup_init: 1.0e-06
+   lr_warmup_iters: 200
+   lr_warmup_samples: 0
+   lr_warmup_steps: 51200
+   lr_wsd_decay_iters: null
+   lr_wsd_decay_samples: null
+   lr_wsd_decay_style: exponential
+   no_weight_decay_cond_type: null
+   override_opt_param_scheduler: false
+   start_weight_decay: 0.1
+   use_checkpoint_opt_param_scheduler: false
+   wd_incr_steps: 1528832
+   weight_decay_incr_style: constant
+   wsd_decay_steps: null
+ straggler: null
+ tensor_inspect: null
+ tokenizer:
+   _target_: megatron.bridge.training.tokenizers.config.TokenizerConfig
+   hf_tokenizer_kwargs: {}
+   image_tag_type: null
+   merge_file: null
+   special_tokens: null
+   tiktoken_num_special_tokens: 1000
+   tiktoken_pattern: null
+   tiktoken_special_tokens: null
+   tokenizer_model: ./models/Qwen-NVARC
+   tokenizer_prompt_format: null
+   tokenizer_type: HuggingFaceTokenizer
+   vocab_extra_ids: 0
+   vocab_file: null
+   vocab_size: null
+ train:
+   _target_: megatron.bridge.training.config.TrainingConfig
+   check_weight_hash_across_dp_replicas_interval: null
+   decrease_batch_size_if_needed: false
+   empty_unused_memory_level: 0
+   eval_interval: 1000
+   eval_iters: 100
+   exit_duration_in_mins: null
+   exit_interval: null
+   exit_signal:
+     _args_:
+     - 15
+     _call_: true
+     _target_: signal.Signals
+   exit_signal_handler: false
+   exit_signal_handler_for_dataloader: false
+   global_batch_size: 256
+   iterations_to_skip: []
+   manual_gc: false
+   manual_gc_eval: true
+   manual_gc_interval: 0
+   micro_batch_size: 1
+   rampup_batch_size: null
+   skip_train: false
+   train_iters: 5972
+   train_samples: null
+   train_sync_interval: null
step_5400/policy/weights/iter_0000000/train_state.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9077193280ecd59384d5ae94d97a7943638810502f2ce08a7617aeb20b3586d2
+ size 3461
step_5400/policy/weights/latest_checkpointed_iteration.txt ADDED
@@ -0,0 +1 @@
+ 0
step_5400/policy/weights/latest_train_state.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9077193280ecd59384d5ae94d97a7943638810502f2ce08a7617aeb20b3586d2
+ size 3461
step_5400/train_dataloader.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:99b28546a485528f6242d1b9dcf951cc95a6af0ca81a13ded15a567a8c9d2f7f
+ size 7336
step_5400/training_info.json ADDED
@@ -0,0 +1 @@
+ {"epoch": 0, "step": 5400, "total_steps": 5400, "consumed_samples": 1382400, "total_valid_tokens": 1568487826.0, "val:val_loss": 0.14914798736572266}
step_5600/config.yaml ADDED
@@ -0,0 +1,207 @@
+ checkpointing:
+   checkpoint_dir: results/qwen3_4b_sft
+   checkpoint_must_save_by: null
+   enabled: true
+   higher_is_better: false
+   keep_top_k: 3
+   metric_name: val:val_loss
+   save_period: 200
+ cluster:
+   gpus_per_node: 2
+   num_nodes: 1
+ data:
+   num_workers: 4
+   shuffle: true
+   train_dataset_path:
+   - ./data/hones
+   val_dataset_path: ./data/arc2_evaluation6
+ logger:
+   gpu_monitoring:
+     collection_interval: 10
+     flush_interval: 10
+   log_dir: logs/exp_019
+   mlflow_enabled: false
+   monitor_gpus: false
+   swanlab_enabled: false
+   tensorboard_enabled: false
+   wandb:
+     name: qwen3_4b_sft
+     project: arc2
+   wandb_enabled: true
+ policy:
+   activation_checkpointing_enabled: false
+   attn_implementation: flash_attention_2
+   dtensor_cfg:
+     enabled: false
+   dynamic_batching:
+     enabled: false
+   fsdp_offload_enabled: false
+   make_sequence_length_divisible_by: 64
+   max_grad_norm: null
+   megatron_cfg:
+     activation_checkpointing: true
+     apply_rope_fusion: true
+     bias_activation_fusion: false
+     context_parallel_size: 2
+     distributed_data_parallel_config:
+       average_in_collective: true
+       data_parallel_sharding_strategy: optim_grads_params
+       grad_reduce_in_fp32: true
+       overlap_grad_reduce: true
+       overlap_param_gather: true
+     empty_unused_memory_level: 1
+     enabled: true
+     env_vars:
+       AWS_OFI_NCCL_VERSION: 1.14.0
+       BASH_ENV: /etc/bash.bashrc
+       CAL_VERSION: 0.4.4.50
+       CUBLASMP_VERSION: 0.4.0.789
+       CUBLAS_VERSION: 12.9.0.13
+       CUDA_ARCH_LIST: 7.5 8.0 8.6 9.0 10.0 12.0
+       CUDA_DRIVER_VERSION: 575.51.03
+       CUDA_VERSION: 12.9.0.043
+       CUDA_VISIBLE_DEVICES: 6,7
+       CUDNN_FRONTEND_VERSION: 1.11.0
+       CUDNN_VERSION: 9.10.1.4
+       CUFFT_VERSION: 11.4.0.6
+       CUFILE_VERSION: 1.14.0.30
+       CURAND_VERSION: 10.3.10.19
+       CUSOLVER_VERSION: 11.7.4.40
+       CUSPARSELT_VERSION: 0.7.1.0
+       CUSPARSE_VERSION: 12.5.9.5
+       DALI_BUILD: ''
+       DALI_URL_SUFFIX: '120'
+       DALI_VERSION: 1.49.0
+       EFA_VERSION: 1.38.1
+       ENV: /etc/shinit_v2
+       GDRCOPY_VERSION: 2.4.4
+       HOME: /root
+       HOSTNAME: e6ad2ac15863
+       HPCX_VERSION: '2.23'
+       KMP_DUPLICATE_LIB_OK: 'True'
+       KMP_INIT_AT_FORK: 'FALSE'
+       LC_CTYPE: C.UTF-8
+       LD_LIBRARY_PATH: /usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
+       LESSCLOSE: /usr/bin/lesspipe %s %s
+       LESSOPEN: '| /usr/bin/lesspipe %s'
+       LIBRARY_PATH: '/usr/local/cuda/lib64/stubs:'
+       LS_COLORS: 'rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=00:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.avif=01;35:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.webp=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:*~=00;90:*#=00;90:*.bak=00;90:*.crdownload=00;90:*.dpkg-dist=00;90:*.dpkg-new=00;90:*.dpkg-old=00;90:*.dpkg-tmp=00;90:*.old=00;90:*.orig=00;90:*.part=00;90:*.rej=00;90:*.rpmnew=00;90:*.rpmorig=00;90:*.rpmsave=00;90:*.swp=00;90:*.tmp=00;90:*.ucf-dist=00;90:*.ucf-new=00;90:*.ucf-old=00;90:'
+       MODEL_OPT_VERSION: 0.27.1
+       MOFED_VERSION: 5.4-rdmacore50.0
+       NCCL_NET_PLUGIN: aws-ofi
+       NCCL_TUNER_PLUGIN: aws-ofi
+       NCCL_VERSION: 2.26.5
+       NEMO_RL_COMMIT: 90ecaaccdf66fb92f814b2325db81a0105ed933a
+       NEMO_RL_VENV_DIR: /opt/ray_venvs
+       NPP_VERSION: 12.4.0.27
+       NRL_CONTAINER: '1'
+       NRL_MEGATRON_CHECKPOINT_DIR: ./results/megatron
+       NSIGHT_COMPUTE_VERSION: 2025.2.0.11
+       NSIGHT_SYSTEMS_VERSION: 2025.3.1.90
+       NVIDIA_BUILD_ID: '244212578'
+       NVIDIA_BUILD_REF: 6ba29f15920b12095387b8c82cea05ac3d6d9732
+       NVIDIA_DRIVER_CAPABILITIES: compute,utility,video
+       NVIDIA_PRODUCT_NAME: CUDA
+       NVIDIA_REQUIRE_CUDA: cuda>=9.0
+       NVIDIA_REQUIRE_JETPACK_HOST_MOUNTS: ''
+       NVIDIA_VISIBLE_DEVICES: all
+       NVJITLINK_VERSION: 12.9.41
+       NVJPEG_VERSION: 12.4.0.16
+       NVSHMEM_VERSION: 3.2.5
+       OLDPWD: /workspace
+       OMPI_MCA_coll_hcoll_enable: '0'
+       OPAL_PREFIX: /opt/hpcx/ompi
+       OPENMPI_VERSION: 4.1.7
+       OPENUCX_VERSION: 1.19.0
+       PATH: /opt/nemo_rl_venv/bin:/opt/nemo_rl_venv/bin:/opt/nemo_rl_venv/bin:/root/.local/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/mpi/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/ucx/bin:/opt/amazon/efa/bin
+       POLYGRAPHY_VERSION: 0.49.20
+       PWD: /workspace/ARChitects
+       PYTHONBREAKPOINT: ray.util.rpdb._driver_set_trace
+       PYTHONPATH: '/workspace/NVARC/ARChitects/nemo-rl:'
+       PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True
+       RAY_CLIENT_MODE: '0'
+       RAY_ENABLE_UV_RUN_RUNTIME_ENV: '0'
+       RAY_USAGE_STATS_ENABLED: '0'
+       RDMACORE_VERSION: '50.0'
+       SHELL: /bin/bash
+       SHLVL: '2'
+       SWANLAB_API_HOST: https://api.swanlab.cn/api
+       SWANLAB_RUNTIME: user
+       SWANLAB_WEB_HOST: https://swanlab.cn
+       TERM: xterm
+       TORCH_CUDA_ARCH_LIST: '9.0'
+       TRANSFORMER_ENGINE_VERSION: '2.3'
+       TRTOSS_VERSION: ''
+       TRT_VERSION: 10.10.0.31
+       UV: /root/.local/bin/uv
+       UV_LINK_MODE: copy
+       UV_PROJECT_ENVIRONMENT: /opt/nemo_rl_venv
+       UV_RUN_RECURSION_DEPTH: '1'
+       VIRTUAL_ENV: /opt/nemo_rl_venv
+       VIRTUAL_ENV_PROMPT: nemo-rl
+       WANDB_SERVICE: 3-239782-unix-/tmp/wandb-239782-251940-2530276073/socket
+       _: /root/.local/bin/uv
+       _CUDA_COMPAT_PATH: /usr/local/cuda/compat
+       _CUDA_COMPAT_STATUS: System has unsupported display driver / cuda driver combination
+         (CUDA_ERROR_SYSTEM_DRIVER_MISMATCH) cuInit()=803
+       _MLFLOW_TELEMETRY_SESSION_ID: 8ac94e5a44a94a62a2cf3302ff4fa0c9
+     expert_model_parallel_size: 1
+     expert_tensor_parallel_size: 1
+     freeze_moe_router: true
+     moe_permute_fusion: false
+     moe_router_bias_update_rate: 0.0
+     moe_router_dtype: fp64
+     moe_router_load_balancing_type: none
+     num_layers_in_first_pipeline_stage: null
+     num_layers_in_last_pipeline_stage: null
+     optimizer:
+       adam_beta1: 0.9
+       adam_beta2: 0.98
+       adam_eps: 1.0e-08
+       bf16: true
+       clip_grad: 0.5
+       fp16: false
+       lr: 0.0001
+       min_lr: 1.0e-07
+       optimizer: adam
+       optimizer_cpu_offload: false
+       optimizer_offload_fraction: 0.0
+       params_dtype: bfloat16
+       sgd_momentum: 0.9
+       use_distributed_optimizer: true
+       use_precision_aware_optimizer: false
+       weight_decay: 0.1
+     pipeline_dtype: bfloat16
+     pipeline_model_parallel_size: 1
+     scheduler:
+       end_weight_decay: 0.1
+       lr_decay_iters: 12716
+       lr_decay_style: linear
+       lr_warmup_init: 1.0e-06
+       lr_warmup_iters: 200
+       start_weight_decay: 0.1
+       weight_decay_incr_style: constant
+     sequence_parallel: false
+     tensor_model_parallel_size: 1
+     train_iters: 5972
+   model_name: ./models/Qwen-NVARC
+   offload_optimizer_for_logprob: false
+   precision: bfloat16
+   sequence_packing:
+     algorithm: modified_first_fit_decreasing
+     enabled: true
+     sequence_length_round: 64
+     train_mb_tokens: 128000
+   tokenizer:
+     name: ./models/Qwen-NVARC
+   train_global_batch_size: 256
+   train_micro_batch_size: 1
+ sft:
+   max_num_epochs: 1
+   max_num_steps: 6400
+   seed: 24
+   val_at_start: true
+   val_batches: 200
+   val_global_batch_size: 256
+   val_micro_batch_size: 1
+   val_period: 200
step_5600/policy/weights/iter_0000000/.metadata ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1a1c916057dfe0e2002fe62982907832c5f702012c7360c6613f4a610084f748
+ size 329201
step_5600/policy/weights/iter_0000000/common.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ca3b2e874e687352cb92c06fbffd56051ad75663c60fb5d275365fa00e02a4bb
+ size 1767
step_5600/policy/weights/iter_0000000/metadata.json ADDED
@@ -0,0 +1 @@
+ {"sharded_backend": "torch_dist", "sharded_backend_version": 1, "common_backend": "torch", "common_backend_version": 1}
step_5600/policy/weights/iter_0000000/modelopt_run_config.yaml ADDED
@@ -0,0 +1,203 @@
+ activation_func: <function silu at 0x7d0251c6b420>
+ activation_func_clamp_value: None
+ add_bias_linear: false
+ add_qkv_bias: false
+ apply_query_key_layer_scaling: false
+ apply_residual_connection_post_layernorm: false
+ apply_rope_fusion: true
+ attention_backend: AttnBackend.auto
+ attention_dropout: '0.0'
+ attention_output_gate: false
+ attention_softmax_in_fp32: false
+ autocast_dtype: torch.bfloat16
+ barrier_with_L1_time: true
+ bf16: true
+ bias_activation_fusion: false
+ bias_dropout_fusion: false
+ calculate_per_token_loss: true
+ clone_scatter_output_in_embedding: true
+ config_logger_dir: ''
+ cross_entropy_fusion_impl: native
+ cross_entropy_loss_fusion: true
+ defer_embedding_wgrad_compute: false
+ delay_wgrad_compute: false
+ deterministic_mode: false
+ disable_bf16_reduced_precision_matmul: false
+ disable_parameter_transpose_cache: false
+ distribute_saved_activations: None
+ enable_autocast: false
+ fallback_to_eager_attn: false
+ ffn_hidden_size: 9728
+ finalize_model_grads_func: functools.partial(<function finalize_model_grads at 0x7cfab95b9e40>,
+ pg_collection=None)
+ fine_grained_activation_offloading: false
+ first_last_layers_bf16: false
+ flash_decode: false
+ fp16: false
+ fp16_lm_cross_entropy: false
+ fp32_residual_connection: false
+ fused_single_qkv_rope: false
+ gated_linear_unit: true
+ generation_config: None
+ glu_linear_offset: '0.0'
+ grad_scale_func: <bound method MegatronOptimizer.scale_loss of <megatron.core.optimizer.optimizer.ChainedOptimizer
+ object at 0x7cf9d413cd70>>
+ grad_sync_func: "<bound method DistributedDataParallel.start_grad_sync of DistributedDataParallel(\n\
+ \ (module): CustomFloat16Module(\n (module): GPTModel(\n (embedding): LanguageModelEmbedding(\n\
+ \ (word_embeddings): VocabParallelEmbedding()\n (embedding_dropout):\
+ \ Dropout(p=0.0, inplace=False)\n )\n (rotary_pos_emb): RotaryEmbedding()\n\
+ \ (decoder): TransformerBlock(\n (layers): ModuleList(\n (0-35):\
+ \ 36 x TransformerLayer(\n (input_layernorm): IdentityOp()\n \
+ \ (self_attention): SelfAttention(\n (core_attention): TEDotProductAttention(\n\
+ \ (flash_attention): FlashAttention()\n (fused_attention):\
+ \ FusedAttention()\n (unfused_attention): UnfusedDotProductAttention(\n\
+ \ (scale_mask_softmax): FusedScaleMaskSoftmax()\n \
+ \ (attention_dropout): Dropout(p=0.0, inplace=False)\n )\n \
+ \ )\n (linear_proj): TERowParallelLinear(in_features=4096,\
+ \ out_features=2560, bias=False, TP=1)\n (linear_qkv): TELayerNormColumnParallelLinear(in_features=2560,\
+ \ out_features=6144, bias=False, TP=1)\n (q_layernorm): RMSNorm()\n\
+ \ (k_layernorm): RMSNorm()\n )\n (pre_cross_attn_layernorm):\
+ \ IdentityOp()\n (cross_attention): IdentityOp()\n (cross_attn_bda):\
+ \ IdentityFuncOp()\n (pre_mlp_layernorm): IdentityOp()\n (mlp):\
+ \ MLP(\n (linear_fc1): TELayerNormColumnParallelLinear(in_features=2560,\
+ \ out_features=19456, bias=False, TP=1)\n (linear_fc2): TERowParallelLinear(in_features=9728,\
+ \ out_features=2560, bias=False, TP=1)\n )\n )\n )\n\
+ \ (final_layernorm): RMSNorm()\n )\n (output_layer): ColumnParallelLinear(in_features=2560,\
+ \ out_features=16, bias=False, TP=1)\n )\n )\n)>"
+ gradient_accumulation_fusion: false
+ hetereogenous_dist_checkpoint: false
+ heterogeneous_block_specs: false
+ hf_model_id: ./models/Qwen-NVARC
+ hidden_dropout: '0.0'
+ hidden_size: 2560
+ is_hybrid_model: false
+ kv_channels: 128
+ layernorm_epsilon: 1e-06
+ layernorm_zero_centered_gamma: false
+ linear_attention_freq: None
+ linear_attention_type: None
+ linear_conv_kernel_dim: None
+ linear_key_head_dim: None
+ linear_num_key_heads: None
+ linear_num_value_heads: None
+ linear_value_head_dim: None
+ log_max_attention_logit: false
+ make_vocab_size_divisible_by: 16
+ mamba_head_dim: 64
+ mamba_num_groups: 8
+ mamba_num_heads: None
+ mamba_state_dim: 128
+ masked_softmax_fusion: true
+ max_position_embeddings: 40960
+ memory_efficient_layer_norm: false
+ min_offloaded_tensor_size: 1048576
+ mlp_chunks_for_prefill: 1
+ moe_apply_probs_on_input: false
+ moe_aux_loss_coeff: '0.0'
+ moe_deepep_num_sms: 20
+ moe_enable_deepep: false
+ moe_expert_capacity_factor: None
+ moe_extended_tp: false
+ moe_ffn_hidden_size: None
+ moe_flex_dispatcher_backend: deepep
+ moe_grouped_gemm: false
+ moe_hybridep_num_sms: 16
+ moe_input_jitter_eps: None
+ moe_layer_freq: 1
+ moe_pad_expert_input_to_capacity: false
+ moe_per_layer_logging: false
+ moe_permute_fusion: false
+ moe_router_bias_update_rate: '0.0'
+ moe_router_dtype: fp64
+ moe_router_enable_expert_bias: false
+ moe_router_force_load_balancing: false
+ moe_router_fusion: false
+ moe_router_group_topk: None
+ moe_router_load_balancing_type: none
+ moe_router_num_groups: None
+ moe_router_padding_for_quantization: false
+ moe_router_pre_softmax: false
+ moe_router_score_function: softmax
+ moe_router_topk: 2
+ moe_router_topk_limited_devices: None
+ moe_router_topk_scaling_factor: None
+ moe_shared_expert_gate: false
+ moe_shared_expert_intermediate_size: None
+ moe_shared_expert_overlap: false
+ moe_token_dispatcher_type: allgather
+ moe_token_drop_policy: probs
+ moe_token_dropping: false
+ moe_use_legacy_grouped_gemm: false
+ moe_z_loss_coeff: None
+ mrope_section: None
+ multi_latent_attention: false
+ no_rope_freq: None
+ no_sync_func: "<bound method DistributedDataParallel.no_sync of DistributedDataParallel(\n\
+ \ (module): CustomFloat16Module(\n (module): GPTModel(\n (embedding): LanguageModelEmbedding(\n\
+ \ (word_embeddings): VocabParallelEmbedding()\n (embedding_dropout):\
+ \ Dropout(p=0.0, inplace=False)\n )\n (rotary_pos_emb): RotaryEmbedding()\n\
+ \ (decoder): TransformerBlock(\n (layers): ModuleList(\n (0-35):\
+ \ 36 x TransformerLayer(\n (input_layernorm): IdentityOp()\n \
+ \ (self_attention): SelfAttention(\n (core_attention): TEDotProductAttention(\n\
+ \ (flash_attention): FlashAttention()\n (fused_attention):\
+ \ FusedAttention()\n (unfused_attention): UnfusedDotProductAttention(\n\
+ \ (scale_mask_softmax): FusedScaleMaskSoftmax()\n \
+ \ (attention_dropout): Dropout(p=0.0, inplace=False)\n )\n \
+ \ )\n (linear_proj): TERowParallelLinear(in_features=4096,\
+ \ out_features=2560, bias=False, TP=1)\n (linear_qkv): TELayerNormColumnParallelLinear(in_features=2560,\
+ \ out_features=6144, bias=False, TP=1)\n (q_layernorm): RMSNorm()\n\
+ \ (k_layernorm): RMSNorm()\n )\n (pre_cross_attn_layernorm):\
+ \ IdentityOp()\n (cross_attention): IdentityOp()\n (cross_attn_bda):\
+ \ IdentityFuncOp()\n (pre_mlp_layernorm): IdentityOp()\n (mlp):\
+ \ MLP(\n (linear_fc1): TELayerNormColumnParallelLinear(in_features=2560,\
+ \ out_features=19456, bias=False, TP=1)\n (linear_fc2): TERowParallelLinear(in_features=9728,\
+ \ out_features=2560, bias=False, TP=1)\n )\n )\n )\n\
+ \ (final_layernorm): RMSNorm()\n )\n (output_layer): ColumnParallelLinear(in_features=2560,\
+ \ out_features=16, bias=False, TP=1)\n )\n )\n)>"
+ normalization: RMSNorm
+ num_attention_heads: 32
+ num_layers: 36
+ num_layers_at_end_in_bf16: 1
+ num_layers_at_start_in_bf16: 1
+ num_moe_experts: None
+ num_query_groups: 8
+ nvidia_modelopt_version: 0.39.0
+ offload_modules: None
+ param_sync_func: None
+ params_dtype: torch.bfloat16
+ perform_initialization: true
+ persist_layer_norm: false
+ position_embedding_type: rope
+ qk_clip: false
+ qk_clip_alpha: '0.5'
+ qk_clip_threshold: 100
+ qk_layernorm: true
+ quant_recipe: None
+ restore_modelopt_state: false
+ rotary_base: 5000000
+ rotary_interleaved: false
+ rotary_percent: '1.0'
+ seq_len_interpolation_factor: None
+ seq_length: 262144
+ share_embeddings_and_output_weights: true
+ should_pad_vocab: false
+ softmax_scale: None
+ softmax_type: vanilla
+ symmetric_ar_type: None
+ test_mode: false
+ timers: None
+ transformer_impl: transformer_engine
+ transformer_layer_spec: <function default_layer_spec at 0x7cfa556ad440>
+ use_fused_weighted_squared_relu: false
+ use_kitchen: false
+ use_mamba_mem_eff_path: true
+ use_ring_exchange_p2p: false
+ use_te_activation_func: false
+ use_te_rng_tracker: false
+ use_transformer_engine_full_layer_spec: false
+ use_transformer_engine_op_fuser: false
+ variable_seq_lengths: false
+ vocab_size: 16
+ wgrad_deferral_limit: 0
+ window_attn_skip_freq: None
+ window_size: None
step_5600/policy/weights/iter_0000000/run_config.yaml ADDED
@@ -0,0 +1,564 @@
+ _target_: megatron.bridge.training.config.ConfigContainer
+ checkpoint:
+ _target_: megatron.bridge.training.config.CheckpointConfig
+ async_save: false
+ ckpt_assume_constant_structure: false
+ ckpt_convert_format: null
+ ckpt_convert_save: null
+ ckpt_format: torch_dist
+ ckpt_step: null
+ dist_ckpt_optim_fully_reshardable: false
+ dist_ckpt_save_pre_mcore_014: false
+ dist_ckpt_strictness: assume_ok_unexpected
+ distrib_optim_fully_reshardable_mem_efficient: false
+ exit_on_missing_checkpoint: false
+ finetune: true
+ fully_parallel_load: true
+ fully_parallel_save: true
+ load: null
+ load_main_params_from_ckpt: false
+ load_optim: true
+ load_rng: false
+ most_recent_k: -1
+ non_persistent_ckpt_type: null
+ non_persistent_global_ckpt_dir: null
+ non_persistent_local_ckpt_algo: fully_parallel
+ non_persistent_local_ckpt_dir: null
+ non_persistent_save_interval: null
+ pretrained_checkpoint: ./results/megatron/model_._models_Qwen-NVARC
+ replication: false
+ replication_factor: 2
+ replication_jump: null
+ save: /workspace/ARChitects/results/qwen3_4b_sft/tmp_step_5600/policy/weights
+ save_interval: 100
+ save_optim: true
+ save_rng: true
+ save_tokenizer_assets: true
+ strict_fsdp_dtensor_load: false
+ use_checkpoint_args: false
+ use_persistent_ckpt_worker: true
+ comm_overlap: null
+ dataset: null
+ ddp:
+ _target_: megatron.bridge.training.config.DistributedDataParallelConfig
+ align_param_gather: false
+ average_in_collective: false
+ bucket_size: 40000000
+ check_for_large_grads: false
+ check_for_nan_in_grad: true
+ data_parallel_sharding_strategy: optim_grads_params
+ delay_wgrad_compute: false
+ disable_symmetric_registration: false
+ fp8_param_gather: false
+ fsdp_double_buffer: false
+ grad_reduce_in_fp32: true
+ gradient_reduce_div_fusion: true
+ keep_fp8_transpose_cache: false
+ nccl_ub: false
+ num_distributed_optimizer_instances: 1
+ outer_dp_sharding_strategy: no_shard
+ overlap_grad_reduce: true
+ overlap_param_gather: true
+ pad_buckets_for_high_nccl_busbw: false
+ preserve_fp32_weights: true
+ reduce_scatter_with_fp32_accumulation: false
+ reuse_grad_buf_for_mxfp8_param_ag: false
+ suggested_communication_unit_size: null
+ use_custom_fsdp: false
+ use_distributed_optimizer: true
+ use_megatron_fsdp: false
+ dist:
+ _target_: megatron.bridge.training.config.DistributedInitConfig
+ align_grad_reduce: true
+ disable_jit_fuser: false
+ distributed_backend: nccl
+ distributed_timeout_minutes: 10
+ distributed_timeout_seconds_after_init: null
+ enable_megatron_core_experimental: false
+ external_gpu_device_mapping: true
+ high_priority_stream_groups: null
+ lazy_init: false
+ local_rank: 0
+ nccl_communicator_config_path: null
+ sharp_enabled_group: null
+ use_gloo_process_groups: true
+ use_megatron_fsdp: false
+ use_sharp: false
+ use_torch_fsdp2: false
+ use_tp_pp_dp_mapping: false
+ ft: null
+ inprocess_restart: null
+ logger:
+ _target_: megatron.bridge.training.config.LoggerConfig
+ filter_warnings: true
+ log_energy: false
+ log_interval: 100
+ log_l2_norm_grad_to_tensorboard: false
+ log_loss_scale_to_tensorboard: true
+ log_memory_to_tensorboard: false
+ log_params_norm: false
+ log_progress: false
+ log_runtime_to_tensorboard: false
+ log_throughput: false
+ log_throughput_to_tensorboard: false
+ log_timers_to_tensorboard: false
+ log_validation_ppl_to_tensorboard: false
+ log_world_size_to_tensorboard: false
+ logging_level: 0
+ memory_keys: null
+ modules_to_filter: null
+ runtime_time_unit: hours
+ save_config_filepath: null
+ set_level_for_all_loggers: false
+ tensorboard_dir: null
+ tensorboard_log_interval: 1
+ tensorboard_queue_size: 1000
+ throughput_window_size: 100
+ timing_log_level: 0
+ timing_log_option: minmax
+ wandb_entity: null
+ wandb_exp_name: null
+ wandb_project: null
+ wandb_save_dir: null
+ mixed_precision: null
+ model:
+ _target_: megatron.bridge.models.qwen.qwen_provider.Qwen3ModelProvider
+ account_for_embedding_in_pipeline_split: false
+ account_for_loss_in_pipeline_split: false
+ activation_func:
+ _call_: false
+ _target_: torch.nn.functional.silu
+ activation_func_clamp_value: null
+ activation_func_fp8_input_store: false
+ add_bias_linear: false
+ add_qkv_bias: false
+ apply_query_key_layer_scaling: false
+ apply_residual_connection_post_layernorm: false
+ apply_rope_fusion: true
+ async_tensor_model_parallel_allreduce: false
+ attention_backend:
+ _args_:
+ - 5
+ _call_: true
+ _target_: megatron.core.transformer.enums.AttnBackend
+ attention_dropout: 0.0
+ attention_output_gate: false
+ attention_softmax_in_fp32: false
+ autocast_dtype:
+ _call_: false
+ _target_: torch.bfloat16
+ barrier_with_L1_time: true
+ batch_p2p_comm: true
+ batch_p2p_sync: true
+ bf16: true
+ bias_activation_fusion: false
+ bias_dropout_fusion: false
+ calculate_per_token_loss: true
+ clone_scatter_output_in_embedding: true
+ config_logger_dir: ''
+ context_parallel_size: 2
+ cp_comm_type: null
+ cpu_offloading: false
+ cpu_offloading_activations: true
+ cpu_offloading_double_buffering: false
+ cpu_offloading_num_layers: 0
+ cpu_offloading_weights: false
+ cross_entropy_fusion_impl: native
+ cross_entropy_loss_fusion: true
+ cuda_graph_impl: none
+ cuda_graph_retain_backward_graph: false
+ cuda_graph_scope: []
+ cuda_graph_use_single_mempool: false
+ cuda_graph_warmup_steps: 3
+ deallocate_pipeline_outputs: true
+ defer_embedding_wgrad_compute: false
+ delay_wgrad_compute: false
+ deterministic_mode: false
+ disable_bf16_reduced_precision_matmul: false
+ disable_parameter_transpose_cache: false
+ distribute_saved_activations: null
+ embedding_init_method:
+ _args_: []
+ _partial_: true
+ _target_: torch.nn.init.normal_
+ mean: 0.0
+ std: 0.02
+ embedding_init_method_std: 0.02
+ enable_autocast: false
+ enable_cuda_graph: false
+ expert_model_parallel_size: 1
+ expert_tensor_parallel_size: 1
+ external_cuda_graph: false
+ fallback_to_eager_attn: false
+ ffn_hidden_size: 9728
+ finalize_model_grads_func:
+ _args_: []
+ _partial_: true
+ _target_: megatron.core.distributed.finalize_model_grads.finalize_model_grads
+ pg_collection: null
+ fine_grained_activation_offloading: false
+ first_last_layers_bf16: false
+ flash_decode: false
+ fp16: false
+ fp16_lm_cross_entropy: false
+ fp32_residual_connection: false
+ fp4: null
+ fp4_param: false
+ fp4_quantizer_factory: null
+ fp4_recipe: nvfp4
+ fp8: null
+ fp8_amax_compute_algo: most_recent
+ fp8_amax_history_len: 1
+ fp8_dot_product_attention: false
+ fp8_interval: 1
+ fp8_margin: 0
+ fp8_multi_head_attention: false
+ fp8_param: false
+ fp8_quantizer_factory: null
+ fp8_recipe: delayed
+ fp8_wgrad: true
+ fused_single_qkv_rope: false
+ gated_linear_unit: true
+ generation_config: null
+ glu_linear_offset: 0.0
+ grad_scale_func:
+ _call_: false
+ _target_: megatron.core.optimizer.optimizer.MegatronOptimizer.scale_loss
+ grad_sync_func:
+ _call_: false
+ _target_: megatron.core.distributed.distributed_data_parallel.DistributedDataParallel.start_grad_sync
+ gradient_accumulation_fusion: false
+ hetereogenous_dist_checkpoint: false
+ heterogeneous_block_specs: false
+ hf_model_id: ./models/Qwen-NVARC
+ hidden_dropout: 0.0
+ hidden_size: 2560
+ hierarchical_context_parallel_sizes: null
+ inference_rng_tracker: false
+ inference_sampling_seed: 42
+ init_method:
+ _args_: []
+ _partial_: true
+ _target_: torch.nn.init.normal_
+ mean: 0.0
+ std: 0.02
+ init_method_std: 0.02
+ init_model_with_meta_device: false
+ is_hybrid_model: false
+ kv_channels: 128
+ layernorm_epsilon: 1.0e-06
+ layernorm_zero_centered_gamma: false
+ linear_attention_freq: null
+ linear_attention_type: null
+ linear_conv_kernel_dim: null
+ linear_key_head_dim: null
+ linear_num_key_heads: null
+ linear_num_value_heads: null
+ linear_value_head_dim: null
+ log_max_attention_logit: false
+ make_vocab_size_divisible_by: 16
+ mamba_head_dim: 64
+ mamba_num_groups: 8
+ mamba_num_heads: null
+ mamba_state_dim: 128
+ masked_softmax_fusion: true
+ max_position_embeddings: 40960
+ memory_efficient_layer_norm: false
+ microbatch_group_size_per_vp_stage: 1
+ min_offloaded_tensor_size: 1048576
+ mlp_chunks_for_prefill: 1
+ moe_apply_probs_on_input: false
+ moe_aux_loss_coeff: 0.0
+ moe_deepep_num_sms: 20
+ moe_enable_deepep: false
+ moe_expert_capacity_factor: null
+ moe_extended_tp: false
+ moe_ffn_hidden_size: null
+ moe_flex_dispatcher_backend: deepep
+ moe_grouped_gemm: false
+ moe_hybridep_num_sms: 16
+ moe_input_jitter_eps: null
+ moe_layer_freq: 1
+ moe_layer_recompute: false
+ moe_pad_expert_input_to_capacity: false
+ moe_per_layer_logging: false
+ moe_permute_fusion: false
+ moe_router_bias_update_rate: 0.0
+ moe_router_dtype: fp64
+ moe_router_enable_expert_bias: false
+ moe_router_force_load_balancing: false
+ moe_router_fusion: false
+ moe_router_group_topk: null
+ moe_router_load_balancing_type: none
+ moe_router_num_groups: null
+ moe_router_padding_for_fp8: false
+ moe_router_padding_for_quantization: false
+ moe_router_pre_softmax: false
+ moe_router_score_function: softmax
+ moe_router_topk: 2
+ moe_router_topk_limited_devices: null
+ moe_router_topk_scaling_factor: null
+ moe_shared_expert_gate: false
+ moe_shared_expert_intermediate_size: null
+ moe_shared_expert_overlap: false
+ moe_token_dispatcher_type: allgather
+ moe_token_drop_policy: probs
+ moe_token_dropping: false
+ moe_use_legacy_grouped_gemm: false
+ moe_z_loss_coeff: null
+ mrope_section: null
+ mtp_enabled: false
+ mtp_loss_scaling_factor: null
+ mtp_num_layers: null
+ mtp_standalone: false
+ multi_latent_attention: false
+ no_rope_freq: null
+ no_sync_func:
+ _call_: false
+ _target_: megatron.core.distributed.distributed_data_parallel.DistributedDataParallel.no_sync
+ normalization: RMSNorm
+ num_attention_heads: 32
+ num_layers: 36
+ num_layers_at_end_in_bf16: 1
+ num_layers_at_start_in_bf16: 1
+ num_layers_in_first_pipeline_stage: null
+ num_layers_in_last_pipeline_stage: null
+ num_microbatches_with_partial_activation_checkpoints: null
+ num_moe_experts: null
+ num_query_groups: 8
+ offload_modules: null
+ output_layer_init_method:
+ _args_: []
+ _partial_: true
+ _target_: torch.nn.init.normal_
+ mean: 0.0
+ std: 0.0023570226039551587
+ overlap_moe_expert_parallel_comm: false
+ overlap_p2p_comm: false
+ overlap_p2p_comm_warmup_flush: false
+ parallel_output: true
+ param_sync_func: null
+ params_dtype:
+ _call_: false
+ _target_: torch.bfloat16
+ perform_initialization: true
+ persist_layer_norm: false
+ pipeline_dtype:
+ _call_: false
+ _target_: torch.bfloat16
+ pipeline_model_parallel_comm_backend: null
+ pipeline_model_parallel_layout: null
+ pipeline_model_parallel_size: 1
+ position_embedding_type: rope
+ qk_clip: false
+ qk_clip_alpha: 0.5
+ qk_clip_threshold: 100
+ qk_layernorm: true
+ quant_recipe: null
+ recompute_granularity: full
+ recompute_method: uniform
+ recompute_modules:
+ - core_attn
+ recompute_num_layers: 1
+ restore_modelopt_state: false
+ rotary_base: 5000000
+ rotary_interleaved: false
+ rotary_percent: 1.0
+ scatter_embedding_sequence_parallel: true
+ seq_len_interpolation_factor: null
+ seq_length: 262144
+ sequence_parallel: false
+ share_embeddings_and_output_weights: true
+ should_pad_vocab: false
+ softmax_scale: null
+ softmax_type: vanilla
+ symmetric_ar_type: null
+ tensor_model_parallel_size: 1
+ test_mode: false
+ timers: null
+ tp_comm_atomic_ag: false
+ tp_comm_atomic_rs: false
+ tp_comm_bootstrap_backend: nccl
+ tp_comm_bulk_dgrad: true
+ tp_comm_bulk_wgrad: true
+ tp_comm_overlap: false
+ tp_comm_overlap_ag: true
+ tp_comm_overlap_cfg: null
+ tp_comm_overlap_disable_fc1: false
+ tp_comm_overlap_disable_qkv: false
+ tp_comm_overlap_rs: true
+ tp_comm_overlap_rs_dgrad: false
+ tp_comm_split_ag: true
+ tp_comm_split_rs: true
+ tp_only_amax_red: false
+ transformer_impl: transformer_engine
+ transformer_layer_spec:
+ _call_: false
+ _target_: megatron.bridge.models.gpt_provider.default_layer_spec
+ use_cpu_initialization: false
+ use_fused_weighted_squared_relu: false
+ use_kitchen: false
+ use_mamba_mem_eff_path: true
+ use_ring_exchange_p2p: false
+ use_te_activation_func: false
+ use_te_rng_tracker: false
+ use_transformer_engine_full_layer_spec: false
+ use_transformer_engine_op_fuser: false
+ variable_seq_lengths: false
+ virtual_pipeline_model_parallel_size: null
+ vocab_size: 16
+ wgrad_deferral_limit: 0
+ window_attn_skip_freq: null
+ window_size: null
+ nvrx_straggler: null
+ optimizer:
+ _target_: megatron.bridge.training.config.OptimizerConfig
+ adam_beta1: 0.9
+ adam_beta2: 0.98
+ adam_eps: 1.0e-08
+ barrier_with_L1_time: false
+ bf16: true
+ clip_grad: 0.5
+ config_logger_dir: ''
+ decoupled_lr: null
+ decoupled_min_lr: null
+ decoupled_weight_decay: true
+ exp_avg_dtype:
+ _call_: false
+ _target_: torch.float32
+ exp_avg_sq_dtype:
+ _call_: false
+ _target_: torch.float32
+ fp16: false
+ fp8_recipe: null
+ hysteresis: 2
+ initial_loss_scale: 4294967296
+ log_num_zeros_in_grad: false
+ loss_scale: null
+ loss_scale_window: 1000
+ lr: 0.0001
+ main_grads_dtype:
+ _call_: false
+ _target_: torch.float32
+ main_params_dtype:
+ _call_: false
+ _target_: torch.float32
+ min_loss_scale: 1.0
+ min_lr: 1.0e-07
+ muon_extra_scale_factor: 1.0
+ muon_fp32_matmul_prec: medium
+ muon_momentum: 0.95
+ muon_num_ns_steps: 5
+ muon_scale_mode: spectral
+ muon_split_qkv: true
+ muon_tp_mode: blockwise
+ muon_use_nesterov: false
+ optimizer: adam
+ optimizer_cpu_offload: false
+ optimizer_offload_fraction: 0.0
+ overlap_cpu_optimizer_d2h_h2d: false
+ overlap_param_gather: false
+ overlap_param_gather_with_optimizer_step: false
+ params_dtype: bfloat16
+ pin_cpu_grads: true
+ pin_cpu_params: true
+ reuse_grad_buf_for_mxfp8_param_ag: false
+ sgd_momentum: 0.9
+ store_param_remainders: true
+ timers: null
+ use_distributed_optimizer: true
+ use_precision_aware_optimizer: false
+ use_torch_optimizer_for_cpu_offload: false
+ weight_decay: 0.1
+ peft: null
+ profiling:
+ _target_: megatron.bridge.training.config.ProfilingConfig
+ memory_snapshot_path: snapshot.pickle
+ nvtx_ranges: false
+ profile_ranks:
+ - 0
+ profile_step_end: 12
+ profile_step_start: 10
+ record_memory_history: false
+ record_shapes: false
+ use_nsys_profiler: false
+ use_pytorch_profiler: false
+ rerun_state_machine:
+ _target_: megatron.bridge.training.config.RerunStateMachineConfig
+ check_for_nan_in_loss: true
+ check_for_spiky_loss: false
+ error_injection_rate: 0
+ error_injection_type: transient_error
+ rerun_mode: disabled
+ rng:
+ _target_: megatron.bridge.training.config.RNGConfig
+ data_parallel_random_init: false
+ inference_rng_tracker: false
+ seed: 1234
+ te_rng_tracker: false
+ scheduler:
+ _target_: megatron.bridge.training.config.SchedulerConfig
+ end_weight_decay: 0.1
+ lr_decay_iters: 12716
+ lr_decay_samples: null
+ lr_decay_steps: 3255296
+ lr_decay_style: linear
+ lr_warmup_fraction: null
+ lr_warmup_init: 1.0e-06
+ lr_warmup_iters: 200
+ lr_warmup_samples: 0
+ lr_warmup_steps: 51200
+ lr_wsd_decay_iters: null
+ lr_wsd_decay_samples: null
+ lr_wsd_decay_style: exponential
+ no_weight_decay_cond_type: null
+ override_opt_param_scheduler: false
+ start_weight_decay: 0.1
+ use_checkpoint_opt_param_scheduler: false
+ wd_incr_steps: 1528832
+ weight_decay_incr_style: constant
+ wsd_decay_steps: null
+ straggler: null
+ tensor_inspect: null
+ tokenizer:
+ _target_: megatron.bridge.training.tokenizers.config.TokenizerConfig
+ hf_tokenizer_kwargs: {}
+ image_tag_type: null
+ merge_file: null
+ special_tokens: null
+ tiktoken_num_special_tokens: 1000
+ tiktoken_pattern: null
+ tiktoken_special_tokens: null
+ tokenizer_model: ./models/Qwen-NVARC
+ tokenizer_prompt_format: null
+ tokenizer_type: HuggingFaceTokenizer
+ vocab_extra_ids: 0
+ vocab_file: null
+ vocab_size: null
+ train:
+ _target_: megatron.bridge.training.config.TrainingConfig
+ check_weight_hash_across_dp_replicas_interval: null
+ decrease_batch_size_if_needed: false
+ empty_unused_memory_level: 0
+ eval_interval: 1000
+ eval_iters: 100
+ exit_duration_in_mins: null
+ exit_interval: null
+ exit_signal:
+ _args_:
+ - 15
+ _call_: true
+ _target_: signal.Signals
+ exit_signal_handler: false
+ exit_signal_handler_for_dataloader: false
+ global_batch_size: 256
+ iterations_to_skip: []
+ manual_gc: false
+ manual_gc_eval: true
+ manual_gc_interval: 0
+ micro_batch_size: 1
+ rampup_batch_size: null
+ skip_train: false
+ train_iters: 5972
+ train_samples: null
+ train_sync_interval: null
step_5600/policy/weights/iter_0000000/train_state.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9077193280ecd59384d5ae94d97a7943638810502f2ce08a7617aeb20b3586d2
+ size 3461
step_5600/policy/weights/latest_checkpointed_iteration.txt ADDED
@@ -0,0 +1 @@
+ 0
step_5600/policy/weights/latest_train_state.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9077193280ecd59384d5ae94d97a7943638810502f2ce08a7617aeb20b3586d2
+ size 3461
step_5600/train_dataloader.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:abe0ee8c91d5ba1b614239817486eae38492eb4f5f311f8b71c6b33bc2151b2b
+ size 7336
step_5600/training_info.json ADDED
@@ -0,0 +1 @@
+ {"epoch": 0, "step": 5600, "total_steps": 5600, "consumed_samples": 1433600, "total_valid_tokens": 1626494740.0, "val:val_loss": 0.14774028956890106}
step_5800/config.yaml ADDED
@@ -0,0 +1,207 @@
+ checkpointing:
+ checkpoint_dir: results/qwen3_4b_sft
+ checkpoint_must_save_by: null
+ enabled: true
+ higher_is_better: false
+ keep_top_k: 3
+ metric_name: val:val_loss
+ save_period: 200
+ cluster:
+ gpus_per_node: 2
+ num_nodes: 1
+ data:
+ num_workers: 4
+ shuffle: true
+ train_dataset_path:
+ - ./data/hones
+ val_dataset_path: ./data/arc2_evaluation6
+ logger:
+ gpu_monitoring:
+ collection_interval: 10
+ flush_interval: 10
+ log_dir: logs/exp_019
+ mlflow_enabled: false
+ monitor_gpus: false
+ swanlab_enabled: false
+ tensorboard_enabled: false
+ wandb:
+ name: qwen3_4b_sft
+ project: arc2
+ wandb_enabled: true
+ policy:
+ activation_checkpointing_enabled: false
+ attn_implementation: flash_attention_2
+ dtensor_cfg:
+ enabled: false
+ dynamic_batching:
+ enabled: false
+ fsdp_offload_enabled: false
+ make_sequence_length_divisible_by: 64
+ max_grad_norm: null
+ megatron_cfg:
+ activation_checkpointing: true
+ apply_rope_fusion: true
+ bias_activation_fusion: false
+ context_parallel_size: 2
+ distributed_data_parallel_config:
+ average_in_collective: true
+ data_parallel_sharding_strategy: optim_grads_params
+ grad_reduce_in_fp32: true
+ overlap_grad_reduce: true
+ overlap_param_gather: true
+ empty_unused_memory_level: 1
+ enabled: true
+ env_vars:
+ AWS_OFI_NCCL_VERSION: 1.14.0
+ BASH_ENV: /etc/bash.bashrc
+ CAL_VERSION: 0.4.4.50
+ CUBLASMP_VERSION: 0.4.0.789
+ CUBLAS_VERSION: 12.9.0.13
+ CUDA_ARCH_LIST: 7.5 8.0 8.6 9.0 10.0 12.0
+ CUDA_DRIVER_VERSION: 575.51.03
+ CUDA_VERSION: 12.9.0.043
+ CUDA_VISIBLE_DEVICES: 6,7
+ CUDNN_FRONTEND_VERSION: 1.11.0
+ CUDNN_VERSION: 9.10.1.4
+ CUFFT_VERSION: 11.4.0.6
+ CUFILE_VERSION: 1.14.0.30
+ CURAND_VERSION: 10.3.10.19
+ CUSOLVER_VERSION: 11.7.4.40
+ CUSPARSELT_VERSION: 0.7.1.0
+ CUSPARSE_VERSION: 12.5.9.5
+ DALI_BUILD: ''
+ DALI_URL_SUFFIX: '120'
+ DALI_VERSION: 1.49.0
+ EFA_VERSION: 1.38.1
+ ENV: /etc/shinit_v2
+ GDRCOPY_VERSION: 2.4.4
+ HOME: /root
+ HOSTNAME: e6ad2ac15863
+ HPCX_VERSION: '2.23'
+ KMP_DUPLICATE_LIB_OK: 'True'
+ KMP_INIT_AT_FORK: 'FALSE'
+ LC_CTYPE: C.UTF-8
+ LD_LIBRARY_PATH: /usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
+ LESSCLOSE: /usr/bin/lesspipe %s %s
+ LESSOPEN: '| /usr/bin/lesspipe %s'
+ LIBRARY_PATH: '/usr/local/cuda/lib64/stubs:'
+ LS_COLORS: 'rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=00:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.avif=01;35:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.webp=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:*~=00;90:*#=00;90:*.bak=00;90:*.crdownload=00;90:*.dpkg-dist=00;90:*.dpkg-new=00;90:*.dpkg-old=00;90:*.dpkg-tmp=00;90:*.old=00;90:*.orig=00;90:*.part=00;90:*.rej=00;90:*.rpmnew=00;90:*.rpmorig=00;90:*.rpmsave=00;90:*.swp=00;90:*.tmp=00;90:*.ucf-dist=00;90:*.ucf-new=00;90:*.ucf-old=00;90:'
+ MODEL_OPT_VERSION: 0.27.1
+ MOFED_VERSION: 5.4-rdmacore50.0
+ NCCL_NET_PLUGIN: aws-ofi
+ NCCL_TUNER_PLUGIN: aws-ofi
+ NCCL_VERSION: 2.26.5
+ NEMO_RL_COMMIT: 90ecaaccdf66fb92f814b2325db81a0105ed933a
+ NEMO_RL_VENV_DIR: /opt/ray_venvs
+ NPP_VERSION: 12.4.0.27
+ NRL_CONTAINER: '1'
+ NRL_MEGATRON_CHECKPOINT_DIR: ./results/megatron
+ NSIGHT_COMPUTE_VERSION: 2025.2.0.11
+ NSIGHT_SYSTEMS_VERSION: 2025.3.1.90
+ NVIDIA_BUILD_ID: '244212578'
+ NVIDIA_BUILD_REF: 6ba29f15920b12095387b8c82cea05ac3d6d9732
+ NVIDIA_DRIVER_CAPABILITIES: compute,utility,video
+ NVIDIA_PRODUCT_NAME: CUDA
+ NVIDIA_REQUIRE_CUDA: cuda>=9.0
+ NVIDIA_REQUIRE_JETPACK_HOST_MOUNTS: ''
+ NVIDIA_VISIBLE_DEVICES: all
+ NVJITLINK_VERSION: 12.9.41
+ NVJPEG_VERSION: 12.4.0.16
+ NVSHMEM_VERSION: 3.2.5
+ OLDPWD: /workspace
+ OMPI_MCA_coll_hcoll_enable: '0'
+ OPAL_PREFIX: /opt/hpcx/ompi
+ OPENMPI_VERSION: 4.1.7
+ OPENUCX_VERSION: 1.19.0
+ PATH: /opt/nemo_rl_venv/bin:/opt/nemo_rl_venv/bin:/opt/nemo_rl_venv/bin:/root/.local/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/mpi/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/ucx/bin:/opt/amazon/efa/bin
+ POLYGRAPHY_VERSION: 0.49.20
+ PWD: /workspace/ARChitects
+ PYTHONBREAKPOINT: ray.util.rpdb._driver_set_trace
+ PYTHONPATH: '/workspace/NVARC/ARChitects/nemo-rl:'
+ PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True
+ RAY_CLIENT_MODE: '0'
+ RAY_ENABLE_UV_RUN_RUNTIME_ENV: '0'
+ RAY_USAGE_STATS_ENABLED: '0'
+ RDMACORE_VERSION: '50.0'
+ SHELL: /bin/bash
+ SHLVL: '2'
+ SWANLAB_API_HOST: https://api.swanlab.cn/api
+ SWANLAB_RUNTIME: user
+ SWANLAB_WEB_HOST: https://swanlab.cn
+ TERM: xterm
+ TORCH_CUDA_ARCH_LIST: '9.0'
+ TRANSFORMER_ENGINE_VERSION: '2.3'
+ TRTOSS_VERSION: ''
+ TRT_VERSION: 10.10.0.31
+ UV: /root/.local/bin/uv
+ UV_LINK_MODE: copy
+ UV_PROJECT_ENVIRONMENT: /opt/nemo_rl_venv
+ UV_RUN_RECURSION_DEPTH: '1'
+ VIRTUAL_ENV: /opt/nemo_rl_venv
+ VIRTUAL_ENV_PROMPT: nemo-rl
+ WANDB_SERVICE: 3-239782-unix-/tmp/wandb-239782-251940-2530276073/socket
+ _: /root/.local/bin/uv
+ _CUDA_COMPAT_PATH: /usr/local/cuda/compat
+ _CUDA_COMPAT_STATUS: System has unsupported display driver / cuda driver combination
+ (CUDA_ERROR_SYSTEM_DRIVER_MISMATCH) cuInit()=803
+ _MLFLOW_TELEMETRY_SESSION_ID: 8ac94e5a44a94a62a2cf3302ff4fa0c9
+ expert_model_parallel_size: 1
+ expert_tensor_parallel_size: 1
+ freeze_moe_router: true
+ moe_permute_fusion: false
+ moe_router_bias_update_rate: 0.0
+ moe_router_dtype: fp64
+ moe_router_load_balancing_type: none
+ num_layers_in_first_pipeline_stage: null
+ num_layers_in_last_pipeline_stage: null
+ optimizer:
+ adam_beta1: 0.9
+ adam_beta2: 0.98
+ adam_eps: 1.0e-08
+ bf16: true
+ clip_grad: 0.5
+ fp16: false
+ lr: 0.0001
+ min_lr: 1.0e-07
+ optimizer: adam
+ optimizer_cpu_offload: false
+ optimizer_offload_fraction: 0.0
+ params_dtype: bfloat16
+ sgd_momentum: 0.9
+ use_distributed_optimizer: true
+ use_precision_aware_optimizer: false
+ weight_decay: 0.1
+ pipeline_dtype: bfloat16
+ pipeline_model_parallel_size: 1
+ scheduler:
+ end_weight_decay: 0.1
+ lr_decay_iters: 12716
+ lr_decay_style: linear
+ lr_warmup_init: 1.0e-06
+ lr_warmup_iters: 200
+ start_weight_decay: 0.1
+ weight_decay_incr_style: constant
+ sequence_parallel: false
+ tensor_model_parallel_size: 1
+ train_iters: 5972
+ model_name: ./models/Qwen-NVARC
+ offload_optimizer_for_logprob: false
+ precision: bfloat16
+ sequence_packing:
+ algorithm: modified_first_fit_decreasing
+ enabled: true
+ sequence_length_round: 64
+ train_mb_tokens: 128000
+ tokenizer:
+ name: ./models/Qwen-NVARC
+ train_global_batch_size: 256
+ train_micro_batch_size: 1
+ sft:
+ max_num_epochs: 1
+ max_num_steps: 6400
+ seed: 24
+ val_at_start: true
+ val_batches: 200
+ val_global_batch_size: 256
+ val_micro_batch_size: 1
+ val_period: 200
step_5800/policy/weights/iter_0000000/.metadata ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2dd177ae05a23762b1acc7a8eff274e5a9104b258ba48b225e821312fb6de12f
+ size 329201
step_5800/policy/weights/iter_0000000/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2884ab50f51fa561ef6e4a6a4f422b146a712c0b47eb1ff41494ace545036d06
+ size 12718332319
step_5800/policy/weights/iter_0000000/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:619738ba8dfec45074012486c339e30475eb90b1f9ec0d57c6eb9ae4cbb4af39
+ size 12717813616
step_5800/policy/weights/iter_0000000/common.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fcd736818bbf683f63191cf9ab55ee9ec1d1ba58597572923af7a35da3c7f532
+ size 1767
step_5800/policy/weights/iter_0000000/metadata.json ADDED
@@ -0,0 +1 @@
+ {"sharded_backend": "torch_dist", "sharded_backend_version": 1, "common_backend": "torch", "common_backend_version": 1}
step_5800/policy/weights/iter_0000000/modelopt_run_config.yaml ADDED
@@ -0,0 +1,203 @@
+ activation_func: <function silu at 0x7d0251c6b420>
+ activation_func_clamp_value: None
+ add_bias_linear: false
+ add_qkv_bias: false
+ apply_query_key_layer_scaling: false
+ apply_residual_connection_post_layernorm: false
+ apply_rope_fusion: true
+ attention_backend: AttnBackend.auto
+ attention_dropout: '0.0'
+ attention_output_gate: false
+ attention_softmax_in_fp32: false
+ autocast_dtype: torch.bfloat16
+ barrier_with_L1_time: true
+ bf16: true
+ bias_activation_fusion: false
+ bias_dropout_fusion: false
+ calculate_per_token_loss: true
+ clone_scatter_output_in_embedding: true
+ config_logger_dir: ''
+ cross_entropy_fusion_impl: native
+ cross_entropy_loss_fusion: true
+ defer_embedding_wgrad_compute: false
+ delay_wgrad_compute: false
+ deterministic_mode: false
+ disable_bf16_reduced_precision_matmul: false
+ disable_parameter_transpose_cache: false
+ distribute_saved_activations: None
+ enable_autocast: false
+ fallback_to_eager_attn: false
+ ffn_hidden_size: 9728
+ finalize_model_grads_func: functools.partial(<function finalize_model_grads at 0x7cfab95b9e40>,
+ pg_collection=None)
+ fine_grained_activation_offloading: false
+ first_last_layers_bf16: false
+ flash_decode: false
+ fp16: false
+ fp16_lm_cross_entropy: false
+ fp32_residual_connection: false
+ fused_single_qkv_rope: false
+ gated_linear_unit: true
+ generation_config: None
+ glu_linear_offset: '0.0'
+ grad_scale_func: <bound method MegatronOptimizer.scale_loss of <megatron.core.optimizer.optimizer.ChainedOptimizer
+ object at 0x7cf9d413cd70>>
+ grad_sync_func: "<bound method DistributedDataParallel.start_grad_sync of DistributedDataParallel(\n\
+ \ (module): CustomFloat16Module(\n (module): GPTModel(\n (embedding): LanguageModelEmbedding(\n\
+ \ (word_embeddings): VocabParallelEmbedding()\n (embedding_dropout):\
+ \ Dropout(p=0.0, inplace=False)\n )\n (rotary_pos_emb): RotaryEmbedding()\n\
+ \ (decoder): TransformerBlock(\n (layers): ModuleList(\n (0-35):\
+ \ 36 x TransformerLayer(\n (input_layernorm): IdentityOp()\n \
+ \ (self_attention): SelfAttention(\n (core_attention): TEDotProductAttention(\n\
+ \ (flash_attention): FlashAttention()\n (fused_attention):\
+ \ FusedAttention()\n (unfused_attention): UnfusedDotProductAttention(\n\
+ \ (scale_mask_softmax): FusedScaleMaskSoftmax()\n \
+ \ (attention_dropout): Dropout(p=0.0, inplace=False)\n )\n \
+ \ )\n (linear_proj): TERowParallelLinear(in_features=4096,\
+ \ out_features=2560, bias=False, TP=1)\n (linear_qkv): TELayerNormColumnParallelLinear(in_features=2560,\
+ \ out_features=6144, bias=False, TP=1)\n (q_layernorm): RMSNorm()\n\
+ \ (k_layernorm): RMSNorm()\n )\n (pre_cross_attn_layernorm):\
+ \ IdentityOp()\n (cross_attention): IdentityOp()\n (cross_attn_bda):\
+ \ IdentityFuncOp()\n (pre_mlp_layernorm): IdentityOp()\n (mlp):\
+ \ MLP(\n (linear_fc1): TELayerNormColumnParallelLinear(in_features=2560,\
+ \ out_features=19456, bias=False, TP=1)\n (linear_fc2): TERowParallelLinear(in_features=9728,\
+ \ out_features=2560, bias=False, TP=1)\n )\n )\n )\n\
+ \ (final_layernorm): RMSNorm()\n )\n (output_layer): ColumnParallelLinear(in_features=2560,\
+ \ out_features=16, bias=False, TP=1)\n )\n )\n)>"
+ gradient_accumulation_fusion: false
+ hetereogenous_dist_checkpoint: false
+ heterogeneous_block_specs: false
+ hf_model_id: ./models/Qwen-NVARC
+ hidden_dropout: '0.0'
+ hidden_size: 2560
+ is_hybrid_model: false
+ kv_channels: 128
+ layernorm_epsilon: 1e-06
+ layernorm_zero_centered_gamma: false
+ linear_attention_freq: None
+ linear_attention_type: None
+ linear_conv_kernel_dim: None
+ linear_key_head_dim: None
+ linear_num_key_heads: None
+ linear_num_value_heads: None
+ linear_value_head_dim: None
+ log_max_attention_logit: false
+ make_vocab_size_divisible_by: 16
+ mamba_head_dim: 64
+ mamba_num_groups: 8
+ mamba_num_heads: None
+ mamba_state_dim: 128
+ masked_softmax_fusion: true
+ max_position_embeddings: 40960
+ memory_efficient_layer_norm: false
+ min_offloaded_tensor_size: 1048576
+ mlp_chunks_for_prefill: 1
+ moe_apply_probs_on_input: false
+ moe_aux_loss_coeff: '0.0'
+ moe_deepep_num_sms: 20
+ moe_enable_deepep: false
+ moe_expert_capacity_factor: None
+ moe_extended_tp: false
+ moe_ffn_hidden_size: None
+ moe_flex_dispatcher_backend: deepep
+ moe_grouped_gemm: false
+ moe_hybridep_num_sms: 16
+ moe_input_jitter_eps: None
+ moe_layer_freq: 1
+ moe_pad_expert_input_to_capacity: false
+ moe_per_layer_logging: false
+ moe_permute_fusion: false
+ moe_router_bias_update_rate: '0.0'
+ moe_router_dtype: fp64
+ moe_router_enable_expert_bias: false
+ moe_router_force_load_balancing: false
+ moe_router_fusion: false
+ moe_router_group_topk: None
+ moe_router_load_balancing_type: none
+ moe_router_num_groups: None
+ moe_router_padding_for_quantization: false
+ moe_router_pre_softmax: false
+ moe_router_score_function: softmax
+ moe_router_topk: 2
+ moe_router_topk_limited_devices: None
+ moe_router_topk_scaling_factor: None
+ moe_shared_expert_gate: false
+ moe_shared_expert_intermediate_size: None
+ moe_shared_expert_overlap: false
+ moe_token_dispatcher_type: allgather
+ moe_token_drop_policy: probs
+ moe_token_dropping: false
+ moe_use_legacy_grouped_gemm: false
+ moe_z_loss_coeff: None
+ mrope_section: None
+ multi_latent_attention: false
+ no_rope_freq: None
+ no_sync_func: "<bound method DistributedDataParallel.no_sync of DistributedDataParallel(\n\
+ \ (module): CustomFloat16Module(\n (module): GPTModel(\n (embedding): LanguageModelEmbedding(\n\
+ \ (word_embeddings): VocabParallelEmbedding()\n (embedding_dropout):\
+ \ Dropout(p=0.0, inplace=False)\n )\n (rotary_pos_emb): RotaryEmbedding()\n\
+ \ (decoder): TransformerBlock(\n (layers): ModuleList(\n (0-35):\
+ \ 36 x TransformerLayer(\n (input_layernorm): IdentityOp()\n \
+ \ (self_attention): SelfAttention(\n (core_attention): TEDotProductAttention(\n\
+ \ (flash_attention): FlashAttention()\n (fused_attention):\
+ \ FusedAttention()\n (unfused_attention): UnfusedDotProductAttention(\n\
+ \ (scale_mask_softmax): FusedScaleMaskSoftmax()\n \
+ \ (attention_dropout): Dropout(p=0.0, inplace=False)\n )\n \
+ \ )\n (linear_proj): TERowParallelLinear(in_features=4096,\
+ \ out_features=2560, bias=False, TP=1)\n (linear_qkv): TELayerNormColumnParallelLinear(in_features=2560,\
+ \ out_features=6144, bias=False, TP=1)\n (q_layernorm): RMSNorm()\n\
+ \ (k_layernorm): RMSNorm()\n )\n (pre_cross_attn_layernorm):\
+ \ IdentityOp()\n (cross_attention): IdentityOp()\n (cross_attn_bda):\
+ \ IdentityFuncOp()\n (pre_mlp_layernorm): IdentityOp()\n (mlp):\
+ \ MLP(\n (linear_fc1): TELayerNormColumnParallelLinear(in_features=2560,\
+ \ out_features=19456, bias=False, TP=1)\n (linear_fc2): TERowParallelLinear(in_features=9728,\
+ \ out_features=2560, bias=False, TP=1)\n )\n )\n )\n\
+ \ (final_layernorm): RMSNorm()\n )\n (output_layer): ColumnParallelLinear(in_features=2560,\
+ \ out_features=16, bias=False, TP=1)\n )\n )\n)>"
+ normalization: RMSNorm
+ num_attention_heads: 32
+ num_layers: 36
+ num_layers_at_end_in_bf16: 1
+ num_layers_at_start_in_bf16: 1
+ num_moe_experts: None
+ num_query_groups: 8
+ nvidia_modelopt_version: 0.39.0
+ offload_modules: None
+ param_sync_func: None
+ params_dtype: torch.bfloat16
+ perform_initialization: true
+ persist_layer_norm: false
+ position_embedding_type: rope
+ qk_clip: false
+ qk_clip_alpha: '0.5'
+ qk_clip_threshold: 100
+ qk_layernorm: true
+ quant_recipe: None
+ restore_modelopt_state: false
+ rotary_base: 5000000
+ rotary_interleaved: false
+ rotary_percent: '1.0'
+ seq_len_interpolation_factor: None
+ seq_length: 262144
+ share_embeddings_and_output_weights: true
+ should_pad_vocab: false
+ softmax_scale: None
+ softmax_type: vanilla
+ symmetric_ar_type: None
+ test_mode: false
+ timers: None
+ transformer_impl: transformer_engine
+ transformer_layer_spec: <function default_layer_spec at 0x7cfa556ad440>
+ use_fused_weighted_squared_relu: false
+ use_kitchen: false
+ use_mamba_mem_eff_path: true
+ use_ring_exchange_p2p: false
+ use_te_activation_func: false
+ use_te_rng_tracker: false
+ use_transformer_engine_full_layer_spec: false
+ use_transformer_engine_op_fuser: false
+ variable_seq_lengths: false
+ vocab_size: 16
+ wgrad_deferral_limit: 0
+ window_attn_skip_freq: None
+ window_size: None
step_5800/policy/weights/iter_0000000/run_config.yaml ADDED
@@ -0,0 +1,564 @@
+ _target_: megatron.bridge.training.config.ConfigContainer
+ checkpoint:
+ _target_: megatron.bridge.training.config.CheckpointConfig
+ async_save: false
+ ckpt_assume_constant_structure: false
+ ckpt_convert_format: null
+ ckpt_convert_save: null
+ ckpt_format: torch_dist
+ ckpt_step: null
+ dist_ckpt_optim_fully_reshardable: false
+ dist_ckpt_save_pre_mcore_014: false
+ dist_ckpt_strictness: assume_ok_unexpected
+ distrib_optim_fully_reshardable_mem_efficient: false
+ exit_on_missing_checkpoint: false
+ finetune: true
+ fully_parallel_load: true
+ fully_parallel_save: true
+ load: null
+ load_main_params_from_ckpt: false
+ load_optim: true
+ load_rng: false
+ most_recent_k: -1
+ non_persistent_ckpt_type: null
+ non_persistent_global_ckpt_dir: null
+ non_persistent_local_ckpt_algo: fully_parallel
+ non_persistent_local_ckpt_dir: null
+ non_persistent_save_interval: null
+ pretrained_checkpoint: ./results/megatron/model_._models_Qwen-NVARC
+ replication: false
+ replication_factor: 2
+ replication_jump: null
+ save: /workspace/ARChitects/results/qwen3_4b_sft/tmp_step_5800/policy/weights
+ save_interval: 100
+ save_optim: true
+ save_rng: true
+ save_tokenizer_assets: true
+ strict_fsdp_dtensor_load: false
+ use_checkpoint_args: false
+ use_persistent_ckpt_worker: true
+ comm_overlap: null
+ dataset: null
+ ddp:
+ _target_: megatron.bridge.training.config.DistributedDataParallelConfig
+ align_param_gather: false
+ average_in_collective: false
+ bucket_size: 40000000
+ check_for_large_grads: false
+ check_for_nan_in_grad: true
+ data_parallel_sharding_strategy: optim_grads_params
+ delay_wgrad_compute: false
+ disable_symmetric_registration: false
+ fp8_param_gather: false
+ fsdp_double_buffer: false
+ grad_reduce_in_fp32: true
+ gradient_reduce_div_fusion: true
+ keep_fp8_transpose_cache: false
+ nccl_ub: false
+ num_distributed_optimizer_instances: 1
+ outer_dp_sharding_strategy: no_shard
+ overlap_grad_reduce: true
+ overlap_param_gather: true
+ pad_buckets_for_high_nccl_busbw: false
+ preserve_fp32_weights: true
+ reduce_scatter_with_fp32_accumulation: false
+ reuse_grad_buf_for_mxfp8_param_ag: false
+ suggested_communication_unit_size: null
+ use_custom_fsdp: false
+ use_distributed_optimizer: true
+ use_megatron_fsdp: false
+ dist:
+ _target_: megatron.bridge.training.config.DistributedInitConfig
+ align_grad_reduce: true
+ disable_jit_fuser: false
+ distributed_backend: nccl
+ distributed_timeout_minutes: 10
+ distributed_timeout_seconds_after_init: null
+ enable_megatron_core_experimental: false
+ external_gpu_device_mapping: true
+ high_priority_stream_groups: null
+ lazy_init: false
+ local_rank: 0
+ nccl_communicator_config_path: null
+ sharp_enabled_group: null
+ use_gloo_process_groups: true
+ use_megatron_fsdp: false
+ use_sharp: false
+ use_torch_fsdp2: false
+ use_tp_pp_dp_mapping: false
+ ft: null
+ inprocess_restart: null
+ logger:
+ _target_: megatron.bridge.training.config.LoggerConfig
+ filter_warnings: true
+ log_energy: false
+ log_interval: 100
+ log_l2_norm_grad_to_tensorboard: false
+ log_loss_scale_to_tensorboard: true
+ log_memory_to_tensorboard: false
+ log_params_norm: false
+ log_progress: false
+ log_runtime_to_tensorboard: false
+ log_throughput: false
+ log_throughput_to_tensorboard: false
+ log_timers_to_tensorboard: false
+ log_validation_ppl_to_tensorboard: false
+ log_world_size_to_tensorboard: false
+ logging_level: 0
+ memory_keys: null
+ modules_to_filter: null
+ runtime_time_unit: hours
+ save_config_filepath: null
+ set_level_for_all_loggers: false
+ tensorboard_dir: null
+ tensorboard_log_interval: 1
+ tensorboard_queue_size: 1000
+ throughput_window_size: 100
+ timing_log_level: 0
+ timing_log_option: minmax
+ wandb_entity: null
+ wandb_exp_name: null
+ wandb_project: null
+ wandb_save_dir: null
+ mixed_precision: null
+ model:
+ _target_: megatron.bridge.models.qwen.qwen_provider.Qwen3ModelProvider
+ account_for_embedding_in_pipeline_split: false
+ account_for_loss_in_pipeline_split: false
+ activation_func:
+ _call_: false
+ _target_: torch.nn.functional.silu
+ activation_func_clamp_value: null
+ activation_func_fp8_input_store: false
+ add_bias_linear: false
+ add_qkv_bias: false
+ apply_query_key_layer_scaling: false
+ apply_residual_connection_post_layernorm: false
+ apply_rope_fusion: true
+ async_tensor_model_parallel_allreduce: false
+ attention_backend:
+ _args_:
+ - 5
+ _call_: true
+ _target_: megatron.core.transformer.enums.AttnBackend
+ attention_dropout: 0.0
+ attention_output_gate: false
+ attention_softmax_in_fp32: false
+ autocast_dtype:
+ _call_: false
+ _target_: torch.bfloat16
+ barrier_with_L1_time: true
+ batch_p2p_comm: true
+ batch_p2p_sync: true
+ bf16: true
+ bias_activation_fusion: false
+ bias_dropout_fusion: false
+ calculate_per_token_loss: true
+ clone_scatter_output_in_embedding: true
+ config_logger_dir: ''
+ context_parallel_size: 2
+ cp_comm_type: null
+ cpu_offloading: false
+ cpu_offloading_activations: true
+ cpu_offloading_double_buffering: false
+ cpu_offloading_num_layers: 0
+ cpu_offloading_weights: false
+ cross_entropy_fusion_impl: native
+ cross_entropy_loss_fusion: true
+ cuda_graph_impl: none
+ cuda_graph_retain_backward_graph: false
+ cuda_graph_scope: []
+ cuda_graph_use_single_mempool: false
+ cuda_graph_warmup_steps: 3
+ deallocate_pipeline_outputs: true
+ defer_embedding_wgrad_compute: false
+ delay_wgrad_compute: false
+ deterministic_mode: false
+ disable_bf16_reduced_precision_matmul: false
+ disable_parameter_transpose_cache: false
+ distribute_saved_activations: null
+ embedding_init_method:
+ _args_: []
+ _partial_: true
+ _target_: torch.nn.init.normal_
+ mean: 0.0
+ std: 0.02
+ embedding_init_method_std: 0.02
+ enable_autocast: false
+ enable_cuda_graph: false
+ expert_model_parallel_size: 1
+ expert_tensor_parallel_size: 1
+ external_cuda_graph: false
+ fallback_to_eager_attn: false
+ ffn_hidden_size: 9728
+ finalize_model_grads_func:
+ _args_: []
+ _partial_: true
+ _target_: megatron.core.distributed.finalize_model_grads.finalize_model_grads
+ pg_collection: null
+ fine_grained_activation_offloading: false
+ first_last_layers_bf16: false
+ flash_decode: false
+ fp16: false
+ fp16_lm_cross_entropy: false
+ fp32_residual_connection: false
+ fp4: null
+ fp4_param: false
+ fp4_quantizer_factory: null
+ fp4_recipe: nvfp4
+ fp8: null
+ fp8_amax_compute_algo: most_recent
+ fp8_amax_history_len: 1
+ fp8_dot_product_attention: false
+ fp8_interval: 1
+ fp8_margin: 0
+ fp8_multi_head_attention: false
+ fp8_param: false
+ fp8_quantizer_factory: null
+ fp8_recipe: delayed
+ fp8_wgrad: true
+ fused_single_qkv_rope: false
+ gated_linear_unit: true
+ generation_config: null
+ glu_linear_offset: 0.0
+ grad_scale_func:
+ _call_: false
+ _target_: megatron.core.optimizer.optimizer.MegatronOptimizer.scale_loss
+ grad_sync_func:
+ _call_: false
+ _target_: megatron.core.distributed.distributed_data_parallel.DistributedDataParallel.start_grad_sync
+ gradient_accumulation_fusion: false
+ hetereogenous_dist_checkpoint: false
+ heterogeneous_block_specs: false
+ hf_model_id: ./models/Qwen-NVARC
+ hidden_dropout: 0.0
+ hidden_size: 2560
+ hierarchical_context_parallel_sizes: null
+ inference_rng_tracker: false
+ inference_sampling_seed: 42
+ init_method:
+ _args_: []
+ _partial_: true
+ _target_: torch.nn.init.normal_
+ mean: 0.0
+ std: 0.02
+ init_method_std: 0.02
+ init_model_with_meta_device: false
+ is_hybrid_model: false
+ kv_channels: 128
+ layernorm_epsilon: 1.0e-06
+ layernorm_zero_centered_gamma: false
+ linear_attention_freq: null
+ linear_attention_type: null
+ linear_conv_kernel_dim: null
+ linear_key_head_dim: null
+ linear_num_key_heads: null
+ linear_num_value_heads: null
+ linear_value_head_dim: null
+ log_max_attention_logit: false
+ make_vocab_size_divisible_by: 16
+ mamba_head_dim: 64
+ mamba_num_groups: 8
+ mamba_num_heads: null
+ mamba_state_dim: 128
+ masked_softmax_fusion: true
+ max_position_embeddings: 40960
+ memory_efficient_layer_norm: false
+ microbatch_group_size_per_vp_stage: 1
+ min_offloaded_tensor_size: 1048576
+ mlp_chunks_for_prefill: 1
+ moe_apply_probs_on_input: false
+ moe_aux_loss_coeff: 0.0
+ moe_deepep_num_sms: 20
+ moe_enable_deepep: false
+ moe_expert_capacity_factor: null
+ moe_extended_tp: false
+ moe_ffn_hidden_size: null
+ moe_flex_dispatcher_backend: deepep
+ moe_grouped_gemm: false
+ moe_hybridep_num_sms: 16
+ moe_input_jitter_eps: null
+ moe_layer_freq: 1
+ moe_layer_recompute: false
+ moe_pad_expert_input_to_capacity: false
+ moe_per_layer_logging: false
+ moe_permute_fusion: false
+ moe_router_bias_update_rate: 0.0
+ moe_router_dtype: fp64
+ moe_router_enable_expert_bias: false
+ moe_router_force_load_balancing: false
+ moe_router_fusion: false
+ moe_router_group_topk: null
+ moe_router_load_balancing_type: none
+ moe_router_num_groups: null
+ moe_router_padding_for_fp8: false
+ moe_router_padding_for_quantization: false
+ moe_router_pre_softmax: false
+ moe_router_score_function: softmax
+ moe_router_topk: 2
+ moe_router_topk_limited_devices: null
+ moe_router_topk_scaling_factor: null
+ moe_shared_expert_gate: false
+ moe_shared_expert_intermediate_size: null
+ moe_shared_expert_overlap: false
+ moe_token_dispatcher_type: allgather
+ moe_token_drop_policy: probs
+ moe_token_dropping: false
+ moe_use_legacy_grouped_gemm: false
+ moe_z_loss_coeff: null
+ mrope_section: null
+ mtp_enabled: false
+ mtp_loss_scaling_factor: null
+ mtp_num_layers: null
+ mtp_standalone: false
+ multi_latent_attention: false
+ no_rope_freq: null
+ no_sync_func:
+ _call_: false
+ _target_: megatron.core.distributed.distributed_data_parallel.DistributedDataParallel.no_sync
+ normalization: RMSNorm
+ num_attention_heads: 32
+ num_layers: 36
+ num_layers_at_end_in_bf16: 1
+ num_layers_at_start_in_bf16: 1
+ num_layers_in_first_pipeline_stage: null
+ num_layers_in_last_pipeline_stage: null
+ num_microbatches_with_partial_activation_checkpoints: null
+ num_moe_experts: null
+ num_query_groups: 8
+ offload_modules: null
+ output_layer_init_method:
+ _args_: []
+ _partial_: true
+ _target_: torch.nn.init.normal_
+ mean: 0.0
+ std: 0.0023570226039551587
+ overlap_moe_expert_parallel_comm: false
+ overlap_p2p_comm: false
+ overlap_p2p_comm_warmup_flush: false
+ parallel_output: true
+ param_sync_func: null
+ params_dtype:
+ _call_: false
+ _target_: torch.bfloat16
+ perform_initialization: true
+ persist_layer_norm: false
+ pipeline_dtype:
+ _call_: false
+ _target_: torch.bfloat16
+ pipeline_model_parallel_comm_backend: null
+ pipeline_model_parallel_layout: null
+ pipeline_model_parallel_size: 1
+ position_embedding_type: rope
353
+ qk_clip: false
354
+ qk_clip_alpha: 0.5
355
+ qk_clip_threshold: 100
356
+ qk_layernorm: true
357
+ quant_recipe: null
358
+ recompute_granularity: full
359
+ recompute_method: uniform
360
+ recompute_modules:
361
+ - core_attn
362
+ recompute_num_layers: 1
363
+ restore_modelopt_state: false
364
+ rotary_base: 5000000
365
+ rotary_interleaved: false
366
+ rotary_percent: 1.0
367
+ scatter_embedding_sequence_parallel: true
368
+ seq_len_interpolation_factor: null
369
+ seq_length: 262144
370
+ sequence_parallel: false
371
+ share_embeddings_and_output_weights: true
372
+ should_pad_vocab: false
373
+ softmax_scale: null
374
+ softmax_type: vanilla
375
+ symmetric_ar_type: null
376
+ tensor_model_parallel_size: 1
377
+ test_mode: false
378
+ timers: null
379
+ tp_comm_atomic_ag: false
380
+ tp_comm_atomic_rs: false
381
+ tp_comm_bootstrap_backend: nccl
382
+ tp_comm_bulk_dgrad: true
383
+ tp_comm_bulk_wgrad: true
384
+ tp_comm_overlap: false
385
+ tp_comm_overlap_ag: true
386
+ tp_comm_overlap_cfg: null
387
+ tp_comm_overlap_disable_fc1: false
388
+ tp_comm_overlap_disable_qkv: false
389
+ tp_comm_overlap_rs: true
390
+ tp_comm_overlap_rs_dgrad: false
391
+ tp_comm_split_ag: true
392
+ tp_comm_split_rs: true
393
+ tp_only_amax_red: false
394
+ transformer_impl: transformer_engine
395
+ transformer_layer_spec:
396
+ _call_: false
397
+ _target_: megatron.bridge.models.gpt_provider.default_layer_spec
398
+ use_cpu_initialization: false
399
+ use_fused_weighted_squared_relu: false
400
+ use_kitchen: false
401
+ use_mamba_mem_eff_path: true
402
+ use_ring_exchange_p2p: false
403
+ use_te_activation_func: false
404
+ use_te_rng_tracker: false
405
+ use_transformer_engine_full_layer_spec: false
406
+ use_transformer_engine_op_fuser: false
407
+ variable_seq_lengths: false
408
+ virtual_pipeline_model_parallel_size: null
409
+ vocab_size: 16
410
+ wgrad_deferral_limit: 0
411
+ window_attn_skip_freq: null
412
+ window_size: null
413
+ nvrx_straggler: null
414
+ optimizer:
415
+ _target_: megatron.bridge.training.config.OptimizerConfig
416
+ adam_beta1: 0.9
417
+ adam_beta2: 0.98
418
+ adam_eps: 1.0e-08
419
+ barrier_with_L1_time: false
420
+ bf16: true
421
+ clip_grad: 0.5
422
+ config_logger_dir: ''
423
+ decoupled_lr: null
424
+ decoupled_min_lr: null
425
+ decoupled_weight_decay: true
426
+ exp_avg_dtype:
427
+ _call_: false
428
+ _target_: torch.float32
429
+ exp_avg_sq_dtype:
430
+ _call_: false
431
+ _target_: torch.float32
432
+ fp16: false
433
+ fp8_recipe: null
434
+ hysteresis: 2
435
+ initial_loss_scale: 4294967296
436
+ log_num_zeros_in_grad: false
437
+ loss_scale: null
438
+ loss_scale_window: 1000
439
+ lr: 0.0001
440
+ main_grads_dtype:
441
+ _call_: false
442
+ _target_: torch.float32
443
+ main_params_dtype:
444
+ _call_: false
445
+ _target_: torch.float32
446
+ min_loss_scale: 1.0
447
+ min_lr: 1.0e-07
448
+ muon_extra_scale_factor: 1.0
449
+ muon_fp32_matmul_prec: medium
450
+ muon_momentum: 0.95
451
+ muon_num_ns_steps: 5
452
+ muon_scale_mode: spectral
453
+ muon_split_qkv: true
454
+ muon_tp_mode: blockwise
455
+ muon_use_nesterov: false
456
+ optimizer: adam
457
+ optimizer_cpu_offload: false
458
+ optimizer_offload_fraction: 0.0
459
+ overlap_cpu_optimizer_d2h_h2d: false
460
+ overlap_param_gather: false
461
+ overlap_param_gather_with_optimizer_step: false
462
+ params_dtype: bfloat16
463
+ pin_cpu_grads: true
464
+ pin_cpu_params: true
465
+ reuse_grad_buf_for_mxfp8_param_ag: false
466
+ sgd_momentum: 0.9
467
+ store_param_remainders: true
468
+ timers: null
469
+ use_distributed_optimizer: true
470
+ use_precision_aware_optimizer: false
471
+ use_torch_optimizer_for_cpu_offload: false
472
+ weight_decay: 0.1
473
+ peft: null
474
+ profiling:
475
+ _target_: megatron.bridge.training.config.ProfilingConfig
476
+ memory_snapshot_path: snapshot.pickle
477
+ nvtx_ranges: false
478
+ profile_ranks:
479
+ - 0
480
+ profile_step_end: 12
481
+ profile_step_start: 10
482
+ record_memory_history: false
483
+ record_shapes: false
484
+ use_nsys_profiler: false
485
+ use_pytorch_profiler: false
486
+ rerun_state_machine:
487
+ _target_: megatron.bridge.training.config.RerunStateMachineConfig
488
+ check_for_nan_in_loss: true
489
+ check_for_spiky_loss: false
490
+ error_injection_rate: 0
491
+ error_injection_type: transient_error
492
+ rerun_mode: disabled
493
+ rng:
494
+ _target_: megatron.bridge.training.config.RNGConfig
495
+ data_parallel_random_init: false
496
+ inference_rng_tracker: false
497
+ seed: 1234
498
+ te_rng_tracker: false
499
+ scheduler:
500
+ _target_: megatron.bridge.training.config.SchedulerConfig
501
+ end_weight_decay: 0.1
502
+ lr_decay_iters: 12716
503
+ lr_decay_samples: null
504
+ lr_decay_steps: 3255296
505
+ lr_decay_style: linear
506
+ lr_warmup_fraction: null
507
+ lr_warmup_init: 1.0e-06
508
+ lr_warmup_iters: 200
509
+ lr_warmup_samples: 0
510
+ lr_warmup_steps: 51200
511
+ lr_wsd_decay_iters: null
512
+ lr_wsd_decay_samples: null
513
+ lr_wsd_decay_style: exponential
514
+ no_weight_decay_cond_type: null
515
+ override_opt_param_scheduler: false
516
+ start_weight_decay: 0.1
517
+ use_checkpoint_opt_param_scheduler: false
518
+ wd_incr_steps: 1528832
519
+ weight_decay_incr_style: constant
520
+ wsd_decay_steps: null
521
+ straggler: null
522
+ tensor_inspect: null
523
+ tokenizer:
524
+ _target_: megatron.bridge.training.tokenizers.config.TokenizerConfig
525
+ hf_tokenizer_kwargs: {}
526
+ image_tag_type: null
527
+ merge_file: null
528
+ special_tokens: null
529
+ tiktoken_num_special_tokens: 1000
530
+ tiktoken_pattern: null
531
+ tiktoken_special_tokens: null
532
+ tokenizer_model: ./models/Qwen-NVARC
533
+ tokenizer_prompt_format: null
534
+ tokenizer_type: HuggingFaceTokenizer
535
+ vocab_extra_ids: 0
536
+ vocab_file: null
537
+ vocab_size: null
538
+ train:
539
+ _target_: megatron.bridge.training.config.TrainingConfig
540
+ check_weight_hash_across_dp_replicas_interval: null
541
+ decrease_batch_size_if_needed: false
542
+ empty_unused_memory_level: 0
543
+ eval_interval: 1000
544
+ eval_iters: 100
545
+ exit_duration_in_mins: null
546
+ exit_interval: null
547
+ exit_signal:
548
+ _args_:
549
+ - 15
550
+ _call_: true
551
+ _target_: signal.Signals
552
+ exit_signal_handler: false
553
+ exit_signal_handler_for_dataloader: false
554
+ global_batch_size: 256
555
+ iterations_to_skip: []
556
+ manual_gc: false
557
+ manual_gc_eval: true
558
+ manual_gc_interval: 0
559
+ micro_batch_size: 1
560
+ rampup_batch_size: null
561
+ skip_train: false
562
+ train_iters: 5972
563
+ train_samples: null
564
+ train_sync_interval: null
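
A quick cross-check on the scheduler and train blocks above: the sample-based fields appear to be the iteration-based fields scaled by global_batch_size (256). A minimal sanity check of that assumed relationship, in Python:

    # Assumed relationship: *_steps = *_iters * global_batch_size (256).
    global_batch_size = 256
    assert 12716 * global_batch_size == 3255296   # lr_decay_iters  -> lr_decay_steps
    assert 200   * global_batch_size == 51200     # lr_warmup_iters -> lr_warmup_steps
    assert 5972  * global_batch_size == 1528832   # train_iters     -> wd_incr_steps
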
step_5800/policy/weights/iter_0000000/train_state.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9077193280ecd59384d5ae94d97a7943638810502f2ce08a7617aeb20b3586d2
3
+ size 3461
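
train_state.pt above (like the other binary files in this commit) is stored as a Git LFS pointer: three key/value lines giving the spec version, the sha256 oid, and the byte size of the real object. A minimal pointer parser, as a sketch (parse_lfs_pointer is illustrative, not a tool used by this repo):

    import pathlib

    def parse_lfs_pointer(path):
        # Each pointer line is "key value", e.g. "oid sha256:<hex>" or "size <bytes>".
        fields = {}
        for line in pathlib.Path(path).read_text().splitlines():
            key, _, value = line.partition(" ")
            fields[key] = value
        return fields

    ptr = parse_lfs_pointer("step_5800/policy/weights/iter_0000000/train_state.pt")
    print(ptr["oid"], ptr["size"])   # sha256:9077...86d2 3461
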
step_5800/policy/weights/latest_checkpointed_iteration.txt ADDED
@@ -0,0 +1 @@
1
+ 0
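
The file holds only the iteration number (0), which maps onto the zero-padded iter_0000000 directory name used above. Assuming that naming scheme:

    it = int(open("step_5800/policy/weights/latest_checkpointed_iteration.txt").read())
    print(f"iter_{it:07d}")   # -> iter_0000000
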
step_5800/policy/weights/latest_train_state.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9077193280ecd59384d5ae94d97a7943638810502f2ce08a7617aeb20b3586d2
3
+ size 3461
step_5800/train_dataloader.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:458e115a25d1c97a8415a462c7ac872cd8c36b2dd1561561119e578a52acef61
3
+ size 7336
step_5800/training_info.json ADDED
@@ -0,0 +1 @@
1
+ {"epoch": 0, "step": 5800, "total_steps": 5800, "consumed_samples": 1484800, "total_valid_tokens": 1684485110.0, "val:val_loss": 0.14940811693668365}
step_5972/config.yaml ADDED
@@ -0,0 +1,207 @@
1
+ checkpointing:
2
+ checkpoint_dir: results/qwen3_4b_sft
3
+ checkpoint_must_save_by: null
4
+ enabled: true
5
+ higher_is_better: false
6
+ keep_top_k: 3
7
+ metric_name: val:val_loss
8
+ save_period: 200
9
+ cluster:
10
+ gpus_per_node: 2
11
+ num_nodes: 1
12
+ data:
13
+ num_workers: 4
14
+ shuffle: true
15
+ train_dataset_path:
16
+ - ./data/hones
17
+ val_dataset_path: ./data/arc2_evaluation6
18
+ logger:
19
+ gpu_monitoring:
20
+ collection_interval: 10
21
+ flush_interval: 10
22
+ log_dir: logs/exp_019
23
+ mlflow_enabled: false
24
+ monitor_gpus: false
25
+ swanlab_enabled: false
26
+ tensorboard_enabled: false
27
+ wandb:
28
+ name: qwen3_4b_sft
29
+ project: arc2
30
+ wandb_enabled: true
31
+ policy:
32
+ activation_checkpointing_enabled: false
33
+ attn_implementation: flash_attention_2
34
+ dtensor_cfg:
35
+ enabled: false
36
+ dynamic_batching:
37
+ enabled: false
38
+ fsdp_offload_enabled: false
39
+ make_sequence_length_divisible_by: 64
40
+ max_grad_norm: null
41
+ megatron_cfg:
42
+ activation_checkpointing: true
43
+ apply_rope_fusion: true
44
+ bias_activation_fusion: false
45
+ context_parallel_size: 2
46
+ distributed_data_parallel_config:
47
+ average_in_collective: true
48
+ data_parallel_sharding_strategy: optim_grads_params
49
+ grad_reduce_in_fp32: true
50
+ overlap_grad_reduce: true
51
+ overlap_param_gather: true
52
+ empty_unused_memory_level: 1
53
+ enabled: true
54
+ env_vars:
55
+ AWS_OFI_NCCL_VERSION: 1.14.0
56
+ BASH_ENV: /etc/bash.bashrc
57
+ CAL_VERSION: 0.4.4.50
58
+ CUBLASMP_VERSION: 0.4.0.789
59
+ CUBLAS_VERSION: 12.9.0.13
60
+ CUDA_ARCH_LIST: 7.5 8.0 8.6 9.0 10.0 12.0
61
+ CUDA_DRIVER_VERSION: 575.51.03
62
+ CUDA_VERSION: 12.9.0.043
63
+ CUDA_VISIBLE_DEVICES: 6,7
64
+ CUDNN_FRONTEND_VERSION: 1.11.0
65
+ CUDNN_VERSION: 9.10.1.4
66
+ CUFFT_VERSION: 11.4.0.6
67
+ CUFILE_VERSION: 1.14.0.30
68
+ CURAND_VERSION: 10.3.10.19
69
+ CUSOLVER_VERSION: 11.7.4.40
70
+ CUSPARSELT_VERSION: 0.7.1.0
71
+ CUSPARSE_VERSION: 12.5.9.5
72
+ DALI_BUILD: ''
73
+ DALI_URL_SUFFIX: '120'
74
+ DALI_VERSION: 1.49.0
75
+ EFA_VERSION: 1.38.1
76
+ ENV: /etc/shinit_v2
77
+ GDRCOPY_VERSION: 2.4.4
78
+ HOME: /root
79
+ HOSTNAME: e6ad2ac15863
80
+ HPCX_VERSION: '2.23'
81
+ KMP_DUPLICATE_LIB_OK: 'True'
82
+ KMP_INIT_AT_FORK: 'FALSE'
83
+ LC_CTYPE: C.UTF-8
84
+ LD_LIBRARY_PATH: /usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
85
+ LESSCLOSE: /usr/bin/lesspipe %s %s
86
+ LESSOPEN: '| /usr/bin/lesspipe %s'
87
+ LIBRARY_PATH: '/usr/local/cuda/lib64/stubs:'
88
+ LS_COLORS: 'rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=00:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.avif=01;35:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.webp=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:*~=00;90:*#=00;90:*.bak=00;90:*.crdownload=00;90:*.dpkg-dist=00;90:*.dpkg-new=00;90:*.dpkg-old=00;90:*.dpkg-tmp=00;90:*.old=00;90:*.orig=00;90:*.part=00;90:*.rej=00;90:*.rpmnew=00;90:*.rpmorig=00;90:*.rpmsave=00;90:*.swp=00;90:*.tmp=00;90:*.ucf-dist=00;90:*.ucf-new=00;90:*.ucf-old=00;90:'
89
+ MODEL_OPT_VERSION: 0.27.1
90
+ MOFED_VERSION: 5.4-rdmacore50.0
91
+ NCCL_NET_PLUGIN: aws-ofi
92
+ NCCL_TUNER_PLUGIN: aws-ofi
93
+ NCCL_VERSION: 2.26.5
94
+ NEMO_RL_COMMIT: 90ecaaccdf66fb92f814b2325db81a0105ed933a
95
+ NEMO_RL_VENV_DIR: /opt/ray_venvs
96
+ NPP_VERSION: 12.4.0.27
97
+ NRL_CONTAINER: '1'
98
+ NRL_MEGATRON_CHECKPOINT_DIR: ./results/megatron
99
+ NSIGHT_COMPUTE_VERSION: 2025.2.0.11
100
+ NSIGHT_SYSTEMS_VERSION: 2025.3.1.90
101
+ NVIDIA_BUILD_ID: '244212578'
102
+ NVIDIA_BUILD_REF: 6ba29f15920b12095387b8c82cea05ac3d6d9732
103
+ NVIDIA_DRIVER_CAPABILITIES: compute,utility,video
104
+ NVIDIA_PRODUCT_NAME: CUDA
105
+ NVIDIA_REQUIRE_CUDA: cuda>=9.0
106
+ NVIDIA_REQUIRE_JETPACK_HOST_MOUNTS: ''
107
+ NVIDIA_VISIBLE_DEVICES: all
108
+ NVJITLINK_VERSION: 12.9.41
109
+ NVJPEG_VERSION: 12.4.0.16
110
+ NVSHMEM_VERSION: 3.2.5
111
+ OLDPWD: /workspace
112
+ OMPI_MCA_coll_hcoll_enable: '0'
113
+ OPAL_PREFIX: /opt/hpcx/ompi
114
+ OPENMPI_VERSION: 4.1.7
115
+ OPENUCX_VERSION: 1.19.0
116
+ PATH: /opt/nemo_rl_venv/bin:/opt/nemo_rl_venv/bin:/opt/nemo_rl_venv/bin:/root/.local/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/mpi/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/ucx/bin:/opt/amazon/efa/bin
117
+ POLYGRAPHY_VERSION: 0.49.20
118
+ PWD: /workspace/ARChitects
119
+ PYTHONBREAKPOINT: ray.util.rpdb._driver_set_trace
120
+ PYTHONPATH: '/workspace/NVARC/ARChitects/nemo-rl:'
121
+ PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True
122
+ RAY_CLIENT_MODE: '0'
123
+ RAY_ENABLE_UV_RUN_RUNTIME_ENV: '0'
124
+ RAY_USAGE_STATS_ENABLED: '0'
125
+ RDMACORE_VERSION: '50.0'
126
+ SHELL: /bin/bash
127
+ SHLVL: '2'
128
+ SWANLAB_API_HOST: https://api.swanlab.cn/api
129
+ SWANLAB_RUNTIME: user
130
+ SWANLAB_WEB_HOST: https://swanlab.cn
131
+ TERM: xterm
132
+ TORCH_CUDA_ARCH_LIST: '9.0'
133
+ TRANSFORMER_ENGINE_VERSION: '2.3'
134
+ TRTOSS_VERSION: ''
135
+ TRT_VERSION: 10.10.0.31
136
+ UV: /root/.local/bin/uv
137
+ UV_LINK_MODE: copy
138
+ UV_PROJECT_ENVIRONMENT: /opt/nemo_rl_venv
139
+ UV_RUN_RECURSION_DEPTH: '1'
140
+ VIRTUAL_ENV: /opt/nemo_rl_venv
141
+ VIRTUAL_ENV_PROMPT: nemo-rl
142
+ WANDB_SERVICE: 3-239782-unix-/tmp/wandb-239782-251940-2530276073/socket
143
+ _: /root/.local/bin/uv
144
+ _CUDA_COMPAT_PATH: /usr/local/cuda/compat
145
+ _CUDA_COMPAT_STATUS: System has unsupported display driver / cuda driver combination
146
+ (CUDA_ERROR_SYSTEM_DRIVER_MISMATCH) cuInit()=803
147
+ _MLFLOW_TELEMETRY_SESSION_ID: 8ac94e5a44a94a62a2cf3302ff4fa0c9
148
+ expert_model_parallel_size: 1
149
+ expert_tensor_parallel_size: 1
150
+ freeze_moe_router: true
151
+ moe_permute_fusion: false
152
+ moe_router_bias_update_rate: 0.0
153
+ moe_router_dtype: fp64
154
+ moe_router_load_balancing_type: none
155
+ num_layers_in_first_pipeline_stage: null
156
+ num_layers_in_last_pipeline_stage: null
157
+ optimizer:
158
+ adam_beta1: 0.9
159
+ adam_beta2: 0.98
160
+ adam_eps: 1.0e-08
161
+ bf16: true
162
+ clip_grad: 0.5
163
+ fp16: false
164
+ lr: 0.0001
165
+ min_lr: 1.0e-07
166
+ optimizer: adam
167
+ optimizer_cpu_offload: false
168
+ optimizer_offload_fraction: 0.0
169
+ params_dtype: bfloat16
170
+ sgd_momentum: 0.9
171
+ use_distributed_optimizer: true
172
+ use_precision_aware_optimizer: false
173
+ weight_decay: 0.1
174
+ pipeline_dtype: bfloat16
175
+ pipeline_model_parallel_size: 1
176
+ scheduler:
177
+ end_weight_decay: 0.1
178
+ lr_decay_iters: 12716
179
+ lr_decay_style: linear
180
+ lr_warmup_init: 1.0e-06
181
+ lr_warmup_iters: 200
182
+ start_weight_decay: 0.1
183
+ weight_decay_incr_style: constant
184
+ sequence_parallel: false
185
+ tensor_model_parallel_size: 1
186
+ train_iters: 5972
187
+ model_name: ./models/Qwen-NVARC
188
+ offload_optimizer_for_logprob: false
189
+ precision: bfloat16
190
+ sequence_packing:
191
+ algorithm: modified_first_fit_decreasing
192
+ enabled: true
193
+ sequence_length_round: 64
194
+ train_mb_tokens: 128000
195
+ tokenizer:
196
+ name: ./models/Qwen-NVARC
197
+ train_global_batch_size: 256
198
+ train_micro_batch_size: 1
199
+ sft:
200
+ max_num_epochs: 1
201
+ max_num_steps: 6400
202
+ seed: 24
203
+ val_at_start: true
204
+ val_batches: 200
205
+ val_global_batch_size: 256
206
+ val_micro_batch_size: 1
207
+ val_period: 200
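
The checkpointing block above describes a save-and-prune policy: a checkpoint is written every save_period (200) steps, and only the keep_top_k (3) best by metric_name (val:val_loss, where lower is better since higher_is_better is false) are retained. A minimal sketch of that ranking, not necessarily NeMo-RL's actual pruning code, with illustrative metric values for the earlier steps:

    def checkpoints_to_keep(ckpts, keep_top_k=3, higher_is_better=False):
        # ckpts: (step, metric) pairs; sort best-first and keep the top k.
        ranked = sorted(ckpts, key=lambda c: c[1], reverse=higher_is_better)
        return ranked[:keep_top_k]

    # 0.1494 at step 5800 comes from training_info.json; the other values are placeholders.
    print(checkpoints_to_keep([(5400, 0.1510), (5600, 0.1497), (5800, 0.1494), (5972, 0.1495)]))
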
step_5972/policy/weights/iter_0000000/.metadata ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63b2c2c7a6c21b171a30b50ae7dc76c9744532ea6b3c093434c81c412ad99548
3
+ size 329201
step_5972/policy/weights/iter_0000000/__0_1.distcp ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2f6811136b6e0fbc6c36bf350aee4b9e42c450265f2475895b613fc98ff26e7
3
+ size 12718313784
step_5972/policy/weights/iter_0000000/__1_1.distcp ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22f80411837876981796f026820ac72c19b06b79a478df2c332f912075adc25f
3
+ size 12717860926
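
Once git lfs has materialized these ~12.7 GB .distcp shards, the sha256 and size recorded in each pointer can be re-verified locally. A streaming sketch using only the standard library:

    import hashlib, os

    def sha256_of(path, chunk=1 << 20):
        h = hashlib.sha256()
        with open(path, "rb") as f:
            while block := f.read(chunk):
                h.update(block)
        return h.hexdigest()

    # Compare against the pointer's "oid sha256:..." and "size ..." lines.
    path = "step_5972/policy/weights/iter_0000000/__0_1.distcp"
    print(sha256_of(path), os.path.getsize(path))
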
step_5972/policy/weights/iter_0000000/common.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:293a19ff82e664ad14eeea37b1cdcfc976171b534d5ec99eff7d86a5dfade2af
3
+ size 1767
step_5972/policy/weights/iter_0000000/metadata.json ADDED
@@ -0,0 +1 @@
1
+ {"sharded_backend": "torch_dist", "sharded_backend_version": 1, "common_backend": "torch", "common_backend_version": 1}
step_5972/policy/weights/iter_0000000/modelopt_run_config.yaml ADDED
@@ -0,0 +1,203 @@
1
+ activation_func: <function silu at 0x7d0251c6b420>
2
+ activation_func_clamp_value: None
3
+ add_bias_linear: false
4
+ add_qkv_bias: false
5
+ apply_query_key_layer_scaling: false
6
+ apply_residual_connection_post_layernorm: false
7
+ apply_rope_fusion: true
8
+ attention_backend: AttnBackend.auto
9
+ attention_dropout: '0.0'
10
+ attention_output_gate: false
11
+ attention_softmax_in_fp32: false
12
+ autocast_dtype: torch.bfloat16
13
+ barrier_with_L1_time: true
14
+ bf16: true
15
+ bias_activation_fusion: false
16
+ bias_dropout_fusion: false
17
+ calculate_per_token_loss: true
18
+ clone_scatter_output_in_embedding: true
19
+ config_logger_dir: ''
20
+ cross_entropy_fusion_impl: native
21
+ cross_entropy_loss_fusion: true
22
+ defer_embedding_wgrad_compute: false
23
+ delay_wgrad_compute: false
24
+ deterministic_mode: false
25
+ disable_bf16_reduced_precision_matmul: false
26
+ disable_parameter_transpose_cache: false
27
+ distribute_saved_activations: None
28
+ enable_autocast: false
29
+ fallback_to_eager_attn: false
30
+ ffn_hidden_size: 9728
31
+ finalize_model_grads_func: functools.partial(<function finalize_model_grads at 0x7cfab95b9e40>,
32
+ pg_collection=None)
33
+ fine_grained_activation_offloading: false
34
+ first_last_layers_bf16: false
35
+ flash_decode: false
36
+ fp16: false
37
+ fp16_lm_cross_entropy: false
38
+ fp32_residual_connection: false
39
+ fused_single_qkv_rope: false
40
+ gated_linear_unit: true
41
+ generation_config: None
42
+ glu_linear_offset: '0.0'
43
+ grad_scale_func: <bound method MegatronOptimizer.scale_loss of <megatron.core.optimizer.optimizer.ChainedOptimizer
44
+ object at 0x7cf9d413cd70>>
45
+ grad_sync_func: "<bound method DistributedDataParallel.start_grad_sync of DistributedDataParallel(\n\
46
+ \ (module): CustomFloat16Module(\n (module): GPTModel(\n (embedding): LanguageModelEmbedding(\n\
47
+ \ (word_embeddings): VocabParallelEmbedding()\n (embedding_dropout):\
48
+ \ Dropout(p=0.0, inplace=False)\n )\n (rotary_pos_emb): RotaryEmbedding()\n\
49
+ \ (decoder): TransformerBlock(\n (layers): ModuleList(\n (0-35):\
50
+ \ 36 x TransformerLayer(\n (input_layernorm): IdentityOp()\n \
51
+ \ (self_attention): SelfAttention(\n (core_attention): TEDotProductAttention(\n\
52
+ \ (flash_attention): FlashAttention()\n (fused_attention):\
53
+ \ FusedAttention()\n (unfused_attention): UnfusedDotProductAttention(\n\
54
+ \ (scale_mask_softmax): FusedScaleMaskSoftmax()\n \
55
+ \ (attention_dropout): Dropout(p=0.0, inplace=False)\n )\n \
56
+ \ )\n (linear_proj): TERowParallelLinear(in_features=4096,\
57
+ \ out_features=2560, bias=False, TP=1)\n (linear_qkv): TELayerNormColumnParallelLinear(in_features=2560,\
58
+ \ out_features=6144, bias=False, TP=1)\n (q_layernorm): RMSNorm()\n\
59
+ \ (k_layernorm): RMSNorm()\n )\n (pre_cross_attn_layernorm):\
60
+ \ IdentityOp()\n (cross_attention): IdentityOp()\n (cross_attn_bda):\
61
+ \ IdentityFuncOp()\n (pre_mlp_layernorm): IdentityOp()\n (mlp):\
62
+ \ MLP(\n (linear_fc1): TELayerNormColumnParallelLinear(in_features=2560,\
63
+ \ out_features=19456, bias=False, TP=1)\n (linear_fc2): TERowParallelLinear(in_features=9728,\
64
+ \ out_features=2560, bias=False, TP=1)\n )\n )\n )\n\
65
+ \ (final_layernorm): RMSNorm()\n )\n (output_layer): ColumnParallelLinear(in_features=2560,\
66
+ \ out_features=16, bias=False, TP=1)\n )\n )\n)>"
67
+ gradient_accumulation_fusion: false
68
+ hetereogenous_dist_checkpoint: false
69
+ heterogeneous_block_specs: false
70
+ hf_model_id: ./models/Qwen-NVARC
71
+ hidden_dropout: '0.0'
72
+ hidden_size: 2560
73
+ is_hybrid_model: false
74
+ kv_channels: 128
75
+ layernorm_epsilon: 1e-06
76
+ layernorm_zero_centered_gamma: false
77
+ linear_attention_freq: None
78
+ linear_attention_type: None
79
+ linear_conv_kernel_dim: None
80
+ linear_key_head_dim: None
81
+ linear_num_key_heads: None
82
+ linear_num_value_heads: None
83
+ linear_value_head_dim: None
84
+ log_max_attention_logit: false
85
+ make_vocab_size_divisible_by: 16
86
+ mamba_head_dim: 64
87
+ mamba_num_groups: 8
88
+ mamba_num_heads: None
89
+ mamba_state_dim: 128
90
+ masked_softmax_fusion: true
91
+ max_position_embeddings: 40960
92
+ memory_efficient_layer_norm: false
93
+ min_offloaded_tensor_size: 1048576
94
+ mlp_chunks_for_prefill: 1
95
+ moe_apply_probs_on_input: false
96
+ moe_aux_loss_coeff: '0.0'
97
+ moe_deepep_num_sms: 20
98
+ moe_enable_deepep: false
99
+ moe_expert_capacity_factor: None
100
+ moe_extended_tp: false
101
+ moe_ffn_hidden_size: None
102
+ moe_flex_dispatcher_backend: deepep
103
+ moe_grouped_gemm: false
104
+ moe_hybridep_num_sms: 16
105
+ moe_input_jitter_eps: None
106
+ moe_layer_freq: 1
107
+ moe_pad_expert_input_to_capacity: false
108
+ moe_per_layer_logging: false
109
+ moe_permute_fusion: false
110
+ moe_router_bias_update_rate: '0.0'
111
+ moe_router_dtype: fp64
112
+ moe_router_enable_expert_bias: false
113
+ moe_router_force_load_balancing: false
114
+ moe_router_fusion: false
115
+ moe_router_group_topk: None
116
+ moe_router_load_balancing_type: none
117
+ moe_router_num_groups: None
118
+ moe_router_padding_for_quantization: false
119
+ moe_router_pre_softmax: false
120
+ moe_router_score_function: softmax
121
+ moe_router_topk: 2
122
+ moe_router_topk_limited_devices: None
123
+ moe_router_topk_scaling_factor: None
124
+ moe_shared_expert_gate: false
125
+ moe_shared_expert_intermediate_size: None
126
+ moe_shared_expert_overlap: false
127
+ moe_token_dispatcher_type: allgather
128
+ moe_token_drop_policy: probs
129
+ moe_token_dropping: false
130
+ moe_use_legacy_grouped_gemm: false
131
+ moe_z_loss_coeff: None
132
+ mrope_section: None
133
+ multi_latent_attention: false
134
+ no_rope_freq: None
135
+ no_sync_func: "<bound method DistributedDataParallel.no_sync of DistributedDataParallel(\n\
136
+ \ (module): CustomFloat16Module(\n (module): GPTModel(\n (embedding): LanguageModelEmbedding(\n\
137
+ \ (word_embeddings): VocabParallelEmbedding()\n (embedding_dropout):\
138
+ \ Dropout(p=0.0, inplace=False)\n )\n (rotary_pos_emb): RotaryEmbedding()\n\
139
+ \ (decoder): TransformerBlock(\n (layers): ModuleList(\n (0-35):\
140
+ \ 36 x TransformerLayer(\n (input_layernorm): IdentityOp()\n \
141
+ \ (self_attention): SelfAttention(\n (core_attention): TEDotProductAttention(\n\
142
+ \ (flash_attention): FlashAttention()\n (fused_attention):\
143
+ \ FusedAttention()\n (unfused_attention): UnfusedDotProductAttention(\n\
144
+ \ (scale_mask_softmax): FusedScaleMaskSoftmax()\n \
145
+ \ (attention_dropout): Dropout(p=0.0, inplace=False)\n )\n \
146
+ \ )\n (linear_proj): TERowParallelLinear(in_features=4096,\
147
+ \ out_features=2560, bias=False, TP=1)\n (linear_qkv): TELayerNormColumnParallelLinear(in_features=2560,\
148
+ \ out_features=6144, bias=False, TP=1)\n (q_layernorm): RMSNorm()\n\
149
+ \ (k_layernorm): RMSNorm()\n )\n (pre_cross_attn_layernorm):\
150
+ \ IdentityOp()\n (cross_attention): IdentityOp()\n (cross_attn_bda):\
151
+ \ IdentityFuncOp()\n (pre_mlp_layernorm): IdentityOp()\n (mlp):\
152
+ \ MLP(\n (linear_fc1): TELayerNormColumnParallelLinear(in_features=2560,\
153
+ \ out_features=19456, bias=False, TP=1)\n (linear_fc2): TERowParallelLinear(in_features=9728,\
154
+ \ out_features=2560, bias=False, TP=1)\n )\n )\n )\n\
155
+ \ (final_layernorm): RMSNorm()\n )\n (output_layer): ColumnParallelLinear(in_features=2560,\
156
+ \ out_features=16, bias=False, TP=1)\n )\n )\n)>"
157
+ normalization: RMSNorm
158
+ num_attention_heads: 32
159
+ num_layers: 36
160
+ num_layers_at_end_in_bf16: 1
161
+ num_layers_at_start_in_bf16: 1
162
+ num_moe_experts: None
163
+ num_query_groups: 8
164
+ nvidia_modelopt_version: 0.39.0
165
+ offload_modules: None
166
+ param_sync_func: None
167
+ params_dtype: torch.bfloat16
168
+ perform_initialization: true
169
+ persist_layer_norm: false
170
+ position_embedding_type: rope
171
+ qk_clip: false
172
+ qk_clip_alpha: '0.5'
173
+ qk_clip_threshold: 100
174
+ qk_layernorm: true
175
+ quant_recipe: None
176
+ restore_modelopt_state: false
177
+ rotary_base: 5000000
178
+ rotary_interleaved: false
179
+ rotary_percent: '1.0'
180
+ seq_len_interpolation_factor: None
181
+ seq_length: 262144
182
+ share_embeddings_and_output_weights: true
183
+ should_pad_vocab: false
184
+ softmax_scale: None
185
+ softmax_type: vanilla
186
+ symmetric_ar_type: None
187
+ test_mode: false
188
+ timers: None
189
+ transformer_impl: transformer_engine
190
+ transformer_layer_spec: <function default_layer_spec at 0x7cfa556ad440>
191
+ use_fused_weighted_squared_relu: false
192
+ use_kitchen: false
193
+ use_mamba_mem_eff_path: true
194
+ use_ring_exchange_p2p: false
195
+ use_te_activation_func: false
196
+ use_te_rng_tracker: false
197
+ use_transformer_engine_full_layer_spec: false
198
+ use_transformer_engine_op_fuser: false
199
+ variable_seq_lengths: false
200
+ vocab_size: 16
201
+ wgrad_deferral_limit: 0
202
+ window_attn_skip_freq: None
203
+ window_size: None
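
Note that modelopt_run_config.yaml above is a stringified record rather than a loadable config: numeric fields show up as quoted strings ('0.0'), and callables and bound methods as Python reprs (<function silu at 0x...>), which is what str()-ing each value before dumping produces. A tiny illustration of the effect, assuming PyYAML:

    import yaml
    from torch.nn.functional import silu

    record = {"activation_func": silu, "attention_dropout": 0.0}
    print(yaml.safe_dump({k: str(v) for k, v in record.items()}))
    # activation_func: <function silu at 0x...>
    # attention_dropout: '0.0'
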
step_5972/policy/weights/iter_0000000/run_config.yaml ADDED
@@ -0,0 +1,564 @@
1
+ _target_: megatron.bridge.training.config.ConfigContainer
2
+ checkpoint:
3
+ _target_: megatron.bridge.training.config.CheckpointConfig
4
+ async_save: false
5
+ ckpt_assume_constant_structure: false
6
+ ckpt_convert_format: null
7
+ ckpt_convert_save: null
8
+ ckpt_format: torch_dist
9
+ ckpt_step: null
10
+ dist_ckpt_optim_fully_reshardable: false
11
+ dist_ckpt_save_pre_mcore_014: false
12
+ dist_ckpt_strictness: assume_ok_unexpected
13
+ distrib_optim_fully_reshardable_mem_efficient: false
14
+ exit_on_missing_checkpoint: false
15
+ finetune: true
16
+ fully_parallel_load: true
17
+ fully_parallel_save: true
18
+ load: null
19
+ load_main_params_from_ckpt: false
20
+ load_optim: true
21
+ load_rng: false
22
+ most_recent_k: -1
23
+ non_persistent_ckpt_type: null
24
+ non_persistent_global_ckpt_dir: null
25
+ non_persistent_local_ckpt_algo: fully_parallel
26
+ non_persistent_local_ckpt_dir: null
27
+ non_persistent_save_interval: null
28
+ pretrained_checkpoint: ./results/megatron/model_._models_Qwen-NVARC
29
+ replication: false
30
+ replication_factor: 2
31
+ replication_jump: null
32
+ save: /workspace/ARChitects/results/qwen3_4b_sft/tmp_step_5972/policy/weights
33
+ save_interval: 100
34
+ save_optim: true
35
+ save_rng: true
36
+ save_tokenizer_assets: true
37
+ strict_fsdp_dtensor_load: false
38
+ use_checkpoint_args: false
39
+ use_persistent_ckpt_worker: true
40
+ comm_overlap: null
41
+ dataset: null
42
+ ddp:
43
+ _target_: megatron.bridge.training.config.DistributedDataParallelConfig
44
+ align_param_gather: false
45
+ average_in_collective: false
46
+ bucket_size: 40000000
47
+ check_for_large_grads: false
48
+ check_for_nan_in_grad: true
49
+ data_parallel_sharding_strategy: optim_grads_params
50
+ delay_wgrad_compute: false
51
+ disable_symmetric_registration: false
52
+ fp8_param_gather: false
53
+ fsdp_double_buffer: false
54
+ grad_reduce_in_fp32: true
55
+ gradient_reduce_div_fusion: true
56
+ keep_fp8_transpose_cache: false
57
+ nccl_ub: false
58
+ num_distributed_optimizer_instances: 1
59
+ outer_dp_sharding_strategy: no_shard
60
+ overlap_grad_reduce: true
61
+ overlap_param_gather: true
62
+ pad_buckets_for_high_nccl_busbw: false
63
+ preserve_fp32_weights: true
64
+ reduce_scatter_with_fp32_accumulation: false
65
+ reuse_grad_buf_for_mxfp8_param_ag: false
66
+ suggested_communication_unit_size: null
67
+ use_custom_fsdp: false
68
+ use_distributed_optimizer: true
69
+ use_megatron_fsdp: false
70
+ dist:
71
+ _target_: megatron.bridge.training.config.DistributedInitConfig
72
+ align_grad_reduce: true
73
+ disable_jit_fuser: false
74
+ distributed_backend: nccl
75
+ distributed_timeout_minutes: 10
76
+ distributed_timeout_seconds_after_init: null
77
+ enable_megatron_core_experimental: false
78
+ external_gpu_device_mapping: true
79
+ high_priority_stream_groups: null
80
+ lazy_init: false
81
+ local_rank: 0
82
+ nccl_communicator_config_path: null
83
+ sharp_enabled_group: null
84
+ use_gloo_process_groups: true
85
+ use_megatron_fsdp: false
86
+ use_sharp: false
87
+ use_torch_fsdp2: false
88
+ use_tp_pp_dp_mapping: false
89
+ ft: null
90
+ inprocess_restart: null
91
+ logger:
92
+ _target_: megatron.bridge.training.config.LoggerConfig
93
+ filter_warnings: true
94
+ log_energy: false
95
+ log_interval: 100
96
+ log_l2_norm_grad_to_tensorboard: false
97
+ log_loss_scale_to_tensorboard: true
98
+ log_memory_to_tensorboard: false
99
+ log_params_norm: false
100
+ log_progress: false
101
+ log_runtime_to_tensorboard: false
102
+ log_throughput: false
103
+ log_throughput_to_tensorboard: false
104
+ log_timers_to_tensorboard: false
105
+ log_validation_ppl_to_tensorboard: false
106
+ log_world_size_to_tensorboard: false
107
+ logging_level: 0
108
+ memory_keys: null
109
+ modules_to_filter: null
110
+ runtime_time_unit: hours
111
+ save_config_filepath: null
112
+ set_level_for_all_loggers: false
113
+ tensorboard_dir: null
114
+ tensorboard_log_interval: 1
115
+ tensorboard_queue_size: 1000
116
+ throughput_window_size: 100
117
+ timing_log_level: 0
118
+ timing_log_option: minmax
119
+ wandb_entity: null
120
+ wandb_exp_name: null
121
+ wandb_project: null
122
+ wandb_save_dir: null
123
+ mixed_precision: null
124
+ model:
125
+ _target_: megatron.bridge.models.qwen.qwen_provider.Qwen3ModelProvider
126
+ account_for_embedding_in_pipeline_split: false
127
+ account_for_loss_in_pipeline_split: false
128
+ activation_func:
129
+ _call_: false
130
+ _target_: torch.nn.functional.silu
131
+ activation_func_clamp_value: null
132
+ activation_func_fp8_input_store: false
133
+ add_bias_linear: false
134
+ add_qkv_bias: false
135
+ apply_query_key_layer_scaling: false
136
+ apply_residual_connection_post_layernorm: false
137
+ apply_rope_fusion: true
138
+ async_tensor_model_parallel_allreduce: false
139
+ attention_backend:
140
+ _args_:
141
+ - 5
142
+ _call_: true
143
+ _target_: megatron.core.transformer.enums.AttnBackend
144
+ attention_dropout: 0.0
145
+ attention_output_gate: false
146
+ attention_softmax_in_fp32: false
147
+ autocast_dtype:
148
+ _call_: false
149
+ _target_: torch.bfloat16
150
+ barrier_with_L1_time: true
151
+ batch_p2p_comm: true
152
+ batch_p2p_sync: true
153
+ bf16: true
154
+ bias_activation_fusion: false
155
+ bias_dropout_fusion: false
156
+ calculate_per_token_loss: true
157
+ clone_scatter_output_in_embedding: true
158
+ config_logger_dir: ''
159
+ context_parallel_size: 2
160
+ cp_comm_type: null
161
+ cpu_offloading: false
162
+ cpu_offloading_activations: true
163
+ cpu_offloading_double_buffering: false
164
+ cpu_offloading_num_layers: 0
165
+ cpu_offloading_weights: false
166
+ cross_entropy_fusion_impl: native
167
+ cross_entropy_loss_fusion: true
168
+ cuda_graph_impl: none
169
+ cuda_graph_retain_backward_graph: false
170
+ cuda_graph_scope: []
171
+ cuda_graph_use_single_mempool: false
172
+ cuda_graph_warmup_steps: 3
173
+ deallocate_pipeline_outputs: true
174
+ defer_embedding_wgrad_compute: false
175
+ delay_wgrad_compute: false
176
+ deterministic_mode: false
177
+ disable_bf16_reduced_precision_matmul: false
178
+ disable_parameter_transpose_cache: false
179
+ distribute_saved_activations: null
180
+ embedding_init_method:
181
+ _args_: []
182
+ _partial_: true
183
+ _target_: torch.nn.init.normal_
184
+ mean: 0.0
185
+ std: 0.02
186
+ embedding_init_method_std: 0.02
187
+ enable_autocast: false
188
+ enable_cuda_graph: false
189
+ expert_model_parallel_size: 1
190
+ expert_tensor_parallel_size: 1
191
+ external_cuda_graph: false
192
+ fallback_to_eager_attn: false
193
+ ffn_hidden_size: 9728
194
+ finalize_model_grads_func:
195
+ _args_: []
196
+ _partial_: true
197
+ _target_: megatron.core.distributed.finalize_model_grads.finalize_model_grads
198
+ pg_collection: null
199
+ fine_grained_activation_offloading: false
200
+ first_last_layers_bf16: false
201
+ flash_decode: false
202
+ fp16: false
203
+ fp16_lm_cross_entropy: false
204
+ fp32_residual_connection: false
205
+ fp4: null
206
+ fp4_param: false
207
+ fp4_quantizer_factory: null
208
+ fp4_recipe: nvfp4
209
+ fp8: null
210
+ fp8_amax_compute_algo: most_recent
211
+ fp8_amax_history_len: 1
212
+ fp8_dot_product_attention: false
213
+ fp8_interval: 1
214
+ fp8_margin: 0
215
+ fp8_multi_head_attention: false
216
+ fp8_param: false
217
+ fp8_quantizer_factory: null
218
+ fp8_recipe: delayed
219
+ fp8_wgrad: true
220
+ fused_single_qkv_rope: false
221
+ gated_linear_unit: true
222
+ generation_config: null
223
+ glu_linear_offset: 0.0
224
+ grad_scale_func:
225
+ _call_: false
226
+ _target_: megatron.core.optimizer.optimizer.MegatronOptimizer.scale_loss
227
+ grad_sync_func:
228
+ _call_: false
229
+ _target_: megatron.core.distributed.distributed_data_parallel.DistributedDataParallel.start_grad_sync
230
+ gradient_accumulation_fusion: false
231
+ hetereogenous_dist_checkpoint: false
232
+ heterogeneous_block_specs: false
233
+ hf_model_id: ./models/Qwen-NVARC
234
+ hidden_dropout: 0.0
235
+ hidden_size: 2560
236
+ hierarchical_context_parallel_sizes: null
237
+ inference_rng_tracker: false
238
+ inference_sampling_seed: 42
239
+ init_method:
240
+ _args_: []
241
+ _partial_: true
242
+ _target_: torch.nn.init.normal_
243
+ mean: 0.0
244
+ std: 0.02
245
+ init_method_std: 0.02
246
+ init_model_with_meta_device: false
247
+ is_hybrid_model: false
248
+ kv_channels: 128
249
+ layernorm_epsilon: 1.0e-06
250
+ layernorm_zero_centered_gamma: false
251
+ linear_attention_freq: null
252
+ linear_attention_type: null
253
+ linear_conv_kernel_dim: null
254
+ linear_key_head_dim: null
255
+ linear_num_key_heads: null
256
+ linear_num_value_heads: null
257
+ linear_value_head_dim: null
258
+ log_max_attention_logit: false
259
+ make_vocab_size_divisible_by: 16
260
+ mamba_head_dim: 64
261
+ mamba_num_groups: 8
262
+ mamba_num_heads: null
263
+ mamba_state_dim: 128
264
+ masked_softmax_fusion: true
265
+ max_position_embeddings: 40960
266
+ memory_efficient_layer_norm: false
267
+ microbatch_group_size_per_vp_stage: 1
268
+ min_offloaded_tensor_size: 1048576
269
+ mlp_chunks_for_prefill: 1
270
+ moe_apply_probs_on_input: false
271
+ moe_aux_loss_coeff: 0.0
272
+ moe_deepep_num_sms: 20
273
+ moe_enable_deepep: false
274
+ moe_expert_capacity_factor: null
275
+ moe_extended_tp: false
276
+ moe_ffn_hidden_size: null
277
+ moe_flex_dispatcher_backend: deepep
278
+ moe_grouped_gemm: false
279
+ moe_hybridep_num_sms: 16
280
+ moe_input_jitter_eps: null
281
+ moe_layer_freq: 1
282
+ moe_layer_recompute: false
283
+ moe_pad_expert_input_to_capacity: false
284
+ moe_per_layer_logging: false
285
+ moe_permute_fusion: false
286
+ moe_router_bias_update_rate: 0.0
287
+ moe_router_dtype: fp64
288
+ moe_router_enable_expert_bias: false
289
+ moe_router_force_load_balancing: false
290
+ moe_router_fusion: false
291
+ moe_router_group_topk: null
292
+ moe_router_load_balancing_type: none
293
+ moe_router_num_groups: null
294
+ moe_router_padding_for_fp8: false
295
+ moe_router_padding_for_quantization: false
296
+ moe_router_pre_softmax: false
297
+ moe_router_score_function: softmax
298
+ moe_router_topk: 2
299
+ moe_router_topk_limited_devices: null
300
+ moe_router_topk_scaling_factor: null
301
+ moe_shared_expert_gate: false
302
+ moe_shared_expert_intermediate_size: null
303
+ moe_shared_expert_overlap: false
304
+ moe_token_dispatcher_type: allgather
305
+ moe_token_drop_policy: probs
306
+ moe_token_dropping: false
307
+ moe_use_legacy_grouped_gemm: false
308
+ moe_z_loss_coeff: null
309
+ mrope_section: null
310
+ mtp_enabled: false
311
+ mtp_loss_scaling_factor: null
312
+ mtp_num_layers: null
313
+ mtp_standalone: false
314
+ multi_latent_attention: false
315
+ no_rope_freq: null
316
+ no_sync_func:
317
+ _call_: false
318
+ _target_: megatron.core.distributed.distributed_data_parallel.DistributedDataParallel.no_sync
319
+ normalization: RMSNorm
320
+ num_attention_heads: 32
321
+ num_layers: 36
322
+ num_layers_at_end_in_bf16: 1
323
+ num_layers_at_start_in_bf16: 1
324
+ num_layers_in_first_pipeline_stage: null
325
+ num_layers_in_last_pipeline_stage: null
326
+ num_microbatches_with_partial_activation_checkpoints: null
327
+ num_moe_experts: null
328
+ num_query_groups: 8
329
+ offload_modules: null
330
+ output_layer_init_method:
331
+ _args_: []
332
+ _partial_: true
333
+ _target_: torch.nn.init.normal_
334
+ mean: 0.0
335
+ std: 0.0023570226039551587
336
+ overlap_moe_expert_parallel_comm: false
337
+ overlap_p2p_comm: false
338
+ overlap_p2p_comm_warmup_flush: false
339
+ parallel_output: true
340
+ param_sync_func: null
341
+ params_dtype:
342
+ _call_: false
343
+ _target_: torch.bfloat16
344
+ perform_initialization: true
345
+ persist_layer_norm: false
346
+ pipeline_dtype:
347
+ _call_: false
348
+ _target_: torch.bfloat16
349
+ pipeline_model_parallel_comm_backend: null
350
+ pipeline_model_parallel_layout: null
351
+ pipeline_model_parallel_size: 1
352
+ position_embedding_type: rope
353
+ qk_clip: false
354
+ qk_clip_alpha: 0.5
355
+ qk_clip_threshold: 100
356
+ qk_layernorm: true
357
+ quant_recipe: null
358
+ recompute_granularity: full
359
+ recompute_method: uniform
360
+ recompute_modules:
361
+ - core_attn
362
+ recompute_num_layers: 1
363
+ restore_modelopt_state: false
364
+ rotary_base: 5000000
365
+ rotary_interleaved: false
366
+ rotary_percent: 1.0
367
+ scatter_embedding_sequence_parallel: true
368
+ seq_len_interpolation_factor: null
369
+ seq_length: 262144
370
+ sequence_parallel: false
371
+ share_embeddings_and_output_weights: true
372
+ should_pad_vocab: false
373
+ softmax_scale: null
374
+ softmax_type: vanilla
375
+ symmetric_ar_type: null
376
+ tensor_model_parallel_size: 1
377
+ test_mode: false
378
+ timers: null
379
+ tp_comm_atomic_ag: false
380
+ tp_comm_atomic_rs: false
381
+ tp_comm_bootstrap_backend: nccl
382
+ tp_comm_bulk_dgrad: true
383
+ tp_comm_bulk_wgrad: true
384
+ tp_comm_overlap: false
385
+ tp_comm_overlap_ag: true
386
+ tp_comm_overlap_cfg: null
387
+ tp_comm_overlap_disable_fc1: false
388
+ tp_comm_overlap_disable_qkv: false
389
+ tp_comm_overlap_rs: true
390
+ tp_comm_overlap_rs_dgrad: false
391
+ tp_comm_split_ag: true
392
+ tp_comm_split_rs: true
393
+ tp_only_amax_red: false
394
+ transformer_impl: transformer_engine
395
+ transformer_layer_spec:
396
+ _call_: false
397
+ _target_: megatron.bridge.models.gpt_provider.default_layer_spec
398
+ use_cpu_initialization: false
399
+ use_fused_weighted_squared_relu: false
400
+ use_kitchen: false
401
+ use_mamba_mem_eff_path: true
402
+ use_ring_exchange_p2p: false
403
+ use_te_activation_func: false
404
+ use_te_rng_tracker: false
405
+ use_transformer_engine_full_layer_spec: false
406
+ use_transformer_engine_op_fuser: false
407
+ variable_seq_lengths: false
408
+ virtual_pipeline_model_parallel_size: null
409
+ vocab_size: 16
410
+ wgrad_deferral_limit: 0
411
+ window_attn_skip_freq: null
412
+ window_size: null
413
+ nvrx_straggler: null
414
+ optimizer:
415
+ _target_: megatron.bridge.training.config.OptimizerConfig
416
+ adam_beta1: 0.9
417
+ adam_beta2: 0.98
418
+ adam_eps: 1.0e-08
419
+ barrier_with_L1_time: false
420
+ bf16: true
421
+ clip_grad: 0.5
422
+ config_logger_dir: ''
423
+ decoupled_lr: null
424
+ decoupled_min_lr: null
425
+ decoupled_weight_decay: true
426
+ exp_avg_dtype:
427
+ _call_: false
428
+ _target_: torch.float32
429
+ exp_avg_sq_dtype:
430
+ _call_: false
431
+ _target_: torch.float32
432
+ fp16: false
433
+ fp8_recipe: null
434
+ hysteresis: 2
435
+ initial_loss_scale: 4294967296
436
+ log_num_zeros_in_grad: false
437
+ loss_scale: null
438
+ loss_scale_window: 1000
439
+ lr: 0.0001
440
+ main_grads_dtype:
441
+ _call_: false
442
+ _target_: torch.float32
443
+ main_params_dtype:
444
+ _call_: false
445
+ _target_: torch.float32
446
+ min_loss_scale: 1.0
447
+ min_lr: 1.0e-07
448
+ muon_extra_scale_factor: 1.0
449
+ muon_fp32_matmul_prec: medium
450
+ muon_momentum: 0.95
451
+ muon_num_ns_steps: 5
452
+ muon_scale_mode: spectral
453
+ muon_split_qkv: true
454
+ muon_tp_mode: blockwise
455
+ muon_use_nesterov: false
456
+ optimizer: adam
457
+ optimizer_cpu_offload: false
458
+ optimizer_offload_fraction: 0.0
459
+ overlap_cpu_optimizer_d2h_h2d: false
460
+ overlap_param_gather: false
461
+ overlap_param_gather_with_optimizer_step: false
462
+ params_dtype: bfloat16
463
+ pin_cpu_grads: true
464
+ pin_cpu_params: true
465
+ reuse_grad_buf_for_mxfp8_param_ag: false
466
+ sgd_momentum: 0.9
467
+ store_param_remainders: true
468
+ timers: null
469
+ use_distributed_optimizer: true
470
+ use_precision_aware_optimizer: false
471
+ use_torch_optimizer_for_cpu_offload: false
472
+ weight_decay: 0.1
473
+ peft: null
474
+ profiling:
475
+ _target_: megatron.bridge.training.config.ProfilingConfig
476
+ memory_snapshot_path: snapshot.pickle
477
+ nvtx_ranges: false
478
+ profile_ranks:
479
+ - 0
480
+ profile_step_end: 12
481
+ profile_step_start: 10
482
+ record_memory_history: false
483
+ record_shapes: false
484
+ use_nsys_profiler: false
485
+ use_pytorch_profiler: false
486
+ rerun_state_machine:
487
+ _target_: megatron.bridge.training.config.RerunStateMachineConfig
488
+ check_for_nan_in_loss: true
489
+ check_for_spiky_loss: false
490
+ error_injection_rate: 0
491
+ error_injection_type: transient_error
492
+ rerun_mode: disabled
493
+ rng:
494
+ _target_: megatron.bridge.training.config.RNGConfig
495
+ data_parallel_random_init: false
496
+ inference_rng_tracker: false
497
+ seed: 1234
498
+ te_rng_tracker: false
499
+ scheduler:
500
+ _target_: megatron.bridge.training.config.SchedulerConfig
501
+ end_weight_decay: 0.1
502
+ lr_decay_iters: 12716
503
+ lr_decay_samples: null
504
+ lr_decay_steps: 3255296
505
+ lr_decay_style: linear
506
+ lr_warmup_fraction: null
507
+ lr_warmup_init: 1.0e-06
508
+ lr_warmup_iters: 200
509
+ lr_warmup_samples: 0
510
+ lr_warmup_steps: 51200
511
+ lr_wsd_decay_iters: null
512
+ lr_wsd_decay_samples: null
513
+ lr_wsd_decay_style: exponential
514
+ no_weight_decay_cond_type: null
515
+ override_opt_param_scheduler: false
516
+ start_weight_decay: 0.1
517
+ use_checkpoint_opt_param_scheduler: false
518
+ wd_incr_steps: 1528832
519
+ weight_decay_incr_style: constant
520
+ wsd_decay_steps: null
521
+ straggler: null
522
+ tensor_inspect: null
523
+ tokenizer:
524
+ _target_: megatron.bridge.training.tokenizers.config.TokenizerConfig
525
+ hf_tokenizer_kwargs: {}
526
+ image_tag_type: null
527
+ merge_file: null
528
+ special_tokens: null
529
+ tiktoken_num_special_tokens: 1000
530
+ tiktoken_pattern: null
531
+ tiktoken_special_tokens: null
532
+ tokenizer_model: ./models/Qwen-NVARC
533
+ tokenizer_prompt_format: null
534
+ tokenizer_type: HuggingFaceTokenizer
535
+ vocab_extra_ids: 0
536
+ vocab_file: null
537
+ vocab_size: null
538
+ train:
539
+ _target_: megatron.bridge.training.config.TrainingConfig
540
+ check_weight_hash_across_dp_replicas_interval: null
541
+ decrease_batch_size_if_needed: false
542
+ empty_unused_memory_level: 0
543
+ eval_interval: 1000
544
+ eval_iters: 100
545
+ exit_duration_in_mins: null
546
+ exit_interval: null
547
+ exit_signal:
548
+ _args_:
549
+ - 15
550
+ _call_: true
551
+ _target_: signal.Signals
552
+ exit_signal_handler: false
553
+ exit_signal_handler_for_dataloader: false
554
+ global_batch_size: 256
555
+ iterations_to_skip: []
556
+ manual_gc: false
557
+ manual_gc_eval: true
558
+ manual_gc_interval: 0
559
+ micro_batch_size: 1
560
+ rampup_batch_size: null
561
+ skip_train: false
562
+ train_iters: 5972
563
+ train_samples: null
564
+ train_sync_interval: null
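
Putting the scheduler numbers above together: the learning rate warms up linearly from lr_warmup_init (1e-6) to lr (1e-4) over the first 200 iterations, then decays linearly toward min_lr (1e-7) at lr_decay_iters (12716); since the run stops at train_iters 5972, it ends mid-decay at roughly 5.4e-5. A sketch, assuming decay is measured from the end of warmup (Megatron's exact bookkeeping may differ slightly):

    def lr_at(it, warmup_init=1e-6, lr=1e-4, min_lr=1e-7,
              warmup_iters=200, decay_iters=12716):
        if it < warmup_iters:                                  # linear warmup
            return warmup_init + (lr - warmup_init) * it / warmup_iters
        frac = min((it - warmup_iters) / (decay_iters - warmup_iters), 1.0)
        return lr - (lr - min_lr) * frac                       # linear decay

    print(lr_at(5972))   # ~5.4e-05 at the end of this run
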
step_5972/policy/weights/iter_0000000/train_state.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9077193280ecd59384d5ae94d97a7943638810502f2ce08a7617aeb20b3586d2
3
+ size 3461
step_5972/policy/weights/latest_checkpointed_iteration.txt ADDED
@@ -0,0 +1 @@
1
+ 0
step_5972/policy/weights/latest_train_state.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9077193280ecd59384d5ae94d97a7943638810502f2ce08a7617aeb20b3586d2
3
+ size 3461
step_5972/train_dataloader.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12bfcb136c615985e1571fc19377a9c8101d41c662c01f02e87c20a192ea5137
3
+ size 7336
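
train_dataloader.pt is a small (7,336-byte) torch-serialized object holding dataloader state; its exact schema isn't documented in this repo, so treat the keys as unknown. With the LFS object materialized, it can presumably be inspected with a plain torch.load:

    import torch

    # weights_only=False may be needed on newer PyTorch for non-tensor payloads.
    state = torch.load("step_5972/train_dataloader.pt", map_location="cpu", weights_only=False)
    print(type(state))
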