Delete step_5600

Browse files

Files changed (15) hide show

step_5600/config.yaml +0 -207
step_5600/policy/weights/iter_0000000/.metadata +0 -3
step_5600/policy/weights/iter_0000000/__0_0.distcp +0 -3
step_5600/policy/weights/iter_0000000/__0_1.distcp +0 -3
step_5600/policy/weights/iter_0000000/__1_0.distcp +0 -3
step_5600/policy/weights/iter_0000000/__1_1.distcp +0 -3
step_5600/policy/weights/iter_0000000/common.pt +0 -3
step_5600/policy/weights/iter_0000000/metadata.json +0 -1
step_5600/policy/weights/iter_0000000/modelopt_run_config.yaml +0 -203
step_5600/policy/weights/iter_0000000/run_config.yaml +0 -564
step_5600/policy/weights/iter_0000000/train_state.pt +0 -3
step_5600/policy/weights/latest_checkpointed_iteration.txt +0 -1
step_5600/policy/weights/latest_train_state.pt +0 -3
step_5600/train_dataloader.pt +0 -3
step_5600/training_info.json +0 -1

step_5600/config.yaml DELETED Viewed

@@ -1,207 +0,0 @@
-checkpointing:
-  checkpoint_dir: results/qwen3_4b_sft
-  checkpoint_must_save_by: null
-  enabled: true
-  higher_is_better: false
-  keep_top_k: 3
-  metric_name: val:val_loss
-  save_period: 200
-cluster:
-  gpus_per_node: 2
-  num_nodes: 1
-data:
-  num_workers: 4
-  shuffle: true
-  train_dataset_path:
-  - ./data/hones
-  val_dataset_path: ./data/arc2_evaluation6
-logger:
-  gpu_monitoring:
-    collection_interval: 10
-    flush_interval: 10
-  log_dir: logs/exp_019
-  mlflow_enabled: false
-  monitor_gpus: false
-  swanlab_enabled: false
-  tensorboard_enabled: false
-  wandb:
-    name: qwen3_4b_sft
-    project: arc2
-  wandb_enabled: true
-policy:
-  activation_checkpointing_enabled: false
-  attn_implementation: flash_attention_2
-  dtensor_cfg:
-    enabled: false
-  dynamic_batching:
-    enabled: false
-  fsdp_offload_enabled: false
-  make_sequence_length_divisible_by: 64
-  max_grad_norm: null
-  megatron_cfg:
-    activation_checkpointing: true
-    apply_rope_fusion: true
-    bias_activation_fusion: false
-    context_parallel_size: 2
-    distributed_data_parallel_config:
-      average_in_collective: true
-      data_parallel_sharding_strategy: optim_grads_params
-      grad_reduce_in_fp32: true
-      overlap_grad_reduce: true
-      overlap_param_gather: true
-    empty_unused_memory_level: 1
-    enabled: true
-    env_vars:
-      AWS_OFI_NCCL_VERSION: 1.14.0
-      BASH_ENV: /etc/bash.bashrc
-      CAL_VERSION: 0.4.4.50
-      CUBLASMP_VERSION: 0.4.0.789
-      CUBLAS_VERSION: 12.9.0.13
-      CUDA_ARCH_LIST: 7.5 8.0 8.6 9.0 10.0 12.0
-      CUDA_DRIVER_VERSION: 575.51.03
-      CUDA_VERSION: 12.9.0.043
-      CUDA_VISIBLE_DEVICES: 6,7
-      CUDNN_FRONTEND_VERSION: 1.11.0
-      CUDNN_VERSION: 9.10.1.4
-      CUFFT_VERSION: 11.4.0.6
-      CUFILE_VERSION: 1.14.0.30
-      CURAND_VERSION: 10.3.10.19
-      CUSOLVER_VERSION: 11.7.4.40
-      CUSPARSELT_VERSION: 0.7.1.0
-      CUSPARSE_VERSION: 12.5.9.5
-      DALI_BUILD: ''
-      DALI_URL_SUFFIX: '120'
-      DALI_VERSION: 1.49.0
-      EFA_VERSION: 1.38.1
-      ENV: /etc/shinit_v2
-      GDRCOPY_VERSION: 2.4.4
-      HOME: /root
-      HOSTNAME: e6ad2ac15863
-      HPCX_VERSION: '2.23'
-      KMP_DUPLICATE_LIB_OK: 'True'
-      KMP_INIT_AT_FORK: 'FALSE'
-      LC_CTYPE: C.UTF-8
-      LD_LIBRARY_PATH: /usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
-      LESSCLOSE: /usr/bin/lesspipe %s %s
-      LESSOPEN: '| /usr/bin/lesspipe %s'
-      LIBRARY_PATH: '/usr/local/cuda/lib64/stubs:'
-      LS_COLORS: 'rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=00:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.avif=01;35:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.webp=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:*~=00;90:*#=00;90:*.bak=00;90:*.crdownload=00;90:*.dpkg-dist=00;90:*.dpkg-new=00;90:*.dpkg-old=00;90:*.dpkg-tmp=00;90:*.old=00;90:*.orig=00;90:*.part=00;90:*.rej=00;90:*.rpmnew=00;90:*.rpmorig=00;90:*.rpmsave=00;90:*.swp=00;90:*.tmp=00;90:*.ucf-dist=00;90:*.ucf-new=00;90:*.ucf-old=00;90:'
-      MODEL_OPT_VERSION: 0.27.1
-      MOFED_VERSION: 5.4-rdmacore50.0
-      NCCL_NET_PLUGIN: aws-ofi
-      NCCL_TUNER_PLUGIN: aws-ofi
-      NCCL_VERSION: 2.26.5
-      NEMO_RL_COMMIT: 90ecaaccdf66fb92f814b2325db81a0105ed933a
-      NEMO_RL_VENV_DIR: /opt/ray_venvs
-      NPP_VERSION: 12.4.0.27
-      NRL_CONTAINER: '1'
-      NRL_MEGATRON_CHECKPOINT_DIR: ./results/megatron
-      NSIGHT_COMPUTE_VERSION: 2025.2.0.11
-      NSIGHT_SYSTEMS_VERSION: 2025.3.1.90
-      NVIDIA_BUILD_ID: '244212578'
-      NVIDIA_BUILD_REF: 6ba29f15920b12095387b8c82cea05ac3d6d9732
-      NVIDIA_DRIVER_CAPABILITIES: compute,utility,video
-      NVIDIA_PRODUCT_NAME: CUDA
-      NVIDIA_REQUIRE_CUDA: cuda>=9.0
-      NVIDIA_REQUIRE_JETPACK_HOST_MOUNTS: ''
-      NVIDIA_VISIBLE_DEVICES: all
-      NVJITLINK_VERSION: 12.9.41
-      NVJPEG_VERSION: 12.4.0.16
-      NVSHMEM_VERSION: 3.2.5
-      OLDPWD: /workspace
-      OMPI_MCA_coll_hcoll_enable: '0'
-      OPAL_PREFIX: /opt/hpcx/ompi
-      OPENMPI_VERSION: 4.1.7
-      OPENUCX_VERSION: 1.19.0
-      PATH: /opt/nemo_rl_venv/bin:/opt/nemo_rl_venv/bin:/opt/nemo_rl_venv/bin:/root/.local/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/mpi/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/ucx/bin:/opt/amazon/efa/bin
-      POLYGRAPHY_VERSION: 0.49.20
-      PWD: /workspace/ARChitects
-      PYTHONBREAKPOINT: ray.util.rpdb._driver_set_trace
-      PYTHONPATH: '/workspace/NVARC/ARChitects/nemo-rl:'
-      PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True
-      RAY_CLIENT_MODE: '0'
-      RAY_ENABLE_UV_RUN_RUNTIME_ENV: '0'
-      RAY_USAGE_STATS_ENABLED: '0'
-      RDMACORE_VERSION: '50.0'
-      SHELL: /bin/bash
-      SHLVL: '2'
-      SWANLAB_API_HOST: https://api.swanlab.cn/api
-      SWANLAB_RUNTIME: user
-      SWANLAB_WEB_HOST: https://swanlab.cn
-      TERM: xterm
-      TORCH_CUDA_ARCH_LIST: '9.0'
-      TRANSFORMER_ENGINE_VERSION: '2.3'
-      TRTOSS_VERSION: ''
-      TRT_VERSION: 10.10.0.31
-      UV: /root/.local/bin/uv
-      UV_LINK_MODE: copy
-      UV_PROJECT_ENVIRONMENT: /opt/nemo_rl_venv
-      UV_RUN_RECURSION_DEPTH: '1'
-      VIRTUAL_ENV: /opt/nemo_rl_venv
-      VIRTUAL_ENV_PROMPT: nemo-rl
-      WANDB_SERVICE: 3-239782-unix-/tmp/wandb-239782-251940-2530276073/socket
-      _: /root/.local/bin/uv
-      _CUDA_COMPAT_PATH: /usr/local/cuda/compat
-      _CUDA_COMPAT_STATUS: System has unsupported display driver / cuda driver combination
-        (CUDA_ERROR_SYSTEM_DRIVER_MISMATCH) cuInit()=803
-      _MLFLOW_TELEMETRY_SESSION_ID: 8ac94e5a44a94a62a2cf3302ff4fa0c9
-    expert_model_parallel_size: 1
-    expert_tensor_parallel_size: 1
-    freeze_moe_router: true
-    moe_permute_fusion: false
-    moe_router_bias_update_rate: 0.0
-    moe_router_dtype: fp64
-    moe_router_load_balancing_type: none
-    num_layers_in_first_pipeline_stage: null
-    num_layers_in_last_pipeline_stage: null
-    optimizer:
-      adam_beta1: 0.9
-      adam_beta2: 0.98
-      adam_eps: 1.0e-08
-      bf16: true
-      clip_grad: 0.5
-      fp16: false
-      lr: 0.0001
-      min_lr: 1.0e-07
-      optimizer: adam
-      optimizer_cpu_offload: false
-      optimizer_offload_fraction: 0.0
-      params_dtype: bfloat16
-      sgd_momentum: 0.9
-      use_distributed_optimizer: true
-      use_precision_aware_optimizer: false
-      weight_decay: 0.1
-    pipeline_dtype: bfloat16
-    pipeline_model_parallel_size: 1
-    scheduler:
-      end_weight_decay: 0.1
-      lr_decay_iters: 12716
-      lr_decay_style: linear
-      lr_warmup_init: 1.0e-06
-      lr_warmup_iters: 200
-      start_weight_decay: 0.1
-      weight_decay_incr_style: constant
-    sequence_parallel: false
-    tensor_model_parallel_size: 1
-    train_iters: 5972
-  model_name: ./models/Qwen-NVARC
-  offload_optimizer_for_logprob: false
-  precision: bfloat16
-  sequence_packing:
-    algorithm: modified_first_fit_decreasing
-    enabled: true
-    sequence_length_round: 64
-    train_mb_tokens: 128000
-  tokenizer:
-    name: ./models/Qwen-NVARC
-  train_global_batch_size: 256
-  train_micro_batch_size: 1
-sft:
-  max_num_epochs: 1
-  max_num_steps: 6400
-  seed: 24
-  val_at_start: true
-  val_batches: 200
-  val_global_batch_size: 256
-  val_micro_batch_size: 1
-  val_period: 200

step_5600/policy/weights/iter_0000000/.metadata DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1a1c916057dfe0e2002fe62982907832c5f702012c7360c6613f4a610084f748
-size 329201

step_5600/policy/weights/iter_0000000/__0_0.distcp DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:901973afc07dfb6d733533ab116c5681ede28f1cbc1d6e92e9f6d1a8c20806ea
-size 12718332319

step_5600/policy/weights/iter_0000000/__0_1.distcp DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:fe086722749eb01b41a72cdcf74497f8c3ecafb54a559dc7d7f048f942d46398
-size 12718313784

step_5600/policy/weights/iter_0000000/__1_0.distcp DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a7be36693c193175f7bbf318b48669253e255d8272481d0669f2c5e15e5aba1c
-size 12717813616

step_5600/policy/weights/iter_0000000/__1_1.distcp DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7b305a117794a7753c18722aef5568bc6c5ab5e8e27e9a319b905a141cd33c5e
-size 12717860926

step_5600/policy/weights/iter_0000000/common.pt DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ca3b2e874e687352cb92c06fbffd56051ad75663c60fb5d275365fa00e02a4bb
-size 1767

step_5600/policy/weights/iter_0000000/metadata.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"sharded_backend": "torch_dist", "sharded_backend_version": 1, "common_backend": "torch", "common_backend_version": 1}

step_5600/policy/weights/iter_0000000/modelopt_run_config.yaml DELETED Viewed

@@ -1,203 +0,0 @@
-activation_func: <function silu at 0x7d0251c6b420>
-activation_func_clamp_value: None
-add_bias_linear: false
-add_qkv_bias: false
-apply_query_key_layer_scaling: false
-apply_residual_connection_post_layernorm: false
-apply_rope_fusion: true
-attention_backend: AttnBackend.auto
-attention_dropout: '0.0'
-attention_output_gate: false
-attention_softmax_in_fp32: false
-autocast_dtype: torch.bfloat16
-barrier_with_L1_time: true
-bf16: true
-bias_activation_fusion: false
-bias_dropout_fusion: false
-calculate_per_token_loss: true
-clone_scatter_output_in_embedding: true
-config_logger_dir: ''
-cross_entropy_fusion_impl: native
-cross_entropy_loss_fusion: true
-defer_embedding_wgrad_compute: false
-delay_wgrad_compute: false
-deterministic_mode: false
-disable_bf16_reduced_precision_matmul: false
-disable_parameter_transpose_cache: false
-distribute_saved_activations: None
-enable_autocast: false
-fallback_to_eager_attn: false
-ffn_hidden_size: 9728
-finalize_model_grads_func: functools.partial(<function finalize_model_grads at 0x7cfab95b9e40>,
-  pg_collection=None)
-fine_grained_activation_offloading: false
-first_last_layers_bf16: false
-flash_decode: false
-fp16: false
-fp16_lm_cross_entropy: false
-fp32_residual_connection: false
-fused_single_qkv_rope: false
-gated_linear_unit: true
-generation_config: None
-glu_linear_offset: '0.0'
-grad_scale_func: <bound method MegatronOptimizer.scale_loss of <megatron.core.optimizer.optimizer.ChainedOptimizer
-  object at 0x7cf9d413cd70>>
-grad_sync_func: "<bound method DistributedDataParallel.start_grad_sync of DistributedDataParallel(\n\
-  \  (module): CustomFloat16Module(\n    (module): GPTModel(\n      (embedding): LanguageModelEmbedding(\n\
-  \        (word_embeddings): VocabParallelEmbedding()\n        (embedding_dropout):\
-  \ Dropout(p=0.0, inplace=False)\n      )\n      (rotary_pos_emb): RotaryEmbedding()\n\
-  \      (decoder): TransformerBlock(\n        (layers): ModuleList(\n          (0-35):\
-  \ 36 x TransformerLayer(\n            (input_layernorm): IdentityOp()\n        \
-  \    (self_attention): SelfAttention(\n              (core_attention): TEDotProductAttention(\n\
-  \                (flash_attention): FlashAttention()\n                (fused_attention):\
-  \ FusedAttention()\n                (unfused_attention): UnfusedDotProductAttention(\n\
-  \                  (scale_mask_softmax): FusedScaleMaskSoftmax()\n             \
-  \     (attention_dropout): Dropout(p=0.0, inplace=False)\n                )\n  \
-  \            )\n              (linear_proj): TERowParallelLinear(in_features=4096,\
-  \ out_features=2560, bias=False, TP=1)\n              (linear_qkv): TELayerNormColumnParallelLinear(in_features=2560,\
-  \ out_features=6144, bias=False, TP=1)\n              (q_layernorm): RMSNorm()\n\
-  \              (k_layernorm): RMSNorm()\n            )\n            (pre_cross_attn_layernorm):\
-  \ IdentityOp()\n            (cross_attention): IdentityOp()\n            (cross_attn_bda):\
-  \ IdentityFuncOp()\n            (pre_mlp_layernorm): IdentityOp()\n            (mlp):\
-  \ MLP(\n              (linear_fc1): TELayerNormColumnParallelLinear(in_features=2560,\
-  \ out_features=19456, bias=False, TP=1)\n              (linear_fc2): TERowParallelLinear(in_features=9728,\
-  \ out_features=2560, bias=False, TP=1)\n            )\n          )\n        )\n\
-  \        (final_layernorm): RMSNorm()\n      )\n      (output_layer): ColumnParallelLinear(in_features=2560,\
-  \ out_features=16, bias=False, TP=1)\n    )\n  )\n)>"
-gradient_accumulation_fusion: false
-hetereogenous_dist_checkpoint: false
-heterogeneous_block_specs: false
-hf_model_id: ./models/Qwen-NVARC
-hidden_dropout: '0.0'
-hidden_size: 2560
-is_hybrid_model: false
-kv_channels: 128
-layernorm_epsilon: 1e-06
-layernorm_zero_centered_gamma: false
-linear_attention_freq: None
-linear_attention_type: None
-linear_conv_kernel_dim: None
-linear_key_head_dim: None
-linear_num_key_heads: None
-linear_num_value_heads: None
-linear_value_head_dim: None
-log_max_attention_logit: false
-make_vocab_size_divisible_by: 16
-mamba_head_dim: 64
-mamba_num_groups: 8
-mamba_num_heads: None
-mamba_state_dim: 128
-masked_softmax_fusion: true
-max_position_embeddings: 40960
-memory_efficient_layer_norm: false
-min_offloaded_tensor_size: 1048576
-mlp_chunks_for_prefill: 1
-moe_apply_probs_on_input: false
-moe_aux_loss_coeff: '0.0'
-moe_deepep_num_sms: 20
-moe_enable_deepep: false
-moe_expert_capacity_factor: None
-moe_extended_tp: false
-moe_ffn_hidden_size: None
-moe_flex_dispatcher_backend: deepep
-moe_grouped_gemm: false
-moe_hybridep_num_sms: 16
-moe_input_jitter_eps: None
-moe_layer_freq: 1
-moe_pad_expert_input_to_capacity: false
-moe_per_layer_logging: false
-moe_permute_fusion: false
-moe_router_bias_update_rate: '0.0'
-moe_router_dtype: fp64
-moe_router_enable_expert_bias: false
-moe_router_force_load_balancing: false
-moe_router_fusion: false
-moe_router_group_topk: None
-moe_router_load_balancing_type: none
-moe_router_num_groups: None
-moe_router_padding_for_quantization: false
-moe_router_pre_softmax: false
-moe_router_score_function: softmax
-moe_router_topk: 2
-moe_router_topk_limited_devices: None
-moe_router_topk_scaling_factor: None
-moe_shared_expert_gate: false
-moe_shared_expert_intermediate_size: None
-moe_shared_expert_overlap: false
-moe_token_dispatcher_type: allgather
-moe_token_drop_policy: probs
-moe_token_dropping: false
-moe_use_legacy_grouped_gemm: false
-moe_z_loss_coeff: None
-mrope_section: None
-multi_latent_attention: false
-no_rope_freq: None
-no_sync_func: "<bound method DistributedDataParallel.no_sync of DistributedDataParallel(\n\
-  \  (module): CustomFloat16Module(\n    (module): GPTModel(\n      (embedding): LanguageModelEmbedding(\n\
-  \        (word_embeddings): VocabParallelEmbedding()\n        (embedding_dropout):\
-  \ Dropout(p=0.0, inplace=False)\n      )\n      (rotary_pos_emb): RotaryEmbedding()\n\
-  \      (decoder): TransformerBlock(\n        (layers): ModuleList(\n          (0-35):\
-  \ 36 x TransformerLayer(\n            (input_layernorm): IdentityOp()\n        \
-  \    (self_attention): SelfAttention(\n              (core_attention): TEDotProductAttention(\n\
-  \                (flash_attention): FlashAttention()\n                (fused_attention):\
-  \ FusedAttention()\n                (unfused_attention): UnfusedDotProductAttention(\n\
-  \                  (scale_mask_softmax): FusedScaleMaskSoftmax()\n             \
-  \     (attention_dropout): Dropout(p=0.0, inplace=False)\n                )\n  \
-  \            )\n              (linear_proj): TERowParallelLinear(in_features=4096,\
-  \ out_features=2560, bias=False, TP=1)\n              (linear_qkv): TELayerNormColumnParallelLinear(in_features=2560,\
-  \ out_features=6144, bias=False, TP=1)\n              (q_layernorm): RMSNorm()\n\
-  \              (k_layernorm): RMSNorm()\n            )\n            (pre_cross_attn_layernorm):\
-  \ IdentityOp()\n            (cross_attention): IdentityOp()\n            (cross_attn_bda):\
-  \ IdentityFuncOp()\n            (pre_mlp_layernorm): IdentityOp()\n            (mlp):\
-  \ MLP(\n              (linear_fc1): TELayerNormColumnParallelLinear(in_features=2560,\
-  \ out_features=19456, bias=False, TP=1)\n              (linear_fc2): TERowParallelLinear(in_features=9728,\
-  \ out_features=2560, bias=False, TP=1)\n            )\n          )\n        )\n\
-  \        (final_layernorm): RMSNorm()\n      )\n      (output_layer): ColumnParallelLinear(in_features=2560,\
-  \ out_features=16, bias=False, TP=1)\n    )\n  )\n)>"
-normalization: RMSNorm
-num_attention_heads: 32
-num_layers: 36
-num_layers_at_end_in_bf16: 1
-num_layers_at_start_in_bf16: 1
-num_moe_experts: None
-num_query_groups: 8
-nvidia_modelopt_version: 0.39.0
-offload_modules: None
-param_sync_func: None
-params_dtype: torch.bfloat16
-perform_initialization: true
-persist_layer_norm: false
-position_embedding_type: rope
-qk_clip: false
-qk_clip_alpha: '0.5'
-qk_clip_threshold: 100
-qk_layernorm: true
-quant_recipe: None
-restore_modelopt_state: false
-rotary_base: 5000000
-rotary_interleaved: false
-rotary_percent: '1.0'
-seq_len_interpolation_factor: None
-seq_length: 262144
-share_embeddings_and_output_weights: true
-should_pad_vocab: false
-softmax_scale: None
-softmax_type: vanilla
-symmetric_ar_type: None
-test_mode: false
-timers: None
-transformer_impl: transformer_engine
-transformer_layer_spec: <function default_layer_spec at 0x7cfa556ad440>
-use_fused_weighted_squared_relu: false
-use_kitchen: false
-use_mamba_mem_eff_path: true
-use_ring_exchange_p2p: false
-use_te_activation_func: false
-use_te_rng_tracker: false
-use_transformer_engine_full_layer_spec: false
-use_transformer_engine_op_fuser: false
-variable_seq_lengths: false
-vocab_size: 16
-wgrad_deferral_limit: 0
-window_attn_skip_freq: None
-window_size: None

step_5600/policy/weights/iter_0000000/run_config.yaml DELETED Viewed

@@ -1,564 +0,0 @@
-_target_: megatron.bridge.training.config.ConfigContainer
-checkpoint:
-  _target_: megatron.bridge.training.config.CheckpointConfig
-  async_save: false
-  ckpt_assume_constant_structure: false
-  ckpt_convert_format: null
-  ckpt_convert_save: null
-  ckpt_format: torch_dist
-  ckpt_step: null
-  dist_ckpt_optim_fully_reshardable: false
-  dist_ckpt_save_pre_mcore_014: false
-  dist_ckpt_strictness: assume_ok_unexpected
-  distrib_optim_fully_reshardable_mem_efficient: false
-  exit_on_missing_checkpoint: false
-  finetune: true
-  fully_parallel_load: true
-  fully_parallel_save: true
-  load: null
-  load_main_params_from_ckpt: false
-  load_optim: true
-  load_rng: false
-  most_recent_k: -1
-  non_persistent_ckpt_type: null
-  non_persistent_global_ckpt_dir: null
-  non_persistent_local_ckpt_algo: fully_parallel
-  non_persistent_local_ckpt_dir: null
-  non_persistent_save_interval: null
-  pretrained_checkpoint: ./results/megatron/model_._models_Qwen-NVARC
-  replication: false
-  replication_factor: 2
-  replication_jump: null
-  save: /workspace/ARChitects/results/qwen3_4b_sft/tmp_step_5600/policy/weights
-  save_interval: 100
-  save_optim: true
-  save_rng: true
-  save_tokenizer_assets: true
-  strict_fsdp_dtensor_load: false
-  use_checkpoint_args: false
-  use_persistent_ckpt_worker: true
-comm_overlap: null
-dataset: null
-ddp:
-  _target_: megatron.bridge.training.config.DistributedDataParallelConfig
-  align_param_gather: false
-  average_in_collective: false
-  bucket_size: 40000000
-  check_for_large_grads: false
-  check_for_nan_in_grad: true
-  data_parallel_sharding_strategy: optim_grads_params
-  delay_wgrad_compute: false
-  disable_symmetric_registration: false
-  fp8_param_gather: false
-  fsdp_double_buffer: false
-  grad_reduce_in_fp32: true
-  gradient_reduce_div_fusion: true
-  keep_fp8_transpose_cache: false
-  nccl_ub: false
-  num_distributed_optimizer_instances: 1
-  outer_dp_sharding_strategy: no_shard
-  overlap_grad_reduce: true
-  overlap_param_gather: true
-  pad_buckets_for_high_nccl_busbw: false
-  preserve_fp32_weights: true
-  reduce_scatter_with_fp32_accumulation: false
-  reuse_grad_buf_for_mxfp8_param_ag: false
-  suggested_communication_unit_size: null
-  use_custom_fsdp: false
-  use_distributed_optimizer: true
-  use_megatron_fsdp: false
-dist:
-  _target_: megatron.bridge.training.config.DistributedInitConfig
-  align_grad_reduce: true
-  disable_jit_fuser: false
-  distributed_backend: nccl
-  distributed_timeout_minutes: 10
-  distributed_timeout_seconds_after_init: null
-  enable_megatron_core_experimental: false
-  external_gpu_device_mapping: true
-  high_priority_stream_groups: null
-  lazy_init: false
-  local_rank: 0
-  nccl_communicator_config_path: null
-  sharp_enabled_group: null
-  use_gloo_process_groups: true
-  use_megatron_fsdp: false
-  use_sharp: false
-  use_torch_fsdp2: false
-  use_tp_pp_dp_mapping: false
-ft: null
-inprocess_restart: null
-logger:
-  _target_: megatron.bridge.training.config.LoggerConfig
-  filter_warnings: true
-  log_energy: false
-  log_interval: 100
-  log_l2_norm_grad_to_tensorboard: false
-  log_loss_scale_to_tensorboard: true
-  log_memory_to_tensorboard: false
-  log_params_norm: false
-  log_progress: false
-  log_runtime_to_tensorboard: false
-  log_throughput: false
-  log_throughput_to_tensorboard: false
-  log_timers_to_tensorboard: false
-  log_validation_ppl_to_tensorboard: false
-  log_world_size_to_tensorboard: false
-  logging_level: 0
-  memory_keys: null
-  modules_to_filter: null
-  runtime_time_unit: hours
-  save_config_filepath: null
-  set_level_for_all_loggers: false
-  tensorboard_dir: null
-  tensorboard_log_interval: 1
-  tensorboard_queue_size: 1000
-  throughput_window_size: 100
-  timing_log_level: 0
-  timing_log_option: minmax
-  wandb_entity: null
-  wandb_exp_name: null
-  wandb_project: null
-  wandb_save_dir: null
-mixed_precision: null
-model:
-  _target_: megatron.bridge.models.qwen.qwen_provider.Qwen3ModelProvider
-  account_for_embedding_in_pipeline_split: false
-  account_for_loss_in_pipeline_split: false
-  activation_func:
-    _call_: false
-    _target_: torch.nn.functional.silu
-  activation_func_clamp_value: null
-  activation_func_fp8_input_store: false
-  add_bias_linear: false
-  add_qkv_bias: false
-  apply_query_key_layer_scaling: false
-  apply_residual_connection_post_layernorm: false
-  apply_rope_fusion: true
-  async_tensor_model_parallel_allreduce: false
-  attention_backend:
-    _args_:
-    - 5
-    _call_: true
-    _target_: megatron.core.transformer.enums.AttnBackend
-  attention_dropout: 0.0
-  attention_output_gate: false
-  attention_softmax_in_fp32: false
-  autocast_dtype:
-    _call_: false
-    _target_: torch.bfloat16
-  barrier_with_L1_time: true
-  batch_p2p_comm: true
-  batch_p2p_sync: true
-  bf16: true
-  bias_activation_fusion: false
-  bias_dropout_fusion: false
-  calculate_per_token_loss: true
-  clone_scatter_output_in_embedding: true
-  config_logger_dir: ''
-  context_parallel_size: 2
-  cp_comm_type: null
-  cpu_offloading: false
-  cpu_offloading_activations: true
-  cpu_offloading_double_buffering: false
-  cpu_offloading_num_layers: 0
-  cpu_offloading_weights: false
-  cross_entropy_fusion_impl: native
-  cross_entropy_loss_fusion: true
-  cuda_graph_impl: none
-  cuda_graph_retain_backward_graph: false
-  cuda_graph_scope: []
-  cuda_graph_use_single_mempool: false
-  cuda_graph_warmup_steps: 3
-  deallocate_pipeline_outputs: true
-  defer_embedding_wgrad_compute: false
-  delay_wgrad_compute: false
-  deterministic_mode: false
-  disable_bf16_reduced_precision_matmul: false
-  disable_parameter_transpose_cache: false
-  distribute_saved_activations: null
-  embedding_init_method:
-    _args_: []
-    _partial_: true
-    _target_: torch.nn.init.normal_
-    mean: 0.0
-    std: 0.02
-  embedding_init_method_std: 0.02
-  enable_autocast: false
-  enable_cuda_graph: false
-  expert_model_parallel_size: 1
-  expert_tensor_parallel_size: 1
-  external_cuda_graph: false
-  fallback_to_eager_attn: false
-  ffn_hidden_size: 9728
-  finalize_model_grads_func:
-    _args_: []
-    _partial_: true
-    _target_: megatron.core.distributed.finalize_model_grads.finalize_model_grads
-    pg_collection: null
-  fine_grained_activation_offloading: false
-  first_last_layers_bf16: false
-  flash_decode: false
-  fp16: false
-  fp16_lm_cross_entropy: false
-  fp32_residual_connection: false
-  fp4: null
-  fp4_param: false
-  fp4_quantizer_factory: null
-  fp4_recipe: nvfp4
-  fp8: null
-  fp8_amax_compute_algo: most_recent
-  fp8_amax_history_len: 1
-  fp8_dot_product_attention: false
-  fp8_interval: 1
-  fp8_margin: 0
-  fp8_multi_head_attention: false
-  fp8_param: false
-  fp8_quantizer_factory: null
-  fp8_recipe: delayed
-  fp8_wgrad: true
-  fused_single_qkv_rope: false
-  gated_linear_unit: true
-  generation_config: null
-  glu_linear_offset: 0.0
-  grad_scale_func:
-    _call_: false
-    _target_: megatron.core.optimizer.optimizer.MegatronOptimizer.scale_loss
-  grad_sync_func:
-    _call_: false
-    _target_: megatron.core.distributed.distributed_data_parallel.DistributedDataParallel.start_grad_sync
-  gradient_accumulation_fusion: false
-  hetereogenous_dist_checkpoint: false
-  heterogeneous_block_specs: false
-  hf_model_id: ./models/Qwen-NVARC
-  hidden_dropout: 0.0
-  hidden_size: 2560
-  hierarchical_context_parallel_sizes: null
-  inference_rng_tracker: false
-  inference_sampling_seed: 42
-  init_method:
-    _args_: []
-    _partial_: true
-    _target_: torch.nn.init.normal_
-    mean: 0.0
-    std: 0.02
-  init_method_std: 0.02
-  init_model_with_meta_device: false
-  is_hybrid_model: false
-  kv_channels: 128
-  layernorm_epsilon: 1.0e-06
-  layernorm_zero_centered_gamma: false
-  linear_attention_freq: null
-  linear_attention_type: null
-  linear_conv_kernel_dim: null
-  linear_key_head_dim: null
-  linear_num_key_heads: null
-  linear_num_value_heads: null
-  linear_value_head_dim: null
-  log_max_attention_logit: false
-  make_vocab_size_divisible_by: 16
-  mamba_head_dim: 64
-  mamba_num_groups: 8
-  mamba_num_heads: null
-  mamba_state_dim: 128
-  masked_softmax_fusion: true
-  max_position_embeddings: 40960
-  memory_efficient_layer_norm: false
-  microbatch_group_size_per_vp_stage: 1
-  min_offloaded_tensor_size: 1048576
-  mlp_chunks_for_prefill: 1
-  moe_apply_probs_on_input: false
-  moe_aux_loss_coeff: 0.0
-  moe_deepep_num_sms: 20
-  moe_enable_deepep: false
-  moe_expert_capacity_factor: null
-  moe_extended_tp: false
-  moe_ffn_hidden_size: null
-  moe_flex_dispatcher_backend: deepep
-  moe_grouped_gemm: false
-  moe_hybridep_num_sms: 16
-  moe_input_jitter_eps: null
-  moe_layer_freq: 1
-  moe_layer_recompute: false
-  moe_pad_expert_input_to_capacity: false
-  moe_per_layer_logging: false
-  moe_permute_fusion: false
-  moe_router_bias_update_rate: 0.0
-  moe_router_dtype: fp64
-  moe_router_enable_expert_bias: false
-  moe_router_force_load_balancing: false
-  moe_router_fusion: false
-  moe_router_group_topk: null
-  moe_router_load_balancing_type: none
-  moe_router_num_groups: null
-  moe_router_padding_for_fp8: false
-  moe_router_padding_for_quantization: false
-  moe_router_pre_softmax: false
-  moe_router_score_function: softmax
-  moe_router_topk: 2
-  moe_router_topk_limited_devices: null
-  moe_router_topk_scaling_factor: null
-  moe_shared_expert_gate: false
-  moe_shared_expert_intermediate_size: null
-  moe_shared_expert_overlap: false
-  moe_token_dispatcher_type: allgather
-  moe_token_drop_policy: probs
-  moe_token_dropping: false
-  moe_use_legacy_grouped_gemm: false
-  moe_z_loss_coeff: null
-  mrope_section: null
-  mtp_enabled: false
-  mtp_loss_scaling_factor: null
-  mtp_num_layers: null
-  mtp_standalone: false
-  multi_latent_attention: false
-  no_rope_freq: null
-  no_sync_func:
-    _call_: false
-    _target_: megatron.core.distributed.distributed_data_parallel.DistributedDataParallel.no_sync
-  normalization: RMSNorm
-  num_attention_heads: 32
-  num_layers: 36
-  num_layers_at_end_in_bf16: 1
-  num_layers_at_start_in_bf16: 1
-  num_layers_in_first_pipeline_stage: null
-  num_layers_in_last_pipeline_stage: null
-  num_microbatches_with_partial_activation_checkpoints: null
-  num_moe_experts: null
-  num_query_groups: 8
-  offload_modules: null
-  output_layer_init_method:
-    _args_: []
-    _partial_: true
-    _target_: torch.nn.init.normal_
-    mean: 0.0
-    std: 0.0023570226039551587
-  overlap_moe_expert_parallel_comm: false
-  overlap_p2p_comm: false
-  overlap_p2p_comm_warmup_flush: false
-  parallel_output: true
-  param_sync_func: null
-  params_dtype:
-    _call_: false
-    _target_: torch.bfloat16
-  perform_initialization: true
-  persist_layer_norm: false
-  pipeline_dtype:
-    _call_: false
-    _target_: torch.bfloat16
-  pipeline_model_parallel_comm_backend: null
-  pipeline_model_parallel_layout: null
-  pipeline_model_parallel_size: 1
-  position_embedding_type: rope
-  qk_clip: false
-  qk_clip_alpha: 0.5
-  qk_clip_threshold: 100
-  qk_layernorm: true
-  quant_recipe: null
-  recompute_granularity: full
-  recompute_method: uniform
-  recompute_modules:
-  - core_attn
-  recompute_num_layers: 1
-  restore_modelopt_state: false
-  rotary_base: 5000000
-  rotary_interleaved: false
-  rotary_percent: 1.0
-  scatter_embedding_sequence_parallel: true
-  seq_len_interpolation_factor: null
-  seq_length: 262144
-  sequence_parallel: false
-  share_embeddings_and_output_weights: true
-  should_pad_vocab: false
-  softmax_scale: null
-  softmax_type: vanilla
-  symmetric_ar_type: null
-  tensor_model_parallel_size: 1
-  test_mode: false
-  timers: null
-  tp_comm_atomic_ag: false
-  tp_comm_atomic_rs: false
-  tp_comm_bootstrap_backend: nccl
-  tp_comm_bulk_dgrad: true
-  tp_comm_bulk_wgrad: true
-  tp_comm_overlap: false
-  tp_comm_overlap_ag: true
-  tp_comm_overlap_cfg: null
-  tp_comm_overlap_disable_fc1: false
-  tp_comm_overlap_disable_qkv: false
-  tp_comm_overlap_rs: true
-  tp_comm_overlap_rs_dgrad: false
-  tp_comm_split_ag: true
-  tp_comm_split_rs: true
-  tp_only_amax_red: false
-  transformer_impl: transformer_engine
-  transformer_layer_spec:
-    _call_: false
-    _target_: megatron.bridge.models.gpt_provider.default_layer_spec
-  use_cpu_initialization: false
-  use_fused_weighted_squared_relu: false
-  use_kitchen: false
-  use_mamba_mem_eff_path: true
-  use_ring_exchange_p2p: false
-  use_te_activation_func: false
-  use_te_rng_tracker: false
-  use_transformer_engine_full_layer_spec: false
-  use_transformer_engine_op_fuser: false
-  variable_seq_lengths: false
-  virtual_pipeline_model_parallel_size: null
-  vocab_size: 16
-  wgrad_deferral_limit: 0
-  window_attn_skip_freq: null
-  window_size: null
-nvrx_straggler: null
-optimizer:
-  _target_: megatron.bridge.training.config.OptimizerConfig
-  adam_beta1: 0.9
-  adam_beta2: 0.98
-  adam_eps: 1.0e-08
-  barrier_with_L1_time: false
-  bf16: true
-  clip_grad: 0.5
-  config_logger_dir: ''
-  decoupled_lr: null
-  decoupled_min_lr: null
-  decoupled_weight_decay: true
-  exp_avg_dtype:
-    _call_: false
-    _target_: torch.float32
-  exp_avg_sq_dtype:
-    _call_: false
-    _target_: torch.float32
-  fp16: false
-  fp8_recipe: null
-  hysteresis: 2
-  initial_loss_scale: 4294967296
-  log_num_zeros_in_grad: false
-  loss_scale: null
-  loss_scale_window: 1000
-  lr: 0.0001
-  main_grads_dtype:
-    _call_: false
-    _target_: torch.float32
-  main_params_dtype:
-    _call_: false
-    _target_: torch.float32
-  min_loss_scale: 1.0
-  min_lr: 1.0e-07
-  muon_extra_scale_factor: 1.0
-  muon_fp32_matmul_prec: medium
-  muon_momentum: 0.95
-  muon_num_ns_steps: 5
-  muon_scale_mode: spectral
-  muon_split_qkv: true
-  muon_tp_mode: blockwise
-  muon_use_nesterov: false
-  optimizer: adam
-  optimizer_cpu_offload: false
-  optimizer_offload_fraction: 0.0
-  overlap_cpu_optimizer_d2h_h2d: false
-  overlap_param_gather: false
-  overlap_param_gather_with_optimizer_step: false
-  params_dtype: bfloat16
-  pin_cpu_grads: true
-  pin_cpu_params: true
-  reuse_grad_buf_for_mxfp8_param_ag: false
-  sgd_momentum: 0.9
-  store_param_remainders: true
-  timers: null
-  use_distributed_optimizer: true
-  use_precision_aware_optimizer: false
-  use_torch_optimizer_for_cpu_offload: false
-  weight_decay: 0.1
-peft: null
-profiling:
-  _target_: megatron.bridge.training.config.ProfilingConfig
-  memory_snapshot_path: snapshot.pickle
-  nvtx_ranges: false
-  profile_ranks:
-  - 0
-  profile_step_end: 12
-  profile_step_start: 10
-  record_memory_history: false
-  record_shapes: false
-  use_nsys_profiler: false
-  use_pytorch_profiler: false
-rerun_state_machine:
-  _target_: megatron.bridge.training.config.RerunStateMachineConfig
-  check_for_nan_in_loss: true
-  check_for_spiky_loss: false
-  error_injection_rate: 0
-  error_injection_type: transient_error
-  rerun_mode: disabled
-rng:
-  _target_: megatron.bridge.training.config.RNGConfig
-  data_parallel_random_init: false
-  inference_rng_tracker: false
-  seed: 1234
-  te_rng_tracker: false
-scheduler:
-  _target_: megatron.bridge.training.config.SchedulerConfig
-  end_weight_decay: 0.1
-  lr_decay_iters: 12716
-  lr_decay_samples: null
-  lr_decay_steps: 3255296
-  lr_decay_style: linear
-  lr_warmup_fraction: null
-  lr_warmup_init: 1.0e-06
-  lr_warmup_iters: 200
-  lr_warmup_samples: 0
-  lr_warmup_steps: 51200
-  lr_wsd_decay_iters: null
-  lr_wsd_decay_samples: null
-  lr_wsd_decay_style: exponential
-  no_weight_decay_cond_type: null
-  override_opt_param_scheduler: false
-  start_weight_decay: 0.1
-  use_checkpoint_opt_param_scheduler: false
-  wd_incr_steps: 1528832
-  weight_decay_incr_style: constant
-  wsd_decay_steps: null
-straggler: null
-tensor_inspect: null
-tokenizer:
-  _target_: megatron.bridge.training.tokenizers.config.TokenizerConfig
-  hf_tokenizer_kwargs: {}
-  image_tag_type: null
-  merge_file: null
-  special_tokens: null
-  tiktoken_num_special_tokens: 1000
-  tiktoken_pattern: null
-  tiktoken_special_tokens: null
-  tokenizer_model: ./models/Qwen-NVARC
-  tokenizer_prompt_format: null
-  tokenizer_type: HuggingFaceTokenizer
-  vocab_extra_ids: 0
-  vocab_file: null
-  vocab_size: null
-train:
-  _target_: megatron.bridge.training.config.TrainingConfig
-  check_weight_hash_across_dp_replicas_interval: null
-  decrease_batch_size_if_needed: false
-  empty_unused_memory_level: 0
-  eval_interval: 1000
-  eval_iters: 100
-  exit_duration_in_mins: null
-  exit_interval: null
-  exit_signal:
-    _args_:
-    - 15
-    _call_: true
-    _target_: signal.Signals
-  exit_signal_handler: false
-  exit_signal_handler_for_dataloader: false
-  global_batch_size: 256
-  iterations_to_skip: []
-  manual_gc: false
-  manual_gc_eval: true
-  manual_gc_interval: 0
-  micro_batch_size: 1
-  rampup_batch_size: null
-  skip_train: false
-  train_iters: 5972
-  train_samples: null
-  train_sync_interval: null

step_5600/policy/weights/iter_0000000/train_state.pt DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9077193280ecd59384d5ae94d97a7943638810502f2ce08a7617aeb20b3586d2
-size 3461

step_5600/policy/weights/latest_checkpointed_iteration.txt DELETED Viewed

	@@ -1 +0,0 @@
1	- 0

step_5600/policy/weights/latest_train_state.pt DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9077193280ecd59384d5ae94d97a7943638810502f2ce08a7617aeb20b3586d2
-size 3461

step_5600/train_dataloader.pt DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:abe0ee8c91d5ba1b614239817486eae38492eb4f5f311f8b71c6b33bc2151b2b
-size 7336

step_5600/training_info.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"epoch": 0, "step": 5600, "total_steps": 5600, "consumed_samples": 1433600, "total_valid_tokens": 1626494740.0, "val:val_loss": 0.14774028956890106}