diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..57fd82c08a4a989a60fbcbf53858dc95fb60224c 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,13 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +step_5400/policy/weights/iter_0000000/.metadata filter=lfs diff=lfs merge=lfs -text +step_5972/policy/weights/iter_0000000/.metadata filter=lfs diff=lfs merge=lfs -text +step_5800/policy/weights/iter_0000000/.metadata filter=lfs diff=lfs merge=lfs -text +step_5600/policy/weights/iter_0000000/.metadata filter=lfs diff=lfs merge=lfs -text +step_5400/policy/weights/iter_0000000/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +step_5800/policy/weights/iter_0000000/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +step_5972/policy/weights/iter_0000000/__0_1.distcp filter=lfs diff=lfs merge=lfs -text +step_5400/policy/weights/iter_0000000/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +step_5972/policy/weights/iter_0000000/__1_1.distcp filter=lfs diff=lfs merge=lfs -text +step_5800/policy/weights/iter_0000000/__1_0.distcp filter=lfs diff=lfs merge=lfs -text diff --git a/step_5400/config.yaml b/step_5400/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e935c0bf4a22e0bf18c2273893d86b8c3c2798df --- /dev/null +++ b/step_5400/config.yaml @@ -0,0 +1,207 @@ +checkpointing: + checkpoint_dir: results/qwen3_4b_sft + checkpoint_must_save_by: null + enabled: true + higher_is_better: false + keep_top_k: 3 + metric_name: val:val_loss + save_period: 200 +cluster: + gpus_per_node: 2 + num_nodes: 1 +data: + num_workers: 4 + shuffle: true + train_dataset_path: + - ./data/hones + val_dataset_path: ./data/arc2_evaluation6 +logger: + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 + log_dir: logs/exp_019 + mlflow_enabled: false + monitor_gpus: false + swanlab_enabled: false + tensorboard_enabled: false + wandb: + name: qwen3_4b_sft + project: arc2 + wandb_enabled: true +policy: + activation_checkpointing_enabled: false + attn_implementation: flash_attention_2 + dtensor_cfg: + enabled: false + dynamic_batching: + enabled: false + fsdp_offload_enabled: false + make_sequence_length_divisible_by: 64 + max_grad_norm: null + megatron_cfg: + activation_checkpointing: true + apply_rope_fusion: true + bias_activation_fusion: false + context_parallel_size: 2 + distributed_data_parallel_config: + average_in_collective: true + data_parallel_sharding_strategy: optim_grads_params + grad_reduce_in_fp32: true + overlap_grad_reduce: true + overlap_param_gather: true + empty_unused_memory_level: 1 + enabled: true + env_vars: + AWS_OFI_NCCL_VERSION: 1.14.0 + BASH_ENV: /etc/bash.bashrc + CAL_VERSION: 0.4.4.50 + CUBLASMP_VERSION: 0.4.0.789 + CUBLAS_VERSION: 12.9.0.13 + CUDA_ARCH_LIST: 7.5 8.0 8.6 9.0 10.0 12.0 + CUDA_DRIVER_VERSION: 575.51.03 + CUDA_VERSION: 12.9.0.043 + CUDA_VISIBLE_DEVICES: 6,7 + CUDNN_FRONTEND_VERSION: 1.11.0 + CUDNN_VERSION: 9.10.1.4 + CUFFT_VERSION: 11.4.0.6 + CUFILE_VERSION: 1.14.0.30 + CURAND_VERSION: 10.3.10.19 + CUSOLVER_VERSION: 11.7.4.40 + CUSPARSELT_VERSION: 0.7.1.0 + CUSPARSE_VERSION: 12.5.9.5 + DALI_BUILD: '' + DALI_URL_SUFFIX: '120' + DALI_VERSION: 1.49.0 + EFA_VERSION: 1.38.1 + ENV: /etc/shinit_v2 + GDRCOPY_VERSION: 2.4.4 + HOME: /root + HOSTNAME: e6ad2ac15863 + HPCX_VERSION: '2.23' + KMP_DUPLICATE_LIB_OK: 'True' + KMP_INIT_AT_FORK: 'FALSE' + LC_CTYPE: C.UTF-8 + LD_LIBRARY_PATH: /usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 + LESSCLOSE: /usr/bin/lesspipe %s %s + LESSOPEN: '| /usr/bin/lesspipe %s' + LIBRARY_PATH: '/usr/local/cuda/lib64/stubs:' + LS_COLORS: 'rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=00:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.avif=01;35:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.webp=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:*~=00;90:*#=00;90:*.bak=00;90:*.crdownload=00;90:*.dpkg-dist=00;90:*.dpkg-new=00;90:*.dpkg-old=00;90:*.dpkg-tmp=00;90:*.old=00;90:*.orig=00;90:*.part=00;90:*.rej=00;90:*.rpmnew=00;90:*.rpmorig=00;90:*.rpmsave=00;90:*.swp=00;90:*.tmp=00;90:*.ucf-dist=00;90:*.ucf-new=00;90:*.ucf-old=00;90:' + MODEL_OPT_VERSION: 0.27.1 + MOFED_VERSION: 5.4-rdmacore50.0 + NCCL_NET_PLUGIN: aws-ofi + NCCL_TUNER_PLUGIN: aws-ofi + NCCL_VERSION: 2.26.5 + NEMO_RL_COMMIT: 90ecaaccdf66fb92f814b2325db81a0105ed933a + NEMO_RL_VENV_DIR: /opt/ray_venvs + NPP_VERSION: 12.4.0.27 + NRL_CONTAINER: '1' + NRL_MEGATRON_CHECKPOINT_DIR: ./results/megatron + NSIGHT_COMPUTE_VERSION: 2025.2.0.11 + NSIGHT_SYSTEMS_VERSION: 2025.3.1.90 + NVIDIA_BUILD_ID: '244212578' + NVIDIA_BUILD_REF: 6ba29f15920b12095387b8c82cea05ac3d6d9732 + NVIDIA_DRIVER_CAPABILITIES: compute,utility,video + NVIDIA_PRODUCT_NAME: CUDA + NVIDIA_REQUIRE_CUDA: cuda>=9.0 + NVIDIA_REQUIRE_JETPACK_HOST_MOUNTS: '' + NVIDIA_VISIBLE_DEVICES: all + NVJITLINK_VERSION: 12.9.41 + NVJPEG_VERSION: 12.4.0.16 + NVSHMEM_VERSION: 3.2.5 + OLDPWD: /workspace + OMPI_MCA_coll_hcoll_enable: '0' + OPAL_PREFIX: /opt/hpcx/ompi + OPENMPI_VERSION: 4.1.7 + OPENUCX_VERSION: 1.19.0 + PATH: /opt/nemo_rl_venv/bin:/opt/nemo_rl_venv/bin:/opt/nemo_rl_venv/bin:/root/.local/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/mpi/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/ucx/bin:/opt/amazon/efa/bin + POLYGRAPHY_VERSION: 0.49.20 + PWD: /workspace/ARChitects + PYTHONBREAKPOINT: ray.util.rpdb._driver_set_trace + PYTHONPATH: '/workspace/NVARC/ARChitects/nemo-rl:' + PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True + RAY_CLIENT_MODE: '0' + RAY_ENABLE_UV_RUN_RUNTIME_ENV: '0' + RAY_USAGE_STATS_ENABLED: '0' + RDMACORE_VERSION: '50.0' + SHELL: /bin/bash + SHLVL: '2' + SWANLAB_API_HOST: https://api.swanlab.cn/api + SWANLAB_RUNTIME: user + SWANLAB_WEB_HOST: https://swanlab.cn + TERM: xterm + TORCH_CUDA_ARCH_LIST: '9.0' + TRANSFORMER_ENGINE_VERSION: '2.3' + TRTOSS_VERSION: '' + TRT_VERSION: 10.10.0.31 + UV: /root/.local/bin/uv + UV_LINK_MODE: copy + UV_PROJECT_ENVIRONMENT: /opt/nemo_rl_venv + UV_RUN_RECURSION_DEPTH: '1' + VIRTUAL_ENV: /opt/nemo_rl_venv + VIRTUAL_ENV_PROMPT: nemo-rl + WANDB_SERVICE: 3-239782-unix-/tmp/wandb-239782-251940-2530276073/socket + _: /root/.local/bin/uv + _CUDA_COMPAT_PATH: /usr/local/cuda/compat + _CUDA_COMPAT_STATUS: System has unsupported display driver / cuda driver combination + (CUDA_ERROR_SYSTEM_DRIVER_MISMATCH) cuInit()=803 + _MLFLOW_TELEMETRY_SESSION_ID: 8ac94e5a44a94a62a2cf3302ff4fa0c9 + expert_model_parallel_size: 1 + expert_tensor_parallel_size: 1 + freeze_moe_router: true + moe_permute_fusion: false + moe_router_bias_update_rate: 0.0 + moe_router_dtype: fp64 + moe_router_load_balancing_type: none + num_layers_in_first_pipeline_stage: null + num_layers_in_last_pipeline_stage: null + optimizer: + adam_beta1: 0.9 + adam_beta2: 0.98 + adam_eps: 1.0e-08 + bf16: true + clip_grad: 0.5 + fp16: false + lr: 0.0001 + min_lr: 1.0e-07 + optimizer: adam + optimizer_cpu_offload: false + optimizer_offload_fraction: 0.0 + params_dtype: bfloat16 + sgd_momentum: 0.9 + use_distributed_optimizer: true + use_precision_aware_optimizer: false + weight_decay: 0.1 + pipeline_dtype: bfloat16 + pipeline_model_parallel_size: 1 + scheduler: + end_weight_decay: 0.1 + lr_decay_iters: 12716 + lr_decay_style: linear + lr_warmup_init: 1.0e-06 + lr_warmup_iters: 200 + start_weight_decay: 0.1 + weight_decay_incr_style: constant + sequence_parallel: false + tensor_model_parallel_size: 1 + train_iters: 5972 + model_name: ./models/Qwen-NVARC + offload_optimizer_for_logprob: false + precision: bfloat16 + sequence_packing: + algorithm: modified_first_fit_decreasing + enabled: true + sequence_length_round: 64 + train_mb_tokens: 128000 + tokenizer: + name: ./models/Qwen-NVARC + train_global_batch_size: 256 + train_micro_batch_size: 1 +sft: + max_num_epochs: 1 + max_num_steps: 6400 + seed: 24 + val_at_start: true + val_batches: 200 + val_global_batch_size: 256 + val_micro_batch_size: 1 + val_period: 200 diff --git a/step_5400/policy/weights/iter_0000000/.metadata b/step_5400/policy/weights/iter_0000000/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..5560dd8cf2ef3adec62d8b54731e2a75477e02aa --- /dev/null +++ b/step_5400/policy/weights/iter_0000000/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:601958148c0276510ee83ae2c089910f685c2aa6fde4b6f5e668b28ed06ec567 +size 329201 diff --git a/step_5400/policy/weights/iter_0000000/__0_0.distcp b/step_5400/policy/weights/iter_0000000/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..a372337efa27e86a15113948b25b74150cbc925f --- /dev/null +++ b/step_5400/policy/weights/iter_0000000/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f4f63a1df595166115fa2fd03a1601a3ae7b6c72151956a0f966332b260176d +size 12718332319 diff --git a/step_5400/policy/weights/iter_0000000/__1_0.distcp b/step_5400/policy/weights/iter_0000000/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..f7b059baec769d609ae062805aaaf50a733c2d9f --- /dev/null +++ b/step_5400/policy/weights/iter_0000000/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdbfe2d6c54d823e7ef9c6bdfb156183fa5d437043a001c83847514272046f8b +size 12717813616 diff --git a/step_5400/policy/weights/iter_0000000/common.pt b/step_5400/policy/weights/iter_0000000/common.pt new file mode 100644 index 0000000000000000000000000000000000000000..1adb7a4061285240b80563d6716d7a50d8ad6f76 --- /dev/null +++ b/step_5400/policy/weights/iter_0000000/common.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cf17a4bbf5fb940ff8d1e669f26a4e277411e9796b4920f5cd867e4401db145 +size 1767 diff --git a/step_5400/policy/weights/iter_0000000/metadata.json b/step_5400/policy/weights/iter_0000000/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..c9dbde0bcf4d2d993122dcc7d6bcb1eef8b6fb77 --- /dev/null +++ b/step_5400/policy/weights/iter_0000000/metadata.json @@ -0,0 +1 @@ +{"sharded_backend": "torch_dist", "sharded_backend_version": 1, "common_backend": "torch", "common_backend_version": 1} \ No newline at end of file diff --git a/step_5400/policy/weights/iter_0000000/modelopt_run_config.yaml b/step_5400/policy/weights/iter_0000000/modelopt_run_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..44ee391cb75b24b9ae1693ad28ca2c1c6a0b1f25 --- /dev/null +++ b/step_5400/policy/weights/iter_0000000/modelopt_run_config.yaml @@ -0,0 +1,203 @@ +activation_func: +activation_func_clamp_value: None +add_bias_linear: false +add_qkv_bias: false +apply_query_key_layer_scaling: false +apply_residual_connection_post_layernorm: false +apply_rope_fusion: true +attention_backend: AttnBackend.auto +attention_dropout: '0.0' +attention_output_gate: false +attention_softmax_in_fp32: false +autocast_dtype: torch.bfloat16 +barrier_with_L1_time: true +bf16: true +bias_activation_fusion: false +bias_dropout_fusion: false +calculate_per_token_loss: true +clone_scatter_output_in_embedding: true +config_logger_dir: '' +cross_entropy_fusion_impl: native +cross_entropy_loss_fusion: true +defer_embedding_wgrad_compute: false +delay_wgrad_compute: false +deterministic_mode: false +disable_bf16_reduced_precision_matmul: false +disable_parameter_transpose_cache: false +distribute_saved_activations: None +enable_autocast: false +fallback_to_eager_attn: false +ffn_hidden_size: 9728 +finalize_model_grads_func: functools.partial(, + pg_collection=None) +fine_grained_activation_offloading: false +first_last_layers_bf16: false +flash_decode: false +fp16: false +fp16_lm_cross_entropy: false +fp32_residual_connection: false +fused_single_qkv_rope: false +gated_linear_unit: true +generation_config: None +glu_linear_offset: '0.0' +grad_scale_func: > +grad_sync_func: "" +gradient_accumulation_fusion: false +hetereogenous_dist_checkpoint: false +heterogeneous_block_specs: false +hf_model_id: ./models/Qwen-NVARC +hidden_dropout: '0.0' +hidden_size: 2560 +is_hybrid_model: false +kv_channels: 128 +layernorm_epsilon: 1e-06 +layernorm_zero_centered_gamma: false +linear_attention_freq: None +linear_attention_type: None +linear_conv_kernel_dim: None +linear_key_head_dim: None +linear_num_key_heads: None +linear_num_value_heads: None +linear_value_head_dim: None +log_max_attention_logit: false +make_vocab_size_divisible_by: 16 +mamba_head_dim: 64 +mamba_num_groups: 8 +mamba_num_heads: None +mamba_state_dim: 128 +masked_softmax_fusion: true +max_position_embeddings: 40960 +memory_efficient_layer_norm: false +min_offloaded_tensor_size: 1048576 +mlp_chunks_for_prefill: 1 +moe_apply_probs_on_input: false +moe_aux_loss_coeff: '0.0' +moe_deepep_num_sms: 20 +moe_enable_deepep: false +moe_expert_capacity_factor: None +moe_extended_tp: false +moe_ffn_hidden_size: None +moe_flex_dispatcher_backend: deepep +moe_grouped_gemm: false +moe_hybridep_num_sms: 16 +moe_input_jitter_eps: None +moe_layer_freq: 1 +moe_pad_expert_input_to_capacity: false +moe_per_layer_logging: false +moe_permute_fusion: false +moe_router_bias_update_rate: '0.0' +moe_router_dtype: fp64 +moe_router_enable_expert_bias: false +moe_router_force_load_balancing: false +moe_router_fusion: false +moe_router_group_topk: None +moe_router_load_balancing_type: none +moe_router_num_groups: None +moe_router_padding_for_quantization: false +moe_router_pre_softmax: false +moe_router_score_function: softmax +moe_router_topk: 2 +moe_router_topk_limited_devices: None +moe_router_topk_scaling_factor: None +moe_shared_expert_gate: false +moe_shared_expert_intermediate_size: None +moe_shared_expert_overlap: false +moe_token_dispatcher_type: allgather +moe_token_drop_policy: probs +moe_token_dropping: false +moe_use_legacy_grouped_gemm: false +moe_z_loss_coeff: None +mrope_section: None +multi_latent_attention: false +no_rope_freq: None +no_sync_func: "" +normalization: RMSNorm +num_attention_heads: 32 +num_layers: 36 +num_layers_at_end_in_bf16: 1 +num_layers_at_start_in_bf16: 1 +num_moe_experts: None +num_query_groups: 8 +nvidia_modelopt_version: 0.39.0 +offload_modules: None +param_sync_func: None +params_dtype: torch.bfloat16 +perform_initialization: true +persist_layer_norm: false +position_embedding_type: rope +qk_clip: false +qk_clip_alpha: '0.5' +qk_clip_threshold: 100 +qk_layernorm: true +quant_recipe: None +restore_modelopt_state: false +rotary_base: 5000000 +rotary_interleaved: false +rotary_percent: '1.0' +seq_len_interpolation_factor: None +seq_length: 262144 +share_embeddings_and_output_weights: true +should_pad_vocab: false +softmax_scale: None +softmax_type: vanilla +symmetric_ar_type: None +test_mode: false +timers: None +transformer_impl: transformer_engine +transformer_layer_spec: +use_fused_weighted_squared_relu: false +use_kitchen: false +use_mamba_mem_eff_path: true +use_ring_exchange_p2p: false +use_te_activation_func: false +use_te_rng_tracker: false +use_transformer_engine_full_layer_spec: false +use_transformer_engine_op_fuser: false +variable_seq_lengths: false +vocab_size: 16 +wgrad_deferral_limit: 0 +window_attn_skip_freq: None +window_size: None diff --git a/step_5400/policy/weights/iter_0000000/run_config.yaml b/step_5400/policy/weights/iter_0000000/run_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6ff566d10536be12791e5150693b7a0c5145152b --- /dev/null +++ b/step_5400/policy/weights/iter_0000000/run_config.yaml @@ -0,0 +1,564 @@ +_target_: megatron.bridge.training.config.ConfigContainer +checkpoint: + _target_: megatron.bridge.training.config.CheckpointConfig + async_save: false + ckpt_assume_constant_structure: false + ckpt_convert_format: null + ckpt_convert_save: null + ckpt_format: torch_dist + ckpt_step: null + dist_ckpt_optim_fully_reshardable: false + dist_ckpt_save_pre_mcore_014: false + dist_ckpt_strictness: assume_ok_unexpected + distrib_optim_fully_reshardable_mem_efficient: false + exit_on_missing_checkpoint: false + finetune: true + fully_parallel_load: true + fully_parallel_save: true + load: null + load_main_params_from_ckpt: false + load_optim: true + load_rng: false + most_recent_k: -1 + non_persistent_ckpt_type: null + non_persistent_global_ckpt_dir: null + non_persistent_local_ckpt_algo: fully_parallel + non_persistent_local_ckpt_dir: null + non_persistent_save_interval: null + pretrained_checkpoint: ./results/megatron/model_._models_Qwen-NVARC + replication: false + replication_factor: 2 + replication_jump: null + save: /workspace/ARChitects/results/qwen3_4b_sft/tmp_step_5400/policy/weights + save_interval: 100 + save_optim: true + save_rng: true + save_tokenizer_assets: true + strict_fsdp_dtensor_load: false + use_checkpoint_args: false + use_persistent_ckpt_worker: true +comm_overlap: null +dataset: null +ddp: + _target_: megatron.bridge.training.config.DistributedDataParallelConfig + align_param_gather: false + average_in_collective: false + bucket_size: 40000000 + check_for_large_grads: false + check_for_nan_in_grad: true + data_parallel_sharding_strategy: optim_grads_params + delay_wgrad_compute: false + disable_symmetric_registration: false + fp8_param_gather: false + fsdp_double_buffer: false + grad_reduce_in_fp32: true + gradient_reduce_div_fusion: true + keep_fp8_transpose_cache: false + nccl_ub: false + num_distributed_optimizer_instances: 1 + outer_dp_sharding_strategy: no_shard + overlap_grad_reduce: true + overlap_param_gather: true + pad_buckets_for_high_nccl_busbw: false + preserve_fp32_weights: true + reduce_scatter_with_fp32_accumulation: false + reuse_grad_buf_for_mxfp8_param_ag: false + suggested_communication_unit_size: null + use_custom_fsdp: false + use_distributed_optimizer: true + use_megatron_fsdp: false +dist: + _target_: megatron.bridge.training.config.DistributedInitConfig + align_grad_reduce: true + disable_jit_fuser: false + distributed_backend: nccl + distributed_timeout_minutes: 10 + distributed_timeout_seconds_after_init: null + enable_megatron_core_experimental: false + external_gpu_device_mapping: true + high_priority_stream_groups: null + lazy_init: false + local_rank: 0 + nccl_communicator_config_path: null + sharp_enabled_group: null + use_gloo_process_groups: true + use_megatron_fsdp: false + use_sharp: false + use_torch_fsdp2: false + use_tp_pp_dp_mapping: false +ft: null +inprocess_restart: null +logger: + _target_: megatron.bridge.training.config.LoggerConfig + filter_warnings: true + log_energy: false + log_interval: 100 + log_l2_norm_grad_to_tensorboard: false + log_loss_scale_to_tensorboard: true + log_memory_to_tensorboard: false + log_params_norm: false + log_progress: false + log_runtime_to_tensorboard: false + log_throughput: false + log_throughput_to_tensorboard: false + log_timers_to_tensorboard: false + log_validation_ppl_to_tensorboard: false + log_world_size_to_tensorboard: false + logging_level: 0 + memory_keys: null + modules_to_filter: null + runtime_time_unit: hours + save_config_filepath: null + set_level_for_all_loggers: false + tensorboard_dir: null + tensorboard_log_interval: 1 + tensorboard_queue_size: 1000 + throughput_window_size: 100 + timing_log_level: 0 + timing_log_option: minmax + wandb_entity: null + wandb_exp_name: null + wandb_project: null + wandb_save_dir: null +mixed_precision: null +model: + _target_: megatron.bridge.models.qwen.qwen_provider.Qwen3ModelProvider + account_for_embedding_in_pipeline_split: false + account_for_loss_in_pipeline_split: false + activation_func: + _call_: false + _target_: torch.nn.functional.silu + activation_func_clamp_value: null + activation_func_fp8_input_store: false + add_bias_linear: false + add_qkv_bias: false + apply_query_key_layer_scaling: false + apply_residual_connection_post_layernorm: false + apply_rope_fusion: true + async_tensor_model_parallel_allreduce: false + attention_backend: + _args_: + - 5 + _call_: true + _target_: megatron.core.transformer.enums.AttnBackend + attention_dropout: 0.0 + attention_output_gate: false + attention_softmax_in_fp32: false + autocast_dtype: + _call_: false + _target_: torch.bfloat16 + barrier_with_L1_time: true + batch_p2p_comm: true + batch_p2p_sync: true + bf16: true + bias_activation_fusion: false + bias_dropout_fusion: false + calculate_per_token_loss: true + clone_scatter_output_in_embedding: true + config_logger_dir: '' + context_parallel_size: 2 + cp_comm_type: null + cpu_offloading: false + cpu_offloading_activations: true + cpu_offloading_double_buffering: false + cpu_offloading_num_layers: 0 + cpu_offloading_weights: false + cross_entropy_fusion_impl: native + cross_entropy_loss_fusion: true + cuda_graph_impl: none + cuda_graph_retain_backward_graph: false + cuda_graph_scope: [] + cuda_graph_use_single_mempool: false + cuda_graph_warmup_steps: 3 + deallocate_pipeline_outputs: true + defer_embedding_wgrad_compute: false + delay_wgrad_compute: false + deterministic_mode: false + disable_bf16_reduced_precision_matmul: false + disable_parameter_transpose_cache: false + distribute_saved_activations: null + embedding_init_method: + _args_: [] + _partial_: true + _target_: torch.nn.init.normal_ + mean: 0.0 + std: 0.02 + embedding_init_method_std: 0.02 + enable_autocast: false + enable_cuda_graph: false + expert_model_parallel_size: 1 + expert_tensor_parallel_size: 1 + external_cuda_graph: false + fallback_to_eager_attn: false + ffn_hidden_size: 9728 + finalize_model_grads_func: + _args_: [] + _partial_: true + _target_: megatron.core.distributed.finalize_model_grads.finalize_model_grads + pg_collection: null + fine_grained_activation_offloading: false + first_last_layers_bf16: false + flash_decode: false + fp16: false + fp16_lm_cross_entropy: false + fp32_residual_connection: false + fp4: null + fp4_param: false + fp4_quantizer_factory: null + fp4_recipe: nvfp4 + fp8: null + fp8_amax_compute_algo: most_recent + fp8_amax_history_len: 1 + fp8_dot_product_attention: false + fp8_interval: 1 + fp8_margin: 0 + fp8_multi_head_attention: false + fp8_param: false + fp8_quantizer_factory: null + fp8_recipe: delayed + fp8_wgrad: true + fused_single_qkv_rope: false + gated_linear_unit: true + generation_config: null + glu_linear_offset: 0.0 + grad_scale_func: + _call_: false + _target_: megatron.core.optimizer.optimizer.MegatronOptimizer.scale_loss + grad_sync_func: + _call_: false + _target_: megatron.core.distributed.distributed_data_parallel.DistributedDataParallel.start_grad_sync + gradient_accumulation_fusion: false + hetereogenous_dist_checkpoint: false + heterogeneous_block_specs: false + hf_model_id: ./models/Qwen-NVARC + hidden_dropout: 0.0 + hidden_size: 2560 + hierarchical_context_parallel_sizes: null + inference_rng_tracker: false + inference_sampling_seed: 42 + init_method: + _args_: [] + _partial_: true + _target_: torch.nn.init.normal_ + mean: 0.0 + std: 0.02 + init_method_std: 0.02 + init_model_with_meta_device: false + is_hybrid_model: false + kv_channels: 128 + layernorm_epsilon: 1.0e-06 + layernorm_zero_centered_gamma: false + linear_attention_freq: null + linear_attention_type: null + linear_conv_kernel_dim: null + linear_key_head_dim: null + linear_num_key_heads: null + linear_num_value_heads: null + linear_value_head_dim: null + log_max_attention_logit: false + make_vocab_size_divisible_by: 16 + mamba_head_dim: 64 + mamba_num_groups: 8 + mamba_num_heads: null + mamba_state_dim: 128 + masked_softmax_fusion: true + max_position_embeddings: 40960 + memory_efficient_layer_norm: false + microbatch_group_size_per_vp_stage: 1 + min_offloaded_tensor_size: 1048576 + mlp_chunks_for_prefill: 1 + moe_apply_probs_on_input: false + moe_aux_loss_coeff: 0.0 + moe_deepep_num_sms: 20 + moe_enable_deepep: false + moe_expert_capacity_factor: null + moe_extended_tp: false + moe_ffn_hidden_size: null + moe_flex_dispatcher_backend: deepep + moe_grouped_gemm: false + moe_hybridep_num_sms: 16 + moe_input_jitter_eps: null + moe_layer_freq: 1 + moe_layer_recompute: false + moe_pad_expert_input_to_capacity: false + moe_per_layer_logging: false + moe_permute_fusion: false + moe_router_bias_update_rate: 0.0 + moe_router_dtype: fp64 + moe_router_enable_expert_bias: false + moe_router_force_load_balancing: false + moe_router_fusion: false + moe_router_group_topk: null + moe_router_load_balancing_type: none + moe_router_num_groups: null + moe_router_padding_for_fp8: false + moe_router_padding_for_quantization: false + moe_router_pre_softmax: false + moe_router_score_function: softmax + moe_router_topk: 2 + moe_router_topk_limited_devices: null + moe_router_topk_scaling_factor: null + moe_shared_expert_gate: false + moe_shared_expert_intermediate_size: null + moe_shared_expert_overlap: false + moe_token_dispatcher_type: allgather + moe_token_drop_policy: probs + moe_token_dropping: false + moe_use_legacy_grouped_gemm: false + moe_z_loss_coeff: null + mrope_section: null + mtp_enabled: false + mtp_loss_scaling_factor: null + mtp_num_layers: null + mtp_standalone: false + multi_latent_attention: false + no_rope_freq: null + no_sync_func: + _call_: false + _target_: megatron.core.distributed.distributed_data_parallel.DistributedDataParallel.no_sync + normalization: RMSNorm + num_attention_heads: 32 + num_layers: 36 + num_layers_at_end_in_bf16: 1 + num_layers_at_start_in_bf16: 1 + num_layers_in_first_pipeline_stage: null + num_layers_in_last_pipeline_stage: null + num_microbatches_with_partial_activation_checkpoints: null + num_moe_experts: null + num_query_groups: 8 + offload_modules: null + output_layer_init_method: + _args_: [] + _partial_: true + _target_: torch.nn.init.normal_ + mean: 0.0 + std: 0.0023570226039551587 + overlap_moe_expert_parallel_comm: false + overlap_p2p_comm: false + overlap_p2p_comm_warmup_flush: false + parallel_output: true + param_sync_func: null + params_dtype: + _call_: false + _target_: torch.bfloat16 + perform_initialization: true + persist_layer_norm: false + pipeline_dtype: + _call_: false + _target_: torch.bfloat16 + pipeline_model_parallel_comm_backend: null + pipeline_model_parallel_layout: null + pipeline_model_parallel_size: 1 + position_embedding_type: rope + qk_clip: false + qk_clip_alpha: 0.5 + qk_clip_threshold: 100 + qk_layernorm: true + quant_recipe: null + recompute_granularity: full + recompute_method: uniform + recompute_modules: + - core_attn + recompute_num_layers: 1 + restore_modelopt_state: false + rotary_base: 5000000 + rotary_interleaved: false + rotary_percent: 1.0 + scatter_embedding_sequence_parallel: true + seq_len_interpolation_factor: null + seq_length: 262144 + sequence_parallel: false + share_embeddings_and_output_weights: true + should_pad_vocab: false + softmax_scale: null + softmax_type: vanilla + symmetric_ar_type: null + tensor_model_parallel_size: 1 + test_mode: false + timers: null + tp_comm_atomic_ag: false + tp_comm_atomic_rs: false + tp_comm_bootstrap_backend: nccl + tp_comm_bulk_dgrad: true + tp_comm_bulk_wgrad: true + tp_comm_overlap: false + tp_comm_overlap_ag: true + tp_comm_overlap_cfg: null + tp_comm_overlap_disable_fc1: false + tp_comm_overlap_disable_qkv: false + tp_comm_overlap_rs: true + tp_comm_overlap_rs_dgrad: false + tp_comm_split_ag: true + tp_comm_split_rs: true + tp_only_amax_red: false + transformer_impl: transformer_engine + transformer_layer_spec: + _call_: false + _target_: megatron.bridge.models.gpt_provider.default_layer_spec + use_cpu_initialization: false + use_fused_weighted_squared_relu: false + use_kitchen: false + use_mamba_mem_eff_path: true + use_ring_exchange_p2p: false + use_te_activation_func: false + use_te_rng_tracker: false + use_transformer_engine_full_layer_spec: false + use_transformer_engine_op_fuser: false + variable_seq_lengths: false + virtual_pipeline_model_parallel_size: null + vocab_size: 16 + wgrad_deferral_limit: 0 + window_attn_skip_freq: null + window_size: null +nvrx_straggler: null +optimizer: + _target_: megatron.bridge.training.config.OptimizerConfig + adam_beta1: 0.9 + adam_beta2: 0.98 + adam_eps: 1.0e-08 + barrier_with_L1_time: false + bf16: true + clip_grad: 0.5 + config_logger_dir: '' + decoupled_lr: null + decoupled_min_lr: null + decoupled_weight_decay: true + exp_avg_dtype: + _call_: false + _target_: torch.float32 + exp_avg_sq_dtype: + _call_: false + _target_: torch.float32 + fp16: false + fp8_recipe: null + hysteresis: 2 + initial_loss_scale: 4294967296 + log_num_zeros_in_grad: false + loss_scale: null + loss_scale_window: 1000 + lr: 0.0001 + main_grads_dtype: + _call_: false + _target_: torch.float32 + main_params_dtype: + _call_: false + _target_: torch.float32 + min_loss_scale: 1.0 + min_lr: 1.0e-07 + muon_extra_scale_factor: 1.0 + muon_fp32_matmul_prec: medium + muon_momentum: 0.95 + muon_num_ns_steps: 5 + muon_scale_mode: spectral + muon_split_qkv: true + muon_tp_mode: blockwise + muon_use_nesterov: false + optimizer: adam + optimizer_cpu_offload: false + optimizer_offload_fraction: 0.0 + overlap_cpu_optimizer_d2h_h2d: false + overlap_param_gather: false + overlap_param_gather_with_optimizer_step: false + params_dtype: bfloat16 + pin_cpu_grads: true + pin_cpu_params: true + reuse_grad_buf_for_mxfp8_param_ag: false + sgd_momentum: 0.9 + store_param_remainders: true + timers: null + use_distributed_optimizer: true + use_precision_aware_optimizer: false + use_torch_optimizer_for_cpu_offload: false + weight_decay: 0.1 +peft: null +profiling: + _target_: megatron.bridge.training.config.ProfilingConfig + memory_snapshot_path: snapshot.pickle + nvtx_ranges: false + profile_ranks: + - 0 + profile_step_end: 12 + profile_step_start: 10 + record_memory_history: false + record_shapes: false + use_nsys_profiler: false + use_pytorch_profiler: false +rerun_state_machine: + _target_: megatron.bridge.training.config.RerunStateMachineConfig + check_for_nan_in_loss: true + check_for_spiky_loss: false + error_injection_rate: 0 + error_injection_type: transient_error + rerun_mode: disabled +rng: + _target_: megatron.bridge.training.config.RNGConfig + data_parallel_random_init: false + inference_rng_tracker: false + seed: 1234 + te_rng_tracker: false +scheduler: + _target_: megatron.bridge.training.config.SchedulerConfig + end_weight_decay: 0.1 + lr_decay_iters: 12716 + lr_decay_samples: null + lr_decay_steps: 3255296 + lr_decay_style: linear + lr_warmup_fraction: null + lr_warmup_init: 1.0e-06 + lr_warmup_iters: 200 + lr_warmup_samples: 0 + lr_warmup_steps: 51200 + lr_wsd_decay_iters: null + lr_wsd_decay_samples: null + lr_wsd_decay_style: exponential + no_weight_decay_cond_type: null + override_opt_param_scheduler: false + start_weight_decay: 0.1 + use_checkpoint_opt_param_scheduler: false + wd_incr_steps: 1528832 + weight_decay_incr_style: constant + wsd_decay_steps: null +straggler: null +tensor_inspect: null +tokenizer: + _target_: megatron.bridge.training.tokenizers.config.TokenizerConfig + hf_tokenizer_kwargs: {} + image_tag_type: null + merge_file: null + special_tokens: null + tiktoken_num_special_tokens: 1000 + tiktoken_pattern: null + tiktoken_special_tokens: null + tokenizer_model: ./models/Qwen-NVARC + tokenizer_prompt_format: null + tokenizer_type: HuggingFaceTokenizer + vocab_extra_ids: 0 + vocab_file: null + vocab_size: null +train: + _target_: megatron.bridge.training.config.TrainingConfig + check_weight_hash_across_dp_replicas_interval: null + decrease_batch_size_if_needed: false + empty_unused_memory_level: 0 + eval_interval: 1000 + eval_iters: 100 + exit_duration_in_mins: null + exit_interval: null + exit_signal: + _args_: + - 15 + _call_: true + _target_: signal.Signals + exit_signal_handler: false + exit_signal_handler_for_dataloader: false + global_batch_size: 256 + iterations_to_skip: [] + manual_gc: false + manual_gc_eval: true + manual_gc_interval: 0 + micro_batch_size: 1 + rampup_batch_size: null + skip_train: false + train_iters: 5972 + train_samples: null + train_sync_interval: null diff --git a/step_5400/policy/weights/iter_0000000/train_state.pt b/step_5400/policy/weights/iter_0000000/train_state.pt new file mode 100644 index 0000000000000000000000000000000000000000..7362367436b8d879b575604f086662820bb1ab6b --- /dev/null +++ b/step_5400/policy/weights/iter_0000000/train_state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9077193280ecd59384d5ae94d97a7943638810502f2ce08a7617aeb20b3586d2 +size 3461 diff --git a/step_5400/policy/weights/latest_checkpointed_iteration.txt b/step_5400/policy/weights/latest_checkpointed_iteration.txt new file mode 100644 index 0000000000000000000000000000000000000000..c227083464fb9af8955c90d2924774ee50abb547 --- /dev/null +++ b/step_5400/policy/weights/latest_checkpointed_iteration.txt @@ -0,0 +1 @@ +0 \ No newline at end of file diff --git a/step_5400/policy/weights/latest_train_state.pt b/step_5400/policy/weights/latest_train_state.pt new file mode 100644 index 0000000000000000000000000000000000000000..7362367436b8d879b575604f086662820bb1ab6b --- /dev/null +++ b/step_5400/policy/weights/latest_train_state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9077193280ecd59384d5ae94d97a7943638810502f2ce08a7617aeb20b3586d2 +size 3461 diff --git a/step_5400/train_dataloader.pt b/step_5400/train_dataloader.pt new file mode 100644 index 0000000000000000000000000000000000000000..5bb3abe30d1422ff52f94bbd0abcc26db1953a4c --- /dev/null +++ b/step_5400/train_dataloader.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99b28546a485528f6242d1b9dcf951cc95a6af0ca81a13ded15a567a8c9d2f7f +size 7336 diff --git a/step_5400/training_info.json b/step_5400/training_info.json new file mode 100644 index 0000000000000000000000000000000000000000..62e07bfba283099bdd4b7d7b9f20c873b963172b --- /dev/null +++ b/step_5400/training_info.json @@ -0,0 +1 @@ +{"epoch": 0, "step": 5400, "total_steps": 5400, "consumed_samples": 1382400, "total_valid_tokens": 1568487826.0, "val:val_loss": 0.14914798736572266} \ No newline at end of file diff --git a/step_5600/config.yaml b/step_5600/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e935c0bf4a22e0bf18c2273893d86b8c3c2798df --- /dev/null +++ b/step_5600/config.yaml @@ -0,0 +1,207 @@ +checkpointing: + checkpoint_dir: results/qwen3_4b_sft + checkpoint_must_save_by: null + enabled: true + higher_is_better: false + keep_top_k: 3 + metric_name: val:val_loss + save_period: 200 +cluster: + gpus_per_node: 2 + num_nodes: 1 +data: + num_workers: 4 + shuffle: true + train_dataset_path: + - ./data/hones + val_dataset_path: ./data/arc2_evaluation6 +logger: + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 + log_dir: logs/exp_019 + mlflow_enabled: false + monitor_gpus: false + swanlab_enabled: false + tensorboard_enabled: false + wandb: + name: qwen3_4b_sft + project: arc2 + wandb_enabled: true +policy: + activation_checkpointing_enabled: false + attn_implementation: flash_attention_2 + dtensor_cfg: + enabled: false + dynamic_batching: + enabled: false + fsdp_offload_enabled: false + make_sequence_length_divisible_by: 64 + max_grad_norm: null + megatron_cfg: + activation_checkpointing: true + apply_rope_fusion: true + bias_activation_fusion: false + context_parallel_size: 2 + distributed_data_parallel_config: + average_in_collective: true + data_parallel_sharding_strategy: optim_grads_params + grad_reduce_in_fp32: true + overlap_grad_reduce: true + overlap_param_gather: true + empty_unused_memory_level: 1 + enabled: true + env_vars: + AWS_OFI_NCCL_VERSION: 1.14.0 + BASH_ENV: /etc/bash.bashrc + CAL_VERSION: 0.4.4.50 + CUBLASMP_VERSION: 0.4.0.789 + CUBLAS_VERSION: 12.9.0.13 + CUDA_ARCH_LIST: 7.5 8.0 8.6 9.0 10.0 12.0 + CUDA_DRIVER_VERSION: 575.51.03 + CUDA_VERSION: 12.9.0.043 + CUDA_VISIBLE_DEVICES: 6,7 + CUDNN_FRONTEND_VERSION: 1.11.0 + CUDNN_VERSION: 9.10.1.4 + CUFFT_VERSION: 11.4.0.6 + CUFILE_VERSION: 1.14.0.30 + CURAND_VERSION: 10.3.10.19 + CUSOLVER_VERSION: 11.7.4.40 + CUSPARSELT_VERSION: 0.7.1.0 + CUSPARSE_VERSION: 12.5.9.5 + DALI_BUILD: '' + DALI_URL_SUFFIX: '120' + DALI_VERSION: 1.49.0 + EFA_VERSION: 1.38.1 + ENV: /etc/shinit_v2 + GDRCOPY_VERSION: 2.4.4 + HOME: /root + HOSTNAME: e6ad2ac15863 + HPCX_VERSION: '2.23' + KMP_DUPLICATE_LIB_OK: 'True' + KMP_INIT_AT_FORK: 'FALSE' + LC_CTYPE: C.UTF-8 + LD_LIBRARY_PATH: /usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 + LESSCLOSE: /usr/bin/lesspipe %s %s + LESSOPEN: '| /usr/bin/lesspipe %s' + LIBRARY_PATH: '/usr/local/cuda/lib64/stubs:' + LS_COLORS: 'rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=00:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.avif=01;35:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.webp=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:*~=00;90:*#=00;90:*.bak=00;90:*.crdownload=00;90:*.dpkg-dist=00;90:*.dpkg-new=00;90:*.dpkg-old=00;90:*.dpkg-tmp=00;90:*.old=00;90:*.orig=00;90:*.part=00;90:*.rej=00;90:*.rpmnew=00;90:*.rpmorig=00;90:*.rpmsave=00;90:*.swp=00;90:*.tmp=00;90:*.ucf-dist=00;90:*.ucf-new=00;90:*.ucf-old=00;90:' + MODEL_OPT_VERSION: 0.27.1 + MOFED_VERSION: 5.4-rdmacore50.0 + NCCL_NET_PLUGIN: aws-ofi + NCCL_TUNER_PLUGIN: aws-ofi + NCCL_VERSION: 2.26.5 + NEMO_RL_COMMIT: 90ecaaccdf66fb92f814b2325db81a0105ed933a + NEMO_RL_VENV_DIR: /opt/ray_venvs + NPP_VERSION: 12.4.0.27 + NRL_CONTAINER: '1' + NRL_MEGATRON_CHECKPOINT_DIR: ./results/megatron + NSIGHT_COMPUTE_VERSION: 2025.2.0.11 + NSIGHT_SYSTEMS_VERSION: 2025.3.1.90 + NVIDIA_BUILD_ID: '244212578' + NVIDIA_BUILD_REF: 6ba29f15920b12095387b8c82cea05ac3d6d9732 + NVIDIA_DRIVER_CAPABILITIES: compute,utility,video + NVIDIA_PRODUCT_NAME: CUDA + NVIDIA_REQUIRE_CUDA: cuda>=9.0 + NVIDIA_REQUIRE_JETPACK_HOST_MOUNTS: '' + NVIDIA_VISIBLE_DEVICES: all + NVJITLINK_VERSION: 12.9.41 + NVJPEG_VERSION: 12.4.0.16 + NVSHMEM_VERSION: 3.2.5 + OLDPWD: /workspace + OMPI_MCA_coll_hcoll_enable: '0' + OPAL_PREFIX: /opt/hpcx/ompi + OPENMPI_VERSION: 4.1.7 + OPENUCX_VERSION: 1.19.0 + PATH: /opt/nemo_rl_venv/bin:/opt/nemo_rl_venv/bin:/opt/nemo_rl_venv/bin:/root/.local/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/mpi/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/ucx/bin:/opt/amazon/efa/bin + POLYGRAPHY_VERSION: 0.49.20 + PWD: /workspace/ARChitects + PYTHONBREAKPOINT: ray.util.rpdb._driver_set_trace + PYTHONPATH: '/workspace/NVARC/ARChitects/nemo-rl:' + PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True + RAY_CLIENT_MODE: '0' + RAY_ENABLE_UV_RUN_RUNTIME_ENV: '0' + RAY_USAGE_STATS_ENABLED: '0' + RDMACORE_VERSION: '50.0' + SHELL: /bin/bash + SHLVL: '2' + SWANLAB_API_HOST: https://api.swanlab.cn/api + SWANLAB_RUNTIME: user + SWANLAB_WEB_HOST: https://swanlab.cn + TERM: xterm + TORCH_CUDA_ARCH_LIST: '9.0' + TRANSFORMER_ENGINE_VERSION: '2.3' + TRTOSS_VERSION: '' + TRT_VERSION: 10.10.0.31 + UV: /root/.local/bin/uv + UV_LINK_MODE: copy + UV_PROJECT_ENVIRONMENT: /opt/nemo_rl_venv + UV_RUN_RECURSION_DEPTH: '1' + VIRTUAL_ENV: /opt/nemo_rl_venv + VIRTUAL_ENV_PROMPT: nemo-rl + WANDB_SERVICE: 3-239782-unix-/tmp/wandb-239782-251940-2530276073/socket + _: /root/.local/bin/uv + _CUDA_COMPAT_PATH: /usr/local/cuda/compat + _CUDA_COMPAT_STATUS: System has unsupported display driver / cuda driver combination + (CUDA_ERROR_SYSTEM_DRIVER_MISMATCH) cuInit()=803 + _MLFLOW_TELEMETRY_SESSION_ID: 8ac94e5a44a94a62a2cf3302ff4fa0c9 + expert_model_parallel_size: 1 + expert_tensor_parallel_size: 1 + freeze_moe_router: true + moe_permute_fusion: false + moe_router_bias_update_rate: 0.0 + moe_router_dtype: fp64 + moe_router_load_balancing_type: none + num_layers_in_first_pipeline_stage: null + num_layers_in_last_pipeline_stage: null + optimizer: + adam_beta1: 0.9 + adam_beta2: 0.98 + adam_eps: 1.0e-08 + bf16: true + clip_grad: 0.5 + fp16: false + lr: 0.0001 + min_lr: 1.0e-07 + optimizer: adam + optimizer_cpu_offload: false + optimizer_offload_fraction: 0.0 + params_dtype: bfloat16 + sgd_momentum: 0.9 + use_distributed_optimizer: true + use_precision_aware_optimizer: false + weight_decay: 0.1 + pipeline_dtype: bfloat16 + pipeline_model_parallel_size: 1 + scheduler: + end_weight_decay: 0.1 + lr_decay_iters: 12716 + lr_decay_style: linear + lr_warmup_init: 1.0e-06 + lr_warmup_iters: 200 + start_weight_decay: 0.1 + weight_decay_incr_style: constant + sequence_parallel: false + tensor_model_parallel_size: 1 + train_iters: 5972 + model_name: ./models/Qwen-NVARC + offload_optimizer_for_logprob: false + precision: bfloat16 + sequence_packing: + algorithm: modified_first_fit_decreasing + enabled: true + sequence_length_round: 64 + train_mb_tokens: 128000 + tokenizer: + name: ./models/Qwen-NVARC + train_global_batch_size: 256 + train_micro_batch_size: 1 +sft: + max_num_epochs: 1 + max_num_steps: 6400 + seed: 24 + val_at_start: true + val_batches: 200 + val_global_batch_size: 256 + val_micro_batch_size: 1 + val_period: 200 diff --git a/step_5600/policy/weights/iter_0000000/.metadata b/step_5600/policy/weights/iter_0000000/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..996fdc7aec13be8d487fd8ca22468be7520b9b5d --- /dev/null +++ b/step_5600/policy/weights/iter_0000000/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a1c916057dfe0e2002fe62982907832c5f702012c7360c6613f4a610084f748 +size 329201 diff --git a/step_5600/policy/weights/iter_0000000/common.pt b/step_5600/policy/weights/iter_0000000/common.pt new file mode 100644 index 0000000000000000000000000000000000000000..1ee19e4f1d85ef522cb53f4f53846f00ea8d05f0 --- /dev/null +++ b/step_5600/policy/weights/iter_0000000/common.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca3b2e874e687352cb92c06fbffd56051ad75663c60fb5d275365fa00e02a4bb +size 1767 diff --git a/step_5600/policy/weights/iter_0000000/metadata.json b/step_5600/policy/weights/iter_0000000/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..c9dbde0bcf4d2d993122dcc7d6bcb1eef8b6fb77 --- /dev/null +++ b/step_5600/policy/weights/iter_0000000/metadata.json @@ -0,0 +1 @@ +{"sharded_backend": "torch_dist", "sharded_backend_version": 1, "common_backend": "torch", "common_backend_version": 1} \ No newline at end of file diff --git a/step_5600/policy/weights/iter_0000000/modelopt_run_config.yaml b/step_5600/policy/weights/iter_0000000/modelopt_run_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..44ee391cb75b24b9ae1693ad28ca2c1c6a0b1f25 --- /dev/null +++ b/step_5600/policy/weights/iter_0000000/modelopt_run_config.yaml @@ -0,0 +1,203 @@ +activation_func: +activation_func_clamp_value: None +add_bias_linear: false +add_qkv_bias: false +apply_query_key_layer_scaling: false +apply_residual_connection_post_layernorm: false +apply_rope_fusion: true +attention_backend: AttnBackend.auto +attention_dropout: '0.0' +attention_output_gate: false +attention_softmax_in_fp32: false +autocast_dtype: torch.bfloat16 +barrier_with_L1_time: true +bf16: true +bias_activation_fusion: false +bias_dropout_fusion: false +calculate_per_token_loss: true +clone_scatter_output_in_embedding: true +config_logger_dir: '' +cross_entropy_fusion_impl: native +cross_entropy_loss_fusion: true +defer_embedding_wgrad_compute: false +delay_wgrad_compute: false +deterministic_mode: false +disable_bf16_reduced_precision_matmul: false +disable_parameter_transpose_cache: false +distribute_saved_activations: None +enable_autocast: false +fallback_to_eager_attn: false +ffn_hidden_size: 9728 +finalize_model_grads_func: functools.partial(, + pg_collection=None) +fine_grained_activation_offloading: false +first_last_layers_bf16: false +flash_decode: false +fp16: false +fp16_lm_cross_entropy: false +fp32_residual_connection: false +fused_single_qkv_rope: false +gated_linear_unit: true +generation_config: None +glu_linear_offset: '0.0' +grad_scale_func: > +grad_sync_func: "" +gradient_accumulation_fusion: false +hetereogenous_dist_checkpoint: false +heterogeneous_block_specs: false +hf_model_id: ./models/Qwen-NVARC +hidden_dropout: '0.0' +hidden_size: 2560 +is_hybrid_model: false +kv_channels: 128 +layernorm_epsilon: 1e-06 +layernorm_zero_centered_gamma: false +linear_attention_freq: None +linear_attention_type: None +linear_conv_kernel_dim: None +linear_key_head_dim: None +linear_num_key_heads: None +linear_num_value_heads: None +linear_value_head_dim: None +log_max_attention_logit: false +make_vocab_size_divisible_by: 16 +mamba_head_dim: 64 +mamba_num_groups: 8 +mamba_num_heads: None +mamba_state_dim: 128 +masked_softmax_fusion: true +max_position_embeddings: 40960 +memory_efficient_layer_norm: false +min_offloaded_tensor_size: 1048576 +mlp_chunks_for_prefill: 1 +moe_apply_probs_on_input: false +moe_aux_loss_coeff: '0.0' +moe_deepep_num_sms: 20 +moe_enable_deepep: false +moe_expert_capacity_factor: None +moe_extended_tp: false +moe_ffn_hidden_size: None +moe_flex_dispatcher_backend: deepep +moe_grouped_gemm: false +moe_hybridep_num_sms: 16 +moe_input_jitter_eps: None +moe_layer_freq: 1 +moe_pad_expert_input_to_capacity: false +moe_per_layer_logging: false +moe_permute_fusion: false +moe_router_bias_update_rate: '0.0' +moe_router_dtype: fp64 +moe_router_enable_expert_bias: false +moe_router_force_load_balancing: false +moe_router_fusion: false +moe_router_group_topk: None +moe_router_load_balancing_type: none +moe_router_num_groups: None +moe_router_padding_for_quantization: false +moe_router_pre_softmax: false +moe_router_score_function: softmax +moe_router_topk: 2 +moe_router_topk_limited_devices: None +moe_router_topk_scaling_factor: None +moe_shared_expert_gate: false +moe_shared_expert_intermediate_size: None +moe_shared_expert_overlap: false +moe_token_dispatcher_type: allgather +moe_token_drop_policy: probs +moe_token_dropping: false +moe_use_legacy_grouped_gemm: false +moe_z_loss_coeff: None +mrope_section: None +multi_latent_attention: false +no_rope_freq: None +no_sync_func: "" +normalization: RMSNorm +num_attention_heads: 32 +num_layers: 36 +num_layers_at_end_in_bf16: 1 +num_layers_at_start_in_bf16: 1 +num_moe_experts: None +num_query_groups: 8 +nvidia_modelopt_version: 0.39.0 +offload_modules: None +param_sync_func: None +params_dtype: torch.bfloat16 +perform_initialization: true +persist_layer_norm: false +position_embedding_type: rope +qk_clip: false +qk_clip_alpha: '0.5' +qk_clip_threshold: 100 +qk_layernorm: true +quant_recipe: None +restore_modelopt_state: false +rotary_base: 5000000 +rotary_interleaved: false +rotary_percent: '1.0' +seq_len_interpolation_factor: None +seq_length: 262144 +share_embeddings_and_output_weights: true +should_pad_vocab: false +softmax_scale: None +softmax_type: vanilla +symmetric_ar_type: None +test_mode: false +timers: None +transformer_impl: transformer_engine +transformer_layer_spec: +use_fused_weighted_squared_relu: false +use_kitchen: false +use_mamba_mem_eff_path: true +use_ring_exchange_p2p: false +use_te_activation_func: false +use_te_rng_tracker: false +use_transformer_engine_full_layer_spec: false +use_transformer_engine_op_fuser: false +variable_seq_lengths: false +vocab_size: 16 +wgrad_deferral_limit: 0 +window_attn_skip_freq: None +window_size: None diff --git a/step_5600/policy/weights/iter_0000000/run_config.yaml b/step_5600/policy/weights/iter_0000000/run_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..75ad721f1a4134b0de4769cfce0e3bfafdad186b --- /dev/null +++ b/step_5600/policy/weights/iter_0000000/run_config.yaml @@ -0,0 +1,564 @@ +_target_: megatron.bridge.training.config.ConfigContainer +checkpoint: + _target_: megatron.bridge.training.config.CheckpointConfig + async_save: false + ckpt_assume_constant_structure: false + ckpt_convert_format: null + ckpt_convert_save: null + ckpt_format: torch_dist + ckpt_step: null + dist_ckpt_optim_fully_reshardable: false + dist_ckpt_save_pre_mcore_014: false + dist_ckpt_strictness: assume_ok_unexpected + distrib_optim_fully_reshardable_mem_efficient: false + exit_on_missing_checkpoint: false + finetune: true + fully_parallel_load: true + fully_parallel_save: true + load: null + load_main_params_from_ckpt: false + load_optim: true + load_rng: false + most_recent_k: -1 + non_persistent_ckpt_type: null + non_persistent_global_ckpt_dir: null + non_persistent_local_ckpt_algo: fully_parallel + non_persistent_local_ckpt_dir: null + non_persistent_save_interval: null + pretrained_checkpoint: ./results/megatron/model_._models_Qwen-NVARC + replication: false + replication_factor: 2 + replication_jump: null + save: /workspace/ARChitects/results/qwen3_4b_sft/tmp_step_5600/policy/weights + save_interval: 100 + save_optim: true + save_rng: true + save_tokenizer_assets: true + strict_fsdp_dtensor_load: false + use_checkpoint_args: false + use_persistent_ckpt_worker: true +comm_overlap: null +dataset: null +ddp: + _target_: megatron.bridge.training.config.DistributedDataParallelConfig + align_param_gather: false + average_in_collective: false + bucket_size: 40000000 + check_for_large_grads: false + check_for_nan_in_grad: true + data_parallel_sharding_strategy: optim_grads_params + delay_wgrad_compute: false + disable_symmetric_registration: false + fp8_param_gather: false + fsdp_double_buffer: false + grad_reduce_in_fp32: true + gradient_reduce_div_fusion: true + keep_fp8_transpose_cache: false + nccl_ub: false + num_distributed_optimizer_instances: 1 + outer_dp_sharding_strategy: no_shard + overlap_grad_reduce: true + overlap_param_gather: true + pad_buckets_for_high_nccl_busbw: false + preserve_fp32_weights: true + reduce_scatter_with_fp32_accumulation: false + reuse_grad_buf_for_mxfp8_param_ag: false + suggested_communication_unit_size: null + use_custom_fsdp: false + use_distributed_optimizer: true + use_megatron_fsdp: false +dist: + _target_: megatron.bridge.training.config.DistributedInitConfig + align_grad_reduce: true + disable_jit_fuser: false + distributed_backend: nccl + distributed_timeout_minutes: 10 + distributed_timeout_seconds_after_init: null + enable_megatron_core_experimental: false + external_gpu_device_mapping: true + high_priority_stream_groups: null + lazy_init: false + local_rank: 0 + nccl_communicator_config_path: null + sharp_enabled_group: null + use_gloo_process_groups: true + use_megatron_fsdp: false + use_sharp: false + use_torch_fsdp2: false + use_tp_pp_dp_mapping: false +ft: null +inprocess_restart: null +logger: + _target_: megatron.bridge.training.config.LoggerConfig + filter_warnings: true + log_energy: false + log_interval: 100 + log_l2_norm_grad_to_tensorboard: false + log_loss_scale_to_tensorboard: true + log_memory_to_tensorboard: false + log_params_norm: false + log_progress: false + log_runtime_to_tensorboard: false + log_throughput: false + log_throughput_to_tensorboard: false + log_timers_to_tensorboard: false + log_validation_ppl_to_tensorboard: false + log_world_size_to_tensorboard: false + logging_level: 0 + memory_keys: null + modules_to_filter: null + runtime_time_unit: hours + save_config_filepath: null + set_level_for_all_loggers: false + tensorboard_dir: null + tensorboard_log_interval: 1 + tensorboard_queue_size: 1000 + throughput_window_size: 100 + timing_log_level: 0 + timing_log_option: minmax + wandb_entity: null + wandb_exp_name: null + wandb_project: null + wandb_save_dir: null +mixed_precision: null +model: + _target_: megatron.bridge.models.qwen.qwen_provider.Qwen3ModelProvider + account_for_embedding_in_pipeline_split: false + account_for_loss_in_pipeline_split: false + activation_func: + _call_: false + _target_: torch.nn.functional.silu + activation_func_clamp_value: null + activation_func_fp8_input_store: false + add_bias_linear: false + add_qkv_bias: false + apply_query_key_layer_scaling: false + apply_residual_connection_post_layernorm: false + apply_rope_fusion: true + async_tensor_model_parallel_allreduce: false + attention_backend: + _args_: + - 5 + _call_: true + _target_: megatron.core.transformer.enums.AttnBackend + attention_dropout: 0.0 + attention_output_gate: false + attention_softmax_in_fp32: false + autocast_dtype: + _call_: false + _target_: torch.bfloat16 + barrier_with_L1_time: true + batch_p2p_comm: true + batch_p2p_sync: true + bf16: true + bias_activation_fusion: false + bias_dropout_fusion: false + calculate_per_token_loss: true + clone_scatter_output_in_embedding: true + config_logger_dir: '' + context_parallel_size: 2 + cp_comm_type: null + cpu_offloading: false + cpu_offloading_activations: true + cpu_offloading_double_buffering: false + cpu_offloading_num_layers: 0 + cpu_offloading_weights: false + cross_entropy_fusion_impl: native + cross_entropy_loss_fusion: true + cuda_graph_impl: none + cuda_graph_retain_backward_graph: false + cuda_graph_scope: [] + cuda_graph_use_single_mempool: false + cuda_graph_warmup_steps: 3 + deallocate_pipeline_outputs: true + defer_embedding_wgrad_compute: false + delay_wgrad_compute: false + deterministic_mode: false + disable_bf16_reduced_precision_matmul: false + disable_parameter_transpose_cache: false + distribute_saved_activations: null + embedding_init_method: + _args_: [] + _partial_: true + _target_: torch.nn.init.normal_ + mean: 0.0 + std: 0.02 + embedding_init_method_std: 0.02 + enable_autocast: false + enable_cuda_graph: false + expert_model_parallel_size: 1 + expert_tensor_parallel_size: 1 + external_cuda_graph: false + fallback_to_eager_attn: false + ffn_hidden_size: 9728 + finalize_model_grads_func: + _args_: [] + _partial_: true + _target_: megatron.core.distributed.finalize_model_grads.finalize_model_grads + pg_collection: null + fine_grained_activation_offloading: false + first_last_layers_bf16: false + flash_decode: false + fp16: false + fp16_lm_cross_entropy: false + fp32_residual_connection: false + fp4: null + fp4_param: false + fp4_quantizer_factory: null + fp4_recipe: nvfp4 + fp8: null + fp8_amax_compute_algo: most_recent + fp8_amax_history_len: 1 + fp8_dot_product_attention: false + fp8_interval: 1 + fp8_margin: 0 + fp8_multi_head_attention: false + fp8_param: false + fp8_quantizer_factory: null + fp8_recipe: delayed + fp8_wgrad: true + fused_single_qkv_rope: false + gated_linear_unit: true + generation_config: null + glu_linear_offset: 0.0 + grad_scale_func: + _call_: false + _target_: megatron.core.optimizer.optimizer.MegatronOptimizer.scale_loss + grad_sync_func: + _call_: false + _target_: megatron.core.distributed.distributed_data_parallel.DistributedDataParallel.start_grad_sync + gradient_accumulation_fusion: false + hetereogenous_dist_checkpoint: false + heterogeneous_block_specs: false + hf_model_id: ./models/Qwen-NVARC + hidden_dropout: 0.0 + hidden_size: 2560 + hierarchical_context_parallel_sizes: null + inference_rng_tracker: false + inference_sampling_seed: 42 + init_method: + _args_: [] + _partial_: true + _target_: torch.nn.init.normal_ + mean: 0.0 + std: 0.02 + init_method_std: 0.02 + init_model_with_meta_device: false + is_hybrid_model: false + kv_channels: 128 + layernorm_epsilon: 1.0e-06 + layernorm_zero_centered_gamma: false + linear_attention_freq: null + linear_attention_type: null + linear_conv_kernel_dim: null + linear_key_head_dim: null + linear_num_key_heads: null + linear_num_value_heads: null + linear_value_head_dim: null + log_max_attention_logit: false + make_vocab_size_divisible_by: 16 + mamba_head_dim: 64 + mamba_num_groups: 8 + mamba_num_heads: null + mamba_state_dim: 128 + masked_softmax_fusion: true + max_position_embeddings: 40960 + memory_efficient_layer_norm: false + microbatch_group_size_per_vp_stage: 1 + min_offloaded_tensor_size: 1048576 + mlp_chunks_for_prefill: 1 + moe_apply_probs_on_input: false + moe_aux_loss_coeff: 0.0 + moe_deepep_num_sms: 20 + moe_enable_deepep: false + moe_expert_capacity_factor: null + moe_extended_tp: false + moe_ffn_hidden_size: null + moe_flex_dispatcher_backend: deepep + moe_grouped_gemm: false + moe_hybridep_num_sms: 16 + moe_input_jitter_eps: null + moe_layer_freq: 1 + moe_layer_recompute: false + moe_pad_expert_input_to_capacity: false + moe_per_layer_logging: false + moe_permute_fusion: false + moe_router_bias_update_rate: 0.0 + moe_router_dtype: fp64 + moe_router_enable_expert_bias: false + moe_router_force_load_balancing: false + moe_router_fusion: false + moe_router_group_topk: null + moe_router_load_balancing_type: none + moe_router_num_groups: null + moe_router_padding_for_fp8: false + moe_router_padding_for_quantization: false + moe_router_pre_softmax: false + moe_router_score_function: softmax + moe_router_topk: 2 + moe_router_topk_limited_devices: null + moe_router_topk_scaling_factor: null + moe_shared_expert_gate: false + moe_shared_expert_intermediate_size: null + moe_shared_expert_overlap: false + moe_token_dispatcher_type: allgather + moe_token_drop_policy: probs + moe_token_dropping: false + moe_use_legacy_grouped_gemm: false + moe_z_loss_coeff: null + mrope_section: null + mtp_enabled: false + mtp_loss_scaling_factor: null + mtp_num_layers: null + mtp_standalone: false + multi_latent_attention: false + no_rope_freq: null + no_sync_func: + _call_: false + _target_: megatron.core.distributed.distributed_data_parallel.DistributedDataParallel.no_sync + normalization: RMSNorm + num_attention_heads: 32 + num_layers: 36 + num_layers_at_end_in_bf16: 1 + num_layers_at_start_in_bf16: 1 + num_layers_in_first_pipeline_stage: null + num_layers_in_last_pipeline_stage: null + num_microbatches_with_partial_activation_checkpoints: null + num_moe_experts: null + num_query_groups: 8 + offload_modules: null + output_layer_init_method: + _args_: [] + _partial_: true + _target_: torch.nn.init.normal_ + mean: 0.0 + std: 0.0023570226039551587 + overlap_moe_expert_parallel_comm: false + overlap_p2p_comm: false + overlap_p2p_comm_warmup_flush: false + parallel_output: true + param_sync_func: null + params_dtype: + _call_: false + _target_: torch.bfloat16 + perform_initialization: true + persist_layer_norm: false + pipeline_dtype: + _call_: false + _target_: torch.bfloat16 + pipeline_model_parallel_comm_backend: null + pipeline_model_parallel_layout: null + pipeline_model_parallel_size: 1 + position_embedding_type: rope + qk_clip: false + qk_clip_alpha: 0.5 + qk_clip_threshold: 100 + qk_layernorm: true + quant_recipe: null + recompute_granularity: full + recompute_method: uniform + recompute_modules: + - core_attn + recompute_num_layers: 1 + restore_modelopt_state: false + rotary_base: 5000000 + rotary_interleaved: false + rotary_percent: 1.0 + scatter_embedding_sequence_parallel: true + seq_len_interpolation_factor: null + seq_length: 262144 + sequence_parallel: false + share_embeddings_and_output_weights: true + should_pad_vocab: false + softmax_scale: null + softmax_type: vanilla + symmetric_ar_type: null + tensor_model_parallel_size: 1 + test_mode: false + timers: null + tp_comm_atomic_ag: false + tp_comm_atomic_rs: false + tp_comm_bootstrap_backend: nccl + tp_comm_bulk_dgrad: true + tp_comm_bulk_wgrad: true + tp_comm_overlap: false + tp_comm_overlap_ag: true + tp_comm_overlap_cfg: null + tp_comm_overlap_disable_fc1: false + tp_comm_overlap_disable_qkv: false + tp_comm_overlap_rs: true + tp_comm_overlap_rs_dgrad: false + tp_comm_split_ag: true + tp_comm_split_rs: true + tp_only_amax_red: false + transformer_impl: transformer_engine + transformer_layer_spec: + _call_: false + _target_: megatron.bridge.models.gpt_provider.default_layer_spec + use_cpu_initialization: false + use_fused_weighted_squared_relu: false + use_kitchen: false + use_mamba_mem_eff_path: true + use_ring_exchange_p2p: false + use_te_activation_func: false + use_te_rng_tracker: false + use_transformer_engine_full_layer_spec: false + use_transformer_engine_op_fuser: false + variable_seq_lengths: false + virtual_pipeline_model_parallel_size: null + vocab_size: 16 + wgrad_deferral_limit: 0 + window_attn_skip_freq: null + window_size: null +nvrx_straggler: null +optimizer: + _target_: megatron.bridge.training.config.OptimizerConfig + adam_beta1: 0.9 + adam_beta2: 0.98 + adam_eps: 1.0e-08 + barrier_with_L1_time: false + bf16: true + clip_grad: 0.5 + config_logger_dir: '' + decoupled_lr: null + decoupled_min_lr: null + decoupled_weight_decay: true + exp_avg_dtype: + _call_: false + _target_: torch.float32 + exp_avg_sq_dtype: + _call_: false + _target_: torch.float32 + fp16: false + fp8_recipe: null + hysteresis: 2 + initial_loss_scale: 4294967296 + log_num_zeros_in_grad: false + loss_scale: null + loss_scale_window: 1000 + lr: 0.0001 + main_grads_dtype: + _call_: false + _target_: torch.float32 + main_params_dtype: + _call_: false + _target_: torch.float32 + min_loss_scale: 1.0 + min_lr: 1.0e-07 + muon_extra_scale_factor: 1.0 + muon_fp32_matmul_prec: medium + muon_momentum: 0.95 + muon_num_ns_steps: 5 + muon_scale_mode: spectral + muon_split_qkv: true + muon_tp_mode: blockwise + muon_use_nesterov: false + optimizer: adam + optimizer_cpu_offload: false + optimizer_offload_fraction: 0.0 + overlap_cpu_optimizer_d2h_h2d: false + overlap_param_gather: false + overlap_param_gather_with_optimizer_step: false + params_dtype: bfloat16 + pin_cpu_grads: true + pin_cpu_params: true + reuse_grad_buf_for_mxfp8_param_ag: false + sgd_momentum: 0.9 + store_param_remainders: true + timers: null + use_distributed_optimizer: true + use_precision_aware_optimizer: false + use_torch_optimizer_for_cpu_offload: false + weight_decay: 0.1 +peft: null +profiling: + _target_: megatron.bridge.training.config.ProfilingConfig + memory_snapshot_path: snapshot.pickle + nvtx_ranges: false + profile_ranks: + - 0 + profile_step_end: 12 + profile_step_start: 10 + record_memory_history: false + record_shapes: false + use_nsys_profiler: false + use_pytorch_profiler: false +rerun_state_machine: + _target_: megatron.bridge.training.config.RerunStateMachineConfig + check_for_nan_in_loss: true + check_for_spiky_loss: false + error_injection_rate: 0 + error_injection_type: transient_error + rerun_mode: disabled +rng: + _target_: megatron.bridge.training.config.RNGConfig + data_parallel_random_init: false + inference_rng_tracker: false + seed: 1234 + te_rng_tracker: false +scheduler: + _target_: megatron.bridge.training.config.SchedulerConfig + end_weight_decay: 0.1 + lr_decay_iters: 12716 + lr_decay_samples: null + lr_decay_steps: 3255296 + lr_decay_style: linear + lr_warmup_fraction: null + lr_warmup_init: 1.0e-06 + lr_warmup_iters: 200 + lr_warmup_samples: 0 + lr_warmup_steps: 51200 + lr_wsd_decay_iters: null + lr_wsd_decay_samples: null + lr_wsd_decay_style: exponential + no_weight_decay_cond_type: null + override_opt_param_scheduler: false + start_weight_decay: 0.1 + use_checkpoint_opt_param_scheduler: false + wd_incr_steps: 1528832 + weight_decay_incr_style: constant + wsd_decay_steps: null +straggler: null +tensor_inspect: null +tokenizer: + _target_: megatron.bridge.training.tokenizers.config.TokenizerConfig + hf_tokenizer_kwargs: {} + image_tag_type: null + merge_file: null + special_tokens: null + tiktoken_num_special_tokens: 1000 + tiktoken_pattern: null + tiktoken_special_tokens: null + tokenizer_model: ./models/Qwen-NVARC + tokenizer_prompt_format: null + tokenizer_type: HuggingFaceTokenizer + vocab_extra_ids: 0 + vocab_file: null + vocab_size: null +train: + _target_: megatron.bridge.training.config.TrainingConfig + check_weight_hash_across_dp_replicas_interval: null + decrease_batch_size_if_needed: false + empty_unused_memory_level: 0 + eval_interval: 1000 + eval_iters: 100 + exit_duration_in_mins: null + exit_interval: null + exit_signal: + _args_: + - 15 + _call_: true + _target_: signal.Signals + exit_signal_handler: false + exit_signal_handler_for_dataloader: false + global_batch_size: 256 + iterations_to_skip: [] + manual_gc: false + manual_gc_eval: true + manual_gc_interval: 0 + micro_batch_size: 1 + rampup_batch_size: null + skip_train: false + train_iters: 5972 + train_samples: null + train_sync_interval: null diff --git a/step_5600/policy/weights/iter_0000000/train_state.pt b/step_5600/policy/weights/iter_0000000/train_state.pt new file mode 100644 index 0000000000000000000000000000000000000000..7362367436b8d879b575604f086662820bb1ab6b --- /dev/null +++ b/step_5600/policy/weights/iter_0000000/train_state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9077193280ecd59384d5ae94d97a7943638810502f2ce08a7617aeb20b3586d2 +size 3461 diff --git a/step_5600/policy/weights/latest_checkpointed_iteration.txt b/step_5600/policy/weights/latest_checkpointed_iteration.txt new file mode 100644 index 0000000000000000000000000000000000000000..c227083464fb9af8955c90d2924774ee50abb547 --- /dev/null +++ b/step_5600/policy/weights/latest_checkpointed_iteration.txt @@ -0,0 +1 @@ +0 \ No newline at end of file diff --git a/step_5600/policy/weights/latest_train_state.pt b/step_5600/policy/weights/latest_train_state.pt new file mode 100644 index 0000000000000000000000000000000000000000..7362367436b8d879b575604f086662820bb1ab6b --- /dev/null +++ b/step_5600/policy/weights/latest_train_state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9077193280ecd59384d5ae94d97a7943638810502f2ce08a7617aeb20b3586d2 +size 3461 diff --git a/step_5600/train_dataloader.pt b/step_5600/train_dataloader.pt new file mode 100644 index 0000000000000000000000000000000000000000..8f019131e11ae758d1da660a332238c760d6218d --- /dev/null +++ b/step_5600/train_dataloader.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abe0ee8c91d5ba1b614239817486eae38492eb4f5f311f8b71c6b33bc2151b2b +size 7336 diff --git a/step_5600/training_info.json b/step_5600/training_info.json new file mode 100644 index 0000000000000000000000000000000000000000..0888b9b8fc333faaf0a6fb86dc4b0b07a9c8fc60 --- /dev/null +++ b/step_5600/training_info.json @@ -0,0 +1 @@ +{"epoch": 0, "step": 5600, "total_steps": 5600, "consumed_samples": 1433600, "total_valid_tokens": 1626494740.0, "val:val_loss": 0.14774028956890106} \ No newline at end of file diff --git a/step_5800/config.yaml b/step_5800/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e935c0bf4a22e0bf18c2273893d86b8c3c2798df --- /dev/null +++ b/step_5800/config.yaml @@ -0,0 +1,207 @@ +checkpointing: + checkpoint_dir: results/qwen3_4b_sft + checkpoint_must_save_by: null + enabled: true + higher_is_better: false + keep_top_k: 3 + metric_name: val:val_loss + save_period: 200 +cluster: + gpus_per_node: 2 + num_nodes: 1 +data: + num_workers: 4 + shuffle: true + train_dataset_path: + - ./data/hones + val_dataset_path: ./data/arc2_evaluation6 +logger: + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 + log_dir: logs/exp_019 + mlflow_enabled: false + monitor_gpus: false + swanlab_enabled: false + tensorboard_enabled: false + wandb: + name: qwen3_4b_sft + project: arc2 + wandb_enabled: true +policy: + activation_checkpointing_enabled: false + attn_implementation: flash_attention_2 + dtensor_cfg: + enabled: false + dynamic_batching: + enabled: false + fsdp_offload_enabled: false + make_sequence_length_divisible_by: 64 + max_grad_norm: null + megatron_cfg: + activation_checkpointing: true + apply_rope_fusion: true + bias_activation_fusion: false + context_parallel_size: 2 + distributed_data_parallel_config: + average_in_collective: true + data_parallel_sharding_strategy: optim_grads_params + grad_reduce_in_fp32: true + overlap_grad_reduce: true + overlap_param_gather: true + empty_unused_memory_level: 1 + enabled: true + env_vars: + AWS_OFI_NCCL_VERSION: 1.14.0 + BASH_ENV: /etc/bash.bashrc + CAL_VERSION: 0.4.4.50 + CUBLASMP_VERSION: 0.4.0.789 + CUBLAS_VERSION: 12.9.0.13 + CUDA_ARCH_LIST: 7.5 8.0 8.6 9.0 10.0 12.0 + CUDA_DRIVER_VERSION: 575.51.03 + CUDA_VERSION: 12.9.0.043 + CUDA_VISIBLE_DEVICES: 6,7 + CUDNN_FRONTEND_VERSION: 1.11.0 + CUDNN_VERSION: 9.10.1.4 + CUFFT_VERSION: 11.4.0.6 + CUFILE_VERSION: 1.14.0.30 + CURAND_VERSION: 10.3.10.19 + CUSOLVER_VERSION: 11.7.4.40 + CUSPARSELT_VERSION: 0.7.1.0 + CUSPARSE_VERSION: 12.5.9.5 + DALI_BUILD: '' + DALI_URL_SUFFIX: '120' + DALI_VERSION: 1.49.0 + EFA_VERSION: 1.38.1 + ENV: /etc/shinit_v2 + GDRCOPY_VERSION: 2.4.4 + HOME: /root + HOSTNAME: e6ad2ac15863 + HPCX_VERSION: '2.23' + KMP_DUPLICATE_LIB_OK: 'True' + KMP_INIT_AT_FORK: 'FALSE' + LC_CTYPE: C.UTF-8 + LD_LIBRARY_PATH: /usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 + LESSCLOSE: /usr/bin/lesspipe %s %s + LESSOPEN: '| /usr/bin/lesspipe %s' + LIBRARY_PATH: '/usr/local/cuda/lib64/stubs:' + LS_COLORS: 'rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=00:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.avif=01;35:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.webp=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:*~=00;90:*#=00;90:*.bak=00;90:*.crdownload=00;90:*.dpkg-dist=00;90:*.dpkg-new=00;90:*.dpkg-old=00;90:*.dpkg-tmp=00;90:*.old=00;90:*.orig=00;90:*.part=00;90:*.rej=00;90:*.rpmnew=00;90:*.rpmorig=00;90:*.rpmsave=00;90:*.swp=00;90:*.tmp=00;90:*.ucf-dist=00;90:*.ucf-new=00;90:*.ucf-old=00;90:' + MODEL_OPT_VERSION: 0.27.1 + MOFED_VERSION: 5.4-rdmacore50.0 + NCCL_NET_PLUGIN: aws-ofi + NCCL_TUNER_PLUGIN: aws-ofi + NCCL_VERSION: 2.26.5 + NEMO_RL_COMMIT: 90ecaaccdf66fb92f814b2325db81a0105ed933a + NEMO_RL_VENV_DIR: /opt/ray_venvs + NPP_VERSION: 12.4.0.27 + NRL_CONTAINER: '1' + NRL_MEGATRON_CHECKPOINT_DIR: ./results/megatron + NSIGHT_COMPUTE_VERSION: 2025.2.0.11 + NSIGHT_SYSTEMS_VERSION: 2025.3.1.90 + NVIDIA_BUILD_ID: '244212578' + NVIDIA_BUILD_REF: 6ba29f15920b12095387b8c82cea05ac3d6d9732 + NVIDIA_DRIVER_CAPABILITIES: compute,utility,video + NVIDIA_PRODUCT_NAME: CUDA + NVIDIA_REQUIRE_CUDA: cuda>=9.0 + NVIDIA_REQUIRE_JETPACK_HOST_MOUNTS: '' + NVIDIA_VISIBLE_DEVICES: all + NVJITLINK_VERSION: 12.9.41 + NVJPEG_VERSION: 12.4.0.16 + NVSHMEM_VERSION: 3.2.5 + OLDPWD: /workspace + OMPI_MCA_coll_hcoll_enable: '0' + OPAL_PREFIX: /opt/hpcx/ompi + OPENMPI_VERSION: 4.1.7 + OPENUCX_VERSION: 1.19.0 + PATH: /opt/nemo_rl_venv/bin:/opt/nemo_rl_venv/bin:/opt/nemo_rl_venv/bin:/root/.local/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/mpi/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/ucx/bin:/opt/amazon/efa/bin + POLYGRAPHY_VERSION: 0.49.20 + PWD: /workspace/ARChitects + PYTHONBREAKPOINT: ray.util.rpdb._driver_set_trace + PYTHONPATH: '/workspace/NVARC/ARChitects/nemo-rl:' + PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True + RAY_CLIENT_MODE: '0' + RAY_ENABLE_UV_RUN_RUNTIME_ENV: '0' + RAY_USAGE_STATS_ENABLED: '0' + RDMACORE_VERSION: '50.0' + SHELL: /bin/bash + SHLVL: '2' + SWANLAB_API_HOST: https://api.swanlab.cn/api + SWANLAB_RUNTIME: user + SWANLAB_WEB_HOST: https://swanlab.cn + TERM: xterm + TORCH_CUDA_ARCH_LIST: '9.0' + TRANSFORMER_ENGINE_VERSION: '2.3' + TRTOSS_VERSION: '' + TRT_VERSION: 10.10.0.31 + UV: /root/.local/bin/uv + UV_LINK_MODE: copy + UV_PROJECT_ENVIRONMENT: /opt/nemo_rl_venv + UV_RUN_RECURSION_DEPTH: '1' + VIRTUAL_ENV: /opt/nemo_rl_venv + VIRTUAL_ENV_PROMPT: nemo-rl + WANDB_SERVICE: 3-239782-unix-/tmp/wandb-239782-251940-2530276073/socket + _: /root/.local/bin/uv + _CUDA_COMPAT_PATH: /usr/local/cuda/compat + _CUDA_COMPAT_STATUS: System has unsupported display driver / cuda driver combination + (CUDA_ERROR_SYSTEM_DRIVER_MISMATCH) cuInit()=803 + _MLFLOW_TELEMETRY_SESSION_ID: 8ac94e5a44a94a62a2cf3302ff4fa0c9 + expert_model_parallel_size: 1 + expert_tensor_parallel_size: 1 + freeze_moe_router: true + moe_permute_fusion: false + moe_router_bias_update_rate: 0.0 + moe_router_dtype: fp64 + moe_router_load_balancing_type: none + num_layers_in_first_pipeline_stage: null + num_layers_in_last_pipeline_stage: null + optimizer: + adam_beta1: 0.9 + adam_beta2: 0.98 + adam_eps: 1.0e-08 + bf16: true + clip_grad: 0.5 + fp16: false + lr: 0.0001 + min_lr: 1.0e-07 + optimizer: adam + optimizer_cpu_offload: false + optimizer_offload_fraction: 0.0 + params_dtype: bfloat16 + sgd_momentum: 0.9 + use_distributed_optimizer: true + use_precision_aware_optimizer: false + weight_decay: 0.1 + pipeline_dtype: bfloat16 + pipeline_model_parallel_size: 1 + scheduler: + end_weight_decay: 0.1 + lr_decay_iters: 12716 + lr_decay_style: linear + lr_warmup_init: 1.0e-06 + lr_warmup_iters: 200 + start_weight_decay: 0.1 + weight_decay_incr_style: constant + sequence_parallel: false + tensor_model_parallel_size: 1 + train_iters: 5972 + model_name: ./models/Qwen-NVARC + offload_optimizer_for_logprob: false + precision: bfloat16 + sequence_packing: + algorithm: modified_first_fit_decreasing + enabled: true + sequence_length_round: 64 + train_mb_tokens: 128000 + tokenizer: + name: ./models/Qwen-NVARC + train_global_batch_size: 256 + train_micro_batch_size: 1 +sft: + max_num_epochs: 1 + max_num_steps: 6400 + seed: 24 + val_at_start: true + val_batches: 200 + val_global_batch_size: 256 + val_micro_batch_size: 1 + val_period: 200 diff --git a/step_5800/policy/weights/iter_0000000/.metadata b/step_5800/policy/weights/iter_0000000/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..1092f2cc20961e574c73f21f9dfde490241969ef --- /dev/null +++ b/step_5800/policy/weights/iter_0000000/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2dd177ae05a23762b1acc7a8eff274e5a9104b258ba48b225e821312fb6de12f +size 329201 diff --git a/step_5800/policy/weights/iter_0000000/__0_0.distcp b/step_5800/policy/weights/iter_0000000/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..fa3bdd91a6f1c20c9255470a217b42b93b78b078 --- /dev/null +++ b/step_5800/policy/weights/iter_0000000/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2884ab50f51fa561ef6e4a6a4f422b146a712c0b47eb1ff41494ace545036d06 +size 12718332319 diff --git a/step_5800/policy/weights/iter_0000000/__1_0.distcp b/step_5800/policy/weights/iter_0000000/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..cedeb12ab99e7c8649bbf786094198b503eb195c --- /dev/null +++ b/step_5800/policy/weights/iter_0000000/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:619738ba8dfec45074012486c339e30475eb90b1f9ec0d57c6eb9ae4cbb4af39 +size 12717813616 diff --git a/step_5800/policy/weights/iter_0000000/common.pt b/step_5800/policy/weights/iter_0000000/common.pt new file mode 100644 index 0000000000000000000000000000000000000000..6a47ecdc3b6e3df56be6064d550b782a21a812c3 --- /dev/null +++ b/step_5800/policy/weights/iter_0000000/common.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fcd736818bbf683f63191cf9ab55ee9ec1d1ba58597572923af7a35da3c7f532 +size 1767 diff --git a/step_5800/policy/weights/iter_0000000/metadata.json b/step_5800/policy/weights/iter_0000000/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..c9dbde0bcf4d2d993122dcc7d6bcb1eef8b6fb77 --- /dev/null +++ b/step_5800/policy/weights/iter_0000000/metadata.json @@ -0,0 +1 @@ +{"sharded_backend": "torch_dist", "sharded_backend_version": 1, "common_backend": "torch", "common_backend_version": 1} \ No newline at end of file diff --git a/step_5800/policy/weights/iter_0000000/modelopt_run_config.yaml b/step_5800/policy/weights/iter_0000000/modelopt_run_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..44ee391cb75b24b9ae1693ad28ca2c1c6a0b1f25 --- /dev/null +++ b/step_5800/policy/weights/iter_0000000/modelopt_run_config.yaml @@ -0,0 +1,203 @@ +activation_func: +activation_func_clamp_value: None +add_bias_linear: false +add_qkv_bias: false +apply_query_key_layer_scaling: false +apply_residual_connection_post_layernorm: false +apply_rope_fusion: true +attention_backend: AttnBackend.auto +attention_dropout: '0.0' +attention_output_gate: false +attention_softmax_in_fp32: false +autocast_dtype: torch.bfloat16 +barrier_with_L1_time: true +bf16: true +bias_activation_fusion: false +bias_dropout_fusion: false +calculate_per_token_loss: true +clone_scatter_output_in_embedding: true +config_logger_dir: '' +cross_entropy_fusion_impl: native +cross_entropy_loss_fusion: true +defer_embedding_wgrad_compute: false +delay_wgrad_compute: false +deterministic_mode: false +disable_bf16_reduced_precision_matmul: false +disable_parameter_transpose_cache: false +distribute_saved_activations: None +enable_autocast: false +fallback_to_eager_attn: false +ffn_hidden_size: 9728 +finalize_model_grads_func: functools.partial(, + pg_collection=None) +fine_grained_activation_offloading: false +first_last_layers_bf16: false +flash_decode: false +fp16: false +fp16_lm_cross_entropy: false +fp32_residual_connection: false +fused_single_qkv_rope: false +gated_linear_unit: true +generation_config: None +glu_linear_offset: '0.0' +grad_scale_func: > +grad_sync_func: "" +gradient_accumulation_fusion: false +hetereogenous_dist_checkpoint: false +heterogeneous_block_specs: false +hf_model_id: ./models/Qwen-NVARC +hidden_dropout: '0.0' +hidden_size: 2560 +is_hybrid_model: false +kv_channels: 128 +layernorm_epsilon: 1e-06 +layernorm_zero_centered_gamma: false +linear_attention_freq: None +linear_attention_type: None +linear_conv_kernel_dim: None +linear_key_head_dim: None +linear_num_key_heads: None +linear_num_value_heads: None +linear_value_head_dim: None +log_max_attention_logit: false +make_vocab_size_divisible_by: 16 +mamba_head_dim: 64 +mamba_num_groups: 8 +mamba_num_heads: None +mamba_state_dim: 128 +masked_softmax_fusion: true +max_position_embeddings: 40960 +memory_efficient_layer_norm: false +min_offloaded_tensor_size: 1048576 +mlp_chunks_for_prefill: 1 +moe_apply_probs_on_input: false +moe_aux_loss_coeff: '0.0' +moe_deepep_num_sms: 20 +moe_enable_deepep: false +moe_expert_capacity_factor: None +moe_extended_tp: false +moe_ffn_hidden_size: None +moe_flex_dispatcher_backend: deepep +moe_grouped_gemm: false +moe_hybridep_num_sms: 16 +moe_input_jitter_eps: None +moe_layer_freq: 1 +moe_pad_expert_input_to_capacity: false +moe_per_layer_logging: false +moe_permute_fusion: false +moe_router_bias_update_rate: '0.0' +moe_router_dtype: fp64 +moe_router_enable_expert_bias: false +moe_router_force_load_balancing: false +moe_router_fusion: false +moe_router_group_topk: None +moe_router_load_balancing_type: none +moe_router_num_groups: None +moe_router_padding_for_quantization: false +moe_router_pre_softmax: false +moe_router_score_function: softmax +moe_router_topk: 2 +moe_router_topk_limited_devices: None +moe_router_topk_scaling_factor: None +moe_shared_expert_gate: false +moe_shared_expert_intermediate_size: None +moe_shared_expert_overlap: false +moe_token_dispatcher_type: allgather +moe_token_drop_policy: probs +moe_token_dropping: false +moe_use_legacy_grouped_gemm: false +moe_z_loss_coeff: None +mrope_section: None +multi_latent_attention: false +no_rope_freq: None +no_sync_func: "" +normalization: RMSNorm +num_attention_heads: 32 +num_layers: 36 +num_layers_at_end_in_bf16: 1 +num_layers_at_start_in_bf16: 1 +num_moe_experts: None +num_query_groups: 8 +nvidia_modelopt_version: 0.39.0 +offload_modules: None +param_sync_func: None +params_dtype: torch.bfloat16 +perform_initialization: true +persist_layer_norm: false +position_embedding_type: rope +qk_clip: false +qk_clip_alpha: '0.5' +qk_clip_threshold: 100 +qk_layernorm: true +quant_recipe: None +restore_modelopt_state: false +rotary_base: 5000000 +rotary_interleaved: false +rotary_percent: '1.0' +seq_len_interpolation_factor: None +seq_length: 262144 +share_embeddings_and_output_weights: true +should_pad_vocab: false +softmax_scale: None +softmax_type: vanilla +symmetric_ar_type: None +test_mode: false +timers: None +transformer_impl: transformer_engine +transformer_layer_spec: +use_fused_weighted_squared_relu: false +use_kitchen: false +use_mamba_mem_eff_path: true +use_ring_exchange_p2p: false +use_te_activation_func: false +use_te_rng_tracker: false +use_transformer_engine_full_layer_spec: false +use_transformer_engine_op_fuser: false +variable_seq_lengths: false +vocab_size: 16 +wgrad_deferral_limit: 0 +window_attn_skip_freq: None +window_size: None diff --git a/step_5800/policy/weights/iter_0000000/run_config.yaml b/step_5800/policy/weights/iter_0000000/run_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..270671c4ffea685a5ad2aed40b44f9c4df61f3ed --- /dev/null +++ b/step_5800/policy/weights/iter_0000000/run_config.yaml @@ -0,0 +1,564 @@ +_target_: megatron.bridge.training.config.ConfigContainer +checkpoint: + _target_: megatron.bridge.training.config.CheckpointConfig + async_save: false + ckpt_assume_constant_structure: false + ckpt_convert_format: null + ckpt_convert_save: null + ckpt_format: torch_dist + ckpt_step: null + dist_ckpt_optim_fully_reshardable: false + dist_ckpt_save_pre_mcore_014: false + dist_ckpt_strictness: assume_ok_unexpected + distrib_optim_fully_reshardable_mem_efficient: false + exit_on_missing_checkpoint: false + finetune: true + fully_parallel_load: true + fully_parallel_save: true + load: null + load_main_params_from_ckpt: false + load_optim: true + load_rng: false + most_recent_k: -1 + non_persistent_ckpt_type: null + non_persistent_global_ckpt_dir: null + non_persistent_local_ckpt_algo: fully_parallel + non_persistent_local_ckpt_dir: null + non_persistent_save_interval: null + pretrained_checkpoint: ./results/megatron/model_._models_Qwen-NVARC + replication: false + replication_factor: 2 + replication_jump: null + save: /workspace/ARChitects/results/qwen3_4b_sft/tmp_step_5800/policy/weights + save_interval: 100 + save_optim: true + save_rng: true + save_tokenizer_assets: true + strict_fsdp_dtensor_load: false + use_checkpoint_args: false + use_persistent_ckpt_worker: true +comm_overlap: null +dataset: null +ddp: + _target_: megatron.bridge.training.config.DistributedDataParallelConfig + align_param_gather: false + average_in_collective: false + bucket_size: 40000000 + check_for_large_grads: false + check_for_nan_in_grad: true + data_parallel_sharding_strategy: optim_grads_params + delay_wgrad_compute: false + disable_symmetric_registration: false + fp8_param_gather: false + fsdp_double_buffer: false + grad_reduce_in_fp32: true + gradient_reduce_div_fusion: true + keep_fp8_transpose_cache: false + nccl_ub: false + num_distributed_optimizer_instances: 1 + outer_dp_sharding_strategy: no_shard + overlap_grad_reduce: true + overlap_param_gather: true + pad_buckets_for_high_nccl_busbw: false + preserve_fp32_weights: true + reduce_scatter_with_fp32_accumulation: false + reuse_grad_buf_for_mxfp8_param_ag: false + suggested_communication_unit_size: null + use_custom_fsdp: false + use_distributed_optimizer: true + use_megatron_fsdp: false +dist: + _target_: megatron.bridge.training.config.DistributedInitConfig + align_grad_reduce: true + disable_jit_fuser: false + distributed_backend: nccl + distributed_timeout_minutes: 10 + distributed_timeout_seconds_after_init: null + enable_megatron_core_experimental: false + external_gpu_device_mapping: true + high_priority_stream_groups: null + lazy_init: false + local_rank: 0 + nccl_communicator_config_path: null + sharp_enabled_group: null + use_gloo_process_groups: true + use_megatron_fsdp: false + use_sharp: false + use_torch_fsdp2: false + use_tp_pp_dp_mapping: false +ft: null +inprocess_restart: null +logger: + _target_: megatron.bridge.training.config.LoggerConfig + filter_warnings: true + log_energy: false + log_interval: 100 + log_l2_norm_grad_to_tensorboard: false + log_loss_scale_to_tensorboard: true + log_memory_to_tensorboard: false + log_params_norm: false + log_progress: false + log_runtime_to_tensorboard: false + log_throughput: false + log_throughput_to_tensorboard: false + log_timers_to_tensorboard: false + log_validation_ppl_to_tensorboard: false + log_world_size_to_tensorboard: false + logging_level: 0 + memory_keys: null + modules_to_filter: null + runtime_time_unit: hours + save_config_filepath: null + set_level_for_all_loggers: false + tensorboard_dir: null + tensorboard_log_interval: 1 + tensorboard_queue_size: 1000 + throughput_window_size: 100 + timing_log_level: 0 + timing_log_option: minmax + wandb_entity: null + wandb_exp_name: null + wandb_project: null + wandb_save_dir: null +mixed_precision: null +model: + _target_: megatron.bridge.models.qwen.qwen_provider.Qwen3ModelProvider + account_for_embedding_in_pipeline_split: false + account_for_loss_in_pipeline_split: false + activation_func: + _call_: false + _target_: torch.nn.functional.silu + activation_func_clamp_value: null + activation_func_fp8_input_store: false + add_bias_linear: false + add_qkv_bias: false + apply_query_key_layer_scaling: false + apply_residual_connection_post_layernorm: false + apply_rope_fusion: true + async_tensor_model_parallel_allreduce: false + attention_backend: + _args_: + - 5 + _call_: true + _target_: megatron.core.transformer.enums.AttnBackend + attention_dropout: 0.0 + attention_output_gate: false + attention_softmax_in_fp32: false + autocast_dtype: + _call_: false + _target_: torch.bfloat16 + barrier_with_L1_time: true + batch_p2p_comm: true + batch_p2p_sync: true + bf16: true + bias_activation_fusion: false + bias_dropout_fusion: false + calculate_per_token_loss: true + clone_scatter_output_in_embedding: true + config_logger_dir: '' + context_parallel_size: 2 + cp_comm_type: null + cpu_offloading: false + cpu_offloading_activations: true + cpu_offloading_double_buffering: false + cpu_offloading_num_layers: 0 + cpu_offloading_weights: false + cross_entropy_fusion_impl: native + cross_entropy_loss_fusion: true + cuda_graph_impl: none + cuda_graph_retain_backward_graph: false + cuda_graph_scope: [] + cuda_graph_use_single_mempool: false + cuda_graph_warmup_steps: 3 + deallocate_pipeline_outputs: true + defer_embedding_wgrad_compute: false + delay_wgrad_compute: false + deterministic_mode: false + disable_bf16_reduced_precision_matmul: false + disable_parameter_transpose_cache: false + distribute_saved_activations: null + embedding_init_method: + _args_: [] + _partial_: true + _target_: torch.nn.init.normal_ + mean: 0.0 + std: 0.02 + embedding_init_method_std: 0.02 + enable_autocast: false + enable_cuda_graph: false + expert_model_parallel_size: 1 + expert_tensor_parallel_size: 1 + external_cuda_graph: false + fallback_to_eager_attn: false + ffn_hidden_size: 9728 + finalize_model_grads_func: + _args_: [] + _partial_: true + _target_: megatron.core.distributed.finalize_model_grads.finalize_model_grads + pg_collection: null + fine_grained_activation_offloading: false + first_last_layers_bf16: false + flash_decode: false + fp16: false + fp16_lm_cross_entropy: false + fp32_residual_connection: false + fp4: null + fp4_param: false + fp4_quantizer_factory: null + fp4_recipe: nvfp4 + fp8: null + fp8_amax_compute_algo: most_recent + fp8_amax_history_len: 1 + fp8_dot_product_attention: false + fp8_interval: 1 + fp8_margin: 0 + fp8_multi_head_attention: false + fp8_param: false + fp8_quantizer_factory: null + fp8_recipe: delayed + fp8_wgrad: true + fused_single_qkv_rope: false + gated_linear_unit: true + generation_config: null + glu_linear_offset: 0.0 + grad_scale_func: + _call_: false + _target_: megatron.core.optimizer.optimizer.MegatronOptimizer.scale_loss + grad_sync_func: + _call_: false + _target_: megatron.core.distributed.distributed_data_parallel.DistributedDataParallel.start_grad_sync + gradient_accumulation_fusion: false + hetereogenous_dist_checkpoint: false + heterogeneous_block_specs: false + hf_model_id: ./models/Qwen-NVARC + hidden_dropout: 0.0 + hidden_size: 2560 + hierarchical_context_parallel_sizes: null + inference_rng_tracker: false + inference_sampling_seed: 42 + init_method: + _args_: [] + _partial_: true + _target_: torch.nn.init.normal_ + mean: 0.0 + std: 0.02 + init_method_std: 0.02 + init_model_with_meta_device: false + is_hybrid_model: false + kv_channels: 128 + layernorm_epsilon: 1.0e-06 + layernorm_zero_centered_gamma: false + linear_attention_freq: null + linear_attention_type: null + linear_conv_kernel_dim: null + linear_key_head_dim: null + linear_num_key_heads: null + linear_num_value_heads: null + linear_value_head_dim: null + log_max_attention_logit: false + make_vocab_size_divisible_by: 16 + mamba_head_dim: 64 + mamba_num_groups: 8 + mamba_num_heads: null + mamba_state_dim: 128 + masked_softmax_fusion: true + max_position_embeddings: 40960 + memory_efficient_layer_norm: false + microbatch_group_size_per_vp_stage: 1 + min_offloaded_tensor_size: 1048576 + mlp_chunks_for_prefill: 1 + moe_apply_probs_on_input: false + moe_aux_loss_coeff: 0.0 + moe_deepep_num_sms: 20 + moe_enable_deepep: false + moe_expert_capacity_factor: null + moe_extended_tp: false + moe_ffn_hidden_size: null + moe_flex_dispatcher_backend: deepep + moe_grouped_gemm: false + moe_hybridep_num_sms: 16 + moe_input_jitter_eps: null + moe_layer_freq: 1 + moe_layer_recompute: false + moe_pad_expert_input_to_capacity: false + moe_per_layer_logging: false + moe_permute_fusion: false + moe_router_bias_update_rate: 0.0 + moe_router_dtype: fp64 + moe_router_enable_expert_bias: false + moe_router_force_load_balancing: false + moe_router_fusion: false + moe_router_group_topk: null + moe_router_load_balancing_type: none + moe_router_num_groups: null + moe_router_padding_for_fp8: false + moe_router_padding_for_quantization: false + moe_router_pre_softmax: false + moe_router_score_function: softmax + moe_router_topk: 2 + moe_router_topk_limited_devices: null + moe_router_topk_scaling_factor: null + moe_shared_expert_gate: false + moe_shared_expert_intermediate_size: null + moe_shared_expert_overlap: false + moe_token_dispatcher_type: allgather + moe_token_drop_policy: probs + moe_token_dropping: false + moe_use_legacy_grouped_gemm: false + moe_z_loss_coeff: null + mrope_section: null + mtp_enabled: false + mtp_loss_scaling_factor: null + mtp_num_layers: null + mtp_standalone: false + multi_latent_attention: false + no_rope_freq: null + no_sync_func: + _call_: false + _target_: megatron.core.distributed.distributed_data_parallel.DistributedDataParallel.no_sync + normalization: RMSNorm + num_attention_heads: 32 + num_layers: 36 + num_layers_at_end_in_bf16: 1 + num_layers_at_start_in_bf16: 1 + num_layers_in_first_pipeline_stage: null + num_layers_in_last_pipeline_stage: null + num_microbatches_with_partial_activation_checkpoints: null + num_moe_experts: null + num_query_groups: 8 + offload_modules: null + output_layer_init_method: + _args_: [] + _partial_: true + _target_: torch.nn.init.normal_ + mean: 0.0 + std: 0.0023570226039551587 + overlap_moe_expert_parallel_comm: false + overlap_p2p_comm: false + overlap_p2p_comm_warmup_flush: false + parallel_output: true + param_sync_func: null + params_dtype: + _call_: false + _target_: torch.bfloat16 + perform_initialization: true + persist_layer_norm: false + pipeline_dtype: + _call_: false + _target_: torch.bfloat16 + pipeline_model_parallel_comm_backend: null + pipeline_model_parallel_layout: null + pipeline_model_parallel_size: 1 + position_embedding_type: rope + qk_clip: false + qk_clip_alpha: 0.5 + qk_clip_threshold: 100 + qk_layernorm: true + quant_recipe: null + recompute_granularity: full + recompute_method: uniform + recompute_modules: + - core_attn + recompute_num_layers: 1 + restore_modelopt_state: false + rotary_base: 5000000 + rotary_interleaved: false + rotary_percent: 1.0 + scatter_embedding_sequence_parallel: true + seq_len_interpolation_factor: null + seq_length: 262144 + sequence_parallel: false + share_embeddings_and_output_weights: true + should_pad_vocab: false + softmax_scale: null + softmax_type: vanilla + symmetric_ar_type: null + tensor_model_parallel_size: 1 + test_mode: false + timers: null + tp_comm_atomic_ag: false + tp_comm_atomic_rs: false + tp_comm_bootstrap_backend: nccl + tp_comm_bulk_dgrad: true + tp_comm_bulk_wgrad: true + tp_comm_overlap: false + tp_comm_overlap_ag: true + tp_comm_overlap_cfg: null + tp_comm_overlap_disable_fc1: false + tp_comm_overlap_disable_qkv: false + tp_comm_overlap_rs: true + tp_comm_overlap_rs_dgrad: false + tp_comm_split_ag: true + tp_comm_split_rs: true + tp_only_amax_red: false + transformer_impl: transformer_engine + transformer_layer_spec: + _call_: false + _target_: megatron.bridge.models.gpt_provider.default_layer_spec + use_cpu_initialization: false + use_fused_weighted_squared_relu: false + use_kitchen: false + use_mamba_mem_eff_path: true + use_ring_exchange_p2p: false + use_te_activation_func: false + use_te_rng_tracker: false + use_transformer_engine_full_layer_spec: false + use_transformer_engine_op_fuser: false + variable_seq_lengths: false + virtual_pipeline_model_parallel_size: null + vocab_size: 16 + wgrad_deferral_limit: 0 + window_attn_skip_freq: null + window_size: null +nvrx_straggler: null +optimizer: + _target_: megatron.bridge.training.config.OptimizerConfig + adam_beta1: 0.9 + adam_beta2: 0.98 + adam_eps: 1.0e-08 + barrier_with_L1_time: false + bf16: true + clip_grad: 0.5 + config_logger_dir: '' + decoupled_lr: null + decoupled_min_lr: null + decoupled_weight_decay: true + exp_avg_dtype: + _call_: false + _target_: torch.float32 + exp_avg_sq_dtype: + _call_: false + _target_: torch.float32 + fp16: false + fp8_recipe: null + hysteresis: 2 + initial_loss_scale: 4294967296 + log_num_zeros_in_grad: false + loss_scale: null + loss_scale_window: 1000 + lr: 0.0001 + main_grads_dtype: + _call_: false + _target_: torch.float32 + main_params_dtype: + _call_: false + _target_: torch.float32 + min_loss_scale: 1.0 + min_lr: 1.0e-07 + muon_extra_scale_factor: 1.0 + muon_fp32_matmul_prec: medium + muon_momentum: 0.95 + muon_num_ns_steps: 5 + muon_scale_mode: spectral + muon_split_qkv: true + muon_tp_mode: blockwise + muon_use_nesterov: false + optimizer: adam + optimizer_cpu_offload: false + optimizer_offload_fraction: 0.0 + overlap_cpu_optimizer_d2h_h2d: false + overlap_param_gather: false + overlap_param_gather_with_optimizer_step: false + params_dtype: bfloat16 + pin_cpu_grads: true + pin_cpu_params: true + reuse_grad_buf_for_mxfp8_param_ag: false + sgd_momentum: 0.9 + store_param_remainders: true + timers: null + use_distributed_optimizer: true + use_precision_aware_optimizer: false + use_torch_optimizer_for_cpu_offload: false + weight_decay: 0.1 +peft: null +profiling: + _target_: megatron.bridge.training.config.ProfilingConfig + memory_snapshot_path: snapshot.pickle + nvtx_ranges: false + profile_ranks: + - 0 + profile_step_end: 12 + profile_step_start: 10 + record_memory_history: false + record_shapes: false + use_nsys_profiler: false + use_pytorch_profiler: false +rerun_state_machine: + _target_: megatron.bridge.training.config.RerunStateMachineConfig + check_for_nan_in_loss: true + check_for_spiky_loss: false + error_injection_rate: 0 + error_injection_type: transient_error + rerun_mode: disabled +rng: + _target_: megatron.bridge.training.config.RNGConfig + data_parallel_random_init: false + inference_rng_tracker: false + seed: 1234 + te_rng_tracker: false +scheduler: + _target_: megatron.bridge.training.config.SchedulerConfig + end_weight_decay: 0.1 + lr_decay_iters: 12716 + lr_decay_samples: null + lr_decay_steps: 3255296 + lr_decay_style: linear + lr_warmup_fraction: null + lr_warmup_init: 1.0e-06 + lr_warmup_iters: 200 + lr_warmup_samples: 0 + lr_warmup_steps: 51200 + lr_wsd_decay_iters: null + lr_wsd_decay_samples: null + lr_wsd_decay_style: exponential + no_weight_decay_cond_type: null + override_opt_param_scheduler: false + start_weight_decay: 0.1 + use_checkpoint_opt_param_scheduler: false + wd_incr_steps: 1528832 + weight_decay_incr_style: constant + wsd_decay_steps: null +straggler: null +tensor_inspect: null +tokenizer: + _target_: megatron.bridge.training.tokenizers.config.TokenizerConfig + hf_tokenizer_kwargs: {} + image_tag_type: null + merge_file: null + special_tokens: null + tiktoken_num_special_tokens: 1000 + tiktoken_pattern: null + tiktoken_special_tokens: null + tokenizer_model: ./models/Qwen-NVARC + tokenizer_prompt_format: null + tokenizer_type: HuggingFaceTokenizer + vocab_extra_ids: 0 + vocab_file: null + vocab_size: null +train: + _target_: megatron.bridge.training.config.TrainingConfig + check_weight_hash_across_dp_replicas_interval: null + decrease_batch_size_if_needed: false + empty_unused_memory_level: 0 + eval_interval: 1000 + eval_iters: 100 + exit_duration_in_mins: null + exit_interval: null + exit_signal: + _args_: + - 15 + _call_: true + _target_: signal.Signals + exit_signal_handler: false + exit_signal_handler_for_dataloader: false + global_batch_size: 256 + iterations_to_skip: [] + manual_gc: false + manual_gc_eval: true + manual_gc_interval: 0 + micro_batch_size: 1 + rampup_batch_size: null + skip_train: false + train_iters: 5972 + train_samples: null + train_sync_interval: null diff --git a/step_5800/policy/weights/iter_0000000/train_state.pt b/step_5800/policy/weights/iter_0000000/train_state.pt new file mode 100644 index 0000000000000000000000000000000000000000..7362367436b8d879b575604f086662820bb1ab6b --- /dev/null +++ b/step_5800/policy/weights/iter_0000000/train_state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9077193280ecd59384d5ae94d97a7943638810502f2ce08a7617aeb20b3586d2 +size 3461 diff --git a/step_5800/policy/weights/latest_checkpointed_iteration.txt b/step_5800/policy/weights/latest_checkpointed_iteration.txt new file mode 100644 index 0000000000000000000000000000000000000000..c227083464fb9af8955c90d2924774ee50abb547 --- /dev/null +++ b/step_5800/policy/weights/latest_checkpointed_iteration.txt @@ -0,0 +1 @@ +0 \ No newline at end of file diff --git a/step_5800/policy/weights/latest_train_state.pt b/step_5800/policy/weights/latest_train_state.pt new file mode 100644 index 0000000000000000000000000000000000000000..7362367436b8d879b575604f086662820bb1ab6b --- /dev/null +++ b/step_5800/policy/weights/latest_train_state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9077193280ecd59384d5ae94d97a7943638810502f2ce08a7617aeb20b3586d2 +size 3461 diff --git a/step_5800/train_dataloader.pt b/step_5800/train_dataloader.pt new file mode 100644 index 0000000000000000000000000000000000000000..2c847f299bdcc96cea92c874fcd1c531e18a7979 --- /dev/null +++ b/step_5800/train_dataloader.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:458e115a25d1c97a8415a462c7ac872cd8c36b2dd1561561119e578a52acef61 +size 7336 diff --git a/step_5800/training_info.json b/step_5800/training_info.json new file mode 100644 index 0000000000000000000000000000000000000000..9be495484b52e5a585339ddad7cd9926e9c083bb --- /dev/null +++ b/step_5800/training_info.json @@ -0,0 +1 @@ +{"epoch": 0, "step": 5800, "total_steps": 5800, "consumed_samples": 1484800, "total_valid_tokens": 1684485110.0, "val:val_loss": 0.14940811693668365} \ No newline at end of file diff --git a/step_5972/config.yaml b/step_5972/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e935c0bf4a22e0bf18c2273893d86b8c3c2798df --- /dev/null +++ b/step_5972/config.yaml @@ -0,0 +1,207 @@ +checkpointing: + checkpoint_dir: results/qwen3_4b_sft + checkpoint_must_save_by: null + enabled: true + higher_is_better: false + keep_top_k: 3 + metric_name: val:val_loss + save_period: 200 +cluster: + gpus_per_node: 2 + num_nodes: 1 +data: + num_workers: 4 + shuffle: true + train_dataset_path: + - ./data/hones + val_dataset_path: ./data/arc2_evaluation6 +logger: + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 + log_dir: logs/exp_019 + mlflow_enabled: false + monitor_gpus: false + swanlab_enabled: false + tensorboard_enabled: false + wandb: + name: qwen3_4b_sft + project: arc2 + wandb_enabled: true +policy: + activation_checkpointing_enabled: false + attn_implementation: flash_attention_2 + dtensor_cfg: + enabled: false + dynamic_batching: + enabled: false + fsdp_offload_enabled: false + make_sequence_length_divisible_by: 64 + max_grad_norm: null + megatron_cfg: + activation_checkpointing: true + apply_rope_fusion: true + bias_activation_fusion: false + context_parallel_size: 2 + distributed_data_parallel_config: + average_in_collective: true + data_parallel_sharding_strategy: optim_grads_params + grad_reduce_in_fp32: true + overlap_grad_reduce: true + overlap_param_gather: true + empty_unused_memory_level: 1 + enabled: true + env_vars: + AWS_OFI_NCCL_VERSION: 1.14.0 + BASH_ENV: /etc/bash.bashrc + CAL_VERSION: 0.4.4.50 + CUBLASMP_VERSION: 0.4.0.789 + CUBLAS_VERSION: 12.9.0.13 + CUDA_ARCH_LIST: 7.5 8.0 8.6 9.0 10.0 12.0 + CUDA_DRIVER_VERSION: 575.51.03 + CUDA_VERSION: 12.9.0.043 + CUDA_VISIBLE_DEVICES: 6,7 + CUDNN_FRONTEND_VERSION: 1.11.0 + CUDNN_VERSION: 9.10.1.4 + CUFFT_VERSION: 11.4.0.6 + CUFILE_VERSION: 1.14.0.30 + CURAND_VERSION: 10.3.10.19 + CUSOLVER_VERSION: 11.7.4.40 + CUSPARSELT_VERSION: 0.7.1.0 + CUSPARSE_VERSION: 12.5.9.5 + DALI_BUILD: '' + DALI_URL_SUFFIX: '120' + DALI_VERSION: 1.49.0 + EFA_VERSION: 1.38.1 + ENV: /etc/shinit_v2 + GDRCOPY_VERSION: 2.4.4 + HOME: /root + HOSTNAME: e6ad2ac15863 + HPCX_VERSION: '2.23' + KMP_DUPLICATE_LIB_OK: 'True' + KMP_INIT_AT_FORK: 'FALSE' + LC_CTYPE: C.UTF-8 + LD_LIBRARY_PATH: /usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 + LESSCLOSE: /usr/bin/lesspipe %s %s + LESSOPEN: '| /usr/bin/lesspipe %s' + LIBRARY_PATH: '/usr/local/cuda/lib64/stubs:' + LS_COLORS: 'rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=00:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.avif=01;35:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.webp=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:*~=00;90:*#=00;90:*.bak=00;90:*.crdownload=00;90:*.dpkg-dist=00;90:*.dpkg-new=00;90:*.dpkg-old=00;90:*.dpkg-tmp=00;90:*.old=00;90:*.orig=00;90:*.part=00;90:*.rej=00;90:*.rpmnew=00;90:*.rpmorig=00;90:*.rpmsave=00;90:*.swp=00;90:*.tmp=00;90:*.ucf-dist=00;90:*.ucf-new=00;90:*.ucf-old=00;90:' + MODEL_OPT_VERSION: 0.27.1 + MOFED_VERSION: 5.4-rdmacore50.0 + NCCL_NET_PLUGIN: aws-ofi + NCCL_TUNER_PLUGIN: aws-ofi + NCCL_VERSION: 2.26.5 + NEMO_RL_COMMIT: 90ecaaccdf66fb92f814b2325db81a0105ed933a + NEMO_RL_VENV_DIR: /opt/ray_venvs + NPP_VERSION: 12.4.0.27 + NRL_CONTAINER: '1' + NRL_MEGATRON_CHECKPOINT_DIR: ./results/megatron + NSIGHT_COMPUTE_VERSION: 2025.2.0.11 + NSIGHT_SYSTEMS_VERSION: 2025.3.1.90 + NVIDIA_BUILD_ID: '244212578' + NVIDIA_BUILD_REF: 6ba29f15920b12095387b8c82cea05ac3d6d9732 + NVIDIA_DRIVER_CAPABILITIES: compute,utility,video + NVIDIA_PRODUCT_NAME: CUDA + NVIDIA_REQUIRE_CUDA: cuda>=9.0 + NVIDIA_REQUIRE_JETPACK_HOST_MOUNTS: '' + NVIDIA_VISIBLE_DEVICES: all + NVJITLINK_VERSION: 12.9.41 + NVJPEG_VERSION: 12.4.0.16 + NVSHMEM_VERSION: 3.2.5 + OLDPWD: /workspace + OMPI_MCA_coll_hcoll_enable: '0' + OPAL_PREFIX: /opt/hpcx/ompi + OPENMPI_VERSION: 4.1.7 + OPENUCX_VERSION: 1.19.0 + PATH: /opt/nemo_rl_venv/bin:/opt/nemo_rl_venv/bin:/opt/nemo_rl_venv/bin:/root/.local/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/mpi/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/ucx/bin:/opt/amazon/efa/bin + POLYGRAPHY_VERSION: 0.49.20 + PWD: /workspace/ARChitects + PYTHONBREAKPOINT: ray.util.rpdb._driver_set_trace + PYTHONPATH: '/workspace/NVARC/ARChitects/nemo-rl:' + PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True + RAY_CLIENT_MODE: '0' + RAY_ENABLE_UV_RUN_RUNTIME_ENV: '0' + RAY_USAGE_STATS_ENABLED: '0' + RDMACORE_VERSION: '50.0' + SHELL: /bin/bash + SHLVL: '2' + SWANLAB_API_HOST: https://api.swanlab.cn/api + SWANLAB_RUNTIME: user + SWANLAB_WEB_HOST: https://swanlab.cn + TERM: xterm + TORCH_CUDA_ARCH_LIST: '9.0' + TRANSFORMER_ENGINE_VERSION: '2.3' + TRTOSS_VERSION: '' + TRT_VERSION: 10.10.0.31 + UV: /root/.local/bin/uv + UV_LINK_MODE: copy + UV_PROJECT_ENVIRONMENT: /opt/nemo_rl_venv + UV_RUN_RECURSION_DEPTH: '1' + VIRTUAL_ENV: /opt/nemo_rl_venv + VIRTUAL_ENV_PROMPT: nemo-rl + WANDB_SERVICE: 3-239782-unix-/tmp/wandb-239782-251940-2530276073/socket + _: /root/.local/bin/uv + _CUDA_COMPAT_PATH: /usr/local/cuda/compat + _CUDA_COMPAT_STATUS: System has unsupported display driver / cuda driver combination + (CUDA_ERROR_SYSTEM_DRIVER_MISMATCH) cuInit()=803 + _MLFLOW_TELEMETRY_SESSION_ID: 8ac94e5a44a94a62a2cf3302ff4fa0c9 + expert_model_parallel_size: 1 + expert_tensor_parallel_size: 1 + freeze_moe_router: true + moe_permute_fusion: false + moe_router_bias_update_rate: 0.0 + moe_router_dtype: fp64 + moe_router_load_balancing_type: none + num_layers_in_first_pipeline_stage: null + num_layers_in_last_pipeline_stage: null + optimizer: + adam_beta1: 0.9 + adam_beta2: 0.98 + adam_eps: 1.0e-08 + bf16: true + clip_grad: 0.5 + fp16: false + lr: 0.0001 + min_lr: 1.0e-07 + optimizer: adam + optimizer_cpu_offload: false + optimizer_offload_fraction: 0.0 + params_dtype: bfloat16 + sgd_momentum: 0.9 + use_distributed_optimizer: true + use_precision_aware_optimizer: false + weight_decay: 0.1 + pipeline_dtype: bfloat16 + pipeline_model_parallel_size: 1 + scheduler: + end_weight_decay: 0.1 + lr_decay_iters: 12716 + lr_decay_style: linear + lr_warmup_init: 1.0e-06 + lr_warmup_iters: 200 + start_weight_decay: 0.1 + weight_decay_incr_style: constant + sequence_parallel: false + tensor_model_parallel_size: 1 + train_iters: 5972 + model_name: ./models/Qwen-NVARC + offload_optimizer_for_logprob: false + precision: bfloat16 + sequence_packing: + algorithm: modified_first_fit_decreasing + enabled: true + sequence_length_round: 64 + train_mb_tokens: 128000 + tokenizer: + name: ./models/Qwen-NVARC + train_global_batch_size: 256 + train_micro_batch_size: 1 +sft: + max_num_epochs: 1 + max_num_steps: 6400 + seed: 24 + val_at_start: true + val_batches: 200 + val_global_batch_size: 256 + val_micro_batch_size: 1 + val_period: 200 diff --git a/step_5972/policy/weights/iter_0000000/.metadata b/step_5972/policy/weights/iter_0000000/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..95b0c9570c2790195eaaff9a91e35d806de193ac --- /dev/null +++ b/step_5972/policy/weights/iter_0000000/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63b2c2c7a6c21b171a30b50ae7dc76c9744532ea6b3c093434c81c412ad99548 +size 329201 diff --git a/step_5972/policy/weights/iter_0000000/__0_1.distcp b/step_5972/policy/weights/iter_0000000/__0_1.distcp new file mode 100644 index 0000000000000000000000000000000000000000..8b75566730d1e591b3a4fb8079d0da1dc302feff --- /dev/null +++ b/step_5972/policy/weights/iter_0000000/__0_1.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2f6811136b6e0fbc6c36bf350aee4b9e42c450265f2475895b613fc98ff26e7 +size 12718313784 diff --git a/step_5972/policy/weights/iter_0000000/__1_1.distcp b/step_5972/policy/weights/iter_0000000/__1_1.distcp new file mode 100644 index 0000000000000000000000000000000000000000..af3db3a33813e96eb82e3cdcfaae47dffb18eab5 --- /dev/null +++ b/step_5972/policy/weights/iter_0000000/__1_1.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22f80411837876981796f026820ac72c19b06b79a478df2c332f912075adc25f +size 12717860926 diff --git a/step_5972/policy/weights/iter_0000000/common.pt b/step_5972/policy/weights/iter_0000000/common.pt new file mode 100644 index 0000000000000000000000000000000000000000..26facb3572476743d371932801a8473560606879 --- /dev/null +++ b/step_5972/policy/weights/iter_0000000/common.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:293a19ff82e664ad14eeea37b1cdcfc976171b534d5ec99eff7d86a5dfade2af +size 1767 diff --git a/step_5972/policy/weights/iter_0000000/metadata.json b/step_5972/policy/weights/iter_0000000/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..c9dbde0bcf4d2d993122dcc7d6bcb1eef8b6fb77 --- /dev/null +++ b/step_5972/policy/weights/iter_0000000/metadata.json @@ -0,0 +1 @@ +{"sharded_backend": "torch_dist", "sharded_backend_version": 1, "common_backend": "torch", "common_backend_version": 1} \ No newline at end of file diff --git a/step_5972/policy/weights/iter_0000000/modelopt_run_config.yaml b/step_5972/policy/weights/iter_0000000/modelopt_run_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..44ee391cb75b24b9ae1693ad28ca2c1c6a0b1f25 --- /dev/null +++ b/step_5972/policy/weights/iter_0000000/modelopt_run_config.yaml @@ -0,0 +1,203 @@ +activation_func: +activation_func_clamp_value: None +add_bias_linear: false +add_qkv_bias: false +apply_query_key_layer_scaling: false +apply_residual_connection_post_layernorm: false +apply_rope_fusion: true +attention_backend: AttnBackend.auto +attention_dropout: '0.0' +attention_output_gate: false +attention_softmax_in_fp32: false +autocast_dtype: torch.bfloat16 +barrier_with_L1_time: true +bf16: true +bias_activation_fusion: false +bias_dropout_fusion: false +calculate_per_token_loss: true +clone_scatter_output_in_embedding: true +config_logger_dir: '' +cross_entropy_fusion_impl: native +cross_entropy_loss_fusion: true +defer_embedding_wgrad_compute: false +delay_wgrad_compute: false +deterministic_mode: false +disable_bf16_reduced_precision_matmul: false +disable_parameter_transpose_cache: false +distribute_saved_activations: None +enable_autocast: false +fallback_to_eager_attn: false +ffn_hidden_size: 9728 +finalize_model_grads_func: functools.partial(, + pg_collection=None) +fine_grained_activation_offloading: false +first_last_layers_bf16: false +flash_decode: false +fp16: false +fp16_lm_cross_entropy: false +fp32_residual_connection: false +fused_single_qkv_rope: false +gated_linear_unit: true +generation_config: None +glu_linear_offset: '0.0' +grad_scale_func: > +grad_sync_func: "" +gradient_accumulation_fusion: false +hetereogenous_dist_checkpoint: false +heterogeneous_block_specs: false +hf_model_id: ./models/Qwen-NVARC +hidden_dropout: '0.0' +hidden_size: 2560 +is_hybrid_model: false +kv_channels: 128 +layernorm_epsilon: 1e-06 +layernorm_zero_centered_gamma: false +linear_attention_freq: None +linear_attention_type: None +linear_conv_kernel_dim: None +linear_key_head_dim: None +linear_num_key_heads: None +linear_num_value_heads: None +linear_value_head_dim: None +log_max_attention_logit: false +make_vocab_size_divisible_by: 16 +mamba_head_dim: 64 +mamba_num_groups: 8 +mamba_num_heads: None +mamba_state_dim: 128 +masked_softmax_fusion: true +max_position_embeddings: 40960 +memory_efficient_layer_norm: false +min_offloaded_tensor_size: 1048576 +mlp_chunks_for_prefill: 1 +moe_apply_probs_on_input: false +moe_aux_loss_coeff: '0.0' +moe_deepep_num_sms: 20 +moe_enable_deepep: false +moe_expert_capacity_factor: None +moe_extended_tp: false +moe_ffn_hidden_size: None +moe_flex_dispatcher_backend: deepep +moe_grouped_gemm: false +moe_hybridep_num_sms: 16 +moe_input_jitter_eps: None +moe_layer_freq: 1 +moe_pad_expert_input_to_capacity: false +moe_per_layer_logging: false +moe_permute_fusion: false +moe_router_bias_update_rate: '0.0' +moe_router_dtype: fp64 +moe_router_enable_expert_bias: false +moe_router_force_load_balancing: false +moe_router_fusion: false +moe_router_group_topk: None +moe_router_load_balancing_type: none +moe_router_num_groups: None +moe_router_padding_for_quantization: false +moe_router_pre_softmax: false +moe_router_score_function: softmax +moe_router_topk: 2 +moe_router_topk_limited_devices: None +moe_router_topk_scaling_factor: None +moe_shared_expert_gate: false +moe_shared_expert_intermediate_size: None +moe_shared_expert_overlap: false +moe_token_dispatcher_type: allgather +moe_token_drop_policy: probs +moe_token_dropping: false +moe_use_legacy_grouped_gemm: false +moe_z_loss_coeff: None +mrope_section: None +multi_latent_attention: false +no_rope_freq: None +no_sync_func: "" +normalization: RMSNorm +num_attention_heads: 32 +num_layers: 36 +num_layers_at_end_in_bf16: 1 +num_layers_at_start_in_bf16: 1 +num_moe_experts: None +num_query_groups: 8 +nvidia_modelopt_version: 0.39.0 +offload_modules: None +param_sync_func: None +params_dtype: torch.bfloat16 +perform_initialization: true +persist_layer_norm: false +position_embedding_type: rope +qk_clip: false +qk_clip_alpha: '0.5' +qk_clip_threshold: 100 +qk_layernorm: true +quant_recipe: None +restore_modelopt_state: false +rotary_base: 5000000 +rotary_interleaved: false +rotary_percent: '1.0' +seq_len_interpolation_factor: None +seq_length: 262144 +share_embeddings_and_output_weights: true +should_pad_vocab: false +softmax_scale: None +softmax_type: vanilla +symmetric_ar_type: None +test_mode: false +timers: None +transformer_impl: transformer_engine +transformer_layer_spec: +use_fused_weighted_squared_relu: false +use_kitchen: false +use_mamba_mem_eff_path: true +use_ring_exchange_p2p: false +use_te_activation_func: false +use_te_rng_tracker: false +use_transformer_engine_full_layer_spec: false +use_transformer_engine_op_fuser: false +variable_seq_lengths: false +vocab_size: 16 +wgrad_deferral_limit: 0 +window_attn_skip_freq: None +window_size: None diff --git a/step_5972/policy/weights/iter_0000000/run_config.yaml b/step_5972/policy/weights/iter_0000000/run_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..110787924b255fe01d868fd5403be920af87798d --- /dev/null +++ b/step_5972/policy/weights/iter_0000000/run_config.yaml @@ -0,0 +1,564 @@ +_target_: megatron.bridge.training.config.ConfigContainer +checkpoint: + _target_: megatron.bridge.training.config.CheckpointConfig + async_save: false + ckpt_assume_constant_structure: false + ckpt_convert_format: null + ckpt_convert_save: null + ckpt_format: torch_dist + ckpt_step: null + dist_ckpt_optim_fully_reshardable: false + dist_ckpt_save_pre_mcore_014: false + dist_ckpt_strictness: assume_ok_unexpected + distrib_optim_fully_reshardable_mem_efficient: false + exit_on_missing_checkpoint: false + finetune: true + fully_parallel_load: true + fully_parallel_save: true + load: null + load_main_params_from_ckpt: false + load_optim: true + load_rng: false + most_recent_k: -1 + non_persistent_ckpt_type: null + non_persistent_global_ckpt_dir: null + non_persistent_local_ckpt_algo: fully_parallel + non_persistent_local_ckpt_dir: null + non_persistent_save_interval: null + pretrained_checkpoint: ./results/megatron/model_._models_Qwen-NVARC + replication: false + replication_factor: 2 + replication_jump: null + save: /workspace/ARChitects/results/qwen3_4b_sft/tmp_step_5972/policy/weights + save_interval: 100 + save_optim: true + save_rng: true + save_tokenizer_assets: true + strict_fsdp_dtensor_load: false + use_checkpoint_args: false + use_persistent_ckpt_worker: true +comm_overlap: null +dataset: null +ddp: + _target_: megatron.bridge.training.config.DistributedDataParallelConfig + align_param_gather: false + average_in_collective: false + bucket_size: 40000000 + check_for_large_grads: false + check_for_nan_in_grad: true + data_parallel_sharding_strategy: optim_grads_params + delay_wgrad_compute: false + disable_symmetric_registration: false + fp8_param_gather: false + fsdp_double_buffer: false + grad_reduce_in_fp32: true + gradient_reduce_div_fusion: true + keep_fp8_transpose_cache: false + nccl_ub: false + num_distributed_optimizer_instances: 1 + outer_dp_sharding_strategy: no_shard + overlap_grad_reduce: true + overlap_param_gather: true + pad_buckets_for_high_nccl_busbw: false + preserve_fp32_weights: true + reduce_scatter_with_fp32_accumulation: false + reuse_grad_buf_for_mxfp8_param_ag: false + suggested_communication_unit_size: null + use_custom_fsdp: false + use_distributed_optimizer: true + use_megatron_fsdp: false +dist: + _target_: megatron.bridge.training.config.DistributedInitConfig + align_grad_reduce: true + disable_jit_fuser: false + distributed_backend: nccl + distributed_timeout_minutes: 10 + distributed_timeout_seconds_after_init: null + enable_megatron_core_experimental: false + external_gpu_device_mapping: true + high_priority_stream_groups: null + lazy_init: false + local_rank: 0 + nccl_communicator_config_path: null + sharp_enabled_group: null + use_gloo_process_groups: true + use_megatron_fsdp: false + use_sharp: false + use_torch_fsdp2: false + use_tp_pp_dp_mapping: false +ft: null +inprocess_restart: null +logger: + _target_: megatron.bridge.training.config.LoggerConfig + filter_warnings: true + log_energy: false + log_interval: 100 + log_l2_norm_grad_to_tensorboard: false + log_loss_scale_to_tensorboard: true + log_memory_to_tensorboard: false + log_params_norm: false + log_progress: false + log_runtime_to_tensorboard: false + log_throughput: false + log_throughput_to_tensorboard: false + log_timers_to_tensorboard: false + log_validation_ppl_to_tensorboard: false + log_world_size_to_tensorboard: false + logging_level: 0 + memory_keys: null + modules_to_filter: null + runtime_time_unit: hours + save_config_filepath: null + set_level_for_all_loggers: false + tensorboard_dir: null + tensorboard_log_interval: 1 + tensorboard_queue_size: 1000 + throughput_window_size: 100 + timing_log_level: 0 + timing_log_option: minmax + wandb_entity: null + wandb_exp_name: null + wandb_project: null + wandb_save_dir: null +mixed_precision: null +model: + _target_: megatron.bridge.models.qwen.qwen_provider.Qwen3ModelProvider + account_for_embedding_in_pipeline_split: false + account_for_loss_in_pipeline_split: false + activation_func: + _call_: false + _target_: torch.nn.functional.silu + activation_func_clamp_value: null + activation_func_fp8_input_store: false + add_bias_linear: false + add_qkv_bias: false + apply_query_key_layer_scaling: false + apply_residual_connection_post_layernorm: false + apply_rope_fusion: true + async_tensor_model_parallel_allreduce: false + attention_backend: + _args_: + - 5 + _call_: true + _target_: megatron.core.transformer.enums.AttnBackend + attention_dropout: 0.0 + attention_output_gate: false + attention_softmax_in_fp32: false + autocast_dtype: + _call_: false + _target_: torch.bfloat16 + barrier_with_L1_time: true + batch_p2p_comm: true + batch_p2p_sync: true + bf16: true + bias_activation_fusion: false + bias_dropout_fusion: false + calculate_per_token_loss: true + clone_scatter_output_in_embedding: true + config_logger_dir: '' + context_parallel_size: 2 + cp_comm_type: null + cpu_offloading: false + cpu_offloading_activations: true + cpu_offloading_double_buffering: false + cpu_offloading_num_layers: 0 + cpu_offloading_weights: false + cross_entropy_fusion_impl: native + cross_entropy_loss_fusion: true + cuda_graph_impl: none + cuda_graph_retain_backward_graph: false + cuda_graph_scope: [] + cuda_graph_use_single_mempool: false + cuda_graph_warmup_steps: 3 + deallocate_pipeline_outputs: true + defer_embedding_wgrad_compute: false + delay_wgrad_compute: false + deterministic_mode: false + disable_bf16_reduced_precision_matmul: false + disable_parameter_transpose_cache: false + distribute_saved_activations: null + embedding_init_method: + _args_: [] + _partial_: true + _target_: torch.nn.init.normal_ + mean: 0.0 + std: 0.02 + embedding_init_method_std: 0.02 + enable_autocast: false + enable_cuda_graph: false + expert_model_parallel_size: 1 + expert_tensor_parallel_size: 1 + external_cuda_graph: false + fallback_to_eager_attn: false + ffn_hidden_size: 9728 + finalize_model_grads_func: + _args_: [] + _partial_: true + _target_: megatron.core.distributed.finalize_model_grads.finalize_model_grads + pg_collection: null + fine_grained_activation_offloading: false + first_last_layers_bf16: false + flash_decode: false + fp16: false + fp16_lm_cross_entropy: false + fp32_residual_connection: false + fp4: null + fp4_param: false + fp4_quantizer_factory: null + fp4_recipe: nvfp4 + fp8: null + fp8_amax_compute_algo: most_recent + fp8_amax_history_len: 1 + fp8_dot_product_attention: false + fp8_interval: 1 + fp8_margin: 0 + fp8_multi_head_attention: false + fp8_param: false + fp8_quantizer_factory: null + fp8_recipe: delayed + fp8_wgrad: true + fused_single_qkv_rope: false + gated_linear_unit: true + generation_config: null + glu_linear_offset: 0.0 + grad_scale_func: + _call_: false + _target_: megatron.core.optimizer.optimizer.MegatronOptimizer.scale_loss + grad_sync_func: + _call_: false + _target_: megatron.core.distributed.distributed_data_parallel.DistributedDataParallel.start_grad_sync + gradient_accumulation_fusion: false + hetereogenous_dist_checkpoint: false + heterogeneous_block_specs: false + hf_model_id: ./models/Qwen-NVARC + hidden_dropout: 0.0 + hidden_size: 2560 + hierarchical_context_parallel_sizes: null + inference_rng_tracker: false + inference_sampling_seed: 42 + init_method: + _args_: [] + _partial_: true + _target_: torch.nn.init.normal_ + mean: 0.0 + std: 0.02 + init_method_std: 0.02 + init_model_with_meta_device: false + is_hybrid_model: false + kv_channels: 128 + layernorm_epsilon: 1.0e-06 + layernorm_zero_centered_gamma: false + linear_attention_freq: null + linear_attention_type: null + linear_conv_kernel_dim: null + linear_key_head_dim: null + linear_num_key_heads: null + linear_num_value_heads: null + linear_value_head_dim: null + log_max_attention_logit: false + make_vocab_size_divisible_by: 16 + mamba_head_dim: 64 + mamba_num_groups: 8 + mamba_num_heads: null + mamba_state_dim: 128 + masked_softmax_fusion: true + max_position_embeddings: 40960 + memory_efficient_layer_norm: false + microbatch_group_size_per_vp_stage: 1 + min_offloaded_tensor_size: 1048576 + mlp_chunks_for_prefill: 1 + moe_apply_probs_on_input: false + moe_aux_loss_coeff: 0.0 + moe_deepep_num_sms: 20 + moe_enable_deepep: false + moe_expert_capacity_factor: null + moe_extended_tp: false + moe_ffn_hidden_size: null + moe_flex_dispatcher_backend: deepep + moe_grouped_gemm: false + moe_hybridep_num_sms: 16 + moe_input_jitter_eps: null + moe_layer_freq: 1 + moe_layer_recompute: false + moe_pad_expert_input_to_capacity: false + moe_per_layer_logging: false + moe_permute_fusion: false + moe_router_bias_update_rate: 0.0 + moe_router_dtype: fp64 + moe_router_enable_expert_bias: false + moe_router_force_load_balancing: false + moe_router_fusion: false + moe_router_group_topk: null + moe_router_load_balancing_type: none + moe_router_num_groups: null + moe_router_padding_for_fp8: false + moe_router_padding_for_quantization: false + moe_router_pre_softmax: false + moe_router_score_function: softmax + moe_router_topk: 2 + moe_router_topk_limited_devices: null + moe_router_topk_scaling_factor: null + moe_shared_expert_gate: false + moe_shared_expert_intermediate_size: null + moe_shared_expert_overlap: false + moe_token_dispatcher_type: allgather + moe_token_drop_policy: probs + moe_token_dropping: false + moe_use_legacy_grouped_gemm: false + moe_z_loss_coeff: null + mrope_section: null + mtp_enabled: false + mtp_loss_scaling_factor: null + mtp_num_layers: null + mtp_standalone: false + multi_latent_attention: false + no_rope_freq: null + no_sync_func: + _call_: false + _target_: megatron.core.distributed.distributed_data_parallel.DistributedDataParallel.no_sync + normalization: RMSNorm + num_attention_heads: 32 + num_layers: 36 + num_layers_at_end_in_bf16: 1 + num_layers_at_start_in_bf16: 1 + num_layers_in_first_pipeline_stage: null + num_layers_in_last_pipeline_stage: null + num_microbatches_with_partial_activation_checkpoints: null + num_moe_experts: null + num_query_groups: 8 + offload_modules: null + output_layer_init_method: + _args_: [] + _partial_: true + _target_: torch.nn.init.normal_ + mean: 0.0 + std: 0.0023570226039551587 + overlap_moe_expert_parallel_comm: false + overlap_p2p_comm: false + overlap_p2p_comm_warmup_flush: false + parallel_output: true + param_sync_func: null + params_dtype: + _call_: false + _target_: torch.bfloat16 + perform_initialization: true + persist_layer_norm: false + pipeline_dtype: + _call_: false + _target_: torch.bfloat16 + pipeline_model_parallel_comm_backend: null + pipeline_model_parallel_layout: null + pipeline_model_parallel_size: 1 + position_embedding_type: rope + qk_clip: false + qk_clip_alpha: 0.5 + qk_clip_threshold: 100 + qk_layernorm: true + quant_recipe: null + recompute_granularity: full + recompute_method: uniform + recompute_modules: + - core_attn + recompute_num_layers: 1 + restore_modelopt_state: false + rotary_base: 5000000 + rotary_interleaved: false + rotary_percent: 1.0 + scatter_embedding_sequence_parallel: true + seq_len_interpolation_factor: null + seq_length: 262144 + sequence_parallel: false + share_embeddings_and_output_weights: true + should_pad_vocab: false + softmax_scale: null + softmax_type: vanilla + symmetric_ar_type: null + tensor_model_parallel_size: 1 + test_mode: false + timers: null + tp_comm_atomic_ag: false + tp_comm_atomic_rs: false + tp_comm_bootstrap_backend: nccl + tp_comm_bulk_dgrad: true + tp_comm_bulk_wgrad: true + tp_comm_overlap: false + tp_comm_overlap_ag: true + tp_comm_overlap_cfg: null + tp_comm_overlap_disable_fc1: false + tp_comm_overlap_disable_qkv: false + tp_comm_overlap_rs: true + tp_comm_overlap_rs_dgrad: false + tp_comm_split_ag: true + tp_comm_split_rs: true + tp_only_amax_red: false + transformer_impl: transformer_engine + transformer_layer_spec: + _call_: false + _target_: megatron.bridge.models.gpt_provider.default_layer_spec + use_cpu_initialization: false + use_fused_weighted_squared_relu: false + use_kitchen: false + use_mamba_mem_eff_path: true + use_ring_exchange_p2p: false + use_te_activation_func: false + use_te_rng_tracker: false + use_transformer_engine_full_layer_spec: false + use_transformer_engine_op_fuser: false + variable_seq_lengths: false + virtual_pipeline_model_parallel_size: null + vocab_size: 16 + wgrad_deferral_limit: 0 + window_attn_skip_freq: null + window_size: null +nvrx_straggler: null +optimizer: + _target_: megatron.bridge.training.config.OptimizerConfig + adam_beta1: 0.9 + adam_beta2: 0.98 + adam_eps: 1.0e-08 + barrier_with_L1_time: false + bf16: true + clip_grad: 0.5 + config_logger_dir: '' + decoupled_lr: null + decoupled_min_lr: null + decoupled_weight_decay: true + exp_avg_dtype: + _call_: false + _target_: torch.float32 + exp_avg_sq_dtype: + _call_: false + _target_: torch.float32 + fp16: false + fp8_recipe: null + hysteresis: 2 + initial_loss_scale: 4294967296 + log_num_zeros_in_grad: false + loss_scale: null + loss_scale_window: 1000 + lr: 0.0001 + main_grads_dtype: + _call_: false + _target_: torch.float32 + main_params_dtype: + _call_: false + _target_: torch.float32 + min_loss_scale: 1.0 + min_lr: 1.0e-07 + muon_extra_scale_factor: 1.0 + muon_fp32_matmul_prec: medium + muon_momentum: 0.95 + muon_num_ns_steps: 5 + muon_scale_mode: spectral + muon_split_qkv: true + muon_tp_mode: blockwise + muon_use_nesterov: false + optimizer: adam + optimizer_cpu_offload: false + optimizer_offload_fraction: 0.0 + overlap_cpu_optimizer_d2h_h2d: false + overlap_param_gather: false + overlap_param_gather_with_optimizer_step: false + params_dtype: bfloat16 + pin_cpu_grads: true + pin_cpu_params: true + reuse_grad_buf_for_mxfp8_param_ag: false + sgd_momentum: 0.9 + store_param_remainders: true + timers: null + use_distributed_optimizer: true + use_precision_aware_optimizer: false + use_torch_optimizer_for_cpu_offload: false + weight_decay: 0.1 +peft: null +profiling: + _target_: megatron.bridge.training.config.ProfilingConfig + memory_snapshot_path: snapshot.pickle + nvtx_ranges: false + profile_ranks: + - 0 + profile_step_end: 12 + profile_step_start: 10 + record_memory_history: false + record_shapes: false + use_nsys_profiler: false + use_pytorch_profiler: false +rerun_state_machine: + _target_: megatron.bridge.training.config.RerunStateMachineConfig + check_for_nan_in_loss: true + check_for_spiky_loss: false + error_injection_rate: 0 + error_injection_type: transient_error + rerun_mode: disabled +rng: + _target_: megatron.bridge.training.config.RNGConfig + data_parallel_random_init: false + inference_rng_tracker: false + seed: 1234 + te_rng_tracker: false +scheduler: + _target_: megatron.bridge.training.config.SchedulerConfig + end_weight_decay: 0.1 + lr_decay_iters: 12716 + lr_decay_samples: null + lr_decay_steps: 3255296 + lr_decay_style: linear + lr_warmup_fraction: null + lr_warmup_init: 1.0e-06 + lr_warmup_iters: 200 + lr_warmup_samples: 0 + lr_warmup_steps: 51200 + lr_wsd_decay_iters: null + lr_wsd_decay_samples: null + lr_wsd_decay_style: exponential + no_weight_decay_cond_type: null + override_opt_param_scheduler: false + start_weight_decay: 0.1 + use_checkpoint_opt_param_scheduler: false + wd_incr_steps: 1528832 + weight_decay_incr_style: constant + wsd_decay_steps: null +straggler: null +tensor_inspect: null +tokenizer: + _target_: megatron.bridge.training.tokenizers.config.TokenizerConfig + hf_tokenizer_kwargs: {} + image_tag_type: null + merge_file: null + special_tokens: null + tiktoken_num_special_tokens: 1000 + tiktoken_pattern: null + tiktoken_special_tokens: null + tokenizer_model: ./models/Qwen-NVARC + tokenizer_prompt_format: null + tokenizer_type: HuggingFaceTokenizer + vocab_extra_ids: 0 + vocab_file: null + vocab_size: null +train: + _target_: megatron.bridge.training.config.TrainingConfig + check_weight_hash_across_dp_replicas_interval: null + decrease_batch_size_if_needed: false + empty_unused_memory_level: 0 + eval_interval: 1000 + eval_iters: 100 + exit_duration_in_mins: null + exit_interval: null + exit_signal: + _args_: + - 15 + _call_: true + _target_: signal.Signals + exit_signal_handler: false + exit_signal_handler_for_dataloader: false + global_batch_size: 256 + iterations_to_skip: [] + manual_gc: false + manual_gc_eval: true + manual_gc_interval: 0 + micro_batch_size: 1 + rampup_batch_size: null + skip_train: false + train_iters: 5972 + train_samples: null + train_sync_interval: null diff --git a/step_5972/policy/weights/iter_0000000/train_state.pt b/step_5972/policy/weights/iter_0000000/train_state.pt new file mode 100644 index 0000000000000000000000000000000000000000..7362367436b8d879b575604f086662820bb1ab6b --- /dev/null +++ b/step_5972/policy/weights/iter_0000000/train_state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9077193280ecd59384d5ae94d97a7943638810502f2ce08a7617aeb20b3586d2 +size 3461 diff --git a/step_5972/policy/weights/latest_checkpointed_iteration.txt b/step_5972/policy/weights/latest_checkpointed_iteration.txt new file mode 100644 index 0000000000000000000000000000000000000000..c227083464fb9af8955c90d2924774ee50abb547 --- /dev/null +++ b/step_5972/policy/weights/latest_checkpointed_iteration.txt @@ -0,0 +1 @@ +0 \ No newline at end of file diff --git a/step_5972/policy/weights/latest_train_state.pt b/step_5972/policy/weights/latest_train_state.pt new file mode 100644 index 0000000000000000000000000000000000000000..7362367436b8d879b575604f086662820bb1ab6b --- /dev/null +++ b/step_5972/policy/weights/latest_train_state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9077193280ecd59384d5ae94d97a7943638810502f2ce08a7617aeb20b3586d2 +size 3461 diff --git a/step_5972/train_dataloader.pt b/step_5972/train_dataloader.pt new file mode 100644 index 0000000000000000000000000000000000000000..acd783629d9a6641b0bc4d13e8821b714bd1f28b --- /dev/null +++ b/step_5972/train_dataloader.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12bfcb136c615985e1571fc19377a9c8101d41c662c01f02e87c20a192ea5137 +size 7336 diff --git a/step_5972/training_info.json b/step_5972/training_info.json new file mode 100644 index 0000000000000000000000000000000000000000..1d09c0a67989586153eb42a4d9c7d90a676403dc --- /dev/null +++ b/step_5972/training_info.json @@ -0,0 +1 @@ +{"epoch": 0, "step": 0, "total_steps": 5972, "consumed_samples": 1528832, "total_valid_tokens": 1734100783.0} \ No newline at end of file