Commit 7754092 (verified) by jihyeonl
Parent: 9e50f9a

Upload folder using huggingface_hub

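The commit message says the folder was pushed with huggingface_hub. For reference, a minimal sketch of how such an upload is typically issued (the repo id and local path below are placeholders, not values from this commit):

```python
from huggingface_hub import HfApi

api = HfApi()

# Uploads every file under folder_path in a single commit; files matching
# the LFS patterns in .gitattributes are stored as LFS pointers.
api.upload_folder(
    folder_path="/path/to/local/checkpoint",  # placeholder path
    repo_id="user/repo",                      # placeholder repo id
    commit_message="Upload folder using huggingface_hub",
)
```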
.gitattributes CHANGED
@@ -33,3 +33,20 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+.metadata filter=lfs diff=lfs merge=lfs -text
+__0_0.distcp filter=lfs diff=lfs merge=lfs -text
+__0_1.distcp filter=lfs diff=lfs merge=lfs -text
+__1_0.distcp filter=lfs diff=lfs merge=lfs -text
+__1_1.distcp filter=lfs diff=lfs merge=lfs -text
+__2_0.distcp filter=lfs diff=lfs merge=lfs -text
+__2_1.distcp filter=lfs diff=lfs merge=lfs -text
+__3_0.distcp filter=lfs diff=lfs merge=lfs -text
+__3_1.distcp filter=lfs diff=lfs merge=lfs -text
+__4_0.distcp filter=lfs diff=lfs merge=lfs -text
+__4_1.distcp filter=lfs diff=lfs merge=lfs -text
+__5_0.distcp filter=lfs diff=lfs merge=lfs -text
+__5_1.distcp filter=lfs diff=lfs merge=lfs -text
+__6_0.distcp filter=lfs diff=lfs merge=lfs -text
+__6_1.distcp filter=lfs diff=lfs merge=lfs -text
+__7_0.distcp filter=lfs diff=lfs merge=lfs -text
+__7_1.distcp filter=lfs diff=lfs merge=lfs -text
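These new rules route the checkpoint shards through Git LFS, so the repository stores lightweight pointer files instead of multi-gigabyte blobs. As a rough illustration (not part of the commit), slash-free gitattributes patterns match against the file's basename, roughly like shell globbing; gitattributes matching is gitignore-style, so fnmatch is only an approximation:

```python
from fnmatch import fnmatch

# A few of the patterns added above (slash-free patterns match basenames).
patterns = ["*.zip", "*tfevents*", ".metadata", "__0_0.distcp"]

def lfs_tracked(name: str) -> bool:
    return any(fnmatch(name, p) for p in patterns)

assert lfs_tracked("__0_0.distcp")
assert not lfs_tracked("run_config.yaml")
```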
.metadata ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3bfb224cfbc7e01c6813317f09fa155206a44d5565930468ccc2949598d77a8a
+size 8037545
__0_0.distcp ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d3145f1f09dd6f9d6bae6c2631a968cb90d57a482cdf8215a92f1a62d656991
+size 11669263879
__0_1.distcp ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bb86819db4ff5906e58127db20403504f6ebc32a14627b34a17d40e51861e7b3
+size 11669248562
__1_0.distcp ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0fc644960138eb1e3ccb0468963087f9b3882b31f77c611b6ee64ed470a27a76
+size 11647286277
__1_1.distcp ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:71d5cce0bc8468671e00ca143b026020c8a54da3435253b42cd051e3eca968fe
+size 11647300081
__2_0.distcp ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7d5a40d5ae65d6ff53626c4ea4e8e9a327fe113d12727a76bdd7ad0343e94050
+size 11647286277
__2_1.distcp ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fed334a22e4af31f3238668864ba0e4fd257bcd949197b3675b3391a0e3f963d
+size 11647300081
__3_0.distcp ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e0ad6debdb19d305b1d6d0978904a0ff802577656f2f5cdb2b203c47f7b076d4
+size 11647286277
__3_1.distcp ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:96acd10900eb3086011a53c9426cca2d5c821143ff032d717bf367a86586f0f2
+size 11647300081
__4_0.distcp ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dfc3b518376d7ac14ece702c207a12805882e6b2d4be033e5b37795196d83584
+size 11647286277
__4_1.distcp ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d1e9d63d3f523927a73bbf1f5c1b0abfff772d0cf29991e015c11457db7055fc
+size 11647300081
__5_0.distcp ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c42ed5ace9bb650b4c3db4066c752f2cf65f5a7d6298307dacf7170429463b4f
+size 11647286277
__5_1.distcp ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f3f9a9e0cc50cf7cf67f112a35c655a5fa719bf3470f892414f5aa8ed8fc08a
+size 11647300081
__6_0.distcp ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:522d67d5d2c76894fd6b665e4b532c3f1f2486bfe49317d9c261976599a046a3
+size 11647286277
__6_1.distcp ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:68685712037c1b940229723cc04c8d1e5f2e721bafc71efe891d9972545f65ed
+size 11647300081
__7_0.distcp ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:16f45912c2cb95a14620c5532fcf43d968b22f43bd2c9e590a23471e76b43306
+size 11647286277
__7_1.distcp ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:210b4eb2b5b1caea0cbde127e34162720815e67e0d581d0731bab8d916227fc4
+size 11647300081
common.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0f656d00004c0cea72d54764b7be55a4852dd99d38fdd1065de5f34a7d81edce
+size 1773
metadata.json ADDED
@@ -0,0 +1 @@
+{"sharded_backend": "torch_dist", "sharded_backend_version": 1, "common_backend": "torch", "common_backend_version": 1}
modelopt_run_config.yaml ADDED
@@ -0,0 +1,163 @@
+activation_func: <function squared_relu at 0x749c0c174fe0>
+activation_func_clamp_value: None
+add_bias_linear: false
+add_qkv_bias: false
+apply_query_key_layer_scaling: false
+apply_residual_connection_post_layernorm: false
+apply_rope_fusion: true
+attention_backend: AttnBackend.flash
+attention_dropout: '0.0'
+attention_output_gate: false
+attention_softmax_in_fp32: true
+autocast_dtype: torch.bfloat16
+barrier_with_L1_time: true
+bf16: true
+bias_activation_fusion: false
+bias_dropout_fusion: true
+calculate_per_token_loss: false
+clone_scatter_output_in_embedding: true
+config_logger_dir: ''
+cross_entropy_fusion_impl: native
+cross_entropy_loss_fusion: true
+defer_embedding_wgrad_compute: false
+delay_wgrad_compute: false
+deterministic_mode: false
+disable_bf16_reduced_precision_matmul: false
+disable_parameter_transpose_cache: false
+distribute_saved_activations: None
+enable_autocast: false
+fallback_to_eager_attn: false
+ffn_hidden_size: 20480
+finalize_model_grads_func: None
+fine_grained_activation_offloading: false
+first_last_layers_bf16: false
+flash_decode: false
+fp16: false
+fp16_lm_cross_entropy: false
+fp32_residual_connection: false
+freeze_language_model: false
+freeze_vision_model: false
+freeze_vision_projection: false
+fused_single_qkv_rope: false
+gated_linear_unit: false
+generation_config: None
+glu_linear_offset: '0.0'
+grad_scale_func: None
+grad_sync_func: None
+gradient_accumulation_fusion: false
+hetereogenous_dist_checkpoint: false
+heterogeneous_block_specs: false
+hf_model_id: /work/checkpoints/hf/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16
+hidden_dropout: '0.0'
+hidden_size: 5120
+hybrid_attention_ratio: '0.0'
+hybrid_mlp_ratio: '0.0'
+hybrid_override_pattern: M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M-
+is_hybrid_model: true
+kv_channels: 128
+language_model_type: nemotron5-hybrid-12b
+layernorm_epsilon: 1e-05
+layernorm_zero_centered_gamma: false
+linear_attention_freq: None
+linear_attention_type: None
+linear_conv_kernel_dim: None
+linear_key_head_dim: None
+linear_num_key_heads: None
+linear_num_value_heads: None
+linear_value_head_dim: None
+log_max_attention_logit: false
+make_vocab_size_divisible_by: 128
+mamba_head_dim: 80
+mamba_num_groups: 8
+mamba_num_heads: 128
+mamba_stack_spec: <function get_default_mamba_stack_spec at 0x749bedf1d620>
+mamba_state_dim: 128
+masked_softmax_fusion: true
+memory_efficient_layer_norm: false
+min_offloaded_tensor_size: 1048576
+mlp_chunks_for_prefill: 1
+moe_apply_probs_on_input: false
+moe_aux_loss_coeff: '0.0001'
+moe_deepep_num_sms: 20
+moe_enable_deepep: false
+moe_expert_capacity_factor: None
+moe_extended_tp: false
+moe_ffn_hidden_size: None
+moe_flex_dispatcher_backend: deepep
+moe_grouped_gemm: true
+moe_hybridep_num_sms: 16
+moe_input_jitter_eps: None
+moe_layer_freq: 1
+moe_pad_expert_input_to_capacity: false
+moe_per_layer_logging: false
+moe_permute_fusion: true
+moe_router_bias_update_rate: '0.001'
+moe_router_dtype: fp32
+moe_router_enable_expert_bias: true
+moe_router_force_load_balancing: false
+moe_router_fusion: false
+moe_router_group_topk: None
+moe_router_load_balancing_type: seq_aux_loss
+moe_router_num_groups: None
+moe_router_padding_for_quantization: false
+moe_router_pre_softmax: false
+moe_router_score_function: sigmoid
+moe_router_topk: 2
+moe_router_topk_limited_devices: None
+moe_router_topk_scaling_factor: None
+moe_shared_expert_gate: false
+moe_shared_expert_intermediate_size: None
+moe_shared_expert_overlap: true
+moe_token_dispatcher_type: alltoall
+moe_token_drop_policy: probs
+moe_token_dropping: false
+moe_use_legacy_grouped_gemm: false
+moe_z_loss_coeff: None
+mrope_section: None
+multi_latent_attention: false
+no_rope_freq: None
+no_sync_func: None
+normalization: RMSNorm
+num_attention_heads: 40
+num_layers: 62
+num_layers_at_end_in_bf16: 0
+num_layers_at_start_in_bf16: 0
+num_moe_experts: None
+num_query_groups: 8
+nvidia_modelopt_version: 0.37.0
+offload_modules: None
+param_sync_func: None
+params_dtype: torch.bfloat16
+perform_initialization: true
+persist_layer_norm: true
+position_embedding_type: none
+qk_clip: false
+qk_clip_alpha: '0.5'
+qk_clip_threshold: 100
+qk_layernorm: false
+quant_recipe: None
+rotary_base: 10000
+rotary_interleaved: false
+rotary_percent: '1.0'
+seq_len_interpolation_factor: None
+seq_length: 8192
+share_embeddings_and_output_weights: false
+should_pad_vocab: false
+softmax_scale: None
+softmax_type: vanilla
+symmetric_ar_type: None
+test_mode: false
+timers: None
+transformer_impl: transformer_engine
+use_fused_weighted_squared_relu: false
+use_kitchen: false
+use_mamba_mem_eff_path: true
+use_ring_exchange_p2p: false
+use_te_activation_func: false
+use_te_rng_tracker: false
+variable_seq_lengths: false
+vision_model_type: radio
+vocab_size: 132096
+wgrad_deferral_limit: 0
+window_attn_skip_freq: None
+window_size: None
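modelopt_run_config.yaml is a flat key/value dump of the model settings recorded by NVIDIA ModelOpt (note `nvidia_modelopt_version: 0.37.0`). Several entries are Python reprs or quoted numbers rather than structured values, so they round-trip only as strings. A small sketch of what loading it actually yields, assuming PyYAML:

```python
import yaml

with open("modelopt_run_config.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["hidden_size"], cfg["num_layers"])  # 5120 62

# Repr-style and quoted entries come back as plain strings:
print(cfg["activation_func"])           # "<function squared_relu at 0x...>"
print(repr(cfg["attention_dropout"]))   # "'0.0'" -- quoted in the dump
print(cfg["distribute_saved_activations"])  # the string "None", not null
```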
run_config.yaml ADDED
@@ -0,0 +1,616 @@
+_target_: megatron.bridge.training.config.ConfigContainer
+checkpoint:
+  _target_: megatron.bridge.training.config.CheckpointConfig
+  async_save: false
+  ckpt_assume_constant_structure: false
+  ckpt_convert_format: null
+  ckpt_convert_save: null
+  ckpt_format: torch_dist
+  ckpt_step: null
+  dist_ckpt_optim_fully_reshardable: false
+  dist_ckpt_save_pre_mcore_014: false
+  dist_ckpt_strictness: assume_ok_unexpected
+  distrib_optim_fully_reshardable_mem_efficient: false
+  exit_on_missing_checkpoint: false
+  finetune: true
+  fully_parallel_load: false
+  fully_parallel_save: true
+  load: /work/nemo-visual-systems/training/nemo_experiments/nemotron_nano_v2_vl_pretrain/checkpoints
+  load_main_params_from_ckpt: false
+  load_optim: true
+  load_rng: true
+  most_recent_k: -1
+  non_persistent_ckpt_type: null
+  non_persistent_global_ckpt_dir: null
+  non_persistent_local_ckpt_algo: fully_parallel
+  non_persistent_local_ckpt_dir: null
+  non_persistent_save_interval: null
+  pretrained_checkpoint: /work/checkpoints/mb/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16
+  replication: false
+  replication_factor: 2
+  replication_jump: null
+  save: /work/nemo-visual-systems/training/checkpoints/nemo-vs-tp8
+  save_interval: 1000
+  save_optim: true
+  save_rng: true
+  save_tokenizer_assets: true
+  strict_fsdp_dtensor_load: false
+  use_checkpoint_args: false
+  use_persistent_ckpt_worker: true
+comm_overlap: null
+dataset:
+  _target_: megatron.bridge.data.vlm_datasets.preloaded_provider.PreloadedVLMConversationProvider
+  data_sharding: true
+  dataloader_type: single
+  hf_processor_path: /work/checkpoints/hf/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16
+  image_folder: /work/datasets
+  num_workers: 2
+  persistent_workers: false
+  pin_memory: true
+  sequence_length: 8192
+  skip_getting_attention_mask_from_dataset: true
+  test_data_path: /work/datasets/merged_dataset/test.jsonl
+  train_data_path: /work/datasets/merged_dataset/train.jsonl
+  valid_data_path: /work/datasets/merged_dataset/valid.jsonl
+ddp:
+  _target_: megatron.bridge.training.config.DistributedDataParallelConfig
+  align_param_gather: false
+  average_in_collective: false
+  bucket_size: null
+  check_for_large_grads: false
+  check_for_nan_in_grad: true
+  data_parallel_sharding_strategy: optim_grads_params
+  delay_wgrad_compute: false
+  disable_symmetric_registration: false
+  fp8_param_gather: false
+  fsdp_double_buffer: false
+  grad_reduce_in_fp32: true
+  gradient_reduce_div_fusion: true
+  keep_fp8_transpose_cache: false
+  nccl_ub: false
+  num_distributed_optimizer_instances: 1
+  outer_dp_sharding_strategy: no_shard
+  overlap_grad_reduce: false
+  overlap_param_gather: false
+  pad_buckets_for_high_nccl_busbw: false
+  preserve_fp32_weights: true
+  reduce_scatter_with_fp32_accumulation: false
+  reuse_grad_buf_for_mxfp8_param_ag: false
+  suggested_communication_unit_size: null
+  use_custom_fsdp: false
+  use_distributed_optimizer: true
+  use_megatron_fsdp: false
+dist:
+  _target_: megatron.bridge.training.config.DistributedInitConfig
+  align_grad_reduce: true
+  disable_jit_fuser: false
+  distributed_backend: nccl
+  distributed_timeout_minutes: 10
+  distributed_timeout_seconds_after_init: null
+  enable_megatron_core_experimental: false
+  external_gpu_device_mapping: false
+  high_priority_stream_groups: null
+  lazy_init: false
+  local_rank: 0
+  nccl_communicator_config_path: null
+  sharp_enabled_group: null
+  use_gloo_process_groups: true
+  use_megatron_fsdp: false
+  use_sharp: false
+  use_torch_fsdp2: false
+  use_tp_pp_dp_mapping: false
+ft: null
+inprocess_restart: null
+logger:
+  _target_: megatron.bridge.training.config.LoggerConfig
+  filter_warnings: true
+  log_energy: false
+  log_interval: 1
+  log_l2_norm_grad_to_tensorboard: false
+  log_loss_scale_to_tensorboard: true
+  log_memory_to_tensorboard: false
+  log_params_norm: false
+  log_progress: false
+  log_runtime_to_tensorboard: false
+  log_throughput: false
+  log_throughput_to_tensorboard: false
+  log_timers_to_tensorboard: true
+  log_validation_ppl_to_tensorboard: false
+  log_world_size_to_tensorboard: false
+  logging_level: 20
+  memory_keys: null
+  modules_to_filter: null
+  runtime_time_unit: hours
+  save_config_filepath: null
+  set_level_for_all_loggers: false
+  tensorboard_dir: /work/nemo-visual-systems/training/nemo_experiments/nemotron_nano_v2_vl_pretrain/tb_logs
+  tensorboard_log_interval: 1
+  tensorboard_queue_size: 1000
+  throughput_window_size: 100
+  timing_log_level: 0
+  timing_log_option: minmax
+  wandb_entity: null
+  wandb_exp_name: merged-sft-tp8
+  wandb_project: nemo-vs
+  wandb_save_dir: /work/nemo-visual-systems/training/checkpoints
+mixed_precision:
+  _target_: megatron.bridge.training.mixed_precision.MixedPrecisionConfig
+  autocast_dtype: null
+  autocast_enabled: false
+  bf16: true
+  first_last_layers_bf16: false
+  fp16: false
+  fp32: false
+  fp4: null
+  fp4_recipe: nvfp4
+  fp8: null
+  fp8_amax_compute_algo: most_recent
+  fp8_amax_history_len: 1
+  fp8_dot_product_attention: false
+  fp8_margin: 0
+  fp8_multi_head_attention: false
+  fp8_param: false
+  fp8_param_gather: false
+  fp8_recipe: tensorwise
+  fp8_wgrad: true
+  grad_reduce_in_fp32: true
+  hysteresis: 2
+  initial_loss_scale: 4294967296
+  loss_scale: null
+  loss_scale_window: 1000
+  min_loss_scale: 1.0
+  num_layers_at_end_in_bf16: 0
+  num_layers_at_start_in_bf16: 0
+  params_dtype:
+    _call_: false
+    _target_: torch.bfloat16
+  pipeline_dtype:
+    _call_: false
+    _target_: torch.bfloat16
+  reuse_grad_buf_for_mxfp8_param_ag: false
+model:
+  _target_: megatron.bridge.models.nemotron_vl.nemotron_vl_provider.NemotronNano12Bv2VLModelProvider
+  account_for_embedding_in_pipeline_split: false
+  account_for_loss_in_pipeline_split: false
+  activation_func:
+    _call_: false
+    _target_: megatron.core.activations.squared_relu
+  activation_func_clamp_value: null
+  activation_func_fp8_input_store: false
+  add_bias_linear: false
+  add_qkv_bias: false
+  apply_query_key_layer_scaling: false
+  apply_residual_connection_post_layernorm: false
+  apply_rope_fusion: true
+  async_tensor_model_parallel_allreduce: false
+  attention_backend:
+    _args_:
+    - 1
+    _call_: true
+    _target_: megatron.core.transformer.enums.AttnBackend
+  attention_dropout: 0.0
+  attention_output_gate: false
+  attention_softmax_in_fp32: true
+  autocast_dtype:
+    _call_: false
+    _target_: torch.bfloat16
+  barrier_with_L1_time: true
+  batch_p2p_comm: true
+  batch_p2p_sync: true
+  bf16: true
+  bias_activation_fusion: false
+  bias_dropout_fusion: true
+  calculate_per_token_loss: false
+  clone_scatter_output_in_embedding: true
+  config_logger_dir: ''
+  context_parallel_size: 1
+  cp_comm_type: null
+  cpu_offloading: false
+  cpu_offloading_activations: true
+  cpu_offloading_double_buffering: false
+  cpu_offloading_num_layers: 0
+  cpu_offloading_weights: false
+  cross_entropy_fusion_impl: native
+  cross_entropy_loss_fusion: true
+  cuda_graph_impl: none
+  cuda_graph_retain_backward_graph: false
+  cuda_graph_scope: []
+  cuda_graph_use_single_mempool: false
+  cuda_graph_warmup_steps: 3
+  deallocate_pipeline_outputs: true
+  defer_embedding_wgrad_compute: false
+  delay_wgrad_compute: false
+  deterministic_mode: false
+  disable_bf16_reduced_precision_matmul: false
+  disable_parameter_transpose_cache: false
+  distribute_saved_activations: null
+  embedding_init_method:
+    _args_: []
+    _partial_: true
+    _target_: torch.nn.init.normal_
+    mean: 0.0
+    std: 0.02
+  embedding_init_method_std: 0.02
+  enable_autocast: false
+  enable_cuda_graph: false
+  expert_model_parallel_size: 1
+  expert_tensor_parallel_size: 8
+  external_cuda_graph: false
+  fallback_to_eager_attn: false
+  ffn_hidden_size: 20480
+  finalize_model_grads_func:
+    _args_: []
+    _partial_: true
+    _target_: megatron.core.distributed.finalize_model_grads.finalize_model_grads
+    pg_collection:
+      _call_: true
+      _target_: megatron.core.process_groups_config.ProcessGroupCollection
+  fine_grained_activation_offloading: false
+  first_last_layers_bf16: false
+  flash_decode: false
+  fp16: false
+  fp16_lm_cross_entropy: false
+  fp32_residual_connection: false
+  fp4: null
+  fp4_param: false
+  fp4_quantizer_factory: null
+  fp4_recipe: nvfp4
+  fp8: null
+  fp8_amax_compute_algo: most_recent
+  fp8_amax_history_len: 1
+  fp8_dot_product_attention: false
+  fp8_interval: 1
+  fp8_margin: 0
+  fp8_multi_head_attention: false
+  fp8_param: false
+  fp8_quantizer_factory: null
+  fp8_recipe: tensorwise
+  fp8_wgrad: true
+  freeze_language_model: false
+  freeze_vision_model: false
+  freeze_vision_projection: false
+  fused_single_qkv_rope: false
+  gated_linear_unit: false
+  generation_config: null
+  glu_linear_offset: 0.0
+  grad_scale_func:
+    _call_: false
+    _target_: megatron.core.optimizer.optimizer.MegatronOptimizer.scale_loss
+  grad_sync_func: null
+  gradient_accumulation_fusion: false
+  hetereogenous_dist_checkpoint: false
+  heterogeneous_block_specs: false
+  hf_model_id: /work/checkpoints/hf/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16
+  hidden_dropout: 0.0
+  hidden_size: 5120
+  hierarchical_context_parallel_sizes: null
+  hybrid_attention_ratio: 0.0
+  hybrid_mlp_ratio: 0.0
+  hybrid_override_pattern: M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M-
+  inference_rng_tracker: false
+  inference_sampling_seed: 42
+  init_method:
+    _args_: []
+    _partial_: true
+    _target_: torch.nn.init.normal_
+    mean: 0.0
+    std: 0.02
+  init_method_std: 0.02
+  init_model_with_meta_device: false
+  is_hybrid_model: true
+  kv_channels: 128
+  language_model_type: nemotron5-hybrid-12b
+  layernorm_epsilon: 1.0e-05
+  layernorm_zero_centered_gamma: false
+  linear_attention_freq: null
+  linear_attention_type: null
+  linear_conv_kernel_dim: null
+  linear_key_head_dim: null
+  linear_num_key_heads: null
+  linear_num_value_heads: null
+  linear_value_head_dim: null
+  log_max_attention_logit: false
+  make_vocab_size_divisible_by: 128
+  mamba_head_dim: 80
+  mamba_num_groups: 8
+  mamba_num_heads: 128
+  mamba_stack_spec:
+    _call_: false
+    _target_: megatron.bridge.models.mamba.mamba_provider.get_default_mamba_stack_spec
+  mamba_state_dim: 128
+  masked_softmax_fusion: true
+  memory_efficient_layer_norm: false
+  microbatch_group_size_per_vp_stage: 1
+  min_offloaded_tensor_size: 1048576
+  mlp_chunks_for_prefill: 1
+  moe_apply_probs_on_input: false
+  moe_aux_loss_coeff: 0.0001
+  moe_deepep_num_sms: 20
+  moe_enable_deepep: false
+  moe_expert_capacity_factor: null
+  moe_extended_tp: false
+  moe_ffn_hidden_size: null
+  moe_flex_dispatcher_backend: deepep
+  moe_grouped_gemm: true
+  moe_hybridep_num_sms: 16
+  moe_input_jitter_eps: null
+  moe_layer_freq: 1
+  moe_layer_recompute: false
+  moe_pad_expert_input_to_capacity: false
+  moe_per_layer_logging: false
+  moe_permute_fusion: true
+  moe_router_bias_update_rate: 0.001
+  moe_router_dtype: fp32
+  moe_router_enable_expert_bias: true
+  moe_router_force_load_balancing: false
+  moe_router_fusion: false
+  moe_router_group_topk: null
+  moe_router_load_balancing_type: seq_aux_loss
+  moe_router_num_groups: null
+  moe_router_padding_for_fp8: false
+  moe_router_padding_for_quantization: false
+  moe_router_pre_softmax: false
+  moe_router_score_function: sigmoid
+  moe_router_topk: 2
+  moe_router_topk_limited_devices: null
+  moe_router_topk_scaling_factor: null
+  moe_shared_expert_gate: false
+  moe_shared_expert_intermediate_size: null
+  moe_shared_expert_overlap: true
+  moe_token_dispatcher_type: alltoall
+  moe_token_drop_policy: probs
+  moe_token_dropping: false
+  moe_use_legacy_grouped_gemm: false
+  moe_z_loss_coeff: null
+  mrope_section: null
+  mtp_loss_scaling_factor: null
+  mtp_num_layers: null
+  mtp_standalone: false
+  multi_latent_attention: false
+  no_rope_freq: null
+  no_sync_func: null
+  normalization: RMSNorm
+  num_attention_heads: 40
+  num_layers: 62
+  num_layers_at_end_in_bf16: 0
+  num_layers_at_start_in_bf16: 0
+  num_layers_in_first_pipeline_stage: null
+  num_layers_in_last_pipeline_stage: null
+  num_microbatches_with_partial_activation_checkpoints: null
+  num_moe_experts: null
+  num_query_groups: 8
+  offload_modules: null
+  output_layer_init_method:
+    _args_: []
+    _partial_: true
+    _target_: torch.nn.init.normal_
+    mean: 0.0
+    std: 0.00254000254000381
+  overlap_moe_expert_parallel_comm: false
+  overlap_p2p_comm: false
+  overlap_p2p_comm_warmup_flush: false
+  parallel_output: true
+  param_sync_func: null
+  params_dtype:
+    _call_: false
+    _target_: torch.bfloat16
+  perform_initialization: true
+  persist_layer_norm: true
+  pipeline_dtype:
+    _call_: false
+    _target_: torch.bfloat16
+  pipeline_model_parallel_comm_backend: null
+  pipeline_model_parallel_layout: null
+  pipeline_model_parallel_size: 1
+  position_embedding_type: none
+  qk_clip: false
+  qk_clip_alpha: 0.5
+  qk_clip_threshold: 100
+  qk_layernorm: false
+  quant_recipe: null
+  recompute_granularity: null
+  recompute_method: null
+  recompute_modules:
+  - core_attn
+  recompute_num_layers: null
+  rotary_base: 10000
+  rotary_interleaved: false
+  rotary_percent: 1.0
+  scatter_embedding_sequence_parallel: false
+  seq_len_interpolation_factor: null
+  seq_length: 8192
+  sequence_parallel: false
+  share_embeddings_and_output_weights: false
+  should_pad_vocab: false
+  softmax_scale: null
+  softmax_type: vanilla
+  symmetric_ar_type: null
+  tensor_model_parallel_size: 8
+  test_mode: false
+  timers:
+    _call_: true
+    _target_: megatron.core.timers.Timers
+  tp_comm_atomic_ag: false
+  tp_comm_atomic_rs: false
+  tp_comm_bootstrap_backend: nccl
+  tp_comm_bulk_dgrad: true
+  tp_comm_bulk_wgrad: true
+  tp_comm_overlap: false
+  tp_comm_overlap_ag: true
+  tp_comm_overlap_disable_fc1: false
+  tp_comm_overlap_disable_qkv: false
+  tp_comm_overlap_rs: true
+  tp_comm_overlap_rs_dgrad: false
+  tp_comm_split_ag: true
+  tp_comm_split_rs: true
+  tp_only_amax_red: false
+  transformer_impl: transformer_engine
+  use_cpu_initialization: false
+  use_fused_weighted_squared_relu: false
+  use_inference_optimized_layers: false
+  use_kitchen: false
+  use_mamba_mem_eff_path: true
+  use_ring_exchange_p2p: false
+  use_te_activation_func: false
+  use_te_rng_tracker: false
+  variable_seq_lengths: false
+  virtual_pipeline_model_parallel_size: null
+  vision_model_type: radio
+  vocab_size: 132096
+  wgrad_deferral_limit: 0
+  window_attn_skip_freq: null
+  window_size: null
+nvrx_straggler: null
+optimizer:
+  _target_: megatron.bridge.training.config.OptimizerConfig
+  adam_beta1: 0.9
+  adam_beta2: 0.95
+  adam_eps: 1.0e-05
+  barrier_with_L1_time: false
+  bf16: true
+  clip_grad: 1.25
+  config_logger_dir: ''
+  decoupled_weight_decay: true
+  exp_avg_dtype:
+    _call_: false
+    _target_: torch.float32
+  exp_avg_sq_dtype:
+    _call_: false
+    _target_: torch.float32
+  fp16: false
+  fp8_recipe: tensorwise
+  hysteresis: 2
+  initial_loss_scale: 4294967296
+  log_num_zeros_in_grad: false
+  loss_scale: null
+  loss_scale_window: 1000
+  lr: 1.0e-05
+  main_grads_dtype:
+    _call_: false
+    _target_: torch.float32
+  main_params_dtype:
+    _call_: false
+    _target_: torch.float32
+  min_loss_scale: 1.0
+  min_lr: 1.0e-06
+  muon_extra_scale_factor: 1.0
+  muon_fp32_matmul_prec: medium
+  muon_momentum: 0.95
+  muon_num_ns_steps: 5
+  muon_scale_mode: spectral
+  muon_split_qkv: true
+  muon_tp_mode: blockwise
+  muon_use_nesterov: false
+  optimizer: adam
+  optimizer_cpu_offload: false
+  optimizer_offload_fraction: 0.0
+  overlap_cpu_optimizer_d2h_h2d: false
+  overlap_param_gather: false
+  overlap_param_gather_with_optimizer_step: false
+  params_dtype:
+    _call_: false
+    _target_: torch.bfloat16
+  pin_cpu_grads: true
+  pin_cpu_params: true
+  reuse_grad_buf_for_mxfp8_param_ag: false
+  sgd_momentum: 0.9
+  store_param_remainders: true
+  timers:
+    _call_: true
+    _target_: megatron.core.timers.Timers
+  use_distributed_optimizer: true
+  use_precision_aware_optimizer: false
+  use_torch_optimizer_for_cpu_offload: false
+  weight_decay: 0.1
+peft: null
+profiling:
+  _target_: megatron.bridge.training.config.ProfilingConfig
+  memory_snapshot_path: snapshot.pickle
+  nvtx_ranges: false
+  profile_ranks:
+  - 0
+  profile_step_end: 12
+  profile_step_start: 10
+  record_memory_history: false
+  record_shapes: false
+  use_nsys_profiler: false
+  use_pytorch_profiler: false
+rerun_state_machine:
+  _target_: megatron.bridge.training.config.RerunStateMachineConfig
+  check_for_nan_in_loss: true
+  check_for_spiky_loss: false
+  error_injection_rate: 0
+  error_injection_type: transient_error
+  rerun_mode: disabled
+rng:
+  _target_: megatron.bridge.training.config.RNGConfig
+  data_parallel_random_init: false
+  inference_rng_tracker: false
+  seed: 42
+  te_rng_tracker: false
+scheduler:
+  _target_: megatron.bridge.training.config.SchedulerConfig
+  end_weight_decay: 0.033
+  lr_decay_iters: 1500
+  lr_decay_samples: null
+  lr_decay_steps: 48000
+  lr_decay_style: cosine
+  lr_warmup_fraction: null
+  lr_warmup_init: 0.0
+  lr_warmup_iters: 200
+  lr_warmup_samples: 0
+  lr_warmup_steps: 6400
+  lr_wsd_decay_iters: null
+  lr_wsd_decay_samples: null
+  lr_wsd_decay_style: exponential
+  no_weight_decay_cond_type: null
+  override_opt_param_scheduler: true
+  start_weight_decay: 0.033
+  use_checkpoint_opt_param_scheduler: false
+  wd_incr_steps: 48000
+  weight_decay_incr_style: constant
+  wsd_decay_steps: null
+straggler: null
+tensor_inspect: null
+tokenizer:
+  _target_: megatron.bridge.training.tokenizers.config.TokenizerConfig
+  hf_tokenizer_kwargs: {}
+  image_tag_type: null
+  merge_file: null
+  special_tokens: null
+  tiktoken_num_special_tokens: 1000
+  tiktoken_pattern: null
+  tiktoken_special_tokens: null
+  tokenizer_model: null
+  tokenizer_prompt_format: null
+  tokenizer_type: NullTokenizer
+  vocab_extra_ids: 0
+  vocab_file: null
+  vocab_size: 32000
+train:
+  _target_: megatron.bridge.training.config.TrainingConfig
+  check_weight_hash_across_dp_replicas_interval: null
+  decrease_batch_size_if_needed: false
+  empty_unused_memory_level: 0
+  eval_interval: 500
+  eval_iters: 0
+  exit_duration_in_mins: null
+  exit_interval: null
+  exit_signal:
+    _args_:
+    - 15
+    _call_: true
+    _target_: signal.Signals
+  exit_signal_handler: false
+  exit_signal_handler_for_dataloader: false
+  global_batch_size: 32
+  iterations_to_skip: []
+  manual_gc: true
+  manual_gc_eval: 100
+  manual_gc_interval: 100
+  micro_batch_size: 1
+  rampup_batch_size: null
+  skip_train: false
+  train_iters: 1500
+  train_samples: null
+  train_sync_interval: null
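run_config.yaml serializes the complete megatron.bridge.training.config.ConfigContainer using Hydra-style markers: `_target_` names a class or function, `_args_`/`_partial_` follow `hydra.utils.instantiate` conventions, and `_call_: false` appears to mark a reference that should be imported rather than called (this flag is not a Hydra builtin, so a loader must special-case it). A hedged sketch of resolving one such reference, assuming hydra-core >= 1.2 and that the referenced modules are importable; `resolve_reference` is a hypothetical helper, not part of Megatron-Bridge:

```python
from hydra.utils import get_object
from omegaconf import OmegaConf

cfg = OmegaConf.load("run_config.yaml")

def resolve_reference(node):
    # Hypothetical helper: a _call_: false node points at an object
    # (e.g. torch.bfloat16 itself) instead of describing a constructor call.
    if "_target_" in node and node.get("_call_") is False:
        return get_object(node["_target_"])
    return node

print(resolve_reference(cfg.model.params_dtype))  # torch.bfloat16
```

At a glance, the parallelism settings here (tensor_model_parallel_size: 8, pipeline_model_parallel_size: 1) match the "tp8" checkpoint directory name and the eight `__N_*.distcp` writer ranks above.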
train_state.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:797cb34b03b9e704bdedc02850eae91dd0fb413270cbc3becd34e3913e9dea86
+size 3405
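train_state.pt is a small (3.4 kB) pickled object holding training-loop state next to the sharded weights. A minimal peek, with the caveat that the exact schema belongs to Megatron-Bridge and is an assumption here:

```python
import torch

# Tiny metadata file; fine to load on CPU. weights_only=False because it
# stores an arbitrary training-state object, not just tensors (assumption).
state = torch.load("train_state.pt", map_location="cpu", weights_only=False)
print(type(state), state)  # likely iteration / consumed-samples counters
```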