Commit 7754092 (verified) by jihyeonl
Parent: 9e50f9a

Upload folder using huggingface_hub

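The commit message says the folder was pushed with huggingface_hub. For reference, a minimal sketch of how such an upload is typically issued (the repo id and local path below are placeholders, not values from this commit):

```python
from huggingface_hub import HfApi

api = HfApi()

# Uploads every file under folder_path in a single commit; files matching
# the LFS patterns in .gitattributes are stored as LFS pointers.
api.upload_folder(
    folder_path="/path/to/local/checkpoint",  # placeholder path
    repo_id="user/repo",                      # placeholder repo id
    commit_message="Upload folder using huggingface_hub",
)
```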
.gitattributes CHANGED
@@ -33,3 +33,20 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+.metadata filter=lfs diff=lfs merge=lfs -text
+__0_0.distcp filter=lfs diff=lfs merge=lfs -text
+__0_1.distcp filter=lfs diff=lfs merge=lfs -text
+__1_0.distcp filter=lfs diff=lfs merge=lfs -text
+__1_1.distcp filter=lfs diff=lfs merge=lfs -text
+__2_0.distcp filter=lfs diff=lfs merge=lfs -text
+__2_1.distcp filter=lfs diff=lfs merge=lfs -text
+__3_0.distcp filter=lfs diff=lfs merge=lfs -text
+__3_1.distcp filter=lfs diff=lfs merge=lfs -text
+__4_0.distcp filter=lfs diff=lfs merge=lfs -text
+__4_1.distcp filter=lfs diff=lfs merge=lfs -text
+__5_0.distcp filter=lfs diff=lfs merge=lfs -text
+__5_1.distcp filter=lfs diff=lfs merge=lfs -text
+__6_0.distcp filter=lfs diff=lfs merge=lfs -text
+__6_1.distcp filter=lfs diff=lfs merge=lfs -text
+__7_0.distcp filter=lfs diff=lfs merge=lfs -text
+__7_1.distcp filter=lfs diff=lfs merge=lfs -text
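These new rules route the checkpoint shards through Git LFS, so the repository stores lightweight pointer files instead of multi-gigabyte blobs. As a rough illustration (not part of the commit), slash-free gitattributes patterns match against the file's basename, roughly like shell globbing; gitattributes matching is gitignore-style, so fnmatch is only an approximation:

```python
from fnmatch import fnmatch

# A few of the patterns added above (slash-free patterns match basenames).
patterns = ["*.zip", "*tfevents*", ".metadata", "__0_0.distcp"]

def lfs_tracked(name: str) -> bool:
    return any(fnmatch(name, p) for p in patterns)

assert lfs_tracked("__0_0.distcp")
assert not lfs_tracked("run_config.yaml")
```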
.metadata ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3bfb224cfbc7e01c6813317f09fa155206a44d5565930468ccc2949598d77a8a
+size 8037545
__0_0.distcp ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d3145f1f09dd6f9d6bae6c2631a968cb90d57a482cdf8215a92f1a62d656991
+size 11669263879
__0_1.distcp ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bb86819db4ff5906e58127db20403504f6ebc32a14627b34a17d40e51861e7b3
+size 11669248562
__1_0.distcp ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0fc644960138eb1e3ccb0468963087f9b3882b31f77c611b6ee64ed470a27a76
+size 11647286277
__1_1.distcp ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:71d5cce0bc8468671e00ca143b026020c8a54da3435253b42cd051e3eca968fe
+size 11647300081
__2_0.distcp ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7d5a40d5ae65d6ff53626c4ea4e8e9a327fe113d12727a76bdd7ad0343e94050
+size 11647286277
__2_1.distcp ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fed334a22e4af31f3238668864ba0e4fd257bcd949197b3675b3391a0e3f963d
+size 11647300081
__3_0.distcp ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e0ad6debdb19d305b1d6d0978904a0ff802577656f2f5cdb2b203c47f7b076d4
+size 11647286277
__3_1.distcp ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:96acd10900eb3086011a53c9426cca2d5c821143ff032d717bf367a86586f0f2
+size 11647300081
__4_0.distcp ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dfc3b518376d7ac14ece702c207a12805882e6b2d4be033e5b37795196d83584
+size 11647286277
__4_1.distcp ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d1e9d63d3f523927a73bbf1f5c1b0abfff772d0cf29991e015c11457db7055fc
+size 11647300081
__5_0.distcp ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c42ed5ace9bb650b4c3db4066c752f2cf65f5a7d6298307dacf7170429463b4f
+size 11647286277
__5_1.distcp ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f3f9a9e0cc50cf7cf67f112a35c655a5fa719bf3470f892414f5aa8ed8fc08a
+size 11647300081
__6_0.distcp ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:522d67d5d2c76894fd6b665e4b532c3f1f2486bfe49317d9c261976599a046a3
+size 11647286277
__6_1.distcp ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:68685712037c1b940229723cc04c8d1e5f2e721bafc71efe891d9972545f65ed
+size 11647300081
__7_0.distcp ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:16f45912c2cb95a14620c5532fcf43d968b22f43bd2c9e590a23471e76b43306
+size 11647286277
__7_1.distcp ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:210b4eb2b5b1caea0cbde127e34162720815e67e0d581d0731bab8d916227fc4
+size 11647300081
common.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0f656d00004c0cea72d54764b7be55a4852dd99d38fdd1065de5f34a7d81edce
+size 1773
metadata.json ADDED
@@ -0,0 +1 @@
+{"sharded_backend": "torch_dist", "sharded_backend_version": 1, "common_backend": "torch", "common_backend_version": 1}
modelopt_run_config.yaml ADDED
@@ -0,0 +1,163 @@
+activation_func: <function squared_relu at 0x749c0c174fe0>
+activation_func_clamp_value: None
+add_bias_linear: false
+add_qkv_bias: false
+apply_query_key_layer_scaling: false
+apply_residual_connection_post_layernorm: false
+apply_rope_fusion: true
+attention_backend: AttnBackend.flash
+attention_dropout: '0.0'
+attention_output_gate: false
+attention_softmax_in_fp32: true
+autocast_dtype: torch.bfloat16
+barrier_with_L1_time: true
+bf16: true
+bias_activation_fusion: false
+bias_dropout_fusion: true
+calculate_per_token_loss: false
+clone_scatter_output_in_embedding: true
+config_logger_dir: ''
+cross_entropy_fusion_impl: native
+cross_entropy_loss_fusion: true
+defer_embedding_wgrad_compute: false
+delay_wgrad_compute: false
+deterministic_mode: false
+disable_bf16_reduced_precision_matmul: false
+disable_parameter_transpose_cache: false
+distribute_saved_activations: None
+enable_autocast: false
+fallback_to_eager_attn: false
+ffn_hidden_size: 20480
+finalize_model_grads_func: None
+fine_grained_activation_offloading: false
+first_last_layers_bf16: false
+flash_decode: false
+fp16: false
+fp16_lm_cross_entropy: false
+fp32_residual_connection: false
+freeze_language_model: false
+freeze_vision_model: false
+freeze_vision_projection: false
+fused_single_qkv_rope: false
+gated_linear_unit: false
+generation_config: None
+glu_linear_offset: '0.0'
+grad_scale_func: None
+grad_sync_func: None
+gradient_accumulation_fusion: false
+hetereogenous_dist_checkpoint: false
+heterogeneous_block_specs: false
+hf_model_id: /work/checkpoints/hf/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16
+hidden_dropout: '0.0'
+hidden_size: 5120
+hybrid_attention_ratio: '0.0'
+hybrid_mlp_ratio: '0.0'
+hybrid_override_pattern: M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M-
+is_hybrid_model: true
+kv_channels: 128
+language_model_type: nemotron5-hybrid-12b
+layernorm_epsilon: 1e-05
+layernorm_zero_centered_gamma: false
+linear_attention_freq: None
+linear_attention_type: None
+linear_conv_kernel_dim: None
+linear_key_head_dim: None
+linear_num_key_heads: None
+linear_num_value_heads: None
+linear_value_head_dim: None
+log_max_attention_logit: false
+make_vocab_size_divisible_by: 128
+mamba_head_dim: 80
+mamba_num_groups: 8
+mamba_num_heads: 128
+mamba_stack_spec: <function get_default_mamba_stack_spec at 0x749bedf1d620>
+mamba_state_dim: 128
+masked_softmax_fusion: true
+memory_efficient_layer_norm: false
+min_offloaded_tensor_size: 1048576
+mlp_chunks_for_prefill: 1
+moe_apply_probs_on_input: false
+moe_aux_loss_coeff: '0.0001'
+moe_deepep_num_sms: 20
+moe_enable_deepep: false
+moe_expert_capacity_factor: None
+moe_extended_tp: false
+moe_ffn_hidden_size: None
+moe_flex_dispatcher_backend: deepep
+moe_grouped_gemm: true
+moe_hybridep_num_sms: 16
+moe_input_jitter_eps: None
+moe_layer_freq: 1
+moe_pad_expert_input_to_capacity: false
+moe_per_layer_logging: false
+moe_permute_fusion: true
+moe_router_bias_update_rate: '0.001'
+moe_router_dtype: fp32
+moe_router_enable_expert_bias: true
+moe_router_force_load_balancing: false
+moe_router_fusion: false
+moe_router_group_topk: None
+moe_router_load_balancing_type: seq_aux_loss
+moe_router_num_groups: None
+moe_router_padding_for_quantization: false
+moe_router_pre_softmax: false
+moe_router_score_function: sigmoid
+moe_router_topk: 2
+moe_router_topk_limited_devices: None
+moe_router_topk_scaling_factor: None
+moe_shared_expert_gate: false
+moe_shared_expert_intermediate_size: None
+moe_shared_expert_overlap: true
+moe_token_dispatcher_type: alltoall
+moe_token_drop_policy: probs
+moe_token_dropping: false
+moe_use_legacy_grouped_gemm: false
+moe_z_loss_coeff: None
+mrope_section: None
+multi_latent_attention: false
+no_rope_freq: None
+no_sync_func: None
+normalization: RMSNorm
+num_attention_heads: 40
+num_layers: 62
+num_layers_at_end_in_bf16: 0
+num_layers_at_start_in_bf16: 0
+num_moe_experts: None
+num_query_groups: 8
+nvidia_modelopt_version: 0.37.0
+offload_modules: None
+param_sync_func: None
+params_dtype: torch.bfloat16
+perform_initialization: true
+persist_layer_norm: true
+position_embedding_type: none
+qk_clip: false
+qk_clip_alpha: '0.5'
+qk_clip_threshold: 100
+qk_layernorm: false
+quant_recipe: None
+rotary_base: 10000
+rotary_interleaved: false
+rotary_percent: '1.0'
+seq_len_interpolation_factor: None
+seq_length: 8192
+share_embeddings_and_output_weights: false
+should_pad_vocab: false
+softmax_scale: None
+softmax_type: vanilla
+symmetric_ar_type: None
+test_mode: false
+timers: None
+transformer_impl: transformer_engine
+use_fused_weighted_squared_relu: false
+use_kitchen: false
+use_mamba_mem_eff_path: true
+use_ring_exchange_p2p: false
+use_te_activation_func: false
+use_te_rng_tracker: false
+variable_seq_lengths: false
+vision_model_type: radio
+vocab_size: 132096
+wgrad_deferral_limit: 0
+window_attn_skip_freq: None
+window_size: None
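modelopt_run_config.yaml is a flat key/value dump of the model settings recorded by NVIDIA ModelOpt (note `nvidia_modelopt_version: 0.37.0`). Several entries are Python reprs or quoted numbers rather than structured values, so they round-trip only as strings. A small sketch of what loading it actually yields, assuming PyYAML:

```python
import yaml

with open("modelopt_run_config.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["hidden_size"], cfg["num_layers"])  # 5120 62

# Repr-style and quoted entries come back as plain strings:
print(cfg["activation_func"])           # "<function squared_relu at 0x...>"
print(repr(cfg["attention_dropout"]))   # "'0.0'" -- quoted in the dump
print(cfg["distribute_saved_activations"])  # the string "None", not null
```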
run_config.yaml ADDED
@@ -0,0 +1,616 @@
+_target_: megatron.bridge.training.config.ConfigContainer
+checkpoint:
+  _target_: megatron.bridge.training.config.CheckpointConfig
+  async_save: false
+  ckpt_assume_constant_structure: false
+  ckpt_convert_format: null
+  ckpt_convert_save: null
+  ckpt_format: torch_dist
+  ckpt_step: null
+  dist_ckpt_optim_fully_reshardable: false
+  dist_ckpt_save_pre_mcore_014: false
+  dist_ckpt_strictness: assume_ok_unexpected
+  distrib_optim_fully_reshardable_mem_efficient: false
+  exit_on_missing_checkpoint: false
+  finetune: true
+  fully_parallel_load: false
+  fully_parallel_save: true
+  load: /work/nemo-visual-systems/training/nemo_experiments/nemotron_nano_v2_vl_pretrain/checkpoints
+  load_main_params_from_ckpt: false
+  load_optim: true
+  load_rng: true
+  most_recent_k: -1
+  non_persistent_ckpt_type: null
+  non_persistent_global_ckpt_dir: null
+  non_persistent_local_ckpt_algo: fully_parallel
+  non_persistent_local_ckpt_dir: null
+  non_persistent_save_interval: null
+  pretrained_checkpoint: /work/checkpoints/mb/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16
+  replication: false
+  replication_factor: 2
+  replication_jump: null
+  save: /work/nemo-visual-systems/training/checkpoints/nemo-vs-tp8
+  save_interval: 1000
+  save_optim: true
+  save_rng: true
+  save_tokenizer_assets: true
+  strict_fsdp_dtensor_load: false
+  use_checkpoint_args: false
+  use_persistent_ckpt_worker: true
+comm_overlap: null
+dataset:
+  _target_: megatron.bridge.data.vlm_datasets.preloaded_provider.PreloadedVLMConversationProvider
+  data_sharding: true
+  dataloader_type: single
+  hf_processor_path: /work/checkpoints/hf/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16
+  image_folder: /work/datasets
+  num_workers: 2
+  persistent_workers: false
+  pin_memory: true
+  sequence_length: 8192
+  skip_getting_attention_mask_from_dataset: true
+  test_data_path: /work/datasets/merged_dataset/test.jsonl
+  train_data_path: /work/datasets/merged_dataset/train.jsonl
+  valid_data_path: /work/datasets/merged_dataset/valid.jsonl
+ddp:
+  _target_: megatron.bridge.training.config.DistributedDataParallelConfig
+  align_param_gather: false
+  average_in_collective: false
+  bucket_size: null
+  check_for_large_grads: false
+  check_for_nan_in_grad: true
+  data_parallel_sharding_strategy: optim_grads_params
+  delay_wgrad_compute: false
+  disable_symmetric_registration: false
+  fp8_param_gather: false
+  fsdp_double_buffer: false
+  grad_reduce_in_fp32: true
+  gradient_reduce_div_fusion: true
+  keep_fp8_transpose_cache: false
+  nccl_ub: false
+  num_distributed_optimizer_instances: 1
+  outer_dp_sharding_strategy: no_shard
+  overlap_grad_reduce: false
+  overlap_param_gather: false
+  pad_buckets_for_high_nccl_busbw: false
+  preserve_fp32_weights: true
+  reduce_scatter_with_fp32_accumulation: false
+  reuse_grad_buf_for_mxfp8_param_ag: false
+  suggested_communication_unit_size: null
+  use_custom_fsdp: false
+  use_distributed_optimizer: true
+  use_megatron_fsdp: false
+dist:
+  _target_: megatron.bridge.training.config.DistributedInitConfig
+  align_grad_reduce: true
+  disable_jit_fuser: false
+  distributed_backend: nccl
+  distributed_timeout_minutes: 10
+  distributed_timeout_seconds_after_init: null
+  enable_megatron_core_experimental: false
+  external_gpu_device_mapping: false
+  high_priority_stream_groups: null
+  lazy_init: false
+  local_rank: 0
+  nccl_communicator_config_path: null
+  sharp_enabled_group: null
+  use_gloo_process_groups: true
+  use_megatron_fsdp: false
+  use_sharp: false
+  use_torch_fsdp2: false
+  use_tp_pp_dp_mapping: false
+ft: null
+inprocess_restart: null
+logger:
+  _target_: megatron.bridge.training.config.LoggerConfig
+  filter_warnings: true
+  log_energy: false
+  log_interval: 1
+  log_l2_norm_grad_to_tensorboard: false
+  log_loss_scale_to_tensorboard: true
+  log_memory_to_tensorboard: false
+  log_params_norm: false
+  log_progress: false
+  log_runtime_to_tensorboard: false
+  log_throughput: false
+  log_throughput_to_tensorboard: false
+  log_timers_to_tensorboard: true
+  log_validation_ppl_to_tensorboard: false
+  log_world_size_to_tensorboard: false
+  logging_level: 20
+  memory_keys: null
+  modules_to_filter: null
+  runtime_time_unit: hours
+  save_config_filepath: null
+  set_level_for_all_loggers: false
+  tensorboard_dir: /work/nemo-visual-systems/training/nemo_experiments/nemotron_nano_v2_vl_pretrain/tb_logs
+  tensorboard_log_interval: 1
+  tensorboard_queue_size: 1000
+  throughput_window_size: 100
+  timing_log_level: 0
+  timing_log_option: minmax
+  wandb_entity: null
+  wandb_exp_name: merged-sft-tp8
+  wandb_project: nemo-vs
+  wandb_save_dir: /work/nemo-visual-systems/training/checkpoints
+mixed_precision:
+  _target_: megatron.bridge.training.mixed_precision.MixedPrecisionConfig
+  autocast_dtype: null
+  autocast_enabled: false
+  bf16: true
+  first_last_layers_bf16: false
+  fp16: false
+  fp32: false
+  fp4: null
+  fp4_recipe: nvfp4
+  fp8: null
+  fp8_amax_compute_algo: most_recent
+  fp8_amax_history_len: 1
+  fp8_dot_product_attention: false
+  fp8_margin: 0
+  fp8_multi_head_attention: false
+  fp8_param: false
+  fp8_param_gather: false
+  fp8_recipe: tensorwise
+  fp8_wgrad: true
+  grad_reduce_in_fp32: true
+  hysteresis: 2
+  initial_loss_scale: 4294967296
+  loss_scale: null
+  loss_scale_window: 1000
+  min_loss_scale: 1.0
+  num_layers_at_end_in_bf16: 0
+  num_layers_at_start_in_bf16: 0
+  params_dtype:
+    _call_: false
+    _target_: torch.bfloat16
+  pipeline_dtype:
+    _call_: false
+    _target_: torch.bfloat16
+  reuse_grad_buf_for_mxfp8_param_ag: false
+model:
+  _target_: megatron.bridge.models.nemotron_vl.nemotron_vl_provider.NemotronNano12Bv2VLModelProvider
+  account_for_embedding_in_pipeline_split: false
+  account_for_loss_in_pipeline_split: false
+  activation_func:
+    _call_: false
+    _target_: megatron.core.activations.squared_relu
+  activation_func_clamp_value: null
+  activation_func_fp8_input_store: false
+  add_bias_linear: false
+  add_qkv_bias: false
+  apply_query_key_layer_scaling: false
+  apply_residual_connection_post_layernorm: false
+  apply_rope_fusion: true
+  async_tensor_model_parallel_allreduce: false
+  attention_backend:
+    _args_:
+    - 1
+    _call_: true
+    _target_: megatron.core.transformer.enums.AttnBackend
+  attention_dropout: 0.0
+  attention_output_gate: false
+  attention_softmax_in_fp32: true
+  autocast_dtype:
+    _call_: false
+    _target_: torch.bfloat16
+  barrier_with_L1_time: true
+  batch_p2p_comm: true
+  batch_p2p_sync: true
+  bf16: true
+  bias_activation_fusion: false
+  bias_dropout_fusion: true
+  calculate_per_token_loss: false
+  clone_scatter_output_in_embedding: true
+  config_logger_dir: ''
+  context_parallel_size: 1
+  cp_comm_type: null
+  cpu_offloading: false
+  cpu_offloading_activations: true
+  cpu_offloading_double_buffering: false
+  cpu_offloading_num_layers: 0
+  cpu_offloading_weights: false
+  cross_entropy_fusion_impl: native
+  cross_entropy_loss_fusion: true
+  cuda_graph_impl: none
+  cuda_graph_retain_backward_graph: false
+  cuda_graph_scope: []
+  cuda_graph_use_single_mempool: false
+  cuda_graph_warmup_steps: 3
+  deallocate_pipeline_outputs: true
+  defer_embedding_wgrad_compute: false
+  delay_wgrad_compute: false
+  deterministic_mode: false
+  disable_bf16_reduced_precision_matmul: false
+  disable_parameter_transpose_cache: false
+  distribute_saved_activations: null
+  embedding_init_method:
+    _args_: []
+    _partial_: true
+    _target_: torch.nn.init.normal_
+    mean: 0.0
+    std: 0.02
+  embedding_init_method_std: 0.02
+  enable_autocast: false
+  enable_cuda_graph: false
+  expert_model_parallel_size: 1
+  expert_tensor_parallel_size: 8
+  external_cuda_graph: false
+  fallback_to_eager_attn: false
+  ffn_hidden_size: 20480
+  finalize_model_grads_func:
+    _args_: []
+    _partial_: true
+    _target_: megatron.core.distributed.finalize_model_grads.finalize_model_grads
+    pg_collection:
+      _call_: true
+      _target_: megatron.core.process_groups_config.ProcessGroupCollection
+  fine_grained_activation_offloading: false
+  first_last_layers_bf16: false
+  flash_decode: false
+  fp16: false
+  fp16_lm_cross_entropy: false
+  fp32_residual_connection: false
+  fp4: null
+  fp4_param: false
+  fp4_quantizer_factory: null
+  fp4_recipe: nvfp4
+  fp8: null
+  fp8_amax_compute_algo: most_recent
+  fp8_amax_history_len: 1
+  fp8_dot_product_attention: false
+  fp8_interval: 1
+  fp8_margin: 0
+  fp8_multi_head_attention: false
+  fp8_param: false
+  fp8_quantizer_factory: null
+  fp8_recipe: tensorwise
+  fp8_wgrad: true
+  freeze_language_model: false
+  freeze_vision_model: false
+  freeze_vision_projection: false
+  fused_single_qkv_rope: false
+  gated_linear_unit: false
+  generation_config: null
+  glu_linear_offset: 0.0
+  grad_scale_func:
+    _call_: false
+    _target_: megatron.core.optimizer.optimizer.MegatronOptimizer.scale_loss
+  grad_sync_func: null
+  gradient_accumulation_fusion: false
+  hetereogenous_dist_checkpoint: false
+  heterogeneous_block_specs: false
+  hf_model_id: /work/checkpoints/hf/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16
+  hidden_dropout: 0.0
+  hidden_size: 5120
+  hierarchical_context_parallel_sizes: null
+  hybrid_attention_ratio: 0.0
+  hybrid_mlp_ratio: 0.0
+  hybrid_override_pattern: M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M-
+  inference_rng_tracker: false
+  inference_sampling_seed: 42
+  init_method:
+    _args_: []
+    _partial_: true
+    _target_: torch.nn.init.normal_
+    mean: 0.0
+    std: 0.02
+  init_method_std: 0.02
+  init_model_with_meta_device: false
+  is_hybrid_model: true
+  kv_channels: 128
+  language_model_type: nemotron5-hybrid-12b
+  layernorm_epsilon: 1.0e-05
+  layernorm_zero_centered_gamma: false
+  linear_attention_freq: null
+  linear_attention_type: null
+  linear_conv_kernel_dim: null
+  linear_key_head_dim: null
+  linear_num_key_heads: null
+  linear_num_value_heads: null
+  linear_value_head_dim: null
+  log_max_attention_logit: false
+  make_vocab_size_divisible_by: 128
+  mamba_head_dim: 80
+  mamba_num_groups: 8
+  mamba_num_heads: 128
+  mamba_stack_spec:
+    _call_: false
+    _target_: megatron.bridge.models.mamba.mamba_provider.get_default_mamba_stack_spec
+  mamba_state_dim: 128
+  masked_softmax_fusion: true
+  memory_efficient_layer_norm: false
+  microbatch_group_size_per_vp_stage: 1
+  min_offloaded_tensor_size: 1048576
+  mlp_chunks_for_prefill: 1
+  moe_apply_probs_on_input: false
+  moe_aux_loss_coeff: 0.0001
+  moe_deepep_num_sms: 20
+  moe_enable_deepep: false
+  moe_expert_capacity_factor: null
+  moe_extended_tp: false
+  moe_ffn_hidden_size: null
+  moe_flex_dispatcher_backend: deepep
+  moe_grouped_gemm: true
+  moe_hybridep_num_sms: 16
+  moe_input_jitter_eps: null
+  moe_layer_freq: 1
+  moe_layer_recompute: false
+  moe_pad_expert_input_to_capacity: false
+  moe_per_layer_logging: false
+  moe_permute_fusion: true
+  moe_router_bias_update_rate: 0.001
+  moe_router_dtype: fp32
+  moe_router_enable_expert_bias: true
+  moe_router_force_load_balancing: false
+  moe_router_fusion: false
+  moe_router_group_topk: null
+  moe_router_load_balancing_type: seq_aux_loss
+  moe_router_num_groups: null
+  moe_router_padding_for_fp8: false
+  moe_router_padding_for_quantization: false
+  moe_router_pre_softmax: false
+  moe_router_score_function: sigmoid
+  moe_router_topk: 2
+  moe_router_topk_limited_devices: null
+  moe_router_topk_scaling_factor: null
+  moe_shared_expert_gate: false
+  moe_shared_expert_intermediate_size: null
+  moe_shared_expert_overlap: true
+  moe_token_dispatcher_type: alltoall
+  moe_token_drop_policy: probs
+  moe_token_dropping: false
+  moe_use_legacy_grouped_gemm: false
+  moe_z_loss_coeff: null
+  mrope_section: null
+  mtp_loss_scaling_factor: null
+  mtp_num_layers: null
+  mtp_standalone: false
+  multi_latent_attention: false
+  no_rope_freq: null
+  no_sync_func: null
+  normalization: RMSNorm
+  num_attention_heads: 40
+  num_layers: 62
+  num_layers_at_end_in_bf16: 0
+  num_layers_at_start_in_bf16: 0
+  num_layers_in_first_pipeline_stage: null
+  num_layers_in_last_pipeline_stage: null
+  num_microbatches_with_partial_activation_checkpoints: null
+  num_moe_experts: null
+  num_query_groups: 8
+  offload_modules: null
+  output_layer_init_method:
+    _args_: []
+    _partial_: true
+    _target_: torch.nn.init.normal_
+    mean: 0.0
+    std: 0.00254000254000381
+  overlap_moe_expert_parallel_comm: false
+  overlap_p2p_comm: false
+  overlap_p2p_comm_warmup_flush: false
+  parallel_output: true
+  param_sync_func: null
+  params_dtype:
+    _call_: false
+    _target_: torch.bfloat16
+  perform_initialization: true
+  persist_layer_norm: true
+  pipeline_dtype:
+    _call_: false
+    _target_: torch.bfloat16
+  pipeline_model_parallel_comm_backend: null
+  pipeline_model_parallel_layout: null
+  pipeline_model_parallel_size: 1
+  position_embedding_type: none
+  qk_clip: false
+  qk_clip_alpha: 0.5
+  qk_clip_threshold: 100
+  qk_layernorm: false
+  quant_recipe: null
+  recompute_granularity: null
+  recompute_method: null
+  recompute_modules:
+  - core_attn
+  recompute_num_layers: null
+  rotary_base: 10000
+  rotary_interleaved: false
+  rotary_percent: 1.0
+  scatter_embedding_sequence_parallel: false
+  seq_len_interpolation_factor: null
+  seq_length: 8192
+  sequence_parallel: false
+  share_embeddings_and_output_weights: false
+  should_pad_vocab: false
+  softmax_scale: null
+  softmax_type: vanilla
+  symmetric_ar_type: null
+  tensor_model_parallel_size: 8
+  test_mode: false
+  timers:
+    _call_: true
+    _target_: megatron.core.timers.Timers
+  tp_comm_atomic_ag: false
+  tp_comm_atomic_rs: false
+  tp_comm_bootstrap_backend: nccl
+  tp_comm_bulk_dgrad: true
+  tp_comm_bulk_wgrad: true
+  tp_comm_overlap: false
+  tp_comm_overlap_ag: true
+  tp_comm_overlap_disable_fc1: false
+  tp_comm_overlap_disable_qkv: false
+  tp_comm_overlap_rs: true
+  tp_comm_overlap_rs_dgrad: false
+  tp_comm_split_ag: true
+  tp_comm_split_rs: true
+  tp_only_amax_red: false
+  transformer_impl: transformer_engine
+  use_cpu_initialization: false
+  use_fused_weighted_squared_relu: false
+  use_inference_optimized_layers: false
+  use_kitchen: false
+  use_mamba_mem_eff_path: true
+  use_ring_exchange_p2p: false
+  use_te_activation_func: false
+  use_te_rng_tracker: false
+  variable_seq_lengths: false
+  virtual_pipeline_model_parallel_size: null
+  vision_model_type: radio
+  vocab_size: 132096
+  wgrad_deferral_limit: 0
+  window_attn_skip_freq: null
+  window_size: null
+nvrx_straggler: null
+optimizer:
+  _target_: megatron.bridge.training.config.OptimizerConfig
+  adam_beta1: 0.9
+  adam_beta2: 0.95
+  adam_eps: 1.0e-05
+  barrier_with_L1_time: false
+  bf16: true
+  clip_grad: 1.25
+  config_logger_dir: ''
+  decoupled_weight_decay: true
+  exp_avg_dtype:
+    _call_: false
+    _target_: torch.float32
+  exp_avg_sq_dtype:
+    _call_: false
+    _target_: torch.float32
+  fp16: false
+  fp8_recipe: tensorwise
+  hysteresis: 2
+  initial_loss_scale: 4294967296
+  log_num_zeros_in_grad: false
+  loss_scale: null
+  loss_scale_window: 1000
+  lr: 1.0e-05
+  main_grads_dtype:
+    _call_: false
+    _target_: torch.float32
+  main_params_dtype:
+    _call_: false
+    _target_: torch.float32
+  min_loss_scale: 1.0
+  min_lr: 1.0e-06
+  muon_extra_scale_factor: 1.0
+  muon_fp32_matmul_prec: medium
+  muon_momentum: 0.95
+  muon_num_ns_steps: 5
+  muon_scale_mode: spectral
+  muon_split_qkv: true
+  muon_tp_mode: blockwise
+  muon_use_nesterov: false
+  optimizer: adam
+  optimizer_cpu_offload: false
+  optimizer_offload_fraction: 0.0
+  overlap_cpu_optimizer_d2h_h2d: false
+  overlap_param_gather: false
+  overlap_param_gather_with_optimizer_step: false
+  params_dtype:
+    _call_: false
+    _target_: torch.bfloat16
+  pin_cpu_grads: true
+  pin_cpu_params: true
+  reuse_grad_buf_for_mxfp8_param_ag: false
+  sgd_momentum: 0.9
+  store_param_remainders: true
+  timers:
+    _call_: true
+    _target_: megatron.core.timers.Timers
+  use_distributed_optimizer: true
+  use_precision_aware_optimizer: false
+  use_torch_optimizer_for_cpu_offload: false
+  weight_decay: 0.1
+peft: null
+profiling:
+  _target_: megatron.bridge.training.config.ProfilingConfig
+  memory_snapshot_path: snapshot.pickle
+  nvtx_ranges: false
+  profile_ranks:
+  - 0
+  profile_step_end: 12
+  profile_step_start: 10
+  record_memory_history: false
+  record_shapes: false
+  use_nsys_profiler: false
+  use_pytorch_profiler: false
+rerun_state_machine:
+  _target_: megatron.bridge.training.config.RerunStateMachineConfig
+  check_for_nan_in_loss: true
+  check_for_spiky_loss: false
+  error_injection_rate: 0
+  error_injection_type: transient_error
+  rerun_mode: disabled
+rng:
+  _target_: megatron.bridge.training.config.RNGConfig
+  data_parallel_random_init: false
+  inference_rng_tracker: false
+  seed: 42
+  te_rng_tracker: false
+scheduler:
+  _target_: megatron.bridge.training.config.SchedulerConfig
+  end_weight_decay: 0.033
+  lr_decay_iters: 1500
+  lr_decay_samples: null
+  lr_decay_steps: 48000
+  lr_decay_style: cosine
+  lr_warmup_fraction: null
+  lr_warmup_init: 0.0
+  lr_warmup_iters: 200
+  lr_warmup_samples: 0
+  lr_warmup_steps: 6400
+  lr_wsd_decay_iters: null
+  lr_wsd_decay_samples: null
+  lr_wsd_decay_style: exponential
+  no_weight_decay_cond_type: null
+  override_opt_param_scheduler: true
+  start_weight_decay: 0.033
+  use_checkpoint_opt_param_scheduler: false
+  wd_incr_steps: 48000
+  weight_decay_incr_style: constant
+  wsd_decay_steps: null
+straggler: null
+tensor_inspect: null
+tokenizer:
+  _target_: megatron.bridge.training.tokenizers.config.TokenizerConfig
+  hf_tokenizer_kwargs: {}
+  image_tag_type: null
+  merge_file: null
+  special_tokens: null
+  tiktoken_num_special_tokens: 1000
+  tiktoken_pattern: null
+  tiktoken_special_tokens: null
+  tokenizer_model: null
+  tokenizer_prompt_format: null
+  tokenizer_type: NullTokenizer
+  vocab_extra_ids: 0
+  vocab_file: null
+  vocab_size: 32000
+train:
+  _target_: megatron.bridge.training.config.TrainingConfig
+  check_weight_hash_across_dp_replicas_interval: null
+  decrease_batch_size_if_needed: false
+  empty_unused_memory_level: 0
+  eval_interval: 500
+  eval_iters: 0
+  exit_duration_in_mins: null
+  exit_interval: null
+  exit_signal:
+    _args_:
+    - 15
+    _call_: true
+    _target_: signal.Signals
+  exit_signal_handler: false
+  exit_signal_handler_for_dataloader: false
+  global_batch_size: 32
+  iterations_to_skip: []
+  manual_gc: true
+  manual_gc_eval: 100
+  manual_gc_interval: 100
+  micro_batch_size: 1
+  rampup_batch_size: null
+  skip_train: false
+  train_iters: 1500
+  train_samples: null
+  train_sync_interval: null
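run_config.yaml serializes the complete megatron.bridge.training.config.ConfigContainer using Hydra-style markers: `_target_` names a class or function, `_args_`/`_partial_` follow `hydra.utils.instantiate` conventions, and `_call_: false` appears to mark a reference that should be imported rather than called (this flag is not a Hydra builtin, so a loader must special-case it). A hedged sketch of resolving one such reference, assuming hydra-core >= 1.2 and that the referenced modules are importable; `resolve_reference` is a hypothetical helper, not part of Megatron-Bridge:

```python
from hydra.utils import get_object
from omegaconf import OmegaConf

cfg = OmegaConf.load("run_config.yaml")

def resolve_reference(node):
    # Hypothetical helper: a _call_: false node points at an object
    # (e.g. torch.bfloat16 itself) instead of describing a constructor call.
    if "_target_" in node and node.get("_call_") is False:
        return get_object(node["_target_"])
    return node

print(resolve_reference(cfg.model.params_dtype))  # torch.bfloat16
```

At a glance, the parallelism settings here (tensor_model_parallel_size: 8, pipeline_model_parallel_size: 1) match the "tp8" checkpoint directory name and the eight `__N_*.distcp` writer ranks above.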
train_state.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:797cb34b03b9e704bdedc02850eae91dd0fb413270cbc3becd34e3913e9dea86
+size 3405
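train_state.pt is a small (3.4 kB) pickled object holding training-loop state next to the sharded weights. A minimal peek, with the caveat that the exact schema belongs to Megatron-Bridge and is an assumption here:

```python
import torch

# Tiny metadata file; fine to load on CPU. weights_only=False because it
# stores an arbitrary training-state object, not just tensors (assumption).
state = torch.load("train_state.pt", map_location="cpu", weights_only=False)
print(type(state), state)  # likely iteration / consumed-samples counters
```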