_target_: megatron.bridge.training.config.ConfigContainer
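# Checkpointing: fine-tune (finetune: true) from the converted HF checkpoint at
# pretrained_checkpoint; fully parallel torch_dist checkpoints are written to
# save every 1000 iterations, and load points at the earlier pretrain run.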
checkpoint:
  _target_: megatron.bridge.training.config.CheckpointConfig
  async_save: false
  ckpt_assume_constant_structure: false
  ckpt_convert_format: null
  ckpt_convert_save: null
  ckpt_format: torch_dist
  ckpt_step: null
  dist_ckpt_optim_fully_reshardable: false
  dist_ckpt_save_pre_mcore_014: false
  dist_ckpt_strictness: assume_ok_unexpected
  distrib_optim_fully_reshardable_mem_efficient: false
  exit_on_missing_checkpoint: false
  finetune: true
  fully_parallel_load: false
  fully_parallel_save: true
  load: /work/nemo-visual-systems/training/nemo_experiments/nemotron_nano_v2_vl_pretrain/checkpoints
  load_main_params_from_ckpt: false
  load_optim: true
  load_rng: true
  most_recent_k: -1
  non_persistent_ckpt_type: null
  non_persistent_global_ckpt_dir: null
  non_persistent_local_ckpt_algo: fully_parallel
  non_persistent_local_ckpt_dir: null
  non_persistent_save_interval: null
  pretrained_checkpoint: /work/checkpoints/mb/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16
  replication: false
  replication_factor: 2
  replication_jump: null
  save: /work/nemo-visual-systems/training/checkpoints/nemo-vs-tp8
  save_interval: 1000
  save_optim: true
  save_rng: true
  save_tokenizer_assets: true
  strict_fsdp_dtensor_load: false
  use_checkpoint_args: false
  use_persistent_ckpt_worker: true
comm_overlap: null
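# Data: preloaded VLM conversation JSONL splits under /work/datasets/merged_dataset,
# preprocessed with the Hugging Face processor at hf_processor_path and trained at
# 8192-token sequences (matching model.seq_length below).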
dataset:
  _target_: megatron.bridge.data.vlm_datasets.preloaded_provider.PreloadedVLMConversationProvider
  data_sharding: true
  dataloader_type: single
  hf_processor_path: /work/checkpoints/hf/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16
  image_folder: /work/datasets
  num_workers: 2
  persistent_workers: false
  pin_memory: true
  sequence_length: 8192
  skip_getting_attention_mask_from_dataset: true
  test_data_path: /work/datasets/merged_dataset/test.jsonl
  train_data_path: /work/datasets/merged_dataset/train.jsonl
  valid_data_path: /work/datasets/merged_dataset/valid.jsonl
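# DDP: distributed optimizer enabled, gradients reduced in fp32 and checked for
# NaNs; grad-reduce and param-gather overlap are both off.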
ddp:
  _target_: megatron.bridge.training.config.DistributedDataParallelConfig
  align_param_gather: false
  average_in_collective: false
  bucket_size: null
  check_for_large_grads: false
  check_for_nan_in_grad: true
  data_parallel_sharding_strategy: optim_grads_params
  delay_wgrad_compute: false
  disable_symmetric_registration: false
  fp8_param_gather: false
  fsdp_double_buffer: false
  grad_reduce_in_fp32: true
  gradient_reduce_div_fusion: true
  keep_fp8_transpose_cache: false
  nccl_ub: false
  num_distributed_optimizer_instances: 1
  outer_dp_sharding_strategy: no_shard
  overlap_grad_reduce: false
  overlap_param_gather: false
  pad_buckets_for_high_nccl_busbw: false
  preserve_fp32_weights: true
  reduce_scatter_with_fp32_accumulation: false
  reuse_grad_buf_for_mxfp8_param_ag: false
  suggested_communication_unit_size: null
  use_custom_fsdp: false
  use_distributed_optimizer: true
  use_megatron_fsdp: false
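# Process-group setup: NCCL backend with a 10-minute collective timeout; Gloo
# groups are kept alongside NCCL (typically for CPU-side collectives).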
dist:
  _target_: megatron.bridge.training.config.DistributedInitConfig
  align_grad_reduce: true
  disable_jit_fuser: false
  distributed_backend: nccl
  distributed_timeout_minutes: 10
  distributed_timeout_seconds_after_init: null
  enable_megatron_core_experimental: false
  external_gpu_device_mapping: false
  high_priority_stream_groups: null
  lazy_init: false
  local_rank: 0
  nccl_communicator_config_path: null
  sharp_enabled_group: null
  use_gloo_process_groups: true
  use_megatron_fsdp: false
  use_sharp: false
  use_torch_fsdp2: false
  use_tp_pp_dp_mapping: false
ft: null
inprocess_restart: null
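# Logging: console and TensorBoard every iteration, plus Weights & Biases under
# project nemo-vs, run merged-sft-tp8.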
logger:
  _target_: megatron.bridge.training.config.LoggerConfig
  filter_warnings: true
  log_energy: false
  log_interval: 1
  log_l2_norm_grad_to_tensorboard: false
  log_loss_scale_to_tensorboard: true
  log_memory_to_tensorboard: false
  log_params_norm: false
  log_progress: false
  log_runtime_to_tensorboard: false
  log_throughput: false
  log_throughput_to_tensorboard: false
  log_timers_to_tensorboard: true
  log_validation_ppl_to_tensorboard: false
  log_world_size_to_tensorboard: false
  logging_level: 20
  memory_keys: null
  modules_to_filter: null
  runtime_time_unit: hours
  save_config_filepath: null
  set_level_for_all_loggers: false
  tensorboard_dir: /work/nemo-visual-systems/training/nemo_experiments/nemotron_nano_v2_vl_pretrain/tb_logs
  tensorboard_log_interval: 1
  tensorboard_queue_size: 1000
  throughput_window_size: 100
  timing_log_level: 0
  timing_log_option: minmax
  wandb_entity: null
  wandb_exp_name: merged-sft-tp8
  wandb_project: nemo-vs
  wandb_save_dir: /work/nemo-visual-systems/training/checkpoints
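# Precision: pure BF16 (fp8/fp4 disabled) with fp32 gradient reduction; the
# loss-scaling fields below apply only to fp16 training and should be inert here.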
mixed_precision:
  _target_: megatron.bridge.training.mixed_precision.MixedPrecisionConfig
  autocast_dtype: null
  autocast_enabled: false
  bf16: true
  first_last_layers_bf16: false
  fp16: false
  fp32: false
  fp4: null
  fp4_recipe: nvfp4
  fp8: null
  fp8_amax_compute_algo: most_recent
  fp8_amax_history_len: 1
  fp8_dot_product_attention: false
  fp8_margin: 0
  fp8_multi_head_attention: false
  fp8_param: false
  fp8_param_gather: false
  fp8_recipe: tensorwise
  fp8_wgrad: true
  grad_reduce_in_fp32: true
  hysteresis: 2
  initial_loss_scale: 4294967296
  loss_scale: null
  loss_scale_window: 1000
  min_loss_scale: 1.0
  num_layers_at_end_in_bf16: 0
  num_layers_at_start_in_bf16: 0
  params_dtype:
    _call_: false
    _target_: torch.bfloat16
  pipeline_dtype:
    _call_: false
    _target_: torch.bfloat16
  reuse_grad_buf_for_mxfp8_param_ag: false
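# Model: Nemotron Nano 12B v2 VL -- a 62-layer hybrid Mamba/attention/MLP
# language model (in hybrid_override_pattern, M = Mamba layer, * = attention,
# - = MLP, following the Megatron hybrid-model convention) with a RADIO vision
# encoder, run with tensor parallelism 8 and no pipeline or context parallelism.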
model:
  _target_: megatron.bridge.models.nemotron_vl.nemotron_vl_provider.NemotronNano12Bv2VLModelProvider
  account_for_embedding_in_pipeline_split: false
  account_for_loss_in_pipeline_split: false
  activation_func:
    _call_: false
    _target_: megatron.core.activations.squared_relu
  activation_func_clamp_value: null
  activation_func_fp8_input_store: false
  add_bias_linear: false
  add_qkv_bias: false
  apply_query_key_layer_scaling: false
  apply_residual_connection_post_layernorm: false
  apply_rope_fusion: true
  async_tensor_model_parallel_allreduce: false
  attention_backend:
    _args_:
    - 1
    _call_: true
    _target_: megatron.core.transformer.enums.AttnBackend
  attention_dropout: 0.0
  attention_output_gate: false
  attention_softmax_in_fp32: true
  autocast_dtype:
    _call_: false
    _target_: torch.bfloat16
  barrier_with_L1_time: true
  batch_p2p_comm: true
  batch_p2p_sync: true
  bf16: true
  bias_activation_fusion: false
  bias_dropout_fusion: true
  calculate_per_token_loss: false
  clone_scatter_output_in_embedding: true
  config_logger_dir: ''
  context_parallel_size: 1
  cp_comm_type: null
  cpu_offloading: false
  cpu_offloading_activations: true
  cpu_offloading_double_buffering: false
  cpu_offloading_num_layers: 0
  cpu_offloading_weights: false
  cross_entropy_fusion_impl: native
  cross_entropy_loss_fusion: true
  cuda_graph_impl: none
  cuda_graph_retain_backward_graph: false
  cuda_graph_scope: []
  cuda_graph_use_single_mempool: false
  cuda_graph_warmup_steps: 3
  deallocate_pipeline_outputs: true
  defer_embedding_wgrad_compute: false
  delay_wgrad_compute: false
  deterministic_mode: false
  disable_bf16_reduced_precision_matmul: false
  disable_parameter_transpose_cache: false
  distribute_saved_activations: null
  embedding_init_method:
    _args_: []
    _partial_: true
    _target_: torch.nn.init.normal_
    mean: 0.0
    std: 0.02
  embedding_init_method_std: 0.02
  enable_autocast: false
  enable_cuda_graph: false
  expert_model_parallel_size: 1
  expert_tensor_parallel_size: 8
  external_cuda_graph: false
  fallback_to_eager_attn: false
  ffn_hidden_size: 20480
  finalize_model_grads_func:
    _args_: []
    _partial_: true
    _target_: megatron.core.distributed.finalize_model_grads.finalize_model_grads
    pg_collection:
      _call_: true
      _target_: megatron.core.process_groups_config.ProcessGroupCollection
  fine_grained_activation_offloading: false
  first_last_layers_bf16: false
  flash_decode: false
  fp16: false
  fp16_lm_cross_entropy: false
  fp32_residual_connection: false
  fp4: null
  fp4_param: false
  fp4_quantizer_factory: null
  fp4_recipe: nvfp4
  fp8: null
  fp8_amax_compute_algo: most_recent
  fp8_amax_history_len: 1
  fp8_dot_product_attention: false
  fp8_interval: 1
  fp8_margin: 0
  fp8_multi_head_attention: false
  fp8_param: false
  fp8_quantizer_factory: null
  fp8_recipe: tensorwise
  fp8_wgrad: true
  freeze_language_model: false
  freeze_vision_model: false
  freeze_vision_projection: false
  fused_single_qkv_rope: false
  gated_linear_unit: false
  generation_config: null
  glu_linear_offset: 0.0
  grad_scale_func:
    _call_: false
    _target_: megatron.core.optimizer.optimizer.MegatronOptimizer.scale_loss
  grad_sync_func: null
  gradient_accumulation_fusion: false
  hetereogenous_dist_checkpoint: false
  heterogeneous_block_specs: false
  hf_model_id: /work/checkpoints/hf/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16
  hidden_dropout: 0.0
  hidden_size: 5120
  hierarchical_context_parallel_sizes: null
  hybrid_attention_ratio: 0.0
  hybrid_mlp_ratio: 0.0
  hybrid_override_pattern: M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M-
  inference_rng_tracker: false
  inference_sampling_seed: 42
  init_method:
    _args_: []
    _partial_: true
    _target_: torch.nn.init.normal_
    mean: 0.0
    std: 0.02
  init_method_std: 0.02
  init_model_with_meta_device: false
  is_hybrid_model: true
  kv_channels: 128
  language_model_type: nemotron5-hybrid-12b
  layernorm_epsilon: 1.0e-05
  layernorm_zero_centered_gamma: false
  linear_attention_freq: null
  linear_attention_type: null
  linear_conv_kernel_dim: null
  linear_key_head_dim: null
  linear_num_key_heads: null
  linear_num_value_heads: null
  linear_value_head_dim: null
  log_max_attention_logit: false
  make_vocab_size_divisible_by: 128
  mamba_head_dim: 80
  mamba_num_groups: 8
  mamba_num_heads: 128
  mamba_stack_spec:
    _call_: false
    _target_: megatron.bridge.models.mamba.mamba_provider.get_default_mamba_stack_spec
  mamba_state_dim: 128
  masked_softmax_fusion: true
  memory_efficient_layer_norm: false
  microbatch_group_size_per_vp_stage: 1
  min_offloaded_tensor_size: 1048576
  mlp_chunks_for_prefill: 1
  moe_apply_probs_on_input: false
  moe_aux_loss_coeff: 0.0001
  moe_deepep_num_sms: 20
  moe_enable_deepep: false
  moe_expert_capacity_factor: null
  moe_extended_tp: false
  moe_ffn_hidden_size: null
  moe_flex_dispatcher_backend: deepep
  moe_grouped_gemm: true
  moe_hybridep_num_sms: 16
  moe_input_jitter_eps: null
  moe_layer_freq: 1
  moe_layer_recompute: false
  moe_pad_expert_input_to_capacity: false
  moe_per_layer_logging: false
  moe_permute_fusion: true
  moe_router_bias_update_rate: 0.001
  moe_router_dtype: fp32
  moe_router_enable_expert_bias: true
  moe_router_force_load_balancing: false
  moe_router_fusion: false
  moe_router_group_topk: null
  moe_router_load_balancing_type: seq_aux_loss
  moe_router_num_groups: null
  moe_router_padding_for_fp8: false
  moe_router_padding_for_quantization: false
  moe_router_pre_softmax: false
  moe_router_score_function: sigmoid
  moe_router_topk: 2
  moe_router_topk_limited_devices: null
  moe_router_topk_scaling_factor: null
  moe_shared_expert_gate: false
  moe_shared_expert_intermediate_size: null
  moe_shared_expert_overlap: true
  moe_token_dispatcher_type: alltoall
  moe_token_drop_policy: probs
  moe_token_dropping: false
  moe_use_legacy_grouped_gemm: false
  moe_z_loss_coeff: null
  mrope_section: null
  mtp_loss_scaling_factor: null
  mtp_num_layers: null
  mtp_standalone: false
  multi_latent_attention: false
  no_rope_freq: null
  no_sync_func: null
  normalization: RMSNorm
  num_attention_heads: 40
  num_layers: 62
  num_layers_at_end_in_bf16: 0
  num_layers_at_start_in_bf16: 0
  num_layers_in_first_pipeline_stage: null
  num_layers_in_last_pipeline_stage: null
  num_microbatches_with_partial_activation_checkpoints: null
  num_moe_experts: null
  num_query_groups: 8
  offload_modules: null
  output_layer_init_method:
    _args_: []
    _partial_: true
    _target_: torch.nn.init.normal_
    mean: 0.0
    std: 0.00254000254000381
  overlap_moe_expert_parallel_comm: false
  overlap_p2p_comm: false
  overlap_p2p_comm_warmup_flush: false
  parallel_output: true
  param_sync_func: null
  params_dtype:
    _call_: false
    _target_: torch.bfloat16
  perform_initialization: true
  persist_layer_norm: true
  pipeline_dtype:
    _call_: false
    _target_: torch.bfloat16
  pipeline_model_parallel_comm_backend: null
  pipeline_model_parallel_layout: null
  pipeline_model_parallel_size: 1
  position_embedding_type: none
  qk_clip: false
  qk_clip_alpha: 0.5
  qk_clip_threshold: 100
  qk_layernorm: false
  quant_recipe: null
  recompute_granularity: null
  recompute_method: null
  recompute_modules:
  - core_attn
  recompute_num_layers: null
  rotary_base: 10000
  rotary_interleaved: false
  rotary_percent: 1.0
  scatter_embedding_sequence_parallel: false
  seq_len_interpolation_factor: null
  seq_length: 8192
  sequence_parallel: false
  share_embeddings_and_output_weights: false
  should_pad_vocab: false
  softmax_scale: null
  softmax_type: vanilla
  symmetric_ar_type: null
  tensor_model_parallel_size: 8
  test_mode: false
  timers:
    _call_: true
    _target_: megatron.core.timers.Timers
  tp_comm_atomic_ag: false
  tp_comm_atomic_rs: false
  tp_comm_bootstrap_backend: nccl
  tp_comm_bulk_dgrad: true
  tp_comm_bulk_wgrad: true
  tp_comm_overlap: false
  tp_comm_overlap_ag: true
  tp_comm_overlap_disable_fc1: false
  tp_comm_overlap_disable_qkv: false
  tp_comm_overlap_rs: true
  tp_comm_overlap_rs_dgrad: false
  tp_comm_split_ag: true
  tp_comm_split_rs: true
  tp_only_amax_red: false
  transformer_impl: transformer_engine
  use_cpu_initialization: false
  use_fused_weighted_squared_relu: false
  use_inference_optimized_layers: false
  use_kitchen: false
  use_mamba_mem_eff_path: true
  use_ring_exchange_p2p: false
  use_te_activation_func: false
  use_te_rng_tracker: false
  variable_seq_lengths: false
  virtual_pipeline_model_parallel_size: null
  vision_model_type: radio
  vocab_size: 132096
  wgrad_deferral_limit: 0
  window_attn_skip_freq: null
  window_size: null
nvrx_straggler: null
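# Optimizer: Adam (beta1 0.9, beta2 0.95, eps 1e-5) with decoupled weight decay
# 0.1 and gradient clipping at 1.25; LR runs from 1e-5 down to min_lr 1e-6 under
# the scheduler below. The muon_* fields configure the alternative Muon optimizer
# and should be inert with optimizer: adam.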
optimizer:
  _target_: megatron.bridge.training.config.OptimizerConfig
  adam_beta1: 0.9
  adam_beta2: 0.95
  adam_eps: 1.0e-05
  barrier_with_L1_time: false
  bf16: true
  clip_grad: 1.25
  config_logger_dir: ''
  decoupled_weight_decay: true
  exp_avg_dtype:
    _call_: false
    _target_: torch.float32
  exp_avg_sq_dtype:
    _call_: false
    _target_: torch.float32
  fp16: false
  fp8_recipe: tensorwise
  hysteresis: 2
  initial_loss_scale: 4294967296
  log_num_zeros_in_grad: false
  loss_scale: null
  loss_scale_window: 1000
  lr: 1.0e-05
  main_grads_dtype:
    _call_: false
    _target_: torch.float32
  main_params_dtype:
    _call_: false
    _target_: torch.float32
  min_loss_scale: 1.0
  min_lr: 1.0e-06
  muon_extra_scale_factor: 1.0
  muon_fp32_matmul_prec: medium
  muon_momentum: 0.95
  muon_num_ns_steps: 5
  muon_scale_mode: spectral
  muon_split_qkv: true
  muon_tp_mode: blockwise
  muon_use_nesterov: false
  optimizer: adam
  optimizer_cpu_offload: false
  optimizer_offload_fraction: 0.0
  overlap_cpu_optimizer_d2h_h2d: false
  overlap_param_gather: false
  overlap_param_gather_with_optimizer_step: false
  params_dtype:
    _call_: false
    _target_: torch.bfloat16
  pin_cpu_grads: true
  pin_cpu_params: true
  reuse_grad_buf_for_mxfp8_param_ag: false
  sgd_momentum: 0.9
  store_param_remainders: true
  timers:
    _call_: true
    _target_: megatron.core.timers.Timers
  use_distributed_optimizer: true
  use_precision_aware_optimizer: false
  use_torch_optimizer_for_cpu_offload: false
  weight_decay: 0.1
peft: null
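# Profiling: scoped to rank 0 between steps 10 and 12, but both the Nsight and
# PyTorch profilers are currently disabled.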
profiling:
  _target_: megatron.bridge.training.config.ProfilingConfig
  memory_snapshot_path: snapshot.pickle
  nvtx_ranges: false
  profile_ranks:
  - 0
  profile_step_end: 12
  profile_step_start: 10
  record_memory_history: false
  record_shapes: false
  use_nsys_profiler: false
  use_pytorch_profiler: false
rerun_state_machine:
  _target_: megatron.bridge.training.config.RerunStateMachineConfig
  check_for_nan_in_loss: true
  check_for_spiky_loss: false
  error_injection_rate: 0
  error_injection_type: transient_error
  rerun_mode: disabled
rng:
  _target_: megatron.bridge.training.config.RNGConfig
  data_parallel_random_init: false
  inference_rng_tracker: false
  seed: 42
  te_rng_tracker: false
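# Schedule: 200 warmup iterations, then cosine decay over 1500 iterations; the
# *_steps values appear to be the same quantities expressed in samples
# (iterations x global_batch_size 32: 200 x 32 = 6400, 1500 x 32 = 48000).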
scheduler:
  _target_: megatron.bridge.training.config.SchedulerConfig
  end_weight_decay: 0.033
  lr_decay_iters: 1500
  lr_decay_samples: null
  lr_decay_steps: 48000
  lr_decay_style: cosine
  lr_warmup_fraction: null
  lr_warmup_init: 0.0
  lr_warmup_iters: 200
  lr_warmup_samples: 0
  lr_warmup_steps: 6400
  lr_wsd_decay_iters: null
  lr_wsd_decay_samples: null
  lr_wsd_decay_style: exponential
  no_weight_decay_cond_type: null
  override_opt_param_scheduler: true
  start_weight_decay: 0.033
  use_checkpoint_opt_param_scheduler: false
  wd_incr_steps: 48000
  weight_decay_incr_style: constant
  wsd_decay_steps: null
straggler: null
tensor_inspect: null
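# Tokenizer: a NullTokenizer placeholder; actual tokenization is presumably
# handled by the dataset's HF processor above (note its vocab_size of 32000
# differs from model.vocab_size 132096).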
tokenizer:
  _target_: megatron.bridge.training.tokenizers.config.TokenizerConfig
  hf_tokenizer_kwargs: {}
  image_tag_type: null
  merge_file: null
  special_tokens: null
  tiktoken_num_special_tokens: 1000
  tiktoken_pattern: null
  tiktoken_special_tokens: null
  tokenizer_model: null
  tokenizer_prompt_format: null
  tokenizer_type: NullTokenizer
  vocab_extra_ids: 0
  vocab_file: null
  vocab_size: 32000
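# Training loop: 1500 iterations at global batch size 32 (micro batch 1);
# validation would run every 500 iterations, but eval_iters: 0 means no eval
# batches are actually drawn.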
train:
  _target_: megatron.bridge.training.config.TrainingConfig
  check_weight_hash_across_dp_replicas_interval: null
  decrease_batch_size_if_needed: false
  empty_unused_memory_level: 0
  eval_interval: 500
  eval_iters: 0
  exit_duration_in_mins: null
  exit_interval: null
  exit_signal:
    _args_:
    - 15
    _call_: true
    _target_: signal.Signals
  exit_signal_handler: false
  exit_signal_handler_for_dataloader: false
  global_batch_size: 32
  iterations_to_skip: []
  manual_gc: true
  manual_gc_eval: 100
  manual_gc_interval: 100
  micro_batch_size: 1
  rampup_batch_size: null
  skip_train: false
  train_iters: 1500
  train_samples: null
  train_sync_interval: null
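# Batch accounting for this run: each iteration accumulates
# global_batch_size / (micro_batch_size x DP) micro-batches per model replica;
# e.g. on 8 GPUs with TP 8, DP = 1 and all 32 micro-batches run on one replica.
#
# A minimal sketch of loading and inspecting this file with OmegaConf -- an
# assumption, since Megatron Bridge's own ConfigContainer loader is not shown
# here, and the filename is hypothetical:
#
#   from omegaconf import OmegaConf
#
#   cfg = OmegaConf.load("run_config.yaml")  # hypothetical path to this file
#   assert cfg.model.tensor_model_parallel_size == 8
#   print(cfg.train.global_batch_size)       # -> 32
#   print(cfg.optimizer.lr, cfg.scheduler.lr_decay_style)  # -> 1e-05 cosine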