| _target_: nemo.collections.llm.gpt.model.ssm.MambaModel |
| config: |
| _cpu_offloading_context: null |
| _target_: nemo.collections.llm.gpt.model.ssm.NemotronHConfig8B |
| account_for_embedding_in_pipeline_split: false |
| account_for_loss_in_pipeline_split: false |
| activation_func: |
| _call_: false |
| _target_: nemo.collections.llm.gpt.model.ssm.NemotronHConfigBase.<lambda> |
| activation_func_fp8_input_store: false |
| add_bias_linear: false |
| add_qkv_bias: false |
| apply_query_key_layer_scaling: false |
| apply_residual_connection_post_layernorm: false |
| apply_rope_fusion: true |
| async_tensor_model_parallel_allreduce: false |
| attention_backend: |
| _call_: true |
| _target_: megatron.core.transformer.enums.AttnBackend |
| attention_dropout: 0.0 |
| attention_softmax_in_fp32: false |
| autocast_dtype: null |
| barrier_with_L1_time: true |
| batch_p2p_comm: true |
| batch_p2p_sync: true |
| bf16: true |
| bias_activation_fusion: false |
| bias_dropout_fusion: true |
| calculate_per_token_loss: false |
| clone_scatter_output_in_embedding: true |
| config_logger_dir: '' |
| context_parallel_size: 1 |
| cp_comm_type: null |
| cpu_offloading: false |
| cpu_offloading_activations: true |
| cpu_offloading_num_layers: 0 |
| cpu_offloading_weights: true |
| cross_entropy_fusion_impl: native |
| cross_entropy_loss_fusion: true |
| cuda_graph_retain_backward_graph: false |
| cuda_graph_scope: full |
| cuda_graph_use_single_mempool: false |
| cuda_graph_warmup_steps: 3 |
| data_step_fn: |
| _call_: false |
| _target_: nemo.collections.llm.gpt.model.base.gpt_data_step |
| deallocate_pipeline_outputs: true |
| defer_embedding_wgrad_compute: false |
| deterministic_mode: false |
| disable_parameter_transpose_cache: false |
| distribute_saved_activations: null |
| enable_autocast: false |
| enable_cuda_graph: false |
| expert_model_parallel_size: 1 |
| expert_tensor_parallel_size: null |
| external_cuda_graph: false |
| ffn_hidden_size: 21504 |
| finalize_model_grads_func: null |
| first_last_layers_bf16: true |
| flash_decode: false |
| forward_step_fn: |
| _call_: false |
| _target_: nemo.collections.llm.gpt.model.ssm.ssm_forward_step |
| fp16: false |
| fp16_lm_cross_entropy: false |
| fp32_residual_connection: false |
| fp8: null |
| fp8_amax_compute_algo: most_recent |
| fp8_amax_history_len: 1 |
| fp8_dot_product_attention: false |
| fp8_interval: 1 |
| fp8_margin: 0 |
| fp8_multi_head_attention: false |
| fp8_recipe: delayed |
| fp8_wgrad: true |
| gated_linear_unit: false |
| get_attention_mask_from_fusion: false |
| grad_scale_func: null |
| grad_sync_func: null |
| gradient_accumulation_fusion: false |
| hidden_dropout: 0.0 |
| hidden_size: 4096 |
| hierarchical_context_parallel_sizes: null |
| hybrid_attention_ratio: 0.0 |
| hybrid_mlp_ratio: 0.0 |
| hybrid_override_pattern: M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M- |
| inference_rng_tracker: false |
| init_method: null |
| init_method_std: 0.02 |
| init_model_with_meta_device: false |
| is_hybrid_model: true |
| kv_channels: null |
| layernorm_epsilon: 1.0e-05 |
| layernorm_zero_centered_gamma: false |
| make_vocab_size_divisible_by: 128 |
| mamba_head_dim: 64 |
| mamba_nheads: 128 |
| mamba_num_groups: 8 |
| mamba_state_dim: 128 |
| mapping_type: nvidia-hybrid-nemotronh |
| masked_softmax_fusion: true |
| memory_efficient_layer_norm: false |
| microbatch_group_size_per_vp_stage: 1 |
| moe_aux_loss_coeff: 0 |
| moe_enable_deepep: false |
| moe_expert_capacity_factor: null |
| moe_extended_tp: false |
| moe_ffn_hidden_size: null |
| moe_grouped_gemm: false |
| moe_input_jitter_eps: null |
| moe_layer_freq: 1 |
| moe_layer_recompute: false |
| moe_pad_expert_input_to_capacity: false |
| moe_per_layer_logging: false |
| moe_permute_fusion: false |
| moe_router_bias_update_rate: 0.001 |
| moe_router_dtype: null |
| moe_router_enable_expert_bias: false |
| moe_router_group_topk: null |
| moe_router_load_balancing_type: aux_loss |
| moe_router_num_groups: null |
| moe_router_pre_softmax: false |
| moe_router_score_function: softmax |
| moe_router_topk: 2 |
| moe_router_topk_limited_devices: null |
| moe_router_topk_scaling_factor: null |
| moe_shared_expert_intermediate_size: null |
| moe_shared_expert_overlap: false |
| moe_token_dispatcher_type: allgather |
| moe_token_drop_policy: probs |
| moe_token_dropping: false |
| moe_use_legacy_grouped_gemm: false |
| moe_z_loss_coeff: null |
| mtp_loss_scaling_factor: null |
| mtp_num_layers: null |
| multi_latent_attention: false |
| no_sync_func: null |
| normalization: RMSNorm |
| num_attention_heads: 32 |
| num_layers: 52 |
| num_layers_at_end_in_bf16: 1 |
| num_layers_at_start_in_bf16: 1 |
| num_layers_in_first_pipeline_stage: null |
| num_layers_in_last_pipeline_stage: null |
| num_microbatches_with_partial_activation_checkpoints: null |
| num_moe_experts: null |
| num_query_groups: 8 |
| output_layer_init_method: null |
| overlap_p2p_comm: false |
| overlap_p2p_comm_warmup_flush: false |
| parallel_output: true |
| param_sync_func: null |
| params_dtype: |
| _call_: false |
| _target_: torch.bfloat16 |
| perform_initialization: true |
| persist_layer_norm: true |
| pipeline_dtype: null |
| pipeline_model_parallel_comm_backend: null |
| pipeline_model_parallel_size: 1 |
| pipeline_model_parallel_split_rank: null |
| position_embedding_type: none |
| post_process: true |
| pre_process: true |
| qk_layernorm: false |
| recompute_granularity: null |
| recompute_method: null |
| recompute_num_layers: null |
| rotary_base: 10000 |
| rotary_interleaved: false |
| rotary_percent: 1.0 |
| seq_len_interpolation_factor: null |
| seq_length: 8192 |
| sequence_parallel: false |
| share_embeddings_and_output_weights: false |
| softmax_scale: null |
| tensor_model_parallel_size: 1 |
| test_mode: false |
| timers: null |
| tokenizer_library: tiktoken |
| tokenizer_model_path: null |
| tokenizer_name: TiktokenTokenizer |
| tp_comm_atomic_ag: false |
| tp_comm_atomic_rs: false |
| tp_comm_bootstrap_backend: nccl |
| tp_comm_bulk_dgrad: true |
| tp_comm_bulk_wgrad: true |
| tp_comm_overlap: false |
| tp_comm_overlap_ag: true |
| tp_comm_overlap_disable_fc1: false |
| tp_comm_overlap_disable_qkv: false |
| tp_comm_overlap_rs: true |
| tp_comm_overlap_rs_dgrad: false |
| tp_comm_split_ag: true |
| tp_comm_split_rs: true |
| tp_only_amax_red: false |
| use_cpu_initialization: false |
| use_custom_fsdp: false |
| use_ring_exchange_p2p: false |
| use_te_rng_tracker: false |
| variable_seq_lengths: false |
| virtual_pipeline_model_parallel_size: null |
| vocab_file: null |
| vocab_size: 131072 |
| wgrad_deferral_limit: 0 |
| window_size: null |
| model_transform: null |
| optim: |
| _target_: nemo.lightning.pytorch.optim.megatron.MegatronOptimizerModule |
| config: |
| _target_: megatron.core.optimizer.optimizer_config.OptimizerConfig |
| adam_beta1: 0.9 |
| adam_beta2: 0.999 |
| adam_eps: 1.0e-08 |
| barrier_with_L1_time: false |
| bf16: false |
| clip_grad: 1.0 |
| config_logger_dir: '' |
| decoupled_lr: null |
| decoupled_min_lr: null |
| exp_avg_dtype: |
| _call_: false |
| _target_: torch.float32 |
| exp_avg_sq_dtype: |
| _call_: false |
| _target_: torch.float32 |
| fp16: false |
| hysteresis: 2 |
| initial_loss_scale: 4294967296 |
| log_num_zeros_in_grad: false |
| loss_scale: null |
| loss_scale_window: 1000 |
| lr: 0.0001 |
| main_grads_dtype: |
| _call_: false |
| _target_: torch.float32 |
| main_params_dtype: |
| _call_: false |
| _target_: torch.float32 |
| min_loss_scale: 1.0 |
| min_lr: null |
| optimizer: adam |
| optimizer_cpu_offload: false |
| optimizer_offload_fraction: 0.0 |
| overlap_cpu_optimizer_d2h_h2d: false |
| overlap_param_gather_with_optimizer_step: false |
| params_dtype: |
| _call_: false |
| _target_: torch.float32 |
| pin_cpu_grads: true |
| pin_cpu_params: true |
| sgd_momentum: 0.9 |
| timers: null |
| use_distributed_optimizer: true |
| use_precision_aware_optimizer: false |
| use_torch_optimizer_for_cpu_offload: false |
| weight_decay: 0.01 |
| lr_mult: 1.0 |
| lr_scheduler: null |
| no_weight_decay_cond: null |
| scale_lr_cond: null |
| tokenizer: |
| _target_: nemo.collections.common.tokenizers.huggingface.auto_tokenizer.AutoTokenizer |
| additional_special_tokens: [] |
| bos_token: null |
| cls_token: null |
| eos_token: null |
| include_special_tokens: false |
| mask_token: null |
| merges_file: null |
| pad_token: null |
| pretrained_model_name: nemo_tokenizer |
| sep_token: null |
| trust_remote_code: true |
| unk_token: null |
| use_fast: false |
| vocab_file: null |
|
|