# Model card tags: Text Generation, Transformers, Safetensors, PyTorch,
# nvidia, nemotron-h, conversational
# Uploaded by suhara ("Upload folder using huggingface_hub", commit 9cfc143, verified)
---
# NeMo / Hydra-style instantiation config for a NemotronH 8B hybrid
# Mamba-attention model (nemo.collections.llm.gpt.model.ssm.MambaModel).
# Convention used throughout: `_target_` names a class/callable to resolve;
# `_call_: false` presumably means the target is referenced rather than
# invoked at instantiation time — TODO confirm against the NeMo loader.
_target_: nemo.collections.llm.gpt.model.ssm.MambaModel
# Model/transformer configuration (NemotronHConfig8B fields, alphabetical).
config:
  _cpu_offloading_context: null
  _target_: nemo.collections.llm.gpt.model.ssm.NemotronHConfig8B
  account_for_embedding_in_pipeline_split: false
  account_for_loss_in_pipeline_split: false
  activation_func:
    _call_: false
    _target_: nemo.collections.llm.gpt.model.ssm.NemotronHConfigBase.<lambda>
  activation_func_fp8_input_store: false
  add_bias_linear: false
  add_qkv_bias: false
  apply_query_key_layer_scaling: false
  apply_residual_connection_post_layernorm: false
  apply_rope_fusion: true
  async_tensor_model_parallel_allreduce: false
  attention_backend:
    _call_: true
    _target_: megatron.core.transformer.enums.AttnBackend
  attention_dropout: 0.0
  attention_softmax_in_fp32: false
  autocast_dtype: null
  barrier_with_L1_time: true
  batch_p2p_comm: true
  batch_p2p_sync: true
  bf16: true
  bias_activation_fusion: false
  bias_dropout_fusion: true
  calculate_per_token_loss: false
  clone_scatter_output_in_embedding: true
  config_logger_dir: ''
  context_parallel_size: 1
  cp_comm_type: null
  cpu_offloading: false
  cpu_offloading_activations: true
  cpu_offloading_num_layers: 0
  cpu_offloading_weights: true
  cross_entropy_fusion_impl: native
  cross_entropy_loss_fusion: true
  cuda_graph_retain_backward_graph: false
  cuda_graph_scope: full
  cuda_graph_use_single_mempool: false
  cuda_graph_warmup_steps: 3
  data_step_fn:
    _call_: false
    _target_: nemo.collections.llm.gpt.model.base.gpt_data_step
  deallocate_pipeline_outputs: true
  defer_embedding_wgrad_compute: false
  deterministic_mode: false
  disable_parameter_transpose_cache: false
  distribute_saved_activations: null
  enable_autocast: false
  enable_cuda_graph: false
  expert_model_parallel_size: 1
  expert_tensor_parallel_size: null
  external_cuda_graph: false
  ffn_hidden_size: 21504
  finalize_model_grads_func: null
  first_last_layers_bf16: true
  flash_decode: false
  forward_step_fn:
    _call_: false
    _target_: nemo.collections.llm.gpt.model.ssm.ssm_forward_step
  fp16: false
  fp16_lm_cross_entropy: false
  fp32_residual_connection: false
  fp8: null
  fp8_amax_compute_algo: most_recent
  fp8_amax_history_len: 1
  fp8_dot_product_attention: false
  fp8_interval: 1
  fp8_margin: 0
  fp8_multi_head_attention: false
  fp8_recipe: delayed
  fp8_wgrad: true
  gated_linear_unit: false
  get_attention_mask_from_fusion: false
  grad_scale_func: null
  grad_sync_func: null
  gradient_accumulation_fusion: false
  hidden_dropout: 0.0
  hidden_size: 4096
  hierarchical_context_parallel_sizes: null
  hybrid_attention_ratio: 0.0
  hybrid_mlp_ratio: 0.0
  # 52-character layer pattern, one char per layer (matches num_layers: 52).
  # Quoted because the value contains '*', which is a YAML alias sigil.
  hybrid_override_pattern: 'M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-'
  inference_rng_tracker: false
  init_method: null
  init_method_std: 0.02
  init_model_with_meta_device: false
  is_hybrid_model: true
  kv_channels: null
  layernorm_epsilon: 1.0e-05
  layernorm_zero_centered_gamma: false
  make_vocab_size_divisible_by: 128
  mamba_head_dim: 64
  mamba_nheads: 128
  mamba_num_groups: 8
  mamba_state_dim: 128
  mapping_type: nvidia-hybrid-nemotronh
  masked_softmax_fusion: true
  memory_efficient_layer_norm: false
  microbatch_group_size_per_vp_stage: 1
  moe_aux_loss_coeff: 0
  moe_enable_deepep: false
  moe_expert_capacity_factor: null
  moe_extended_tp: false
  moe_ffn_hidden_size: null
  moe_grouped_gemm: false
  moe_input_jitter_eps: null
  moe_layer_freq: 1
  moe_layer_recompute: false
  moe_pad_expert_input_to_capacity: false
  moe_per_layer_logging: false
  moe_permute_fusion: false
  moe_router_bias_update_rate: 0.001
  moe_router_dtype: null
  moe_router_enable_expert_bias: false
  moe_router_group_topk: null
  moe_router_load_balancing_type: aux_loss
  moe_router_num_groups: null
  moe_router_pre_softmax: false
  moe_router_score_function: softmax
  moe_router_topk: 2
  moe_router_topk_limited_devices: null
  moe_router_topk_scaling_factor: null
  moe_shared_expert_intermediate_size: null
  moe_shared_expert_overlap: false
  moe_token_dispatcher_type: allgather
  moe_token_drop_policy: probs
  moe_token_dropping: false
  moe_use_legacy_grouped_gemm: false
  moe_z_loss_coeff: null
  mtp_loss_scaling_factor: null
  mtp_num_layers: null
  multi_latent_attention: false
  no_sync_func: null
  normalization: RMSNorm
  num_attention_heads: 32
  num_layers: 52
  num_layers_at_end_in_bf16: 1
  num_layers_at_start_in_bf16: 1
  num_layers_in_first_pipeline_stage: null
  num_layers_in_last_pipeline_stage: null
  num_microbatches_with_partial_activation_checkpoints: null
  num_moe_experts: null
  num_query_groups: 8
  output_layer_init_method: null
  overlap_p2p_comm: false
  overlap_p2p_comm_warmup_flush: false
  parallel_output: true
  param_sync_func: null
  params_dtype:
    _call_: false
    _target_: torch.bfloat16
  perform_initialization: true
  persist_layer_norm: true
  pipeline_dtype: null
  pipeline_model_parallel_comm_backend: null
  pipeline_model_parallel_size: 1
  pipeline_model_parallel_split_rank: null
  position_embedding_type: none
  post_process: true
  pre_process: true
  qk_layernorm: false
  recompute_granularity: null
  recompute_method: null
  recompute_num_layers: null
  rotary_base: 10000
  rotary_interleaved: false
  rotary_percent: 1.0
  seq_len_interpolation_factor: null
  seq_length: 8192
  sequence_parallel: false
  share_embeddings_and_output_weights: false
  softmax_scale: null
  tensor_model_parallel_size: 1
  test_mode: false
  timers: null
  tokenizer_library: tiktoken
  tokenizer_model_path: null
  tokenizer_name: TiktokenTokenizer
  tp_comm_atomic_ag: false
  tp_comm_atomic_rs: false
  tp_comm_bootstrap_backend: nccl
  tp_comm_bulk_dgrad: true
  tp_comm_bulk_wgrad: true
  tp_comm_overlap: false
  tp_comm_overlap_ag: true
  tp_comm_overlap_disable_fc1: false
  tp_comm_overlap_disable_qkv: false
  tp_comm_overlap_rs: true
  tp_comm_overlap_rs_dgrad: false
  tp_comm_split_ag: true
  tp_comm_split_rs: true
  tp_only_amax_red: false
  use_cpu_initialization: false
  use_custom_fsdp: false
  use_ring_exchange_p2p: false
  use_te_rng_tracker: false
  variable_seq_lengths: false
  virtual_pipeline_model_parallel_size: null
  vocab_file: null
  vocab_size: 131072
  wgrad_deferral_limit: 0
  window_size: null
model_transform: null
# Optimizer module wrapping a Megatron-Core OptimizerConfig.
optim:
  _target_: nemo.lightning.pytorch.optim.megatron.MegatronOptimizerModule
  config:
    _target_: megatron.core.optimizer.optimizer_config.OptimizerConfig
    adam_beta1: 0.9
    adam_beta2: 0.999
    adam_eps: 1.0e-08
    barrier_with_L1_time: false
    bf16: false
    clip_grad: 1.0
    config_logger_dir: ''
    decoupled_lr: null
    decoupled_min_lr: null
    exp_avg_dtype:
      _call_: false
      _target_: torch.float32
    exp_avg_sq_dtype:
      _call_: false
      _target_: torch.float32
    fp16: false
    hysteresis: 2
    initial_loss_scale: 4294967296
    log_num_zeros_in_grad: false
    loss_scale: null
    loss_scale_window: 1000
    lr: 0.0001
    main_grads_dtype:
      _call_: false
      _target_: torch.float32
    main_params_dtype:
      _call_: false
      _target_: torch.float32
    min_loss_scale: 1.0
    min_lr: null
    optimizer: adam
    optimizer_cpu_offload: false
    optimizer_offload_fraction: 0.0
    overlap_cpu_optimizer_d2h_h2d: false
    overlap_param_gather_with_optimizer_step: false
    params_dtype:
      _call_: false
      _target_: torch.float32
    pin_cpu_grads: true
    pin_cpu_params: true
    sgd_momentum: 0.9
    timers: null
    use_distributed_optimizer: true
    use_precision_aware_optimizer: false
    use_torch_optimizer_for_cpu_offload: false
    weight_decay: 0.01
  lr_mult: 1.0
  lr_scheduler: null
  no_weight_decay_cond: null
  scale_lr_cond: null
# HuggingFace AutoTokenizer wrapper configuration.
tokenizer:
  _target_: nemo.collections.common.tokenizers.huggingface.auto_tokenizer.AutoTokenizer
  additional_special_tokens: []
  bos_token: null
  cls_token: null
  eos_token: null
  include_special_tokens: false
  mask_token: null
  merges_file: null
  pad_token: null
  pretrained_model_name: nemo_tokenizer
  sep_token: null
  trust_remote_code: true
  unk_token: null
  use_fast: false
  vocab_file: null