_target_: nemo.collections.llm.gpt.model.base.GPTModel
config:
  _cpu_offloading_context: null
  _target_: nemo.collections.llm.gpt.model.base.GPTConfig
  activation_func:
    _call_: false
    _target_: torch._C._nn.gelu
  activation_func_fp8_input_store: false
  add_bias_linear: true
  add_qkv_bias: false
  apply_query_key_layer_scaling: false
  apply_residual_connection_post_layernorm: false
  apply_rope_fusion: false
  async_tensor_model_parallel_allreduce: false
  attention_dropout: 0.1
  attention_softmax_in_fp32: false
  autocast_dtype: null
  barrier_with_L1_time: true
  batch_p2p_comm: true
  batch_p2p_sync: true
  bf16: false
  bias_activation_fusion: false
  bias_dropout_fusion: false
  calculate_per_token_loss: false
  clone_scatter_output_in_embedding: true
  config_logger_dir: ''
  context_parallel_size: 1
  cpu_offloading: false
  cpu_offloading_activations: true
  cpu_offloading_num_layers: 0
  cpu_offloading_weights: true
  cross_entropy_loss_fusion: true
  data_step_fn:
    _call_: false
    _target_: nemo.collections.llm.gpt.model.base.gpt_data_step
  deallocate_pipeline_outputs: false
  defer_embedding_wgrad_compute: false
  deterministic_mode: false
  disable_parameter_transpose_cache: false
  distribute_saved_activations: null
  enable_autocast: false
  enable_cuda_graph: false
  expert_model_parallel_size: 1
  external_cuda_graph: false
  ffn_hidden_size: 4096
  finalize_model_grads_func: null
  first_pipeline_num_layers: null
  forward_step_fn:
    _call_: false
    _target_: nemo.collections.llm.gpt.model.base.gpt_forward_step
  fp16: false
  fp16_lm_cross_entropy: false
  fp32_residual_connection: false
  fp8: null
  fp8_amax_compute_algo: most_recent
  fp8_amax_history_len: 1
  fp8_dot_product_attention: false
  fp8_interval: 1
  fp8_margin: 0
  fp8_multi_head_attention: false
  fp8_wgrad: true
  gated_linear_unit: false
  grad_scale_func: null
  grad_sync_func: null
  gradient_accumulation_fusion: true
  hidden_dropout: 0.1
  hidden_size: 1024
  init_method: null
  init_method_std: 0.02
  kv_channels: null
  last_pipeline_num_layers: null
  layernorm_epsilon: 1.0e-05
  layernorm_zero_centered_gamma: false
  make_vocab_size_divisible_by: 128
  masked_softmax_fusion: true
  memory_efficient_layer_norm: false
  moe_aux_loss_coeff: 0
  moe_expert_capacity_factor: null
  moe_extended_tp: false
  moe_grouped_gemm: false
  moe_input_jitter_eps: null
  moe_layer_recompute: false
  moe_pad_expert_input_to_capacity: false
  moe_per_layer_logging: false
  moe_router_load_balancing_type: aux_loss
  moe_router_pre_softmax: false
  moe_router_topk: 2
  moe_shared_expert_intermediate_size: null
  moe_shared_expert_overlap: false
  moe_token_dispatcher_type: allgather
  moe_token_drop_policy: probs
  moe_token_dropping: false
  moe_z_loss_coeff: null
  no_sync_func: null
  normalization: LayerNorm
  num_attention_heads: 8
  num_layers: 2
  num_microbatches_with_partial_activation_checkpoints: null
  num_moe_experts: null
  num_query_groups: null
  output_layer_init_method: null
  overlap_p2p_comm: false
  parallel_output: true
  param_sync_func: null
  params_dtype:
    _call_: false
    _target_: torch.float32
  perform_initialization: true
  persist_layer_norm: false
  pipeline_dtype: null
  pipeline_model_parallel_size: 1
  pipeline_model_parallel_split_rank: null
  position_embedding_type: learned_absolute
  qk_layernorm: false
  recompute_granularity: null
  recompute_method: null
  recompute_num_layers: null
  rotary_base: 10000
  rotary_interleaved: false
  rotary_percent: 1.0
  seq_len_interpolation_factor: null
  seq_length: 1024
  sequence_parallel: false
  share_embeddings_and_output_weights: true
  tensor_model_parallel_size: 1
  test_mode: false
  timers: null
  tp_comm_atomic_ag: false
  tp_comm_atomic_rs: false
  tp_comm_bulk_dgrad: true
  tp_comm_bulk_wgrad: true
  tp_comm_overlap: false
  tp_comm_overlap_ag: true
  tp_comm_overlap_disable_fc1: false
  tp_comm_overlap_disable_qkv: false
  tp_comm_overlap_rs: true
  tp_comm_overlap_rs_dgrad: false
  tp_comm_split_ag: true
  tp_comm_split_rs: true
  tp_only_amax_red: false
  transformer_layer_spec:
    _call_: false
    _target_: nemo.collections.llm.gpt.model.base.default_layer_spec
  use_cpu_initialization: false
  use_ring_exchange_p2p: false
  use_te_rng_tracker: false
  variable_seq_lengths: false
  virtual_pipeline_model_parallel_size: null
  wgrad_deferral_limit: 0
  window_size: null
model_transform: null
optim:
  _target_: nemo.lightning.pytorch.optim.megatron.MegatronOptimizerModule
  config:
    _target_: megatron.core.optimizer.optimizer_config.OptimizerConfig
    adam_beta1: 0.9
    adam_beta2: 0.999
    adam_eps: 1.0e-08
    barrier_with_L1_time: false
    bf16: false
    clip_grad: 1.0
    config_logger_dir: ''
    decoupled_lr: null
    decoupled_min_lr: null
    fp16: false
    hysteresis: 2
    initial_loss_scale: 4294967296
    log_num_zeros_in_grad: false
    loss_scale: null
    loss_scale_window: 1000
    lr: 0.0001
    min_loss_scale: 1.0
    min_lr: null
    optimizer: adam
    overlap_param_gather_with_optimizer_step: false
    params_dtype:
      _call_: false
      _target_: torch.float32
    sgd_momentum: 0.9
    timers: null
    use_distributed_optimizer: true
    weight_decay: 0.01
  lr_mult: 1.0
  lr_scheduler: null
  no_weight_decay_cond: null
  scale_lr_cond: null
tokenizer:
  _target_: nemo.collections.common.tokenizers.huggingface.auto_tokenizer.AutoTokenizer
  bos_token: null
  cls_token: null
  eos_token: null
  mask_token: null
  merges_file: megatron-gpt-345m_merges
  pad_token: null
  pretrained_model_name: gpt2
  sep_token: null
  trust_remote_code: false
  unk_token: null
  use_fast: false
  vocab_file: megatron-gpt-345m_vocab