---
# Serialized NeMo GPTModel configuration (Hydra/fiddle-style `_target_` instantiation).
# Reconstructed into valid block-style YAML: the original had all newlines collapsed
# into spaces, which is unparseable. Key order (alphabetical per mapping) and all
# values are preserved byte-for-byte; only structure and comments were added.
_target_: nemo.collections.llm.gpt.model.base.GPTModel

# Transformer/model hyperparameters (nemo GPTConfig).
config:
  _cpu_offloading_context: null
  _target_: nemo.collections.llm.gpt.model.base.GPTConfig
  # Callable reference; `_call_: false` means "pass the function itself, do not invoke it".
  activation_func:
    _call_: false
    _target_: torch._C._nn.gelu
  activation_func_fp8_input_store: false
  add_bias_linear: true
  add_qkv_bias: false
  apply_query_key_layer_scaling: false
  apply_residual_connection_post_layernorm: false
  apply_rope_fusion: false
  async_tensor_model_parallel_allreduce: false
  attention_dropout: 0.1
  attention_softmax_in_fp32: false
  autocast_dtype: null
  barrier_with_L1_time: true
  batch_p2p_comm: true
  batch_p2p_sync: true
  bf16: false
  bias_activation_fusion: false
  bias_dropout_fusion: false
  calculate_per_token_loss: false
  clone_scatter_output_in_embedding: true
  config_logger_dir: ''
  context_parallel_size: 1
  cpu_offloading: false
  cpu_offloading_activations: true
  cpu_offloading_num_layers: 0
  cpu_offloading_weights: true
  cross_entropy_loss_fusion: true
  data_step_fn:
    _call_: false
    _target_: nemo.collections.llm.gpt.model.base.gpt_data_step
  deallocate_pipeline_outputs: false
  defer_embedding_wgrad_compute: false
  deterministic_mode: false
  disable_parameter_transpose_cache: false
  distribute_saved_activations: null
  enable_autocast: false
  enable_cuda_graph: false
  expert_model_parallel_size: 1
  external_cuda_graph: false
  ffn_hidden_size: 4096
  finalize_model_grads_func: null
  first_pipeline_num_layers: null
  forward_step_fn:
    _call_: false
    _target_: nemo.collections.llm.gpt.model.base.gpt_forward_step
  fp16: false
  fp16_lm_cross_entropy: false
  fp32_residual_connection: false
  fp8: null
  fp8_amax_compute_algo: most_recent
  fp8_amax_history_len: 1
  fp8_dot_product_attention: false
  fp8_interval: 1
  fp8_margin: 0
  fp8_multi_head_attention: false
  fp8_wgrad: true
  gated_linear_unit: false
  grad_scale_func: null
  grad_sync_func: null
  gradient_accumulation_fusion: true
  hidden_dropout: 0.1
  hidden_size: 1024
  init_method: null
  init_method_std: 0.02
  kv_channels: null
  last_pipeline_num_layers: null
  layernorm_epsilon: 1.0e-05
  layernorm_zero_centered_gamma: false
  make_vocab_size_divisible_by: 128
  masked_softmax_fusion: true
  memory_efficient_layer_norm: false
  moe_aux_loss_coeff: 0
  moe_expert_capacity_factor: null
  moe_extended_tp: false
  moe_grouped_gemm: false
  moe_input_jitter_eps: null
  moe_layer_recompute: false
  moe_pad_expert_input_to_capacity: false
  moe_per_layer_logging: false
  moe_router_load_balancing_type: aux_loss
  moe_router_pre_softmax: false
  moe_router_topk: 2
  moe_shared_expert_intermediate_size: null
  moe_shared_expert_overlap: false
  moe_token_dispatcher_type: allgather
  moe_token_drop_policy: probs
  moe_token_dropping: false
  moe_z_loss_coeff: null
  no_sync_func: null
  normalization: LayerNorm
  num_attention_heads: 8
  num_layers: 2
  num_microbatches_with_partial_activation_checkpoints: null
  num_moe_experts: null
  num_query_groups: null
  output_layer_init_method: null
  overlap_p2p_comm: false
  parallel_output: true
  param_sync_func: null
  params_dtype:
    _call_: false
    _target_: torch.float32
  perform_initialization: true
  persist_layer_norm: false
  pipeline_dtype: null
  pipeline_model_parallel_size: 1
  pipeline_model_parallel_split_rank: null
  position_embedding_type: learned_absolute
  qk_layernorm: false
  recompute_granularity: null
  recompute_method: null
  recompute_num_layers: null
  rotary_base: 10000
  rotary_interleaved: false
  rotary_percent: 1.0
  seq_len_interpolation_factor: null
  seq_length: 1024
  sequence_parallel: false
  share_embeddings_and_output_weights: true
  tensor_model_parallel_size: 1
  test_mode: false
  timers: null
  tp_comm_atomic_ag: false
  tp_comm_atomic_rs: false
  tp_comm_bulk_dgrad: true
  tp_comm_bulk_wgrad: true
  tp_comm_overlap: false
  tp_comm_overlap_ag: true
  tp_comm_overlap_disable_fc1: false
  tp_comm_overlap_disable_qkv: false
  tp_comm_overlap_rs: true
  tp_comm_overlap_rs_dgrad: false
  tp_comm_split_ag: true
  tp_comm_split_rs: true
  tp_only_amax_red: false
  transformer_layer_spec:
    _call_: false
    _target_: nemo.collections.llm.gpt.model.base.default_layer_spec
  use_cpu_initialization: false
  use_ring_exchange_p2p: false
  use_te_rng_tracker: false
  variable_seq_lengths: false
  virtual_pipeline_model_parallel_size: null
  wgrad_deferral_limit: 0
  window_size: null

model_transform: null

# Optimizer wrapper (NeMo MegatronOptimizerModule) around a Megatron-Core OptimizerConfig.
optim:
  _target_: nemo.lightning.pytorch.optim.megatron.MegatronOptimizerModule
  config:
    _target_: megatron.core.optimizer.optimizer_config.OptimizerConfig
    adam_beta1: 0.9
    adam_beta2: 0.999
    adam_eps: 1.0e-08
    barrier_with_L1_time: false
    bf16: false
    clip_grad: 1.0
    config_logger_dir: ''
    decoupled_lr: null
    decoupled_min_lr: null
    fp16: false
    hysteresis: 2
    initial_loss_scale: 4294967296
    log_num_zeros_in_grad: false
    loss_scale: null
    loss_scale_window: 1000
    lr: 0.0001
    min_loss_scale: 1.0
    min_lr: null
    optimizer: adam
    overlap_param_gather_with_optimizer_step: false
    params_dtype:
      _call_: false
      _target_: torch.float32
    sgd_momentum: 0.9
    timers: null
    use_distributed_optimizer: true
    weight_decay: 0.01
  lr_mult: 1.0
  lr_scheduler: null
  no_weight_decay_cond: null
  scale_lr_cond: null

# Tokenizer (HuggingFace AutoTokenizer over local GPT-2 vocab/merges artifacts).
tokenizer:
  _target_: nemo.collections.common.tokenizers.huggingface.auto_tokenizer.AutoTokenizer
  bos_token: null
  cls_token: null
  eos_token: null
  mask_token: null
  merges_file: megatron-gpt-345m_merges
  pad_token: null
  pretrained_model_name: gpt2
  sep_token: null
  trust_remote_code: false
  unk_token: null
  use_fast: false
  vocab_file: megatron-gpt-345m_vocab