---
# Serialized NeMo GPTModel configuration (Hydra/fiddle-style `_target_` instantiation).
# Reconstructed into valid block-style YAML: the original had all newlines collapsed
# into spaces, which is unparseable. Key order (alphabetical per mapping) and all
# values are preserved byte-for-byte; only structure and comments were added.
_target_: nemo.collections.llm.gpt.model.base.GPTModel

# Transformer/model hyperparameters (nemo GPTConfig).
config:
  _cpu_offloading_context: null
  _target_: nemo.collections.llm.gpt.model.base.GPTConfig
  # Callable reference; `_call_: false` means "pass the function itself, do not invoke it".
  activation_func:
    _call_: false
    _target_: torch._C._nn.gelu
  activation_func_fp8_input_store: false
  add_bias_linear: true
  add_qkv_bias: false
  apply_query_key_layer_scaling: false
  apply_residual_connection_post_layernorm: false
  apply_rope_fusion: false
  async_tensor_model_parallel_allreduce: false
  attention_dropout: 0.1
  attention_softmax_in_fp32: false
  autocast_dtype: null
  barrier_with_L1_time: true
  batch_p2p_comm: true
  batch_p2p_sync: true
  bf16: false
  bias_activation_fusion: false
  bias_dropout_fusion: false
  calculate_per_token_loss: false
  clone_scatter_output_in_embedding: true
  config_logger_dir: ''
  context_parallel_size: 1
  cpu_offloading: false
  cpu_offloading_activations: true
  cpu_offloading_num_layers: 0
  cpu_offloading_weights: true
  cross_entropy_loss_fusion: true
  data_step_fn:
    _call_: false
    _target_: nemo.collections.llm.gpt.model.base.gpt_data_step
  deallocate_pipeline_outputs: false
  defer_embedding_wgrad_compute: false
  deterministic_mode: false
  disable_parameter_transpose_cache: false
  distribute_saved_activations: null
  enable_autocast: false
  enable_cuda_graph: false
  expert_model_parallel_size: 1
  external_cuda_graph: false
  ffn_hidden_size: 4096
  finalize_model_grads_func: null
  first_pipeline_num_layers: null
  forward_step_fn:
    _call_: false
    _target_: nemo.collections.llm.gpt.model.base.gpt_forward_step
  fp16: false
  fp16_lm_cross_entropy: false
  fp32_residual_connection: false
  fp8: null
  fp8_amax_compute_algo: most_recent
  fp8_amax_history_len: 1
  fp8_dot_product_attention: false
  fp8_interval: 1
  fp8_margin: 0
  fp8_multi_head_attention: false
  fp8_wgrad: true
  gated_linear_unit: false
  grad_scale_func: null
  grad_sync_func: null
  gradient_accumulation_fusion: true
  hidden_dropout: 0.1
  hidden_size: 1024
  init_method: null
  init_method_std: 0.02
  kv_channels: null
  last_pipeline_num_layers: null
  layernorm_epsilon: 1.0e-05
  layernorm_zero_centered_gamma: false
  make_vocab_size_divisible_by: 128
  masked_softmax_fusion: true
  memory_efficient_layer_norm: false
  moe_aux_loss_coeff: 0
  moe_expert_capacity_factor: null
  moe_extended_tp: false
  moe_grouped_gemm: false
  moe_input_jitter_eps: null
  moe_layer_recompute: false
  moe_pad_expert_input_to_capacity: false
  moe_per_layer_logging: false
  moe_router_load_balancing_type: aux_loss
  moe_router_pre_softmax: false
  moe_router_topk: 2
  moe_shared_expert_intermediate_size: null
  moe_shared_expert_overlap: false
  moe_token_dispatcher_type: allgather
  moe_token_drop_policy: probs
  moe_token_dropping: false
  moe_z_loss_coeff: null
  no_sync_func: null
  normalization: LayerNorm
  num_attention_heads: 8
  num_layers: 2
  num_microbatches_with_partial_activation_checkpoints: null
  num_moe_experts: null
  num_query_groups: null
  output_layer_init_method: null
  overlap_p2p_comm: false
  parallel_output: true
  param_sync_func: null
  params_dtype:
    _call_: false
    _target_: torch.float32
  perform_initialization: true
  persist_layer_norm: false
  pipeline_dtype: null
  pipeline_model_parallel_size: 1
  pipeline_model_parallel_split_rank: null
  position_embedding_type: learned_absolute
  qk_layernorm: false
  recompute_granularity: null
  recompute_method: null
  recompute_num_layers: null
  rotary_base: 10000
  rotary_interleaved: false
  rotary_percent: 1.0
  seq_len_interpolation_factor: null
  seq_length: 1024
  sequence_parallel: false
  share_embeddings_and_output_weights: true
  tensor_model_parallel_size: 1
  test_mode: false
  timers: null
  tp_comm_atomic_ag: false
  tp_comm_atomic_rs: false
  tp_comm_bulk_dgrad: true
  tp_comm_bulk_wgrad: true
  tp_comm_overlap: false
  tp_comm_overlap_ag: true
  tp_comm_overlap_disable_fc1: false
  tp_comm_overlap_disable_qkv: false
  tp_comm_overlap_rs: true
  tp_comm_overlap_rs_dgrad: false
  tp_comm_split_ag: true
  tp_comm_split_rs: true
  tp_only_amax_red: false
  transformer_layer_spec:
    _call_: false
    _target_: nemo.collections.llm.gpt.model.base.default_layer_spec
  use_cpu_initialization: false
  use_ring_exchange_p2p: false
  use_te_rng_tracker: false
  variable_seq_lengths: false
  virtual_pipeline_model_parallel_size: null
  wgrad_deferral_limit: 0
  window_size: null

model_transform: null

# Optimizer wrapper (NeMo MegatronOptimizerModule) around a Megatron-Core OptimizerConfig.
optim:
  _target_: nemo.lightning.pytorch.optim.megatron.MegatronOptimizerModule
  config:
    _target_: megatron.core.optimizer.optimizer_config.OptimizerConfig
    adam_beta1: 0.9
    adam_beta2: 0.999
    adam_eps: 1.0e-08
    barrier_with_L1_time: false
    bf16: false
    clip_grad: 1.0
    config_logger_dir: ''
    decoupled_lr: null
    decoupled_min_lr: null
    fp16: false
    hysteresis: 2
    initial_loss_scale: 4294967296
    log_num_zeros_in_grad: false
    loss_scale: null
    loss_scale_window: 1000
    lr: 0.0001
    min_loss_scale: 1.0
    min_lr: null
    optimizer: adam
    overlap_param_gather_with_optimizer_step: false
    params_dtype:
      _call_: false
      _target_: torch.float32
    sgd_momentum: 0.9
    timers: null
    use_distributed_optimizer: true
    weight_decay: 0.01
  lr_mult: 1.0
  lr_scheduler: null
  no_weight_decay_cond: null
  scale_lr_cond: null

# Tokenizer (HuggingFace AutoTokenizer over local GPT-2 vocab/merges artifacts).
tokenizer:
  _target_: nemo.collections.common.tokenizers.huggingface.auto_tokenizer.AutoTokenizer
  bos_token: null
  cls_token: null
  eos_token: null
  mask_token: null
  merges_file: megatron-gpt-345m_merges
  pad_token: null
  pretrained_model_name: gpt2
  sep_token: null
  trust_remote_code: false
  unk_token: null
  use_fast: false
  vocab_file: megatron-gpt-345m_vocab