ayrnb
/

megatron_nemo

NeMo

Model card Files Files and versions

xet

Community

ayrnb committed on Feb 27, 2024

Commit

1ebceda

1 Parent(s): bcdd41e

fix

Browse files

Files changed (2) hide show

hparams.yaml +118 -0
megatron_gpt_te_false_bf16.nemo +3 -0

hparams.yaml ADDED Viewed

	@@ -0,0 +1,118 @@

+cfg:
+  micro_batch_size: 4
+  global_batch_size: 32
+  rampup_batch_size: null
+  context_parallel_size: 1
+  tensor_model_parallel_size: 1
+  pipeline_model_parallel_size: 1
+  virtual_pipeline_model_parallel_size: null
+  resume_from_checkpoint: null
+  encoder_seq_length: 2048
+  max_position_embeddings: 2048
+  num_layers: 24
+  hidden_size: 4096
+  ffn_hidden_size: 16384
+  num_attention_heads: 32
+  init_method_std: 0.01
+  hidden_dropout: 0.1
+  attention_dropout: 0.1
+  kv_channels: null
+  apply_query_key_layer_scaling: true
+  layernorm_epsilon: 1.0e-05
+  make_vocab_size_divisible_by: 128
+  pre_process: true
+  post_process: true
+  persist_layer_norm: true
+  gradient_as_bucket_view: true
+  grad_div_ar_fusion: true
+  gradient_accumulation_fusion: true
+  bias_activation_fusion: true
+  bias_dropout_add_fusion: true
+  masked_softmax_fusion: true
+  activations_checkpoint_granularity: null
+  activations_checkpoint_method: null
+  activations_checkpoint_num_layers: null
+  num_micro_batches_with_partial_activation_checkpoints: null
+  activations_checkpoint_layers_per_pipeline: null
+  fsdp: false
+  fsdp_sharding_strategy: full
+  fsdp_grad_reduce_dtype: 32
+  fsdp_sharded_checkpoint: false
+  sequence_parallel: false
+  overlap_p2p_comm: false
+  batch_p2p_comm: true
+  num_query_groups: null
+  tokenizer:
+    library: megatron
+    type: GPT2BPETokenizer
+    model: null
+    delimiter: null
+    vocab_file: /gpt3_dataset//bpe/vocab.json
+    merge_file: /gpt3_dataset//bpe/merges.txt
+  native_amp_init_scale: 4294967296
+  native_amp_growth_interval: 1000
+  hysteresis: 2
+  fp32_residual_connection: false
+  fp16_lm_cross_entropy: false
+  megatron_amp_O2: true
+  grad_allreduce_chunk_size_mb: 125
+  sharp: false
+  mcore_gpt: true
+  transformer_engine: false
+  fp8: false
+  fp8_e4m3: false
+  fp8_hybrid: true
+  fp8_margin: 0
+  fp8_interval: 1
+  fp8_amax_history_len: 1024
+  fp8_amax_compute_algo: max
+  fp8_wgrad: true
+  ub_tp_comm_overlap: false
+  tp_comm_atomic_ag: false
+  tp_comm_atomic_rs: false
+  seed: 1234
+  sync_batch_comm: false
+  use_cpu_initialization: false
+  onnx_safe: false
+  apex_transformer_log_level: 30
+  nsys_profile:
+    enabled: false
+    trace:
+    - nvtx
+    - cuda
+    start_step: 10
+    end_step: 10
+    ranks:
+    - 0
+    gen_shape: false
+  optim:
+    name: distributed_fused_adam
+    bucket_cap_mb: 400
+    overlap_grad_sync: true
+    overlap_param_sync: true
+    contiguous_grad_buffer: true
+    lr: 0.00016
+    weight_decay: 0.1
+    betas:
+    - 0.9
+    - 0.95
+    sched:
+      name: CosineAnnealing
+      warmup_steps: 115
+      constant_steps: 12500
+      min_lr: 1.6e-05
+  data:
+    data_impl: mmap
+    splits_string: 99990,8,2
+    seq_length: 2048
+    skip_warmup: true
+    num_workers: 2
+    dataloader_type: single
+    reset_position_ids: false
+    reset_attention_mask: false
+    eod_mask_loss: false
+    index_mapping_dir: null
+    data_prefix:
+    - 0.0333
+    - /gpt3_dataset/wiki_text_document
+  precision: bf16-mixed

megatron_gpt_te_false_bf16.nemo ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:82cf10a38537f7335cfc30fd4e2b5048111a4499db46e27446dd27a45fee93b0
+size 2825287680