JoyboyGo committed on
Commit
8023e65
·
verified ·
1 Parent(s): ce4aea4

Upload folder using huggingface_hub

Browse files
Files changed (21) hide show
  1. .gitattributes +17 -0
  2. grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/.metadata +3 -0
  3. grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__0_0.distcp +3 -0
  4. grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__0_1.distcp +3 -0
  5. grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__1_0.distcp +3 -0
  6. grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__1_1.distcp +3 -0
  7. grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__2_0.distcp +3 -0
  8. grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__2_1.distcp +3 -0
  9. grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__3_0.distcp +3 -0
  10. grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__3_1.distcp +3 -0
  11. grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__4_0.distcp +3 -0
  12. grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__4_1.distcp +3 -0
  13. grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__5_0.distcp +3 -0
  14. grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__5_1.distcp +3 -0
  15. grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__6_0.distcp +3 -0
  16. grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__6_1.distcp +3 -0
  17. grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__7_0.distcp +3 -0
  18. grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__7_1.distcp +3 -0
  19. grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/common.pt +3 -0
  20. grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/metadata.json +1 -0
  21. grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/modelopt_run_config.yaml +164 -0
.gitattributes CHANGED
@@ -50,3 +50,20 @@ grid_search/yulan-gdn-sft-1b-sl65536-lr3e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/_
50
  grid_search/yulan-gdn-sft-1b-sl65536-lr3e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__6_1.distcp filter=lfs diff=lfs merge=lfs -text
51
  grid_search/yulan-gdn-sft-1b-sl65536-lr3e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
52
  grid_search/yulan-gdn-sft-1b-sl65536-lr3e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__7_1.distcp filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  grid_search/yulan-gdn-sft-1b-sl65536-lr3e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__6_1.distcp filter=lfs diff=lfs merge=lfs -text
51
  grid_search/yulan-gdn-sft-1b-sl65536-lr3e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
52
  grid_search/yulan-gdn-sft-1b-sl65536-lr3e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__7_1.distcp filter=lfs diff=lfs merge=lfs -text
53
+ grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/.metadata filter=lfs diff=lfs merge=lfs -text
54
+ grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
55
+ grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__0_1.distcp filter=lfs diff=lfs merge=lfs -text
56
+ grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
57
+ grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__1_1.distcp filter=lfs diff=lfs merge=lfs -text
58
+ grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
59
+ grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__2_1.distcp filter=lfs diff=lfs merge=lfs -text
60
+ grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
61
+ grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__3_1.distcp filter=lfs diff=lfs merge=lfs -text
62
+ grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__4_0.distcp filter=lfs diff=lfs merge=lfs -text
63
+ grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__4_1.distcp filter=lfs diff=lfs merge=lfs -text
64
+ grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__5_0.distcp filter=lfs diff=lfs merge=lfs -text
65
+ grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__5_1.distcp filter=lfs diff=lfs merge=lfs -text
66
+ grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__6_0.distcp filter=lfs diff=lfs merge=lfs -text
67
+ grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__6_1.distcp filter=lfs diff=lfs merge=lfs -text
68
+ grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
69
+ grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__7_1.distcp filter=lfs diff=lfs merge=lfs -text
grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/.metadata ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18480e05cc6f833f11a7270c8a3568781de2cfa0228d23e7d9788f835bd39b78
3
+ size 2969444
grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e091f6abc3ce9ee329eefca2462115e82b46765dc43fae84b7cdbf566c916563
3
+ size 2338811157
grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__0_1.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d7ebfcde024ad10abd0cb6e7d6f3372b1a8eaa5e6919bc49244af84174d892f
3
+ size 2338795644
grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8680b6eeca19ec10a9407cc03afb84883466ad67bd4ca809ecf14ba9fe039b98
3
+ size 2338090987
grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__1_1.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ce745e4cc6bc87dab2d41391a003750c7bc67f78ea48a248b866944e384cfad
3
+ size 2338066012
grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__2_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9142f99d236420d90288335fbc780b01904eb0c6b7dc44df168cbdc9b341f8e
3
+ size 2338583696
grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__2_1.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:56cd5225be795af9c655aec0b0c717d9961cfaa92853024114a9b1e7192213d6
3
+ size 2338589940
grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__3_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63fc61aa44691a44df3f35b3fd086e28bbf33851587edfe7deb14b7cf6ec366a
3
+ size 2338078692
grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__3_1.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c196c0e25ad2afd48344a1eb05b3fd7ce6254ba497d71773e9c45ad855ce478e
3
+ size 2338067589
grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__4_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c97f408ee0ca22d6f8ae07f1f53e3d22e382f6646d4f15bb406d778fc4a0b1c
3
+ size 2338591581
grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__4_1.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:662717fabb44237e08af2d3ce92cc88eb0863fa330641363a745904289a4a7e4
3
+ size 2338607287
grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__5_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76f9bbe83e7646809b6d5d965ac2a8259eacef45b383933e677673337801a095
3
+ size 2338107014
grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__5_1.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2168246ba1c32acb3e4a047356655d3be882393c91c972e8787303e87060c9c
3
+ size 2338070743
grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__6_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f72578e87bb53d6a0517af917e73d39fafbe424fe5b42c523452eb49b944c1d2
3
+ size 2338591581
grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__6_1.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b88f2f92fdb4ac223f6440266238100d71d2c6f7dd74d184e49d438d156e5cae
3
+ size 2338610441
grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__7_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f66d4a0e4428a6b6a1c57a63d5369d9bedcbbbe10cc90f65e087d0ed7ead828e
3
+ size 2338092821
grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/__7_1.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1b79c13ebab926c0aaf1fec9f65922e3434b36d29d906f2a239435b7d0475e9
3
+ size 2338089667
grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/common.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f687d9d9a89d2450c02bdb473a45a670241bda25eb19e7b9d6fb9b6c77d96743
3
+ size 141799
grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/metadata.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"sharded_backend": "torch_dist", "sharded_backend_version": 1, "common_backend": "torch", "common_backend_version": 1}
grid_search/yulan-gdn-sft-1b-sl65536-lr5e-6-gbs16-mb1-tp2-pp1-cp4/iter_0000953/modelopt_run_config.yaml ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ activation_func: <function silu at 0x7f4ac4d33250>
2
+ activation_func_clamp_value: None
3
+ add_bias_linear: false
4
+ add_qkv_bias: true
5
+ apply_query_key_layer_scaling: false
6
+ apply_residual_connection_post_layernorm: false
7
+ apply_rope_fusion: true
8
+ attention_backend: AttnBackend.auto
9
+ attention_dropout: '0.1'
10
+ attention_output_gate: false
11
+ attention_softmax_in_fp32: false
12
+ attn_k_token_shift: None
13
+ attn_output_gate: None
14
+ attn_q_token_shift: None
15
+ attn_token_shift: None
16
+ attn_v_token_shift: None
17
+ autocast_dtype: torch.bfloat16
18
+ barrier_with_L1_time: true
19
+ bf16: true
20
+ bias_activation_fusion: true
21
+ bias_dropout_fusion: true
22
+ calculate_per_token_loss: false
23
+ clone_scatter_output_in_embedding: true
24
+ config_logger_dir: ''
25
+ cross_entropy_fusion_impl: native
26
+ cross_entropy_loss_fusion: false
27
+ defer_embedding_wgrad_compute: false
28
+ delay_wgrad_compute: false
29
+ deterministic_mode: false
30
+ disable_bf16_reduced_precision_matmul: false
31
+ disable_parameter_transpose_cache: false
32
+ distribute_saved_activations: false
33
+ emb_deviation_loss_coeff: 0
34
+ emb_deviation_type: None
35
+ enable_autocast: false
36
+ ffn_hidden_size: 4800
37
+ ffn_intermediate_token_shift: None
38
+ ffn_token_shift: None
39
+ finalize_model_grads_func: <function finalize_model_grads at 0x7f49d90239a0>
40
+ fine_grained_activation_offloading: false
41
+ first_last_layers_bf16: false
42
+ flash_decode: false
43
+ fp16: false
44
+ fp32_residual_connection: false
45
+ freeze_layernorm_weight: false
46
+ fused_single_qkv_rope: false
47
+ gated_linear_unit: true
48
+ glu_linear_offset: '0.0'
49
+ grad_scale_func: <bound method MegatronOptimizer.scale_loss of <megatron.core.optimizer.optimizer.ChainedOptimizer
50
+ object at 0x7f490c4f21d0>>
51
+ grad_sync_func: None
52
+ gradient_accumulation_fusion: true
53
+ hetereogenous_dist_checkpoint: false
54
+ heterogeneous_block_specs: false
55
+ hidden_dropout: '0.1'
56
+ hidden_size: 1920
57
+ is_hybrid_model: false
58
+ kv_channels: 64
59
+ layernorm_epsilon: 1e-06
60
+ layernorm_zero_centered_gamma: false
61
+ linear_attention_freq: '[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
62
+ 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
63
+ 0, 1, 0, 0, 1, 1, 1, 1, 1, 1]'
64
+ linear_attention_type: gated_delta_net
65
+ linear_conv_kernel_dim: 4
66
+ linear_key_head_dim: 64
67
+ linear_num_key_heads: 8
68
+ linear_num_value_heads: 32
69
+ linear_value_head_dim: 64
70
+ log_hidden_states: '[]'
71
+ log_params: '[]'
72
+ mamba_disable_cp: false
73
+ mamba_expand: 2
74
+ mamba_head_dim: 64
75
+ mamba_num_groups: 8
76
+ mamba_num_heads: None
77
+ mamba_state_dim: 128
78
+ masked_softmax_fusion: false
79
+ memory_efficient_layer_norm: false
80
+ min_offloaded_tensor_size: 1048576
81
+ mlp_chunks_for_prefill: 1
82
+ moe_apply_probs_on_input: false
83
+ moe_aux_loss_coeff: '0.0'
84
+ moe_deepep_num_sms: 20
85
+ moe_enable_deepep: false
86
+ moe_expert_capacity_factor: None
87
+ moe_extended_tp: false
88
+ moe_ffn_hidden_size: None
89
+ moe_flex_dispatcher_backend: deepep
90
+ moe_grouped_gemm: false
91
+ moe_hybridep_num_sms: 16
92
+ moe_input_jitter_eps: None
93
+ moe_layer_freq: 1
94
+ moe_pad_expert_input_to_capacity: false
95
+ moe_per_layer_logging: false
96
+ moe_permute_fusion: false
97
+ moe_router_bias_update_method: sign
98
+ moe_router_bias_update_rate: '0.001'
99
+ moe_router_dtype: None
100
+ moe_router_enable_expert_bias: false
101
+ moe_router_force_load_balancing: false
102
+ moe_router_fusion: false
103
+ moe_router_group_topk: None
104
+ moe_router_load_balancing_type: aux_loss
105
+ moe_router_num_groups: None
106
+ moe_router_padding_for_quantization: false
107
+ moe_router_pre_softmax: false
108
+ moe_router_score_function: softmax
109
+ moe_router_topk: 2
110
+ moe_router_topk_limited_devices: None
111
+ moe_router_topk_scaling_factor: None
112
+ moe_shared_expert_gate: false
113
+ moe_shared_expert_intermediate_size: None
114
+ moe_shared_expert_overlap: false
115
+ moe_token_dispatcher_type: allgather
116
+ moe_token_drop_policy: probs
117
+ moe_token_dropping: false
118
+ moe_use_legacy_grouped_gemm: false
119
+ moe_z_loss_coeff: None
120
+ mrope_section: None
121
+ multi_latent_attention: false
122
+ no_rope_freq: None
123
+ no_sync_func: None
124
+ normalization: RMSNorm
125
+ num_attention_heads: 30
126
+ num_layers: 56
127
+ num_layers_at_end_in_bf16: 1
128
+ num_layers_at_start_in_bf16: 1
129
+ num_moe_experts: None
130
+ num_query_groups: 6
131
+ nvidia_modelopt_version: 0.39.0
132
+ offload_modules: '[]'
133
+ param_sync_func: None
134
+ params_dtype: torch.bfloat16
135
+ perform_initialization: true
136
+ persist_layer_norm: true
137
+ qk_l2_norm: false
138
+ qk_layernorm: false
139
+ quant_recipe: None
140
+ reparam_keys: None
141
+ rotary_interleaved: false
142
+ softmax_scale: None
143
+ softmax_type: vanilla
144
+ spectral_mup_init: false
145
+ split_expert_init: true
146
+ split_fc1_init: true
147
+ split_qkv_init: true
148
+ symmetric_ar_type: None
149
+ test_mode: false
150
+ timers: <megatron.core.timers.Timers object at 0x7f494d44bd60>
151
+ token_shift_conv_init: default
152
+ token_shift_conv_size: 4
153
+ transformer_impl: transformer_engine
154
+ use_fused_weighted_squared_relu: false
155
+ use_kitchen: false
156
+ use_mamba_mem_eff_path: true
157
+ use_ring_exchange_p2p: false
158
+ use_te_activation_func: false
159
+ use_te_rng_tracker: false
160
+ variable_seq_lengths: false
161
+ wgrad_deferral_limit: 0
162
+ window_attn_skip_freq: None
163
+ window_size: None
164
+ word_embedding_dropout_prob: '0.0'