File size: 1,549 Bytes
3d83373 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
{
"encoder_config": {
"d_model": 1024,
"d_intermediate": 0,
"n_layer": 24,
"vocab_size": 5000,
"max_position_embeddings": 4096,
"ssm_cfg": {
"layer": "Mamba2"
},
"attn_layer_idx": [
6,
18
],
"attn_cfg": {
"causal": false,
"d_conv": 0,
"head_dim": 64,
"num_heads": 16,
"num_heads_kv": 8,
"out_proj_bias": false,
"qkv_proj_bias": false,
"rotary_emb_dim": 64
},
"rms_norm": true,
"residual_in_fp32": true,
"fused_add_norm": true,
"pad_vocab_size_multiple": 8,
"tie_embeddings": false
},
"decoder_config": {
"d_model": 1024,
"d_intermediate": 0,
"n_layer": 24,
"vocab_size": 5000,
"max_position_embeddings": 4096,
"ssm_cfg": {
"layer": "Mamba2"
},
"attn_layer_idx": [
6,
18
],
"attn_cfg": {
"causal": true,
"d_conv": 0,
"head_dim": 64,
"num_heads": 16,
"num_heads_kv": 8,
"out_proj_bias": false,
"qkv_proj_bias": false,
"rotary_emb_dim": 64
},
"rms_norm": true,
"residual_in_fp32": true,
"fused_add_norm": true,
"pad_vocab_size_multiple": 8,
"tie_embeddings": false
},
"tie_word_embeddings": true,
"seed": 0
}