ZhenbinWang commited on
Commit
9fb98d3
·
verified ·
1 Parent(s): 805d830

Upload 25 files

Browse files
configs/delta_net_1B.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn": null,
3
+ "attn_mode": "chunk",
4
+ "bos_token_id": 1,
5
+ "conv_size": 4,
6
+ "eos_token_id": 2,
7
+ "expand_k": 1,
8
+ "expand_v": 1,
9
+ "fuse_cross_entropy": true,
10
+ "fuse_norm": true,
11
+ "hidden_act": "swish",
12
+ "hidden_ratio": 4,
13
+ "hidden_size": 2048,
14
+ "initializer_range": 0.02,
15
+ "intermediate_size": null,
16
+ "model_type": "delta_net",
17
+ "norm_eps": 1e-06,
18
+ "num_heads": 16,
19
+ "num_hidden_layers": 24,
20
+ "pad_token_id": 2,
21
+ "qk_activation": "silu",
22
+ "qk_norm": "l2",
23
+ "tie_word_embeddings": false,
24
+ "use_beta": true,
25
+ "use_cache": true,
26
+ "use_gate": false,
27
+ "use_output_norm": true,
28
+ "use_short_conv": true
29
+ }
configs/delta_net_340M.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn_mode": "chunk",
3
+ "bos_token_id": 1,
4
+ "conv_size": 4,
5
+ "eos_token_id": 2,
6
+ "expand_k": 1,
7
+ "expand_v": 1,
8
+ "fuse_cross_entropy": true,
9
+ "hidden_act": "swish",
10
+ "hidden_ratio": 4,
11
+ "hidden_size": 1024,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": null,
14
+ "model_type": "delta_net",
15
+ "norm_eps": 1e-06,
16
+ "num_heads": 8,
17
+ "num_hidden_layers": 24,
18
+ "qk_activation": "silu",
19
+ "qk_norm": "l2",
20
+ "tie_word_embeddings": false,
21
+ "use_beta": true,
22
+ "use_cache": true,
23
+ "use_gate": false,
24
+ "use_output_norm": true,
25
+ "use_short_conv": true
26
+ }
configs/gated_deltanet_1B.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn_mode": "chunk",
3
+ "bos_token_id": 1,
4
+ "conv_size": 4,
5
+ "eos_token_id": 2,
6
+ "expand_v": 2,
7
+ "fuse_cross_entropy": true,
8
+ "head_dim": 256,
9
+ "hidden_act": "swish",
10
+ "hidden_ratio": 4,
11
+ "hidden_size": 2048,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": null,
14
+ "model_type": "gated_deltanet",
15
+ "norm_eps": 1e-06,
16
+ "num_heads": 6,
17
+ "num_hidden_layers": 21,
18
+ "tie_word_embeddings": false,
19
+ "use_cache": true,
20
+ "use_gate": true,
21
+ "use_short_conv": true
22
+ }
configs/gated_deltanet_340M.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn_mode": "chunk",
3
+ "bos_token_id": 1,
4
+ "conv_size": 4,
5
+ "eos_token_id": 2,
6
+ "expand_v": 2,
7
+ "fuse_cross_entropy": true,
8
+ "head_dim": 256,
9
+ "hidden_act": "swish",
10
+ "hidden_ratio": 4,
11
+ "hidden_size": 1024,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": null,
14
+ "model_type": "gated_deltanet",
15
+ "norm_eps": 1e-06,
16
+ "num_heads": 6,
17
+ "num_hidden_layers": 21,
18
+ "tie_word_embeddings": false,
19
+ "use_cache": true,
20
+ "use_gate": true,
21
+ "use_short_conv": true
22
+ }
configs/gla_340M.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn_mode": "chunk",
3
+ "bos_token_id": 1,
4
+ "clamp_min": null,
5
+ "eos_token_id": 2,
6
+ "expand_k": 0.5,
7
+ "expand_v": 1,
8
+ "fuse_cross_entropy": true,
9
+ "fuse_norm": true,
10
+ "hidden_act": "swish",
11
+ "hidden_ratio": 4,
12
+ "hidden_size": 1024,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": null,
15
+ "model_type": "gla",
16
+ "num_heads": 4,
17
+ "num_hidden_layers": 24,
18
+ "norm_eps": 1e-06,
19
+ "tie_word_embeddings": false,
20
+ "use_cache": true,
21
+ "use_gk": true,
22
+ "use_gv": false,
23
+ "vocab_size": 32000
24
+ }
configs/gla_7B.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn": null,
3
+ "attn_mode": "chunk",
4
+ "bos_token_id": 1,
5
+ "eos_token_id": 2,
6
+ "expand_k": 0.5,
7
+ "expand_v": 1,
8
+ "fuse_cross_entropy": true,
9
+ "fuse_norm": true,
10
+ "hidden_act": "swish",
11
+ "hidden_ratio": 4,
12
+ "hidden_size": 4096,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 11008,
15
+ "model_type": "gla",
16
+ "norm_eps": 1e-06,
17
+ "num_heads": 16,
18
+ "num_hidden_layers": 32,
19
+ "tie_word_embeddings": false,
20
+ "use_cache": true,
21
+ "use_gk": true,
22
+ "use_gv": false,
23
+ "use_output_gate": true,
24
+ "use_short_conv": false
25
+ }
configs/gsa_1B.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "conv_size": 4,
4
+ "eos_token_id": 2,
5
+ "expand_k": 1,
6
+ "expand_v": 1,
7
+ "elementwise_affine": false,
8
+ "feature_map": "swish",
9
+ "fuse_cross_entropy": true,
10
+ "fuse_norm": true,
11
+ "gate_logit_normalizer": 8,
12
+ "hidden_act": "swish",
13
+ "hidden_ratio": 4,
14
+ "hidden_size": 2048,
15
+ "initializer_range": 0.02,
16
+ "intermediate_size": null,
17
+ "model_type": "gsa",
18
+ "num_heads": 4,
19
+ "num_hidden_layers": 24,
20
+ "num_slots": 64,
21
+ "norm_eps": 1e-06,
22
+ "share_conv_kernel": true,
23
+ "tie_word_embeddings": false,
24
+ "use_cache": true,
25
+ "use_norm": true,
26
+ "use_output_gate": true,
27
+ "use_rope": false,
28
+ "use_short_conv": false
29
+ }
configs/gsa_340M.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "conv_size": 4,
4
+ "eos_token_id": 2,
5
+ "expand_k": 1,
6
+ "expand_v": 1,
7
+ "elementwise_affine": false,
8
+ "feature_map": "swish",
9
+ "fuse_cross_entropy": true,
10
+ "fuse_norm": true,
11
+ "gate_logit_normalizer": 4,
12
+ "hidden_act": "swish",
13
+ "hidden_ratio": 4,
14
+ "hidden_size": 1024,
15
+ "initializer_range": 0.02,
16
+ "intermediate_size": null,
17
+ "model_type": "gsa",
18
+ "num_heads": 4,
19
+ "num_hidden_layers": 24,
20
+ "num_slots": 64,
21
+ "norm_eps": 1e-06,
22
+ "share_conv_kernel": true,
23
+ "tie_word_embeddings": false,
24
+ "use_cache": true,
25
+ "use_norm": true,
26
+ "use_output_gate": true,
27
+ "use_rope": false,
28
+ "use_short_conv": false
29
+ }
configs/hamilton_340M.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn_mode": "chunk",
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "fuse_cross_entropy": true,
6
+ "fuse_norm": true,
7
+ "hidden_act": "swish",
8
+ "hidden_ratio": 4,
9
+ "hidden_size": 1024,
10
+ "max_position_embeddings": 8192,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": null,
13
+ "model_type": "hamilton",
14
+ "num_hidden_layers": 20,
15
+ "P_rank": 1,
16
+ "use_mlp": true,
17
+ "norm_eps": 1e-06,
18
+ "tie_word_embeddings": false,
19
+ "use_cache": true,
20
+ "vocab_size": 32000,
21
+ "expand_spatial": 4,
22
+ "conv_size_spatial": 4,
23
+ "conv_size_temporal": 16,
24
+ "conv_bias_spatial": true,
25
+ "conv_bias_temporal": true,
26
+ "head_dim": 64,
27
+ "num_heads": 16,
28
+ "mnu_bias": true,
29
+ "dropout": 0.0,
30
+ "decay_weight": 4.0
31
+ }
configs/hamilton_340M_v2.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn_mode": "chunk",
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "fuse_cross_entropy": true,
6
+ "fuse_norm": true,
7
+ "hidden_act": "swish",
8
+ "hidden_ratio": 2,
9
+ "hidden_size": 1024,
10
+ "max_position_embeddings": 8192,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": null,
13
+ "model_type": "hamilton",
14
+ "num_hidden_layers": 18,
15
+ "P_rank": 1,
16
+ "use_mlp": true,
17
+ "norm_eps": 1e-06,
18
+ "tie_word_embeddings": false,
19
+ "use_cache": true,
20
+ "vocab_size": 32000,
21
+ "expand_spatial": 4,
22
+ "conv_size_spatial": 4,
23
+ "conv_size_temporal": 8,
24
+ "conv_bias_spatial": true,
25
+ "conv_bias_temporal": true,
26
+ "head_dim": 128,
27
+ "num_heads": 16,
28
+ "mnu_bias": true,
29
+ "dropout": 0.0,
30
+ "decay_weight": 4.0
31
+ }
configs/hamilton_340M_v3.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn_mode": "chunk",
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "fuse_cross_entropy": true,
6
+ "fuse_norm": true,
7
+ "hidden_act": "swish",
8
+ "hidden_ratio": 2.5,
9
+ "hidden_size": 1024,
10
+ "max_position_embeddings": 8192,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": null,
13
+ "model_type": "hamilton",
14
+ "num_hidden_layers": 24,
15
+ "P_rank": 2,
16
+ "use_mlp": true,
17
+ "norm_eps": 1e-06,
18
+ "tie_word_embeddings": false,
19
+ "use_cache": true,
20
+ "vocab_size": 32000,
21
+ "expand_spatial": 4,
22
+ "conv_size_spatial": 4,
23
+ "conv_size_temporal": 8,
24
+ "conv_bias_spatial": true,
25
+ "conv_bias_temporal": true,
26
+ "head_dim": 64,
27
+ "num_heads": 16,
28
+ "mnu_bias": true,
29
+ "dropout": 0.0,
30
+ "decay_weight": 4.0
31
+ }
configs/hamilton_340M_v4.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn_mode": "chunk",
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "fuse_cross_entropy": true,
6
+ "fuse_norm": true,
7
+ "hidden_act": "swish",
8
+ "hidden_ratio": 8,
9
+ "hidden_size": 1024,
10
+ "max_position_embeddings": 8192,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": null,
13
+ "model_type": "hamilton",
14
+ "num_hidden_layers": 13,
15
+ "P_rank": 1,
16
+ "use_mlp": true,
17
+ "norm_eps": 1e-06,
18
+ "tie_word_embeddings": false,
19
+ "use_cache": true,
20
+ "vocab_size": 32000,
21
+ "expand_spatial": 4,
22
+ "conv_size_spatial": 4,
23
+ "conv_size_temporal": 16,
24
+ "conv_bias_spatial": true,
25
+ "conv_bias_temporal": true,
26
+ "head_dim": 64,
27
+ "num_heads": 14,
28
+ "mnu_bias": true,
29
+ "dropout": 0.0,
30
+ "decay_weight": 4.0
31
+ }
configs/hamilton_340M_v5.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn_mode": "chunk",
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "fuse_cross_entropy": true,
6
+ "fuse_norm": true,
7
+ "hidden_act": "swish",
8
+ "hidden_ratio": 4,
9
+ "hidden_size": 1024,
10
+ "max_position_embeddings": 8192,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": null,
13
+ "model_type": "hamilton",
14
+ "num_hidden_layers": 25,
15
+ "P_rank": 1,
16
+ "use_mlp": true,
17
+ "norm_eps": 1e-06,
18
+ "tie_word_embeddings": false,
19
+ "use_cache": true,
20
+ "vocab_size": 32000,
21
+ "expand_spatial": 4,
22
+ "conv_size_spatial": 4,
23
+ "conv_size_temporal": 8,
24
+ "conv_bias_spatial": true,
25
+ "conv_bias_temporal": true,
26
+ "head_dim": 64,
27
+ "num_heads": 8,
28
+ "mnu_bias": true,
29
+ "dropout": 0.0,
30
+ "decay_weight": 4.0
31
+ }
configs/hamilton_340M_v6.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn_mode": "chunk",
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "fuse_cross_entropy": true,
6
+ "fuse_norm": true,
7
+ "hidden_act": "swish",
8
+ "hidden_ratio": 4,
9
+ "hidden_size": 1024,
10
+ "max_position_embeddings": 8192,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": null,
13
+ "model_type": "hamilton",
14
+ "num_hidden_layers": 25,
15
+ "P_rank": 1,
16
+ "use_mlp": false,
17
+ "norm_eps": 1e-06,
18
+ "tie_word_embeddings": false,
19
+ "use_cache": true,
20
+ "vocab_size": 32000,
21
+ "expand_spatial": 4,
22
+ "conv_size_spatial": 4,
23
+ "conv_size_temporal": 8,
24
+ "conv_bias_spatial": true,
25
+ "conv_bias_temporal": true,
26
+ "head_dim": 128,
27
+ "num_heads": 16,
28
+ "mnu_bias": true,
29
+ "dropout": 0.0,
30
+ "decay_weight": 4.0
31
+ }
configs/hamilton_340M_v7.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn_mode": "chunk",
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "fuse_cross_entropy": true,
6
+ "fuse_norm": true,
7
+ "hidden_act": "swish",
8
+ "hidden_ratio": 3,
9
+ "hidden_size": 1024,
10
+ "max_position_embeddings": 8192,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": null,
13
+ "model_type": "hamilton",
14
+ "num_hidden_layers": 24,
15
+ "P_rank": 1,
16
+ "use_mlp": true,
17
+ "norm_eps": 1e-06,
18
+ "tie_word_embeddings": false,
19
+ "use_cache": true,
20
+ "vocab_size": 32000,
21
+ "expand_spatial": 4,
22
+ "conv_size_spatial": 4,
23
+ "conv_size_temporal": 16,
24
+ "conv_bias_spatial": true,
25
+ "conv_bias_temporal": true,
26
+ "head_dim": 64,
27
+ "num_heads": 16,
28
+ "mnu_bias": true,
29
+ "dropout": 0.0,
30
+ "decay_weight": 4.0
31
+ }
configs/hgrn2_340M.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn_mode": "chunk",
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "expand_ratio": 128,
6
+ "fuse_cross_entropy": true,
7
+ "fuse_norm": true,
8
+ "hidden_act": "swish",
9
+ "hidden_ratio": 4,
10
+ "hidden_size": 1024,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": null,
13
+ "model_type": "hgrn2",
14
+ "num_heads": 8,
15
+ "num_hidden_layers": 24,
16
+ "norm_eps": 1e-06,
17
+ "tie_word_embeddings": false,
18
+ "use_cache": true,
19
+ "vocab_size": 32000
20
+ }
configs/mamba2_1B.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "chunk_size": 256,
4
+ "conv_kernel": 4,
5
+ "eos_token_id": 2,
6
+ "expand": 2,
7
+ "fuse_cross_entropy": true,
8
+ "fuse_norm": true,
9
+ "head_dim": 64,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 2048,
12
+ "initializer_range": 0.02,
13
+ "norm_eps": 1e-05,
14
+ "model_type": "mamba2",
15
+ "n_groups": 1,
16
+ "num_hidden_layers": 48,
17
+ "pad_token_id": 0,
18
+ "rescale_prenorm_residual": true,
19
+ "residual_in_fp32": true,
20
+ "rms_norm": true,
21
+ "state_size": 128,
22
+ "tie_word_embeddings": false,
23
+ "time_step_floor": 0.0001,
24
+ "time_step_max": 0.1,
25
+ "time_step_min": 0.001,
26
+ "time_step_rank": 128,
27
+ "transformers_version": "4.50.1",
28
+ "use_bias": false,
29
+ "use_cache": true,
30
+ "use_conv_bias": true,
31
+ "vocab_size": 32000
32
+ }
configs/mamba2_340M.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "chunk_size": 256,
4
+ "conv_kernel": 4,
5
+ "eos_token_id": 2,
6
+ "expand": 2,
7
+ "fuse_cross_entropy": true,
8
+ "fuse_norm": true,
9
+ "head_dim": 64,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 1024,
12
+ "initializer_range": 0.02,
13
+ "norm_eps": 1e-05,
14
+ "model_type": "mamba2",
15
+ "n_groups": 1,
16
+ "num_hidden_layers": 48,
17
+ "pad_token_id": 0,
18
+ "rescale_prenorm_residual": true,
19
+ "residual_in_fp32": true,
20
+ "rms_norm": true,
21
+ "state_size": 128,
22
+ "tie_word_embeddings": false,
23
+ "time_step_floor": 0.0001,
24
+ "time_step_max": 0.1,
25
+ "time_step_min": 0.001,
26
+ "time_step_rank": 128,
27
+ "transformers_version": "4.50.1",
28
+ "use_bias": false,
29
+ "use_cache": true,
30
+ "use_conv_bias": true,
31
+ "vocab_size": 32000
32
+ }
configs/mamba_1B.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "conv_kernel": 4,
4
+ "eos_token_id": 2,
5
+ "expand": 2,
6
+ "fuse_cross_entropy": true,
7
+ "fuse_norm": true,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 2048,
10
+ "initializer_range": 0.02,
11
+ "model_type": "mamba",
12
+ "norm_eps": 1e-05,
13
+ "num_hidden_layers": 48,
14
+ "pad_token_id": 0,
15
+ "rescale_prenorm_residual": false,
16
+ "residual_in_fp32": false,
17
+ "state_size": 16,
18
+ "tie_word_embeddings": false,
19
+ "time_step_floor": 0.0001,
20
+ "time_step_init_scheme": "random",
21
+ "time_step_max": 0.1,
22
+ "time_step_min": 0.001,
23
+ "time_step_rank": 128,
24
+ "time_step_scale": 1.0,
25
+ "transformers_version": "4.50.1",
26
+ "use_bias": false,
27
+ "use_cache": true,
28
+ "use_conv_bias": true,
29
+ "vocab_size": 32000
30
+ }
configs/mamba_340M.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "conv_kernel": 4,
4
+ "eos_token_id": 2,
5
+ "expand": 2,
6
+ "fuse_cross_entropy": true,
7
+ "fuse_norm": true,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 1024,
10
+ "initializer_range": 0.02,
11
+ "model_type": "mamba",
12
+ "norm_eps": 1e-05,
13
+ "num_hidden_layers": 48,
14
+ "pad_token_id": 0,
15
+ "rescale_prenorm_residual": false,
16
+ "residual_in_fp32": false,
17
+ "state_size": 16,
18
+ "tie_word_embeddings": false,
19
+ "time_step_floor": 0.0001,
20
+ "time_step_init_scheme": "random",
21
+ "time_step_max": 0.1,
22
+ "time_step_min": 0.001,
23
+ "time_step_rank": 128,
24
+ "time_step_scale": 1.0,
25
+ "transformers_version": "4.50.1",
26
+ "use_bias": false,
27
+ "use_cache": true,
28
+ "use_conv_bias": true,
29
+ "vocab_size": 32000
30
+ }
configs/samba_1B.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn": {
3
+ "layers": [
4
+ 1,
5
+ 3,
6
+ 5,
7
+ 7,
8
+ 9,
9
+ 11,
10
+ 13,
11
+ 15,
12
+ 17
13
+ ],
14
+ "num_heads": 18,
15
+ "num_kv_heads": 18,
16
+ "qkv_bias": false,
17
+ "rope_theta": 10000.0,
18
+ "window_size": 2048
19
+ },
20
+ "bos_token_id": 1,
21
+ "conv_kernel": 4,
22
+ "eos_token_id": 2,
23
+ "expand": 2,
24
+ "fuse_cross_entropy": true,
25
+ "fuse_norm": true,
26
+ "fuse_swiglu": true,
27
+ "hidden_act": "swish",
28
+ "hidden_ratio": 4,
29
+ "hidden_size": 2304,
30
+ "initializer_range": 0.02,
31
+ "intermediate_size": 4608,
32
+ "max_position_embeddings": 2048,
33
+ "model_type": "samba",
34
+ "norm_eps": 1e-05,
35
+ "num_hidden_layers": 18,
36
+ "pad_token_id": 0,
37
+ "rescale_prenorm_residual": false,
38
+ "residual_in_fp32": false,
39
+ "state_size": 16,
40
+ "tie_word_embeddings": false,
41
+ "time_step_floor": 0.0001,
42
+ "time_step_init_scheme": "random",
43
+ "time_step_max": 0.1,
44
+ "time_step_min": 0.001,
45
+ "time_step_rank": 144,
46
+ "time_step_scale": 1.0,
47
+ "transformers_version": "4.50.1",
48
+ "use_bias": false,
49
+ "use_cache": true,
50
+ "use_conv_bias": true,
51
+ "vocab_size": 32000
52
+ }
configs/sba_340m.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attention_bias": false,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "fuse_cross_entropy": true,
6
+ "fuse_norm": true,
7
+ "hidden_act": "swish",
8
+ "hidden_size": 1024,
9
+ "initializer_range": 0.006,
10
+ "max_position_embeddings": 8192,
11
+ "model_type": "sba",
12
+ "num_heads": 16,
13
+ "num_hidden_layers": 24,
14
+ "norm_eps": 1e-06,
15
+ "tie_word_embeddings": false,
16
+ "use_cache": true,
17
+ "vocab_size": 32000
18
+ }
configs/transformer_1B.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "elementwise_affine": true,
4
+ "eos_token_id": 2,
5
+ "fuse_cross_entropy": true,
6
+ "fuse_norm": true,
7
+ "fuse_swiglu": true,
8
+ "hidden_act": "swish",
9
+ "hidden_ratio": 4,
10
+ "hidden_size": 2048,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": null,
13
+ "max_position_embeddings": 8192,
14
+ "model_type": "transformer",
15
+ "norm_eps": 1e-06,
16
+ "num_heads": 32,
17
+ "num_hidden_layers": 24,
18
+ "num_kv_heads": null,
19
+ "pad_token_id": 2,
20
+ "rope_theta": 10000.0,
21
+ "tie_word_embeddings": false
22
+ }
configs/transformer_340M.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attention_bias": false,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "fuse_cross_entropy": true,
6
+ "fuse_norm": true,
7
+ "hidden_act": "swish",
8
+ "hidden_size": 1024,
9
+ "initializer_range": 0.02,
10
+ "max_position_embeddings": 8192,
11
+ "model_type": "transformer",
12
+ "num_heads": 16,
13
+ "num_hidden_layers": 24,
14
+ "norm_eps": 1e-06,
15
+ "tie_word_embeddings": false,
16
+ "use_cache": true,
17
+ "vocab_size": 32000
18
+ }
configs/transformer_7B.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attention_bias": false,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "fuse_cross_entropy": true,
6
+ "fuse_norm": true,
7
+ "hidden_act": "swish",
8
+ "hidden_ratio": 4,
9
+ "hidden_size": 4096,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 14336,
12
+ "model_type": "transformer",
13
+ "norm_eps": 1e-06,
14
+ "num_heads": 32,
15
+ "num_hidden_layers": 32,
16
+ "num_kv_heads": 8,
17
+ "rope_theta": 10000.0,
18
+ "tie_word_embeddings": false,
19
+ "use_cache": true,
20
+ "window_size": null
21
+ }