IvanHU committed on
Commit
5575579
·
verified ·
1 Parent(s): 4324d0d

Upload folder using huggingface_hub

Browse files
Files changed (47) hide show
  1. .gitattributes +8 -0
  2. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/config.json +56 -0
  3. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/delta_net_1B.json +29 -0
  4. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/delta_net_340M.json +26 -0
  5. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/gated_deltanet_1B.json +22 -0
  6. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/gated_deltanet_340M.json +22 -0
  7. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/gdn_6_1_340M.json +50 -0
  8. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/gdn_6_1_340M_bf16.json +50 -0
  9. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/gdn_6_nsa_1_340M.json +53 -0
  10. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/gla_340M.json +24 -0
  11. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/gla_7B.json +25 -0
  12. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/gsa_340M.json +29 -0
  13. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/hgrn2_340M.json +20 -0
  14. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/mamba2_1B.json +32 -0
  15. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/mamba2_340M.json +32 -0
  16. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/mamba2_6_1_340M.json +50 -0
  17. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/mamba_1B.json +30 -0
  18. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/mamba_340M.json +30 -0
  19. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/samba_1B.json +52 -0
  20. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/sba_340m.json +18 -0
  21. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/transformer_1B.json +22 -0
  22. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/transformer_340M.json +18 -0
  23. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/transformer_7B.json +21 -0
  24. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/generation_config.json +6 -0
  25. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/0/stderr.log +3 -0
  26. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/0/stdout.log +0 -0
  27. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/1/stderr.log +3 -0
  28. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/1/stdout.log +0 -0
  29. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/2/stderr.log +3 -0
  30. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/2/stdout.log +0 -0
  31. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/3/stderr.log +3 -0
  32. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/3/stdout.log +0 -0
  33. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/4/stderr.log +3 -0
  34. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/4/stdout.log +0 -0
  35. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/5/stderr.log +3 -0
  36. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/5/stdout.log +0 -0
  37. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/6/stderr.log +3 -0
  38. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/6/stdout.log +0 -0
  39. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/7/stderr.log +3 -0
  40. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/7/stdout.log +0 -0
  41. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/model.safetensors +3 -0
  42. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/special_tokens_map.json +23 -0
  43. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/tb/20250725-0038/events.out.tfevents.1753375106.TENCENT64.site.571145.0 +3 -0
  44. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/tokenizer.json +0 -0
  45. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/tokenizer_config.json +44 -0
  46. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/train-100B.sh +75 -0
  47. bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/train.sh +122 -0
.gitattributes CHANGED
@@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/0/stderr.log filter=lfs diff=lfs merge=lfs -text
37
+ bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/1/stderr.log filter=lfs diff=lfs merge=lfs -text
38
+ bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/2/stderr.log filter=lfs diff=lfs merge=lfs -text
39
+ bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/3/stderr.log filter=lfs diff=lfs merge=lfs -text
40
+ bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/4/stderr.log filter=lfs diff=lfs merge=lfs -text
41
+ bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/5/stderr.log filter=lfs diff=lfs merge=lfs -text
42
+ bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/6/stderr.log filter=lfs diff=lfs merge=lfs -text
43
+ bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/7/stderr.log filter=lfs diff=lfs merge=lfs -text
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "allow_neg_eigval": false,
3
+ "architectures": [
4
+ "GatedDeltaNetForCausalLM"
5
+ ],
6
+ "attn": {
7
+ "block_counts": 16,
8
+ "block_size": 64,
9
+ "layers": [
10
+ 5,
11
+ 11,
12
+ 17,
13
+ 23
14
+ ],
15
+ "num_heads": 32,
16
+ "num_kv_heads": 2,
17
+ "qkv_bias": false,
18
+ "rope_theta": 160000.0,
19
+ "type": "nsa",
20
+ "window_size": 512
21
+ },
22
+ "attn_mode": "chunk",
23
+ "bos_token_id": 1,
24
+ "conv_size": 4,
25
+ "eos_token_id": 2,
26
+ "expand_k": 1,
27
+ "expand_v": 1,
28
+ "fuse_cross_entropy": true,
29
+ "fuse_norm": true,
30
+ "fuse_swiglu": true,
31
+ "head_dim": 256,
32
+ "hidden_act": "swish",
33
+ "hidden_ratio": 4,
34
+ "hidden_size": 1024,
35
+ "initializer_range": 0.02,
36
+ "intermediate_size": null,
37
+ "max_position_embeddings": 8192,
38
+ "model_type": "gated_deltanet",
39
+ "norm_eps": 1e-06,
40
+ "norm_first": false,
41
+ "num_heads": 4,
42
+ "num_hidden_layers": 24,
43
+ "num_v_heads": null,
44
+ "qk_activation": "silu",
45
+ "qk_norm": "l2",
46
+ "tie_word_embeddings": false,
47
+ "torch_dtype": "bfloat16",
48
+ "transformers_version": "4.53.3",
49
+ "use_beta": true,
50
+ "use_cache": true,
51
+ "use_gate": true,
52
+ "use_l2warp": false,
53
+ "use_output_norm": true,
54
+ "use_short_conv": true,
55
+ "vocab_size": 32000
56
+ }
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/delta_net_1B.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn": null,
3
+ "attn_mode": "chunk",
4
+ "bos_token_id": 1,
5
+ "conv_size": 4,
6
+ "eos_token_id": 2,
7
+ "expand_k": 1,
8
+ "expand_v": 1,
9
+ "fuse_cross_entropy": true,
10
+ "fuse_norm": true,
11
+ "hidden_act": "swish",
12
+ "hidden_ratio": 4,
13
+ "hidden_size": 2048,
14
+ "initializer_range": 0.02,
15
+ "intermediate_size": null,
16
+ "model_type": "delta_net",
17
+ "norm_eps": 1e-06,
18
+ "num_heads": 16,
19
+ "num_hidden_layers": 24,
20
+ "pad_token_id": 2,
21
+ "qk_activation": "silu",
22
+ "qk_norm": "l2",
23
+ "tie_word_embeddings": false,
24
+ "use_beta": true,
25
+ "use_cache": true,
26
+ "use_gate": false,
27
+ "use_output_norm": true,
28
+ "use_short_conv": true
29
+ }
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/delta_net_340M.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn_mode": "chunk",
3
+ "bos_token_id": 1,
4
+ "conv_size": 4,
5
+ "eos_token_id": 2,
6
+ "expand_k": 1,
7
+ "expand_v": 1,
8
+ "fuse_cross_entropy": true,
9
+ "hidden_act": "swish",
10
+ "hidden_ratio": 4,
11
+ "hidden_size": 1024,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": null,
14
+ "model_type": "delta_net",
15
+ "norm_eps": 1e-06,
16
+ "num_heads": 8,
17
+ "num_hidden_layers": 24,
18
+ "qk_activation": "silu",
19
+ "qk_norm": "l2",
20
+ "tie_word_embeddings": false,
21
+ "use_beta": true,
22
+ "use_cache": true,
23
+ "use_gate": false,
24
+ "use_output_norm": true,
25
+ "use_short_conv": true
26
+ }
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/gated_deltanet_1B.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn_mode": "chunk",
3
+ "bos_token_id": 1,
4
+ "conv_size": 4,
5
+ "eos_token_id": 2,
6
+ "expand_v": 2,
7
+ "fuse_cross_entropy": true,
8
+ "head_dim": 256,
9
+ "hidden_act": "swish",
10
+ "hidden_ratio": 4,
11
+ "hidden_size": 2048,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": null,
14
+ "model_type": "gated_deltanet",
15
+ "norm_eps": 1e-06,
16
+ "num_heads": 6,
17
+ "num_hidden_layers": 21,
18
+ "tie_word_embeddings": false,
19
+ "use_cache": true,
20
+ "use_gate": true,
21
+ "use_short_conv": true
22
+ }
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/gated_deltanet_340M.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn_mode": "chunk",
3
+ "bos_token_id": 1,
4
+ "conv_size": 4,
5
+ "eos_token_id": 2,
6
+ "expand_v": 2,
7
+ "fuse_cross_entropy": true,
8
+ "head_dim": 256,
9
+ "hidden_act": "swish",
10
+ "hidden_ratio": 4,
11
+ "hidden_size": 1024,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": null,
14
+ "model_type": "gated_deltanet",
15
+ "norm_eps": 1e-06,
16
+ "num_heads": 6,
17
+ "num_hidden_layers": 21,
18
+ "tie_word_embeddings": false,
19
+ "use_cache": true,
20
+ "use_gate": true,
21
+ "use_short_conv": true
22
+ }
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/gdn_6_1_340M.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "GatedDeltaNetForCausalLM"
4
+ ],
5
+ "attn": {
6
+ "layers": [
7
+ 5,
8
+ 11,
9
+ 17,
10
+ 23
11
+ ],
12
+ "num_heads": 16,
13
+ "num_kv_heads": 8,
14
+ "qkv_bias": false,
15
+ "rope_theta": 160000.0,
16
+ "window_size": null
17
+ },
18
+ "attn_mode": "chunk",
19
+ "bos_token_id": 1,
20
+ "conv_size": 4,
21
+ "eos_token_id": 2,
22
+ "expand_k": 1,
23
+ "expand_v": 1,
24
+ "fuse_cross_entropy": true,
25
+ "fuse_norm": true,
26
+ "fuse_swiglu": true,
27
+ "head_dim": 256,
28
+ "hidden_act": "swish",
29
+ "hidden_ratio": 4,
30
+ "hidden_size": 1024,
31
+ "initializer_range": 0.02,
32
+ "intermediate_size": null,
33
+ "max_position_embeddings": 8192,
34
+ "model_type": "gated_deltanet",
35
+ "norm_eps": 1e-06,
36
+ "norm_first": false,
37
+ "num_heads": 4,
38
+ "num_hidden_layers": 24,
39
+ "qk_activation": "silu",
40
+ "qk_norm": "l2",
41
+ "tie_word_embeddings": false,
42
+ "torch_dtype": "float32",
43
+ "transformers_version": "4.51.3",
44
+ "use_beta": true,
45
+ "use_cache": true,
46
+ "use_gate": true,
47
+ "use_output_norm": true,
48
+ "use_short_conv": true,
49
+ "vocab_size": 32000
50
+ }
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/gdn_6_1_340M_bf16.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "GatedDeltaNetForCausalLM"
4
+ ],
5
+ "attn": {
6
+ "layers": [
7
+ 5,
8
+ 11,
9
+ 17,
10
+ 23
11
+ ],
12
+ "num_heads": 16,
13
+ "num_kv_heads": 8,
14
+ "qkv_bias": false,
15
+ "rope_theta": 160000.0,
16
+ "window_size": null
17
+ },
18
+ "attn_mode": "chunk",
19
+ "bos_token_id": 1,
20
+ "conv_size": 4,
21
+ "eos_token_id": 2,
22
+ "expand_k": 1,
23
+ "expand_v": 1,
24
+ "fuse_cross_entropy": true,
25
+ "fuse_norm": true,
26
+ "fuse_swiglu": true,
27
+ "head_dim": 256,
28
+ "hidden_act": "swish",
29
+ "hidden_ratio": 4,
30
+ "hidden_size": 1024,
31
+ "initializer_range": 0.02,
32
+ "intermediate_size": null,
33
+ "max_position_embeddings": 8192,
34
+ "model_type": "gated_deltanet",
35
+ "norm_eps": 1e-06,
36
+ "norm_first": false,
37
+ "num_heads": 4,
38
+ "num_hidden_layers": 24,
39
+ "qk_activation": "silu",
40
+ "qk_norm": "l2",
41
+ "tie_word_embeddings": false,
42
+ "torch_dtype": "bfloat16",
43
+ "transformers_version": "4.51.3",
44
+ "use_beta": true,
45
+ "use_cache": true,
46
+ "use_gate": true,
47
+ "use_output_norm": true,
48
+ "use_short_conv": true,
49
+ "vocab_size": 32000
50
+ }
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/gdn_6_nsa_1_340M.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "GatedDeltaNetForCausalLM"
4
+ ],
5
+ "attn": {
6
+ "layers": [
7
+ 5,
8
+ 11,
9
+ 17,
10
+ 23
11
+ ],
12
+ "num_heads": 32,
13
+ "num_kv_heads": 2,
14
+ "qkv_bias": false,
15
+ "rope_theta": 160000.0,
16
+ "type": "nsa",
17
+ "block_size": 64,
18
+ "block_counts": 16,
19
+ "window_size": 512
20
+ },
21
+ "attn_mode": "chunk",
22
+ "bos_token_id": 1,
23
+ "conv_size": 4,
24
+ "eos_token_id": 2,
25
+ "expand_k": 1,
26
+ "expand_v": 1,
27
+ "fuse_cross_entropy": true,
28
+ "fuse_norm": true,
29
+ "fuse_swiglu": true,
30
+ "head_dim": 256,
31
+ "hidden_act": "swish",
32
+ "hidden_ratio": 4,
33
+ "hidden_size": 1024,
34
+ "initializer_range": 0.02,
35
+ "intermediate_size": null,
36
+ "max_position_embeddings": 8192,
37
+ "model_type": "gated_deltanet",
38
+ "norm_eps": 1e-06,
39
+ "norm_first": false,
40
+ "num_heads": 4,
41
+ "num_hidden_layers": 24,
42
+ "qk_activation": "silu",
43
+ "qk_norm": "l2",
44
+ "tie_word_embeddings": false,
45
+ "torch_dtype": "bfloat16",
46
+ "transformers_version": "4.51.3",
47
+ "use_beta": true,
48
+ "use_cache": true,
49
+ "use_gate": true,
50
+ "use_output_norm": true,
51
+ "use_short_conv": true,
52
+ "vocab_size": 32000
53
+ }
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/gla_340M.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn_mode": "chunk",
3
+ "bos_token_id": 1,
4
+ "clamp_min": null,
5
+ "eos_token_id": 2,
6
+ "expand_k": 0.5,
7
+ "expand_v": 1,
8
+ "fuse_cross_entropy": true,
9
+ "fuse_norm": true,
10
+ "hidden_act": "swish",
11
+ "hidden_ratio": 4,
12
+ "hidden_size": 1024,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": null,
15
+ "model_type": "gla",
16
+ "num_heads": 4,
17
+ "num_hidden_layers": 24,
18
+ "norm_eps": 1e-06,
19
+ "tie_word_embeddings": false,
20
+ "use_cache": true,
21
+ "use_gk": true,
22
+ "use_gv": false,
23
+ "vocab_size": 32000
24
+ }
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/gla_7B.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn": null,
3
+ "attn_mode": "chunk",
4
+ "bos_token_id": 1,
5
+ "eos_token_id": 2,
6
+ "expand_k": 0.5,
7
+ "expand_v": 1,
8
+ "fuse_cross_entropy": true,
9
+ "fuse_norm": true,
10
+ "hidden_act": "swish",
11
+ "hidden_ratio": 4,
12
+ "hidden_size": 4096,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 11008,
15
+ "model_type": "gla",
16
+ "norm_eps": 1e-06,
17
+ "num_heads": 16,
18
+ "num_hidden_layers": 32,
19
+ "tie_word_embeddings": false,
20
+ "use_cache": true,
21
+ "use_gk": true,
22
+ "use_gv": false,
23
+ "use_output_gate": true,
24
+ "use_short_conv": false
25
+ }
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/gsa_340M.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "conv_size": 4,
4
+ "eos_token_id": 2,
5
+ "expand_k": 1,
6
+ "expand_v": 1,
7
+ "elementwise_affine": false,
8
+ "feature_map": "swish",
9
+ "fuse_cross_entropy": true,
10
+ "fuse_norm": true,
11
+ "gate_logit_normalizer": 4,
12
+ "hidden_act": "swish",
13
+ "hidden_ratio": 4,
14
+ "hidden_size": 1024,
15
+ "initializer_range": 0.02,
16
+ "intermediate_size": null,
17
+ "model_type": "gsa",
18
+ "num_heads": 4,
19
+ "num_hidden_layers": 24,
20
+ "num_slots": 64,
21
+ "norm_eps": 1e-06,
22
+ "share_conv_kernel": true,
23
+ "tie_word_embeddings": false,
24
+ "use_cache": true,
25
+ "use_norm": true,
26
+ "use_output_gate": true,
27
+ "use_rope": false,
28
+ "use_short_conv": false
29
+ }
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/hgrn2_340M.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn_mode": "chunk",
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "expand_ratio": 128,
6
+ "fuse_cross_entropy": true,
7
+ "fuse_norm": true,
8
+ "hidden_act": "swish",
9
+ "hidden_ratio": 4,
10
+ "hidden_size": 1024,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": null,
13
+ "model_type": "hgrn2",
14
+ "num_heads": 8,
15
+ "num_hidden_layers": 24,
16
+ "norm_eps": 1e-06,
17
+ "tie_word_embeddings": false,
18
+ "use_cache": true,
19
+ "vocab_size": 32000
20
+ }
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/mamba2_1B.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "chunk_size": 256,
4
+ "conv_kernel": 4,
5
+ "eos_token_id": 2,
6
+ "expand": 2,
7
+ "fuse_cross_entropy": true,
8
+ "fuse_norm": true,
9
+ "head_dim": 64,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 2048,
12
+ "initializer_range": 0.02,
13
+ "norm_eps": 1e-05,
14
+ "model_type": "mamba2",
15
+ "n_groups": 1,
16
+ "num_hidden_layers": 48,
17
+ "pad_token_id": 0,
18
+ "rescale_prenorm_residual": true,
19
+ "residual_in_fp32": true,
20
+ "rms_norm": true,
21
+ "state_size": 128,
22
+ "tie_word_embeddings": false,
23
+ "time_step_floor": 0.0001,
24
+ "time_step_max": 0.1,
25
+ "time_step_min": 0.001,
26
+ "time_step_rank": 128,
27
+ "transformers_version": "4.50.1",
28
+ "use_bias": false,
29
+ "use_cache": true,
30
+ "use_conv_bias": true,
31
+ "vocab_size": 32000
32
+ }
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/mamba2_340M.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "chunk_size": 256,
4
+ "conv_kernel": 4,
5
+ "eos_token_id": 2,
6
+ "expand": 2,
7
+ "fuse_cross_entropy": true,
8
+ "fuse_norm": true,
9
+ "head_dim": 64,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 1024,
12
+ "initializer_range": 0.02,
13
+ "norm_eps": 1e-05,
14
+ "model_type": "mamba2",
15
+ "n_groups": 1,
16
+ "num_hidden_layers": 48,
17
+ "pad_token_id": 0,
18
+ "rescale_prenorm_residual": true,
19
+ "residual_in_fp32": true,
20
+ "rms_norm": true,
21
+ "state_size": 128,
22
+ "tie_word_embeddings": false,
23
+ "time_step_floor": 0.0001,
24
+ "time_step_max": 0.1,
25
+ "time_step_min": 0.001,
26
+ "time_step_rank": 128,
27
+ "transformers_version": "4.50.1",
28
+ "use_bias": false,
29
+ "use_cache": true,
30
+ "use_conv_bias": true,
31
+ "vocab_size": 32000
32
+ }
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/mamba2_6_1_340M.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Mamba2ForCausalLM"
4
+ ],
5
+ "attn": {
6
+ "layers": [
7
+ 5,
8
+ 11,
9
+ 17,
10
+ 23
11
+ ],
12
+ "num_heads": 16,
13
+ "num_kv_heads": 8,
14
+ "qkv_bias": false,
15
+ "rope_theta": 160000.0,
16
+ "window_size": null
17
+ },
18
+ "attn_mode": "chunk",
19
+ "bos_token_id": 1,
20
+ "chunk_size": 256,
21
+ "conv_kernel": 4,
22
+ "eos_token_id": 2,
23
+ "expand": 2,
24
+ "fuse_cross_entropy": true,
25
+ "fuse_norm": true,
26
+ "fuse_swiglu": true,
27
+ "head_dim": 64,
28
+ "hidden_act": "silu",
29
+ "hidden_size": 1024,
30
+ "initializer_range": 0.02,
31
+ "norm_eps": 1e-05,
32
+ "model_type": "mamba2",
33
+ "n_groups": 1,
34
+ "num_hidden_layers": 48,
35
+ "pad_token_id": 0,
36
+ "rescale_prenorm_residual": true,
37
+ "residual_in_fp32": true,
38
+ "rms_norm": true,
39
+ "state_size": 128,
40
+ "tie_word_embeddings": false,
41
+ "time_step_floor": 0.0001,
42
+ "time_step_max": 0.1,
43
+ "time_step_min": 0.001,
44
+ "time_step_rank": 128,
45
+ "transformers_version": "4.50.1",
46
+ "use_bias": false,
47
+ "use_cache": true,
48
+ "use_conv_bias": true,
49
+ "vocab_size": 32000
50
+ }
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/mamba_1B.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "conv_kernel": 4,
4
+ "eos_token_id": 2,
5
+ "expand": 2,
6
+ "fuse_cross_entropy": true,
7
+ "fuse_norm": true,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 2048,
10
+ "initializer_range": 0.02,
11
+ "model_type": "mamba",
12
+ "norm_eps": 1e-05,
13
+ "num_hidden_layers": 48,
14
+ "pad_token_id": 0,
15
+ "rescale_prenorm_residual": false,
16
+ "residual_in_fp32": false,
17
+ "state_size": 16,
18
+ "tie_word_embeddings": false,
19
+ "time_step_floor": 0.0001,
20
+ "time_step_init_scheme": "random",
21
+ "time_step_max": 0.1,
22
+ "time_step_min": 0.001,
23
+ "time_step_rank": 128,
24
+ "time_step_scale": 1.0,
25
+ "transformers_version": "4.50.1",
26
+ "use_bias": false,
27
+ "use_cache": true,
28
+ "use_conv_bias": true,
29
+ "vocab_size": 32000
30
+ }
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/mamba_340M.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "conv_kernel": 4,
4
+ "eos_token_id": 2,
5
+ "expand": 2,
6
+ "fuse_cross_entropy": true,
7
+ "fuse_norm": true,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 1024,
10
+ "initializer_range": 0.02,
11
+ "model_type": "mamba",
12
+ "norm_eps": 1e-05,
13
+ "num_hidden_layers": 48,
14
+ "pad_token_id": 0,
15
+ "rescale_prenorm_residual": false,
16
+ "residual_in_fp32": false,
17
+ "state_size": 16,
18
+ "tie_word_embeddings": false,
19
+ "time_step_floor": 0.0001,
20
+ "time_step_init_scheme": "random",
21
+ "time_step_max": 0.1,
22
+ "time_step_min": 0.001,
23
+ "time_step_rank": 128,
24
+ "time_step_scale": 1.0,
25
+ "transformers_version": "4.50.1",
26
+ "use_bias": false,
27
+ "use_cache": true,
28
+ "use_conv_bias": true,
29
+ "vocab_size": 32000
30
+ }
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/samba_1B.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn": {
3
+ "layers": [
4
+ 1,
5
+ 3,
6
+ 5,
7
+ 7,
8
+ 9,
9
+ 11,
10
+ 13,
11
+ 15,
12
+ 17
13
+ ],
14
+ "num_heads": 18,
15
+ "num_kv_heads": 18,
16
+ "qkv_bias": false,
17
+ "rope_theta": 10000.0,
18
+ "window_size": 2048
19
+ },
20
+ "bos_token_id": 1,
21
+ "conv_kernel": 4,
22
+ "eos_token_id": 2,
23
+ "expand": 2,
24
+ "fuse_cross_entropy": true,
25
+ "fuse_norm": true,
26
+ "fuse_swiglu": true,
27
+ "hidden_act": "swish",
28
+ "hidden_ratio": 4,
29
+ "hidden_size": 2304,
30
+ "initializer_range": 0.02,
31
+ "intermediate_size": 4608,
32
+ "max_position_embeddings": 2048,
33
+ "model_type": "samba",
34
+ "norm_eps": 1e-05,
35
+ "num_hidden_layers": 18,
36
+ "pad_token_id": 0,
37
+ "rescale_prenorm_residual": false,
38
+ "residual_in_fp32": false,
39
+ "state_size": 16,
40
+ "tie_word_embeddings": false,
41
+ "time_step_floor": 0.0001,
42
+ "time_step_init_scheme": "random",
43
+ "time_step_max": 0.1,
44
+ "time_step_min": 0.001,
45
+ "time_step_rank": 144,
46
+ "time_step_scale": 1.0,
47
+ "transformers_version": "4.50.1",
48
+ "use_bias": false,
49
+ "use_cache": true,
50
+ "use_conv_bias": true,
51
+ "vocab_size": 32000
52
+ }
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/sba_340m.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attention_bias": false,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "fuse_cross_entropy": true,
6
+ "fuse_norm": true,
7
+ "hidden_act": "swish",
8
+ "hidden_size": 1024,
9
+ "initializer_range": 0.006,
10
+ "max_position_embeddings": 8192,
11
+ "model_type": "sba",
12
+ "num_heads": 16,
13
+ "num_hidden_layers": 24,
14
+ "norm_eps": 1e-06,
15
+ "tie_word_embeddings": false,
16
+ "use_cache": true,
17
+ "vocab_size": 32000
18
+ }
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/transformer_1B.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "elementwise_affine": true,
4
+ "eos_token_id": 2,
5
+ "fuse_cross_entropy": true,
6
+ "fuse_norm": true,
7
+ "fuse_swiglu": true,
8
+ "hidden_act": "swish",
9
+ "hidden_ratio": 4,
10
+ "hidden_size": 2048,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": null,
13
+ "max_position_embeddings": 8192,
14
+ "model_type": "transformer",
15
+ "norm_eps": 1e-06,
16
+ "num_heads": 32,
17
+ "num_hidden_layers": 24,
18
+ "num_kv_heads": null,
19
+ "pad_token_id": 2,
20
+ "rope_theta": 10000.0,
21
+ "tie_word_embeddings": false
22
+ }
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/transformer_340M.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attention_bias": false,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "fuse_cross_entropy": true,
6
+ "fuse_norm": true,
7
+ "hidden_act": "swish",
8
+ "hidden_size": 1024,
9
+ "initializer_range": 0.02,
10
+ "max_position_embeddings": 8192,
11
+ "model_type": "transformer",
12
+ "num_heads": 16,
13
+ "num_hidden_layers": 24,
14
+ "norm_eps": 1e-06,
15
+ "tie_word_embeddings": false,
16
+ "use_cache": true,
17
+ "vocab_size": 32000
18
+ }
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/transformer_7B.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attention_bias": false,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "fuse_cross_entropy": true,
6
+ "fuse_norm": true,
7
+ "hidden_act": "swish",
8
+ "hidden_ratio": 4,
9
+ "hidden_size": 4096,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 14336,
12
+ "model_type": "transformer",
13
+ "norm_eps": 1e-06,
14
+ "num_heads": 32,
15
+ "num_hidden_layers": 32,
16
+ "num_kv_heads": 8,
17
+ "rope_theta": 10000.0,
18
+ "tie_word_embeddings": false,
19
+ "use_cache": true,
20
+ "window_size": null
21
+ }
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "transformers_version": "4.53.3"
6
+ }
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/0/stderr.log ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06b2f6330ddf184c1c80c2f59f612121485c80ebbf69dcbc11879129b304e1ab
3
+ size 29157733
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/0/stdout.log ADDED
File without changes
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/1/stderr.log ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58f843f2ccc558b3d0b29c636c5562d3582d245b587a08f8bf41a592ef855330
3
+ size 29484305
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/1/stdout.log ADDED
File without changes
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/2/stderr.log ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b917eb02d448e7ecfa235dda14a1ac7d1457502bba5edc10055333c9636abf6f
3
+ size 29204398
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/2/stdout.log ADDED
File without changes
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/3/stderr.log ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a06ef22cf0fd8f6d4e34bd977a7c695ee9947baf30b2ffb92482662a16ee30bb
3
+ size 29158112
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/3/stdout.log ADDED
File without changes
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/4/stderr.log ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c226e9e2381b37eec7dcaa8ba32aee339915d9facfe20807df9d5ef0f4267f1
3
+ size 29157286
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/4/stdout.log ADDED
File without changes
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/5/stderr.log ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9870b9cb0ec740b2aa0d486d3b886cef15acca589f53719877561c98102205d
3
+ size 29157286
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/5/stdout.log ADDED
File without changes
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/6/stderr.log ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2020c549f94418bf1777f49033d7403593283de010efd55993d44aacb92b0be
3
+ size 29157287
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/6/stdout.log ADDED
File without changes
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/7/stderr.log ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a0e84a4f9a44ad4d6caf21aebab0cb8f94216f2d0f7f43998c6529d6498323c
3
+ size 29157280
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/7/stdout.log ADDED
File without changes
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d377a720ee61c482b9f39965350b0e5a2b14207e354a9186ee50f446f6c1e83f
3
+ size 793434792
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "unk_token": {
17
+ "content": "<unk>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/tb/20250725-0038/events.out.tfevents.1753375106.TENCENT64.site.571145.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f2ff7f8ae724e9fec53bc77745a35b7a570bc28e95f322bce14595aa109b6e16
3
+ size 97469568
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "additional_special_tokens": [],
32
+ "bos_token": "<s>",
33
+ "clean_up_tokenization_spaces": false,
34
+ "eos_token": "</s>",
35
+ "extra_special_tokens": {},
36
+ "legacy": true,
37
+ "model_max_length": 1000000000000000019884624838656,
38
+ "pad_token": null,
39
+ "sp_model_kwargs": {},
40
+ "spaces_between_special_tokens": false,
41
+ "tokenizer_class": "LlamaTokenizerFast",
42
+ "unk_token": "<unk>",
43
+ "use_default_system_prompt": false
44
+ }
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/train-100B.sh ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Launcher for the long-context pretraining run. It derives an experiment
# name from the settings below and forwards everything to train.sh, which
# performs the actual torchrun launch.
#
# Usage:
#   bash train-100B.sh [CONFIG]
#
#   CONFIG  model config file name, resolved under $FLAME_PATH/configs
#           (default: transformer_340M.json)

FLAME_PATH=/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame
DATASET_ROOT=/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset
TOKENIZER=/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer

# Everything below uses paths relative to the flame checkout (.venv, train.sh),
# so stop immediately if the checkout is not reachable instead of running from
# whatever directory the caller happened to be in.
cd "$FLAME_PATH" || {
  echo "cannot cd to $FLAME_PATH" >&2
  exit 1
}
source .venv/bin/activate

# =========== train config ===========
CONFIG=${1:-transformer_340M.json}
SEQ_LEN=65536
WARMUP_STEPS=100
# NOTE(review): 95366 steps x 65536 tokens x BATCH_SIZE x GAS x NGPU is ~100B
# tokens, presumably the origin of the file name; this assumes BATCH_SIZE is
# per-GPU -- confirm against flame.train.
STEPS=95366
LR=3e-4
BATCH_SIZE=1
GAS=2
DECAY_TYPE=linear
DECAY_RATIO=1
# train.sh reads NNODE / NGPU / LOG_RANK from its environment and falls back to
# its own defaults when they are missing. It is started as a child process
# (`bash train.sh`), so the values must be exported to take effect there.
export NNODE=1
export NGPU=8
export LOG_RANK=0
# Kept as an array so each extra option reaches train.sh as its own word.
EXTRA_ARGS=(--training.mixed_precision_param bfloat16)
EXTRA_NAME="bf16"
# ====================================

# if jq command is not found, install it (train.sh pipes the model config through jq)
if ! command -v jq &> /dev/null; then
  echo "jq could not be found, installing it..."
  sudo yum install -y jq
fi

export WANDB_ERROR_REPORTING=False

# A non-empty EXTRA_NAME becomes a "<name>-" prefix of the experiment name.
if [[ -n "$EXTRA_NAME" ]]; then
  EXTRA_NAME="${EXTRA_NAME}-"
fi

# NOTE: the sed expression only removes a literal ".config", so a ".json"
# suffix stays in the experiment name. This is left untouched on purpose:
# the name determines the dump folder, and changing it would make
# `--checkpoint.load_step -1` look for checkpoints in a different directory.
EXP_NAME=${EXTRA_NAME}$(basename -- "$CONFIG" | sed 's/\.config//')-ctx${SEQ_LEN}-steps${STEPS}-lr${LR}-decay_type${DECAY_TYPE}-decay_ratio${DECAY_RATIO}-bs${BATCH_SIZE}-nn${NNODE}-gas${GAS}

bash train.sh \
  --job.config_file flame/models/fla.toml \
  --job.dump_folder "$FLAME_PATH/exp/$EXP_NAME" \
  --model.config "$FLAME_PATH/configs/$CONFIG" \
  --model.tokenizer_path "$TOKENIZER" \
  --optimizer.name AdamW \
  --optimizer.eps 1e-8 \
  --optimizer.lr "$LR" \
  --lr_scheduler.warmup_steps "$WARMUP_STEPS" \
  --lr_scheduler.lr_min 0.01 \
  --lr_scheduler.decay_type "$DECAY_TYPE" \
  --lr_scheduler.decay_ratio "$DECAY_RATIO" \
  --training.batch_size "$BATCH_SIZE" \
  --training.seq_len "$SEQ_LEN" \
  --training.context_len "$SEQ_LEN" \
  --training.gradient_accumulation_steps "$GAS" \
  --training.steps "$STEPS" \
  --training.max_norm 1.0 \
  --training.skip_nan_inf \
  --training.dataset "$DATASET_ROOT/fineweb-edu-sample,$DATASET_ROOT/small_repos_20B_sample_merged,$DATASET_ROOT/megamath-web-pro" \
  --training.data_probs 0.55,0.3,0.15 \
  --training.dataset_split train,train,train \
  --training.dataset_name default,default,default \
  --training.streaming \
  --training.num_workers 32 \
  --training.prefetch_factor 2 \
  --training.seed 42 \
  --training.compile \
  --checkpoint.interval 8192 \
  --checkpoint.load_step -1 \
  --checkpoint.keep_latest_k 100 \
  --checkpoint.export_dtype bfloat16 \
  --metrics.log_freq 1 \
  --metrics.enable_tensorboard \
  "${EXTRA_ARGS[@]}"
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/train.sh ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/bash
#
# Launch `flame.train` through torchrun, snapshot the code/configs next to the
# run, and convert the final DCP checkpoint to Hugging Face format.
#
# Every command-line argument is forwarded unchanged to `-m flame.train`.
# A few of them (--job.dump_folder, --training.steps, --model.config,
# --model.tokenizer_path) are additionally parsed here to drive the snapshot,
# the run naming and the final conversion.
#
# Environment variables (all optional):
#   NNODE, NGPU, LOG_RANK        torchrun topology / log filter (1, 8, 0)
#   CUDA_VISIBLE_DEVICES         GPUs to expose (0,1,2,3,4,5,6,7)
#   MASTER_ADDR, MASTER_PORT     rendezvous endpoint (localhost, 0)
#   WANDB_PROJECT, WANDB_NAME, WANDB_RUN_ID
#   date                         timestamp suffix used in the W&B run id
#
# Usage:
#
#   bash train.sh -h
#
# Training a 340M model:
#
#   NNODE=1 NGPU=8 LOG_RANK=0 bash train.sh \
#     --job.config_file flame/models/fla.toml \
#     --job.dump_folder exp/transformer-340M-10B/batch32.seqlen2048.warmup1024.update1.steps20480.lr3e-4 \
#     --model.config configs/transformer_340M.json \
#     --model.tokenizer_path fla-hub/transformer-1.3B-100B \
#     --optimizer.name AdamW \
#     --optimizer.eps 1e-15 \
#     --optimizer.lr 3e-4 \
#     --lr_scheduler.warmup_steps 1024 \
#     --lr_scheduler.lr_min 0.1 \
#     --lr_scheduler.decay_type cosine \
#     --training.batch_size 32 \
#     --training.seq_len 2048 \
#     --training.gradient_accumulation_steps 1 \
#     --training.steps 20480 \
#     --training.max_norm 1.0 \
#     --training.skip_nan_inf \
#     --training.dataset HuggingFaceFW/fineweb-edu \
#     --training.dataset_name default \
#     --training.dataset_split train \
#     --training.streaming \
#     --training.num_workers 32 \
#     --training.prefetch_factor 2 \
#     --training.seed 42 \
#     --training.compile \
#     --training.tensor_parallel_degree 1 \
#     --training.disable_loss_parallel \
#     --checkpoint.interval 2048 \
#     --checkpoint.load_step -1 \
#     --metrics.log_freq 1

# Space-joined copy of the arguments, used only for the grep-based option
# lookup below. The real argument vector ("$@") is what torchrun receives.
params="$*"

# use envs as local params for convenience
# e.g.
# NNODE=1 NGPU=8 LOG_RANK=0 ./train.sh
NNODE=${NNODE:-"1"}
NGPU=${NGPU:-"8"}
LOG_RANK=${LOG_RANK:-0}
# Exported so the setting actually reaches the torchrun workers; a value
# supplied by the caller wins over the 8-GPU default.
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}"

export MASTER_ADDR="${MASTER_ADDR:-localhost}"
export MASTER_PORT="${MASTER_PORT:-0}"

echo "Launching training..."

set -x
# Pull the value that follows each option out of the joined argument string.
path=$(grep -oP '(?<=--job.dump_folder )[^ ]+' <<< "$params")
steps=$(grep -oP '(?<=--training.steps )[^ ]+' <<< "$params")
config=$(grep -oP '(?<=--model.config )[^ ]+' <<< "$params")
tokenizer=$(grep -oP '(?<=--model.tokenizer_path )[^ ]+' <<< "$params")
# model_type of the config, used as the prefix of the W&B run name.
model=$(
  python -c "import fla, sys; from transformers import AutoConfig; print(AutoConfig.from_pretrained(sys.argv[1]).to_json_string())" "$config" | jq -r '.model_type'
)

# Snapshot scripts, configs and library sources next to the run outputs.
# Without a dump folder (e.g. `train.sh -h`) there is no destination; an
# unguarded `cp *.sh $path` would then treat the last script as the target
# and overwrite it, so the snapshot is skipped instead.
if [[ -n "$path" ]]; then
  mkdir -p -- "$path"
  cp -- *.sh "$path"
  cp -r -- configs "$path"
  cp -r -- flame "$path"
  cp -r -- 3rdparty/flash-linear-attention/fla "$path"
  cp -r -- 3rdparty/torchtitan/torchtitan "$path"
else
  echo "warning: --job.dump_folder not given; skipping source snapshot" >&2
fi

# for offline systems
# export TRANSFORMERS_OFFLINE=1
# export HF_DATASETS_OFFLINE=1
# export HF_HUB_OFFLINE=1
if [[ -z "$date" ]]; then
  date=$(date +%Y%m%d%H%M)
fi
RUN_NAME="$model-$(basename -- "$path")"
RUN_ID="$RUN_NAME-$date"

export WANDB_RESUME=allow
export WANDB_PROJECT="${WANDB_PROJECT:-fla}"
export WANDB_NAME="${WANDB_NAME:-$RUN_NAME}"
export WANDB_RUN_ID="${WANDB_RUN_ID:-$RUN_ID}"

PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True" \
torchrun --nnodes="${NNODE}" \
  --nproc_per_node="${NGPU}" \
  --rdzv_backend c10d \
  --rdzv_endpoint "${MASTER_ADDR}:${MASTER_PORT}" \
  --local-ranks-filter "${LOG_RANK}" \
  --role rank \
  --tee 3 \
  --log-dir "$path/logs" \
  -m flame.train \
  "$@"
train_status=$?

# Do not claim success or try to convert a checkpoint after a failed launch.
if (( train_status != 0 )); then
  echo "torchrun exited with status ${train_status}; skipping checkpoint conversion" >&2
  exit "$train_status"
fi

echo "TRAINING DONE!"

# The conversion needs both values; they are absent for invocations such as
# `train.sh -h`, in which case there is nothing to convert.
if [[ -z "$path" || -z "$steps" ]]; then
  echo "--job.dump_folder or --training.steps not given; skipping checkpoint conversion" >&2
  exit 0
fi

echo "Converting the DCP checkpoints to HF format..."

python -m flame.utils.convert_dcp_to_hf \
  --path "$path" \
  --step "$steps" \
  --config "$config" \
  --tokenizer "$tokenizer"
convert_status=$?
if (( convert_status != 0 )); then
  echo "checkpoint conversion failed with status ${convert_status}" >&2
  exit "$convert_status"
fi

echo "RUNNING DONE!"