Upload folder using huggingface_hub
Browse files- .gitattributes +8 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/config.json +56 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/delta_net_1B.json +29 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/delta_net_340M.json +26 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/gated_deltanet_1B.json +22 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/gated_deltanet_340M.json +22 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/gdn_6_1_340M.json +50 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/gdn_6_1_340M_bf16.json +50 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/gdn_6_nsa_1_340M.json +53 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/gla_340M.json +24 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/gla_7B.json +25 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/gsa_340M.json +29 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/hgrn2_340M.json +20 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/mamba2_1B.json +32 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/mamba2_340M.json +32 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/mamba2_6_1_340M.json +50 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/mamba_1B.json +30 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/mamba_340M.json +30 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/samba_1B.json +52 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/sba_340m.json +18 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/transformer_1B.json +22 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/transformer_340M.json +18 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/transformer_7B.json +21 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/generation_config.json +6 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/0/stderr.log +3 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/0/stdout.log +0 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/1/stderr.log +3 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/1/stdout.log +0 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/2/stderr.log +3 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/2/stdout.log +0 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/3/stderr.log +3 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/3/stdout.log +0 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/4/stderr.log +3 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/4/stdout.log +0 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/5/stderr.log +3 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/5/stdout.log +0 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/6/stderr.log +3 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/6/stdout.log +0 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/7/stderr.log +3 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/7/stdout.log +0 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/model.safetensors +3 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/special_tokens_map.json +23 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/tb/20250725-0038/events.out.tfevents.1753375106.TENCENT64.site.571145.0 +3 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/tokenizer.json +0 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/tokenizer_config.json +44 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/train-100B.sh +75 -0
- bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/train.sh +122 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/0/stderr.log filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/1/stderr.log filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/2/stderr.log filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/3/stderr.log filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/4/stderr.log filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/5/stderr.log filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/6/stderr.log filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/7/stderr.log filter=lfs diff=lfs merge=lfs -text
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/config.json
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"allow_neg_eigval": false,
|
| 3 |
+
"architectures": [
|
| 4 |
+
"GatedDeltaNetForCausalLM"
|
| 5 |
+
],
|
| 6 |
+
"attn": {
|
| 7 |
+
"block_counts": 16,
|
| 8 |
+
"block_size": 64,
|
| 9 |
+
"layers": [
|
| 10 |
+
5,
|
| 11 |
+
11,
|
| 12 |
+
17,
|
| 13 |
+
23
|
| 14 |
+
],
|
| 15 |
+
"num_heads": 32,
|
| 16 |
+
"num_kv_heads": 2,
|
| 17 |
+
"qkv_bias": false,
|
| 18 |
+
"rope_theta": 160000.0,
|
| 19 |
+
"type": "nsa",
|
| 20 |
+
"window_size": 512
|
| 21 |
+
},
|
| 22 |
+
"attn_mode": "chunk",
|
| 23 |
+
"bos_token_id": 1,
|
| 24 |
+
"conv_size": 4,
|
| 25 |
+
"eos_token_id": 2,
|
| 26 |
+
"expand_k": 1,
|
| 27 |
+
"expand_v": 1,
|
| 28 |
+
"fuse_cross_entropy": true,
|
| 29 |
+
"fuse_norm": true,
|
| 30 |
+
"fuse_swiglu": true,
|
| 31 |
+
"head_dim": 256,
|
| 32 |
+
"hidden_act": "swish",
|
| 33 |
+
"hidden_ratio": 4,
|
| 34 |
+
"hidden_size": 1024,
|
| 35 |
+
"initializer_range": 0.02,
|
| 36 |
+
"intermediate_size": null,
|
| 37 |
+
"max_position_embeddings": 8192,
|
| 38 |
+
"model_type": "gated_deltanet",
|
| 39 |
+
"norm_eps": 1e-06,
|
| 40 |
+
"norm_first": false,
|
| 41 |
+
"num_heads": 4,
|
| 42 |
+
"num_hidden_layers": 24,
|
| 43 |
+
"num_v_heads": null,
|
| 44 |
+
"qk_activation": "silu",
|
| 45 |
+
"qk_norm": "l2",
|
| 46 |
+
"tie_word_embeddings": false,
|
| 47 |
+
"torch_dtype": "bfloat16",
|
| 48 |
+
"transformers_version": "4.53.3",
|
| 49 |
+
"use_beta": true,
|
| 50 |
+
"use_cache": true,
|
| 51 |
+
"use_gate": true,
|
| 52 |
+
"use_l2warp": false,
|
| 53 |
+
"use_output_norm": true,
|
| 54 |
+
"use_short_conv": true,
|
| 55 |
+
"vocab_size": 32000
|
| 56 |
+
}
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/delta_net_1B.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attn": null,
|
| 3 |
+
"attn_mode": "chunk",
|
| 4 |
+
"bos_token_id": 1,
|
| 5 |
+
"conv_size": 4,
|
| 6 |
+
"eos_token_id": 2,
|
| 7 |
+
"expand_k": 1,
|
| 8 |
+
"expand_v": 1,
|
| 9 |
+
"fuse_cross_entropy": true,
|
| 10 |
+
"fuse_norm": true,
|
| 11 |
+
"hidden_act": "swish",
|
| 12 |
+
"hidden_ratio": 4,
|
| 13 |
+
"hidden_size": 2048,
|
| 14 |
+
"initializer_range": 0.02,
|
| 15 |
+
"intermediate_size": null,
|
| 16 |
+
"model_type": "delta_net",
|
| 17 |
+
"norm_eps": 1e-06,
|
| 18 |
+
"num_heads": 16,
|
| 19 |
+
"num_hidden_layers": 24,
|
| 20 |
+
"pad_token_id": 2,
|
| 21 |
+
"qk_activation": "silu",
|
| 22 |
+
"qk_norm": "l2",
|
| 23 |
+
"tie_word_embeddings": false,
|
| 24 |
+
"use_beta": true,
|
| 25 |
+
"use_cache": true,
|
| 26 |
+
"use_gate": false,
|
| 27 |
+
"use_output_norm": true,
|
| 28 |
+
"use_short_conv": true
|
| 29 |
+
}
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/delta_net_340M.json
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attn_mode": "chunk",
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"conv_size": 4,
|
| 5 |
+
"eos_token_id": 2,
|
| 6 |
+
"expand_k": 1,
|
| 7 |
+
"expand_v": 1,
|
| 8 |
+
"fuse_cross_entropy": true,
|
| 9 |
+
"hidden_act": "swish",
|
| 10 |
+
"hidden_ratio": 4,
|
| 11 |
+
"hidden_size": 1024,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"intermediate_size": null,
|
| 14 |
+
"model_type": "delta_net",
|
| 15 |
+
"norm_eps": 1e-06,
|
| 16 |
+
"num_heads": 8,
|
| 17 |
+
"num_hidden_layers": 24,
|
| 18 |
+
"qk_activation": "silu",
|
| 19 |
+
"qk_norm": "l2",
|
| 20 |
+
"tie_word_embeddings": false,
|
| 21 |
+
"use_beta": true,
|
| 22 |
+
"use_cache": true,
|
| 23 |
+
"use_gate": false,
|
| 24 |
+
"use_output_norm": true,
|
| 25 |
+
"use_short_conv": true
|
| 26 |
+
}
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/gated_deltanet_1B.json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attn_mode": "chunk",
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"conv_size": 4,
|
| 5 |
+
"eos_token_id": 2,
|
| 6 |
+
"expand_v": 2,
|
| 7 |
+
"fuse_cross_entropy": true,
|
| 8 |
+
"head_dim": 256,
|
| 9 |
+
"hidden_act": "swish",
|
| 10 |
+
"hidden_ratio": 4,
|
| 11 |
+
"hidden_size": 2048,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"intermediate_size": null,
|
| 14 |
+
"model_type": "gated_deltanet",
|
| 15 |
+
"norm_eps": 1e-06,
|
| 16 |
+
"num_heads": 6,
|
| 17 |
+
"num_hidden_layers": 21,
|
| 18 |
+
"tie_word_embeddings": false,
|
| 19 |
+
"use_cache": true,
|
| 20 |
+
"use_gate": true,
|
| 21 |
+
"use_short_conv": true
|
| 22 |
+
}
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/gated_deltanet_340M.json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attn_mode": "chunk",
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"conv_size": 4,
|
| 5 |
+
"eos_token_id": 2,
|
| 6 |
+
"expand_v": 2,
|
| 7 |
+
"fuse_cross_entropy": true,
|
| 8 |
+
"head_dim": 256,
|
| 9 |
+
"hidden_act": "swish",
|
| 10 |
+
"hidden_ratio": 4,
|
| 11 |
+
"hidden_size": 1024,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"intermediate_size": null,
|
| 14 |
+
"model_type": "gated_deltanet",
|
| 15 |
+
"norm_eps": 1e-06,
|
| 16 |
+
"num_heads": 6,
|
| 17 |
+
"num_hidden_layers": 21,
|
| 18 |
+
"tie_word_embeddings": false,
|
| 19 |
+
"use_cache": true,
|
| 20 |
+
"use_gate": true,
|
| 21 |
+
"use_short_conv": true
|
| 22 |
+
}
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/gdn_6_1_340M.json
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"GatedDeltaNetForCausalLM"
|
| 4 |
+
],
|
| 5 |
+
"attn": {
|
| 6 |
+
"layers": [
|
| 7 |
+
5,
|
| 8 |
+
11,
|
| 9 |
+
17,
|
| 10 |
+
23
|
| 11 |
+
],
|
| 12 |
+
"num_heads": 16,
|
| 13 |
+
"num_kv_heads": 8,
|
| 14 |
+
"qkv_bias": false,
|
| 15 |
+
"rope_theta": 160000.0,
|
| 16 |
+
"window_size": null
|
| 17 |
+
},
|
| 18 |
+
"attn_mode": "chunk",
|
| 19 |
+
"bos_token_id": 1,
|
| 20 |
+
"conv_size": 4,
|
| 21 |
+
"eos_token_id": 2,
|
| 22 |
+
"expand_k": 1,
|
| 23 |
+
"expand_v": 1,
|
| 24 |
+
"fuse_cross_entropy": true,
|
| 25 |
+
"fuse_norm": true,
|
| 26 |
+
"fuse_swiglu": true,
|
| 27 |
+
"head_dim": 256,
|
| 28 |
+
"hidden_act": "swish",
|
| 29 |
+
"hidden_ratio": 4,
|
| 30 |
+
"hidden_size": 1024,
|
| 31 |
+
"initializer_range": 0.02,
|
| 32 |
+
"intermediate_size": null,
|
| 33 |
+
"max_position_embeddings": 8192,
|
| 34 |
+
"model_type": "gated_deltanet",
|
| 35 |
+
"norm_eps": 1e-06,
|
| 36 |
+
"norm_first": false,
|
| 37 |
+
"num_heads": 4,
|
| 38 |
+
"num_hidden_layers": 24,
|
| 39 |
+
"qk_activation": "silu",
|
| 40 |
+
"qk_norm": "l2",
|
| 41 |
+
"tie_word_embeddings": false,
|
| 42 |
+
"torch_dtype": "float32",
|
| 43 |
+
"transformers_version": "4.51.3",
|
| 44 |
+
"use_beta": true,
|
| 45 |
+
"use_cache": true,
|
| 46 |
+
"use_gate": true,
|
| 47 |
+
"use_output_norm": true,
|
| 48 |
+
"use_short_conv": true,
|
| 49 |
+
"vocab_size": 32000
|
| 50 |
+
}
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/gdn_6_1_340M_bf16.json
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"GatedDeltaNetForCausalLM"
|
| 4 |
+
],
|
| 5 |
+
"attn": {
|
| 6 |
+
"layers": [
|
| 7 |
+
5,
|
| 8 |
+
11,
|
| 9 |
+
17,
|
| 10 |
+
23
|
| 11 |
+
],
|
| 12 |
+
"num_heads": 16,
|
| 13 |
+
"num_kv_heads": 8,
|
| 14 |
+
"qkv_bias": false,
|
| 15 |
+
"rope_theta": 160000.0,
|
| 16 |
+
"window_size": null
|
| 17 |
+
},
|
| 18 |
+
"attn_mode": "chunk",
|
| 19 |
+
"bos_token_id": 1,
|
| 20 |
+
"conv_size": 4,
|
| 21 |
+
"eos_token_id": 2,
|
| 22 |
+
"expand_k": 1,
|
| 23 |
+
"expand_v": 1,
|
| 24 |
+
"fuse_cross_entropy": true,
|
| 25 |
+
"fuse_norm": true,
|
| 26 |
+
"fuse_swiglu": true,
|
| 27 |
+
"head_dim": 256,
|
| 28 |
+
"hidden_act": "swish",
|
| 29 |
+
"hidden_ratio": 4,
|
| 30 |
+
"hidden_size": 1024,
|
| 31 |
+
"initializer_range": 0.02,
|
| 32 |
+
"intermediate_size": null,
|
| 33 |
+
"max_position_embeddings": 8192,
|
| 34 |
+
"model_type": "gated_deltanet",
|
| 35 |
+
"norm_eps": 1e-06,
|
| 36 |
+
"norm_first": false,
|
| 37 |
+
"num_heads": 4,
|
| 38 |
+
"num_hidden_layers": 24,
|
| 39 |
+
"qk_activation": "silu",
|
| 40 |
+
"qk_norm": "l2",
|
| 41 |
+
"tie_word_embeddings": false,
|
| 42 |
+
"torch_dtype": "bfloat16",
|
| 43 |
+
"transformers_version": "4.51.3",
|
| 44 |
+
"use_beta": true,
|
| 45 |
+
"use_cache": true,
|
| 46 |
+
"use_gate": true,
|
| 47 |
+
"use_output_norm": true,
|
| 48 |
+
"use_short_conv": true,
|
| 49 |
+
"vocab_size": 32000
|
| 50 |
+
}
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/gdn_6_nsa_1_340M.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"GatedDeltaNetForCausalLM"
|
| 4 |
+
],
|
| 5 |
+
"attn": {
|
| 6 |
+
"layers": [
|
| 7 |
+
5,
|
| 8 |
+
11,
|
| 9 |
+
17,
|
| 10 |
+
23
|
| 11 |
+
],
|
| 12 |
+
"num_heads": 32,
|
| 13 |
+
"num_kv_heads": 2,
|
| 14 |
+
"qkv_bias": false,
|
| 15 |
+
"rope_theta": 160000.0,
|
| 16 |
+
"type": "nsa",
|
| 17 |
+
"block_size": 64,
|
| 18 |
+
"block_counts": 16,
|
| 19 |
+
"window_size": 512
|
| 20 |
+
},
|
| 21 |
+
"attn_mode": "chunk",
|
| 22 |
+
"bos_token_id": 1,
|
| 23 |
+
"conv_size": 4,
|
| 24 |
+
"eos_token_id": 2,
|
| 25 |
+
"expand_k": 1,
|
| 26 |
+
"expand_v": 1,
|
| 27 |
+
"fuse_cross_entropy": true,
|
| 28 |
+
"fuse_norm": true,
|
| 29 |
+
"fuse_swiglu": true,
|
| 30 |
+
"head_dim": 256,
|
| 31 |
+
"hidden_act": "swish",
|
| 32 |
+
"hidden_ratio": 4,
|
| 33 |
+
"hidden_size": 1024,
|
| 34 |
+
"initializer_range": 0.02,
|
| 35 |
+
"intermediate_size": null,
|
| 36 |
+
"max_position_embeddings": 8192,
|
| 37 |
+
"model_type": "gated_deltanet",
|
| 38 |
+
"norm_eps": 1e-06,
|
| 39 |
+
"norm_first": false,
|
| 40 |
+
"num_heads": 4,
|
| 41 |
+
"num_hidden_layers": 24,
|
| 42 |
+
"qk_activation": "silu",
|
| 43 |
+
"qk_norm": "l2",
|
| 44 |
+
"tie_word_embeddings": false,
|
| 45 |
+
"torch_dtype": "bfloat16",
|
| 46 |
+
"transformers_version": "4.51.3",
|
| 47 |
+
"use_beta": true,
|
| 48 |
+
"use_cache": true,
|
| 49 |
+
"use_gate": true,
|
| 50 |
+
"use_output_norm": true,
|
| 51 |
+
"use_short_conv": true,
|
| 52 |
+
"vocab_size": 32000
|
| 53 |
+
}
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/gla_340M.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attn_mode": "chunk",
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"clamp_min": null,
|
| 5 |
+
"eos_token_id": 2,
|
| 6 |
+
"expand_k": 0.5,
|
| 7 |
+
"expand_v": 1,
|
| 8 |
+
"fuse_cross_entropy": true,
|
| 9 |
+
"fuse_norm": true,
|
| 10 |
+
"hidden_act": "swish",
|
| 11 |
+
"hidden_ratio": 4,
|
| 12 |
+
"hidden_size": 1024,
|
| 13 |
+
"initializer_range": 0.02,
|
| 14 |
+
"intermediate_size": null,
|
| 15 |
+
"model_type": "gla",
|
| 16 |
+
"num_heads": 4,
|
| 17 |
+
"num_hidden_layers": 24,
|
| 18 |
+
"norm_eps": 1e-06,
|
| 19 |
+
"tie_word_embeddings": false,
|
| 20 |
+
"use_cache": true,
|
| 21 |
+
"use_gk": true,
|
| 22 |
+
"use_gv": false,
|
| 23 |
+
"vocab_size": 32000
|
| 24 |
+
}
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/gla_7B.json
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attn": null,
|
| 3 |
+
"attn_mode": "chunk",
|
| 4 |
+
"bos_token_id": 1,
|
| 5 |
+
"eos_token_id": 2,
|
| 6 |
+
"expand_k": 0.5,
|
| 7 |
+
"expand_v": 1,
|
| 8 |
+
"fuse_cross_entropy": true,
|
| 9 |
+
"fuse_norm": true,
|
| 10 |
+
"hidden_act": "swish",
|
| 11 |
+
"hidden_ratio": 4,
|
| 12 |
+
"hidden_size": 4096,
|
| 13 |
+
"initializer_range": 0.02,
|
| 14 |
+
"intermediate_size": 11008,
|
| 15 |
+
"model_type": "gla",
|
| 16 |
+
"norm_eps": 1e-06,
|
| 17 |
+
"num_heads": 16,
|
| 18 |
+
"num_hidden_layers": 32,
|
| 19 |
+
"tie_word_embeddings": false,
|
| 20 |
+
"use_cache": true,
|
| 21 |
+
"use_gk": true,
|
| 22 |
+
"use_gv": false,
|
| 23 |
+
"use_output_gate": true,
|
| 24 |
+
"use_short_conv": false
|
| 25 |
+
}
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/gsa_340M.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 1,
|
| 3 |
+
"conv_size": 4,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"expand_k": 1,
|
| 6 |
+
"expand_v": 1,
|
| 7 |
+
"elementwise_affine": false,
|
| 8 |
+
"feature_map": "swish",
|
| 9 |
+
"fuse_cross_entropy": true,
|
| 10 |
+
"fuse_norm": true,
|
| 11 |
+
"gate_logit_normalizer": 4,
|
| 12 |
+
"hidden_act": "swish",
|
| 13 |
+
"hidden_ratio": 4,
|
| 14 |
+
"hidden_size": 1024,
|
| 15 |
+
"initializer_range": 0.02,
|
| 16 |
+
"intermediate_size": null,
|
| 17 |
+
"model_type": "gsa",
|
| 18 |
+
"num_heads": 4,
|
| 19 |
+
"num_hidden_layers": 24,
|
| 20 |
+
"num_slots": 64,
|
| 21 |
+
"norm_eps": 1e-06,
|
| 22 |
+
"share_conv_kernel": true,
|
| 23 |
+
"tie_word_embeddings": false,
|
| 24 |
+
"use_cache": true,
|
| 25 |
+
"use_norm": true,
|
| 26 |
+
"use_output_gate": true,
|
| 27 |
+
"use_rope": false,
|
| 28 |
+
"use_short_conv": false
|
| 29 |
+
}
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/hgrn2_340M.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attn_mode": "chunk",
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"expand_ratio": 128,
|
| 6 |
+
"fuse_cross_entropy": true,
|
| 7 |
+
"fuse_norm": true,
|
| 8 |
+
"hidden_act": "swish",
|
| 9 |
+
"hidden_ratio": 4,
|
| 10 |
+
"hidden_size": 1024,
|
| 11 |
+
"initializer_range": 0.02,
|
| 12 |
+
"intermediate_size": null,
|
| 13 |
+
"model_type": "hgrn2",
|
| 14 |
+
"num_heads": 8,
|
| 15 |
+
"num_hidden_layers": 24,
|
| 16 |
+
"norm_eps": 1e-06,
|
| 17 |
+
"tie_word_embeddings": false,
|
| 18 |
+
"use_cache": true,
|
| 19 |
+
"vocab_size": 32000
|
| 20 |
+
}
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/mamba2_1B.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 1,
|
| 3 |
+
"chunk_size": 256,
|
| 4 |
+
"conv_kernel": 4,
|
| 5 |
+
"eos_token_id": 2,
|
| 6 |
+
"expand": 2,
|
| 7 |
+
"fuse_cross_entropy": true,
|
| 8 |
+
"fuse_norm": true,
|
| 9 |
+
"head_dim": 64,
|
| 10 |
+
"hidden_act": "silu",
|
| 11 |
+
"hidden_size": 2048,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"norm_eps": 1e-05,
|
| 14 |
+
"model_type": "mamba2",
|
| 15 |
+
"n_groups": 1,
|
| 16 |
+
"num_hidden_layers": 48,
|
| 17 |
+
"pad_token_id": 0,
|
| 18 |
+
"rescale_prenorm_residual": true,
|
| 19 |
+
"residual_in_fp32": true,
|
| 20 |
+
"rms_norm": true,
|
| 21 |
+
"state_size": 128,
|
| 22 |
+
"tie_word_embeddings": false,
|
| 23 |
+
"time_step_floor": 0.0001,
|
| 24 |
+
"time_step_max": 0.1,
|
| 25 |
+
"time_step_min": 0.001,
|
| 26 |
+
"time_step_rank": 128,
|
| 27 |
+
"transformers_version": "4.50.1",
|
| 28 |
+
"use_bias": false,
|
| 29 |
+
"use_cache": true,
|
| 30 |
+
"use_conv_bias": true,
|
| 31 |
+
"vocab_size": 32000
|
| 32 |
+
}
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/mamba2_340M.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 1,
|
| 3 |
+
"chunk_size": 256,
|
| 4 |
+
"conv_kernel": 4,
|
| 5 |
+
"eos_token_id": 2,
|
| 6 |
+
"expand": 2,
|
| 7 |
+
"fuse_cross_entropy": true,
|
| 8 |
+
"fuse_norm": true,
|
| 9 |
+
"head_dim": 64,
|
| 10 |
+
"hidden_act": "silu",
|
| 11 |
+
"hidden_size": 1024,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"norm_eps": 1e-05,
|
| 14 |
+
"model_type": "mamba2",
|
| 15 |
+
"n_groups": 1,
|
| 16 |
+
"num_hidden_layers": 48,
|
| 17 |
+
"pad_token_id": 0,
|
| 18 |
+
"rescale_prenorm_residual": true,
|
| 19 |
+
"residual_in_fp32": true,
|
| 20 |
+
"rms_norm": true,
|
| 21 |
+
"state_size": 128,
|
| 22 |
+
"tie_word_embeddings": false,
|
| 23 |
+
"time_step_floor": 0.0001,
|
| 24 |
+
"time_step_max": 0.1,
|
| 25 |
+
"time_step_min": 0.001,
|
| 26 |
+
"time_step_rank": 128,
|
| 27 |
+
"transformers_version": "4.50.1",
|
| 28 |
+
"use_bias": false,
|
| 29 |
+
"use_cache": true,
|
| 30 |
+
"use_conv_bias": true,
|
| 31 |
+
"vocab_size": 32000
|
| 32 |
+
}
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/mamba2_6_1_340M.json
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"Mamba2ForCausalLM"
|
| 4 |
+
],
|
| 5 |
+
"attn": {
|
| 6 |
+
"layers": [
|
| 7 |
+
5,
|
| 8 |
+
11,
|
| 9 |
+
17,
|
| 10 |
+
23
|
| 11 |
+
],
|
| 12 |
+
"num_heads": 16,
|
| 13 |
+
"num_kv_heads": 8,
|
| 14 |
+
"qkv_bias": false,
|
| 15 |
+
"rope_theta": 160000.0,
|
| 16 |
+
"window_size": null
|
| 17 |
+
},
|
| 18 |
+
"attn_mode": "chunk",
|
| 19 |
+
"bos_token_id": 1,
|
| 20 |
+
"chunk_size": 256,
|
| 21 |
+
"conv_kernel": 4,
|
| 22 |
+
"eos_token_id": 2,
|
| 23 |
+
"expand": 2,
|
| 24 |
+
"fuse_cross_entropy": true,
|
| 25 |
+
"fuse_norm": true,
|
| 26 |
+
"fuse_swiglu": true,
|
| 27 |
+
"head_dim": 64,
|
| 28 |
+
"hidden_act": "silu",
|
| 29 |
+
"hidden_size": 1024,
|
| 30 |
+
"initializer_range": 0.02,
|
| 31 |
+
"norm_eps": 1e-05,
|
| 32 |
+
"model_type": "mamba2",
|
| 33 |
+
"n_groups": 1,
|
| 34 |
+
"num_hidden_layers": 48,
|
| 35 |
+
"pad_token_id": 0,
|
| 36 |
+
"rescale_prenorm_residual": true,
|
| 37 |
+
"residual_in_fp32": true,
|
| 38 |
+
"rms_norm": true,
|
| 39 |
+
"state_size": 128,
|
| 40 |
+
"tie_word_embeddings": false,
|
| 41 |
+
"time_step_floor": 0.0001,
|
| 42 |
+
"time_step_max": 0.1,
|
| 43 |
+
"time_step_min": 0.001,
|
| 44 |
+
"time_step_rank": 128,
|
| 45 |
+
"transformers_version": "4.50.1",
|
| 46 |
+
"use_bias": false,
|
| 47 |
+
"use_cache": true,
|
| 48 |
+
"use_conv_bias": true,
|
| 49 |
+
"vocab_size": 32000
|
| 50 |
+
}
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/mamba_1B.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 1,
|
| 3 |
+
"conv_kernel": 4,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"expand": 2,
|
| 6 |
+
"fuse_cross_entropy": true,
|
| 7 |
+
"fuse_norm": true,
|
| 8 |
+
"hidden_act": "silu",
|
| 9 |
+
"hidden_size": 2048,
|
| 10 |
+
"initializer_range": 0.02,
|
| 11 |
+
"model_type": "mamba",
|
| 12 |
+
"norm_eps": 1e-05,
|
| 13 |
+
"num_hidden_layers": 48,
|
| 14 |
+
"pad_token_id": 0,
|
| 15 |
+
"rescale_prenorm_residual": false,
|
| 16 |
+
"residual_in_fp32": false,
|
| 17 |
+
"state_size": 16,
|
| 18 |
+
"tie_word_embeddings": false,
|
| 19 |
+
"time_step_floor": 0.0001,
|
| 20 |
+
"time_step_init_scheme": "random",
|
| 21 |
+
"time_step_max": 0.1,
|
| 22 |
+
"time_step_min": 0.001,
|
| 23 |
+
"time_step_rank": 128,
|
| 24 |
+
"time_step_scale": 1.0,
|
| 25 |
+
"transformers_version": "4.50.1",
|
| 26 |
+
"use_bias": false,
|
| 27 |
+
"use_cache": true,
|
| 28 |
+
"use_conv_bias": true,
|
| 29 |
+
"vocab_size": 32000
|
| 30 |
+
}
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/mamba_340M.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 1,
|
| 3 |
+
"conv_kernel": 4,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"expand": 2,
|
| 6 |
+
"fuse_cross_entropy": true,
|
| 7 |
+
"fuse_norm": true,
|
| 8 |
+
"hidden_act": "silu",
|
| 9 |
+
"hidden_size": 1024,
|
| 10 |
+
"initializer_range": 0.02,
|
| 11 |
+
"model_type": "mamba",
|
| 12 |
+
"norm_eps": 1e-05,
|
| 13 |
+
"num_hidden_layers": 48,
|
| 14 |
+
"pad_token_id": 0,
|
| 15 |
+
"rescale_prenorm_residual": false,
|
| 16 |
+
"residual_in_fp32": false,
|
| 17 |
+
"state_size": 16,
|
| 18 |
+
"tie_word_embeddings": false,
|
| 19 |
+
"time_step_floor": 0.0001,
|
| 20 |
+
"time_step_init_scheme": "random",
|
| 21 |
+
"time_step_max": 0.1,
|
| 22 |
+
"time_step_min": 0.001,
|
| 23 |
+
"time_step_rank": 128,
|
| 24 |
+
"time_step_scale": 1.0,
|
| 25 |
+
"transformers_version": "4.50.1",
|
| 26 |
+
"use_bias": false,
|
| 27 |
+
"use_cache": true,
|
| 28 |
+
"use_conv_bias": true,
|
| 29 |
+
"vocab_size": 32000
|
| 30 |
+
}
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/samba_1B.json
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attn": {
|
| 3 |
+
"layers": [
|
| 4 |
+
1,
|
| 5 |
+
3,
|
| 6 |
+
5,
|
| 7 |
+
7,
|
| 8 |
+
9,
|
| 9 |
+
11,
|
| 10 |
+
13,
|
| 11 |
+
15,
|
| 12 |
+
17
|
| 13 |
+
],
|
| 14 |
+
"num_heads": 18,
|
| 15 |
+
"num_kv_heads": 18,
|
| 16 |
+
"qkv_bias": false,
|
| 17 |
+
"rope_theta": 10000.0,
|
| 18 |
+
"window_size": 2048
|
| 19 |
+
},
|
| 20 |
+
"bos_token_id": 1,
|
| 21 |
+
"conv_kernel": 4,
|
| 22 |
+
"eos_token_id": 2,
|
| 23 |
+
"expand": 2,
|
| 24 |
+
"fuse_cross_entropy": true,
|
| 25 |
+
"fuse_norm": true,
|
| 26 |
+
"fuse_swiglu": true,
|
| 27 |
+
"hidden_act": "swish",
|
| 28 |
+
"hidden_ratio": 4,
|
| 29 |
+
"hidden_size": 2304,
|
| 30 |
+
"initializer_range": 0.02,
|
| 31 |
+
"intermediate_size": 4608,
|
| 32 |
+
"max_position_embeddings": 2048,
|
| 33 |
+
"model_type": "samba",
|
| 34 |
+
"norm_eps": 1e-05,
|
| 35 |
+
"num_hidden_layers": 18,
|
| 36 |
+
"pad_token_id": 0,
|
| 37 |
+
"rescale_prenorm_residual": false,
|
| 38 |
+
"residual_in_fp32": false,
|
| 39 |
+
"state_size": 16,
|
| 40 |
+
"tie_word_embeddings": false,
|
| 41 |
+
"time_step_floor": 0.0001,
|
| 42 |
+
"time_step_init_scheme": "random",
|
| 43 |
+
"time_step_max": 0.1,
|
| 44 |
+
"time_step_min": 0.001,
|
| 45 |
+
"time_step_rank": 144,
|
| 46 |
+
"time_step_scale": 1.0,
|
| 47 |
+
"transformers_version": "4.50.1",
|
| 48 |
+
"use_bias": false,
|
| 49 |
+
"use_cache": true,
|
| 50 |
+
"use_conv_bias": true,
|
| 51 |
+
"vocab_size": 32000
|
| 52 |
+
}
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/sba_340m.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attention_bias": false,
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"fuse_cross_entropy": true,
|
| 6 |
+
"fuse_norm": true,
|
| 7 |
+
"hidden_act": "swish",
|
| 8 |
+
"hidden_size": 1024,
|
| 9 |
+
"initializer_range": 0.006,
|
| 10 |
+
"max_position_embeddings": 8192,
|
| 11 |
+
"model_type": "sba",
|
| 12 |
+
"num_heads": 16,
|
| 13 |
+
"num_hidden_layers": 24,
|
| 14 |
+
"norm_eps": 1e-06,
|
| 15 |
+
"tie_word_embeddings": false,
|
| 16 |
+
"use_cache": true,
|
| 17 |
+
"vocab_size": 32000
|
| 18 |
+
}
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/transformer_1B.json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 1,
|
| 3 |
+
"elementwise_affine": true,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"fuse_cross_entropy": true,
|
| 6 |
+
"fuse_norm": true,
|
| 7 |
+
"fuse_swiglu": true,
|
| 8 |
+
"hidden_act": "swish",
|
| 9 |
+
"hidden_ratio": 4,
|
| 10 |
+
"hidden_size": 2048,
|
| 11 |
+
"initializer_range": 0.02,
|
| 12 |
+
"intermediate_size": null,
|
| 13 |
+
"max_position_embeddings": 8192,
|
| 14 |
+
"model_type": "transformer",
|
| 15 |
+
"norm_eps": 1e-06,
|
| 16 |
+
"num_heads": 32,
|
| 17 |
+
"num_hidden_layers": 24,
|
| 18 |
+
"num_kv_heads": null,
|
| 19 |
+
"pad_token_id": 2,
|
| 20 |
+
"rope_theta": 10000.0,
|
| 21 |
+
"tie_word_embeddings": false
|
| 22 |
+
}
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/transformer_340M.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attention_bias": false,
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"fuse_cross_entropy": true,
|
| 6 |
+
"fuse_norm": true,
|
| 7 |
+
"hidden_act": "swish",
|
| 8 |
+
"hidden_size": 1024,
|
| 9 |
+
"initializer_range": 0.02,
|
| 10 |
+
"max_position_embeddings": 8192,
|
| 11 |
+
"model_type": "transformer",
|
| 12 |
+
"num_heads": 16,
|
| 13 |
+
"num_hidden_layers": 24,
|
| 14 |
+
"norm_eps": 1e-06,
|
| 15 |
+
"tie_word_embeddings": false,
|
| 16 |
+
"use_cache": true,
|
| 17 |
+
"vocab_size": 32000
|
| 18 |
+
}
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/configs/transformer_7B.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attention_bias": false,
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"fuse_cross_entropy": true,
|
| 6 |
+
"fuse_norm": true,
|
| 7 |
+
"hidden_act": "swish",
|
| 8 |
+
"hidden_ratio": 4,
|
| 9 |
+
"hidden_size": 4096,
|
| 10 |
+
"initializer_range": 0.02,
|
| 11 |
+
"intermediate_size": 14336,
|
| 12 |
+
"model_type": "transformer",
|
| 13 |
+
"norm_eps": 1e-06,
|
| 14 |
+
"num_heads": 32,
|
| 15 |
+
"num_hidden_layers": 32,
|
| 16 |
+
"num_kv_heads": 8,
|
| 17 |
+
"rope_theta": 10000.0,
|
| 18 |
+
"tie_word_embeddings": false,
|
| 19 |
+
"use_cache": true,
|
| 20 |
+
"window_size": null
|
| 21 |
+
}
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/generation_config.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"transformers_version": "4.53.3"
|
| 6 |
+
}
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/0/stderr.log
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:06b2f6330ddf184c1c80c2f59f612121485c80ebbf69dcbc11879129b304e1ab
|
| 3 |
+
size 29157733
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/0/stdout.log
ADDED
|
File without changes
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/1/stderr.log
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:58f843f2ccc558b3d0b29c636c5562d3582d245b587a08f8bf41a592ef855330
|
| 3 |
+
size 29484305
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/1/stdout.log
ADDED
|
File without changes
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/2/stderr.log
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b917eb02d448e7ecfa235dda14a1ac7d1457502bba5edc10055333c9636abf6f
|
| 3 |
+
size 29204398
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/2/stdout.log
ADDED
|
File without changes
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/3/stderr.log
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a06ef22cf0fd8f6d4e34bd977a7c695ee9947baf30b2ffb92482662a16ee30bb
|
| 3 |
+
size 29158112
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/3/stdout.log
ADDED
|
File without changes
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/4/stderr.log
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4c226e9e2381b37eec7dcaa8ba32aee339915d9facfe20807df9d5ef0f4267f1
|
| 3 |
+
size 29157286
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/4/stdout.log
ADDED
|
File without changes
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/5/stderr.log
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d9870b9cb0ec740b2aa0d486d3b886cef15acca589f53719877561c98102205d
|
| 3 |
+
size 29157286
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/5/stdout.log
ADDED
|
File without changes
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/6/stderr.log
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c2020c549f94418bf1777f49033d7403593283de010efd55993d44aacb92b0be
|
| 3 |
+
size 29157287
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/6/stdout.log
ADDED
|
File without changes
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/7/stderr.log
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1a0e84a4f9a44ad4d6caf21aebab0cb8f94216f2d0f7f43998c6529d6498323c
|
| 3 |
+
size 29157280
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/7/stdout.log
ADDED
|
File without changes
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d377a720ee61c482b9f39965350b0e5a2b14207e354a9186ee50f446f6c1e83f
|
| 3 |
+
size 793434792
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/special_tokens_map.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "</s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"unk_token": {
|
| 17 |
+
"content": "<unk>",
|
| 18 |
+
"lstrip": false,
|
| 19 |
+
"normalized": false,
|
| 20 |
+
"rstrip": false,
|
| 21 |
+
"single_word": false
|
| 22 |
+
}
|
| 23 |
+
}
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/tb/20250725-0038/events.out.tfevents.1753375106.TENCENT64.site.571145.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f2ff7f8ae724e9fec53bc77745a35b7a570bc28e95f322bce14595aa109b6e16
|
| 3 |
+
size 97469568
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/tokenizer_config.json
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": true,
|
| 3 |
+
"add_eos_token": false,
|
| 4 |
+
"add_prefix_space": null,
|
| 5 |
+
"added_tokens_decoder": {
|
| 6 |
+
"0": {
|
| 7 |
+
"content": "<unk>",
|
| 8 |
+
"lstrip": false,
|
| 9 |
+
"normalized": false,
|
| 10 |
+
"rstrip": false,
|
| 11 |
+
"single_word": false,
|
| 12 |
+
"special": true
|
| 13 |
+
},
|
| 14 |
+
"1": {
|
| 15 |
+
"content": "<s>",
|
| 16 |
+
"lstrip": false,
|
| 17 |
+
"normalized": false,
|
| 18 |
+
"rstrip": false,
|
| 19 |
+
"single_word": false,
|
| 20 |
+
"special": true
|
| 21 |
+
},
|
| 22 |
+
"2": {
|
| 23 |
+
"content": "</s>",
|
| 24 |
+
"lstrip": false,
|
| 25 |
+
"normalized": false,
|
| 26 |
+
"rstrip": false,
|
| 27 |
+
"single_word": false,
|
| 28 |
+
"special": true
|
| 29 |
+
}
|
| 30 |
+
},
|
| 31 |
+
"additional_special_tokens": [],
|
| 32 |
+
"bos_token": "<s>",
|
| 33 |
+
"clean_up_tokenization_spaces": false,
|
| 34 |
+
"eos_token": "</s>",
|
| 35 |
+
"extra_special_tokens": {},
|
| 36 |
+
"legacy": true,
|
| 37 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 38 |
+
"pad_token": null,
|
| 39 |
+
"sp_model_kwargs": {},
|
| 40 |
+
"spaces_between_special_tokens": false,
|
| 41 |
+
"tokenizer_class": "LlamaTokenizerFast",
|
| 42 |
+
"unk_token": "<unk>",
|
| 43 |
+
"use_default_system_prompt": false
|
| 44 |
+
}
|
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/train-100B.sh
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/usr/bin/env bash
# Launch a 100B-token pretraining run by delegating to train.sh with a fixed
# hyper-parameter set. Usage: bash train-100B.sh [CONFIG_JSON_BASENAME]
set -eo pipefail  # not -u: venv activate scripts commonly read unset vars

FLAME_PATH=/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame
DATASET_ROOT=$FLAME_PATH/dataset
TOKENIZER=$FLAME_PATH/tokenizer

cd "$FLAME_PATH"
source .venv/bin/activate

# =========== train config ===========
CONFIG=${1:-transformer_340M.json}  # config file name under $FLAME_PATH/configs/
SEQ_LEN=65536
WARMUP_STEPS=100
STEPS=95366
LR=3e-4
BATCH_SIZE=1
GAS=2                               # gradient accumulation steps
DECAY_TYPE=linear
DECAY_RATIO=1
# Exported so the child `bash train.sh` process actually sees them; as plain
# shell variables they were never inherited and train.sh fell back to its
# own (coincidentally equal) defaults.
export NNODE=1
export NGPU=8
export LOG_RANK=0
EXTRA_ARGS="--training.mixed_precision_param bfloat16"
EXTRA_NAME="bf16"
# ====================================

# train.sh parses the model config with jq; install it if missing.
if ! command -v jq &> /dev/null; then
    echo "jq could not be found, installing it..."
    sudo yum install -y jq
fi

export WANDB_ERROR_REPORTING=False

# Use EXTRA_NAME as a "<tag>-" prefix of the experiment name when non-empty.
if [ -n "$EXTRA_NAME" ]; then
    EXTRA_NAME="${EXTRA_NAME}-"
fi

# Strip the .json extension from the config name. The previous
# `sed 's/\.config//'` never matched (configs end in .json), which left
# ".json" embedded in every experiment/dump folder name.
EXP_NAME=${EXTRA_NAME}$(basename "$CONFIG" .json)-ctx${SEQ_LEN}-steps${STEPS}-lr${LR}-decay_type${DECAY_TYPE}-decay_ratio${DECAY_RATIO}-bs${BATCH_SIZE}-nn${NNODE}-gas${GAS}

# NB: ${EXTRA_ARGS} is intentionally unquoted so it word-splits into
# separate CLI arguments.
bash train.sh \
  --job.config_file flame/models/fla.toml \
  --job.dump_folder "$FLAME_PATH/exp/$EXP_NAME" \
  --model.config "$FLAME_PATH/configs/$CONFIG" \
  --model.tokenizer_path "$TOKENIZER" \
  --optimizer.name AdamW \
  --optimizer.eps 1e-8 \
  --optimizer.lr "$LR" \
  --lr_scheduler.warmup_steps "$WARMUP_STEPS" \
  --lr_scheduler.lr_min 0.01 \
  --lr_scheduler.decay_type "$DECAY_TYPE" \
  --lr_scheduler.decay_ratio "$DECAY_RATIO" \
  --training.batch_size "$BATCH_SIZE" \
  --training.seq_len "$SEQ_LEN" \
  --training.context_len "$SEQ_LEN" \
  --training.gradient_accumulation_steps "$GAS" \
  --training.steps "$STEPS" \
  --training.max_norm 1.0 \
  --training.skip_nan_inf \
  --training.dataset "$DATASET_ROOT/fineweb-edu-sample,$DATASET_ROOT/small_repos_20B_sample_merged,$DATASET_ROOT/megamath-web-pro" \
  --training.data_probs 0.55,0.3,0.15 \
  --training.dataset_split train,train,train \
  --training.dataset_name default,default,default \
  --training.streaming \
  --training.num_workers 32 \
  --training.prefetch_factor 2 \
  --training.seed 42 \
  --training.compile \
  --checkpoint.interval 8192 \
  --checkpoint.load_step -1 \
  --checkpoint.keep_latest_k 100 \
  --checkpoint.export_dtype bfloat16 \
  --metrics.log_freq 1 \
  --metrics.enable_tensorboard \
  ${EXTRA_ARGS}
bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/train.sh
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/usr/bin/bash
# Generic torchrun launcher for flame training.
#
# All CLI arguments are forwarded verbatim to `flame.train`; a few of them
# (--job.dump_folder, --training.steps, --model.config,
# --model.tokenizer_path) are additionally parsed out here for source
# snapshotting, W&B naming, and post-run DCP->HF checkpoint conversion.

# Flat string copy of the arguments, used ONLY for the grep-based extraction
# below. The real arguments are forwarded as "$@" so word boundaries (e.g.
# paths containing spaces) are preserved — joining and re-splitting $params
# would corrupt them.
params=""
if [ $# -ne 0 ]; then
    params="$*"
fi

# use envs as local params for convenience
# e.g.
# NNODE=1 NGPU=8 LOG_RANK=0 ./train.sh
NNODE=${NNODE:-"1"}
NGPU=${NGPU:-"8"}
# Must be exported — a plain (non-exported) assignment is invisible to
# torchrun and its worker processes, making it a silent no-op. Honor a
# caller-provided value instead of always clobbering it.
export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}
LOG_RANK=${LOG_RANK:-0}

if [[ -z "${MASTER_ADDR}" ]]; then
  export MASTER_ADDR="localhost"
fi
if [[ -z "${MASTER_PORT}" ]]; then
  # Port 0 lets the c10d rendezvous backend pick a free port
  # (fine for the single-node default).
  export MASTER_PORT="0"
fi

: '
Usage:

bash train.sh -h

Training a 340M model:

NNODE=1 NGPU=8 LOG_RANK=0 bash train.sh \
  --job.config_file flame/models/fla.toml \
  --job.dump_folder exp/transformer-340M-10B/batch32.seqlen2048.warmup1024.update1.steps20480.lr3e-4 \
  --model.config configs/transformer_340M.json \
  --model.tokenizer_path fla-hub/transformer-1.3B-100B \
  --optimizer.name AdamW \
  --optimizer.eps 1e-15 \
  --optimizer.lr 3e-4 \
  --lr_scheduler.warmup_steps 1024 \
  --lr_scheduler.lr_min 0.1 \
  --lr_scheduler.decay_type cosine \
  --training.batch_size 32 \
  --training.seq_len 2048 \
  --training.gradient_accumulation_steps 1 \
  --training.steps 20480 \
  --training.max_norm 1.0 \
  --training.skip_nan_inf \
  --training.dataset HuggingFaceFW/fineweb-edu \
  --training.dataset_name default \
  --training.dataset_split train \
  --training.streaming \
  --training.num_workers 32 \
  --training.prefetch_factor 2 \
  --training.seed 42 \
  --training.compile \
  --training.tensor_parallel_degree 1 \
  --training.disable_loss_parallel \
  --checkpoint.interval 2048 \
  --checkpoint.load_step -1 \
  --metrics.log_freq 1
'

echo "Launching training..."

set -x
# Pull bookkeeping values back out of the forwarded argument list.
path=$(grep -oP '(?<=--job.dump_folder )[^ ]+' <<< "$params")
steps=$(grep -oP '(?<=--training.steps )[^ ]+' <<< "$params")
config=$(grep -oP '(?<=--model.config )[^ ]+' <<< "$params")
tokenizer=$(grep -oP '(?<=--model.tokenizer_path )[^ ]+' <<< "$params")
# Guard: an empty $path would make the snapshot below copy into the CWD.
if [ -z "$path" ]; then
  echo "error: --job.dump_folder is required" >&2
  exit 1
fi
# Read model_type from the config (importing fla registers its model
# classes with transformers AutoConfig).
model=$(
  python -c "import fla, sys; from transformers import AutoConfig; print(AutoConfig.from_pretrained(sys.argv[1]).to_json_string())" "$config" | jq -r '.model_type'
)

# Snapshot launch scripts and source trees into the dump folder so the run
# is reproducible from its own artifacts.
mkdir -p "$path"
cp -- *.sh "$path"
cp -r configs "$path"
cp -r flame "$path"
cp -r 3rdparty/flash-linear-attention/fla "$path"
cp -r 3rdparty/torchtitan/torchtitan "$path"

# for offline systems
# export TRANSFORMERS_OFFLINE=1
# export HF_DATASETS_OFFLINE=1
# export HF_HUB_OFFLINE=1

# A caller may pin $date to reuse a W&B run id across restarts.
date=${date:-$(date +%Y%m%d%H%M)}
RUN_NAME="$model-$(basename "$path")"
RUN_ID="$RUN_NAME-$date"

export WANDB_RESUME=allow
if [[ -z "${WANDB_PROJECT}" ]]; then
  export WANDB_PROJECT="fla"
fi
if [[ -z "${WANDB_NAME}" ]]; then
  export WANDB_NAME="$RUN_NAME"
fi
if [[ -z "${WANDB_RUN_ID}" ]]; then
  export WANDB_RUN_ID="$RUN_ID"
fi

# Forward the original argv ("$@") untouched to flame.train.
PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True" \
torchrun --nnodes="${NNODE}" \
  --nproc_per_node="${NGPU}" \
  --rdzv_backend c10d \
  --rdzv_endpoint "${MASTER_ADDR}:${MASTER_PORT}" \
  --local-ranks-filter "${LOG_RANK}" \
  --role rank \
  --tee 3 \
  --log-dir "$path/logs" \
  -m flame.train \
  "$@"

echo "TRAINING DONE!"
echo "Converting the DCP checkpoints to HF format..."

python -m flame.utils.convert_dcp_to_hf \
  --path "$path" \
  --step "$steps" \
  --config "$config" \
  --tokenizer "$tokenizer"

echo "RUNNING DONE!"