IvanHU committed on
Commit
5a7e3bf
·
verified ·
1 Parent(s): afb4f1a

Upload folder using huggingface_hub

Browse files
Files changed (45) hide show
  1. .gitattributes +8 -0
  2. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/8k-100.sh +65 -0
  3. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/config.json +53 -0
  4. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/delta_net_1B.json +29 -0
  5. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/delta_net_340M.json +26 -0
  6. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gated_deltanet_1B.json +22 -0
  7. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gated_deltanet_340M.json +22 -0
  8. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gdn_6_1_340M.json +50 -0
  9. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gla_340M.json +24 -0
  10. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gla_7B.json +25 -0
  11. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gsa_340M.json +29 -0
  12. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/hgrn2_340M.json +20 -0
  13. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/mamba2_1B.json +32 -0
  14. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/mamba2_340M.json +32 -0
  15. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/mamba_1B.json +30 -0
  16. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/mamba_340M.json +30 -0
  17. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/samba_1B.json +52 -0
  18. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/sba_340m.json +18 -0
  19. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/transformer_1B.json +22 -0
  20. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/transformer_340M.json +18 -0
  21. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/transformer_7B.json +21 -0
  22. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/generation_config.json +6 -0
  23. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/0/stderr.log +3 -0
  24. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/0/stdout.log +0 -0
  25. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/1/stderr.log +3 -0
  26. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/1/stdout.log +0 -0
  27. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/2/stderr.log +3 -0
  28. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/2/stdout.log +0 -0
  29. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/3/stderr.log +3 -0
  30. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/3/stdout.log +0 -0
  31. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/4/stderr.log +3 -0
  32. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/4/stdout.log +0 -0
  33. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/5/stderr.log +3 -0
  34. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/5/stdout.log +0 -0
  35. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/6/stderr.log +3 -0
  36. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/6/stdout.log +0 -0
  37. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/7/stderr.log +3 -0
  38. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/7/stdout.log +0 -0
  39. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/model.safetensors +3 -0
  40. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/model_size=391m +0 -0
  41. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/special_tokens_map.json +23 -0
  42. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/tb/20250723-1049/events.out.tfevents.1753238968.TENCENT64.site.2520914.0 +3 -0
  43. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/tokenizer.json +0 -0
  44. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/tokenizer_config.json +44 -0
  45. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/train.sh +121 -0
.gitattributes CHANGED
@@ -49,3 +49,11 @@ bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_rati
49
  bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_ij1w4wht/attempt_0/5/stderr.log filter=lfs diff=lfs merge=lfs -text
50
  bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_ij1w4wht/attempt_0/6/stderr.log filter=lfs diff=lfs merge=lfs -text
51
  bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_ij1w4wht/attempt_0/7/stderr.log filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
49
  bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_ij1w4wht/attempt_0/5/stderr.log filter=lfs diff=lfs merge=lfs -text
50
  bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_ij1w4wht/attempt_0/6/stderr.log filter=lfs diff=lfs merge=lfs -text
51
  bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_ij1w4wht/attempt_0/7/stderr.log filter=lfs diff=lfs merge=lfs -text
52
+ gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/0/stderr.log filter=lfs diff=lfs merge=lfs -text
53
+ gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/1/stderr.log filter=lfs diff=lfs merge=lfs -text
54
+ gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/2/stderr.log filter=lfs diff=lfs merge=lfs -text
55
+ gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/3/stderr.log filter=lfs diff=lfs merge=lfs -text
56
+ gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/4/stderr.log filter=lfs diff=lfs merge=lfs -text
57
+ gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/5/stderr.log filter=lfs diff=lfs merge=lfs -text
58
+ gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/6/stderr.log filter=lfs diff=lfs merge=lfs -text
59
+ gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/7/stderr.log filter=lfs diff=lfs merge=lfs -text
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/8k-100.sh ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FLAME_PATH=/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame
2
+ DATASET_ROOT=/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset
3
+ TOKENIZER=/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer
4
+
5
+ cd $FLAME_PATH
6
+ source .venv/bin/activate
7
+
8
+ # =========== train config ===========
9
+ CONFIG=${1:-transformer_340M.json}
10
+ SEQ_LEN=8192
11
+ WARMUP_STEPS=100
12
+ STEPS=95366
13
+ LR=3e-4
14
+ BATCH_SIZE=8
15
+ GAS=2
16
+ DECAY_TYPE=linear
17
+ DECAY_RATIO=1
18
+ NNODE=1
19
+ NGPU=8
20
+ LOG_RANK=0
21
+ # ====================================
22
+
23
+ # if jq command is not found, install it
24
+ if ! command -v jq &> /dev/null; then
25
+ echo "jq could not be found, installing it..."
26
+ sudo yum install -y jq
27
+ fi
28
+
29
+ EXP_NAME=$(basename $CONFIG | sed 's/\.config//')-ctx${SEQ_LEN}-steps${STEPS}-lr${LR}-decay_type${DECAY_TYPE}-decay_ratio${DECAY_RATIO}-bs${BATCH_SIZE}-nn${NNODE}-gas${GAS}
30
+
31
+ bash train.sh \
32
+ --job.config_file flame/models/fla.toml \
33
+ --job.dump_folder $FLAME_PATH/exp/$EXP_NAME \
34
+ --model.config $FLAME_PATH/configs/$CONFIG \
35
+ --model.tokenizer_path $TOKENIZER \
36
+ --optimizer.name AdamW \
37
+ --optimizer.eps 1e-8 \
38
+ --optimizer.lr $LR \
39
+ --lr_scheduler.warmup_steps $WARMUP_STEPS \
40
+ --lr_scheduler.lr_min 0.01 \
41
+ --lr_scheduler.decay_type $DECAY_TYPE \
42
+ --lr_scheduler.decay_ratio $DECAY_RATIO \
43
+ --training.batch_size $BATCH_SIZE \
44
+ --training.seq_len $SEQ_LEN \
45
+ --training.context_len $SEQ_LEN \
46
+ --training.gradient_accumulation_steps $GAS \
47
+ --training.steps $STEPS \
48
+ --training.max_norm 1.0 \
49
+ --training.skip_nan_inf \
50
+ --training.dataset $DATASET_ROOT/fineweb-edu-sample,$DATASET_ROOT/small_repos_20B_sample_merged,$DATASET_ROOT/megamath-web-pro \
51
+ --training.data_probs 0.55,0.3,0.15 \
52
+ --training.dataset_split train,train,train \
53
+ --training.dataset_name default,default,default \
54
+ --training.streaming \
55
+ --training.num_workers 32 \
56
+ --training.prefetch_factor 2 \
57
+ --training.seed 42 \
58
+ --training.compile \
59
+ --checkpoint.interval 8192 \
60
+ --checkpoint.load_step -1 \
61
+ --checkpoint.keep_latest_k 100 \
62
+ --metrics.log_freq 1 \
63
+ --metrics.enable_tensorboard \
64
+ --training.streaming
65
+
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/config.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "allow_neg_eigval": false,
3
+ "architectures": [
4
+ "GatedDeltaNetForCausalLM"
5
+ ],
6
+ "attn": {
7
+ "layers": [
8
+ 5,
9
+ 11,
10
+ 17,
11
+ 23
12
+ ],
13
+ "num_heads": 16,
14
+ "num_kv_heads": 8,
15
+ "qkv_bias": false,
16
+ "rope_theta": 160000.0,
17
+ "window_size": null
18
+ },
19
+ "attn_mode": "chunk",
20
+ "bos_token_id": 1,
21
+ "conv_size": 4,
22
+ "eos_token_id": 2,
23
+ "expand_k": 1,
24
+ "expand_v": 1,
25
+ "fuse_cross_entropy": true,
26
+ "fuse_norm": true,
27
+ "fuse_swiglu": true,
28
+ "head_dim": 256,
29
+ "hidden_act": "swish",
30
+ "hidden_ratio": 4,
31
+ "hidden_size": 1024,
32
+ "initializer_range": 0.02,
33
+ "intermediate_size": null,
34
+ "max_position_embeddings": 8192,
35
+ "model_type": "gated_deltanet",
36
+ "norm_eps": 1e-06,
37
+ "norm_first": false,
38
+ "num_heads": 4,
39
+ "num_hidden_layers": 24,
40
+ "num_v_heads": null,
41
+ "qk_activation": "silu",
42
+ "qk_norm": "l2",
43
+ "tie_word_embeddings": false,
44
+ "torch_dtype": "float32",
45
+ "transformers_version": "4.53.3",
46
+ "use_beta": true,
47
+ "use_cache": true,
48
+ "use_gate": true,
49
+ "use_l2warp": false,
50
+ "use_output_norm": true,
51
+ "use_short_conv": true,
52
+ "vocab_size": 32000
53
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/delta_net_1B.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn": null,
3
+ "attn_mode": "chunk",
4
+ "bos_token_id": 1,
5
+ "conv_size": 4,
6
+ "eos_token_id": 2,
7
+ "expand_k": 1,
8
+ "expand_v": 1,
9
+ "fuse_cross_entropy": true,
10
+ "fuse_norm": true,
11
+ "hidden_act": "swish",
12
+ "hidden_ratio": 4,
13
+ "hidden_size": 2048,
14
+ "initializer_range": 0.02,
15
+ "intermediate_size": null,
16
+ "model_type": "delta_net",
17
+ "norm_eps": 1e-06,
18
+ "num_heads": 16,
19
+ "num_hidden_layers": 24,
20
+ "pad_token_id": 2,
21
+ "qk_activation": "silu",
22
+ "qk_norm": "l2",
23
+ "tie_word_embeddings": false,
24
+ "use_beta": true,
25
+ "use_cache": true,
26
+ "use_gate": false,
27
+ "use_output_norm": true,
28
+ "use_short_conv": true
29
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/delta_net_340M.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn_mode": "chunk",
3
+ "bos_token_id": 1,
4
+ "conv_size": 4,
5
+ "eos_token_id": 2,
6
+ "expand_k": 1,
7
+ "expand_v": 1,
8
+ "fuse_cross_entropy": true,
9
+ "hidden_act": "swish",
10
+ "hidden_ratio": 4,
11
+ "hidden_size": 1024,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": null,
14
+ "model_type": "delta_net",
15
+ "norm_eps": 1e-06,
16
+ "num_heads": 8,
17
+ "num_hidden_layers": 24,
18
+ "qk_activation": "silu",
19
+ "qk_norm": "l2",
20
+ "tie_word_embeddings": false,
21
+ "use_beta": true,
22
+ "use_cache": true,
23
+ "use_gate": false,
24
+ "use_output_norm": true,
25
+ "use_short_conv": true
26
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gated_deltanet_1B.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn_mode": "chunk",
3
+ "bos_token_id": 1,
4
+ "conv_size": 4,
5
+ "eos_token_id": 2,
6
+ "expand_v": 2,
7
+ "fuse_cross_entropy": true,
8
+ "head_dim": 256,
9
+ "hidden_act": "swish",
10
+ "hidden_ratio": 4,
11
+ "hidden_size": 2048,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": null,
14
+ "model_type": "gated_deltanet",
15
+ "norm_eps": 1e-06,
16
+ "num_heads": 6,
17
+ "num_hidden_layers": 21,
18
+ "tie_word_embeddings": false,
19
+ "use_cache": true,
20
+ "use_gate": true,
21
+ "use_short_conv": true
22
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gated_deltanet_340M.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn_mode": "chunk",
3
+ "bos_token_id": 1,
4
+ "conv_size": 4,
5
+ "eos_token_id": 2,
6
+ "expand_v": 2,
7
+ "fuse_cross_entropy": true,
8
+ "head_dim": 256,
9
+ "hidden_act": "swish",
10
+ "hidden_ratio": 4,
11
+ "hidden_size": 1024,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": null,
14
+ "model_type": "gated_deltanet",
15
+ "norm_eps": 1e-06,
16
+ "num_heads": 6,
17
+ "num_hidden_layers": 21,
18
+ "tie_word_embeddings": false,
19
+ "use_cache": true,
20
+ "use_gate": true,
21
+ "use_short_conv": true
22
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gdn_6_1_340M.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "GatedDeltaNetForCausalLM"
4
+ ],
5
+ "attn": {
6
+ "layers": [
7
+ 5,
8
+ 11,
9
+ 17,
10
+ 23
11
+ ],
12
+ "num_heads": 16,
13
+ "num_kv_heads": 8,
14
+ "qkv_bias": false,
15
+ "rope_theta": 160000.0,
16
+ "window_size": null
17
+ },
18
+ "attn_mode": "chunk",
19
+ "bos_token_id": 1,
20
+ "conv_size": 4,
21
+ "eos_token_id": 2,
22
+ "expand_k": 1,
23
+ "expand_v": 1,
24
+ "fuse_cross_entropy": true,
25
+ "fuse_norm": true,
26
+ "fuse_swiglu": true,
27
+ "head_dim": 256,
28
+ "hidden_act": "swish",
29
+ "hidden_ratio": 4,
30
+ "hidden_size": 1024,
31
+ "initializer_range": 0.02,
32
+ "intermediate_size": null,
33
+ "max_position_embeddings": 8192,
34
+ "model_type": "gated_deltanet",
35
+ "norm_eps": 1e-06,
36
+ "norm_first": false,
37
+ "num_heads": 4,
38
+ "num_hidden_layers": 24,
39
+ "qk_activation": "silu",
40
+ "qk_norm": "l2",
41
+ "tie_word_embeddings": false,
42
+ "torch_dtype": "float32",
43
+ "transformers_version": "4.51.3",
44
+ "use_beta": true,
45
+ "use_cache": true,
46
+ "use_gate": true,
47
+ "use_output_norm": true,
48
+ "use_short_conv": true,
49
+ "vocab_size": 32000
50
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gla_340M.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn_mode": "chunk",
3
+ "bos_token_id": 1,
4
+ "clamp_min": null,
5
+ "eos_token_id": 2,
6
+ "expand_k": 0.5,
7
+ "expand_v": 1,
8
+ "fuse_cross_entropy": true,
9
+ "fuse_norm": true,
10
+ "hidden_act": "swish",
11
+ "hidden_ratio": 4,
12
+ "hidden_size": 1024,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": null,
15
+ "model_type": "gla",
16
+ "num_heads": 4,
17
+ "num_hidden_layers": 24,
18
+ "norm_eps": 1e-06,
19
+ "tie_word_embeddings": false,
20
+ "use_cache": true,
21
+ "use_gk": true,
22
+ "use_gv": false,
23
+ "vocab_size": 32000
24
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gla_7B.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn": null,
3
+ "attn_mode": "chunk",
4
+ "bos_token_id": 1,
5
+ "eos_token_id": 2,
6
+ "expand_k": 0.5,
7
+ "expand_v": 1,
8
+ "fuse_cross_entropy": true,
9
+ "fuse_norm": true,
10
+ "hidden_act": "swish",
11
+ "hidden_ratio": 4,
12
+ "hidden_size": 4096,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 11008,
15
+ "model_type": "gla",
16
+ "norm_eps": 1e-06,
17
+ "num_heads": 16,
18
+ "num_hidden_layers": 32,
19
+ "tie_word_embeddings": false,
20
+ "use_cache": true,
21
+ "use_gk": true,
22
+ "use_gv": false,
23
+ "use_output_gate": true,
24
+ "use_short_conv": false
25
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gsa_340M.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "conv_size": 4,
4
+ "eos_token_id": 2,
5
+ "expand_k": 1,
6
+ "expand_v": 1,
7
+ "elementwise_affine": false,
8
+ "feature_map": "swish",
9
+ "fuse_cross_entropy": true,
10
+ "fuse_norm": true,
11
+ "gate_logit_normalizer": 4,
12
+ "hidden_act": "swish",
13
+ "hidden_ratio": 4,
14
+ "hidden_size": 1024,
15
+ "initializer_range": 0.02,
16
+ "intermediate_size": null,
17
+ "model_type": "gsa",
18
+ "num_heads": 4,
19
+ "num_hidden_layers": 24,
20
+ "num_slots": 64,
21
+ "norm_eps": 1e-06,
22
+ "share_conv_kernel": true,
23
+ "tie_word_embeddings": false,
24
+ "use_cache": true,
25
+ "use_norm": true,
26
+ "use_output_gate": true,
27
+ "use_rope": false,
28
+ "use_short_conv": false
29
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/hgrn2_340M.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn_mode": "chunk",
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "expand_ratio": 128,
6
+ "fuse_cross_entropy": true,
7
+ "fuse_norm": true,
8
+ "hidden_act": "swish",
9
+ "hidden_ratio": 4,
10
+ "hidden_size": 1024,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": null,
13
+ "model_type": "hgrn2",
14
+ "num_heads": 8,
15
+ "num_hidden_layers": 24,
16
+ "norm_eps": 1e-06,
17
+ "tie_word_embeddings": false,
18
+ "use_cache": true,
19
+ "vocab_size": 32000
20
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/mamba2_1B.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "chunk_size": 256,
4
+ "conv_kernel": 4,
5
+ "eos_token_id": 2,
6
+ "expand": 2,
7
+ "fuse_cross_entropy": true,
8
+ "fuse_norm": true,
9
+ "head_dim": 64,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 2048,
12
+ "initializer_range": 0.02,
13
+ "norm_eps": 1e-05,
14
+ "model_type": "mamba2",
15
+ "n_groups": 1,
16
+ "num_hidden_layers": 48,
17
+ "pad_token_id": 0,
18
+ "rescale_prenorm_residual": true,
19
+ "residual_in_fp32": true,
20
+ "rms_norm": true,
21
+ "state_size": 128,
22
+ "tie_word_embeddings": false,
23
+ "time_step_floor": 0.0001,
24
+ "time_step_max": 0.1,
25
+ "time_step_min": 0.001,
26
+ "time_step_rank": 128,
27
+ "transformers_version": "4.50.1",
28
+ "use_bias": false,
29
+ "use_cache": true,
30
+ "use_conv_bias": true,
31
+ "vocab_size": 32000
32
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/mamba2_340M.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "chunk_size": 256,
4
+ "conv_kernel": 4,
5
+ "eos_token_id": 2,
6
+ "expand": 2,
7
+ "fuse_cross_entropy": true,
8
+ "fuse_norm": true,
9
+ "head_dim": 64,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 1024,
12
+ "initializer_range": 0.02,
13
+ "norm_eps": 1e-05,
14
+ "model_type": "mamba2",
15
+ "n_groups": 1,
16
+ "num_hidden_layers": 48,
17
+ "pad_token_id": 0,
18
+ "rescale_prenorm_residual": true,
19
+ "residual_in_fp32": true,
20
+ "rms_norm": true,
21
+ "state_size": 128,
22
+ "tie_word_embeddings": false,
23
+ "time_step_floor": 0.0001,
24
+ "time_step_max": 0.1,
25
+ "time_step_min": 0.001,
26
+ "time_step_rank": 128,
27
+ "transformers_version": "4.50.1",
28
+ "use_bias": false,
29
+ "use_cache": true,
30
+ "use_conv_bias": true,
31
+ "vocab_size": 32000
32
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/mamba_1B.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "conv_kernel": 4,
4
+ "eos_token_id": 2,
5
+ "expand": 2,
6
+ "fuse_cross_entropy": true,
7
+ "fuse_norm": true,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 2048,
10
+ "initializer_range": 0.02,
11
+ "model_type": "mamba",
12
+ "norm_eps": 1e-05,
13
+ "num_hidden_layers": 48,
14
+ "pad_token_id": 0,
15
+ "rescale_prenorm_residual": false,
16
+ "residual_in_fp32": false,
17
+ "state_size": 16,
18
+ "tie_word_embeddings": false,
19
+ "time_step_floor": 0.0001,
20
+ "time_step_init_scheme": "random",
21
+ "time_step_max": 0.1,
22
+ "time_step_min": 0.001,
23
+ "time_step_rank": 128,
24
+ "time_step_scale": 1.0,
25
+ "transformers_version": "4.50.1",
26
+ "use_bias": false,
27
+ "use_cache": true,
28
+ "use_conv_bias": true,
29
+ "vocab_size": 32000
30
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/mamba_340M.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "conv_kernel": 4,
4
+ "eos_token_id": 2,
5
+ "expand": 2,
6
+ "fuse_cross_entropy": true,
7
+ "fuse_norm": true,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 1024,
10
+ "initializer_range": 0.02,
11
+ "model_type": "mamba",
12
+ "norm_eps": 1e-05,
13
+ "num_hidden_layers": 48,
14
+ "pad_token_id": 0,
15
+ "rescale_prenorm_residual": false,
16
+ "residual_in_fp32": false,
17
+ "state_size": 16,
18
+ "tie_word_embeddings": false,
19
+ "time_step_floor": 0.0001,
20
+ "time_step_init_scheme": "random",
21
+ "time_step_max": 0.1,
22
+ "time_step_min": 0.001,
23
+ "time_step_rank": 128,
24
+ "time_step_scale": 1.0,
25
+ "transformers_version": "4.50.1",
26
+ "use_bias": false,
27
+ "use_cache": true,
28
+ "use_conv_bias": true,
29
+ "vocab_size": 32000
30
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/samba_1B.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn": {
3
+ "layers": [
4
+ 1,
5
+ 3,
6
+ 5,
7
+ 7,
8
+ 9,
9
+ 11,
10
+ 13,
11
+ 15,
12
+ 17
13
+ ],
14
+ "num_heads": 18,
15
+ "num_kv_heads": 18,
16
+ "qkv_bias": false,
17
+ "rope_theta": 10000.0,
18
+ "window_size": 2048
19
+ },
20
+ "bos_token_id": 1,
21
+ "conv_kernel": 4,
22
+ "eos_token_id": 2,
23
+ "expand": 2,
24
+ "fuse_cross_entropy": true,
25
+ "fuse_norm": true,
26
+ "fuse_swiglu": true,
27
+ "hidden_act": "swish",
28
+ "hidden_ratio": 4,
29
+ "hidden_size": 2304,
30
+ "initializer_range": 0.02,
31
+ "intermediate_size": 4608,
32
+ "max_position_embeddings": 2048,
33
+ "model_type": "samba",
34
+ "norm_eps": 1e-05,
35
+ "num_hidden_layers": 18,
36
+ "pad_token_id": 0,
37
+ "rescale_prenorm_residual": false,
38
+ "residual_in_fp32": false,
39
+ "state_size": 16,
40
+ "tie_word_embeddings": false,
41
+ "time_step_floor": 0.0001,
42
+ "time_step_init_scheme": "random",
43
+ "time_step_max": 0.1,
44
+ "time_step_min": 0.001,
45
+ "time_step_rank": 144,
46
+ "time_step_scale": 1.0,
47
+ "transformers_version": "4.50.1",
48
+ "use_bias": false,
49
+ "use_cache": true,
50
+ "use_conv_bias": true,
51
+ "vocab_size": 32000
52
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/sba_340m.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attention_bias": false,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "fuse_cross_entropy": true,
6
+ "fuse_norm": true,
7
+ "hidden_act": "swish",
8
+ "hidden_size": 1024,
9
+ "initializer_range": 0.006,
10
+ "max_position_embeddings": 8192,
11
+ "model_type": "sba",
12
+ "num_heads": 16,
13
+ "num_hidden_layers": 24,
14
+ "norm_eps": 1e-06,
15
+ "tie_word_embeddings": false,
16
+ "use_cache": true,
17
+ "vocab_size": 32000
18
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/transformer_1B.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "elementwise_affine": true,
4
+ "eos_token_id": 2,
5
+ "fuse_cross_entropy": true,
6
+ "fuse_norm": true,
7
+ "fuse_swiglu": true,
8
+ "hidden_act": "swish",
9
+ "hidden_ratio": 4,
10
+ "hidden_size": 2048,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": null,
13
+ "max_position_embeddings": 8192,
14
+ "model_type": "transformer",
15
+ "norm_eps": 1e-06,
16
+ "num_heads": 32,
17
+ "num_hidden_layers": 24,
18
+ "num_kv_heads": null,
19
+ "pad_token_id": 2,
20
+ "rope_theta": 10000.0,
21
+ "tie_word_embeddings": false
22
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/transformer_340M.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attention_bias": false,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "fuse_cross_entropy": true,
6
+ "fuse_norm": true,
7
+ "hidden_act": "swish",
8
+ "hidden_size": 1024,
9
+ "initializer_range": 0.02,
10
+ "max_position_embeddings": 8192,
11
+ "model_type": "transformer",
12
+ "num_heads": 16,
13
+ "num_hidden_layers": 24,
14
+ "norm_eps": 1e-06,
15
+ "tie_word_embeddings": false,
16
+ "use_cache": true,
17
+ "vocab_size": 32000
18
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/transformer_7B.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attention_bias": false,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "fuse_cross_entropy": true,
6
+ "fuse_norm": true,
7
+ "hidden_act": "swish",
8
+ "hidden_ratio": 4,
9
+ "hidden_size": 4096,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 14336,
12
+ "model_type": "transformer",
13
+ "norm_eps": 1e-06,
14
+ "num_heads": 32,
15
+ "num_hidden_layers": 32,
16
+ "num_kv_heads": 8,
17
+ "rope_theta": 10000.0,
18
+ "tie_word_embeddings": false,
19
+ "use_cache": true,
20
+ "window_size": null
21
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "transformers_version": "4.53.3"
6
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/0/stderr.log ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01ca6611958ad6032d4edeeff70cbced189831c7377a0d51d45295d800608176
3
+ size 28967020
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/0/stdout.log ADDED
File without changes
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/1/stderr.log ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0400a66f534cf20e161ed17fc4c43e5af67548222b5b1dd08a95ee48c7509f55
3
+ size 28965495
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/1/stdout.log ADDED
File without changes
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/2/stderr.log ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98ccf26ea910a98e3c8667a099548c305e800682d3c67595e736442d684f00d1
3
+ size 28965482
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/2/stdout.log ADDED
File without changes
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/3/stderr.log ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab73f37388b964be320787d56364d2a2f150cdba3a5f2fb0b1e11539fc3d9f0b
3
+ size 28965482
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/3/stdout.log ADDED
File without changes
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/4/stderr.log ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76ff987941147c2805732a8f40dad209c7e620ea25e67a9b54ae1194eec3aefd
3
+ size 28965439
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/4/stdout.log ADDED
File without changes
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/5/stderr.log ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57408e049045a8149f91bbb25069512d449c8d0025721db53ef2f2bbf22e590c
3
+ size 28965482
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/5/stdout.log ADDED
File without changes
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/6/stderr.log ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66451df5da256bdad7f31ee0017121a643a962158ac753db9a5594a84a9ae7b7
3
+ size 28965491
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/6/stdout.log ADDED
File without changes
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/7/stderr.log ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3604399b4204803d0a1fa31bf0ad96addc1ff316d5c1e2265ae0c6b2673d2523
3
+ size 28965484
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/7/stdout.log ADDED
File without changes
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7e377dafd4fe52f30ee081c674f669620d072a4ec98b9c278dc239233984f6f
3
+ size 1564281448
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/model_size=391m ADDED
File without changes
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "unk_token": {
17
+ "content": "<unk>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/tb/20250723-1049/events.out.tfevents.1753238968.TENCENT64.site.2520914.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86ccb5e4c12d2c9b871f0a88179c24fc8ec3dbccac753b47b0578c1f636804ae
3
+ size 97469568
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "additional_special_tokens": [],
32
+ "bos_token": "<s>",
33
+ "clean_up_tokenization_spaces": false,
34
+ "eos_token": "</s>",
35
+ "extra_special_tokens": {},
36
+ "legacy": true,
37
+ "model_max_length": 1000000000000000019884624838656,
38
+ "pad_token": null,
39
+ "sp_model_kwargs": {},
40
+ "spaces_between_special_tokens": false,
41
+ "tokenizer_class": "LlamaTokenizerFast",
42
+ "unk_token": "<unk>",
43
+ "use_default_system_prompt": false
44
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/train.sh ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/bash
2
+
3
+ params=""
4
+ if [ $# -ne 0 ]; then
5
+ params="$*"
6
+ fi
7
+
8
+ # use envs as local params for convenience
9
+ # e.g.
10
+ # NNODE=1 NGPU=8 LOG_RANK=0 ./train.sh
11
+ NNODE=${NNODE:-"1"}
12
+ NGPU=${NGPU:-"8"}
13
+ LOG_RANK=${LOG_RANK:-0}
14
+
15
+ if [[ -z "${MASTER_ADDR}" ]]; then
16
+ export MASTER_ADDR="localhost"
17
+ fi
18
+ if [[ -z "${MASTER_PORT}" ]]; then
19
+ export MASTER_PORT="0"
20
+ fi
21
+
22
+ : '
23
+ Usage:
24
+
25
+ bash train.sh -h
26
+
27
+ Training a 340M model:
28
+
29
+ NNODE=1 NGPU=8 LOG_RANK=0 bash train.sh \
30
+ --job.config_file flame/models/fla.toml \
31
+ --job.dump_folder exp/transformer-340M-10B/batch32.seqlen2048.warmup1024.update1.steps20480.lr3e-4 \
32
+ --model.config configs/transformer_340M.json \
33
+ --model.tokenizer_path fla-hub/transformer-1.3B-100B \
34
+ --optimizer.name AdamW \
35
+ --optimizer.eps 1e-15 \
36
+ --optimizer.lr 3e-4 \
37
+ --lr_scheduler.warmup_steps 1024 \
38
+ --lr_scheduler.lr_min 0.1 \
39
+ --lr_scheduler.decay_type cosine \
40
+ --training.batch_size 32 \
41
+ --training.seq_len 2048 \
42
+ --training.gradient_accumulation_steps 1 \
43
+ --training.steps 20480 \
44
+ --training.max_norm 1.0 \
45
+ --training.skip_nan_inf \
46
+ --training.dataset HuggingFaceFW/fineweb-edu \
47
+ --training.dataset_name default \
48
+ --training.dataset_split train \
49
+ --training.streaming \
50
+ --training.num_workers 32 \
51
+ --training.prefetch_factor 2 \
52
+ --training.seed 42 \
53
+ --training.compile \
54
+ --training.tensor_parallel_degree 1 \
55
+ --training.disable_loss_parallel \
56
+ --checkpoint.interval 2048 \
57
+ --checkpoint.load_step -1 \
58
+ --metrics.log_freq 1
59
+ '
60
+
61
+ echo "Launching training..."
62
+
63
+ set -x
64
+ path=$(grep -oP '(?<=--job.dump_folder )[^ ]+' <<< "$params")
65
+ steps=$(grep -oP '(?<=--training.steps )[^ ]+' <<< "$params")
66
+ config=$(grep -oP '(?<=--model.config )[^ ]+' <<< "$params")
67
+ tokenizer=$(grep -oP '(?<=--model.tokenizer_path )[^ ]+' <<< "$params")
68
+ model=$(
69
+ python -c "import fla, sys; from transformers import AutoConfig; print(AutoConfig.from_pretrained(sys.argv[1]).to_json_string())" "$config" | jq -r '.model_type'
70
+ )
71
+
72
+ mkdir -p $path
73
+ cp *.sh $path
74
+ cp -r configs $path
75
+ cp -r flame $path
76
+ cp -r 3rdparty/flash-linear-attention/fla $path
77
+ cp -r 3rdparty/torchtitan/torchtitan $path
78
+
79
+ # for offline systems
80
+ # export TRANSFORMERS_OFFLINE=1
81
+ # export HF_DATASETS_OFFLINE=1
82
+ # export HF_HUB_OFFLINE=1
83
+ if [ "$date" == "" ]; then
84
+ date=$(date +%Y%m%d%H%M)
85
+ fi
86
+ RUN_NAME="$model-$(basename $path)"
87
+ RUN_ID="$RUN_NAME-$date"
88
+
89
+ export WANDB_RESUME=allow
90
+ if [[ -z "${WANDB_PROJECT}" ]]; then
91
+ export WANDB_PROJECT="fla"
92
+ fi
93
+ if [[ -z "${WANDB_NAME}" ]]; then
94
+ export WANDB_NAME="$RUN_NAME"
95
+ fi
96
+ if [[ -z "${WANDB_RUN_ID}" ]]; then
97
+ export WANDB_RUN_ID="$RUN_ID"
98
+ fi
99
+
100
+ PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True" \
101
+ torchrun --nnodes=${NNODE} \
102
+ --nproc_per_node=${NGPU} \
103
+ --rdzv_backend c10d \
104
+ --rdzv_endpoint "${MASTER_ADDR}:${MASTER_PORT}" \
105
+ --local-ranks-filter ${LOG_RANK} \
106
+ --role rank \
107
+ --tee 3 \
108
+ --log-dir $path/logs \
109
+ -m flame.train \
110
+ $params
111
+
112
+ echo "TRAINING DONE!"
113
+ echo "Converting the DCP checkpoints to HF format..."
114
+
115
+ python -m flame.utils.convert_dcp_to_hf \
116
+ --path $path \
117
+ --step $steps \
118
+ --config $config \
119
+ --tokenizer $tokenizer
120
+
121
+ echo "RUNNING DONE!"