IvanHU commited on
Commit
afb4f1a
·
verified ·
1 Parent(s): ab79e7d

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes. See the raw diff.
Files changed (50) hide show
  1. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/8k-100.sh +65 -0
  2. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/config.json +53 -0
  3. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/delta_net_1B.json +29 -0
  4. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/delta_net_340M.json +26 -0
  5. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gated_deltanet_1B.json +22 -0
  6. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gated_deltanet_340M.json +22 -0
  7. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gdn_6_1_340M.json +50 -0
  8. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gla_340M.json +24 -0
  9. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gla_7B.json +25 -0
  10. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gsa_340M.json +29 -0
  11. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/hgrn2_340M.json +20 -0
  12. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/mamba2_1B.json +32 -0
  13. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/mamba2_340M.json +32 -0
  14. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/mamba_1B.json +30 -0
  15. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/mamba_340M.json +30 -0
  16. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/samba_1B.json +52 -0
  17. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/sba_340m.json +18 -0
  18. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/transformer_1B.json +22 -0
  19. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/transformer_340M.json +18 -0
  20. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/transformer_7B.json +21 -0
  21. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/0/stderr.log +6 -0
  22. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/0/stdout.log +0 -0
  23. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/1/stderr.log +6 -0
  24. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/1/stdout.log +0 -0
  25. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/2/stderr.log +6 -0
  26. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/2/stdout.log +0 -0
  27. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/3/stderr.log +6 -0
  28. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/3/stdout.log +0 -0
  29. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/4/stderr.log +6 -0
  30. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/4/stdout.log +0 -0
  31. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/5/stderr.log +6 -0
  32. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/5/stdout.log +0 -0
  33. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/6/stderr.log +6 -0
  34. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/6/stdout.log +0 -0
  35. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/7/stderr.log +6 -0
  36. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/7/stdout.log +0 -0
  37. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/0/stderr.log +0 -0
  38. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/0/stdout.log +0 -0
  39. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/1/stderr.log +187 -0
  40. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/1/stdout.log +0 -0
  41. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/2/stderr.log +187 -0
  42. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/2/stdout.log +0 -0
  43. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/3/stderr.log +187 -0
  44. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/3/stdout.log +0 -0
  45. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/4/stderr.log +187 -0
  46. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/4/stdout.log +0 -0
  47. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/5/stderr.log +187 -0
  48. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/5/stdout.log +0 -0
  49. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/6/stderr.log +187 -0
  50. gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/6/stdout.log +0 -0
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/8k-100.sh ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FLAME_PATH=/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame
2
+ DATASET_ROOT=/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset
3
+ TOKENIZER=/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer
4
+
5
+ cd $FLAME_PATH
6
+ source .venv/bin/activate
7
+
8
+ # =========== train config ===========
9
+ CONFIG=${1:-transformer_340M.json}
10
+ SEQ_LEN=8192
11
+ WARMUP_STEPS=100
12
+ STEPS=95366
13
+ LR=3e-4
14
+ BATCH_SIZE=16
15
+ DECAY_TYPE=linear
16
+ DECAY_RATIO=1
17
+
18
+ NNODE=1
19
+ NGPU=8
20
+ LOG_RANK=0
21
+ # ====================================
22
+
23
+ # if jq command is not found, install it
24
+ if ! command -v jq &> /dev/null; then
25
+ echo "jq could not be found, installing it..."
26
+ sudo yum install -y jq
27
+ fi
28
+
29
+ EXP_NAME=$(basename $CONFIG | sed 's/\.config//')-ctx${SEQ_LEN}-steps${STEPS}-lr${LR}-decay_type${DECAY_TYPE}-decay_ratio${DECAY_RATIO}-bs${BATCH_SIZE}-nn${NNODE}
30
+
31
+ bash train.sh \
32
+ --job.config_file flame/models/fla.toml \
33
+ --job.dump_folder $FLAME_PATH/exp/$EXP_NAME \
34
+ --model.config $FLAME_PATH/configs/$CONFIG \
35
+ --model.tokenizer_path $TOKENIZER \
36
+ --optimizer.name AdamW \
37
+ --optimizer.eps 1e-8 \
38
+ --optimizer.lr $LR \
39
+ --lr_scheduler.warmup_steps $WARMUP_STEPS \
40
+ --lr_scheduler.lr_min 0.01 \
41
+ --lr_scheduler.decay_type $DECAY_TYPE \
42
+ --lr_scheduler.decay_ratio $DECAY_RATIO \
43
+ --training.batch_size $BATCH_SIZE \
44
+ --training.seq_len $SEQ_LEN \
45
+ --training.context_len $SEQ_LEN \
46
+ --training.gradient_accumulation_steps 1 \
47
+ --training.steps $STEPS \
48
+ --training.max_norm 1.0 \
49
+ --training.skip_nan_inf \
50
+ --training.dataset $DATASET_ROOT/fineweb-edu-sample,$DATASET_ROOT/small_repos_20B_sample_merged,$DATASET_ROOT/megamath-web-pro \
51
+ --training.data_probs 0.55,0.3,0.15 \
52
+ --training.dataset_split train,train,train \
53
+ --training.dataset_name default,default,default \
54
+ --training.streaming \
55
+ --training.num_workers 32 \
56
+ --training.prefetch_factor 2 \
57
+ --training.seed 42 \
58
+ --training.compile \
59
+ --checkpoint.interval 8192 \
60
+ --checkpoint.load_step -1 \
61
+ --checkpoint.keep_latest_k 100 \
62
+ --metrics.log_freq 1 \
63
+ --metrics.enable_tensorboard \
64
+ --training.streaming
65
+
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/config.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "allow_neg_eigval": false,
3
+ "architectures": [
4
+ "GatedDeltaNetForCausalLM"
5
+ ],
6
+ "attn": {
7
+ "layers": [
8
+ 5,
9
+ 11,
10
+ 17,
11
+ 23
12
+ ],
13
+ "num_heads": 16,
14
+ "num_kv_heads": 8,
15
+ "qkv_bias": false,
16
+ "rope_theta": 160000.0,
17
+ "window_size": null
18
+ },
19
+ "attn_mode": "chunk",
20
+ "bos_token_id": 1,
21
+ "conv_size": 4,
22
+ "eos_token_id": 2,
23
+ "expand_k": 1,
24
+ "expand_v": 1,
25
+ "fuse_cross_entropy": true,
26
+ "fuse_norm": true,
27
+ "fuse_swiglu": true,
28
+ "head_dim": 256,
29
+ "hidden_act": "swish",
30
+ "hidden_ratio": 4,
31
+ "hidden_size": 1024,
32
+ "initializer_range": 0.02,
33
+ "intermediate_size": null,
34
+ "max_position_embeddings": 8192,
35
+ "model_type": "gated_deltanet",
36
+ "norm_eps": 1e-06,
37
+ "norm_first": false,
38
+ "num_heads": 4,
39
+ "num_hidden_layers": 24,
40
+ "num_v_heads": null,
41
+ "qk_activation": "silu",
42
+ "qk_norm": "l2",
43
+ "tie_word_embeddings": false,
44
+ "torch_dtype": "float32",
45
+ "transformers_version": "4.53.3",
46
+ "use_beta": true,
47
+ "use_cache": true,
48
+ "use_gate": true,
49
+ "use_l2warp": false,
50
+ "use_output_norm": true,
51
+ "use_short_conv": true,
52
+ "vocab_size": 32000
53
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/delta_net_1B.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn": null,
3
+ "attn_mode": "chunk",
4
+ "bos_token_id": 1,
5
+ "conv_size": 4,
6
+ "eos_token_id": 2,
7
+ "expand_k": 1,
8
+ "expand_v": 1,
9
+ "fuse_cross_entropy": true,
10
+ "fuse_norm": true,
11
+ "hidden_act": "swish",
12
+ "hidden_ratio": 4,
13
+ "hidden_size": 2048,
14
+ "initializer_range": 0.02,
15
+ "intermediate_size": null,
16
+ "model_type": "delta_net",
17
+ "norm_eps": 1e-06,
18
+ "num_heads": 16,
19
+ "num_hidden_layers": 24,
20
+ "pad_token_id": 2,
21
+ "qk_activation": "silu",
22
+ "qk_norm": "l2",
23
+ "tie_word_embeddings": false,
24
+ "use_beta": true,
25
+ "use_cache": true,
26
+ "use_gate": false,
27
+ "use_output_norm": true,
28
+ "use_short_conv": true
29
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/delta_net_340M.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn_mode": "chunk",
3
+ "bos_token_id": 1,
4
+ "conv_size": 4,
5
+ "eos_token_id": 2,
6
+ "expand_k": 1,
7
+ "expand_v": 1,
8
+ "fuse_cross_entropy": true,
9
+ "hidden_act": "swish",
10
+ "hidden_ratio": 4,
11
+ "hidden_size": 1024,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": null,
14
+ "model_type": "delta_net",
15
+ "norm_eps": 1e-06,
16
+ "num_heads": 8,
17
+ "num_hidden_layers": 24,
18
+ "qk_activation": "silu",
19
+ "qk_norm": "l2",
20
+ "tie_word_embeddings": false,
21
+ "use_beta": true,
22
+ "use_cache": true,
23
+ "use_gate": false,
24
+ "use_output_norm": true,
25
+ "use_short_conv": true
26
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gated_deltanet_1B.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn_mode": "chunk",
3
+ "bos_token_id": 1,
4
+ "conv_size": 4,
5
+ "eos_token_id": 2,
6
+ "expand_v": 2,
7
+ "fuse_cross_entropy": true,
8
+ "head_dim": 256,
9
+ "hidden_act": "swish",
10
+ "hidden_ratio": 4,
11
+ "hidden_size": 2048,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": null,
14
+ "model_type": "gated_deltanet",
15
+ "norm_eps": 1e-06,
16
+ "num_heads": 6,
17
+ "num_hidden_layers": 21,
18
+ "tie_word_embeddings": false,
19
+ "use_cache": true,
20
+ "use_gate": true,
21
+ "use_short_conv": true
22
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gated_deltanet_340M.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn_mode": "chunk",
3
+ "bos_token_id": 1,
4
+ "conv_size": 4,
5
+ "eos_token_id": 2,
6
+ "expand_v": 2,
7
+ "fuse_cross_entropy": true,
8
+ "head_dim": 256,
9
+ "hidden_act": "swish",
10
+ "hidden_ratio": 4,
11
+ "hidden_size": 1024,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": null,
14
+ "model_type": "gated_deltanet",
15
+ "norm_eps": 1e-06,
16
+ "num_heads": 6,
17
+ "num_hidden_layers": 21,
18
+ "tie_word_embeddings": false,
19
+ "use_cache": true,
20
+ "use_gate": true,
21
+ "use_short_conv": true
22
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gdn_6_1_340M.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "GatedDeltaNetForCausalLM"
4
+ ],
5
+ "attn": {
6
+ "layers": [
7
+ 5,
8
+ 11,
9
+ 17,
10
+ 23
11
+ ],
12
+ "num_heads": 16,
13
+ "num_kv_heads": 8,
14
+ "qkv_bias": false,
15
+ "rope_theta": 160000.0,
16
+ "window_size": null
17
+ },
18
+ "attn_mode": "chunk",
19
+ "bos_token_id": 1,
20
+ "conv_size": 4,
21
+ "eos_token_id": 2,
22
+ "expand_k": 1,
23
+ "expand_v": 1,
24
+ "fuse_cross_entropy": true,
25
+ "fuse_norm": true,
26
+ "fuse_swiglu": true,
27
+ "head_dim": 256,
28
+ "hidden_act": "swish",
29
+ "hidden_ratio": 4,
30
+ "hidden_size": 1024,
31
+ "initializer_range": 0.02,
32
+ "intermediate_size": null,
33
+ "max_position_embeddings": 8192,
34
+ "model_type": "gated_deltanet",
35
+ "norm_eps": 1e-06,
36
+ "norm_first": false,
37
+ "num_heads": 4,
38
+ "num_hidden_layers": 24,
39
+ "qk_activation": "silu",
40
+ "qk_norm": "l2",
41
+ "tie_word_embeddings": false,
42
+ "torch_dtype": "float32",
43
+ "transformers_version": "4.51.3",
44
+ "use_beta": true,
45
+ "use_cache": true,
46
+ "use_gate": true,
47
+ "use_output_norm": true,
48
+ "use_short_conv": true,
49
+ "vocab_size": 32000
50
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gla_340M.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn_mode": "chunk",
3
+ "bos_token_id": 1,
4
+ "clamp_min": null,
5
+ "eos_token_id": 2,
6
+ "expand_k": 0.5,
7
+ "expand_v": 1,
8
+ "fuse_cross_entropy": true,
9
+ "fuse_norm": true,
10
+ "hidden_act": "swish",
11
+ "hidden_ratio": 4,
12
+ "hidden_size": 1024,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": null,
15
+ "model_type": "gla",
16
+ "num_heads": 4,
17
+ "num_hidden_layers": 24,
18
+ "norm_eps": 1e-06,
19
+ "tie_word_embeddings": false,
20
+ "use_cache": true,
21
+ "use_gk": true,
22
+ "use_gv": false,
23
+ "vocab_size": 32000
24
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gla_7B.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn": null,
3
+ "attn_mode": "chunk",
4
+ "bos_token_id": 1,
5
+ "eos_token_id": 2,
6
+ "expand_k": 0.5,
7
+ "expand_v": 1,
8
+ "fuse_cross_entropy": true,
9
+ "fuse_norm": true,
10
+ "hidden_act": "swish",
11
+ "hidden_ratio": 4,
12
+ "hidden_size": 4096,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 11008,
15
+ "model_type": "gla",
16
+ "norm_eps": 1e-06,
17
+ "num_heads": 16,
18
+ "num_hidden_layers": 32,
19
+ "tie_word_embeddings": false,
20
+ "use_cache": true,
21
+ "use_gk": true,
22
+ "use_gv": false,
23
+ "use_output_gate": true,
24
+ "use_short_conv": false
25
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gsa_340M.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "conv_size": 4,
4
+ "eos_token_id": 2,
5
+ "expand_k": 1,
6
+ "expand_v": 1,
7
+ "elementwise_affine": false,
8
+ "feature_map": "swish",
9
+ "fuse_cross_entropy": true,
10
+ "fuse_norm": true,
11
+ "gate_logit_normalizer": 4,
12
+ "hidden_act": "swish",
13
+ "hidden_ratio": 4,
14
+ "hidden_size": 1024,
15
+ "initializer_range": 0.02,
16
+ "intermediate_size": null,
17
+ "model_type": "gsa",
18
+ "num_heads": 4,
19
+ "num_hidden_layers": 24,
20
+ "num_slots": 64,
21
+ "norm_eps": 1e-06,
22
+ "share_conv_kernel": true,
23
+ "tie_word_embeddings": false,
24
+ "use_cache": true,
25
+ "use_norm": true,
26
+ "use_output_gate": true,
27
+ "use_rope": false,
28
+ "use_short_conv": false
29
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/hgrn2_340M.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn_mode": "chunk",
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "expand_ratio": 128,
6
+ "fuse_cross_entropy": true,
7
+ "fuse_norm": true,
8
+ "hidden_act": "swish",
9
+ "hidden_ratio": 4,
10
+ "hidden_size": 1024,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": null,
13
+ "model_type": "hgrn2",
14
+ "num_heads": 8,
15
+ "num_hidden_layers": 24,
16
+ "norm_eps": 1e-06,
17
+ "tie_word_embeddings": false,
18
+ "use_cache": true,
19
+ "vocab_size": 32000
20
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/mamba2_1B.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "chunk_size": 256,
4
+ "conv_kernel": 4,
5
+ "eos_token_id": 2,
6
+ "expand": 2,
7
+ "fuse_cross_entropy": true,
8
+ "fuse_norm": true,
9
+ "head_dim": 64,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 2048,
12
+ "initializer_range": 0.02,
13
+ "norm_eps": 1e-05,
14
+ "model_type": "mamba2",
15
+ "n_groups": 1,
16
+ "num_hidden_layers": 48,
17
+ "pad_token_id": 0,
18
+ "rescale_prenorm_residual": true,
19
+ "residual_in_fp32": true,
20
+ "rms_norm": true,
21
+ "state_size": 128,
22
+ "tie_word_embeddings": false,
23
+ "time_step_floor": 0.0001,
24
+ "time_step_max": 0.1,
25
+ "time_step_min": 0.001,
26
+ "time_step_rank": 128,
27
+ "transformers_version": "4.50.1",
28
+ "use_bias": false,
29
+ "use_cache": true,
30
+ "use_conv_bias": true,
31
+ "vocab_size": 32000
32
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/mamba2_340M.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "chunk_size": 256,
4
+ "conv_kernel": 4,
5
+ "eos_token_id": 2,
6
+ "expand": 2,
7
+ "fuse_cross_entropy": true,
8
+ "fuse_norm": true,
9
+ "head_dim": 64,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 1024,
12
+ "initializer_range": 0.02,
13
+ "norm_eps": 1e-05,
14
+ "model_type": "mamba2",
15
+ "n_groups": 1,
16
+ "num_hidden_layers": 48,
17
+ "pad_token_id": 0,
18
+ "rescale_prenorm_residual": true,
19
+ "residual_in_fp32": true,
20
+ "rms_norm": true,
21
+ "state_size": 128,
22
+ "tie_word_embeddings": false,
23
+ "time_step_floor": 0.0001,
24
+ "time_step_max": 0.1,
25
+ "time_step_min": 0.001,
26
+ "time_step_rank": 128,
27
+ "transformers_version": "4.50.1",
28
+ "use_bias": false,
29
+ "use_cache": true,
30
+ "use_conv_bias": true,
31
+ "vocab_size": 32000
32
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/mamba_1B.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "conv_kernel": 4,
4
+ "eos_token_id": 2,
5
+ "expand": 2,
6
+ "fuse_cross_entropy": true,
7
+ "fuse_norm": true,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 2048,
10
+ "initializer_range": 0.02,
11
+ "model_type": "mamba",
12
+ "norm_eps": 1e-05,
13
+ "num_hidden_layers": 48,
14
+ "pad_token_id": 0,
15
+ "rescale_prenorm_residual": false,
16
+ "residual_in_fp32": false,
17
+ "state_size": 16,
18
+ "tie_word_embeddings": false,
19
+ "time_step_floor": 0.0001,
20
+ "time_step_init_scheme": "random",
21
+ "time_step_max": 0.1,
22
+ "time_step_min": 0.001,
23
+ "time_step_rank": 128,
24
+ "time_step_scale": 1.0,
25
+ "transformers_version": "4.50.1",
26
+ "use_bias": false,
27
+ "use_cache": true,
28
+ "use_conv_bias": true,
29
+ "vocab_size": 32000
30
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/mamba_340M.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "conv_kernel": 4,
4
+ "eos_token_id": 2,
5
+ "expand": 2,
6
+ "fuse_cross_entropy": true,
7
+ "fuse_norm": true,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 1024,
10
+ "initializer_range": 0.02,
11
+ "model_type": "mamba",
12
+ "norm_eps": 1e-05,
13
+ "num_hidden_layers": 48,
14
+ "pad_token_id": 0,
15
+ "rescale_prenorm_residual": false,
16
+ "residual_in_fp32": false,
17
+ "state_size": 16,
18
+ "tie_word_embeddings": false,
19
+ "time_step_floor": 0.0001,
20
+ "time_step_init_scheme": "random",
21
+ "time_step_max": 0.1,
22
+ "time_step_min": 0.001,
23
+ "time_step_rank": 128,
24
+ "time_step_scale": 1.0,
25
+ "transformers_version": "4.50.1",
26
+ "use_bias": false,
27
+ "use_cache": true,
28
+ "use_conv_bias": true,
29
+ "vocab_size": 32000
30
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/samba_1B.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn": {
3
+ "layers": [
4
+ 1,
5
+ 3,
6
+ 5,
7
+ 7,
8
+ 9,
9
+ 11,
10
+ 13,
11
+ 15,
12
+ 17
13
+ ],
14
+ "num_heads": 18,
15
+ "num_kv_heads": 18,
16
+ "qkv_bias": false,
17
+ "rope_theta": 10000.0,
18
+ "window_size": 2048
19
+ },
20
+ "bos_token_id": 1,
21
+ "conv_kernel": 4,
22
+ "eos_token_id": 2,
23
+ "expand": 2,
24
+ "fuse_cross_entropy": true,
25
+ "fuse_norm": true,
26
+ "fuse_swiglu": true,
27
+ "hidden_act": "swish",
28
+ "hidden_ratio": 4,
29
+ "hidden_size": 2304,
30
+ "initializer_range": 0.02,
31
+ "intermediate_size": 4608,
32
+ "max_position_embeddings": 2048,
33
+ "model_type": "samba",
34
+ "norm_eps": 1e-05,
35
+ "num_hidden_layers": 18,
36
+ "pad_token_id": 0,
37
+ "rescale_prenorm_residual": false,
38
+ "residual_in_fp32": false,
39
+ "state_size": 16,
40
+ "tie_word_embeddings": false,
41
+ "time_step_floor": 0.0001,
42
+ "time_step_init_scheme": "random",
43
+ "time_step_max": 0.1,
44
+ "time_step_min": 0.001,
45
+ "time_step_rank": 144,
46
+ "time_step_scale": 1.0,
47
+ "transformers_version": "4.50.1",
48
+ "use_bias": false,
49
+ "use_cache": true,
50
+ "use_conv_bias": true,
51
+ "vocab_size": 32000
52
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/sba_340m.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attention_bias": false,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "fuse_cross_entropy": true,
6
+ "fuse_norm": true,
7
+ "hidden_act": "swish",
8
+ "hidden_size": 1024,
9
+ "initializer_range": 0.006,
10
+ "max_position_embeddings": 8192,
11
+ "model_type": "sba",
12
+ "num_heads": 16,
13
+ "num_hidden_layers": 24,
14
+ "norm_eps": 1e-06,
15
+ "tie_word_embeddings": false,
16
+ "use_cache": true,
17
+ "vocab_size": 32000
18
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/transformer_1B.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "elementwise_affine": true,
4
+ "eos_token_id": 2,
5
+ "fuse_cross_entropy": true,
6
+ "fuse_norm": true,
7
+ "fuse_swiglu": true,
8
+ "hidden_act": "swish",
9
+ "hidden_ratio": 4,
10
+ "hidden_size": 2048,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": null,
13
+ "max_position_embeddings": 8192,
14
+ "model_type": "transformer",
15
+ "norm_eps": 1e-06,
16
+ "num_heads": 32,
17
+ "num_hidden_layers": 24,
18
+ "num_kv_heads": null,
19
+ "pad_token_id": 2,
20
+ "rope_theta": 10000.0,
21
+ "tie_word_embeddings": false
22
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/transformer_340M.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attention_bias": false,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "fuse_cross_entropy": true,
6
+ "fuse_norm": true,
7
+ "hidden_act": "swish",
8
+ "hidden_size": 1024,
9
+ "initializer_range": 0.02,
10
+ "max_position_embeddings": 8192,
11
+ "model_type": "transformer",
12
+ "num_heads": 16,
13
+ "num_hidden_layers": 24,
14
+ "norm_eps": 1e-06,
15
+ "tie_word_embeddings": false,
16
+ "use_cache": true,
17
+ "vocab_size": 32000
18
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/transformer_7B.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attention_bias": false,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "fuse_cross_entropy": true,
6
+ "fuse_norm": true,
7
+ "hidden_act": "swish",
8
+ "hidden_ratio": 4,
9
+ "hidden_size": 4096,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 14336,
12
+ "model_type": "transformer",
13
+ "norm_eps": 1e-06,
14
+ "num_heads": 32,
15
+ "num_hidden_layers": 32,
16
+ "num_kv_heads": 8,
17
+ "rope_theta": 10000.0,
18
+ "tie_word_embeddings": false,
19
+ "use_cache": true,
20
+ "window_size": null
21
+ }
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/0/stderr.log ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ Traceback (most recent call last):
2
+ File "<frozen runpy>", line 198, in _run_module_as_main
3
+ File "<frozen runpy>", line 88, in _run_code
4
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 17, in <module>
5
+ from torchtitan.components.checkpoint import CheckpointManager
6
+ ModuleNotFoundError: No module named 'torchtitan'
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/0/stdout.log ADDED
File without changes
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/1/stderr.log ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ Traceback (most recent call last):
2
+ File "<frozen runpy>", line 198, in _run_module_as_main
3
+ File "<frozen runpy>", line 88, in _run_code
4
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 17, in <module>
5
+ from torchtitan.components.checkpoint import CheckpointManager
6
+ ModuleNotFoundError: No module named 'torchtitan'
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/1/stdout.log ADDED
File without changes
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/2/stderr.log ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ Traceback (most recent call last):
2
+ File "<frozen runpy>", line 198, in _run_module_as_main
3
+ File "<frozen runpy>", line 88, in _run_code
4
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 17, in <module>
5
+ from torchtitan.components.checkpoint import CheckpointManager
6
+ ModuleNotFoundError: No module named 'torchtitan'
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/2/stdout.log ADDED
File without changes
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/3/stderr.log ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ Traceback (most recent call last):
2
+ File "<frozen runpy>", line 198, in _run_module_as_main
3
+ File "<frozen runpy>", line 88, in _run_code
4
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 17, in <module>
5
+ from torchtitan.components.checkpoint import CheckpointManager
6
+ ModuleNotFoundError: No module named 'torchtitan'
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/3/stdout.log ADDED
File without changes
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/4/stderr.log ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ Traceback (most recent call last):
2
+ File "<frozen runpy>", line 198, in _run_module_as_main
3
+ File "<frozen runpy>", line 88, in _run_code
4
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 17, in <module>
5
+ from torchtitan.components.checkpoint import CheckpointManager
6
+ ModuleNotFoundError: No module named 'torchtitan'
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/4/stdout.log ADDED
File without changes
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/5/stderr.log ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ Traceback (most recent call last):
2
+ File "<frozen runpy>", line 198, in _run_module_as_main
3
+ File "<frozen runpy>", line 88, in _run_code
4
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 17, in <module>
5
+ from torchtitan.components.checkpoint import CheckpointManager
6
+ ModuleNotFoundError: No module named 'torchtitan'
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/5/stdout.log ADDED
File without changes
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/6/stderr.log ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ Traceback (most recent call last):
2
+ File "<frozen runpy>", line 198, in _run_module_as_main
3
+ File "<frozen runpy>", line 88, in _run_code
4
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 17, in <module>
5
+ from torchtitan.components.checkpoint import CheckpointManager
6
+ ModuleNotFoundError: No module named 'torchtitan'
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/6/stdout.log ADDED
File without changes
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/7/stderr.log ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ Traceback (most recent call last):
2
+ File "<frozen runpy>", line 198, in _run_module_as_main
3
+ File "<frozen runpy>", line 88, in _run_code
4
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 17, in <module>
5
+ from torchtitan.components.checkpoint import CheckpointManager
6
+ ModuleNotFoundError: No module named 'torchtitan'
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/7/stdout.log ADDED
File without changes
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/0/stderr.log ADDED
The diff for this file is too large to render. See raw diff
 
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/0/stdout.log ADDED
File without changes
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/1/stderr.log ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [titan] 2025-07-22 22:47:41,941 - root - INFO - Starting job: default job
2
+ [titan] 2025-07-22 22:47:41,941 - root - INFO - {
3
+ "activation_checkpoint": {
4
+ "mode": "none",
5
+ "selective_ac_option": "2"
6
+ },
7
+ "activation_offload": {
8
+ "mode": "none"
9
+ },
10
+ "checkpoint": {
11
+ "async_mode": "disabled",
12
+ "create_seed_checkpoint": false,
13
+ "enable_checkpoint": true,
14
+ "exclude_from_loading": [],
15
+ "export_dtype": "float32",
16
+ "folder": "checkpoint",
17
+ "interval": 8192,
18
+ "interval_type": "steps",
19
+ "keep_latest_k": 100,
20
+ "load_step": -1,
21
+ "model_weights_only": false
22
+ },
23
+ "comm": {
24
+ "init_timeout_seconds": 300,
25
+ "trace_buf_size": 20000,
26
+ "train_timeout_seconds": 100
27
+ },
28
+ "experimental": {
29
+ "context_parallel_degree": 1,
30
+ "context_parallel_rotate_method": "allgather",
31
+ "custom_model_path": "",
32
+ "enable_async_tensor_parallel": false,
33
+ "enable_compiled_autograd": false,
34
+ "pipeline_parallel_degree": 1,
35
+ "pipeline_parallel_microbatches": null,
36
+ "pipeline_parallel_schedule": "1F1B",
37
+ "pipeline_parallel_schedule_csv": "",
38
+ "pipeline_parallel_split_points": []
39
+ },
40
+ "fault_tolerance": {
41
+ "enable": false,
42
+ "group_size": 0,
43
+ "min_replica_size": 1,
44
+ "replica_id": 0
45
+ },
46
+ "float8": {
47
+ "enable_fsdp_float8_all_gather": false,
48
+ "force_recompute_fp8_weight_in_bwd": false,
49
+ "precompute_float8_dynamic_scale_for_fsdp": false,
50
+ "recipe_name": null
51
+ },
52
+ "job": {
53
+ "config_file": "flame/models/fla.toml",
54
+ "description": "default job",
55
+ "dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1",
56
+ "print_args": true,
57
+ "use_for_integration_test": false
58
+ },
59
+ "lr_scheduler": {
60
+ "decay_ratio": 1.0,
61
+ "decay_type": "linear",
62
+ "lr_min": 0.01,
63
+ "warmup_steps": 100
64
+ },
65
+ "memory_estimation": {
66
+ "disable_fake_mode": false,
67
+ "enabled": false
68
+ },
69
+ "metrics": {
70
+ "disable_color_printing": false,
71
+ "enable_tensorboard": true,
72
+ "enable_wandb": true,
73
+ "log_freq": 1,
74
+ "save_for_all_ranks": false,
75
+ "save_tb_folder": "tb"
76
+ },
77
+ "model": {
78
+ "config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/gdn_6_1_340M.json",
79
+ "converters": [],
80
+ "name": "fla",
81
+ "print_after_conversion": false,
82
+ "tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
83
+ },
84
+ "optimizer": {
85
+ "early_step_in_backward": false,
86
+ "eps": 1e-08,
87
+ "implementation": "fused",
88
+ "lr": 0.0003,
89
+ "name": "AdamW"
90
+ },
91
+ "profiling": {
92
+ "enable_memory_snapshot": false,
93
+ "enable_profiling": true,
94
+ "profile_freq": 512,
95
+ "save_memory_snapshot_folder": "memory_snapshot",
96
+ "save_traces_folder": "profile_trace"
97
+ },
98
+ "training": {
99
+ "batch_size": 16,
100
+ "compile": true,
101
+ "context_len": 8192,
102
+ "data_dir": null,
103
+ "data_files": null,
104
+ "data_parallel_replicate_degree": 1,
105
+ "data_parallel_shard_degree": -1,
106
+ "data_probs": "0.55,0.3,0.15",
107
+ "dataset": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/MegaMath/megamath-web-pro",
108
+ "dataset_name": "default,default,default",
109
+ "dataset_split": "train,train,train",
110
+ "deterministic": false,
111
+ "disable_loss_parallel": false,
112
+ "enable_cpu_offload": false,
113
+ "fsdp_reshard_after_forward": "default",
114
+ "gc_freq": 50,
115
+ "gradient_accumulation_steps": 1,
116
+ "max_norm": 1.0,
117
+ "mixed_precision_param": "bfloat16",
118
+ "mixed_precision_reduce": "float32",
119
+ "num_workers": 32,
120
+ "persistent_workers": false,
121
+ "pin_memory": false,
122
+ "prefetch_factor": 2,
123
+ "seed": 42,
124
+ "seq_len": 8192,
125
+ "skip_nan_inf": true,
126
+ "steps": 95366,
127
+ "streaming": true,
128
+ "tensor_parallel_degree": 1,
129
+ "varlen": false
130
+ }
131
+ }
132
+ [titan] 2025-07-22 22:47:41,942 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
133
+ [titan] 2025-07-22 22:47:43,062 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
134
+ [titan] 2025-07-22 22:47:43,064 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
135
+ [titan] 2025-07-22 22:47:43,187 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
136
+ [titan] 2025-07-22 22:47:43,187 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
137
+ [titan] 2025-07-22 22:47:43,187 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
138
+ [titan] 2025-07-22 22:47:43,207 - root - INFO - Loading tokenizer...
139
+ [titan] 2025-07-22 22:47:43,370 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
140
+ 0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
141
+ 1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
142
+ 2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
143
+ }
144
+ )
145
+ [titan] 2025-07-22 22:47:43,371 - root - INFO - Loading dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/MegaMath/megamath-web-pro:default,default,default
146
+ `trust_remote_code` is not supported anymore.
147
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
148
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
149
+ [titan] 2025-07-22 22:47:43,371 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
150
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
151
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
152
+ [titan] 2025-07-22 22:47:43,666 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample:default (p = 0.550):
153
+ IterableDataset({
154
+ features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
155
+ num_shards: 140
156
+ })
157
+ [titan] 2025-07-22 22:47:43,667 - root - INFO - Shuffling the dataset with seed 42
158
+ [titan] 2025-07-22 22:47:43,667 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample has insufficient shards (140). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
159
+ `trust_remote_code` is not supported anymore.
160
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
161
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
162
+ [titan] 2025-07-22 22:47:43,667 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
163
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
164
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
165
+ [rank1]: Traceback (most recent call last):
166
+ [rank1]: File "<frozen runpy>", line 198, in _run_module_as_main
167
+ [rank1]: File "<frozen runpy>", line 88, in _run_code
168
+ [rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
169
+ [rank1]: main(config)
170
+ [rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
171
+ [rank1]: return f(*args, **kwargs)
172
+ [rank1]: ^^^^^^^^^^^^^^^^^^
173
+ [rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 155, in main
174
+ [rank1]: dataset = build_dataset(
175
+ [rank1]: ^^^^^^^^^^^^^^
176
+ [rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py", line 689, in build_dataset
177
+ [rank1]: subset = load_dataset(
178
+ [rank1]: ^^^^^^^^^^^^^
179
+ [rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1412, in load_dataset
180
+ [rank1]: builder_instance.download_and_prepare(
181
+ [rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/builder.py", line 829, in download_and_prepare
182
+ [rank1]: with FileLock(lock_path) if is_local else contextlib.nullcontext():
183
+ [rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/filelock/_api.py", line 376, in __enter__
184
+ [rank1]: self.acquire()
185
+ [rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/filelock/_api.py", line 344, in acquire
186
+ [rank1]: time.sleep(poll_interval)
187
+ [rank1]: KeyboardInterrupt
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/1/stdout.log ADDED
File without changes
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/2/stderr.log ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [titan] 2025-07-22 22:47:42,036 - root - INFO - Starting job: default job
2
+ [titan] 2025-07-22 22:47:42,037 - root - INFO - {
3
+ "activation_checkpoint": {
4
+ "mode": "none",
5
+ "selective_ac_option": "2"
6
+ },
7
+ "activation_offload": {
8
+ "mode": "none"
9
+ },
10
+ "checkpoint": {
11
+ "async_mode": "disabled",
12
+ "create_seed_checkpoint": false,
13
+ "enable_checkpoint": true,
14
+ "exclude_from_loading": [],
15
+ "export_dtype": "float32",
16
+ "folder": "checkpoint",
17
+ "interval": 8192,
18
+ "interval_type": "steps",
19
+ "keep_latest_k": 100,
20
+ "load_step": -1,
21
+ "model_weights_only": false
22
+ },
23
+ "comm": {
24
+ "init_timeout_seconds": 300,
25
+ "trace_buf_size": 20000,
26
+ "train_timeout_seconds": 100
27
+ },
28
+ "experimental": {
29
+ "context_parallel_degree": 1,
30
+ "context_parallel_rotate_method": "allgather",
31
+ "custom_model_path": "",
32
+ "enable_async_tensor_parallel": false,
33
+ "enable_compiled_autograd": false,
34
+ "pipeline_parallel_degree": 1,
35
+ "pipeline_parallel_microbatches": null,
36
+ "pipeline_parallel_schedule": "1F1B",
37
+ "pipeline_parallel_schedule_csv": "",
38
+ "pipeline_parallel_split_points": []
39
+ },
40
+ "fault_tolerance": {
41
+ "enable": false,
42
+ "group_size": 0,
43
+ "min_replica_size": 1,
44
+ "replica_id": 0
45
+ },
46
+ "float8": {
47
+ "enable_fsdp_float8_all_gather": false,
48
+ "force_recompute_fp8_weight_in_bwd": false,
49
+ "precompute_float8_dynamic_scale_for_fsdp": false,
50
+ "recipe_name": null
51
+ },
52
+ "job": {
53
+ "config_file": "flame/models/fla.toml",
54
+ "description": "default job",
55
+ "dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1",
56
+ "print_args": true,
57
+ "use_for_integration_test": false
58
+ },
59
+ "lr_scheduler": {
60
+ "decay_ratio": 1.0,
61
+ "decay_type": "linear",
62
+ "lr_min": 0.01,
63
+ "warmup_steps": 100
64
+ },
65
+ "memory_estimation": {
66
+ "disable_fake_mode": false,
67
+ "enabled": false
68
+ },
69
+ "metrics": {
70
+ "disable_color_printing": false,
71
+ "enable_tensorboard": true,
72
+ "enable_wandb": true,
73
+ "log_freq": 1,
74
+ "save_for_all_ranks": false,
75
+ "save_tb_folder": "tb"
76
+ },
77
+ "model": {
78
+ "config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/gdn_6_1_340M.json",
79
+ "converters": [],
80
+ "name": "fla",
81
+ "print_after_conversion": false,
82
+ "tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
83
+ },
84
+ "optimizer": {
85
+ "early_step_in_backward": false,
86
+ "eps": 1e-08,
87
+ "implementation": "fused",
88
+ "lr": 0.0003,
89
+ "name": "AdamW"
90
+ },
91
+ "profiling": {
92
+ "enable_memory_snapshot": false,
93
+ "enable_profiling": true,
94
+ "profile_freq": 512,
95
+ "save_memory_snapshot_folder": "memory_snapshot",
96
+ "save_traces_folder": "profile_trace"
97
+ },
98
+ "training": {
99
+ "batch_size": 16,
100
+ "compile": true,
101
+ "context_len": 8192,
102
+ "data_dir": null,
103
+ "data_files": null,
104
+ "data_parallel_replicate_degree": 1,
105
+ "data_parallel_shard_degree": -1,
106
+ "data_probs": "0.55,0.3,0.15",
107
+ "dataset": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/MegaMath/megamath-web-pro",
108
+ "dataset_name": "default,default,default",
109
+ "dataset_split": "train,train,train",
110
+ "deterministic": false,
111
+ "disable_loss_parallel": false,
112
+ "enable_cpu_offload": false,
113
+ "fsdp_reshard_after_forward": "default",
114
+ "gc_freq": 50,
115
+ "gradient_accumulation_steps": 1,
116
+ "max_norm": 1.0,
117
+ "mixed_precision_param": "bfloat16",
118
+ "mixed_precision_reduce": "float32",
119
+ "num_workers": 32,
120
+ "persistent_workers": false,
121
+ "pin_memory": false,
122
+ "prefetch_factor": 2,
123
+ "seed": 42,
124
+ "seq_len": 8192,
125
+ "skip_nan_inf": true,
126
+ "steps": 95366,
127
+ "streaming": true,
128
+ "tensor_parallel_degree": 1,
129
+ "varlen": false
130
+ }
131
+ }
132
+ [titan] 2025-07-22 22:47:42,038 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
133
+ [titan] 2025-07-22 22:47:43,075 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
134
+ [titan] 2025-07-22 22:47:43,078 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
135
+ [titan] 2025-07-22 22:47:43,210 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
136
+ [titan] 2025-07-22 22:47:43,210 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
137
+ [titan] 2025-07-22 22:47:43,210 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
138
+ [titan] 2025-07-22 22:47:43,219 - root - INFO - Loading tokenizer...
139
+ [titan] 2025-07-22 22:47:43,387 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
140
+ 0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
141
+ 1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
142
+ 2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
143
+ }
144
+ )
145
+ [titan] 2025-07-22 22:47:43,387 - root - INFO - Loading dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/MegaMath/megamath-web-pro:default,default,default
146
+ `trust_remote_code` is not supported anymore.
147
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
148
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
149
+ [titan] 2025-07-22 22:47:43,387 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
150
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
151
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
152
+ [titan] 2025-07-22 22:47:43,666 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample:default (p = 0.550):
153
+ IterableDataset({
154
+ features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
155
+ num_shards: 140
156
+ })
157
+ [titan] 2025-07-22 22:47:43,667 - root - INFO - Shuffling the dataset with seed 42
158
+ [titan] 2025-07-22 22:47:43,667 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample has insufficient shards (140). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
159
+ `trust_remote_code` is not supported anymore.
160
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
161
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
162
+ [titan] 2025-07-22 22:47:43,668 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
163
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
164
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
165
+ [rank2]: Traceback (most recent call last):
166
+ [rank2]: File "<frozen runpy>", line 198, in _run_module_as_main
167
+ [rank2]: File "<frozen runpy>", line 88, in _run_code
168
+ [rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
169
+ [rank2]: main(config)
170
+ [rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
171
+ [rank2]: return f(*args, **kwargs)
172
+ [rank2]: ^^^^^^^^^^^^^^^^^^
173
+ [rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 155, in main
174
+ [rank2]: dataset = build_dataset(
175
+ [rank2]: ^^^^^^^^^^^^^^
176
+ [rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py", line 689, in build_dataset
177
+ [rank2]: subset = load_dataset(
178
+ [rank2]: ^^^^^^^^^^^^^
179
+ [rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1412, in load_dataset
180
+ [rank2]: builder_instance.download_and_prepare(
181
+ [rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/builder.py", line 829, in download_and_prepare
182
+ [rank2]: with FileLock(lock_path) if is_local else contextlib.nullcontext():
183
+ [rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/filelock/_api.py", line 376, in __enter__
184
+ [rank2]: self.acquire()
185
+ [rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/filelock/_api.py", line 344, in acquire
186
+ [rank2]: time.sleep(poll_interval)
187
+ [rank2]: KeyboardInterrupt
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/2/stdout.log ADDED
File without changes
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/3/stderr.log ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [titan] 2025-07-22 22:47:41,964 - root - INFO - Starting job: default job
2
+ [titan] 2025-07-22 22:47:41,965 - root - INFO - {
3
+ "activation_checkpoint": {
4
+ "mode": "none",
5
+ "selective_ac_option": "2"
6
+ },
7
+ "activation_offload": {
8
+ "mode": "none"
9
+ },
10
+ "checkpoint": {
11
+ "async_mode": "disabled",
12
+ "create_seed_checkpoint": false,
13
+ "enable_checkpoint": true,
14
+ "exclude_from_loading": [],
15
+ "export_dtype": "float32",
16
+ "folder": "checkpoint",
17
+ "interval": 8192,
18
+ "interval_type": "steps",
19
+ "keep_latest_k": 100,
20
+ "load_step": -1,
21
+ "model_weights_only": false
22
+ },
23
+ "comm": {
24
+ "init_timeout_seconds": 300,
25
+ "trace_buf_size": 20000,
26
+ "train_timeout_seconds": 100
27
+ },
28
+ "experimental": {
29
+ "context_parallel_degree": 1,
30
+ "context_parallel_rotate_method": "allgather",
31
+ "custom_model_path": "",
32
+ "enable_async_tensor_parallel": false,
33
+ "enable_compiled_autograd": false,
34
+ "pipeline_parallel_degree": 1,
35
+ "pipeline_parallel_microbatches": null,
36
+ "pipeline_parallel_schedule": "1F1B",
37
+ "pipeline_parallel_schedule_csv": "",
38
+ "pipeline_parallel_split_points": []
39
+ },
40
+ "fault_tolerance": {
41
+ "enable": false,
42
+ "group_size": 0,
43
+ "min_replica_size": 1,
44
+ "replica_id": 0
45
+ },
46
+ "float8": {
47
+ "enable_fsdp_float8_all_gather": false,
48
+ "force_recompute_fp8_weight_in_bwd": false,
49
+ "precompute_float8_dynamic_scale_for_fsdp": false,
50
+ "recipe_name": null
51
+ },
52
+ "job": {
53
+ "config_file": "flame/models/fla.toml",
54
+ "description": "default job",
55
+ "dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1",
56
+ "print_args": true,
57
+ "use_for_integration_test": false
58
+ },
59
+ "lr_scheduler": {
60
+ "decay_ratio": 1.0,
61
+ "decay_type": "linear",
62
+ "lr_min": 0.01,
63
+ "warmup_steps": 100
64
+ },
65
+ "memory_estimation": {
66
+ "disable_fake_mode": false,
67
+ "enabled": false
68
+ },
69
+ "metrics": {
70
+ "disable_color_printing": false,
71
+ "enable_tensorboard": true,
72
+ "enable_wandb": true,
73
+ "log_freq": 1,
74
+ "save_for_all_ranks": false,
75
+ "save_tb_folder": "tb"
76
+ },
77
+ "model": {
78
+ "config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/gdn_6_1_340M.json",
79
+ "converters": [],
80
+ "name": "fla",
81
+ "print_after_conversion": false,
82
+ "tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
83
+ },
84
+ "optimizer": {
85
+ "early_step_in_backward": false,
86
+ "eps": 1e-08,
87
+ "implementation": "fused",
88
+ "lr": 0.0003,
89
+ "name": "AdamW"
90
+ },
91
+ "profiling": {
92
+ "enable_memory_snapshot": false,
93
+ "enable_profiling": true,
94
+ "profile_freq": 512,
95
+ "save_memory_snapshot_folder": "memory_snapshot",
96
+ "save_traces_folder": "profile_trace"
97
+ },
98
+ "training": {
99
+ "batch_size": 16,
100
+ "compile": true,
101
+ "context_len": 8192,
102
+ "data_dir": null,
103
+ "data_files": null,
104
+ "data_parallel_replicate_degree": 1,
105
+ "data_parallel_shard_degree": -1,
106
+ "data_probs": "0.55,0.3,0.15",
107
+ "dataset": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/MegaMath/megamath-web-pro",
108
+ "dataset_name": "default,default,default",
109
+ "dataset_split": "train,train,train",
110
+ "deterministic": false,
111
+ "disable_loss_parallel": false,
112
+ "enable_cpu_offload": false,
113
+ "fsdp_reshard_after_forward": "default",
114
+ "gc_freq": 50,
115
+ "gradient_accumulation_steps": 1,
116
+ "max_norm": 1.0,
117
+ "mixed_precision_param": "bfloat16",
118
+ "mixed_precision_reduce": "float32",
119
+ "num_workers": 32,
120
+ "persistent_workers": false,
121
+ "pin_memory": false,
122
+ "prefetch_factor": 2,
123
+ "seed": 42,
124
+ "seq_len": 8192,
125
+ "skip_nan_inf": true,
126
+ "steps": 95366,
127
+ "streaming": true,
128
+ "tensor_parallel_degree": 1,
129
+ "varlen": false
130
+ }
131
+ }
132
+ [titan] 2025-07-22 22:47:41,966 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
133
+ [titan] 2025-07-22 22:47:43,050 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
134
+ [titan] 2025-07-22 22:47:43,053 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
135
+ [titan] 2025-07-22 22:47:43,165 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
136
+ [titan] 2025-07-22 22:47:43,165 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
137
+ [titan] 2025-07-22 22:47:43,165 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
138
+ [titan] 2025-07-22 22:47:43,192 - root - INFO - Loading tokenizer...
139
+ [titan] 2025-07-22 22:47:43,304 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
140
+ 0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
141
+ 1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
142
+ 2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
143
+ }
144
+ )
145
+ [titan] 2025-07-22 22:47:43,304 - root - INFO - Loading dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/MegaMath/megamath-web-pro:default,default,default
146
+ `trust_remote_code` is not supported anymore.
147
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
148
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
149
+ [titan] 2025-07-22 22:47:43,304 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
150
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
151
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
152
+ [titan] 2025-07-22 22:47:43,666 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample:default (p = 0.550):
153
+ IterableDataset({
154
+ features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
155
+ num_shards: 140
156
+ })
157
+ [titan] 2025-07-22 22:47:43,666 - root - INFO - Shuffling the dataset with seed 42
158
+ [titan] 2025-07-22 22:47:43,667 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample has insufficient shards (140). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
159
+ `trust_remote_code` is not supported anymore.
160
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
161
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
162
+ [titan] 2025-07-22 22:47:43,667 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
163
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
164
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
165
+ [rank3]: Traceback (most recent call last):
166
+ [rank3]: File "<frozen runpy>", line 198, in _run_module_as_main
167
+ [rank3]: File "<frozen runpy>", line 88, in _run_code
168
+ [rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
169
+ [rank3]: main(config)
170
+ [rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
171
+ [rank3]: return f(*args, **kwargs)
172
+ [rank3]: ^^^^^^^^^^^^^^^^^^
173
+ [rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 155, in main
174
+ [rank3]: dataset = build_dataset(
175
+ [rank3]: ^^^^^^^^^^^^^^
176
+ [rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py", line 689, in build_dataset
177
+ [rank3]: subset = load_dataset(
178
+ [rank3]: ^^^^^^^^^^^^^
179
+ [rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1412, in load_dataset
180
+ [rank3]: builder_instance.download_and_prepare(
181
+ [rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/builder.py", line 829, in download_and_prepare
182
+ [rank3]: with FileLock(lock_path) if is_local else contextlib.nullcontext():
183
+ [rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/filelock/_api.py", line 376, in __enter__
184
+ [rank3]: self.acquire()
185
+ [rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/filelock/_api.py", line 344, in acquire
186
+ [rank3]: time.sleep(poll_interval)
187
+ [rank3]: KeyboardInterrupt
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/3/stdout.log ADDED
File without changes
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/4/stderr.log ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [titan] 2025-07-22 22:47:41,988 - root - INFO - Starting job: default job
2
+ [titan] 2025-07-22 22:47:41,988 - root - INFO - {
3
+ "activation_checkpoint": {
4
+ "mode": "none",
5
+ "selective_ac_option": "2"
6
+ },
7
+ "activation_offload": {
8
+ "mode": "none"
9
+ },
10
+ "checkpoint": {
11
+ "async_mode": "disabled",
12
+ "create_seed_checkpoint": false,
13
+ "enable_checkpoint": true,
14
+ "exclude_from_loading": [],
15
+ "export_dtype": "float32",
16
+ "folder": "checkpoint",
17
+ "interval": 8192,
18
+ "interval_type": "steps",
19
+ "keep_latest_k": 100,
20
+ "load_step": -1,
21
+ "model_weights_only": false
22
+ },
23
+ "comm": {
24
+ "init_timeout_seconds": 300,
25
+ "trace_buf_size": 20000,
26
+ "train_timeout_seconds": 100
27
+ },
28
+ "experimental": {
29
+ "context_parallel_degree": 1,
30
+ "context_parallel_rotate_method": "allgather",
31
+ "custom_model_path": "",
32
+ "enable_async_tensor_parallel": false,
33
+ "enable_compiled_autograd": false,
34
+ "pipeline_parallel_degree": 1,
35
+ "pipeline_parallel_microbatches": null,
36
+ "pipeline_parallel_schedule": "1F1B",
37
+ "pipeline_parallel_schedule_csv": "",
38
+ "pipeline_parallel_split_points": []
39
+ },
40
+ "fault_tolerance": {
41
+ "enable": false,
42
+ "group_size": 0,
43
+ "min_replica_size": 1,
44
+ "replica_id": 0
45
+ },
46
+ "float8": {
47
+ "enable_fsdp_float8_all_gather": false,
48
+ "force_recompute_fp8_weight_in_bwd": false,
49
+ "precompute_float8_dynamic_scale_for_fsdp": false,
50
+ "recipe_name": null
51
+ },
52
+ "job": {
53
+ "config_file": "flame/models/fla.toml",
54
+ "description": "default job",
55
+ "dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1",
56
+ "print_args": true,
57
+ "use_for_integration_test": false
58
+ },
59
+ "lr_scheduler": {
60
+ "decay_ratio": 1.0,
61
+ "decay_type": "linear",
62
+ "lr_min": 0.01,
63
+ "warmup_steps": 100
64
+ },
65
+ "memory_estimation": {
66
+ "disable_fake_mode": false,
67
+ "enabled": false
68
+ },
69
+ "metrics": {
70
+ "disable_color_printing": false,
71
+ "enable_tensorboard": true,
72
+ "enable_wandb": true,
73
+ "log_freq": 1,
74
+ "save_for_all_ranks": false,
75
+ "save_tb_folder": "tb"
76
+ },
77
+ "model": {
78
+ "config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/gdn_6_1_340M.json",
79
+ "converters": [],
80
+ "name": "fla",
81
+ "print_after_conversion": false,
82
+ "tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
83
+ },
84
+ "optimizer": {
85
+ "early_step_in_backward": false,
86
+ "eps": 1e-08,
87
+ "implementation": "fused",
88
+ "lr": 0.0003,
89
+ "name": "AdamW"
90
+ },
91
+ "profiling": {
92
+ "enable_memory_snapshot": false,
93
+ "enable_profiling": true,
94
+ "profile_freq": 512,
95
+ "save_memory_snapshot_folder": "memory_snapshot",
96
+ "save_traces_folder": "profile_trace"
97
+ },
98
+ "training": {
99
+ "batch_size": 16,
100
+ "compile": true,
101
+ "context_len": 8192,
102
+ "data_dir": null,
103
+ "data_files": null,
104
+ "data_parallel_replicate_degree": 1,
105
+ "data_parallel_shard_degree": -1,
106
+ "data_probs": "0.55,0.3,0.15",
107
+ "dataset": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/MegaMath/megamath-web-pro",
108
+ "dataset_name": "default,default,default",
109
+ "dataset_split": "train,train,train",
110
+ "deterministic": false,
111
+ "disable_loss_parallel": false,
112
+ "enable_cpu_offload": false,
113
+ "fsdp_reshard_after_forward": "default",
114
+ "gc_freq": 50,
115
+ "gradient_accumulation_steps": 1,
116
+ "max_norm": 1.0,
117
+ "mixed_precision_param": "bfloat16",
118
+ "mixed_precision_reduce": "float32",
119
+ "num_workers": 32,
120
+ "persistent_workers": false,
121
+ "pin_memory": false,
122
+ "prefetch_factor": 2,
123
+ "seed": 42,
124
+ "seq_len": 8192,
125
+ "skip_nan_inf": true,
126
+ "steps": 95366,
127
+ "streaming": true,
128
+ "tensor_parallel_degree": 1,
129
+ "varlen": false
130
+ }
131
+ }
132
+ [titan] 2025-07-22 22:47:41,990 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
133
+ [titan] 2025-07-22 22:47:43,095 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
134
+ [titan] 2025-07-22 22:47:43,097 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
135
+ [titan] 2025-07-22 22:47:43,213 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
136
+ [titan] 2025-07-22 22:47:43,213 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
137
+ [titan] 2025-07-22 22:47:43,214 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
138
+ [titan] 2025-07-22 22:47:43,222 - root - INFO - Loading tokenizer...
139
+ [titan] 2025-07-22 22:47:43,405 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
140
+ 0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
141
+ 1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
142
+ 2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
143
+ }
144
+ )
145
+ [titan] 2025-07-22 22:47:43,405 - root - INFO - Loading dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/MegaMath/megamath-web-pro:default,default,default
146
+ `trust_remote_code` is not supported anymore.
147
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
148
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
149
+ [titan] 2025-07-22 22:47:43,405 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
150
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
151
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
152
+ [titan] 2025-07-22 22:47:43,666 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample:default (p = 0.550):
153
+ IterableDataset({
154
+ features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
155
+ num_shards: 140
156
+ })
157
+ [titan] 2025-07-22 22:47:43,667 - root - INFO - Shuffling the dataset with seed 42
158
+ [titan] 2025-07-22 22:47:43,667 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample has insufficient shards (140). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
159
+ `trust_remote_code` is not supported anymore.
160
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
161
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
162
+ [titan] 2025-07-22 22:47:43,667 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
163
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
164
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
165
+ [rank4]: Traceback (most recent call last):
166
+ [rank4]: File "<frozen runpy>", line 198, in _run_module_as_main
167
+ [rank4]: File "<frozen runpy>", line 88, in _run_code
168
+ [rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
169
+ [rank4]: main(config)
170
+ [rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
171
+ [rank4]: return f(*args, **kwargs)
172
+ [rank4]: ^^^^^^^^^^^^^^^^^^
173
+ [rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 155, in main
174
+ [rank4]: dataset = build_dataset(
175
+ [rank4]: ^^^^^^^^^^^^^^
176
+ [rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py", line 689, in build_dataset
177
+ [rank4]: subset = load_dataset(
178
+ [rank4]: ^^^^^^^^^^^^^
179
+ [rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1412, in load_dataset
180
+ [rank4]: builder_instance.download_and_prepare(
181
+ [rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/builder.py", line 829, in download_and_prepare
182
+ [rank4]: with FileLock(lock_path) if is_local else contextlib.nullcontext():
183
+ [rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/filelock/_api.py", line 376, in __enter__
184
+ [rank4]: self.acquire()
185
+ [rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/filelock/_api.py", line 344, in acquire
186
+ [rank4]: time.sleep(poll_interval)
187
+ [rank4]: KeyboardInterrupt
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/4/stdout.log ADDED
File without changes
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/5/stderr.log ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [titan] 2025-07-22 22:47:41,984 - root - INFO - Starting job: default job
2
+ [titan] 2025-07-22 22:47:41,984 - root - INFO - {
3
+ "activation_checkpoint": {
4
+ "mode": "none",
5
+ "selective_ac_option": "2"
6
+ },
7
+ "activation_offload": {
8
+ "mode": "none"
9
+ },
10
+ "checkpoint": {
11
+ "async_mode": "disabled",
12
+ "create_seed_checkpoint": false,
13
+ "enable_checkpoint": true,
14
+ "exclude_from_loading": [],
15
+ "export_dtype": "float32",
16
+ "folder": "checkpoint",
17
+ "interval": 8192,
18
+ "interval_type": "steps",
19
+ "keep_latest_k": 100,
20
+ "load_step": -1,
21
+ "model_weights_only": false
22
+ },
23
+ "comm": {
24
+ "init_timeout_seconds": 300,
25
+ "trace_buf_size": 20000,
26
+ "train_timeout_seconds": 100
27
+ },
28
+ "experimental": {
29
+ "context_parallel_degree": 1,
30
+ "context_parallel_rotate_method": "allgather",
31
+ "custom_model_path": "",
32
+ "enable_async_tensor_parallel": false,
33
+ "enable_compiled_autograd": false,
34
+ "pipeline_parallel_degree": 1,
35
+ "pipeline_parallel_microbatches": null,
36
+ "pipeline_parallel_schedule": "1F1B",
37
+ "pipeline_parallel_schedule_csv": "",
38
+ "pipeline_parallel_split_points": []
39
+ },
40
+ "fault_tolerance": {
41
+ "enable": false,
42
+ "group_size": 0,
43
+ "min_replica_size": 1,
44
+ "replica_id": 0
45
+ },
46
+ "float8": {
47
+ "enable_fsdp_float8_all_gather": false,
48
+ "force_recompute_fp8_weight_in_bwd": false,
49
+ "precompute_float8_dynamic_scale_for_fsdp": false,
50
+ "recipe_name": null
51
+ },
52
+ "job": {
53
+ "config_file": "flame/models/fla.toml",
54
+ "description": "default job",
55
+ "dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1",
56
+ "print_args": true,
57
+ "use_for_integration_test": false
58
+ },
59
+ "lr_scheduler": {
60
+ "decay_ratio": 1.0,
61
+ "decay_type": "linear",
62
+ "lr_min": 0.01,
63
+ "warmup_steps": 100
64
+ },
65
+ "memory_estimation": {
66
+ "disable_fake_mode": false,
67
+ "enabled": false
68
+ },
69
+ "metrics": {
70
+ "disable_color_printing": false,
71
+ "enable_tensorboard": true,
72
+ "enable_wandb": true,
73
+ "log_freq": 1,
74
+ "save_for_all_ranks": false,
75
+ "save_tb_folder": "tb"
76
+ },
77
+ "model": {
78
+ "config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/gdn_6_1_340M.json",
79
+ "converters": [],
80
+ "name": "fla",
81
+ "print_after_conversion": false,
82
+ "tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
83
+ },
84
+ "optimizer": {
85
+ "early_step_in_backward": false,
86
+ "eps": 1e-08,
87
+ "implementation": "fused",
88
+ "lr": 0.0003,
89
+ "name": "AdamW"
90
+ },
91
+ "profiling": {
92
+ "enable_memory_snapshot": false,
93
+ "enable_profiling": true,
94
+ "profile_freq": 512,
95
+ "save_memory_snapshot_folder": "memory_snapshot",
96
+ "save_traces_folder": "profile_trace"
97
+ },
98
+ "training": {
99
+ "batch_size": 16,
100
+ "compile": true,
101
+ "context_len": 8192,
102
+ "data_dir": null,
103
+ "data_files": null,
104
+ "data_parallel_replicate_degree": 1,
105
+ "data_parallel_shard_degree": -1,
106
+ "data_probs": "0.55,0.3,0.15",
107
+ "dataset": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/MegaMath/megamath-web-pro",
108
+ "dataset_name": "default,default,default",
109
+ "dataset_split": "train,train,train",
110
+ "deterministic": false,
111
+ "disable_loss_parallel": false,
112
+ "enable_cpu_offload": false,
113
+ "fsdp_reshard_after_forward": "default",
114
+ "gc_freq": 50,
115
+ "gradient_accumulation_steps": 1,
116
+ "max_norm": 1.0,
117
+ "mixed_precision_param": "bfloat16",
118
+ "mixed_precision_reduce": "float32",
119
+ "num_workers": 32,
120
+ "persistent_workers": false,
121
+ "pin_memory": false,
122
+ "prefetch_factor": 2,
123
+ "seed": 42,
124
+ "seq_len": 8192,
125
+ "skip_nan_inf": true,
126
+ "steps": 95366,
127
+ "streaming": true,
128
+ "tensor_parallel_degree": 1,
129
+ "varlen": false
130
+ }
131
+ }
132
+ [titan] 2025-07-22 22:47:41,986 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
133
+ [titan] 2025-07-22 22:47:43,062 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
134
+ [titan] 2025-07-22 22:47:43,064 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
135
+ [titan] 2025-07-22 22:47:43,202 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
136
+ [titan] 2025-07-22 22:47:43,202 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
137
+ [titan] 2025-07-22 22:47:43,202 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
138
+ [titan] 2025-07-22 22:47:43,209 - root - INFO - Loading tokenizer...
139
+ [titan] 2025-07-22 22:47:43,394 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
140
+ 0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
141
+ 1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
142
+ 2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
143
+ }
144
+ )
145
+ [titan] 2025-07-22 22:47:43,395 - root - INFO - Loading dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/MegaMath/megamath-web-pro:default,default,default
146
+ `trust_remote_code` is not supported anymore.
147
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
148
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
149
+ [titan] 2025-07-22 22:47:43,395 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
150
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
151
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
152
+ [titan] 2025-07-22 22:47:43,666 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample:default (p = 0.550):
153
+ IterableDataset({
154
+ features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
155
+ num_shards: 140
156
+ })
157
+ [titan] 2025-07-22 22:47:43,666 - root - INFO - Shuffling the dataset with seed 42
158
+ [titan] 2025-07-22 22:47:43,667 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample has insufficient shards (140). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
159
+ `trust_remote_code` is not supported anymore.
160
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
161
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
162
+ [titan] 2025-07-22 22:47:43,667 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
163
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
164
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
165
+ [rank5]: Traceback (most recent call last):
166
+ [rank5]: File "<frozen runpy>", line 198, in _run_module_as_main
167
+ [rank5]: File "<frozen runpy>", line 88, in _run_code
168
+ [rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
169
+ [rank5]: main(config)
170
+ [rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
171
+ [rank5]: return f(*args, **kwargs)
172
+ [rank5]: ^^^^^^^^^^^^^^^^^^
173
+ [rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 155, in main
174
+ [rank5]: dataset = build_dataset(
175
+ [rank5]: ^^^^^^^^^^^^^^
176
+ [rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py", line 689, in build_dataset
177
+ [rank5]: subset = load_dataset(
178
+ [rank5]: ^^^^^^^^^^^^^
179
+ [rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1412, in load_dataset
180
+ [rank5]: builder_instance.download_and_prepare(
181
+ [rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/builder.py", line 829, in download_and_prepare
182
+ [rank5]: with FileLock(lock_path) if is_local else contextlib.nullcontext():
183
+ [rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/filelock/_api.py", line 376, in __enter__
184
+ [rank5]: self.acquire()
185
+ [rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/filelock/_api.py", line 344, in acquire
186
+ [rank5]: time.sleep(poll_interval)
187
+ [rank5]: KeyboardInterrupt
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/5/stdout.log ADDED
File without changes
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/6/stderr.log ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [titan] 2025-07-22 22:47:41,727 - root - INFO - Starting job: default job
2
+ [titan] 2025-07-22 22:47:41,727 - root - INFO - {
3
+ "activation_checkpoint": {
4
+ "mode": "none",
5
+ "selective_ac_option": "2"
6
+ },
7
+ "activation_offload": {
8
+ "mode": "none"
9
+ },
10
+ "checkpoint": {
11
+ "async_mode": "disabled",
12
+ "create_seed_checkpoint": false,
13
+ "enable_checkpoint": true,
14
+ "exclude_from_loading": [],
15
+ "export_dtype": "float32",
16
+ "folder": "checkpoint",
17
+ "interval": 8192,
18
+ "interval_type": "steps",
19
+ "keep_latest_k": 100,
20
+ "load_step": -1,
21
+ "model_weights_only": false
22
+ },
23
+ "comm": {
24
+ "init_timeout_seconds": 300,
25
+ "trace_buf_size": 20000,
26
+ "train_timeout_seconds": 100
27
+ },
28
+ "experimental": {
29
+ "context_parallel_degree": 1,
30
+ "context_parallel_rotate_method": "allgather",
31
+ "custom_model_path": "",
32
+ "enable_async_tensor_parallel": false,
33
+ "enable_compiled_autograd": false,
34
+ "pipeline_parallel_degree": 1,
35
+ "pipeline_parallel_microbatches": null,
36
+ "pipeline_parallel_schedule": "1F1B",
37
+ "pipeline_parallel_schedule_csv": "",
38
+ "pipeline_parallel_split_points": []
39
+ },
40
+ "fault_tolerance": {
41
+ "enable": false,
42
+ "group_size": 0,
43
+ "min_replica_size": 1,
44
+ "replica_id": 0
45
+ },
46
+ "float8": {
47
+ "enable_fsdp_float8_all_gather": false,
48
+ "force_recompute_fp8_weight_in_bwd": false,
49
+ "precompute_float8_dynamic_scale_for_fsdp": false,
50
+ "recipe_name": null
51
+ },
52
+ "job": {
53
+ "config_file": "flame/models/fla.toml",
54
+ "description": "default job",
55
+ "dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1",
56
+ "print_args": true,
57
+ "use_for_integration_test": false
58
+ },
59
+ "lr_scheduler": {
60
+ "decay_ratio": 1.0,
61
+ "decay_type": "linear",
62
+ "lr_min": 0.01,
63
+ "warmup_steps": 100
64
+ },
65
+ "memory_estimation": {
66
+ "disable_fake_mode": false,
67
+ "enabled": false
68
+ },
69
+ "metrics": {
70
+ "disable_color_printing": false,
71
+ "enable_tensorboard": true,
72
+ "enable_wandb": true,
73
+ "log_freq": 1,
74
+ "save_for_all_ranks": false,
75
+ "save_tb_folder": "tb"
76
+ },
77
+ "model": {
78
+ "config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/gdn_6_1_340M.json",
79
+ "converters": [],
80
+ "name": "fla",
81
+ "print_after_conversion": false,
82
+ "tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
83
+ },
84
+ "optimizer": {
85
+ "early_step_in_backward": false,
86
+ "eps": 1e-08,
87
+ "implementation": "fused",
88
+ "lr": 0.0003,
89
+ "name": "AdamW"
90
+ },
91
+ "profiling": {
92
+ "enable_memory_snapshot": false,
93
+ "enable_profiling": true,
94
+ "profile_freq": 512,
95
+ "save_memory_snapshot_folder": "memory_snapshot",
96
+ "save_traces_folder": "profile_trace"
97
+ },
98
+ "training": {
99
+ "batch_size": 16,
100
+ "compile": true,
101
+ "context_len": 8192,
102
+ "data_dir": null,
103
+ "data_files": null,
104
+ "data_parallel_replicate_degree": 1,
105
+ "data_parallel_shard_degree": -1,
106
+ "data_probs": "0.55,0.3,0.15",
107
+ "dataset": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/MegaMath/megamath-web-pro",
108
+ "dataset_name": "default,default,default",
109
+ "dataset_split": "train,train,train",
110
+ "deterministic": false,
111
+ "disable_loss_parallel": false,
112
+ "enable_cpu_offload": false,
113
+ "fsdp_reshard_after_forward": "default",
114
+ "gc_freq": 50,
115
+ "gradient_accumulation_steps": 1,
116
+ "max_norm": 1.0,
117
+ "mixed_precision_param": "bfloat16",
118
+ "mixed_precision_reduce": "float32",
119
+ "num_workers": 32,
120
+ "persistent_workers": false,
121
+ "pin_memory": false,
122
+ "prefetch_factor": 2,
123
+ "seed": 42,
124
+ "seq_len": 8192,
125
+ "skip_nan_inf": true,
126
+ "steps": 95366,
127
+ "streaming": true,
128
+ "tensor_parallel_degree": 1,
129
+ "varlen": false
130
+ }
131
+ }
132
+ [titan] 2025-07-22 22:47:41,729 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
133
+ [titan] 2025-07-22 22:47:42,344 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
134
+ [titan] 2025-07-22 22:47:42,347 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
135
+ [titan] 2025-07-22 22:47:42,391 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
136
+ [titan] 2025-07-22 22:47:42,391 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
137
+ [titan] 2025-07-22 22:47:42,391 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
138
+ [titan] 2025-07-22 22:47:42,708 - root - INFO - Loading tokenizer...
139
+ [titan] 2025-07-22 22:47:43,145 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
140
+ 0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
141
+ 1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
142
+ 2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
143
+ }
144
+ )
145
+ [titan] 2025-07-22 22:47:43,145 - root - INFO - Loading dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/MegaMath/megamath-web-pro:default,default,default
146
+ `trust_remote_code` is not supported anymore.
147
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
148
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
149
+ [titan] 2025-07-22 22:47:43,146 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
150
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
151
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
152
+ [titan] 2025-07-22 22:47:43,667 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample:default (p = 0.550):
153
+ IterableDataset({
154
+ features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
155
+ num_shards: 140
156
+ })
157
+ [titan] 2025-07-22 22:47:43,667 - root - INFO - Shuffling the dataset with seed 42
158
+ [titan] 2025-07-22 22:47:43,667 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample has insufficient shards (140). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
159
+ `trust_remote_code` is not supported anymore.
160
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
161
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
162
+ [titan] 2025-07-22 22:47:43,668 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
163
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
164
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
165
+ [rank6]: Traceback (most recent call last):
166
+ [rank6]: File "<frozen runpy>", line 198, in _run_module_as_main
167
+ [rank6]: File "<frozen runpy>", line 88, in _run_code
168
+ [rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
169
+ [rank6]: main(config)
170
+ [rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
171
+ [rank6]: return f(*args, **kwargs)
172
+ [rank6]: ^^^^^^^^^^^^^^^^^^
173
+ [rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 155, in main
174
+ [rank6]: dataset = build_dataset(
175
+ [rank6]: ^^^^^^^^^^^^^^
176
+ [rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py", line 689, in build_dataset
177
+ [rank6]: subset = load_dataset(
178
+ [rank6]: ^^^^^^^^^^^^^
179
+ [rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1412, in load_dataset
180
+ [rank6]: builder_instance.download_and_prepare(
181
+ [rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/builder.py", line 829, in download_and_prepare
182
+ [rank6]: with FileLock(lock_path) if is_local else contextlib.nullcontext():
183
+ [rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/filelock/_api.py", line 376, in __enter__
184
+ [rank6]: self.acquire()
185
+ [rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/filelock/_api.py", line 344, in acquire
186
+ [rank6]: time.sleep(poll_interval)
187
+ [rank6]: KeyboardInterrupt
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/6/stdout.log ADDED
File without changes