Upload folder using huggingface_hub
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/8k-100.sh +65 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/config.json +53 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/delta_net_1B.json +29 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/delta_net_340M.json +26 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gated_deltanet_1B.json +22 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gated_deltanet_340M.json +22 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gdn_6_1_340M.json +50 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gla_340M.json +24 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gla_7B.json +25 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gsa_340M.json +29 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/hgrn2_340M.json +20 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/mamba2_1B.json +32 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/mamba2_340M.json +32 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/mamba_1B.json +30 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/mamba_340M.json +30 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/samba_1B.json +52 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/sba_340m.json +18 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/transformer_1B.json +22 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/transformer_340M.json +18 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/transformer_7B.json +21 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/0/stderr.log +6 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/0/stdout.log +0 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/1/stderr.log +6 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/1/stdout.log +0 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/2/stderr.log +6 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/2/stdout.log +0 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/3/stderr.log +6 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/3/stdout.log +0 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/4/stderr.log +6 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/4/stdout.log +0 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/5/stderr.log +6 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/5/stdout.log +0 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/6/stderr.log +6 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/6/stdout.log +0 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/7/stderr.log +6 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/7/stdout.log +0 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/0/stderr.log +0 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/0/stdout.log +0 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/1/stderr.log +187 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/1/stdout.log +0 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/2/stderr.log +187 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/2/stdout.log +0 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/3/stderr.log +187 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/3/stdout.log +0 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/4/stderr.log +187 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/4/stdout.log +0 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/5/stderr.log +187 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/5/stdout.log +0 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/6/stderr.log +187 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/6/stdout.log +0 -0
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/8k-100.sh
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FLAME_PATH=/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame
|
| 2 |
+
DATASET_ROOT=/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset
|
| 3 |
+
TOKENIZER=/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer
|
| 4 |
+
|
| 5 |
+
cd $FLAME_PATH
|
| 6 |
+
source .venv/bin/activate
|
| 7 |
+
|
| 8 |
+
# =========== train config ===========
|
| 9 |
+
CONFIG=${1:-transformer_340M.json}
|
| 10 |
+
SEQ_LEN=8192
|
| 11 |
+
WARMUP_STEPS=100
|
| 12 |
+
STEPS=95366
|
| 13 |
+
LR=3e-4
|
| 14 |
+
BATCH_SIZE=16
|
| 15 |
+
DECAY_TYPE=linear
|
| 16 |
+
DECAY_RATIO=1
|
| 17 |
+
|
| 18 |
+
NNODE=1
|
| 19 |
+
NGPU=8
|
| 20 |
+
LOG_RANK=0
|
| 21 |
+
# ====================================
|
| 22 |
+
|
| 23 |
+
# if jq command is not found, install it
|
| 24 |
+
if ! command -v jq &> /dev/null; then
|
| 25 |
+
echo "jq could not be found, installing it..."
|
| 26 |
+
sudo yum install -y jq
|
| 27 |
+
fi
|
| 28 |
+
|
| 29 |
+
EXP_NAME=$(basename $CONFIG | sed 's/\.config//')-ctx${SEQ_LEN}-steps${STEPS}-lr${LR}-decay_type${DECAY_TYPE}-decay_ratio${DECAY_RATIO}-bs${BATCH_SIZE}-nn${NNODE}
|
| 30 |
+
|
| 31 |
+
bash train.sh \
|
| 32 |
+
--job.config_file flame/models/fla.toml \
|
| 33 |
+
--job.dump_folder $FLAME_PATH/exp/$EXP_NAME \
|
| 34 |
+
--model.config $FLAME_PATH/configs/$CONFIG \
|
| 35 |
+
--model.tokenizer_path $TOKENIZER \
|
| 36 |
+
--optimizer.name AdamW \
|
| 37 |
+
--optimizer.eps 1e-8 \
|
| 38 |
+
--optimizer.lr $LR \
|
| 39 |
+
--lr_scheduler.warmup_steps $WARMUP_STEPS \
|
| 40 |
+
--lr_scheduler.lr_min 0.01 \
|
| 41 |
+
--lr_scheduler.decay_type $DECAY_TYPE \
|
| 42 |
+
--lr_scheduler.decay_ratio $DECAY_RATIO \
|
| 43 |
+
--training.batch_size $BATCH_SIZE \
|
| 44 |
+
--training.seq_len $SEQ_LEN \
|
| 45 |
+
--training.context_len $SEQ_LEN \
|
| 46 |
+
--training.gradient_accumulation_steps 1 \
|
| 47 |
+
--training.steps $STEPS \
|
| 48 |
+
--training.max_norm 1.0 \
|
| 49 |
+
--training.skip_nan_inf \
|
| 50 |
+
--training.dataset $DATASET_ROOT/fineweb-edu-sample,$DATASET_ROOT/small_repos_20B_sample_merged,$DATASET_ROOT/megamath-web-pro \
|
| 51 |
+
--training.data_probs 0.55,0.3,0.15 \
|
| 52 |
+
--training.dataset_split train,train,train \
|
| 53 |
+
--training.dataset_name default,default,default \
|
| 54 |
+
--training.streaming \
|
| 55 |
+
--training.num_workers 32 \
|
| 56 |
+
--training.prefetch_factor 2 \
|
| 57 |
+
--training.seed 42 \
|
| 58 |
+
--training.compile \
|
| 59 |
+
--checkpoint.interval 8192 \
|
| 60 |
+
--checkpoint.load_step -1 \
|
| 61 |
+
--checkpoint.keep_latest_k 100 \
|
| 62 |
+
--metrics.log_freq 1 \
|
| 63 |
+
--metrics.enable_tensorboard \
|
| 64 |
+
--training.streaming
|
| 65 |
+
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/config.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"allow_neg_eigval": false,
|
| 3 |
+
"architectures": [
|
| 4 |
+
"GatedDeltaNetForCausalLM"
|
| 5 |
+
],
|
| 6 |
+
"attn": {
|
| 7 |
+
"layers": [
|
| 8 |
+
5,
|
| 9 |
+
11,
|
| 10 |
+
17,
|
| 11 |
+
23
|
| 12 |
+
],
|
| 13 |
+
"num_heads": 16,
|
| 14 |
+
"num_kv_heads": 8,
|
| 15 |
+
"qkv_bias": false,
|
| 16 |
+
"rope_theta": 160000.0,
|
| 17 |
+
"window_size": null
|
| 18 |
+
},
|
| 19 |
+
"attn_mode": "chunk",
|
| 20 |
+
"bos_token_id": 1,
|
| 21 |
+
"conv_size": 4,
|
| 22 |
+
"eos_token_id": 2,
|
| 23 |
+
"expand_k": 1,
|
| 24 |
+
"expand_v": 1,
|
| 25 |
+
"fuse_cross_entropy": true,
|
| 26 |
+
"fuse_norm": true,
|
| 27 |
+
"fuse_swiglu": true,
|
| 28 |
+
"head_dim": 256,
|
| 29 |
+
"hidden_act": "swish",
|
| 30 |
+
"hidden_ratio": 4,
|
| 31 |
+
"hidden_size": 1024,
|
| 32 |
+
"initializer_range": 0.02,
|
| 33 |
+
"intermediate_size": null,
|
| 34 |
+
"max_position_embeddings": 8192,
|
| 35 |
+
"model_type": "gated_deltanet",
|
| 36 |
+
"norm_eps": 1e-06,
|
| 37 |
+
"norm_first": false,
|
| 38 |
+
"num_heads": 4,
|
| 39 |
+
"num_hidden_layers": 24,
|
| 40 |
+
"num_v_heads": null,
|
| 41 |
+
"qk_activation": "silu",
|
| 42 |
+
"qk_norm": "l2",
|
| 43 |
+
"tie_word_embeddings": false,
|
| 44 |
+
"torch_dtype": "float32",
|
| 45 |
+
"transformers_version": "4.53.3",
|
| 46 |
+
"use_beta": true,
|
| 47 |
+
"use_cache": true,
|
| 48 |
+
"use_gate": true,
|
| 49 |
+
"use_l2warp": false,
|
| 50 |
+
"use_output_norm": true,
|
| 51 |
+
"use_short_conv": true,
|
| 52 |
+
"vocab_size": 32000
|
| 53 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/delta_net_1B.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attn": null,
|
| 3 |
+
"attn_mode": "chunk",
|
| 4 |
+
"bos_token_id": 1,
|
| 5 |
+
"conv_size": 4,
|
| 6 |
+
"eos_token_id": 2,
|
| 7 |
+
"expand_k": 1,
|
| 8 |
+
"expand_v": 1,
|
| 9 |
+
"fuse_cross_entropy": true,
|
| 10 |
+
"fuse_norm": true,
|
| 11 |
+
"hidden_act": "swish",
|
| 12 |
+
"hidden_ratio": 4,
|
| 13 |
+
"hidden_size": 2048,
|
| 14 |
+
"initializer_range": 0.02,
|
| 15 |
+
"intermediate_size": null,
|
| 16 |
+
"model_type": "delta_net",
|
| 17 |
+
"norm_eps": 1e-06,
|
| 18 |
+
"num_heads": 16,
|
| 19 |
+
"num_hidden_layers": 24,
|
| 20 |
+
"pad_token_id": 2,
|
| 21 |
+
"qk_activation": "silu",
|
| 22 |
+
"qk_norm": "l2",
|
| 23 |
+
"tie_word_embeddings": false,
|
| 24 |
+
"use_beta": true,
|
| 25 |
+
"use_cache": true,
|
| 26 |
+
"use_gate": false,
|
| 27 |
+
"use_output_norm": true,
|
| 28 |
+
"use_short_conv": true
|
| 29 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/delta_net_340M.json
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attn_mode": "chunk",
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"conv_size": 4,
|
| 5 |
+
"eos_token_id": 2,
|
| 6 |
+
"expand_k": 1,
|
| 7 |
+
"expand_v": 1,
|
| 8 |
+
"fuse_cross_entropy": true,
|
| 9 |
+
"hidden_act": "swish",
|
| 10 |
+
"hidden_ratio": 4,
|
| 11 |
+
"hidden_size": 1024,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"intermediate_size": null,
|
| 14 |
+
"model_type": "delta_net",
|
| 15 |
+
"norm_eps": 1e-06,
|
| 16 |
+
"num_heads": 8,
|
| 17 |
+
"num_hidden_layers": 24,
|
| 18 |
+
"qk_activation": "silu",
|
| 19 |
+
"qk_norm": "l2",
|
| 20 |
+
"tie_word_embeddings": false,
|
| 21 |
+
"use_beta": true,
|
| 22 |
+
"use_cache": true,
|
| 23 |
+
"use_gate": false,
|
| 24 |
+
"use_output_norm": true,
|
| 25 |
+
"use_short_conv": true
|
| 26 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gated_deltanet_1B.json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attn_mode": "chunk",
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"conv_size": 4,
|
| 5 |
+
"eos_token_id": 2,
|
| 6 |
+
"expand_v": 2,
|
| 7 |
+
"fuse_cross_entropy": true,
|
| 8 |
+
"head_dim": 256,
|
| 9 |
+
"hidden_act": "swish",
|
| 10 |
+
"hidden_ratio": 4,
|
| 11 |
+
"hidden_size": 2048,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"intermediate_size": null,
|
| 14 |
+
"model_type": "gated_deltanet",
|
| 15 |
+
"norm_eps": 1e-06,
|
| 16 |
+
"num_heads": 6,
|
| 17 |
+
"num_hidden_layers": 21,
|
| 18 |
+
"tie_word_embeddings": false,
|
| 19 |
+
"use_cache": true,
|
| 20 |
+
"use_gate": true,
|
| 21 |
+
"use_short_conv": true
|
| 22 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gated_deltanet_340M.json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attn_mode": "chunk",
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"conv_size": 4,
|
| 5 |
+
"eos_token_id": 2,
|
| 6 |
+
"expand_v": 2,
|
| 7 |
+
"fuse_cross_entropy": true,
|
| 8 |
+
"head_dim": 256,
|
| 9 |
+
"hidden_act": "swish",
|
| 10 |
+
"hidden_ratio": 4,
|
| 11 |
+
"hidden_size": 1024,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"intermediate_size": null,
|
| 14 |
+
"model_type": "gated_deltanet",
|
| 15 |
+
"norm_eps": 1e-06,
|
| 16 |
+
"num_heads": 6,
|
| 17 |
+
"num_hidden_layers": 21,
|
| 18 |
+
"tie_word_embeddings": false,
|
| 19 |
+
"use_cache": true,
|
| 20 |
+
"use_gate": true,
|
| 21 |
+
"use_short_conv": true
|
| 22 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gdn_6_1_340M.json
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"GatedDeltaNetForCausalLM"
|
| 4 |
+
],
|
| 5 |
+
"attn": {
|
| 6 |
+
"layers": [
|
| 7 |
+
5,
|
| 8 |
+
11,
|
| 9 |
+
17,
|
| 10 |
+
23
|
| 11 |
+
],
|
| 12 |
+
"num_heads": 16,
|
| 13 |
+
"num_kv_heads": 8,
|
| 14 |
+
"qkv_bias": false,
|
| 15 |
+
"rope_theta": 160000.0,
|
| 16 |
+
"window_size": null
|
| 17 |
+
},
|
| 18 |
+
"attn_mode": "chunk",
|
| 19 |
+
"bos_token_id": 1,
|
| 20 |
+
"conv_size": 4,
|
| 21 |
+
"eos_token_id": 2,
|
| 22 |
+
"expand_k": 1,
|
| 23 |
+
"expand_v": 1,
|
| 24 |
+
"fuse_cross_entropy": true,
|
| 25 |
+
"fuse_norm": true,
|
| 26 |
+
"fuse_swiglu": true,
|
| 27 |
+
"head_dim": 256,
|
| 28 |
+
"hidden_act": "swish",
|
| 29 |
+
"hidden_ratio": 4,
|
| 30 |
+
"hidden_size": 1024,
|
| 31 |
+
"initializer_range": 0.02,
|
| 32 |
+
"intermediate_size": null,
|
| 33 |
+
"max_position_embeddings": 8192,
|
| 34 |
+
"model_type": "gated_deltanet",
|
| 35 |
+
"norm_eps": 1e-06,
|
| 36 |
+
"norm_first": false,
|
| 37 |
+
"num_heads": 4,
|
| 38 |
+
"num_hidden_layers": 24,
|
| 39 |
+
"qk_activation": "silu",
|
| 40 |
+
"qk_norm": "l2",
|
| 41 |
+
"tie_word_embeddings": false,
|
| 42 |
+
"torch_dtype": "float32",
|
| 43 |
+
"transformers_version": "4.51.3",
|
| 44 |
+
"use_beta": true,
|
| 45 |
+
"use_cache": true,
|
| 46 |
+
"use_gate": true,
|
| 47 |
+
"use_output_norm": true,
|
| 48 |
+
"use_short_conv": true,
|
| 49 |
+
"vocab_size": 32000
|
| 50 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gla_340M.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attn_mode": "chunk",
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"clamp_min": null,
|
| 5 |
+
"eos_token_id": 2,
|
| 6 |
+
"expand_k": 0.5,
|
| 7 |
+
"expand_v": 1,
|
| 8 |
+
"fuse_cross_entropy": true,
|
| 9 |
+
"fuse_norm": true,
|
| 10 |
+
"hidden_act": "swish",
|
| 11 |
+
"hidden_ratio": 4,
|
| 12 |
+
"hidden_size": 1024,
|
| 13 |
+
"initializer_range": 0.02,
|
| 14 |
+
"intermediate_size": null,
|
| 15 |
+
"model_type": "gla",
|
| 16 |
+
"num_heads": 4,
|
| 17 |
+
"num_hidden_layers": 24,
|
| 18 |
+
"norm_eps": 1e-06,
|
| 19 |
+
"tie_word_embeddings": false,
|
| 20 |
+
"use_cache": true,
|
| 21 |
+
"use_gk": true,
|
| 22 |
+
"use_gv": false,
|
| 23 |
+
"vocab_size": 32000
|
| 24 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gla_7B.json
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attn": null,
|
| 3 |
+
"attn_mode": "chunk",
|
| 4 |
+
"bos_token_id": 1,
|
| 5 |
+
"eos_token_id": 2,
|
| 6 |
+
"expand_k": 0.5,
|
| 7 |
+
"expand_v": 1,
|
| 8 |
+
"fuse_cross_entropy": true,
|
| 9 |
+
"fuse_norm": true,
|
| 10 |
+
"hidden_act": "swish",
|
| 11 |
+
"hidden_ratio": 4,
|
| 12 |
+
"hidden_size": 4096,
|
| 13 |
+
"initializer_range": 0.02,
|
| 14 |
+
"intermediate_size": 11008,
|
| 15 |
+
"model_type": "gla",
|
| 16 |
+
"norm_eps": 1e-06,
|
| 17 |
+
"num_heads": 16,
|
| 18 |
+
"num_hidden_layers": 32,
|
| 19 |
+
"tie_word_embeddings": false,
|
| 20 |
+
"use_cache": true,
|
| 21 |
+
"use_gk": true,
|
| 22 |
+
"use_gv": false,
|
| 23 |
+
"use_output_gate": true,
|
| 24 |
+
"use_short_conv": false
|
| 25 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gsa_340M.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 1,
|
| 3 |
+
"conv_size": 4,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"expand_k": 1,
|
| 6 |
+
"expand_v": 1,
|
| 7 |
+
"elementwise_affine": false,
|
| 8 |
+
"feature_map": "swish",
|
| 9 |
+
"fuse_cross_entropy": true,
|
| 10 |
+
"fuse_norm": true,
|
| 11 |
+
"gate_logit_normalizer": 4,
|
| 12 |
+
"hidden_act": "swish",
|
| 13 |
+
"hidden_ratio": 4,
|
| 14 |
+
"hidden_size": 1024,
|
| 15 |
+
"initializer_range": 0.02,
|
| 16 |
+
"intermediate_size": null,
|
| 17 |
+
"model_type": "gsa",
|
| 18 |
+
"num_heads": 4,
|
| 19 |
+
"num_hidden_layers": 24,
|
| 20 |
+
"num_slots": 64,
|
| 21 |
+
"norm_eps": 1e-06,
|
| 22 |
+
"share_conv_kernel": true,
|
| 23 |
+
"tie_word_embeddings": false,
|
| 24 |
+
"use_cache": true,
|
| 25 |
+
"use_norm": true,
|
| 26 |
+
"use_output_gate": true,
|
| 27 |
+
"use_rope": false,
|
| 28 |
+
"use_short_conv": false
|
| 29 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/hgrn2_340M.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attn_mode": "chunk",
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"expand_ratio": 128,
|
| 6 |
+
"fuse_cross_entropy": true,
|
| 7 |
+
"fuse_norm": true,
|
| 8 |
+
"hidden_act": "swish",
|
| 9 |
+
"hidden_ratio": 4,
|
| 10 |
+
"hidden_size": 1024,
|
| 11 |
+
"initializer_range": 0.02,
|
| 12 |
+
"intermediate_size": null,
|
| 13 |
+
"model_type": "hgrn2",
|
| 14 |
+
"num_heads": 8,
|
| 15 |
+
"num_hidden_layers": 24,
|
| 16 |
+
"norm_eps": 1e-06,
|
| 17 |
+
"tie_word_embeddings": false,
|
| 18 |
+
"use_cache": true,
|
| 19 |
+
"vocab_size": 32000
|
| 20 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/mamba2_1B.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 1,
|
| 3 |
+
"chunk_size": 256,
|
| 4 |
+
"conv_kernel": 4,
|
| 5 |
+
"eos_token_id": 2,
|
| 6 |
+
"expand": 2,
|
| 7 |
+
"fuse_cross_entropy": true,
|
| 8 |
+
"fuse_norm": true,
|
| 9 |
+
"head_dim": 64,
|
| 10 |
+
"hidden_act": "silu",
|
| 11 |
+
"hidden_size": 2048,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"norm_eps": 1e-05,
|
| 14 |
+
"model_type": "mamba2",
|
| 15 |
+
"n_groups": 1,
|
| 16 |
+
"num_hidden_layers": 48,
|
| 17 |
+
"pad_token_id": 0,
|
| 18 |
+
"rescale_prenorm_residual": true,
|
| 19 |
+
"residual_in_fp32": true,
|
| 20 |
+
"rms_norm": true,
|
| 21 |
+
"state_size": 128,
|
| 22 |
+
"tie_word_embeddings": false,
|
| 23 |
+
"time_step_floor": 0.0001,
|
| 24 |
+
"time_step_max": 0.1,
|
| 25 |
+
"time_step_min": 0.001,
|
| 26 |
+
"time_step_rank": 128,
|
| 27 |
+
"transformers_version": "4.50.1",
|
| 28 |
+
"use_bias": false,
|
| 29 |
+
"use_cache": true,
|
| 30 |
+
"use_conv_bias": true,
|
| 31 |
+
"vocab_size": 32000
|
| 32 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/mamba2_340M.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 1,
|
| 3 |
+
"chunk_size": 256,
|
| 4 |
+
"conv_kernel": 4,
|
| 5 |
+
"eos_token_id": 2,
|
| 6 |
+
"expand": 2,
|
| 7 |
+
"fuse_cross_entropy": true,
|
| 8 |
+
"fuse_norm": true,
|
| 9 |
+
"head_dim": 64,
|
| 10 |
+
"hidden_act": "silu",
|
| 11 |
+
"hidden_size": 1024,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"norm_eps": 1e-05,
|
| 14 |
+
"model_type": "mamba2",
|
| 15 |
+
"n_groups": 1,
|
| 16 |
+
"num_hidden_layers": 48,
|
| 17 |
+
"pad_token_id": 0,
|
| 18 |
+
"rescale_prenorm_residual": true,
|
| 19 |
+
"residual_in_fp32": true,
|
| 20 |
+
"rms_norm": true,
|
| 21 |
+
"state_size": 128,
|
| 22 |
+
"tie_word_embeddings": false,
|
| 23 |
+
"time_step_floor": 0.0001,
|
| 24 |
+
"time_step_max": 0.1,
|
| 25 |
+
"time_step_min": 0.001,
|
| 26 |
+
"time_step_rank": 128,
|
| 27 |
+
"transformers_version": "4.50.1",
|
| 28 |
+
"use_bias": false,
|
| 29 |
+
"use_cache": true,
|
| 30 |
+
"use_conv_bias": true,
|
| 31 |
+
"vocab_size": 32000
|
| 32 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/mamba_1B.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 1,
|
| 3 |
+
"conv_kernel": 4,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"expand": 2,
|
| 6 |
+
"fuse_cross_entropy": true,
|
| 7 |
+
"fuse_norm": true,
|
| 8 |
+
"hidden_act": "silu",
|
| 9 |
+
"hidden_size": 2048,
|
| 10 |
+
"initializer_range": 0.02,
|
| 11 |
+
"model_type": "mamba",
|
| 12 |
+
"norm_eps": 1e-05,
|
| 13 |
+
"num_hidden_layers": 48,
|
| 14 |
+
"pad_token_id": 0,
|
| 15 |
+
"rescale_prenorm_residual": false,
|
| 16 |
+
"residual_in_fp32": false,
|
| 17 |
+
"state_size": 16,
|
| 18 |
+
"tie_word_embeddings": false,
|
| 19 |
+
"time_step_floor": 0.0001,
|
| 20 |
+
"time_step_init_scheme": "random",
|
| 21 |
+
"time_step_max": 0.1,
|
| 22 |
+
"time_step_min": 0.001,
|
| 23 |
+
"time_step_rank": 128,
|
| 24 |
+
"time_step_scale": 1.0,
|
| 25 |
+
"transformers_version": "4.50.1",
|
| 26 |
+
"use_bias": false,
|
| 27 |
+
"use_cache": true,
|
| 28 |
+
"use_conv_bias": true,
|
| 29 |
+
"vocab_size": 32000
|
| 30 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/mamba_340M.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 1,
|
| 3 |
+
"conv_kernel": 4,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"expand": 2,
|
| 6 |
+
"fuse_cross_entropy": true,
|
| 7 |
+
"fuse_norm": true,
|
| 8 |
+
"hidden_act": "silu",
|
| 9 |
+
"hidden_size": 1024,
|
| 10 |
+
"initializer_range": 0.02,
|
| 11 |
+
"model_type": "mamba",
|
| 12 |
+
"norm_eps": 1e-05,
|
| 13 |
+
"num_hidden_layers": 48,
|
| 14 |
+
"pad_token_id": 0,
|
| 15 |
+
"rescale_prenorm_residual": false,
|
| 16 |
+
"residual_in_fp32": false,
|
| 17 |
+
"state_size": 16,
|
| 18 |
+
"tie_word_embeddings": false,
|
| 19 |
+
"time_step_floor": 0.0001,
|
| 20 |
+
"time_step_init_scheme": "random",
|
| 21 |
+
"time_step_max": 0.1,
|
| 22 |
+
"time_step_min": 0.001,
|
| 23 |
+
"time_step_rank": 128,
|
| 24 |
+
"time_step_scale": 1.0,
|
| 25 |
+
"transformers_version": "4.50.1",
|
| 26 |
+
"use_bias": false,
|
| 27 |
+
"use_cache": true,
|
| 28 |
+
"use_conv_bias": true,
|
| 29 |
+
"vocab_size": 32000
|
| 30 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/samba_1B.json
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attn": {
|
| 3 |
+
"layers": [
|
| 4 |
+
1,
|
| 5 |
+
3,
|
| 6 |
+
5,
|
| 7 |
+
7,
|
| 8 |
+
9,
|
| 9 |
+
11,
|
| 10 |
+
13,
|
| 11 |
+
15,
|
| 12 |
+
17
|
| 13 |
+
],
|
| 14 |
+
"num_heads": 18,
|
| 15 |
+
"num_kv_heads": 18,
|
| 16 |
+
"qkv_bias": false,
|
| 17 |
+
"rope_theta": 10000.0,
|
| 18 |
+
"window_size": 2048
|
| 19 |
+
},
|
| 20 |
+
"bos_token_id": 1,
|
| 21 |
+
"conv_kernel": 4,
|
| 22 |
+
"eos_token_id": 2,
|
| 23 |
+
"expand": 2,
|
| 24 |
+
"fuse_cross_entropy": true,
|
| 25 |
+
"fuse_norm": true,
|
| 26 |
+
"fuse_swiglu": true,
|
| 27 |
+
"hidden_act": "swish",
|
| 28 |
+
"hidden_ratio": 4,
|
| 29 |
+
"hidden_size": 2304,
|
| 30 |
+
"initializer_range": 0.02,
|
| 31 |
+
"intermediate_size": 4608,
|
| 32 |
+
"max_position_embeddings": 2048,
|
| 33 |
+
"model_type": "samba",
|
| 34 |
+
"norm_eps": 1e-05,
|
| 35 |
+
"num_hidden_layers": 18,
|
| 36 |
+
"pad_token_id": 0,
|
| 37 |
+
"rescale_prenorm_residual": false,
|
| 38 |
+
"residual_in_fp32": false,
|
| 39 |
+
"state_size": 16,
|
| 40 |
+
"tie_word_embeddings": false,
|
| 41 |
+
"time_step_floor": 0.0001,
|
| 42 |
+
"time_step_init_scheme": "random",
|
| 43 |
+
"time_step_max": 0.1,
|
| 44 |
+
"time_step_min": 0.001,
|
| 45 |
+
"time_step_rank": 144,
|
| 46 |
+
"time_step_scale": 1.0,
|
| 47 |
+
"transformers_version": "4.50.1",
|
| 48 |
+
"use_bias": false,
|
| 49 |
+
"use_cache": true,
|
| 50 |
+
"use_conv_bias": true,
|
| 51 |
+
"vocab_size": 32000
|
| 52 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/sba_340m.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attention_bias": false,
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"fuse_cross_entropy": true,
|
| 6 |
+
"fuse_norm": true,
|
| 7 |
+
"hidden_act": "swish",
|
| 8 |
+
"hidden_size": 1024,
|
| 9 |
+
"initializer_range": 0.006,
|
| 10 |
+
"max_position_embeddings": 8192,
|
| 11 |
+
"model_type": "sba",
|
| 12 |
+
"num_heads": 16,
|
| 13 |
+
"num_hidden_layers": 24,
|
| 14 |
+
"norm_eps": 1e-06,
|
| 15 |
+
"tie_word_embeddings": false,
|
| 16 |
+
"use_cache": true,
|
| 17 |
+
"vocab_size": 32000
|
| 18 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/transformer_1B.json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 1,
|
| 3 |
+
"elementwise_affine": true,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"fuse_cross_entropy": true,
|
| 6 |
+
"fuse_norm": true,
|
| 7 |
+
"fuse_swiglu": true,
|
| 8 |
+
"hidden_act": "swish",
|
| 9 |
+
"hidden_ratio": 4,
|
| 10 |
+
"hidden_size": 2048,
|
| 11 |
+
"initializer_range": 0.02,
|
| 12 |
+
"intermediate_size": null,
|
| 13 |
+
"max_position_embeddings": 8192,
|
| 14 |
+
"model_type": "transformer",
|
| 15 |
+
"norm_eps": 1e-06,
|
| 16 |
+
"num_heads": 32,
|
| 17 |
+
"num_hidden_layers": 24,
|
| 18 |
+
"num_kv_heads": null,
|
| 19 |
+
"pad_token_id": 2,
|
| 20 |
+
"rope_theta": 10000.0,
|
| 21 |
+
"tie_word_embeddings": false
|
| 22 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/transformer_340M.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attention_bias": false,
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"fuse_cross_entropy": true,
|
| 6 |
+
"fuse_norm": true,
|
| 7 |
+
"hidden_act": "swish",
|
| 8 |
+
"hidden_size": 1024,
|
| 9 |
+
"initializer_range": 0.02,
|
| 10 |
+
"max_position_embeddings": 8192,
|
| 11 |
+
"model_type": "transformer",
|
| 12 |
+
"num_heads": 16,
|
| 13 |
+
"num_hidden_layers": 24,
|
| 14 |
+
"norm_eps": 1e-06,
|
| 15 |
+
"tie_word_embeddings": false,
|
| 16 |
+
"use_cache": true,
|
| 17 |
+
"vocab_size": 32000
|
| 18 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/transformer_7B.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attention_bias": false,
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"fuse_cross_entropy": true,
|
| 6 |
+
"fuse_norm": true,
|
| 7 |
+
"hidden_act": "swish",
|
| 8 |
+
"hidden_ratio": 4,
|
| 9 |
+
"hidden_size": 4096,
|
| 10 |
+
"initializer_range": 0.02,
|
| 11 |
+
"intermediate_size": 14336,
|
| 12 |
+
"model_type": "transformer",
|
| 13 |
+
"norm_eps": 1e-06,
|
| 14 |
+
"num_heads": 32,
|
| 15 |
+
"num_hidden_layers": 32,
|
| 16 |
+
"num_kv_heads": 8,
|
| 17 |
+
"rope_theta": 10000.0,
|
| 18 |
+
"tie_word_embeddings": false,
|
| 19 |
+
"use_cache": true,
|
| 20 |
+
"window_size": null
|
| 21 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/0/stderr.log
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Traceback (most recent call last):
|
| 2 |
+
File "<frozen runpy>", line 198, in _run_module_as_main
|
| 3 |
+
File "<frozen runpy>", line 88, in _run_code
|
| 4 |
+
File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 17, in <module>
|
| 5 |
+
from torchtitan.components.checkpoint import CheckpointManager
|
| 6 |
+
ModuleNotFoundError: No module named 'torchtitan'
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/0/stdout.log
ADDED
|
File without changes
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/1/stderr.log
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Traceback (most recent call last):
|
| 2 |
+
File "<frozen runpy>", line 198, in _run_module_as_main
|
| 3 |
+
File "<frozen runpy>", line 88, in _run_code
|
| 4 |
+
File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 17, in <module>
|
| 5 |
+
from torchtitan.components.checkpoint import CheckpointManager
|
| 6 |
+
ModuleNotFoundError: No module named 'torchtitan'
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/1/stdout.log
ADDED
|
File without changes
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/2/stderr.log
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Traceback (most recent call last):
|
| 2 |
+
File "<frozen runpy>", line 198, in _run_module_as_main
|
| 3 |
+
File "<frozen runpy>", line 88, in _run_code
|
| 4 |
+
File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 17, in <module>
|
| 5 |
+
from torchtitan.components.checkpoint import CheckpointManager
|
| 6 |
+
ModuleNotFoundError: No module named 'torchtitan'
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/2/stdout.log
ADDED
|
File without changes
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/3/stderr.log
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Traceback (most recent call last):
|
| 2 |
+
File "<frozen runpy>", line 198, in _run_module_as_main
|
| 3 |
+
File "<frozen runpy>", line 88, in _run_code
|
| 4 |
+
File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 17, in <module>
|
| 5 |
+
from torchtitan.components.checkpoint import CheckpointManager
|
| 6 |
+
ModuleNotFoundError: No module named 'torchtitan'
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/3/stdout.log
ADDED
|
File without changes
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/4/stderr.log
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Traceback (most recent call last):
|
| 2 |
+
File "<frozen runpy>", line 198, in _run_module_as_main
|
| 3 |
+
File "<frozen runpy>", line 88, in _run_code
|
| 4 |
+
File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 17, in <module>
|
| 5 |
+
from torchtitan.components.checkpoint import CheckpointManager
|
| 6 |
+
ModuleNotFoundError: No module named 'torchtitan'
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/4/stdout.log
ADDED
|
File without changes
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/5/stderr.log
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Traceback (most recent call last):
|
| 2 |
+
File "<frozen runpy>", line 198, in _run_module_as_main
|
| 3 |
+
File "<frozen runpy>", line 88, in _run_code
|
| 4 |
+
File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 17, in <module>
|
| 5 |
+
from torchtitan.components.checkpoint import CheckpointManager
|
| 6 |
+
ModuleNotFoundError: No module named 'torchtitan'
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/5/stdout.log
ADDED
|
File without changes
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/6/stderr.log
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Traceback (most recent call last):
|
| 2 |
+
File "<frozen runpy>", line 198, in _run_module_as_main
|
| 3 |
+
File "<frozen runpy>", line 88, in _run_code
|
| 4 |
+
File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 17, in <module>
|
| 5 |
+
from torchtitan.components.checkpoint import CheckpointManager
|
| 6 |
+
ModuleNotFoundError: No module named 'torchtitan'
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/6/stdout.log
ADDED
|
File without changes
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/7/stderr.log
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Traceback (most recent call last):
|
| 2 |
+
File "<frozen runpy>", line 198, in _run_module_as_main
|
| 3 |
+
File "<frozen runpy>", line 88, in _run_code
|
| 4 |
+
File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 17, in <module>
|
| 5 |
+
from torchtitan.components.checkpoint import CheckpointManager
|
| 6 |
+
ModuleNotFoundError: No module named 'torchtitan'
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/7/stdout.log
ADDED
|
File without changes
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/0/stderr.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/0/stdout.log
ADDED
|
File without changes
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/1/stderr.log
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[titan] 2025-07-22 22:47:41,941 - root - INFO - Starting job: default job
|
| 2 |
+
[titan] 2025-07-22 22:47:41,941 - root - INFO - [32m{
|
| 3 |
+
"activation_checkpoint": {
|
| 4 |
+
"mode": "none",
|
| 5 |
+
"selective_ac_option": "2"
|
| 6 |
+
},
|
| 7 |
+
"activation_offload": {
|
| 8 |
+
"mode": "none"
|
| 9 |
+
},
|
| 10 |
+
"checkpoint": {
|
| 11 |
+
"async_mode": "disabled",
|
| 12 |
+
"create_seed_checkpoint": false,
|
| 13 |
+
"enable_checkpoint": true,
|
| 14 |
+
"exclude_from_loading": [],
|
| 15 |
+
"export_dtype": "float32",
|
| 16 |
+
"folder": "checkpoint",
|
| 17 |
+
"interval": 8192,
|
| 18 |
+
"interval_type": "steps",
|
| 19 |
+
"keep_latest_k": 100,
|
| 20 |
+
"load_step": -1,
|
| 21 |
+
"model_weights_only": false
|
| 22 |
+
},
|
| 23 |
+
"comm": {
|
| 24 |
+
"init_timeout_seconds": 300,
|
| 25 |
+
"trace_buf_size": 20000,
|
| 26 |
+
"train_timeout_seconds": 100
|
| 27 |
+
},
|
| 28 |
+
"experimental": {
|
| 29 |
+
"context_parallel_degree": 1,
|
| 30 |
+
"context_parallel_rotate_method": "allgather",
|
| 31 |
+
"custom_model_path": "",
|
| 32 |
+
"enable_async_tensor_parallel": false,
|
| 33 |
+
"enable_compiled_autograd": false,
|
| 34 |
+
"pipeline_parallel_degree": 1,
|
| 35 |
+
"pipeline_parallel_microbatches": null,
|
| 36 |
+
"pipeline_parallel_schedule": "1F1B",
|
| 37 |
+
"pipeline_parallel_schedule_csv": "",
|
| 38 |
+
"pipeline_parallel_split_points": []
|
| 39 |
+
},
|
| 40 |
+
"fault_tolerance": {
|
| 41 |
+
"enable": false,
|
| 42 |
+
"group_size": 0,
|
| 43 |
+
"min_replica_size": 1,
|
| 44 |
+
"replica_id": 0
|
| 45 |
+
},
|
| 46 |
+
"float8": {
|
| 47 |
+
"enable_fsdp_float8_all_gather": false,
|
| 48 |
+
"force_recompute_fp8_weight_in_bwd": false,
|
| 49 |
+
"precompute_float8_dynamic_scale_for_fsdp": false,
|
| 50 |
+
"recipe_name": null
|
| 51 |
+
},
|
| 52 |
+
"job": {
|
| 53 |
+
"config_file": "flame/models/fla.toml",
|
| 54 |
+
"description": "default job",
|
| 55 |
+
"dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1",
|
| 56 |
+
"print_args": true,
|
| 57 |
+
"use_for_integration_test": false
|
| 58 |
+
},
|
| 59 |
+
"lr_scheduler": {
|
| 60 |
+
"decay_ratio": 1.0,
|
| 61 |
+
"decay_type": "linear",
|
| 62 |
+
"lr_min": 0.01,
|
| 63 |
+
"warmup_steps": 100
|
| 64 |
+
},
|
| 65 |
+
"memory_estimation": {
|
| 66 |
+
"disable_fake_mode": false,
|
| 67 |
+
"enabled": false
|
| 68 |
+
},
|
| 69 |
+
"metrics": {
|
| 70 |
+
"disable_color_printing": false,
|
| 71 |
+
"enable_tensorboard": true,
|
| 72 |
+
"enable_wandb": true,
|
| 73 |
+
"log_freq": 1,
|
| 74 |
+
"save_for_all_ranks": false,
|
| 75 |
+
"save_tb_folder": "tb"
|
| 76 |
+
},
|
| 77 |
+
"model": {
|
| 78 |
+
"config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/gdn_6_1_340M.json",
|
| 79 |
+
"converters": [],
|
| 80 |
+
"name": "fla",
|
| 81 |
+
"print_after_conversion": false,
|
| 82 |
+
"tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
|
| 83 |
+
},
|
| 84 |
+
"optimizer": {
|
| 85 |
+
"early_step_in_backward": false,
|
| 86 |
+
"eps": 1e-08,
|
| 87 |
+
"implementation": "fused",
|
| 88 |
+
"lr": 0.0003,
|
| 89 |
+
"name": "AdamW"
|
| 90 |
+
},
|
| 91 |
+
"profiling": {
|
| 92 |
+
"enable_memory_snapshot": false,
|
| 93 |
+
"enable_profiling": true,
|
| 94 |
+
"profile_freq": 512,
|
| 95 |
+
"save_memory_snapshot_folder": "memory_snapshot",
|
| 96 |
+
"save_traces_folder": "profile_trace"
|
| 97 |
+
},
|
| 98 |
+
"training": {
|
| 99 |
+
"batch_size": 16,
|
| 100 |
+
"compile": true,
|
| 101 |
+
"context_len": 8192,
|
| 102 |
+
"data_dir": null,
|
| 103 |
+
"data_files": null,
|
| 104 |
+
"data_parallel_replicate_degree": 1,
|
| 105 |
+
"data_parallel_shard_degree": -1,
|
| 106 |
+
"data_probs": "0.55,0.3,0.15",
|
| 107 |
+
"dataset": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/MegaMath/megamath-web-pro",
|
| 108 |
+
"dataset_name": "default,default,default",
|
| 109 |
+
"dataset_split": "train,train,train",
|
| 110 |
+
"deterministic": false,
|
| 111 |
+
"disable_loss_parallel": false,
|
| 112 |
+
"enable_cpu_offload": false,
|
| 113 |
+
"fsdp_reshard_after_forward": "default",
|
| 114 |
+
"gc_freq": 50,
|
| 115 |
+
"gradient_accumulation_steps": 1,
|
| 116 |
+
"max_norm": 1.0,
|
| 117 |
+
"mixed_precision_param": "bfloat16",
|
| 118 |
+
"mixed_precision_reduce": "float32",
|
| 119 |
+
"num_workers": 32,
|
| 120 |
+
"persistent_workers": false,
|
| 121 |
+
"pin_memory": false,
|
| 122 |
+
"prefetch_factor": 2,
|
| 123 |
+
"seed": 42,
|
| 124 |
+
"seq_len": 8192,
|
| 125 |
+
"skip_nan_inf": true,
|
| 126 |
+
"steps": 95366,
|
| 127 |
+
"streaming": true,
|
| 128 |
+
"tensor_parallel_degree": 1,
|
| 129 |
+
"varlen": false
|
| 130 |
+
}
|
| 131 |
+
}[39m
|
| 132 |
+
[titan] 2025-07-22 22:47:41,942 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
|
| 133 |
+
[titan] 2025-07-22 22:47:43,062 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
|
| 134 |
+
[titan] 2025-07-22 22:47:43,064 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
|
| 135 |
+
[titan] 2025-07-22 22:47:43,187 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
|
| 136 |
+
[titan] 2025-07-22 22:47:43,187 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
|
| 137 |
+
[titan] 2025-07-22 22:47:43,187 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
|
| 138 |
+
[titan] 2025-07-22 22:47:43,207 - root - INFO - Loading tokenizer...
|
| 139 |
+
[titan] 2025-07-22 22:47:43,370 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
|
| 140 |
+
0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 141 |
+
1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 142 |
+
2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 143 |
+
}
|
| 144 |
+
)
|
| 145 |
+
[titan] 2025-07-22 22:47:43,371 - root - INFO - Loading dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/MegaMath/megamath-web-pro:default,default,default
|
| 146 |
+
`trust_remote_code` is not supported anymore.
|
| 147 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 148 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 149 |
+
[titan] 2025-07-22 22:47:43,371 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 150 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 151 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 152 |
+
[titan] 2025-07-22 22:47:43,666 - root - INFO - Subset [36m/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample:default (p = 0.550)[39m:
|
| 153 |
+
IterableDataset({
|
| 154 |
+
features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
|
| 155 |
+
num_shards: 140
|
| 156 |
+
})
|
| 157 |
+
[titan] 2025-07-22 22:47:43,667 - root - INFO - Shuffling the dataset with seed 42
|
| 158 |
+
[titan] 2025-07-22 22:47:43,667 - root - WARNING - [31mDataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample has insufficient shards (140). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.[39m
|
| 159 |
+
`trust_remote_code` is not supported anymore.
|
| 160 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 161 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 162 |
+
[titan] 2025-07-22 22:47:43,667 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 163 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 164 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 165 |
+
[rank1]: Traceback (most recent call last):
|
| 166 |
+
[rank1]: File "<frozen runpy>", line 198, in _run_module_as_main
|
| 167 |
+
[rank1]: File "<frozen runpy>", line 88, in _run_code
|
| 168 |
+
[rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
|
| 169 |
+
[rank1]: main(config)
|
| 170 |
+
[rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
| 171 |
+
[rank1]: return f(*args, **kwargs)
|
| 172 |
+
[rank1]: ^^^^^^^^^^^^^^^^^^
|
| 173 |
+
[rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 155, in main
|
| 174 |
+
[rank1]: dataset = build_dataset(
|
| 175 |
+
[rank1]: ^^^^^^^^^^^^^^
|
| 176 |
+
[rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py", line 689, in build_dataset
|
| 177 |
+
[rank1]: subset = load_dataset(
|
| 178 |
+
[rank1]: ^^^^^^^^^^^^^
|
| 179 |
+
[rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1412, in load_dataset
|
| 180 |
+
[rank1]: builder_instance.download_and_prepare(
|
| 181 |
+
[rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/builder.py", line 829, in download_and_prepare
|
| 182 |
+
[rank1]: with FileLock(lock_path) if is_local else contextlib.nullcontext():
|
| 183 |
+
[rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/filelock/_api.py", line 376, in __enter__
|
| 184 |
+
[rank1]: self.acquire()
|
| 185 |
+
[rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/filelock/_api.py", line 344, in acquire
|
| 186 |
+
[rank1]: time.sleep(poll_interval)
|
| 187 |
+
[rank1]: KeyboardInterrupt
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/1/stdout.log
ADDED
|
File without changes
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/2/stderr.log
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[titan] 2025-07-22 22:47:42,036 - root - INFO - Starting job: default job
|
| 2 |
+
[titan] 2025-07-22 22:47:42,037 - root - INFO - [32m{
|
| 3 |
+
"activation_checkpoint": {
|
| 4 |
+
"mode": "none",
|
| 5 |
+
"selective_ac_option": "2"
|
| 6 |
+
},
|
| 7 |
+
"activation_offload": {
|
| 8 |
+
"mode": "none"
|
| 9 |
+
},
|
| 10 |
+
"checkpoint": {
|
| 11 |
+
"async_mode": "disabled",
|
| 12 |
+
"create_seed_checkpoint": false,
|
| 13 |
+
"enable_checkpoint": true,
|
| 14 |
+
"exclude_from_loading": [],
|
| 15 |
+
"export_dtype": "float32",
|
| 16 |
+
"folder": "checkpoint",
|
| 17 |
+
"interval": 8192,
|
| 18 |
+
"interval_type": "steps",
|
| 19 |
+
"keep_latest_k": 100,
|
| 20 |
+
"load_step": -1,
|
| 21 |
+
"model_weights_only": false
|
| 22 |
+
},
|
| 23 |
+
"comm": {
|
| 24 |
+
"init_timeout_seconds": 300,
|
| 25 |
+
"trace_buf_size": 20000,
|
| 26 |
+
"train_timeout_seconds": 100
|
| 27 |
+
},
|
| 28 |
+
"experimental": {
|
| 29 |
+
"context_parallel_degree": 1,
|
| 30 |
+
"context_parallel_rotate_method": "allgather",
|
| 31 |
+
"custom_model_path": "",
|
| 32 |
+
"enable_async_tensor_parallel": false,
|
| 33 |
+
"enable_compiled_autograd": false,
|
| 34 |
+
"pipeline_parallel_degree": 1,
|
| 35 |
+
"pipeline_parallel_microbatches": null,
|
| 36 |
+
"pipeline_parallel_schedule": "1F1B",
|
| 37 |
+
"pipeline_parallel_schedule_csv": "",
|
| 38 |
+
"pipeline_parallel_split_points": []
|
| 39 |
+
},
|
| 40 |
+
"fault_tolerance": {
|
| 41 |
+
"enable": false,
|
| 42 |
+
"group_size": 0,
|
| 43 |
+
"min_replica_size": 1,
|
| 44 |
+
"replica_id": 0
|
| 45 |
+
},
|
| 46 |
+
"float8": {
|
| 47 |
+
"enable_fsdp_float8_all_gather": false,
|
| 48 |
+
"force_recompute_fp8_weight_in_bwd": false,
|
| 49 |
+
"precompute_float8_dynamic_scale_for_fsdp": false,
|
| 50 |
+
"recipe_name": null
|
| 51 |
+
},
|
| 52 |
+
"job": {
|
| 53 |
+
"config_file": "flame/models/fla.toml",
|
| 54 |
+
"description": "default job",
|
| 55 |
+
"dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1",
|
| 56 |
+
"print_args": true,
|
| 57 |
+
"use_for_integration_test": false
|
| 58 |
+
},
|
| 59 |
+
"lr_scheduler": {
|
| 60 |
+
"decay_ratio": 1.0,
|
| 61 |
+
"decay_type": "linear",
|
| 62 |
+
"lr_min": 0.01,
|
| 63 |
+
"warmup_steps": 100
|
| 64 |
+
},
|
| 65 |
+
"memory_estimation": {
|
| 66 |
+
"disable_fake_mode": false,
|
| 67 |
+
"enabled": false
|
| 68 |
+
},
|
| 69 |
+
"metrics": {
|
| 70 |
+
"disable_color_printing": false,
|
| 71 |
+
"enable_tensorboard": true,
|
| 72 |
+
"enable_wandb": true,
|
| 73 |
+
"log_freq": 1,
|
| 74 |
+
"save_for_all_ranks": false,
|
| 75 |
+
"save_tb_folder": "tb"
|
| 76 |
+
},
|
| 77 |
+
"model": {
|
| 78 |
+
"config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/gdn_6_1_340M.json",
|
| 79 |
+
"converters": [],
|
| 80 |
+
"name": "fla",
|
| 81 |
+
"print_after_conversion": false,
|
| 82 |
+
"tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
|
| 83 |
+
},
|
| 84 |
+
"optimizer": {
|
| 85 |
+
"early_step_in_backward": false,
|
| 86 |
+
"eps": 1e-08,
|
| 87 |
+
"implementation": "fused",
|
| 88 |
+
"lr": 0.0003,
|
| 89 |
+
"name": "AdamW"
|
| 90 |
+
},
|
| 91 |
+
"profiling": {
|
| 92 |
+
"enable_memory_snapshot": false,
|
| 93 |
+
"enable_profiling": true,
|
| 94 |
+
"profile_freq": 512,
|
| 95 |
+
"save_memory_snapshot_folder": "memory_snapshot",
|
| 96 |
+
"save_traces_folder": "profile_trace"
|
| 97 |
+
},
|
| 98 |
+
"training": {
|
| 99 |
+
"batch_size": 16,
|
| 100 |
+
"compile": true,
|
| 101 |
+
"context_len": 8192,
|
| 102 |
+
"data_dir": null,
|
| 103 |
+
"data_files": null,
|
| 104 |
+
"data_parallel_replicate_degree": 1,
|
| 105 |
+
"data_parallel_shard_degree": -1,
|
| 106 |
+
"data_probs": "0.55,0.3,0.15",
|
| 107 |
+
"dataset": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/MegaMath/megamath-web-pro",
|
| 108 |
+
"dataset_name": "default,default,default",
|
| 109 |
+
"dataset_split": "train,train,train",
|
| 110 |
+
"deterministic": false,
|
| 111 |
+
"disable_loss_parallel": false,
|
| 112 |
+
"enable_cpu_offload": false,
|
| 113 |
+
"fsdp_reshard_after_forward": "default",
|
| 114 |
+
"gc_freq": 50,
|
| 115 |
+
"gradient_accumulation_steps": 1,
|
| 116 |
+
"max_norm": 1.0,
|
| 117 |
+
"mixed_precision_param": "bfloat16",
|
| 118 |
+
"mixed_precision_reduce": "float32",
|
| 119 |
+
"num_workers": 32,
|
| 120 |
+
"persistent_workers": false,
|
| 121 |
+
"pin_memory": false,
|
| 122 |
+
"prefetch_factor": 2,
|
| 123 |
+
"seed": 42,
|
| 124 |
+
"seq_len": 8192,
|
| 125 |
+
"skip_nan_inf": true,
|
| 126 |
+
"steps": 95366,
|
| 127 |
+
"streaming": true,
|
| 128 |
+
"tensor_parallel_degree": 1,
|
| 129 |
+
"varlen": false
|
| 130 |
+
}
|
| 131 |
+
}[39m
|
| 132 |
+
[titan] 2025-07-22 22:47:42,038 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
|
| 133 |
+
[titan] 2025-07-22 22:47:43,075 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
|
| 134 |
+
[titan] 2025-07-22 22:47:43,078 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
|
| 135 |
+
[titan] 2025-07-22 22:47:43,210 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
|
| 136 |
+
[titan] 2025-07-22 22:47:43,210 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
|
| 137 |
+
[titan] 2025-07-22 22:47:43,210 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
|
| 138 |
+
[titan] 2025-07-22 22:47:43,219 - root - INFO - Loading tokenizer...
|
| 139 |
+
[titan] 2025-07-22 22:47:43,387 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
|
| 140 |
+
0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 141 |
+
1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 142 |
+
2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 143 |
+
}
|
| 144 |
+
)
|
| 145 |
+
[titan] 2025-07-22 22:47:43,387 - root - INFO - Loading dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/MegaMath/megamath-web-pro:default,default,default
|
| 146 |
+
`trust_remote_code` is not supported anymore.
|
| 147 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 148 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 149 |
+
[titan] 2025-07-22 22:47:43,387 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 150 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 151 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 152 |
+
[titan] 2025-07-22 22:47:43,666 - root - INFO - Subset [36m/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample:default (p = 0.550)[39m:
|
| 153 |
+
IterableDataset({
|
| 154 |
+
features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
|
| 155 |
+
num_shards: 140
|
| 156 |
+
})
|
| 157 |
+
[titan] 2025-07-22 22:47:43,667 - root - INFO - Shuffling the dataset with seed 42
|
| 158 |
+
[titan] 2025-07-22 22:47:43,667 - root - WARNING - [31mDataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample has insufficient shards (140). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.[39m
|
| 159 |
+
`trust_remote_code` is not supported anymore.
|
| 160 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 161 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 162 |
+
[titan] 2025-07-22 22:47:43,668 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 163 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 164 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 165 |
+
[rank2]: Traceback (most recent call last):
|
| 166 |
+
[rank2]: File "<frozen runpy>", line 198, in _run_module_as_main
|
| 167 |
+
[rank2]: File "<frozen runpy>", line 88, in _run_code
|
| 168 |
+
[rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
|
| 169 |
+
[rank2]: main(config)
|
| 170 |
+
[rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
| 171 |
+
[rank2]: return f(*args, **kwargs)
|
| 172 |
+
[rank2]: ^^^^^^^^^^^^^^^^^^
|
| 173 |
+
[rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 155, in main
|
| 174 |
+
[rank2]: dataset = build_dataset(
|
| 175 |
+
[rank2]: ^^^^^^^^^^^^^^
|
| 176 |
+
[rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py", line 689, in build_dataset
|
| 177 |
+
[rank2]: subset = load_dataset(
|
| 178 |
+
[rank2]: ^^^^^^^^^^^^^
|
| 179 |
+
[rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1412, in load_dataset
|
| 180 |
+
[rank2]: builder_instance.download_and_prepare(
|
| 181 |
+
[rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/builder.py", line 829, in download_and_prepare
|
| 182 |
+
[rank2]: with FileLock(lock_path) if is_local else contextlib.nullcontext():
|
| 183 |
+
[rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/filelock/_api.py", line 376, in __enter__
|
| 184 |
+
[rank2]: self.acquire()
|
| 185 |
+
[rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/filelock/_api.py", line 344, in acquire
|
| 186 |
+
[rank2]: time.sleep(poll_interval)
|
| 187 |
+
[rank2]: KeyboardInterrupt
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/2/stdout.log
ADDED
|
File without changes
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/3/stderr.log
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[titan] 2025-07-22 22:47:41,964 - root - INFO - Starting job: default job
|
| 2 |
+
[titan] 2025-07-22 22:47:41,965 - root - INFO - [32m{
|
| 3 |
+
"activation_checkpoint": {
|
| 4 |
+
"mode": "none",
|
| 5 |
+
"selective_ac_option": "2"
|
| 6 |
+
},
|
| 7 |
+
"activation_offload": {
|
| 8 |
+
"mode": "none"
|
| 9 |
+
},
|
| 10 |
+
"checkpoint": {
|
| 11 |
+
"async_mode": "disabled",
|
| 12 |
+
"create_seed_checkpoint": false,
|
| 13 |
+
"enable_checkpoint": true,
|
| 14 |
+
"exclude_from_loading": [],
|
| 15 |
+
"export_dtype": "float32",
|
| 16 |
+
"folder": "checkpoint",
|
| 17 |
+
"interval": 8192,
|
| 18 |
+
"interval_type": "steps",
|
| 19 |
+
"keep_latest_k": 100,
|
| 20 |
+
"load_step": -1,
|
| 21 |
+
"model_weights_only": false
|
| 22 |
+
},
|
| 23 |
+
"comm": {
|
| 24 |
+
"init_timeout_seconds": 300,
|
| 25 |
+
"trace_buf_size": 20000,
|
| 26 |
+
"train_timeout_seconds": 100
|
| 27 |
+
},
|
| 28 |
+
"experimental": {
|
| 29 |
+
"context_parallel_degree": 1,
|
| 30 |
+
"context_parallel_rotate_method": "allgather",
|
| 31 |
+
"custom_model_path": "",
|
| 32 |
+
"enable_async_tensor_parallel": false,
|
| 33 |
+
"enable_compiled_autograd": false,
|
| 34 |
+
"pipeline_parallel_degree": 1,
|
| 35 |
+
"pipeline_parallel_microbatches": null,
|
| 36 |
+
"pipeline_parallel_schedule": "1F1B",
|
| 37 |
+
"pipeline_parallel_schedule_csv": "",
|
| 38 |
+
"pipeline_parallel_split_points": []
|
| 39 |
+
},
|
| 40 |
+
"fault_tolerance": {
|
| 41 |
+
"enable": false,
|
| 42 |
+
"group_size": 0,
|
| 43 |
+
"min_replica_size": 1,
|
| 44 |
+
"replica_id": 0
|
| 45 |
+
},
|
| 46 |
+
"float8": {
|
| 47 |
+
"enable_fsdp_float8_all_gather": false,
|
| 48 |
+
"force_recompute_fp8_weight_in_bwd": false,
|
| 49 |
+
"precompute_float8_dynamic_scale_for_fsdp": false,
|
| 50 |
+
"recipe_name": null
|
| 51 |
+
},
|
| 52 |
+
"job": {
|
| 53 |
+
"config_file": "flame/models/fla.toml",
|
| 54 |
+
"description": "default job",
|
| 55 |
+
"dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1",
|
| 56 |
+
"print_args": true,
|
| 57 |
+
"use_for_integration_test": false
|
| 58 |
+
},
|
| 59 |
+
"lr_scheduler": {
|
| 60 |
+
"decay_ratio": 1.0,
|
| 61 |
+
"decay_type": "linear",
|
| 62 |
+
"lr_min": 0.01,
|
| 63 |
+
"warmup_steps": 100
|
| 64 |
+
},
|
| 65 |
+
"memory_estimation": {
|
| 66 |
+
"disable_fake_mode": false,
|
| 67 |
+
"enabled": false
|
| 68 |
+
},
|
| 69 |
+
"metrics": {
|
| 70 |
+
"disable_color_printing": false,
|
| 71 |
+
"enable_tensorboard": true,
|
| 72 |
+
"enable_wandb": true,
|
| 73 |
+
"log_freq": 1,
|
| 74 |
+
"save_for_all_ranks": false,
|
| 75 |
+
"save_tb_folder": "tb"
|
| 76 |
+
},
|
| 77 |
+
"model": {
|
| 78 |
+
"config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/gdn_6_1_340M.json",
|
| 79 |
+
"converters": [],
|
| 80 |
+
"name": "fla",
|
| 81 |
+
"print_after_conversion": false,
|
| 82 |
+
"tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
|
| 83 |
+
},
|
| 84 |
+
"optimizer": {
|
| 85 |
+
"early_step_in_backward": false,
|
| 86 |
+
"eps": 1e-08,
|
| 87 |
+
"implementation": "fused",
|
| 88 |
+
"lr": 0.0003,
|
| 89 |
+
"name": "AdamW"
|
| 90 |
+
},
|
| 91 |
+
"profiling": {
|
| 92 |
+
"enable_memory_snapshot": false,
|
| 93 |
+
"enable_profiling": true,
|
| 94 |
+
"profile_freq": 512,
|
| 95 |
+
"save_memory_snapshot_folder": "memory_snapshot",
|
| 96 |
+
"save_traces_folder": "profile_trace"
|
| 97 |
+
},
|
| 98 |
+
"training": {
|
| 99 |
+
"batch_size": 16,
|
| 100 |
+
"compile": true,
|
| 101 |
+
"context_len": 8192,
|
| 102 |
+
"data_dir": null,
|
| 103 |
+
"data_files": null,
|
| 104 |
+
"data_parallel_replicate_degree": 1,
|
| 105 |
+
"data_parallel_shard_degree": -1,
|
| 106 |
+
"data_probs": "0.55,0.3,0.15",
|
| 107 |
+
"dataset": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/MegaMath/megamath-web-pro",
|
| 108 |
+
"dataset_name": "default,default,default",
|
| 109 |
+
"dataset_split": "train,train,train",
|
| 110 |
+
"deterministic": false,
|
| 111 |
+
"disable_loss_parallel": false,
|
| 112 |
+
"enable_cpu_offload": false,
|
| 113 |
+
"fsdp_reshard_after_forward": "default",
|
| 114 |
+
"gc_freq": 50,
|
| 115 |
+
"gradient_accumulation_steps": 1,
|
| 116 |
+
"max_norm": 1.0,
|
| 117 |
+
"mixed_precision_param": "bfloat16",
|
| 118 |
+
"mixed_precision_reduce": "float32",
|
| 119 |
+
"num_workers": 32,
|
| 120 |
+
"persistent_workers": false,
|
| 121 |
+
"pin_memory": false,
|
| 122 |
+
"prefetch_factor": 2,
|
| 123 |
+
"seed": 42,
|
| 124 |
+
"seq_len": 8192,
|
| 125 |
+
"skip_nan_inf": true,
|
| 126 |
+
"steps": 95366,
|
| 127 |
+
"streaming": true,
|
| 128 |
+
"tensor_parallel_degree": 1,
|
| 129 |
+
"varlen": false
|
| 130 |
+
}
|
| 131 |
+
}[39m
|
| 132 |
+
[titan] 2025-07-22 22:47:41,966 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
|
| 133 |
+
[titan] 2025-07-22 22:47:43,050 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
|
| 134 |
+
[titan] 2025-07-22 22:47:43,053 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
|
| 135 |
+
[titan] 2025-07-22 22:47:43,165 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
|
| 136 |
+
[titan] 2025-07-22 22:47:43,165 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
|
| 137 |
+
[titan] 2025-07-22 22:47:43,165 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
|
| 138 |
+
[titan] 2025-07-22 22:47:43,192 - root - INFO - Loading tokenizer...
|
| 139 |
+
[titan] 2025-07-22 22:47:43,304 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
|
| 140 |
+
0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 141 |
+
1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 142 |
+
2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 143 |
+
}
|
| 144 |
+
)
|
| 145 |
+
[titan] 2025-07-22 22:47:43,304 - root - INFO - Loading dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/MegaMath/megamath-web-pro:default,default,default
|
| 146 |
+
`trust_remote_code` is not supported anymore.
|
| 147 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 148 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 149 |
+
[titan] 2025-07-22 22:47:43,304 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 150 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 151 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 152 |
+
[titan] 2025-07-22 22:47:43,666 - root - INFO - Subset [36m/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample:default (p = 0.550)[39m:
|
| 153 |
+
IterableDataset({
|
| 154 |
+
features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
|
| 155 |
+
num_shards: 140
|
| 156 |
+
})
|
| 157 |
+
[titan] 2025-07-22 22:47:43,666 - root - INFO - Shuffling the dataset with seed 42
|
| 158 |
+
[titan] 2025-07-22 22:47:43,667 - root - WARNING - [31mDataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample has insufficient shards (140). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.[39m
|
| 159 |
+
`trust_remote_code` is not supported anymore.
|
| 160 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 161 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 162 |
+
[titan] 2025-07-22 22:47:43,667 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 163 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 164 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 165 |
+
[rank3]: Traceback (most recent call last):
|
| 166 |
+
[rank3]: File "<frozen runpy>", line 198, in _run_module_as_main
|
| 167 |
+
[rank3]: File "<frozen runpy>", line 88, in _run_code
|
| 168 |
+
[rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
|
| 169 |
+
[rank3]: main(config)
|
| 170 |
+
[rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
| 171 |
+
[rank3]: return f(*args, **kwargs)
|
| 172 |
+
[rank3]: ^^^^^^^^^^^^^^^^^^
|
| 173 |
+
[rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 155, in main
|
| 174 |
+
[rank3]: dataset = build_dataset(
|
| 175 |
+
[rank3]: ^^^^^^^^^^^^^^
|
| 176 |
+
[rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py", line 689, in build_dataset
|
| 177 |
+
[rank3]: subset = load_dataset(
|
| 178 |
+
[rank3]: ^^^^^^^^^^^^^
|
| 179 |
+
[rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1412, in load_dataset
|
| 180 |
+
[rank3]: builder_instance.download_and_prepare(
|
| 181 |
+
[rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/builder.py", line 829, in download_and_prepare
|
| 182 |
+
[rank3]: with FileLock(lock_path) if is_local else contextlib.nullcontext():
|
| 183 |
+
[rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/filelock/_api.py", line 376, in __enter__
|
| 184 |
+
[rank3]: self.acquire()
|
| 185 |
+
[rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/filelock/_api.py", line 344, in acquire
|
| 186 |
+
[rank3]: time.sleep(poll_interval)
|
| 187 |
+
[rank3]: KeyboardInterrupt
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/3/stdout.log
ADDED
|
File without changes
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/4/stderr.log
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[titan] 2025-07-22 22:47:41,988 - root - INFO - Starting job: default job
|
| 2 |
+
[titan] 2025-07-22 22:47:41,988 - root - INFO - [32m{
|
| 3 |
+
"activation_checkpoint": {
|
| 4 |
+
"mode": "none",
|
| 5 |
+
"selective_ac_option": "2"
|
| 6 |
+
},
|
| 7 |
+
"activation_offload": {
|
| 8 |
+
"mode": "none"
|
| 9 |
+
},
|
| 10 |
+
"checkpoint": {
|
| 11 |
+
"async_mode": "disabled",
|
| 12 |
+
"create_seed_checkpoint": false,
|
| 13 |
+
"enable_checkpoint": true,
|
| 14 |
+
"exclude_from_loading": [],
|
| 15 |
+
"export_dtype": "float32",
|
| 16 |
+
"folder": "checkpoint",
|
| 17 |
+
"interval": 8192,
|
| 18 |
+
"interval_type": "steps",
|
| 19 |
+
"keep_latest_k": 100,
|
| 20 |
+
"load_step": -1,
|
| 21 |
+
"model_weights_only": false
|
| 22 |
+
},
|
| 23 |
+
"comm": {
|
| 24 |
+
"init_timeout_seconds": 300,
|
| 25 |
+
"trace_buf_size": 20000,
|
| 26 |
+
"train_timeout_seconds": 100
|
| 27 |
+
},
|
| 28 |
+
"experimental": {
|
| 29 |
+
"context_parallel_degree": 1,
|
| 30 |
+
"context_parallel_rotate_method": "allgather",
|
| 31 |
+
"custom_model_path": "",
|
| 32 |
+
"enable_async_tensor_parallel": false,
|
| 33 |
+
"enable_compiled_autograd": false,
|
| 34 |
+
"pipeline_parallel_degree": 1,
|
| 35 |
+
"pipeline_parallel_microbatches": null,
|
| 36 |
+
"pipeline_parallel_schedule": "1F1B",
|
| 37 |
+
"pipeline_parallel_schedule_csv": "",
|
| 38 |
+
"pipeline_parallel_split_points": []
|
| 39 |
+
},
|
| 40 |
+
"fault_tolerance": {
|
| 41 |
+
"enable": false,
|
| 42 |
+
"group_size": 0,
|
| 43 |
+
"min_replica_size": 1,
|
| 44 |
+
"replica_id": 0
|
| 45 |
+
},
|
| 46 |
+
"float8": {
|
| 47 |
+
"enable_fsdp_float8_all_gather": false,
|
| 48 |
+
"force_recompute_fp8_weight_in_bwd": false,
|
| 49 |
+
"precompute_float8_dynamic_scale_for_fsdp": false,
|
| 50 |
+
"recipe_name": null
|
| 51 |
+
},
|
| 52 |
+
"job": {
|
| 53 |
+
"config_file": "flame/models/fla.toml",
|
| 54 |
+
"description": "default job",
|
| 55 |
+
"dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1",
|
| 56 |
+
"print_args": true,
|
| 57 |
+
"use_for_integration_test": false
|
| 58 |
+
},
|
| 59 |
+
"lr_scheduler": {
|
| 60 |
+
"decay_ratio": 1.0,
|
| 61 |
+
"decay_type": "linear",
|
| 62 |
+
"lr_min": 0.01,
|
| 63 |
+
"warmup_steps": 100
|
| 64 |
+
},
|
| 65 |
+
"memory_estimation": {
|
| 66 |
+
"disable_fake_mode": false,
|
| 67 |
+
"enabled": false
|
| 68 |
+
},
|
| 69 |
+
"metrics": {
|
| 70 |
+
"disable_color_printing": false,
|
| 71 |
+
"enable_tensorboard": true,
|
| 72 |
+
"enable_wandb": true,
|
| 73 |
+
"log_freq": 1,
|
| 74 |
+
"save_for_all_ranks": false,
|
| 75 |
+
"save_tb_folder": "tb"
|
| 76 |
+
},
|
| 77 |
+
"model": {
|
| 78 |
+
"config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/gdn_6_1_340M.json",
|
| 79 |
+
"converters": [],
|
| 80 |
+
"name": "fla",
|
| 81 |
+
"print_after_conversion": false,
|
| 82 |
+
"tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
|
| 83 |
+
},
|
| 84 |
+
"optimizer": {
|
| 85 |
+
"early_step_in_backward": false,
|
| 86 |
+
"eps": 1e-08,
|
| 87 |
+
"implementation": "fused",
|
| 88 |
+
"lr": 0.0003,
|
| 89 |
+
"name": "AdamW"
|
| 90 |
+
},
|
| 91 |
+
"profiling": {
|
| 92 |
+
"enable_memory_snapshot": false,
|
| 93 |
+
"enable_profiling": true,
|
| 94 |
+
"profile_freq": 512,
|
| 95 |
+
"save_memory_snapshot_folder": "memory_snapshot",
|
| 96 |
+
"save_traces_folder": "profile_trace"
|
| 97 |
+
},
|
| 98 |
+
"training": {
|
| 99 |
+
"batch_size": 16,
|
| 100 |
+
"compile": true,
|
| 101 |
+
"context_len": 8192,
|
| 102 |
+
"data_dir": null,
|
| 103 |
+
"data_files": null,
|
| 104 |
+
"data_parallel_replicate_degree": 1,
|
| 105 |
+
"data_parallel_shard_degree": -1,
|
| 106 |
+
"data_probs": "0.55,0.3,0.15",
|
| 107 |
+
"dataset": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/MegaMath/megamath-web-pro",
|
| 108 |
+
"dataset_name": "default,default,default",
|
| 109 |
+
"dataset_split": "train,train,train",
|
| 110 |
+
"deterministic": false,
|
| 111 |
+
"disable_loss_parallel": false,
|
| 112 |
+
"enable_cpu_offload": false,
|
| 113 |
+
"fsdp_reshard_after_forward": "default",
|
| 114 |
+
"gc_freq": 50,
|
| 115 |
+
"gradient_accumulation_steps": 1,
|
| 116 |
+
"max_norm": 1.0,
|
| 117 |
+
"mixed_precision_param": "bfloat16",
|
| 118 |
+
"mixed_precision_reduce": "float32",
|
| 119 |
+
"num_workers": 32,
|
| 120 |
+
"persistent_workers": false,
|
| 121 |
+
"pin_memory": false,
|
| 122 |
+
"prefetch_factor": 2,
|
| 123 |
+
"seed": 42,
|
| 124 |
+
"seq_len": 8192,
|
| 125 |
+
"skip_nan_inf": true,
|
| 126 |
+
"steps": 95366,
|
| 127 |
+
"streaming": true,
|
| 128 |
+
"tensor_parallel_degree": 1,
|
| 129 |
+
"varlen": false
|
| 130 |
+
}
|
| 131 |
+
}[39m
|
| 132 |
+
[titan] 2025-07-22 22:47:41,990 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
|
| 133 |
+
[titan] 2025-07-22 22:47:43,095 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
|
| 134 |
+
[titan] 2025-07-22 22:47:43,097 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
|
| 135 |
+
[titan] 2025-07-22 22:47:43,213 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
|
| 136 |
+
[titan] 2025-07-22 22:47:43,213 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
|
| 137 |
+
[titan] 2025-07-22 22:47:43,214 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
|
| 138 |
+
[titan] 2025-07-22 22:47:43,222 - root - INFO - Loading tokenizer...
|
| 139 |
+
[titan] 2025-07-22 22:47:43,405 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
|
| 140 |
+
0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 141 |
+
1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 142 |
+
2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 143 |
+
}
|
| 144 |
+
)
|
| 145 |
+
[titan] 2025-07-22 22:47:43,405 - root - INFO - Loading dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/MegaMath/megamath-web-pro:default,default,default
|
| 146 |
+
`trust_remote_code` is not supported anymore.
|
| 147 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 148 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 149 |
+
[titan] 2025-07-22 22:47:43,405 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 150 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 151 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 152 |
+
[titan] 2025-07-22 22:47:43,666 - root - INFO - Subset [36m/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample:default (p = 0.550)[39m:
|
| 153 |
+
IterableDataset({
|
| 154 |
+
features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
|
| 155 |
+
num_shards: 140
|
| 156 |
+
})
|
| 157 |
+
[titan] 2025-07-22 22:47:43,667 - root - INFO - Shuffling the dataset with seed 42
|
| 158 |
+
[titan] 2025-07-22 22:47:43,667 - root - WARNING - [31mDataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample has insufficient shards (140). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.[39m
|
| 159 |
+
`trust_remote_code` is not supported anymore.
|
| 160 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 161 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 162 |
+
[titan] 2025-07-22 22:47:43,667 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 163 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 164 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 165 |
+
[rank4]: Traceback (most recent call last):
|
| 166 |
+
[rank4]: File "<frozen runpy>", line 198, in _run_module_as_main
|
| 167 |
+
[rank4]: File "<frozen runpy>", line 88, in _run_code
|
| 168 |
+
[rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
|
| 169 |
+
[rank4]: main(config)
|
| 170 |
+
[rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
| 171 |
+
[rank4]: return f(*args, **kwargs)
|
| 172 |
+
[rank4]: ^^^^^^^^^^^^^^^^^^
|
| 173 |
+
[rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 155, in main
|
| 174 |
+
[rank4]: dataset = build_dataset(
|
| 175 |
+
[rank4]: ^^^^^^^^^^^^^^
|
| 176 |
+
[rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py", line 689, in build_dataset
|
| 177 |
+
[rank4]: subset = load_dataset(
|
| 178 |
+
[rank4]: ^^^^^^^^^^^^^
|
| 179 |
+
[rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1412, in load_dataset
|
| 180 |
+
[rank4]: builder_instance.download_and_prepare(
|
| 181 |
+
[rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/builder.py", line 829, in download_and_prepare
|
| 182 |
+
[rank4]: with FileLock(lock_path) if is_local else contextlib.nullcontext():
|
| 183 |
+
[rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/filelock/_api.py", line 376, in __enter__
|
| 184 |
+
[rank4]: self.acquire()
|
| 185 |
+
[rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/filelock/_api.py", line 344, in acquire
|
| 186 |
+
[rank4]: time.sleep(poll_interval)
|
| 187 |
+
[rank4]: KeyboardInterrupt
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/4/stdout.log
ADDED
|
File without changes
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/5/stderr.log
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[titan] 2025-07-22 22:47:41,984 - root - INFO - Starting job: default job
|
| 2 |
+
[titan] 2025-07-22 22:47:41,984 - root - INFO - [32m{
|
| 3 |
+
"activation_checkpoint": {
|
| 4 |
+
"mode": "none",
|
| 5 |
+
"selective_ac_option": "2"
|
| 6 |
+
},
|
| 7 |
+
"activation_offload": {
|
| 8 |
+
"mode": "none"
|
| 9 |
+
},
|
| 10 |
+
"checkpoint": {
|
| 11 |
+
"async_mode": "disabled",
|
| 12 |
+
"create_seed_checkpoint": false,
|
| 13 |
+
"enable_checkpoint": true,
|
| 14 |
+
"exclude_from_loading": [],
|
| 15 |
+
"export_dtype": "float32",
|
| 16 |
+
"folder": "checkpoint",
|
| 17 |
+
"interval": 8192,
|
| 18 |
+
"interval_type": "steps",
|
| 19 |
+
"keep_latest_k": 100,
|
| 20 |
+
"load_step": -1,
|
| 21 |
+
"model_weights_only": false
|
| 22 |
+
},
|
| 23 |
+
"comm": {
|
| 24 |
+
"init_timeout_seconds": 300,
|
| 25 |
+
"trace_buf_size": 20000,
|
| 26 |
+
"train_timeout_seconds": 100
|
| 27 |
+
},
|
| 28 |
+
"experimental": {
|
| 29 |
+
"context_parallel_degree": 1,
|
| 30 |
+
"context_parallel_rotate_method": "allgather",
|
| 31 |
+
"custom_model_path": "",
|
| 32 |
+
"enable_async_tensor_parallel": false,
|
| 33 |
+
"enable_compiled_autograd": false,
|
| 34 |
+
"pipeline_parallel_degree": 1,
|
| 35 |
+
"pipeline_parallel_microbatches": null,
|
| 36 |
+
"pipeline_parallel_schedule": "1F1B",
|
| 37 |
+
"pipeline_parallel_schedule_csv": "",
|
| 38 |
+
"pipeline_parallel_split_points": []
|
| 39 |
+
},
|
| 40 |
+
"fault_tolerance": {
|
| 41 |
+
"enable": false,
|
| 42 |
+
"group_size": 0,
|
| 43 |
+
"min_replica_size": 1,
|
| 44 |
+
"replica_id": 0
|
| 45 |
+
},
|
| 46 |
+
"float8": {
|
| 47 |
+
"enable_fsdp_float8_all_gather": false,
|
| 48 |
+
"force_recompute_fp8_weight_in_bwd": false,
|
| 49 |
+
"precompute_float8_dynamic_scale_for_fsdp": false,
|
| 50 |
+
"recipe_name": null
|
| 51 |
+
},
|
| 52 |
+
"job": {
|
| 53 |
+
"config_file": "flame/models/fla.toml",
|
| 54 |
+
"description": "default job",
|
| 55 |
+
"dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1",
|
| 56 |
+
"print_args": true,
|
| 57 |
+
"use_for_integration_test": false
|
| 58 |
+
},
|
| 59 |
+
"lr_scheduler": {
|
| 60 |
+
"decay_ratio": 1.0,
|
| 61 |
+
"decay_type": "linear",
|
| 62 |
+
"lr_min": 0.01,
|
| 63 |
+
"warmup_steps": 100
|
| 64 |
+
},
|
| 65 |
+
"memory_estimation": {
|
| 66 |
+
"disable_fake_mode": false,
|
| 67 |
+
"enabled": false
|
| 68 |
+
},
|
| 69 |
+
"metrics": {
|
| 70 |
+
"disable_color_printing": false,
|
| 71 |
+
"enable_tensorboard": true,
|
| 72 |
+
"enable_wandb": true,
|
| 73 |
+
"log_freq": 1,
|
| 74 |
+
"save_for_all_ranks": false,
|
| 75 |
+
"save_tb_folder": "tb"
|
| 76 |
+
},
|
| 77 |
+
"model": {
|
| 78 |
+
"config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/gdn_6_1_340M.json",
|
| 79 |
+
"converters": [],
|
| 80 |
+
"name": "fla",
|
| 81 |
+
"print_after_conversion": false,
|
| 82 |
+
"tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
|
| 83 |
+
},
|
| 84 |
+
"optimizer": {
|
| 85 |
+
"early_step_in_backward": false,
|
| 86 |
+
"eps": 1e-08,
|
| 87 |
+
"implementation": "fused",
|
| 88 |
+
"lr": 0.0003,
|
| 89 |
+
"name": "AdamW"
|
| 90 |
+
},
|
| 91 |
+
"profiling": {
|
| 92 |
+
"enable_memory_snapshot": false,
|
| 93 |
+
"enable_profiling": true,
|
| 94 |
+
"profile_freq": 512,
|
| 95 |
+
"save_memory_snapshot_folder": "memory_snapshot",
|
| 96 |
+
"save_traces_folder": "profile_trace"
|
| 97 |
+
},
|
| 98 |
+
"training": {
|
| 99 |
+
"batch_size": 16,
|
| 100 |
+
"compile": true,
|
| 101 |
+
"context_len": 8192,
|
| 102 |
+
"data_dir": null,
|
| 103 |
+
"data_files": null,
|
| 104 |
+
"data_parallel_replicate_degree": 1,
|
| 105 |
+
"data_parallel_shard_degree": -1,
|
| 106 |
+
"data_probs": "0.55,0.3,0.15",
|
| 107 |
+
"dataset": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/MegaMath/megamath-web-pro",
|
| 108 |
+
"dataset_name": "default,default,default",
|
| 109 |
+
"dataset_split": "train,train,train",
|
| 110 |
+
"deterministic": false,
|
| 111 |
+
"disable_loss_parallel": false,
|
| 112 |
+
"enable_cpu_offload": false,
|
| 113 |
+
"fsdp_reshard_after_forward": "default",
|
| 114 |
+
"gc_freq": 50,
|
| 115 |
+
"gradient_accumulation_steps": 1,
|
| 116 |
+
"max_norm": 1.0,
|
| 117 |
+
"mixed_precision_param": "bfloat16",
|
| 118 |
+
"mixed_precision_reduce": "float32",
|
| 119 |
+
"num_workers": 32,
|
| 120 |
+
"persistent_workers": false,
|
| 121 |
+
"pin_memory": false,
|
| 122 |
+
"prefetch_factor": 2,
|
| 123 |
+
"seed": 42,
|
| 124 |
+
"seq_len": 8192,
|
| 125 |
+
"skip_nan_inf": true,
|
| 126 |
+
"steps": 95366,
|
| 127 |
+
"streaming": true,
|
| 128 |
+
"tensor_parallel_degree": 1,
|
| 129 |
+
"varlen": false
|
| 130 |
+
}
|
| 131 |
+
}[39m
|
| 132 |
+
[titan] 2025-07-22 22:47:41,986 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
|
| 133 |
+
[titan] 2025-07-22 22:47:43,062 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
|
| 134 |
+
[titan] 2025-07-22 22:47:43,064 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
|
| 135 |
+
[titan] 2025-07-22 22:47:43,202 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
|
| 136 |
+
[titan] 2025-07-22 22:47:43,202 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
|
| 137 |
+
[titan] 2025-07-22 22:47:43,202 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
|
| 138 |
+
[titan] 2025-07-22 22:47:43,209 - root - INFO - Loading tokenizer...
|
| 139 |
+
[titan] 2025-07-22 22:47:43,394 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
|
| 140 |
+
0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 141 |
+
1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 142 |
+
2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 143 |
+
}
|
| 144 |
+
)
|
| 145 |
+
[titan] 2025-07-22 22:47:43,395 - root - INFO - Loading dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/MegaMath/megamath-web-pro:default,default,default
|
| 146 |
+
`trust_remote_code` is not supported anymore.
|
| 147 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 148 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 149 |
+
[titan] 2025-07-22 22:47:43,395 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 150 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 151 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 152 |
+
[titan] 2025-07-22 22:47:43,666 - root - INFO - Subset [36m/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample:default (p = 0.550)[39m:
|
| 153 |
+
IterableDataset({
|
| 154 |
+
features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
|
| 155 |
+
num_shards: 140
|
| 156 |
+
})
|
| 157 |
+
[titan] 2025-07-22 22:47:43,666 - root - INFO - Shuffling the dataset with seed 42
|
| 158 |
+
[titan] 2025-07-22 22:47:43,667 - root - WARNING - [31mDataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample has insufficient shards (140). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.[39m
|
| 159 |
+
`trust_remote_code` is not supported anymore.
|
| 160 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 161 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 162 |
+
[titan] 2025-07-22 22:47:43,667 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 163 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 164 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 165 |
+
[rank5]: Traceback (most recent call last):
|
| 166 |
+
[rank5]: File "<frozen runpy>", line 198, in _run_module_as_main
|
| 167 |
+
[rank5]: File "<frozen runpy>", line 88, in _run_code
|
| 168 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
|
| 169 |
+
[rank5]: main(config)
|
| 170 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
| 171 |
+
[rank5]: return f(*args, **kwargs)
|
| 172 |
+
[rank5]: ^^^^^^^^^^^^^^^^^^
|
| 173 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 155, in main
|
| 174 |
+
[rank5]: dataset = build_dataset(
|
| 175 |
+
[rank5]: ^^^^^^^^^^^^^^
|
| 176 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py", line 689, in build_dataset
|
| 177 |
+
[rank5]: subset = load_dataset(
|
| 178 |
+
[rank5]: ^^^^^^^^^^^^^
|
| 179 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1412, in load_dataset
|
| 180 |
+
[rank5]: builder_instance.download_and_prepare(
|
| 181 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/builder.py", line 829, in download_and_prepare
|
| 182 |
+
[rank5]: with FileLock(lock_path) if is_local else contextlib.nullcontext():
|
| 183 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/filelock/_api.py", line 376, in __enter__
|
| 184 |
+
[rank5]: self.acquire()
|
| 185 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/filelock/_api.py", line 344, in acquire
|
| 186 |
+
[rank5]: time.sleep(poll_interval)
|
| 187 |
+
[rank5]: KeyboardInterrupt
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/5/stdout.log
ADDED
|
File without changes
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/6/stderr.log
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[titan] 2025-07-22 22:47:41,727 - root - INFO - Starting job: default job
|
| 2 |
+
[titan] 2025-07-22 22:47:41,727 - root - INFO - [32m{
|
| 3 |
+
"activation_checkpoint": {
|
| 4 |
+
"mode": "none",
|
| 5 |
+
"selective_ac_option": "2"
|
| 6 |
+
},
|
| 7 |
+
"activation_offload": {
|
| 8 |
+
"mode": "none"
|
| 9 |
+
},
|
| 10 |
+
"checkpoint": {
|
| 11 |
+
"async_mode": "disabled",
|
| 12 |
+
"create_seed_checkpoint": false,
|
| 13 |
+
"enable_checkpoint": true,
|
| 14 |
+
"exclude_from_loading": [],
|
| 15 |
+
"export_dtype": "float32",
|
| 16 |
+
"folder": "checkpoint",
|
| 17 |
+
"interval": 8192,
|
| 18 |
+
"interval_type": "steps",
|
| 19 |
+
"keep_latest_k": 100,
|
| 20 |
+
"load_step": -1,
|
| 21 |
+
"model_weights_only": false
|
| 22 |
+
},
|
| 23 |
+
"comm": {
|
| 24 |
+
"init_timeout_seconds": 300,
|
| 25 |
+
"trace_buf_size": 20000,
|
| 26 |
+
"train_timeout_seconds": 100
|
| 27 |
+
},
|
| 28 |
+
"experimental": {
|
| 29 |
+
"context_parallel_degree": 1,
|
| 30 |
+
"context_parallel_rotate_method": "allgather",
|
| 31 |
+
"custom_model_path": "",
|
| 32 |
+
"enable_async_tensor_parallel": false,
|
| 33 |
+
"enable_compiled_autograd": false,
|
| 34 |
+
"pipeline_parallel_degree": 1,
|
| 35 |
+
"pipeline_parallel_microbatches": null,
|
| 36 |
+
"pipeline_parallel_schedule": "1F1B",
|
| 37 |
+
"pipeline_parallel_schedule_csv": "",
|
| 38 |
+
"pipeline_parallel_split_points": []
|
| 39 |
+
},
|
| 40 |
+
"fault_tolerance": {
|
| 41 |
+
"enable": false,
|
| 42 |
+
"group_size": 0,
|
| 43 |
+
"min_replica_size": 1,
|
| 44 |
+
"replica_id": 0
|
| 45 |
+
},
|
| 46 |
+
"float8": {
|
| 47 |
+
"enable_fsdp_float8_all_gather": false,
|
| 48 |
+
"force_recompute_fp8_weight_in_bwd": false,
|
| 49 |
+
"precompute_float8_dynamic_scale_for_fsdp": false,
|
| 50 |
+
"recipe_name": null
|
| 51 |
+
},
|
| 52 |
+
"job": {
|
| 53 |
+
"config_file": "flame/models/fla.toml",
|
| 54 |
+
"description": "default job",
|
| 55 |
+
"dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1",
|
| 56 |
+
"print_args": true,
|
| 57 |
+
"use_for_integration_test": false
|
| 58 |
+
},
|
| 59 |
+
"lr_scheduler": {
|
| 60 |
+
"decay_ratio": 1.0,
|
| 61 |
+
"decay_type": "linear",
|
| 62 |
+
"lr_min": 0.01,
|
| 63 |
+
"warmup_steps": 100
|
| 64 |
+
},
|
| 65 |
+
"memory_estimation": {
|
| 66 |
+
"disable_fake_mode": false,
|
| 67 |
+
"enabled": false
|
| 68 |
+
},
|
| 69 |
+
"metrics": {
|
| 70 |
+
"disable_color_printing": false,
|
| 71 |
+
"enable_tensorboard": true,
|
| 72 |
+
"enable_wandb": true,
|
| 73 |
+
"log_freq": 1,
|
| 74 |
+
"save_for_all_ranks": false,
|
| 75 |
+
"save_tb_folder": "tb"
|
| 76 |
+
},
|
| 77 |
+
"model": {
|
| 78 |
+
"config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/gdn_6_1_340M.json",
|
| 79 |
+
"converters": [],
|
| 80 |
+
"name": "fla",
|
| 81 |
+
"print_after_conversion": false,
|
| 82 |
+
"tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
|
| 83 |
+
},
|
| 84 |
+
"optimizer": {
|
| 85 |
+
"early_step_in_backward": false,
|
| 86 |
+
"eps": 1e-08,
|
| 87 |
+
"implementation": "fused",
|
| 88 |
+
"lr": 0.0003,
|
| 89 |
+
"name": "AdamW"
|
| 90 |
+
},
|
| 91 |
+
"profiling": {
|
| 92 |
+
"enable_memory_snapshot": false,
|
| 93 |
+
"enable_profiling": true,
|
| 94 |
+
"profile_freq": 512,
|
| 95 |
+
"save_memory_snapshot_folder": "memory_snapshot",
|
| 96 |
+
"save_traces_folder": "profile_trace"
|
| 97 |
+
},
|
| 98 |
+
"training": {
|
| 99 |
+
"batch_size": 16,
|
| 100 |
+
"compile": true,
|
| 101 |
+
"context_len": 8192,
|
| 102 |
+
"data_dir": null,
|
| 103 |
+
"data_files": null,
|
| 104 |
+
"data_parallel_replicate_degree": 1,
|
| 105 |
+
"data_parallel_shard_degree": -1,
|
| 106 |
+
"data_probs": "0.55,0.3,0.15",
|
| 107 |
+
"dataset": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/MegaMath/megamath-web-pro",
|
| 108 |
+
"dataset_name": "default,default,default",
|
| 109 |
+
"dataset_split": "train,train,train",
|
| 110 |
+
"deterministic": false,
|
| 111 |
+
"disable_loss_parallel": false,
|
| 112 |
+
"enable_cpu_offload": false,
|
| 113 |
+
"fsdp_reshard_after_forward": "default",
|
| 114 |
+
"gc_freq": 50,
|
| 115 |
+
"gradient_accumulation_steps": 1,
|
| 116 |
+
"max_norm": 1.0,
|
| 117 |
+
"mixed_precision_param": "bfloat16",
|
| 118 |
+
"mixed_precision_reduce": "float32",
|
| 119 |
+
"num_workers": 32,
|
| 120 |
+
"persistent_workers": false,
|
| 121 |
+
"pin_memory": false,
|
| 122 |
+
"prefetch_factor": 2,
|
| 123 |
+
"seed": 42,
|
| 124 |
+
"seq_len": 8192,
|
| 125 |
+
"skip_nan_inf": true,
|
| 126 |
+
"steps": 95366,
|
| 127 |
+
"streaming": true,
|
| 128 |
+
"tensor_parallel_degree": 1,
|
| 129 |
+
"varlen": false
|
| 130 |
+
}
|
| 131 |
+
}[39m
|
| 132 |
+
[titan] 2025-07-22 22:47:41,729 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
|
| 133 |
+
[titan] 2025-07-22 22:47:42,344 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
|
| 134 |
+
[titan] 2025-07-22 22:47:42,347 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
|
| 135 |
+
[titan] 2025-07-22 22:47:42,391 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
|
| 136 |
+
[titan] 2025-07-22 22:47:42,391 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
|
| 137 |
+
[titan] 2025-07-22 22:47:42,391 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
|
| 138 |
+
[titan] 2025-07-22 22:47:42,708 - root - INFO - Loading tokenizer...
|
| 139 |
+
[titan] 2025-07-22 22:47:43,145 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
|
| 140 |
+
0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 141 |
+
1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 142 |
+
2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 143 |
+
}
|
| 144 |
+
)
|
| 145 |
+
[titan] 2025-07-22 22:47:43,145 - root - INFO - Loading dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/MegaMath/megamath-web-pro:default,default,default
|
| 146 |
+
`trust_remote_code` is not supported anymore.
|
| 147 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 148 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 149 |
+
[titan] 2025-07-22 22:47:43,146 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 150 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 151 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 152 |
+
[titan] 2025-07-22 22:47:43,667 - root - INFO - Subset [36m/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample:default (p = 0.550)[39m:
|
| 153 |
+
IterableDataset({
|
| 154 |
+
features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
|
| 155 |
+
num_shards: 140
|
| 156 |
+
})
|
| 157 |
+
[titan] 2025-07-22 22:47:43,667 - root - INFO - Shuffling the dataset with seed 42
|
| 158 |
+
[titan] 2025-07-22 22:47:43,667 - root - WARNING - [31mDataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample has insufficient shards (140). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.[39m
|
| 159 |
+
`trust_remote_code` is not supported anymore.
|
| 160 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 161 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 162 |
+
[titan] 2025-07-22 22:47:43,668 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 163 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 164 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 165 |
+
[rank6]: Traceback (most recent call last):
|
| 166 |
+
[rank6]: File "<frozen runpy>", line 198, in _run_module_as_main
|
| 167 |
+
[rank6]: File "<frozen runpy>", line 88, in _run_code
|
| 168 |
+
[rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
|
| 169 |
+
[rank6]: main(config)
|
| 170 |
+
[rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
| 171 |
+
[rank6]: return f(*args, **kwargs)
|
| 172 |
+
[rank6]: ^^^^^^^^^^^^^^^^^^
|
| 173 |
+
[rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 155, in main
|
| 174 |
+
[rank6]: dataset = build_dataset(
|
| 175 |
+
[rank6]: ^^^^^^^^^^^^^^
|
| 176 |
+
[rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py", line 689, in build_dataset
|
| 177 |
+
[rank6]: subset = load_dataset(
|
| 178 |
+
[rank6]: ^^^^^^^^^^^^^
|
| 179 |
+
[rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1412, in load_dataset
|
| 180 |
+
[rank6]: builder_instance.download_and_prepare(
|
| 181 |
+
[rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/builder.py", line 829, in download_and_prepare
|
| 182 |
+
[rank6]: with FileLock(lock_path) if is_local else contextlib.nullcontext():
|
| 183 |
+
[rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/filelock/_api.py", line 376, in __enter__
|
| 184 |
+
[rank6]: self.acquire()
|
| 185 |
+
[rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/filelock/_api.py", line 344, in acquire
|
| 186 |
+
[rank6]: time.sleep(poll_interval)
|
| 187 |
+
[rank6]: KeyboardInterrupt
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/6/stdout.log
ADDED
|
File without changes
|