IvanHU committed
Commit ec3be61 (verified) · 1 Parent(s): 5a7e3bf

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.

Files changed (50):
  1. .gitattributes +8 -0
  2. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/8k-100.sh +65 -0
  3. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/config.json +57 -0
  4. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/delta_net_1B.json +29 -0
  5. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/delta_net_340M.json +26 -0
  6. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gated_deltanet_1B.json +22 -0
  7. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gated_deltanet_340M.json +22 -0
  8. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gdn_6_1_340M.json +50 -0
  9. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gla_340M.json +24 -0
  10. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gla_7B.json +25 -0
  11. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gsa_340M.json +29 -0
  12. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/hgrn2_340M.json +20 -0
  13. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/mamba2_1B.json +32 -0
  14. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/mamba2_340M.json +32 -0
  15. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/mamba2_6_1_340M.json +50 -0
  16. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/mamba_1B.json +30 -0
  17. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/mamba_340M.json +30 -0
  18. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/samba_1B.json +52 -0
  19. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/sba_340m.json +18 -0
  20. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/transformer_1B.json +22 -0
  21. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/transformer_340M.json +18 -0
  22. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/transformer_7B.json +21 -0
  23. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/generation_config.json +7 -0
  24. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/0/error.json +1 -0
  25. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/0/stderr.log +463 -0
  26. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/0/stdout.log +0 -0
  27. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/1/error.json +1 -0
  28. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/1/stderr.log +387 -0
  29. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/1/stdout.log +0 -0
  30. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/2/error.json +1 -0
  31. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/2/stderr.log +387 -0
  32. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/2/stdout.log +0 -0
  33. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/3/error.json +1 -0
  34. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/3/stderr.log +387 -0
  35. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/3/stdout.log +0 -0
  36. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/4/error.json +1 -0
  37. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/4/stderr.log +387 -0
  38. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/4/stdout.log +0 -0
  39. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/5/error.json +1 -0
  40. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/5/stderr.log +387 -0
  41. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/5/stdout.log +0 -0
  42. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/6/error.json +1 -0
  43. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/6/stderr.log +387 -0
  44. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/6/stdout.log +0 -0
  45. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/7/error.json +1 -0
  46. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/7/stderr.log +387 -0
  47. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/7/stdout.log +0 -0
  48. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_77qh1j5t/attempt_0/0/error.json +1 -0
  49. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_77qh1j5t/attempt_0/0/stderr.log +467 -0
  50. mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_77qh1j5t/attempt_0/0/stdout.log +0 -0
.gitattributes CHANGED
@@ -57,3 +57,11 @@ gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn
  gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/5/stderr.log filter=lfs diff=lfs merge=lfs -text
  gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/6/stderr.log filter=lfs diff=lfs merge=lfs -text
  gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/7/stderr.log filter=lfs diff=lfs merge=lfs -text
+ mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_v3h3fbcf/attempt_0/0/stderr.log filter=lfs diff=lfs merge=lfs -text
+ mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_v3h3fbcf/attempt_0/1/stderr.log filter=lfs diff=lfs merge=lfs -text
+ mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_v3h3fbcf/attempt_0/2/stderr.log filter=lfs diff=lfs merge=lfs -text
+ mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_v3h3fbcf/attempt_0/3/stderr.log filter=lfs diff=lfs merge=lfs -text
+ mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_v3h3fbcf/attempt_0/4/stderr.log filter=lfs diff=lfs merge=lfs -text
+ mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_v3h3fbcf/attempt_0/5/stderr.log filter=lfs diff=lfs merge=lfs -text
+ mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_v3h3fbcf/attempt_0/6/stderr.log filter=lfs diff=lfs merge=lfs -text
+ mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_v3h3fbcf/attempt_0/7/stderr.log filter=lfs diff=lfs merge=lfs -text
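These attribute lines route the eight new per-rank stderr logs through Git LFS and mark them as binary (`filter=lfs diff=lfs merge=lfs -text`); they are the entries `git lfs track "<path>"` would append, and huggingface_hub appears to add them automatically when uploaded files exceed the Hub's LFS size threshold.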
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/8k-100.sh ADDED
@@ -0,0 +1,65 @@
+ FLAME_PATH=/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame
+ DATASET_ROOT=/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset
+ TOKENIZER=/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer
+
+ cd $FLAME_PATH
+ source .venv/bin/activate
+
+ # =========== train config ===========
+ CONFIG=${1:-transformer_340M.json}
+ SEQ_LEN=8192
+ WARMUP_STEPS=100
+ STEPS=95366
+ LR=3e-4
+ BATCH_SIZE=8
+ GAS=2
+ DECAY_TYPE=linear
+ DECAY_RATIO=1
+ NNODE=1
+ NGPU=8
+ LOG_RANK=0
+ # ====================================
+
+ # if jq command is not found, install it
+ if ! command -v jq &> /dev/null; then
+     echo "jq could not be found, installing it..."
+     sudo yum install -y jq
+ fi
+
+ EXP_NAME=$(basename $CONFIG | sed 's/\.config//')-ctx${SEQ_LEN}-steps${STEPS}-lr${LR}-decay_type${DECAY_TYPE}-decay_ratio${DECAY_RATIO}-bs${BATCH_SIZE}-nn${NNODE}-gas${GAS}
+
+ bash train.sh \
+     --job.config_file flame/models/fla.toml \
+     --job.dump_folder $FLAME_PATH/exp/$EXP_NAME \
+     --model.config $FLAME_PATH/configs/$CONFIG \
+     --model.tokenizer_path $TOKENIZER \
+     --optimizer.name AdamW \
+     --optimizer.eps 1e-8 \
+     --optimizer.lr $LR \
+     --lr_scheduler.warmup_steps $WARMUP_STEPS \
+     --lr_scheduler.lr_min 0.01 \
+     --lr_scheduler.decay_type $DECAY_TYPE \
+     --lr_scheduler.decay_ratio $DECAY_RATIO \
+     --training.batch_size $BATCH_SIZE \
+     --training.seq_len $SEQ_LEN \
+     --training.context_len $SEQ_LEN \
+     --training.gradient_accumulation_steps $GAS \
+     --training.steps $STEPS \
+     --training.max_norm 1.0 \
+     --training.skip_nan_inf \
+     --training.dataset $DATASET_ROOT/fineweb-edu-sample,$DATASET_ROOT/small_repos_20B_sample_merged,$DATASET_ROOT/megamath-web-pro \
+     --training.data_probs 0.55,0.3,0.15 \
+     --training.dataset_split train,train,train \
+     --training.dataset_name default,default,default \
+     --training.streaming \
+     --training.num_workers 32 \
+     --training.prefetch_factor 2 \
+     --training.seed 42 \
+     --training.compile \
+     --checkpoint.interval 8192 \
+     --checkpoint.load_step -1 \
+     --checkpoint.keep_latest_k 100 \
+     --metrics.log_freq 1 \
+     --metrics.enable_tensorboard \
+     --training.streaming
+
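For reference, this experiment corresponds to invoking the script as `bash 8k-100.sh mamba2_6_1_340M.json` (inferred from the repository name, not stated in the diff): EXP_NAME then expands to mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2, i.e. this repository's folder name. The `.json` extension survives in the name because `sed 's/\.config//'` strips a `.config` suffix these config files do not carry; note also that `--training.streaming` is passed twice, which is redundant but harmless.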
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/config.json ADDED
@@ -0,0 +1,57 @@
+ {
+   "architectures": [
+     "Mamba2ForCausalLM"
+   ],
+   "attn": {
+     "layers": [
+       5,
+       11,
+       17,
+       23
+     ],
+     "num_heads": 16,
+     "num_kv_heads": 8,
+     "qkv_bias": false,
+     "rope_theta": 160000.0,
+     "window_size": null
+   },
+   "attn_mode": "chunk",
+   "bos_token_id": 1,
+   "chunk_size": 256,
+   "conv_kernel": 4,
+   "eos_token_id": 2,
+   "expand": 2,
+   "fuse_cross_entropy": true,
+   "fuse_norm": true,
+   "fuse_swiglu": true,
+   "head_dim": 64,
+   "hidden_act": "silu",
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "model_type": "mamba2",
+   "n_groups": 1,
+   "norm_eps": 1e-05,
+   "num_heads": 32,
+   "num_hidden_layers": 48,
+   "pad_token_id": 0,
+   "rescale_prenorm_residual": true,
+   "residual_in_fp32": true,
+   "rms_norm": true,
+   "state_size": 128,
+   "tie_word_embeddings": false,
+   "time_step_floor": 0.0001,
+   "time_step_limit": [
+     0.0,
+     Infinity
+   ],
+   "time_step_max": 0.1,
+   "time_step_min": 0.001,
+   "time_step_rank": 128,
+   "torch_dtype": "float32",
+   "transformers_version": "4.53.3",
+   "use_bias": false,
+   "use_cache": true,
+   "use_conv_bias": true,
+   "use_l2warp": false,
+   "vocab_size": 32000
+ }
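The config above can be rebuilt into a model object for inspection. A minimal sketch, assuming the flash-linear-attention package (imported as `fla`, as in the tracebacks below) is installed and that importing it registers this custom `mamba2` config/model pair with transformers' Auto classes:

    import fla  # noqa: F401 -- assumed side effect: registers fla configs/models with transformers
    from transformers import AutoConfig, AutoModelForCausalLM

    # Hypothetical local path: the directory holding the config.json shown above.
    repo = "mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2"
    config = AutoConfig.from_pretrained(repo)
    print(config.model_type, config.num_hidden_layers)  # mamba2 48
    print(config.attn["layers"])                        # [5, 11, 17, 23], the hybrid attention blocks
    model = AutoModelForCausalLM.from_config(config)    # randomly initialized; load weights separately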
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/delta_net_1B.json ADDED
@@ -0,0 +1,29 @@
+ {
+   "attn": null,
+   "attn_mode": "chunk",
+   "bos_token_id": 1,
+   "conv_size": 4,
+   "eos_token_id": 2,
+   "expand_k": 1,
+   "expand_v": 1,
+   "fuse_cross_entropy": true,
+   "fuse_norm": true,
+   "hidden_act": "swish",
+   "hidden_ratio": 4,
+   "hidden_size": 2048,
+   "initializer_range": 0.02,
+   "intermediate_size": null,
+   "model_type": "delta_net",
+   "norm_eps": 1e-06,
+   "num_heads": 16,
+   "num_hidden_layers": 24,
+   "pad_token_id": 2,
+   "qk_activation": "silu",
+   "qk_norm": "l2",
+   "tie_word_embeddings": false,
+   "use_beta": true,
+   "use_cache": true,
+   "use_gate": false,
+   "use_output_norm": true,
+   "use_short_conv": true
+ }
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/delta_net_340M.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "attn_mode": "chunk",
+   "bos_token_id": 1,
+   "conv_size": 4,
+   "eos_token_id": 2,
+   "expand_k": 1,
+   "expand_v": 1,
+   "fuse_cross_entropy": true,
+   "hidden_act": "swish",
+   "hidden_ratio": 4,
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "intermediate_size": null,
+   "model_type": "delta_net",
+   "norm_eps": 1e-06,
+   "num_heads": 8,
+   "num_hidden_layers": 24,
+   "qk_activation": "silu",
+   "qk_norm": "l2",
+   "tie_word_embeddings": false,
+   "use_beta": true,
+   "use_cache": true,
+   "use_gate": false,
+   "use_output_norm": true,
+   "use_short_conv": true
+ }
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gated_deltanet_1B.json ADDED
@@ -0,0 +1,22 @@
+ {
+   "attn_mode": "chunk",
+   "bos_token_id": 1,
+   "conv_size": 4,
+   "eos_token_id": 2,
+   "expand_v": 2,
+   "fuse_cross_entropy": true,
+   "head_dim": 256,
+   "hidden_act": "swish",
+   "hidden_ratio": 4,
+   "hidden_size": 2048,
+   "initializer_range": 0.02,
+   "intermediate_size": null,
+   "model_type": "gated_deltanet",
+   "norm_eps": 1e-06,
+   "num_heads": 6,
+   "num_hidden_layers": 21,
+   "tie_word_embeddings": false,
+   "use_cache": true,
+   "use_gate": true,
+   "use_short_conv": true
+ }
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gated_deltanet_340M.json ADDED
@@ -0,0 +1,22 @@
+ {
+   "attn_mode": "chunk",
+   "bos_token_id": 1,
+   "conv_size": 4,
+   "eos_token_id": 2,
+   "expand_v": 2,
+   "fuse_cross_entropy": true,
+   "head_dim": 256,
+   "hidden_act": "swish",
+   "hidden_ratio": 4,
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "intermediate_size": null,
+   "model_type": "gated_deltanet",
+   "norm_eps": 1e-06,
+   "num_heads": 6,
+   "num_hidden_layers": 21,
+   "tie_word_embeddings": false,
+   "use_cache": true,
+   "use_gate": true,
+   "use_short_conv": true
+ }
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gdn_6_1_340M.json ADDED
@@ -0,0 +1,50 @@
+ {
+   "architectures": [
+     "GatedDeltaNetForCausalLM"
+   ],
+   "attn": {
+     "layers": [
+       5,
+       11,
+       17,
+       23
+     ],
+     "num_heads": 16,
+     "num_kv_heads": 8,
+     "qkv_bias": false,
+     "rope_theta": 160000.0,
+     "window_size": null
+   },
+   "attn_mode": "chunk",
+   "bos_token_id": 1,
+   "conv_size": 4,
+   "eos_token_id": 2,
+   "expand_k": 1,
+   "expand_v": 1,
+   "fuse_cross_entropy": true,
+   "fuse_norm": true,
+   "fuse_swiglu": true,
+   "head_dim": 256,
+   "hidden_act": "swish",
+   "hidden_ratio": 4,
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "intermediate_size": null,
+   "max_position_embeddings": 8192,
+   "model_type": "gated_deltanet",
+   "norm_eps": 1e-06,
+   "norm_first": false,
+   "num_heads": 4,
+   "num_hidden_layers": 24,
+   "qk_activation": "silu",
+   "qk_norm": "l2",
+   "tie_word_embeddings": false,
+   "torch_dtype": "float32",
+   "transformers_version": "4.51.3",
+   "use_beta": true,
+   "use_cache": true,
+   "use_gate": true,
+   "use_output_norm": true,
+   "use_short_conv": true,
+   "vocab_size": 32000
+ }
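A note on the `6_1` naming (an inference from the indices, not documented in the diff): with 24 layers and softmax attention at layers 5, 11, 17, and 23, every sixth block is an attention layer and the other five are gated-DeltaNet blocks:

    # Hypothetical check of the assumed layout: one attention block per six layers.
    attn_layers = [i for i in range(24) if i % 6 == 5]
    print(attn_layers)  # [5, 11, 17, 23] -- matches "attn.layers" in the config above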
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gla_340M.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "attn_mode": "chunk",
+   "bos_token_id": 1,
+   "clamp_min": null,
+   "eos_token_id": 2,
+   "expand_k": 0.5,
+   "expand_v": 1,
+   "fuse_cross_entropy": true,
+   "fuse_norm": true,
+   "hidden_act": "swish",
+   "hidden_ratio": 4,
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "intermediate_size": null,
+   "model_type": "gla",
+   "num_heads": 4,
+   "num_hidden_layers": 24,
+   "norm_eps": 1e-06,
+   "tie_word_embeddings": false,
+   "use_cache": true,
+   "use_gk": true,
+   "use_gv": false,
+   "vocab_size": 32000
+ }
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gla_7B.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "attn": null,
+   "attn_mode": "chunk",
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "expand_k": 0.5,
+   "expand_v": 1,
+   "fuse_cross_entropy": true,
+   "fuse_norm": true,
+   "hidden_act": "swish",
+   "hidden_ratio": 4,
+   "hidden_size": 4096,
+   "initializer_range": 0.02,
+   "intermediate_size": 11008,
+   "model_type": "gla",
+   "norm_eps": 1e-06,
+   "num_heads": 16,
+   "num_hidden_layers": 32,
+   "tie_word_embeddings": false,
+   "use_cache": true,
+   "use_gk": true,
+   "use_gv": false,
+   "use_output_gate": true,
+   "use_short_conv": false
+ }
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gsa_340M.json ADDED
@@ -0,0 +1,29 @@
+ {
+   "bos_token_id": 1,
+   "conv_size": 4,
+   "eos_token_id": 2,
+   "expand_k": 1,
+   "expand_v": 1,
+   "elementwise_affine": false,
+   "feature_map": "swish",
+   "fuse_cross_entropy": true,
+   "fuse_norm": true,
+   "gate_logit_normalizer": 4,
+   "hidden_act": "swish",
+   "hidden_ratio": 4,
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "intermediate_size": null,
+   "model_type": "gsa",
+   "num_heads": 4,
+   "num_hidden_layers": 24,
+   "num_slots": 64,
+   "norm_eps": 1e-06,
+   "share_conv_kernel": true,
+   "tie_word_embeddings": false,
+   "use_cache": true,
+   "use_norm": true,
+   "use_output_gate": true,
+   "use_rope": false,
+   "use_short_conv": false
+ }
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/hgrn2_340M.json ADDED
@@ -0,0 +1,20 @@
+ {
+   "attn_mode": "chunk",
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "expand_ratio": 128,
+   "fuse_cross_entropy": true,
+   "fuse_norm": true,
+   "hidden_act": "swish",
+   "hidden_ratio": 4,
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "intermediate_size": null,
+   "model_type": "hgrn2",
+   "num_heads": 8,
+   "num_hidden_layers": 24,
+   "norm_eps": 1e-06,
+   "tie_word_embeddings": false,
+   "use_cache": true,
+   "vocab_size": 32000
+ }
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/mamba2_1B.json ADDED
@@ -0,0 +1,32 @@
+ {
+   "bos_token_id": 1,
+   "chunk_size": 256,
+   "conv_kernel": 4,
+   "eos_token_id": 2,
+   "expand": 2,
+   "fuse_cross_entropy": true,
+   "fuse_norm": true,
+   "head_dim": 64,
+   "hidden_act": "silu",
+   "hidden_size": 2048,
+   "initializer_range": 0.02,
+   "norm_eps": 1e-05,
+   "model_type": "mamba2",
+   "n_groups": 1,
+   "num_hidden_layers": 48,
+   "pad_token_id": 0,
+   "rescale_prenorm_residual": true,
+   "residual_in_fp32": true,
+   "rms_norm": true,
+   "state_size": 128,
+   "tie_word_embeddings": false,
+   "time_step_floor": 0.0001,
+   "time_step_max": 0.1,
+   "time_step_min": 0.001,
+   "time_step_rank": 128,
+   "transformers_version": "4.50.1",
+   "use_bias": false,
+   "use_cache": true,
+   "use_conv_bias": true,
+   "vocab_size": 32000
+ }
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/mamba2_340M.json ADDED
@@ -0,0 +1,32 @@
+ {
+   "bos_token_id": 1,
+   "chunk_size": 256,
+   "conv_kernel": 4,
+   "eos_token_id": 2,
+   "expand": 2,
+   "fuse_cross_entropy": true,
+   "fuse_norm": true,
+   "head_dim": 64,
+   "hidden_act": "silu",
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "norm_eps": 1e-05,
+   "model_type": "mamba2",
+   "n_groups": 1,
+   "num_hidden_layers": 48,
+   "pad_token_id": 0,
+   "rescale_prenorm_residual": true,
+   "residual_in_fp32": true,
+   "rms_norm": true,
+   "state_size": 128,
+   "tie_word_embeddings": false,
+   "time_step_floor": 0.0001,
+   "time_step_max": 0.1,
+   "time_step_min": 0.001,
+   "time_step_rank": 128,
+   "transformers_version": "4.50.1",
+   "use_bias": false,
+   "use_cache": true,
+   "use_conv_bias": true,
+   "vocab_size": 32000
+ }
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/mamba2_6_1_340M.json ADDED
@@ -0,0 +1,50 @@
+ {
+   "architectures": [
+     "Mamba2ForCausalLM"
+   ],
+   "attn": {
+     "layers": [
+       5,
+       11,
+       17,
+       23
+     ],
+     "num_heads": 16,
+     "num_kv_heads": 8,
+     "qkv_bias": false,
+     "rope_theta": 160000.0,
+     "window_size": null
+   },
+   "attn_mode": "chunk",
+   "bos_token_id": 1,
+   "chunk_size": 256,
+   "conv_kernel": 4,
+   "eos_token_id": 2,
+   "expand": 2,
+   "fuse_cross_entropy": true,
+   "fuse_norm": true,
+   "fuse_swiglu": true,
+   "head_dim": 64,
+   "hidden_act": "silu",
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "norm_eps": 1e-05,
+   "model_type": "mamba2",
+   "n_groups": 1,
+   "num_hidden_layers": 48,
+   "pad_token_id": 0,
+   "rescale_prenorm_residual": true,
+   "residual_in_fp32": true,
+   "rms_norm": true,
+   "state_size": 128,
+   "tie_word_embeddings": false,
+   "time_step_floor": 0.0001,
+   "time_step_max": 0.1,
+   "time_step_min": 0.001,
+   "time_step_rank": 128,
+   "transformers_version": "4.50.1",
+   "use_bias": false,
+   "use_cache": true,
+   "use_conv_bias": true,
+   "vocab_size": 32000
+ }
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/mamba_1B.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "bos_token_id": 1,
+   "conv_kernel": 4,
+   "eos_token_id": 2,
+   "expand": 2,
+   "fuse_cross_entropy": true,
+   "fuse_norm": true,
+   "hidden_act": "silu",
+   "hidden_size": 2048,
+   "initializer_range": 0.02,
+   "model_type": "mamba",
+   "norm_eps": 1e-05,
+   "num_hidden_layers": 48,
+   "pad_token_id": 0,
+   "rescale_prenorm_residual": false,
+   "residual_in_fp32": false,
+   "state_size": 16,
+   "tie_word_embeddings": false,
+   "time_step_floor": 0.0001,
+   "time_step_init_scheme": "random",
+   "time_step_max": 0.1,
+   "time_step_min": 0.001,
+   "time_step_rank": 128,
+   "time_step_scale": 1.0,
+   "transformers_version": "4.50.1",
+   "use_bias": false,
+   "use_cache": true,
+   "use_conv_bias": true,
+   "vocab_size": 32000
+ }
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/mamba_340M.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "bos_token_id": 1,
+   "conv_kernel": 4,
+   "eos_token_id": 2,
+   "expand": 2,
+   "fuse_cross_entropy": true,
+   "fuse_norm": true,
+   "hidden_act": "silu",
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "model_type": "mamba",
+   "norm_eps": 1e-05,
+   "num_hidden_layers": 48,
+   "pad_token_id": 0,
+   "rescale_prenorm_residual": false,
+   "residual_in_fp32": false,
+   "state_size": 16,
+   "tie_word_embeddings": false,
+   "time_step_floor": 0.0001,
+   "time_step_init_scheme": "random",
+   "time_step_max": 0.1,
+   "time_step_min": 0.001,
+   "time_step_rank": 128,
+   "time_step_scale": 1.0,
+   "transformers_version": "4.50.1",
+   "use_bias": false,
+   "use_cache": true,
+   "use_conv_bias": true,
+   "vocab_size": 32000
+ }
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/samba_1B.json ADDED
@@ -0,0 +1,52 @@
+ {
+   "attn": {
+     "layers": [
+       1,
+       3,
+       5,
+       7,
+       9,
+       11,
+       13,
+       15,
+       17
+     ],
+     "num_heads": 18,
+     "num_kv_heads": 18,
+     "qkv_bias": false,
+     "rope_theta": 10000.0,
+     "window_size": 2048
+   },
+   "bos_token_id": 1,
+   "conv_kernel": 4,
+   "eos_token_id": 2,
+   "expand": 2,
+   "fuse_cross_entropy": true,
+   "fuse_norm": true,
+   "fuse_swiglu": true,
+   "hidden_act": "swish",
+   "hidden_ratio": 4,
+   "hidden_size": 2304,
+   "initializer_range": 0.02,
+   "intermediate_size": 4608,
+   "max_position_embeddings": 2048,
+   "model_type": "samba",
+   "norm_eps": 1e-05,
+   "num_hidden_layers": 18,
+   "pad_token_id": 0,
+   "rescale_prenorm_residual": false,
+   "residual_in_fp32": false,
+   "state_size": 16,
+   "tie_word_embeddings": false,
+   "time_step_floor": 0.0001,
+   "time_step_init_scheme": "random",
+   "time_step_max": 0.1,
+   "time_step_min": 0.001,
+   "time_step_rank": 144,
+   "time_step_scale": 1.0,
+   "transformers_version": "4.50.1",
+   "use_bias": false,
+   "use_cache": true,
+   "use_conv_bias": true,
+   "vocab_size": 32000
+ }
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/sba_340m.json ADDED
@@ -0,0 +1,18 @@
+ {
+   "attention_bias": false,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "fuse_cross_entropy": true,
+   "fuse_norm": true,
+   "hidden_act": "swish",
+   "hidden_size": 1024,
+   "initializer_range": 0.006,
+   "max_position_embeddings": 8192,
+   "model_type": "sba",
+   "num_heads": 16,
+   "num_hidden_layers": 24,
+   "norm_eps": 1e-06,
+   "tie_word_embeddings": false,
+   "use_cache": true,
+   "vocab_size": 32000
+ }
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/transformer_1B.json ADDED
@@ -0,0 +1,22 @@
+ {
+   "bos_token_id": 1,
+   "elementwise_affine": true,
+   "eos_token_id": 2,
+   "fuse_cross_entropy": true,
+   "fuse_norm": true,
+   "fuse_swiglu": true,
+   "hidden_act": "swish",
+   "hidden_ratio": 4,
+   "hidden_size": 2048,
+   "initializer_range": 0.02,
+   "intermediate_size": null,
+   "max_position_embeddings": 8192,
+   "model_type": "transformer",
+   "norm_eps": 1e-06,
+   "num_heads": 32,
+   "num_hidden_layers": 24,
+   "num_kv_heads": null,
+   "pad_token_id": 2,
+   "rope_theta": 10000.0,
+   "tie_word_embeddings": false
+ }
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/transformer_340M.json ADDED
@@ -0,0 +1,18 @@
+ {
+   "attention_bias": false,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "fuse_cross_entropy": true,
+   "fuse_norm": true,
+   "hidden_act": "swish",
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "max_position_embeddings": 8192,
+   "model_type": "transformer",
+   "num_heads": 16,
+   "num_hidden_layers": 24,
+   "norm_eps": 1e-06,
+   "tie_word_embeddings": false,
+   "use_cache": true,
+   "vocab_size": 32000
+ }
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/transformer_7B.json ADDED
@@ -0,0 +1,21 @@
+ {
+   "attention_bias": false,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "fuse_cross_entropy": true,
+   "fuse_norm": true,
+   "hidden_act": "swish",
+   "hidden_ratio": 4,
+   "hidden_size": 4096,
+   "initializer_range": 0.02,
+   "intermediate_size": 14336,
+   "model_type": "transformer",
+   "norm_eps": 1e-06,
+   "num_heads": 32,
+   "num_hidden_layers": 32,
+   "num_kv_heads": 8,
+   "rope_theta": 10000.0,
+   "tie_word_embeddings": false,
+   "use_cache": true,
+   "window_size": null
+ }
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "pad_token_id": 0,
+   "transformers_version": "4.53.3"
+ }
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/0/error.json ADDED
@@ -0,0 +1 @@
+ {"message": {"message": "OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 GiB. GPU 0 has a total capacity of 95.00 GiB of which 85.09 GiB is free. Process 2003896 has 316.00 MiB memory in use. Process 696027 has 316.00 MiB memory in use. Process 1114693 has 9.27 GiB memory in use. Of the allocated memory 7.99 GiB is allocated by PyTorch, and 73.66 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)", "extraInfo": {"py_callstack": "Traceback (most recent call last):\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py\", line 355, in wrapper\n return f(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py\", line 487, in main\n output = model(\n ^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1857, in _call_impl\n return inner()\n ^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1805, in inner\n result = forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/utils/deprecation.py\", line 172, in wrapped_func\n return func(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py\", line 526, in forward\n outputs = self.backbone(\n ^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py\", line 405, in forward\n hidden_states = mixer_block(\n ^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, 
**kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1857, in _call_impl\n return inner()\n ^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1805, in inner\n result = forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py\", line 655, in _fn\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py\", line 161, in forward\n hidden_states = self.norm(hidden_states)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py\", line 165, in torch_dynamo_resume_in_forward_at_161\n hidden_states = self.mixer(\n ^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/mamba2.py\", line 601, in forward\n return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/mamba2.py\", line 528, in torch_forward\n G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :]\n ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~\ntorch.OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 GiB. GPU 0 has a total capacity of 95.00 GiB of which 85.09 GiB is free. Process 2003896 has 316.00 MiB memory in use. Process 696027 has 316.00 MiB memory in use. Process 1114693 has 9.27 GiB memory in use. Of the allocated memory 7.99 GiB is allocated by PyTorch, and 73.66 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)\n", "timestamp": "1753252283"}}}
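The stderr log below shows why the allocation is so large: with `selective_state_update` and `causal_conv1d` missing, the mamba2 layer falls back to the naive `torch_forward` path, whose `G_intermediate = C[...] * B[...]` broadcast materializes a huge chunked tensor. A back-of-the-envelope check (the exact shape layout is an assumption) reproduces the 256 GiB figure from this run's settings:

    # Assumed broadcast shape: (batch, num_chunks, chunk, chunk, num_heads, state), float32.
    batch, seq_len, chunk = 8, 8192, 256   # batch size, sequence length, chunk size from this run
    heads, state = 32, 128                 # num_heads and state_size from config.json above
    num_chunks = seq_len // chunk          # 32 chunks per sequence
    elems = batch * num_chunks * chunk * chunk * heads * state
    print(elems * 4 / 2**30)               # -> 256.0, the GiB the allocator asked for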
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/0/stderr.log ADDED
@@ -0,0 +1,463 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [titan] 2025-07-23 14:27:46,756 - root - INFO - Starting job: default job
2
+ [titan] 2025-07-23 14:27:46,756 - root - INFO - {
3
+ "activation_checkpoint": {
4
+ "mode": "none",
5
+ "selective_ac_option": "2"
6
+ },
7
+ "activation_offload": {
8
+ "mode": "none"
9
+ },
10
+ "checkpoint": {
11
+ "async_mode": "disabled",
12
+ "create_seed_checkpoint": false,
13
+ "enable_checkpoint": true,
14
+ "exclude_from_loading": [],
15
+ "export_dtype": "float32",
16
+ "folder": "checkpoint",
17
+ "interval": 8192,
18
+ "interval_type": "steps",
19
+ "keep_latest_k": 100,
20
+ "load_step": -1,
21
+ "model_weights_only": false
22
+ },
23
+ "comm": {
24
+ "init_timeout_seconds": 300,
25
+ "trace_buf_size": 20000,
26
+ "train_timeout_seconds": 100
27
+ },
28
+ "experimental": {
29
+ "context_parallel_degree": 1,
30
+ "context_parallel_rotate_method": "allgather",
31
+ "custom_model_path": "",
32
+ "enable_async_tensor_parallel": false,
33
+ "enable_compiled_autograd": false,
34
+ "pipeline_parallel_degree": 1,
35
+ "pipeline_parallel_microbatches": null,
36
+ "pipeline_parallel_schedule": "1F1B",
37
+ "pipeline_parallel_schedule_csv": "",
38
+ "pipeline_parallel_split_points": []
39
+ },
40
+ "fault_tolerance": {
41
+ "enable": false,
42
+ "group_size": 0,
43
+ "min_replica_size": 1,
44
+ "replica_id": 0
45
+ },
46
+ "float8": {
47
+ "enable_fsdp_float8_all_gather": false,
48
+ "force_recompute_fp8_weight_in_bwd": false,
49
+ "precompute_float8_dynamic_scale_for_fsdp": false,
50
+ "recipe_name": null
51
+ },
52
+ "job": {
53
+ "config_file": "flame/models/fla.toml",
54
+ "description": "default job",
55
+ "dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2",
56
+ "print_args": true,
57
+ "use_for_integration_test": false
58
+ },
59
+ "lr_scheduler": {
60
+ "decay_ratio": 1.0,
61
+ "decay_type": "linear",
62
+ "lr_min": 0.01,
63
+ "warmup_steps": 100
64
+ },
65
+ "memory_estimation": {
66
+ "disable_fake_mode": false,
67
+ "enabled": false
68
+ },
69
+ "metrics": {
70
+ "disable_color_printing": false,
71
+ "enable_tensorboard": true,
72
+ "enable_wandb": true,
73
+ "log_freq": 1,
74
+ "save_for_all_ranks": false,
75
+ "save_tb_folder": "tb"
76
+ },
77
+ "model": {
78
+ "config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/mamba2_6_1_340M.json",
79
+ "converters": [],
80
+ "name": "fla",
81
+ "print_after_conversion": false,
82
+ "tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
83
+ },
84
+ "optimizer": {
85
+ "early_step_in_backward": false,
86
+ "eps": 1e-08,
87
+ "implementation": "fused",
88
+ "lr": 0.0003,
89
+ "name": "AdamW"
90
+ },
91
+ "profiling": {
92
+ "enable_memory_snapshot": false,
93
+ "enable_profiling": true,
94
+ "profile_freq": 512,
95
+ "save_memory_snapshot_folder": "memory_snapshot",
96
+ "save_traces_folder": "profile_trace"
97
+ },
98
+ "training": {
99
+ "batch_size": 8,
100
+ "compile": true,
101
+ "context_len": 8192,
102
+ "data_dir": null,
103
+ "data_files": null,
104
+ "data_parallel_replicate_degree": 1,
105
+ "data_parallel_shard_degree": -1,
106
+ "data_probs": "0.55,0.3,0.15",
107
+ "dataset": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro",
108
+ "dataset_name": "default,default,default",
109
+ "dataset_split": "train,train,train",
110
+ "deterministic": false,
111
+ "disable_loss_parallel": false,
112
+ "enable_cpu_offload": false,
113
+ "fsdp_reshard_after_forward": "default",
114
+ "gc_freq": 50,
115
+ "gradient_accumulation_steps": 2,
116
+ "max_norm": 1.0,
117
+ "mixed_precision_param": "bfloat16",
118
+ "mixed_precision_reduce": "float32",
119
+ "num_workers": 32,
120
+ "persistent_workers": false,
121
+ "pin_memory": false,
122
+ "prefetch_factor": 2,
123
+ "seed": 42,
124
+ "seq_len": 8192,
125
+ "skip_nan_inf": true,
126
+ "steps": 95366,
127
+ "streaming": true,
128
+ "tensor_parallel_degree": 1,
129
+ "varlen": false
130
+ }
131
+ }
132
+ [titan] 2025-07-23 14:27:46,756 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
133
+ [titan] 2025-07-23 14:27:46,757 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
134
+ [titan] 2025-07-23 14:27:46,772 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
135
+ [titan] 2025-07-23 14:27:46,951 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
136
+ [titan] 2025-07-23 14:27:46,951 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
137
+ [titan] 2025-07-23 14:27:46,951 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
138
+ [titan] 2025-07-23 14:27:47,501 - root - INFO - Loading tokenizer...
139
+ [titan] 2025-07-23 14:27:47,997 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
140
+ 0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
141
+ 1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
142
+ 2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
143
+ }
144
+ )
145
+ [titan] 2025-07-23 14:27:47,998 - root - INFO - Loading dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro:default,default,default
146
+ `trust_remote_code` is not supported anymore.
147
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
148
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
149
+ [titan] 2025-07-23 14:27:47,998 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
150
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
151
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
152
+ [titan] 2025-07-23 14:27:48,644 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample:default (p = 0.550):
153
+ IterableDataset({
154
+ features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
155
+ num_shards: 140
156
+ })
157
+ [titan] 2025-07-23 14:27:48,644 - root - INFO - Shuffling the dataset with seed 42
158
+ [titan] 2025-07-23 14:27:48,645 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample has insufficient shards (140). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
159
+ `trust_remote_code` is not supported anymore.
160
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
161
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
162
+ [titan] 2025-07-23 14:27:48,645 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
163
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
164
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
165
+ `trust_remote_code` is not supported anymore.
166
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged' isn't based on a loading script and remove `trust_remote_code`.
167
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
168
+ [titan] 2025-07-23 14:28:39,750 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
169
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged' isn't based on a loading script and remove `trust_remote_code`.
170
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
171
+ [titan] 2025-07-23 14:28:39,881 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged:default (p = 0.300):
172
+ IterableDataset({
173
+ features: ['repo', 'content'],
174
+ num_shards: 1
175
+ })
176
+ [titan] 2025-07-23 14:28:39,881 - root - INFO - Shuffling the dataset with seed 42
177
+ [titan] 2025-07-23 14:28:39,882 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged has insufficient shards (1). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
178
+ `trust_remote_code` is not supported anymore.
179
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged' isn't based on a loading script and remove `trust_remote_code`.
180
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
181
+ [titan] 2025-07-23 14:28:39,882 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
182
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged' isn't based on a loading script and remove `trust_remote_code`.
183
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
184
+ `trust_remote_code` is not supported anymore.
185
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro' isn't based on a loading script and remove `trust_remote_code`.
186
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
187
+ [titan] 2025-07-23 14:28:40,150 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
188
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro' isn't based on a loading script and remove `trust_remote_code`.
189
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
190
+ [titan] 2025-07-23 14:28:40,316 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro:default (p = 0.150):
191
+ IterableDataset({
192
+ features: ['text', 'cc-path', 'domain', 'lang', 'lang_score', 'timestamp', 'url', 'math_score'],
193
+ num_shards: 100
194
+ })
195
+ [titan] 2025-07-23 14:28:40,316 - root - INFO - Shuffling the dataset with seed 42
196
+ [titan] 2025-07-23 14:28:40,316 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro has insufficient shards (100). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
197
+ `trust_remote_code` is not supported anymore.
198
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro' isn't based on a loading script and remove `trust_remote_code`.
199
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
200
+ [titan] 2025-07-23 14:28:40,316 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
201
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro' isn't based on a loading script and remove `trust_remote_code`.
202
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
203
+ [titan] 2025-07-23 14:28:46,507 - root - INFO - Interleaving 3 datasets with probabilities [0.55, 0.3, 0.15]
204
+ [titan] 2025-07-23 14:28:47,196 - root - INFO - IterableDataset({
205
+ features: ['text', 'content'],
206
+ num_shards: 256
207
+ })
208
+ [titan] 2025-07-23 14:28:47,310 - root - INFO - Building dataloader...
209
+ [titan] 2025-07-23 14:28:47,312 - root - INFO - Loading model config from /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/mamba2_6_1_340M.json
210
+ [titan] 2025-07-23 14:28:47,314 - root - INFO - Building model from the config
211
+ Mamba2Config {
212
+ "architectures": [
213
+ "Mamba2ForCausalLM"
214
+ ],
215
+ "attn": {
216
+ "layers": [
217
+ 5,
218
+ 11,
219
+ 17,
220
+ 23
221
+ ],
222
+ "num_heads": 16,
223
+ "num_kv_heads": 8,
224
+ "qkv_bias": false,
225
+ "rope_theta": 160000.0,
226
+ "window_size": null
227
+ },
228
+ "attn_mode": "chunk",
229
+ "bos_token_id": 1,
230
+ "chunk_size": 256,
231
+ "conv_kernel": 4,
232
+ "eos_token_id": 2,
233
+ "expand": 2,
234
+ "fuse_cross_entropy": true,
235
+ "fuse_norm": true,
236
+ "fuse_swiglu": true,
237
+ "head_dim": 64,
238
+ "hidden_act": "silu",
239
+ "hidden_size": 1024,
240
+ "initializer_range": 0.02,
241
+ "model_type": "mamba2",
242
+ "n_groups": 1,
243
+ "norm_eps": 1e-05,
244
+ "num_heads": 32,
245
+ "num_hidden_layers": 48,
246
+ "pad_token_id": 0,
247
+ "rescale_prenorm_residual": true,
248
+ "residual_in_fp32": true,
249
+ "rms_norm": true,
250
+ "state_size": 128,
251
+ "tie_word_embeddings": false,
252
+ "time_step_floor": 0.0001,
253
+ "time_step_limit": [
254
+ 0.0,
255
+ Infinity
256
+ ],
257
+ "time_step_max": 0.1,
258
+ "time_step_min": 0.001,
259
+ "time_step_rank": 128,
260
+ "transformers_version": "4.53.3",
261
+ "use_bias": false,
262
+ "use_cache": true,
263
+ "use_conv_bias": true,
264
+ "use_l2warp": false,
265
+ "vocab_size": 32000
266
+ }
267
+ 
268
+ [titan] 2025-07-23 14:28:50,147 - fla.layers.mamba2 - WARNING - The fast path is not available because one of `(selective_state_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation
269
+ [titan] 2025-07-23 14:28:50,147 - fla.layers.mamba2 - WARNING - The CUDA backend is not available because `causal_conv1d` is None. Falling back to the Triton backend. To install follow https://github.com/Dao-AILab/causal-conv1d
270
+ [titan] 2025-07-23 14:28:50,265 - root - INFO - 
271
+ Mamba2ForCausalLM(
272
+ (backbone): Mamba2Model(
273
+ (embeddings): Embedding(32000, 1024)
274
+ (layers): ModuleList(
275
+ (0-47): 48 x Mamba2Block(
276
+ (norm): RMSNorm(1024, eps=1e-05)
277
+ (mixer): Mamba2(
278
+ (conv1d): Conv1d(2304, 2304, kernel_size=(4,), stride=(1,), padding=(3,), groups=2304)
279
+ (in_proj): Linear(in_features=1024, out_features=4384, bias=False)
280
+ (norm): RMSNormGated()
281
+ (out_proj): Linear(in_features=2048, out_features=1024, bias=False)
282
+ )
283
+ )
284
+ )
285
+ (norm_f): RMSNorm(1024, eps=1e-05)
286
+ )
287
+ (lm_head): Linear(in_features=1024, out_features=32000, bias=False)
288
+ (criterion): FusedLinearCrossEntropyLoss()
289
+ )
290
+
291
+ [titan] 2025-07-23 14:28:50,317 - root - INFO - Compiling each block with torch.compile
292
+ [titan] 2025-07-23 14:28:50,318 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
293
+ [titan] 2025-07-23 14:28:50,318 - root - WARNING - No norm found in model
294
+ [titan] 2025-07-23 14:28:50,318 - root - INFO - Compiling the entire model with torch.compile
295
+ [titan] 2025-07-23 14:28:50,540 - root - INFO - Applied FSDP to the model
296
+ [titan] 2025-07-23 14:28:50,884 - fla.models.mamba2.modeling_mamba2 - WARNING - `A_log` is a DTensor, skipping initialization
297
+ [titan] 2025-07-23 14:28:51,042 - fla.models.mamba2.modeling_mamba2 - WARNING - `dt_bias` is a DTensor, skipping initialization
298
+ [titan] 2025-07-23 14:28:51,272 - root - INFO - CUDA memory usage for model: 0.19GiB(0.20%)
299
+ [titan] 2025-07-23 14:28:51,273 - root - WARNING - Warmup (100) + decay (95366) steps exceed total training steps (95366). Adjusting decay steps to 95266.
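The adjustment above is plain clamping: warmup plus decay may not exceed total steps, so decay absorbs the overlap.

    warmup_steps, decay_steps, total_steps = 100, 95_366, 95_366
    if warmup_steps + decay_steps > total_steps:
        decay_steps = total_steps - warmup_steps  # 95_366 - 100 = 95_266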
300
+ [titan] 2025-07-23 14:28:51,297 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/checkpoint
301
+ wandb: Network error (InvalidURL), entering retry loop.
302
+ wandb: W&B API key is configured. Use `wandb login --relogin` to force relogin
303
+ wandb: Network error (InvalidURL), entering retry loop.
304
+ [titan] 2025-07-23 14:30:44,436 - root - ERROR - Failed to create WandB logger: Run initialization has timed out after 90.0 sec. Please try increasing the timeout with the `init_timeout` setting: `wandb.init(settings=wandb.Settings(init_timeout=120))`.
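The W&B failure is a plain network timeout, and the error text names its own workaround, quoted here as code:

    import wandb

    # Straight from the message above: raise the init timeout.
    run = wandb.init(settings=wandb.Settings(init_timeout=120))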
305
+ [titan] 2025-07-23 14:30:44,442 - root - INFO - TensorBoard logging enabled. Logs will be saved at /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/tb/20250723-1428
306
+ [titan] 2025-07-23 14:30:44,442 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
307
+ [titan] 2025-07-23 14:30:44,527 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
308
+ [titan] 2025-07-23 14:30:50,781 - root - INFO - ***** Running training *****
309
+ [titan] 2025-07-23 14:30:50,784 - root - INFO -  Training starts at step 1
310
+ [titan] 2025-07-23 14:30:50,784 - root - INFO -  Number of tokens per sequence = 8,192
311
+ [titan] 2025-07-23 14:30:50,784 - root - INFO -  Gradient Accumulation steps = 2
312
+ [titan] 2025-07-23 14:30:50,785 - root - INFO -  Instantaneous batch size (per device) = 8
313
+ [titan] 2025-07-23 14:30:50,785 - root - INFO -  Global batch size (w. parallel, distributed & accumulation) = 128 (1,048,576 tokens)
314
+ [titan] 2025-07-23 14:30:50,785 - root - INFO -  Total optimization steps = 95,366 (99,998,498,816 tokens)
315
+ [titan] 2025-07-23 14:30:50,785 - root - INFO -  Warmup steps = 100 (104,857,600 tokens)
316
+ [titan] 2025-07-23 14:30:50,785 - root - INFO -  Number of parameters = 382,387,712 
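The token counts in this banner follow from the batch geometry and check out exactly (8 dp_shard ranks, per the device mesh reported in the log):

    per_device, dp_ranks, grad_accum, seq_len = 8, 8, 2, 8192
    global_batch = per_device * dp_ranks * grad_accum  # 128 sequences
    tokens_per_step = global_batch * seq_len           # 1,048,576
    total_tokens = tokens_per_step * 95_366            # 99,998,498,816 (~100B)
    warmup_tokens = tokens_per_step * 100              # 104,857,600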
317
+ [titan] 2025-07-23 14:30:50,785 - root - INFO - Profiling active. Traces will be saved at /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/profile_trace
318
+ /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/functions.py:1263: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
319
+ If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
320
+ If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
321
+ torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
322
+ Traceback (most recent call last):
323
+ File "<frozen runpy>", line 198, in _run_module_as_main
324
+ File "<frozen runpy>", line 88, in _run_code
325
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
326
+ main(config)
327
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
328
+ return f(*args, **kwargs)
329
+ ^^^^^^^^^^^^^^^^^^
330
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 487, in main
331
+ output = model(
332
+ ^^^^^^
333
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
334
+ return self._call_impl(*args, **kwargs)
335
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
336
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1857, in _call_impl
337
+ return inner()
338
+ ^^^^^^^
339
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1805, in inner
340
+ result = forward_call(*args, **kwargs)
341
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
342
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func
343
+ return func(*args, **kwargs)
344
+ ^^^^^^^^^^^^^^^^^^^^^
345
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 526, in forward
346
+ outputs = self.backbone(
347
+ ^^^^^^^^^^^^^^
348
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
349
+ return self._call_impl(*args, **kwargs)
350
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
351
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
352
+ return forward_call(*args, **kwargs)
353
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
354
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 405, in forward
355
+ hidden_states = mixer_block(
356
+ ^^^^^^^^^^^^
357
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
358
+ return self._call_impl(*args, **kwargs)
359
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
360
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1857, in _call_impl
361
+ return inner()
362
+ ^^^^^^^
363
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1805, in inner
364
+ result = forward_call(*args, **kwargs)
365
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
366
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py", line 655, in _fn
367
+ return fn(*args, **kwargs)
368
+ ^^^^^^^^^^^^^^^^^^^
369
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
370
+ return self._call_impl(*args, **kwargs)
371
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
372
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
373
+ return forward_call(*args, **kwargs)
374
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
375
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 161, in forward
376
+ hidden_states = self.norm(hidden_states)
377
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 165, in torch_dynamo_resume_in_forward_at_161
378
+ hidden_states = self.mixer(
379
+ ^^^^^^^^^^^
380
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
381
+ return self._call_impl(*args, **kwargs)
382
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
383
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
384
+ return forward_call(*args, **kwargs)
385
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
386
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/mamba2.py", line 601, in forward
387
+ return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask)
388
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
389
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/mamba2.py", line 528, in torch_forward
390
+ G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :]
391
+ ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~
392
+ torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 GiB. GPU 0 has a total capacity of 95.00 GiB of which 85.09 GiB is free. Process 2003896 has 316.00 MiB memory in use. Process 696027 has 316.00 MiB memory in use. Process 1114693 has 9.27 GiB memory in use. Of the allocated memory 7.99 GiB is allocated by PyTorch, and 73.66 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
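The 256.00 GiB request is exactly what the naive chunked scan materializes at this shape. Reading the layout off the indexing pattern, `(batch, n_chunks, chunk, chunk, heads, state)` after broadcasting, computed in float32 by the slow path, the count works out exactly (layout assumed from the indexing, arithmetic exact):

    batch, n_chunks, chunk, heads, state = 8, 8192 // 256, 256, 32, 128
    elements = batch * n_chunks * chunk * chunk * heads * state  # 68,719,476,736
    print(elements * 4 / 2**30)  # 256.0 GiB in float32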
393
+ [rank0]: Traceback (most recent call last):
394
+ [rank0]: File "<frozen runpy>", line 198, in _run_module_as_main
395
+ [rank0]: File "<frozen runpy>", line 88, in _run_code
396
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
397
+ [rank0]: main(config)
398
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
399
+ [rank0]: return f(*args, **kwargs)
400
+ [rank0]: ^^^^^^^^^^^^^^^^^^
401
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 487, in main
402
+ [rank0]: output = model(
403
+ [rank0]: ^^^^^^
404
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
405
+ [rank0]: return self._call_impl(*args, **kwargs)
406
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
407
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1857, in _call_impl
408
+ [rank0]: return inner()
409
+ [rank0]: ^^^^^^^
410
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1805, in inner
411
+ [rank0]: result = forward_call(*args, **kwargs)
412
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
413
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func
414
+ [rank0]: return func(*args, **kwargs)
415
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^
416
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 526, in forward
417
+ [rank0]: outputs = self.backbone(
418
+ [rank0]: ^^^^^^^^^^^^^^
419
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
420
+ [rank0]: return self._call_impl(*args, **kwargs)
421
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
422
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
423
+ [rank0]: return forward_call(*args, **kwargs)
424
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
425
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 405, in forward
426
+ [rank0]: hidden_states = mixer_block(
427
+ [rank0]: ^^^^^^^^^^^^
428
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
429
+ [rank0]: return self._call_impl(*args, **kwargs)
430
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
431
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1857, in _call_impl
432
+ [rank0]: return inner()
433
+ [rank0]: ^^^^^^^
434
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1805, in inner
435
+ [rank0]: result = forward_call(*args, **kwargs)
436
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
437
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py", line 655, in _fn
438
+ [rank0]: return fn(*args, **kwargs)
439
+ [rank0]: ^^^^^^^^^^^^^^^^^^^
440
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
441
+ [rank0]: return self._call_impl(*args, **kwargs)
442
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
443
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
444
+ [rank0]: return forward_call(*args, **kwargs)
445
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
446
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 161, in forward
447
+ [rank0]: hidden_states = self.norm(hidden_states)
448
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 165, in torch_dynamo_resume_in_forward_at_161
449
+ [rank0]: hidden_states = self.mixer(
450
+ [rank0]: ^^^^^^^^^^^
451
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
452
+ [rank0]: return self._call_impl(*args, **kwargs)
453
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
454
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
455
+ [rank0]: return forward_call(*args, **kwargs)
456
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
457
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/mamba2.py", line 601, in forward
458
+ [rank0]: return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask)
459
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
460
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/mamba2.py", line 528, in torch_forward
461
+ [rank0]: G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :]
462
+ [rank0]: ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~
463
+ [rank0]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 GiB. GPU 0 has a total capacity of 95.00 GiB of which 85.09 GiB is free. Process 2003896 has 316.00 MiB memory in use. Process 696027 has 316.00 MiB memory in use. Process 1114693 has 9.27 GiB memory in use. Of the allocated memory 7.99 GiB is allocated by PyTorch, and 73.66 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
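Note that the allocator hint quoted in the message cannot rescue this run: a single 256 GiB request never fits in 95 GiB, so the practical fixes are installing the fused kernels (avoiding `torch_forward` entirely) or shrinking per-device batch size or sequence length. For completeness, the quoted setting is an environment variable that must be in place before the first CUDA allocation:

    import os

    # From the error text; mitigates fragmentation, not oversized requests.
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"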
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/0/stdout.log ADDED
File without changes
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/1/error.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"message": {"message": "OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 GiB. GPU 1 has a total capacity of 95.00 GiB of which 85.09 GiB is free. Process 2003901 has 316.00 MiB memory in use. Process 696029 has 316.00 MiB memory in use. Process 1114694 has 9.27 GiB memory in use. Of the allocated memory 7.99 GiB is allocated by PyTorch, and 73.66 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)", "extraInfo": {"py_callstack": "Traceback (most recent call last):\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py\", line 355, in wrapper\n return f(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py\", line 487, in main\n output = model(\n ^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1857, in _call_impl\n return inner()\n ^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1805, in inner\n result = forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/utils/deprecation.py\", line 172, in wrapped_func\n return func(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py\", line 526, in forward\n outputs = self.backbone(\n ^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py\", line 405, in forward\n hidden_states = mixer_block(\n ^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, 
**kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1857, in _call_impl\n return inner()\n ^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1805, in inner\n result = forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py\", line 655, in _fn\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py\", line 161, in forward\n hidden_states = self.norm(hidden_states)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py\", line 165, in torch_dynamo_resume_in_forward_at_161\n hidden_states = self.mixer(\n ^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/mamba2.py\", line 601, in forward\n return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/mamba2.py\", line 528, in torch_forward\n G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :]\n ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~\ntorch.OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 GiB. GPU 1 has a total capacity of 95.00 GiB of which 85.09 GiB is free. Process 2003901 has 316.00 MiB memory in use. Process 696029 has 316.00 MiB memory in use. Process 1114694 has 9.27 GiB memory in use. Of the allocated memory 7.99 GiB is allocated by PyTorch, and 73.66 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)\n", "timestamp": "1753252283"}}}
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/1/stderr.log ADDED
@@ -0,0 +1,387 @@
 
 
1
+ [titan] 2025-07-23 14:27:46,323 - root - INFO - Starting job: default job
2
+ [titan] 2025-07-23 14:27:46,323 - root - INFO - {
3
+ "activation_checkpoint": {
4
+ "mode": "none",
5
+ "selective_ac_option": "2"
6
+ },
7
+ "activation_offload": {
8
+ "mode": "none"
9
+ },
10
+ "checkpoint": {
11
+ "async_mode": "disabled",
12
+ "create_seed_checkpoint": false,
13
+ "enable_checkpoint": true,
14
+ "exclude_from_loading": [],
15
+ "export_dtype": "float32",
16
+ "folder": "checkpoint",
17
+ "interval": 8192,
18
+ "interval_type": "steps",
19
+ "keep_latest_k": 100,
20
+ "load_step": -1,
21
+ "model_weights_only": false
22
+ },
23
+ "comm": {
24
+ "init_timeout_seconds": 300,
25
+ "trace_buf_size": 20000,
26
+ "train_timeout_seconds": 100
27
+ },
28
+ "experimental": {
29
+ "context_parallel_degree": 1,
30
+ "context_parallel_rotate_method": "allgather",
31
+ "custom_model_path": "",
32
+ "enable_async_tensor_parallel": false,
33
+ "enable_compiled_autograd": false,
34
+ "pipeline_parallel_degree": 1,
35
+ "pipeline_parallel_microbatches": null,
36
+ "pipeline_parallel_schedule": "1F1B",
37
+ "pipeline_parallel_schedule_csv": "",
38
+ "pipeline_parallel_split_points": []
39
+ },
40
+ "fault_tolerance": {
41
+ "enable": false,
42
+ "group_size": 0,
43
+ "min_replica_size": 1,
44
+ "replica_id": 0
45
+ },
46
+ "float8": {
47
+ "enable_fsdp_float8_all_gather": false,
48
+ "force_recompute_fp8_weight_in_bwd": false,
49
+ "precompute_float8_dynamic_scale_for_fsdp": false,
50
+ "recipe_name": null
51
+ },
52
+ "job": {
53
+ "config_file": "flame/models/fla.toml",
54
+ "description": "default job",
55
+ "dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2",
56
+ "print_args": true,
57
+ "use_for_integration_test": false
58
+ },
59
+ "lr_scheduler": {
60
+ "decay_ratio": 1.0,
61
+ "decay_type": "linear",
62
+ "lr_min": 0.01,
63
+ "warmup_steps": 100
64
+ },
65
+ "memory_estimation": {
66
+ "disable_fake_mode": false,
67
+ "enabled": false
68
+ },
69
+ "metrics": {
70
+ "disable_color_printing": false,
71
+ "enable_tensorboard": true,
72
+ "enable_wandb": true,
73
+ "log_freq": 1,
74
+ "save_for_all_ranks": false,
75
+ "save_tb_folder": "tb"
76
+ },
77
+ "model": {
78
+ "config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/mamba2_6_1_340M.json",
79
+ "converters": [],
80
+ "name": "fla",
81
+ "print_after_conversion": false,
82
+ "tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
83
+ },
84
+ "optimizer": {
85
+ "early_step_in_backward": false,
86
+ "eps": 1e-08,
87
+ "implementation": "fused",
88
+ "lr": 0.0003,
89
+ "name": "AdamW"
90
+ },
91
+ "profiling": {
92
+ "enable_memory_snapshot": false,
93
+ "enable_profiling": true,
94
+ "profile_freq": 512,
95
+ "save_memory_snapshot_folder": "memory_snapshot",
96
+ "save_traces_folder": "profile_trace"
97
+ },
98
+ "training": {
99
+ "batch_size": 8,
100
+ "compile": true,
101
+ "context_len": 8192,
102
+ "data_dir": null,
103
+ "data_files": null,
104
+ "data_parallel_replicate_degree": 1,
105
+ "data_parallel_shard_degree": -1,
106
+ "data_probs": "0.55,0.3,0.15",
107
+ "dataset": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro",
108
+ "dataset_name": "default,default,default",
109
+ "dataset_split": "train,train,train",
110
+ "deterministic": false,
111
+ "disable_loss_parallel": false,
112
+ "enable_cpu_offload": false,
113
+ "fsdp_reshard_after_forward": "default",
114
+ "gc_freq": 50,
115
+ "gradient_accumulation_steps": 2,
116
+ "max_norm": 1.0,
117
+ "mixed_precision_param": "bfloat16",
118
+ "mixed_precision_reduce": "float32",
119
+ "num_workers": 32,
120
+ "persistent_workers": false,
121
+ "pin_memory": false,
122
+ "prefetch_factor": 2,
123
+ "seed": 42,
124
+ "seq_len": 8192,
125
+ "skip_nan_inf": true,
126
+ "steps": 95366,
127
+ "streaming": true,
128
+ "tensor_parallel_degree": 1,
129
+ "varlen": false
130
+ }
131
+ }
132
+ [titan] 2025-07-23 14:27:46,324 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
133
+ [titan] 2025-07-23 14:27:47,255 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
134
+ [titan] 2025-07-23 14:27:47,258 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
135
+ [titan] 2025-07-23 14:27:47,324 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
136
+ [titan] 2025-07-23 14:27:47,324 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
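Because the H20's peak is unknown to the framework, MFU is computed against the A100 BF16 figure (3.120e14 FLOPS). The ratio itself is simple; a generic sketch with hypothetical throughput numbers:

    peak_flops = 3.120e14               # A100 BF16 fallback, per the log
    flops_per_token = 6 * 382_387_712   # rough 6N rule of thumb for training
    tokens_per_sec = 100_000            # hypothetical measured throughput
    mfu = flops_per_token * tokens_per_sec / peak_flops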
137
+ [titan] 2025-07-23 14:27:47,324 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
138
+ [titan] 2025-07-23 14:27:47,411 - root - INFO - Loading tokenizer...
139
+ [titan] 2025-07-23 14:27:47,997 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
140
+ 0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
141
+ 1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
142
+ 2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
143
+ }
144
+ )
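The repr above is a stock Llama tokenizer; reloading it is a single call (path shortened for illustration):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("flame/tokenizer")  # LlamaTokenizerFast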
145
+ [titan] 2025-07-23 14:27:47,998 - root - INFO - Loading dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro:default,default,default
146
+ `trust_remote_code` is not supported anymore.
147
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
148
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
149
+ [titan] 2025-07-23 14:27:47,998 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
150
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
151
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
152
+ [titan] 2025-07-23 14:27:48,493 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample:default (p = 0.550):
153
+ IterableDataset({
154
+ features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
155
+ num_shards: 140
156
+ })
157
+ [titan] 2025-07-23 14:27:48,493 - root - INFO - Shuffling the dataset with seed 42
158
+ [titan] 2025-07-23 14:27:48,493 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample has insufficient shards (140). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
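This warning fires for every subset with fewer than 256 shards (8 data parallel workers × 32 dataloader workers). A hedged sketch of the equivalent `datasets` operation:

    from datasets import load_dataset

    # Load non-streaming, then re-expose with enough shards for
    # 8 ranks x 32 workers; mirrors the logged resharding.
    ds = load_dataset("parquet", data_files="/data/subset/*.parquet", split="train")
    ds = ds.to_iterable_dataset(num_shards=256)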
159
+ `trust_remote_code` is not supported anymore.
160
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
161
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
162
+ [titan] 2025-07-23 14:27:48,493 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
163
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
164
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
165
+ `trust_remote_code` is not supported anymore.
166
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged' isn't based on a loading script and remove `trust_remote_code`.
167
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
168
+ [titan] 2025-07-23 14:28:41,064 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
169
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged' isn't based on a loading script and remove `trust_remote_code`.
170
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
171
+ [titan] 2025-07-23 14:28:41,096 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged:default (p = 0.300):
172
+ IterableDataset({
173
+ features: ['repo', 'content'],
174
+ num_shards: 1
175
+ })
176
+ [titan] 2025-07-23 14:28:41,096 - root - INFO - Shuffling the dataset with seed 42
177
+ [titan] 2025-07-23 14:28:41,096 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged has insufficient shards (1). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
178
+ `trust_remote_code` is not supported anymore.
179
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged' isn't based on a loading script and remove `trust_remote_code`.
180
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
181
+ [titan] 2025-07-23 14:28:41,097 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
182
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged' isn't based on a loading script and remove `trust_remote_code`.
183
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
184
+ `trust_remote_code` is not supported anymore.
185
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro' isn't based on a loading script and remove `trust_remote_code`.
186
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
187
+ [titan] 2025-07-23 14:28:41,357 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
188
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro' isn't based on a loading script and remove `trust_remote_code`.
189
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
190
+ [titan] 2025-07-23 14:28:41,441 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro:default (p = 0.150):
191
+ IterableDataset({
192
+ features: ['text', 'cc-path', 'domain', 'lang', 'lang_score', 'timestamp', 'url', 'math_score'],
193
+ num_shards: 100
194
+ })
195
+ [titan] 2025-07-23 14:28:41,441 - root - INFO - Shuffling the dataset with seed 42
196
+ [titan] 2025-07-23 14:28:41,441 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro has insufficient shards (100). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
197
+ `trust_remote_code` is not supported anymore.
198
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro' isn't based on a loading script and remove `trust_remote_code`.
199
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
200
+ [titan] 2025-07-23 14:28:41,441 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
201
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro' isn't based on a loading script and remove `trust_remote_code`.
202
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
203
+ [titan] 2025-07-23 14:28:47,757 - root - INFO - Interleaving 3 datasets with probabilities [0.55, 0.3, 0.15]
204
+ [titan] 2025-07-23 14:28:48,445 - root - INFO - IterableDataset({
205
+ features: ['text', 'content'],
206
+ num_shards: 256
207
+ })
208
+ [titan] 2025-07-23 14:28:48,560 - root - INFO - Building dataloader...
209
+ [titan] 2025-07-23 14:28:48,562 - root - INFO - Loading model config from /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/mamba2_6_1_340M.json
210
+ [titan] 2025-07-23 14:28:48,564 - root - INFO - Building model from the config
211
+ Mamba2Config {
212
+ "architectures": [
213
+ "Mamba2ForCausalLM"
214
+ ],
215
+ "attn": {
216
+ "layers": [
217
+ 5,
218
+ 11,
219
+ 17,
220
+ 23
221
+ ],
222
+ "num_heads": 16,
223
+ "num_kv_heads": 8,
224
+ "qkv_bias": false,
225
+ "rope_theta": 160000.0,
226
+ "window_size": null
227
+ },
228
+ "attn_mode": "chunk",
229
+ "bos_token_id": 1,
230
+ "chunk_size": 256,
231
+ "conv_kernel": 4,
232
+ "eos_token_id": 2,
233
+ "expand": 2,
234
+ "fuse_cross_entropy": true,
235
+ "fuse_norm": true,
236
+ "fuse_swiglu": true,
237
+ "head_dim": 64,
238
+ "hidden_act": "silu",
239
+ "hidden_size": 1024,
240
+ "initializer_range": 0.02,
241
+ "model_type": "mamba2",
242
+ "n_groups": 1,
243
+ "norm_eps": 1e-05,
244
+ "num_heads": 32,
245
+ "num_hidden_layers": 48,
246
+ "pad_token_id": 0,
247
+ "rescale_prenorm_residual": true,
248
+ "residual_in_fp32": true,
249
+ "rms_norm": true,
250
+ "state_size": 128,
251
+ "tie_word_embeddings": false,
252
+ "time_step_floor": 0.0001,
253
+ "time_step_limit": [
254
+ 0.0,
255
+ Infinity
256
+ ],
257
+ "time_step_max": 0.1,
258
+ "time_step_min": 0.001,
259
+ "time_step_rank": 128,
260
+ "transformers_version": "4.53.3",
261
+ "use_bias": false,
262
+ "use_cache": true,
263
+ "use_conv_bias": true,
264
+ "use_l2warp": false,
265
+ "vocab_size": 32000
266
+ }
267
+ 
268
+ [titan] 2025-07-23 14:28:50,147 - fla.layers.mamba2 - WARNING - The fast path is not available because one of `(selective_state_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation
269
+ [titan] 2025-07-23 14:28:50,147 - fla.layers.mamba2 - WARNING - The CUDA backend is not available because `causal_conv1d` is None. Falling back to the Triton backend. To install follow https://github.com/Dao-AILab/causal-conv1d
270
+ [titan] 2025-07-23 14:28:50,264 - root - INFO - 
271
+ Mamba2ForCausalLM(
272
+ (backbone): Mamba2Model(
273
+ (embeddings): Embedding(32000, 1024)
274
+ (layers): ModuleList(
275
+ (0-47): 48 x Mamba2Block(
276
+ (norm): RMSNorm(1024, eps=1e-05)
277
+ (mixer): Mamba2(
278
+ (conv1d): Conv1d(2304, 2304, kernel_size=(4,), stride=(1,), padding=(3,), groups=2304)
279
+ (in_proj): Linear(in_features=1024, out_features=4384, bias=False)
280
+ (norm): RMSNormGated()
281
+ (out_proj): Linear(in_features=2048, out_features=1024, bias=False)
282
+ )
283
+ )
284
+ )
285
+ (norm_f): RMSNorm(1024, eps=1e-05)
286
+ )
287
+ (lm_head): Linear(in_features=1024, out_features=32000, bias=False)
288
+ (criterion): FusedLinearCrossEntropyLoss()
289
+ )
290
+
291
+ [titan] 2025-07-23 14:28:50,316 - root - INFO - Compiling each block with torch.compile
292
+ [titan] 2025-07-23 14:28:50,316 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
293
+ [titan] 2025-07-23 14:28:50,317 - root - WARNING - No norm found in model
294
+ [titan] 2025-07-23 14:28:50,317 - root - INFO - Compiling the entire model with torch.compile
295
+ [titan] 2025-07-23 14:28:50,541 - root - INFO - Applied FSDP to the model
296
+ [titan] 2025-07-23 14:28:50,886 - fla.models.mamba2.modeling_mamba2 - WARNING - `A_log` is a DTensor, skipping initialization
297
+ [titan] 2025-07-23 14:28:51,042 - fla.models.mamba2.modeling_mamba2 - WARNING - `dt_bias` is a DTensor, skipping initialization
298
+ [titan] 2025-07-23 14:28:51,271 - root - INFO - CUDA memory usage for model: 0.19GiB(0.20%)
299
+ [titan] 2025-07-23 14:28:51,273 - root - WARNING - Warmup (100) + decay (95366) steps exceed total training steps (95366). Adjusting decay steps to 95266.
300
+ [titan] 2025-07-23 14:28:51,297 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/checkpoint
301
+ [titan] 2025-07-23 14:28:51,302 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
302
+ [titan] 2025-07-23 14:28:51,429 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
303
+ [titan] 2025-07-23 14:28:58,662 - root - INFO - ***** Running training *****
304
+ [titan] 2025-07-23 14:28:58,667 - root - INFO -  Training starts at step 1
305
+ [titan] 2025-07-23 14:28:58,670 - root - INFO -  Number of tokens per sequence = 8,192
306
+ [titan] 2025-07-23 14:28:58,670 - root - INFO -  Gradient Accumulation steps = 2
307
+ [titan] 2025-07-23 14:28:58,670 - root - INFO -  Instantaneous batch size (per device) = 8
308
+ [titan] 2025-07-23 14:28:58,670 - root - INFO -  Global batch size (w. parallel, distributed & accumulation) = 128 (1,048,576 tokens)
309
+ [titan] 2025-07-23 14:28:58,670 - root - INFO -  Total optimization steps = 95,366 (99,998,498,816 tokens)
310
+ [titan] 2025-07-23 14:28:58,670 - root - INFO -  Warmup steps = 100 (104,857,600 tokens)
311
+ [titan] 2025-07-23 14:28:58,671 - root - INFO -  Number of parameters = 382,387,712 
312
+ [titan] 2025-07-23 14:28:58,671 - root - INFO - Profiling active. Traces will be saved at /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/profile_trace
313
+ /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/functions.py:1263: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
314
+ If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
315
+ If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
316
+ torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
317
+ [rank1]: Traceback (most recent call last):
318
+ [rank1]: File "<frozen runpy>", line 198, in _run_module_as_main
319
+ [rank1]: File "<frozen runpy>", line 88, in _run_code
320
+ [rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
321
+ [rank1]: main(config)
322
+ [rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
323
+ [rank1]: return f(*args, **kwargs)
324
+ [rank1]: ^^^^^^^^^^^^^^^^^^
325
+ [rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 487, in main
326
+ [rank1]: output = model(
327
+ [rank1]: ^^^^^^
328
+ [rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
329
+ [rank1]: return self._call_impl(*args, **kwargs)
330
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
331
+ [rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1857, in _call_impl
332
+ [rank1]: return inner()
333
+ [rank1]: ^^^^^^^
334
+ [rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1805, in inner
335
+ [rank1]: result = forward_call(*args, **kwargs)
336
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
337
+ [rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func
338
+ [rank1]: return func(*args, **kwargs)
339
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^
340
+ [rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 526, in forward
341
+ [rank1]: outputs = self.backbone(
342
+ [rank1]: ^^^^^^^^^^^^^^
343
+ [rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
344
+ [rank1]: return self._call_impl(*args, **kwargs)
345
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
346
+ [rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
347
+ [rank1]: return forward_call(*args, **kwargs)
348
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
349
+ [rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 405, in forward
350
+ [rank1]: hidden_states = mixer_block(
351
+ [rank1]: ^^^^^^^^^^^^
352
+ [rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
353
+ [rank1]: return self._call_impl(*args, **kwargs)
354
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
355
+ [rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1857, in _call_impl
356
+ [rank1]: return inner()
357
+ [rank1]: ^^^^^^^
358
+ [rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1805, in inner
359
+ [rank1]: result = forward_call(*args, **kwargs)
360
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
361
+ [rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py", line 655, in _fn
362
+ [rank1]: return fn(*args, **kwargs)
363
+ [rank1]: ^^^^^^^^^^^^^^^^^^^
364
+ [rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
365
+ [rank1]: return self._call_impl(*args, **kwargs)
366
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
367
+ [rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
368
+ [rank1]: return forward_call(*args, **kwargs)
369
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
370
+ [rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 161, in forward
371
+ [rank1]: hidden_states = self.norm(hidden_states)
372
+ [rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 165, in torch_dynamo_resume_in_forward_at_161
373
+ [rank1]: hidden_states = self.mixer(
374
+ [rank1]: ^^^^^^^^^^^
375
+ [rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
376
+ [rank1]: return self._call_impl(*args, **kwargs)
377
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
378
+ [rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
379
+ [rank1]: return forward_call(*args, **kwargs)
380
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
381
+ [rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/mamba2.py", line 601, in forward
382
+ [rank1]: return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask)
383
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
384
+ [rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/mamba2.py", line 528, in torch_forward
385
+ [rank1]: G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :]
386
+ [rank1]: ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~
387
+ [rank1]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 GiB. GPU 1 has a total capacity of 95.00 GiB of which 85.09 GiB is free. Process 2003901 has 316.00 MiB memory in use. Process 696029 has 316.00 MiB memory in use. Process 1114694 has 9.27 GiB memory in use. Of the allocated memory 7.99 GiB is allocated by PyTorch, and 73.66 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/1/stdout.log ADDED
File without changes
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/2/error.json ADDED
@@ -0,0 +1 @@
+ {"message": {"message": "OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 GiB. GPU 2 has a total capacity of 95.00 GiB of which 85.09 GiB is free. Process 2003900 has 316.00 MiB memory in use. Process 696030 has 316.00 MiB memory in use. Process 1114695 has 9.27 GiB memory in use. Of the allocated memory 7.99 GiB is allocated by PyTorch, and 73.66 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)", "extraInfo": {"py_callstack": "Traceback (most recent call last):\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py\", line 355, in wrapper\n return f(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py\", line 487, in main\n output = model(\n ^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1857, in _call_impl\n return inner()\n ^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1805, in inner\n result = forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/utils/deprecation.py\", line 172, in wrapped_func\n return func(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py\", line 526, in forward\n outputs = self.backbone(\n ^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py\", line 405, in forward\n hidden_states = mixer_block(\n ^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, 
**kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1857, in _call_impl\n return inner()\n ^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1805, in inner\n result = forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py\", line 655, in _fn\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py\", line 161, in forward\n hidden_states = self.norm(hidden_states)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py\", line 165, in torch_dynamo_resume_in_forward_at_161\n hidden_states = self.mixer(\n ^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/mamba2.py\", line 601, in forward\n return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/mamba2.py\", line 528, in torch_forward\n G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :]\n ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~\ntorch.OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 GiB. GPU 2 has a total capacity of 95.00 GiB of which 85.09 GiB is free. Process 2003900 has 316.00 MiB memory in use. Process 696030 has 316.00 MiB memory in use. Process 1114695 has 9.27 GiB memory in use. Of the allocated memory 7.99 GiB is allocated by PyTorch, and 73.66 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)\n", "timestamp": "1753252283"}}}
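The allocator hint repeated in these messages can be applied before relaunching, though it only mitigates fragmentation; a single 256 GiB request can never fit on a 95 GiB device, so the per-device workload (batch size, context length) or the fallback code path itself still has to change. A minimal sketch that sets the variable the log names before CUDA is initialized:

```python
# Hedged sketch: apply the PYTORCH_CUDA_ALLOC_CONF hint from the OOM message.
# This only reduces fragmentation; it cannot satisfy a 256 GiB allocation.
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import torch  # import after setting the variable so the allocator sees it
print(torch.cuda.is_available())
```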
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/2/stderr.log ADDED
@@ -0,0 +1,387 @@
+ [titan] 2025-07-23 14:27:46,385 - root - INFO - Starting job: default job
+ [titan] 2025-07-23 14:27:46,386 - root - INFO - {
+ "activation_checkpoint": {
+ "mode": "none",
+ "selective_ac_option": "2"
+ },
+ "activation_offload": {
+ "mode": "none"
+ },
+ "checkpoint": {
+ "async_mode": "disabled",
+ "create_seed_checkpoint": false,
+ "enable_checkpoint": true,
+ "exclude_from_loading": [],
+ "export_dtype": "float32",
+ "folder": "checkpoint",
+ "interval": 8192,
+ "interval_type": "steps",
+ "keep_latest_k": 100,
+ "load_step": -1,
+ "model_weights_only": false
+ },
+ "comm": {
+ "init_timeout_seconds": 300,
+ "trace_buf_size": 20000,
+ "train_timeout_seconds": 100
+ },
+ "experimental": {
+ "context_parallel_degree": 1,
+ "context_parallel_rotate_method": "allgather",
+ "custom_model_path": "",
+ "enable_async_tensor_parallel": false,
+ "enable_compiled_autograd": false,
+ "pipeline_parallel_degree": 1,
+ "pipeline_parallel_microbatches": null,
+ "pipeline_parallel_schedule": "1F1B",
+ "pipeline_parallel_schedule_csv": "",
+ "pipeline_parallel_split_points": []
+ },
+ "fault_tolerance": {
+ "enable": false,
+ "group_size": 0,
+ "min_replica_size": 1,
+ "replica_id": 0
+ },
+ "float8": {
+ "enable_fsdp_float8_all_gather": false,
+ "force_recompute_fp8_weight_in_bwd": false,
+ "precompute_float8_dynamic_scale_for_fsdp": false,
+ "recipe_name": null
+ },
+ "job": {
+ "config_file": "flame/models/fla.toml",
+ "description": "default job",
+ "dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2",
+ "print_args": true,
+ "use_for_integration_test": false
+ },
+ "lr_scheduler": {
+ "decay_ratio": 1.0,
+ "decay_type": "linear",
+ "lr_min": 0.01,
+ "warmup_steps": 100
+ },
+ "memory_estimation": {
+ "disable_fake_mode": false,
+ "enabled": false
+ },
+ "metrics": {
+ "disable_color_printing": false,
+ "enable_tensorboard": true,
+ "enable_wandb": true,
+ "log_freq": 1,
+ "save_for_all_ranks": false,
+ "save_tb_folder": "tb"
+ },
+ "model": {
+ "config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/mamba2_6_1_340M.json",
+ "converters": [],
+ "name": "fla",
+ "print_after_conversion": false,
+ "tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
+ },
+ "optimizer": {
+ "early_step_in_backward": false,
+ "eps": 1e-08,
+ "implementation": "fused",
+ "lr": 0.0003,
+ "name": "AdamW"
+ },
+ "profiling": {
+ "enable_memory_snapshot": false,
+ "enable_profiling": true,
+ "profile_freq": 512,
+ "save_memory_snapshot_folder": "memory_snapshot",
+ "save_traces_folder": "profile_trace"
+ },
+ "training": {
+ "batch_size": 8,
+ "compile": true,
+ "context_len": 8192,
+ "data_dir": null,
+ "data_files": null,
+ "data_parallel_replicate_degree": 1,
+ "data_parallel_shard_degree": -1,
+ "data_probs": "0.55,0.3,0.15",
+ "dataset": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro",
+ "dataset_name": "default,default,default",
+ "dataset_split": "train,train,train",
+ "deterministic": false,
+ "disable_loss_parallel": false,
+ "enable_cpu_offload": false,
+ "fsdp_reshard_after_forward": "default",
+ "gc_freq": 50,
+ "gradient_accumulation_steps": 2,
+ "max_norm": 1.0,
+ "mixed_precision_param": "bfloat16",
+ "mixed_precision_reduce": "float32",
+ "num_workers": 32,
+ "persistent_workers": false,
+ "pin_memory": false,
+ "prefetch_factor": 2,
+ "seed": 42,
+ "seq_len": 8192,
+ "skip_nan_inf": true,
+ "steps": 95366,
+ "streaming": true,
+ "tensor_parallel_degree": 1,
+ "varlen": false
+ }
+ }
+ [titan] 2025-07-23 14:27:46,386 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
+ [titan] 2025-07-23 14:27:47,325 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
+ [titan] 2025-07-23 14:27:47,327 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
+ [titan] 2025-07-23 14:27:47,375 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
+ [titan] 2025-07-23 14:27:47,375 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
+ [titan] 2025-07-23 14:27:47,376 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
+ [titan] 2025-07-23 14:27:47,418 - root - INFO - Loading tokenizer...
+ [titan] 2025-07-23 14:27:47,997 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
+ 0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ }
+ )
+ [titan] 2025-07-23 14:27:47,998 - root - INFO - Loading dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro:default,default,default
+ `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ [titan] 2025-07-23 14:27:47,998 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ [titan] 2025-07-23 14:27:48,543 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample:default (p = 0.550):
+ IterableDataset({
+ features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
+ num_shards: 140
+ })
+ [titan] 2025-07-23 14:27:48,543 - root - INFO - Shuffling the dataset with seed 42
+ [titan] 2025-07-23 14:27:48,544 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample has insufficient shards (140). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
+ `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ [titan] 2025-07-23 14:27:48,544 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ [titan] 2025-07-23 14:28:39,999 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ [titan] 2025-07-23 14:28:40,032 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged:default (p = 0.300):
+ IterableDataset({
+ features: ['repo', 'content'],
+ num_shards: 1
+ })
+ [titan] 2025-07-23 14:28:40,032 - root - INFO - Shuffling the dataset with seed 42
+ [titan] 2025-07-23 14:28:40,032 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged has insufficient shards (1). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
+ `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ [titan] 2025-07-23 14:28:40,032 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ [titan] 2025-07-23 14:28:40,289 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ [titan] 2025-07-23 14:28:40,382 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro:default (p = 0.150):
+ IterableDataset({
+ features: ['text', 'cc-path', 'domain', 'lang', 'lang_score', 'timestamp', 'url', 'math_score'],
+ num_shards: 100
+ })
+ [titan] 2025-07-23 14:28:40,382 - root - INFO - Shuffling the dataset with seed 42
+ [titan] 2025-07-23 14:28:40,382 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro has insufficient shards (100). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
+ `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ [titan] 2025-07-23 14:28:40,382 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ [titan] 2025-07-23 14:28:46,791 - root - INFO - Interleaving 3 datasets with probabilities [0.55, 0.3, 0.15]
+ [titan] 2025-07-23 14:28:47,494 - root - INFO - IterableDataset({
+ features: ['text', 'content'],
+ num_shards: 256
+ })
+ [titan] 2025-07-23 14:28:47,612 - root - INFO - Building dataloader...
+ [titan] 2025-07-23 14:28:47,614 - root - INFO - Loading model config from /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/mamba2_6_1_340M.json
+ [titan] 2025-07-23 14:28:47,616 - root - INFO - Building model from the config
+ Mamba2Config {
+ "architectures": [
+ "Mamba2ForCausalLM"
+ ],
+ "attn": {
+ "layers": [
+ 5,
+ 11,
+ 17,
+ 23
+ ],
+ "num_heads": 16,
+ "num_kv_heads": 8,
+ "qkv_bias": false,
+ "rope_theta": 160000.0,
+ "window_size": null
+ },
+ "attn_mode": "chunk",
+ "bos_token_id": 1,
+ "chunk_size": 256,
+ "conv_kernel": 4,
+ "eos_token_id": 2,
+ "expand": 2,
+ "fuse_cross_entropy": true,
+ "fuse_norm": true,
+ "fuse_swiglu": true,
+ "head_dim": 64,
+ "hidden_act": "silu",
+ "hidden_size": 1024,
+ "initializer_range": 0.02,
+ "model_type": "mamba2",
+ "n_groups": 1,
+ "norm_eps": 1e-05,
+ "num_heads": 32,
+ "num_hidden_layers": 48,
+ "pad_token_id": 0,
+ "rescale_prenorm_residual": true,
+ "residual_in_fp32": true,
+ "rms_norm": true,
+ "state_size": 128,
+ "tie_word_embeddings": false,
+ "time_step_floor": 0.0001,
+ "time_step_limit": [
+ 0.0,
+ Infinity
+ ],
+ "time_step_max": 0.1,
+ "time_step_min": 0.001,
+ "time_step_rank": 128,
+ "transformers_version": "4.53.3",
+ "use_bias": false,
+ "use_cache": true,
+ "use_conv_bias": true,
+ "use_l2warp": false,
+ "vocab_size": 32000
+ }
+ 
+ [titan] 2025-07-23 14:28:50,147 - fla.layers.mamba2 - WARNING - The fast path is not available because one of `(selective_state_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation
+ [titan] 2025-07-23 14:28:50,148 - fla.layers.mamba2 - WARNING - The CUDA backend is not available because `causal_conv1d` is None. Falling back to the Triton backend. To install follow https://github.com/Dao-AILab/causal-conv1d
+ [titan] 2025-07-23 14:28:50,264 - root - INFO - 
+ Mamba2ForCausalLM(
+ (backbone): Mamba2Model(
+ (embeddings): Embedding(32000, 1024)
+ (layers): ModuleList(
+ (0-47): 48 x Mamba2Block(
+ (norm): RMSNorm(1024, eps=1e-05)
+ (mixer): Mamba2(
+ (conv1d): Conv1d(2304, 2304, kernel_size=(4,), stride=(1,), padding=(3,), groups=2304)
+ (in_proj): Linear(in_features=1024, out_features=4384, bias=False)
+ (norm): RMSNormGated()
+ (out_proj): Linear(in_features=2048, out_features=1024, bias=False)
+ )
+ )
+ )
+ (norm_f): RMSNorm(1024, eps=1e-05)
+ )
+ (lm_head): Linear(in_features=1024, out_features=32000, bias=False)
+ (criterion): FusedLinearCrossEntropyLoss()
+ )
+
+ [titan] 2025-07-23 14:28:50,315 - root - INFO - Compiling each block with torch.compile
+ [titan] 2025-07-23 14:28:50,316 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
+ [titan] 2025-07-23 14:28:50,316 - root - WARNING - No norm found in model
+ [titan] 2025-07-23 14:28:50,316 - root - INFO - Compiling the entire model with torch.compile
+ [titan] 2025-07-23 14:28:50,539 - root - INFO - Applied FSDP to the model
+ [titan] 2025-07-23 14:28:50,882 - fla.models.mamba2.modeling_mamba2 - WARNING - `A_log` is a DTensor, skipping initialization
+ [titan] 2025-07-23 14:28:51,042 - fla.models.mamba2.modeling_mamba2 - WARNING - `dt_bias` is a DTensor, skipping initialization
+ [titan] 2025-07-23 14:28:51,274 - root - INFO - CUDA memory usage for model: 0.19GiB(0.20%)
+ [titan] 2025-07-23 14:28:51,275 - root - WARNING - Warmup (100) + decay (95366) steps exceed total training steps (95366). Adjusting decay steps to 95266.
+ [titan] 2025-07-23 14:28:51,299 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/checkpoint
+ [titan] 2025-07-23 14:28:51,307 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
+ [titan] 2025-07-23 14:28:51,458 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
+ [titan] 2025-07-23 14:28:58,658 - root - INFO - ***** Running training *****
+ [titan] 2025-07-23 14:28:58,660 - root - INFO -  Training starts at step 1
+ [titan] 2025-07-23 14:28:58,664 - root - INFO -  Number of tokens per sequence = 8,192
+ [titan] 2025-07-23 14:28:58,664 - root - INFO -  Gradient Accumulation steps = 2
+ [titan] 2025-07-23 14:28:58,665 - root - INFO -  Instantaneous batch size (per device) = 8
+ [titan] 2025-07-23 14:28:58,665 - root - INFO -  Global batch size (w. parallel, distributed & accumulation) = 128 (1,048,576 tokens)
+ [titan] 2025-07-23 14:28:58,665 - root - INFO -  Total optimization steps = 95,366 (99,998,498,816 tokens)
+ [titan] 2025-07-23 14:28:58,665 - root - INFO -  Warmup steps = 100 (104,857,600 tokens)
+ [titan] 2025-07-23 14:28:58,666 - root - INFO -  Number of parameters = 382,387,712 
+ [titan] 2025-07-23 14:28:58,666 - root - INFO - Profiling active. Traces will be saved at /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/profile_trace
+ /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/functions.py:1263: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
+ If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
+ If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
+ torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
+ [rank2]: Traceback (most recent call last):
+ [rank2]: File "<frozen runpy>", line 198, in _run_module_as_main
+ [rank2]: File "<frozen runpy>", line 88, in _run_code
+ [rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
+ [rank2]: main(config)
+ [rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
+ [rank2]: return f(*args, **kwargs)
+ [rank2]: ^^^^^^^^^^^^^^^^^^
+ [rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 487, in main
+ [rank2]: output = model(
+ [rank2]: ^^^^^^
+ [rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+ [rank2]: return self._call_impl(*args, **kwargs)
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1857, in _call_impl
+ [rank2]: return inner()
+ [rank2]: ^^^^^^^
+ [rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1805, in inner
+ [rank2]: result = forward_call(*args, **kwargs)
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func
+ [rank2]: return func(*args, **kwargs)
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^
+ [rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 526, in forward
+ [rank2]: outputs = self.backbone(
+ [rank2]: ^^^^^^^^^^^^^^
+ [rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+ [rank2]: return self._call_impl(*args, **kwargs)
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
+ [rank2]: return forward_call(*args, **kwargs)
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 405, in forward
+ [rank2]: hidden_states = mixer_block(
+ [rank2]: ^^^^^^^^^^^^
+ [rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+ [rank2]: return self._call_impl(*args, **kwargs)
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1857, in _call_impl
+ [rank2]: return inner()
+ [rank2]: ^^^^^^^
+ [rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1805, in inner
+ [rank2]: result = forward_call(*args, **kwargs)
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py", line 655, in _fn
+ [rank2]: return fn(*args, **kwargs)
+ [rank2]: ^^^^^^^^^^^^^^^^^^^
+ [rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+ [rank2]: return self._call_impl(*args, **kwargs)
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
+ [rank2]: return forward_call(*args, **kwargs)
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 161, in forward
+ [rank2]: hidden_states = self.norm(hidden_states)
+ [rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 165, in torch_dynamo_resume_in_forward_at_161
+ [rank2]: hidden_states = self.mixer(
+ [rank2]: ^^^^^^^^^^^
+ [rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+ [rank2]: return self._call_impl(*args, **kwargs)
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
+ [rank2]: return forward_call(*args, **kwargs)
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/mamba2.py", line 601, in forward
+ [rank2]: return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask)
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/mamba2.py", line 528, in torch_forward
+ [rank2]: G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :]
+ [rank2]: ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~
+ [rank2]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 GiB. GPU 2 has a total capacity of 95.00 GiB of which 85.09 GiB is free. Process 2003900 has 316.00 MiB memory in use. Process 696030 has 316.00 MiB memory in use. Process 1114695 has 9.27 GiB memory in use. Of the allocated memory 7.99 GiB is allocated by PyTorch, and 73.66 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
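The two `fla.layers.mamba2` warnings earlier in this log explain why execution reaches the memory-hungry `torch_forward` at all: with `selective_state_update` and `causal_conv1d` unavailable, the layer falls back to the naive implementation whose broadcast fails above. A quick probe for the optional kernels, assuming the import paths of the upstream packages linked in the warnings (an assumption; adjust to the installed versions):

```python
# Hedged availability check for the fused kernels named in the warnings.
try:
    from mamba_ssm.ops.triton.selective_state_update import selective_state_update
except ImportError:
    selective_state_update = None  # -> "The fast path is not available" warning
try:
    from causal_conv1d import causal_conv1d_fn
except ImportError:
    causal_conv1d_fn = None        # -> fallback to the Triton conv backend
print("selective_state_update:", "ok" if selective_state_update else "missing")
print("causal_conv1d_fn:", "ok" if causal_conv1d_fn else "missing")
```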
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/2/stdout.log ADDED
File without changes
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/3/error.json ADDED
@@ -0,0 +1 @@
+ {"message": {"message": "OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 GiB. GPU 3 has a total capacity of 95.00 GiB of which 85.09 GiB is free. Process 2003902 has 316.00 MiB memory in use. Process 696032 has 316.00 MiB memory in use. Process 1114696 has 9.27 GiB memory in use. Of the allocated memory 7.99 GiB is allocated by PyTorch, and 73.66 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)", "extraInfo": {"py_callstack": "Traceback (most recent call last):\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py\", line 355, in wrapper\n return f(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py\", line 487, in main\n output = model(\n ^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1857, in _call_impl\n return inner()\n ^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1805, in inner\n result = forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/utils/deprecation.py\", line 172, in wrapped_func\n return func(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py\", line 526, in forward\n outputs = self.backbone(\n ^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py\", line 405, in forward\n hidden_states = mixer_block(\n ^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, 
**kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1857, in _call_impl\n return inner()\n ^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1805, in inner\n result = forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py\", line 655, in _fn\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py\", line 161, in forward\n hidden_states = self.norm(hidden_states)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py\", line 165, in torch_dynamo_resume_in_forward_at_161\n hidden_states = self.mixer(\n ^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/mamba2.py\", line 601, in forward\n return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/mamba2.py\", line 528, in torch_forward\n G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :]\n ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~\ntorch.OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 GiB. GPU 3 has a total capacity of 95.00 GiB of which 85.09 GiB is free. Process 2003902 has 316.00 MiB memory in use. Process 696032 has 316.00 MiB memory in use. Process 1114696 has 9.27 GiB memory in use. Of the allocated memory 7.99 GiB is allocated by PyTorch, and 73.66 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)\n", "timestamp": "1753252283"}}}
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/3/stderr.log ADDED
@@ -0,0 +1,387 @@
1
+ [titan] 2025-07-23 14:27:46,466 - root - INFO - Starting job: default job
2
+ [titan] 2025-07-23 14:27:46,466 - root - INFO - {
3
+ "activation_checkpoint": {
4
+ "mode": "none",
5
+ "selective_ac_option": "2"
6
+ },
7
+ "activation_offload": {
8
+ "mode": "none"
9
+ },
10
+ "checkpoint": {
11
+ "async_mode": "disabled",
12
+ "create_seed_checkpoint": false,
13
+ "enable_checkpoint": true,
14
+ "exclude_from_loading": [],
15
+ "export_dtype": "float32",
16
+ "folder": "checkpoint",
17
+ "interval": 8192,
18
+ "interval_type": "steps",
19
+ "keep_latest_k": 100,
20
+ "load_step": -1,
21
+ "model_weights_only": false
22
+ },
23
+ "comm": {
24
+ "init_timeout_seconds": 300,
25
+ "trace_buf_size": 20000,
26
+ "train_timeout_seconds": 100
27
+ },
28
+ "experimental": {
29
+ "context_parallel_degree": 1,
30
+ "context_parallel_rotate_method": "allgather",
31
+ "custom_model_path": "",
32
+ "enable_async_tensor_parallel": false,
33
+ "enable_compiled_autograd": false,
34
+ "pipeline_parallel_degree": 1,
35
+ "pipeline_parallel_microbatches": null,
36
+ "pipeline_parallel_schedule": "1F1B",
37
+ "pipeline_parallel_schedule_csv": "",
38
+ "pipeline_parallel_split_points": []
39
+ },
40
+ "fault_tolerance": {
41
+ "enable": false,
42
+ "group_size": 0,
43
+ "min_replica_size": 1,
44
+ "replica_id": 0
45
+ },
46
+ "float8": {
47
+ "enable_fsdp_float8_all_gather": false,
48
+ "force_recompute_fp8_weight_in_bwd": false,
49
+ "precompute_float8_dynamic_scale_for_fsdp": false,
50
+ "recipe_name": null
51
+ },
52
+ "job": {
53
+ "config_file": "flame/models/fla.toml",
54
+ "description": "default job",
55
+ "dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2",
56
+ "print_args": true,
57
+ "use_for_integration_test": false
58
+ },
59
+ "lr_scheduler": {
60
+ "decay_ratio": 1.0,
61
+ "decay_type": "linear",
62
+ "lr_min": 0.01,
63
+ "warmup_steps": 100
64
+ },
65
+ "memory_estimation": {
66
+ "disable_fake_mode": false,
67
+ "enabled": false
68
+ },
69
+ "metrics": {
70
+ "disable_color_printing": false,
71
+ "enable_tensorboard": true,
72
+ "enable_wandb": true,
73
+ "log_freq": 1,
74
+ "save_for_all_ranks": false,
75
+ "save_tb_folder": "tb"
76
+ },
77
+ "model": {
78
+ "config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/mamba2_6_1_340M.json",
79
+ "converters": [],
80
+ "name": "fla",
81
+ "print_after_conversion": false,
82
+ "tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
83
+ },
84
+ "optimizer": {
85
+ "early_step_in_backward": false,
86
+ "eps": 1e-08,
87
+ "implementation": "fused",
88
+ "lr": 0.0003,
89
+ "name": "AdamW"
90
+ },
91
+ "profiling": {
92
+ "enable_memory_snapshot": false,
93
+ "enable_profiling": true,
94
+ "profile_freq": 512,
95
+ "save_memory_snapshot_folder": "memory_snapshot",
96
+ "save_traces_folder": "profile_trace"
97
+ },
98
+ "training": {
99
+ "batch_size": 8,
100
+ "compile": true,
101
+ "context_len": 8192,
102
+ "data_dir": null,
103
+ "data_files": null,
104
+ "data_parallel_replicate_degree": 1,
105
+ "data_parallel_shard_degree": -1,
106
+ "data_probs": "0.55,0.3,0.15",
107
+ "dataset": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro",
108
+ "dataset_name": "default,default,default",
109
+ "dataset_split": "train,train,train",
110
+ "deterministic": false,
111
+ "disable_loss_parallel": false,
112
+ "enable_cpu_offload": false,
113
+ "fsdp_reshard_after_forward": "default",
114
+ "gc_freq": 50,
115
+ "gradient_accumulation_steps": 2,
116
+ "max_norm": 1.0,
117
+ "mixed_precision_param": "bfloat16",
118
+ "mixed_precision_reduce": "float32",
119
+ "num_workers": 32,
120
+ "persistent_workers": false,
121
+ "pin_memory": false,
122
+ "prefetch_factor": 2,
123
+ "seed": 42,
124
+ "seq_len": 8192,
125
+ "skip_nan_inf": true,
126
+ "steps": 95366,
127
+ "streaming": true,
128
+ "tensor_parallel_degree": 1,
129
+ "varlen": false
130
+ }
131
+ }
132
+ [titan] 2025-07-23 14:27:46,466 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
133
+ [titan] 2025-07-23 14:27:47,421 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
134
+ [titan] 2025-07-23 14:27:47,423 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
135
+ [titan] 2025-07-23 14:27:47,484 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
136
+ [titan] 2025-07-23 14:27:47,485 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
137
+ [titan] 2025-07-23 14:27:47,485 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
138
+ [titan] 2025-07-23 14:27:47,493 - root - INFO - Loading tokenizer...
139
+ [titan] 2025-07-23 14:27:47,997 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
140
+ 0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
141
+ 1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
142
+ 2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
143
+ }
144
+ )
145
+ [titan] 2025-07-23 14:27:47,998 - root - INFO - Loading dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro:default,default,default
146
+ `trust_remote_code` is not supported anymore.
147
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
148
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
149
+ [titan] 2025-07-23 14:27:47,999 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
150
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
151
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
152
+ [titan] 2025-07-23 14:27:48,494 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample:default (p = 0.550):
153
+ IterableDataset({
154
+ features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
155
+ num_shards: 140
156
+ })
157
+ [titan] 2025-07-23 14:27:48,494 - root - INFO - Shuffling the dataset with seed 42
158
+ [titan] 2025-07-23 14:27:48,494 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample has insufficient shards (140). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
159
+ `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ [titan] 2025-07-23 14:27:48,494 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ [titan] 2025-07-23 14:28:39,997 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ [titan] 2025-07-23 14:28:40,028 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged:default (p = 0.300):
+ IterableDataset({
+ features: ['repo', 'content'],
+ num_shards: 1
+ })
+ [titan] 2025-07-23 14:28:40,029 - root - INFO - Shuffling the dataset with seed 42
+ [titan] 2025-07-23 14:28:40,029 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged has insufficient shards (1). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
+ `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ [titan] 2025-07-23 14:28:40,029 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ [titan] 2025-07-23 14:28:40,298 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ [titan] 2025-07-23 14:28:40,391 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro:default (p = 0.150):
+ IterableDataset({
+ features: ['text', 'cc-path', 'domain', 'lang', 'lang_score', 'timestamp', 'url', 'math_score'],
+ num_shards: 100
+ })
+ [titan] 2025-07-23 14:28:40,392 - root - INFO - Shuffling the dataset with seed 42
+ [titan] 2025-07-23 14:28:40,392 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro has insufficient shards (100). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
+ `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ [titan] 2025-07-23 14:28:40,392 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ [titan] 2025-07-23 14:28:46,760 - root - INFO - Interleaving 3 datasets with probabilities [0.55, 0.3, 0.15]
+ [titan] 2025-07-23 14:28:47,557 - root - INFO - IterableDataset({
+ features: ['text', 'content'],
+ num_shards: 256
+ })
+ [titan] 2025-07-23 14:28:47,677 - root - INFO - Building dataloader...
+ [titan] 2025-07-23 14:28:47,680 - root - INFO - Loading model config from /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/mamba2_6_1_340M.json
+ [titan] 2025-07-23 14:28:47,681 - root - INFO - Building model from the config
+ Mamba2Config {
+ "architectures": [
+ "Mamba2ForCausalLM"
+ ],
+ "attn": {
+ "layers": [
+ 5,
+ 11,
+ 17,
+ 23
+ ],
+ "num_heads": 16,
+ "num_kv_heads": 8,
+ "qkv_bias": false,
+ "rope_theta": 160000.0,
+ "window_size": null
+ },
+ "attn_mode": "chunk",
+ "bos_token_id": 1,
+ "chunk_size": 256,
+ "conv_kernel": 4,
+ "eos_token_id": 2,
+ "expand": 2,
+ "fuse_cross_entropy": true,
+ "fuse_norm": true,
+ "fuse_swiglu": true,
+ "head_dim": 64,
+ "hidden_act": "silu",
+ "hidden_size": 1024,
+ "initializer_range": 0.02,
+ "model_type": "mamba2",
+ "n_groups": 1,
+ "norm_eps": 1e-05,
+ "num_heads": 32,
+ "num_hidden_layers": 48,
+ "pad_token_id": 0,
+ "rescale_prenorm_residual": true,
+ "residual_in_fp32": true,
+ "rms_norm": true,
+ "state_size": 128,
+ "tie_word_embeddings": false,
+ "time_step_floor": 0.0001,
+ "time_step_limit": [
+ 0.0,
+ Infinity
+ ],
+ "time_step_max": 0.1,
+ "time_step_min": 0.001,
+ "time_step_rank": 128,
+ "transformers_version": "4.53.3",
+ "use_bias": false,
+ "use_cache": true,
+ "use_conv_bias": true,
+ "use_l2warp": false,
+ "vocab_size": 32000
+ }
+
+ [titan] 2025-07-23 14:28:50,147 - fla.layers.mamba2 - WARNING - The fast path is not available because one of `(selective_state_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation
+ [titan] 2025-07-23 14:28:50,148 - fla.layers.mamba2 - WARNING - The CUDA backend is not available because `causal_conv1d` is None. Falling back to the Triton backend. To install follow https://github.com/Dao-AILab/causal-conv1d
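These two warnings mean the fused `mamba-ssm`/`causal-conv1d` kernels are not installed, so the mixer runs fla's pure-PyTorch `torch_forward`; that is the same frame that raises the out-of-memory error further down. A sketch of the availability probe behind the warnings (the package names come from the URLs in the log; the exact import guard inside fla may differ):

```python
# Probe for the optional fused kernels, mirroring the two warnings above.
try:
    from mamba_ssm.ops.triton.selective_state_update import selective_state_update
except ImportError:          # -> "The fast path is not available ..."
    selective_state_update = None

try:
    import causal_conv1d     # -> "The CUDA backend is not available ..."
except ImportError:
    causal_conv1d = None

# With both missing, the layer falls back to the naive implementation; installing
# them (`pip install mamba-ssm causal-conv1d`) restores the memory-efficient path.
```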
+ [titan] 2025-07-23 14:28:50,265 - root - INFO -
+ Mamba2ForCausalLM(
+ (backbone): Mamba2Model(
+ (embeddings): Embedding(32000, 1024)
+ (layers): ModuleList(
+ (0-47): 48 x Mamba2Block(
+ (norm): RMSNorm(1024, eps=1e-05)
+ (mixer): Mamba2(
+ (conv1d): Conv1d(2304, 2304, kernel_size=(4,), stride=(1,), padding=(3,), groups=2304)
+ (in_proj): Linear(in_features=1024, out_features=4384, bias=False)
+ (norm): RMSNormGated()
+ (out_proj): Linear(in_features=2048, out_features=1024, bias=False)
+ )
+ )
+ )
+ (norm_f): RMSNorm(1024, eps=1e-05)
+ )
+ (lm_head): Linear(in_features=1024, out_features=32000, bias=False)
+ (criterion): FusedLinearCrossEntropyLoss()
+ )
+
+ [titan] 2025-07-23 14:28:50,322 - root - INFO - Compiling each block with torch.compile
+ [titan] 2025-07-23 14:28:50,322 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
+ [titan] 2025-07-23 14:28:50,322 - root - WARNING - No norm found in model
+ [titan] 2025-07-23 14:28:50,323 - root - INFO - Compiling the entire model with torch.compile
+ [titan] 2025-07-23 14:28:50,540 - root - INFO - Applied FSDP to the model
+ [titan] 2025-07-23 14:28:50,882 - fla.models.mamba2.modeling_mamba2 - WARNING - `A_log` is a DTensor, skipping initialization
+ [titan] 2025-07-23 14:28:51,042 - fla.models.mamba2.modeling_mamba2 - WARNING - `dt_bias` is a DTensor, skipping initialization
+ [titan] 2025-07-23 14:28:51,272 - root - INFO - CUDA memory usage for model: 0.19GiB(0.20%)
+ [titan] 2025-07-23 14:28:51,273 - root - WARNING - Warmup (100) + decay (95366) steps exceed total training steps (95366). Adjusting decay steps to 95266.
+ [titan] 2025-07-23 14:28:51,297 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/checkpoint
+ [titan] 2025-07-23 14:28:51,324 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
+ [titan] 2025-07-23 14:28:51,477 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
+ [titan] 2025-07-23 14:28:58,659 - root - INFO - ***** Running training *****
+ [titan] 2025-07-23 14:28:58,661 - root - INFO - Training starts at step 1
+ [titan] 2025-07-23 14:28:58,663 - root - INFO - Number of tokens per sequence = 8,192
+ [titan] 2025-07-23 14:28:58,663 - root - INFO - Gradient Accumulation steps = 2
+ [titan] 2025-07-23 14:28:58,664 - root - INFO - Instantaneous batch size (per device) = 8
+ [titan] 2025-07-23 14:28:58,664 - root - INFO - Global batch size (w. parallel, distributed & accumulation) = 128 (1,048,576 tokens)
+ [titan] 2025-07-23 14:28:58,664 - root - INFO - Total optimization steps = 95,366 (99,998,498,816 tokens)
+ [titan] 2025-07-23 14:28:58,664 - root - INFO - Warmup steps = 100 (104,857,600 tokens)
+ [titan] 2025-07-23 14:28:58,664 - root - INFO - Number of parameters = 382,387,712
+ [titan] 2025-07-23 14:28:58,665 - root - INFO - Profiling active. Traces will be saved at /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/profile_trace
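The token counts in this header follow directly from the run config; a quick check with the numbers from the log:

```python
per_device_bs, dp_ranks, grad_accum, seq_len = 8, 8, 2, 8192   # from the job config

global_bs = per_device_bs * dp_ranks * grad_accum   # 128 sequences per step
tokens_per_step = global_bs * seq_len               # 1,048,576 tokens per step
assert tokens_per_step == 1_048_576

assert 95_366 * tokens_per_step == 99_998_498_816   # total steps -> ~100B tokens
assert 100 * tokens_per_step == 104_857_600         # warmup steps
```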
+ /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/functions.py:1263: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
+ If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
+ If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
+ torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
+ [rank3]: Traceback (most recent call last):
+ [rank3]: File "<frozen runpy>", line 198, in _run_module_as_main
+ [rank3]: File "<frozen runpy>", line 88, in _run_code
+ [rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
+ [rank3]: main(config)
+ [rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
+ [rank3]: return f(*args, **kwargs)
+ [rank3]: ^^^^^^^^^^^^^^^^^^
+ [rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 487, in main
+ [rank3]: output = model(
+ [rank3]: ^^^^^^
+ [rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+ [rank3]: return self._call_impl(*args, **kwargs)
+ [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1857, in _call_impl
+ [rank3]: return inner()
+ [rank3]: ^^^^^^^
+ [rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1805, in inner
+ [rank3]: result = forward_call(*args, **kwargs)
+ [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func
+ [rank3]: return func(*args, **kwargs)
+ [rank3]: ^^^^^^^^^^^^^^^^^^^^^
+ [rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 526, in forward
+ [rank3]: outputs = self.backbone(
+ [rank3]: ^^^^^^^^^^^^^^
+ [rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+ [rank3]: return self._call_impl(*args, **kwargs)
+ [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
+ [rank3]: return forward_call(*args, **kwargs)
+ [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 405, in forward
+ [rank3]: hidden_states = mixer_block(
+ [rank3]: ^^^^^^^^^^^^
+ [rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+ [rank3]: return self._call_impl(*args, **kwargs)
+ [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1857, in _call_impl
+ [rank3]: return inner()
+ [rank3]: ^^^^^^^
+ [rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1805, in inner
+ [rank3]: result = forward_call(*args, **kwargs)
+ [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py", line 655, in _fn
+ [rank3]: return fn(*args, **kwargs)
+ [rank3]: ^^^^^^^^^^^^^^^^^^^
+ [rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+ [rank3]: return self._call_impl(*args, **kwargs)
+ [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
+ [rank3]: return forward_call(*args, **kwargs)
+ [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 161, in forward
+ [rank3]: hidden_states = self.norm(hidden_states)
+ [rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 165, in torch_dynamo_resume_in_forward_at_161
+ [rank3]: hidden_states = self.mixer(
+ [rank3]: ^^^^^^^^^^^
+ [rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+ [rank3]: return self._call_impl(*args, **kwargs)
+ [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
+ [rank3]: return forward_call(*args, **kwargs)
+ [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/mamba2.py", line 601, in forward
+ [rank3]: return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask)
+ [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/mamba2.py", line 528, in torch_forward
+ [rank3]: G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :]
+ [rank3]: ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~
+ [rank3]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 GiB. GPU 3 has a total capacity of 95.00 GiB of which 85.09 GiB is free. Process 2003902 has 316.00 MiB memory in use. Process 696032 has 316.00 MiB memory in use. Process 1114696 has 9.27 GiB memory in use. Of the allocated memory 7.99 GiB is allocated by PyTorch, and 73.66 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
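The 256.00 GiB request is exactly the float32 buffer this broadcast materializes. A back-of-the-envelope check, with the shape layout inferred from the indexing in fla/layers/mamba2.py and the config above (seq_len 8192, chunk_size 256, 32 heads, state_size 128):

```python
batch, seq_len, chunk, heads, state = 8, 8192, 256, 32, 128
num_chunks = seq_len // chunk    # 32

# C[:, :, :, None, :, :] * B[:, :, None, :, :, :] broadcasts to a tensor of shape
# [batch, num_chunks, chunk, chunk, heads, state]:
elements = batch * num_chunks * chunk * chunk * heads * state   # 68_719_476_736 = 2**36
assert elements * 4 / 2**30 == 256.0                            # float32 -> 256.00 GiB
```

Every rank materializes the same buffer, which is why the per-rank error.json files in this commit all report the identical 256 GiB failure.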
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/3/stdout.log ADDED
File without changes
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/4/error.json ADDED
@@ -0,0 +1 @@
+ {"message": {"message": "OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 GiB. GPU 4 has a total capacity of 95.00 GiB of which 85.09 GiB is free. Process 2003903 has 316.00 MiB memory in use. Process 696034 has 316.00 MiB memory in use. Process 1114697 has 9.27 GiB memory in use. Of the allocated memory 7.99 GiB is allocated by PyTorch, and 73.66 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)", "extraInfo": {"py_callstack": "Traceback (most recent call last):\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py\", line 355, in wrapper\n return f(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py\", line 487, in main\n output = model(\n ^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1857, in _call_impl\n return inner()\n ^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1805, in inner\n result = forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/utils/deprecation.py\", line 172, in wrapped_func\n return func(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py\", line 526, in forward\n outputs = self.backbone(\n ^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py\", line 405, in forward\n hidden_states = mixer_block(\n ^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1857, in _call_impl\n return inner()\n ^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1805, in inner\n result = forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py\", line 655, in _fn\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py\", line 161, in forward\n hidden_states = self.norm(hidden_states)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py\", line 165, in torch_dynamo_resume_in_forward_at_161\n hidden_states = self.mixer(\n ^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/mamba2.py\", line 601, in forward\n return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/mamba2.py\", line 528, in torch_forward\n G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :]\n ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~\ntorch.OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 GiB. GPU 4 has a total capacity of 95.00 GiB of which 85.09 GiB is free. Process 2003903 has 316.00 MiB memory in use. Process 696034 has 316.00 MiB memory in use. Process 1114697 has 9.27 GiB memory in use. Of the allocated memory 7.99 GiB is allocated by PyTorch, and 73.66 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)\n", "timestamp": "1753252283"}}}
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/4/stderr.log ADDED
@@ -0,0 +1,387 @@
+ [titan] 2025-07-23 14:27:46,452 - root - INFO - Starting job: default job
+ [titan] 2025-07-23 14:27:46,453 - root - INFO - {
+ "activation_checkpoint": {
+ "mode": "none",
+ "selective_ac_option": "2"
+ },
+ "activation_offload": {
+ "mode": "none"
+ },
+ "checkpoint": {
+ "async_mode": "disabled",
+ "create_seed_checkpoint": false,
+ "enable_checkpoint": true,
+ "exclude_from_loading": [],
+ "export_dtype": "float32",
+ "folder": "checkpoint",
+ "interval": 8192,
+ "interval_type": "steps",
+ "keep_latest_k": 100,
+ "load_step": -1,
+ "model_weights_only": false
+ },
+ "comm": {
+ "init_timeout_seconds": 300,
+ "trace_buf_size": 20000,
+ "train_timeout_seconds": 100
+ },
+ "experimental": {
+ "context_parallel_degree": 1,
+ "context_parallel_rotate_method": "allgather",
+ "custom_model_path": "",
+ "enable_async_tensor_parallel": false,
+ "enable_compiled_autograd": false,
+ "pipeline_parallel_degree": 1,
+ "pipeline_parallel_microbatches": null,
+ "pipeline_parallel_schedule": "1F1B",
+ "pipeline_parallel_schedule_csv": "",
+ "pipeline_parallel_split_points": []
+ },
+ "fault_tolerance": {
+ "enable": false,
+ "group_size": 0,
+ "min_replica_size": 1,
+ "replica_id": 0
+ },
+ "float8": {
+ "enable_fsdp_float8_all_gather": false,
+ "force_recompute_fp8_weight_in_bwd": false,
+ "precompute_float8_dynamic_scale_for_fsdp": false,
+ "recipe_name": null
+ },
+ "job": {
+ "config_file": "flame/models/fla.toml",
+ "description": "default job",
+ "dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2",
+ "print_args": true,
+ "use_for_integration_test": false
+ },
+ "lr_scheduler": {
+ "decay_ratio": 1.0,
+ "decay_type": "linear",
+ "lr_min": 0.01,
+ "warmup_steps": 100
+ },
+ "memory_estimation": {
+ "disable_fake_mode": false,
+ "enabled": false
+ },
+ "metrics": {
+ "disable_color_printing": false,
+ "enable_tensorboard": true,
+ "enable_wandb": true,
+ "log_freq": 1,
+ "save_for_all_ranks": false,
+ "save_tb_folder": "tb"
+ },
+ "model": {
+ "config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/mamba2_6_1_340M.json",
+ "converters": [],
+ "name": "fla",
+ "print_after_conversion": false,
+ "tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
+ },
+ "optimizer": {
+ "early_step_in_backward": false,
+ "eps": 1e-08,
+ "implementation": "fused",
+ "lr": 0.0003,
+ "name": "AdamW"
+ },
+ "profiling": {
+ "enable_memory_snapshot": false,
+ "enable_profiling": true,
+ "profile_freq": 512,
+ "save_memory_snapshot_folder": "memory_snapshot",
+ "save_traces_folder": "profile_trace"
+ },
+ "training": {
+ "batch_size": 8,
+ "compile": true,
+ "context_len": 8192,
+ "data_dir": null,
+ "data_files": null,
+ "data_parallel_replicate_degree": 1,
+ "data_parallel_shard_degree": -1,
+ "data_probs": "0.55,0.3,0.15",
+ "dataset": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro",
+ "dataset_name": "default,default,default",
+ "dataset_split": "train,train,train",
+ "deterministic": false,
+ "disable_loss_parallel": false,
+ "enable_cpu_offload": false,
+ "fsdp_reshard_after_forward": "default",
+ "gc_freq": 50,
+ "gradient_accumulation_steps": 2,
+ "max_norm": 1.0,
+ "mixed_precision_param": "bfloat16",
+ "mixed_precision_reduce": "float32",
+ "num_workers": 32,
+ "persistent_workers": false,
+ "pin_memory": false,
+ "prefetch_factor": 2,
+ "seed": 42,
+ "seq_len": 8192,
+ "skip_nan_inf": true,
+ "steps": 95366,
+ "streaming": true,
+ "tensor_parallel_degree": 1,
+ "varlen": false
+ }
+ }
+ [titan] 2025-07-23 14:27:46,453 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
+ [titan] 2025-07-23 14:27:47,428 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
+ [titan] 2025-07-23 14:27:47,431 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
+ [titan] 2025-07-23 14:27:47,487 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
+ [titan] 2025-07-23 14:27:47,487 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
+ [titan] 2025-07-23 14:27:47,487 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
+ [titan] 2025-07-23 14:27:47,494 - root - INFO - Loading tokenizer...
+ [titan] 2025-07-23 14:27:47,997 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
+ 0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ }
+ )
+ [titan] 2025-07-23 14:27:47,998 - root - INFO - Loading dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro:default,default,default
+ `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ [titan] 2025-07-23 14:27:47,998 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ [titan] 2025-07-23 14:27:48,492 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample:default (p = 0.550):
+ IterableDataset({
+ features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
+ num_shards: 140
+ })
+ [titan] 2025-07-23 14:27:48,492 - root - INFO - Shuffling the dataset with seed 42
+ [titan] 2025-07-23 14:27:48,492 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample has insufficient shards (140). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
+ `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ [titan] 2025-07-23 14:27:48,492 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ [titan] 2025-07-23 14:28:39,720 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ [titan] 2025-07-23 14:28:39,830 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged:default (p = 0.300):
+ IterableDataset({
+ features: ['repo', 'content'],
+ num_shards: 1
+ })
+ [titan] 2025-07-23 14:28:39,831 - root - INFO - Shuffling the dataset with seed 42
+ [titan] 2025-07-23 14:28:39,831 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged has insufficient shards (1). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
+ `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ [titan] 2025-07-23 14:28:39,831 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ [titan] 2025-07-23 14:28:40,087 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ [titan] 2025-07-23 14:28:40,312 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro:default (p = 0.150):
+ IterableDataset({
+ features: ['text', 'cc-path', 'domain', 'lang', 'lang_score', 'timestamp', 'url', 'math_score'],
+ num_shards: 100
+ })
+ [titan] 2025-07-23 14:28:40,313 - root - INFO - Shuffling the dataset with seed 42
+ [titan] 2025-07-23 14:28:40,313 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro has insufficient shards (100). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
+ `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ [titan] 2025-07-23 14:28:40,313 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ [titan] 2025-07-23 14:28:46,672 - root - INFO - Interleaving 3 datasets with probabilities [0.55, 0.3, 0.15]
+ [titan] 2025-07-23 14:28:47,386 - root - INFO - IterableDataset({
+ features: ['text', 'content'],
+ num_shards: 256
+ })
+ [titan] 2025-07-23 14:28:47,512 - root - INFO - Building dataloader...
+ [titan] 2025-07-23 14:28:47,515 - root - INFO - Loading model config from /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/mamba2_6_1_340M.json
+ [titan] 2025-07-23 14:28:47,516 - root - INFO - Building model from the config
+ Mamba2Config {
+ "architectures": [
+ "Mamba2ForCausalLM"
+ ],
+ "attn": {
+ "layers": [
+ 5,
+ 11,
+ 17,
+ 23
+ ],
+ "num_heads": 16,
+ "num_kv_heads": 8,
+ "qkv_bias": false,
+ "rope_theta": 160000.0,
+ "window_size": null
+ },
+ "attn_mode": "chunk",
+ "bos_token_id": 1,
+ "chunk_size": 256,
+ "conv_kernel": 4,
+ "eos_token_id": 2,
+ "expand": 2,
+ "fuse_cross_entropy": true,
+ "fuse_norm": true,
+ "fuse_swiglu": true,
+ "head_dim": 64,
+ "hidden_act": "silu",
+ "hidden_size": 1024,
+ "initializer_range": 0.02,
+ "model_type": "mamba2",
+ "n_groups": 1,
+ "norm_eps": 1e-05,
+ "num_heads": 32,
+ "num_hidden_layers": 48,
+ "pad_token_id": 0,
+ "rescale_prenorm_residual": true,
+ "residual_in_fp32": true,
+ "rms_norm": true,
+ "state_size": 128,
+ "tie_word_embeddings": false,
+ "time_step_floor": 0.0001,
+ "time_step_limit": [
+ 0.0,
+ Infinity
+ ],
+ "time_step_max": 0.1,
+ "time_step_min": 0.001,
+ "time_step_rank": 128,
+ "transformers_version": "4.53.3",
+ "use_bias": false,
+ "use_cache": true,
+ "use_conv_bias": true,
+ "use_l2warp": false,
+ "vocab_size": 32000
+ }
+
+ [titan] 2025-07-23 14:28:50,147 - fla.layers.mamba2 - WARNING - The fast path is not available because one of `(selective_state_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation
+ [titan] 2025-07-23 14:28:50,147 - fla.layers.mamba2 - WARNING - The CUDA backend is not available because `causal_conv1d` is None. Falling back to the Triton backend. To install follow https://github.com/Dao-AILab/causal-conv1d
+ [titan] 2025-07-23 14:28:50,265 - root - INFO -
+ Mamba2ForCausalLM(
+ (backbone): Mamba2Model(
+ (embeddings): Embedding(32000, 1024)
+ (layers): ModuleList(
+ (0-47): 48 x Mamba2Block(
+ (norm): RMSNorm(1024, eps=1e-05)
+ (mixer): Mamba2(
+ (conv1d): Conv1d(2304, 2304, kernel_size=(4,), stride=(1,), padding=(3,), groups=2304)
+ (in_proj): Linear(in_features=1024, out_features=4384, bias=False)
+ (norm): RMSNormGated()
+ (out_proj): Linear(in_features=2048, out_features=1024, bias=False)
+ )
+ )
+ )
+ (norm_f): RMSNorm(1024, eps=1e-05)
+ )
+ (lm_head): Linear(in_features=1024, out_features=32000, bias=False)
+ (criterion): FusedLinearCrossEntropyLoss()
+ )
+
+ [titan] 2025-07-23 14:28:50,317 - root - INFO - Compiling each block with torch.compile
+ [titan] 2025-07-23 14:28:50,317 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
+ [titan] 2025-07-23 14:28:50,317 - root - WARNING - No norm found in model
+ [titan] 2025-07-23 14:28:50,317 - root - INFO - Compiling the entire model with torch.compile
+ [titan] 2025-07-23 14:28:50,541 - root - INFO - Applied FSDP to the model
+ [titan] 2025-07-23 14:28:50,886 - fla.models.mamba2.modeling_mamba2 - WARNING - `A_log` is a DTensor, skipping initialization
+ [titan] 2025-07-23 14:28:51,042 - fla.models.mamba2.modeling_mamba2 - WARNING - `dt_bias` is a DTensor, skipping initialization
+ [titan] 2025-07-23 14:28:51,272 - root - INFO - CUDA memory usage for model: 0.19GiB(0.20%)
+ [titan] 2025-07-23 14:28:51,274 - root - WARNING - Warmup (100) + decay (95366) steps exceed total training steps (95366). Adjusting decay steps to 95266.
+ [titan] 2025-07-23 14:28:51,298 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/checkpoint
+ [titan] 2025-07-23 14:28:51,315 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
+ [titan] 2025-07-23 14:28:51,473 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
+ [titan] 2025-07-23 14:28:58,659 - root - INFO - ***** Running training *****
+ [titan] 2025-07-23 14:28:58,667 - root - INFO - Training starts at step 1
+ [titan] 2025-07-23 14:28:58,668 - root - INFO - Number of tokens per sequence = 8,192
+ [titan] 2025-07-23 14:28:58,668 - root - INFO - Gradient Accumulation steps = 2
+ [titan] 2025-07-23 14:28:58,668 - root - INFO - Instantaneous batch size (per device) = 8
+ [titan] 2025-07-23 14:28:58,668 - root - INFO - Global batch size (w. parallel, distributed & accumulation) = 128 (1,048,576 tokens)
+ [titan] 2025-07-23 14:28:58,668 - root - INFO - Total optimization steps = 95,366 (99,998,498,816 tokens)
+ [titan] 2025-07-23 14:28:58,668 - root - INFO - Warmup steps = 100 (104,857,600 tokens)
+ [titan] 2025-07-23 14:28:58,668 - root - INFO - Number of parameters = 382,387,712
+ [titan] 2025-07-23 14:28:58,669 - root - INFO - Profiling active. Traces will be saved at /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/profile_trace
+ /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/functions.py:1263: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
+ If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
+ If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
+ torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
+ [rank4]: Traceback (most recent call last):
+ [rank4]: File "<frozen runpy>", line 198, in _run_module_as_main
+ [rank4]: File "<frozen runpy>", line 88, in _run_code
+ [rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
+ [rank4]: main(config)
+ [rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
+ [rank4]: return f(*args, **kwargs)
+ [rank4]: ^^^^^^^^^^^^^^^^^^
+ [rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 487, in main
+ [rank4]: output = model(
+ [rank4]: ^^^^^^
+ [rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+ [rank4]: return self._call_impl(*args, **kwargs)
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1857, in _call_impl
+ [rank4]: return inner()
+ [rank4]: ^^^^^^^
+ [rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1805, in inner
+ [rank4]: result = forward_call(*args, **kwargs)
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func
+ [rank4]: return func(*args, **kwargs)
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^
+ [rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 526, in forward
+ [rank4]: outputs = self.backbone(
+ [rank4]: ^^^^^^^^^^^^^^
+ [rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+ [rank4]: return self._call_impl(*args, **kwargs)
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
+ [rank4]: return forward_call(*args, **kwargs)
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 405, in forward
+ [rank4]: hidden_states = mixer_block(
+ [rank4]: ^^^^^^^^^^^^
+ [rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+ [rank4]: return self._call_impl(*args, **kwargs)
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1857, in _call_impl
+ [rank4]: return inner()
+ [rank4]: ^^^^^^^
+ [rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1805, in inner
+ [rank4]: result = forward_call(*args, **kwargs)
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py", line 655, in _fn
+ [rank4]: return fn(*args, **kwargs)
+ [rank4]: ^^^^^^^^^^^^^^^^^^^
+ [rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+ [rank4]: return self._call_impl(*args, **kwargs)
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
+ [rank4]: return forward_call(*args, **kwargs)
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 161, in forward
+ [rank4]: hidden_states = self.norm(hidden_states)
+ [rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 165, in torch_dynamo_resume_in_forward_at_161
+ [rank4]: hidden_states = self.mixer(
+ [rank4]: ^^^^^^^^^^^
+ [rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+ [rank4]: return self._call_impl(*args, **kwargs)
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
+ [rank4]: return forward_call(*args, **kwargs)
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/mamba2.py", line 601, in forward
+ [rank4]: return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask)
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/mamba2.py", line 528, in torch_forward
+ [rank4]: G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :]
+ [rank4]: ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~
+ [rank4]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 GiB. GPU 4 has a total capacity of 95.00 GiB of which 85.09 GiB is free. Process 2003903 has 316.00 MiB memory in use. Process 696034 has 316.00 MiB memory in use. Process 1114697 has 9.27 GiB memory in use. Of the allocated memory 7.99 GiB is allocated by PyTorch, and 73.66 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
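The allocator hint quoted in the message is worth knowing, but it cannot rescue this run: a single 256 GiB request exceeds the H20's 95 GiB capacity outright, so the buffer itself has to shrink (fused kernels, a shorter context, or activation checkpointing, which is set to "none" in this job config). For reference, the setting only takes effect if it is in the environment before CUDA is initialized:

```python
import os

# Reduces fragmentation for workloads with many variable-sized allocations;
# must be set before the first CUDA call (or exported in the launch script).
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
```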
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/4/stdout.log ADDED
File without changes
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/5/error.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"message": {"message": "OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 GiB. GPU 5 has a total capacity of 95.00 GiB of which 85.09 GiB is free. Process 2003905 has 316.00 MiB memory in use. Process 696036 has 316.00 MiB memory in use. Process 1114698 has 9.27 GiB memory in use. Of the allocated memory 7.99 GiB is allocated by PyTorch, and 73.66 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)", "extraInfo": {"py_callstack": "Traceback (most recent call last):\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py\", line 355, in wrapper\n return f(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py\", line 487, in main\n output = model(\n ^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1857, in _call_impl\n return inner()\n ^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1805, in inner\n result = forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/utils/deprecation.py\", line 172, in wrapped_func\n return func(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py\", line 526, in forward\n outputs = self.backbone(\n ^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py\", line 405, in forward\n hidden_states = mixer_block(\n ^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1857, in _call_impl\n return inner()\n ^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1805, in inner\n result = forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py\", line 655, in _fn\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py\", line 161, in forward\n hidden_states = self.norm(hidden_states)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py\", line 165, in torch_dynamo_resume_in_forward_at_161\n hidden_states = self.mixer(\n ^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/mamba2.py\", line 601, in forward\n return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/mamba2.py\", line 528, in torch_forward\n G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :]\n ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~\ntorch.OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 GiB. GPU 5 has a total capacity of 95.00 GiB of which 85.09 GiB is free. Process 2003905 has 316.00 MiB memory in use. Process 696036 has 316.00 MiB memory in use. Process 1114698 has 9.27 GiB memory in use. Of the allocated memory 7.99 GiB is allocated by PyTorch, and 73.66 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)\n", "timestamp": "1753252283"}}}
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/5/stderr.log ADDED
@@ -0,0 +1,387 @@
1
+ [titan] 2025-07-23 14:27:46,350 - root - INFO - Starting job: default job
2
+ [titan] 2025-07-23 14:27:46,350 - root - INFO - {
3
+ "activation_checkpoint": {
4
+ "mode": "none",
5
+ "selective_ac_option": "2"
6
+ },
7
+ "activation_offload": {
8
+ "mode": "none"
9
+ },
10
+ "checkpoint": {
11
+ "async_mode": "disabled",
12
+ "create_seed_checkpoint": false,
13
+ "enable_checkpoint": true,
14
+ "exclude_from_loading": [],
15
+ "export_dtype": "float32",
16
+ "folder": "checkpoint",
17
+ "interval": 8192,
18
+ "interval_type": "steps",
19
+ "keep_latest_k": 100,
20
+ "load_step": -1,
21
+ "model_weights_only": false
22
+ },
23
+ "comm": {
24
+ "init_timeout_seconds": 300,
25
+ "trace_buf_size": 20000,
26
+ "train_timeout_seconds": 100
27
+ },
28
+ "experimental": {
29
+ "context_parallel_degree": 1,
30
+ "context_parallel_rotate_method": "allgather",
31
+ "custom_model_path": "",
32
+ "enable_async_tensor_parallel": false,
33
+ "enable_compiled_autograd": false,
34
+ "pipeline_parallel_degree": 1,
35
+ "pipeline_parallel_microbatches": null,
36
+ "pipeline_parallel_schedule": "1F1B",
37
+ "pipeline_parallel_schedule_csv": "",
38
+ "pipeline_parallel_split_points": []
39
+ },
40
+ "fault_tolerance": {
41
+ "enable": false,
42
+ "group_size": 0,
43
+ "min_replica_size": 1,
44
+ "replica_id": 0
45
+ },
46
+ "float8": {
47
+ "enable_fsdp_float8_all_gather": false,
48
+ "force_recompute_fp8_weight_in_bwd": false,
49
+ "precompute_float8_dynamic_scale_for_fsdp": false,
50
+ "recipe_name": null
51
+ },
52
+ "job": {
53
+ "config_file": "flame/models/fla.toml",
54
+ "description": "default job",
55
+ "dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2",
56
+ "print_args": true,
57
+ "use_for_integration_test": false
58
+ },
59
+ "lr_scheduler": {
60
+ "decay_ratio": 1.0,
61
+ "decay_type": "linear",
62
+ "lr_min": 0.01,
63
+ "warmup_steps": 100
64
+ },
65
+ "memory_estimation": {
66
+ "disable_fake_mode": false,
67
+ "enabled": false
68
+ },
69
+ "metrics": {
70
+ "disable_color_printing": false,
71
+ "enable_tensorboard": true,
72
+ "enable_wandb": true,
73
+ "log_freq": 1,
74
+ "save_for_all_ranks": false,
75
+ "save_tb_folder": "tb"
76
+ },
77
+ "model": {
78
+ "config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/mamba2_6_1_340M.json",
79
+ "converters": [],
80
+ "name": "fla",
81
+ "print_after_conversion": false,
82
+ "tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
83
+ },
84
+ "optimizer": {
85
+ "early_step_in_backward": false,
86
+ "eps": 1e-08,
87
+ "implementation": "fused",
88
+ "lr": 0.0003,
89
+ "name": "AdamW"
90
+ },
91
+ "profiling": {
92
+ "enable_memory_snapshot": false,
93
+ "enable_profiling": true,
94
+ "profile_freq": 512,
95
+ "save_memory_snapshot_folder": "memory_snapshot",
96
+ "save_traces_folder": "profile_trace"
97
+ },
98
+ "training": {
99
+ "batch_size": 8,
100
+ "compile": true,
101
+ "context_len": 8192,
102
+ "data_dir": null,
103
+ "data_files": null,
104
+ "data_parallel_replicate_degree": 1,
105
+ "data_parallel_shard_degree": -1,
106
+ "data_probs": "0.55,0.3,0.15",
107
+ "dataset": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro",
108
+ "dataset_name": "default,default,default",
109
+ "dataset_split": "train,train,train",
110
+ "deterministic": false,
111
+ "disable_loss_parallel": false,
112
+ "enable_cpu_offload": false,
113
+ "fsdp_reshard_after_forward": "default",
114
+ "gc_freq": 50,
115
+ "gradient_accumulation_steps": 2,
116
+ "max_norm": 1.0,
117
+ "mixed_precision_param": "bfloat16",
118
+ "mixed_precision_reduce": "float32",
119
+ "num_workers": 32,
120
+ "persistent_workers": false,
121
+ "pin_memory": false,
122
+ "prefetch_factor": 2,
123
+ "seed": 42,
124
+ "seq_len": 8192,
125
+ "skip_nan_inf": true,
126
+ "steps": 95366,
127
+ "streaming": true,
128
+ "tensor_parallel_degree": 1,
129
+ "varlen": false
130
+ }
131
+ }
132
+ [titan] 2025-07-23 14:27:46,350 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
133
+ [titan] 2025-07-23 14:27:47,258 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
134
+ [titan] 2025-07-23 14:27:47,260 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
135
+ [titan] 2025-07-23 14:27:47,324 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
136
+ [titan] 2025-07-23 14:27:47,324 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
137
+ [titan] 2025-07-23 14:27:47,324 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
138
+ [titan] 2025-07-23 14:27:47,411 - root - INFO - Loading tokenizer...
139
+ [titan] 2025-07-23 14:27:47,997 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
140
+ 0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
141
+ 1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
142
+ 2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
143
+ }
144
+ )
145
+ [titan] 2025-07-23 14:27:47,999 - root - INFO - Loading dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro:default,default,default
146
+ `trust_remote_code` is not supported anymore.
147
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
148
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
149
+ [titan] 2025-07-23 14:27:47,999 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
150
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
151
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
152
+ [titan] 2025-07-23 14:27:48,496 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample:default (p = 0.550):
153
+ IterableDataset({
154
+ features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
155
+ num_shards: 140
156
+ })
157
+ [titan] 2025-07-23 14:27:48,496 - root - INFO - Shuffling the dataset with seed 42
158
+ [titan] 2025-07-23 14:27:48,496 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample has insufficient shards (140). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
159
+ `trust_remote_code` is not supported anymore.
160
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
161
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
162
+ [titan] 2025-07-23 14:27:48,496 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
163
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
164
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
165
+ `trust_remote_code` is not supported anymore.
166
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged' isn't based on a loading script and remove `trust_remote_code`.
167
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
168
+ [titan] 2025-07-23 14:28:39,812 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
169
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged' isn't based on a loading script and remove `trust_remote_code`.
170
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
171
+ [titan] 2025-07-23 14:28:39,845 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged:default (p = 0.300):
172
+ IterableDataset({
173
+ features: ['repo', 'content'],
174
+ num_shards: 1
175
+ })
176
+ [titan] 2025-07-23 14:28:39,845 - root - INFO - Shuffling the dataset with seed 42
177
+ [titan] 2025-07-23 14:28:39,845 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged has insufficient shards (1). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
178
+ `trust_remote_code` is not supported anymore.
179
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged' isn't based on a loading script and remove `trust_remote_code`.
180
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
181
+ [titan] 2025-07-23 14:28:39,845 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
182
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged' isn't based on a loading script and remove `trust_remote_code`.
183
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
184
+ `trust_remote_code` is not supported anymore.
185
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro' isn't based on a loading script and remove `trust_remote_code`.
186
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
187
+ [titan] 2025-07-23 14:28:40,105 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
188
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro' isn't based on a loading script and remove `trust_remote_code`.
189
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
190
+ [titan] 2025-07-23 14:28:40,312 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro:default (p = 0.150):
191
+ IterableDataset({
192
+ features: ['text', 'cc-path', 'domain', 'lang', 'lang_score', 'timestamp', 'url', 'math_score'],
193
+ num_shards: 100
194
+ })
195
+ [titan] 2025-07-23 14:28:40,312 - root - INFO - Shuffling the dataset with seed 42
196
+ [titan] 2025-07-23 14:28:40,312 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro has insufficient shards (100). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
197
+ `trust_remote_code` is not supported anymore.
198
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro' isn't based on a loading script and remove `trust_remote_code`.
199
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
200
+ [titan] 2025-07-23 14:28:40,312 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
201
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro' isn't based on a loading script and remove `trust_remote_code`.
202
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
203
+ [titan] 2025-07-23 14:28:46,706 - root - INFO - Interleaving 3 datasets with probabilities [0.55, 0.3, 0.15]
204
+ [titan] 2025-07-23 14:28:47,415 - root - INFO - IterableDataset({
205
+ features: ['text', 'content'],
206
+ num_shards: 256
207
+ })
208
+ [titan] 2025-07-23 14:28:47,539 - root - INFO - Building dataloader...
209
+ [titan] 2025-07-23 14:28:47,541 - root - INFO - Loading model config from /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/mamba2_6_1_340M.json
210
+ [titan] 2025-07-23 14:28:47,543 - root - INFO - Building model from the config
211
+ Mamba2Config {
212
+ "architectures": [
213
+ "Mamba2ForCausalLM"
214
+ ],
215
+ "attn": {
216
+ "layers": [
217
+ 5,
218
+ 11,
219
+ 17,
220
+ 23
221
+ ],
222
+ "num_heads": 16,
223
+ "num_kv_heads": 8,
224
+ "qkv_bias": false,
225
+ "rope_theta": 160000.0,
226
+ "window_size": null
227
+ },
228
+ "attn_mode": "chunk",
229
+ "bos_token_id": 1,
230
+ "chunk_size": 256,
231
+ "conv_kernel": 4,
232
+ "eos_token_id": 2,
233
+ "expand": 2,
234
+ "fuse_cross_entropy": true,
235
+ "fuse_norm": true,
236
+ "fuse_swiglu": true,
237
+ "head_dim": 64,
238
+ "hidden_act": "silu",
239
+ "hidden_size": 1024,
240
+ "initializer_range": 0.02,
241
+ "model_type": "mamba2",
242
+ "n_groups": 1,
243
+ "norm_eps": 1e-05,
244
+ "num_heads": 32,
245
+ "num_hidden_layers": 48,
246
+ "pad_token_id": 0,
247
+ "rescale_prenorm_residual": true,
248
+ "residual_in_fp32": true,
249
+ "rms_norm": true,
250
+ "state_size": 128,
251
+ "tie_word_embeddings": false,
252
+ "time_step_floor": 0.0001,
253
+ "time_step_limit": [
254
+ 0.0,
255
+ Infinity
256
+ ],
257
+ "time_step_max": 0.1,
258
+ "time_step_min": 0.001,
259
+ "time_step_rank": 128,
260
+ "transformers_version": "4.53.3",
261
+ "use_bias": false,
262
+ "use_cache": true,
263
+ "use_conv_bias": true,
264
+ "use_l2warp": false,
265
+ "vocab_size": 32000
266
+ }
267
+ 
268
+ [titan] 2025-07-23 14:28:50,147 - fla.layers.mamba2 - WARNING - The fast path is not available because one of `(selective_state_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation
269
+ [titan] 2025-07-23 14:28:50,147 - fla.layers.mamba2 - WARNING - The CUDA backend is not available because `causal_conv1d` is None. Falling back to the Triton backend. To install follow https://github.com/Dao-AILab/causal-conv1d
270
+ [titan] 2025-07-23 14:28:50,264 - root - INFO - 
271
+ Mamba2ForCausalLM(
272
+ (backbone): Mamba2Model(
273
+ (embeddings): Embedding(32000, 1024)
274
+ (layers): ModuleList(
275
+ (0-47): 48 x Mamba2Block(
276
+ (norm): RMSNorm(1024, eps=1e-05)
277
+ (mixer): Mamba2(
278
+ (conv1d): Conv1d(2304, 2304, kernel_size=(4,), stride=(1,), padding=(3,), groups=2304)
279
+ (in_proj): Linear(in_features=1024, out_features=4384, bias=False)
280
+ (norm): RMSNormGated()
281
+ (out_proj): Linear(in_features=2048, out_features=1024, bias=False)
282
+ )
283
+ )
284
+ )
285
+ (norm_f): RMSNorm(1024, eps=1e-05)
286
+ )
287
+ (lm_head): Linear(in_features=1024, out_features=32000, bias=False)
288
+ (criterion): FusedLinearCrossEntropyLoss()
289
+ )
290
+
291
+ [titan] 2025-07-23 14:28:50,315 - root - INFO - Compiling each block with torch.compile
292
+ [titan] 2025-07-23 14:28:50,316 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
293
+ [titan] 2025-07-23 14:28:50,316 - root - WARNING - No norm found in model
294
+ [titan] 2025-07-23 14:28:50,316 - root - INFO - Compiling the entire model with torch.compile
295
+ [titan] 2025-07-23 14:28:50,539 - root - INFO - Applied FSDP to the model
296
+ [titan] 2025-07-23 14:28:50,884 - fla.models.mamba2.modeling_mamba2 - WARNING - `A_log` is a DTensor, skipping initialization
297
+ [titan] 2025-07-23 14:28:51,042 - fla.models.mamba2.modeling_mamba2 - WARNING - `dt_bias` is a DTensor, skipping initialization
298
+ [titan] 2025-07-23 14:28:51,273 - root - INFO - CUDA memory usage for model: 0.19GiB(0.20%)
299
+ [titan] 2025-07-23 14:28:51,275 - root - WARNING - Warmup (100) + decay (95366) steps exceed total training steps (95366). Adjusting decay steps to 95266.
300
+ [titan] 2025-07-23 14:28:51,299 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/checkpoint
301
+ [titan] 2025-07-23 14:28:51,333 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
302
+ [titan] 2025-07-23 14:28:51,479 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
303
+ [titan] 2025-07-23 14:28:59,100 - root - INFO - ***** Running training *****
304
+ [titan] 2025-07-23 14:28:59,101 - root - INFO -  Training starts at step 1
305
+ [titan] 2025-07-23 14:28:59,153 - root - INFO -  Number of tokens per sequence = 8,192
306
+ [titan] 2025-07-23 14:28:59,170 - root - INFO -  Gradient Accumulation steps = 2
307
+ [titan] 2025-07-23 14:28:59,171 - root - INFO -  Instantaneous batch size (per device) = 8
308
+ [titan] 2025-07-23 14:28:59,171 - root - INFO -  Global batch size (w. parallel, distributed & accumulation) = 128 (1,048,576 tokens)
309
+ [titan] 2025-07-23 14:28:59,172 - root - INFO -  Total optimization steps = 95,366 (99,998,498,816 tokens)
310
+ [titan] 2025-07-23 14:28:59,172 - root - INFO -  Warmup steps = 100 (104,857,600 tokens)
311
+ [titan] 2025-07-23 14:28:59,172 - root - INFO -  Number of parameters = 382,387,712 
312
+ [titan] 2025-07-23 14:28:59,173 - root - INFO - Profiling active. Traces will be saved at /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/profile_trace
313
+ /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/functions.py:1263: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
314
+ If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
315
+ If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
316
+ torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
317
+ [rank5]: Traceback (most recent call last):
318
+ [rank5]: File "<frozen runpy>", line 198, in _run_module_as_main
319
+ [rank5]: File "<frozen runpy>", line 88, in _run_code
320
+ [rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
321
+ [rank5]: main(config)
322
+ [rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
323
+ [rank5]: return f(*args, **kwargs)
324
+ [rank5]: ^^^^^^^^^^^^^^^^^^
325
+ [rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 487, in main
326
+ [rank5]: output = model(
327
+ [rank5]: ^^^^^^
328
+ [rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
329
+ [rank5]: return self._call_impl(*args, **kwargs)
330
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
331
+ [rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1857, in _call_impl
332
+ [rank5]: return inner()
333
+ [rank5]: ^^^^^^^
334
+ [rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1805, in inner
335
+ [rank5]: result = forward_call(*args, **kwargs)
336
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
337
+ [rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func
338
+ [rank5]: return func(*args, **kwargs)
339
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^
340
+ [rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 526, in forward
341
+ [rank5]: outputs = self.backbone(
342
+ [rank5]: ^^^^^^^^^^^^^^
343
+ [rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
344
+ [rank5]: return self._call_impl(*args, **kwargs)
345
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
346
+ [rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
347
+ [rank5]: return forward_call(*args, **kwargs)
348
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
349
+ [rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 405, in forward
350
+ [rank5]: hidden_states = mixer_block(
351
+ [rank5]: ^^^^^^^^^^^^
352
+ [rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
353
+ [rank5]: return self._call_impl(*args, **kwargs)
354
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
355
+ [rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1857, in _call_impl
356
+ [rank5]: return inner()
357
+ [rank5]: ^^^^^^^
358
+ [rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1805, in inner
359
+ [rank5]: result = forward_call(*args, **kwargs)
360
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
361
+ [rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py", line 655, in _fn
362
+ [rank5]: return fn(*args, **kwargs)
363
+ [rank5]: ^^^^^^^^^^^^^^^^^^^
364
+ [rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
365
+ [rank5]: return self._call_impl(*args, **kwargs)
366
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
367
+ [rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
368
+ [rank5]: return forward_call(*args, **kwargs)
369
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
370
+ [rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 161, in forward
371
+ [rank5]: hidden_states = self.norm(hidden_states)
372
+ [rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 165, in torch_dynamo_resume_in_forward_at_161
373
+ [rank5]: hidden_states = self.mixer(
374
+ [rank5]: ^^^^^^^^^^^
375
+ [rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
376
+ [rank5]: return self._call_impl(*args, **kwargs)
377
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
378
+ [rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
379
+ [rank5]: return forward_call(*args, **kwargs)
380
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
381
+ [rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/mamba2.py", line 601, in forward
382
+ [rank5]: return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask)
383
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
384
+ [rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/mamba2.py", line 528, in torch_forward
385
+ [rank5]: G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :]
386
+ [rank5]: ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~
387
+ [rank5]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 GiB. GPU 5 has a total capacity of 95.00 GiB of which 85.09 GiB is free. Process 2003905 has 316.00 MiB memory in use. Process 696036 has 316.00 MiB memory in use. Process 1114698 has 9.27 GiB memory in use. Of the allocated memory 7.99 GiB is allocated by PyTorch, and 73.66 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
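
Note on the traceback above: the earlier warnings in this log show that `selective_state_update` and `causal_conv1d` are not installed, so the Mamba2 layer falls back to the naive `torch_forward` path, where `G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :]` materializes the entire broadcast product as a single tensor. Below is a minimal sketch of estimating such an allocation before running it; the shapes are hypothetical placeholders (the real dimensions depend on the layer's batch, chunk, group, and state sizes), not values taken from this run.

import math
import torch

# Hypothetical shapes for illustration only; the actual values are derived
# from the Mamba2 layer config (batch, n_chunks, chunk_len, n_groups, state_size).
C = torch.empty(8, 32, 256, 1, 128)
B = torch.empty(8, 32, 256, 1, 128)

a = C[:, :, :, None, :, :]  # None adds a broadcast dim; still a view, no copy yet
b = B[:, :, None, :, :, :]
shape = torch.broadcast_shapes(a.shape, b.shape)  # (8, 32, 256, 256, 1, 128)
gib = math.prod(shape) * 4 / 2**30  # the multiply materializes all of this at once (fp32)
print(shape, f"{gib:.2f} GiB")

Even these toy shapes imply an 8 GiB intermediate; the shapes in this run produced the 256.00 GiB request logged above. Installing the fused kernels linked in the warnings (state-spaces/mamba and Dao-AILab/causal-conv1d) avoids materializing this intermediate entirely, whereas lowering batch_size only shrinks it proportionally.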
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/5/stdout.log ADDED
File without changes
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/6/error.json ADDED
@@ -0,0 +1 @@
1
+ {"message": {"message": "OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 GiB. GPU 6 has a total capacity of 95.00 GiB of which 85.09 GiB is free. Process 2003904 has 316.00 MiB memory in use. Process 696035 has 316.00 MiB memory in use. Process 1114699 has 9.27 GiB memory in use. Of the allocated memory 7.99 GiB is allocated by PyTorch, and 73.66 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)", "extraInfo": {"py_callstack": "Traceback (most recent call last):\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py\", line 355, in wrapper\n return f(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py\", line 487, in main\n output = model(\n ^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1857, in _call_impl\n return inner()\n ^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1805, in inner\n result = forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/utils/deprecation.py\", line 172, in wrapped_func\n return func(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py\", line 526, in forward\n outputs = self.backbone(\n ^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py\", line 405, in forward\n hidden_states = mixer_block(\n ^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1857, in _call_impl\n return inner()\n ^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1805, in inner\n result = forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py\", line 655, in _fn\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py\", line 161, in forward\n hidden_states = self.norm(hidden_states)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py\", line 165, in torch_dynamo_resume_in_forward_at_161\n hidden_states = self.mixer(\n ^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/mamba2.py\", line 601, in forward\n return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/mamba2.py\", line 528, in torch_forward\n G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :]\n ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~\ntorch.OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 GiB. GPU 6 has a total capacity of 95.00 GiB of which 85.09 GiB is free. Process 2003904 has 316.00 MiB memory in use. Process 696035 has 316.00 MiB memory in use. Process 1114699 has 9.27 GiB memory in use. Of the allocated memory 7.99 GiB is allocated by PyTorch, and 73.66 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)\n", "timestamp": "1753252283"}}}
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/6/stderr.log ADDED
@@ -0,0 +1,387 @@
1
+ [titan] 2025-07-23 14:27:46,733 - root - INFO - Starting job: default job
2
+ [titan] 2025-07-23 14:27:46,734 - root - INFO - {
3
+ "activation_checkpoint": {
4
+ "mode": "none",
5
+ "selective_ac_option": "2"
6
+ },
7
+ "activation_offload": {
8
+ "mode": "none"
9
+ },
10
+ "checkpoint": {
11
+ "async_mode": "disabled",
12
+ "create_seed_checkpoint": false,
13
+ "enable_checkpoint": true,
14
+ "exclude_from_loading": [],
15
+ "export_dtype": "float32",
16
+ "folder": "checkpoint",
17
+ "interval": 8192,
18
+ "interval_type": "steps",
19
+ "keep_latest_k": 100,
20
+ "load_step": -1,
21
+ "model_weights_only": false
22
+ },
23
+ "comm": {
24
+ "init_timeout_seconds": 300,
25
+ "trace_buf_size": 20000,
26
+ "train_timeout_seconds": 100
27
+ },
28
+ "experimental": {
29
+ "context_parallel_degree": 1,
30
+ "context_parallel_rotate_method": "allgather",
31
+ "custom_model_path": "",
32
+ "enable_async_tensor_parallel": false,
33
+ "enable_compiled_autograd": false,
34
+ "pipeline_parallel_degree": 1,
35
+ "pipeline_parallel_microbatches": null,
36
+ "pipeline_parallel_schedule": "1F1B",
37
+ "pipeline_parallel_schedule_csv": "",
38
+ "pipeline_parallel_split_points": []
39
+ },
40
+ "fault_tolerance": {
41
+ "enable": false,
42
+ "group_size": 0,
43
+ "min_replica_size": 1,
44
+ "replica_id": 0
45
+ },
46
+ "float8": {
47
+ "enable_fsdp_float8_all_gather": false,
48
+ "force_recompute_fp8_weight_in_bwd": false,
49
+ "precompute_float8_dynamic_scale_for_fsdp": false,
50
+ "recipe_name": null
51
+ },
52
+ "job": {
53
+ "config_file": "flame/models/fla.toml",
54
+ "description": "default job",
55
+ "dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2",
56
+ "print_args": true,
57
+ "use_for_integration_test": false
58
+ },
59
+ "lr_scheduler": {
60
+ "decay_ratio": 1.0,
61
+ "decay_type": "linear",
62
+ "lr_min": 0.01,
63
+ "warmup_steps": 100
64
+ },
65
+ "memory_estimation": {
66
+ "disable_fake_mode": false,
67
+ "enabled": false
68
+ },
69
+ "metrics": {
70
+ "disable_color_printing": false,
71
+ "enable_tensorboard": true,
72
+ "enable_wandb": true,
73
+ "log_freq": 1,
74
+ "save_for_all_ranks": false,
75
+ "save_tb_folder": "tb"
76
+ },
77
+ "model": {
78
+ "config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/mamba2_6_1_340M.json",
79
+ "converters": [],
80
+ "name": "fla",
81
+ "print_after_conversion": false,
82
+ "tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
83
+ },
84
+ "optimizer": {
85
+ "early_step_in_backward": false,
86
+ "eps": 1e-08,
87
+ "implementation": "fused",
88
+ "lr": 0.0003,
89
+ "name": "AdamW"
90
+ },
91
+ "profiling": {
92
+ "enable_memory_snapshot": false,
93
+ "enable_profiling": true,
94
+ "profile_freq": 512,
95
+ "save_memory_snapshot_folder": "memory_snapshot",
96
+ "save_traces_folder": "profile_trace"
97
+ },
98
+ "training": {
99
+ "batch_size": 8,
100
+ "compile": true,
101
+ "context_len": 8192,
102
+ "data_dir": null,
103
+ "data_files": null,
104
+ "data_parallel_replicate_degree": 1,
105
+ "data_parallel_shard_degree": -1,
106
+ "data_probs": "0.55,0.3,0.15",
107
+ "dataset": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro",
108
+ "dataset_name": "default,default,default",
109
+ "dataset_split": "train,train,train",
110
+ "deterministic": false,
111
+ "disable_loss_parallel": false,
112
+ "enable_cpu_offload": false,
113
+ "fsdp_reshard_after_forward": "default",
114
+ "gc_freq": 50,
115
+ "gradient_accumulation_steps": 2,
116
+ "max_norm": 1.0,
117
+ "mixed_precision_param": "bfloat16",
118
+ "mixed_precision_reduce": "float32",
119
+ "num_workers": 32,
120
+ "persistent_workers": false,
121
+ "pin_memory": false,
122
+ "prefetch_factor": 2,
123
+ "seed": 42,
124
+ "seq_len": 8192,
125
+ "skip_nan_inf": true,
126
+ "steps": 95366,
127
+ "streaming": true,
128
+ "tensor_parallel_degree": 1,
129
+ "varlen": false
130
+ }
131
+ }
132
+ [titan] 2025-07-23 14:27:46,734 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
133
+ [titan] 2025-07-23 14:27:47,466 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
134
+ [titan] 2025-07-23 14:27:47,468 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
135
+ [titan] 2025-07-23 14:27:47,519 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
136
+ [titan] 2025-07-23 14:27:47,519 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
137
+ [titan] 2025-07-23 14:27:47,519 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
138
+ [titan] 2025-07-23 14:27:47,528 - root - INFO - Loading tokenizer...
139
+ [titan] 2025-07-23 14:27:47,997 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
140
+ 0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
141
+ 1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
142
+ 2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
143
+ }
144
+ )
145
+ [titan] 2025-07-23 14:27:47,998 - root - INFO - Loading dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro:default,default,default
146
+ `trust_remote_code` is not supported anymore.
147
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
148
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
149
+ [titan] 2025-07-23 14:27:47,998 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
150
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
151
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
152
+ [titan] 2025-07-23 14:27:48,495 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample:default (p = 0.550):
153
+ IterableDataset({
154
+ features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
155
+ num_shards: 140
156
+ })
157
+ [titan] 2025-07-23 14:27:48,495 - root - INFO - Shuffling the dataset with seed 42
158
+ [titan] 2025-07-23 14:27:48,495 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample has insufficient shards (140). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
159
+ `trust_remote_code` is not supported anymore.
160
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
161
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
162
+ [titan] 2025-07-23 14:27:48,495 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
163
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
164
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
165
+ `trust_remote_code` is not supported anymore.
166
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged' isn't based on a loading script and remove `trust_remote_code`.
167
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
168
+ [titan] 2025-07-23 14:28:39,968 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
169
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged' isn't based on a loading script and remove `trust_remote_code`.
170
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
171
+ [titan] 2025-07-23 14:28:40,002 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged:default (p = 0.300):
172
+ IterableDataset({
173
+ features: ['repo', 'content'],
174
+ num_shards: 1
175
+ })
176
+ [titan] 2025-07-23 14:28:40,002 - root - INFO - Shuffling the dataset with seed 42
177
+ [titan] 2025-07-23 14:28:40,002 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged has insufficient shards (1). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
178
+ `trust_remote_code` is not supported anymore.
179
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged' isn't based on a loading script and remove `trust_remote_code`.
180
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
181
+ [titan] 2025-07-23 14:28:40,002 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
182
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged' isn't based on a loading script and remove `trust_remote_code`.
183
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
184
+ `trust_remote_code` is not supported anymore.
185
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro' isn't based on a loading script and remove `trust_remote_code`.
186
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
187
+ [titan] 2025-07-23 14:28:40,261 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
188
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro' isn't based on a loading script and remove `trust_remote_code`.
189
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
190
+ [titan] 2025-07-23 14:28:40,356 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro:default (p = 0.150):
191
+ IterableDataset({
192
+ features: ['text', 'cc-path', 'domain', 'lang', 'lang_score', 'timestamp', 'url', 'math_score'],
193
+ num_shards: 100
194
+ })
195
+ [titan] 2025-07-23 14:28:40,357 - root - INFO - Shuffling the dataset with seed 42
196
+ [titan] 2025-07-23 14:28:40,357 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro has insufficient shards (100). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
197
+ `trust_remote_code` is not supported anymore.
198
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro' isn't based on a loading script and remove `trust_remote_code`.
199
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
200
+ [titan] 2025-07-23 14:28:40,357 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
201
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro' isn't based on a loading script and remove `trust_remote_code`.
202
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
203
+ [titan] 2025-07-23 14:28:46,633 - root - INFO - Interleaving 3 datasets with probabilities [0.55, 0.3, 0.15]
204
+ [titan] 2025-07-23 14:28:47,337 - root - INFO - IterableDataset({
205
+ features: ['text', 'content'],
206
+ num_shards: 256
207
+ })
208
+ [titan] 2025-07-23 14:28:47,461 - root - INFO - Building dataloader...
209
+ [titan] 2025-07-23 14:28:47,463 - root - INFO - Loading model config from /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/mamba2_6_1_340M.json
210
+ [titan] 2025-07-23 14:28:47,465 - root - INFO - Building model from the config
+ Mamba2Config {
+   "architectures": [
+     "Mamba2ForCausalLM"
+   ],
+   "attn": {
+     "layers": [
+       5,
+       11,
+       17,
+       23
+     ],
+     "num_heads": 16,
+     "num_kv_heads": 8,
+     "qkv_bias": false,
+     "rope_theta": 160000.0,
+     "window_size": null
+   },
+   "attn_mode": "chunk",
+   "bos_token_id": 1,
+   "chunk_size": 256,
+   "conv_kernel": 4,
+   "eos_token_id": 2,
+   "expand": 2,
+   "fuse_cross_entropy": true,
+   "fuse_norm": true,
+   "fuse_swiglu": true,
+   "head_dim": 64,
+   "hidden_act": "silu",
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "model_type": "mamba2",
+   "n_groups": 1,
+   "norm_eps": 1e-05,
+   "num_heads": 32,
+   "num_hidden_layers": 48,
+   "pad_token_id": 0,
+   "rescale_prenorm_residual": true,
+   "residual_in_fp32": true,
+   "rms_norm": true,
+   "state_size": 128,
+   "tie_word_embeddings": false,
+   "time_step_floor": 0.0001,
+   "time_step_limit": [
+     0.0,
+     Infinity
+   ],
+   "time_step_max": 0.1,
+   "time_step_min": 0.001,
+   "time_step_rank": 128,
+   "transformers_version": "4.53.3",
+   "use_bias": false,
+   "use_cache": true,
+   "use_conv_bias": true,
+   "use_l2warp": false,
+   "vocab_size": 32000
+ }
+ 
268
+ [titan] 2025-07-23 14:28:50,147 - fla.layers.mamba2 - WARNING - The fast path is not available because one of `(selective_state_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation
269
+ [titan] 2025-07-23 14:28:50,147 - fla.layers.mamba2 - WARNING - The CUDA backend is not available because `causal_conv1d` is None. Falling back to the Triton backend. To install follow https://github.com/Dao-AILab/causal-conv1d
270
+ [titan] 2025-07-23 14:28:50,263 - root - INFO - 
+ Mamba2ForCausalLM(
+   (backbone): Mamba2Model(
+     (embeddings): Embedding(32000, 1024)
+     (layers): ModuleList(
+       (0-47): 48 x Mamba2Block(
+         (norm): RMSNorm(1024, eps=1e-05)
+         (mixer): Mamba2(
+           (conv1d): Conv1d(2304, 2304, kernel_size=(4,), stride=(1,), padding=(3,), groups=2304)
+           (in_proj): Linear(in_features=1024, out_features=4384, bias=False)
+           (norm): RMSNormGated()
+           (out_proj): Linear(in_features=2048, out_features=1024, bias=False)
+         )
+       )
+     )
+     (norm_f): RMSNorm(1024, eps=1e-05)
+   )
+   (lm_head): Linear(in_features=1024, out_features=32000, bias=False)
+   (criterion): FusedLinearCrossEntropyLoss()
+ )
+
291
+ [titan] 2025-07-23 14:28:50,315 - root - INFO - Compiling each block with torch.compile
292
+ [titan] 2025-07-23 14:28:50,315 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
293
+ [titan] 2025-07-23 14:28:50,316 - root - WARNING - No norm found in model
294
+ [titan] 2025-07-23 14:28:50,316 - root - INFO - Compiling the entire model with torch.compile
295
+ [titan] 2025-07-23 14:28:50,542 - root - INFO - Applied FSDP to the model
296
+ [titan] 2025-07-23 14:28:50,885 - fla.models.mamba2.modeling_mamba2 - WARNING - `A_log` is a DTensor, skipping initialization
297
+ [titan] 2025-07-23 14:28:51,042 - fla.models.mamba2.modeling_mamba2 - WARNING - `dt_bias` is a DTensor, skipping initialization
298
+ [titan] 2025-07-23 14:28:51,273 - root - INFO - CUDA memory usage for model: 0.19GiB(0.20%)
299
+ [titan] 2025-07-23 14:28:51,274 - root - WARNING - Warmup (100) + decay (95366) steps exceed total training steps (95366). Adjusting decay steps to 95266.
300
+ [titan] 2025-07-23 14:28:51,299 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/checkpoint
301
+ [titan] 2025-07-23 14:28:51,320 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
302
+ [titan] 2025-07-23 14:28:51,477 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
303
+ [titan] 2025-07-23 14:28:58,659 - root - INFO - ***** Running training *****
304
+ [titan] 2025-07-23 14:28:58,665 - root - INFO -  Training starts at step 1
305
+ [titan] 2025-07-23 14:28:58,665 - root - INFO -  Number of tokens per sequence = 8,192
306
+ [titan] 2025-07-23 14:28:58,667 - root - INFO -  Gradient Accumulation steps = 2
307
+ [titan] 2025-07-23 14:28:58,669 - root - INFO -  Instantaneous batch size (per device) = 8
308
+ [titan] 2025-07-23 14:28:58,670 - root - INFO -  Global batch size (w. parallel, distributed & accumulation) = 128 (1,048,576 tokens)
309
+ [titan] 2025-07-23 14:28:58,670 - root - INFO -  Total optimization steps = 95,366 (99,998,498,816 tokens)
310
+ [titan] 2025-07-23 14:28:58,670 - root - INFO -  Warmup steps = 100 (104,857,600 tokens)
311
+ [titan] 2025-07-23 14:28:58,670 - root - INFO -  Number of parameters = 382,387,712 
312
+ [titan] 2025-07-23 14:28:58,670 - root - INFO - Profiling active. Traces will be saved at /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/profile_trace
313
+ /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/functions.py:1263: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
314
+ If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
315
+ If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
316
+ torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
317
+ [rank6]: Traceback (most recent call last):
318
+ [rank6]: File "<frozen runpy>", line 198, in _run_module_as_main
319
+ [rank6]: File "<frozen runpy>", line 88, in _run_code
320
+ [rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
321
+ [rank6]: main(config)
322
+ [rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
323
+ [rank6]: return f(*args, **kwargs)
324
+ [rank6]: ^^^^^^^^^^^^^^^^^^
325
+ [rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 487, in main
326
+ [rank6]: output = model(
327
+ [rank6]: ^^^^^^
328
+ [rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
329
+ [rank6]: return self._call_impl(*args, **kwargs)
330
+ [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
331
+ [rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1857, in _call_impl
332
+ [rank6]: return inner()
333
+ [rank6]: ^^^^^^^
334
+ [rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1805, in inner
335
+ [rank6]: result = forward_call(*args, **kwargs)
336
+ [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
337
+ [rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func
338
+ [rank6]: return func(*args, **kwargs)
339
+ [rank6]: ^^^^^^^^^^^^^^^^^^^^^
340
+ [rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 526, in forward
341
+ [rank6]: outputs = self.backbone(
342
+ [rank6]: ^^^^^^^^^^^^^^
343
+ [rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
344
+ [rank6]: return self._call_impl(*args, **kwargs)
345
+ [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
346
+ [rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
347
+ [rank6]: return forward_call(*args, **kwargs)
348
+ [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
349
+ [rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 405, in forward
350
+ [rank6]: hidden_states = mixer_block(
351
+ [rank6]: ^^^^^^^^^^^^
352
+ [rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
353
+ [rank6]: return self._call_impl(*args, **kwargs)
354
+ [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
355
+ [rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1857, in _call_impl
356
+ [rank6]: return inner()
357
+ [rank6]: ^^^^^^^
358
+ [rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1805, in inner
359
+ [rank6]: result = forward_call(*args, **kwargs)
360
+ [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
361
+ [rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py", line 655, in _fn
362
+ [rank6]: return fn(*args, **kwargs)
363
+ [rank6]: ^^^^^^^^^^^^^^^^^^^
364
+ [rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
365
+ [rank6]: return self._call_impl(*args, **kwargs)
366
+ [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
367
+ [rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
368
+ [rank6]: return forward_call(*args, **kwargs)
369
+ [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
370
+ [rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 161, in forward
371
+ [rank6]: hidden_states = self.norm(hidden_states)
372
+ [rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 165, in torch_dynamo_resume_in_forward_at_161
373
+ [rank6]: hidden_states = self.mixer(
374
+ [rank6]: ^^^^^^^^^^^
375
+ [rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
376
+ [rank6]: return self._call_impl(*args, **kwargs)
377
+ [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
378
+ [rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
379
+ [rank6]: return forward_call(*args, **kwargs)
380
+ [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
381
+ [rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/mamba2.py", line 601, in forward
382
+ [rank6]: return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask)
383
+ [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
384
+ [rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/mamba2.py", line 528, in torch_forward
385
+ [rank6]: G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :]
386
+ [rank6]: ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~
387
+ [rank6]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 GiB. GPU 6 has a total capacity of 95.00 GiB of which 85.09 GiB is free. Process 2003904 has 316.00 MiB memory in use. Process 696035 has 316.00 MiB memory in use. Process 1114699 has 9.27 GiB memory in use. Of the allocated memory 7.99 GiB is allocated by PyTorch, and 73.66 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
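The two fla.layers.mamba2 warnings earlier in this log point at the likely root cause of the failure above: with the fused kernels absent, the layer falls back to the naive torch_forward, whose chunked outer product G_intermediate is what requests 256.00 GiB. A back-of-the-envelope sketch, assuming a float32 intermediate and the shapes from the job config and Mamba2Config printed above, reproduces the reported allocation:

# Size of the naive-path intermediate
#   G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :]
# Shapes are taken from the configs above; float32 (4 bytes per element)
# is an assumption that happens to match the reported number exactly.
batch, seq_len, chunk = 8, 8192, 256   # batch_size / seq_len / chunk_size
heads, state = 32, 128                 # num_heads / state_size
n_chunks = seq_len // chunk            # 32 chunks of 256 tokens each
elems = batch * n_chunks * chunk * chunk * heads * state
print(f"{elems * 4 / 2**30:.2f} GiB")  # -> 256.00 GiB, the failed allocation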
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/6/stdout.log ADDED
File without changes
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/7/error.json ADDED
@@ -0,0 +1 @@
+ {"message": {"message": "OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 GiB. GPU 7 has a total capacity of 95.00 GiB of which 85.09 GiB is free. Process 2003906 has 316.00 MiB memory in use. Process 696037 has 316.00 MiB memory in use. Process 1114702 has 9.27 GiB memory in use. Of the allocated memory 7.99 GiB is allocated by PyTorch, and 73.66 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)", "extraInfo": {"py_callstack": "Traceback (most recent call last):\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py\", line 355, in wrapper\n return f(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py\", line 487, in main\n output = model(\n ^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1857, in _call_impl\n return inner()\n ^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1805, in inner\n result = forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/utils/deprecation.py\", line 172, in wrapped_func\n return func(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py\", line 526, in forward\n outputs = self.backbone(\n ^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py\", line 405, in forward\n hidden_states = mixer_block(\n ^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, 
**kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1857, in _call_impl\n return inner()\n ^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1805, in inner\n result = forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py\", line 655, in _fn\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py\", line 161, in forward\n hidden_states = self.norm(hidden_states)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py\", line 165, in torch_dynamo_resume_in_forward_at_161\n hidden_states = self.mixer(\n ^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/mamba2.py\", line 601, in forward\n return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/mamba2.py\", line 528, in torch_forward\n G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :]\n ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~\ntorch.OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 GiB. GPU 7 has a total capacity of 95.00 GiB of which 85.09 GiB is free. Process 2003906 has 316.00 MiB memory in use. Process 696037 has 316.00 MiB memory in use. Process 1114702 has 9.27 GiB memory in use. Of the allocated memory 7.99 GiB is allocated by PyTorch, and 73.66 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)\n", "timestamp": "1753252283"}}}
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/7/stderr.log ADDED
@@ -0,0 +1,387 @@
+ [titan] 2025-07-23 14:27:46,256 - root - INFO - Starting job: default job
+ [titan] 2025-07-23 14:27:46,256 - root - INFO - {
+   "activation_checkpoint": {
+     "mode": "none",
+     "selective_ac_option": "2"
+   },
+   "activation_offload": {
+     "mode": "none"
+   },
+   "checkpoint": {
+     "async_mode": "disabled",
+     "create_seed_checkpoint": false,
+     "enable_checkpoint": true,
+     "exclude_from_loading": [],
+     "export_dtype": "float32",
+     "folder": "checkpoint",
+     "interval": 8192,
+     "interval_type": "steps",
+     "keep_latest_k": 100,
+     "load_step": -1,
+     "model_weights_only": false
+   },
+   "comm": {
+     "init_timeout_seconds": 300,
+     "trace_buf_size": 20000,
+     "train_timeout_seconds": 100
+   },
+   "experimental": {
+     "context_parallel_degree": 1,
+     "context_parallel_rotate_method": "allgather",
+     "custom_model_path": "",
+     "enable_async_tensor_parallel": false,
+     "enable_compiled_autograd": false,
+     "pipeline_parallel_degree": 1,
+     "pipeline_parallel_microbatches": null,
+     "pipeline_parallel_schedule": "1F1B",
+     "pipeline_parallel_schedule_csv": "",
+     "pipeline_parallel_split_points": []
+   },
+   "fault_tolerance": {
+     "enable": false,
+     "group_size": 0,
+     "min_replica_size": 1,
+     "replica_id": 0
+   },
+   "float8": {
+     "enable_fsdp_float8_all_gather": false,
+     "force_recompute_fp8_weight_in_bwd": false,
+     "precompute_float8_dynamic_scale_for_fsdp": false,
+     "recipe_name": null
+   },
+   "job": {
+     "config_file": "flame/models/fla.toml",
+     "description": "default job",
+     "dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2",
+     "print_args": true,
+     "use_for_integration_test": false
+   },
+   "lr_scheduler": {
+     "decay_ratio": 1.0,
+     "decay_type": "linear",
+     "lr_min": 0.01,
+     "warmup_steps": 100
+   },
+   "memory_estimation": {
+     "disable_fake_mode": false,
+     "enabled": false
+   },
+   "metrics": {
+     "disable_color_printing": false,
+     "enable_tensorboard": true,
+     "enable_wandb": true,
+     "log_freq": 1,
+     "save_for_all_ranks": false,
+     "save_tb_folder": "tb"
+   },
+   "model": {
+     "config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/mamba2_6_1_340M.json",
+     "converters": [],
+     "name": "fla",
+     "print_after_conversion": false,
+     "tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
+   },
+   "optimizer": {
+     "early_step_in_backward": false,
+     "eps": 1e-08,
+     "implementation": "fused",
+     "lr": 0.0003,
+     "name": "AdamW"
+   },
+   "profiling": {
+     "enable_memory_snapshot": false,
+     "enable_profiling": true,
+     "profile_freq": 512,
+     "save_memory_snapshot_folder": "memory_snapshot",
+     "save_traces_folder": "profile_trace"
+   },
+   "training": {
+     "batch_size": 8,
+     "compile": true,
+     "context_len": 8192,
+     "data_dir": null,
+     "data_files": null,
+     "data_parallel_replicate_degree": 1,
+     "data_parallel_shard_degree": -1,
+     "data_probs": "0.55,0.3,0.15",
+     "dataset": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro",
+     "dataset_name": "default,default,default",
+     "dataset_split": "train,train,train",
+     "deterministic": false,
+     "disable_loss_parallel": false,
+     "enable_cpu_offload": false,
+     "fsdp_reshard_after_forward": "default",
+     "gc_freq": 50,
+     "gradient_accumulation_steps": 2,
+     "max_norm": 1.0,
+     "mixed_precision_param": "bfloat16",
+     "mixed_precision_reduce": "float32",
+     "num_workers": 32,
+     "persistent_workers": false,
+     "pin_memory": false,
+     "prefetch_factor": 2,
+     "seed": 42,
+     "seq_len": 8192,
+     "skip_nan_inf": true,
+     "steps": 95366,
+     "streaming": true,
+     "tensor_parallel_degree": 1,
+     "varlen": false
+   }
+ }
+ [titan] 2025-07-23 14:27:46,257 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
133
+ [titan] 2025-07-23 14:27:47,217 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
134
+ [titan] 2025-07-23 14:27:47,220 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
135
+ [titan] 2025-07-23 14:27:47,268 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
136
+ [titan] 2025-07-23 14:27:47,268 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
137
+ [titan] 2025-07-23 14:27:47,268 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
138
+ [titan] 2025-07-23 14:27:47,411 - root - INFO - Loading tokenizer...
139
+ [titan] 2025-07-23 14:27:47,997 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
140
+ 0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
141
+ 1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
142
+ 2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
143
+ }
144
+ )
145
+ [titan] 2025-07-23 14:27:47,998 - root - INFO - Loading dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro:default,default,default
146
+ `trust_remote_code` is not supported anymore.
147
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
148
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
149
+ [titan] 2025-07-23 14:27:47,998 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
150
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
151
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
152
+ [titan] 2025-07-23 14:27:48,594 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample:default (p = 0.550):
153
+ IterableDataset({
154
+ features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
155
+ num_shards: 140
156
+ })
157
+ [titan] 2025-07-23 14:27:48,594 - root - INFO - Shuffling the dataset with seed 42
158
+ [titan] 2025-07-23 14:27:48,594 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample has insufficient shards (140). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
159
+ `trust_remote_code` is not supported anymore.
160
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
161
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
162
+ [titan] 2025-07-23 14:27:48,594 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
163
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
164
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
165
+ `trust_remote_code` is not supported anymore.
166
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged' isn't based on a loading script and remove `trust_remote_code`.
167
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
168
+ [titan] 2025-07-23 14:28:40,263 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
169
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged' isn't based on a loading script and remove `trust_remote_code`.
170
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
171
+ [titan] 2025-07-23 14:28:40,297 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged:default (p = 0.300):
172
+ IterableDataset({
173
+ features: ['repo', 'content'],
174
+ num_shards: 1
175
+ })
176
+ [titan] 2025-07-23 14:28:40,298 - root - INFO - Shuffling the dataset with seed 42
177
+ [titan] 2025-07-23 14:28:40,298 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged has insufficient shards (1). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
178
+ `trust_remote_code` is not supported anymore.
179
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged' isn't based on a loading script and remove `trust_remote_code`.
180
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
181
+ [titan] 2025-07-23 14:28:40,298 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
182
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged' isn't based on a loading script and remove `trust_remote_code`.
183
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
184
+ `trust_remote_code` is not supported anymore.
185
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro' isn't based on a loading script and remove `trust_remote_code`.
186
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
187
+ [titan] 2025-07-23 14:28:40,563 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
188
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro' isn't based on a loading script and remove `trust_remote_code`.
189
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
190
+ [titan] 2025-07-23 14:28:40,649 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro:default (p = 0.150):
191
+ IterableDataset({
192
+ features: ['text', 'cc-path', 'domain', 'lang', 'lang_score', 'timestamp', 'url', 'math_score'],
193
+ num_shards: 100
194
+ })
195
+ [titan] 2025-07-23 14:28:40,649 - root - INFO - Shuffling the dataset with seed 42
196
+ [titan] 2025-07-23 14:28:40,649 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro has insufficient shards (100). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
197
+ `trust_remote_code` is not supported anymore.
198
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro' isn't based on a loading script and remove `trust_remote_code`.
199
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
200
+ [titan] 2025-07-23 14:28:40,649 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
201
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro' isn't based on a loading script and remove `trust_remote_code`.
202
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
203
+ [titan] 2025-07-23 14:28:46,975 - root - INFO - Interleaving 3 datasets with probabilities [0.55, 0.3, 0.15]
204
+ [titan] 2025-07-23 14:28:47,679 - root - INFO - IterableDataset({
205
+ features: ['text', 'content'],
206
+ num_shards: 256
207
+ })
208
+ [titan] 2025-07-23 14:28:47,795 - root - INFO - Building dataloader...
209
+ [titan] 2025-07-23 14:28:47,797 - root - INFO - Loading model config from /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/mamba2_6_1_340M.json
210
+ [titan] 2025-07-23 14:28:47,799 - root - INFO - Building model from the config
+ Mamba2Config {
+   "architectures": [
+     "Mamba2ForCausalLM"
+   ],
+   "attn": {
+     "layers": [
+       5,
+       11,
+       17,
+       23
+     ],
+     "num_heads": 16,
+     "num_kv_heads": 8,
+     "qkv_bias": false,
+     "rope_theta": 160000.0,
+     "window_size": null
+   },
+   "attn_mode": "chunk",
+   "bos_token_id": 1,
+   "chunk_size": 256,
+   "conv_kernel": 4,
+   "eos_token_id": 2,
+   "expand": 2,
+   "fuse_cross_entropy": true,
+   "fuse_norm": true,
+   "fuse_swiglu": true,
+   "head_dim": 64,
+   "hidden_act": "silu",
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "model_type": "mamba2",
+   "n_groups": 1,
+   "norm_eps": 1e-05,
+   "num_heads": 32,
+   "num_hidden_layers": 48,
+   "pad_token_id": 0,
+   "rescale_prenorm_residual": true,
+   "residual_in_fp32": true,
+   "rms_norm": true,
+   "state_size": 128,
+   "tie_word_embeddings": false,
+   "time_step_floor": 0.0001,
+   "time_step_limit": [
+     0.0,
+     Infinity
+   ],
+   "time_step_max": 0.1,
+   "time_step_min": 0.001,
+   "time_step_rank": 128,
+   "transformers_version": "4.53.3",
+   "use_bias": false,
+   "use_cache": true,
+   "use_conv_bias": true,
+   "use_l2warp": false,
+   "vocab_size": 32000
+ }
+ 
268
+ [titan] 2025-07-23 14:28:50,147 - fla.layers.mamba2 - WARNING - The fast path is not available because one of `(selective_state_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation
269
+ [titan] 2025-07-23 14:28:50,148 - fla.layers.mamba2 - WARNING - The CUDA backend is not available because `causal_conv1d` is None. Falling back to the Triton backend. To install follow https://github.com/Dao-AILab/causal-conv1d
270
+ [titan] 2025-07-23 14:28:50,264 - root - INFO - 
+ Mamba2ForCausalLM(
+   (backbone): Mamba2Model(
+     (embeddings): Embedding(32000, 1024)
+     (layers): ModuleList(
+       (0-47): 48 x Mamba2Block(
+         (norm): RMSNorm(1024, eps=1e-05)
+         (mixer): Mamba2(
+           (conv1d): Conv1d(2304, 2304, kernel_size=(4,), stride=(1,), padding=(3,), groups=2304)
+           (in_proj): Linear(in_features=1024, out_features=4384, bias=False)
+           (norm): RMSNormGated()
+           (out_proj): Linear(in_features=2048, out_features=1024, bias=False)
+         )
+       )
+     )
+     (norm_f): RMSNorm(1024, eps=1e-05)
+   )
+   (lm_head): Linear(in_features=1024, out_features=32000, bias=False)
+   (criterion): FusedLinearCrossEntropyLoss()
+ )
+
291
+ [titan] 2025-07-23 14:28:50,316 - root - INFO - Compiling each block with torch.compile
292
+ [titan] 2025-07-23 14:28:50,316 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
293
+ [titan] 2025-07-23 14:28:50,317 - root - WARNING - No norm found in model
294
+ [titan] 2025-07-23 14:28:50,317 - root - INFO - Compiling the entire model with torch.compile
295
+ [titan] 2025-07-23 14:28:50,540 - root - INFO - Applied FSDP to the model
296
+ [titan] 2025-07-23 14:28:50,884 - fla.models.mamba2.modeling_mamba2 - WARNING - `A_log` is a DTensor, skipping initialization
297
+ [titan] 2025-07-23 14:28:51,042 - fla.models.mamba2.modeling_mamba2 - WARNING - `dt_bias` is a DTensor, skipping initialization
298
+ [titan] 2025-07-23 14:28:51,273 - root - INFO - CUDA memory usage for model: 0.19GiB(0.20%)
299
+ [titan] 2025-07-23 14:28:51,275 - root - WARNING - Warmup (100) + decay (95366) steps exceed total training steps (95366). Adjusting decay steps to 95266.
300
+ [titan] 2025-07-23 14:28:51,299 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/checkpoint
301
+ [titan] 2025-07-23 14:28:51,311 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
302
+ [titan] 2025-07-23 14:28:51,462 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
303
+ [titan] 2025-07-23 14:28:58,659 - root - INFO - ***** Running training *****
304
+ [titan] 2025-07-23 14:28:58,661 - root - INFO -  Training starts at step 1
305
+ [titan] 2025-07-23 14:28:58,662 - root - INFO -  Number of tokens per sequence = 8,192
306
+ [titan] 2025-07-23 14:28:58,662 - root - INFO -  Gradient Accumulation steps = 2
307
+ [titan] 2025-07-23 14:28:58,667 - root - INFO -  Instantaneous batch size (per device) = 8
308
+ [titan] 2025-07-23 14:28:58,667 - root - INFO -  Global batch size (w. parallel, distributed & accumulation) = 128 (1,048,576 tokens)
309
+ [titan] 2025-07-23 14:28:58,668 - root - INFO -  Total optimization steps = 95,366 (99,998,498,816 tokens)
310
+ [titan] 2025-07-23 14:28:58,669 - root - INFO -  Warmup steps = 100 (104,857,600 tokens)
311
+ [titan] 2025-07-23 14:28:58,669 - root - INFO -  Number of parameters = 382,387,712 
312
+ [titan] 2025-07-23 14:28:58,669 - root - INFO - Profiling active. Traces will be saved at /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/profile_trace
313
+ /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/functions.py:1263: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
314
+ If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
315
+ If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
316
+ torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
317
+ [rank7]: Traceback (most recent call last):
318
+ [rank7]: File "<frozen runpy>", line 198, in _run_module_as_main
319
+ [rank7]: File "<frozen runpy>", line 88, in _run_code
320
+ [rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
321
+ [rank7]: main(config)
322
+ [rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
323
+ [rank7]: return f(*args, **kwargs)
324
+ [rank7]: ^^^^^^^^^^^^^^^^^^
325
+ [rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 487, in main
326
+ [rank7]: output = model(
327
+ [rank7]: ^^^^^^
328
+ [rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
329
+ [rank7]: return self._call_impl(*args, **kwargs)
330
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
331
+ [rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1857, in _call_impl
332
+ [rank7]: return inner()
333
+ [rank7]: ^^^^^^^
334
+ [rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1805, in inner
335
+ [rank7]: result = forward_call(*args, **kwargs)
336
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
337
+ [rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func
338
+ [rank7]: return func(*args, **kwargs)
339
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^
340
+ [rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 526, in forward
341
+ [rank7]: outputs = self.backbone(
342
+ [rank7]: ^^^^^^^^^^^^^^
343
+ [rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
344
+ [rank7]: return self._call_impl(*args, **kwargs)
345
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
346
+ [rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
347
+ [rank7]: return forward_call(*args, **kwargs)
348
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
349
+ [rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 405, in forward
350
+ [rank7]: hidden_states = mixer_block(
351
+ [rank7]: ^^^^^^^^^^^^
352
+ [rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
353
+ [rank7]: return self._call_impl(*args, **kwargs)
354
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
355
+ [rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1857, in _call_impl
356
+ [rank7]: return inner()
357
+ [rank7]: ^^^^^^^
358
+ [rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1805, in inner
359
+ [rank7]: result = forward_call(*args, **kwargs)
360
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
361
+ [rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py", line 655, in _fn
362
+ [rank7]: return fn(*args, **kwargs)
363
+ [rank7]: ^^^^^^^^^^^^^^^^^^^
364
+ [rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
365
+ [rank7]: return self._call_impl(*args, **kwargs)
366
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
367
+ [rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
368
+ [rank7]: return forward_call(*args, **kwargs)
369
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
370
+ [rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 161, in forward
371
+ [rank7]: hidden_states = self.norm(hidden_states)
372
+ [rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 165, in torch_dynamo_resume_in_forward_at_161
373
+ [rank7]: hidden_states = self.mixer(
374
+ [rank7]: ^^^^^^^^^^^
375
+ [rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
376
+ [rank7]: return self._call_impl(*args, **kwargs)
377
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
378
+ [rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
379
+ [rank7]: return forward_call(*args, **kwargs)
380
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
381
+ [rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/mamba2.py", line 601, in forward
382
+ [rank7]: return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask)
383
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
384
+ [rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/mamba2.py", line 528, in torch_forward
385
+ [rank7]: G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :]
386
+ [rank7]: ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~
387
+ [rank7]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 GiB. GPU 7 has a total capacity of 95.00 GiB of which 85.09 GiB is free. Process 2003906 has 316.00 MiB memory in use. Process 696037 has 316.00 MiB memory in use. Process 1114702 has 9.27 GiB memory in use. Of the allocated memory 7.99 GiB is allocated by PyTorch, and 73.66 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
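Note that the PYTORCH_CUDA_ALLOC_CONF hint in the message cannot help here: 85 GiB was free against a 256 GiB request, so the problem is the naive fallback path, not fragmentation. Per the install URLs in the fla.layers.mamba2 warnings above, a minimal sketch for checking whether the fused kernels are importable; the PyPI package names mamba-ssm and causal-conv1d are assumptions taken from those repos, not confirmed by these logs:

# fla.layers.mamba2 probes for these modules; if either import is missing
# it logs the warnings seen above and uses the naive torch_forward.
# Install sketch (assumed package names): pip install mamba-ssm causal-conv1d
import importlib.util

for mod in ("mamba_ssm", "causal_conv1d"):
    found = importlib.util.find_spec(mod) is not None
    print(f"{mod}: {'ok' if found else 'missing -> naive fallback'}")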
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_07su7ijp/attempt_0/7/stdout.log ADDED
File without changes
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_77qh1j5t/attempt_0/0/error.json ADDED
@@ -0,0 +1 @@
+ {"message": {"message": "OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 GiB. GPU 0 has a total capacity of 95.00 GiB of which 63.94 GiB is free. Process 2003896 has 316.00 MiB memory in use. Process 696027 has 316.00 MiB memory in use. Process 1850004 has 21.15 GiB memory in use. Process 2711975 has 9.27 GiB memory in use. Of the allocated memory 7.99 GiB is allocated by PyTorch, and 73.66 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)", "extraInfo": {"py_callstack": "Traceback (most recent call last):\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py\", line 355, in wrapper\n return f(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py\", line 487, in main\n output = model(\n ^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1857, in _call_impl\n return inner()\n ^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1805, in inner\n result = forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/utils/deprecation.py\", line 172, in wrapped_func\n return func(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py\", line 526, in forward\n outputs = self.backbone(\n ^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py\", line 405, in forward\n hidden_states = mixer_block(\n ^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in 
_wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1857, in _call_impl\n return inner()\n ^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1805, in inner\n result = forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py\", line 655, in _fn\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py\", line 161, in forward\n hidden_states = self.norm(hidden_states)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py\", line 165, in torch_dynamo_resume_in_forward_at_161\n hidden_states = self.mixer(\n ^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/mamba2.py\", line 601, in forward\n return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/mamba2.py\", line 528, in torch_forward\n G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :]\n ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~\ntorch.OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 GiB. GPU 0 has a total capacity of 95.00 GiB of which 63.94 GiB is free. Process 2003896 has 316.00 MiB memory in use. Process 696027 has 316.00 MiB memory in use. Process 1850004 has 21.15 GiB memory in use. Process 2711975 has 9.27 GiB memory in use. 
Of the allocated memory 7.99 GiB is allocated by PyTorch, and 73.66 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)\n", "timestamp": "1753242220"}}}
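The 256 GiB request in this error is fully explained by the tensor shapes of this run. A minimal back-of-envelope sketch, assuming the broadcast in the naive `torch_forward` path materializes a (batch, n_chunks, chunk_size, chunk_size, num_heads, state_size) tensor in fp32 (all values taken from the config dumped in the stderr log below; the exact axis layout is inferred from the indexing and is an assumption):

    # Rough size of G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :]
    batch, seq_len, chunk, heads, state = 8, 8192, 256, 32, 128
    n_chunks = seq_len // chunk                      # 32
    elements = batch * n_chunks * chunk * chunk * heads * state
    print(elements)                                  # 68719476736
    print(elements * 4 / 2**30, "GiB")               # 256.0 GiB in fp32 -- matches the error exactly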
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_77qh1j5t/attempt_0/0/stderr.log ADDED
@@ -0,0 +1,467 @@
+ [titan] 2025-07-23 11:13:04,987 - root - INFO - Starting job: default job
+ [titan] 2025-07-23 11:13:04,987 - root - INFO - {
+ "activation_checkpoint": {
+ "mode": "none",
+ "selective_ac_option": "2"
+ },
+ "activation_offload": {
+ "mode": "none"
+ },
+ "checkpoint": {
+ "async_mode": "disabled",
+ "create_seed_checkpoint": false,
+ "enable_checkpoint": true,
+ "exclude_from_loading": [],
+ "export_dtype": "float32",
+ "folder": "checkpoint",
+ "interval": 8192,
+ "interval_type": "steps",
+ "keep_latest_k": 100,
+ "load_step": -1,
+ "model_weights_only": false
+ },
+ "comm": {
+ "init_timeout_seconds": 300,
+ "trace_buf_size": 20000,
+ "train_timeout_seconds": 100
+ },
+ "experimental": {
+ "context_parallel_degree": 1,
+ "context_parallel_rotate_method": "allgather",
+ "custom_model_path": "",
+ "enable_async_tensor_parallel": false,
+ "enable_compiled_autograd": false,
+ "pipeline_parallel_degree": 1,
+ "pipeline_parallel_microbatches": null,
+ "pipeline_parallel_schedule": "1F1B",
+ "pipeline_parallel_schedule_csv": "",
+ "pipeline_parallel_split_points": []
+ },
+ "fault_tolerance": {
+ "enable": false,
+ "group_size": 0,
+ "min_replica_size": 1,
+ "replica_id": 0
+ },
+ "float8": {
+ "enable_fsdp_float8_all_gather": false,
+ "force_recompute_fp8_weight_in_bwd": false,
+ "precompute_float8_dynamic_scale_for_fsdp": false,
+ "recipe_name": null
+ },
+ "job": {
+ "config_file": "flame/models/fla.toml",
+ "description": "default job",
+ "dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2",
+ "print_args": true,
+ "use_for_integration_test": false
+ },
+ "lr_scheduler": {
+ "decay_ratio": 1.0,
+ "decay_type": "linear",
+ "lr_min": 0.01,
+ "warmup_steps": 100
+ },
+ "memory_estimation": {
+ "disable_fake_mode": false,
+ "enabled": false
+ },
+ "metrics": {
+ "disable_color_printing": false,
+ "enable_tensorboard": true,
+ "enable_wandb": true,
+ "log_freq": 1,
+ "save_for_all_ranks": false,
+ "save_tb_folder": "tb"
+ },
+ "model": {
+ "config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/mamba2_6_1_340M.json",
+ "converters": [],
+ "name": "fla",
+ "print_after_conversion": false,
+ "tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
+ },
+ "optimizer": {
+ "early_step_in_backward": false,
+ "eps": 1e-08,
+ "implementation": "fused",
+ "lr": 0.0003,
+ "name": "AdamW"
+ },
+ "profiling": {
+ "enable_memory_snapshot": false,
+ "enable_profiling": true,
+ "profile_freq": 512,
+ "save_memory_snapshot_folder": "memory_snapshot",
+ "save_traces_folder": "profile_trace"
+ },
+ "training": {
+ "batch_size": 8,
+ "compile": true,
+ "context_len": 8192,
+ "data_dir": null,
+ "data_files": null,
+ "data_parallel_replicate_degree": 1,
+ "data_parallel_shard_degree": -1,
+ "data_probs": "0.55,0.3,0.15",
+ "dataset": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro",
+ "dataset_name": "default,default,default",
+ "dataset_split": "train,train,train",
+ "deterministic": false,
+ "disable_loss_parallel": false,
+ "enable_cpu_offload": false,
+ "fsdp_reshard_after_forward": "default",
+ "gc_freq": 50,
+ "gradient_accumulation_steps": 2,
+ "max_norm": 1.0,
+ "mixed_precision_param": "bfloat16",
+ "mixed_precision_reduce": "float32",
+ "num_workers": 32,
+ "persistent_workers": false,
+ "pin_memory": false,
+ "prefetch_factor": 2,
+ "seed": 42,
+ "seq_len": 8192,
+ "skip_nan_inf": true,
+ "steps": 95366,
+ "streaming": true,
+ "tensor_parallel_degree": 1,
+ "varlen": false
+ }
+ }
+ [titan] 2025-07-23 11:13:04,988 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
+ [titan] 2025-07-23 11:13:04,988 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
+ [titan] 2025-07-23 11:13:05,005 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
+ [titan] 2025-07-23 11:13:05,098 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
+ [titan] 2025-07-23 11:13:05,098 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
+ [titan] 2025-07-23 11:13:05,099 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
+ [titan] 2025-07-23 11:13:07,424 - root - INFO - Loading tokenizer...
+ [titan] 2025-07-23 11:13:07,703 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
+ 0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ }
+ )
+ [titan] 2025-07-23 11:13:07,704 - root - INFO - Loading dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro:default,default,default
+ `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ [titan] 2025-07-23 11:13:07,704 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ [titan] 2025-07-23 11:13:08,426 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample:default (p = 0.550):
+ IterableDataset({
+ features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
+ num_shards: 140
+ })
+ [titan] 2025-07-23 11:13:08,426 - root - INFO - Shuffling the dataset with seed 42
+ [titan] 2025-07-23 11:13:08,426 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample has insufficient shards (140). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
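The 256-shard floor in this warning is simply the data-parallel world size times the per-rank dataloader worker count, both visible earlier in the log; a quick cross-check with this run's numbers:

    dp_shard_ranks = 8      # from "Building 1-D device mesh with ['dp_shard'], [8]" above
    num_workers = 32        # "num_workers" in the training config above
    print(dp_shard_ranks * num_workers)  # 256 -- each dataloader worker needs its own shard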
+ `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ [titan] 2025-07-23 11:13:08,427 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ [titan] 2025-07-23 11:35:37,776 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ [titan] 2025-07-23 11:35:37,867 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged:default (p = 0.300):
+ IterableDataset({
+ features: ['repo', 'content'],
+ num_shards: 1
+ })
+ [titan] 2025-07-23 11:35:37,867 - root - INFO - Shuffling the dataset with seed 42
+ [titan] 2025-07-23 11:35:37,867 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged has insufficient shards (1). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
+ `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ [titan] 2025-07-23 11:35:37,868 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ Setting num_proc from 32 back to 1 for the train split to disable multiprocessing as it only contains one shard.
+ [titan] 2025-07-23 11:35:37,949 - datasets.builder - WARNING - Setting num_proc from 32 back to 1 for the train split to disable multiprocessing as it only contains one shard.
+
+ `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ [titan] 2025-07-23 11:36:07,557 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ [titan] 2025-07-23 11:36:08,012 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro:default (p = 0.150):
+ IterableDataset({
+ features: ['text', 'cc-path', 'domain', 'lang', 'lang_score', 'timestamp', 'url', 'math_score'],
+ num_shards: 100
+ })
+ [titan] 2025-07-23 11:36:08,012 - root - INFO - Shuffling the dataset with seed 42
+ [titan] 2025-07-23 11:36:08,013 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro has insufficient shards (100). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
+ `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ [titan] 2025-07-23 11:36:08,013 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+
+ [titan] 2025-07-23 11:40:36,057 - root - INFO - Interleaving 3 datasets with probabilities [0.55, 0.3, 0.15]
+ [titan] 2025-07-23 11:40:36,957 - root - INFO - IterableDataset({
+ features: ['text', 'content'],
+ num_shards: 256
+ })
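The mixing step reported above can be reproduced with the public `datasets` API. A sketch under the assumption that flame wires it up roughly like this (the paths are the three dataset directories from the config; flame's actual dataloader code may differ):

    from datasets import load_dataset, interleave_datasets

    paths = [
        "dataset/fineweb-edu-sample",
        "dataset/small_repos_20B_sample_merged",
        "dataset/megamath-web-pro",
    ]
    subsets = [load_dataset(p, split="train", streaming=True) for p in paths]
    # Probabilities and seed as in the log; the result's features are the
    # union of the subsets' columns, hence ['text', 'content'] above.
    mixed = interleave_datasets(subsets, probabilities=[0.55, 0.30, 0.15], seed=42)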
+ [titan] 2025-07-23 11:40:37,082 - root - INFO - Building dataloader...
+ [titan] 2025-07-23 11:40:37,085 - root - INFO - Loading model config from /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/mamba2_6_1_340M.json
+ [titan] 2025-07-23 11:40:37,088 - root - INFO - Building model from the config
+ Mamba2Config {
+ "architectures": [
+ "Mamba2ForCausalLM"
+ ],
+ "attn": {
+ "layers": [
+ 5,
+ 11,
+ 17,
+ 23
+ ],
+ "num_heads": 16,
+ "num_kv_heads": 8,
+ "qkv_bias": false,
+ "rope_theta": 160000.0,
+ "window_size": null
+ },
+ "attn_mode": "chunk",
+ "bos_token_id": 1,
+ "chunk_size": 256,
+ "conv_kernel": 4,
+ "eos_token_id": 2,
+ "expand": 2,
+ "fuse_cross_entropy": true,
+ "fuse_norm": true,
+ "fuse_swiglu": true,
+ "head_dim": 64,
+ "hidden_act": "silu",
+ "hidden_size": 1024,
+ "initializer_range": 0.02,
+ "model_type": "mamba2",
+ "n_groups": 1,
+ "norm_eps": 1e-05,
+ "num_heads": 32,
+ "num_hidden_layers": 48,
+ "pad_token_id": 0,
+ "rescale_prenorm_residual": true,
+ "residual_in_fp32": true,
+ "rms_norm": true,
+ "state_size": 128,
+ "tie_word_embeddings": false,
+ "time_step_floor": 0.0001,
+ "time_step_limit": [
+ 0.0,
+ Infinity
+ ],
+ "time_step_max": 0.1,
+ "time_step_min": 0.001,
+ "time_step_rank": 128,
+ "transformers_version": "4.53.3",
+ "use_bias": false,
+ "use_cache": true,
+ "use_conv_bias": true,
+ "use_l2warp": false,
+ "vocab_size": 32000
+ }
+
+ [titan] 2025-07-23 11:40:39,687 - fla.layers.mamba2 - WARNING - The fast path is not available because one of `(selective_state_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation
+ [titan] 2025-07-23 11:40:39,687 - fla.layers.mamba2 - WARNING - The CUDA backend is not available because `causal_conv1d` is None. Falling back to the Triton backend. To install follow https://github.com/Dao-AILab/causal-conv1d
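These two warnings are the root of the failure further down: with the fused kernels missing, `fla.layers.mamba2` falls back to the naive `torch_forward`, which materializes the huge chunked-scan intermediate. A sketch of the availability probe, using the import paths the HF Mamba2 port uses (treat these exact paths as an assumption for fla's probe):

    try:
        from mamba_ssm.ops.triton.selective_state_update import selective_state_update
    except ImportError:
        selective_state_update = None
    try:
        from causal_conv1d import causal_conv1d_fn
    except ImportError:
        causal_conv1d_fn = None
    # Both are None on this machine, hence the two fallback warnings above.
    print(selective_state_update is None, causal_conv1d_fn is None)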
+ [titan] 2025-07-23 11:40:39,804 - root - INFO -
+ Mamba2ForCausalLM(
+ (backbone): Mamba2Model(
+ (embeddings): Embedding(32000, 1024)
+ (layers): ModuleList(
+ (0-47): 48 x Mamba2Block(
+ (norm): RMSNorm(1024, eps=1e-05)
+ (mixer): Mamba2(
+ (conv1d): Conv1d(2304, 2304, kernel_size=(4,), stride=(1,), padding=(3,), groups=2304)
+ (in_proj): Linear(in_features=1024, out_features=4384, bias=False)
+ (norm): RMSNormGated()
+ (out_proj): Linear(in_features=2048, out_features=1024, bias=False)
+ )
+ )
+ )
+ (norm_f): RMSNorm(1024, eps=1e-05)
+ )
+ (lm_head): Linear(in_features=1024, out_features=32000, bias=False)
+ (criterion): FusedLinearCrossEntropyLoss()
+ )
+
+ [titan] 2025-07-23 11:40:39,857 - root - INFO - Compiling each block with torch.compile
+ [titan] 2025-07-23 11:40:39,857 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
+ [titan] 2025-07-23 11:40:39,857 - root - WARNING - No norm found in model
+ [titan] 2025-07-23 11:40:39,858 - root - INFO - Compiling the entire model with torch.compile
+ [titan] 2025-07-23 11:40:40,108 - root - INFO - Applied FSDP to the model
+ [titan] 2025-07-23 11:40:40,431 - fla.models.mamba2.modeling_mamba2 - WARNING - `A_log` is a DTensor, skipping initialization
+ [titan] 2025-07-23 11:40:40,596 - fla.models.mamba2.modeling_mamba2 - WARNING - `dt_bias` is a DTensor, skipping initialization
+ [titan] 2025-07-23 11:40:40,842 - root - INFO - CUDA memory usage for model: 0.19GiB(0.20%)
+ [titan] 2025-07-23 11:40:40,845 - root - WARNING - Warmup (100) + decay (95366) steps exceed total training steps (95366). Adjusting decay steps to 95266.
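The adjustment above keeps warmup + decay within the step budget. A sketch of the resulting schedule, assuming flame treats the config's `lr_min` as a fraction of the base lr (0.01 would not make sense as an absolute floor, since it exceeds the base lr of 3e-4):

    base_lr = 3e-4
    lr_min_ratio = 0.01          # assumed to be a fraction of base_lr
    warmup, total = 100, 95_366
    decay = total - warmup       # 95_266, as the warning reports

    def lr_at(step: int) -> float:
        if step <= warmup:                      # linear warmup from 0 to base_lr
            return base_lr * step / warmup
        done = (step - warmup) / decay          # linear decay toward the floor
        return base_lr * (1.0 - done * (1.0 - lr_min_ratio))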
+ [titan] 2025-07-23 11:40:40,873 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/checkpoint
+ wandb: Network error (InvalidURL), entering retry loop.
+ wandb: W&B API key is configured. Use `wandb login --relogin` to force relogin
+ wandb: Network error (InvalidURL), entering retry loop.
+ [titan] 2025-07-23 11:42:33,611 - root - ERROR - Failed to create WandB logger: Run initialization has timed out after 90.0 sec. Please try increasing the timeout with the `init_timeout` setting: `wandb.init(settings=wandb.Settings(init_timeout=120))`.
+ [titan] 2025-07-23 11:42:33,728 - root - INFO - TensorBoard logging enabled. Logs will be saved at /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/tb/20250723-1140
+ [titan] 2025-07-23 11:42:33,729 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
+ [titan] 2025-07-23 11:42:33,774 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
+ [titan] 2025-07-23 11:42:43,730 - root - INFO - ***** Running training *****
+ [titan] 2025-07-23 11:42:43,732 - root - INFO -  Training starts at step 1
+ [titan] 2025-07-23 11:42:43,732 - root - INFO -  Number of tokens per sequence = 8,192
+ [titan] 2025-07-23 11:42:43,732 - root - INFO -  Gradient Accumulation steps = 2
+ [titan] 2025-07-23 11:42:43,732 - root - INFO -  Instantaneous batch size (per device) = 8
+ [titan] 2025-07-23 11:42:43,733 - root - INFO -  Global batch size (w. parallel, distributed & accumulation) = 128 (1,048,576 tokens)
+ [titan] 2025-07-23 11:42:43,733 - root - INFO -  Total optimization steps = 95,366 (99,998,498,816 tokens)
+ [titan] 2025-07-23 11:42:43,733 - root - INFO -  Warmup steps = 100 (104,857,600 tokens)
+ [titan] 2025-07-23 11:42:43,733 - root - INFO -  Number of parameters = 382,387,712
+ [titan] 2025-07-23 11:42:43,733 - root - INFO - Profiling active. Traces will be saved at /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/profile_trace
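Every figure in the block above follows from the config; a quick cross-check of the batch and token arithmetic:

    per_device, grad_accum, dp_ranks, seq_len = 8, 2, 8, 8192
    global_batch = per_device * grad_accum * dp_ranks    # 128 sequences
    tokens_per_step = global_batch * seq_len             # 1_048_576
    print(tokens_per_step * 95_366)                      # 99_998_498_816 (~100B tokens total)
    print(tokens_per_step * 100)                         # 104_857_600 warmup tokens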
+ /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/functions.py:1263: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
+ If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
+ If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
+ torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
+ Traceback (most recent call last):
+ File "<frozen runpy>", line 198, in _run_module_as_main
+ File "<frozen runpy>", line 88, in _run_code
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
+ main(config)
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
+ return f(*args, **kwargs)
+ ^^^^^^^^^^^^^^^^^^
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 487, in main
+ output = model(
+ ^^^^^^
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+ return self._call_impl(*args, **kwargs)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1857, in _call_impl
+ return inner()
+ ^^^^^^^
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1805, in inner
+ result = forward_call(*args, **kwargs)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func
+ return func(*args, **kwargs)
+ ^^^^^^^^^^^^^^^^^^^^^
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 526, in forward
+ outputs = self.backbone(
+ ^^^^^^^^^^^^^^
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+ return self._call_impl(*args, **kwargs)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
+ return forward_call(*args, **kwargs)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 405, in forward
+ hidden_states = mixer_block(
+ ^^^^^^^^^^^^
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+ return self._call_impl(*args, **kwargs)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1857, in _call_impl
+ return inner()
+ ^^^^^^^
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1805, in inner
+ result = forward_call(*args, **kwargs)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py", line 655, in _fn
+ return fn(*args, **kwargs)
+ ^^^^^^^^^^^^^^^^^^^
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+ return self._call_impl(*args, **kwargs)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
+ return forward_call(*args, **kwargs)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 161, in forward
+ hidden_states = self.norm(hidden_states)
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 165, in torch_dynamo_resume_in_forward_at_161
+ hidden_states = self.mixer(
+ ^^^^^^^^^^^
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+ return self._call_impl(*args, **kwargs)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
+ return forward_call(*args, **kwargs)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/mamba2.py", line 601, in forward
+ return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/mamba2.py", line 528, in torch_forward
+ G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :]
+ ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~
+ torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 GiB. GPU 0 has a total capacity of 95.00 GiB of which 63.94 GiB is free. Process 2003896 has 316.00 MiB memory in use. Process 696027 has 316.00 MiB memory in use. Process 1850004 has 21.15 GiB memory in use. Process 2711975 has 9.27 GiB memory in use. Of the allocated memory 7.99 GiB is allocated by PyTorch, and 73.66 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+ [rank0]: Traceback (most recent call last):
+ [rank0]: File "<frozen runpy>", line 198, in _run_module_as_main
+ [rank0]: File "<frozen runpy>", line 88, in _run_code
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
+ [rank0]: main(config)
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
+ [rank0]: return f(*args, **kwargs)
+ [rank0]: ^^^^^^^^^^^^^^^^^^
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 487, in main
+ [rank0]: output = model(
+ [rank0]: ^^^^^^
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+ [rank0]: return self._call_impl(*args, **kwargs)
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1857, in _call_impl
+ [rank0]: return inner()
+ [rank0]: ^^^^^^^
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1805, in inner
+ [rank0]: result = forward_call(*args, **kwargs)
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func
+ [rank0]: return func(*args, **kwargs)
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 526, in forward
+ [rank0]: outputs = self.backbone(
+ [rank0]: ^^^^^^^^^^^^^^
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+ [rank0]: return self._call_impl(*args, **kwargs)
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
+ [rank0]: return forward_call(*args, **kwargs)
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 405, in forward
+ [rank0]: hidden_states = mixer_block(
+ [rank0]: ^^^^^^^^^^^^
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+ [rank0]: return self._call_impl(*args, **kwargs)
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1857, in _call_impl
+ [rank0]: return inner()
+ [rank0]: ^^^^^^^
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1805, in inner
+ [rank0]: result = forward_call(*args, **kwargs)
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py", line 655, in _fn
+ [rank0]: return fn(*args, **kwargs)
+ [rank0]: ^^^^^^^^^^^^^^^^^^^
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+ [rank0]: return self._call_impl(*args, **kwargs)
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
+ [rank0]: return forward_call(*args, **kwargs)
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 161, in forward
+ [rank0]: hidden_states = self.norm(hidden_states)
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/mamba2/modeling_mamba2.py", line 165, in torch_dynamo_resume_in_forward_at_161
+ [rank0]: hidden_states = self.mixer(
+ [rank0]: ^^^^^^^^^^^
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+ [rank0]: return self._call_impl(*args, **kwargs)
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
+ [rank0]: return forward_call(*args, **kwargs)
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/mamba2.py", line 601, in forward
+ [rank0]: return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask)
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/mamba2.py", line 528, in torch_forward
+ [rank0]: G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :]
+ [rank0]: ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~
+ [rank0]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 GiB. GPU 0 has a total capacity of 95.00 GiB of which 63.94 GiB is free. Process 2003896 has 316.00 MiB memory in use. Process 696027 has 316.00 MiB memory in use. Process 1850004 has 21.15 GiB memory in use. Process 2711975 has 9.27 GiB memory in use. Of the allocated memory 7.99 GiB is allocated by PyTorch, and 73.66 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
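Given the shape arithmetic above, the `expandable_segments` hint in the message cannot help here: a single 256 GiB request can never fit on a 95 GiB H20. The intermediate itself has to shrink, or vanish entirely by installing the fused kernels flagged in the earlier warnings; even dropping the per-device batch from 8 to 1 would still leave a 32 GiB tensor:

    # Same shape arithmetic as before, with per-device batch reduced 8 -> 1.
    print(1 * 32 * 256 * 256 * 32 * 128 * 4 / 2**30)  # 32.0 GiB, still far too large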
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_77qh1j5t/attempt_0/0/stdout.log ADDED
File without changes