Upload folder using huggingface_hub
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/8k-100.sh +65 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/config.json +53 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/delta_net_1B.json +29 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/delta_net_340M.json +26 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gated_deltanet_1B.json +22 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gated_deltanet_340M.json +22 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gdn_6_1_340M.json +50 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gla_340M.json +24 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gla_7B.json +25 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gsa_340M.json +29 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/hgrn2_340M.json +20 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/mamba2_1B.json +32 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/mamba2_340M.json +32 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/mamba_1B.json +30 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/mamba_340M.json +30 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/samba_1B.json +52 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/sba_340m.json +18 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/transformer_1B.json +22 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/transformer_340M.json +18 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/transformer_7B.json +21 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/0/stderr.log +6 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/0/stdout.log +0 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/1/stderr.log +6 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/1/stdout.log +0 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/2/stderr.log +6 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/2/stdout.log +0 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/3/stderr.log +6 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/3/stdout.log +0 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/4/stderr.log +6 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/4/stdout.log +0 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/5/stderr.log +6 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/5/stdout.log +0 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/6/stderr.log +6 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/6/stdout.log +0 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/7/stderr.log +6 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/7/stdout.log +0 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/0/stderr.log +0 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/0/stdout.log +0 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/1/stderr.log +187 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/1/stdout.log +0 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/2/stderr.log +187 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/2/stdout.log +0 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/3/stderr.log +187 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/3/stdout.log +0 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/4/stderr.log +187 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/4/stdout.log +0 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/5/stderr.log +187 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/5/stdout.log +0 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/6/stderr.log +187 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/6/stdout.log +0 -0
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/8k-100.sh
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FLAME_PATH=/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame
|
| 2 |
+
DATASET_ROOT=/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset
|
| 3 |
+
TOKENIZER=/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer
|
| 4 |
+
|
| 5 |
+
cd $FLAME_PATH
|
| 6 |
+
source .venv/bin/activate
|
| 7 |
+
|
| 8 |
+
# =========== train config ===========
|
| 9 |
+
CONFIG=${1:-transformer_340M.json}
|
| 10 |
+
SEQ_LEN=8192
|
| 11 |
+
WARMUP_STEPS=100
|
| 12 |
+
STEPS=95366
|
| 13 |
+
LR=3e-4
|
| 14 |
+
BATCH_SIZE=16
|
| 15 |
+
DECAY_TYPE=linear
|
| 16 |
+
DECAY_RATIO=1
|
| 17 |
+
|
| 18 |
+
NNODE=1
|
| 19 |
+
NGPU=8
|
| 20 |
+
LOG_RANK=0
|
| 21 |
+
# ====================================
|
| 22 |
+
|
| 23 |
+
# if jq command is not found, install it
|
| 24 |
+
if ! command -v jq &> /dev/null; then
|
| 25 |
+
echo "jq could not be found, installing it..."
|
| 26 |
+
sudo yum install -y jq
|
| 27 |
+
fi
|
| 28 |
+
|
| 29 |
+
EXP_NAME=$(basename $CONFIG | sed 's/\.config//')-ctx${SEQ_LEN}-steps${STEPS}-lr${LR}-decay_type${DECAY_TYPE}-decay_ratio${DECAY_RATIO}-bs${BATCH_SIZE}-nn${NNODE}
|
| 30 |
+
|
| 31 |
+
bash train.sh \
|
| 32 |
+
--job.config_file flame/models/fla.toml \
|
| 33 |
+
--job.dump_folder $FLAME_PATH/exp/$EXP_NAME \
|
| 34 |
+
--model.config $FLAME_PATH/configs/$CONFIG \
|
| 35 |
+
--model.tokenizer_path $TOKENIZER \
|
| 36 |
+
--optimizer.name AdamW \
|
| 37 |
+
--optimizer.eps 1e-8 \
|
| 38 |
+
--optimizer.lr $LR \
|
| 39 |
+
--lr_scheduler.warmup_steps $WARMUP_STEPS \
|
| 40 |
+
--lr_scheduler.lr_min 0.01 \
|
| 41 |
+
--lr_scheduler.decay_type $DECAY_TYPE \
|
| 42 |
+
--lr_scheduler.decay_ratio $DECAY_RATIO \
|
| 43 |
+
--training.batch_size $BATCH_SIZE \
|
| 44 |
+
--training.seq_len $SEQ_LEN \
|
| 45 |
+
--training.context_len $SEQ_LEN \
|
| 46 |
+
--training.gradient_accumulation_steps 1 \
|
| 47 |
+
--training.steps $STEPS \
|
| 48 |
+
--training.max_norm 1.0 \
|
| 49 |
+
--training.skip_nan_inf \
|
| 50 |
+
--training.dataset $DATASET_ROOT/fineweb-edu-sample,$DATASET_ROOT/small_repos_20B_sample_merged,$DATASET_ROOT/megamath-web-pro \
|
| 51 |
+
--training.data_probs 0.55,0.3,0.15 \
|
| 52 |
+
--training.dataset_split train,train,train \
|
| 53 |
+
--training.dataset_name default,default,default \
|
| 54 |
+
--training.streaming \
|
| 55 |
+
--training.num_workers 32 \
|
| 56 |
+
--training.prefetch_factor 2 \
|
| 57 |
+
--training.seed 42 \
|
| 58 |
+
--training.compile \
|
| 59 |
+
--checkpoint.interval 8192 \
|
| 60 |
+
--checkpoint.load_step -1 \
|
| 61 |
+
--checkpoint.keep_latest_k 100 \
|
| 62 |
+
--metrics.log_freq 1 \
|
| 63 |
+
--metrics.enable_tensorboard \
|
| 64 |
+
--training.streaming
|
| 65 |
+
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/config.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"allow_neg_eigval": false,
|
| 3 |
+
"architectures": [
|
| 4 |
+
"GatedDeltaNetForCausalLM"
|
| 5 |
+
],
|
| 6 |
+
"attn": {
|
| 7 |
+
"layers": [
|
| 8 |
+
5,
|
| 9 |
+
11,
|
| 10 |
+
17,
|
| 11 |
+
23
|
| 12 |
+
],
|
| 13 |
+
"num_heads": 16,
|
| 14 |
+
"num_kv_heads": 8,
|
| 15 |
+
"qkv_bias": false,
|
| 16 |
+
"rope_theta": 160000.0,
|
| 17 |
+
"window_size": null
|
| 18 |
+
},
|
| 19 |
+
"attn_mode": "chunk",
|
| 20 |
+
"bos_token_id": 1,
|
| 21 |
+
"conv_size": 4,
|
| 22 |
+
"eos_token_id": 2,
|
| 23 |
+
"expand_k": 1,
|
| 24 |
+
"expand_v": 1,
|
| 25 |
+
"fuse_cross_entropy": true,
|
| 26 |
+
"fuse_norm": true,
|
| 27 |
+
"fuse_swiglu": true,
|
| 28 |
+
"head_dim": 256,
|
| 29 |
+
"hidden_act": "swish",
|
| 30 |
+
"hidden_ratio": 4,
|
| 31 |
+
"hidden_size": 1024,
|
| 32 |
+
"initializer_range": 0.02,
|
| 33 |
+
"intermediate_size": null,
|
| 34 |
+
"max_position_embeddings": 8192,
|
| 35 |
+
"model_type": "gated_deltanet",
|
| 36 |
+
"norm_eps": 1e-06,
|
| 37 |
+
"norm_first": false,
|
| 38 |
+
"num_heads": 4,
|
| 39 |
+
"num_hidden_layers": 24,
|
| 40 |
+
"num_v_heads": null,
|
| 41 |
+
"qk_activation": "silu",
|
| 42 |
+
"qk_norm": "l2",
|
| 43 |
+
"tie_word_embeddings": false,
|
| 44 |
+
"torch_dtype": "float32",
|
| 45 |
+
"transformers_version": "4.53.3",
|
| 46 |
+
"use_beta": true,
|
| 47 |
+
"use_cache": true,
|
| 48 |
+
"use_gate": true,
|
| 49 |
+
"use_l2warp": false,
|
| 50 |
+
"use_output_norm": true,
|
| 51 |
+
"use_short_conv": true,
|
| 52 |
+
"vocab_size": 32000
|
| 53 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/delta_net_1B.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attn": null,
|
| 3 |
+
"attn_mode": "chunk",
|
| 4 |
+
"bos_token_id": 1,
|
| 5 |
+
"conv_size": 4,
|
| 6 |
+
"eos_token_id": 2,
|
| 7 |
+
"expand_k": 1,
|
| 8 |
+
"expand_v": 1,
|
| 9 |
+
"fuse_cross_entropy": true,
|
| 10 |
+
"fuse_norm": true,
|
| 11 |
+
"hidden_act": "swish",
|
| 12 |
+
"hidden_ratio": 4,
|
| 13 |
+
"hidden_size": 2048,
|
| 14 |
+
"initializer_range": 0.02,
|
| 15 |
+
"intermediate_size": null,
|
| 16 |
+
"model_type": "delta_net",
|
| 17 |
+
"norm_eps": 1e-06,
|
| 18 |
+
"num_heads": 16,
|
| 19 |
+
"num_hidden_layers": 24,
|
| 20 |
+
"pad_token_id": 2,
|
| 21 |
+
"qk_activation": "silu",
|
| 22 |
+
"qk_norm": "l2",
|
| 23 |
+
"tie_word_embeddings": false,
|
| 24 |
+
"use_beta": true,
|
| 25 |
+
"use_cache": true,
|
| 26 |
+
"use_gate": false,
|
| 27 |
+
"use_output_norm": true,
|
| 28 |
+
"use_short_conv": true
|
| 29 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/delta_net_340M.json
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attn_mode": "chunk",
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"conv_size": 4,
|
| 5 |
+
"eos_token_id": 2,
|
| 6 |
+
"expand_k": 1,
|
| 7 |
+
"expand_v": 1,
|
| 8 |
+
"fuse_cross_entropy": true,
|
| 9 |
+
"hidden_act": "swish",
|
| 10 |
+
"hidden_ratio": 4,
|
| 11 |
+
"hidden_size": 1024,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"intermediate_size": null,
|
| 14 |
+
"model_type": "delta_net",
|
| 15 |
+
"norm_eps": 1e-06,
|
| 16 |
+
"num_heads": 8,
|
| 17 |
+
"num_hidden_layers": 24,
|
| 18 |
+
"qk_activation": "silu",
|
| 19 |
+
"qk_norm": "l2",
|
| 20 |
+
"tie_word_embeddings": false,
|
| 21 |
+
"use_beta": true,
|
| 22 |
+
"use_cache": true,
|
| 23 |
+
"use_gate": false,
|
| 24 |
+
"use_output_norm": true,
|
| 25 |
+
"use_short_conv": true
|
| 26 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gated_deltanet_1B.json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attn_mode": "chunk",
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"conv_size": 4,
|
| 5 |
+
"eos_token_id": 2,
|
| 6 |
+
"expand_v": 2,
|
| 7 |
+
"fuse_cross_entropy": true,
|
| 8 |
+
"head_dim": 256,
|
| 9 |
+
"hidden_act": "swish",
|
| 10 |
+
"hidden_ratio": 4,
|
| 11 |
+
"hidden_size": 2048,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"intermediate_size": null,
|
| 14 |
+
"model_type": "gated_deltanet",
|
| 15 |
+
"norm_eps": 1e-06,
|
| 16 |
+
"num_heads": 6,
|
| 17 |
+
"num_hidden_layers": 21,
|
| 18 |
+
"tie_word_embeddings": false,
|
| 19 |
+
"use_cache": true,
|
| 20 |
+
"use_gate": true,
|
| 21 |
+
"use_short_conv": true
|
| 22 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gated_deltanet_340M.json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attn_mode": "chunk",
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"conv_size": 4,
|
| 5 |
+
"eos_token_id": 2,
|
| 6 |
+
"expand_v": 2,
|
| 7 |
+
"fuse_cross_entropy": true,
|
| 8 |
+
"head_dim": 256,
|
| 9 |
+
"hidden_act": "swish",
|
| 10 |
+
"hidden_ratio": 4,
|
| 11 |
+
"hidden_size": 1024,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"intermediate_size": null,
|
| 14 |
+
"model_type": "gated_deltanet",
|
| 15 |
+
"norm_eps": 1e-06,
|
| 16 |
+
"num_heads": 6,
|
| 17 |
+
"num_hidden_layers": 21,
|
| 18 |
+
"tie_word_embeddings": false,
|
| 19 |
+
"use_cache": true,
|
| 20 |
+
"use_gate": true,
|
| 21 |
+
"use_short_conv": true
|
| 22 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gdn_6_1_340M.json
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"GatedDeltaNetForCausalLM"
|
| 4 |
+
],
|
| 5 |
+
"attn": {
|
| 6 |
+
"layers": [
|
| 7 |
+
5,
|
| 8 |
+
11,
|
| 9 |
+
17,
|
| 10 |
+
23
|
| 11 |
+
],
|
| 12 |
+
"num_heads": 16,
|
| 13 |
+
"num_kv_heads": 8,
|
| 14 |
+
"qkv_bias": false,
|
| 15 |
+
"rope_theta": 160000.0,
|
| 16 |
+
"window_size": null
|
| 17 |
+
},
|
| 18 |
+
"attn_mode": "chunk",
|
| 19 |
+
"bos_token_id": 1,
|
| 20 |
+
"conv_size": 4,
|
| 21 |
+
"eos_token_id": 2,
|
| 22 |
+
"expand_k": 1,
|
| 23 |
+
"expand_v": 1,
|
| 24 |
+
"fuse_cross_entropy": true,
|
| 25 |
+
"fuse_norm": true,
|
| 26 |
+
"fuse_swiglu": true,
|
| 27 |
+
"head_dim": 256,
|
| 28 |
+
"hidden_act": "swish",
|
| 29 |
+
"hidden_ratio": 4,
|
| 30 |
+
"hidden_size": 1024,
|
| 31 |
+
"initializer_range": 0.02,
|
| 32 |
+
"intermediate_size": null,
|
| 33 |
+
"max_position_embeddings": 8192,
|
| 34 |
+
"model_type": "gated_deltanet",
|
| 35 |
+
"norm_eps": 1e-06,
|
| 36 |
+
"norm_first": false,
|
| 37 |
+
"num_heads": 4,
|
| 38 |
+
"num_hidden_layers": 24,
|
| 39 |
+
"qk_activation": "silu",
|
| 40 |
+
"qk_norm": "l2",
|
| 41 |
+
"tie_word_embeddings": false,
|
| 42 |
+
"torch_dtype": "float32",
|
| 43 |
+
"transformers_version": "4.51.3",
|
| 44 |
+
"use_beta": true,
|
| 45 |
+
"use_cache": true,
|
| 46 |
+
"use_gate": true,
|
| 47 |
+
"use_output_norm": true,
|
| 48 |
+
"use_short_conv": true,
|
| 49 |
+
"vocab_size": 32000
|
| 50 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gla_340M.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attn_mode": "chunk",
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"clamp_min": null,
|
| 5 |
+
"eos_token_id": 2,
|
| 6 |
+
"expand_k": 0.5,
|
| 7 |
+
"expand_v": 1,
|
| 8 |
+
"fuse_cross_entropy": true,
|
| 9 |
+
"fuse_norm": true,
|
| 10 |
+
"hidden_act": "swish",
|
| 11 |
+
"hidden_ratio": 4,
|
| 12 |
+
"hidden_size": 1024,
|
| 13 |
+
"initializer_range": 0.02,
|
| 14 |
+
"intermediate_size": null,
|
| 15 |
+
"model_type": "gla",
|
| 16 |
+
"num_heads": 4,
|
| 17 |
+
"num_hidden_layers": 24,
|
| 18 |
+
"norm_eps": 1e-06,
|
| 19 |
+
"tie_word_embeddings": false,
|
| 20 |
+
"use_cache": true,
|
| 21 |
+
"use_gk": true,
|
| 22 |
+
"use_gv": false,
|
| 23 |
+
"vocab_size": 32000
|
| 24 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gla_7B.json
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attn": null,
|
| 3 |
+
"attn_mode": "chunk",
|
| 4 |
+
"bos_token_id": 1,
|
| 5 |
+
"eos_token_id": 2,
|
| 6 |
+
"expand_k": 0.5,
|
| 7 |
+
"expand_v": 1,
|
| 8 |
+
"fuse_cross_entropy": true,
|
| 9 |
+
"fuse_norm": true,
|
| 10 |
+
"hidden_act": "swish",
|
| 11 |
+
"hidden_ratio": 4,
|
| 12 |
+
"hidden_size": 4096,
|
| 13 |
+
"initializer_range": 0.02,
|
| 14 |
+
"intermediate_size": 11008,
|
| 15 |
+
"model_type": "gla",
|
| 16 |
+
"norm_eps": 1e-06,
|
| 17 |
+
"num_heads": 16,
|
| 18 |
+
"num_hidden_layers": 32,
|
| 19 |
+
"tie_word_embeddings": false,
|
| 20 |
+
"use_cache": true,
|
| 21 |
+
"use_gk": true,
|
| 22 |
+
"use_gv": false,
|
| 23 |
+
"use_output_gate": true,
|
| 24 |
+
"use_short_conv": false
|
| 25 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gsa_340M.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 1,
|
| 3 |
+
"conv_size": 4,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"expand_k": 1,
|
| 6 |
+
"expand_v": 1,
|
| 7 |
+
"elementwise_affine": false,
|
| 8 |
+
"feature_map": "swish",
|
| 9 |
+
"fuse_cross_entropy": true,
|
| 10 |
+
"fuse_norm": true,
|
| 11 |
+
"gate_logit_normalizer": 4,
|
| 12 |
+
"hidden_act": "swish",
|
| 13 |
+
"hidden_ratio": 4,
|
| 14 |
+
"hidden_size": 1024,
|
| 15 |
+
"initializer_range": 0.02,
|
| 16 |
+
"intermediate_size": null,
|
| 17 |
+
"model_type": "gsa",
|
| 18 |
+
"num_heads": 4,
|
| 19 |
+
"num_hidden_layers": 24,
|
| 20 |
+
"num_slots": 64,
|
| 21 |
+
"norm_eps": 1e-06,
|
| 22 |
+
"share_conv_kernel": true,
|
| 23 |
+
"tie_word_embeddings": false,
|
| 24 |
+
"use_cache": true,
|
| 25 |
+
"use_norm": true,
|
| 26 |
+
"use_output_gate": true,
|
| 27 |
+
"use_rope": false,
|
| 28 |
+
"use_short_conv": false
|
| 29 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/hgrn2_340M.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attn_mode": "chunk",
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"expand_ratio": 128,
|
| 6 |
+
"fuse_cross_entropy": true,
|
| 7 |
+
"fuse_norm": true,
|
| 8 |
+
"hidden_act": "swish",
|
| 9 |
+
"hidden_ratio": 4,
|
| 10 |
+
"hidden_size": 1024,
|
| 11 |
+
"initializer_range": 0.02,
|
| 12 |
+
"intermediate_size": null,
|
| 13 |
+
"model_type": "hgrn2",
|
| 14 |
+
"num_heads": 8,
|
| 15 |
+
"num_hidden_layers": 24,
|
| 16 |
+
"norm_eps": 1e-06,
|
| 17 |
+
"tie_word_embeddings": false,
|
| 18 |
+
"use_cache": true,
|
| 19 |
+
"vocab_size": 32000
|
| 20 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/mamba2_1B.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 1,
|
| 3 |
+
"chunk_size": 256,
|
| 4 |
+
"conv_kernel": 4,
|
| 5 |
+
"eos_token_id": 2,
|
| 6 |
+
"expand": 2,
|
| 7 |
+
"fuse_cross_entropy": true,
|
| 8 |
+
"fuse_norm": true,
|
| 9 |
+
"head_dim": 64,
|
| 10 |
+
"hidden_act": "silu",
|
| 11 |
+
"hidden_size": 2048,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"norm_eps": 1e-05,
|
| 14 |
+
"model_type": "mamba2",
|
| 15 |
+
"n_groups": 1,
|
| 16 |
+
"num_hidden_layers": 48,
|
| 17 |
+
"pad_token_id": 0,
|
| 18 |
+
"rescale_prenorm_residual": true,
|
| 19 |
+
"residual_in_fp32": true,
|
| 20 |
+
"rms_norm": true,
|
| 21 |
+
"state_size": 128,
|
| 22 |
+
"tie_word_embeddings": false,
|
| 23 |
+
"time_step_floor": 0.0001,
|
| 24 |
+
"time_step_max": 0.1,
|
| 25 |
+
"time_step_min": 0.001,
|
| 26 |
+
"time_step_rank": 128,
|
| 27 |
+
"transformers_version": "4.50.1",
|
| 28 |
+
"use_bias": false,
|
| 29 |
+
"use_cache": true,
|
| 30 |
+
"use_conv_bias": true,
|
| 31 |
+
"vocab_size": 32000
|
| 32 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/mamba2_340M.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 1,
|
| 3 |
+
"chunk_size": 256,
|
| 4 |
+
"conv_kernel": 4,
|
| 5 |
+
"eos_token_id": 2,
|
| 6 |
+
"expand": 2,
|
| 7 |
+
"fuse_cross_entropy": true,
|
| 8 |
+
"fuse_norm": true,
|
| 9 |
+
"head_dim": 64,
|
| 10 |
+
"hidden_act": "silu",
|
| 11 |
+
"hidden_size": 1024,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"norm_eps": 1e-05,
|
| 14 |
+
"model_type": "mamba2",
|
| 15 |
+
"n_groups": 1,
|
| 16 |
+
"num_hidden_layers": 48,
|
| 17 |
+
"pad_token_id": 0,
|
| 18 |
+
"rescale_prenorm_residual": true,
|
| 19 |
+
"residual_in_fp32": true,
|
| 20 |
+
"rms_norm": true,
|
| 21 |
+
"state_size": 128,
|
| 22 |
+
"tie_word_embeddings": false,
|
| 23 |
+
"time_step_floor": 0.0001,
|
| 24 |
+
"time_step_max": 0.1,
|
| 25 |
+
"time_step_min": 0.001,
|
| 26 |
+
"time_step_rank": 128,
|
| 27 |
+
"transformers_version": "4.50.1",
|
| 28 |
+
"use_bias": false,
|
| 29 |
+
"use_cache": true,
|
| 30 |
+
"use_conv_bias": true,
|
| 31 |
+
"vocab_size": 32000
|
| 32 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/mamba_1B.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 1,
|
| 3 |
+
"conv_kernel": 4,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"expand": 2,
|
| 6 |
+
"fuse_cross_entropy": true,
|
| 7 |
+
"fuse_norm": true,
|
| 8 |
+
"hidden_act": "silu",
|
| 9 |
+
"hidden_size": 2048,
|
| 10 |
+
"initializer_range": 0.02,
|
| 11 |
+
"model_type": "mamba",
|
| 12 |
+
"norm_eps": 1e-05,
|
| 13 |
+
"num_hidden_layers": 48,
|
| 14 |
+
"pad_token_id": 0,
|
| 15 |
+
"rescale_prenorm_residual": false,
|
| 16 |
+
"residual_in_fp32": false,
|
| 17 |
+
"state_size": 16,
|
| 18 |
+
"tie_word_embeddings": false,
|
| 19 |
+
"time_step_floor": 0.0001,
|
| 20 |
+
"time_step_init_scheme": "random",
|
| 21 |
+
"time_step_max": 0.1,
|
| 22 |
+
"time_step_min": 0.001,
|
| 23 |
+
"time_step_rank": 128,
|
| 24 |
+
"time_step_scale": 1.0,
|
| 25 |
+
"transformers_version": "4.50.1",
|
| 26 |
+
"use_bias": false,
|
| 27 |
+
"use_cache": true,
|
| 28 |
+
"use_conv_bias": true,
|
| 29 |
+
"vocab_size": 32000
|
| 30 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/mamba_340M.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 1,
|
| 3 |
+
"conv_kernel": 4,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"expand": 2,
|
| 6 |
+
"fuse_cross_entropy": true,
|
| 7 |
+
"fuse_norm": true,
|
| 8 |
+
"hidden_act": "silu",
|
| 9 |
+
"hidden_size": 1024,
|
| 10 |
+
"initializer_range": 0.02,
|
| 11 |
+
"model_type": "mamba",
|
| 12 |
+
"norm_eps": 1e-05,
|
| 13 |
+
"num_hidden_layers": 48,
|
| 14 |
+
"pad_token_id": 0,
|
| 15 |
+
"rescale_prenorm_residual": false,
|
| 16 |
+
"residual_in_fp32": false,
|
| 17 |
+
"state_size": 16,
|
| 18 |
+
"tie_word_embeddings": false,
|
| 19 |
+
"time_step_floor": 0.0001,
|
| 20 |
+
"time_step_init_scheme": "random",
|
| 21 |
+
"time_step_max": 0.1,
|
| 22 |
+
"time_step_min": 0.001,
|
| 23 |
+
"time_step_rank": 128,
|
| 24 |
+
"time_step_scale": 1.0,
|
| 25 |
+
"transformers_version": "4.50.1",
|
| 26 |
+
"use_bias": false,
|
| 27 |
+
"use_cache": true,
|
| 28 |
+
"use_conv_bias": true,
|
| 29 |
+
"vocab_size": 32000
|
| 30 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/samba_1B.json
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attn": {
|
| 3 |
+
"layers": [
|
| 4 |
+
1,
|
| 5 |
+
3,
|
| 6 |
+
5,
|
| 7 |
+
7,
|
| 8 |
+
9,
|
| 9 |
+
11,
|
| 10 |
+
13,
|
| 11 |
+
15,
|
| 12 |
+
17
|
| 13 |
+
],
|
| 14 |
+
"num_heads": 18,
|
| 15 |
+
"num_kv_heads": 18,
|
| 16 |
+
"qkv_bias": false,
|
| 17 |
+
"rope_theta": 10000.0,
|
| 18 |
+
"window_size": 2048
|
| 19 |
+
},
|
| 20 |
+
"bos_token_id": 1,
|
| 21 |
+
"conv_kernel": 4,
|
| 22 |
+
"eos_token_id": 2,
|
| 23 |
+
"expand": 2,
|
| 24 |
+
"fuse_cross_entropy": true,
|
| 25 |
+
"fuse_norm": true,
|
| 26 |
+
"fuse_swiglu": true,
|
| 27 |
+
"hidden_act": "swish",
|
| 28 |
+
"hidden_ratio": 4,
|
| 29 |
+
"hidden_size": 2304,
|
| 30 |
+
"initializer_range": 0.02,
|
| 31 |
+
"intermediate_size": 4608,
|
| 32 |
+
"max_position_embeddings": 2048,
|
| 33 |
+
"model_type": "samba",
|
| 34 |
+
"norm_eps": 1e-05,
|
| 35 |
+
"num_hidden_layers": 18,
|
| 36 |
+
"pad_token_id": 0,
|
| 37 |
+
"rescale_prenorm_residual": false,
|
| 38 |
+
"residual_in_fp32": false,
|
| 39 |
+
"state_size": 16,
|
| 40 |
+
"tie_word_embeddings": false,
|
| 41 |
+
"time_step_floor": 0.0001,
|
| 42 |
+
"time_step_init_scheme": "random",
|
| 43 |
+
"time_step_max": 0.1,
|
| 44 |
+
"time_step_min": 0.001,
|
| 45 |
+
"time_step_rank": 144,
|
| 46 |
+
"time_step_scale": 1.0,
|
| 47 |
+
"transformers_version": "4.50.1",
|
| 48 |
+
"use_bias": false,
|
| 49 |
+
"use_cache": true,
|
| 50 |
+
"use_conv_bias": true,
|
| 51 |
+
"vocab_size": 32000
|
| 52 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/sba_340m.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attention_bias": false,
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"fuse_cross_entropy": true,
|
| 6 |
+
"fuse_norm": true,
|
| 7 |
+
"hidden_act": "swish",
|
| 8 |
+
"hidden_size": 1024,
|
| 9 |
+
"initializer_range": 0.006,
|
| 10 |
+
"max_position_embeddings": 8192,
|
| 11 |
+
"model_type": "sba",
|
| 12 |
+
"num_heads": 16,
|
| 13 |
+
"num_hidden_layers": 24,
|
| 14 |
+
"norm_eps": 1e-06,
|
| 15 |
+
"tie_word_embeddings": false,
|
| 16 |
+
"use_cache": true,
|
| 17 |
+
"vocab_size": 32000
|
| 18 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/transformer_1B.json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 1,
|
| 3 |
+
"elementwise_affine": true,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"fuse_cross_entropy": true,
|
| 6 |
+
"fuse_norm": true,
|
| 7 |
+
"fuse_swiglu": true,
|
| 8 |
+
"hidden_act": "swish",
|
| 9 |
+
"hidden_ratio": 4,
|
| 10 |
+
"hidden_size": 2048,
|
| 11 |
+
"initializer_range": 0.02,
|
| 12 |
+
"intermediate_size": null,
|
| 13 |
+
"max_position_embeddings": 8192,
|
| 14 |
+
"model_type": "transformer",
|
| 15 |
+
"norm_eps": 1e-06,
|
| 16 |
+
"num_heads": 32,
|
| 17 |
+
"num_hidden_layers": 24,
|
| 18 |
+
"num_kv_heads": null,
|
| 19 |
+
"pad_token_id": 2,
|
| 20 |
+
"rope_theta": 10000.0,
|
| 21 |
+
"tie_word_embeddings": false
|
| 22 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/transformer_340M.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attention_bias": false,
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"fuse_cross_entropy": true,
|
| 6 |
+
"fuse_norm": true,
|
| 7 |
+
"hidden_act": "swish",
|
| 8 |
+
"hidden_size": 1024,
|
| 9 |
+
"initializer_range": 0.02,
|
| 10 |
+
"max_position_embeddings": 8192,
|
| 11 |
+
"model_type": "transformer",
|
| 12 |
+
"num_heads": 16,
|
| 13 |
+
"num_hidden_layers": 24,
|
| 14 |
+
"norm_eps": 1e-06,
|
| 15 |
+
"tie_word_embeddings": false,
|
| 16 |
+
"use_cache": true,
|
| 17 |
+
"vocab_size": 32000
|
| 18 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/transformer_7B.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attention_bias": false,
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"fuse_cross_entropy": true,
|
| 6 |
+
"fuse_norm": true,
|
| 7 |
+
"hidden_act": "swish",
|
| 8 |
+
"hidden_ratio": 4,
|
| 9 |
+
"hidden_size": 4096,
|
| 10 |
+
"initializer_range": 0.02,
|
| 11 |
+
"intermediate_size": 14336,
|
| 12 |
+
"model_type": "transformer",
|
| 13 |
+
"norm_eps": 1e-06,
|
| 14 |
+
"num_heads": 32,
|
| 15 |
+
"num_hidden_layers": 32,
|
| 16 |
+
"num_kv_heads": 8,
|
| 17 |
+
"rope_theta": 10000.0,
|
| 18 |
+
"tie_word_embeddings": false,
|
| 19 |
+
"use_cache": true,
|
| 20 |
+
"window_size": null
|
| 21 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/0/stderr.log
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Traceback (most recent call last):
|
| 2 |
+
File "<frozen runpy>", line 198, in _run_module_as_main
|
| 3 |
+
File "<frozen runpy>", line 88, in _run_code
|
| 4 |
+
File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 17, in <module>
|
| 5 |
+
from torchtitan.components.checkpoint import CheckpointManager
|
| 6 |
+
ModuleNotFoundError: No module named 'torchtitan'
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/0/stdout.log
ADDED
|
File without changes
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/1/stderr.log
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Traceback (most recent call last):
|
| 2 |
+
File "<frozen runpy>", line 198, in _run_module_as_main
|
| 3 |
+
File "<frozen runpy>", line 88, in _run_code
|
| 4 |
+
File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 17, in <module>
|
| 5 |
+
from torchtitan.components.checkpoint import CheckpointManager
|
| 6 |
+
ModuleNotFoundError: No module named 'torchtitan'
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/1/stdout.log
ADDED
|
File without changes
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/2/stderr.log
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Traceback (most recent call last):
|
| 2 |
+
File "<frozen runpy>", line 198, in _run_module_as_main
|
| 3 |
+
File "<frozen runpy>", line 88, in _run_code
|
| 4 |
+
File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 17, in <module>
|
| 5 |
+
from torchtitan.components.checkpoint import CheckpointManager
|
| 6 |
+
ModuleNotFoundError: No module named 'torchtitan'
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/2/stdout.log
ADDED
|
File without changes
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/3/stderr.log
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Traceback (most recent call last):
|
| 2 |
+
File "<frozen runpy>", line 198, in _run_module_as_main
|
| 3 |
+
File "<frozen runpy>", line 88, in _run_code
|
| 4 |
+
File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 17, in <module>
|
| 5 |
+
from torchtitan.components.checkpoint import CheckpointManager
|
| 6 |
+
ModuleNotFoundError: No module named 'torchtitan'
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/3/stdout.log
ADDED
|
File without changes
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/4/stderr.log
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Traceback (most recent call last):
|
| 2 |
+
File "<frozen runpy>", line 198, in _run_module_as_main
|
| 3 |
+
File "<frozen runpy>", line 88, in _run_code
|
| 4 |
+
File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 17, in <module>
|
| 5 |
+
from torchtitan.components.checkpoint import CheckpointManager
|
| 6 |
+
ModuleNotFoundError: No module named 'torchtitan'
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/4/stdout.log
ADDED
|
File without changes
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/5/stderr.log
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Traceback (most recent call last):
|
| 2 |
+
File "<frozen runpy>", line 198, in _run_module_as_main
|
| 3 |
+
File "<frozen runpy>", line 88, in _run_code
|
| 4 |
+
File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 17, in <module>
|
| 5 |
+
from torchtitan.components.checkpoint import CheckpointManager
|
| 6 |
+
ModuleNotFoundError: No module named 'torchtitan'
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/5/stdout.log
ADDED
|
File without changes
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/6/stderr.log
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Traceback (most recent call last):
|
| 2 |
+
File "<frozen runpy>", line 198, in _run_module_as_main
|
| 3 |
+
File "<frozen runpy>", line 88, in _run_code
|
| 4 |
+
File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 17, in <module>
|
| 5 |
+
from torchtitan.components.checkpoint import CheckpointManager
|
| 6 |
+
ModuleNotFoundError: No module named 'torchtitan'
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/6/stdout.log
ADDED
|
File without changes
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/7/stderr.log
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Traceback (most recent call last):
|
| 2 |
+
File "<frozen runpy>", line 198, in _run_module_as_main
|
| 3 |
+
File "<frozen runpy>", line 88, in _run_code
|
| 4 |
+
File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 17, in <module>
|
| 5 |
+
from torchtitan.components.checkpoint import CheckpointManager
|
| 6 |
+
ModuleNotFoundError: No module named 'torchtitan'
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_0hlbuzgb/attempt_0/7/stdout.log
ADDED
|
File without changes
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/0/stderr.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/0/stdout.log
ADDED
|
File without changes
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/1/stderr.log
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[titan] 2025-07-22 22:47:41,941 - root - INFO - Starting job: default job
|
| 2 |
+
[titan] 2025-07-22 22:47:41,941 - root - INFO - [32m{
|
| 3 |
+
"activation_checkpoint": {
|
| 4 |
+
"mode": "none",
|
| 5 |
+
"selective_ac_option": "2"
|
| 6 |
+
},
|
| 7 |
+
"activation_offload": {
|
| 8 |
+
"mode": "none"
|
| 9 |
+
},
|
| 10 |
+
"checkpoint": {
|
| 11 |
+
"async_mode": "disabled",
|
| 12 |
+
"create_seed_checkpoint": false,
|
| 13 |
+
"enable_checkpoint": true,
|
| 14 |
+
"exclude_from_loading": [],
|
| 15 |
+
"export_dtype": "float32",
|
| 16 |
+
"folder": "checkpoint",
|
| 17 |
+
"interval": 8192,
|
| 18 |
+
"interval_type": "steps",
|
| 19 |
+
"keep_latest_k": 100,
|
| 20 |
+
"load_step": -1,
|
| 21 |
+
"model_weights_only": false
|
| 22 |
+
},
|
| 23 |
+
"comm": {
|
| 24 |
+
"init_timeout_seconds": 300,
|
| 25 |
+
"trace_buf_size": 20000,
|
| 26 |
+
"train_timeout_seconds": 100
|
| 27 |
+
},
|
| 28 |
+
"experimental": {
|
| 29 |
+
"context_parallel_degree": 1,
|
| 30 |
+
"context_parallel_rotate_method": "allgather",
|
| 31 |
+
"custom_model_path": "",
|
| 32 |
+
"enable_async_tensor_parallel": false,
|
| 33 |
+
"enable_compiled_autograd": false,
|
| 34 |
+
"pipeline_parallel_degree": 1,
|
| 35 |
+
"pipeline_parallel_microbatches": null,
|
| 36 |
+
"pipeline_parallel_schedule": "1F1B",
|
| 37 |
+
"pipeline_parallel_schedule_csv": "",
|
| 38 |
+
"pipeline_parallel_split_points": []
|
| 39 |
+
},
|
| 40 |
+
"fault_tolerance": {
|
| 41 |
+
"enable": false,
|
| 42 |
+
"group_size": 0,
|
| 43 |
+
"min_replica_size": 1,
|
| 44 |
+
"replica_id": 0
|
| 45 |
+
},
|
| 46 |
+
"float8": {
|
| 47 |
+
"enable_fsdp_float8_all_gather": false,
|
| 48 |
+
"force_recompute_fp8_weight_in_bwd": false,
|
| 49 |
+
"precompute_float8_dynamic_scale_for_fsdp": false,
|
| 50 |
+
"recipe_name": null
|
| 51 |
+
},
|
| 52 |
+
"job": {
|
| 53 |
+
"config_file": "flame/models/fla.toml",
|
| 54 |
+
"description": "default job",
|
| 55 |
+
"dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1",
|
| 56 |
+
"print_args": true,
|
| 57 |
+
"use_for_integration_test": false
|
| 58 |
+
},
|
| 59 |
+
"lr_scheduler": {
|
| 60 |
+
"decay_ratio": 1.0,
|
| 61 |
+
"decay_type": "linear",
|
| 62 |
+
"lr_min": 0.01,
|
| 63 |
+
"warmup_steps": 100
|
| 64 |
+
},
|
| 65 |
+
"memory_estimation": {
|
| 66 |
+
"disable_fake_mode": false,
|
| 67 |
+
"enabled": false
|
| 68 |
+
},
|
| 69 |
+
"metrics": {
|
| 70 |
+
"disable_color_printing": false,
|
| 71 |
+
"enable_tensorboard": true,
|
| 72 |
+
"enable_wandb": true,
|
| 73 |
+
"log_freq": 1,
|
| 74 |
+
"save_for_all_ranks": false,
|
| 75 |
+
"save_tb_folder": "tb"
|
| 76 |
+
},
|
| 77 |
+
"model": {
|
| 78 |
+
"config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/gdn_6_1_340M.json",
|
| 79 |
+
"converters": [],
|
| 80 |
+
"name": "fla",
|
| 81 |
+
"print_after_conversion": false,
|
| 82 |
+
"tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
|
| 83 |
+
},
|
| 84 |
+
"optimizer": {
|
| 85 |
+
"early_step_in_backward": false,
|
| 86 |
+
"eps": 1e-08,
|
| 87 |
+
"implementation": "fused",
|
| 88 |
+
"lr": 0.0003,
|
| 89 |
+
"name": "AdamW"
|
| 90 |
+
},
|
| 91 |
+
"profiling": {
|
| 92 |
+
"enable_memory_snapshot": false,
|
| 93 |
+
"enable_profiling": true,
|
| 94 |
+
"profile_freq": 512,
|
| 95 |
+
"save_memory_snapshot_folder": "memory_snapshot",
|
| 96 |
+
"save_traces_folder": "profile_trace"
|
| 97 |
+
},
|
| 98 |
+
"training": {
|
| 99 |
+
"batch_size": 16,
|
| 100 |
+
"compile": true,
|
| 101 |
+
"context_len": 8192,
|
| 102 |
+
"data_dir": null,
|
| 103 |
+
"data_files": null,
|
| 104 |
+
"data_parallel_replicate_degree": 1,
|
| 105 |
+
"data_parallel_shard_degree": -1,
|
| 106 |
+
"data_probs": "0.55,0.3,0.15",
|
| 107 |
+
"dataset": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/MegaMath/megamath-web-pro",
|
| 108 |
+
"dataset_name": "default,default,default",
|
| 109 |
+
"dataset_split": "train,train,train",
|
| 110 |
+
"deterministic": false,
|
| 111 |
+
"disable_loss_parallel": false,
|
| 112 |
+
"enable_cpu_offload": false,
|
| 113 |
+
"fsdp_reshard_after_forward": "default",
|
| 114 |
+
"gc_freq": 50,
|
| 115 |
+
"gradient_accumulation_steps": 1,
|
| 116 |
+
"max_norm": 1.0,
|
| 117 |
+
"mixed_precision_param": "bfloat16",
|
| 118 |
+
"mixed_precision_reduce": "float32",
|
| 119 |
+
"num_workers": 32,
|
| 120 |
+
"persistent_workers": false,
|
| 121 |
+
"pin_memory": false,
|
| 122 |
+
"prefetch_factor": 2,
|
| 123 |
+
"seed": 42,
|
| 124 |
+
"seq_len": 8192,
|
| 125 |
+
"skip_nan_inf": true,
|
| 126 |
+
"steps": 95366,
|
| 127 |
+
"streaming": true,
|
| 128 |
+
"tensor_parallel_degree": 1,
|
| 129 |
+
"varlen": false
|
| 130 |
+
}
|
| 131 |
+
}[39m
|
| 132 |
+
[titan] 2025-07-22 22:47:41,942 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
|
| 133 |
+
[titan] 2025-07-22 22:47:43,062 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
|
| 134 |
+
[titan] 2025-07-22 22:47:43,064 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
|
| 135 |
+
[titan] 2025-07-22 22:47:43,187 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
|
| 136 |
+
[titan] 2025-07-22 22:47:43,187 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
|
| 137 |
+
[titan] 2025-07-22 22:47:43,187 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
|
| 138 |
+
[titan] 2025-07-22 22:47:43,207 - root - INFO - Loading tokenizer...
|
| 139 |
+
[titan] 2025-07-22 22:47:43,370 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
|
| 140 |
+
0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 141 |
+
1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 142 |
+
2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 143 |
+
}
|
| 144 |
+
)
|
| 145 |
+
[titan] 2025-07-22 22:47:43,371 - root - INFO - Loading dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/MegaMath/megamath-web-pro:default,default,default
|
| 146 |
+
`trust_remote_code` is not supported anymore.
|
| 147 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 148 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 149 |
+
[titan] 2025-07-22 22:47:43,371 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 150 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 151 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 152 |
+
[titan] 2025-07-22 22:47:43,666 - root - INFO - Subset [36m/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample:default (p = 0.550)[39m:
|
| 153 |
+
IterableDataset({
|
| 154 |
+
features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
|
| 155 |
+
num_shards: 140
|
| 156 |
+
})
|
| 157 |
+
[titan] 2025-07-22 22:47:43,667 - root - INFO - Shuffling the dataset with seed 42
|
| 158 |
+
[titan] 2025-07-22 22:47:43,667 - root - WARNING - [31mDataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample has insufficient shards (140). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.[39m
|
| 159 |
+
`trust_remote_code` is not supported anymore.
|
| 160 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 161 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 162 |
+
[titan] 2025-07-22 22:47:43,667 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 163 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 164 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 165 |
+
[rank1]: Traceback (most recent call last):
|
| 166 |
+
[rank1]: File "<frozen runpy>", line 198, in _run_module_as_main
|
| 167 |
+
[rank1]: File "<frozen runpy>", line 88, in _run_code
|
| 168 |
+
[rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
|
| 169 |
+
[rank1]: main(config)
|
| 170 |
+
[rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
| 171 |
+
[rank1]: return f(*args, **kwargs)
|
| 172 |
+
[rank1]: ^^^^^^^^^^^^^^^^^^
|
| 173 |
+
[rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 155, in main
|
| 174 |
+
[rank1]: dataset = build_dataset(
|
| 175 |
+
[rank1]: ^^^^^^^^^^^^^^
|
| 176 |
+
[rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py", line 689, in build_dataset
|
| 177 |
+
[rank1]: subset = load_dataset(
|
| 178 |
+
[rank1]: ^^^^^^^^^^^^^
|
| 179 |
+
[rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1412, in load_dataset
|
| 180 |
+
[rank1]: builder_instance.download_and_prepare(
|
| 181 |
+
[rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/builder.py", line 829, in download_and_prepare
|
| 182 |
+
[rank1]: with FileLock(lock_path) if is_local else contextlib.nullcontext():
|
| 183 |
+
[rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/filelock/_api.py", line 376, in __enter__
|
| 184 |
+
[rank1]: self.acquire()
|
| 185 |
+
[rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/filelock/_api.py", line 344, in acquire
|
| 186 |
+
[rank1]: time.sleep(poll_interval)
|
| 187 |
+
[rank1]: KeyboardInterrupt
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/1/stdout.log
ADDED
|
File without changes
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/2/stderr.log
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[titan] 2025-07-22 22:47:42,036 - root - INFO - Starting job: default job
|
| 2 |
+
[titan] 2025-07-22 22:47:42,037 - root - INFO - [32m{
|
| 3 |
+
"activation_checkpoint": {
|
| 4 |
+
"mode": "none",
|
| 5 |
+
"selective_ac_option": "2"
|
| 6 |
+
},
|
| 7 |
+
"activation_offload": {
|
| 8 |
+
"mode": "none"
|
| 9 |
+
},
|
| 10 |
+
"checkpoint": {
|
| 11 |
+
"async_mode": "disabled",
|
| 12 |
+
"create_seed_checkpoint": false,
|
| 13 |
+
"enable_checkpoint": true,
|
| 14 |
+
"exclude_from_loading": [],
|
| 15 |
+
"export_dtype": "float32",
|
| 16 |
+
"folder": "checkpoint",
|
| 17 |
+
"interval": 8192,
|
| 18 |
+
"interval_type": "steps",
|
| 19 |
+
"keep_latest_k": 100,
|
| 20 |
+
"load_step": -1,
|
| 21 |
+
"model_weights_only": false
|
| 22 |
+
},
|
| 23 |
+
"comm": {
|
| 24 |
+
"init_timeout_seconds": 300,
|
| 25 |
+
"trace_buf_size": 20000,
|
| 26 |
+
"train_timeout_seconds": 100
|
| 27 |
+
},
|
| 28 |
+
"experimental": {
|
| 29 |
+
"context_parallel_degree": 1,
|
| 30 |
+
"context_parallel_rotate_method": "allgather",
|
| 31 |
+
"custom_model_path": "",
|
| 32 |
+
"enable_async_tensor_parallel": false,
|
| 33 |
+
"enable_compiled_autograd": false,
|
| 34 |
+
"pipeline_parallel_degree": 1,
|
| 35 |
+
"pipeline_parallel_microbatches": null,
|
| 36 |
+
"pipeline_parallel_schedule": "1F1B",
|
| 37 |
+
"pipeline_parallel_schedule_csv": "",
|
| 38 |
+
"pipeline_parallel_split_points": []
|
| 39 |
+
},
|
| 40 |
+
"fault_tolerance": {
|
| 41 |
+
"enable": false,
|
| 42 |
+
"group_size": 0,
|
| 43 |
+
"min_replica_size": 1,
|
| 44 |
+
"replica_id": 0
|
| 45 |
+
},
|
| 46 |
+
"float8": {
|
| 47 |
+
"enable_fsdp_float8_all_gather": false,
|
| 48 |
+
"force_recompute_fp8_weight_in_bwd": false,
|
| 49 |
+
"precompute_float8_dynamic_scale_for_fsdp": false,
|
| 50 |
+
"recipe_name": null
|
| 51 |
+
},
|
| 52 |
+
"job": {
|
| 53 |
+
"config_file": "flame/models/fla.toml",
|
| 54 |
+
"description": "default job",
|
| 55 |
+
"dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1",
|
| 56 |
+
"print_args": true,
|
| 57 |
+
"use_for_integration_test": false
|
| 58 |
+
},
|
| 59 |
+
"lr_scheduler": {
|
| 60 |
+
"decay_ratio": 1.0,
|
| 61 |
+
"decay_type": "linear",
|
| 62 |
+
"lr_min": 0.01,
|
| 63 |
+
"warmup_steps": 100
|
| 64 |
+
},
|
| 65 |
+
"memory_estimation": {
|
| 66 |
+
"disable_fake_mode": false,
|
| 67 |
+
"enabled": false
|
| 68 |
+
},
|
| 69 |
+
"metrics": {
|
| 70 |
+
"disable_color_printing": false,
|
| 71 |
+
"enable_tensorboard": true,
|
| 72 |
+
"enable_wandb": true,
|
| 73 |
+
"log_freq": 1,
|
| 74 |
+
"save_for_all_ranks": false,
|
| 75 |
+
"save_tb_folder": "tb"
|
| 76 |
+
},
|
| 77 |
+
"model": {
|
| 78 |
+
"config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/gdn_6_1_340M.json",
|
| 79 |
+
"converters": [],
|
| 80 |
+
"name": "fla",
|
| 81 |
+
"print_after_conversion": false,
|
| 82 |
+
"tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
|
| 83 |
+
},
|
| 84 |
+
"optimizer": {
|
| 85 |
+
"early_step_in_backward": false,
|
| 86 |
+
"eps": 1e-08,
|
| 87 |
+
"implementation": "fused",
|
| 88 |
+
"lr": 0.0003,
|
| 89 |
+
"name": "AdamW"
|
| 90 |
+
},
|
| 91 |
+
"profiling": {
|
| 92 |
+
"enable_memory_snapshot": false,
|
| 93 |
+
"enable_profiling": true,
|
| 94 |
+
"profile_freq": 512,
|
| 95 |
+
"save_memory_snapshot_folder": "memory_snapshot",
|
| 96 |
+
"save_traces_folder": "profile_trace"
|
| 97 |
+
},
|
| 98 |
+
"training": {
|
| 99 |
+
"batch_size": 16,
|
| 100 |
+
"compile": true,
|
| 101 |
+
"context_len": 8192,
|
| 102 |
+
"data_dir": null,
|
| 103 |
+
"data_files": null,
|
| 104 |
+
"data_parallel_replicate_degree": 1,
|
| 105 |
+
"data_parallel_shard_degree": -1,
|
| 106 |
+
"data_probs": "0.55,0.3,0.15",
|
| 107 |
+
"dataset": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/MegaMath/megamath-web-pro",
|
| 108 |
+
"dataset_name": "default,default,default",
|
| 109 |
+
"dataset_split": "train,train,train",
|
| 110 |
+
"deterministic": false,
|
| 111 |
+
"disable_loss_parallel": false,
|
| 112 |
+
"enable_cpu_offload": false,
|
| 113 |
+
"fsdp_reshard_after_forward": "default",
|
| 114 |
+
"gc_freq": 50,
|
| 115 |
+
"gradient_accumulation_steps": 1,
|
| 116 |
+
"max_norm": 1.0,
|
| 117 |
+
"mixed_precision_param": "bfloat16",
|
| 118 |
+
"mixed_precision_reduce": "float32",
|
| 119 |
+
"num_workers": 32,
|
| 120 |
+
"persistent_workers": false,
|
| 121 |
+
"pin_memory": false,
|
| 122 |
+
"prefetch_factor": 2,
|
| 123 |
+
"seed": 42,
|
| 124 |
+
"seq_len": 8192,
|
| 125 |
+
"skip_nan_inf": true,
|
| 126 |
+
"steps": 95366,
|
| 127 |
+
"streaming": true,
|
| 128 |
+
"tensor_parallel_degree": 1,
|
| 129 |
+
"varlen": false
|
| 130 |
+
}
|
| 131 |
+
}[39m
|
| 132 |
+
[titan] 2025-07-22 22:47:42,038 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
|
| 133 |
+
[titan] 2025-07-22 22:47:43,075 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
|
| 134 |
+
[titan] 2025-07-22 22:47:43,078 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
|
| 135 |
+
[titan] 2025-07-22 22:47:43,210 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
|
| 136 |
+
[titan] 2025-07-22 22:47:43,210 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
|
| 137 |
+
[titan] 2025-07-22 22:47:43,210 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
|
| 138 |
+
[titan] 2025-07-22 22:47:43,219 - root - INFO - Loading tokenizer...
|
| 139 |
+
[titan] 2025-07-22 22:47:43,387 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
|
| 140 |
+
0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 141 |
+
1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 142 |
+
2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 143 |
+
}
|
| 144 |
+
)
|
| 145 |
+
[titan] 2025-07-22 22:47:43,387 - root - INFO - Loading dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/MegaMath/megamath-web-pro:default,default,default
|
| 146 |
+
`trust_remote_code` is not supported anymore.
|
| 147 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 148 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 149 |
+
[titan] 2025-07-22 22:47:43,387 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 150 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 151 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 152 |
+
[titan] 2025-07-22 22:47:43,666 - root - INFO - Subset [36m/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample:default (p = 0.550)[39m:
|
| 153 |
+
IterableDataset({
|
| 154 |
+
features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
|
| 155 |
+
num_shards: 140
|
| 156 |
+
})
|
| 157 |
+
[titan] 2025-07-22 22:47:43,667 - root - INFO - Shuffling the dataset with seed 42
|
| 158 |
+
[titan] 2025-07-22 22:47:43,667 - root - WARNING - [31mDataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample has insufficient shards (140). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.[39m
|
| 159 |
+
`trust_remote_code` is not supported anymore.
|
| 160 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 161 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 162 |
+
[titan] 2025-07-22 22:47:43,668 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 163 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 164 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 165 |
+
[rank2]: Traceback (most recent call last):
|
| 166 |
+
[rank2]: File "<frozen runpy>", line 198, in _run_module_as_main
|
| 167 |
+
[rank2]: File "<frozen runpy>", line 88, in _run_code
|
| 168 |
+
[rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
|
| 169 |
+
[rank2]: main(config)
|
| 170 |
+
[rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
| 171 |
+
[rank2]: return f(*args, **kwargs)
|
| 172 |
+
[rank2]: ^^^^^^^^^^^^^^^^^^
|
| 173 |
+
[rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 155, in main
|
| 174 |
+
[rank2]: dataset = build_dataset(
|
| 175 |
+
[rank2]: ^^^^^^^^^^^^^^
|
| 176 |
+
[rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py", line 689, in build_dataset
|
| 177 |
+
[rank2]: subset = load_dataset(
|
| 178 |
+
[rank2]: ^^^^^^^^^^^^^
|
| 179 |
+
[rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1412, in load_dataset
|
| 180 |
+
[rank2]: builder_instance.download_and_prepare(
|
| 181 |
+
[rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/builder.py", line 829, in download_and_prepare
|
| 182 |
+
[rank2]: with FileLock(lock_path) if is_local else contextlib.nullcontext():
|
| 183 |
+
[rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/filelock/_api.py", line 376, in __enter__
|
| 184 |
+
[rank2]: self.acquire()
|
| 185 |
+
[rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/filelock/_api.py", line 344, in acquire
|
| 186 |
+
[rank2]: time.sleep(poll_interval)
|
| 187 |
+
[rank2]: KeyboardInterrupt
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/2/stdout.log
ADDED
|
File without changes
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/3/stderr.log
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[titan] 2025-07-22 22:47:41,964 - root - INFO - Starting job: default job
|
| 2 |
+
[titan] 2025-07-22 22:47:41,965 - root - INFO - [32m{
|
| 3 |
+
"activation_checkpoint": {
|
| 4 |
+
"mode": "none",
|
| 5 |
+
"selective_ac_option": "2"
|
| 6 |
+
},
|
| 7 |
+
"activation_offload": {
|
| 8 |
+
"mode": "none"
|
| 9 |
+
},
|
| 10 |
+
"checkpoint": {
|
| 11 |
+
"async_mode": "disabled",
|
| 12 |
+
"create_seed_checkpoint": false,
|
| 13 |
+
"enable_checkpoint": true,
|
| 14 |
+
"exclude_from_loading": [],
|
| 15 |
+
"export_dtype": "float32",
|
| 16 |
+
"folder": "checkpoint",
|
| 17 |
+
"interval": 8192,
|
| 18 |
+
"interval_type": "steps",
|
| 19 |
+
"keep_latest_k": 100,
|
| 20 |
+
"load_step": -1,
|
| 21 |
+
"model_weights_only": false
|
| 22 |
+
},
|
| 23 |
+
"comm": {
|
| 24 |
+
"init_timeout_seconds": 300,
|
| 25 |
+
"trace_buf_size": 20000,
|
| 26 |
+
"train_timeout_seconds": 100
|
| 27 |
+
},
|
| 28 |
+
"experimental": {
|
| 29 |
+
"context_parallel_degree": 1,
|
| 30 |
+
"context_parallel_rotate_method": "allgather",
|
| 31 |
+
"custom_model_path": "",
|
| 32 |
+
"enable_async_tensor_parallel": false,
|
| 33 |
+
"enable_compiled_autograd": false,
|
| 34 |
+
"pipeline_parallel_degree": 1,
|
| 35 |
+
"pipeline_parallel_microbatches": null,
|
| 36 |
+
"pipeline_parallel_schedule": "1F1B",
|
| 37 |
+
"pipeline_parallel_schedule_csv": "",
|
| 38 |
+
"pipeline_parallel_split_points": []
|
| 39 |
+
},
|
| 40 |
+
"fault_tolerance": {
|
| 41 |
+
"enable": false,
|
| 42 |
+
"group_size": 0,
|
| 43 |
+
"min_replica_size": 1,
|
| 44 |
+
"replica_id": 0
|
| 45 |
+
},
|
| 46 |
+
"float8": {
|
| 47 |
+
"enable_fsdp_float8_all_gather": false,
|
| 48 |
+
"force_recompute_fp8_weight_in_bwd": false,
|
| 49 |
+
"precompute_float8_dynamic_scale_for_fsdp": false,
|
| 50 |
+
"recipe_name": null
|
| 51 |
+
},
|
| 52 |
+
"job": {
|
| 53 |
+
"config_file": "flame/models/fla.toml",
|
| 54 |
+
"description": "default job",
|
| 55 |
+
"dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1",
|
| 56 |
+
"print_args": true,
|
| 57 |
+
"use_for_integration_test": false
|
| 58 |
+
},
|
| 59 |
+
"lr_scheduler": {
|
| 60 |
+
"decay_ratio": 1.0,
|
| 61 |
+
"decay_type": "linear",
|
| 62 |
+
"lr_min": 0.01,
|
| 63 |
+
"warmup_steps": 100
|
| 64 |
+
},
|
| 65 |
+
"memory_estimation": {
|
| 66 |
+
"disable_fake_mode": false,
|
| 67 |
+
"enabled": false
|
| 68 |
+
},
|
| 69 |
+
"metrics": {
|
| 70 |
+
"disable_color_printing": false,
|
| 71 |
+
"enable_tensorboard": true,
|
| 72 |
+
"enable_wandb": true,
|
| 73 |
+
"log_freq": 1,
|
| 74 |
+
"save_for_all_ranks": false,
|
| 75 |
+
"save_tb_folder": "tb"
|
| 76 |
+
},
|
| 77 |
+
"model": {
|
| 78 |
+
"config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/gdn_6_1_340M.json",
|
| 79 |
+
"converters": [],
|
| 80 |
+
"name": "fla",
|
| 81 |
+
"print_after_conversion": false,
|
| 82 |
+
"tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
|
| 83 |
+
},
|
| 84 |
+
"optimizer": {
|
| 85 |
+
"early_step_in_backward": false,
|
| 86 |
+
"eps": 1e-08,
|
| 87 |
+
"implementation": "fused",
|
| 88 |
+
"lr": 0.0003,
|
| 89 |
+
"name": "AdamW"
|
| 90 |
+
},
|
| 91 |
+
"profiling": {
|
| 92 |
+
"enable_memory_snapshot": false,
|
| 93 |
+
"enable_profiling": true,
|
| 94 |
+
"profile_freq": 512,
|
| 95 |
+
"save_memory_snapshot_folder": "memory_snapshot",
|
| 96 |
+
"save_traces_folder": "profile_trace"
|
| 97 |
+
},
|
| 98 |
+
"training": {
|
| 99 |
+
"batch_size": 16,
|
| 100 |
+
"compile": true,
|
| 101 |
+
"context_len": 8192,
|
| 102 |
+
"data_dir": null,
|
| 103 |
+
"data_files": null,
|
| 104 |
+
"data_parallel_replicate_degree": 1,
|
| 105 |
+
"data_parallel_shard_degree": -1,
|
| 106 |
+
"data_probs": "0.55,0.3,0.15",
|
| 107 |
+
"dataset": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/MegaMath/megamath-web-pro",
|
| 108 |
+
"dataset_name": "default,default,default",
|
| 109 |
+
"dataset_split": "train,train,train",
|
| 110 |
+
"deterministic": false,
|
| 111 |
+
"disable_loss_parallel": false,
|
| 112 |
+
"enable_cpu_offload": false,
|
| 113 |
+
"fsdp_reshard_after_forward": "default",
|
| 114 |
+
"gc_freq": 50,
|
| 115 |
+
"gradient_accumulation_steps": 1,
|
| 116 |
+
"max_norm": 1.0,
|
| 117 |
+
"mixed_precision_param": "bfloat16",
|
| 118 |
+
"mixed_precision_reduce": "float32",
|
| 119 |
+
"num_workers": 32,
|
| 120 |
+
"persistent_workers": false,
|
| 121 |
+
"pin_memory": false,
|
| 122 |
+
"prefetch_factor": 2,
|
| 123 |
+
"seed": 42,
|
| 124 |
+
"seq_len": 8192,
|
| 125 |
+
"skip_nan_inf": true,
|
| 126 |
+
"steps": 95366,
|
| 127 |
+
"streaming": true,
|
| 128 |
+
"tensor_parallel_degree": 1,
|
| 129 |
+
"varlen": false
|
| 130 |
+
}
|
| 131 |
+
}[39m
|
| 132 |
+
[titan] 2025-07-22 22:47:41,966 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
|
| 133 |
+
[titan] 2025-07-22 22:47:43,050 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
|
| 134 |
+
[titan] 2025-07-22 22:47:43,053 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
|
| 135 |
+
[titan] 2025-07-22 22:47:43,165 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
|
| 136 |
+
[titan] 2025-07-22 22:47:43,165 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
|
| 137 |
+
[titan] 2025-07-22 22:47:43,165 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
|
| 138 |
+
[titan] 2025-07-22 22:47:43,192 - root - INFO - Loading tokenizer...
|
| 139 |
+
[titan] 2025-07-22 22:47:43,304 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
|
| 140 |
+
0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 141 |
+
1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 142 |
+
2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 143 |
+
}
|
| 144 |
+
)
|
| 145 |
+
[titan] 2025-07-22 22:47:43,304 - root - INFO - Loading dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/MegaMath/megamath-web-pro:default,default,default
|
| 146 |
+
`trust_remote_code` is not supported anymore.
|
| 147 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 148 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 149 |
+
[titan] 2025-07-22 22:47:43,304 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 150 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 151 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 152 |
+
[titan] 2025-07-22 22:47:43,666 - root - INFO - Subset [36m/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample:default (p = 0.550)[39m:
|
| 153 |
+
IterableDataset({
|
| 154 |
+
features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
|
| 155 |
+
num_shards: 140
|
| 156 |
+
})
|
| 157 |
+
[titan] 2025-07-22 22:47:43,666 - root - INFO - Shuffling the dataset with seed 42
|
| 158 |
+
[titan] 2025-07-22 22:47:43,667 - root - WARNING - [31mDataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample has insufficient shards (140). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.[39m
|
| 159 |
+
`trust_remote_code` is not supported anymore.
|
| 160 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 161 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 162 |
+
[titan] 2025-07-22 22:47:43,667 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 163 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 164 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 165 |
+
[rank3]: Traceback (most recent call last):
|
| 166 |
+
[rank3]: File "<frozen runpy>", line 198, in _run_module_as_main
|
| 167 |
+
[rank3]: File "<frozen runpy>", line 88, in _run_code
|
| 168 |
+
[rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
|
| 169 |
+
[rank3]: main(config)
|
| 170 |
+
[rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
| 171 |
+
[rank3]: return f(*args, **kwargs)
|
| 172 |
+
[rank3]: ^^^^^^^^^^^^^^^^^^
|
| 173 |
+
[rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 155, in main
|
| 174 |
+
[rank3]: dataset = build_dataset(
|
| 175 |
+
[rank3]: ^^^^^^^^^^^^^^
|
| 176 |
+
[rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py", line 689, in build_dataset
|
| 177 |
+
[rank3]: subset = load_dataset(
|
| 178 |
+
[rank3]: ^^^^^^^^^^^^^
|
| 179 |
+
[rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1412, in load_dataset
|
| 180 |
+
[rank3]: builder_instance.download_and_prepare(
|
| 181 |
+
[rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/builder.py", line 829, in download_and_prepare
|
| 182 |
+
[rank3]: with FileLock(lock_path) if is_local else contextlib.nullcontext():
|
| 183 |
+
[rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/filelock/_api.py", line 376, in __enter__
|
| 184 |
+
[rank3]: self.acquire()
|
| 185 |
+
[rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/filelock/_api.py", line 344, in acquire
|
| 186 |
+
[rank3]: time.sleep(poll_interval)
|
| 187 |
+
[rank3]: KeyboardInterrupt
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/3/stdout.log
ADDED
|
File without changes
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/4/stderr.log
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[titan] 2025-07-22 22:47:41,988 - root - INFO - Starting job: default job
|
| 2 |
+
[titan] 2025-07-22 22:47:41,988 - root - INFO - [32m{
|
| 3 |
+
"activation_checkpoint": {
|
| 4 |
+
"mode": "none",
|
| 5 |
+
"selective_ac_option": "2"
|
| 6 |
+
},
|
| 7 |
+
"activation_offload": {
|
| 8 |
+
"mode": "none"
|
| 9 |
+
},
|
| 10 |
+
"checkpoint": {
|
| 11 |
+
"async_mode": "disabled",
|
| 12 |
+
"create_seed_checkpoint": false,
|
| 13 |
+
"enable_checkpoint": true,
|
| 14 |
+
"exclude_from_loading": [],
|
| 15 |
+
"export_dtype": "float32",
|
| 16 |
+
"folder": "checkpoint",
|
| 17 |
+
"interval": 8192,
|
| 18 |
+
"interval_type": "steps",
|
| 19 |
+
"keep_latest_k": 100,
|
| 20 |
+
"load_step": -1,
|
| 21 |
+
"model_weights_only": false
|
| 22 |
+
},
|
| 23 |
+
"comm": {
|
| 24 |
+
"init_timeout_seconds": 300,
|
| 25 |
+
"trace_buf_size": 20000,
|
| 26 |
+
"train_timeout_seconds": 100
|
| 27 |
+
},
|
| 28 |
+
"experimental": {
|
| 29 |
+
"context_parallel_degree": 1,
|
| 30 |
+
"context_parallel_rotate_method": "allgather",
|
| 31 |
+
"custom_model_path": "",
|
| 32 |
+
"enable_async_tensor_parallel": false,
|
| 33 |
+
"enable_compiled_autograd": false,
|
| 34 |
+
"pipeline_parallel_degree": 1,
|
| 35 |
+
"pipeline_parallel_microbatches": null,
|
| 36 |
+
"pipeline_parallel_schedule": "1F1B",
|
| 37 |
+
"pipeline_parallel_schedule_csv": "",
|
| 38 |
+
"pipeline_parallel_split_points": []
|
| 39 |
+
},
|
| 40 |
+
"fault_tolerance": {
|
| 41 |
+
"enable": false,
|
| 42 |
+
"group_size": 0,
|
| 43 |
+
"min_replica_size": 1,
|
| 44 |
+
"replica_id": 0
|
| 45 |
+
},
|
| 46 |
+
"float8": {
|
| 47 |
+
"enable_fsdp_float8_all_gather": false,
|
| 48 |
+
"force_recompute_fp8_weight_in_bwd": false,
|
| 49 |
+
"precompute_float8_dynamic_scale_for_fsdp": false,
|
| 50 |
+
"recipe_name": null
|
| 51 |
+
},
|
| 52 |
+
"job": {
|
| 53 |
+
"config_file": "flame/models/fla.toml",
|
| 54 |
+
"description": "default job",
|
| 55 |
+
"dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1",
|
| 56 |
+
"print_args": true,
|
| 57 |
+
"use_for_integration_test": false
|
| 58 |
+
},
|
| 59 |
+
"lr_scheduler": {
|
| 60 |
+
"decay_ratio": 1.0,
|
| 61 |
+
"decay_type": "linear",
|
| 62 |
+
"lr_min": 0.01,
|
| 63 |
+
"warmup_steps": 100
|
| 64 |
+
},
|
| 65 |
+
"memory_estimation": {
|
| 66 |
+
"disable_fake_mode": false,
|
| 67 |
+
"enabled": false
|
| 68 |
+
},
|
| 69 |
+
"metrics": {
|
| 70 |
+
"disable_color_printing": false,
|
| 71 |
+
"enable_tensorboard": true,
|
| 72 |
+
"enable_wandb": true,
|
| 73 |
+
"log_freq": 1,
|
| 74 |
+
"save_for_all_ranks": false,
|
| 75 |
+
"save_tb_folder": "tb"
|
| 76 |
+
},
|
| 77 |
+
"model": {
|
| 78 |
+
"config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/gdn_6_1_340M.json",
|
| 79 |
+
"converters": [],
|
| 80 |
+
"name": "fla",
|
| 81 |
+
"print_after_conversion": false,
|
| 82 |
+
"tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
|
| 83 |
+
},
|
| 84 |
+
"optimizer": {
|
| 85 |
+
"early_step_in_backward": false,
|
| 86 |
+
"eps": 1e-08,
|
| 87 |
+
"implementation": "fused",
|
| 88 |
+
"lr": 0.0003,
|
| 89 |
+
"name": "AdamW"
|
| 90 |
+
},
|
| 91 |
+
"profiling": {
|
| 92 |
+
"enable_memory_snapshot": false,
|
| 93 |
+
"enable_profiling": true,
|
| 94 |
+
"profile_freq": 512,
|
| 95 |
+
"save_memory_snapshot_folder": "memory_snapshot",
|
| 96 |
+
"save_traces_folder": "profile_trace"
|
| 97 |
+
},
|
| 98 |
+
"training": {
|
| 99 |
+
"batch_size": 16,
|
| 100 |
+
"compile": true,
|
| 101 |
+
"context_len": 8192,
|
| 102 |
+
"data_dir": null,
|
| 103 |
+
"data_files": null,
|
| 104 |
+
"data_parallel_replicate_degree": 1,
|
| 105 |
+
"data_parallel_shard_degree": -1,
|
| 106 |
+
"data_probs": "0.55,0.3,0.15",
|
| 107 |
+
"dataset": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/MegaMath/megamath-web-pro",
|
| 108 |
+
"dataset_name": "default,default,default",
|
| 109 |
+
"dataset_split": "train,train,train",
|
| 110 |
+
"deterministic": false,
|
| 111 |
+
"disable_loss_parallel": false,
|
| 112 |
+
"enable_cpu_offload": false,
|
| 113 |
+
"fsdp_reshard_after_forward": "default",
|
| 114 |
+
"gc_freq": 50,
|
| 115 |
+
"gradient_accumulation_steps": 1,
|
| 116 |
+
"max_norm": 1.0,
|
| 117 |
+
"mixed_precision_param": "bfloat16",
|
| 118 |
+
"mixed_precision_reduce": "float32",
|
| 119 |
+
"num_workers": 32,
|
| 120 |
+
"persistent_workers": false,
|
| 121 |
+
"pin_memory": false,
|
| 122 |
+
"prefetch_factor": 2,
|
| 123 |
+
"seed": 42,
|
| 124 |
+
"seq_len": 8192,
|
| 125 |
+
"skip_nan_inf": true,
|
| 126 |
+
"steps": 95366,
|
| 127 |
+
"streaming": true,
|
| 128 |
+
"tensor_parallel_degree": 1,
|
| 129 |
+
"varlen": false
|
| 130 |
+
}
|
| 131 |
+
}[39m
|
| 132 |
+
[titan] 2025-07-22 22:47:41,990 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
|
| 133 |
+
[titan] 2025-07-22 22:47:43,095 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
|
| 134 |
+
[titan] 2025-07-22 22:47:43,097 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
|
| 135 |
+
[titan] 2025-07-22 22:47:43,213 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
|
| 136 |
+
[titan] 2025-07-22 22:47:43,213 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
|
| 137 |
+
[titan] 2025-07-22 22:47:43,214 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
|
| 138 |
+
[titan] 2025-07-22 22:47:43,222 - root - INFO - Loading tokenizer...
|
| 139 |
+
[titan] 2025-07-22 22:47:43,405 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
|
| 140 |
+
0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 141 |
+
1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 142 |
+
2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 143 |
+
}
|
| 144 |
+
)
|
| 145 |
+
[titan] 2025-07-22 22:47:43,405 - root - INFO - Loading dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/MegaMath/megamath-web-pro:default,default,default
|
| 146 |
+
`trust_remote_code` is not supported anymore.
|
| 147 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 148 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 149 |
+
[titan] 2025-07-22 22:47:43,405 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 150 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 151 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 152 |
+
[titan] 2025-07-22 22:47:43,666 - root - INFO - Subset [36m/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample:default (p = 0.550)[39m:
|
| 153 |
+
IterableDataset({
|
| 154 |
+
features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
|
| 155 |
+
num_shards: 140
|
| 156 |
+
})
|
| 157 |
+
[titan] 2025-07-22 22:47:43,667 - root - INFO - Shuffling the dataset with seed 42
|
| 158 |
+
[titan] 2025-07-22 22:47:43,667 - root - WARNING - [31mDataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample has insufficient shards (140). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.[39m
|
| 159 |
+
`trust_remote_code` is not supported anymore.
|
| 160 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 161 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 162 |
+
[titan] 2025-07-22 22:47:43,667 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 163 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 164 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 165 |
+
[rank4]: Traceback (most recent call last):
|
| 166 |
+
[rank4]: File "<frozen runpy>", line 198, in _run_module_as_main
|
| 167 |
+
[rank4]: File "<frozen runpy>", line 88, in _run_code
|
| 168 |
+
[rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
|
| 169 |
+
[rank4]: main(config)
|
| 170 |
+
[rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
| 171 |
+
[rank4]: return f(*args, **kwargs)
|
| 172 |
+
[rank4]: ^^^^^^^^^^^^^^^^^^
|
| 173 |
+
[rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 155, in main
|
| 174 |
+
[rank4]: dataset = build_dataset(
|
| 175 |
+
[rank4]: ^^^^^^^^^^^^^^
|
| 176 |
+
[rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py", line 689, in build_dataset
|
| 177 |
+
[rank4]: subset = load_dataset(
|
| 178 |
+
[rank4]: ^^^^^^^^^^^^^
|
| 179 |
+
[rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1412, in load_dataset
|
| 180 |
+
[rank4]: builder_instance.download_and_prepare(
|
| 181 |
+
[rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/builder.py", line 829, in download_and_prepare
|
| 182 |
+
[rank4]: with FileLock(lock_path) if is_local else contextlib.nullcontext():
|
| 183 |
+
[rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/filelock/_api.py", line 376, in __enter__
|
| 184 |
+
[rank4]: self.acquire()
|
| 185 |
+
[rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/filelock/_api.py", line 344, in acquire
|
| 186 |
+
[rank4]: time.sleep(poll_interval)
|
| 187 |
+
[rank4]: KeyboardInterrupt
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/4/stdout.log
ADDED
|
File without changes
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/5/stderr.log
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[titan] 2025-07-22 22:47:41,984 - root - INFO - Starting job: default job
|
| 2 |
+
[titan] 2025-07-22 22:47:41,984 - root - INFO - [32m{
|
| 3 |
+
"activation_checkpoint": {
|
| 4 |
+
"mode": "none",
|
| 5 |
+
"selective_ac_option": "2"
|
| 6 |
+
},
|
| 7 |
+
"activation_offload": {
|
| 8 |
+
"mode": "none"
|
| 9 |
+
},
|
| 10 |
+
"checkpoint": {
|
| 11 |
+
"async_mode": "disabled",
|
| 12 |
+
"create_seed_checkpoint": false,
|
| 13 |
+
"enable_checkpoint": true,
|
| 14 |
+
"exclude_from_loading": [],
|
| 15 |
+
"export_dtype": "float32",
|
| 16 |
+
"folder": "checkpoint",
|
| 17 |
+
"interval": 8192,
|
| 18 |
+
"interval_type": "steps",
|
| 19 |
+
"keep_latest_k": 100,
|
| 20 |
+
"load_step": -1,
|
| 21 |
+
"model_weights_only": false
|
| 22 |
+
},
|
| 23 |
+
"comm": {
|
| 24 |
+
"init_timeout_seconds": 300,
|
| 25 |
+
"trace_buf_size": 20000,
|
| 26 |
+
"train_timeout_seconds": 100
|
| 27 |
+
},
|
| 28 |
+
"experimental": {
|
| 29 |
+
"context_parallel_degree": 1,
|
| 30 |
+
"context_parallel_rotate_method": "allgather",
|
| 31 |
+
"custom_model_path": "",
|
| 32 |
+
"enable_async_tensor_parallel": false,
|
| 33 |
+
"enable_compiled_autograd": false,
|
| 34 |
+
"pipeline_parallel_degree": 1,
|
| 35 |
+
"pipeline_parallel_microbatches": null,
|
| 36 |
+
"pipeline_parallel_schedule": "1F1B",
|
| 37 |
+
"pipeline_parallel_schedule_csv": "",
|
| 38 |
+
"pipeline_parallel_split_points": []
|
| 39 |
+
},
|
| 40 |
+
"fault_tolerance": {
|
| 41 |
+
"enable": false,
|
| 42 |
+
"group_size": 0,
|
| 43 |
+
"min_replica_size": 1,
|
| 44 |
+
"replica_id": 0
|
| 45 |
+
},
|
| 46 |
+
"float8": {
|
| 47 |
+
"enable_fsdp_float8_all_gather": false,
|
| 48 |
+
"force_recompute_fp8_weight_in_bwd": false,
|
| 49 |
+
"precompute_float8_dynamic_scale_for_fsdp": false,
|
| 50 |
+
"recipe_name": null
|
| 51 |
+
},
|
| 52 |
+
"job": {
|
| 53 |
+
"config_file": "flame/models/fla.toml",
|
| 54 |
+
"description": "default job",
|
| 55 |
+
"dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1",
|
| 56 |
+
"print_args": true,
|
| 57 |
+
"use_for_integration_test": false
|
| 58 |
+
},
|
| 59 |
+
"lr_scheduler": {
|
| 60 |
+
"decay_ratio": 1.0,
|
| 61 |
+
"decay_type": "linear",
|
| 62 |
+
"lr_min": 0.01,
|
| 63 |
+
"warmup_steps": 100
|
| 64 |
+
},
|
| 65 |
+
"memory_estimation": {
|
| 66 |
+
"disable_fake_mode": false,
|
| 67 |
+
"enabled": false
|
| 68 |
+
},
|
| 69 |
+
"metrics": {
|
| 70 |
+
"disable_color_printing": false,
|
| 71 |
+
"enable_tensorboard": true,
|
| 72 |
+
"enable_wandb": true,
|
| 73 |
+
"log_freq": 1,
|
| 74 |
+
"save_for_all_ranks": false,
|
| 75 |
+
"save_tb_folder": "tb"
|
| 76 |
+
},
|
| 77 |
+
"model": {
|
| 78 |
+
"config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/gdn_6_1_340M.json",
|
| 79 |
+
"converters": [],
|
| 80 |
+
"name": "fla",
|
| 81 |
+
"print_after_conversion": false,
|
| 82 |
+
"tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
|
| 83 |
+
},
|
| 84 |
+
"optimizer": {
|
| 85 |
+
"early_step_in_backward": false,
|
| 86 |
+
"eps": 1e-08,
|
| 87 |
+
"implementation": "fused",
|
| 88 |
+
"lr": 0.0003,
|
| 89 |
+
"name": "AdamW"
|
| 90 |
+
},
|
| 91 |
+
"profiling": {
|
| 92 |
+
"enable_memory_snapshot": false,
|
| 93 |
+
"enable_profiling": true,
|
| 94 |
+
"profile_freq": 512,
|
| 95 |
+
"save_memory_snapshot_folder": "memory_snapshot",
|
| 96 |
+
"save_traces_folder": "profile_trace"
|
| 97 |
+
},
|
| 98 |
+
"training": {
|
| 99 |
+
"batch_size": 16,
|
| 100 |
+
"compile": true,
|
| 101 |
+
"context_len": 8192,
|
| 102 |
+
"data_dir": null,
|
| 103 |
+
"data_files": null,
|
| 104 |
+
"data_parallel_replicate_degree": 1,
|
| 105 |
+
"data_parallel_shard_degree": -1,
|
| 106 |
+
"data_probs": "0.55,0.3,0.15",
|
| 107 |
+
"dataset": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/MegaMath/megamath-web-pro",
|
| 108 |
+
"dataset_name": "default,default,default",
|
| 109 |
+
"dataset_split": "train,train,train",
|
| 110 |
+
"deterministic": false,
|
| 111 |
+
"disable_loss_parallel": false,
|
| 112 |
+
"enable_cpu_offload": false,
|
| 113 |
+
"fsdp_reshard_after_forward": "default",
|
| 114 |
+
"gc_freq": 50,
|
| 115 |
+
"gradient_accumulation_steps": 1,
|
| 116 |
+
"max_norm": 1.0,
|
| 117 |
+
"mixed_precision_param": "bfloat16",
|
| 118 |
+
"mixed_precision_reduce": "float32",
|
| 119 |
+
"num_workers": 32,
|
| 120 |
+
"persistent_workers": false,
|
| 121 |
+
"pin_memory": false,
|
| 122 |
+
"prefetch_factor": 2,
|
| 123 |
+
"seed": 42,
|
| 124 |
+
"seq_len": 8192,
|
| 125 |
+
"skip_nan_inf": true,
|
| 126 |
+
"steps": 95366,
|
| 127 |
+
"streaming": true,
|
| 128 |
+
"tensor_parallel_degree": 1,
|
| 129 |
+
"varlen": false
|
| 130 |
+
}
|
| 131 |
+
}[39m
|
| 132 |
+
[titan] 2025-07-22 22:47:41,986 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
|
| 133 |
+
[titan] 2025-07-22 22:47:43,062 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
|
| 134 |
+
[titan] 2025-07-22 22:47:43,064 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
|
| 135 |
+
[titan] 2025-07-22 22:47:43,202 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
|
| 136 |
+
[titan] 2025-07-22 22:47:43,202 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
|
| 137 |
+
[titan] 2025-07-22 22:47:43,202 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
|
| 138 |
+
[titan] 2025-07-22 22:47:43,209 - root - INFO - Loading tokenizer...
|
| 139 |
+
[titan] 2025-07-22 22:47:43,394 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
|
| 140 |
+
0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 141 |
+
1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 142 |
+
2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 143 |
+
}
|
| 144 |
+
)
|
| 145 |
+
[titan] 2025-07-22 22:47:43,395 - root - INFO - Loading dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/MegaMath/megamath-web-pro:default,default,default
|
| 146 |
+
`trust_remote_code` is not supported anymore.
|
| 147 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 148 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 149 |
+
[titan] 2025-07-22 22:47:43,395 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 150 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 151 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 152 |
+
[titan] 2025-07-22 22:47:43,666 - root - INFO - Subset [36m/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample:default (p = 0.550)[39m:
|
| 153 |
+
IterableDataset({
|
| 154 |
+
features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
|
| 155 |
+
num_shards: 140
|
| 156 |
+
})
|
| 157 |
+
[titan] 2025-07-22 22:47:43,666 - root - INFO - Shuffling the dataset with seed 42
|
| 158 |
+
[titan] 2025-07-22 22:47:43,667 - root - WARNING - [31mDataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample has insufficient shards (140). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.[39m
|
| 159 |
+
`trust_remote_code` is not supported anymore.
|
| 160 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 161 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 162 |
+
[titan] 2025-07-22 22:47:43,667 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 163 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 164 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 165 |
+
[rank5]: Traceback (most recent call last):
|
| 166 |
+
[rank5]: File "<frozen runpy>", line 198, in _run_module_as_main
|
| 167 |
+
[rank5]: File "<frozen runpy>", line 88, in _run_code
|
| 168 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
|
| 169 |
+
[rank5]: main(config)
|
| 170 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
| 171 |
+
[rank5]: return f(*args, **kwargs)
|
| 172 |
+
[rank5]: ^^^^^^^^^^^^^^^^^^
|
| 173 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 155, in main
|
| 174 |
+
[rank5]: dataset = build_dataset(
|
| 175 |
+
[rank5]: ^^^^^^^^^^^^^^
|
| 176 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py", line 689, in build_dataset
|
| 177 |
+
[rank5]: subset = load_dataset(
|
| 178 |
+
[rank5]: ^^^^^^^^^^^^^
|
| 179 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1412, in load_dataset
|
| 180 |
+
[rank5]: builder_instance.download_and_prepare(
|
| 181 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/builder.py", line 829, in download_and_prepare
|
| 182 |
+
[rank5]: with FileLock(lock_path) if is_local else contextlib.nullcontext():
|
| 183 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/filelock/_api.py", line 376, in __enter__
|
| 184 |
+
[rank5]: self.acquire()
|
| 185 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/filelock/_api.py", line 344, in acquire
|
| 186 |
+
[rank5]: time.sleep(poll_interval)
|
| 187 |
+
[rank5]: KeyboardInterrupt
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/5/stdout.log
ADDED
|
File without changes
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/6/stderr.log
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[titan] 2025-07-22 22:47:41,727 - root - INFO - Starting job: default job
|
| 2 |
+
[titan] 2025-07-22 22:47:41,727 - root - INFO - [32m{
|
| 3 |
+
"activation_checkpoint": {
|
| 4 |
+
"mode": "none",
|
| 5 |
+
"selective_ac_option": "2"
|
| 6 |
+
},
|
| 7 |
+
"activation_offload": {
|
| 8 |
+
"mode": "none"
|
| 9 |
+
},
|
| 10 |
+
"checkpoint": {
|
| 11 |
+
"async_mode": "disabled",
|
| 12 |
+
"create_seed_checkpoint": false,
|
| 13 |
+
"enable_checkpoint": true,
|
| 14 |
+
"exclude_from_loading": [],
|
| 15 |
+
"export_dtype": "float32",
|
| 16 |
+
"folder": "checkpoint",
|
| 17 |
+
"interval": 8192,
|
| 18 |
+
"interval_type": "steps",
|
| 19 |
+
"keep_latest_k": 100,
|
| 20 |
+
"load_step": -1,
|
| 21 |
+
"model_weights_only": false
|
| 22 |
+
},
|
| 23 |
+
"comm": {
|
| 24 |
+
"init_timeout_seconds": 300,
|
| 25 |
+
"trace_buf_size": 20000,
|
| 26 |
+
"train_timeout_seconds": 100
|
| 27 |
+
},
|
| 28 |
+
"experimental": {
|
| 29 |
+
"context_parallel_degree": 1,
|
| 30 |
+
"context_parallel_rotate_method": "allgather",
|
| 31 |
+
"custom_model_path": "",
|
| 32 |
+
"enable_async_tensor_parallel": false,
|
| 33 |
+
"enable_compiled_autograd": false,
|
| 34 |
+
"pipeline_parallel_degree": 1,
|
| 35 |
+
"pipeline_parallel_microbatches": null,
|
| 36 |
+
"pipeline_parallel_schedule": "1F1B",
|
| 37 |
+
"pipeline_parallel_schedule_csv": "",
|
| 38 |
+
"pipeline_parallel_split_points": []
|
| 39 |
+
},
|
| 40 |
+
"fault_tolerance": {
|
| 41 |
+
"enable": false,
|
| 42 |
+
"group_size": 0,
|
| 43 |
+
"min_replica_size": 1,
|
| 44 |
+
"replica_id": 0
|
| 45 |
+
},
|
| 46 |
+
"float8": {
|
| 47 |
+
"enable_fsdp_float8_all_gather": false,
|
| 48 |
+
"force_recompute_fp8_weight_in_bwd": false,
|
| 49 |
+
"precompute_float8_dynamic_scale_for_fsdp": false,
|
| 50 |
+
"recipe_name": null
|
| 51 |
+
},
|
| 52 |
+
"job": {
|
| 53 |
+
"config_file": "flame/models/fla.toml",
|
| 54 |
+
"description": "default job",
|
| 55 |
+
"dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1",
|
| 56 |
+
"print_args": true,
|
| 57 |
+
"use_for_integration_test": false
|
| 58 |
+
},
|
| 59 |
+
"lr_scheduler": {
|
| 60 |
+
"decay_ratio": 1.0,
|
| 61 |
+
"decay_type": "linear",
|
| 62 |
+
"lr_min": 0.01,
|
| 63 |
+
"warmup_steps": 100
|
| 64 |
+
},
|
| 65 |
+
"memory_estimation": {
|
| 66 |
+
"disable_fake_mode": false,
|
| 67 |
+
"enabled": false
|
| 68 |
+
},
|
| 69 |
+
"metrics": {
|
| 70 |
+
"disable_color_printing": false,
|
| 71 |
+
"enable_tensorboard": true,
|
| 72 |
+
"enable_wandb": true,
|
| 73 |
+
"log_freq": 1,
|
| 74 |
+
"save_for_all_ranks": false,
|
| 75 |
+
"save_tb_folder": "tb"
|
| 76 |
+
},
|
| 77 |
+
"model": {
|
| 78 |
+
"config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/gdn_6_1_340M.json",
|
| 79 |
+
"converters": [],
|
| 80 |
+
"name": "fla",
|
| 81 |
+
"print_after_conversion": false,
|
| 82 |
+
"tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
|
| 83 |
+
},
|
| 84 |
+
"optimizer": {
|
| 85 |
+
"early_step_in_backward": false,
|
| 86 |
+
"eps": 1e-08,
|
| 87 |
+
"implementation": "fused",
|
| 88 |
+
"lr": 0.0003,
|
| 89 |
+
"name": "AdamW"
|
| 90 |
+
},
|
| 91 |
+
"profiling": {
|
| 92 |
+
"enable_memory_snapshot": false,
|
| 93 |
+
"enable_profiling": true,
|
| 94 |
+
"profile_freq": 512,
|
| 95 |
+
"save_memory_snapshot_folder": "memory_snapshot",
|
| 96 |
+
"save_traces_folder": "profile_trace"
|
| 97 |
+
},
|
| 98 |
+
"training": {
|
| 99 |
+
"batch_size": 16,
|
| 100 |
+
"compile": true,
|
| 101 |
+
"context_len": 8192,
|
| 102 |
+
"data_dir": null,
|
| 103 |
+
"data_files": null,
|
| 104 |
+
"data_parallel_replicate_degree": 1,
|
| 105 |
+
"data_parallel_shard_degree": -1,
|
| 106 |
+
"data_probs": "0.55,0.3,0.15",
|
| 107 |
+
"dataset": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/MegaMath/megamath-web-pro",
|
| 108 |
+
"dataset_name": "default,default,default",
|
| 109 |
+
"dataset_split": "train,train,train",
|
| 110 |
+
"deterministic": false,
|
| 111 |
+
"disable_loss_parallel": false,
|
| 112 |
+
"enable_cpu_offload": false,
|
| 113 |
+
"fsdp_reshard_after_forward": "default",
|
| 114 |
+
"gc_freq": 50,
|
| 115 |
+
"gradient_accumulation_steps": 1,
|
| 116 |
+
"max_norm": 1.0,
|
| 117 |
+
"mixed_precision_param": "bfloat16",
|
| 118 |
+
"mixed_precision_reduce": "float32",
|
| 119 |
+
"num_workers": 32,
|
| 120 |
+
"persistent_workers": false,
|
| 121 |
+
"pin_memory": false,
|
| 122 |
+
"prefetch_factor": 2,
|
| 123 |
+
"seed": 42,
|
| 124 |
+
"seq_len": 8192,
|
| 125 |
+
"skip_nan_inf": true,
|
| 126 |
+
"steps": 95366,
|
| 127 |
+
"streaming": true,
|
| 128 |
+
"tensor_parallel_degree": 1,
|
| 129 |
+
"varlen": false
|
| 130 |
+
}
|
| 131 |
+
}[39m
|
| 132 |
+
[titan] 2025-07-22 22:47:41,729 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
|
| 133 |
+
[titan] 2025-07-22 22:47:42,344 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
|
| 134 |
+
[titan] 2025-07-22 22:47:42,347 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
|
| 135 |
+
[titan] 2025-07-22 22:47:42,391 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
|
| 136 |
+
[titan] 2025-07-22 22:47:42,391 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
|
| 137 |
+
[titan] 2025-07-22 22:47:42,391 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
|
| 138 |
+
[titan] 2025-07-22 22:47:42,708 - root - INFO - Loading tokenizer...
|
| 139 |
+
[titan] 2025-07-22 22:47:43,145 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
|
| 140 |
+
0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 141 |
+
1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 142 |
+
2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 143 |
+
}
|
| 144 |
+
)
|
| 145 |
+
[titan] 2025-07-22 22:47:43,145 - root - INFO - Loading dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/MegaMath/megamath-web-pro:default,default,default
|
| 146 |
+
`trust_remote_code` is not supported anymore.
|
| 147 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 148 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 149 |
+
[titan] 2025-07-22 22:47:43,146 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 150 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 151 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 152 |
+
[titan] 2025-07-22 22:47:43,667 - root - INFO - Subset [36m/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample:default (p = 0.550)[39m:
|
| 153 |
+
IterableDataset({
|
| 154 |
+
features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
|
| 155 |
+
num_shards: 140
|
| 156 |
+
})
|
| 157 |
+
[titan] 2025-07-22 22:47:43,667 - root - INFO - Shuffling the dataset with seed 42
|
| 158 |
+
[titan] 2025-07-22 22:47:43,667 - root - WARNING - [31mDataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample has insufficient shards (140). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.[39m
|
| 159 |
+
`trust_remote_code` is not supported anymore.
|
| 160 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 161 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 162 |
+
[titan] 2025-07-22 22:47:43,668 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 163 |
+
Please check that the Hugging Face dataset '/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 164 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 165 |
+
[rank6]: Traceback (most recent call last):
|
| 166 |
+
[rank6]: File "<frozen runpy>", line 198, in _run_module_as_main
|
| 167 |
+
[rank6]: File "<frozen runpy>", line 88, in _run_code
|
| 168 |
+
[rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
|
| 169 |
+
[rank6]: main(config)
|
| 170 |
+
[rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
| 171 |
+
[rank6]: return f(*args, **kwargs)
|
| 172 |
+
[rank6]: ^^^^^^^^^^^^^^^^^^
|
| 173 |
+
[rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 155, in main
|
| 174 |
+
[rank6]: dataset = build_dataset(
|
| 175 |
+
[rank6]: ^^^^^^^^^^^^^^
|
| 176 |
+
[rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py", line 689, in build_dataset
|
| 177 |
+
[rank6]: subset = load_dataset(
|
| 178 |
+
[rank6]: ^^^^^^^^^^^^^
|
| 179 |
+
[rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1412, in load_dataset
|
| 180 |
+
[rank6]: builder_instance.download_and_prepare(
|
| 181 |
+
[rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/builder.py", line 829, in download_and_prepare
|
| 182 |
+
[rank6]: with FileLock(lock_path) if is_local else contextlib.nullcontext():
|
| 183 |
+
[rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/filelock/_api.py", line 376, in __enter__
|
| 184 |
+
[rank6]: self.acquire()
|
| 185 |
+
[rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/filelock/_api.py", line 344, in acquire
|
| 186 |
+
[rank6]: time.sleep(poll_interval)
|
| 187 |
+
[rank6]: KeyboardInterrupt
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_ff62l9rt/attempt_0/6/stdout.log
ADDED
|
File without changes
|