Upload folder using huggingface_hub
Browse files- .gitattributes +8 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/8k-100.sh +65 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/config.json +53 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/delta_net_1B.json +29 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/delta_net_340M.json +26 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gated_deltanet_1B.json +22 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gated_deltanet_340M.json +22 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gdn_6_1_340M.json +50 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gla_340M.json +24 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gla_7B.json +25 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gsa_340M.json +29 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/hgrn2_340M.json +20 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/mamba2_1B.json +32 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/mamba2_340M.json +32 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/mamba_1B.json +30 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/mamba_340M.json +30 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/samba_1B.json +52 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/sba_340m.json +18 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/transformer_1B.json +22 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/transformer_340M.json +18 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/transformer_7B.json +21 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/generation_config.json +6 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/0/stderr.log +3 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/0/stdout.log +0 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/1/stderr.log +3 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/1/stdout.log +0 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/2/stderr.log +3 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/2/stdout.log +0 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/3/stderr.log +3 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/3/stdout.log +0 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/4/stderr.log +3 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/4/stdout.log +0 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/5/stderr.log +3 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/5/stdout.log +0 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/6/stderr.log +3 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/6/stdout.log +0 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/7/stderr.log +3 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/7/stdout.log +0 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/model.safetensors +3 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/model_size=391m +0 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/special_tokens_map.json +23 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/tb/20250723-1049/events.out.tfevents.1753238968.TENCENT64.site.2520914.0 +3 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/tokenizer.json +0 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/tokenizer_config.json +44 -0
- gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/train.sh +121 -0
.gitattributes
CHANGED
|
@@ -49,3 +49,11 @@ bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_rati
|
|
| 49 |
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_ij1w4wht/attempt_0/5/stderr.log filter=lfs diff=lfs merge=lfs -text
|
| 50 |
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_ij1w4wht/attempt_0/6/stderr.log filter=lfs diff=lfs merge=lfs -text
|
| 51 |
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_ij1w4wht/attempt_0/7/stderr.log filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_ij1w4wht/attempt_0/5/stderr.log filter=lfs diff=lfs merge=lfs -text
|
| 50 |
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_ij1w4wht/attempt_0/6/stderr.log filter=lfs diff=lfs merge=lfs -text
|
| 51 |
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_ij1w4wht/attempt_0/7/stderr.log filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/0/stderr.log filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/1/stderr.log filter=lfs diff=lfs merge=lfs -text
|
| 54 |
+
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/2/stderr.log filter=lfs diff=lfs merge=lfs -text
|
| 55 |
+
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/3/stderr.log filter=lfs diff=lfs merge=lfs -text
|
| 56 |
+
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/4/stderr.log filter=lfs diff=lfs merge=lfs -text
|
| 57 |
+
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/5/stderr.log filter=lfs diff=lfs merge=lfs -text
|
| 58 |
+
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/6/stderr.log filter=lfs diff=lfs merge=lfs -text
|
| 59 |
+
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/7/stderr.log filter=lfs diff=lfs merge=lfs -text
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/8k-100.sh
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Launch script for a flame training run with an 8192-token context.
# Usage: bash 8k-100.sh [CONFIG]
#   CONFIG - config file name under $FLAME_PATH/configs
#            (default: transformer_340M.json)
FLAME_PATH=/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame
DATASET_ROOT=/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset
TOKENIZER=/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer

# .venv, train.sh and flame/models/fla.toml are all addressed relative to
# FLAME_PATH, so stop here if the directory cannot be entered instead of
# running them against whatever directory the caller happened to be in.
cd "$FLAME_PATH" || {
  printf 'cannot cd to %s\n' "$FLAME_PATH" >&2
  exit 1
}
source .venv/bin/activate

# =========== train config ===========
CONFIG=${1:-transformer_340M.json}
SEQ_LEN=8192
WARMUP_STEPS=100
STEPS=95366
LR=3e-4
BATCH_SIZE=8
GAS=2
DECAY_TYPE=linear
DECAY_RATIO=1
NNODE=1
# NOTE(review): NGPU and LOG_RANK are neither referenced below nor exported,
# so a child process cannot see them — presumably meant for train.sh; confirm.
NGPU=8
LOG_RANK=0
# ====================================

# if jq command is not found, install it
# (jq is not used in this script itself — presumably train.sh needs it;
# TODO confirm)
if ! command -v jq &> /dev/null; then
  echo "jq could not be found, installing it..."
  sudo yum install -y jq
fi

# Experiment name = config file name plus the main hyper-parameters.
# NOTE(review): the sed pattern strips ".config", which a *.json file name
# does not contain, so the ".json" suffix stays in EXP_NAME. Left unchanged
# because the dump folder name depends on it — confirm before changing.
EXP_NAME=$(basename -- "$CONFIG" | sed 's/\.config//')-ctx${SEQ_LEN}-steps${STEPS}-lr${LR}-decay_type${DECAY_TYPE}-decay_ratio${DECAY_RATIO}-bs${BATCH_SIZE}-nn${NNODE}-gas${GAS}

# Arguments are collected in an array so every value reaches train.sh as
# exactly one word, even if a path ever contains whitespace.
train_args=(
  --job.config_file flame/models/fla.toml
  --job.dump_folder "$FLAME_PATH/exp/$EXP_NAME"
  --model.config "$FLAME_PATH/configs/$CONFIG"
  --model.tokenizer_path "$TOKENIZER"
  --optimizer.name AdamW
  --optimizer.eps 1e-8
  --optimizer.lr "$LR"
  --lr_scheduler.warmup_steps "$WARMUP_STEPS"
  --lr_scheduler.lr_min 0.01
  --lr_scheduler.decay_type "$DECAY_TYPE"
  --lr_scheduler.decay_ratio "$DECAY_RATIO"
  --training.batch_size "$BATCH_SIZE"
  --training.seq_len "$SEQ_LEN"
  --training.context_len "$SEQ_LEN"
  --training.gradient_accumulation_steps "$GAS"
  --training.steps "$STEPS"
  --training.max_norm 1.0
  --training.skip_nan_inf
  # Three datasets; each comma-separated list below has one entry per dataset.
  --training.dataset "$DATASET_ROOT/fineweb-edu-sample,$DATASET_ROOT/small_repos_20B_sample_merged,$DATASET_ROOT/megamath-web-pro"
  --training.data_probs 0.55,0.3,0.15
  --training.dataset_split train,train,train
  --training.dataset_name default,default,default
  # The original passed --training.streaming a second time at the end of the
  # command; the duplicate has been dropped.
  --training.streaming
  --training.num_workers 32
  --training.prefetch_factor 2
  --training.seed 42
  --training.compile
  --checkpoint.interval 8192
  --checkpoint.load_step -1
  --checkpoint.keep_latest_k 100
  --metrics.log_freq 1
  --metrics.enable_tensorboard
)

bash train.sh "${train_args[@]}"
|
| 65 |
+
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/config.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"allow_neg_eigval": false,
|
| 3 |
+
"architectures": [
|
| 4 |
+
"GatedDeltaNetForCausalLM"
|
| 5 |
+
],
|
| 6 |
+
"attn": {
|
| 7 |
+
"layers": [
|
| 8 |
+
5,
|
| 9 |
+
11,
|
| 10 |
+
17,
|
| 11 |
+
23
|
| 12 |
+
],
|
| 13 |
+
"num_heads": 16,
|
| 14 |
+
"num_kv_heads": 8,
|
| 15 |
+
"qkv_bias": false,
|
| 16 |
+
"rope_theta": 160000.0,
|
| 17 |
+
"window_size": null
|
| 18 |
+
},
|
| 19 |
+
"attn_mode": "chunk",
|
| 20 |
+
"bos_token_id": 1,
|
| 21 |
+
"conv_size": 4,
|
| 22 |
+
"eos_token_id": 2,
|
| 23 |
+
"expand_k": 1,
|
| 24 |
+
"expand_v": 1,
|
| 25 |
+
"fuse_cross_entropy": true,
|
| 26 |
+
"fuse_norm": true,
|
| 27 |
+
"fuse_swiglu": true,
|
| 28 |
+
"head_dim": 256,
|
| 29 |
+
"hidden_act": "swish",
|
| 30 |
+
"hidden_ratio": 4,
|
| 31 |
+
"hidden_size": 1024,
|
| 32 |
+
"initializer_range": 0.02,
|
| 33 |
+
"intermediate_size": null,
|
| 34 |
+
"max_position_embeddings": 8192,
|
| 35 |
+
"model_type": "gated_deltanet",
|
| 36 |
+
"norm_eps": 1e-06,
|
| 37 |
+
"norm_first": false,
|
| 38 |
+
"num_heads": 4,
|
| 39 |
+
"num_hidden_layers": 24,
|
| 40 |
+
"num_v_heads": null,
|
| 41 |
+
"qk_activation": "silu",
|
| 42 |
+
"qk_norm": "l2",
|
| 43 |
+
"tie_word_embeddings": false,
|
| 44 |
+
"torch_dtype": "float32",
|
| 45 |
+
"transformers_version": "4.53.3",
|
| 46 |
+
"use_beta": true,
|
| 47 |
+
"use_cache": true,
|
| 48 |
+
"use_gate": true,
|
| 49 |
+
"use_l2warp": false,
|
| 50 |
+
"use_output_norm": true,
|
| 51 |
+
"use_short_conv": true,
|
| 52 |
+
"vocab_size": 32000
|
| 53 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/delta_net_1B.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attn": null,
|
| 3 |
+
"attn_mode": "chunk",
|
| 4 |
+
"bos_token_id": 1,
|
| 5 |
+
"conv_size": 4,
|
| 6 |
+
"eos_token_id": 2,
|
| 7 |
+
"expand_k": 1,
|
| 8 |
+
"expand_v": 1,
|
| 9 |
+
"fuse_cross_entropy": true,
|
| 10 |
+
"fuse_norm": true,
|
| 11 |
+
"hidden_act": "swish",
|
| 12 |
+
"hidden_ratio": 4,
|
| 13 |
+
"hidden_size": 2048,
|
| 14 |
+
"initializer_range": 0.02,
|
| 15 |
+
"intermediate_size": null,
|
| 16 |
+
"model_type": "delta_net",
|
| 17 |
+
"norm_eps": 1e-06,
|
| 18 |
+
"num_heads": 16,
|
| 19 |
+
"num_hidden_layers": 24,
|
| 20 |
+
"pad_token_id": 2,
|
| 21 |
+
"qk_activation": "silu",
|
| 22 |
+
"qk_norm": "l2",
|
| 23 |
+
"tie_word_embeddings": false,
|
| 24 |
+
"use_beta": true,
|
| 25 |
+
"use_cache": true,
|
| 26 |
+
"use_gate": false,
|
| 27 |
+
"use_output_norm": true,
|
| 28 |
+
"use_short_conv": true
|
| 29 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/delta_net_340M.json
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attn_mode": "chunk",
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"conv_size": 4,
|
| 5 |
+
"eos_token_id": 2,
|
| 6 |
+
"expand_k": 1,
|
| 7 |
+
"expand_v": 1,
|
| 8 |
+
"fuse_cross_entropy": true,
|
| 9 |
+
"hidden_act": "swish",
|
| 10 |
+
"hidden_ratio": 4,
|
| 11 |
+
"hidden_size": 1024,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"intermediate_size": null,
|
| 14 |
+
"model_type": "delta_net",
|
| 15 |
+
"norm_eps": 1e-06,
|
| 16 |
+
"num_heads": 8,
|
| 17 |
+
"num_hidden_layers": 24,
|
| 18 |
+
"qk_activation": "silu",
|
| 19 |
+
"qk_norm": "l2",
|
| 20 |
+
"tie_word_embeddings": false,
|
| 21 |
+
"use_beta": true,
|
| 22 |
+
"use_cache": true,
|
| 23 |
+
"use_gate": false,
|
| 24 |
+
"use_output_norm": true,
|
| 25 |
+
"use_short_conv": true
|
| 26 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gated_deltanet_1B.json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attn_mode": "chunk",
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"conv_size": 4,
|
| 5 |
+
"eos_token_id": 2,
|
| 6 |
+
"expand_v": 2,
|
| 7 |
+
"fuse_cross_entropy": true,
|
| 8 |
+
"head_dim": 256,
|
| 9 |
+
"hidden_act": "swish",
|
| 10 |
+
"hidden_ratio": 4,
|
| 11 |
+
"hidden_size": 2048,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"intermediate_size": null,
|
| 14 |
+
"model_type": "gated_deltanet",
|
| 15 |
+
"norm_eps": 1e-06,
|
| 16 |
+
"num_heads": 6,
|
| 17 |
+
"num_hidden_layers": 21,
|
| 18 |
+
"tie_word_embeddings": false,
|
| 19 |
+
"use_cache": true,
|
| 20 |
+
"use_gate": true,
|
| 21 |
+
"use_short_conv": true
|
| 22 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gated_deltanet_340M.json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attn_mode": "chunk",
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"conv_size": 4,
|
| 5 |
+
"eos_token_id": 2,
|
| 6 |
+
"expand_v": 2,
|
| 7 |
+
"fuse_cross_entropy": true,
|
| 8 |
+
"head_dim": 256,
|
| 9 |
+
"hidden_act": "swish",
|
| 10 |
+
"hidden_ratio": 4,
|
| 11 |
+
"hidden_size": 1024,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"intermediate_size": null,
|
| 14 |
+
"model_type": "gated_deltanet",
|
| 15 |
+
"norm_eps": 1e-06,
|
| 16 |
+
"num_heads": 6,
|
| 17 |
+
"num_hidden_layers": 21,
|
| 18 |
+
"tie_word_embeddings": false,
|
| 19 |
+
"use_cache": true,
|
| 20 |
+
"use_gate": true,
|
| 21 |
+
"use_short_conv": true
|
| 22 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gdn_6_1_340M.json
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"GatedDeltaNetForCausalLM"
|
| 4 |
+
],
|
| 5 |
+
"attn": {
|
| 6 |
+
"layers": [
|
| 7 |
+
5,
|
| 8 |
+
11,
|
| 9 |
+
17,
|
| 10 |
+
23
|
| 11 |
+
],
|
| 12 |
+
"num_heads": 16,
|
| 13 |
+
"num_kv_heads": 8,
|
| 14 |
+
"qkv_bias": false,
|
| 15 |
+
"rope_theta": 160000.0,
|
| 16 |
+
"window_size": null
|
| 17 |
+
},
|
| 18 |
+
"attn_mode": "chunk",
|
| 19 |
+
"bos_token_id": 1,
|
| 20 |
+
"conv_size": 4,
|
| 21 |
+
"eos_token_id": 2,
|
| 22 |
+
"expand_k": 1,
|
| 23 |
+
"expand_v": 1,
|
| 24 |
+
"fuse_cross_entropy": true,
|
| 25 |
+
"fuse_norm": true,
|
| 26 |
+
"fuse_swiglu": true,
|
| 27 |
+
"head_dim": 256,
|
| 28 |
+
"hidden_act": "swish",
|
| 29 |
+
"hidden_ratio": 4,
|
| 30 |
+
"hidden_size": 1024,
|
| 31 |
+
"initializer_range": 0.02,
|
| 32 |
+
"intermediate_size": null,
|
| 33 |
+
"max_position_embeddings": 8192,
|
| 34 |
+
"model_type": "gated_deltanet",
|
| 35 |
+
"norm_eps": 1e-06,
|
| 36 |
+
"norm_first": false,
|
| 37 |
+
"num_heads": 4,
|
| 38 |
+
"num_hidden_layers": 24,
|
| 39 |
+
"qk_activation": "silu",
|
| 40 |
+
"qk_norm": "l2",
|
| 41 |
+
"tie_word_embeddings": false,
|
| 42 |
+
"torch_dtype": "float32",
|
| 43 |
+
"transformers_version": "4.51.3",
|
| 44 |
+
"use_beta": true,
|
| 45 |
+
"use_cache": true,
|
| 46 |
+
"use_gate": true,
|
| 47 |
+
"use_output_norm": true,
|
| 48 |
+
"use_short_conv": true,
|
| 49 |
+
"vocab_size": 32000
|
| 50 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gla_340M.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attn_mode": "chunk",
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"clamp_min": null,
|
| 5 |
+
"eos_token_id": 2,
|
| 6 |
+
"expand_k": 0.5,
|
| 7 |
+
"expand_v": 1,
|
| 8 |
+
"fuse_cross_entropy": true,
|
| 9 |
+
"fuse_norm": true,
|
| 10 |
+
"hidden_act": "swish",
|
| 11 |
+
"hidden_ratio": 4,
|
| 12 |
+
"hidden_size": 1024,
|
| 13 |
+
"initializer_range": 0.02,
|
| 14 |
+
"intermediate_size": null,
|
| 15 |
+
"model_type": "gla",
|
| 16 |
+
"num_heads": 4,
|
| 17 |
+
"num_hidden_layers": 24,
|
| 18 |
+
"norm_eps": 1e-06,
|
| 19 |
+
"tie_word_embeddings": false,
|
| 20 |
+
"use_cache": true,
|
| 21 |
+
"use_gk": true,
|
| 22 |
+
"use_gv": false,
|
| 23 |
+
"vocab_size": 32000
|
| 24 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gla_7B.json
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attn": null,
|
| 3 |
+
"attn_mode": "chunk",
|
| 4 |
+
"bos_token_id": 1,
|
| 5 |
+
"eos_token_id": 2,
|
| 6 |
+
"expand_k": 0.5,
|
| 7 |
+
"expand_v": 1,
|
| 8 |
+
"fuse_cross_entropy": true,
|
| 9 |
+
"fuse_norm": true,
|
| 10 |
+
"hidden_act": "swish",
|
| 11 |
+
"hidden_ratio": 4,
|
| 12 |
+
"hidden_size": 4096,
|
| 13 |
+
"initializer_range": 0.02,
|
| 14 |
+
"intermediate_size": 11008,
|
| 15 |
+
"model_type": "gla",
|
| 16 |
+
"norm_eps": 1e-06,
|
| 17 |
+
"num_heads": 16,
|
| 18 |
+
"num_hidden_layers": 32,
|
| 19 |
+
"tie_word_embeddings": false,
|
| 20 |
+
"use_cache": true,
|
| 21 |
+
"use_gk": true,
|
| 22 |
+
"use_gv": false,
|
| 23 |
+
"use_output_gate": true,
|
| 24 |
+
"use_short_conv": false
|
| 25 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gsa_340M.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 1,
|
| 3 |
+
"conv_size": 4,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"expand_k": 1,
|
| 6 |
+
"expand_v": 1,
|
| 7 |
+
"elementwise_affine": false,
|
| 8 |
+
"feature_map": "swish",
|
| 9 |
+
"fuse_cross_entropy": true,
|
| 10 |
+
"fuse_norm": true,
|
| 11 |
+
"gate_logit_normalizer": 4,
|
| 12 |
+
"hidden_act": "swish",
|
| 13 |
+
"hidden_ratio": 4,
|
| 14 |
+
"hidden_size": 1024,
|
| 15 |
+
"initializer_range": 0.02,
|
| 16 |
+
"intermediate_size": null,
|
| 17 |
+
"model_type": "gsa",
|
| 18 |
+
"num_heads": 4,
|
| 19 |
+
"num_hidden_layers": 24,
|
| 20 |
+
"num_slots": 64,
|
| 21 |
+
"norm_eps": 1e-06,
|
| 22 |
+
"share_conv_kernel": true,
|
| 23 |
+
"tie_word_embeddings": false,
|
| 24 |
+
"use_cache": true,
|
| 25 |
+
"use_norm": true,
|
| 26 |
+
"use_output_gate": true,
|
| 27 |
+
"use_rope": false,
|
| 28 |
+
"use_short_conv": false
|
| 29 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/hgrn2_340M.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attn_mode": "chunk",
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"expand_ratio": 128,
|
| 6 |
+
"fuse_cross_entropy": true,
|
| 7 |
+
"fuse_norm": true,
|
| 8 |
+
"hidden_act": "swish",
|
| 9 |
+
"hidden_ratio": 4,
|
| 10 |
+
"hidden_size": 1024,
|
| 11 |
+
"initializer_range": 0.02,
|
| 12 |
+
"intermediate_size": null,
|
| 13 |
+
"model_type": "hgrn2",
|
| 14 |
+
"num_heads": 8,
|
| 15 |
+
"num_hidden_layers": 24,
|
| 16 |
+
"norm_eps": 1e-06,
|
| 17 |
+
"tie_word_embeddings": false,
|
| 18 |
+
"use_cache": true,
|
| 19 |
+
"vocab_size": 32000
|
| 20 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/mamba2_1B.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 1,
|
| 3 |
+
"chunk_size": 256,
|
| 4 |
+
"conv_kernel": 4,
|
| 5 |
+
"eos_token_id": 2,
|
| 6 |
+
"expand": 2,
|
| 7 |
+
"fuse_cross_entropy": true,
|
| 8 |
+
"fuse_norm": true,
|
| 9 |
+
"head_dim": 64,
|
| 10 |
+
"hidden_act": "silu",
|
| 11 |
+
"hidden_size": 2048,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"norm_eps": 1e-05,
|
| 14 |
+
"model_type": "mamba2",
|
| 15 |
+
"n_groups": 1,
|
| 16 |
+
"num_hidden_layers": 48,
|
| 17 |
+
"pad_token_id": 0,
|
| 18 |
+
"rescale_prenorm_residual": true,
|
| 19 |
+
"residual_in_fp32": true,
|
| 20 |
+
"rms_norm": true,
|
| 21 |
+
"state_size": 128,
|
| 22 |
+
"tie_word_embeddings": false,
|
| 23 |
+
"time_step_floor": 0.0001,
|
| 24 |
+
"time_step_max": 0.1,
|
| 25 |
+
"time_step_min": 0.001,
|
| 26 |
+
"time_step_rank": 128,
|
| 27 |
+
"transformers_version": "4.50.1",
|
| 28 |
+
"use_bias": false,
|
| 29 |
+
"use_cache": true,
|
| 30 |
+
"use_conv_bias": true,
|
| 31 |
+
"vocab_size": 32000
|
| 32 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/mamba2_340M.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 1,
|
| 3 |
+
"chunk_size": 256,
|
| 4 |
+
"conv_kernel": 4,
|
| 5 |
+
"eos_token_id": 2,
|
| 6 |
+
"expand": 2,
|
| 7 |
+
"fuse_cross_entropy": true,
|
| 8 |
+
"fuse_norm": true,
|
| 9 |
+
"head_dim": 64,
|
| 10 |
+
"hidden_act": "silu",
|
| 11 |
+
"hidden_size": 1024,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"norm_eps": 1e-05,
|
| 14 |
+
"model_type": "mamba2",
|
| 15 |
+
"n_groups": 1,
|
| 16 |
+
"num_hidden_layers": 48,
|
| 17 |
+
"pad_token_id": 0,
|
| 18 |
+
"rescale_prenorm_residual": true,
|
| 19 |
+
"residual_in_fp32": true,
|
| 20 |
+
"rms_norm": true,
|
| 21 |
+
"state_size": 128,
|
| 22 |
+
"tie_word_embeddings": false,
|
| 23 |
+
"time_step_floor": 0.0001,
|
| 24 |
+
"time_step_max": 0.1,
|
| 25 |
+
"time_step_min": 0.001,
|
| 26 |
+
"time_step_rank": 128,
|
| 27 |
+
"transformers_version": "4.50.1",
|
| 28 |
+
"use_bias": false,
|
| 29 |
+
"use_cache": true,
|
| 30 |
+
"use_conv_bias": true,
|
| 31 |
+
"vocab_size": 32000
|
| 32 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/mamba_1B.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 1,
|
| 3 |
+
"conv_kernel": 4,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"expand": 2,
|
| 6 |
+
"fuse_cross_entropy": true,
|
| 7 |
+
"fuse_norm": true,
|
| 8 |
+
"hidden_act": "silu",
|
| 9 |
+
"hidden_size": 2048,
|
| 10 |
+
"initializer_range": 0.02,
|
| 11 |
+
"model_type": "mamba",
|
| 12 |
+
"norm_eps": 1e-05,
|
| 13 |
+
"num_hidden_layers": 48,
|
| 14 |
+
"pad_token_id": 0,
|
| 15 |
+
"rescale_prenorm_residual": false,
|
| 16 |
+
"residual_in_fp32": false,
|
| 17 |
+
"state_size": 16,
|
| 18 |
+
"tie_word_embeddings": false,
|
| 19 |
+
"time_step_floor": 0.0001,
|
| 20 |
+
"time_step_init_scheme": "random",
|
| 21 |
+
"time_step_max": 0.1,
|
| 22 |
+
"time_step_min": 0.001,
|
| 23 |
+
"time_step_rank": 128,
|
| 24 |
+
"time_step_scale": 1.0,
|
| 25 |
+
"transformers_version": "4.50.1",
|
| 26 |
+
"use_bias": false,
|
| 27 |
+
"use_cache": true,
|
| 28 |
+
"use_conv_bias": true,
|
| 29 |
+
"vocab_size": 32000
|
| 30 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/mamba_340M.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 1,
|
| 3 |
+
"conv_kernel": 4,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"expand": 2,
|
| 6 |
+
"fuse_cross_entropy": true,
|
| 7 |
+
"fuse_norm": true,
|
| 8 |
+
"hidden_act": "silu",
|
| 9 |
+
"hidden_size": 1024,
|
| 10 |
+
"initializer_range": 0.02,
|
| 11 |
+
"model_type": "mamba",
|
| 12 |
+
"norm_eps": 1e-05,
|
| 13 |
+
"num_hidden_layers": 48,
|
| 14 |
+
"pad_token_id": 0,
|
| 15 |
+
"rescale_prenorm_residual": false,
|
| 16 |
+
"residual_in_fp32": false,
|
| 17 |
+
"state_size": 16,
|
| 18 |
+
"tie_word_embeddings": false,
|
| 19 |
+
"time_step_floor": 0.0001,
|
| 20 |
+
"time_step_init_scheme": "random",
|
| 21 |
+
"time_step_max": 0.1,
|
| 22 |
+
"time_step_min": 0.001,
|
| 23 |
+
"time_step_rank": 128,
|
| 24 |
+
"time_step_scale": 1.0,
|
| 25 |
+
"transformers_version": "4.50.1",
|
| 26 |
+
"use_bias": false,
|
| 27 |
+
"use_cache": true,
|
| 28 |
+
"use_conv_bias": true,
|
| 29 |
+
"vocab_size": 32000
|
| 30 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/samba_1B.json
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attn": {
|
| 3 |
+
"layers": [
|
| 4 |
+
1,
|
| 5 |
+
3,
|
| 6 |
+
5,
|
| 7 |
+
7,
|
| 8 |
+
9,
|
| 9 |
+
11,
|
| 10 |
+
13,
|
| 11 |
+
15,
|
| 12 |
+
17
|
| 13 |
+
],
|
| 14 |
+
"num_heads": 18,
|
| 15 |
+
"num_kv_heads": 18,
|
| 16 |
+
"qkv_bias": false,
|
| 17 |
+
"rope_theta": 10000.0,
|
| 18 |
+
"window_size": 2048
|
| 19 |
+
},
|
| 20 |
+
"bos_token_id": 1,
|
| 21 |
+
"conv_kernel": 4,
|
| 22 |
+
"eos_token_id": 2,
|
| 23 |
+
"expand": 2,
|
| 24 |
+
"fuse_cross_entropy": true,
|
| 25 |
+
"fuse_norm": true,
|
| 26 |
+
"fuse_swiglu": true,
|
| 27 |
+
"hidden_act": "swish",
|
| 28 |
+
"hidden_ratio": 4,
|
| 29 |
+
"hidden_size": 2304,
|
| 30 |
+
"initializer_range": 0.02,
|
| 31 |
+
"intermediate_size": 4608,
|
| 32 |
+
"max_position_embeddings": 2048,
|
| 33 |
+
"model_type": "samba",
|
| 34 |
+
"norm_eps": 1e-05,
|
| 35 |
+
"num_hidden_layers": 18,
|
| 36 |
+
"pad_token_id": 0,
|
| 37 |
+
"rescale_prenorm_residual": false,
|
| 38 |
+
"residual_in_fp32": false,
|
| 39 |
+
"state_size": 16,
|
| 40 |
+
"tie_word_embeddings": false,
|
| 41 |
+
"time_step_floor": 0.0001,
|
| 42 |
+
"time_step_init_scheme": "random",
|
| 43 |
+
"time_step_max": 0.1,
|
| 44 |
+
"time_step_min": 0.001,
|
| 45 |
+
"time_step_rank": 144,
|
| 46 |
+
"time_step_scale": 1.0,
|
| 47 |
+
"transformers_version": "4.50.1",
|
| 48 |
+
"use_bias": false,
|
| 49 |
+
"use_cache": true,
|
| 50 |
+
"use_conv_bias": true,
|
| 51 |
+
"vocab_size": 32000
|
| 52 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/sba_340m.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attention_bias": false,
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"fuse_cross_entropy": true,
|
| 6 |
+
"fuse_norm": true,
|
| 7 |
+
"hidden_act": "swish",
|
| 8 |
+
"hidden_size": 1024,
|
| 9 |
+
"initializer_range": 0.006,
|
| 10 |
+
"max_position_embeddings": 8192,
|
| 11 |
+
"model_type": "sba",
|
| 12 |
+
"num_heads": 16,
|
| 13 |
+
"num_hidden_layers": 24,
|
| 14 |
+
"norm_eps": 1e-06,
|
| 15 |
+
"tie_word_embeddings": false,
|
| 16 |
+
"use_cache": true,
|
| 17 |
+
"vocab_size": 32000
|
| 18 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/transformer_1B.json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 1,
|
| 3 |
+
"elementwise_affine": true,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"fuse_cross_entropy": true,
|
| 6 |
+
"fuse_norm": true,
|
| 7 |
+
"fuse_swiglu": true,
|
| 8 |
+
"hidden_act": "swish",
|
| 9 |
+
"hidden_ratio": 4,
|
| 10 |
+
"hidden_size": 2048,
|
| 11 |
+
"initializer_range": 0.02,
|
| 12 |
+
"intermediate_size": null,
|
| 13 |
+
"max_position_embeddings": 8192,
|
| 14 |
+
"model_type": "transformer",
|
| 15 |
+
"norm_eps": 1e-06,
|
| 16 |
+
"num_heads": 32,
|
| 17 |
+
"num_hidden_layers": 24,
|
| 18 |
+
"num_kv_heads": null,
|
| 19 |
+
"pad_token_id": 2,
|
| 20 |
+
"rope_theta": 10000.0,
|
| 21 |
+
"tie_word_embeddings": false
|
| 22 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/transformer_340M.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attention_bias": false,
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"fuse_cross_entropy": true,
|
| 6 |
+
"fuse_norm": true,
|
| 7 |
+
"hidden_act": "swish",
|
| 8 |
+
"hidden_size": 1024,
|
| 9 |
+
"initializer_range": 0.02,
|
| 10 |
+
"max_position_embeddings": 8192,
|
| 11 |
+
"model_type": "transformer",
|
| 12 |
+
"num_heads": 16,
|
| 13 |
+
"num_hidden_layers": 24,
|
| 14 |
+
"norm_eps": 1e-06,
|
| 15 |
+
"tie_word_embeddings": false,
|
| 16 |
+
"use_cache": true,
|
| 17 |
+
"vocab_size": 32000
|
| 18 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/transformer_7B.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attention_bias": false,
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"fuse_cross_entropy": true,
|
| 6 |
+
"fuse_norm": true,
|
| 7 |
+
"hidden_act": "swish",
|
| 8 |
+
"hidden_ratio": 4,
|
| 9 |
+
"hidden_size": 4096,
|
| 10 |
+
"initializer_range": 0.02,
|
| 11 |
+
"intermediate_size": 14336,
|
| 12 |
+
"model_type": "transformer",
|
| 13 |
+
"norm_eps": 1e-06,
|
| 14 |
+
"num_heads": 32,
|
| 15 |
+
"num_hidden_layers": 32,
|
| 16 |
+
"num_kv_heads": 8,
|
| 17 |
+
"rope_theta": 10000.0,
|
| 18 |
+
"tie_word_embeddings": false,
|
| 19 |
+
"use_cache": true,
|
| 20 |
+
"window_size": null
|
| 21 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/generation_config.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"transformers_version": "4.53.3"
|
| 6 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/0/stderr.log
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:01ca6611958ad6032d4edeeff70cbced189831c7377a0d51d45295d800608176
|
| 3 |
+
size 28967020
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/0/stdout.log
ADDED
|
File without changes
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/1/stderr.log
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0400a66f534cf20e161ed17fc4c43e5af67548222b5b1dd08a95ee48c7509f55
|
| 3 |
+
size 28965495
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/1/stdout.log
ADDED
|
File without changes
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/2/stderr.log
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:98ccf26ea910a98e3c8667a099548c305e800682d3c67595e736442d684f00d1
|
| 3 |
+
size 28965482
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/2/stdout.log
ADDED
|
File without changes
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/3/stderr.log
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ab73f37388b964be320787d56364d2a2f150cdba3a5f2fb0b1e11539fc3d9f0b
|
| 3 |
+
size 28965482
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/3/stdout.log
ADDED
|
File without changes
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/4/stderr.log
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:76ff987941147c2805732a8f40dad209c7e620ea25e67a9b54ae1194eec3aefd
|
| 3 |
+
size 28965439
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/4/stdout.log
ADDED
|
File without changes
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/5/stderr.log
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:57408e049045a8149f91bbb25069512d449c8d0025721db53ef2f2bbf22e590c
|
| 3 |
+
size 28965482
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/5/stdout.log
ADDED
|
File without changes
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/6/stderr.log
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:66451df5da256bdad7f31ee0017121a643a962158ac753db9a5594a84a9ae7b7
|
| 3 |
+
size 28965491
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/6/stdout.log
ADDED
|
File without changes
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/7/stderr.log
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3604399b4204803d0a1fa31bf0ad96addc1ff316d5c1e2265ae0c6b2673d2523
|
| 3 |
+
size 28965484
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_z0tiim1_/attempt_0/7/stdout.log
ADDED
|
File without changes
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a7e377dafd4fe52f30ee081c674f669620d072a4ec98b9c278dc239233984f6f
|
| 3 |
+
size 1564281448
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/model_size=391m
ADDED
|
File without changes
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/special_tokens_map.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "</s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"unk_token": {
|
| 17 |
+
"content": "<unk>",
|
| 18 |
+
"lstrip": false,
|
| 19 |
+
"normalized": false,
|
| 20 |
+
"rstrip": false,
|
| 21 |
+
"single_word": false
|
| 22 |
+
}
|
| 23 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/tb/20250723-1049/events.out.tfevents.1753238968.TENCENT64.site.2520914.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:86ccb5e4c12d2c9b871f0a88179c24fc8ec3dbccac753b47b0578c1f636804ae
|
| 3 |
+
size 97469568
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/tokenizer_config.json
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": true,
|
| 3 |
+
"add_eos_token": false,
|
| 4 |
+
"add_prefix_space": null,
|
| 5 |
+
"added_tokens_decoder": {
|
| 6 |
+
"0": {
|
| 7 |
+
"content": "<unk>",
|
| 8 |
+
"lstrip": false,
|
| 9 |
+
"normalized": false,
|
| 10 |
+
"rstrip": false,
|
| 11 |
+
"single_word": false,
|
| 12 |
+
"special": true
|
| 13 |
+
},
|
| 14 |
+
"1": {
|
| 15 |
+
"content": "<s>",
|
| 16 |
+
"lstrip": false,
|
| 17 |
+
"normalized": false,
|
| 18 |
+
"rstrip": false,
|
| 19 |
+
"single_word": false,
|
| 20 |
+
"special": true
|
| 21 |
+
},
|
| 22 |
+
"2": {
|
| 23 |
+
"content": "</s>",
|
| 24 |
+
"lstrip": false,
|
| 25 |
+
"normalized": false,
|
| 26 |
+
"rstrip": false,
|
| 27 |
+
"single_word": false,
|
| 28 |
+
"special": true
|
| 29 |
+
}
|
| 30 |
+
},
|
| 31 |
+
"additional_special_tokens": [],
|
| 32 |
+
"bos_token": "<s>",
|
| 33 |
+
"clean_up_tokenization_spaces": false,
|
| 34 |
+
"eos_token": "</s>",
|
| 35 |
+
"extra_special_tokens": {},
|
| 36 |
+
"legacy": true,
|
| 37 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 38 |
+
"pad_token": null,
|
| 39 |
+
"sp_model_kwargs": {},
|
| 40 |
+
"spaces_between_special_tokens": false,
|
| 41 |
+
"tokenizer_class": "LlamaTokenizerFast",
|
| 42 |
+
"unk_token": "<unk>",
|
| 43 |
+
"use_default_system_prompt": false
|
| 44 |
+
}
|
gdn_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/train.sh
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
#
# Launch a distributed `flame.train` run with torchrun, then convert the final
# DCP checkpoint to Hugging Face format.
#
# Every command-line argument is forwarded verbatim to `flame.train`. The
# values of --job.dump_folder, --training.steps, --model.config and
# --model.tokenizer_path are additionally read here to snapshot the code into
# the dump folder, name the run, and drive the conversion step, so they must
# be passed in the space-separated "--flag value" form.
#
# Environment overrides:
#   NNODE, NGPU, LOG_RANK       torchrun topology / rank log filter (1, 8, 0)
#   MASTER_ADDR, MASTER_PORT    rendezvous endpoint (localhost, 0)
#   date                        timestamp suffix of the run id (default: now)
#   WANDB_PROJECT, WANDB_NAME, WANDB_RUN_ID
#                               only filled in here when unset or empty

# Flattened copy of the arguments, used ONLY for the flag lookups below.
# The launch itself forwards "$@" so argument boundaries are preserved.
params="$*"

# use envs as local params for convenience
# e.g.
# NNODE=1 NGPU=8 LOG_RANK=0 ./train.sh
NNODE=${NNODE:-"1"}
NGPU=${NGPU:-"8"}
LOG_RANK=${LOG_RANK:-0}

if [[ -z "${MASTER_ADDR:-}" ]]; then
  export MASTER_ADDR="localhost"
fi
if [[ -z "${MASTER_PORT:-}" ]]; then
  export MASTER_PORT="0"
fi

: '
Usage:

  bash train.sh -h

Training a 340M model:

  NNODE=1 NGPU=8 LOG_RANK=0 bash train.sh \
    --job.config_file flame/models/fla.toml \
    --job.dump_folder exp/transformer-340M-10B/batch32.seqlen2048.warmup1024.update1.steps20480.lr3e-4 \
    --model.config configs/transformer_340M.json \
    --model.tokenizer_path fla-hub/transformer-1.3B-100B \
    --optimizer.name AdamW \
    --optimizer.eps 1e-15 \
    --optimizer.lr 3e-4 \
    --lr_scheduler.warmup_steps 1024 \
    --lr_scheduler.lr_min 0.1 \
    --lr_scheduler.decay_type cosine \
    --training.batch_size 32 \
    --training.seq_len 2048 \
    --training.gradient_accumulation_steps 1 \
    --training.steps 20480 \
    --training.max_norm 1.0 \
    --training.skip_nan_inf \
    --training.dataset HuggingFaceFW/fineweb-edu \
    --training.dataset_name default \
    --training.dataset_split train \
    --training.streaming \
    --training.num_workers 32 \
    --training.prefetch_factor 2 \
    --training.seed 42 \
    --training.compile \
    --training.tensor_parallel_degree 1 \
    --training.disable_loss_parallel \
    --checkpoint.interval 2048 \
    --checkpoint.load_step -1 \
    --metrics.log_freq 1
'

echo "Launching training..."

set -x
# Pull the few values this wrapper needs out of the forwarded arguments.
# Dots are escaped so that they only match a literal "." in the flag name.
path=$(grep -oP '(?<=--job\.dump_folder )[^ ]+' <<< "$params")
steps=$(grep -oP '(?<=--training\.steps )[^ ]+' <<< "$params")
config=$(grep -oP '(?<=--model\.config )[^ ]+' <<< "$params")
tokenizer=$(grep -oP '(?<=--model\.tokenizer_path )[^ ]+' <<< "$params")

# Without a dump folder the copies below would lose their destination operand
# (e.g. `cp a.sh b.sh` would overwrite one script with another), so stop here.
if [[ -z "$path" ]]; then
  echo "error: --job.dump_folder <dir> must be passed on the command line" >&2
  exit 1
fi

model=$(
  python -c "import fla, sys; from transformers import AutoConfig; print(AutoConfig.from_pretrained(sys.argv[1]).to_json_string())" "$config" | jq -r '.model_type'
)

# Snapshot the launch scripts, configs and library sources next to the run.
mkdir -p -- "$path"
cp -- ./*.sh "$path"
cp -r -- configs "$path"
cp -r -- flame "$path"
cp -r -- 3rdparty/flash-linear-attention/fla "$path"
cp -r -- 3rdparty/torchtitan/torchtitan "$path"

# for offline systems
# export TRANSFORMERS_OFFLINE=1
# export HF_DATASETS_OFFLINE=1
# export HF_HUB_OFFLINE=1
if [[ -z "${date:-}" ]]; then
  date=$(date +%Y%m%d%H%M)
fi
RUN_NAME="$model-$(basename -- "$path")"
RUN_ID="$RUN_NAME-$date"

export WANDB_RESUME=allow
if [[ -z "${WANDB_PROJECT:-}" ]]; then
  export WANDB_PROJECT="fla"
fi
if [[ -z "${WANDB_NAME:-}" ]]; then
  export WANDB_NAME="$RUN_NAME"
fi
if [[ -z "${WANDB_RUN_ID:-}" ]]; then
  export WANDB_RUN_ID="$RUN_ID"
fi

PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True" \
torchrun --nnodes="${NNODE}" \
  --nproc_per_node="${NGPU}" \
  --rdzv_backend c10d \
  --rdzv_endpoint "${MASTER_ADDR}:${MASTER_PORT}" \
  --local-ranks-filter "${LOG_RANK}" \
  --role rank \
  --tee 3 \
  --log-dir "$path/logs" \
  -m flame.train \
  "$@"
train_status=$?

# Do not report success or attempt a conversion when training itself failed;
# hand the launcher's exit code back to the caller instead.
if (( train_status != 0 )); then
  echo "error: training exited with status ${train_status}; skipping checkpoint conversion" >&2
  exit "$train_status"
fi

echo "TRAINING DONE!"
echo "Converting the DCP checkpoints to HF format..."

python -m flame.utils.convert_dcp_to_hf \
  --path "$path" \
  --step "$steps" \
  --config "$config" \
  --tokenizer "$tokenizer"

echo "RUNNING DONE!"
|