File size: 3,488 Bytes
10f998d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# ------------------
# Runtime environment setup for the pretrain launch.
# Use expandable CUDA allocator segments to reduce fragmentation OOMs.
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
# BUGFIX: the original line was `export USER=whoami`, which set USER to the
# literal string "whoami". Capture the actual user name via command substitution.
export USER="$(whoami)"
# Activate the project virtualenv (cluster path).
source /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/.venv/bin/activate
# ------------------

# Fail fast: abort on any command error, and propagate failures through pipes.
set -eo pipefail
# ------------------

# Run from the pretrain scripts directory so the template script resolves
# its relative paths correctly. `set -e` aborts here if the cd fails.
cd /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe-dev/YuLan-Pretrain/scripts/pretrain

# Launch the pretraining template with all hyperparameters passed as
# environment variables in the command prefix. Assignments are evaluated
# left-to-right, so $MP_SIZE in EXTRA_ARGS below sees the value set above it.
# NOTE(review): underscore-separated numbers (e.g. 2_000_000_000) are assumed
# to be parsed by the template script — confirm it strips/accepts underscores.
# FIX: quote $0 inside the command substitution (SC2086) so a script path
# containing spaces cannot word-split or glob.
LAUNCH_SCRIPT_PATH="$(realpath "$0")" \
DATA_PATH="/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe-dev/cache/datasets/huggingface/Teaven/combine_2B_0908/binidx" \
OUTPUT_CHECKPOINT_PATH="/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe-dev/megatron_lm_workspace" \
BATCH_SIZE=1 GLOBAL_BATCH_SIZE=1024 \
TRAIN_TOKENS=2_000_000_000 LR_WARMUP_TOKENS=100_000_000 SAVE_TOKENS=1_000_000_000 \
LR_DECAY_STYLE='linear' LR_DECAY_TOKENS=2_000_000_000 \
LR=2e-5 MIN_LR=7e-7 \
MP_SIZE=2 PP_SIZE=1 CP_SIZE=1 \
TOKENIZER_TYPE="hf_tokenizer_yulan_mini" \
ACTIVATION_CHECKPOINT='true' \
NAME_PREFIX='dev-' \
HYBRID_ATTN=0.0625 \
HYBRID_MLP_RATIO=0.5 \
MAMBA_HEAD_DIM=64 \
MAMBA_NUM_GROUPS=6 \
MAMBA_STATE_DIM=320 \
MAMBA_EXPAND=1 \
NUM_LAYERS=112 \
MODEL_SIZE='2.9b' \
HIDDEN_SIZE=1920 \
NUM_ATTN_HEADS=30 \
NUM_QUERY_GROUPS=6 \
ROTARY_BASE=10000 \
MOE_FFN_HIDDEN_SIZE=4800 \
NUM_EXPERTS=0 \
SEQ_LEN=4096 \
TIE_EMBEDDING=false \
FREEZE_NON_MAMBA=false \
LOAD_FROM_CHECKPOINT='attn_mamba' \
HYBRID_OVERRIDE_PATTERN_TYPE='A0' \
CHECKPOINT_LOAD_PATH='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe-dev/cache/models/distill/L56-D1920-qwen_mamba2_qwen2-e1-i1920-s320-hd64-gn6-A0-S512-step1/rwkv-final-hf-A7-0_8_16_24_32_40_48/megatron-pp1-tp2' \
EXTRA_ARGS="--log-params-norm --no-save-step-one --ckpt-format torch --encoder-tensor-model-parallel-size $MP_SIZE --no-load-optim --no-load-rng" \
bash mamba_moe_0.5b_pretrain_template.sh

# SEQ_LEN, ROTARY_BASE, MAMBA_STATE_DIM, MODEL_SIZE, LOAD_FROM_CHECKPOINT, CHECKPOINT_LOAD_PATH, HYBRID_ATTN
# LOAD_FROM_CHECKPOINT = none / attn_only / attn_mamba

    # group.add_argument('--hybrid-override-pattern', type=str, default=None,
    #                    help='Force a specific hybrid layer pattern. The value'
    #                         'should be a string of characters chosen from'
    #                         'core.ssm.mamba_hybrid_layer_allocation.Symbols.'
    #                         'If a value greater than 0.0 is supplied to any of the '
    #                         'hybrid ratio arguments, then the number of each type'
    #                         'of layer in the override pattern must match number in'
    #                         'the overridden pattern')

# M0 type:
# M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-
# A0 type:
# *-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-
# A01 type:
# *-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-
# M01 type:
# M-M-M-M-M-M-M-*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-
# Nemo_A7_M49_F49
# M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M-M-
# yulanmini
# *-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-

# else or no this argument:
# No override