#!/bin/bash
# ------------------
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export USER=$(whoami)
source /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/.venv/bin/activate

# ------------------
set -eo pipefail

# ------------------
cd /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe-dev/YuLan-Pretrain/scripts/pretrain

# Bash expands the prefix assignments below left to right, so $MP_SIZE is
# already set by the time EXTRA_ARGS is expanded.
LAUNCH_SCRIPT_PATH="$(realpath "$0")" \
DATA_PATH="/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe-dev/cache/datasets/huggingface/Teaven/combine_2B_0908/binidx" \
OUTPUT_CHECKPOINT_PATH="/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe-dev/megatron_lm_workspace" \
BATCH_SIZE=1 GLOBAL_BATCH_SIZE=1024 \
TRAIN_TOKENS=2_000_000_000 LR_WARMUP_TOKENS=100_000_000 SAVE_TOKENS=1_000_000_000 \
LR_DECAY_STYLE='linear' LR_DECAY_TOKENS=2_000_000_000 \
LR=2e-5 MIN_LR=7e-7 \
MP_SIZE=2 PP_SIZE=1 CP_SIZE=1 \
TOKENIZER_TYPE="hf_tokenizer_yulan_mini" \
ACTIVATION_CHECKPOINT='true' \
NAME_PREFIX='dev-' \
HYBRID_ATTN=0.0625 \
HYBRID_MLP_RATIO=0.5 \
MAMBA_HEAD_DIM=64 \
MAMBA_NUM_GROUPS=6 \
MAMBA_STATE_DIM=320 \
MAMBA_EXPAND=1 \
NUM_LAYERS=112 \
MODEL_SIZE='2.9b' \
HIDDEN_SIZE=1920 \
NUM_ATTN_HEADS=30 \
NUM_QUERY_GROUPS=6 \
ROTARY_BASE=10000 \
MOE_FFN_HIDDEN_SIZE=4800 \
NUM_EXPERTS=0 \
SEQ_LEN=4096 \
TIE_EMBEDDING=false \
FREEZE_NON_MAMBA=false \
LOAD_FROM_CHECKPOINT='attn_mamba' \
HYBRID_OVERRIDE_PATTERN_TYPE='A0' \
CHECKPOINT_LOAD_PATH='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe-dev/cache/models/distill/L56-D1920-qwen_mamba2_qwen2-e1-i1920-s320-hd64-gn6-A0-S512-step1/rwkv-final-hf-A7-0_8_16_24_32_40_48/megatron-pp1-tp2' \
EXTRA_ARGS="--log-params-norm --no-save-step-one --ckpt-format torch --encoder-tensor-model-parallel-size $MP_SIZE --no-load-optim --no-load-rng" \
bash mamba_moe_0.5b_pretrain_template.sh

# Variables most often changed between runs: SEQ_LEN, ROTARY_BASE,
# MAMBA_STATE_DIM, MODEL_SIZE, LOAD_FROM_CHECKPOINT, CHECKPOINT_LOAD_PATH,
# HYBRID_ATTN.
# LOAD_FROM_CHECKPOINT = none / attn_only / attn_mamba
#
# From Megatron-LM's argument definitions:
# group.add_argument('--hybrid-override-pattern', type=str, default=None,
#                    help='Force a specific hybrid layer pattern. The value '
#                         'should be a string of characters chosen from '
#                         'core.ssm.mamba_hybrid_layer_allocation.Symbols. '
#                         'If a value greater than 0.0 is supplied to any of the '
#                         'hybrid ratio arguments, then the number of each type '
#                         'of layer in the override pattern must match the number '
#                         'in the overridden pattern.')
#
# Symbols (core.ssm.mamba_hybrid_layer_allocation.Symbols):
#   M = Mamba layer, * = attention layer, - = MLP layer.
# HYBRID_OVERRIDE_PATTERN_TYPE values:
#
# M0 type:
# M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-
# A0 type:
# *-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-
# A01 type:
# *-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-
# M01 type:
# M-M-M-M-M-M-M-*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-
# Nemo_A7_M49_F49 type:
# M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M-M-
# yulanmini type:
# *-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-
# Any other value, or leaving HYBRID_OVERRIDE_PATTERN_TYPE unset:
# no override pattern is applied.
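
# ------------------
# A minimal sanity-check sketch: count the symbols in an override pattern and
# compare the attention/MLP fractions against HYBRID_ATTN / HYBRID_MLP_RATIO.
# The helper names (count_symbol, check_hybrid_pattern) are illustrative
# assumptions, not part of the template script's interface.

# count_symbol PATTERN CHAR -> number of occurrences of CHAR in PATTERN.
count_symbol() {
    local stripped=${1//"$2"/}
    echo $(( ${#1} - ${#stripped} ))
}

# check_hybrid_pattern PATTERN EXPECTED_LAYERS EXPECTED_ATTN_RATIO EXPECTED_MLP_RATIO
check_hybrid_pattern() {
    local pattern=$1 expected_layers=$2 expected_attn=$3 expected_mlp=$4
    local n_total=${#pattern}   # every character is one layer, including '-'
    local n_mamba n_attn n_mlp
    n_mamba=$(count_symbol "$pattern" 'M')
    n_attn=$(count_symbol "$pattern" '*')
    n_mlp=$(count_symbol "$pattern" '-')
    echo "pattern length $n_total (expected $expected_layers): M=$n_mamba *=$n_attn -=$n_mlp"
    # Ratios are fractions of the total layer count; awk handles the float math,
    # since bash arithmetic is integer-only.
    awk -v a="$n_attn" -v m="$n_mlp" -v n="$n_total" \
        -v ea="$expected_attn" -v em="$expected_mlp" \
        'BEGIN { printf "attn %.4f (want %s), mlp %.4f (want %s)\n", a/n, ea, m/n, em }'
}

# Example with the A0 pattern used above (7 attention, 49 Mamba, 56 MLP layers):
# check_hybrid_pattern "$(printf '*-M-M-M-M-M-M-M-%.0s' {1..7})" 112 0.0625 0.5
# -> pattern length 112 (expected 112): M=49 *=7 -=56
# -> attn 0.0625 (want 0.0625), mlp 0.5000 (want 0.5)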