Upload folder launch_script to dsv3_0.5b
launch_script/mamba_moe_0.5b_pretrain_template.sh
ADDED
@@ -0,0 +1,339 @@
#!/bin/bash
# Mamba 0.5B total MoE
export CUDA_DEVICE_MAX_CONNECTIONS=1
export OMP_NUM_THREADS=4
export TRITON_CACHE_DIR="./triton-cache/mamba-moe/"

# Dir Arguments
DIR=`pwd`
PRETRAINED_CKPT_ROOT_PATH=${PRETRAINED_CKPT_ROOT_PATH:-"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe-dev/megatron_lm_workspace"}
PRETRAINED_CKPT_ID=${PRETRAINED_CKPT_ID:-"NOT_EXISTS"}
PRETRAINED_CKPT_NAME=${PRETRAINED_CKPT_NAME:-"NOT_EXISTS"}
OUTPUT_CHECKPOINT_PATH=${OUTPUT_CHECKPOINT_PATH:-"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe-dev/megatron_lm_workspace"}

# Training Arguments
SEQ_LEN=${SEQ_LEN:-4096}
BATCH_SIZE=${BATCH_SIZE:-1}
GLOBAL_BATCH_SIZE=${GLOBAL_BATCH_SIZE:-4096}
MP_SIZE=${MP_SIZE:-1}
PP_SIZE=${PP_SIZE:-1}
EP_SIZE=${EP_SIZE:-1}
CP_SIZE=${CP_SIZE:-1}
ACTIVATION_CHECKPOINT=${ACTIVATION_CHECKPOINT:-"false"}
LOG_INTERVAL=${LOG_INTERVAL:-1}

# Learning Rate Arguments
LR=${LR:-"2e-3"}
MIN_LR=${MIN_LR:-"3.0e-5"}
LR_DECAY_STYLE=${LR_DECAY_STYLE:-"linear"}
TRAIN_TOKENS=${TRAIN_TOKENS:-1_000_000_000}
LR_WARMUP_TOKENS=${LR_WARMUP_TOKENS:-10_000_000}
LR_DECAY_TOKENS=${LR_DECAY_TOKENS:-990_000_000}
SAVE_TOKENS=${SAVE_TOKENS:-1_000_000_000}

# Sample-based training
TRAIN_SAMPLES=$(( ${TRAIN_TOKENS//_/} / ${SEQ_LEN} ))
LR_DECAY_SAMPLES=$(( ${LR_DECAY_TOKENS//_/} / ${SEQ_LEN} ))
LR_WARMUP_SAMPLES=$(( ${LR_WARMUP_TOKENS//_/} / ${SEQ_LEN} ))
SAVE_INTERVAL=$(( ${SAVE_TOKENS//_/} / ${SEQ_LEN} / ${GLOBAL_BATCH_SIZE} ))

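# Worked example with the defaults above: TRAIN_TOKENS=1_000_000_000 and SEQ_LEN=4096
# give TRAIN_SAMPLES = 1000000000 / 4096 = 244140 (integer division), and
# SAVE_INTERVAL = 1000000000 / 4096 / 4096 = 59 iterations at GLOBAL_BATCH_SIZE=4096,
# i.e. a checkpoint roughly every 1B tokens.
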
# MoE Arguments
MOE_FFN_HIDDEN_SIZE=${MOE_FFN_HIDDEN_SIZE:-768}
MOE_TOPK=${MOE_TOPK:-2}
NUM_EXPERTS=${NUM_EXPERTS:-16}
NUM_SHARED_EXPERTS=${NUM_SHARED_EXPERTS:-0}
LOAD_BALANCING=${LOAD_BALANCING:-"dsv3"}
MOE_ROUTER_SCORE_FUNCTION=${MOE_ROUTER_SCORE_FUNCTION:-"sigmoid"}
MOE_EXPERT_CAPACITY_FACTOR=${MOE_EXPERT_CAPACITY_FACTOR:-2}
MOE_ROUTER_BIAS_UPDATE_RATE=${MOE_ROUTER_BIAS_UPDATE_RATE:-1e-3}

# Model Arguments
INIT_STD=${INIT_STD:-0.02}
NUM_LAYERS=${NUM_LAYERS:-12}
HIDDEN_SIZE=${HIDDEN_SIZE:-1024}
NUM_ATTN_HEADS=${NUM_ATTN_HEADS:-16}
NUM_QUERY_GROUPS=${NUM_QUERY_GROUPS:-2}
ROTARY_BASE=${ROTARY_BASE:-"100000"}
TIE_EMBEDDING=${TIE_EMBEDDING:-"true"}

# Multi-node Arguments
GPUS_PER_NODE=${GPUS_PER_NODE:-8}
MASTER_ADDR=${MASTER_ADDR:-"localhost"}
MASTER_PORT=${MASTER_PORT:-"6000"}
NNODES=${PET_NNODES:-"1"}
NODE_RANK=${PET_NODE_RANK:-"0"}
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

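# With these defaults WORLD_SIZE = 8 * 1 = 8. In Megatron-LM the data-parallel
# size works out to WORLD_SIZE / (MP_SIZE * PP_SIZE * CP_SIZE), and
# GLOBAL_BATCH_SIZE must be divisible by BATCH_SIZE times that value
# (e.g. MP_SIZE=2 on a single 8-GPU node leaves a data-parallel size of 4).
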
# hybrid mamba arguments
MAMBA_HEAD_DIM=${MAMBA_HEAD_DIM:-64}
MAMBA_NUM_GROUPS=${MAMBA_NUM_GROUPS:-6}
MAMBA_STATE_DIM=${MAMBA_STATE_DIM:-128}
MAMBA_EXPAND=${MAMBA_EXPAND:-2}
FREEZE_NON_MAMBA=${FREEZE_NON_MAMBA:-false}
HYBRID_OVERRIDE_PATTERN_TYPE=${HYBRID_OVERRIDE_PATTERN_TYPE:-None}
HYBRID_MLP_RATIO=${HYBRID_MLP_RATIO:-0.5}

EXTRA_ARGS=${EXTRA_ARGS:-""}

# ###################################################
# ################# Process Arguments
# ###################################################

current_time=$(date "+%Y.%m.%d-%H.%M.%S")
JOB_ID=${TASK_UUID:-$current_time}
MODEL_SIZE=${MODEL_SIZE:-"unknown_size"}
NAME="${NAME_PREFIX}mamba_hybrid-${MODEL_SIZE}-${NUM_LAYERS}layers-q${NUM_ATTN_HEADS}-kv${NUM_QUERY_GROUPS}-hybrid${HYBRID_ATTN}-pattern_${HYBRID_OVERRIDE_PATTERN_TYPE}-mheaddim${MAMBA_HEAD_DIM}-mnumgroups${MAMBA_NUM_GROUPS}-mstatedim${MAMBA_STATE_DIM}-mexpand${MAMBA_EXPAND}-freeze_${FREEZE_NON_MAMBA}-ep${EP_SIZE}-mp${MP_SIZE}-pp${PP_SIZE}-cp${CP_SIZE}-lr${LR}-minlr${MIN_LR}-bs${GLOBAL_BATCH_SIZE}-gpus${WORLD_SIZE}-seqlen${SEQ_LEN}-loadyulan_${LOAD_FROM_CHECKPOINT}"
CHECKPOINT_PATH="${OUTPUT_CHECKPOINT_PATH}/checkpoint/${NAME}"
LOG_DIR="${OUTPUT_CHECKPOINT_PATH}/log/${JOB_ID}_${NAME}"
mkdir -p ${CHECKPOINT_PATH}
mkdir -p ${LOG_DIR}
ln -s $CHECKPOINT_PATH $LOG_DIR/checkpoint
echo $JOB_ID >> $CHECKPOINT_PATH/linked_runs.txt
cp $LAUNCH_SCRIPT_PATH ${LOG_DIR}
cp $0 ${LOG_DIR}
mkdir -p ${CHECKPOINT_PATH}/launch_script
cp $LAUNCH_SCRIPT_PATH ${CHECKPOINT_PATH}/launch_script
cp $0 ${CHECKPOINT_PATH}/launch_script

# attn_only / attn_mamba
if [ -n "${LOAD_FROM_CHECKPOINT}" ] && ( [ "${LOAD_FROM_CHECKPOINT}" = "attn_only" ] || [ "${LOAD_FROM_CHECKPOINT}" = "attn_mamba" ] ); then
    # assert "$CHECKPOINT_PATH/latest_checkpointed_iteration.txt" does not exist
    if [ -z "${CHECKPOINT_LOAD_PATH}" ]; then
        echo "ERROR: CHECKPOINT_LOAD_PATH is not set"
        exit 1
    fi
    if [ -f "$CHECKPOINT_PATH/latest_checkpointed_iteration.txt" ]; then
        echo -e "\033[31mCheckpoint '$CHECKPOINT_PATH' exists. Please check whether you want to continue training from the Yulan-Mini checkpoint.\033[0m"
        exit 1
    fi
    LOAD_CHECKPOINT_PATH="${CHECKPOINT_LOAD_PATH}"
    echo -e "\033[32mLoad from Yulan-Mini checkpoint $CHECKPOINT_LOAD_PATH\033[0m"
elif [ -f "$CHECKPOINT_PATH/latest_checkpointed_iteration.txt" ]; then
    LOAD_CHECKPOINT_PATH="${CHECKPOINT_PATH}"
    CONTINUE_TRAIN=${CONTINUE_TRAIN:-'true'}
    echo -e "\033[32mFound existing checkpoint $CHECKPOINT_PATH\033[0m"
else
    LOAD_CHECKPOINT_PATH="${PRETRAINED_CKPT_ROOT_PATH}/${PRETRAINED_CKPT_NAME}"
    CONTINUE_TRAIN=${CONTINUE_TRAIN:-'false'}
    echo -e "\033[32mCheckpoint '$CHECKPOINT_PATH' does not exist. Trying to load from '$LOAD_CHECKPOINT_PATH'\033[0m"
fi

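# Illustrative invocations for the three branches above (paths are placeholders):
#   LOAD_FROM_CHECKPOINT=attn_mamba CHECKPOINT_LOAD_PATH=/path/to/converted/ckpt ...
#       -> fresh run initialized from the given checkpoint; aborts if
#          ${CHECKPOINT_PATH} already contains a checkpoint.
#   LOAD_FROM_CHECKPOINT unset, checkpoint present in ${CHECKPOINT_PATH}
#       -> resume from ${CHECKPOINT_PATH} (CONTINUE_TRAIN defaults to true).
#   LOAD_FROM_CHECKPOINT unset, no checkpoint yet
#       -> fall back to ${PRETRAINED_CKPT_ROOT_PATH}/${PRETRAINED_CKPT_NAME}.
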
# setup tokenizer
TOKENIZER_TYPE=${TOKENIZER_TYPE:-'hf_tokenizer_qwen'}
DATA_PATH_CACHE="/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe-dev/cache"
if [[ ${TOKENIZER_TYPE} == "hf_tokenizer_qwen" ]]; then
    DATA_PATH_TOKENIZED="${DATA_PATH}/qwen2.5"
    TOKENIZER_ARGS="--tokenizer-type HuggingFaceTokenizer --tokenizer-model ../../tokenizer"
elif [[ ${TOKENIZER_TYPE} == "gpt2bpe" ]]; then
    DATA_PATH_TOKENIZED="${DATA_PATH}"
    TOKENIZER_ARGS="--vocab-file /volume/ailab4sci/models/gpt2/vocab.json --merge-file /volume/ailab4sci/models/gpt2/merges.txt"
elif [[ ${TOKENIZER_TYPE} == "hf_tokenizer_yulan_mini" ]]; then
    DATA_PATH_TOKENIZED="${DATA_PATH}/yulan_mini"
    TOKENIZER_ARGS="--tokenizer-type HuggingFaceTokenizer --tokenizer-model /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe-dev/cache/models/huggingface/yulan-team/YuLan-Mini"
else
    echo "ERROR: Unknown tokenizer type ${TOKENIZER_TYPE}"
    exit 1
fi

# setup embedding tying
if [[ "1${TIE_EMBEDDING}" == "1false" ]]; then
    EXTRA_ARGS="${EXTRA_ARGS} \
--untie-embeddings-and-output-weights
"
fi

# # moe
# if [[ ${LOAD_BALANCING} == "dsv3" ]]; then
# EXTRA_ARGS="${EXTRA_ARGS} \
# --moe-router-enable-expert-bias
# "
# LOAD_BALANCING=none
# fi
# if [ -n "$MOE_AUX_LOSS_COEFF" ]; then
# echo "ERROR: DeepSeek V3 does not support MOE_AUX_LOSS_COEFF=$MOE_AUX_LOSS_COEFF"
# exit 1
# fi

# ###################################################
# ################# models
# ###################################################


DISTRIBUTED_ARGS=(
    --nproc_per_node $GPUS_PER_NODE
    --nnodes $NNODES
    --node_rank $NODE_RANK
    --master_addr $MASTER_ADDR
    --master_port $MASTER_PORT
)


MODEL_ARGS=(
    --use-mcore-models
    --hybrid-attention-ratio ${HYBRID_ATTN}
    --hybrid-mlp-ratio ${HYBRID_MLP_RATIO}
    --spec megatron.core.models.mamba.mamba_layer_specs mamba_moe_stack_spec
    --disable-bias-linear
    --add-qkv-bias
    --seq-length ${SEQ_LEN}
    --max-position-embeddings ${SEQ_LEN}
    --num-layers ${NUM_LAYERS}
    --hidden-size ${HIDDEN_SIZE}
    --ffn-hidden-size ${MOE_FFN_HIDDEN_SIZE}
    --num-attention-heads ${NUM_ATTN_HEADS}
    --init-method-std ${INIT_STD}
    --attention-dropout 0.0
    --hidden-dropout 0.0
    --normalization RMSNorm
    --position-embedding-type rope
    --swiglu
    --group-query-attention
    --num-query-groups ${NUM_QUERY_GROUPS}
    --no-masked-softmax-fusion
    --no-position-embedding
    --rotary-base ${ROTARY_BASE}
    --use-flash-attn
    --mamba-head-dim ${MAMBA_HEAD_DIM}
    --mamba-num-groups ${MAMBA_NUM_GROUPS}
    --mamba-state-dim ${MAMBA_STATE_DIM}
    --mamba-expand ${MAMBA_EXPAND}
)

if [ -n "${HYBRID_OVERRIDE_PATTERN_TYPE}" ]; then
    if [ "${HYBRID_OVERRIDE_PATTERN_TYPE}" = "M0" ]; then
        HYBRID_OVERRIDE_PATTERN="M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-"
        MODEL_ARGS+=(
            --hybrid-override-pattern ${HYBRID_OVERRIDE_PATTERN}
        )
    elif [ "${HYBRID_OVERRIDE_PATTERN_TYPE}" = "A0" ]; then
        HYBRID_OVERRIDE_PATTERN="*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-" # 112
        # HYBRID_OVERRIDE_PATTERN="*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-" # 96
        # HYBRID_OVERRIDE_PATTERN="*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-" # 80
        # HYBRID_OVERRIDE_PATTERN="*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-" # 64
        MODEL_ARGS+=(
            --hybrid-override-pattern ${HYBRID_OVERRIDE_PATTERN}
        )
    elif [ "${HYBRID_OVERRIDE_PATTERN_TYPE}" = "A01" ]; then
        HYBRID_OVERRIDE_PATTERN="*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-"
        MODEL_ARGS+=(
            --hybrid-override-pattern ${HYBRID_OVERRIDE_PATTERN}
        )
    elif [ "${HYBRID_OVERRIDE_PATTERN_TYPE}" = "M01" ]; then
        HYBRID_OVERRIDE_PATTERN="M-M-M-M-M-M-M-*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-"
        MODEL_ARGS+=(
            --hybrid-override-pattern ${HYBRID_OVERRIDE_PATTERN}
        )
    elif [ "${HYBRID_OVERRIDE_PATTERN_TYPE}" = "Nemo_A7_M49_F49" ]; then
        HYBRID_OVERRIDE_PATTERN="M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M-M-"
        MODEL_ARGS+=(
            --hybrid-override-pattern ${HYBRID_OVERRIDE_PATTERN}
        )
    elif [ "${HYBRID_OVERRIDE_PATTERN_TYPE}" = "yulanmini" ]; then
        HYBRID_OVERRIDE_PATTERN="*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-"
        MODEL_ARGS+=(
            --hybrid-override-pattern ${HYBRID_OVERRIDE_PATTERN}
        )
    fi
    echo -e "\033[32mHYBRID_OVERRIDE_PATTERN: ${HYBRID_OVERRIDE_PATTERN}\033[0m"
fi

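# Optional sanity check (added sketch; assumes Megatron's
# mamba_hybrid_layer_allocation symbols, one character per layer with
# M = Mamba, * = attention, - = MLP): the override pattern is expected to
# provide exactly NUM_LAYERS symbols, matching the 112/96/80/64 variants
# noted in the A0 branch above.
if [ -n "${HYBRID_OVERRIDE_PATTERN}" ] && [ "${#HYBRID_OVERRIDE_PATTERN}" -ne "${NUM_LAYERS}" ]; then
    echo -e "\033[33mWARNING: hybrid override pattern has ${#HYBRID_OVERRIDE_PATTERN} symbols, but NUM_LAYERS=${NUM_LAYERS}\033[0m"
fi
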
# MOE_ARGS=(
# --num-experts ${NUM_EXPERTS}
# --expert-tensor-parallel-size 1
# --moe-grouped-gemm
# --moe-router-topk ${MOE_TOPK}
# --moe-router-load-balancing-type ${LOAD_BALANCING}
# --moe-router-score-function sigmoid
# --moe-token-dispatcher-type alltoall
# --overlap-param-gather
# --overlap-grad-reduce
# --moe-expert-capacity-factor ${MOE_EXPERT_CAPACITY_FACTOR}
# --moe-router-bias-update-rate ${MOE_ROUTER_BIAS_UPDATE_RATE}
# )

MOE_ARGS=(
)

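# Note: MOE_ARGS is left empty, so the MoE variables defined near the top
# (NUM_EXPERTS, MOE_TOPK, LOAD_BALANCING, ...) take effect only if the
# commented-out block above is restored; as written, the model uses the dense
# --ffn-hidden-size MLP path.
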
TRAINING_ARGS=(
    --micro-batch-size ${BATCH_SIZE}
    --global-batch-size ${GLOBAL_BATCH_SIZE}
    --lr ${LR}
    --train-samples ${TRAIN_SAMPLES}
    --lr-warmup-samples ${LR_WARMUP_SAMPLES}
    --lr-decay-samples ${LR_DECAY_SAMPLES}
    --lr-decay-style ${LR_DECAY_STYLE}
    --min-lr ${MIN_LR}
    --split 100,0,0
    --weight-decay 0.1
    --clip-grad 0.5
    --num-workers 2
    --bf16
    --save ${CHECKPOINT_PATH}
    --load ${LOAD_CHECKPOINT_PATH}
    --overlap-param-gather
    --overlap-grad-reduce
)

if [ "1${FREEZE_NON_MAMBA}" = "1true" ]; then
    TRAINING_ARGS+=(
        --freeze-non-mamba
    )
fi

DATA_ARGS=(
    --data-path ${DATA_PATH_TOKENIZED}
    --data-cache-path ${DATA_PATH_CACHE}
    --no-create-attention-mask-in-dataloader
)

MODEL_PARALLEL_ARGS=(
    --tensor-model-parallel-size ${MP_SIZE}
    --pipeline-model-parallel-size ${PP_SIZE}
    --expert-model-parallel-size ${EP_SIZE}
    --use-distributed-optimizer
    --sequence-parallel
    --context-parallel-size ${CP_SIZE}
)

LOGGING_ARGS=(
    --log-interval ${LOG_INTERVAL}
    --log-throughput
    --save-interval ${SAVE_INTERVAL}
    --eval-interval 1000
    --eval-iters 10
    --tensorboard-dir ${LOG_DIR}
    --log-timers-to-tensorboard
    --log-memory-to-tensorboard
)

if [ -n "${WANDB_API_KEY}" ]; then
    LOGGING_ARGS+=(
        --wandb-project ${WANDB_PROJECT:-"DSV3"}
        --wandb-exp-name ${NAME}
    )
fi

if [ "1${ACTIVATION_CHECKPOINT}" = "1true" ]; then
    EXTRA_ARGS="${EXTRA_ARGS} \
--recompute-granularity selective
"
fi

if [ $NODE_RANK == "0" ]; then
    which torchrun >> ${LOG_DIR}/ENV-${HOSTNAME}.log
    python -V >> ${LOG_DIR}/ENV-${HOSTNAME}.log
    pip list >> ${LOG_DIR}/ENV-${HOSTNAME}.log
    env >> ${LOG_DIR}/ENV-${HOSTNAME}.log
    echo $(which torchrun) ${DISTRIBUTED_ARGS[@]} ../../pretrain_mamba.py ${MODEL_ARGS[@]} ${DATA_ARGS[@]} ${MOE_ARGS[@]} ${TRAINING_ARGS[@]} ${MODEL_PARALLEL_ARGS[@]} ${LOGGING_ARGS[@]} ${TOKENIZER_ARGS} ${EXTRA_ARGS} >> ${LOG_DIR}/ENV-${HOSTNAME}.log
fi
set -x

torchrun ${DISTRIBUTED_ARGS[@]} ../../pretrain_mamba.py \
    ${MODEL_ARGS[@]} \
    ${DATA_ARGS[@]} \
    ${MOE_ARGS[@]} \
    ${TRAINING_ARGS[@]} \
    ${MODEL_PARALLEL_ARGS[@]} \
    ${LOGGING_ARGS[@]} \
    ${TOKENIZER_ARGS} \
    ${EXTRA_ARGS} 2>&1 | tee ${LOG_DIR}/LOG_NODE_RANK_${NODE_RANK}.log

launch_script/run_1node_hybrid_mamba_pretrain.sh
ADDED
@@ -0,0 +1,72 @@
# ------------------
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export USER=$(whoami)
source /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/.venv/bin/activate
# ------------------

set -eo pipefail
# ------------------

cd /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe-dev/YuLan-Pretrain/scripts/pretrain

LAUNCH_SCRIPT_PATH="$(realpath $0)" \
DATA_PATH="/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe-dev/cache/datasets/huggingface/Teaven/combine_2B_0908/binidx" \
OUTPUT_CHECKPOINT_PATH="/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe-dev/megatron_lm_workspace" \
BATCH_SIZE=1 GLOBAL_BATCH_SIZE=1024 \
TRAIN_TOKENS=2_000_000_000 LR_WARMUP_TOKENS=100_000_000 SAVE_TOKENS=1_000_000_000 \
LR_DECAY_STYLE='linear' LR_DECAY_TOKENS=2_000_000_000 \
LR=2e-5 MIN_LR=7e-7 \
MP_SIZE=2 PP_SIZE=1 CP_SIZE=1 \
TOKENIZER_TYPE="hf_tokenizer_yulan_mini" \
ACTIVATION_CHECKPOINT='true' \
NAME_PREFIX='dev-' \
HYBRID_ATTN=0.0625 \
HYBRID_MLP_RATIO=0.5 \
MAMBA_HEAD_DIM=64 \
MAMBA_NUM_GROUPS=6 \
MAMBA_STATE_DIM=320 \
MAMBA_EXPAND=1 \
NUM_LAYERS=112 \
MODEL_SIZE='2.9b' \
HIDDEN_SIZE=1920 \
NUM_ATTN_HEADS=30 \
NUM_QUERY_GROUPS=6 \
ROTARY_BASE=10000 \
MOE_FFN_HIDDEN_SIZE=4800 \
NUM_EXPERTS=0 \
SEQ_LEN=4096 \
TIE_EMBEDDING=false \
FREEZE_NON_MAMBA=false \
LOAD_FROM_CHECKPOINT='attn_mamba' \
HYBRID_OVERRIDE_PATTERN_TYPE='A0' \
CHECKPOINT_LOAD_PATH='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe-dev/cache/models/distill/L56-D1920-qwen_mamba2_qwen2-e1-i1920-s320-hd64-gn6-A0-S512-step1/rwkv-final-hf-A7-0_8_16_24_32_40_48/megatron-pp1-tp2' \
EXTRA_ARGS="--log-params-norm --no-save-step-one --ckpt-format torch --encoder-tensor-model-parallel-size $MP_SIZE --no-load-optim --no-load-rng" \
bash mamba_moe_0.5b_pretrain_template.sh

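# Multi-node sketch (added example; assumes each node is given these variables,
# which the template reads as PET_NNODES / PET_NODE_RANK / MASTER_ADDR /
# MASTER_PORT):
#   PET_NNODES=2 PET_NODE_RANK=0 MASTER_ADDR=<node0-ip> MASTER_PORT=6000 bash run_1node_hybrid_mamba_pretrain.sh   # on node 0
#   PET_NNODES=2 PET_NODE_RANK=1 MASTER_ADDR=<node0-ip> MASTER_PORT=6000 bash run_1node_hybrid_mamba_pretrain.sh   # on node 1
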
# Variables to adjust per run: SEQ_LEN, ROTARY_BASE, MAMBA_STATE_DIM, MODEL_SIZE, LOAD_FROM_CHECKPOINT, CHECKPOINT_LOAD_PATH, HYBRID_ATTN
# LOAD_FROM_CHECKPOINT = none / attn_only / attn_mamba

# group.add_argument('--hybrid-override-pattern', type=str, default=None,
#                    help='Force a specific hybrid layer pattern. The value'
#                    'should be a string of characters chosen from'
#                    'core.ssm.mamba_hybrid_layer_allocation.Symbols.'
#                    'If a value greater than 0.0 is supplied to any of the '
#                    'hybrid ratio arguments, then the number of each type'
#                    'of layer in the override pattern must match number in'
#                    'the overidden pattern')

# M0 type:
# M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-
# A0 type:
# *-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-
# A01 type:
# *-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-
# M01 type:
# M-M-M-M-M-M-M-*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-
# Nemo_A7_M49_F49
# M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M-M-
# yulanmini
# *-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-

# otherwise, or if this argument is not set:
# No override