Original committed on
Commit 10f998d · verified · 1 Parent(s): fa05b84

Upload folder launch_script to dsv3_0.5b

launch_script/mamba_moe_0.5b_pretrain_template.sh ADDED
@@ -0,0 +1,339 @@
+ #!/bin/bash
+ # Mamba hybrid MoE pretraining template (0.5B total parameters)
+ export CUDA_DEVICE_MAX_CONNECTIONS=1
+ export OMP_NUM_THREADS=4
+ export TRITON_CACHE_DIR="./triton-cache/mamba-moe/"
+
+ # Dir Arguments
+ DIR=$(pwd)
+ PRETRAINED_CKPT_ROOT_PATH=${PRETRAINED_CKPT_ROOT_PATH:-"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe-dev/megatron_lm_workspace"}
+ PRETRAINED_CKPT_ID=${PRETRAINED_CKPT_ID:-"NOT_EXISTS"}
+ PRETRAINED_CKPT_NAME=${PRETRAINED_CKPT_NAME:-"NOT_EXISTS"}
+ OUTPUT_CHECKPOINT_PATH=${OUTPUT_CHECKPOINT_PATH:-"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe-dev/megatron_lm_workspace"}
+
+ # Training Arguments
+ SEQ_LEN=${SEQ_LEN:-4096}
+ BATCH_SIZE=${BATCH_SIZE:-1}
+ GLOBAL_BATCH_SIZE=${GLOBAL_BATCH_SIZE:-4096}
+ MP_SIZE=${MP_SIZE:-1}
+ PP_SIZE=${PP_SIZE:-1}
+ EP_SIZE=${EP_SIZE:-1}
+ CP_SIZE=${CP_SIZE:-1}
+ ACTIVATION_CHECKPOINT=${ACTIVATION_CHECKPOINT:-"false"}
+ LOG_INTERVAL=${LOG_INTERVAL:-1}
+
+ # Learning Rate Arguments
+ LR=${LR:-"2e-3"}
+ MIN_LR=${MIN_LR:-"3.0e-5"}
+ LR_DECAY_STYLE=${LR_DECAY_STYLE:-"linear"}
+ TRAIN_TOKENS=${TRAIN_TOKENS:-1_000_000_000}
+ LR_WARMUP_TOKENS=${LR_WARMUP_TOKENS:-10_000_000}
+ LR_DECAY_TOKENS=${LR_DECAY_TOKENS:-990_000_000}
+ SAVE_TOKENS=${SAVE_TOKENS:-1_000_000_000}
+
+ # Sample-based training: strip the '_' digit separators, then convert token budgets to sample counts
+ TRAIN_SAMPLES=$(( ${TRAIN_TOKENS//_/} / ${SEQ_LEN} ))
+ LR_DECAY_SAMPLES=$(( ${LR_DECAY_TOKENS//_/} / ${SEQ_LEN} ))
+ LR_WARMUP_SAMPLES=$(( ${LR_WARMUP_TOKENS//_/} / ${SEQ_LEN} ))
+ SAVE_INTERVAL=$(( ${SAVE_TOKENS//_/} / ${SEQ_LEN} / ${GLOBAL_BATCH_SIZE} ))
+
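+ # Worked example with the defaults above (integer division):
+ #   TRAIN_SAMPLES = 1000000000 / 4096        = 244140 samples
+ #   SAVE_INTERVAL = 1000000000 / 4096 / 4096 = 59 optimizer steps
+ # i.e. a checkpoint is saved roughly once per SAVE_TOKENS tokens consumed.
+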
+ # MoE Arguments
+ MOE_FFN_HIDDEN_SIZE=${MOE_FFN_HIDDEN_SIZE:-768}
+ MOE_TOPK=${MOE_TOPK:-2}
+ NUM_EXPERTS=${NUM_EXPERTS:-16}
+ NUM_SHARED_EXPERTS=${NUM_SHARED_EXPERTS:-0}
+ LOAD_BALANCING=${LOAD_BALANCING:-"dsv3"}
+ MOE_ROUTER_SCORE_FUNCTION=${MOE_ROUTER_SCORE_FUNCTION:-"sigmoid"}
+ MOE_EXPERT_CAPACITY_FACTOR=${MOE_EXPERT_CAPACITY_FACTOR:-2}
+ MOE_ROUTER_BIAS_UPDATE_RATE=${MOE_ROUTER_BIAS_UPDATE_RATE:-1e-3}
+
+ # Model Arguments
+ INIT_STD=${INIT_STD:-0.02}
+ NUM_LAYERS=${NUM_LAYERS:-12}
+ HIDDEN_SIZE=${HIDDEN_SIZE:-1024}
+ NUM_ATTN_HEADS=${NUM_ATTN_HEADS:-16}
+ NUM_QUERY_GROUPS=${NUM_QUERY_GROUPS:-2}
+ ROTARY_BASE=${ROTARY_BASE:-"100000"}
+ TIE_EMBEDDING=${TIE_EMBEDDING:-"true"}
+
+ # Multi-node Arguments
+ GPUS_PER_NODE=${GPUS_PER_NODE:-8}
+ MASTER_ADDR=${MASTER_ADDR:-"localhost"}
+ MASTER_PORT=${MASTER_PORT:-"6000"}
+ NNODES=${PET_NNODES:-"1"}
+ NODE_RANK=${PET_NODE_RANK:-"0"}
+ WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
+
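+ # Hypothetical two-node launch (addresses are illustrative): run the script on
+ # every node with its own PET_NODE_RANK, pointing MASTER_ADDR at rank 0:
+ #   PET_NNODES=2 PET_NODE_RANK=0 MASTER_ADDR=10.0.0.1 bash mamba_moe_0.5b_pretrain_template.sh
+ #   PET_NNODES=2 PET_NODE_RANK=1 MASTER_ADDR=10.0.0.1 bash mamba_moe_0.5b_pretrain_template.sh
+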
+ # Hybrid Mamba Arguments
+ MAMBA_HEAD_DIM=${MAMBA_HEAD_DIM:-64}
+ MAMBA_NUM_GROUPS=${MAMBA_NUM_GROUPS:-6}
+ MAMBA_STATE_DIM=${MAMBA_STATE_DIM:-128}
+ MAMBA_EXPAND=${MAMBA_EXPAND:-2}
+ FREEZE_NON_MAMBA=${FREEZE_NON_MAMBA:-false}
+ HYBRID_OVERRIDE_PATTERN_TYPE=${HYBRID_OVERRIDE_PATTERN_TYPE:-None}
+ HYBRID_MLP_RATIO=${HYBRID_MLP_RATIO:-0.5}
+
+ EXTRA_ARGS=${EXTRA_ARGS:-""}
+
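+ # Every variable above uses the ${VAR:-default} idiom, so any of them can be
+ # overridden from the caller's environment, e.g. (hypothetical values):
+ #   NUM_LAYERS=24 HIDDEN_SIZE=2048 LR=1e-3 bash mamba_moe_0.5b_pretrain_template.sh
+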
+ # ###################################################
+ # ################# Process Arguments
+ # ###################################################
+
+ current_time=$(date "+%Y.%m.%d-%H.%M.%S")
+ JOB_ID=${TASK_UUID:-$current_time}
+ MODEL_SIZE=${MODEL_SIZE:-"unknown_size"}
+ NAME="${NAME_PREFIX}mamba_hybrid-${MODEL_SIZE}-${NUM_LAYERS}layers-q${NUM_ATTN_HEADS}-kv${NUM_QUERY_GROUPS}-hybrid${HYBRID_ATTN}-pattern_${HYBRID_OVERRIDE_PATTERN_TYPE}-mheaddim${MAMBA_HEAD_DIM}-mnumgroups${MAMBA_NUM_GROUPS}-mstatedim${MAMBA_STATE_DIM}-mexpand${MAMBA_EXPAND}-freeze_${FREEZE_NON_MAMBA}-ep${EP_SIZE}-mp${MP_SIZE}-pp${PP_SIZE}-cp${CP_SIZE}-lr${LR}-minlr${MIN_LR}-bs${GLOBAL_BATCH_SIZE}-gpus${WORLD_SIZE}-seqlen${SEQ_LEN}-loadyulan_${LOAD_FROM_CHECKPOINT}"
+ CHECKPOINT_PATH="${OUTPUT_CHECKPOINT_PATH}/checkpoint/${NAME}"
+ LOG_DIR="${OUTPUT_CHECKPOINT_PATH}/log/${JOB_ID}_${NAME}"
+ mkdir -p ${CHECKPOINT_PATH}
+ mkdir -p ${LOG_DIR}
+ ln -s $CHECKPOINT_PATH $LOG_DIR/checkpoint
+ echo $JOB_ID >> $CHECKPOINT_PATH/linked_runs.txt
+ cp $LAUNCH_SCRIPT_PATH ${LOG_DIR}
+ cp $0 ${LOG_DIR}
+ mkdir -p ${CHECKPOINT_PATH}/launch_script
+ cp $LAUNCH_SCRIPT_PATH ${CHECKPOINT_PATH}/launch_script
+ cp $0 ${CHECKPOINT_PATH}/launch_script
+
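+ # Resulting per-run layout (sketch derived from the paths above):
+ #   ${OUTPUT_CHECKPOINT_PATH}/checkpoint/${NAME}/      # weights + launch_script/ copies
+ #   ${OUTPUT_CHECKPOINT_PATH}/log/${JOB_ID}_${NAME}/   # tensorboard, env dumps, stdout logs
+ #   ${LOG_DIR}/checkpoint -> ${CHECKPOINT_PATH}        # symlink created above
+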
+ # attn_only / attn_mamba
+ if [ -n "${LOAD_FROM_CHECKPOINT}" ] && ( [ "${LOAD_FROM_CHECKPOINT}" = "attn_only" ] || [ "${LOAD_FROM_CHECKPOINT}" = "attn_mamba" ] ); then
+     # assert "$CHECKPOINT_PATH/latest_checkpointed_iteration.txt" does not exist
+     if [ -z "${CHECKPOINT_LOAD_PATH}" ]; then
+         echo "ERROR: CHECKPOINT_LOAD_PATH is not set"
+         exit 1
+     fi
+     if [ -f "$CHECKPOINT_PATH/latest_checkpointed_iteration.txt" ]; then
+         echo -e "\033[31mCheckpoint '$CHECKPOINT_PATH' exists. Please check whether you want to continue training from the YuLan-Mini checkpoint.\033[0m"
+         exit 1
+     fi
+     LOAD_CHECKPOINT_PATH="${CHECKPOINT_LOAD_PATH}"
+     echo -e "\033[32mLoading from YuLan-Mini checkpoint $CHECKPOINT_LOAD_PATH\033[0m"
+ elif [ -f "$CHECKPOINT_PATH/latest_checkpointed_iteration.txt" ]; then
+     LOAD_CHECKPOINT_PATH="${CHECKPOINT_PATH}"
+     CONTINUE_TRAIN=${CONTINUE_TRAIN:-'true'}
+     echo -e "\033[32mFound existing checkpoint $CHECKPOINT_PATH\033[0m"
+ else
+     LOAD_CHECKPOINT_PATH="${PRETRAINED_CKPT_ROOT_PATH}/${PRETRAINED_CKPT_NAME}"
+     CONTINUE_TRAIN=${CONTINUE_TRAIN:-'false'}
+     echo -e "\033[32mCheckpoint '$CHECKPOINT_PATH' does not exist. Trying to load from '$LOAD_CHECKPOINT_PATH'\033[0m"
+ fi
+
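+ # Load-path precedence (recap of the branch above): an explicit distilled
+ # checkpoint (attn_only/attn_mamba) wins; otherwise resume from this run's own
+ # checkpoint if one exists; otherwise fall back to the pretrained checkpoint root.
+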
+ # setup tokenizer
+ TOKENIZER_TYPE=${TOKENIZER_TYPE:-'hf_tokenizer_qwen'}
+ DATA_PATH_CACHE="/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe-dev/cache"
+ if [[ ${TOKENIZER_TYPE} == "hf_tokenizer_qwen" ]]; then
+     DATA_PATH_TOKENIZED="${DATA_PATH}/qwen2.5"
+     TOKENIZER_ARGS="--tokenizer-type HuggingFaceTokenizer --tokenizer-model ../../tokenizer"
+ elif [[ ${TOKENIZER_TYPE} == "gpt2bpe" ]]; then
+     DATA_PATH_TOKENIZED="${DATA_PATH}"
+     TOKENIZER_ARGS="--vocab-file /volume/ailab4sci/models/gpt2/vocab.json --merge-file /volume/ailab4sci/models/gpt2/merges.txt"
+ elif [[ ${TOKENIZER_TYPE} == "hf_tokenizer_yulan_mini" ]]; then
+     DATA_PATH_TOKENIZED="${DATA_PATH}/yulan_mini"
+     TOKENIZER_ARGS="--tokenizer-type HuggingFaceTokenizer --tokenizer-model /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe-dev/cache/models/huggingface/yulan-team/YuLan-Mini"
+ else
+     echo "ERROR: Unknown tokenizer type ${TOKENIZER_TYPE}"
+     exit 1
+ fi
+
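+ # Each branch pairs a tokenizer with the matching pre-tokenized data directory,
+ # so the two must always be switched together, e.g.:
+ #   TOKENIZER_TYPE=hf_tokenizer_yulan_mini bash mamba_moe_0.5b_pretrain_template.sh
+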
+ # setup embedding tying; the "1" prefix keeps the comparison well-formed even when the variable is empty
+ if [[ "1${TIE_EMBEDDING}" == "1false" ]]; then
+     EXTRA_ARGS="${EXTRA_ARGS} \
+     --untie-embeddings-and-output-weights
+     "
+ fi
+
+ # # moe
+ # if [[ ${LOAD_BALANCING} == "dsv3" ]]; then
+ #     EXTRA_ARGS="${EXTRA_ARGS} \
+ #     --moe-router-enable-expert-bias
+ #     "
+ #     LOAD_BALANCING=none
+ # fi
+ # if [ -n "$MOE_AUX_LOSS_COEFF" ]; then
+ #     echo "ERROR: DeepSeek V3 does not support MOE_AUX_LOSS_COEFF=$MOE_AUX_LOSS_COEFF"
+ #     exit 1
+ # fi
+
+ # ###################################################
+ # ################# Models
+ # ###################################################
+
+
+ DISTRIBUTED_ARGS=(
+     --nproc_per_node $GPUS_PER_NODE
+     --nnodes $NNODES
+     --node_rank $NODE_RANK
+     --master_addr $MASTER_ADDR
+     --master_port $MASTER_PORT
+ )
+
+
+ MODEL_ARGS=(
+     --use-mcore-models
+     --hybrid-attention-ratio ${HYBRID_ATTN}
+     --hybrid-mlp-ratio ${HYBRID_MLP_RATIO}
+     --spec megatron.core.models.mamba.mamba_layer_specs mamba_moe_stack_spec
+     --disable-bias-linear
+     --add-qkv-bias
+     --seq-length ${SEQ_LEN}
+     --max-position-embeddings ${SEQ_LEN}
+     --num-layers ${NUM_LAYERS}
+     --hidden-size ${HIDDEN_SIZE}
+     --ffn-hidden-size ${MOE_FFN_HIDDEN_SIZE}
+     --num-attention-heads ${NUM_ATTN_HEADS}
+     --init-method-std ${INIT_STD}
+     --attention-dropout 0.0
+     --hidden-dropout 0.0
+     --normalization RMSNorm
+     --position-embedding-type rope
+     --swiglu
+     --group-query-attention
+     --num-query-groups ${NUM_QUERY_GROUPS}
+     --no-masked-softmax-fusion
+     --no-position-embedding
+     --rotary-base ${ROTARY_BASE}
+     --use-flash-attn
+     --mamba-head-dim ${MAMBA_HEAD_DIM}
+     --mamba-num-groups ${MAMBA_NUM_GROUPS}
+     --mamba-state-dim ${MAMBA_STATE_DIM}
+     --mamba-expand ${MAMBA_EXPAND}
+ )
+
+ # Skip when the type is unset or left at its "None" default, so no empty pattern is echoed
+ if [ -n "${HYBRID_OVERRIDE_PATTERN_TYPE}" ] && [ "${HYBRID_OVERRIDE_PATTERN_TYPE}" != "None" ]; then
+     if [ "${HYBRID_OVERRIDE_PATTERN_TYPE}" = "M0" ]; then
+         HYBRID_OVERRIDE_PATTERN="M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-"
+         MODEL_ARGS+=(
+             --hybrid-override-pattern ${HYBRID_OVERRIDE_PATTERN}
+         )
+     elif [ "${HYBRID_OVERRIDE_PATTERN_TYPE}" = "A0" ]; then
+         HYBRID_OVERRIDE_PATTERN="*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-" # 112 layers
+         # HYBRID_OVERRIDE_PATTERN="*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-" # 96 layers
+         # HYBRID_OVERRIDE_PATTERN="*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-" # 80 layers
+         # HYBRID_OVERRIDE_PATTERN="*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-" # 64 layers
+         MODEL_ARGS+=(
+             --hybrid-override-pattern ${HYBRID_OVERRIDE_PATTERN}
+         )
+     elif [ "${HYBRID_OVERRIDE_PATTERN_TYPE}" = "A01" ]; then
+         HYBRID_OVERRIDE_PATTERN="*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-"
+         MODEL_ARGS+=(
+             --hybrid-override-pattern ${HYBRID_OVERRIDE_PATTERN}
+         )
+     elif [ "${HYBRID_OVERRIDE_PATTERN_TYPE}" = "M01" ]; then
+         HYBRID_OVERRIDE_PATTERN="M-M-M-M-M-M-M-*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-"
+         MODEL_ARGS+=(
+             --hybrid-override-pattern ${HYBRID_OVERRIDE_PATTERN}
+         )
+     elif [ "${HYBRID_OVERRIDE_PATTERN_TYPE}" = "Nemo_A7_M49_F49" ]; then
+         HYBRID_OVERRIDE_PATTERN="M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M-M-"
+         MODEL_ARGS+=(
+             --hybrid-override-pattern ${HYBRID_OVERRIDE_PATTERN}
+         )
+     elif [ "${HYBRID_OVERRIDE_PATTERN_TYPE}" = "yulanmini" ]; then
+         HYBRID_OVERRIDE_PATTERN="*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-"
+         MODEL_ARGS+=(
+             --hybrid-override-pattern ${HYBRID_OVERRIDE_PATTERN}
+         )
+     fi
+     echo -e "\033[32mHYBRID_OVERRIDE_PATTERN: ${HYBRID_OVERRIDE_PATTERN}\033[0m"
+ fi
+
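+ # Optional sanity check (a sketch, not part of the original flow): each character
+ # in the pattern is one layer (M = Mamba, * = attention, - = MLP), so its length
+ # must equal NUM_LAYERS and the '*' / '-' counts should match HYBRID_ATTN /
+ # HYBRID_MLP_RATIO when those ratios are also supplied.
+ # if [ -n "${HYBRID_OVERRIDE_PATTERN}" ]; then
+ #     n_layers=${#HYBRID_OVERRIDE_PATTERN}
+ #     n_attn=${HYBRID_OVERRIDE_PATTERN//[!*]/}; n_attn=${#n_attn}
+ #     n_mlp=${HYBRID_OVERRIDE_PATTERN//[!-]/}; n_mlp=${#n_mlp}
+ #     echo "pattern: ${n_layers} layers, ${n_attn} attention, ${n_mlp} MLP (expect ${NUM_LAYERS} layers)"
+ # fi
+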
+ # MOE_ARGS=(
+ #     --num-experts ${NUM_EXPERTS}
+ #     --expert-tensor-parallel-size 1
+ #     --moe-grouped-gemm
+ #     --moe-router-topk ${MOE_TOPK}
+ #     --moe-router-load-balancing-type ${LOAD_BALANCING}
+ #     --moe-router-score-function sigmoid
+ #     --moe-token-dispatcher-type alltoall
+ #     --overlap-param-gather
+ #     --overlap-grad-reduce
+ #     --moe-expert-capacity-factor ${MOE_EXPERT_CAPACITY_FACTOR}
+ #     --moe-router-bias-update-rate ${MOE_ROUTER_BIAS_UPDATE_RATE}
+ # )
+
+ MOE_ARGS=(
+ )
+
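+ # Note: MOE_ARGS is left empty, so the MoE variables defined earlier are currently
+ # inert and the run is dense; re-enabling the commented block above restores the
+ # DSv3-style sigmoid routing with an expert-bias update.
+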
+ TRAINING_ARGS=(
+     --micro-batch-size ${BATCH_SIZE}
+     --global-batch-size ${GLOBAL_BATCH_SIZE}
+     --lr ${LR}
+     --train-samples ${TRAIN_SAMPLES}
+     --lr-warmup-samples ${LR_WARMUP_SAMPLES}
+     --lr-decay-samples ${LR_DECAY_SAMPLES}
+     --lr-decay-style ${LR_DECAY_STYLE}
+     --min-lr ${MIN_LR}
+     --split 100,0,0
+     --weight-decay 0.1
+     --clip-grad 0.5
+     --num-workers 2
+     --bf16
+     --save ${CHECKPOINT_PATH}
+     --load ${LOAD_CHECKPOINT_PATH}
+     --overlap-param-gather
+     --overlap-grad-reduce
+ )
+
+ if [ "1${FREEZE_NON_MAMBA}" = "1true" ]; then
+     TRAINING_ARGS+=(
+         --freeze-non-mamba
+     )
+ fi
+
+ DATA_ARGS=(
+     --data-path ${DATA_PATH_TOKENIZED}
+     --data-cache-path ${DATA_PATH_CACHE}
+     --no-create-attention-mask-in-dataloader
+ )
+
+ MODEL_PARALLEL_ARGS=(
+     --tensor-model-parallel-size ${MP_SIZE}
+     --pipeline-model-parallel-size ${PP_SIZE}
+     --expert-model-parallel-size ${EP_SIZE}
+     --use-distributed-optimizer
+     --sequence-parallel
+     --context-parallel-size ${CP_SIZE}
+ )
+
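+ # Sizing reminder (standard Megatron-LM constraint, noted here as an assumption):
+ # the data-parallel size WORLD_SIZE / (MP_SIZE * PP_SIZE * CP_SIZE) must be a whole
+ # number, and GLOBAL_BATCH_SIZE must be divisible by BATCH_SIZE * data-parallel size.
+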
+ LOGGING_ARGS=(
+     --log-interval ${LOG_INTERVAL}
+     --log-throughput
+     --save-interval ${SAVE_INTERVAL}
+     --eval-interval 1000
+     --eval-iters 10
+     --tensorboard-dir ${LOG_DIR}
+     --log-timers-to-tensorboard
+     --log-memory-to-tensorboard
+ )
+
+ if [ -n "${WANDB_API_KEY}" ]; then
+     LOGGING_ARGS+=(
+         --wandb-project ${WANDB_PROJECT:-"DSV3"}
+         --wandb-exp-name ${NAME}
+     )
+ fi
+
+ if [ "1${ACTIVATION_CHECKPOINT}" = "1true" ]; then
+     EXTRA_ARGS="${EXTRA_ARGS} \
+     --recompute-granularity selective
+     "
+ fi
+
+ if [ "$NODE_RANK" = "0" ]; then
+     which torchrun >> ${LOG_DIR}/ENV-${HOSTNAME}.log
+     python -V >> ${LOG_DIR}/ENV-${HOSTNAME}.log
+     pip list >> ${LOG_DIR}/ENV-${HOSTNAME}.log
+     env >> ${LOG_DIR}/ENV-${HOSTNAME}.log
+     echo $(which torchrun) ${DISTRIBUTED_ARGS[@]} ../../pretrain_mamba.py ${MODEL_ARGS[@]} ${DATA_ARGS[@]} ${MOE_ARGS[@]} ${TRAINING_ARGS[@]} ${MODEL_PARALLEL_ARGS[@]} ${LOGGING_ARGS[@]} ${TOKENIZER_ARGS} ${EXTRA_ARGS} >> ${LOG_DIR}/ENV-${HOSTNAME}.log
+ fi
+ set -x
+
+ torchrun ${DISTRIBUTED_ARGS[@]} ../../pretrain_mamba.py \
+     ${MODEL_ARGS[@]} \
+     ${DATA_ARGS[@]} \
+     ${MOE_ARGS[@]} \
+     ${TRAINING_ARGS[@]} \
+     ${MODEL_PARALLEL_ARGS[@]} \
+     ${LOGGING_ARGS[@]} \
+     ${TOKENIZER_ARGS} \
+     ${EXTRA_ARGS} 2>&1 | tee ${LOG_DIR}/LOG_NODE_RANK_${NODE_RANK}.log
launch_script/run_1node_hybrid_mamba_pretrain.sh ADDED
@@ -0,0 +1,72 @@
+ #!/bin/bash
+ # ------------------
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+ export USER=$(whoami)
+ source /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/.venv/bin/activate
+ # ------------------
+
+ set -eo pipefail
+ # ------------------
+
+ cd /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe-dev/YuLan-Pretrain/scripts/pretrain
+
+ LAUNCH_SCRIPT_PATH="$(realpath $0)" \
+ DATA_PATH="/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe-dev/cache/datasets/huggingface/Teaven/combine_2B_0908/binidx" \
+ OUTPUT_CHECKPOINT_PATH="/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe-dev/megatron_lm_workspace" \
+ BATCH_SIZE=1 GLOBAL_BATCH_SIZE=1024 \
+ TRAIN_TOKENS=2_000_000_000 LR_WARMUP_TOKENS=100_000_000 SAVE_TOKENS=1_000_000_000 \
+ LR_DECAY_STYLE='linear' LR_DECAY_TOKENS=2_000_000_000 \
+ LR=2e-5 MIN_LR=7e-7 \
+ MP_SIZE=2 PP_SIZE=1 CP_SIZE=1 \
+ TOKENIZER_TYPE="hf_tokenizer_yulan_mini" \
+ ACTIVATION_CHECKPOINT='true' \
+ NAME_PREFIX='dev-' \
+ HYBRID_ATTN=0.0625 \
+ HYBRID_MLP_RATIO=0.5 \
+ MAMBA_HEAD_DIM=64 \
+ MAMBA_NUM_GROUPS=6 \
+ MAMBA_STATE_DIM=320 \
+ MAMBA_EXPAND=1 \
+ NUM_LAYERS=112 \
+ MODEL_SIZE='2.9b' \
+ HIDDEN_SIZE=1920 \
+ NUM_ATTN_HEADS=30 \
+ NUM_QUERY_GROUPS=6 \
+ ROTARY_BASE=10000 \
+ MOE_FFN_HIDDEN_SIZE=4800 \
+ NUM_EXPERTS=0 \
+ SEQ_LEN=4096 \
+ TIE_EMBEDDING=false \
+ FREEZE_NON_MAMBA=false \
+ LOAD_FROM_CHECKPOINT='attn_mamba' \
+ HYBRID_OVERRIDE_PATTERN_TYPE='A0' \
+ CHECKPOINT_LOAD_PATH='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe-dev/cache/models/distill/L56-D1920-qwen_mamba2_qwen2-e1-i1920-s320-hd64-gn6-A0-S512-step1/rwkv-final-hf-A7-0_8_16_24_32_40_48/megatron-pp1-tp2' \
+ EXTRA_ARGS="--log-params-norm --no-save-step-one --ckpt-format torch --encoder-tensor-model-parallel-size $MP_SIZE --no-load-optim --no-load-rng" \
+ bash mamba_moe_0.5b_pretrain_template.sh
+
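+ # Consistency check for the settings above: NUM_LAYERS=112 with HYBRID_ATTN=0.0625
+ # gives 112 * 0.0625 = 7 attention layers, and HYBRID_MLP_RATIO=0.5 gives 56 MLP
+ # layers; the A0 pattern encodes exactly that: 7 '*', 49 'M', and 56 '-'.
+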
+ # Per-run knobs to double-check: SEQ_LEN, ROTARY_BASE, MAMBA_STATE_DIM, MODEL_SIZE, LOAD_FROM_CHECKPOINT, CHECKPOINT_LOAD_PATH, HYBRID_ATTN
+ # LOAD_FROM_CHECKPOINT = none / attn_only / attn_mamba
+
+ # Upstream help text for --hybrid-override-pattern (quoted from Megatron-LM's argument parser):
+ # group.add_argument('--hybrid-override-pattern', type=str, default=None,
+ #                    help='Force a specific hybrid layer pattern. The value'
+ #                    'should be a string of characters chosen from'
+ #                    'core.ssm.mamba_hybrid_layer_allocation.Symbols.'
+ #                    'If a value greater than 0.0 is supplied to any of the '
+ #                    'hybrid ratio arguments, then the number of each type'
+ #                    'of layer in the override pattern must match number in'
+ #                    'the overidden pattern')
+
+ # M0 type:
+ # M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-
+ # A0 type:
+ # *-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-
+ # A01 type:
+ # *-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-
+ # M01 type:
+ # M-M-M-M-M-M-M-*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-
+ # Nemo_A7_M49_F49 type:
+ # M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M-M-
+ # yulanmini type:
+ # *-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-
+
+ # Otherwise, or when the argument is absent: no override.