#!/bin/bash
#
# Launch one node of a multi-node run of dsv3_0.5b_pretrain_template.sh.
#
# Usage: <this script> <NODE_RANK> <MASTER_ADDR>
#   NODE_RANK    integer rank of this node, in 0 .. NNODES-1
#   MASTER_ADDR  address of the master node (passed through unchanged)
#
# The script activates the project virtualenv, exports the cluster and
# training settings as environment variables, changes into the
# scripts/pretrain directory and runs the template script with bash.
# Exit status: 1 on bad arguments or a missing virtualenv, otherwise the
# status of the template script.

# NCCL transport settings (network interface name and InfiniBand/RoCE GID
# index) for this cluster.
export NCCL_SOCKET_IFNAME=bond1
export NCCL_IB_GID_INDEX=3

# Single source of truth for the paths that were previously repeated inline.
project_root="/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw"
repo_root="${project_root}/Ubiquant-Pretrain"
venv_activate="${project_root}/.venv/bin/activate"
launcher_example="${repo_root}/scripts/pretrain/run/run_2node_dsv3_0.5b_pretrain.sh"

# Print the usage text to stderr so it never pollutes captured stdout.
usage() {
  {
    echo "Usage: $0 <NODE_RANK> <MASTER_ADDR>"
    echo "Example (Master): ${launcher_example} 0 29.68.136.18"
    echo "Example (Worker): ${launcher_example} 1 29.68.136.18"
  } >&2
}

if [[ $# -ne 2 ]]; then
  usage
  exit 1
fi

# Cluster topology.
# NOTE(review): the usage examples reference a "2node" launcher while NNODES
# is 4 -- confirm which one matches the actual job size.
export NNODES=4
export NODE_RANK="$1"
export MASTER_ADDR="$2"
export MASTER_PORT=36000

# Fail fast on arguments that could never form a valid job. The 10# prefix
# forces base 10 so a rank such as "08" is not parsed as invalid octal.
if ! [[ ${NODE_RANK} =~ ^[0-9]+$ ]] || ((10#${NODE_RANK} >= NNODES)); then
  echo "Error: NODE_RANK must be an integer in [0, $((NNODES - 1))], got '${NODE_RANK}'" >&2
  usage
  exit 1
fi
if [[ -z ${MASTER_ADDR} ]]; then
  echo "Error: MASTER_ADDR must not be empty" >&2
  usage
  exit 1
fi

# Activate the virtualenv. The activate script is sourced before errexit is
# enabled (as it always was), so a missing file has to be caught explicitly;
# otherwise the run would silently continue with the wrong interpreter.
if [[ ! -r ${venv_activate} ]]; then
  echo "Error: virtualenv activate script not found: ${venv_activate}" >&2
  exit 1
fi
# shellcheck disable=SC1090 # path is only known at runtime
source "${venv_activate}"

# From here on, abort on any failing command or failing pipeline stage.
set -eo pipefail

# The template script is referenced by a relative name, so run from its
# directory.
cd "${repo_root}/scripts/pretrain"

# Training configuration consumed by the template script through the
# environment.
export OUTPUT_CHECKPOINT_PATH="${project_root}/megatron_lm_workspace"
# presumably "<weight> <dataset prefix>" -- verify against the template script
export DATA_PATH="1.0 ${repo_root}/build/wjp-share/dataset/metadata/processed_data_baseline_text_document"
export BATCH_SIZE=16
export GLOBAL_BATCH_SIZE=1024
export TRAIN_TOKENS=100_000_000_000
export LR_WARMUP_TOKENS=1_000_000_000
export SAVE_TOKENS=10_000_000_000
export LR_DECAY_STYLE='constant'
export LR_DECAY_TOKENS=99_000_000_000
export LR=2e-3
export MP_SIZE=2
export PP_SIZE=1
export TOKENIZER_TYPE="hf_tokenizer_yulan_mini"
export ACTIVATION_CHECKPOINT='true'

echo "--- Starting Node Rank: ${NODE_RANK} of ${NNODES} ---"
echo "--- Master Address: ${MASTER_ADDR}:${MASTER_PORT} ---"

bash dsv3_0.5b_pretrain_template.sh