O2iginal committed on
Commit
d175771
·
verified ·
1 Parent(s): 929b24d

Upload run_2node_dsv3_0.5b_pretrain.sh to dsv3_0.5b

Browse files
Files changed (1) hide show
  1. run_2node_dsv3_0.5b_pretrain.sh +55 -0
run_2node_dsv3_0.5b_pretrain.sh ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ export NCCL_SOCKET_IFNAME=bond1
3
+ export NCCL_IB_GID_INDEX=3
4
+
5
+ # export NCCL_IB_HCA==mlx5_0:1
6
+ # export NCCL_MAX_NCHANNELS=1
7
+ # export NCCL_NET=IB
8
+ # --- Input Validation ---
9
+ if [ "$#" -ne 2 ]; then
10
+ echo "Usage: $0 <NODE_RANK> <MASTER_ADDR>"
11
+ echo "Example (Master): /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/Ubiquant-Pretrain/scripts/pretrain/run/run_2node_dsv3_0.5b_pretrain.sh 0 29.68.136.18"
12
+ echo "Example (Worker):/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/Ubiquant-Pretrain/scripts/pretrain/run/run_2node_dsv3_0.5b_pretrain.sh 1 29.68.136.18"
13
+ exit 1
14
+ fi
15
+
16
+ # ------------------
17
+ # Standard Setup
18
+ # ------------------
19
+ # export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
20
+ # export USER=$(whoami)
21
+ source /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/.venv/bin/activate
22
+ # ------------------
23
+
24
+ set -eo pipefail
25
+ # ------------------
26
+
27
+ cd /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/Ubiquant-Pretrain/scripts/pretrain
28
+
29
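+ # --- Multi-Node Configuration (NODE_RANK and MASTER_ADDR come from command-line arguments; NNODES and MASTER_PORT are fixed here. NOTE(review): NNODES=4 but the script name and usage examples describe a 2-node run; confirm) ---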
30
+ export NNODES=4
31
+ export NODE_RANK=$1
32
+ export MASTER_ADDR=$2
33
+ export MASTER_PORT=36000
34
+
35
+ # --- Standard Training Configuration ---
36
+ export OUTPUT_CHECKPOINT_PATH="/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/megatron_lm_workspace"
37
+ export DATA_PATH="1.0 /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/Ubiquant-Pretrain/build/wjp-share/dataset/metadata/processed_data_baseline_text_document"
38
+ export BATCH_SIZE=16
39
+ export GLOBAL_BATCH_SIZE=1024
40
+ export TRAIN_TOKENS=100_000_000_000
41
+ export LR_WARMUP_TOKENS=1_000_000_000
42
+ export SAVE_TOKENS=10_000_000_000
43
+ export LR_DECAY_STYLE='constant'
44
+ export LR_DECAY_TOKENS=99_000_000_000
45
+ export LR=2e-3
46
+ export MP_SIZE=2
47
+ export PP_SIZE=1
48
+ export TOKENIZER_TYPE="hf_tokenizer_yulan_mini"
49
+ export ACTIVATION_CHECKPOINT='true'
50
+
51
+ # --- Execute the Training Script ---
52
+ echo "--- Starting Node Rank: ${NODE_RANK} of ${NNODES} ---"
53
+ echo "--- Master Address: ${MASTER_ADDR}:${MASTER_PORT} ---"
54
+
55
+ bash dsv3_0.5b_pretrain_template.sh