File size: 3,754 Bytes
08ff31f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/env bash
# Single-experiment multinode training launcher.
#
# Usage:
#   ./scripts/run_train_multinode.sh <name> <train-config> <exp-name> [extra train args...]
#
# Example:
#   ./scripts/run_train_multinode.sh \
#       soft_prompt \
#       pi05_libero_speed_embed_softprompt_p8 \
#       pi05_online_sliding_speed_embed_softprompt_p8_bs512_lr1e4 \
#       --data.speed-integration soft_prompt \
#       --model.soft-prompt-p 8 \
#       --model.soft-prompt-speeds 0.75 1 1.25 1.5

# Trace every command as it executes, and create files with no permissions
# for "other" (umask 007).
set -x
umask 007

# The three leading positionals are mandatory; anything after them is kept
# verbatim and forwarded to the training script.
(( $# >= 3 )) || {
  echo "Usage: $0 <name> <train-config> <exp-name> [extra train args...]" >&2
  exit 1
}

NAME="$1"         # label printed in the stage banner
CFG="$2"          # config name passed to scripts/train_pytorch.py
EXP="$3"          # value passed as --exp-name
EXTRA=("${@:4}")  # optional trailing arguments (empty array when none given)
shift 3

# ---------------------------------------------------------------- config
# Site-specific absolute locations: code checkout, dataset, base weights.
PROJECT_ROOT='/robby/share/Robotics/zhangtianqi/code/VLAwithVariousSpeed'
DATA_ROOT='/robby/share/Robotics/zhangtianqi/cache/huggingface/lerobot/your_hf_username/libero'
PI05_BASE='/robby/share/Robotics/zhangtianqi/model/pi_base_models_torch/pi05_base_torch'

# ASSET_ID may be supplied by the caller; otherwise use the stock asset id.
: "${ASSET_ID:=online_sliding_speed_embed_0p75_1p0_1p25_1p5_pi05}"

# SPEEDS may be supplied by the caller as a space-separated string
# (e.g. "0.75 1 1.25 1.5"). It is split into an array here, and the same
# array feeds both --data.online-sliding-speeds and --eval-speed-set, so
# ASSET_ID must point to a dataset built for these speeds.
speeds_spec="${SPEEDS:-0.75 1 1.25 1.5}"
read -r -a SPEEDS <<< "${speeds_spec}"
unset speeds_spec

# Fixed training hyper-parameters passed to the trainer.
NUM_TRAIN_STEPS=30000
BATCH_SIZE=512
NUM_WORKERS=8
SAVE_INTERVAL=5000
LOG_INTERVAL=100
COMPILE_MODE='None'

# The trainer is launched through a path relative to the project root
# (scripts/train_pytorch.py), so abort here rather than start the job from
# whatever directory the caller happened to be in.
cd "${PROJECT_ROOT}" || {
  echo "ERROR: cannot cd to PROJECT_ROOT=${PROJECT_ROOT}" >&2
  exit 1
}
# Default WANDB__SERVICE_WAIT to 300 unless the caller already set it.
export WANDB__SERVICE_WAIT="${WANDB__SERVICE_WAIT:-300}"
# Drop any wandb credentials inherited from the calling environment; the key
# used for this run is exported later in the script. `|| true` keeps this a
# best-effort step.
unset WANDB_API_KEY WANDB_API_KEY_FILE || true

########### Multinode setting ###########
# Launcher knobs; each one can be overridden from the environment.
# NOTE(review): PORT is not referenced in the visible part of this script.
NGPU=${NGPU:-"8"}
PORT=${PORT:-"1106"}
LOG_RANK=${LOG_RANK:-"0"}
TORCHFT_LIGHTHOUSE=${TORCHFT_LIGHTHOUSE:-"http://localhost:29510"}

# Lower-case aliases consumed by the launch command.
num_gpu=${NGPU}
log_rank=${LOG_RANK}
torchft_lighthouse=${TORCHFT_LIGHTHOUSE}
# WORKER_NUM has no default here; it must be provided by the environment.
job_num=${WORKER_NUM}

echo "multiple nodes training: WORLD_SIZE=${WORLD_SIZE}, RANK=${RANK}"

## ib setting
# NCCL / Gloo network settings; the values are specific to this cluster.
export NCCL_IB_TC=136
export NCCL_IB_SL=5
export NCCL_IB_GID_INDEX=3
export NCCL_SOCKET_IFNAME=bond1
export NCCL_DEBUG=INFO
export NCCL_IB_HCA=mlx5_bond
export NCCL_IB_TIMEOUT=20
export NCCL_NET_PLUGIN=none
export NCCL_IB_QPS_PER_CONNECTION=8
export NCCL_IB_SPLIT_DATA_ON_QPS=1
export NCCL_MIN_NCHANNELS=4
export GLOO_SOCKET_IFNAME=bond1
export TOKENIZERS_PARALLELISM=false

## wandb setting
# NOTE(review): the API key is hard-coded; prefer injecting it from a secret
# store. Until then, pause xtrace around the export so that `set -x` does not
# copy the key into the job's trace output, then restore the previous state.
case "$-" in
  *x*) _xtrace_was_on=1 ;;
  *)   _xtrace_was_on=0 ;;
esac
{ set +x; } 2>/dev/null
export WANDB_API_KEY=local-73813ba405c87d3b1ad539b4d31124351374cdb6
if [ "${_xtrace_was_on}" -eq 1 ]; then
  set -x
fi
unset _xtrace_was_on
export WANDB_BASE_URL=http://33.180.4.104

# ---------------------------------------------------------------- train
# Print a banner describing the run, launch the trainer on this node through
# dlrover-run, and exit with the launcher's status.
echo "=========================================================="
echo "Stage: train (${NAME})"
echo "  cfg=${CFG}"
echo "  exp=${EXP}"
echo "  speeds=${SPEEDS[*]}"
echo "  asset_id=${ASSET_ID}"
echo "  extra=${EXTRA[*]}"
echo "=========================================================="

# --nnodes is taken from WORKER_NUM (via job_num). Refuse to launch with an
# empty value instead of handing the launcher `--nnodes=""`.
if [ -z "${job_num:-}" ]; then
  echo "ERROR: WORKER_NUM is not set; cannot determine --nnodes." >&2
  exit 1
fi

# log_dir is not assigned anywhere in the visible script; left empty it would
# expand to "/${NODE_ID}" and put logs at the filesystem root. Honour a value
# inherited from the environment (log_dir or LOG_DIR), otherwise fall back to
# a per-experiment directory relative to the current working directory.
log_dir="${log_dir:-${LOG_DIR:-logs/${EXP}}}"
# Per-node sub-directory: NODE_ID when provided, else RANK, else 0.
node_id="${NODE_ID:-${RANK:-0}}"

PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True" \
TORCHFT_LIGHTHOUSE="${torchft_lighthouse}" \
dlrover-run --network-check --max_restarts=3 --rdzv_conf join_timeout=10800 \
  --log_dir "${log_dir}/${node_id}" -r=3 \
  --nproc_per_node="${num_gpu}" \
  --nnodes="${job_num}" \
  --local-ranks-filter="${log_rank}" \
  --tee 3 \
  scripts/train_pytorch.py "${CFG}" \
  --exp-name "${EXP}" \
  --pytorch-weight-path "${PI05_BASE}" \
  --batch-size "${BATCH_SIZE}" \
  --num-workers "${NUM_WORKERS}" \
  --num-train-steps "${NUM_TRAIN_STEPS}" \
  --log-interval "${LOG_INTERVAL}" \
  --save-interval "${SAVE_INTERVAL}" \
  --eval-speed-set "${SPEEDS[@]}" \
  --data.repo-id "${DATA_ROOT}" \
  --data.assets.asset-id "${ASSET_ID}" \
  --data.online-sliding-chunks \
  --data.online-sliding-speeds "${SPEEDS[@]}" \
  --model.pytorch-compile-mode "${COMPILE_MODE}" \
  "${EXTRA[@]}" \
  --overwrite
train_status=$?

# Propagate a launcher failure to the caller; previously the script always
# printed the success line and exited 0.
if [ "${train_status}" -ne 0 ]; then
  echo "Train stage failed for ${NAME} (exit ${train_status})." >&2
  exit "${train_status}"
fi

echo "Train stage completed for ${NAME}."