#!/usr/bin/env bash
#
# Launch one training stage of scripts/train_pytorch.py through dlrover-run.
#
# Usage:
#   <this-script> <name> <train-config> <exp-name> [extra train args...]
#
# Environment variables read by this script:
#   ASSET_ID, SPEEDS, WANDB__SERVICE_WAIT, NGPU, PORT, LOG_RANK,
#   TORCHFT_LIGHTHOUSE, WORKER_NUM, WORLD_SIZE, RANK, NODE_ID, log_dir

# Trace every command to stderr as it executes.
set -x
# Files created from here on: full access for owner and group, none for others.
umask 007

# Three positional arguments are mandatory; everything after them is
# forwarded verbatim to the training script.
if (( $# < 3 )); then
  echo "Usage: $0 <name> <train-config> <exp-name> [extra train args...]" >&2
  exit 1
fi

NAME="$1"     # label that appears only in this script's log messages
CFG="$2"      # first positional argument given to scripts/train_pytorch.py
EXP="$3"      # value passed as --exp-name
shift 3
EXTRA=("$@")  # remaining arguments, kept as an array to preserve word boundaries

# Fixed filesystem locations: project checkout, dataset, base checkpoint.
PROJECT_ROOT="/robby/share/Robotics/zhangtianqi/code/VLAwithVariousSpeed"
DATA_ROOT="/robby/share/Robotics/zhangtianqi/cache/huggingface/lerobot/your_hf_username/libero"
PI05_BASE="/robby/share/Robotics/zhangtianqi/model/pi_base_models_torch/pi05_base_torch"

# Asset id passed as --data.assets.asset-id; overridable via the environment.
ASSET_ID="${ASSET_ID:-online_sliding_speed_embed_0p75_1p0_1p25_1p5_pi05}"

# Speed list: a whitespace-separated string (from the environment or the
# default) split into the SPEEDS array. It is later passed to both
# --eval-speed-set and --data.online-sliding-speeds.
read -ra SPEEDS <<< "${SPEEDS:-0.75 1 1.25 1.5}"

# Values forwarded to the training script's command-line flags.
BATCH_SIZE=512
NUM_WORKERS=8
NUM_TRAIN_STEPS=30000
LOG_INTERVAL=100
SAVE_INTERVAL=5000
COMPILE_MODE="None"

# The launcher is invoked with the relative path scripts/train_pytorch.py, so
# the working directory must be the project checkout. Abort if it cannot be
# entered instead of launching from whatever directory we happen to be in.
cd "${PROJECT_ROOT}" || {
  echo "ERROR: cannot cd to PROJECT_ROOT='${PROJECT_ROOT}'" >&2
  exit 1
}
export WANDB__SERVICE_WAIT="${WANDB__SERVICE_WAIT:-300}"
# Discard any W&B credentials inherited from the caller's environment.
unset WANDB_API_KEY WANDB_API_KEY_FILE || true

# Launcher settings, each overridable from the environment.
NGPU=${NGPU:-"8"}
PORT=${PORT:-"1106"}  # defined here but not referenced again in this script
LOG_RANK=${LOG_RANK:-"0"}
TORCHFT_LIGHTHOUSE=${TORCHFT_LIGHTHOUSE:-"http://localhost:29510"}

# Lower-case aliases consumed by the launch command.
num_gpu=${NGPU}
log_rank=${LOG_RANK}
torchft_lighthouse=${TORCHFT_LIGHTHOUSE}
# Node count comes from WORKER_NUM; fall back to a single node when it is not
# set so that --nnodes never receives an empty string.
job_num=${WORKER_NUM:-1}

echo "multiple nodes training: WORLD_SIZE=${WORLD_SIZE}, RANK=${RANK}"

# Communication-library environment exported to the training processes.
# InfiniBand-related NCCL settings.
export NCCL_IB_TC=136
export NCCL_IB_SL=5
export NCCL_IB_GID_INDEX=3
export NCCL_IB_HCA=mlx5_bond
export NCCL_IB_TIMEOUT=20
export NCCL_IB_QPS_PER_CONNECTION=8
export NCCL_IB_SPLIT_DATA_ON_QPS=1
# Remaining NCCL settings.
export NCCL_SOCKET_IFNAME=bond1
export NCCL_DEBUG=INFO
export NCCL_NET_PLUGIN=none
export NCCL_MIN_NCHANNELS=4
# Gloo uses the same network interface name as NCCL_SOCKET_IFNAME.
export GLOO_SOCKET_IFNAME=bond1
export TOKENIZERS_PARALLELISM=false

# W&B endpoint and credentials used for this run.
# NOTE(review): the API key is hard-coded in the script; consider loading it
# from a protected file or secret store instead.
# Command tracing is paused around the key so it is not written to the trace
# log, then restored to whatever state it was in.
case "$-" in
  *x*) wandb_restore_xtrace=1 ;;
  *) wandb_restore_xtrace=0 ;;
esac
{ set +x; } 2>/dev/null
export WANDB_API_KEY=local-73813ba405c87d3b1ad539b4d31124351374cdb6
if (( wandb_restore_xtrace )); then
  set -x
fi
unset wandb_restore_xtrace
export WANDB_BASE_URL=http://33.180.4.104

# Summarise the resolved settings on stdout before launching.
printf '%s\n' \
  "==========================================================" \
  "Stage: train (${NAME})" \
  " cfg=${CFG}" \
  " exp=${EXP}" \
  " speeds=${SPEEDS[*]}" \
  " asset_id=${ASSET_ID}" \
  " extra=${EXTRA[*]}" \
  "=========================================================="

# Launcher log directory. Nothing in this script assigns log_dir, so honour a
# value inherited from the environment and otherwise log under the current
# directory rather than expanding to a path at the filesystem root.
log_dir="${log_dir:-${PWD}/logs}"
node_id="${NODE_ID:-0}"

# Options for dlrover-run itself.
launcher_args=(
  --network-check
  --max_restarts=3
  --rdzv_conf join_timeout=10800
  --log_dir "${log_dir}/${node_id}"
  -r=3
  --nproc_per_node="${num_gpu}"
  --nnodes="${job_num}"
  --local-ranks-filter="${log_rank}"
  --tee 3
)

# Arguments for scripts/train_pytorch.py. Caller-supplied extras come after
# the fixed flags, and --overwrite is always last.
train_args=(
  "${CFG}"
  --exp-name "${EXP}"
  --pytorch-weight-path "${PI05_BASE}"
  --batch-size "${BATCH_SIZE}"
  --num-workers "${NUM_WORKERS}"
  --num-train-steps "${NUM_TRAIN_STEPS}"
  --log-interval "${LOG_INTERVAL}"
  --save-interval "${SAVE_INTERVAL}"
  --eval-speed-set "${SPEEDS[@]}"
  --data.repo-id "${DATA_ROOT}"
  --data.assets.asset-id "${ASSET_ID}"
  --data.online-sliding-chunks
  --data.online-sliding-speeds "${SPEEDS[@]}"
  --model.pytorch-compile-mode "${COMPILE_MODE}"
  "${EXTRA[@]}"
  --overwrite
)

# The two variable assignments apply only to the launcher process and its
# children.
PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True" \
TORCHFT_LIGHTHOUSE="${torchft_lighthouse}" \
  dlrover-run "${launcher_args[@]}" scripts/train_pytorch.py "${train_args[@]}"
train_status=$?

# Propagate a launcher failure instead of reporting success unconditionally.
if (( train_status != 0 )); then
  echo "Train stage failed for ${NAME} (exit status ${train_status})." >&2
  exit "${train_status}"
fi

echo "Train stage completed for ${NAME}."
|