#!/usr/bin/env bash
# Single-experiment multinode training launcher.
#
# Usage:
#   ./scripts/run_train_multinode.sh <name> <cfg> <exp> [extra train args...]
#
# Positional arguments:
#   <name>  label printed in the stage banner and the completion message
#   <cfg>   config name passed as the first argument to scripts/train_pytorch.py
#   <exp>   experiment name passed via --exp-name
#   Everything after the third argument is forwarded verbatim to
#   scripts/train_pytorch.py (placed just before --overwrite).
#
# Environment (all optional unless noted):
#   WORKER_NUM          REQUIRED. Node count, forwarded as --nnodes.
#   NODE_ID             Appended to the log directory path.
#   NGPU                Processes per node (default 8).
#   LOG_RANK            Value for --local-ranks-filter (default 0).
#   TORCHFT_LIGHTHOUSE  Default http://localhost:29510.
#   SPEEDS              Space-separated speed list (default "0.75 1 1.25 1.5").
#   ASSET_ID            Asset id forwarded as --data.assets.asset-id.
#   LOG_DIR / log_dir   Base directory for dlrover-run logs.
#
# Example:
#   ./scripts/run_train_multinode.sh \
#       soft_prompt \
#       pi05_libero_speed_embed_softprompt_p8 \
#       pi05_online_sliding_speed_embed_softprompt_p8_bs512_lr1e4 \
#       --data.speed-integration soft_prompt \
#       --model.soft-prompt-p 8 \
#       --model.soft-prompt-speeds 0.75 1 1.25 1.5

set -x
umask 007

if [ "$#" -lt 3 ]; then
  echo "Usage: $0 <name> <cfg> <exp> [extra train args...]" >&2
  exit 1
fi
NAME="$1"
CFG="$2"
EXP="$3"
shift 3
EXTRA=("$@")

# ---------------------------------------------------------------- config
PROJECT_ROOT="/robby/share/Robotics/zhangtianqi/code/VLAwithVariousSpeed"
DATA_ROOT="/robby/share/Robotics/zhangtianqi/cache/huggingface/lerobot/your_hf_username/libero"
PI05_BASE="/robby/share/Robotics/zhangtianqi/model/pi_base_models_torch/pi05_base_torch"
ASSET_ID="${ASSET_ID:-online_sliding_speed_embed_0p75_1p0_1p25_1p5_pi05}"

# SPEEDS env var (space-separated, e.g. "0.75 1 1.25 1.5") overrides the
# default. The same list is fed to both --data.online-sliding-speeds and
# --eval-speed-set; ASSET_ID must point to a dataset built for these speeds.
read -ra SPEEDS <<< "${SPEEDS:-0.75 1 1.25 1.5}"

BATCH_SIZE=512
NUM_WORKERS=8
NUM_TRAIN_STEPS=30000
LOG_INTERVAL=100
SAVE_INTERVAL=5000
COMPILE_MODE="None"

# The training script is addressed by a relative path below, so a failed cd
# must abort instead of launching from the wrong directory.
cd "${PROJECT_ROOT}" || {
  echo "ERROR: cannot cd to PROJECT_ROOT=${PROJECT_ROOT}" >&2
  exit 1
}

export WANDB__SERVICE_WAIT="${WANDB__SERVICE_WAIT:-300}"
# Drop any inherited W&B credentials; the key used for this run is exported
# in the "wandb setting" section below.
unset WANDB_API_KEY WANDB_API_KEY_FILE || true

########### Multinode setting ###########
NGPU=${NGPU:-"8"}
# NOTE(review): PORT is not referenced anywhere else in this script and is not
# exported; kept in case a wrapper sources this file — confirm or remove.
PORT=${PORT:-"1106"}
LOG_RANK=${LOG_RANK:-"0"}
TORCHFT_LIGHTHOUSE=${TORCHFT_LIGHTHOUSE:-"http://localhost:29510"}

# Fail fast with a clear message rather than passing --nnodes="" to the launcher.
: "${WORKER_NUM:?WORKER_NUM (number of nodes) must be set}"

num_gpu=${NGPU}
log_rank=${LOG_RANK}
torchft_lighthouse=${TORCHFT_LIGHTHOUSE}
job_num=${WORKER_NUM}

# Base directory for launcher logs. An inherited `log_dir` is honoured first,
# then LOG_DIR; otherwise fall back to a per-experiment directory under the
# project root so the path never collapses to "/<NODE_ID>".
log_dir="${log_dir:-${LOG_DIR:-${PROJECT_ROOT}/logs/${EXP}}}"

echo "multiple nodes training: WORLD_SIZE=${WORLD_SIZE}, RANK=${RANK}"

## ib setting
export NCCL_IB_TC=136
export NCCL_IB_SL=5
export NCCL_IB_GID_INDEX=3
export NCCL_SOCKET_IFNAME=bond1
export NCCL_DEBUG=INFO
export NCCL_IB_HCA=mlx5_bond
export NCCL_IB_TIMEOUT=20
export NCCL_NET_PLUGIN=none
export NCCL_IB_QPS_PER_CONNECTION=8
export NCCL_IB_SPLIT_DATA_ON_QPS=1
export NCCL_MIN_NCHANNELS=4
export GLOO_SOCKET_IFNAME=bond1
export TOKENIZERS_PARALLELISM=false

## wandb setting
# Tracing is paused here so the API key is not echoed into job logs by `set -x`.
# NOTE(review): the key is hard-coded in the script; consider loading it from a
# protected file or the job environment instead.
set +x
export WANDB_API_KEY=local-73813ba405c87d3b1ad539b4d31124351374cdb6
set -x
export WANDB_BASE_URL=http://33.180.4.104

# ---------------------------------------------------------------- train
echo "=========================================================="
echo "Stage: train (${NAME})"
echo "  cfg=${CFG}"
echo "  exp=${EXP}"
echo "  speeds=${SPEEDS[*]}"
echo "  asset_id=${ASSET_ID}"
echo "  extra=${EXTRA[*]}"
echo "=========================================================="

# User-supplied EXTRA args are placed after the built-in options and before
# --overwrite, matching the original argument order.
PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True" \
TORCHFT_LIGHTHOUSE="${torchft_lighthouse}" \
dlrover-run --network-check --max_restarts=3 --rdzv_conf join_timeout=10800 \
  --log_dir "${log_dir}/${NODE_ID}" -r=3 \
  --nproc_per_node="${num_gpu}" \
  --nnodes="${job_num}" \
  --local-ranks-filter="${log_rank}" \
  --tee 3 \
  scripts/train_pytorch.py "${CFG}" \
  --exp-name "${EXP}" \
  --pytorch-weight-path "${PI05_BASE}" \
  --batch-size "${BATCH_SIZE}" \
  --num-workers "${NUM_WORKERS}" \
  --num-train-steps "${NUM_TRAIN_STEPS}" \
  --log-interval "${LOG_INTERVAL}" \
  --save-interval "${SAVE_INTERVAL}" \
  --eval-speed-set "${SPEEDS[@]}" \
  --data.repo-id "${DATA_ROOT}" \
  --data.assets.asset-id "${ASSET_ID}" \
  --data.online-sliding-chunks \
  --data.online-sliding-speeds "${SPEEDS[@]}" \
  --model.pytorch-compile-mode "${COMPILE_MODE}" \
  "${EXTRA[@]}" \
  --overwrite
train_status=$?

# Propagate launcher failure so schedulers and wrappers do not see a false success.
if [ "${train_status}" -ne 0 ]; then
  echo "Train stage FAILED for ${NAME} (exit code ${train_status})." >&2
  exit "${train_status}"
fi

echo "Train stage completed for ${NAME}."