#!/usr/bin/env bash
# Equivalent of:
#   python scripts/run_speed_embedding_ablation.py \
#     --data-root /robby/share/Robotics/zhangtianqi/datasets/lerobot/libero \
#     --pi05-base /robby/share/Robotics/zhangtianqi/model/pi_base_models_torch/pi05_base_torch \
#     --batch-size 512 --lr 1e-4 --num-gpus 8 --num-workers 2 \
#     --num-train-steps 30000 --eval-speeds 0.75 1.0 1.25 1.5 --num-trials 50
#
# Runs every speed-integration ablation listed in EXPERIMENTS end-to-end:
#   norm stats -> train (dlrover-run) -> serve + eval at each EVAL_SPEEDS entry.
#
# NOTE(review): DATA_ROOT below differs from the --data-root shown in the
# command above -- confirm which dataset path is intended.
#
# Environment read by this script:
#   WORKER_NUM           (required) node count, passed to dlrover-run --nnodes
#   NGPU                 processes per node for training (default: NUM_GPUS)
#   LOG_RANK             local rank whose output is shown (default: 0)
#   TORCHFT_LIGHTHOUSE   forwarded to the trainer (default: http://localhost:29510)
#   NODE_ID              sub-directory for launcher logs (default: 0)
#   log_dir              optional override of the launcher log root
#   WANDB_LOCAL_API_KEY  optional override of the built-in local W&B key
#   WORLD_SIZE, RANK     only echoed for diagnostics

# Abort on the first failing stage so evaluation never runs against a
# checkpoint left behind by a failed or skipped training run.
set -euo pipefail
set -x
umask 007

# ---------------------------------------------------------------- config
PROJECT_ROOT="/robby/share/Robotics/zhangtianqi/code/VLAwithVariousSpeed"
DATA_ROOT="/robby/share/Robotics/zhangtianqi/cache/huggingface/lerobot/your_hf_username/libero"
PI05_BASE="/robby/share/Robotics/zhangtianqi/model/pi_base_models_torch/pi05_base_torch"
ASSET_ID="online_sliding_speed_embed_0p75_1p0_1p25_1p5_pi05"

SPEEDS=(0.75 1 1.25 1.5)               # used in CLI args
EVAL_SPEEDS=(0.75 1 1.25 1.5)          # eval rollouts
EVAL_SPEED_TAGS=(0p75x 1x 1p25x 1p5x)  # index-aligned with EVAL_SPEEDS

NUM_GPUS=8
BATCH_SIZE=512
LR=1e-4
NUM_WORKERS=2
NUM_TRAIN_STEPS=30000
LOG_INTERVAL=100
SAVE_INTERVAL=5000
COMPILE_MODE="None"

NUM_TRIALS=50
BASE_PORT=8000
HOST="0.0.0.0"
SERVER_WAIT_SECONDS=120

LOG_DIR="${PROJECT_ROOT}/logs/speed_embedding_ablation"
RESULTS_DIR="${PROJECT_ROOT}/results/speed_embedding_ablation"
TORCHRUN_LOG_DIR="${LOG_DIR}/torchrun"
SERVER_LOG_DIR="${LOG_DIR}/servers"

# experiment name | train-config | exp-name | extra train args (space-separated)
EXPERIMENTS=(
  "modulation|pi05_libero_speed_embed_modulation|pi05_online_sliding_speed_embed_modulation_bs512_lr1e4|--data.speed-integration modulation --model.speed-modulation"
  "soft_prompt|pi05_libero_speed_embed_softprompt_p8|pi05_online_sliding_speed_embed_softprompt_p8_bs512_lr1e4|--data.speed-integration soft_prompt --model.soft-prompt-p 8 --model.soft-prompt-speeds 0.75 1 1.25 1.5"
)

# PIDs of the policy servers currently running in the background.
SERVER_PIDS=()

# ---------------------------------------------------------------- helpers

# Print an error message to stderr and exit with status 1.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

# Print the three-line stage banner.
#   $1 - stage label
banner() {
  echo "=========================================================="
  echo "Stage: $1"
  echo "=========================================================="
}

# Stop every server recorded in SERVER_PIDS: SIGTERM first, poll for up to
# 30 seconds, then SIGKILL whatever is still alive. Safe to call repeatedly;
# it is a no-op when no servers are registered.
cleanup_servers() {
  if (( ${#SERVER_PIDS[@]} == 0 )); then
    return 0
  fi
  echo "Stopping policy servers: ${SERVER_PIDS[*]}"

  local pid attempt alive
  for pid in "${SERVER_PIDS[@]}"; do
    kill -TERM "${pid}" 2>/dev/null || true
  done

  # wait up to 30s, then SIGKILL
  for ((attempt = 0; attempt < 30; attempt++)); do
    alive=0
    for pid in "${SERVER_PIDS[@]}"; do
      if kill -0 "${pid}" 2>/dev/null; then
        alive=1
      fi
    done
    if (( alive == 0 )); then
      break
    fi
    sleep 1
  done

  for pid in "${SERVER_PIDS[@]}"; do
    if kill -0 "${pid}" 2>/dev/null; then
      kill -KILL "${pid}" 2>/dev/null || true
    fi
  done
  SERVER_PIDS=()
}

# Print the checkpoint directory to evaluate.
#   $1 - train config name
#   $2 - experiment name
# Picks the numerically largest all-digit sub-directory of
# ${PROJECT_ROOT}/checkpoints/$1/$2; when that root is missing or holds no
# such directory, prints the path for step NUM_TRAIN_STEPS - 1 instead.
latest_ckpt_dir() {
  local root="${PROJECT_ROOT}/checkpoints/$1/$2"
  local fallback="${root}/$((NUM_TRAIN_STEPS - 1))"
  local latest=""

  if [[ -d "${root}" ]]; then
    latest="$(find "${root}" -mindepth 1 -maxdepth 1 -type d -regex '.*/[0-9]+' \
      -printf '%f\n' 2>/dev/null | sort -n | tail -n 1 || true)"
  fi

  if [[ -z "${latest}" ]]; then
    echo "${fallback}"
  else
    echo "${root}/${latest}"
  fi
}

# Sleep for the fixed load window, then verify that every registered server
# process still exists, so a crash during startup is reported here rather
# than surfacing later inside the evaluation run.
#   $1 - directory holding the server logs (used in the error message)
wait_for_servers() {
  echo "Waiting ${SERVER_WAIT_SECONDS}s for policy servers to load..."
  sleep "${SERVER_WAIT_SECONDS}"

  local pid
  for pid in "${SERVER_PIDS[@]}"; do
    if ! kill -0 "${pid}" 2>/dev/null; then
      die "policy server pid ${pid} exited during startup; see logs in $1"
    fi
  done
}

# Resolve the multi-node launcher settings and export the NCCL/Gloo,
# tokenizer and W&B environment used by training.
setup_cluster_env() {
  ########### Multinode setting ###########
  NGPU="${NGPU:-${NUM_GPUS}}"
  LOG_RANK="${LOG_RANK:-0}"
  TORCHFT_LIGHTHOUSE="${TORCHFT_LIGHTHOUSE:-http://localhost:29510}"
  sleep 1

  ## node setting
  num_gpu="${NGPU}"
  log_rank="${LOG_RANK}"
  torchft_lighthouse="${TORCHFT_LIGHTHOUSE}"
  # Fail with a clear message instead of handing dlrover-run an empty --nnodes.
  job_num="${WORKER_NUM:?WORKER_NUM (number of nodes) must be set}"
  echo "multiple nodes training: ${WORLD_SIZE:-}, ${RANK:-}"

  ## ib setting
  export NCCL_IB_TC=136
  export NCCL_IB_SL=5
  export NCCL_IB_GID_INDEX=3
  export NCCL_SOCKET_IFNAME=bond1
  export NCCL_DEBUG=INFO
  export NCCL_IB_HCA=mlx5_bond
  export NCCL_IB_TIMEOUT=20
  export NCCL_NET_PLUGIN=none
  export NCCL_IB_QPS_PER_CONNECTION=8
  export NCCL_IB_SPLIT_DATA_ON_QPS=1
  export NCCL_MIN_NCHANNELS=4
  export GLOO_SOCKET_IFNAME=bond1
  export TOKENIZERS_PARALLELISM=false

  ## wandb setting
  # NOTE(review): this API key is hard-coded in the repository; prefer
  # supplying it through WANDB_LOCAL_API_KEY and rotating the committed value.
  # Tracing is switched off around the export so the key does not reach logs.
  { set +x; } 2>/dev/null
  export WANDB_API_KEY="${WANDB_LOCAL_API_KEY:-local-73813ba405c87d3b1ad539b4d31124351374cdb6}"
  set -x
  export WANDB_BASE_URL=http://33.180.4.104
}

# ---------------------------------------------------------------- 1) norm
# Compute normalization stats for each experiment whose stats file is missing.
stage_norm() {
  banner "compute norm stats"

  local entry name cfg exp extra stats
  for entry in "${EXPERIMENTS[@]}"; do
    IFS='|' read -r name cfg exp extra <<<"${entry}"
    stats="${PROJECT_ROOT}/assets/${cfg}/${ASSET_ID}/norm_stats.json"
    if [[ -f "${stats}" ]]; then
      echo "[skip norm] ${name}: ${stats}"
      continue
    fi
    echo "========== norm: ${name} =========="
    python scripts/compute_norm_stats.py \
      --config-name "${cfg}" \
      --repo-id "${DATA_ROOT}" \
      --asset-id "${ASSET_ID}" \
      --online-sliding-chunks \
      --online-sliding-speeds "${SPEEDS[@]}"
  done
}

# ---------------------------------------------------------------- 2) train
# Train each experiment through dlrover-run. Expects setup_cluster_env to
# have populated num_gpu, job_num, log_rank and torchft_lighthouse.
stage_train() {
  banner "train"

  # The original referenced an undefined ${log_dir}, which resolved to
  # "/<NODE_ID>". Default to the launcher log directory this script creates,
  # while still honouring a log_dir supplied by the environment.
  local run_log_dir="${log_dir:-${TORCHRUN_LOG_DIR}}/${NODE_ID:-0}"

  local entry name cfg exp extra
  local -a extra_args
  for entry in "${EXPERIMENTS[@]}"; do
    IFS='|' read -r name cfg exp extra <<<"${entry}"
    # Split the space-separated extra args into an array so they can be
    # passed without relying on unquoted expansion (no globbing).
    extra_args=()
    IFS=' ' read -r -a extra_args <<<"${extra}"

    echo "========== train: ${name} =========="
    PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True" \
    TORCHFT_LIGHTHOUSE="${torchft_lighthouse}" \
      dlrover-run --network-check --max_restarts=3 \
      --rdzv_conf join_timeout=10800 \
      --log_dir "${run_log_dir}" -r=3 \
      --nproc_per_node="${num_gpu}" \
      --nnodes="${job_num}" \
      --local-ranks-filter="${log_rank}" \
      --tee 3 \
      scripts/train_pytorch.py "${cfg}" \
      --exp-name "${exp}" \
      --pytorch-weight-path "${PI05_BASE}" \
      --batch-size "${BATCH_SIZE}" \
      --num-workers "${NUM_WORKERS}" \
      --num-train-steps "${NUM_TRAIN_STEPS}" \
      --log-interval "${LOG_INTERVAL}" \
      --save-interval "${SAVE_INTERVAL}" \
      --lr-schedule.peak-lr "${LR}" \
      --lr-schedule.decay-lr "${LR}" \
      --eval-speed-set "${SPEEDS[@]}" \
      --data.repo-id "${DATA_ROOT}" \
      --data.assets.asset-id "${ASSET_ID}" \
      --data.online-sliding-chunks \
      --data.online-sliding-speeds "${SPEEDS[@]}" \
      --model.pytorch-compile-mode "${COMPILE_MODE}" \
      ${extra_args[@]+"${extra_args[@]}"} \
      --overwrite
  done
}

# ---------------------------------------------------------------- 3) eval
# For each experiment: start NUM_GPUS policy servers (one per GPU index, on
# ports BASE_PORT..BASE_PORT+NUM_GPUS-1), run the eval script once per
# EVAL_SPEEDS entry, then stop the servers.
stage_eval() {
  banner "eval"

  local entry name cfg exp extra
  local ckpt_dir srv_log_dir rank port log_file
  local i speed tag results_dir
  for entry in "${EXPERIMENTS[@]}"; do
    IFS='|' read -r name cfg exp extra <<<"${entry}"
    ckpt_dir="$(latest_ckpt_dir "${cfg}" "${exp}")"
    if [[ ! -d "${ckpt_dir}" ]]; then
      die "checkpoint for eval does not exist: ${ckpt_dir}"
    fi

    echo "========== eval: ${name}  ckpt=${ckpt_dir} =========="

    # spin up one policy server per GPU
    SERVER_PIDS=()
    srv_log_dir="${SERVER_LOG_DIR}/${name}"
    mkdir -p "${srv_log_dir}"
    for ((rank = 0; rank < NUM_GPUS; rank++)); do
      port=$((BASE_PORT + rank))
      log_file="${srv_log_dir}/gpu${rank}.log"
      echo "  -> server gpu${rank} port=${port} log=${log_file}"
      CUDA_VISIBLE_DEVICES="${rank}" \
        python scripts/serve_policy.py policy:checkpoint \
        --policy.config "${cfg}" \
        --policy.dir "${ckpt_dir}" \
        --port "${port}" \
        >"${log_file}" 2>&1 &
      SERVER_PIDS+=("$!")
    done

    wait_for_servers "${srv_log_dir}"

    # run eval for each speed
    for i in "${!EVAL_SPEEDS[@]}"; do
      speed="${EVAL_SPEEDS[$i]}"
      tag="${EVAL_SPEED_TAGS[$i]}"
      results_dir="${RESULTS_DIR}/${exp}/speed_${tag}"
      echo "  -> eval speed=${speed} tag=${tag} -> ${results_dir}"
      SPEED="${speed}" \
      BASE_PORT="${BASE_PORT}" \
      HOST="${HOST}" \
      NUM_TRIALS="${NUM_TRIALS}" \
      SAVE_VIDEOS="0" \
      PYTHON_CMD="python" \
      RESULTS_DIR="${results_dir}" \
        ./scripts/eval_libero_8gpu.sh
    done

    cleanup_servers
  done
}

# ---------------------------------------------------------------- main
main() {
  # ---------------------------------------------------------------- env
  cd "${PROJECT_ROOT}" || die "cannot cd to ${PROJECT_ROOT}"
  # export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
  export WANDB__SERVICE_WAIT="${WANDB__SERVICE_WAIT:-300}"
  unset WANDB_API_KEY WANDB_API_KEY_FILE || true

  mkdir -p "${LOG_DIR}" "${TORCHRUN_LOG_DIR}" "${SERVER_LOG_DIR}" "${RESULTS_DIR}"

  # Servers are torn down on every exit path. INT/TERM must terminate the
  # script (which then fires the EXIT trap); a handler that only cleans up
  # would let the remaining stages keep running without servers.
  trap cleanup_servers EXIT
  trap 'exit 130' INT
  trap 'exit 143' TERM

  stage_norm
  setup_cluster_env
  stage_train
  stage_eval

  echo "All stages completed."
}

main "$@"