#!/usr/bin/env bash
# Equivalent of:
# python scripts/run_speed_embedding_ablation.py \
# --data-root /robby/share/Robotics/zhangtianqi/datasets/lerobot/libero \
# --pi05-base /robby/share/Robotics/zhangtianqi/model/pi_base_models_torch/pi05_base_torch \
# --batch-size 512 --lr 1e-4 --num-gpus 8 --num-workers 2 \
# --num-train-steps 30000 --eval-speeds 0.75 1.0 1.25 1.5 --num-trials 50
#
# Runs the speed-integration ablations listed in EXPERIMENTS end-to-end:
# norm stats -> train (multi-GPU dlrover-run) -> serve + eval at 4 speeds.
set -x
umask 007
# ---------------------------------------------------------------- config
PROJECT_ROOT="/robby/share/Robotics/zhangtianqi/code/VLAwithVariousSpeed"
DATA_ROOT="/robby/share/Robotics/zhangtianqi/cache/huggingface/lerobot/your_hf_username/libero"
PI05_BASE="/robby/share/Robotics/zhangtianqi/model/pi_base_models_torch/pi05_base_torch"
ASSET_ID="online_sliding_speed_embed_0p75_1p0_1p25_1p5_pi05"
SPEEDS=(0.75 1 1.25 1.5) # used in CLI args
EVAL_SPEEDS=(0.75 1 1.25 1.5) # eval rollouts
EVAL_SPEED_TAGS=(0p75x 1x 1p25x 1p5x)
NUM_GPUS=8
BATCH_SIZE=512
LR=1e-4
NUM_WORKERS=2
NUM_TRAIN_STEPS=30000
LOG_INTERVAL=100
SAVE_INTERVAL=5000
COMPILE_MODE="None"
NUM_TRIALS=50
BASE_PORT=8000
HOST="0.0.0.0"
SERVER_WAIT_SECONDS=120
LOG_DIR="${PROJECT_ROOT}/logs/speed_embedding_ablation"
RESULTS_DIR="${PROJECT_ROOT}/results/speed_embedding_ablation"
TORCHRUN_LOG_DIR="${LOG_DIR}/torchrun"
SERVER_LOG_DIR="${LOG_DIR}/servers"
# experiment name | train-config | exp-name | extra train args (space-separated)
EXPERIMENTS=(
"modulation|pi05_libero_speed_embed_modulation|pi05_online_sliding_speed_embed_modulation_bs512_lr1e4|--data.speed-integration modulation --model.speed-modulation"
"soft_prompt|pi05_libero_speed_embed_softprompt_p8|pi05_online_sliding_speed_embed_softprompt_p8_bs512_lr1e4|--data.speed-integration soft_prompt --model.soft-prompt-p 8 --model.soft-prompt-speeds 0.75 1 1.25 1.5"
)
# ---------------------------------------------------------------- env
cd "${PROJECT_ROOT}"
# export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
export WANDB__SERVICE_WAIT="${WANDB__SERVICE_WAIT:-300}"
unset WANDB_API_KEY WANDB_API_KEY_FILE || true
mkdir -p "${LOG_DIR}" "${TORCHRUN_LOG_DIR}" "${SERVER_LOG_DIR}" "${RESULTS_DIR}"
# ---------------------------------------------------------------- helpers
SERVER_PIDS=()
cleanup_servers() {
if [ "${#SERVER_PIDS[@]}" -eq 0 ]; then
return
fi
echo "Stopping policy servers: ${SERVER_PIDS[*]}"
for pid in "${SERVER_PIDS[@]}"; do
kill -TERM "${pid}" 2>/dev/null || true
done
# wait up to 30s, then SIGKILL
for _ in $(seq 1 30); do
local alive=0
for pid in "${SERVER_PIDS[@]}"; do
if kill -0 "${pid}" 2>/dev/null; then alive=1; fi
done
[ "${alive}" -eq 0 ] && break
sleep 1
done
for pid in "${SERVER_PIDS[@]}"; do
if kill -0 "${pid}" 2>/dev/null; then
kill -KILL "${pid}" 2>/dev/null || true
fi
done
SERVER_PIDS=()
}
trap cleanup_servers EXIT INT TERM
latest_ckpt_dir() {
# $1 = train_config, $2 = exp_name
local root="${PROJECT_ROOT}/checkpoints/$1/$2"
if [ ! -d "${root}" ]; then
echo "${root}/$((NUM_TRAIN_STEPS - 1))"
return
fi
local latest
latest="$(find "${root}" -mindepth 1 -maxdepth 1 -type d -regex '.*/[0-9]+' \
-printf '%f\n' 2>/dev/null | sort -n | tail -n 1 || true)"
if [ -z "${latest}" ]; then
echo "${root}/$((NUM_TRAIN_STEPS - 1))"
else
echo "${root}/${latest}"
fi
}
# ---------------------------------------------------------------- 1) norm
echo "=========================================================="
echo "Stage: compute norm stats"
echo "=========================================================="
for entry in "${EXPERIMENTS[@]}"; do
IFS='|' read -r name cfg exp extra <<<"${entry}"
stats="${PROJECT_ROOT}/assets/${cfg}/${ASSET_ID}/norm_stats.json"
if [ -f "${stats}" ]; then
echo "[skip norm] ${name}: ${stats}"
continue
fi
echo "========== norm: ${name} =========="
python scripts/compute_norm_stats.py \
--config-name "${cfg}" \
--repo-id "${DATA_ROOT}" \
--asset-id "${ASSET_ID}" \
--online-sliding-chunks \
--online-sliding-speeds "${SPEEDS[@]}"
done
########### Multinode setting ###########
NGPU=${NGPU:-"8"}
PORT=${PORT:-"1106"}
LOG_RANK=${LOG_RANK:-"0"}
TORCHFT_LIGHTHOUSE=${TORCHFT_LIGHTHOUSE:-"http://localhost:29510"}
sleep 1
## node setting
num_gpu=${NGPU}
log_rank=${LOG_RANK}
torchft_lighthouse=${TORCHFT_LIGHTHOUSE}
job_num=${WORKER_NUM}
job_id=${RANK}
echo "multiple nodes training: ${WORLD_SIZE}, ${RANK}"
## ib setting
export NCCL_IB_TC=136
export NCCL_IB_SL=5
export NCCL_IB_GID_INDEX=3
export NCCL_SOCKET_IFNAME=bond1
export NCCL_DEBUG=INFO
export NCCL_IB_HCA=mlx5_bond
export NCCL_IB_TIMEOUT=20
export NCCL_NET_PLUGIN=none
export NCCL_IB_QPS_PER_CONNECTION=8
export NCCL_IB_SPLIT_DATA_ON_QPS=1
export NCCL_MIN_NCHANNELS=4
export GLOO_SOCKET_IFNAME=bond1
export TOKENIZERS_PARALLELISM=false
## wandb setting
export WANDB_API_KEY=local-73813ba405c87d3b1ad539b4d31124351374cdb6
export WANDB_BASE_URL=http://33.180.4.104
# ---------------------------------------------------------------- 2) train
echo "=========================================================="
echo "Stage: train"
echo "=========================================================="
for entry in "${EXPERIMENTS[@]}"; do
IFS='|' read -r name cfg exp extra <<<"${entry}"
echo "========== train: ${name} =========="
# shellcheck disable=SC2086
PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True" TORCHFT_LIGHTHOUSE=${torchft_lighthouse} \
dlrover-run --network-check --max_restarts=3 --rdzv_conf join_timeout=10800 --log_dir ${log_dir}/${NODE_ID} -r=3 \
--nproc_per_node=${num_gpu} \
--nnodes=${job_num} \
--local-ranks-filter=${log_rank} \
--tee 3 \
scripts/train_pytorch.py "${cfg}" \
--exp-name "${exp}" \
--pytorch-weight-path "${PI05_BASE}" \
--batch-size "${BATCH_SIZE}" \
--num-workers "${NUM_WORKERS}" \
--num-train-steps "${NUM_TRAIN_STEPS}" \
--log-interval "${LOG_INTERVAL}" \
--save-interval "${SAVE_INTERVAL}" \
--lr-schedule.peak-lr "${LR}" \
--lr-schedule.decay-lr "${LR}" \
--eval-speed-set "${SPEEDS[@]}" \
--data.repo-id "${DATA_ROOT}" \
--data.assets.asset-id "${ASSET_ID}" \
--data.online-sliding-chunks \
--data.online-sliding-speeds "${SPEEDS[@]}" \
--model.pytorch-compile-mode "${COMPILE_MODE}" \
${extra} \
--overwrite
done
# ---------------------------------------------------------------- 3) eval
echo "=========================================================="
echo "Stage: eval"
echo "=========================================================="
for entry in "${EXPERIMENTS[@]}"; do
IFS='|' read -r name cfg exp extra <<<"${entry}"
ckpt_dir="$(latest_ckpt_dir "${cfg}" "${exp}")"
if [ ! -d "${ckpt_dir}" ]; then
echo "ERROR: checkpoint for eval does not exist: ${ckpt_dir}" >&2
exit 1
fi
echo "========== eval: ${name} ckpt=${ckpt_dir} =========="
# spin up 8 policy servers, one per GPU, ports BASE_PORT..BASE_PORT+7
SERVER_PIDS=()
srv_log_dir="${SERVER_LOG_DIR}/${name}"
mkdir -p "${srv_log_dir}"
for rank in $(seq 0 $((NUM_GPUS - 1))); do
port=$((BASE_PORT + rank))
log_file="${srv_log_dir}/gpu${rank}.log"
echo " -> server gpu${rank} port=${port} log=${log_file}"
CUDA_VISIBLE_DEVICES="${rank}" \
python scripts/serve_policy.py policy:checkpoint \
--policy.config "${cfg}" \
--policy.dir "${ckpt_dir}" \
--port "${port}" \
>"${log_file}" 2>&1 &
SERVER_PIDS+=("$!")
done
echo "Waiting ${SERVER_WAIT_SECONDS}s for policy servers to load..."
sleep "${SERVER_WAIT_SECONDS}"
# run eval for each speed
for i in "${!EVAL_SPEEDS[@]}"; do
speed="${EVAL_SPEEDS[$i]}"
tag="${EVAL_SPEED_TAGS[$i]}"
results_dir="${RESULTS_DIR}/${exp}/speed_${tag}"
echo " -> eval speed=${speed} tag=${tag} -> ${results_dir}"
SPEED="${speed}" \
BASE_PORT="${BASE_PORT}" \
HOST="${HOST}" \
NUM_TRIALS="${NUM_TRIALS}" \
SAVE_VIDEOS="0" \
PYTHON_CMD="python" \
RESULTS_DIR="${results_dir}" \
./scripts/eval_libero_8gpu.sh
done
cleanup_servers
done
echo "All stages completed."
|