#!/usr/bin/env bash
#
# Speed-embedding ablation pipeline.
# For every entry in EXPERIMENTS the script computes norm stats, trains with
# dlrover-run, then serves the newest checkpoint on each GPU and evaluates it
# at every speed in EVAL_SPEEDS.

# Echo each command before it runs so job logs record the exact invocations.
set -x

# Files and directories created from here on carry no permissions for "other".
umask 007

# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------
# Repository checkout; the script cds here and calls scripts/... relative to it.
PROJECT_ROOT="/robby/share/Robotics/zhangtianqi/code/VLAwithVariousSpeed"
# Dataset location, passed as --repo-id / --data.repo-id.
DATA_ROOT="/robby/share/Robotics/zhangtianqi/cache/huggingface/lerobot/your_hf_username/libero"
# Base weights, passed as --pytorch-weight-path.
PI05_BASE="/robby/share/Robotics/zhangtianqi/model/pi_base_models_torch/pi05_base_torch"

# ---------------------------------------------------------------------------
# Speeds
# ---------------------------------------------------------------------------
# Asset id under which norm stats are stored and looked up.
ASSET_ID="online_sliding_speed_embed_0p75_1p0_1p25_1p5_pi05"
# Speeds used for norm stats and training.
SPEEDS=(0.75 1 1.25 1.5)
# Speeds evaluated after training.
EVAL_SPEEDS=(0.75 1 1.25 1.5)
# Result-directory tag for each eval speed, derived so the two arrays can never
# drift apart: "." becomes "p" and an "x" is appended (0.75 -> 0p75x, 1 -> 1x).
EVAL_SPEED_TAGS=()
for _eval_speed in "${EVAL_SPEEDS[@]}"; do
  EVAL_SPEED_TAGS+=("${_eval_speed//./p}x")
done
unset _eval_speed

# ---------------------------------------------------------------------------
# Training / evaluation hyperparameters
# ---------------------------------------------------------------------------
NUM_GPUS=8              # number of policy servers started for eval (one per GPU)
BATCH_SIZE=512          # --batch-size
LR=1e-4                 # used for both --lr-schedule.peak-lr and .decay-lr
NUM_WORKERS=2           # --num-workers
NUM_TRAIN_STEPS=30000   # --num-train-steps; also the checkpoint fallback step
LOG_INTERVAL=100        # --log-interval
SAVE_INTERVAL=5000      # --save-interval
COMPILE_MODE="None"     # --model.pytorch-compile-mode
NUM_TRIALS=50           # forwarded to the eval script as NUM_TRIALS
BASE_PORT=8000          # policy server for GPU i listens on BASE_PORT + i
HOST="0.0.0.0"          # forwarded to the eval script as HOST
SERVER_WAIT_SECONDS=120 # fixed wait after launching the policy servers

# ---------------------------------------------------------------------------
# Output locations
# ---------------------------------------------------------------------------
LOG_DIR="${PROJECT_ROOT}/logs/speed_embedding_ablation"
RESULTS_DIR="${PROJECT_ROOT}/results/speed_embedding_ablation"
TORCHRUN_LOG_DIR="${LOG_DIR}/torchrun"
SERVER_LOG_DIR="${LOG_DIR}/servers"
|
|
| |
# Experiment table. Each entry is one '|'-separated record with four fields:
#   name | config name | experiment name | extra training CLI flags
# The stage loops split a record with `IFS='|' read -r name cfg exp extra`;
# the extra flags are split on whitespace before being handed to the trainer,
# so individual flag values must not contain spaces.
EXPERIMENTS=()

# Variant using `--data.speed-integration modulation`.
EXPERIMENTS+=("modulation|pi05_libero_speed_embed_modulation|pi05_online_sliding_speed_embed_modulation_bs512_lr1e4|--data.speed-integration modulation --model.speed-modulation")

# Variant using `--data.speed-integration soft_prompt`.
EXPERIMENTS+=("soft_prompt|pi05_libero_speed_embed_softprompt_p8|pi05_online_sliding_speed_embed_softprompt_p8_bs512_lr1e4|--data.speed-integration soft_prompt --model.soft-prompt-p 8 --model.soft-prompt-speeds 0.75 1 1.25 1.5")
|
|
| |
# Every later stage calls repo-relative paths (scripts/...), so a failed cd
# must stop the run instead of executing from the caller's directory.
cd "${PROJECT_ROOT}" || {
  echo "ERROR: cannot cd to PROJECT_ROOT: ${PROJECT_ROOT}" >&2
  exit 1
}

# Default WANDB__SERVICE_WAIT to 300 unless the caller already exported a value.
export WANDB__SERVICE_WAIT="${WANDB__SERVICE_WAIT:-300}"
# Drop wandb credentials inherited from the caller's environment.
# NOTE(review): WANDB_API_KEY is exported again further down, before the train
# stage, so this only affects the norm-stats stage.
unset WANDB_API_KEY WANDB_API_KEY_FILE || true

# Log and result directories must exist before any stage redirects into them.
mkdir -p "${LOG_DIR}" "${TORCHRUN_LOG_DIR}" "${SERVER_LOG_DIR}" "${RESULTS_DIR}" || {
  echo "ERROR: cannot create log/result directories under ${PROJECT_ROOT}" >&2
  exit 1
}
|
|
| |
# PIDs of the policy servers currently running in the background. The eval
# stage appends to this array; cleanup_servers drains it.
SERVER_PIDS=()

#######################################
# Stop every policy server recorded in SERVER_PIDS.
# Sends SIGTERM to each PID, polls once per second (at most 30 times) until
# none of them is alive, then sends SIGKILL to any survivor.
# Globals:   SERVER_PIDS (read, then reset to an empty array)
# Arguments: none
# Outputs:   one status line on stdout when there is something to stop
# Returns:   0
#######################################
cleanup_servers() {
  # Declared local so the caller's global loop variables are left untouched.
  local pid alive attempt

  if [ "${#SERVER_PIDS[@]}" -eq 0 ]; then
    return 0
  fi

  echo "Stopping policy servers: ${SERVER_PIDS[*]}"
  for pid in "${SERVER_PIDS[@]}"; do
    # Best effort: the process may already be gone.
    kill -TERM "${pid}" 2>/dev/null || true
  done

  # Grace period before escalating to SIGKILL.
  for ((attempt = 0; attempt < 30; attempt++)); do
    alive=0
    for pid in "${SERVER_PIDS[@]}"; do
      if kill -0 "${pid}" 2>/dev/null; then
        alive=1
        break
      fi
    done
    if [ "${alive}" -eq 0 ]; then
      break
    fi
    sleep 1
  done

  for pid in "${SERVER_PIDS[@]}"; do
    if kill -0 "${pid}" 2>/dev/null; then
      kill -KILL "${pid}" 2>/dev/null || true
    fi
  done

  SERVER_PIDS=()
  return 0
}
|
|
# Always stop the policy servers when the script ends.
# INT/TERM must also end the script: a handler that only cleans up would hand
# control back to the interrupted stage loop, which would then keep running.
# `exit` inside the signal traps fires the EXIT trap, so cleanup runs once.
# Exit codes follow the 128 + signal-number convention.
trap cleanup_servers EXIT
trap 'exit 130' INT
trap 'exit 143' TERM
|
|
#######################################
# Print the checkpoint directory to evaluate for one experiment.
# Looks directly under ${PROJECT_ROOT}/checkpoints/<config>/<experiment> for
# sub-directories whose names consist only of digits and prints the one with
# the numerically largest name. When that root is missing or holds no such
# sub-directory, prints <root>/<NUM_TRAIN_STEPS - 1> instead; the printed
# path is therefore not guaranteed to exist.
# Globals:   PROJECT_ROOT, NUM_TRAIN_STEPS (read)
# Arguments: $1 - config name
#            $2 - experiment name
# Outputs:   the chosen path on stdout
#######################################
latest_ckpt_dir() {
  local ckpt_root="${PROJECT_ROOT}/checkpoints/$1/$2"
  local newest_step=""

  if [ -d "${ckpt_root}" ]; then
    # Numeric sort so that e.g. 10000 ranks above 5000.
    newest_step="$(find "${ckpt_root}" -mindepth 1 -maxdepth 1 -type d \
      -regex '.*/[0-9]+' -printf '%f\n' 2>/dev/null \
      | sort -n | tail -n 1 || true)"
  fi

  if [ -n "${newest_step}" ]; then
    echo "${ckpt_root}/${newest_step}"
  else
    echo "${ckpt_root}/$((NUM_TRAIN_STEPS - 1))"
  fi
}
|
|
| |
# Stage 1: compute normalization statistics once per experiment config.
# An experiment is skipped when its norm_stats.json already exists.
echo "=========================================================="
echo "Stage: compute norm stats"
echo "=========================================================="
for entry in "${EXPERIMENTS[@]}"; do
  # Record layout: name|config|experiment|extra flags (exp/extra unused here).
  IFS='|' read -r name cfg exp extra <<<"${entry}"
  stats="${PROJECT_ROOT}/assets/${cfg}/${ASSET_ID}/norm_stats.json"
  if [ -f "${stats}" ]; then
    echo "[skip norm] ${name}: ${stats}"
    continue
  fi
  echo "========== norm: ${name} =========="
  # Stop here on failure: the train stage must not start without the stats
  # this step was supposed to produce.
  if ! python scripts/compute_norm_stats.py \
    --config-name "${cfg}" \
    --repo-id "${DATA_ROOT}" \
    --asset-id "${ASSET_ID}" \
    --online-sliding-chunks \
    --online-sliding-speeds "${SPEEDS[@]}"; then
    echo "ERROR: norm stats computation failed for ${name} (${cfg})" >&2
    exit 1
  fi
done
|
|
|
|
| |
# ---------------------------------------------------------------------------
# Launcher settings; each can be overridden from the environment.
# ---------------------------------------------------------------------------
NGPU=${NGPU:-"8"}
# NOTE(review): PORT is not referenced anywhere else in this script.
PORT=${PORT:-"1106"}
LOG_RANK=${LOG_RANK:-"0"}
TORCHFT_LIGHTHOUSE=${TORCHFT_LIGHTHOUSE:-"http://localhost:29510"}

# Lower-case aliases consumed by the train stage.
num_gpu=${NGPU}
log_rank=${LOG_RANK}
torchft_lighthouse=${TORCHFT_LIGHTHOUSE}
# WORKER_NUM and RANK are presumably injected by the cluster scheduler
# (TODO confirm). Default to a single node with rank 0 so a run outside the
# cluster does not pass an empty --nnodes= to the launcher.
job_num=${WORKER_NUM:-1}
job_id=${RANK:-0}

echo "multiple nodes training: ${WORLD_SIZE:-unset}, ${job_id}"

# ---------------------------------------------------------------------------
# NCCL / Gloo networking.
# NOTE(review): these values look tuned for one specific cluster (interface
# "bond1", HCA "mlx5_bond") - confirm before running elsewhere.
# ---------------------------------------------------------------------------
export NCCL_IB_TC=136
export NCCL_IB_SL=5
export NCCL_IB_GID_INDEX=3
export NCCL_SOCKET_IFNAME=bond1
export NCCL_DEBUG=INFO
export NCCL_IB_HCA=mlx5_bond
export NCCL_IB_TIMEOUT=20
export NCCL_NET_PLUGIN=none
export NCCL_IB_QPS_PER_CONNECTION=8
export NCCL_IB_SPLIT_DATA_ON_QPS=1
export NCCL_MIN_NCHANNELS=4
export GLOO_SOCKET_IFNAME=bond1
export TOKENIZERS_PARALLELISM=false

# ---------------------------------------------------------------------------
# wandb endpoint and credentials.
# NOTE(review): the API key is committed in plain text; prefer loading it from
# a protected file or the job's secret store. Command tracing is paused around
# the export so `set -x` does not additionally copy the key into every log.
# ---------------------------------------------------------------------------
case "$-" in
  *x*) _xtrace_was_on=1 ;;
  *) _xtrace_was_on=0 ;;
esac
{ set +x; } 2>/dev/null
export WANDB_API_KEY=local-73813ba405c87d3b1ad539b4d31124351374cdb6
if [ "${_xtrace_was_on}" -eq 1 ]; then
  set -x
fi
unset _xtrace_was_on
export WANDB_BASE_URL=http://33.180.4.104
|
|
| |
# Stage 2: train every experiment with dlrover-run.
echo "=========================================================="
echo "Stage: train"
echo "=========================================================="
for entry in "${EXPERIMENTS[@]}"; do
  # Record layout: name|config|experiment|extra flags.
  IFS='|' read -r name cfg exp extra <<<"${entry}"
  echo "========== train: ${name} =========="

  # Split the experiment-specific flags on whitespace into an array so they
  # can be passed quoted, without relying on unquoted expansion.
  read -r -a extra_args <<<"${extra}"

  # Per-node launcher logs live under TORCHRUN_LOG_DIR. The previous
  # `${log_dir}` was never defined, which resolved the path to "/<NODE_ID>".
  train_log_dir="${TORCHRUN_LOG_DIR}/${NODE_ID:-${RANK:-0}}"

  # Abort on failure: the eval stage would otherwise pick up whatever older
  # checkpoint happens to exist and report it as this run's result.
  if ! PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True" \
    TORCHFT_LIGHTHOUSE="${torchft_lighthouse}" \
    dlrover-run --network-check --max_restarts=3 \
    --rdzv_conf join_timeout=10800 \
    --log_dir "${train_log_dir}" -r=3 \
    --nproc_per_node="${num_gpu}" \
    --nnodes="${job_num}" \
    --local-ranks-filter="${log_rank}" \
    --tee 3 \
    scripts/train_pytorch.py "${cfg}" \
    --exp-name "${exp}" \
    --pytorch-weight-path "${PI05_BASE}" \
    --batch-size "${BATCH_SIZE}" \
    --num-workers "${NUM_WORKERS}" \
    --num-train-steps "${NUM_TRAIN_STEPS}" \
    --log-interval "${LOG_INTERVAL}" \
    --save-interval "${SAVE_INTERVAL}" \
    --lr-schedule.peak-lr "${LR}" \
    --lr-schedule.decay-lr "${LR}" \
    --eval-speed-set "${SPEEDS[@]}" \
    --data.repo-id "${DATA_ROOT}" \
    --data.assets.asset-id "${ASSET_ID}" \
    --data.online-sliding-chunks \
    --data.online-sliding-speeds "${SPEEDS[@]}" \
    --model.pytorch-compile-mode "${COMPILE_MODE}" \
    "${extra_args[@]}" \
    --overwrite; then
    echo "ERROR: training failed for ${name} (${exp})" >&2
    exit 1
  fi
done
|
|
| |
# Stage 3: for each experiment, serve its newest checkpoint on every GPU and
# run the LIBERO eval script once per eval speed.
echo "=========================================================="
echo "Stage: eval"
echo "=========================================================="
eval_failures=0
for entry in "${EXPERIMENTS[@]}"; do
  # Record layout: name|config|experiment|extra flags (extra unused here).
  IFS='|' read -r name cfg exp extra <<<"${entry}"
  ckpt_dir="$(latest_ckpt_dir "${cfg}" "${exp}")"
  if [ ! -d "${ckpt_dir}" ]; then
    echo "ERROR: checkpoint for eval does not exist: ${ckpt_dir}" >&2
    exit 1
  fi
  echo "========== eval: ${name} ckpt=${ckpt_dir} =========="

  # Launch one policy server per GPU; server i listens on BASE_PORT + i.
  SERVER_PIDS=()
  srv_log_dir="${SERVER_LOG_DIR}/${name}"
  mkdir -p "${srv_log_dir}"
  for ((rank = 0; rank < NUM_GPUS; rank++)); do
    port=$((BASE_PORT + rank))
    log_file="${srv_log_dir}/gpu${rank}.log"
    echo " -> server gpu${rank} port=${port} log=${log_file}"
    CUDA_VISIBLE_DEVICES="${rank}" \
      python scripts/serve_policy.py policy:checkpoint \
      --policy.config "${cfg}" \
      --policy.dir "${ckpt_dir}" \
      --port "${port}" \
      >"${log_file}" 2>&1 &
    SERVER_PIDS+=("$!")
  done

  echo "Waiting ${SERVER_WAIT_SECONDS}s for policy servers to load..."
  sleep "${SERVER_WAIT_SECONDS}"

  # A server that died while loading would make every eval below fail in a
  # far less obvious way, so stop now and point at its log. The EXIT trap
  # takes care of the servers that are still running.
  for idx in "${!SERVER_PIDS[@]}"; do
    if ! kill -0 "${SERVER_PIDS[$idx]}" 2>/dev/null; then
      echo "ERROR: policy server gpu${idx} exited during startup; see ${srv_log_dir}/gpu${idx}.log" >&2
      exit 1
    fi
  done

  # EVAL_SPEEDS and EVAL_SPEED_TAGS are parallel arrays sharing index i.
  for i in "${!EVAL_SPEEDS[@]}"; do
    speed="${EVAL_SPEEDS[$i]}"
    tag="${EVAL_SPEED_TAGS[$i]}"
    results_dir="${RESULTS_DIR}/${exp}/speed_${tag}"
    echo " -> eval speed=${speed} tag=${tag} -> ${results_dir}"
    # Settings are handed to the eval script through its environment.
    if ! SPEED="${speed}" \
      BASE_PORT="${BASE_PORT}" \
      HOST="${HOST}" \
      NUM_TRIALS="${NUM_TRIALS}" \
      SAVE_VIDEOS="0" \
      PYTHON_CMD="python" \
      RESULTS_DIR="${results_dir}" \
      ./scripts/eval_libero_8gpu.sh; then
      # Keep going so the remaining speeds still get evaluated, but remember
      # the failure so the script does not finish with a success status.
      echo "ERROR: eval failed for ${name} at speed=${speed}" >&2
      eval_failures=$((eval_failures + 1))
    fi
  done

  cleanup_servers
done

if [ "${eval_failures}" -gt 0 ]; then
  echo "ERROR: ${eval_failures} eval run(s) failed." >&2
  exit 1
fi

echo "All stages completed."
|
|