# Equivalent of:
#   python scripts/run_speed_embedding_ablation.py \
#     --data-root /robby/share/Robotics/zhangtianqi/datasets/lerobot/libero \
#     --pi05-base /robby/share/Robotics/zhangtianqi/model/pi_base_models_torch/pi05_base_torch \
#     --batch-size 512 --lr 1e-4 --num-gpus 8 --num-workers 2 \
#     --num-train-steps 30000 --eval-speeds 0.75 1.0 1.25 1.5 --num-trials 50
#
# Full pipeline: three speed-integration ablations run end-to-end:
#   norm stats -> train (8-GPU torchrun) -> serve + eval at 4 speeds.
# NOTE: the only stage present in this script is eval; it runs for the entries
# enabled in EXPERIMENTS at every speed listed in EVAL_SPEEDS.
# Strict mode: abort on command failure, on use of an unset variable, and on a
# failure in any stage of a pipeline.
set -euo pipefail

# ---------------------------------------------------------------- config
PROJECT_ROOT="/robby/share/Robotics/zhangtianqi/code/VLAwithVariousSpeed"
# NOTE(review): DATA_ROOT, PI05_BASE and SPEEDS are not referenced by the eval
# stage in this file; presumably they belong to the norm-stats/train stages.
DATA_ROOT="/robby/share/Robotics/zhangtianqi/cache/huggingface/lerobot/your_hf_username/libero"
PI05_BASE="/robby/share/Robotics/zhangtianqi/model/pi_base_models_torch/pi05_base_torch"

# Speed presets. Exactly one group (ASSET_ID + SPEEDS + EVAL_SPEEDS +
# EVAL_SPEED_TAGS) should be active; the others are kept commented out.
# ASSET_ID="online_sliding_speed_embed_0p5_1p0_1p5_2p0_pi05"
# SPEEDS=(0.5 1 1.5 2.0) # used in CLI args
# EVAL_SPEEDS=(0.5 1 1.5 2.0) # eval rollouts
# EVAL_SPEED_TAGS=(0p5x 1x 1p5x 2p0x)
# ASSET_ID="online_sliding_speed_embed_1p0_pi05"
# SPEEDS=(1) # used in CLI args
# EVAL_SPEEDS=(1) # eval rollouts
# EVAL_SPEED_TAGS=(1x)
# ASSET_ID="online_sliding_speed_embed_0p25_0p5_1p0_2p0_4p0_pi05"
# SPEEDS=(0.25 0.5 1.0 2.0 4.0) # used in CLI args
# EVAL_SPEEDS=(0.25 0.5 1.0 2.0 4.0) # eval rollouts
# EVAL_SPEED_TAGS=(0p25x 0p5x 1p0x 2p0x 4p0x)
ASSET_ID="online_sliding_speed_embed_0p5_0p75_1p0_1p25_1p5_1p75_2p0_pi05"
SPEEDS=(0.5 0.75 1.0 1.25 1.5 1.75 2.0) # used in CLI args
EVAL_SPEEDS=(0.5 0.75 1.0 1.25 1.5 1.75 2.0) # eval rollouts
EVAL_SPEED_TAGS=(0p5x 0p75x 1x 1p25x 1p5x 1p75x 2p0x)

# EVAL_SPEEDS[i] is paired with EVAL_SPEED_TAGS[i] (the tag names the results
# sub-directory), so the two arrays must line up. Fail here rather than hours
# into an eval run.
if ((${#EVAL_SPEEDS[@]} != ${#EVAL_SPEED_TAGS[@]})); then
  echo "ERROR: EVAL_SPEEDS has ${#EVAL_SPEEDS[@]} entries but EVAL_SPEED_TAGS has ${#EVAL_SPEED_TAGS[@]}" >&2
  exit 1
fi

NUM_GPUS=8
NUM_TRAIN_STEPS=30000
NUM_TRIALS=50
BASE_PORT=8020
HOST="localhost"
SERVER_WAIT_SECONDS=120

LOG_DIR="${PROJECT_ROOT}/logs/speed_embedding_ablation"
RESULTS_DIR="${PROJECT_ROOT}/results/speed_embedding_ablation"
TORCHRUN_LOG_DIR="${LOG_DIR}/torchrun"
SERVER_LOG_DIR="${LOG_DIR}/servers"

# experiment name | train-config | exp-name | extra train args (space-separated)
EXPERIMENTS=(
  #"text|pi05_libero_speed_embed_text|0510_pi05_online_sliding_speed_embed_text_bs512_lr1e4|--data.speed-integration text"
  #"modulation|pi05_libero_speed_embed_modulation|0510_pi05_online_sliding_speed_embed_modulation_bs512_lr1e4|--data.speed-integration modulation --model.speed-modulation"
  #"soft_prompt|pi05_libero_speed_embed_softprompt_p8|0510_pi05_online_sliding_speed_embed_softprompt_p8_bs512_lr1e4|--data.speed-integration soft_prompt --model.soft-prompt-p 8 --model.soft-prompt-speeds 0.75 1 1.25 1.5"
  #"soft_prompt|pi05_libero_speed_embed_softprompt_p8|0511_pi05_online_sliding_speed_embed_softprompt_p4_bs512_lr1e4|--data.speed-integration soft_prompt --model.soft-prompt-p 4 --model.soft-prompt-speeds 0.75 1 1.25 1.5"
  #"soft_prompt|pi05_libero_speed_embed_softprompt_p8|0511_pi05_online_sliding_speed_embed_softprompt_p16_bs512_lr1e4|--data.speed-integration soft_prompt --model.soft-prompt-p 16 --model.soft-prompt-speeds 0.75 1 1.25 1.5"
  #"soft_prompt|pi05_libero_speed_embed_softprompt_p8|0511_pi05_online_sliding_speed_embed_softprompt_p32_bs512_lr1e4|--data.speed-integration soft_prompt --model.soft-prompt-p 32 --model.soft-prompt-speeds 0.75 1 1.25 1.5"
  #"text|pi05_libero_speed_embed_text|0513_pi05_online_sliding_speed_embed_text_0p5_1p0_1p5_2p0|--data.speed-integration text"
  #"text|pi05_libero_speed_embed_text|0513_pi05_online_sliding_speed_embed_text_1p0|--data.speed-integration text"
  #"text|pi05_libero_speed_embed_text|0513_pi05_online_sliding_speed_embed_text_0p25_0p5_1p0_2p0_4p0|--data.speed-integration text"
  "text|pi05_libero_speed_embed_text|0513_pi05_online_sliding_speed_embed_text_0p5_0p75_1p0_1p25_1p5_1p75_2p0|--data.speed-integration text"
)

# ---------------------------------------------------------------- env
cd "${PROJECT_ROOT}"

# Expose GPUs 0..NUM_GPUS-1. Built from NUM_GPUS so the device mask cannot
# drift from the GPU count (NUM_GPUS=8 yields "0,1,2,3,4,5,6,7").
CUDA_VISIBLE_DEVICES="0"
for ((gpu = 1; gpu < NUM_GPUS; gpu++)); do
  CUDA_VISIBLE_DEVICES+=",${gpu}"
done
export CUDA_VISIBLE_DEVICES

# MuJoCo offscreen rendering on GPU (EGL); otherwise LIBERO sim falls back to
# CPU software rendering and 8 concurrent envs will peg CPU while GPU stays idle.
export MUJOCO_GL="${MUJOCO_GL:-egl}"
export PYOPENGL_PLATFORM="${PYOPENGL_PLATFORM:-egl}"

# Disable torch.compile during eval. The train config ships with
# pytorch_compile_mode='max-autotune', whose first-call codegen + ptxas pass
# eats 5-15 min per server and stalls the eval clients waiting on the first
# infer response. For eval we don't need the squeezed-out throughput.
export TORCH_COMPILE_DISABLE="${TORCH_COMPILE_DISABLE:-1}"

mkdir -p "${LOG_DIR}" "${TORCHRUN_LOG_DIR}" "${SERVER_LOG_DIR}" "${RESULTS_DIR}"
# ---------------------------------------------------------------- helpers
# Intentionally no pkill-style (name-based) cleanup: we sometimes run several
# instances of this script concurrently against different checkpoints, and
# killing by process name would tear down a sibling run's servers/clients.
# Manage any stragglers manually (e.g. ps -ef | grep serve_policy) when needed.
#######################################
# Print the path of the newest checkpoint directory for an experiment.
#
# Looks at the immediate children of
#   ${PROJECT_ROOT}/checkpoints/<train_config>/<exp_name>
# and picks the real (non-symlink) directory whose name is all digits and has
# the largest numeric value. When the experiment directory is missing or holds
# no such child, prints the expected final-step path
#   <root>/$((NUM_TRAIN_STEPS - 1))
# instead; the caller is responsible for checking that the path exists.
#
# Implemented with shell globbing only, so it does not depend on GNU find's
# -regex / -printf extensions.
#
# Globals:   PROJECT_ROOT (read), NUM_TRAIN_STEPS (read)
# Arguments: $1 - train config name, $2 - experiment name
# Outputs:   the checkpoint directory path on stdout
#######################################
latest_ckpt_dir() {
  local root="${PROJECT_ROOT}/checkpoints/$1/$2"
  local fallback="${root}/$((NUM_TRAIN_STEPS - 1))"
  local newest="" candidate step

  # If ${root} is missing or empty the glob stays literal (or, under nullglob,
  # yields nothing); either way no candidate passes the checks below.
  for candidate in "${root}"/*; do
    step="${candidate##*/}"
    [[ -d "${candidate}" && ! -L "${candidate}" && "${step}" =~ ^[0-9]+$ ]] || continue
    # 10# forces base 10 so zero-padded step names are not parsed as octal.
    if [[ -z "${newest}" ]] || ((10#${step} > 10#${newest})); then
      newest="${step}"
    fi
  done

  if [[ -n "${newest}" ]]; then
    printf '%s\n' "${root}/${newest}"
  else
    printf '%s\n' "${fallback}"
  fi
}
# ---------------------------------------------------------------- 3) eval
# For every entry in EXPERIMENTS:
#   1. resolve the checkpoint directory via latest_ckpt_dir,
#   2. start NUM_GPUS policy servers (one per GPU, ports BASE_PORT..),
#   3. wait SERVER_WAIT_SECONDS for them to load,
#   4. run scripts/eval_libero_8gpu.sh once per entry of EVAL_SPEEDS,
#   5. stop the servers started in step 2 so the next experiment can bind the
#      same ports and is not silently evaluated against the previous checkpoint.
echo "=========================================================="
echo "Stage: eval"
echo "=========================================================="

# PIDs of the policy servers launched by THIS process for the current
# experiment, indexed by GPU rank.
server_pids=()

# Stop the servers recorded in server_pids, and nothing else. Only PIDs captured
# from $! are signalled (no pkill / name matching), so concurrently running
# copies of this script keep their own servers and clients.
stop_policy_servers() {
  if ((${#server_pids[@]} == 0)); then
    return 0
  fi
  kill "${server_pids[@]}" 2>/dev/null || true
  # Reap them so their ports are released before anything else is started.
  wait "${server_pids[@]}" 2>/dev/null || true
  server_pids=()
}
# Also covers early exits (set -e, missing checkpoint, failed eval).
trap stop_policy_servers EXIT

for entry in "${EXPERIMENTS[@]}"; do
  # Fields: name | train-config | exp-name | extra train args.
  # The 4th field is not needed for eval and is discarded.
  IFS='|' read -r name cfg exp _ <<<"${entry}"

  ckpt_dir="$(latest_ckpt_dir "${cfg}" "${exp}")"
  if [[ ! -d "${ckpt_dir}" ]]; then
    echo "ERROR: checkpoint for eval does not exist: ${ckpt_dir}" >&2
    exit 1
  fi
  echo "========== eval: ${name} ckpt=${ckpt_dir} =========="

  # spin up NUM_GPUS policy servers, one per GPU, ports BASE_PORT..BASE_PORT+NUM_GPUS-1
  srv_log_dir="${SERVER_LOG_DIR}/${name}"
  mkdir -p "${srv_log_dir}"
  for ((rank = 0; rank < NUM_GPUS; rank++)); do
    port=$((BASE_PORT + rank))
    log_file="${srv_log_dir}/gpu${rank}.log"
    echo " -> server gpu${rank} port=${port} log=${log_file}"
    # Pin each server to a single GPU; stdout and stderr go to its log file.
    CUDA_VISIBLE_DEVICES="${rank}" \
      python scripts/serve_policy.py \
      --port "${port}" \
      policy:checkpoint \
      --policy.config "${cfg}" \
      --policy.dir "${ckpt_dir}" \
      --policy.asset-id "${ASSET_ID}" \
      >"${log_file}" 2>&1 &
    server_pids+=("$!")
  done

  echo "Waiting ${SERVER_WAIT_SECONDS}s for policy servers to load..."
  sleep "${SERVER_WAIT_SECONDS}"

  # A server that died during startup (bad checkpoint, port already in use,
  # out of memory, ...) would otherwise only show up as a stuck or failing
  # eval client; report it now with a pointer to its log.
  for rank in "${!server_pids[@]}"; do
    if ! kill -0 "${server_pids[$rank]}" 2>/dev/null; then
      echo "ERROR: policy server gpu${rank} exited during startup; see ${srv_log_dir}/gpu${rank}.log" >&2
      exit 1
    fi
  done

  # run eval for each speed
  for i in "${!EVAL_SPEEDS[@]}"; do
    speed="${EVAL_SPEEDS[$i]}"
    tag="${EVAL_SPEED_TAGS[$i]}"
    results_dir="${RESULTS_DIR}/${exp}/speed_${tag}"
    echo " -> eval speed=${speed} tag=${tag} -> ${results_dir}"
    # Settings are handed to the eval script through its environment.
    SPEED="${speed}" \
      BASE_PORT="${BASE_PORT}" \
      HOST="${HOST}" \
      NUM_TRIALS="${NUM_TRIALS}" \
      SAVE_VIDEOS="1" \
      PYTHON_CMD="python" \
      RESULTS_DIR="${results_dir}" \
      ./scripts/eval_libero_8gpu.sh
  done

  # Free the ports and GPUs before the next experiment starts its own servers.
  stop_policy_servers
done

echo "All stages completed."