#!/usr/bin/env bash
# openpi/scripts/run_twin_handover_packed_10k.sh
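# One-shot runner: trains the packed twin-handover 10k baseline and parallel
# configs back to back and evaluates their saved checkpoints.
# Typical invocation (an assumption, not documented in the repo):
#   nohup bash scripts/run_twin_handover_packed_10k.sh &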
set -euo pipefail
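
# Resolve the repository root from this script's location and create a
# persistent directory for run logs.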
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
VENV="$ROOT/.venv/bin/activate"
RUN_LOG_DIR="/workspace/run_logs"
mkdir -p "$RUN_LOG_DIR"
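
# Keep Hugging Face and XDG caches on the persistent /workspace volume and
# point openpi at the local LeRobot data root. The torch.compile flag is
# presumably a kill switch for compiling the action-sampling path (inferred
# from its name), and expandable_segments reduces CUDA allocator
# fragmentation over long runs.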
export HF_HOME=/workspace/.hf
export HF_HUB_CACHE=/workspace/.hf/hub
export HF_DATASETS_CACHE=/workspace/.hf/datasets
export HUGGINGFACE_HUB_CACHE=/workspace/.hf/hub
export XDG_CACHE_HOME=/workspace/.cache
export OPENPI_LEROBOT_HOME=/workspace/lerobot
export OPENPI_TORCH_COMPILE_SAMPLE_ACTIONS=0
export TOKENIZERS_PARALLELISM=false
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
cd "$ROOT"
source "$VENV"
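
# Paired 10k-step runs: a baseline config and its parallel variant of the
# same packed twin-handover setup, each under its own experiment name.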
BASELINE_CONFIG="pi05_twin_handover_256_packed_baseline_pytorch_10k"
PARALLEL_CONFIG="pi05_twin_handover_256_packed_parallel_pytorch_10k"
BASELINE_EXP="handover_packed_baseline_10k"
PARALLEL_EXP="handover_packed_parallel_10k"
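
# Validation settings: held-out eval repo, batch counts for intermediate vs.
# final checkpoints, and the sweep of sampling step counts passed to the
# eval script via --sample_num_steps.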
VAL_REPO="lsnu/twin_handover_256_val"
INTERMEDIATE_VAL_BATCHES=50
FINAL_VAL_BATCHES=100
SAMPLE_VAL_BATCHES=64
SAMPLE_NUM_STEPS="1,2,4,8,16"
BASELINE_CKPT_ROOT="$ROOT/checkpoints/$BASELINE_CONFIG/$BASELINE_EXP"
PARALLEL_CKPT_ROOT="$ROOT/checkpoints/$PARALLEL_CONFIG/$PARALLEL_EXP"
BASELINE_LOG="$RUN_LOG_DIR/${BASELINE_EXP}.log"
PARALLEL_LOG="$RUN_LOG_DIR/${PARALLEL_EXP}.log"
WARMSTART_LOG="$RUN_LOG_DIR/warmstart_equivalence_10k.log"
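
# Poll every 30 seconds until the trainer has materialized a checkpoint
# directory.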
wait_for_checkpoint_dir() {
  local ckpt_dir="$1"
  while [[ ! -d "$ckpt_dir" ]]; do
    echo "[$(date -u '+%Y-%m-%d %H:%M:%S UTC')] waiting for checkpoint dir: $ckpt_dir"
    sleep 30
  done
}
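
# Evaluate a single checkpoint with the validation-loss script, capturing
# stdout/stderr in its own log file.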
run_eval() {
  local config_name="$1"
  local ckpt_dir="$2"
  local num_batches="$3"
  local log_path="$4"
  echo "[$(date -u '+%Y-%m-%d %H:%M:%S UTC')] eval start config=$config_name ckpt=$ckpt_dir batches=$num_batches"
  python scripts/eval_twin_val_loss_pytorch.py \
    --config_name "$config_name" \
    --checkpoint_dir "$ckpt_dir" \
    --repo_id "$VAL_REPO" \
    --num_batches "$num_batches" \
    --num_workers 0 \
    --sample_num_batches "$SAMPLE_VAL_BATCHES" \
    --sample_num_steps "$SAMPLE_NUM_STEPS" \
    >"$log_path" 2>&1
  echo "[$(date -u '+%Y-%m-%d %H:%M:%S UTC')] eval done log=$log_path"
}
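
# Train one config on 4 GPUs via torchrun, then evaluate every saved
# checkpoint; the final 10k checkpoint gets the larger batch budget.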
run_train_and_evals() {
  local config_name="$1"
  local exp_name="$2"
  local ckpt_root="$3"
  local train_log="$4"
  echo "[$(date -u '+%Y-%m-%d %H:%M:%S UTC')] train start config=$config_name exp=$exp_name"
  torchrun --standalone --nproc_per_node=4 scripts/train_pytorch.py \
    "$config_name" \
    --exp_name "$exp_name" \
    --overwrite \
    >"$train_log" 2>&1
  echo "[$(date -u '+%Y-%m-%d %H:%M:%S UTC')] train done config=$config_name exp=$exp_name"
  for step in 1000 2000 5000 10000; do
    wait_for_checkpoint_dir "$ckpt_root/$step"
    local num_batches="$INTERMEDIATE_VAL_BATCHES"
    if [[ "$step" == "10000" ]]; then
      num_batches="$FINAL_VAL_BATCHES"
    fi
    run_eval \
      "$config_name" \
      "$ckpt_root/$step" \
      "$num_batches" \
      "$RUN_LOG_DIR/${exp_name}_val_${step}.log"
  done
}
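
# Main flow: log the warm-start equivalence check first (set -e aborts the
# runner if it fails), then train and evaluate the baseline and parallel
# runs sequentially.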
echo "[$(date -u '+%Y-%m-%d %H:%M:%S UTC')] packed 10k runner started"
python scripts/check_parallel_warmstart_equivalence.py >"$WARMSTART_LOG" 2>&1
echo "[$(date -u '+%Y-%m-%d %H:%M:%S UTC')] warm-start equivalence check logged to $WARMSTART_LOG"
run_train_and_evals "$BASELINE_CONFIG" "$BASELINE_EXP" "$BASELINE_CKPT_ROOT" "$BASELINE_LOG"
run_train_and_evals "$PARALLEL_CONFIG" "$PARALLEL_EXP" "$PARALLEL_CKPT_ROOT" "$PARALLEL_LOG"
echo "[$(date -u '+%Y-%m-%d %H:%M:%S UTC')] packed 10k runner finished"