#!/usr/bin/env bash
set -euo pipefail
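
# Runs the packed 10k baseline-vs-parallel comparison end to end: the
# warm-start equivalence check, both training runs, and a validation
# eval for every saved checkpoint. Meant to run unattended once launched.

# Resolve the repository root relative to this script and create the
# directory that collects the runner's logs.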
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
VENV="$ROOT/.venv/bin/activate"
RUN_LOG_DIR="/workspace/run_logs"
mkdir -p "$RUN_LOG_DIR"
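
# Route Hugging Face and XDG caches to the /workspace volume, silence the
# tokenizers fork warning, and enable expandable CUDA allocator segments
# to reduce fragmentation. The OPENPI_* variables are project-specific
# toggles.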
export HF_HOME=/workspace/.hf
export HF_HUB_CACHE=/workspace/.hf/hub
export HF_DATASETS_CACHE=/workspace/.hf/datasets
export HUGGINGFACE_HUB_CACHE=/workspace/.hf/hub
export XDG_CACHE_HOME=/workspace/.cache
export OPENPI_LEROBOT_HOME=/workspace/lerobot
export OPENPI_TORCH_COMPILE_SAMPLE_ACTIONS=0
export TOKENIZERS_PARALLELISM=false
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

cd "$ROOT"
source "$VENV"
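
# Experiment matrix: the two configs under comparison, the validation
# dataset, and batch budgets for the loss and sampling evals.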
BASELINE_CONFIG="pi05_twin_handover_256_packed_baseline_pytorch_10k"
PARALLEL_CONFIG="pi05_twin_handover_256_packed_parallel_pytorch_10k"
BASELINE_EXP="handover_packed_baseline_10k"
PARALLEL_EXP="handover_packed_parallel_10k"
VAL_REPO="lsnu/twin_handover_256_val"
INTERMEDIATE_VAL_BATCHES=50
FINAL_VAL_BATCHES=100
SAMPLE_VAL_BATCHES=64
SAMPLE_NUM_STEPS="1,2,4,8,16"
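
# Checkpoints land under checkpoints/<config>/<exp_name>/<step>.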
BASELINE_CKPT_ROOT="$ROOT/checkpoints/$BASELINE_CONFIG/$BASELINE_EXP"
PARALLEL_CKPT_ROOT="$ROOT/checkpoints/$PARALLEL_CONFIG/$PARALLEL_EXP"
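
# Per-run log destinations.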
BASELINE_LOG="$RUN_LOG_DIR/${BASELINE_EXP}.log"
PARALLEL_LOG="$RUN_LOG_DIR/${PARALLEL_EXP}.log"
WARMSTART_LOG="$RUN_LOG_DIR/warmstart_equivalence_10k.log"
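
# Timestamped progress line on the runner's stdout.
log() {
  echo "[$(date -u '+%Y-%m-%d %H:%M:%S UTC')] $*"
}

# Poll until a checkpoint directory appears; training normally writes all
# checkpoints before it exits, so this only guards against laggy writes.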
wait_for_checkpoint_dir() {
  local ckpt_dir="$1"
  while [[ ! -d "$ckpt_dir" ]]; do
    log "waiting for checkpoint dir: $ckpt_dir"
    sleep 30
  done
}
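
# Run the validation-loss eval for one checkpoint, capturing all of the
# eval script's output in log_path.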
run_eval() {
  local config_name="$1"
  local ckpt_dir="$2"
  local num_batches="$3"
  local log_path="$4"
  log "eval start config=$config_name ckpt=$ckpt_dir batches=$num_batches"
  python scripts/eval_twin_val_loss_pytorch.py \
    --config_name "$config_name" \
    --checkpoint_dir "$ckpt_dir" \
    --repo_id "$VAL_REPO" \
    --num_batches "$num_batches" \
    --num_workers 0 \
    --sample_num_batches "$SAMPLE_VAL_BATCHES" \
    --sample_num_steps "$SAMPLE_NUM_STEPS" \
    >"$log_path" 2>&1
  log "eval done log=$log_path"
}
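
# Train one config on 4 GPUs via torchrun, then evaluate each saved
# checkpoint; the final 10000-step checkpoint gets the larger eval budget.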
run_train_and_evals() {
  local config_name="$1"
  local exp_name="$2"
  local ckpt_root="$3"
  local train_log="$4"

  log "train start config=$config_name exp=$exp_name"
  torchrun --standalone --nproc_per_node=4 scripts/train_pytorch.py \
    "$config_name" \
    --exp_name "$exp_name" \
    --overwrite \
    >"$train_log" 2>&1
  log "train done config=$config_name exp=$exp_name"

  for step in 1000 2000 5000 10000; do
    wait_for_checkpoint_dir "$ckpt_root/$step"
    local num_batches="$INTERMEDIATE_VAL_BATCHES"
    if [[ "$step" == "10000" ]]; then
      num_batches="$FINAL_VAL_BATCHES"
    fi
    run_eval \
      "$config_name" \
      "$ckpt_root/$step" \
      "$num_batches" \
      "$RUN_LOG_DIR/${exp_name}_val_${step}.log"
  done
}
| echo "[$(date -u '+%Y-%m-%d %H:%M:%S UTC')] packed 10k runner started" |
python scripts/check_parallel_warmstart_equivalence.py >"$WARMSTART_LOG" 2>&1
log "warm-start equivalence check logged to $WARMSTART_LOG"
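
# Baseline first, then the parallel variant; the runs are sequential so
# each gets the full set of GPUs.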
| run_train_and_evals "$BASELINE_CONFIG" "$BASELINE_EXP" "$BASELINE_CKPT_ROOT" "$BASELINE_LOG" |
| run_train_and_evals "$PARALLEL_CONFIG" "$PARALLEL_EXP" "$PARALLEL_CKPT_ROOT" "$PARALLEL_LOG" |
|
|
| echo "[$(date -u '+%Y-%m-%d %H:%M:%S UTC')] packed 10k runner finished" |
|
|