File size: 2,951 Bytes
a8eb6e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44038e9
 
 
 
 
 
 
 
 
 
 
4d47bd9
a8eb6e5
44038e9
a8eb6e5
 
 
148f20d
a8eb6e5
 
 
 
148f20d
a8eb6e5
148f20d
 
3e13241
a8eb6e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
684902a
 
a8eb6e5
684902a
 
 
a8eb6e5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/bin/bash

# Run training on cloud instance, then auto-stop

# Prerequisites: set WANDB_API_KEY and HF_TOKEN

# Deliberately continue on errors: later steps (log upload, the
# shutdown decision) must still run even if training fails.
set +e

LOG_FILE="training_$(date +%Y%m%d_%H%M%S).log"
NUM_GPUS="${NUM_GPUS:-1}"
DATASET_DIR="${DATASET_DIR:-/ephemeral/community_dataset_v3}"

# Fail fast on missing credentials; indirect expansion keeps one
# identical error message per variable.
for required in WANDB_API_KEY HF_TOKEN; do
    if [ -z "${!required}" ]; then
        echo "ERROR: $required not set"
        exit 1
    fi
done

if [ ! -d "$DATASET_DIR" ]; then
    echo "ERROR: Dataset not at $DATASET_DIR"
    exit 1
fi

echo "=== Starting Training ===" | tee "$LOG_FILE"

# Activate conda env if available (bare metal), otherwise assume deps are global (Docker)
if command -v conda &> /dev/null; then
    eval "$(conda shell.bash hook)"
    conda activate lerobot
fi

# Build launcher flags as an array so each flag stays one word under
# quoting (unquoted $VAR splatting is fragile — ShellCheck SC2086).
ACCEL_FLAGS=()
if [ "$NUM_GPUS" -gt 1 ]; then
    ACCEL_FLAGS=(--multi_gpu --num_processes "$NUM_GPUS")
fi

# Check if we can resume from a checkpoint
RESUME_ARGS=()
LAST_CKPT="/ephemeral/production_run/checkpoints/last/pretrained_model/train_config.json"
if [ -f "$LAST_CKPT" ]; then
    echo "Resuming from checkpoint: $LAST_CKPT" | tee -a "$LOG_FILE"
    RESUME_ARGS=(--resume=true "--config_path=$LAST_CKPT")
else
    echo "Starting fresh training" | tee -a "$LOG_FILE"
    RESUME_ARGS=(--policy.path=lerobot/pi05_base)
fi

python3.12 -m accelerate.commands.launch "${ACCEL_FLAGS[@]}" \
-m lerobot.scripts.lerobot_train \
"${RESUME_ARGS[@]}" \
--dataset.repo_id="so100:$DATASET_DIR:/workspace/pi05-so100-diverse/filtered_index.json:/workspace/pi05-so100-diverse/norm_stats.json" \
--policy.train_expert_only=true \
--policy.dtype=bfloat16 \
--policy.gradient_checkpointing=false \
--policy.push_to_hub=true \
--policy.repo_id=StrongRoboticsLab/pi05-so100-diverse \
--policy.normalization_mapping='{"VISUAL": "IDENTITY", "STATE": "MEAN_STD", "ACTION": "MEAN_STD"}' \
--policy.scheduler_warmup_steps=1000 \
--policy.scheduler_decay_steps=340000 \
--rename_map='{"observation.images.image": "observation.images.base_0_rgb", "observation.images.image2": "observation.images.left_wrist_0_rgb"}' \
--batch_size=16 \
--steps=340000 \
--save_freq=5000 \
--log_freq=50 \
--num_workers=4 \
--wandb.enable=true \
--wandb.project=pi05-so100-diverse \
--output_dir=/ephemeral/production_run \
2>&1 | tee -a "$LOG_FILE"

# The pipe into tee masks the launcher's status; PIPESTATUS[0] holds it.
TRAIN_EXIT=${PIPESTATUS[0]}

echo "=== Training Complete (exit: $TRAIN_EXIT) ===" | tee -a "$LOG_FILE"

# Upload the training log to the HF model repo. Use python3.12 to match
# the interpreter used for training above (bare 'python' may not exist
# on the image). The path is passed via the environment instead of being
# interpolated into the Python source, so special characters in the
# filename cannot break or alter the snippet.
LOG_FILE="$LOG_FILE" python3.12 -c "
import os
from huggingface_hub import HfApi
log = os.environ['LOG_FILE']
HfApi().upload_file(path_or_fileobj=log, path_in_repo='logs/' + log,
repo_id='StrongRoboticsLab/pi05-so100-diverse', repo_type='model')
print('Log uploaded')
" 2>&1 | tee -a "$LOG_FILE"

# Only auto-shutdown if training succeeded (exit 0 = weights uploaded)
if [ "$TRAIN_EXIT" -eq 0 ]; then
    if command -v sudo &> /dev/null; then
        sudo shutdown -h now
    else
        # Success, but we cannot power off (e.g. container without sudo).
        # The original message wrongly implied a failed run in this case.
        echo "=== Training succeeded but sudo is unavailable; not shutting down ===" | tee -a "$LOG_FILE"
    fi
else
    echo "=== NOT shutting down: training exited with code $TRAIN_EXIT ===" | tee -a "$LOG_FILE"
    echo "=== Weights may still be on disk at /ephemeral/production_run ===" | tee -a "$LOG_FILE"
fi