tactile-vae / script /train_vae_pl.sh
WitneyWW's picture
Initial upload of tactile_vae (code, model, config, inference)
3770c94 verified
Raw
History Blame Contribute Delete
6.24 kB
#!/usr/bin/env bash
#SBATCH --job-name=tactile_vae_pl
#SBATCH --partition=sharedp
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=2
#SBATCH --gres=gpu:h100:4
#SBATCH --requeue
#SBATCH --exclude=mfmc10
#SBATCH --output=/group2/ct/weihanx/tactile_world_model/slurm-logs/tactile_vae_pl.%j.log
#SBATCH --error=/group2/ct/weihanx/tactile_world_model/slurm-logs/tactile_vae_pl.%j.log
# Train TactileVAE with PyTorch Lightning (train_vae_pl.py).
#
# Each run lives at <RUNS_DIR>/<RUN_ID>/. Re-launching with the same RUN_ID
# auto-resumes from checkpoints/last.ckpt (Lightning) or ckpt_last.pt (compat).
# wandb keeps the same run id so metrics append to the same dashboard.
#
# Usage (sbatch): sbatch tactile_vae/script/train_vae_pl.sh <run_id> [config.yaml]
# Usage (local): ./tactile_vae/script/train_vae_pl.sh <run_id> [config.yaml]
#
# Optional environment overrides read further down in this script:
#   CONDA_ENV        conda environment name
#   PYTHON_BIN       explicit interpreter path
#   DISABLE_WANDB=1  switch wandb to "disabled" mode
#   WANDB_DIR, WANDB_CACHE_DIR, WANDB_DATA_DIR   wandb scratch locations
#
# Diagnostics: set DEBUG=1 to enable `set -x` command tracing.
#   sbatch --export=ALL,DEBUG=1 tactile_vae/script/train_vae_pl.sh <run_id>
# Tracing writes every expanded command, including variable values, to the
# job log.
#
# NOTE(review): the job header asks for --ntasks-per-node=2 together with
# --gres=gpu:h100:4 (GPUs per node). Lightning's SLURM guide expects one task
# per GPU -- confirm these match the Trainer's `devices` / `num_nodes`.

# Re-open stdout onto a `stdbuf -oL -eL cat` process and point stderr at the
# same stream, so both are written to the log in one interleaved sequence.
exec 1> >(stdbuf -oL -eL cat) 2>&1
# Abort on command failure, on use of an unset variable, and on a failure in
# any stage of a pipeline.
set -euo pipefail
# Written as an `&&` list so that a false test (DEBUG != 1) does not trip `set -e`.
[[ "${DEBUG:-0}" == "1" ]] && set -x
# ============================================================
# Inputs (positional)
# ============================================================
#   $1  run_id : required. Names the run directory and the wandb run.
#   $2  config : optional. YAML config handed to the trainer.
# Exits with status 2 on a missing or unusable run_id.

# Print the command-line synopsis on stderr.
usage() {
  echo "Usage: $0 <run_id> [config.yaml]" >&2
  echo " run_id : required. Both the output subdir name and the wandb run id." >&2
  echo " config : optional. Defaults to tactile_vae/config/train_vae.yaml." >&2
}

if [[ $# -lt 1 ]]; then
  usage
  exit 2
fi
RUN_ID="$1"
CONFIG="${2:-tactile_vae/config/train_vae.yaml}"

# RUN_ID becomes a single directory name under the runs directory and is also
# exported as the wandb run id, so reject values that cannot serve as both:
#   - empty, "." or ".."       : would alias or escape the runs directory
#   - a leading "-"            : would look like an option when passed on
#                                the trainer's command line
#   - any of  / \ : # ? %      : path separators / characters wandb documents
#                                as not allowed in a run id
case "$RUN_ID" in
  "" | "." | ".." | -* | *[/\\:#?%]*)
    echo "ERROR: invalid run_id: '$RUN_ID'" >&2
    echo " run_id must be a non-empty single path component, must not start with '-'," >&2
    echo " and must not contain any of: / \\ : # ? %" >&2
    usage
    exit 2
    ;;
esac
# ============================================================
# Paths
# ============================================================
# Project root; every other location below is derived from it.
WORKDIR="/group2/ct/weihanx/tactile_world_model"
RUNS_DIR="${WORKDIR}/runs"
RUN_DIR="${RUNS_DIR}/${RUN_ID}"
DATA_DIR="${WORKDIR}/tactile_vae/data"
SPLITS_PATH="${WORKDIR}/tactile_vae/dataset/splits_subset.json"
# Conda environment name; a value already present in the environment wins.
CONDA_ENV="${CONDA_ENV:-twm}"

# The log and run roots are created before the umask is tightened.
for dir in "${WORKDIR}/slurm-logs" "${RUNS_DIR}"; do
  mkdir -p "$dir"
done
# From here on, newly created files get no group-write and no "other" access.
umask 027
# ============================================================
# Print startup info
# ============================================================
# One banner line per fact about this launch, followed by a blank separator.
printf '%s\n' \
  "=== Tactile VAE (PyTorch Lightning) ===" \
  "Host: $(hostname)" \
  "Job ID: ${SLURM_JOB_ID:-N/A}" \
  "Start time: $(date)" \
  "Run ID: $RUN_ID" \
  "Workdir: $WORKDIR" \
  "Config: $CONFIG" \
  "Run dir: $RUN_DIR" \
  "Conda env: $CONDA_ENV" \
  ""
# ============================================================
# Environment knobs
# ============================================================
# Thread-count settings for OpenMP and MKL: 8 each.
export OMP_NUM_THREADS=8 MKL_NUM_THREADS=8
# Tokenizer parallelism is switched off.
export TOKENIZERS_PARALLELISM="false"
# Python runtime: enable faulthandler and unbuffered stdout/stderr.
export PYTHONFAULTHANDLER=1 PYTHONUNBUFFERED=1
# ============================================================
# Weights & Biases
# ============================================================
# Credentials are deliberately NOT stored in this file. Supply the key via the
# environment, e.g.
#   sbatch --export=ALL,WANDB_API_KEY=... tactile_vae/script/train_vae_pl.sh <run_id>
# or authenticate once with `wandb login` so the wandb client can use its own
# stored credentials.
# SECURITY: any API key that was ever committed to this script must be treated
# as compromised and revoked/rotated in the W&B account settings.
#
# Set DISABLE_WANDB=1 to run without wandb (mode "disabled").
export WANDB_ENTITY="weihanx-university-of-michigan"
export WANDB_USERNAME="weihanx@umich.edu"
export WANDB_PROJECT="tactile_vae"
export WANDB_MODE="online"
# Run id and display name both follow the script's run id.
export WANDB_RUN_ID="$RUN_ID"
export WANDB_NAME="$RUN_ID"
export WANDB_SERVICE_WAIT=300
export WANDB_INIT_TIMEOUT=300
export WANDB_START_METHOD="thread"
export WANDB_CONSOLE="wrap"

# Scratch locations default to per-user directories under /tmp. USER is not
# guaranteed to be set in every batch environment, and the script runs with
# `set -u`, so fall back to `id -un`.
wandb_user="${USER:-$(id -un)}"
export WANDB_DIR="${WANDB_DIR:-/tmp/$wandb_user/wandb/$RUN_ID}"
export WANDB_CACHE_DIR="${WANDB_CACHE_DIR:-/tmp/$wandb_user/wandb-cache}"
export WANDB_DATA_DIR="${WANDB_DATA_DIR:-/tmp/$wandb_user/wandb-data}"
mkdir -p "$WANDB_DIR" "$WANDB_CACHE_DIR" "$WANDB_DATA_DIR"

DISABLE_WANDB="${DISABLE_WANDB:-0}"
if [[ "$DISABLE_WANDB" == "1" ]]; then
  unset WANDB_PROJECT WANDB_ENTITY WANDB_API_KEY WANDB_USERNAME
  export WANDB_MODE="disabled"
fi

# Work out whether a key is available without ever writing its value to the
# log: command tracing (`set -x`) is suspended while the variable is expanded,
# and only a presence marker is kept for the summary below.
wandb_xtrace=0
case "$-" in
  *x*)
    wandb_xtrace=1
    set +x
    ;;
esac
if [[ "$DISABLE_WANDB" == "1" ]]; then
  wandb_key_state="<disabled>"
elif [[ -n "${WANDB_API_KEY:-}" ]]; then
  export WANDB_API_KEY
  wandb_key_state="<set via environment, hidden>"
else
  wandb_key_state="<not set>"
fi
if ((wandb_xtrace)); then
  set -x
fi

echo "--- Wandb ---"
echo " project=${WANDB_PROJECT:-<disabled>} entity=${WANDB_ENTITY:-<disabled>}"
echo " run_id=$WANDB_RUN_ID name=$WANDB_NAME mode=$WANDB_MODE"
echo " dir=$WANDB_DIR"
echo " cache_dir=$WANDB_CACHE_DIR"
echo " data_dir=$WANDB_DATA_DIR"
echo " api_key=$wandb_key_state"
if [[ "$wandb_key_state" == "<not set>" ]]; then
  echo "WARNING: WANDB_API_KEY is not set; wandb must find credentials from a previous 'wandb login'." >&2
fi
echo
# ============================================================
# Sanity checks
# ============================================================
# Config lookup. The trainer is started after `cd "$WORKDIR"`, so the path it
# receives must be valid from there:
#   - absolute path              : must exist exactly as given
#   - relative, found in WORKDIR : passed through unchanged
#   - relative, found only from the current (submission) directory :
#       rewritten to an absolute path so it still resolves after the cd
if [[ "$CONFIG" == /* ]]; then
  if [[ ! -f "$CONFIG" ]]; then
    echo "ERROR: config not found: $CONFIG (or $WORKDIR/$CONFIG)" >&2
    exit 2
  fi
elif [[ -f "$WORKDIR/$CONFIG" ]]; then
  : # resolves relative to WORKDIR; nothing to rewrite
elif [[ -f "$CONFIG" ]]; then
  CONFIG="$PWD/$CONFIG"
  echo "Config resolved to: $CONFIG"
else
  echo "ERROR: config not found: $CONFIG (or $WORKDIR/$CONFIG)" >&2
  exit 2
fi

# Training data directory must already exist.
if [[ ! -d "$DATA_DIR" ]]; then
  echo "ERROR: data dir does not exist: $DATA_DIR" >&2
  exit 2
fi

# Train/val split manifest must already exist.
if [[ ! -f "$SPLITS_PATH" ]]; then
  echo "ERROR: splits manifest not found: $SPLITS_PATH" >&2
  echo " Generate it with: python tactile_vae/dataset/make_splits.py" >&2
  exit 2
fi

# Report resume state (Lightning ckpt takes priority over compat ckpt).
# This only prints which checkpoint file is present in the run directory.
if [[ -f "$RUN_DIR/checkpoints/last.ckpt" ]]; then
  echo "Resume: auto-resume from $RUN_DIR/checkpoints/last.ckpt (Lightning)"
elif [[ -f "$RUN_DIR/ckpt_last.pt" ]]; then
  echo "Resume: auto-resume from $RUN_DIR/ckpt_last.pt (compat)"
else
  echo "Resume: fresh run (no checkpoint found)"
fi
echo
# ============================================================
# Resolve Python interpreter
# ============================================================
# Preference order:
#   1. PYTHON_BIN from the environment, else the env's python under
#      ~/miniconda3/envs/<CONDA_ENV>/bin -- used directly when executable.
#   2. Otherwise `conda activate <CONDA_ENV>` and take `python` from PATH.
# Exits with status 2 when neither route yields an interpreter.
PYTHON_BIN="${PYTHON_BIN:-$HOME/miniconda3/envs/$CONDA_ENV/bin/python}"
if [[ -x "$PYTHON_BIN" ]]; then
  echo "[$(date +%H:%M:%S)] using env python directly: $PYTHON_BIN"
else
  echo "[$(date +%H:%M:%S)] env python not found; falling back to conda activate..."
  conda_hook="$HOME/miniconda3/etc/profile.d/conda.sh"
  if [[ ! -f "$conda_hook" ]]; then
    echo "ERROR: no usable Python: $PYTHON_BIN is not executable and $conda_hook does not exist" >&2
    echo " Set PYTHON_BIN to an interpreter, or CONDA_ENV to an existing environment." >&2
    exit 2
  fi

  # conda's shell hook and per-package activate scripts may read variables
  # that are not set, which aborts a shell running with `set -u`. Relax
  # nounset only for the activation and restore it afterwards.
  restore_nounset=0
  if [[ $- == *u* ]]; then
    restore_nounset=1
    set +u
  fi
  # shellcheck disable=SC1090
  source "$conda_hook"
  echo "[$(date +%H:%M:%S)] activating $CONDA_ENV..."
  conda activate "$CONDA_ENV"
  if ((restore_nounset)); then
    set -u
  fi

  # `command -v` is the shell builtin lookup; unlike `which` it does not
  # depend on an external program being installed.
  if ! PYTHON_BIN="$(command -v python)"; then
    echo "ERROR: no python on PATH after activating $CONDA_ENV" >&2
    exit 2
  fi
  echo "[$(date +%H:%M:%S)] env activated. python = $PYTHON_BIN"
fi
# Show the GPU assignment: CUDA_VISIBLE_DEVICES when set, otherwise the first
# device listed by nvidia-smi, or "none" when that lookup fails.
echo "[$(date +%H:%M:%S)] GPU(s): ${CUDA_VISIBLE_DEVICES:-$(nvidia-smi -L 2>/dev/null | head -1 || echo none)}"
echo
# ============================================================
# Launch training
# ============================================================
# The trainer is run from WORKDIR, so its script path (and a relative config
# path) are resolved there.
cd "$WORKDIR"

trainer_cmd=(
  "$PYTHON_BIN" -u tactile_vae/script/train_vae_pl.py
  --config "$CONFIG"
  --run-id "$RUN_ID"
)

# A batch script itself runs once, on the first node of the allocation; the
# remaining tasks/nodes only get a process if the command goes through `srun`.
# Launch mode (override with USE_SRUN):
#   auto (default) : use srun inside a SLURM job that has more than one task
#   1              : always use srun
#   0              : always start the trainer directly (previous behaviour)
# NOTE(review): assumes the trainer relies on Lightning's SLURM integration
# (one externally started process per task) -- confirm against
# train_vae_pl.py, and set USE_SRUN=0 if it spawns its own workers.
launch_mode="${USE_SRUN:-auto}"
if [[ "$launch_mode" == "auto" ]]; then
  if [[ -n "${SLURM_JOB_ID:-}" ]] \
    && ((${SLURM_NTASKS:-1} > 1)) \
    && command -v srun >/dev/null 2>&1; then
    launch_mode=1
  else
    launch_mode=0
  fi
fi

if [[ "$launch_mode" == "1" ]]; then
  echo "[$(date +%H:%M:%S)] launching trainer (PyTorch Lightning) via srun: ${SLURM_NTASKS:-?} task(s) on ${SLURM_JOB_NUM_NODES:-?} node(s)..."
  # --kill-on-bad-exit: stop the whole step as soon as one task fails,
  # instead of leaving the surviving tasks running.
  srun --kill-on-bad-exit=1 "${trainer_cmd[@]}"
else
  echo "[$(date +%H:%M:%S)] launching trainer (PyTorch Lightning)..."
  "${trainer_cmd[@]}"
fi

# Only reached when the trainer exited successfully (the script runs with
# `set -e`).
echo
echo "[$(date +%H:%M:%S)] Finished."
echo "Run dir contents:"
ls -lh "$RUN_DIR" 2>/dev/null || echo " (empty)"