#!/usr/bin/env bash
#
# Launch one Tactile VAE training run.
#
# Usage:
#   <script> <run_id> [config.yaml]
#
#   run_id  Required, non-empty. Names the run directory under $RUNS_DIR and is
#           exported as both the wandb run id and the wandb run name.
#   config  Optional. Defaults to tactile_vae/config/train_vae.yaml. May be
#           absolute, relative to $WORKDIR, or relative to the caller's cwd.
#
# Environment overrides:
#   DEBUG=1           Trace every command (set -x).
#   CONDA_ENV         Conda environment name (default: twm).
#   PYTHON_BIN        Interpreter to run the trainer with (default: the
#                     environment's own bin/python under ~/miniconda3).
#   DISABLE_WANDB=1   Drop wandb credentials/project and set WANDB_MODE=disabled.
#   WANDB_API_KEY     wandb credential. Never stored in this file; supply it via
#                     the environment or run `wandb login` once beforehand.
#   WANDB_DIR, WANDB_CACHE_DIR, WANDB_DATA_DIR
#                     wandb scratch locations (default: under /tmp/<user>/).

# Merge stderr into stdout and pass the combined stream through a single
# stdbuf-wrapped `cat`. When stdbuf is not installed the pipe would have no
# reader and the first write would kill the script, so only merge the streams.
if command -v stdbuf >/dev/null 2>&1; then
  exec 1> >(stdbuf -oL -eL cat) 2>&1
else
  exec 2>&1
fi

set -euo pipefail
if [[ "${DEBUG:-0}" == "1" ]]; then
  set -x
fi

# ---------------------------------------------------------------------------
# Arguments
# ---------------------------------------------------------------------------
# "${1:-}" covers both "no argument" and an explicitly empty run id; an empty
# id would make RUN_DIR collapse onto RUNS_DIR.
if [[ -z "${1:-}" ]]; then
  echo "Usage: $0 <run_id> [config.yaml]" >&2
  echo " run_id : required. Both the output subdir name and the wandb run id." >&2
  echo " config : optional. Defaults to tactile_vae/config/train_vae.yaml." >&2
  exit 2
fi
RUN_ID="$1"
CONFIG="${2:-tactile_vae/config/train_vae.yaml}"

# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------
WORKDIR="/group2/ct/weihanx/tactile_world_model"
RUNS_DIR="$WORKDIR/tactile_world_model/runs"
RUN_DIR="$RUNS_DIR/$RUN_ID"
DATA_DIR="$WORKDIR/tactile_vae/data"
SPLITS_PATH="$WORKDIR/tactile_vae/dataset/splits.json"

CONDA_ENV="${CONDA_ENV:-twm}"

# The two directories are created before the umask is tightened, so they keep
# the caller's default permissions; everything created later gets 027.
mkdir -p "$WORKDIR/slurm-logs"
mkdir -p "$RUNS_DIR"
umask 027

# ---------------------------------------------------------------------------
# Banner
# ---------------------------------------------------------------------------
echo "=== Tactile VAE training ==="
echo "Host: $(hostname)"
echo "Job ID: ${SLURM_JOB_ID:-N/A}"
echo "Start time: $(date)"
echo "Run ID: $RUN_ID"
echo "Workdir: $WORKDIR"
echo "Config: $CONFIG"
echo "Run dir: $RUN_DIR"
echo "Conda env: $CONDA_ENV"
echo

# ---------------------------------------------------------------------------
# Runtime environment for the trainer process
# ---------------------------------------------------------------------------
export OMP_NUM_THREADS=8
export MKL_NUM_THREADS=8
export TOKENIZERS_PARALLELISM="false"
export PYTHONFAULTHANDLER=1
export PYTHONUNBUFFERED=1

# ---------------------------------------------------------------------------
# Weights & Biases
# ---------------------------------------------------------------------------
# SECURITY: WANDB_API_KEY is deliberately NOT assigned here. A key used to be
# hard-coded at this point; treat that key as leaked and revoke it. The
# credential must come from the caller's environment or from `wandb login`.
export WANDB_ENTITY="weihanx-university-of-michigan"
export WANDB_USERNAME="weihanx@umich.edu"
export WANDB_PROJECT="tactile_vae"
export WANDB_MODE="online"
export WANDB_RUN_ID="$RUN_ID"
export WANDB_NAME="$RUN_ID"
export WANDB_SERVICE_WAIT=300
export WANDB_INIT_TIMEOUT=300
export WANDB_START_METHOD="thread"
export WANDB_CONSOLE="wrap"

# $USER is not guaranteed to be set in every launch context and would abort
# the script under `set -u`; fall back to asking the system.
WANDB_TMP_ROOT="/tmp/${USER:-$(id -un)}"
export WANDB_DIR="${WANDB_DIR:-$WANDB_TMP_ROOT/wandb/$RUN_ID}"
export WANDB_CACHE_DIR="${WANDB_CACHE_DIR:-$WANDB_TMP_ROOT/wandb-cache}"
export WANDB_DATA_DIR="${WANDB_DATA_DIR:-$WANDB_TMP_ROOT/wandb-data}"
mkdir -p "$WANDB_DIR" "$WANDB_CACHE_DIR" "$WANDB_DATA_DIR"

DISABLE_WANDB="${DISABLE_WANDB:-0}"
if [[ "$DISABLE_WANDB" == "1" ]]; then
  # Also drops any key inherited from the caller's environment.
  unset WANDB_PROJECT WANDB_ENTITY WANDB_API_KEY WANDB_USERNAME
  export WANDB_MODE="disabled"
fi

echo "--- Wandb ---"
echo " project=${WANDB_PROJECT:-<disabled>} entity=${WANDB_ENTITY:-<disabled>}"
echo " run_id=$WANDB_RUN_ID name=$WANDB_NAME mode=$WANDB_MODE"
echo " dir=$WANDB_DIR"
echo " cache_dir=$WANDB_CACHE_DIR"
echo " data_dir=$WANDB_DATA_DIR"
# Report only whether a key is present. ${VAR:+set} never expands the secret
# itself, so nothing leaks into the log even with DEBUG=1 tracing enabled.
if [[ "$DISABLE_WANDB" == "1" ]]; then
  echo " api_key=<disabled>"
elif [[ -n "${WANDB_API_KEY:+set}" ]]; then
  echo " api_key=<set via environment>"
else
  echo " api_key=<not set; relying on credentials stored by 'wandb login'>"
  echo "WARNING: WANDB_API_KEY is not set; wandb needs stored credentials to log online." >&2
fi
echo

# ---------------------------------------------------------------------------
# Pre-flight checks
# ---------------------------------------------------------------------------
# The trainer is started from $WORKDIR, so the config path has to be valid
# from there. A path found relative to $WORKDIR is passed through unchanged;
# a path found only relative to the caller's cwd is made absolute so it
# survives the `cd` below.
CONFIG_PATH=""
if [[ "$CONFIG" != /* && -f "$WORKDIR/$CONFIG" ]]; then
  CONFIG_PATH="$CONFIG"
elif [[ -f "$CONFIG" ]]; then
  if [[ "$CONFIG" == /* ]]; then
    CONFIG_PATH="$CONFIG"
  else
    CONFIG_PATH="$PWD/$CONFIG"
  fi
fi
if [[ -z "$CONFIG_PATH" ]]; then
  echo "ERROR: config not found: $CONFIG (or $WORKDIR/$CONFIG)" >&2
  exit 2
fi
if [[ ! -d "$DATA_DIR" ]]; then
  echo "ERROR: data dir does not exist: $DATA_DIR" >&2
  exit 2
fi
if [[ ! -f "$SPLITS_PATH" ]]; then
  echo "ERROR: splits manifest not found: $SPLITS_PATH" >&2
  echo " Generate it with: python tactile_vae/dataset/make_splits.py" >&2
  exit 2
fi

# Informational only: no resume flag is passed to the trainer from here.
# NOTE(review): presumably train_vae.py picks up ckpt_last.pt itself — confirm.
if [[ -f "$RUN_DIR/ckpt_last.pt" ]]; then
  echo "Resume: auto-resume from $RUN_DIR/ckpt_last.pt"
else
  echo "Resume: fresh run (no $RUN_DIR/ckpt_last.pt)"
fi
echo

# ---------------------------------------------------------------------------
# Python interpreter
# ---------------------------------------------------------------------------
# Prefer calling the environment's interpreter directly; only fall back to
# `conda activate` when that binary is not executable.
PYTHON_BIN="${PYTHON_BIN:-$HOME/miniconda3/envs/$CONDA_ENV/bin/python}"
if [[ -x "$PYTHON_BIN" ]]; then
  echo "[$(date +%H:%M:%S)] using env python directly: $PYTHON_BIN"
else
  echo "[$(date +%H:%M:%S)] env python not found; falling back to conda activate..."
  CONDA_SH="$HOME/miniconda3/etc/profile.d/conda.sh"
  if [[ ! -f "$CONDA_SH" ]]; then
    echo "ERROR: conda shell hook not found: $CONDA_SH" >&2
    exit 2
  fi
  # conda's shell hook and per-environment activate.d scripts are not
  # guaranteed to be nounset-clean, so relax `set -u` just for activation.
  set +u
  # shellcheck source=/dev/null
  source "$CONDA_SH"
  echo "[$(date +%H:%M:%S)] activating $CONDA_ENV..."
  conda activate "$CONDA_ENV"
  set -u
  PYTHON_BIN="$(command -v python)" || {
    echo "ERROR: no python on PATH after activating $CONDA_ENV" >&2
    exit 2
  }
  echo "[$(date +%H:%M:%S)] env activated. python = $PYTHON_BIN"
fi
echo "[$(date +%H:%M:%S)] GPU(s): ${CUDA_VISIBLE_DEVICES:-$(nvidia-smi -L 2>/dev/null | head -1 || echo none)}"
echo

# ---------------------------------------------------------------------------
# Train
# ---------------------------------------------------------------------------
# Under `set -e` a non-zero trainer exit aborts the script right here with the
# trainer's status, so the summary below is only printed after a clean run.
cd "$WORKDIR"
echo "[$(date +%H:%M:%S)] launching trainer..."
"$PYTHON_BIN" -u tactile_vae/script/train_vae.py \
  --config "$CONFIG_PATH" \
  --run-id "$RUN_ID"

echo
echo "[$(date +%H:%M:%S)] Finished."
echo "Run dir contents:"
ls -lh "$RUN_DIR" 2>/dev/null || echo " (empty)"