File size: 3,009 Bytes

#!/bin/bash
#SBATCH --account=<your-slurm-account>
#SBATCH --partition=l40s
#SBATCH --nodes=1
#SBATCH --ntasks=4
#SBATCH --gres=gpu:l40s:1
#SBATCH --time=02:00:00
#SBATCH --job-name=ddpm_hi_eval
#SBATCH --mail-user=<your-email>   # replace before submitting
#SBATCH --output=slurm-eval-%j.out
#SBATCH --error=slurm-eval-%j.err

# Evaluate conditional diffusion checkpoint (evaluate_conditional.py).
#
# Usage (interactive):
#   bash scripts/shell/evaluate_conditional.sh outputs_conditional_2label_*/checkpoints/best_model.pt
#   CHECKPOINT=... OUTPUT_DIR=... bash scripts/shell/evaluate_conditional.sh
#
# Usage (SLURM):
#   cd .../Models/2param_DDPM_HI_Emulation && sbatch scripts/shell/evaluate_conditional.sh
#   (if CHECKPOINT is unset, uses the most recently modified best_model.pt found under this directory, searching up to 8 levels deep; normally outputs_*/checkpoints/best_model.pt)
#
#   Or pick a checkpoint explicitly:
#   sbatch --export=CHECKPOINT=/abs/path/to/best_model.pt scripts/shell/evaluate_conditional.sh
#
# Optional env vars: CHECKPOINT, DATA_DIR, OUTPUT_DIR. Extra CLI flags go after the checkpoint path and are forwarded to evaluate_conditional.py.

# Strict mode: abort on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -euo pipefail

# Work from the model directory so the relative paths used later in this
# script (evaluate_conditional.py, the default output directory) resolve
# against it. Replace <DDPM_ROOT> with the absolute path of your checkout.
# The path is quoted on purpose: unquoted, the "<" and ">" of the placeholder
# would be parsed by bash as input/output redirections instead of a path.
cd -- "<DDPM_ROOT>/Models/2param_DDPM_HI_Emulation"

# Load the python/miniconda3-py3.12-usr environment module
# (presumably provides the `python` invoked below; confirm on your cluster).
module load python/miniconda3-py3.12-usr

# Resolve the checkpoint path. A non-empty CHECKPOINT environment variable
# wins; otherwise the first positional argument is taken, unless it is missing
# or looks like an option (leading "-").
CHECKPOINT="${CHECKPOINT:-}"
if [[ -z "${CHECKPOINT}" ]]; then
  case "${1:-}" in
    '' | -*)
      # No argument, or a flag: leave the positional parameters untouched.
      ;;
    *)
      CHECKPOINT="$1"
      # Consume the path so that "$@" only holds the remaining arguments.
      shift
      ;;
  esac
fi

# Uncomment to pin a run when several outputs_* folders exist:
# CHECKPOINT="outputs_conditional_2label_20260330_235542/checkpoints/best_model.pt"

# Print the path of the most recently modified best_model.pt under directory $1
# (searched at most 8 levels deep). Prints nothing when no such file exists.
# Arguments: $1 - directory to search
# Outputs:   the selected path on stdout, followed by a newline
_newest_best_model() {
  local search_root="$1" record newest=""
  # find emits NUL-terminated "<mtime-epoch> <path>" records, so paths that
  # contain spaces or newlines survive intact. Records are sorted ascending by
  # mtime (LC_ALL=C keeps "." as the decimal point regardless of the caller's
  # locale) and the loop keeps the last one, i.e. the newest file.
  # find errors (e.g. unreadable directories) are deliberately ignored: the
  # lookup is best-effort and a failure simply yields no result.
  while IFS= read -r -d '' record; do
    newest="${record}"
  done < <(
    find "${search_root}" -maxdepth 8 -type f -name 'best_model.pt' \
      -printf '%T@ %p\0' 2>/dev/null \
      | LC_ALL=C sort -z -n
  )
  if [[ -n "${newest}" ]]; then
    # Drop the leading "<mtime-epoch> " field, leaving only the path.
    printf '%s\n' "${newest#* }"
  fi
}

# Neither the CHECKPOINT variable nor a positional path supplied a checkpoint:
# fall back to the newest best_model.pt below the current directory.
if [[ -z "${CHECKPOINT:-}" ]]; then
  CHECKPOINT="$(_newest_best_model "${PWD}")"
  if [[ -n "${CHECKPOINT}" ]]; then
    echo "Auto-selected checkpoint (newest best_model.pt): ${CHECKPOINT}" >&2
  fi
fi

# Stop early when no checkpoint could be resolved, and tell the user how to
# supply one (all diagnostics go to stderr).
if [[ -z "${CHECKPOINT}" ]]; then
  printf '%s\n' \
    "No checkpoint found. Set CHECKPOINT, pass a .pt path as arg 1, or train first." \
    "SLURM example:" \
    "  sbatch --export=CHECKPOINT=${PWD}/outputs_conditional_2label_*/checkpoints/best_model.pt scripts/shell/evaluate_conditional.sh" >&2
  exit 1
fi

# A checkpoint path is known but does not point at a regular file.
[[ -f "${CHECKPOINT}" ]] || {
  echo "Checkpoint file not found: ${CHECKPOINT}" >&2
  exit 1
}

# Fall back to the built-in locations when the caller left DATA_DIR or
# OUTPUT_DIR unset or empty; values supplied via the environment are kept.
: "${DATA_DIR:=<DDPM_ROOT>/data/LH_data/params_2}"
: "${OUTPUT_DIR:=evaluation_outputs}"

# Log the run context to stdout. SLURM/CUDA variables are used when set;
# otherwise the placeholders "local", the hostname and "default" are printed.
banner_rule="==============================================="
printf '%s\n' \
  "${banner_rule}" \
  "Job ID: ${SLURM_JOB_ID:-local}" \
  "Node: ${SLURM_NODELIST:-$(hostname)}" \
  "GPU: ${CUDA_VISIBLE_DEVICES:-default}" \
  "Started: $(date)" \
  "Checkpoint: ${CHECKPOINT}" \
  "Output dir: ${OUTPUT_DIR}" \
  "${banner_rule}"

# Run the evaluation script with the resolved paths and the fixed settings
# below. Any positional arguments still left in "$@" are appended after them.
eval_args=(
  --checkpoint "${CHECKPOINT}"
  --data_dir "${DATA_DIR}"
  --output_dir "${OUTPUT_DIR}"
  --split test
  --num_samples 8
  --ddim_steps 50
)
python evaluate_conditional.py "${eval_args[@]}" "$@"

# Closing banner with the completion timestamp.
footer_rule="==============================================="
printf '%s\n' \
  "${footer_rule}" \
  "Evaluation finished: $(date)" \
  "${footer_rule}"