DDPM-2param / scripts /shell /evaluate_conditional.sh

Upload 2-parameter conditional DDPM (HI emulation, CAMELS LH params_2, epoch 200) with full training/eval/posterior toolchain

f513198 verified 23 days ago

raw

history blame contribute delete

3.01 kB

	#!/bin/bash
	#SBATCH --account=<your-slurm-account>
	#SBATCH --partition=l40s
	#SBATCH --nodes=1
	#SBATCH --ntasks=4
	#SBATCH --gres=gpu:l40s:1
	#SBATCH --time=02:00:00
	#SBATCH --job-name=ddpm_hi_eval
	#SBATCH --mail-user=<your-email> # replace before submitting
	#SBATCH --output=slurm-eval-%j.out
	#SBATCH --error=slurm-eval-%j.err

	# Evaluate conditional diffusion checkpoint (evaluate_conditional.py).
	#
	# Usage (interactive):
	# bash scripts/shell/evaluate_conditional.sh outputs_conditional_2label_*/checkpoints/best_model.pt
	# CHECKPOINT=... OUTPUT_DIR=... bash scripts/shell/evaluate_conditional.sh
	#
	# Usage (SLURM):
	# cd .../Models/2param_DDPM_HI_Emulation && sbatch scripts/shell/evaluate_conditional.sh
	# (uses the most recently modified best_model.pt found under this directory if CHECKPOINT is unset)
	#
	# Or pick a checkpoint explicitly:
	# sbatch --export=CHECKPOINT=/abs/path/to/best_model.pt scripts/shell/evaluate_conditional.sh
	#
	# Optional env vars: DATA_DIR, OUTPUT_DIR. Extra CLI flags go after the checkpoint path.

	# Strict mode: abort on command failure, unset variables, and failed pipeline stages.
	set -euo pipefail

	# Move to the model directory so the relative paths used below resolve.
	# "<DDPM_ROOT>" is a placeholder for the real checkout location and must be
	# replaced before running. It is quoted so bash treats it as part of the path;
	# left unquoted, "<" and ">" are parsed as input/output redirections.
	cd "<DDPM_ROOT>/Models/2param_DDPM_HI_Emulation" || {
	  echo "Cannot cd to the model directory; replace the <DDPM_ROOT> placeholder in this script." >&2
	  exit 1
	}

	# Load the Python/miniconda environment module.
	module load python/miniconda3-py3.12-usr

	# Resolve the checkpoint path. A non-empty CHECKPOINT from the environment
	# wins; otherwise the first positional argument is consumed as the path,
	# unless it is missing/empty or looks like an option (leading "-").
	CHECKPOINT="${CHECKPOINT:-}"
	if [[ -z "${CHECKPOINT}" ]]; then
	  case "${1:-}" in
	    '' | -*)
	      # Nothing usable in arg 1; leave CHECKPOINT empty.
	      ;;
	    *)
	      CHECKPOINT="$1"
	      shift
	      ;;
	  esac
	fi

	# Uncomment to pin a run when several outputs_* folders exist:
	# CHECKPOINT="outputs_conditional_2label_20260330_235542/checkpoints/best_model.pt"

	# Print the path of the most recently modified best_model.pt under directory $1
	# (searched at most 8 levels deep). Prints nothing when no such file exists.
	# Always returns 0.
	_newest_best_model() {
	  local newest
	  # find emits "<mtime-epoch> <path>" per match; the numeric sort puts the
	  # newest last. find errors are deliberately ignored (stderr dropped,
	  # "|| true") so discovery stays best-effort under `set -e` / `pipefail`.
	  newest="$(find "$1" -maxdepth 8 -type f -name 'best_model.pt' -printf '%T@ %p\n' 2>/dev/null \
	    | sort -n \
	    | tail -n1)" || true
	  if [[ -n "${newest}" ]]; then
	    # Drop the leading timestamp field; the rest is the path (may contain spaces).
	    printf '%s\n' "${newest#* }"
	  fi
	}

	if [[ -z "${CHECKPOINT:-}" ]]; then
	  # No checkpoint from CHECKPOINT or arg 1: fall back to the latest
	  # best_model.pt by mtime under the current directory.
	  CHECKPOINT="$(_newest_best_model "${PWD}")"
	  if [[ -n "${CHECKPOINT}" ]]; then
	    echo "Auto-selected checkpoint (newest best_model.pt): ${CHECKPOINT}" >&2
	  fi
	fi

	# Stop before doing any work when no usable checkpoint was resolved:
	# either nothing was selected at all, or the selected path is not a regular file.
	if [[ -z "${CHECKPOINT}" ]]; then
	  {
	    echo "No checkpoint found. Set CHECKPOINT, pass a .pt path as arg 1, or train first."
	    echo "SLURM example:"
	    echo " sbatch --export=CHECKPOINT=${PWD}/outputs_conditional_2label_*/checkpoints/best_model.pt scripts/shell/evaluate_conditional.sh"
	  } >&2
	  exit 1
	elif [[ ! -f "${CHECKPOINT}" ]]; then
	  echo "Checkpoint file not found: ${CHECKPOINT}" >&2
	  exit 1
	fi

	# Fill in defaults for the optional settings; non-empty values already
	# provided through the environment are kept.
	: "${DATA_DIR:=<DDPM_ROOT>/data/LH_data/params_2}"
	: "${OUTPUT_DIR:=evaluation_outputs}"

	# Start banner: job context (with local fallbacks when the SLURM/CUDA
	# variables are not set) followed by the resolved checkpoint and output dir.
	printf '%s\n' \
	  "===============================================" \
	  "Job ID: ${SLURM_JOB_ID:-local}" \
	  "Node: ${SLURM_NODELIST:-$(hostname)}" \
	  "GPU: ${CUDA_VISIBLE_DEVICES:-default}" \
	  "Started: $(date)" \
	  "Checkpoint: ${CHECKPOINT}" \
	  "Output dir: ${OUTPUT_DIR}" \
	  "==============================================="

	# Fixed evaluation options, collected in an array so each value stays one word.
	eval_args=(
	  --checkpoint "${CHECKPOINT}"
	  --data_dir "${DATA_DIR}"
	  --output_dir "${OUTPUT_DIR}"
	  --split test
	  --num_samples 8
	  --ddim_steps 50
	)

	# Any remaining script arguments are forwarded after the fixed options.
	python evaluate_conditional.py "${eval_args[@]}" "$@"

	# End banner; only reached when the evaluation command succeeded (set -e).
	printf '%s\n' \
	  "===============================================" \
	  "Evaluation finished: $(date)" \
	  "==============================================="