DDPM-2param / cross_model /scripts /run_posterior_inference.sh

Upload 2-parameter conditional DDPM (HI emulation, CAMELS LH params_2, epoch 200) with full training/eval/posterior toolchain

f513198 verified 22 days ago

raw

history blame contribute delete

2.47 kB

	#!/bin/bash
	# Slurm batch directives for this job: one node, 8 tasks and one L40S GPU on
	# the "l40s" partition, with a 48-hour wall-clock limit. Values in angle
	# brackets are placeholders to fill in before submitting. "%j" in the log
	# file names is Slurm's job-ID pattern, so each submission gets its own
	# .out/.err pair.
	# NOTE(review): no --mail-type is given, so presumably no e-mail is sent even
	# with --mail-user set -- confirm against the site's Slurm configuration.
	#SBATCH --account=<your-slurm-account>
	#SBATCH --partition=l40s
	#SBATCH --nodes=1
	#SBATCH --ntasks=8
	#SBATCH --gres=gpu:l40s:1
	#SBATCH --time=48:00:00
	#SBATCH --job-name=vlb_infer
	#SBATCH --mail-user=<your-email> # replace before submitting
	#SBATCH --output=slurm-vlb-infer-%j.out
	#SBATCH --error=slurm-vlb-infer-%j.err

	# VLB / Mudur-style posterior_inference.py (pure inference-time L_t surfaces).
	#
	# Defaults match bundled 6-param checkpoint + LH test data (override via env).
	#
	# Submit:
	# sbatch <DDPM_ROOT>/Models/scripts/run_posterior_inference.sh
	#
	# Defaults (posterior_inference.py): n_fields=9, grid_size=10000 (needs --allow_huge_grid),
	# mosaic figure posterior_L0_mosaic_3x3.png at ~10000×10000 px.
	# Override grid without huge scan, e.g.: --grid_size 50 (then --allow_huge_grid not needed)
	# Smoke test:
	# sbatch .../run_posterior_inference.sh --n_fields 1 --grid_size 25 --t_subset 0 --batch_size 16
	#
	# Custom checkpoint / args / data:
	# sbatch --export=CHECKPOINT=/path/best_model.pt,TRAINING_ARGS=/path/args.json,DATA_DIR=/path/params_6 \
	# .../run_posterior_inference.sh --grid_size 40
	#
	# Logs: Slurm .out/.err plus OUTPUT_DIR/run_log.txt (set CUSTOM_LOG to use a different run-log path).

	# Strict mode: abort on any failing command (-e), on use of an unset
	# variable (-u), and when any stage of a pipeline fails (pipefail).
	set -euo pipefail

	# Installation root. "<DDPM_ROOT>" looks like a placeholder that must be
	# replaced with the real checkout path; cd aborts the job if it does not exist.
	ROOT="<DDPM_ROOT>/Models"
	cd "$ROOT"

	module load python/miniconda3-py3.12-usr

	# Inference entry point and output directory (OUTPUT_DIR overrides the default).
	PY="${ROOT}/6param_ddpm_hi_lh6/posterior_inference.py"
	OUT="${OUTPUT_DIR:-${ROOT}/vlb_inference_outputs}"

	# Checkpoint, training-args JSON and data directory. Each falls back to the
	# bundled default when CHECKPOINT / TRAINING_ARGS / DATA_DIR is unset or empty.
	CHK="${CHECKPOINT:-${ROOT}/notebook_model_weights/6param_best/best_model.pt}"
	ARGS="${TRAINING_ARGS:-${ROOT}/notebook_model_weights/6param_best/args.json}"
	DATA="${DATA_DIR:-<DDPM_ROOT>/data/LH_data/params_6}"

	mkdir -p "${OUT}"
	RUN_LOG="${CUSTOM_LOG:-${OUT}/run_log.txt}"
	# CUSTOM_LOG may point outside OUT; create its parent directory as well so
	# the log file can be opened for appending when the run starts.
	mkdir -p -- "$(dirname -- "${RUN_LOG}")"

	# Start banner: write the job context and every resolved path to stdout in
	# one call, one item per line, so the configuration is visible in the log.
	printf '%s\n' \
	  "===============================================" \
	  "Job ID: ${SLURM_JOB_ID:-local}" \
	  "Node: ${SLURM_NODELIST:-$(hostname)}" \
	  "GPU: ${CUDA_VISIBLE_DEVICES:-n/a}" \
	  "Started: $(date)" \
	  "Python: ${PY}" \
	  "checkpoint: ${CHK}" \
	  "training_args: ${ARGS}" \
	  "data_dir: ${DATA}" \
	  "output_dir: ${OUT}" \
	  "Progress log: ${RUN_LOG}" \
	  "==============================================="

	# Run the inference script with unbuffered output (-u). The fixed options
	# come first; any extra arguments given to this script ("$@") are forwarded
	# after them. stdout and stderr are merged and piped through tee, which
	# appends everything to RUN_LOG while still echoing it to this job's stdout.
	# The pipe must be a real "|": an escaped "\|" would hand "|", "tee", "-a"
	# and the log path to python as arguments and leave RUN_LOG unwritten.
	# pipefail is re-asserted here so a python failure is never masked by tee's
	# exit status.
	set -o pipefail
	python -u "${PY}" \
	  --checkpoint "${CHK}" \
	  --training_args "${ARGS}" \
	  --data_dir "${DATA}" \
	  --output_dir "${OUT}" \
	  --allow_huge_grid \
	  "$@" 2>&1 | tee -a "${RUN_LOG}"

	# Finish banner: completion timestamp and where the outputs were written.
	printf '%s\n' \
	  "===============================================" \
	  "Finished: $(date)" \
	  "Artifacts → ${OUT}" \
	  "==============================================="