DDPM-2param / cross_model /scripts /run_posterior_inference.sh
collins909's picture
Upload 2-parameter conditional DDPM (HI emulation, CAMELS LH params_2, epoch 200) with full training/eval/posterior toolchain
f513198 verified
#!/bin/bash
#SBATCH --account=<your-slurm-account>
#SBATCH --partition=l40s
#SBATCH --nodes=1
#SBATCH --ntasks=8
#SBATCH --gres=gpu:l40s:1
#SBATCH --time=48:00:00
#SBATCH --job-name=vlb_infer
#SBATCH --mail-user=<your-email> # replace before submitting
#SBATCH --output=slurm-vlb-infer-%j.out
#SBATCH --error=slurm-vlb-infer-%j.err
# VLB / Mudur-style posterior_inference.py (pure inference-time L_t surfaces).
#
# Defaults match bundled 6-param checkpoint + LH test data (override via env).
#
# Submit:
# sbatch <DDPM_ROOT>/Models/scripts/run_posterior_inference.sh
#
# Defaults (posterior_inference.py): n_fields=9, grid_size=10000 (needs --allow_huge_grid,
# which this script always passes), mosaic figure posterior_L0_mosaic_3x3.png at ~10000×10000 px.
# To avoid the huge scan, override the grid size, e.g. --grid_size 50 (--allow_huge_grid is then not needed).
# Smoke test:
# sbatch .../run_posterior_inference.sh --n_fields 1 --grid_size 25 --t_subset 0 --batch_size 16
#
# Custom checkpoint / args / data:
# sbatch --export=CHECKPOINT=/path/best_model.pt,TRAINING_ARGS=/path/args.json,DATA_DIR=/path/params_6 \
# .../run_posterior_inference.sh --grid_size 40
#
# Logs: Slurm .out/.err plus OUTPUT_DIR/run_log.txt (override the log path with CUSTOM_LOG).
# Strict mode: abort on any failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -euo pipefail

# Root of the Models tree; all default paths below are derived from it.
# NOTE(review): "<DDPM_ROOT>" looks like a placeholder — replace it with the
# real install path before submitting, otherwise the cd below aborts the job.
ROOT="<DDPM_ROOT>/Models"
cd "$ROOT"
module load python/miniconda3-py3.12-usr

# Inference entry point (fixed), followed by settings that can be overridden
# through the environment: OUTPUT_DIR, CHECKPOINT, TRAINING_ARGS, DATA_DIR and
# CUSTOM_LOG. An unset or empty variable falls back to the default shown.
PY="${ROOT}/6param_ddpm_hi_lh6/posterior_inference.py"
OUT="${OUTPUT_DIR:-${ROOT}/vlb_inference_outputs}"
CHK="${CHECKPOINT:-${ROOT}/notebook_model_weights/6param_best/best_model.pt}"
ARGS="${TRAINING_ARGS:-${ROOT}/notebook_model_weights/6param_best/args.json}"
DATA="${DATA_DIR:-<DDPM_ROOT>/data/LH_data/params_6}"
RUN_LOG="${CUSTOM_LOG:-${OUT}/run_log.txt}"

# Create the output directory and the directory that will hold the run log.
# CUSTOM_LOG may point outside OUT; if its parent directory were missing, tee
# could not open the log and, with pipefail, the job would end as failed only
# after the whole inference had run. Creating it here fails fast instead.
mkdir -p -- "${OUT}" "$(dirname -- "${RUN_LOG}")"
# Startup banner: record the job context and every resolved path on stdout.
# Job ID, node and GPU fall back to "local", the local hostname and "n/a" when
# the corresponding environment variable is unset or empty.
banner_rule="==============================================="
printf '%s\n' \
  "${banner_rule}" \
  "Job ID: ${SLURM_JOB_ID:-local}" \
  "Node: ${SLURM_NODELIST:-$(hostname)}" \
  "GPU: ${CUDA_VISIBLE_DEVICES:-n/a}" \
  "Started: $(date)" \
  "Python: ${PY}" \
  "checkpoint: ${CHK}" \
  "training_args: ${ARGS}" \
  "data_dir: ${DATA}" \
  "output_dir: ${OUT}" \
  "Progress log: ${RUN_LOG}" \
  "${banner_rule}"
# Re-assert pipefail so that a failure of the python stage is reported by the
# pipeline below even though tee is its last stage.
set -o pipefail

# Assemble the command as an array so every path stays a single argument.
# -u keeps python's output unbuffered so progress reaches tee promptly.
# Arguments passed to this script ("$@") are appended after the defaults.
infer_cmd=(
  python -u "${PY}"
  --checkpoint "${CHK}"
  --training_args "${ARGS}"
  --data_dir "${DATA}"
  --output_dir "${OUT}"
  --allow_huge_grid
  "$@"
)

# Merge stderr into stdout, echo the combined stream, and append it to the
# run log.
"${infer_cmd[@]}" 2>&1 | tee -a "${RUN_LOG}"
# Closing banner: print the completion time and where the outputs were written.
footer_rule="==============================================="
printf '%s\n' \
  "${footer_rule}" \
  "Finished: $(date)" \
  "Artifacts → ${OUT}" \
  "${footer_rule}"