Upload 2-parameter conditional DDPM (HI emulation, CAMELS LH params_2, epoch 200) with full training/eval/posterior toolchain
f513198 verified | #SBATCH --account=<your-slurm-account> | |
| #SBATCH --partition=l40s | |
| #SBATCH --nodes=1 | |
| #SBATCH --ntasks=8 | |
| #SBATCH --gres=gpu:l40s:1 | |
| #SBATCH --time=48:00:00 | |
| #SBATCH --job-name=vlb_infer | |
| #SBATCH --mail-user=<your-email> # replace before submitting | |
| #SBATCH --output=slurm-vlb-infer-%j.out | |
| #SBATCH --error=slurm-vlb-infer-%j.err | |
| # VLB / Mudur-style posterior_inference.py (pure inference-time L_t surfaces). | |
| # | |
| # Defaults match bundled 6-param checkpoint + LH test data (override via env). | |
| # | |
| # Submit: | |
| # sbatch <DDPM_ROOT>/Models/scripts/run_posterior_inference.sh | |
| # | |
| # Defaults (posterior_inference.py): n_fields=9, grid_size=10000 (needs --allow_huge_grid), | |
| # mosaic figure posterior_L0_mosaic_3x3.png at ~10000×10000 px. | |
| # Override grid without huge scan, e.g.: --grid_size 50 (then --allow_huge_grid not needed) | |
| # Smoke test: | |
| # sbatch .../run_posterior_inference.sh --n_fields 1 --grid_size 25 --t_subset 0 --batch_size 16 | |
| # | |
| # Custom checkpoint / args / data: | |
| # sbatch --export=CHECKPOINT=/path/best_model.pt,TRAINING_ARGS=/path/args.json,DATA_DIR=/path/params_6 \ | |
| # .../run_posterior_inference.sh --grid_size 40 | |
| # | |
| # Logs: Slurm .out/.err files, plus a progress log at OUTPUT_DIR/run_log.txt (set CUSTOM_LOG to write the progress log elsewhere). | |
# Abort on the first failing command, on use of an unset variable, and when any
# stage of a pipeline fails (so a python failure is not masked by `tee` below).
set -euo pipefail

# NOTE(review): "<DDPM_ROOT>" looks like a placeholder for the install location;
# confirm it is replaced with a real path before submitting.
ROOT="<DDPM_ROOT>/Models"
cd "$ROOT"

module load python/miniconda3-py3.12-usr

# Entry point and inputs. OUTPUT_DIR, CHECKPOINT, TRAINING_ARGS and DATA_DIR may
# be supplied through the environment; unset or empty values fall back to the
# defaults below.
PY="${ROOT}/6param_ddpm_hi_lh6/posterior_inference.py"
OUT="${OUTPUT_DIR:-${ROOT}/vlb_inference_outputs}"
CHK="${CHECKPOINT:-${ROOT}/notebook_model_weights/6param_best/best_model.pt}"
ARGS="${TRAINING_ARGS:-${ROOT}/notebook_model_weights/6param_best/args.json}"
DATA="${DATA_DIR:-<DDPM_ROOT>/data/LH_data/params_6}"

# Progress log: CUSTOM_LOG if set and non-empty, otherwise run_log.txt in OUT.
RUN_LOG="${CUSTOM_LOG:-${OUT}/run_log.txt}"

# Create the output directory AND the log file's parent directory. CUSTOM_LOG may
# point outside OUT; without its directory `tee` cannot open the log, the run
# proceeds without a progress log, and the pipeline then fails under pipefail.
mkdir -p "${OUT}" "$(dirname -- "${RUN_LOG}")"

# Banner describing the run (goes to the Slurm .out file only, not to RUN_LOG).
echo "==============================================="
echo "Job ID: ${SLURM_JOB_ID:-local}"
echo "Node: ${SLURM_NODELIST:-$(hostname)}"
echo "GPU: ${CUDA_VISIBLE_DEVICES:-n/a}"
echo "Started: $(date)"
echo "Python: ${PY}"
echo "checkpoint: ${CHK}"
echo "training_args: ${ARGS}"
echo "data_dir: ${DATA}"
echo "output_dir: ${OUT}"
echo "Progress log: ${RUN_LOG}"
echo "==============================================="

# Run the inference script unbuffered (-u), merging stderr into stdout and
# appending the combined stream to RUN_LOG while still echoing it to the Slurm
# log. --allow_huge_grid is always passed (the header notes say the default
# grid_size needs it). Any arguments given to this script are forwarded last.
python -u "${PY}" \
  --checkpoint "${CHK}" \
  --training_args "${ARGS}" \
  --data_dir "${DATA}" \
  --output_dir "${OUT}" \
  --allow_huge_grid \
  "$@" 2>&1 | tee -a "${RUN_LOG}"

# Only reached when the pipeline above succeeded (set -e / pipefail).
echo "==============================================="
echo "Finished: $(date)"
echo "Artifacts → ${OUT}"
echo "==============================================="