Upload 6-parameter conditional DDPM (HI emulation, CAMELS LH params_6, best checkpoint) with full training/eval/posterior toolchain
#!/bin/bash
#SBATCH --account=<your-slurm-account>
#SBATCH --partition=l40s
#SBATCH --nodes=1
#SBATCH --ntasks=8
#SBATCH --gres=gpu:l40s:1
#SBATCH --time=48:00:00
#SBATCH --job-name=ddpm_hi_lh6
#SBATCH --mail-user=<your-email>
#SBATCH --output=slurm-%j.out
#SBATCH --error=slurm-%j.err

# Conditional DDPM training — 6 CAMELS LH parameters (ddpm_hi_lh6).
#
# Before submitting, replace the <your-slurm-account>, <your-email> and
# <DDPM_ROOT> placeholders (or export DDPM_ROOT instead of editing it).
# NOTE(review): no --mail-type directive is present; confirm whether e-mail
# notifications are actually wanted and add one if so.
#
# Submit from anywhere:
#   sbatch <DDPM_ROOT>/Models/6param_ddpm_hi_lh6/scripts/shell/train_conditional_lh6.sh
#
# Override data path (optional): any folder containing *_LH_6.npy and *_labels_LH.npy
#   sbatch --export=DATA_DIR=<DDPM_ROOT>/data/LH_data/params_6 train_conditional_lh6.sh
#
# Environment:
#   DDPM_ROOT  root of the DDPM checkout (default: the placeholder below)
#   DATA_DIR   training data folder (default: ${DDPM_ROOT}/data/LH_data/params_6)
#
# Exit status: non-zero if the model directory cannot be entered, python is
# missing, or train_conditional.py fails (its exit code is passed through).

# Print a message to stderr and abort the job.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Kept inside quotes on purpose: a bare <DDPM_ROOT> on a command line is
# parsed by the shell as input/output redirections, not as a path.
DDPM_ROOT="${DDPM_ROOT:-<DDPM_ROOT>}"
MODEL_DIR="${DDPM_ROOT}/Models/6param_ddpm_hi_lh6"

# Same LH_data layout as DDPM_HI_Emulation_improved (params_2 for 2 labels → params_6 here).
DATA_DIR="${DATA_DIR:-${DDPM_ROOT}/data/LH_data/params_6}"

# train_conditional.py and the relative --output_dir are resolved from here,
# so running from the wrong directory must be fatal.
cd -- "${MODEL_DIR}" \
  || die "cannot cd to ${MODEL_DIR} (is DDPM_ROOT set to the checkout root?)"

# Best effort: an interpreter may already be on PATH via the submit
# environment, so a failed module load only warns; the check below decides.
module load python/miniconda3-py3.12-usr \
  || printf 'warning: module load python/miniconda3-py3.12-usr failed\n' >&2
command -v python >/dev/null || die "python not found on PATH"

echo "==============================================="
echo "Job ID: $SLURM_JOB_ID"
echo "Job Name: $SLURM_JOB_NAME"
echo "Node: $SLURM_NODELIST"
echo "GPU: $CUDA_VISIBLE_DEVICES"
echo "Starting Time: $(date)"
echo "Conditional diffusion training (ddpm_hi_lh6, 6 labels)"
echo "DATA_DIR: ${DATA_DIR}"
echo "==============================================="

# Arguments forwarded verbatim to train_conditional.py.
train_args=(
  --label_dim 6
  --timesteps 1500
  --use_ddim
  --ddim_steps 50
  --normalize_labels
  --batch_size 8
  --epochs 200
  --lr 2e-4
  --early_stop_patience 100
  --sample_every 10
  --base_channels 64
  --channel_multipliers 1 2 4 8
  --attention_levels 2 3
  --data_dir "${DATA_DIR}"
  --output_dir outputs_conditional_6param
  --use_amp
)

# Capture the exit code so the script's status reflects the training result
# rather than that of the final echo.
train_status=0
python train_conditional.py "${train_args[@]}" || train_status=$?

# To resume: point --resume at checkpoints/checkpoint_epoch_N.pt and set --epochs to the
# new total; add --resume_refresh_scheduler if extending past the original epoch count.

if (( train_status != 0 )); then
  echo "===============================================" >&2
  echo "Training FAILED (exit code ${train_status}) at: $(date)" >&2
  echo "===============================================" >&2
  exit "${train_status}"
fi

echo "==============================================="
echo "Training completed at: $(date)"
echo "==============================================="