#!/bin/bash
#SBATCH --account=
#SBATCH --partition=l40s
#SBATCH --nodes=1
#SBATCH --ntasks=8
#SBATCH --gres=gpu:l40s:1
#SBATCH --time=48:00:00
#SBATCH --job-name=ddpm_hi_lh6
# Replace --mail-user (and fill in --account) before submitting.
#SBATCH --mail-user=
#SBATCH --output=slurm-%j.out
#SBATCH --error=slurm-%j.err

# Conditional DDPM training — 6 CAMELS LH parameters (ddpm_hi_lh6).
#
# Submit from anywhere:
#   sbatch /Models/6param_ddpm_hi_lh6/scripts/shell/train_conditional_lh6.sh
#
# Override data path (optional): any folder containing *_LH_6.npy and *_labels_LH.npy
#   sbatch --export=ALL,DATA_DIR=/data/LH_data/params_6 train_conditional_lh6.sh
# (Keep "ALL," so the rest of the submitting environment is still propagated
# to the job alongside DATA_DIR.)
#
# Environment:
#   DATA_DIR  Optional. Directory passed to --data_dir; defaults to
#             /data/LH_data/params_6.
#
# Exit status:
#   0 when training finishes successfully; otherwise the exit status of the
#   failing step (cd, module load, or train_conditional.py).
#
# NOTE(review): a single python process is launched below. If the 8 tasks
# requested above are meant as CPU cores for that one process,
# "--ntasks=1 --cpus-per-task=8" may be what is intended — confirm against
# the cluster's scheduling policy before changing.

readonly PROJECT_DIR="/Models/6param_ddpm_hi_lh6"

# train_conditional.py and the relative --output_dir are resolved against the
# project directory, so refuse to continue from anywhere else.
cd "${PROJECT_DIR}" || {
  echo "ERROR: cannot cd to ${PROJECT_DIR}" >&2
  exit 1
}

# Abort early rather than train with whichever python happens to be on PATH.
module load python/miniconda3-py3.12-usr || {
  echo "ERROR: failed to load module python/miniconda3-py3.12-usr" >&2
  exit 1
}

# Same LH_data layout as DDPM_HI_Emulation_improved (params_2 for 2 labels → params_6 here).
DATA_DIR="${DATA_DIR:-/data/LH_data/params_6}"

echo "==============================================="
echo "Job ID: $SLURM_JOB_ID"
echo "Job Name: $SLURM_JOB_NAME"
echo "Node: $SLURM_NODELIST"
echo "GPU: $CUDA_VISIBLE_DEVICES"
echo "Starting Time: $(date)"
echo "Conditional diffusion training (ddpm_hi_lh6, 6 labels)"
echo "DATA_DIR: ${DATA_DIR}"
echo "==============================================="

# Arguments for train_conditional.py, kept in an array so each option stays
# on its own line without backslash continuations.
train_args=(
  --label_dim 6
  --timesteps 1500
  --use_ddim
  --ddim_steps 50
  --normalize_labels
  --batch_size 8
  --epochs 200
  --lr 2e-4
  --early_stop_patience 100
  --sample_every 10
  --base_channels 64
  --channel_multipliers 1 2 4 8
  --attention_levels 2 3
  --data_dir "${DATA_DIR}"
  --output_dir outputs_conditional_6param
  --use_amp
)

# To resume: point --resume at checkpoints/checkpoint_epoch_N.pt and set --epochs to the
# new total; add --resume_refresh_scheduler if extending past the original epoch count.

# Capture the training exit status so a failure is reported and propagated
# instead of being hidden behind the closing banner.
train_status=0
python train_conditional.py "${train_args[@]}" || train_status=$?

if (( train_status != 0 )); then
  echo "===============================================" >&2
  echo "Training FAILED (exit status ${train_status}) at: $(date)" >&2
  echo "===============================================" >&2
  exit "${train_status}"
fi

echo "==============================================="
echo "Training completed at: $(date)"
echo "==============================================="