File size: 1,542 Bytes
#!/bin/bash
#
# Slurm batch job: conditional diffusion training (DDPM_HI_Emulation_improved).
#
# Runs train_conditional.py from the 2param_DDPM_HI_Emulation project
# directory on one node with one L40S GPU and a 48-hour time limit.
#
# Usage:
#   export DDPM_ROOT=/path/to/checkout   # directory containing Models/ and data/
#   sbatch <this-script>
#
# Required environment:
#   DDPM_ROOT  Repository root. sbatch propagates the submission environment
#              to the job by default, so exporting it before submitting is
#              enough.
#
# Before submitting, replace the <your-slurm-account> and <your-email>
# placeholders in the directives below.
#
#SBATCH --account=<your-slurm-account>
#SBATCH --partition=l40s
#SBATCH --nodes=1
#SBATCH --ntasks=8
#SBATCH --gres=gpu:l40s:1
#SBATCH --time=48:00:00
#SBATCH --job-name=ddpm_hi_april26
#SBATCH --mail-user=<your-email> # replace before submitting
#SBATCH --output=slurm-%j.out
#SBATCH --error=slurm-%j.err

# NOTE(review): no --mail-type directive is set, so Slurm will presumably not
# send any mail to --mail-user; add one (e.g. END,FAIL) if notifications are
# wanted -- confirm against the site's sbatch defaults.
# NOTE(review): this script starts a single python process; if the 8 tasks
# are meant as CPU cores for that process, --ntasks=1 --cpus-per-task=8 may
# be the intended request -- confirm with the cluster documentation.

# Print an error message to stderr and abort the job with a non-zero status.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Abort early with a clear message if the repository root was not provided.
# (A literal "<DDPM_ROOT>" token must never appear unquoted in a command:
# bash parses "<" and ">" as redirections, not as part of a path.)
: "${DDPM_ROOT:?set DDPM_ROOT to the repository root before submitting}"

readonly PROJECT_DIR="${DDPM_ROOT}/Models/2param_DDPM_HI_Emulation"
readonly DATA_DIR="${DDPM_ROOT}/data/LH_data/params_2"

# Project root (this repo). train_conditional.py and the relative
# --output_dir below are resolved from here, so a failed cd must be fatal.
cd -- "${PROJECT_DIR}" || die "cannot cd to ${PROJECT_DIR}"

module load python/miniconda3-py3.12-usr \
  || die "failed to load module python/miniconda3-py3.12-usr"

echo "==============================================="
echo "Job ID: $SLURM_JOB_ID"
echo "Job Name: $SLURM_JOB_NAME"
echo "Node: $SLURM_NODELIST"
echo "GPU: $CUDA_VISIBLE_DEVICES"
echo "Starting Time: $(date)"
echo "Conditional diffusion training (DDPM_HI_Emulation_improved)"
echo "==============================================="

# Training arguments, kept in an array so each value is passed as its own
# word and paths are quoted safely.
train_args=(
  --label_dim 2
  --timesteps 1500
  --use_ddim
  --ddim_steps 50
  --normalize_labels
  --batch_size 8
  --epochs 200
  --lr 2e-4
  --early_stop_patience 100
  --sample_every 100
  --base_channels 64
  --channel_multipliers 1 2 4 8
  --attention_levels 2 3
  --data_dir "${DATA_DIR}"
  --output_dir outputs_conditional_2label
)

python train_conditional.py "${train_args[@]}"
train_status=$?

# To resume (e.g. epoch 100 → 150): use scripts/shell/resume_conditional_epoch100_50more.sh
# or add --epochs <new_total> --resume .../checkpoint_epoch_N.pt --resume_refresh_scheduler

echo "==============================================="
if (( train_status == 0 )); then
  echo "Training completed at: $(date)"
else
  echo "Training FAILED with exit code ${train_status} at: $(date)" >&2
fi
echo "==============================================="

# Propagate the trainer's exit status so Slurm records a failed run as FAILED
# instead of COMPLETED.
exit "${train_status}"