#!/bin/bash
#
# SLURM batch job: trains the conditional diffusion model by running
# train_conditional.py on one L40S GPU.
#
# Usage: fill in --account and --mail-user below, then submit with
#   sbatch <this script>
#
# sbatch only reads #SBATCH directives that appear before the first
# executable command, so each directive must stay on its own line here.
#
#SBATCH --account=
#SBATCH --partition=l40s
#SBATCH --nodes=1
# NOTE(review): a single python process is launched below; confirm that
# --ntasks=8 (rather than --cpus-per-task=8) is what is intended.
#SBATCH --ntasks=8
#SBATCH --gres=gpu:l40s:1
#SBATCH --time=48:00:00
#SBATCH --job-name=ddpm_hi_april26
# replace before submitting
#SBATCH --mail-user=
#SBATCH --output=slurm-%j.out
#SBATCH --error=slurm-%j.err

# Project root (this repo). Abort if it is missing: the training script and
# the relative --output_dir below are both resolved against this directory.
cd /Models/2param_DDPM_HI_Emulation || {
  echo "ERROR: cannot cd to /Models/2param_DDPM_HI_Emulation" >&2
  exit 1
}

# Make the Python environment available; stop early if the module is missing.
module load python/miniconda3-py3.12-usr || {
  echo "ERROR: failed to load module python/miniconda3-py3.12-usr" >&2
  exit 1
}

echo "==============================================="
echo "Job ID: $SLURM_JOB_ID"
echo "Job Name: $SLURM_JOB_NAME"
echo "Node: $SLURM_NODELIST"
echo "GPU: $CUDA_VISIBLE_DEVICES"
echo "Starting Time: $(date)"
echo "Conditional diffusion training (DDPM_HI_Emulation_improved)"
echo "==============================================="

# Capture the exit status instead of discarding it, so a crashed run is not
# reported as a successful job.
train_status=0
python train_conditional.py \
  --label_dim 2 \
  --timesteps 1500 \
  --use_ddim \
  --ddim_steps 50 \
  --normalize_labels \
  --batch_size 8 \
  --epochs 200 \
  --lr 2e-4 \
  --early_stop_patience 100 \
  --sample_every 100 \
  --base_channels 64 \
  --channel_multipliers 1 2 4 8 \
  --attention_levels 2 3 \
  --data_dir /data/LH_data/params_2 \
  --output_dir outputs_conditional_2label || train_status=$?

# To resume (e.g. epoch 100 → 150): use scripts/shell/resume_conditional_epoch100_50more.sh
# or add --epochs --resume .../checkpoint_epoch_N.pt --resume_refresh_scheduler

echo "==============================================="
if [[ "$train_status" -eq 0 ]]; then
  echo "Training completed at: $(date)"
else
  echo "Training FAILED (exit code $train_status) at: $(date)" >&2
fi
echo "==============================================="

# Propagate the training result so SLURM records the job state correctly.
exit "$train_status"