File size: 1,542 Bytes
c496462
f513198
c496462
 
 
 
 
 
f513198
c496462
 
 
 
f513198
c496462
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f513198
c496462
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#!/bin/bash
#SBATCH --account=<your-slurm-account>
#SBATCH --partition=l40s
#SBATCH --nodes=1
#SBATCH --ntasks=8
#SBATCH --gres=gpu:l40s:1
#SBATCH --time=48:00:00
#SBATCH --job-name=ddpm_hi_april26
#SBATCH --mail-user=<your-email>   # replace before submitting
#SBATCH --mail-type=END,FAIL
#SBATCH --output=slurm-%j.out
#SBATCH --error=slurm-%j.err

# SLURM batch script: trains the 2-label conditional diffusion model by
# running train_conditional.py on one node with a single L40S GPU.
#
# Before submitting:
#   * replace <your-slurm-account> and <your-email> in the #SBATCH header;
#   * either export DDPM_ROOT in the submitting shell or replace the
#     <DDPM_ROOT> placeholder below with the checkout location.
#
# --mail-type is required for --mail-user to have any effect (it defaults to
# NONE, i.e. no mail is sent).
# NOTE(review): --ntasks=8 requests eight tasks, but only one python process
# is launched below; --cpus-per-task=8 may have been intended -- confirm.
#
# The script exits with the exit status of the training command so that a
# failed run is not recorded by SLURM as a successful job.

# Repository root. The placeholder is kept inside quotes so that '<' and '>'
# are literal characters rather than shell redirection operators.
DDPM_ROOT="${DDPM_ROOT:-<DDPM_ROOT>}"
project_dir="${DDPM_ROOT}/Models/2param_DDPM_HI_Emulation"
data_dir="${DDPM_ROOT}/data/LH_data/params_2"

# Project root (this repo). Abort early if it is missing: otherwise training
# would be attempted from whatever directory the job happened to start in.
cd -- "$project_dir" || {
    echo "ERROR: cannot cd to '$project_dir' (is DDPM_ROOT set correctly?)" >&2
    exit 1
}

# Best effort: warn but keep going, since a usable python may already be on
# PATH; a missing interpreter is caught by the exit-status check below.
module load python/miniconda3-py3.12-usr ||
    echo "WARNING: 'module load python/miniconda3-py3.12-usr' failed" >&2

echo "==============================================="
echo "Job ID: $SLURM_JOB_ID"
echo "Job Name: $SLURM_JOB_NAME"
echo "Node: $SLURM_NODELIST"
echo "GPU: $CUDA_VISIBLE_DEVICES"
echo "Starting Time: $(date)"
echo "Conditional diffusion training (DDPM_HI_Emulation_improved)"
echo "==============================================="

# Capture the exit status instead of discarding it, so the final banner and
# the job's exit code reflect what actually happened.
train_status=0
python train_conditional.py \
    --label_dim 2 \
    --timesteps 1500 \
    --use_ddim \
    --ddim_steps 50 \
    --normalize_labels \
    --batch_size 8 \
    --epochs 200 \
    --lr 2e-4 \
    --early_stop_patience 100 \
    --sample_every 100 \
    --base_channels 64 \
    --channel_multipliers 1 2 4 8 \
    --attention_levels 2 3 \
    --data_dir "$data_dir" \
    --output_dir outputs_conditional_2label || train_status=$?

# To resume (e.g. epoch 100 → 150): use scripts/shell/resume_conditional_epoch100_50more.sh
# or add --epochs <new_total> --resume .../checkpoint_epoch_N.pt --resume_refresh_scheduler

echo "==============================================="
if (( train_status != 0 )); then
    echo "Training FAILED (exit code ${train_status}) at: $(date)" >&2
    echo "==============================================="
    exit "$train_status"
fi
echo "Training completed at: $(date)"
echo "==============================================="