File size: 2,028 Bytes
eb725f8
1f3e7a2
eb725f8
 
 
 
 
 
1f3e7a2
eb725f8
 
 
 
 
1f3e7a2
eb725f8
 
1f3e7a2
eb725f8
1f3e7a2
eb725f8
 
 
 
1f3e7a2
eb725f8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/bin/bash
# Slurm batch header. sbatch only honours "#SBATCH" lines that appear before
# the first executable command, so keep this group at the top of the file.
# Fill in the <your-slurm-account> and <your-email> placeholders before
# submitting.
#
# Resources requested: one node, 8 tasks and a single GPU of type l40s on the
# l40s partition, with a 48-hour wall-clock limit.
#SBATCH --account=<your-slurm-account>
#SBATCH --partition=l40s
#SBATCH --nodes=1
#SBATCH --ntasks=8
#SBATCH --gres=gpu:l40s:1
#SBATCH --time=48:00:00
#SBATCH --job-name=ddpm_hi_lh6
# NOTE(review): no --mail-type is given; per the sbatch documentation e-mail is
# sent only for the event types selected with --mail-type, so this address may
# never be used. Confirm, and add e.g. --mail-type=END,FAIL if notifications
# are wanted.
#SBATCH --mail-user=<your-email>   # replace before submitting
# %j in the file names below is replaced by the job ID; stdout and stderr go
# to separate files.
#SBATCH --output=slurm-%j.out
#SBATCH --error=slurm-%j.err

# Conditional DDPM training — 6 CAMELS LH parameters (ddpm_hi_lh6).
# Submit from anywhere:
#   sbatch <DDPM_ROOT>/Models/6param_ddpm_hi_lh6/scripts/shell/train_conditional_lh6.sh
#
# Override data path (optional): any folder containing *_LH_6.npy and *_labels_LH.npy
#   sbatch --export=DATA_DIR=<DDPM_ROOT>/data/LH_data/params_6 train_conditional_lh6.sh

# ---------------------------------------------------------------------------
# Job body: enter the project directory, load Python, log the job context and
# run conditional DDPM training. The job exits non-zero when the project or
# data directory is missing or when training fails, so that Slurm does not
# record a broken run as COMPLETED.
#
# Optional environment overrides:
#   DDPM_ROOT  project root. When unset, the default assigned below is used;
#              that default is a placeholder and must be edited to a real path.
#   DATA_DIR   folder containing *_LH_6.npy and *_labels_LH.npy. Defaults to
#              ${DDPM_ROOT}/data/LH_data/params_6.
# ---------------------------------------------------------------------------

# Print an error message to stderr and abort the job with status 1.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

# Keep the default quoted: the unedited placeholder contains the characters
# '<' and '>', which bash treats as redirection operators when left unquoted.
DDPM_ROOT="${DDPM_ROOT:-<DDPM_ROOT>}"
PROJECT_DIR="${DDPM_ROOT}/Models/6param_ddpm_hi_lh6"

# train_conditional.py and the output directory are addressed relative to the
# project directory, so do not continue from the wrong working directory.
cd -- "${PROJECT_DIR}" \
  || die "cannot cd to ${PROJECT_DIR}; set DDPM_ROOT or edit its default in this script"

# A failed module load is reported but not fatal: a usable python may already
# be on PATH, and a wrong interpreter is still caught by the exit-status check
# on the training command below.
module load python/miniconda3-py3.12-usr \
  || echo "WARNING: 'module load python/miniconda3-py3.12-usr' failed; using python from PATH" >&2

# Same LH_data layout as DDPM_HI_Emulation_improved (params_2 for 2 labels → params_6 here).
DATA_DIR="${DATA_DIR:-${DDPM_ROOT}/data/LH_data/params_6}"

# Fail fast, before any training work starts. A relative DATA_DIR is
# resolved against the project directory entered above.
[[ -d "${DATA_DIR}" ]] || die "DATA_DIR is not a directory: ${DATA_DIR}"

rule="==============================================="

# Job context banner. The ':-' defaults only keep the expansions well-defined
# when the script is run outside a Slurm allocation.
cat <<EOF
${rule}
Job ID: ${SLURM_JOB_ID:-}
Job Name: ${SLURM_JOB_NAME:-}
Node: ${SLURM_NODELIST:-}
GPU: ${CUDA_VISIBLE_DEVICES:-}
Starting Time: $(date)
Conditional diffusion training (ddpm_hi_lh6, 6 labels)
DATA_DIR: ${DATA_DIR}
${rule}
EOF

# Training arguments, kept in an array so each value stays a single word.
# The grouping comments follow the flag names only; see train_conditional.py
# for the exact meaning of each option.
train_args=(
  # conditioning labels
  --label_dim 6
  --normalize_labels
  # diffusion / sampling
  --timesteps 1500
  --use_ddim
  --ddim_steps 50
  # optimisation schedule
  --batch_size 8
  --epochs 200
  --lr 2e-4
  --early_stop_patience 100
  --sample_every 10
  --use_amp
  # network layout
  --base_channels 64
  --channel_multipliers 1 2 4 8
  --attention_levels 2 3
  # input / output locations
  --data_dir "${DATA_DIR}"
  --output_dir outputs_conditional_6param
)

# To resume: point --resume at checkpoints/checkpoint_epoch_N.pt and set --epochs to the
# new total; add --resume_refresh_scheduler if extending past the original epoch count.

# Capture the exit status instead of discarding it, so a crashed run is not
# followed by a "completed" banner and a zero exit code.
train_status=0
python train_conditional.py "${train_args[@]}" || train_status=$?

if (( train_status != 0 )); then
  {
    echo "${rule}"
    echo "Training FAILED (exit code ${train_status}) at: $(date)"
    echo "${rule}"
  } >&2
  exit "${train_status}"
fi

echo "${rule}"
echo "Training completed at: $(date)"
echo "${rule}"