Xsmos committed (verified)
Commit 733f17e · Parent(s): 32187db
context_unet.py CHANGED
@@ -127,12 +127,19 @@ class TimestepBlock(ABC, nn.Module):
     """
 
 class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
+    def __init__(self, *args, use_checkpoint=False):
+        super().__init__(*args)
+        self.use_checkpoint = use_checkpoint
+
     def forward(self, x, emb, encoder_out=None):
         for layer in self:
             if isinstance(layer, TimestepBlock):
                 x = layer(x, emb)
             elif isinstance(layer, AttentionBlock):
                 x = layer(x, encoder_out)
+            elif self.use_checkpoint and isinstance(layer, tuple(Conv.values())):
+                print(f"TimestepEmbedSequential checkpoint working for layer {type(layer)}")
+                x = checkpoint.checkpoint(layer, x)
             else:
                 x = layer(x)
         return x
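The new branch wraps bare convolution layers in activation (gradient) checkpointing. Below is a minimal sketch of the pattern, assuming `Conv` is a module-level dict mapping dimensionality to conv classes (which the `tuple(Conv.values())` test suggests) and `checkpoint` is `torch.utils.checkpoint`; the registry shown is illustrative, not the repository's actual definition:

import torch
import torch.nn as nn
from torch.utils import checkpoint

# Assumed shape of the Conv registry the diff's isinstance() test refers to.
Conv = {1: nn.Conv1d, 2: nn.Conv2d, 3: nn.Conv3d}

layer = Conv[3](in_channels=8, out_channels=8, kernel_size=3, padding=1)
x = torch.randn(1, 8, 16, 16, 16, requires_grad=True)

# checkpoint() discards the layer's intermediate activations in the
# forward pass and recomputes them during backward, trading extra
# compute for a much smaller activation-memory footprint.
y = checkpoint.checkpoint(layer, x, use_reentrant=False)
y.sum().backward()

Passing `use_reentrant=False` explicitly avoids the deprecation warning recent PyTorch releases emit for the bare two-argument call used in the diff. Note also that the `print` in the new branch fires once per checkpointed layer on every forward pass, so it reads as temporary debug output.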
diffusion.py CHANGED
@@ -241,8 +241,8 @@ class TrainConfig:
     world_size = 1#torch.cuda.device_count()
     # repeat = 2
 
-    dim = 2
-    #dim = 3#2
+    #dim = 2
+    dim = 3#2
     stride = (2,4) if dim == 2 else (2,2,4)
     num_image = 32#0#0#640#320#6400#3000#480#1200#120#3000#300#3000#6000#30#60#6000#1000#2000#20000#15000#7000#25600#3000#10000#1000#10000#5000#2560#800#2560
     batch_size = 1#1#10#50#10#50#20#50#1#2#50#20#2#100 # 10
@@ -296,7 +296,7 @@ class TrainConfig:
     #mixed_precision = "no" #"fp16"
     gradient_accumulation_steps = 1
 
-    pbar_update_step = 20
+    #pbar_update_step = 20
 
     channel_mult = (1,2,2,2,4)
     # date = datetime.datetime.now().strftime("%m%d-%H%M")
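The first hunk flips training from 2-D to 3-D data. A tiny worked example of how the flag propagates, using only the config lines visible in this diff (the rest of TrainConfig is not shown here):

dim = 3
stride = (2, 4) if dim == 2 else (2, 2, 4)
print(stride)  # (2, 2, 4): one downsampling factor per spatial axis

Volumetric inputs also explain the conservative `batch_size = 1` and the checkpointing added in context_unet.py above. The second hunk merely comments out `pbar_update_step`, presumably to silence progress-bar updates in batch jobs.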
perlmutter_diffusion.sbatch CHANGED
@@ -2,10 +2,10 @@
 #SBATCH -A m4717
 #SBATCH -J diffusion
 #SBATCH -C gpu&hbm80g
-#SBATCH -q regular #shared
+#SBATCH -q shared #regular
 #SBATCH -N1
-#SBATCH --gpus-per-node=4
-#SBATCH -t 08:30:00
+#SBATCH --gpus-per-node=1
+#SBATCH -t 00:30:00
 #SBATCH --ntasks-per-node=1
 #SBATCH -oReport-%j
 #SBATCH --mail-type=BEGIN,END,FAIL
@@ -36,16 +36,16 @@ cat $0
 #nvidia-smi
 
 srun python diffusion.py \
-    --num_image 6400 \
-    --batch_size 128 \
-    --n_epoch 200 \
-    --num_new_img_per_gpu 20 \
-    --max_num_img_per_gpu 4 \
-    --channel_mult 0.5 1 2 2 4 8 \
+    --num_image 64 \
+    --batch_size 2 \
+    --n_epoch 50 \
+    --channel_mult 0.5 1 2 4 4 8 \
+    --num_new_img_per_gpu 800 \
+    --max_num_img_per_gpu 100 \
     --gradient_accumulation_steps 1 \
     --autocast 1 \
     --use_checkpoint 1 \
     --train "$SCRATCH/LEN128-DIM64-CUB16-Tvir[4, 6]-zeta[10, 250]-0809-123640.h5" \
-    #--resume ./outputs/model-N6400-device_count4-node1-epoch49-30143433 \
+    #--resume ./outputs/model-N6400-device_count4-node1-epoch199-05185634 \
 
 date
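Taken together, the scheduler edits turn the job from a production run (regular queue, 4 GPUs, 8.5 h) into a short single-GPU job on Perlmutter's shared queue (30 min), which allocates fractions of a node and suits quick tests; the `gpu&hbm80g` constraint still targets the 80 GB GPU nodes. The training run is scaled down to match (64 images, batch size 2, 50 epochs), the per-GPU image limits are raised, `channel_mult` is adjusted, and the commented-out `--resume` path now points at the epoch-199 checkpoint. To submit and monitor: `sbatch perlmutter_diffusion.sbatch`, then `squeue -u $USER`.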
quantify_results.ipynb CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3618a302e39b53b9514cfc8bdee9a3b8e40e51565fb6ad99b2783ce2b89764cd
-size 14880549
+oid sha256:2f5c609710980f1c8798c5f4732afe3f28bce2a24799b0ef5028f1c9fef85a5d
+size 15711677