32257242

Files changed (4) hide show

diffusion.py CHANGED Viewed

@@ -438,14 +438,13 @@ class DDPM21CM:
         del dataset
-    def transform(self, img):
-        if self.config.dim == 3:
-            #flip along x or y or both
-            flip_xy = [i+2 for i in range(2) if getrandbits(1)]
-            img = torch.flip(img, dims=flip_xy)
-            # flip diagonally
-            if getrandbits(1):
-                img = img.transpose(2,3)
         return img
     def train(self):
@@ -496,7 +495,10 @@ class DDPM21CM:
             pbar_train.set_description(f"{socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}/{self.config.global_rank} Epoch {ep}")
             epoch_start = time()
             for i, (x, c) in enumerate(self.dataloader):
-                x = self.transform(x)
                 x = x.to(self.config.device)#.to(self.config.dtype)
                 # autocast forward propagation
                 with autocast(enabled=self.config.autocast):

         del dataset
+    def transform(self, img, idx):
+        #flip along x or y or both
+        flip_xy = [i+1 for i in range(2) if getrandbits(1)]
+        img[idx] = torch.flip(img[idx], dims=flip_xy)
+        # flip diagonally
+        if getrandbits(1):
+            img[idx] = img[idx].clone().transpose(1,2)
         return img
     def train(self):
             pbar_train.set_description(f"{socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}/{self.config.global_rank} Epoch {ep}")
             epoch_start = time()
             for i, (x, c) in enumerate(self.dataloader):
+                if self.config.dim == 3:
+                    for idx in range(len(x)):
+                        x = self.transform(x, idx)
                 x = x.to(self.config.device)#.to(self.config.dtype)
                 # autocast forward propagation
                 with autocast(enabled=self.config.autocast):

perlmutter_diffusion.sbatch CHANGED Viewed

@@ -5,7 +5,7 @@
 #SBATCH -q regular #shared
 #SBATCH -N4
 #SBATCH --gpus-per-node=4
-#SBATCH -t 16:00:00
 #SBATCH --ntasks-per-node=1
 #SBATCH -oReport-%j
 #SBATCH --mail-type=BEGIN,END,FAIL
@@ -30,16 +30,16 @@ cat $0
 srun python diffusion.py \
     --num_image 1600 \
     --batch_size 2 \
-    --n_epoch 20 \
     --channel_mult 0.5 1 2 4 4 8 \
     --num_new_img_per_gpu 4 \
     --max_num_img_per_gpu 2 \
     --gradient_accumulation_steps 10 \
     --autocast 1 \
     --use_checkpoint 1 \
-    --dropout 0 \
     --lrate 2e-5 \
     --train "$SCRATCH/LEN128-DIM64-CUB16-Tvir[4, 6]-zeta[10, 250]-0809-123640.h5" \
-    #--resume ./outputs/model-N1600-device_count4-node4-epoch19-32096018 \
 date

 #SBATCH -q regular #shared
 #SBATCH -N4
 #SBATCH --gpus-per-node=4
+#SBATCH -t 48:00:00
 #SBATCH --ntasks-per-node=1
 #SBATCH -oReport-%j
 #SBATCH --mail-type=BEGIN,END,FAIL
 srun python diffusion.py \
     --num_image 1600 \
     --batch_size 2 \
+    --n_epoch 60 \
     --channel_mult 0.5 1 2 4 4 8 \
     --num_new_img_per_gpu 4 \
     --max_num_img_per_gpu 2 \
     --gradient_accumulation_steps 10 \
     --autocast 1 \
     --use_checkpoint 1 \
+    --dropout 0.2 \
     --lrate 2e-5 \
     --train "$SCRATCH/LEN128-DIM64-CUB16-Tvir[4, 6]-zeta[10, 250]-0809-123640.h5" \
+    #--resume ./outputs/model-N1600-device_count4-node4-epoch19-32185426 \
 date

quantify_results.ipynb CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:076520e2ea10edaa431fab43004103361a160e2b900aa59b3baa114b0aaa5773
-size 24213875

 version https://git-lfs.github.com/spec/v1
+oid sha256:6691f0135f3bb373506e6090511c5527f23e0b4dc780f031bf80ca6d141e32ca
+size 25396988

tensorboard.ipynb CHANGED Viewed

@@ -23,13 +23,13 @@
      "data": {
       "text/html": [
        "\n",
-       "      <iframe id=\"tensorboard-frame-54a74258cbb72d6\" width=\"100%\" height=\"800\" frameborder=\"0\">\n",
        "      </iframe>\n",
        "      <script>\n",
        "        (function() {\n",
-       "          const frame = document.getElementById(\"tensorboard-frame-54a74258cbb72d6\");\n",
        "          const url = new URL(\"/\", window.location);\n",
-       "          const port = 42029;\n",
        "          if (port) {\n",
        "            url.port = port;\n",
        "          }\n",
@@ -59,7 +59,7 @@
     {
      "data": {
       "text/html": [
-       "<a href=\"https://jupyter.nersc.gov/user/binxia/perlmutter-login-node-base/proxy/42029/\">https://jupyter.nersc.gov/user/binxia/perlmutter-login-node-base/proxy/42029/</a>"
       ],
       "text/plain": [
        "<IPython.core.display.HTML object>"

      "data": {
       "text/html": [
        "\n",
+       "      <iframe id=\"tensorboard-frame-b3fe77206bcde3f5\" width=\"100%\" height=\"800\" frameborder=\"0\">\n",
        "      </iframe>\n",
        "      <script>\n",
        "        (function() {\n",
+       "          const frame = document.getElementById(\"tensorboard-frame-b3fe77206bcde3f5\");\n",
        "          const url = new URL(\"/\", window.location);\n",
+       "          const port = 33553;\n",
        "          if (port) {\n",
        "            url.port = port;\n",
        "          }\n",
     {
      "data": {
       "text/html": [
+       "<a href=\"https://jupyter.nersc.gov/user/binxia/perlmutter-login-node-base/proxy/33553/\">https://jupyter.nersc.gov/user/binxia/perlmutter-login-node-base/proxy/33553/</a>"
       ],
       "text/plain": [
        "<IPython.core.display.HTML object>"