03100329

Browse files

Files changed (5) hide show

diffusion.py +4 -4
load_h5.py +6 -4
perlmutter_diffusion.sbatch +9 -9
quantify_results.ipynb +2 -2
tensorboard.ipynb +6 -6

diffusion.py CHANGED Viewed

@@ -241,8 +241,8 @@ class TrainConfig:
     world_size = 1#torch.cuda.device_count()
     # repeat = 2
-    dim = 2
-    #dim = 3#2
     stride = (2,4) if dim == 2 else (2,2,4)
     num_image = 32#0#0#640#320#6400#3000#480#1200#120#3000#300#3000#6000#30#60#6000#1000#2000#20000#15000#7000#25600#3000#10000#1000#10000#5000#2560#800#2560
     batch_size = 1#1#10#50#10#50#20#50#1#2#50#20#2#100 # 10
@@ -275,7 +275,7 @@ class TrainConfig:
     # seed = 0
     # save_dir = './outputs/'
-    save_period = np.infty #n_epoch // 2 #np.infty#.1 # the period of sampling
     # general parameters for the name and logger
     # device = "cuda" if torch.cuda.is_available() else "cpu"
     lrate = 1e-4
@@ -405,7 +405,7 @@ class DDPM21CM:
         dataset = Dataset4h5(
             self.config.dataset_name,
             num_image=self.config.num_image,
-            idx = "random",#'range',
             HII_DIM=self.config.HII_DIM,
             num_redshift=self.config.num_redshift,
             startat=self.config.startat,

     world_size = 1#torch.cuda.device_count()
     # repeat = 2
+    #dim = 2
+    dim = 3#2
     stride = (2,4) if dim == 2 else (2,2,4)
     num_image = 32#0#0#640#320#6400#3000#480#1200#120#3000#300#3000#6000#30#60#6000#1000#2000#20000#15000#7000#25600#3000#10000#1000#10000#5000#2560#800#2560
     batch_size = 1#1#10#50#10#50#20#50#1#2#50#20#2#100 # 10
     # seed = 0
     # save_dir = './outputs/'
+    save_period = 5 #np.infty #n_epoch // 2 #np.infty#.1 # the period of sampling
     # general parameters for the name and logger
     # device = "cuda" if torch.cuda.is_available() else "cpu"
     lrate = 1e-4
         dataset = Dataset4h5(
             self.config.dataset_name,
             num_image=self.config.num_image,
+            idx = 'range',#"random",#
             HII_DIM=self.config.HII_DIM,
             num_redshift=self.config.num_redshift,
             startat=self.config.startat,

load_h5.py CHANGED Viewed

@@ -100,8 +100,10 @@ class Dataset4h5(Dataset):
             # print(self.idx)
         elif self.idx == "range":
             rank = torch.cuda.current_device()
             self.idx = range(
-                rank*self.num_image, (rank+1)*self.num_image
                 )
             print(f"loading {len(self.idx)} images with idx = {self.idx}")
         else:
@@ -116,7 +118,7 @@ class Dataset4h5(Dataset):
         concurrent_init_start = time()
         with concurrent.futures.ProcessPoolExecutor(max_workers=self.num_workers) as executor:
             concurrent_init_end = time()
-            print(f" {socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}, concurrently loading by {self.num_workers}/{len(os.sched_getaffinity(0))} workers, initialized after {concurrent_init_end-concurrent_init_start:.3f}s ".center(self.str_len, '-'))
             futures = [None] * self.num_workers
             for i, idx in enumerate(np.array_split(self.idx, self.num_workers)):
                 executor_start = time()
@@ -131,7 +133,7 @@ class Dataset4h5(Dataset):
                 self.params[start_idx:start_idx+batch_size] = params
                 start_idx += batch_size
             concurrent_end = time()
-            print(f" {socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}, {start_idx} images {self.images.shape} & params {self.params.shape} loaded after {concurrent_start-concurrent_init_start:.3f}/{concurrent_end-concurrent_start:.3f}s ".center(self.str_len, '-'))
         transform_start = time()
         if self.transform:
@@ -162,7 +164,7 @@ class Dataset4h5(Dataset):
             param_start = time()
             params = f['params']['values'][idx]
             param_end = time()
-            print(f"{socket.gethostbyname(socket.gethostname())}, cuda:{torch.cuda.current_device()}, CPU-pid {cpu_num}-{pid}: images {images.shape} & params {params.shape} loaded after {executor_start-concurrent_init_end:.3f}/{set_device-executor_start:.3f}/{open_h5py-set_device:.3f}/{images_start-open_h5py:.3f}s + {images_end-images_start:.3f}s & {param_end-param_start:.3f}s")
         return images, params

             # print(self.idx)
         elif self.idx == "range":
             rank = torch.cuda.current_device()
+            local_world_size = torch.cuda.device_count()
+            self.global_rank = rank + local_world_size * int(os.environ["SLURM_NODEID"])
             self.idx = range(
+                self.global_rank*self.num_image, (self.global_rank+1)*self.num_image
                 )
             print(f"loading {len(self.idx)} images with idx = {self.idx}")
         else:
         concurrent_init_start = time()
         with concurrent.futures.ProcessPoolExecutor(max_workers=self.num_workers) as executor:
             concurrent_init_end = time()
+            print(f" {socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}/{self.global_rank}, concurrently loading by {self.num_workers}/{len(os.sched_getaffinity(0))} workers, initialized after {concurrent_init_end-concurrent_init_start:.3f}s ".center(self.str_len, '-'))
             futures = [None] * self.num_workers
             for i, idx in enumerate(np.array_split(self.idx, self.num_workers)):
                 executor_start = time()
                 self.params[start_idx:start_idx+batch_size] = params
                 start_idx += batch_size
             concurrent_end = time()
+            print(f" {socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}/{self.global_rank}, {start_idx} images {self.images.shape} & params {self.params.shape} loaded after {concurrent_start-concurrent_init_start:.3f}/{concurrent_end-concurrent_start:.3f}s ".center(self.str_len, '-'))
         transform_start = time()
         if self.transform:
             param_start = time()
             params = f['params']['values'][idx]
             param_end = time()
+            print(f"{socket.gethostbyname(socket.gethostname())}, cuda:{torch.cuda.current_device()}/{self.global_rank}, CPU-pid {cpu_num}-{pid}: images {images.shape} & params {params.shape} loaded after {executor_start-concurrent_init_end:.3f}/{set_device-executor_start:.3f}/{open_h5py-set_device:.3f}/{images_start-open_h5py:.3f}s + {images_end-images_start:.3f}s & {param_end-param_start:.3f}s")
         return images, params

perlmutter_diffusion.sbatch CHANGED Viewed

@@ -1,11 +1,11 @@
 #!/bin/bash
 #SBATCH -A m4717
 #SBATCH -J diffusion
-#SBATCH -C gpu&hbm40g
 #SBATCH -q regular #shared
-#SBATCH -N1
 #SBATCH --gpus-per-node=4
-#SBATCH -t 06:00:00
 #SBATCH --ntasks-per-node=1
 #SBATCH -oReport-%j
 #SBATCH --mail-type=BEGIN,END,FAIL
@@ -36,16 +36,16 @@ cat $0
 #nvidia-smi
 srun python diffusion.py \
-    --num_image 6400 \
-    --batch_size 5 \
     --n_epoch 50 \
-    --gradient_accumulation_steps 10 \
-    --num_new_img_per_gpu 200 \
-    --max_num_img_per_gpu 100 \
     --channel_mult 1 1 2 2 4 4 \
     --autocast 1 \
     --use_checkpoint 1 \
     --train "$SCRATCH/LEN128-DIM64-CUB16-Tvir[4, 6]-zeta[10, 250]-0809-123640.h5" \
-    #--resume outputs/model-N6400-device_count4-node2-epoch49-29150501 \
 date

 #!/bin/bash
 #SBATCH -A m4717
 #SBATCH -J diffusion
+#SBATCH -C gpu&hbm80g
 #SBATCH -q regular #shared
+#SBATCH -N10
 #SBATCH --gpus-per-node=4
+#SBATCH -t 30:20:00
 #SBATCH --ntasks-per-node=1
 #SBATCH -oReport-%j
 #SBATCH --mail-type=BEGIN,END,FAIL
 #nvidia-smi
 srun python diffusion.py \
+    --num_image 640 \
+    --batch_size 1 \
     --n_epoch 50 \
+    --num_new_img_per_gpu 20 \
+    --max_num_img_per_gpu 4 \
     --channel_mult 1 1 2 2 4 4 \
+    --gradient_accumulation_steps 1 \
     --autocast 1 \
     --use_checkpoint 1 \
     --train "$SCRATCH/LEN128-DIM64-CUB16-Tvir[4, 6]-zeta[10, 250]-0809-123640.h5" \
+    #--resume ./outputs/model-N6400-device_count4-node1-epoch49-30143433 \
 date

quantify_results.ipynb CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ba1c20f3e7f85a9317d48540306a8ce1ba8f6ad847164ddd45b1167afe45a9f3
-size 14549371

 version https://git-lfs.github.com/spec/v1
+oid sha256:3618a302e39b53b9514cfc8bdee9a3b8e40e51565fb6ad99b2783ce2b89764cd
+size 14880549

tensorboard.ipynb CHANGED Viewed

@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 4,
    "id": "ae45e44e-a11c-43ef-b830-c7a58a72f51e",
    "metadata": {
     "tags": []
@@ -24,14 +24,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
    "id": "a5c088b8-5051-402f-b4ec-2b684ad5a952",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "Reusing TensorBoard on port 45739 (pid 1821871), started 22:49:21 ago. (Use '!kill 1821871' to kill it.)"
       ]
      },
      "metadata": {},
@@ -41,11 +41,11 @@
      "data": {
       "text/html": [
        "\n",
-       "      <iframe id=\"tensorboard-frame-6e7bfddd80e38793\" width=\"100%\" height=\"800\" frameborder=\"0\">\n",
        "      </iframe>\n",
        "      <script>\n",
        "        (function() {\n",
-       "          const frame = document.getElementById(\"tensorboard-frame-6e7bfddd80e38793\");\n",
        "          const url = new URL(\"/\", window.location);\n",
        "          const port = 45739;\n",
        "          if (port) {\n",
@@ -70,7 +70,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
    "id": "2f76c0a9-2218-4073-86aa-f4f655d7642f",
    "metadata": {},
    "outputs": [

  "cells": [
   {
    "cell_type": "code",
+   "execution_count": 7,
    "id": "ae45e44e-a11c-43ef-b830-c7a58a72f51e",
    "metadata": {
     "tags": []
   },
   {
    "cell_type": "code",
+   "execution_count": 8,
    "id": "a5c088b8-5051-402f-b4ec-2b684ad5a952",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
+       "Reusing TensorBoard on port 45739 (pid 1821871), started 2 days, 2:32:50 ago. (Use '!kill 1821871' to kill it.)"
       ]
      },
      "metadata": {},
      "data": {
       "text/html": [
        "\n",
+       "      <iframe id=\"tensorboard-frame-905898ab07792b79\" width=\"100%\" height=\"800\" frameborder=\"0\">\n",
        "      </iframe>\n",
        "      <script>\n",
        "        (function() {\n",
+       "          const frame = document.getElementById(\"tensorboard-frame-905898ab07792b79\");\n",
        "          const url = new URL(\"/\", window.location);\n",
        "          const port = 45739;\n",
        "          if (port) {\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 9,
    "id": "2f76c0a9-2218-4073-86aa-f4f655d7642f",
    "metadata": {},
    "outputs": [