Xsmos committed on
Commit
2bcd1d4
·
verified ·
1 Parent(s): 963af0e
diffusion.py CHANGED
@@ -241,8 +241,8 @@ class TrainConfig:
241
  world_size = 1#torch.cuda.device_count()
242
  # repeat = 2
243
 
244
- dim = 2
245
- #dim = 3#2
246
  stride = (2,4) if dim == 2 else (2,2,4)
247
  num_image = 32#0#0#640#320#6400#3000#480#1200#120#3000#300#3000#6000#30#60#6000#1000#2000#20000#15000#7000#25600#3000#10000#1000#10000#5000#2560#800#2560
248
  batch_size = 1#1#10#50#10#50#20#50#1#2#50#20#2#100 # 10
@@ -275,7 +275,7 @@ class TrainConfig:
275
  # seed = 0
276
  # save_dir = './outputs/'
277
 
278
- save_period = np.infty #n_epoch // 2 #np.infty#.1 # the period of sampling
279
  # general parameters for the name and logger
280
  # device = "cuda" if torch.cuda.is_available() else "cpu"
281
  lrate = 1e-4
@@ -405,7 +405,7 @@ class DDPM21CM:
405
  dataset = Dataset4h5(
406
  self.config.dataset_name,
407
  num_image=self.config.num_image,
408
- idx = "random",#'range',
409
  HII_DIM=self.config.HII_DIM,
410
  num_redshift=self.config.num_redshift,
411
  startat=self.config.startat,
 
241
  world_size = 1#torch.cuda.device_count()
242
  # repeat = 2
243
 
244
+ #dim = 2
245
+ dim = 3#2
246
  stride = (2,4) if dim == 2 else (2,2,4)
247
  num_image = 32#0#0#640#320#6400#3000#480#1200#120#3000#300#3000#6000#30#60#6000#1000#2000#20000#15000#7000#25600#3000#10000#1000#10000#5000#2560#800#2560
248
  batch_size = 1#1#10#50#10#50#20#50#1#2#50#20#2#100 # 10
 
275
  # seed = 0
276
  # save_dir = './outputs/'
277
 
278
+ save_period = 5 #np.infty #n_epoch // 2 #np.infty#.1 # the period of sampling
279
  # general parameters for the name and logger
280
  # device = "cuda" if torch.cuda.is_available() else "cpu"
281
  lrate = 1e-4
 
405
  dataset = Dataset4h5(
406
  self.config.dataset_name,
407
  num_image=self.config.num_image,
408
+ idx = 'range',#"random",#
409
  HII_DIM=self.config.HII_DIM,
410
  num_redshift=self.config.num_redshift,
411
  startat=self.config.startat,
load_h5.py CHANGED
@@ -100,8 +100,10 @@ class Dataset4h5(Dataset):
100
  # print(self.idx)
101
  elif self.idx == "range":
102
  rank = torch.cuda.current_device()
 
 
103
  self.idx = range(
104
- rank*self.num_image, (rank+1)*self.num_image
105
  )
106
  print(f"loading {len(self.idx)} images with idx = {self.idx}")
107
  else:
@@ -116,7 +118,7 @@ class Dataset4h5(Dataset):
116
  concurrent_init_start = time()
117
  with concurrent.futures.ProcessPoolExecutor(max_workers=self.num_workers) as executor:
118
  concurrent_init_end = time()
119
- print(f" {socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}, concurrently loading by {self.num_workers}/{len(os.sched_getaffinity(0))} workers, initialized after {concurrent_init_end-concurrent_init_start:.3f}s ".center(self.str_len, '-'))
120
  futures = [None] * self.num_workers
121
  for i, idx in enumerate(np.array_split(self.idx, self.num_workers)):
122
  executor_start = time()
@@ -131,7 +133,7 @@ class Dataset4h5(Dataset):
131
  self.params[start_idx:start_idx+batch_size] = params
132
  start_idx += batch_size
133
  concurrent_end = time()
134
- print(f" {socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}, {start_idx} images {self.images.shape} & params {self.params.shape} loaded after {concurrent_start-concurrent_init_start:.3f}/{concurrent_end-concurrent_start:.3f}s ".center(self.str_len, '-'))
135
 
136
  transform_start = time()
137
  if self.transform:
@@ -162,7 +164,7 @@ class Dataset4h5(Dataset):
162
  param_start = time()
163
  params = f['params']['values'][idx]
164
  param_end = time()
165
- print(f"{socket.gethostbyname(socket.gethostname())}, cuda:{torch.cuda.current_device()}, CPU-pid {cpu_num}-{pid}: images {images.shape} & params {params.shape} loaded after {executor_start-concurrent_init_end:.3f}/{set_device-executor_start:.3f}/{open_h5py-set_device:.3f}/{images_start-open_h5py:.3f}s + {images_end-images_start:.3f}s & {param_end-param_start:.3f}s")
166
 
167
  return images, params
168
 
 
100
  # print(self.idx)
101
  elif self.idx == "range":
102
  rank = torch.cuda.current_device()
103
+ local_world_size = torch.cuda.device_count()
104
+ self.global_rank = rank + local_world_size * int(os.environ["SLURM_NODEID"])
105
  self.idx = range(
106
+ self.global_rank*self.num_image, (self.global_rank+1)*self.num_image
107
  )
108
  print(f"loading {len(self.idx)} images with idx = {self.idx}")
109
  else:
 
118
  concurrent_init_start = time()
119
  with concurrent.futures.ProcessPoolExecutor(max_workers=self.num_workers) as executor:
120
  concurrent_init_end = time()
121
+ print(f" {socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}/{self.global_rank}, concurrently loading by {self.num_workers}/{len(os.sched_getaffinity(0))} workers, initialized after {concurrent_init_end-concurrent_init_start:.3f}s ".center(self.str_len, '-'))
122
  futures = [None] * self.num_workers
123
  for i, idx in enumerate(np.array_split(self.idx, self.num_workers)):
124
  executor_start = time()
 
133
  self.params[start_idx:start_idx+batch_size] = params
134
  start_idx += batch_size
135
  concurrent_end = time()
136
+ print(f" {socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}/{self.global_rank}, {start_idx} images {self.images.shape} & params {self.params.shape} loaded after {concurrent_start-concurrent_init_start:.3f}/{concurrent_end-concurrent_start:.3f}s ".center(self.str_len, '-'))
137
 
138
  transform_start = time()
139
  if self.transform:
 
164
  param_start = time()
165
  params = f['params']['values'][idx]
166
  param_end = time()
167
+ print(f"{socket.gethostbyname(socket.gethostname())}, cuda:{torch.cuda.current_device()}/{self.global_rank}, CPU-pid {cpu_num}-{pid}: images {images.shape} & params {params.shape} loaded after {executor_start-concurrent_init_end:.3f}/{set_device-executor_start:.3f}/{open_h5py-set_device:.3f}/{images_start-open_h5py:.3f}s + {images_end-images_start:.3f}s & {param_end-param_start:.3f}s")
168
 
169
  return images, params
170
 
perlmutter_diffusion.sbatch CHANGED
@@ -1,11 +1,11 @@
1
  #!/bin/bash
2
  #SBATCH -A m4717
3
  #SBATCH -J diffusion
4
- #SBATCH -C gpu&hbm40g
5
  #SBATCH -q regular #shared
6
- #SBATCH -N1
7
  #SBATCH --gpus-per-node=4
8
- #SBATCH -t 06:00:00
9
  #SBATCH --ntasks-per-node=1
10
  #SBATCH -oReport-%j
11
  #SBATCH --mail-type=BEGIN,END,FAIL
@@ -36,16 +36,16 @@ cat $0
36
  #nvidia-smi
37
 
38
  srun python diffusion.py \
39
- --num_image 6400 \
40
- --batch_size 5 \
41
  --n_epoch 50 \
42
- --gradient_accumulation_steps 10 \
43
- --num_new_img_per_gpu 200 \
44
- --max_num_img_per_gpu 100 \
45
  --channel_mult 1 1 2 2 4 4 \
 
46
  --autocast 1 \
47
  --use_checkpoint 1 \
48
  --train "$SCRATCH/LEN128-DIM64-CUB16-Tvir[4, 6]-zeta[10, 250]-0809-123640.h5" \
49
- #--resume outputs/model-N6400-device_count4-node2-epoch49-29150501 \
50
 
51
  date
 
1
  #!/bin/bash
2
  #SBATCH -A m4717
3
  #SBATCH -J diffusion
4
+ #SBATCH -C gpu&hbm80g
5
  #SBATCH -q regular #shared
6
+ #SBATCH -N10
7
  #SBATCH --gpus-per-node=4
8
+ #SBATCH -t 30:20:00
9
  #SBATCH --ntasks-per-node=1
10
  #SBATCH -oReport-%j
11
  #SBATCH --mail-type=BEGIN,END,FAIL
 
36
  #nvidia-smi
37
 
38
  srun python diffusion.py \
39
+ --num_image 640 \
40
+ --batch_size 1 \
41
  --n_epoch 50 \
42
+ --num_new_img_per_gpu 20 \
43
+ --max_num_img_per_gpu 4 \
 
44
  --channel_mult 1 1 2 2 4 4 \
45
+ --gradient_accumulation_steps 1 \
46
  --autocast 1 \
47
  --use_checkpoint 1 \
48
  --train "$SCRATCH/LEN128-DIM64-CUB16-Tvir[4, 6]-zeta[10, 250]-0809-123640.h5" \
49
+ #--resume ./outputs/model-N6400-device_count4-node1-epoch49-30143433 \
50
 
51
  date
quantify_results.ipynb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ba1c20f3e7f85a9317d48540306a8ce1ba8f6ad847164ddd45b1167afe45a9f3
3
- size 14549371
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3618a302e39b53b9514cfc8bdee9a3b8e40e51565fb6ad99b2783ce2b89764cd
3
+ size 14880549
tensorboard.ipynb CHANGED
@@ -2,7 +2,7 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 4,
6
  "id": "ae45e44e-a11c-43ef-b830-c7a58a72f51e",
7
  "metadata": {
8
  "tags": []
@@ -24,14 +24,14 @@
24
  },
25
  {
26
  "cell_type": "code",
27
- "execution_count": 5,
28
  "id": "a5c088b8-5051-402f-b4ec-2b684ad5a952",
29
  "metadata": {},
30
  "outputs": [
31
  {
32
  "data": {
33
  "text/plain": [
34
- "Reusing TensorBoard on port 45739 (pid 1821871), started 22:49:21 ago. (Use '!kill 1821871' to kill it.)"
35
  ]
36
  },
37
  "metadata": {},
@@ -41,11 +41,11 @@
41
  "data": {
42
  "text/html": [
43
  "\n",
44
- " <iframe id=\"tensorboard-frame-6e7bfddd80e38793\" width=\"100%\" height=\"800\" frameborder=\"0\">\n",
45
  " </iframe>\n",
46
  " <script>\n",
47
  " (function() {\n",
48
- " const frame = document.getElementById(\"tensorboard-frame-6e7bfddd80e38793\");\n",
49
  " const url = new URL(\"/\", window.location);\n",
50
  " const port = 45739;\n",
51
  " if (port) {\n",
@@ -70,7 +70,7 @@
70
  },
71
  {
72
  "cell_type": "code",
73
- "execution_count": 6,
74
  "id": "2f76c0a9-2218-4073-86aa-f4f655d7642f",
75
  "metadata": {},
76
  "outputs": [
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 7,
6
  "id": "ae45e44e-a11c-43ef-b830-c7a58a72f51e",
7
  "metadata": {
8
  "tags": []
 
24
  },
25
  {
26
  "cell_type": "code",
27
+ "execution_count": 8,
28
  "id": "a5c088b8-5051-402f-b4ec-2b684ad5a952",
29
  "metadata": {},
30
  "outputs": [
31
  {
32
  "data": {
33
  "text/plain": [
34
+ "Reusing TensorBoard on port 45739 (pid 1821871), started 2 days, 2:32:50 ago. (Use '!kill 1821871' to kill it.)"
35
  ]
36
  },
37
  "metadata": {},
 
41
  "data": {
42
  "text/html": [
43
  "\n",
44
+ " <iframe id=\"tensorboard-frame-905898ab07792b79\" width=\"100%\" height=\"800\" frameborder=\"0\">\n",
45
  " </iframe>\n",
46
  " <script>\n",
47
  " (function() {\n",
48
+ " const frame = document.getElementById(\"tensorboard-frame-905898ab07792b79\");\n",
49
  " const url = new URL(\"/\", window.location);\n",
50
  " const port = 45739;\n",
51
  " if (port) {\n",
 
70
  },
71
  {
72
  "cell_type": "code",
73
+ "execution_count": 9,
74
  "id": "2f76c0a9-2218-4073-86aa-f4f655d7642f",
75
  "metadata": {},
76
  "outputs": [