03100329
Browse files- diffusion.py +4 -4
- load_h5.py +6 -4
- perlmutter_diffusion.sbatch +9 -9
- quantify_results.ipynb +2 -2
- tensorboard.ipynb +6 -6
diffusion.py
CHANGED
|
@@ -241,8 +241,8 @@ class TrainConfig:
|
|
| 241 |
world_size = 1#torch.cuda.device_count()
|
| 242 |
# repeat = 2
|
| 243 |
|
| 244 |
-
dim = 2
|
| 245 |
-
|
| 246 |
stride = (2,4) if dim == 2 else (2,2,4)
|
| 247 |
num_image = 32#0#0#640#320#6400#3000#480#1200#120#3000#300#3000#6000#30#60#6000#1000#2000#20000#15000#7000#25600#3000#10000#1000#10000#5000#2560#800#2560
|
| 248 |
batch_size = 1#1#10#50#10#50#20#50#1#2#50#20#2#100 # 10
|
|
@@ -275,7 +275,7 @@ class TrainConfig:
|
|
| 275 |
# seed = 0
|
| 276 |
# save_dir = './outputs/'
|
| 277 |
|
| 278 |
-
save_period = np.infty #n_epoch // 2 #np.infty#.1 # the period of sampling
|
| 279 |
# general parameters for the name and logger
|
| 280 |
# device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 281 |
lrate = 1e-4
|
|
@@ -405,7 +405,7 @@ class DDPM21CM:
|
|
| 405 |
dataset = Dataset4h5(
|
| 406 |
self.config.dataset_name,
|
| 407 |
num_image=self.config.num_image,
|
| 408 |
-
idx = "random",#
|
| 409 |
HII_DIM=self.config.HII_DIM,
|
| 410 |
num_redshift=self.config.num_redshift,
|
| 411 |
startat=self.config.startat,
|
|
|
|
| 241 |
world_size = 1#torch.cuda.device_count()
|
| 242 |
# repeat = 2
|
| 243 |
|
| 244 |
+
#dim = 2
|
| 245 |
+
dim = 3#2
|
| 246 |
stride = (2,4) if dim == 2 else (2,2,4)
|
| 247 |
num_image = 32#0#0#640#320#6400#3000#480#1200#120#3000#300#3000#6000#30#60#6000#1000#2000#20000#15000#7000#25600#3000#10000#1000#10000#5000#2560#800#2560
|
| 248 |
batch_size = 1#1#10#50#10#50#20#50#1#2#50#20#2#100 # 10
|
|
|
|
| 275 |
# seed = 0
|
| 276 |
# save_dir = './outputs/'
|
| 277 |
|
| 278 |
+
save_period = 5 #np.infty #n_epoch // 2 #np.infty#.1 # the period of sampling
|
| 279 |
# general parameters for the name and logger
|
| 280 |
# device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 281 |
lrate = 1e-4
|
|
|
|
| 405 |
dataset = Dataset4h5(
|
| 406 |
self.config.dataset_name,
|
| 407 |
num_image=self.config.num_image,
|
| 408 |
+
idx = 'range',#"random",#
|
| 409 |
HII_DIM=self.config.HII_DIM,
|
| 410 |
num_redshift=self.config.num_redshift,
|
| 411 |
startat=self.config.startat,
|
load_h5.py
CHANGED
|
@@ -100,8 +100,10 @@ class Dataset4h5(Dataset):
|
|
| 100 |
# print(self.idx)
|
| 101 |
elif self.idx == "range":
|
| 102 |
rank = torch.cuda.current_device()
|
|
|
|
|
|
|
| 103 |
self.idx = range(
|
| 104 |
-
|
| 105 |
)
|
| 106 |
print(f"loading {len(self.idx)} images with idx = {self.idx}")
|
| 107 |
else:
|
|
@@ -116,7 +118,7 @@ class Dataset4h5(Dataset):
|
|
| 116 |
concurrent_init_start = time()
|
| 117 |
with concurrent.futures.ProcessPoolExecutor(max_workers=self.num_workers) as executor:
|
| 118 |
concurrent_init_end = time()
|
| 119 |
-
print(f" {socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}, concurrently loading by {self.num_workers}/{len(os.sched_getaffinity(0))} workers, initialized after {concurrent_init_end-concurrent_init_start:.3f}s ".center(self.str_len, '-'))
|
| 120 |
futures = [None] * self.num_workers
|
| 121 |
for i, idx in enumerate(np.array_split(self.idx, self.num_workers)):
|
| 122 |
executor_start = time()
|
|
@@ -131,7 +133,7 @@ class Dataset4h5(Dataset):
|
|
| 131 |
self.params[start_idx:start_idx+batch_size] = params
|
| 132 |
start_idx += batch_size
|
| 133 |
concurrent_end = time()
|
| 134 |
-
print(f" {socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}, {start_idx} images {self.images.shape} & params {self.params.shape} loaded after {concurrent_start-concurrent_init_start:.3f}/{concurrent_end-concurrent_start:.3f}s ".center(self.str_len, '-'))
|
| 135 |
|
| 136 |
transform_start = time()
|
| 137 |
if self.transform:
|
|
@@ -162,7 +164,7 @@ class Dataset4h5(Dataset):
|
|
| 162 |
param_start = time()
|
| 163 |
params = f['params']['values'][idx]
|
| 164 |
param_end = time()
|
| 165 |
-
print(f"{socket.gethostbyname(socket.gethostname())}, cuda:{torch.cuda.current_device()}, CPU-pid {cpu_num}-{pid}: images {images.shape} & params {params.shape} loaded after {executor_start-concurrent_init_end:.3f}/{set_device-executor_start:.3f}/{open_h5py-set_device:.3f}/{images_start-open_h5py:.3f}s + {images_end-images_start:.3f}s & {param_end-param_start:.3f}s")
|
| 166 |
|
| 167 |
return images, params
|
| 168 |
|
|
|
|
| 100 |
# print(self.idx)
|
| 101 |
elif self.idx == "range":
|
| 102 |
rank = torch.cuda.current_device()
|
| 103 |
+
local_world_size = torch.cuda.device_count()
|
| 104 |
+
self.global_rank = rank + local_world_size * int(os.environ["SLURM_NODEID"])
|
| 105 |
self.idx = range(
|
| 106 |
+
self.global_rank*self.num_image, (self.global_rank+1)*self.num_image
|
| 107 |
)
|
| 108 |
print(f"loading {len(self.idx)} images with idx = {self.idx}")
|
| 109 |
else:
|
|
|
|
| 118 |
concurrent_init_start = time()
|
| 119 |
with concurrent.futures.ProcessPoolExecutor(max_workers=self.num_workers) as executor:
|
| 120 |
concurrent_init_end = time()
|
| 121 |
+
print(f" {socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}/{self.global_rank}, concurrently loading by {self.num_workers}/{len(os.sched_getaffinity(0))} workers, initialized after {concurrent_init_end-concurrent_init_start:.3f}s ".center(self.str_len, '-'))
|
| 122 |
futures = [None] * self.num_workers
|
| 123 |
for i, idx in enumerate(np.array_split(self.idx, self.num_workers)):
|
| 124 |
executor_start = time()
|
|
|
|
| 133 |
self.params[start_idx:start_idx+batch_size] = params
|
| 134 |
start_idx += batch_size
|
| 135 |
concurrent_end = time()
|
| 136 |
+
print(f" {socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}/{self.global_rank}, {start_idx} images {self.images.shape} & params {self.params.shape} loaded after {concurrent_start-concurrent_init_start:.3f}/{concurrent_end-concurrent_start:.3f}s ".center(self.str_len, '-'))
|
| 137 |
|
| 138 |
transform_start = time()
|
| 139 |
if self.transform:
|
|
|
|
| 164 |
param_start = time()
|
| 165 |
params = f['params']['values'][idx]
|
| 166 |
param_end = time()
|
| 167 |
+
print(f"{socket.gethostbyname(socket.gethostname())}, cuda:{torch.cuda.current_device()}/{self.global_rank}, CPU-pid {cpu_num}-{pid}: images {images.shape} & params {params.shape} loaded after {executor_start-concurrent_init_end:.3f}/{set_device-executor_start:.3f}/{open_h5py-set_device:.3f}/{images_start-open_h5py:.3f}s + {images_end-images_start:.3f}s & {param_end-param_start:.3f}s")
|
| 168 |
|
| 169 |
return images, params
|
| 170 |
|
perlmutter_diffusion.sbatch
CHANGED
|
@@ -1,11 +1,11 @@
|
|
| 1 |
#!/bin/bash
|
| 2 |
#SBATCH -A m4717
|
| 3 |
#SBATCH -J diffusion
|
| 4 |
-
#SBATCH -C gpu&
|
| 5 |
#SBATCH -q regular #shared
|
| 6 |
-
#SBATCH -
|
| 7 |
#SBATCH --gpus-per-node=4
|
| 8 |
-
#SBATCH -t
|
| 9 |
#SBATCH --ntasks-per-node=1
|
| 10 |
#SBATCH -oReport-%j
|
| 11 |
#SBATCH --mail-type=BEGIN,END,FAIL
|
|
@@ -36,16 +36,16 @@ cat $0
|
|
| 36 |
#nvidia-smi
|
| 37 |
|
| 38 |
srun python diffusion.py \
|
| 39 |
-
--num_image
|
| 40 |
-
--batch_size
|
| 41 |
--n_epoch 50 \
|
| 42 |
-
--
|
| 43 |
-
--
|
| 44 |
-
--max_num_img_per_gpu 100 \
|
| 45 |
--channel_mult 1 1 2 2 4 4 \
|
|
|
|
| 46 |
--autocast 1 \
|
| 47 |
--use_checkpoint 1 \
|
| 48 |
--train "$SCRATCH/LEN128-DIM64-CUB16-Tvir[4, 6]-zeta[10, 250]-0809-123640.h5" \
|
| 49 |
-
#--resume outputs/model-N6400-device_count4-
|
| 50 |
|
| 51 |
date
|
|
|
|
| 1 |
#!/bin/bash
|
| 2 |
#SBATCH -A m4717
|
| 3 |
#SBATCH -J diffusion
|
| 4 |
+
#SBATCH -C gpu&hbm80g
|
| 5 |
#SBATCH -q regular #shared
|
| 6 |
+
#SBATCH -N10
|
| 7 |
#SBATCH --gpus-per-node=4
|
| 8 |
+
#SBATCH -t 30:20:00
|
| 9 |
#SBATCH --ntasks-per-node=1
|
| 10 |
#SBATCH -oReport-%j
|
| 11 |
#SBATCH --mail-type=BEGIN,END,FAIL
|
|
|
|
| 36 |
#nvidia-smi
|
| 37 |
|
| 38 |
srun python diffusion.py \
|
| 39 |
+
--num_image 640 \
|
| 40 |
+
--batch_size 1 \
|
| 41 |
--n_epoch 50 \
|
| 42 |
+
--num_new_img_per_gpu 20 \
|
| 43 |
+
--max_num_img_per_gpu 4 \
|
|
|
|
| 44 |
--channel_mult 1 1 2 2 4 4 \
|
| 45 |
+
--gradient_accumulation_steps 1 \
|
| 46 |
--autocast 1 \
|
| 47 |
--use_checkpoint 1 \
|
| 48 |
--train "$SCRATCH/LEN128-DIM64-CUB16-Tvir[4, 6]-zeta[10, 250]-0809-123640.h5" \
|
| 49 |
+
#--resume ./outputs/model-N6400-device_count4-node1-epoch49-30143433 \
|
| 50 |
|
| 51 |
date
|
quantify_results.ipynb
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3618a302e39b53b9514cfc8bdee9a3b8e40e51565fb6ad99b2783ce2b89764cd
|
| 3 |
+
size 14880549
|
tensorboard.ipynb
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
"cells": [
|
| 3 |
{
|
| 4 |
"cell_type": "code",
|
| 5 |
-
"execution_count":
|
| 6 |
"id": "ae45e44e-a11c-43ef-b830-c7a58a72f51e",
|
| 7 |
"metadata": {
|
| 8 |
"tags": []
|
|
@@ -24,14 +24,14 @@
|
|
| 24 |
},
|
| 25 |
{
|
| 26 |
"cell_type": "code",
|
| 27 |
-
"execution_count":
|
| 28 |
"id": "a5c088b8-5051-402f-b4ec-2b684ad5a952",
|
| 29 |
"metadata": {},
|
| 30 |
"outputs": [
|
| 31 |
{
|
| 32 |
"data": {
|
| 33 |
"text/plain": [
|
| 34 |
-
"Reusing TensorBoard on port 45739 (pid 1821871), started
|
| 35 |
]
|
| 36 |
},
|
| 37 |
"metadata": {},
|
|
@@ -41,11 +41,11 @@
|
|
| 41 |
"data": {
|
| 42 |
"text/html": [
|
| 43 |
"\n",
|
| 44 |
-
" <iframe id=\"tensorboard-frame-
|
| 45 |
" </iframe>\n",
|
| 46 |
" <script>\n",
|
| 47 |
" (function() {\n",
|
| 48 |
-
" const frame = document.getElementById(\"tensorboard-frame-
|
| 49 |
" const url = new URL(\"/\", window.location);\n",
|
| 50 |
" const port = 45739;\n",
|
| 51 |
" if (port) {\n",
|
|
@@ -70,7 +70,7 @@
|
|
| 70 |
},
|
| 71 |
{
|
| 72 |
"cell_type": "code",
|
| 73 |
-
"execution_count":
|
| 74 |
"id": "2f76c0a9-2218-4073-86aa-f4f655d7642f",
|
| 75 |
"metadata": {},
|
| 76 |
"outputs": [
|
|
|
|
| 2 |
"cells": [
|
| 3 |
{
|
| 4 |
"cell_type": "code",
|
| 5 |
+
"execution_count": 7,
|
| 6 |
"id": "ae45e44e-a11c-43ef-b830-c7a58a72f51e",
|
| 7 |
"metadata": {
|
| 8 |
"tags": []
|
|
|
|
| 24 |
},
|
| 25 |
{
|
| 26 |
"cell_type": "code",
|
| 27 |
+
"execution_count": 8,
|
| 28 |
"id": "a5c088b8-5051-402f-b4ec-2b684ad5a952",
|
| 29 |
"metadata": {},
|
| 30 |
"outputs": [
|
| 31 |
{
|
| 32 |
"data": {
|
| 33 |
"text/plain": [
|
| 34 |
+
"Reusing TensorBoard on port 45739 (pid 1821871), started 2 days, 2:32:50 ago. (Use '!kill 1821871' to kill it.)"
|
| 35 |
]
|
| 36 |
},
|
| 37 |
"metadata": {},
|
|
|
|
| 41 |
"data": {
|
| 42 |
"text/html": [
|
| 43 |
"\n",
|
| 44 |
+
" <iframe id=\"tensorboard-frame-905898ab07792b79\" width=\"100%\" height=\"800\" frameborder=\"0\">\n",
|
| 45 |
" </iframe>\n",
|
| 46 |
" <script>\n",
|
| 47 |
" (function() {\n",
|
| 48 |
+
" const frame = document.getElementById(\"tensorboard-frame-905898ab07792b79\");\n",
|
| 49 |
" const url = new URL(\"/\", window.location);\n",
|
| 50 |
" const port = 45739;\n",
|
| 51 |
" if (port) {\n",
|
|
|
|
| 70 |
},
|
| 71 |
{
|
| 72 |
"cell_type": "code",
|
| 73 |
+
"execution_count": 9,
|
| 74 |
"id": "2f76c0a9-2218-4073-86aa-f4f655d7642f",
|
| 75 |
"metadata": {},
|
| 76 |
"outputs": [
|