05153506
Browse files- context_unet.py +33 -12
- diffusion.py +14 -9
- load_h5.py +27 -19
- perlmutter_diffusion.sbatch +5 -5
- tensorboard.ipynb +8 -26
context_unet.py
CHANGED
|
@@ -63,44 +63,52 @@ AvgPool = {
|
|
| 63 |
}
|
| 64 |
|
| 65 |
class Downsample(nn.Module):
|
| 66 |
-
def __init__(self, channels, use_conv, out_channels=None, dim=2, stride=(2,2)):
|
| 67 |
super().__init__()
|
| 68 |
self.channels = channels
|
| 69 |
self.out_channels = out_channels or channels
|
| 70 |
-
|
|
|
|
| 71 |
if use_conv:
|
| 72 |
-
# print("conv")
|
| 73 |
self.op = Conv[dim](channels, self.out_channels, 3, stride=stride, padding=1)
|
| 74 |
else:
|
| 75 |
-
# print("pool")
|
| 76 |
assert channels == self.out_channels
|
| 77 |
self.op = AvgPool[dim](kernel_size=stride, stride=stride)
|
| 78 |
|
| 79 |
def forward(self, x):
|
| 80 |
assert x.shape[1] == self.channels
|
| 81 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
|
| 83 |
class Upsample(nn.Module):
|
| 84 |
-
def __init__(self, channels, use_conv, out_channels=None, dim=2, stride=(2,2)):
|
| 85 |
super().__init__()
|
| 86 |
self.channels = channels
|
| 87 |
self.out_channels = out_channels
|
| 88 |
self.use_conv = use_conv
|
| 89 |
self.stride = stride
|
|
|
|
|
|
|
| 90 |
if self.use_conv:
|
| 91 |
self.conv = Conv[dim](self.channels, self.out_channels, 3, padding=1)
|
| 92 |
|
| 93 |
def forward(self, x):
|
| 94 |
assert x.shape[1] == self.channels
|
| 95 |
-
# stride = config.stride
|
| 96 |
-
# print(torch.tensor(x.shape[2:]))
|
| 97 |
-
# print(torch.tensor(stride))
|
| 98 |
shape = torch.tensor(x.shape[2:]) * torch.tensor(self.stride)
|
| 99 |
shape = tuple(shape.detach().numpy())
|
| 100 |
# print(shape)
|
| 101 |
x = F.interpolate(x, shape, mode='nearest')
|
|
|
|
| 102 |
if self.use_conv:
|
| 103 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
return x
|
| 105 |
|
| 106 |
def zero_module(module):
|
|
@@ -335,6 +343,7 @@ class ContextUnet(nn.Module):
|
|
| 335 |
#dtype = torch.float32,
|
| 336 |
):
|
| 337 |
super().__init__()
|
|
|
|
| 338 |
|
| 339 |
if channel_mult == None:
|
| 340 |
if image_size == 512:
|
|
@@ -433,7 +442,13 @@ class ContextUnet(nn.Module):
|
|
| 433 |
stride = stride,
|
| 434 |
)
|
| 435 |
if resblock_updown
|
| 436 |
-
else Downsample(ch,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 437 |
)
|
| 438 |
)
|
| 439 |
ch = out_ch
|
|
@@ -519,7 +534,13 @@ class ContextUnet(nn.Module):
|
|
| 519 |
stride = stride,
|
| 520 |
)
|
| 521 |
if resblock_updown
|
| 522 |
-
else Upsample(ch,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 523 |
)
|
| 524 |
ds //= 2
|
| 525 |
self.output_blocks.append(TimestepEmbedSequential(*layers))
|
|
|
|
| 63 |
}
|
| 64 |
|
| 65 |
class Downsample(nn.Module):
|
| 66 |
+
def __init__(self, channels, use_conv, out_channels=None, dim=2, stride=(2,2), use_checkpoint=False):
|
| 67 |
super().__init__()
|
| 68 |
self.channels = channels
|
| 69 |
self.out_channels = out_channels or channels
|
| 70 |
+
self.use_checkpoint = use_checkpoint
|
| 71 |
+
self.dim = dim
|
| 72 |
if use_conv:
|
|
|
|
| 73 |
self.op = Conv[dim](channels, self.out_channels, 3, stride=stride, padding=1)
|
| 74 |
else:
|
|
|
|
| 75 |
assert channels == self.out_channels
|
| 76 |
self.op = AvgPool[dim](kernel_size=stride, stride=stride)
|
| 77 |
|
| 78 |
def forward(self, x):
|
| 79 |
assert x.shape[1] == self.channels
|
| 80 |
+
if self.use_checkpoint and isinstance(self.op, Conv[self.dim]):
|
| 81 |
+
print(f"checkpoint working in Downsample")
|
| 82 |
+
return checkpoint.checkpoint(self.op, x)
|
| 83 |
+
else:
|
| 84 |
+
return self.op(x)
|
| 85 |
|
| 86 |
class Upsample(nn.Module):
|
| 87 |
+
def __init__(self, channels, use_conv, out_channels=None, dim=2, stride=(2,2), use_checkpoint=False):
|
| 88 |
super().__init__()
|
| 89 |
self.channels = channels
|
| 90 |
self.out_channels = out_channels
|
| 91 |
self.use_conv = use_conv
|
| 92 |
self.stride = stride
|
| 93 |
+
self.use_checkpoint = use_checkpoint
|
| 94 |
+
|
| 95 |
if self.use_conv:
|
| 96 |
self.conv = Conv[dim](self.channels, self.out_channels, 3, padding=1)
|
| 97 |
|
| 98 |
def forward(self, x):
|
| 99 |
assert x.shape[1] == self.channels
|
|
|
|
|
|
|
|
|
|
| 100 |
shape = torch.tensor(x.shape[2:]) * torch.tensor(self.stride)
|
| 101 |
shape = tuple(shape.detach().numpy())
|
| 102 |
# print(shape)
|
| 103 |
x = F.interpolate(x, shape, mode='nearest')
|
| 104 |
+
|
| 105 |
if self.use_conv:
|
| 106 |
+
if self.use_checkpoint:
|
| 107 |
+
print(f"checkpoint working in upsample")
|
| 108 |
+
return checkpoint.checkpoint(self.conv, x)
|
| 109 |
+
else:
|
| 110 |
+
x = self.conv(x)
|
| 111 |
+
|
| 112 |
return x
|
| 113 |
|
| 114 |
def zero_module(module):
|
|
|
|
| 343 |
#dtype = torch.float32,
|
| 344 |
):
|
| 345 |
super().__init__()
|
| 346 |
+
#self.use_checkpoint = use_checkpoint
|
| 347 |
|
| 348 |
if channel_mult == None:
|
| 349 |
if image_size == 512:
|
|
|
|
| 442 |
stride = stride,
|
| 443 |
)
|
| 444 |
if resblock_updown
|
| 445 |
+
else Downsample(ch,
|
| 446 |
+
conv_resample,
|
| 447 |
+
out_channels=out_ch,
|
| 448 |
+
dim=dim,
|
| 449 |
+
stride=stride,
|
| 450 |
+
#use_checkpoint=use_checkpoint,
|
| 451 |
+
)
|
| 452 |
)
|
| 453 |
)
|
| 454 |
ch = out_ch
|
|
|
|
| 534 |
stride = stride,
|
| 535 |
)
|
| 536 |
if resblock_updown
|
| 537 |
+
else Upsample(ch,
|
| 538 |
+
conv_resample,
|
| 539 |
+
out_channels=out_ch,
|
| 540 |
+
dim=dim,
|
| 541 |
+
stride=stride,
|
| 542 |
+
#use_checkpoint=use_checkpoint,
|
| 543 |
+
)
|
| 544 |
)
|
| 545 |
ds //= 2
|
| 546 |
self.output_blocks.append(TimestepEmbedSequential(*layers))
|
diffusion.py
CHANGED
|
@@ -23,7 +23,7 @@ import copy
|
|
| 23 |
from tqdm.auto import tqdm
|
| 24 |
# from diffusers import UNet2DModel#, UNet3DConditionModel
|
| 25 |
# from diffusers import DDPMScheduler
|
| 26 |
-
import datetime
|
| 27 |
from pathlib import Path
|
| 28 |
#from diffusers.optimization import get_cosine_schedule_with_warmup
|
| 29 |
#from accelerate import notebook_launcher, Accelerator
|
|
@@ -241,8 +241,8 @@ class TrainConfig:
|
|
| 241 |
world_size = 1#torch.cuda.device_count()
|
| 242 |
# repeat = 2
|
| 243 |
|
| 244 |
-
|
| 245 |
-
dim = 3#2
|
| 246 |
stride = (2,4) if dim == 2 else (2,2,4)
|
| 247 |
num_image = 32#0#0#640#320#6400#3000#480#1200#120#3000#300#3000#6000#30#60#6000#1000#2000#20000#15000#7000#25600#3000#10000#1000#10000#5000#2560#800#2560
|
| 248 |
batch_size = 1#1#10#50#10#50#20#50#1#2#50#20#2#100 # 10
|
|
@@ -360,7 +360,7 @@ def get_gpu_info(device):
|
|
| 360 |
|
| 361 |
class DDPM21CM:
|
| 362 |
def __init__(self, config):
|
| 363 |
-
config.run_name = datetime.
|
| 364 |
self.config = config
|
| 365 |
self.ddpm = DDPMScheduler(betas=(1e-4, 0.02), num_timesteps=config.num_timesteps, img_shape=config.img_shape, device=config.device, config=config,)#, dtype=config.dtype
|
| 366 |
|
|
@@ -380,7 +380,7 @@ class DDPM21CM:
|
|
| 380 |
#self.nn_model.module.to(config.dtype)
|
| 381 |
print(f"{config.run_name} cuda:{torch.cuda.current_device()}/{self.config.global_rank} resumed nn_model from {config.resume} with {sum(x.numel() for x in self.nn_model.parameters())} parameters".center(self.config.str_len,'+'))
|
| 382 |
else:
|
| 383 |
-
print(f"{config.run_name} cuda:{torch.cuda.current_device()}/{self.config.global_rank} initialized nn_model randomly with {sum(x.numel() for x in self.nn_model.parameters())} parameters".center(self.config.str_len,'+'))
|
| 384 |
|
| 385 |
# whether to use ema
|
| 386 |
if config.ema:
|
|
@@ -412,9 +412,10 @@ class DDPM21CM:
|
|
| 412 |
drop_prob=self.config.drop_prob,
|
| 413 |
dim=self.config.dim,
|
| 414 |
ranges_dict=self.ranges_dict,
|
| 415 |
-
num_workers=min(
|
| 416 |
str_len = self.config.str_len,
|
| 417 |
)
|
|
|
|
| 418 |
|
| 419 |
dataloader_start = time()
|
| 420 |
self.dataloader = DataLoader(
|
|
@@ -488,7 +489,7 @@ class DDPM21CM:
|
|
| 488 |
global_step = 0
|
| 489 |
for ep in range(self.config.n_epoch):
|
| 490 |
self.ddpm.train()
|
| 491 |
-
pbar_train = tqdm(total=len(self.dataloader), file=sys.stderr
|
| 492 |
pbar_train.set_description(f"{socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}/{self.config.global_rank} Epoch {ep}")
|
| 493 |
epoch_start = time()
|
| 494 |
for i, (x, c) in enumerate(self.dataloader):
|
|
@@ -507,6 +508,10 @@ class DDPM21CM:
|
|
| 507 |
loss = F.mse_loss(noise, noise_pred)
|
| 508 |
loss = loss / self.config.gradient_accumulation_steps
|
| 509 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 510 |
# scaler backward propogation
|
| 511 |
self.scaler.scale(loss).backward()
|
| 512 |
#loss.backward()
|
|
@@ -624,7 +629,7 @@ class DDPM21CM:
|
|
| 624 |
#print(f"x_last.dtype = {x_last.dtype}")
|
| 625 |
if save:
|
| 626 |
# np.save(os.path.join(self.config.output_dir, f"{self.config.run_name}{'ema' if ema else ''}.npy"), x_last)
|
| 627 |
-
savetime = datetime.
|
| 628 |
savename = os.path.join(self.config.output_dir, f"Tvir{params_backup[0]:.3f}-zeta{params_backup[1]:.3f}-N{self.config.num_image}-device{self.config.global_rank}-{os.path.basename(self.config.resume)}-{savetime}{'ema' if ema else ''}.npy")
|
| 629 |
if not os.path.exists(self.config.output_dir):
|
| 630 |
os.makedirs(self.config.output_dir)
|
|
@@ -721,7 +726,7 @@ if __name__ == "__main__":
|
|
| 721 |
############################ training ################################
|
| 722 |
if args.train:
|
| 723 |
config.dataset_name = args.train
|
| 724 |
-
print(f" training,
|
| 725 |
mp.spawn(
|
| 726 |
train,
|
| 727 |
args=(world_size, local_world_size, master_addr, master_port, config),
|
|
|
|
| 23 |
from tqdm.auto import tqdm
|
| 24 |
# from diffusers import UNet2DModel#, UNet3DConditionModel
|
| 25 |
# from diffusers import DDPMScheduler
|
| 26 |
+
from datetime import datetime
|
| 27 |
from pathlib import Path
|
| 28 |
#from diffusers.optimization import get_cosine_schedule_with_warmup
|
| 29 |
#from accelerate import notebook_launcher, Accelerator
|
|
|
|
| 241 |
world_size = 1#torch.cuda.device_count()
|
| 242 |
# repeat = 2
|
| 243 |
|
| 244 |
+
dim = 2
|
| 245 |
+
#dim = 3#2
|
| 246 |
stride = (2,4) if dim == 2 else (2,2,4)
|
| 247 |
num_image = 32#0#0#640#320#6400#3000#480#1200#120#3000#300#3000#6000#30#60#6000#1000#2000#20000#15000#7000#25600#3000#10000#1000#10000#5000#2560#800#2560
|
| 248 |
batch_size = 1#1#10#50#10#50#20#50#1#2#50#20#2#100 # 10
|
|
|
|
| 360 |
|
| 361 |
class DDPM21CM:
|
| 362 |
def __init__(self, config):
|
| 363 |
+
config.run_name = datetime.now().strftime("%d%H%M%S") # the unique name of each experiment
|
| 364 |
self.config = config
|
| 365 |
self.ddpm = DDPMScheduler(betas=(1e-4, 0.02), num_timesteps=config.num_timesteps, img_shape=config.img_shape, device=config.device, config=config,)#, dtype=config.dtype
|
| 366 |
|
|
|
|
| 380 |
#self.nn_model.module.to(config.dtype)
|
| 381 |
print(f"{config.run_name} cuda:{torch.cuda.current_device()}/{self.config.global_rank} resumed nn_model from {config.resume} with {sum(x.numel() for x in self.nn_model.parameters())} parameters".center(self.config.str_len,'+'))
|
| 382 |
else:
|
| 383 |
+
print(f"{config.run_name} cuda:{torch.cuda.current_device()}/{self.config.global_rank} initialized nn_model randomly with {sum(x.numel() for x in self.nn_model.parameters())} parameters, {datetime.now().strftime('%d-%H:%M:%S.%f')}".center(self.config.str_len,'+'))
|
| 384 |
|
| 385 |
# whether to use ema
|
| 386 |
if config.ema:
|
|
|
|
| 412 |
drop_prob=self.config.drop_prob,
|
| 413 |
dim=self.config.dim,
|
| 414 |
ranges_dict=self.ranges_dict,
|
| 415 |
+
num_workers=min(1,len(os.sched_getaffinity(0))//self.config.world_size),
|
| 416 |
str_len = self.config.str_len,
|
| 417 |
)
|
| 418 |
+
#print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank}: Dataset4h5 done")
|
| 419 |
|
| 420 |
dataloader_start = time()
|
| 421 |
self.dataloader = DataLoader(
|
|
|
|
| 489 |
global_step = 0
|
| 490 |
for ep in range(self.config.n_epoch):
|
| 491 |
self.ddpm.train()
|
| 492 |
+
pbar_train = tqdm(total=len(self.dataloader), file=sys.stderr)#, disable=True)#, mininterval=self.config.pbar_update_step)#, disable=True)#not self.accelerator.is_local_main_process)
|
| 493 |
pbar_train.set_description(f"{socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}/{self.config.global_rank} Epoch {ep}")
|
| 494 |
epoch_start = time()
|
| 495 |
for i, (x, c) in enumerate(self.dataloader):
|
|
|
|
| 508 |
loss = F.mse_loss(noise, noise_pred)
|
| 509 |
loss = loss / self.config.gradient_accumulation_steps
|
| 510 |
|
| 511 |
+
#print(f"loss = {loss}")
|
| 512 |
+
if torch.isnan(loss).any():
|
| 513 |
+
raise ValueError(f"{socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}/{self.config.global_rank} Epoch {ep}, loss: {loss}")
|
| 514 |
+
|
| 515 |
# scaler backward propogation
|
| 516 |
self.scaler.scale(loss).backward()
|
| 517 |
#loss.backward()
|
|
|
|
| 629 |
#print(f"x_last.dtype = {x_last.dtype}")
|
| 630 |
if save:
|
| 631 |
# np.save(os.path.join(self.config.output_dir, f"{self.config.run_name}{'ema' if ema else ''}.npy"), x_last)
|
| 632 |
+
savetime = datetime.now().strftime("%d%H%M%S")
|
| 633 |
savename = os.path.join(self.config.output_dir, f"Tvir{params_backup[0]:.3f}-zeta{params_backup[1]:.3f}-N{self.config.num_image}-device{self.config.global_rank}-{os.path.basename(self.config.resume)}-{savetime}{'ema' if ema else ''}.npy")
|
| 634 |
if not os.path.exists(self.config.output_dir):
|
| 635 |
os.makedirs(self.config.output_dir)
|
|
|
|
| 726 |
############################ training ################################
|
| 727 |
if args.train:
|
| 728 |
config.dataset_name = args.train
|
| 729 |
+
print(f" training, ip = {socket.gethostbyname(socket.gethostname())}, local_world_size = {local_world_size}, world_size = {world_size}, {datetime.now().strftime('%d-%H:%M:%S.%f')} ".center(config.str_len,'#'))
|
| 730 |
mp.spawn(
|
| 731 |
train,
|
| 732 |
args=(world_size, local_world_size, master_addr, master_port, config),
|
load_h5.py
CHANGED
|
@@ -20,7 +20,7 @@ import os
|
|
| 20 |
# from diffusers import DDPMScheduler
|
| 21 |
# from diffusers.utils import make_image_grid
|
| 22 |
from time import time
|
| 23 |
-
import datetime
|
| 24 |
import concurrent.futures
|
| 25 |
import psutil
|
| 26 |
# from pathlib import Path
|
|
@@ -115,25 +115,33 @@ class Dataset4h5(Dataset):
|
|
| 115 |
elif self.dim == 3:
|
| 116 |
self.images = np.empty((self.num_image, 1, self.HII_DIM, self.HII_DIM, self.num_redshift), dtype=np.float32)
|
| 117 |
# self.num_workers = len(os.sched_getaffinity(0))//torch.cuda.device_count()
|
| 118 |
-
concurrent_init_start = time()
|
| 119 |
-
with concurrent.futures.ProcessPoolExecutor(max_workers=self.num_workers) as executor:
|
| 120 |
-
concurrent_init_end = time()
|
| 121 |
-
print(f" {socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}/{self.global_rank}, concurrently loading by {self.num_workers}/{len(os.sched_getaffinity(0))} workers, initialized after {concurrent_init_end-concurrent_init_start:.3f}s ".center(self.str_len, '-'))
|
| 122 |
-
futures = [None] * self.num_workers
|
| 123 |
-
for i, idx in enumerate(np.array_split(self.idx, self.num_workers)):
|
| 124 |
-
executor_start = time()
|
| 125 |
-
futures[i] = executor.submit(self.read_data_chunk, self.dir_name, idx, torch.cuda.current_device(), concurrent_init_end, executor_start)
|
| 126 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
concurrent_start = time()
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
self.
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
|
| 138 |
transform_start = time()
|
| 139 |
if self.transform:
|
|
@@ -164,7 +172,7 @@ class Dataset4h5(Dataset):
|
|
| 164 |
param_start = time()
|
| 165 |
params = f['params']['values'][idx]
|
| 166 |
param_end = time()
|
| 167 |
-
print(f"
|
| 168 |
|
| 169 |
return images, params
|
| 170 |
|
|
|
|
| 20 |
# from diffusers import DDPMScheduler
|
| 21 |
# from diffusers.utils import make_image_grid
|
| 22 |
from time import time
|
| 23 |
+
from datetime import datetime
|
| 24 |
import concurrent.futures
|
| 25 |
import psutil
|
| 26 |
# from pathlib import Path
|
|
|
|
| 115 |
elif self.dim == 3:
|
| 116 |
self.images = np.empty((self.num_image, 1, self.HII_DIM, self.HII_DIM, self.num_redshift), dtype=np.float32)
|
| 117 |
# self.num_workers = len(os.sched_getaffinity(0))//torch.cuda.device_count()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
|
| 119 |
+
concurrent_init_start = time()
|
| 120 |
+
if self.num_workers == 1:
|
| 121 |
+
print(f"{socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}/{self.global_rank}, loading by {self.num_workers} workers, {datetime.now().strftime('%d-%H:%M:%S.%f')}".center(self.str_len, '-'))
|
| 122 |
+
self.images, self.params = self.read_data_chunk(self.dir_name, self.idx, torch.cuda.current_device(), concurrent_init_start, concurrent_init_start)
|
| 123 |
+
self.params = self.params.astype(self.images.dtype)
|
| 124 |
concurrent_start = time()
|
| 125 |
+
print(f"{socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}/{self.global_rank}, images {self.images.shape} & params {self.params.shape} loaded after {concurrent_start-concurrent_init_start:.3f}s, {datetime.now().strftime('%d-%H:%M:%S.%f')}".center(self.str_len, '-'))
|
| 126 |
+
else:
|
| 127 |
+
with concurrent.futures.ProcessPoolExecutor(max_workers=self.num_workers) as executor:
|
| 128 |
+
concurrent_init_end = time()
|
| 129 |
+
print(f" {socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}/{self.global_rank}, concurrently loading by {self.num_workers}/{len(os.sched_getaffinity(0))} workers, initialized after {concurrent_init_end-concurrent_init_start:.3f}s ".center(self.str_len, '-'))
|
| 130 |
+
futures = [None] * self.num_workers
|
| 131 |
+
for i, idx in enumerate(np.array_split(self.idx, self.num_workers)):
|
| 132 |
+
executor_start = time()
|
| 133 |
+
futures[i] = executor.submit(self.read_data_chunk, self.dir_name, idx, torch.cuda.current_device(), concurrent_init_end, executor_start)
|
| 134 |
+
|
| 135 |
+
concurrent_start = time()
|
| 136 |
+
start_idx = 0
|
| 137 |
+
for future in concurrent.futures.as_completed(futures):
|
| 138 |
+
images, params = future.result()
|
| 139 |
+
batch_size = params.shape[0]
|
| 140 |
+
self.images[start_idx:start_idx+batch_size] = images
|
| 141 |
+
self.params[start_idx:start_idx+batch_size] = params
|
| 142 |
+
start_idx += batch_size
|
| 143 |
+
concurrent_end = time()
|
| 144 |
+
print(f" {socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}/{self.global_rank}, {start_idx} images {self.images.shape} & params {self.params.shape} loaded after {concurrent_start-concurrent_init_start:.3f}/{concurrent_end-concurrent_start:.3f}s ".center(self.str_len, '-'))
|
| 145 |
|
| 146 |
transform_start = time()
|
| 147 |
if self.transform:
|
|
|
|
| 172 |
param_start = time()
|
| 173 |
params = f['params']['values'][idx]
|
| 174 |
param_end = time()
|
| 175 |
+
print(f"cuda:{torch.cuda.current_device()}/{self.global_rank}, CPU:{cpu_num}, images {images.shape} & params {params.shape} loaded after {executor_start-concurrent_init_end:.3f}/{set_device-executor_start:.3f}/{open_h5py-set_device:.3f}/{images_start-open_h5py:.3f}s + {images_end-images_start:.3f}s & {param_end-param_start:.3f}s")
|
| 176 |
|
| 177 |
return images, params
|
| 178 |
|
perlmutter_diffusion.sbatch
CHANGED
|
@@ -3,9 +3,9 @@
|
|
| 3 |
#SBATCH -J diffusion
|
| 4 |
#SBATCH -C gpu&hbm80g
|
| 5 |
#SBATCH -q regular #shared
|
| 6 |
-
#SBATCH -
|
| 7 |
#SBATCH --gpus-per-node=4
|
| 8 |
-
#SBATCH -t 30:
|
| 9 |
#SBATCH --ntasks-per-node=1
|
| 10 |
#SBATCH -oReport-%j
|
| 11 |
#SBATCH --mail-type=BEGIN,END,FAIL
|
|
@@ -36,12 +36,12 @@ cat $0
|
|
| 36 |
#nvidia-smi
|
| 37 |
|
| 38 |
srun python diffusion.py \
|
| 39 |
-
--num_image
|
| 40 |
-
--batch_size
|
| 41 |
--n_epoch 50 \
|
| 42 |
--num_new_img_per_gpu 20 \
|
| 43 |
--max_num_img_per_gpu 4 \
|
| 44 |
-
--channel_mult
|
| 45 |
--gradient_accumulation_steps 1 \
|
| 46 |
--autocast 1 \
|
| 47 |
--use_checkpoint 1 \
|
|
|
|
| 3 |
#SBATCH -J diffusion
|
| 4 |
#SBATCH -C gpu&hbm80g
|
| 5 |
#SBATCH -q regular #shared
|
| 6 |
+
#SBATCH -N1
|
| 7 |
#SBATCH --gpus-per-node=4
|
| 8 |
+
#SBATCH -t 02:30:00
|
| 9 |
#SBATCH --ntasks-per-node=1
|
| 10 |
#SBATCH -oReport-%j
|
| 11 |
#SBATCH --mail-type=BEGIN,END,FAIL
|
|
|
|
| 36 |
#nvidia-smi
|
| 37 |
|
| 38 |
srun python diffusion.py \
|
| 39 |
+
--num_image 6400 \
|
| 40 |
+
--batch_size 128 \
|
| 41 |
--n_epoch 50 \
|
| 42 |
--num_new_img_per_gpu 20 \
|
| 43 |
--max_num_img_per_gpu 4 \
|
| 44 |
+
--channel_mult 0.5 1 2 2 4 8 \
|
| 45 |
--gradient_accumulation_steps 1 \
|
| 46 |
--autocast 1 \
|
| 47 |
--use_checkpoint 1 \
|
tensorboard.ipynb
CHANGED
|
@@ -2,21 +2,12 @@
|
|
| 2 |
"cells": [
|
| 3 |
{
|
| 4 |
"cell_type": "code",
|
| 5 |
-
"execution_count":
|
| 6 |
"id": "ae45e44e-a11c-43ef-b830-c7a58a72f51e",
|
| 7 |
"metadata": {
|
| 8 |
"tags": []
|
| 9 |
},
|
| 10 |
-
"outputs": [
|
| 11 |
-
{
|
| 12 |
-
"name": "stdout",
|
| 13 |
-
"output_type": "stream",
|
| 14 |
-
"text": [
|
| 15 |
-
"The tensorboard extension is already loaded. To reload it, use:\n",
|
| 16 |
-
" %reload_ext tensorboard\n"
|
| 17 |
-
]
|
| 18 |
-
}
|
| 19 |
-
],
|
| 20 |
"source": [
|
| 21 |
"import nersc_tensorboard_helper\n",
|
| 22 |
"%load_ext tensorboard"
|
|
@@ -24,30 +15,21 @@
|
|
| 24 |
},
|
| 25 |
{
|
| 26 |
"cell_type": "code",
|
| 27 |
-
"execution_count":
|
| 28 |
"id": "a5c088b8-5051-402f-b4ec-2b684ad5a952",
|
| 29 |
"metadata": {},
|
| 30 |
"outputs": [
|
| 31 |
-
{
|
| 32 |
-
"data": {
|
| 33 |
-
"text/plain": [
|
| 34 |
-
"Reusing TensorBoard on port 45739 (pid 1821871), started 2 days, 2:32:50 ago. (Use '!kill 1821871' to kill it.)"
|
| 35 |
-
]
|
| 36 |
-
},
|
| 37 |
-
"metadata": {},
|
| 38 |
-
"output_type": "display_data"
|
| 39 |
-
},
|
| 40 |
{
|
| 41 |
"data": {
|
| 42 |
"text/html": [
|
| 43 |
"\n",
|
| 44 |
-
" <iframe id=\"tensorboard-frame-
|
| 45 |
" </iframe>\n",
|
| 46 |
" <script>\n",
|
| 47 |
" (function() {\n",
|
| 48 |
-
" const frame = document.getElementById(\"tensorboard-frame-
|
| 49 |
" const url = new URL(\"/\", window.location);\n",
|
| 50 |
-
" const port =
|
| 51 |
" if (port) {\n",
|
| 52 |
" url.port = port;\n",
|
| 53 |
" }\n",
|
|
@@ -70,14 +52,14 @@
|
|
| 70 |
},
|
| 71 |
{
|
| 72 |
"cell_type": "code",
|
| 73 |
-
"execution_count":
|
| 74 |
"id": "2f76c0a9-2218-4073-86aa-f4f655d7642f",
|
| 75 |
"metadata": {},
|
| 76 |
"outputs": [
|
| 77 |
{
|
| 78 |
"data": {
|
| 79 |
"text/html": [
|
| 80 |
-
"<a href=\"https://jupyter.nersc.gov/user/binxia/perlmutter-login-node-base/proxy/
|
| 81 |
],
|
| 82 |
"text/plain": [
|
| 83 |
"<IPython.core.display.HTML object>"
|
|
|
|
| 2 |
"cells": [
|
| 3 |
{
|
| 4 |
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
"id": "ae45e44e-a11c-43ef-b830-c7a58a72f51e",
|
| 7 |
"metadata": {
|
| 8 |
"tags": []
|
| 9 |
},
|
| 10 |
+
"outputs": [],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
"source": [
|
| 12 |
"import nersc_tensorboard_helper\n",
|
| 13 |
"%load_ext tensorboard"
|
|
|
|
| 15 |
},
|
| 16 |
{
|
| 17 |
"cell_type": "code",
|
| 18 |
+
"execution_count": 2,
|
| 19 |
"id": "a5c088b8-5051-402f-b4ec-2b684ad5a952",
|
| 20 |
"metadata": {},
|
| 21 |
"outputs": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
{
|
| 23 |
"data": {
|
| 24 |
"text/html": [
|
| 25 |
"\n",
|
| 26 |
+
" <iframe id=\"tensorboard-frame-497d865784f48ad7\" width=\"100%\" height=\"800\" frameborder=\"0\">\n",
|
| 27 |
" </iframe>\n",
|
| 28 |
" <script>\n",
|
| 29 |
" (function() {\n",
|
| 30 |
+
" const frame = document.getElementById(\"tensorboard-frame-497d865784f48ad7\");\n",
|
| 31 |
" const url = new URL(\"/\", window.location);\n",
|
| 32 |
+
" const port = 38971;\n",
|
| 33 |
" if (port) {\n",
|
| 34 |
" url.port = port;\n",
|
| 35 |
" }\n",
|
|
|
|
| 52 |
},
|
| 53 |
{
|
| 54 |
"cell_type": "code",
|
| 55 |
+
"execution_count": 3,
|
| 56 |
"id": "2f76c0a9-2218-4073-86aa-f4f655d7642f",
|
| 57 |
"metadata": {},
|
| 58 |
"outputs": [
|
| 59 |
{
|
| 60 |
"data": {
|
| 61 |
"text/html": [
|
| 62 |
+
"<a href=\"https://jupyter.nersc.gov/user/binxia/perlmutter-login-node-base/proxy/38971/\">https://jupyter.nersc.gov/user/binxia/perlmutter-login-node-base/proxy/38971/</a>"
|
| 63 |
],
|
| 64 |
"text/plain": [
|
| 65 |
"<IPython.core.display.HTML object>"
|