Xsmos committed
Commit 14c70a0 · verified · 1 Parent(s): e564941
context_unet.py CHANGED
@@ -20,6 +20,7 @@ import copy
 # from diffusers import DDPMScheduler
 # from diffusers.utils import make_image_grid
 import datetime
+import torch.utils.checkpoint as checkpoint
 # from pathlib import Path
 # from diffusers.optimization import get_cosine_schedule_with_warmup
 # from accelerate import notebook_launcher, Accelerator
@@ -132,10 +133,12 @@ class ResBlock(TimestepBlock):
     def __init__(
         self, channels, emb_channels, dropout, out_channels=None, use_conv=False, use_checkpoint=False, use_scale_shift_norm=False, up=False, down=False, dim=2, stride=(2,2),
     ):
+        #print(f"Resblock, use_checkpoint = {use_checkpoint}")
         super().__init__()
         self.out_channels = out_channels or channels
         self.use_scale_shift_norm = use_scale_shift_norm
         self.stride = stride
+        self.use_checkpoint = use_checkpoint
 
         self.in_layers = nn.Sequential(
             # nn.BatchNorm2d(channels), # normalize to standard gaussian
@@ -177,8 +180,13 @@ class ResBlock(TimestepBlock):
         else:
             self.skip_connection = Conv[dim](channels, self.out_channels, 1)
 
-
     def forward(self, x, emb):
+        if self.use_checkpoint:
+            return checkpoint.checkpoint(self._forward_impl, x, emb, use_reentrant=False)
+        else:
+            return self._forward_impl(x, emb)
+
+    def _forward_impl(self, x, emb):
         if self.updown:
             in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
             h = in_rest(x)
@@ -239,6 +247,7 @@ class AttentionBlock(nn.Module):
         use_checkpoint=False,
         encoder_channels=None,
     ):
+        #print(f"AttentionBlock, use_checkpoint = {use_checkpoint}")
         super().__init__()
         self.channels = channels
         if num_head_channels == -1:
@@ -260,6 +269,12 @@ class AttentionBlock(nn.Module):
         self.proj_out = zero_module(nn.Conv1d(channels, channels, 1))
 
     def forward(self, x, encoder_out=None):
+        if self.use_checkpoint:
+            return checkpoint.checkpoint(self._forward_impl, x, encoder_out, use_reentrant=False)
+        else:
+            return self._forward_impl(x, encoder_out)
+
+    def _forward_impl(self, x, encoder_out=None):
         b, c, *spatial = x.shape
         qkv = self.qkv(self.norm(x).view(b, c, -1))
         if encoder_out is not None:
@@ -533,11 +548,12 @@ class ContextUnet(nn.Module):
         #print("0,h.shape =", h.shape)
         for module in self.input_blocks:
             h = module(h, emb)
+            #print(f"in for loop, h.shape = {h.shape}")
             hs.append(h)
             #print("module encoder, h.shape =", h.shape)
-        # print("2,h.shape =", h.shape)
+        #print("before middle block, h.shape =", h.shape)
         h = self.middle_block(h, emb)
-        #print("middle block, h.shape =", h.shape)
+        #print("after middle block, h.shape =", h.shape)
         #print("2, h.dtype =", h.dtype)
         for module in self.output_blocks:
             #print("for module in self.output_blocks, h.shape =", h.shape)
 
diffusion.py CHANGED
@@ -1,31 +1,3 @@
-# %% [markdown]
-# ## Adapt ContextUnet and the related code so that it first works for the 2D case; compare it against diffusers.Unet2DModel and optimize; finally rewrite it for the 3D case.
-# - Trying diffusers' Unet2DModel, the loss dropped from 0.3 to 0.2 but was still high, which means there are problems outside Unet2DModel that can be optimized
-# - After switching to diffusers' DDPMScheduler and DDPMPipeline, the loss fell below 0.1, sometimes as low as 0.004, so the problem in my code is mainly in the DDPM part. The DDPMScheduler part is short and seems fine, so the issue should be some piece of the DDPMPipeline code that mine is missing.
-# - I had a typo in my DDPMScheduler that kept beta_t very small; after fixing it the loss drops from 0.2 to 0.02 and stays below 0.1
-# - diffusers' DDPMScheduler seems to work somewhat better; its loss is always a bit lower than my DDPMScheduler's. At epoch 19 the former's loss is about 0.02, the latter's about 0.07. The former also supports noising 3D images, so I might as well reuse the existing wheel, but I want to know why my loss is higher.
-# - I realized their DDPMScheduler's sample function does not accept conditioning inputs, so in the end I still need my own DDPMScheduler. I can use theirs first to debug my ContextUnet.
-# - I need to extend my ContextUnet to handle images of different dimensionality; after all I need to finish comparing with the original literature before extending to 3D
-# - I converted my ContextUnet to 2D mode: diffusers.Unet2DModel reaches loss=0.037 while my Unet reaches loss=0.07, and the images my Unet generates look strange, so my Unet has problems too. I need to roll the code back to the original Unet and track down the issue.
-# - I limited the number of pixels along the redshift direction to 64 to compare the two Unets. Comparison:\
-#   Unet2DModel loss: 0.03, 0.0655, 0.05, 0.02, 0.05\
-#   ContextUnet loss: 0.1, 0.16, 0.1, 0.2186, 0.06
-# - I rolled ContextUnet back to the original author's version: loss=0.05 and the output images look good. My main change was restoring his original normalization function, which also has a swish parameter; when I have time I can investigate exactly what affected training. I also found that for clean, separate tensorboard curves, runs need to go in different folders
-# - Verified: GroupNorm works better than BatchNorm
-# - Extended to accept inputs of different dimensionality
-# - Merged the cond, guide_w, drop_out parameters
-# - The generated 21cm images are not dark enough where they should be dark; with MNIST digit images the problem seems to go away
-# - Generating MNIST digits with the diffusion model, I found that although the generated data also contains negative values such as -0.1, the plotted images are the expected black. The data distribution is not very different from the 21cm results, so I now plan to roll the code back to the 21cm case
-# - I unified the ddpm21cm module so it handles both training and sampling, but there is a bug: sampling always runs CUDA out of memory, while resuming the model separately and sampling does not.
-# - Solved; the problem was that I forgot to write with torch.no_grad():
-# - Next: generate 800 lightcones, and meanwhile study how to compute the global signal and the power spectrum
-# - Once the number of training images reaches 5000, the generated images are very similar to the test data
-# - it takes 62 mins to generate 8 images of shape (64,64,64), which is even slower than the simulation (~5 mins per image). Besides, the batch_size during training and the number of images to be generated are limited to 2 and 8, respectively.
-# - the slowness can be solved with multiple GPUs, and the limit on the number of images with mixed precision and multiple GPUs.
-# - In addition, DDPM's output can look better compared to the computation-intensive simulations.
-#   1 GPU, batch_size = 10, num_image = 3200, 50s for each epoch
-#   4 GPU, batch_size = 10, num_image = 3200,
-
 # %%
 import logging
 #logging.getLogger("torch").setLevel(logging.ERROR)
@@ -66,7 +38,7 @@ from context_unet import ContextUnet
 from huggingface_hub import notebook_login
 
 import torch.multiprocessing as mp
-from torch.utils.data.distributed import DistributedSampler
+#from torch.utils.data.distributed import DistributedSampler
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.distributed import init_process_group, destroy_process_group
 import torch.distributed as dist
@@ -271,13 +243,14 @@ class TrainConfig:
 
     dim = 2
     #dim = 3#2
-    stride = (2,2) if dim == 2 else (2,2,2)
+    stride = (2,4) if dim == 2 else (2,2,4)
     num_image = 32#0#0#640#320#6400#3000#480#1200#120#3000#300#3000#6000#30#60#6000#1000#2000#20000#15000#7000#25600#3000#10000#1000#10000#5000#2560#800#2560
     batch_size = 1#1#10#50#10#50#20#50#1#2#50#20#2#100 # 10
     n_epoch = 100#30#50#20#1#50#10#1#50#1#50#5#50#5#50#100#50#100#30#120#5#4# 10#50#20#20#2#5#25 # 120
     HII_DIM = 64
-    num_redshift = 64#256#512#256#512#256#512#256#512#64#512#64#512#64#256CUDAoom#128#64#512#128#64#512#256#256#64#512#128
-    startat = 512-num_redshift
+    num_redshift = 1024#512#256#1024#64#256#512#256#512#256#512#256#512#64#512#64#512#64#256CUDAoom#128#64#512#128#64#512#256#256#64#512#128
+    startat = 0#512-num_redshift
+
     channel = 1
     img_shape = (channel, HII_DIM, num_redshift) if dim == 2 else (channel, HII_DIM, HII_DIM, num_redshift)
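For orientation on the stride and num_redshift changes: lightcones are now 64 pixels across and 1024 along redshift, and the anisotropic stride (2,4) shrinks the long redshift axis four times faster than the transverse axis. Assuming ContextUnet downsamples once per channel_mult level after the first (four times for the five-level default), both axes meet at the same bottleneck size; a quick check:

```python
# Assumes one downsampling per channel_mult level after the first; this is an
# inference about ContextUnet's architecture, not taken from its source.
HII_DIM, num_redshift = 64, 1024
stride = (2, 4)
levels = 5  # len(channel_mult) = len((1,2,2,2,4))
shape = (HII_DIM, num_redshift)
for _ in range(levels - 1):
    shape = (shape[0] // stride[0], shape[1] // stride[1])
print(shape)  # (4, 4): the 64 x 1024 input becomes square at the bottleneck
```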
@@ -324,9 +297,11 @@ class TrainConfig:
     gradient_accumulation_steps = 1
 
     pbar_update_step = 20
+
+    channel_mult = (1,2,2,2,4)
     # date = datetime.datetime.now().strftime("%m%d-%H%M")
     # run_name = f'{date}' # the unique name of each experiment
-
+    str_len = 140
     # config = TrainConfig()
     # print("device =", config.device)
@@ -372,53 +347,40 @@ class TrainConfig:
 # if rank == 0 and all_gradients_consistent:
 #     print("All model gradients are consistent across GPUs.")
 #     return all_gradients_consistent
+def get_gpu_info(device):
+    total_memory = torch.cuda.get_device_properties(device).total_memory
+    reserved_memory = torch.cuda.memory_reserved(device)
+    allocated_memory = torch.cuda.memory_allocated(device)
+    free_memory = reserved_memory - allocated_memory
+    return {
+        'total': int(total_memory / 1024**2),
+        'used': int(allocated_memory / 1024**2),
+        'free': int(free_memory / 1024**2),
+    }
 
 class DDPM21CM:
     def __init__(self, config):
-        # print(
-        #     "torch.cuda.is_available() =", torch.cuda.is_available(),
-        #     "torch.cuda.device_count() =", torch.cuda.device_count(),
-        #     "torch.cuda.is_initialized() =", torch.cuda.is_initialized(),
-        #     "torch.cuda.current_device() =", torch.cuda.current_device()
-        # )
-        # config = TrainConfig()
-        # date = datetime.datetime.now().strftime("%m%d-%H%M")
         config.run_name = datetime.datetime.now().strftime("%d%H%M%S") # the unique name of each experiment
         self.config = config
-        # dataset = Dataset4h5(config.dataset_name, num_image=config.num_image, HII_DIM=config.HII_DIM, num_redshift=config.num_redshift, drop_prob=config.drop_prob, dim=config.dim)
-        # # self.shape_loaded = dataset.images.shape
-        # # print("shape_loaded =", self.shape_loaded)
-        # self.dataloader = DataLoader(dataset, batch_size=config.batch_size, shuffle=True)
-        # del dataset
-        # print("self.ddpm = DDPMScheduler")
         self.ddpm = DDPMScheduler(betas=(1e-4, 0.02), num_timesteps=config.num_timesteps, img_shape=config.img_shape, device=config.device, config=config,)#, dtype=config.dtype
 
-        # print("self.nn_model = ContextUnet")
         # initialize the unet
-        self.nn_model = ContextUnet(n_param=config.n_param, image_size=config.HII_DIM, dim=config.dim, stride=config.stride)#, dtype=config.dtype)
+        self.nn_model = ContextUnet(n_param=config.n_param, image_size=config.HII_DIM, dim=config.dim, stride=config.stride, channel_mult=config.channel_mult, use_checkpoint=config.use_checkpoint)#, dtype=config.dtype)
 
-        # print("self.nn_model.train()")
-        # nn_model = ContextUnet(n_param=1, image_size=28)
         self.nn_model.train()
-        # print("self.ddpm.device =", self.ddpm.device)
         self.nn_model.to(self.ddpm.device)
-        # print("before, nn_model.device =", self.ddpm.device)
         self.nn_model = DDP(self.nn_model, device_ids=[self.ddpm.device])
-        # print("after, nn_model.device =", self.ddpm.device)
-        # number of parameters to be trained
 
+        gpu_info = get_gpu_info(config.device)
         if config.resume and os.path.exists(config.resume):
            # resume_file = os.path.join(config.output_dir, f"{config.resume}")
            # self.nn_model.load_state_dict(torch.load(config.resume)['unet_state_dict'])
            # print(f"resumed nn_model from {config.resume}")
            self.nn_model.module.load_state_dict(torch.load(config.resume)['unet_state_dict'])
            #self.nn_model.module.to(config.dtype)
-            print(f" {config.run_name} {socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}/{self.config.global_rank} resumed nn_model from {config.resume} with {sum(x.numel() for x in self.nn_model.parameters())} parameters ".center(120,'+'))
+            print(f"{config.run_name} cuda:{torch.cuda.current_device()}/{self.config.global_rank} resumed nn_model from {config.resume} with {sum(x.numel() for x in self.nn_model.parameters())} parameters, gpu:{gpu_info} MB".center(self.config.str_len,'+'))
         else:
-            print(f" {config.run_name} {socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}/{self.config.global_rank} initialized nn_model randomly with {sum(x.numel() for x in self.nn_model.parameters())} parameters ".center(120,'+'))
-
-        # self.number_of_params = sum(x.numel() for x in self.nn_model.parameters())
-        # print(f" Number of parameters for nn_model: {self.number_of_params} ".center(120,'-'))
+            print(f"{config.run_name} cuda:{torch.cuda.current_device()}/{self.config.global_rank} initialized nn_model randomly with {sum(x.numel() for x in self.nn_model.parameters())} parameters, gpu:{gpu_info} MB".center(self.config.str_len,'+'))
 
         # whether to use ema
         if config.ema:
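A caveat on the new get_gpu_info helper: its 'free' field is reserved minus allocated, i.e. memory that is free inside PyTorch's caching allocator, not free memory on the device as nvidia-smi reports it. If a device-level view is wanted, torch.cuda.mem_get_info is the standard API; a hedged sketch for cross-checking the two views:

```python
import torch

def gpu_memory_mb(device=0):
    # driver-level view: free/total bytes on the whole device
    free_b, total_b = torch.cuda.mem_get_info(device)
    # allocator-level view: what this process has cached and handed out
    return {
        'driver_free': free_b // 1024**2,
        'driver_total': total_b // 1024**2,
        'reserved': torch.cuda.memory_reserved(device) // 1024**2,
        'allocated': torch.cuda.memory_allocated(device) // 1024**2,
    }
```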
@@ -452,6 +414,7 @@ class DDPM21CM:
             dim=self.config.dim,
             ranges_dict=self.ranges_dict,
             num_workers=min(8,len(os.sched_getaffinity(0))//self.config.world_size),
+            str_len = self.config.str_len,
         )
         # self.shape_loaded = dataset.images.shape
         # print("shape_loaded =", self.shape_loaded)
@@ -520,29 +483,11 @@ class DDPM21CM:
         else:
             print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} torch.distributed.is_initialized False!!!!!!!!!!!!!!!")
 
-        #print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank}; nn_model.device = {self.nn_model.device}")
-        #acc_prep_start = time()
-        #self.nn_model, self.optimizer, self.dataloader, self.lr_scheduler = \
-        #    self.accelerator.prepare(
-        #        self.nn_model, self.optimizer, self.dataloader, self.lr_scheduler
-        #    )
-        #self.nn_model = self.accelerator.prepare(self.nn_model)
-        #self.optimizer = self.accelerator.prepare(self.optimizer)
-        #self.dataloader = self.accelerator.prepare(self.dataloader)
-        #self.lr_scheduler = self.accelerator.prepare(self.lr_scheduler)
-        #acc_prep_end = time()
-        #print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} accelerate.prepare cost {acc_prep_end-acc_prep_start:.3f}s")
-        # self.nn_model, self.optimizer, self.lr_scheduler = \
-        #     self.accelerator.prepare(
-        #         self.nn_model, self.optimizer, self.lr_scheduler
-        #     )
-
-        # print("!!!!!!!!!!!!!!!!, after prepare, self.dataloader.sampler =", self.dataloader.sampler)
-        # print("!!!!!!!!!!!!!!!!, after prepare, self.dataloader.batch_sampler =", self.dataloader.batch_sampler)
-        # print("!!!!!!!!!!!!!!!!, after prepare, self.dataloader.DistributedSampler =", self.dataloader.DistributedSampler)
-        #train_start = time()
         global_step = 0
         for ep in range(self.config.n_epoch):
+            #torch.cuda.empty_cache()
+            #print(torch.cuda.memory_summary())#abbreviated=True))
+            #print(f"before for loop device{self.config.device} {get_gpu_info(self.config.device)}")
             self.ddpm.train()
             # self.dataloader.sampler.set_epoch(ep)
             pbar_train = tqdm(total=len(self.dataloader), file=sys.stderr)#, disable=self.config.global_rank!=0)#, mininterval=self.config.pbar_update_step)#, disable=True)#not self.accelerator.is_local_main_process)
@@ -550,20 +495,11 @@ class DDPM21CM:
             #train_end = time()
             #print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} ddpm.train costs {train_end-train_start:.3f}s")
             for i, (x, c) in enumerate(self.dataloader):
-                #if i == 0:
-                #    train_end = time()
-                #    print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} ddpm.train costs {train_end-train_start:.3f}s")
-
                 # print(f"cuda:{torch.cuda.current_device()}, x[:,0,:2,0,0] =", x[:,0,:2,0,0])
                 #with self.accelerator.accumulate(self.nn_model):
                 x = x.to(self.config.device)#.to(self.config.dtype)
-                # print("x = x.to(self.config.device), x.dtype =", x.dtype)
-                # print("x = x.to(self.dtype), x.dtype =", x.dtype)
-                # print(f"ddpm.add_noise(x), x.dtype = {x.dtype}")
-                # print(f"ddpm.add_noise(x), xt.dtype = {xt.dtype}")
-
                 # autocast forward propogation
-                with autocast():
+                with autocast(enabled=self.config.autocast):
                     xt, noise, ts = self.ddpm.add_noise(x)
 
                     if self.config.guide_w == -1:
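With autocast(enabled=self.config.autocast), mixed precision becomes a command-line switch: enabled=False turns the context manager into a no-op and the step runs in full precision. For reference, a minimal sketch of the autocast + GradScaler pairing this loop relies on (toy model and data, not the repo's loop):

```python
import torch
import torch.nn.functional as F
from torch.cuda.amp import autocast, GradScaler

device = "cuda"
model = torch.nn.Linear(16, 16).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
scaler = GradScaler()  # plays the role of self.scaler above

x, target = torch.randn(4, 16, device=device), torch.randn(4, 16, device=device)
optimizer.zero_grad()
with autocast(enabled=True):       # forward pass in fp16 where safe
    loss = F.mse_loss(model(x), target)
scaler.scale(loss).backward()      # scale loss so fp16 grads don't underflow
scaler.step(optimizer)             # unscales grads; skips the step on inf/nan
scaler.update()                    # adjusts the scale factor for the next step
```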
@@ -574,7 +510,9 @@ class DDPM21CM:
 
                     loss = F.mse_loss(noise, noise_pred)
                     loss = loss / self.config.gradient_accumulation_steps
-
+                    #print(f"within autocast #{i}-device{self.config.device} {get_gpu_info(self.config.device)}")
+                    #print(f"within autocast #{i}-device{self.config.device} t-r-a: {torch.cuda.get_device_properties(self.config.device).total_memory/1024**2}-{torch.cuda.memory_reserved(self.config.device)/1024**2}-{torch.cuda.memory_allocated(self.config.device)/1024**2}")
+
                 # scaler backward propogation
                 self.scaler.scale(loss).backward()
                 #loss.backward()
@@ -610,11 +548,13 @@ class DDPM21CM:
                 global_step += 1
 
             if (i+1) % self.config.gradient_accumulation_steps != 0:
-                print(f"(i+1)%self.config.gradient_accumulation_steps = {(i+1)%self.config.gradient_accumulation_steps}, i = {i}, scg = {self.config.gradient_accumulation_steps}".center(120,'-'))
-                torch.nn.utils.clip_grad_norm_(self.nn_model.parameters(), max_norm=1.0)
-                self.optimizer.step()
-                self.lr_scheduler.step()
-                self.optimizer.zero_grad()
+                print(f"(i+1)%self.config.gradient_accumulation_steps = {(i+1)%self.config.gradient_accumulation_steps}, i = {i}, scg = {self.config.gradient_accumulation_steps}".center(self.config.str_len,'-'))
+                #torch.nn.utils.clip_grad_norm_(self.nn_model.parameters(), max_norm=1.0)
+                #self.optimizer.step()
+                #self.lr_scheduler.step()
+                #self.optimizer.zero_grad()
+                #print(f"after autocast #{i}-device{self.config.device} {get_gpu_info(self.config.device)}")
+                #print(f"after autocast #{i}-device{self.config.device} t-r-a: {torch.cuda.get_device_properties(self.config.device).total_memory/1024**2}-{torch.cuda.memory_reserved(self.config.device)/1024**2}-{torch.cuda.memory_allocated(self.config.device)/1024**2}")
 
 
             # if ep == config.n_epoch-1 or (ep+1)*config.save_period==1:
@@ -631,7 +571,7 @@ class DDPM21CM:
         del self.nn_model
         if self.config.ema:
             del self.ema_model
-        torch.cuda.empty_cache()
+        #torch.cuda.empty_cache()
 
     def save(self, ep):
         # save model
@@ -720,20 +660,21 @@ class DDPM21CM:
         #     self.nn_model, self.optimizer, self.lr_scheduler
         # )
 
-        self.nn_model.eval()
-
         # self.ema_model = ContextUnet(n_param=config.n_param, image_size=config.HII_DIM, dim=config.dim, stride=config.stride).to(config.device)
         # self.ema_model.load_state_dict(torch.load(os.path.join(config.output_dir, f"{config.resume}"))['ema_unet_state_dict'])
         # print(f"resumed ema_model from {config.resume}")
 
+        self.nn_model.eval()
         with torch.no_grad():
-            x_last, x_entire = self.ddpm.sample(
-                nn_model=self.nn_model,
-                params=params_normalized.to(self.config.device),
-                device=self.config.device,
-                guide_w=self.config.guide_w
-            )
-
+            with autocast(enabled=self.config.autocast):
+            #with autocast():
+                x_last, x_entire = self.ddpm.sample(
+                    nn_model=self.nn_model,
+                    params=params_normalized.to(self.config.device),
+                    device=self.config.device,
+                    guide_w=self.config.guide_w
+                )
+                #print(f"x_last.dtype = {x_last.dtype}")
         if save:
             # np.save(os.path.join(self.config.output_dir, f"{self.config.run_name}{'ema' if ema else ''}.npy"), x_last)
             savetime = datetime.datetime.now().strftime("%d%H%M%S")
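The sampling change pairs three pieces: eval() puts normalization/dropout layers in inference mode, no_grad() skips building the autograd graph (the OOM fix mentioned in the removed dev log), and the same autocast flag as training keeps sampling at matching precision. The generic shape of that pattern, sketched with a placeholder model:

```python
import torch
from torch.cuda.amp import autocast

def generate(model, x, use_autocast=True):
    model.eval()                          # inference-mode normalization/dropout
    with torch.no_grad():                 # no autograd graph: far less memory
        with autocast(enabled=use_autocast):
            return model(x)
```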
@@ -809,6 +750,9 @@ if __name__ == "__main__":
     parser.add_argument("--num_image", type=int, required=False, default=32)
     parser.add_argument("--n_epoch", type=int, required=False, default=50)
     parser.add_argument("--batch_size", type=int, required=False, default=2)
+    parser.add_argument("--channel_mult", type=float, nargs="+", required=False, default=(1,2,2,2,4))
+    parser.add_argument("--autocast", type=int, required=False, default=False)
+    parser.add_argument("--use_checkpoint", type=int, required=False, default=False)
 
     args = parser.parse_args()
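The new flags take type=int rather than type=bool on purpose: argparse applies type to the raw string, and bool("False") is True (any non-empty string is truthy), so a type=bool flag cannot be switched off from the shell. Passing 0/1 and converting with bool(), as the assignments in the next hunk do, sidesteps this. A quick demonstration:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--naive", type=bool, default=False)  # the pitfall
parser.add_argument("--autocast", type=int, default=0)    # the workaround

args = parser.parse_args(["--naive", "False", "--autocast", "0"])
print(args.naive)           # True: bool("False") is True
print(bool(args.autocast))  # False: int("0") -> 0 -> False, as intended
```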
@@ -823,10 +767,14 @@ if __name__ == "__main__":
     config.num_image = args.num_image
     config.n_epoch = args.n_epoch
     config.batch_size = args.batch_size
+    config.channel_mult = args.channel_mult
+    config.autocast = bool(args.autocast)
+    config.use_checkpoint = bool(args.use_checkpoint)
+
     ############################ training ################################
     if args.train:
         config.dataset_name = args.train
-        print(f" training, ip_addr = {socket.gethostbyname(socket.gethostname())}, master_addr = {master_addr}, local_world_size = {local_world_size}, world_size = {world_size} ".center(120,'#'))
+        print(f" training, ip_addr = {socket.gethostbyname(socket.gethostname())}, master_addr = {master_addr}, local_world_size = {local_world_size}, world_size = {world_size} ".center(config.str_len,'#'))
         mp.spawn(
             train,
             args=(world_size, local_world_size, master_addr, master_port, config),
@@ -856,7 +804,7 @@ if __name__ == "__main__":
     ]
 
     for params in params_pairs:
-        print(f"sampling for {params}, ip_addr = {socket.gethostbyname(socket.gethostname())}, master_addr = {master_addr}, local_world_size = {local_world_size}, world_size = {world_size}".center(120,'-'))
+        print(f"sampling for {params}, ip_addr = {socket.gethostbyname(socket.gethostname())}, master_addr = {master_addr}, local_world_size = {local_world_size}, world_size = {world_size}".center(config.str_len,'-'))
         mp.spawn(
             generate_samples,
             args=(world_size, local_world_size, master_addr, master_port, config, num_new_img_per_gpu, max_num_img_per_gpu, torch.tensor(params)),
load_h5.py CHANGED
@@ -46,6 +46,7 @@ class Dataset4h5(Dataset):
         num_workers=1,#len(os.sched_getaffinity(0))//torch.cuda.device_count(),
         startat=0,
         # shuffle=False,
+        str_len = 120,
     ):
         super().__init__()
 
@@ -61,6 +62,7 @@ class Dataset4h5(Dataset):
         self.transform = transform
         self.num_workers = num_workers
         self.startat = startat
+        self.str_len = str_len
 
         self.load_h5()
         if rescale:
@@ -114,7 +116,7 @@ class Dataset4h5(Dataset):
         concurrent_init_start = time()
         with concurrent.futures.ProcessPoolExecutor(max_workers=self.num_workers) as executor:
             concurrent_init_end = time()
-            print(f" {socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}, concurrently loading by {self.num_workers}/{len(os.sched_getaffinity(0))} workers, initialized after {concurrent_init_end-concurrent_init_start:.3f}s ".center(120, '-'))
+            print(f" {socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}, concurrently loading by {self.num_workers}/{len(os.sched_getaffinity(0))} workers, initialized after {concurrent_init_end-concurrent_init_start:.3f}s ".center(self.str_len, '-'))
             futures = [None] * self.num_workers
             for i, idx in enumerate(np.array_split(self.idx, self.num_workers)):
                 executor_start = time()
@@ -129,7 +131,7 @@ class Dataset4h5(Dataset):
             self.params[start_idx:start_idx+batch_size] = params
             start_idx += batch_size
         concurrent_end = time()
-        print(f" {socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}, {start_idx} images {self.images.shape} & params {self.params.shape} loaded after {concurrent_start-concurrent_init_start:.3f}/{concurrent_end-concurrent_start:.3f}s ".center(120, '-'))
+        print(f" {socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}, {start_idx} images {self.images.shape} & params {self.params.shape} loaded after {concurrent_start-concurrent_init_start:.3f}/{concurrent_end-concurrent_start:.3f}s ".center(self.str_len, '-'))
 
         transform_start = time()
         if self.transform:
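For reference, the loading pattern these prints instrument: the requested indices are split across worker processes with np.array_split and each chunk is read in parallel, then reassembled in submission order. A stripped-down sketch with a hypothetical read_chunk standing in for the dataset's real HDF5 reader:

```python
import concurrent.futures
import numpy as np

def read_chunk(idx):
    # stand-in for the real HDF5 read; returns (images, params) for these indices
    return np.zeros((len(idx), 64, 64)), np.zeros((len(idx), 2))

def load_parallel(indices, num_workers=4):
    images, params = [], []
    with concurrent.futures.ProcessPoolExecutor(max_workers=num_workers) as executor:
        futures = [executor.submit(read_chunk, chunk)
                   for chunk in np.array_split(indices, num_workers)]
        for future in futures:  # iterate in submission order, as the loop above does
            img, par = future.result()
            images.append(img)
            params.append(par)
    return np.concatenate(images), np.concatenate(params)

if __name__ == "__main__":
    imgs, pars = load_parallel(np.arange(32))
    print(imgs.shape, pars.shape)  # (32, 64, 64) (32, 2)
```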
 
perlmutter_diffusion.sbatch CHANGED
@@ -1,13 +1,13 @@
 #!/bin/bash
 #SBATCH -A m4717
 #SBATCH -J diffusion
-#SBATCH -C gpu
+#SBATCH -C gpu&hbm80g
 #SBATCH -q shared #regular
 #SBATCH -N1
 #SBATCH --gpus-per-node=1
-#SBATCH -t 0:59:00
+#SBATCH -t 3:00:00
 #SBATCH --ntasks-per-node=1
-#SBATCH -oReport-%j
+#SBATCH -o%j
 #SBATCH --mail-type=BEGIN,END,FAIL
 #SBATCH --gpu-bind=none
 
@@ -33,15 +33,19 @@ export MASTER_PORT=$MASTER_PORT
 #export NCCL_DEBUG=INFO
 #export NCCL_DEBUG_SUBSYS=ALL
 cat $0
+#nvidia-smi
 
 srun python diffusion.py \
-    --train "$SCRATCH/LEN128-DIM64-CUB16-Tvir[4, 6]-zeta[10, 250]-0809-123640.h5" \
-    --num_image 3200 \
-    --batch_size 32 \
-    --n_epoch 100 \
+    --num_image 6400 \
+    --batch_size 64 \
+    --n_epoch 50 \
     --gradient_accumulation_steps 1 \
-    --num_new_img_per_gpu 800 \
-    --max_num_img_per_gpu 80 \
-    #--resume outputs/model-N3200-device_count1-node1-epoch99-16103542 \
+    --num_new_img_per_gpu 200 \
+    --max_num_img_per_gpu 20 \
+    --channel_mult 1 1 2 2 4 \
+    --autocast 1 \
+    --use_checkpoint 1 \
+    --train "$SCRATCH/LEN128-DIM64-CUB16-Tvir[4, 6]-zeta[10, 250]-0809-123640.h5" \
+    #--resume outputs/model-N3200-device_count1-node1-epoch99-17160118 \
 
 date
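Batch-script notes, as far as the diff shows: -C gpu&hbm80g asks for Perlmutter's 80 GB-HBM A100 nodes (the plain gpu constraint may land on 40 GB cards), which together with --autocast 1 and --use_checkpoint 1 is presumably what lets the longer 64x1024 lightcones train at batch_size 64; the walltime rises accordingly from 0:59:00 to 3:00:00, and -o%j names the log file after the job ID alone.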
 
quantify_results.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
tensorboard.ipynb CHANGED
@@ -23,13 +23,13 @@
 "data": {
 "text/html": [
 "\n",
-" <iframe id=\"tensorboard-frame-262245829087dd6a\" width=\"100%\" height=\"800\" frameborder=\"0\">\n",
+" <iframe id=\"tensorboard-frame-13f025ce79187ae\" width=\"100%\" height=\"800\" frameborder=\"0\">\n",
 " </iframe>\n",
 " <script>\n",
 " (function() {\n",
-" const frame = document.getElementById(\"tensorboard-frame-262245829087dd6a\");\n",
+" const frame = document.getElementById(\"tensorboard-frame-13f025ce79187ae\");\n",
 " const url = new URL(\"/\", window.location);\n",
-" const port = 45355;\n",
+" const port = 41355;\n",
 " if (port) {\n",
 " url.port = port;\n",
 " }\n",
@@ -59,7 +59,7 @@
 {
 "data": {
 "text/html": [
-"<a href=\"https://jupyter.nersc.gov/user/binxia/perlmutter-login-node-base/proxy/45355/\">https://jupyter.nersc.gov/user/binxia/perlmutter-login-node-base/proxy/45355/</a>"
+"<a href=\"https://jupyter.nersc.gov/user/binxia/perlmutter-login-node-base/proxy/41355/\">https://jupyter.nersc.gov/user/binxia/perlmutter-login-node-base/proxy/41355/</a>"
 ],
 "text/plain": [
 "<IPython.core.display.HTML object>"
@@ -72,6 +72,14 @@
 "source": [
 "nersc_tensorboard_helper.tb_address()"
 ]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "8ca783fe-501c-4e12-b769-f037b4671ef0",
+"metadata": {},
+"outputs": [],
+"source": []
 }
 ],
 "metadata": {
 