Xsmos committed on
Commit
2f1c742
·
verified ·
1 Parent(s): e903b59
context_unet.py CHANGED
@@ -179,7 +179,7 @@ class ResBlock(TimestepBlock):
179
  2 * self.out_channels if use_scale_shift_norm else self.out_channels,
180
  ),
181
  )
182
-
183
  self.out_layers = nn.Sequential(
184
  # nn.BatchNorm2d(self.out_channels),
185
  normalization(self.out_channels, swish=0.0 if use_scale_shift_norm else 1.0),
 
179
  2 * self.out_channels if use_scale_shift_norm else self.out_channels,
180
  ),
181
  )
182
+ print(f"resnet: dropout = {dropout}")
183
  self.out_layers = nn.Sequential(
184
  # nn.BatchNorm2d(self.out_channels),
185
  normalization(self.out_channels, swish=0.0 if use_scale_shift_norm else 1.0),
diffusion.py CHANGED
@@ -51,6 +51,8 @@ from time import time
51
  from torch.cuda.amp import autocast, GradScaler
52
  from random import getrandbits
53
 
 
 
54
  # %%
55
  def ddp_setup(rank: int, world_size: int, master_addr, master_port):
56
  """
@@ -268,7 +270,8 @@ class TrainConfig:
268
  # n_sample = 24 # 64, the number of samples in sampling process
269
  n_param = 2
270
  guide_w = 0#-1#0#-1#0#-1#0.1#[0,0.1] #[0,0.5,2] strength of generative guidance
271
- drop_prob = 0#0.28 # only takes effect when guide_w != -1
 
272
  ema=False # whether to use ema
273
  ema_rate=0.995
274
 
@@ -365,7 +368,7 @@ class DDPM21CM:
365
  self.ddpm = DDPMScheduler(betas=(1e-4, 0.02), num_timesteps=config.num_timesteps, img_shape=config.img_shape, device=config.device, config=config,)#, dtype=config.dtype
366
 
367
  # initialize the unet
368
- self.nn_model = ContextUnet(n_param=config.n_param, image_size=config.HII_DIM, dim=config.dim, stride=config.stride, channel_mult=config.channel_mult, use_checkpoint=config.use_checkpoint)#, dtype=config.dtype)
369
 
370
  self.nn_model.train()
371
  self.nn_model.to(self.ddpm.device)
@@ -386,7 +389,7 @@ class DDPM21CM:
386
  if config.ema:
387
  self.ema = EMA(config.ema_rate)
388
  if config.resume and os.path.exists(config.resume):
389
- self.ema_model = ContextUnet(n_param=config.n_param, image_size=config.HII_DIM, dim=config.dim, stride=config.stride).to(config.device)#, dtype=config.dtype
390
  self.ema_model.load_state_dict(torch.load(config.resume)['ema_unet_state_dict'])
391
  print(f"resumed ema_model from {config.resume}")
392
  else:
@@ -409,7 +412,7 @@ class DDPM21CM:
409
  HII_DIM=self.config.HII_DIM,
410
  num_redshift=self.config.num_redshift,
411
  startat=self.config.startat,
412
- drop_prob=self.config.drop_prob,
413
  dim=self.config.dim,
414
  ranges_dict=self.ranges_dict,
415
  num_workers=min(1,len(os.sched_getaffinity(0))//self.config.world_size),
@@ -505,6 +508,10 @@ class DDPM21CM:
505
  c = c.to(self.config.device)
506
  noise_pred = self.nn_model(xt, ts, c)#.to(x.dtype)
507
 
 
 
 
 
508
  loss = F.mse_loss(noise, noise_pred)
509
  loss = loss / self.config.gradient_accumulation_steps
510
 
@@ -610,7 +617,7 @@ class DDPM21CM:
610
  params_backup = params.numpy().copy()
611
  params_normalized = self.rescale(params, self.ranges_dict['params'], to=[0,1])
612
 
613
- print(f"{socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}/{self.config.global_rank} sampling {num_new_img_per_gpu} images with normalized params = {params_normalized}")
614
  params_normalized = params_normalized.repeat(num_new_img_per_gpu,1)
615
  assert params_normalized.dim() == 2, "params_normalized must be a 2D torch.tensor"
616
  # print("params =", params)
@@ -705,6 +712,7 @@ if __name__ == "__main__":
705
  parser.add_argument("--channel_mult", type=float, nargs="+", required=False, default=(1,2,2,2,4))
706
  parser.add_argument("--autocast", type=int, required=False, default=False)
707
  parser.add_argument("--use_checkpoint", type=int, required=False, default=False)
 
708
 
709
  args = parser.parse_args()
710
 
@@ -722,6 +730,7 @@ if __name__ == "__main__":
722
  config.channel_mult = args.channel_mult
723
  config.autocast = bool(args.autocast)
724
  config.use_checkpoint = bool(args.use_checkpoint)
 
725
 
726
  ############################ training ################################
727
  if args.train:
@@ -756,7 +765,7 @@ if __name__ == "__main__":
756
  ]
757
 
758
  for params in params_pairs:
759
- print(f"sampling for {params}, ip_addr = {socket.gethostbyname(socket.gethostname())}, master_addr = {master_addr}, local_world_size = {local_world_size}, world_size = {world_size}".center(config.str_len,'-'))
760
  mp.spawn(
761
  generate_samples,
762
  args=(world_size, local_world_size, master_addr, master_port, config, num_new_img_per_gpu, max_num_img_per_gpu, torch.tensor(params)),
 
51
  from torch.cuda.amp import autocast, GradScaler
52
  from random import getrandbits
53
 
54
+ import subprocess
55
+
56
  # %%
57
  def ddp_setup(rank: int, world_size: int, master_addr, master_port):
58
  """
 
270
  # n_sample = 24 # 64, the number of samples in sampling process
271
  n_param = 2
272
  guide_w = 0#-1#0#-1#0#-1#0.1#[0,0.1] #[0,0.5,2] strength of generative guidance
273
+ dropout = 0
274
+ #drop_prob = 0.1 #0.28 # only takes effect when guide_w != -1
275
  ema=False # whether to use ema
276
  ema_rate=0.995
277
 
 
368
  self.ddpm = DDPMScheduler(betas=(1e-4, 0.02), num_timesteps=config.num_timesteps, img_shape=config.img_shape, device=config.device, config=config,)#, dtype=config.dtype
369
 
370
  # initialize the unet
371
+ self.nn_model = ContextUnet(n_param=config.n_param, image_size=config.HII_DIM, dim=config.dim, stride=config.stride, channel_mult=config.channel_mult, use_checkpoint=config.use_checkpoint, dropout=config.dropout)#, dtype=config.dtype)
372
 
373
  self.nn_model.train()
374
  self.nn_model.to(self.ddpm.device)
 
389
  if config.ema:
390
  self.ema = EMA(config.ema_rate)
391
  if config.resume and os.path.exists(config.resume):
392
+ self.ema_model = ContextUnet(n_param=config.n_param, image_size=config.HII_DIM, dim=config.dim, stride=config.stride, dropout=config.dropout).to(config.device)#, dtype=config.dtype
393
  self.ema_model.load_state_dict(torch.load(config.resume)['ema_unet_state_dict'])
394
  print(f"resumed ema_model from {config.resume}")
395
  else:
 
412
  HII_DIM=self.config.HII_DIM,
413
  num_redshift=self.config.num_redshift,
414
  startat=self.config.startat,
415
+ #drop_prob=self.config.drop_prob,
416
  dim=self.config.dim,
417
  ranges_dict=self.ranges_dict,
418
  num_workers=min(1,len(os.sched_getaffinity(0))//self.config.world_size),
 
508
  c = c.to(self.config.device)
509
  noise_pred = self.nn_model(xt, ts, c)#.to(x.dtype)
510
 
511
+ #if ep == 0 and i == 0 and self.config.global_rank == 0:
512
+ # result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
513
+ # print(result.stdout, flush=True)
514
+
515
  loss = F.mse_loss(noise, noise_pred)
516
  loss = loss / self.config.gradient_accumulation_steps
517
 
 
617
  params_backup = params.numpy().copy()
618
  params_normalized = self.rescale(params, self.ranges_dict['params'], to=[0,1])
619
 
620
+ print(f"{socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}/{self.config.global_rank} sampling {num_new_img_per_gpu} images with normalized params = {params_normalized}, {datetime.now().strftime('%d-%H:%M:%S.%f')}")
621
  params_normalized = params_normalized.repeat(num_new_img_per_gpu,1)
622
  assert params_normalized.dim() == 2, "params_normalized must be a 2D torch.tensor"
623
  # print("params =", params)
 
712
  parser.add_argument("--channel_mult", type=float, nargs="+", required=False, default=(1,2,2,2,4))
713
  parser.add_argument("--autocast", type=int, required=False, default=False)
714
  parser.add_argument("--use_checkpoint", type=int, required=False, default=False)
715
+ parser.add_argument("--dropout", type=float, required=False, default=0)
716
 
717
  args = parser.parse_args()
718
 
 
730
  config.channel_mult = args.channel_mult
731
  config.autocast = bool(args.autocast)
732
  config.use_checkpoint = bool(args.use_checkpoint)
733
+ config.dropout = args.dropout
734
 
735
  ############################ training ################################
736
  if args.train:
 
765
  ]
766
 
767
  for params in params_pairs:
768
+ print(f"sampling, {params}, ip = {socket.gethostbyname(socket.gethostname())}, local_world_size = {local_world_size}, world_size = {world_size}, {datetime.now().strftime('%d-%H:%M:%S.%f')}".center(config.str_len,'#'))
769
  mp.spawn(
770
  generate_samples,
771
  args=(world_size, local_world_size, master_addr, master_port, config, num_new_img_per_gpu, max_num_img_per_gpu, torch.tensor(params)),
load_h5.py CHANGED
@@ -91,7 +91,7 @@ class Dataset4h5(Dataset):
91
  field_shape = f['brightness_temp'].shape[1:]
92
  #print(f"field.shape = {field_shape}")
93
  self.params_keys = list(f['params']['keys'])
94
- print(f"{max_num_image} images of shape {field_shape} can be loaded with different params.keys {self.params_keys}")
95
  #print(f"params keys = {self.params_keys}")
96
 
97
  if self.idx == "random":
 
91
  field_shape = f['brightness_temp'].shape[1:]
92
  #print(f"field.shape = {field_shape}")
93
  self.params_keys = list(f['params']['keys'])
94
+ print(f"{max_num_image} {f['brightness_temp'].dtype} images of shape {field_shape} can be loaded with params.keys {self.params_keys}")
95
  #print(f"params keys = {self.params_keys}")
96
 
97
  if self.idx == "random":
perlmutter_diffusion.sbatch CHANGED
@@ -2,10 +2,10 @@
2
  #SBATCH -A m4717
3
  #SBATCH -J diffusion
4
  #SBATCH -C gpu&hbm80g
5
- #SBATCH -q shared #regular
6
- #SBATCH -N1
7
- #SBATCH --gpus-per-node=1
8
- #SBATCH -t 6:00:00
9
  #SBATCH --ntasks-per-node=1
10
  #SBATCH -oReport-%j
11
  #SBATCH --mail-type=BEGIN,END,FAIL
@@ -25,27 +25,21 @@ MASTER_PORT=$((10000 + RANDOM % 10000)) #12355
25
  #export OMP_NUM_THREADS=1
26
  export MASTER_ADDR=$MASTER_ADDR
27
  export MASTER_PORT=$MASTER_PORT
28
- #export SLURM_CPU_BIND="cores"
29
- #echo $MASTER_ADDR
30
- #echo $MASTER_PORT
31
- #nc -zv $MASTER_ADDR $MASTER_PORT
32
-
33
- #export NCCL_DEBUG=INFO
34
- #export NCCL_DEBUG_SUBSYS=ALL
35
  cat $0
36
- #nvidia-smi
37
 
38
  srun python diffusion.py \
39
- --num_image 1280 \
40
  --batch_size 2 \
41
- --n_epoch 80 \
42
- --channel_mult 0.5 1 2 4 4 8 \
43
- --num_new_img_per_gpu 9 \
44
- --max_num_img_per_gpu 3 \
45
  --gradient_accumulation_steps 1 \
46
  --autocast 1 \
47
  --use_checkpoint 1 \
48
- --resume ./outputs/model-N1280-device_count4-node5-epoch34-13133235 \
49
- #--train "$SCRATCH/LEN128-DIM64-CUB16-Tvir[4, 6]-zeta[10, 250]-0809-123640.h5" \
 
50
 
51
  date
 
2
  #SBATCH -A m4717
3
  #SBATCH -J diffusion
4
  #SBATCH -C gpu&hbm80g
5
+ #SBATCH -q regular #shared
6
+ #SBATCH -N4
7
+ #SBATCH --gpus-per-node=4
8
+ #SBATCH -t 48:00:00
9
  #SBATCH --ntasks-per-node=1
10
  #SBATCH -oReport-%j
11
  #SBATCH --mail-type=BEGIN,END,FAIL
 
25
  #export OMP_NUM_THREADS=1
26
  export MASTER_ADDR=$MASTER_ADDR
27
  export MASTER_PORT=$MASTER_PORT
28
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
 
 
 
 
 
 
29
  cat $0
 
30
 
31
  srun python diffusion.py \
32
+ --num_image 1600 \
33
  --batch_size 2 \
34
+ --n_epoch 40 \
35
+ --channel_mult 1 1 2 2 4 4 \
36
+ --num_new_img_per_gpu 4 \
37
+ --max_num_img_per_gpu 2 \
38
  --gradient_accumulation_steps 1 \
39
  --autocast 1 \
40
  --use_checkpoint 1 \
41
+ --dropout 0.1 \
42
+ --train "$SCRATCH/LEN128-DIM64-CUB16-Tvir[4, 6]-zeta[10, 250]-0809-123640.h5" \
43
+ #--resume ./outputs/model-N1280-device_count4-node5-epoch24-13133235 \
44
 
45
  date
quantify_results.ipynb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4feb0d9bf444b9783c9d63200ff956c00b720a75e82a3b25597432ea88122b2a
3
- size 16041018
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a974cc33812b8a680d4704e46268d54a185da33abeab18704d8100f69369b692
3
+ size 16920502