0810-170449

Browse files

Files changed (7) hide show

context_unet.py +18 -12
diffusion.py +30 -18
frontera_generate_dataset.sbatch +53 -0
generate_dataset.ipynb +0 -0
generate_dataset.py +19 -10
phoenix_diffusion.sbatch +4 -4
quantify_results.ipynb +0 -0

context_unet.py CHANGED Viewed

@@ -32,12 +32,15 @@ class GroupNorm32(nn.GroupNorm):
         self.swish = swish
     def forward(self, x):
-        # print("GroupNorm32, x.dtype =", x.dtype)
-        y = super().forward(x.float()).to(x.dtype)
         if self.swish == 1.0:
             y = F.silu(y)
         elif self.swish:
             y = y * F.sigmoid(y * float(self.swish))
         return y
 def normalization(channels, swish=0.0):
@@ -284,7 +287,7 @@ def timestep_embedding(timesteps, dim, max_period=10000):
     :param max_period: controls the minimum frequency of the embeddings.
     :return: an [N x dim] Tensor of positional embeddings.
     """
-    #print (timesteps.shape)
     half = dim // 2
     freqs = torch.exp(
         -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
@@ -294,6 +297,7 @@ def timestep_embedding(timesteps, dim, max_period=10000):
     embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
     if dim % 2:
         embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
     return embedding
 class ContextUnet(nn.Module):
@@ -522,32 +526,34 @@ class ContextUnet(nn.Module):
     def forward(self, x, timesteps, y=None):
         hs = []
         # print("device of timesteps, self.model_channels:", timesteps.device, self.model_channels)
-        emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))
         if y != None:
-            text_outputs = self.token_embedding(y.float())
             emb = emb + text_outputs.to(emb)
-        # print("forward, h = x.type(self.dtype), self.dtype =", self.dtype)
         h = x.type(self.dtype)
-        # print("0,h.shape =", h.shape)
         for module in self.input_blocks:
             h = module(h, emb)
             hs.append(h)
-            # print("module encoder, h.shape =", h.shape)
         # print("2,h.shape =", h.shape)
         h = self.middle_block(h, emb)
-        # print("middle block, h.shape =", h.shape)
         # print("2,h.shape =", h.shape)
         for module in self.output_blocks:
-            # print("for module in self.output_blocks, h.shape =", h.shape)
             # print("len(hs) =", len(hs), ", hs[-1].shape =", hs[-1].shape)
             h = torch.cat([h, hs.pop()], dim=1)
             h = module(h, emb)
             # print("module decoder, h.shape =", h.shape)
-        # print("h = h.type(x.dtype), x.dtype =", x.dtype)
         h = h.type(x.dtype)
         h = self.out(h)
-        # print("self.out(h)", "h.shape =", h.shape)
         return h

         self.swish = swish
     def forward(self, x):
+        #print(f"GroupNorm32, x.dtype = {x.dtype}, x.float().dtype = {x.float().dtype}, swish = {self.swish}")
+        #y = super().forward(x.float()).to(x.dtype)
+        y = super().forward(x)
+        #print(f"swish == {self.swish}, {y.dtype}")
         if self.swish == 1.0:
             y = F.silu(y)
         elif self.swish:
             y = y * F.sigmoid(y * float(self.swish))
+        #print(f"swish == {self.swish}, {y.dtype}")
         return y
 def normalization(channels, swish=0.0):
     :param max_period: controls the minimum frequency of the embeddings.
     :return: an [N x dim] Tensor of positional embeddings.
     """
+    #print(f"timestep_embedding is running")
     half = dim // 2
     freqs = torch.exp(
         -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
     embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
     if dim % 2:
         embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+    #print(f"timestep_embedding is ending")
     return embedding
 class ContextUnet(nn.Module):
     def forward(self, x, timesteps, y=None):
         hs = []
         # print("device of timesteps, self.model_channels:", timesteps.device, self.model_channels)
+        emb = self.time_embed(timestep_embedding(timesteps, self.model_channels).to(self.dtype))
+        #print(f"forward after emb")
         if y != None:
+            #text_outputs = self.token_embedding(y.float())
+            text_outputs = self.token_embedding(y.to(self.dtype))
             emb = emb + text_outputs.to(emb)
+        #print("forward, h = x.type(self.dtype), self.dtype =", self.dtype)
         h = x.type(self.dtype)
+        #print("0,h.shape =", h.shape)
         for module in self.input_blocks:
             h = module(h, emb)
             hs.append(h)
+            #print("module encoder, h.shape =", h.shape)
         # print("2,h.shape =", h.shape)
         h = self.middle_block(h, emb)
+        #print("middle block, h.shape =", h.shape)
         # print("2,h.shape =", h.shape)
         for module in self.output_blocks:
+            #print("for module in self.output_blocks, h.shape =", h.shape)
             # print("len(hs) =", len(hs), ", hs[-1].shape =", hs[-1].shape)
             h = torch.cat([h, hs.pop()], dim=1)
             h = module(h, emb)
             # print("module decoder, h.shape =", h.shape)
+        #print("h = h.type(x.dtype), x.dtype =", x.dtype)
         h = h.type(x.dtype)
         h = self.out(h)
+        #print("self.out(h)", "h.shape =", h.shape)
         return h

diffusion.py CHANGED Viewed

@@ -115,8 +115,9 @@ def ddp_setup(rank: int, world_size: int, master_addr, master_port):
 # %%
 class DDPMScheduler(nn.Module):
-    def __init__(self, betas: tuple, num_timesteps: int, img_shape: list, device='cpu', dtype=torch.float32, config=None):
         super().__init__()
         beta_1, beta_T = betas
         assert 0 < beta_1 <= beta_T <= 1, "ensure 0 < beta_1 <= beta_T <= 1"
@@ -124,6 +125,7 @@ class DDPMScheduler(nn.Module):
         self.num_timesteps = num_timesteps
         self.img_shape = img_shape
         self.beta_t = torch.linspace(beta_1, beta_T, self.num_timesteps) #* (beta_T-beta_1) + beta_1
         self.beta_t = self.beta_t.to(self.device)
         # self.drop_prob = drop_prob
@@ -132,7 +134,6 @@ class DDPMScheduler(nn.Module):
         # self.bar_alpha_t = torch.exp(torch.cumsum(torch.log(self.alpha_t), dim=0))
         self.bar_alpha_t = torch.cumprod(self.alpha_t, dim=0)
         # self.use_fp16 = use_fp16
-        self.dtype = dtype#torch.float16 if self.use_fp16 else torch.float32
         self.config = config
     def add_noise(self, clean_images):
@@ -157,15 +158,18 @@ class DDPMScheduler(nn.Module):
     def sample(self, nn_model, params, device, guide_w = 0):
         n_sample = len(params) #params.shape[0]
         # print("params.shape[0], len(params)", params.shape[0], len(params))
-        x_i = torch.randn(n_sample, *self.img_shape).to(device)
         # print("x_i.shape =", x_i.shape)
         # print("x_i.shape =", x_i.shape)
         if guide_w != -1:
             c_i = params
-            uncond_tokens = torch.zeros(int(n_sample), params.shape[1]).to(device)
             # uncond_tokens = torch.tensor(np.float32(np.array([0,0]))).to(device)
             # uncond_tokens = uncond_tokens.repeat(int(n_sample),1)
-            c_i = torch.cat((c_i, uncond_tokens), 0)
         x_i_entire = [] # keep track of generated steps in case want to plot something
         # print("self.num_timesteps =", self.num_timesteps)
@@ -177,8 +181,10 @@ class DDPMScheduler(nn.Module):
             # print(f'sampling timestep {i:4d}',end='\r')
             t_is = torch.tensor([i]).to(device)
             t_is = t_is.repeat(n_sample)
-            z = torch.randn(n_sample, *self.img_shape).to(device) if i > 0 else 0
             if guide_w == -1:
                 # eps = nn_model(x_i, t_is, return_dict=False)[0]
@@ -186,22 +192,26 @@ class DDPMScheduler(nn.Module):
                 # x_i = 1/torch.sqrt(self.alpha_t[i])*(x_i-eps*self.beta_t[i]/torch.sqrt(1-self.bar_alpha_t[i])) + torch.sqrt(self.beta_t[i])*z
             else:
                 # double batch
-                x_i = x_i.repeat(2, *torch.ones(len(self.img_shape), dtype=int).tolist())
-                t_is = t_is.repeat(2)
                 # split predictions and compute weighting
                 # print("nn_model input shape", x_i.shape, t_is.shape, c_i.shape)
                 eps = nn_model(x_i, t_is, c_i)
-                eps1 = eps[:n_sample]
-                eps2 = eps[n_sample:]
-                eps = eps1 + guide_w*(eps1 - eps2)
                 # eps = (1+guide_w)*eps1 - guide_w*eps2
-                x_i = x_i[:n_sample]
                 # x_i = 1/torch.sqrt(self.alpha_t[i])*(x_i-eps*self.beta_t[i]/torch.sqrt(1-self.bar_alpha_t[i])) + torch.sqrt(self.beta_t[i])*z
             # print("x_i.shape =", x_i.shape)
             x_i = 1/torch.sqrt(self.alpha_t[i])*(x_i-eps*self.beta_t[i]/torch.sqrt(1-self.bar_alpha_t[i])) + torch.sqrt(self.beta_t[i])*z
             pbar_sample.update(1)
             # store only part of the intermediate steps
@@ -257,12 +267,12 @@ class TrainConfig:
     # dim = 2
     dim = 3#2
-    stride = (2,4) if dim == 2 else (2,2,4)
-    num_image = 480#1200#120#3000#300#3000#6000#30#60#6000#1000#2000#20000#15000#7000#25600#3000#10000#1000#10000#5000#2560#800#2560
-    batch_size = 1#10#50#10#50#20#50#1#2#50#20#2#100 # 10
     n_epoch = 50#1#50#10#1#50#1#50#5#50#5#50#100#50#100#30#120#5#4# 10#50#20#20#2#5#25 # 120
     HII_DIM = 64
-    num_redshift = 512#256#512#256#512#256#512#64#512#64#512#64#256CUDAoom#128#64#512#128#64#512#256#256#64#512#128
     channel = 1
     img_shape = (channel, HII_DIM, num_redshift) if dim == 2 else (channel, HII_DIM, HII_DIM, num_redshift)
@@ -396,6 +406,7 @@ class DDPM21CM:
             # self.nn_model.load_state_dict(torch.load(config.resume)['unet_state_dict'])
             # print(f"resumed nn_model from {config.resume}")
             self.nn_model.module.load_state_dict(torch.load(config.resume)['unet_state_dict'])
             print(f"{config.run_name} {socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}/{self.config.global_rank} resumed nn_model from {config.resume} with {sum(x.numel() for x in self.nn_model.parameters())} parameters".center(120,'-'))
         else:
             print(f"{config.run_name} {socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}/{self.config.global_rank} initialized nn_model randomly with {sum(x.numel() for x in self.nn_model.parameters())} parameters".center(120,'-'))
@@ -511,8 +522,9 @@ class DDPM21CM:
                     # print("x = x.to(self.config.device), x.dtype =", x.dtype)
                     # x = x.to(self.config.dtype)
                     # print("x = x.to(self.dtype), x.dtype =", x.dtype)
                     xt, noise, ts = self.ddpm.add_noise(x)
                     if self.config.guide_w == -1:
                         noise_pred = self.nn_model(xt, ts)
                     else:

 # %%
 class DDPMScheduler(nn.Module):
+    def __init__(self, betas: tuple, num_timesteps: int, img_shape: list, device='cpu', dtype=torch.float16, config=None):
         super().__init__()
+        self.dtype = dtype#torch.float16 if self.use_fp16 else torch.float32
         beta_1, beta_T = betas
         assert 0 < beta_1 <= beta_T <= 1, "ensure 0 < beta_1 <= beta_T <= 1"
         self.num_timesteps = num_timesteps
         self.img_shape = img_shape
         self.beta_t = torch.linspace(beta_1, beta_T, self.num_timesteps) #* (beta_T-beta_1) + beta_1
+        self.beta_t = self.beta_t.to(self.dtype)
         self.beta_t = self.beta_t.to(self.device)
         # self.drop_prob = drop_prob
         # self.bar_alpha_t = torch.exp(torch.cumsum(torch.log(self.alpha_t), dim=0))
         self.bar_alpha_t = torch.cumprod(self.alpha_t, dim=0)
         # self.use_fp16 = use_fp16
         self.config = config
     def add_noise(self, clean_images):
     def sample(self, nn_model, params, device, guide_w = 0):
         n_sample = len(params) #params.shape[0]
         # print("params.shape[0], len(params)", params.shape[0], len(params))
+        x_i = torch.randn(n_sample, *self.img_shape).to(self.dtype)
+        x_i = x_i.to(device)
+        #print(f"#1 x_i.device = {x_i.device}")
         # print("x_i.shape =", x_i.shape)
         # print("x_i.shape =", x_i.shape)
         if guide_w != -1:
             c_i = params
+            #uncond_tokens = torch.zeros(int(n_sample), params.shape[1]).to(device)
             # uncond_tokens = torch.tensor(np.float32(np.array([0,0]))).to(device)
             # uncond_tokens = uncond_tokens.repeat(int(n_sample),1)
+            #c_i = torch.cat((c_i, uncond_tokens), 0)
+            c_i = c_i.to(self.dtype)
         x_i_entire = [] # keep track of generated steps in case want to plot something
         # print("self.num_timesteps =", self.num_timesteps)
             # print(f'sampling timestep {i:4d}',end='\r')
             t_is = torch.tensor([i]).to(device)
             t_is = t_is.repeat(n_sample)
+            t_is = t_is.to(self.dtype)
+            z = torch.randn(n_sample, *self.img_shape).to(device) if i > 0 else torch.tensor(0.)
+            z = z.to(self.dtype)
             if guide_w == -1:
                 # eps = nn_model(x_i, t_is, return_dict=False)[0]
                 # x_i = 1/torch.sqrt(self.alpha_t[i])*(x_i-eps*self.beta_t[i]/torch.sqrt(1-self.bar_alpha_t[i])) + torch.sqrt(self.beta_t[i])*z
             else:
                 # double batch
+                #print(f"#2 x_i.device = {x_i.device}")
+                #x_i = x_i.repeat(2, *torch.ones(len(self.img_shape), dtype=int).tolist())
+                #t_is = t_is.repeat(2)
                 # split predictions and compute weighting
                 # print("nn_model input shape", x_i.shape, t_is.shape, c_i.shape)
+                #print(f"sample, i = {i}, x_i.dtype = {x_i.dtype}, c_i.dtype = {c_i.dtype}")
                 eps = nn_model(x_i, t_is, c_i)
+                #eps1 = eps[:n_sample]
+                #eps2 = eps[n_sample:]
+                #eps = eps1 + guide_w*(eps1 - eps2)
                 # eps = (1+guide_w)*eps1 - guide_w*eps2
+                #x_i = x_i[:n_sample]
                 # x_i = 1/torch.sqrt(self.alpha_t[i])*(x_i-eps*self.beta_t[i]/torch.sqrt(1-self.bar_alpha_t[i])) + torch.sqrt(self.beta_t[i])*z
             # print("x_i.shape =", x_i.shape)
+            #print(f"before, x_i.dtype = {x_i.dtype}, beta_t.dtype = {self.beta_t.dtype}, eps.dtype = {eps.dtype}, alpha_t.dtype = {self.alpha_t.dtype}, z.dtype = {z.dtype}")
             x_i = 1/torch.sqrt(self.alpha_t[i])*(x_i-eps*self.beta_t[i]/torch.sqrt(1-self.bar_alpha_t[i])) + torch.sqrt(self.beta_t[i])*z
+            #print(f"after, x_i.dtype = {x_i.dtype}, beta_t.dtype = {self.beta_t.dtype}, eps.dtype = {eps.dtype}, alpha_t.dtype = {self.alpha_t.dtype}, z.dtype = {z.dtype}")
             pbar_sample.update(1)
             # store only part of the intermediate steps
     # dim = 2
     dim = 3#2
+    stride = (2,4) if dim == 2 else (2,2,2)
+    num_image = 30#00#480#1200#120#3000#300#3000#6000#30#60#6000#1000#2000#20000#15000#7000#25600#3000#10000#1000#10000#5000#2560#800#2560
+    batch_size = 5#1#10#50#10#50#20#50#1#2#50#20#2#100 # 10
     n_epoch = 50#1#50#10#1#50#1#50#5#50#5#50#100#50#100#30#120#5#4# 10#50#20#20#2#5#25 # 120
     HII_DIM = 64
+    num_redshift = 64#256#512#256#512#256#512#256#512#64#512#64#512#64#256CUDAoom#128#64#512#128#64#512#256#256#64#512#128
     channel = 1
     img_shape = (channel, HII_DIM, num_redshift) if dim == 2 else (channel, HII_DIM, HII_DIM, num_redshift)
             # self.nn_model.load_state_dict(torch.load(config.resume)['unet_state_dict'])
             # print(f"resumed nn_model from {config.resume}")
             self.nn_model.module.load_state_dict(torch.load(config.resume)['unet_state_dict'])
+            self.nn_model.module.to(config.dtype)
             print(f"{config.run_name} {socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}/{self.config.global_rank} resumed nn_model from {config.resume} with {sum(x.numel() for x in self.nn_model.parameters())} parameters".center(120,'-'))
         else:
             print(f"{config.run_name} {socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}/{self.config.global_rank} initialized nn_model randomly with {sum(x.numel() for x in self.nn_model.parameters())} parameters".center(120,'-'))
                     # print("x = x.to(self.config.device), x.dtype =", x.dtype)
                     # x = x.to(self.config.dtype)
                     # print("x = x.to(self.dtype), x.dtype =", x.dtype)
+                    #print(f"ddpm.add_noise(x), x.dtype = {x.dtype}")
                     xt, noise, ts = self.ddpm.add_noise(x)
+                    #print(f"ddpm.add_noise(x), xt.dtype = {xt.dtype}")
                     if self.config.guide_w == -1:
                         noise_pred = self.nn_model(xt, ts)
                     else:

frontera_generate_dataset.sbatch ADDED Viewed

	@@ -0,0 +1,53 @@

+#!/bin/bash
+#----------------------------------------------------
+# Sample Slurm job script
+#   for TACC Frontera CLX nodes
+#
+#   *** MPI Job in Normal Queue ***
+#
+# Last revised: 20 May 2019
+#
+# Notes:
+#
+#   -- Launch this script by executing
+#      "sbatch frontera_generate_dataset.sbatch" on a Frontera login node.
+#
+#   -- Use ibrun to launch MPI codes on TACC systems.
+#      Do NOT use mpirun or mpiexec.
+#
+#   -- Max recommended MPI ranks per CLX node: 56
+#      (start small, increase gradually).
+#
+#   -- If you're running out of memory, try running
+#      fewer tasks per node to give each task more memory.
+#
+#----------------------------------------------------
+#SBATCH -J datasets           # Job name
+#SBATCH -o Report-%j       # Name of stdout output file
+#SBATCH -p normal # Queue (partition) name
+#SBATCH -N 12 # 50              # Total # of nodes
+#SBATCH -t 2-00:00:00        # Run time (d-hh:mm:ss)
+#SBATCH --mail-type=all    # Send email on all job events (begin, end, fail, ...)
+#SBATCH --mail-user=xiabin@gatech.edu
+#SBATCH --ntasks-per-node=1
+# Any other commands must follow all #SBATCH directives...
+############# #SBATCH -c 56              # CPUs per task (directive disabled)
+#----------------------------------------------------
+cat $0
+date
+pwd
+module list
+conda env list
+srun python generate_dataset.py \
+    --save_direc $SCRATCH \
+    --num_images 25600 \
+    --BOX_LEN 128 \
+    --HII_DIM 64 \
+    --NON_CUBIC_FACTOR 16 \
+    --cpus_per_node 38 \
+#----------------------------------------------------

generate_dataset.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

generate_dataset.py CHANGED Viewed

@@ -19,6 +19,7 @@ import fcntl
 import time
 from time import sleep
 from pathlib import Path
 # Parallelize
 try:
@@ -120,6 +121,7 @@ class Generator():
             BOX_LEN = 150,
             HII_DIM = 60,
             USE_INTERPOLATION_TABLES = True,
             # cosmo_params of py21cmfast.run_coeval():
             SIGMA_8 = 0.810,
@@ -201,6 +203,7 @@ class Generator():
                 user_params = kwargs_params_cpu,
                 cosmo_params = p21c.CosmoParams(kwargs_params_cpu),
                 astro_params = p21c.AstroParams(kwargs_params_cpu),
                 random_seed = random_seed,
                 write = kwargs_params_cpu['write'],
             )
@@ -210,11 +213,13 @@ class Generator():
         elif self.kwargs['p21c_run'] == 'lightcone':
             lightcone_cpu = p21c.run_lightcone(
                 redshift = kwargs_params_cpu['redshift'][0],
-                max_redshift = kwargs_params_cpu['redshift'][-1],
                 lightcone_quantities = kwargs_params_cpu['fields'],
                 user_params = kwargs_params_cpu,
                 cosmo_params = p21c.CosmoParams(kwargs_params_cpu),
                 astro_params = p21c.AstroParams(kwargs_params_cpu),
                 random_seed = random_seed,
                 write = kwargs_params_cpu['write'],
             )
@@ -362,14 +367,16 @@ class Generator():
                 # break
             except IOError or BlockingIOError:
                 if try_time > 30:
-                    print(f"{rank}-{multiprocessing.current_process().pid}, try_time = {try_time:.2f} sec")
-                    sleep(10)
                 else:
                     sleep(0.1)
     # Save as hdf5
     def save(self, images_node, params_seeds):
-        max_num_images = None # self.kwargs['num_images']
         with h5py.File(self.kwargs['save_direc_name'], 'a') as f:
             if 'kwargs' not in f.keys():
                 keys = list(self.kwargs)
@@ -436,23 +443,25 @@ if __name__ == '__main__':
     args = parser.parse_args()
     params_ranges = dict(
-        ION_Tvir_MIN = 4.4, #[4,6],
-        HII_EFF_FACTOR = 131.341, #[10, 250],
         )
     kwargs = dict(
         num_images=args.num_images,#2400,#30000,
         fields = ['brightness_temp', 'density', 'xH_box'],
-        BOX_LEN=args.BOX_LEN,#128,
-        HII_DIM=args.HII_DIM,
-        verbose=3, redshift=[7.51, 11.93],
         NON_CUBIC_FACTOR = args.NON_CUBIC_FACTOR,
         write = True,
         cpus_per_node = args.cpus_per_node,#10,#112,#20,
         cache_rmdir = False,
         )
-    save_name = f"LEN{kwargs['BOX_LEN']}-DIM{kwargs['HII_DIM']}-CUB{kwargs['NON_CUBIC_FACTOR']}-{params_ranges['ION_Tvir_MIN']}-{params_ranges['HII_EFF_FACTOR']}.h5"
     kwargs['save_direc_name'] = os.path.join(args.save_direc, save_name)
     generator = Generator(params_ranges, **kwargs)

 import time
 from time import sleep
 from pathlib import Path
+import datetime
 # Parallelize
 try:
             BOX_LEN = 150,
             HII_DIM = 60,
             USE_INTERPOLATION_TABLES = True,
+            USE_TS_FLUCT = True,
             # cosmo_params of py21cmfast.run_coeval():
             SIGMA_8 = 0.810,
                 user_params = kwargs_params_cpu,
                 cosmo_params = p21c.CosmoParams(kwargs_params_cpu),
                 astro_params = p21c.AstroParams(kwargs_params_cpu),
+                flag_options = p21c.FlagOptions(kwargs_params_cpu),
                 random_seed = random_seed,
                 write = kwargs_params_cpu['write'],
             )
         elif self.kwargs['p21c_run'] == 'lightcone':
             lightcone_cpu = p21c.run_lightcone(
                 redshift = kwargs_params_cpu['redshift'][0],
+                #max_redshift = kwargs_params_cpu['redshift'][-1],
+                z_heat_max = kwargs_params_cpu['redshift'][-1],
                 lightcone_quantities = kwargs_params_cpu['fields'],
                 user_params = kwargs_params_cpu,
                 cosmo_params = p21c.CosmoParams(kwargs_params_cpu),
                 astro_params = p21c.AstroParams(kwargs_params_cpu),
+                flag_options = p21c.FlagOptions(kwargs_params_cpu),
                 random_seed = random_seed,
                 write = kwargs_params_cpu['write'],
             )
                 # break
             except IOError or BlockingIOError:
                 if try_time > 30:
+                    print(f"cpu {multiprocessing.current_process().pid}-{rank}, try_time = {try_time:.2f} sec")
+                    sleep(5)
                 else:
                     sleep(0.1)
     # Save as hdf5
     def save(self, images_node, params_seeds):
+        #max_num_images = None # self.kwargs['num_images']
+        max_num_images = self.kwargs['num_images']
+        #print(f"max_num_images = {max_num_images}")
         with h5py.File(self.kwargs['save_direc_name'], 'a') as f:
             if 'kwargs' not in f.keys():
                 keys = list(self.kwargs)
     args = parser.parse_args()
     params_ranges = dict(
+        ION_Tvir_MIN = [4,6],#4.8,#5.477,#4.699,#5.6,#4.4, #[4,6],
+        HII_EFF_FACTOR = [10,250],#131.341,#200,#30,#19.037,#131.341, #[10, 250],
         )
     kwargs = dict(
         num_images=args.num_images,#2400,#30000,
         fields = ['brightness_temp', 'density', 'xH_box'],
+        BOX_LEN = args.BOX_LEN,#128,
+        HII_DIM = args.HII_DIM,
+        verbose = 3,
+        redshift = [7.51, 21.02],#11.93],
         NON_CUBIC_FACTOR = args.NON_CUBIC_FACTOR,
         write = True,
         cpus_per_node = args.cpus_per_node,#10,#112,#20,
         cache_rmdir = False,
         )
+    now = datetime.datetime.now().strftime("%m%d-%H%M%S")
+    save_name = f"LEN{kwargs['BOX_LEN']}-DIM{kwargs['HII_DIM']}-CUB{kwargs['NON_CUBIC_FACTOR']}-Tvir{params_ranges['ION_Tvir_MIN']}-zeta{params_ranges['HII_EFF_FACTOR']}-{now}.h5"
     kwargs['save_direc_name'] = os.path.join(args.save_direc, save_name)
     generator = Generator(params_ranges, **kwargs)

phoenix_diffusion.sbatch CHANGED Viewed

@@ -2,10 +2,10 @@
 #SBATCH -J diffusion # Job name
 #SBATCH -A gts-jw254-coda20
 #SBATCH -qembers
-#SBATCH -N4 --gpus-per-node=V100:2 -C V100-32GB              # Number of nodes and cores per node required
 #SBATCH --ntasks-per-node=1
 #SBATCH --mem-per-gpu=16G                        # Memory per GPU
-#SBATCH -t 08:00:00                                    # Duration of the job (Ex: 15 mins)
 #SBATCH -oReport-%j                         # Combined output and error messages file
 #SBATCH --error=error-%j
 #SBATCH --mail-type=BEGIN,END,FAIL              # Mail preferences
@@ -30,9 +30,9 @@ export MASTER_PORT=$MASTER_PORT
 srun python diffusion.py \
     --train 1 \
-    --resume outputs/model_state-N3000-device_count1-node8-epoch49-172.27.149.181 \
     --num_new_img_per_gpu 50 \
-    --max_num_img_per_gpu 10 \
 ######################################################################################

 #SBATCH -J diffusion # Job name
 #SBATCH -A gts-jw254-coda20
 #SBATCH -qembers
+#SBATCH -N1 --gpus-per-node=V100:1 -C V100-32GB              # Number of nodes and GPUs per node required
 #SBATCH --ntasks-per-node=1
 #SBATCH --mem-per-gpu=16G                        # Memory per GPU
+#SBATCH -t 00:10:00                                    # Duration of the job (Ex: 15 mins)
 #SBATCH -oReport-%j                         # Combined output and error messages file
 #SBATCH --error=error-%j
 #SBATCH --mail-type=BEGIN,END,FAIL              # Mail preferences
 srun python diffusion.py \
     --train 1 \
+    --resume outputs/model_state-N480-device_count1-node4-epoch49-172.27.149.66 \
     --num_new_img_per_gpu 50 \
+    --max_num_img_per_gpu 2 \
 ######################################################################################

quantify_results.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff