Xsmos committed on
Commit
e564941
·
verified ·
1 Parent(s): f890a31
context_unet.py CHANGED
@@ -32,15 +32,11 @@ class GroupNorm32(nn.GroupNorm):
32
  self.swish = swish
33
 
34
  def forward(self, x):
35
- #print(f"GroupNorm32, x.dtype = {x.dtype}, x.float().dtype = {x.float().dtype}, swish = {self.swish}")
36
- #y = super().forward(x.float()).to(x.dtype)
37
  y = super().forward(x)
38
- #print(f"swish == {self.swish}, {y.dtype}")
39
  if self.swish == 1.0:
40
  y = F.silu(y)
41
  elif self.swish:
42
  y = y * F.sigmoid(y * float(self.swish))
43
- #print(f"swish == {self.swish}, {y.dtype}")
44
  return y
45
 
46
  def normalization(channels, swish=0.0):
@@ -191,8 +187,7 @@ class ResBlock(TimestepBlock):
191
  h = in_conv(h)
192
  else:
193
  h = self.in_layers(x)
194
- # print("forward, h.dtype =", h.dtype)
195
- emb_out = self.emb_layers(emb).type(h.dtype)
196
 
197
  while len(emb_out.shape) < len(h.shape):
198
  emb_out = emb_out[..., None]
@@ -230,7 +225,7 @@ class QKVAttention(nn.Module):
230
  scale = 1 / math.sqrt(math.sqrt(ch))
231
  weight = torch.einsum("bct,bcs->bts", q*scale, k*scale)
232
  # print("forward, weight.dtype =", weight.dtype)
233
- weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
234
 
235
  a = torch.einsum("bts,bcs->bct", weight, v)
236
  return a.reshape(bs, -1, length)
@@ -290,7 +285,7 @@ def timestep_embedding(timesteps, dim, max_period=10000):
290
  #print(f"timestep_embedding is running")
291
  half = dim // 2
292
  freqs = torch.exp(
293
- -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
294
  ).to(device=timesteps.device)
295
  #print (timesteps[:, None].float().shape,freqs[None].shape)
296
  args = timesteps[:, None].float() * freqs[None]
@@ -322,7 +317,7 @@ class ContextUnet(nn.Module):
322
  encoder_channels = None,
323
  dim = 2,
324
  stride = (2,2),
325
- dtype = torch.float32,
326
  ):
327
  super().__init__()
328
 
@@ -356,7 +351,7 @@ class ContextUnet(nn.Module):
356
  # self.n_param = n_param
357
  self.model_channels = model_channels
358
  # self.use_fp16 = use_fp16
359
- self.dtype = dtype#torch.float16 if self.use_fp16 else torch.float32
360
 
361
  self.token_embedding = nn.Linear(n_param, model_channels * 4)
362
 
@@ -526,15 +521,15 @@ class ContextUnet(nn.Module):
526
  def forward(self, x, timesteps, y=None):
527
  hs = []
528
  # print("device of timesteps, self.model_channels:", timesteps.device, self.model_channels)
529
- emb = self.time_embed(timestep_embedding(timesteps, self.model_channels).to(self.dtype))
530
  #print(f"forward after emb")
531
  if y != None:
532
  #text_outputs = self.token_embedding(y.float())
533
- text_outputs = self.token_embedding(y.to(self.dtype))
534
  emb = emb + text_outputs.to(emb)
535
 
536
  #print("forward, h = x.type(self.dtype), self.dtype =", self.dtype)
537
- h = x.type(self.dtype)
538
  #print("0,h.shape =", h.shape)
539
  for module in self.input_blocks:
540
  h = module(h, emb)
@@ -552,7 +547,7 @@ class ContextUnet(nn.Module):
552
  # print("module decoder, h.shape =", h.shape)
553
 
554
  #print("h = h.type(x.dtype), x.dtype =", x.dtype, h.dtype)
555
- h = h.type(x.dtype)
556
  h = self.out(h)
557
  #print("self.out(h)", "h.dtype =", h.dtype)
558
 
 
32
  self.swish = swish
33
 
34
  def forward(self, x):
 
 
35
  y = super().forward(x)
 
36
  if self.swish == 1.0:
37
  y = F.silu(y)
38
  elif self.swish:
39
  y = y * F.sigmoid(y * float(self.swish))
 
40
  return y
41
 
42
  def normalization(channels, swish=0.0):
 
187
  h = in_conv(h)
188
  else:
189
  h = self.in_layers(x)
190
+ emb_out = self.emb_layers(emb)#.type(h.dtype)
 
191
 
192
  while len(emb_out.shape) < len(h.shape):
193
  emb_out = emb_out[..., None]
 
225
  scale = 1 / math.sqrt(math.sqrt(ch))
226
  weight = torch.einsum("bct,bcs->bts", q*scale, k*scale)
227
  # print("forward, weight.dtype =", weight.dtype)
228
+ weight = torch.softmax(weight.float(), dim=-1)#.type(weight.dtype)
229
 
230
  a = torch.einsum("bts,bcs->bct", weight, v)
231
  return a.reshape(bs, -1, length)
 
285
  #print(f"timestep_embedding is running")
286
  half = dim // 2
287
  freqs = torch.exp(
288
+ -math.log(max_period) * torch.arange(start=0, end=half) / half #, dtype=torch.float32) / half
289
  ).to(device=timesteps.device)
290
  #print (timesteps[:, None].float().shape,freqs[None].shape)
291
  args = timesteps[:, None].float() * freqs[None]
 
317
  encoder_channels = None,
318
  dim = 2,
319
  stride = (2,2),
320
+ #dtype = torch.float32,
321
  ):
322
  super().__init__()
323
 
 
351
  # self.n_param = n_param
352
  self.model_channels = model_channels
353
  # self.use_fp16 = use_fp16
354
+ #self.dtype = dtype#torch.float16 if self.use_fp16 else torch.float32
355
 
356
  self.token_embedding = nn.Linear(n_param, model_channels * 4)
357
 
 
521
  def forward(self, x, timesteps, y=None):
522
  hs = []
523
  # print("device of timesteps, self.model_channels:", timesteps.device, self.model_channels)
524
+ emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))#.to(self.dtype))
525
  #print(f"forward after emb")
526
  if y != None:
527
  #text_outputs = self.token_embedding(y.float())
528
+ text_outputs = self.token_embedding(y)#.to(self.dtype))
529
  emb = emb + text_outputs.to(emb)
530
 
531
  #print("forward, h = x.type(self.dtype), self.dtype =", self.dtype)
532
+ h = x.clone()#.type(self.dtype)
533
  #print("0,h.shape =", h.shape)
534
  for module in self.input_blocks:
535
  h = module(h, emb)
 
547
  # print("module decoder, h.shape =", h.shape)
548
 
549
  #print("h = h.type(x.dtype), x.dtype =", x.dtype, h.dtype)
550
+ #h = h.type(x.dtype)
551
  h = self.out(h)
552
  #print("self.out(h)", "h.dtype =", h.dtype)
553
 
diffusion.py CHANGED
@@ -77,6 +77,8 @@ import sys
77
  from datetime import timedelta
78
  from time import time
79
 
 
 
80
  # %%
81
  def ddp_setup(rank: int, world_size: int, master_addr, master_port):
82
  """
@@ -117,9 +119,9 @@ def ddp_setup(rank: int, world_size: int, master_addr, master_port):
117
 
118
  # %%
119
  class DDPMScheduler(nn.Module):
120
- def __init__(self, betas: tuple, num_timesteps: int, img_shape: list, device='cpu', dtype=torch.float16, config=None):
121
  super().__init__()
122
- self.dtype = dtype#torch.float16 if self.use_fp16 else torch.float32
123
 
124
  beta_1, beta_T = betas
125
  assert 0 < beta_1 <= beta_T <= 1, "ensure 0 < beta_1 <= beta_T <= 1"
@@ -127,7 +129,7 @@ class DDPMScheduler(nn.Module):
127
  self.num_timesteps = num_timesteps
128
  self.img_shape = img_shape
129
  self.beta_t = torch.linspace(beta_1, beta_T, self.num_timesteps) #* (beta_T-beta_1) + beta_1
130
- self.beta_t = self.beta_t.to(self.dtype)
131
  self.beta_t = self.beta_t.to(self.device)
132
 
133
  # self.drop_prob = drop_prob
@@ -160,7 +162,7 @@ class DDPMScheduler(nn.Module):
160
  def sample(self, nn_model, params, device, guide_w = 0):
161
  n_sample = len(params) #params.shape[0]
162
  # print("params.shape[0], len(params)", params.shape[0], len(params))
163
- x_i = torch.randn(n_sample, *self.img_shape).to(self.dtype)
164
  x_i = x_i.to(device)
165
  #print(f"#1 x_i.device = {x_i.device}")
166
  # print("x_i.shape =", x_i.shape)
@@ -171,7 +173,7 @@ class DDPMScheduler(nn.Module):
171
  # uncond_tokens = torch.tensor(np.float32(np.array([0,0]))).to(device)
172
  # uncond_tokens = uncond_tokens.repeat(int(n_sample),1)
173
  #c_i = torch.cat((c_i, uncond_tokens), 0)
174
- c_i = c_i.to(self.dtype)
175
 
176
  x_i_entire = [] # keep track of generated steps in case want to plot something
177
  # print("self.num_timesteps =", self.num_timesteps)
@@ -183,14 +185,14 @@ class DDPMScheduler(nn.Module):
183
  # print(f'sampling timestep {i:4d}',end='\r')
184
  t_is = torch.tensor([i]).to(device)
185
  t_is = t_is.repeat(n_sample)
186
- t_is = t_is.to(self.dtype)
187
 
188
  z = torch.randn(n_sample, *self.img_shape).to(device) if i > 0 else torch.tensor(0.)
189
- z = z.to(self.dtype)
190
 
191
  if guide_w == -1:
192
  # eps = nn_model(x_i, t_is, return_dict=False)[0]
193
- eps = nn_model(x_i, t_is).to(self.dtype)
194
  # x_i = 1/torch.sqrt(self.alpha_t[i])*(x_i-eps*self.beta_t[i]/torch.sqrt(1-self.bar_alpha_t[i])) + torch.sqrt(self.beta_t[i])*z
195
  else:
196
  # double batch
@@ -201,7 +203,7 @@ class DDPMScheduler(nn.Module):
201
  # split predictions and compute weighting
202
  # print("nn_model input shape", x_i.shape, t_is.shape, c_i.shape)
203
  #print(f"sample, i = {i}, x_i.dtype = {x_i.dtype}, c_i.dtype = {c_i.dtype}")
204
- eps = nn_model(x_i, t_is, c_i).to(self.dtype)
205
  #eps1 = eps[:n_sample]
206
  #eps2 = eps[n_sample:]
207
  #eps = eps1 + guide_w*(eps1 - eps2)
@@ -317,8 +319,8 @@ class TrainConfig:
317
  # data_dir = './data' # data directory
318
 
319
  #use_fp16 = True
320
- dtype = torch.float32 #if use_fp16 else torch.float32
321
- mixed_precision = "no" #"fp16"
322
  gradient_accumulation_steps = 1
323
 
324
  pbar_update_step = 20
@@ -389,11 +391,11 @@ class DDPM21CM:
389
  # self.dataloader = DataLoader(dataset, batch_size=config.batch_size, shuffle=True)
390
  # del dataset
391
  # print("self.ddpm = DDPMScheduler")
392
- self.ddpm = DDPMScheduler(betas=(1e-4, 0.02), num_timesteps=config.num_timesteps, img_shape=config.img_shape, device=config.device, dtype=config.dtype, config=config,)
393
 
394
  # print("self.nn_model = ContextUnet")
395
  # initialize the unet
396
- self.nn_model = ContextUnet(n_param=config.n_param, image_size=config.HII_DIM, dim=config.dim, stride=config.stride, dtype=config.dtype)
397
 
398
  # print("self.nn_model.train()")
399
  # nn_model = ContextUnet(n_param=1, image_size=28)
@@ -410,7 +412,7 @@ class DDPM21CM:
410
  # self.nn_model.load_state_dict(torch.load(config.resume)['unet_state_dict'])
411
  # print(f"resumed nn_model from {config.resume}")
412
  self.nn_model.module.load_state_dict(torch.load(config.resume)['unet_state_dict'])
413
- self.nn_model.module.to(config.dtype)
414
  print(f" {config.run_name} {socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}/{self.config.global_rank} resumed nn_model from {config.resume} with {sum(x.numel() for x in self.nn_model.parameters())} parameters ".center(120,'+'))
415
  else:
416
  print(f" {config.run_name} {socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}/{self.config.global_rank} initialized nn_model randomly with {sum(x.numel() for x in self.nn_model.parameters())} parameters ".center(120,'+'))
@@ -422,7 +424,7 @@ class DDPM21CM:
422
  if config.ema:
423
  self.ema = EMA(config.ema_rate)
424
  if config.resume and os.path.exists(config.resume):
425
- self.ema_model = ContextUnet(n_param=config.n_param, image_size=config.HII_DIM, dim=config.dim, stride=config.stride, dtype=config.dtype).to(config.device)
426
  self.ema_model.load_state_dict(torch.load(config.resume)['ema_unet_state_dict'])
427
  print(f"resumed ema_model from {config.resume}")
428
  else:
@@ -435,6 +437,7 @@ class DDPM21CM:
435
  )
436
 
437
  self.ranges_dict = config.ranges_dict
 
438
 
439
  def load(self):
440
  # rank = torch.cuda.current_device()
@@ -553,27 +556,37 @@ class DDPM21CM:
553
 
554
  # print(f"cuda:{torch.cuda.current_device()}, x[:,0,:2,0,0] =", x[:,0,:2,0,0])
555
  #with self.accelerator.accumulate(self.nn_model):
556
- x = x.to(self.config.device)
557
  # print("x = x.to(self.config.device), x.dtype =", x.dtype)
558
- x = x.to(self.config.dtype)
559
  # print("x = x.to(self.dtype), x.dtype =", x.dtype)
560
  # print(f"ddpm.add_noise(x), x.dtype = {x.dtype}")
561
- xt, noise, ts = self.ddpm.add_noise(x)
562
  # print(f"ddpm.add_noise(x), xt.dtype = {xt.dtype}")
563
- if self.config.guide_w == -1:
564
- noise_pred = self.nn_model(xt, ts).to(x.dtype)
565
- else:
566
- c = c.to(self.config.device)
567
- noise_pred = self.nn_model(xt, ts, c).to(x.dtype)
 
 
 
 
 
568
 
569
- loss = F.mse_loss(noise, noise_pred)
570
- loss = loss / self.config.gradient_accumulation_steps
571
- loss.backward()
 
 
 
572
 
573
  if (i+1) % self.config.gradient_accumulation_steps == 0:
 
574
  torch.nn.utils.clip_grad_norm_(self.nn_model.parameters(), max_norm=1.0)
575
- self.optimizer.step()
 
576
  self.lr_scheduler.step()
 
 
577
  self.optimizer.zero_grad()
578
 
579
  # ema update
@@ -826,7 +839,7 @@ if __name__ == "__main__":
826
  max_num_img_per_gpu = args.max_num_img_per_gpu#40#2#20
827
  #config = TrainConfig()
828
  #config.world_size = world_size
829
- config.dtype = torch.float32
830
  config.resume = args.resume
831
  #config.gradient_accumulation_steps = args.gradient_accumulation_steps
832
  # config.resume = f"./outputs/model_state-N30-device_count3-epoch4-172.27.149.181"
 
77
  from datetime import timedelta
78
  from time import time
79
 
80
+ from torch.cuda.amp import autocast, GradScaler
81
+
82
  # %%
83
  def ddp_setup(rank: int, world_size: int, master_addr, master_port):
84
  """
 
119
 
120
  # %%
121
  class DDPMScheduler(nn.Module):
122
+ def __init__(self, betas: tuple, num_timesteps: int, img_shape: list, device='cpu', config=None):#, dtype=torch.float16,
123
  super().__init__()
124
+ #self.dtype = dtype#torch.float16 if self.use_fp16 else torch.float32
125
 
126
  beta_1, beta_T = betas
127
  assert 0 < beta_1 <= beta_T <= 1, "ensure 0 < beta_1 <= beta_T <= 1"
 
129
  self.num_timesteps = num_timesteps
130
  self.img_shape = img_shape
131
  self.beta_t = torch.linspace(beta_1, beta_T, self.num_timesteps) #* (beta_T-beta_1) + beta_1
132
+ #self.beta_t = self.beta_t.to(self.dtype)
133
  self.beta_t = self.beta_t.to(self.device)
134
 
135
  # self.drop_prob = drop_prob
 
162
  def sample(self, nn_model, params, device, guide_w = 0):
163
  n_sample = len(params) #params.shape[0]
164
  # print("params.shape[0], len(params)", params.shape[0], len(params))
165
+ x_i = torch.randn(n_sample, *self.img_shape)#.to(self.dtype)
166
  x_i = x_i.to(device)
167
  #print(f"#1 x_i.device = {x_i.device}")
168
  # print("x_i.shape =", x_i.shape)
 
173
  # uncond_tokens = torch.tensor(np.float32(np.array([0,0]))).to(device)
174
  # uncond_tokens = uncond_tokens.repeat(int(n_sample),1)
175
  #c_i = torch.cat((c_i, uncond_tokens), 0)
176
+ #c_i = c_i.to(self.dtype)
177
 
178
  x_i_entire = [] # keep track of generated steps in case want to plot something
179
  # print("self.num_timesteps =", self.num_timesteps)
 
185
  # print(f'sampling timestep {i:4d}',end='\r')
186
  t_is = torch.tensor([i]).to(device)
187
  t_is = t_is.repeat(n_sample)
188
+ #t_is = t_is.to(self.dtype)
189
 
190
  z = torch.randn(n_sample, *self.img_shape).to(device) if i > 0 else torch.tensor(0.)
191
+ #z = z.to(self.dtype)
192
 
193
  if guide_w == -1:
194
  # eps = nn_model(x_i, t_is, return_dict=False)[0]
195
+ eps = nn_model(x_i, t_is)#.to(self.dtype)
196
  # x_i = 1/torch.sqrt(self.alpha_t[i])*(x_i-eps*self.beta_t[i]/torch.sqrt(1-self.bar_alpha_t[i])) + torch.sqrt(self.beta_t[i])*z
197
  else:
198
  # double batch
 
203
  # split predictions and compute weighting
204
  # print("nn_model input shape", x_i.shape, t_is.shape, c_i.shape)
205
  #print(f"sample, i = {i}, x_i.dtype = {x_i.dtype}, c_i.dtype = {c_i.dtype}")
206
+ eps = nn_model(x_i, t_is, c_i)#.to(self.dtype)
207
  #eps1 = eps[:n_sample]
208
  #eps2 = eps[n_sample:]
209
  #eps = eps1 + guide_w*(eps1 - eps2)
 
319
  # data_dir = './data' # data directory
320
 
321
  #use_fp16 = True
322
+ #dtype = torch.float32 #if use_fp16 else torch.float32
323
+ #mixed_precision = "no" #"fp16"
324
  gradient_accumulation_steps = 1
325
 
326
  pbar_update_step = 20
 
391
  # self.dataloader = DataLoader(dataset, batch_size=config.batch_size, shuffle=True)
392
  # del dataset
393
  # print("self.ddpm = DDPMScheduler")
394
+ self.ddpm = DDPMScheduler(betas=(1e-4, 0.02), num_timesteps=config.num_timesteps, img_shape=config.img_shape, device=config.device, config=config,)#, dtype=config.dtype
395
 
396
  # print("self.nn_model = ContextUnet")
397
  # initialize the unet
398
+ self.nn_model = ContextUnet(n_param=config.n_param, image_size=config.HII_DIM, dim=config.dim, stride=config.stride)#, dtype=config.dtype)
399
 
400
  # print("self.nn_model.train()")
401
  # nn_model = ContextUnet(n_param=1, image_size=28)
 
412
  # self.nn_model.load_state_dict(torch.load(config.resume)['unet_state_dict'])
413
  # print(f"resumed nn_model from {config.resume}")
414
  self.nn_model.module.load_state_dict(torch.load(config.resume)['unet_state_dict'])
415
+ #self.nn_model.module.to(config.dtype)
416
  print(f" {config.run_name} {socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}/{self.config.global_rank} resumed nn_model from {config.resume} with {sum(x.numel() for x in self.nn_model.parameters())} parameters ".center(120,'+'))
417
  else:
418
  print(f" {config.run_name} {socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}/{self.config.global_rank} initialized nn_model randomly with {sum(x.numel() for x in self.nn_model.parameters())} parameters ".center(120,'+'))
 
424
  if config.ema:
425
  self.ema = EMA(config.ema_rate)
426
  if config.resume and os.path.exists(config.resume):
427
+ self.ema_model = ContextUnet(n_param=config.n_param, image_size=config.HII_DIM, dim=config.dim, stride=config.stride).to(config.device)#, dtype=config.dtype
428
  self.ema_model.load_state_dict(torch.load(config.resume)['ema_unet_state_dict'])
429
  print(f"resumed ema_model from {config.resume}")
430
  else:
 
437
  )
438
 
439
  self.ranges_dict = config.ranges_dict
440
+ self.scaler = GradScaler()
441
 
442
  def load(self):
443
  # rank = torch.cuda.current_device()
 
556
 
557
  # print(f"cuda:{torch.cuda.current_device()}, x[:,0,:2,0,0] =", x[:,0,:2,0,0])
558
  #with self.accelerator.accumulate(self.nn_model):
559
+ x = x.to(self.config.device)#.to(self.config.dtype)
560
  # print("x = x.to(self.config.device), x.dtype =", x.dtype)
 
561
  # print("x = x.to(self.dtype), x.dtype =", x.dtype)
562
  # print(f"ddpm.add_noise(x), x.dtype = {x.dtype}")
 
563
  # print(f"ddpm.add_noise(x), xt.dtype = {xt.dtype}")
564
+
565
+ # autocast forward propagation
566
+ with autocast():
567
+ xt, noise, ts = self.ddpm.add_noise(x)
568
+
569
+ if self.config.guide_w == -1:
570
+ noise_pred = self.nn_model(xt, ts)#.to(x.dtype)
571
+ else:
572
+ c = c.to(self.config.device)
573
+ noise_pred = self.nn_model(xt, ts, c)#.to(x.dtype)
574
 
575
+ loss = F.mse_loss(noise, noise_pred)
576
+ loss = loss / self.config.gradient_accumulation_steps
577
+
578
+ # scaler backward propagation
579
+ self.scaler.scale(loss).backward()
580
+ #loss.backward()
581
 
582
  if (i+1) % self.config.gradient_accumulation_steps == 0:
583
+ self.scaler.unscale_(self.optimizer)
584
  torch.nn.utils.clip_grad_norm_(self.nn_model.parameters(), max_norm=1.0)
585
+
586
+ self.scaler.step(self.optimizer)
587
  self.lr_scheduler.step()
588
+
589
+ self.scaler.update()
590
  self.optimizer.zero_grad()
591
 
592
  # ema update
 
839
  max_num_img_per_gpu = args.max_num_img_per_gpu#40#2#20
840
  #config = TrainConfig()
841
  #config.world_size = world_size
842
+ #config.dtype = torch.float32
843
  config.resume = args.resume
844
  #config.gradient_accumulation_steps = args.gradient_accumulation_steps
845
  # config.resume = f"./outputs/model_state-N30-device_count3-epoch4-172.27.149.181"
perlmutter_diffusion.sbatch CHANGED
@@ -5,7 +5,7 @@
5
  #SBATCH -q shared #regular
6
  #SBATCH -N1
7
  #SBATCH --gpus-per-node=1
8
- #SBATCH -t 0:30:00
9
  #SBATCH --ntasks-per-node=1
10
  #SBATCH -oReport-%j
11
  #SBATCH --mail-type=BEGIN,END,FAIL
@@ -42,6 +42,6 @@ srun python diffusion.py \
42
  --gradient_accumulation_steps 1 \
43
  --num_new_img_per_gpu 800 \
44
  --max_num_img_per_gpu 80 \
45
- #--resume outputs/model-N3200-device_count1-node1-epoch99-07213338 \
46
 
47
  date
 
5
  #SBATCH -q shared #regular
6
  #SBATCH -N1
7
  #SBATCH --gpus-per-node=1
8
+ #SBATCH -t 0:59:00
9
  #SBATCH --ntasks-per-node=1
10
  #SBATCH -oReport-%j
11
  #SBATCH --mail-type=BEGIN,END,FAIL
 
42
  --gradient_accumulation_steps 1 \
43
  --num_new_img_per_gpu 800 \
44
  --max_num_img_per_gpu 80 \
45
+ #--resume outputs/model-N3200-device_count1-node1-epoch99-16103542 \
46
 
47
  date
quantify_results.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
tensorboard.ipynb ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "ae45e44e-a11c-43ef-b830-c7a58a72f51e",
7
+ "metadata": {
8
+ "tags": []
9
+ },
10
+ "outputs": [],
11
+ "source": [
12
+ "import nersc_tensorboard_helper\n",
13
+ "%load_ext tensorboard"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "code",
18
+ "execution_count": 2,
19
+ "id": "a5c088b8-5051-402f-b4ec-2b684ad5a952",
20
+ "metadata": {},
21
+ "outputs": [
22
+ {
23
+ "data": {
24
+ "text/html": [
25
+ "\n",
26
+ " <iframe id=\"tensorboard-frame-262245829087dd6a\" width=\"100%\" height=\"800\" frameborder=\"0\">\n",
27
+ " </iframe>\n",
28
+ " <script>\n",
29
+ " (function() {\n",
30
+ " const frame = document.getElementById(\"tensorboard-frame-262245829087dd6a\");\n",
31
+ " const url = new URL(\"/\", window.location);\n",
32
+ " const port = 45355;\n",
33
+ " if (port) {\n",
34
+ " url.port = port;\n",
35
+ " }\n",
36
+ " frame.src = url;\n",
37
+ " })();\n",
38
+ " </script>\n",
39
+ " "
40
+ ],
41
+ "text/plain": [
42
+ "<IPython.core.display.HTML object>"
43
+ ]
44
+ },
45
+ "metadata": {},
46
+ "output_type": "display_data"
47
+ }
48
+ ],
49
+ "source": [
50
+ "%tensorboard --logdir logs --port 0"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "code",
55
+ "execution_count": 3,
56
+ "id": "2f76c0a9-2218-4073-86aa-f4f655d7642f",
57
+ "metadata": {},
58
+ "outputs": [
59
+ {
60
+ "data": {
61
+ "text/html": [
62
+ "<a href=\"https://jupyter.nersc.gov/user/binxia/perlmutter-login-node-base/proxy/45355/\">https://jupyter.nersc.gov/user/binxia/perlmutter-login-node-base/proxy/45355/</a>"
63
+ ],
64
+ "text/plain": [
65
+ "<IPython.core.display.HTML object>"
66
+ ]
67
+ },
68
+ "metadata": {},
69
+ "output_type": "display_data"
70
+ }
71
+ ],
72
+ "source": [
73
+ "nersc_tensorboard_helper.tb_address()"
74
+ ]
75
+ }
76
+ ],
77
+ "metadata": {
78
+ "kernelspec": {
79
+ "display_name": "tensorflow-2.15.0",
80
+ "language": "python",
81
+ "name": "tensorflow-2.15.0"
82
+ },
83
+ "language_info": {
84
+ "codemirror_mode": {
85
+ "name": "ipython",
86
+ "version": 3
87
+ },
88
+ "file_extension": ".py",
89
+ "mimetype": "text/x-python",
90
+ "name": "python",
91
+ "nbconvert_exporter": "python",
92
+ "pygments_lexer": "ipython3",
93
+ "version": "3.9.18"
94
+ }
95
+ },
96
+ "nbformat": 4,
97
+ "nbformat_minor": 5
98
+ }