Xsmos committed
Commit 7cf6202 · verified · 1 Parent(s): 6e5400a
Files changed (5)
  1. context_unet.py +33 -12
  2. diffusion.py +14 -9
  3. load_h5.py +27 -19
  4. perlmutter_diffusion.sbatch +5 -5
  5. tensorboard.ipynb +8 -26
context_unet.py CHANGED
@@ -63,44 +63,52 @@ AvgPool = {
 }
 
 class Downsample(nn.Module):
-    def __init__(self, channels, use_conv, out_channels=None, dim=2, stride=(2,2)):
+    def __init__(self, channels, use_conv, out_channels=None, dim=2, stride=(2,2), use_checkpoint=False):
         super().__init__()
         self.channels = channels
         self.out_channels = out_channels or channels
-        # stride = config.stride
+        self.use_checkpoint = use_checkpoint
+        self.dim = dim
         if use_conv:
-            # print("conv")
             self.op = Conv[dim](channels, self.out_channels, 3, stride=stride, padding=1)
         else:
-            # print("pool")
             assert channels == self.out_channels
             self.op = AvgPool[dim](kernel_size=stride, stride=stride)
 
     def forward(self, x):
         assert x.shape[1] == self.channels
-        return self.op(x)
+        if self.use_checkpoint and isinstance(self.op, Conv[self.dim]):
+            print(f"checkpoint working in Downsample")
+            return checkpoint.checkpoint(self.op, x)
+        else:
+            return self.op(x)
 
 class Upsample(nn.Module):
-    def __init__(self, channels, use_conv, out_channels=None, dim=2, stride=(2,2)):
+    def __init__(self, channels, use_conv, out_channels=None, dim=2, stride=(2,2), use_checkpoint=False):
         super().__init__()
         self.channels = channels
         self.out_channels = out_channels
         self.use_conv = use_conv
         self.stride = stride
+        self.use_checkpoint = use_checkpoint
+
         if self.use_conv:
             self.conv = Conv[dim](self.channels, self.out_channels, 3, padding=1)
 
     def forward(self, x):
         assert x.shape[1] == self.channels
-        # stride = config.stride
-        # print(torch.tensor(x.shape[2:]))
-        # print(torch.tensor(stride))
         shape = torch.tensor(x.shape[2:]) * torch.tensor(self.stride)
         shape = tuple(shape.detach().numpy())
         # print(shape)
         x = F.interpolate(x, shape, mode='nearest')
+
         if self.use_conv:
-            x = self.conv(x)
+            if self.use_checkpoint:
+                print(f"checkpoint working in upsample")
+                return checkpoint.checkpoint(self.conv, x)
+            else:
+                x = self.conv(x)
+
         return x
 
 def zero_module(module):
@@ -335,6 +343,7 @@ class ContextUnet(nn.Module):
         #dtype = torch.float32,
     ):
         super().__init__()
+        #self.use_checkpoint = use_checkpoint
 
         if channel_mult == None:
             if image_size == 512:
@@ -433,7 +442,13 @@ class ContextUnet(nn.Module):
                             stride = stride,
                         )
                         if resblock_updown
-                        else Downsample(ch, conv_resample, out_channels=out_ch, dim=dim, stride=stride)
+                        else Downsample(ch,
+                                        conv_resample,
+                                        out_channels=out_ch,
+                                        dim=dim,
+                                        stride=stride,
+                                        #use_checkpoint=use_checkpoint,
+                                        )
                     )
                 )
                 ch = out_ch
@@ -519,7 +534,13 @@ class ContextUnet(nn.Module):
                         stride = stride,
                     )
                     if resblock_updown
-                    else Upsample(ch, conv_resample, out_channels=out_ch, dim=dim, stride=stride)
+                    else Upsample(ch,
+                                  conv_resample,
+                                  out_channels=out_ch,
+                                  dim=dim,
+                                  stride=stride,
+                                  #use_checkpoint=use_checkpoint,
+                                  )
                 )
                 ds //= 2
                 self.output_blocks.append(TimestepEmbedSequential(*layers))
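Note on this change: the new use_checkpoint branches call checkpoint.checkpoint(...), i.e. the torch.utils.checkpoint API, which re-runs the wrapped forward pass during backward instead of storing its activations, trading compute for memory. A minimal runnable sketch of the same pattern, assuming checkpoint is imported from torch.utils (the import is not visible in this hunk) and using an illustrative module rather than the repo's classes:

import torch
import torch.nn as nn
from torch.utils import checkpoint  # assumed import; not shown in the hunk

class CheckpointedConv(nn.Module):
    """Illustrative stand-in for the conv branch of Downsample/Upsample."""
    def __init__(self, channels, use_checkpoint=False):
        super().__init__()
        self.use_checkpoint = use_checkpoint
        self.conv = nn.Conv2d(channels, channels, 3, padding=1)

    def forward(self, x):
        if self.use_checkpoint and self.training:
            # Activations inside self.conv are not kept; they are
            # recomputed during the backward pass to save memory.
            return checkpoint.checkpoint(self.conv, x, use_reentrant=False)
        return self.conv(x)

m = CheckpointedConv(8, use_checkpoint=True)
x = torch.randn(2, 8, 16, 16, requires_grad=True)
m(x).sum().backward()  # gradients flow through the recomputed subgraph

One caveat: the default reentrant checkpoint only records gradients when at least one input requires grad; passing use_reentrant=False (available in recent PyTorch) avoids that restriction.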
diffusion.py CHANGED
@@ -23,7 +23,7 @@ import copy
 from tqdm.auto import tqdm
 # from diffusers import UNet2DModel#, UNet3DConditionModel
 # from diffusers import DDPMScheduler
-import datetime
+from datetime import datetime
 from pathlib import Path
 #from diffusers.optimization import get_cosine_schedule_with_warmup
 #from accelerate import notebook_launcher, Accelerator
@@ -241,8 +241,8 @@ class TrainConfig:
     world_size = 1#torch.cuda.device_count()
     # repeat = 2
 
-    #dim = 2
-    dim = 3#2
+    dim = 2
+    #dim = 3#2
     stride = (2,4) if dim == 2 else (2,2,4)
     num_image = 32#0#0#640#320#6400#3000#480#1200#120#3000#300#3000#6000#30#60#6000#1000#2000#20000#15000#7000#25600#3000#10000#1000#10000#5000#2560#800#2560
     batch_size = 1#1#10#50#10#50#20#50#1#2#50#20#2#100 # 10
@@ -360,7 +360,7 @@ def get_gpu_info(device):
 
 class DDPM21CM:
     def __init__(self, config):
-        config.run_name = datetime.datetime.now().strftime("%d%H%M%S") # the unique name of each experiment
+        config.run_name = datetime.now().strftime("%d%H%M%S") # the unique name of each experiment
         self.config = config
         self.ddpm = DDPMScheduler(betas=(1e-4, 0.02), num_timesteps=config.num_timesteps, img_shape=config.img_shape, device=config.device, config=config,)#, dtype=config.dtype
 
@@ -380,7 +380,7 @@ class DDPM21CM:
            #self.nn_model.module.to(config.dtype)
            print(f"{config.run_name} cuda:{torch.cuda.current_device()}/{self.config.global_rank} resumed nn_model from {config.resume} with {sum(x.numel() for x in self.nn_model.parameters())} parameters".center(self.config.str_len,'+'))
         else:
-            print(f"{config.run_name} cuda:{torch.cuda.current_device()}/{self.config.global_rank} initialized nn_model randomly with {sum(x.numel() for x in self.nn_model.parameters())} parameters".center(self.config.str_len,'+'))
+            print(f"{config.run_name} cuda:{torch.cuda.current_device()}/{self.config.global_rank} initialized nn_model randomly with {sum(x.numel() for x in self.nn_model.parameters())} parameters, {datetime.now().strftime('%d-%H:%M:%S.%f')}".center(self.config.str_len,'+'))
 
         # whether to use ema
         if config.ema:
@@ -412,9 +412,10 @@ class DDPM21CM:
             drop_prob=self.config.drop_prob,
             dim=self.config.dim,
             ranges_dict=self.ranges_dict,
-            num_workers=min(2,len(os.sched_getaffinity(0))//self.config.world_size),
+            num_workers=min(1,len(os.sched_getaffinity(0))//self.config.world_size),
             str_len = self.config.str_len,
         )
+        #print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank}: Dataset4h5 done")
 
         dataloader_start = time()
         self.dataloader = DataLoader(
@@ -488,7 +489,7 @@ class DDPM21CM:
         global_step = 0
         for ep in range(self.config.n_epoch):
             self.ddpm.train()
-            pbar_train = tqdm(total=len(self.dataloader), file=sys.stderr, disable=True)#, mininterval=self.config.pbar_update_step)#, disable=True)#not self.accelerator.is_local_main_process)
+            pbar_train = tqdm(total=len(self.dataloader), file=sys.stderr)#, disable=True)#, mininterval=self.config.pbar_update_step)#, disable=True)#not self.accelerator.is_local_main_process)
             pbar_train.set_description(f"{socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}/{self.config.global_rank} Epoch {ep}")
             epoch_start = time()
             for i, (x, c) in enumerate(self.dataloader):
@@ -507,6 +508,10 @@ class DDPM21CM:
                 loss = F.mse_loss(noise, noise_pred)
                 loss = loss / self.config.gradient_accumulation_steps
 
+                #print(f"loss = {loss}")
+                if torch.isnan(loss).any():
+                    raise ValueError(f"{socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}/{self.config.global_rank} Epoch {ep}, loss: {loss}")
+
                 # scaler backward propogation
                 self.scaler.scale(loss).backward()
                 #loss.backward()
@@ -624,7 +629,7 @@ class DDPM21CM:
        #print(f"x_last.dtype = {x_last.dtype}")
         if save:
            # np.save(os.path.join(self.config.output_dir, f"{self.config.run_name}{'ema' if ema else ''}.npy"), x_last)
-            savetime = datetime.datetime.now().strftime("%d%H%M%S")
+            savetime = datetime.now().strftime("%d%H%M%S")
             savename = os.path.join(self.config.output_dir, f"Tvir{params_backup[0]:.3f}-zeta{params_backup[1]:.3f}-N{self.config.num_image}-device{self.config.global_rank}-{os.path.basename(self.config.resume)}-{savetime}{'ema' if ema else ''}.npy")
             if not os.path.exists(self.config.output_dir):
                 os.makedirs(self.config.output_dir)
@@ -721,7 +726,7 @@ if __name__ == "__main__":
     ############################ training ################################
     if args.train:
         config.dataset_name = args.train
-        print(f" training, ip_addr = {socket.gethostbyname(socket.gethostname())}, master_addr = {master_addr}, local_world_size = {local_world_size}, world_size = {world_size} ".center(config.str_len,'#'))
+        print(f" training, ip = {socket.gethostbyname(socket.gethostname())}, local_world_size = {local_world_size}, world_size = {world_size}, {datetime.now().strftime('%d-%H:%M:%S.%f')} ".center(config.str_len,'#'))
         mp.spawn(
             train,
             args=(world_size, local_world_size, master_addr, master_port, config),
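Note on this change: the added torch.isnan check fails fast when the mixed-precision path (autocast plus the GradScaler backward just below it) produces a non-finite loss, instead of letting NaNs reach the optimizer state or any EMA copy of the weights. A minimal sketch of the same pattern in a generic AMP training step; the model, optimizer, and tensors here are placeholders, and a CUDA device is assumed:

import torch
import torch.nn as nn
import torch.nn.functional as F

model = nn.Linear(10, 10).cuda()            # placeholder for the UNet
optimizer = torch.optim.Adam(model.parameters())
scaler = torch.cuda.amp.GradScaler()

x = torch.randn(4, 10, device="cuda")
noise = torch.randn(4, 10, device="cuda")   # placeholder target

with torch.autocast(device_type="cuda", dtype=torch.float16):
    noise_pred = model(x)
    loss = F.mse_loss(noise, noise_pred)

# Fail fast on a non-finite loss before it contaminates weights or EMA,
# mirroring the torch.isnan(loss).any() guard added in the diff.
if torch.isnan(loss).any():
    raise ValueError(f"loss went NaN: {loss}")

scaler.scale(loss).backward()
scaler.step(optimizer)   # step is skipped internally if grads are inf/NaN
scaler.update()
optimizer.zero_grad()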
load_h5.py CHANGED
@@ -20,7 +20,7 @@ import os
 # from diffusers import DDPMScheduler
 # from diffusers.utils import make_image_grid
 from time import time
-import datetime
+from datetime import datetime
 import concurrent.futures
 import psutil
 # from pathlib import Path
@@ -115,25 +115,33 @@ class Dataset4h5(Dataset):
         elif self.dim == 3:
             self.images = np.empty((self.num_image, 1, self.HII_DIM, self.HII_DIM, self.num_redshift), dtype=np.float32)
         # self.num_workers = len(os.sched_getaffinity(0))//torch.cuda.device_count()
-        concurrent_init_start = time()
-        with concurrent.futures.ProcessPoolExecutor(max_workers=self.num_workers) as executor:
-            concurrent_init_end = time()
-            print(f" {socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}/{self.global_rank}, concurrently loading by {self.num_workers}/{len(os.sched_getaffinity(0))} workers, initialized after {concurrent_init_end-concurrent_init_start:.3f}s ".center(self.str_len, '-'))
-            futures = [None] * self.num_workers
-            for i, idx in enumerate(np.array_split(self.idx, self.num_workers)):
-                executor_start = time()
-                futures[i] = executor.submit(self.read_data_chunk, self.dir_name, idx, torch.cuda.current_device(), concurrent_init_end, executor_start)
-
-            concurrent_start = time()
-            start_idx = 0
-            for future in concurrent.futures.as_completed(futures):
-                images, params = future.result()
-                batch_size = params.shape[0]
-                self.images[start_idx:start_idx+batch_size] = images
-                self.params[start_idx:start_idx+batch_size] = params
-                start_idx += batch_size
-            concurrent_end = time()
-            print(f" {socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}/{self.global_rank}, {start_idx} images {self.images.shape} & params {self.params.shape} loaded after {concurrent_start-concurrent_init_start:.3f}/{concurrent_end-concurrent_start:.3f}s ".center(self.str_len, '-'))
+
+        concurrent_init_start = time()
+        if self.num_workers == 1:
+            print(f"{socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}/{self.global_rank}, loading by {self.num_workers} workers, {datetime.now().strftime('%d-%H:%M:%S.%f')}".center(self.str_len, '-'))
+            self.images, self.params = self.read_data_chunk(self.dir_name, self.idx, torch.cuda.current_device(), concurrent_init_start, concurrent_init_start)
+            self.params = self.params.astype(self.images.dtype)
+            concurrent_start = time()
+            print(f"{socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}/{self.global_rank}, images {self.images.shape} & params {self.params.shape} loaded after {concurrent_start-concurrent_init_start:.3f}s, {datetime.now().strftime('%d-%H:%M:%S.%f')}".center(self.str_len, '-'))
+        else:
+            with concurrent.futures.ProcessPoolExecutor(max_workers=self.num_workers) as executor:
+                concurrent_init_end = time()
+                print(f" {socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}/{self.global_rank}, concurrently loading by {self.num_workers}/{len(os.sched_getaffinity(0))} workers, initialized after {concurrent_init_end-concurrent_init_start:.3f}s ".center(self.str_len, '-'))
+                futures = [None] * self.num_workers
+                for i, idx in enumerate(np.array_split(self.idx, self.num_workers)):
+                    executor_start = time()
+                    futures[i] = executor.submit(self.read_data_chunk, self.dir_name, idx, torch.cuda.current_device(), concurrent_init_end, executor_start)
+
+                concurrent_start = time()
+                start_idx = 0
+                for future in concurrent.futures.as_completed(futures):
+                    images, params = future.result()
+                    batch_size = params.shape[0]
+                    self.images[start_idx:start_idx+batch_size] = images
+                    self.params[start_idx:start_idx+batch_size] = params
+                    start_idx += batch_size
+                concurrent_end = time()
+                print(f" {socket.gethostbyname(socket.gethostname())} cuda:{torch.cuda.current_device()}/{self.global_rank}, {start_idx} images {self.images.shape} & params {self.params.shape} loaded after {concurrent_start-concurrent_init_start:.3f}/{concurrent_end-concurrent_start:.3f}s ".center(self.str_len, '-'))
 
         transform_start = time()
         if self.transform:
@@ -164,7 +172,7 @@ class Dataset4h5(Dataset):
         param_start = time()
         params = f['params']['values'][idx]
         param_end = time()
-        print(f"{socket.gethostbyname(socket.gethostname())}, cuda:{torch.cuda.current_device()}/{self.global_rank}, CPU-pid {cpu_num}-{pid}: images {images.shape} & params {params.shape} loaded after {executor_start-concurrent_init_end:.3f}/{set_device-executor_start:.3f}/{open_h5py-set_device:.3f}/{images_start-open_h5py:.3f}s + {images_end-images_start:.3f}s & {param_end-param_start:.3f}s")
+        print(f"cuda:{torch.cuda.current_device()}/{self.global_rank}, CPU:{cpu_num}, images {images.shape} & params {params.shape} loaded after {executor_start-concurrent_init_end:.3f}/{set_device-executor_start:.3f}/{open_h5py-set_device:.3f}/{images_start-open_h5py:.3f}s + {images_end-images_start:.3f}s & {param_end-param_start:.3f}s")
 
         return images, params
 
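Note on this change: the rewrite adds a single-worker fast path. With num_workers == 1 the whole HDF5 chunk is read in-process, skipping ProcessPoolExecutor startup and inter-process pickling; otherwise the indices are split with np.array_split and the results gathered with as_completed, so chunks land in completion order rather than submission order (images and params stay aligned per chunk either way). A minimal sketch of that split/gather shape, with a stub reader standing in for Dataset4h5.read_data_chunk:

import concurrent.futures
import numpy as np

def read_chunk(idx):
    # Stand-in for an h5py read: return (images, params) for these indices.
    return (np.zeros((len(idx), 1, 8, 8), dtype=np.float32),
            np.zeros((len(idx), 2), dtype=np.float32))

def load(idx, num_workers):
    if num_workers == 1:
        # Fast path: no pool startup or pickling overhead for one chunk.
        return read_chunk(idx)
    images = np.empty((len(idx), 1, 8, 8), dtype=np.float32)
    params = np.empty((len(idx), 2), dtype=np.float32)
    with concurrent.futures.ProcessPoolExecutor(max_workers=num_workers) as ex:
        futures = [ex.submit(read_chunk, part)
                   for part in np.array_split(idx, num_workers)]
        start = 0
        for fut in concurrent.futures.as_completed(futures):
            im, pa = fut.result()          # arrives in completion order
            images[start:start + len(pa)] = im
            params[start:start + len(pa)] = pa
            start += len(pa)
    return images, params

images, params = load(np.arange(32), num_workers=1)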
perlmutter_diffusion.sbatch CHANGED
@@ -3,9 +3,9 @@
 #SBATCH -J diffusion
 #SBATCH -C gpu&hbm80g
 #SBATCH -q regular #shared
-#SBATCH -N10
+#SBATCH -N1
 #SBATCH --gpus-per-node=4
-#SBATCH -t 30:20:00
+#SBATCH -t 02:30:00
 #SBATCH --ntasks-per-node=1
 #SBATCH -oReport-%j
 #SBATCH --mail-type=BEGIN,END,FAIL
@@ -36,12 +36,12 @@ cat $0
 #nvidia-smi
 
 srun python diffusion.py \
-    --num_image 640 \
-    --batch_size 1 \
+    --num_image 6400 \
+    --batch_size 128 \
     --n_epoch 50 \
     --num_new_img_per_gpu 20 \
     --max_num_img_per_gpu 4 \
-    --channel_mult 1 1 2 2 4 4 \
+    --channel_mult 0.5 1 2 2 4 8 \
     --gradient_accumulation_steps 1 \
     --autocast 1 \
     --use_checkpoint 1 \
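Note on this change: the job now requests 1 node with 4 GPUs (down from 10 nodes) and a 2.5 h wall clock, with one task per node that fans out over the local GPUs via mp.spawn in diffusion.py. A hypothetical sketch of how a launcher can derive the distributed layout from SLURM's environment; the repo's actual parsing is not shown in this commit, so the variable handling below is an assumption:

import os
import torch

# One srun task per node; each task spawns one process per local GPU.
local_world_size = torch.cuda.device_count()              # 4 with --gpus-per-node=4
num_nodes = int(os.environ.get("SLURM_JOB_NUM_NODES", "1"))
world_size = num_nodes * local_world_size                 # 1 node x 4 GPUs = 4 ranks

print(f"world_size={world_size}, local_world_size={local_world_size}")
# mp.spawn(train, args=(world_size, local_world_size, ...), nprocs=local_world_size)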
tensorboard.ipynb CHANGED
@@ -2,21 +2,12 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 1,
    "id": "ae45e44e-a11c-43ef-b830-c7a58a72f51e",
    "metadata": {
     "tags": []
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "The tensorboard extension is already loaded. To reload it, use:\n",
-      "  %reload_ext tensorboard\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "import nersc_tensorboard_helper\n",
     "%load_ext tensorboard"
@@ -24,30 +15,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 2,
    "id": "a5c088b8-5051-402f-b4ec-2b684ad5a952",
    "metadata": {},
    "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "Reusing TensorBoard on port 45739 (pid 1821871), started 2 days, 2:32:50 ago. (Use '!kill 1821871' to kill it.)"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
     {
      "data": {
       "text/html": [
        "\n",
-       "      <iframe id=\"tensorboard-frame-905898ab07792b79\" width=\"100%\" height=\"800\" frameborder=\"0\">\n",
+       "      <iframe id=\"tensorboard-frame-497d865784f48ad7\" width=\"100%\" height=\"800\" frameborder=\"0\">\n",
        "      </iframe>\n",
        "      <script>\n",
        "        (function() {\n",
-       "          const frame = document.getElementById(\"tensorboard-frame-905898ab07792b79\");\n",
+       "          const frame = document.getElementById(\"tensorboard-frame-497d865784f48ad7\");\n",
        "          const url = new URL(\"/\", window.location);\n",
-       "          const port = 45739;\n",
+       "          const port = 38971;\n",
        "          if (port) {\n",
        "            url.port = port;\n",
        "          }\n",
@@ -70,14 +52,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 3,
    "id": "2f76c0a9-2218-4073-86aa-f4f655d7642f",
    "metadata": {},
    "outputs": [
    {
     "data": {
      "text/html": [
-      "<a href=\"https://jupyter.nersc.gov/user/binxia/perlmutter-login-node-base/proxy/45739/\">https://jupyter.nersc.gov/user/binxia/perlmutter-login-node-base/proxy/45739/</a>"
+      "<a href=\"https://jupyter.nersc.gov/user/binxia/perlmutter-login-node-base/proxy/38971/\">https://jupyter.nersc.gov/user/binxia/perlmutter-login-node-base/proxy/38971/</a>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"