ho22joshua committed on
Commit
a3b9c93
·
1 Parent(s): d9c0f2d

profiling in training script

Browse files
root_gnn_dgl/scripts/training_script.py CHANGED
@@ -45,10 +45,10 @@ def gpu_mem():
45
  # except:
46
  # pass
47
  print(f'Current GPU memory usage: {torch.cuda.memory_allocated() / 1024 / 1024 / 1024} GB')
48
- print(f'Current GPU cache usage: {torch.cuda.memory_cached() / 1024 / 1024 / 1024} GB')
49
- print(f'Current GPU max memory usage: {torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024} GB')
50
- print(f'Current GPU max cache usage: {torch.cuda.max_memory_cached() / 1024 / 1024 / 1024} GB')
51
- print(f'Numel in current tensors: {sum}')
52
  mem()
53
 
54
 
@@ -263,11 +263,20 @@ def train(train_loaders, test_loaders, model, device, config, args, rank):
263
  for epoch in range(starting_epoch, config['Training']['epochs']):
264
  start = time.time()
265
  run = start
 
 
 
 
 
 
 
 
 
266
  if (args.multigpu or args.multinode):
267
  dist.barrier()
268
- if (epoch == 2):
269
- # torch.cuda.cudart().cudaProfilerStart()
270
- pass
271
 
272
  # training
273
  model.train()
@@ -292,6 +301,8 @@ def train(train_loaders, test_loaders, model, device, config, args, rank):
292
  if is_padded: #Padding the globals to match padded graphs.
293
  global_feats = torch.concatenate((global_feats, torch.zeros(1, len(global_feats[0])).to(device)))
294
  load = time.time()
 
 
295
  if (len(logits) == 0):
296
  logits = model(graph, global_feats)
297
  tlabels = label
@@ -302,6 +313,9 @@ def train(train_loaders, test_loaders, model, device, config, args, rank):
302
  weights = torch.concatenate((weights, track[:,1]), dim=0)
303
  batch_lengths.append(logits.shape[0] - 1)
304
 
 
 
 
305
  if is_padded:
306
  keepmask = torch.full_like(logits[:,0], True, dtype=torch.bool)
307
  keepmask[batch_lengths] = False
@@ -340,11 +354,15 @@ def train(train_loaders, test_loaders, model, device, config, args, rank):
340
  normalized_loss += label_loss
341
  loss = normalized_loss / len(unique_labels)
342
 
343
-
 
344
  optimizer.zero_grad()
345
  loss.backward()
346
  optimizer.step()
347
  total_loss += loss.detach().cpu().item()
 
 
 
348
  ibatch += 1
349
  cumulative_times[0] += batch_start - run
350
  cumulative_times[1] += load - batch_start
@@ -366,6 +384,10 @@ def train(train_loaders, test_loaders, model, device, config, args, rank):
366
  labels = []
367
  weights = []
368
  model.eval()
 
 
 
 
369
  with torch.no_grad():
370
  for loader in test_loaders:
371
  for batch, label, track, global_feats in loader:
@@ -386,6 +408,9 @@ def train(train_loaders, test_loaders, model, device, config, args, rank):
386
  eval_end = time.time()
387
  cumulative_times[3] += eval_end - run
388
 
 
 
 
389
  if scores == []: #If validation set is empty.
390
  continue
391
  logits = torch.concatenate(scores).to(device)
@@ -579,6 +604,10 @@ def train(train_loaders, test_loaders, model, device, config, args, rank):
579
  custom_scheduler.step(model, {'test_auc':test_auc})
580
  scheduler.step()
581
 
 
 
 
 
582
  print(f"Load: {cumulative_times[0]:.4f} s")
583
  print(f"Batch: {cumulative_times[1]:.4f} s")
584
  print(f"Train: {cumulative_times[2]:.4f} s")
@@ -788,6 +817,7 @@ if __name__ == "__main__":
788
  add_arg("--directory", type=str, help="Append to Training Directory")
789
  add_arg("--seed", type=int, default=2, help="Sets random seed")
790
  add_arg("--abs", action="store_true", help="Use abs value of per-event weight")
 
791
 
792
  pargs = parser.parse_args()
793
 
 
45
  # except:
46
  # pass
47
  print(f'Current GPU memory usage: {torch.cuda.memory_allocated() / 1024 / 1024 / 1024} GB')
48
+ # print(f'Current GPU cache usage: {torch.cuda.memory_cached() / 1024 / 1024 / 1024} GB')
49
+ # print(f'Current GPU max memory usage: {torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024} GB')
50
+ # print(f'Current GPU max cache usage: {torch.cuda.max_memory_cached() / 1024 / 1024 / 1024} GB')
51
+ # print(f'Numel in current tensors: {sum}')
52
  mem()
53
 
54
 
 
263
  for epoch in range(starting_epoch, config['Training']['epochs']):
264
  start = time.time()
265
  run = start
266
+ if (args.profile):
267
+ if (epoch == 2):
268
+ torch.cuda.cudart().cudaProfilerStart()
269
+ if (epoch == 3):
270
+ print("Done profiling")
271
+ torch.cuda.cudart().cudaProfilerStop()
272
+ torch.cuda.nvtx.range_push("Epoch Start")
273
+ print("executed push")
274
+
275
  if (args.multigpu or args.multinode):
276
  dist.barrier()
277
+
278
+ if (epoch == 5):
279
+ exit
280
 
281
  # training
282
  model.train()
 
301
  if is_padded: #Padding the globals to match padded graphs.
302
  global_feats = torch.concatenate((global_feats, torch.zeros(1, len(global_feats[0])).to(device)))
303
  load = time.time()
304
+ if (args.profile):
305
+ torch.cuda.nvtx.range_push("Model Forward")
306
  if (len(logits) == 0):
307
  logits = model(graph, global_feats)
308
  tlabels = label
 
313
  weights = torch.concatenate((weights, track[:,1]), dim=0)
314
  batch_lengths.append(logits.shape[0] - 1)
315
 
316
+ if (args.profile):
317
+ torch.cuda.nvtx.range_pop() # popping model forward
318
+
319
  if is_padded:
320
  keepmask = torch.full_like(logits[:,0], True, dtype=torch.bool)
321
  keepmask[batch_lengths] = False
 
354
  normalized_loss += label_loss
355
  loss = normalized_loss / len(unique_labels)
356
 
357
+ if (args.profile):
358
+ torch.cuda.nvtx.range_push("Model Backward")
359
  optimizer.zero_grad()
360
  loss.backward()
361
  optimizer.step()
362
  total_loss += loss.detach().cpu().item()
363
+
364
+ if (args.profile):
365
+ torch.cuda.nvtx.range_pop() # pop model backward
366
  ibatch += 1
367
  cumulative_times[0] += batch_start - run
368
  cumulative_times[1] += load - batch_start
 
384
  labels = []
385
  weights = []
386
  model.eval()
387
+
388
+ if (args.profile):
389
+ torch.cuda.nvtx.range_push("Model Evaluation")
390
+
391
  with torch.no_grad():
392
  for loader in test_loaders:
393
  for batch, label, track, global_feats in loader:
 
408
  eval_end = time.time()
409
  cumulative_times[3] += eval_end - run
410
 
411
+ if (args.profile):
412
+ torch.cuda.nvtx.range_pop() # pop evaluation
413
+
414
  if scores == []: #If validation set is empty.
415
  continue
416
  logits = torch.concatenate(scores).to(device)
 
604
  custom_scheduler.step(model, {'test_auc':test_auc})
605
  scheduler.step()
606
 
607
+ if (args.profile):
608
+ torch.cuda.nvtx.range_pop() # pop epoch
609
+ print("executed pop")
610
+
611
  print(f"Load: {cumulative_times[0]:.4f} s")
612
  print(f"Batch: {cumulative_times[1]:.4f} s")
613
  print(f"Train: {cumulative_times[2]:.4f} s")
 
817
  add_arg("--directory", type=str, help="Append to Training Directory")
818
  add_arg("--seed", type=int, default=2, help="Sets random seed")
819
  add_arg("--abs", action="store_true", help="Use abs value of per-event weight")
820
+ add_arg("--profile", action="store_true", help="use nsight systems profiler")
821
 
822
  pargs = parser.parse_args()
823