Commit
·
a3b9c93
1 Parent(s):
d9c0f2d
Add profiling to training script
Browse files
root_gnn_dgl/scripts/training_script.py
CHANGED
|
@@ -45,10 +45,10 @@ def gpu_mem():
|
|
| 45 |
# except:
|
| 46 |
# pass
|
| 47 |
print(f'Current GPU memory usage: {torch.cuda.memory_allocated() / 1024 / 1024 / 1024} GB')
|
| 48 |
-
print(f'Current GPU cache usage: {torch.cuda.memory_cached() / 1024 / 1024 / 1024} GB')
|
| 49 |
-
print(f'Current GPU max memory usage: {torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024} GB')
|
| 50 |
-
print(f'Current GPU max cache usage: {torch.cuda.max_memory_cached() / 1024 / 1024 / 1024} GB')
|
| 51 |
-
print(f'Numel in current tensors: {sum}')
|
| 52 |
mem()
|
| 53 |
|
| 54 |
|
|
@@ -263,11 +263,20 @@ def train(train_loaders, test_loaders, model, device, config, args, rank):
|
|
| 263 |
for epoch in range(starting_epoch, config['Training']['epochs']):
|
| 264 |
start = time.time()
|
| 265 |
run = start
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 266 |
if (args.multigpu or args.multinode):
|
| 267 |
dist.barrier()
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
|
| 272 |
# training
|
| 273 |
model.train()
|
|
@@ -292,6 +301,8 @@ def train(train_loaders, test_loaders, model, device, config, args, rank):
|
|
| 292 |
if is_padded: #Padding the globals to match padded graphs.
|
| 293 |
global_feats = torch.concatenate((global_feats, torch.zeros(1, len(global_feats[0])).to(device)))
|
| 294 |
load = time.time()
|
|
|
|
|
|
|
| 295 |
if (len(logits) == 0):
|
| 296 |
logits = model(graph, global_feats)
|
| 297 |
tlabels = label
|
|
@@ -302,6 +313,9 @@ def train(train_loaders, test_loaders, model, device, config, args, rank):
|
|
| 302 |
weights = torch.concatenate((weights, track[:,1]), dim=0)
|
| 303 |
batch_lengths.append(logits.shape[0] - 1)
|
| 304 |
|
|
|
|
|
|
|
|
|
|
| 305 |
if is_padded:
|
| 306 |
keepmask = torch.full_like(logits[:,0], True, dtype=torch.bool)
|
| 307 |
keepmask[batch_lengths] = False
|
|
@@ -340,11 +354,15 @@ def train(train_loaders, test_loaders, model, device, config, args, rank):
|
|
| 340 |
normalized_loss += label_loss
|
| 341 |
loss = normalized_loss / len(unique_labels)
|
| 342 |
|
| 343 |
-
|
|
|
|
| 344 |
optimizer.zero_grad()
|
| 345 |
loss.backward()
|
| 346 |
optimizer.step()
|
| 347 |
total_loss += loss.detach().cpu().item()
|
|
|
|
|
|
|
|
|
|
| 348 |
ibatch += 1
|
| 349 |
cumulative_times[0] += batch_start - run
|
| 350 |
cumulative_times[1] += load - batch_start
|
|
@@ -366,6 +384,10 @@ def train(train_loaders, test_loaders, model, device, config, args, rank):
|
|
| 366 |
labels = []
|
| 367 |
weights = []
|
| 368 |
model.eval()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 369 |
with torch.no_grad():
|
| 370 |
for loader in test_loaders:
|
| 371 |
for batch, label, track, global_feats in loader:
|
|
@@ -386,6 +408,9 @@ def train(train_loaders, test_loaders, model, device, config, args, rank):
|
|
| 386 |
eval_end = time.time()
|
| 387 |
cumulative_times[3] += eval_end - run
|
| 388 |
|
|
|
|
|
|
|
|
|
|
| 389 |
if scores == []: #If validation set is empty.
|
| 390 |
continue
|
| 391 |
logits = torch.concatenate(scores).to(device)
|
|
@@ -579,6 +604,10 @@ def train(train_loaders, test_loaders, model, device, config, args, rank):
|
|
| 579 |
custom_scheduler.step(model, {'test_auc':test_auc})
|
| 580 |
scheduler.step()
|
| 581 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 582 |
print(f"Load: {cumulative_times[0]:.4f} s")
|
| 583 |
print(f"Batch: {cumulative_times[1]:.4f} s")
|
| 584 |
print(f"Train: {cumulative_times[2]:.4f} s")
|
|
@@ -788,6 +817,7 @@ if __name__ == "__main__":
|
|
| 788 |
add_arg("--directory", type=str, help="Append to Training Directory")
|
| 789 |
add_arg("--seed", type=int, default=2, help="Sets random seed")
|
| 790 |
add_arg("--abs", action="store_true", help="Use abs value of per-event weight")
|
|
|
|
| 791 |
|
| 792 |
pargs = parser.parse_args()
|
| 793 |
|
|
|
|
| 45 |
# except:
|
| 46 |
# pass
|
| 47 |
print(f'Current GPU memory usage: {torch.cuda.memory_allocated() / 1024 / 1024 / 1024} GB')
|
| 48 |
+
# print(f'Current GPU cache usage: {torch.cuda.memory_cached() / 1024 / 1024 / 1024} GB')
|
| 49 |
+
# print(f'Current GPU max memory usage: {torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024} GB')
|
| 50 |
+
# print(f'Current GPU max cache usage: {torch.cuda.max_memory_cached() / 1024 / 1024 / 1024} GB')
|
| 51 |
+
# print(f'Numel in current tensors: {sum}')
|
| 52 |
mem()
|
| 53 |
|
| 54 |
|
|
|
|
| 263 |
for epoch in range(starting_epoch, config['Training']['epochs']):
|
| 264 |
start = time.time()
|
| 265 |
run = start
|
| 266 |
+
if (args.profile):
|
| 267 |
+
if (epoch == 2):
|
| 268 |
+
torch.cuda.cudart().cudaProfilerStart()
|
| 269 |
+
if (epoch == 3):
|
| 270 |
+
print("Done profiling")
|
| 271 |
+
torch.cuda.cudart().cudaProfilerStop()
|
| 272 |
+
torch.cuda.nvtx.range_push("Epoch Start")
|
| 273 |
+
print("executed push")
|
| 274 |
+
|
| 275 |
if (args.multigpu or args.multinode):
|
| 276 |
dist.barrier()
|
| 277 |
+
|
| 278 |
+
if (epoch == 5):
|
| 279 |
+
exit
|
| 280 |
|
| 281 |
# training
|
| 282 |
model.train()
|
|
|
|
| 301 |
if is_padded: #Padding the globals to match padded graphs.
|
| 302 |
global_feats = torch.concatenate((global_feats, torch.zeros(1, len(global_feats[0])).to(device)))
|
| 303 |
load = time.time()
|
| 304 |
+
if (args.profile):
|
| 305 |
+
torch.cuda.nvtx.range_push("Model Forward")
|
| 306 |
if (len(logits) == 0):
|
| 307 |
logits = model(graph, global_feats)
|
| 308 |
tlabels = label
|
|
|
|
| 313 |
weights = torch.concatenate((weights, track[:,1]), dim=0)
|
| 314 |
batch_lengths.append(logits.shape[0] - 1)
|
| 315 |
|
| 316 |
+
if (args.profile):
|
| 317 |
+
torch.cuda.nvtx.range_pop() # popping model forward
|
| 318 |
+
|
| 319 |
if is_padded:
|
| 320 |
keepmask = torch.full_like(logits[:,0], True, dtype=torch.bool)
|
| 321 |
keepmask[batch_lengths] = False
|
|
|
|
| 354 |
normalized_loss += label_loss
|
| 355 |
loss = normalized_loss / len(unique_labels)
|
| 356 |
|
| 357 |
+
if (args.profile):
|
| 358 |
+
torch.cuda.nvtx.range_push("Model Backward")
|
| 359 |
optimizer.zero_grad()
|
| 360 |
loss.backward()
|
| 361 |
optimizer.step()
|
| 362 |
total_loss += loss.detach().cpu().item()
|
| 363 |
+
|
| 364 |
+
if (args.profile):
|
| 365 |
+
torch.cuda.nvtx.range_pop() # pop model backward
|
| 366 |
ibatch += 1
|
| 367 |
cumulative_times[0] += batch_start - run
|
| 368 |
cumulative_times[1] += load - batch_start
|
|
|
|
| 384 |
labels = []
|
| 385 |
weights = []
|
| 386 |
model.eval()
|
| 387 |
+
|
| 388 |
+
if (args.profile):
|
| 389 |
+
torch.cuda.nvtx.range_push("Model Evaluation")
|
| 390 |
+
|
| 391 |
with torch.no_grad():
|
| 392 |
for loader in test_loaders:
|
| 393 |
for batch, label, track, global_feats in loader:
|
|
|
|
| 408 |
eval_end = time.time()
|
| 409 |
cumulative_times[3] += eval_end - run
|
| 410 |
|
| 411 |
+
if (args.profile):
|
| 412 |
+
torch.cuda.nvtx.range_pop() # pop evaluation
|
| 413 |
+
|
| 414 |
if scores == []: #If validation set is empty.
|
| 415 |
continue
|
| 416 |
logits = torch.concatenate(scores).to(device)
|
|
|
|
| 604 |
custom_scheduler.step(model, {'test_auc':test_auc})
|
| 605 |
scheduler.step()
|
| 606 |
|
| 607 |
+
if (args.profile):
|
| 608 |
+
torch.cuda.nvtx.range_pop() # pop epoch
|
| 609 |
+
print("executed pop")
|
| 610 |
+
|
| 611 |
print(f"Load: {cumulative_times[0]:.4f} s")
|
| 612 |
print(f"Batch: {cumulative_times[1]:.4f} s")
|
| 613 |
print(f"Train: {cumulative_times[2]:.4f} s")
|
|
|
|
| 817 |
add_arg("--directory", type=str, help="Append to Training Directory")
|
| 818 |
add_arg("--seed", type=int, default=2, help="Sets random seed")
|
| 819 |
add_arg("--abs", action="store_true", help="Use abs value of per-event weight")
|
| 820 |
+
add_arg("--profile", action="store_true", help="use nsight systems profiler")
|
| 821 |
|
| 822 |
pargs = parser.parse_args()
|
| 823 |
|