# NeMo_Canary / tests/core/test_straggler_det.py
# (Hugging Face Hub page header: folder uploaded by Respair using huggingface_hub, commit b386992, verified.)
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import lightning.pytorch as pl
import pytest
import torch
from omegaconf import OmegaConf
from nemo.core.classes import ModelPT
from nemo.utils.exp_manager import exp_manager
# `ptl_resiliency` is included in the `gwe_resiliency_pkg` package. It is
# treated as optional here: record whether it imports instead of failing.
try:
    from ptl_resiliency import StragglerDetectionCallback
except ImportError:  # ModuleNotFoundError is a subclass, so it is covered too
    HAVE_STRAGGLER_DET = False
else:
    HAVE_STRAGGLER_DET = True
class OnesDataset(torch.utils.data.Dataset):
    """Fixed-length dataset whose every item is ``torch.ones(2)``."""

    def __init__(self, dataset_len):
        """Remember the length that ``__len__`` should report.

        Args:
            dataset_len: Number of items the dataset claims to contain.
        """
        super().__init__()
        self.__dataset_len = dataset_len

    def __len__(self):
        """Return the length passed to the constructor."""
        return self.__dataset_len

    def __getitem__(self, *_index):
        """Return a new two-element tensor of ones; the index is ignored."""
        return torch.ones(2)
class ExampleModel(ModelPT):
    """Tiny ``ModelPT`` subclass used to drive a short training run.

    The model is a single ``Linear(2, 1)`` layer. ``forward`` returns the L1
    loss between the layer output and zeros, so the training and validation
    steps can return it directly. Data comes from ``OnesDataset``.
    """

    def __init__(self, log_dir, **kwargs):
        """Create the model with an empty config and one linear layer.

        Args:
            log_dir: Stored on the instance as ``self.log_dir``; not otherwise
                used by this class.
            **kwargs: Accepted and ignored.
        """
        cfg = OmegaConf.structured({})
        super().__init__(cfg)
        # Seed before building the layer so its initial weights are repeatable.
        pl.seed_everything(1234)
        self.l1 = torch.nn.modules.Linear(in_features=2, out_features=1)
        self.log_dir = log_dir
        # Validation losses gathered batch by batch; averaged and cleared in
        # ``on_validation_epoch_end``.
        self._val_losses = []

    def train_dataloader(self):
        """Return a loader over 1024 * 1024 items, batch size 2, 2 workers."""
        dataset = OnesDataset(1024 * 1024)
        return torch.utils.data.DataLoader(dataset, batch_size=2, num_workers=2)

    def val_dataloader(self):
        """Return a loader over 128 * 1024 items, batch size 2, 2 workers."""
        dataset = OnesDataset(128 * 1024)
        return torch.utils.data.DataLoader(dataset, batch_size=2, num_workers=2)

    def forward(self, batch):
        """Return the L1 loss between ``self.l1(batch)`` and an all-zero target."""
        output = self.l1(batch)
        # zeros_like creates the target with the output's own dtype and device,
        # avoiding a CPU allocation followed by a device copy.
        return torch.nn.functional.l1_loss(output, torch.zeros_like(output))

    def validation_step(self, batch, batch_idx):
        """Compute the loss for one validation batch and record it.

        The loss is kept in ``self.loss`` (most recent batch) and appended to
        the per-epoch list that ``on_validation_epoch_end`` averages.
        """
        loss = self(batch)
        self.loss = loss
        # Detach so the stored value does not keep any autograd graph alive.
        self._val_losses.append(loss.detach())
        return loss

    def training_step(self, batch, batch_idx):
        """Return the loss for one training batch."""
        return self(batch)

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with ``lr=0.1``."""
        return torch.optim.Adam(self.parameters(), lr=0.1)

    def list_available_models(self, *args, **kwargs):
        """No-op. Presumably satisfies a hook ``ModelPT`` expects — TODO confirm."""

    def setup_training_data(self, *args, **kwargs):
        """No-op; the data loaders are built in ``train_dataloader``."""

    def setup_validation_data(self, *args, **kwargs):
        """No-op; the data loaders are built in ``val_dataloader``."""

    def on_validation_epoch_end(self):
        """Log the mean of this epoch's validation losses as ``val_loss``.

        Nothing is logged if no validation batch was processed.
        """
        if not self._val_losses:
            return
        self.log("val_loss", torch.stack(self._val_losses).mean())
        # Start the next validation epoch with an empty list.
        self._val_losses.clear()
@pytest.mark.skipif(not HAVE_STRAGGLER_DET, reason="requires resiliency package to be installed.")
class TestStragglerDetection:
    """Tests that straggler detection reports show up in the NeMo log."""

    @pytest.mark.run_only_on('GPU')
    def test_prints_perf_scores(self, tmp_path):
        """Run a short 1-rank DDP training and check the rank-0 log for reports.

        Training time is limited to 3 seconds via ``max_time_per_run`` and the
        straggler reporting interval is set to 1 second. The log captured for
        rank 0 must then contain both the relative and the individual GPU
        performance sections.
        """
        log_dir = tmp_path / "test_1"
        print("TMP PATH", log_dir)

        straggler_params = {
            "report_time_interval": 1.0,
            "calc_relative_gpu_perf": True,
            "calc_individual_gpu_perf": True,
            "num_gpu_perf_scores_to_log": 1,
        }
        exp_manager_cfg = {
            "max_time_per_run": "00:00:00:03",
            "explicit_log_dir": str(log_dir),
            "create_checkpoint_callback": False,
            "create_straggler_detection_callback": True,
            "straggler_detection_params": straggler_params,
        }

        # The step budget is far larger than the time limit allows, so the run
        # is ended by ``max_time_per_run`` rather than by ``max_steps``.
        trainer = pl.Trainer(
            strategy='ddp',
            devices=1,
            accelerator='gpu',
            enable_checkpointing=False,
            logger=False,
            max_steps=1_000_000,
            val_check_interval=0.33,
        )
        exp_manager(trainer, exp_manager_cfg)
        trainer.fit(ExampleModel(log_dir=log_dir))

        # assume that NeMo logs are written into "nemo_log_globalrank-0_localrank-0.txt"
        rank0_log_content = (log_dir / "nemo_log_globalrank-0_localrank-0.txt").read_text()
        for expected in ("GPU relative performance", "GPU individual performance"):
            assert expected in rank0_log_content