File size: 8,296 Bytes
b386992
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from unittest.mock import MagicMock, patch

import pytest
import torch
from nemo.lightning.pytorch.callbacks.nsys import NsysCallback


class TestNsysCallback:
    @pytest.fixture(autouse=True)
    def setup_mocks(self):
        self.cuda_mock = patch('torch.cuda')
        self.cudart_mock = patch('torch.cuda.cudart')
        self.emit_nvtx_mock = patch('torch.autograd.profiler.emit_nvtx')
        self.get_rank_mock = patch('nemo.lightning.pytorch.callbacks.nsys.get_rank')

        self.cuda_mock.start()
        self.cudart_mock.start()
        self.emit_nvtx_mock.start()
        self.get_rank_mock.start()

        # Mock CUDA availability
        torch.cuda.is_available = MagicMock(return_value=True)
        torch.cuda.current_device = MagicMock(return_value=0)

        # Set up fixed cudart mock for all tests
        self.fixed_cudart = MagicMock()
        torch.cuda.cudart = MagicMock(return_value=self.fixed_cudart)

        yield

        self.cuda_mock.stop()
        self.cudart_mock.stop()
        self.emit_nvtx_mock.stop()
        self.get_rank_mock.stop()

    @pytest.fixture
    def mock_trainer(self):
        trainer = MagicMock()
        trainer.strategy.root_device.type = 'cuda'
        return trainer

    @pytest.fixture
    def mock_pl_module(self):
        return MagicMock()

    def test_init_valid_params(self):
        """Test initialization with valid parameters."""
        callback = NsysCallback(start_step=10, end_step=20, ranks=[0, 1], gen_shape=True)
        assert callback._nsys_profile_start_step == 10
        assert callback._nsys_profile_end_step == 20
        assert callback._nsys_profile_ranks == [0, 1]
        assert callback._nsys_profile_gen_shape == True

    def test_init_invalid_params(self):
        """Test initialization with invalid parameters."""
        with pytest.raises(AssertionError):
            NsysCallback(start_step='10', end_step=20)

        with pytest.raises(AssertionError):
            NsysCallback(start_step=10, end_step='20')

        with pytest.raises(AssertionError):
            NsysCallback(start_step=20, end_step=10)

    @patch('nemo.lightning.pytorch.callbacks.nsys.get_rank')
    @patch('torch.autograd.profiler.emit_nvtx')
    def test_on_train_batch_start_profiling(self, mock_emit_nvtx, mock_get_rank, mock_trainer, mock_pl_module):
        # Set mocked cudart via the fixture patch
        mock_get_rank.return_value = 0
        callback = NsysCallback(start_step=10, end_step=20, ranks=[0], gen_shape=True)

        mock_trainer.strategy.current_epoch_step = 10
        callback.on_train_batch_start(mock_trainer, mock_pl_module, None, 10)

        self.fixed_cudart.cudaProfilerStart.assert_called_once()
        mock_emit_nvtx.assert_called_once_with(record_shapes=True)

    @patch('nemo.lightning.pytorch.callbacks.nsys.get_rank')
    def test_on_train_batch_start_no_profiling(self, mock_get_rank, mock_trainer, mock_pl_module):
        mock_get_rank.return_value = 0
        callback = NsysCallback(start_step=10, end_step=20, ranks=[0])

        mock_trainer.strategy.current_epoch_step = 9
        callback.on_train_batch_start(mock_trainer, mock_pl_module, None, 9)

        self.fixed_cudart.cudaProfilerStart.assert_not_called()

    @patch('nemo.lightning.pytorch.callbacks.nsys.get_rank')
    @patch('torch.autograd.profiler.emit_nvtx')
    def test_on_train_batch_end_profiling(self, mock_emit_nvtx, mock_get_rank, mock_trainer, mock_pl_module):
        mock_get_rank.return_value = 0
        callback = NsysCallback(start_step=10, end_step=20, ranks=[0])

        mock_trainer.strategy.current_epoch_step = 20
        assert callback._has_nsys_enabled == False
        callback._has_nsys_enabled = True
        callback.on_train_batch_end(mock_trainer, mock_pl_module, None, None, 20)

        self.fixed_cudart.cudaProfilerStop.assert_called_once()

    @patch('nemo.lightning.pytorch.callbacks.nsys.get_rank')
    @patch('torch.autograd.profiler.emit_nvtx')
    def test_on_train_batch_end_no_profiling(self, mock_emit_nvtx, mock_get_rank, mock_trainer, mock_pl_module):
        mock_get_rank.return_value = 0
        callback = NsysCallback(start_step=10, end_step=20, ranks=[0])

        callback.on_train_batch_end(mock_trainer, mock_pl_module, None, None, 19)

        self.fixed_cudart.cudaProfilerStop.assert_not_called()

    def test_non_cuda_device(self, mock_trainer, mock_pl_module):
        """Test behavior when the device is not CUDA."""
        mock_trainer.strategy.root_device.type = 'cpu'
        callback = NsysCallback(start_step=10, end_step=20, ranks=[0])

        callback.on_train_batch_start(mock_trainer, mock_pl_module, None, 10)
        callback.on_train_batch_end(mock_trainer, mock_pl_module, None, None, 20)

        # No exceptions should be raised, and no profiling calls should be made

    @patch('nemo.lightning.pytorch.callbacks.nsys.get_rank')
    def test_rank_not_in_profile_ranks(self, mock_get_rank, mock_trainer, mock_pl_module):
        """Test behavior when the current rank is not in the profile ranks."""
        mock_get_rank.return_value = 1
        callback = NsysCallback(start_step=10, end_step=20, ranks=[0])

        callback.on_train_batch_start(mock_trainer, mock_pl_module, None, 10)
        callback.on_train_batch_end(mock_trainer, mock_pl_module, None, None, 20)

        # No profiling calls should be made

    @pytest.mark.parametrize(
        "start_step,end_step,batch_idx,expected_call",
        [
            (10, 20, 9, False),
            (10, 20, 10, True),
            (10, 20, 15, False),
            (10, 20, 20, False),
            (10, 20, 21, False),
        ],
    )
    @patch('nemo.lightning.pytorch.callbacks.nsys.get_rank')
    @patch('torch.autograd.profiler.emit_nvtx')
    def test_profiling_range(
        self,
        mock_emit_nvtx,
        mock_get_rank,
        start_step,
        end_step,
        batch_idx,
        expected_call,
        mock_trainer,
        mock_pl_module,
    ):
        mock_get_rank.return_value = 0
        callback = NsysCallback(start_step=start_step, end_step=end_step, ranks=[0])

        mock_trainer.strategy.current_epoch_step = batch_idx
        callback.on_train_batch_start(mock_trainer, mock_pl_module, None, batch_idx)

        if expected_call:
            self.fixed_cudart.cudaProfilerStart.assert_called_once()
            mock_emit_nvtx.assert_called_once()
        else:
            self.fixed_cudart.cudaProfilerStart.assert_not_called()
            mock_emit_nvtx.assert_not_called()

    @patch('nemo.lightning.pytorch.callbacks.nsys.get_rank')
    def test_single_profile_range(self, mock_get_rank, mock_trainer, mock_pl_module):
        mock_get_rank.return_value = 0
        callback = NsysCallback(start_step=10, end_step=40, ranks=[0])

        # Ensure the device type is 'cuda'
        mock_trainer.strategy.root_device.type = 'cuda'

        # Start of range
        mock_trainer.strategy.current_epoch_step = 10
        callback.on_train_batch_start(mock_trainer, mock_pl_module, None, 10)
        assert self.fixed_cudart.cudaProfilerStart.call_count == 1, "cudaProfilerStart was not called"

        # Middle of range
        mock_trainer.strategy.current_epoch_step = 25
        callback.on_train_batch_start(mock_trainer, mock_pl_module, None, 25)
        assert self.fixed_cudart.cudaProfilerStart.call_count == 1, "cudaProfilerStart was called again"

        # End of range
        mock_trainer.strategy.current_epoch_step = 40
        callback.on_train_batch_end(mock_trainer, mock_pl_module, None, None, 40)
        assert self.fixed_cudart.cudaProfilerStop.call_count == 1, "cudaProfilerStop was not called"