File size: 11,858 Bytes
909940e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
import os
import io
import sys
import timeit
import tokenize
import torch
import psutil
import inspect
from loguru import logger
from prettytable import PrettyTable

# implement by xtudbxk
# github: https://github.com/xtudbxk/lineprofiler
class MyLineProfiler():
    def __init__(self, base='ms', cuda_sync=True, gpuids=(0,), warmup=0, warmup_lineno=-1):

        if base == 'ms':
            self.base_n = 1000
        elif base == 's':
            self.base_n = 1
        else:
            logguru.warning(f'Unsupported base - {base}, using "s" instead')

        self.base = base
        self.cuda_sync = cuda_sync
        self.gpuids = gpuids
        self.warmup = warmup
        self.warmup_counter = warmup
        # we should wait this line execute warup_counter times 
        # before recording the stats
        self.warmup_lineno = warmup_lineno

        # for time profiling
        self._times = {}
        self._func_name = None
        self._func_filename = None
        self._last_time = -1
        self._last_lineno = -1
        self._func_hit_count = 0
        self._func_firstlineno = 0

        # for memory profiling
        self._process = psutil.Process(os.getpid())
        self._memory = {}
        self._last_memory = 0

        # for cuda memory profiling
        self._gpu_memory = {}
        self._gpu_last_memory = 0

    def __trace_func__(self, frame, event, arg):
        # print(f'in {frame.f_code.co_filename} func {frame.f_code.co_name} line {frame.f_lineno}, event - {event}')

        # check if run into the decorated func
        if self._func_firstlineno == frame.f_code.co_firstlineno and frame.f_code.co_name == self._func_name and frame.f_code.co_filename == self._func_filename:

            # --- obtain info for current hit ---
            # cuda related
            if self.cuda_sync is True:
                torch.cuda.synchronize()

            current_time = timeit.default_timer()
            memory = self._process.memory_info().rss
            gpu_memory = torch.cuda.memory_allocated()
            # --- ends ---
            
            # --- initilize the info when first hit ---
            if frame.f_lineno not in self._times: # first hit time for this line
                self._times[frame.f_lineno] = {'hit':0, 'time': 0}
                self._memory[frame.f_lineno] = 0
                self._gpu_memory[frame.f_lineno] = 0
            # --- ends ---

            # --- record info before call the decorated func ---
            # 'call' - before call the func
            if event == 'call':
                self._last_time = current_time
                self._last_lineno = frame.f_lineno
                self._last_memory = memory
                self._last_gpu_memory = gpu_memory

                if self.warmup_lineno < 0:
                    self.warmup_counter -= 1
                    if self.warmup_counter < 0:
                        self._func_hit_count += 1
            # --- ends ---

            # 'line' - after excuting the line
            # 'return' - return from the function
            if event == 'line' or event == 'return':

                if event == 'line' and self.warmup_counter < 0:
                    self._times[frame.f_lineno]['hit'] += 1


                # --- obtain the memory and time consumed by this line ---
                if self.warmup_counter < 0:
                    self._times[self._last_lineno]['time'] += current_time - self._last_time
                self._memory[self._last_lineno] += memory - self._last_memory
                self._gpu_memory[self._last_lineno] += gpu_memory - self._gpu_last_memory
                # --- ends ---

                if self.cuda_sync is True:
                    torch.cuda.synchronize()

                self._last_time = timeit.default_timer()
                self._last_memory = memory
                self._gpu_last_memory = gpu_memory
                self._last_lineno = frame.f_lineno

        return self.__trace_func__

    def decorate(self, func):
        if self._func_name is not None:
            logger.warning(f'Only support decorate only one func. Aready decorated "{self._func_name}"')
        self._func_name = func.__name__
        self._func_filename = func.__code__.co_filename
        self._func_firstlineno = func.__code__.co_firstlineno

        def _f(*args, **kwargs):
            origin_trace_func = sys.gettrace()
            sys.settrace(self.__trace_func__)
            ret = func(*args, **kwargs)
            sys.settrace(origin_trace_func)
            return ret
        return _f

    def _get_table(self):

        if len(self._times) <= 0:
            logger.warning(f"un recorded datas, please ensure the function is executed")
            return None
        
        # --- load the source code ---
        with open(self._func_filename, 'r') as f:
            source_lines = [line.strip('\n') for line in f.readlines()]
            code_str = "\n".join(source_lines)

        def_lineno = min(self._times.keys())
        final_lineno = max(self._times.keys())

        # remove the additional blank content
        pre_blank_count = len(source_lines[def_lineno-1]) - len(source_lines[def_lineno-1].lstrip(' ').lstrip('\t'))
        # --- ends ---

        # --- analysize the source code and collect infos for multi-line code ---
        new_logic_linenos = [token.start[0] for token in tokenize.generate_tokens(
            io.StringIO(code_str).readline) if token.type == 4]
        # --- ends ---

        # --- merge the stats multi-line code ---
        sorted_linenos = [lineno for lineno in self._times.keys()]
        sorted_linenos.sort(key=int)
        
        lineno_cache = []
        for lineno in sorted_linenos:
            if lineno not in new_logic_linenos: 
                lineno_cache.append(lineno)
            else:
                # we should merge its info to the prev_lineno
                if len(lineno_cache) <= 0:
                    continue
                else:
                    lineno_cache.append(lineno)
                    first_lineno = lineno_cache[0]
                    for prev_lineno in lineno_cache[1:]:
                        self._times[first_lineno]["hit"] = min(self._times[first_lineno]["hit"], self._times[prev_lineno]["hit"])
                        self._times[first_lineno]["time"] += self._times[prev_lineno]["time"]
                        del self._times[prev_lineno]

                        self._memory[first_lineno] += self._memory[prev_lineno]
                        del self._memory[prev_lineno]

                        self._gpu_memory[first_lineno] += self._gpu_memory[prev_lineno]
                        del self._gpu_memory[prev_lineno]
                    lineno_cache = []
        # --- ends ---

        # --- initialize the pretty table for output ---
        table = PrettyTable(['lineno', 'hits', 'time', 'time per hit', 'hit perc', 'time perc', 'mem inc', 'mem peak', 'gpu mem inc', 'gpu mem peak'])
        # --- ends ---

        # --- compute some statisticals ---
        total_hit = 0 # for compute the hit percentage
        total_time = 0
        for lineno, stats in self._times.items():
            if lineno == def_lineno: continue
            total_hit += stats['hit']
            total_time += stats['time']

        total_memory = sum([m for l,m in self._memory.items()]) / 1024 / 1024 
        total_gpu_memory = sum([m for l,m in self._gpu_memory.items()]) / 1024 / 1024
        # --- ends ---

        peak_cpu_memory = 0
        peak_gpu_memory = 0
        for lineno in range(def_lineno, final_lineno+1):
            if lineno not in self._times:
                # the comment line, empty line or merged line from multi-lines code
                table.add_row([lineno, '-', '-', '-', '-', '-', '-',f'{peak_cpu_memory:5.3f} MB', '-', f'{peak_gpu_memory:5.3f} MB'])
            else:
                stats = self._times[lineno]
                if lineno == def_lineno: 
                    table.add_row([lineno, self._func_hit_count, f'{total_time*self.base_n:.4f} {self.base}', f'{total_time/self._func_hit_count*self.base_n:.4f} {self.base}', '-', '-', f'{total_memory:5.3f} MB', 'baseline', f'{total_gpu_memory:5.3f} MB', 'baseline'])
                else:

                    line_result = [lineno, stats['hit'], 
                                  f'{stats["time"]*self.base_n:.4f} {self.base}', 
                                  f'{stats["time"]/stats["hit"]*self.base_n:.4f} {self.base}' if stats['hit'] > 0 else 'nan', 
                                  f'{stats["hit"]/total_hit*100:.3f}%' if total_hit > 0 else 'nan', 
                                  f'{stats["time"]/total_time*100:.3f}%'] if total_time > 0 else 'nan'

                    line_result += [f'{self._memory[lineno]/1024/1024:5.3f} MB' if stats['hit'] > 0 else '0 MB']
                    peak_cpu_memory = peak_cpu_memory + self._memory[lineno]/1024/1024
                    line_result += [f'{peak_cpu_memory:5.3f} MB']

                    line_result += [f'{self._gpu_memory[lineno]/1024/1024:5.3f} MB' if stats['hit'] > 0 else '0 MB']
                    peak_gpu_memory = peak_gpu_memory + self._gpu_memory[lineno]/1024/1024
                    line_result += [f'{peak_gpu_memory:5.3f} MB']

                    table.add_row(line_result)

        table.add_column('sources', [source_lines[i-1][pre_blank_count:] if len(source_lines[i-1])>pre_blank_count else '' for i in range(def_lineno, final_lineno+1)], 'l')
        return table

    def print(self, filename=None, mode="w"):
        introducation = '''
1. The first line of table reports the overall results of the whole function and the following lines reports the statistics of each line in the function.
2. The `hit perc` and `time perc` represent `hit percentage` and `time percentage`.
3. For memory, there exists four categories `mem inc`, `mem peak`, `gpu mem inc` and `gpu mem peak`. They denotes `cpu memory increasement`, `cpu memory peak`, `gpu memory increasement` and `gpu memory peak`. All the results are collected in the last run. The number in the increasement field denots the increasement of corresponding memory of each line (the first line is related to the whole function). Sometimes, the number of each line is far less of the number of the first line, which is valid since python may auto release the unused memory after the function execution. The number of each line in the peak filed is a simple sum of the numbers of above lines in the increasement field, which is used to demonstrate the possible maxinum memory usage in the function.
4. For any issue, please concact us via https://github.com/xtudbxk/lineprofiler or zhengqiang.zhang@hotmail.com
        '''
        print(introducation)

        table = PrettyTable(['lineno', 'hits', 'time', 'time per hit', 'hit perc', 'time perc', 'mem inc', 'mem peak', 'gpu mem inc', 'gpu mem peak'])
        table = self._get_table()
        print(table)
        if filename is not None:
            with open(filename, mode) as f:
                f.write(introducation)
                f.write(f"args - base={self.base}, cuda_sync={self.cuda_sync}, gpuids={self.gpuids}, warmup={self.warmup}\n")
                f.write(str(table))
            
if __name__ == '__main__':
    import numpy as np
    def mytest(h='hello', 
               xx="xx"):
    
        h = h + 'world'
        a = []
        for _ in range(200):
            # a = np.zeros((1000, 1000), dtype=np.float32)
            a.append(np.zeros((1000, 1000), dtype=np.float32))
            a.append(
                    np.zeros((1000, 1000), 
                              dtype=np.float32))
            # print(a[0,0])
        print(h)

    profiler = MyLineProfiler(cuda_sync=False, warmup=2)
    mytest = profiler.decorate(mytest)
    for _ in range(5):
        mytest()
    profiler.print()