File size: 11,858 Bytes
909940e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 |
import os
import io
import sys
import timeit
import tokenize
import torch
import psutil
import inspect
from loguru import logger
from prettytable import PrettyTable
# implement by xtudbxk
# github: https://github.com/xtudbxk/lineprofiler
class MyLineProfiler():
    """Line-by-line profiler for a single function.

    Reports, for every source line of one decorated function: hit count,
    wall-clock time (via ``sys.settrace`` line events), CPU RSS delta
    (via ``psutil``) and CUDA allocated-memory delta (via
    ``torch.cuda.memory_allocated``), rendered as a PrettyTable.

    Usage::

        profiler = MyLineProfiler()
        func = profiler.decorate(func)
        func(...)
        profiler.print()
    """

    def __init__(self, base='ms', cuda_sync=True, gpuids=(0,), warmup=0, warmup_lineno=-1):
        """Configure the profiler.

        Args:
            base: time unit for the report, ``'ms'`` or ``'s'``. Any other
                value falls back to seconds with a warning.
            cuda_sync: if True, call ``torch.cuda.synchronize()`` around each
                measurement so GPU kernels are attributed to the line that
                launched them (requires a CUDA-capable torch).
            gpuids: GPU ids of interest. NOTE(review): currently only echoed
                into the report header, never used for measurement.
            warmup: number of initial function calls to skip before recording.
            warmup_lineno: presumably a specific line whose executions should
                drive the warmup countdown instead of whole-function calls;
                only the ``< 0`` (disabled) path is implemented — TODO confirm.
        """
        if base == 'ms':
            self.base_n = 1000
        elif base == 's':
            self.base_n = 1
        else:
            # fix: original called undefined name `logguru` (NameError) and
            # left self.base_n unset; actually fall back to seconds as the
            # warning promises.
            logger.warning(f'Unsupported base - {base}, using "s" instead')
            self.base_n = 1
            base = 's'
        self.base = base
        self.cuda_sync = cuda_sync
        self.gpuids = gpuids
        self.warmup = warmup
        self.warmup_counter = warmup
        # we should wait this line execute warmup_counter times
        # before recording the stats
        self.warmup_lineno = warmup_lineno

        # for time profiling
        self._times = {}              # lineno -> {'hit': int, 'time': float seconds}
        self._func_name = None        # name of the (single) decorated function
        self._func_filename = None
        self._last_time = -1          # timestamp when the previous line started
        self._last_lineno = -1        # the line whose cost is still being accumulated
        self._func_hit_count = 0      # recorded (post-warmup) calls of the function
        self._func_firstlineno = 0

        # for memory profiling (process-wide RSS in bytes)
        self._process = psutil.Process(os.getpid())
        self._memory = {}             # lineno -> accumulated RSS delta in bytes
        self._last_memory = 0

        # for cuda memory profiling (torch allocator, bytes)
        self._gpu_memory = {}         # lineno -> accumulated allocated delta in bytes
        self._gpu_last_memory = 0

    def __trace_func__(self, frame, event, arg):
        """Trace callback installed via ``sys.settrace``.

        Attributes the elapsed time / memory delta since the previous event to
        the previously executing line (``self._last_lineno``), then re-arms the
        baselines for the line about to run.
        """
        # only profile frames belonging to the decorated function itself
        if (self._func_firstlineno == frame.f_code.co_firstlineno
                and frame.f_code.co_name == self._func_name
                and frame.f_code.co_filename == self._func_filename):

            # --- obtain info for current hit ---
            if self.cuda_sync is True:
                # drain pending kernels so GPU work is billed to the right line
                torch.cuda.synchronize()
            current_time = timeit.default_timer()
            memory = self._process.memory_info().rss
            gpu_memory = torch.cuda.memory_allocated()
            # --- ends ---

            # --- initialize the info when first hit ---
            if frame.f_lineno not in self._times:  # first hit for this line
                self._times[frame.f_lineno] = {'hit': 0, 'time': 0}
                self._memory[frame.f_lineno] = 0
                self._gpu_memory[frame.f_lineno] = 0
            # --- ends ---

            # 'call' - entering the decorated func: reset all baselines
            if event == 'call':
                self._last_time = current_time
                self._last_lineno = frame.f_lineno
                self._last_memory = memory
                # fix: original assigned `self._last_gpu_memory` (a typo that
                # was never read), so the first line's GPU delta was measured
                # against a stale/zero baseline.
                self._gpu_last_memory = gpu_memory
                if self.warmup_lineno < 0:
                    self.warmup_counter -= 1
                if self.warmup_counter < 0:
                    self._func_hit_count += 1

            # 'line' - a new line is about to execute (the previous one finished)
            # 'return' - leaving the function
            if event == 'line' or event == 'return':
                if event == 'line' and self.warmup_counter < 0:
                    self._times[frame.f_lineno]['hit'] += 1
                # --- charge time/memory consumed to the line that just ran ---
                if self.warmup_counter < 0:
                    self._times[self._last_lineno]['time'] += current_time - self._last_time
                    self._memory[self._last_lineno] += memory - self._last_memory
                    self._gpu_memory[self._last_lineno] += gpu_memory - self._gpu_last_memory
                # --- ends ---
                if self.cuda_sync is True:
                    torch.cuda.synchronize()
                # re-arm baselines for the line that is about to execute;
                # the timer is re-read so profiler overhead is not billed
                self._last_time = timeit.default_timer()
                self._last_memory = memory
                self._gpu_last_memory = gpu_memory
                self._last_lineno = frame.f_lineno

        return self.__trace_func__

    def decorate(self, func):
        """Wrap ``func`` so that every call is traced by this profiler.

        Only one function per profiler instance is supported; decorating a
        second one overwrites the first (with a warning).
        """
        if self._func_name is not None:
            logger.warning(f'Only support decorate only one func. Aready decorated "{self._func_name}"')
        self._func_name = func.__name__
        self._func_filename = func.__code__.co_filename
        self._func_firstlineno = func.__code__.co_firstlineno

        def _f(*args, **kwargs):
            origin_trace_func = sys.gettrace()
            sys.settrace(self.__trace_func__)
            try:
                return func(*args, **kwargs)
            finally:
                # fix: restore the previous trace function even if func raises,
                # otherwise the profiler keeps tracing the whole program
                sys.settrace(origin_trace_func)
        return _f

    def _get_table(self):
        """Build the report table from the recorded stats.

        Returns:
            A ``PrettyTable`` instance, or ``None`` when nothing was recorded.
        """
        if len(self._times) <= 0:
            logger.warning(f"un recorded datas, please ensure the function is executed")
            return None

        # --- load the source code ---
        with open(self._func_filename, 'r') as f:
            source_lines = [line.strip('\n') for line in f.readlines()]
        code_str = "\n".join(source_lines)
        def_lineno = min(self._times.keys())
        final_lineno = max(self._times.keys())
        # common leading whitespace of the `def` line, stripped from the
        # echoed sources so the report is flush-left
        pre_blank_count = len(source_lines[def_lineno-1]) - len(source_lines[def_lineno-1].lstrip(' ').lstrip('\t'))
        # --- ends ---

        # --- analyse the source and find where each logical line ends ---
        # tokenize.NEWLINE marks the end of a *logical* line, so statements
        # spanning several physical lines terminate at these linenos
        new_logic_linenos = [token.start[0] for token in tokenize.generate_tokens(
            io.StringIO(code_str).readline) if token.type == tokenize.NEWLINE]
        # --- ends ---

        # --- merge the stats of multi-line statements into their first line ---
        sorted_linenos = sorted(self._times.keys(), key=int)
        lineno_cache = []
        for lineno in sorted_linenos:
            if lineno not in new_logic_linenos:
                # continuation line: keep collecting until the statement ends
                lineno_cache.append(lineno)
            else:
                if len(lineno_cache) <= 0:
                    # single-physical-line statement, nothing to merge
                    continue
                lineno_cache.append(lineno)
                first_lineno = lineno_cache[0]
                for prev_lineno in lineno_cache[1:]:
                    # hits: a multi-line statement is hit once per execution,
                    # so take the minimum across its physical lines
                    self._times[first_lineno]["hit"] = min(self._times[first_lineno]["hit"], self._times[prev_lineno]["hit"])
                    self._times[first_lineno]["time"] += self._times[prev_lineno]["time"]
                    del self._times[prev_lineno]
                    self._memory[first_lineno] += self._memory[prev_lineno]
                    del self._memory[prev_lineno]
                    self._gpu_memory[first_lineno] += self._gpu_memory[prev_lineno]
                    del self._gpu_memory[prev_lineno]
                lineno_cache = []
        # --- ends ---

        table = PrettyTable(['lineno', 'hits', 'time', 'time per hit', 'hit perc', 'time perc', 'mem inc', 'mem peak', 'gpu mem inc', 'gpu mem peak'])

        # --- overall statistics (the `def` line represents the whole func) ---
        total_hit = 0
        total_time = 0
        for lineno, stats in self._times.items():
            if lineno == def_lineno:
                continue
            total_hit += stats['hit']
            total_time += stats['time']
        total_memory = sum(self._memory.values()) / 1024 / 1024        # MB
        total_gpu_memory = sum(self._gpu_memory.values()) / 1024 / 1024  # MB
        # --- ends ---

        peak_cpu_memory = 0
        peak_gpu_memory = 0
        for lineno in range(def_lineno, final_lineno+1):
            if lineno not in self._times:
                # comment line, empty line, or merged tail of a multi-line stmt
                table.add_row([lineno, '-', '-', '-', '-', '-', '-', f'{peak_cpu_memory:5.3f} MB', '-', f'{peak_gpu_memory:5.3f} MB'])
            else:
                stats = self._times[lineno]
                if lineno == def_lineno:
                    # fix: guard the division — _func_hit_count is 0 when
                    # warmup swallowed every call (was ZeroDivisionError)
                    per_hit = (f'{total_time/self._func_hit_count*self.base_n:.4f} {self.base}'
                               if self._func_hit_count > 0 else 'nan')
                    table.add_row([lineno, self._func_hit_count, f'{total_time*self.base_n:.4f} {self.base}', per_hit, '-', '-', f'{total_memory:5.3f} MB', 'baseline', f'{total_gpu_memory:5.3f} MB', 'baseline'])
                else:
                    # fix: in the original the final `if total_time > 0` ternary
                    # wrapped the whole list literal instead of the last element,
                    # turning line_result into the string 'nan' whenever
                    # total_time == 0 and crashing on the `+=` below
                    line_result = [lineno, stats['hit'],
                                   f'{stats["time"]*self.base_n:.4f} {self.base}',
                                   f'{stats["time"]/stats["hit"]*self.base_n:.4f} {self.base}' if stats['hit'] > 0 else 'nan',
                                   f'{stats["hit"]/total_hit*100:.3f}%' if total_hit > 0 else 'nan',
                                   f'{stats["time"]/total_time*100:.3f}%' if total_time > 0 else 'nan']
                    line_result += [f'{self._memory[lineno]/1024/1024:5.3f} MB' if stats['hit'] > 0 else '0 MB']
                    peak_cpu_memory = peak_cpu_memory + self._memory[lineno]/1024/1024
                    line_result += [f'{peak_cpu_memory:5.3f} MB']
                    line_result += [f'{self._gpu_memory[lineno]/1024/1024:5.3f} MB' if stats['hit'] > 0 else '0 MB']
                    peak_gpu_memory = peak_gpu_memory + self._gpu_memory[lineno]/1024/1024
                    line_result += [f'{peak_gpu_memory:5.3f} MB']
                    table.add_row(line_result)

        table.add_column('sources', [source_lines[i-1][pre_blank_count:] if len(source_lines[i-1]) > pre_blank_count else '' for i in range(def_lineno, final_lineno+1)], 'l')
        return table

    def print(self, filename=None, mode="w"):
        """Print the report to stdout and optionally append/write it to a file.

        Args:
            filename: if given, also write the report there.
            mode: file open mode for ``filename`` (default ``"w"``).
        """
        introduction = '''
1. The first line of table reports the overall results of the whole function and the following lines reports the statistics of each line in the function.
2. The `hit perc` and `time perc` represent `hit percentage` and `time percentage`.
3. For memory, there exists four categories `mem inc`, `mem peak`, `gpu mem inc` and `gpu mem peak`. They denotes `cpu memory increasement`, `cpu memory peak`, `gpu memory increasement` and `gpu memory peak`. All the results are collected in the last run. The number in the increasement field denots the increasement of corresponding memory of each line (the first line is related to the whole function). Sometimes, the number of each line is far less of the number of the first line, which is valid since python may auto release the unused memory after the function execution. The number of each line in the peak filed is a simple sum of the numbers of above lines in the increasement field, which is used to demonstrate the possible maxinum memory usage in the function.
4. For any issue, please concact us via https://github.com/xtudbxk/lineprofiler or zhengqiang.zhang@hotmail.com
'''
        print(introduction)
        # fix: removed a PrettyTable that was constructed and immediately
        # overwritten by the real table below (dead code)
        table = self._get_table()
        print(table)
        if filename is not None:
            with open(filename, mode) as f:
                f.write(introduction)
                f.write(f"args - base={self.base}, cuda_sync={self.cuda_sync}, gpuids={self.gpuids}, warmup={self.warmup}\n")
                f.write(str(table))
if __name__ == '__main__':
    # Self-demo: profile a small numpy-allocating function. The exact physical
    # layout below is deliberate — the multi-line `np.zeros` call and the
    # commented-out lines exercise the profiler's multi-line merging and
    # "unrecorded line" rendering, so do not reformat.
    import numpy as np

    def mytest(h='hello',
               xx="xx"):
        h = h + 'world'
        a = []
        for _ in range(200):
            # a = np.zeros((1000, 1000), dtype=np.float32)
            a.append(np.zeros((1000, 1000), dtype=np.float32))
            # a statement spanning three physical lines: its stats should be
            # merged into the first line of the statement in the report
            a.append(
                np.zeros((1000, 1000),
                         dtype=np.float32))
            # print(a[0,0])
        print(h)

    # cuda_sync=False so the demo runs on CPU-only machines;
    # warmup=2 skips the first two of the five calls below
    profiler = MyLineProfiler(cuda_sync=False, warmup=2)
    mytest = profiler.decorate(mytest)
    for _ in range(5):
        mytest()
    profiler.print()
|