|
|
import os |
|
|
import io |
|
|
import sys |
|
|
import timeit |
|
|
import tokenize |
|
|
import torch |
|
|
import psutil |
|
|
import inspect |
|
|
from loguru import logger |
|
|
from prettytable import PrettyTable |
|
|
|
|
|
|
|
|
|
|
|
class MyLineProfiler(): |
|
|
def __init__(self, base='ms', cuda_sync=True, gpuids=(0,), warmup=0, warmup_lineno=-1): |
|
|
|
|
|
if base == 'ms': |
|
|
self.base_n = 1000 |
|
|
elif base == 's': |
|
|
self.base_n = 1 |
|
|
else: |
|
|
logguru.warning(f'Unsupported base - {base}, using "s" instead') |
|
|
|
|
|
self.base = base |
|
|
self.cuda_sync = cuda_sync |
|
|
self.gpuids = gpuids |
|
|
self.warmup = warmup |
|
|
self.warmup_counter = warmup |
|
|
|
|
|
|
|
|
self.warmup_lineno = warmup_lineno |
|
|
|
|
|
|
|
|
self._times = {} |
|
|
self._func_name = None |
|
|
self._func_filename = None |
|
|
self._last_time = -1 |
|
|
self._last_lineno = -1 |
|
|
self._func_hit_count = 0 |
|
|
self._func_firstlineno = 0 |
|
|
|
|
|
|
|
|
self._process = psutil.Process(os.getpid()) |
|
|
self._memory = {} |
|
|
self._last_memory = 0 |
|
|
|
|
|
|
|
|
self._gpu_memory = {} |
|
|
self._gpu_last_memory = 0 |
|
|
|
|
|
def __trace_func__(self, frame, event, arg): |
|
|
|
|
|
|
|
|
|
|
|
if self._func_firstlineno == frame.f_code.co_firstlineno and frame.f_code.co_name == self._func_name and frame.f_code.co_filename == self._func_filename: |
|
|
|
|
|
|
|
|
|
|
|
if self.cuda_sync is True: |
|
|
torch.cuda.synchronize() |
|
|
|
|
|
current_time = timeit.default_timer() |
|
|
memory = self._process.memory_info().rss |
|
|
gpu_memory = torch.cuda.memory_allocated() |
|
|
|
|
|
|
|
|
|
|
|
if frame.f_lineno not in self._times: |
|
|
self._times[frame.f_lineno] = {'hit':0, 'time': 0} |
|
|
self._memory[frame.f_lineno] = 0 |
|
|
self._gpu_memory[frame.f_lineno] = 0 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if event == 'call': |
|
|
self._last_time = current_time |
|
|
self._last_lineno = frame.f_lineno |
|
|
self._last_memory = memory |
|
|
self._last_gpu_memory = gpu_memory |
|
|
|
|
|
if self.warmup_lineno < 0: |
|
|
self.warmup_counter -= 1 |
|
|
if self.warmup_counter < 0: |
|
|
self._func_hit_count += 1 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if event == 'line' or event == 'return': |
|
|
|
|
|
if event == 'line' and self.warmup_counter < 0: |
|
|
self._times[frame.f_lineno]['hit'] += 1 |
|
|
|
|
|
|
|
|
|
|
|
if self.warmup_counter < 0: |
|
|
self._times[self._last_lineno]['time'] += current_time - self._last_time |
|
|
self._memory[self._last_lineno] += memory - self._last_memory |
|
|
self._gpu_memory[self._last_lineno] += gpu_memory - self._gpu_last_memory |
|
|
|
|
|
|
|
|
if self.cuda_sync is True: |
|
|
torch.cuda.synchronize() |
|
|
|
|
|
self._last_time = timeit.default_timer() |
|
|
self._last_memory = memory |
|
|
self._gpu_last_memory = gpu_memory |
|
|
self._last_lineno = frame.f_lineno |
|
|
|
|
|
return self.__trace_func__ |
|
|
|
|
|
def decorate(self, func): |
|
|
if self._func_name is not None: |
|
|
logger.warning(f'Only support decorate only one func. Aready decorated "{self._func_name}"') |
|
|
self._func_name = func.__name__ |
|
|
self._func_filename = func.__code__.co_filename |
|
|
self._func_firstlineno = func.__code__.co_firstlineno |
|
|
|
|
|
def _f(*args, **kwargs): |
|
|
origin_trace_func = sys.gettrace() |
|
|
sys.settrace(self.__trace_func__) |
|
|
ret = func(*args, **kwargs) |
|
|
sys.settrace(origin_trace_func) |
|
|
return ret |
|
|
return _f |
|
|
|
|
|
def _get_table(self): |
|
|
|
|
|
if len(self._times) <= 0: |
|
|
logger.warning(f"un recorded datas, please ensure the function is executed") |
|
|
return None |
|
|
|
|
|
|
|
|
with open(self._func_filename, 'r') as f: |
|
|
source_lines = [line.strip('\n') for line in f.readlines()] |
|
|
code_str = "\n".join(source_lines) |
|
|
|
|
|
def_lineno = min(self._times.keys()) |
|
|
final_lineno = max(self._times.keys()) |
|
|
|
|
|
|
|
|
pre_blank_count = len(source_lines[def_lineno-1]) - len(source_lines[def_lineno-1].lstrip(' ').lstrip('\t')) |
|
|
|
|
|
|
|
|
|
|
|
new_logic_linenos = [token.start[0] for token in tokenize.generate_tokens( |
|
|
io.StringIO(code_str).readline) if token.type == 4] |
|
|
|
|
|
|
|
|
|
|
|
sorted_linenos = [lineno for lineno in self._times.keys()] |
|
|
sorted_linenos.sort(key=int) |
|
|
|
|
|
lineno_cache = [] |
|
|
for lineno in sorted_linenos: |
|
|
if lineno not in new_logic_linenos: |
|
|
lineno_cache.append(lineno) |
|
|
else: |
|
|
|
|
|
if len(lineno_cache) <= 0: |
|
|
continue |
|
|
else: |
|
|
lineno_cache.append(lineno) |
|
|
first_lineno = lineno_cache[0] |
|
|
for prev_lineno in lineno_cache[1:]: |
|
|
self._times[first_lineno]["hit"] = min(self._times[first_lineno]["hit"], self._times[prev_lineno]["hit"]) |
|
|
self._times[first_lineno]["time"] += self._times[prev_lineno]["time"] |
|
|
del self._times[prev_lineno] |
|
|
|
|
|
self._memory[first_lineno] += self._memory[prev_lineno] |
|
|
del self._memory[prev_lineno] |
|
|
|
|
|
self._gpu_memory[first_lineno] += self._gpu_memory[prev_lineno] |
|
|
del self._gpu_memory[prev_lineno] |
|
|
lineno_cache = [] |
|
|
|
|
|
|
|
|
|
|
|
table = PrettyTable(['lineno', 'hits', 'time', 'time per hit', 'hit perc', 'time perc', 'mem inc', 'mem peak', 'gpu mem inc', 'gpu mem peak']) |
|
|
|
|
|
|
|
|
|
|
|
total_hit = 0 |
|
|
total_time = 0 |
|
|
for lineno, stats in self._times.items(): |
|
|
if lineno == def_lineno: continue |
|
|
total_hit += stats['hit'] |
|
|
total_time += stats['time'] |
|
|
|
|
|
total_memory = sum([m for l,m in self._memory.items()]) / 1024 / 1024 |
|
|
total_gpu_memory = sum([m for l,m in self._gpu_memory.items()]) / 1024 / 1024 |
|
|
|
|
|
|
|
|
peak_cpu_memory = 0 |
|
|
peak_gpu_memory = 0 |
|
|
for lineno in range(def_lineno, final_lineno+1): |
|
|
if lineno not in self._times: |
|
|
|
|
|
table.add_row([lineno, '-', '-', '-', '-', '-', '-',f'{peak_cpu_memory:5.3f} MB', '-', f'{peak_gpu_memory:5.3f} MB']) |
|
|
else: |
|
|
stats = self._times[lineno] |
|
|
if lineno == def_lineno: |
|
|
table.add_row([lineno, self._func_hit_count, f'{total_time*self.base_n:.4f} {self.base}', f'{total_time/self._func_hit_count*self.base_n:.4f} {self.base}', '-', '-', f'{total_memory:5.3f} MB', 'baseline', f'{total_gpu_memory:5.3f} MB', 'baseline']) |
|
|
else: |
|
|
|
|
|
line_result = [lineno, stats['hit'], |
|
|
f'{stats["time"]*self.base_n:.4f} {self.base}', |
|
|
f'{stats["time"]/stats["hit"]*self.base_n:.4f} {self.base}' if stats['hit'] > 0 else 'nan', |
|
|
f'{stats["hit"]/total_hit*100:.3f}%' if total_hit > 0 else 'nan', |
|
|
f'{stats["time"]/total_time*100:.3f}%'] if total_time > 0 else 'nan' |
|
|
|
|
|
line_result += [f'{self._memory[lineno]/1024/1024:5.3f} MB' if stats['hit'] > 0 else '0 MB'] |
|
|
peak_cpu_memory = peak_cpu_memory + self._memory[lineno]/1024/1024 |
|
|
line_result += [f'{peak_cpu_memory:5.3f} MB'] |
|
|
|
|
|
line_result += [f'{self._gpu_memory[lineno]/1024/1024:5.3f} MB' if stats['hit'] > 0 else '0 MB'] |
|
|
peak_gpu_memory = peak_gpu_memory + self._gpu_memory[lineno]/1024/1024 |
|
|
line_result += [f'{peak_gpu_memory:5.3f} MB'] |
|
|
|
|
|
table.add_row(line_result) |
|
|
|
|
|
table.add_column('sources', [source_lines[i-1][pre_blank_count:] if len(source_lines[i-1])>pre_blank_count else '' for i in range(def_lineno, final_lineno+1)], 'l') |
|
|
return table |
|
|
|
|
|
def print(self, filename=None, mode="w"): |
|
|
introducation = ''' |
|
|
1. The first line of table reports the overall results of the whole function and the following lines reports the statistics of each line in the function. |
|
|
2. The `hit perc` and `time perc` represent `hit percentage` and `time percentage`. |
|
|
3. For memory, there exists four categories `mem inc`, `mem peak`, `gpu mem inc` and `gpu mem peak`. They denotes `cpu memory increasement`, `cpu memory peak`, `gpu memory increasement` and `gpu memory peak`. All the results are collected in the last run. The number in the increasement field denots the increasement of corresponding memory of each line (the first line is related to the whole function). Sometimes, the number of each line is far less of the number of the first line, which is valid since python may auto release the unused memory after the function execution. The number of each line in the peak filed is a simple sum of the numbers of above lines in the increasement field, which is used to demonstrate the possible maxinum memory usage in the function. |
|
|
4. For any issue, please concact us via https://github.com/xtudbxk/lineprofiler or zhengqiang.zhang@hotmail.com |
|
|
''' |
|
|
print(introducation) |
|
|
|
|
|
table = PrettyTable(['lineno', 'hits', 'time', 'time per hit', 'hit perc', 'time perc', 'mem inc', 'mem peak', 'gpu mem inc', 'gpu mem peak']) |
|
|
table = self._get_table() |
|
|
print(table) |
|
|
if filename is not None: |
|
|
with open(filename, mode) as f: |
|
|
f.write(introducation) |
|
|
f.write(f"args - base={self.base}, cuda_sync={self.cuda_sync}, gpuids={self.gpuids}, warmup={self.warmup}\n") |
|
|
f.write(str(table)) |
|
|
|
|
|
if __name__ == '__main__': |
|
|
import numpy as np |
|
|
def mytest(h='hello', |
|
|
xx="xx"): |
|
|
|
|
|
h = h + 'world' |
|
|
a = [] |
|
|
for _ in range(200): |
|
|
|
|
|
a.append(np.zeros((1000, 1000), dtype=np.float32)) |
|
|
a.append( |
|
|
np.zeros((1000, 1000), |
|
|
dtype=np.float32)) |
|
|
|
|
|
print(h) |
|
|
|
|
|
profiler = MyLineProfiler(cuda_sync=False, warmup=2) |
|
|
mytest = profiler.decorate(mytest) |
|
|
for _ in range(5): |
|
|
mytest() |
|
|
profiler.print() |
|
|
|