Spaces:

mutou0308
/

GSASR

Running on Zero

GSASR / utils /gs_cuda_dmax /mylineprofiler.py

mt-cly

init

909940e 6 months ago

11.9 kB

	import os
	import io
	import sys
	import timeit
	import tokenize
	import torch
	import psutil
	import inspect
	from loguru import logger
	from prettytable import PrettyTable

	# implemented by xtudbxk
	# github: https://github.com/xtudbxk/lineprofiler
	class MyLineProfiler():
	    """Per-line time and memory profiler for a single decorated function.

	    The profiler installs itself through ``sys.settrace`` while the decorated
	    function runs. For every traced line it accumulates wall-clock time, the
	    change in the process RSS (via ``psutil``) and the change in
	    ``torch.cuda.memory_allocated()``. ``print`` renders the collected
	    statistics as a ``PrettyTable`` next to the function's source code.
	    """

	    def __init__(self, base='ms', cuda_sync=True, gpuids=(0,), warmup=0, warmup_lineno=-1):
	        """Create a profiler.

	        Args:
	            base: time unit of the report, ``'ms'`` or ``'s'``. Any other
	                value logs a warning and falls back to ``'s'``.
	            cuda_sync: when ``True``, ``torch.cuda.synchronize()`` is called
	                around every measurement.
	            gpuids: stored and echoed in the report header written by
	                ``print``; the tracing code below does not otherwise read it.
	            warmup: number of initial calls of the decorated function whose
	                statistics are discarded.
	            warmup_lineno: a negative value (the default) counts warm-up per
	                function call.
	                NOTE(review): with a non-negative value the warm-up counter is
	                never decremented by the code in this class, so nothing is
	                recorded - confirm the intended per-line warm-up semantics.
	        """
	        if base == 'ms':
	            self.base_n = 1000
	        elif base == 's':
	            self.base_n = 1
	        else:
	            logger.warning(f'Unsupported base - {base}, using "s" instead')
	            # actually apply the fallback announced by the warning
	            base = 's'
	            self.base_n = 1

	        self.base = base
	        self.cuda_sync = cuda_sync
	        self.gpuids = gpuids
	        self.warmup = warmup
	        # statistics are recorded only once this counter has dropped below zero
	        self.warmup_counter = warmup
	        self.warmup_lineno = warmup_lineno

	        # for time profiling
	        self._times = {}            # lineno -> {'hit': int, 'time': seconds}
	        self._func_name = None
	        self._func_filename = None
	        self._last_time = -1
	        self._last_lineno = -1
	        self._func_hit_count = 0    # calls counted after the warm-up
	        self._func_firstlineno = 0

	        # for memory profiling
	        self._process = psutil.Process(os.getpid())
	        self._memory = {}           # lineno -> accumulated RSS delta in bytes
	        self._last_memory = 0

	        # for cuda memory profiling
	        self._gpu_memory = {}       # lineno -> accumulated allocated-bytes delta
	        self._gpu_last_memory = 0

	    def __trace_func__(self, frame, event, arg):
	        """Trace callback handed to ``sys.settrace``.

	        Args:
	            frame: the frame the event belongs to.
	            event: trace event name; ``'call'``, ``'line'`` and ``'return'``
	                are handled, anything else is ignored.
	            arg: event payload supplied by the interpreter (unused).

	        Returns:
	            This method for frames of the decorated function (so their lines
	            keep being traced), ``None`` for every other frame.
	        """
	        code = frame.f_code
	        is_target = (
	            self._func_firstlineno == code.co_firstlineno
	            and code.co_name == self._func_name
	            and code.co_filename == self._func_filename
	        )
	        if not is_target:
	            # Nothing is recorded for other frames; returning None switches
	            # off line tracing inside them, so callee code does not pay the
	            # tracing overhead that would otherwise inflate the measurements.
	            return None

	        # --- obtain info for current hit ---
	        if self.cuda_sync is True:
	            torch.cuda.synchronize()

	        current_time = timeit.default_timer()
	        memory = self._process.memory_info().rss
	        gpu_memory = torch.cuda.memory_allocated()
	        lineno = frame.f_lineno

	        # --- initialize the stats on the first hit of this line ---
	        if lineno not in self._times:
	            self._times[lineno] = {'hit': 0, 'time': 0}
	            self._memory[lineno] = 0
	            self._gpu_memory[lineno] = 0

	        if event == 'call':
	            # entering the decorated function: reset every baseline
	            self._last_time = current_time
	            self._last_lineno = lineno
	            self._last_memory = memory
	            self._gpu_last_memory = gpu_memory

	            if self.warmup_lineno < 0:
	                self.warmup_counter -= 1
	            if self.warmup_counter < 0:
	                self._func_hit_count += 1

	        elif event in ('line', 'return'):
	            # A 'line' event fires before the line runs, so the deltas
	            # measured here belong to the previously seen line.
	            if self.warmup_counter < 0:
	                if event == 'line':
	                    self._times[lineno]['hit'] += 1
	                self._times[self._last_lineno]['time'] += current_time - self._last_time
	                self._memory[self._last_lineno] += memory - self._last_memory
	                self._gpu_memory[self._last_lineno] += gpu_memory - self._gpu_last_memory

	            if self.cuda_sync is True:
	                torch.cuda.synchronize()

	            # restart the clock after the bookkeeping above
	            self._last_time = timeit.default_timer()
	            self._last_memory = memory
	            self._gpu_last_memory = gpu_memory
	            self._last_lineno = lineno

	        return self.__trace_func__

	    def decorate(self, func):
	        """Wrap ``func`` so that every call of the wrapper is profiled.

	        Only one function per profiler is supported; decorating a second one
	        logs a warning and replaces the target.

	        Args:
	            func: a Python function (its ``__code__`` is used to recognise
	                its frames).

	        Returns:
	            A wrapper that forwards all positional and keyword arguments and
	            returns the result of ``func``.
	        """
	        import functools  # local import: independent of the file's import list

	        if self._func_name is not None:
	            logger.warning(f'Only support decorate only one func. Aready decorated "{self._func_name}"')
	        code = func.__code__
	        self._func_name = func.__name__
	        self._func_filename = code.co_filename
	        self._func_firstlineno = code.co_firstlineno

	        @functools.wraps(func)
	        def _f(*args, **kwargs):
	            origin_trace_func = sys.gettrace()
	            sys.settrace(self.__trace_func__)
	            try:
	                return func(*args, **kwargs)
	            finally:
	                # restore the previous tracer even when func raises
	                sys.settrace(origin_trace_func)
	        return _f

	    @staticmethod
	    def _logical_line_starts(code_str):
	        """Map each physical line number to the first line of its logical line.

	        Lines that are not part of any statement (blank or comment-only lines
	        between statements) do not appear in the result.
	        """
	        skipped = {tokenize.NL, tokenize.COMMENT, tokenize.INDENT,
	                   tokenize.DEDENT, tokenize.ENCODING, tokenize.ENDMARKER}
	        starts = {}
	        first = None
	        for token in tokenize.generate_tokens(io.StringIO(code_str).readline):
	            if token.type in skipped:
	                continue
	            if first is None:
	                first = token.start[0]
	            if token.type == tokenize.NEWLINE:  # end of a logical line
	                for physical in range(first, token.start[0] + 1):
	                    starts[physical] = first
	                first = None
	        return starts

	    def _merge_multiline_stats(self, code_str):
	        """Fold the stats of a multi-line statement into its first recorded line.

	        Recorded lines are grouped by the logical line they belong to, so a
	        statement whose closing line produced no trace event (for example a
	        lone ``)``) is never merged with the statement that follows it.
	        """
	        starts = self._logical_line_starts(code_str)
	        owners = {}  # logical-line start -> first recorded physical line
	        for lineno in sorted(self._times):
	            owner = owners.setdefault(starts.get(lineno, lineno), lineno)
	            if owner == lineno:
	                continue
	            merged = self._times.pop(lineno)
	            self._times[owner]['hit'] = min(self._times[owner]['hit'], merged['hit'])
	            self._times[owner]['time'] += merged['time']
	            self._memory[owner] += self._memory.pop(lineno)
	            self._gpu_memory[owner] += self._gpu_memory.pop(lineno)

	    def _get_table(self):
	        """Build the report table.

	        Returns:
	            A ``PrettyTable`` with one row per source line between the first
	            and the last recorded line, or ``None`` (after logging a warning)
	            when nothing has been recorded yet.
	        """
	        if not self._times:
	            logger.warning("un recorded datas, please ensure the function is executed")
	            return None

	        # --- load the source code ---
	        with open(self._func_filename, 'r') as f:
	            source_lines = [line.strip('\n') for line in f.readlines()]
	        code_str = "\n".join(source_lines)

	        def_lineno = min(self._times)
	        final_lineno = max(self._times)

	        # width of the leading blanks of the def line, removed from every
	        # source line shown in the table
	        def_line = source_lines[def_lineno - 1]
	        pre_blank_count = len(def_line) - len(def_line.lstrip(' ').lstrip('\t'))

	        # --- merge the stats of multi-line code ---
	        self._merge_multiline_stats(code_str)

	        table = PrettyTable(['lineno', 'hits', 'time', 'time per hit', 'hit perc', 'time perc', 'mem inc', 'mem peak', 'gpu mem inc', 'gpu mem peak'])

	        # --- totals; the def line is excluded from the hit/time sums ---
	        total_hit = 0
	        total_time = 0
	        for lineno, stats in self._times.items():
	            if lineno == def_lineno:
	                continue
	            total_hit += stats['hit']
	            total_time += stats['time']

	        total_memory = sum(self._memory.values()) / 1024 / 1024
	        total_gpu_memory = sum(self._gpu_memory.values()) / 1024 / 1024

	        # running sums of the per-line increments shown in the "peak" columns
	        peak_cpu_memory = 0
	        peak_gpu_memory = 0
	        for lineno in range(def_lineno, final_lineno + 1):
	            if lineno not in self._times:
	                # the comment line, empty line or merged line from multi-lines code
	                table.add_row([lineno, '-', '-', '-', '-', '-', '-', f'{peak_cpu_memory:5.3f} MB', '-', f'{peak_gpu_memory:5.3f} MB'])
	                continue

	            stats = self._times[lineno]
	            if lineno == def_lineno:
	                # summary row for the whole function
	                if self._func_hit_count > 0:
	                    per_call = f'{total_time/self._func_hit_count*self.base_n:.4f} {self.base}'
	                else:
	                    per_call = 'nan'  # every call so far was a warm-up call
	                table.add_row([lineno, self._func_hit_count, f'{total_time*self.base_n:.4f} {self.base}', per_call, '-', '-', f'{total_memory:5.3f} MB', 'baseline', f'{total_gpu_memory:5.3f} MB', 'baseline'])
	                continue

	            hits = stats['hit']
	            elapsed = stats['time']
	            line_result = [
	                lineno,
	                hits,
	                f'{elapsed*self.base_n:.4f} {self.base}',
	                f'{elapsed/hits*self.base_n:.4f} {self.base}' if hits > 0 else 'nan',
	                f'{hits/total_hit*100:.3f}%' if total_hit > 0 else 'nan',
	                f'{elapsed/total_time*100:.3f}%' if total_time > 0 else 'nan',
	            ]

	            line_result += [f'{self._memory[lineno]/1024/1024:5.3f} MB' if hits > 0 else '0 MB']
	            peak_cpu_memory = peak_cpu_memory + self._memory[lineno] / 1024 / 1024
	            line_result += [f'{peak_cpu_memory:5.3f} MB']

	            line_result += [f'{self._gpu_memory[lineno]/1024/1024:5.3f} MB' if hits > 0 else '0 MB']
	            peak_gpu_memory = peak_gpu_memory + self._gpu_memory[lineno] / 1024 / 1024
	            line_result += [f'{peak_gpu_memory:5.3f} MB']

	            table.add_row(line_result)

	        table.add_column('sources', [source_lines[i-1][pre_blank_count:] if len(source_lines[i-1]) > pre_blank_count else '' for i in range(def_lineno, final_lineno + 1)], 'l')
	        return table

	    def print(self, filename=None, mode="w"):
	        """Print the report and optionally write it to a file.

	        Args:
	            filename: when given, the introduction, the profiler arguments
	                and the table are also written to this file.
	            mode: mode passed to ``open`` for ``filename``.

	        When nothing has been recorded only the introduction is printed and
	        no file is written.
	        """
	        introduction = '''
	1. The first line of table reports the overall results of the whole function and the following lines reports the statistics of each line in the function.
	2. The `hit perc` and `time perc` represent `hit percentage` and `time percentage`.
	3. For memory, there exists four categories `mem inc`, `mem peak`, `gpu mem inc` and `gpu mem peak`. They denotes `cpu memory increasement`, `cpu memory peak`, `gpu memory increasement` and `gpu memory peak`. All the results are collected in the last run. The number in the increasement field denots the increasement of corresponding memory of each line (the first line is related to the whole function). Sometimes, the number of each line is far less of the number of the first line, which is valid since python may auto release the unused memory after the function execution. The number of each line in the peak filed is a simple sum of the numbers of above lines in the increasement field, which is used to demonstrate the possible maxinum memory usage in the function.
	4. For any issue, please concact us via https://github.com/xtudbxk/lineprofiler or zhengqiang.zhang@hotmail.com
	'''
	        print(introduction)

	        table = self._get_table()
	        if table is None:
	            # _get_table already logged why there is nothing to show
	            return
	        print(table)
	        if filename is not None:
	            with open(filename, mode) as f:
	                f.write(introduction)
	                f.write(f"args - base={self.base}, cuda_sync={self.cuda_sync}, gpuids={self.gpuids}, warmup={self.warmup}\n")
	                f.write(str(table))

	if __name__ == '__main__':
	    # Small self-test: profile a function that allocates numpy arrays.
	    import numpy as np

	    def mytest(h='hello',
	               xx="xx"):
	        # The signature above and the second append below span several
	        # physical lines, so the report also covers multi-line statements.
	        h = h + 'world'
	        arrays = []
	        for _step in range(200):
	            arrays.append(np.zeros((1000, 1000), dtype=np.float32))
	            arrays.append(
	                np.zeros((1000, 1000),
	                         dtype=np.float32))
	        print(h)

	    # the first two calls are warm-up and are excluded from the statistics
	    profiler = MyLineProfiler(cuda_sync=False, warmup=2)
	    mytest = profiler.decorate(mytest)
	    for _run in range(5):
	        mytest()
	    profiler.print()