Spaces:

rawanessam
/

FloorPlanTransformation

Running

App Files Files Community

FloorPlanTransformation / util /py /check_gpu.py

rawanessam

Upload 79 files

e9fe176 verified 7 months ago

raw

history blame contribute delete

5.88 kB

	import os
	import sys
	import subprocess
	import re
	import math
	import signal


	################## generate hosts list #####################
	prefix = 'visiongpu'
	suffix = 'bill'
	ind_list = range(1, 15) # range(14)?
	ind_high_priority_list = [2,4,5,12,13,14]
	ind_low_priority_list = [ind for ind in ind_list if ind not in ind_high_priority_list]
	hosts_hp = []
	for ind in ind_high_priority_list:
	host = prefix + "{:0>2d}".format(ind)
	if ind > 11:
	host += suffix
	hosts_hp.append(host)

	hosts_lp = []
	for ind in ind_low_priority_list:
	host = prefix + "{:0>2d}".format(ind)
	if ind > 11:
	host += suffix
	hosts_lp.append(host)


	users = ['ckzhang','jiajunwu']
	timeout_limit = 4 # sec

	####################### util functions ########################

	#### Timeout Control ###
	class TimeoutError(Exception):
	pass

	class timeout:
	def __init__(self, seconds=10, error_message='Timeout'):
	self.seconds = seconds
	self.error_message = error_message
	def handle_timeout(self, signum, frame):
	raise TimeoutError(self.error_message)
	def __enter__(self):
	signal.signal(signal.SIGALRM, self.handle_timeout)
	signal.alarm(self.seconds)
	def __exit__(self, type, value, traceback):
	signal.alarm(0)

	#### Color output ####
	class bcolors:
	HEADER = '\033[95m'
	OKBLUE = '\033[94m'
	OKGREEN = '\033[92m'
	WARNING = '\033[93m'
	FAIL = '\033[91m'
	ENDC = '\033[0m'
	BOLD = '\033[1m'
	UNDERLINE = '\033[4m'

	###################### Basic functions #######################

	def ssh_nvidia(host):
	""" retrieve nvidia-smi data from given host"""
	print "checking "+host+'\t',
	try:
	with timeout(seconds=timeout_limit):
	result = subprocess.check_output('ssh '+host+' "nvidia-smi;exit"', shell=True)
	print "done"
	return result
	except TimeoutError, exc:
	print "timeout"
	return ""

	def parse_nvidia(output, users=None):
	"""
	@param output nvidia-smi output
	@return list of tuples of (occupied mem, total mem, util percentile, user memory usage), one for each gpu
	"""
	if output == "":
	return [(-1, -1, -1, 0)]
	lines = output.strip().split('\n')
	gpu_counter = 0
	result = []
	for line in lines:
	# gpu line
	if len(re.findall('[0-9]+MiB\s/\s[0-9]+MiB', line)) > 0:
	data = re.findall('[0-9]+MiB\s/\s[0-9]+MiB\s\\|\s[0-9]+%', line)[0]
	occupied = int(re.findall('[0-9]+', data)[0])
	total = int(re.findall('[0-9]+',data)[1])
	util = int(re.findall('[0-9]+',data)[2])
	result.append( (occupied, total, util))

	gpu_counter += 1

	usage = [0]*gpu_counter
	if users != None:
	user_pattern = "\|".join(users)
	for line in lines:
	# find user's usage
	if len(re.findall(user_pattern, line)) > 0:
	gpu_id = int(re.findall('[0-9]+',line)[0])
	mem = int(re.findall('[0-9]+MiB',line)[-1][:-3])
	usage[gpu_id] += mem
	for gpu_id in xrange(len(usage)):
	result[gpu_id] = result[gpu_id] + tuple([usage[gpu_id]])

	return result

	def collect_gpu_data(hosts, users=None):
	""" given list of hosts, return map of hostname to gpu usage list defined by parse_nvidia"""
	result = dict()
	for host in hosts:
	result[host] = parse_nvidia(ssh_nvidia(host), users)
	return result

	def sort_gpu(hosts_lp, hosts_hp=[], util_thres=10, mem_thres=1000):
	"""
	Given lists of hosts and threshold, return number of available hosts
	"""
	map_hp = collect_gpu_data(hosts_hp, users)
	map_lp = collect_gpu_data(hosts_lp, users)
	print "All GPU checked. "
	print "Sorting..."
	map_merge = map_hp.copy()
	map_merge.update(map_lp)
	map_avail = dict()
	for host in map_merge:
	usable = 0
	mem = 0
	for gpu in map_merge[host]:
	if gpu[0] < 0: # Timeout
	mem = -1
	usable = -1
	elif gpu[0] < mem_thres and gpu[2] < util_thres:
	mem += (gpu[1]-gpu[0])/1000 # available memory
	usable += 1 # available gpu
	map_avail[host] = (usable, mem)

	hosts_lp_sorted = sorted(hosts_lp, key=lambda host: map_avail[host], reverse=True)
	hosts_hp_sorted = sorted(hosts_hp, key=lambda host: map_avail[host], reverse=True)

	return hosts_hp_sorted + hosts_lp_sorted, map_merge, map_avail

	def display(hosts, resources, map_avail):
	""" Given list of hosts, map to resources and map to available gpu numbers and total memories, display them in order
	Display format: hostname, available # gpu, total mem of available gpu, user used mem, then mem usage of each gpu is displayed.
	"""
	print bcolors.HEADER + "hostname\t#gpus\tava mem\tuser mem gpu-specific mem usage" + bcolors.ENDC

	for host in hosts:
	if resources[host][0][0] == -1: # Timeout
	out = host+'\t'+"Timeout"
	else:
	out = host+'\t'+ str(map_avail[host][0])+'\t'+ str(map_avail[host][1])+'G'+'\t'+ str(int(math.ceil(sum([gpu[3] for gpu in resources[host]])/1000.)))+'G\t'
	for gpu in resources[host]:
	out += str(gpu[0]).rjust(6)+'/'+str(gpu[1]).rjust(6)+'\t'

	if map_avail[host][0] == 4:
	print bcolors.OKGREEN + out + bcolors.ENDC
	elif sum([gpu[3] for gpu in resources[host]]) > 0:
	print bcolors.OKBLUE + out + bcolors.ENDC
	elif resources[host][0][0] == -1: # Timeout
	print bcolors.FAIL + out + bcolors.ENDC
	else:
	print out

	## test ##

	#hosts_sorted, resources, map_avail = sort_gpu([], hosts_hp)
	hosts_sorted, resources, map_avail = sort_gpu(hosts_lp, hosts_hp)
	display(hosts_sorted, resources, map_avail)