rawanessam's picture
Upload 79 files
e9fe176 verified
import os
import sys
import subprocess
import re
import math
import signal
################## generate hosts list #####################
prefix = 'visiongpu'
suffix = 'bill'
ind_list = range(1, 15) # range(14)?
ind_high_priority_list = [2,4,5,12,13,14]
ind_low_priority_list = [ind for ind in ind_list if ind not in ind_high_priority_list]
hosts_hp = []
for ind in ind_high_priority_list:
host = prefix + "{:0>2d}".format(ind)
if ind > 11:
host += suffix
hosts_hp.append(host)
hosts_lp = []
for ind in ind_low_priority_list:
host = prefix + "{:0>2d}".format(ind)
if ind > 11:
host += suffix
hosts_lp.append(host)
users = ['ckzhang','jiajunwu']
timeout_limit = 4 # sec
####################### util functions ########################
#### Timeout Control ###
class TimeoutError(Exception):
pass
class timeout:
def __init__(self, seconds=10, error_message='Timeout'):
self.seconds = seconds
self.error_message = error_message
def handle_timeout(self, signum, frame):
raise TimeoutError(self.error_message)
def __enter__(self):
signal.signal(signal.SIGALRM, self.handle_timeout)
signal.alarm(self.seconds)
def __exit__(self, type, value, traceback):
signal.alarm(0)
#### Color output ####
class bcolors:
HEADER = '\033[95m'
OKBLUE = '\033[94m'
OKGREEN = '\033[92m'
WARNING = '\033[93m'
FAIL = '\033[91m'
ENDC = '\033[0m'
BOLD = '\033[1m'
UNDERLINE = '\033[4m'
###################### Basic functions #######################
def ssh_nvidia(host):
""" retrieve nvidia-smi data from given host"""
print "checking "+host+'\t',
try:
with timeout(seconds=timeout_limit):
result = subprocess.check_output('ssh '+host+' "nvidia-smi;exit"', shell=True)
print "done"
return result
except TimeoutError, exc:
print "timeout"
return ""
def parse_nvidia(output, users=None):
"""
@param output nvidia-smi output
@return list of tuples of (occupied mem, total mem, util percentile, user memory usage), one for each gpu
"""
if output == "":
return [(-1, -1, -1, 0)]
lines = output.strip().split('\n')
gpu_counter = 0
result = []
for line in lines:
# gpu line
if len(re.findall('[0-9]+MiB\s*/\s*[0-9]+MiB', line)) > 0:
data = re.findall('[0-9]+MiB\s*/\s*[0-9]+MiB\s*\|\s*[0-9]+%', line)[0]
occupied = int(re.findall('[0-9]+', data)[0])
total = int(re.findall('[0-9]+',data)[1])
util = int(re.findall('[0-9]+',data)[2])
result.append( (occupied, total, util))
gpu_counter += 1
usage = [0]*gpu_counter
if users != None:
user_pattern = "|".join(users)
for line in lines:
# find user's usage
if len(re.findall(user_pattern, line)) > 0:
gpu_id = int(re.findall('[0-9]+',line)[0])
mem = int(re.findall('[0-9]+MiB',line)[-1][:-3])
usage[gpu_id] += mem
for gpu_id in xrange(len(usage)):
result[gpu_id] = result[gpu_id] + tuple([usage[gpu_id]])
return result
def collect_gpu_data(hosts, users=None):
""" given list of hosts, return map of hostname to gpu usage list defined by parse_nvidia"""
result = dict()
for host in hosts:
result[host] = parse_nvidia(ssh_nvidia(host), users)
return result
def sort_gpu(hosts_lp, hosts_hp=[], util_thres=10, mem_thres=1000):
"""
Given lists of hosts and threshold, return number of available hosts
"""
map_hp = collect_gpu_data(hosts_hp, users)
map_lp = collect_gpu_data(hosts_lp, users)
print "All GPU checked. "
print "Sorting..."
map_merge = map_hp.copy()
map_merge.update(map_lp)
map_avail = dict()
for host in map_merge:
usable = 0
mem = 0
for gpu in map_merge[host]:
if gpu[0] < 0: # Timeout
mem = -1
usable = -1
elif gpu[0] < mem_thres and gpu[2] < util_thres:
mem += (gpu[1]-gpu[0])/1000 # available memory
usable += 1 # available gpu
map_avail[host] = (usable, mem)
hosts_lp_sorted = sorted(hosts_lp, key=lambda host: map_avail[host], reverse=True)
hosts_hp_sorted = sorted(hosts_hp, key=lambda host: map_avail[host], reverse=True)
return hosts_hp_sorted + hosts_lp_sorted, map_merge, map_avail
def display(hosts, resources, map_avail):
""" Given list of hosts, map to resources and map to available gpu numbers and total memories, display them in order
Display format: hostname, available # gpu, total mem of available gpu, user used mem, then mem usage of each gpu is displayed.
"""
print bcolors.HEADER + "hostname\t#gpus\tava mem\tuser mem gpu-specific mem usage" + bcolors.ENDC
for host in hosts:
if resources[host][0][0] == -1: # Timeout
out = host+'\t'+"Timeout"
else:
out = host+'\t'+ str(map_avail[host][0])+'\t'+ str(map_avail[host][1])+'G'+'\t'+ str(int(math.ceil(sum([gpu[3] for gpu in resources[host]])/1000.)))+'G\t'
for gpu in resources[host]:
out += str(gpu[0]).rjust(6)+'/'+str(gpu[1]).rjust(6)+'\t'
if map_avail[host][0] == 4:
print bcolors.OKGREEN + out + bcolors.ENDC
elif sum([gpu[3] for gpu in resources[host]]) > 0:
print bcolors.OKBLUE + out + bcolors.ENDC
elif resources[host][0][0] == -1: # Timeout
print bcolors.FAIL + out + bcolors.ENDC
else:
print out
## test ##
#hosts_sorted, resources, map_avail = sort_gpu([], hosts_hp)
hosts_sorted, resources, map_avail = sort_gpu(hosts_lp, hosts_hp)
display(hosts_sorted, resources, map_avail)