Spaces:
Sleeping
Sleeping
File size: 5,884 Bytes
e9fe176 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 | import os
import sys
import subprocess
import re
import math
import signal
################## generate hosts list #####################
prefix = 'visiongpu'
suffix = 'bill'
# range(1, 15) yields the machine indices 1..14 inclusive; range(14) would
# start at 0 and stop at 13, which does not match the indices listed below.
ind_list = range(1, 15)
ind_high_priority_list = [2, 4, 5, 12, 13, 14]
ind_low_priority_list = [ind for ind in ind_list
                         if ind not in ind_high_priority_list]


def _host_name(ind):
    """Return the hostname for machine index `ind`.

    The name is `prefix` followed by the index zero-padded to two digits;
    indices above 11 additionally get `suffix` appended.
    """
    host = prefix + "{:0>2d}".format(ind)
    if ind > 11:
        host += suffix
    return host


# Hosts are kept in two groups; the high-priority group is listed first in
# the final report.
hosts_hp = [_host_name(ind) for ind in ind_high_priority_list]
hosts_lp = [_host_name(ind) for ind in ind_low_priority_list]

# User names whose GPU memory usage is reported separately.
users = ['ckzhang', 'jiajunwu']
# Maximum time to wait for one host's ssh query.
timeout_limit = 4 # sec
####################### util functions ########################
#### Timeout Control ###
class TimeoutError(Exception):
    """Signals that a time-limited operation did not finish in time."""
class timeout(object):
    """Context manager that raises TimeoutError if its body runs too long.

    Implemented with SIGALRM, so it relies on `signal.alarm` being available
    and on `signal.signal` being callable from the current thread.

    @param seconds        whole seconds to allow before the alarm fires
    @param error_message  message carried by the raised TimeoutError
    """

    def __init__(self, seconds=10, error_message='Timeout'):
        self.seconds = seconds
        self.error_message = error_message
        # SIGALRM handler that was active before __enter__, restored on exit.
        self._previous_handler = None

    def handle_timeout(self, signum, frame):
        """Signal handler: abort the guarded body with TimeoutError."""
        raise TimeoutError(self.error_message)

    def __enter__(self):
        # signal.signal returns the handler it replaces; remember it so this
        # context manager does not permanently hijack SIGALRM.
        self._previous_handler = signal.signal(signal.SIGALRM,
                                               self.handle_timeout)
        signal.alarm(self.seconds)
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        # Cancel any pending alarm first so it cannot fire after we leave.
        signal.alarm(0)
        # A None previous handler means it was not installed from Python and
        # cannot be passed back to signal.signal, so leave ours in place.
        if self._previous_handler is not None:
            signal.signal(signal.SIGALRM, self._previous_handler)
#### Color output ####
class bcolors:
    """Terminal escape sequences used to colour the report lines."""

    # colours
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    # reset: appended after coloured text
    ENDC = '\033[0m'
    # text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
###################### Basic functions #######################
def ssh_nvidia(host):
    """Retrieve nvidia-smi output from the given host over ssh.

    Prints a one-line progress message ("checking <host>" followed by
    "done", "timeout" or "failed").

    @param host hostname passed to ssh
    @return the text written to stdout by the remote nvidia-smi, or "" when
            the query exceeded `timeout_limit` seconds or ssh exited with a
            non-zero status
    """
    # Written without a newline so the outcome lands on the same line; flush
    # so the host name is visible while the query is still running.
    sys.stdout.write("checking " + host + '\t')
    sys.stdout.flush()

    # Argument list instead of a shell string: no local shell is involved and
    # the host name cannot be re-interpreted.  universal_newlines=True makes
    # the output text (not bytes) on both Python 2 and Python 3.
    proc = subprocess.Popen(['ssh', host, 'nvidia-smi;exit'],
                            stdout=subprocess.PIPE,
                            universal_newlines=True)
    try:
        with timeout(seconds=timeout_limit):
            result = proc.communicate()[0]
    except TimeoutError:
        # Do not leave the ssh client running in the background.
        if proc.poll() is None:
            proc.kill()
        proc.wait()
        if proc.stdout is not None:
            proc.stdout.close()
        print("timeout")
        return ""

    if proc.returncode != 0:
        # An unreachable or misbehaving host must not abort the whole scan;
        # report it like an unavailable host instead.
        print("failed")
        return ""

    print("done")
    return result
def parse_nvidia(output, users=None):
    """
    Parse the text printed by nvidia-smi.

    @param output nvidia-smi output; "" means the host could not be queried
    @param users  optional list of user names; memory of process lines that
                  contain one of these names is summed per gpu
    @return list of tuples of (occupied mem, total mem, util percentile,
            user memory usage), one for each gpu. Memory values are the
            numbers printed in front of "MiB". For an empty `output` the
            result is the single placeholder [(-1, -1, -1, 0)]. A gpu whose
            utilisation is not printed as a number gets util -1.
    """
    if output == "":
        return [(-1, -1, -1, 0)]
    lines = output.strip().split('\n')

    # A gpu line looks like "...  1500MiB / 12189MiB |   45%  ...".  The
    # utilisation part is optional so that a non-numeric value does not make
    # the whole line unparsable.
    gpu_line = re.compile(
        r'([0-9]+)MiB\s*/\s*([0-9]+)MiB(?:\s*\|\s*([0-9]+)%)?')
    stats = []
    for line in lines:
        match = gpu_line.search(line)
        if match is None:
            continue
        occupied, total, util = match.groups()
        stats.append((int(occupied), int(total),
                      -1 if util is None else int(util)))

    usage = [0] * len(stats)
    # `if users` (not `is not None`): an empty list would otherwise build an
    # empty pattern that matches every line.
    if users:
        # Escape the names so they are matched literally.
        user_line = re.compile("|".join(re.escape(user) for user in users))
        for line in lines:
            # find user's usage
            if user_line.search(line) is None:
                continue
            numbers = re.findall(r'[0-9]+', line)
            mem_fields = re.findall(r'([0-9]+)MiB', line)
            if not numbers or not mem_fields:
                continue
            # The first number on the line is taken as the gpu index and the
            # last "<n>MiB" field as the memory used.
            gpu_id = int(numbers[0])
            if gpu_id < len(usage):
                usage[gpu_id] += int(mem_fields[-1])

    return [gpu + (used,) for gpu, used in zip(stats, usage)]
def collect_gpu_data(hosts, users=None):
    """Query each host in order and map its name to the parse_nvidia result."""
    return {name: parse_nvidia(ssh_nvidia(name), users) for name in hosts}
def _host_availability(gpus, util_thres, mem_thres):
    """Summarise one host's gpu tuples as (usable gpu count, free memory)."""
    usable = 0
    mem = 0
    for gpu in gpus:
        if gpu[0] < 0: # Timeout placeholder: the host could not be queried
            return (-1, -1)
        if gpu[0] < mem_thres and gpu[2] < util_thres:
            # Floor division keeps the value an integer on Python 2 and 3.
            mem += (gpu[1] - gpu[0]) // 1000 # available memory
            usable += 1 # available gpu
    return (usable, mem)


def sort_gpu(hosts_lp, hosts_hp=None, util_thres=10, mem_thres=1000):
    """
    Query all hosts and order them by how much gpu capacity is free.

    A gpu counts as available when its occupied memory is below `mem_thres`
    and its utilisation is below `util_thres`. Usage of the module-level
    `users` list is collected along the way.

    @param hosts_lp   low priority host names
    @param hosts_hp   high priority host names (default: none)
    @param util_thres utilisation (percent) below which a gpu is available
    @param mem_thres  occupied memory below which a gpu is available
    @return tuple (hosts, resources, map_avail):
            hosts      high priority hosts followed by low priority hosts,
                       each group sorted by (available gpus, free memory)
                       in descending order
            resources  map hostname -> gpu list as defined by parse_nvidia
            map_avail  map hostname -> (available gpus, free memory);
                       (-1, -1) for a host that could not be queried
    """
    if hosts_hp is None:
        hosts_hp = []

    map_hp = collect_gpu_data(hosts_hp, users)
    map_lp = collect_gpu_data(hosts_lp, users)
    print("All GPU checked. ")
    print("Sorting...")

    map_merge = map_hp.copy()
    map_merge.update(map_lp)

    map_avail = dict()
    for host in map_merge:
        map_avail[host] = _host_availability(map_merge[host],
                                             util_thres, mem_thres)

    hosts_lp_sorted = sorted(hosts_lp, key=lambda host: map_avail[host],
                             reverse=True)
    hosts_hp_sorted = sorted(hosts_hp, key=lambda host: map_avail[host],
                             reverse=True)
    return hosts_hp_sorted + hosts_lp_sorted, map_merge, map_avail
def display(hosts, resources, map_avail, full_gpus=4):
    """ Given list of hosts, map to resources and map to available gpu numbers and total memories, display them in order
    Display format: hostname, available # gpu, total mem of available gpu, user used mem, then mem usage of each gpu is displayed.

    Row colours: green when the number of available gpus equals `full_gpus`,
    blue when the tracked users occupy memory on the host, red when the host
    timed out, uncoloured otherwise.

    @param hosts      host names, in the order they should be printed
    @param resources  map hostname -> gpu list as defined by parse_nvidia
    @param map_avail  map hostname -> (available gpus, free memory)
    @param full_gpus  available-gpu count at which a host is shown as
                      completely free
    """
    print(bcolors.HEADER + "hostname\t#gpus\tava mem\tuser mem gpu-specific mem usage" + bcolors.ENDC)
    for host in hosts:
        gpus = resources[host]
        # An empty gpu list (output without any gpu line) is not a timeout.
        timed_out = bool(gpus) and gpus[0][0] == -1
        user_mem = sum([gpu[3] for gpu in gpus])

        if timed_out:
            out = host + '\t' + "Timeout"
        else:
            usable, free_mem = map_avail[host]
            out = (host + '\t' + str(usable) + '\t' + str(free_mem) + 'G' + '\t'
                   + str(int(math.ceil(user_mem / 1000.))) + 'G\t')
            for gpu in gpus:
                out += str(gpu[0]).rjust(6) + '/' + str(gpu[1]).rjust(6) + '\t'

        if map_avail[host][0] == full_gpus:
            print(bcolors.OKGREEN + out + bcolors.ENDC)
        elif user_mem > 0:
            print(bcolors.OKBLUE + out + bcolors.ENDC)
        elif timed_out:
            print(bcolors.FAIL + out + bcolors.ENDC)
        else:
            print(out)
## test ##
# Run the scan only when executed as a script, so that importing this module
# does not open ssh connections as a side effect.
if __name__ == "__main__":
    # To check the high priority hosts only: sort_gpu([], hosts_hp)
    hosts_sorted, resources, map_avail = sort_gpu(hosts_lp, hosts_hp)
    display(hosts_sorted, resources, map_avail)
|