Spaces:
Running
Running
| # Copyright (c) Microsoft Corporation. | |
| # Licensed under the MIT license. | |
| import subprocess | |
| import time | |
| import traceback | |
| from xml.dom import minidom | |
def collect_gpu_usage(node_id):
    """Run ``nvidia-smi`` and return the GPU metrics parsed from its XML output.

    Args:
        node_id: Accepted for the caller's benefit; this function does not
            read it.

    Returns:
        Whatever ``parse_nvidia_smi_result`` produces for the command output.
        If launching the command or parsing raises, the traceback is printed
        and the result of ``gen_empty_gpu_metric()`` is returned instead.
    """
    try:
        # -q -x asks nvidia-smi for the full query report in XML form.
        raw_xml = subprocess.check_output('nvidia-smi -q -x'.split())
        return parse_nvidia_smi_result(raw_xml)
    except Exception:
        traceback.print_exc()
        return gen_empty_gpu_metric()
def parse_nvidia_smi_result(smi):
    """Turn an ``nvidia-smi -q -x`` XML report into a metrics dictionary.

    Args:
        smi: The XML document, in any form ``minidom.parseString`` accepts.

    Returns:
        A dict with three keys: ``Timestamp`` (current local time formatted by
        ``time.asctime``), ``gpuCount`` (number of ``<gpu>`` elements) and
        ``gpuInfos`` (one dict per ``<gpu>`` element, in document order).
        Each per-GPU dict holds ``index``, ``gpuUtil``, ``gpuMemUtil``,
        ``activeProcessNum``, ``gpuType``, ``gpuMemTotal``, ``gpuMemUsed`` and
        ``gpuMemFree``; all values except ``index`` and ``activeProcessNum``
        are strings taken from the XML.

        If anything raises while parsing (malformed XML, a missing element,
        an empty element), the traceback is printed and ``{}`` is returned.
    """
    def leaf_text(parent, tag):
        # Text of the first descendant <tag> under `parent`.
        return parent.getElementsByTagName(tag)[0].childNodes[0].data

    def without_unit(text, unit):
        # Drop the unit suffix ("%" or "MiB") and surrounding whitespace.
        return text.replace(unit, "").strip()

    try:
        document = minidom.parseString(smi)
        gpu_nodes = document.getElementsByTagName('gpu')
        gpu_infos = []
        report = {
            "Timestamp": time.asctime(time.localtime()),
            "gpuCount": len(gpu_nodes),
            "gpuInfos": gpu_infos,
        }
        for position, gpu_node in enumerate(gpu_nodes):
            utilization = gpu_node.getElementsByTagName('utilization')[0]
            process_section = gpu_node.getElementsByTagName('processes')[0]
            product = leaf_text(gpu_node, 'product_name')
            memory = gpu_node.getElementsByTagName('fb_memory_usage')[0]
            gpu_infos.append({
                'index': position,
                'gpuUtil': without_unit(leaf_text(utilization, 'gpu_util'), "%"),
                'gpuMemUtil': without_unit(leaf_text(utilization, 'memory_util'), "%"),
                'activeProcessNum': len(process_section.getElementsByTagName('process_info')),
                'gpuType': product,
                'gpuMemTotal': without_unit(leaf_text(memory, 'total'), "MiB"),
                'gpuMemUsed': without_unit(leaf_text(memory, 'used'), "MiB"),
                'gpuMemFree': without_unit(leaf_text(memory, 'free'), "MiB"),
            })
        return report
    except Exception:
        # Best effort: any parse problem discards the partial result.
        traceback.print_exc()
        return {}
def gen_empty_gpu_metric():
    """Build a placeholder metrics dictionary that reports zero GPUs.

    Returns:
        A dict with ``Timestamp`` (current local time formatted by
        ``time.asctime``), ``gpuCount`` set to 0 and ``gpuInfos`` set to a
        new empty list. If building it raises, the traceback is printed and
        ``{}`` is returned.
    """
    try:
        return {
            "Timestamp": time.asctime(time.localtime()),
            "gpuCount": 0,
            "gpuInfos": [],
        }
    except Exception:
        traceback.print_exc()
        return {}