| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| from pynvml import * |
| import datetime |
|
|
| |
| |
| |
| def GetEccByType(handle, counterType, errorType): |
| strResult = '' |
| |
| try: |
| deviceMemory = nvmlDeviceGetMemoryErrorCounter(handle, errorType, counterType, |
| NVML_MEMORY_LOCATION_DEVICE_MEMORY) |
| except NVMLError as err: |
| deviceMemory = handleError(err) |
| strResult += ' <device_memory>' + str(deviceMemory) + '</device_memory>\n' |
| |
| try: |
| registerFile = nvmlDeviceGetMemoryErrorCounter(handle, errorType, counterType, |
| NVML_MEMORY_LOCATION_REGISTER_FILE) |
| except NVMLError as err: |
| registerFile = handleError(err) |
| |
| strResult += ' <register_file>' + str(registerFile) + '</register_file>\n' |
| |
| try: |
| l1Cache = nvmlDeviceGetMemoryErrorCounter(handle, errorType, counterType, |
| NVML_MEMORY_LOCATION_L1_CACHE) |
| except NVMLError as err: |
| l1Cache = handleError(err) |
| strResult += ' <l1_cache>' + str(l1Cache) + '</l1_cache>\n' |
| |
| try: |
| l2Cache = nvmlDeviceGetMemoryErrorCounter(handle, errorType, counterType, |
| NVML_MEMORY_LOCATION_L2_CACHE) |
| except NVMLError as err: |
| l2Cache = handleError(err) |
| strResult += ' <l2_cache>' + str(l2Cache) + '</l2_cache>\n' |
| |
| try: |
| textureMemory = nvmlDeviceGetMemoryErrorCounter(handle, errorType, counterType, |
| NVML_MEMORY_LOCATION_TEXTURE_MEMORY) |
| except NVMLError as err: |
| textureMemory = handleError(err) |
| strResult += ' <texture_memory>' + str(textureMemory) + '</texture_memory>\n' |
| |
| try: |
| count = str(nvmlDeviceGetTotalEccErrors(handle, errorType, counterType)) |
| except NVMLError as err: |
| count = handleError(err) |
| strResult += ' <total>' + count + '</total>\n' |
| |
| return strResult |
|
|
| def GetEccByCounter(handle, counterType): |
| strResult = '' |
| strResult += ' <single_bit>\n' |
| strResult += str(GetEccByType(handle, counterType, NVML_MEMORY_ERROR_TYPE_CORRECTED)) |
| strResult += ' </single_bit>\n' |
| strResult += ' <double_bit>\n' |
| strResult += str(GetEccByType(handle, counterType, NVML_MEMORY_ERROR_TYPE_UNCORRECTED)) |
| strResult += ' </double_bit>\n' |
| return strResult |
|
|
| def GetEccStr(handle): |
| strResult = '' |
| strResult += ' <volatile>\n' |
| strResult += str(GetEccByCounter(handle, NVML_VOLATILE_ECC)) |
| strResult += ' </volatile>\n' |
| strResult += ' <aggregate>\n' |
| strResult += str(GetEccByCounter(handle, NVML_AGGREGATE_ECC)) |
| strResult += ' </aggregate>\n' |
| return strResult |
|
|
| def GetRetiredPagesByCause(handle, cause): |
| strResult = '' |
| try: |
| pages = nvmlDeviceGetRetiredPages(handle, cause) |
| count = str(len(pages)) |
| except NVMLError as err: |
| error = handleError(err) |
| pages = None |
| count = error |
| strResult += ' <retired_count>' + count + '</retired_count>\n' |
| if pages is not None: |
| strResult += ' <retired_page_addresses>\n' |
| for page in pages: |
| strResult += ' <retired_page_address>' + "0x%016x" % page + '</retired_page_address>\n' |
| strResult += ' </retired_page_addresses>\n' |
| else: |
| strResult += ' <retired_page_addresses>' + error + '</retired_page_addresses>\n' |
| return strResult |
|
|
| def GetRetiredPagesStr(handle): |
| strResult = '' |
| causes = [ "multiple_single_bit_retirement", "double_bit_retirement" ] |
| for idx in range(NVML_PAGE_RETIREMENT_CAUSE_COUNT): |
| strResult += ' <' + causes[idx] + '>\n' |
| strResult += GetRetiredPagesByCause(handle, idx) |
| strResult += ' </' + causes[idx] + '>\n' |
|
|
| strResult += ' <pending_retirement>' |
| try: |
| if NVML_FEATURE_DISABLED == nvmlDeviceGetRetiredPagesPendingStatus(handle): |
| strResult += "No" |
| else: |
| strResult += "Yes" |
| except NVMLError as err: |
| strResult += handleError(err) |
| strResult += '</pending_retirement>\n' |
| return strResult |
| |
| def StrGOM(mode): |
| if mode == NVML_GOM_ALL_ON: |
| return "All On"; |
| elif mode == NVML_GOM_COMPUTE: |
| return "Compute"; |
| elif mode == NVML_GOM_LOW_DP: |
| return "Low Double Precision"; |
| else: |
| return "Unknown"; |
|
|
| def GetClocksThrottleReasons(handle): |
| throttleReasons = [ |
| [nvmlClocksThrottleReasonGpuIdle, "clocks_throttle_reason_gpu_idle"], |
| [nvmlClocksThrottleReasonUserDefinedClocks, "clocks_throttle_reason_user_defined_clocks"], |
| [nvmlClocksThrottleReasonApplicationsClocksSetting, "clocks_throttle_reason_applications_clocks_setting"], |
| [nvmlClocksThrottleReasonSwPowerCap, "clocks_throttle_reason_sw_power_cap"], |
| [nvmlClocksThrottleReasonHwSlowdown, "clocks_throttle_reason_hw_slowdown"], |
| [nvmlClocksThrottleReasonUnknown, "clocks_throttle_reason_unknown"] |
| ]; |
|
|
| strResult = '' |
|
|
| try: |
| supportedClocksThrottleReasons = nvmlDeviceGetSupportedClocksThrottleReasons(handle); |
| clocksThrottleReasons = nvmlDeviceGetCurrentClocksThrottleReasons(handle); |
| strResult += ' <clocks_throttle_reasons>\n' |
| for (mask, name) in throttleReasons: |
| if (name != "clocks_throttle_reason_user_defined_clocks"): |
| if (mask & supportedClocksThrottleReasons): |
| val = "Active" if mask & clocksThrottleReasons else "Not Active"; |
| else: |
| val = handleError(NVML_ERROR_NOT_SUPPORTED); |
| strResult += " <%s>%s</%s>\n" % (name, val, name); |
| strResult += ' </clocks_throttle_reasons>\n' |
| except NVMLError as err: |
| strResult += ' <clocks_throttle_reasons>%s</clocks_throttle_reasons>\n' % (handleError(err)); |
|
|
| return strResult; |
| |
| |
| |
| |
| def handleError(err): |
| if (err.value == NVML_ERROR_NOT_SUPPORTED): |
| return "N/A" |
| else: |
| return err.__str__() |
|
|
| |
| def XmlDeviceQuery(): |
|
|
| strResult = '' |
| try: |
| |
| |
| |
| nvmlInit() |
|
|
| strResult += '<?xml version="1.0" ?>\n' |
| strResult += '<!DOCTYPE nvidia_smi_log SYSTEM "nvsmi_device_v4.dtd">\n' |
| strResult += '<nvidia_smi_log>\n' |
|
|
| strResult += ' <timestamp>' + str(datetime.date.today()) + '</timestamp>\n' |
| strResult += ' <driver_version>' + str(nvmlSystemGetDriverVersion()) + '</driver_version>\n' |
|
|
| deviceCount = nvmlDeviceGetCount() |
| strResult += ' <attached_gpus>' + str(deviceCount) + '</attached_gpus>\n' |
|
|
| for i in range(0, deviceCount): |
| handle = nvmlDeviceGetHandleByIndex(i) |
| |
| pciInfo = nvmlDeviceGetPciInfo(handle) |
| |
| strResult += ' <gpu id="%s">\n' % pciInfo.busId |
| |
| strResult += ' <product_name>' + nvmlDeviceGetName(handle) + '</product_name>\n' |
| |
| brandNames = {NVML_BRAND_UNKNOWN : "Unknown", |
| NVML_BRAND_QUADRO : "Quadro", |
| NVML_BRAND_TESLA : "Tesla", |
| NVML_BRAND_NVS : "NVS", |
| NVML_BRAND_GRID : "Grid", |
| NVML_BRAND_GEFORCE : "GeForce", |
| } |
|
|
| try: |
| |
| brandName = brandNames[nvmlDeviceGetBrand(handle)] |
| except NVMLError as err: |
| brandName = handleError(err) |
|
|
|
|
| strResult += ' <product_brand>' + brandName + '</product_brand>\n' |
| |
| try: |
| state = ('Enabled' if (nvmlDeviceGetDisplayMode(handle) != 0) else 'Disabled') |
| except NVMLError as err: |
| state = handleError(err) |
| |
| strResult += ' <display_mode>' + state + '</display_mode>\n' |
| |
| try: |
| state = ('Enabled' if (nvmlDeviceGetDisplayActive(handle) != 0) else 'Disabled') |
| except NVMLError as err: |
| state = handleError(err) |
| |
| strResult += ' <display_active>' + state + '</display_active>\n' |
| |
| try: |
| mode = 'Enabled' if (nvmlDeviceGetPersistenceMode(handle) != 0) else 'Disabled' |
| except NVMLError as err: |
| mode = handleError(err) |
| |
| strResult += ' <persistence_mode>' + mode + '</persistence_mode>\n' |
| |
| try: |
| mode = 'Enabled' if (nvmlDeviceGetAccountingMode(handle) != 0) else 'Disabled' |
| except NVMLError as err: |
| mode = handleError(err) |
| |
| strResult += ' <accounting_mode>' + mode + '</accounting_mode>\n' |
| |
| try: |
| bufferSize = str(nvmlDeviceGetAccountingBufferSize(handle)) |
| except NVMLError as err: |
| bufferSize = handleError(err) |
| |
| strResult += ' <accounting_mode_buffer_size>' + bufferSize + '</accounting_mode_buffer_size>\n' |
| |
| strResult += ' <driver_model>\n' |
|
|
| try: |
| current = 'WDDM' if (nvmlDeviceGetCurrentDriverModel(handle) == NVML_DRIVER_WDDM) else 'TCC' |
| except NVMLError as err: |
| current = handleError(err) |
| strResult += ' <current_dm>' + current + '</current_dm>\n' |
|
|
| try: |
| pending = 'WDDM' if (nvmlDeviceGetPendingDriverModel(handle) == NVML_DRIVER_WDDM) else 'TCC' |
| except NVMLError as err: |
| pending = handleError(err) |
|
|
| strResult += ' <pending_dm>' + pending + '</pending_dm>\n' |
|
|
| strResult += ' </driver_model>\n' |
|
|
| try: |
| serial = nvmlDeviceGetSerial(handle) |
| except NVMLError as err: |
| serial = handleError(err) |
|
|
| strResult += ' <serial>' + serial + '</serial>\n' |
|
|
| try: |
| uuid = nvmlDeviceGetUUID(handle) |
| except NVMLError as err: |
| uuid = handleError(err) |
|
|
| strResult += ' <uuid>' + uuid + '</uuid>\n' |
| |
| try: |
| minor_number = nvmlDeviceGetMinorNumber(handle) |
| except NVMLError as err: |
| minor_number = handleError(err) |
|
|
| strResult += ' <minor_number>' + str(minor_number) + '</minor_number>\n' |
| |
| try: |
| vbios = nvmlDeviceGetVbiosVersion(handle) |
| except NVMLError as err: |
| vbios = handleError(err) |
|
|
| strResult += ' <vbios_version>' + vbios + '</vbios_version>\n' |
|
|
| try: |
| multiGpuBool = nvmlDeviceGetMultiGpuBoard(handle) |
| except NVMLError as err: |
| multiGpuBool = handleError(err); |
|
|
| if multiGpuBool == "N/A": |
| strResult += ' <multigpu_board>' + 'N/A' + '</multigpu_board>\n' |
| elif multiGpuBool: |
| strResult += ' <multigpu_board>' + 'Yes' + '</multigpu_board>\n' |
| else: |
| strResult += ' <multigpu_board>' + 'No' + '</multigpu_board>\n' |
|
|
| try: |
| boardId = nvmlDeviceGetBoardId(handle) |
| except NVMLError as err: |
| boardId = handleError(err) |
|
|
| try: |
| hexBID = "0x%x" % boardId |
| except: |
| hexBID = boardId |
|
|
| strResult += ' <board_id>' + hexBID + '</board_id>\n' |
|
|
| strResult += ' <inforom_version>\n' |
| |
| try: |
| img = nvmlDeviceGetInforomImageVersion(handle) |
| except NVMLError as err: |
| img = handleError(err) |
| |
| strResult += ' <img_version>' + img + '</img_version>\n' |
|
|
| try: |
| oem = nvmlDeviceGetInforomVersion(handle, NVML_INFOROM_OEM) |
| except NVMLError as err: |
| oem = handleError(err) |
| |
| strResult += ' <oem_object>' + oem + '</oem_object>\n' |
| |
| try: |
| ecc = nvmlDeviceGetInforomVersion(handle, NVML_INFOROM_ECC) |
| except NVMLError as err: |
| ecc = handleError(err) |
| |
| strResult += ' <ecc_object>' + ecc + '</ecc_object>\n' |
|
|
| try: |
| pwr = nvmlDeviceGetInforomVersion(handle, NVML_INFOROM_POWER) |
| except NVMLError as err: |
| pwr = handleError(err) |
| |
| strResult += ' <pwr_object>' + pwr + '</pwr_object>\n' |
| |
| strResult += ' </inforom_version>\n' |
|
|
| strResult += ' <gpu_operation_mode>\n' |
|
|
| try: |
| current = StrGOM(nvmlDeviceGetCurrentGpuOperationMode(handle)) |
| except NVMLError as err: |
| current = handleError(err) |
| strResult += ' <current_gom>' + current + '</current_gom>\n' |
|
|
| try: |
| pending = StrGOM(nvmlDeviceGetPendingGpuOperationMode(handle)) |
| except NVMLError as err: |
| pending = handleError(err) |
|
|
| strResult += ' <pending_gom>' + pending + '</pending_gom>\n' |
|
|
| strResult += ' </gpu_operation_mode>\n' |
|
|
| strResult += ' <pci>\n' |
| strResult += ' <pci_bus>%02X</pci_bus>\n' % pciInfo.bus |
| strResult += ' <pci_device>%02X</pci_device>\n' % pciInfo.device |
| strResult += ' <pci_domain>%04X</pci_domain>\n' % pciInfo.domain |
| strResult += ' <pci_device_id>%08X</pci_device_id>\n' % (pciInfo.pciDeviceId) |
| strResult += ' <pci_bus_id>' + str(pciInfo.busId) + '</pci_bus_id>\n' |
| strResult += ' <pci_sub_system_id>%08X</pci_sub_system_id>\n' % (pciInfo.pciSubSystemId) |
| strResult += ' <pci_gpu_link_info>\n' |
|
|
|
|
| strResult += ' <pcie_gen>\n' |
|
|
| try: |
| gen = str(nvmlDeviceGetMaxPcieLinkGeneration(handle)) |
| except NVMLError as err: |
| gen = handleError(err) |
|
|
| strResult += ' <max_link_gen>' + gen + '</max_link_gen>\n' |
|
|
| try: |
| gen = str(nvmlDeviceGetCurrPcieLinkGeneration(handle)) |
| except NVMLError as err: |
| gen = handleError(err) |
|
|
| strResult += ' <current_link_gen>' + gen + '</current_link_gen>\n' |
| strResult += ' </pcie_gen>\n' |
| strResult += ' <link_widths>\n' |
|
|
| try: |
| width = str(nvmlDeviceGetMaxPcieLinkWidth(handle)) + 'x' |
| except NVMLError as err: |
| width = handleError(err) |
|
|
| strResult += ' <max_link_width>' + width + '</max_link_width>\n' |
|
|
| try: |
| width = str(nvmlDeviceGetCurrPcieLinkWidth(handle)) + 'x' |
| except NVMLError as err: |
| width = handleError(err) |
|
|
| strResult += ' <current_link_width>' + width + '</current_link_width>\n' |
|
|
| strResult += ' </link_widths>\n' |
| strResult += ' </pci_gpu_link_info>\n' |
|
|
|
|
| strResult += ' <pci_bridge_chip>\n' |
|
|
| try: |
| bridgeHierarchy = nvmlDeviceGetBridgeChipInfo(handle) |
| bridge_type = '' |
| if bridgeHierarchy.bridgeChipInfo[0].type == 0: |
| bridge_type += 'PLX' |
| else: |
| bridge_type += 'BR04' |
| strResult += ' <bridge_chip_type>' + bridge_type + '</bridge_chip_type>\n' |
|
|
| if bridgeHierarchy.bridgeChipInfo[0].fwVersion == 0: |
| strFwVersion = 'N/A' |
| else: |
| strFwVersion = '%08X' % (bridgeHierarchy.bridgeChipInfo[0].fwVersion) |
| strResult += ' <bridge_chip_fw>%s</bridge_chip_fw>\n' % (strFwVersion) |
| except NVMLError as err: |
| strResult += ' <bridge_chip_type>' + handleError(err) + '</bridge_chip_type>\n' |
| strResult += ' <bridge_chip_fw>' + handleError(err) + '</bridge_chip_fw>\n' |
|
|
| |
| strResult += ' </pci_bridge_chip>\n' |
|
|
| try: |
| replay = nvmlDeviceGetPcieReplayCounter(handle) |
| strResult += ' <replay_counter>' + str(replay) + '</replay_counter>' |
| except NVMLError as err: |
| strResult += ' <replay_counter>' + handleError(err) + '</replay_counter>' |
|
|
| try: |
| tx_bytes = nvmlDeviceGetPcieThroughput(handle, NVML_PCIE_UTIL_TX_BYTES) |
| strResult += ' <tx_util>' + str(tx_bytes) + ' KB/s' + '</tx_util>' |
| except NVMLError as err: |
| strResult += ' <tx_util>' + handleError(err) + '</tx_util>' |
|
|
| try: |
| rx_bytes = nvmlDeviceGetPcieThroughput(handle, NVML_PCIE_UTIL_RX_BYTES) |
| strResult += ' <rx_util>' + str(rx_bytes) + ' KB/s' + '</rx_util>' |
| except NVMLError as err: |
| strResult += ' <rx_util>' + handleError(err) + '</rx_util>' |
|
|
|
|
| strResult += ' </pci>\n' |
|
|
| try: |
| fan = str(nvmlDeviceGetFanSpeed(handle)) + ' %' |
| except NVMLError as err: |
| fan = handleError(err) |
| strResult += ' <fan_speed>' + fan + '</fan_speed>\n' |
|
|
| try: |
| perfState = nvmlDeviceGetPowerState(handle) |
| perfStateStr = 'P%s' % perfState |
| except NVMLError as err: |
| perfStateStr = handleError(err) |
| strResult += ' <performance_state>' + perfStateStr + '</performance_state>\n' |
|
|
| strResult += GetClocksThrottleReasons(handle); |
|
|
| try: |
| memInfo = nvmlDeviceGetMemoryInfo(handle) |
| mem_total = str(memInfo.total / 1024 / 1024) + ' MiB' |
| mem_used = str(memInfo.used / 1024 / 1024) + ' MiB' |
| mem_free = str(memInfo.total / 1024 / 1024 - memInfo.used / 1024 / 1024) + ' MiB' |
| except NVMLError as err: |
| error = handleError(err) |
| mem_total = error |
| mem_used = error |
| mem_free = error |
|
|
| strResult += ' <fb_memory_usage>\n' |
| strResult += ' <total>' + mem_total + '</total>\n' |
| strResult += ' <used>' + mem_used + '</used>\n' |
| strResult += ' <free>' + mem_free + '</free>\n' |
| strResult += ' </fb_memory_usage>\n' |
|
|
| try: |
| memInfo = nvmlDeviceGetBAR1MemoryInfo(handle) |
| mem_total = str(memInfo.bar1Total / 1024 / 1024) + ' MiB' |
| mem_used = str(memInfo.bar1Used / 1024 / 1024) + ' MiB' |
| mem_free = str(memInfo.bar1Total / 1024 / 1024 - memInfo.bar1Used / 1024 / 1024) + ' MiB' |
| except NVMLError as err: |
| error = handleError(err) |
| mem_total = error |
| mem_used = error |
| mem_free = error |
|
|
| strResult += ' <bar1_memory_usage>\n' |
| strResult += ' <total>' + mem_total + '</total>\n' |
| strResult += ' <used>' + mem_used + '</used>\n' |
| strResult += ' <free>' + mem_free + '</free>\n' |
| strResult += ' </bar1_memory_usage>\n' |
| |
| try: |
| mode = nvmlDeviceGetComputeMode(handle) |
| if mode == NVML_COMPUTEMODE_DEFAULT: |
| modeStr = 'Default' |
| elif mode == NVML_COMPUTEMODE_EXCLUSIVE_THREAD: |
| modeStr = 'Exclusive Thread' |
| elif mode == NVML_COMPUTEMODE_PROHIBITED: |
| modeStr = 'Prohibited' |
| elif mode == NVML_COMPUTEMODE_EXCLUSIVE_PROCESS: |
| modeStr = 'Exclusive_Process' |
| else: |
| modeStr = 'Unknown' |
| except NVMLError as err: |
| modeStr = handleError(err) |
|
|
| strResult += ' <compute_mode>' + modeStr + '</compute_mode>\n' |
|
|
| try: |
| util = nvmlDeviceGetUtilizationRates(handle) |
| gpu_util = str(util.gpu) + ' %' |
| mem_util = str(util.memory) + ' %' |
| except NVMLError as err: |
| error = handleError(err) |
| gpu_util = error |
| mem_util = error |
| |
| strResult += ' <utilization>\n' |
| strResult += ' <gpu_util>' + gpu_util + '</gpu_util>\n' |
| strResult += ' <memory_util>' + mem_util + '</memory_util>\n' |
|
|
| try: |
| (util_int, ssize) = nvmlDeviceGetEncoderUtilization(handle) |
| encoder_util = str(util_int) + ' %' |
| except NVMLError as err: |
| error = handleError(err) |
| encoder_util = error |
|
|
| strResult += ' <encoder_util>' + encoder_util + '</encoder_util>\n' |
|
|
| try: |
| (util_int, ssize) = nvmlDeviceGetDecoderUtilization(handle) |
| decoder_util = str(util_int) + ' %' |
| except NVMLError as err: |
| error = handleError(err) |
| decoder_util = error |
|
|
| strResult += ' <decoder_util>' + decoder_util + '</decoder_util>\n' |
|
|
| strResult += ' </utilization>\n' |
| |
| try: |
| (current, pending) = nvmlDeviceGetEccMode(handle) |
| curr_str = 'Enabled' if (current != 0) else 'Disabled' |
| pend_str = 'Enabled' if (pending != 0) else 'Disabled' |
| except NVMLError as err: |
| error = handleError(err) |
| curr_str = error |
| pend_str = error |
|
|
| strResult += ' <ecc_mode>\n' |
| strResult += ' <current_ecc>' + curr_str + '</current_ecc>\n' |
| strResult += ' <pending_ecc>' + pend_str + '</pending_ecc>\n' |
| strResult += ' </ecc_mode>\n' |
|
|
| strResult += ' <ecc_errors>\n' |
| strResult += GetEccStr(handle) |
| strResult += ' </ecc_errors>\n' |
|
|
| strResult += ' <retired_pages>\n' |
| strResult += GetRetiredPagesStr(handle) |
| strResult += ' </retired_pages>\n' |
| |
| try: |
| temp = str(nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU)) + ' C' |
| except NVMLError as err: |
| temp = handleError(err) |
|
|
| strResult += ' <temperature>\n' |
| strResult += ' <gpu_temp>' + temp + '</gpu_temp>\n' |
|
|
| try: |
| temp = str(nvmlDeviceGetTemperatureThreshold(handle, NVML_TEMPERATURE_THRESHOLD_SHUTDOWN)) + ' C' |
| except NVMLError as err: |
| temp = handleError(err) |
|
|
| strResult += ' <gpu_temp_max_threshold>' + temp + '</gpu_temp_max_threshold>\n' |
|
|
| try: |
| temp = str(nvmlDeviceGetTemperatureThreshold(handle, NVML_TEMPERATURE_THRESHOLD_SLOWDOWN)) + ' C' |
| except NVMLError as err: |
| temp = handleError(err) |
|
|
| strResult += ' <gpu_temp_slow_threshold>' + temp + '</gpu_temp_slow_threshold>\n' |
| strResult += ' </temperature>\n' |
|
|
| strResult += ' <power_readings>\n' |
| try: |
| perfState = 'P' + str(nvmlDeviceGetPowerState(handle)) |
| except NVMLError as err: |
| perfState = handleError(err) |
| strResult += ' <power_state>%s</power_state>\n' % perfState |
| try: |
| powMan = nvmlDeviceGetPowerManagementMode(handle) |
| powManStr = 'Supported' if powMan != 0 else 'N/A' |
| except NVMLError as err: |
| powManStr = handleError(err) |
| strResult += ' <power_management>' + powManStr + '</power_management>\n' |
| try: |
| powDraw = (nvmlDeviceGetPowerUsage(handle) / 1000.0) |
| powDrawStr = '%.2f W' % powDraw |
| except NVMLError as err: |
| powDrawStr = handleError(err) |
| strResult += ' <power_draw>' + powDrawStr + '</power_draw>\n' |
| try: |
| powLimit = (nvmlDeviceGetPowerManagementLimit(handle) / 1000.0) |
| powLimitStr = '%.2f W' % powLimit |
| except NVMLError as err: |
| powLimitStr = handleError(err) |
| strResult += ' <power_limit>' + powLimitStr + '</power_limit>\n' |
| try: |
| powLimit = (nvmlDeviceGetPowerManagementDefaultLimit(handle) / 1000.0) |
| powLimitStr = '%.2f W' % powLimit |
| except NVMLError as err: |
| powLimitStr = handleError(err) |
| strResult += ' <default_power_limit>' + powLimitStr + '</default_power_limit>\n' |
|
|
| try: |
| enforcedPowLimit = (nvmlDeviceGetEnforcedPowerLimit(handle) / 1000.0) |
| enforcedPowLimitStr = '%.2f W' % enforcedPowLimit |
| except NVMLError as err: |
| enforcedPowLimitStr = handleError(err) |
|
|
| strResult += ' <enforced_power_limit>' + enforcedPowLimitStr + '</enforced_power_limit>\n' |
|
|
| try: |
| powLimit = nvmlDeviceGetPowerManagementLimitConstraints(handle) |
| powLimitStrMin = '%.2f W' % (powLimit[0] / 1000.0) |
| powLimitStrMax = '%.2f W' % (powLimit[1] / 1000.0) |
| except NVMLError as err: |
| error = handleError(err) |
| powLimitStrMin = error |
| powLimitStrMax = error |
| strResult += ' <min_power_limit>' + powLimitStrMin + '</min_power_limit>\n' |
| strResult += ' <max_power_limit>' + powLimitStrMax + '</max_power_limit>\n' |
|
|
| strResult += ' </power_readings>\n' |
|
|
| strResult += ' <clocks>\n' |
| try: |
| graphics = str(nvmlDeviceGetClockInfo(handle, NVML_CLOCK_GRAPHICS)) + ' MHz' |
| except NVMLError as err: |
| graphics = handleError(err) |
| strResult += ' <graphics_clock>' +graphics + '</graphics_clock>\n' |
| try: |
| sm = str(nvmlDeviceGetClockInfo(handle, NVML_CLOCK_SM)) + ' MHz' |
| except NVMLError as err: |
| sm = handleError(err) |
| strResult += ' <sm_clock>' + sm + '</sm_clock>\n' |
| try: |
| mem = str(nvmlDeviceGetClockInfo(handle, NVML_CLOCK_MEM)) + ' MHz' |
| except NVMLError as err: |
| mem = handleError(err) |
| strResult += ' <mem_clock>' + mem + '</mem_clock>\n' |
| strResult += ' </clocks>\n' |
|
|
| strResult += ' <applications_clocks>\n' |
| try: |
| graphics = str(nvmlDeviceGetApplicationsClock(handle, NVML_CLOCK_GRAPHICS)) + ' MHz' |
| except NVMLError as err: |
| graphics = handleError(err) |
| strResult += ' <graphics_clock>' +graphics + '</graphics_clock>\n' |
| try: |
| mem = str(nvmlDeviceGetApplicationsClock(handle, NVML_CLOCK_MEM)) + ' MHz' |
| except NVMLError as err: |
| mem = handleError(err) |
| strResult += ' <mem_clock>' + mem + '</mem_clock>\n' |
| strResult += ' </applications_clocks>\n' |
| |
| strResult += ' <default_applications_clocks>\n' |
| try: |
| graphics = str(nvmlDeviceGetDefaultApplicationsClock(handle, NVML_CLOCK_GRAPHICS)) + ' MHz' |
| except NVMLError as err: |
| graphics = handleError(err) |
| strResult += ' <graphics_clock>' +graphics + '</graphics_clock>\n' |
| try: |
| mem = str(nvmlDeviceGetDefaultApplicationsClock(handle, NVML_CLOCK_MEM)) + ' MHz' |
| except NVMLError as err: |
| mem = handleError(err) |
| strResult += ' <mem_clock>' + mem + '</mem_clock>\n' |
| strResult += ' </default_applications_clocks>\n' |
|
|
| strResult += ' <max_clocks>\n' |
| try: |
| graphics = str(nvmlDeviceGetMaxClockInfo(handle, NVML_CLOCK_GRAPHICS)) + ' MHz' |
| except NVMLError as err: |
| graphics = handleError(err) |
| strResult += ' <graphics_clock>' + graphics + '</graphics_clock>\n' |
| try: |
| sm = str(nvmlDeviceGetMaxClockInfo(handle, NVML_CLOCK_SM)) + ' MHz' |
| except NVMLError as err: |
| sm = handleError(err) |
| strResult += ' <sm_clock>' + sm + '</sm_clock>\n' |
| try: |
| mem = str(nvmlDeviceGetMaxClockInfo(handle, NVML_CLOCK_MEM)) + ' MHz' |
| except NVMLError as err: |
| mem = handleError(err) |
| strResult += ' <mem_clock>' + mem + '</mem_clock>\n' |
| strResult += ' </max_clocks>\n' |
| |
| strResult += ' <clock_policy>\n' |
| try: |
| boostedState, boostedDefaultState = nvmlDeviceGetAutoBoostedClocksEnabled(handle) |
| if boostedState == NVML_FEATURE_DISABLED: |
| autoBoostStr = "Off" |
| else: |
| autoBoostStr = "On" |
| |
| if boostedDefaultState == NVML_FEATURE_DISABLED: |
| autoBoostDefaultStr = "Off" |
| else: |
| autoBoostDefaultStr = "On" |
| |
| except NVMLError_NotSupported: |
| autoBoostStr = "N/A" |
| autoBoostDefaultStr = "N/A" |
| except NVMLError as err: |
| autoBoostStr = handleError(err) |
| autoBoostDefaultStr = handleError(err) |
| pass |
| strResult += ' <auto_boost>' + autoBoostStr + '</auto_boost>\n' |
| strResult += ' <auto_boost_default>' + autoBoostDefaultStr + '</auto_boost_default>\n' |
| strResult += ' </clock_policy>\n' |
|
|
| try: |
| memClocks = nvmlDeviceGetSupportedMemoryClocks(handle) |
| strResult += ' <supported_clocks>\n' |
|
|
| for m in memClocks: |
| strResult += ' <supported_mem_clock>\n' |
| strResult += ' <value>%d MHz</value>\n' % m |
| try: |
| clocks = nvmlDeviceGetSupportedGraphicsClocks(handle, m) |
| for c in clocks: |
| strResult += ' <supported_graphics_clock>%d MHz</supported_graphics_clock>\n' % c |
| except NVMLError as err: |
| strResult += ' <supported_graphics_clock>%s</supported_graphics_clock>\n' % handleError(err) |
| strResult += ' </supported_mem_clock>\n' |
|
|
| strResult += ' </supported_clocks>\n' |
| except NVMLError as err: |
| strResult += ' <supported_clocks>' + handleError(err) + '</supported_clocks>\n' |
|
|
| try: |
| procs = nvmlDeviceGetComputeRunningProcesses(handle) |
| strResult += ' <processes>\n' |
| |
| for p in procs: |
| try: |
| name = str(nvmlSystemGetProcessName(p.pid)) |
| except NVMLError as err: |
| if (err.value == NVML_ERROR_NOT_FOUND): |
| |
| continue |
| else: |
| name = handleError(err) |
| |
| strResult += ' <process_info>\n' |
| strResult += ' <pid>%d</pid>\n' % p.pid |
| strResult += ' <process_name>' + name + '</process_name>\n' |
|
|
| if (p.usedGpuMemory == None): |
| mem = 'N\A' |
| else: |
| mem = '%d MiB' % (p.usedGpuMemory / 1024 / 1024) |
| strResult += ' <used_memory>' + mem + '</used_memory>\n' |
| strResult += ' </process_info>\n' |
| |
| strResult += ' </processes>\n' |
| except NVMLError as err: |
| strResult += ' <processes>' + handleError(err) + '</processes>\n' |
| |
|
|
| try: |
| pids = nvmlDeviceGetAccountingPids(handle) |
| strResult += ' <accounted_processes>\n' |
| |
| for pid in pids : |
| try: |
| stats = nvmlDeviceGetAccountingStats(handle, pid) |
| gpuUtilization = "%d %%" % stats.gpuUtilization |
| memoryUtilization = "%d %%" % stats.memoryUtilization |
| if (stats.maxMemoryUsage == None): |
| maxMemoryUsage = 'N\A' |
| else: |
| maxMemoryUsage = '%d MiB' % (stats.maxMemoryUsage / 1024 / 1024) |
| time = "%d ms" % stats.time |
| is_running = "%d" % stats.isRunning |
| except NVMLError as err: |
| if (err.value == NVML_ERROR_NOT_FOUND): |
| |
| continue |
| err = handleError(err) |
| gpuUtilization = err |
| memoryUtilization = err |
| maxMemoryUsage = err |
| time = err |
| is_running = err |
| |
| strResult += ' <accounted_process_info>\n' |
| strResult += ' <pid>%d</pid>\n' % pid |
| strResult += ' <gpu_util>' + gpuUtilization + '</gpu_util>\n' |
| strResult += ' <memory_util>' + memoryUtilization + '</memory_util>\n' |
| strResult += ' <max_memory_usage>' + maxMemoryUsage+ '</max_memory_usage>\n' |
| strResult += ' <time>' + time + '</time>\n' |
| strResult += ' <is_running>' + is_running + '</is_running>\n' |
| strResult += ' </accounted_process_info>\n' |
| |
| strResult += ' </accounted_processes>\n' |
| except NVMLError as err: |
| strResult += ' <accounted_processes>' + handleError(err) + '</accounted_processes>\n' |
|
|
| strResult += ' </gpu>\n' |
| |
| strResult += '</nvidia_smi_log>\n' |
| |
| except NVMLError as err: |
| strResult += 'nvidia_smi.py: ' + err.__str__() + '\n' |
| |
| nvmlShutdown() |
| |
| return strResult |
|
|
| |
| if __name__ == "__main__": |
| print(XmlDeviceQuery()) |
|
|