| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | #pragma once |
| |
|
| | #include <string> |
| |
|
| | |
| | #include <stdlib.h> |
| | #include <cstdlib> |
| | #include <initializer_list> |
| | #include "include/ck/tensor_operation/gpu/device/tensor_layout.hpp" |
| | #include "include/ck/utility/print.hpp" |
| | #include "library/include/ck/library/utility/device_memory.hpp" |
| | #include "library/include/ck/library/utility/host_tensor.hpp" |
| | #include "library/include/ck/library/utility/host_tensor_generator.hpp" |
| |
|
| | namespace ait { |
| |
|
| | inline thread_local bool target_has_graph_mode = false; |
| |
|
| | using DeviceError = hipError_t; |
| | using DevicePropertyType = hipDeviceProp_t; |
| | using StreamType = hipStream_t; |
| | using EventType = hipEvent_t; |
| | using GraphType = hipGraph_t; |
| | using GraphExecType = hipGraphExec_t; |
| | using Handle = void*; |
| |
|
| | inline DeviceError GetDevice(int* device_idx) { |
| | return hipGetDevice(device_idx); |
| | } |
| |
|
| | inline DeviceError GetDeviceProperties( |
| | DevicePropertyType* prop, |
| | int device_idx) { |
| | return hipGetDeviceProperties(prop, device_idx); |
| | } |
| |
|
| | inline std::string PrintArchFeatureFlags(const hipDeviceArch_t& arch) { |
| | std::ostringstream oss; |
| | oss << "\n Has 32-bit integer atomics for global memory: " |
| | << (arch.hasGlobalInt32Atomics ? "yes" : "no") |
| | << "\n Has 32-bit float atomic exch for global memory: " |
| | << (arch.hasGlobalFloatAtomicExch ? "yes" : "no") |
| | << "\n Has 32-bit integer atomics for shared memory: " |
| | << (arch.hasSharedInt32Atomics ? "yes" : "no") |
| | << "\n Has 32-bit float atomic exch for shared memory: " |
| | << (arch.hasSharedFloatAtomicExch ? "yes" : "no" |
| | << "\n Has 32-bit float atomic add in global and shared memory: " |
| | << (arch.hasFloatAtomicAdd ? "yes" : "no") |
| | << "\n Has 64-bit integer atomics for global memory: " |
| | << (arch.hasGlobalInt64Atomics ? "yes" : "no") |
| | << "\n Has 64-bit integer atomics for shared memory: " |
| | << (arch.hasSharedInt64Atomics ? "yes" : "no") |
| | << "\n Has double-precision floating point: " |
| | << (arch.hasDoubles ? "yes" : "no") |
| | << "\n Has warp vote instructions (__any, __all): " |
| | << (arch.hasWarpVote: ? "yes" : "no") |
| | << "\n Has warp ballot instructions (__ballot): " |
| | << (arch.hasWarpBallot: ? "yes" : "no") |
| | << "\n Has warp shuffle operations. (__shfl_*): " |
| | << (arch.hasWarpShuffle ? "yes" : "no") |
| | << "\n Has funnel two words into one with shift&mask caps: " |
| | << (arch.hasFunnelShift ? "yes" : "no") |
| | << "\n Has __threadfence_system: " |
| | << (arch.hasThreadFenceSystem ? "yes" : "no") |
| | << "\n Has __syncthreads_count, syncthreads_and, syncthreads_or: " |
| | << (arch.hasSyncThreadsExt ? "yes" : "no") |
| | << "\n Has surface functions: " |
| | << (arch.hasSurfaceFuncs ? "yes" : "no") |
| | << "\n Grid and group dims are 3D (rather than 2D): " |
| | << (arch.has3dGrid ? "yes" : "no") |
| | << "\n Has dynamic parallelism: " |
| | << (arch.hasDynamicParallelism ? "yes" : "no"); |
| | return oss.str(); |
| | } |
| |
|
| | inline std::string PrintInfoDeviceProperties(const DevicePropertyType& prop) { |
| | std::ostringstream oss; |
| | oss << "Hardware accelerator device properties: " |
| | << "\n Device: " |
| | << "\n ASCII string identifying device: " << prop.name |
| | << "\n Major compute capability: " << prop.major |
| | << "\n Minor compute capability: " << prop.minor |
| | << "\n AMD GCN Arch Value: " << prop.gcnArch |
| | << "\n PCI bus ID of the device: " << prop.pciBusID |
| | << "\n PCI device ID of the device: " << prop.pciDeviceID |
| | << "\n Memory limits: " |
| | << "\n Constant memory available on device in bytes: " |
| | << prop.totalConstMem |
| | << "\n Global memory available on device in bytes: " |
| | << prop.totalGlobalMem |
| | << "\n Global memory bus width in bits: " << prop.memoryBusWidth |
| | << "\n Size of L2 cache in bytes: " << prop.l2CacheSize |
| | << "\n Shared memory available per block in bytes: " |
| | << prop.sharedMemPerBlock |
| | << "\n Maximum Shared Memory Per Multiprocessor in bytes: " |
| | << prop.maxSharedMemoryPerMultiProcessor; |
| | return oss.str(); |
| | } |
| |
|
| | inline std::string PrintDebugDeviceProperties(const DevicePropertyType& prop) { |
| | std::ostringstream oss; |
| | oss << "Hardware accelerator device properties: " |
| | << "\n Device: " |
| | << "\n ASCII string identifying device: " << prop.name |
| | << "\n Major compute capability: " << prop.major |
| | << "\n Minor compute capability: " << prop.minor |
| | << "\n AMD GCN Arch Value: " << prop.gcnArch |
| | << "\n PCI bus ID of the device: " << prop.pciBusID |
| | << "\n PCI device ID of the device: " << prop.pciDeviceID |
| |
|
| | << "\n Memory limits: " |
| | << "\n Constant memory available on device in bytes: " |
| | << prop.totalConstMem |
| | << "\n Global memory available on device in bytes: " |
| | << prop.totalGlobalMem |
| | << "\n Global memory bus width in bits: " << prop.memoryBusWidth |
| | << "\n Size of L2 cache in bytes: " << prop.l2CacheSize |
| | << "\n Shared memory available per block in bytes: " |
| | << prop.sharedMemPerBlock |
| | << "\n Maximum Shared Memory Per Multiprocessor in bytes: " |
| | << prop.maxSharedMemoryPerMultiProcessor |
| | << "\n Max global memory clock frequency in khz: " |
| | << prop.memoryClockRate |
| | << "\n Peak global memory bandwidth (GByte/s): " |
| | << (prop.memoryClockRate / 1e6) * (prop.memoryBusWidth / 8) * 2 |
| |
|
| | << "\n Thread limits: " |
| | << "\n Warp size in threads: " << prop.warpSize |
| | << "\n Maximum size of each dimension of a grid: " |
| | << prop.maxGridSize[0] << " " << prop.maxGridSize[1] << " " |
| | << prop.maxGridSize[2] |
| | << "\n Maximum size of each dimension of a block: " |
| | << prop.maxThreadsDim[0] << " " << prop.maxThreadsDim[1] << " " |
| | << prop.maxThreadsDim[2] << "\n Maximum number of threads per block: " |
| | << prop.maxThreadsPerBlock |
| | << "\n Registers available per block: " << prop.regsPerBlock |
| | << "\n Number of multiprocessors on device: " |
| | << prop.multiProcessorCount |
| | << "\n Maximum resident threads per multiprocessor: " |
| | << prop.maxThreadsPerMultiProcessor |
| | << "\n Max clock frequency of the multiProcessors in khz: " |
| | << prop.clockRate |
| |
|
| | << "\n Device features: " |
| | << "\n Device can possibly execute multiple kernels concurrently: " |
| | << (prop.concurrentKernels ? "yes" : "no") |
| | << "\n Device is on a multi-GPU board: " |
| | << (prop.isMultiGpuBoard ? "yes" : "no") |
| | << "\n HIP can map host memory: " |
| | << (prop.canMapHostMemory ? "yes" : "no") |
| | << PrintArchFeatureFlags(prop.arch); |
| |
|
| | return oss.str(); |
| | } |
| |
|
| | inline DeviceError StreamCreate(StreamType* stream, bool non_blocking = false) { |
| | auto flags = non_blocking ? hipStreamNonBlocking : hipStreamDefault; |
| | return hipStreamCreateWithFlags(stream, flags); |
| | } |
| |
|
| | inline DeviceError StreamBeginCapture(StreamType stream, bool global = true) { |
| | auto capture_mode = |
| | global ? hipStreamCaptureModeGlobal : hipStreamCaptureModeThreadLocal; |
| | return hipStreamBeginCapture(stream, capture_mode); |
| | } |
| |
|
| | inline DeviceError StreamEndCapture(StreamType stream, GraphType* graph) { |
| | return hipStreamEndCapture(stream, graph); |
| | } |
| |
|
| | inline DeviceError StreamDestroy(StreamType stream) { |
| | return hipStreamDestroy(stream); |
| | } |
| |
|
| | inline DeviceError GraphInstantiate( |
| | GraphExecType* graph_exec, |
| | GraphType graph) { |
| | return hipGraphInstantiate(graph_exec, graph, nullptr, nullptr, 0); |
| | } |
| |
|
| | inline DeviceError GraphDestroy(GraphType graph) { |
| | return hipGraphDestroy(graph); |
| | } |
| |
|
| | inline DeviceError GraphExecUpdate(GraphExecType graph_exec, GraphType graph) { |
| | |
| | return hipErrorUnknown; |
| | } |
| |
|
| | inline DeviceError GraphExecDestroy(GraphExecType graph_exec) { |
| | return hipGraphExecDestroy(graph_exec); |
| | } |
| |
|
| | inline DeviceError GraphExecLaunch( |
| | GraphExecType graph_exec, |
| | StreamType stream) { |
| | return hipGraphLaunch(graph_exec, stream); |
| | } |
| |
|
| | inline DeviceError CopyToDevice( |
| | Handle dst, |
| | const void* src, |
| | size_t size, |
| | StreamType stream = 0) { |
| | return hipMemcpyAsync(dst, src, size, hipMemcpyHostToDevice, stream); |
| | } |
| |
|
| | inline DeviceError CopyToHost( |
| | Handle dst, |
| | const void* src, |
| | size_t size, |
| | StreamType stream = 0) { |
| | return hipMemcpyAsync(dst, src, size, hipMemcpyDeviceToHost, stream); |
| | } |
| |
|
| | inline DeviceError DeviceToDeviceCopy( |
| | Handle dst, |
| | const void* src, |
| | size_t size, |
| | StreamType stream = 0) { |
| | return hipMemcpyAsync(dst, src, size, hipMemcpyDeviceToDevice, stream); |
| | } |
| |
|
| | inline DeviceError FreeDeviceMemory(Handle src) { |
| | return hipFree(src); |
| | } |
| |
|
| | inline DeviceError FreeDeviceHostMemory(Handle src) { |
| | return hipHostFree(src); |
| | } |
| |
|
| | inline DeviceError FreeDeviceMemoryAsync( |
| | Handle src, |
| | StreamType = 0) { |
| | |
| | return hipFree(src); |
| | } |
| |
|
| | inline DeviceError DeviceMalloc(Handle* dst, size_t size) { |
| | return hipMalloc(dst, size); |
| | } |
| |
|
| | inline DeviceError DeviceMallocHost(Handle* dst, size_t size) { |
| | return hipHostMalloc(dst, size, hipHostMallocDefault); |
| | } |
| |
|
| | inline DeviceError DeviceMallocAsync( |
| | Handle* dst, |
| | size_t size, |
| | StreamType = 0) { |
| | |
| | return hipMalloc(dst, size); |
| | } |
| |
|
| | inline DeviceError GetDeviceSuccess() { |
| | return hipSuccess; |
| | } |
| |
|
| | inline DeviceError DeviceMemset(Handle src, int value, size_t size) { |
| | return hipMemset(src, value, size); |
| | } |
| |
|
| | inline DeviceError GetLastError() { |
| | return hipGetLastError(); |
| | } |
| |
|
| | inline std::string GetLastErrorString() { |
| | return hipGetErrorString(hipGetLastError()); |
| | } |
| |
|
| | inline DeviceError StreamSynchronize(StreamType stream) { |
| | return hipStreamSynchronize(stream); |
| | } |
| |
|
| | inline DeviceError CreateEvent(EventType* event) { |
| | return hipEventCreate(event); |
| | } |
| |
|
| | inline DeviceError DestroyEvent(EventType event) { |
| | return hipEventDestroy(event); |
| | } |
| |
|
| | inline DeviceError EventRecord(EventType event, StreamType stream = 0) { |
| | return hipEventRecord(event, stream); |
| | } |
| |
|
| | inline DeviceError EventSynchronize(EventType event) { |
| | return hipEventSynchronize(event); |
| | } |
| |
|
| | inline DeviceError EventElapsedTime(float* ms, EventType start, EventType end) { |
| | return hipEventElapsedTime(ms, start, end); |
| | } |
| |
|
| | inline DeviceError QueryEvent(EventType event) { |
| | return hipEventQuery(event); |
| | } |
| |
|
| | inline const char* GetErrorString(DeviceError err) { |
| | return hipGetErrorString(err); |
| | } |
| |
|
| | inline DeviceError GetDeviceNotReady() { |
| | return hipErrorNotReady; |
| | } |
| |
|
| | inline DeviceError GetDriverVersion(int* driverVersion) { |
| | return hipDriverGetVersion(driverVersion); |
| | } |
| |
|
| | inline DeviceError GetRuntimeVersion(int* runtimeVersion) { |
| | return hipRuntimeGetVersion(runtimeVersion); |
| | } |
| |
|
| | inline void ProfilerRangePush(const char* msg) { |
| | |
| | |
| | } |
| |
|
| | inline void ProfilerRangePop() { |
| | |
| | |
| | } |
| | } |
| |
|