| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | #pragma once |
| |
|
#include <sstream>
#include <string>
#include <tuple>
#include <vector>

#include <cuda_bf16.h>
#include <nvtx3/nvToolsExt.h>

#include "cutlass/conv/conv2d_problem_size.h"
#include "cutlass/conv/device/implicit_gemm_convolution.h"
#include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/cutlass.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/reference/host/tensor_fill.h"
| |
|
namespace ait {

// NOTE(review): appears to gate CUDA-graph capture/replay at runtime —
// confirm semantics at the call sites. thread_local so concurrent model
// runners on different host threads can toggle it independently.
inline thread_local bool target_has_graph_mode = true;

// Backend-generic aliases over the CUDA runtime types; the rest of this
// header (and generated code) refers to these names instead of cuda*_t.
using DeviceError = cudaError_t;
using DevicePropertyType = cudaDeviceProp;
using StreamType = cudaStream_t;
using EventType = cudaEvent_t;
using GraphType = cudaGraph_t;
using GraphExecType = cudaGraphExec_t;
using Handle = void*;

// 16-bit brain-float type (declared in <cuda_bf16.h>).
using bfloat16 = __nv_bfloat16;
| |
|
// Writes the ordinal of the device currently bound to the calling host
// thread into `device_idx`.
inline DeviceError GetDevice(int* device_idx) {
  return cudaGetDevice(device_idx);
}
| |
|
// Fills `prop` with the properties of the device at ordinal `device_idx`.
inline DeviceError GetDeviceProperties(
    DevicePropertyType* prop,
    int device_idx) {
  return cudaGetDeviceProperties(prop, device_idx);
}
| |
|
// Formats a 16-byte device UUID (e.g. cudaDeviceProp::uuid.bytes) as the
// canonical "GPU-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" string used by
// NVML / nvidia-smi.
//
// Fixes over the previous version: the group bounds are compile-time
// constants instead of a per-call heap-allocated vector of tuples, and the
// inner loop no longer mixes a size_t index with an int bound.
inline std::string GetUUIDToString(const char bytes[16]) {
  // [begin, end) byte ranges of the five dash-separated UUID groups.
  static constexpr size_t kGroupBegin[] = {0, 4, 6, 8, 10};
  static constexpr size_t kGroupEnd[] = {4, 6, 8, 10, 16};
  static constexpr char kHexChars[] = "0123456789abcdef";

  std::string result = "GPU";
  result.reserve(36); // "GPU" + 5 dashes + 32 hex digits
  for (size_t g = 0; g < 5; ++g) {
    result += '-';
    for (size_t i = kGroupBegin[g]; i < kGroupEnd[g]; ++i) {
      // Go through unsigned char so a negative `char` byte cannot
      // sign-extend before the mask/shift.
      const unsigned int byte = static_cast<unsigned char>(bytes[i]);
      result += kHexChars[byte >> 4];
      result += kHexChars[byte & 0x0Fu];
    }
  }
  return result;
}
| |
|
// Renders an exhaustive human-readable dump of `prop` — identity, memory
// limits, thread limits, feature flags, and texture/surface limits — for
// debug-level logging. Pure string formatting; performs no CUDA calls.
inline std::string PrintDebugDeviceProperties(const DevicePropertyType& prop) {
  std::ostringstream oss;
  oss << "Hardware accelerator device properties: "
      << "\n Device: "
      << "\n ASCII string identifying device: " << prop.name
      << "\n Major compute capability: " << prop.major
      << "\n Minor compute capability: " << prop.minor
      << "\n UUID: " << GetUUIDToString(prop.uuid.bytes)
      << "\n Unique identifier for a group of devices on the same multi-GPU board: "
      << prop.multiGpuBoardGroupID
      << "\n PCI bus ID of the device: " << prop.pciBusID
      << "\n PCI device ID of the device: " << prop.pciDeviceID
      << "\n PCI domain ID of the device: " << prop.pciDomainID

      << "\n Memory limits: "
      << "\n Constant memory available on device in bytes: "
      << prop.totalConstMem
      << "\n Global memory available on device in bytes: "
      << prop.totalGlobalMem
      << "\n Global memory bus width in bits: " << prop.memoryBusWidth
      << "\n Size of L2 cache in bytes: " << prop.l2CacheSize
      << "\n Device's maximum L2 persisting lines capacity in bytes: "
      << prop.persistingL2CacheMaxSize
      << "\n Shared memory reserved by CUDA driver per block in bytes: "
      << prop.reservedSharedMemPerBlock
      << "\n Shared memory available per block in bytes: "
      << prop.sharedMemPerBlock
      << "\n Per device maximum shared memory per block usable by special opt in: "
      << prop.sharedMemPerBlockOptin
      << "\n Shared memory available per multiprocessor in bytes: "
      << prop.sharedMemPerMultiprocessor
      << "\n The maximum value of cudaAccessPolicyWindow::num_bytes: "
      << prop.accessPolicyMaxWindowSize
      << "\n Max global memory clock frequency in khz: "
      << prop.memoryClockRate
      << "\n Peak global memory bandwidth (GByte/s): "
      // kHz * 1e3 (Hz) * (bus bits / 8 -> bytes) * 2 (DDR) / 1e9 (GB)
      // simplifies to (kHz / 1e6) * bytes * 2 below.
      << (prop.memoryClockRate / 1e6) * (prop.memoryBusWidth / 8) * 2

      << "\n Thread limits: "
      << "\n Warp size in threads: " << prop.warpSize
      << "\n Maximum size of each dimension of a grid: "
      << prop.maxGridSize[0] << " " << prop.maxGridSize[1] << " "
      << prop.maxGridSize[2]
      << "\n Maximum size of each dimension of a block: "
      << prop.maxThreadsDim[0] << " " << prop.maxThreadsDim[1] << " "
      << prop.maxThreadsDim[2]
      << "\n Number of asynchronous engines: " << prop.asyncEngineCount
      << "\n Maximum number of resident blocks per multiprocessor: "
      << prop.maxBlocksPerMultiProcessor
      << "\n Maximum number of threads per block: "
      << prop.maxThreadsPerBlock
      << "\n Maximum resident threads per multiprocessor: "
      << prop.maxThreadsPerMultiProcessor
      << "\n Maximum pitch in bytes allowed by memory copies: "
      << prop.memPitch << "\n Number of multiprocessors on device: "
      << prop.multiProcessorCount
      << "\n 32-bit registers available per block: " << prop.regsPerBlock
      << "\n 32-bit registers available per multiprocessor: "
      << prop.regsPerMultiprocessor
      << "\n Max clock frequency of the multiProcessors in khz: "
      << prop.clockRate

      << "\n Device features: "
      << "\n Device has ECC support enabled: "
      << (prop.ECCEnabled ? "yes" : "no")
      << "\n Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer: "
      << (prop.canMapHostMemory ? "yes" : "no")
      << "\n Device can access host registered memory at the same virtual address as the CPU: "
      << (prop.canUseHostPointerForRegisteredMem ? "yes" : "no")
      << "\n Device supports Compute Preemption: "
      << (prop.computePreemptionSupported ? "yes" : "no")
      << "\n Device can possibly execute multiple kernels concurrently: "
      << (prop.concurrentKernels ? "yes" : "no")
      << "\n Device can coherently access managed memory concurrently with the CPU: "
      << (prop.concurrentManagedAccess ? "yes" : "no")
      << "\n Device supports launching cooperative kernels via cudaLaunchCooperativeKernel: "
      << (prop.cooperativeLaunch ? "yes" : "no")
      << "\n Host can directly access managed memory on the device without migration: "
      << (prop.directManagedMemAccessFromHost ? "yes" : "no")
      << "\n Device supports caching globals in L1: "
      << (prop.globalL1CacheSupported ? "yes" : "no")
      << "\n Link between the device and the host supports native atomic operations: "
      << (prop.hostNativeAtomicSupported ? "yes" : "no")
      << "\n Device is integrated as opposed to discrete: "
      << (prop.integrated ? "yes" : "no")
      << "\n Device is on a multi-GPU board: "
      << (prop.isMultiGpuBoard ? "yes" : "no")
      << "\n Device supports caching locals in L1: "
      << (prop.localL1CacheSupported ? "yes" : "no")
      << "\n Device supports allocating managed memory on this system: "
      << (prop.managedMemory ? "yes" : "no")
      << "\n Device supports coherently accessing pageable memory without calling cudaHostRegister on it: "
      << (prop.pageableMemoryAccess ? "yes" : "no")
      << "\n Device accesses pageable memory via the host's page tables: "
      << (prop.pageableMemoryAccessUsesHostPageTables ? "yes" : "no")
      << "\n Device supports stream priorities: "
      << (prop.streamPrioritiesSupported ? "yes" : "no")
      << "\n Device is a Tesla device using TCC driver: "
      << (prop.tccDriver ? "yes" : "no")
      << "\n Device shares a unified address space with the host: "
      << (prop.unifiedAddressing ? "yes" : "no")

      << "\n Texture limits: "
      << "\n Maximum 1D surface size: " << prop.maxSurface1D
      << "\n Maximum 1D layered surface dimensions: "
      << prop.maxSurface1DLayered[0] << " " << prop.maxSurface1DLayered[1]
      << "\n Maximum 2D surface dimensions: " << prop.maxSurface2D[0] << " "
      << prop.maxSurface2D[1]
      << "\n Maximum 2D layered surface dimensions: "
      << prop.maxSurface2DLayered[0] << " " << prop.maxSurface2DLayered[1]
      << " " << prop.maxSurface2DLayered[2]
      << "\n Maximum 3D surface dimensions: " << prop.maxSurface3D[0] << " "
      << prop.maxSurface3D[1] << " " << prop.maxSurface3D[2]
      << "\n Maximum Cubemap surface dimensions: " << prop.maxSurfaceCubemap
      << "\n Maximum Cubemap layered surface dimensions: "
      << prop.maxSurfaceCubemapLayered[0] << " "
      << prop.maxSurfaceCubemapLayered[1]
      << "\n Maximum 1D texture size: " << prop.maxTexture1D
      << "\n Maximum 1D layered texture dimensions "
      << prop.maxTexture1DLayered[0] << " " << prop.maxTexture1DLayered[1]
      << "\n Maximum 1D mipmapped texture size: " << prop.maxTexture1DMipmap
      << "\n Maximum 2D texture dimensions: " << prop.maxTexture2D[0] << " "
      << prop.maxTexture2D[1]
      << "\n Maximum 2D texture dimensions if texture gather operations have to be performed: "
      << prop.maxTexture2DGather[0] << " " << prop.maxTexture2DGather[1]
      << "\n Maximum 2D layered texture dimensions: "
      << prop.maxTexture2DLayered[0] << " " << prop.maxTexture2DLayered[1]
      << " " << prop.maxTexture2DLayered[2]
      << "\n Maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory: "
      << prop.maxTexture2DLinear[0] << " " << prop.maxTexture2DLinear[1] << " "
      << prop.maxTexture2DLinear[2]
      << "\n Maximum 2D mipmapped texture dimensions: "
      << prop.maxTexture2DMipmap[0] << " " << prop.maxTexture2DMipmap[1]
      << "\n Maximum 3D texture dimensions: " << prop.maxTexture3D[0] << " "
      << prop.maxTexture3D[1] << " " << prop.maxTexture3D[2]
      << "\n Maximum alternate 3D texture dimensions: "
      << prop.maxTexture3DAlt[0] << " " << prop.maxTexture3DAlt[1] << " "
      << prop.maxTexture3DAlt[2]
      << "\n Maximum Cubemap texture dimensions: " << prop.maxTextureCubemap
      << "\n Maximum Cubemap layered texture dimensions: "
      << prop.maxTextureCubemapLayered[0] << " "
      << prop.maxTextureCubemapLayered[1]
      << "\n Alignment requirements for surfaces: " << prop.surfaceAlignment
      << "\n Alignment requirement for textures: " << prop.textureAlignment
      << "\n Pitch alignment requirement for texture references bound to pitched memory: "
      << prop.texturePitchAlignment;
  return oss.str();
}
| |
|
// Renders a condensed summary of `prop` (identity plus key memory limits)
// for info-level logging. Pure string formatting; performs no CUDA calls.
inline std::string PrintInfoDeviceProperties(const DevicePropertyType& prop) {
  std::ostringstream oss;
  oss << "Hardware accelerator device properties: "
      << "\n Device: "
      << "\n ASCII string identifying device: " << prop.name
      << "\n Major compute capability: " << prop.major
      << "\n Minor compute capability: " << prop.minor
      << "\n UUID: " << GetUUIDToString(prop.uuid.bytes)
      << "\n Unique identifier for a group of devices on the same multi-GPU board: "
      << prop.multiGpuBoardGroupID
      << "\n PCI bus ID of the device: " << prop.pciBusID
      << "\n PCI device ID of the device: " << prop.pciDeviceID
      << "\n PCI domain ID of the device: " << prop.pciDomainID

      << "\n Memory limits: "
      << "\n Constant memory available on device in bytes: "
      << prop.totalConstMem
      << "\n Global memory available on device in bytes: "
      << prop.totalGlobalMem
      << "\n Size of L2 cache in bytes: " << prop.l2CacheSize
      << "\n Shared memory available per block in bytes: "
      << prop.sharedMemPerBlock
      << "\n Shared memory available per multiprocessor in bytes: "
      << prop.sharedMemPerMultiprocessor;
  return oss.str();
}
| |
|
| | inline DeviceError StreamCreate(StreamType* stream, bool non_blocking = false) { |
| | auto flags = non_blocking ? cudaStreamNonBlocking : cudaStreamDefault; |
| | return cudaStreamCreateWithFlags(stream, flags); |
| | } |
| |
|
| | inline DeviceError StreamBeginCapture(StreamType stream, bool global = true) { |
| | auto capture_mode = |
| | global ? cudaStreamCaptureModeGlobal : cudaStreamCaptureModeThreadLocal; |
| | return cudaStreamBeginCapture(stream, capture_mode); |
| | } |
| |
|
// Ends capture on `stream` and returns the captured graph in `graph`.
inline DeviceError StreamEndCapture(StreamType stream, GraphType* graph) {
  return cudaStreamEndCapture(stream, graph);
}
| |
|
// Destroys `stream` (thin wrapper over cudaStreamDestroy).
inline DeviceError StreamDestroy(StreamType stream) {
  return cudaStreamDestroy(stream);
}
| |
|
| | inline DeviceError GraphInstantiate( |
| | GraphExecType* graph_exec, |
| | GraphType graph) { |
| | return cudaGraphInstantiate(graph_exec, graph, nullptr, nullptr, 0); |
| | } |
| |
|
// Destroys a captured (non-instantiated) graph.
inline DeviceError GraphDestroy(GraphType graph) {
  return cudaGraphDestroy(graph);
}
| |
|
| | inline DeviceError GraphExecUpdate(GraphExecType graph_exec, GraphType graph) { |
| | cudaGraphExecUpdateResult update; |
| | return cudaGraphExecUpdate(graph_exec, graph, nullptr, &update); |
| | } |
| |
|
// Destroys an instantiated executable graph.
inline DeviceError GraphExecDestroy(GraphExecType graph_exec) {
  return cudaGraphExecDestroy(graph_exec);
}
| |
|
// Launches an instantiated graph on `stream`.
inline DeviceError GraphExecLaunch(
    GraphExecType graph_exec,
    StreamType stream) {
  return cudaGraphLaunch(graph_exec, stream);
}
| |
|
// Asynchronous host-to-device copy of `size` bytes, ordered on `stream`.
// NOTE(review): for the copy to actually overlap, `src` must be pinned
// host memory (e.g. from DeviceMallocHost); otherwise the runtime may
// stage it synchronously.
inline DeviceError CopyToDevice(
    Handle dst,
    const void* src,
    size_t size,
    StreamType stream = 0) {
  return cudaMemcpyAsync(dst, src, size, cudaMemcpyHostToDevice, stream);
}
| |
|
// Asynchronous device-to-host copy of `size` bytes, ordered on `stream`.
// The host buffer is not safe to read until the stream is synchronized.
inline DeviceError CopyToHost(
    Handle dst,
    const void* src,
    size_t size,
    StreamType stream = 0) {
  return cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToHost, stream);
}
| |
|
// Asynchronous device-to-device copy of `size` bytes, ordered on `stream`.
inline DeviceError DeviceToDeviceCopy(
    Handle dst,
    const void* src,
    size_t size,
    StreamType stream = 0) {
  return cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToDevice, stream);
}
| |
|
// Frees device memory allocated with DeviceMalloc (cudaMalloc).
inline DeviceError FreeDeviceMemory(Handle src) {
  return cudaFree(src);
}
| |
|
// Frees pinned host memory allocated with DeviceMallocHost (cudaMallocHost).
inline DeviceError FreeDeviceHostMemory(Handle src) {
  return cudaFreeHost(src);
}
| |
|
// Stream-ordered free for memory allocated with DeviceMallocAsync
// (cudaFreeAsync; requires CUDA 11.2+ stream-ordered allocator).
inline DeviceError FreeDeviceMemoryAsync(Handle src, StreamType stream = 0) {
  return cudaFreeAsync(src, stream);
}
| |
|
// Allocates `size` bytes of device memory (synchronous cudaMalloc).
inline DeviceError DeviceMalloc(Handle* dst, size_t size) {
  return cudaMalloc(dst, size);
}
| |
|
// Allocates `size` bytes of pinned (page-locked) host memory, suitable for
// true-async transfers via CopyToDevice/CopyToHost.
inline DeviceError DeviceMallocHost(Handle* dst, size_t size) {
  return cudaMallocHost(dst, size);
}
| |
|
// Stream-ordered allocation of `size` bytes from the device memory pool
// (cudaMallocAsync; requires CUDA 11.2+).
inline DeviceError DeviceMallocAsync(
    Handle* dst,
    size_t size,
    StreamType stream = 0) {
  return cudaMallocAsync(dst, size, stream);
}
| |
|
// The backend's success status code, for comparisons against DeviceError
// values returned by the wrappers in this header.
inline DeviceError GetDeviceSuccess() {
  return cudaSuccess;
}
| |
|
// Byte-wise fill of `size` bytes of device memory with `value`
// (synchronous cudaMemset; only the low byte of `value` is used).
inline DeviceError DeviceMemset(Handle src, int value, size_t size) {
  return cudaMemset(src, value, size);
}
| |
|
// Returns the last error raised on the calling host thread and clears the
// sticky error state.
inline DeviceError GetLastError() {
  return cudaGetLastError();
}
| |
|
// Returns the description of the last error. NOTE: this consumes the
// sticky error state (cudaGetLastError), so calling GetLastError()
// afterwards will report success — fetch the code first if it is needed.
inline std::string GetLastErrorString() {
  return cudaGetErrorString(cudaGetLastError());
}
| |
|
// Blocks the calling host thread until all work queued on `stream` completes.
inline DeviceError StreamSynchronize(StreamType stream) {
  return cudaStreamSynchronize(stream);
}
| |
|
// Creates a CUDA event with default flags (timing enabled).
inline DeviceError CreateEvent(EventType* event) {
  return cudaEventCreate(event);
}
| |
|
// Destroys an event created by CreateEvent.
inline DeviceError DestroyEvent(EventType event) {
  return cudaEventDestroy(event);
}
| |
|
// Records `event` into `stream`; the event completes once all prior work
// on the stream has completed.
inline DeviceError EventRecord(EventType event, StreamType stream = 0) {
  return cudaEventRecord(event, stream);
}
| |
|
// Blocks the calling host thread until `event` has completed.
inline DeviceError EventSynchronize(EventType event) {
  return cudaEventSynchronize(event);
}
| |
|
// Writes into `ms` the elapsed milliseconds between two recorded,
// completed events `start` and `end`.
inline DeviceError EventElapsedTime(float* ms, EventType start, EventType end) {
  return cudaEventElapsedTime(ms, start, end);
}
| |
|
// Non-blocking completion check: returns success when `event` has
// completed, or the "not ready" status (see GetDeviceNotReady) otherwise.
inline DeviceError QueryEvent(EventType event) {
  return cudaEventQuery(event);
}
| |
|
// Returns a human-readable description for `err`.
inline const char* GetErrorString(DeviceError err) {
  return cudaGetErrorString(err);
}
| |
|
// The "operation not yet complete" status, e.g. returned by QueryEvent
// while queued work is still pending. Not an error condition.
inline DeviceError GetDeviceNotReady() {
  return cudaErrorNotReady;
}
| |
|
// Writes the installed CUDA driver version (e.g. 12020 for 12.2).
inline DeviceError GetDriverVersion(int* driverVersion) {
  return cudaDriverGetVersion(driverVersion);
}
| |
|
// Writes the CUDA runtime version this binary is linked against.
inline DeviceError GetRuntimeVersion(int* runtimeVersion) {
  return cudaRuntimeGetVersion(runtimeVersion);
}
| |
|
// Opens a named NVTX range (visible in Nsight Systems / profiler timelines).
// Pair every push with a ProfilerRangePop.
inline void ProfilerRangePush(const char* msg) {
  nvtxRangePushA(msg);
}
| |
|
// Closes the innermost NVTX range opened by ProfilerRangePush.
inline void ProfilerRangePop() {
  nvtxRangePop();
}

} // namespace ait
| |
|