File size: 16,124 Bytes

17db41a

//  Copyright (c) Meta Platforms, Inc. and affiliates.
//
//  Licensed under the Apache License, Version 2.0 (the "License");
//  you may not use this file except in compliance with the License.
//  You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
//  Unless required by applicable law or agreed to in writing, software
//  distributed under the License is distributed on an "AS IS" BASIS,
//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//  See the License for the specific language governing permissions and
//  limitations under the License.
//
#pragma once

#include <string>

#include "cutlass/conv/conv2d_problem_size.h"
#include "cutlass/conv/device/implicit_gemm_convolution.h"
#include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/cutlass.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/reference/host/tensor_fill.h"

#include <nvtx3/nvToolsExt.h>

namespace ait {

inline thread_local bool target_has_graph_mode = true;

using DeviceError = cudaError_t;
using DevicePropertyType = cudaDeviceProp;
using StreamType = cudaStream_t;
using EventType = cudaEvent_t;
using GraphType = cudaGraph_t;
using GraphExecType = cudaGraphExec_t;
using Handle = void*;

using bfloat16 = __nv_bfloat16;

inline DeviceError GetDevice(int* device_idx) {
  return cudaGetDevice(device_idx);
}

inline DeviceError GetDeviceProperties(
    DevicePropertyType* prop,
    int device_idx) {
  return cudaGetDeviceProperties(prop, device_idx);
}

inline std::string GetUUIDToString(const char bytes[16]) {
  std::vector<std::tuple<int, int>> groups = {
      {0, 4}, {4, 6}, {6, 8}, {8, 10}, {10, 16}};
  char const hex_chars[16] = {
      '0',
      '1',
      '2',
      '3',
      '4',
      '5',
      '6',
      '7',
      '8',
      '9',
      'a',
      'b',
      'c',
      'd',
      'e',
      'f'};

  std::string result = "GPU";
  for (auto g : groups) {
    result += "-";
    for (size_t i = std::get<0>(g); i < std::get<1>(g); ++i) {
      result += hex_chars[(bytes[i] & 0xF0) >> 4];
      result += hex_chars[(bytes[i] & 0x0F)];
    }
  }
  return result;
}

inline std::string PrintDebugDeviceProperties(const DevicePropertyType& prop) {
  std::ostringstream oss;
  oss << "Hardware accelerator device properties: "
      << "\n  Device: "
      << "\n     ASCII string identifying device: " << prop.name
      << "\n     Major compute capability: " << prop.major
      << "\n     Minor compute capability: " << prop.minor
      << "\n     UUID: " << GetUUIDToString(prop.uuid.bytes)
      << "\n     Unique identifier for a group of devices on the same multi-GPU board: "
      << prop.multiGpuBoardGroupID
      << "\n     PCI bus ID of the device: " << prop.pciBusID
      << "\n     PCI device ID of the device: " << prop.pciDeviceID
      << "\n     PCI domain ID of the device: " << prop.pciDomainID

      << "\n  Memory limits: "
      << "\n     Constant memory available on device in bytes: "
      << prop.totalConstMem
      << "\n     Global memory available on device in bytes: "
      << prop.totalGlobalMem
      << "\n     Global memory bus width in bits: " << prop.memoryBusWidth
      << "\n     Size of L2 cache in bytes: " << prop.l2CacheSize
      << "\n     Device's maximum L2 persisting lines capacity in bytes: "
      << prop.persistingL2CacheMaxSize
      << "\n     Shared memory reserved by CUDA driver per block in bytes: "
      << prop.reservedSharedMemPerBlock
      << "\n     Shared memory available per block in bytes: "
      << prop.sharedMemPerBlock
      << "\n     Per device maximum shared memory per block usable by special opt in: "
      << prop.sharedMemPerBlockOptin
      << "\n     Shared memory available per multiprocessor in bytes: "
      << prop.sharedMemPerMultiprocessor
      << "\n     The maximum value of cudaAccessPolicyWindow::num_bytes: "
      << prop.accessPolicyMaxWindowSize
      << "\n     Max global memory clock frequency in khz: "
      << prop.memoryClockRate
      << "\n     Peak global memory bandwidth (GByte/s): "
      << (prop.memoryClockRate / 1e6) * (prop.memoryBusWidth / 8) * 2

      << "\n  Thread limits: "
      << "\n     Warp size in threads: " << prop.warpSize
      << "\n     Maximum size of each dimension of a grid: "
      << prop.maxGridSize[0] << " " << prop.maxGridSize[1] << " "
      << prop.maxGridSize[2]
      << "\n     Maximum size of each dimension of a block: "
      << prop.maxThreadsDim[0] << " " << prop.maxThreadsDim[1] << " "
      << prop.maxThreadsDim[2]
      << "\n     Number of asynchronous engines: " << prop.asyncEngineCount
      << "\n     Maximum number of resident blocks per multiprocessor: "
      << prop.maxBlocksPerMultiProcessor
      << "\n     Maximum number of threads per block: "
      << prop.maxThreadsPerBlock
      << "\n     Maximum resident threads per multiprocessor: "
      << prop.maxThreadsPerMultiProcessor
      << "\n     Maximum pitch in bytes allowed by memory copies: "
      << prop.memPitch << "\n     Number of multiprocessors on device: "
      << prop.multiProcessorCount
      << "\n     32-bit registers available per block: " << prop.regsPerBlock
      << "\n     32-bit registers available per multiprocessor: "
      << prop.regsPerMultiprocessor
      << "\n     Max clock frequency of the multiProcessors in khz: "
      << prop.clockRate

      << "\n  Device features: "
      << "\n     Device has ECC support enabled: "
      << (prop.ECCEnabled ? "yes" : "no")
      << "\n     Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer: "
      << (prop.canMapHostMemory ? "yes" : "no")
      << "\n     Device can access host registered memory at the same virtual address as the CPU: "
      << (prop.canUseHostPointerForRegisteredMem ? "yes" : "no")
      << "\n     Device supports Compute Preemption: "
      << (prop.computePreemptionSupported ? "yes" : "no")
      << "\n     Device can possibly execute multiple kernels concurrently: "
      << (prop.concurrentKernels ? "yes" : "no")
      << "\n     Device can coherently access managed memory concurrently with the CPU: "
      << (prop.concurrentManagedAccess ? "yes" : "no")
      << "\n     Device supports launching cooperative kernels via cudaLaunchCooperativeKernel: "
      << (prop.cooperativeLaunch ? "yes" : "no")
      << "\n     Host can directly access managed memory on the device without migration: "
      << (prop.directManagedMemAccessFromHost ? "yes" : "no")
      << "\n     Device supports caching globals in L1: "
      << (prop.globalL1CacheSupported ? "yes" : "no")
      << "\n     Link between the device and the host supports native atomic operations: "
      << (prop.hostNativeAtomicSupported ? "yes" : "no")
      << "\n     Device is integrated as opposed to discrete: "
      << (prop.integrated ? "yes" : "no")
      << "\n     Device is on a multi-GPU board: "
      << (prop.isMultiGpuBoard ? "yes" : "no")
      << "\n     Device supports caching locals in L1: "
      << (prop.localL1CacheSupported ? "yes" : "no")
      << "\n     Device supports allocating managed memory on this system: "
      << (prop.managedMemory ? "yes" : "no")
      << "\n     Device supports coherently accessing pageable memory without calling cudaHostRegister on it: "
      << (prop.pageableMemoryAccess ? "yes" : "no")
      << "\n     Device accesses pageable memory via the host's page tables: "
      << (prop.pageableMemoryAccessUsesHostPageTables ? "yes" : "no")
      << "\n     Device supports stream priorities: "
      << (prop.streamPrioritiesSupported ? "yes" : "no")
      << "\n     Device is a Tesla device using TCC driver: "
      << (prop.tccDriver ? "yes" : "no")
      << "\n     Device shares a unified address space with the host: "
      << (prop.unifiedAddressing ? "yes" : "no")

      << "\n  Texture limits: "
      << "\n     Maximum 1D surface size: " << prop.maxSurface1D
      << "\n     Maximum 1D layered surface dimensions: "
      << prop.maxSurface1DLayered[0] << " " << prop.maxSurface1DLayered[1]
      << "\n     Maximum 2D surface dimensions: " << prop.maxSurface2D[0] << " "
      << prop.maxSurface2D[1]
      << "\n     Maximum 2D layered surface dimensions: "
      << prop.maxSurface2DLayered[0] << " " << prop.maxSurface2DLayered[1]
      << " " << prop.maxSurface2DLayered[2]
      << "\n     Maximum 3D surface dimensions: " << prop.maxSurface3D[0] << " "
      << prop.maxSurface3D[1] << " " << prop.maxSurface3D[2]
      << "\n     Maximum Cubemap surface dimensions: " << prop.maxSurfaceCubemap
      << "\n     Maximum Cubemap layered surface dimensions: "
      << prop.maxSurfaceCubemapLayered[0] << " "
      << prop.maxSurfaceCubemapLayered[1]
      << "\n     Maximum 1D texture size: " << prop.maxTexture1D
      << "\n     Maximum 1D layered texture dimensions "
      << prop.maxTexture1DLayered[0] << " " << prop.maxTexture1DLayered[1]
      << "\n     Maximum 1D mipmapped texture size: " << prop.maxTexture1DMipmap
      << "\n     Maximum 2D texture dimensions: " << prop.maxTexture2D[0] << " "
      << prop.maxTexture2D[1]
      << "\n     Maximum 2D texture dimensions if texture gather operations have to be performed: "
      << prop.maxTexture2DGather[0] << " " << prop.maxTexture2DGather[1]
      << "\n     Maximum 2D layered texture dimensions: "
      << prop.maxTexture2DLayered[0] << " " << prop.maxTexture2DLayered[1]
      << " " << prop.maxTexture2DLayered[2]
      << "\n     Maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory: "
      << prop.maxTexture2DLinear[0] << " " << prop.maxTexture2DLinear[1] << " "
      << prop.maxTexture2DLinear[2]
      << "\n     Maximum 2D mipmapped texture dimensions: "
      << prop.maxTexture2DMipmap[0] << " " << prop.maxTexture2DMipmap[1]
      << "\n     Maximum 3D texture dimensions: " << prop.maxTexture3D[0] << " "
      << prop.maxTexture3D[1] << " " << prop.maxTexture3D[2]
      << "\n     Maximum alternate 3D texture dimensions: "
      << prop.maxTexture3DAlt[0] << " " << prop.maxTexture3DAlt[1] << " "
      << prop.maxTexture3DAlt[2]
      << "\n     Maximum Cubemap texture dimensions: " << prop.maxTextureCubemap
      << "\n     Maximum Cubemap layered texture dimensions: "
      << prop.maxTextureCubemapLayered[0] << " "
      << prop.maxTextureCubemapLayered[1]
      << "\n     Alignment requirements for surfaces: " << prop.surfaceAlignment
      << "\n     Alignment requirement for textures: " << prop.textureAlignment
      << "\n     Pitch alignment requirement for texture references bound to pitched memory: "
      << prop.texturePitchAlignment;
  return oss.str();
}

inline std::string PrintInfoDeviceProperties(const DevicePropertyType& prop) {
  std::ostringstream oss;
  oss << "Hardware accelerator device properties: "
      << "\n  Device: "
      << "\n     ASCII string identifying device: " << prop.name
      << "\n     Major compute capability: " << prop.major
      << "\n     Minor compute capability: " << prop.minor
      << "\n     UUID: " << GetUUIDToString(prop.uuid.bytes)
      << "\n     Unique identifier for a group of devices on the same multi-GPU board: "
      << prop.multiGpuBoardGroupID
      << "\n     PCI bus ID of the device: " << prop.pciBusID
      << "\n     PCI device ID of the device: " << prop.pciDeviceID
      << "\n     PCI domain ID of the device: " << prop.pciDomainID

      << "\n  Memory limits: "
      << "\n     Constant memory available on device in bytes: "
      << prop.totalConstMem
      << "\n     Global memory available on device in bytes: "
      << prop.totalGlobalMem
      << "\n     Size of L2 cache in bytes: " << prop.l2CacheSize
      << "\n     Shared memory available per block in bytes: "
      << prop.sharedMemPerBlock
      << "\n     Shared memory available per multiprocessor in bytes: "
      << prop.sharedMemPerMultiprocessor;
  return oss.str();
}

inline DeviceError StreamCreate(StreamType* stream, bool non_blocking = false) {
  auto flags = non_blocking ? cudaStreamNonBlocking : cudaStreamDefault;
  return cudaStreamCreateWithFlags(stream, flags);
}

inline DeviceError StreamBeginCapture(StreamType stream, bool global = true) {
  auto capture_mode =
      global ? cudaStreamCaptureModeGlobal : cudaStreamCaptureModeThreadLocal;
  return cudaStreamBeginCapture(stream, capture_mode);
}

inline DeviceError StreamEndCapture(StreamType stream, GraphType* graph) {
  return cudaStreamEndCapture(stream, graph);
}

inline DeviceError StreamDestroy(StreamType stream) {
  return cudaStreamDestroy(stream);
}

inline DeviceError GraphInstantiate(
    GraphExecType* graph_exec,
    GraphType graph) {
  return cudaGraphInstantiate(graph_exec, graph, nullptr, nullptr, 0);
}

inline DeviceError GraphDestroy(GraphType graph) {
  return cudaGraphDestroy(graph);
}

inline DeviceError GraphExecUpdate(GraphExecType graph_exec, GraphType graph) {
  cudaGraphExecUpdateResult update;
  return cudaGraphExecUpdate(graph_exec, graph, nullptr, &update);
}

inline DeviceError GraphExecDestroy(GraphExecType graph_exec) {
  return cudaGraphExecDestroy(graph_exec);
}

inline DeviceError GraphExecLaunch(
    GraphExecType graph_exec,
    StreamType stream) {
  return cudaGraphLaunch(graph_exec, stream);
}

inline DeviceError CopyToDevice(
    Handle dst,
    const void* src,
    size_t size,
    StreamType stream = 0) {
  return cudaMemcpyAsync(dst, src, size, cudaMemcpyHostToDevice, stream);
}

inline DeviceError CopyToHost(
    Handle dst,
    const void* src,
    size_t size,
    StreamType stream = 0) {
  return cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToHost, stream);
}

inline DeviceError DeviceToDeviceCopy(
    Handle dst,
    const void* src,
    size_t size,
    StreamType stream = 0) {
  return cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToDevice, stream);
}

inline DeviceError FreeDeviceMemory(Handle src) {
  return cudaFree(src);
}

inline DeviceError FreeDeviceHostMemory(Handle src) {
  return cudaFreeHost(src);
}

inline DeviceError FreeDeviceMemoryAsync(Handle src, StreamType stream = 0) {
  return cudaFreeAsync(src, stream);
}

inline DeviceError DeviceMalloc(Handle* dst, size_t size) {
  return cudaMalloc(dst, size);
}

inline DeviceError DeviceMallocHost(Handle* dst, size_t size) {
  return cudaMallocHost(dst, size);
}

inline DeviceError DeviceMallocAsync(
    Handle* dst,
    size_t size,
    StreamType stream = 0) {
  return cudaMallocAsync(dst, size, stream);
}

inline DeviceError GetDeviceSuccess() {
  return cudaSuccess;
}

inline DeviceError DeviceMemset(Handle src, int value, size_t size) {
  return cudaMemset(src, value, size);
}

inline DeviceError GetLastError() {
  return cudaGetLastError();
}

inline std::string GetLastErrorString() {
  return cudaGetErrorString(cudaGetLastError());
}

inline DeviceError StreamSynchronize(StreamType stream) {
  return cudaStreamSynchronize(stream);
}

inline DeviceError CreateEvent(EventType* event) {
  return cudaEventCreate(event);
}

inline DeviceError DestroyEvent(EventType event) {
  return cudaEventDestroy(event);
}

inline DeviceError EventRecord(EventType event, StreamType stream = 0) {
  return cudaEventRecord(event, stream);
}

inline DeviceError EventSynchronize(EventType event) {
  return cudaEventSynchronize(event);
}

inline DeviceError EventElapsedTime(float* ms, EventType start, EventType end) {
  return cudaEventElapsedTime(ms, start, end);
}

inline DeviceError QueryEvent(EventType event) {
  return cudaEventQuery(event);
}

inline const char* GetErrorString(DeviceError err) {
  return cudaGetErrorString(err);
}

inline DeviceError GetDeviceNotReady() {
  return cudaErrorNotReady;
}

inline DeviceError GetDriverVersion(int* driverVersion) {
  return cudaDriverGetVersion(driverVersion);
}

inline DeviceError GetRuntimeVersion(int* runtimeVersion) {
  return cudaRuntimeGetVersion(runtimeVersion);
}

inline void ProfilerRangePush(const char* msg) {
  nvtxRangePushA(msg);
}

inline void ProfilerRangePop() {
  nvtxRangePop();
}

} // namespace ait