| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | #pragma once |
| |
|
#include <sstream>
#include <string>
#include <tuple>
#include <vector>

#include <cuda_bf16.h>
#include <nvtx3/nvToolsExt.h>

#include "cutlass/conv/conv2d_problem_size.h"
#include "cutlass/conv/device/implicit_gemm_convolution.h"
#include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/cutlass.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/reference/host/tensor_fill.h"
| |
|
namespace ait {

// NOTE(review): appears to gate CUDA-graph capture/replay at runtime —
// confirm semantics at the call sites. thread_local so concurrent model
// runners on different host threads can toggle it independently.
inline thread_local bool target_has_graph_mode = true;

// Backend-generic aliases over the CUDA runtime types; the rest of this
// header (and generated code) refers to these names instead of cuda*_t.
using DeviceError = cudaError_t;
using DevicePropertyType = cudaDeviceProp;
using StreamType = cudaStream_t;
using EventType = cudaEvent_t;
using GraphType = cudaGraph_t;
using GraphExecType = cudaGraphExec_t;
using Handle = void*;

// 16-bit brain-float type (declared in <cuda_bf16.h>).
using bfloat16 = __nv_bfloat16;
| |
|
// Writes the ordinal of the device currently bound to the calling host
// thread into `device_idx`.
inline DeviceError GetDevice(int* device_idx) {
  return cudaGetDevice(device_idx);
}
| |
|
// Fills `prop` with the properties of the device at ordinal `device_idx`.
inline DeviceError GetDeviceProperties(
    DevicePropertyType* prop,
    int device_idx) {
  return cudaGetDeviceProperties(prop, device_idx);
}
| |
|
// Formats a 16-byte device UUID (e.g. cudaDeviceProp::uuid.bytes) as the
// canonical "GPU-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" string used by
// NVML / nvidia-smi.
//
// Fixes over the previous version: the group bounds are compile-time
// constants instead of a per-call heap-allocated vector of tuples, and the
// inner loop no longer mixes a size_t index with an int bound.
inline std::string GetUUIDToString(const char bytes[16]) {
  // [begin, end) byte ranges of the five dash-separated UUID groups.
  static constexpr size_t kGroupBegin[] = {0, 4, 6, 8, 10};
  static constexpr size_t kGroupEnd[] = {4, 6, 8, 10, 16};
  static constexpr char kHexChars[] = "0123456789abcdef";

  std::string result = "GPU";
  result.reserve(36); // "GPU" + 5 dashes + 32 hex digits
  for (size_t g = 0; g < 5; ++g) {
    result += '-';
    for (size_t i = kGroupBegin[g]; i < kGroupEnd[g]; ++i) {
      // Go through unsigned char so a negative `char` byte cannot
      // sign-extend before the mask/shift.
      const unsigned int byte = static_cast<unsigned char>(bytes[i]);
      result += kHexChars[byte >> 4];
      result += kHexChars[byte & 0x0Fu];
    }
  }
  return result;
}
| |
|
// Renders an exhaustive human-readable dump of `prop` — identity, memory
// limits, thread limits, feature flags, and texture/surface limits — for
// debug-level logging. Pure string formatting; performs no CUDA calls.
inline std::string PrintDebugDeviceProperties(const DevicePropertyType& prop) {
  std::ostringstream oss;
  oss << "Hardware accelerator device properties: "
      << "\n Device: "
      << "\n ASCII string identifying device: " << prop.name
      << "\n Major compute capability: " << prop.major
      << "\n Minor compute capability: " << prop.minor
      << "\n UUID: " << GetUUIDToString(prop.uuid.bytes)
      << "\n Unique identifier for a group of devices on the same multi-GPU board: "
      << prop.multiGpuBoardGroupID
      << "\n PCI bus ID of the device: " << prop.pciBusID
      << "\n PCI device ID of the device: " << prop.pciDeviceID
      << "\n PCI domain ID of the device: " << prop.pciDomainID

      << "\n Memory limits: "
      << "\n Constant memory available on device in bytes: "
      << prop.totalConstMem
      << "\n Global memory available on device in bytes: "
      << prop.totalGlobalMem
      << "\n Global memory bus width in bits: " << prop.memoryBusWidth
      << "\n Size of L2 cache in bytes: " << prop.l2CacheSize
      << "\n Device's maximum L2 persisting lines capacity in bytes: "
      << prop.persistingL2CacheMaxSize
      << "\n Shared memory reserved by CUDA driver per block in bytes: "
      << prop.reservedSharedMemPerBlock
      << "\n Shared memory available per block in bytes: "
      << prop.sharedMemPerBlock
      << "\n Per device maximum shared memory per block usable by special opt in: "
      << prop.sharedMemPerBlockOptin
      << "\n Shared memory available per multiprocessor in bytes: "
      << prop.sharedMemPerMultiprocessor
      << "\n The maximum value of cudaAccessPolicyWindow::num_bytes: "
      << prop.accessPolicyMaxWindowSize
      << "\n Max global memory clock frequency in khz: "
      << prop.memoryClockRate
      << "\n Peak global memory bandwidth (GByte/s): "
      // kHz * 1e3 (Hz) * (bus bits / 8 -> bytes) * 2 (DDR) / 1e9 (GB)
      // simplifies to (kHz / 1e6) * bytes * 2 below.
      << (prop.memoryClockRate / 1e6) * (prop.memoryBusWidth / 8) * 2

      << "\n Thread limits: "
      << "\n Warp size in threads: " << prop.warpSize
      << "\n Maximum size of each dimension of a grid: "
      << prop.maxGridSize[0] << " " << prop.maxGridSize[1] << " "
      << prop.maxGridSize[2]
      << "\n Maximum size of each dimension of a block: "
      << prop.maxThreadsDim[0] << " " << prop.maxThreadsDim[1] << " "
      << prop.maxThreadsDim[2]
      << "\n Number of asynchronous engines: " << prop.asyncEngineCount
      << "\n Maximum number of resident blocks per multiprocessor: "
      << prop.maxBlocksPerMultiProcessor
      << "\n Maximum number of threads per block: "
      << prop.maxThreadsPerBlock
      << "\n Maximum resident threads per multiprocessor: "
      << prop.maxThreadsPerMultiProcessor
      << "\n Maximum pitch in bytes allowed by memory copies: "
      << prop.memPitch << "\n Number of multiprocessors on device: "
      << prop.multiProcessorCount
      << "\n 32-bit registers available per block: " << prop.regsPerBlock
      << "\n 32-bit registers available per multiprocessor: "
      << prop.regsPerMultiprocessor
      << "\n Max clock frequency of the multiProcessors in khz: "
      << prop.clockRate

      << "\n Device features: "
      << "\n Device has ECC support enabled: "
      << (prop.ECCEnabled ? "yes" : "no")
      << "\n Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer: "
      << (prop.canMapHostMemory ? "yes" : "no")
      << "\n Device can access host registered memory at the same virtual address as the CPU: "
      << (prop.canUseHostPointerForRegisteredMem ? "yes" : "no")
      << "\n Device supports Compute Preemption: "
      << (prop.computePreemptionSupported ? "yes" : "no")
      << "\n Device can possibly execute multiple kernels concurrently: "
      << (prop.concurrentKernels ? "yes" : "no")
      << "\n Device can coherently access managed memory concurrently with the CPU: "
      << (prop.concurrentManagedAccess ? "yes" : "no")
      << "\n Device supports launching cooperative kernels via cudaLaunchCooperativeKernel: "
      << (prop.cooperativeLaunch ? "yes" : "no")
      << "\n Host can directly access managed memory on the device without migration: "
      << (prop.directManagedMemAccessFromHost ? "yes" : "no")
      << "\n Device supports caching globals in L1: "
      << (prop.globalL1CacheSupported ? "yes" : "no")
      << "\n Link between the device and the host supports native atomic operations: "
      << (prop.hostNativeAtomicSupported ? "yes" : "no")
      << "\n Device is integrated as opposed to discrete: "
      << (prop.integrated ? "yes" : "no")
      << "\n Device is on a multi-GPU board: "
      << (prop.isMultiGpuBoard ? "yes" : "no")
      << "\n Device supports caching locals in L1: "
      << (prop.localL1CacheSupported ? "yes" : "no")
      << "\n Device supports allocating managed memory on this system: "
      << (prop.managedMemory ? "yes" : "no")
      << "\n Device supports coherently accessing pageable memory without calling cudaHostRegister on it: "
      << (prop.pageableMemoryAccess ? "yes" : "no")
      << "\n Device accesses pageable memory via the host's page tables: "
      << (prop.pageableMemoryAccessUsesHostPageTables ? "yes" : "no")
      << "\n Device supports stream priorities: "
      << (prop.streamPrioritiesSupported ? "yes" : "no")
      << "\n Device is a Tesla device using TCC driver: "
      << (prop.tccDriver ? "yes" : "no")
      << "\n Device shares a unified address space with the host: "
      << (prop.unifiedAddressing ? "yes" : "no")

      << "\n Texture limits: "
      << "\n Maximum 1D surface size: " << prop.maxSurface1D
      << "\n Maximum 1D layered surface dimensions: "
      << prop.maxSurface1DLayered[0] << " " << prop.maxSurface1DLayered[1]
      << "\n Maximum 2D surface dimensions: " << prop.maxSurface2D[0] << " "
      << prop.maxSurface2D[1]
      << "\n Maximum 2D layered surface dimensions: "
      << prop.maxSurface2DLayered[0] << " " << prop.maxSurface2DLayered[1]
      << " " << prop.maxSurface2DLayered[2]
      << "\n Maximum 3D surface dimensions: " << prop.maxSurface3D[0] << " "
      << prop.maxSurface3D[1] << " " << prop.maxSurface3D[2]
      << "\n Maximum Cubemap surface dimensions: " << prop.maxSurfaceCubemap
      << "\n Maximum Cubemap layered surface dimensions: "
      << prop.maxSurfaceCubemapLayered[0] << " "
      << prop.maxSurfaceCubemapLayered[1]
      << "\n Maximum 1D texture size: " << prop.maxTexture1D
      << "\n Maximum 1D layered texture dimensions "
      << prop.maxTexture1DLayered[0] << " " << prop.maxTexture1DLayered[1]
      << "\n Maximum 1D mipmapped texture size: " << prop.maxTexture1DMipmap
      << "\n Maximum 2D texture dimensions: " << prop.maxTexture2D[0] << " "
      << prop.maxTexture2D[1]
      << "\n Maximum 2D texture dimensions if texture gather operations have to be performed: "
      << prop.maxTexture2DGather[0] << " " << prop.maxTexture2DGather[1]
      << "\n Maximum 2D layered texture dimensions: "
      << prop.maxTexture2DLayered[0] << " " << prop.maxTexture2DLayered[1]
      << " " << prop.maxTexture2DLayered[2]
      << "\n Maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory: "
      << prop.maxTexture2DLinear[0] << " " << prop.maxTexture2DLinear[1] << " "
      << prop.maxTexture2DLinear[2]
      << "\n Maximum 2D mipmapped texture dimensions: "
      << prop.maxTexture2DMipmap[0] << " " << prop.maxTexture2DMipmap[1]
      << "\n Maximum 3D texture dimensions: " << prop.maxTexture3D[0] << " "
      << prop.maxTexture3D[1] << " " << prop.maxTexture3D[2]
      << "\n Maximum alternate 3D texture dimensions: "
      << prop.maxTexture3DAlt[0] << " " << prop.maxTexture3DAlt[1] << " "
      << prop.maxTexture3DAlt[2]
      << "\n Maximum Cubemap texture dimensions: " << prop.maxTextureCubemap
      << "\n Maximum Cubemap layered texture dimensions: "
      << prop.maxTextureCubemapLayered[0] << " "
      << prop.maxTextureCubemapLayered[1]
      << "\n Alignment requirements for surfaces: " << prop.surfaceAlignment
      << "\n Alignment requirement for textures: " << prop.textureAlignment
      << "\n Pitch alignment requirement for texture references bound to pitched memory: "
      << prop.texturePitchAlignment;
  return oss.str();
}
| |
|
// Renders a condensed summary of `prop` (identity plus key memory limits)
// for info-level logging. Pure string formatting; performs no CUDA calls.
inline std::string PrintInfoDeviceProperties(const DevicePropertyType& prop) {
  std::ostringstream oss;
  oss << "Hardware accelerator device properties: "
      << "\n Device: "
      << "\n ASCII string identifying device: " << prop.name
      << "\n Major compute capability: " << prop.major
      << "\n Minor compute capability: " << prop.minor
      << "\n UUID: " << GetUUIDToString(prop.uuid.bytes)
      << "\n Unique identifier for a group of devices on the same multi-GPU board: "
      << prop.multiGpuBoardGroupID
      << "\n PCI bus ID of the device: " << prop.pciBusID
      << "\n PCI device ID of the device: " << prop.pciDeviceID
      << "\n PCI domain ID of the device: " << prop.pciDomainID

      << "\n Memory limits: "
      << "\n Constant memory available on device in bytes: "
      << prop.totalConstMem
      << "\n Global memory available on device in bytes: "
      << prop.totalGlobalMem
      << "\n Size of L2 cache in bytes: " << prop.l2CacheSize
      << "\n Shared memory available per block in bytes: "
      << prop.sharedMemPerBlock
      << "\n Shared memory available per multiprocessor in bytes: "
      << prop.sharedMemPerMultiprocessor;
  return oss.str();
}
| |
|
| | inline DeviceError StreamCreate(StreamType* stream, bool non_blocking = false) { |
| | auto flags = non_blocking ? cudaStreamNonBlocking : cudaStreamDefault; |
| | return cudaStreamCreateWithFlags(stream, flags); |
| | } |
| |
|
| | inline DeviceError StreamBeginCapture(StreamType stream, bool global = true) { |
| | auto capture_mode = |
| | global ? cudaStreamCaptureModeGlobal : cudaStreamCaptureModeThreadLocal; |
| | return cudaStreamBeginCapture(stream, capture_mode); |
| | } |
| |
|
// Ends capture on `stream` and returns the captured graph in `graph`.
inline DeviceError StreamEndCapture(StreamType stream, GraphType* graph) {
  return cudaStreamEndCapture(stream, graph);
}
| |
|
// Destroys `stream` (thin wrapper over cudaStreamDestroy).
inline DeviceError StreamDestroy(StreamType stream) {
  return cudaStreamDestroy(stream);
}
| |
|
| | inline DeviceError GraphInstantiate( |
| | GraphExecType* graph_exec, |
| | GraphType graph) { |
| | return cudaGraphInstantiate(graph_exec, graph, nullptr, nullptr, 0); |
| | } |
| |
|
// Destroys a captured (non-instantiated) graph.
inline DeviceError GraphDestroy(GraphType graph) {
  return cudaGraphDestroy(graph);
}
| |
|
| | inline DeviceError GraphExecUpdate(GraphExecType graph_exec, GraphType graph) { |
| | cudaGraphExecUpdateResult update; |
| | return cudaGraphExecUpdate(graph_exec, graph, nullptr, &update); |
| | } |
| |
|
// Destroys an instantiated executable graph.
inline DeviceError GraphExecDestroy(GraphExecType graph_exec) {
  return cudaGraphExecDestroy(graph_exec);
}
| |
|
// Launches an instantiated graph on `stream`.
inline DeviceError GraphExecLaunch(
    GraphExecType graph_exec,
    StreamType stream) {
  return cudaGraphLaunch(graph_exec, stream);
}
| |
|
// Asynchronous host-to-device copy of `size` bytes, ordered on `stream`.
// NOTE(review): for the copy to actually overlap, `src` must be pinned
// host memory (e.g. from DeviceMallocHost); otherwise the runtime may
// stage it synchronously.
inline DeviceError CopyToDevice(
    Handle dst,
    const void* src,
    size_t size,
    StreamType stream = 0) {
  return cudaMemcpyAsync(dst, src, size, cudaMemcpyHostToDevice, stream);
}
| |
|
// Asynchronous device-to-host copy of `size` bytes, ordered on `stream`.
// The host buffer is not safe to read until the stream is synchronized.
inline DeviceError CopyToHost(
    Handle dst,
    const void* src,
    size_t size,
    StreamType stream = 0) {
  return cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToHost, stream);
}
| |
|
// Asynchronous device-to-device copy of `size` bytes, ordered on `stream`.
inline DeviceError DeviceToDeviceCopy(
    Handle dst,
    const void* src,
    size_t size,
    StreamType stream = 0) {
  return cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToDevice, stream);
}
| |
|
// Frees device memory allocated with DeviceMalloc (cudaMalloc).
inline DeviceError FreeDeviceMemory(Handle src) {
  return cudaFree(src);
}
| |
|
// Frees pinned host memory allocated with DeviceMallocHost (cudaMallocHost).
inline DeviceError FreeDeviceHostMemory(Handle src) {
  return cudaFreeHost(src);
}
| |
|
// Stream-ordered free for memory allocated with DeviceMallocAsync
// (cudaFreeAsync; requires CUDA 11.2+ stream-ordered allocator).
inline DeviceError FreeDeviceMemoryAsync(Handle src, StreamType stream = 0) {
  return cudaFreeAsync(src, stream);
}
| |
|
// Allocates `size` bytes of device memory (synchronous cudaMalloc).
inline DeviceError DeviceMalloc(Handle* dst, size_t size) {
  return cudaMalloc(dst, size);
}
| |
|
// Allocates `size` bytes of pinned (page-locked) host memory, suitable for
// true-async transfers via CopyToDevice/CopyToHost.
inline DeviceError DeviceMallocHost(Handle* dst, size_t size) {
  return cudaMallocHost(dst, size);
}
| |
|
// Stream-ordered allocation of `size` bytes from the device memory pool
// (cudaMallocAsync; requires CUDA 11.2+).
inline DeviceError DeviceMallocAsync(
    Handle* dst,
    size_t size,
    StreamType stream = 0) {
  return cudaMallocAsync(dst, size, stream);
}
| |
|
// The backend's success status code, for comparisons against DeviceError
// values returned by the wrappers in this header.
inline DeviceError GetDeviceSuccess() {
  return cudaSuccess;
}
| |
|
// Byte-wise fill of `size` bytes of device memory with `value`
// (synchronous cudaMemset; only the low byte of `value` is used).
inline DeviceError DeviceMemset(Handle src, int value, size_t size) {
  return cudaMemset(src, value, size);
}
| |
|
// Returns the last error raised on the calling host thread and clears the
// sticky error state.
inline DeviceError GetLastError() {
  return cudaGetLastError();
}
| |
|
// Returns the description of the last error. NOTE: this consumes the
// sticky error state (cudaGetLastError), so calling GetLastError()
// afterwards will report success — fetch the code first if it is needed.
inline std::string GetLastErrorString() {
  return cudaGetErrorString(cudaGetLastError());
}
| |
|
// Blocks the calling host thread until all work queued on `stream` completes.
inline DeviceError StreamSynchronize(StreamType stream) {
  return cudaStreamSynchronize(stream);
}
| |
|
// Creates a CUDA event with default flags (timing enabled).
inline DeviceError CreateEvent(EventType* event) {
  return cudaEventCreate(event);
}
| |
|
// Destroys an event created by CreateEvent.
inline DeviceError DestroyEvent(EventType event) {
  return cudaEventDestroy(event);
}
| |
|
// Records `event` into `stream`; the event completes once all prior work
// on the stream has completed.
inline DeviceError EventRecord(EventType event, StreamType stream = 0) {
  return cudaEventRecord(event, stream);
}
| |
|
// Blocks the calling host thread until `event` has completed.
inline DeviceError EventSynchronize(EventType event) {
  return cudaEventSynchronize(event);
}
| |
|
// Writes into `ms` the elapsed milliseconds between two recorded,
// completed events `start` and `end`.
inline DeviceError EventElapsedTime(float* ms, EventType start, EventType end) {
  return cudaEventElapsedTime(ms, start, end);
}
| |
|
// Non-blocking completion check: returns success when `event` has
// completed, or the "not ready" status (see GetDeviceNotReady) otherwise.
inline DeviceError QueryEvent(EventType event) {
  return cudaEventQuery(event);
}
| |
|
// Returns a human-readable description for `err`.
inline const char* GetErrorString(DeviceError err) {
  return cudaGetErrorString(err);
}
| |
|
// The "operation not yet complete" status, e.g. returned by QueryEvent
// while queued work is still pending. Not an error condition.
inline DeviceError GetDeviceNotReady() {
  return cudaErrorNotReady;
}
| |
|
// Writes the installed CUDA driver version (e.g. 12020 for 12.2).
inline DeviceError GetDriverVersion(int* driverVersion) {
  return cudaDriverGetVersion(driverVersion);
}
| |
|
// Writes the CUDA runtime version this binary is linked against.
inline DeviceError GetRuntimeVersion(int* runtimeVersion) {
  return cudaRuntimeGetVersion(runtimeVersion);
}
| |
|
// Opens a named NVTX range (visible in Nsight Systems / profiler timelines).
// Pair every push with a ProfilerRangePop.
inline void ProfilerRangePush(const char* msg) {
  nvtxRangePushA(msg);
}
| |
|
// Closes the innermost NVTX range opened by ProfilerRangePush.
inline void ProfilerRangePop() {
  nvtxRangePop();
}

} // namespace ait
| |
|