Spaces:

qbhf2
/

GarmentCode

Sleeping

App Files Files Community

GarmentCode / NvidiaWarp-GarmentCode /warp /native /cutlass /tools /library /scripts /rt.py

qbhf2

added NvidiaWarp and GarmentCode repos

66c9c8a 11 months ago

raw

history blame contribute delete

21.6 kB

	#################################################################################################
	#
	# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
	# SPDX-License-Identifier: BSD-3-Clause
	#
	# Redistribution and use in source and binary forms, with or without
	# modification, are permitted provided that the following conditions are met:
	#
	# 1. Redistributions of source code must retain the above copyright notice, this
	# list of conditions and the following disclaimer.
	#
	# 2. Redistributions in binary form must reproduce the above copyright notice,
	# this list of conditions and the following disclaimer in the documentation
	# and/or other materials provided with the distribution.
	#
	# 3. Neither the name of the copyright holder nor the names of its
	# contributors may be used to endorse or promote products derived from
	# this software without specific prior written permission.
	#
	# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
	# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
	# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
	# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
	# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	#
	#################################################################################################

	# System imports
	import struct
	import io
	import ctypes

	# CUDA Python import
	from cuda import cuda
	from cuda import nvrtc

	# CUTLASS imports
	from library import *
	from gemm_operation import EmitGemmUniversalInstance

	#################################################################################################
	#
	# CUTLASS Py Runtime Components
	#
	#################################################################################################

	#
	def MaxAlignment(fmt):
	    """Return the largest ``struct.calcsize`` of any single character in *fmt*.

	    Each character of *fmt* is sized on its own; the result is never
	    smaller than 1, which is also what an empty format yields.
	    """
	    sizes = [1]
	    sizes.extend(struct.calcsize(code) for code in fmt)
	    return max(sizes)

	#
	def AlignedOffset(offset, align):
	    """Round *offset* up to the next multiple of *align*.

	    An offset that is already a multiple of *align* is returned unchanged.
	    """
	    excess = offset % align
	    if not excess:
	        return offset
	    return offset + (align - excess)

	#
	def PackInteger(host_workspace, offset, value):
	    """Pack *value* as a native ``int`` into *host_workspace*.

	    The write position is *offset* rounded up to a multiple of 4, matching
	    how PackDevicePointer aligns before writing.

	    Args:
	        host_workspace: writable buffer accepted by ``struct.pack_into``.
	        offset: byte position at or after which the integer is stored.
	        value: integer to store using the ``"i"`` struct format.

	    Returns:
	        The byte offset immediately after the packed integer.
	    """
	    fmt = "i"
	    # Write at the aligned position. The original wrote at the unaligned
	    # offset while returning an offset derived from the aligned one, so the
	    # data and the returned cursor disagreed for misaligned inputs.
	    offset = AlignedOffset(offset, 4)
	    struct.pack_into(fmt, host_workspace, offset, value)
	    return offset + struct.calcsize(fmt)

	#
	def PackDevicePointer(host_workspace, offset, value):
	    """Pack *value* with the ``"P"`` (pointer) struct format into *host_workspace*.

	    The write position is *offset* rounded up to a multiple of 8.

	    Returns:
	        The byte offset immediately after the packed pointer.
	    """
	    pointer_fmt = "P"
	    start = AlignedOffset(offset, 8)
	    struct.pack_into(pointer_fmt, host_workspace, start, value)
	    end = start + struct.calcsize(pointer_fmt)
	    return end

	#
	def ceil_div(a, b):
	    """Return *a* divided by *b*, rounded up, using only floor division."""
	    # Floor-dividing by the negated divisor and negating the result rounds
	    # the quotient toward positive infinity.
	    floored_negative = a // -b
	    return -floored_negative

	#################################################################################################

	#
	class PitchLinearCoord:
	    """Pair of values exposed as ``contiguous`` and ``strided`` attributes."""

	    def __init__(self, contiguous, strided):
	        self.contiguous, self.strided = contiguous, strided

	#
	class GemmCoord:
	    """Triple ``(m, n, k)`` that can be packed as three native ints (``"iii"``)."""

	    def __init__(self, m = 1, n = 1, k = 1):
	        self.m, self.n, self.k = m, n, k
	        self.fmt = "iii"

	    def ceil_div(self, rhs):
	        """Return a new GemmCoord of the element-wise ceiling quotients ``self / rhs``."""
	        # Inside a method the bare name resolves to the module-level ceil_div().
	        quotient_m = ceil_div(self.m, rhs.m)
	        quotient_n = ceil_div(self.n, rhs.n)
	        quotient_k = ceil_div(self.k, rhs.k)
	        return GemmCoord(quotient_m, quotient_n, quotient_k)

	    def size(self):
	        """Return the packed size in bytes of ``self.fmt``."""
	        return struct.calcsize(self.fmt)

	    def alignment(self):
	        """Return the largest per-field size of ``self.fmt``."""
	        return MaxAlignment(self.fmt)

	    def pack_into(self, host_workspace, offset):
	        """Write ``m, n, k`` into *host_workspace* at *offset* rounded up to 4 bytes.

	        Returns:
	            The byte offset immediately after the packed triple.
	        """
	        start = AlignedOffset(offset, 4)
	        struct.pack_into(self.fmt, host_workspace, start, self.m, self.n, self.k)
	        return start + self.size()

	#
	class TensorRef:
	    """Holds a ``pointer`` object and a ``layout`` value for one tensor operand."""

	    def __init__(self, pointer = None, layout = 0):
	        self.pointer, self.layout = pointer, layout

	    def __str__(self):
	        # Formats the pointer's ``_ptr`` attribute in hex and the layout in decimal.
	        fields = (self.pointer._ptr, self.layout)
	        return "(%x, %d)" % fields

	#################################################################################################

	#
	class PredicatedTileAccessIteratorDesc:
	    '''
	    Plain record of the values PredicatedTileAccessIteratorParams reads:
	    element size in bits, the advance-rank flag, and the threadblock shape,
	    thread-map iterations and thread-map delta objects.
	    '''

	    def __init__(
	            self,
	            element_size_bits,
	            advance_rank,
	            threadblock_shape,
	            threadmap_iterations,
	            threadmap_delta):

	        self.element_size_bits, self.advance_rank = element_size_bits, advance_rank
	        self.threadblock_shape = threadblock_shape
	        self.threadmap_iterations, self.threadmap_delta = threadmap_iterations, threadmap_delta

	#
	class PredicatedTileAccessIteratorParams:
	    '''
	    Packs four 64-bit values ("qqqq") derived from a stride and a
	    PredicatedTileAccessIteratorDesc into a host workspace buffer.
	    '''

	    def __init__(self, desc, label):
	        self.desc, self.label = desc, label
	        self.fmt = "qqqq"

	    def size(self):
	        """Return the packed size in bytes of ``self.fmt``."""
	        return struct.calcsize(self.fmt)

	    def alignment(self):
	        """Return the largest per-field size of ``self.fmt``."""
	        return MaxAlignment(self.fmt)

	    def initialize(self, host_workspace, offset, stride):
	        """Compute the byte increments for *stride* and pack them.

	        Values are written in the order ``stride, inc_strided, inc_next,
	        inc_advance`` at *offset* rounded up to ``self.alignment()``.

	        Returns:
	            The byte offset immediately after the packed values.
	        """
	        desc = self.desc
	        bits = desc.element_size_bits
	        delta_strided = desc.threadmap_delta.strided

	        start = AlignedOffset(offset, self.alignment())

	        # Every product is formed in bits first and floor-divided by 8 last,
	        # exactly as in the unfactored expressions.
	        inc_strided = stride * delta_strided * bits // 8

	        if desc.advance_rank:
	            advance_bits = desc.threadblock_shape.strided * stride * bits
	        else:
	            advance_bits = desc.threadblock_shape.contiguous * bits
	        inc_advance = advance_bits // 8

	        rewind_bits = (desc.threadmap_iterations.strided - 1) * delta_strided * stride * bits
	        inc_next = inc_advance - rewind_bits // 8

	        struct.pack_into(
	            self.fmt, host_workspace, start,
	            stride, inc_strided, inc_next, inc_advance)

	        return start + self.size()
	#

	#################################################################################################

	#
	class EpilogueTileDesc:
	    '''
	    Five-field record with ``column``, ``row``, ``group``, ``cluster`` and
	    ``tile`` attributes.
	    '''
	    def __init__(self, column, row, group, cluster, tile):
	        self.column, self.row = column, row
	        self.group, self.cluster, self.tile = group, cluster, tile

	#
	class EpilogueThreadMap:
	    '''
	    Record of the values EpilogueTileIteratorParams reads: thread count,
	    elements per access, element size in bits, and the ``shape``,
	    ``iterations``, ``delta`` and ``count`` descriptor objects.
	    '''
	    def __init__(self, threads, elements_per_access, element_size_bits, shape, iterations, delta, count):
	        self.threads, self.elements_per_access = threads, elements_per_access
	        self.element_size_bits = element_size_bits
	        self.shape, self.iterations = shape, iterations
	        self.delta, self.count = delta, count

	#
	class EpilogueTileIteratorParams:
	    '''
	    Packs eight 64-bit values ("qqqqqqqq") derived from a stride and an
	    EpilogueThreadMap-like descriptor into a host workspace buffer.
	    '''

	    def __init__(self, desc, label):
	        self.desc, self.label = desc, label
	        self.fmt = "qqqqqqqq"

	    def size(self):
	        """Return the packed size in bytes of ``self.fmt``."""
	        return struct.calcsize(self.fmt)

	    def alignment(self):
	        """Return the largest per-field size of ``self.fmt``."""
	        return MaxAlignment(self.fmt)

	    def initialize(self, host_workspace, offset, stride):
	        """Scale *stride* to bytes, derive the increments/advances and pack them.

	        Values are written in the order ``stride, increment_row,
	        increment_group, increment_cluster, advance_row, advance_group,
	        advance_cluster, advance_tile`` at *offset* rounded up to
	        ``self.alignment()``.

	        Returns:
	            The byte offset immediately after the packed values.
	        """
	        desc = self.desc
	        delta, iterations = desc.delta, desc.iterations
	        shape, count = desc.shape, desc.count

	        # From here on the stride is expressed in bytes.
	        stride = stride * desc.element_size_bits // 8

	        start = AlignedOffset(offset, self.alignment())

	        # Amounts subtracted when stepping to the next group / cluster.
	        row_rewind = stride * delta.row * (iterations.row - 1)
	        group_rewind = stride * delta.group * (iterations.group - 1)

	        increment_row = stride * delta.row
	        increment_group = stride * delta.group - row_rewind
	        increment_cluster = stride * delta.cluster - group_rewind - row_rewind

	        advance_row = stride * shape.row
	        advance_group = stride * (shape.group - 1) * shape.row * count.row
	        advance_cluster = stride * count.group * shape.group * count.row * shape.row
	        advance_tile = stride * shape.group * shape.row * shape.cluster * shape.tile

	        packed_values = (
	            stride,
	            increment_row, increment_group, increment_cluster,
	            advance_row, advance_group, advance_cluster, advance_tile)

	        struct.pack_into(self.fmt, host_workspace, start, *packed_values)

	        return start + self.size()
	#

	#################################################################################################
	#
	# Launch configuration
	#
	#################################################################################################

	class LaunchConfiguration:
	    """Grid dimensions, block dimensions and shared-memory size for a kernel launch.

	    Args:
	        grid: three-element sequence of grid dimensions; defaults to ``[1, 1, 1]``.
	        block: three-element sequence of block dimensions; defaults to ``[1, 1, 1]``.
	        smem: value stored as ``shared_memory_capacity``; defaults to 0.
	    """

	    def __init__(self, grid = None, block = None, smem = 0):
	        # Build a fresh list per instance: a literal list default would be
	        # shared, so mutating one default-constructed configuration would
	        # silently change every other one.
	        self.grid = [1, 1, 1] if grid is None else grid
	        self.block = [1, 1, 1] if block is None else block
	        self.shared_memory_capacity = smem

	#################################################################################################
	#
	# Functors
	#
	#################################################################################################

	#
	class Functor:
	    """Base functor: holds a declaration, a definition, a struct format and an identifier.

	    All four start out as empty strings, so the base functor packs nothing.
	    """

	    def __init__(self):
	        self.decl = self.definition = ''
	        self.fmt = self.identifier = ''

	    def emit_declaration(self):
	        """Return the stored declaration text."""
	        return self.decl

	    def emit_definition(self):
	        """Return the stored definition text."""
	        return self.definition

	    def size(self):
	        '''
	        Size in bytes of the packed Params structure described by ``self.fmt``
	        '''
	        return struct.calcsize(self.fmt)

	    def alignment(self):
	        """Return the largest per-field size of ``self.fmt``."""
	        return MaxAlignment(self.fmt)

	    def initialize(self, host_workspace, offset, arguments):
	        """Write nothing; return *offset* advanced by ``self.size()``."""
	        packed_bytes = self.size()
	        return offset + packed_bytes

	#################################################################################################

	#
	class LinearCombinationFunctorArguments:
	    """Arguments packed by LinearCombinationFunctor: alpha, beta and two pointer values.

	    Both pointer attributes are initialized to 0.
	    """

	    def __init__(self, alpha = 1.0, beta = 0.0):
	        self.alpha, self.beta = alpha, beta
	        self.alpha_ptr = self.beta_ptr = 0

	#
	class LinearCombinationFunctor(Functor):
	    """Functor declaring ``cutlass::epilogue::thread::LinearCombination`` over floats.

	    Its parameters are packed as two floats followed by two pointers ("ffPP").
	    """

	    def __init__(self):
	        super().__init__()

	        self.identifier = 'linear_combination'
	        self.fmt = "ffPP"
	        self.decl = """
	cutlass::epilogue::thread::LinearCombination<
	float,
	1,
	float,
	float
	>"""

	    def size(self):
	        '''
	        Size in bytes of the packed Params structure described by ``self.fmt``
	        '''
	        return struct.calcsize(self.fmt)

	    def alignment(self):
	        """Return the largest per-field size of ``self.fmt``."""
	        return MaxAlignment(self.fmt)

	    def initialize(self, host_workspace, offset, arguments):
	        """Pack ``alpha, beta, alpha_ptr, beta_ptr`` from *arguments*.

	        The write position is *offset* rounded up to ``self.alignment()``.

	        Returns:
	            The byte offset immediately after the packed values.
	        """
	        start = AlignedOffset(offset, self.alignment())

	        fields = (
	            arguments.alpha, arguments.beta,
	            arguments.alpha_ptr, arguments.beta_ptr)
	        struct.pack_into(self.fmt, host_workspace, start, *fields)

	        return start + self.size()

	#################################################################################################
	#
	# Base class for an executable operation
	#
	#################################################################################################

	#
	class ExecutableOperation:
	    '''
	    Base class pairing an operation with the loaded module and kernel handle
	    used to launch it. ``module`` and ``kernel`` start as None and are
	    filled in from outside this class.
	    '''
	    def __init__(self, operation):
	        self.operation = operation
	        self.module = self.kernel = None

	    def name(self):
	        """Return the wrapped operation's ``procedural_name()``."""
	        return self.operation.procedural_name()

	    def emit(self):
	        """Return the source for this operation; the base class emits nothing."""
	        return ''

	    def can_implement(self, configuration, arguments):
	        """Return False; the base class implements nothing."""
	        return False

	    def get_host_workspace_size(self, arguments):
	        """Return 0; the base class needs no host workspace."""
	        return 0

	    def get_device_workspace_size(self, arguments):
	        """Return 0; the base class needs no device workspace."""
	        return 0

	    def plan(self, arguments):
	        """Return a default-constructed LaunchConfiguration."""
	        return LaunchConfiguration()

	    def initialize(self, host_workspace, device_workspace, launch_config, arguments, stream = cuda.CUstream(0)):
	        """Must be overridden; the base class raises NotImplementedError."""
	        raise NotImplementedError()

	    def run(self, host_workspace, device_workspace, launch_config, stream = cuda.CUstream(0)):
	        """Launch ``self.kernel`` with *host_workspace* as its single parameter.

	        Returns:
	            The error code returned by ``cuda.cuLaunchKernel``.
	        """
	        # View the workspace in place as a ctypes char array (no copy), then
	        # build a one-entry array holding that buffer's address.
	        workspace_view = (ctypes.c_char * len(host_workspace)).from_buffer(host_workspace)
	        kernel_params = (ctypes.c_void_p * 1)()
	        kernel_params[0] = ctypes.addressof(workspace_view)

	        grid, block = launch_config.grid, launch_config.block

	        (err,) = cuda.cuLaunchKernel(
	            self.kernel,
	            grid[0], grid[1], grid[2],
	            block[0], block[1], block[2],
	            launch_config.shared_memory_capacity,
	            stream,
	            kernel_params,
	            0)

	        return err

	#################################################################################################


	#
	class GemmArguments:
	    '''
	    Arguments for a GEMM: a zero-sized problem, four default TensorRef
	    operands (A, B, C, D) and default linear-combination arguments.
	    '''
	    def __init__(self):
	        self.problem_size = GemmCoord(0, 0, 0)
	        # Each operand gets its own TensorRef instance.
	        self.A, self.B, self.C, self.D = TensorRef(), TensorRef(), TensorRef(), TensorRef()
	        self.output_op = LinearCombinationFunctorArguments()

	#
	class ThreadblockSwizzle:
	    """Computes the grid of threadblock tiles covering a problem's m and n extents."""

	    def __init__(self, threadblock_shape, log_threadblock_cohort = 0):
	        self.threadblock_shape, self.log_threadblock_cohort = threadblock_shape, log_threadblock_cohort

	    def grid_tiled_shape(self, problem_size):
	        """Return ``GemmCoord(ceil(m / tile_m), ceil(n / tile_n), 1)`` for *problem_size*."""
	        tile = self.threadblock_shape
	        tiles_m = ceil_div(problem_size.m, tile.m)
	        tiles_n = ceil_div(problem_size.n, tile.n)
	        return GemmCoord(tiles_m, tiles_n, 1)

	#
	class Gemm(ExecutableOperation):
	    '''
	    GEMM manages the CUTLASS runtime components: the source emitter, the
	    threadblock swizzle, the launch parameters, and the packers for the
	    A/B/C/D iterator parameters and the output functor.
	    '''

	    def __init__(self, operation):
	        super().__init__(operation)

	        self.emitter = EmitGemmUniversalInstance('_type')
	        self.threadblock_swizzle = ThreadblockSwizzle(GemmCoord(128, 128, 8))

	        self.threads = 256
	        self.shared_memory_capacity = (32 << 10)

	        # A and B share one descriptor configuration, as do C and D. Each
	        # call builds fresh descriptor objects so nothing is shared.
	        def operand_params(label):
	            desc = PredicatedTileAccessIteratorDesc(
	                32,
	                1,
	                PitchLinearCoord(128, 8),
	                PitchLinearCoord(1, 4),
	                PitchLinearCoord(1, 2))
	            return PredicatedTileAccessIteratorParams(desc, label)

	        def epilogue_params(label):
	            thread_map = EpilogueThreadMap(
	                256,
	                1,
	                32,
	                EpilogueTileDesc(128, 1, 4, 4, 1),
	                EpilogueTileDesc(4, 1, 2, 1, 1),
	                EpilogueTileDesc(32, 1, 8, 1, 1),
	                EpilogueTileDesc(1, 4, 2, 1, 8))
	            return EpilogueTileIteratorParams(thread_map, label)

	        self.params_A = operand_params('A')
	        self.params_B = operand_params('B')
	        self.params_C = epilogue_params('C')
	        self.params_D = epilogue_params('D')

	        self.output_op = LinearCombinationFunctor()

	    def emit(self):
	        """Return the source the emitter generates for the wrapped operation."""
	        return self.emitter.emit(self.operation)

	    def can_implement(self, configuration, arguments):
	        """Perform no check; returns None (a falsy value)."""
	        return None

	    def get_host_workspace_size(self, arguments):
	        """Return the fixed host workspace size of 336 bytes."""
	        return 336

	    def get_device_workspace_size(self, arguments):
	        """Return 0; no device workspace is used."""
	        return 0

	    def plan(self, arguments):
	        """Return the launch configuration for ``arguments.problem_size``."""
	        tiles = self.threadblock_swizzle.grid_tiled_shape(arguments.problem_size)
	        grid_dims = [tiles.m, tiles.n, tiles.k]
	        block_dims = [self.threads, 1, 1]
	        return LaunchConfiguration(grid_dims, block_dims, self.shared_memory_capacity)

	    def initialize(self, host_workspace, device_workspace, launch_config, arguments, stream = cuda.CUstream(0)):
	        """Pack the kernel parameters for *arguments* into *host_workspace*.

	        Fields are written in this order: problem size, tiled grid shape,
	        swizzle log tile, iterator params for A, B, C, D, output functor
	        params, GEMM mode, batch count, k size, then the A, B, C, D pointers.

	        Returns:
	            The byte offset immediately after the last packed field.
	        """
	        # Fixed values for this configuration.
	        swizzle_log_tile = 0
	        gemm_mode = 0
	        batch_count = 1
	        gemm_k_size = arguments.problem_size.k

	        cursor = arguments.problem_size.pack_into(host_workspace, 0)

	        tiled_shape = self.threadblock_swizzle.grid_tiled_shape(arguments.problem_size)
	        cursor = tiled_shape.pack_into(host_workspace, cursor)

	        cursor = PackInteger(host_workspace, cursor, swizzle_log_tile)

	        operand_packers = (
	            (self.params_A, arguments.A),
	            (self.params_B, arguments.B),
	            (self.params_C, arguments.C),
	            (self.params_D, arguments.D))
	        for packer, tensor in operand_packers:
	            cursor = packer.initialize(host_workspace, cursor, tensor.layout)

	        cursor = self.output_op.initialize(host_workspace, cursor, arguments.output_op)

	        for scalar in (gemm_mode, batch_count, gemm_k_size):
	            cursor = PackInteger(host_workspace, cursor, scalar)

	        for tensor in (arguments.A, arguments.B, arguments.C, arguments.D):
	            cursor = PackDevicePointer(host_workspace, cursor, int(tensor.pointer))

	        return cursor


	#################################################################################################
	#
	# Module represents a compilation unit
	#
	#################################################################################################

	#
	class CompilationOptions:
	    '''
	    Compilation options.

	    Args:
	        architectures: list of integer SM versions; defaults to ``[80]``.
	        include_paths: list of include directories; defaults to an empty list.
	    '''

	    def __init__(self, architectures = None, include_paths = None):
	        # Fresh lists per instance: literal list defaults would be shared,
	        # so appending to one instance's paths would leak into later ones.
	        self.includes = []
	        self.include_paths = [] if include_paths is None else include_paths
	        self.flags = ['-std=c++11', '-default-device']
	        self.architectures = [80] if architectures is None else architectures

	    def get(self):
	        '''
	        Return the options as a list of ``bytes``: the flags, one
	        ``--include-path=`` entry per include path, then a single ``-arch=``
	        entry listing every architecture as comma-separated ``sm_NN``.
	        '''
	        options = [flag.encode() for flag in self.flags]

	        options.extend(
	            ('--include-path=%s' % incl).encode() for incl in self.include_paths)

	        arch_list = "-arch=" + ",".join("sm_%d" % arch for arch in self.architectures)
	        options.append(arch_list.encode())

	        return options

	# Template for a single #include line; Module.emit_ substitutes ${include}
	# once for every distinct include collected from the operations' emitters.
	IncludeTemplate = r'''#include "${include}"
	'''

	# Template for the extern "C" __global__ entry point emitted per operation.
	# Module.emit_ substitutes ${operation_name} and ${operation_suffix}; the
	# generated kernel reinterprets dynamic shared memory as the operation's
	# SharedStorage and invokes the operation with (params, *shared_storage).
	KernelTemplate = r'''
	extern "C"
	__global__ void
	${operation_name}(${operation_name}${operation_suffix}::Params params) {

	// Dynamic shared memory base pointer
	extern __shared__ int SharedStorageBase[];

	// Declare pointer to dynamic shared memory.
	${operation_name}${operation_suffix}::SharedStorage *shared_storage =
	reinterpret_cast<${operation_name}${operation_suffix}::SharedStorage *>(SharedStorageBase);

	${operation_name}${operation_suffix} op;

	op(params, *shared_storage);
	}

	'''

	#
	class Module:
	    """Compilation unit: emits source for a set of operations, compiles it with
	    NVRTC and loads the resulting CUBIN as a CUDA module.

	    Construction runs all three steps immediately and raises RuntimeError if
	    any NVRTC or CUDA call reports a failure.

	    Args:
	        name: program name passed to NVRTC.
	        operations: iterable of operations providing ``emitter``, ``emit()``
	            and ``name()``; each receives its ``kernel`` and ``module``.
	        compilation_options: object whose ``get()`` returns the NVRTC options.
	    """

	    def __init__(self, name, operations, compilation_options):
	        self.name = name
	        self.operations = operations
	        self.module = None
	        self.log = None
	        self.cubin_image = None
	        self.source_buffer = ''

	        # Emit source, compile it, then load the compiled module.
	        self.emit_()
	        self.compile_(compilation_options)
	        self.load_()

	    # Emit a source buffer
	    def emit_(self):
	        """Append the include lines and per-operation source to ``source_buffer``."""

	        # 1. Includes: de-duplicated while keeping first-seen order.
	        includes = []
	        for operation in self.operations:
	            for incl in operation.emitter.includes:
	                if incl not in includes:
	                    includes.append(incl)

	        pieces = [self.source_buffer]
	        pieces.extend(
	            SubstituteTemplate(IncludeTemplate, {'include': incl}) for incl in includes)

	        # 2. Operations: each contributes its own source plus a kernel entry point.
	        for operation in self.operations:
	            pieces.append(operation.emit())
	            values = {
	                'operation_name': operation.name(),
	                'operation_suffix': operation.emitter.operation_suffix
	            }
	            pieces.append(SubstituteTemplate(KernelTemplate, values))

	        # Join once instead of growing the string piece by piece.
	        self.source_buffer = ''.join(pieces)

	    # Compile with NVRTC
	    def compile_(self, compilation_options):
	        """Compile ``source_buffer`` with NVRTC and store the CUBIN in ``cubin_image``.

	        Raises:
	            RuntimeError: if any NVRTC call fails. On a compilation failure
	                the message also carries the compiler log and the full source.
	        """

	        err, program = nvrtc.nvrtcCreateProgram(
	            str.encode(self.source_buffer),
	            bytes(str.encode(self.name)),
	            0, [], [])

	        if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
	            raise RuntimeError('NVRTC Error: {}'.format(err))

	        try:
	            # Compile program
	            options = compilation_options.get()

	            err, = nvrtc.nvrtcCompileProgram(program, len(options), options)
	            if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:

	                error_string = 'NVRTC Error: {}\n'.format(err)

	                # Get log from compilation
	                err, logSize = nvrtc.nvrtcGetProgramLogSize(program)
	                if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
	                    raise RuntimeError('NVRTC Error: {}'.format(err))

	                self.log = b' ' * logSize
	                err, = nvrtc.nvrtcGetProgramLog(program, self.log)
	                if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
	                    raise RuntimeError('NVRTC Error: {}'.format(err))

	                raise RuntimeError(error_string + self.log.decode() + self.source_buffer)

	            # Get data from compilation
	            err, dataSize = nvrtc.nvrtcGetCUBINSize(program)
	            if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
	                raise RuntimeError('NVRTC Error: {}'.format(err))

	            self.cubin_image = b' ' * dataSize
	            err, = nvrtc.nvrtcGetCUBIN(program, self.cubin_image)
	            if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
	                raise RuntimeError('NVRTC Error: {}'.format(err))
	        finally:
	            # The program handle is only needed during compilation; release it
	            # on every path so it is not leaked. The result is deliberately
	            # ignored so a cleanup failure cannot mask the primary error.
	            nvrtc.nvrtcDestroyProgram(program)

	    def load_(self):
	        """Load ``cubin_image`` as a CUDA module and resolve each operation's kernel.

	        Raises:
	            RuntimeError: if loading the module or looking up a kernel fails.
	        """

	        # Load data as module data
	        err, self.module = cuda.cuModuleLoadData(self.cubin_image)
	        if err != cuda.CUresult.CUDA_SUCCESS:
	            raise RuntimeError('Cuda Error: {}'.format(err))

	        # Get functions
	        for operation in self.operations:
	            err, operation.kernel = cuda.cuModuleGetFunction(
	                self.module,
	                bytes(str.encode(operation.name())))

	            if err != cuda.CUresult.CUDA_SUCCESS:
	                raise RuntimeError('Cuda Error: {}'.format(err))

	            operation.module = self


	#################################################################################################
	#
	# Manifest represents an 'owner' for modules and operations
	#
	#################################################################################################

	#
	class Manifest:
	    """Owner of modules and of their operations, indexed by operation name."""

	    def __init__(self):
	        self.operations, self.modules = {}, []

	    def append_module(self, module):
	        '''
	        Appends a module and takes ownership of operations used to construct it.

	        An operation whose name is already registered replaces the earlier entry.
	        '''
	        self.modules.append(module)
	        self.operations.update(
	            {operation.name(): operation for operation in module.operations})


	#################################################################################################