Upload folder using huggingface_hub

b024d42 verified about 1 year ago

19.4 kB

	// SPDX-License-Identifier: MIT
	// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

	#pragma once

	#include "ck/wrapper/utils/tensor_utils.hpp"
	#include "ck/wrapper/traits/blockwise_gemm_xdl_traits.hpp"

	#include "ck/host_utility/device_prop.hpp"
	#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp"

	// Disable from doxygen docs generation
	/// @cond INTERNAL
	namespace ck {
	namespace wrapper {
	/// @endcond

	// Disable from doxygen docs generation
	/// @cond INTERNAL
	namespace {
	namespace detail {
	/**
	* \brief Create block descriptor (K0, MPerBlock or NPerBlock, K1).
	*
	*
	* \tparam K1 The number of K-dim elements that are packed together as a separate logical dimension.
	* \tparam TileLayout Tensor data tile layout (M,K) or (N,K).
	*
	* \return Block descriptor (K0, MPerBlock or NPerBlock, K1)
	*/
	template <index_t K1, typename TileLayout>
	__device__ constexpr auto GetBlockDescriptor()
	{
	using TileLayoutShape = typename TileLayout::LayoutShape;
	using TileLayoutDescriptor = typename TileLayout::LayoutUnrolledDescriptorType;

	constexpr auto K0PerBlock = Number<size<1>(TileLayoutShape{})>{} / Number<K1>{};
	// MPerBlock or NPerBlock
	constexpr auto Dim0 = Number<size<0>(TileLayoutShape{})>{};

	constexpr auto a_block_desc_k0_m_k1 = transform_tensor_descriptor(
	TileLayoutDescriptor{},
	make_tuple(make_unmerge_transform(make_tuple(K0PerBlock, Number<K1>{})),
	make_pass_through_transform(Dim0)),
	make_tuple(Sequence<1>{}, Sequence<0>{}),
	make_tuple(Sequence<0, 2>{}, Sequence<1>{}));

	return a_block_desc_k0_m_k1;
	}

	} // namespace detail
	} // namespace
	/// @endcond

	/**
	* \brief Perform blockwise gemm xdl on tensors stored in lds. Result will be
	* stored in Vgpr register. A data layout must be (MPerBlock, KPerBlock) or
	* (K0PerBlock, MPerBlock, K1) and B data layout must be (NPerBlock, KPerBlock)
	* or (K0PerBlock, NPerBlock, K1).
	*
	* \note C output Vgpr register layout (8D):
	* - MXdlPerWave - The number of MFMA instructions run by single wave in M
	* dimension per tile.
	* - NXdlPerWave - The number of MFMA instructions run by single wave in N
	* dimension per tile.
	* - MWave - Equals to 1 since this is for single wave.
	* - NWave - Equals to 1 since this is for single wave.
	* - NumGroupsPerBlock - Mfma instruction internal layout (depeneds on the
	* instruction size).
	* - NumInputsBlock - Mfma instruction internal layout (depeneds on the
	* instruction size).
	* - GroupSize - Mfma instruction internal layout (depeneds on the
	* instruction size).
	* - NumThreadsPerBlock - Mfma instruction internal layout (depeneds on the
	* instruction size).
	*
	* \tparam DataType Input data types.
	* \tparam BlockSize Tensor to pad.
	* \tparam GemmTraits Traits of gemm xdl operation.
	* \param a_local_tile_tensor A tensor in LDS memory for blockwise gemm
	* (MPerBlock, KPerBlock) or (K0PerBlock, MPerBlock, K1) layout.
	* \param b_local_tile_tensor B tensor in LDS memory for blockwise gemm
	* (NPerBlock, KPerBlock) or (K0PerBlock, NPerBlock, K1) layout.
	* \param c_reg_tensor C tensor VGPR memory for blockwise gemm.
	*/
	template <typename DataType,
	index_t BlockSize,
	typename GemmTraits,
	typename ATensorType,
	typename BTensorType,
	typename CTensorType>
	__device__ void blockwise_gemm_xdl(const ATensorType& a_local_tile_tensor,
	const BTensorType& b_local_tile_tensor,
	CTensorType& c_reg_tensor)
	{
	constexpr auto I3 = Number<3>{};

	static_assert(ATensorType::TensorBufferAddressSpace == MemoryTypeEnum::Lds);
	static_assert(BTensorType::TensorBufferAddressSpace == MemoryTypeEnum::Lds);
	static_assert(CTensorType::TensorBufferAddressSpace == MemoryTypeEnum::Vgpr);
	static_assert(is_same_v<DataType, typename ATensorType::TensorElementType>);
	static_assert(is_same_v<DataType, typename BTensorType::TensorElementType>);

	constexpr bool is_integer =
	is_same_v<DataType, int8_t> \|\| is_same_v<DataType, int16_t> \|\| is_same_v<DataType, int32_t>;
	using GemmAccDataType = std::conditional_t<is_integer, int32_t, float>;

	using ATileLayout = remove_cvref_t<decltype(layout(a_local_tile_tensor))>;
	using BTileLayout = remove_cvref_t<decltype(layout(b_local_tile_tensor))>;

	static_assert(typename ATileLayout::LayoutShape{}.Size() ==
	typename BTileLayout::LayoutShape{}.Size());
	constexpr bool is_3d_desc = typename ATileLayout::LayoutShape{}.Size() == I3;

	using ABlockDesc_K0_M_K1_Type =
	conditional_t<is_3d_desc,
	typename ATileLayout::LayoutUnrolledDescriptorType,
	decltype(detail::GetBlockDescriptor<GemmTraits::K1, ATileLayout>())>;
	using BBlockDesc_K0_N_K1_Type =
	conditional_t<is_3d_desc,
	typename BTileLayout::LayoutUnrolledDescriptorType,
	decltype(detail::GetBlockDescriptor<GemmTraits::K1, BTileLayout>())>;

	BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
	DataType,
	DataType,
	GemmAccDataType,
	ABlockDesc_K0_M_K1_Type,
	BBlockDesc_K0_N_K1_Type,
	GemmTraits::MPerXDL,
	GemmTraits::NPerXDL,
	GemmTraits::MXdlPerWave,
	GemmTraits::NXdlPerWave,
	GemmTraits::K1>
	blockwise_gemm_xdl_op{};

	blockwise_gemm_xdl_op.Run(
	a_local_tile_tensor.GetBuffer(), b_local_tile_tensor.GetBuffer(), c_reg_tensor.GetBuffer());
	}

	/**
	* \brief Create local partition per thread for C tensor.
	*
	* \note C output global memory layout (8D):
	* - MXdlPerWave - The number of MFMA instructions run by single wave in M
	* dimension.
	* - NXdlPerWave - The number of MFMA instructions run by single wave in N
	* dimension.
	* - MWave - The number of waves in single tile M dimension per tile.
	* - NWave - The number of waves in single tile N dimension per tile.
	* - NumGroupsPerBlock - Mfma instruction internal layout (depeneds on the
	* instruction size).
	* - NumInputsBlock - Mfma instruction internal layout (depeneds on the
	* instruction size).
	* - GroupSize - Mfma instruction internal layout (depeneds on the
	* instruction size).
	* - NumThreadsPerBlock - Mfma instruction internal layout (depeneds on the
	* instruction size).
	*
	* \tparam DataType Input data types.
	* \tparam ATileLayout A tensor layout.
	* \tparam BTileLayout B tensor layout.
	* \tparam BlockSize Number of threads in block.
	* \tparam GemmTraits Traits of gemm xdl operation.
	* \param c_local_tile_tensor C tensor in LDS memory for blockwise gemm
	* (MPerBlock, NPerBlock) layout.
	*
	* \return Partition c tensor for blockwise gemm.
	*/
	template <typename DataType,
	typename ATileLayout,
	typename BTileLayout,
	index_t BlockSize,
	typename GemmTraits,
	typename CTensorType>
	__host__ __device__ constexpr auto
	make_blockwise_gemm_xdl_c_local_partition(CTensorType& c_local_tile_tensor)
	{
	constexpr auto I0 = Number<0>{};
	constexpr auto I1 = Number<1>{};
	constexpr auto I2 = Number<2>{};
	constexpr auto I3 = Number<3>{};
	constexpr auto I4 = Number<4>{};
	constexpr auto I5 = Number<5>{};
	constexpr auto I6 = Number<6>{};
	constexpr auto I7 = Number<7>{};

	static_assert(typename ATileLayout::LayoutShape{}.Size() ==
	typename BTileLayout::LayoutShape{}.Size());

	constexpr bool is_integer =
	is_same_v<DataType, int8_t> \|\| is_same_v<DataType, int16_t> \|\| is_same_v<DataType, int32_t>;
	using GemmAccDataType = std::conditional_t<is_integer, int32_t, float>;

	constexpr bool is_3d_desc = typename ATileLayout::LayoutShape{}.Size() == I3;
	using ABlockDesc_K0_M_K1_Type =
	conditional_t<is_3d_desc,
	typename ATileLayout::LayoutUnrolledDescriptorType,
	decltype(detail::GetBlockDescriptor<GemmTraits::K1, ATileLayout>())>;
	using BBlockDesc_K0_N_K1_Type =
	conditional_t<is_3d_desc,
	typename BTileLayout::LayoutUnrolledDescriptorType,
	decltype(detail::GetBlockDescriptor<GemmTraits::K1, BTileLayout>())>;

	using BlockwiseGemmXdlops =
	BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
	DataType,
	DataType,
	GemmAccDataType,
	ABlockDesc_K0_M_K1_Type,
	BBlockDesc_K0_N_K1_Type,
	GemmTraits::MPerXDL,
	GemmTraits::NPerXDL,
	GemmTraits::MXdlPerWave,
	GemmTraits::NXdlPerWave,
	GemmTraits::K1>;

	constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 =
	BlockwiseGemmXdlops::GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
	constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetLength(I0);
	constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetLength(I1);
	constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetLength(I2);
	constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetLength(I3);
	constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetLength(I4);
	constexpr auto M3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetLength(I5);
	constexpr auto M4 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetLength(I6);
	constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetLength(I7);

	// Calculate offset on grid
	const auto c_thread_mtx_on_block =
	BlockwiseGemmXdlops::CalculateCThreadOriginDataIndex(I0, I0, I0, I0);

	const index_t m_thread_data_on_grid =
	c_local_tile_tensor.GetMultiIdxOffsets()[I0] + c_thread_mtx_on_block[I0];

	const index_t n_thread_data_on_grid =
	c_local_tile_tensor.GetMultiIdxOffsets()[I1] + c_thread_mtx_on_block[I1];

	const auto m_thread_data_on_grid_to_m0_m1_m2_m3_m4_adaptor = make_single_stage_tensor_adaptor(
	make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))),
	make_tuple(Sequence<0, 1, 2, 3, 4>{}),
	make_tuple(Sequence<0>{}));

	const auto m_thread_data_on_grid_idx =
	m_thread_data_on_grid_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex(
	make_multi_index(m_thread_data_on_grid));

	const auto n_thread_data_on_grid_to_n0_n1_n2_adaptor =
	make_single_stage_tensor_adaptor(make_tuple(make_merge_transform(make_tuple(N0, N1, N2))),
	make_tuple(Sequence<0, 1, 2>{}),
	make_tuple(Sequence<0>{}));

	const auto n_thread_data_on_grid_idx =
	n_thread_data_on_grid_to_n0_n1_n2_adaptor.CalculateBottomIndex(
	make_multi_index(n_thread_data_on_grid));
	// Create partition shape based on descriptor dims.
	const auto partition_shape = make_tuple(M0, N0, I1, I1, M2, I1, M4, I1);

	const auto partition_desc = BlockwiseGemmXdlops::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(
	layout(c_local_tile_tensor).GetUnrolledDescriptor());

	const auto lower_upper_dims =
	generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<8>{});

	auto sliced_desc = transform_tensor_descriptor(
	partition_desc,
	make_tuple(
	make_slice_transform(partition_shape.At(Number<0>{}),
	m_thread_data_on_grid_idx[I0],
	partition_shape.At(Number<0>{}) + m_thread_data_on_grid_idx[I0]),
	make_slice_transform(partition_shape.At(Number<1>{}),
	n_thread_data_on_grid_idx[I0],
	partition_shape.At(Number<1>{}) + n_thread_data_on_grid_idx[I0]),
	make_slice_transform(partition_shape.At(Number<2>{}),
	m_thread_data_on_grid_idx[I1],
	partition_shape.At(Number<2>{}) + m_thread_data_on_grid_idx[I1]),
	make_slice_transform(partition_shape.At(Number<3>{}),
	n_thread_data_on_grid_idx[I1],
	partition_shape.At(Number<3>{}) + n_thread_data_on_grid_idx[I1]),
	make_slice_transform(partition_shape.At(Number<4>{}),
	m_thread_data_on_grid_idx[I2],
	partition_shape.At(Number<4>{}) + m_thread_data_on_grid_idx[I2]),
	make_slice_transform(partition_shape.At(Number<5>{}),
	m_thread_data_on_grid_idx[I3],
	partition_shape.At(Number<5>{}) + m_thread_data_on_grid_idx[I3]),
	make_slice_transform(partition_shape.At(Number<6>{}),
	m_thread_data_on_grid_idx[I4],
	partition_shape.At(Number<6>{}) + m_thread_data_on_grid_idx[I4]),
	make_slice_transform(partition_shape.At(Number<7>{}),
	n_thread_data_on_grid_idx[I2],
	partition_shape.At(Number<7>{}) + n_thread_data_on_grid_idx[I2])),
	lower_upper_dims,
	lower_upper_dims);

	const auto partition_layout =
	Layout<remove_reference_t<decltype(partition_shape)>, decltype(sliced_desc)>(
	partition_shape, sliced_desc);
	auto partition_tensor = make_tensor<CTensorType::TensorBufferAddressSpace>(
	c_local_tile_tensor.GetPointer(), partition_layout);
	return partition_tensor;
	}

	/**
	* \brief Create local partition per thread for C tensor.
	*
	* \note C output Vgpr register layout (8D):
	* - MXdlPerWave - The number of MFMA instructions run by single wave in M
	* dimension per tile.
	* - NXdlPerWave - The number of MFMA instructions run by single wave in N
	* dimension per tile.
	* - MWave - Equals to 1 since this is for single wave.
	* - NWave - Equals to 1 since this is for single wave.
	* - NumGroupsPerBlock - Mfma instruction internal layout (depeneds on the
	* instruction size).
	* - NumInputsBlock - Mfma instruction internal layout (depeneds on the
	* instruction size).
	* - GroupSize - Mfma instruction internal layout (depeneds on the
	* instruction size).
	* - NumThreadsPerBlock - Mfma instruction internal layout (depeneds on the
	* instruction size).
	*
	* \tparam DataType Input data types.
	* \tparam ATileLayout A tensor layout.
	* \tparam BTileLayout B tensor layout.
	* \tparam BlockSize Number of threads in block.
	* \tparam GemmTraits Traits of gemm xdl operation.
	*
	* \return Vgpr c tensor for blockwise gemm.
	*/
	template <typename DataType,
	typename ATileLayout,
	typename BTileLayout,
	index_t BlockSize,
	typename GemmTraits>
	__host__ __device__ constexpr auto make_blockwise_gemm_xdl_c_vgpr()
	{
	constexpr auto I0 = Number<0>{};
	constexpr auto I1 = Number<1>{};
	constexpr auto I2 = Number<2>{};
	constexpr auto I3 = Number<3>{};
	constexpr auto I4 = Number<4>{};
	constexpr auto I5 = Number<5>{};
	constexpr auto I6 = Number<6>{};
	constexpr auto I7 = Number<7>{};

	static_assert(typename ATileLayout::LayoutShape{}.Size() ==
	typename BTileLayout::LayoutShape{}.Size());

	constexpr bool is_integer =
	is_same_v<DataType, int8_t> \|\| is_same_v<DataType, int16_t> \|\| is_same_v<DataType, int32_t>;
	using GemmAccDataType = std::conditional_t<is_integer, int32_t, float>;

	constexpr bool is_3d_desc = typename ATileLayout::LayoutShape{}.Size() == I3;
	using ABlockDesc_K0_M_K1_Type =
	conditional_t<is_3d_desc,
	typename ATileLayout::LayoutUnrolledDescriptorType,
	decltype(detail::GetBlockDescriptor<GemmTraits::K1, ATileLayout>())>;
	using BBlockDesc_K0_N_K1_Type =
	conditional_t<is_3d_desc,
	typename BTileLayout::LayoutUnrolledDescriptorType,
	decltype(detail::GetBlockDescriptor<GemmTraits::K1, BTileLayout>())>;

	using BlockwiseGemmXdlops =
	BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
	DataType,
	DataType,
	GemmAccDataType,
	ABlockDesc_K0_M_K1_Type,
	BBlockDesc_K0_N_K1_Type,
	GemmTraits::MPerXDL,
	GemmTraits::NPerXDL,
	GemmTraits::MXdlPerWave,
	GemmTraits::NXdlPerWave,
	GemmTraits::K1>;
	// Calcualte descriptor, shape and layout
	constexpr auto vgpr_desc = BlockwiseGemmXdlops::GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
	const auto vgpr_shape = make_tuple(vgpr_desc.GetLengths()[I0],
	vgpr_desc.GetLengths()[I1],
	vgpr_desc.GetLengths()[I2],
	vgpr_desc.GetLengths()[I3],
	vgpr_desc.GetLengths()[I4],
	vgpr_desc.GetLengths()[I5],
	vgpr_desc.GetLengths()[I6],
	vgpr_desc.GetLengths()[I7]);
	const auto vgpr_layout = Layout<remove_reference_t<decltype(vgpr_shape)>, decltype(vgpr_desc)>(
	vgpr_shape, vgpr_desc);
	// Get vector type for Vgpr
	constexpr index_t ScalarPerVector = BlockwiseGemmXdlops::xdlops_gemm.GetRegSizePerXdlops();
	using VgprVectorType = typename vector_type<GemmAccDataType, ScalarPerVector>::type;
	return ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr, VgprVectorType>(
	vgpr_layout);
	}

	} // namespace wrapper
	} // namespace ck