| |
| |
|
|
| #pragma once |
|
|
| #include "ck/config.h" |
| #include "ck/utility/env.hpp" |
|
|
| #ifndef CK_DONT_USE_HIP_RUNTIME_HEADERS |
| #include "hip/hip_runtime.h" |
| #include "hip/hip_fp16.h" |
| #endif |
|
|
| |
| |
| CK_DECLARE_ENV_VAR_BOOL(CK_LOGGING) |
|
|
| |
|
|
| #define CK_TIME_KERNEL 1 |
|
|
| |
| |
| #define CK_CONSTANT_ADDRESS_SPACE __attribute__((address_space(4))) |
|
|
| |
| #define CK_USE_LAUNCH_BOUNDS 1 |
|
|
| #ifdef CK_USE_LAUNCH_BOUNDS |
| |
| #define CK_MAX_THREAD_PER_BLOCK 256 |
| #define CK_MIN_BLOCK_PER_CU 2 |
|
|
| |
| #define CK_WAVELET_MAX_THREAD_PER_BLOCK 512 |
| #define CK_WAVELET_MIN_BLOCK_PER_CU 2 |
| #endif |
|
|
| |
| #ifdef CK_USE_WAVES_PER_EU |
| |
| #ifndef CK_MIN_WAVES_PER_EU |
| #define CK_MIN_WAVES_PER_EU 0 |
| #endif |
|
|
| #ifndef CK_MAX_WAVES_PER_EU |
| #define CK_MAX_WAVES_PER_EU 0 |
| #endif |
|
|
| #else |
| #define CK_USE_WAVES_PER_EU 0 |
| #endif |
|
|
| |
| #if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \ |
| defined(__gfx942__) |
| #define __gfx9__ |
| #endif |
| #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) |
| #define __gfx94__ |
| #endif |
| #if defined(__gfx1010__) || defined(__gfx1011__) || defined(__gfx1012__) |
| #define __gfx101__ |
| #endif |
| #if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || \ |
| defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || \ |
| defined(__gfx10_3_generic__) |
| #define __gfx103__ |
| #endif |
| #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || \ |
| defined(__gfx1103__) || defined(__gfx11_generic__) |
| #define __gfx11__ |
| #endif |
| #if defined(__gfx1200__) || defined(__gfx1201__) || defined(__gfx12_generic__) |
| #define __gfx12__ |
| #endif |
|
|
| |
| #ifndef __HIP_DEVICE_COMPILE__ |
| #define CK_BUFFER_RESOURCE_3RD_DWORD -1 |
| #elif defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || defined(__gfx9__) |
| #define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000 |
| #elif defined(__gfx103__) |
| #define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000 |
| #elif defined(__gfx11__) || defined(__gfx12__) |
| #define CK_BUFFER_RESOURCE_3RD_DWORD 0x31004000 |
| #endif |
|
|
| |
| #ifndef __HIP_DEVICE_COMPILE__ |
| #elif defined(__gfx803__) || defined(__gfx900__) |
| #define CK_USE_AMD_V_MAC_F32 |
| #elif defined(__gfx906__) || defined(__gfx9__) || defined(__gfx103__) |
| #define CK_USE_AMD_V_FMAC_F32 |
| #define CK_USE_AMD_V_DOT2_F32_F16 |
| #define CK_USE_AMD_V_DOT4_I32_I8 |
| #elif defined(__gfx11__) || defined(__gfx12__) |
| #define CK_USE_AMD_V_FMAC_F32 |
| #define CK_USE_AMD_V_DOT2_F32_F16 |
| #define CK_USE_AMD_V_DOT4_I32_I8_GFX11 |
| #endif |
|
|
| |
| #ifndef __HIP_DEVICE_COMPILE__ |
| #define CK_USE_AMD_MFMA |
| #elif defined(__gfx9__) |
| #define CK_USE_AMD_MFMA |
| #endif |
|
|
| #if(defined(__gfx90a__) || defined(__gfx94__)) |
| #define CK_USE_AMD_MFMA_BF16_1K_OP |
| #endif |
|
|
| #if defined(__gfx94__) |
| #define CK_USE_AMD_MFMA_GFX940 |
| #endif |
|
|
| |
| #define CK_USE_AMD_BUFFER_LOAD 1 |
|
|
| |
| #define CK_USE_AMD_BUFFER_STORE 1 |
|
|
| |
| #define CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER 1 |
|
|
| |
| #ifndef __HIP_DEVICE_COMPILE__ |
| #define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1 |
| #elif defined(__gfx9__) |
| #define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1 |
| #else |
| #define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 0 |
| #endif |
|
|
| #if(defined(__gfx90a__) || defined(__gfx94__)) |
| #define CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 1 |
| #else |
| #define CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 0 |
| #endif |
|
|
| |
| #define CK_USE_AMD_INLINE_ASM 1 |
|
|
| |
| #define CK_USE_AMD_V_MAC_INLINE_ASM 1 |
|
|
| |
| |
| #define CK_USE_AMD_V_DOT_INLINE_ASM 0 |
|
|
| |
| #define CK_USE_AMD_V_DOT_DPP8_INLINE_ASM 1 |
|
|
| |
| #define CK_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM 0 |
|
|
| |
| #define CK_USE_SR_F8_CONVERSION 0 |
|
|
| |
| #define CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1 |
|
|
| |
| #define CK_EXPERIMENTAL_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 0 |
|
|
| |
| #define CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR 0 |
|
|
| |
| |
| |
| |
| #ifndef CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK |
| #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 0 |
| #endif |
| #define CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK 1 |
| #define CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK 1 |
| #define CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_MAX_OOB_CHECK_OFFSET_TRICK 1 |
|
|
| |
| #define CK_EXPERIMENTAL_USE_IN_REGISTER_SUB_DWORD_TRANSPOSE 1 |
|
|
| |
| #define CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION 1 |
|
|
| |
| |
| #define CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS 0 |
|
|
| |
| #define CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST 1 |
|
|
| |
| #define CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING 1 |
| #define CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING_MAC_CLUSTERS 1 |
| |
| #define CK_EXPERIMENTAL_DEFAULT_TO_INTER_WAVE_SCHEDULING 0 |
| |
| #define CK_EXPERIMENTAL_INTER_WAVE_INSTANCES 1 |
| |
| #define CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES 1 |
| |
| #ifndef CK_EXPERIMENTAL_PIPELINE_V2_IGLP_OPT |
| #define CK_EXPERIMENTAL_PIPELINE_V2_IGLP_OPT 0 |
| #endif |
|
|
| |
| |
| |
| |
| #define CK_HACK_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 0 |
|
|
| |
| #define CK_WORKAROUND_SWDEV_275126 1 |
|
|
| |
| #define CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE 1 |
|
|
| |
| #define CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE 1 |
|
|
| |
| |
| #define CK_WORKAROUND_SWDEV_325164 0 |
|
|
| |
| #define CK_WORKAROUND_SWDEV_383542 1 |
|
|
| |
| #define CK_WORKAROUND_SWDEV_388832 1 |
|
|
| |
| #ifndef CK_WORKAROUND_DENORM_FIX |
| #define CK_WORKAROUND_DENORM_FIX 0 |
| #else |
| |
| #define CK_WORKAROUND_DENORM_FIX = CK_WORKAROUND_DENORM_FIX && defined(__gfx90a__) |
| #endif |
|
|
| |
| #define CK_BUILD_DEPRECATED 1 |
|
|
| namespace ck { |
|
|
| enum struct InMemoryDataOperationEnum |
| { |
| Set, |
| AtomicAdd, |
| AtomicMax, |
| Add |
| }; |
|
|
| |
| template <InMemoryDataOperationEnum... Is> |
| struct InMemoryDataOperationEnumSequence |
| { |
| static constexpr int mSize = sizeof...(Is); |
|
|
| __host__ __device__ static constexpr InMemoryDataOperationEnum At(int I) |
| { |
| |
| const InMemoryDataOperationEnum mData[mSize + 1] = {Is..., InMemoryDataOperationEnum::Set}; |
| return mData[I]; |
| } |
| }; |
|
|
| |
| using index_t = int32_t; |
| using long_index_t = int64_t; |
|
|
| } |
|
|