| | #pragma once |
| |
|
| | #include "ggml.h" |
| | #include "ggml-impl.h" |
| | #include "ggml-cuda.h" |
| |
|
| | #include <cstdint> |
| | #include <memory> |
| |
|
| | #if defined(GGML_USE_HIP) |
| | #define GGML_COMMON_DECL_HIP |
| | #define GGML_COMMON_IMPL_HIP |
| | #else |
| | #define GGML_COMMON_DECL_CUDA |
| | #define GGML_COMMON_IMPL_CUDA |
| | #if defined(GGML_USE_MUSA) |
| | #define GGML_COMMON_DECL_MUSA |
| | #define GGML_COMMON_IMPL_MUSA |
| | #endif |
| | #endif |
| | #include "ggml-common.h" |
| |
|
| | #include <array> |
| | #include <cassert> |
| | #include <cfloat> |
| | #include <cstdio> |
| | #include <string> |
| | #include <vector> |
| |
|
| | #if defined(GGML_USE_HIP) |
| | #include "vendors/hip.h" |
| | #elif defined(GGML_USE_MUSA) |
| | #include "vendors/musa.h" |
| | #else |
| | #include "vendors/cuda.h" |
| | #endif |
| |
|
// Two-step stringification: STRINGIZE first lets its arguments be macro-expanded,
// then STRINGIZE_IMPL turns the resulting tokens into a string literal.
#define STRINGIZE_IMPL(...) #__VA_ARGS__
#define STRINGIZE(...)      STRINGIZE_IMPL(__VA_ARGS__)
| |
|
// Default width used by the warp-level reduction templates in this header.
#define WARP_SIZE    32
// Runtime-version thresholds that are compared against CUDART_VERSION further down:
// at or above CUDART_HMAX the __hmax/__hmax2 intrinsics are used directly,
// below CUDART_HMASK a local __hgt2_mask fallback is compiled.
#define CUDART_HMAX  11070
#define CUDART_HMASK 12000
| |
|
// Compute capability ("cc") thresholds for NVIDIA devices.
#define GGML_CUDA_CC_PASCAL          600
#define GGML_CUDA_CC_DP4A            610
#define GGML_CUDA_CC_VOLTA           700
#define GGML_CUDA_CC_TURING          750
#define GGML_CUDA_CC_AMPERE          800
#define GGML_CUDA_CC_ADA_LOVELACE    890
// Other vendors live in the same integer space, shifted by a vendor offset,
// so numerically: NVIDIA values < MTHREADS values < AMD values.
#define GGML_CUDA_CC_OFFSET_AMD      0x1000000
#define GGML_CUDA_CC_OFFSET_MTHREADS 0x0100000
// NOTE: every predicate macro below parenthesizes its argument so that it is
// safe to pass an arbitrary expression (e.g. `a ? b : c`), not just an identifier.
#define GGML_CUDA_CC_IS_NVIDIA(cc)   ((cc) < GGML_CUDA_CC_OFFSET_MTHREADS)

// AMD: GCN and CDNA generations.
#define GGML_CUDA_CC_GCN4       (GGML_CUDA_CC_OFFSET_AMD + 0x803)
#define GGML_CUDA_CC_VEGA       (GGML_CUDA_CC_OFFSET_AMD + 0x900)
#define GGML_CUDA_CC_VEGA20     (GGML_CUDA_CC_OFFSET_AMD + 0x906)
#define GGML_CUDA_CC_CDNA1      (GGML_CUDA_CC_OFFSET_AMD + 0x908)
#define GGML_CUDA_CC_CDNA2      (GGML_CUDA_CC_OFFSET_AMD + 0x910)
#define GGML_CUDA_CC_CDNA3      (GGML_CUDA_CC_OFFSET_AMD + 0x942)

// AMD: RDNA generations.
#define GGML_CUDA_CC_RDNA1      (GGML_CUDA_CC_OFFSET_AMD + 0x1010)
#define GGML_CUDA_CC_RDNA2      (GGML_CUDA_CC_OFFSET_AMD + 0x1030)
#define GGML_CUDA_CC_RDNA3      (GGML_CUDA_CC_OFFSET_AMD + 0x1100)
#define GGML_CUDA_CC_RDNA4      (GGML_CUDA_CC_OFFSET_AMD + 0x1200)

// AMD range predicates. Each generation covers [its constant, next generation's constant).
#define GGML_CUDA_CC_IS_AMD(cc)   ((cc) >= GGML_CUDA_CC_OFFSET_AMD)
#define GGML_CUDA_CC_IS_RDNA(cc)  ((cc) >= GGML_CUDA_CC_RDNA1)
#define GGML_CUDA_CC_IS_RDNA1(cc) ((cc) >= GGML_CUDA_CC_RDNA1 && (cc) < GGML_CUDA_CC_RDNA2)
#define GGML_CUDA_CC_IS_RDNA2(cc) ((cc) >= GGML_CUDA_CC_RDNA2 && (cc) < GGML_CUDA_CC_RDNA3)
#define GGML_CUDA_CC_IS_RDNA3(cc) ((cc) >= GGML_CUDA_CC_RDNA3 && (cc) < GGML_CUDA_CC_RDNA4)
#define GGML_CUDA_CC_IS_RDNA4(cc) ((cc) >= GGML_CUDA_CC_RDNA4)
#define GGML_CUDA_CC_IS_GCN(cc)   ((cc) > GGML_CUDA_CC_OFFSET_AMD && (cc) < GGML_CUDA_CC_CDNA1)
#define GGML_CUDA_CC_IS_CDNA(cc)  ((cc) >= GGML_CUDA_CC_CDNA1 && (cc) < GGML_CUDA_CC_RDNA1)
#define GGML_CUDA_CC_IS_CDNA1(cc) ((cc) >= GGML_CUDA_CC_CDNA1 && (cc) < GGML_CUDA_CC_CDNA2)
#define GGML_CUDA_CC_IS_CDNA2(cc) ((cc) >= GGML_CUDA_CC_CDNA2 && (cc) < GGML_CUDA_CC_CDNA3)
#define GGML_CUDA_CC_IS_CDNA3(cc) ((cc) >= GGML_CUDA_CC_CDNA3 && (cc) < GGML_CUDA_CC_RDNA1)

// MTHREADS (MUSA builds).
// Runtime-version threshold compared against MUSART_VERSION for the __hgt2_mask fallback.
#define MUSART_HMASK 40300

#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210)
#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220)
#define GGML_CUDA_CC_NG  (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310)

#define GGML_CUDA_CC_IS_MTHREADS(cc) ((cc) >= GGML_CUDA_CC_OFFSET_MTHREADS && (cc) < GGML_CUDA_CC_OFFSET_AMD)
#define GGML_CUDA_CC_IS_QY1(cc)      ((cc) >= GGML_CUDA_CC_QY1 && (cc) < GGML_CUDA_CC_QY2)
#define GGML_CUDA_CC_IS_QY2(cc)      ((cc) >= GGML_CUDA_CC_QY2 && (cc) < GGML_CUDA_CC_NG)
#define GGML_CUDA_CC_IS_NG(cc)       ((cc) >= GGML_CUDA_CC_NG)
| |
|
// GGML_CUDA_USE_CUB is defined only for plain CUDA builds (neither HIP nor MUSA)
// whose CUDART_VERSION is at least 11070.
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
#define GGML_CUDA_USE_CUB
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
| |
|
| | #ifdef __CUDA_ARCH_LIST__ |
// Terminal case: the list of architectures is exhausted without finding a match.
constexpr bool ggml_cuda_has_arch_impl(int) {
    return false;
}

// Recursive case: true if `arch` equals `first` or any entry of `rest`.
template<class ... Archs>
constexpr bool ggml_cuda_has_arch_impl(const int arch, const int first, Archs... rest) {
    return arch == first ? true : ggml_cuda_has_arch_impl(arch, rest...);
}
| |
|
// Returns whether `arch` appears in the compiler-provided __CUDA_ARCH_LIST__.
constexpr bool ggml_cuda_has_arch(const int arch) {
    const bool found = ggml_cuda_has_arch_impl(arch, __CUDA_ARCH_LIST__);
    return found;
}
| |
|
// Terminal case: `cur` holds the best candidate found so far.
// A value of 0 means no candidate was accepted, which is reported as -1.
constexpr int ggml_cuda_highest_compiled_arch_impl(const int , const int cur) {
    return cur == 0 ? -1 : cur;
}

// Recursive case: scans the remaining architectures and keeps the largest one
// that does not exceed `arch`.
template<class ... Archs>
constexpr int ggml_cuda_highest_compiled_arch_impl(const int arch, const int cur, const int first, Archs... rest) {
    // `first` replaces the current candidate only if it is usable (<= arch) and strictly better.
    const bool take_first = first <= arch && first > cur;
    return ggml_cuda_highest_compiled_arch_impl(arch, take_first ? first : cur, rest...);
}
| |
|
// Returns the highest entry of __CUDA_ARCH_LIST__ that is <= `arch`,
// or -1 if the list contains no such entry.
constexpr int ggml_cuda_highest_compiled_arch(const int arch) {
    const int no_candidate = 0; // initial "nothing found yet" marker for the scan
    return ggml_cuda_highest_compiled_arch_impl(arch, no_candidate, __CUDA_ARCH_LIST__);
}
| | #else |
// Fallback used when __CUDA_ARCH_LIST__ is not defined: there is no list to
// consult, so the requested architecture is returned unchanged.
static int ggml_cuda_highest_compiled_arch(const int arch) {
    const int highest = arch;
    return highest;
}
| | #endif |
| |
|
| | |
| |
|
// Padding granularity for matrix rows.
// NOTE(review): unit and exact usage are not visible in this header — confirm against the users of this macro.
#define MATRIX_ROW_PADDING    512

// Number of stream slots per device (second dimension of the per-device stream and event arrays below).
#define GGML_CUDA_MAX_STREAMS 8
| |
|
// Error sink used by the *_CHECK macros below. Declared [[noreturn]]: it never returns to the caller.
// Arguments, in order: the stringified failing statement, the enclosing function name,
// the source file, the line number and a human-readable error message.
[[noreturn]] void ggml_cuda_error(
    const char * stmt, const char * func, const char * file, int line, const char * msg);
| |
|
// Generic status check: evaluates `err` exactly once, compares the result with
// `success` and, on mismatch, forwards the stringified expression, the call
// site and error_fn(result) to ggml_cuda_error (which does not return).
#define CUDA_CHECK_GEN(err, success, error_fn) \
    do { \
        auto err_ = (err); \
        if (err_ != (success)) { \
            ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_)); \
        } \
    } while (0)

// Status check for CUDA runtime calls returning cudaError_t.
#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString)
| |
|
// Maps a cuBLAS status code to a printable name.
#if CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA)
// The library-provided lookup is used for CUDART_VERSION >= 12000 and for MUSA builds.
static const char * cublas_get_error_str(const cublasStatus_t err) {
    return cublasGetStatusString(err);
}
#else
// Older runtimes: local lookup table; unrecognized codes map to "unknown error".
static const char * cublas_get_error_str(const cublasStatus_t err) {
    const char * name = "unknown error";
    switch (err) {
        case CUBLAS_STATUS_SUCCESS:          name = "CUBLAS_STATUS_SUCCESS";          break;
        case CUBLAS_STATUS_NOT_INITIALIZED:  name = "CUBLAS_STATUS_NOT_INITIALIZED";  break;
        case CUBLAS_STATUS_ALLOC_FAILED:     name = "CUBLAS_STATUS_ALLOC_FAILED";     break;
        case CUBLAS_STATUS_INVALID_VALUE:    name = "CUBLAS_STATUS_INVALID_VALUE";    break;
        case CUBLAS_STATUS_ARCH_MISMATCH:    name = "CUBLAS_STATUS_ARCH_MISMATCH";    break;
        case CUBLAS_STATUS_MAPPING_ERROR:    name = "CUBLAS_STATUS_MAPPING_ERROR";    break;
        case CUBLAS_STATUS_EXECUTION_FAILED: name = "CUBLAS_STATUS_EXECUTION_FAILED"; break;
        case CUBLAS_STATUS_INTERNAL_ERROR:   name = "CUBLAS_STATUS_INTERNAL_ERROR";   break;
        case CUBLAS_STATUS_NOT_SUPPORTED:    name = "CUBLAS_STATUS_NOT_SUPPORTED";    break;
        default:                                                                      break;
    }
    return name;
}
#endif // CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA)

// Status check for cuBLAS calls returning cublasStatus_t.
#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)
| |
|
#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
// Maps a driver API result code to a printable message.
// The lookup itself can fail (e.g. for a code the driver does not recognize);
// in that case a fixed fallback string is returned so that callers never
// receive a null or uninitialized pointer.
static const char * cu_get_error_str(CUresult err) {
    const char * err_str = nullptr;
    if (cuGetErrorString(err, &err_str) != CUDA_SUCCESS || err_str == nullptr) {
        return "unknown error";
    }
    return err_str;
}
// Status check for driver API calls returning CUresult.
#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
| |
|
// CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes)
// Plain CUDA builds: sets cudaFuncAttributeMaxDynamicSharedMemorySize of `kernel` to `nbytes`.
// Each expansion owns a static per-device flag array, so the attribute is set at most once
// per device for that expansion; later calls are no-ops even if `nbytes` differs.
// HIP / MUSA builds: expands to a no-op that only marks `nbytes` as used.
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
#define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) \
    do { \
        static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = { false }; \
        const int id = ggml_cuda_get_device(); \
        if (!shared_memory_limit_raised[id]) { \
            CUDA_CHECK(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes)); \
            shared_memory_limit_raised[id] = true; \
        } \
    } while (0)
#else
#define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) \
    do { \
        GGML_UNUSED(nbytes); \
    } while (0)
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
| |
|
// GGML_CUDA_ASSUME(x): forwards `x` to __builtin_assume where it is used
// (CUDART_VERSION >= 11010 or MUSA builds); expands to nothing otherwise.
#if CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA)
#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
#else
#define GGML_CUDA_ASSUME(x)
#endif // CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA)
| |
|
// Compile-time feature switches. Each macro below is defined only when the
// corresponding preprocessor condition holds for the current compilation pass.

// VMM: enabled unless explicitly disabled for the active backend (CUDA/MUSA vs. HIP).
#if (!defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)) || (defined(GGML_USE_HIP) && !defined(GGML_HIP_NO_VMM))
#define GGML_USE_VMM
#endif

// FP16 arithmetic: always on HIP, otherwise requires __CUDA_ARCH__ >= Pascal.
#if defined(GGML_USE_HIP) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
#define FP16_AVAILABLE
#endif

// Fast FP16: FP16 is available and the target is not __CUDA_ARCH__ 610.
#if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
#define FAST_FP16_AVAILABLE
#endif

// AMD MFMA: HIP build for a CDNA target, unless disabled via GGML_HIP_NO_MMQ_MFMA.
#if defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA)
#define AMD_MFMA_AVAILABLE
#endif

// Turing-level MMA: non-HIP build with __CUDA_ARCH__ >= Turing.
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
#define TURING_MMA_AVAILABLE
#endif

// Ampere-level MMA: non-HIP build with __CUDA_ARCH__ >= Ampere.
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
#define AMPERE_MMA_AVAILABLE
#endif

// cp.async: same condition as Ampere-level MMA.
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
#define CP_ASYNC_AVAILABLE
#endif

// Flash attention: enabled unless GGML_CUDA_NO_FA is set or the MUSA target is older than __MUSA_ARCH__ 220.
#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)
#define FLASH_ATTN_AVAILABLE
#endif
| |
|
// True if the highest compiled architecture usable for `cc` is at least Pascal.
static bool fp16_available(const int cc) {
    const int compiled_arch = ggml_cuda_highest_compiled_arch(cc);
    return compiled_arch >= GGML_CUDA_CC_PASCAL;
}
| |
|
// True for every AMD device; for NVIDIA devices true if FP16 is available and
// the highest compiled architecture for `cc` is not 610. False for all other vendors.
static bool fast_fp16_available(const int cc) {
    if (GGML_CUDA_CC_IS_AMD(cc)) {
        return true;
    }
    if (!GGML_CUDA_CC_IS_NVIDIA(cc)) {
        return false;
    }
    return fp16_available(cc) && ggml_cuda_highest_compiled_arch(cc) != 610;
}
| |
|
| | |
// Like fast_fp16_available, but decided from the device `cc` alone,
// without consulting the list of compiled architectures.
static bool fast_fp16_hardware_available(const int cc) {
    // The vendor ranges are disjoint, so each vendor can be handled on its own.
    if (GGML_CUDA_CC_IS_NVIDIA(cc)) {
        return cc >= GGML_CUDA_CC_PASCAL && cc != 610;
    }
    if (GGML_CUDA_CC_IS_AMD(cc)) {
        return true;
    }
    return GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2;
}
| |
|
| | |
// Decides from the device `cc` alone whether FP16 matrix-multiply hardware is reported:
// NVIDIA Volta or newer, AMD CDNA / RDNA3 / RDNA4, or MTHREADS QY2 or newer.
static bool fp16_mma_hardware_available(const int cc) {
    const bool nvidia_ok   = GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA;
    const bool amd_ok      = GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc);
    const bool mthreads_ok = GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2;
    return nvidia_ok || amd_ok || mthreads_ok;
}
| |
|
// Decides from the device `cc` alone whether BF16 matrix-multiply hardware is reported:
// NVIDIA Ampere or newer, AMD CDNA, or any cc >= GGML_CUDA_CC_RDNA3.
static bool bf16_mma_hardware_available(const int cc) {
    const bool nvidia_ok = GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE;
    const bool cdna_ok   = GGML_CUDA_CC_IS_CDNA(cc);
    const bool rdna_ok   = cc >= GGML_CUDA_CC_RDNA3;
    return nvidia_ok || cdna_ok || rdna_ok;
}
| |
|
// Decides from the device `cc` alone whether FP32 matrix-multiply hardware is reported: AMD CDNA only.
static bool fp32_mma_hardware_available(const int cc) {
    const bool is_cdna = GGML_CUDA_CC_IS_CDNA(cc);
    return is_cdna;
}
| |
|
// True for AMD CDNA devices, unless the build disables MFMA via GGML_HIP_NO_MMQ_MFMA.
static bool amd_mfma_available(const int cc) {
#if defined(GGML_HIP_NO_MMQ_MFMA)
    return false;
#else
    return GGML_CUDA_CC_IS_CDNA(cc);
#endif // defined(GGML_HIP_NO_MMQ_MFMA)
}
| |
|
| | |
// True for NVIDIA devices whose highest compiled architecture is at least Turing.
static bool turing_mma_available(const int cc) {
    if (!GGML_CUDA_CC_IS_NVIDIA(cc)) {
        return false;
    }
    return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
}
| |
|
// True for NVIDIA devices whose highest compiled architecture is at least Ampere.
static bool ampere_mma_available(const int cc) {
    if (!GGML_CUDA_CC_IS_NVIDIA(cc)) {
        return false;
    }
    return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_AMPERE;
}
| |
|
// True for NVIDIA devices whose highest compiled architecture is at least Ampere
// (same condition as ampere_mma_available).
static bool cp_async_available(const int cc) {
    if (!GGML_CUDA_CC_IS_NVIDIA(cc)) {
        return false;
    }
    return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_AMPERE;
}
| |
|
// Compile-time warp width of the target: 64 for HIP builds targeting GFX8/GFX9, 32 otherwise.
static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
#if !defined(GGML_USE_HIP) || !(defined(__GFX9__) || defined(__GFX8__))
    return 32;
#else
    return 64;
#endif // !defined(GGML_USE_HIP) || !(defined(__GFX9__) || defined(__GFX8__))
}
| |
|
| | |
// Compile-time copy width in bytes: 16 on HIP and on __CUDA_ARCH__ >= Volta, 8 otherwise.
static constexpr __device__ int ggml_cuda_get_max_cpy_bytes() {
#if defined(GGML_USE_HIP) || __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
    return 16;
#else
    return 8;
#endif // defined(GGML_USE_HIP) || __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
}
| |
|
| |
|
// Device-side handler for kernels that were compiled without a usable code path
// for the current architecture: prints a diagnostic and traps. Never returns.
[[noreturn]]
static __device__ void no_device_code(
    const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {

#if !defined(GGML_USE_HIP)
    printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
           file_name, line, function_name, arch, arch_list);
#else
    // The HIP message does not include the list of compiled architectures.
    GGML_UNUSED(arch_list);
    printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
           file_name, line, function_name, arch);
#endif // !defined(GGML_USE_HIP)
    __trap();

    // Self-reference so the function counts as used even when no kernel calls it.
    GGML_UNUSED(no_device_code);

#if defined(GGML_USE_MUSA)
    __builtin_unreachable();
#endif // defined(GGML_USE_MUSA)
}
| |
|
// NO_DEVICE_CODE: when __CUDA_ARCH__ is defined, reports the call site through
// no_device_code(); otherwise it expands to nothing.
#if defined(__CUDA_ARCH__)
#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
#else
#define NO_DEVICE_CODE
#endif // defined(__CUDA_ARCH__)
| |
|
| | |
| | |
// Compile-time unrolled loop: ggml_cuda_unroll<n>{}(f, args...) calls
// f(n - 1, args...), f(n - 2, args...), ..., f(0, args...) in that order.
template <int n>
struct ggml_cuda_unroll {
    template <typename F, typename... Ts>
    __device__ void operator()(const F & f, Ts... args) const {
        f(n - 1, args...);
        // Hand the remaining n - 1 iterations to the next smaller instantiation.
        ggml_cuda_unroll<n - 1> next;
        next(f, args...);
    }
};

// Recursion anchor: the last iteration is index 0.
template <>
struct ggml_cuda_unroll<1> {
    template <typename F, typename... Ts>
    __device__ void operator()(const F & f, Ts... args) const {
        f(0, args...);
    }
};
| |
|
// Sums `x` over groups of `width` lanes and returns the group total in every lane.
// Assumes all 32 lanes named by the full mask participate in the call.
// The hardware reduction always spans the whole warp, so it is only used when
// `width` covers the full warp; narrower widths use the xor-shuffle reduction
// so that the result respects `width` on every architecture.
template<int width = WARP_SIZE>
static __device__ __forceinline__ int warp_reduce_sum(int x) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
    if constexpr (width == WARP_SIZE) {
        return __reduce_add_sync(0xffffffff, x);
    } else
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
    {
#pragma unroll
        for (int offset = width/2; offset > 0; offset >>= 1) {
            x += __shfl_xor_sync(0xffffffff, x, offset, width);
        }
        return x;
    }
}
| |
|
// Sums `x` over groups of `width` lanes via xor shuffles; every lane receives the group total.
template<int width = WARP_SIZE>
static __device__ __forceinline__ float warp_reduce_sum(float x) {
#pragma unroll
    for (int lane_dist = width/2; lane_dist > 0; lane_dist >>= 1) {
        const float partner = __shfl_xor_sync(0xffffffff, x, lane_dist, width);
        x += partner;
    }
    return x;
}
| |
|
// Component-wise sum of `a` over groups of `width` lanes via xor shuffles.
template<int width = WARP_SIZE>
static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
#pragma unroll
    for (int lane_dist = width/2; lane_dist > 0; lane_dist >>= 1) {
        const float partner_x = __shfl_xor_sync(0xffffffff, a.x, lane_dist, width);
        const float partner_y = __shfl_xor_sync(0xffffffff, a.y, lane_dist, width);
        a.x += partner_x;
        a.y += partner_y;
    }
    return a;
}
| |
|
// half2 sum over groups of `width` lanes via xor shuffles.
// Without FP16 support the function reports missing device code instead.
template<int width = WARP_SIZE>
static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
#if !defined(FP16_AVAILABLE)
    NO_DEVICE_CODE;
    return a;
#else
#pragma unroll
    for (int lane_dist = width/2; lane_dist > 0; lane_dist >>= 1) {
        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, lane_dist, width));
    }
    return a;
#endif // !defined(FP16_AVAILABLE)
}
| |
|
// Logical AND of `x` over groups of `width` lanes.
// When `width` equals the physical warp size the vote intrinsic is used,
// otherwise the result is combined with xor shuffles.
template<int width = WARP_SIZE>
static __device__ __forceinline__ int warp_reduce_all(int x) {
    if (width != ggml_cuda_get_physical_warp_size()) {
#pragma unroll
        for (int lane_dist = width/2; lane_dist > 0; lane_dist >>= 1) {
            // The shuffle is the left operand so that it is executed by every lane.
            x = __shfl_xor_sync(0xffffffff, x, lane_dist, width) && x;
        }
        return x;
    }
    return __all_sync(0xffffffff, x);
}
| |
|
// Logical OR of `x` over groups of `width` lanes.
// When `width` equals the physical warp size the vote intrinsic is used,
// otherwise the result is combined with xor shuffles.
template<int width = WARP_SIZE>
static __device__ __forceinline__ int warp_reduce_any(int x) {
    if (width != ggml_cuda_get_physical_warp_size()) {
#pragma unroll
        for (int lane_dist = width/2; lane_dist > 0; lane_dist >>= 1) {
            // The shuffle is the left operand so that it is executed by every lane.
            x = __shfl_xor_sync(0xffffffff, x, lane_dist, width) || x;
        }
        return x;
    }
    return __any_sync(0xffffffff, x);
}
| |
|
// Maximum of `x` over groups of `width` lanes via xor shuffles; every lane receives the group maximum.
template<int width = WARP_SIZE>
static __device__ __forceinline__ float warp_reduce_max(float x) {
#pragma unroll
    for (int lane_dist = width/2; lane_dist > 0; lane_dist >>= 1) {
        const float partner = __shfl_xor_sync(0xffffffff, x, lane_dist, width);
        x = fmaxf(x, partner);
    }
    return x;
}
| |
|
// Maximum of two half values.
// Non-HIP builds with CUDART_VERSION < CUDART_HMAX go through float; otherwise __hmax is used.
// Without FP16 support the function reports missing device code.
static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
#if !defined(FP16_AVAILABLE)
    NO_DEVICE_CODE;
    GGML_UNUSED(b);
    return a;
#elif !defined(GGML_USE_HIP) && CUDART_VERSION < CUDART_HMAX
    const float a_f32 = __half2float(a);
    const float b_f32 = __half2float(b);
    return __float2half(fmaxf(a_f32, b_f32));
#else
    return __hmax(a, b);
#endif // !defined(FP16_AVAILABLE)
}
| |
|
// Component-wise maximum of two half2 values.
// HIP: built from two scalar __hmax calls; CUDART_VERSION >= CUDART_HMAX: __hmax2;
// older runtimes: each component is compared in float and converted back.
static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
#if defined(GGML_USE_HIP)
    return half2(__hmax(a.x, b.x), __hmax(a.y, b.y));
#elif CUDART_VERSION >= CUDART_HMAX
    return __hmax2(a, b);
#else
    const float max_lo = fmaxf( __low2float(a),  __low2float(b));
    const float max_hi = fmaxf(__high2float(a), __high2float(b));
    half2 ret;
    reinterpret_cast<half&>(ret.x) = __float2half(max_lo);
    reinterpret_cast<half&>(ret.y) = __float2half(max_hi);
    return ret;
#endif // defined(GGML_USE_HIP)
}
| |
|
// Component-wise maximum of `x` over groups of `width` lanes via xor shuffles.
// FP16_AVAILABLE is defined for HIP builds and for __CUDA_ARCH__ >= Pascal, which is
// the same condition this function was previously spelling out by hand.
// Without FP16 support the function reports missing device code; the trailing
// `return` keeps every path of this non-void function well-formed (including
// passes where NO_DEVICE_CODE expands to nothing), matching warp_reduce_sum(half2).
template<int width = WARP_SIZE>
static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
#ifdef FP16_AVAILABLE
#pragma unroll
    for (int offset = width/2; offset > 0; offset >>= 1) {
        x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, width));
    }
    return x;
#else
    NO_DEVICE_CODE;
    return x;
#endif // FP16_AVAILABLE
}
| |
|
// Local fallback for __hgt2_mask, compiled for CUDA runtimes older than CUDART_HMASK,
// for HIP, and for MUSA runtimes older than MUSART_HMASK.
// Returns a 32-bit mask: the low 16 bits are all ones if low(a) > low(b),
// the high 16 bits are all ones if high(a) > high(b).
#if (defined(CUDART_VERSION) && CUDART_VERSION < CUDART_HMASK) || defined(GGML_USE_HIP) || \
    (defined(MUSART_VERSION) && MUSART_VERSION < MUSART_HMASK)
static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) {
    const bool low_gt  = float( __low2half(a)) > float( __low2half(b));
    const bool high_gt = float(__high2half(a)) > float(__high2half(b));
    const uint32_t mask_low  = low_gt  ? 0x0000FFFF : 0;
    const uint32_t mask_high = high_gt ? 0xFFFF0000 : 0;
    return mask_low | mask_high;
}
#endif
| |
|
// Returns c + dot(a, b), where a and b are each treated as four packed signed 8-bit integers.
// HIP picks a builtin, inline assembly or a vector fallback depending on the target;
// CUDA/MUSA use __dp4a where available and a scalar fallback otherwise.
static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
#if defined(GGML_USE_HIP)
#if defined(CDNA) || defined(RDNA2) || defined(__gfx906__)
    c = __builtin_amdgcn_sdot4(a, b, c, false);
#elif defined(RDNA3) || defined(RDNA4)
    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
#elif defined(RDNA1) || defined(__gfx900__)
    // Two byte-wise products are formed per pair of instructions and added into c.
    int prod_even;
    int prod_odd;
    asm("\n \
        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
        v_add3_u32 %0, %1, %2, %0 \n \
        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
        v_add3_u32 %0, %1, %2, %0 \n \
        "
        : "+v"(c), "=&v"(prod_even), "=&v"(prod_odd)
        : "v"(a), "v"(b)
    );
#else
    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
#endif
    return c;
#elif __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)
    return __dp4a(a, b, c);
#else
    // Scalar fallback: reinterpret both operands as four signed bytes each.
    const int8_t * a8 = (const int8_t *) &a;
    const int8_t * b8 = (const int8_t *) &b;
    return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
#endif // defined(GGML_USE_HIP)
}
| |
|
// Multiply-add: acc = acc + v*u.
static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const float v, const float u) {
    acc = acc + v*u;
}
| |
|
// Adds the two-component dot product of v and u to acc (x term first, then y term).
static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const float2 v, const float2 u) {
    acc = acc + v.x*u.x;
    acc = acc + v.y*u.y;
}
| |
|
// Adds the two-component dot product of the half2 values v and u to the float accumulator.
// Selected AMD targets use a single dot instruction; with fast FP16 the product is
// formed in half2 and then converted; otherwise everything is computed in float.
static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const half2 v, const half2 u) {
#if defined(GGML_USE_HIP) && (defined(RDNA2) || defined(RDNA3) || defined(RDNA4) || defined(__gfx906__) || defined(CDNA))
    asm volatile("v_dot2_f32_f16 %0, %1, %2, %0" : "+v"(acc) : "v"(v), "v"(u));
#elif defined(FAST_FP16_AVAILABLE)
    const float2 prod = __half22float2(v*u);
    acc += prod.x + prod.y;
#else
    const float2 v_f32 = __half22float2(v);
    const float2 u_f32 = __half22float2(u);
    acc += v_f32.x * u_f32.x;
    acc += v_f32.y * u_f32.y;
#endif
}
| |
|
// Component-wise multiply-add on half2: acc += v*u.
// Without fast FP16 the operation is carried out in float and converted back.
static __device__ __forceinline__ void ggml_cuda_mad(half2 & acc, const half2 v, const half2 u) {
#if !defined(FAST_FP16_AVAILABLE)
    const float2 v_f32 = __half22float2(v);
    const float2 u_f32 = __half22float2(u);
    float2 acc_f32     = __half22float2(acc);
    acc_f32.x += v_f32.x * u_f32.x;
    acc_f32.y += v_f32.y * u_f32.y;
    acc = make_half2(acc_f32.x, acc_f32.y);
#else
    acc += v*u;
#endif // !defined(FAST_FP16_AVAILABLE)
}
| |
|
| | |
| | |
| | |
| | |
| | |
// Copies `nbytes` bytes from src to dst using fixed-size element copies.
//   alignment == 0: a single copy of nbytes bytes.
//   alignment != 0: nbytes/alignment copies of `alignment` bytes each (nbytes must be a multiple).
// The per-copy size must be 1, 2, 4, 8 or 16 bytes; anything else fails to compile.
// The pointers are cast to the matching element type, so they are presumably expected
// to be aligned accordingly — confirm against callers.
template <int nbytes, int alignment = 0>
static __device__ __forceinline__ void ggml_cuda_memcpy_1(void * __restrict__ dst, const void * __restrict__ src) {
    if constexpr (alignment != 0) {
        static_assert(nbytes % alignment == 0, "bad alignment");
    }
    constexpr int chunk = alignment == 0 ? nbytes : alignment;
    constexpr int n_chunks = nbytes/chunk;

#pragma unroll
    for (int idx = 0; idx < n_chunks; ++idx) {
        if constexpr (chunk == 1) {
            ((char *) dst)[idx] = ((const char *) src)[idx];
        } else if constexpr (chunk == 2) {
            ((short *) dst)[idx] = ((const short *) src)[idx];
        } else if constexpr (chunk == 4) {
            ((int *) dst)[idx] = ((const int *) src)[idx];
        } else if constexpr (chunk == 8) {
            ((int2 *) dst)[idx] = ((const int2 *) src)[idx];
        } else if constexpr (chunk == 16) {
            ((int4 *) dst)[idx] = ((const int4 *) src)[idx];
        } else {
            // Always-false condition that depends on the template parameter:
            // only fires when this branch is actually instantiated.
            static_assert(nbytes == 0 && nbytes == -1, "bad nbytes");
        }
    }
}
| |
|
// Converts an 8-bit e8m0 value to float.
// CUDART_VERSION >= 12080: uses the runtime conversion to bf16 and widens the result.
// Older runtimes: builds the float bit pattern by hand — x is shifted into bit 23 and up,
// and x == 0 is mapped to the fixed pattern 0x00400000.
static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) {
#if CUDART_VERSION >= 12080
    const nv_bfloat16 e = __nv_cvt_e8m0_to_bf16raw(x);
    return (float) e;
#else
    const uint32_t bits = x == 0 ? 0x00400000 : (uint32_t) x << 23;

    float result;
    memcpy(&result, &bits, sizeof(float));
    return result;
#endif // CUDART_VERSION >= 12080
}
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
// Precomputes the constants consumed by fastdiv / fastmodulo / fast_div_modulo for divisor `d`.
// Returns <mp, L, d> packed as <x, y, z>, where L is the smallest value in [0, 32]
// with 2^L >= d and mp = floor(2^32 * (2^L - d) / d) + 1 truncated to 32 bits.
// Asserts that d is non-zero.
static const uint3 init_fastdiv_values(uint32_t d) {
    GGML_ASSERT(d != 0);

    // L = ceil(log2(d)), capped at 32.
    uint32_t L = 0;
    for (; L < 32 && (uint32_t{ 1 } << L) < d; L++) {
    }

    const uint64_t two_pow_32 = uint64_t{ 1 } << 32;
    const uint64_t two_pow_L  = uint64_t{ 1 } << L;
    const uint32_t mp = (uint32_t) (two_pow_32 * (two_pow_L - d) / d + 1);

    // The divisor is packed alongside so that the modulo helpers need no extra argument.
    return make_uint3(mp, L, d);
}
| |
|
// Division by a precomputed divisor: (mulhi(n, mp) + n) >> L,
// with <mp, L> taken from the <x, y> components produced by init_fastdiv_values.
// NOTE(review): the addition is performed in 32 bits — verify that callers keep n
// small enough for the sum not to wrap.
static __device__ __forceinline__ uint32_t fastdiv(uint32_t n, const uint3 fastdiv_values) {
    const uint32_t mp = fastdiv_values.x;
    const uint32_t L  = fastdiv_values.y;
    return (__umulhi(n, mp) + n) >> L;
}
| |
|
// Remainder for a precomputed divisor: n - fastdiv(n) * d, with d stored in the z component.
static __device__ __forceinline__ uint32_t fastmodulo(uint32_t n, const uint3 fastdiv_values) {
    const uint32_t quotient = fastdiv(n, fastdiv_values);
    return n - quotient * fastdiv_values.z;
}
| |
|
| | |
// Computes quotient and remainder for a precomputed divisor with a single fastdiv call.
// Returns <quotient, remainder> packed as <x, y>.
static __device__ __forceinline__ uint2 fast_div_modulo(uint32_t n, const uint3 fastdiv_values) {
    const uint32_t quotient  = fastdiv(n, fastdiv_values);
    const uint32_t remainder = n - quotient * fastdiv_values.z;
    return make_uint2(quotient, remainder);
}
| |
|
// Function-pointer type for dequantization routines: reads from `vx` at block index `ib`
// and intra-block index `iqs`, and writes two float values into `v`.
// NOTE(review): the exact meaning of `ib` / `iqs` is inferred from the parameter names — confirm.
using dequantize_kernel_t = void (*)(const void * vx, const int64_t ib, const int iqs, float2 & v);
| |
|
// Returns the ALiBi slope for head `h`.
//   max_bias <= 0      : 1.0f
//   h <  n_head_log2   : m0^(h + 1)
//   h >= n_head_log2   : m1^(2*(h - n_head_log2) + 1)
static __device__ __forceinline__ float get_alibi_slope(
    const float max_bias, const uint32_t h, const uint32_t n_head_log2, const float m0, const float m1
) {
    if (max_bias <= 0.0f) {
        return 1.0f;
    }

    const bool  in_first_group = h < n_head_log2;
    const float base           = in_first_group ? m0 : m1;
    const int   exph           = in_first_group ? h + 1 : 2*(h - n_head_log2) + 1;

    return powf(base, exph);
}
| |
|
// Compile-time constants per ggml tensor type. Each specialization forwards the
// QK* / QR* / QI* constants of its type as `qk`, `qr` and `qi`
// (GGML_TYPE_F16 defines only `qk` and `qr`). The primary template is left
// undefined, so using an unlisted type is a compile error.
template <ggml_type type>
struct ggml_cuda_type_traits;

// ---- F16 ----
template<>
struct ggml_cuda_type_traits<GGML_TYPE_F16> {
    static constexpr int qk = 1;
    static constexpr int qr = 1;
};

// ---- Q4_0 / Q4_1 / Q5_0 / Q5_1 / Q8_0 ----
template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q4_0> {
    static constexpr int qk = QK4_0;
    static constexpr int qr = QR4_0;
    static constexpr int qi = QI4_0;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q4_1> {
    static constexpr int qk = QK4_1;
    static constexpr int qr = QR4_1;
    static constexpr int qi = QI4_1;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q5_0> {
    static constexpr int qk = QK5_0;
    static constexpr int qr = QR5_0;
    static constexpr int qi = QI5_0;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q5_1> {
    static constexpr int qk = QK5_1;
    static constexpr int qr = QR5_1;
    static constexpr int qi = QI5_1;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q8_0> {
    static constexpr int qk = QK8_0;
    static constexpr int qr = QR8_0;
    static constexpr int qi = QI8_0;
};

// ---- MXFP4 ----
template<>
struct ggml_cuda_type_traits<GGML_TYPE_MXFP4> {
    static constexpr int qk = QK_MXFP4;
    static constexpr int qr = QR_MXFP4;
    static constexpr int qi = QI_MXFP4;
};

// ---- K-quants (all use QK_K) ----
template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q2_K> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR2_K;
    static constexpr int qi = QI2_K;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q3_K> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR3_K;
    static constexpr int qi = QI3_K;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q4_K> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR4_K;
    static constexpr int qi = QI4_K;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q5_K> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR5_K;
    static constexpr int qi = QI5_K;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q6_K> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR6_K;
    static constexpr int qi = QI6_K;
};

// ---- IQ types ----
template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ2_XXS> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR2_XXS;
    static constexpr int qi = QI2_XXS;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ2_XS> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR2_XS;
    static constexpr int qi = QI2_XS;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ2_S> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR2_S;
    static constexpr int qi = QI2_S;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ3_XXS> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR3_XXS;
    static constexpr int qi = QI3_XXS;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ1_S> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR1_S;
    static constexpr int qi = QI1_S;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ1_M> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR1_M;
    static constexpr int qi = QI1_M;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ4_NL> {
    static constexpr int qk = QK4_NL;
    static constexpr int qr = QR4_NL;
    static constexpr int qi = QI4_NL;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ4_XS> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR4_XS;
    static constexpr int qi = QI4_XS;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ3_S> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR3_S;
    static constexpr int qi = QI3_S;
};
| |
|
| | |
| |
|
// Snapshot of the devices visible to the backend.
// NOTE(review): the per-field descriptions below are inferred from the field names — confirm
// against the code that fills this structure.
struct ggml_cuda_device_info {
    int device_count; // number of valid entries in `devices` (presumably)

    struct cuda_device_info {
        int     cc;              // compute capability, in the GGML_CUDA_CC_* encoding
        int     nsm;             // presumably the number of multiprocessors
        size_t  smpb;            // presumably shared memory per block
        size_t  smpbo;           // presumably shared memory per block with opt-in
        bool    integrated;      // presumably: integrated GPU
        bool    vmm;             // presumably: virtual memory management supported
        size_t  vmm_granularity; // presumably the VMM allocation granularity
        size_t  total_vram;
        int     warp_size;
    };

    // One slot per possible device; value-initialized.
    cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {};

    std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {};
};
| |
|
// Accessor for the device information structure (defined elsewhere).
const ggml_cuda_device_info & ggml_cuda_info();

// Select / query the current device (defined elsewhere).
void ggml_cuda_set_device(int device);
int  ggml_cuda_get_device();
| |
|
// Abstract memory pool interface.
struct ggml_cuda_pool {
    virtual ~ggml_cuda_pool() = default;

    // Returns a buffer for `size` bytes and reports a size through `actual_size`,
    // which callers in this header pass back unchanged to free().
    virtual void * alloc(size_t size, size_t * actual_size) = 0;

    // Returns a buffer previously obtained from alloc(), together with its reported size.
    virtual void free(void * ptr, size_t size) = 0;
};
| |
|
// Scoped owner of a single pool allocation of T elements: the buffer is handed
// back to its pool when the object is destroyed. Neither copyable nor movable.
template<typename T>
struct ggml_cuda_pool_alloc {
    ggml_cuda_pool * pool = nullptr;
    T * ptr = nullptr;
    size_t actual_size = 0;

    // Empty holder; a pool must be supplied through alloc(pool, size) later.
    ggml_cuda_pool_alloc() = default;

    // Binds to a pool without allocating.
    explicit ggml_cuda_pool_alloc(ggml_cuda_pool & pool) : pool(&pool) {}

    // Binds to a pool and immediately allocates `size` elements.
    ggml_cuda_pool_alloc(ggml_cuda_pool & pool, size_t size) : pool(&pool) {
        alloc(size);
    }

    ggml_cuda_pool_alloc(const ggml_cuda_pool_alloc &) = delete;
    ggml_cuda_pool_alloc(ggml_cuda_pool_alloc &&) = delete;
    ggml_cuda_pool_alloc& operator=(const ggml_cuda_pool_alloc &) = delete;
    ggml_cuda_pool_alloc& operator=(ggml_cuda_pool_alloc &&) = delete;

    ~ggml_cuda_pool_alloc() {
        if (ptr == nullptr) {
            return; // nothing was allocated
        }
        pool->free(ptr, actual_size);
    }

    // Allocates `size` elements (not bytes) from the bound pool.
    // Asserts that a pool is bound and that nothing has been allocated yet.
    T * alloc(size_t size) {
        GGML_ASSERT(pool != nullptr);
        GGML_ASSERT(ptr == nullptr);
        void * raw = pool->alloc(size * sizeof(T), &this->actual_size);
        ptr = (T *) raw;
        return ptr;
    }

    // Rebinds to `pool`, then allocates `size` elements from it.
    T * alloc(ggml_cuda_pool & pool, size_t size) {
        this->pool = &pool;
        return alloc(size);
    }

    // Returns the held pointer (nullptr if nothing has been allocated).
    T * get() {
        return ptr;
    }
};
| |
|
| |
|
| | |
| |
|
// Per-tensor backend data with one slot per device.
// NOTE(review): how these slots are used is not visible in this header — confirm against the users.
struct ggml_tensor_extra_gpu {
    void *      data_device[GGML_CUDA_MAX_DEVICES];                   // one pointer per device
    cudaEvent_t events[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS]; // one event per (device, stream) pair
};
| |
|
| |
|
// Graph support is compiled in when any of the backend-specific graph switches is set.
#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
#define USE_CUDA_GRAPH
#endif // defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
| |
|
// Recorded properties of one graph node: its address, operation, dimensions
// (`ne`), strides (`nb`), source addresses and operation parameters.
// NOTE(review): presumably compared between evaluations to detect graph changes — confirm.
struct ggml_graph_node_properties {
    void *  node_address;
    ggml_op node_op;
    int64_t ne[GGML_MAX_DIMS];
    size_t  nb[GGML_MAX_DIMS];
    void *  src_address[GGML_MAX_SRC];
    int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
};
| |
|
// State for capturing and replaying work as a graph.
// The structure is empty unless USE_CUDA_GRAPH is defined.
struct ggml_cuda_graph {
#ifdef USE_CUDA_GRAPH
    // Destroys the executable instance first, then the graph it was built from;
    // either step is skipped if the corresponding handle was never created.
    ~ggml_cuda_graph() {
        if (instance != nullptr) {
            CUDA_CHECK(cudaGraphExecDestroy(instance));
        }
        if (graph != nullptr) {
            CUDA_CHECK(cudaGraphDestroy(graph));
        }
    }
    cudaGraph_t graph = nullptr;
    cudaGraphExec_t instance = nullptr;
    size_t num_nodes = 0;
    std::vector<cudaGraphNode_t> nodes;
    std::vector<cudaKernelNodeParams> params;
    // Reasons for which graph usage has been switched off.
    bool disable_due_to_gpu_arch = false;
    bool disable_due_to_too_many_updates = false;
    bool disable_due_to_failed_graph_capture = false;
    int number_consecutive_updates = 0;
    std::vector<ggml_graph_node_properties> ggml_graph_properties;
    bool use_cpy_indirection = false;
    std::vector<char *> cpy_dest_ptrs;
    // Initialized to nullptr like every other member, so that a freshly
    // constructed object never carries an indeterminate pointer.
    char ** dest_ptrs_d = nullptr;
    int dest_ptrs_size = 0;
    // NOTE(review): presumably an index into the copy nodes, with -1 meaning "unset" — confirm.
    int graph_cpynode_index = -1;
#endif
};
| |
|
// Per-backend-instance state: the main device, lazily created streams,
// cuBLAS handles and memory pools for every device, plus optional graph state.
struct ggml_backend_cuda_context {
    int device;
    std::string name;
    cudaEvent_t copy_event = nullptr;

    // Created on first use by stream(); nullptr means "not created yet".
    cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } };
    // Created on first use by cublas_handle(); nullptr means "not created yet".
    cublasHandle_t cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

    std::unique_ptr<ggml_cuda_graph> cuda_graph;

    // The name is GGML_CUDA_NAME followed by the device index.
    explicit ggml_backend_cuda_context(int device) :
        device(device),
        name(GGML_CUDA_NAME + std::to_string(device)) {
    }

    ~ggml_backend_cuda_context();

    // Returns stream slot `stream` of `device`, creating a non-blocking stream
    // on that device the first time the slot is requested.
    cudaStream_t stream(int device, int stream) {
        cudaStream_t & slot = streams[device][stream];
        if (slot == nullptr) {
            ggml_cuda_set_device(device);
            CUDA_CHECK(cudaStreamCreateWithFlags(&slot, cudaStreamNonBlocking));
        }
        return slot;
    }

    // Stream 0 of this context's own device.
    cudaStream_t stream() {
        return stream(device, 0);
    }

    // Returns the cuBLAS handle of `device`, creating it (with TF32 tensor-op
    // math mode) the first time it is requested.
    cublasHandle_t cublas_handle(int device) {
        cublasHandle_t & slot = cublas_handles[device];
        if (slot == nullptr) {
            ggml_cuda_set_device(device);
            CUBLAS_CHECK(cublasCreate(&slot));
            CUBLAS_CHECK(cublasSetMathMode(slot, CUBLAS_TF32_TENSOR_OP_MATH));
        }
        return slot;
    }

    // cuBLAS handle of this context's own device.
    cublasHandle_t cublas_handle() {
        return cublas_handle(device);
    }

    // One memory pool per device, created on first use by pool().
    std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES];

    static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device);

    // Returns the pool of `device`, creating it the first time it is requested.
    ggml_cuda_pool & pool(int device) {
        std::unique_ptr<ggml_cuda_pool> & slot = pools[device];
        if (slot == nullptr) {
            slot = new_pool_for_device(device);
        }
        return *slot;
    }

    // Pool of this context's own device.
    ggml_cuda_pool & pool() {
        return pool(device);
    }
};
| |
|