Create optimized_matmul.cu
core/kernels/optimized_matmul.cu +123 -0
core/kernels/optimized_matmul.cu
ADDED
@@ -0,0 +1,123 @@
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <torch/extension.h>
#include <mma.h>  // Tensor Core WMMA API (requires compute capability 7.0+)

using namespace nvcuda;

// FP16 element type consumed by the Tensor Cores
using half_t = __half;

// WMMA tile sizes (the fixed 16x16x16 shape for FP16 Tensor Core GEMM)
#define WMMA_M 16
#define WMMA_N 16
#define WMMA_K 16
#define WARP_SIZE 32

// Optimized GEMM kernel using Tensor Cores. Each block contains exactly
// one warp, and that warp computes one WMMA_M x WMMA_N tile of C.
__global__ void optimized_matmul_kernel(
    const half_t* __restrict__ a,   // Matrix A [m, k], row-major
    const half_t* __restrict__ b,   // Matrix B [k, n], row-major
    half_t* __restrict__ c,         // Matrix C [m, n], row-major
    int m, int n, int k)
{
    // Shared-memory staging tiles; staging lets edge tiles be zero-padded
    // before the warp-wide WMMA loads.
    __shared__ half_t shmem_a[WMMA_M * WMMA_K];
    __shared__ half_t shmem_b[WMMA_K * WMMA_N];

    // WMMA fragments. Both staging tiles are written row-major below, so
    // both operand fragments are declared row_major to match.
    wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, half_t, wmma::row_major> a_frag;
    wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, half_t, wmma::row_major> b_frag;
    wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, half_t> c_frag;

    int lane = threadIdx.x;               // 0..31 within the single warp
    int tile_row = blockIdx.y * WMMA_M;   // first row of this C tile
    int tile_col = blockIdx.x * WMMA_N;   // first column of this C tile

    // Initialize accumulator
    wmma::fill_fragment(c_frag, __float2half(0.0f));

    // Loop over the K dimension one WMMA tile at a time
    for (int tile_k = 0; tile_k < k; tile_k += WMMA_K) {
        // Each lane stages 8 of the 256 elements per tile, zero-filling
        // anything that falls outside the matrix bounds.
        for (int i = lane; i < WMMA_M * WMMA_K; i += WARP_SIZE) {
            int gr = tile_row + i / WMMA_K;   // global row in A
            int gc = tile_k  + i % WMMA_K;    // global column in A
            shmem_a[i] = (gr < m && gc < k) ? a[gr * k + gc] : __float2half(0.0f);
        }
        for (int i = lane; i < WMMA_K * WMMA_N; i += WARP_SIZE) {
            int gr = tile_k   + i / WMMA_N;   // global row in B
            int gc = tile_col + i % WMMA_N;   // global column in B
            shmem_b[i] = (gr < k && gc < n) ? b[gr * n + gc] : __float2half(0.0f);
        }
        __syncwarp();

        // Load WMMA fragments from shared memory and multiply-accumulate
        // on the Tensor Cores
        wmma::load_matrix_sync(a_frag, shmem_a, WMMA_K);
        wmma::load_matrix_sync(b_frag, shmem_b, WMMA_N);
        wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
        __syncwarp();
    }

    // Store the result. WMMA stores are warp-wide, so the guard must be
    // uniform across the warp: full interior tiles go straight to global
    // memory; edge tiles are staged through shared memory and copied
    // elementwise.
    if (tile_row + WMMA_M <= m && tile_col + WMMA_N <= n) {
        wmma::store_matrix_sync(&c[tile_row * n + tile_col], c_frag, n, wmma::mem_row_major);
    } else {
        wmma::store_matrix_sync(shmem_a, c_frag, WMMA_N, wmma::mem_row_major);  // reuse A's buffer
        __syncwarp();
        for (int i = lane; i < WMMA_M * WMMA_N; i += WARP_SIZE) {
            int gr = tile_row + i / WMMA_N;
            int gc = tile_col + i % WMMA_N;
            if (gr < m && gc < n) c[gr * n + gc] = shmem_a[i];
        }
    }
}

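// Worked example of the launch mapping (values illustrative): for
// m = 48, n = 32, k = 64, the host code below launches a grid of
// (32/16) x (48/16) = 2 x 3 single-warp blocks; block (1, 2) owns C rows
// 32..47, columns 16..31, and performs 64/16 = 4 mma_sync steps.
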
// PyTorch binding: returns C = A @ B for FP16 inputs
torch::Tensor optimized_matmul(
    torch::Tensor a,   // [m, k]
    torch::Tensor b)   // [k, n]
{
    // Ensure inputs are FP16, on CUDA, 2-D, contiguous, and compatible
    TORCH_CHECK(a.dtype() == torch::kFloat16, "Matrix A must be FP16");
    TORCH_CHECK(b.dtype() == torch::kFloat16, "Matrix B must be FP16");
    TORCH_CHECK(a.is_cuda(), "Matrix A must be on CUDA");
    TORCH_CHECK(b.is_cuda(), "Matrix B must be on CUDA");
    TORCH_CHECK(a.dim() == 2 && b.dim() == 2, "Inputs must be 2D tensors");
    TORCH_CHECK(a.size(1) == b.size(0), "Inner dimensions must match");
    TORCH_CHECK(a.is_contiguous() && b.is_contiguous(),
                "Inputs must be contiguous (row-major)");

    // Dimensions
    int m = a.size(0);
    int k = a.size(1);
    int n = b.size(1);

    // Output tensor
    auto c = torch::empty({m, n},
        torch::TensorOptions().dtype(torch::kFloat16).device(a.device()));

    // One 32-thread warp per block; the grid tiles C with ceiling division
    dim3 block(WARP_SIZE);
    dim3 grid((n + WMMA_N - 1) / WMMA_N, (m + WMMA_M - 1) / WMMA_M);

    // Launch kernel
    optimized_matmul_kernel<<<grid, block>>>(
        reinterpret_cast<const half_t*>(a.data_ptr<at::Half>()),
        reinterpret_cast<const half_t*>(b.data_ptr<at::Half>()),
        reinterpret_cast<half_t*>(c.data_ptr<at::Half>()),
        m, n, k);

    // Surface launch failures as a PyTorch error
    cudaError_t err = cudaGetLastError();
    TORCH_CHECK(err == cudaSuccess, "CUDA error: ", cudaGetErrorString(err));

    return c;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("optimized_matmul", &optimized_matmul, "Tensor Core-optimized GEMM");
}
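
A minimal host-side sanity check, sketched below, could compare this kernel against a cuBLAS-backed FP32 reference. It is not part of the commit: it assumes the file above is compiled alongside it (nvcc targeting sm_70 or newer, linked against libtorch), the shapes are chosen to exercise the zero-padded edge tiles, and the tolerances are illustrative, since FP16 accumulation in the WMMA path is inherently lossy.

// sanity_check.cpp (hypothetical, not in this commit)
#include <torch/torch.h>
#include <iostream>

torch::Tensor optimized_matmul(torch::Tensor a, torch::Tensor b);  // from optimized_matmul.cu

int main() {
    auto opts = torch::TensorOptions().dtype(torch::kFloat16).device(torch::kCUDA);
    auto a = torch::randn({100, 72}, opts);   // none of these dims is a multiple of 16,
    auto b = torch::randn({72, 90}, opts);    // so the edge-tile store path is exercised

    auto c   = optimized_matmul(a, b);
    auto ref = torch::matmul(a.to(torch::kFloat32), b.to(torch::kFloat32));

    // Loose tolerance: the kernel accumulates in FP16
    bool ok = torch::allclose(c.to(torch::kFloat32), ref, /*rtol=*/1e-2, /*atol=*/1e-2);
    std::cout << (ok ? "OK" : "MISMATCH") << std::endl;
    return ok ? 0 : 1;
}

Switching the accumulator fragment to float (converting to FP16 only at the store) would tighten these tolerances, though on some GPUs FP32 accumulation runs at reduced Tensor Core throughput.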