Create fused_ops.cu

core/kernels/fused_ops.cu · ADDED · +137 -0
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <torch/extension.h>
#include <mma.h>  // Tensor Core WMMA API

using namespace nvcuda;

// FP16 element type used by the Tensor Core fragments.
using half_t = __half;

// Warp size and WMMA tile dimensions; each warp computes one 16x16 output tile.
#define WARP_SIZE 32
#define WMMA_M 16
#define WMMA_N 16
#define WMMA_K 16

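// Computation performed by the fused kernel, entirely in FP16:
//
//   output = ReLU(input @ (weight * mask)^T + bias)
//
// where @ is a matrix product over in_features and * is the elementwise
// sparsity mask, applied to the weights as they are staged into shared memory.
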
// Fused sparse GEMM + ReLU kernel. One warp per block; each block computes
// one WMMA_M x WMMA_N tile of the output.
__global__ void fused_sparse_gemm_relu_kernel(
    const half_t* __restrict__ input,   // [batch_size, in_features], row-major
    const half_t* __restrict__ weight,  // [out_features, in_features], row-major
    const half_t* __restrict__ mask,    // [out_features, in_features], row-major
    half_t* __restrict__ output,        // [batch_size, out_features], row-major
    const half_t* __restrict__ bias,    // [out_features]
    int batch_size, int in_features, int out_features)
{
    // Shared-memory staging for one K-slice of each operand, plus a spill
    // buffer for the accumulator: WMMA fragment element order is opaque, so
    // the epilogue must go through store_matrix_sync rather than index
    // c_frag.x with computed thread coordinates.
    __shared__ half_t shmem_input[WMMA_M * WMMA_K];   // A tile, row-major
    __shared__ half_t shmem_weight[WMMA_N * WMMA_K];  // B tile, col-major
    __shared__ half_t shmem_out[WMMA_M * WMMA_N];

    // WMMA fragments
    wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, half_t, wmma::row_major> a_frag;
    wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, half_t, wmma::col_major> b_frag;
    wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, half_t> c_frag;

    // Origin of this block's output tile.
    const int tile_row = blockIdx.y * WMMA_M;  // first batch row
    const int tile_col = blockIdx.x * WMMA_N;  // first output feature
    const int lane = threadIdx.x;              // 0..31 within the warp

    // Initialize accumulator
    wmma::fill_fragment(c_frag, __float2half(0.0f));

    // Loop over the K dimension (in_features) in WMMA_K slices.
    for (int k = 0; k < in_features; k += WMMA_K) {
        // Cooperatively stage the input tile, zero-padding past the edges.
        for (int i = lane; i < WMMA_M * WMMA_K; i += WARP_SIZE) {
            int r = i / WMMA_K, c = i % WMMA_K;
            shmem_input[i] = (tile_row + r < batch_size && k + c < in_features)
                                 ? input[(tile_row + r) * in_features + k + c]
                                 : __float2half(0.0f);
        }
        // Stage the weight tile with the sparsity mask applied. Tile row r is
        // output feature tile_col + r, so the buffer is exactly the col-major
        // [K x N] operand the Tensor Cores expect.
        for (int i = lane; i < WMMA_N * WMMA_K; i += WARP_SIZE) {
            int r = i / WMMA_K, c = i % WMMA_K;
            if (tile_col + r < out_features && k + c < in_features) {
                half_t w = weight[(tile_col + r) * in_features + k + c];
                half_t m = mask[(tile_col + r) * in_features + k + c];
                shmem_weight[i] = __hmul(w, m);  // apply sparsity mask
            } else {
                shmem_weight[i] = __float2half(0.0f);
            }
        }
        __syncwarp();

        // Tensor Core matrix multiply-accumulate on the staged tiles.
        wmma::load_matrix_sync(a_frag, shmem_input, WMMA_K);
        wmma::load_matrix_sync(b_frag, shmem_weight, WMMA_K);
        wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
        __syncwarp();
    }

    // Spill the accumulator tile to shared memory for the epilogue.
    wmma::store_matrix_sync(shmem_out, c_frag, WMMA_N, wmma::mem_row_major);
    __syncwarp();

    // Epilogue: add bias, apply ReLU, and write the in-bounds elements.
    for (int i = lane; i < WMMA_M * WMMA_N; i += WARP_SIZE) {
        int r = i / WMMA_N, c = i % WMMA_N;
        int row = tile_row + r, col = tile_col + c;
        if (row < batch_size && col < out_features) {
            half_t v = __hadd(shmem_out[i], bias[col]);  // add bias
            output[row * out_features + col] =
                __hgt(v, __float2half(0.0f)) ? v : __float2half(0.0f);  // ReLU
        }
    }
}

// PyTorch binding
torch::Tensor fused_sparse_gemm_relu(
    torch::Tensor input,   // [batch_size, in_features]
    torch::Tensor weight,  // [out_features, in_features]
    torch::Tensor mask,    // [out_features, in_features]
    torch::Tensor bias)    // [out_features]
{
    // Ensure inputs are FP16, on CUDA, and contiguous.
    TORCH_CHECK(input.dtype() == torch::kFloat16, "Input must be FP16");
    TORCH_CHECK(weight.dtype() == torch::kFloat16, "Weight must be FP16");
    TORCH_CHECK(mask.dtype() == torch::kFloat16, "Mask must be FP16");
    TORCH_CHECK(bias.dtype() == torch::kFloat16, "Bias must be FP16");
    TORCH_CHECK(input.is_cuda(), "Input must be on CUDA");
    TORCH_CHECK(weight.is_cuda(), "Weight must be on CUDA");
    TORCH_CHECK(mask.is_cuda(), "Mask must be on CUDA");
    TORCH_CHECK(bias.is_cuda(), "Bias must be on CUDA");
    TORCH_CHECK(input.is_contiguous() && weight.is_contiguous() &&
                mask.is_contiguous() && bias.is_contiguous(),
                "All tensors must be contiguous");

    // Dimensions
    const int batch_size = input.size(0);
    const int in_features = input.size(1);
    const int out_features = weight.size(0);
    TORCH_CHECK(weight.size(1) == in_features, "weight/input shape mismatch");

    // Output tensor
    auto output = torch::empty({batch_size, out_features},
        torch::TensorOptions().dtype(torch::kFloat16).device(input.device()));

    // One warp per block; one block per WMMA_M x WMMA_N output tile. The
    // batch dimension is covered by the output rows, so the grid is 2-D.
    dim3 block(WARP_SIZE);
    dim3 grid((out_features + WMMA_N - 1) / WMMA_N,
              (batch_size + WMMA_M - 1) / WMMA_M);

    // Launch kernel
    fused_sparse_gemm_relu_kernel<<<grid, block>>>(
        reinterpret_cast<const half_t*>(input.data_ptr<at::Half>()),
        reinterpret_cast<const half_t*>(weight.data_ptr<at::Half>()),
        reinterpret_cast<const half_t*>(mask.data_ptr<at::Half>()),
        reinterpret_cast<half_t*>(output.data_ptr<at::Half>()),
        reinterpret_cast<const half_t*>(bias.data_ptr<at::Half>()),
        batch_size, in_features, out_features);

    cudaError_t err = cudaGetLastError();
    TORCH_CHECK(err == cudaSuccess, "CUDA error: ", cudaGetErrorString(err));

    return output;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("fused_sparse_gemm_relu", &fused_sparse_gemm_relu,
          "Fused sparse GEMM + ReLU with Tensor Cores");
}
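
For a quick correctness check, a minimal libtorch smoke test might look like the sketch below. The file name test_fused_ops.cpp and the tensor shapes are illustrative assumptions, and it presumes fused_ops.cu is compiled with nvcc and linked into the same binary, so the binding function is called directly rather than through the Python extension loader.

// test_fused_ops.cpp -- hypothetical smoke test; assumes fused_ops.cu is
// compiled into the same binary and linked against libtorch + CUDA.
#include <torch/torch.h>
#include <iostream>

// Forward declaration of the binding defined in fused_ops.cu.
torch::Tensor fused_sparse_gemm_relu(torch::Tensor input, torch::Tensor weight,
                                     torch::Tensor mask, torch::Tensor bias);

int main() {
    auto opts = torch::TensorOptions().dtype(torch::kFloat16).device(torch::kCUDA);
    auto input  = torch::randn({32, 64}, opts);   // [batch_size, in_features]
    auto weight = torch::randn({128, 64}, opts);  // [out_features, in_features]
    // Binary 0/1 mask, stored in FP16 so it can be multiplied into the weights.
    auto mask   = (torch::rand({128, 64}, opts.dtype(torch::kFloat32)) > 0.5)
                      .to(torch::kFloat16);
    auto bias   = torch::randn({128}, opts);      // [out_features]

    auto out = fused_sparse_gemm_relu(input, weight, mask, bias);

    // Reference: dense matmul with the mask applied, then bias + ReLU.
    auto ref = torch::relu(torch::matmul(input, (weight * mask).t()) + bias);
    std::cout << "max abs error: "
              << (out.to(torch::kFloat32) - ref.to(torch::kFloat32))
                     .abs().max().item<float>() << std::endl;
    return 0;
}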