| #pragma once |
|
|
| #include <limits> |
|
|
| namespace at { namespace cuda { namespace detail { |
|
|
| |
| |
| |
| |
| |
| |
| |
// Grid-stride loop helpers.
//
// CUDA_KERNEL_LOOP_TYPE(i, n, index_type) iterates `i` over [0, n) using the
// grid-stride pattern, so a kernel written with it is correct for any launch
// configuration (including a single block, which is handy for debugging).
// The loop variable `i` has type `index_type`, while the master counter is
// kept in int64_t so values of `n` beyond INT_MAX work when `index_type`
// is wide enough.
//
// The explicit int64_t casts matter: `blockIdx.x * blockDim.x` and
// `blockDim.x * gridDim.x` are unsigned 32-bit multiplications that can
// silently wrap for large launches (gridDim.x may be up to 2^31 - 1, and
// blockDim.x up to 1024) before the widening assignment takes place.
#define CUDA_KERNEL_LOOP_TYPE(i, n, index_type)                                      \
  int64_t _i_n_d_e_x = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;  \
  for (index_type i = _i_n_d_e_x; _i_n_d_e_x < (n);                                  \
       _i_n_d_e_x += static_cast<int64_t>(blockDim.x) * gridDim.x, i = _i_n_d_e_x)

// Convenience form with a 32-bit loop index; only safe when n <= INT_MAX.
#define CUDA_KERNEL_LOOP(i, n) CUDA_KERNEL_LOOP_TYPE(i, n, int)
|
|
|
|
| |
// Default threads-per-block used by GET_BLOCKS below when the caller does
// not supply an explicit max_threads_per_block.
constexpr int CUDA_NUM_THREADS = 1024;
|
|
| |
// Returns the 1-D grid size (number of blocks) needed to cover N elements
// with `max_threads_per_block` threads per block, i.e. ceil-division of N by
// the block size. Asserts that N is positive and that the resulting block
// count fits in an int before narrowing.
inline int GET_BLOCKS(const int64_t N, const int64_t max_threads_per_block=CUDA_NUM_THREADS) {
  TORCH_INTERNAL_ASSERT(N > 0, "CUDA kernel launch blocks must be positive, but got N=", N);
  constexpr int64_t max_int = std::numeric_limits<int>::max();

  // Ceiling division written as (N - 1) / t + 1 so the intermediate value
  // never exceeds N (no overflow from an N + t - 1 style numerator).
  const int64_t grid_size = (N - 1) / max_threads_per_block + 1;
  TORCH_INTERNAL_ASSERT(grid_size <= max_int, "Can't schedule too many blocks on CUDA device");

  return static_cast<int>(grid_size);
}
|
|
| }}} |
|
|