#include <c10/util/Exception.h>  // for TORCH_INTERNAL_ASSERT
#include <limits>                // for std::numeric_limits

namespace at::cuda::detail {
// CUDA: grid stride looping
//
// int64_t _i_n_d_e_x specifically prevents overflow in the loop increment.
// If input.numel() < INT_MAX, _i_n_d_e_x < INT_MAX, except after the final
// iteration of the loop where _i_n_d_e_x += blockDim.x * gridDim.x can be
// greater than INT_MAX. But in that case _i_n_d_e_x >= n, so there are no
// further iterations and the overflowed value in i=_i_n_d_e_x is not used.
#define CUDA_KERNEL_LOOP_TYPE(i, n, index_type)                           \
  int64_t _i_n_d_e_x = ((int64_t) blockIdx.x) * blockDim.x + threadIdx.x; \
  for (index_type i=_i_n_d_e_x; _i_n_d_e_x < (n); _i_n_d_e_x+=blockDim.x * gridDim.x, i=_i_n_d_e_x)
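
// Example usage (a minimal sketch, not part of this header): an elementwise
// kernel built on CUDA_KERNEL_LOOP_TYPE. The kernel name and parameters are
// illustrative assumptions. Each thread starts at its global index and then
// strides by the total grid size, so any launch configuration covers all n
// elements.
static __global__ void example_scale_kernel(float* out, const float* in, float alpha, int64_t n) {
  CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) {
    out[i] = alpha * in[i];
  }
}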
// Use 1024 threads per block, which requires cuda sm_2x or above
constexpr int CUDA_NUM_THREADS = 1024;
// CUDA: number of blocks needed to cover N threads.
inline int GET_BLOCKS(const int64_t N, const int64_t max_threads_per_block=CUDA_NUM_THREADS) {
  TORCH_INTERNAL_ASSERT(N > 0, "CUDA kernel launch blocks must be positive, but got N=", N);
  constexpr int64_t max_int = std::numeric_limits<int>::max();

  // Round-up division for a positive N, written so it cannot overflow
  auto block_num = (N - 1) / max_threads_per_block + 1;
  TORCH_INTERNAL_ASSERT(block_num <= max_int, "Can't schedule too many blocks on CUDA device");

  return static_cast<int>(block_num);
}
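
// Example launch (a minimal sketch, assuming the example_scale_kernel above):
// GET_BLOCKS rounds n up to whole blocks of CUDA_NUM_THREADS. For instance,
// n = 1 << 20 gives (1048576 - 1) / 1024 + 1 = 1024 blocks. Note that
// GET_BLOCKS asserts n > 0, so the caller must handle empty inputs.
inline void example_scale(float* out, const float* in, float alpha, int64_t n) {
  const dim3 grid(GET_BLOCKS(n));      // enough blocks to cover all n elements
  const dim3 block(CUDA_NUM_THREADS);  // 1024 threads per block
  // Launch on the default stream; real ATen code would use the current stream.
  example_scale_kernel<<<grid, block>>>(out, in, alpha, n);
}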
} // namespace at::cuda::detail