#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>
#include <limits>


#define MIN(x, y) ((x) < (y) ? (x) : (y))
#define MAX(x, y) ((x) < (y) ? (y) : (x))
#define inf std::numeric_limits<float>::infinity()


#define CHECK_CUDA(tensor) { \
    TORCH_CHECK((tensor).is_cuda(), #tensor " is not in cuda memory"); \
    TORCH_CHECK((tensor).is_contiguous(), #tensor " is not contiguous"); }
void CHECK_KERNEL() {auto error = cudaGetLastError(); TORCH_CHECK(error == cudaSuccess, cudaGetErrorString(error));}


// *_block atomics require compute capability >= 6.0; fall back to the device-wide version
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 600
#define atomicMax_block atomicMax
#endif

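// forward_agg_cuda_kernel: one thread per cell of the 4D `upper` tensor,
// indexed [uh1][uw1][uh2][uw2].  Each upper cell is a weighted average of up
// to four `lower` cells, shifted by gap_left/gap_right in the first two
// dimensions and by +/-1 in the last two.  The optional per-(lh1,lw1) `weights`
// modulate the average, `norm` controls how strongly missing (out-of-bounds)
// parents are penalized, and the per-cell weight sums are written to
// `new_weights` (presumably to serve as `weights` for the next level).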
template <typename scalar_t>
__global__ void forward_agg_cuda_kernel(
        const int LH1, const int LW1, const int LH2, const int LW2,
        const int gap_left, const int gap_right, float norm,
        const torch::PackedTensorAccessor64<scalar_t,4,torch::RestrictPtrTraits> lower,
        torch::PackedTensorAccessor64<scalar_t,4,torch::RestrictPtrTraits> upper,
        const float* weights, float* new_weights ) {

    const auto UH1 = LH1 + bool(!gap_left);
    const auto UW1 = LW1 + bool(!gap_left);
    const auto UH2 = LH2;
    const auto UW2 = LW2;

    // one thread per upper cell: decode the 4D position from the flat index
    int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int uw2 = idx % UW2; idx /= UW2;
    const int uh2 = idx % UH2; idx /= UH2;
    const int uw1 = idx % UW1; idx /= UW1;
    const int uh1 = idx;
    if (uh1 >= UH1) return;

    float sumw = 0, nrm = 0, res = 0;

    // accumulate the four parent cells of the lower level
    for (int i = 0; i < 4; i++) {
        const int v = i/2, u = i%2;

        const int lh1 = uh1 + (1-v) * gap_left - v * gap_right;
        if (lh1 < 0 || lh1 >= LH1) continue;
        const int lw1 = uw1 + (1-u) * gap_left - u * gap_right;
        if (lw1 < 0 || lw1 >= LW1) continue;

        const float weight = weights ? weights[lh1*LW1 + lw1] : 1;
        sumw += weight;

        const int lh2 = uh2 + 1 - 2*v;
        if (lh2 < 0 || lh2 >= LH2) continue;
        const int lw2 = uw2 + 1 - 2*u;
        if (lw2 < 0 || lw2 >= LW2) continue;

        res += weight * lower[lh1][lw1][lh2][lw2];
        nrm += weight;
    }

    // normalize, with a soft penalty when some parents were out of bounds
    nrm = sumw * (nrm < sumw ? powf(nrm/sumw, norm) : 1);
    upper[uh1][uw1][uh2][uw2] = (nrm ? res / nrm : 0);
    if (uh2 == 1 && uw2 == 1)
        new_weights[uh1*UW1 + uw1] = sumw;
}

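// Host wrapper for the aggregation kernel: validates layouts, derives the
// parent offsets gap_left/gap_right from the pyramid `level`, launches one
// thread per upper cell (512 threads per block) and returns the accumulated
// per-cell weight map.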
torch::Tensor forward_agg_cuda( int level, float norm, const torch::Tensor lower,
                                const at::optional<at::Tensor> weights, torch::Tensor upper ) {
    CHECK_CUDA(lower);
    CHECK_CUDA(upper);
    if (weights) CHECK_CUDA(weights.value());

    const auto UH1 = upper.size(0);
    const auto UW1 = upper.size(1);
    const auto UH2 = upper.size(2);
    const auto UW2 = upper.size(3);
    const auto LH1 = lower.size(0);
    const auto LW1 = lower.size(1);
    const auto LH2 = lower.size(2);
    const auto LW2 = lower.size(3);
    TORCH_CHECK( UH1 == LH1 + int(level==1) && UW1 == LW1 + int(level==1), "inconsistent lower and upper shapes" );

    const int gap_left = (level >= 2) ? 1 << (level-2) : 0;
    const int gap_right= 1 << MAX(0, level-2);

    const int MAX_THREADS = 512;
    const int THREADS_PER_BLOCK = MAX_THREADS;
    const int N_BLOCKS = (UH1*UW1*UH2*UW2 + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;

    torch::Tensor new_weights = torch::zeros({UH1, UW1}, upper.options().dtype(torch::kFloat32));

    AT_DISPATCH_FLOATING_TYPES_AND_HALF(lower.scalar_type(), "forward_agg_cuda", ([&] {
        forward_agg_cuda_kernel<<<N_BLOCKS, THREADS_PER_BLOCK>>>(
            LH1, LW1, LH2, LW2,
            gap_left, gap_right, norm,
            lower.packed_accessor64<scalar_t,4,torch::RestrictPtrTraits>(),
            upper.packed_accessor64<scalar_t,4,torch::RestrictPtrTraits>(),
            weights ? weights->data_ptr<float>() : nullptr, new_weights.data_ptr<float>() );
    }));
    return new_weights;
}

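// forward_pool_agg_cuda_kernel: same aggregation as forward_agg_cuda_kernel,
// fused with a stride-2 max-pooling of the last two dimensions: each parent
// value is the maximum of `lower` over a 3x3 window centered at twice the
// upper coordinate, so dims 2/3 of `upper` are about half those of `lower`.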
template <typename scalar_t>
__global__ void forward_pool_agg_cuda_kernel(
        const int LH1, const int LW1, const int LH2, const int LW2,
        const int gap_left, const int gap_right, float norm,
        const torch::PackedTensorAccessor64<scalar_t,4,torch::RestrictPtrTraits> lower,
        torch::PackedTensorAccessor64<scalar_t,4,torch::RestrictPtrTraits> upper,
        const float* weights, float* new_weights ) {

    const auto UH1 = LH1 + bool(!gap_left);
    const auto UW1 = LW1 + bool(!gap_left);
    const auto UH2 = (LH2-1)/2 + 1;
    const auto UW2 = (LW2-1)/2 + 1;

    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int uw2 = idx % UW2; idx /= UW2;
    const int uh2 = idx % UH2; idx /= UH2;
    const int uw1 = idx % UW1; idx /= UW1;
    const int uh1 = idx;
    if (uh1 >= UH1) return;

    float sumw = 0, nrm = 0, res = 0;

    for (int i = 0; i < 4; i++) {
        const int v = i/2, u = i%2;

        const int lh1 = uh1 + (1-v) * gap_left - v * gap_right;
        if (lh1 < 0 || lh1 >= LH1) continue;
        const int lw1 = uw1 + (1-u) * gap_left - u * gap_right;
        if (lw1 < 0 || lw1 >= LW1) continue;

        const float weight = weights ? weights[lh1*LW1 + lw1] : 1;
        sumw += weight;

        const int lh2_ = 2*(uh2 + 1 - 2*v);
        const int lw2_ = 2*(uw2 + 1 - 2*u);
        float lower_max = -inf;
        #pragma unroll
        for (int j = -1; j <= 1; j++) {
            const int lh2 = lh2_ + j;
            if (lh2 < 0 || lh2 >= LH2) continue;
            #pragma unroll
            for (int i = -1; i <= 1; i++) {
                const int lw2 = lw2_ + i;
                if (lw2 < 0 || lw2 >= LW2) continue;
                float l = lower[lh1][lw1][lh2][lw2];
                lower_max = MAX(lower_max, l);
            }}
        if (lower_max == -inf) continue;

        res += weight * lower_max;
        nrm += weight;
    }

    nrm = sumw * (nrm < sumw ? powf(nrm/sumw, norm) : 1);
    upper[uh1][uw1][uh2][uw2] = (nrm ? res / nrm : 0);
    if (uh2 == 1 && uw2 == 1)
        new_weights[uh1*UW1 + uw1] = sumw;
}

torch::Tensor forward_pool_agg_cuda( int level, float norm, const torch::Tensor lower,
                                     const at::optional<at::Tensor> weights, torch::Tensor upper ) {
    CHECK_CUDA(lower);
    CHECK_CUDA(upper);
    if (weights) CHECK_CUDA(weights.value());

    const auto LH1 = lower.size(0);
    const auto LW1 = lower.size(1);
    const auto LH2 = lower.size(2);
    const auto LW2 = lower.size(3);
    const auto UH1 = upper.size(0);
    const auto UW1 = upper.size(1);
    const auto UH2 = upper.size(2);
    const auto UW2 = upper.size(3);
    TORCH_CHECK( UH1 == LH1 + int(level==1) && UW1 == LW1 + int(level==1), "inconsistent lower and upper shapes" );
    TORCH_CHECK( UH2 == (LH2-1)/2+1 && UW2 == (LW2-1)/2+1, "lower level should be twice as big" );

    const int gap_left = (level >= 2) ? 1 << (level-2) : 0;
    const int gap_right= 1 << MAX(0, level-2);

    const int MAX_THREADS = 512;
    const int THREADS_PER_BLOCK = MAX_THREADS;
    const int N_BLOCKS = (UH1*UW1*UH2*UW2 + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;

    torch::Tensor new_weights = torch::zeros({UH1, UW1}, upper.options().dtype(torch::kFloat));

    AT_DISPATCH_FLOATING_TYPES_AND_HALF(lower.scalar_type(), "forward_pool_agg_cuda", ([&] {
        forward_pool_agg_cuda_kernel<<<N_BLOCKS, THREADS_PER_BLOCK>>>(
            LH1, LW1, LH2, LW2,
            gap_left, gap_right, norm,
            lower.packed_accessor64<scalar_t,4,torch::RestrictPtrTraits>(),
            upper.packed_accessor64<scalar_t,4,torch::RestrictPtrTraits>(),
            weights ? weights->data_ptr<float>() : nullptr, new_weights.data_ptr<float>() );
    }));
    return new_weights;
}

// half-open interval test: lower <= var < upper
__device__ inline int in(int lower, int var, int upper) {
    return lower <= var && var < upper;
}
// sign selector: +1 if b else -1
__device__ inline int sl(bool b) {
    return b ? 1 : -1;
}

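// 16-bit atomic max emulated with a 32-bit compare-and-swap: load the aligned
// 32-bit word containing `address`, extract the targeted half with __byte_perm,
// take the max with `val`, splice it back and retry with atomicCAS until no
// other thread has modified the word in between.  The comparison is done on raw
// signed 16-bit patterns, which matches the ordering of half-precision floats
// only for non-negative values.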
__device__ short atomicMaxShort(short* address, short val) {
    unsigned int *base_address = (unsigned int *)((size_t)address & ~3);

    unsigned int order_from[] = {0x0010, 0x0032};
    unsigned int from = order_from[((size_t)address & 3) / 2];

    unsigned int order_back[] = {0x3254, 0x5410};
    unsigned int back = order_back[((size_t)address & 3) / 2];
    unsigned int old, assumed, max_, new_;

    old = *base_address;
    do {
        assumed = old;
        max_ = max(val, (short)__byte_perm(old, 0, from));
        new_ = __byte_perm(old, max_, back);
        old = atomicCAS(base_address, assumed, new_);
    } while (assumed != old);
    return old;
}

// atomic max dispatched on dtype; the bit-pattern comparison below is only
// order-preserving for non-negative floating-point values
template <typename scalar_t>
__device__ inline void TplAtomicMax_block( scalar_t* before, scalar_t after ) { assert(!"atomicMax not implemented for this dtype"); }
template <>
__device__ inline void TplAtomicMax_block( at::Half* before, at::Half after ) { atomicMaxShort( (int16_t*)before, *(int16_t*)&after ); }
template <>
__device__ inline void TplAtomicMax_block( float* before, float after ) { atomicMax_block( (int32_t*)before, *(int32_t*)&after ); }

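// backward_agg_unpool_cuda_kernel: reverse of the pooled aggregation.  One block
// handles one (lh1, lw1) cell of `lower`, with a scratch buffer allocated on the
// device heap and shared through `_shared_addr`.  Pass 1 stores, for every upper
// cell, the maximum of `lower` over its 3x3 pooling window together with the
// argmax position; pass 2 finds the largest compatible `upper` value, adds it to
// that maximum and writes the result back to the argmax position with a
// block-wide atomic max.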
template <typename scalar_t>
__global__ void backward_agg_unpool_cuda_kernel(
        const int UH1, const int UW1,
        const int UH2, const int UW2,
        const int LH2, const int LW2,
        const int gap_left, const int gap_right,
        const torch::PackedTensorAccessor64<scalar_t,4,torch::RestrictPtrTraits> upper,
        torch::PackedTensorAccessor64<scalar_t,4,torch::RestrictPtrTraits> lower ) {

    // one block per (lh1, lw1) cell of the lower level
    const int lh1 = blockIdx.y;
    const int lw1 = blockIdx.x;
    const int UHW2 = UH2 * UW2;

    // scratch buffer on the device heap, shared by all threads of the block
    __shared__ float* _shared_addr;
    if (threadIdx.x == 0)
        do{ _shared_addr = new float [2*UHW2]; }
        while(!_shared_addr);
    __syncthreads();

    // best values and argmax positions are interleaved in the scratch buffer
    float * layer_best  = _shared_addr;
    int   * layer_bestp = (int*)(_shared_addr+1);
    assert( layer_best );

    // pass 1: max of lower over the 3x3 pooling window of each upper cell
    for (int idx = threadIdx.x; idx < UHW2; idx += blockDim.x) {
        const int ux = idx % UW2;
        const int uy = idx / UW2;
        const int lx = 2*ux;
        const int ly = 2*uy;

        float best = -inf;
        int bestp = 0;
        #pragma unroll
        for (int j_= -1; j_<= 1; j_++) {
            const int j = ly + j_;
            if (j < 0 || j >= LH2) continue;
            #pragma unroll
            for (int i_= -1; i_<= 1; i_++) {
                const int i = lx + i_;
                if (i < 0 || i >= LW2) continue;
                float cur = lower[lh1][lw1][j][i];
                if (cur > best) { best = cur; bestp = j*LW2+i; }
            }}
        layer_best[2*idx] = best;
        layer_bestp[2*idx] = bestp;
    }

    __syncthreads();

    // pass 2: propagate the best matching upper score back to the argmax position
    for (int idx = threadIdx.x; idx < UHW2; idx += blockDim.x) {
        const int ux = idx % UW2;
        const int uy = idx / UW2;

        scalar_t add = 0;
        for (int v = -gap_left; v <= gap_right; v += gap_right+gap_left) {
            for (int u = -gap_left; u <= gap_right; u += gap_right+gap_left) {
                const int uh1 = lh1 + v, uw1 = lw1 + u;
                const int uh2 = uy+sl(v>0), uw2 = ux+sl(u>0);
                if (in(0, uh1, UH1) && in(0, uw1, UW1) && in(0, uh2, UH2) && in(0, uw2, UW2))
                    add = MAX(add, upper[uh1][uw1][uh2][uw2]);
            }}

        float best = layer_best[2*idx];
        int bestp = layer_bestp[2*idx];
        const int lx = bestp % LW2;
        const int ly = bestp / LW2;

        scalar_t* before = & lower[lh1][lw1][ly][lx];
        scalar_t after = best + add;
        TplAtomicMax_block<scalar_t>( before, after );
    }

    __syncthreads();

    if (threadIdx.x == 0)
        delete[] _shared_addr;
}

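// Host wrapper: one block per (lh1, lw1) cell, up to 1024 threads per block
// covering the UH2*UW2 upper cells.  When `exclude_borders` is set, the last
// row/column of the lower maps is ignored by passing LH2-1/LW2-1 to the kernel.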
void backward_agg_unpool_cuda( int level, const torch::Tensor upper, torch::Tensor lower, bool exclude_borders ) {
    CHECK_CUDA(lower);
    CHECK_CUDA(upper);

    const auto UH1 = upper.size(0);
    const auto UW1 = upper.size(1);
    const auto UH2 = upper.size(2);
    const auto UW2 = upper.size(3);
    const auto LH1 = lower.size(0);
    const auto LW1 = lower.size(1);
    const auto LH2 = lower.size(2);
    const auto LW2 = lower.size(3);
    TORCH_CHECK( UH1 == LH1 + int(level==1) && UW1 == LW1 + int(level==1), "inconsistent lower and upper shapes" );
    const int xb = exclude_borders;

    const int gap_left = (level >= 2) ? 1 << (level-2) : 0;
    const int gap_right= 1 << MAX(0, level-2);

    const int64_t MAX_THREADS = 1024;
    const int64_t THREADS_PER_LAYER = MIN(UH2*UW2, MAX_THREADS);

    AT_DISPATCH_FLOATING_TYPES_AND_HALF(upper.scalar_type(), "backward_agg_unpool_cuda", ([&] {
        backward_agg_unpool_cuda_kernel<<<dim3(LW1,LH1), THREADS_PER_LAYER>>>(
            UH1, UW1, UH2, UW2, LH2-xb, LW2-xb,
            gap_left, gap_right,
            upper.packed_accessor64<scalar_t,4,torch::RestrictPtrTraits>(),
            lower.packed_accessor64<scalar_t,4,torch::RestrictPtrTraits>());
    }));
    CHECK_KERNEL();
}

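// max_pool3d_cuda_kernel: for each output location (b, y, x), take the maximum
// of `tensor[b]` over all channels and over a ks x ks spatial window starting
// at (stride*y, stride*x); store the value in `maxima` and the flat (c, j, i)
// index in `indices`.  The window is assumed to lie fully inside the input
// (there is no bounds check).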
template <typename scalar_t>
__global__ void max_pool3d_cuda_kernel(
        const int BS, const int NC, const int IH, const int IW, const int OH, const int OW,
        const int ks, const int stride,
        const torch::PackedTensorAccessor64<scalar_t,4,torch::RestrictPtrTraits> tensor,
        torch::PackedTensorAccessor64<scalar_t,3,torch::RestrictPtrTraits> maxima,
        torch::PackedTensorAccessor64<int64_t, 3,torch::RestrictPtrTraits> indices ) {

    int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int x = idx % OW; idx /= OW;
    const int y = idx % OH; idx /= OH;
    const int b = idx;
    if (b >= BS) return;

    float best = -inf;
    int64_t best_pos = 0;
    for (int64_t c = 0; c < NC; c++) {
        for (int j = stride*y; j < stride*y+ks; j++) {
            for (int i = stride*x; i < stride*x+ks; i++) {
                float cur = tensor[b][c][j][i];
                if (cur > best) {best = cur; best_pos = (c*IH + j)*IW + i; }
            }}}

    maxima [b][y][x] = best;
    indices[b][y][x] = best_pos;
}

void max_pool3d_cuda( const torch::Tensor tensor, const int kernel_size, const int stride,
                      torch::Tensor maxima, torch::Tensor indices ) {
    CHECK_CUDA(tensor);
    TORCH_CHECK(tensor.dim() == 4, "tensor should be 4-dimensional: BxCxHxW");
    const int BS = tensor.size(0);
    const int NC = tensor.size(1);
    const int IH = tensor.size(2);
    const int IW = tensor.size(3);

    TORCH_CHECK( maxima.sizes() == indices.sizes(), "maxima and indices should have the same shape" );
    TORCH_CHECK( BS == maxima.size(0), "bad batch size" );
    const int OH = maxima.size(1);
    const int OW = maxima.size(2);

    const int64_t THREADS_PER_LAYER = 512;
    const int64_t N_BLOCKS = (BS*OH*OW + THREADS_PER_LAYER-1) / THREADS_PER_LAYER;

    AT_DISPATCH_FLOATING_TYPES_AND_HALF(tensor.scalar_type(), "max_pool3d_cuda", ([&] {
        max_pool3d_cuda_kernel<<<N_BLOCKS, THREADS_PER_LAYER>>>(
            BS, NC, IH, IW, OH, OW, kernel_size, stride,
            tensor. packed_accessor64<scalar_t,4,torch::RestrictPtrTraits>(),
            maxima. packed_accessor64<scalar_t,3,torch::RestrictPtrTraits>(),
            indices.packed_accessor64<int64_t,3,torch::RestrictPtrTraits>());
    }));
}

// dot product of (x, y, 1) with one row of a 2x3 affine matrix
__device__ inline float ptdot( const float* m, float x, float y ) {
    return x*m[0] + y*m[1] + m[2];
}

__device__ inline float sqr(float v) {
    return v*v;
}

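// merge_corres_cuda_kernel: scatter a local correspondence map `corres` into
// the global map `all_corres`.  The center of each output cell is mapped back
// through the 2x3 affine `inv_rot`, the 3x3 neighborhood of the matching
// `corres` cell (whose grid is assumed to have a 4-pixel step) is searched for
// the closest correspondence, and every candidate within a small tolerance of
// that best distance overwrites the stored entry if its score (channel 4) is
// higher.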
__global__ void merge_corres_cuda_kernel(
        const int OH, const int OW, const int OZ, const int IH, const int IW,
        const float dmax2, int offset, const float* inv_rot, const int all_step,
        const torch::PackedTensorAccessor32<float,3,torch::RestrictPtrTraits> corres_a,
        torch::PackedTensorAccessor32<float,3,torch::RestrictPtrTraits> all_corres_a ) {

    int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int i = idx % OW; idx /= OW;
    const int j = idx;
    if (j >= OH) return;

    const float tol2 = 2*2;
    auto all_cor = all_corres_a[j][i];

    float x = i*all_step + all_step/2;
    float y = j*all_step + all_step/2;

    float xr = ptdot( inv_rot + 0, x, y );
    float yr = ptdot( inv_rot + 3, x, y );

    int xb = (int)(0.5+ xr/4);
    int yb = (int)(0.5+ yr/4);

    float best = dmax2;
    #pragma unroll
    for (int _v = -1; _v <= 1; _v++) {
        #pragma unroll
        for (int _u = -1; _u <= 1; _u++) {
            const int v = yb+_v, u = xb+_u;
            if (!(in(0, v, IH) && in(0, u, IW))) continue;
            auto cor = corres_a[v][u];
            float d = sqr(cor[offset]-x) + sqr(cor[offset+1]-y);
            if (d < best) best = d;
        }}

    #pragma unroll
    for (int _v = -1; _v <= 1; _v++) {
        #pragma unroll
        for (int _u = -1; _u <= 1; _u++) {
            const int v = yb+_v, u = xb+_u;
            if (!(in(0, v, IH) && in(0, u, IW))) continue;
            auto cor = corres_a[v][u];
            float d = sqr(cor[offset]-x) + sqr(cor[offset+1]-y);
            if (d <= tol2*best) {
                if (cor[4] > all_cor[4])
                    for (int k = 0; k < OZ; k++) all_cor[k] = cor[k];
            }
        }}
}

void merge_corres_cuda( const torch::Tensor corres, const int offset, const torch::Tensor _inv_rot,
                        const float dmax, torch::Tensor all_corres, const int all_step ) {
    CHECK_CUDA( corres );
    CHECK_CUDA( all_corres );
    CHECK_CUDA( _inv_rot );
    TORCH_CHECK(_inv_rot.is_contiguous(), "inv_rot should be contiguous" );

    const int IH = corres.size(0);
    const int IW = corres.size(1);
    const int IZ = corres.size(2);
    const int OH = all_corres.size(0);
    const int OW = all_corres.size(1);
    const int OZ = all_corres.size(2);
    TORCH_CHECK( IZ == OZ, "corres and all_corres should have the same shape[2]" );

    const int THREADS_PER_LAYER = 512;
    const int N_BLOCKS = (OH * OW + THREADS_PER_LAYER-1) / THREADS_PER_LAYER;

    merge_corres_cuda_kernel<<<N_BLOCKS, THREADS_PER_LAYER>>>(
        OH, OW, OZ, IH, IW, dmax*dmax, offset, _inv_rot.data_ptr<float>(), all_step,
        corres.packed_accessor32<float,3,torch::RestrictPtrTraits>(),
        all_corres.packed_accessor32<float,3,torch::RestrictPtrTraits>());
    CHECK_KERNEL();
}

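// mask_correlations_radial_cuda_kernel: re-weight each correlation map
// corr[j][i] around the target position (cx, cy) given in `targets`.  Cells
// within `radius` of the target are multiplied by max(alpha, 1); cells outside
// are scaled by 1 - min(alpha, 1)*(1 - radius^2/dist^2), which decays towards
// 1 - min(alpha, 1) with distance.  Maps whose target is NaN are left
// untouched.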
template <typename scalar_t>
__global__ void mask_correlations_radial_cuda_kernel(
        float radius, const float alpha,
        const torch::PackedTensorAccessor32<float,3,torch::RestrictPtrTraits> targets,
        torch::PackedTensorAccessor64<scalar_t,4,torch::RestrictPtrTraits> corr ) {

#define H1 ((int)corr.size(0))
#define W1 ((int)corr.size(1))
#define H2 ((int)corr.size(2))
#define W2 ((int)corr.size(3))

    // one block per correlation map corr[j][i]
    const int j = blockIdx.x / W1;
    const int i = blockIdx.x % W1;
    if (j >= H1) return;

    const float cx = targets[j][i][0];
    const float cy = targets[j][i][1];
    if (cx != cx || cy != cy) return;  // skip cells whose target is NaN
    radius *= radius;
    const float alpha_out = (alpha > 1 ? 1 : alpha);
    const float alpha_in  = (alpha < 1 ? 1 : alpha);

    for (int idx = threadIdx.x; idx < H2*W2; idx += blockDim.x) {
        const int v = idx / W2;
        const int u = idx % W2;

        float dis2 = sqr(u - cx) + sqr(v - cy);
        float mul = alpha_in;
        if (dis2 > radius)
            mul = 1 - alpha_out*(1 - radius / dis2);

        corr[j][i][v][u] *= mul;
    }
}

void mask_correlations_radial_cuda( torch::Tensor corr, const torch::Tensor targets,
                                    const float radius, const float alpha) {
    CHECK_CUDA( corr );
    CHECK_CUDA( targets );

    const int THREADS_PER_LAYER = 512;
    const int N_BLOCKS = H1*W1;  // H1/W1 still expand to corr.size(0)/corr.size(1)

#undef H1
#undef W1
#undef H2
#undef W2

    AT_DISPATCH_FLOATING_TYPES_AND_HALF(corr.scalar_type(), "mask_correlations_radial_cuda", ([&] {
        mask_correlations_radial_cuda_kernel<<<N_BLOCKS, THREADS_PER_LAYER>>>(
            radius, alpha,
            targets.packed_accessor32<float,3,torch::RestrictPtrTraits>(),
            corr.packed_accessor64<scalar_t,4,torch::RestrictPtrTraits>());
    }));
    CHECK_KERNEL();
}
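

// Hedged sketch: one way these entry points could be exposed to Python if this
// file were compiled as a standalone torch extension.  The project presumably
// declares its real bindings elsewhere, so the guard macro below is a
// hypothetical name and the block is disabled by default.
#ifdef STANDALONE_BINDINGS_EXAMPLE
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward_agg_cuda",             &forward_agg_cuda,             "weighted 4-parent aggregation (CUDA)");
    m.def("forward_pool_agg_cuda",        &forward_pool_agg_cuda,        "aggregation fused with stride-2 max-pool (CUDA)");
    m.def("backward_agg_unpool_cuda",     &backward_agg_unpool_cuda,     "backward aggregation / unpooling (CUDA)");
    m.def("max_pool3d_cuda",              &max_pool3d_cuda,              "max over channels and a square window (CUDA)");
    m.def("merge_corres_cuda",            &merge_corres_cuda,            "merge correspondences into a global map (CUDA)");
    m.def("mask_correlations_radial_cuda",&mask_correlations_radial_cuda,"radial re-weighting of correlations (CUDA)");
}
#endif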